AI Semantic Matching System for Candidates and Job Postings
Keyword matching in recruitment fails when a resume says "Python developer" but the posting asks for a "Software Engineer." Semantic matching understands that "machine learning," "ML," and "predictive modeling" refer to the same competency, and that "5 years with Django" is relevant to a "Flask developer" position.
Two-Stage Matching System
import numpy as np
import pandas as pd
from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity
from anthropic import Anthropic
import json
import re
class ResumeJDEncoder:
    """Encodes resumes and job descriptions into comparable sentence embeddings.

    Produces per-aspect embeddings (full text, skills, title) so a matcher can
    weight the aspects independently. All embeddings are L2-normalized.
    """

    def __init__(self):
        # Multilingual model: handles mixed Russian + English resumes.
        self.model = SentenceTransformer('paraphrase-multilingual-mpnet-base-v2')

    @staticmethod
    def _as_text(value) -> str:
        """Coerce a field that may be a string, a list of strings, or None
        into plain text.

        Fix: elsewhere in this pipeline 'skills'/'requirements' arrive as
        lists; the previous code called .strip() directly and crashed on them.
        """
        if value is None:
            return ''
        if isinstance(value, (list, tuple)):
            return ' '.join(str(item) for item in value)
        return str(value)

    def extract_resume_sections(self, resume_text: str) -> dict:
        """Divide a raw resume into semantic blocks.

        Simplified pattern-based extraction; in production use an ML resume
        parser (Affinda, Sovren, or custom).
        """
        sections = {
            'skills': '',
            'experience': '',
            'education': '',
            'full_text': resume_text
        }
        # Capture up to 6 lines following a skills-section header (RU or EN).
        skills_pattern = r'(?:навыки|skills|технологии|technologies|стек)[:\s]*([^\n]+(?:\n[^\n]+){0,5})'
        match = re.search(skills_pattern, resume_text, re.IGNORECASE)
        if match:
            sections['skills'] = match.group(1)
        return sections

    def _encode_texts(self, texts: dict) -> dict:
        """Encode each non-empty text; return {aspect: normalized embedding}."""
        embeddings = {}
        for key, text in texts.items():
            if text.strip():
                embeddings[key] = self.model.encode(text, normalize_embeddings=True)
        return embeddings

    def encode_resume(self, resume: dict) -> dict:
        """Multi-aspect resume encoding: full text, skills, current title."""
        texts_to_encode = {
            'full': self._as_text(resume.get('full_text', '')),
            'skills': self._as_text(resume.get('skills', '')),
            'title': self._as_text(resume.get('current_title', '')),
        }
        return self._encode_texts(texts_to_encode)

    def encode_job(self, job: dict) -> dict:
        """Job description encoding: full description, requirements, title."""
        requirements_text = self._as_text(job.get('requirements', []))
        texts = {
            'full': self._as_text(job.get('description', '')),
            'requirements': requirements_text,
            # Fix: also expose requirements under 'skills' so they line up
            # with the resume-side 'skills' aspect during weighted scoring
            # (previously the skills weight never fired — no common key).
            'skills': requirements_text,
            'title': self._as_text(job.get('title', '')),
        }
        return self._encode_texts(texts)
class SemanticMatcher:
    """Two-stage matching: fast embedding scoring over the whole pool,
    then precise (and expensive) LLM analysis for the short list only."""

    def __init__(self):
        self.encoder = ResumeJDEncoder()
        self.llm = Anthropic()

    def compute_embedding_score(self, resume_embs: dict,
                                job_embs: dict) -> float:
        """Weighted cosine similarity across matching embedding aspects.

        Fix: only aspects present in BOTH dicts contribute, and the result is
        re-normalized by the weights actually used. The previous version
        normalized by the resume-side keys alone, so a missing job-side
        embedding silently deflated the score.

        Returns 0.0 when no aspect is comparable.
        """
        weights = {'full': 0.4, 'skills': 0.4, 'title': 0.2}
        weighted_sum = 0.0
        used_weight = 0.0
        for key, weight in weights.items():
            r_emb = resume_embs.get(key)
            j_emb = job_embs.get(key)
            if r_emb is None or j_emb is None:
                continue
            norm_product = float(np.linalg.norm(r_emb) * np.linalg.norm(j_emb))
            if norm_product == 0.0:
                # Degenerate zero vector: skip rather than divide by zero.
                continue
            # Explicit cosine similarity; no sklearn 2-D array round-trip.
            weighted_sum += float(np.dot(r_emb, j_emb)) / norm_product * weight
            used_weight += weight
        return weighted_sum / used_weight if used_weight else 0.0

    def deep_match(self, resume: dict, job: dict) -> dict:
        """Detailed LLM compatibility analysis (run only for top candidates).

        Returns the parsed JSON assessment; falls back to a neutral 'maybe'
        verdict when the response cannot be parsed.
        """
        response = self.llm.messages.create(
            model="claude-3-5-sonnet-20241022",
            max_tokens=500,
            messages=[{
                "role": "user",
                "content": f"""Analyze candidate-job match. Return detailed assessment.
JOB:
Title: {job.get('title', '')}
Requirements: {', '.join(job.get('requirements', [])[:10])}
Nice-to-have: {', '.join(job.get('nice_to_have', [])[:5])}
Seniority: {job.get('seniority', 'mid')}
CANDIDATE:
Title: {resume.get('current_title', '')}
Years of experience: {resume.get('years_experience', 0)}
Skills: {', '.join(resume.get('skills', [])[:15])}
Summary: {resume.get('summary', '')[:300]}
Return JSON:
{{
"match_score": 0-100,
"strengths": ["..."],
"gaps": ["..."],
"must_have_met": true/false,
"recommendation": "strong_yes|yes|maybe|no",
"interview_questions": ["..."]
}}"""
            }]
        )
        raw = response.content[0].text
        # Fix: models often wrap JSON in prose or markdown fences; pull out
        # the outermost object before parsing instead of failing outright.
        json_blob = re.search(r'\{.*\}', raw, re.DOTALL)
        try:
            return json.loads(json_blob.group(0) if json_blob else raw)
        except Exception:
            # Best-effort fallback: neutral verdict rather than a crash.
            return {'match_score': 50, 'recommendation': 'maybe', 'strengths': [], 'gaps': []}

    def rank_candidates(self, job: dict,
                        candidates: list[dict],
                        top_k_deep: int = 10) -> list[dict]:
        """
        Two-stage pipeline:
        1. Fast embedding scoring for the entire candidate pool.
        2. Deep LLM analysis for the top-K finalists.

        NOTE: annotates each candidate dict in place with 'embedding_score'.
        """
        job_embs = self.encoder.encode_job(job)

        # Stage 1: cheap score for everyone.
        for candidate in candidates:
            resume_embs = self.encoder.encode_resume(candidate)
            candidate['embedding_score'] = self.compute_embedding_score(resume_embs, job_embs)

        # Shortlist by embedding score (previous `* 3` over-slice was dead code:
        # it was immediately re-sliced to top_k_deep below).
        shortlist = sorted(candidates, key=lambda c: -c['embedding_score'])[:top_k_deep]

        # Stage 2: expensive LLM analysis for finalists only.
        results = []
        for candidate in shortlist:
            deep_result = self.deep_match(candidate, job)
            llm_score = deep_result.get('match_score', 50)
            results.append({
                **candidate,
                'embedding_score': candidate['embedding_score'],
                'llm_match_score': llm_score,
                # Blend: 40% embedding score (0-1) + 60% LLM score (rescaled 0-100 -> 0-1).
                'final_score': (candidate['embedding_score'] * 0.4 +
                                llm_score / 100 * 0.6),
                'strengths': deep_result.get('strengths', []),
                'gaps': deep_result.get('gaps', []),
                'recommendation': deep_result.get('recommendation', 'maybe'),
                'interview_questions': deep_result.get('interview_questions', [])
            })
        return sorted(results, key=lambda r: -r['final_score'])
class BiasAuditor:
    """Audits matching results for demographic bias across protected groups."""

    def audit_demographic_bias(self, match_results: pd.DataFrame) -> dict:
        """Check for differential selection across protected characteristics.

        For each protected column present in `match_results`, computes the
        disparate-impact ratio (lowest group-mean final score divided by the
        highest) and flags whether it clears the 0.8 four-fifths threshold.

        Returns a dict keyed by column name; columns that are absent or have
        fewer than two groups are skipped.
        """
        report = {}
        for column in ('gender', 'age_group', 'university_tier'):
            if column not in match_results.columns:
                continue
            stats = match_results.groupby(column)['final_score'].agg(
                ['mean', 'count', 'std']
            )
            if len(stats) < 2:
                continue
            means = stats['mean']
            highest = means.max()
            # Four-fifths rule: a min/max ratio of at least 0.8 is acceptable.
            ratio = means.min() / highest if highest > 0 else 1.0
            report[column] = {
                'disparate_impact': round(ratio, 3),
                'passes_threshold': ratio >= 0.8,
                'group_means': means.round(3).to_dict(),
            }
        return report
Semantic matching increases quality-of-hire: the percentage of candidates who pass probation grows by 20-30%. Time to fill a position is reduced by 35-40% through accurate primary screening. Bias auditing is a mandatory component for compliance with Fair Hiring requirements and EEOC standards.







