AI Candidate-Job Semantic Matching System Development

We design and deploy artificial intelligence systems: from prototype to production-ready solutions. Our team combines expertise in machine learning, data engineering and MLOps to make AI work not in the lab, but in real business.
Showing 1 of 1 services. All 1566 services.
AI Candidate-Job Semantic Matching System Development
Medium
~1-2 weeks
FAQ
AI Development Areas
AI Solution Development Stages
Latest works
  • image_website-b2b-advance_0.png
    B2B ADVANCE company website development
    1212
  • image_web-applications_feedme_466_0.webp
    Development of a web application for FEEDME
    1161
  • image_websites_belfingroup_462_0.webp
    Website development for BELFINGROUP
    852
  • image_ecommerce_furnoro_435_0.webp
    Development of an online store for the company FURNORO
    1041
  • image_logo-advance_0.png
    B2B Advance company logo design
    561
  • image_crm_enviok_479_0.webp
    Development of a web application for Enviok
    822

AI Semantic Matching System for Candidates and Job Postings

Keyword matching in recruitment finds "Python developer" in a resume when the job requires "Software Engineer." Semantic matching understands that "machine learning," "ML," and "predictive modeling" are synonyms, and that "5 years in Django" is relevant to a "Flask developer" position.

Two-Stage Matching System

import numpy as np
import pandas as pd
from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity
from anthropic import Anthropic
import json
import re

class ResumeJDEncoder:
    """Encode resumes and job descriptions into dense sentence embeddings.

    Uses a multilingual sentence-transformer model so that Russian and
    English text can be compared in the same embedding space.
    """

    def __init__(self):
        # Multilingual model: Russian + English
        self.model = SentenceTransformer('paraphrase-multilingual-mpnet-base-v2')

    @staticmethod
    def _as_text(value) -> str:
        """Coerce a field into plain text.

        Resume/job fields may arrive either as a string or as a list of
        strings (e.g. ``skills`` is a list elsewhere in this pipeline —
        see how ``deep_match`` joins ``resume['skills']``).  Lists are
        space-joined; ``None`` becomes the empty string.
        """
        if isinstance(value, (list, tuple)):
            return ' '.join(str(item) for item in value)
        return value or ''

    def extract_resume_sections(self, resume_text: str) -> dict:
        """Split a raw resume into semantic blocks.

        Returns a dict with ``skills``, ``experience``, ``education`` and
        ``full_text`` keys; sections that cannot be located stay ''.
        """
        # In production: ML resume parser (Affinda, Sovren or custom)
        sections = {
            'skills': '',
            'experience': '',
            'education': '',
            'full_text': resume_text
        }

        # Simplified extraction via patterns (Russian + English headers);
        # captures the header's line plus up to 5 following lines.
        skills_pattern = r'(?:навыки|skills|технологии|technologies|стек)[:\s]*([^\n]+(?:\n[^\n]+){0,5})'
        match = re.search(skills_pattern, resume_text, re.IGNORECASE)
        if match:
            sections['skills'] = match.group(1)

        return sections

    def encode_resume(self, resume: dict) -> dict:
        """Multi-aspect resume encoding.

        Encodes the full text, the skills block and the current title
        separately; empty aspects are skipped.  Returns a dict mapping
        aspect name -> L2-normalized embedding vector.
        """
        texts_to_encode = {
            'full': self._as_text(resume.get('full_text', '')),
            # BUGFIX: skills may be a list (as produced by upstream parsers
            # and consumed as a list by deep_match) — coerce before .strip().
            'skills': self._as_text(resume.get('skills', '')),
            'title': self._as_text(resume.get('current_title', '')),
        }

        embeddings = {}
        for key, text in texts_to_encode.items():
            if text.strip():
                embeddings[key] = self.model.encode(text, normalize_embeddings=True)

        return embeddings

    def encode_job(self, job: dict) -> dict:
        """Job description encoding, aspect-by-aspect (mirrors encode_resume)."""
        texts = {
            'full': self._as_text(job.get('description', '')),
            'requirements': self._as_text(job.get('requirements', [])),
            'title': self._as_text(job.get('title', '')),
        }

        embeddings = {}
        for key, text in texts.items():
            if text.strip():
                embeddings[key] = self.model.encode(text, normalize_embeddings=True)

        return embeddings


class SemanticMatcher:
    """Two-stage matching: fast embedding similarity + precise LLM analysis."""

    def __init__(self):
        self.encoder = ResumeJDEncoder()
        self.llm = Anthropic()

    def compute_embedding_score(self, resume_embs: dict,
                                  job_embs: dict) -> float:
        """Fast scoring via weighted cosine similarity of aspect embeddings.

        Returns the weighted mean cosine similarity over the aspects present
        in BOTH the resume and the job ('full', 'skills', 'title'), or 0.0
        when no aspect overlaps.
        """
        weights = {'full': 0.4, 'skills': 0.4, 'title': 0.2}

        weighted_sum = 0.0
        weight_used = 0.0
        for key, weight in weights.items():
            r_emb = resume_embs.get(key)
            j_emb = job_embs.get(key)
            if r_emb is None or j_emb is None:
                continue
            r = np.asarray(r_emb, dtype=float).ravel()
            j = np.asarray(j_emb, dtype=float).ravel()
            denom = float(np.linalg.norm(r) * np.linalg.norm(j))
            if denom == 0.0:
                # Degenerate (zero) vector: similarity is undefined; treat as
                # 0 contribution, matching sklearn's zero-vector behavior.
                continue
            weighted_sum += (float(np.dot(r, j)) / denom) * weight
            weight_used += weight

        # BUGFIX: normalize by the weights of aspects actually scored
        # (present in BOTH dicts). The old code divided by the weights of
        # all aspects present in the resume alone, which underweighted the
        # score whenever the job lacked an aspect the resume had.
        return weighted_sum / weight_used if weight_used else 0.0

    def deep_match(self, resume: dict, job: dict) -> dict:
        """Detailed LLM analysis of compatibility (for top candidates only).

        Returns the parsed JSON verdict; on any parse/transport failure falls
        back to a neutral 'maybe' result so the pipeline keeps running.
        """
        response = self.llm.messages.create(
            model="claude-3-5-sonnet-20241022",
            max_tokens=500,
            messages=[{
                "role": "user",
                "content": f"""Analyze candidate-job match. Return detailed assessment.

JOB:
Title: {job.get('title', '')}
Requirements: {', '.join(job.get('requirements', [])[:10])}
Nice-to-have: {', '.join(job.get('nice_to_have', [])[:5])}
Seniority: {job.get('seniority', 'mid')}

CANDIDATE:
Title: {resume.get('current_title', '')}
Years of experience: {resume.get('years_experience', 0)}
Skills: {', '.join(resume.get('skills', [])[:15])}
Summary: {resume.get('summary', '')[:300]}

Return JSON:
{{
  "match_score": 0-100,
  "strengths": ["..."],
  "gaps": ["..."],
  "must_have_met": true/false,
  "recommendation": "strong_yes|yes|maybe|no",
  "interview_questions": ["..."]
}}"""
            }]
        )

        try:
            return json.loads(response.content[0].text)
        except Exception:
            # Neutral fallback: never crash ranking on a malformed LLM reply.
            return {'match_score': 50, 'recommendation': 'maybe', 'strengths': [], 'gaps': []}

    def rank_candidates(self, job: dict,
                          candidates: list[dict],
                          top_k_deep: int = 10) -> list[dict]:
        """
        Two-stage pipeline:
        1. Fast embedding matching over the entire pool -> embedding_score
        2. Deep LLM analysis for the top-K finalists -> final ranking

        NOTE: mutates the input candidate dicts by adding 'embedding_score'
        (preserved from the original implementation).
        """
        job_embs = self.encoder.encode_job(job)

        # Stage 1: fast scoring of every candidate
        for candidate in candidates:
            resume_embs = self.encoder.encode_resume(candidate)
            candidate['embedding_score'] = self.compute_embedding_score(resume_embs, job_embs)

        # Top-K by embedding score (single slice; the old code sliced to
        # top_k_deep*3 and then immediately re-sliced to top_k_deep).
        top_candidates = sorted(candidates, key=lambda x: -x['embedding_score'])[:top_k_deep]

        # Stage 2: deep LLM analysis of the finalists.
        # final_score blends the embedding similarity (weight 0.4) with the
        # LLM's 0-100 score rescaled to 0-1 (weight 0.6).
        results = []
        for candidate in top_candidates:
            deep_result = self.deep_match(candidate, job)
            results.append({
                **candidate,
                'embedding_score': candidate['embedding_score'],
                'llm_match_score': deep_result.get('match_score', 50),
                'final_score': (candidate['embedding_score'] * 0.4 +
                                deep_result.get('match_score', 50) / 100 * 0.6),
                'strengths': deep_result.get('strengths', []),
                'gaps': deep_result.get('gaps', []),
                'recommendation': deep_result.get('recommendation', 'maybe'),
                'interview_questions': deep_result.get('interview_questions', [])
            })

        return sorted(results, key=lambda x: -x['final_score'])


class BiasAuditor:
    """Audit matching results for demographic bias."""

    def audit_demographic_bias(self, match_results: pd.DataFrame) -> dict:
        """Check for differential selection across protected characteristics.

        For each protected column present in the DataFrame, compares the mean
        'final_score' across groups via the Disparate Impact ratio
        (min group mean / max group mean); a ratio >= 0.8 is considered
        acceptable (four-fifths rule). Columns with fewer than two groups
        are skipped. Returns a per-column audit report dict.
        """
        report = {}

        for column in ('gender', 'age_group', 'university_tier'):
            if column not in match_results.columns:
                continue

            stats = match_results.groupby(column)['final_score'].agg(
                ['mean', 'count', 'std']
            )

            # Need at least two groups to compare.
            if len(stats) < 2:
                continue

            means = stats['mean']
            lowest = means.min()
            highest = means.max()
            # Disparate Impact: ratio between groups > 0.8 is considered acceptable
            ratio = lowest / highest if highest > 0 else 1.0
            report[column] = {
                'disparate_impact': round(ratio, 3),
                'passes_threshold': ratio >= 0.8,
                'group_means': means.round(3).to_dict()
            }

        return report

Semantic matching increases quality-of-hire: the percentage of candidates who pass probation grows by 20-30%. Time to fill a position is reduced by 35-40% through accurate primary screening. Bias auditing is a mandatory component for compliance with Fair Hiring requirements and EEOC standards.