AI Data Extraction from Passports and Personal ID Documents
Identification document recognition is an area with special requirements: field accuracy above 99.5% for critical fields (series/number, date of birth), robustness to document wear, support for documents from different countries, and forgery detection.
MRZ — machine readable zone
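The Machine Readable Zone (MRZ) is the block of two lines at the bottom of a passport data page (three lines on TD1 ID cards), protected by ICAO 9303 check digits. It is a reliable entry point: the MRZ contains all key fields, and its check digits allow the OCR result to be verified arithmetically.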
import re
from dataclasses import dataclass
from typing import Optional
@dataclass
class MRZData:
    """Fields decoded from a machine readable zone (MRZ).

    Text fields are stored with the MRZ filler character ('<') removed or
    replaced by spaces, exactly as produced by ``MRZParser.parse_td3``.
    Dates are kept as the raw six-character MRZ strings.
    """

    document_type: str
    issuing_country: str
    surname: str
    given_names: str
    document_number: str
    nationality: str
    date_of_birth: str  # YYMMDD
    sex: str
    expiry_date: str  # YYMMDD
    personal_number: str
    # True only when every check digit in the MRZ matches its field.
    check_digits_valid: bool


class MRZParser:
    """Parser for ICAO 9303 machine readable zones.

    Only the TD3 layout (passports, 2 lines x 44 characters) is implemented
    by this class, via ``parse_td3``. The TD1 layout (ID cards, 3 lines x 30
    characters) is not handled here.
    """

    # Repeating weight pattern applied to successive characters.
    WEIGHTS = [7, 3, 1]

    # Character values: digits map to 0-9 and letters A-Z to 10-35.
    _CHAR_VALUES = {
        c: i for i, c in enumerate('0123456789ABCDEFGHIJKLMNOPQRSTUVWXYZ')
    }
    # The filler character counts as zero in the check-digit sum.
    _CHAR_VALUES['<'] = 0

    _DIGITS = frozenset('0123456789')

    def _check_digit(self, s: str) -> int:
        """Return the ICAO 9303 check digit (0-9) for the string *s*.

        Each character value is multiplied by the repeating weights 7, 3, 1
        and the sum is taken modulo 10. Characters outside the MRZ alphabet
        contribute zero.
        """
        weights = self.WEIGHTS
        total = sum(
            self._CHAR_VALUES.get(c, 0) * weights[i % len(weights)]
            for i, c in enumerate(s)
        )
        return total % 10

    def _check_matches(self, data: str, check_char: str) -> bool:
        """Return True when *check_char* is a digit equal to the check digit of *data*."""
        # A misread (non-digit) check character means "invalid", not a crash.
        if check_char not in self._DIGITS:
            return False
        return self._check_digit(data) == int(check_char)

    def parse_td3(self, line1: str, line2: str) -> Optional[MRZData]:
        """Parse a TD3 (passport) MRZ made of two 44-character lines.

        Args:
            line1: Upper MRZ line (document type, issuing state, name).
            line2: Lower MRZ line (number, nationality, dates, sex,
                optional data and check digits).

        Returns:
            An ``MRZData`` instance, or ``None`` when either line is not
            exactly 44 characters long. Check-digit failures (including
            unreadable, non-digit check characters) do not raise; they are
            reported through ``check_digits_valid=False``.
        """
        if len(line1) != 44 or len(line2) != 44:
            return None

        # Line 1: document type, issuing country and the name field.
        doc_type = line1[0:2].replace('<', '')
        country = line1[2:5]
        name_field = line1[5:44]
        # The primary and secondary identifiers are separated by '<<'.
        if '<<' in name_field:
            surname_raw, given_raw = name_field.split('<<', 1)
        else:
            surname_raw, given_raw = name_field, ''

        # Line 2: fixed-width fields, each followed by its check character.
        doc_num_field = line2[0:9]
        nationality = line2[10:13]
        dob = line2[13:19]
        sex = line2[20]
        expiry = line2[21:27]
        personal_field = line2[28:42]
        personal_check = line2[42]

        # The optional-data check character may itself be a filler when the
        # whole optional field is unused (all fillers).
        if personal_check == '<':
            personal_ok = set(personal_field) <= {'<'}
        else:
            personal_ok = self._check_matches(personal_field, personal_check)

        # The composite check covers the number, birth date and expiry date
        # (each with its check digit) plus the optional data and its check.
        composite_data = line2[0:10] + line2[13:20] + line2[21:43]

        valid = all([
            self._check_matches(doc_num_field, line2[9]),
            self._check_matches(dob, line2[19]),
            self._check_matches(expiry, line2[27]),
            personal_ok,
            self._check_matches(composite_data, line2[43]),
        ])

        return MRZData(
            document_type=doc_type,
            issuing_country=country,
            surname=surname_raw.replace('<', ' ').strip(),
            given_names=given_raw.replace('<', ' ').strip(),
            document_number=doc_num_field.replace('<', ''),
            nationality=nationality,
            date_of_birth=dob,
            sex=sex,
            expiry_date=expiry,
            personal_number=personal_field.replace('<', ''),
            check_digits_valid=valid,
        )
OCR of VIZ (Visual Inspection Zone)
In addition to the MRZ, the visual zone must also be read: the registration address and the place of birth (neither is present in the MRZ of a Russian passport). This is done with region-based OCR combined with a correction dictionary of populated places:
from paddleocr import PaddleOCR
from rapidfuzz import process, fuzz
import json
class PassportVIZExtractor:
    """Extract visual-zone (VIZ) fields from a passport page using OCR.

    Currently only the place of birth is extracted: the line carrying the
    label is located first, then text lines close to it vertically are
    taken as the value and normalized against a directory of place names.
    """

    # Lower-case label variants that mark the "place of birth" caption.
    BIRTH_PLACE_LABELS = ('birthplace', 'place of birth', 'birth place')
    # Maximum vertical distance between the label and its value, in the
    # coordinate units returned by the OCR engine.
    LABEL_Y_TOLERANCE = 50
    # Fuzzy-match score that must be exceeded to accept a directory entry.
    MATCH_SCORE_THRESHOLD = 70

    def __init__(self, region_dict_path: str):
        """Create the OCR engine and load the place-name directory.

        Args:
            region_dict_path: Path to a JSON file with the list of
                regions/cities used for normalization.
        """
        self.ocr = PaddleOCR(
            use_angle_cls=True, lang='en',
            det_model_dir='models/det/',
            rec_model_dir='models/rec/'  # fine-tuned on passports
        )
        # Explicit encoding: do not depend on the platform default when the
        # directory contains non-ASCII place names.
        with open(region_dict_path, encoding='utf-8') as f:
            self.regions = json.load(f)  # list of regions/cities

    def _normalize_place(self, text: str) -> str:
        """Return the best directory match for *text*, or *text* itself."""
        best = process.extractOne(
            text, self.regions, scorer=fuzz.token_sort_ratio
        )
        # extractOne yields None when there is nothing to match against.
        if best is None:
            return text
        match, score, _ = best
        return match if score > self.MATCH_SCORE_THRESHOLD else text

    def extract_fields(self, page_image) -> dict:
        """Run OCR on *page_image* and return the recognized VIZ fields.

        Returns:
            A dict that may contain ``birth_place_label_y`` (vertical
            position of the label line), ``birth_place_raw`` (OCR text of
            the value) and ``birth_place_normalized`` (directory match, or
            the raw text when no entry scores above the threshold). An
            empty dict is returned when OCR finds no text.
        """
        result = self.ocr.ocr(page_image, cls=True)
        if not result or not result[0]:
            return {}

        # Order lines top to bottom by the y coordinate of the first corner
        # of each detected box.
        lines = sorted(
            ((r[0][0][1], r[1][0]) for r in result[0]),
            key=lambda item: item[0],
        )

        fields = {}
        for y_pos, text in lines:
            # Collapse whitespace so OCR spacing does not break the match.
            normalized_text = ' '.join(text.lower().split())
            if any(label in normalized_text for label in self.BIRTH_PLACE_LABELS):
                fields['birth_place_label_y'] = y_pos
                continue

            label_y = fields.get('birth_place_label_y')
            if label_y is None:
                continue
            # A later line inside the window overwrites an earlier one.
            if abs(y_pos - label_y) < self.LABEL_Y_TOLERANCE:
                fields['birth_place_raw'] = text
                # Normalization via fuzzy matching against the directory.
                fields['birth_place_normalized'] = self._normalize_place(text)

        return fields
Forgery Detection (Basic Level)
import numpy as np
import cv2
def detect_basic_tampering(
    image: np.ndarray,
    *,
    jpeg_quality: int = 90,
    sigma: float = 3.0,
    ratio_threshold: float = 0.05,
) -> dict:
    """Flag possible tampering using Error Level Analysis (ELA).

    The image is re-encoded as JPEG and compared with the input. Pixels
    whose recompression error is far above the image-wide average are
    counted as anomalous. This is the only indicator implemented here.

    Args:
        image: Colour image in BGR channel order; it must be convertible
            with ``cv2.COLOR_BGR2GRAY`` (presumably 8-bit, as required for
            JPEG encoding - confirm against the caller).
        jpeg_quality: JPEG quality used for the recompression step.
        sigma: Number of standard deviations above the mean ELA value at
            which a pixel is considered anomalous.
        ratio_threshold: Fraction of anomalous pixels above which the
            image is reported as suspicious.

    Returns:
        A dict with ``ela_anomaly_ratio`` (float), ``suspicious`` (bool)
        and ``ela_map`` (the grayscale ELA image).

    Raises:
        ValueError: If *image* is ``None`` or empty, or if the JPEG
            round-trip fails.
    """
    if image is None or getattr(image, 'size', 0) == 0:
        raise ValueError('image must be a non-empty BGR array')

    # Recompress in memory: no temporary file to create or clean up.
    ok, encoded = cv2.imencode(
        '.jpg', image, [cv2.IMWRITE_JPEG_QUALITY, jpeg_quality]
    )
    if not ok:
        raise ValueError('JPEG encoding of the image failed')
    recompressed = cv2.imdecode(encoded, cv2.IMREAD_COLOR)
    if recompressed is None:
        raise ValueError('JPEG decoding of the recompressed image failed')

    # Error Level Analysis: detect areas with different compression.
    ela = cv2.absdiff(image, recompressed)
    ela_gray = cv2.cvtColor(ela, cv2.COLOR_BGR2GRAY)

    # High ELA regions are potential overlays.
    high_ela_mask = ela_gray > ela_gray.mean() + sigma * ela_gray.std()
    tamper_ratio = float(high_ela_mask.mean())

    return {
        'ela_anomaly_ratio': tamper_ratio,
        # Plain bool so the result can be serialized without numpy types.
        'suspicious': tamper_ratio > ratio_threshold,
        'ela_map': ela_gray,
    }
Accuracy on MIDV-2020 Benchmark
| Field | Extraction Accuracy | Method |
|---|---|---|
| MRZ (all fields) | 99.8% | MRZ OCR + check digits |
| Series/number (RF passport) | 99.3% | PaddleOCR fine-tuned |
| Date of birth | 99.1% | MRZ + VIZ cross-check |
| Full name | 97.8% | VIZ + BERT NER |
| Address of residence | 94.2% | VIZ + FIAS directory |
Timeline
| Task | Timeline |
|---|---|
| MRZ + basic fields (RF/EU passports) | 2–4 weeks |
| Multi-document system (10+ types) | 6–9 weeks |
| System with forgery detection and liveness | 10–16 weeks |







