AI-based Data Extraction from Invoices and Invoice Documents
Manual invoice processing is a typical pain point: accounting staff spend 3–8 minutes per document and manual entry error rates run at 1–3%, which becomes significant at volumes of 500+ invoices per month. A Document AI solution reduces processing time to 5–15 seconds per document with field accuracy >98% on standard formats.
Document AI Architecture for Invoices
Three approaches in order of complexity and accuracy:
1. Rule-based + OCR — for fixed templates (one supplier, always the same layout). Fast, cheap, breaks on any template change.
2. LayoutLM/DocTR — accounts for spatial location of text, works on variable templates.
3. Multimodal LLM (GPT-4V, Claude Vision, Gemini) — understands arbitrary formats with high accuracy, but the per-document cost is higher.
LayoutLMv3 — production solution
from transformers import (
LayoutLMv3Processor,
LayoutLMv3ForTokenClassification
)
import torch
from PIL import Image
# BIO tag set for invoice token classification.
# The position of each tag in this list is its class id, so the order must
# stay in sync with the checkpoint the model was fine-tuned with.
# NOTE(review): DUE_DATE and LINE_ITEM_AMOUNT have a B- tag but no I- tag,
# so multi-token values for those fields cannot be tagged as a continuation
# — confirm this is intentional.
LABEL_LIST = [
    'O',  # token does not belong to any field
    'B-INVOICE_NUMBER',
    'I-INVOICE_NUMBER',
    'B-INVOICE_DATE',
    'I-INVOICE_DATE',
    'B-DUE_DATE',
    'B-VENDOR_NAME',
    'I-VENDOR_NAME',
    'B-VENDOR_ADDRESS',
    'I-VENDOR_ADDRESS',
    'B-TOTAL_AMOUNT',
    'I-TOTAL_AMOUNT',
    'B-TAX_AMOUNT',
    'I-TAX_AMOUNT',
    'B-LINE_ITEM_DESC',
    'I-LINE_ITEM_DESC',
    'B-LINE_ITEM_AMOUNT',
]

# Lookup tables in both directions: class id -> tag and tag -> class id.
ID2LABEL = dict(enumerate(LABEL_LIST))
LABEL2ID = {tag: class_id for class_id, tag in ID2LABEL.items()}
class InvoiceExtractor:
    """Extract invoice fields from a page image with a LayoutLMv3 token classifier.

    The processor runs OCR on the image, the model assigns one BIO tag from
    ``LABEL_LIST`` to every token, and the tagged tokens are merged back into
    a ``{FIELD_NAME: text}`` dictionary.
    """

    def __init__(self, model_path: str, device: str | None = None):
        """Load the processor and the model from ``model_path``.

        Args:
            model_path: Location handed to ``from_pretrained`` for both the
                processor and the model.
            device: Torch device string to run inference on. When omitted,
                'cuda' is used if available and 'cpu' otherwise, so the class
                no longer crashes on hosts without a GPU.
        """
        if device is None:
            device = 'cuda' if torch.cuda.is_available() else 'cpu'
        self.device = device
        self.processor = LayoutLMv3Processor.from_pretrained(
            model_path, apply_ocr=True  # built-in OCR via Tesseract
        )
        self.model = LayoutLMv3ForTokenClassification.from_pretrained(
            model_path,
            num_labels=len(LABEL_LIST),
            id2label=ID2LABEL,
            label2id=LABEL2ID,
        ).eval().to(self.device)

    @torch.no_grad()
    def extract(self, image_path: str) -> dict:
        """Run OCR and token classification on one image and return the fields.

        Args:
            image_path: Path of the invoice image to process.

        Returns:
            Mapping of field name (the tag without its ``B-``/``I-`` prefix)
            to the decoded text. If a field is predicted more than once, the
            last occurrence wins. Input is truncated to 512 tokens, so text
            beyond that limit is not seen by the model.
        """
        # convert() returns a detached copy, so the file handle can be closed.
        with Image.open(image_path) as source_image:
            image = source_image.convert('RGB')

        encoding = self.processor(
            image,
            return_tensors='pt',
            truncation=True,
            max_length=512,
        ).to(self.device)
        outputs = self.model(**encoding)

        # A single image is encoded, so row 0 is the whole batch. Indexing is
        # used instead of squeeze() so a one-token sequence keeps its axis.
        label_ids = outputs.logits.argmax(dim=-1)[0].tolist()
        tokens = self.processor.tokenizer.convert_ids_to_tokens(
            encoding['input_ids'][0].tolist()
        )
        return self._decode_fields(tokens, label_ids)

    def _decode_fields(self, tokens: list, label_ids: list) -> dict:
        """Merge per-token BIO predictions into a ``{field: text}`` dict."""
        # Ask the tokenizer for its own special tokens instead of hard-coding
        # BERT-style names, which this tokenizer does not use.
        special_tokens = set(self.processor.tokenizer.all_special_tokens)

        fields = {}
        current_field = None
        current_tokens = []

        for token, label_id in zip(tokens, label_ids):
            if token in special_tokens:
                continue
            prefix, _, field_name = ID2LABEL[label_id].partition('-')
            if prefix == 'B':
                self._store_field(fields, current_field, current_tokens)
                current_field = field_name
                current_tokens = [token]
            elif prefix == 'I' and field_name == current_field:
                current_tokens.append(token)
            else:
                # 'O', an orphan I- tag, or an I- tag of a different field
                # all terminate the field that is currently open.
                self._store_field(fields, current_field, current_tokens)
                current_field = None
                current_tokens = []

        # The sequence may end while a field is still open.
        self._store_field(fields, current_field, current_tokens)
        return fields

    def _store_field(self, fields: dict, field_name, field_tokens: list) -> None:
        """Write the collected tokens of ``field_name`` into ``fields``, if any."""
        if field_name and field_tokens:
            fields[field_name] = self._tokens_to_text(field_tokens)

    def _tokens_to_text(self, tokens: list) -> str:
        """Join tokenizer tokens back into a stripped plain-text string."""
        text = self.processor.tokenizer.convert_tokens_to_string(tokens)
        return text.strip()
Post-processing and Validation
Raw model output requires normalization: amounts with different separators, date formats, INN/VAT numbers.
import re
from datetime import datetime
from decimal import Decimal
class InvoiceFieldValidator:
def validate_and_normalize(self, raw_fields: dict) -> dict:
validated = {}
# Amount: '1.234,56 €' → Decimal('1234.56')
if 'TOTAL_AMOUNT' in raw_fields:
validated['total_amount'] = self._parse_amount(
raw_fields['TOTAL_AMOUNT']
)
# Date: various formats → ISO 8601
if 'INVOICE_DATE' in raw_fields:
validated['invoice_date'] = self._parse_date(
raw_fields['INVOICE_DATE']
)
# Invoice number — minimal validation (not empty, alphanumeric)
if 'INVOICE_NUMBER' in raw_fields:
inv_num = re.sub(r'\s+', '', raw_fields['INVOICE_NUMBER'])
validated['invoice_number'] = inv_num if inv_num else None
return validated
def _parse_amount(self, text: str) -> Decimal | None:
# Remove currency symbols and spaces
cleaned = re.sub(r'[€$£₽\s]', '', text)
# Normalize separators
if re.match(r'^\d{1,3}(\.\d{3})*,\d{2}$', cleaned):
# European format: 1.234,56
cleaned = cleaned.replace('.', '').replace(',', '.')
elif re.match(r'^\d{1,3}(,\d{3})*\.\d{2}$', cleaned):
# American: 1,234.56
cleaned = cleaned.replace(',', '')
try:
return Decimal(cleaned)
except Exception:
return None
def _parse_date(self, text: str) -> str | None:
formats = ['%d.%m.%Y', '%d/%m/%Y', '%Y-%m-%d',
'%d %b %Y', '%B %d, %Y', '%d.%m.%y']
for fmt in formats:
try:
return datetime.strptime(text.strip(), fmt).date().isoformat()
except ValueError:
continue
return None
Accuracy by Field Type (Production Data)
| Field | LayoutLMv3 | GPT-4V | Rule-based |
|---|---|---|---|
| Invoice number | 97.3% | 99.1% | 99.8%* |
| Date | 96.8% | 98.7% | 98.2%* |
| Total amount | 95.1% | 98.4% | 96.5%* |
| Line items | 88.4% | 94.2% | 40%* |
| Vendor address | 91.2% | 96.8% | 72%* |
*only for fixed templates
Timeline
| Task | Timeline |
|---|---|
| Setting up DocTR/AWS Textract for standard formats | 1–2 weeks |
| Fine-tuning LayoutLMv3 on enterprise dataset | 4–6 weeks |
| Complete system with ERP integration | 6–10 weeks |







