AI Integration into Document Management Systems
Electronic document management systems — EDM, ECM, EDMS — store documents but don't understand their content. An employee receives a scanned contract → manually enters requisites → selects document type → assigns an approval route. AI integration automates this entire process: the document enters the system, AI reads it, extracts requisites, classifies it, creates a card, and launches the required route.
AI Layer Architecture for EDM Systems
[Incoming Document]
PDF/scan/DOCX/email
↓
[Document Preprocessor]
OCR (Tesseract/Google Cloud Vision) → normalized text
↓
[AI Processing Pipeline]
├── Classification: document type
├── NER: counterparty, dates, amounts, requisites
├── Summary: brief content summary
└── Routing: approval route determination
↓
[EDM API]
Card creation + workflow launch
Document Classification
from transformers import pipeline, AutoTokenizer, AutoModelForSequenceClassification
import torch
class DocumentClassifier:
    """Classifies an incoming document's text into one of the known EDM types."""

    DOCUMENT_TYPES = [
        "contract", "invoice", "delivery_note", "act",
        "order", "memo", "commercial_proposal",
        "power_of_attorney", "charter", "protocol", "incoming_letter"
    ]

    def __init__(self, model_path: str = "cointegrated/rubert-tiny2"):
        # For production — fine-tuned BERT on company document corpus
        self.tokenizer = AutoTokenizer.from_pretrained(model_path)
        self.model = AutoModelForSequenceClassification.from_pretrained(
            model_path,
            num_labels=len(self.DOCUMENT_TYPES)
        )
        self.model.eval()

    def classify(self, text: str) -> dict:
        """Return the predicted type with its confidence plus runner-up candidates.

        Only the document head is encoded (char pre-cut, then tokenizer
        truncation to 512 tokens) — the header carries the discriminative wording.
        """
        encoded = self.tokenizer(
            text[:2000],
            return_tensors="pt",
            truncation=True,
            max_length=512,
            padding=True,
        )
        with torch.no_grad():
            scores = torch.softmax(self.model(**encoded).logits, dim=-1)[0]

        best = scores.argmax().item()
        runners_up = []
        for idx in scores.topk(3).indices.tolist():
            if idx == best:
                continue
            runners_up.append(
                {"type": self.DOCUMENT_TYPES[idx], "score": float(scores[idx])}
            )
        return {
            "type": self.DOCUMENT_TYPES[best],
            "confidence": float(scores[best]),
            "alternatives": runners_up,
        }
Requisite Extraction via NER + LLM
For structured documents (invoices, contracts), a combination works better: NER for fast standard field extraction + LLM for complex cases:
from langchain_openai import ChatOpenAI
import re
from datetime import datetime
class DocumentExtractor:
    """Extracts document requisites: a fast regex pass plus an LLM for the rest.

    Regex handles the deterministic fields (INN, total amount) directly from
    the source text; the LLM fills in names, dates and free-form fields.
    """

    EXTRACTION_PROMPT = """Extract document requisites.
Document text:
{text}
Document type: {doc_type}
Extract (return null if not found):
- contractor_name: counterparty name
- contractor_inn: counterparty INN
- contract_number: contract/invoice number
- contract_date: document date (ISO 8601)
- total_amount: amount (number)
- currency: currency (RUB/USD/EUR)
- payment_deadline: payment deadline (if present)
- subject: contract subject (1-2 sentences)
- signatory: signatory from counterparty side
Return JSON."""

    def __init__(self):
        self.llm = ChatOpenAI(model="gpt-4o-mini", temperature=0)

    def extract_requisites(self, text: str, doc_type: str) -> dict:
        """Return a merged dict of requisites for `text` of the given `doc_type`."""
        # First quick regex-based extraction
        fast_extract = self._regex_extract(text)
        # LLM for missing fields and validation
        llm_result = self.llm.invoke(
            self.EXTRACTION_PROMPT.format(
                text=text[:3000],
                doc_type=doc_type
            )
        )
        llm_data = self._parse_llm_json(llm_result.content)
        # Merge: regex values win — they are exact matches from the source text
        return {**llm_data, **fast_extract}

    @staticmethod
    def _parse_llm_json(content: str) -> dict:
        """Parse the LLM reply, tolerating the ```json fences models often add.

        FIX: the previous json.loads(llm_result.content) crashed the whole
        pipeline whenever the model wrapped its answer in a markdown code block.
        On unparseable output we degrade to regex-only fields instead of raising.
        """
        import json
        payload = content.strip()
        if payload.startswith("```"):
            payload = payload.strip("`").strip()
            if payload.lower().startswith("json"):
                payload = payload[4:]
        try:
            return json.loads(payload)
        except json.JSONDecodeError:
            return {}  # degrade gracefully; regex fields still get through

    @staticmethod
    def _normalize_amount(raw: str) -> float:
        """Convert '1 000 000,50' or '1,234.56' style strings to a float.

        FIX: blanket replace(',', '.') turned '1,234.56' into '1.234.56'
        and raised ValueError. Decide the comma's role first.
        """
        s = re.sub(r'\s', '', raw)
        if ',' in s and '.' in s:
            s = s.replace(',', '')   # comma is a thousands separator
        else:
            s = s.replace(',', '.')  # comma is a decimal separator
        return float(s)

    def _regex_extract(self, text: str) -> dict:
        """Deterministic extraction of INN and total amount from raw text."""
        result = {}
        # INN: Russian tax id — 10 (company) or 12 (individual person) digits
        inn_match = re.search(r'\bINN[:\s]*(\d{10,12})\b', text)
        if inn_match:
            result["contractor_inn"] = inn_match.group(1)
        # Amounts followed by a currency marker
        amount_match = re.search(
            r'(\d[\d\s,]*\.?\d*)\s*(rub|rubles|RUB|USD|EUR)',
            text, re.IGNORECASE
        )
        if amount_match:
            try:
                result["total_amount"] = self._normalize_amount(amount_match.group(1))
            except ValueError:
                pass  # malformed number — leave the field to the LLM pass
        return result
Integration with Popular EDM Systems
class SEDIntegration:
    """Integration with 1C:Document Management, Directum, DocsVision."""

    REQUEST_TIMEOUT = 30  # seconds; EDM APIs can be slow on large file uploads

    def push_to_directum(self, extracted: dict, original_file: bytes) -> dict:
        """Create a Directum document card, fill its properties, start a workflow.

        Expects `self.directum_url` and `self.token` to be configured.
        Returns {"doc_id": ..., "route": ...}.
        Raises requests.HTTPError on any failed API call (previously a failed
        upload surfaced as an opaque KeyError on the missing "id" field).
        """
        import requests
        headers = {"Authorization": f"Bearer {self.token}"}

        # 1. Upload the original file
        upload_response = requests.post(
            f"{self.directum_url}/api/v1/documents",
            headers=headers,
            files={"file": original_file},
            timeout=self.REQUEST_TIMEOUT,
        )
        upload_response.raise_for_status()
        doc_id = upload_response.json()["id"]

        # 2. Fill in card properties
        card_response = requests.patch(
            f"{self.directum_url}/api/v1/documents/{doc_id}/properties",
            headers=headers,
            json={
                "DocumentType": extracted["type"],
                "Counterparty": extracted.get("contractor_name"),
                "INN": extracted.get("contractor_inn"),
                "Amount": extracted.get("total_amount"),
                "DocumentDate": extracted.get("contract_date"),
                "Subject": extracted.get("subject"),
            },
            timeout=self.REQUEST_TIMEOUT,
        )
        card_response.raise_for_status()

        # 3. Launch approval route
        route = self._determine_route(extracted)
        workflow_response = requests.post(
            f"{self.directum_url}/api/v1/documents/{doc_id}/workflow/{route}",
            headers=headers,
            timeout=self.REQUEST_TIMEOUT,
        )
        workflow_response.raise_for_status()
        return {"doc_id": doc_id, "route": route}

    def _determine_route(self, extracted: dict) -> str:
        """Pick an approval route from document type and amount.

        FIX: `total_amount` may be present but None (the extractor returns
        null when no amount is found); `.get("total_amount", 0)` then raised
        TypeError on `None > 1_000_000`. Treat missing and None as zero.
        """
        amount = extracted.get("total_amount") or 0
        doc_type = extracted.get("type", "")
        if doc_type == "contract":
            if amount > 1_000_000:
                return "contract_large"      # director + lawyer + finance
            if amount > 100_000:
                return "contract_medium"     # manager + lawyer
            return "contract_standard"       # manager only
        if doc_type == "invoice":
            return "invoice_approval"
        return "standard"
Case study: manufacturing company, 500 incoming documents per month. Before implementation: 2 operators spent 40% of their work time on manual requisite entry. After: automatic requisite extraction accuracy of 94% (verified on 1000 documents), 89% of documents processed without operator involvement; operators handle only exceptions (confidence < 0.8) and disputed routing decisions. Processing time per incoming document: 8 minutes → 45 seconds.
Timeline
- Classifier + requisite extractor: 3–4 weeks
- Integration with specific EDM system: 2–3 weeks
- Fine-tuning models on client documents: 1–2 additional weeks







