AI Data Labeling Platform Development
A data labeling platform is not just Label Studio with a "deploy" button. It is a task queue management system, quality control via IAA (Inter-Annotator Agreement), automatic pre-annotation with weak models, and a closed active learning loop in which the model itself requests the most informative examples.
Platform Architecture
[Raw Data Sources]
↓
[Ingestion & Preprocessing] ← format conversion, deduplication
↓
[Pre-annotation (weak models)] ← save 40-70% of manual work
↓
[Task Queue Management] ← distribute between annotators
↓
[Annotation Interface] ← Label Studio / custom UI
↓
[Quality Control] ← IAA, gold standard, review pipeline
↓
[Export & Model Training] ← JSONL, COCO, YOLO, HuggingFace datasets
↓
[Active Learning Loop] ← model requests difficult examples
Task and Annotator Management
import logging
import uuid
from dataclasses import dataclass, field
from datetime import datetime
from enum import Enum
from typing import Optional

import numpy as np
import pandas as pd
from anthropic import Anthropic
class TaskStatus(Enum):
    """Status values an annotation task can carry.

    Each member's value is its lowercase string form, so a member can be
    looked up from that string with ``TaskStatus("pending")``.
    """

    PENDING = "pending"
    PRE_ANNOTATED = "pre_annotated"
    IN_REVIEW = "in_review"
    COMPLETED = "completed"
    DISPUTED = "disputed"
@dataclass
class AnnotationTask:
    """One item to be annotated, together with its labeling state.

    Only ``task_id``, ``data`` and ``task_type`` are required; every other
    field has a default so a task can be created straight from raw input.
    """

    task_id: str
    data: dict  # raw data (text, image_url, etc.)
    task_type: str  # classification, ner, segmentation
    # Fresh list per instance (default_factory), never shared between tasks.
    annotations: list = field(default_factory=list)
    # Defaults to None, so the annotation is Optional rather than a bare dict.
    pre_annotation: Optional[dict] = None
    status: TaskStatus = TaskStatus.PENDING
    assigned_to: list = field(default_factory=list)
    # datetime.now() without a tz argument: a naive, local-time timestamp.
    created_at: datetime = field(default_factory=datetime.now)
    # 0.5 is the default kept by tasks that never receive an estimated score.
    difficulty_score: float = 0.5
class AnnotationPlatform:
    """Creates annotation tasks and orders them by LLM-estimated difficulty."""

    # How many leading tasks are included in the single difficulty-rating prompt.
    _DIFFICULTY_SAMPLE_SIZE = 20

    def __init__(self, db_connection):
        """Store the database handle and create the LLM client.

        Args:
            db_connection: Handle kept on ``self.db``; it is not used by the
                methods in this block.
        """
        self.db = db_connection
        self.llm = Anthropic()
        self.quality_threshold = 0.8  # Minimum IAA
        self.annotators_per_task = 2

    def ingest_data(self, raw_data: list[dict], task_type: str) -> list[AnnotationTask]:
        """Wrap raw items in tasks and return them sorted easiest-first.

        Args:
            raw_data: One dict per item; stored unchanged as ``task.data``.
            task_type: Task type assigned to every created task.

        Returns:
            The new tasks sorted by ascending ``difficulty_score``. Only the
            first ``_DIFFICULTY_SAMPLE_SIZE`` items are rated by the LLM; the
            rest keep the default score, and the sort is stable, so equally
            scored tasks stay in input order.
        """
        tasks = [
            AnnotationTask(task_id=str(uuid.uuid4()), data=item, task_type=task_type)
            for item in raw_data
        ]
        # Preliminary difficulty estimation
        tasks = self._estimate_difficulty(tasks)
        # Prioritization: easy tasks first for quick start
        tasks.sort(key=lambda t: t.difficulty_score)
        return tasks

    def _estimate_difficulty(self, tasks: list[AnnotationTask]) -> list[AnnotationTask]:
        """Ask the LLM to rate the first tasks and store the scores in place.

        The reply is expected to be comma-separated numbers. Parsing is
        all-or-nothing: if any token is not a number the reply is ignored and
        every task keeps its current score (a warning is logged).

        Args:
            tasks: Tasks whose ``data['text']`` (first 200 characters) is rated.

        Returns:
            The same list object, with ``difficulty_score`` updated on the
            rated tasks and clamped to the 0-1 range the prompt asks for.
        """
        sampled = tasks[:self._DIFFICULTY_SAMPLE_SIZE]
        # `or ''` keeps a None/missing text from breaking the slice.
        sample_texts = [str(t.data.get('text') or '')[:200] for t in sampled]
        if not any(sample_texts):
            return tasks

        text_list = "\n".join(f"{i}. {t}" for i, t in enumerate(sample_texts, start=1))
        prompt = f"""Rate the annotation difficulty of these texts (0-1, where 1 is hardest).
Consider: ambiguity, domain specificity, length complexity.
Texts:
{text_list}
Return only comma-separated scores, e.g.: 0.3, 0.7, 0.5..."""
        response = self.llm.messages.create(
            model="claude-3-5-sonnet-20241022",
            max_tokens=300,
            messages=[{"role": "user", "content": prompt}],
        )

        try:
            reply = response.content[0].text
            # The prompt's own example ends in "...", so tolerate trailing dots
            # on a token and a stray leading/trailing comma on the reply.
            scores = [
                float(token.strip().rstrip('.'))
                for token in reply.strip().strip(',').split(',')
            ]
        except (AttributeError, IndexError, TypeError, ValueError) as exc:
            logging.getLogger(__name__).warning(
                "Could not parse difficulty scores from LLM reply: %s", exc
            )
            return tasks

        # zip() stops at the shorter side: surplus scores can never land on
        # tasks that were not part of the prompt.
        for task, score in zip(sampled, scores):
            task.difficulty_score = min(1.0, max(0.0, score))
        return tasks
Quality Control via IAA
def compute_iaa(self, annotations: list[dict], task_type: str) -> float:
    """Return the inter-annotator agreement for one task's annotations.

    The scorer is chosen by ``task_type``:
    - ``'classification'`` -> ``self._cohen_kappa``
    - ``'ner'`` -> ``self._ner_agreement``
    - any other value -> ``self._pairwise_agreement``

    With fewer than two annotations there is nothing to compare and the
    result is 1.0.
    """
    if len(annotations) < 2:
        return 1.0

    # Resolve the scorer only for the branch actually taken.
    if task_type == 'classification':
        scorer = self._cohen_kappa
    elif task_type == 'ner':
        scorer = self._ner_agreement
    else:
        scorer = self._pairwise_agreement
    return scorer(annotations)
def _cohen_kappa(self, annotations: list[dict]) -> float:
"""Cohen's Kappa for classification"""
from sklearn.metrics import cohen_kappa_score
if len(annotations) == 2:
labels_a = [a['label'] for a in annotations[0]['items']]
labels_b = [a['label'] for a in annotations[1]['items']]
if len(labels_a) != len(labels_b):
return 0.0
try:
return cohen_kappa_score(labels_a, labels_b)
except Exception:
return 0.0
return 0.5 # Default for >2 annotators (need Fleiss kappa)
def _ner_agreement(self, annotations: list[dict]) -> float:
"""F1 agreement for named entities"""
if len(annotations) < 2:
return 1.0
spans_a = set(
(e['start'], e['end'], e['label'])
for e in annotations[0].get('entities', [])
)
spans_b = set(
(e['start'], e['end'], e['label'])
for e in annotations[1].get('entities', [])
)
if not spans_a and not spans_b:
return 1.0
intersection = spans_a & spans_b
if not intersection:
return 0.0
precision = len(intersection) / len(spans_b)
recall = len(intersection) / len(spans_a)
f1 = 2 * precision * recall / (precision + recall) if (precision + recall) > 0 else 0
return f1
def review_disputed_task(self, task: AnnotationTask,
                         annotations: list[dict]) -> dict:
    """Ask the LLM to arbitrate between the first two annotations of a task.

    The prompt contains the task type, the first 500 characters of
    ``task.data['text']`` and the first two entries of ``annotations``.

    Returns:
        A dict with the raw LLM reply under ``'resolution'``, the fixed
        marker ``'llm_arbitration'`` under ``'resolved_by'`` and the task's
        id under ``'task_id'``.

    Raises:
        IndexError: If ``annotations`` has fewer than two entries.
    """
    send = self.llm.messages.create
    prompt = f"""You are a senior annotation expert. Resolve this labeling dispute.
Task type: {task.task_type}
Text: {task.data.get('text', '')[:500]}
Annotator A: {annotations[0]}
Annotator B: {annotations[1]}
Provide:
1. Correct annotation
2. Brief reasoning (1-2 sentences)
3. Guideline clarification needed (if any)"""
    user_message = {"role": "user", "content": prompt}
    reply = send(
        model="claude-3-5-sonnet-20241022",
        max_tokens=400,
        messages=[user_message],
    )

    verdict = {}
    verdict['resolution'] = reply.content[0].text
    verdict['resolved_by'] = 'llm_arbitration'
    verdict['task_id'] = task.task_id
    return verdict
Automatic Pre-annotation
class PreAnnotationEngine:
    """Pre-annotation to reduce annotator workload.

    Models are loaded lazily on first use and kept on ``self.weak_model``.
    A model assigned to ``weak_model`` by the caller before the first call
    is used as-is by whichever method is called.
    """

    # Entity types requested from the NER model when the caller gives none.
    DEFAULT_ENTITY_TYPES = ("person", "organization", "location", "date", "product")

    def __init__(self, task_type: str, device=0):
        """Create an engine without loading any model.

        Args:
            task_type: Stored on the instance; not used for dispatch here.
            device: Passed to ``transformers.pipeline`` when the zero-shot
                classifier is loaded. The default 0 keeps the previous
                behaviour; pass another value (e.g. -1) on hosts without a
                GPU.
        """
        self.task_type = task_type
        self.weak_model = None
        self.confidence_threshold = 0.85  # Only high-confidence labels accepted without review
        self.device = device
        # Which method loaded `weak_model`; None while nothing has been
        # loaded or when the caller supplied the model.
        self._weak_model_kind = None

    def _get_model(self, kind: str, loader):
        """Return a model suitable for ``kind``, loading it when necessary.

        A model loaded for the other task kind is replaced, so the two
        public methods can be used on the same engine. Alternating between
        them reloads the model each time.
        """
        if self.weak_model is None or self._weak_model_kind not in (None, kind):
            self.weak_model = loader()
            self._weak_model_kind = kind
        return self.weak_model

    def _load_zero_shot_classifier(self):
        """Build the zero-shot classification pipeline."""
        # Imported lazily so the dependency is only needed when a model is loaded.
        from transformers import pipeline
        return pipeline(
            "zero-shot-classification",
            model="facebook/bart-large-mnli",
            device=self.device,
        )

    def _load_gliner(self):
        """Load the GLiNER model."""
        from gliner import GLiNER
        return GLiNER.from_pretrained("urchade/gliner_multi-v2.1")

    def pre_annotate_classification(self, texts: list[str],
                                    labels: list[str]) -> list[dict]:
        """Zero-shot classification via NLI.

        Args:
            texts: Texts to classify; processed in batches of 32.
            labels: Candidate labels offered to the classifier.

        Returns:
            One dict per text, in input order, with the top ``'label'``, its
            ``'confidence'`` and ``'auto_accepted'`` (confidence at or above
            ``self.confidence_threshold``). Empty input returns ``[]``
            without loading the model.
        """
        if not texts:
            return []

        classifier = self._get_model('classification', self._load_zero_shot_classifier)
        results = []
        batch_size = 32
        for start in range(0, len(texts), batch_size):
            batch = texts[start:start + batch_size]
            preds = classifier(batch, candidate_labels=labels, batch_size=batch_size)
            if isinstance(preds, dict):
                # A lone prediction may come back as a dict rather than a list.
                preds = [preds]
            for pred in preds:
                confidence = pred['scores'][0]
                results.append({
                    'label': pred['labels'][0],
                    'confidence': confidence,
                    'auto_accepted': confidence >= self.confidence_threshold,
                })
        return results

    def pre_annotate_ner(self, texts: list[str], entity_types=None) -> list[dict]:
        """NER via GLiNER (general NER).

        Args:
            texts: Texts to tag, one model call per text.
            entity_types: Entity type names to request; defaults to
                ``DEFAULT_ENTITY_TYPES``.

        Returns:
            One dict per text with its ``'entities'`` (start, end, label,
            confidence) and ``'auto_accepted'``. A text is auto-accepted only
            when at least one entity was found and every entity meets
            ``self.confidence_threshold``; a text with no detected entities
            carries no confidence evidence and is left for review. Empty
            input returns ``[]`` without loading the model.
        """
        if not texts:
            return []

        wanted = list(self.DEFAULT_ENTITY_TYPES if entity_types is None else entity_types)
        model = self._get_model('ner', self._load_gliner)
        results = []
        for text in texts:
            entities = model.predict_entities(text, wanted)
            results.append({
                'entities': [
                    {'start': e['start'], 'end': e['end'],
                     'label': e['label'], 'confidence': e['score']}
                    for e in entities
                ],
                # bool(entities) guards against all() being vacuously True.
                'auto_accepted': bool(entities) and all(
                    e['score'] >= self.confidence_threshold for e in entities
                ),
            })
        return results
Active Learning Loop
class ActiveLearningLoop:
    """Smart selection of next tasks for annotation"""

    def select_informative_samples(self, unlabeled_pool: list[dict],
                                   current_model,
                                   strategy: str = 'uncertainty',
                                   budget: int = 100) -> list[int]:
        """Pick indices of pool items to annotate next.

        Strategies:
        - uncertainty: the ``budget`` items with the highest predictive
          entropy, taken from ``current_model.predict_proba(texts)``.
        - diversity: greedy farthest-point (core-set) selection over
          ``current_model.encode(texts)``, started from a random item.
        - any other value (including ``'hybrid'``, which is not implemented):
          the first ``budget`` indices of the pool.

        Args:
            unlabeled_pool: Items with an optional ``'text'`` key.
            current_model: Object providing ``predict_proba`` and/or
                ``encode`` for the chosen strategy.
            strategy: Selection strategy name.
            budget: Maximum number of indices to return.

        Returns:
            At most ``min(budget, len(unlabeled_pool))`` distinct indices as
            plain ints. For ``uncertainty`` they are ordered by ascending
            entropy (most uncertain last). An empty pool or a non-positive
            budget yields ``[]``.
        """
        pool_size = len(unlabeled_pool)
        # Clamp once so no branch can over-select, and so budget == 0 does
        # not turn the slice [-0:] into "everything".
        budget = min(budget, pool_size)
        if budget <= 0:
            return []

        texts = [item.get('text', '') for item in unlabeled_pool]

        if strategy == 'uncertainty':
            probs = np.asarray(current_model.predict_proba(texts))
            # Highest entropy = highest uncertainty
            entropy = -np.sum(probs * np.log(probs + 1e-10), axis=1)
            return np.argsort(entropy)[-budget:].tolist()

        if strategy == 'diversity':
            # Core-set: maximally diverse examples
            embeddings = np.asarray(current_model.encode(texts))  # if encoder available
            first = int(np.random.randint(pool_size))
            selected = [first]
            # Running distance from every item to its nearest selected item;
            # updated incrementally instead of recomputed for all pairs.
            nearest = np.linalg.norm(embeddings - embeddings[first], axis=1)
            nearest[first] = -np.inf  # never pick an already selected item
            for _ in range(budget - 1):
                pick = int(np.argmax(nearest))
                selected.append(pick)
                nearest = np.minimum(
                    nearest, np.linalg.norm(embeddings - embeddings[pick], axis=1)
                )
                nearest[pick] = -np.inf
            return selected

        return list(range(budget))
Platform Metrics
| Metric | Without Pre-annotation | With Pre-annotation | Active Learning |
|---|---|---|---|
| Tasks per 1K documents | 1000 | 300-400 | 150-200 |
| IAA (classification) | 0.82 | 0.88 | 0.91 |
| Time for 1K documents | 8-12 hours | 3-4 hours | 1.5-2 hours |
| Final model accuracy (relative to the fully manual baseline) | 100% | 97-99% | 98-99% |
A full-featured platform can be deployed in 2-3 weeks. A self-hosted Label Studio instance as the basic UI, plus a custom backend for orchestration, is the typical architecture for teams of up to 20 annotators.







