Label Studio Data Labeling Integration
Label Studio is an open-source annotation platform with support for text, images, audio, and time series. It deploys in 15 minutes, scales via Kubernetes, and integrates with ML backends for automatic pre-annotation.
Quick Start
# Docker Compose with PostgreSQL and Redis
# Writes docker-compose.yml into the current directory, then starts the stack
# in the background. The heredoc delimiter is quoted so the shell performs no
# expansion inside the YAML.
cat > docker-compose.yml << 'EOFCOMPOSE'
version: '3.8'
services:
  label-studio:
    image: heartexlabs/label-studio:latest
    ports:
      - "8080:8080"
    environment:
      DJANGO_DB: default
      # These credentials must match the POSTGRES_* values of the postgres
      # service below. "secret" is a placeholder password.
      POSTGRE_NAME: labelstudio
      POSTGRE_USER: labelstudio
      POSTGRE_PASSWORD: secret
      POSTGRE_HOST: postgres
      REDIS_LOCATION: redis://redis:6379/1
      LABEL_STUDIO_HOST: http://localhost:8080
    volumes:
      - ./label-studio-data:/label-studio/data
    depends_on:
      - postgres
      - redis
  postgres:
    image: postgres:14
    environment:
      POSTGRES_DB: labelstudio
      POSTGRES_USER: labelstudio
      POSTGRES_PASSWORD: secret
    volumes:
      - postgres_data:/var/lib/postgresql/data
  redis:
    image: redis:7
# Named volume referenced by the postgres service.
volumes:
  postgres_data:
EOFCOMPOSE
docker compose up -d
Creating Projects via API
import label_studio_sdk
from label_studio_sdk import Client
# Connect to the Label Studio instance; the URL and API key are placeholders.
ls = Client(url='http://localhost:8080', api_key='your-api-key')

# Labeling interface for the NER project: four entity labels applied to the
# task's "text" field.
NER_LABEL_CONFIG = '''
<View>
<Labels name="label" toName="text">
<Label value="Person" background="#FF6B6B"/>
<Label value="Organization" background="#4ECDC4"/>
<Label value="Location" background="#45B7D1"/>
<Label value="Date" background="#FFA07A"/>
</Labels>
<Text name="text" value="$text"/>
</View>
'''

# Labeling interface for the sentiment project: a single-choice sentiment
# control plus a confidence rating (max 5, default 3) on the same text.
SENTIMENT_LABEL_CONFIG = '''
<View>
<Text name="text" value="$text" granularity="sentence"/>
<Choices name="sentiment" toName="text" choice="single">
<Choice value="Positive"/>
<Choice value="Negative"/>
<Choice value="Neutral"/>
</Choices>
<Rating name="confidence" toName="text" maxRating="5" defaultValue="3"/>
</View>
'''

# NER project
ner_project = ls.start_project(
    title='Named Entity Recognition',
    label_config=NER_LABEL_CONFIG,
)

# Sentiment classification project
classification_project = ls.start_project(
    title='Sentiment Classification',
    label_config=SENTIMENT_LABEL_CONFIG,
)
Loading Tasks and Exporting Annotations
import json
def upload_tasks(project, texts: list[str], batch_size: int = 500):
    """Import texts into a project as tasks, one batch at a time.

    Args:
        project: Object exposing ``import_tasks(list_of_task_dicts)``.
        texts: Raw texts; each one becomes ``{"data": {"text": <text>}}``.
        batch_size: Maximum number of tasks per ``import_tasks`` call.
            Must be at least 1.

    Returns:
        The number of tasks passed to ``project.import_tasks``.

    Raises:
        ValueError: If ``batch_size`` is smaller than 1.
    """
    # A negative step would make range() empty and silently upload nothing,
    # so reject bad sizes up front.
    if batch_size < 1:
        raise ValueError(f"batch_size must be >= 1, got {batch_size}")

    tasks = [{"data": {"text": t}} for t in texts]
    total = len(tasks)
    for start in range(0, total, batch_size):
        batch = tasks[start:start + batch_size]
        project.import_tasks(batch)
        print(f"Uploaded {start + len(batch)}/{total}")
    return total
def export_annotations(project, export_type: str = 'JSON') -> list[dict]:
    """Export tasks that have at least one annotation.

    Args:
        project: Object exposing ``get_tasks(filters=...)`` that returns task
            dicts with ``id``, ``data``, ``annotations`` and
            ``total_annotations`` keys.
        export_type: Currently unused; kept so existing callers that pass it
            keep working.

    Returns:
        One dict per annotated task with the keys ``id``, ``data`` and
        ``annotations``.
    """
    # ``get_tasks`` takes a Data Manager filter structure, not Django-style
    # lookups: ask the server for tasks whose annotation count is above zero.
    annotated_only = {
        'conjunction': 'and',
        'items': [{
            'filter': 'filter:tasks:total_annotations',
            'operator': 'greater',
            'type': 'Number',
            'value': 0,
        }],
    }
    tasks = project.get_tasks(filters=annotated_only)

    # Re-check client-side so the result is correct even if the filter is
    # not applied by the server.
    return [
        {
            'id': task['id'],
            'data': task['data'],
            'annotations': task['annotations'],
        }
        for task in tasks
        if task['total_annotations'] > 0
    ]
def _first_choice(task_annotations: list[dict]):
    """Return the first selected choice found in the annotations, or None."""
    for annotation in task_annotations:
        for item in annotation.get('result') or []:
            choices = item.get('value', {}).get('choices')
            if choices:
                return choices[0]
    return None


def convert_to_huggingface_format(annotations: list[dict],
                                  task_type: str = 'ner') -> dict:
    """Convert exported annotations to a column-oriented dict.

    Args:
        annotations: Dicts with ``data`` (containing ``text``) and
            ``annotations`` keys, as produced by ``export_annotations``.
        task_type: Only ``'classification'`` is converted; any other value
            (including the default ``'ner'``) yields an empty dict.

    Returns:
        For classification, ``{'text': [...], 'label': [...]}`` with both
        lists the same length. Otherwise ``{}``.
    """
    if task_type != 'classification':
        return {}

    texts: list[str] = []
    labels: list[str] = []
    for entry in annotations:
        label = _first_choice(entry.get('annotations') or [])
        if label is None:
            # No usable choice: drop the text too, so the two columns stay
            # aligned row by row.
            continue
        texts.append(entry['data']['text'])
        labels.append(label)
    return {'text': texts, 'label': labels}
ML Backend for Pre-annotation
from label_studio_ml import LabelStudioMLBase
from transformers import pipeline
class SentimentMLBackend(LabelStudioMLBase):
    """Pre-annotation via HuggingFace model.

    Classifies each task's ``text`` field into one of ``self.labels`` with a
    zero-shot classification pipeline and returns the top label as a
    ``choices`` prediction for the ``sentiment`` control on the ``text``
    object.
    """

    def __init__(self, **kwargs):
        """Forward ``kwargs`` to the base class and build the classifier."""
        super().__init__(**kwargs)
        # NOTE(review): the pipeline is created on every instantiation;
        # confirm how often the serving framework constructs this class.
        self.classifier = pipeline(
            "zero-shot-classification",
            model="facebook/bart-large-mnli",
        )
        self.labels = ['Positive', 'Negative', 'Neutral']

    def predict(self, tasks: list[dict], **kwargs) -> list[dict]:
        """Return one prediction dict per task, in the same order as ``tasks``.

        Args:
            tasks: Task dicts; the text is read from ``task['data']['text']``.
            **kwargs: Accepted for interface compatibility; not used here.

        Returns:
            A list of ``{'result': [...], 'score': float}`` dicts. Tasks whose
            text is missing, not a string, or blank get an empty ``result``
            and a score of ``0.0`` instead of a model guess.
        """
        predictions = []
        for task in tasks:
            text = task['data'].get('text') or ''
            if not isinstance(text, str) or not text.strip():
                # Nothing to classify: keep the output aligned with the input
                # without inventing a label.
                predictions.append({'result': [], 'score': 0.0})
                continue

            result = self.classifier(text, candidate_labels=self.labels)
            # Index 0 is used as the top label and its score; this presumes
            # the pipeline returns labels sorted by descending score.
            predictions.append({
                'result': [{
                    'from_name': 'sentiment',
                    'to_name': 'text',
                    'type': 'choices',
                    'value': {'choices': [result['labels'][0]]},
                }],
                'score': result['scores'][0],
            })
        return predictions
# Run ML backend
# label-studio-ml start sentiment_backend --port 9090
Label Studio with an ML backend reduces manual labeling time by 60-70%. Labeling 10K texts takes 2-3 days with pre-annotation versus 7-10 days without. The cost of a labeled dataset drops from $0.03-0.05 to $0.01-0.015 per example with pre-annotation.







