Data Cleaning and Preprocessing for LLM Fine-tuning
Data cleaning for LLM fine-tuning has its own specifics: not only must you remove technical junk (HTML tags, duplicates), but also filter toxic content, fix encoding issues, and ensure examples actually match the target task.
Cleaning Pipeline
import re
import unicodedata
from dataclasses import dataclass
@dataclass
class CleaningResult:
original: str
cleaned: str
removed: bool
removal_reason: str = None
class TextCleaner:
    """Normalizes raw text and flags examples too short to keep."""

    # Patterns compiled once at class-creation time: clean() is meant to run
    # over large datasets, so per-call re-compilation is wasted work.
    _TAG_RE = re.compile(r'<[^>]+>')
    _URL_RE = re.compile(r'https?://[^\s]+')
    _WS_RE = re.compile(r'\s+')
    _REPEAT_RE = re.compile(r'(.)\1{4,}')

    def clean(self, text: str) -> CleaningResult:
        """Clean one example and decide whether to keep it.

        Steps (order matters — tags are stripped before whitespace is
        collapsed): NFKC unicode normalization, HTML/XML tag removal, URL
        replacement with the "[URL]" placeholder, whitespace collapsing,
        and squashing runs of 5+ identical characters down to two.

        Returns a CleaningResult; ``removed`` is True with reason
        "too_short" when fewer than 3 whitespace-separated tokens survive.
        """
        # 1. Unicode normalization (NFKC folds compatibility forms).
        cleaned = unicodedata.normalize('NFKC', text)
        # 2. Remove HTML/XML tags, leaving a space so words don't fuse.
        cleaned = self._TAG_RE.sub(' ', cleaned)
        # 3. Replace URLs with a placeholder token.
        cleaned = self._URL_RE.sub('[URL]', cleaned)
        # 4. Collapse all whitespace runs into single spaces.
        cleaned = self._WS_RE.sub(' ', cleaned).strip()
        # 5. Squash repeating characters (aaaaaaa -> aa).
        cleaned = self._REPEAT_RE.sub(r'\1\1', cleaned)
        # Drop degenerate examples with fewer than 3 tokens.
        if len(cleaned.split()) < 3:
            return CleaningResult(text, cleaned, True, "too_short")
        return CleaningResult(text, cleaned, False)
class DataFilter:
    """Filters examples for toxicity and personally identifiable information."""

    # PII heuristics, compiled once. These are deliberately loose matchers,
    # not validators (no Luhn check for cards, no full E.164 for phones).
    _PII_PATTERNS = [
        re.compile(r'\b\d{3}-\d{2}-\d{4}\b'),  # SSN
        # Bug fix: the original TLD class was [A-Z|a-z], which also matched
        # a literal '|' character; corrected to [A-Za-z].
        re.compile(r'\b[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Za-z]{2,}\b'),  # Email
        re.compile(r'\b(?:\+7|8)?[\s-]?\(?\d{3}\)?[\s-]?\d{3}[\s-]?\d{2}[\s-]?\d{2}\b'),  # RU phone
        re.compile(r'\b\d{4}[\s-]?\d{4}[\s-]?\d{4}[\s-]?\d{4}\b'),  # Credit card
    ]

    def __init__(self):
        # Imported lazily so the regex-only PII checks stay usable even when
        # the heavy third-party detoxify dependency is not installed.
        from detoxify import Detoxify
        self.toxicity_model = Detoxify('multilingual')

    def is_toxic(self, text: str, threshold: float = 0.7) -> bool:
        """Return True when the detoxify toxicity score exceeds ``threshold``."""
        result = self.toxicity_model.predict(text)
        return result['toxicity'] > threshold

    def has_pii(self, text: str) -> bool:
        """Simple heuristic for PII detection (SSN, email, RU phone, card)."""
        return any(p.search(text) for p in self._PII_PATTERNS)
Output Field Cleaning
class OutputCleaner:
    """Strips boilerplate intros and trailing meta-comments from model outputs."""

    # Canned assistant openers that should not survive into training data.
    _UNWANTED_PREFIXES = (
        "As an AI language model",
        "As a helpful assistant",
        "I don't have access to real-time",
        "I cannot browse the internet",
        "Certainly! Here",
        "Of course! I'd be happy to",
    )
    # Markers of meta-commentary; everything from the marker onward is cut.
    _META_MARKERS = (
        "Note: This is a fictional",
        "[This response was",
        "Disclaimer:",
    )

    def clean_output(self, output: str, task_type: str) -> tuple[str, bool]:
        """Return (cleaned_text, should_remove) for a single model output."""
        text = output.strip()

        # Peel off canned intros (case-insensitive prefix match), then drop
        # any leftover leading punctuation.
        for prefix in self._UNWANTED_PREFIXES:
            if text.lower().startswith(prefix.lower()):
                text = text[len(prefix):].lstrip('.,! ')

        # Truncate at the first occurrence of each meta-comment marker.
        for marker in self._META_MARKERS:
            if marker in text:
                text = text[:text.find(marker)].strip()

        # Outputs shorter than 5 words are flagged for removal by the caller.
        return (text, True) if len(text.split()) < 5 else (text, False)
Duplicate Detection at Multiple Levels
from datasketch import MinHash, MinHashLSH
def find_near_duplicates(texts: list[str],
                         threshold: float = 0.8) -> list[tuple]:
    """MinHash LSH for efficient near-duplicate search O(n log n)"""
    lsh = MinHashLSH(threshold=threshold, num_perm=128)
    signatures = {}

    # Index phase: one 128-permutation MinHash per document, built over
    # its lowercased word tokens.
    for idx, doc in enumerate(texts):
        sig = MinHash(num_perm=128)
        for token in doc.lower().split():
            sig.update(token.encode('utf8'))
        key = f"doc_{idx}"
        lsh.insert(key, sig)
        signatures[key] = sig

    # Query phase: any hit other than the document itself is a
    # near-duplicate candidate above the Jaccard threshold.
    matches = []
    for idx in range(len(texts)):
        key = f"doc_{idx}"
        hits = lsh.query(signatures[key])
        hits.remove(key)
        if hits:
            matches.append((idx, [int(h.split('_')[1]) for h in hits]))
    return matches
Statistics After Cleaning
After cleaning the dataset, verify:
- Percentage of examples removed, broken down by reason (too_short, toxic, pii, duplicate)
- Output length distribution (histogram)
- Vocabulary diversity (type-token ratio)
- Target domain coverage (how well examples cover tasks)
Typical result: from 50,000 raw examples, 35,000-42,000 high-quality examples remain after cleaning. A 15-30% reduction is normal, and final model quality improves as a result.







