# Developing a Speech-to-Text System

Building a production-grade STT system is more than just a Whisper API call. The work includes: selecting a model suited to the accent and subject domain, post-processing to correct domain-specific vocabulary, streaming recognition with acceptable latency, and orchestrating multiple providers for reliability.

### Choosing an STT stack

```python
import asyncio
import io
import json
from typing import AsyncGenerator, Optional
import httpx
import websockets
import numpy as np
class STTProviderComparator:
    """Compare STT providers across accuracy, latency and cost metrics."""

    # Reference metrics per provider (WER = Word Error Rate, lower is better).
    PROVIDERS = {
        "openai_whisper": {
            "wer_general": 0.05,  # WER on clean, standard speech
            "wer_noisy": 0.12,
            "russian_support": "excellent",
            "latency_ms": 600,  # batch mode only
            "streaming": False,
            "cost_per_hour": 0.36,
        },
        "deepgram_nova2": {
            "wer_general": 0.08,
            "wer_noisy": 0.15,
            "russian_support": "good",
            "latency_ms": 250,
            "streaming": True,
            "cost_per_hour": 0.35,
        },
        "azure_speech": {
            "wer_general": 0.09,
            "wer_noisy": 0.14,
            "russian_support": "excellent",
            "latency_ms": 300,
            "streaming": True,
            "cost_per_hour": 0.96,
        },
        "whisper_selfhosted": {
            "wer_general": 0.05,
            "wer_noisy": 0.12,
            "russian_support": "excellent",
            "latency_ms": 400,  # Large-v3 on an A100
            "streaming": False,
            "cost_per_hour": 0.08,  # self-hosted infrastructure cost
        },
    }

    def recommend_provider(self, requirements: dict) -> str:
        """
        Pick the best provider for the given requirements.

        requirements keys: 'streaming' (bool), 'max_latency_ms' (int),
        'language' (str), 'volume_hours_monthly' (float).
        NOTE(review): 'language' is accepted but not used in filtering —
        confirm whether language-based filtering is intended.

        Returns the winning provider name, or "openai_whisper" when no
        provider satisfies the hard constraints.
        """
        need_streaming = bool(requirements.get('streaming'))
        latency_budget = requirements.get('max_latency_ms', 9999)
        monthly_hours = requirements.get('volume_hours_monthly', 100)

        scored = []
        for name, props in self.PROVIDERS.items():
            # Hard constraints: streaming capability and latency budget.
            if need_streaming and not props['streaming']:
                continue
            if props['latency_ms'] > latency_budget:
                continue
            # Soft score: accuracy 40%, latency 30%, cost 30%.
            accuracy = 1 - props['wer_general']
            speed = 1 - props['latency_ms'] / 1000
            # Cost matters more at high monthly volume.
            monthly_cost = props['cost_per_hour'] * monthly_hours
            affordability = 1 / (1 + monthly_cost / 1000)
            total = accuracy * 0.4 + speed * 0.3 + affordability * 0.3
            scored.append((name, round(total, 3)))

        if not scored:
            return "openai_whisper"
        return max(scored, key=lambda item: item[1])[0]
class StreamingSTTClient:
    """Streaming speech recognition over the Deepgram WebSocket API."""

    def __init__(self, api_key: str, language: str = "ru"):
        """
        api_key: Deepgram API key, sent in the Authorization header.
        language: language code forwarded to the API (default Russian).
        """
        self.api_key = api_key
        self.language = language
        self.base_url = "wss://api.deepgram.com/v1/listen"

    async def transcribe_stream(self, audio_chunks: AsyncGenerator[bytes, None],
                                sample_rate: int = 16000) -> AsyncGenerator[str, None]:
        """
        Stream audio chunks to Deepgram and yield transcripts as they arrive.

        Yields interim hypotheses prefixed with "[interim] " and final
        transcripts as plain strings.

        audio_chunks: async generator of raw PCM frames (linear16, mono)
            — assumes 16-bit little-endian samples; TODO confirm producer.
        sample_rate: PCM sample rate in Hz.
        """
        params = (
            f"?language={self.language}"
            f"&encoding=linear16"
            f"&sample_rate={sample_rate}"
            f"&channels=1"
            f"&model=nova-2"
            f"&smart_format=true"
            f"&punctuate=true"
            f"&endpointing=300"  # ms of silence that ends an utterance
            f"&interim_results=true"
        )
        async with websockets.connect(
            self.base_url + params,
            extra_headers={"Authorization": f"Token {self.api_key}"},
            max_size=10_000_000
        ) as ws:
            async def send_audio():
                # Pump audio to the server, then signal end of stream.
                async for chunk in audio_chunks:
                    await ws.send(chunk)
                await ws.send(json.dumps({"type": "CloseStream"}))

            # BUG FIX: the original used a fire-and-forget create_task()
            # with no reference kept, so the sender task could be
            # garbage-collected mid-stream and any send error was
            # silently swallowed. Keep the reference and settle the task
            # on exit so failures propagate.
            sender = asyncio.create_task(send_audio())
            try:
                async for message in ws:
                    data = json.loads(message)
                    if data.get("type") != "Results":
                        continue
                    alternatives = data.get("channel", {}).get("alternatives", [])
                    if not alternatives:
                        continue
                    transcript = alternatives[0].get("transcript", "")
                    if transcript:
                        is_final = data.get("is_final", False)
                        yield transcript if is_final else f"[interim] {transcript}"
            finally:
                # Cancel the pump if the receive side exits first, and
                # re-raise any error the sender hit instead of dropping it.
                if not sender.done():
                    sender.cancel()
                try:
                    await sender
                except asyncio.CancelledError:
                    pass
class DomainSpecificPostProcessor:
    """
    Post-process transcripts for a specific subject domain.

    STT models often misrecognize domain terms, proper nouns and
    abbreviations; this class applies a correction vocabulary and
    normalizes spelled-out digit words.
    """

    # Trailing punctuation stripped before vocabulary lookup.
    _TRAILING_PUNCT = '.,!?;:'

    def __init__(self, domain_vocabulary: dict):
        """
        domain_vocabulary: {'misrecognized_word': 'correct_word'}
        Example: {'питсбург': 'питер', 'эксель': 'Excel'}
        """
        # Keys are lowercased so lookups are case-insensitive.
        self.vocabulary = {k.lower(): v for k, v in domain_vocabulary.items()}

    def correct_transcript(self, transcript: str) -> str:
        """Replace misrecognized words using the domain vocabulary."""
        corrected = []
        for word in transcript.split():
            # Detach trailing punctuation so 'эксель.' still matches 'эксель'.
            clean = word.lower().rstrip(self._TRAILING_PUNCT)
            punct = word[len(clean):]
            corrected.append(
                self.vocabulary.get(clean, word.rstrip(self._TRAILING_PUNCT)) + punct
            )
        return ' '.join(corrected)

    def normalize_numbers_and_dates(self, transcript: str) -> str:
        """
        Convert spelled-out digit words to digits (e.g. 'два' -> '2').

        BUG FIX: the original used naive substring str.replace, which
        corrupted longer words ('восемь' -> 'во7' because 'семь' was
        replaced first; 'двадцать' -> '2дцать'). Whole-word regex
        matching only replaces standalone number words. Output is
        lowercased, matching the previous behavior.
        (Production note: use pymorphy2 for full morphology handling.)
        """
        import re
        number_words = {
            'ноль': '0', 'один': '1', 'два': '2', 'три': '3', 'четыре': '4',
            'пять': '5', 'шесть': '6', 'семь': '7', 'восемь': '8', 'девять': '9',
        }
        # \b is Unicode-aware in Python 3, so it bounds Cyrillic words too.
        pattern = r'\b(?:' + '|'.join(map(re.escape, number_words)) + r')\b'
        return re.sub(pattern, lambda m: number_words[m.group(0)], transcript.lower())
class STTPipeline:
    """End-to-end STT pipeline with provider fallback and error tracking."""

    def __init__(self, primary_provider, fallback_provider=None,
                 post_processor: Optional[DomainSpecificPostProcessor] = None):
        self.primary = primary_provider
        self.fallback = fallback_provider
        self.post_processor = post_processor
        self._error_count = 0  # count of primary-provider failures

    async def transcribe(self, audio_data: bytes,
                         language: str = "ru") -> dict:
        """
        Transcribe audio, falling back to the secondary provider on error.

        Returns a dict with keys 'transcript', 'confidence', 'provider'
        ("primary" or "fallback") and 'language'. Re-raises the primary
        provider's exception when no fallback is configured.
        """
        try:
            transcript, confidence = await self._call_provider(
                self.primary, audio_data, language
            )
            provider_used = "primary"
        except Exception:
            self._error_count += 1
            if not self.fallback:
                raise
            transcript, confidence = await self._call_provider(
                self.fallback, audio_data, language
            )
            provider_used = "fallback"

        # Domain-specific vocabulary correction, if configured.
        if self.post_processor:
            transcript = self.post_processor.correct_transcript(transcript)

        return {
            "transcript": transcript,
            "confidence": confidence,
            "provider": provider_used,
            "language": language,
        }

    async def _call_provider(self, provider, audio: bytes, language: str) -> tuple:
        """Stub: replace with a real provider call returning (text, confidence)."""
        raise NotImplementedError
```

### STT Quality Assessment

| Metric | Description | Target Value |
|--------|-------------|--------------|
| WER (Word Error Rate) | % of words with errors | < 8% for clean speech |
| CER (Character Error Rate) | % of characters with errors | < 3% |
| RTF (Real-Time Factor) | processing time / audio duration | < 0.3 for streaming |
| First word latency | delay to first result | < 400 ms |
| Domain WER | WER on specialized terms | < 12% |

**Self-hosted vs API:** at volumes above 500 hours per month, self-hosted Whisper Large-v3 on 2×A100 pays for itself within 3–4 months compared to the OpenAI API. Additional benefit: fine-tuning on domain-specific vocabulary reduces domain WER from 15% to 5–8%.