# Implementing Automatic Call Transcription

Automated call transcription is a foundational speech-analytics tool for call centers: it converts hundreds of hours of recordings into structured text for quality control, agent training, and conversation-pattern detection.

### Automated transcription pipeline

```python
import asyncio
import os
from pathlib import Path

from faster_whisper import WhisperModel
from pyannote.audio import Pipeline
class CallTranscriber:
    """Transcribes call-center recordings: speech-to-text plus speaker diarization."""

    def __init__(self):
        # int8_float16 roughly halves VRAM usage on large-v3 with negligible
        # quality loss.
        self.stt_model = WhisperModel(
            "large-v3", device="cuda", compute_type="int8_float16"
        )
        # Fix: read the Hugging Face token from the environment; the original
        # passed the literal placeholder string "HF_TOKEN" as the credential.
        self.diarization_pipeline = Pipeline.from_pretrained(
            "pyannote/speaker-diarization-3.1",
            use_auth_token=os.environ.get("HF_TOKEN"),
        )

    async def transcribe_call(self, audio_path: str) -> dict:
        """Transcribe one call and align the text with speakers.

        Args:
            audio_path: path to a prepared (16 kHz mono WAV) recording.

        Returns:
            dict with keys "language", "duration", "turns" (speaker-aligned
            segments) and "full_text" (the concatenated transcript).
        """
        # 1. Speech-to-text. The model call is blocking (GPU-bound), so run it
        #    in a worker thread instead of stalling the event loop.
        segments, info = await asyncio.to_thread(
            self.stt_model.transcribe,
            audio_path,
            language="ru",
            vad_filter=True,
            word_timestamps=True,
        )
        # transcribe() returns a lazy generator; materialize it once so it can
        # be reused both for merging and for the full-text join below.
        transcript_segments = list(segments)

        # 2. Diarization (who spoke when) — also blocking, also off-loaded.
        diarization = await asyncio.to_thread(
            self.diarization_pipeline,
            audio_path,
            num_speakers=2,  # operator + customer
        )

        # 3. Align transcript segments with diarization turns.
        result = self._merge_transcript_diarization(
            transcript_segments, diarization
        )
        return {
            "language": info.language,
            "duration": info.duration,
            "turns": result,
            "full_text": " ".join(seg.text for seg in transcript_segments),
        }
```

### Telephone audio specifications

Telephony in Russia uses an 8 kHz sample rate with G.711 codecs: μ-law (PCMU) or — the standard in Russia — A-law (PCMA). Preprocessing is therefore required:

```python
import subprocess
def prepare_call_audio(input_path: str) -> str:
    """Normalize a telephone recording for STT.

    Produces a 16 kHz mono WAV next to the input file, with denoising and a
    band-pass filter roughly matching the telephone voice band.

    Args:
        input_path: path to the original (typically 8 kHz G.711) recording.

    Returns:
        Path of the prepared WAV file.

    Raises:
        subprocess.CalledProcessError: if ffmpeg exits with a non-zero status.
    """
    output_path = input_path + "_prepared.wav"
    command = [
        "ffmpeg", "-i", input_path,
        "-ar", "16000",  # upsample 8 kHz -> 16 kHz
        "-ac", "1",      # downmix to mono
        # FFT denoise + high/low-pass pair tuned for telephone audio.
        "-af", "afftdn=nf=-25,highpass=f=200,lowpass=f=4000",
        output_path, "-y", "-loglevel", "error",
    ]
    subprocess.run(command, check=True)
    return output_path
```

### Role identification (operator/client)

```python
def identify_speaker_roles(diarization_result) -> dict:
    """Map diarization speaker labels to business roles.

    Heuristic: in a call-center dialogue the operator usually accounts for the
    larger share of the speaking time, so the speaker with the greatest
    accumulated talk time is labeled "OPERATOR" and everyone else "CUSTOMER".

    Args:
        diarization_result: pyannote annotation exposing
            ``itertracks(yield_label=True)`` yielding ``(segment, _, speaker)``.

    Returns:
        dict mapping each speaker label to "OPERATOR" or "CUSTOMER";
        empty dict when the diarization produced no speech segments.
    """
    speaker_stats = {}
    for segment, _, speaker in diarization_result.itertracks(yield_label=True):
        stats = speaker_stats.setdefault(
            speaker, {"total_time": 0.0, "segment_count": 0}
        )
        stats["total_time"] += segment.end - segment.start
        stats["segment_count"] += 1

    # Robustness: silent/empty input would make max() raise ValueError.
    if not speaker_stats:
        return {}

    # Bug fix: the original ranked speakers by segment_count and never used
    # the accumulated total_time it computed; total speaking time is the more
    # robust signal for "who talks more".
    operator = max(speaker_stats, key=lambda s: speaker_stats[s]["total_time"])
    return {spk: ("OPERATOR" if spk == operator else "CUSTOMER")
            for spk in speaker_stats}
```

Timeframe: basic auto-transcription — 3–5 days; with diarization and CRM integration — 2–3 weeks.