Implementation of automatic generation of SRT/VTT subtitle files.

SRT (SubRip Subtitle) and VTT (WebVTT) are standard subtitle formats. SRT is used in video players and when uploading to YouTube/Vimeo. VTT is used by HTML5 web players and streaming platforms.

### SRT and VTT generation

```python
from faster_whisper import WhisperModel
from datetime import timedelta
# Whisper model instance created when this module is loaded and shared by the
# transcription helpers in this file ("large-v3" weights, device "cuda").
model = WhisperModel("large-v3", device="cuda")
def format_time_srt(seconds: float) -> str:
    """Format a time offset as an SRT timestamp ``HH:MM:SS,mmm``.

    Args:
        seconds: Offset from the start of the media, in seconds. Negative
            values are clamped to zero.

    Returns:
        The timestamp with zero-padded hours, minutes, seconds and
        milliseconds, using a comma as the decimal separator.
    """
    # Round once to whole milliseconds and do the rest in integer arithmetic.
    # Truncating ``(seconds % 1) * 1000`` drops a millisecond for values such
    # as 1.001, whose binary float representation sits slightly below the
    # decimal value.
    total_ms = max(0, int(round(seconds * 1000)))
    total_secs, milliseconds = divmod(total_ms, 1000)
    total_minutes, secs = divmod(total_secs, 60)
    hours, minutes = divmod(total_minutes, 60)
    return f"{hours:02d}:{minutes:02d}:{secs:02d},{milliseconds:03d}"
def format_time_vtt(seconds: float) -> str:
    """Format a time offset as a WebVTT timestamp ``HH:MM:SS.mmm``.

    Args:
        seconds: Offset from the start of the media, in seconds.

    Returns:
        The SRT-style timestamp with commas replaced by dots.
    """
    srt_stamp = format_time_srt(seconds)
    # VTT uses a dot instead of a comma before the milliseconds.
    return srt_stamp.replace(",", ".")
def generate_srt(audio_path: str, language: str = "ru") -> str:
    """Transcribe an audio file and return the transcript as SRT text.

    Args:
        audio_path: Path of the audio file handed to ``model.transcribe``.
        language: Language code handed to ``model.transcribe``.

    Returns:
        One numbered cue per transcribed segment (numbering starts at 1),
        with cues separated by a blank line.
    """
    segments, _info = model.transcribe(
        audio_path, language=language, vad_filter=True
    )
    cues = [
        f"{number}\n{format_time_srt(segment.start)} --> "
        f"{format_time_srt(segment.end)}\n{segment.text.strip()}\n"
        for number, segment in enumerate(segments, 1)
    ]
    return "\n".join(cues)
def generate_vtt(audio_path: str, language: str = "ru") -> str:
    """Transcribe an audio file and return the transcript as WebVTT text.

    Args:
        audio_path: Path of the audio file handed to ``model.transcribe``.
        language: Language code handed to ``model.transcribe``.

    Returns:
        The ``WEBVTT`` header followed by one unnumbered cue per transcribed
        segment, with entries separated by a blank line.
    """
    segments, _info = model.transcribe(
        audio_path, language=language, vad_filter=True
    )
    header = "WEBVTT\n"
    cues = [
        f"{format_time_vtt(segment.start)} --> "
        f"{format_time_vtt(segment.end)}\n{segment.text.strip()}\n"
        for segment in segments
    ]
    return "\n".join([header, *cues])
```

### VTT with advanced capabilities

```python
def generate_vtt_with_styling(segments, speaker_map: dict = None) -> str:
    """Render segments as WebVTT cues with positioning and voice tags.

    Args:
        segments: Iterable of objects exposing ``start``, ``end`` and ``text``
            attributes. A ``speaker`` attribute is used when present.
        speaker_map: Optional mapping from a segment's ``speaker`` value to
            the name placed in the cue's ``<v ...>`` voice tag. Segments
            without a speaker, or whose speaker is not in the map, get no
            voice tag.

    Returns:
        The ``WEBVTT`` header followed by numbered cues (starting at 1), with
        entries separated by a blank line.
    """
    # Cue settings are identical for every cue: line:90% puts the text at the
    # bottom of the screen.
    position = "line:90% position:50% align:center"
    lines = ["WEBVTT\n"]
    for cue_id, seg in enumerate(segments, 1):
        start = format_time_vtt(seg.start)
        end = format_time_vtt(seg.end)
        speaker = ""
        if speaker_map:
            # Segments that carry no ``speaker`` attribute are treated as
            # unlabeled instead of raising AttributeError.
            speaker = speaker_map.get(getattr(seg, "speaker", None), "")
        speaker_tag = f"<v {speaker}>" if speaker else ""
        lines.append(
            f"{cue_id}\n{start} --> {end} {position}\n"
            f"{speaker_tag}{seg.text.strip()}\n"
        )
    return "\n".join(lines)
```

### Subtitle post-processing

```python
def optimize_subtitles(segments: list, max_line_length: int = 42,
                       max_duration: float = 7.0,
                       min_duration: float = 1.2) -> list:
    """Adjust subtitle segments for line length and minimum display time.

    Args:
        segments: Iterable of objects exposing ``start``, ``end`` and ``text``
            attributes (plain objects or namedtuple-style records).
        max_line_length: Texts longer than this are broken into two lines at
            a space.
        max_duration: Accepted for interface compatibility; not enforced by
            this function.
        min_duration: Minimum time in seconds a cue stays on screen; ``end``
            is pushed out to ``start + min_duration`` when needed.

    Returns:
        A list of dicts, one per segment, holding the segment's fields with
        ``text`` and ``end`` replaced by the adjusted values.
    """
    optimized = []
    for seg in segments:
        text = seg.text.strip()
        # Limit the line length by breaking at a space.
        if len(text) > max_line_length:
            split_at = text.rfind(" ", 0, max_line_length)
            if split_at == -1:
                # No space before the limit: break at the first later space
                # rather than slicing with -1, which would garble the text.
                split_at = text.find(" ", max_line_length)
            if split_at != -1:
                text = text[:split_at] + "\n" + text[split_at + 1:]
        # Enforce the minimum on-screen duration.
        end = max(seg.end, seg.start + min_duration)
        optimized.append({**_segment_to_dict(seg), "text": text, "end": end})
    return optimized


def _segment_to_dict(seg) -> dict:
    """Return a copy of a segment's fields as a dict."""
    # Namedtuple-style records have no ``__dict__`` but provide ``_asdict``.
    as_dict = getattr(seg, "_asdict", None)
    if callable(as_dict):
        return dict(as_dict())
    return dict(vars(seg))







