AI-based multimedia localization: dubbing and subtitling. AI-based localization automates the process of "original video → target language subtitles → synchronized dubbing." It's used for training courses, corporate content, YouTube channels, and marketing videos. The cost is 5-15 times lower than studio translation and dubbing.

### Full localization pipeline

```
Оригинальное видео (EN)
 ↓ Извлечение аудио (ffmpeg)
 ↓ STT с таймингами (Whisper)
 ↓ Перевод с сохранением структуры (GPT-4o / DeepL)
 ↓ Генерация субтитров (SRT/VTT файлы)
 ↓ TTS с voice cloning (XTTS v2 / ElevenLabs)
 ↓ Выравнивание длительностей (time-stretching)
 ↓ Финальная сборка видео (ffmpeg mux)
```

### Transcription with timestamps

```python
import whisper
import json
from datetime import timedelta
def transcribe_with_timestamps(video_path: str, language: str | None = None) -> dict:
    """Transcribe a video's audio track with word-level timestamps.

    Extracts a mono 16 kHz WAV via ffmpeg (the input format Whisper
    resamples to anyway), then runs Whisper large-v3 on it.

    Args:
        video_path: Path to the input video (any container ffmpeg reads).
        language: Language code to force, or None for auto-detection.

    Returns:
        Whisper result dict; result["segments"] holds start/end/text entries.

    Raises:
        subprocess.CalledProcessError: if ffmpeg fails to extract audio.
    """
    import os
    import subprocess

    model = whisper.load_model("large-v3")

    # splitext handles any extension — the original str.replace(".mp4", ...)
    # silently produced a wrong path for .mkv/.mov inputs.
    audio_path = os.path.splitext(video_path)[0] + "_audio.wav"

    # -ac 1 / -ar 16000: mono 16 kHz; -vn: drop video; -y: overwrite without
    # prompting. check=True surfaces extraction failures instead of letting
    # Whisper transcribe a missing/stale file.
    subprocess.run(
        ["ffmpeg", "-y", "-i", video_path, "-ac", "1", "-ar", "16000", "-vn", audio_path],
        check=True,
    )
    try:
        result = model.transcribe(
            audio_path,
            language=language,
            word_timestamps=True,
            task="transcribe",
        )
    finally:
        # Clean up the temporary WAV even if transcription raises.
        if os.path.exists(audio_path):
            os.remove(audio_path)
    return result
```

### Translation with timing preservation

```python
from openai import AsyncOpenAI
import asyncio
# Module-level async client; credentials are read from the environment
# (OPENAI_API_KEY) by the SDK's default constructor.
client = AsyncOpenAI()
async def translate_segments(
    segments: list[dict],
    target_language: str,
    context: str = "",
) -> list[dict]:
    """Translate segment texts in context-aware batches of 20.

    Batching lets the model see neighbouring lines, which keeps pronouns
    and terminology consistent across replies.

    Args:
        segments: Whisper-style segments with at least a "text" key.
        target_language: Target language name inserted into the prompt.
        context: Optional one-line description of the video.

    Returns:
        New segment dicts with "text" replaced by the translation;
        all other keys (start/end, ...) are preserved.

    Raises:
        ValueError: if the model returns a different number of lines than
            were sent — the original zip() silently dropped segments and
            desynchronized the timings.
    """
    translated = []
    batch_size = 20
    for i in range(0, len(segments), batch_size):
        batch = segments[i:i + batch_size]
        texts = [s["text"] for s in batch]
        response = await client.chat.completions.create(
            model="gpt-4o",
            messages=[{
                "role": "system",
                # json_object mode requires a top-level JSON *object*; the
                # original prompt asked for a bare array, contradicting both
                # the API constraint and the ["translations"] parsing below.
                "content": f"""Переводи на {target_language}. Сохраняй:
1. Разбивку на сегменты (одна строка входа = одна строка выхода)
2. Примерную длину (±20% от оригинала) — важно для синхронизации с губами
3. Неформальный тон, если оригинал неформальный
Контекст видео: {context}
Верни JSON объект вида {{"translations": ["...", ...]}} с массивом строк той же длины."""
            }, {
                "role": "user",
                "content": json.dumps(texts, ensure_ascii=False)
            }],
            response_format={"type": "json_object"}
        )
        translated_texts = json.loads(response.choices[0].message.content)["translations"]
        if len(translated_texts) != len(batch):
            raise ValueError(
                f"Expected {len(batch)} translations, got {len(translated_texts)}"
            )
        for seg, trans in zip(batch, translated_texts):
            translated.append({**seg, "text": trans})
    return translated
```

### Generating SRT subtitles

```python
def generate_srt(segments: list[dict], output_path: str) -> None:
    """Write segments to an SRT subtitle file.

    Each entry is: index, "HH:MM:SS,mmm --> HH:MM:SS,mmm", text, blank line.

    Args:
        segments: dicts with "start"/"end" (seconds, float) and "text".
        output_path: Destination .srt path, written as UTF-8.
    """
    def _stamp(seconds: float) -> str:
        # timedelta normalizes to microsecond precision before splitting,
        # matching the original implementation exactly.
        total = timedelta(seconds=seconds).total_seconds()
        whole = int(total)
        millis = int((total - whole) * 1000)
        hours, remainder = divmod(whole, 3600)
        minutes, secs = divmod(remainder, 60)
        # SRT uses a comma as the millisecond separator.
        return f"{hours:02d}:{minutes:02d}:{secs:02d},{millis:03d}"

    entries = [
        f"{idx}\n{_stamp(seg['start'])} --> {_stamp(seg['end'])}\n"
        f"{seg['text'].strip()}\n\n"
        for idx, seg in enumerate(segments, start=1)
    ]
    with open(output_path, "w", encoding="utf-8") as out:
        out.writelines(entries)
```

### TTS dubbing with synchronization

```python
from TTS.api import TTS
import subprocess
from pydub import AudioSegment, effects
import io
class DubbingGenerator:
    """Generates a dubbed audio track using XTTS v2 voice synthesis.

    Each segment is synthesized, time-stretched toward the original
    segment's duration, and overlaid on a silent timeline at its start.
    """

    def __init__(self):
        # BUG FIX: the original declared `def init(...)` (underscores lost),
        # so `self.tts` was never set and every method call failed.
        # XTTS v2: multilingual TTS with voice-cloning support.
        self.tts = TTS("tts_models/multilingual/multi-dataset/xtts_v2")

    def synthesize_segment(self, text: str, target_duration: float, lang: str = "ru") -> bytes:
        """Synthesize `text` and stretch it toward `target_duration` seconds.

        Args:
            text: Translated line to voice.
            target_duration: Desired length in seconds (from the timings).
            lang: XTTS language code.

        Returns:
            WAV bytes (mono, 22.05 kHz, 16-bit).
        """
        audio_wav = self.tts.tts(text=text, language=lang)
        audio = AudioSegment(data=bytes(audio_wav), sample_width=2, frame_rate=22050, channels=1)
        actual_duration = len(audio) / 1000.0  # pydub reports length in ms

        # Guard against zero/negative targets (equal start/end timestamps)
        # which previously raised ZeroDivisionError.
        ratio = actual_duration / target_duration if target_duration > 0 else 1.0

        # Resample-based stretching (NOTE: shifts pitch as a side effect).
        # Apply only within a range that still sounds natural.
        if 0.7 < ratio < 1.4:
            adjusted = audio._spawn(
                audio.raw_data,
                overrides={"frame_rate": int(audio.frame_rate * ratio)},
            ).set_frame_rate(22050)
        else:
            # Deviation too large — keep natural speed rather than distort.
            adjusted = audio

        buf = io.BytesIO()
        adjusted.export(buf, format="wav")
        return buf.getvalue()

    def assemble_dubbing_track(
        self,
        segments: list[dict],
        total_duration_sec: float
    ) -> bytes:
        """Synthesize every segment and mix them onto one silent timeline.

        Args:
            segments: dicts with "start"/"end" (seconds) and "text".
            total_duration_sec: Length of the output track in seconds.

        Returns:
            WAV bytes of the assembled dubbing track.
        """
        timeline = AudioSegment.silent(duration=int(total_duration_sec * 1000))
        for seg in segments:
            audio = AudioSegment.from_wav(io.BytesIO(
                self.synthesize_segment(seg["text"], seg["end"] - seg["start"])
            ))
            position_ms = int(seg["start"] * 1000)
            timeline = timeline.overlay(audio, position=position_ms)
        buf = io.BytesIO()
        timeline.export(buf, format="wav")
        return buf.getvalue()
```

### Final video compilation

```python
def mux_video_with_dubbing(
    original_video: str,
    dubbing_audio: str,
    subtitles_srt: str,
    output_path: str,
    original_audio_volume: float = 0.1  # keep the original quietly in the background
) -> None:
    """Mux the dubbing track and burned-in subtitles into the final video.

    Mixes the dub over the attenuated original audio and renders the SRT
    subtitles onto the video stream, re-encoding to H.264 + AAC.

    Args:
        original_video: Source video file (with original audio).
        dubbing_audio: File containing the assembled dubbing track.
        subtitles_srt: SRT file to burn into the picture.
        output_path: Destination file (overwritten if present).
        original_audio_volume: Gain applied to the original audio (0..1).

    Raises:
        subprocess.CalledProcessError: if ffmpeg exits non-zero.
    """
    # BUG FIX: ffmpeg refuses to combine -vf with -filter_complex in one
    # invocation, so the original command always failed. The subtitles
    # filter now lives inside the single filter graph, and the burned
    # video stream is mapped via its [vout] label.
    filter_graph = (
        f"[0:a]volume={original_audio_volume}[orig];"
        f"[1:a][orig]amix=inputs=2[aout];"
        f"[0:v]subtitles={subtitles_srt}:force_style='FontSize=24,PrimaryColour=&HFFFFFF'[vout]"
    )
    subprocess.run([
        "ffmpeg",
        "-i", original_video,
        "-i", dubbing_audio,
        "-filter_complex", filter_graph,
        "-map", "[vout]",
        "-map", "[aout]",
        "-c:v", "libx264", "-c:a", "aac",
        "-y", output_path
    ], check=True)







