# Implementation of speech synthesis from long texts (Long-Form TTS)

Most TTS engines have limitations: OpenAI TTS — 4096 characters per request, ElevenLabs — 5000. For audiobooks, lectures and long documents, a special pipeline of splitting and assembling is needed.

### Text splitting strategy

```python
import re
from dataclasses import dataclass
@dataclass
class TextChunk:
    """One TTS-ready slice of a longer text.

    Fields are positional in this order: (text, index, char_start, char_end).
    """

    text: str        # the chunk's contents
    index: int       # ordinal position of the chunk within the source text
    char_start: int  # approximate start offset in the source text
    char_end: int    # approximate end offset in the source text
def split_text_for_tts(
    text: str,
    max_chars: int = 4000,
    overlap_sentences: int = 0,
) -> list[TextChunk]:
    """Split *text* into TTS-sized chunks on sentence boundaries.

    Sentences are detected at `.!?` followed by whitespace and an uppercase
    (Latin or Cyrillic) letter, or at blank lines. Sentences are packed
    greedily into chunks of at most ``max_chars`` characters; a single
    sentence longer than ``max_chars`` still becomes its own oversized chunk.

    Args:
        text: Source text of arbitrary length. An empty string yields [].
        max_chars: Soft upper bound on chunk length.
        overlap_sentences: Number of trailing sentences from each chunk to
            repeat at the start of the next chunk, preserving prosodic
            context across chunk boundaries. 0 (the default) disables
            overlap and reproduces plain greedy packing.

    Returns:
        List of TextChunk in source order; ``char_start``/``char_end`` are
        approximate offsets into *text* (separator widths are assumed to
        be one character).
    """
    sentence_pattern = r'(?<=[.!?])\s+(?=[А-ЯA-Z])|(?<=\n)\n+'
    sentences = re.split(sentence_pattern, text)

    chunks: list[TextChunk] = []
    current_sentences: list[str] = []  # sentences of the chunk being built
    current_chunk = ""
    current_start = 0
    char_pos = 0

    for sentence in sentences:
        if len(current_chunk) + len(sentence) > max_chars and current_chunk:
            chunks.append(TextChunk(
                text=current_chunk.strip(),
                index=len(chunks),
                char_start=current_start,
                char_end=char_pos,
            ))
            # Carry the last `overlap_sentences` sentences into the next
            # chunk so the voice keeps context across the boundary.
            carried = current_sentences[-overlap_sentences:] if overlap_sentences > 0 else []
            current_sentences = carried + [sentence]
            current_chunk = " ".join(current_sentences)
            current_start = char_pos
        else:
            current_sentences.append(sentence)
            if current_chunk:
                current_chunk += " " + sentence
            else:
                current_chunk = sentence
        # +1 approximates the single separator character consumed by re.split.
        char_pos += len(sentence) + 1

    if current_chunk:
        chunks.append(TextChunk(current_chunk.strip(), len(chunks),
                                current_start, char_pos))
    return chunks
```

### Parallel generation and assembly

```python
import asyncio
from openai import AsyncOpenAI
from pydub import AudioSegment
import io
# Module-level client shared by all synthesis coroutines; presumably picks up
# OPENAI_API_KEY from the environment at import time — confirm deployment config.
client = AsyncOpenAI()
async def synthesize_chunk(chunk: TextChunk, voice: str) -> tuple[int, bytes]:
    """Synthesize a single chunk to MP3.

    Returns ``(chunk.index, mp3_bytes)`` so callers can reorder results
    regardless of completion order.
    """
    response = await client.audio.speech.create(
        model="tts-1-hd",
        voice=voice,
        input=chunk.text,
        response_format="mp3",
    )
    return chunk.index, response.content
async def synthesize_long_text(
    text: str,
    voice: str = "alloy",
    max_concurrency: int = 5,
) -> bytes:
    """Synthesize speech for a text of arbitrary length.

    Splits ``text`` into chunks of at most 4000 characters, synthesizes them
    concurrently (bounded by ``max_concurrency`` as naive rate limiting), then
    stitches the MP3 segments back together in original order.

    Args:
        text: Source text; may exceed the per-request API limit.
        voice: OpenAI TTS voice name.
        max_concurrency: Upper bound on simultaneous API requests
            (previously hard-coded to 5).

    Returns:
        The combined audio as MP3 bytes, exported at 128 kbit/s.
    """
    chunks = split_text_for_tts(text, max_chars=4000)

    # Concurrent synthesis; the semaphore caps in-flight requests.
    semaphore = asyncio.Semaphore(max_concurrency)

    async def bounded_synthesize(chunk: TextChunk) -> tuple[int, bytes]:
        async with semaphore:
            return await synthesize_chunk(chunk, voice)

    results = await asyncio.gather(*[bounded_synthesize(c) for c in chunks])

    # asyncio.gather preserves submission order, but sort by the returned
    # index defensively before concatenating segments.
    sorted_results = sorted(results, key=lambda x: x[0])
    combined = AudioSegment.empty()
    for _, audio_bytes in sorted_results:
        audio = AudioSegment.from_mp3(io.BytesIO(audio_bytes))
        combined += audio

    output = io.BytesIO()
    combined.export(output, format="mp3", bitrate="128k")
    return output.getvalue()
```






