Development of AI-based voice control for interfaces for people with disabilities. Voice control is the primary input method for people with musculoskeletal disorders, visually impaired people, and elderly users with cognitive disabilities. Standards: WCAG 2.1 criterion 3.3.7, EN 301 549, GOST R 52872-2019.

### Voice control architecture

```python
from faster_whisper import WhisperModel
from openai import AsyncOpenAI
import asyncio
import pyaudio
import numpy as np
class AccessibilityVoiceController:
    """Wake-word-driven voice command dispatcher for accessible interfaces.

    Continuously captures microphone audio, transcribes it with
    faster-whisper, and routes the recognized phrase to an application
    command handler — by exact substring match first, with LLM intent
    classification as a fuzzy fallback.
    """

    def __init__(self, app_commands: dict):
        # STT: int8-quantized Whisper "base" on GPU for low-latency transcription.
        self.stt = WhisperModel("base", device="cuda", compute_type="int8")
        self.llm = AsyncOpenAI()
        # Mapping of spoken phrase -> async handler, e.g. {"открыть профиль": handler_fn, ...}
        self.commands = app_commands
        self.wake_word = "помощник"

    async def listen_and_execute(self):
        """Main loop: read 3-second audio chunks and act on wake-word phrases."""
        # NOTE(review): _open_mic_stream is not defined in this snippet —
        # assumed to return a pyaudio input Stream at 16 kHz mono int16.
        audio_stream = self._open_mic_stream()
        while True:
            # PyAudio's Stream.read takes the frame count positionally: its
            # parameter is named num_frames, so `frames=` raised a TypeError.
            audio_chunk = audio_stream.read(16000 * 3)  # 3 seconds at 16 kHz
            # int16 PCM -> float32 in [-1.0, 1.0], the range Whisper expects.
            audio_np = np.frombuffer(audio_chunk, dtype=np.int16).astype(np.float32) / 32768.0
            segments, _ = self.stt.transcribe(audio_np, language="ru", vad_filter=True)
            text = " ".join(s.text for s in segments).strip().lower()
            if not text or self.wake_word not in text:
                continue
            # Everything after the wake word is the command itself.
            command_text = text.split(self.wake_word, 1)[-1].strip()
            await self.process_command(command_text)

    async def process_command(self, text: str):
        """Dispatch *text* to a handler: exact substring match first, then LLM fallback."""
        # NOTE(review): speak_feedback is defined as a module-level coroutine
        # elsewhere in this article, not as a method — confirm how it is bound.
        # Exact match
        for cmd, handler in self.commands.items():
            if cmd in text:
                await handler()
                await self.speak_feedback(f"Выполняю: {cmd}")
                return
        # Fuzzy recognition via LLM
        intent = await self.classify_intent_with_llm(text)
        if intent and intent in self.commands:
            await self.commands[intent]()
            await self.speak_feedback(f"Понял, выполняю")
        else:
            await self.speak_feedback("Не понял команду. Повторите, пожалуйста.")

    async def classify_intent_with_llm(self, text: str) -> str | None:
        """Ask the LLM which known command *text* corresponds to.

        Returns the matching command key, or None when nothing matches.
        """
        available = list(self.commands.keys())
        response = await self.llm.chat.completions.create(
            model="gpt-4o-mini",
            temperature=0,  # deterministic classification, not creative output
            messages=[{
                "role": "system",
                "content": f"Определи, какой команде соответствует фраза пользователя. Доступные команды: {available}. Верни только название команды или 'null'."
            }, {
                "role": "user",
                "content": text
            }]
        )
        # Models frequently wrap the answer in quotes or change case; the raw
        # comparison silently missed valid intents. Normalize before matching
        # against the (lowercase) command keys.
        result = response.choices[0].message.content.strip().strip("\"'«»").lower()
        return result if result != "null" else None
```

### TTS feedback

```python
import edge_tts
import tempfile
import pygame
async def speak_feedback(text: str, voice: str = "ru-RU-DmitryNeural"):
    """Speak a system response aloud via Edge TTS (free neural voices).

    Synthesizes *text* into a temporary MP3, plays it with pygame, and waits
    without blocking the event loop until playback finishes.

    Args:
        text: The phrase to synthesize.
        voice: Edge TTS voice name (default: Russian male neural voice).
    """
    import os  # local import: only needed for temp-file cleanup

    tts = edge_tts.Communicate(text=text, voice=voice, rate="+10%")
    # delete=False because on Windows an open NamedTemporaryFile cannot be
    # reopened by another reader — close it first and pass only the path.
    with tempfile.NamedTemporaryFile(suffix=".mp3", delete=False) as f:
        tmp_path = f.name
    try:
        await tts.save(tmp_path)
        # The mixer must be initialized before music playback; the original
        # assumed someone else had already called pygame.mixer.init().
        if not pygame.mixer.get_init():
            pygame.mixer.init()
        pygame.mixer.music.load(tmp_path)
        pygame.mixer.music.play()
        # Poll instead of blocking so other coroutines keep running.
        while pygame.mixer.music.get_busy():
            await asyncio.sleep(0.1)
    finally:
        # The original leaked one MP3 per utterance; release the handle
        # (required on Windows) and delete the file.
        try:
            pygame.mixer.music.unload()  # pygame >= 2.0
        except Exception:
            pass
        try:
            os.remove(tmp_path)
        except OSError:
            pass
```

### Navigating the web interface

```python
# Commands for web navigation via Playwright/Selenium
class WebAccessibilityCommands:
    """Maps exact spoken Russian phrases to actions on a Playwright-style page.

    Keys are the phrases matched by the voice controller; values are
    callables that receive the page object `p`.
    """

    COMMAND_MAP = {
        # Navigation
        "перейти в профиль": lambda p: p.goto("/profile"),
        "открыть настройки": lambda p: p.goto("/settings"),
        # Visual adjustments via inline style on the document root
        "увеличить шрифт": lambda p: p.evaluate("document.documentElement.style.fontSize = '120%'"),
        "уменьшить шрифт": lambda p: p.evaluate("document.documentElement.style.fontSize = '90%'"),
        # Page interaction
        "нажать кнопку сохранить": lambda p: p.click("button:has-text('Сохранить')"),
        "прокрутить вниз": lambda p: p.keyboard.press("End"),
        # NOTE(review): read_page_content and fill_name_field are defined
        # outside this snippet — confirm their signatures take the page object.
        "прочитать страницу": lambda p: read_page_content(p),
        "заполнить поле имени": fill_name_field,
    }
```

### Screen reader compatibility

Voice control complements (not replaces) screen readers. Integration via ARIA live regions:

```html
<!-- Voice-command status for the screen reader: polite live region,
     announced in full (aria-atomic) and visually hidden (sr-only). -->
<div
  id="voice-status"
  role="status"
  aria-live="polite"
  aria-atomic="true"
  class="sr-only"
>
  <!-- JS inserts status text here, e.g.: "Команда выполнена: открыть профиль" -->
</div>
<!-- Visual listening indicator; aria-pressed exposes the toggle state. -->
<button
  id="voice-toggle"
  aria-label="Голосовое управление"
  aria-pressed="false"
>
  <span class="mic-icon" aria-hidden="true"></span>
  <span class="sr-only">Активировать голосовое управление</span>
</button>
```

### Accessibility and Personalization

| Need | Solution |
|------|----------|
| Speech impairments / stuttering | Extended timeout (10 s), repetition |
| Accent / dialect | Fine-tuned Whisper or Yandex SpeechKit |
| Slow speech | Reduced VAD threshold |
| Cognitive features | Simple one-word commands, prompts |
| Works in noise | DeepFilterNet before STT |

Timeframe: voice control for one web app — 2–3 weeks. A customizable system with user profiles and accent training — 6–8 weeks.