"""Process-wide voice recording + TTS API for the TUI gateway.
Wraps ``tools.voice_mode`` (recording/transcription) and ``tools.tts_tool``
(text-to-speech) behind idempotent, stateful entry points that the gateway's
``voice.record``, ``voice.toggle``, and ``voice.tts`` JSON-RPC handlers can
call from a dedicated thread. The gateway imports this module lazily so that
missing optional audio deps (sounddevice, faster-whisper, numpy) surface as
an ``ImportError`` at call time, not at startup.
Two usage modes are exposed:
* **Push-to-talk** (``start_recording`` / ``stop_and_transcribe``) — single
manually-bounded capture used when the caller drives the start/stop pair
explicitly.
* **Continuous (VAD)** (``start_continuous`` / ``stop_continuous``) — mirrors
the classic CLI voice mode: recording auto-stops on silence, transcribes,
hands the result to a callback, and then auto-restarts for the next turn.
Three consecutive no-speech cycles stop the loop and fire
``on_silent_limit`` so the UI can turn the mode off.
"""
from __future__ import annotations
import logging
import os
import sys
import threading
from typing import Any, Callable, Optional
_VOICE_MOD_ALIASES = {
"ctrl": "c-",
"control": "c-",
"alt": "a-",
"option": "a-",
"opt": "a-",
}
_VOICE_NAMED_KEYS = {
"space": "space",
"spc": "space",
"enter": "enter",
"return": "enter",
"ret": "enter",
"tab": "tab",
"escape": "escape",
"esc": "escape",
"backspace": "backspace",
"bs": "backspace",
"delete": "delete",
"del": "delete",
}
_VOICE_RESERVED_CTRL_CHARS = frozenset({"c", "d", "l"})
_VOICE_RESERVED_ALT_CHARS_MAC = frozenset({"c", "d", "l"})
_DEFAULT_PT_KEY = "c-b"
def voice_record_key_from_config(cfg: Any) -> Any:
"""Shape-safe ``cfg.voice.record_key`` lookup.
``load_config()`` deep-merges raw YAML and preserves scalar
overrides, so a hand-edited ``voice: true`` / ``voice: cmd+b``
leaves ``cfg["voice"]`` as a bool/str instead of a dict, and the
naive ``.get("voice", {}).get("record_key")`` chain raises
AttributeError before voice can even start (Copilot round-11 on
#19835). Return ``None`` for malformed shapes so call sites can
feed the result straight into the normalizer/formatter and get
the documented default.
"""
if not isinstance(cfg, dict):
return None
voice = cfg.get("voice")
if not isinstance(voice, dict):
return None
return voice.get("record_key")
def normalize_voice_record_key_for_prompt_toolkit(raw: Any) -> str:
"""Coerce ``voice.record_key`` into prompt_toolkit's ``c-x`` / ``a-x`` format.
Mirrors the TUI parser contract (``ui-tui/src/lib/platform.ts``)
so one config value binds the same shortcut in both runtimes:
* non-string / empty / typo'd / bare-char / multi-modifier / reserved
``ctrl+c|d|l`` → documented default ``c-b``
* single-char keys: ``ctrl+o`` → ``c-o``
* named keys: ``ctrl+space`` → ``c-space`` (aliases collapse:
``ctrl+return`` → ``c-enter``)
* ``super`` / ``win`` / ``windows`` → ``c-b`` (TUI-only modifiers —
prompt_toolkit has no super mod; the CLI binding site is
expected to warn when this fallback fires so users see the
cross-runtime split, Copilot round-11 on #19835)
"""
if not isinstance(raw, str):
return _DEFAULT_PT_KEY
lowered = raw.strip().lower()
if not lowered:
return _DEFAULT_PT_KEY
parts = [p.strip() for p in lowered.split("+") if p.strip()]
if not parts:
return _DEFAULT_PT_KEY
if len(parts) > 2:
return _DEFAULT_PT_KEY
if len(parts) == 1:
return _DEFAULT_PT_KEY
modifier_token, key_token = parts
if modifier_token in {"super", "win", "windows"}:
return _DEFAULT_PT_KEY
normalized_mod = _VOICE_MOD_ALIASES.get(modifier_token)
if not normalized_mod:
return _DEFAULT_PT_KEY
if len(key_token) == 1:
if normalized_mod == "c-" and key_token in _VOICE_RESERVED_CTRL_CHARS:
return _DEFAULT_PT_KEY
if (
normalized_mod == "a-"
and sys.platform == "darwin"
and key_token in _VOICE_RESERVED_ALT_CHARS_MAC
):
return _DEFAULT_PT_KEY
return f"{normalized_mod}{key_token}"
named = _VOICE_NAMED_KEYS.get(key_token)
if not named:
return _DEFAULT_PT_KEY
return f"{normalized_mod}{named}"
def format_voice_record_key_for_status(raw: Any) -> str:
"""Render ``voice.record_key`` for ``/voice status`` in CLI-friendly form.
Mirrors the TUI's ``formatVoiceRecordKey``: returns ``Ctrl+B`` /
``Alt+Space`` / ``Ctrl+Enter``. Malformed configs surface as the
documented default so status never advertises a shortcut that
won't bind (Copilot round-10 on #19835).
"""
normalized = normalize_voice_record_key_for_prompt_toolkit(raw)
if normalized.startswith("c-"):
prefix, key = "Ctrl+", normalized[2:]
elif normalized.startswith("a-"):
prefix, key = "Alt+", normalized[2:]
elif "+" in normalized:
mod, key = normalized.split("+", 1)
prefix = mod[0].upper() + mod[1:] + "+"
else:
return "Ctrl+B"
if not key:
return prefix.rstrip("+")
if len(key) == 1:
return prefix + key.upper()
return prefix + key[0].upper() + key[1:]
from tools.voice_mode import (
create_audio_recorder,
is_whisper_hallucination,
play_audio_file,
transcribe_recording,
)
logger = logging.getLogger(__name__)
def _debug(msg: str) -> None:
"""Emit a debug breadcrumb when HERMES_VOICE_DEBUG=1.
Goes to stderr so the TUI gateway wraps it as a gateway.stderr event,
which createGatewayEventHandler shows as an Activity line — exactly
what we need to diagnose "why didn't the loop auto-restart?" in the
user's real terminal without shipping a separate debug RPC.
Any OSError / BrokenPipeError is swallowed because this fires from
background threads (silence callback, TTS daemon, beep) where a
broken stderr pipe must not kill the whole gateway — the main
command pipe (stdin+stdout) is what actually matters.
"""
if os.environ.get("HERMES_VOICE_DEBUG", "").strip() != "1":
return
try:
print(f"[voice] {msg}", file=sys.stderr, flush=True)
except (BrokenPipeError, OSError):
pass
def _beeps_enabled() -> bool:
"""CLI parity: voice.beep_enabled in config.yaml (default True)."""
try:
from hermes_cli.config import load_config
voice_cfg = load_config().get("voice", {})
if isinstance(voice_cfg, dict):
return bool(voice_cfg.get("beep_enabled", True))
except Exception:
pass
return True
def _play_beep(frequency: int, count: int = 1) -> None:
"""Audible cue matching cli.py's record/stop beeps.
880 Hz single-beep on start (cli.py:_voice_start_recording line 7532),
660 Hz double-beep on stop (cli.py:_voice_stop_and_transcribe line 7585).
Best-effort — sounddevice failures are silently swallowed so the
voice loop never breaks because a speaker was unavailable.
"""
if not _beeps_enabled():
return
try:
from tools.voice_mode import play_beep
play_beep(frequency=frequency, count=count)
except Exception as e:
_debug(f"beep {frequency}Hz failed: {e}")
_recorder = None
_recorder_lock = threading.Lock()
_continuous_lock = threading.Lock()
_continuous_active = False
_continuous_stopping = False
_continuous_auto_restart: bool = True
_continuous_recorder: Any = None
_tts_playing = threading.Event()
_tts_playing.set()
_continuous_on_transcript: Optional[Callable[[str], None]] = None
_continuous_on_status: Optional[Callable[[str], None]] = None
_continuous_on_silent_limit: Optional[Callable[[], None]] = None
_continuous_no_speech_count = 0
_CONTINUOUS_NO_SPEECH_LIMIT = 3
def start_recording() -> None:
"""Begin capturing from the default input device (push-to-talk).
Idempotent — calling again while a recording is in progress is a no-op.
"""
global _recorder
with _recorder_lock:
if _recorder is not None and getattr(_recorder, "is_recording", False):
return
rec = create_audio_recorder()
rec.start()
_recorder = rec
def stop_and_transcribe() -> Optional[str]:
"""Stop the active push-to-talk recording, transcribe, return text.
Returns ``None`` when no recording is active, when the microphone
captured no speech, or when Whisper returned a known hallucination.
"""
global _recorder
with _recorder_lock:
rec = _recorder
_recorder = None
if rec is None:
return None
wav_path = rec.stop()
if not wav_path:
return None
try:
result = transcribe_recording(wav_path)
except Exception as e:
logger.warning("voice transcription failed: %s", e)
return None
finally:
try:
if os.path.isfile(wav_path):
os.unlink(wav_path)
except Exception:
pass
if not result.get("success"):
return None
text = (result.get("transcript") or "").strip()
if not text or is_whisper_hallucination(text):
return None
return text
def start_continuous(
on_transcript: Callable[[str], None],
on_status: Optional[Callable[[str], None]] = None,
on_silent_limit: Optional[Callable[[], None]] = None,
silence_threshold: int = 200,
silence_duration: float = 3.0,
auto_restart: bool = True,
) -> bool:
"""Start a VAD-driven continuous recording loop.
The loop calls ``on_transcript(text)`` each time speech is detected and
transcribed successfully. If ``auto_restart`` is True, it auto-restarts
for the next turn and resets the no-speech counter for that loop. If
``auto_restart`` is False, the first silence-triggered transcription ends
the loop and reports ``"idle"``; no-speech counts are retained across
starts so a push-to-talk caller can still enforce the three-strikes guard.
After ``_CONTINUOUS_NO_SPEECH_LIMIT`` consecutive silent cycles (no speech
picked up at all) the loop stops itself and calls ``on_silent_limit`` so the
UI can reflect "voice off". Returns False if a previous stop is still
transcribing/cleaning up; otherwise returns True. Idempotent — calling while
already active is a successful no-op.
``on_status`` is called with ``"listening"`` / ``"transcribing"`` /
``"idle"`` so the UI can show a live indicator.
"""
global _continuous_active, _continuous_recorder, _continuous_auto_restart
global _continuous_on_transcript, _continuous_on_status, _continuous_on_silent_limit
global _continuous_no_speech_count
with _continuous_lock:
if _continuous_active:
_debug("start_continuous: already active — no-op")
return True
if _continuous_stopping:
_debug("start_continuous: stop/transcribe in progress — busy")
return False
_continuous_active = True
_continuous_auto_restart = auto_restart
_continuous_on_transcript = on_transcript
_continuous_on_status = on_status
_continuous_on_silent_limit = on_silent_limit
if auto_restart:
_continuous_no_speech_count = 0
if _continuous_recorder is None:
_continuous_recorder = create_audio_recorder()
_continuous_recorder._silence_threshold = silence_threshold
_continuous_recorder._silence_duration = silence_duration
rec = _continuous_recorder
_debug(
f"start_continuous: begin (threshold={silence_threshold}, duration={silence_duration}s)"
)
_play_beep(frequency=880, count=1)
try:
rec.start(on_silence_stop=_continuous_on_silence)
except Exception as e:
logger.error("failed to start continuous recording: %s", e)
_debug(f"start_continuous: rec.start raised {type(e).__name__}: {e}")
with _continuous_lock:
_continuous_active = False
raise
if on_status:
try:
on_status("listening")
except Exception:
pass
return True
def stop_continuous(force_transcribe: bool = False) -> None:
"""Stop the active continuous loop and release the microphone.
Idempotent — calling while not active is a no-op. If ``force_transcribe`` is
True, the recorder stops synchronously, then transcription/cleanup runs on a
background thread before reporting ``"idle"``. Otherwise the buffer is
discarded.
"""
global _continuous_active, _continuous_on_transcript, _continuous_stopping
global _continuous_on_status, _continuous_on_silent_limit
global _continuous_recorder, _continuous_no_speech_count
with _continuous_lock:
if not _continuous_active:
return
_continuous_active = False
rec = _continuous_recorder
on_status = _continuous_on_status
on_transcript = _continuous_on_transcript
on_silent_limit = _continuous_on_silent_limit
auto_restart = _continuous_auto_restart
track_no_speech = force_transcribe and not auto_restart
_continuous_stopping = rec is not None
_continuous_on_transcript = None
_continuous_on_status = None
_continuous_on_silent_limit = None
if not track_no_speech:
_continuous_no_speech_count = 0
if rec is not None:
if force_transcribe and on_transcript:
if on_status:
try:
on_status("transcribing")
except Exception:
pass
try:
wav_path = rec.stop()
except Exception as e:
logger.warning("failed to stop recorder: %s", e)
try:
rec.cancel()
except Exception as cancel_error:
logger.warning("failed to cancel recorder: %s", cancel_error)
wav_path = None
def _transcribe_and_cleanup():
global _continuous_no_speech_count, _continuous_stopping
transcript: Optional[str] = None
should_halt = False
try:
if wav_path:
try:
result = transcribe_recording(wav_path)
if result.get("success"):
text = (result.get("transcript") or "").strip()
if text and not is_whisper_hallucination(text):
transcript = text
finally:
if os.path.isfile(wav_path):
os.unlink(wav_path)
except Exception as e:
logger.warning("failed to stop/transcribe recorder: %s", e)
finally:
if transcript:
try:
on_transcript(transcript)
except Exception as e:
logger.warning("on_transcript callback raised: %s", e)
if track_no_speech:
with _continuous_lock:
if transcript:
_continuous_no_speech_count = 0
else:
_continuous_no_speech_count += 1
should_halt = (
_continuous_no_speech_count
>= _CONTINUOUS_NO_SPEECH_LIMIT
)
if should_halt:
_continuous_no_speech_count = 0
if should_halt and on_silent_limit:
try:
on_silent_limit()
except Exception:
pass
_play_beep(frequency=660, count=2)
with _continuous_lock:
_continuous_stopping = False
if on_status:
try:
on_status("idle")
except Exception:
pass
threading.Thread(target=_transcribe_and_cleanup, daemon=True).start()
return
else:
try:
rec.cancel()
except Exception as e:
logger.warning("failed to cancel recorder: %s", e)
with _continuous_lock:
_continuous_stopping = False
_play_beep(frequency=660, count=2)
if on_status:
try:
on_status("idle")
except Exception:
pass
def is_continuous_active() -> bool:
"""Whether a continuous voice loop is currently running."""
with _continuous_lock:
return _continuous_active
def _continuous_on_silence() -> None:
"""AudioRecorder silence callback — runs in a daemon thread.
Stops the current capture, transcribes, delivers the text via
``on_transcript``, and — if the loop is still active — starts the
next capture. Three consecutive silent cycles end the loop.
"""
global _continuous_active, _continuous_no_speech_count
_debug("_continuous_on_silence: fired")
with _continuous_lock:
if not _continuous_active:
_debug("_continuous_on_silence: loop inactive — abort")
return
rec = _continuous_recorder
on_transcript = _continuous_on_transcript
on_status = _continuous_on_status
on_silent_limit = _continuous_on_silent_limit
if rec is None:
_debug("_continuous_on_silence: no recorder — abort")
return
if on_status:
try:
on_status("transcribing")
except Exception:
pass
wav_path = rec.stop()
peak_rms = getattr(rec, "_peak_rms", -1)
_debug(
f"_continuous_on_silence: rec.stop -> {wav_path!r} (peak_rms={peak_rms})"
)
_play_beep(frequency=660, count=2)
transcript: Optional[str] = None
if wav_path:
try:
result = transcribe_recording(wav_path)
success = bool(result.get("success"))
text = (result.get("transcript") or "").strip()
err = result.get("error")
_debug(
f"_continuous_on_silence: transcribe -> success={success} "
f"text={text!r} err={err!r}"
)
if success and text and not is_whisper_hallucination(text):
transcript = text
except Exception as e:
logger.warning("continuous transcription failed: %s", e)
_debug(f"_continuous_on_silence: transcribe raised {type(e).__name__}: {e}")
finally:
try:
if os.path.isfile(wav_path):
os.unlink(wav_path)
except Exception:
pass
with _continuous_lock:
if not _continuous_active:
_debug("_continuous_on_silence: stopped during transcribe — no restart")
return
if transcript:
_continuous_no_speech_count = 0
else:
_continuous_no_speech_count += 1
should_halt = _continuous_no_speech_count >= _CONTINUOUS_NO_SPEECH_LIMIT
no_speech = _continuous_no_speech_count
if transcript and on_transcript:
try:
on_transcript(transcript)
except Exception as e:
logger.warning("on_transcript callback raised: %s", e)
if should_halt:
_debug(f"_continuous_on_silence: {no_speech} silent cycles — halting")
with _continuous_lock:
_continuous_active = False
_continuous_no_speech_count = 0
if on_silent_limit:
try:
on_silent_limit()
except Exception:
pass
try:
rec.cancel()
except Exception:
pass
if on_status:
try:
on_status("idle")
except Exception:
pass
return
if not _tts_playing.is_set():
_debug("_continuous_on_silence: waiting for TTS to finish")
_tts_playing.wait(timeout=60)
import time as _time
_time.sleep(0.3)
with _continuous_lock:
if not _continuous_active:
_debug("_continuous_on_silence: stopped while waiting for TTS")
return
if _continuous_auto_restart:
_debug(f"_continuous_on_silence: restarting loop (no_speech={no_speech})")
_play_beep(frequency=880, count=1)
try:
rec.start(on_silence_stop=_continuous_on_silence)
except Exception as e:
logger.error("failed to restart continuous recording: %s", e)
_debug(f"_continuous_on_silence: restart raised {type(e).__name__}: {e}")
with _continuous_lock:
_continuous_active = False
if on_status:
try:
on_status("idle")
except Exception:
pass
return
if on_status:
try:
on_status("listening")
except Exception:
pass
else:
_debug("_continuous_on_silence: auto_restart=False, stopping loop")
with _continuous_lock:
_continuous_active = False
if on_status:
try:
on_status("idle")
except Exception:
pass
def speak_text(text: str) -> None:
"""Synthesize ``text`` with the configured TTS provider and play it.
Mirrors cli.py:_voice_speak_response exactly — same markdown strip
pipeline, same 4000-char cap, same explicit mp3 output path, same
MP3-over-OGG playback choice (afplay misbehaves on OGG), same cleanup
of both extensions. Keeping these in sync means a voice-mode TTS
session in the TUI sounds identical to one in the classic CLI.
While playback is in flight the module-level _tts_playing Event is
cleared so the continuous-recording loop knows to wait before
re-arming the mic (otherwise the agent's spoken reply feedback-loops
through the microphone and the agent ends up replying to itself).
"""
if not text or not text.strip():
return
import re
import tempfile
import time
paused_recording = False
with _continuous_lock:
if (
_continuous_active
and _continuous_recorder is not None
and getattr(_continuous_recorder, "is_recording", False)
):
try:
_continuous_recorder.cancel()
paused_recording = True
except Exception as e:
logger.warning("failed to pause recorder for TTS: %s", e)
_tts_playing.clear()
_debug(f"speak_text: TTS begin (paused_recording={paused_recording})")
try:
from tools.tts_tool import text_to_speech_tool
tts_text = text[:4000] if len(text) > 4000 else text
tts_text = re.sub(r'```[\s\S]*?```', ' ', tts_text)
tts_text = re.sub(r'\[([^\]]+)\]\([^)]+\)', r'\1', tts_text)
tts_text = re.sub(r'https?://\S+', '', tts_text)
tts_text = re.sub(r'\*\*(.+?)\*\*', r'\1', tts_text)
tts_text = re.sub(r'\*(.+?)\*', r'\1', tts_text)
tts_text = re.sub(r'`(.+?)`', r'\1', tts_text)
tts_text = re.sub(r'^#+\s*', '', tts_text, flags=re.MULTILINE)
tts_text = re.sub(r'^\s*[-*]\s+', '', tts_text, flags=re.MULTILINE)
tts_text = re.sub(r'---+', '', tts_text)
tts_text = re.sub(r'\n{3,}', '\n\n', tts_text)
tts_text = tts_text.strip()
if not tts_text:
return
os.makedirs(os.path.join(tempfile.gettempdir(), "hermes_voice"), exist_ok=True)
mp3_path = os.path.join(
tempfile.gettempdir(),
"hermes_voice",
f"tts_{time.strftime('%Y%m%d_%H%M%S')}.mp3",
)
_debug(f"speak_text: synthesizing {len(tts_text)} chars -> {mp3_path}")
text_to_speech_tool(text=tts_text, output_path=mp3_path)
if os.path.isfile(mp3_path) and os.path.getsize(mp3_path) > 0:
_debug(f"speak_text: playing {mp3_path} ({os.path.getsize(mp3_path)} bytes)")
play_audio_file(mp3_path)
try:
os.unlink(mp3_path)
ogg_path = mp3_path.rsplit(".", 1)[0] + ".ogg"
if os.path.isfile(ogg_path):
os.unlink(ogg_path)
except OSError:
pass
else:
_debug(f"speak_text: TTS tool produced no audio at {mp3_path}")
except Exception as e:
logger.warning("Voice TTS playback failed: %s", e)
_debug(f"speak_text raised {type(e).__name__}: {e}")
finally:
_tts_playing.set()
_debug("speak_text: TTS done")
if paused_recording:
time.sleep(0.3)
with _continuous_lock:
if _continuous_active and _continuous_recorder is not None:
try:
_continuous_recorder.start(
on_silence_stop=_continuous_on_silence
)
_debug("speak_text: recording resumed after TTS")
except Exception as e:
logger.warning(
"failed to resume recorder after TTS: %s", e
)