|
| 1 | +/** |
| 2 | + * @license |
| 3 | + * Copyright 2026 Google LLC |
| 4 | + * SPDX-License-Identifier: Apache-2.0 |
| 5 | + */ |
| 6 | + |
| 7 | +import { debugLogger } from '../utils/debugLogger.js'; |
| 8 | +import type { |
| 9 | + AudioInputProvider, |
| 10 | + SpeechToTextAdapter, |
| 11 | + TextToSpeechAdapter, |
| 12 | + VoiceSessionConfig, |
| 13 | +} from './types.js'; |
| 14 | +import { VoiceState } from './types.js'; |
| 15 | + |
/**
 * Orchestrates the voice mode lifecycle.
 *
 * Wires together an AudioInputProvider, SpeechToTextAdapter, and
 * TextToSpeechAdapter into a listen→transcribe→respond→speak loop.
 *
 * This is a skeleton — real audio backends will be injected later.
 * The controller is intentionally thin so it can be tested without hardware.
 *
 * State handling:
 * - `start()` moves Idle → Listening; each audio chunk then cycles
 *   Listening → Processing → Speaking → Listening.
 * - A failure inside the chunk pipeline moves the controller to Error, where
 *   it stays (further chunks are ignored) until `stop()` is called.
 * - `stop()` always ends in Idle and invalidates any chunk callback that is
 *   still awaiting transcription or speech, so stale work can neither speak
 *   nor overwrite the state of a stopped (or restarted) session.
 */
export class VoiceModeController {
  private state: VoiceState = VoiceState.Idle;

  /**
   * Incremented on every `start()` and `stop()`. A chunk callback captures the
   * value that was current when its session started and treats any mismatch
   * as "this session is over", which guards every state write made after an
   * `await`.
   */
  private sessionGeneration = 0;

  private readonly audioInput: AudioInputProvider;
  private readonly stt: SpeechToTextAdapter;
  private readonly tts: TextToSpeechAdapter;
  private readonly config: VoiceSessionConfig;

  /**
   * @param audioInput Source of audio chunks for the session.
   * @param stt Adapter used to transcribe each audio chunk.
   * @param tts Adapter used to speak the response.
   * @param config Optional session settings; only used for logging here.
   */
  constructor(
    audioInput: AudioInputProvider,
    stt: SpeechToTextAdapter,
    tts: TextToSpeechAdapter,
    config: VoiceSessionConfig = {},
  ) {
    this.audioInput = audioInput;
    this.stt = stt;
    this.tts = tts;
    this.config = config;
  }

  /** Current lifecycle state. */
  getState(): VoiceState {
    return this.state;
  }

  /**
   * Start the voice session.
   * Opens the audio input and begins the listen loop.
   *
   * Calling this while the controller is not Idle logs a warning and does
   * nothing.
   *
   * @throws Re-throws whatever `audioInput.start()` rejects with, after
   *   resetting the controller to Idle so a later `start()` can retry.
   */
  async start(): Promise<void> {
    if (this.state !== VoiceState.Idle) {
      debugLogger.warn(
        `VoiceModeController.start() called in state "${this.state}", ignoring.`,
      );
      return;
    }

    debugLogger.log(
      `[voice] Starting voice mode (locale=${this.config.locale ?? 'default'}, ` +
        `sampleRate=${String(this.config.sampleRate ?? 16000)})`,
    );

    // Token identifying this session; stop() (or a later start()) bumps the
    // counter, which turns every callback created below into a no-op.
    const generation = ++this.sessionGeneration;
    const isCurrentSession = (): boolean =>
      this.sessionGeneration === generation;

    this.state = VoiceState.Listening;

    try {
      await this.audioInput.start(async (chunk) => {
        // Drop chunks that arrive after stop(), or while a previous chunk is
        // still being processed/spoken, or after a pipeline error.
        if (!isCurrentSession() || this.state !== VoiceState.Listening) return;

        try {
          this.state = VoiceState.Processing;
          const transcript = await this.stt.transcribe(chunk);

          // The session may have been stopped while transcription was in
          // flight; do not speak or touch state on behalf of a dead session.
          if (!isCurrentSession()) return;

          if (transcript.trim().length === 0) {
            this.state = VoiceState.Listening;
            return;
          }

          debugLogger.log(`[voice] Transcript: "${transcript}"`);

          // In the future this is where the transcript feeds into the Gemini
          // conversation loop (GeminiClient.sendMessageStream). For now, we
          // echo it back through TTS as a proof-of-lifecycle.
          this.state = VoiceState.Speaking;
          await this.tts.speak(transcript);
        } catch (err) {
          debugLogger.error('[voice] Error in voice pipeline:', err);
          // Only the live session may enter Error; a stale callback failing
          // (e.g. speech cancelled by stop()) must not clobber Idle.
          if (isCurrentSession()) {
            this.state = VoiceState.Error;
          }
        } finally {
          // Return to Listening after a successful cycle. Error is left as-is,
          // and a stale callback never writes state.
          if (
            isCurrentSession() &&
            (this.state === VoiceState.Speaking ||
              this.state === VoiceState.Processing)
          ) {
            this.state = VoiceState.Listening;
          }
        }
      });
    } catch (err) {
      // The audio input failed to start (or failed while running): without
      // this reset the controller would stay in Listening and silently ignore
      // every subsequent start().
      if (isCurrentSession()) {
        this.state = VoiceState.Idle;
      }
      throw err;
    }
  }

  /**
   * Stop the voice session and release resources.
   *
   * Cancels any speech, stops the audio input, and leaves the controller in
   * Idle. The audio input is stopped and the state is reset even when
   * cancelling speech fails; the failure is still propagated to the caller.
   */
  async stop(): Promise<void> {
    debugLogger.log('[voice] Stopping voice mode.');

    // Invalidate in-flight chunk callbacks before awaiting anything, so they
    // cannot start speaking or change state while teardown is in progress.
    this.sessionGeneration += 1;

    try {
      await this.tts.cancel();
    } finally {
      try {
        await this.audioInput.stop();
      } finally {
        this.state = VoiceState.Idle;
      }
    }
  }
}
0 commit comments