Skip to content

Commit 5e313f9

Browse files
committed
feat(cli): introduce experimental voice mode architecture skeleton
1 parent d012929 commit 5e313f9

9 files changed

Lines changed: 230 additions & 5 deletions

File tree

.claude/settings.local.json

Lines changed: 31 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,31 @@
1+
{
2+
"permissions": {
3+
"allow": [
4+
"Bash(done)",
5+
"Bash(xargs grep -l \"mode\")",
6+
"Bash(xargs grep -l \"hook\\\\|middleware\\\\|plugin\")",
7+
"Bash(xargs grep -l \"mode\\\\|Mode\")",
8+
"Bash(xargs grep -l \"SecurityModel\\\\|sandbox\")",
9+
"Bash(xargs -I {} bash -c 'echo \"\"=== {} ===\"\" && head -50 \"\"{}\"\"')",
10+
"Bash(node --version)",
11+
"Bash(npm install)",
12+
"Bash(npm run build)",
13+
"Bash(npm start)",
14+
"Bash(node packages/cli/bundle/gemini.js --version)",
15+
"Bash(node bundle/gemini.js --version)",
16+
"Bash(node bundle/gemini.js --help)",
17+
"Bash(npm run build --workspace=@google/gemini-cli-core)",
18+
"Bash(npm run build --workspace=@google/gemini-cli)",
19+
"Bash(node bundle/gemini.js --voice)",
20+
"Bash(npm run bundle)",
21+
"Bash(npm start -- --voice)",
22+
"Bash(node bundle/gemini.js)",
23+
"Bash(echo \"EXIT CODE: $?\")",
24+
"Bash(npm run test --workspace=@google/gemini-cli-core)",
25+
"Bash(npm run test --workspace=@google/gemini-cli)",
26+
"Bash(npx vitest run packages/cli/src/gemini.test.tsx)",
27+
"Bash(git checkout -- 'packages/cli/src/ui/components/__snapshots__/ConfigInitDisplay.test.tsx.snap')",
28+
"Bash(git checkout -- package-lock.json)"
29+
]
30+
}
31+
}

packages/cli/src/config/config.ts

Lines changed: 7 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -78,6 +78,7 @@ export interface CliArgs {
7878
allowedTools: string[] | undefined;
7979
acp?: boolean;
8080
experimentalAcp?: boolean;
81+
experimentalVoice: boolean | undefined;
8182
extensions: string[] | undefined;
8283
listExtensions: boolean | undefined;
8384
resume: string | typeof RESUME_LATEST | undefined;
@@ -182,6 +183,11 @@ export async function parseArguments(
182183
description:
183184
'Starts the agent in ACP mode (deprecated, use --acp instead)',
184185
})
186+
.option('experimental-voice', {
187+
type: 'boolean',
188+
description: '[Experimental] Start in hands-free voice mode',
189+
hidden: true,
190+
})
185191
.option('allowed-mcp-server-names', {
186192
type: 'array',
187193
string: true,
@@ -758,7 +764,7 @@ export async function loadCliConfig(
758764
bugCommand: settings.advanced?.bugCommand,
759765
model: resolvedModel,
760766
maxSessionTurns: settings.model?.maxSessionTurns,
761-
767+
experimentalVoice: argv.experimentalVoice || false,
762768
listExtensions: argv.listExtensions || false,
763769
listSessions: argv.listSessions || false,
764770
deleteSession: argv.deleteSession,

packages/cli/src/gemini.test.tsx

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -484,6 +484,7 @@ describe('gemini.tsx main function kitty protocol', () => {
484484
allowedMcpServerNames: undefined,
485485
allowedTools: undefined,
486486
experimentalAcp: undefined,
487+
experimentalVoice: undefined,
487488
extensions: undefined,
488489
listExtensions: undefined,
489490
includeDirectories: undefined,

packages/cli/src/gemini.tsx

Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -676,6 +676,15 @@ export async function main() {
676676
return runAcpClient(config, settings, argv);
677677
}
678678

679+
if (config.getExperimentalVoice()) {
680+
writeToStderr(
681+
'[experimental] Voice mode is not yet implemented. ' +
682+
'The --voice flag registers the architectural skeleton only.\n',
683+
);
684+
await runExitCleanup();
685+
process.exit(ExitCodes.SUCCESS);
686+
}
687+
679688
let input = config.getQuestion();
680689
const useAlternateBuffer = shouldEnterAlternateScreen(
681690
isAlternateBufferEnabled(config),

packages/cli/src/gemini_cleanup.test.tsx

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -217,6 +217,7 @@ describe('gemini.tsx main function cleanup', () => {
217217
getMcpClientManager: vi.fn(),
218218
getIdeMode: vi.fn(() => false),
219219
getAcpMode: vi.fn(() => true),
220+
getExperimentalVoice: vi.fn(() => false),
220221
getScreenReader: vi.fn(() => false),
221222
getGeminiMdFileCount: vi.fn(() => 0),
222223
getProjectRoot: vi.fn(() => '/'),

packages/cli/src/test-utils/mockConfig.ts

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -43,6 +43,7 @@ export const createMockConfig = (overrides: Partial<Config> = {}): Config =>
4343
getSessionId: vi.fn().mockReturnValue('mock-session-id'),
4444
getContentGeneratorConfig: vi.fn(() => ({ authType: 'google' })),
4545
getAcpMode: vi.fn(() => false),
46+
getExperimentalVoice: vi.fn(() => false),
4647
isBrowserLaunchSuppressed: vi.fn(() => false),
4748
setRemoteAdminSettings: vi.fn(),
4849
isYoloModeDisabled: vi.fn(() => false),

packages/core/src/config/config.ts

Lines changed: 5 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -516,6 +516,7 @@ export interface ConfigParameters {
516516
disableLoopDetection?: boolean;
517517
maxSessionTurns?: number;
518518
acpMode?: boolean;
519+
experimentalVoice?: boolean;
519520
listSessions?: boolean;
520521
deleteSession?: string;
521522
listExtensions?: boolean;
@@ -715,6 +716,7 @@ export class Config implements McpContext {
715716
| Record<string, SummarizeToolOutputSettings>
716717
| undefined;
717718
private readonly acpMode: boolean = false;
719+
private readonly experimentalVoice: boolean = false;
718720
private readonly loadMemoryFromIncludeDirectories: boolean = false;
719721
private readonly includeDirectoryTree: boolean = true;
720722
private readonly importFormat: 'tree' | 'flat';
@@ -912,6 +914,7 @@ export class Config implements McpContext {
912914
};
913915
this.maxSessionTurns = params.maxSessionTurns ?? -1;
914916
this.acpMode = params.acpMode ?? false;
917+
this.experimentalVoice = params.experimentalVoice ?? false;
915918
this.listSessions = params.listSessions ?? false;
916919
this.deleteSession = params.deleteSession;
917920
this.listExtensions = params.listExtensions ?? false;
@@ -2255,10 +2258,8 @@ export class Config implements McpContext {
22552258
return this.acpMode;
22562259
}
22572260

2258-
async waitForMcpInit(): Promise<void> {
2259-
if (this.mcpInitializationPromise) {
2260-
await this.mcpInitializationPromise;
2261-
}
2261+
getExperimentalVoice(): boolean {
2262+
return this.experimentalVoice;
22622263
}
22632264

22642265
getListExtensions(): boolean {

packages/core/src/voice/types.ts

Lines changed: 67 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,67 @@
1+
/**
2+
* @license
3+
* Copyright 2026 Google LLC
4+
* SPDX-License-Identifier: Apache-2.0
5+
*/
6+
7+
/**
8+
* Contracts for the Hands-Free Voice Mode pipeline.
9+
*
10+
* Architecture:
11+
* Mic → AudioInputProvider → SpeechToTextAdapter → [Gemini API] → TextToSpeechAdapter → Speaker
12+
*
13+
* Each interface is designed to be swappable — the initial implementation
14+
* will use no-op stubs, and real backends (native audio, WebSocket bridges,
15+
* MCP audio servers, Gemini Live API) can be plugged in later.
16+
*/
17+
18+
/**
 * PCM audio chunk emitted by an AudioInputProvider.
 *
 * All fields are read-only: a chunk describes one captured slice of audio
 * together with the format information needed to interpret its samples.
 */
export interface AudioChunk {
  /** Raw PCM sample data. */
  readonly samples: Buffer;
  /** Sample rate in Hz (e.g. 16000). */
  readonly sampleRate: number;
  /** Number of audio channels (1 = mono, 2 = stereo). */
  readonly channels: number;
}
27+
28+
/**
 * Captures audio from a microphone or other input device.
 *
 * Implementations are meant to be swappable, so consumers should rely only
 * on the three members declared here.
 */
export interface AudioInputProvider {
  /**
   * Begin capturing audio.
   *
   * @param onChunk Callback through which implementations should emit each
   *   captured chunk. Its return value is ignored.
   */
  start(onChunk: (chunk: AudioChunk) => void): Promise<void>;
  /** Stop capturing and release resources. */
  stop(): Promise<void>;
  /** Whether the provider is currently capturing. */
  isActive(): boolean;
}
37+
38+
/** Converts an audio chunk to text (speech-to-text). */
export interface SpeechToTextAdapter {
  /**
   * Transcribe a single audio chunk.
   *
   * @param chunk The audio to transcribe.
   * @returns The transcribed text.
   */
  transcribe(chunk: AudioChunk): Promise<string>;
}
43+
44+
/** Converts text to audible speech (text-to-speech). */
export interface TextToSpeechAdapter {
  /**
   * Synthesize text into audio and play it back.
   *
   * @param text The text to speak.
   * @returns A promise that resolves when playback ends.
   */
  speak(text: string): Promise<void>;
  /** Interrupt any in-progress playback. */
  cancel(): Promise<void>;
}
51+
52+
/**
 * Configuration for a voice session.
 *
 * Every field is optional; an empty object is a valid configuration.
 */
export interface VoiceSessionConfig {
  /** Sample rate in Hz for audio capture (default: 16000). */
  sampleRate?: number;
  /** Locale/language code for STT/TTS (e.g. "en-US"). */
  locale?: string;
}
59+
60+
/**
 * Lifecycle states for the voice mode controller.
 *
 * A string enum, so each member's runtime value is the lowercase name shown
 * here (useful in log output and serialized state).
 */
export enum VoiceState {
  Idle = 'idle',
  Listening = 'listening',
  Processing = 'processing',
  Speaking = 'speaking',
  Error = 'error',
}
Lines changed: 108 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,108 @@
1+
/**
2+
* @license
3+
* Copyright 2026 Google LLC
4+
* SPDX-License-Identifier: Apache-2.0
5+
*/
6+
7+
import { debugLogger } from '../utils/debugLogger.js';
8+
import type {
9+
AudioInputProvider,
10+
SpeechToTextAdapter,
11+
TextToSpeechAdapter,
12+
VoiceSessionConfig,
13+
} from './types.js';
14+
import { VoiceState } from './types.js';
15+
16+
/**
 * Orchestrates the voice mode lifecycle.
 *
 * Wires together an AudioInputProvider, SpeechToTextAdapter, and
 * TextToSpeechAdapter into a listen→transcribe→respond→speak loop.
 *
 * This is a skeleton: the transcript is currently echoed back through TTS
 * rather than sent to a model. The collaborators are injected so the
 * controller can be exercised without audio hardware.
 *
 * State transitions:
 *   Idle → Listening (start) → Processing → Speaking → Listening …
 *   any → Idle (stop)
 * A pipeline failure moves the controller to Error, where incoming chunks are
 * dropped and start() is ignored; stop() returns it to Idle.
 */
export class VoiceModeController {
  private state: VoiceState = VoiceState.Idle;
  /**
   * Incremented by every start() and stop(). A chunk handler captures the
   * value current when its session began, so work that is still awaiting when
   * the session ends can detect that it is stale and stop touching state.
   */
  private generation = 0;
  private readonly audioInput: AudioInputProvider;
  private readonly stt: SpeechToTextAdapter;
  private readonly tts: TextToSpeechAdapter;
  private readonly config: VoiceSessionConfig;

  /**
   * @param audioInput Source of audio chunks.
   * @param stt Adapter used to transcribe each chunk.
   * @param tts Adapter used to speak the response.
   * @param config Optional session settings; only used for logging here.
   */
  constructor(
    audioInput: AudioInputProvider,
    stt: SpeechToTextAdapter,
    tts: TextToSpeechAdapter,
    config: VoiceSessionConfig = {},
  ) {
    this.audioInput = audioInput;
    this.stt = stt;
    this.tts = tts;
    this.config = config;
  }

  /** Current lifecycle state. */
  getState(): VoiceState {
    return this.state;
  }

  /**
   * Start the voice session.
   * Opens the audio input and begins the listen loop.
   *
   * Calling this in any state other than Idle logs a warning and does nothing.
   *
   * @throws Whatever `audioInput.start()` rejects with. In that case the
   *   controller is returned to Idle so the caller can retry.
   */
  async start(): Promise<void> {
    if (this.state !== VoiceState.Idle) {
      debugLogger.warn(
        `VoiceModeController.start() called in state "${this.state}", ignoring.`,
      );
      return;
    }

    debugLogger.log(
      `[voice] Starting voice mode (locale=${this.config.locale ?? 'default'}, ` +
        `sampleRate=${String(this.config.sampleRate ?? 16000)})`,
    );

    const generation = ++this.generation;
    const isCurrent = (): boolean => generation === this.generation;

    this.state = VoiceState.Listening;

    try {
      await this.audioInput.start(async (chunk) => {
        // Chunks that arrive while one is in flight, after an error, or after
        // the session ended are dropped.
        if (!isCurrent() || this.state !== VoiceState.Listening) return;

        try {
          this.state = VoiceState.Processing;
          const transcript = await this.stt.transcribe(chunk);

          // stop() may have completed while we were awaiting; do not speak or
          // change state on behalf of a session that no longer exists.
          if (!isCurrent()) return;

          // Nothing recognised: the finally block returns us to Listening.
          if (transcript.trim().length === 0) return;

          debugLogger.log(`[voice] Transcript: "${transcript}"`);

          // In the future this is where the transcript feeds into the Gemini
          // conversation loop (GeminiClient.sendMessageStream). For now, we
          // echo it back through TTS as a proof-of-lifecycle.
          this.state = VoiceState.Speaking;
          await this.tts.speak(transcript);
        } catch (err: unknown) {
          debugLogger.error('[voice] Error in voice pipeline:', err);
          if (isCurrent()) {
            this.state = VoiceState.Error;
          }
        } finally {
          if (
            isCurrent() &&
            (this.state === VoiceState.Speaking ||
              this.state === VoiceState.Processing)
          ) {
            this.state = VoiceState.Listening;
          }
        }
      });
    } catch (err: unknown) {
      // The input never opened; without this reset the controller would sit
      // in Listening forever and ignore every later start().
      if (isCurrent()) {
        this.state = VoiceState.Idle;
      }
      throw err;
    }
  }

  /**
   * Stop the voice session and release resources.
   *
   * In-flight chunk processing is invalidated first, so it can no longer
   * speak or change state. The audio input is stopped even when cancelling
   * playback fails.
   *
   * @throws Whatever `tts.cancel()` or `audioInput.stop()` rejects with.
   */
  async stop(): Promise<void> {
    debugLogger.log('[voice] Stopping voice mode.');
    this.generation += 1;
    try {
      await this.tts.cancel();
    } finally {
      await this.audioInput.stop();
      this.state = VoiceState.Idle;
    }
  }
}

0 commit comments

Comments
 (0)