feat(cli): introduce experimental voice mode architecture skeleton

Sangini-spec · Sangini-spec · commit 5e313f9c75e0 · 2026-03-08T22:03:15.000+05:30
diff --git a/.claude/settings.local.json b/.claude/settings.local.json
@@ -0,0 +1,31 @@
+{
+  "permissions": {
+    "allow": [
+      "Bash(done)",
+      "Bash(xargs grep -l \"mode\")",
+      "Bash(xargs grep -l \"hook\\\\|middleware\\\\|plugin\")",
+      "Bash(xargs grep -l \"mode\\\\|Mode\")",
+      "Bash(xargs grep -l \"SecurityModel\\\\|sandbox\")",
+      "Bash(xargs -I {} bash -c 'echo \"\"=== {} ===\"\" && head -50 \"\"{}\"\"')",
+      "Bash(node --version)",
+      "Bash(npm install)",
+      "Bash(npm run build)",
+      "Bash(npm start)",
+      "Bash(node packages/cli/bundle/gemini.js --version)",
+      "Bash(node bundle/gemini.js --version)",
+      "Bash(node bundle/gemini.js --help)",
+      "Bash(npm run build --workspace=@google/gemini-cli-core)",
+      "Bash(npm run build --workspace=@google/gemini-cli)",
+      "Bash(node bundle/gemini.js --voice)",
+      "Bash(npm run bundle)",
+      "Bash(npm start -- --voice)",
+      "Bash(node bundle/gemini.js)",
+      "Bash(echo \"EXIT CODE: $?\")",
+      "Bash(npm run test --workspace=@google/gemini-cli-core)",
+      "Bash(npm run test --workspace=@google/gemini-cli)",
+      "Bash(npx vitest run packages/cli/src/gemini.test.tsx)",
+      "Bash(git checkout -- 'packages/cli/src/ui/components/__snapshots__/ConfigInitDisplay.test.tsx.snap')",
+      "Bash(git checkout -- package-lock.json)"
+    ]
+  }
+}
diff --git a/packages/cli/src/config/config.ts b/packages/cli/src/config/config.ts
@@ -78,6 +78,7 @@ export interface CliArgs {
   allowedTools: string[] | undefined;
   acp?: boolean;
   experimentalAcp?: boolean;
+  experimentalVoice: boolean | undefined;
   extensions: string[] | undefined;
   listExtensions: boolean | undefined;
   resume: string | typeof RESUME_LATEST | undefined;
@@ -182,6 +183,11 @@ export async function parseArguments(
           description:
             'Starts the agent in ACP mode (deprecated, use --acp instead)',
         })
+        .option('experimental-voice', {
+          type: 'boolean',
+          description: '[Experimental] Start in hands-free voice mode',
+          hidden: true,
+        })
         .option('allowed-mcp-server-names', {
           type: 'array',
           string: true,
@@ -758,7 +764,7 @@ export async function loadCliConfig(
     bugCommand: settings.advanced?.bugCommand,
     model: resolvedModel,
     maxSessionTurns: settings.model?.maxSessionTurns,
-
+    experimentalVoice: argv.experimentalVoice || false,
     listExtensions: argv.listExtensions || false,
     listSessions: argv.listSessions || false,
     deleteSession: argv.deleteSession,
diff --git a/packages/cli/src/gemini.test.tsx b/packages/cli/src/gemini.test.tsx
@@ -484,6 +484,7 @@ describe('gemini.tsx main function kitty protocol', () => {
       allowedMcpServerNames: undefined,
       allowedTools: undefined,
       experimentalAcp: undefined,
+      experimentalVoice: undefined,
       extensions: undefined,
       listExtensions: undefined,
       includeDirectories: undefined,
diff --git a/packages/cli/src/gemini.tsx b/packages/cli/src/gemini.tsx
@@ -676,6 +676,18 @@ export async function main() {
       return runAcpClient(config, settings, argv);
     }
 
+    if (config.getExperimentalVoice()) {
+      // Voice mode is only a skeleton so far: say so on stderr and exit
+      // successfully instead of falling through to the normal startup path.
+      writeToStderr(
+        '[experimental] Voice mode is not yet implemented. ' +
+          'The --experimental-voice flag registers the architectural ' +
+          'skeleton only.\n',
+      );
+      await runExitCleanup();
+      process.exit(ExitCodes.SUCCESS);
+    }
+
     let input = config.getQuestion();
     const useAlternateBuffer = shouldEnterAlternateScreen(
       isAlternateBufferEnabled(config),
diff --git a/packages/cli/src/gemini_cleanup.test.tsx b/packages/cli/src/gemini_cleanup.test.tsx
@@ -217,6 +217,7 @@ describe('gemini.tsx main function cleanup', () => {
       getMcpClientManager: vi.fn(),
       getIdeMode: vi.fn(() => false),
       getAcpMode: vi.fn(() => true),
+      getExperimentalVoice: vi.fn(() => false),
       getScreenReader: vi.fn(() => false),
       getGeminiMdFileCount: vi.fn(() => 0),
       getProjectRoot: vi.fn(() => '/'),
diff --git a/packages/cli/src/test-utils/mockConfig.ts b/packages/cli/src/test-utils/mockConfig.ts
@@ -43,6 +43,7 @@ export const createMockConfig = (overrides: Partial<Config> = {}): Config =>
     getSessionId: vi.fn().mockReturnValue('mock-session-id'),
     getContentGeneratorConfig: vi.fn(() => ({ authType: 'google' })),
     getAcpMode: vi.fn(() => false),
+    getExperimentalVoice: vi.fn(() => false),
     isBrowserLaunchSuppressed: vi.fn(() => false),
     setRemoteAdminSettings: vi.fn(),
     isYoloModeDisabled: vi.fn(() => false),
diff --git a/packages/core/src/config/config.ts b/packages/core/src/config/config.ts
@@ -516,6 +516,7 @@ export interface ConfigParameters {
   disableLoopDetection?: boolean;
   maxSessionTurns?: number;
   acpMode?: boolean;
+  experimentalVoice?: boolean;
   listSessions?: boolean;
   deleteSession?: string;
   listExtensions?: boolean;
@@ -715,6 +716,7 @@ export class Config implements McpContext {
     | Record<string, SummarizeToolOutputSettings>
     | undefined;
   private readonly acpMode: boolean = false;
+  private readonly experimentalVoice: boolean = false;
   private readonly loadMemoryFromIncludeDirectories: boolean = false;
   private readonly includeDirectoryTree: boolean = true;
   private readonly importFormat: 'tree' | 'flat';
@@ -912,6 +914,7 @@ export class Config implements McpContext {
     };
     this.maxSessionTurns = params.maxSessionTurns ?? -1;
     this.acpMode = params.acpMode ?? false;
+    this.experimentalVoice = params.experimentalVoice ?? false;
     this.listSessions = params.listSessions ?? false;
     this.deleteSession = params.deleteSession;
     this.listExtensions = params.listExtensions ?? false;
@@ -2255,10 +2258,15 @@ export class Config implements McpContext {
     return this.acpMode;
   }
 
+  /** Whether the experimental hands-free voice mode was requested. */
+  getExperimentalVoice(): boolean {
+    return this.experimentalVoice;
+  }
+
   async waitForMcpInit(): Promise<void> {
     if (this.mcpInitializationPromise) {
       await this.mcpInitializationPromise;
     }
   }
 
   getListExtensions(): boolean {
diff --git a/packages/core/src/voice/types.ts b/packages/core/src/voice/types.ts
@@ -0,0 +1,67 @@
+/**
+ * @license
+ * Copyright 2026 Google LLC
+ * SPDX-License-Identifier: Apache-2.0
+ */
+
+/**
+ * Contracts for the Hands-Free Voice Mode pipeline.
+ *
+ * Architecture:
+ *   Mic → AudioInputProvider → SpeechToTextAdapter → [Gemini API] → TextToSpeechAdapter → Speaker
+ *
+ * Each interface is designed to be swappable — the initial implementation
+ * will use no-op stubs, and real backends (native audio, WebSocket bridges,
+ * MCP audio servers, Gemini Live API) can be plugged in later.
+ */
+
+/** One chunk of PCM audio, as handed to the callback of an AudioInputProvider. */
+export interface AudioChunk {
+  /** Raw PCM sample data. NOTE(review): bit depth and endianness are not specified here — confirm. */
+  readonly samples: Buffer;
+  /** Sample rate in Hz (e.g. 16000). */
+  readonly sampleRate: number;
+  /** Number of audio channels (1 = mono, 2 = stereo). */
+  readonly channels: number;
+}
+
+/** Captures audio from a microphone or other input device. */
+export interface AudioInputProvider {
+  /** Begin capturing audio and emit each chunk via `onChunk`, whose result is typed void (not awaited). */
+  start(onChunk: (chunk: AudioChunk) => void): Promise<void>;
+  /** Stop capturing and release resources. */
+  stop(): Promise<void>;
+  /** Whether the provider is currently capturing. */
+  isActive(): boolean;
+}
+
+/** Converts an audio chunk to text (speech-to-text). */
+export interface SpeechToTextAdapter {
+  /** Transcribe a single audio chunk. VoiceModeController ignores a whitespace-only result. */
+  transcribe(chunk: AudioChunk): Promise<string>;
+}
+
+/** Converts text to audible speech (text-to-speech). */
+export interface TextToSpeechAdapter {
+  /** Synthesize text into audio and play it back. Resolves when playback ends. */
+  speak(text: string): Promise<void>;
+  /** Interrupt any in-progress playback. NOTE(review): how a pending speak() settles is unspecified — confirm. */
+  cancel(): Promise<void>;
+}
+
+/** Configuration for a voice session. */
+export interface VoiceSessionConfig {
+  /** Sample rate in Hz for audio capture (default: 16000). NOTE(review): VoiceModeController only logs it. */
+  sampleRate?: number;
+  /** Locale/language code for STT/TTS (e.g. "en-US"). NOTE(review): VoiceModeController only logs it. */
+  locale?: string;
+}
+
+/** Lifecycle states for the voice mode controller. */
+export enum VoiceState {
+  Idle = 'idle', // Initial state, and the state after stop().
+  Listening = 'listening', // Ready to accept the next audio chunk.
+  Processing = 'processing', // A chunk is being transcribed.
+  Speaking = 'speaking', // A transcript is being played back.
+  Error = 'error', // The pipeline threw; set in the controller's catch block.
+}
diff --git a/packages/core/src/voice/voiceModeController.ts b/packages/core/src/voice/voiceModeController.ts
@@ -0,0 +1,174 @@
+/**
+ * @license
+ * Copyright 2026 Google LLC
+ * SPDX-License-Identifier: Apache-2.0
+ */
+
+import { debugLogger } from '../utils/debugLogger.js';
+import type {
+  AudioChunk,
+  AudioInputProvider,
+  SpeechToTextAdapter,
+  TextToSpeechAdapter,
+  VoiceSessionConfig,
+} from './types.js';
+import { VoiceState } from './types.js';
+
+/**
+ * Orchestrates the voice mode lifecycle.
+ *
+ * Wires together an AudioInputProvider, SpeechToTextAdapter, and
+ * TextToSpeechAdapter into a coherent listen→transcribe→respond→speak loop.
+ *
+ * This is a skeleton — real audio backends will be injected later.
+ * The controller is intentionally thin so it can be tested without hardware.
+ *
+ * Chunks are handled one at a time: a chunk that arrives while an earlier one
+ * is still being transcribed or spoken is dropped. After a pipeline failure
+ * the controller stays in VoiceState.Error, dropping further chunks, until
+ * stop() is called.
+ */
+export class VoiceModeController {
+  private state: VoiceState = VoiceState.Idle;
+  /**
+   * Bumped by every start() and stop(). Each chunk handler remembers the value
+   * it was created with, so work that outlives its session can tell that it is
+   * stale and must neither speak nor touch the state.
+   */
+  private sessionId = 0;
+  private readonly audioInput: AudioInputProvider;
+  private readonly stt: SpeechToTextAdapter;
+  private readonly tts: TextToSpeechAdapter;
+  private readonly config: VoiceSessionConfig;
+
+  /**
+   * @param audioInput Source of captured audio chunks.
+   * @param stt Transcribes each chunk to text.
+   * @param tts Speaks the transcript back.
+   * @param config Session settings; this class currently only logs them.
+   */
+  constructor(
+    audioInput: AudioInputProvider,
+    stt: SpeechToTextAdapter,
+    tts: TextToSpeechAdapter,
+    config: VoiceSessionConfig = {},
+  ) {
+    this.audioInput = audioInput;
+    this.stt = stt;
+    this.tts = tts;
+    this.config = config;
+  }
+
+  /** Current lifecycle state. */
+  getState(): VoiceState {
+    return this.state;
+  }
+
+  /**
+   * Start the voice session.
+   * Opens the audio input and begins the listen loop.
+   *
+   * A call made while the controller is not idle is logged and ignored.
+   *
+   * @throws Rethrows a rejection from the audio input's start(), after
+   *   returning the controller to VoiceState.Idle so start() can be retried.
+   */
+  async start(): Promise<void> {
+    if (this.state !== VoiceState.Idle) {
+      debugLogger.warn(
+        `VoiceModeController.start() called in state "${this.state}", ignoring.`,
+      );
+      return;
+    }
+
+    debugLogger.log(
+      `[voice] Starting voice mode (locale=${this.config.locale ?? 'default'}, ` +
+        `sampleRate=${String(this.config.sampleRate ?? 16000)})`,
+    );
+
+    const session = ++this.sessionId;
+    this.state = VoiceState.Listening;
+
+    try {
+      await this.audioInput.start((chunk) => {
+        // handleChunk reports pipeline errors itself, so the promise is
+        // intentionally left floating instead of being handed to the provider.
+        void this.handleChunk(chunk, session);
+      });
+    } catch (err) {
+      // Opening the input failed. Do not stay in Listening, which would turn
+      // every later start() into a no-op.
+      if (session === this.sessionId) {
+        this.state = VoiceState.Idle;
+      }
+      throw err;
+    }
+  }
+
+  /**
+   * Stop the voice session and release resources.
+   *
+   * The audio input is stopped and the state reset to VoiceState.Idle even
+   * when cancelling playback fails; the failure is still propagated.
+   */
+  async stop(): Promise<void> {
+    debugLogger.log('[voice] Stopping voice mode.');
+    // Invalidate in-flight chunk handlers before awaiting anything.
+    this.sessionId++;
+    try {
+      await this.tts.cancel();
+    } finally {
+      try {
+        await this.audioInput.stop();
+      } finally {
+        this.state = VoiceState.Idle;
+      }
+    }
+  }
+
+  /**
+   * Run one chunk through transcribe → speak.
+   *
+   * @param chunk Audio captured by the input provider.
+   * @param session The sessionId that was current when capture started.
+   */
+  private async handleChunk(
+    chunk: AudioChunk,
+    session: number,
+  ): Promise<void> {
+    if (session !== this.sessionId || this.state !== VoiceState.Listening) {
+      return;
+    }
+
+    this.state = VoiceState.Processing;
+    try {
+      const transcript = await this.stt.transcribe(chunk);
+
+      // stop() may have run while the transcription was pending.
+      if (session !== this.sessionId) return;
+      // Nothing was said; the finally block returns to Listening.
+      if (transcript.trim().length === 0) return;
+
+      debugLogger.log(`[voice] Transcript: "${transcript}"`);
+
+      // In the future this is where the transcript feeds into the Gemini
+      // conversation loop (GeminiClient.sendMessageStream). For now, we
+      // echo it back through TTS as a proof-of-lifecycle.
+      this.state = VoiceState.Speaking;
+      await this.tts.speak(transcript);
+    } catch (err) {
+      debugLogger.error('[voice] Error in voice pipeline:', err);
+      if (session === this.sessionId) {
+        this.state = VoiceState.Error;
+      }
+    } finally {
+      if (
+        session === this.sessionId &&
+        (this.state === VoiceState.Speaking ||
+          this.state === VoiceState.Processing)
+      ) {
+        this.state = VoiceState.Listening;
+      }
+    }
+  }
+}