fix(plan): Fix AskUser evals (#22074)

Adib234 · ruomengz · commit a6108a624975 · 2026-03-13T10:49:01.000-04:00
diff --git a/evals/ask_user.eval.ts b/evals/ask_user.eval.ts
@@ -5,31 +5,62 @@
  */
 
 import { describe, expect } from 'vitest';
-import { evalTest } from './test-helper.js';
+import { appEvalTest, AppEvalCase } from './app-test-helper.js';
+import { EvalPolicy } from './test-helper.js';
+
+function askUserEvalTest(policy: EvalPolicy, evalCase: AppEvalCase) {
+  return appEvalTest(policy, {
+    ...evalCase,
+    configOverrides: {
+      ...evalCase.configOverrides,
+      general: {
+        ...evalCase.configOverrides?.general,
+        approvalMode: 'default',
+        enableAutoUpdate: false,
+        enableAutoUpdateNotification: false,
+      },
+    },
+    files: {
+      ...evalCase.files,
+    },
+  });
+}
 
 describe('ask_user', () => {
-  evalTest('USUALLY_PASSES', {
+  askUserEvalTest('USUALLY_PASSES', {
     name: 'Agent uses AskUser tool to present multiple choice options',
     prompt: `Use the ask_user tool to ask me what my favorite color is. Provide 3 options: red, green, or blue.`,
+    setup: async (rig) => {
+      rig.setBreakpoint(['ask_user']);
+    },
     assert: async (rig) => {
-      const wasToolCalled = await rig.waitForToolCall('ask_user');
-      expect(wasToolCalled, 'Expected ask_user tool to be called').toBe(true);
+      const confirmation = await rig.waitForPendingConfirmation('ask_user');
+      expect(
+        confirmation,
+        'Expected a pending confirmation for ask_user tool',
+      ).toBeDefined();
     },
   });
 
-  evalTest('USUALLY_PASSES', {
+  askUserEvalTest('USUALLY_PASSES', {
     name: 'Agent uses AskUser tool to clarify ambiguous requirements',
     files: {
       'package.json': JSON.stringify({ name: 'my-app', version: '1.0.0' }),
     },
     prompt: `I want to build a new feature in this app. Ask me questions to clarify the requirements before proceeding.`,
+    setup: async (rig) => {
+      rig.setBreakpoint(['ask_user']);
+    },
     assert: async (rig) => {
-      const wasToolCalled = await rig.waitForToolCall('ask_user');
-      expect(wasToolCalled, 'Expected ask_user tool to be called').toBe(true);
+      const confirmation = await rig.waitForPendingConfirmation('ask_user');
+      expect(
+        confirmation,
+        'Expected a pending confirmation for ask_user tool',
+      ).toBeDefined();
     },
   });
 
-  evalTest('USUALLY_PASSES', {
+  askUserEvalTest('USUALLY_PASSES', {
     name: 'Agent uses AskUser tool before performing significant ambiguous rework',
     files: {
       'packages/core/src/index.ts': '// index\nexport const version = "1.0.0";',
@@ -39,54 +70,62 @@ describe('ask_user', () => {
       }),
       'README.md': '# Gemini CLI',
     },
-    prompt: `Refactor the entire core package to be better.`,
+    prompt: `I want to completely rewrite the core package to support the upcoming V2 architecture, but I haven't decided what that looks like yet. We need to figure out the requirements first. Can you ask me some questions to help nail down the design?`,
+    setup: async (rig) => {
+      rig.setBreakpoint(['enter_plan_mode', 'ask_user']);
+    },
     assert: async (rig) => {
-      const wasPlanModeCalled = await rig.waitForToolCall('enter_plan_mode');
-      expect(wasPlanModeCalled, 'Expected enter_plan_mode to be called').toBe(
-        true,
-      );
+      // It might call enter_plan_mode first.
+      let confirmation = await rig.waitForPendingConfirmation([
+        'enter_plan_mode',
+        'ask_user',
+      ]);
+      expect(confirmation, 'Expected a tool call confirmation').toBeDefined();
+
+      if (confirmation?.name === 'enter_plan_mode') {
+        rig.acceptConfirmation('enter_plan_mode');
+        confirmation = await rig.waitForPendingConfirmation('ask_user');
+      }
 
-      const wasAskUserCalled = await rig.waitForToolCall('ask_user');
       expect(
-        wasAskUserCalled,
-        'Expected ask_user tool to be called to clarify the significant rework',
-      ).toBe(true);
+        confirmation?.toolName,
+        'Expected ask_user to be called to clarify the significant rework',
+      ).toBe('ask_user');
     },
   });
 
   // --- Regression Tests for Recent Fixes ---
 
-  // Regression test for issue #20177: Ensure the agent does not use `ask_user` to
+  // Regression test for issue #20177: Ensure the agent does not use \`ask_user\` to
   // confirm shell commands. Fixed via prompt refinements and tool definition
   // updates to clarify that shell command confirmation is handled by the UI.
   // See fix: https://github.com/google-gemini/gemini-cli/pull/20504
-  evalTest('USUALLY_PASSES', {
+  askUserEvalTest('USUALLY_PASSES', {
     name: 'Agent does NOT use AskUser to confirm shell commands',
     files: {
       'package.json': JSON.stringify({
         scripts: { build: 'echo building' },
       }),
     },
     prompt: `Run 'npm run build' in the current directory.`,
+    setup: async (rig) => {
+      rig.setBreakpoint(['run_shell_command', 'ask_user']);
+    },
     assert: async (rig) => {
-      await rig.waitForTelemetryReady();
-
-      const toolLogs = rig.readToolLogs();
-      const wasShellCalled = toolLogs.some(
-        (log) => log.toolRequest.name === 'run_shell_command',
-      );
-      const wasAskUserCalled = toolLogs.some(
-        (log) => log.toolRequest.name === 'ask_user',
-      );
+      const confirmation = await rig.waitForPendingConfirmation([
+        'run_shell_command',
+        'ask_user',
+      ]);
 
       expect(
-        wasShellCalled,
-        'Expected run_shell_command tool to be called',
-      ).toBe(true);
+        confirmation,
+        'Expected a pending confirmation for a tool',
+      ).toBeDefined();
+
       expect(
-        wasAskUserCalled,
+        confirmation?.toolName,
         'ask_user should not be called to confirm shell commands',
-      ).toBe(false);
+      ).toBe('run_shell_command');
     },
   });
 });
diff --git a/packages/cli/src/test-utils/AppRig.tsx b/packages/cli/src/test-utils/AppRig.tsx
@@ -487,7 +487,7 @@ export class AppRig {
   }
 
   async waitForPendingConfirmation(
-    toolNameOrDisplayName?: string | RegExp,
+    toolNameOrDisplayName?: string | RegExp | string[],
     timeout = 30000,
   ): Promise<PendingConfirmation> {
     const matches = (p: PendingConfirmation) => {
@@ -498,6 +498,12 @@ export class AppRig {
           p.toolDisplayName === toolNameOrDisplayName
         );
       }
+      if (Array.isArray(toolNameOrDisplayName)) {
+        return (
+          toolNameOrDisplayName.includes(p.toolName) ||
+          toolNameOrDisplayName.includes(p.toolDisplayName || '')
+        );
+      }
       return (
         toolNameOrDisplayName.test(p.toolName) ||
         toolNameOrDisplayName.test(p.toolDisplayName || '')
diff --git a/packages/test-utils/src/test-rig.ts b/packages/test-utils/src/test-rig.ts
@@ -353,6 +353,7 @@ export class TestRig {
     testName: string,
     options: {
       settings?: Record<string, unknown>;
+      state?: Record<string, unknown>;
       fakeResponsesPath?: string;
     } = {},
   ) {
@@ -382,6 +383,9 @@ export class TestRig {
 
     // Create a settings file to point the CLI to the local collector
     this._createSettingsFile(options.settings);
+
+    // Create persistent state file
+    this._createStateFile(options.state);
   }
 
   private _cleanDir(dir: string) {
@@ -473,6 +477,24 @@ export class TestRig {
     );
   }
 
+  private _createStateFile(overrideState?: Record<string, unknown>) {
+    if (!this.homeDir) throw new Error('TestRig homeDir is not initialized');
+    const userGeminiDir = join(this.homeDir, GEMINI_DIR);
+    mkdirSync(userGeminiDir, { recursive: true });
+
+    const state = deepMerge(
+      {
+        terminalSetupPromptShown: true, // Default to true in tests to avoid blocking prompts
+      },
+      overrideState ?? {},
+    );
+
+    writeFileSync(
+      join(userGeminiDir, 'state.json'),
+      JSON.stringify(state, null, 2),
+    );
+  }
+
   createFile(fileName: string, content: string) {
     const filePath = join(this.testDir!, fileName);
     writeFileSync(filePath, content);