Skip to content

Commit a6108a6

Browse files
Adib234ruomengz
authored andcommitted
fix(plan): Fix AskUser evals (#22074)
1 parent d343c2a commit a6108a6

3 files changed

Lines changed: 101 additions & 34 deletions

File tree

evals/ask_user.eval.ts

Lines changed: 72 additions & 33 deletions
Original file line numberDiff line numberDiff line change
@@ -5,31 +5,62 @@
55
*/
66

77
import { describe, expect } from 'vitest';
8-
import { evalTest } from './test-helper.js';
8+
import { appEvalTest, AppEvalCase } from './app-test-helper.js';
9+
import { EvalPolicy } from './test-helper.js';
10+
11+
function askUserEvalTest(policy: EvalPolicy, evalCase: AppEvalCase) {
12+
return appEvalTest(policy, {
13+
...evalCase,
14+
configOverrides: {
15+
...evalCase.configOverrides,
16+
general: {
17+
...evalCase.configOverrides?.general,
18+
approvalMode: 'default',
19+
enableAutoUpdate: false,
20+
enableAutoUpdateNotification: false,
21+
},
22+
},
23+
files: {
24+
...evalCase.files,
25+
},
26+
});
27+
}
928

1029
describe('ask_user', () => {
11-
evalTest('USUALLY_PASSES', {
30+
askUserEvalTest('USUALLY_PASSES', {
1231
name: 'Agent uses AskUser tool to present multiple choice options',
1332
prompt: `Use the ask_user tool to ask me what my favorite color is. Provide 3 options: red, green, or blue.`,
33+
setup: async (rig) => {
34+
rig.setBreakpoint(['ask_user']);
35+
},
1436
assert: async (rig) => {
15-
const wasToolCalled = await rig.waitForToolCall('ask_user');
16-
expect(wasToolCalled, 'Expected ask_user tool to be called').toBe(true);
37+
const confirmation = await rig.waitForPendingConfirmation('ask_user');
38+
expect(
39+
confirmation,
40+
'Expected a pending confirmation for ask_user tool',
41+
).toBeDefined();
1742
},
1843
});
1944

20-
evalTest('USUALLY_PASSES', {
45+
askUserEvalTest('USUALLY_PASSES', {
2146
name: 'Agent uses AskUser tool to clarify ambiguous requirements',
2247
files: {
2348
'package.json': JSON.stringify({ name: 'my-app', version: '1.0.0' }),
2449
},
2550
prompt: `I want to build a new feature in this app. Ask me questions to clarify the requirements before proceeding.`,
51+
setup: async (rig) => {
52+
rig.setBreakpoint(['ask_user']);
53+
},
2654
assert: async (rig) => {
27-
const wasToolCalled = await rig.waitForToolCall('ask_user');
28-
expect(wasToolCalled, 'Expected ask_user tool to be called').toBe(true);
55+
const confirmation = await rig.waitForPendingConfirmation('ask_user');
56+
expect(
57+
confirmation,
58+
'Expected a pending confirmation for ask_user tool',
59+
).toBeDefined();
2960
},
3061
});
3162

32-
evalTest('USUALLY_PASSES', {
63+
askUserEvalTest('USUALLY_PASSES', {
3364
name: 'Agent uses AskUser tool before performing significant ambiguous rework',
3465
files: {
3566
'packages/core/src/index.ts': '// index\nexport const version = "1.0.0";',
@@ -39,54 +70,62 @@ describe('ask_user', () => {
3970
}),
4071
'README.md': '# Gemini CLI',
4172
},
42-
prompt: `Refactor the entire core package to be better.`,
73+
prompt: `I want to completely rewrite the core package to support the upcoming V2 architecture, but I haven't decided what that looks like yet. We need to figure out the requirements first. Can you ask me some questions to help nail down the design?`,
74+
setup: async (rig) => {
75+
rig.setBreakpoint(['enter_plan_mode', 'ask_user']);
76+
},
4377
assert: async (rig) => {
44-
const wasPlanModeCalled = await rig.waitForToolCall('enter_plan_mode');
45-
expect(wasPlanModeCalled, 'Expected enter_plan_mode to be called').toBe(
46-
true,
47-
);
78+
// It might call enter_plan_mode first.
79+
let confirmation = await rig.waitForPendingConfirmation([
80+
'enter_plan_mode',
81+
'ask_user',
82+
]);
83+
expect(confirmation, 'Expected a tool call confirmation').toBeDefined();
84+
85+
if (confirmation?.name === 'enter_plan_mode') {
86+
rig.acceptConfirmation('enter_plan_mode');
87+
confirmation = await rig.waitForPendingConfirmation('ask_user');
88+
}
4889

49-
const wasAskUserCalled = await rig.waitForToolCall('ask_user');
5090
expect(
51-
wasAskUserCalled,
52-
'Expected ask_user tool to be called to clarify the significant rework',
53-
).toBe(true);
91+
confirmation?.toolName,
92+
'Expected ask_user to be called to clarify the significant rework',
93+
).toBe('ask_user');
5494
},
5595
});
5696

5797
// --- Regression Tests for Recent Fixes ---
5898

59-
// Regression test for issue #20177: Ensure the agent does not use `ask_user` to
99+
// Regression test for issue #20177: Ensure the agent does not use \`ask_user\` to
60100
// confirm shell commands. Fixed via prompt refinements and tool definition
61101
// updates to clarify that shell command confirmation is handled by the UI.
62102
// See fix: https://github.com/google-gemini/gemini-cli/pull/20504
63-
evalTest('USUALLY_PASSES', {
103+
askUserEvalTest('USUALLY_PASSES', {
64104
name: 'Agent does NOT use AskUser to confirm shell commands',
65105
files: {
66106
'package.json': JSON.stringify({
67107
scripts: { build: 'echo building' },
68108
}),
69109
},
70110
prompt: `Run 'npm run build' in the current directory.`,
111+
setup: async (rig) => {
112+
rig.setBreakpoint(['run_shell_command', 'ask_user']);
113+
},
71114
assert: async (rig) => {
72-
await rig.waitForTelemetryReady();
73-
74-
const toolLogs = rig.readToolLogs();
75-
const wasShellCalled = toolLogs.some(
76-
(log) => log.toolRequest.name === 'run_shell_command',
77-
);
78-
const wasAskUserCalled = toolLogs.some(
79-
(log) => log.toolRequest.name === 'ask_user',
80-
);
115+
const confirmation = await rig.waitForPendingConfirmation([
116+
'run_shell_command',
117+
'ask_user',
118+
]);
81119

82120
expect(
83-
wasShellCalled,
84-
'Expected run_shell_command tool to be called',
85-
).toBe(true);
121+
confirmation,
122+
'Expected a pending confirmation for a tool',
123+
).toBeDefined();
124+
86125
expect(
87-
wasAskUserCalled,
126+
confirmation?.toolName,
88127
'ask_user should not be called to confirm shell commands',
89-
).toBe(false);
128+
).toBe('run_shell_command');
90129
},
91130
});
92131
});

packages/cli/src/test-utils/AppRig.tsx

Lines changed: 7 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -487,7 +487,7 @@ export class AppRig {
487487
}
488488

489489
async waitForPendingConfirmation(
490-
toolNameOrDisplayName?: string | RegExp,
490+
toolNameOrDisplayName?: string | RegExp | string[],
491491
timeout = 30000,
492492
): Promise<PendingConfirmation> {
493493
const matches = (p: PendingConfirmation) => {
@@ -498,6 +498,12 @@ export class AppRig {
498498
p.toolDisplayName === toolNameOrDisplayName
499499
);
500500
}
501+
if (Array.isArray(toolNameOrDisplayName)) {
502+
return (
503+
toolNameOrDisplayName.includes(p.toolName) ||
504+
toolNameOrDisplayName.includes(p.toolDisplayName || '')
505+
);
506+
}
501507
return (
502508
toolNameOrDisplayName.test(p.toolName) ||
503509
toolNameOrDisplayName.test(p.toolDisplayName || '')

packages/test-utils/src/test-rig.ts

Lines changed: 22 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -353,6 +353,7 @@ export class TestRig {
353353
testName: string,
354354
options: {
355355
settings?: Record<string, unknown>;
356+
state?: Record<string, unknown>;
356357
fakeResponsesPath?: string;
357358
} = {},
358359
) {
@@ -382,6 +383,9 @@ export class TestRig {
382383

383384
// Create a settings file to point the CLI to the local collector
384385
this._createSettingsFile(options.settings);
386+
387+
// Create persistent state file
388+
this._createStateFile(options.state);
385389
}
386390

387391
private _cleanDir(dir: string) {
@@ -473,6 +477,24 @@ export class TestRig {
473477
);
474478
}
475479

480+
private _createStateFile(overrideState?: Record<string, unknown>) {
481+
if (!this.homeDir) throw new Error('TestRig homeDir is not initialized');
482+
const userGeminiDir = join(this.homeDir, GEMINI_DIR);
483+
mkdirSync(userGeminiDir, { recursive: true });
484+
485+
const state = deepMerge(
486+
{
487+
terminalSetupPromptShown: true, // Default to true in tests to avoid blocking prompts
488+
},
489+
overrideState ?? {},
490+
);
491+
492+
writeFileSync(
493+
join(userGeminiDir, 'state.json'),
494+
JSON.stringify(state, null, 2),
495+
);
496+
}
497+
476498
createFile(fileName: string, content: string) {
477499
const filePath = join(this.testDir!, fileName);
478500
writeFileSync(filePath, content);

0 commit comments

Comments
 (0)