Skip to content

Commit d45a45d

Browse files
chore: strengthen validation guidance in system prompt (#18544)
1 parent 69f562b commit d45a45d

7 files changed

Lines changed: 399 additions & 270 deletions

File tree

evals/test-helper.ts

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -49,7 +49,7 @@ export function evalTest(policy: EvalPolicy, evalCase: EvalCase) {
4949
// bootstrap test projects.
5050
const rootNodeModules = path.join(process.cwd(), 'node_modules');
5151
const testNodeModules = path.join(rig.testDir || '', 'node_modules');
52-
if (fs.existsSync(rootNodeModules)) {
52+
if (fs.existsSync(rootNodeModules) && !fs.existsSync(testNodeModules)) {
5353
fs.symlinkSync(rootNodeModules, testNodeModules, 'dir');
5454
}
5555

@@ -162,7 +162,7 @@ export function evalTest(policy: EvalPolicy, evalCase: EvalCase) {
162162
if (policy === 'USUALLY_PASSES' && !process.env['RUN_EVALS']) {
163163
it.skip(evalCase.name, fn);
164164
} else {
165-
it(evalCase.name, fn);
165+
it(evalCase.name, fn, evalCase.timeout);
166166
}
167167
}
168168

evals/validation_fidelity.eval.ts

Lines changed: 85 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,85 @@
1+
/**
2+
* @license
3+
* Copyright 2026 Google LLC
4+
* SPDX-License-Identifier: Apache-2.0
5+
*/
6+
7+
import { describe, expect } from 'vitest';
8+
import { evalTest } from './test-helper.js';
9+
10+
/**
 * Eval: after a cross-file rename, the agent should run a build or
 * type-check command on its own.
 *
 * The fixture project has a `LogEntry` interface whose `message` field is
 * used by `src/logger.ts` and `src/logger.test.ts`. The prompt only asks for
 * the rename in `src/types.ts`; the assertion then inspects the recorded
 * shell tool calls for a build / type-check invocation.
 *
 * NOTE(review): only the presence of a validation command is asserted here;
 * whether the usages were actually updated is not checked.
 */
describe('validation_fidelity', () => {
  evalTest('ALWAYS_PASSES', {
    name: 'should perform exhaustive validation autonomously when guided by system instructions',
    files: {
      'src/types.ts': `
export interface LogEntry {
level: 'info' | 'warn' | 'error';
message: string;
}
`,
      'src/logger.ts': `
import { LogEntry } from './types.js';

export function formatLog(entry: LogEntry): string {
return \`[\${entry.level.toUpperCase()}] \${entry.message}\`;
}
`,
      'src/logger.test.ts': `
import { expect, test } from 'vitest';
import { formatLog } from './logger.js';
import { LogEntry } from './types.js';

test('formats log correctly', () => {
const entry: LogEntry = { level: 'info', message: 'test message' };
expect(formatLog(entry)).toBe('[INFO] test message');
});
`,
      'package.json': JSON.stringify({
        name: 'test-project',
        type: 'module',
        scripts: {
          test: 'vitest run',
          build: 'tsc --noEmit',
        },
      }),
      'tsconfig.json': JSON.stringify({
        compilerOptions: {
          target: 'ESNext',
          module: 'ESNext',
          moduleResolution: 'node',
          strict: true,
          esModuleInterop: true,
          skipLibCheck: true,
          forceConsistentCasingInFileNames: true,
        },
      }),
    },
    prompt:
      "Refactor the 'LogEntry' interface in 'src/types.ts' to rename the 'message' field to 'payload'.",
    timeout: 600000,
    assert: async (rig) => {
      // The goal of this eval is to see if the agent realizes it needs to
      // update usages AND run 'npm run build' or 'tsc' autonomously to ensure
      // project-wide structural integrity.

      // `tsc` must appear as a standalone word. A plain substring check would
      // also accept commands that merely mention `tsconfig.json`
      // (e.g. `cat tsconfig.json`), which is not a type-check.
      const tscInvocation = /\btsc\b/;

      // Tool args arrive as a JSON string; narrow the parsed value instead of
      // assuming its shape so a malformed entry is skipped rather than
      // crashing the assertion with a TypeError.
      const parseArgs = (raw: string): Record<string, unknown> => {
        const parsed: unknown = JSON.parse(raw);
        return typeof parsed === 'object' && parsed !== null
          ? (parsed as Record<string, unknown>)
          : {};
      };

      const hasBuildOrTsc = rig
        .readToolLogs()
        .filter((log) => log.toolRequest.name === 'run_shell_command')
        .some((log) => {
          const { command } = parseArgs(log.toolRequest.args);
          if (typeof command !== 'string') {
            return false;
          }
          const cmd = command.toLowerCase();
          return (
            cmd.includes('npm run build') ||
            tscInvocation.test(cmd) ||
            cmd.includes('typecheck') ||
            cmd.includes('npm run verify')
          );
        });

      expect(
        hasBuildOrTsc,
        'Expected the agent to autonomously run a build or type-check command to verify the refactoring',
      ).toBe(true);
    },
  });
});
Lines changed: 79 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,79 @@
1+
/**
2+
* @license
3+
* Copyright 2026 Google LLC
4+
* SPDX-License-Identifier: Apache-2.0
5+
*/
6+
7+
import { describe, expect } from 'vitest';
8+
import { evalTest } from './test-helper.js';
9+
10+
/**
 * Eval: the agent should complete a small rename and still attempt
 * validation in a project that already fails to type-check.
 *
 * `src/utils.ts` references an undefined identifier `c`, so `tsc --noEmit`
 * reports an error that is unrelated to the requested change in
 * `src/math.ts`.
 *
 * NOTE(review): the assertions check that `src/math.ts` was edited through
 * the `replace` tool and that a build / type-check command was run. They do
 * not check how the agent reacted to the pre-existing error in
 * `src/utils.ts`.
 */
describe('validation_fidelity_pre_existing_errors', () => {
  evalTest('ALWAYS_PASSES', {
    name: 'should handle pre-existing project errors gracefully during validation',
    files: {
      'src/math.ts': `
export function add(a: number, b: number): number {
return a + b;
}
`,
      'src/index.ts': `
import { add } from './math.js';
console.log(add(1, 2));
`,
      'src/utils.ts': `
export function multiply(a: number, b: number): number {
return a * c; // 'c' is not defined - PRE-EXISTING ERROR
}
`,
      'package.json': JSON.stringify({
        name: 'test-project',
        type: 'module',
        scripts: {
          test: 'vitest run',
          build: 'tsc --noEmit',
        },
      }),
      'tsconfig.json': JSON.stringify({
        compilerOptions: {
          target: 'ESNext',
          module: 'ESNext',
          moduleResolution: 'node',
          strict: true,
          esModuleInterop: true,
          skipLibCheck: true,
          forceConsistentCasingInFileNames: true,
        },
      }),
    },
    prompt: "In src/math.ts, rename the 'add' function to 'sum'.",
    timeout: 600000,
    assert: async (rig) => {
      // Tool args arrive as a JSON string; narrow the parsed value instead of
      // assuming its shape so a malformed entry is skipped rather than
      // crashing the assertion with a TypeError.
      const parseArgs = (raw: string): Record<string, unknown> => {
        const parsed: unknown = JSON.parse(raw);
        return typeof parsed === 'object' && parsed !== null
          ? (parsed as Record<string, unknown>)
          : {};
      };

      const toolLogs = rig.readToolLogs();

      // Verify it did the work in math.ts
      const mathRefactor = toolLogs
        .filter((log) => log.toolRequest.name === 'replace')
        .some((log) => {
          const { file_path: filePath, new_string: newString } = parseArgs(
            log.toolRequest.args,
          );
          return (
            typeof filePath === 'string' &&
            typeof newString === 'string' &&
            filePath.endsWith('src/math.ts') &&
            newString.includes('sum')
          );
        });
      expect(mathRefactor, 'Agent should have refactored math.ts').toBe(true);

      // `build` / `tsc` must appear as standalone words. A plain substring
      // check would also accept commands that merely mention `tsconfig.json`
      // (e.g. `cat tsconfig.json`), which is not a validation run.
      const validationInvocation = /\b(?:build|tsc)\b/;

      const ranValidation = toolLogs
        .filter((log) => log.toolRequest.name === 'run_shell_command')
        .some((log) => {
          const { command } = parseArgs(log.toolRequest.args);
          return (
            typeof command === 'string' &&
            validationInvocation.test(command.toLowerCase())
          );
        });

      expect(ranValidation, 'Agent should have attempted validation').toBe(
        true,
      );
    },
  });
});

0 commit comments

Comments
 (0)