|
| 1 | +/** |
| 2 | + * @license |
| 3 | + * Copyright 2026 Google LLC |
| 4 | + * SPDX-License-Identifier: Apache-2.0 |
| 5 | + */ |
| 6 | + |
| 7 | +import { describe, expect } from 'vitest'; |
| 8 | +import { evalTest } from './test-helper.js'; |
| 9 | + |
/**
 * Extracts the `command` string from the JSON-encoded arguments of a
 * `run_shell_command` tool request.
 *
 * Returns `undefined` instead of throwing when the arguments are not valid
 * JSON or carry no string `command`, so that one malformed log entry cannot
 * abort the eval with an unrelated error before the real assertion runs.
 *
 * @param rawArgs - JSON text of the tool request arguments.
 * @returns The shell command, or `undefined` when none can be recovered.
 */
function extractShellCommand(rawArgs: string): string | undefined {
  try {
    const parsed: unknown = JSON.parse(rawArgs);
    if (typeof parsed !== 'object' || parsed === null) {
      return undefined;
    }
    const command = (parsed as { command?: unknown }).command;
    return typeof command === 'string' ? command : undefined;
  } catch {
    // Malformed arguments are treated as "no command" on purpose; the final
    // assertion still fails with a meaningful message if nothing else matches.
    return undefined;
  }
}

/**
 * Tells whether a shell command looks like a build or type-check invocation.
 *
 * `tsc` is matched as a whole word: a plain substring test would also accept
 * commands that merely mention `tsconfig.json` (for example
 * `cat tsconfig.json`), letting the eval pass without any type-check.
 *
 * @param command - Shell command as issued by the agent (any letter case).
 * @returns `true` when the command runs a build or a type-check.
 */
function isBuildOrTypeCheckCommand(command: string): boolean {
  const cmd = command.toLowerCase();
  return (
    cmd.includes('npm run build') ||
    /\btsc\b/.test(cmd) ||
    cmd.includes('typecheck') ||
    cmd.includes('npm run verify')
  );
}

describe('validation_fidelity', () => {
  evalTest('ALWAYS_PASSES', {
    name: 'should perform exhaustive validation autonomously when guided by system instructions',
    // Minimal TypeScript project in which renaming a field of `LogEntry`
    // breaks both `src/logger.ts` and its test unless every usage is updated.
    files: {
      'src/types.ts': `
export interface LogEntry {
  level: 'info' | 'warn' | 'error';
  message: string;
}
`,
      'src/logger.ts': `
import { LogEntry } from './types.js';

export function formatLog(entry: LogEntry): string {
  return \`[\${entry.level.toUpperCase()}] \${entry.message}\`;
}
`,
      'src/logger.test.ts': `
import { expect, test } from 'vitest';
import { formatLog } from './logger.js';
import { LogEntry } from './types.js';

test('formats log correctly', () => {
  const entry: LogEntry = { level: 'info', message: 'test message' };
  expect(formatLog(entry)).toBe('[INFO] test message');
});
`,
      'package.json': JSON.stringify({
        name: 'test-project',
        type: 'module',
        scripts: {
          test: 'vitest run',
          build: 'tsc --noEmit',
        },
      }),
      'tsconfig.json': JSON.stringify({
        compilerOptions: {
          target: 'ESNext',
          module: 'ESNext',
          moduleResolution: 'node',
          strict: true,
          esModuleInterop: true,
          skipLibCheck: true,
          forceConsistentCasingInFileNames: true,
        },
      }),
    },
    prompt:
      "Refactor the 'LogEntry' interface in 'src/types.ts' to rename the 'message' field to 'payload'.",
    timeout: 600000,
    assert: async (rig) => {
      // The goal of this eval is to see if the agent realizes it needs to update usages
      // AND run 'npm run build' or 'tsc' autonomously to ensure project-wide structural integrity.
      // Only the second half is asserted here, by inspecting the shell commands the agent issued.

      const toolLogs = rig.readToolLogs();
      const shellCalls = toolLogs.filter(
        (log) => log.toolRequest.name === 'run_shell_command',
      );

      const hasBuildOrTsc = shellCalls.some((log) => {
        const command = extractShellCommand(log.toolRequest.args);
        return command !== undefined && isBuildOrTypeCheckCommand(command);
      });

      expect(
        hasBuildOrTsc,
        'Expected the agent to autonomously run a build or type-check command to verify the refactoring',
      ).toBe(true);
    },
  });
});
0 commit comments