/**
 * @license
 * Copyright 2026 Google LLC
 * SPDX-License-Identifier: Apache-2.0
 */

import { describe, expect } from 'vitest';
import { appEvalTest } from './app-test-helper.js';

/**
 * Name used both as the breakpoint target and as the tool name that
 * confirmations are compared against.
 */
const GENERALIST = 'generalist';

/** Wait budget handed to the rig's wait helpers (presumably milliseconds). */
const WAIT_TIMEOUT = 60000;

/**
 * Builds the config overrides used by every eval in this file: the generalist
 * agent enabled, agents switched on, and `run_shell_command` excluded.
 *
 * A new object is returned on each call so evals never share a mutable
 * config instance.
 *
 * @returns {object} Fresh `configOverrides` value for `appEvalTest`.
 */
function generalistConfig() {
  return {
    agents: {
      overrides: {
        generalist: { enabled: true },
      },
    },
    model: 'gemini-3-flash-preview',
    experimental: {
      enableAgents: true,
    },
    excludeTools: ['run_shell_command'],
  };
}

/**
 * `setup` hook that sets a breakpoint on the generalist tool only.
 *
 * @param {object} rig - Test rig supplied by `appEvalTest`.
 */
async function breakOnGeneralist(rig) {
  rig.setBreakpoint([GENERALIST]);
}

/**
 * `setup` hook that sets a breakpoint on every tool (`'*'`), so each tool
 * call can be inspected.
 *
 * @param {object} rig - Test rig supplied by `appEvalTest`.
 */
async function breakOnEveryTool(rig) {
  rig.setBreakpoint(['*']);
}

/**
 * Creates an `assert` hook for the positive evals: it waits for a pending
 * generalist confirmation, requires it to be truthy, resolves it, and then
 * waits for the rig to go idle.
 *
 * @param {string} message - Failure message shown when no confirmation arrives.
 * @returns {(rig: object) => Promise<void>} The `assert` hook.
 */
function expectDelegation(message) {
  return async (rig) => {
    const pending = await rig.waitForPendingConfirmation(
      GENERALIST,
      WAIT_TIMEOUT,
    );
    expect(pending, message).toBeTruthy();
    await rig.resolveTool(pending);
    await rig.waitForIdle(WAIT_TIMEOUT);
  };
}

/**
 * Creates an `assert` hook for the negative evals: every confirmation seen
 * while draining breakpoints must name a tool other than the generalist, and
 * the static output must match `expectedOutput`.
 *
 * @param {RegExp} expectedOutput - Pattern the static output has to match.
 * @returns {(rig: object) => Promise<void>} The `assert` hook.
 */
function expectNoDelegation(expectedOutput) {
  return async (rig) => {
    await rig.drainBreakpointsUntilIdle((confirmation) => {
      expect(
        confirmation.toolName,
        `Agent should NOT have delegated to generalist.`,
      ).not.toBe(GENERALIST);
    });

    expect(rig.getStaticOutput()).toMatch(expectedOutput);
  };
}

describe('generalist_delegation', () => {
  // --- Positive Evals (Should Delegate) ---

  appEvalTest('ALWAYS_PASSES', {
    name: 'should delegate batch error fixing to generalist agent',
    configOverrides: generalistConfig(),
    // file1.ts .. file10.ts, each holding the same statement without a
    // trailing semicolon.
    files: Object.fromEntries(
      Array.from({ length: 10 }, (_, index) => [
        `file${index + 1}.ts`,
        'console.log("no semi")',
      ]),
    ),
    prompt:
      'I have 10 files (file1.ts to file10.ts) that are missing semicolons. Can you fix them?',
    setup: breakOnGeneralist,
    assert: expectDelegation('Expected a tool call for generalist agent'),
  });

  appEvalTest('ALWAYS_PASSES', {
    name: 'should autonomously delegate complex batch task to generalist agent',
    configOverrides: generalistConfig(),
    files: {
      'src/a.ts': 'export const a = 1;',
      'src/b.ts': 'export const b = 2;',
      'src/c.ts': 'export const c = 3;',
      'src/d.ts': 'export const d = 4;',
      'src/e.ts': 'export const e = 5;',
    },
    prompt:
      'Please update all files in the src directory. For each file, add a comment at the top that says "Processed by Gemini".',
    setup: breakOnGeneralist,
    assert: expectDelegation(
      'Expected autonomously delegate to generalist for batch task',
    ),
  });

  // --- Negative Evals (Should NOT Delegate - Assertive Handling) ---

  appEvalTest('ALWAYS_PASSES', {
    name: 'should NOT delegate simple read and fix to generalist agent',
    configOverrides: generalistConfig(),
    files: {
      'README.md': 'This is a proyect.',
    },
    prompt:
      'There is a typo in README.md ("proyect"). Please fix it to "project".',
    // Break on everything to see what it calls.
    setup: breakOnEveryTool,
    // NOTE(review): the prompt itself contains "project"; if the static output
    // echoes the prompt this pattern matches even when nothing was fixed —
    // confirm, and consider asserting on the README contents instead.
    assert: expectNoDelegation(/project/i),
  });

  appEvalTest('ALWAYS_PASSES', {
    name: 'should NOT delegate simple direct question to generalist agent',
    configOverrides: generalistConfig(),
    files: {
      'src/VERSION': '1.2.3',
    },
    prompt: 'Can you tell me the version number in the src folder?',
    setup: breakOnEveryTool,
    assert: expectNoDelegation(/1\.2\.3/),
  });
});