Skip to content

Commit 20004fb

Browse files
committed
feat(evals): add reliability harvester and 500/503 retry support
1 parent fc03891 commit 20004fb

5 files changed

Lines changed: 506 additions & 71 deletions

File tree

.github/workflows/chained_e2e.yml

Lines changed: 12 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -334,8 +334,20 @@ jobs:
334334
if: "${{ steps.check_evals.outputs.should_run == 'true' }}"
335335
env:
336336
GEMINI_API_KEY: '${{ secrets.GEMINI_API_KEY }}'
337+
GEMINI_MODEL: 'gemini-3-pro-preview'
338+
# Disable Vitest internal retries to avoid double-retrying;
339+
# custom retry logic is handled in evals/test-helper.ts
340+
VITEST_RETRY: 0
337341
run: 'npm run test:always_passing_evals'
338342

343+
- name: 'Upload Reliability Logs'
344+
if: "always() && steps.check_evals.outputs.should_run == 'true'"
345+
uses: 'actions/upload-artifact@ea165f8d65b6e75b540449e92b4886f43607fa02' # ratchet:actions/upload-artifact@v4
346+
with:
347+
name: 'eval-logs-${{ github.run_id }}-${{ github.run_attempt }}'
348+
path: 'evals/logs/api-reliability.jsonl'
349+
retention-days: 7
350+
339351
e2e:
340352
name: 'E2E'
341353
if: |

.github/workflows/evals-nightly.yml

Lines changed: 3 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -61,6 +61,9 @@ jobs:
6161
GEMINI_MODEL: '${{ matrix.model }}'
6262
RUN_EVALS: "${{ github.event.inputs.run_all != 'false' }}"
6363
TEST_NAME_PATTERN: '${{ github.event.inputs.test_name_pattern }}'
64+
# Disable Vitest internal retries to avoid double-retrying;
65+
# custom retry logic is handled in evals/test-helper.ts
66+
VITEST_RETRY: 0
6467
run: |
6568
CMD="npm run test:all_evals"
6669
PATTERN="${TEST_NAME_PATTERN}"

evals/test-helper.test.ts

Lines changed: 207 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -0,0 +1,207 @@
1+
/**
2+
* @license
3+
* Copyright 2026 Google LLC
4+
* SPDX-License-Identifier: Apache-2.0
5+
*/
6+
7+
import { describe, it, expect, vi, beforeEach, afterEach } from 'vitest';
8+
import fs from 'node:fs';
9+
import path from 'node:path';
10+
import { internalEvalTest } from './test-helper.js';
11+
import { TestRig } from '@google/gemini-cli-test-utils';
12+
13+
// Swap the real TestRig for a stub so each test can decide whether `run`
// resolves or rejects. Every construction yields a new object whose methods
// are independent vitest mocks; `readToolLogs` reports no tool calls.
vi.mock('@google/gemini-cli-test-utils', () => ({
  TestRig: vi.fn().mockImplementation(() => {
    const stubRig = {
      setup: vi.fn(),
      run: vi.fn(),
      cleanup: vi.fn(),
      readToolLogs: vi.fn().mockReturnValue([]),
      _lastRunStderr: '',
    };
    return stubRig;
  }),
}));
25+
26+
/**
 * Tests for the retry / skip / fail behaviour of `internalEvalTest`.
 *
 * Each test installs its own stub rig as the value returned by the mocked
 * `TestRig` constructor, drives `internalEvalTest`, and then inspects how many
 * times `run` was invoked and what was appended to the reliability log
 * (`evals/logs/api-reliability.jsonl`, one JSON object per line).
 */
describe('evalTest reliability logic', () => {
  const LOG_DIR = path.resolve(process.cwd(), 'evals/logs');
  const RELIABILITY_LOG = path.join(LOG_DIR, 'api-reliability.jsonl');
  const TEMP_TEST_DIR = path.resolve(process.cwd(), 'test-dir-tmp');

  /** Deletes the reliability log; `force` makes a missing file a no-op. */
  const removeReliabilityLog = (): void => {
    fs.rmSync(RELIABILITY_LOG, { force: true });
  };

  /**
   * Builds a brand-new stub rig and makes the mocked `TestRig` constructor
   * return it.
   *
   * A fresh object is created on every call on purpose: `vi.clearAllMocks()`
   * only wipes call history, so reusing whatever `new TestRig()` returns would
   * hand later tests the rig (and the `run` behaviour) configured by an
   * earlier test.
   *
   * @param testDir Optional working directory exposed as `rig.testDir`; the
   *   property is omitted entirely when not provided.
   * @returns The stub rig, so the test can program and inspect its mocks.
   */
  const installMockRig = (testDir?: string) => {
    const rig = {
      setup: vi.fn(),
      run: vi.fn(),
      cleanup: vi.fn(),
      readToolLogs: vi.fn().mockReturnValue([]),
      _lastRunStderr: '',
      ...(testDir === undefined ? {} : { testDir }),
    };
    // The stub only implements the members listed above, so it has to be
    // cast to satisfy the constructor's declared instance type.
    vi.mocked(TestRig).mockReturnValue(rig as unknown as TestRig);
    return rig;
  };

  /**
   * Reads the reliability log and parses every line as JSON.
   *
   * @returns One parsed object per log line, in file order.
   * @throws If the log file does not exist or a line is not valid JSON.
   */
  const readReliabilityEntries = (): Array<Record<string, unknown>> =>
    fs
      .readFileSync(RELIABILITY_LOG, 'utf-8')
      .trim()
      .split('\n')
      .map((line) => JSON.parse(line) as Record<string, unknown>);

  /**
   * Runs `internalEvalTest` with a single entry in `files` and asserts that it
   * rejects with the "Invalid file path" error for that entry. The temporary
   * rig directory is created beforehand and always removed afterwards.
   *
   * @param name Name passed to the eval test case.
   * @param filePath Key used in the `files` map; expected to be rejected.
   */
  const expectFilePathRejected = async (
    name: string,
    filePath: string,
  ): Promise<void> => {
    installMockRig(TEMP_TEST_DIR);
    fs.mkdirSync(TEMP_TEST_DIR, { recursive: true });

    try {
      await expect(
        internalEvalTest({
          name,
          prompt: 'do something',
          files: {
            [filePath]: 'hacked',
          },
          assert: async () => {},
        }),
      ).rejects.toThrow(`Invalid file path in test case: ${filePath}`);
    } finally {
      fs.rmSync(TEMP_TEST_DIR, { recursive: true, force: true });
    }
  };

  beforeEach(() => {
    vi.clearAllMocks();
    removeReliabilityLog();
  });

  afterEach(() => {
    removeReliabilityLog();
  });

  it('should retry 3 times on 500 INTERNAL error and then SKIP', async () => {
    const mockRig = installMockRig();

    // Simulate a permanent 500 error.
    mockRig.run.mockRejectedValue(new Error('status: INTERNAL - API Down'));

    // Must resolve (not reject) even though every attempt fails.
    await internalEvalTest({
      name: 'test-api-failure',
      prompt: 'do something',
      assert: async () => {},
    });

    // 1 initial attempt + 3 retries = 4 runs.
    expect(mockRig.run).toHaveBeenCalledTimes(4);

    // One log line per attempt: three RETRY entries followed by a SKIP.
    const entries = readReliabilityEntries();
    expect(entries).toHaveLength(4);
    expect(entries[0].status).toBe('RETRY');
    expect(entries[0].attempt).toBe(0);
    expect(entries[3].status).toBe('SKIP');
    expect(entries[3].attempt).toBe(3);
    expect(entries[3].testName).toBe('test-api-failure');
  });

  it('should fail immediately on non-500 errors (like assertion failures)', async () => {
    const mockRig = installMockRig();

    // The API call succeeds; the failure comes from the test's own assertion.
    mockRig.run.mockResolvedValue('Success');
    const assertError = new Error('Assertion failed: expected foo to be bar');

    await expect(
      internalEvalTest({
        name: 'test-logic-failure',
        prompt: 'do something',
        assert: async () => {
          throw assertError;
        },
      }),
    ).rejects.toThrow('Assertion failed');

    // No retries: exactly one attempt.
    expect(mockRig.run).toHaveBeenCalledTimes(1);

    // Not an API error, so no reliability log may be created.
    expect(fs.existsSync(RELIABILITY_LOG)).toBe(false);
  });

  it('should recover if a retry succeeds', async () => {
    const mockRig = installMockRig();

    // Fail once, then succeed.
    mockRig.run
      .mockRejectedValueOnce(new Error('status: INTERNAL'))
      .mockResolvedValueOnce('Success');

    await internalEvalTest({
      name: 'test-recovery',
      prompt: 'do something',
      assert: async () => {},
    });

    // Ran twice: initial (fail) + retry 1 (success).
    expect(mockRig.run).toHaveBeenCalledTimes(2);

    // Only the single failed attempt is logged.
    const entries = readReliabilityEntries();
    expect(entries).toHaveLength(1);
    expect(entries[0].status).toBe('RETRY');
  });

  it('should retry 3 times on 503 UNAVAILABLE error and then SKIP', async () => {
    const mockRig = installMockRig();

    // Simulate a permanent 503 error.
    mockRig.run.mockRejectedValue(
      new Error('status: UNAVAILABLE - Service Busy'),
    );

    await internalEvalTest({
      name: 'test-api-503',
      prompt: 'do something',
      assert: async () => {},
    });

    expect(mockRig.run).toHaveBeenCalledTimes(4);

    const entries = readReliabilityEntries();
    // Guard the index access below: a short log should fail here with a clear
    // message rather than with a TypeError on `entries[3]`.
    expect(entries).toHaveLength(4);
    expect(entries[0].errorCode).toBe('503');
    expect(entries[3].status).toBe('SKIP');
  });

  it('should throw if an absolute path is used in files', async () => {
    await expectFilePathRejected('test-absolute-path', '/etc/passwd');
  });

  it('should throw if directory traversal is detected in files', async () => {
    await expectFilePathRejected('test-traversal', '../sensitive.txt');
  });
});

0 commit comments

Comments
 (0)