Skip to content

Commit a8e80d4

Browse files
committed
Reject blocked experiment outputs
1 parent fb12478 commit a8e80d4

5 files changed

Lines changed: 98 additions & 1 deletion

File tree

src/agentworld/research/__init__.py

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -8,6 +8,7 @@
88
ExperimentManifest,
99
format_experiment_manifest_for_prompt,
1010
load_experiment_manifest,
11+
validate_experiment_execution,
1112
validate_experiment_manifest,
1213
write_experiment_manifest,
1314
)
@@ -35,6 +36,7 @@
3536
"load_experiment_manifest",
3637
"load_hypothesis_manifest",
3738
"validate_citation_verification",
39+
"validate_experiment_execution",
3840
"validate_experiment_manifest",
3941
"validate_literature_evidence",
4042
"write_experiment_manifest",

src/agentworld/research/experiment.py

Lines changed: 32 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -115,6 +115,38 @@ def validate_experiment_manifest(path: Path) -> list[str]:
115115
return problems
116116

117117

118+
def validate_experiment_execution(workspace: RunWorkspace) -> list[str]:
    """Check ``results.json`` for signs that the experiment did not complete.

    Args:
        workspace: Run workspace. ``workspace.results_dir / "results.json"`` is
            inspected, and declared output paths are resolved against
            ``workspace.run_root``.

    Returns:
        Human-readable problem descriptions. The list is empty when
        ``results.json`` does not exist or when no problem is detected.
    """
    results_path = workspace.results_dir / "results.json"
    if not results_path.exists():
        # No results file: this validator has nothing to inspect.
        return []

    # Read and parse separately so an unreadable or non-UTF-8 file is reported
    # as a validation problem instead of escaping as an unhandled exception.
    try:
        raw_text = results_path.read_text(encoding="utf-8")
    except (OSError, UnicodeDecodeError) as exc:
        return [f"results.json could not be read: {exc}"]
    try:
        payload = json.loads(raw_text)
    except json.JSONDecodeError as exc:
        return [f"results.json is not valid JSON: {exc}"]
    if not isinstance(payload, dict):
        return ["results.json must contain a JSON object."]

    problems: list[str] = []
    # Only an explicit JSON ``false`` counts; a missing key is not a failure.
    if payload.get("experiments_executed") is False:
        problems.append("results.json reports experiments_executed=false.")
    execution_status = str(payload.get("execution_status") or "").strip().lower()
    if execution_status in {"blocked", "failed", "not_run", "not_executed", "skipped"}:
        problems.append(f"results.json reports execution_status={execution_status}.")
    if payload.get("execution_blocker"):
        problems.append("results.json contains execution_blocker; the experiment did not complete.")

    expected_outputs = payload.get("expected_outputs_on_success")
    # Declared outputs are only checked when the run otherwise looks successful;
    # a blocked or failed run is already rejected by the problems above.
    if isinstance(expected_outputs, dict) and not problems:
        missing = [
            str(value)
            for value in expected_outputs.values()
            if not (workspace.run_root / str(value)).exists()
        ]
        if missing:
            problems.append("results.json expected output(s) are missing: " + ", ".join(missing))
    return problems
148+
149+
118150
def format_experiment_manifest_for_prompt(manifest: ExperimentManifest, max_results: int = 5) -> str:
119151
lines = [
120152
f"Experiment manifest generated at: {manifest.generated_at}",

src/agentworld/workflows/auto_research.py

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -23,6 +23,7 @@
2323
)
2424
from ..research import (
2525
validate_citation_verification,
26+
validate_experiment_execution,
2627
validate_experiment_manifest,
2728
validate_literature_evidence,
2829
write_experiment_manifest,
@@ -548,6 +549,7 @@ def _validate_auto_research_artifacts(self, workspace: RunWorkspace, stage: Stag
548549
problems.extend(f"{stage.title}: {problem}" for problem in validate_literature_evidence(workspace))
549550
if stage.number >= 5:
550551
problems.extend(f"{stage.title}: {problem}" for problem in validate_experiment_manifest(workspace.experiment_manifest))
552+
problems.extend(f"{stage.title}: {problem}" for problem in validate_experiment_execution(workspace))
551553
if stage.number >= 7:
552554
citation_path = workspace.artifacts_dir / "citation_verification.json"
553555
problems.extend(f"{stage.title}: {problem}" for problem in validate_citation_verification(citation_path))

tests/test_auto_research_workflow.py

Lines changed: 12 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -88,7 +88,18 @@ def _materialize_stage_files(self, slug: str, workspace) -> list[str]:
8888
if slug == "05_experimentation":
8989
write_text(
9090
workspace.results_dir / "results.json",
91-
json.dumps([{"setting": "base", "score": 0.81}, {"setting": "ablation", "score": 0.77}], indent=2),
91+
json.dumps(
92+
{
93+
"experiments_executed": True,
94+
"execution_status": "completed",
95+
"model_results": [
96+
{"setting": "base", "score": 0.81},
97+
{"setting": "ablation", "score": 0.77},
98+
],
99+
},
100+
indent=2,
101+
ensure_ascii=True,
102+
),
92103
)
93104
return ["workspace/results/results.json"]
94105
if slug == "06_analysis":

tests/test_research_manifests.py

Lines changed: 50 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -8,6 +8,7 @@
88
from agentworld.research import (
99
format_experiment_manifest_for_prompt,
1010
format_hypothesis_manifest_for_prompt,
11+
validate_experiment_execution,
1112
validate_experiment_manifest,
1213
write_experiment_manifest,
1314
write_hypothesis_manifest,
@@ -162,6 +163,55 @@ def test_write_experiment_manifest_collects_schema_and_context(self) -> None:
162163
self.assertEqual(schema["row_count"], 2)
163164
self.assertIn("Result Artifacts", format_experiment_manifest_for_prompt(manifest))
164165

166+
def test_validate_experiment_execution_rejects_blocked_results(self) -> None:
    """A results.json describing a blocked run must yield all three execution problems."""
    blocked_payload = {
        "experiments_executed": False,
        "execution_status": "blocked",
        "execution_blocker": {"reason": "permission approval required"},
    }
    expected_messages = (
        "results.json reports experiments_executed=false.",
        "results.json reports execution_status=blocked.",
        "results.json contains execution_blocker; the experiment did not complete.",
    )

    with tempfile.TemporaryDirectory() as scratch_dir:
        workspace = create_run_workspace(runs_dir=Path(scratch_dir), run_id="blocked", goal="test")
        serialized = json.dumps(blocked_payload, indent=2, ensure_ascii=True)
        write_text(workspace.results_dir / "results.json", serialized)

        reported = validate_experiment_execution(workspace)

        # Each failure signal in the payload is reported independently.
        for message in expected_messages:
            self.assertIn(message, reported)
187+
188+
def test_validate_experiment_execution_checks_declared_outputs(self) -> None:
    """Only the declared output that is absent from the run should be reported."""
    declared_outputs = {
        "metrics": "workspace/results/metrics.json",
        "figure": "workspace/figures/missing.png",
    }
    completed_payload = {
        "experiments_executed": True,
        "execution_status": "completed",
        "expected_outputs_on_success": declared_outputs,
    }

    with tempfile.TemporaryDirectory() as scratch_dir:
        workspace = create_run_workspace(runs_dir=Path(scratch_dir), run_id="outputs", goal="test")
        # The metrics file is written; the figure is deliberately never created.
        write_text(workspace.results_dir / "metrics.json", "{}\n")
        serialized = json.dumps(completed_payload, indent=2, ensure_ascii=True)
        write_text(workspace.results_dir / "results.json", serialized)

        reported = validate_experiment_execution(workspace)

        expected = ["results.json expected output(s) are missing: workspace/figures/missing.png"]
        self.assertEqual(reported, expected)
214+
165215

166216
if __name__ == "__main__":
167217
unittest.main()

0 commit comments

Comments
 (0)