forked from Science-Discovery/LOCA
-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathoffline_evaluate.py
More file actions
43 lines (37 loc) · 1.68 KB
/
offline_evaluate.py
File metadata and controls
43 lines (37 loc) · 1.68 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
import argparse
import os
from src.scorer.phybench import phybench_evaluate
from src.scorer.ABench import ABench_evaluate
def main():
    """Run an offline evaluation of model answers from the command line.

    Parses the command-line options, dispatches to the evaluator selected by
    ``--scheme`` and, on success, prints the absolute path of the output
    directory.

    Command-line options:
        --input: Path to the test questions (JSONL format). Required.
        --cleaned_input: Optional path to cleaned test questions (JSONL
            format). Forwarded only to the ``phybench`` evaluator.
        --results: Path to the model's answers (JSONL format). Required.
        --output_dir: Directory to save evaluation results. Required.
        --scheme: ``phybench`` (default) or ``ABench``.

    Raises:
        SystemExit: With status 1 when the evaluation raises any exception
            (the error summary and traceback are written to stderr). argparse
            itself also exits when the arguments are invalid.
    """
    # Function-scope imports: both modules are needed only by this entry
    # point, and only ``traceback`` on the failure path.
    import sys
    import traceback

    parser = argparse.ArgumentParser(
        description="Offline Evaluation for LLMs on Physics Benchmarks"
    )
    parser.add_argument(
        "--input", required=True,
        help="Path to the test questions (JSONL format)",
    )
    parser.add_argument(
        "--cleaned_input", default=None,
        help="Path to cleaned test questions (JSONL format), optional",
    )
    parser.add_argument(
        "--results", required=True,
        help="Path to the model's answers (JSONL format)",
    )
    parser.add_argument(
        "--output_dir", required=True,
        help="Directory to save evaluation results",
    )
    parser.add_argument(
        "--scheme", choices=["phybench", "ABench"], default="phybench",
        help="Evaluation scheme to use (default: phybench)",
    )
    args = parser.parse_args()

    try:
        if args.scheme == "phybench":
            phybench_evaluate.run_evaluation(
                test_path=args.input,
                cleaned_input=args.cleaned_input,
                results_path=args.results,
                output_dir=args.output_dir,
            )
        elif args.scheme == "ABench":
            # This evaluator is not given the cleaned-input path.
            ABench_evaluate.run_evaluation(
                test_path=args.input,
                results_path=args.results,
                output_dir=args.output_dir,
            )
        else:
            # Defensive only: argparse's ``choices`` already rejects any
            # other value before this point.
            raise ValueError("Unsupported evaluation scheme specified.")
        print(f"Finished!Results saved at: {os.path.abspath(args.output_dir)}")
    except Exception as e:
        # Top-level boundary: report the failure and turn it into a non-zero
        # exit status. The summary goes to stderr so it stays together with
        # the traceback, which print_exc() also writes to stderr.
        print(f"Failed: {repr(e)}", file=sys.stderr)
        traceback.print_exc()
        # sys.exit rather than the bare ``exit`` helper: the latter is added
        # by the ``site`` module for interactive use and may not exist
        # (e.g. ``python -S``), which would mask the real error.
        sys.exit(1)
# Script entry point: start the command-line evaluation only when this file
# is executed directly, not when it is imported as a module.
if __name__ == "__main__":
    main()