Merge branch 'main' into jean/default_max_tokens

neginraoof · web-flow · commit 43dfdf214e03 · 2025-06-05T01:56:31.000-07:00
diff --git a/README.md b/README.md
@@ -428,9 +428,9 @@ If you find Evalchemy useful, please consider citing us!
 
 ```
 @software{Evalchemy: Automatic evals for LLMs,
-  author = {Guha, Etash and Raoof, Negin and Mercat, Jean and Marten, Ryan and Frankel, Eric and Keh, Sedrick and Grover, Sachin and Smyrnis, George and Vu, Trung and Saad-Falcon, Jon and Choi, Caroline and Arora, Kushal and Merrill, Mike and Deng, Yichuan and Suvarna, Ashima and Bansal, Hritik and Nezhurina, Marianna and Choi, Yejin and Heckel, Reinhard and Oh, Seewong and Hashimoto, Tatsunori and Jitsev, Jenia and Shankar, Vaishaal and Dimakis, Alex and Sathiamoorthy, Mahesh and Schmidt, Ludwig},
-  month = nov,
+  author = {Raoof, Negin and Guha, Etash Kumar and Marten, Ryan and Mercat, Jean and Frankel, Eric and Keh, Sedrick and Bansal, Hritik and Smyrnis, Georgios and Nezhurina, Marianna and Vu, Trung and Sprague, Zayne Rea and Merrill, Mike A and Chen, Liangyu and Choi, Caroline and Khan, Zaid and Grover, Sachin and Feuer, Benjamin and Suvarna, Ashima and Su, Shiye and Zhao, Wanjia and Sharma, Kartik and Ji, Charlie Cheng-Jie and Arora, Kushal and Li, Jeffrey and Gokaslan, Aaron and Pratt, Sarah M and Muennighoff, Niklas and Saad-Falcon, Jon and Yang, John and Aali, Asad and Pimpalgaonkar, Shreyas and Albalak, Alon and Dave, Achal and Pouransari, Hadi and Durrett, Greg and Oh, Sewoong and Hashimoto, Tatsunori and Shankar, Vaishaal and Choi, Yejin and Bansal, Mohit and Hegde, Chinmay and Heckel, Reinhard and Jitsev, Jenia and Sathiamoorthy, Maheswaran and Dimakis, Alex and Schmidt, Ludwig},
+  month = jun,
   title = {{Evalchemy}},
-  year = {2024}
+  year = {2025}
 }
 ```
diff --git a/eval/chat_benchmarks/CodeForces/eval_instruct.py b/eval/chat_benchmarks/CodeForces/eval_instruct.py
@@ -63,7 +63,6 @@ def __init__(
         super().__init__(logger=logger, system_instruction=system_instruction)
         self.debug = debug
         self.max_new_tokens = max_tokens
-        self.seed = seed
         self.n_repeat = 3
         self.filter_interaction_questions = True
 
diff --git a/eval/chat_benchmarks/MTBench/eval_instruct.py b/eval/chat_benchmarks/MTBench/eval_instruct.py
@@ -116,7 +116,6 @@ def get_model_answers(self, model: LM, model_id: str, questions: List[Dict[str,
         max_turns = max(len(q["turns"]) for q in questions)
         answer_file = self.answer_dir / f"{model_id}.jsonl"
 
-        self.config.max_new_token = self.max_new_token
         # Process each turn
         for turn_num in range(max_turns):
             self.logger.info(f"Processing Turn {turn_num + 1}")
diff --git a/eval/chat_benchmarks/MultiPLE/eval_instruct.py b/eval/chat_benchmarks/MultiPLE/eval_instruct.py
@@ -115,7 +115,7 @@ def __init__(
         super().__init__(logger)
         self.languages = languages
         self.data_dir = data_dir
-        self.max_tokens = max_tokens
+        self.max_tokens = max_tokens or 1024
         self.num_workers = num_workers
         self.timeout = timeout
         self.debug = debug
diff --git a/eval/chat_benchmarks/alpaca_eval/eval_instruct.py b/eval/chat_benchmarks/alpaca_eval/eval_instruct.py
@@ -31,7 +31,7 @@ def __init__(
         dataset_name: str = "tatsu-lab/alpaca_eval",
         subset: str = "alpaca_eval",
         split: str = "eval",
-        max_tokens: int = 1024,
+        max_tokens: Optional[int] = 1024,
         temperature: float = 0.5,
         do_sample: bool = True,
         debug: bool = False,
@@ -57,7 +57,7 @@ def __init__(
         self.dataset_name = dataset_name
         self.subset = subset
         self.split = split
-        self.max_tokens = max_tokens
+        self.max_tokens = max_tokens if max_tokens is not None else 1024
         self.temperature = temperature
         self.do_sample = do_sample
         self.debug = debug
diff --git a/eval/chat_benchmarks/zeroeval/eval_instruct.py b/eval/chat_benchmarks/zeroeval/eval_instruct.py
@@ -33,15 +33,15 @@ def __init__(
         self,
         tasks: List[str] = ["zebra-grid", "numersense-v2", "crux", "math-l5"],
         config: Optional[ZeroEvalConfig] = None,
-        max_tokens: int = 4096,
+        max_tokens: Optional[int] = 4096,
         debug: bool = False,
         logger: Optional[logging.Logger] = None,
         system_instruction: Optional[str] = None,
     ):
         super().__init__(logger, system_instruction=system_instruction)
         self.tasks = tasks
         self.config = config or ZeroEvalConfig()
-        self.max_new_tokens = int(max_tokens)
+        self.max_new_tokens = int(max_tokens) if max_tokens is not None else 4096
         self.debug = debug
 
     def load_dataset(self, data_name: str) -> Tuple[List[str], List[str], List[Dict[str, Any]], Dict[str, Any]]: