fix eval configs

terrykong · terrykong · commit 723c4b3b2bdd · 2025-10-17T23:48:02.000Z
Signed-off-by: Terry Kong <terryk@nvidia.com>
diff --git a/nemo_rl/data/__init__.py b/nemo_rl/data/__init__.py
@@ -12,7 +12,7 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
-from typing import NotRequired, TypedDict
+from typing import Literal, NotRequired, TypedDict
 
 
 # TODO: split this typed dict up so it can be PreferenceDataConfig | ResponseDataConfig | etc
@@ -31,7 +31,7 @@ class DataConfig(TypedDict):
     add_generation_prompt: NotRequired[bool]
     add_system_prompt: NotRequired[bool]
     split: NotRequired[str | None]
-    shuffle: NotRequired[bool]
+    shuffle: bool
     seed: NotRequired[int | None]
     download_dir: NotRequired[str]
     train_data_path: NotRequired[str]
@@ -43,9 +43,110 @@ class DataConfig(TypedDict):
     num_workers: NotRequired[int]
 
 
-# TODO: split this typed dict up so it can be MMLUConfig | AIMEConfig | etc
-#       so that we can type check the configs more rigorously as opposed to saying everything
-#       is not required.
-class MathDataConfig(DataConfig):
-    problem_key: NotRequired[str]
-    solution_key: NotRequired[str]
+# ===============================================================================
+# Eval Dataset Configs
+# ===============================================================================
+# These configs correspond to the eval datasets in data/datasets/eval_datasets/
+# Note: TypedDict doesn't allow narrowing types in child classes, so each config
+# is defined independently with common fields repeated.
+
+
+class MMLUEvalDataConfig(TypedDict):
+    """Config for MMLU and multilingual MMLU datasets.
+
+    Supports dataset_name: "mmlu" or "mmlu_{language}" where language is one of:
+    AR-XY, BN-BD, DE-DE, EN-US, ES-LA, FR-FR, HI-IN, ID-ID, IT-IT, JA-JP,
+    KO-KR, PT-BR, ZH-CN, SW-KE, YO-NG
+    """
+
+    max_input_seq_length: int
+    dataset_name: Literal[
+        "mmlu",
+        "mmlu_AR-XY",
+        "mmlu_BN-BD",
+        "mmlu_DE-DE",
+        "mmlu_EN-US",
+        "mmlu_ES-LA",
+        "mmlu_FR-FR",
+        "mmlu_HI-IN",
+        "mmlu_ID-ID",
+        "mmlu_IT-IT",
+        "mmlu_JA-JP",
+        "mmlu_KO-KR",
+        "mmlu_PT-BR",
+        "mmlu_ZH-CN",
+        "mmlu_SW-KE",
+        "mmlu_YO-NG",
+    ]
+    shuffle: NotRequired[bool]
+    prompt_file: NotRequired[str | None]
+    system_prompt_file: NotRequired[str | None]
+
+
+class MMLUProEvalDataConfig(TypedDict):
+    """Config for MMLU Pro dataset."""
+
+    max_input_seq_length: int
+    dataset_name: Literal["mmlu_pro"]
+    shuffle: NotRequired[bool]
+    prompt_file: NotRequired[str | None]
+    system_prompt_file: NotRequired[str | None]
+
+
+class AIMEEvalDataConfig(TypedDict):
+    """Config for AIME datasets."""
+
+    max_input_seq_length: int
+    dataset_name: Literal["aime2024", "aime2025"]
+    shuffle: NotRequired[bool]
+    prompt_file: NotRequired[str | None]
+    system_prompt_file: NotRequired[str | None]
+
+
+class GPQAEvalDataConfig(TypedDict):
+    """Config for GPQA datasets."""
+
+    max_input_seq_length: int
+    dataset_name: Literal["gpqa", "gpqa_diamond"]
+    shuffle: NotRequired[bool]
+    prompt_file: NotRequired[str | None]
+    system_prompt_file: NotRequired[str | None]
+
+
+class MathEvalDataConfig(TypedDict):
+    """Config for Math datasets."""
+
+    max_input_seq_length: int
+    dataset_name: Literal["math", "math500"]
+    shuffle: NotRequired[bool]
+    prompt_file: NotRequired[str | None]
+    system_prompt_file: NotRequired[str | None]
+
+
+class LocalMathEvalDataConfig(TypedDict):
+    """Config for local math datasets loaded from files.
+
+    dataset_name can be a URL or local file path.
+    Also requires problem_key, solution_key, and file_format; split is optional.
+    """
+
+    max_input_seq_length: int
+    dataset_name: str  # URL or file path
+    problem_key: str
+    solution_key: str
+    file_format: Literal["csv", "json"]
+    split: NotRequired[str | None]
+    shuffle: NotRequired[bool]
+    prompt_file: NotRequired[str | None]
+    system_prompt_file: NotRequired[str | None]
+
+
+# Union type for all eval dataset configs
+EvalDataConfigType = (
+    MMLUEvalDataConfig
+    | MMLUProEvalDataConfig
+    | AIMEEvalDataConfig
+    | GPQAEvalDataConfig
+    | MathEvalDataConfig
+    | LocalMathEvalDataConfig
+)
diff --git a/nemo_rl/evals/eval.py b/nemo_rl/evals/eval.py
@@ -25,7 +25,7 @@
 from transformers import AutoTokenizer
 
 from nemo_rl.algorithms.utils import set_seed
-from nemo_rl.data import MathDataConfig
+from nemo_rl.data import EvalDataConfigType
 from nemo_rl.data.collate_fn import eval_collate_fn
 from nemo_rl.data.datasets import AllTaskProcessedDataset
 from nemo_rl.data.llm_message_utils import get_keys_from_message_log
@@ -58,7 +58,7 @@ class MasterConfig(TypedDict):
     eval: EvalConfig
     generation: GenerationConfig  # Fixed: was 'generate'
     tokenizer: TokenizerConfig  # Added missing tokenizer key
-    data: MathDataConfig
+    data: EvalDataConfigType
     env: _PassThroughMathConfig
     cluster: ClusterConfig