Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
36 changes: 36 additions & 0 deletions lmms_eval/tasks/mmstar/_default_template_yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,36 @@
dataset_kwargs:
token: True
test_split: val
output_type: generate_until
doc_to_target: "answer"
metric_list:
- metric: coarse perception
aggregation: !function utils.mmstar_aggregate_results
higher_is_better: true
- metric: fine-grained perception
aggregation: !function utils.mmstar_aggregate_results
higher_is_better: true
- metric: instance reasoning
aggregation: !function utils.mmstar_aggregate_results
higher_is_better: true
- metric: logical reasoning
aggregation: !function utils.mmstar_aggregate_results
higher_is_better: true
- metric: science & technology
aggregation: !function utils.mmstar_aggregate_results
higher_is_better: true
- metric: math
aggregation: !function utils.mmstar_aggregate_results
higher_is_better: true
- metric: average
aggregation: !function utils.mmstar_aggregate_results
higher_is_better: true
lmms_eval_specific_kwargs:
default:
pre_prompt: ""
post_prompt: "\nAnswer with the option's letter from the given choices directly"
qwen3_vl:
pre_prompt: "Question: "
post_prompt: "Answer with the option letter only."
metadata:
- version: 0.0
95 changes: 0 additions & 95 deletions lmms_eval/tasks/mmstar/ko_utils.py

This file was deleted.

36 changes: 1 addition & 35 deletions lmms_eval/tasks/mmstar/mmstar.yaml
Original file line number Diff line number Diff line change
@@ -1,40 +1,6 @@
dataset_path: Lin-Chen/MMStar
dataset_kwargs:
token: True
task: "mmstar"
test_split: val
output_type: generate_until
doc_to_visual: !function utils.mmstar_doc_to_visual
doc_to_text: !function utils.mmstar_doc_to_text
doc_to_target: "answer"
# The return value of process_results will be used by metrics
process_results: !function utils.mmstar_process_results
# Note that the metric name can be either a registed metric function (such as the case for GQA) or a key name returned by process_results
metric_list:
- metric: coarse perception
aggregation: !function utils.mmstar_aggregate_results
higher_is_better: true
- metric: fine-grained perception
aggregation: !function utils.mmstar_aggregate_results
higher_is_better: true
- metric: instance reasoning
aggregation: !function utils.mmstar_aggregate_results
higher_is_better: true
- metric: logical reasoning
aggregation: !function utils.mmstar_aggregate_results
higher_is_better: true
- metric: science & technology
aggregation: !function utils.mmstar_aggregate_results
higher_is_better: true
- metric: math
aggregation: !function utils.mmstar_aggregate_results
higher_is_better: true
- metric: average
aggregation: !function utils.mmstar_aggregate_results
higher_is_better: true
lmms_eval_specific_kwargs:
default:
pre_prompt: ""
post_prompt: "\nAnswer with the option's letter from the given choices directly"
metadata:
- version: 0.0
include: _default_template_yaml
42 changes: 4 additions & 38 deletions lmms_eval/tasks/mmstar/mmstar_ko.yaml
Original file line number Diff line number Diff line change
@@ -1,40 +1,6 @@
dataset_path: NCSOFT/K-MMStar
dataset_kwargs:
token: True
task: "mmstar_ko"
test_split: val
output_type: generate_until
doc_to_visual: !function ko_utils.mmstar_doc_to_visual
doc_to_text: !function ko_utils.mmstar_doc_to_text
doc_to_target: "answer"
# The return value of process_results will be used by metrics
process_results: !function ko_utils.mmstar_process_results
# Note that the metric name can be either a registed metric function (such as the case for GQA) or a key name returned by process_results
metric_list:
- metric: coarse perception
aggregation: !function ko_utils.mmstar_aggregate_results
higher_is_better: true
- metric: fine-grained perception
aggregation: !function ko_utils.mmstar_aggregate_results
higher_is_better: true
- metric: instance reasoning
aggregation: !function ko_utils.mmstar_aggregate_results
higher_is_better: true
- metric: logical reasoning
aggregation: !function ko_utils.mmstar_aggregate_results
higher_is_better: true
- metric: science & technology
aggregation: !function ko_utils.mmstar_aggregate_results
higher_is_better: true
- metric: math
aggregation: !function ko_utils.mmstar_aggregate_results
higher_is_better: true
- metric: average
aggregation: !function ko_utils.mmstar_aggregate_results
higher_is_better: true
lmms_eval_specific_kwargs:
default:
pre_prompt: ""
post_prompt: ""
metadata:
- version: 0.0
doc_to_visual: !function utils.mmstar_doc_to_visual
doc_to_text: !function utils.mmstar_doc_to_text
process_results: !function utils.mmstar_process_results_ko
include: _default_template_yaml
6 changes: 6 additions & 0 deletions lmms_eval/tasks/mmstar/mmstar_oc.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,6 @@
dataset_path: morpheushoc/MMStar_opencompass
task: "mmstar_oc"
doc_to_visual: !function utils.mmstar_oc_doc_to_visual
doc_to_text: !function utils.mmstar_oc_doc_to_text
process_results: !function utils.mmstar_process_results
include: _default_template_yaml
72 changes: 72 additions & 0 deletions lmms_eval/tasks/mmstar/utils.py
Original file line number Diff line number Diff line change
@@ -1,9 +1,13 @@
import base64
import datetime
import io
import json
import os
import string
from collections import defaultdict

from loguru import logger as eval_logger
from PIL import Image

from lmms_eval.tasks._task_utils.file_utils import generate_submission_file

Expand All @@ -26,6 +30,39 @@ def mmstar_doc_to_visual(doc):
return [doc["image"].convert("RGB")]


def mmstar_oc_doc_to_visual(doc):
    """
    Opencompass version of MMStar
    https://huggingface.co/datasets/morpheushoc/MMStar_opencompass

    Decode the base64-encoded image stored in ``doc["image"]``.

    Args:
        doc: one dataset instance; ``doc["image"]`` is a base64-encoded
            image payload (str or bytes).

    Returns:
        A single-element list containing the decoded ``PIL.Image`` in RGB.
    """
    byte_string = doc["image"]
    img_data = base64.b64decode(byte_string)
    image = Image.open(io.BytesIO(img_data))
    # Normalize to RGB for consistency with mmstar_doc_to_visual, so
    # palette ("P") or RGBA sources don't reach the model un-converted.
    return [image.convert("RGB")]


def mmstar_oc_doc_to_text(doc, lmms_eval_specific_kwargs=None):
    """
    Opencompass version of MMStar: https://huggingface.co/datasets/morpheushoc/MMStar_opencompass
    Modified from: https://github.com/open-compass/VLMEvalKit/blob/19c0e386c0967936b5ab4357abdabd670ba5d361/vlmeval/vlm/qwen3_vl/prompt.py#L93

    Build the prompt: ``pre_prompt + question``, an "Options:" section listing
    every single capital-letter column ("A".."Z") present in the doc, then
    ``post_prompt``; trailing whitespace is stripped.

    Args:
        doc: dataset instance with "question" plus option columns "A", "B", ...
        lmms_eval_specific_kwargs: optional dict that may provide
            "pre_prompt" and/or "post_prompt" (both default to "").

    Returns:
        The assembled prompt string.
    """
    # Guard the None default — the .get() calls below would otherwise
    # raise AttributeError when no kwargs are supplied.
    if lmms_eval_specific_kwargs is None:
        lmms_eval_specific_kwargs = {}
    pre_prompt = lmms_eval_specific_kwargs.get("pre_prompt", "")
    post_prompt = lmms_eval_specific_kwargs.get("post_prompt", "")

    question = doc["question"]
    # Options are stored as top-level columns keyed by capital letters.
    options = {cand: doc[cand] for cand in string.ascii_uppercase if cand in doc}

    options_prompt = "Options:\n"
    for key, item in options.items():
        options_prompt += f"{key}. {item}\n"

    prompt = f"{pre_prompt}{question}\n"
    prompt += options_prompt
    prompt += f"{post_prompt}"
    prompt = prompt.rstrip()
    return prompt


def mmstar_doc_to_text(doc, lmms_eval_specific_kwargs=None):
question = doc["question"].strip()
if "pre_prompt" in lmms_eval_specific_kwargs and lmms_eval_specific_kwargs["pre_prompt"] != "":
Expand Down Expand Up @@ -55,6 +92,41 @@ def exact_match(pred, gt):
return 0.0


def exact_match_ko(pred, gt):
    """Korean variant of exact-match scoring, brought from MMStar.

    Compare the answer letter extracted from *pred* against the ground-truth
    letter *gt* (case-insensitive, newlines collapsed). Recognized prediction
    shapes: "A...", "(A)...", "옵션 A...", "정답은 A...".

    Args:
        pred: model prediction string.
        gt: ground-truth answer letter.

    Returns:
        1.0 on a match, 0.0 otherwise (including empty/too-short predictions).
    """
    answer = gt.lower().replace("\n", " ").strip()
    predict = pred.lower().replace("\n", " ").strip()
    try:
        if answer == predict[0]:
            return 1.0
        elif predict[0] == "(" and answer == predict[1]:
            return 1.0
        elif predict[0:3] == "옵션 " and answer == predict[3]:
            return 1.0
        elif predict[0:4] == "정답은 " and answer == predict[4]:
            return 1.0
    except IndexError:
        # Prediction is shorter than the pattern being probed (e.g. empty
        # string, or "(" with nothing after it) — treat as no match.
        # Narrowed from a broad `except Exception`: only the indexing above
        # can raise here.
        return 0.0
    return 0.0


def mmstar_process_results_ko(doc, results):
    """
    Score one Korean MMStar prediction for the metric pipeline.

    Args:
        doc: an instance of the eval dataset
        results: [pred]
    Returns:
        a dictionary keyed by the doc's category and "average", each mapping
        to {"question_id", "l2_category", "score"}
    """
    prediction = results[0]
    ground_truth = doc["answer"]
    score = exact_match_ko(prediction, ground_truth)

    record = {
        "question_id": doc["index"],
        "l2_category": doc["l2_category"],
        "score": score,
    }
    # The same record is reported under both the fine-grained category and
    # "average", so per-category and overall aggregations see identical data.
    return {doc["category"]: dict(record), "average": dict(record)}


def mmstar_process_results(doc, results):
"""
Args:
Expand Down