Skip to content

Commit 4bcd251

Browse files
committed
refactor mmstar with default template
add an OpenCompass version of MMStar with the official Qwen3 prompt template
1 parent c75f1a4 commit 4bcd251

File tree

6 files changed

+119
-168
lines changed

6 files changed

+119
-168
lines changed
Lines changed: 36 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,36 @@
1+
dataset_kwargs:
2+
token: True
3+
test_split: val
4+
output_type: generate_until
5+
doc_to_target: "answer"
6+
metric_list:
7+
- metric: coarse perception
8+
aggregation: !function utils.mmstar_aggregate_results
9+
higher_is_better: true
10+
- metric: fine-grained perception
11+
aggregation: !function utils.mmstar_aggregate_results
12+
higher_is_better: true
13+
- metric: instance reasoning
14+
aggregation: !function utils.mmstar_aggregate_results
15+
higher_is_better: true
16+
- metric: logical reasoning
17+
aggregation: !function utils.mmstar_aggregate_results
18+
higher_is_better: true
19+
- metric: science & technology
20+
aggregation: !function utils.mmstar_aggregate_results
21+
higher_is_better: true
22+
- metric: math
23+
aggregation: !function utils.mmstar_aggregate_results
24+
higher_is_better: true
25+
- metric: average
26+
aggregation: !function utils.mmstar_aggregate_results
27+
higher_is_better: true
28+
lmms_eval_specific_kwargs:
29+
default:
30+
pre_prompt: ""
31+
post_prompt: "\nAnswer with the option's letter from the given choices directly"
32+
qwen3_vl:
33+
pre_prompt: "Question: "
34+
post_prompt: "Answer with the option letter only."
35+
metadata:
36+
- version: 0.0

lmms_eval/tasks/mmstar/ko_utils.py

Lines changed: 0 additions & 95 deletions
This file was deleted.

lmms_eval/tasks/mmstar/mmstar.yaml

Lines changed: 1 addition & 35 deletions
Original file line numberDiff line numberDiff line change
@@ -1,40 +1,6 @@
11
dataset_path: Lin-Chen/MMStar
2-
dataset_kwargs:
3-
token: True
42
task: "mmstar"
5-
test_split: val
6-
output_type: generate_until
73
doc_to_visual: !function utils.mmstar_doc_to_visual
84
doc_to_text: !function utils.mmstar_doc_to_text
9-
doc_to_target: "answer"
10-
# The return value of process_results will be used by metrics
115
process_results: !function utils.mmstar_process_results
12-
# Note that the metric name can be either a registed metric function (such as the case for GQA) or a key name returned by process_results
13-
metric_list:
14-
- metric: coarse perception
15-
aggregation: !function utils.mmstar_aggregate_results
16-
higher_is_better: true
17-
- metric: fine-grained perception
18-
aggregation: !function utils.mmstar_aggregate_results
19-
higher_is_better: true
20-
- metric: instance reasoning
21-
aggregation: !function utils.mmstar_aggregate_results
22-
higher_is_better: true
23-
- metric: logical reasoning
24-
aggregation: !function utils.mmstar_aggregate_results
25-
higher_is_better: true
26-
- metric: science & technology
27-
aggregation: !function utils.mmstar_aggregate_results
28-
higher_is_better: true
29-
- metric: math
30-
aggregation: !function utils.mmstar_aggregate_results
31-
higher_is_better: true
32-
- metric: average
33-
aggregation: !function utils.mmstar_aggregate_results
34-
higher_is_better: true
35-
lmms_eval_specific_kwargs:
36-
default:
37-
pre_prompt: ""
38-
post_prompt: "\nAnswer with the option's letter from the given choices directly"
39-
metadata:
40-
- version: 0.0
6+
include: _default_template_yaml
Lines changed: 4 additions & 38 deletions
Original file line numberDiff line numberDiff line change
@@ -1,40 +1,6 @@
11
dataset_path: NCSOFT/K-MMStar
2-
dataset_kwargs:
3-
token: True
42
task: "mmstar_ko"
5-
test_split: val
6-
output_type: generate_until
7-
doc_to_visual: !function ko_utils.mmstar_doc_to_visual
8-
doc_to_text: !function ko_utils.mmstar_doc_to_text
9-
doc_to_target: "answer"
10-
# The return value of process_results will be used by metrics
11-
process_results: !function ko_utils.mmstar_process_results
12-
# Note that the metric name can be either a registed metric function (such as the case for GQA) or a key name returned by process_results
13-
metric_list:
14-
- metric: coarse perception
15-
aggregation: !function ko_utils.mmstar_aggregate_results
16-
higher_is_better: true
17-
- metric: fine-grained perception
18-
aggregation: !function ko_utils.mmstar_aggregate_results
19-
higher_is_better: true
20-
- metric: instance reasoning
21-
aggregation: !function ko_utils.mmstar_aggregate_results
22-
higher_is_better: true
23-
- metric: logical reasoning
24-
aggregation: !function ko_utils.mmstar_aggregate_results
25-
higher_is_better: true
26-
- metric: science & technology
27-
aggregation: !function ko_utils.mmstar_aggregate_results
28-
higher_is_better: true
29-
- metric: math
30-
aggregation: !function ko_utils.mmstar_aggregate_results
31-
higher_is_better: true
32-
- metric: average
33-
aggregation: !function ko_utils.mmstar_aggregate_results
34-
higher_is_better: true
35-
lmms_eval_specific_kwargs:
36-
default:
37-
pre_prompt: ""
38-
post_prompt: ""
39-
metadata:
40-
- version: 0.0
3+
doc_to_visual: !function utils.mmstar_doc_to_visual
4+
doc_to_text: !function utils.mmstar_doc_to_text
5+
process_results: !function utils.mmstar_process_results_ko
6+
include: _default_template_yaml
Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,6 @@
1+
dataset_path: morpheushoc/MMStar_opencompass
2+
task: "mmstar_oc"
3+
doc_to_visual: !function utils.mmstar_oc_doc_to_visual
4+
doc_to_text: !function utils.mmstar_oc_doc_to_text
5+
process_results: !function utils.mmstar_process_results
6+
include: _default_template_yaml

lmms_eval/tasks/mmstar/utils.py

Lines changed: 72 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,9 +1,13 @@
1+
import base64
12
import datetime
3+
import io
24
import json
35
import os
6+
import string
47
from collections import defaultdict
58

69
from loguru import logger as eval_logger
10+
from PIL import Image
711

812
from lmms_eval.tasks._task_utils.file_utils import generate_submission_file
913

@@ -26,6 +30,39 @@ def mmstar_doc_to_visual(doc):
2630
return [doc["image"].convert("RGB")]
2731

2832

33+
def mmstar_oc_doc_to_visual(doc):
34+
"""
35+
Opencompass version of MMStar
36+
https://huggingface.co/datasets/morpheushoc/MMStar_opencompass
37+
"""
38+
byte_string = doc["image"]
39+
img_data = base64.b64decode(byte_string)
40+
image = Image.open(io.BytesIO(img_data))
41+
return [image]
42+
43+
44+
def mmstar_oc_doc_to_text(doc, lmms_eval_specific_kwargs=None):
45+
"""
46+
Opencompass version of MMStar: https://huggingface.co/datasets/morpheushoc/MMStar_opencompass
47+
Modified from: https://github.com/open-compass/VLMEvalKit/blob/19c0e386c0967936b5ab4357abdabd670ba5d361/vlmeval/vlm/qwen3_vl/prompt.py#L93
48+
"""
49+
pre_prompt = lmms_eval_specific_kwargs.get("pre_prompt", "")
50+
post_prompt = lmms_eval_specific_kwargs.get("post_prompt", "")
51+
52+
question = doc["question"]
53+
options = {cand: doc[cand] for cand in string.ascii_uppercase if cand in doc}
54+
55+
options_prompt = "Options:\n"
56+
for key, item in options.items():
57+
options_prompt += f"{key}. {item}\n"
58+
59+
prompt = f"{pre_prompt}{question}\n"
60+
prompt += options_prompt
61+
prompt += f"{post_prompt}"
62+
prompt = prompt.rstrip()
63+
return prompt
64+
65+
2966
def mmstar_doc_to_text(doc, lmms_eval_specific_kwargs=None):
3067
question = doc["question"].strip()
3168
if "pre_prompt" in lmms_eval_specific_kwargs and lmms_eval_specific_kwargs["pre_prompt"] != "":
@@ -55,6 +92,41 @@ def exact_match(pred, gt):
5592
return 0.0
5693

5794

95+
def exact_match_ko(pred, gt):
96+
"""Brought from MMStar"""
97+
answer = gt.lower().replace("\n", " ").strip()
98+
predict = pred.lower().replace("\n", " ").strip()
99+
try:
100+
if answer == predict[0]:
101+
return 1.0
102+
elif predict[0] == "(" and answer == predict[1]:
103+
return 1.0
104+
elif predict[0:3] == "옵션 " and answer == predict[3]:
105+
return 1.0
106+
elif predict[0:4] == "정답은 " and answer == predict[4]:
107+
return 1.0
108+
except Exception as e:
109+
return 0.0
110+
return 0.0
111+
112+
113+
def mmstar_process_results_ko(doc, results):
114+
"""
115+
Args:
116+
doc: a instance of the eval dataset
117+
results: [pred]
118+
Returns:
119+
a dictionary with key: metric name, value: metric value
120+
"""
121+
pred = results[0]
122+
gt = doc["answer"]
123+
124+
score = exact_match_ko(pred, gt)
125+
category = doc["category"]
126+
l2_category = doc["l2_category"]
127+
return {category: {"question_id": doc["index"], "l2_category": l2_category, "score": score}, "average": {"question_id": doc["index"], "l2_category": l2_category, "score": score}}
128+
129+
58130
def mmstar_process_results(doc, results):
59131
"""
60132
Args:

0 commit comments

Comments
 (0)