Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
36 changes: 36 additions & 0 deletions lmms_eval/tasks/mmstar/_default_template_yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,36 @@
dataset_kwargs:
token: True
test_split: val
output_type: generate_until
doc_to_target: "answer"
metric_list:
- metric: coarse perception
aggregation: !function utils.mmstar_aggregate_results
higher_is_better: true
- metric: fine-grained perception
aggregation: !function utils.mmstar_aggregate_results
higher_is_better: true
- metric: instance reasoning
aggregation: !function utils.mmstar_aggregate_results
higher_is_better: true
- metric: logical reasoning
aggregation: !function utils.mmstar_aggregate_results
higher_is_better: true
- metric: science & technology
aggregation: !function utils.mmstar_aggregate_results
higher_is_better: true
- metric: math
aggregation: !function utils.mmstar_aggregate_results
higher_is_better: true
- metric: average
aggregation: !function utils.mmstar_aggregate_results
higher_is_better: true
lmms_eval_specific_kwargs:
default:
pre_prompt: ""
post_prompt: "\nAnswer with the option's letter from the given choices directly"
qwen3_vl:
pre_prompt: "Question: "
post_prompt: "Answer with the option letter only."
metadata:
- version: 0.0
95 changes: 0 additions & 95 deletions lmms_eval/tasks/mmstar/ko_utils.py

This file was deleted.

36 changes: 1 addition & 35 deletions lmms_eval/tasks/mmstar/mmstar.yaml
Original file line number Diff line number Diff line change
@@ -1,40 +1,6 @@
dataset_path: Lin-Chen/MMStar
dataset_kwargs:
token: True
task: "mmstar"
test_split: val
output_type: generate_until
doc_to_visual: !function utils.mmstar_doc_to_visual
doc_to_text: !function utils.mmstar_doc_to_text
doc_to_target: "answer"
# The return value of process_results will be used by metrics
process_results: !function utils.mmstar_process_results
# Note that the metric name can be either a registed metric function (such as the case for GQA) or a key name returned by process_results
metric_list:
- metric: coarse perception
aggregation: !function utils.mmstar_aggregate_results
higher_is_better: true
- metric: fine-grained perception
aggregation: !function utils.mmstar_aggregate_results
higher_is_better: true
- metric: instance reasoning
aggregation: !function utils.mmstar_aggregate_results
higher_is_better: true
- metric: logical reasoning
aggregation: !function utils.mmstar_aggregate_results
higher_is_better: true
- metric: science & technology
aggregation: !function utils.mmstar_aggregate_results
higher_is_better: true
- metric: math
aggregation: !function utils.mmstar_aggregate_results
higher_is_better: true
- metric: average
aggregation: !function utils.mmstar_aggregate_results
higher_is_better: true
lmms_eval_specific_kwargs:
default:
pre_prompt: ""
post_prompt: "\nAnswer with the option's letter from the given choices directly"
metadata:
- version: 0.0
include: _default_template_yaml
42 changes: 4 additions & 38 deletions lmms_eval/tasks/mmstar/mmstar_ko.yaml
Original file line number Diff line number Diff line change
@@ -1,40 +1,6 @@
dataset_path: NCSOFT/K-MMStar
dataset_kwargs:
token: True
task: "mmstar_ko"
test_split: val
output_type: generate_until
doc_to_visual: !function ko_utils.mmstar_doc_to_visual
doc_to_text: !function ko_utils.mmstar_doc_to_text
doc_to_target: "answer"
# The return value of process_results will be used by metrics
process_results: !function ko_utils.mmstar_process_results
# Note that the metric name can be either a registed metric function (such as the case for GQA) or a key name returned by process_results
metric_list:
- metric: coarse perception
aggregation: !function ko_utils.mmstar_aggregate_results
higher_is_better: true
- metric: fine-grained perception
aggregation: !function ko_utils.mmstar_aggregate_results
higher_is_better: true
- metric: instance reasoning
aggregation: !function ko_utils.mmstar_aggregate_results
higher_is_better: true
- metric: logical reasoning
aggregation: !function ko_utils.mmstar_aggregate_results
higher_is_better: true
- metric: science & technology
aggregation: !function ko_utils.mmstar_aggregate_results
higher_is_better: true
- metric: math
aggregation: !function ko_utils.mmstar_aggregate_results
higher_is_better: true
- metric: average
aggregation: !function ko_utils.mmstar_aggregate_results
higher_is_better: true
lmms_eval_specific_kwargs:
default:
pre_prompt: ""
post_prompt: ""
metadata:
- version: 0.0
doc_to_visual: !function utils.mmstar_doc_to_visual
doc_to_text: !function utils.mmstar_doc_to_text
process_results: !function utils.mmstar_process_results_ko
include: _default_template_yaml
6 changes: 6 additions & 0 deletions lmms_eval/tasks/mmstar/mmstar_oc.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,6 @@
dataset_path: morpheushoc/MMStar_opencompass
task: "mmstar_oc"
doc_to_visual: !function utils.mmstar_oc_doc_to_visual
doc_to_text: !function utils.mmstar_oc_doc_to_text
process_results: !function utils.mmstar_process_results
include: _default_template_yaml
72 changes: 72 additions & 0 deletions lmms_eval/tasks/mmstar/utils.py
Original file line number Diff line number Diff line change
@@ -1,9 +1,13 @@
import base64
import datetime
import io
import json
import os
import string
from collections import defaultdict

from loguru import logger as eval_logger
from PIL import Image

from lmms_eval.tasks._task_utils.file_utils import generate_submission_file

Expand All @@ -26,6 +30,39 @@ def mmstar_doc_to_visual(doc):
return [doc["image"].convert("RGB")]


def mmstar_oc_doc_to_visual(doc):
    """
    Opencompass version of MMStar
    https://huggingface.co/datasets/morpheushoc/MMStar_opencompass

    Decode the base64-encoded image stored in ``doc["image"]``.

    Args:
        doc: one dataset instance; ``doc["image"]`` is a base64-encoded
            image payload (str or bytes).

    Returns:
        A single-element list containing the decoded ``PIL.Image`` in RGB.
    """
    byte_string = doc["image"]
    img_data = base64.b64decode(byte_string)
    image = Image.open(io.BytesIO(img_data))
    # Normalize to RGB for consistency with mmstar_doc_to_visual, so
    # palette ("P") or RGBA sources don't reach the model un-converted.
    return [image.convert("RGB")]


def mmstar_oc_doc_to_text(doc, lmms_eval_specific_kwargs=None):
    """
    Opencompass version of MMStar: https://huggingface.co/datasets/morpheushoc/MMStar_opencompass
    Modified from: https://github.com/open-compass/VLMEvalKit/blob/19c0e386c0967936b5ab4357abdabd670ba5d361/vlmeval/vlm/qwen3_vl/prompt.py#L93

    Build the prompt: ``pre_prompt + question``, an "Options:" section listing
    every single capital-letter column ("A".."Z") present in the doc, then
    ``post_prompt``; trailing whitespace is stripped.

    Args:
        doc: dataset instance with "question" plus option columns "A", "B", ...
        lmms_eval_specific_kwargs: optional dict that may provide
            "pre_prompt" and/or "post_prompt" (both default to "").

    Returns:
        The assembled prompt string.
    """
    # Guard the None default — the .get() calls below would otherwise
    # raise AttributeError when no kwargs are supplied.
    if lmms_eval_specific_kwargs is None:
        lmms_eval_specific_kwargs = {}
    pre_prompt = lmms_eval_specific_kwargs.get("pre_prompt", "")
    post_prompt = lmms_eval_specific_kwargs.get("post_prompt", "")

    question = doc["question"]
    # Options are stored as top-level columns keyed by capital letters.
    options = {cand: doc[cand] for cand in string.ascii_uppercase if cand in doc}

    options_prompt = "Options:\n"
    for key, item in options.items():
        options_prompt += f"{key}. {item}\n"

    prompt = f"{pre_prompt}{question}\n"
    prompt += options_prompt
    prompt += f"{post_prompt}"
    prompt = prompt.rstrip()
    return prompt


def mmstar_doc_to_text(doc, lmms_eval_specific_kwargs=None):
question = doc["question"].strip()
if "pre_prompt" in lmms_eval_specific_kwargs and lmms_eval_specific_kwargs["pre_prompt"] != "":
Expand Down Expand Up @@ -55,6 +92,41 @@ def exact_match(pred, gt):
return 0.0


def exact_match_ko(pred, gt):
    """Korean variant of exact-match scoring, brought from MMStar.

    Compare the answer letter extracted from *pred* against the ground-truth
    letter *gt* (case-insensitive, newlines collapsed). Recognized prediction
    shapes: "A...", "(A)...", "옵션 A...", "정답은 A...".

    Args:
        pred: model prediction string.
        gt: ground-truth answer letter.

    Returns:
        1.0 on a match, 0.0 otherwise (including empty/too-short predictions).
    """
    answer = gt.lower().replace("\n", " ").strip()
    predict = pred.lower().replace("\n", " ").strip()
    try:
        if answer == predict[0]:
            return 1.0
        elif predict[0] == "(" and answer == predict[1]:
            return 1.0
        elif predict[0:3] == "옵션 " and answer == predict[3]:
            return 1.0
        elif predict[0:4] == "정답은 " and answer == predict[4]:
            return 1.0
    except IndexError:
        # Prediction is shorter than the pattern being probed (e.g. empty
        # string, or "(" with nothing after it) — treat as no match.
        # Narrowed from a broad `except Exception`: only the indexing above
        # can raise here.
        return 0.0
    return 0.0


def mmstar_process_results_ko(doc, results):
    """
    Score one Korean MMStar prediction for the metric pipeline.

    Args:
        doc: an instance of the eval dataset
        results: [pred]
    Returns:
        a dictionary keyed by the doc's category and "average", each mapping
        to {"question_id", "l2_category", "score"}
    """
    prediction = results[0]
    ground_truth = doc["answer"]
    score = exact_match_ko(prediction, ground_truth)

    record = {
        "question_id": doc["index"],
        "l2_category": doc["l2_category"],
        "score": score,
    }
    # The same record is reported under both the fine-grained category and
    # "average", so per-category and overall aggregations see identical data.
    return {doc["category"]: dict(record), "average": dict(record)}


def mmstar_process_results(doc, results):
"""
Args:
Expand Down