# CLongEval/eval.py
#
# Scores saved model responses for the selected datasets and appends the
# per-dataset results to ./eval_results/<model_name>.jsonl.
import os
import argparse
import jsonlines
import sys
sys.setrecursionlimit(10000)

from metrics import (
    qa_f1_zh_score,
    rouge_zh_score,
    news_labeling_zh_score,
    typo_detection_zh_score,
    retrieval_zh_edit_score,
    table_querying_match_score,
)

# Maps each dataset name to the metric function used to score it.
# The functions come from the local `metrics` module and are invoked as
# metric(prediction=..., ground_truth=...).
dataset2metric = {
    "long_story_qa": qa_f1_zh_score,
    "long_conversation_memory": qa_f1_zh_score,
    "long_story_summarization": rouge_zh_score,
    "stacked_news_labeling": news_labeling_zh_score,
    "stacked_typo_detection": typo_detection_zh_score,
    "key_passage_retrieval": retrieval_zh_edit_score,
    "table_querying": table_querying_match_score
}

def parse_args(args=None):
    """Parse the command-line options of the evaluation script.

    Args:
        args: Optional list of argument strings. When ``None``, argparse
            falls back to the process command line.

    Returns:
        argparse.Namespace with two attributes:
        ``model_name`` (one of the supported model identifiers, or ``None``
        when omitted) and ``datasets`` (the raw string as given, or ``None``
        when omitted).
    """
    supported_models = [
        'chatglm3-6b-32k',
        'qwen-7b-32k',
        'chinese-llama2-7b-64k',
        'chinese-alpaca2-7b-64k',
        'internlm2-7b-200k',
        'internlm2-20b-200k',
        "moonshot-v1",
        "glm4-128k",
        "gpt4-turbo-128k",
    ]

    cli = argparse.ArgumentParser()
    cli.add_argument(
        "--model_name",
        type=str,
        default=None,
        choices=supported_models,
    )
    cli.add_argument("--datasets", type=str, default=None)

    namespace = cli.parse_args(args)
    return namespace

def scorer(dataset_name, predictions, answers):
    """Return the mean metric score for one dataset, scaled to 0-100.

    Args:
        dataset_name: Key into ``dataset2metric`` selecting the metric.
        predictions: Sequence of model outputs.
        answers: Sequence of reference answers, aligned with ``predictions``.

    Returns:
        The average per-sample score multiplied by 100 and rounded to five
        decimal places. ``0.0`` is returned when there are no samples.

    Raises:
        ValueError: If ``predictions`` and ``answers`` differ in length.
        KeyError: If ``dataset_name`` has no registered metric.
    """
    # Explicit check instead of `assert`, which is stripped under `python -O`.
    if len(predictions) != len(answers):
        raise ValueError(
            "predictions and answers must have the same length "
            "({} != {})".format(len(predictions), len(answers))
        )

    # A result file can end up with no usable samples (e.g. every response
    # was skipped as an error); avoid dividing by zero in that case.
    if not predictions:
        return 0.0

    metric = dataset2metric[dataset_name]
    total_score = sum(
        (
            metric(prediction=prediction, ground_truth=ground_truth)
            for prediction, ground_truth in zip(predictions, answers)
        ),
        0.0,
    )
    return round(100 * total_score / len(predictions), 5)

def get_prediction_answer(jsonl_file, model_name):
    """Load model predictions and reference answers from a result file.

    Each record of ``jsonl_file`` must contain the model output under the key
    ``response_<model_name>`` and the reference under ``answer``.

    Records whose response contains ``HTTP_ERROR`` or ``UNKNOW_ERROR`` are
    skipped: due to network errors or other issues, a few samples could not
    be generated with gpt4-turbo-128k even after calling the API 5 times, and
    those samples are excluded from scoring.

    Args:
        jsonl_file: Path to the JSON Lines file with inference results.
        model_name: Model identifier used to build the response key.

    Returns:
        A ``(predictions, ground_truths)`` tuple of equally long lists.
    """
    response_key = 'response_{}'.format(model_name)

    # Ad hoc: for stacked (non-"single") typo-detection files scored for any
    # model other than gpt4-turbo-128k, the answer fields are reordered.
    reorder_typo_answer = (
        'typo_detection' in jsonl_file
        and model_name != "gpt4-turbo-128k"
        and "single" not in jsonl_file
    )

    predictions = []
    ground_truths = []
    with jsonlines.open(jsonl_file, 'r') as f_reader:
        for cur_json_obj in f_reader:
            cur_pred = cur_json_obj[response_key]
            if "HTTP_ERROR" in cur_pred or "UNKNOW_ERROR" in cur_pred:
                continue

            cur_ans = cur_json_obj['answer']
            if reorder_typo_answer:
                fields = cur_ans.split("，")
                # Target order: paragraph_id, wrong_word, right_word.
                # The third field previously repeated fields[0] (the
                # paragraph id), so the right word was never emitted.
                # NOTE(review): assumes the stored answer is
                # "paragraph_id，right_word，wrong_word" - confirm against
                # the dataset.
                cur_ans = "，".join([fields[0], fields[2], fields[1]])

            predictions.append(cur_pred)
            ground_truths.append(cur_ans)
    return predictions, ground_truths

def main():
    """Score the saved responses of one model and append the results to disk.

    Reads ``./inference_results/<model_name>/<dataset>/{small,medium,large}.jsonl``
    for every dataset named in ``--datasets`` (names separated by ``-``),
    scores each split that exists, and additionally computes an ``all`` score
    over the concatenated samples when all three splits are present. One
    record per dataset is appended to ``./eval_results/<model_name>.jsonl``.
    """
    args = parse_args()
    # Both options default to None; fail with a clear message instead of an
    # opaque TypeError/AttributeError further down.
    if args.model_name is None or args.datasets is None:
        raise SystemExit("Both --model_name and --datasets must be provided.")

    chosen_datasets = set(args.datasets.split("-"))
    infer_model_dir = os.path.join('./inference_results', args.model_name)
    dataset_dirs = sorted(os.listdir(infer_model_dir))
    os.makedirs("eval_results", exist_ok=True)

    scores = []
    for cur_dataset_dir in dataset_dirs:
        if cur_dataset_dir not in chosen_datasets:
            continue
        print(cur_dataset_dir)

        split_scores = {}
        all_predictions = []
        all_answers = []
        all_splits_present = True
        for split in ("small", "medium", "large"):
            split_file = os.path.join(
                infer_model_dir, cur_dataset_dir, "{}.jsonl".format(split)
            )
            if not os.path.exists(split_file):
                split_scores[split] = None
                all_splits_present = False
                continue
            predictions, answers = get_prediction_answer(split_file, args.model_name)
            split_scores[split] = scorer(cur_dataset_dir, predictions, answers)
            all_predictions.extend(predictions)
            all_answers.extend(answers)

        # The combined score is only reported when every split was available.
        if all_splits_present:
            all_score = scorer(cur_dataset_dir, all_predictions, all_answers)
        else:
            all_score = None

        scores.append({
            "dataset": cur_dataset_dir,
            "small": split_scores["small"],
            "medium": split_scores["medium"],
            "large": split_scores["large"],
            "all": all_score,
            "metric": dataset2metric[cur_dataset_dir].__name__,
        })

    # Append mode: results of earlier runs for the same model are kept.
    with jsonlines.open("./eval_results/{}.jsonl".format(args.model_name), mode="a") as writer:
        for score_dict in scores:
            writer.write(score_dict)


if __name__ == "__main__":
    main()