-
Notifications
You must be signed in to change notification settings - Fork 1
Expand file tree
/
Copy path preprocess.py
More file actions
88 lines (63 loc) · 2.79 KB
/
preprocess.py
File metadata and controls
88 lines (63 loc) · 2.79 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
# preprocess.py
import yaml
from datasets import load_dataset, DatasetDict
from transformers import AutoTokenizer
import os
import argparse
def tokenize_fn(examples, tokenizer, config):
    """Tokenize one batch of parallel examples, honoring per-row language codes.

    Each row's source/target language code is assigned to the tokenizer before
    encoding, so rows must be processed one at a time rather than as a batch.

    NOTE(review): column names are always read from the *train* split's config,
    even if this is mapped over another split — verify validation shares the
    same column layout.

    Returns a dict of per-example lists: ``input_ids``, ``labels`` and
    ``attention_mask``.
    """
    cols = config["datasets"]["train"]["columns"]
    tok_cfg = config["tokenization"]

    rows = zip(
        examples[cols["source"]],
        examples[cols["target"]],
        examples[cols["src_lang"]],
        examples[cols["tgt_lang"]],
    )

    batch_input_ids = []
    batch_attention = []
    batch_labels = []
    for source_text, target_text, source_lang, target_lang in rows:
        # Language codes differ per row, hence the per-pair tokenizer call.
        tokenizer.src_lang = source_lang
        tokenizer.tgt_lang = target_lang
        encoded = tokenizer(
            source_text,
            text_target=target_text,
            max_length=tok_cfg["max_length"],
            padding=tok_cfg["padding"],
            truncation=tok_cfg["truncation"],
            return_attention_mask=True,
        )
        batch_input_ids.append(encoded["input_ids"])
        batch_attention.append(encoded["attention_mask"])
        batch_labels.append(encoded["labels"])

    return {
        "input_ids": batch_input_ids,
        "labels": batch_labels,
        "attention_mask": batch_attention,
    }
def preprocess_data(config_path="config.yaml", output_dir="data/tokenized"):
    """Load the YAML config, tokenize both dataset splits, and persist them.

    Reads ``config_path``, instantiates the configured model's tokenizer,
    maps :func:`tokenize_fn` over the train and validation splits, and writes
    the results under ``output_dir`` in ``train``/``validation`` subfolders.
    """
    with open(config_path, "r") as f:
        config = yaml.safe_load(f)

    # Tokenizer is chosen by the model name in the config.
    tokenizer = AutoTokenizer.from_pretrained(config["model"]["name"])
    ds_cfg = config["datasets"]

    tokenized = {}
    for split_key, message in (
        ("train", "Tokenizing training data..."),
        ("validation", "Tokenizing validation data..."),
    ):
        raw = load_dataset(ds_cfg[split_key]["name"], split=ds_cfg[split_key]["split"])
        print(message)
        # Drop the original text columns; only tokenized fields are kept.
        tokenized[split_key] = raw.map(
            lambda batch: tokenize_fn(batch, tokenizer, config),
            batched=True,
            remove_columns=raw.column_names,
        )

    # Persist each processed split to its own subdirectory.
    os.makedirs(output_dir, exist_ok=True)
    for split_key, dataset in tokenized.items():
        dataset.save_to_disk(os.path.join(output_dir, split_key))
    print(f"✅ Tokenized datasets saved to: {output_dir}")
if __name__ == "__main__":
    parser = argparse.ArgumentParser(description="Preprocess and tokenize datasets.")
    parser.add_argument("--config", type=str, default="config.yaml", help="Path to config YAML file.")
    # Fix: preprocess_data already accepts output_dir, but the CLI never
    # exposed it; default matches the function's default, so behavior is
    # unchanged for existing invocations.
    parser.add_argument(
        "--output_dir",
        type=str,
        default="data/tokenized",
        help="Directory to write the tokenized datasets to.",
    )
    args = parser.parse_args()
    preprocess_data(config_path=args.config, output_dir=args.output_dir)