-
Notifications
You must be signed in to change notification settings - Fork 3
Expand file tree
/
Copy pathsplit_data.py
More file actions
33 lines (26 loc) · 1.14 KB
/
split_data.py
File metadata and controls
33 lines (26 loc) · 1.14 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
import json
import random
import argparse
from itertools import chain
# Command-line interface of the train/test splitter.
# NOTE: the arguments are parsed at module level, i.e. as soon as this file
# is executed (or imported), not only under the __main__ guard.
parser = argparse.ArgumentParser(
    description='Process chat instructions',
)

# --files may be given several times and each occurrence takes one or more
# paths, so the parsed value is a list of lists (or None when the flag is
# never passed).
parser.add_argument(
    "--files",
    action="append",
    nargs="+",
    help="Input files",
)

# Destination of every line that is not selected for the test split.
parser.add_argument(
    "--train_file",
    default="data/DroidCall_train.jsonl",
    type=str,
    help="Output train file",
)

# Destination of the lines selected for the test split.
parser.add_argument(
    "--test_file",
    default="data/DroidCall_test.jsonl",
    type=str,
    help="Output test file",
)

# How many of the shuffled lines go to the test file.
parser.add_argument(
    "--num_test",
    default=200,
    type=int,
    help="Number of test instructions",
)

args = parser.parse_args()
def load_lines(paths):
    """Read all non-blank lines from *paths*, each terminated by a newline.

    Args:
        paths: Iterable of text file paths, read in order as UTF-8.

    Returns:
        A list with every non-blank line of every file. Each entry ends with
        ``"\\n"`` even when the source file's last line had no trailing
        newline; otherwise that line would be glued to the following one
        once the lines are shuffled and written back out.

    Raises:
        OSError: If one of the files cannot be opened or read.
    """
    lines = []
    for path in paths:
        with open(path, "r", encoding="utf-8") as f:
            for line in f:
                # Whitespace-only lines carry no record; keeping them would
                # let empty entries end up in the test split.
                if not line.strip():
                    continue
                lines.append(line if line.endswith("\n") else line + "\n")
    return lines


def split_lines(lines, num_test, rng=None):
    """Shuffle *lines* and split them into train and test portions.

    Args:
        lines: Sequence of lines to split; it is not modified.
        num_test: Number of lines to put into the test portion. When it
            exceeds ``len(lines)`` every line goes to the test portion.
        rng: Optional object with a ``shuffle`` method (for example a seeded
            ``random.Random``). Defaults to the ``random`` module.

    Returns:
        A ``(train_lines, test_lines)`` tuple of lists: the first
        ``num_test`` shuffled lines are the test portion, the rest the
        train portion.

    Raises:
        ValueError: If ``num_test`` is negative; a negative slice bound
            would silently produce an inverted split.
    """
    if num_test < 0:
        raise ValueError(f"num_test must be non-negative, got {num_test}")
    shuffled = list(lines)
    (random if rng is None else rng).shuffle(shuffled)
    return shuffled[num_test:], shuffled[:num_test]


def write_lines(path, lines):
    """Write *lines* (already newline-terminated) to *path* as UTF-8."""
    with open(path, "w", encoding="utf-8") as f:
        f.writelines(lines)


if __name__ == "__main__":
    # --files is declared with nargs="+" and action="append", so it is a
    # list of lists, or None when the flag was never given.
    files = list(chain.from_iterable(args.files or []))
    if not files:
        parser.error("no input files given; pass at least one path via --files")
    if args.num_test < 0:
        parser.error("--num_test must be non-negative")

    all_lines = load_lines(files)

    # Shuffle the lines, put the first num_test of them into the test file
    # and the rest into the train file.
    train_lines, test_lines = split_lines(all_lines, args.num_test)
    write_lines(args.train_file, train_lines)
    write_lines(args.test_file, test_lines)