-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathscript.py
More file actions
89 lines (69 loc) · 2.5 KB
/
script.py
File metadata and controls
89 lines (69 loc) · 2.5 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
import os
import re
from pathlib import Path
from src.utils import load_csv
def gen_vocab_files(exp_folder: Path):
    """Generate the vocabulary files for an experiment folder.

    Reads ``<exp_folder>/csvs/all.csv`` through ``load_csv`` and
    ``<exp_folder>/program.dlog`` as text, then writes three files into
    ``<exp_folder>/vocab`` (the directory is created if missing):

    * ``constants.txt`` -- sorted unique values of the 2nd and 3rd field
      of every row,
    * ``pred.txt`` -- sorted unique values of the 1st field of every row,
    * ``special.txt`` -- every double-quoted string found in the program
      that starts with ``"s"``, in order of appearance (duplicates kept).

    Each file holds one item per line with no trailing newline.

    Args:
        exp_folder: Experiment directory. A plain string is accepted as
            well and converted to ``Path``.
    """
    exp_folder = Path(exp_folder)

    # Rows are assumed to be indexable with at least three fields
    # (predicate, constant, constant) -- TODO confirm against load_csv.
    data = load_csv(exp_folder / "csvs" / "all.csv")
    print(len(data))

    constants = set()
    preds = set()
    for entry in data:
        # NOTE(review): per-row debug output kept as in the original;
        # consider moving it to logging.
        print(entry)
        preds.add(entry[0])
        constants.add(entry[1])
        constants.add(entry[2])
    constants = sorted(constants)
    preds = sorted(preds)
    print("PREDS", preds)

    vocab_dir = exp_folder / "vocab"
    vocab_dir.mkdir(exist_ok=True)
    with open(vocab_dir / "constants.txt", "w+") as f:
        f.write("\n".join(constants))
    with open(vocab_dir / "pred.txt", "w+") as f:
        f.write("\n".join(preds))

    with open(exp_folder / "program.dlog", "r") as f:
        program = f.read()
    # The pattern also matches an empty quoted string (""), which yields
    # "" here; startswith() handles that case, whereas indexing x[0]
    # raised IndexError.
    special = [x for x in re.findall('"(.*?)"', program) if x.startswith("s")]
    with open(vocab_dir / "special.txt", "w+") as f:
        f.write("\n".join(special))
def preprocess_data(exp_folder: Path | str):
    """Normalise an experiment's CSV files in place and rebuild its vocab.

    Each of ``kg.csv``, ``train.csv``, ``test.csv`` and ``val.csv`` under
    ``<exp_folder>/csvs`` that exists is passed to ``to_proper_csv``;
    missing files are skipped. Afterwards ``gen_vocab_files`` is run on
    the experiment folder.

    NOTE(review): ``all.csv`` is not converted here although
    ``gen_vocab_files`` reads it -- confirm that this is intended.

    Args:
        exp_folder: Experiment directory, as a ``Path`` or a string.
    """
    root = Path(exp_folder)
    csv_dir = root / "csvs"
    for name in ("kg.csv", "train.csv", "test.csv", "val.csv"):
        candidate = csv_dir / name
        if not candidate.exists():
            continue
        to_proper_csv(candidate)
    gen_vocab_files(root)
def prolog_to_csv(line):
    """Convert a Prolog-style fact into a comma-separated row.

    Every ``.`` and ``)`` is removed and every ``(`` is replaced by a
    comma, so ``"rel(a,b)."`` becomes ``"rel,a,b"``.

    Args:
        line: One fact as text; it must contain a ``)``.

    Returns:
        The converted line.

    Raises:
        AssertionError: If ``line`` contains no ``)``.
    """
    # Raised explicitly rather than via `assert`, so the check still runs
    # under `python -O`; the exception type is unchanged for callers.
    if ")" not in line:
        raise AssertionError(f"not a Prolog-style fact (no ')'): {line!r}")
    return line.replace(".", "").replace(")", "").replace("(", ",")
def kg_to_csv(line):
    """Convert a tab-separated triple into a comma-separated row.

    The line is split on tabs into exactly three fields; the first two
    fields swap places and the result is joined with commas, so
    ``"a<TAB>r<TAB>b"`` becomes ``"r,a,b"``.

    Args:
        line: One line holding three tab-separated fields.

    Returns:
        The reordered, comma-joined line.

    Raises:
        AssertionError: If the line does not split into exactly three
            fields.
    """
    fields = line.split("\t")
    # Raised explicitly rather than via `assert`, so the check still runs
    # under `python -O`; the exception type is unchanged for callers.
    if len(fields) != 3:
        raise AssertionError(
            f"expected 3 tab-separated fields, got {len(fields)}: {line!r}"
        )
    first, second, third = fields
    return ",".join((second, first, third))
def to_proper_csv(file_name):
    """Rewrite a data file in place as comma-separated rows.

    Lines are stripped of surrounding whitespace and blank lines are
    dropped. If any remaining line contains ``(``, every line is passed
    through ``prolog_to_csv``; then, if any line contains a tab, every
    line is passed through ``kg_to_csv``. The result is written back to
    the same path, joined by newlines with no trailing newline.

    Args:
        file_name: Path of the file to convert (overwritten).
    """
    rows = []
    with open(file_name, "r") as src:
        for raw in src:
            cleaned = raw.strip()
            if cleaned:
                rows.append(cleaned)

    has_paren = any("(" in row for row in rows)
    if has_paren:
        rows = [prolog_to_csv(row) for row in rows]

    has_tab = any("\t" in row for row in rows)
    if has_tab:
        rows = [kg_to_csv(row) for row in rows]

    with open(file_name, "w") as dst:
        dst.write("\n".join(rows))
def sort_file(filename):
    """Sort the non-blank lines of a text file in place.

    The file is read whole and split on newlines; each piece is stripped
    of surrounding whitespace and empty pieces are discarded. The
    remaining lines are sorted and written back to the same path, joined
    by newlines with no trailing newline.

    Args:
        filename: Path of the file to sort (overwritten).
    """
    with open(filename, "r") as handle:
        content = handle.read()

    kept = []
    for piece in content.split("\n"):
        piece = piece.strip()
        if piece:
            kept.append(piece)
    kept.sort()

    with open(filename, "w") as handle:
        handle.write("\n".join(kept))
if __name__ == "__main__":
    # Script entry point: convert each CSV of the chosen dataset in place
    # (in this fixed order), then regenerate its vocabulary files.
    kg = "umls2"
    for split in ("val", "test", "train", "kg", "all"):
        to_proper_csv(f"data/{kg}/csvs/{split}.csv")
    gen_vocab_files(Path(f"data/{kg}"))