-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathpreprocessing.py
More file actions
79 lines (65 loc) · 2.46 KB
/
preprocessing.py
File metadata and controls
79 lines (65 loc) · 2.46 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
# -*- coding: utf-8 -*-
import jieba
import pandas as pd
from collections import defaultdict, Counter
import json
def tokenize_zh(df, col_name="source", save_file=False):
    """Build a word-frequency table for a Chinese text column using jieba.

    df: DataFrame holding the text; col_name: column to read;
    save_file: if True, also dump the table to weibo_word_freq.json.
    Returns a defaultdict(int) mapping word -> occurrence count.
    """
    counts = defaultdict(int)
    for sentence in df[col_name]:
        for token in jieba.cut(sentence):
            counts[token] += 1
    if save_file:
        with open("weibo_word_freq.json", "w", encoding="utf-8") as out:
            out.write(json.dumps(counts, indent=4, ensure_ascii=False))
    return counts
class FrequencyCounter(object):
    """Count token or character frequencies over one DataFrame text column."""

    def __init__(self, df, col_name="source"):
        # df: pandas DataFrame; col_name: name of the text column to analyze.
        self.df = df
        self.col_name = col_name

    def _process_extracted_words(self):
        """Split each row on the '¥' delimiter into a list of keywords.

        Returns a dict mapping row index -> list of non-empty keyword
        strings (empty list for blank rows).
        """
        whole_dict = {}
        for num, item in enumerate(self.df[self.col_name].values):
            stripped = item.strip()
            if stripped:
                # Keep only non-empty segments between delimiters.
                whole_dict[num] = [w for w in stripped.split("¥") if w]
            else:
                # Fix: use [] (was ""), so every value is a list and the
                # dict holds one consistent type. Downstream iteration is
                # unchanged since iterating "" also yields nothing.
                whole_dict[num] = []
        return whole_dict

    def _tokenize_keywords(self):
        """Flatten extracted keywords; jieba-segment ones longer than 4 chars,
        keep short ones as single tokens."""
        toks = []
        keyword_dict = self._process_extracted_words()
        for keyword in (kw for words in keyword_dict.values() for kw in words):
            if len(keyword) > 4:
                toks.extend(jieba.cut(keyword))
            else:
                toks.append(keyword)
        return toks

    def _split_by_char(self):
        """Split every sentence into individual characters, dropping ASCII spaces."""
        toks = []
        for sent in self.df[self.col_name]:
            for tok in sent:
                if tok != " ":  # remove space
                    toks.append(tok)
        return toks

    def count_freq(self, save_file=False, by_char=False):
        """Return (token, count) pairs sorted by descending frequency.

        save_file: if True, also dump the table to weibo_char_freq.json.
        by_char: count single characters instead of jieba/keyword tokens.
        """
        toks = self._split_by_char() if by_char else self._tokenize_keywords()
        # Fix: build the Counter once (was computed twice when save_file=True).
        freq = Counter(toks).most_common()
        if save_file:
            json_data = json.dumps(dict(freq), indent=4, ensure_ascii=False)
            with open("weibo_char_freq.json", "w", encoding="utf-8") as f:
                f.write(json_data)
        return freq
if __name__ == "__main__":
    # Load the three SMP2020 splits and pool them into one frame.
    frames = [
        pd.read_csv(path, sep="\t")
        for path in ("./SMP2020/train.tsv", "./SMP2020/dev.tsv", "./SMP2020/test.tsv")
    ]
    combined = pd.concat(frames, axis=0)
    # Dump character-level frequencies for the whole corpus to JSON.
    counter = FrequencyCounter(combined, col_name="text_a")
    counter.count_freq(save_file=True, by_char=True)