-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathprocess.py
More file actions
116 lines (90 loc) · 3.92 KB
/
process.py
File metadata and controls
116 lines (90 loc) · 3.92 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
#!/usr/bin/env python
# -*- encoding: utf-8 -*-
'''
Code for processing the datasets, writing the extracted keyphrases
to files and getting the final accuracy statistics.
'''
import os, io
from ranks import saliencerank, textrank, tpr, singletpr
from utils import *
import numpy as np
"""Outputs the keyphrases in sepatare text files."""
def writeFiles(keyphrases, fileName, fileDir):
    """Write the keyphrases of one document to ``<fileDir>/<fileName>.txt``.

    The phrases are joined with ``'; '`` and written as bytes on a single
    line without a trailing newline, using ``str.encode()``'s default codec
    (UTF-8 on Python 3). An existing file of the same name is overwritten.

    Args:
        keyphrases: iterable of keyphrase strings.
        fileName: output file name without the ``.txt`` extension.
        fileDir: directory that receives the file.
    """
    path = os.path.join(fileDir, fileName + ".txt")
    # The context manager closes the handle even if encoding or writing
    # fails, so no file descriptor is leaked on error.
    with io.open(path, 'wb') as keyphraseFile:
        keyphraseFile.write('; '.join(keyphrases).encode())
"""Switch for running the different algorithms"""
def algorithm_switch(argument, topics, pt, txt, article_ID, alpha=0.1):
    """Run the ranking algorithm selected by ``argument`` and return its result.

    Args:
        argument: algorithm selector -- 0: ``textrank``, 1: ``tpr``,
            2: ``saliencerank``, 3: ``singletpr``.
        topics: forwarded unchanged to algorithms 1-3.
        pt: forwarded unchanged to algorithms 1-3.
        txt: document text, forwarded to every algorithm.
        article_ID: forwarded unchanged to algorithms 1-3.
        alpha: forwarded to ``saliencerank`` only.

    Returns:
        Whatever the selected ranking function returns.

    Raises:
        ValueError: if ``argument`` is not one of 0, 1, 2 or 3. Previously an
            unknown selector fell through and returned ``None``, which only
            failed later when the caller iterated over the result.
    """
    if argument == 0:
        return textrank(txt)
    if argument == 1:
        return tpr(topics, pt, txt, article_ID)
    if argument == 2:
        return saliencerank(topics, pt, txt, article_ID, alpha)
    if argument == 3:
        return singletpr(topics, pt, txt, article_ID)
    raise ValueError(
        "unknown algorithm selector %r; expected 0, 1, 2 or 3" % (argument,))
"""Process the Inspec (Hulth2003) dataset
The keyphrases for each document are written to files. """
def process_hulth(lda_file, docsXtopics_file, output_dir, flag):
    """Extract keyphrases for every document of the Inspec (Hulth2003) dataset.

    Every ``*.abstr`` file in ``data/inspec/all`` is processed in sorted
    file-name order; a file's position in that order is passed to the
    algorithm as its article ID. The phrases of each document are handed to
    ``writeFiles`` (using the full input file name, ``.abstr`` included, as
    the output name) and echoed to stdout.

    Args:
        lda_file: path handed to ``parse_weights_from_file``.
        docsXtopics_file: path handed to ``load_docsXtopics_from_file``.
        output_dir: directory receiving one keyphrase file per document.
        flag: algorithm selector forwarded to ``algorithm_switch``.

    Raises:
        IndexError: if a document has no second CRLF-separated line.
    """
    directory = "data/inspec/all"
    text_articles = sorted(
        name for name in os.listdir(directory) if name.endswith(".abstr"))

    pt = np.array(load_docsXtopics_from_file(docsXtopics_file), dtype='float64')
    topics = parse_weights_from_file(lda_file)

    for article_ID, file_name in enumerate(text_articles):
        # Close each handle as soon as it is read instead of leaking one
        # open file per article.
        with io.open(os.path.join(directory, file_name), 'rb') as articleFile:
            raw = articleFile.read()

        lines = raw.strip(b'\t\n\r').split(b'\r\n')
        # Only the second CRLF-separated line is used as the document text.
        article = lines[1].decode()

        phrases = algorithm_switch(flag, topics, pt, article, article_ID)
        # Each result item is a pair; only its first element (the phrase)
        # is kept.
        phrases_topk = [phrase for phrase, _ in phrases]

        writeFiles(phrases_topk, file_name, output_dir)
        print("article ID:", article_ID, "\t phrases topk:" + ';'.join(phrases_topk))
"""Process the 500N dataset
The keyphrases for each document are written to files. """
def process_500N(lda_file, docsXtopics_file, output_dir, flag):
    """Extract keyphrases for every document of the 500N dataset.

    Every ``*.txt`` file in ``data/500N/all`` is processed in sorted
    file-name order; a file's position in that order is passed to the
    algorithm as its article ID. The phrases of each document are handed to
    ``writeFiles`` (using the input file name with its ``.txt`` suffix
    removed as the output name) and echoed to stdout.

    Args:
        lda_file: path handed to ``parse_weights_from_file``.
        docsXtopics_file: path handed to ``load_docsXtopics_from_file``.
        output_dir: directory receiving one keyphrase file per document.
        flag: algorithm selector forwarded to ``algorithm_switch``.

    Raises:
        IndexError: if a document has no second LF-separated line.
    """
    directory = "data/500N/all"
    text_articles = sorted(
        name for name in os.listdir(directory) if name.endswith(".txt"))

    pt = np.array(load_docsXtopics_from_file(docsXtopics_file), dtype='float64')
    topics = parse_weights_from_file(lda_file)

    for article_ID, file_name in enumerate(text_articles):
        # Close each handle as soon as it is read instead of leaking one
        # open file per article.
        with io.open(os.path.join(directory, file_name), 'rb') as articleFile:
            raw = articleFile.read()

        # Only the second LF-separated line is used as the document text.
        article = raw.split(b'\n')[1].decode()

        phrases = algorithm_switch(flag, topics, pt, article, article_ID)
        # Each result item is a pair; only its first element (the phrase)
        # is kept.
        phrases_topk = [phrase for phrase, _ in phrases]

        # Drop the 4-character ".txt" suffix; writeFiles appends its own.
        writeFiles(phrases_topk, file_name[:-4], output_dir)
        print("article ID:", article_ID, "\t phrases topk:" + ';'.join(phrases_topk))
'''Runs a single algorithm on either the Inspec (Hulth2003) or the 500N dataset and writes the extracted keyphrases to files. '''
def process_datasets(algorithm, data_set="inspec"):
    """Run one keyphrase-extraction algorithm over the selected dataset.

    Args:
        algorithm: algorithm selector forwarded to ``process_hulth`` or
            ``process_500N``.
        data_set: ``"inspec"`` (default) selects ``process_hulth``,
            ``"500N"`` selects ``process_500N``. Any other value only prints
            an error message; nothing is processed and nothing is raised.
    """
    if data_set == "inspec":
        lda_file = "lda/lda-topicsXvocab-500-Hulth2003.txt"
        docXtopics_file = "lda/lda-docxXtopics-500-Hulth2003.txt"
        process_hulth(lda_file, docXtopics_file, "results/inspec", algorithm)
    elif data_set == "500N":
        lda_file = "lda/lda-topicsXvocab-500-500N.txt"
        docXtopics_file = "lda/lda-docxXtopics-500-500N.txt"
        process_500N(lda_file, docXtopics_file, "results/500N", algorithm)
    else:
        print("data_set must be inspec or 500N")