-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathwebsite_data.py
More file actions
executable file
·146 lines (136 loc) · 5.14 KB
/
website_data.py
File metadata and controls
executable file
·146 lines (136 loc) · 5.14 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
#! /usr/bin/python
import MySQLdb
import psycopg2
import psycopg2.extras
import bs4
import re
import json
import sys
import SocketServer
import lxml.html
from urllib2 import urlopen
from pattern.vector import count, KNN, stem, PORTER, words, Document
# Shared PostgreSQL connection, opened once at import time and reused by
# setup() and store_query().
# NOTE(review): the credentials are hard-coded in the DSN below; consider
# loading them from the environment or a config file.
conn = psycopg2.connect(
    "dbname='nlpdatabase' user='jpyle' password='xxxsecretxxx' host='192.168.200.35'"
)

# Module-level state; every name stays None until setup() fills it in.
#   knn         -- the KNN classifier
#   pages       -- path or URL -> title/description
#   urlalias    -- alias -> source path
#   revurlalias -- source path -> alias
knn = pages = urlalias = revurlalias = None
def _html_to_text(markup):
    """Flatten an HTML fragment to lower-cased text on a single line.

    A NULL column value (``None``) is treated as an empty fragment so that a
    single row without a body cannot abort the whole rebuild.
    """
    soup = bs4.BeautifulSoup(markup or '')
    return re.sub(r'[\n\r]+', r' ', soup.get_text(' ')).lower()


def setup():
    """Rebuild the title tables and retrain the KNN classifier.

    Reads ``url_alias``, ``taxonomy_term_data``, the practice-area/body join
    and ``node`` from the MySQL database, then the stored query/target pairs
    from ``website_queries`` over the shared PostgreSQL connection ``conn``.

    Everything is built in local objects and only assigned to the module
    globals (``pages``, ``urlalias``, ``revurlalias``, ``knn``) once every
    step has succeeded, so a failed rebuild leaves the previous state in
    place instead of an empty or half-trained one.

    Raises:
        Whatever the database drivers raise; the MySQL connection and the
        PostgreSQL cursor are released first, and the PostgreSQL transaction
        is rolled back.
    """
    global pages
    global urlalias
    global revurlalias
    global knn
    new_pages = dict()
    new_urlalias = dict()
    new_revurlalias = dict()
    new_knn = KNN()
    db = MySQLdb.connect(host="192.168.200.26",
                         user="root",
                         passwd="xxxsecretxxx",
                         db="pla")
    try:
        cur = db.cursor()

        # Alias table, indexed in both directions.
        cur.execute("select source, alias from url_alias")
        for row in cur.fetchall():
            new_urlalias[row[1]] = row[0]
            new_revurlalias[row[0]] = row[1]

        # Taxonomy terms: record the title under the raw path and its alias.
        cur.execute("select tid, name, description, vid from taxonomy_term_data;")
        for row in cur.fetchall():
            url = 'taxonomy/term/' + str(row[0])
            new_pages[url] = row[1]
            if url in new_revurlalias:
                new_pages[new_revurlalias[url]] = row[1]
                url = new_revurlalias[url]
            # NOTE(review): the file's indentation was lost; both train()
            # calls are assumed to belong to this vid == 3 branch -- confirm.
            if row[3] == 3:
                new_knn.train(Document(_html_to_text(row[2]), stemmer=PORTER), url)
                new_knn.train(Document(row[1].lower()), url)

        # Bodies and titles of nodes attached to a term train that term's URL.
        cur.execute("select a.tid, c.body_value, d.title from taxonomy_term_data as a inner join field_data_field_practice_areas as b on (a.tid=b.field_practice_areas_tid and b.entity_type='node' and b.bundle != 'professionals' and b.deleted=0) inner join field_data_body as c on (b.entity_id=c.entity_id and b.entity_type=c.entity_type) inner join node as d on (c.entity_id=d.nid);")
        for row in cur.fetchall():
            url = 'taxonomy/term/' + str(row[0])
            if url in new_revurlalias:
                url = new_revurlalias[url]
            new_knn.train(Document(_html_to_text(row[1]), stemmer=PORTER), url)
            new_knn.train(Document(row[2].lower()), url)

        # Nodes with status=1 contribute titles only (no training).
        cur.execute("select nid, title from node where status=1;")
        for row in cur.fetchall():
            url = 'node/' + str(row[0])
            new_pages[url] = row[1]
            if url in new_revurlalias:
                new_pages[new_revurlalias[url]] = row[1]
    finally:
        # Release the MySQL connection even when a query fails.
        db.close()

    pgcur = conn.cursor()
    try:
        pgcur.execute("select query, target from website_queries where target is not null group by query, target")
        for row in pgcur.fetchall():
            # A target cell may list several labels separated by newlines,
            # commas or semicolons.  Named "targets" so the local does not
            # shadow the ``words`` imported from pattern.vector.
            targets = re.split(r'[\n\r,;]+ *', row[1])
            for target in targets:
                print("training on " + row[0].lower() + " for " + target)
                new_knn.train(Document(row[0].lower()), target)
        conn.commit()
    except Exception:
        # Without a rollback the shared connection would stay in a failed
        # transaction and reject every later statement.
        conn.rollback()
        raise
    finally:
        pgcur.close()

    # Publish the fully built state in one go.
    pages = new_pages
    urlalias = new_urlalias
    revurlalias = new_revurlalias
    knn = new_knn
def store_query(query, suggestion):
    """Insert one row into ``website_queries`` for a query and its answer.

    Args:
        query: the query text as received from the client.
        suggestion: the response sent back, stored in ``orig_sug``.

    Raises:
        Whatever the database driver raises.  The transaction is rolled back
        first so the shared module-level connection ``conn`` stays usable for
        later requests, and the cursor is always closed.
    """
    pgcur = conn.cursor()
    try:
        pgcur.execute("insert into website_queries (query, orig_sug) values (%s, %s);", (query, suggestion))
        conn.commit()
    except Exception:
        conn.rollback()
        raise
    finally:
        pgcur.close()
def evaluate_query(query):
    """Return HTML links for the labels the classifier assigns to ``query``.

    The per-label weights from ``knn.classify(..., discrete=False)`` are
    ranked from highest to lowest.  When that mapping is empty, the single
    label from the discrete classification is used with weight 1.0.

    Args:
        query: the query text (the caller lower-cases it first).

    Returns:
        A list of ``<a href=...>`` strings in ranking order.  Labels whose
        description was already emitted are skipped (see ``fixurl``).  The
        list is empty when the classifier has no label to offer.
    """
    probs = dict(knn.classify(Document(query), discrete=False))
    if not probs:
        fallback = knn.classify(Document(query))
        # Presumably classify() yields None when it has nothing to fall back
        # on (e.g. an untrained model) -- confirm against pattern's docs.
        # fixurl() cannot turn None into a link, so return no suggestions.
        if fallback is None:
            return []
        probs[fallback] = 1.0
    seen = set()
    ranked = sorted(probs, key=probs.get, reverse=True)
    links = [fixurl(label, seen) for label in ranked]
    return [link for link in links if link is not None]
def fixurl(x, seen):
    """Turn a classifier label into an HTML anchor, or None for a duplicate.

    The link text is looked up in ``pages`` (directly, then through
    ``urlalias``).  For a label found in neither, the function tries to fetch
    ``x`` and use the page's ``<title>``; on any failure, or an empty title,
    the label itself is used.  Whatever was chosen is cached in ``pages``.

    Args:
        x: a site path or an absolute URL.
        seen: set of descriptions already emitted; updated in place.

    Returns:
        ``'<a href="...">...</a>'`` with both the URL and the text
        HTML-escaped, or None when the description is already in ``seen``.
    """
    # Local import: available in the standard library of Python 2 and 3.
    from xml.sax.saxutils import escape

    if x in pages:
        description = pages[x]
    elif x in urlalias and urlalias[x] in pages:
        description = pages[urlalias[x]]
    else:
        try:
            response = urlopen(x)
            try:
                description = fixup(lxml.html.parse(response).find(".//title").text)
            finally:
                # Do not leave the HTTP response open until garbage collection.
                response.close()
            if not description:
                description = x
        except Exception as the_err:
            # Best effort: an unreachable or title-less page falls back to
            # showing the label itself.
            sys.stderr.write(str(the_err) + "\n")
            description = x
        pages[x] = description
    if re.search(r'^http', x):
        url = str(x)
    else:
        url = '/' + str(x)
    if description in seen:
        return None
    seen.add(description)
    # Titles can come from remote pages and may contain &, < or >; escape
    # them (and the quote character inside the attribute) so the result is
    # well-formed HTML and cannot inject markup.
    return ('<a href="' + escape(url, {'"': '&quot;'}) + '">'
            + escape(description) + '</a>')
def fixup(x):
    """Clean a page title for use as link text.

    Deletes parentheses, square brackets, newlines, carriage returns, tabs
    and the characters `` ` # % $ * ``, then trims leading and trailing
    spaces.  The deletion runs first so that spaces which only become
    leading or trailing once a newline or tab is removed (as in
    ``"\\n  Title \\n"``) are trimmed too.

    Args:
        x: the raw title string.

    Returns:
        The cleaned string.
    """
    x = re.sub(r'[\(\[\]\)\n\r\t`#%$*]', r'', x)
    return x.strip(' ')
#print 'https://philalegal.org/taxonomy/term/' + str(knn.classify(Document('I want to sue the IRS.')))
class MyTCPHandler(SocketServer.BaseRequestHandler):
    """Handle one request: a single query, or the ___RESET___ command."""

    def handle(self):
        """Read up to 1024 bytes from the client and answer them.

        A message starting with ``___RESET___`` re-runs ``setup()`` and is
        acknowledged with a fixed JSON message.  Anything else is
        lower-cased, evaluated, and answered with a JSON list of links; the
        original query and the response are then stored via ``store_query``.
        """
        self.data = self.request.recv(1024).strip()
        print("{} wrote:".format(self.client_address[0]))
        print(self.data)
        query = self.data
        if query.startswith("___RESET___"):
            setup()
            self.request.sendall('{"message": "Thank you"}')
            return
        response = json.dumps(evaluate_query(query.lower()))
        # Answer first: a failure to store the query must not cost the
        # client a response that has already been computed.
        self.request.sendall(response)
        try:
            store_query(query, response)
        except Exception as the_err:
            sys.stderr.write(str(the_err) + "\n")
if __name__ == "__main__":
    # Train the classifier before accepting connections.
    setup()
    HOST, PORT = "localhost", 6693
    # The flag must be set on the class that is actually instantiated;
    # setting it on ThreadingTCPServer has no effect on a plain TCPServer.
    SocketServer.TCPServer.allow_reuse_address = True
    server = SocketServer.TCPServer((HOST, PORT), MyTCPHandler)
    try:
        server.serve_forever()
    finally:
        # Release the listening socket on shutdown (e.g. Ctrl-C).
        server.server_close()