This repository was archived by the owner on Feb 26, 2019. It is now read-only.
-
Notifications
You must be signed in to change notification settings - Fork 1
Expand file tree
/
Copy pathtagger.py
More file actions
73 lines (63 loc) · 2.15 KB
/
tagger.py
File metadata and controls
73 lines (63 loc) · 2.15 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
import random
import platform
import sys
from subprocess import call
import github
# Help text printed before every prompt. Each entry starts with the exact
# label that tag_repo accepts, so what the user reads is what they can type.
explanation = """DEV: a repository primarily used for development of a tool, component, application, app, or
API
HW: a repository primarily used for homework, assignments and other course-related
work and code
EDU: a repository primarily used to host tutorials, lectures, educational information and
code related to teaching
DOCS: a repository primarily used for tracking and storage of non-educational documents
WEB: a repository primarily used to host static personal websites or blogs
DATA: a repository primarily used to store data sets
OTHER: use this category only if there is no strong correlation to any other repository
category, for example, empty repositories
"""

# Client created without credentials.
githubClient = github.Github()

# CSV file the labels are appended to: one "<url><SEPARATOR><tag>" row per repo.
FILENAME = 'data/testset.csv'
# Exclusive upper bound for the randomly drawn repository ids.
MAX_REPO_ID = 76000000
SEPARATOR = ','

# Pick the command-line program used to open a URL on this platform.
_system = platform.system()
if _system == 'Darwin':
    command = 'open'
elif _system == 'Linux':
    command = 'xdg-open'
else:
    # Name the platform actually detected instead of always blaming Windows.
    raise NotImplementedError(
        "Not implemented for {}".format(_system or "this platform"))
def read_visited_repos(filename=None, separator=None):
    """Return the set of repository URLs already recorded in the label file.

    Each line of the file is expected to look like ``<url><separator><tag>``;
    only the first field is kept. Blank lines are ignored.

    Args:
        filename: Path of the file to read. Defaults to the module-level
            ``FILENAME``.
        separator: Field separator. Defaults to the module-level
            ``SEPARATOR``.

    Returns:
        A set with the first field of every non-blank line, or an empty set
        when the file does not exist yet.
    """
    if filename is None:
        filename = FILENAME
    if separator is None:
        separator = SEPARATOR

    visited_repos = set()
    try:
        with open(filename, 'r') as file:
            for line in file:
                url = line.split(separator)[0].strip()
                if url:
                    visited_repos.add(url)
    except FileNotFoundError:
        # Nothing has been labelled yet; start with an empty set.
        pass
    # Return the collected URLs. (A `return` inside `finally` would discard
    # them and also swallow any unexpected exception.)
    return visited_repos
def tag_repo(url):
    """Show a repository to the user and ask for its category.

    Runs the platform opener ``command`` on ``url``, prints the category
    help text and reads one line from standard input. The answer is
    compared case-insensitively.

    Args:
        url: Address passed to the opener command.

    Returns:
        The upper-cased category when the answer is one of the known tags,
        otherwise ``None``.

    Raises:
        SystemExit: When the user answers ``exit`` (any letter case).
    """
    call([command, url])
    print(explanation)

    answer = input("Classify:").upper()
    if answer == 'EXIT':
        sys.exit()

    valid_tags = ("DEV", "HW", "EDU", "DOCS", "WEB", "DATA", "OTHER")
    return answer if answer in valid_tags else None
if __name__ == '__main__':
    # Interactive labelling loop: draw a random repository id, show the
    # repository to the user and append "<url><SEPARATOR><tag>" to FILENAME.
    # The loop only ends when tag_repo exits the process.
    visited_repos = read_visited_repos()
    with open(FILENAME, 'a') as file:
        while True:
            try:
                repo = githubClient.get_repo(random.randrange(MAX_REPO_ID))
                url = repo.html_url
            except github.GithubException:
                # The id could not be fetched; try a different id.
                # NOTE(review): any other GithubException (presumably including
                # rate limiting) also lands here and retries immediately.
                continue
            if url in visited_repos:
                # Already labelled; draw again.
                continue
            tag = tag_repo(url)
            if tag is not None:
                file.write(url + SEPARATOR + tag + '\n')
                # Flush so the label is on disk even if the session is killed.
                file.flush()
                # Track the URL (not the builtin `id`) so the membership
                # check above can actually match it later in this session.
                visited_repos.add(url)