# GithubRepoClassification/tagger.py at master · mbornstein/GithubRepoClassification · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
import random
import platform
import sys
from subprocess import call

import github

# Help text printed before every classification prompt. Each label shown here
# is exactly the tag the prompt accepts (case-insensitively), so the text lists
# all seven categories, including DATA, and uses the bare "HW" label.
explanation = """DEV: a repository primarily used for development of a tool, component, application, app, or
API
HW: a repository primarily used for homework, assignments and other course-related
work and code
EDU: a repository primarily used to host tutorials, lectures, educational information and
code related to teaching
DOCS: a repository primarily used for tracking and storage of non-educational documents
WEB: a repository primarily used to host static personal websites or blogs
DATA: a repository primarily used to store data sets
OTHER: use this category only if there is no strong correlation to any other repository
category, for example, empty repositories
"""

# GitHub API client used to look up repositories by numeric id; it is
# constructed with no arguments, i.e. no credentials are passed here.
githubClient = github.Github()
# CSV file that new labels are appended to and that already-labelled
# repository URLs are read back from.
FILENAME = 'data/testset.csv'
# Exclusive upper bound for the randomly drawn repository ids.
MAX_REPO_ID = 76000000
# Field separator between the repository URL and its tag in FILENAME.
SEPARATOR = ','

# Choose the launcher program used to open a URL, keyed by the value of
# platform.system(). Any system other than macOS ("Darwin") or Linux is
# rejected at import time.
command = {'Darwin': 'open', 'Linux': 'xdg-open'}.get(platform.system())
if command is None:
    raise NotImplementedError("Not implemented for Windows")


def read_visited_repos(filename=None, separator=None):
    """Return the set of repository URLs that already have a label on disk.

    The first field of every non-blank line in the label file is treated as
    the repository URL.

    Args:
        filename: Path of the label file. Defaults to the module-level
            ``FILENAME`` when omitted.
        separator: Field separator used in the file. Defaults to the
            module-level ``SEPARATOR`` when omitted.

    Returns:
        A set of URL strings; empty when the file does not exist yet.
    """
    if filename is None:
        filename = FILENAME
    if separator is None:
        separator = SEPARATOR

    visited_repos = set()
    try:
        with open(filename, 'r') as file:
            for line in file:
                line = line.rstrip('\n')
                if not line.strip():
                    # Ignore blank lines so no empty "URL" is recorded.
                    continue
                visited_repos.add(line.split(separator)[0])
    except FileNotFoundError:
        # First run: nothing has been labelled yet.
        pass
    # Return the collected URLs. The previous ``finally: return set()``
    # discarded them and also swallowed every other exception.
    return visited_repos


def tag_repo(url):
    """Open *url* with the platform launcher and ask the user to classify it.

    The category help text is printed and one line is read from stdin.
    Leading/trailing whitespace and letter case of the answer are ignored.

    Args:
        url: Address of the repository page to open.

    Returns:
        The upper-cased tag when the answer is one of the known categories,
        otherwise ``None``.

    Raises:
        SystemExit: when the user answers ``exit`` (in any letter case).
    """
    tags = {"DEV", "HW", "EDU", "DOCS", "WEB", "DATA", "OTHER"}
    call([command, url])
    print(explanation)
    # Normalise once so that answers such as "dev " or " Hw" are accepted
    # instead of being silently treated as "no tag".
    answer = input("Classify:").strip().upper()
    if answer == 'EXIT':
        sys.exit()
    return answer if answer in tags else None

if __name__ == '__main__':
    # Interactive labelling loop: draw a random repository id, open the
    # repository for the user, and append "<url><SEPARATOR><tag>" to FILENAME.
    # The loop only ends when tag_repo() exits the program or an uncaught
    # exception propagates.
    visited_repos = read_visited_repos()

    with open(FILENAME, 'a') as file:
        while True:
            try:
                repo = githubClient.get_repo(random.randrange(MAX_REPO_ID))
                url = repo.html_url
            except github.GithubException:
                # Any GitHub API error is treated as "draw another id".
                # NOTE(review): if rate-limit errors are also raised as
                # GithubException, this retries immediately without any
                # back-off; confirm against the client library.
                continue
            if url in visited_repos:
                continue
            tag = tag_repo(url)
            if tag is not None:
                file.write(url + SEPARATOR + tag + '\n')
                # Record the URL itself (the original added the builtin `id`),
                # so the same repository is not offered again in this session.
                visited_repos.add(url)