-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathapp.py
More file actions
246 lines (210 loc) · 10.2 KB
/
app.py
File metadata and controls
246 lines (210 loc) · 10.2 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
import os
import re
import gensim
import joblib
import numpy as np
from flask_cors import CORS
from langdetect import detect
from dotenv import load_dotenv
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from supabase import create_client, Client
from flask import Flask,jsonify, request,Response
from concurrent.futures import ThreadPoolExecutor, TimeoutError
from prometheus_client import Counter, Gauge, generate_latest, Summary
import warnings
warnings.filterwarnings("ignore")
# Download the NLTK stopword corpus at startup so filter_stopwords() can use it.
nltk.download('stopwords')
# Metrics initialization
# Prometheus metrics exported through the /metrics endpoint below.
accuracy_metric = Gauge('accuracy', 'Accuracy of predictions')
avg_confidence_metric = Gauge('avg_prediction_confidence', 'Average prediction confidence')
predictions_per_category = Counter('predictions_per_category', 'Number of predictions per category', ['label'])
correct_predictions = Counter('correct_predictions_per_category', 'Number of correct predictions per category', ['label'])
incorrect_predictions = Counter('incorrect_predictions_per_category', 'Number of incorrect predictions per category', ['label'])
request_latency = Summary('request_latency_seconds', 'Request latency in seconds')
# Track accuracy and confidence globally
total_predictions = Counter('total_predictions', 'Total number of predictions')
total_correct = Counter('total_correct', 'Total correct predictions')
# Running sum of per-request max probabilities; mutated by predict() to
# compute avg_confidence_metric. Resets to 0 on process restart.
confidence_sum = 0 # Total sum of confidence scores for average calculation
# Read SUPABASE_URL / SUPABASE_KEY from a .env file into the environment.
load_dotenv()
url= os.environ['SUPABASE_URL']
key= os.environ['SUPABASE_KEY']
supabase: Client = create_client(url, key)
app = Flask(__name__)
# Allow cross-origin requests (the frontend is served from another origin).
CORS(app)
# Label index order must match the classifier's training label encoding.
LABELS=['bug','enhancement','question']
@app.route('/metrics')
def metrics():
    '''
    Expose the Prometheus metrics in the text exposition format.
    '''
    payload = generate_latest()
    return Response(payload, mimetype='text/plain')
def insert_into_db(issue_title,issue_body,predicted_label):
    '''
    Stores a new issue (title, body and predicted label index) in the database.

    Returns:
        int: the id assigned to the inserted issue.
    '''
    # Derive the next id from the current row count.
    # NOTE(review): count+1 is race-prone under concurrent inserts and after
    # deletions - consider a DB-side identity column; confirm with the schema.
    count_response = (
        supabase.table("issues")
        .select("*", count="exact")
        .execute()
    )
    new_id = count_response.count + 1
    # Persist the new row.
    record = {
        "id": new_id,
        "issue_title": issue_title,
        "issue_body": issue_body,
        "predicted_label": predicted_label,
    }
    supabase.table("issues").insert(record).execute()
    return new_id
def update_issue_in_db(issue_id,corrected_label):
    '''
    Records the corrected label for an existing issue, looked up by id.

    Returns:
        dict: {"success": ...} describing the change, or {"error": ...}
        when no issue with the given id exists.
    '''
    # Look the issue up first so we can report both the old and new label.
    lookup = (
        supabase.table("issues")
        .select("*", count="exact")
        .eq("id", issue_id)
        .execute()
    )
    # Guard clause: unknown id -> error response, nothing written.
    if lookup.count != 1:
        return {'error':f'Issue with id {issue_id} does not exist'}
    (
        supabase.table("issues")
        .update({"corrected_label": corrected_label})
        .eq("id", issue_id)
        .execute()
    )
    previous = LABELS[lookup.data[0]['predicted_label']]
    return {"success":f"Label for issue id {issue_id} was changed from {previous} to {LABELS[corrected_label]}"}
def avg_word2vec(doc):
    '''
    Averages the Word2Vec vectors of every in-vocabulary word in a
    tokenized document into a single vector.

    Parameters:
        doc: list of str tokens.

    Returns:
        np.ndarray: mean vector of shape (vector_size,), or a zero vector
        of the same size when no word is in the model's vocabulary.
    '''
    # Perf fix: cache the model on the function object instead of
    # deserializing it from disk on every call.
    w2v_model = getattr(avg_word2vec, "_model", None)
    if w2v_model is None:
        w2v_model = gensim.models.Word2Vec.load("./models/word2vec.model")
        avg_word2vec._model = w2v_model
    # Perf fix: `word in w2v_model.wv` is an O(1) dict lookup, whereas the
    # original `word in w2v_model.wv.index_to_key` scanned a list of the
    # whole vocabulary for every token.
    valid_words = [w2v_model.wv[word] for word in doc if word in w2v_model.wv]
    if not valid_words:
        # Robustness fix: use the model's own dimensionality instead of a
        # hard-coded 50, so a retrained model with a different size still works.
        return np.zeros(w2v_model.wv.vector_size)
    return np.mean(valid_words, axis=0)
def filter_stopwords(text):
    '''
    Cleans and tokenizes raw text, removing English stopwords.

    Parameters:
        text: str, raw issue title or body.

    Returns:
        list of str: lowercase tokens with stopwords removed (possibly empty).
    '''
    # Noise removal: replace anything that is not a letter, '?' or '!' with a space
    preprocessed_text = re.sub('[^a-zA-Z?!]', ' ', text)
    # Normalization + tokenization: lowercase, then split on whitespace
    tokens = preprocessed_text.lower().split()
    # Perf fix: build the stopword set once per call. The original called
    # stopwords.words('english') for EVERY token, and each call builds a
    # fresh list, making the filter O(tokens * stopwords).
    stop_words = set(stopwords.words('english'))
    return [word for word in tokens if word not in stop_words]
def make_prediction (issue_title,issue_body):
    '''
    Pre-processes the title and body and predicts label probabilities
    with the trained random-forest classifier.

    Parameters:
        issue_title: str, raw issue title.
        issue_body: str, raw issue body.

    Returns:
        list of float: predicted probability per label, in LABELS order.
    '''
    # Stopword filtering and normalization
    preprocessed_title = filter_stopwords(issue_title)
    preprocessed_body = filter_stopwords(issue_body)
    # Lemmatization
    lemmatizer = WordNetLemmatizer()
    preprocessed_title = [lemmatizer.lemmatize(word) for word in preprocessed_title]
    preprocessed_body = [lemmatizer.lemmatize(word) for word in preprocessed_body]
    # Separate averaged word2vec vectors for title and body, concatenated
    # into a single feature vector of shape (1, 2 * vector_size).
    title_vectors = np.array([avg_word2vec(preprocessed_title)])
    body_vectors = np.array([avg_word2vec(preprocessed_body)])
    feature_vec = np.hstack((title_vectors, body_vectors))
    # Bug fix: the original called feature_vec.reshape(1, -1) and discarded
    # the result - ndarray.reshape returns a new array, it is not in-place.
    feature_vec = feature_vec.reshape(1, -1)
    # Perf fix: cache the classifier on the function object instead of
    # deserializing it from disk on every request.
    classifier = getattr(make_prediction, "_classifier", None)
    if classifier is None:
        classifier = joblib.load("./models/rf_classifier.joblib")
        make_prediction._classifier = classifier
    return classifier.predict_proba(feature_vec).tolist()[0]
@app.route('/api/predict',methods=["POST"])
@request_latency.time()
def predict():
    '''
    API endpoint to predict a label (bug, enhancement, question) from an
    issue title and body.

    Form fields:
        title: issue title (English, at least 10 chars; truncated at 200).
        body: issue body (English, at least 20 chars; truncated at 5000).

    Returns JSON with either an 'error' message, or the issue's database
    id, predicted label, rounded per-class probabilities and an optional
    low-confidence warning.
    '''
    global confidence_sum
    if request.method == 'POST':
        issue_title=request.form.get('title')
        issue_body=request.form.get('body')
        # Presence checks
        if issue_title=="":
            return jsonify({'error':'Please enter the issue title'})
        if issue_body=="":
            return jsonify({'error':'Please enter the issue body'})
        # Language checks: the model was trained on English text only
        if detect(issue_title)!='en':
            return jsonify({'error':'Please enter the issue title in English'})
        if detect(issue_body)!='en':
            return jsonify({'error':'Please enter the issue body in English'})
        # Reject inputs that contain nothing but stopwords (no usable features)
        if filter_stopwords(issue_title)==[]:
            return jsonify({'error':'Issue title only contains stopwords'})
        if filter_stopwords(issue_body)==[]:
            return jsonify({'error':'Issue body only contains stopwords'})
        # Minimum lengths
        if len(issue_title)<10:
            return jsonify({'error':'Please enter a longer issue title'})
        if len(issue_body)<20:
            return jsonify({'error':'Please enter a longer issue body'})
        # Truncate over-long inputs instead of rejecting them
        if len(issue_title)>200:
            issue_title=issue_title[:200]
        if len(issue_body)>5000:
            issue_body=issue_body[:5000]
        # Abort the request if the prediction exceeds this many seconds
        PREDICTION_TIMEOUT = 20
        # Below this max probability we attach a warning to the response
        CONFIDENCE_THRESHOLD=0.7
        # Run the prediction in a worker thread so a timeout can be enforced.
        executor = ThreadPoolExecutor(max_workers=2)
        try:
            future = executor.submit(make_prediction, issue_title, issue_body)
            try:
                preds = future.result(timeout=PREDICTION_TIMEOUT)
            except TimeoutError:
                # Handle timeout by returning an error message
                return jsonify({"error": "Prediction timed out. Please try again later."})
        finally:
            # Bug fix: the original never shut the executor down, leaking a
            # thread pool (and its worker threads) on every request.
            # wait=False keeps the timeout meaningful: a stuck prediction
            # thread is abandoned rather than joined.
            executor.shutdown(wait=False)
        max_prob=max(preds)
        max_prob_label=preds.index(max_prob)
        probs=[round(pred,2) for pred in preds]
        # Update metrics
        # NOTE(review): Counter._value is private prometheus_client API -
        # confirm it still exists before upgrading the library.
        confidence_sum += max(probs)
        total_predictions.inc()
        avg_confidence_metric.set(confidence_sum / total_predictions._value.get())
        predictions_per_category.labels(LABELS[max_prob_label]).inc()
        issue_id=insert_into_db(issue_title,issue_body,max_prob_label)
        issue_label=LABELS[max_prob_label]
        warn=''
        if max_prob<CONFIDENCE_THRESHOLD:
            # Plain string literal (the original used an f-string with no placeholders)
            warn='Warning: The prediction confidence is low. Please verify the predicted label.'
        issue = {'id':issue_id,'label': issue_label,'probs':probs,'warning':warn}
        return jsonify(issue)
@app.route('/api/correct', methods=["POST"])
@request_latency.time()
def correct():
    '''
    API endpoint to correct a predicted label.

    Form fields:
        issue_id: numeric id previously returned by /api/predict.
        corrected_label: one of 'bug', 'enhancement', 'question'.

    Returns JSON with either a 'success' or an 'error' message, and
    updates the accuracy metrics accordingly.
    '''
    if request.method == 'POST':
        issue_id=str(request.form.get('issue_id'))
        corrected_label=str(request.form.get('corrected_label')).lower()
        # Input validation
        if issue_id=="":
            return jsonify({'error':'Please enter an issue id'})
        if not issue_id.isnumeric():
            return jsonify({'error':'Please enter a valid numeric issue id'})
        if corrected_label=="" or corrected_label not in LABELS:
            return jsonify({'error':'Please enter a valid corrected label (bug,enhancement,question)'})
        result=update_issue_in_db(int(issue_id),LABELS.index(corrected_label))
        if "success" in result:
            # NOTE(review): recovering the old label by splitting the success
            # message is fragile - it breaks if the wording ever changes.
            # Message shape: "Label for issue id {id} was changed from {old} to {new}"
            predicted_label = result["success"].split(' ')[8]
            if corrected_label == predicted_label:
                total_correct.inc()
                correct_predictions.labels(corrected_label).inc()
            else:
                incorrect_predictions.labels(predicted_label).inc()
            # Bug fix: guard against ZeroDivisionError when a correction
            # arrives before any prediction has been counted (counters reset
            # to zero on process restart, but the database persists issues).
            total = total_predictions._value.get()
            if total:
                accuracy_metric.set(total_correct._value.get() / total)
        return jsonify(result)
if __name__ == '__main__':
    # NOTE(review): debug=True enables the Werkzeug interactive debugger
    # (remote code execution risk) and must not ship to production; binding
    # 0.0.0.0 exposes the server on all interfaces - confirm deployment
    # runs behind a proper WSGI server instead.
    app.run(host='0.0.0.0',debug=True)