jaredjoss committed
Commit
6c23f4f
1 Parent(s): d4c375b

Create app.py

Files changed (1)
  1. app.py +163 -0
app.py ADDED
@@ -0,0 +1,163 @@
+ import gradio as gr
+ import pandas as pd
+ from pytube import extract
+ import re
+ import string
+ import pickle
+ import nltk
+ import nltk.sentiment.util
+ from nltk.corpus import stopwords
+ from nltk.stem import WordNetLemmatizer
+
+ from keras.preprocessing.text import Tokenizer
+ from keras.preprocessing.sequence import pad_sequences
+ from tensorflow import keras
+
+ from youtube_comment_downloader import YoutubeCommentDownloader
+
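+ # nltk.word_tokenize and WordNetLemmatizer below need the 'punkt' and 'wordnet'
+ # corpora at runtime; fetching them here is a precaution, assuming the runtime
+ # image does not already ship them.
+ nltk.download('punkt', quiet=True)
+ nltk.download('wordnet', quiet=True)
+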
+ # get YouTube ID
+ def getID(url):
+     print("Getting YouTube ID...")
+     return extract.video_id(url)
+
+ # function to clean comments
+ def clean_text(text):
+     lemmatizer = WordNetLemmatizer()
+     # stopwords
+     sw = ["i","me","my","myself","we","our","ours","ourselves","you","you're","you've","you'll","you'd","your","yours","yourself","yourselves","he","him","his","himself","she","she's","her","hers","herself","it","it's","its","itself","they","them","their","theirs","themselves","what","which","who","whom","this","that","that'll","these","those","am","is","are","was","were","be","been","being","have","has","had","having","do","does","did","doing","a","an","the","and","but","if","or","because","as","until","while","of","at","by","for","with","about","against","between","into","through","during","before","after","above","below","to","from","up","down","in","out","on","off","over","under","again","further","then","once","here","there","when","where","why","how","all","any","both","each","few","more","most","other","some","such","no","nor","not","only","own","same","so","than","too","very","s","t","can","will","just","don","don't","should","should've","now","d","ll","m","o","re","ve","y","ain","aren","aren't","couldn","couldn't","didn","didn't","doesn","doesn't","hadn","hadn't","hasn","hasn't","haven","haven't","isn","isn't","ma","mightn","mightn't","mustn","mustn't","needn","needn't","shan","shan't","shouldn","shouldn't","wasn","wasn't","weren","weren't","won","won't","wouldn","wouldn't"]
+     # remove symbols, links and emojis
+     text = text.lower()
+     text = re.sub('@', '', text)
+     text = re.sub(r'\[.*?\]', '', text)
+     text = re.sub(r'https?://\S+|www\.\S+', '', text)
+     text = re.sub('<.*?>+', '', text)
+     text = re.sub('[%s]' % re.escape(string.punctuation), '', text)
+     text = re.sub('\n', '', text)
+     text = re.sub(r'\w*\d\w*', '', text)
+     text = re.sub(r"[^a-zA-Z ]+", "", text)
+
+     # tokenize the data
+     text = nltk.word_tokenize(text)
+
+     # lemmatize (nouns, then verbs)
+     text = [lemmatizer.lemmatize(t) for t in text]
+     text = [lemmatizer.lemmatize(t, 'v') for t in text]
+
+     # mark negation
+     tokens_neg_marked = nltk.sentiment.util.mark_negation(text)
+
+     # remove stopwords
+     text = [t for t in tokens_neg_marked
+             if t.replace("_NEG", "").isalnum() and
+             t.replace("_NEG", "") not in sw]
+
+     return text
+
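+ # Illustrative (hypothetical) example: a comment such as "I don't like this video!!"
+ # should come out of clean_text roughly as ['dont', 'like_NEG', 'video_NEG'] --
+ # words inside a negation scope get the _NEG suffix and stopwords are dropped.
+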
+ def getSentenceTrain():
+     # open sentences_train file
+     sentences_train_f = open('Deep learning/pickles/sentences_train.pickle', "rb")
+     sentences_train = pickle.load(sentences_train_f)
+     sentences_train_f.close()
+     return sentences_train
+
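+ # Load the pre-trained classifiers; the app assumes these pickles and the saved
+ # CNN directory are present in the repository alongside app.py:
+ #   Shallow machine learning/pickles/SGD_74.pickle
+ #   Shallow machine learning/pickles/logreg_79.pickle
+ #   Deep learning/CNN_82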
+ SGD_74_f = open('Shallow machine learning/pickles/SGD_74.pickle', "rb")
+ SGD_train = pickle.load(SGD_74_f)
+ SGD_74_f.close()
+
+ logreg_79_f = open('Shallow machine learning/pickles/logreg_79.pickle', "rb")
+ logreg_train = pickle.load(logreg_79_f)
+ logreg_79_f.close()
+
+ # get saved CNN model
+ model = keras.models.load_model("Deep learning/CNN_82")
+
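+ # vote() combines the three classifiers into an accuracy-weighted vote: each model
+ # that predicts "effective" contributes its accuracy as a weight. With
+ # p = sum(weights voting effective) / sum(all weights), the result is "effective"
+ # when p >= 0.5 and the reported confidence is max(p, 1 - p).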
+ def vote(test_point, _test):
+     print("Voting on video effectiveness...\n")
+     pos_weighting = []
+     result = ''
+     confidence = 0
+     algos_score = 0
+
+     algorithms = [
+         {'name': 'SGD', 'accuracy': 0.74*100, 'trained': SGD_train},
+         {'name': 'Logistic Regression', 'accuracy': 0.79*100, 'trained': logreg_train},
+         {'name': 'CNN', 'accuracy': 0.82*100, 'trained': model}
+     ]
+
+     for algo in algorithms:
+         weight = algo['accuracy']
+         algos_score += weight
+         if algo['name'] == "CNN":
+             pred = algo['trained'].predict(_test)
+             if pred[0][0] > 0.5:
+                 pos_weighting.append(weight)
+             print("CNN voted for: effective" if pred[0][0] > 0.5 else "CNN voted for: ineffective")
+         else:
+             pred = algo['trained'].predict(test_point)
+             if pred == 'pos':
+                 pos_weighting.append(weight)
+             print(algo['name'] + " voted for: effective" if pred == 'pos' else algo['name'] + " voted for: ineffective")
+
+     pos_result = sum(pos_weighting)/algos_score
+     if pos_result < 0.5:
+         result = 'ineffective'
+         confidence = 1 - pos_result
+     else:
+         result = 'effective'
+         confidence = pos_result
+
+     return result, confidence
+
+ def quantizeEffectiveness(url):
+     # 1. Get YouTube ID
+     print("Getting YouTube ID...")
+     videoId = getID(url)
+
+     # 2. Download comments
+     print("Downloading comments...")
+     downloader = YoutubeCommentDownloader()
+     comments_downloaded = downloader.get_comments_from_url(f'https://www.youtube.com/watch?v={videoId}')
+     comments = [comment for comment in comments_downloaded]
+     comments_df = pd.DataFrame(comments)
+
+     # 3. Clean comments
+     print("Cleaning comments...")
+     comments_df['text'] = comments_df['text'].apply(lambda x: clean_text(x))
+
+     # get all words of the video into one list
+     all_words = [item for sublist in comments_df['text'].tolist() for item in sublist]
+
+     # 4. Create test dataframe
+     test = pd.DataFrame([[videoId]], columns=['VideoId'])
+
+     # 5. Get documents (pre-processed comments)
+     test_documents = []
+     test_documents.append(all_words)
+     test['cleaned'] = test_documents
+     test['cleaned_string'] = [' '.join(map(str, l)) for l in test['cleaned']]
+
+     # 6. Get ML test point
+     test_point = test['cleaned_string']
+     test_sentence = test['cleaned_string'].values
+
+     # 7. Get trained sentences
+     sentences_train = getSentenceTrain()
+
+     # 8. Tokenize the data
+     print("Tokenizing the data...")
+     tokenizer = Tokenizer(num_words=5000)
+     tokenizer.fit_on_texts(sentences_train)
+
+     # 9. Get DL test point
+     _test = pad_sequences(tokenizer.texts_to_sequences(test_sentence), padding='post', maxlen=100)
+
+     # 10. Vote on video effectiveness
+     result, confidence = vote(test_point, _test)
+
+     return result, confidence, videoId
+
+ def greet(url):
+     result, confidence, videoId = quantizeEffectiveness(url)
+
+     return f"The video (ID: {videoId}) is {result} with a confidence of {round(confidence*100, 2)}%"
+
+ iface = gr.Interface(fn=greet, inputs="text", outputs="text")
+ iface.launch()