Create app.py
app.py
ADDED
import gradio as gr
import pandas as pd
from pytube import extract
import re
import string
import pickle
import nltk
import nltk.sentiment.util
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from tensorflow import keras

from youtube_comment_downloader import *

# get YouTube ID
def getID(url):
    print("Getting YouTube ID...")
    return extract.video_id(url)
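# Illustrative usage (the URL is just an example): getID("https://www.youtube.com/watch?v=dQw4w9WgXcQ")
# returns the 11-character video ID "dQw4w9WgXcQ" via pytube's URL parsing.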

# function to clean comments
def clean_text(text):
    lemmatizer = WordNetLemmatizer()
    # stopwords
    sw = ["i","me","my","myself","we","our","ours","ourselves","you","you're","you've","you'll","you'd","your","yours","yourself","yourselves","he","him","his","himself","she","she's","her","hers","herself","it","it's","its","itself","they","them","their","theirs","themselves","what","which","who","whom","this","that","that'll","these","those","am","is","are","was","were","be","been","being","have","has","had","having","do","does","did","doing","a","an","the","and","but","if","or","because","as","until","while","of","at","by","for","with","about","against","between","into","through","during","before","after","above","below","to","from","up","down","in","out","on","off","over","under","again","further","then","once","here","there","when","where","why","how","all","any","both","each","few","more","most","other","some","such","no","nor","not","only","own","same","so","than","too","very","s","t","can","will","just","don","don't","should","should've","now","d","ll","m","o","re","ve","y","ain","aren","aren't","couldn","couldn't","didn","didn't","doesn","doesn't","hadn","hadn't","hasn","hasn't","haven","haven't","isn","isn't","ma","mightn","mightn't","mustn","mustn't","needn","needn't","shan","shan't","shouldn","shouldn't","wasn","wasn't","weren","weren't","won","won't","wouldn","wouldn't"]

    # remove symbols, URLs, HTML and emojis
    text = text.lower()
    text = re.sub('@', '', text)
    text = re.sub(r'\[.*?\]', '', text)
    text = re.sub(r'https?://\S+|www\.\S+', '', text)
    text = re.sub(r'<.*?>+', '', text)
    text = re.sub('[%s]' % re.escape(string.punctuation), '', text)
    text = re.sub('\n', '', text)
    text = re.sub(r'\w*\d\w*', '', text)
    text = re.sub(r"[^a-zA-Z ]+", "", text)

    # tokenize the data
    text = nltk.word_tokenize(text)

    # lemmatize (nouns first, then verbs)
    text = [lemmatizer.lemmatize(t) for t in text]
    text = [lemmatizer.lemmatize(t, 'v') for t in text]

    # mark negation
    tokens_neg_marked = nltk.sentiment.util.mark_negation(text)

    # remove stopwords
    text = [t for t in tokens_neg_marked
            if t.replace("_NEG", "").isalnum() and
            t.replace("_NEG", "") not in sw]

    return text

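# Note (not part of the original commit): nltk.word_tokenize and WordNetLemmatizer
# rely on NLTK corpora that are not bundled with the package. Assuming the runtime
# does not already ship them, a one-time download keeps clean_text() from raising
# LookupError; exact resource names can vary with the NLTK version.
nltk.download('punkt')    # tokenizer models used by nltk.word_tokenize
nltk.download('wordnet')  # lexicon used by WordNetLemmatizer
nltk.download('omw-1.4')  # multilingual wordnet data some NLTK versions expect
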
def getSentenceTrain():
    # open sentences_train file
    with open('Deep learning/pickles/sentences_train.pickle', 'rb') as sentences_train_f:
        sentences_train = pickle.load(sentences_train_f)
    return sentences_train

with open('Shallow machine learning/pickles/SGD_74.pickle', 'rb') as SGD_74_f:
    SGD_train = pickle.load(SGD_74_f)

with open('Shallow machine learning/pickles/logreg_79.pickle', 'rb') as logreg_79_f:
    logreg_train = pickle.load(logreg_79_f)

# get saved CNN model
model = keras.models.load_model("Deep learning/CNN_82")

def vote(test_point, _test):
    print("Voting on video effectiveness...\n")
    pos_weighting = []
    result = ''
    confidence = 0
    algos_score = 0

    algorithms = [
        {'name': 'SGD', 'accuracy': 0.74*100, 'trained': SGD_train},
        {'name': 'Logistic Regression', 'accuracy': 0.79*100, 'trained': logreg_train},
        {'name': 'CNN', 'accuracy': 0.82*100, 'trained': model}
    ]

    for algo in algorithms:
        weight = algo['accuracy']
        algos_score += weight
        if algo['name'] == "CNN":
            pred = algo['trained'].predict(_test)
            if pred[0][0] > 0.5:
                pos_weighting.append(weight)
            print("CNN voted for: effective" if pred[0][0] > 0.5 else "CNN voted for: ineffective")
        else:
            pred = algo['trained'].predict(test_point)
            if pred[0] == 'pos':
                pos_weighting.append(weight)
            print(algo['name'] + " voted for: effective" if pred[0] == 'pos' else algo['name'] + " voted for: ineffective")

    pos_result = sum(pos_weighting)/algos_score
    if pos_result < 0.5:
        result = 'ineffective'
        confidence = 1 - pos_result
    else:
        result = 'effective'
        confidence = pos_result

    return result, confidence

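# Illustration of the weighting (hypothetical outcome, not taken from the original code):
# if SGD and the CNN vote "effective" while Logistic Regression votes "ineffective", then
#   pos_result = (74 + 82) / (74 + 79 + 82) ≈ 0.66
# so the video is reported as effective with roughly 66% confidence.
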
def quantizeEffectiveness(url):
    # 1. Get YouTube ID
    videoId = getID(url)

    # 2. Download comments
    print("Downloading comments...")
    downloader = YoutubeCommentDownloader()
    comments_downloaded = downloader.get_comments_from_url(f'https://www.youtube.com/watch?v={videoId}')
    comments = [comment for comment in comments_downloaded]
    comments_df = pd.DataFrame(comments)

    # 3. Clean comments
    print("Cleaning comments...")
    comments_df['text'] = comments_df['text'].apply(lambda x: clean_text(x))

    # get all words of the video into one list
    all_words = [item for sublist in comments_df['text'].tolist() for item in sublist]

    # 4. Create test dataframe
    test = pd.DataFrame([[videoId]], columns=['VideoId'])

    # 5. Get documents (pre-processed comments)
    test_documents = []
    test_documents.append(all_words)
    test['cleaned'] = test_documents
    test['cleaned_string'] = [' '.join(map(str, l)) for l in test['cleaned']]

    # 6. Get ML test point
    test_point = test['cleaned_string']
    test_sentence = test['cleaned_string'].values

    # 7. Get trained sentences
    sentences_train = getSentenceTrain()

    # 8. Tokenize the data
    print("Tokenizing the data...")
    tokenizer = Tokenizer(num_words=5000)
    tokenizer.fit_on_texts(sentences_train)

    # 9. Get DL test point
    _test = pad_sequences(tokenizer.texts_to_sequences(test_sentence), padding='post', maxlen=100)

    # 10. Vote on video effectiveness
    result, confidence = vote(test_point, _test)

    return result, confidence, videoId

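# Note (sketch, not in the original commit): the Tokenizer above is re-fit on the
# training sentences for every request. Assuming the fitted tokenizer could instead
# be pickled once alongside the other pickles (hypothetical file name below), step 8
# could be replaced with:
#
#     with open('Deep learning/pickles/tokenizer.pickle', 'rb') as tokenizer_f:
#         tokenizer = pickle.load(tokenizer_f)
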
def greet(url):
    result, confidence, videoId = quantizeEffectiveness(url)

    return f"The video (ID: {videoId}) is {result} with a confidence of {round(confidence*100, 2)}%"

iface = gr.Interface(fn=greet, inputs="text", outputs="text")
iface.launch()