import gradio as gr
import pandas as pd

from pytube import extract
import re
import string
import pickle

import nltk
import nltk.sentiment.util
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from tensorflow import keras

from youtube_comment_downloader import YoutubeCommentDownloader

nltk.download('punkt')
nltk.download('wordnet')
nltk.download('omw-1.4')
nltk.download('stopwords')


def getID(url):
    print("Getting YouTube ID...")
    return extract.video_id(url)


def clean_text(text):
    lemmatizer = WordNetLemmatizer()
    sw = stopwords.words('english')

    # Normalise the comment: lowercase, strip mentions, bracketed text,
    # URLs, HTML tags, punctuation, newlines, digits and any non-letters.
    text = text.lower()
    text = re.sub('@', '', text)
    text = re.sub(r'\[.*?\]', '', text)
    text = re.sub(r'https?://\S+|www\.\S+', '', text)
    text = re.sub(r'<.*?>+', '', text)
    text = re.sub('[%s]' % re.escape(string.punctuation), '', text)
    text = re.sub(r'\n', '', text)
    text = re.sub(r'\w*\d\w*', '', text)
    text = re.sub(r'[^a-zA-Z ]+', '', text)

    text = nltk.word_tokenize(text)

    # Lemmatize nouns first, then verbs.
    text = [lemmatizer.lemmatize(t) for t in text]
    text = [lemmatizer.lemmatize(t, 'v') for t in text]

    # Append _NEG to tokens that appear in a negated context.
    tokens_neg_marked = nltk.sentiment.util.mark_negation(text)

    # Drop stopwords and non-alphanumeric tokens (ignoring the _NEG suffix).
    text = [t for t in tokens_neg_marked
            if t.replace("_NEG", "").isalnum() and
            t.replace("_NEG", "") not in sw]

    return text


def getSentenceTrain():
    # Load the training sentences used to fit the tokenizer.
    with open('Deep learning/pickles/sentences_train.pickle', 'rb') as f:
        sentences_train = pickle.load(f)
    return sentences_train
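

# vote() below expects three pre-trained classifiers: SGD_train and logreg_train
# (pickled scikit-learn models) plus model (the Keras CNN). A minimal loading
# sketch, assuming artifact names under the same 'Deep learning/' folder used
# above; the exact file names are assumptions and may need adjusting.
with open('Deep learning/pickles/SGD_train.pickle', 'rb') as f:  # assumed path
    SGD_train = pickle.load(f)

with open('Deep learning/pickles/logreg_train.pickle', 'rb') as f:  # assumed path
    logreg_train = pickle.load(f)

model = keras.models.load_model('Deep learning/models/cnn_model.h5')  # assumed path
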
def vote(test_point, _test):
    print("Voting on video effectiveness...\n")
    pos_weighting = []
    result = ''
    confidence = 0
    algos_score = 0

    # Each algorithm's vote is weighted by its test accuracy.
    algorithms = [
        {'name': 'SGD', 'accuracy': 0.74 * 100, 'trained': SGD_train},
        {'name': 'Logistic Regression', 'accuracy': 0.79 * 100, 'trained': logreg_train},
        {'name': 'CNN', 'accuracy': 0.82 * 100, 'trained': model}
    ]

    for algo in algorithms:
        weight = algo['accuracy']
        algos_score += weight
        if algo['name'] == "CNN":
            pred = algo['trained'].predict(_test)
            if pred[0][0] > 0.5:
                pos_weighting.append(weight)
            print("CNN voted for: effective" if pred[0][0] > 0.5 else "CNN voted for: ineffective")
        else:
            pred = algo['trained'].predict(test_point)
            if pred == 'pos':
                pos_weighting.append(weight)
            print(algo['name'] + " voted for: effective" if pred == 'pos' else algo['name'] + " voted for: ineffective")

    # Share of the total weight that voted "effective".
    pos_result = sum(pos_weighting) / algos_score
    if pos_result < 0.5:
        result = 'ineffective'
        confidence = 1 - pos_result
    else:
        result = 'effective'
        confidence = pos_result

    return result, confidence


def quantizeEffectiveness(url):
    videoId = getID(url)

    print("Downloading comments...")
    downloader = YoutubeCommentDownloader()
    comments_downloaded = downloader.get_comments_from_url(f'https://www.youtube.com/watch?v={videoId}')
    comments = [comment for comment in comments_downloaded]
    comments_df = pd.DataFrame(comments)

    print("Cleaning comments...")
    comments_df['text'] = comments_df['text'].apply(lambda x: clean_text(x))

    # Flatten every cleaned comment into a single bag of words for the video.
    all_words = [item for sublist in comments_df['text'].tolist() for item in sublist]

    test = pd.DataFrame([[videoId]], columns=['VideoId'])
    test_documents = [all_words]
    test['cleaned'] = test_documents
    test['cleaned_string'] = [' '.join(map(str, l)) for l in test['cleaned']]

    test_point = test['cleaned_string']
    test_sentence = test['cleaned_string'].values

    sentences_train = getSentenceTrain()

    print("Tokenizing the data...")
    tokenizer = Tokenizer(num_words=5000)
    tokenizer.fit_on_texts(sentences_train)

    _test = pad_sequences(tokenizer.texts_to_sequences(test_sentence), padding='post', maxlen=100)

    result, confidence = vote(test_point, _test)
    return result, confidence


def greet(url):
    # Run the full pipeline and report the verdict through the Gradio UI.
    result, confidence = quantizeEffectiveness(url)
    return f"This video is {result} ({confidence * 100:.1f}% confidence)."


iface = gr.Interface(fn=greet, inputs="text", outputs="text")
iface.launch()