diff --git "a/Shallow machine learning/.ipynb_checkpoints/Binary classification-checkpoint.ipynb" "b/Shallow machine learning/.ipynb_checkpoints/Binary classification-checkpoint.ipynb" new file mode 100644--- /dev/null +++ "b/Shallow machine learning/.ipynb_checkpoints/Binary classification-checkpoint.ipynb" @@ -0,0 +1,2502 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": 1, + "metadata": {}, + "outputs": [], + "source": [ + "import pandas as pd\n", + "import numpy as np\n", + "import matplotlib.pyplot as plt\n", + "import seaborn as sns\n", + "import regex as re\n", + "\n", + "from sklearn.feature_extraction.text import TfidfVectorizer, TfidfTransformer, CountVectorizer\n", + "from sklearn.pipeline import Pipeline\n", + "from sklearn.model_selection import train_test_split\n", + "from sklearn.metrics import recall_score, accuracy_score, precision_score, f1_score, confusion_matrix, classification_report\n", + "\n", + "from sklearn.ensemble import AdaBoostClassifier\n", + "from xgboost import XGBClassifier\n", + "\n", + "from sklearn.linear_model import LogisticRegression, SGDClassifier\n", + "from sklearn.svm import SVC, LinearSVC, NuSVC\n", + "from sklearn.ensemble import RandomForestClassifier\n", + "from sklearn.naive_bayes import MultinomialNB, BernoulliNB\n", + "\n", + "import pickle\n", + "%matplotlib inline" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + " | VideoID | \n", + "Effectiveness | \n", + "
---|---|---|
0 | \n", + "pvuN_WvF1to | \n", + "1.0 | \n", + "
1 | \n", + "eRLJscAlk1M | \n", + "5.0 | \n", + "
2 | \n", + "VbiRNT_gWUQ | \n", + "2.0 | \n", + "
3 | \n", + "5dVcn8NjbwY | \n", + "NaN | \n", + "
4 | \n", + "5scez5dqtAc | \n", + "4.0 | \n", + "
... | \n", + "... | \n", + "... | \n", + "
186 | \n", + "TZ0j6kr4ZJ0 | \n", + "3.0 | \n", + "
187 | \n", + "8DiWzvE52ZY | \n", + "1.0 | \n", + "
188 | \n", + "OwqIy8Ikv-c | \n", + "2.0 | \n", + "
189 | \n", + "lPgZfhnCAdI | \n", + "1.0 | \n", + "
190 | \n", + "dSu5sXmsur4 | \n", + "3.0 | \n", + "
191 rows × 2 columns
\n", + "\n", + " | VideoID | \n", + "Effectiveness | \n", + "
---|---|---|
0 | \n", + "pvuN_WvF1to | \n", + "1.0 | \n", + "
1 | \n", + "eRLJscAlk1M | \n", + "5.0 | \n", + "
2 | \n", + "VbiRNT_gWUQ | \n", + "2.0 | \n", + "
3 | \n", + "5scez5dqtAc | \n", + "4.0 | \n", + "
4 | \n", + "JDcro7dPqpA | \n", + "2.0 | \n", + "
... | \n", + "... | \n", + "... | \n", + "
164 | \n", + "TZ0j6kr4ZJ0 | \n", + "3.0 | \n", + "
165 | \n", + "8DiWzvE52ZY | \n", + "1.0 | \n", + "
166 | \n", + "OwqIy8Ikv-c | \n", + "2.0 | \n", + "
167 | \n", + "lPgZfhnCAdI | \n", + "1.0 | \n", + "
168 | \n", + "dSu5sXmsur4 | \n", + "3.0 | \n", + "
169 rows × 2 columns
\n", + "\n", + " | VideoID | \n", + "Effectiveness | \n", + "
---|---|---|
0 | \n", + "pvuN_WvF1to | \n", + "neg | \n", + "
1 | \n", + "eRLJscAlk1M | \n", + "pos | \n", + "
2 | \n", + "VbiRNT_gWUQ | \n", + "neg | \n", + "
3 | \n", + "5scez5dqtAc | \n", + "pos | \n", + "
4 | \n", + "JDcro7dPqpA | \n", + "neg | \n", + "
... | \n", + "... | \n", + "... | \n", + "
132 | \n", + "JYZpxRy5Mfg | \n", + "pos | \n", + "
133 | \n", + "xXMlFFY9uEI | \n", + "pos | \n", + "
134 | \n", + "8DiWzvE52ZY | \n", + "neg | \n", + "
135 | \n", + "OwqIy8Ikv-c | \n", + "neg | \n", + "
136 | \n", + "lPgZfhnCAdI | \n", + "neg | \n", + "
137 rows × 2 columns
\n", + "\n", + " | VideoID | \n", + "Effectiveness | \n", + "cleaned | \n", + "cleaned_string | \n", + "num_comments | \n", + "average_word_length | \n", + "average_sentence_length | \n", + "average_punctuation_count | \n", + "average_emoji_count | \n", + "average_sentiment | \n", + "sentiment_ratio_negative | \n", + "sentiment_ratio_neutral | \n", + "sentiment_ratio_positive | \n", + "
---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | \n", + "pvuN_WvF1to | \n", + "neg | \n", + "[clean, version, video, child, love, northeast... | \n", + "clean version video child love northeast india... | \n", + "125533 | \n", + "11.370739 | \n", + "1.292959 | \n", + "2.123577 | \n", + "0.588371 | \n", + "0.095633 | \n", + "0.137295 | \n", + "0.529606 | \n", + "0.333100 | \n", + "
1 | \n", + "eRLJscAlk1M | \n", + "pos | \n", + "[step, take, help, fight, climate, change, wel... | \n", + "step take help fight climate change well equal... | \n", + "161953 | \n", + "17.195229 | \n", + "1.594994 | \n", + "2.718289 | \n", + "0.489704 | \n", + "0.037611 | \n", + "0.202355 | \n", + "0.500905 | \n", + "0.296740 | \n", + "
2 | \n", + "VbiRNT_gWUQ | \n", + "neg | \n", + "[country, disappear, video, year, old, world, ... | \n", + "country disappear video year old world map did... | \n", + "27616 | \n", + "18.386660 | \n", + "1.726789 | \n", + "3.540701 | \n", + "0.117903 | \n", + "0.052846 | \n", + "0.196010 | \n", + "0.445177 | \n", + "0.358814 | \n", + "
3 | \n", + "5scez5dqtAc | \n", + "pos | \n", + "[im, watch, trump, biden, ha, already, start, ... | \n", + "im watch trump biden ha already start process ... | \n", + "13773 | \n", + "32.300443 | \n", + "2.364554 | \n", + "5.870616 | \n", + "0.060626 | \n", + "0.020608 | \n", + "0.301387 | \n", + "0.315545 | \n", + "0.383068 | \n", + "
4 | \n", + "JDcro7dPqpA | \n", + "neg | \n", + "[fun, fact, cow, belch, fart, adult, version, ... | \n", + "fun fact cow belch fart adult version bill nye... | \n", + "18821 | \n", + "34.869454 | \n", + "2.559588 | \n", + "6.624250 | \n", + "0.106902 | \n", + "0.032238 | \n", + "0.296796 | \n", + "0.313480 | \n", + "0.389724 | \n", + "
... | \n", + "... | \n", + "... | \n", + "... | \n", + "... | \n", + "... | \n", + "... | \n", + "... | \n", + "... | \n", + "... | \n", + "... | \n", + "... | \n", + "... | \n", + "... | \n", + "
132 | \n", + "JYZpxRy5Mfg | \n", + "pos | \n", + "[usually, consumer_NEG, say_NEG, though_NEG, s... | \n", + "usually consumer_NEG say_NEG though_NEG suppor... | \n", + "415 | \n", + "19.036145 | \n", + "1.759036 | \n", + "3.508434 | \n", + "0.207229 | \n", + "0.090164 | \n", + "0.149398 | \n", + "0.513253 | \n", + "0.337349 | \n", + "
133 | \n", + "xXMlFFY9uEI | \n", + "pos | \n", + "[joe, biden, ha, plan, fix, thing, forefront, ... | \n", + "joe biden ha plan fix thing forefront news sev... | \n", + "431 | \n", + "37.774942 | \n", + "2.700696 | \n", + "9.039443 | \n", + "0.153132 | \n", + "0.034621 | \n", + "0.225058 | \n", + "0.396752 | \n", + "0.378190 | \n", + "
134 | \n", + "8DiWzvE52ZY | \n", + "neg | \n", + "[marios, leave, hand, doe, intro, impressive, ... | \n", + "marios leave hand doe intro impressive today p... | \n", + "5262 | \n", + "18.298556 | \n", + "1.779742 | \n", + "3.726720 | \n", + "0.136640 | \n", + "0.143438 | \n", + "0.129609 | \n", + "0.403079 | \n", + "0.467313 | \n", + "
135 | \n", + "OwqIy8Ikv-c | \n", + "neg | \n", + "[lie, interseting, isnt, group_NEG, consist_NE... | \n", + "lie interseting isnt group_NEG consist_NEG com... | \n", + "14421 | \n", + "57.651203 | \n", + "3.803966 | \n", + "13.288954 | \n", + "0.029402 | \n", + "0.049250 | \n", + "0.249983 | \n", + "0.278275 | \n", + "0.471743 | \n", + "
136 | \n", + "lPgZfhnCAdI | \n", + "neg | \n", + "[miss, man, wa, hero, didnt, cherish_NEG, enou... | \n", + "miss man wa hero didnt cherish_NEG enough_NEG ... | \n", + "3777 | \n", + "40.999735 | \n", + "3.014562 | \n", + "8.415674 | \n", + "0.034948 | \n", + "0.017825 | \n", + "0.294943 | \n", + "0.291236 | \n", + "0.413820 | \n", + "
137 rows × 13 columns
\n", + "\n", + " | 0 | \n", + "1 | \n", + "2 | \n", + "3 | \n", + "4 | \n", + "5 | \n", + "6 | \n", + "7 | \n", + "8 | \n", + "9 | \n", + "... | \n", + "569679 | \n", + "569680 | \n", + "average_word_length | \n", + "average_sentence_length | \n", + "average_punctuation_count | \n", + "average_emoji_count | \n", + "average_sentiment | \n", + "sentiment_ratio_negative | \n", + "sentiment_ratio_neutral | \n", + "sentiment_ratio_positive | \n", + "
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | \n", + "0.000259 | \n", + "0.000000 | \n", + "0.000315 | \n", + "0.000000 | \n", + "0.000435 | \n", + "0.000000 | \n", + "0.000372 | \n", + "0.000237 | \n", + "0.000085 | \n", + "0.000149 | \n", + "... | \n", + "0.0 | \n", + "0.0 | \n", + "11.370739 | \n", + "1.292959 | \n", + "2.123577 | \n", + "0.588371 | \n", + "0.095633 | \n", + "0.137295 | \n", + "0.529606 | \n", + "0.333100 | \n", + "
1 | \n", + "0.000473 | \n", + "0.000053 | \n", + "0.000222 | \n", + "0.000000 | \n", + "0.000000 | \n", + "0.000084 | \n", + "0.000000 | \n", + "0.000067 | \n", + "0.000000 | \n", + "0.000000 | \n", + "... | \n", + "0.0 | \n", + "0.0 | \n", + "17.195229 | \n", + "1.594994 | \n", + "2.718289 | \n", + "0.489704 | \n", + "0.037611 | \n", + "0.202355 | \n", + "0.500905 | \n", + "0.296740 | \n", + "
2 | \n", + "0.000201 | \n", + "0.000000 | \n", + "0.000000 | \n", + "0.000000 | \n", + "0.000000 | \n", + "0.000000 | \n", + "0.000000 | \n", + "0.000000 | \n", + "0.000397 | \n", + "0.000000 | \n", + "... | \n", + "0.0 | \n", + "0.0 | \n", + "18.386660 | \n", + "1.726789 | \n", + "3.540701 | \n", + "0.117903 | \n", + "0.052846 | \n", + "0.196010 | \n", + "0.445177 | \n", + "0.358814 | \n", + "
3 | \n", + "0.000539 | \n", + "0.000000 | \n", + "0.000000 | \n", + "0.000000 | \n", + "0.000000 | \n", + "0.000000 | \n", + "0.000000 | \n", + "0.000000 | \n", + "0.000000 | \n", + "0.000000 | \n", + "... | \n", + "0.0 | \n", + "0.0 | \n", + "32.300443 | \n", + "2.364554 | \n", + "5.870616 | \n", + "0.060626 | \n", + "0.020608 | \n", + "0.301387 | \n", + "0.315545 | \n", + "0.383068 | \n", + "
4 | \n", + "0.000181 | \n", + "0.000263 | \n", + "0.000000 | \n", + "0.000258 | \n", + "0.000000 | \n", + "0.000000 | \n", + "0.000000 | \n", + "0.000000 | \n", + "0.000000 | \n", + "0.000000 | \n", + "... | \n", + "0.0 | \n", + "0.0 | \n", + "34.869454 | \n", + "2.559588 | \n", + "6.624250 | \n", + "0.106902 | \n", + "0.032238 | \n", + "0.296796 | \n", + "0.313480 | \n", + "0.389724 | \n", + "
... | \n", + "... | \n", + "... | \n", + "... | \n", + "... | \n", + "... | \n", + "... | \n", + "... | \n", + "... | \n", + "... | \n", + "... | \n", + "... | \n", + "... | \n", + "... | \n", + "... | \n", + "... | \n", + "... | \n", + "... | \n", + "... | \n", + "... | \n", + "... | \n", + "... | \n", + "
132 | \n", + "0.000000 | \n", + "0.000000 | \n", + "0.000000 | \n", + "0.000000 | \n", + "0.000000 | \n", + "0.000000 | \n", + "0.000000 | \n", + "0.000000 | \n", + "0.000000 | \n", + "0.000000 | \n", + "... | \n", + "0.0 | \n", + "0.0 | \n", + "19.036145 | \n", + "1.759036 | \n", + "3.508434 | \n", + "0.207229 | \n", + "0.090164 | \n", + "0.149398 | \n", + "0.513253 | \n", + "0.337349 | \n", + "
133 | \n", + "0.000000 | \n", + "0.000000 | \n", + "0.000000 | \n", + "0.000000 | \n", + "0.000000 | \n", + "0.000000 | \n", + "0.000000 | \n", + "0.000000 | \n", + "0.000000 | \n", + "0.000000 | \n", + "... | \n", + "0.0 | \n", + "0.0 | \n", + "37.774942 | \n", + "2.700696 | \n", + "9.039443 | \n", + "0.153132 | \n", + "0.034621 | \n", + "0.225058 | \n", + "0.396752 | \n", + "0.378190 | \n", + "
134 | \n", + "0.000000 | \n", + "0.000000 | \n", + "0.000000 | \n", + "0.000000 | \n", + "0.000000 | \n", + "0.000000 | \n", + "0.000000 | \n", + "0.000000 | \n", + "0.000000 | \n", + "0.000000 | \n", + "... | \n", + "0.0 | \n", + "0.0 | \n", + "18.298556 | \n", + "1.779742 | \n", + "3.726720 | \n", + "0.136640 | \n", + "0.143438 | \n", + "0.129609 | \n", + "0.403079 | \n", + "0.467313 | \n", + "
135 | \n", + "0.000000 | \n", + "0.000225 | \n", + "0.000379 | \n", + "0.000883 | \n", + "0.000000 | \n", + "0.000000 | \n", + "0.000000 | \n", + "0.000000 | \n", + "0.000000 | \n", + "0.000000 | \n", + "... | \n", + "0.0 | \n", + "0.0 | \n", + "57.651203 | \n", + "3.803966 | \n", + "13.288954 | \n", + "0.029402 | \n", + "0.049250 | \n", + "0.249983 | \n", + "0.278275 | \n", + "0.471743 | \n", + "
136 | \n", + "0.000000 | \n", + "0.000000 | \n", + "0.000000 | \n", + "0.000000 | \n", + "0.000000 | \n", + "0.000000 | \n", + "0.000000 | \n", + "0.000000 | \n", + "0.000000 | \n", + "0.000000 | \n", + "... | \n", + "0.0 | \n", + "0.0 | \n", + "40.999735 | \n", + "3.014562 | \n", + "8.415674 | \n", + "0.034948 | \n", + "0.017825 | \n", + "0.294943 | \n", + "0.291236 | \n", + "0.413820 | \n", + "
137 rows × 569689 columns
\n", + "\n", + " | Multinomial Naive Bayes | \n", + "Bernoulli Naive Bayes | \n", + "SGD | \n", + "Logistic Regression | \n", + "Support Vector Classifier | \n", + "Linear Support Vector Classifier | \n", + "Nu-Support Vector Classifier | \n", + "Random Forest | \n", + "XGBoost | \n", + "AdaBoost Classifier | \n", + "CNN | \n", + "Best Score | \n", + "
---|---|---|---|---|---|---|---|---|---|---|---|---|
Accuracy | \n", + "52.38 | \n", + "69.05 | \n", + "73.81 | \n", + "78.57 | \n", + "52.38 | \n", + "66.67 | \n", + "61.90 | \n", + "71.43 | \n", + "73.81 | \n", + "61.90 | \n", + "82.35 | \n", + "CNN | \n", + "
Precision | \n", + "26.19 | \n", + "69.05 | \n", + "73.81 | \n", + "78.60 | \n", + "51.67 | \n", + "67.07 | \n", + "61.82 | \n", + "73.21 | \n", + "74.26 | \n", + "61.82 | \n", + "88.88 | \n", + "CNN | \n", + "
Recall | \n", + "50.00 | \n", + "69.09 | \n", + "73.86 | \n", + "78.41 | \n", + "51.36 | \n", + "66.14 | \n", + "61.82 | \n", + "70.68 | \n", + "74.09 | \n", + "61.82 | \n", + "61.53 | \n", + "Logistic Regression | \n", + "
F1-score | \n", + "34.38 | \n", + "69.03 | \n", + "73.79 | \n", + "78.46 | \n", + "49.52 | \n", + "65.97 | \n", + "61.82 | \n", + "70.35 | \n", + "73.79 | \n", + "61.82 | \n", + "72.72 | \n", + "Logistic Regression | \n", + "