{
"cells": [
{
"attachments": {},
"cell_type": "markdown",
"id": "92e48866",
"metadata": {},
"source": [
"## Model Training"
]
},
{
"cell_type": "markdown",
"id": "25791a74",
"metadata": {},
"source": [
"#### 1.1 Import Data and Required Packages\n",
"##### Importing Pandas, Numpy, Matplotlib, Seaborn and Warnings Library."
]
},
{
"cell_type": "code",
"execution_count": 3,
"id": "0f4cd21f-8f72-42f0-b92b-c848a74a9755",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Collecting catboost\n",
" Downloading catboost-1.2.5-cp311-cp311-manylinux2014_x86_64.whl.metadata (1.2 kB)\n",
"Collecting graphviz (from catboost)\n",
" Downloading graphviz-0.20.3-py3-none-any.whl.metadata (12 kB)\n",
"Requirement already satisfied: matplotlib in /home/adarsh/anaconda3/lib/python3.11/site-packages (from catboost) (3.8.0)\n",
"Requirement already satisfied: numpy>=1.16.0 in /home/adarsh/anaconda3/lib/python3.11/site-packages (from catboost) (1.26.4)\n",
"Requirement already satisfied: pandas>=0.24 in /home/adarsh/anaconda3/lib/python3.11/site-packages (from catboost) (2.1.4)\n",
"Requirement already satisfied: scipy in /home/adarsh/anaconda3/lib/python3.11/site-packages (from catboost) (1.11.4)\n",
"Requirement already satisfied: plotly in /home/adarsh/anaconda3/lib/python3.11/site-packages (from catboost) (5.9.0)\n",
"Requirement already satisfied: six in /home/adarsh/anaconda3/lib/python3.11/site-packages (from catboost) (1.16.0)\n",
"Requirement already satisfied: python-dateutil>=2.8.2 in /home/adarsh/anaconda3/lib/python3.11/site-packages (from pandas>=0.24->catboost) (2.8.2)\n",
"Requirement already satisfied: pytz>=2020.1 in /home/adarsh/anaconda3/lib/python3.11/site-packages (from pandas>=0.24->catboost) (2023.3.post1)\n",
"Requirement already satisfied: tzdata>=2022.1 in /home/adarsh/anaconda3/lib/python3.11/site-packages (from pandas>=0.24->catboost) (2023.3)\n",
"Requirement already satisfied: contourpy>=1.0.1 in /home/adarsh/anaconda3/lib/python3.11/site-packages (from matplotlib->catboost) (1.2.0)\n",
"Requirement already satisfied: cycler>=0.10 in /home/adarsh/anaconda3/lib/python3.11/site-packages (from matplotlib->catboost) (0.11.0)\n",
"Requirement already satisfied: fonttools>=4.22.0 in /home/adarsh/anaconda3/lib/python3.11/site-packages (from matplotlib->catboost) (4.25.0)\n",
"Requirement already satisfied: kiwisolver>=1.0.1 in /home/adarsh/anaconda3/lib/python3.11/site-packages (from matplotlib->catboost) (1.4.4)\n",
"Requirement already satisfied: packaging>=20.0 in /home/adarsh/anaconda3/lib/python3.11/site-packages (from matplotlib->catboost) (23.1)\n",
"Requirement already satisfied: pillow>=6.2.0 in /home/adarsh/anaconda3/lib/python3.11/site-packages (from matplotlib->catboost) (10.2.0)\n",
"Requirement already satisfied: pyparsing>=2.3.1 in /home/adarsh/anaconda3/lib/python3.11/site-packages (from matplotlib->catboost) (3.0.9)\n",
"Requirement already satisfied: tenacity>=6.2.0 in /home/adarsh/anaconda3/lib/python3.11/site-packages (from plotly->catboost) (8.2.2)\n",
"Downloading catboost-1.2.5-cp311-cp311-manylinux2014_x86_64.whl (98.2 MB)\n",
"\u001b[2K \u001b[38;2;114;156;31m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m98.2/98.2 MB\u001b[0m \u001b[31m206.5 kB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0mm eta \u001b[36m0:00:01\u001b[0m[36m0:00:11\u001b[0m\n",
"\u001b[?25hDownloading graphviz-0.20.3-py3-none-any.whl (47 kB)\n",
"\u001b[2K \u001b[38;2;114;156;31m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m47.1/47.1 kB\u001b[0m \u001b[31m183.3 kB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m1m394.2 kB/s\u001b[0m eta \u001b[36m0:00:01\u001b[0m\n",
"\u001b[?25hInstalling collected packages: graphviz, catboost\n",
"Successfully installed catboost-1.2.5 graphviz-0.20.3\n"
]
}
],
"source": [
"# xgboost is imported by the next cell but was never installed, so install it together with catboost.\n",
"# %pip targets the environment of the running kernel, which !pip3 is not guaranteed to do.\n",
"%pip install -q catboost xgboost"
]
},
{
"cell_type": "code",
"execution_count": 5,
"id": "b080dfb2",
"metadata": {},
"outputs": [
{
"ename": "ModuleNotFoundError",
"evalue": "No module named 'xgboost'",
"output_type": "error",
"traceback": [
"\u001b[0;31m---------------------------------------------------------------------------\u001b[0m",
"\u001b[0;31mModuleNotFoundError\u001b[0m Traceback (most recent call last)",
"Cell \u001b[0;32mIn[5], line 16\u001b[0m\n\u001b[1;32m 14\u001b[0m \u001b[38;5;28;01mfrom\u001b[39;00m \u001b[38;5;21;01msklearn\u001b[39;00m\u001b[38;5;21;01m.\u001b[39;00m\u001b[38;5;21;01mmodel_selection\u001b[39;00m \u001b[38;5;28;01mimport\u001b[39;00m RandomizedSearchCV\n\u001b[1;32m 15\u001b[0m \u001b[38;5;28;01mfrom\u001b[39;00m \u001b[38;5;21;01mcatboost\u001b[39;00m \u001b[38;5;28;01mimport\u001b[39;00m CatBoostRegressor\n\u001b[0;32m---> 16\u001b[0m \u001b[38;5;28;01mfrom\u001b[39;00m \u001b[38;5;21;01mxgboost\u001b[39;00m \u001b[38;5;28;01mimport\u001b[39;00m XGBRegressor\n\u001b[1;32m 17\u001b[0m \u001b[38;5;28;01mimport\u001b[39;00m \u001b[38;5;21;01mwarnings\u001b[39;00m\n",
"\u001b[0;31mModuleNotFoundError\u001b[0m: No module named 'xgboost'"
]
}
],
"source": [
"# Basic Import\n",
"import numpy as np\n",
"import pandas as pd\n",
"import matplotlib.pyplot as plt \n",
"import seaborn as sns\n",
"# Modelling\n",
"from sklearn.metrics import mean_squared_error, r2_score\n",
"from sklearn.neighbors import KNeighborsRegressor\n",
"from sklearn.tree import DecisionTreeRegressor\n",
"from sklearn.ensemble import RandomForestRegressor,AdaBoostRegressor\n",
"from sklearn.svm import SVR\n",
"from sklearn.linear_model import LinearRegression, Ridge,Lasso\n",
"from sklearn.metrics import r2_score, mean_absolute_error, mean_squared_error\n",
"from sklearn.model_selection import RandomizedSearchCV\n",
"from catboost import CatBoostRegressor\n",
"from xgboost import XGBRegressor\n",
"import warnings"
]
},
{
"cell_type": "markdown",
"id": "e45079ad",
"metadata": {},
"source": [
"#### Import the CSV Data as Pandas DataFrame"
]
},
{
"cell_type": "code",
"execution_count": 8,
"id": "e11c6255",
"metadata": {},
"outputs": [],
"source": [
"df = pd.read_csv('data/stud.csv')"
]
},
{
"cell_type": "markdown",
"id": "20634923",
"metadata": {},
"source": [
"#### Show Top 5 Records"
]
},
{
"cell_type": "code",
"execution_count": 9,
"id": "e7e412a2",
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"
\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" gender | \n",
" race_ethnicity | \n",
" parental_level_of_education | \n",
" lunch | \n",
" test_preparation_course | \n",
" math_score | \n",
" reading_score | \n",
" writing_score | \n",
"
\n",
" \n",
" \n",
" \n",
" 0 | \n",
" female | \n",
" group B | \n",
" bachelor's degree | \n",
" standard | \n",
" none | \n",
" 72 | \n",
" 72 | \n",
" 74 | \n",
"
\n",
" \n",
" 1 | \n",
" female | \n",
" group C | \n",
" some college | \n",
" standard | \n",
" completed | \n",
" 69 | \n",
" 90 | \n",
" 88 | \n",
"
\n",
" \n",
" 2 | \n",
" female | \n",
" group B | \n",
" master's degree | \n",
" standard | \n",
" none | \n",
" 90 | \n",
" 95 | \n",
" 93 | \n",
"
\n",
" \n",
" 3 | \n",
" male | \n",
" group A | \n",
" associate's degree | \n",
" free/reduced | \n",
" none | \n",
" 47 | \n",
" 57 | \n",
" 44 | \n",
"
\n",
" \n",
" 4 | \n",
" male | \n",
" group C | \n",
" some college | \n",
" standard | \n",
" none | \n",
" 76 | \n",
" 78 | \n",
" 75 | \n",
"
\n",
" \n",
"
\n",
"
"
],
"text/plain": [
" gender race_ethnicity parental_level_of_education lunch \\\n",
"0 female group B bachelor's degree standard \n",
"1 female group C some college standard \n",
"2 female group B master's degree standard \n",
"3 male group A associate's degree free/reduced \n",
"4 male group C some college standard \n",
"\n",
" test_preparation_course math_score reading_score writing_score \n",
"0 none 72 72 74 \n",
"1 completed 69 90 88 \n",
"2 none 90 95 93 \n",
"3 none 47 57 44 \n",
"4 none 76 78 75 "
]
},
"execution_count": 9,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"df.head()"
]
},
{
"cell_type": "markdown",
"id": "fbd32281",
"metadata": {},
"source": [
"#### Preparing X and Y variables"
]
},
{
"cell_type": "code",
"execution_count": 10,
"id": "56d72fde",
"metadata": {},
"outputs": [],
"source": [
"# Feature frame: every column except the regression target.\n",
"# `columns=` already selects the axis, so the extra `axis=1` is not needed.\n",
"X = df.drop(columns=['math_score'])"
]
},
{
"cell_type": "code",
"execution_count": 11,
"id": "cd613177",
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" gender | \n",
" race_ethnicity | \n",
" parental_level_of_education | \n",
" lunch | \n",
" test_preparation_course | \n",
" reading_score | \n",
" writing_score | \n",
"
\n",
" \n",
" \n",
" \n",
" 0 | \n",
" female | \n",
" group B | \n",
" bachelor's degree | \n",
" standard | \n",
" none | \n",
" 72 | \n",
" 74 | \n",
"
\n",
" \n",
" 1 | \n",
" female | \n",
" group C | \n",
" some college | \n",
" standard | \n",
" completed | \n",
" 90 | \n",
" 88 | \n",
"
\n",
" \n",
" 2 | \n",
" female | \n",
" group B | \n",
" master's degree | \n",
" standard | \n",
" none | \n",
" 95 | \n",
" 93 | \n",
"
\n",
" \n",
" 3 | \n",
" male | \n",
" group A | \n",
" associate's degree | \n",
" free/reduced | \n",
" none | \n",
" 57 | \n",
" 44 | \n",
"
\n",
" \n",
" 4 | \n",
" male | \n",
" group C | \n",
" some college | \n",
" standard | \n",
" none | \n",
" 78 | \n",
" 75 | \n",
"
\n",
" \n",
"
\n",
"
"
],
"text/plain": [
" gender race_ethnicity parental_level_of_education lunch \\\n",
"0 female group B bachelor's degree standard \n",
"1 female group C some college standard \n",
"2 female group B master's degree standard \n",
"3 male group A associate's degree free/reduced \n",
"4 male group C some college standard \n",
"\n",
" test_preparation_course reading_score writing_score \n",
"0 none 72 74 \n",
"1 completed 90 88 \n",
"2 none 95 93 \n",
"3 none 57 44 \n",
"4 none 78 75 "
]
},
"execution_count": 11,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"X.head()"
]
},
{
"cell_type": "code",
"execution_count": 14,
"id": "f237ea14",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Categories in 'gender' variable: ['female' 'male']\n",
"Categories in 'race_ethnicity' variable: ['group B' 'group C' 'group A' 'group D' 'group E']\n",
"Categories in'parental level of education' variable: [\"bachelor's degree\" 'some college' \"master's degree\" \"associate's degree\"\n",
" 'high school' 'some high school']\n",
"Categories in 'lunch' variable: ['standard' 'free/reduced']\n",
"Categories in 'test preparation course' variable: ['none' 'completed']\n"
]
}
],
"source": [
"# Show the distinct values of each categorical column.\n",
"categorical_columns = [\n",
"    'gender',\n",
"    'race_ethnicity',\n",
"    'parental_level_of_education',\n",
"    'lunch',\n",
"    'test_preparation_course',\n",
"]\n",
"\n",
"for column in categorical_columns:\n",
"    # Label every line with the real column name so it matches what the code uses.\n",
"    print(f\"Categories in '{column}' variable:\", df[column].unique())"
]
},
{
"cell_type": "code",
"execution_count": 12,
"id": "924b7f9d",
"metadata": {},
"outputs": [],
"source": [
"y = df['math_score']"
]
},
{
"cell_type": "code",
"execution_count": 13,
"id": "ffc69816",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"0 72\n",
"1 69\n",
"2 90\n",
"3 47\n",
"4 76\n",
" ..\n",
"995 88\n",
"996 62\n",
"997 59\n",
"998 68\n",
"999 77\n",
"Name: math_score, Length: 1000, dtype: int64"
]
},
"execution_count": 13,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"y"
]
},
{
"cell_type": "code",
"execution_count": 15,
"id": "1e290fe3",
"metadata": {},
"outputs": [],
"source": [
"from sklearn.compose import ColumnTransformer\n",
"from sklearn.preprocessing import OneHotEncoder, StandardScaler\n",
"\n",
"# Build a ColumnTransformer with two transformers: one-hot encoding for the\n",
"# object-dtype (categorical) columns and standard scaling for the remaining columns.\n",
"num_features = X.select_dtypes(exclude=\"object\").columns\n",
"cat_features = X.select_dtypes(include=\"object\").columns\n",
"\n",
"numeric_transformer = StandardScaler()\n",
"# handle_unknown=\"ignore\" encodes a category that was not seen during fit as all\n",
"# zeros instead of raising, so the fitted preprocessor can be reused on new rows.\n",
"oh_transformer = OneHotEncoder(handle_unknown=\"ignore\")\n",
"\n",
"preprocessor = ColumnTransformer(\n",
"    [\n",
"        (\"OneHotEncoder\", oh_transformer, cat_features),\n",
"        (\"StandardScaler\", numeric_transformer, num_features),\n",
"    ]\n",
")"
]
},
{
"cell_type": "code",
"execution_count": 16,
"id": "9c68f99a",
"metadata": {},
"outputs": [],
"source": [
"# Encode from the raw frame rather than from X itself, so re-running this cell\n",
"# does not feed an already-transformed matrix back into the preprocessor.\n",
"# NOTE(review): the preprocessor is fitted on all rows before the train/test\n",
"# split further down, so test rows influence the fitted scaler and encoder;\n",
"# consider fitting on the training split only.\n",
"X = preprocessor.fit_transform(df.drop(columns=['math_score']))"
]
},
{
"cell_type": "code",
"execution_count": 18,
"id": "72459f1d",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"(1000, 19)"
]
},
"execution_count": 18,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"X.shape"
]
},
{
"cell_type": "code",
"execution_count": 21,
"id": "ed5c4e99",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"((800, 19), (200, 19))"
]
},
"execution_count": 21,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"from sklearn.model_selection import train_test_split\n",
"\n",
"# Hold out 20% of the rows as the test set; the fixed random_state keeps the split repeatable.\n",
"X_train, X_test, y_train, y_test = train_test_split(\n",
"    X, y, test_size=0.2, random_state=42\n",
")\n",
"X_train.shape, X_test.shape"
]
},
{
"cell_type": "markdown",
"id": "4cd80317",
"metadata": {},
"source": [
"#### Create an Evaluate Function to give all metrics after model Training"
]
},
{
"cell_type": "code",
"execution_count": 22,
"id": "8c247bd0",
"metadata": {},
"outputs": [],
"source": [
"def evaluate_model(true, predicted):\n",
"    \"\"\"Return MAE, RMSE and R2 for a set of predictions.\n",
"\n",
"    Parameters\n",
"    ----------\n",
"    true : array-like\n",
"        Ground-truth target values.\n",
"    predicted : array-like\n",
"        Predicted values, aligned element-wise with ``true``.\n",
"\n",
"    Returns\n",
"    -------\n",
"    tuple\n",
"        ``(mae, rmse, r2_square)`` in that order.\n",
"    \"\"\"\n",
"    mae = mean_absolute_error(true, predicted)\n",
"    # RMSE is the square root of the MSE; the MSE itself is not returned,\n",
"    # so it is computed once here rather than stored separately.\n",
"    rmse = np.sqrt(mean_squared_error(true, predicted))\n",
"    r2_square = r2_score(true, predicted)\n",
"    return mae, rmse, r2_square"
]
},
{
"cell_type": "code",
"execution_count": 23,
"id": "79ccb8e7",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Linear Regression\n",
"Model performance for Training set\n",
"- Root Mean Squared Error: 5.3243\n",
"- Mean Absolute Error: 4.2671\n",
"- R2 Score: 0.8743\n",
"----------------------------------\n",
"Model performance for Test set\n",
"- Root Mean Squared Error: 5.3960\n",
"- Mean Absolute Error: 4.2158\n",
"- R2 Score: 0.8803\n",
"===================================\n",
"\n",
"\n",
"Lasso\n",
"Model performance for Training set\n",
"- Root Mean Squared Error: 6.5938\n",
"- Mean Absolute Error: 5.2063\n",
"- R2 Score: 0.8071\n",
"----------------------------------\n",
"Model performance for Test set\n",
"- Root Mean Squared Error: 6.5197\n",
"- Mean Absolute Error: 5.1579\n",
"- R2 Score: 0.8253\n",
"===================================\n",
"\n",
"\n",
"Ridge\n",
"Model performance for Training set\n",
"- Root Mean Squared Error: 5.3233\n",
"- Mean Absolute Error: 4.2650\n",
"- R2 Score: 0.8743\n",
"----------------------------------\n",
"Model performance for Test set\n",
"- Root Mean Squared Error: 5.3904\n",
"- Mean Absolute Error: 4.2111\n",
"- R2 Score: 0.8806\n",
"===================================\n",
"\n",
"\n",
"K-Neighbors Regressor\n",
"Model performance for Training set\n",
"- Root Mean Squared Error: 5.7077\n",
"- Mean Absolute Error: 4.5167\n",
"- R2 Score: 0.8555\n",
"----------------------------------\n",
"Model performance for Test set\n",
"- Root Mean Squared Error: 7.2530\n",
"- Mean Absolute Error: 5.6210\n",
"- R2 Score: 0.7838\n",
"===================================\n",
"\n",
"\n",
"Decision Tree\n",
"Model performance for Training set\n",
"- Root Mean Squared Error: 0.2795\n",
"- Mean Absolute Error: 0.0187\n",
"- R2 Score: 0.9997\n",
"----------------------------------\n",
"Model performance for Test set\n",
"- Root Mean Squared Error: 7.6371\n",
"- Mean Absolute Error: 6.0250\n",
"- R2 Score: 0.7603\n",
"===================================\n",
"\n",
"\n",
"Random Forest Regressor\n",
"Model performance for Training set\n",
"- Root Mean Squared Error: 2.2851\n",
"- Mean Absolute Error: 1.8253\n",
"- R2 Score: 0.9768\n",
"----------------------------------\n",
"Model performance for Test set\n",
"- Root Mean Squared Error: 6.0959\n",
"- Mean Absolute Error: 4.7194\n",
"- R2 Score: 0.8473\n",
"===================================\n",
"\n",
"\n",
"XGBRegressor\n",
"Model performance for Training set\n",
"- Root Mean Squared Error: 0.9087\n",
"- Mean Absolute Error: 0.6148\n",
"- R2 Score: 0.9963\n",
"----------------------------------\n",
"Model performance for Test set\n",
"- Root Mean Squared Error: 6.5889\n",
"- Mean Absolute Error: 5.0844\n",
"- R2 Score: 0.8216\n",
"===================================\n",
"\n",
"\n",
"CatBoosting Regressor\n",
"Model performance for Training set\n",
"- Root Mean Squared Error: 3.0427\n",
"- Mean Absolute Error: 2.4054\n",
"- R2 Score: 0.9589\n",
"----------------------------------\n",
"Model performance for Test set\n",
"- Root Mean Squared Error: 6.0086\n",
"- Mean Absolute Error: 4.6125\n",
"- R2 Score: 0.8516\n",
"===================================\n",
"\n",
"\n",
"AdaBoost Regressor\n",
"Model performance for Training set\n",
"- Root Mean Squared Error: 5.7843\n",
"- Mean Absolute Error: 4.7564\n",
"- R2 Score: 0.8516\n",
"----------------------------------\n",
"Model performance for Test set\n",
"- Root Mean Squared Error: 6.0447\n",
"- Mean Absolute Error: 4.6813\n",
"- R2 Score: 0.8498\n",
"===================================\n",
"\n",
"\n"
]
}
],
"source": [
"# Fixed seed for the stochastic estimators so the scores are the same on every run;\n",
"# it is the same value the train/test split uses for its random_state.\n",
"RANDOM_STATE = 42\n",
"\n",
"models = {\n",
"    \"Linear Regression\": LinearRegression(),\n",
"    \"Lasso\": Lasso(),\n",
"    \"Ridge\": Ridge(),\n",
"    \"K-Neighbors Regressor\": KNeighborsRegressor(),\n",
"    \"Decision Tree\": DecisionTreeRegressor(random_state=RANDOM_STATE),\n",
"    \"Random Forest Regressor\": RandomForestRegressor(random_state=RANDOM_STATE),\n",
"    \"XGBRegressor\": XGBRegressor(random_state=RANDOM_STATE),\n",
"    \"CatBoosting Regressor\": CatBoostRegressor(verbose=False, random_state=RANDOM_STATE),\n",
"    \"AdaBoost Regressor\": AdaBoostRegressor(random_state=RANDOM_STATE),\n",
"}\n",
"# Parallel lists consumed by the results table: model name and its test-set R2.\n",
"model_list = []\n",
"r2_list = []\n",
"\n",
"for model_name, model in models.items():\n",
"    model.fit(X_train, y_train)  # Train model\n",
"\n",
"    # Make predictions\n",
"    y_train_pred = model.predict(X_train)\n",
"    y_test_pred = model.predict(X_test)\n",
"\n",
"    # Evaluate Train and Test dataset\n",
"    model_train_mae, model_train_rmse, model_train_r2 = evaluate_model(y_train, y_train_pred)\n",
"    model_test_mae, model_test_rmse, model_test_r2 = evaluate_model(y_test, y_test_pred)\n",
"\n",
"    print(model_name)\n",
"    model_list.append(model_name)\n",
"\n",
"    print('Model performance for Training set')\n",
"    print(\"- Root Mean Squared Error: {:.4f}\".format(model_train_rmse))\n",
"    print(\"- Mean Absolute Error: {:.4f}\".format(model_train_mae))\n",
"    print(\"- R2 Score: {:.4f}\".format(model_train_r2))\n",
"\n",
"    print('----------------------------------')\n",
"\n",
"    print('Model performance for Test set')\n",
"    print(\"- Root Mean Squared Error: {:.4f}\".format(model_test_rmse))\n",
"    print(\"- Mean Absolute Error: {:.4f}\".format(model_test_mae))\n",
"    print(\"- R2 Score: {:.4f}\".format(model_test_r2))\n",
"    r2_list.append(model_test_r2)\n",
"\n",
"    print('='*35)\n",
"    print('\\n')"
]
},
{
"cell_type": "markdown",
"id": "06480b5a",
"metadata": {},
"source": [
"### Results"
]
},
{
"cell_type": "code",
"execution_count": 24,
"id": "e0159e5f",
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" Model Name | \n",
" R2_Score | \n",
"
\n",
" \n",
" \n",
" \n",
" 2 | \n",
" Ridge | \n",
" 0.880593 | \n",
"
\n",
" \n",
" 0 | \n",
" Linear Regression | \n",
" 0.880345 | \n",
"
\n",
" \n",
" 7 | \n",
" CatBoosting Regressor | \n",
" 0.851632 | \n",
"
\n",
" \n",
" 8 | \n",
" AdaBoost Regressor | \n",
" 0.849847 | \n",
"
\n",
" \n",
" 5 | \n",
" Random Forest Regressor | \n",
" 0.847291 | \n",
"
\n",
" \n",
" 1 | \n",
" Lasso | \n",
" 0.825320 | \n",
"
\n",
" \n",
" 6 | \n",
" XGBRegressor | \n",
" 0.821589 | \n",
"
\n",
" \n",
" 3 | \n",
" K-Neighbors Regressor | \n",
" 0.783813 | \n",
"
\n",
" \n",
" 4 | \n",
" Decision Tree | \n",
" 0.760313 | \n",
"
\n",
" \n",
"
\n",
"
"
],
"text/plain": [
" Model Name R2_Score\n",
"2 Ridge 0.880593\n",
"0 Linear Regression 0.880345\n",
"7 CatBoosting Regressor 0.851632\n",
"8 AdaBoost Regressor 0.849847\n",
"5 Random Forest Regressor 0.847291\n",
"1 Lasso 0.825320\n",
"6 XGBRegressor 0.821589\n",
"3 K-Neighbors Regressor 0.783813\n",
"4 Decision Tree 0.760313"
]
},
"execution_count": 24,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Rank the models by their test-set R2, best first.\n",
"(\n",
"    pd.DataFrame({'Model Name': model_list, 'R2_Score': r2_list})\n",
"    .sort_values(by='R2_Score', ascending=False)\n",
")"
]
},
{
"cell_type": "markdown",
"id": "357a7c1c",
"metadata": {},
"source": [
"## Linear Regression"
]
},
{
"cell_type": "code",
"execution_count": 25,
"id": "9a6ad559",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
" Accuracy of the model is 88.03\n"
]
}
],
"source": [
"lin_model = LinearRegression(fit_intercept=True)\n",
"lin_model = lin_model.fit(X_train, y_train)\n",
"y_pred = lin_model.predict(X_test)\n",
"# Scale the R2 score by 100 so it can be reported as a percentage.\n",
"score = r2_score(y_test, y_pred) * 100\n",
"# This is a regression metric (R2), so it is labelled as such rather than as accuracy.\n",
"print(\"R2 score of the model is %.2f%%\" % score)"
]
},
{
"cell_type": "markdown",
"id": "1d31453e",
"metadata": {},
"source": [
"## Plot y_pred and y_test"
]
},
{
"cell_type": "code",
"execution_count": 26,
"id": "eb557b0a",
"metadata": {},
"outputs": [
{
"data": {
"image/png": "",
"text/plain": [
""
]
},
"metadata": {},
"output_type": "display_data"
}
],
"source": [
"# Scatter the held-out targets against the linear model's predictions.\n",
"fig, ax = plt.subplots()\n",
"ax.scatter(y_test, y_pred)\n",
"ax.set_xlabel('Actual')\n",
"ax.set_ylabel('Predicted');"
]
},
{
"cell_type": "code",
"execution_count": 27,
"id": "1e707ec3",
"metadata": {},
"outputs": [
{
"data": {
"image/png": "",
"text/plain": [
""
]
},
"metadata": {},
"output_type": "display_data"
}
],
"source": [
"sns.regplot(x=y_test,y=y_pred,ci=None,color ='red');"
]
},
{
"cell_type": "markdown",
"id": "79c2fe28",
"metadata": {},
"source": [
"#### Difference between Actual and Predicted Values"
]
},
{
"cell_type": "code",
"execution_count": 28,
"id": "7c9a8b48",
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" Actual Value | \n",
" Predicted Value | \n",
" Difference | \n",
"
\n",
" \n",
" \n",
" \n",
" 521 | \n",
" 91 | \n",
" 76.507812 | \n",
" 14.492188 | \n",
"
\n",
" \n",
" 737 | \n",
" 53 | \n",
" 58.953125 | \n",
" -5.953125 | \n",
"
\n",
" \n",
" 740 | \n",
" 80 | \n",
" 76.960938 | \n",
" 3.039062 | \n",
"
\n",
" \n",
" 660 | \n",
" 74 | \n",
" 76.757812 | \n",
" -2.757812 | \n",
"
\n",
" \n",
" 411 | \n",
" 84 | \n",
" 87.539062 | \n",
" -3.539062 | \n",
"
\n",
" \n",
" ... | \n",
" ... | \n",
" ... | \n",
" ... | \n",
"
\n",
" \n",
" 408 | \n",
" 52 | \n",
" 43.546875 | \n",
" 8.453125 | \n",
"
\n",
" \n",
" 332 | \n",
" 62 | \n",
" 62.031250 | \n",
" -0.031250 | \n",
"
\n",
" \n",
" 208 | \n",
" 74 | \n",
" 67.976562 | \n",
" 6.023438 | \n",
"
\n",
" \n",
" 613 | \n",
" 65 | \n",
" 67.132812 | \n",
" -2.132812 | \n",
"
\n",
" \n",
" 78 | \n",
" 61 | \n",
" 62.492188 | \n",
" -1.492188 | \n",
"
\n",
" \n",
"
\n",
"
200 rows × 3 columns
\n",
"
"
],
"text/plain": [
" Actual Value Predicted Value Difference\n",
"521 91 76.507812 14.492188\n",
"737 53 58.953125 -5.953125\n",
"740 80 76.960938 3.039062\n",
"660 74 76.757812 -2.757812\n",
"411 84 87.539062 -3.539062\n",
".. ... ... ...\n",
"408 52 43.546875 8.453125\n",
"332 62 62.031250 -0.031250\n",
"208 74 67.976562 6.023438\n",
"613 65 67.132812 -2.132812\n",
"78 61 62.492188 -1.492188\n",
"\n",
"[200 rows x 3 columns]"
]
},
"execution_count": 28,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Put each held-out target next to its prediction; a positive difference\n",
"# means the prediction was below the actual value.\n",
"pred_df = pd.DataFrame({'Actual Value': y_test, 'Predicted Value': y_pred}).assign(\n",
"    Difference=lambda frame: frame['Actual Value'] - frame['Predicted Value']\n",
")\n",
"pred_df"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "3acf1fbc",
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3 (ipykernel)",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.11.7"
}
},
"nbformat": 4,
"nbformat_minor": 5
}