Spaces:
Sleeping
Sleeping
import seaborn as sns | |
import pandas as pd | |
import numpy as np | |
import pyod | |
import pyreadr | |
import urllib | |
import rdata | |
import wget | |
import os | |
import gradio as gr | |
import joblib | |
import subprocess | |
import pandas as pd | |
import json | |
import uuid | |
import warnings | |
from sklearn.metrics import f1_score, confusion_matrix | |
from pyod.models.mcd import MCD | |
from pyod.utils.data import generate_data | |
from pyod.utils.data import evaluate_print | |
from sklearn.datasets import fetch_openml | |
from sklearn.preprocessing import StandardScaler, OneHotEncoder | |
from sklearn.compose import make_column_transformer | |
from sklearn.pipeline import make_pipeline | |
from sklearn.model_selection import train_test_split | |
from sklearn.linear_model import LinearRegression | |
from sklearn.metrics import mean_squared_error, r2_score | |
from pathlib import Path | |
from threading import Lock | |
from huggingface_hub import CommitScheduler | |
from huggingface_hub import HfApi | |
from IPython.display import display | |
import warnings | |
from IPython.display import display, HTML | |
# Silence every warning for the remainder of the run.
warnings.filterwarnings("ignore")

# Remote source and local destination of the credit-card dataset.
# In a real-world scenario we would use the CSV with the appended data instead.
url = "http://www.ulb.ac.be/di/map/adalpozz/data/creditcard.Rdata"
dst_path = "./creditcard.Rdata"

# Download only when no local copy exists yet. Re-running the script would
# otherwise fetch the whole file again on every start.
# NOTE(review): the `wget` package appears to save under a suffixed name
# ("creditcard (1).Rdata") rather than overwrite an existing file, so a
# repeated download would not even refresh dst_path - confirm against the
# installed wget version.
if not os.path.exists(dst_path):
    wget.download(url, dst_path)

# Parse the R data file and convert it into Python objects.
parsed_res = rdata.parser.parse_file(dst_path)
res = rdata.conversion.convert(parsed_res)

# Take the 'creditcard' table, renumber its rows from zero and drop the
# 'Time' column, which is not used as a feature below.
dataset = res['creditcard'].reset_index(drop=True).drop(['Time'], axis=1)
# Separate the label column from the features.
y = dataset['Class'].astype(int)  # labels as integers
df = dataset.drop(['Class'], axis=1)
df.columns = df.columns.astype(str)  # make sure every column name is a string
print("Data subsets created")

# First split: 60% of the rows for training, the rest for testing,
# stratified on the label so both parts keep the same class ratio.
X_train, X_test, y_train, y_test = train_test_split(
    df,
    y,
    train_size=0.6,
    random_state=0,
    stratify=y,
)

# Second split: keep only 20% of the training part (again stratified) and
# discard the remainder, i.e. the model is fitted on roughly 12% of all rows.
X_train, _, y_train, _ = train_test_split(
    X_train,
    y_train,
    train_size=0.2,
    random_state=0,
    stratify=y_train,
)

# Renumber the training rows from zero; the test rows keep their original index.
X_train = X_train.reset_index(drop=True)
y_train = y_train.reset_index(drop=True)
# Names of the feature columns: V1 ... V28 followed by Amount.
numerical_features = [f"V{i}" for i in range(1, 29)] + ['Amount']

# Pipeline applied to those columns: standardization only.
numerical_pipeline = make_pipeline(StandardScaler())

# Column transformer that routes the listed feature columns through the
# numerical pipeline (there is no categorical branch in this script).
preprocessor = make_column_transformer(
    (numerical_pipeline, numerical_features),
)

# Outlier detector: pyod's MCD with its default settings.
clf = MCD()

# Full model: standardize the features, then hand them to the MCD detector.
model_pipeline = make_pipeline(preprocessor, clf)
# Destination file for the serialized pipeline.
saved_model_path = "model.joblib"

print("Preprocessing Data")
# Fit the pipeline on the training features only (no labels are passed),
# then predict on the held-out test features.
y_test_pred = model_pipeline.fit(X_train).predict(X_test)

print("Serializing Model")
# Persist the fitted pipeline (preprocessing + detector) to disk.
joblib.dump(model_pipeline, saved_model_path)
print("Model Serialized and Saved")