import seaborn as sns
import pandas as pd
import numpy as np
import pyod
import pyreadr
import urllib
import rdata
import wget
import os
import gradio as gr
import joblib
import subprocess
import json
import uuid
import warnings

from sklearn.metrics import f1_score, confusion_matrix
from pyod.models.mcd import MCD
from pyod.utils.data import generate_data, evaluate_print
from sklearn.datasets import fetch_openml
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import make_column_transformer
from sklearn.pipeline import make_pipeline
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score
from pathlib import Path
from threading import Lock
from huggingface_hub import CommitScheduler, HfApi
from IPython.display import display, HTML

# Ignore all warnings
warnings.filterwarnings("ignore")

# Download the dataset. In a real-world scenario we would use the CSV with the appended data.
url = "http://www.ulb.ac.be/di/map/adalpozz/data/creditcard.Rdata"
dst_path = "./creditcard.Rdata"
if not os.path.exists(dst_path):  # avoid duplicate downloads on re-runs
    wget.download(url, dst_path)

# Load the dataset
parsed_res = rdata.parser.parse_file(dst_path)
res = rdata.conversion.convert(parsed_res)
dataset = res['creditcard'].reset_index(drop=True).drop(['Time'], axis=1)

# Prepare the data
y = dataset['Class'].astype(int)  # Convert labels to integers
df = dataset.drop(['Class'], axis=1)
df.columns = df.columns.astype(str)

print("Data subsets created")

# Split the data: hold out 40% for testing, then keep a stratified 20%
# subsample of the training portion to speed up model fitting
X_train, X_test, y_train, y_test = train_test_split(
    df, y, train_size=0.6, random_state=0, stratify=y
)
X_train, _, y_train, _ = train_test_split(
    X_train, y_train, train_size=0.2, random_state=0, stratify=y_train
)

# Reset indices
X_train.reset_index(drop=True, inplace=True)
y_train.reset_index(drop=True, inplace=True)

# Define the numerical features and the pipeline for numerical features
numerical_features = [
    'V1', 'V2', 'V3', 'V4', 'V5', 'V6', 'V7', 'V8', 'V9', 'V10',
    'V11', 'V12', 'V13', 'V14', 'V15', 'V16', 'V17', 'V18', 'V19', 'V20',
    'V21', 'V22', 'V23', 'V24', 'V25', 'V26', 'V27', 'V28', 'Amount'
]

numerical_pipeline = make_pipeline(
    StandardScaler()  # Standardize numerical features
)

# Column transformer that applies the numerical pipeline to the numerical features
preprocessor = make_column_transformer(
    (numerical_pipeline, numerical_features)
)

# Create the anomaly detection model (Minimum Covariance Determinant)
clf = MCD()

# Pipeline combining the preprocessing steps with the MCD anomaly detector
model_pipeline = make_pipeline(
    preprocessor,  # Apply preprocessing steps
    clf  # Fit the MCD anomaly detector
)

print("Preprocessing Data")

# Fit the model on the training set and predict anomalies on the test set
model_pipeline.fit(X_train)
y_test_pred = model_pipeline.predict(X_test)

print("Serializing Model")

saved_model_path = "model.joblib"
joblib.dump(model_pipeline, saved_model_path)

print("Model Serialized and Saved")
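
# --- Optional evaluation sketch (not part of the original script). A minimal
# sanity check of the serialized pipeline on the held-out test split, using
# the f1_score / confusion_matrix / evaluate_print imports above. It assumes
# model.joblib was just written and that X_test / y_test are still in scope.

loaded_pipeline = joblib.load(saved_model_path)

# PyOD detectors label inliers as 0 and outliers (here: fraud) as 1,
# so predictions are directly comparable to the binary `Class` labels
y_test_pred = loaded_pipeline.predict(X_test)

print("F1 score:", f1_score(y_test, y_test_pred))
print("Confusion matrix:\n", confusion_matrix(y_test, y_test_pred))

# evaluate_print expects raw outlier scores; the sklearn Pipeline forwards
# decision_function to the final PyOD estimator
evaluate_print('MCD', y_test, loaded_pipeline.decision_function(X_test))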