import seaborn as sns
import pandas as pd
import numpy as np
import pyod
import pyreadr
import urllib
import rdata
import wget
import os
import joblib
import warnings
from pyod.models.mcd import MCD
from sklearn.preprocessing import StandardScaler
from sklearn.compose import make_column_transformer
from sklearn.pipeline import make_pipeline
from sklearn.model_selection import train_test_split

# Ignore all warnings
warnings.filterwarnings("ignore")

# Download the dataset
url = "http://www.ulb.ac.be/di/map/adalpozz/data/creditcard.Rdata"
dst_path = "./creditcard.Rdata"
wget.download(url, dst_path)

# Load the .Rdata file and drop the 'Time' column
parsed_res = rdata.parser.parse_file(dst_path)
res = rdata.conversion.convert(parsed_res)
dataset = res['creditcard'].reset_index(drop=True).drop(['Time'], axis=1)

# Prepare the data: separate the target from the features
y = dataset['Class'].astype(int)  # Convert the target to integers
df = dataset.drop(['Class'], axis=1)
df.columns = df.columns.astype(str)
print("Data subsets created")

# Split the data: hold out 40% for testing, then keep a stratified 20% subsample of the training split
X_train, X_test, y_train, y_test = train_test_split(
    df, y, train_size=0.6, random_state=0, stratify=y
)
X_train, _, y_train, _ = train_test_split(
    X_train, y_train, train_size=0.2, random_state=0, stratify=y_train
)

# Reset indices
X_train.reset_index(drop=True, inplace=True)
y_train.reset_index(drop=True, inplace=True)

# Define the numerical features and the pipeline for numerical features
numerical_features = [f'V{i}' for i in range(1, 29)] + ['Amount']
numerical_pipeline = make_pipeline(
    StandardScaler()  # Standardize numerical features
)

# Create a column transformer named preprocessor that applies the numerical pipeline to the numerical features
preprocessor = make_column_transformer(
    (numerical_pipeline, numerical_features)
)

# Create the MCD (Minimum Covariance Determinant) outlier detector
clf = MCD()

# Create a pipeline combining the preprocessing step (scaling) with the MCD model
model_pipeline = make_pipeline(
    preprocessor,  # Apply preprocessing steps
    clf            # Train the MCD model
)

print("Preprocessing Data")

# Fit the model on the training data and predict anomalies on the test set
model_pipeline.fit(X_train)
y_test_pred = model_pipeline.predict(X_test)

print("Serializing Model")

# Save the model in the current working directory
saved_model_path = "model.joblib"
joblib.dump(model_pipeline, saved_model_path)

print(f"Model Serialized and Saved to {saved_model_path}")
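
# ------------------------------------------------------------------
# Follow-up sketch (not part of the original script): reload the
# serialized pipeline and score the held-out test set. It reuses the
# X_test / y_test variables defined above; the choice of ROC AUC as
# the metric is an illustrative assumption, not prescribed by the
# tutorial. PyOD detectors return 0 (inlier) / 1 (outlier) from
# predict() and raw outlier scores from decision_function(), both of
# which the sklearn pipeline delegates to the final MCD step.
from sklearn.metrics import roc_auc_score

loaded_pipeline = joblib.load(saved_model_path)

test_labels = loaded_pipeline.predict(X_test)            # binary outlier labels
test_scores = loaded_pipeline.decision_function(X_test)  # continuous outlier scores

print(f"Flagged {int(test_labels.sum())} potential frauds out of {len(X_test)} transactions")
print(f"ROC AUC on the test set: {roc_auc_score(y_test, test_scores):.3f}")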