import seaborn as sns
import pandas as pd
import numpy as np
import pyod
import pyreadr
import urllib
import rdata
import wget
import os
import gradio as gr
import joblib
import subprocess
import json
import uuid
import warnings
from sklearn.metrics import f1_score, confusion_matrix, mean_squared_error, r2_score
from pyod.models.mcd import MCD
from pyod.utils.data import generate_data, evaluate_print
from sklearn.datasets import fetch_openml
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import make_column_transformer
from sklearn.pipeline import make_pipeline
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from pathlib import Path
from threading import Lock
from huggingface_hub import CommitScheduler, HfApi
from IPython.display import display, HTML

# Ignore all warnings
warnings.filterwarnings("ignore")

# Download the dataset - for a real-world scenario we would use the CSV with the appended data
url = "http://www.ulb.ac.be/di/map/adalpozz/data/creditcard.Rdata"
dst_path = "./creditcard.Rdata"
wget.download(url, dst_path)

# Load the dataset
parsed_res = rdata.parser.parse_file(dst_path)
res = rdata.conversion.convert(parsed_res)
dataset = res['creditcard'].reset_index(drop=True).drop(['Time'], axis=1)
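
# Quick sanity check (optional): fraud cases are a tiny minority of this
# dataset, which is why the splits below are stratified on `Class`.
print(dataset['Class'].value_counts())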

# Prepare the data
y = dataset['Class'].astype(int)  # Convert to integers
df = dataset.drop(['Class'], axis=1)
df.columns = df.columns.astype(str)

print("Data subsets created")

# Split the data
X_train, X_test, y_train, y_test = train_test_split(df, y, train_size=0.6, random_state=0, stratify=y)
X_train, _, y_train, _ = train_test_split(X_train, y_train, train_size=0.2, random_state=0, stratify=y_train)
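
# Note: the nested split keeps 20% of the 60% training slice, i.e. roughly
# 12% of the full dataset, presumably to keep MCD training manageable.
print(f"Train shape: {X_train.shape}, Test shape: {X_test.shape}")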

# Reset indices
X_train.reset_index(drop=True, inplace=True)
y_train.reset_index(drop=True, inplace=True)

# Define the numerical features and the pipeline for numerical features
numerical_features = ['V1', 'V2', 'V3', 'V4', 'V5', 'V6', 'V7', 'V8', 'V9', 'V10',
                      'V11', 'V12', 'V13', 'V14', 'V15', 'V16', 'V17', 'V18', 'V19', 'V20',
                      'V21', 'V22', 'V23', 'V24', 'V25', 'V26', 'V27', 'V28', 'Amount']

numerical_pipeline = make_pipeline(
    StandardScaler()  # Example: Standardize numerical features
)

# Creating a column transformer named preprocessor to apply the numerical pipeline to the numerical features.
preprocessor = make_column_transformer(
    (numerical_pipeline, numerical_features)
)

# Creating the model: MCD (Minimum Covariance Determinant) outlier detector
clf = MCD()

# Creating a pipeline combining the preprocessing step (scaling) with the MCD anomaly detection model.
model_pipeline = make_pipeline(
    preprocessor,  # Applying preprocessing steps
    clf  # MCD anomaly detector
)

print("Preprocessing Data")

# Fit the model on the training data and predict anomalies on the test set
model_pipeline.fit(X_train)
y_test_pred = model_pipeline.predict(X_test)
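
# Optional evaluation (a minimal sketch using the metrics already imported
# above). PyOD labels inliers as 0 and outliers as 1, matching `Class`.
print("Confusion matrix:\n", confusion_matrix(y_test, y_test_pred))
print("F1 score:", f1_score(y_test, y_test_pred))
evaluate_print('MCD', y_test, model_pipeline.decision_function(X_test))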

print("Serializing Model")

saved_model_path = "model.joblib"

joblib.dump(model_pipeline, saved_model_path)

print("Model Serialized and Saved")