kgauvin603 committed on
Commit
b92fc39
1 Parent(s): 30dd22c

Create train.py

Files changed (1)
  1. train.py +98 -0
train.py ADDED
@@ -0,0 +1,98 @@
+ import seaborn as sns
+ import pandas as pd
+ import numpy as np
+ import pyod
+ import pyreadr
+ import urllib
+ import rdata
+ import wget
+ import os
+ import gradio as gr
+ import joblib
+ import subprocess
+ import json
+ import uuid
+ import warnings
+ from sklearn.metrics import f1_score, confusion_matrix
+ from pyod.models.mcd import MCD
+ from pyod.utils.data import generate_data
+ from pyod.utils.data import evaluate_print
+ from sklearn.datasets import fetch_openml
+ from sklearn.preprocessing import StandardScaler, OneHotEncoder
+ from sklearn.compose import make_column_transformer
+ from sklearn.pipeline import make_pipeline
+ from sklearn.model_selection import train_test_split
+ from sklearn.linear_model import LinearRegression
+ from sklearn.metrics import mean_squared_error, r2_score
+ from pathlib import Path
+ from threading import Lock
+ from huggingface_hub import CommitScheduler
+ from huggingface_hub import HfApi
+ from IPython.display import display, HTML
+
+ # Ignore all warnings
+ warnings.filterwarnings("ignore")
+
+ # Download the dataset - in a real-world scenario we would use the CSV with the appended data
+ url = "http://www.ulb.ac.be/di/map/adalpozz/data/creditcard.Rdata"
+ dst_path = "./creditcard.Rdata"
+ if not os.path.exists(dst_path):  # avoid re-downloading on repeated runs
+     wget.download(url, dst_path)
+
+ # Load the dataset
+ parsed_res = rdata.parser.parse_file(dst_path)
+ res = rdata.conversion.convert(parsed_res)
+ dataset = res['creditcard'].reset_index(drop=True).drop(['Time'], axis=1)
+
+ # Prepare the data
+ y = dataset['Class'].astype(int)  # Convert to integers
+ df = dataset.drop(['Class'], axis=1)
+ df.columns = df.columns.astype(str)
+
+ print("Data subsets created")
+
+ # Split the data: hold out 40% for testing, then keep 20% of the remaining
+ # split for training (12% of the full dataset)
+ X_train, X_test, y_train, y_test = train_test_split(df, y, train_size=0.6, random_state=0, stratify=y)
+ X_train, _, y_train, _ = train_test_split(X_train, y_train, train_size=0.2, random_state=0, stratify=y_train)
+
+ # Reset indices
+ X_train.reset_index(drop=True, inplace=True)
+ y_train.reset_index(drop=True, inplace=True)
+
+ # Define the numerical features and the pipeline for numerical features
+ numerical_features = ['V1', 'V2', 'V3', 'V4', 'V5', 'V6', 'V7', 'V8', 'V9', 'V10',
+                       'V11', 'V12', 'V13', 'V14', 'V15', 'V16', 'V17', 'V18', 'V19', 'V20',
+                       'V21', 'V22', 'V23', 'V24', 'V25', 'V26', 'V27', 'V28', 'Amount']
+
+ numerical_pipeline = make_pipeline(
+     StandardScaler()  # Standardize numerical features to zero mean and unit variance
+ )
+
+ # Create a column transformer named preprocessor to apply the scaling pipeline to the numerical features
+ preprocessor = make_column_transformer(
+     (numerical_pipeline, numerical_features)
+ )
+
+ # Create the model: Minimum Covariance Determinant (MCD) outlier detector
+ clf = MCD()
+
+ # Create a pipeline combining the preprocessing step (scaling) with the MCD anomaly detector
+ model_pipeline = make_pipeline(
+     preprocessor,  # Apply preprocessing steps
+     clf            # Fit the MCD anomaly detector
+ )
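+
+ # Note: PyOD detectors such as MCD return binary labels from predict():
+ # 0 for inliers and 1 for outliers, so the pipeline's output can be compared
+ # directly against the Class labels (1 = fraud)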
+
+ print("Preprocessing Data")
+
+ # Fit the model on the training data and predict anomaly labels for the test set
+ model_pipeline.fit(X_train)
+ y_test_pred = model_pipeline.predict(X_test)
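+
+ # Sketch of an evaluation step (an assumed addition): score the test-set
+ # predictions using the f1_score and confusion_matrix imports above
+ print("F1 score:", f1_score(y_test, y_test_pred))
+ print("Confusion matrix:\n", confusion_matrix(y_test, y_test_pred))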
+
+ print("Serializing Model")
+
+ saved_model_path = "model.joblib"
+
+ joblib.dump(model_pipeline, saved_model_path)
+
+ print("Model Serialized and Saved")