kgauvin603 committed on
Commit
b92fc39
1 Parent(s): 30dd22c

Create train.py

Files changed (1)
  1. train.py +98 -0
train.py ADDED
@@ -0,0 +1,98 @@
+ import seaborn as sns
+ import pandas as pd
+ import numpy as np
+ import pyod
+ import pyreadr
+ import urllib
+ import rdata
+ import wget
+ import os
+ import gradio as gr
+ import joblib
+ import subprocess
+ import json
+ import uuid
+ import warnings
+ from sklearn.metrics import f1_score, confusion_matrix
+ from pyod.models.mcd import MCD
+ from pyod.utils.data import generate_data
+ from pyod.utils.data import evaluate_print
+ from sklearn.datasets import fetch_openml
+ from sklearn.preprocessing import StandardScaler, OneHotEncoder
+ from sklearn.compose import make_column_transformer
+ from sklearn.pipeline import make_pipeline
+ from sklearn.model_selection import train_test_split
+ from sklearn.linear_model import LinearRegression
+ from sklearn.metrics import mean_squared_error, r2_score
+ from pathlib import Path
+ from threading import Lock
+ from huggingface_hub import CommitScheduler
+ from huggingface_hub import HfApi
+ from IPython.display import display, HTML
+
+ # Ignore all warnings
+ warnings.filterwarnings("ignore")
+
+ # Download the dataset - in a real-world scenario we would use the CSV with the appended data
+ url = "http://www.ulb.ac.be/di/map/adalpozz/data/creditcard.Rdata"
+ dst_path = "./creditcard.Rdata"
+ if not os.path.exists(dst_path):  # avoid re-downloading on repeated runs
+     wget.download(url, dst_path)
+
+ # Load the dataset
+ parsed_res = rdata.parser.parse_file(dst_path)
+ res = rdata.conversion.convert(parsed_res)
+ dataset = res['creditcard'].reset_index(drop=True).drop(['Time'], axis=1)
+
+ # Prepare the data
+ y = dataset['Class'].astype(int)  # Convert to integers
+ df = dataset.drop(['Class'], axis=1)
+ df.columns = df.columns.astype(str)
+
+ print("Data subsets created")
+
+ # Split the data: hold out 40% for testing, then keep 20% of the remaining
+ # split for training (12% of the full dataset)
+ X_train, X_test, y_train, y_test = train_test_split(df, y, train_size=0.6, random_state=0, stratify=y)
+ X_train, _, y_train, _ = train_test_split(X_train, y_train, train_size=0.2, random_state=0, stratify=y_train)
+
+ # Reset indices
+ X_train.reset_index(drop=True, inplace=True)
+ y_train.reset_index(drop=True, inplace=True)
+
+ # Define the numerical features and the pipeline for numerical features
+ numerical_features = ['V1', 'V2', 'V3', 'V4', 'V5', 'V6', 'V7', 'V8', 'V9', 'V10',
+                       'V11', 'V12', 'V13', 'V14', 'V15', 'V16', 'V17', 'V18', 'V19', 'V20',
+                       'V21', 'V22', 'V23', 'V24', 'V25', 'V26', 'V27', 'V28', 'Amount']
+
+ numerical_pipeline = make_pipeline(
+     StandardScaler()  # Standardize numerical features to zero mean and unit variance
+ )
+
+ # Create a column transformer named preprocessor to apply the scaling pipeline to the numerical features
+ preprocessor = make_column_transformer(
+     (numerical_pipeline, numerical_features)
+ )
+
+ # Create the model: Minimum Covariance Determinant (MCD) outlier detector
+ clf = MCD()
+
+ # Create a pipeline combining the preprocessing step (scaling) with the MCD anomaly detector
+ model_pipeline = make_pipeline(
+     preprocessor,  # Apply preprocessing steps
+     clf            # Fit the MCD anomaly detector
+ )
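+
+ # Note: PyOD detectors such as MCD return binary labels from predict():
+ # 0 for inliers and 1 for outliers, so the pipeline's output can be compared
+ # directly against the Class labels (1 = fraud)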
+
+ print("Preprocessing Data")
+
+ # Fit the model on the training data and predict anomaly labels for the test set
+ model_pipeline.fit(X_train)
+ y_test_pred = model_pipeline.predict(X_test)
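+
+ # Sketch of an evaluation step (an assumed addition): score the test-set
+ # predictions using the f1_score and confusion_matrix imports above
+ print("F1 score:", f1_score(y_test, y_test_pred))
+ print("Confusion matrix:\n", confusion_matrix(y_test, y_test_pred))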
+
+ print("Serializing Model")
+
+ saved_model_path = "model.joblib"
+
+ joblib.dump(model_pipeline, saved_model_path)
+
+ print("Model Serialized and Saved")