Spaces:

kgauvin603
/

OCW-FraudDetection

Sleeping

App Files Files Community

OCW-FraudDetection / train.py

kgauvin603

Update train.py

176392b verified about 1 month ago

raw

history blame contribute delete

No virus

2.38 kB

	import seaborn as sns
	import pandas as pd
	import numpy as np
	import pyod
	import pyreadr
	import urllib
	import rdata
	import wget
	import os
	import joblib
	import warnings
	from pyod.models.mcd import MCD
	from sklearn.preprocessing import StandardScaler
	from sklearn.compose import make_column_transformer
	from sklearn.pipeline import make_pipeline
	from sklearn.model_selection import train_test_split

	# Silence every warning category for the remainder of the run.
	warnings.filterwarnings("ignore")

	# Download the dataset, unless a copy is already on disk.
	# wget.download does not overwrite an existing target: it stores the new
	# copy under a numbered variant of the name. Downloading unconditionally
	# would therefore re-fetch the whole file on every run, leave a stray
	# duplicate behind, and the loading step would still read the old copy
	# at dst_path.
	url = "http://www.ulb.ac.be/di/map/adalpozz/data/creditcard.Rdata"
	dst_path = "./creditcard.Rdata"
	if not os.path.isfile(dst_path):
	    wget.download(url, dst_path)

	# Parse the downloaded R data file and convert it to Python objects; the
	# converted result is looked up by R object name below.
	r_objects = rdata.conversion.convert(rdata.parser.parse_file(dst_path))

	# Take the 'creditcard' table, renumber its rows from zero and discard the
	# 'Time' column, which is not used by this script.
	creditcard = r_objects['creditcard'].reset_index(drop=True)
	creditcard = creditcard.drop(columns=['Time'])

	# Separate the label column (cast to int) from the remaining feature columns.
	y = creditcard['Class'].astype(int)
	df = creditcard.drop(columns=['Class'])

	# Make sure every column label is a plain string.
	df.columns = df.columns.astype(str)

	print("Data subsets created")

	# Stage 1: stratified 60/40 split of the full data into a training pool and
	# a held-out test set.
	X_train, X_test, y_train, y_test = train_test_split(
	    df,
	    y,
	    train_size=0.6,
	    random_state=0,
	    stratify=y,
	)

	# Stage 2: keep a stratified 20% of the training pool as the actual training
	# set; the remaining 80% of the pool is discarded.
	X_train, _, y_train, _ = train_test_split(
	    X_train,
	    y_train,
	    train_size=0.2,
	    random_state=0,
	    stratify=y_train,
	)

	# Renumber the training rows from zero, dropping the original row labels.
	X_train = X_train.reset_index(drop=True)
	y_train = y_train.reset_index(drop=True)

	# Columns fed to the model: 'V1' through 'V28' plus 'Amount'.
	numerical_features = [*(f'V{i}' for i in range(1, 29)), 'Amount']

	# Preprocessing: a one-step pipeline that standardizes the columns listed
	# above. Scaling is the only preprocessing applied.
	preprocessor = make_column_transformer(
	    (make_pipeline(StandardScaler()), numerical_features),
	)

	# Full model: the preprocessing step followed by pyod's MCD detector,
	# constructed with its default settings.
	model_pipeline = make_pipeline(preprocessor, MCD())

	print("Preprocessing Data")

	# Fit the whole pipeline on the training features only (no labels are
	# passed), then run the fitted pipeline over the held-out features.
	# The predictions are stored in y_test_pred but not used further here.
	y_test_pred = model_pipeline.fit(X_train).predict(X_test)

	print("Serializing Model")

	# Persist the fitted pipeline as a single file in the current working
	# directory.
	saved_model_path = "model.joblib"
	joblib.dump(value=model_pipeline, filename=saved_model_path)

	print(f"Model Serialized and Saved to {saved_model_path}")