import os
import warnings

import gradio as gr
import pandas as pd
import rdata
import wget
from pyod.models.mcd import MCD
from sklearn.compose import make_column_transformer
from sklearn.metrics import f1_score, confusion_matrix
from sklearn.model_selection import train_test_split
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler

# Ignore all warnings to keep the app logs readable
warnings.filterwarnings("ignore")
# Download the dataset (skip the download if a local copy already exists)
url = "http://www.ulb.ac.be/di/map/adalpozz/data/creditcard.Rdata"
dst_path = "./creditcard.Rdata"
if not os.path.exists(dst_path):
    wget.download(url, dst_path)
# Load the dataset
parsed_res = rdata.parser.parse_file(dst_path)
res = rdata.conversion.convert(parsed_res)
dataset = res['creditcard'].reset_index(drop=True).drop(['Time'], axis=1)
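# Quick sanity check (an optional addition, not in the original script): the ULB
# credit-card data is highly imbalanced, so it can help to confirm the shape and
# class balance here.
# print(dataset.shape)                    # ~284k rows, 30 columns after dropping 'Time'
# print(dataset['Class'].value_counts())  # fraud (Class == 1) is well under 1% of rows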
# Prepare the data: integer labels plus a feature frame without the label column
y = dataset['Class'].astype(int)
df = dataset.drop(['Class'], axis=1)
df.columns = df.columns.astype(str)
print("Data prepared")
# Split the data: hold out 40% for testing, then keep only 20% of the training
# portion (12% of the full dataset) to speed up fitting, stratifying on the label
X_train, X_test, y_train, y_test = train_test_split(df, y, train_size=0.6, random_state=0, stratify=y)
X_train, _, y_train, _ = train_test_split(X_train, y_train, train_size=0.2, random_state=0, stratify=y_train)
# Reset indices
X_train.reset_index(drop=True, inplace=True)
y_train.reset_index(drop=True, inplace=True)
# Define the numerical features: the PCA components V1..V28 plus Amount
numerical_features = [f'V{i}' for i in range(1, 29)] + ['Amount']

# Pipeline for numerical features: standardize to zero mean and unit variance
numerical_pipeline = make_pipeline(
    StandardScaler()
)

# Column transformer applying the numerical pipeline to the numerical features
# (after dropping 'Time' and 'Class', every remaining column is numerical)
preprocessor = make_column_transformer(
    (numerical_pipeline, numerical_features)
)
# Create the model: a Minimum Covariance Determinant (MCD) anomaly detector
clf = MCD()

# Pipeline combining the scaling preprocessor with the MCD detector
model_pipeline = make_pipeline(
    preprocessor,
    clf
)
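# Note (an assumption, not tuned here): pyod's MCD flags the top `contamination`
# fraction of points as anomalies, with a default of 0.1. Fraud is far rarer than
# 10% in this data, so something like MCD(contamination=0.01) may be worth trying.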
print("Preprocessing Data")
# Fit the model and train model to predict anomalies
model_pipeline.fit(X_train)
y_test_pred = model_pipeline.predict(X_test)
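# Optional sanity check (a minimal sketch, not in the original script): the
# f1_score / confusion_matrix imports are otherwise unused, so report held-out
# performance here. MCD labels: 0 = inlier (normal), 1 = outlier (anomaly).
print(f"Test F1 score: {f1_score(y_test, y_test_pred):.4f}")
print(f"Confusion matrix:\n{confusion_matrix(y_test, y_test_pred)}")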
# Define the predict function for a single transaction stored in a CSV file
def predict(csv_filename):
    # Read the CSV file (a single row with no header)
    df = pd.read_csv(csv_filename, header=None)
    # Convert the row to a list of floats
    client_data = df.iloc[0].tolist()
    # The model expects exactly V1..V28 plus Amount
    if len(client_data) != 29:
        raise ValueError("The CSV file must contain exactly 29 values.")
    # Build a one-row DataFrame with the expected column names
    input_df = pd.DataFrame([client_data], columns=numerical_features)
    # Score the transaction with the fitted pipeline
    prediction = model_pipeline.predict(input_df)
    return prediction[0], client_data[-1]  # Return both the prediction and Amount
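# Example usage (assumes a Ted.csv with one row of 29 comma-separated values
# sits next to app.py, as the dropdown below expects):
# label, amount = predict('Ted.csv')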
# Map each demo name to its bundled CSV filename
def get_csv_filename(name):
    name_to_filename = {
        'Ted': 'Ted.csv',
        'Bill': 'Bill.csv',
        'Jill': 'Jill.csv',
        'Juan': 'Juan.csv'
    }
    return name_to_filename.get(name, 'Ted.csv')  # Default to 'Ted.csv' if the name is not found
# Define the Gradio handler for a single prediction
def gradio_predict(name):
    csv_filename = get_csv_filename(name)
    prediction, amount = predict(csv_filename)
    return f"The flagged transaction amount is {amount} and the prediction is {prediction}"
# Define the function for bulk analysis of an uploaded CSV file
def bulk_analysis(file):
    # Read the uploaded CSV file
    df = pd.read_csv(file.name)
    # Assume the last column is the true label ('Class') and the rest are features
    X_test = df.iloc[:, :-1]
    y_test = df.iloc[:, -1]
    # Score every row with the fitted pipeline
    y_test_pred = model_pipeline.predict(X_test)
    # Debugging: print counts of anomalies in actual and predicted labels
    actual_anomalies = sum(y_test == 1)
    predicted_anomalies = sum(y_test_pred == 1)
    print(f"Actual anomalies: {actual_anomalies}, Predicted anomalies: {predicted_anomalies}")
    # Keep the rows where the actual label and the prediction are both 1
    correctly_predicted_anomalies = X_test[(y_test == 1) & (y_test_pred == 1)]
    print(f"Correctly predicted anomalies: {len(correctly_predicted_anomalies)}")
    # Save the results to a CSV file and return its path
    result_filename = "correct_anomalies.csv"
    correctly_predicted_anomalies.to_csv(result_filename, index=False)
    return result_filename
# Create the Gradio interface for single predictions
iface = gr.Interface(
    fn=gradio_predict,
    inputs=gr.Dropdown(choices=['Ted', 'Bill', 'Jill', 'Juan'], label="Select a name"),
    outputs="text"
)

# Bulk analysis upload interface; the handler returns a file path, so use a
# File output (the original "text" output would only display the path string)
bulk_iface = gr.Interface(
    fn=bulk_analysis,
    inputs=gr.File(label="Bulk Analysis"),
    outputs=gr.File(label="Correctly predicted anomalies")
)

# Combine the interfaces into tabs
combined_iface = gr.TabbedInterface(
    [iface, bulk_iface],
    tab_names=["Single Prediction", "Bulk Analysis"]
)

# Launch the app (share=True only matters when running locally; it is ignored
# on Hugging Face Spaces)
combined_iface.launch(share=True)