import os
import warnings

import gradio as gr
import pandas as pd
import rdata
import wget
from pyod.models.mcd import MCD
from sklearn.compose import make_column_transformer
from sklearn.metrics import confusion_matrix, f1_score
from sklearn.model_selection import train_test_split
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler

# Ignore all warnings
warnings.filterwarnings("ignore")

# Download the dataset (skip the download if the file is already present)
url = "http://www.ulb.ac.be/di/map/adalpozz/data/creditcard.Rdata"
dst_path = "./creditcard.Rdata"
if not os.path.exists(dst_path):
    wget.download(url, dst_path)

# Load the dataset and drop the 'Time' column
parsed_res = rdata.parser.parse_file(dst_path)
res = rdata.conversion.convert(parsed_res)
dataset = res['creditcard'].reset_index(drop=True).drop(['Time'], axis=1)

# Prepare the data: integer labels and string column names for the features
y = dataset['Class'].astype(int)
df = dataset.drop(['Class'], axis=1)
df.columns = df.columns.astype(str)
print("Data subsets created")

# Split the data: hold out 40% for testing, then keep 20% of the remainder for training
X_train, X_test, y_train, y_test = train_test_split(
    df, y, train_size=0.6, random_state=0, stratify=y
)
X_train, _, y_train, _ = train_test_split(
    X_train, y_train, train_size=0.2, random_state=0, stratify=y_train
)

# Reset indices
X_train.reset_index(drop=True, inplace=True)
y_train.reset_index(drop=True, inplace=True)

# Define the numerical features and the pipeline for numerical features
numerical_features = [f'V{i}' for i in range(1, 29)] + ['Amount']
numerical_pipeline = make_pipeline(
    StandardScaler()  # Standardize numerical features
)

# Create a column transformer that applies the numerical pipeline to the numerical features
preprocessor = make_column_transformer(
    (numerical_pipeline, numerical_features)
)

# Create the model
clf = MCD()

# Combine the preprocessing step (scaling) with the MCD model in a single pipeline
model_pipeline = make_pipeline(
    preprocessor,  # Apply preprocessing steps
    clf            # Train the MCD model
)

print("Preprocessing Data")

# Fit the pipeline on the training split and predict anomalies on the test split
model_pipeline.fit(X_train)
y_test_pred = model_pipeline.predict(X_test)
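
# Optional sanity check (not part of the original app flow): f1_score and
# confusion_matrix are imported above, so the held-out split can be used to gauge
# how well the MCD detector separates fraudulent transactions
# (PyOD label convention: 0 = inlier, 1 = outlier).
print("F1 score on the test split:", f1_score(y_test, y_test_pred))
print("Confusion matrix (rows = actual, cols = predicted):")
print(confusion_matrix(y_test, y_test_pred))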
# Define the predict function for a single client CSV file
def predict(csv_filename):
    # Read the CSV file (a single header-less row of values)
    df = pd.read_csv(csv_filename, header=None)

    # Convert the first row of the DataFrame to a list of floats
    client_data = df.iloc[0].tolist()

    # Check that exactly 29 values (V1..V28 and Amount) were provided
    if len(client_data) != 29:
        raise ValueError("The CSV file must contain exactly 29 values.")

    # Build a single-row DataFrame; the values are assumed to be in the same order
    # as numerical_features, i.e. V1..V28 followed by Amount
    data = dict(zip(numerical_features, client_data))
    input_df = pd.DataFrame([data])
    amount = data['Amount']

    # Make a prediction using the fitted pipeline
    prediction = model_pipeline.predict(input_df)

    return prediction[0], amount  # Return both the prediction and the amount


# Map the names to their respective CSV filenames
def get_csv_filename(name):
    name_to_filename = {
        'Ted': 'Ted.csv',
        'Bill': 'Bill.csv',
        'Jill': 'Jill.csv',
        'Juan': 'Juan.csv'
    }
    return name_to_filename.get(name, 'Ted.csv')  # Default to 'Ted.csv' if the name is not found


# Gradio interface function for a single prediction
def gradio_predict(name):
    csv_filename = get_csv_filename(name)
    prediction, amount = predict(csv_filename)
    return f"The flagged transaction amount is {amount} and the prediction is {prediction}"


# Function for bulk analysis of an uploaded CSV file
def bulk_analysis(file):
    # Read the uploaded CSV file
    df = pd.read_csv(file.name)

    # Assume the last column holds the labels and the rest are features
    X_test = df.iloc[:, :-1]
    y_test = df.iloc[:, -1]

    # Make predictions using the fitted pipeline
    y_test_pred = model_pipeline.predict(X_test)

    # Debugging: print counts of anomalies in the actual and predicted labels
    actual_anomalies = sum(y_test == 1)
    predicted_anomalies = sum(y_test_pred == 1)
    print(f"Actual anomalies: {actual_anomalies}, Predicted anomalies: {predicted_anomalies}")

    # Find rows where the actual and predicted labels are both 1
    correctly_predicted_anomalies = X_test[(y_test == 1) & (y_test_pred == 1)]
    print(f"Correctly predicted anomalies: {len(correctly_predicted_anomalies)}")

    # Save the results to a CSV file and return the path to the saved file
    result_filename = "correct_anomalies.csv"
    correctly_predicted_anomalies.to_csv(result_filename, index=False)
    return result_filename


# Create the Gradio interface for single predictions
iface = gr.Interface(
    fn=gradio_predict,
    inputs=gr.Dropdown(choices=['Ted', 'Bill', 'Jill', 'Juan'], label="Select a name"),
    outputs="text"
)

# Add the bulk analysis upload interface
bulk_iface = gr.Interface(
    fn=bulk_analysis,
    inputs=gr.File(label="Bulk Analysis"),
    outputs="text"
)

# Combine the interfaces into a tabbed app
combined_iface = gr.TabbedInterface(
    [iface, bulk_iface],
    tab_names=["Single Prediction", "Bulk Analysis"]
)
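
# NOTE (assumption): the dropdown above expects local files Ted.csv, Bill.csv, Jill.csv
# and Juan.csv, each holding a single header-less row of 29 values (V1..V28, Amount).
# If the deployment does not already ship these files, illustrative samples can be
# generated from the test split like this (the chosen rows are arbitrary):
for i, name in enumerate(['Ted', 'Bill', 'Jill', 'Juan']):
    sample_path = f"{name}.csv"
    if not os.path.exists(sample_path):
        X_test.iloc[[i]].to_csv(sample_path, header=False, index=False)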
# Launch the combined interface with a public share link
combined_iface.launch(share=True)
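
# Optional (not part of the original app flow): the fitted pipeline could be persisted
# and reloaded on startup instead of being retrained on every launch, e.g.:
#   import joblib
#   joblib.dump(model_pipeline, "model_pipeline.joblib")  # filename is illustrative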