kgauvin603 committed
Commit b6bd2c7
1 Parent(s): 176392b

Update app.py

Files changed (1)
  1. app.py +149 -4
app.py CHANGED
@@ -47,9 +47,154 @@ import warnings
  # Ignore all warnings
  warnings.filterwarnings("ignore")

- # Run the training script placed in the same directory as app.py
- # The training script will train and persist a linear regression
- # model with the filename 'model.joblib'
- subprocess.run(['python', 'train.py'])
+
+ # Download the dataset
+ url = "http://www.ulb.ac.be/di/map/adalpozz/data/creditcard.Rdata"
+ dst_path = "./creditcard.Rdata"
+ wget.download(url, dst_path)
+
+ # Load the dataset
+ parsed_res = rdata.parser.parse_file(dst_path)
+ res = rdata.conversion.convert(parsed_res)
+ dataset = res['creditcard'].reset_index(drop=True).drop(['Time'], axis=1)
+
+ # Prepare the data
+ y = dataset['Class'].astype(int)  # Convert to integers
+ df = dataset.drop(['Class'], axis=1)
+ df.columns = df.columns.astype(str)
+
+ print("Data subsets created")
+
+ # Split the data: hold out 40% for testing, then keep 20% of the training split for fitting
+ X_train, X_test, y_train, y_test = train_test_split(df, y, train_size=0.6, random_state=0, stratify=y)
+ X_train, _, y_train, _ = train_test_split(X_train, y_train, train_size=0.2, random_state=0, stratify=y_train)
+
+ # Reset indices
+ X_train.reset_index(drop=True, inplace=True)
+ y_train.reset_index(drop=True, inplace=True)
+
+ # Define the numerical features and the pipeline for numerical features
+ numerical_features = [f'V{i}' for i in range(1, 29)] + ['Amount']
+
+ numerical_pipeline = make_pipeline(
+     StandardScaler()  # Standardize numerical features
+ )
+
+ # Create a column transformer named preprocessor to apply the numerical pipeline to the numerical features
+ preprocessor = make_column_transformer(
+     (numerical_pipeline, numerical_features)
+ )
+
+ # Create the model
+ clf = MCD()
+
+ # Create a pipeline combining the preprocessing step (scaling) with the MCD model
+ model_pipeline = make_pipeline(
+     preprocessor,  # Apply preprocessing steps
+     clf  # Fit the MCD model
+ )
+
+ print("Preprocessing Data")
+
+ # Fit the model and use it to predict anomalies on the test set
+ model_pipeline.fit(X_train)
+ y_test_pred = model_pipeline.predict(X_test)
+
+ # Define the predict function
+ def predict(csv_filename):
+     # Read the CSV file
+     df = pd.read_csv(csv_filename, header=None)
+
+     # Convert the DataFrame to a list of floats
+     client_data = df.iloc[0].tolist()
+
+     # Check that the length of client_data is 29
+     if len(client_data) != 29:
+         raise ValueError("The CSV file must contain exactly 29 values.")
+
+     # Unpack the list of values
+     V1, V2, V3, V4, V5, V6, V7, V8, V9, V10, V11, V12, V13, V14, V15, V16, V17, V18, V19, V20, V21, V22, V23, V24, V25, V26, V27, V28, Amount = client_data
+
+     # Create the data dictionary
+     data = {
+         'V1': V1, 'V2': V2, 'V3': V3, 'V4': V4, 'V5': V5, 'V6': V6, 'V7': V7, 'V8': V8, 'V9': V9, 'V10': V10,
+         'V11': V11, 'V12': V12, 'V13': V13, 'V14': V14, 'V15': V15, 'V16': V16, 'V17': V17, 'V18': V18, 'V19': V19, 'V20': V20,
+         'V21': V21, 'V22': V22, 'V23': V23, 'V24': V24, 'V25': V25, 'V26': V26, 'V27': V27, 'V28': V28, 'Amount': Amount
+     }
+
+     # Convert the data dictionary to a DataFrame
+     input_df = pd.DataFrame([data])
+
+     # Make predictions using the trained pipeline
+     prediction = model_pipeline.predict(input_df)
+
+     return prediction[0], Amount  # Return both the prediction and Amount
+
+ # Define a function to map the names to their respective CSV filenames
+ def get_csv_filename(name):
+     name_to_filename = {
+         'Ted': 'Ted.csv',
+         'Bill': 'Bill.csv',
+         'Jill': 'Jill.csv',
+         'Juan': 'Juan.csv'
+     }
+     return name_to_filename.get(name, 'Ted.csv')  # Default to 'Ted.csv' if the name is not found
+
+ # Define the Gradio interface function for single prediction
+ def gradio_predict(name):
+     csv_filename = get_csv_filename(name)
+     prediction, amount = predict(csv_filename)
+     return f"The flagged transaction amount is {amount} and the prediction is {prediction}"
+
+ # Define the function for bulk analysis
+ def bulk_analysis(file):
+     # Read the uploaded CSV file
+     df = pd.read_csv(file.name)
+
+     # Assume the last column is the class label and the rest are features
+     X_test = df.iloc[:, :-1]
+     y_test = df.iloc[:, -1]
+
+     # Make predictions using the trained pipeline
+     y_test_pred = model_pipeline.predict(X_test)
+
+     # Debugging: print counts of anomalies in the actual and predicted labels
+     actual_anomalies = sum(y_test == 1)
+     predicted_anomalies = sum(y_test_pred == 1)
+     print(f"Actual anomalies: {actual_anomalies}, Predicted anomalies: {predicted_anomalies}")
+
+     # Find rows where actual and predicted are both 1
+     correctly_predicted_anomalies = X_test[(y_test == 1) & (y_test_pred == 1)]
+     print(f"Correctly predicted anomalies: {len(correctly_predicted_anomalies)}")
+
+     # Save the results to a CSV file
+     result_filename = "correct_anomalies.csv"
+     correctly_predicted_anomalies.to_csv(result_filename, index=False)
+
+     return result_filename  # Return the path to the saved file
+
+ # Create the Gradio interface
+ iface = gr.Interface(
+     fn=gradio_predict,
+     inputs=gr.Dropdown(choices=['Ted', 'Bill', 'Jill', 'Juan'], label="Select a name"),
+     outputs="text"
+ )
+
+ # Add the bulk analysis upload interface
+ bulk_iface = gr.Interface(
+     fn=bulk_analysis,
+     inputs=gr.File(label="Bulk Analysis"),
+     outputs="text"
+ )
+
+ # Combine the interfaces
+ combined_iface = gr.TabbedInterface(
+     [iface, bulk_iface],
+     tab_names=["Single Prediction", "Bulk Analysis"]
+ )
+
+ # Launch the interface
+ combined_iface.launch(share=True)

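For reference, the single-prediction path expects each name-keyed file (Ted.csv, Bill.csv, Jill.csv, Juan.csv) to hold one header-less row of 29 floats (V1–V28 plus Amount). Below is a minimal sketch of how such a file could be produced and smoke-tested; it assumes it is appended to app.py after the code above (so X_test, predict, and gradio_predict are in scope), and the choice of row 0 and the file name Ted.csv are illustrative assumptions, not part of this commit.

```python
# Hypothetical smoke test, appended to app.py after the code added in this commit.
sample_row = X_test.iloc[[0]]                       # one transaction: V1..V28 + Amount (29 values)
sample_row.to_csv("Ted.csv", header=False, index=False)

label, amount = predict("Ted.csv")                  # predict() reads a header-less row of 29 floats
print(f"prediction={label}, amount={amount}")       # MCD returns 0 (normal) or 1 (anomaly)

print(gradio_predict("Ted"))                        # same path the dropdown in the Gradio UI exercises
```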