Sambit20030731 commited on
Commit
6747401
1 Parent(s): 8112a45

Upload 8 files

Browse files
Dockerfile ADDED
@@ -0,0 +1,11 @@
 
 
 
 
 
 
 
 
 
 
 
 
1
FROM python:3.9

WORKDIR /code

# The dependency list shipped with the app is named requirement.txt (singular).
COPY ./requirement.txt /code/requirement.txt

RUN pip install --no-cache-dir --upgrade -r /code/requirement.txt

COPY . .

# app.py exposes a Flask (WSGI) object called `app`; there is no app/main.py
# ASGI module and uvicorn is not among the installed dependencies.
CMD ["flask", "--app", "app", "run", "--host", "0.0.0.0", "--port", "7860"]
app.py ADDED
@@ -0,0 +1,451 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #install dependencies
2
+ from flask import Flask, render_template, request, redirect, url_for
3
+ import os
4
+ import shutil
5
+ import webview
6
+ import tkinter as tk
7
+ from tkinter import filedialog
8
+ import openpyxl
9
+ import pandas as pd
10
+ import requests
11
+ from fuzzywuzzy import fuzz
12
+ from openpyxl.styles import PatternFill
13
+ from openpyxl.styles.alignment import Alignment
14
+ import google.generativeai as genai
15
+
16
+
17
app = Flask(__name__, static_folder='./static', template_folder='./templates')
app.config['UPLOAD_FOLDER'] = 'uploads'
app.config['OUTPUT_FOLDER'] = 'output'
# Path of the most recently generated workbook; set by upload_file().
output_file = None
window = webview.create_window('DeDuplicae-Vendor', app)


# Connect to the Google Gemini API.
# The key is read from the GOOGLE_API_KEY environment variable so that the
# secret never lives in source control. When it is unset, api_key is None.
GOOGLE_API_KEY = os.environ.get('GOOGLE_API_KEY')
genai.configure(api_key=GOOGLE_API_KEY)


# Load the gemini model
model = genai.GenerativeModel('gemini-pro')
31
+
32
+
33
# Split the per-row fuzzy-ratio scores away from the row data
def process_fuzzy_ratios(rows_dict):
    """Strip the fuzzy-ratio entries from every row dict, in place.

    The ratios of the first row are discarded. The ratios of every later row
    are collected under the key ``"row_<n>"``, where ``n`` is the row's
    1-based position in ``rows_dict``.

    Returns a ``(fuzz_data, rows_dict)`` tuple; ``rows_dict`` is the same
    list object that was passed in, with the ratio keys removed.
    Raises ``KeyError`` when a row lacks one of the ratio keys.
    """
    ratio_keys = (
        "address_fuzzy_ratio",
        "bank_fuzzy_ratio",
        "name_fuzzy_ratio",
        "accgrp_fuzzy_ratio",
        "tax_fuzzy_ratio",
        "postal_fuzzy_ratio",
    )
    fuzz_data = {}
    for position, record in enumerate(rows_dict, start=1):
        if position == 1:
            # First row: the scores are simply thrown away.
            for ratio_key in ratio_keys:
                del record[ratio_key]
            continue
        fuzz_data["row_" + str(position)] = {
            ratio_key: record.pop(ratio_key) for ratio_key in ratio_keys
        }
    return fuzz_data, rows_dict
56
+
57
+
58
# Code to perform gemini analysis
def gemini_analysis(dataframe):
    """Ask Gemini to explain each adjacent pair of duplicate rows.

    Rows are visited in dataframe order. When a row whose ``Remarks`` value is
    ``'Duplicate'`` directly follows another ``'Duplicate'`` row of the same
    duplicate group, both rows are sent to the Gemini model and the reply is
    written to the ``Explanation`` column of the earlier row. If the request
    fails for any reason the text ``'An error occured'`` is stored instead.

    The dataframe is modified in place and nothing is returned. The previous
    row is addressed as ``index - 1``, so the index labels must be consecutive
    integers.
    """
    group_column = 'Group_tax_bank_add_name_post_accgrp'
    # Without the group key the rows cannot be told apart by group, so every
    # adjacent pair of duplicate rows is analysed.
    has_group_column = group_column in dataframe.columns
    prev_row_duplicate = False
    for index, row in dataframe.iterrows():

        if row['Remarks'] != 'Duplicate':
            prev_row_duplicate = False
            continue

        if prev_row_duplicate:
            previous = dataframe.loc[index - 1]
            # Two neighbouring rows can both be duplicates yet belong to
            # different duplicate groups; such rows are not duplicates of
            # each other and must not be explained as a pair.
            same_group = (not has_group_column
                          or previous[group_column] == row[group_column])
            if same_group:
                duplicate_pairs = [previous.to_dict(), row.to_dict()]
                fuzzy_ratios, duplicate_pairs = process_fuzzy_ratios(duplicate_pairs)
                # Drop the last 12 entries of each row dict by position.
                # NOTE(review): count-based; presumably meant to strip the
                # helper columns appended by process_csv - verify the count.
                for dictionary in duplicate_pairs:
                    for _ in range(12):
                        if dictionary:
                            dictionary.popitem()
                main_data_str = "[{}]".format(', '.join([str(d) for d in duplicate_pairs]))
                fuzzy_data_str = "{}".format(fuzzy_ratios)
                qs = (
                    "I have the data",
                    main_data_str,
                    "The corresponding fuzzy ratios are here: ",
                    fuzzy_data_str,
                    "Give a concise explanation why these two rows are duplicate based on analyzing the main data and explaining which column values are same and which column values are different?",
                )

                # Ask gemini to analyse the data. This is best effort: one
                # failed request must not abort the whole run.
                try:
                    response = model.generate_content(qs)
                    dataframe.at[index - 1, 'Explanation'] = response.text
                except Exception:
                    dataframe.at[index - 1, 'Explanation'] = 'An error occured'

        prev_row_duplicate = True
94
+
95
+
96
+
97
# The logic to find duplicacy

# Worksheet holding the vendor data (the trailing space is part of the name).
_VENDOR_SHEET_NAME = 'General Data '

# A row containing any of these values is treated as the header row.
_VENDOR_HEADERS = ["LIFNR", "KTOKK", "NAMEFIRST", "NAMELAST", "NAME3", "NAME4", "STREET", "POSTCODE1", "CITY1", "COUNTRY", "REGION", "SMTPADDR", "BANKL", "BANKN", "TAXTYPE", "TAXNUM", "Unnamed: 16", "Unnamed: 17", "Unnamed: 18"]

# Helper columns, listed in the order in which they are added to the frame.
_RATIO_COLUMNS = ['name_fuzzy_ratio', 'accgrp_fuzzy_ratio', 'address_fuzzy_ratio',
                  'bank_fuzzy_ratio', 'tax_fuzzy_ratio', 'postal_fuzzy_ratio']
_GROUP_COLUMNS = ['name_based_group', 'accgrp_based_group', 'address_based_group',
                  'bank_based_group', 'tax_based_group', 'postal_based_group']

# One entry per comparison level, applied in this order:
# (check name, compared column, ratio column, group column,
#  parent key column, combined key column, minimum ratio to count as "same").
# Ratios are whole numbers, so 91 means "> 90", 71 "> 70" and 81 "> 80".
_DEDUP_LEVELS = (
    ('Tax', 'Tax', 'tax_fuzzy_ratio', 'tax_based_group',
     None, 'Group_tax', 91),
    ('Bank', 'Bank', 'bank_fuzzy_ratio', 'bank_based_group',
     'Group_tax', 'Group_tax_bank', 100),
    ('Address', 'Address', 'address_fuzzy_ratio', 'address_based_group',
     'Group_tax_bank', 'Group_tax_bank_add', 71),
    ('Name', 'Name', 'name_fuzzy_ratio', 'name_based_group',
     'Group_tax_bank_add', 'Group_tax_bank_add_name', 81),
    ('PostCode', 'POSTCODE1', 'postal_fuzzy_ratio', 'postal_based_group',
     'Group_tax_bank_add_name', 'Group_tax_bank_add_name_post', 91),
    ('AccGrp', 'KTOKK', 'accgrp_fuzzy_ratio', 'accgrp_based_group',
     'Group_tax_bank_add_name_post', 'Group_tax_bank_add_name_post_accgrp', 100),
)


def _read_rows_below_header(file_path, specified_headers, sheet_name):
    """Return the worksheet rows below the header row as a DataFrame.

    The header row is the first row with a cell whose value appears in
    ``specified_headers``; its cell values become the column names.
    Raises ``ValueError`` when no such row exists.
    """
    workbook = openpyxl.load_workbook(file_path)
    sheet = workbook[sheet_name]
    header_row = next(
        (cell.row for row in sheet.iter_rows() for cell in row
         if cell.value in specified_headers),
        None,
    )
    if header_row is None:
        raise ValueError(
            "No header row found in sheet {!r}".format(sheet_name))
    columns = [cell.value for cell in next(sheet.iter_rows(min_row=header_row))]
    return pd.DataFrame(sheet.iter_rows(min_row=header_row + 1, values_only=True),
                        columns=columns)


def _joined_lower(df, columns):
    """Join the given columns with '-' per row and lower-case the result."""
    combined = df[columns[0]].astype(str)
    for column in columns[1:]:
        combined = combined + '-' + df[column].astype(str)
    return combined.str.lower()


def _assign_duplicate_groups(df, source_col, ratio_col, group_col, parent_col, min_ratio):
    """Sort ``df`` and number runs of similar neighbouring values.

    Rows are sorted by ``parent_col`` (when given) and ``source_col``. Each
    row's ``ratio_col`` receives the fuzzy ratio between its value and the
    previous row's value (100 for the first row). ``group_col`` receives a
    counter, as a string, that restarts at 1 whenever the parent key changes
    and increases whenever the ratio is below ``min_ratio``.

    Returns the sorted frame with a fresh 0..n-1 index.
    """
    if df.empty:
        return df
    sort_keys = [source_col] if parent_col is None else [parent_col, source_col]
    df = df.sort_values(sort_keys).reset_index(drop=True)

    # Compare as text so numeric cells (e.g. post codes) do not break fuzz.ratio.
    values = df[source_col].astype(str).tolist()
    ratios = [100]
    ratios.extend(fuzz.ratio(prev, cur) for prev, cur in zip(values, values[1:]))
    df[ratio_col] = ratios

    parents = None if parent_col is None else df[parent_col].tolist()
    counter = 1
    groups = ['1']
    for i in range(1, len(ratios)):
        if parents is not None and parents[i] != parents[i - 1]:
            # A new parent group always starts its own numbering, even when
            # the value happens to match the last row of the previous group.
            counter = 1
        elif ratios[i] < min_ratio:
            counter += 1
        groups.append(str(counter))
    df[group_col] = groups
    return df


def process_csv(file, check=('Tax', 'Bank', 'Address', 'Name', 'PostCode', 'AccGrp')):
    """Flag duplicate vendor rows of an Excel workbook and write a report.

    ``file`` is the path of an .xlsx workbook containing a 'General Data '
    sheet. ``check`` lists the criteria to compare ('Tax', 'Bank', 'Address',
    'Name', 'PostCode', 'AccGrp'). Rows that fall into the same group for
    every selected criterion are marked 'Duplicate' in a new 'Remarks'
    column, all others 'Unique'. The result is written to ``output.xlsx`` in
    the configured output folder with duplicate rows highlighted.

    Returns the path of the written workbook.
    Raises ``ValueError`` when the sheet has no recognisable header row.
    """
    df = _read_rows_below_header(file, _VENDOR_HEADERS, _VENDOR_SHEET_NAME)
    # Replace null values with a blank space
    df = df.fillna(" ")

    # Lower-cased comparison columns built from the original columns
    df['Address'] = _joined_lower(df, ['STREET', 'CITY1', 'COUNTRY', 'REGION'])
    df['Name'] = _joined_lower(df, ['NAMEFIRST', 'NAMELAST', 'NAME3', 'NAME4'])
    df['Bank'] = _joined_lower(df, ['BANKL', 'BANKN'])
    df['Tax'] = _joined_lower(df, ['TAXTYPE', 'TAXNUM'])

    # Helper columns stay blank for criteria that are not selected
    for column in _RATIO_COLUMNS + _GROUP_COLUMNS:
        df[column] = ''

    # Each level refines the groups of the previous one (AND condition)
    for name, source_col, ratio_col, group_col, parent_col, combined_col, min_ratio in _DEDUP_LEVELS:
        if name in check:
            df = _assign_duplicate_groups(df, source_col, ratio_col, group_col,
                                          parent_col, min_ratio)
        own_group = df[group_col].astype(str)
        if parent_col is None:
            df[combined_col] = own_group
        else:
            df[combined_col] = df[parent_col].astype(str) + '_' + own_group

    # Rows sharing the final combined key are duplicates of each other
    final_key = _DEDUP_LEVELS[-1][5]
    duplicate_groups = df[final_key].duplicated(keep=False)
    df['Remarks'] = ['Duplicate' if is_duplicate else 'Unique' for is_duplicate in duplicate_groups]

    # Ask gemini to analyse the duplicate rows
    gemini_analysis(df)

    # Drop the columns related to fuzzy ratios and groups
    columns_to_drop = _RATIO_COLUMNS + _GROUP_COLUMNS + [level[5] for level in _DEDUP_LEVELS]
    df = df.drop(columns=columns_to_drop)

    output_dir = app.config['OUTPUT_FOLDER']
    os.makedirs(output_dir, exist_ok=True)
    output_path = os.path.join(output_dir, 'output.xlsx')

    with pd.ExcelWriter(output_path, engine='openpyxl') as excel_writer:
        df.to_excel(excel_writer, index=False, sheet_name='Sheet1')
        worksheet = excel_writer.book['Sheet1']

        # Highlight duplicate rows and wrap their text.
        # Sheet row 1 is the header, so data row 0 lives in sheet row 2.
        duplicate_fill = PatternFill(start_color="FFFF00", end_color="FFFF00", fill_type="solid")
        wrapped = Alignment(wrap_text=True)
        for sheet_row, remark in enumerate(df['Remarks'], start=2):
            if remark == 'Duplicate':
                for cell in worksheet[sheet_row]:
                    cell.alignment = wrapped
                    cell.fill = duplicate_fill

        # Uniform column width
        for col in worksheet.columns:
            worksheet.column_dimensions[col[0].column_letter].width = 28

        # Uniform row height
        for row in worksheet.iter_rows():
            worksheet.row_dimensions[row[0].row].height = 20

    return output_path
411
+
412
def save_error_message(error_message):
    """Overwrite ``static/error.txt`` with ``error_message``.

    The file is written as UTF-8 so that messages containing non-ASCII
    characters can be stored regardless of the platform's default encoding.
    """
    with open('static/error.txt', 'w', encoding='utf-8') as f:
        f.write(error_message)
415
+
416
@app.route('/', methods=['GET', 'POST'])
def upload_file():
    """Render the upload page and process an uploaded workbook.

    GET renders ``index.html``. POST saves the uploaded file into the upload
    folder, runs the de-duplication with the criteria ticked in the form and
    redirects back to this page. If processing fails, the error text is
    saved with ``save_error_message`` and shown on the rendered page.
    """
    global output_file
    error_message = None
    if request.method == 'POST':
        file = request.files.get('file')
        selected_options = request.form.getlist('option')
        if file:
            try:
                # Keep only the final path component of the client-supplied
                # name so the upload cannot be written outside the folder.
                filename = os.path.basename(file.filename.replace('\\', '/'))
                os.makedirs(app.config['UPLOAD_FOLDER'], exist_ok=True)
                file_path = os.path.join(app.config['UPLOAD_FOLDER'], filename)
                file.save(file_path)
                if selected_options:
                    output_file = process_csv(file_path, selected_options)
                else:
                    # Nothing ticked: fall back to checking every criterion
                    # rather than comparing on no criterion at all.
                    output_file = process_csv(file_path)
                return redirect(url_for('upload_file'))
            except Exception as e:
                error_message = str(e)
                save_error_message(error_message)
    return render_template('index.html', output_file=output_file, error_message=error_message)
433
+
434
+
435
def save_file_dialog(default_filename="output.xlsx", filetypes=(("XLSX files", ".xlsx"), ("All files", ".*"))):
    """Show a native "save as" dialog and return the chosen path.

    ``default_filename`` pre-fills the file name and ``filetypes`` lists the
    selectable file types. Returns whatever ``asksaveasfilename`` returns.
    """
    # A Tk root is required for the dialog; hide it so only the dialog shows.
    root = tk.Tk()
    root.withdraw()
    try:
        return filedialog.asksaveasfilename(initialfile=default_filename, filetypes=filetypes, defaultextension=".xlsx")
    finally:
        # Destroy the hidden root so each call does not leave a Tk instance behind.
        root.destroy()
440
+
441
+
442
@app.route('/downloads/output.xlsx')
def download_file():
    """Copy the generated workbook to a location picked in a save dialog.

    Opens the save dialog, copies ``output.xlsx`` from the output folder to
    the selected path (nothing is copied when no path is selected) and
    redirects back to the upload page.
    """
    destination = save_file_dialog()
    if not destination:
        return redirect(url_for('upload_file'))
    source = os.path.join(app.config['OUTPUT_FOLDER'], 'output.xlsx')
    shutil.copyfile(source, destination)
    return redirect(url_for('upload_file'))
449
+
450
if __name__ == '__main__':
    # The interactive debugger lets anyone who can reach the server run
    # arbitrary code, so it is enabled only on request via FLASK_DEBUG=1.
    app.run(debug=os.environ.get('FLASK_DEBUG') == '1')
output/readme.txt.txt ADDED
@@ -0,0 +1 @@
 
 
1
+ Deduplication
requirement.txt ADDED
@@ -0,0 +1,9 @@
 
 
 
 
 
 
 
 
 
 
1
flask
pywebview
openpyxl
pandas
requests
fuzzywuzzy
google-generativeai
static/script.js ADDED
@@ -0,0 +1,64 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
/**
 * Placeholder submit handler: checks that a file was chosen, swaps the form
 * for the processing message and, after a two second timer, replaces that
 * message with a "processed" notice. No request is sent from here.
 */
function submitForm() {
    var input = document.getElementById('csvFile');
    var statusBox = document.getElementById('processingMsg');

    if (!input.files.length) {
        alert('Please select a CSV file.');
        return;
    }

    // Built but not sent anywhere by this function.
    var payload = new FormData();
    payload.append('csvFile', input.files[0]);

    // Swap the form for the processing message
    document.getElementById('uploadForm').classList.add('hidden');
    statusBox.classList.remove('hidden');

    // Stand-in for the backend round trip
    var showSuccess = function() {
        statusBox.innerHTML = '<p>File processed successfully. <a href="#" onclick="downloadProcessedFile()">Download processed file</a></p>';
    };
    setTimeout(showSuccess, 2000);
}
23
+
24
/**
 * Placeholder download handler: only shows an alert; no file is fetched.
 */
function downloadProcessedFile() {
    var notice = 'Downloading processed file...';
    alert(notice);
}
29
+
30
// Upload the chosen workbook together with the ticked criteria, show the
// spinner while the request runs and reveal the download button afterwards.
document.getElementById('submitBtn').addEventListener('click', function() {
    var fileInput = document.getElementById('csvFile');
    var file = fileInput.files[0];
    if (!file) {
        return;
    }

    var formData = new FormData();
    formData.append('file', file);

    // Capture checkbox values
    var checkboxes = document.querySelectorAll('input[name="option"]:checked');
    checkboxes.forEach(function(checkbox) {
        formData.append('option', checkbox.value);
    });

    var processingMsg = document.getElementById('processingMsg');
    var downloadBtn = document.getElementById('downloadBtn');

    var xhr = new XMLHttpRequest();
    xhr.open('POST', '/');
    xhr.upload.onprogress = function(event) {
        if (event.lengthComputable) {
            var percentComplete = (event.loaded / event.total) * 100;
            document.getElementById('progressBar').style.width = percentComplete + '%';
        }
    };
    xhr.onloadstart = function() {
        processingMsg.classList.remove('hidden');
    };
    xhr.onloadend = function() {
        processingMsg.classList.add('hidden');
        // status is 0 on network failure or abort and >= 400 on a server
        // error; in both cases there is nothing to download.
        if (xhr.status < 200 || xhr.status >= 400) {
            return;
        }
        // The response body is an HTML page, not JSON, so it is not parsed.
        downloadBtn.classList.remove('hidden');
        // Assigning onclick replaces the previous handler, so repeated
        // uploads do not stack up duplicate listeners.
        downloadBtn.onclick = function() {
            window.location.href = '/downloads/output.xlsx';
        };
    };
    xhr.send(formData);
});
static/styles.css ADDED
@@ -0,0 +1,119 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ body {
2
+ font-family: Arial, sans-serif;
3
+ background-color: #f0f0f0;
4
+ margin: 0;
5
+ padding: 100px 20px;
6
+ }
7
+
8
+ .container {
9
+ max-width: 600px;
10
+ margin: 0 auto;
11
+ background-color: #fff;
12
+ padding: 20px;
13
+ border-radius: 5px;
14
+ box-shadow: 0 2px 5px rgba(0, 0, 0, 0.1);
15
+ display: flex;
16
+ flex-direction: column;
17
+ align-items: center;
18
+ justify-content: center;
19
+ }
20
+
21
+ h1 {
22
+ text-align: center;
23
+ color: #333;
24
+ }
25
+
26
+ form {
27
+ display: flex;
28
+ flex-direction: column;
29
+ }
30
+
31
+ input[type="file"] {
32
+ margin-bottom: 10px;
33
+ }
34
+
35
+ button {
36
+ padding: 10px 20px;
37
+ background-color: #007bff;
38
+ color: #fff;
39
+ border: none;
40
+ cursor: pointer;
41
+ }
42
+
43
+ button:hover {
44
+ background-color: #0056b3;
45
+ }
46
+
47
+ #processingMsg {
48
+ text-align: center;
49
+ }
50
+
51
+
52
+ .hidden {
53
+ display: none;
54
+ }
55
+
56
/* border-box is not a CSS property; border-radius matches #submitBtn. */
#downloadBtn {
    border-radius: 5px;
    margin-top: 20px;
}

#downloadBtn button {
    border-radius: 5px;
    padding: 10px 20px;
}
65
+
66
+ .options-container {
67
+ margin-top: 20px;
68
+ display: flex;
69
+ flex-wrap: wrap;
70
+ justify-content: center;
71
+ }
72
+
73
+ .option {
74
+ margin-right: 20px;
75
+ margin-bottom: 10px;
76
+ }
77
+
78
+ .option label {
79
+ margin-left: 5px;
80
+ }
81
+
82
+ .options-wrapper {
83
+ background-color: #f2f2f2;
84
+ border-radius: 8px;
85
+ padding: 20px;
86
+ margin-top: 20px;
87
+ }
88
+
89
+ #checkbox-heading {
90
+ text-align: center;
91
+ font-size: 16px;
92
+ margin-bottom: 10px;
93
+ }
94
+
95
+ #explanation-note {
96
+ text-align: center;
97
+ margin-top: 20px;
98
+ font-style: italic;
99
+ }
100
+
101
+ #submitBtn {
102
+ margin-top: 20px;
103
+ border-radius: 5px;
104
+ }
105
+
106
+ .spinner {
107
+ border: 4px solid rgba(0, 0, 0, 0.1);
108
+ border-left-color: #333;
109
+ border-radius: 50%;
110
+ width: 50px;
111
+ height: 50px;
112
+ animation: spin 1s linear infinite;
113
+ margin: 20px auto;
114
+ }
115
+
116
+ @keyframes spin {
117
+ 0% { transform: rotate(0deg); }
118
+ 100% { transform: rotate(360deg); }
119
+ }
templates/index.html ADDED
@@ -0,0 +1,60 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ <!DOCTYPE html>
2
+ <html lang="en">
3
+ <head>
4
+ <meta charset="UTF-8">
5
+ <meta name="viewport" content="width=device-width, initial-scale=1.0">
6
+ <title>CSV File Upload</title>
7
+ <link rel="stylesheet" href="{{ url_for('static', filename='styles.css') }}">
8
+ </head>
9
+ <body>
10
+ <div class="container">
11
+ <h1>Vendor Master De-Duplication Tool</h1>
12
+ <form id="uploadForm" enctype="multipart/form-data">
13
+ <input type="file" name="file" id="csvFile" accept=".xlsx">
14
+ </form>
15
+ <div class="options-wrapper">
16
+ <div id="checkbox-heading">Select the options based on which duplication check will be performed and submit</div>
17
+ <div class="options-container">
18
+ <div class="option">
19
+ <input type="checkbox" name="option" value="Tax" id="option1" checked>
20
+ <label for="option1">Tax</label>
21
+ </div>
22
+ <div class="option">
23
+ <input type="checkbox" name="option" value="Bank" id="option2" checked>
24
+ <label for="option2">Bank</label>
25
+ </div>
26
+ <div class="option">
27
+ <input type="checkbox" name="option" value="Address" id="option3" checked>
28
+ <label for="option3">Address</label>
29
+ </div>
30
+ <div class="option">
31
+ <input type="checkbox" name="option" value="Name" id="option4" checked>
32
+ <label for="option4">Name</label>
33
+ </div>
34
+ <div class="option">
35
+ <input type="checkbox" name="option" value="PostCode" id="option5" checked>
36
+ <label for="option5">PostCode</label>
37
+ </div>
38
+ <div class="option">
39
+ <input type="checkbox" name="option" value="AccGrp" id="option6" checked>
40
+ <label for="option6">AccGrp</label>
41
+ </div>
42
+ </div>
43
+ </div>
44
+ <button type="button" id="submitBtn">Submit</button>
45
+ <div id="processingMsg" class="hidden">
46
+ <div class="spinner"></div>
47
+ </div>
48
+ <div id="progressBar"></div>
49
+ <div id="downloadBtn" class="hidden">
50
+ <a id="downloadLink" href="{{ url_for('download_file', filename='output.xlsx') }}">
51
+ <button>Download Processed XLSX</button>
52
+ </a>
53
+ </div>
54
+ <div id="explanation-note">
55
+ Note: The last column titled 'explanation' in output file contains the analysis for potential duplicates with the following row.
56
+ </div>
57
+ </div>
58
+ <script src="{{ url_for('static', filename='script.js') }}"></script>
59
+ </body>
60
+ </html>
uploads/readme.txt.txt ADDED
@@ -0,0 +1 @@
 
 
1
+ Deduplication