Mitanshu Sukhwani committed on
Commit
c94dde8
•
1 Parent(s): bcdaf27

add app and requirements

Browse files
Files changed (3) hide show
  1. .gitignore +1 -0
  2. app.py +252 -0
  3. requirements.txt +6 -0
.gitignore ADDED
@@ -0,0 +1 @@
 
 
1
+ *.pdf
app.py ADDED
@@ -0,0 +1,252 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Import the required libraries
2
+ import gradio as gr
3
+ import cv2 # OpenCV, to read and manipulate images
4
+ import easyocr # EasyOCR, for OCR
5
+ import torch # PyTorch, for deep learning
6
+ import pymupdf # PDF manipulation
7
+ from transformers import pipeline # Hugging Face Transformers, for NER
8
+ import os # OS, for file operations
9
+ from glob import glob # Glob, to get file paths
10
+
11
+ ##########################################################################################################
12
# Initiate the models

# CUDA availability is queried once and reused for both models below.
_cuda_available = torch.cuda.is_available()

# Easyocr model (English only); model files are stored in the working directory
print("Initiating easyocr")
reader = easyocr.Reader(
    ['en'],
    gpu=_cuda_available,
    model_storage_directory='.',
)

# Use gpu if available
print("Using gpu if available")
device = torch.device("cuda" if _cuda_available else "cpu")
print(f"Using device: {device}")

# Ner model: token-classification pipeline placed on the device chosen above
print("Initiating nlp pipeline")
nlp = pipeline(
    "token-classification",
    model="dslim/distilbert-NER",
    device=device,
)

##########################################################################################################
## Functions

# Image format (file extension) used for every intermediate page image
img_format = "png"
32
+
33
+ # Convert pdf to set of images
34
def convert_to_images(pdf_file_path):
    """Render every page of a PDF to an image file.

    The images are written to a directory named ``<pdf_file_path>_images``
    (created if missing) as ``page-<number>.<img_format>``.

    Args:
        pdf_file_path: Path of the PDF to rasterise.

    Returns:
        The path of the directory that now contains the page images.
    """
    # Create a directory to store pdf images
    pdf_images_dir = f'{pdf_file_path}_images'
    os.makedirs(pdf_images_dir, exist_ok=True)

    # Rendering resolution
    dpi = 150

    # Convert the PDF to images
    print("Converting PDF to images...")
    # The context manager closes the document even if rendering fails.
    with pymupdf.open(pdf_file_path) as doc:
        for page in doc:
            pix = page.get_pixmap(dpi=dpi)  # render page to an image
            # Zero-pad the page number so that a plain lexicographic sort of
            # the file names (as done by the callers) equals page order;
            # otherwise "page-10" would sort before "page-2".
            image_name = f"page-{page.number:05d}.{img_format}"
            pix.save(os.path.join(pdf_images_dir, image_name))

    # Return the directory with the images
    return pdf_images_dir
52
+
53
+ # Do the redaction
54
def _fragments_to_redact(fragments, ner_results, score_threshold):
    """Return indices of OCR fragments overlapped by a confident entity.

    ``fragments`` are the OCR strings in the order they were joined with a
    single space. Each NER result is expected to carry ``score`` plus the
    ``start``/``end`` character offsets into that joined string.
    """
    # Character span of every fragment inside ' '.join(fragments)
    spans = []
    cursor = 0
    for fragment in fragments:
        spans.append((cursor, cursor + len(fragment)))
        cursor += len(fragment) + 1  # +1 for the joining space

    flagged = set()
    for entity in ner_results:
        if entity['score'] <= score_threshold:
            continue
        start, end = entity.get('start'), entity.get('end')
        if start is None or end is None:
            # Without offsets an entity cannot be located on the page;
            # failing loudly is safer than silently leaving it unredacted.
            raise ValueError(
                "NER result has no character offsets; cannot map it to an OCR box"
            )
        flagged.update(
            index
            for index, (frag_start, frag_end) in enumerate(spans)
            if frag_start < end and start < frag_end
        )
    return flagged


def redact_image(pdf_image_path, redaction_score_threshold):
    """Black out text recognised as a named entity in one page image.

    The image is OCR'd, the recognised fragments are joined into one string
    and passed to the NER pipeline, and every OCR box that overlaps an entity
    whose score exceeds the threshold is covered with a filled black
    rectangle. The result is written next to the input with a ``_redacted``
    suffix before the extension.

    Args:
        pdf_image_path: Path of the page image to process.
        redaction_score_threshold: Entities are redacted only when their
            score is strictly greater than this value.

    Returns:
        The path of the redacted image file.

    Raises:
        FileNotFoundError: If the image cannot be read.
        ValueError: If the NER pipeline returns entities without offsets.
        OSError: If the redacted image cannot be written.
    """
    print("Redacting sensitive information...")
    print(f"Processing {pdf_image_path}...")

    # Read the image; cv2.imread signals failure by returning None
    cv_image = cv2.imread(pdf_image_path)
    if cv_image is None:
        raise FileNotFoundError(f"Could not read image: {pdf_image_path}")

    # Read the text from the image
    result = reader.readtext(cv_image, height_ths=0, width_ths=0, x_ths=0, y_ths=0)

    # Join the OCR fragments into the text handed to the NER model
    fragments = [fragment for (_bbox, fragment, _prob) in result]
    text = ' '.join(fragments)

    # Perform NER on the text (nothing to classify on a page without text)
    ner_results = nlp(text) if text.strip() else []

    # The NER output has one item per detected entity token, not one per OCR
    # box, so the two lists cannot be paired positionally. Entities are
    # mapped back to their boxes through character offsets instead.
    flagged = _fragments_to_redact(fragments, ner_results, redaction_score_threshold)

    for index in sorted(flagged):
        bbox = result[index][0]
        # Cover the axis-aligned extent of all four corners so that slanted
        # text is hidden completely.
        xs = [int(point[0]) for point in bbox]
        ys = [int(point[1]) for point in bbox]
        # Thickness -1 fills the rectangle: an irreversible redaction
        cv2.rectangle(cv_image, (min(xs), min(ys)), (max(xs), max(ys)), (0, 0, 0), -1)

    # Save the redacted image next to the original, keeping its extension
    print(f"Saving redacted {pdf_image_path}...")
    root, extension = os.path.splitext(pdf_image_path)
    redacted_image_path = f'{root}_redacted{extension}'
    if not cv2.imwrite(redacted_image_path, cv_image):
        raise OSError(f"Could not write redacted image: {redacted_image_path}")

    return redacted_image_path
111
+
112
+ # Convert the set of redacted images to a pdf
113
def stich_images_to_pdf(redacted_image_files, input_pdf_name):
    """Combine redacted page images into a single PDF.

    Args:
        redacted_image_files: List of image paths, one per page. The list is
            sorted in place into page order.
        input_pdf_name: Name of the source PDF without extension; only its
            final path component is used for the output file name.

    Returns:
        The path of the written PDF,
        ``/tmp/gradio/redacted/<input_pdf_name>_redacted.pdf``.

    Raises:
        ValueError: If ``redacted_image_files`` is empty.
    """
    import re  # local import: only needed for the natural sort key

    def page_order(path):
        # Compare digit runs numerically so "page-10" follows "page-2".
        return [int(token) if token.isdigit() else token
                for token in re.split(r'(\d+)', path)]

    # Sort the redacted images into page order
    redacted_image_files.sort(key=page_order)

    if not redacted_image_files:
        raise ValueError("No redacted images to stitch into a PDF")

    # Convert the redacted images to a single PDF
    print("Converting redacted images to PDF...")
    redacted_pdf_folder = "/tmp/gradio/redacted"
    os.makedirs(redacted_pdf_folder, exist_ok=True)
    # basename() keeps the output inside redacted_pdf_folder even when the
    # caller passes a name that still contains directory components.
    output_name = os.path.basename(input_pdf_name)
    redacted_pdf_path = f'{redacted_pdf_folder}/{output_name}_redacted.pdf'

    with pymupdf.open() as doc:
        for redacted_image_file in redacted_image_files:
            with pymupdf.open(redacted_image_file) as img:  # open pic as document
                rect = img[0].rect                          # pic dimension
                pdfbytes = img.convert_to_pdf()             # make a PDF stream
            with pymupdf.open("pdf", pdfbytes) as img_pdf:  # open stream as PDF
                # New page with the picture's dimensions
                page = doc.new_page(width=rect.width, height=rect.height)
                page.show_pdf_page(rect, img_pdf, 0)        # image fills the page
        doc.save(redacted_pdf_path)

    return redacted_pdf_path
139
+
140
def cleanup(redacted_image_files, pdf_images, pdf_images_dir, original_pdf):
    """Delete all intermediate files and the original PDF.

    Cleanup is best-effort: a file that is already gone does not stop the
    remaining files from being removed, so no sensitive page image is left
    behind because of an earlier partial failure.

    Args:
        redacted_image_files: Paths of the redacted page images.
        pdf_images: Paths of the unredacted page images.
        pdf_images_dir: Directory that held the page images.
        original_pdf: Path of the uploaded PDF.

    Returns:
        None.
    """
    import shutil  # local import: only needed here

    print("Cleaning up...")

    # Remove the redacted images, the pdf images and the original pdf
    for file_path in (*redacted_image_files, *pdf_images, original_pdf):
        try:
            os.remove(file_path)
        except FileNotFoundError:
            # Already removed (or never written): nothing left to clean.
            pass

    # Remove the pdf images directory together with anything still inside it
    shutil.rmtree(pdf_images_dir, ignore_errors=True)

    return None
160
+
161
+ # Func to control ui
162
def predict(input_pdf_path, sensitivity):
    """Redact an uploaded PDF and return the path of the redacted copy.

    Pipeline: PDF pages -> images -> per-image redaction -> images stitched
    back into a PDF. Intermediate files and the uploaded PDF are removed
    afterwards, including when a step fails.

    Args:
        input_pdf_path: Path of the uploaded PDF, or None when nothing was
            uploaded.
        sensitivity: Value from 0 to 100; higher values lower the score
            threshold and therefore redact more.

    Returns:
        The path of the redacted PDF.

    Raises:
        gr.Error: If no file was uploaded.
    """
    if input_pdf_path is None:
        raise gr.Error("Please upload a PDF file before redacting.")

    print("Setting threshold")
    # Convert sensitivity to threshold
    redaction_score_threshold = (100 - sensitivity) / 100

    # Get the file name without directories or extension; splitext keeps
    # names that contain extra dots (e.g. "my.report.pdf") intact.
    print("Getting filename")
    input_pdf_name = os.path.splitext(os.path.basename(input_pdf_path))[0]

    # Convert the PDF to images
    print("Converting pdf to images")
    pdf_images_dir = convert_to_images(input_pdf_path)

    pdf_images = []
    redacted_image_files = []
    try:
        # Get the file paths of the images. os.listdir is used instead of a
        # glob pattern so that characters such as "[" in the uploaded file
        # name are not interpreted as wildcards.
        print("Gathering converted images")
        suffix = f'.{img_format}'
        pdf_images = sorted(
            os.path.join(pdf_images_dir, name)
            for name in os.listdir(pdf_images_dir)
            if name.endswith(suffix)
        )

        # Redact images
        print("Redacting images")
        for pdf_image in pdf_images:
            redacted_image_files.append(
                redact_image(pdf_image, redaction_score_threshold)
            )

        # Convert the redacted images to a single PDF
        print("Stitching images to pdf")
        redacted_pdf_path = stich_images_to_pdf(redacted_image_files, input_pdf_name)
    finally:
        # Always remove the page images and the upload so that sensitive
        # content is not left on disk after a failure.
        print("Cleaning up")
        cleanup(redacted_image_files, pdf_images, pdf_images_dir, input_pdf_path)

    return redacted_pdf_path
198
+
199
+ ##########################################################################################################
200
+
201
# Markdown shown in the attribution section at the bottom of the UI
contact_text = """
# Contact Information

👤 [Mitanshu Sukhwani](https://www.linkedin.com/in/mitanshusukhwani/)

✉️ [email protected]

🐙 [mitanshu7](https://github.com/mitanshu7)
"""
210
+
211
+ ##########################################################################################################
212
+ # Gradio interface
213
+
214
with gr.Blocks(theme=gr.themes.Soft()) as demo:

    # Title and description
    gr.Markdown("# RedactNLP: Redact your PDF!")
    gr.Markdown("## How redaction happens:")
    # The model named in step 3 must match the one loaded by the NER pipeline.
    gr.Markdown(
        "1. The PDF pages are converted to images.\n"
        "2. EasyOCR is run on the converted images to extract text.\n"
        "3. \"dslim/distilbert-NER\" model does the token classification.\n"
        "4. Non-recoverable mask is applied to identified elements.\n"
    )

    # Input section (file extensions need the leading dot)
    pdf_file_input = gr.File(
        file_count='single',
        file_types=['.pdf'],
        label='Upload PDF',
        show_label=True,
        interactive=True,
    )

    # Slider for redaction sensitivity
    slider_input = gr.Slider(
        minimum=0, maximum=100, value=80, step=1,
        label="Sensitivity to remove elements. Higher is more sensitive, hence will redact aggressively.",
    )

    # Submission button
    submit_btn = gr.Button("Redact")

    # Output section
    output = gr.File(
        file_count='single',
        file_types=['.pdf'],
        label='Download redacted PDF',
        show_label=True,
        interactive=False,
    )

    # Attribution
    gr.Markdown(contact_text)

    # Link button click to the prediction function
    submit_btn.click(predict, [pdf_file_input, slider_input], output)


################################################################################

if __name__ == "__main__":
    demo.launch()
252
+
requirements.txt ADDED
@@ -0,0 +1,6 @@
 
 
 
 
 
 
 
1
+ torch
2
+ transformers
3
+ opencv-python
4
+ easyocr
5
+ pymupdf
6
+ gradio