nam194 committed
Commit f91691d
Parent: 86f8cd3

Update app.py

Files changed (1): app.py (+101, -14)
app.py CHANGED
@@ -1,5 +1,6 @@
 import gradio as gr
 from imports import *
+from parse_info import *
 login(token="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")
 
 
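Note: `imports` and `parse_info` are separate modules in this repo and are not touched by the commit. The diff leans on names they star-export: `device`, `fitz`, `pdf2image`, `torch`, `transformers`, `unidecode`, `ast`, plus text helpers such as `decontracted` and `normalize_bbox` and the `parse_*` normalizers. A minimal sketch of what `imports` would have to provide for app.py to run (an assumption, not the actual file):

```python
# imports.py -- assumed contents, not part of this commit
import ast
import fitz                    # PyMuPDF: extracts text spans and their coordinates
import pdf2image               # renders PDF pages to PIL images
import numpy as np
import torch
import transformers
from unidecode import unidecode
from huggingface_hub import login

# device that app.py uses to place the model and input tensors
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
```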
@@ -54,16 +55,98 @@ def sentiment(sent: str):
     return "Sentiment: " + pred_sent + "\n" + "Topic in sentence: " + ". ".join([i.capitalize() for i in pred_topic])  # str({"sentiment": pred_sent, "topic": pred_topic})
 
 
-def pdf_to_imgs(pdf):
-    path_to_pdf = pdf.name
-
-    # convert PDF to PIL images (one image per page)
-    first_page = True  # we only want the first page as an image
-    if first_page: last_page = 1
-    else: last_page = None
-
-    imgs = pdf2image.convert_from_path(path_to_pdf, last_page=last_page)
-    return np.array(imgs[0])
+processor = transformers.AutoProcessor.from_pretrained("nam194/resume_parsing_layoutlmv3_large_custom_label", use_auth_token=True, apply_ocr=False)
+model = transformers.LayoutLMv3ForTokenClassification.from_pretrained("nam194/resume_parsing_layoutlmv3_large_custom_label").to(device)
+label_list = ['person_name', 'dob_key', 'dob_value', 'gender_key', 'gender_value', 'phonenumber_key', 'phonenumber_value', 'email_key', 'email_value',
+              'address_key', 'address_value', 'socical_address_value', 'education', 'education_name', 'education_time', 'experience', 'experience_name',
+              'experience_time', 'information', 'undefined', 'designation_key', 'designation_value', 'degree_key', 'degree_value', 'skill_key', 'skill_value']
+id2label = dict(enumerate(label_list))
+label2id = {v: k for k, v in id2label.items()}
+key_list = ["person_name", "dob_value", "gender_value", "phonenumber_value", "email_value", "address_value",
+            "socical_address_value", "education_name", "education_time", "experience_name", "experience_time",
+            "designation_value", "degree_value", "skill_value"]
+
+def pred_resume(pdf_path) -> str:
+    result = {key: [] for key in key_list}
+    DPI = 200/77  # scales PDF point coordinates to pixels of the rendered page images
+
+    # read the pdf and render each page to an image
+    doc = fitz.open(pdf_path.name)
+    images = pdf2image.convert_from_path(pdf_path.name)
+
+    # collect the text blocks of every page
+    block_dict = {}
+    page_num = 1
+    for page in doc:
+        file_dict = page.get_text('dict')
+        block_dict[page_num] = file_dict['blocks']
+        page_num += 1
+
+    # predict each page of the pdf
+    for page_num, blocks in block_dict.items():
+        bboxes, words = [], []  # bounding boxes and words of one page
+        image = images[page_num - 1]
+        for block in blocks:
+            if block['type'] == 0:  # text block
+                for line in block['lines']:
+                    for span in line['spans']:
+                        xmin, ymin, xmax, ymax = [int(i) * DPI for i in span['bbox']]
+                        text = unidecode(span['text']).strip()
+                        if text.replace(" ", "") != "":
+                            bboxes.append(normalize_bbox([xmin, ymin, xmax, ymax], image.size))
+                            words.append(decontracted(text))
+
+        # dummy word labels so the processor emits a -100 mask for sub-tokens
+        fake_label = ["O"] * len(words)
+        encoding = processor(image, words, boxes=bboxes, word_labels=fake_label, truncation=True, stride=256,
+                             padding="max_length", max_length=512, return_overflowing_tokens=True, return_offsets_mapping=True)
+        labels = encoding["labels"]
+        offset_mapping = encoding.pop('offset_mapping')
+        overflow_to_sample_mapping = encoding.pop('overflow_to_sample_mapping')
+        encoding = {k: torch.tensor(v) for k, v in encoding.items() if k != "labels"}
+
+        # forward to model
+        with torch.no_grad():
+            outputs = model(**{k: v.to(device) for k, v in encoding.items()})
+
+        # drop the 256 overlapping tokens of every extra window, then flatten
+        predictions = outputs["logits"].argmax(-1).tolist()
+        for i in range(1, len(predictions)):
+            labels[i] = labels[i][256:]
+            predictions[i] = predictions[i][256:]
+        predictions = [j for i in predictions for j in i]
+        labels = [j for i in labels for j in i]
+
+        # keep one prediction per word (label != -100) and collect the fields
+        true_predictions = [id2label[pred] for pred, label in zip(predictions, labels) if label != -100]
+        for i, pred in enumerate(true_predictions):
+            if pred in key_list:
+                result[pred].append(words[i])
+    return str(result)
+
+def norm(result: str) -> str:
+    result = ast.literal_eval(result)
+    result["person_name"] = " ".join([parse_string(i).capitalize() for i in " ".join(result["person_name"]).split()])
+    result["email_value"] = parse_email(result["email_value"])
+    result["phonenumber_value"] = "".join([i for i in "".join(result["phonenumber_value"]) if i.isdigit()])
+    result["address_value"] = parse_address(result["address_value"])
+    result["designation_value"] = parse_designation(result["designation_value"])
+    result["experience_time"] = parse_time(result["experience_time"])
+    result["gender_value"] = parse_gender(result["gender_value"])
+    result["skill_value"] = parse_skill(result["skill_value"])
+    result["education_name"] = parse_designation(result["education_name"])
+    result["experience_name"] = parse_designation(result["experience_name"])
+    return str(result)
 
 
 with gr.Blocks() as demo:
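Review note on the windowing in `pred_resume` above: with `max_length=512`, `stride=256` and `return_overflowing_tokens=True`, the processor splits a long page into 512-token windows, and each window after the first repeats the last 256 tokens of the previous one. That is why the loop discards the first 256 labels and predictions of every window after the first before flattening. A self-contained sketch of that stitching step (`stitch_windows` is a name introduced here for illustration, not part of the commit):

```python
def stitch_windows(windows, stride=256):
    """Concatenate per-window outputs, dropping the `stride` overlapping
    items that every window after the first repeats."""
    merged = list(windows[0])
    for window in windows[1:]:
        merged.extend(window[stride:])
    return merged

# toy example with stride=2: the second window repeats the items 3 and 4
assert stitch_windows([[1, 2, 3, 4], [3, 4, 5, 6]], stride=2) == [1, 2, 3, 4, 5, 6]
```

The later `label != -100` filter then keeps one prediction per word, since the processor assigns the word label only to each word's first sub-token; that is what lets `words[i]` line up with `true_predictions[i]`.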
@@ -73,15 +156,19 @@ with gr.Blocks() as demo:
         text_output = gr.Textbox(label="Result:")
         text_button = gr.Button("Predict")
     with gr.Tab("Extract information from resume"):
-        # with gr.Row():
-        file_input = gr.File(label="Upload pdf", file_types=[".pdf"])
-        image_output = gr.Image(type="numpy", label="Image of the first page")
-        image_button = gr.Button("Predict")
+        with gr.Row():
+            file_input = gr.File(label="Upload pdf", file_types=[".pdf"])
+            cv_output = gr.Textbox(label="Information fields")
+            resume_button = gr.Button("Extract")
+        with gr.Row():
+            normalize_output = gr.Textbox(label="Rule-based normalization:")
+            normalize_button = gr.Button("Normalize")
 
     # with gr.Accordion("Open for More!"):
     #     gr.Markdown("Look at me...")
 
     text_button.click(sentiment, inputs=text_input, outputs=text_output)
-    image_button.click(pdf_to_imgs, inputs=file_input, outputs=image_output)
+    resume_button.click(pred_resume, inputs=file_input, outputs=cv_output)
+    normalize_button.click(norm, inputs=cv_output, outputs=normalize_output)
 
 demo.launch()
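Review note on the coordinates: `page.get_text('dict')` reports span boxes in PDF points (1/72 inch), while `pdf2image` renders at its default 200 dpi, so the `DPI = 200/77` constant in `pred_resume` reads as a points-to-pixels scale (the exact ratio would be 200/72, so 77 may be deliberate calibration). The scaled boxes then go through `normalize_bbox`, which this commit does not define; the conventional LayoutLM-family helper, assumed here, maps pixel boxes onto the 0-1000 grid the model expects:

```python
def normalize_bbox(bbox, size):
    """Scale an (xmin, ymin, xmax, ymax) pixel box to the 0-1000 grid
    that LayoutLMv3 expects, relative to the page image size."""
    width, height = size
    return [
        int(1000 * bbox[0] / width),
        int(1000 * bbox[1] / height),
        int(1000 * bbox[2] / width),
        int(1000 * bbox[3] / height),
    ]
```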
 
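The two new handlers can be smoke-tested outside Gradio. `pred_resume` only touches the upload's `.name` attribute, so any object exposing a path will do; `sample_resume.pdf` below is a placeholder, and the calls assume the model and helper modules load successfully. A hypothetical check:

```python
from types import SimpleNamespace

upload = SimpleNamespace(name="sample_resume.pdf")  # stands in for a gr.File upload

raw = pred_resume(upload)   # stringified dict: {"person_name": [...], "email_value": [...], ...}
clean = norm(raw)           # rule-based post-processing of each extracted field
print(clean)
```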