File size: 9,646 Bytes
9e5b4bd
988450a
f91691d
4e55f8f
012faab
 
988450a
4e55f8f
 
 
 
988450a
 
4e55f8f
 
 
 
 
 
 
 
 
 
 
 
 
 
1984dbe
988450a
 
4e55f8f
 
 
988450a
246d50e
988450a
4e55f8f
 
 
 
 
246d50e
 
 
 
 
4e55f8f
 
 
 
 
 
 
988450a
fe3240a
988450a
9e5b4bd
f91691d
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
fe3240a
f91691d
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
9e5b4bd
 
 
4e55f8f
 
fe3240a
79a32e1
988450a
 
f91691d
 
 
 
 
 
 
9e5b4bd
988450a
 
9e5b4bd
988450a
f91691d
 
9e5b4bd
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
import os

import gradio as gr

from imports import *
from parse_info import *
# Authenticate against the Hugging Face Hub with a token taken from the
# environment (for example a Space secret named HF_TOKEN) rather than a
# secret committed to the repository.
# NOTE(review): the token that used to be hard-coded on this line has been
# published with the source and should be revoked.
_hf_token = os.environ.get("HF_TOKEN")
if _hf_token:
    login(token=_hf_token)


# Run every model on the GPU when CUDA is available, otherwise on the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
# Class index -> sentiment label; sentiment() indexes this with the argmax of
# the classifier probabilities.
dict_ = {
    0: "negative",
    1: "positive",
    2: "neutral"}
# Tokenizer and 3-label sequence classifier from the "nam194/sentiment"
# checkpoint; the model is moved to `device` once at import time.
tokenizer_sent = AutoTokenizer.from_pretrained("nam194/sentiment", use_fast=False)
model_sent = AutoModelForSequenceClassification.from_pretrained("nam194/sentiment", num_labels=3, use_auth_token=True).to(device)
def cvt2cls(data):
    """Collapse NER tag ids into the distinct topic class ids they refer to.

    Following the layout of ``ner_tags``, id 20 (the 'O' tag) is dropped and
    ids of 10 and above (the 'I-' tags) are shifted down by 10 so that they
    land on the id of the matching 'B-' tag.

    Args:
        data: Iterable of integer tag ids.

    Returns:
        Sorted list of the distinct class ids found in ``data``; empty when
        ``data`` is empty or contains only the 'O' tag.
    """
    outside_tag = 20
    class_offset = 10
    # Build a set *after* shifting so that a topic seen through both its
    # 'B-' and 'I-' tag is reported once instead of twice.
    classes = {
        tag - class_offset if tag >= class_offset else tag
        for tag in data
        if tag != outside_tag
    }
    return sorted(classes)
# Topic names; the position of a name in this tuple is its topic class id.
_topic_names = (
    'chỗ để xe',
    'con người',
    'công việc',
    'cơ sở vật chất',
    'dự án',
    'lương',
    'môi trường làm việc',
    'ot/thời gian',
    'văn phòng',
    'đãi ngộ',
)
# NER tag id -> tag name: ids 0-9 are the 'B-' tags, ids 10-19 the 'I-' tags
# of the same topics in the same order, and id 20 is the 'O' tag.
ner_tags = {
    **{idx: 'B-' + name for idx, name in enumerate(_topic_names)},
    **{idx + 10: 'I-' + name for idx, name in enumerate(_topic_names)},
    20: 'O',
}
# Topic class id -> topic name.
topic_tags = dict(enumerate(_topic_names))
# Configuration, tokenizer and PhoBertLstmCrf tagger from the "nam194/ner"
# checkpoint; num_labels=21 matches the 21 entries of ner_tags.
config = RobertaConfig.from_pretrained("nam194/ner", num_labels=21)
tokenizer_topic = AutoTokenizer.from_pretrained("nam194/ner", use_fast=False) 
model_topic = PhoBertLstmCrf.from_pretrained("nam194/ner", config=config, from_tf=False).to(device)
# Resize the model's token embeddings to the number of tokens known to the tokenizer.
model_topic.resize_token_embeddings(len(tokenizer_topic)) 


def sentiment(sent: str):
    """Predict the sentiment and the topics mentioned in a review text.

    Args:
        sent: Raw review text entered by the user.

    Returns:
        A two-line string: ``"Sentiment: <label>"`` followed by
        ``"Topic in sentence: <topics>"`` where the capitalised topic names
        are joined with ``". "``.
    """
    # --- Sentiment classification -------------------------------------
    # NOTE: the original comment on this call warns that the helper segments
    # the sentence and may raise ConnectionError (HTTPConnectionPool).
    normalized = normalize(text=sent)
    token_ids = torch.tensor([tokenizer_sent.encode(normalized)]).to(device)
    with torch.no_grad():
        probabilities = model_sent(token_ids).logits.softmax(dim=-1).tolist()[0]
        pred_sent = dict_[np.argmax(probabilities)]

    # --- Topic tagging --------------------------------------------------
    # NOTE: the original comment carries the same ConnectionError warning
    # for this helper.
    cleaned = replace_all(text=sent)
    # Split on '.', word-segment every piece and keep the tokens per piece.
    tokens_per_piece = [
        underthesea.word_tokenize(piece.strip(), format="text").split()
        for piece in cleaned.split(".")
    ]
    # One flat sample of [token, tag] pairs; 'O' is only a placeholder tag.
    sample = [[token, 'O'] for tokens in tokens_per_piece for token in tokens]
    features = feature_for_phobert([sample], tokenizer=tokenizer_topic, use_crf=True)
    loader = DataLoader(NerDataset(features), batch_size=1)
    with torch.no_grad():
        for batch in loader:
            on_device = {name: tensor.to(device) for name, tensor in batch.items()}
            outputs = model_topic(**on_device)

    pred_topic = list(set(topic_tags[cls_id] for cls_id in cvt2cls(outputs["tags"][0])))
    topics_text = ". ".join(topic.capitalize() for topic in pred_topic)
    return "Sentiment: " + pred_sent + "\n" + "Topic in sentence: " + topics_text
    

# Processor and LayoutLMv3 token-classification model from the
# "nam194/resume_parsing_layoutlmv3_large_custom_label" checkpoint.
# apply_ocr=False: pred_resume() passes the words and bounding boxes itself.
processor = transformers.AutoProcessor.from_pretrained("nam194/resume_parsing_layoutlmv3_large_custom_label", use_auth_token=True, apply_ocr=False) 
model = transformers.LayoutLMv3ForTokenClassification.from_pretrained("nam194/resume_parsing_layoutlmv3_large_custom_label").to(device)
# Every label of the resume token classifier; the position of a label in this
# list is its class id.
label_list = [
    'person_name',
    'dob_key',
    'dob_value',
    'gender_key',
    'gender_value',
    'phonenumber_key',
    'phonenumber_value',
    'email_key',
    'email_value',
    'address_key',
    'address_value',
    'socical_address_value',
    'education',
    'education_name',
    'education_time',
    'experience',
    'experience_name',
    'experience_time',
    'information',
    'undefined',
    'designation_key',
    'designation_value',
    'degree_key',
    'degree_value',
    'skill_key',
    'skill_value',
]
# Class id -> label name and the inverse lookup, both derived from label_list.
id2label = dict(enumerate(label_list))
# Labels whose words pred_resume() collects into its result.
key_list = [
    "person_name",
    "dob_value",
    "gender_value",
    "phonenumber_value",
    "email_value",
    "address_value",
    "socical_address_value",
    "education_name",
    "education_time",
    "experience_name",
    "experience_time",
    "designation_value",
    "degree_value",
    "skill_value",
]
label2id = {name: idx for idx, name in id2label.items()}
def pred_resume(pdf_path) -> str:
    """Extract resume fields from a PDF with the LayoutLMv3 token classifier.

    Each page's text spans are read with PyMuPDF, paired with the rendered
    page image, classified word by word, and the words whose predicted label
    is in ``key_list`` are collected.

    Args:
        pdf_path: The uploaded PDF, either an object exposing the file path
            as ``.name`` (as ``fitz.open`` was given originally) or a plain
            path.

    Returns:
        ``str()`` of a dict mapping every name in ``key_list`` to the list of
        words predicted with that label, in page order. The string form is
        what ``norm`` parses back with ``ast.literal_eval``.

    Raises:
        ValueError: If no file was provided.
    """
    if pdf_path is None:
        raise ValueError("No PDF file provided")

    # Factor applied to the integer-truncated span coordinates before they
    # are normalised against the rendered page size.
    # NOTE(review): presumably relates PDF units to the default rendering
    # resolution of pdf2image - confirm before changing the value.
    DPI = 200/77
    # Token overlap requested between consecutive windows of a long page;
    # the same amount is cut from the front of every follow-up window below.
    STRIDE = 256

    result = {key: [] for key in key_list}

    # Use one and the same path for both PDF libraries.
    pdf_file = getattr(pdf_path, "name", pdf_path)

    # Read the text layout of every page, then release the document handle.
    doc = fitz.open(pdf_file)
    try:
        page_blocks = [page.get_text('dict')['blocks'] for page in doc]
    finally:
        doc.close()
    images = pdf2image.convert_from_path(pdf_file)

    # Predict each page on its own.
    for image, blocks in zip(images, page_blocks):
        bboxes, words = [], []  # bounding boxes and texts of the page's spans
        for block in blocks:
            if block['type'] != 0:  # only type-0 blocks carry 'lines' of text
                continue
            for line in block['lines']:
                for span in line['spans']:
                    text = unidecode(span['text']).strip()
                    if text.replace(" ", "") == "":
                        continue
                    xmin, ymin, xmax, ymax = [int(i)*DPI for i in span['bbox']]
                    bboxes.append(normalize_bbox([xmin, ymin, xmax, ymax], image.size))
                    words.append(decontracted(text))
        if not words:
            # Nothing to classify on this page (e.g. no text layer).
            continue

        # Dummy word labels so the processor returns a "labels" row; positions
        # whose label is -100 are skipped when mapping predictions to words.
        fake_label = ["O"] * len(words)
        encoding = processor(image, words, boxes=bboxes, word_labels=fake_label, truncation=True, stride=STRIDE,
                             padding="max_length", max_length=512, return_overflowing_tokens=True,
                             return_offsets_mapping=True)
        labels = encoding["labels"]
        # Returned by the processor but not passed on to the model.
        encoding.pop('offset_mapping')
        encoding.pop('overflow_to_sample_mapping')
        model_inputs = {k: torch.tensor(v).to(device) for k, v in encoding.items() if k != "labels"}

        # Forward pass.
        with torch.no_grad():
            outputs = model(**model_inputs)

        # Stitch the windows back together: every window after the first
        # loses its leading STRIDE positions.
        logits = outputs["logits"]
        predictions = logits.argmax(-1).squeeze().tolist()
        if logits.shape[0] > 1:
            for i in range(1, len(labels)):
                labels[i] = labels[i][STRIDE:]
                predictions[i] = predictions[i][STRIDE:]
            predictions = [pred for window in predictions for pred in window]
        labels = [label for window in labels for label in window]

        word_predictions = [id2label[pred] for pred, label in zip(predictions, labels) if label != -100]
        # zip() stops at the shorter sequence, so a surplus prediction left
        # over from window stitching cannot index past the end of `words`.
        for word, pred in zip(words, word_predictions):
            if pred in result:
                result[pred].append(word)
    return str(result)
def norm(result: str) -> str:
    """Normalise the extracted resume fields with the rule-based parsers.

    Args:
        result: String form of the field dict, as returned by ``pred_resume``.

    Returns:
        ``str()`` of the same dict with the normalised values.
    """
    fields = ast.literal_eval(result)

    # Name: join the pieces, re-split on whitespace, clean and capitalise
    # every word.
    name_words = " ".join(fields["person_name"]).split()
    fields["person_name"] = " ".join(parse_string(word).capitalize() for word in name_words)

    fields["email_value"] = parse_email(fields["email_value"])

    # Phone number: keep the digits only.
    phone_text = "".join(fields["phonenumber_value"])
    fields["phonenumber_value"] = "".join(ch for ch in phone_text if ch.isdigit())

    # Remaining fields: one parser per key, applied in the original order.
    parsers = (
        ("address_value", parse_address),
        ("designation_value", parse_designation),
        ("experience_time", parse_time),
        ("gender_value", parse_gender),
        ("skill_value", parse_skill),
        ("education_name", parse_designation),
        ("experience_name", parse_designation),
    )
    for key, parser in parsers:
        fields[key] = parser(fields[key])
    return str(fields)


# Gradio front end with two tabs: company-review analysis and resume parsing.
with gr.Blocks() as demo:
    gr.Markdown("Demo projects Review Company and Resume parser phase 1.")
    # Tab 1: sentiment and topics of a free-text review.
    with gr.Tab("Review Company"):
        text_input = gr.Textbox(label="Input sentence (ex: Sếp tốt, bảo hiểm đóng full lương bảo hiểm cho nhân viên. Hàng năm tăng lương ổn OT không trả thêm tiền, chỉ cho ngày nghỉ và hỗ trợ ăn tối.):", placeholder="input here...")
        text_output = gr.Textbox(label="Result:")
        text_button = gr.Button("Predict")
    # Tab 2: field extraction from an uploaded PDF, then rule-based clean-up.
    with gr.Tab("Extract information from resume"):
        with gr.Row():
            file_input = gr.File(label="Upload pdf", file_types=[".pdf"])
            cv_output = gr.Textbox(label="Information fields")
            resume_button = gr.Button("Extract")
        with gr.Row():
            normalize_output = gr.Textbox(label="Normalize by rule-based:")
            normalize_button = gr.Button("Normalize")

    # Wire the buttons to their handlers; norm() consumes the text that
    # pred_resume() wrote into cv_output.
    text_button.click(sentiment, inputs=text_input, outputs=text_output)
    resume_button.click(pred_resume, inputs=file_input, outputs=cv_output)
    normalize_button.click(norm, inputs=cv_output, outputs=normalize_output)

demo.launch()