Spaces:

nam194
/

Review_company_analysis_and_Resume_parsing

Running

Review_company_analysis_and_Resume_parsing

File size: 9,646 Bytes

import os

import gradio as gr
from imports import *
from parse_info import *
# Authenticate against the Hugging Face Hub.
# SECURITY: an access token used to be hard-coded on this line. Secrets must not
# live in source control; the token is now read from the HF_TOKEN environment
# variable (e.g. a Space secret). The previously committed token should be
# considered leaked and revoked.
_hf_token = os.environ.get("HF_TOKEN")
if _hf_token:
    login(token=_hf_token)
# When HF_TOKEN is unset, no login is attempted and the loaders below fall back
# to whatever credentials are already cached on the machine.


# Run every model on the GPU when one is available, otherwise on the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Class index predicted by the sentiment model -> human-readable label.
dict_ = {
    0: "negative",
    1: "positive",
    2: "neutral"}

# Sentiment classifier (3 labels) and its tokenizer.
tokenizer_sent = AutoTokenizer.from_pretrained("nam194/sentiment", use_fast=False)
model_sent = AutoModelForSequenceClassification.from_pretrained("nam194/sentiment", num_labels=3, use_auth_token=True).to(device)
def cvt2cls(data, *, num_classes=10, outside_id=20):
    """Collapse NER tag ids into the sorted list of distinct class ids.

    Tag ids at or above ``num_classes`` are shifted down by ``num_classes`` so
    that the two tag ranges of the same class (``B-``/``I-`` in ``ner_tags``)
    map to a single id. The ``outside_id`` tag carries no class and is dropped.

    Args:
        data: Iterable of integer tag ids.
        num_classes: Offset between the two tag ranges (10 for ``ner_tags``).
        outside_id: Id of the tag that belongs to no class (20, i.e. ``'O'``).

    Returns:
        A sorted list of class ids without duplicates; empty when ``data`` is
        empty or contains only ``outside_id``.
    """
    classes = set()
    for tag in data:
        if tag == outside_id:
            continue
        # Both tag ranges of one class collapse onto the same class id.
        classes.add(tag - num_classes if tag >= num_classes else tag)
    return sorted(classes)
# Tag inventory of the topic NER model: ids 0-9 are "B-" tags, ids 10-19 are
# the matching "I-" tags, and id 20 is the outside tag 'O'.
ner_tags = {
    0: 'B-chỗ để xe',
    1: 'B-con người',
    2: 'B-công việc',
    3: 'B-cơ sở vật chất',
    4: 'B-dự án',
    5: 'B-lương',
    6: 'B-môi trường làm việc',
    7: 'B-ot/thời gian',
    8: 'B-văn phòng',
    9: 'B-đãi ngộ',
    10: 'I-chỗ để xe',
    11: 'I-con người',
    12: 'I-công việc',
    13: 'I-cơ sở vật chất',
    14: 'I-dự án',
    15: 'I-lương',
    16: 'I-môi trường làm việc',
    17: 'I-ot/thời gian',
    18: 'I-văn phòng',
    19: 'I-đãi ngộ',
    20: 'O',
}

# Topic class id -> topic name (the ner_tags names without the B-/I- prefix).
topic_tags = {
    0: 'chỗ để xe',
    1: 'con người',
    2: 'công việc',
    3: 'cơ sở vật chất',
    4: 'dự án',
    5: 'lương',
    6: 'môi trường làm việc',
    7: 'ot/thời gian',
    8: 'văn phòng',
    9: 'đãi ngộ',
}

# Topic tagger: one output label per entry of ner_tags (21 in total).
config = RobertaConfig.from_pretrained("nam194/ner", num_labels=len(ner_tags))
tokenizer_topic = AutoTokenizer.from_pretrained("nam194/ner", use_fast=False)
model_topic = PhoBertLstmCrf.from_pretrained(
    "nam194/ner",
    config=config,
    from_tf=False,
).to(device)
# Make the embedding matrix size agree with the tokenizer's vocabulary size.
model_topic.resize_token_embeddings(len(tokenizer_topic))


def sentiment(sent: str):
    """Predict the sentiment and the topics mentioned in a review sentence.

    Args:
        sent: Raw review text entered by the user.

    Returns:
        A two-line string: ``"Sentiment: <label>"`` followed by
        ``"Topic in sentence: <topics>"`` where topics are capitalized, joined
        with ``". "`` and listed in ascending topic-id order. For empty or
        whitespace-only input a short prompt is returned instead and no model
        is run.
    """
    if not sent or not sent.strip():
        return "Please enter a sentence to analyze."

    # --- Sentiment classification ---
    # The original author notes that normalize() may raise
    # ConnectionError (HTTPConnectionPool).
    normalized = normalize(text=sent)
    input_ids = torch.tensor([tokenizer_sent.encode(normalized)]).to(device)
    with torch.no_grad():
        probabilities = model_sent(input_ids).logits.softmax(dim=-1).tolist()[0]
    pred_sent = dict_[int(np.argmax(probabilities))]

    # --- Topic tagging ---
    # The original author notes that replace_all() may also raise
    # ConnectionError (HTTPConnectionPool).
    cleaned = replace_all(text=sent)
    tokens = []
    for segment in cleaned.split("."):
        tokens.extend(underthesea.word_tokenize(segment.strip(), format="text").split())
    # Every token gets the placeholder tag 'O'; only the tokens matter here.
    dump = [[token, 'O'] for token in tokens]
    dump_set = NerDataset(feature_for_phobert([dump], tokenizer=tokenizer_topic, use_crf=True))
    dump_iter = DataLoader(dump_set, batch_size=1)

    outputs = None
    with torch.no_grad():
        for batch in dump_iter:
            batch = {k: v.to(device) for k, v in batch.items()}
            outputs = model_topic(**batch)

    # No batch means nothing was tagged, hence no topics.
    tag_ids = outputs["tags"][0] if outputs is not None else []
    # Sorting the unique class ids keeps the topic order stable between runs.
    pred_topic = [topic_tags[i] for i in sorted(set(cvt2cls(tag_ids)))]
    topics = ". ".join(topic.capitalize() for topic in pred_topic)
    return "Sentiment: " + pred_sent + "\n" + "Topic in sentence: " + topics
    

# Resume parser: LayoutLMv3 token classifier and its processor. OCR is
# disabled in the processor; words and boxes are passed in by the caller.
processor = transformers.AutoProcessor.from_pretrained(
    "nam194/resume_parsing_layoutlmv3_large_custom_label",
    use_auth_token=True,
    apply_ocr=False,
)
model = transformers.LayoutLMv3ForTokenClassification.from_pretrained(
    "nam194/resume_parsing_layoutlmv3_large_custom_label"
).to(device)

# Labels in id order: the position in this list is the label id.
label_list = [
    'person_name',
    'dob_key',
    'dob_value',
    'gender_key',
    'gender_value',
    'phonenumber_key',
    'phonenumber_value',
    'email_key',
    'email_value',
    'address_key',
    'address_value',
    'socical_address_value',
    'education',
    'education_name',
    'education_time',
    'experience',
    'experience_name',
    'experience_time',
    'information',
    'undefined',
    'designation_key',
    'designation_value',
    'degree_key',
    'degree_value',
    'skill_key',
    'skill_value',
]

# id -> label, derived from the list order above (0: 'person_name', ...).
id2label = dict(enumerate(label_list))

# Labels whose words are collected into the extraction result.
key_list = [
    "person_name",
    "dob_value",
    "gender_value",
    "phonenumber_value",
    "email_value",
    "address_value",
    "socical_address_value",
    "education_name",
    "education_time",
    "experience_name",
    "experience_time",
    "designation_value",
    "degree_value",
    "skill_value",
]

# label -> id, the inverse of id2label.
label2id = {label: idx for idx, label in id2label.items()}
def pred_resume(pdf_path) -> dict:
    global key_list, device
    result = {}
    for i in key_list:
        result[i] = []
    DPI = 200/77
    global label_list, id2label, label2id

    # read pdf, convert to img
    doc = fitz.open(pdf_path.name)
    num_pages = len(doc)
    images = pdf2image.convert_from_path(pdf_path)
    block_dict = {} 

    # get all data in pdf
    page_num = 1
    for page in doc: 
        file_dict = page.get_text('dict') 
        block = file_dict['blocks'] 
        block_dict[page_num] = block 
        page_num += 1

    # predict each page in pdf
    for page_num, blocks in block_dict.items():
        bboxes, words = [], [] # store bounding boxes, text in a page
        image = images[page_num-1]
        for block in blocks:
            if block['type'] == 0:
                for line in block['lines']:
                    for span in line['spans']:
                        xmin, ymin, xmax, ymax = [int(i)*DPI for i in list(span['bbox'])]
                        text = unidecode(span['text']).strip()
                        if text.replace(" ","") !=  "":
                            bboxes.append(normalize_bbox([xmin, ymin, xmax, ymax], image.size))
                            words.append(decontracted(text))
        fake_label = ["O"] * len(words)
        encoding = processor(image, words, boxes=bboxes, word_labels=fake_label, truncation=True, stride=256,
                      padding="max_length", max_length=512, return_overflowing_tokens=True, return_offsets_mapping=True)
        labels = encoding["labels"]
        offset_mapping = encoding.pop('offset_mapping')
        overflow_to_sample_mapping = encoding.pop('overflow_to_sample_mapping')
        encoding = {k: torch.tensor(v) for k,v in encoding.items() if k != "labels"}
        x = []
        for i in range(0, len(encoding['pixel_values'])):
            x.append(encoding['pixel_values'][i])
        x = torch.stack(x)
        encoding['pixel_values'] = x

        # forawrd to model
        with torch.no_grad():
            outputs = model(**{k: v.to(device) for k,v in encoding.items() if k != "labels"})
        
        # process output
        predictions = outputs["logits"].argmax(-1).squeeze().tolist()
        if outputs["logits"].shape[0] > 1:
            for i, label in enumerate(labels):
                if i>0:
                    labels[i] = labels[i][256:]
                    predictions[i] = predictions[i][256:]
            predictions = [j for i in predictions for j in i]
        labels = [j for i in labels for j in i]
        true_predictions = [id2label[pred] for pred, label in zip(predictions, labels) if label != -100]
        for i, pred in enumerate(true_predictions):
            if pred in key_list:
                result[pred].append(words[i])
    return str(result)
def norm(result: str) -> str:
    """Normalize the extracted resume fields with the rule-based parsers.

    Args:
        result: ``str()`` of the field dict, as produced by ``pred_resume``.

    Returns:
        ``str()`` of the dict after normalization. Fields without a parser
        below are left as they are. Empty or whitespace-only input yields
        ``""`` because there is nothing to normalize.

    Raises:
        KeyError: If one of the normalized fields is missing from the dict.
    """
    if not result or not result.strip():
        return ""

    # literal_eval only accepts Python literals, so the text is never executed.
    fields = ast.literal_eval(result)

    # Name: re-split on whitespace, clean each token, capitalize it.
    name_tokens = " ".join(fields["person_name"]).split()
    fields["person_name"] = " ".join(parse_string(token).capitalize() for token in name_tokens)
    fields["email_value"] = parse_email(fields["email_value"])
    # Phone number: keep the digits only.
    fields["phonenumber_value"] = "".join(
        char for char in "".join(fields["phonenumber_value"]) if char.isdigit()
    )

    # Remaining fields: one parser each, applied in the original order.
    field_parsers = (
        ("address_value", parse_address),
        ("designation_value", parse_designation),
        ("experience_time", parse_time),
        ("gender_value", parse_gender),
        ("skill_value", parse_skill),
        ("education_name", parse_designation),
        ("experience_name", parse_designation),
    )
    for key, parser in field_parsers:
        fields[key] = parser(fields[key])
    return str(fields)


# Gradio UI: one tab per demo.
with gr.Blocks() as demo:
    gr.Markdown("Demo projects Review Company and Resume parser phase 1.")

    # Tab 1: sentiment and topics of a company review sentence.
    with gr.Tab("Review Company"):
        text_input = gr.Textbox(label="Input sentence (ex: Sếp tốt, bảo hiểm đóng full lương bảo hiểm cho nhân viên. Hàng năm tăng lương ổn OT không trả thêm tiền, chỉ cho ngày nghỉ và hỗ trợ ăn tối.):", placeholder="input here...")
        text_output = gr.Textbox(label="Result:")
        text_button = gr.Button("Predict")

    # Tab 2: field extraction from a PDF resume, then rule-based normalization.
    with gr.Tab("Extract information from resume"):
        with gr.Row():
            file_input = gr.File(label="Upload pdf", file_types=[".pdf"])
            cv_output = gr.Textbox(label="Information fields")
            resume_button = gr.Button("Extract")
        with gr.Row():
            normalize_output = gr.Textbox(label="Normalize by rule-based:")
            normalize_button = gr.Button("Normalize")

    # Wiring: the normalizer reads the text shown in the extraction output box.
    text_button.click(sentiment, inputs=text_input, outputs=text_output)
    resume_button.click(pred_resume, inputs=file_input, outputs=cv_output)
    normalize_button.click(norm, inputs=cv_output, outputs=normalize_output)

demo.launch()