File size: 4,381 Bytes
9e5b4bd 988450a 4e55f8f 012faab 988450a 4e55f8f 988450a 4e55f8f 1984dbe 988450a 4e55f8f 988450a 246d50e 988450a 4e55f8f 246d50e 4e55f8f 988450a fe3240a 988450a 9e5b4bd fe3240a 9e5b4bd 4e55f8f fe3240a 79a32e1 988450a 012faab 5d0c783 012faab 988450a 9e5b4bd 988450a 9e5b4bd 988450a fe3240a 9e5b4bd |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 |
import gradio as gr
from imports import *
import os

# Authenticate against the Hugging Face Hub.
# The access token is read from the HF_TOKEN environment variable (e.g. a
# Space secret) instead of being committed to the repository. The token that
# used to be hard-coded on this line must be treated as leaked and revoked.
# `login` comes from the star import above (presumably huggingface_hub.login).
_hf_token = os.environ.get("HF_TOKEN")
if _hf_token:
    login(token=_hf_token)
# When HF_TOKEN is unset no login is attempted; `use_auth_token=True` below
# then depends on credentials already stored on the machine, if any.

# Run on the GPU when one is available, otherwise on the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Output index of the sentiment classifier -> human-readable label.
dict_ = {
    0: "negative",
    1: "positive",
    2: "neutral",
}

# Sentiment tokenizer and 3-class sequence classifier.
tokenizer_sent = AutoTokenizer.from_pretrained("nam194/sentiment", use_fast=False)
model_sent = AutoModelForSequenceClassification.from_pretrained(
    "nam194/sentiment",
    num_labels=3,
    use_auth_token=True,
).to(device)
def cvt2cls(data, outside_id=20, inside_offset=10):
    """Collapse a sequence of BIO tag ids into the distinct topic class ids.

    With the defaults this matches the tag table used in this file: id 20 is
    the 'O' (outside) tag and ids 10-19 are the "I-" variants of ids 0-9.

    Args:
        data: Iterable of integer tag ids.
        outside_id: Tag id to discard entirely.
        inside_offset: Ids greater than or equal to this value are shifted
            down by this amount so that an "I-" id maps onto its "B-" id.

    Returns:
        A sorted list of class ids without duplicates. An empty input, or an
        input made only of `outside_id`, gives an empty list.
    """
    classes = set()
    for tag in data:
        if tag == outside_id:
            continue
        # Fold the "I-" range onto the matching "B-" id; the set then merges
        # a topic that appeared with both prefixes into a single entry.
        classes.add(tag - inside_offset if tag >= inside_offset else tag)
    return sorted(classes)
# Topic classes, keyed by class id.
topic_tags = {
    0: 'chỗ để xe',
    1: 'con người',
    2: 'công việc',
    3: 'cơ sở vật chất',
    4: 'dự án',
    5: 'lương',
    6: 'môi trường làm việc',
    7: 'ot/thời gian',
    8: 'văn phòng',
    9: 'đãi ngộ',
}

# BIO tag table built from the topics: ids 0-9 are "B-<topic>", ids 10-19 are
# "I-<topic>", and id 20 is the outside tag 'O'.
ner_tags = {
    base + topic_id: f"{prefix}-{topic}"
    for base, prefix in ((0, "B"), (10, "I"))
    for topic_id, topic in topic_tags.items()
}
ner_tags[20] = 'O'

# Topic tagger: configuration with 21 labels, tokenizer and model weights.
config = RobertaConfig.from_pretrained("nam194/ner", num_labels=21)
tokenizer_topic = AutoTokenizer.from_pretrained("nam194/ner", use_fast=False)
model_topic = PhoBertLstmCrf.from_pretrained(
    "nam194/ner",
    config=config,
    from_tf=False,
).to(device)
# Make the embedding matrix match the tokenizer's vocabulary size.
model_topic.resize_token_embeddings(len(tokenizer_topic))
def _predict_sentiment(sent):
    """Return the sentiment label for *sent* as looked up in `dict_`."""
    # `normalize` is a project helper; per the original author's note it
    # segments the sentence and may raise ConnectionError (HTTPConnectionPool).
    normalized = normalize(text=sent)
    input_ids = torch.tensor([tokenizer_sent.encode(normalized)]).to(device)
    with torch.no_grad():
        scores = model_sent(input_ids).logits.softmax(dim=-1).tolist()[0]
    # int() turns the NumPy index into a plain int before the dict lookup.
    return dict_[int(np.argmax(scores))]


def _predict_topics(sent):
    """Return the topic names tagged in *sent*, ordered by class id, no duplicates."""
    cleaned = replace_all(text=sent)
    # Split on full stops, word-segment every piece, and flatten to one word list.
    words = []
    for segment in cleaned.split("."):
        words.extend(underthesea.word_tokenize(segment.strip(), format="text").split())
    # Every word is paired with the placeholder label 'O' before featurisation.
    tagged_words = [[word, 'O'] for word in words]
    dataset = NerDataset(
        feature_for_phobert([tagged_words], tokenizer=tokenizer_topic, use_crf=True)
    )
    loader = DataLoader(dataset, batch_size=1)

    outputs = None
    with torch.no_grad():
        for batch in loader:
            batch = {key: value.to(device) for key, value in batch.items()}
            # Only the last batch's output is kept; a single sample is passed
            # in, so one batch is expected here.
            outputs = model_topic(**batch)
    if outputs is None:
        # The loader produced nothing: report no topics instead of failing on
        # an unbound variable.
        return []

    # Sorting the distinct class ids gives the same topic order on every run;
    # iterating a set of strings does not, because of hash randomisation.
    class_ids = sorted(set(cvt2cls(outputs["tags"][0])))
    return [topic_tags[class_id] for class_id in class_ids]


def sentiment(sent: str):
    """Predict the sentiment and the topics of a review sentence.

    Args:
        sent: Raw input sentence typed by the user.

    Returns:
        A two-line string: "Sentiment: <label>" followed by
        "Topic in sentence: <Topic>. <Topic>..." with every topic capitalised.
    """
    pred_sent = _predict_sentiment(sent)
    pred_topic = _predict_topics(sent)
    topics = ". ".join(topic.capitalize() for topic in pred_topic)
    return "Sentiment: " + pred_sent + "\n" + "Topic in sentence: " + topics
def pdf_to_imgs(pdf):
    """Render the first page of an uploaded PDF as a NumPy image array.

    Args:
        pdf: Value delivered by the file-upload input. It is either ``None``
            (nothing uploaded), a path string, or an object whose ``name``
            attribute holds the path of the uploaded file.

    Returns:
        ``np.array`` of the first page's image, or ``None`` when *pdf* is
        ``None`` so that clicking the button without a file does not raise.
    """
    if pdf is None:
        return None
    # Accept a plain path as well as a file wrapper exposing `.name`
    # (which of the two arrives depends on the Gradio version in use).
    path_to_pdf = pdf if isinstance(pdf, str) else pdf.name
    # Only the first page is returned, so only the first page is converted.
    pages = pdf2image.convert_from_path(path_to_pdf, last_page=1)
    return np.array(pages[0])
# Two-tab Gradio UI: sentence review analysis and first-page preview of a resume PDF.
with gr.Blocks() as demo:
    gr.Markdown("Demo projects Review Company and Resume parser phase 1.")

    # Tab 1: free-text input -> sentiment and topic prediction.
    with gr.Tab("Review Company"):
        text_input = gr.Textbox(
            label="Input sentence (ex: Sếp tốt, bảo hiểm đóng full lương bảo hiểm cho nhân viên. Hàng năm tăng lương ổn OT không trả thêm tiền, chỉ cho ngày nghỉ và hỗ trợ ăn tối.):",
            placeholder="input here...",
        )
        text_output = gr.Textbox(label="Result:")
        text_button = gr.Button("Predict")

    # Tab 2: PDF upload -> image of the first page.
    with gr.Tab("Extract information from resume"):
        file_input = gr.File(label="Upload pdf", file_types=[".pdf"])
        image_output = gr.Image(type="numpy", label="Image of the first page")
        image_button = gr.Button("Predict")

    # Wire each button to its handler.
    text_button.click(sentiment, inputs=text_input, outputs=text_output)
    image_button.click(pdf_to_imgs, inputs=file_input, outputs=image_output)

demo.launch()