File size: 3,780 Bytes
9e5b4bd 988450a 4e55f8f 988450a 4e55f8f 988450a 4e55f8f 1984dbe 988450a 4e55f8f 988450a 246d50e 988450a 4e55f8f 246d50e 4e55f8f 988450a 79a32e1 988450a 9e5b4bd 4e55f8f 79a32e1 988450a 9e5b4bd ebfa8f4 9e5b4bd 988450a 9e5b4bd 988450a 9e5b4bd 988450a 9e5b4bd |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 |
import os

import numpy as np
import gradio as gr
from imports import *
from huggingface_hub import login
# Authenticate with the Hugging Face Hub using a token supplied through the
# environment (for example a Space secret) rather than a credential committed
# to the source. When HF_TOKEN is unset the explicit login is skipped and any
# previously stored credentials are relied upon.
# NOTE(review): the token that was previously hard-coded on this line is
# exposed in the repository history and should be revoked.
hf_token = os.environ.get("HF_TOKEN")
if hf_token:
    login(token=hf_token)

# Run inference on the GPU when one is available, otherwise on the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Predicted class index -> sentiment label (looked up in `sentiment`).
dict_ = {
    0: "negative",
    1: "positive",
    2: "neutral",
}

# Tokenizer and 3-class sequence classifier used for sentiment prediction.
tokenizer_sent = AutoTokenizer.from_pretrained("nam194/sentiment", use_fast=False)
model_sent = AutoModelForSequenceClassification.from_pretrained(
    "nam194/sentiment", num_labels=3, use_auth_token=True
).to(device)
def cvt2cls(data):
    """Collapse predicted NER tag ids into the distinct topic ids they mention.

    Tag id 20 (the outside tag ``'O'`` in ``ner_tags``) is dropped. Ids of 10
    or more (the ``'I-'`` tags) are shifted down by 10 so that they share the
    id of the matching ``'B-'`` tag.

    Args:
        data: Iterable of integer tag ids.

    Returns:
        Sorted list of unique topic ids. Empty when ``data`` is empty or
        contains only the outside tag.
    """
    # A set removes the duplicates that arise when both the B- and the I- tag
    # of the same topic are present (e.g. 1 and 11 both map to 1).
    topics = {tag - 10 if tag >= 10 else tag for tag in data if tag != 20}
    # Sorting makes the result independent of set iteration order.
    return sorted(topics)
# NER label table: ids 0-9 are the "B-" tags, ids 10-19 the "I-" tags of the
# same ten topics, and id 20 is the outside tag 'O'.
ner_tags = {
    0: 'B-chỗ để xe',
    1: 'B-con người',
    2: 'B-công việc',
    3: 'B-cơ sở vật chất',
    4: 'B-dự án',
    5: 'B-lương',
    6: 'B-môi trường làm việc',
    7: 'B-ot/thời gian',
    8: 'B-văn phòng',
    9: 'B-đãi ngộ',
    10: 'I-chỗ để xe',
    11: 'I-con người',
    12: 'I-công việc',
    13: 'I-cơ sở vật chất',
    14: 'I-dự án',
    15: 'I-lương',
    16: 'I-môi trường làm việc',
    17: 'I-ot/thời gian',
    18: 'I-văn phòng',
    19: 'I-đãi ngộ',
    20: 'O',
}

# Topic id -> topic name, without the B-/I- prefix.
topic_tags = {
    0: 'chỗ để xe',
    1: 'con người',
    2: 'công việc',
    3: 'cơ sở vật chất',
    4: 'dự án',
    5: 'lương',
    6: 'môi trường làm việc',
    7: 'ot/thời gian',
    8: 'văn phòng',
    9: 'đãi ngộ',
}

# The NER head has one output label per entry in `ner_tags` (21 in total).
config = RobertaConfig.from_pretrained("nam194/ner", num_labels=len(ner_tags))
tokenizer_topic = AutoTokenizer.from_pretrained("nam194/ner", use_fast=False)
model_topic = PhoBertLstmCrf.from_pretrained(
    "nam194/ner",
    config=config,
    from_tf=False,
).to(device)
# Size the embedding matrix to the tokenizer's vocabulary.
model_topic.resize_token_embeddings(len(tokenizer_topic))
def sentiment(sent: str):
    """Predict the overall sentiment and the mentioned topics of a review.

    Args:
        sent: Raw review text.

    Returns:
        The ``str()`` of a dict with two keys: ``"sentiment"`` (a label from
        ``dict_``) and ``"topic"`` (a sorted list of labels from
        ``topic_tags``, empty when no topic tag is predicted).
    """
    # --- Sentiment classification ---
    # Per the original author's note, `normalize` segments the input sentence
    # and may raise ConnectionError: HTTPConnectionPool().
    normalized = normalize(text=sent)
    input_ids = torch.tensor([tokenizer_sent.encode(normalized)]).to(device)
    with torch.no_grad():
        probs = model_sent(input_ids).logits.softmax(dim=-1).tolist()[0]
    pred_sent = dict_[int(np.argmax(probs))]

    # --- Topic extraction ---
    # NOTE(review): the original comment on this call repeated the
    # ConnectionError warning above; whether `replace_all` can raise it is
    # unconfirmed.
    cleaned = replace_all(text=sent)
    # Word-segment every "."-separated piece and flatten into one token list.
    tokens = []
    for piece in cleaned.split("."):
        tokens.extend(underthesea.word_tokenize(piece.strip(), format="text").split())
    # Each token is paired with the placeholder label 'O'.
    dump = [[token, 'O'] for token in tokens]
    dump_set = NerDataset(feature_for_phobert([dump], tokenizer=tokenizer_topic, use_crf=True))
    dump_iter = DataLoader(dump_set, batch_size=1)

    # Start from "no tags" so that a loader yielding no batch produces an
    # empty topic list instead of a NameError.
    tag_ids = []
    with torch.no_grad():
        for batch in dump_iter:
            batch = {k: v.to(device) for k, v in batch.items()}
            tag_ids = model_topic(**batch)["tags"][0]

    # De-duplicate and sort so the reported topic order is deterministic.
    pred_topic = sorted({topic_tags[i] for i in cvt2cls(tag_ids)})
    return str({"sentiment": pred_sent, "topic": pred_topic})
def flip_image(x):
    """Mirror an image left-to-right.

    Args:
        x: Image array with at least two dimensions, or ``None`` (for
            example when the handler runs before any image was provided).

    Returns:
        ``np.fliplr(x)``, or ``None`` when ``x`` is ``None``.
    """
    if x is None:
        # Nothing to flip: return an empty result instead of letting
        # np.fliplr raise on a non-array input.
        return None
    return np.fliplr(x)
# Two-tab demo UI: text review analysis and a resume-image tab.
with gr.Blocks() as demo:
    gr.Markdown("Demo projects Review Company and Resume parser phase 1.")

    # Tab 1: free-text review in, sentiment/topic prediction out.
    with gr.Tab("Review Company"):
        text_input = gr.Textbox(label="Input sentence:", placeholder="input here...")
        text_output = gr.Textbox(label="Result:")
        text_button = gr.Button("Predict")

    # Tab 2: image in, image out (currently wired to `flip_image`).
    with gr.Tab("Extract information from resume"):
        with gr.Row():
            image_input = gr.Image()
            image_output = gr.Image()
        image_button = gr.Button("Predict")

    # Event wiring is registered while the Blocks context is still open.
    text_button.click(sentiment, inputs=text_input, outputs=text_output)
    image_button.click(flip_image, inputs=image_input, outputs=image_output)

demo.launch()