Spaces:

nam194
/

Review_company_analysis_and_Resume_parsing

Running

App Files Files Community

Review_company_analysis_and_Resume_parsing / app.py

nam194

Update app.py

fe3240a over 1 year ago

raw

history blame

4.45 kB

	import os

	import numpy as np
	import pdf2image
	import gradio as gr
	from imports import *
	from huggingface_hub import login
	# Authenticate with the Hugging Face Hub. The access token is read from the
	# HF_TOKEN environment variable (for example a Space secret) instead of being
	# committed to the repository.
	# NOTE(review): the token that used to be hard-coded on this line is exposed
	# in the repository history and should be revoked.
	hf_token = os.environ.get("HF_TOKEN")
	if not hf_token:
	    raise RuntimeError(
	        "HF_TOKEN environment variable is not set; a Hugging Face access "
	        "token is needed to log in before loading the models."
	    )
	login(token=hf_token)

	# Run inference on the GPU when one is available, otherwise on the CPU.
	device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

	# Index of the highest-scoring sentiment class -> label shown to the user.
	dict_ = {
	    0: "negative",
	    1: "positive",
	    2: "neutral",
	}

	# Tokenizer and 3-class sequence classifier used by sentiment().
	# use_auth_token=True makes the download rely on the login performed above.
	tokenizer_sent = AutoTokenizer.from_pretrained("nam194/sentiment", use_fast=False)
	model_sent = AutoModelForSequenceClassification.from_pretrained(
	    "nam194/sentiment", num_labels=3, use_auth_token=True
	).to(device)
	def cvt2cls(data):
	    """Collapse a sequence of NER tag ids into the distinct topic ids it mentions.

	    Tag id 20 (the 'O' tag in ``ner_tags``) is dropped. Ids of 10 and above
	    (the 'I-' tags in ``ner_tags``) are shifted down by 10 so that they
	    coincide with the id of the matching 'B-' tag, which is also the key
	    used by ``topic_tags``.

	    Args:
	        data: Iterable of integer tag ids.

	    Returns:
	        A sorted list of unique topic ids; empty when ``data`` is empty or
	        contains only the 'O' tag.
	    """
	    outside_tag = 20  # id of the 'O' tag
	    inside_offset = 10  # distance between an 'I-' id and its 'B-' id
	    topic_ids = {
	        tag - inside_offset if tag >= inside_offset else tag
	        for tag in data
	        if tag != outside_tag
	    }
	    # Sorting gives a stable result that does not depend on set iteration order.
	    return sorted(topic_ids)
	# Topic id -> topic name shown in the result text.
	topic_tags = {
	    0: 'chỗ để xe',
	    1: 'con người',
	    2: 'công việc',
	    3: 'cơ sở vật chất',
	    4: 'dự án',
	    5: 'lương',
	    6: 'môi trường làm việc',
	    7: 'ot/thời gian',
	    8: 'văn phòng',
	    9: 'đãi ngộ',
	}

	# NER label id -> tag: ids 0-9 are the 'B-' tags, ids 10-19 the 'I-' tags
	# of the same topics in the same order, and id 20 is the 'O' tag.
	ner_tags = {
	    0: 'B-chỗ để xe',
	    1: 'B-con người',
	    2: 'B-công việc',
	    3: 'B-cơ sở vật chất',
	    4: 'B-dự án',
	    5: 'B-lương',
	    6: 'B-môi trường làm việc',
	    7: 'B-ot/thời gian',
	    8: 'B-văn phòng',
	    9: 'B-đãi ngộ',
	    10: 'I-chỗ để xe',
	    11: 'I-con người',
	    12: 'I-công việc',
	    13: 'I-cơ sở vật chất',
	    14: 'I-dự án',
	    15: 'I-lương',
	    16: 'I-môi trường làm việc',
	    17: 'I-ot/thời gian',
	    18: 'I-văn phòng',
	    19: 'I-đãi ngộ',
	    20: 'O',
	}

	# Topic tagger: config, tokenizer and model all come from "nam194/ner".
	# The label count equals the size of the tag table above (21).
	config = RobertaConfig.from_pretrained("nam194/ner", num_labels=len(ner_tags))
	tokenizer_topic = AutoTokenizer.from_pretrained("nam194/ner", use_fast=False)
	model_topic = PhoBertLstmCrf.from_pretrained(
	    "nam194/ner",
	    config=config,
	    from_tf=False,
	).to(device)
	# Make the embedding matrix match the tokenizer's vocabulary size.
	model_topic.resize_token_embeddings(len(tokenizer_topic))


	def sentiment(sent: str):
	    """Predict the sentiment of a review and the topics it mentions.

	    The text is first classified with ``model_sent`` (label looked up in
	    ``dict_``), then tagged token by token with ``model_topic``; the tag ids
	    are reduced to topic names through ``cvt2cls`` and ``topic_tags``.

	    Args:
	        sent: The review text typed by the user.

	    Returns:
	        A two-line string: ``"Sentiment: <label>"`` followed by
	        ``"Topic in sentence: <topics>"`` with the capitalized topic names
	        in alphabetical order, joined by ``". "``. For blank or missing
	        input a short prompt asking for a sentence is returned instead.
	    """
	    # Nothing to analyse: answer right away instead of running the models.
	    if sent is None or not sent.strip():
	        return "Please enter a sentence."

	    # --- Sentiment classification ---
	    # normalize() is provided by the wildcard import; per the original note it
	    # segments the sentence and may raise ConnectionError (HTTPConnectionPool).
	    sent_ = normalize(text=sent)
	    input_sent = torch.tensor([tokenizer_sent.encode(sent_)]).to(device)
	    with torch.no_grad():
	        out_sent = model_sent(input_sent)
	    probs = out_sent.logits.softmax(dim=-1).tolist()[0]
	    pred_sent = dict_[int(np.argmax(probs))]

	    # --- Topic tagging ---
	    # replace_all() is also provided by the wildcard import; the original note
	    # warns that it, too, may raise ConnectionError.
	    sent = replace_all(text=sent)
	    # Split on full stops, word-segment every piece, and flatten the result
	    # into one token list.
	    tokens = [
	        token
	        for segment in sent.split(".")
	        for token in underthesea.word_tokenize(segment.strip(), format="text").split()
	    ]
	    # Every token gets the placeholder label 'O'; the model predicts the real tags.
	    dump = [[token, 'O'] for token in tokens]
	    dump_set = NerDataset(feature_for_phobert([dump], tokenizer=tokenizer_topic, use_crf=True))
	    dump_iter = DataLoader(dump_set, batch_size=1)

	    outputs = None
	    with torch.no_grad():
	        for batch in dump_iter:
	            batch = {k: v.to(device) for k, v in batch.items()}
	            outputs = model_topic(**batch)

	    if outputs is None:
	        # The loader produced no batch, so there is nothing to tag.
	        pred_topic = []
	    else:
	        # De-duplicate and sort so the same input always gives the same text.
	        pred_topic = sorted({topic_tags[i] for i in cvt2cls(outputs["tags"][0])})

	    return "Sentiment: " + pred_sent + "\n" + "Topic in sentence: " + ". ".join(i.capitalize() for i in pred_topic)


	def pdf_to_imgs(pdf):
	    """Render the first page of an uploaded PDF as an image array.

	    Args:
	        pdf: The value received from the file input: an object exposing the
	            file path as ``.name``, a plain path string, or ``None`` when no
	            file was uploaded.

	    Returns:
	        A numpy array holding the first page, or ``None`` when no file was
	        given or no page could be rendered.
	    """
	    if pdf is None:
	        return None

	    # Use the ``.name`` attribute when present, otherwise treat the value as the path.
	    path_to_pdf = getattr(pdf, "name", pdf)

	    # Only the first page is shown, so stop converting after page 1.
	    imgs = pdf2image.convert_from_path(path_to_pdf, last_page=1)
	    if not imgs:
	        return None
	    return np.array(imgs[0])


	# Build the two-tab Gradio interface: one tab for review analysis and one
	# for previewing the first page of an uploaded resume.
	with gr.Blocks() as demo:
	    gr.Markdown("Demo projects Review Company and Resume parser phase 1.")

	    # Tab 1: free-text review in, sentiment and topics out.
	    with gr.Tab("Review Company"):
	        text_input = gr.Textbox(label="Input sentence (ex: Sếp tốt, bảo hiểm đóng full lương bảo hiểm cho nhân viên. Hàng năm tăng lương ổn OT không trả thêm tiền, chỉ cho ngày nghỉ và hỗ trợ ăn tối.):", placeholder="input here...")
	        text_output = gr.Textbox(label="Result:")
	        text_button = gr.Button("Predict")

	    # Tab 2: PDF in, image of its first page out.
	    with gr.Tab("Extract information from resume"):
	        with gr.Row():
	            file_input = gr.File(label="PDF", file_types=[".pdf"])
	            image_output = gr.Image(type="numpy", label="Image of the first page")
	        image_button = gr.Button("Predict")

	    # Connect each button to its handler.
	    text_button.click(sentiment, inputs=text_input, outputs=text_output)
	    image_button.click(pdf_to_imgs, inputs=file_input, outputs=image_output)

	demo.launch()