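"""Gradio demo: part-of-speech tagging for Portuguese.

Tokenizes the input with spaCy's pt_core_news_sm pipeline, then labels each
token with the Emanuel/bertimbau-base-pos token-classification model.
"""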
import logging
from typing import List, Tuple

import gradio as gr
import pandas as pd
import spacy
import torch
from transformers import AutoModelForTokenClassification, AutoTokenizer

try:
    nlp = spacy.load("pt_core_news_sm")
except OSError:
    # The Portuguese pipeline is not installed yet: download it and retry.
    spacy.cli.download("pt_core_news_sm")
    nlp = spacy.load("pt_core_news_sm")

model = AutoModelForTokenClassification.from_pretrained("Emanuel/bertimbau-base-pos")
tokenizer = AutoTokenizer.from_pretrained("Emanuel/bertimbau-base-pos")
# basicConfig attaches a stream handler so the INFO messages below are shown.
logging.basicConfig(level=logging.DEBUG)
logger = logging.getLogger()


def predict(text, nlp, logger=None) -> Tuple[List[str], List[str]]:
    """Tokenize `text` with spaCy and return (tokens, POS labels)."""
    doc = nlp(text)
    tokens = [token.text for token in doc]

    if logger is not None:
        logger.info(f"Starting predictions for sentence: {text}")

    input_tokens = tokenizer(
        tokens,
        return_tensors="pt",
        is_split_into_words=True,
        return_offsets_mapping=True,
        return_special_tokens_mask=True,
    )
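    # Inference only: run one forward pass without gradient tracking;
    # output.logits has shape (1, num_subtokens, num_labels).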
    with torch.no_grad():
        output = model(input_tokens["input_ids"])

    # Keep one prediction per spaCy token: skip special tokens ([CLS]/[SEP])
    # and subword continuations (an offset start > 0 means "not word-initial").
    i_token = 0
    labels = []
    for off, is_special_token, pred in zip(
        input_tokens["offset_mapping"][0],
        input_tokens["special_tokens_mask"][0],
        output.logits[0],
    ):
        if is_special_token or off[0] > 0:
            continue
        label = model.config.id2label[int(pred.argmax(dim=-1))]
        if logger is not None:
            logger.info(f"{off}, {tokens[i_token]}, {label}")
        labels.append(label)
        i_token += 1

    return tokens, labels
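
# Usage sketch (labels follow the tagset in the model's id2label config):
#   tokens, labels = predict("O gato dorme no sofá.", nlp)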


def text_analysis(text):
    tokens, labels = predict(text, nlp, logger)
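    # One row per token for the Dataframe view.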
    pos_count = pd.DataFrame(
        {
            "token": tokens,
            "etiqueta": labels,
        }
    )
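    # HighlightedText renders (text, label) pairs; the unlabeled spaces keep
    # the tokens visually separated.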
    pos_tokens = []
    for token, label in zip(tokens, labels):
        pos_tokens.extend([(token, label), (" ", None)])

    return pos_tokens, pos_count


# Static assets for the page chrome.
with open("style.css") as f:
    css = f.read()
with open("top.html") as f:
    top_html = f.read()
with open("bottom.html") as f:
    bottom_html = f.read()

with gr.Blocks(css=css) as demo:
    gr.HTML(top_html)
    text = gr.Textbox(placeholder="Insira um texto...", label="Texto de entrada")
    output_highlighted = gr.HighlightedText()
    output_df = gr.Dataframe()
    submit_btn = gr.Button("Enviar")
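    # text_analysis returns (pos_tokens, pos_count); Gradio maps them in
    # order onto output_highlighted and output_df.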
    submit_btn.click(
        fn=text_analysis, inputs=text, outputs=[output_highlighted, output_df]
    )
    examples = gr.Examples(
        examples=[
            [
                "A população não poderia ter acesso a relatórios que explicassem, por exemplo, os motivos exatos de atrasos em obras de linhas e estações."
            ],
            ["Filme 'Star Wars : Os Últimos Jedi' ganha trailer definitivo; assista."],
        ],
        inputs=[text],
        label="Exemplos",
    )
    gr.HTML(bottom_html)


demo.launch()