|
---
inference: false
license: cc-by-4.0
datasets:
- wikiann
language:
- bg
metrics:
- accuracy
---
|
|
|
# 🇧🇬 BERT - Bulgarian Named Entity Recognition |
|
This is [rmihaylov/bert-base-bg](https://huggingface.co/rmihaylov/bert-base-bg) fine-tuned for named entity recognition on the Bulgarian subset of [wikiann](https://huggingface.co/datasets/wikiann).
|
|
|
## Usage |
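The examples below need only `torch` and `transformers`. A minimal install, assuming a recent Python environment:

```sh
pip install torch transformers
```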
|
Import the libraries: |
|
```python
from typing import List, Dict

import torch
from transformers import AutoModelForTokenClassification, AutoTokenizer, pipeline
```
|
|
|
First, define the helper functions below. The model uses a subword tokenizer, so its per-token predictions have to be merged back into word-level entities:
|
```python
def predict(
    text: str,
    model: torch.nn.Module,
    tokenizer: AutoTokenizer,
    labels_tags={
        0: "O",
        1: "B-PER", 2: "I-PER",
        3: "B-ORG", 4: "I-ORG",
        5: "B-LOC", 6: "I-LOC"
    }) -> List[Dict[str, str]]:
    # Tokenize the text and rebuild the original words from the subwords.
    tokens_data = tokenizer(text)
    tokens = tokenizer.convert_ids_to_tokens(tokens_data["input_ids"])
    words = subwords_to_words(tokens)

    input_ids = torch.LongTensor(tokens_data["input_ids"]).unsqueeze(0)
    attention_mask = torch.LongTensor(tokens_data["attention_mask"]).unsqueeze(0)

    # Predict one label id per subword token and map the ids to BIO tags.
    out = model(input_ids, attention_mask=attention_mask).logits
    out = out.argmax(-1).squeeze(0).tolist()
    prediction = [labels_tags[idx] if idx in labels_tags else idx for idx in out]

    return merge_words_and_predictions(words, prediction)


def subwords_to_words(tokens: List[str]) -> List[str]:
    """Merge SentencePiece subwords (marked with "▁") back into whole words."""
    out_tokens = []
    curr_token = ""

    for token in tokens:
        if token == "[SEP]":
            # Flush the last word and stop at the end-of-sequence token.
            curr_token = curr_token.replace("▁", "")
            out_tokens.append(curr_token)
            out_tokens.append("[SEP]")
            break

        if "▁" in token and curr_token == "":
            curr_token += token
        elif "▁" in token and curr_token != "":
            # A new word starts - flush the accumulated one.
            curr_token = curr_token.replace("▁", "")
            out_tokens.append(curr_token)
            curr_token = token
        elif "▁" not in token:
            # A continuation subword - append it to the current word.
            curr_token += token

    return out_tokens


def merge_words_and_predictions(words: List[str], entities: List[str]) -> List[Dict[str, str]]:
    """Group consecutive B-/I- tagged words into entity dictionaries."""
    result = []
    curr_word = []

    # Skip the leading [CLS] token and its prediction.
    for i, (word, entity) in enumerate(zip(words[1:], entities[1:])):
        if "B-" in entity:
            if curr_word:
                # A new entity begins - flush the previous one.
                # (entities[i] is the tag of the previous word.)
                result.append({
                    "word": " ".join(curr_word),
                    "entity_group": entities[i][2:]
                })
                curr_word = [word]
            else:
                curr_word.append(word)

        if "I-" in entity:
            curr_word.append(word)

        if "O" == entity:
            if curr_word:
                result.append({
                    "word": " ".join(curr_word),
                    "entity_group": entities[i][2:]
                })

            curr_word = []

    return result
```
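As a quick sanity check, here is what `subwords_to_words()` produces for a hypothetical subword sequence (the exact pieces depend on the tokenizer's vocabulary, so the tokens below are illustrative):

```python
tokens = ["[CLS]", "▁Барух", "▁Спин", "оза", "[SEP]"]
print(subwords_to_words(tokens))
# ['[CLS]', 'Барух', 'Спиноза', '[SEP]']
```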
|
|
|
Then, you should initialize the `AutoTokenizer` and `AutoModelForTokenClassification` objects: |
|
```python
MODEL_ID = "auhide/bert-bg-ner"

tokenizer = AutoTokenizer.from_pretrained(MODEL_ID)
model = AutoModelForTokenClassification.from_pretrained(MODEL_ID)
```
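Since this is inference only, you can optionally wrap calls in `torch.no_grad()` to skip gradient bookkeeping; a minimal sketch with a hypothetical input sentence:

```python
with torch.no_grad():
    # Autograd tracking is unnecessary when only predicting.
    entities = predict("София е столицата на България", model=model, tokenizer=tokenizer)
```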
|
|
|
Finally, call the `predict()` function defined above like this:
|
```python
text = "Барух Спиноза е роден в Амстердам"  # "Baruch Spinoza was born in Amsterdam"

print(f"Input: {text}")
print("NERs:", predict(text, model=model, tokenizer=tokenizer))
```
|
```sh
Input: Барух Спиноза е роден в Амстердам
NERs: [{'word': 'Барух Спиноза', 'entity_group': 'PER'}, {'word': 'Амстердам', 'entity_group': 'LOC'}]
```
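Alternatively, the `pipeline` API imported earlier can produce similar grouped output without the helper functions. This is a sketch, assuming the checkpoint's config carries the `id2label` mapping; if it only exposes generic `LABEL_*` names, you would still need the manual mapping from `predict()`:

```python
ner = pipeline(
    "token-classification",
    model=MODEL_ID,
    aggregation_strategy="simple",  # merge B-/I- subword predictions into spans
)
print(ner("Барух Спиноза е роден в Амстердам"))
```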
|
|
|
Note: The model predicts three entity types: `PER` (person), `ORG` (organization), and `LOC` (location).
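Since `predict()` returns a list of dicts, filtering for one entity group is a one-liner; for the example sentence above:

```python
entities = predict(text, model=model, tokenizer=tokenizer)
locations = [e["word"] for e in entities if e["entity_group"] == "LOC"]
print(locations)  # ['Амстердам']
```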