---
language: ru
tags:
- russian
- text-to-text
- PyTorch
- Transformers
license: apache-2.0
widget:
- text: <LM>Водка "Русская валюта" премиум люкс 38% 0,25л, Россия
pipeline_tag: text2text-generation
---
This is a named entity recognizer for extracting goods and brands from receipts issued by fiscal data operators in Russian.
It was developed for a special multi-stage competition devoted to receipt structurization. The competition was organized by the [Open Data Science community](https://ods.ai) and [Alfa-Bank](https://alfabank.ru), and consisted of [the first](https://ods.ai/competitions/nlp-receipts), [the second](https://ods.ai/competitions/alfabank-nlp-receipts-2) and [the final](https://ods.ai/competitions/alfabank-nlp-receipts-final) stages. Nevertheless, the model can be used for parsing and structuring any receipts in Russian. The repository with code for fine-tuning and inference is available on [gitflic.ru](https://gitflic.ru/project/bond005/ods-ner-2023).
Usage example:
```python
from typing import Tuple

import torch
from transformers import T5ForConditionalGeneration, GPT2Tokenizer

MODEL_NAME = 'bond005/FRED-T5-large-ods-ner-2023'
START_TAG = '<LM>'
END_TAG = '</s>'


def initialize_recognizer(model_path: str) -> Tuple[GPT2Tokenizer, T5ForConditionalGeneration]:
    # Load the model and its tokenizer; a CUDA device is required.
    model = T5ForConditionalGeneration.from_pretrained(model_path)
    if not torch.cuda.is_available():
        raise ValueError('CUDA is not available!')
    model = model.cuda()
    model.eval()
    tokenizer = GPT2Tokenizer.from_pretrained(model_path)
    return tokenizer, model


def recognize(text: str, tokenizer: GPT2Tokenizer,
              model: T5ForConditionalGeneration) -> Tuple[str, str]:
    # Prepend the <LM> prompt prefix if it is not already present.
    if text.startswith(START_TAG):
        x = tokenizer(text, return_tensors='pt', padding=True).to(model.device)
    else:
        x = tokenizer(START_TAG + text, return_tensors='pt', padding=True).to(model.device)
    out = model.generate(**x)
    predictions = tokenizer.decode(out[0], skip_special_tokens=True).strip()
    while predictions.endswith(END_TAG):
        predictions = predictions[:-len(END_TAG)].strip()
    # The model generates "goods; brands"; split it into the two fields.
    prediction_pair = predictions.split(';')
    if len(prediction_pair) == 0:
        goods = ''
        brands = ''
    elif len(prediction_pair) == 1:
        goods = prediction_pair[0].strip()
        brands = ''
    else:
        goods = prediction_pair[0].strip()
        brands = prediction_pair[1].strip()
    return goods, brands


recognizer = initialize_recognizer(MODEL_NAME)
goods_and_brands = recognize(text='Водка "Русская валюта" премиум люкс 38% 0,25л, Россия',
                             tokenizer=recognizer[0], model=recognizer[1])
print(f'GOODS: {goods_and_brands[0]}')
# водка
print(f'BRANDS: {goods_and_brands[1]}')
# русская валюта
```
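Since the card declares `pipeline_tag: text2text-generation`, the model can also be driven through the high-level Transformers pipeline. The sketch below is not part of the original recipe: it assumes that `AutoTokenizer` correctly resolves the repository's GPT2-style tokenizer, and the raw generation still has to be split into the goods and brands parts manually, just as in `recognize` above.
```python
# A minimal sketch (an assumption, not the author's reference code):
# the same extraction via the transformers pipeline API.
from transformers import pipeline

recognizer_pipe = pipeline(
    'text2text-generation',
    model='bond005/FRED-T5-large-ods-ner-2023',
    device=0  # first CUDA device; use device=-1 to stay on the CPU
)
raw = recognizer_pipe('<LM>Водка "Русская валюта" премиум люкс 38% 0,25л, Россия')
generated = raw[0]['generated_text'].strip()
# The model outputs "goods; brands", so split it into the two fields.
parts = [part.strip() for part in generated.split(';')]
goods = parts[0] if len(parts) > 0 else ''
brands = parts[1] if len(parts) > 1 else ''
print(f'GOODS: {goods}')
print(f'BRANDS: {brands}')
```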