File size: 3,770 Bytes
cb4c41f f50ef7b dea123e 1592bb3 93fe32c cb4c41f 789ac0d cb4c41f 6b45627 5c58f13 6b45627 f50ef7b 21ab7aa 1592bb3 9485a97 1592bb3 4fa7e16 dea123e cb4c41f a1a0ef2 cb4c41f 2f8b3bf f25b610 2f8b3bf cb4c41f 2f8b3bf cb4c41f 7a20e1a cb4c41f 239acc5 cb4c41f |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 |
import json
import logging
import os
import re
from io import BytesIO

import gradio as gr
import requests
import torch
from PIL import Image
from transformers import DonutProcessor, VisionEncoderDecoderModel
# Run on the GPU when torch reports one, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"

# The processor and the model weights are both loaded from the same local
# checkpoint directory.
processor = DonutProcessor.from_pretrained("./donut-base-finetuned-inv")
model = VisionEncoderDecoderModel.from_pretrained("./donut-base-finetuned-inv")
model.to(device)
def _notify_telegram(image):
    """Best-effort: send a JPEG copy of the uploaded image to a Telegram chat.

    The bot token and chat id are read from the ``TELEGRAM_BOT_TOKEN`` and
    ``TELEGRAM_CHANNEL_ID`` environment variables. When either is missing the
    function does nothing. Any failure is logged and swallowed so that a
    notification problem can never break document processing.

    NOTE(review): this forwards user-uploaded documents to a third party;
    confirm that users are informed about it.

    Args:
        image: array that ``PIL.Image.fromarray`` can convert to an image.
    """
    token = os.getenv('TELEGRAM_BOT_TOKEN')
    chat_id = os.getenv('TELEGRAM_CHANNEL_ID')
    if not token or not chat_id:
        return

    log = logging.getLogger(__name__)
    try:
        # The upload is never written to disk; it is encoded in memory.
        bio = BytesIO()
        bio.name = 'image.jpeg'
        # JPEG has no alpha channel, so normalise the mode before encoding.
        Image.fromarray(image).convert('RGB').save(bio, 'JPEG')
        bio.seek(0)
        # sendPhoto takes chat_id/caption as plain form fields next to the
        # uploaded "photo" file part.
        response = requests.post(
            f'https://api.telegram.org/bot{token}/sendPhoto',
            data={'chat_id': chat_id, 'caption': 'New doc is being tried out:'},
            files={'photo': bio},
            timeout=10,
        )
    except (requests.RequestException, OSError, TypeError, ValueError) as exc:
        # Only the exception type is logged: the message of a requests error
        # can contain the request URL, which embeds the bot token.
        log.warning("Telegram notification failed: %s", type(exc).__name__)
        return

    if not response.ok:
        log.warning("Telegram notification rejected with HTTP %s", response.status_code)


def process_document(image):
    """Run the Donut model on an uploaded document image and return the parse.

    Args:
        image: the uploaded image as an array (it is handed to both
            ``PIL.Image.fromarray`` and the Donut processor).

    Returns:
        The value produced by ``processor.token2json`` for the generated
        token sequence.

    Raises:
        ValueError: if ``image`` is ``None`` (nothing was uploaded).
    """
    if image is None:
        raise ValueError("No image was provided; please upload a document image.")

    # Send the notification first; it is best-effort and never raises.
    _notify_telegram(image)

    # prepare encoder inputs
    pixel_values = processor(image, return_tensors="pt").pixel_values

    # prepare decoder inputs
    task_prompt = "<s_cord-v2>"
    decoder_input_ids = processor.tokenizer(
        task_prompt, add_special_tokens=False, return_tensors="pt"
    ).input_ids

    # generate answer
    tokenizer = processor.tokenizer
    outputs = model.generate(
        pixel_values.to(device),
        decoder_input_ids=decoder_input_ids.to(device),
        max_length=model.decoder.config.max_position_embeddings,
        early_stopping=True,
        pad_token_id=tokenizer.pad_token_id,
        eos_token_id=tokenizer.eos_token_id,
        use_cache=True,
        num_beams=1,
        # never let the decoder emit the unknown token
        bad_words_ids=[[tokenizer.unk_token_id]],
        return_dict_in_generate=True,
    )

    # postprocess: drop EOS/PAD markers, then the leading task start token
    sequence = processor.batch_decode(outputs.sequences)[0]
    sequence = sequence.replace(tokenizer.eos_token, "").replace(tokenizer.pad_token, "")
    sequence = re.sub(r"<.*?>", "", sequence, count=1).strip()  # remove first task start token
    return processor.token2json(sequence)
# Invoice header fields listed in the demo description, in display order.
_EXTRACTED_FIELDS = (
    "DocType",
    "Currency",
    "DocumentDate",
    "GrossAmount",
    "InvoiceNumber",
    "NetAmount",
    "TaxAmount",
    "OrderNumber",
    "CreditorCountry",
)

# One <li> per field, each with exactly one opening and one closing <span>.
_field_items = "".join(
    f'<li><span style="color:black">{field}</span></li>' for field in _EXTRACTED_FIELDS
)

# HTML shown above the demo. Built from double-quoted pieces so the literal
# single quotes around 'submit' cannot terminate the string.
description = (
    "<p>Using Donut model finetuned on Invoices for retrieval of following information:</p>"
    f"<ul>{_field_items}</ul>"
    "<p>To use it, simply upload your image and click 'submit', or click one of the "
    "examples to load them. Read more at the links below.</p>"
    "<p> </p>"
    "<p>(because this is running on the free cpu tier, it will take about 40 secs "
    "before you see a result)</p>"
    "<p>Have fun 😎</p>"
    "<p>Toon Beerten</p>"
)

# HTML shown below the demo: links to the Donut paper and reference repository.
article = "<p style='text-align: center'><a href='https://arxiv.org/abs/2111.15664' target='_blank'>Donut: OCR-free Document Understanding Transformer</a> | <a href='https://github.com/clovaai/donut' target='_blank'>Github Repo</a></p>"
# Image input widget, displayed 800 units tall.
# NOTE(review): `.style()` and `enable_queue=` look specific to the Gradio
# release this app was written for -- confirm the pinned version before upgrading.
gr_image = gr.Image().style(height=800)

# Single-function interface: an uploaded image goes to process_document and
# the returned value is rendered as JSON.
demo = gr.Interface(
    fn=process_document,
    inputs=gr_image,
    outputs="json",
    title="Demo: Donut 🍩 for invoice header retrieval",
    description=description,
    article=article,
    enable_queue=True,
    examples=[["example.jpg"], ["example_2.jpg"], ["example_3.jpg"]],
    cache_examples=False,
)

# Start the web server for the demo.
demo.launch()