invoice_document_headers_extraction_with_donut

Running

invoice_document_headers_extraction_with_donut

File size: 3,770 Bytes

cb4c41f
 
 
 
 
f50ef7b
dea123e
1592bb3
93fe32c
 
 
cb4c41f
789ac0d
 
cb4c41f
 
 
 
 
6b45627
5c58f13
6b45627
f50ef7b
21ab7aa
 
 
1592bb3
 
9485a97
1592bb3
4fa7e16
 
 
dea123e
cb4c41f
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
a1a0ef2
cb4c41f
 
2f8b3bf
f25b610
2f8b3bf
cb4c41f
 
2f8b3bf
cb4c41f
7a20e1a
cb4c41f
 
 
239acc5
cb4c41f

import re
import gradio as gr

import torch
from transformers import DonutProcessor, VisionEncoderDecoderModel
from PIL import Image
import requests
from io import BytesIO
import json
import os


# Local directory holding the fine-tuned Donut checkpoint; the processor and
# the encoder-decoder model are both loaded from it.
_CHECKPOINT_DIR = "./donut-base-finetuned-inv"

processor = DonutProcessor.from_pretrained(_CHECKPOINT_DIR)
model = VisionEncoderDecoderModel.from_pretrained(_CHECKPOINT_DIR)

# Run on the GPU when one is visible to torch, otherwise stay on the CPU.
device = "cpu"
if torch.cuda.is_available():
    device = "cuda"
model.to(device)

def _notify_telegram(image):
    """Best-effort: post the uploaded image to the configured Telegram chat.

    Reads ``TELEGRAM_BOT_TOKEN`` and ``TELEGRAM_CHANNEL_ID`` from the
    environment. When either is missing nothing is sent. Network or HTTP
    failures are reported on stdout and never propagate, so a broken
    notification cannot break document extraction.

    Args:
        image: the uploaded picture as a numpy array.
    """
    token = os.getenv('TELEGRAM_BOT_TOKEN')
    chat_id = os.getenv('TELEGRAM_CHANNEL_ID')
    if not token or not chat_id:
        # Notifications are optional; without credentials there is nothing to do.
        return

    # can't save uploaded file locally, so convert the nparray to PIL and
    # serialise it into an in-memory buffer instead
    photo = Image.fromarray(image)
    if photo.mode != 'RGB':
        # JPEG cannot store e.g. an alpha channel; saving would raise otherwise.
        photo = photo.convert('RGB')
    bio = BytesIO()
    bio.name = 'image.jpeg'
    photo.save(bio, 'JPEG')
    bio.seek(0)

    url = f'https://api.telegram.org/bot{token}/sendPhoto'
    # sendPhoto takes the caption as a plain form field next to the photo.
    data = {'chat_id': chat_id, 'caption': 'New doc is being tried out:'}
    try:
        # The timeout keeps a stalled Telegram call from blocking inference forever.
        response = requests.post(url, data=data, files={'photo': bio}, timeout=10)
        response.raise_for_status()
    except requests.RequestException as exc:
        # Only the exception type is printed: the message may embed the
        # request URL, which contains the bot token.
        print(f'Telegram notification failed: {type(exc).__name__}')


def process_document(image):
    """Extract invoice header fields from an uploaded document image.

    Sends a best-effort Telegram notification containing the image, then runs
    the Donut model on it and converts the generated token sequence to JSON.

    Args:
        image: the uploaded document as a numpy array (as handed over by the
            Gradio image input).

    Returns:
        The structure produced by ``processor.token2json`` for the decoded
        model output.

    Raises:
        ValueError: if no image was provided.
    """
    if image is None:
        raise ValueError("No image provided: upload a document image first.")

    # send notification through telegram (never fatal for the extraction)
    _notify_telegram(image)

    # prepare encoder inputs
    pixel_values = processor(image, return_tensors="pt").pixel_values

    # prepare decoder inputs
    # NOTE(review): the prompt is the CORD-v2 task start token; presumably the
    # fine-tuned checkpoint kept it - confirm against the training setup.
    task_prompt = "<s_cord-v2>"
    decoder_input_ids = processor.tokenizer(task_prompt, add_special_tokens=False, return_tensors="pt").input_ids

    # generate answer
    outputs = model.generate(
        pixel_values.to(device),
        decoder_input_ids=decoder_input_ids.to(device),
        max_length=model.decoder.config.max_position_embeddings,
        early_stopping=True,
        pad_token_id=processor.tokenizer.pad_token_id,
        eos_token_id=processor.tokenizer.eos_token_id,
        use_cache=True,
        num_beams=1,
        # forbid the unknown token in the generated sequence
        bad_words_ids=[[processor.tokenizer.unk_token_id]],
        return_dict_in_generate=True,
    )

    # postprocess: drop EOS/PAD markers, then the leading task start token
    sequence = processor.batch_decode(outputs.sequences)[0]
    sequence = sequence.replace(processor.tokenizer.eos_token, "").replace(processor.tokenizer.pad_token, "")
    sequence = re.sub(r"<.*?>", "", sequence, count=1).strip()  # remove first task start token

    return processor.token2json(sequence)

# Invoice header fields listed in the description shown above the demo.
_INVOICE_FIELDS = (
    "DocType",
    "Currency",
    "DocumentDate",
    "GrossAmount",
    "InvoiceNumber",
    "NetAmount",
    "TaxAmount",
    "OrderNumber",
    "CreditorCountry",
)

# One <li> per field. Each item has exactly one opening and one closing
# <span>; the previous hand-written markup closed every span twice.
_FIELD_ITEMS_HTML = "".join(
    f'<li><span style="color:black">{field}</span></li>' for field in _INVOICE_FIELDS
)

# HTML blurb rendered above the demo.
description = "".join([
    "<p>Using Donut model finetuned on Invoices for retrieval of following information:</p>",
    "<ul>",
    _FIELD_ITEMS_HTML,
    "</ul>",
    "<p>To use it, simply upload your image and click &#39;submit&#39;, or click one of the examples to load them. Read more at the links below.</p>",
    "<p>&nbsp;</p>",
    "<p>(because this is running on the free cpu tier, it will take about 40 secs before you see a result)</p>",
    "<p>Have fun&nbsp;😎</p>",
    "<p>Toon Beerten</p>",
])

# HTML footer rendered below the demo: links to the Donut paper and repository.
article = (
    "<p style='text-align: center'>"
    "<a href='https://arxiv.org/abs/2111.15664' target='_blank'>Donut: OCR-free Document Understanding Transformer</a>"
    " | "
    "<a href='https://github.com/clovaai/donut' target='_blank'>Github Repo</a>"
    "</p>"
)


# Image input component for the uploaded document, styled with height 800.
gr_image = gr.Image().style(height=800)

# Web UI: one image in, the extracted header fields out as JSON.
demo = gr.Interface(
    title="Demo: Donut 🍩 for invoice header retrieval",
    description=description,
    article=article,
    fn=process_document,
    inputs=gr_image,
    outputs="json",
    # Each example is a single-element row holding the image file name.
    examples=[[name] for name in ("example.jpg", "example_2.jpg", "example_3.jpg")],
    # Example outputs are computed on demand rather than precomputed.
    cache_examples=False,
    enable_queue=True,
)

# Start the app as soon as the module is executed.
demo.launch()