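# app.py -- Streamlit demo for the flax-community/vit-gpt2 French image-captioning Space.
# Run locally with `streamlit run app.py`; it assumes the Space's `model.py` (not shown here)
# sits next to this file and provides the model, sample images, and translator used below.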
import streamlit as st
from PIL import Image
import numpy as np
# Designing the interface
st.title("🖼️ French Image Captioning Demo 📝")
st.write("[Yih-Dar SHIEH](https://huggingface.co/ydshieh)")
st.sidebar.markdown(
    """
An image captioning model [ViT-GPT2](https://huggingface.co/flax-community/vit-gpt2), built by combining the ViT model and a French GPT2 model.

[Part of the [Huggingface JAX/Flax event](https://discuss.huggingface.co/t/open-to-the-community-community-week-using-jax-flax-for-nlp-cv/).]

The GPT2 model source code is modified so it can accept an encoder's output.
The pretrained weights of both models are loaded, with a set of randomly initialized cross-attention weights.
The model is trained on 65000 images from the COCO dataset for about 1500 steps (batch_size=256), with the original English captions translated to French for training purposes.
"""
)
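# Rough sketch (commented out, for readers only) of how such a ViT encoder + GPT2 decoder
# can be wired together with today's `transformers` API. The Space itself uses its own
# modified Flax GPT2 code inside model.py, so the class and checkpoint names below are
# assumptions, not what this app actually loads:
#
#   from transformers import FlaxVisionEncoderDecoderModel
#   model = FlaxVisionEncoderDecoderModel.from_encoder_decoder_pretrained(
#       "google/vit-base-patch16-224-in21k",  # pretrained ViT encoder
#       "<french-gpt2-checkpoint>",           # pretrained French GPT2 decoder (placeholder name)
#   )
#   # The encoder and decoder keep their pretrained weights; the cross-attention layers
#   # connecting them start from random initialization and are learned during fine-tuning.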
#image = Image.open('samples/val_000000039769.jpg')
#show = st.image(image, use_column_width=True)
#show.image(image, 'Preloaded Image', use_column_width=True)
with st.spinner('Loading and compiling ViT-GPT2 model ...'):
    # Deferred import: model.py loads and compiles the JAX/Flax model, and is assumed to
    # also expose `os`, `sample_dir`, `sample_fns`, `predict` and `translator` used below.
    from model import *
    # st.sidebar.write(f'Vit-GPT2 model loaded :)')
st.sidebar.title("Select a sample image")
sample_name = st.sidebar.selectbox(
    "Please choose an image",
    sample_fns
)
sample_name = f"COCO_val2014_{sample_name.replace('.jpg', '').zfill(12)}.jpg"
sample_path = os.path.join(sample_dir, sample_name)
image = Image.open(sample_path)
show = st.image(image, width=480)
show.image(image, '\n\nSelected Image', width=480)
# For newline
st.sidebar.write('\n')
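# `predict` comes from model.py; a minimal sketch of what it presumably does, using an
# encoder-decoder `model`, `feature_extractor` and `tokenizer` (all hypothetical names here):
#
#   def predict(image):
#       pixel_values = feature_extractor(images=image, return_tensors="np").pixel_values
#       output_ids = model.generate(pixel_values, max_length=64, num_beams=4).sequences
#       return tokenizer.batch_decode(output_ids, skip_special_tokens=True)[0].strip()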
with st.spinner('Generating image caption ...'):
    caption = predict(image)
    # Translate the French caption to English (translator is assumed to be a
    # googletrans-style Translator exposed by model.py).
    caption_en = translator.translate(caption, src='fr', dest='en').text

st.header(f'**Prediction (in French)**: {caption}')
st.header(f'**English Translation**: {caption_en}')

st.sidebar.header("ViT-GPT2 predicts:")
st.sidebar.write(f"**French**: {caption}")
st.sidebar.write(f"**English Translation**: {caption_en}")

image.close()