import os
import gradio as gr
import warnings

warnings.filterwarnings("ignore")

# Build and install the maskrcnn_benchmark extensions in place before importing them
os.system("python setup.py build develop --user")

from maskrcnn_benchmark.config import cfg
from maskrcnn_benchmark.engine.predictor_glip import GLIPDemo
import vqa

# Use this config and checkpoint to evaluate the GLIP-T model
config_file = "configs/glip_Swin_T_O365_GoldG.yaml"
weight_file = "checkpoints/glip_tiny_model_o365_goldg_cc_sbu.pth"

# Manually override some config options
cfg.local_rank = 0
cfg.num_gpus = 1
cfg.merge_from_file(config_file)
cfg.merge_from_list(["MODEL.WEIGHT", weight_file])
cfg.merge_from_list(["MODEL.DEVICE", "cuda"])

glip_demo = GLIPDemo(
    cfg,
    min_image_size=800,
    confidence_threshold=0.7,
    show_mask_heatmaps=False
)
blip_demo = vqa.VQA(
    model_path='checkpoints/model_base_vqa_capfilt_large.pth'
)
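
# Both checkpoint files are assumed to already exist under checkpoints/
# (e.g. fetched during the Space/build step) before this script starts.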

def predict(image, objects, question):
    # GLIP expects BGR input, while Gradio supplies RGB, so swap the channels;
    # the visualization comes back in BGR and is swapped back for display.
    result, _ = glip_demo.run_on_web_image(image[:, :, [2, 1, 0]], objects, 0.5)
    answer = blip_demo.vqa_demo(image, question)
    return result[:, :, [2, 1, 0]], answer
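
# A minimal sketch of calling predict() directly, outside Gradio, assuming an
# RGB numpy image loaded with Pillow (file name and prompts are illustrative):
#
#   import numpy as np
#   from PIL import Image
#
#   img = np.array(Image.open("demo.jpg").convert("RGB"))
#   grounding, answer = predict(img, "person. dog.", "what is the dog doing?")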

gr.Interface(
    description="GLIP + BLIP VQA Demo.",
    fn=predict,
    inputs=[
        gr.Image(label="Image"),
        gr.Textbox(label="Objects", lines=1, placeholder="Objects here.."),
        gr.Textbox(label="Question", lines=1, placeholder="Question here.."),
    ],
    outputs=[
        gr.Image(type="pil", label="Grounding results"),
        gr.Textbox(label="Answer"),
    ],
).launch()