# Hugging Face Spaces page header (scraped text): "Spaces: Running on Zero".
# This app is intended to run on ZeroGPU hardware.
# NOTE(review): stripped the trailing " | |" table-extraction artifacts that
# made every import line a syntax error.
import gradio as gr
from transformers import AutoModelForCausalLM, AutoProcessor, GenerationConfig
from PIL import Image  # image type produced by the gr.Image component
import requests  # NOTE(review): unused in this file — confirm before removing
from io import BytesIO  # NOTE(review): unused in this file — confirm before removing
import spaces  # Import spaces for ZeroGPU support (@spaces.GPU decorator)
# Model repository: a 4-bit (bitsandbytes) quantization of Molmo-7B-O.
repo_name = "cyan2k/molmo-7B-O-bnb-4bit"

# Shared kwargs passed to both the processor and the model loaders.
arguments = {
    "device_map": "auto",       # let the loader place weights automatically
    "torch_dtype": "auto",      # pick precision from the checkpoint
    "trust_remote_code": True,  # Molmo ships custom modeling/processing code
}

# The processor (tokenizer + image preprocessing) is cheap enough to load at
# import time; the heavy model is loaded inside describe_image instead, so no
# GPU is needed here.
processor = AutoProcessor.from_pretrained(repo_name, **arguments)
# Lazily-created model handle, shared across requests so the multi-GB
# quantized checkpoint is not reloaded on every button click (the original
# called from_pretrained inside describe_image on each request).
_model = None


def _get_model():
    """Load the Molmo model on first use, move it to the GPU, and cache it."""
    global _model
    if _model is None:
        _model = AutoModelForCausalLM.from_pretrained(repo_name, **arguments).to('cuda')
    return _model


# NOTE(review): the original comment ("This ensures the function gets GPU
# access when needed") suggests an @spaces.GPU decorator was lost when the
# file was pasted — confirm and restore it for ZeroGPU Spaces.
def describe_image(image, question):
    """Generate a textual description of *image*, optionally guided by *question*.

    Args:
        image: a PIL.Image supplied by the Gradio image component.
        question: optional user prompt; when empty/falsy, a generic
            "describe everything" instruction is used instead.

    Returns:
        The generated description as a plain string (special tokens removed).
    """
    model = _get_model()

    # Tokenize the prompt and preprocess the image together.
    inputs = processor.process(
        images=[image],
        text=question if question else "Describe this image in great detail without missing any piece of information"
    )

    # Add a batch dimension and move every tensor to the GPU.
    inputs = {k: v.to('cuda').unsqueeze(0) for k, v in inputs.items()}

    # Generate up to 1024 new tokens, stopping at the end-of-text marker.
    output = model.generate_from_batch(
        inputs,
        GenerationConfig(max_new_tokens=1024, stop_strings="<|endoftext|>"),
        tokenizer=processor.tokenizer,
    )

    # Drop the prompt tokens; decode only the newly generated continuation.
    generated_tokens = output[0, inputs["input_ids"].size(1):]
    generated_text = processor.tokenizer.decode(generated_tokens, skip_special_tokens=True)
    return generated_text
# Gradio interface
def gradio_app():
    """Build and launch the Gradio UI for the image-description demo.

    Blocks until the server is stopped (demo.launch() is blocking).
    """
    with gr.Blocks() as demo:
        gr.Markdown("# Long Image Description with Molmo-7B 4 bit quantized\n### Note: This model size has been reduced by six times without much of loss in Performance.\n### Upload an image and ask a question about it!")

        with gr.Row():
            image_input = gr.Image(type="pil", label="Upload an Image")
            question_input = gr.Textbox(placeholder="Ask a question about the image (e.g., 'What is happening in this image?')", label="Question (Optional)")

        # Read-only box that receives the generated description.
        output_text = gr.Textbox(label="Image Description", interactive=False)

        # Submit button to generate the description.
        submit_btn = gr.Button("Generate Description")

        # Wire the button to the inference function.
        submit_btn.click(
            fn=describe_image,
            inputs=[image_input, question_input],
            outputs=output_text
        )

    # Launch the Gradio interface (blocking).
    demo.launch()
# Launch the Gradio app only when this file is executed as a script, not when
# it is imported (e.g. by tests or tooling).
if __name__ == "__main__":
    gradio_app()