Molmo-4bit / app.py
zamal's picture
Update app.py
59219bc verified
import gradio as gr
from transformers import AutoModelForCausalLM, AutoProcessor, GenerationConfig
from PIL import Image
import requests
from io import BytesIO
import spaces # Import spaces for ZeroGPU support
# Checkpoint and loader settings shared by the processor and the model.
repo_name = "cyan2k/molmo-7B-O-bnb-4bit"

# Keyword arguments forwarded to every `from_pretrained` call in this file.
arguments = dict(
    device_map="auto",       # device placement is chosen automatically
    torch_dtype="auto",      # precision is chosen automatically
    trust_remote_code=True,  # permit the custom code shipped in the repo
)

# The processor is built once at import time; it does not need the GPU yet.
processor = AutoProcessor.from_pretrained(repo_name, **arguments)
# Define the function for image description
@spaces.GPU  # This ensures the function gets GPU access when needed
def describe_image(image, question):
    """Answer a question about ``image`` (or describe it) with the Molmo model.

    Args:
        image: Image handed over by the Gradio image component, or ``None``
            when the user has not uploaded anything.
        question: Optional user prompt. When it is empty or contains only
            whitespace, a default "describe in great detail" prompt is used.

    Returns:
        The decoded text generated by the model, excluding the prompt tokens.

    Raises:
        gr.Error: If no image was provided.
    """
    if image is None:
        # Show a readable message in the UI instead of failing deep inside
        # the processor with an opaque error.
        raise gr.Error("Please upload an image first.")

    # A non-blank question is passed through unchanged; anything else falls
    # back to the default prompt.
    if question and question.strip():
        prompt = question
    else:
        prompt = "Describe this image in great detail without missing any piece of information"

    # The model is loaded inside the GPU-decorated function. `device_map="auto"`
    # already places the weights, so no extra `.to('cuda')` is issued: that call
    # is redundant here and is rejected for bitsandbytes-quantized models by
    # some transformers versions.
    model = AutoModelForCausalLM.from_pretrained(repo_name, **arguments)

    # Process the uploaded image along with the prompt
    inputs = processor.process(images=[image], text=prompt)

    # Move every tensor to the model's device and add a leading batch dimension
    inputs = {k: v.to(model.device).unsqueeze(0) for k, v in inputs.items()}

    # Generate output using the model
    output = model.generate_from_batch(
        inputs,
        GenerationConfig(max_new_tokens=1024, stop_strings="<|endoftext|>"),
        tokenizer=processor.tokenizer,
    )

    # Keep only the tokens that follow the prompt, then decode them
    generated_tokens = output[0, inputs["input_ids"].size(1):]
    return processor.tokenizer.decode(generated_tokens, skip_special_tokens=True)
# Gradio interface
def gradio_app():
    """Assemble the Blocks UI, connect the button to `describe_image`, and launch it."""
    header_markdown = "# Long Image Description with Molmo-7B 4 bit quantized\n### Note: This model size has been reduced by six times without much of loss in Performance.\n### Upload an image and ask a question about it!"
    question_hint = "Ask a question about the image (e.g., 'What is happening in this image?')"

    # NOTE(review): component nesting below follows the conventional layout
    # (inputs side by side, output and button underneath) — confirm against
    # the deployed app.
    with gr.Blocks() as demo:
        gr.Markdown(header_markdown)

        # Inputs: the picture and an optional question
        with gr.Row():
            image_input = gr.Image(type="pil", label="Upload an Image")
            question_input = gr.Textbox(
                placeholder=question_hint,
                label="Question (Optional)",
            )

        # Read-only box that receives the generated text
        output_text = gr.Textbox(label="Image Description", interactive=False)

        # Clicking the button runs the model on the current inputs
        submit_btn = gr.Button("Generate Description")
        click_inputs = [image_input, question_input]
        submit_btn.click(fn=describe_image, inputs=click_inputs, outputs=output_text)

    # Start serving the interface
    demo.launch()
# Launch the Gradio app only when this file is run as a script, so that
# importing the module does not start a server as a side effect.
if __name__ == "__main__":
    gradio_app()