Spaces:

yasserrmd
/

MolmoVision

Running

App Files Files Community

MolmoVision / app.py

yasserrmd

Update app.py

5c83476 verified 15 days ago

raw

history blame contribute delete

1.91 kB

	#import spaces
	import gradio as gr
	from transformers import AutoModelForCausalLM, AutoProcessor, GenerationConfig
	from PIL import Image
	import torch
	import requests

	# Checkpoint id shared by the processor and the model, declared once so the
	# two loads below always refer to the same repository.
	MODEL_ID = 'allenai/Molmo-7B-D-0924'

	# Options forwarded unchanged to both from_pretrained calls.
	# trust_remote_code=True is required because this checkpoint ships its own
	# modelling/processing code; 'auto' leaves dtype and device placement to
	# the library.
	_PRETRAINED_OPTIONS = dict(
	    trust_remote_code=True,
	    torch_dtype='auto',
	    device_map='auto',
	)

	# Processor: turns an image + text prompt into model inputs and exposes the
	# tokenizer used for decoding.
	processor = AutoProcessor.from_pretrained(MODEL_ID, **_PRETRAINED_OPTIONS)

	# Model: the causal LM used for generation.
	model = AutoModelForCausalLM.from_pretrained(MODEL_ID, **_PRETRAINED_OPTIONS)

	#@spaces.GPU
	def describe_image(image, prompt):
	    """Generate text about an image with Molmo, guided by a text prompt.

	    Args:
	        image: The image to describe. In this app it comes from a
	            ``gr.Image(type="pil")`` component, which hands over ``None``
	            when the image field is empty.
	        prompt: Instruction text given to the processor together with the
	            image.

	    Returns:
	        The decoded text of the newly generated tokens only (the prompt
	        tokens are sliced off), with special tokens skipped.

	    Raises:
	        gr.Error: If no image was provided.
	    """
	    # Fail early with a message Gradio can display, instead of passing None
	    # into the processor and surfacing an opaque traceback.
	    if image is None:
	        raise gr.Error("Please upload an image before generating a description.")

	    # Process the image with the user-provided text prompt.
	    features = processor.process(images=[image], text=prompt)

	    # Move every tensor to the model's device and add a leading batch
	    # dimension (batch of size 1).
	    batch = {
	        name: tensor.to(model.device).unsqueeze(0)
	        for name, tensor in features.items()
	    }

	    # Cap generation at 200 new tokens. The stop string must be the literal
	    # end-of-text marker: backslashes before the pipes would be kept in the
	    # Python string and the marker would never match.
	    generation_config = GenerationConfig(
	        max_new_tokens=200,
	        stop_strings="<|endoftext|>",
	    )
	    output = model.generate_from_batch(
	        batch,
	        generation_config,
	        tokenizer=processor.tokenizer,
	    )

	    # The output sequence starts with the prompt tokens; keep only what was
	    # generated after them.
	    prompt_length = batch['input_ids'].size(1)
	    generated_tokens = output[0, prompt_length:]

	    return processor.tokenizer.decode(generated_tokens, skip_special_tokens=True)

	# Gradio Blocks UI: an image upload and a prompt box as inputs, a text box
	# for the generated description, and a button that triggers generation.
	with gr.Blocks() as demo:
	    gr.Markdown("# Visual Language Model - Molmo")

	    # NOTE(review): the original indentation was lost; the row is assumed
	    # to hold the two input widgets, with the output box and the button
	    # below it. Confirm against the deployed layout.
	    with gr.Row():
	        image_input = gr.Image(label="Upload an image", type="pil")
	        text_input = gr.Textbox(
	            placeholder="Describe this image...",
	            label="Enter a prompt",
	        )

	    output_text = gr.Textbox(label="Generated Description")
	    submit_button = gr.Button("Generate Description")

	    # On click, call describe_image with (image, prompt) and show the
	    # returned string in the output box.
	    click_inputs = [image_input, text_input]
	    submit_button.click(
	        fn=describe_image,
	        inputs=click_inputs,
	        outputs=output_text,
	    )

	# Start serving the app.
	demo.launch()