Spaces:

yasserrmd
/

MolmoVision

Running

App Files Files Community

yasserrmd committed on Sep 27

Commit

cf2a851

•

1 Parent(s): f22a6e1

Create app.py

Browse files

Files changed (1) hide show

app.py +51 -0

app.py ADDED Viewed

	@@ -0,0 +1,51 @@

+import spaces
+import gradio as gr
+from transformers import AutoModelForCausalLM, AutoProcessor, GenerationConfig
+from PIL import Image
+import torch
+import requests
+# Load the processor and model
+processor = AutoProcessor.from_pretrained(
+    'allenai/Molmo-7B-D-0924',
+    trust_remote_code=True,
+    torch_dtype='auto',
+    device_map='auto'
+)
+model = AutoModelForCausalLM.from_pretrained(
+    'allenai/Molmo-7B-D-0924',
+    trust_remote_code=True,
+    torch_dtype='auto',
+    device_map='auto'
+)
+@spaces.GPU
+def describe_image(image):
+    # Process the image
+    inputs = processor.process(images=[image], text="Describe this image.")
+    # Move inputs to the correct device and make a batch of size 1
+    inputs = {k: v.to(model.device).unsqueeze(0) for k, v in inputs.items()}
+    # Generate output with maximum 200 new tokens
+    output = model.generate_from_batch(
+        inputs,
+        GenerationConfig(max_new_tokens=200, stop_strings="<|endoftext|>"),
+        tokenizer=processor.tokenizer
+    )
+    # Decode and return generated text
+    generated_tokens = output[0, inputs['input_ids'].size(1):]
+    generated_text = processor.tokenizer.decode(generated_tokens, skip_special_tokens=True)
+    return generated_text
+# Gradio interface
+gr.Interface(
+    fn=describe_image,
+    inputs=gr.inputs.Image(type="pil"),
+    outputs="text",
+    title="Visual Language Model - Molmo",
+    description="Upload an image, and the model will generate a detailed description of it."
+).launch()