import requests
import torch
from PIL import Image
from transformers import MllamaForConditionalGeneration, AutoProcessor
import gradio as gr
import spaces


class VisionInstructChat:
    """Gradio chat front end for a Llama 3.2 Vision-Instruct checkpoint.

    The model and its processor are loaded once in ``__init__``; every chat
    turn sends a single image plus a single text prompt (earlier turns are
    shown in the UI but are not fed back to the model).
    """

    # Message shown in the chat when the image or the prompt is missing.
    MISSING_INPUT_MESSAGE = "Please upload an image and enter a prompt."

    def __init__(self):
        """Load the model weights and the matching processor."""
        self.model_id = "meta-llama/Llama-3.2-90B-Vision-Instruct"
        self.model = MllamaForConditionalGeneration.from_pretrained(
            self.model_id,
            torch_dtype=torch.bfloat16,
            device_map="auto",
        )
        self.processor = AutoProcessor.from_pretrained(self.model_id)

    def chat_with_model(self, history, image, user_text):
        """Answer ``user_text`` about ``image`` and append the turn to the chat.

        Args:
            history: Current chatbot value, a list of ``[user, bot]`` pairs.
                ``None`` is treated as an empty history.
            image: PIL image from the upload widget, or ``None`` if nothing
                was uploaded.
            user_text: Prompt typed by the user; may be ``None`` or blank.

        Returns:
            A new history list with one extra ``[user, bot]`` pair: either the
            prompt with the model's reply, or ``[None, <hint>]`` when the image
            or the prompt is missing.
        """
        # Copy so the caller's list is never mutated; tolerate a None history.
        updated_history = list(history or [])

        # Validate before touching the GPU-decorated path so that an invalid
        # submission never reaches model generation.
        if image is None or not (user_text or "").strip():
            # Chat rows are [user, bot] pairs; None leaves the user side empty.
            updated_history.append([None, self.MISSING_INPUT_MESSAGE])
            return updated_history

        response = self._generate_response(image, user_text)
        updated_history.append([user_text, response])
        return updated_history

    # `spaces` is a module and cannot be used as a decorator itself; the
    # decorator it exports is `spaces.GPU`.
    # NOTE(review): assumes this runs on a Hugging Face Space where
    # `spaces.GPU` wraps GPU-bound calls - confirm for the target deployment.
    @spaces.GPU
    def _generate_response(self, image, user_text):
        """Run one image + text generation and return only the model's reply."""
        messages = [
            {
                "role": "user",
                "content": [
                    {"type": "image"},
                    {"type": "text", "text": user_text},
                ],
            }
        ]
        input_text = self.processor.apply_chat_template(
            messages, add_generation_prompt=True
        )
        # The chat template already emits the special tokens, so the processor
        # must not add them a second time.
        inputs = self.processor(
            image,
            input_text,
            add_special_tokens=False,
            return_tensors="pt",
        ).to(self.model.device)

        output = self.model.generate(**inputs, max_new_tokens=100)

        # `generate` returns prompt + completion; drop the prompt tokens and
        # the special tokens so only the assistant's answer reaches the chat.
        prompt_length = inputs["input_ids"].shape[-1]
        completion_ids = output[0][prompt_length:]
        return self.processor.decode(
            completion_ids, skip_special_tokens=True
        ).strip()

    def reset_chat(self):
        """Return an empty history, clearing the chatbot component."""
        # Pure Python, no model call - deliberately not GPU-decorated.
        return []

    def launch_interface(self):
        """Build the Gradio Blocks UI, wire the buttons, and start the server."""
        with gr.Blocks() as demo:
            gr.Markdown("### Chat with Vision-Instruct Model")

            # Chat history
            chat_history = gr.Chatbot(label="Chat History")

            # Inputs: image on the left, prompt on the right
            with gr.Row():
                with gr.Column(scale=3):
                    image_input = gr.Image(type="pil", label="Upload Image")
                with gr.Column(scale=7):
                    user_input = gr.Textbox(
                        placeholder="Type your message here...",
                        label="Your Prompt",
                    )

            # Submit and Clear buttons
            submit_button = gr.Button("Send")
            clear_button = gr.Button("Clear Chat")

            # Button actions
            submit_button.click(
                fn=self.chat_with_model,
                inputs=[chat_history, image_input, user_input],
                outputs=chat_history,
            )
            clear_button.click(fn=self.reset_chat, outputs=chat_history)

        demo.launch()


# Create an instance of the class and launch the interface
chat_app = VisionInstructChat()
chat_app.launch_interface()