import os
import streamlit as st
from huggingface_hub import login
from transformers import MllamaForConditionalGeneration, AutoProcessor
from PIL import Image
import torch
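
# NOTE: MllamaForConditionalGeneration assumes a fairly recent transformers
# release (Llama 3.2 Vision support landed around v4.45) and an account with
# access to the gated meta-llama repository on the Hugging Face Hub.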

# Step 1: Log in to Hugging Face with the access token stored in secrets
huggingface_token = os.getenv("HUGGINGFACE_TOKEN")  # Fetch the token from the environment
if huggingface_token:
    login(token=huggingface_token)  # Authenticate using the token
else:
    st.error("Hugging Face token not found. Please set it in the Secrets section.")
    st.stop()  # The gated Llama weights cannot be downloaded without a token

# Step 2: Load the model and processor.
# st.cache_resource keeps the 11B model in memory across Streamlit reruns,
# so it is not reloaded on every widget interaction.
@st.cache_resource
def load_model_and_processor(name: str):
    model = MllamaForConditionalGeneration.from_pretrained(
        name,
        token=huggingface_token,
        torch_dtype=torch.bfloat16,  # bfloat16 halves memory vs. float32
        device_map="auto",           # spread layers across available devices
    )
    processor = AutoProcessor.from_pretrained(name, token=huggingface_token)
    return model, processor

try:
    model_name = "meta-llama/Llama-3.2-11B-Vision-Instruct"
    model, processor = load_model_and_processor(model_name)
    st.success("Model and processor loaded successfully!")
except Exception as e:
    st.error(f"Error loading model or processor: {str(e)}")

# Step 3: Create a simple Streamlit app
def main():
    st.title("Llama 3.2 11B Vision Model")
    st.write("Upload an image and enter a prompt to generate output.")

    # Upload image
    image_file = st.file_uploader("Upload an image", type=["jpg", "jpeg", "png"])
    prompt = st.text_area("Enter your prompt here:")

    if st.button("Generate Output"):
        if image_file and prompt:
            # Load image
            image = Image.open(image_file).convert("RGB")
            st.image(image, caption="Uploaded Image", use_container_width=True)

            try:
                # Prepare the messages in the format expected by the processor
                messages = [
                    {
                        "role": "user",
                        "content": [
                            {"type": "text", "text": prompt},
                            {"type": "image"}
                        ]
                    }
                ]

                # Apply chat template
                input_text = processor.apply_chat_template(messages, add_generation_prompt=True)
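                # The string returned above has the Llama 3.2 special tokens
                # inlined; illustratively, something roughly like:
                #   <|begin_of_text|><|start_header_id|>user<|end_header_id|>
                #   {prompt}<|image|><|eot_id|><|start_header_id|>assistant<|end_header_id|>
                # where <|image|> marks the position at which the processor
                # splices in the image features.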

                # Prepare inputs for the model; add_special_tokens=False avoids
                # a doubled <|begin_of_text|> since the chat template already
                # inserted it, and model.device matches wherever
                # device_map="auto" placed the weights
                inputs = processor(
                    text=input_text,
                    images=[image],
                    add_special_tokens=False,
                    return_tensors="pt"
                ).to(model.device)

                # Generate output
                with torch.no_grad():
                    output_ids = model.generate(
                        **inputs,
                        max_new_tokens=250,
                    )
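                # generate() defaults to greedy decoding here; for more varied
                # answers, sampling can be enabled with, e.g.,
                # do_sample=True, temperature=0.7 (illustrative values).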

                # Decode only the newly generated tokens. Slicing off the
                # prompt tokens is more reliable than string-matching the
                # templated input, whose special tokens are stripped during
                # decoding and therefore never match the decoded output.
                prompt_length = inputs["input_ids"].shape[-1]
                generated_output = processor.batch_decode(
                    output_ids[:, prompt_length:], skip_special_tokens=True
                )[0].strip()

                st.write("Generated Output:", generated_output)
            except Exception as e:
                st.error(f"Error during prediction: {str(e)}")
        else:
            st.warning("Please upload an image and enter a prompt.")

if __name__ == "__main__":
    main()
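
# To run locally (assuming this file is saved as app.py):
#   export HUGGINGFACE_TOKEN=<your token>
#   streamlit run app.py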