RTX 4090
Seems to run okay on 3x 24 GB cards. It could describe the cat :-) nice, it works.
Also, I updated your code a little bit here so it works with a newer PyTorch version:
with torch.amp.autocast(device_type=model.device.type, dtype=torch.bfloat16):
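For reference, here is a minimal sketch of what I changed, assuming the original script used the older device-specific torch.cuda.amp.autocast call that newer PyTorch releases flag as deprecated:

import torch

# Assumption: the original used the deprecated device-specific form:
#   with torch.cuda.amp.autocast(dtype=torch.bfloat16):
#       ...
# torch.amp.autocast is the device-agnostic replacement and takes an explicit
# device_type string ("cuda" here, i.e. model.device.type in the full script below).
with torch.amp.autocast(device_type="cuda", dtype=torch.bfloat16):
    pass  # inference code goes inside this context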
/usr/local/bin/python3 /home/myles/Desktop/Aria/aria.py
grouped_gemm is not installed, using sequential GEMM, which is slower.
Loading checkpoint shards: 100%|█████████████████████████████████████████████████████████████████████████████████████████████████████| 12/12 [00:11<00:00, 1.07it/s]
The `seen_tokens` attribute is deprecated and will be removed in v4.41. Use the `cache_position` model input instead.
Starting from v4.46, the `logits` model output will have the same type as the model (except at train time, where it will always be FP32)
The image shows a close-up of a cat. The cat has black fur on its head and around its eyes, and white fur on the rest of its face and body. It is looking directly at the camera with a curious expression. The cat is situated on a cardboard box, with its front paws resting on the edge and a green background behind it.<|im_end|>
import os
import requests
import torch
from PIL import Image
from transformers import AutoModelForCausalLM, AutoProcessor
# Set the visible devices (you can also do this in the shell before running the script)
os.environ["CUDA_VISIBLE_DEVICES"] = "0,1,2"
model_id_or_path = "rhymes-ai/Aria"
# Load the model and automatically distribute layers across the visible devices
model = AutoModelForCausalLM.from_pretrained(
    model_id_or_path,
    device_map="auto",  # Automatically distribute model layers across the visible GPUs
    torch_dtype=torch.bfloat16,
    trust_remote_code=True
)
# Load the processor
processor = AutoProcessor.from_pretrained(model_id_or_path, trust_remote_code=True)
# Load an example image
image_path = "https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/cat.png"
image = Image.open(requests.get(image_path, stream=True).raw)
# Prepare the input message structure
messages = [
    {
        "role": "user",
        "content": [
            {"text": None, "type": "image"},
            {"text": "what is the image?", "type": "text"},
        ],
    }
]
# Prepare inputs for the model
text = processor.apply_chat_template(messages, add_generation_prompt=True)
inputs = processor(text=text, images=image, return_tensors="pt")
inputs["pixel_values"] = inputs["pixel_values"].to(model.dtype)
# Move all input tensors to the same device as the model
inputs = {k: v.to(model.device) for k, v in inputs.items()}
# Perform inference
with torch.inference_mode(), torch.amp.autocast(device_type=model.device.type, dtype=torch.bfloat16):
    output = model.generate(
        **inputs,
        max_new_tokens=64000,
        stop_strings=["<|im_end|>"],
        tokenizer=processor.tokenizer,
        do_sample=True,
        temperature=0.9,
    )
output_ids = output[0][inputs["input_ids"].shape[1]:]
result = processor.decode(output_ids, skip_special_tokens=True)
print(result)
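Not part of the original script, but if you want to double-check how device_map="auto" split the model over the three cards, this small optional snippet prints the placement (hf_device_map is populated by transformers/accelerate when loading with device_map="auto"):

# Optional: show which device each module group was placed on by device_map="auto"
print(model.hf_device_map)

# Optional: rough per-GPU memory usage after generation
for i in range(torch.cuda.device_count()):
    print(f"cuda:{i}: {torch.cuda.memory_allocated(i) / 1024**3:.1f} GiB allocated")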