"""Single-image question-answering demo for a LLaVA OLMo-BitNet-1B checkpoint.

The script downloads one image, builds a `llava_v1` chat prompt around a
question about it, loads the fine-tuned checkpoint from ``CHECKPOINT_DIR``,
generates up to 400 new tokens and prints the decoded answer.
"""
# NOTE: import order is kept exactly as in the original script.
import json
from transformers import AutoTokenizer
import torch
import llava.model.language_model.llava_olmo1p58b as llava_olmo
## import llava.model.language_model.llava_llama as llava_llama
# NOTE(review): OLMoForCausalLM is not referenced below; the import is kept
# in case importing the module has side effects -- confirm before removing.
from OLMo_Bitnet_1B.modeling_olmo import OLMoForCausalLM
from PIL import Image
import requests
from llava.mm_utils import tokenizer_image_token, process_images, get_model_name_from_path
from llava.conversation import conv_templates

device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')

# Placeholder marking where the image goes in the text prompt. An empty
# placeholder would leave the prompt without any image marker at all.
# NOTE(review): "<image>" is the usual LLaVA placeholder -- confirm it matches
# what tokenizer_image_token expects in this code base.
DEFAULT_IMAGE_TOKEN = "<image>"
IMAGE_TOKEN_INDEX = -200

# End-of-answer marker searched for in the decoded text.
# NOTE(review): presumably the separator emitted by the 'llava_v1' template.
EOS_MARKER = '</s>'

# Directory holding both config.json and pytorch_model.bin.
CHECKPOINT_DIR = './checkpoints/llava-LlavaOLMoBitnet1B-Run3-finetune'

# Define Image and Text inputs..
text = "What are the four major tournaments of the sport shown in the image?"
url = "https://farm3.staticflickr.com/2157/2439959136_d932f4e816_z.jpg"

# A timeout keeps the script from hanging forever on a stalled connection,
# and the status check surfaces HTTP errors instead of a confusing
# "cannot identify image file" failure from PIL.
http_response = requests.get(url, stream=True, timeout=30)
http_response.raise_for_status()
# Normalise to RGB so grayscale/palette/CMYK files are handled uniformly.
image = Image.open(http_response.raw).convert('RGB')

# LOAD MODEL FROM CHECKPOINT
with open(CHECKPOINT_DIR + '/config.json') as json_file:
    data = json.load(json_file)

config_class = llava_olmo.LlavaOLMoBitnet1BConfig(**data)
model = llava_olmo.LlavaOLMoBitnet1BForCausalLM(config_class).to(device)

# map_location lets a checkpoint saved from GPU load on a CPU-only host.
# torch.load unpickles the file: only use it on checkpoints you trust.
weight_checkpoint = torch.load(
    CHECKPOINT_DIR + '/pytorch_model.bin',
    map_location=device,
)
model.load_state_dict(weight_checkpoint)
# A freshly constructed nn.Module starts in training mode; switch to eval so
# train-only behaviour (e.g. dropout) is disabled during generation.
model.eval()

# pre-process image; Apply chat template and tokenize text
image_processor = model.model.vision_tower.image_processor
tokenizer = AutoTokenizer.from_pretrained(
    "NousResearch/OLMo-Bitnet-1B",
    model_max_length=2048,
    padding_side="right",
    pad_token_id=1,
    use_fast=True,
    legacy=False,
    unk_token='<|padding|>',
)

image_tensor = process_images([image], image_processor, model.config)[0]

# Prefix the question with the image placeholder, then wrap it in the chat
# template: one user turn followed by an empty assistant turn to be filled in.
text = DEFAULT_IMAGE_TOKEN + '\n' + text
conv = conv_templates['llava_v1'].copy()
conv.append_message(conv.roles[0], text)
conv.append_message(conv.roles[1], None)
prompt = conv.get_prompt()

# unsqueeze(0) adds a leading dimension of size 1 before moving to the device.
text_tokens = tokenizer_image_token(
    prompt, tokenizer, IMAGE_TOKEN_INDEX, return_tensors='pt'
).unsqueeze(0).to(device)

# Generate response from the model
with torch.inference_mode():  # no autograd bookkeeping needed for inference
    response = model.generate(
        images=image_tensor.unsqueeze(0).to(device),
        inputs=text_tokens,
        max_new_tokens=400,
    )

decoded_text = tokenizer.batch_decode(response, skip_special_tokens=True)[0]

# Keep only the text before the first end marker. split() returns the whole
# string when the marker is absent, whereas slicing with find()'s -1 result
# would silently drop the final character.
answer = decoded_text.split(EOS_MARKER, 1)[0]
# The replace part is due to unwanted token introduction at start
answer = answer.replace('|||IP_ADDRESS|||', '')

print("\n\n", "-"*100)
print(answer)
print("-"*100)