import json
from pathlib import Path

import requests
import torch
from PIL import Image
from transformers import AutoTokenizer

import llava.model.language_model.llava_olmo1p58b as llava_olmo
import llava.model.language_model.llava_llama as llava_llama
from OLMo_Bitnet_1B.modeling_olmo import OLMoForCausalLM
from llava.mm_utils import tokenizer_image_token, process_images, get_model_name_from_path
from llava.conversation import conv_templates


# Run on the first CUDA device when one is available, otherwise on the CPU.
device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')

# Placeholder string that is prepended to the question text further down.
DEFAULT_IMAGE_TOKEN = "<image>"
# Id handed to tokenizer_image_token() further down for the image placeholder.
IMAGE_TOKEN_INDEX = -200


# Question asked about the image, and the URL the image is downloaded from.
text = "What are the four major tournaments of the sport shown in the image?"
url = "https://farm3.staticflickr.com/2157/2439959136_d932f4e816_z.jpg"

# The timeout stops a stalled connection from hanging the script forever, and
# raise_for_status() reports an HTTP error here rather than as an obscure
# "cannot identify image file" failure inside PIL.
http_response = requests.get(url, stream=True, timeout=30)
http_response.raise_for_status()
# convert('RGB') decodes the stream eagerly and guarantees three channels, so
# grayscale, palette or RGBA sources are handled like ordinary RGB photos.
image = Image.open(http_response.raw).convert('RGB')


# Directory containing the fine-tuned checkpoint (config.json + pytorch_model.bin).
CHECKPOINT_DIR = Path('./checkpoints/llava-LlavaOLMoBitnet1B-Run3-finetune')

# Build the model configuration from the JSON stored next to the weights.
with open(CHECKPOINT_DIR / 'config.json') as json_file:
    data = json.load(json_file)

config_class = llava_olmo.LlavaOLMoBitnet1BConfig(**data)
model = llava_olmo.LlavaOLMoBitnet1BForCausalLM(config_class).to(device)

# map_location remaps tensors that were saved from a GPU onto the device chosen
# for this run; without it, loading such a checkpoint on a CPU-only host fails.
# NOTE(review): torch.load unpickles the file — only load checkpoints you trust
# (or pass weights_only=True if the installed torch supports it).
weight_checkpoint = torch.load(CHECKPOINT_DIR / 'pytorch_model.bin', map_location=device)
model.load_state_dict(weight_checkpoint)

# A freshly constructed nn.Module is in training mode; switch to evaluation
# mode so layers such as dropout behave deterministically during generation.
model.eval()


# Reuse the image preprocessor attached to the model's vision tower.
image_processor = model.model.vision_tower.image_processor

# Tokenizer for the OLMo-BitNet language model, loaded by its hub identifier.
tokenizer = AutoTokenizer.from_pretrained(
    "NousResearch/OLMo-Bitnet-1B",
    model_max_length=2048,
    padding_side="right",
    pad_token_id=1,
    use_fast=True,
    legacy=False,
    unk_token='<|padding|>',
)


# process_images() is given a one-element list; take the single result back out.
image_tensor = process_images([image], image_processor, model.config)[0]

# Put the image placeholder on its own line ahead of the question, then wrap
# the result in the 'llava_v1' conversation template.
text = DEFAULT_IMAGE_TOKEN + '\n' + text
conv = conv_templates['llava_v1'].copy()
conv.append_message(conv.roles[0], text)
# The second role gets no message (None) — presumably so the prompt ends where
# the model is expected to start its reply; confirm against llava.conversation.
conv.append_message(conv.roles[1], None)
prompt = conv.get_prompt()

# Tokenize the prompt, add a leading batch dimension and move it to the device.
text_tokens = tokenizer_image_token(prompt, tokenizer, IMAGE_TOKEN_INDEX, return_tensors='pt').unsqueeze(0).to(device)


# Generation needs no gradients; inference_mode() avoids building the autograd
# graph, which saves memory and time.
with torch.inference_mode():
    response = model.generate(
        images=image_tensor.unsqueeze(0).to(device),
        inputs=text_tokens,
        max_new_tokens=400,
    )

decoded_text = tokenizer.batch_decode(response, skip_special_tokens=True)[0]

# Keep only the text before the first literal '</s>'. partition() returns the
# whole string when the marker is missing, whereas slicing with str.find()
# (which returns -1 in that case) silently chopped off the last character.
answer = decoded_text.partition('</s>')[0].replace('|||IP_ADDRESS|||', '')

print("\n\n", "-"*100)
print(answer)
print("-"*100)


# Archived text-only OLMo experiment. It lives inside a bare string literal, so
# it is never executed. NOTE: it uses OLMoTokenizerFast, which the import block
# of this file does not provide; import it before reviving this snippet.
'''
# ORIGINAL CODE WITH ONLY OLMO:
with open('llava/config.json') as json_file:
    data = json.load(json_file)

text = "Paris is a historic city with architectural marvels. It is also "
# text = ["Language modeling is "]

config_class = llava_olmo.LlavaOLMoBitnet1BConfig(**data)
lolmo = llava_olmo.LlavaOLMoBitnet1BForCausalLM(config_class).to(device)
lolmo.load_state_dict(torch.load('OLMo_Bitnet_1B/pytorch_model.bin'), strict=False)

olmo = OLMoForCausalLM(config_class).to(device)
olmo.load_state_dict(torch.load('OLMo_Bitnet_1B/pytorch_model.bin'))
actual_olmo = OLMoForCausalLM.from_pretrained("allenai/OLMo-1B").to(device)

actual_olmo_tokenizer = OLMoTokenizerFast.from_pretrained("allenai/OLMo-1B")
olmo_tokenizer = AutoTokenizer.from_pretrained("NousResearch/OLMo-Bitnet-1B")

olmo_tokens = olmo_tokenizer(text, return_tensors='pt', return_token_type_ids=False).to(device)
# olmo_tokens = actual_olmo_tokenizer(text, return_tensors='pt', return_token_type_ids=False).to(device)


response = lolmo.generate(inputs=olmo_tokens['input_ids'], attention_mask=olmo_tokens['attention_mask'], max_new_tokens=100, do_sample=True, top_k=50, top_p=0.95)
# response = olmo.generate(inputs=olmo_tokens['input_ids'], attention_mask=olmo_tokens['attention_mask'], max_new_tokens=100, do_sample=True, top_k=50, top_p=0.95)


print(olmo_tokenizer.batch_decode(response, skip_special_tokens=True)[0])
'''