LlavaOLMoBitnet1B / llava_olmo.py

all the files required for inference

2ce0406 3 months ago

3.85 kB

	import json
	from transformers import AutoTokenizer
	import torch

	import llava.model.language_model.llava_olmo1p58b as llava_olmo ##
	import llava.model.language_model.llava_llama as llava_llama

	from OLMo_Bitnet_1B.modeling_olmo import OLMoForCausalLM
	from PIL import Image
	import requests
	from llava.mm_utils import tokenizer_image_token, process_images, get_model_name_from_path
	from llava.conversation import conv_templates


	# Run on the first CUDA device when one is available, otherwise on the CPU.
	if torch.cuda.is_available():
	    device = torch.device('cuda:0')
	else:
	    device = torch.device('cpu')

	# Placeholder string put into the prompt text, and the integer id handed to
	# tokenizer_image_token for that placeholder further down in this script.
	IMAGE_TOKEN_INDEX = -200
	DEFAULT_IMAGE_TOKEN = "<image>"

	# Define Image and Text inputs..
	text = "What are the four major tournaments of the sport shown in the image?"
	url = "https://farm3.staticflickr.com/2157/2439959136_d932f4e816_z.jpg"
	# Bound the wait with a timeout (requests waits indefinitely by default) and
	# fail loudly on an HTTP error instead of handing an error page to PIL.
	image_response = requests.get(url, stream=True, timeout=30)
	image_response.raise_for_status()
	# stream=True keeps the body unread so PIL can decode straight from the raw stream.
	image = Image.open(image_response.raw)


	# LOAD MODEL FROM CHECKPOINT
	# The config JSON and the weight file are both read from the same run directory.
	checkpoint_dir = './checkpoints/llava-LlavaOLMoBitnet1B-Run3-finetune'
	with open(f'{checkpoint_dir}/config.json', encoding='utf-8') as json_file:
	    data = json.load(json_file)

	# Build the model from the config, then fill it with the checkpoint weights.
	config_class = llava_olmo.LlavaOLMoBitnet1BConfig(**data)
	model = llava_olmo.LlavaOLMoBitnet1BForCausalLM(config_class).to(device)
	# map_location remaps the stored tensors onto `device`, so a checkpoint that
	# was saved from a GPU still loads when this script falls back to the CPU.
	# NOTE(review): torch.load unpickles the file - only load trusted checkpoints.
	weight_checkpoint = torch.load(f'{checkpoint_dir}/pytorch_model.bin', map_location=device)
	model.load_state_dict(weight_checkpoint)
	# Modules constructed directly start in training mode; switch to eval mode
	# so train-only layers such as dropout are disabled during generation.
	model.eval()

	# pre-process image; Apply chat template and tokenize text
	# The image processor is taken from the vision tower attached to the model.
	image_processor = model.model.vision_tower.image_processor
	tokenizer = AutoTokenizer.from_pretrained(
	    "NousResearch/OLMo-Bitnet-1B",
	    model_max_length=2048,
	    padding_side="right",
	    pad_token_id=1,
	    use_fast=True,
	    legacy=False,
	    # Plain pipes: the previous '\|' sequences are invalid string escapes and
	    # left literal backslashes inside the token text.
	    unk_token='<|padding|>',
	)


	# Run the single input image through the image processor and keep the first result.
	image_tensor = process_images([image], image_processor, model.config)[0]

	# Put the image placeholder on its own line ahead of the question, then wrap
	# it in the 'llava_v1' conversation template: one message for the first role
	# and an empty (None) slot for the second role.
	text = f"{DEFAULT_IMAGE_TOKEN}\n{text}"
	conv = conv_templates['llava_v1'].copy()
	for role, message in ((conv.roles[0], text), (conv.roles[1], None)):
	    conv.append_message(role, message)
	prompt = conv.get_prompt()

	# Tokenize the prompt, passing IMAGE_TOKEN_INDEX for the image placeholder,
	# then add a leading dimension and move the ids to the target device.
	text_tokens = tokenizer_image_token(
	    prompt, tokenizer, IMAGE_TOKEN_INDEX, return_tensors='pt'
	)
	text_tokens = text_tokens.unsqueeze(0).to(device)

	# Generate response from the model
	response = model.generate(
	    images=image_tensor.unsqueeze(0).to(device),
	    inputs=text_tokens,
	    max_new_tokens=400,
	)
	decoded_text = tokenizer.batch_decode(response, skip_special_tokens=True)[0]
	# Keep only the text before the first '</s>'. partition() returns the whole
	# string when the marker is absent, whereas slicing with find() (which
	# returns -1 in that case) silently dropped the last character.
	answer = decoded_text.partition('</s>')[0]
	# The replace part is due to unwanted token introduction at start
	answer = answer.replace('|||IP_ADDRESS|||', '')
	print("\n\n", "-"*100)
	print(answer)
	print("-"*100)


	# ---------------------------------------------------------------------------
	# Disabled reference code.
	# The triple-quoted text below is a bare string-literal statement, so nothing
	# inside it is executed. It keeps an earlier text-only experiment: it builds
	# the LLaVA-OLMo wrapper and plain OLMo models, loads
	# 'OLMo_Bitnet_1B/pytorch_model.bin', and samples a continuation of a text
	# prompt with the wrapper.
	# NOTE(review): the snippet uses OLMoTokenizerFast, which is not among the
	# imports visible at the top of this file - import it before re-enabling.
	# ---------------------------------------------------------------------------
	'''
	# ORIGINAL CODE WITH ONLY OLMO:
	with open('llava/config.json') as json_file:
	data = json.load(json_file)

	text = "Paris is a historic city with architectural marvels. It is also "
	# text = ["Language modeling is "]

	config_class = llava_olmo.LlavaOLMoBitnet1BConfig(**data)
	lolmo = llava_olmo.LlavaOLMoBitnet1BForCausalLM(config_class).to(device)
	lolmo.load_state_dict(torch.load('OLMo_Bitnet_1B/pytorch_model.bin'), strict=False)

	olmo = OLMoForCausalLM(config_class).to(device)
	olmo.load_state_dict(torch.load('OLMo_Bitnet_1B/pytorch_model.bin'))
	actual_olmo = OLMoForCausalLM.from_pretrained("allenai/OLMo-1B").to(device)

	actual_olmo_tokenizer = OLMoTokenizerFast.from_pretrained("allenai/OLMo-1B")
	olmo_tokenizer = AutoTokenizer.from_pretrained("NousResearch/OLMo-Bitnet-1B")

	olmo_tokens = olmo_tokenizer(text, return_tensors='pt', return_token_type_ids=False).to(device)
	# olmo_tokens = actual_olmo_tokenizer(text, return_tensors='pt', return_token_type_ids=False).to(device)


	response = lolmo.generate(inputs=olmo_tokens['input_ids'], attention_mask=olmo_tokens['attention_mask'], max_new_tokens=100, do_sample=True, top_k=50, top_p=0.95)
	# response = olmo.generate(inputs=olmo_tokens['input_ids'], attention_mask=olmo_tokens['attention_mask'], max_new_tokens=100, do_sample=True, top_k=50, top_p=0.95)


	print(olmo_tokenizer.batch_decode(response, skip_special_tokens=True)[0])
	'''