#!/usr/bin/env python3
"""
Code here was refactored from gist:
https://gist.github.com/maldevide/08829eada04ad9bd78e46c1a3787d42b
CodeLlama example:
https://huggingface.co/collections/mlabonne/codellama-6509bc68c2d4c8fc379ee87f
Hugging Face Fine-Tuning example:
https://colab.research.google.com/drive/1PEQyJO1-f6j0S_XJ8DV50NkpzasXkrzd?usp=sharing
2024-02-07 - unable to get unsloth to install.
If you want to fine-tune, here's an example Unsloth fine tuning guide for:
Alpaca + TinyLlama + RoPE Scaling full example.ipynb
https://colab.research.google.com/drive/1AZghoNBQaMDgWJpi4RbffGM1h6raLUj9?usp=sharing
"""
import logging
import os
import re
from typing import Dict, List, Tuple

import torch
import transformers
from ddare.merge import merge_tensors
from ddare.tensor import (
    dare_ties_sparsification,
    relative_norm,
    divide_tensor_into_sets,
)
from ddare.util import get_device

logging.basicConfig(level=logging.INFO)
log = logging.getLogger(__name__)


def get_models(
    models: List[str],
    trust_remote_code: bool,
):
    """
    Download and load the models to merge.

    :param models: model names/paths to download
    :param trust_remote_code: only set True for models you trust
    """
    config = {
        "torch_dtype": torch.float16,
        "low_cpu_mem_usage": False,
        "trust_remote_code": trust_remote_code,
    }
    loaded_models = []
    num_models = len(models)
    for midx, model_path in enumerate(models):
        log.info(
            f"loading model={midx + 1}/{num_models} "
            f"model={model_path} "
        )
        loaded_models.append(
            transformers.AutoModelForCausalLM.from_pretrained(
                model_path, **config
            )
        )
    return loaded_models
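
# Usage sketch for get_models() (model names taken from run() below):
#
#     loaded = get_models(
#         models=[
#             "TinyLlama/TinyLlama-1.1B-intermediate-step-1431k-3T",
#             "Doctor-Shotgun/TinyLlama-1.1B-32k-Instruct",
#         ],
#         trust_remote_code=True,
#     )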


def pm(
    model,
):
    """
    Pretty print the model's tensors.

    :param model: loaded model whose state_dict is logged
    """
    keys = model.state_dict().keys()
    log.info(f"model keys={len(keys)}")
    for i, k in enumerate(keys):
        tensor = model.state_dict()[k]
        log.info(
            f"{i:3d} {k} shape={tensor.shape} "
            f"type={tensor.dtype} dev={tensor.device} "
            f"contig={tensor.is_contiguous()}"
        )
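
# Usage sketch for pm() (continues the get_models() example above):
#
#     pm(loaded[0])
#     # logs one line per state_dict key with its shape, dtype, device
#     # and contiguity, e.g. "  0 model.embed_tokens.weight shape=..."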


def run_text_test(
    model,
    tokenizer_path: str,
    question: str,
    device: str = "cuda",
):
    """
    Run a question through the model, log the answer,
    and return the tokenizer.

    :param model: initialized model
    :param tokenizer_path: tokenizer path/name
    :param question: what are you asking?
    :param device: where to run inference: "cuda" or "cpu"
    """
    base_model = model.to(device)
    log.info(f"loading tokenizer={tokenizer_path}")
    tokenizer = transformers.AutoTokenizer.from_pretrained(
        tokenizer_path,
        torch_dtype=torch.float16,
    )
    inputs = tokenizer(question, return_tensors="pt").to(
        device
    )
    with torch.backends.cuda.sdp_kernel(
        enable_flash=True,
        enable_math=False,
        enable_mem_efficient=True,
    ):
        outputs = base_model.generate(
            **inputs,
            max_new_tokens=256,
        )
    answer = tokenizer.decode(
        outputs[0], skip_special_tokens=True
    )
    log.info(
        "\n"
        "----------"
        "\n"
        f"tokenizer={tokenizer}\n "
        f"question:\n{question}\n"
        f"answer:\n{answer}\n"
        "----------"
    )
    base_model = base_model.to(device)
    return tokenizer
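
# Usage sketch for run_text_test() (assumes a CUDA device, since the default
# device is "cuda"; the model/tokenizer names are the ones run() uses below):
#
#     model = transformers.AutoModelForCausalLM.from_pretrained(
#         "matlok/tinyllama-cinder-openhermes-32k",
#         torch_dtype=torch.float16,
#     )
#     run_text_test(
#         model=model,
#         tokenizer_path="TinyLlama/TinyLlama-1.1B-intermediate-step-1431k-3T",
#         question="why is the sky blue?",
#         device="cuda",
#     )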


def get_layer_type(key: str) -> Tuple[int, str]:
    """
    Get the layer type.

    :param key: name of the layer
    :return: layer id and name
    """
    matcher = re.compile(r"model.layers.(\d+).(.+)")
    m = matcher.match(key)
    if m is None:
        if "model.norm.weight" == key:
            return -1, "norm"
        if "model.embed_tokens.weight" == key:
            return -1, "embed"
        if "lm_head.weight" == key:
            return -1, "head"
        log.info(f"Unknown key {key}")
        return -1, "unknown"
    return int(m.group(1)), m.group(2)
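
# What get_layer_type() returns for typical TinyLlama keys (derived from the
# regex and special cases above):
#
#     get_layer_type("model.layers.0.self_attn.q_proj.weight")
#     # -> (0, "self_attn.q_proj.weight")
#     get_layer_type("model.embed_tokens.weight")   # -> (-1, "embed")
#     get_layer_type("lm_head.weight")              # -> (-1, "head")
#     get_layer_type("model.norm.weight")           # -> (-1, "norm")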


def merge_model_with_ties(
    models: List[str],
    model_dst: str,
    trust_remote_code: bool = True,
):
    """
    Merge the list of models into one model
    called model_dst.

    :param models: list of models to merge
    :param model_dst: name of the new model
    :param trust_remote_code: only set True for models you trust
    """
    models = get_models(
        models=models,
        trust_remote_code=trust_remote_code,
    )
    config = {}
    result_dict: Dict[str, torch.Tensor] = {}
    device = get_device()
    keys = models[0].state_dict().keys()
    num_keys = len(keys)
    for k in keys:
        block, layer_type = get_layer_type(k)
        m0: torch.Tensor = models[0].state_dict()[k]
        result = m0.clone()
        # assign each element of the base tensor to one of 4 sets,
        # one per source model
        sets = divide_tensor_into_sets(tensor=m0, n_sets=4)

        # get the src layers to merge
        m = [
            models[1].state_dict()[k],
            models[2].state_dict()[k],
            models[3].state_dict()[k],
            models[4].state_dict()[k],
        ]

        # build a ratio
        # note: Llama layer names look like "self_attn.q_proj.weight",
        # so the "to_q"/"to_k"/"to_v" entries below never match here
        # and the ratio defaults to 0.5 for every tensor
        ratio = {
            "to_q": 0.0,
            "to_k": 0.0,
            "to_v": 0.0,
        }.get(layer_type, 0.5)

        norm_ratio = 0.68
        log.info(
            f"model={k} {num_keys} shape={m0.shape} "
            f"dtype={m0.dtype} {m0.device} "
            f"ratio={ratio} "
            f"contig={m0.is_contiguous()} "
            f"norm={norm_ratio}"
        )

        # for all tensors
        for i, tensor in enumerate(m):
            if layer_type == "to_k":
                # Get to_q key
                q_base = models[0].state_dict()[
                    k.replace("to_k", "to_q")
                ]
                q_merge = models[i].state_dict()[
                    k.replace("to_k", "to_q")
                ]
                scale = relative_norm(q_merge, q_base)
                tensor = tensor.to(device) / scale
                del scale
            elif layer_type == "to_q":
                scale = relative_norm(tensor, m0)
                tensor = tensor.to(device) * scale
                del scale
            slice_mask = (sets == i).bool()
            new_tensor = dare_ties_sparsification(
                model_a_param=m0,
                model_b_param=tensor,
                drop_rate=norm_ratio,
                ties="sum",
                rescale="off",
                device=device,
                **config,
            )
            # note: the slerp merge below overwrites new_tensor from the
            # dare-ties sparsification step above, so only the slerp
            # result lands in this model's slice of the output
            new_tensor = merge_tensors(
                "slerp", m0, tensor, ratio
            )
            result = torch.where(
                slice_mask, new_tensor, result
            )
            del new_tensor, slice_mask
        result_dict[k] = result
    # end of merge
    log.info(f"done merge saving to file: {model_dst}")
    # load a model shell for the destination, then swap in the
    # merged weights before saving
    out_model = (
        transformers.AutoModelForCausalLM.from_pretrained(
            model_dst, **config
        )
    )
    out_model.state_dict = lambda: result_dict
    out_model.save_pretrained(model_dst)
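
# Usage sketch for merge_model_with_ties() (the same five models and
# destination repo that run() uses below):
#
#     merge_model_with_ties(
#         models=[
#             "TinyLlama/TinyLlama-1.1B-intermediate-step-1431k-3T",
#             "Doctor-Shotgun/TinyLlama-1.1B-32k-Instruct",
#             "Doctor-Shotgun/TinyLlama-1.1B-32k",
#             "Tensoic/TinyLlama-1.1B-3T-openhermes",
#             "Josephgflowers/TinyLlama-3T-Cinder-v1.3",
#         ],
#         model_dst="matlok/tinyllama-cinder-openhermes-32k",
#     )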


def run():
    """
    Run the merge and upload the model and tokenizer.

    This requires having the Hugging Face token
    set before it will work:
    ```huggingface-cli login```
    """
    question = "why is the sky blue?"
    log.info(
        f"merging models and asking the question: {question}"
    )
    model_src = "TinyLlama/TinyLlama-1.1B-intermediate-step-1431k-3T"
    model_dst = "matlok/tinyllama-cinder-openhermes-32k"
    device = "cuda"
    config = {
        "torch_dtype": torch.float16,
        "low_cpu_mem_usage": False,
        "trust_remote_code": True,
    }
    models = [
        model_src,
        "Doctor-Shotgun/TinyLlama-1.1B-32k-Instruct",
        "Doctor-Shotgun/TinyLlama-1.1B-32k",
        "Tensoic/TinyLlama-1.1B-3T-openhermes",
        "Josephgflowers/TinyLlama-3T-Cinder-v1.3",
    ]
    merge_model_with_ties(
        models=models, model_dst=model_dst
    )
    log.info(f"loading newly-created file: {model_dst}")
    model = (
        transformers.AutoModelForCausalLM.from_pretrained(
            model_dst, **config
        )
    )
    log.info(
        f"loaded new model file: {model_dst} "
        f"asking question: {question} "
    )
    run_text_test(
        model=model,
        tokenizer_path=model_src,
        question=question,
        device=device,
    )
    # clean the temp merge dir
    # remove the local model dir to prevent issues with the tokenizer upload
    model_org = model_dst.split("/")[0]
    if os.path.exists(model_org):
        os.system(f"rm -rf ./{model_org}")
    log.info(f"uploading model: {model_dst}")
    model.push_to_hub(model_dst)
    log.info(f"uploading src tokenizer: {model_src}")
    # reload the tokenizer to save it, as shown in:
    # https://colab.research.google.com/drive/1PEQyJO1-f6j0S_XJ8DV50NkpzasXkrzd?usp=sharing#scrollTo=QQn30cRtAZ-P
    tokenizer = transformers.AutoTokenizer.from_pretrained(
        model_src, trust_remote_code=True
    )
    # https://huggingface.co/docs/transformers/model_sharing#use-the-pushtohub-function
    # tokenizer.push_to_hub("my-awesome-model")
    tokenizer.push_to_hub(model_dst)
    log.info(
        f"done loading new model: {model} "
        f"file: {model_dst}"
    )


if __name__ == "__main__":
    run()