#!/usr/bin/env python3
"""
Code here was refactored from gist:
https://gist.github.com/maldevide/08829eada04ad9bd78e46c1a3787d42b
CodeLlama example:
https://huggingface.co/collections/mlabonne/codellama-6509bc68c2d4c8fc379ee87f
Hugging Face Fine-Tuning example:
https://colab.research.google.com/drive/1PEQyJO1-f6j0S_XJ8DV50NkpzasXkrzd?usp=sharing
2024-02-07 - unable to get unsloth to install.
If you want to fine-tune, here's an example Unsloth fine tuning guide for:
Alpaca + TinyLlama + RoPE Scaling full example.ipynb
https://colab.research.google.com/drive/1AZghoNBQaMDgWJpi4RbffGM1h6raLUj9?usp=sharing
"""
import logging
import os
import re
from typing import Dict, List, Tuple

import torch
import transformers
from ddare.merge import merge_tensors
from ddare.tensor import (
    dare_ties_sparsification,
    relative_norm,
    divide_tensor_into_sets,
)
from ddare.util import get_device

logging.basicConfig(level=logging.INFO)
log = logging.getLogger(__name__)


def get_models(
    models: List[str],
    trust_remote_code: bool,
):
    """
    Download and load the models to merge.

    :param models: model names/paths to download
    :param trust_remote_code: only set True for models you trust
    """
    config = {
        "torch_dtype": torch.float16,
        "low_cpu_mem_usage": False,
        "trust_remote_code": trust_remote_code,
    }
    loaded_models = []
    num_models = len(models)
    for midx, model_path in enumerate(models):
        log.info(
            f"loading model={midx + 1}/{num_models} "
            f"model={model_path} "
        )
        loaded_models.append(
            transformers.AutoModelForCausalLM.from_pretrained(
                model_path, **config
            )
        )
    return loaded_models
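
# Usage sketch for get_models() (model names taken from run() below):
#
#     loaded = get_models(
#         models=[
#             "TinyLlama/TinyLlama-1.1B-intermediate-step-1431k-3T",
#             "Doctor-Shotgun/TinyLlama-1.1B-32k-Instruct",
#         ],
#         trust_remote_code=True,
#     )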


def pm(
    model,
):
    """
    Pretty print the model's tensors.

    :param model: loaded model whose state_dict is logged
    """
    keys = model.state_dict().keys()
    log.info(f"model keys={len(keys)}")
    for i, k in enumerate(keys):
        tensor = model.state_dict()[k]
        log.info(
            f"{i:3d} {k} shape={tensor.shape} "
            f"type={tensor.dtype} dev={tensor.device} "
            f"contig={tensor.is_contiguous()}"
        )
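
# Usage sketch for pm() (continues the get_models() example above):
#
#     pm(loaded[0])
#     # logs one line per state_dict key with its shape, dtype, device
#     # and contiguity, e.g. "  0 model.embed_tokens.weight shape=..."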


def run_text_test(
    model,
    tokenizer_path: str,
    question: str,
    device: str = "cuda",
):
    """
    Run a question through the model, log the answer,
    and return the tokenizer.

    :param model: initialized model
    :param tokenizer_path: tokenizer path/name
    :param question: what are you asking?
    :param device: where to run inference: "cuda" or "cpu"
    """
    base_model = model.to(device)
    log.info(f"loading tokenizer={tokenizer_path}")
    tokenizer = transformers.AutoTokenizer.from_pretrained(
        tokenizer_path,
        torch_dtype=torch.float16,
    )
    inputs = tokenizer(question, return_tensors="pt").to(
        device
    )
    with torch.backends.cuda.sdp_kernel(
        enable_flash=True,
        enable_math=False,
        enable_mem_efficient=True,
    ):
        outputs = base_model.generate(
            **inputs,
            max_new_tokens=256,
        )
    answer = tokenizer.decode(
        outputs[0], skip_special_tokens=True
    )
    log.info(
        "\n"
        "----------"
        "\n"
        f"tokenizer={tokenizer}\n "
        f"question:\n{question}\n"
        f"answer:\n{answer}\n"
        "----------"
    )
    base_model = base_model.to(device)
    return tokenizer
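
# Usage sketch for run_text_test() (assumes a CUDA device, since the default
# device is "cuda"; the model/tokenizer names are the ones run() uses below):
#
#     model = transformers.AutoModelForCausalLM.from_pretrained(
#         "matlok/tinyllama-cinder-openhermes-32k",
#         torch_dtype=torch.float16,
#     )
#     run_text_test(
#         model=model,
#         tokenizer_path="TinyLlama/TinyLlama-1.1B-intermediate-step-1431k-3T",
#         question="why is the sky blue?",
#         device="cuda",
#     )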


def get_layer_type(key: str) -> Tuple[int, str]:
    """
    Get the layer type.

    :param key: name of the layer
    :return: layer id and name
    """
    matcher = re.compile(r"model.layers.(\d+).(.+)")
    m = matcher.match(key)
    if m is None:
        if "model.norm.weight" == key:
            return -1, "norm"
        if "model.embed_tokens.weight" == key:
            return -1, "embed"
        if "lm_head.weight" == key:
            return -1, "head"
        log.info(f"Unknown key {key}")
        return -1, "unknown"
    return int(m.group(1)), m.group(2)
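
# What get_layer_type() returns for typical TinyLlama keys (derived from the
# regex and special cases above):
#
#     get_layer_type("model.layers.0.self_attn.q_proj.weight")
#     # -> (0, "self_attn.q_proj.weight")
#     get_layer_type("model.embed_tokens.weight")   # -> (-1, "embed")
#     get_layer_type("lm_head.weight")              # -> (-1, "head")
#     get_layer_type("model.norm.weight")           # -> (-1, "norm")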


def merge_model_with_ties(
    models: List[str],
    model_dst: str,
    trust_remote_code: bool = True,
):
    """
    Merge the list of models into one model
    called model_dst.

    :param models: list of models to merge
    :param model_dst: name of the new model
    :param trust_remote_code: only set True for models you trust
    """
    models = get_models(
        models=models,
        trust_remote_code=trust_remote_code,
    )
    config = {}
    result_dict: Dict[str, torch.Tensor] = {}
    device = get_device()
    keys = models[0].state_dict().keys()
    num_keys = len(keys)
    for k in keys:
        block, layer_type = get_layer_type(k)
        m0: torch.Tensor = models[0].state_dict()[k]
        result = m0.clone()
        # assign each element of the base tensor to one of 4 sets,
        # one per source model
        sets = divide_tensor_into_sets(tensor=m0, n_sets=4)

        # get the src layers to merge
        m = [
            models[1].state_dict()[k],
            models[2].state_dict()[k],
            models[3].state_dict()[k],
            models[4].state_dict()[k],
        ]

        # build a ratio
        # note: Llama layer names look like "self_attn.q_proj.weight",
        # so the "to_q"/"to_k"/"to_v" entries below never match here
        # and the ratio defaults to 0.5 for every tensor
        ratio = {
            "to_q": 0.0,
            "to_k": 0.0,
            "to_v": 0.0,
        }.get(layer_type, 0.5)

        norm_ratio = 0.68
        log.info(
            f"model={k} {num_keys} shape={m0.shape} "
            f"dtype={m0.dtype} {m0.device} "
            f"ratio={ratio} "
            f"contig={m0.is_contiguous()} "
            f"norm={norm_ratio}"
        )

        # for all tensors
        for i, tensor in enumerate(m):
            if layer_type == "to_k":
                # Get to_q key
                q_base = models[0].state_dict()[
                    k.replace("to_k", "to_q")
                ]
                q_merge = models[i].state_dict()[
                    k.replace("to_k", "to_q")
                ]
                scale = relative_norm(q_merge, q_base)
                tensor = tensor.to(device) / scale
                del scale
            elif layer_type == "to_q":
                scale = relative_norm(tensor, m0)
                tensor = tensor.to(device) * scale
                del scale
            slice_mask = (sets == i).bool()
            new_tensor = dare_ties_sparsification(
                model_a_param=m0,
                model_b_param=tensor,
                drop_rate=norm_ratio,
                ties="sum",
                rescale="off",
                device=device,
                **config,
            )
            # note: the slerp merge below overwrites new_tensor from the
            # dare-ties sparsification step above, so only the slerp
            # result lands in this model's slice of the output
            new_tensor = merge_tensors(
                "slerp", m0, tensor, ratio
            )
            result = torch.where(
                slice_mask, new_tensor, result
            )
            del new_tensor, slice_mask
        result_dict[k] = result
    # end of merge
    log.info(f"done merge saving to file: {model_dst}")
    # load a model shell for the destination, then swap in the
    # merged weights before saving
    out_model = (
        transformers.AutoModelForCausalLM.from_pretrained(
            model_dst, **config
        )
    )
    out_model.state_dict = lambda: result_dict
    out_model.save_pretrained(model_dst)
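
# Usage sketch for merge_model_with_ties() (the same five models and
# destination repo that run() uses below):
#
#     merge_model_with_ties(
#         models=[
#             "TinyLlama/TinyLlama-1.1B-intermediate-step-1431k-3T",
#             "Doctor-Shotgun/TinyLlama-1.1B-32k-Instruct",
#             "Doctor-Shotgun/TinyLlama-1.1B-32k",
#             "Tensoic/TinyLlama-1.1B-3T-openhermes",
#             "Josephgflowers/TinyLlama-3T-Cinder-v1.3",
#         ],
#         model_dst="matlok/tinyllama-cinder-openhermes-32k",
#     )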


def run():
    """
    Run the merge and upload the model and tokenizer.

    This requires having the Hugging Face token
    set before it will work:
    ```huggingface-cli login```
    """
    question = "why is the sky blue?"
    log.info(
        f"merging models and asking the question: {question}"
    )
    model_src = "TinyLlama/TinyLlama-1.1B-intermediate-step-1431k-3T"
    model_dst = "matlok/tinyllama-cinder-openhermes-32k"
    device = "cuda"
    config = {
        "torch_dtype": torch.float16,
        "low_cpu_mem_usage": False,
        "trust_remote_code": True,
    }
    models = [
        model_src,
        "Doctor-Shotgun/TinyLlama-1.1B-32k-Instruct",
        "Doctor-Shotgun/TinyLlama-1.1B-32k",
        "Tensoic/TinyLlama-1.1B-3T-openhermes",
        "Josephgflowers/TinyLlama-3T-Cinder-v1.3",
    ]
    merge_model_with_ties(
        models=models, model_dst=model_dst
    )
    log.info(f"loading newly-created file: {model_dst}")
    model = (
        transformers.AutoModelForCausalLM.from_pretrained(
            model_dst, **config
        )
    )
    log.info(
        f"loaded new model file: {model_dst} "
        f"asking question: {question} "
    )
    run_text_test(
        model=model,
        tokenizer_path=model_src,
        question=question,
        device=device,
    )
    # clean the temp merge dir
    # remove the local model dir to prevent issues with the tokenizer upload
    model_org = model_dst.split("/")[0]
    if os.path.exists(model_org):
        os.system(f"rm -rf ./{model_org}")
    log.info(f"uploading model: {model_dst}")
    model.push_to_hub(model_dst)
    log.info(f"uploading src tokenizer: {model_src}")
    # reload the tokenizer to save it, as shown in:
    # https://colab.research.google.com/drive/1PEQyJO1-f6j0S_XJ8DV50NkpzasXkrzd?usp=sharing#scrollTo=QQn30cRtAZ-P
    tokenizer = transformers.AutoTokenizer.from_pretrained(
        model_src, trust_remote_code=True
    )
    # https://huggingface.co/docs/transformers/model_sharing#use-the-pushtohub-function
    # tokenizer.push_to_hub("my-awesome-model")
    tokenizer.push_to_hub(model_dst)
    log.info(
        f"done loading new model: {model} "
        f"file: {model_dst}"
    )


if __name__ == "__main__":
    run()