v2 release with script and confirmed the response works end-to-end
- README.md +95 -231
- model.safetensors +1 -1
- run-tiny-merge.py +252 -0
README.md
CHANGED
@@ -2,7 +2,7 @@
 license: unknown
 ---
 
-## Merging
+## Merging AI Models like Lego Blocks
 
 This model was merged with the following HuggingFace TinyLlama models using ties:
 
@@ -20,9 +20,11 @@ Please refer to the Unsloth fine-tuning guide for:
 
 ## How do I generate my own model merges?
 
-Here's the standalone python script
+Here's [the standalone python script](https://huggingface.co/matlok/tinyllama-cinder-openhermes-32k/blob/main/run-tiny-merge.py) used with logs below:
 
 ```python3
+#!/usr/bin/env python3
+
 import transformers
 import torch
 import logging
@@ -32,6 +34,12 @@ from ddare.util import get_device
 import re
 from typing import Dict, Tuple, List
 
+# If you want to fine-tune, here's an example Unsloth fine tuning guide for:
+# Alpaca + TinyLlama + RoPE Scaling full example.ipynb
+# https://colab.research.google.com/drive/1AZghoNBQaMDgWJpi4RbffGM1h6raLUj9?usp=sharing#scrollTo=LjY75GoYUCB8
+
+# code here was refactored from gist: https://gist.github.com/maldevide/08829eada04ad9bd78e46c1a3787d42b
+
 logging.basicConfig(level=logging.INFO)
 log = logging.getLogger(__name__)
 
@@ -49,7 +57,7 @@ def get_models(
     num_models = len(models)
     for midx, model_path in enumerate(models):
         log.info(
-            f"loading model={midx}/{num_models} "
+            f"loading model={midx + 1}/{num_models} "
             f"model={model_path} "
         )
         loaded_models.append(
@@ -76,30 +84,42 @@ def pm(
 
 def run_text_test(
     model,
-
-    device: str,
+    tokenizer_path,
     question: str,
+    device: str = "cuda",
 ):
     base_model = model.to(device)
     log.info(
-        f"loading
+        f"loading tokenizer={tokenizer_path}"
     )
     tokenizer = transformers.AutoTokenizer.from_pretrained(
-
-        torch_dtype=torch.float16
+        tokenizer_path,
+        torch_dtype=torch.float16,
+    )
 
     inputs = tokenizer(
         question,
         return_tensors="pt"
-    ).to(
+    ).to(device)
     with torch.backends.cuda.sdp_kernel(
         enable_flash=True,
         enable_math=False,
         enable_mem_efficient=False
     ):
-        outputs = base_model.generate(
-
-
+        outputs = base_model.generate(
+            **inputs,
+            max_new_tokens=1000,
+        )
+    answer = tokenizer.decode(outputs[0], skip_special_tokens=True)
+    log.info(
+        "\n"
+        "----------"
+        f"tokenizer={tokenizer}\n "
+        f"question:\n{question}\n"
+        f"answer:\n{answer}\n"
+        "----------"
+    )
+    base_model = base_model.to(device)
 
 
 def get_layer_type(
@@ -144,6 +164,7 @@ def merge_model_with_ties(
             models[1].state_dict()[k],
             models[2].state_dict()[k],
             models[3].state_dict()[k],
+            models[4].state_dict()[k],
         ]
 
         # build a ratio
@@ -193,7 +214,7 @@ def merge_model_with_ties(
     # end of merge
 
     log.info(
-        f"
+        f"done merge saving to file: {model_dst}"
     )
     out_model = (
         transformers.AutoModelForCausalLM.from_pretrained(
@@ -206,9 +227,13 @@ def merge_model_with_ties(
 
 
 def run():
-
+    question = (
+        "why is the sky blue?"
+    )
+    log.info(f"merging models and asking the question: {question}")
    model_src = "TinyLlama/TinyLlama-1.1B-intermediate-step-1431k-3T"
    model_dst = "matlok/tinyllama-cinder-openhermes-32k"
+    device = "cuda"
    config = {
        'torch_dtype': torch.float16,
        'low_cpu_mem_usage': False,
@@ -230,7 +255,16 @@ def run():
         model_dst,
         **config
     )
-
+    log.info(
+        f"loaded new model file: {model_dst} "
+        f"asking question: {question} "
+    )
+    run_text_test(
+        model=model,
+        tokenizer_path=model_src,
+        question=question,
+        device=device,
+    )
     log.info(f"done loading new model: {model} file: {model_dst}")
 
 
@@ -238,7 +272,6 @@ if __name__ == "__main__":
     run()
 ```
 
-
 ### Logs
 
 Here's the logs from the code above:
@@ -248,12 +281,14 @@ Total VRAM 12282 MB, total RAM 85434 MB
 Set vram state to: NORMAL_VRAM
 Device: cuda:0 NVIDIA GeForce RTX 4070 Ti : native
 VAE dtype: torch.bfloat16
-INFO:__main__:
-INFO:__main__:loading model=
-INFO:__main__:loading model=
-
-
-INFO:__main__:loading model=
+INFO:__main__:merging models and asking the question: why is the sky blue?
+INFO:__main__:loading model=1/5 model=TinyLlama/TinyLlama-1.1B-intermediate-step-1431k-3T
+INFO:__main__:loading model=2/5 model=Doctor-Shotgun/TinyLlama-1.1B-32k-Instruct
+/d/venvs/dev/lib/python3.11/site-packages/torch/_utils.py:831: UserWarning: TypedStorage is deprecated. It will be removed in the future and UntypedStorage will be the only storage class. This should only matter to you if you are using storages directly. To access UntypedStorage directly, use tensor.untyped_storage() instead of tensor.storage()
+  return self.fget.__get__(instance, owner)()
+INFO:__main__:loading model=3/5 model=Doctor-Shotgun/TinyLlama-1.1B-32k
+INFO:__main__:loading model=4/5 model=Tensoic/TinyLlama-1.1B-3T-openhermes
+INFO:__main__:loading model=5/5 model=Josephgflowers/TinyLlama-3T-Cinder-v1.3
 INFO:__main__:model=model.embed_tokens.weight 201 shape=torch.Size([32000, 2048]) dtype=torch.float16 cpu raio=0.5 contig=True norm=0.68
 INFO:__main__:model=model.layers.0.self_attn.q_proj.weight 201 shape=torch.Size([2048, 2048]) dtype=torch.float16 cpu raio=0.5 contig=True norm=0.68
 INFO:__main__:model=model.layers.0.self_attn.k_proj.weight 201 shape=torch.Size([256, 2048]) dtype=torch.float16 cpu raio=0.5 contig=True norm=0.68
@@ -455,213 +490,42 @@ INFO:__main__:model=model.layers.21.input_layernorm.weight 201 shape=torch.Size(
 INFO:__main__:model=model.layers.21.post_attention_layernorm.weight 201 shape=torch.Size([2048]) dtype=torch.float16 cpu raio=0.5 contig=True norm=0.68
 INFO:__main__:model=model.norm.weight 201 shape=torch.Size([2048]) dtype=torch.float16 cpu raio=0.5 contig=True norm=0.68
 INFO:__main__:model=lm_head.weight 201 shape=torch.Size([32000, 2048]) dtype=torch.float16 cpu raio=0.5 contig=True norm=0.68
-INFO:__main__:
-config.json: 100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 724/724 [00:00<00:00, 6.15MB/s]
-model.safetensors: 100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 2.20G/2.20G [00:57<00:00, 38.0MB/s]
-generation_config.json: 100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 133/133 [00:00<00:00, 1.82MB/s]
+INFO:__main__:done merge saving to file: matlok/tinyllama-cinder-openhermes-32k
 INFO:__main__:loading newly-created file: matlok/tinyllama-cinder-openhermes-32k
-INFO:__main__:model
-INFO:__main__:
-
-INFO:__main__:
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-INFO:__main__: 33 model.layers.3.mlp.up_proj.weight shape=torch.Size([5632, 2048]) type=torch.float16 dev=cpu contig=True
-INFO:__main__: 34 model.layers.3.mlp.down_proj.weight shape=torch.Size([2048, 5632]) type=torch.float16 dev=cpu contig=True
-INFO:__main__: 35 model.layers.3.input_layernorm.weight shape=torch.Size([2048]) type=torch.float16 dev=cpu contig=True
-INFO:__main__: 36 model.layers.3.post_attention_layernorm.weight shape=torch.Size([2048]) type=torch.float16 dev=cpu contig=True
-INFO:__main__: 37 model.layers.4.self_attn.q_proj.weight shape=torch.Size([2048, 2048]) type=torch.float16 dev=cpu contig=True
-INFO:__main__: 38 model.layers.4.self_attn.k_proj.weight shape=torch.Size([256, 2048]) type=torch.float16 dev=cpu contig=True
-INFO:__main__: 39 model.layers.4.self_attn.v_proj.weight shape=torch.Size([256, 2048]) type=torch.float16 dev=cpu contig=True
-INFO:__main__: 40 model.layers.4.self_attn.o_proj.weight shape=torch.Size([2048, 2048]) type=torch.float16 dev=cpu contig=True
-INFO:__main__: 41 model.layers.4.mlp.gate_proj.weight shape=torch.Size([5632, 2048]) type=torch.float16 dev=cpu contig=True
-INFO:__main__: 42 model.layers.4.mlp.up_proj.weight shape=torch.Size([5632, 2048]) type=torch.float16 dev=cpu contig=True
-INFO:__main__: 43 model.layers.4.mlp.down_proj.weight shape=torch.Size([2048, 5632]) type=torch.float16 dev=cpu contig=True
-INFO:__main__: 44 model.layers.4.input_layernorm.weight shape=torch.Size([2048]) type=torch.float16 dev=cpu contig=True
-INFO:__main__: 45 model.layers.4.post_attention_layernorm.weight shape=torch.Size([2048]) type=torch.float16 dev=cpu contig=True
-INFO:__main__: 46 model.layers.5.self_attn.q_proj.weight shape=torch.Size([2048, 2048]) type=torch.float16 dev=cpu contig=True
-INFO:__main__: 47 model.layers.5.self_attn.k_proj.weight shape=torch.Size([256, 2048]) type=torch.float16 dev=cpu contig=True
-INFO:__main__: 48 model.layers.5.self_attn.v_proj.weight shape=torch.Size([256, 2048]) type=torch.float16 dev=cpu contig=True
-INFO:__main__: 49 model.layers.5.self_attn.o_proj.weight shape=torch.Size([2048, 2048]) type=torch.float16 dev=cpu contig=True
-INFO:__main__: 50 model.layers.5.mlp.gate_proj.weight shape=torch.Size([5632, 2048]) type=torch.float16 dev=cpu contig=True
-INFO:__main__: 51 model.layers.5.mlp.up_proj.weight shape=torch.Size([5632, 2048]) type=torch.float16 dev=cpu contig=True
-INFO:__main__: 52 model.layers.5.mlp.down_proj.weight shape=torch.Size([2048, 5632]) type=torch.float16 dev=cpu contig=True
-INFO:__main__: 53 model.layers.5.input_layernorm.weight shape=torch.Size([2048]) type=torch.float16 dev=cpu contig=True
-INFO:__main__: 54 model.layers.5.post_attention_layernorm.weight shape=torch.Size([2048]) type=torch.float16 dev=cpu contig=True
-INFO:__main__: 55 model.layers.6.self_attn.q_proj.weight shape=torch.Size([2048, 2048]) type=torch.float16 dev=cpu contig=True
-INFO:__main__: 56 model.layers.6.self_attn.k_proj.weight shape=torch.Size([256, 2048]) type=torch.float16 dev=cpu contig=True
-INFO:__main__: 57 model.layers.6.self_attn.v_proj.weight shape=torch.Size([256, 2048]) type=torch.float16 dev=cpu contig=True
-INFO:__main__: 58 model.layers.6.self_attn.o_proj.weight shape=torch.Size([2048, 2048]) type=torch.float16 dev=cpu contig=True
-INFO:__main__: 59 model.layers.6.mlp.gate_proj.weight shape=torch.Size([5632, 2048]) type=torch.float16 dev=cpu contig=True
-INFO:__main__: 60 model.layers.6.mlp.up_proj.weight shape=torch.Size([5632, 2048]) type=torch.float16 dev=cpu contig=True
-INFO:__main__: 61 model.layers.6.mlp.down_proj.weight shape=torch.Size([2048, 5632]) type=torch.float16 dev=cpu contig=True
-INFO:__main__: 62 model.layers.6.input_layernorm.weight shape=torch.Size([2048]) type=torch.float16 dev=cpu contig=True
-INFO:__main__: 63 model.layers.6.post_attention_layernorm.weight shape=torch.Size([2048]) type=torch.float16 dev=cpu contig=True
-INFO:__main__: 64 model.layers.7.self_attn.q_proj.weight shape=torch.Size([2048, 2048]) type=torch.float16 dev=cpu contig=True
-INFO:__main__: 65 model.layers.7.self_attn.k_proj.weight shape=torch.Size([256, 2048]) type=torch.float16 dev=cpu contig=True
-INFO:__main__: 66 model.layers.7.self_attn.v_proj.weight shape=torch.Size([256, 2048]) type=torch.float16 dev=cpu contig=True
-INFO:__main__: 67 model.layers.7.self_attn.o_proj.weight shape=torch.Size([2048, 2048]) type=torch.float16 dev=cpu contig=True
-INFO:__main__: 68 model.layers.7.mlp.gate_proj.weight shape=torch.Size([5632, 2048]) type=torch.float16 dev=cpu contig=True
-INFO:__main__: 69 model.layers.7.mlp.up_proj.weight shape=torch.Size([5632, 2048]) type=torch.float16 dev=cpu contig=True
-INFO:__main__: 70 model.layers.7.mlp.down_proj.weight shape=torch.Size([2048, 5632]) type=torch.float16 dev=cpu contig=True
-INFO:__main__: 71 model.layers.7.input_layernorm.weight shape=torch.Size([2048]) type=torch.float16 dev=cpu contig=True
-INFO:__main__: 72 model.layers.7.post_attention_layernorm.weight shape=torch.Size([2048]) type=torch.float16 dev=cpu contig=True
-INFO:__main__: 73 model.layers.8.self_attn.q_proj.weight shape=torch.Size([2048, 2048]) type=torch.float16 dev=cpu contig=True
-INFO:__main__: 74 model.layers.8.self_attn.k_proj.weight shape=torch.Size([256, 2048]) type=torch.float16 dev=cpu contig=True
-INFO:__main__: 75 model.layers.8.self_attn.v_proj.weight shape=torch.Size([256, 2048]) type=torch.float16 dev=cpu contig=True
-INFO:__main__: 76 model.layers.8.self_attn.o_proj.weight shape=torch.Size([2048, 2048]) type=torch.float16 dev=cpu contig=True
-INFO:__main__: 77 model.layers.8.mlp.gate_proj.weight shape=torch.Size([5632, 2048]) type=torch.float16 dev=cpu contig=True
-INFO:__main__: 78 model.layers.8.mlp.up_proj.weight shape=torch.Size([5632, 2048]) type=torch.float16 dev=cpu contig=True
-INFO:__main__: 79 model.layers.8.mlp.down_proj.weight shape=torch.Size([2048, 5632]) type=torch.float16 dev=cpu contig=True
-INFO:__main__: 80 model.layers.8.input_layernorm.weight shape=torch.Size([2048]) type=torch.float16 dev=cpu contig=True
-INFO:__main__: 81 model.layers.8.post_attention_layernorm.weight shape=torch.Size([2048]) type=torch.float16 dev=cpu contig=True
-INFO:__main__: 82 model.layers.9.self_attn.q_proj.weight shape=torch.Size([2048, 2048]) type=torch.float16 dev=cpu contig=True
-INFO:__main__: 83 model.layers.9.self_attn.k_proj.weight shape=torch.Size([256, 2048]) type=torch.float16 dev=cpu contig=True
-INFO:__main__: 84 model.layers.9.self_attn.v_proj.weight shape=torch.Size([256, 2048]) type=torch.float16 dev=cpu contig=True
-INFO:__main__: 85 model.layers.9.self_attn.o_proj.weight shape=torch.Size([2048, 2048]) type=torch.float16 dev=cpu contig=True
-INFO:__main__: 86 model.layers.9.mlp.gate_proj.weight shape=torch.Size([5632, 2048]) type=torch.float16 dev=cpu contig=True
-INFO:__main__: 87 model.layers.9.mlp.up_proj.weight shape=torch.Size([5632, 2048]) type=torch.float16 dev=cpu contig=True
-INFO:__main__: 88 model.layers.9.mlp.down_proj.weight shape=torch.Size([2048, 5632]) type=torch.float16 dev=cpu contig=True
-INFO:__main__: 89 model.layers.9.input_layernorm.weight shape=torch.Size([2048]) type=torch.float16 dev=cpu contig=True
-INFO:__main__: 90 model.layers.9.post_attention_layernorm.weight shape=torch.Size([2048]) type=torch.float16 dev=cpu contig=True
-INFO:__main__: 91 model.layers.10.self_attn.q_proj.weight shape=torch.Size([2048, 2048]) type=torch.float16 dev=cpu contig=True
-INFO:__main__: 92 model.layers.10.self_attn.k_proj.weight shape=torch.Size([256, 2048]) type=torch.float16 dev=cpu contig=True
-INFO:__main__: 93 model.layers.10.self_attn.v_proj.weight shape=torch.Size([256, 2048]) type=torch.float16 dev=cpu contig=True
-INFO:__main__: 94 model.layers.10.self_attn.o_proj.weight shape=torch.Size([2048, 2048]) type=torch.float16 dev=cpu contig=True
-INFO:__main__: 95 model.layers.10.mlp.gate_proj.weight shape=torch.Size([5632, 2048]) type=torch.float16 dev=cpu contig=True
-INFO:__main__: 96 model.layers.10.mlp.up_proj.weight shape=torch.Size([5632, 2048]) type=torch.float16 dev=cpu contig=True
-INFO:__main__: 97 model.layers.10.mlp.down_proj.weight shape=torch.Size([2048, 5632]) type=torch.float16 dev=cpu contig=True
-INFO:__main__: 98 model.layers.10.input_layernorm.weight shape=torch.Size([2048]) type=torch.float16 dev=cpu contig=True
-INFO:__main__: 99 model.layers.10.post_attention_layernorm.weight shape=torch.Size([2048]) type=torch.float16 dev=cpu contig=True
-INFO:__main__:100 model.layers.11.self_attn.q_proj.weight shape=torch.Size([2048, 2048]) type=torch.float16 dev=cpu contig=True
-INFO:__main__:101 model.layers.11.self_attn.k_proj.weight shape=torch.Size([256, 2048]) type=torch.float16 dev=cpu contig=True
-INFO:__main__:102 model.layers.11.self_attn.v_proj.weight shape=torch.Size([256, 2048]) type=torch.float16 dev=cpu contig=True
-INFO:__main__:103 model.layers.11.self_attn.o_proj.weight shape=torch.Size([2048, 2048]) type=torch.float16 dev=cpu contig=True
-INFO:__main__:104 model.layers.11.mlp.gate_proj.weight shape=torch.Size([5632, 2048]) type=torch.float16 dev=cpu contig=True
-INFO:__main__:105 model.layers.11.mlp.up_proj.weight shape=torch.Size([5632, 2048]) type=torch.float16 dev=cpu contig=True
-INFO:__main__:106 model.layers.11.mlp.down_proj.weight shape=torch.Size([2048, 5632]) type=torch.float16 dev=cpu contig=True
-INFO:__main__:107 model.layers.11.input_layernorm.weight shape=torch.Size([2048]) type=torch.float16 dev=cpu contig=True
-INFO:__main__:108 model.layers.11.post_attention_layernorm.weight shape=torch.Size([2048]) type=torch.float16 dev=cpu contig=True
-INFO:__main__:109 model.layers.12.self_attn.q_proj.weight shape=torch.Size([2048, 2048]) type=torch.float16 dev=cpu contig=True
-INFO:__main__:110 model.layers.12.self_attn.k_proj.weight shape=torch.Size([256, 2048]) type=torch.float16 dev=cpu contig=True
-INFO:__main__:111 model.layers.12.self_attn.v_proj.weight shape=torch.Size([256, 2048]) type=torch.float16 dev=cpu contig=True
-INFO:__main__:112 model.layers.12.self_attn.o_proj.weight shape=torch.Size([2048, 2048]) type=torch.float16 dev=cpu contig=True
-INFO:__main__:113 model.layers.12.mlp.gate_proj.weight shape=torch.Size([5632, 2048]) type=torch.float16 dev=cpu contig=True
-INFO:__main__:114 model.layers.12.mlp.up_proj.weight shape=torch.Size([5632, 2048]) type=torch.float16 dev=cpu contig=True
-INFO:__main__:115 model.layers.12.mlp.down_proj.weight shape=torch.Size([2048, 5632]) type=torch.float16 dev=cpu contig=True
-INFO:__main__:116 model.layers.12.input_layernorm.weight shape=torch.Size([2048]) type=torch.float16 dev=cpu contig=True
-INFO:__main__:117 model.layers.12.post_attention_layernorm.weight shape=torch.Size([2048]) type=torch.float16 dev=cpu contig=True
-INFO:__main__:118 model.layers.13.self_attn.q_proj.weight shape=torch.Size([2048, 2048]) type=torch.float16 dev=cpu contig=True
-INFO:__main__:119 model.layers.13.self_attn.k_proj.weight shape=torch.Size([256, 2048]) type=torch.float16 dev=cpu contig=True
-INFO:__main__:120 model.layers.13.self_attn.v_proj.weight shape=torch.Size([256, 2048]) type=torch.float16 dev=cpu contig=True
-INFO:__main__:121 model.layers.13.self_attn.o_proj.weight shape=torch.Size([2048, 2048]) type=torch.float16 dev=cpu contig=True
-INFO:__main__:122 model.layers.13.mlp.gate_proj.weight shape=torch.Size([5632, 2048]) type=torch.float16 dev=cpu contig=True
-INFO:__main__:123 model.layers.13.mlp.up_proj.weight shape=torch.Size([5632, 2048]) type=torch.float16 dev=cpu contig=True
-INFO:__main__:124 model.layers.13.mlp.down_proj.weight shape=torch.Size([2048, 5632]) type=torch.float16 dev=cpu contig=True
-INFO:__main__:125 model.layers.13.input_layernorm.weight shape=torch.Size([2048]) type=torch.float16 dev=cpu contig=True
-INFO:__main__:126 model.layers.13.post_attention_layernorm.weight shape=torch.Size([2048]) type=torch.float16 dev=cpu contig=True
-INFO:__main__:127 model.layers.14.self_attn.q_proj.weight shape=torch.Size([2048, 2048]) type=torch.float16 dev=cpu contig=True
-INFO:__main__:128 model.layers.14.self_attn.k_proj.weight shape=torch.Size([256, 2048]) type=torch.float16 dev=cpu contig=True
-INFO:__main__:129 model.layers.14.self_attn.v_proj.weight shape=torch.Size([256, 2048]) type=torch.float16 dev=cpu contig=True
-INFO:__main__:130 model.layers.14.self_attn.o_proj.weight shape=torch.Size([2048, 2048]) type=torch.float16 dev=cpu contig=True
-INFO:__main__:131 model.layers.14.mlp.gate_proj.weight shape=torch.Size([5632, 2048]) type=torch.float16 dev=cpu contig=True
-INFO:__main__:132 model.layers.14.mlp.up_proj.weight shape=torch.Size([5632, 2048]) type=torch.float16 dev=cpu contig=True
-INFO:__main__:133 model.layers.14.mlp.down_proj.weight shape=torch.Size([2048, 5632]) type=torch.float16 dev=cpu contig=True
-INFO:__main__:134 model.layers.14.input_layernorm.weight shape=torch.Size([2048]) type=torch.float16 dev=cpu contig=True
-INFO:__main__:135 model.layers.14.post_attention_layernorm.weight shape=torch.Size([2048]) type=torch.float16 dev=cpu contig=True
-INFO:__main__:136 model.layers.15.self_attn.q_proj.weight shape=torch.Size([2048, 2048]) type=torch.float16 dev=cpu contig=True
-INFO:__main__:137 model.layers.15.self_attn.k_proj.weight shape=torch.Size([256, 2048]) type=torch.float16 dev=cpu contig=True
-INFO:__main__:138 model.layers.15.self_attn.v_proj.weight shape=torch.Size([256, 2048]) type=torch.float16 dev=cpu contig=True
-INFO:__main__:139 model.layers.15.self_attn.o_proj.weight shape=torch.Size([2048, 2048]) type=torch.float16 dev=cpu contig=True
-INFO:__main__:140 model.layers.15.mlp.gate_proj.weight shape=torch.Size([5632, 2048]) type=torch.float16 dev=cpu contig=True
-INFO:__main__:141 model.layers.15.mlp.up_proj.weight shape=torch.Size([5632, 2048]) type=torch.float16 dev=cpu contig=True
-INFO:__main__:142 model.layers.15.mlp.down_proj.weight shape=torch.Size([2048, 5632]) type=torch.float16 dev=cpu contig=True
-INFO:__main__:143 model.layers.15.input_layernorm.weight shape=torch.Size([2048]) type=torch.float16 dev=cpu contig=True
-INFO:__main__:144 model.layers.15.post_attention_layernorm.weight shape=torch.Size([2048]) type=torch.float16 dev=cpu contig=True
-INFO:__main__:145 model.layers.16.self_attn.q_proj.weight shape=torch.Size([2048, 2048]) type=torch.float16 dev=cpu contig=True
-INFO:__main__:146 model.layers.16.self_attn.k_proj.weight shape=torch.Size([256, 2048]) type=torch.float16 dev=cpu contig=True
-INFO:__main__:147 model.layers.16.self_attn.v_proj.weight shape=torch.Size([256, 2048]) type=torch.float16 dev=cpu contig=True
-INFO:__main__:148 model.layers.16.self_attn.o_proj.weight shape=torch.Size([2048, 2048]) type=torch.float16 dev=cpu contig=True
-INFO:__main__:149 model.layers.16.mlp.gate_proj.weight shape=torch.Size([5632, 2048]) type=torch.float16 dev=cpu contig=True
-INFO:__main__:150 model.layers.16.mlp.up_proj.weight shape=torch.Size([5632, 2048]) type=torch.float16 dev=cpu contig=True
-INFO:__main__:151 model.layers.16.mlp.down_proj.weight shape=torch.Size([2048, 5632]) type=torch.float16 dev=cpu contig=True
-INFO:__main__:152 model.layers.16.input_layernorm.weight shape=torch.Size([2048]) type=torch.float16 dev=cpu contig=True
-INFO:__main__:153 model.layers.16.post_attention_layernorm.weight shape=torch.Size([2048]) type=torch.float16 dev=cpu contig=True
-INFO:__main__:154 model.layers.17.self_attn.q_proj.weight shape=torch.Size([2048, 2048]) type=torch.float16 dev=cpu contig=True
-INFO:__main__:155 model.layers.17.self_attn.k_proj.weight shape=torch.Size([256, 2048]) type=torch.float16 dev=cpu contig=True
-INFO:__main__:156 model.layers.17.self_attn.v_proj.weight shape=torch.Size([256, 2048]) type=torch.float16 dev=cpu contig=True
-INFO:__main__:157 model.layers.17.self_attn.o_proj.weight shape=torch.Size([2048, 2048]) type=torch.float16 dev=cpu contig=True
-INFO:__main__:158 model.layers.17.mlp.gate_proj.weight shape=torch.Size([5632, 2048]) type=torch.float16 dev=cpu contig=True
-INFO:__main__:159 model.layers.17.mlp.up_proj.weight shape=torch.Size([5632, 2048]) type=torch.float16 dev=cpu contig=True
-INFO:__main__:160 model.layers.17.mlp.down_proj.weight shape=torch.Size([2048, 5632]) type=torch.float16 dev=cpu contig=True
-INFO:__main__:161 model.layers.17.input_layernorm.weight shape=torch.Size([2048]) type=torch.float16 dev=cpu contig=True
-INFO:__main__:162 model.layers.17.post_attention_layernorm.weight shape=torch.Size([2048]) type=torch.float16 dev=cpu contig=True
-INFO:__main__:163 model.layers.18.self_attn.q_proj.weight shape=torch.Size([2048, 2048]) type=torch.float16 dev=cpu contig=True
-INFO:__main__:164 model.layers.18.self_attn.k_proj.weight shape=torch.Size([256, 2048]) type=torch.float16 dev=cpu contig=True
-INFO:__main__:165 model.layers.18.self_attn.v_proj.weight shape=torch.Size([256, 2048]) type=torch.float16 dev=cpu contig=True
-INFO:__main__:166 model.layers.18.self_attn.o_proj.weight shape=torch.Size([2048, 2048]) type=torch.float16 dev=cpu contig=True
-INFO:__main__:167 model.layers.18.mlp.gate_proj.weight shape=torch.Size([5632, 2048]) type=torch.float16 dev=cpu contig=True
-INFO:__main__:168 model.layers.18.mlp.up_proj.weight shape=torch.Size([5632, 2048]) type=torch.float16 dev=cpu contig=True
-INFO:__main__:169 model.layers.18.mlp.down_proj.weight shape=torch.Size([2048, 5632]) type=torch.float16 dev=cpu contig=True
-INFO:__main__:170 model.layers.18.input_layernorm.weight shape=torch.Size([2048]) type=torch.float16 dev=cpu contig=True
-INFO:__main__:171 model.layers.18.post_attention_layernorm.weight shape=torch.Size([2048]) type=torch.float16 dev=cpu contig=True
-INFO:__main__:172 model.layers.19.self_attn.q_proj.weight shape=torch.Size([2048, 2048]) type=torch.float16 dev=cpu contig=True
-INFO:__main__:173 model.layers.19.self_attn.k_proj.weight shape=torch.Size([256, 2048]) type=torch.float16 dev=cpu contig=True
-INFO:__main__:174 model.layers.19.self_attn.v_proj.weight shape=torch.Size([256, 2048]) type=torch.float16 dev=cpu contig=True
-INFO:__main__:175 model.layers.19.self_attn.o_proj.weight shape=torch.Size([2048, 2048]) type=torch.float16 dev=cpu contig=True
-INFO:__main__:176 model.layers.19.mlp.gate_proj.weight shape=torch.Size([5632, 2048]) type=torch.float16 dev=cpu contig=True
-INFO:__main__:177 model.layers.19.mlp.up_proj.weight shape=torch.Size([5632, 2048]) type=torch.float16 dev=cpu contig=True
-INFO:__main__:178 model.layers.19.mlp.down_proj.weight shape=torch.Size([2048, 5632]) type=torch.float16 dev=cpu contig=True
-INFO:__main__:179 model.layers.19.input_layernorm.weight shape=torch.Size([2048]) type=torch.float16 dev=cpu contig=True
-INFO:__main__:180 model.layers.19.post_attention_layernorm.weight shape=torch.Size([2048]) type=torch.float16 dev=cpu contig=True
-INFO:__main__:181 model.layers.20.self_attn.q_proj.weight shape=torch.Size([2048, 2048]) type=torch.float16 dev=cpu contig=True
-INFO:__main__:182 model.layers.20.self_attn.k_proj.weight shape=torch.Size([256, 2048]) type=torch.float16 dev=cpu contig=True
-INFO:__main__:183 model.layers.20.self_attn.v_proj.weight shape=torch.Size([256, 2048]) type=torch.float16 dev=cpu contig=True
-INFO:__main__:184 model.layers.20.self_attn.o_proj.weight shape=torch.Size([2048, 2048]) type=torch.float16 dev=cpu contig=True
-INFO:__main__:185 model.layers.20.mlp.gate_proj.weight shape=torch.Size([5632, 2048]) type=torch.float16 dev=cpu contig=True
-INFO:__main__:186 model.layers.20.mlp.up_proj.weight shape=torch.Size([5632, 2048]) type=torch.float16 dev=cpu contig=True
-INFO:__main__:187 model.layers.20.mlp.down_proj.weight shape=torch.Size([2048, 5632]) type=torch.float16 dev=cpu contig=True
-INFO:__main__:188 model.layers.20.input_layernorm.weight shape=torch.Size([2048]) type=torch.float16 dev=cpu contig=True
-INFO:__main__:189 model.layers.20.post_attention_layernorm.weight shape=torch.Size([2048]) type=torch.float16 dev=cpu contig=True
-INFO:__main__:190 model.layers.21.self_attn.q_proj.weight shape=torch.Size([2048, 2048]) type=torch.float16 dev=cpu contig=True
-INFO:__main__:191 model.layers.21.self_attn.k_proj.weight shape=torch.Size([256, 2048]) type=torch.float16 dev=cpu contig=True
-INFO:__main__:192 model.layers.21.self_attn.v_proj.weight shape=torch.Size([256, 2048]) type=torch.float16 dev=cpu contig=True
-INFO:__main__:193 model.layers.21.self_attn.o_proj.weight shape=torch.Size([2048, 2048]) type=torch.float16 dev=cpu contig=True
-INFO:__main__:194 model.layers.21.mlp.gate_proj.weight shape=torch.Size([5632, 2048]) type=torch.float16 dev=cpu contig=True
-INFO:__main__:195 model.layers.21.mlp.up_proj.weight shape=torch.Size([5632, 2048]) type=torch.float16 dev=cpu contig=True
-INFO:__main__:196 model.layers.21.mlp.down_proj.weight shape=torch.Size([2048, 5632]) type=torch.float16 dev=cpu contig=True
-INFO:__main__:197 model.layers.21.input_layernorm.weight shape=torch.Size([2048]) type=torch.float16 dev=cpu contig=True
-INFO:__main__:198 model.layers.21.post_attention_layernorm.weight shape=torch.Size([2048]) type=torch.float16 dev=cpu contig=True
-INFO:__main__:199 model.norm.weight shape=torch.Size([2048]) type=torch.float16 dev=cpu contig=True
-INFO:__main__:200 lm_head.weight shape=torch.Size([32000, 2048]) type=torch.float16 dev=cpu contig=True
+INFO:__main__:loaded new model file: matlok/tinyllama-cinder-openhermes-32k asking question: why is the sky blue?
+INFO:__main__:loading tokenizer=TinyLlama/TinyLlama-1.1B-intermediate-step-1431k-3T
+Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
+INFO:__main__:
+----------tokenizer=LlamaTokenizerFast(name_or_path='TinyLlama/TinyLlama-1.1B-intermediate-step-1431k-3T', vocab_size=32000, model_max_length=1000000000000000019884624838656, is_fast=True, padding_side='right', truncation_side='right', special_tokens={'bos_token': '<s>', 'eos_token': '</s>', 'unk_token': '<unk>'}, clean_up_tokenization_spaces=False), added_tokens_decoder={
+0: AddedToken("<unk>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
+1: AddedToken("<s>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
+2: AddedToken("</s>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
+}
+question:
+why is the sky blue?
+answer:
+why is the sky blue?
+The sky is blue because it is made up of the colors of the visible spectrum. The visible spectrum is a range of colors that can be seen with the naked eye. The colors in the visible spectrum are made up of light waves that are shorter than the wavelengths of the visible light. The shorter wavelengths of light are absorbed more easily by the atmosphere, which is why the sky is blue.
+What is the color of the sky?
+The color of the sky is blue. This is because the visible spectrum is made up of the colors of the blue and violet parts of the spectrum. The blue part of the spectrum is made up of light waves that are shorter than the wavelengths of the visible light. The violet part of the spectrum is made up of light waves that are longer than the wavelengths of the visible light.
+What is the color of the sky in the winter?
+The color of the sky in the winter is usually a deep blue. This is because the visible spectrum is made up of the colors of the blue and violet parts of the spectrum. The blue part of the spectrum is made up of light waves that are shorter than the wavelengths of the visible light. The violet part of the spectrum is made up of light waves that are longer than the wavelengths of the visible light.
+What is the color of the sky in the summer?
+The color of the sky in the summer is usually a bright yellow. This is because the visible spectrum is made up of the colors of the yellow and orange parts of the spectrum. The yellow part of the spectrum is made up of light waves that are shorter than the wavelengths of the visible light. The orange part of the spectrum is made up of light waves that are longer than the wavelengths of the visible light.
+What is the color of the sky in the spring?
+The color of the sky in the spring is usually a bright green. This is because the visible spectrum is made up of the colors of the green and blue parts of the spectrum. The green part of the spectrum is made up of light waves that are shorter than the wavelengths of the visible light. The blue part of the spectrum is made up of light waves that are longer than the wavelengths of the visible light.
+What is the color of the sky in the fall?
+The color of the sky in the fall is usually a deep red. This is because the visible spectrum is made up of the colors of the red and orange parts of the spectrum. The red part of the spectrum is made up of light waves that are shorter than the wavelengths of the visible light. The orange part of the spectrum is made up of light waves that are longer than the wavelengths of the visible light.
+What is the color of the sky in the winter?
+The color of the sky in the winter is usually a deep blue. This is because the visible spectrum is made up of the colors of the blue and violet parts of the spectrum. The blue part of the spectrum is made up of light waves that are shorter than the wavelengths of the visible light. The violet part of the spectrum is made up of light waves that are longer than the wavelengths of the visible light.
+What is the color of the sky in the summer?
+The color of the sky in the summer is usually a bright yellow. This is because the visible spectrum is made up of the colors of the yellow and orange parts of the spectrum. The yellow part of the spectrum is made up of light waves that are shorter than the wavelengths of the visible light. The orange part of the spectrum is made up of light waves that are longer than the wavelengths of the visible light.
+What is the color of the sky in the spring?
+The color of the sky in the spring is usually a bright green. This is because the visible spectrum is made up of the colors of the green and blue parts of the spectrum. The green part of the spectrum is made up of light waves that are shorter than the wavelengths of the visible light. The blue part of the spectrum is made up of light waves that are longer than the wavelengths of the visible light.
+What is the color of the sky in the fall?
+The color of the sky in the fall is usually a deep red. This is because the visible spectrum is made up of the colors of the red and orange parts of the spectrum. The red part of the spectrum is made up of light waves that are shorter than the wavelengths of the visible light. The orange part of the spectrum is made up of light waves that are longer than the wavelengths of the visible light.
+What is the color of the
+----------
 INFO:__main__:done loading new model: LlamaForCausalLM(
   (model): LlamaModel(
     (embed_tokens): Embedding(32000, 2048)
@@ -689,9 +553,9 @@ INFO:__main__:done loading new model: LlamaForCausalLM(
   (lm_head): Linear(in_features=2048, out_features=32000, bias=False)
 ) file: matlok/tinyllama-cinder-openhermes-32k
 
-real
-user
-sys 0m14.
+real 0m49.612s
+user 3m2.617s
+sys 0m14.655s
 ```
 
 Note: code sample above was modified from [this very helpful GitHub gist](https://gist.github.com/maldevide/08829eada04ad9bd78e46c1a3787d42b)
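A quick note on what the merge loop in the script above actually does: `divide_tensor_into_sets` assigns every element of each weight tensor to one of four random sets, and each donor model then contributes values only inside its own slice (the `torch.where` call). For each donor, a candidate tensor is first drafted with ddare's `dare_ties_sparsification` (drop a random `drop_rate` fraction of the donor's delta against the base, `ties="sum"`, no rescaling) and then, as the code is written, immediately replaced by a `merge_tensors("slerp", ...)` interpolation at the per-layer ratio. Below is a minimal, self-contained sketch of that flow on toy tensors; `dare_sparsify` and `lerp` are simplified stand-ins invented here for illustration, not the ddare implementations (ddare's slerp interpolates spherically, and its DARE/TIES helpers also support rescaling and sign elections):

```python3
import torch


def dare_sparsify(base: torch.Tensor, donor: torch.Tensor,
                  drop_rate: float = 0.68) -> torch.Tensor:
    # DARE-style draft: keep a random (1 - drop_rate) fraction of the
    # donor's delta against the base and add it back onto the base
    delta = donor - base
    keep = (torch.rand_like(delta) > drop_rate).to(delta.dtype)
    return base + delta * keep


def lerp(a: torch.Tensor, b: torch.Tensor, ratio: float) -> torch.Tensor:
    # linear stand-in for ddare's merge_tensors("slerp", a, b, ratio)
    return (1.0 - ratio) * a + ratio * b


base = torch.zeros(4, 4)
donors = [torch.full((4, 4), float(i + 1)) for i in range(4)]

# like divide_tensor_into_sets: each donor owns a random slice of the tensor
sets = torch.randint(0, len(donors), base.shape)

result = base.clone()
for i, donor in enumerate(donors):
    candidate = dare_sparsify(base, donor)  # sparsified draft...
    candidate = lerp(base, donor, 0.5)      # ...then overwritten by the
                                            # interpolation, mirroring the loop
    result = torch.where(sets == i, candidate, result)
print(result)
```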
model.safetensors
CHANGED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:
+oid sha256:cbdca20af4eed297d35d3ec8b116884a6c83c4f83109b7b9f7ffd37f71af04b2
 size 2200119664
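The `model.safetensors` change above is only the git-lfs pointer file: `oid sha256:...` is the SHA-256 digest of the actual 2.2 GB weights blob, and `size` is its byte count. That makes it easy to verify a download locally; a small sketch, assuming `model.safetensors` has been pulled into the current directory:

```python3
import hashlib
import os

EXPECTED_OID = "cbdca20af4eed297d35d3ec8b116884a6c83c4f83109b7b9f7ffd37f71af04b2"
EXPECTED_SIZE = 2200119664


def sha256_of(path: str, chunk_size: int = 1 << 20) -> str:
    # stream the file so the 2.2 GB blob never has to fit in memory
    h = hashlib.sha256()
    with open(path, "rb") as f:
        for chunk in iter(lambda: f.read(chunk_size), b""):
            h.update(chunk)
    return h.hexdigest()


path = "model.safetensors"
print("size ok:", os.path.getsize(path) == EXPECTED_SIZE)
print("oid ok:", sha256_of(path) == EXPECTED_OID)
```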
run-tiny-merge.py
ADDED
@@ -0,0 +1,252 @@
+#!/usr/bin/env python3
+
+import transformers
+import torch
+import logging
+from ddare.merge import merge_tensors
+from ddare.tensor import (
+    dare_ties_sparsification,
+    relative_norm,
+    divide_tensor_into_sets
+)
+from ddare.util import get_device
+import re
+from typing import Dict, Tuple, List
+
+# If you want to fine-tune, here's an example Unsloth fine tuning guide for:
+# Alpaca + TinyLlama + RoPE Scaling full example.ipynb
+# https://colab.research.google.com/drive/1AZghoNBQaMDgWJpi4RbffGM1h6raLUj9?usp=sharing
+
+# code here was refactored from gist:
+# https://gist.github.com/maldevide/08829eada04ad9bd78e46c1a3787d42b
+
+logging.basicConfig(level=logging.INFO)
+log = logging.getLogger(__name__)
+
+
+def get_models(
+    models: List[str],
+    trust_remote_code: bool,
+):
+    config = {
+        'torch_dtype': torch.float16,
+        'low_cpu_mem_usage': False,
+        'trust_remote_code': trust_remote_code,
+    }
+    loaded_models = []
+    num_models = len(models)
+    for midx, model_path in enumerate(models):
+        log.info(
+            f"loading model={midx + 1}/{num_models} "
+            f"model={model_path} "
+        )
+        loaded_models.append(
+            transformers.AutoModelForCausalLM.from_pretrained(
+                model_path,
+                **config
+            )
+        )
+    return loaded_models
+
+
+def pm(
+    model,
+):
+    keys = model.state_dict().keys()
+    log.info(f"model keys={len(keys)}")
+    for i, k in enumerate(keys):
+        tensor = model.state_dict()[k]
+        log.info(
+            f"{i:3d} {k} shape={tensor.shape} "
+            f"type={tensor.dtype} dev={tensor.device} "
+            f"contig={tensor.is_contiguous()}")
+
+
+def run_text_test(
+    model,
+    tokenizer_path,
+    question: str,
+    device: str = "cuda",
+):
+    base_model = model.to(device)
+    log.info(
+        f"loading tokenizer={tokenizer_path}"
+    )
+    tokenizer = transformers.AutoTokenizer.from_pretrained(
+        tokenizer_path,
+        torch_dtype=torch.float16,
+    )
+
+    inputs = tokenizer(
+        question,
+        return_tensors="pt"
+    ).to(device)
+    with torch.backends.cuda.sdp_kernel(
+        enable_flash=True,
+        enable_math=False,
+        enable_mem_efficient=False
+    ):
+        outputs = base_model.generate(
+            **inputs,
+            max_new_tokens=1000,
+        )
+    answer = tokenizer.decode(outputs[0], skip_special_tokens=True)
+    log.info(
+        "\n"
+        "----------"
+        f"tokenizer={tokenizer}\n "
+        f"question:\n{question}\n"
+        f"answer:\n{answer}\n"
+        "----------"
+    )
+    base_model = base_model.to(device)
+
+
+def get_layer_type(
+    key: str
+) -> Tuple[int, str]:
+    matcher = re.compile(r"model.layers.(\d+).(.+)")
+    m = matcher.match(key)
+    if m is None:
+        if "model.norm.weight" == key:
+            return -1, "norm"
+        if "model.embed_tokens.weight" == key:
+            return -1, "embed"
+        if "lm_head.weight" == key:
+            return -1, "head"
+        log.info(f"Unknown key {key}")
+        return -1, "unknown"
+    return int(m.group(1)), m.group(2)
+
+
+def merge_model_with_ties(
+    models: List[str],
+    model_dst: str,
+    trust_remote_code: bool = True
+):
+    models = get_models(
+        models=models,
+        trust_remote_code=trust_remote_code,
+    )
+    config = {}
+    result_dict: Dict[str, torch.Tensor] = {}
+    device = get_device()
+    keys = models[0].state_dict().keys()
+    num_keys = len(keys)
+    for k in keys:
+        block, layer_type = get_layer_type(k)
+        m0: torch.Tensor = models[0].state_dict()[k]
+        result = m0.clone()
+        sets = divide_tensor_into_sets(tensor=m0, n_sets=4)
+
+        # get the src layers to merge
+        m = [
+            models[1].state_dict()[k],
+            models[2].state_dict()[k],
+            models[3].state_dict()[k],
+            models[4].state_dict()[k],
+        ]
+
+        # build a ratio
+        ratio = {
+            'to_q': 0.0,
+            'to_k': 0.0,
+            'to_v': 0.0,
+        }.get(layer_type, .5)
+
+        norm_ratio = 0.68
+        log.info(
+            f"model={k} {num_keys} shape={m0.shape} "
+            f"dtype={m0.dtype} {m0.device} "
+            f"raio={ratio} "
+            f"contig={m0.is_contiguous()} "
+            f"norm={norm_ratio}")
+
+        # for all tensors
+        for i, tensor in enumerate(m):
+            if layer_type == "to_k":
+                # Get to_q key
+                q_base = models[0].state_dict()[k.replace("to_k", "to_q")]
+                q_merge = models[i].state_dict()[k.replace("to_k", "to_q")]
+                scale = relative_norm(q_merge, q_base)
+                tensor = tensor.to(device) / scale
+                del scale
+            elif layer_type == "to_q":
+                scale = relative_norm(tensor, m0)
+                tensor = tensor.to(device) * scale
+                del scale
+            slice_mask = (
+                sets == i
+            ).bool()
+            new_tensor = dare_ties_sparsification(
+                model_a_param=m0,
+                model_b_param=tensor,
+                drop_rate=norm_ratio,
+                ties="sum",
+                rescale="off",
+                device=device,
+                **config)
+            new_tensor = merge_tensors("slerp", m0, tensor, ratio)
+            result = torch.where(slice_mask, new_tensor, result)
+            del new_tensor, slice_mask
+
+        result_dict[k] = result
+    # end of merge
+
+    log.info(
+        f"done merge saving to file: {model_dst}"
+    )
+    out_model = (
+        transformers.AutoModelForCausalLM.from_pretrained(
+            model_dst,
+            **config
+        )
+    )
+    out_model.state_dict = lambda: result_dict
+    out_model.save_pretrained(model_dst)
+
+
+def run():
+    question = (
+        "why is the sky blue?"
+    )
+    log.info(f"merging models and asking the question: {question}")
+    model_src = "TinyLlama/TinyLlama-1.1B-intermediate-step-1431k-3T"
+    model_dst = "matlok/tinyllama-cinder-openhermes-32k"
+    device = "cuda"
+    config = {
+        'torch_dtype': torch.float16,
+        'low_cpu_mem_usage': False,
+        'trust_remote_code': True,
+    }
+    models = [
+        model_src,
+        "Doctor-Shotgun/TinyLlama-1.1B-32k-Instruct",
+        "Doctor-Shotgun/TinyLlama-1.1B-32k",
+        "Tensoic/TinyLlama-1.1B-3T-openhermes",
+        "Josephgflowers/TinyLlama-3T-Cinder-v1.3",
+    ]
+    merge_model_with_ties(
+        models=models,
+        model_dst=model_dst
+    )
+    log.info(f"loading newly-created file: {model_dst}")
+    model = transformers.AutoModelForCausalLM.from_pretrained(
+        model_dst,
+        **config
+    )
+    log.info(
+        f"loaded new model file: {model_dst} "
+        f"asking question: {question} "
+    )
+    run_text_test(
+        model=model,
+        tokenizer_path=model_src,
+        question=question,
+        device=device,
+    )
+    log.info(f"done loading new model: {model} file: {model_dst}")
+
+
+if __name__ == "__main__":
+    run()