fix for including the tokenizer and adding documentation/comments

- README.md +399 -291
- run-tiny-merge.py +134 -59
README.md
CHANGED

Please refer to the Unsloth fine-tuning guide for:

### Fine-tuning using HuggingFace SFTTrainer

- [Fine-tuning using HuggingFace SFTTrainer](https://colab.research.google.com/drive/1PEQyJO1-f6j0S_XJ8DV50NkpzasXkrzd?usp=sharing)

### Fine-tuning using Unsloth

2024-02-07 - unable to use Unsloth due to pip install issues. Maybe others in the future will have more luck:

- [Alpaca + TinyLlama + RoPE Scaling full example.ipynb](https://colab.research.google.com/drive/1AZghoNBQaMDgWJpi4RbffGM1h6raLUj9?usp=sharing)

## How do I generate my own model merges?

This requires having the HuggingFace token set before it will work.

If you're using the command line, you can use:

```sh
huggingface-cli login
```

Then run the merge and time it:

```sh
time ./run-tiny-merge.py
```
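
If you'd rather authenticate from Python than from the CLI, here's a minimal sketch (an assumption for illustration, not part of this repo: it presumes the `huggingface_hub` package is installed and that you have exported an `HF_TOKEN` environment variable yourself):

```python3
# hypothetical alternative to `huggingface-cli login`:
# read a token you exported yourself and log in programmatically
import os

from huggingface_hub import login

login(token=os.environ["HF_TOKEN"])
```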

### What's this code doing?

Here's the latest version:

```python3
#!/usr/bin/env python3

import os
import transformers
import torch
import logging
from ddare.merge import merge_tensors
from ddare.tensor import (
    dare_ties_sparsification,
    relative_norm,
    divide_tensor_into_sets,
)
from ddare.util import get_device
import re
from typing import Dict, Tuple, List

logging.basicConfig(level=logging.INFO)
log = logging.getLogger(__name__)


def get_models(
    models: List[str],
    trust_remote_code: bool,
):
    """
    get the models

    :param models: model names to download
    :param trust_remote_code: are you sure??? True/False
    """
    config = {
        "torch_dtype": torch.float16,
        "low_cpu_mem_usage": False,
        "trust_remote_code": trust_remote_code,
    }
    loaded_models = []
    num_models = len(models)
    for i, model_path in enumerate(models):
        log.info(
            f"loading model={i + 1}/{num_models} "
            f"model={model_path}"
        )
        loaded_models.append(
            transformers.AutoModelForCausalLM.from_pretrained(
                model_path, **config
            )
        )
    return loaded_models


def pm(
    model,
):
    """
    pretty print model

    :param model: show me the model
    """
    keys = model.state_dict().keys()
    log.info(f"model keys={len(keys)}")
    for i, k in enumerate(keys):
        tensor = model.state_dict()[k]
        log.info(
            f"{i:3d} {k} shape={tensor.shape} "
            f"type={tensor.dtype} dev={tensor.device} "
            f"contig={tensor.is_contiguous()}"
        )


def run_text_test(
    model,
    tokenizer_path: str,
    question: str,
    device: str = "cuda",
):
    """
    run a question on the model and return the answer

    :param model: initialized model
    :param tokenizer_path: tokenizer path/name
    :param question: what are you asking?
    :param device: where do you want to run "cpu"/"gpu"?
    """
    base_model = model.to(device)
    log.info(f"loading tokenizer={tokenizer_path}")
    tokenizer = transformers.AutoTokenizer.from_pretrained(
        tokenizer_path,
        torch_dtype=torch.float16,
    )

    inputs = tokenizer(question, return_tensors="pt").to(
        device
    )
    with torch.backends.cuda.sdp_kernel(
        enable_flash=True,
        enable_math=False,
        enable_mem_efficient=True,
    ):
        outputs = base_model.generate(
            **inputs,
            max_new_tokens=256,
        )
    answer = tokenizer.decode(
        outputs[0], skip_special_tokens=True
    )
    log.info(
        "\n"
        "----------"
        "\n"
        f"tokenizer={tokenizer}\n "
        f"question:\n{question}\n"
        f"answer:\n{answer}\n"
        "----------"
    )
    base_model = base_model.to(device)
    return tokenizer


+
def get_layer_type(key: str) -> Tuple[int, str]:
|
176 |
+
"""
|
177 |
+
get the layer type
|
178 |
|
179 |
+
:param key: name of the layer
|
180 |
+
:return: layer id and name
|
181 |
+
"""
|
182 |
matcher = re.compile(r"model.layers.(\d+).(.+)")
|
183 |
m = matcher.match(key)
|
184 |
if m is None:
|
|
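
# Illustration (not part of the original script): the regex above maps
#   "model.layers.3.self_attn.k_proj.weight" -> layer id 3 and name
#   "self_attn.k_proj.weight",
# while non-layer keys such as "model.embed_tokens.weight" do not match
# the pattern and take the `m is None` branch.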
|
196 |
def merge_model_with_ties(
|
197 |
models: List[str],
|
198 |
model_dst: str,
|
199 |
+
trust_remote_code: bool = True,
|
200 |
):
|
201 |
+
"""
|
202 |
+
merge the list of models into one model
|
203 |
+
called model_dst
|
204 |
+
|
205 |
+
:param models: list of models to merge
|
206 |
+
:param model_dst: name of the new model
|
207 |
+
:param trust_remote_code: are you sure? True/False
|
208 |
+
"""
|
209 |
models = get_models(
|
210 |
models=models,
|
211 |
trust_remote_code=trust_remote_code,
|
|
|

        # build a ratio
        ratio = {
            "to_q": 0.0,
            "to_k": 0.0,
            "to_v": 0.0,
        }.get(layer_type, 0.5)
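        # layer types missing from the dict above (mlp, layernorms,
        # embeddings) fall back to the default ratio of 0.5,
        # i.e. a halfway slerp between the base and donor tensors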

        norm_ratio = 0.68
        log.info(
            f"model={k} {num_keys} shape={m0.shape} "
            f"dtype={m0.dtype} {m0.device} "
            f"ratio={ratio} "
            f"contig={m0.is_contiguous()} "
            f"norm={norm_ratio}"
        )

        # for all tensors
        for i, tensor in enumerate(m):
            if layer_type == "to_k":
                # Get to_q key
                q_base = models[0].state_dict()[
                    k.replace("to_k", "to_q")
                ]
                q_merge = models[i].state_dict()[
                    k.replace("to_k", "to_q")
                ]
                scale = relative_norm(q_merge, q_base)
                tensor = tensor.to(device) / scale
                del scale
            elif layer_type == "to_q":
                scale = relative_norm(tensor, m0)
                tensor = tensor.to(device) * scale
                del scale
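            # assumption: relative_norm returns the magnitude ratio between
            # its two arguments, so these scalings keep the donor's q/k
            # projections on the same scale as the base model's weights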
            slice_mask = (sets == i).bool()
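            # model i only contributes the tensor elements that
            # divide_tensor_into_sets assigned to set i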
            new_tensor = dare_ties_sparsification(
                model_a_param=m0,
                model_b_param=tensor,
                # (one unchanged argument is omitted from the diff here)
                ties="sum",
                rescale="off",
                device=device,
                **config,
            )
            new_tensor = merge_tensors(
                "slerp", m0, tensor, ratio
            )
            result = torch.where(
                slice_mask, new_tensor, result
            )
            del new_tensor, slice_mask

        result_dict[k] = result
    # end of merge

    log.info(f"done merge saving to file: {model_dst}")
    out_model = (
        transformers.AutoModelForCausalLM.from_pretrained(
            model_dst, **config
        )
    )
    out_model.state_dict = lambda: result_dict
    out_model.save_pretrained(model_dst)


def run():
    """
    run the merge and upload the model and tokenizer

    This requires having the HuggingFace token
    set before it will work:
    ```huggingface-cli login```
    """
    question = "why is the sky blue?"
    log.info(
        f"merging models and asking the question: {question}"
    )
    model_src = "TinyLlama/TinyLlama-1.1B-intermediate-step-1431k-3T"
    model_dst = "matlok/tinyllama-cinder-openhermes-32k"
    device = "cuda"
    config = {
        "torch_dtype": torch.float16,
        "low_cpu_mem_usage": False,
        "trust_remote_code": True,
    }
    models = [
        model_src,
        "Doctor-Shotgun/TinyLlama-1.1B-32k-Instruct",
        "Doctor-Shotgun/TinyLlama-1.1B-32k",
        "Tensoic/TinyLlama-1.1B-3T-openhermes",
        "Josephgflowers/TinyLlama-3T-Cinder-v1.3",
    ]
    merge_model_with_ties(
        models=models, model_dst=model_dst
    )
    log.info(f"loading newly-created file: {model_dst}")
    model = (
        transformers.AutoModelForCausalLM.from_pretrained(
            model_dst, **config
        )
    )
    log.info(
        f"loaded new model file: {model_dst} "
        f"asking question: {question}"
    )
    run_text_test(
        model=model,
        tokenizer_path=model_src,
        question=question,
        device=device,
    )

    # clean the temp merge dir
    # remove model dir to prevent issues with the tokenizer upload
    model_org = model_dst.split("/")[0]
    if os.path.exists(model_org):
        os.system(f"rm -rf ./{model_org}")

    log.info(f"uploading model: {model_dst}")
    model.push_to_hub(model_dst)

    log.info(f"uploading src tokenizer: {model_src}")
    # reload the tokenizer to save it; found on:
    # https://colab.research.google.com/drive/1PEQyJO1-f6j0S_XJ8DV50NkpzasXkrzd?usp=sharing#scrollTo=QQn30cRtAZ-P
    tokenizer = transformers.AutoTokenizer.from_pretrained(
        model_src, trust_remote_code=True
    )
    # https://huggingface.co/docs/transformers/model_sharing#use-the-pushtohub-function
    # tokenizer.push_to_hub("my-awesome-model")
    tokenizer.push_to_hub(model_dst)
    log.info(
        f"done loading new model: {model} "
        f"file: {model_dst}"
    )


if __name__ == "__main__":
    run()
```

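For intuition, here's a tiny self-contained sketch of the drop-and-rescale idea behind DARE-style sparsification. This is an illustration only, not the actual `ddare.tensor.dare_ties_sparsification` implementation; the function name and the 0.9 drop rate are made up for the example:

```python3
import torch


def dare_drop_and_rescale_sketch(
    base: torch.Tensor,
    donor: torch.Tensor,
    drop_rate: float = 0.9,
) -> torch.Tensor:
    # the "task vector": what the donor learned on top of the base weights
    delta = donor - base
    # randomly drop most of the delta entries...
    keep = (torch.rand_like(delta, dtype=torch.float32) > drop_rate).to(delta.dtype)
    # ...then rescale the survivors so the expected update size is preserved
    return base + delta * keep / (1.0 - drop_rate)
```

The real call in the script layers TIES-style sign handling on top of this idea (the `ties="sum"` argument) and only keeps the per-model slices selected by `slice_mask`.
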
Here's the logs from the code above:

```
time ./run-tiny-merge.py
Total VRAM 12282 MB, total RAM 85434 MB
Set vram state to: NORMAL_VRAM
Device: cuda:0 NVIDIA GeForce RTX 4070 Ti : native
VAE dtype: torch.bfloat16
INFO:__main__:merging models and asking the question: why is the sky blue?
INFO:__main__:loading model=1/5 model=TinyLlama/TinyLlama-1.1B-intermediate-step-1431k-3T
config.json: 100%|█████████████████████████████████████| 560/560 [00:00<00:00, 5.23MB/s]
model.safetensors: 100%|███████████████████████████| 4.40G/4.40G [00:48<00:00, 90.2MB/s]
generation_config.json: 100%|███████████████████████████| 129/129 [00:00<00:00, 721kB/s]
INFO:__main__:loading model=2/5 model=Doctor-Shotgun/TinyLlama-1.1B-32k-Instruct
config.json: 100%|█████████████████████████████████████| 695/695 [00:00<00:00, 3.04MB/s]
pytorch_model.bin: 100%|███████████████████████████| 2.20G/2.20G [00:23<00:00, 92.6MB/s]
generation_config.json: 100%|███████████████████████████| 129/129 [00:00<00:00, 566kB/s]
INFO:__main__:loading model=3/5 model=Doctor-Shotgun/TinyLlama-1.1B-32k
config.json: 100%|█████████████████████████████████████| 686/686 [00:00<00:00, 3.57MB/s]
model.safetensors: 100%|███████████████████████████| 2.20G/2.20G [00:24<00:00, 90.5MB/s]
generation_config.json: 100%|██████████████████████████| 124/124 [00:00<00:00, 1.80MB/s]
INFO:__main__:loading model=4/5 model=Tensoic/TinyLlama-1.1B-3T-openhermes
config.json: 100%|█████████████████████████████████████| 702/702 [00:00<00:00, 2.97MB/s]
pytorch_model.bin: 100%|███████████████████████████| 2.20G/2.20G [00:23<00:00, 92.7MB/s]
generation_config.json: 100%|███████████████████████████| 124/124 [00:00<00:00, 671kB/s]
INFO:__main__:loading model=5/5 model=Josephgflowers/TinyLlama-3T-Cinder-v1.3
config.json: 100%|█████████████████████████████████████| 713/713 [00:00<00:00, 9.35MB/s]
model.safetensors: 100%|███████████████████████████| 2.20G/2.20G [00:24<00:00, 91.5MB/s]
generation_config.json: 100%|██████████████████████████| 138/138 [00:00<00:00, 1.86MB/s]
402 |
+
INFO:__main__:model=model.embed_tokens.weight 201 shape=torch.Size([32000, 2048]) dtype=torch.float16 cpu ratio=0.5 contig=True norm=0.68
|
403 |
+
INFO:__main__:model=model.layers.0.self_attn.q_proj.weight 201 shape=torch.Size([2048, 2048]) dtype=torch.float16 cpu ratio=0.5 contig=True norm=0.68
|
404 |
+
INFO:__main__:model=model.layers.0.self_attn.k_proj.weight 201 shape=torch.Size([256, 2048]) dtype=torch.float16 cpu ratio=0.5 contig=True norm=0.68
|
405 |
+
INFO:__main__:model=model.layers.0.self_attn.v_proj.weight 201 shape=torch.Size([256, 2048]) dtype=torch.float16 cpu ratio=0.5 contig=True norm=0.68
|
406 |
+
INFO:__main__:model=model.layers.0.self_attn.o_proj.weight 201 shape=torch.Size([2048, 2048]) dtype=torch.float16 cpu ratio=0.5 contig=True norm=0.68
|
407 |
+
INFO:__main__:model=model.layers.0.mlp.gate_proj.weight 201 shape=torch.Size([5632, 2048]) dtype=torch.float16 cpu ratio=0.5 contig=True norm=0.68
|
408 |
+
INFO:__main__:model=model.layers.0.mlp.up_proj.weight 201 shape=torch.Size([5632, 2048]) dtype=torch.float16 cpu ratio=0.5 contig=True norm=0.68
|
409 |
+
INFO:__main__:model=model.layers.0.mlp.down_proj.weight 201 shape=torch.Size([2048, 5632]) dtype=torch.float16 cpu ratio=0.5 contig=True norm=0.68
|
410 |
+
INFO:__main__:model=model.layers.0.input_layernorm.weight 201 shape=torch.Size([2048]) dtype=torch.float16 cpu ratio=0.5 contig=True norm=0.68
|
411 |
+
INFO:__main__:model=model.layers.0.post_attention_layernorm.weight 201 shape=torch.Size([2048]) dtype=torch.float16 cpu ratio=0.5 contig=True norm=0.68
|
412 |
+
INFO:__main__:model=model.layers.1.self_attn.q_proj.weight 201 shape=torch.Size([2048, 2048]) dtype=torch.float16 cpu ratio=0.5 contig=True norm=0.68
|
413 |
+
INFO:__main__:model=model.layers.1.self_attn.k_proj.weight 201 shape=torch.Size([256, 2048]) dtype=torch.float16 cpu ratio=0.5 contig=True norm=0.68
|
414 |
+
INFO:__main__:model=model.layers.1.self_attn.v_proj.weight 201 shape=torch.Size([256, 2048]) dtype=torch.float16 cpu ratio=0.5 contig=True norm=0.68
|
415 |
+
INFO:__main__:model=model.layers.1.self_attn.o_proj.weight 201 shape=torch.Size([2048, 2048]) dtype=torch.float16 cpu ratio=0.5 contig=True norm=0.68
|
416 |
+
INFO:__main__:model=model.layers.1.mlp.gate_proj.weight 201 shape=torch.Size([5632, 2048]) dtype=torch.float16 cpu ratio=0.5 contig=True norm=0.68
|
417 |
+
INFO:__main__:model=model.layers.1.mlp.up_proj.weight 201 shape=torch.Size([5632, 2048]) dtype=torch.float16 cpu ratio=0.5 contig=True norm=0.68
|
418 |
+
INFO:__main__:model=model.layers.1.mlp.down_proj.weight 201 shape=torch.Size([2048, 5632]) dtype=torch.float16 cpu ratio=0.5 contig=True norm=0.68
|
419 |
+
INFO:__main__:model=model.layers.1.input_layernorm.weight 201 shape=torch.Size([2048]) dtype=torch.float16 cpu ratio=0.5 contig=True norm=0.68
|
420 |
+
INFO:__main__:model=model.layers.1.post_attention_layernorm.weight 201 shape=torch.Size([2048]) dtype=torch.float16 cpu ratio=0.5 contig=True norm=0.68
|
421 |
+
INFO:__main__:model=model.layers.2.self_attn.q_proj.weight 201 shape=torch.Size([2048, 2048]) dtype=torch.float16 cpu ratio=0.5 contig=True norm=0.68
|
422 |
+
INFO:__main__:model=model.layers.2.self_attn.k_proj.weight 201 shape=torch.Size([256, 2048]) dtype=torch.float16 cpu ratio=0.5 contig=True norm=0.68
|
423 |
+
INFO:__main__:model=model.layers.2.self_attn.v_proj.weight 201 shape=torch.Size([256, 2048]) dtype=torch.float16 cpu ratio=0.5 contig=True norm=0.68
|
424 |
+
INFO:__main__:model=model.layers.2.self_attn.o_proj.weight 201 shape=torch.Size([2048, 2048]) dtype=torch.float16 cpu ratio=0.5 contig=True norm=0.68
|
425 |
+
INFO:__main__:model=model.layers.2.mlp.gate_proj.weight 201 shape=torch.Size([5632, 2048]) dtype=torch.float16 cpu ratio=0.5 contig=True norm=0.68
|
426 |
+
INFO:__main__:model=model.layers.2.mlp.up_proj.weight 201 shape=torch.Size([5632, 2048]) dtype=torch.float16 cpu ratio=0.5 contig=True norm=0.68
|
427 |
+
INFO:__main__:model=model.layers.2.mlp.down_proj.weight 201 shape=torch.Size([2048, 5632]) dtype=torch.float16 cpu ratio=0.5 contig=True norm=0.68
|
428 |
+
INFO:__main__:model=model.layers.2.input_layernorm.weight 201 shape=torch.Size([2048]) dtype=torch.float16 cpu ratio=0.5 contig=True norm=0.68
|
429 |
+
INFO:__main__:model=model.layers.2.post_attention_layernorm.weight 201 shape=torch.Size([2048]) dtype=torch.float16 cpu ratio=0.5 contig=True norm=0.68
|
430 |
+
INFO:__main__:model=model.layers.3.self_attn.q_proj.weight 201 shape=torch.Size([2048, 2048]) dtype=torch.float16 cpu ratio=0.5 contig=True norm=0.68
|
431 |
+
INFO:__main__:model=model.layers.3.self_attn.k_proj.weight 201 shape=torch.Size([256, 2048]) dtype=torch.float16 cpu ratio=0.5 contig=True norm=0.68
|
432 |
+
INFO:__main__:model=model.layers.3.self_attn.v_proj.weight 201 shape=torch.Size([256, 2048]) dtype=torch.float16 cpu ratio=0.5 contig=True norm=0.68
|
433 |
+
INFO:__main__:model=model.layers.3.self_attn.o_proj.weight 201 shape=torch.Size([2048, 2048]) dtype=torch.float16 cpu ratio=0.5 contig=True norm=0.68
|
434 |
+
INFO:__main__:model=model.layers.3.mlp.gate_proj.weight 201 shape=torch.Size([5632, 2048]) dtype=torch.float16 cpu ratio=0.5 contig=True norm=0.68
|
435 |
+
INFO:__main__:model=model.layers.3.mlp.up_proj.weight 201 shape=torch.Size([5632, 2048]) dtype=torch.float16 cpu ratio=0.5 contig=True norm=0.68
|
436 |
+
INFO:__main__:model=model.layers.3.mlp.down_proj.weight 201 shape=torch.Size([2048, 5632]) dtype=torch.float16 cpu ratio=0.5 contig=True norm=0.68
|
437 |
+
INFO:__main__:model=model.layers.3.input_layernorm.weight 201 shape=torch.Size([2048]) dtype=torch.float16 cpu ratio=0.5 contig=True norm=0.68
|
438 |
+
INFO:__main__:model=model.layers.3.post_attention_layernorm.weight 201 shape=torch.Size([2048]) dtype=torch.float16 cpu ratio=0.5 contig=True norm=0.68
|
439 |
+
INFO:__main__:model=model.layers.4.self_attn.q_proj.weight 201 shape=torch.Size([2048, 2048]) dtype=torch.float16 cpu ratio=0.5 contig=True norm=0.68
|
440 |
+
INFO:__main__:model=model.layers.4.self_attn.k_proj.weight 201 shape=torch.Size([256, 2048]) dtype=torch.float16 cpu ratio=0.5 contig=True norm=0.68
|
441 |
+
INFO:__main__:model=model.layers.4.self_attn.v_proj.weight 201 shape=torch.Size([256, 2048]) dtype=torch.float16 cpu ratio=0.5 contig=True norm=0.68
|
442 |
+
INFO:__main__:model=model.layers.4.self_attn.o_proj.weight 201 shape=torch.Size([2048, 2048]) dtype=torch.float16 cpu ratio=0.5 contig=True norm=0.68
|
443 |
+
INFO:__main__:model=model.layers.4.mlp.gate_proj.weight 201 shape=torch.Size([5632, 2048]) dtype=torch.float16 cpu ratio=0.5 contig=True norm=0.68
|
444 |
+
INFO:__main__:model=model.layers.4.mlp.up_proj.weight 201 shape=torch.Size([5632, 2048]) dtype=torch.float16 cpu ratio=0.5 contig=True norm=0.68
|
445 |
+
INFO:__main__:model=model.layers.4.mlp.down_proj.weight 201 shape=torch.Size([2048, 5632]) dtype=torch.float16 cpu ratio=0.5 contig=True norm=0.68
|
446 |
+
INFO:__main__:model=model.layers.4.input_layernorm.weight 201 shape=torch.Size([2048]) dtype=torch.float16 cpu ratio=0.5 contig=True norm=0.68
|
447 |
+
INFO:__main__:model=model.layers.4.post_attention_layernorm.weight 201 shape=torch.Size([2048]) dtype=torch.float16 cpu ratio=0.5 contig=True norm=0.68
|
448 |
+
INFO:__main__:model=model.layers.5.self_attn.q_proj.weight 201 shape=torch.Size([2048, 2048]) dtype=torch.float16 cpu ratio=0.5 contig=True norm=0.68
|
449 |
+
INFO:__main__:model=model.layers.5.self_attn.k_proj.weight 201 shape=torch.Size([256, 2048]) dtype=torch.float16 cpu ratio=0.5 contig=True norm=0.68
|
450 |
+
INFO:__main__:model=model.layers.5.self_attn.v_proj.weight 201 shape=torch.Size([256, 2048]) dtype=torch.float16 cpu ratio=0.5 contig=True norm=0.68
|
451 |
+
INFO:__main__:model=model.layers.5.self_attn.o_proj.weight 201 shape=torch.Size([2048, 2048]) dtype=torch.float16 cpu ratio=0.5 contig=True norm=0.68
|
452 |
+
INFO:__main__:model=model.layers.5.mlp.gate_proj.weight 201 shape=torch.Size([5632, 2048]) dtype=torch.float16 cpu ratio=0.5 contig=True norm=0.68
|
453 |
+
INFO:__main__:model=model.layers.5.mlp.up_proj.weight 201 shape=torch.Size([5632, 2048]) dtype=torch.float16 cpu ratio=0.5 contig=True norm=0.68
|
454 |
+
INFO:__main__:model=model.layers.5.mlp.down_proj.weight 201 shape=torch.Size([2048, 5632]) dtype=torch.float16 cpu ratio=0.5 contig=True norm=0.68
|
455 |
+
INFO:__main__:model=model.layers.5.input_layernorm.weight 201 shape=torch.Size([2048]) dtype=torch.float16 cpu ratio=0.5 contig=True norm=0.68
|
456 |
+
INFO:__main__:model=model.layers.5.post_attention_layernorm.weight 201 shape=torch.Size([2048]) dtype=torch.float16 cpu ratio=0.5 contig=True norm=0.68
|
457 |
+
INFO:__main__:model=model.layers.6.self_attn.q_proj.weight 201 shape=torch.Size([2048, 2048]) dtype=torch.float16 cpu ratio=0.5 contig=True norm=0.68
|
458 |
+
INFO:__main__:model=model.layers.6.self_attn.k_proj.weight 201 shape=torch.Size([256, 2048]) dtype=torch.float16 cpu ratio=0.5 contig=True norm=0.68
|
459 |
+
INFO:__main__:model=model.layers.6.self_attn.v_proj.weight 201 shape=torch.Size([256, 2048]) dtype=torch.float16 cpu ratio=0.5 contig=True norm=0.68
|
460 |
+
INFO:__main__:model=model.layers.6.self_attn.o_proj.weight 201 shape=torch.Size([2048, 2048]) dtype=torch.float16 cpu ratio=0.5 contig=True norm=0.68
|
461 |
+
INFO:__main__:model=model.layers.6.mlp.gate_proj.weight 201 shape=torch.Size([5632, 2048]) dtype=torch.float16 cpu ratio=0.5 contig=True norm=0.68
|
462 |
+
INFO:__main__:model=model.layers.6.mlp.up_proj.weight 201 shape=torch.Size([5632, 2048]) dtype=torch.float16 cpu ratio=0.5 contig=True norm=0.68
|
463 |
+
INFO:__main__:model=model.layers.6.mlp.down_proj.weight 201 shape=torch.Size([2048, 5632]) dtype=torch.float16 cpu ratio=0.5 contig=True norm=0.68
|
464 |
+
INFO:__main__:model=model.layers.6.input_layernorm.weight 201 shape=torch.Size([2048]) dtype=torch.float16 cpu ratio=0.5 contig=True norm=0.68
|
465 |
+
INFO:__main__:model=model.layers.6.post_attention_layernorm.weight 201 shape=torch.Size([2048]) dtype=torch.float16 cpu ratio=0.5 contig=True norm=0.68
|
466 |
+
INFO:__main__:model=model.layers.7.self_attn.q_proj.weight 201 shape=torch.Size([2048, 2048]) dtype=torch.float16 cpu ratio=0.5 contig=True norm=0.68
|
467 |
+
INFO:__main__:model=model.layers.7.self_attn.k_proj.weight 201 shape=torch.Size([256, 2048]) dtype=torch.float16 cpu ratio=0.5 contig=True norm=0.68
|
468 |
+
INFO:__main__:model=model.layers.7.self_attn.v_proj.weight 201 shape=torch.Size([256, 2048]) dtype=torch.float16 cpu ratio=0.5 contig=True norm=0.68
|
469 |
+
INFO:__main__:model=model.layers.7.self_attn.o_proj.weight 201 shape=torch.Size([2048, 2048]) dtype=torch.float16 cpu ratio=0.5 contig=True norm=0.68
|
470 |
+
INFO:__main__:model=model.layers.7.mlp.gate_proj.weight 201 shape=torch.Size([5632, 2048]) dtype=torch.float16 cpu ratio=0.5 contig=True norm=0.68
|
471 |
+
INFO:__main__:model=model.layers.7.mlp.up_proj.weight 201 shape=torch.Size([5632, 2048]) dtype=torch.float16 cpu ratio=0.5 contig=True norm=0.68
|
472 |
+
INFO:__main__:model=model.layers.7.mlp.down_proj.weight 201 shape=torch.Size([2048, 5632]) dtype=torch.float16 cpu ratio=0.5 contig=True norm=0.68
|
473 |
+
INFO:__main__:model=model.layers.7.input_layernorm.weight 201 shape=torch.Size([2048]) dtype=torch.float16 cpu ratio=0.5 contig=True norm=0.68
|
474 |
+
INFO:__main__:model=model.layers.7.post_attention_layernorm.weight 201 shape=torch.Size([2048]) dtype=torch.float16 cpu ratio=0.5 contig=True norm=0.68
|
475 |
+
INFO:__main__:model=model.layers.8.self_attn.q_proj.weight 201 shape=torch.Size([2048, 2048]) dtype=torch.float16 cpu ratio=0.5 contig=True norm=0.68
|
476 |
+
INFO:__main__:model=model.layers.8.self_attn.k_proj.weight 201 shape=torch.Size([256, 2048]) dtype=torch.float16 cpu ratio=0.5 contig=True norm=0.68
|
477 |
+
INFO:__main__:model=model.layers.8.self_attn.v_proj.weight 201 shape=torch.Size([256, 2048]) dtype=torch.float16 cpu ratio=0.5 contig=True norm=0.68
|
478 |
+
INFO:__main__:model=model.layers.8.self_attn.o_proj.weight 201 shape=torch.Size([2048, 2048]) dtype=torch.float16 cpu ratio=0.5 contig=True norm=0.68
|
479 |
+
INFO:__main__:model=model.layers.8.mlp.gate_proj.weight 201 shape=torch.Size([5632, 2048]) dtype=torch.float16 cpu ratio=0.5 contig=True norm=0.68
|
480 |
+
INFO:__main__:model=model.layers.8.mlp.up_proj.weight 201 shape=torch.Size([5632, 2048]) dtype=torch.float16 cpu ratio=0.5 contig=True norm=0.68
|
481 |
+
INFO:__main__:model=model.layers.8.mlp.down_proj.weight 201 shape=torch.Size([2048, 5632]) dtype=torch.float16 cpu ratio=0.5 contig=True norm=0.68
|
482 |
+
INFO:__main__:model=model.layers.8.input_layernorm.weight 201 shape=torch.Size([2048]) dtype=torch.float16 cpu ratio=0.5 contig=True norm=0.68
|
483 |
+
INFO:__main__:model=model.layers.8.post_attention_layernorm.weight 201 shape=torch.Size([2048]) dtype=torch.float16 cpu ratio=0.5 contig=True norm=0.68
|
484 |
+
INFO:__main__:model=model.layers.9.self_attn.q_proj.weight 201 shape=torch.Size([2048, 2048]) dtype=torch.float16 cpu ratio=0.5 contig=True norm=0.68
|
485 |
+
INFO:__main__:model=model.layers.9.self_attn.k_proj.weight 201 shape=torch.Size([256, 2048]) dtype=torch.float16 cpu ratio=0.5 contig=True norm=0.68
|
486 |
+
INFO:__main__:model=model.layers.9.self_attn.v_proj.weight 201 shape=torch.Size([256, 2048]) dtype=torch.float16 cpu ratio=0.5 contig=True norm=0.68
|
487 |
+
INFO:__main__:model=model.layers.9.self_attn.o_proj.weight 201 shape=torch.Size([2048, 2048]) dtype=torch.float16 cpu ratio=0.5 contig=True norm=0.68
|
488 |
+
INFO:__main__:model=model.layers.9.mlp.gate_proj.weight 201 shape=torch.Size([5632, 2048]) dtype=torch.float16 cpu ratio=0.5 contig=True norm=0.68
|
489 |
+
INFO:__main__:model=model.layers.9.mlp.up_proj.weight 201 shape=torch.Size([5632, 2048]) dtype=torch.float16 cpu ratio=0.5 contig=True norm=0.68
|
490 |
+
INFO:__main__:model=model.layers.9.mlp.down_proj.weight 201 shape=torch.Size([2048, 5632]) dtype=torch.float16 cpu ratio=0.5 contig=True norm=0.68
|
491 |
+
INFO:__main__:model=model.layers.9.input_layernorm.weight 201 shape=torch.Size([2048]) dtype=torch.float16 cpu ratio=0.5 contig=True norm=0.68
|
492 |
+
INFO:__main__:model=model.layers.9.post_attention_layernorm.weight 201 shape=torch.Size([2048]) dtype=torch.float16 cpu ratio=0.5 contig=True norm=0.68
|
493 |
+
INFO:__main__:model=model.layers.10.self_attn.q_proj.weight 201 shape=torch.Size([2048, 2048]) dtype=torch.float16 cpu ratio=0.5 contig=True norm=0.68
|
494 |
+
INFO:__main__:model=model.layers.10.self_attn.k_proj.weight 201 shape=torch.Size([256, 2048]) dtype=torch.float16 cpu ratio=0.5 contig=True norm=0.68
|
495 |
+
INFO:__main__:model=model.layers.10.self_attn.v_proj.weight 201 shape=torch.Size([256, 2048]) dtype=torch.float16 cpu ratio=0.5 contig=True norm=0.68
|
496 |
+
INFO:__main__:model=model.layers.10.self_attn.o_proj.weight 201 shape=torch.Size([2048, 2048]) dtype=torch.float16 cpu ratio=0.5 contig=True norm=0.68
|
497 |
+
INFO:__main__:model=model.layers.10.mlp.gate_proj.weight 201 shape=torch.Size([5632, 2048]) dtype=torch.float16 cpu ratio=0.5 contig=True norm=0.68
|
498 |
+
INFO:__main__:model=model.layers.10.mlp.up_proj.weight 201 shape=torch.Size([5632, 2048]) dtype=torch.float16 cpu ratio=0.5 contig=True norm=0.68
|
499 |
+
INFO:__main__:model=model.layers.10.mlp.down_proj.weight 201 shape=torch.Size([2048, 5632]) dtype=torch.float16 cpu ratio=0.5 contig=True norm=0.68
|
500 |
+
INFO:__main__:model=model.layers.10.input_layernorm.weight 201 shape=torch.Size([2048]) dtype=torch.float16 cpu ratio=0.5 contig=True norm=0.68
|
501 |
+
INFO:__main__:model=model.layers.10.post_attention_layernorm.weight 201 shape=torch.Size([2048]) dtype=torch.float16 cpu ratio=0.5 contig=True norm=0.68
|
502 |
+
INFO:__main__:model=model.layers.11.self_attn.q_proj.weight 201 shape=torch.Size([2048, 2048]) dtype=torch.float16 cpu ratio=0.5 contig=True norm=0.68
|
503 |
+
INFO:__main__:model=model.layers.11.self_attn.k_proj.weight 201 shape=torch.Size([256, 2048]) dtype=torch.float16 cpu ratio=0.5 contig=True norm=0.68
|
504 |
+
INFO:__main__:model=model.layers.11.self_attn.v_proj.weight 201 shape=torch.Size([256, 2048]) dtype=torch.float16 cpu ratio=0.5 contig=True norm=0.68
|
505 |
+
INFO:__main__:model=model.layers.11.self_attn.o_proj.weight 201 shape=torch.Size([2048, 2048]) dtype=torch.float16 cpu ratio=0.5 contig=True norm=0.68
|
506 |
+
INFO:__main__:model=model.layers.11.mlp.gate_proj.weight 201 shape=torch.Size([5632, 2048]) dtype=torch.float16 cpu ratio=0.5 contig=True norm=0.68
|
507 |
+
INFO:__main__:model=model.layers.11.mlp.up_proj.weight 201 shape=torch.Size([5632, 2048]) dtype=torch.float16 cpu ratio=0.5 contig=True norm=0.68
|
508 |
+
INFO:__main__:model=model.layers.11.mlp.down_proj.weight 201 shape=torch.Size([2048, 5632]) dtype=torch.float16 cpu ratio=0.5 contig=True norm=0.68
|
509 |
+
INFO:__main__:model=model.layers.11.input_layernorm.weight 201 shape=torch.Size([2048]) dtype=torch.float16 cpu ratio=0.5 contig=True norm=0.68
|
510 |
+
INFO:__main__:model=model.layers.11.post_attention_layernorm.weight 201 shape=torch.Size([2048]) dtype=torch.float16 cpu ratio=0.5 contig=True norm=0.68
|
511 |
+
INFO:__main__:model=model.layers.12.self_attn.q_proj.weight 201 shape=torch.Size([2048, 2048]) dtype=torch.float16 cpu ratio=0.5 contig=True norm=0.68
|
512 |
+
INFO:__main__:model=model.layers.12.self_attn.k_proj.weight 201 shape=torch.Size([256, 2048]) dtype=torch.float16 cpu ratio=0.5 contig=True norm=0.68
|
513 |
+
INFO:__main__:model=model.layers.12.self_attn.v_proj.weight 201 shape=torch.Size([256, 2048]) dtype=torch.float16 cpu ratio=0.5 contig=True norm=0.68
|
514 |
+
INFO:__main__:model=model.layers.12.self_attn.o_proj.weight 201 shape=torch.Size([2048, 2048]) dtype=torch.float16 cpu ratio=0.5 contig=True norm=0.68
|
515 |
+
INFO:__main__:model=model.layers.12.mlp.gate_proj.weight 201 shape=torch.Size([5632, 2048]) dtype=torch.float16 cpu ratio=0.5 contig=True norm=0.68
|
516 |
+
INFO:__main__:model=model.layers.12.mlp.up_proj.weight 201 shape=torch.Size([5632, 2048]) dtype=torch.float16 cpu ratio=0.5 contig=True norm=0.68
|
517 |
+
INFO:__main__:model=model.layers.12.mlp.down_proj.weight 201 shape=torch.Size([2048, 5632]) dtype=torch.float16 cpu ratio=0.5 contig=True norm=0.68
|
518 |
+
INFO:__main__:model=model.layers.12.input_layernorm.weight 201 shape=torch.Size([2048]) dtype=torch.float16 cpu ratio=0.5 contig=True norm=0.68
|
519 |
+
INFO:__main__:model=model.layers.12.post_attention_layernorm.weight 201 shape=torch.Size([2048]) dtype=torch.float16 cpu ratio=0.5 contig=True norm=0.68
|
520 |
+
INFO:__main__:model=model.layers.13.self_attn.q_proj.weight 201 shape=torch.Size([2048, 2048]) dtype=torch.float16 cpu ratio=0.5 contig=True norm=0.68
|
521 |
+
INFO:__main__:model=model.layers.13.self_attn.k_proj.weight 201 shape=torch.Size([256, 2048]) dtype=torch.float16 cpu ratio=0.5 contig=True norm=0.68
|
522 |
+
INFO:__main__:model=model.layers.13.self_attn.v_proj.weight 201 shape=torch.Size([256, 2048]) dtype=torch.float16 cpu ratio=0.5 contig=True norm=0.68
|
523 |
+
INFO:__main__:model=model.layers.13.self_attn.o_proj.weight 201 shape=torch.Size([2048, 2048]) dtype=torch.float16 cpu ratio=0.5 contig=True norm=0.68
|
524 |
+
INFO:__main__:model=model.layers.13.mlp.gate_proj.weight 201 shape=torch.Size([5632, 2048]) dtype=torch.float16 cpu ratio=0.5 contig=True norm=0.68
|
525 |
+
INFO:__main__:model=model.layers.13.mlp.up_proj.weight 201 shape=torch.Size([5632, 2048]) dtype=torch.float16 cpu ratio=0.5 contig=True norm=0.68
|
526 |
+
INFO:__main__:model=model.layers.13.mlp.down_proj.weight 201 shape=torch.Size([2048, 5632]) dtype=torch.float16 cpu ratio=0.5 contig=True norm=0.68
|
527 |
+
INFO:__main__:model=model.layers.13.input_layernorm.weight 201 shape=torch.Size([2048]) dtype=torch.float16 cpu ratio=0.5 contig=True norm=0.68
|
528 |
+
INFO:__main__:model=model.layers.13.post_attention_layernorm.weight 201 shape=torch.Size([2048]) dtype=torch.float16 cpu ratio=0.5 contig=True norm=0.68
|
529 |
+
INFO:__main__:model=model.layers.14.self_attn.q_proj.weight 201 shape=torch.Size([2048, 2048]) dtype=torch.float16 cpu ratio=0.5 contig=True norm=0.68
|
530 |
+
INFO:__main__:model=model.layers.14.self_attn.k_proj.weight 201 shape=torch.Size([256, 2048]) dtype=torch.float16 cpu ratio=0.5 contig=True norm=0.68
|
531 |
+
INFO:__main__:model=model.layers.14.self_attn.v_proj.weight 201 shape=torch.Size([256, 2048]) dtype=torch.float16 cpu ratio=0.5 contig=True norm=0.68
|
532 |
+
INFO:__main__:model=model.layers.14.self_attn.o_proj.weight 201 shape=torch.Size([2048, 2048]) dtype=torch.float16 cpu ratio=0.5 contig=True norm=0.68
|
533 |
+
INFO:__main__:model=model.layers.14.mlp.gate_proj.weight 201 shape=torch.Size([5632, 2048]) dtype=torch.float16 cpu ratio=0.5 contig=True norm=0.68
|
534 |
+
INFO:__main__:model=model.layers.14.mlp.up_proj.weight 201 shape=torch.Size([5632, 2048]) dtype=torch.float16 cpu ratio=0.5 contig=True norm=0.68
|
535 |
+
INFO:__main__:model=model.layers.14.mlp.down_proj.weight 201 shape=torch.Size([2048, 5632]) dtype=torch.float16 cpu ratio=0.5 contig=True norm=0.68
|
536 |
+
INFO:__main__:model=model.layers.14.input_layernorm.weight 201 shape=torch.Size([2048]) dtype=torch.float16 cpu ratio=0.5 contig=True norm=0.68
|
537 |
+
INFO:__main__:model=model.layers.14.post_attention_layernorm.weight 201 shape=torch.Size([2048]) dtype=torch.float16 cpu ratio=0.5 contig=True norm=0.68
|
538 |
+
INFO:__main__:model=model.layers.15.self_attn.q_proj.weight 201 shape=torch.Size([2048, 2048]) dtype=torch.float16 cpu ratio=0.5 contig=True norm=0.68
|
539 |
+
INFO:__main__:model=model.layers.15.self_attn.k_proj.weight 201 shape=torch.Size([256, 2048]) dtype=torch.float16 cpu ratio=0.5 contig=True norm=0.68
|
540 |
+
INFO:__main__:model=model.layers.15.self_attn.v_proj.weight 201 shape=torch.Size([256, 2048]) dtype=torch.float16 cpu ratio=0.5 contig=True norm=0.68
|
541 |
+
INFO:__main__:model=model.layers.15.self_attn.o_proj.weight 201 shape=torch.Size([2048, 2048]) dtype=torch.float16 cpu ratio=0.5 contig=True norm=0.68
|
542 |
+
INFO:__main__:model=model.layers.15.mlp.gate_proj.weight 201 shape=torch.Size([5632, 2048]) dtype=torch.float16 cpu ratio=0.5 contig=True norm=0.68
|
543 |
+
INFO:__main__:model=model.layers.15.mlp.up_proj.weight 201 shape=torch.Size([5632, 2048]) dtype=torch.float16 cpu ratio=0.5 contig=True norm=0.68
|
544 |
+
INFO:__main__:model=model.layers.15.mlp.down_proj.weight 201 shape=torch.Size([2048, 5632]) dtype=torch.float16 cpu ratio=0.5 contig=True norm=0.68
|
545 |
+
INFO:__main__:model=model.layers.15.input_layernorm.weight 201 shape=torch.Size([2048]) dtype=torch.float16 cpu ratio=0.5 contig=True norm=0.68
|
546 |
+
INFO:__main__:model=model.layers.15.post_attention_layernorm.weight 201 shape=torch.Size([2048]) dtype=torch.float16 cpu ratio=0.5 contig=True norm=0.68
INFO:__main__:model=model.layers.16.self_attn.q_proj.weight 201 shape=torch.Size([2048, 2048]) dtype=torch.float16 cpu ratio=0.5 contig=True norm=0.68
INFO:__main__:model=model.layers.16.self_attn.k_proj.weight 201 shape=torch.Size([256, 2048]) dtype=torch.float16 cpu ratio=0.5 contig=True norm=0.68
INFO:__main__:model=model.layers.16.self_attn.v_proj.weight 201 shape=torch.Size([256, 2048]) dtype=torch.float16 cpu ratio=0.5 contig=True norm=0.68
INFO:__main__:model=model.layers.16.self_attn.o_proj.weight 201 shape=torch.Size([2048, 2048]) dtype=torch.float16 cpu ratio=0.5 contig=True norm=0.68
INFO:__main__:model=model.layers.16.mlp.gate_proj.weight 201 shape=torch.Size([5632, 2048]) dtype=torch.float16 cpu ratio=0.5 contig=True norm=0.68
INFO:__main__:model=model.layers.16.mlp.up_proj.weight 201 shape=torch.Size([5632, 2048]) dtype=torch.float16 cpu ratio=0.5 contig=True norm=0.68
INFO:__main__:model=model.layers.16.mlp.down_proj.weight 201 shape=torch.Size([2048, 5632]) dtype=torch.float16 cpu ratio=0.5 contig=True norm=0.68
INFO:__main__:model=model.layers.16.input_layernorm.weight 201 shape=torch.Size([2048]) dtype=torch.float16 cpu ratio=0.5 contig=True norm=0.68
INFO:__main__:model=model.layers.16.post_attention_layernorm.weight 201 shape=torch.Size([2048]) dtype=torch.float16 cpu ratio=0.5 contig=True norm=0.68
INFO:__main__:model=model.layers.17.self_attn.q_proj.weight 201 shape=torch.Size([2048, 2048]) dtype=torch.float16 cpu ratio=0.5 contig=True norm=0.68
INFO:__main__:model=model.layers.17.self_attn.k_proj.weight 201 shape=torch.Size([256, 2048]) dtype=torch.float16 cpu ratio=0.5 contig=True norm=0.68
INFO:__main__:model=model.layers.17.self_attn.v_proj.weight 201 shape=torch.Size([256, 2048]) dtype=torch.float16 cpu ratio=0.5 contig=True norm=0.68
INFO:__main__:model=model.layers.17.self_attn.o_proj.weight 201 shape=torch.Size([2048, 2048]) dtype=torch.float16 cpu ratio=0.5 contig=True norm=0.68
INFO:__main__:model=model.layers.17.mlp.gate_proj.weight 201 shape=torch.Size([5632, 2048]) dtype=torch.float16 cpu ratio=0.5 contig=True norm=0.68
INFO:__main__:model=model.layers.17.mlp.up_proj.weight 201 shape=torch.Size([5632, 2048]) dtype=torch.float16 cpu ratio=0.5 contig=True norm=0.68
INFO:__main__:model=model.layers.17.mlp.down_proj.weight 201 shape=torch.Size([2048, 5632]) dtype=torch.float16 cpu ratio=0.5 contig=True norm=0.68
INFO:__main__:model=model.layers.17.input_layernorm.weight 201 shape=torch.Size([2048]) dtype=torch.float16 cpu ratio=0.5 contig=True norm=0.68
INFO:__main__:model=model.layers.17.post_attention_layernorm.weight 201 shape=torch.Size([2048]) dtype=torch.float16 cpu ratio=0.5 contig=True norm=0.68
INFO:__main__:model=model.layers.18.self_attn.q_proj.weight 201 shape=torch.Size([2048, 2048]) dtype=torch.float16 cpu ratio=0.5 contig=True norm=0.68
INFO:__main__:model=model.layers.18.self_attn.k_proj.weight 201 shape=torch.Size([256, 2048]) dtype=torch.float16 cpu ratio=0.5 contig=True norm=0.68
INFO:__main__:model=model.layers.18.self_attn.v_proj.weight 201 shape=torch.Size([256, 2048]) dtype=torch.float16 cpu ratio=0.5 contig=True norm=0.68
INFO:__main__:model=model.layers.18.self_attn.o_proj.weight 201 shape=torch.Size([2048, 2048]) dtype=torch.float16 cpu ratio=0.5 contig=True norm=0.68
INFO:__main__:model=model.layers.18.mlp.gate_proj.weight 201 shape=torch.Size([5632, 2048]) dtype=torch.float16 cpu ratio=0.5 contig=True norm=0.68
INFO:__main__:model=model.layers.18.mlp.up_proj.weight 201 shape=torch.Size([5632, 2048]) dtype=torch.float16 cpu ratio=0.5 contig=True norm=0.68
INFO:__main__:model=model.layers.18.mlp.down_proj.weight 201 shape=torch.Size([2048, 5632]) dtype=torch.float16 cpu ratio=0.5 contig=True norm=0.68
INFO:__main__:model=model.layers.18.input_layernorm.weight 201 shape=torch.Size([2048]) dtype=torch.float16 cpu ratio=0.5 contig=True norm=0.68
INFO:__main__:model=model.layers.18.post_attention_layernorm.weight 201 shape=torch.Size([2048]) dtype=torch.float16 cpu ratio=0.5 contig=True norm=0.68
INFO:__main__:model=model.layers.19.self_attn.q_proj.weight 201 shape=torch.Size([2048, 2048]) dtype=torch.float16 cpu ratio=0.5 contig=True norm=0.68
INFO:__main__:model=model.layers.19.self_attn.k_proj.weight 201 shape=torch.Size([256, 2048]) dtype=torch.float16 cpu ratio=0.5 contig=True norm=0.68
INFO:__main__:model=model.layers.19.self_attn.v_proj.weight 201 shape=torch.Size([256, 2048]) dtype=torch.float16 cpu ratio=0.5 contig=True norm=0.68
INFO:__main__:model=model.layers.19.self_attn.o_proj.weight 201 shape=torch.Size([2048, 2048]) dtype=torch.float16 cpu ratio=0.5 contig=True norm=0.68
INFO:__main__:model=model.layers.19.mlp.gate_proj.weight 201 shape=torch.Size([5632, 2048]) dtype=torch.float16 cpu ratio=0.5 contig=True norm=0.68
INFO:__main__:model=model.layers.19.mlp.up_proj.weight 201 shape=torch.Size([5632, 2048]) dtype=torch.float16 cpu ratio=0.5 contig=True norm=0.68
INFO:__main__:model=model.layers.19.mlp.down_proj.weight 201 shape=torch.Size([2048, 5632]) dtype=torch.float16 cpu ratio=0.5 contig=True norm=0.68
INFO:__main__:model=model.layers.19.input_layernorm.weight 201 shape=torch.Size([2048]) dtype=torch.float16 cpu ratio=0.5 contig=True norm=0.68
INFO:__main__:model=model.layers.19.post_attention_layernorm.weight 201 shape=torch.Size([2048]) dtype=torch.float16 cpu ratio=0.5 contig=True norm=0.68
INFO:__main__:model=model.layers.20.self_attn.q_proj.weight 201 shape=torch.Size([2048, 2048]) dtype=torch.float16 cpu ratio=0.5 contig=True norm=0.68
INFO:__main__:model=model.layers.20.self_attn.k_proj.weight 201 shape=torch.Size([256, 2048]) dtype=torch.float16 cpu ratio=0.5 contig=True norm=0.68
INFO:__main__:model=model.layers.20.self_attn.v_proj.weight 201 shape=torch.Size([256, 2048]) dtype=torch.float16 cpu ratio=0.5 contig=True norm=0.68
INFO:__main__:model=model.layers.20.self_attn.o_proj.weight 201 shape=torch.Size([2048, 2048]) dtype=torch.float16 cpu ratio=0.5 contig=True norm=0.68
INFO:__main__:model=model.layers.20.mlp.gate_proj.weight 201 shape=torch.Size([5632, 2048]) dtype=torch.float16 cpu ratio=0.5 contig=True norm=0.68
INFO:__main__:model=model.layers.20.mlp.up_proj.weight 201 shape=torch.Size([5632, 2048]) dtype=torch.float16 cpu ratio=0.5 contig=True norm=0.68
INFO:__main__:model=model.layers.20.mlp.down_proj.weight 201 shape=torch.Size([2048, 5632]) dtype=torch.float16 cpu ratio=0.5 contig=True norm=0.68
INFO:__main__:model=model.layers.20.input_layernorm.weight 201 shape=torch.Size([2048]) dtype=torch.float16 cpu ratio=0.5 contig=True norm=0.68
INFO:__main__:model=model.layers.20.post_attention_layernorm.weight 201 shape=torch.Size([2048]) dtype=torch.float16 cpu ratio=0.5 contig=True norm=0.68
INFO:__main__:model=model.layers.21.self_attn.q_proj.weight 201 shape=torch.Size([2048, 2048]) dtype=torch.float16 cpu ratio=0.5 contig=True norm=0.68
INFO:__main__:model=model.layers.21.self_attn.k_proj.weight 201 shape=torch.Size([256, 2048]) dtype=torch.float16 cpu ratio=0.5 contig=True norm=0.68
INFO:__main__:model=model.layers.21.self_attn.v_proj.weight 201 shape=torch.Size([256, 2048]) dtype=torch.float16 cpu ratio=0.5 contig=True norm=0.68
INFO:__main__:model=model.layers.21.self_attn.o_proj.weight 201 shape=torch.Size([2048, 2048]) dtype=torch.float16 cpu ratio=0.5 contig=True norm=0.68
INFO:__main__:model=model.layers.21.mlp.gate_proj.weight 201 shape=torch.Size([5632, 2048]) dtype=torch.float16 cpu ratio=0.5 contig=True norm=0.68
INFO:__main__:model=model.layers.21.mlp.up_proj.weight 201 shape=torch.Size([5632, 2048]) dtype=torch.float16 cpu ratio=0.5 contig=True norm=0.68
INFO:__main__:model=model.layers.21.mlp.down_proj.weight 201 shape=torch.Size([2048, 5632]) dtype=torch.float16 cpu ratio=0.5 contig=True norm=0.68
INFO:__main__:model=model.layers.21.input_layernorm.weight 201 shape=torch.Size([2048]) dtype=torch.float16 cpu ratio=0.5 contig=True norm=0.68
INFO:__main__:model=model.layers.21.post_attention_layernorm.weight 201 shape=torch.Size([2048]) dtype=torch.float16 cpu ratio=0.5 contig=True norm=0.68
INFO:__main__:model=model.norm.weight 201 shape=torch.Size([2048]) dtype=torch.float16 cpu ratio=0.5 contig=True norm=0.68
INFO:__main__:model=lm_head.weight 201 shape=torch.Size([32000, 2048]) dtype=torch.float16 cpu ratio=0.5 contig=True norm=0.68
INFO:__main__:done merge saving to file: matlok/tinyllama-cinder-openhermes-32k
config.json: 100%|█████████████████████████████████████| 724/724 [00:00<00:00, 7.75MB/s]
model.safetensors: 100%|███████████████████████████| 2.20G/2.20G [00:23<00:00, 91.8MB/s]
generation_config.json: 100%|██████████████████████████| 133/133 [00:00<00:00, 1.58MB/s]
INFO:__main__:loading newly-created file: matlok/tinyllama-cinder-openhermes-32k
INFO:__main__:loaded new model file: matlok/tinyllama-cinder-openhermes-32k asking question: why is the sky blue?
INFO:__main__:loading tokenizer=TinyLlama/TinyLlama-1.1B-intermediate-step-1431k-3T
tokenizer_config.json: 100%|███████████████████████████| 776/776 [00:00<00:00, 8.26MB/s]
tokenizer.model: 100%|███████████████████████████████| 500k/500k [00:00<00:00, 64.6MB/s]
tokenizer.json: 100%|██████████████████████████████| 1.84M/1.84M [00:01<00:00, 1.57MB/s]
special_tokens_map.json: 100%|█████████████████████████| 414/414 [00:00<00:00, 2.47MB/s]
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
INFO:__main__:
----------
...
why is the sky blue?
answer:
why is the sky blue?
Answer: The sky is blue because of the presence of the trace amounts of the elements oxygen and nitrogen. These elements are present in the atmosphere in very small amounts. The trace amounts of these elements are responsible for the blue color of the sky.
Why is the sky blue?
Answer: The sky is blue because of the presence of the trace amounts of the elements oxygen and nitrogen. These elements are present in the atmosphere in very small amounts. The trace amounts of these elements are responsible for the blue color of the sky.
Why is the sky blue?
Answer: The sky is blue because of the presence of the trace amounts of the elements oxygen and nitrogen. These elements are present in the atmosphere in very small amounts. The trace amounts of these elements are responsible for the blue color of the sky.
Why is the sky blue?
Answer: The sky is blue because of the presence of the trace amounts of the elements oxygen and nitrogen. These elements are present in the atmosphere in very small amounts. The trace amounts of these elements are responsible for the blue color of the sky.
Why is the sky blue?
Answer: The sky is blue because of the presence of the trace amounts of
----------
INFO:__main__:uploading model: matlok/tinyllama-cinder-openhermes-32k
README.md: 100%|████████████████████████████████████| 45.6k/45.6k [00:00<00:00, 297MB/s]
model.safetensors: 100%|███████████████████████████| 2.20G/2.20G [01:18<00:00, 28.0MB/s]
INFO:__main__:uploading src tokenizer: TinyLlama/TinyLlama-1.1B-intermediate-step-1431k-3T
INFO:__main__:done loading new model: LlamaForCausalLM(
  (model): LlamaModel(
    (embed_tokens): Embedding(32000, 2048)
    ...
  (lm_head): Linear(in_features=2048, out_features=32000, bias=False)
) file: matlok/tinyllama-cinder-openhermes-32k

real    4m44.626s
user    2m54.434s
sys     0m25.981s
```
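
## How do I load the merged model for inference?

Once the merge finishes and the files are pushed to the hub, the result can be pulled down like any other causal LM. This is a minimal sketch, assuming a CUDA device and that `torch` and `transformers` are installed; the model id is the `model_dst` from the run above:

```python3
#!/usr/bin/env python3

import torch
import transformers

model_name = "matlok/tinyllama-cinder-openhermes-32k"
tokenizer = transformers.AutoTokenizer.from_pretrained(model_name)
model = transformers.AutoModelForCausalLM.from_pretrained(
    model_name,
    torch_dtype=torch.float16,
).to("cuda")

# tokenize a prompt and generate a short completion
inputs = tokenizer("why is the sky blue?", return_tensors="pt").to("cuda")
outputs = model.generate(**inputs, max_new_tokens=64)
print(tokenizer.decode(outputs[0], skip_special_tokens=True))
```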

### Acknowledgements

- The code sample above was modified from [this very helpful GitHub gist](https://gist.github.com/maldevide/08829eada04ad9bd78e46c1a3787d42b)
- [Fine-tuning example](https://colab.research.google.com/drive/1PEQyJO1-f6j0S_XJ8DV50NkpzasXkrzd?usp=sharing)
- [CodeLlama example](https://huggingface.co/collections/mlabonne/codellama-6509bc68c2d4c8fc379ee87f)
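
### How does the DARE sparsification work?

For intuition only: below is a rough sketch of the drop-and-rescale idea behind `dare_ties_sparsification`, not the `ddare` implementation itself, and it uses a plain linear blend where the script above uses slerp with a per-layer-type ratio:

```python3
#!/usr/bin/env python3

import torch


def dare_sparsify(
    base: torch.Tensor,
    tuned: torch.Tensor,
    drop: float = 0.5,
) -> torch.Tensor:
    # delta between the fine-tuned tensor and the base tensor
    delta = tuned - base
    # randomly drop a fraction of the delta entries (DARE)
    keep = (torch.rand_like(delta) > drop).to(delta.dtype)
    # rescale the survivors to preserve the expected update magnitude
    return base + delta * keep / (1.0 - drop)


base = torch.randn(4, 4)
tuned = base + 0.1 * torch.randn(4, 4)
# blend the sparsified tensor back toward the base
merged = torch.lerp(base, dare_sparsify(base, tuned), 0.5)
print(merged)
```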

run-tiny-merge.py
CHANGED
@@ -1,5 +1,21 @@
 #!/usr/bin/env python3
 
+"""
+If you want to fine-tune, here's an example Unsloth fine tuning guide for:
+Alpaca + TinyLlama + RoPE Scaling full example.ipynb
+https://colab.research.google.com/drive/1AZghoNBQaMDgWJpi4RbffGM1h6raLUj9?usp=sharing
+
+Code here was refactored from gist:
+https://gist.github.com/maldevide/08829eada04ad9bd78e46c1a3787d42b
+
+Fine tuning example:
+https://colab.research.google.com/drive/1PEQyJO1-f6j0S_XJ8DV50NkpzasXkrzd?usp=sharing
+
+CodeLlama example:
+https://huggingface.co/collections/mlabonne/codellama-6509bc68c2d4c8fc379ee87f
+"""
+
+import os
 import transformers
 import torch
 import logging
@@ -7,18 +23,12 @@ from ddare.merge import merge_tensors
 from ddare.tensor import (
     dare_ties_sparsification,
     relative_norm,
-    divide_tensor_into_sets
+    divide_tensor_into_sets,
 )
 from ddare.util import get_device
 import re
 from typing import Dict, Tuple, List
 
-# If you want to fine-tune, here's an example Unsloth fine tuning guide for:
-# Alpaca + TinyLlama + RoPE Scaling full example.ipynb
-# https://colab.research.google.com/drive/1AZghoNBQaMDgWJpi4RbffGM1h6raLUj9?usp=sharing
-
-# code here was refactored from gist:
-# https://gist.github.com/maldevide/08829eada04ad9bd78e46c1a3787d42b
 
 logging.basicConfig(level=logging.INFO)
 log = logging.getLogger(__name__)
@@ -28,10 +38,16 @@ def get_models(
     models: List[str],
     trust_remote_code: bool,
 ):
+    """
+    get the models
+
+    :param models: model names to download
+    :param trust_remote_code: are you sure??? True/False
+    """
     config = {
-
-
-
+        "torch_dtype": torch.float16,
+        "low_cpu_mem_usage": False,
+        "trust_remote_code": trust_remote_code,
     }
     loaded_models = []
     num_models = len(models)
@@ -42,8 +58,7 @@ def get_models(
         )
         loaded_models.append(
             transformers.AutoModelForCausalLM.from_pretrained(
-                model_path,
-                **config
+                model_path, **config
             )
         )
     return loaded_models
@@ -52,6 +67,11 @@ def get_models(
 def pm(
     model,
 ):
+    """
+    pretty print model
+
+    :param model: show me the model
+    """
     keys = model.state_dict().keys()
     log.info(f"model keys={len(keys)}")
     for i, k in enumerate(keys):
@@ -59,38 +79,46 @@ def pm(
         log.info(
             f"{i:3d} {k} shape={tensor.shape} "
             f"type={tensor.dtype} dev={tensor.device} "
-            f"contig={tensor.is_contiguous()}"
+            f"contig={tensor.is_contiguous()}"
+        )
 
 
 def run_text_test(
     model,
-    tokenizer_path,
+    tokenizer_path: str,
     question: str,
     device: str = "cuda",
 ):
+    """
+    run a question on the model and return the answer
+
+    :param model: initialized model
+    :param tokenizer_path: tokenizer path/name
+    :param question: what are you asking?
+    :param device: where do you want to run "cpu"/"gpu"?
+    """
     base_model = model.to(device)
-    log.info(
-        f"loading tokenizer={tokenizer_path}"
-    )
+    log.info(f"loading tokenizer={tokenizer_path}")
     tokenizer = transformers.AutoTokenizer.from_pretrained(
         tokenizer_path,
         torch_dtype=torch.float16,
     )
 
-    inputs = tokenizer(
-
-
-    ).to(device)
+    inputs = tokenizer(question, return_tensors="pt").to(
+        device
+    )
    with torch.backends.cuda.sdp_kernel(
         enable_flash=True,
         enable_math=False,
-        enable_mem_efficient=
+        enable_mem_efficient=True,
     ):
         outputs = base_model.generate(
             **inputs,
-            max_new_tokens=
+            max_new_tokens=256,
         )
-        answer = tokenizer.decode(
+        answer = tokenizer.decode(
+            outputs[0], skip_special_tokens=True
+        )
     log.info(
         "\n"
         "----------"
@@ -101,11 +129,16 @@ def run_text_test(
         "----------"
     )
     base_model = base_model.to(device)
+    return tokenizer
 
 
-def get_layer_type(
-
-
+def get_layer_type(key: str) -> Tuple[int, str]:
+    """
+    get the layer type
+
+    :param key: name of the layer
+    :return: layer id and name
+    """
     matcher = re.compile(r"model.layers.(\d+).(.+)")
     m = matcher.match(key)
     if m is None:
@@ -123,8 +156,16 @@ def get_layer_type(
 def merge_model_with_ties(
     models: List[str],
     model_dst: str,
-    trust_remote_code: bool = True
+    trust_remote_code: bool = True,
 ):
+    """
+    merge the list of models into one model
+    called model_dst
+
+    :param models: list of models to merge
+    :param model_dst: name of the new model
+    :param trust_remote_code: are you sure? True/False
+    """
     models = get_models(
         models=models,
         trust_remote_code=trust_remote_code,
@@ -150,25 +191,30 @@ def merge_model_with_ties(
 
         # build a ratio
         ratio = {
-
-
-
-        }.get(layer_type, .5)
+            "to_q": 0.0,
+            "to_k": 0.0,
+            "to_v": 0.0,
+        }.get(layer_type, 0.5)
 
         norm_ratio = 0.68
         log.info(
             f"model={k} {num_keys} shape={m0.shape} "
             f"dtype={m0.dtype} {m0.device} "
-            f"
+            f"ratio={ratio} "
             f"contig={m0.is_contiguous()} "
-            f"norm={norm_ratio}"
+            f"norm={norm_ratio}"
+        )
 
         # for all tensors
         for i, tensor in enumerate(m):
             if layer_type == "to_k":
                 # Get to_q key
-                q_base = models[0].state_dict()[
-
+                q_base = models[0].state_dict()[
+                    k.replace("to_k", "to_q")
+                ]
+                q_merge = models[i].state_dict()[
+                    k.replace("to_k", "to_q")
+                ]
                 scale = relative_norm(q_merge, q_base)
                 tensor = tensor.to(device) / scale
                 del scale
@@ -176,9 +222,7 @@ def merge_model_with_ties(
             scale = relative_norm(tensor, m0)
             tensor = tensor.to(device) * scale
             del scale
-            slice_mask = (
-                sets == i
-            ).bool()
+            slice_mask = (sets == i).bool()
             new_tensor = dare_ties_sparsification(
                 model_a_param=m0,
                 model_b_param=tensor,
@@ -186,21 +230,23 @@ def merge_model_with_ties(
                 ties="sum",
                 rescale="off",
                 device=device,
-                **config
-
-
+                **config,
+            )
+            new_tensor = merge_tensors(
+                "slerp", m0, tensor, ratio
+            )
+            result = torch.where(
+                slice_mask, new_tensor, result
+            )
             del new_tensor, slice_mask
 
         result_dict[k] = result
     # end of merge
 
-    log.info(
-        f"done merge saving to file: {model_dst}"
-    )
+    log.info(f"done merge saving to file: {model_dst}")
     out_model = (
         transformers.AutoModelForCausalLM.from_pretrained(
-            model_dst,
-            **config
+            model_dst, **config
         )
     )
     out_model.state_dict = lambda: result_dict
@@ -208,17 +254,24 @@ def merge_model_with_ties(
 
 
 def run():
-
-
+    """
+    run the merge and upload the model and tokenizer
+
+    This requires running having the HuggingFace token
+    set before it will work:
+    ```huggingface-cli login```
+    """
+    question = "why is the sky blue?"
+    log.info(
+        f"merging models and asking the question: {question}"
     )
-    log.info(f"merging models and asking the question: {question}")
     model_src = "TinyLlama/TinyLlama-1.1B-intermediate-step-1431k-3T"
     model_dst = "matlok/tinyllama-cinder-openhermes-32k"
     device = "cuda"
     config = {
-
-
-
+        "torch_dtype": torch.float16,
+        "low_cpu_mem_usage": False,
+        "trust_remote_code": True,
    }
     models = [
         model_src,
@@ -228,13 +281,13 @@ def run():
         "Josephgflowers/TinyLlama-3T-Cinder-v1.3",
     ]
     merge_model_with_ties(
-        models=models,
-        model_dst=model_dst
+        models=models, model_dst=model_dst
     )
     log.info(f"loading newly-created file: {model_dst}")
-    model =
-
-
+    model = (
+        transformers.AutoModelForCausalLM.from_pretrained(
+            model_dst, **config
+        )
     )
     log.info(
         f"loaded new model file: {model_dst} "
@@ -246,7 +299,29 @@ def run():
         question=question,
         device=device,
     )
-
+
+    # clean the temp merge dir
+    # remove model dir to prevent issues with the tokenizer upload
+    model_org = model_dst.split("/")[0]
+    if os.path.exists(model_org):
+        os.system(f"rm -rf ./{model_org}")
+
+    log.info(f"uploading model: {model_dst}")
+    model.push_to_hub(model_dst)
+
+    log.info(f"uploading src tokenizer: {model_src}")
+    # reload tokenizer to save it and found on:
+    # https://colab.research.google.com/drive/1PEQyJO1-f6j0S_XJ8DV50NkpzasXkrzd?usp=sharing#scrollTo=QQn30cRtAZ-P
+    tokenizer = transformers.AutoTokenizer.from_pretrained(
+        model_src, trust_remote_code=True
+    )
+    # https://huggingface.co/docs/transformers/model_sharing#use-the-pushtohub-function
+    # tokenizer.push_to_hub("my-awesome-model")
+    tokenizer.push_to_hub(model_dst)
+    log.info(
+        f"done loading new model: {model} "
+        f"file: {model_dst}"
+    )
 
 
 if __name__ == "__main__":