matlok committed
Commit
5a55332
Parent(s): f48f821

v2 release with the merge script; confirmed the response works end-to-end

Files changed (3)
  1. README.md +95 -231
  2. model.safetensors +1 -1
  3. run-tiny-merge.py +252 -0
README.md CHANGED
@@ -2,7 +2,7 @@
2
  license: unknown
3
  ---
4
 
5
- ## Merging models like lego blocks using ddare and ties
6
 
7
  This model was merged from the following HuggingFace TinyLlama models using ties:
8
 
@@ -20,9 +20,11 @@ Please refer to the Unsloth fine-tuning guide for:
20
 
21
  ## How do I generate my own model merges?
22
 
23
- Here's the standalone python script we used with logs below:
24
 
25
  ```python3
26
  import transformers
27
  import torch
28
  import logging
@@ -32,6 +34,12 @@ from ddare.util import get_device
32
  import re
33
  from typing import Dict, Tuple, List
34
 
35
  logging.basicConfig(level=logging.INFO)
36
  log = logging.getLogger(__name__)
37
 
@@ -49,7 +57,7 @@ def get_models(
49
  num_models = len(models)
50
  for midx, model_path in enumerate(models):
51
  log.info(
52
- f"loading model={midx}/{num_models} "
53
  f"model={model_path} "
54
  )
55
  loaded_models.append(
@@ -76,30 +84,42 @@ def pm(
76
 
77
  def run_text_test(
78
  model,
79
- model_path,
80
- device: str,
81
  question: str,
82
  ):
83
  base_model = model.to(device)
84
  log.info(
85
- f"loading model={model_path}"
86
  )
87
  tokenizer = transformers.AutoTokenizer.from_pretrained(
88
- model_path,
89
- torch_dtype=torch.float16)
90
 
91
  inputs = tokenizer(
92
  question,
93
  return_tensors="pt"
94
- ).to("cuda")
95
  with torch.backends.cuda.sdp_kernel(
96
  enable_flash=True,
97
  enable_math=False,
98
  enable_mem_efficient=False
99
  ):
100
- outputs = base_model.generate(**inputs)
101
- log.info(tokenizer.decode(outputs[0], skip_special_tokens=True))
102
- base_model = base_model.to("cpu")
103
 
104
 
105
  def get_layer_type(
@@ -144,6 +164,7 @@ def merge_model_with_ties(
144
  models[1].state_dict()[k],
145
  models[2].state_dict()[k],
146
  models[3].state_dict()[k],
147
  ]
148
 
149
  # build a ratio
@@ -193,7 +214,7 @@ def merge_model_with_ties(
193
  # end of merge
194
 
195
  log.info(
196
- f"{config} - done merge saving to file: {model_dst}"
197
  )
198
  out_model = (
199
  transformers.AutoModelForCausalLM.from_pretrained(
@@ -206,9 +227,13 @@ def merge_model_with_ties(
206
 
207
 
208
  def run():
209
- log.info("start")
210
  model_src = "TinyLlama/TinyLlama-1.1B-intermediate-step-1431k-3T"
211
  model_dst = "matlok/tinyllama-cinder-openhermes-32k"
212
  config = {
213
  'torch_dtype': torch.float16,
214
  'low_cpu_mem_usage': False,
@@ -230,7 +255,16 @@ def run():
230
  model_dst,
231
  **config
232
  )
233
- pm(model=model)
234
  log.info(f"done loading new model: {model} file: {model_dst}")
235
 
236
 
@@ -238,7 +272,6 @@ if __name__ == "__main__":
238
  run()
239
  ```
240
 
241
-
242
  ### Logs
243
 
244
Here are the logs from the code above:
@@ -248,12 +281,14 @@ Total VRAM 12282 MB, total RAM 85434 MB
248
  Set vram state to: NORMAL_VRAM
249
  Device: cuda:0 NVIDIA GeForce RTX 4070 Ti : native
250
  VAE dtype: torch.bfloat16
251
- INFO:__main__:start
252
- INFO:__main__:loading model=0/5 model=TinyLlama/TinyLlama-1.1B-intermediate-step-1431k-3T
253
- INFO:__main__:loading model=1/5 model=Doctor-Shotgun/TinyLlama-1.1B-32k-Instruct
254
- INFO:__main__:loading model=2/5 model=Doctor-Shotgun/TinyLlama-1.1B-32k
255
- INFO:__main__:loading model=3/5 model=Tensoic/TinyLlama-1.1B-3T-openhermes
256
- INFO:__main__:loading model=4/5 model=Josephgflowers/TinyLlama-3T-Cinder-v1.3
257
  INFO:__main__:model=model.embed_tokens.weight 201 shape=torch.Size([32000, 2048]) dtype=torch.float16 cpu raio=0.5 contig=True norm=0.68
258
  INFO:__main__:model=model.layers.0.self_attn.q_proj.weight 201 shape=torch.Size([2048, 2048]) dtype=torch.float16 cpu raio=0.5 contig=True norm=0.68
259
  INFO:__main__:model=model.layers.0.self_attn.k_proj.weight 201 shape=torch.Size([256, 2048]) dtype=torch.float16 cpu raio=0.5 contig=True norm=0.68
@@ -455,213 +490,42 @@ INFO:__main__:model=model.layers.21.input_layernorm.weight 201 shape=torch.Size(
455
  INFO:__main__:model=model.layers.21.post_attention_layernorm.weight 201 shape=torch.Size([2048]) dtype=torch.float16 cpu raio=0.5 contig=True norm=0.68
456
  INFO:__main__:model=model.norm.weight 201 shape=torch.Size([2048]) dtype=torch.float16 cpu raio=0.5 contig=True norm=0.68
457
  INFO:__main__:model=lm_head.weight 201 shape=torch.Size([32000, 2048]) dtype=torch.float16 cpu raio=0.5 contig=True norm=0.68
458
- INFO:__main__:{} - done merge saving to file: matlok/tinyllama-cinder-openhermes-32k
459
- config.json: 100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 724/724 [00:00<00:00, 6.15MB/s]
460
- model.safetensors: 100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 2.20G/2.20G [00:57<00:00, 38.0MB/s]
461
- generation_config.json: 100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 133/133 [00:00<00:00, 1.82MB/s]
462
  INFO:__main__:loading newly-created file: matlok/tinyllama-cinder-openhermes-32k
463
- INFO:__main__:model keys=201
464
- INFO:__main__: 0 model.embed_tokens.weight shape=torch.Size([32000, 2048]) type=torch.float16 dev=cpu contig=True
465
- INFO:__main__: 1 model.layers.0.self_attn.q_proj.weight shape=torch.Size([2048, 2048]) type=torch.float16 dev=cpu contig=True
466
- INFO:__main__: 2 model.layers.0.self_attn.k_proj.weight shape=torch.Size([256, 2048]) type=torch.float16 dev=cpu contig=True
467
- INFO:__main__: 3 model.layers.0.self_attn.v_proj.weight shape=torch.Size([256, 2048]) type=torch.float16 dev=cpu contig=True
468
- INFO:__main__: 4 model.layers.0.self_attn.o_proj.weight shape=torch.Size([2048, 2048]) type=torch.float16 dev=cpu contig=True
469
- INFO:__main__: 5 model.layers.0.mlp.gate_proj.weight shape=torch.Size([5632, 2048]) type=torch.float16 dev=cpu contig=True
470
- INFO:__main__: 6 model.layers.0.mlp.up_proj.weight shape=torch.Size([5632, 2048]) type=torch.float16 dev=cpu contig=True
471
- INFO:__main__: 7 model.layers.0.mlp.down_proj.weight shape=torch.Size([2048, 5632]) type=torch.float16 dev=cpu contig=True
472
- INFO:__main__: 8 model.layers.0.input_layernorm.weight shape=torch.Size([2048]) type=torch.float16 dev=cpu contig=True
473
- INFO:__main__: 9 model.layers.0.post_attention_layernorm.weight shape=torch.Size([2048]) type=torch.float16 dev=cpu contig=True
474
- INFO:__main__: 10 model.layers.1.self_attn.q_proj.weight shape=torch.Size([2048, 2048]) type=torch.float16 dev=cpu contig=True
475
- INFO:__main__: 11 model.layers.1.self_attn.k_proj.weight shape=torch.Size([256, 2048]) type=torch.float16 dev=cpu contig=True
476
- INFO:__main__: 12 model.layers.1.self_attn.v_proj.weight shape=torch.Size([256, 2048]) type=torch.float16 dev=cpu contig=True
477
- INFO:__main__: 13 model.layers.1.self_attn.o_proj.weight shape=torch.Size([2048, 2048]) type=torch.float16 dev=cpu contig=True
478
- INFO:__main__: 14 model.layers.1.mlp.gate_proj.weight shape=torch.Size([5632, 2048]) type=torch.float16 dev=cpu contig=True
479
- INFO:__main__: 15 model.layers.1.mlp.up_proj.weight shape=torch.Size([5632, 2048]) type=torch.float16 dev=cpu contig=True
480
- INFO:__main__: 16 model.layers.1.mlp.down_proj.weight shape=torch.Size([2048, 5632]) type=torch.float16 dev=cpu contig=True
481
- INFO:__main__: 17 model.layers.1.input_layernorm.weight shape=torch.Size([2048]) type=torch.float16 dev=cpu contig=True
482
- INFO:__main__: 18 model.layers.1.post_attention_layernorm.weight shape=torch.Size([2048]) type=torch.float16 dev=cpu contig=True
483
- INFO:__main__: 19 model.layers.2.self_attn.q_proj.weight shape=torch.Size([2048, 2048]) type=torch.float16 dev=cpu contig=True
484
- INFO:__main__: 20 model.layers.2.self_attn.k_proj.weight shape=torch.Size([256, 2048]) type=torch.float16 dev=cpu contig=True
485
- INFO:__main__: 21 model.layers.2.self_attn.v_proj.weight shape=torch.Size([256, 2048]) type=torch.float16 dev=cpu contig=True
486
- INFO:__main__: 22 model.layers.2.self_attn.o_proj.weight shape=torch.Size([2048, 2048]) type=torch.float16 dev=cpu contig=True
487
- INFO:__main__: 23 model.layers.2.mlp.gate_proj.weight shape=torch.Size([5632, 2048]) type=torch.float16 dev=cpu contig=True
488
- INFO:__main__: 24 model.layers.2.mlp.up_proj.weight shape=torch.Size([5632, 2048]) type=torch.float16 dev=cpu contig=True
489
- INFO:__main__: 25 model.layers.2.mlp.down_proj.weight shape=torch.Size([2048, 5632]) type=torch.float16 dev=cpu contig=True
490
- INFO:__main__: 26 model.layers.2.input_layernorm.weight shape=torch.Size([2048]) type=torch.float16 dev=cpu contig=True
491
- INFO:__main__: 27 model.layers.2.post_attention_layernorm.weight shape=torch.Size([2048]) type=torch.float16 dev=cpu contig=True
492
- INFO:__main__: 28 model.layers.3.self_attn.q_proj.weight shape=torch.Size([2048, 2048]) type=torch.float16 dev=cpu contig=True
493
- INFO:__main__: 29 model.layers.3.self_attn.k_proj.weight shape=torch.Size([256, 2048]) type=torch.float16 dev=cpu contig=True
494
- INFO:__main__: 30 model.layers.3.self_attn.v_proj.weight shape=torch.Size([256, 2048]) type=torch.float16 dev=cpu contig=True
495
- INFO:__main__: 31 model.layers.3.self_attn.o_proj.weight shape=torch.Size([2048, 2048]) type=torch.float16 dev=cpu contig=True
496
- INFO:__main__: 32 model.layers.3.mlp.gate_proj.weight shape=torch.Size([5632, 2048]) type=torch.float16 dev=cpu contig=True
497
- INFO:__main__: 33 model.layers.3.mlp.up_proj.weight shape=torch.Size([5632, 2048]) type=torch.float16 dev=cpu contig=True
498
- INFO:__main__: 34 model.layers.3.mlp.down_proj.weight shape=torch.Size([2048, 5632]) type=torch.float16 dev=cpu contig=True
499
- INFO:__main__: 35 model.layers.3.input_layernorm.weight shape=torch.Size([2048]) type=torch.float16 dev=cpu contig=True
500
- INFO:__main__: 36 model.layers.3.post_attention_layernorm.weight shape=torch.Size([2048]) type=torch.float16 dev=cpu contig=True
501
- INFO:__main__: 37 model.layers.4.self_attn.q_proj.weight shape=torch.Size([2048, 2048]) type=torch.float16 dev=cpu contig=True
502
- INFO:__main__: 38 model.layers.4.self_attn.k_proj.weight shape=torch.Size([256, 2048]) type=torch.float16 dev=cpu contig=True
503
- INFO:__main__: 39 model.layers.4.self_attn.v_proj.weight shape=torch.Size([256, 2048]) type=torch.float16 dev=cpu contig=True
504
- INFO:__main__: 40 model.layers.4.self_attn.o_proj.weight shape=torch.Size([2048, 2048]) type=torch.float16 dev=cpu contig=True
505
- INFO:__main__: 41 model.layers.4.mlp.gate_proj.weight shape=torch.Size([5632, 2048]) type=torch.float16 dev=cpu contig=True
506
- INFO:__main__: 42 model.layers.4.mlp.up_proj.weight shape=torch.Size([5632, 2048]) type=torch.float16 dev=cpu contig=True
507
- INFO:__main__: 43 model.layers.4.mlp.down_proj.weight shape=torch.Size([2048, 5632]) type=torch.float16 dev=cpu contig=True
508
- INFO:__main__: 44 model.layers.4.input_layernorm.weight shape=torch.Size([2048]) type=torch.float16 dev=cpu contig=True
509
- INFO:__main__: 45 model.layers.4.post_attention_layernorm.weight shape=torch.Size([2048]) type=torch.float16 dev=cpu contig=True
510
- INFO:__main__: 46 model.layers.5.self_attn.q_proj.weight shape=torch.Size([2048, 2048]) type=torch.float16 dev=cpu contig=True
511
- INFO:__main__: 47 model.layers.5.self_attn.k_proj.weight shape=torch.Size([256, 2048]) type=torch.float16 dev=cpu contig=True
512
- INFO:__main__: 48 model.layers.5.self_attn.v_proj.weight shape=torch.Size([256, 2048]) type=torch.float16 dev=cpu contig=True
513
- INFO:__main__: 49 model.layers.5.self_attn.o_proj.weight shape=torch.Size([2048, 2048]) type=torch.float16 dev=cpu contig=True
514
- INFO:__main__: 50 model.layers.5.mlp.gate_proj.weight shape=torch.Size([5632, 2048]) type=torch.float16 dev=cpu contig=True
515
- INFO:__main__: 51 model.layers.5.mlp.up_proj.weight shape=torch.Size([5632, 2048]) type=torch.float16 dev=cpu contig=True
516
- INFO:__main__: 52 model.layers.5.mlp.down_proj.weight shape=torch.Size([2048, 5632]) type=torch.float16 dev=cpu contig=True
517
- INFO:__main__: 53 model.layers.5.input_layernorm.weight shape=torch.Size([2048]) type=torch.float16 dev=cpu contig=True
518
- INFO:__main__: 54 model.layers.5.post_attention_layernorm.weight shape=torch.Size([2048]) type=torch.float16 dev=cpu contig=True
519
- INFO:__main__: 55 model.layers.6.self_attn.q_proj.weight shape=torch.Size([2048, 2048]) type=torch.float16 dev=cpu contig=True
520
- INFO:__main__: 56 model.layers.6.self_attn.k_proj.weight shape=torch.Size([256, 2048]) type=torch.float16 dev=cpu contig=True
521
- INFO:__main__: 57 model.layers.6.self_attn.v_proj.weight shape=torch.Size([256, 2048]) type=torch.float16 dev=cpu contig=True
522
- INFO:__main__: 58 model.layers.6.self_attn.o_proj.weight shape=torch.Size([2048, 2048]) type=torch.float16 dev=cpu contig=True
523
- INFO:__main__: 59 model.layers.6.mlp.gate_proj.weight shape=torch.Size([5632, 2048]) type=torch.float16 dev=cpu contig=True
524
- INFO:__main__: 60 model.layers.6.mlp.up_proj.weight shape=torch.Size([5632, 2048]) type=torch.float16 dev=cpu contig=True
525
- INFO:__main__: 61 model.layers.6.mlp.down_proj.weight shape=torch.Size([2048, 5632]) type=torch.float16 dev=cpu contig=True
526
- INFO:__main__: 62 model.layers.6.input_layernorm.weight shape=torch.Size([2048]) type=torch.float16 dev=cpu contig=True
527
- INFO:__main__: 63 model.layers.6.post_attention_layernorm.weight shape=torch.Size([2048]) type=torch.float16 dev=cpu contig=True
528
- INFO:__main__: 64 model.layers.7.self_attn.q_proj.weight shape=torch.Size([2048, 2048]) type=torch.float16 dev=cpu contig=True
529
- INFO:__main__: 65 model.layers.7.self_attn.k_proj.weight shape=torch.Size([256, 2048]) type=torch.float16 dev=cpu contig=True
530
- INFO:__main__: 66 model.layers.7.self_attn.v_proj.weight shape=torch.Size([256, 2048]) type=torch.float16 dev=cpu contig=True
531
- INFO:__main__: 67 model.layers.7.self_attn.o_proj.weight shape=torch.Size([2048, 2048]) type=torch.float16 dev=cpu contig=True
532
- INFO:__main__: 68 model.layers.7.mlp.gate_proj.weight shape=torch.Size([5632, 2048]) type=torch.float16 dev=cpu contig=True
533
- INFO:__main__: 69 model.layers.7.mlp.up_proj.weight shape=torch.Size([5632, 2048]) type=torch.float16 dev=cpu contig=True
534
- INFO:__main__: 70 model.layers.7.mlp.down_proj.weight shape=torch.Size([2048, 5632]) type=torch.float16 dev=cpu contig=True
535
- INFO:__main__: 71 model.layers.7.input_layernorm.weight shape=torch.Size([2048]) type=torch.float16 dev=cpu contig=True
536
- INFO:__main__: 72 model.layers.7.post_attention_layernorm.weight shape=torch.Size([2048]) type=torch.float16 dev=cpu contig=True
537
- INFO:__main__: 73 model.layers.8.self_attn.q_proj.weight shape=torch.Size([2048, 2048]) type=torch.float16 dev=cpu contig=True
538
- INFO:__main__: 74 model.layers.8.self_attn.k_proj.weight shape=torch.Size([256, 2048]) type=torch.float16 dev=cpu contig=True
539
- INFO:__main__: 75 model.layers.8.self_attn.v_proj.weight shape=torch.Size([256, 2048]) type=torch.float16 dev=cpu contig=True
540
- INFO:__main__: 76 model.layers.8.self_attn.o_proj.weight shape=torch.Size([2048, 2048]) type=torch.float16 dev=cpu contig=True
541
- INFO:__main__: 77 model.layers.8.mlp.gate_proj.weight shape=torch.Size([5632, 2048]) type=torch.float16 dev=cpu contig=True
542
- INFO:__main__: 78 model.layers.8.mlp.up_proj.weight shape=torch.Size([5632, 2048]) type=torch.float16 dev=cpu contig=True
543
- INFO:__main__: 79 model.layers.8.mlp.down_proj.weight shape=torch.Size([2048, 5632]) type=torch.float16 dev=cpu contig=True
544
- INFO:__main__: 80 model.layers.8.input_layernorm.weight shape=torch.Size([2048]) type=torch.float16 dev=cpu contig=True
545
- INFO:__main__: 81 model.layers.8.post_attention_layernorm.weight shape=torch.Size([2048]) type=torch.float16 dev=cpu contig=True
546
- INFO:__main__: 82 model.layers.9.self_attn.q_proj.weight shape=torch.Size([2048, 2048]) type=torch.float16 dev=cpu contig=True
547
- INFO:__main__: 83 model.layers.9.self_attn.k_proj.weight shape=torch.Size([256, 2048]) type=torch.float16 dev=cpu contig=True
548
- INFO:__main__: 84 model.layers.9.self_attn.v_proj.weight shape=torch.Size([256, 2048]) type=torch.float16 dev=cpu contig=True
549
- INFO:__main__: 85 model.layers.9.self_attn.o_proj.weight shape=torch.Size([2048, 2048]) type=torch.float16 dev=cpu contig=True
550
- INFO:__main__: 86 model.layers.9.mlp.gate_proj.weight shape=torch.Size([5632, 2048]) type=torch.float16 dev=cpu contig=True
551
- INFO:__main__: 87 model.layers.9.mlp.up_proj.weight shape=torch.Size([5632, 2048]) type=torch.float16 dev=cpu contig=True
552
- INFO:__main__: 88 model.layers.9.mlp.down_proj.weight shape=torch.Size([2048, 5632]) type=torch.float16 dev=cpu contig=True
553
- INFO:__main__: 89 model.layers.9.input_layernorm.weight shape=torch.Size([2048]) type=torch.float16 dev=cpu contig=True
554
- INFO:__main__: 90 model.layers.9.post_attention_layernorm.weight shape=torch.Size([2048]) type=torch.float16 dev=cpu contig=True
555
- INFO:__main__: 91 model.layers.10.self_attn.q_proj.weight shape=torch.Size([2048, 2048]) type=torch.float16 dev=cpu contig=True
556
- INFO:__main__: 92 model.layers.10.self_attn.k_proj.weight shape=torch.Size([256, 2048]) type=torch.float16 dev=cpu contig=True
557
- INFO:__main__: 93 model.layers.10.self_attn.v_proj.weight shape=torch.Size([256, 2048]) type=torch.float16 dev=cpu contig=True
558
- INFO:__main__: 94 model.layers.10.self_attn.o_proj.weight shape=torch.Size([2048, 2048]) type=torch.float16 dev=cpu contig=True
559
- INFO:__main__: 95 model.layers.10.mlp.gate_proj.weight shape=torch.Size([5632, 2048]) type=torch.float16 dev=cpu contig=True
560
- INFO:__main__: 96 model.layers.10.mlp.up_proj.weight shape=torch.Size([5632, 2048]) type=torch.float16 dev=cpu contig=True
561
- INFO:__main__: 97 model.layers.10.mlp.down_proj.weight shape=torch.Size([2048, 5632]) type=torch.float16 dev=cpu contig=True
562
- INFO:__main__: 98 model.layers.10.input_layernorm.weight shape=torch.Size([2048]) type=torch.float16 dev=cpu contig=True
563
- INFO:__main__: 99 model.layers.10.post_attention_layernorm.weight shape=torch.Size([2048]) type=torch.float16 dev=cpu contig=True
564
- INFO:__main__:100 model.layers.11.self_attn.q_proj.weight shape=torch.Size([2048, 2048]) type=torch.float16 dev=cpu contig=True
565
- INFO:__main__:101 model.layers.11.self_attn.k_proj.weight shape=torch.Size([256, 2048]) type=torch.float16 dev=cpu contig=True
566
- INFO:__main__:102 model.layers.11.self_attn.v_proj.weight shape=torch.Size([256, 2048]) type=torch.float16 dev=cpu contig=True
567
- INFO:__main__:103 model.layers.11.self_attn.o_proj.weight shape=torch.Size([2048, 2048]) type=torch.float16 dev=cpu contig=True
568
- INFO:__main__:104 model.layers.11.mlp.gate_proj.weight shape=torch.Size([5632, 2048]) type=torch.float16 dev=cpu contig=True
569
- INFO:__main__:105 model.layers.11.mlp.up_proj.weight shape=torch.Size([5632, 2048]) type=torch.float16 dev=cpu contig=True
570
- INFO:__main__:106 model.layers.11.mlp.down_proj.weight shape=torch.Size([2048, 5632]) type=torch.float16 dev=cpu contig=True
571
- INFO:__main__:107 model.layers.11.input_layernorm.weight shape=torch.Size([2048]) type=torch.float16 dev=cpu contig=True
572
- INFO:__main__:108 model.layers.11.post_attention_layernorm.weight shape=torch.Size([2048]) type=torch.float16 dev=cpu contig=True
573
- INFO:__main__:109 model.layers.12.self_attn.q_proj.weight shape=torch.Size([2048, 2048]) type=torch.float16 dev=cpu contig=True
574
- INFO:__main__:110 model.layers.12.self_attn.k_proj.weight shape=torch.Size([256, 2048]) type=torch.float16 dev=cpu contig=True
575
- INFO:__main__:111 model.layers.12.self_attn.v_proj.weight shape=torch.Size([256, 2048]) type=torch.float16 dev=cpu contig=True
576
- INFO:__main__:112 model.layers.12.self_attn.o_proj.weight shape=torch.Size([2048, 2048]) type=torch.float16 dev=cpu contig=True
577
- INFO:__main__:113 model.layers.12.mlp.gate_proj.weight shape=torch.Size([5632, 2048]) type=torch.float16 dev=cpu contig=True
578
- INFO:__main__:114 model.layers.12.mlp.up_proj.weight shape=torch.Size([5632, 2048]) type=torch.float16 dev=cpu contig=True
579
- INFO:__main__:115 model.layers.12.mlp.down_proj.weight shape=torch.Size([2048, 5632]) type=torch.float16 dev=cpu contig=True
580
- INFO:__main__:116 model.layers.12.input_layernorm.weight shape=torch.Size([2048]) type=torch.float16 dev=cpu contig=True
581
- INFO:__main__:117 model.layers.12.post_attention_layernorm.weight shape=torch.Size([2048]) type=torch.float16 dev=cpu contig=True
582
- INFO:__main__:118 model.layers.13.self_attn.q_proj.weight shape=torch.Size([2048, 2048]) type=torch.float16 dev=cpu contig=True
583
- INFO:__main__:119 model.layers.13.self_attn.k_proj.weight shape=torch.Size([256, 2048]) type=torch.float16 dev=cpu contig=True
584
- INFO:__main__:120 model.layers.13.self_attn.v_proj.weight shape=torch.Size([256, 2048]) type=torch.float16 dev=cpu contig=True
585
- INFO:__main__:121 model.layers.13.self_attn.o_proj.weight shape=torch.Size([2048, 2048]) type=torch.float16 dev=cpu contig=True
586
- INFO:__main__:122 model.layers.13.mlp.gate_proj.weight shape=torch.Size([5632, 2048]) type=torch.float16 dev=cpu contig=True
587
- INFO:__main__:123 model.layers.13.mlp.up_proj.weight shape=torch.Size([5632, 2048]) type=torch.float16 dev=cpu contig=True
588
- INFO:__main__:124 model.layers.13.mlp.down_proj.weight shape=torch.Size([2048, 5632]) type=torch.float16 dev=cpu contig=True
589
- INFO:__main__:125 model.layers.13.input_layernorm.weight shape=torch.Size([2048]) type=torch.float16 dev=cpu contig=True
590
- INFO:__main__:126 model.layers.13.post_attention_layernorm.weight shape=torch.Size([2048]) type=torch.float16 dev=cpu contig=True
591
- INFO:__main__:127 model.layers.14.self_attn.q_proj.weight shape=torch.Size([2048, 2048]) type=torch.float16 dev=cpu contig=True
592
- INFO:__main__:128 model.layers.14.self_attn.k_proj.weight shape=torch.Size([256, 2048]) type=torch.float16 dev=cpu contig=True
593
- INFO:__main__:129 model.layers.14.self_attn.v_proj.weight shape=torch.Size([256, 2048]) type=torch.float16 dev=cpu contig=True
594
- INFO:__main__:130 model.layers.14.self_attn.o_proj.weight shape=torch.Size([2048, 2048]) type=torch.float16 dev=cpu contig=True
595
- INFO:__main__:131 model.layers.14.mlp.gate_proj.weight shape=torch.Size([5632, 2048]) type=torch.float16 dev=cpu contig=True
596
- INFO:__main__:132 model.layers.14.mlp.up_proj.weight shape=torch.Size([5632, 2048]) type=torch.float16 dev=cpu contig=True
597
- INFO:__main__:133 model.layers.14.mlp.down_proj.weight shape=torch.Size([2048, 5632]) type=torch.float16 dev=cpu contig=True
598
- INFO:__main__:134 model.layers.14.input_layernorm.weight shape=torch.Size([2048]) type=torch.float16 dev=cpu contig=True
599
- INFO:__main__:135 model.layers.14.post_attention_layernorm.weight shape=torch.Size([2048]) type=torch.float16 dev=cpu contig=True
600
- INFO:__main__:136 model.layers.15.self_attn.q_proj.weight shape=torch.Size([2048, 2048]) type=torch.float16 dev=cpu contig=True
601
- INFO:__main__:137 model.layers.15.self_attn.k_proj.weight shape=torch.Size([256, 2048]) type=torch.float16 dev=cpu contig=True
602
- INFO:__main__:138 model.layers.15.self_attn.v_proj.weight shape=torch.Size([256, 2048]) type=torch.float16 dev=cpu contig=True
603
- INFO:__main__:139 model.layers.15.self_attn.o_proj.weight shape=torch.Size([2048, 2048]) type=torch.float16 dev=cpu contig=True
604
- INFO:__main__:140 model.layers.15.mlp.gate_proj.weight shape=torch.Size([5632, 2048]) type=torch.float16 dev=cpu contig=True
605
- INFO:__main__:141 model.layers.15.mlp.up_proj.weight shape=torch.Size([5632, 2048]) type=torch.float16 dev=cpu contig=True
606
- INFO:__main__:142 model.layers.15.mlp.down_proj.weight shape=torch.Size([2048, 5632]) type=torch.float16 dev=cpu contig=True
607
- INFO:__main__:143 model.layers.15.input_layernorm.weight shape=torch.Size([2048]) type=torch.float16 dev=cpu contig=True
608
- INFO:__main__:144 model.layers.15.post_attention_layernorm.weight shape=torch.Size([2048]) type=torch.float16 dev=cpu contig=True
609
- INFO:__main__:145 model.layers.16.self_attn.q_proj.weight shape=torch.Size([2048, 2048]) type=torch.float16 dev=cpu contig=True
610
- INFO:__main__:146 model.layers.16.self_attn.k_proj.weight shape=torch.Size([256, 2048]) type=torch.float16 dev=cpu contig=True
611
- INFO:__main__:147 model.layers.16.self_attn.v_proj.weight shape=torch.Size([256, 2048]) type=torch.float16 dev=cpu contig=True
612
- INFO:__main__:148 model.layers.16.self_attn.o_proj.weight shape=torch.Size([2048, 2048]) type=torch.float16 dev=cpu contig=True
613
- INFO:__main__:149 model.layers.16.mlp.gate_proj.weight shape=torch.Size([5632, 2048]) type=torch.float16 dev=cpu contig=True
614
- INFO:__main__:150 model.layers.16.mlp.up_proj.weight shape=torch.Size([5632, 2048]) type=torch.float16 dev=cpu contig=True
615
- INFO:__main__:151 model.layers.16.mlp.down_proj.weight shape=torch.Size([2048, 5632]) type=torch.float16 dev=cpu contig=True
616
- INFO:__main__:152 model.layers.16.input_layernorm.weight shape=torch.Size([2048]) type=torch.float16 dev=cpu contig=True
617
- INFO:__main__:153 model.layers.16.post_attention_layernorm.weight shape=torch.Size([2048]) type=torch.float16 dev=cpu contig=True
618
- INFO:__main__:154 model.layers.17.self_attn.q_proj.weight shape=torch.Size([2048, 2048]) type=torch.float16 dev=cpu contig=True
619
- INFO:__main__:155 model.layers.17.self_attn.k_proj.weight shape=torch.Size([256, 2048]) type=torch.float16 dev=cpu contig=True
620
- INFO:__main__:156 model.layers.17.self_attn.v_proj.weight shape=torch.Size([256, 2048]) type=torch.float16 dev=cpu contig=True
621
- INFO:__main__:157 model.layers.17.self_attn.o_proj.weight shape=torch.Size([2048, 2048]) type=torch.float16 dev=cpu contig=True
622
- INFO:__main__:158 model.layers.17.mlp.gate_proj.weight shape=torch.Size([5632, 2048]) type=torch.float16 dev=cpu contig=True
623
- INFO:__main__:159 model.layers.17.mlp.up_proj.weight shape=torch.Size([5632, 2048]) type=torch.float16 dev=cpu contig=True
624
- INFO:__main__:160 model.layers.17.mlp.down_proj.weight shape=torch.Size([2048, 5632]) type=torch.float16 dev=cpu contig=True
625
- INFO:__main__:161 model.layers.17.input_layernorm.weight shape=torch.Size([2048]) type=torch.float16 dev=cpu contig=True
626
- INFO:__main__:162 model.layers.17.post_attention_layernorm.weight shape=torch.Size([2048]) type=torch.float16 dev=cpu contig=True
627
- INFO:__main__:163 model.layers.18.self_attn.q_proj.weight shape=torch.Size([2048, 2048]) type=torch.float16 dev=cpu contig=True
628
- INFO:__main__:164 model.layers.18.self_attn.k_proj.weight shape=torch.Size([256, 2048]) type=torch.float16 dev=cpu contig=True
629
- INFO:__main__:165 model.layers.18.self_attn.v_proj.weight shape=torch.Size([256, 2048]) type=torch.float16 dev=cpu contig=True
630
- INFO:__main__:166 model.layers.18.self_attn.o_proj.weight shape=torch.Size([2048, 2048]) type=torch.float16 dev=cpu contig=True
631
- INFO:__main__:167 model.layers.18.mlp.gate_proj.weight shape=torch.Size([5632, 2048]) type=torch.float16 dev=cpu contig=True
632
- INFO:__main__:168 model.layers.18.mlp.up_proj.weight shape=torch.Size([5632, 2048]) type=torch.float16 dev=cpu contig=True
633
- INFO:__main__:169 model.layers.18.mlp.down_proj.weight shape=torch.Size([2048, 5632]) type=torch.float16 dev=cpu contig=True
634
- INFO:__main__:170 model.layers.18.input_layernorm.weight shape=torch.Size([2048]) type=torch.float16 dev=cpu contig=True
635
- INFO:__main__:171 model.layers.18.post_attention_layernorm.weight shape=torch.Size([2048]) type=torch.float16 dev=cpu contig=True
636
- INFO:__main__:172 model.layers.19.self_attn.q_proj.weight shape=torch.Size([2048, 2048]) type=torch.float16 dev=cpu contig=True
637
- INFO:__main__:173 model.layers.19.self_attn.k_proj.weight shape=torch.Size([256, 2048]) type=torch.float16 dev=cpu contig=True
638
- INFO:__main__:174 model.layers.19.self_attn.v_proj.weight shape=torch.Size([256, 2048]) type=torch.float16 dev=cpu contig=True
639
- INFO:__main__:175 model.layers.19.self_attn.o_proj.weight shape=torch.Size([2048, 2048]) type=torch.float16 dev=cpu contig=True
640
- INFO:__main__:176 model.layers.19.mlp.gate_proj.weight shape=torch.Size([5632, 2048]) type=torch.float16 dev=cpu contig=True
641
- INFO:__main__:177 model.layers.19.mlp.up_proj.weight shape=torch.Size([5632, 2048]) type=torch.float16 dev=cpu contig=True
642
- INFO:__main__:178 model.layers.19.mlp.down_proj.weight shape=torch.Size([2048, 5632]) type=torch.float16 dev=cpu contig=True
643
- INFO:__main__:179 model.layers.19.input_layernorm.weight shape=torch.Size([2048]) type=torch.float16 dev=cpu contig=True
644
- INFO:__main__:180 model.layers.19.post_attention_layernorm.weight shape=torch.Size([2048]) type=torch.float16 dev=cpu contig=True
645
- INFO:__main__:181 model.layers.20.self_attn.q_proj.weight shape=torch.Size([2048, 2048]) type=torch.float16 dev=cpu contig=True
646
- INFO:__main__:182 model.layers.20.self_attn.k_proj.weight shape=torch.Size([256, 2048]) type=torch.float16 dev=cpu contig=True
647
- INFO:__main__:183 model.layers.20.self_attn.v_proj.weight shape=torch.Size([256, 2048]) type=torch.float16 dev=cpu contig=True
648
- INFO:__main__:184 model.layers.20.self_attn.o_proj.weight shape=torch.Size([2048, 2048]) type=torch.float16 dev=cpu contig=True
649
- INFO:__main__:185 model.layers.20.mlp.gate_proj.weight shape=torch.Size([5632, 2048]) type=torch.float16 dev=cpu contig=True
650
- INFO:__main__:186 model.layers.20.mlp.up_proj.weight shape=torch.Size([5632, 2048]) type=torch.float16 dev=cpu contig=True
651
- INFO:__main__:187 model.layers.20.mlp.down_proj.weight shape=torch.Size([2048, 5632]) type=torch.float16 dev=cpu contig=True
652
- INFO:__main__:188 model.layers.20.input_layernorm.weight shape=torch.Size([2048]) type=torch.float16 dev=cpu contig=True
653
- INFO:__main__:189 model.layers.20.post_attention_layernorm.weight shape=torch.Size([2048]) type=torch.float16 dev=cpu contig=True
654
- INFO:__main__:190 model.layers.21.self_attn.q_proj.weight shape=torch.Size([2048, 2048]) type=torch.float16 dev=cpu contig=True
655
- INFO:__main__:191 model.layers.21.self_attn.k_proj.weight shape=torch.Size([256, 2048]) type=torch.float16 dev=cpu contig=True
656
- INFO:__main__:192 model.layers.21.self_attn.v_proj.weight shape=torch.Size([256, 2048]) type=torch.float16 dev=cpu contig=True
657
- INFO:__main__:193 model.layers.21.self_attn.o_proj.weight shape=torch.Size([2048, 2048]) type=torch.float16 dev=cpu contig=True
658
- INFO:__main__:194 model.layers.21.mlp.gate_proj.weight shape=torch.Size([5632, 2048]) type=torch.float16 dev=cpu contig=True
659
- INFO:__main__:195 model.layers.21.mlp.up_proj.weight shape=torch.Size([5632, 2048]) type=torch.float16 dev=cpu contig=True
660
- INFO:__main__:196 model.layers.21.mlp.down_proj.weight shape=torch.Size([2048, 5632]) type=torch.float16 dev=cpu contig=True
661
- INFO:__main__:197 model.layers.21.input_layernorm.weight shape=torch.Size([2048]) type=torch.float16 dev=cpu contig=True
662
- INFO:__main__:198 model.layers.21.post_attention_layernorm.weight shape=torch.Size([2048]) type=torch.float16 dev=cpu contig=True
663
- INFO:__main__:199 model.norm.weight shape=torch.Size([2048]) type=torch.float16 dev=cpu contig=True
664
- INFO:__main__:200 lm_head.weight shape=torch.Size([32000, 2048]) type=torch.float16 dev=cpu contig=True
665
  INFO:__main__:done loading new model: LlamaForCausalLM(
666
  (model): LlamaModel(
667
  (embed_tokens): Embedding(32000, 2048)
@@ -689,9 +553,9 @@ INFO:__main__:done loading new model: LlamaForCausalLM(
689
  (lm_head): Linear(in_features=2048, out_features=32000, bias=False)
690
  ) file: matlok/tinyllama-cinder-openhermes-32k
691
 
692
- real 1m18.070s
693
- user 2m10.228s
694
- sys 0m14.040s
695
  ```
696
 
697
  Note: code sample above was modified from [this very helpful GitHub gist](https://gist.github.com/maldevide/08829eada04ad9bd78e46c1a3787d42b)
 
2
  license: unknown
3
  ---
4
 
5
+ ## Merging AI Models like Lego Blocks
6
 
7
  This model was merged from the following HuggingFace TinyLlama models using ties:
8
 
 
20
 
21
  ## How do I generate my own model merges?
22
 
23
+ Here's [the standalone Python script](https://huggingface.co/matlok/tinyllama-cinder-openhermes-32k/blob/main/run-tiny-merge.py) we used, with the logs below:
24
 
25
  ```python3
26
+ #!/usr/bin/env python3
27
+
28
  import transformers
29
  import torch
30
  import logging
 
34
  import re
35
  from typing import Dict, Tuple, List
36
 
37
+ # If you want to fine-tune, here's an example Unsloth fine-tuning guide for:
38
+ # Alpaca + TinyLlama + RoPE Scaling full example.ipynb
39
+ # https://colab.research.google.com/drive/1AZghoNBQaMDgWJpi4RbffGM1h6raLUj9?usp=sharing#scrollTo=LjY75GoYUCB8
40
+
41
+ # code here was refactored from gist: https://gist.github.com/maldevide/08829eada04ad9bd78e46c1a3787d42b
42
+
43
  logging.basicConfig(level=logging.INFO)
44
  log = logging.getLogger(__name__)
45
 
 
57
  num_models = len(models)
58
  for midx, model_path in enumerate(models):
59
  log.info(
60
+ f"loading model={midx + 1}/{num_models} "
61
  f"model={model_path} "
62
  )
63
  loaded_models.append(
 
84
 
85
  def run_text_test(
86
  model,
87
+ tokenizer_path,
88
  question: str,
89
+ device: str = "cuda",
90
  ):
91
  base_model = model.to(device)
92
  log.info(
93
+ f"loading tokenizer={tokenizer_path}"
94
  )
95
  tokenizer = transformers.AutoTokenizer.from_pretrained(
96
+ tokenizer_path,
97
+ torch_dtype=torch.float16,
98
+ )
99
 
100
  inputs = tokenizer(
101
  question,
102
  return_tensors="pt"
103
+ ).to(device)
104
  with torch.backends.cuda.sdp_kernel(
105
  enable_flash=True,
106
  enable_math=False,
107
  enable_mem_efficient=False
108
  ):
109
+ outputs = base_model.generate(
110
+ **inputs,
111
+ max_new_tokens=1000,
112
+ )
113
+ answer = tokenizer.decode(outputs[0], skip_special_tokens=True)
114
+ log.info(
115
+ "\n"
116
+ "----------"
117
+ f"tokenizer={tokenizer}\n "
118
+ f"question:\n{question}\n"
119
+ f"answer:\n{answer}\n"
120
+ "----------"
121
+ )
122
+ base_model = base_model.to("cpu")  # move the model off the gpu when done
123
 
124
 
125
  def get_layer_type(
 
164
  models[1].state_dict()[k],
165
  models[2].state_dict()[k],
166
  models[3].state_dict()[k],
167
+ models[4].state_dict()[k],
168
  ]
169
 
170
  # build a ratio
 
214
  # end of merge
215
 
216
  log.info(
217
+ f"done merge saving to file: {model_dst}"
218
  )
219
  out_model = (
220
  transformers.AutoModelForCausalLM.from_pretrained(
 
227
 
228
 
229
  def run():
230
+ question = (
231
+ "why is the sky blue?"
232
+ )
233
+ log.info(f"merging models and asking the question: {question}")
234
  model_src = "TinyLlama/TinyLlama-1.1B-intermediate-step-1431k-3T"
235
  model_dst = "matlok/tinyllama-cinder-openhermes-32k"
236
+ device = "cuda"
237
  config = {
238
  'torch_dtype': torch.float16,
239
  'low_cpu_mem_usage': False,
 
255
  model_dst,
256
  **config
257
  )
258
+ log.info(
259
+ f"loaded new model file: {model_dst} "
260
+ f"asking question: {question} "
261
+ )
262
+ run_text_test(
263
+ model=model,
264
+ tokenizer_path=model_src,
265
+ question=question,
266
+ device=device,
267
+ )
268
  log.info(f"done loading new model: {model} file: {model_dst}")
269
 
270
 
272
  run()
273
  ```
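The `real`/`user`/`sys` lines at the end of the logs suggest the script was run under `time`, e.g. `time python3 run-tiny-merge.py`.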
274
 
275
  ### Logs
276
 
277
Here are the logs from the code above:
 
281
  Set vram state to: NORMAL_VRAM
282
  Device: cuda:0 NVIDIA GeForce RTX 4070 Ti : native
283
  VAE dtype: torch.bfloat16
284
+ INFO:__main__:merging models and asking the question: why is the sky blue?
285
+ INFO:__main__:loading model=1/5 model=TinyLlama/TinyLlama-1.1B-intermediate-step-1431k-3T
286
+ INFO:__main__:loading model=2/5 model=Doctor-Shotgun/TinyLlama-1.1B-32k-Instruct
287
+ /d/venvs/dev/lib/python3.11/site-packages/torch/_utils.py:831: UserWarning: TypedStorage is deprecated. It will be removed in the future and UntypedStorage will be the only storage class. This should only matter to you if you are using storages directly. To access UntypedStorage directly, use tensor.untyped_storage() instead of tensor.storage()
288
+ return self.fget.__get__(instance, owner)()
289
+ INFO:__main__:loading model=3/5 model=Doctor-Shotgun/TinyLlama-1.1B-32k
290
+ INFO:__main__:loading model=4/5 model=Tensoic/TinyLlama-1.1B-3T-openhermes
291
+ INFO:__main__:loading model=5/5 model=Josephgflowers/TinyLlama-3T-Cinder-v1.3
292
  INFO:__main__:model=model.embed_tokens.weight 201 shape=torch.Size([32000, 2048]) dtype=torch.float16 cpu raio=0.5 contig=True norm=0.68
293
  INFO:__main__:model=model.layers.0.self_attn.q_proj.weight 201 shape=torch.Size([2048, 2048]) dtype=torch.float16 cpu raio=0.5 contig=True norm=0.68
294
  INFO:__main__:model=model.layers.0.self_attn.k_proj.weight 201 shape=torch.Size([256, 2048]) dtype=torch.float16 cpu raio=0.5 contig=True norm=0.68
 
490
  INFO:__main__:model=model.layers.21.post_attention_layernorm.weight 201 shape=torch.Size([2048]) dtype=torch.float16 cpu raio=0.5 contig=True norm=0.68
491
  INFO:__main__:model=model.norm.weight 201 shape=torch.Size([2048]) dtype=torch.float16 cpu raio=0.5 contig=True norm=0.68
492
  INFO:__main__:model=lm_head.weight 201 shape=torch.Size([32000, 2048]) dtype=torch.float16 cpu raio=0.5 contig=True norm=0.68
493
+ INFO:__main__:done merge saving to file: matlok/tinyllama-cinder-openhermes-32k
494
  INFO:__main__:loading newly-created file: matlok/tinyllama-cinder-openhermes-32k
495
+ INFO:__main__:loaded new model file: matlok/tinyllama-cinder-openhermes-32k asking question: why is the sky blue?
496
+ INFO:__main__:loading tokenizer=TinyLlama/TinyLlama-1.1B-intermediate-step-1431k-3T
497
+ Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
498
+ INFO:__main__:
499
+ ----------tokenizer=LlamaTokenizerFast(name_or_path='TinyLlama/TinyLlama-1.1B-intermediate-step-1431k-3T', vocab_size=32000, model_max_length=1000000000000000019884624838656, is_fast=True, padding_side='right', truncation_side='right', special_tokens={'bos_token': '<s>', 'eos_token': '</s>', 'unk_token': '<unk>'}, clean_up_tokenization_spaces=False), added_tokens_decoder={
500
+ 0: AddedToken("<unk>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
501
+ 1: AddedToken("<s>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
502
+ 2: AddedToken("</s>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
503
+ }
504
+ question:
505
+ why is the sky blue?
506
+ answer:
507
+ why is the sky blue?
508
+ The sky is blue because it is made up of the colors of the visible spectrum. The visible spectrum is a range of colors that can be seen with the naked eye. The colors in the visible spectrum are made up of light waves that are shorter than the wavelengths of the visible light. The shorter wavelengths of light are absorbed more easily by the atmosphere, which is why the sky is blue.
509
+ What is the color of the sky?
510
+ The color of the sky is blue. This is because the visible spectrum is made up of the colors of the blue and violet parts of the spectrum. The blue part of the spectrum is made up of light waves that are shorter than the wavelengths of the visible light. The violet part of the spectrum is made up of light waves that are longer than the wavelengths of the visible light.
511
+ What is the color of the sky in the winter?
512
+ The color of the sky in the winter is usually a deep blue. This is because the visible spectrum is made up of the colors of the blue and violet parts of the spectrum. The blue part of the spectrum is made up of light waves that are shorter than the wavelengths of the visible light. The violet part of the spectrum is made up of light waves that are longer than the wavelengths of the visible light.
513
+ What is the color of the sky in the summer?
514
+ The color of the sky in the summer is usually a bright yellow. This is because the visible spectrum is made up of the colors of the yellow and orange parts of the spectrum. The yellow part of the spectrum is made up of light waves that are shorter than the wavelengths of the visible light. The orange part of the spectrum is made up of light waves that are longer than the wavelengths of the visible light.
515
+ What is the color of the sky in the spring?
516
+ The color of the sky in the spring is usually a bright green. This is because the visible spectrum is made up of the colors of the green and blue parts of the spectrum. The green part of the spectrum is made up of light waves that are shorter than the wavelengths of the visible light. The blue part of the spectrum is made up of light waves that are longer than the wavelengths of the visible light.
517
+ What is the color of the sky in the fall?
518
+ The color of the sky in the fall is usually a deep red. This is because the visible spectrum is made up of the colors of the red and orange parts of the spectrum. The red part of the spectrum is made up of light waves that are shorter than the wavelengths of the visible light. The orange part of the spectrum is made up of light waves that are longer than the wavelengths of the visible light.
519
+ What is the color of the sky in the winter?
520
+ The color of the sky in the winter is usually a deep blue. This is because the visible spectrum is made up of the colors of the blue and violet parts of the spectrum. The blue part of the spectrum is made up of light waves that are shorter than the wavelengths of the visible light. The violet part of the spectrum is made up of light waves that are longer than the wavelengths of the visible light.
521
+ What is the color of the sky in the summer?
522
+ The color of the sky in the summer is usually a bright yellow. This is because the visible spectrum is made up of the colors of the yellow and orange parts of the spectrum. The yellow part of the spectrum is made up of light waves that are shorter than the wavelengths of the visible light. The orange part of the spectrum is made up of light waves that are longer than the wavelengths of the visible light.
523
+ What is the color of the sky in the spring?
524
+ The color of the sky in the spring is usually a bright green. This is because the visible spectrum is made up of the colors of the green and blue parts of the spectrum. The green part of the spectrum is made up of light waves that are shorter than the wavelengths of the visible light. The blue part of the spectrum is made up of light waves that are longer than the wavelengths of the visible light.
525
+ What is the color of the sky in the fall?
526
+ The color of the sky in the fall is usually a deep red. This is because the visible spectrum is made up of the colors of the red and orange parts of the spectrum. The red part of the spectrum is made up of light waves that are shorter than the wavelengths of the visible light. The orange part of the spectrum is made up of light waves that are longer than the wavelengths of the visible light.
527
+ What is the color of the
528
+ ----------
529
  INFO:__main__:done loading new model: LlamaForCausalLM(
530
  (model): LlamaModel(
531
  (embed_tokens): Embedding(32000, 2048)
 
553
  (lm_head): Linear(in_features=2048, out_features=32000, bias=False)
554
  ) file: matlok/tinyllama-cinder-openhermes-32k
555
 
556
+ real 0m49.612s
557
+ user 3m2.617s
558
+ sys 0m14.655s
559
  ```
560
 
561
  Note: code sample above was modified from [this very helpful GitHub gist](https://gist.github.com/maldevide/08829eada04ad9bd78e46c1a3787d42b)
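
If you just want to try the merged model rather than rebuild it, here's a minimal sketch. It assumes a CUDA device and, like `run_text_test()` above, loads the tokenizer from the base TinyLlama repo:

```python3
import torch
import transformers

# load the merged weights and the base TinyLlama tokenizer
model_id = "matlok/tinyllama-cinder-openhermes-32k"
tokenizer_id = "TinyLlama/TinyLlama-1.1B-intermediate-step-1431k-3T"
tokenizer = transformers.AutoTokenizer.from_pretrained(tokenizer_id)
model = transformers.AutoModelForCausalLM.from_pretrained(
    model_id,
    torch_dtype=torch.float16,
).to("cuda")

# ask the same question used in the merge script
inputs = tokenizer("why is the sky blue?", return_tensors="pt").to("cuda")
outputs = model.generate(**inputs, max_new_tokens=128)
print(tokenizer.decode(outputs[0], skip_special_tokens=True))
```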
model.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:bb7be5c697e99c1fbf4bdc5531632e881e8f625d38da3a228daa96ae90ac6452
3
  size 2200119664
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:cbdca20af4eed297d35d3ec8b116884a6c83c4f83109b7b9f7ffd37f71af04b2
3
  size 2200119664
run-tiny-merge.py ADDED
@@ -0,0 +1,252 @@
1
+ #!/usr/bin/env python3
2
+
3
+ import transformers
4
+ import torch
5
+ import logging
6
+ from ddare.merge import merge_tensors
7
+ from ddare.tensor import (
8
+ dare_ties_sparsification,
9
+ relative_norm,
10
+ divide_tensor_into_sets
11
+ )
12
+ from ddare.util import get_device
13
+ import re
14
+ from typing import Dict, Tuple, List
15
+
16
+ # If you want to fine-tune, here's an example Unsloth fine-tuning guide for:
17
+ # Alpaca + TinyLlama + RoPE Scaling full example.ipynb
18
+ # https://colab.research.google.com/drive/1AZghoNBQaMDgWJpi4RbffGM1h6raLUj9?usp=sharing
19
+
20
+ # code here was refactored from gist:
21
+ # https://gist.github.com/maldevide/08829eada04ad9bd78e46c1a3787d42b
22
+
23
+ logging.basicConfig(level=logging.INFO)
24
+ log = logging.getLogger(__name__)
25
+
26
+
27
+ def get_models(
28
+ models: List[str],
29
+ trust_remote_code: bool,
30
+ ):
31
+ config = {
32
+ 'torch_dtype': torch.float16,
33
+ 'low_cpu_mem_usage': False,
34
+ 'trust_remote_code': trust_remote_code,
35
+ }
36
+ loaded_models = []
37
+ num_models = len(models)
38
+ for midx, model_path in enumerate(models):
39
+ log.info(
40
+ f"loading model={midx + 1}/{num_models} "
41
+ f"model={model_path} "
42
+ )
43
+ loaded_models.append(
44
+ transformers.AutoModelForCausalLM.from_pretrained(
45
+ model_path,
46
+ **config
47
+ )
48
+ )
49
+ return loaded_models
50
+
51
+
52
+ def pm(
53
+ model,
54
+ ):
55
+ keys = model.state_dict().keys()
56
+ log.info(f"model keys={len(keys)}")
57
+ for i, k in enumerate(keys):
58
+ tensor = model.state_dict()[k]
59
+ log.info(
60
+ f"{i:3d} {k} shape={tensor.shape} "
61
+ f"type={tensor.dtype} dev={tensor.device} "
62
+ f"contig={tensor.is_contiguous()}")
63
+
64
+
65
+ def run_text_test(
66
+ model,
67
+ tokenizer_path,
68
+ question: str,
69
+ device: str = "cuda",
70
+ ):
71
+ base_model = model.to(device)
72
+ log.info(
73
+ f"loading tokenizer={tokenizer_path}"
74
+ )
75
+ tokenizer = transformers.AutoTokenizer.from_pretrained(
76
+ tokenizer_path,
77
+ torch_dtype=torch.float16,
78
+ )
79
+
80
+ inputs = tokenizer(
81
+ question,
82
+ return_tensors="pt"
83
+ ).to(device)
84
+ with torch.backends.cuda.sdp_kernel(
85
+ enable_flash=True,
86
+ enable_math=False,
87
+ enable_mem_efficient=False
88
+ ):
89
+ outputs = base_model.generate(
90
+ **inputs,
91
+ max_new_tokens=1000,
92
+ )
93
+ answer = tokenizer.decode(outputs[0], skip_special_tokens=True)
94
+ log.info(
95
+ "\n"
96
+ "----------"
97
+ f"tokenizer={tokenizer}\n "
98
+ f"question:\n{question}\n"
99
+ f"answer:\n{answer}\n"
100
+ "----------"
101
+ )
102
+ base_model = base_model.to("cpu")  # move the model off the gpu when done
103
+
104
+
105
+ def get_layer_type(
106
+ key: str
107
+ ) -> Tuple[int, str]:
108
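+ # llama keys look like "model.layers.<n>.<suffix>"; non-layer keys map to block -1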
+ matcher = re.compile(r"model\.layers\.(\d+)\.(.+)")
109
+ m = matcher.match(key)
110
+ if m is None:
111
+ if "model.norm.weight" == key:
112
+ return -1, "norm"
113
+ if "model.embed_tokens.weight" == key:
114
+ return -1, "embed"
115
+ if "lm_head.weight" == key:
116
+ return -1, "head"
117
+ log.info(f"Unknown key {key}")
118
+ return -1, "unknown"
119
+ return int(m.group(1)), m.group(2)
120
+
121
+
122
+ def merge_model_with_ties(
123
+ models: List[str],
124
+ model_dst: str,
125
+ trust_remote_code: bool = True
126
+ ):
127
+ models = get_models(
128
+ models=models,
129
+ trust_remote_code=trust_remote_code,
130
+ )
131
+ config = {}
132
+ result_dict: Dict[str, torch.Tensor] = {}
133
+ device = get_device()
134
+ keys = models[0].state_dict().keys()
135
+ num_keys = len(keys)
136
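+ # walk every tensor in the base model's state dict and build a merged copy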
+ for k in keys:
137
+ block, layer_type = get_layer_type(k)
138
+ m0: torch.Tensor = models[0].state_dict()[k]
139
+ result = m0.clone()
140
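+ # split the base tensor into 4 sets; donor model i overwrites set i below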
+ sets = divide_tensor_into_sets(tensor=m0, n_sets=4)
141
+
142
+ # get the src layers to merge
143
+ m = [
144
+ models[1].state_dict()[k],
145
+ models[2].state_dict()[k],
146
+ models[3].state_dict()[k],
147
+ models[4].state_dict()[k],
148
+ ]
149
+
150
+ # build a merge ratio by layer type: diffusion-style to_q/to_k/to_v keys
+ # would stay at the base weights (0.0); llama keys such as
+ # self_attn.q_proj.weight fall through to the 0.5 slerp default
151
+ ratio = {
152
+ 'to_q': 0.0,
153
+ 'to_k': 0.0,
154
+ 'to_v': 0.0,
155
+ }.get(layer_type, .5)
156
+
157
+ norm_ratio = 0.68
158
+ log.info(
159
+ f"model={k} {num_keys} shape={m0.shape} "
160
+ f"dtype={m0.dtype} {m0.device} "
161
+ f"raio={ratio} "
162
+ f"contig={m0.is_contiguous()} "
163
+ f"norm={norm_ratio}")
164
+
165
+ # for all tensors
166
+ for i, tensor in enumerate(m):
167
+ if layer_type == "to_k":
168
+ # Get to_q key
169
+ q_base = models[0].state_dict()[k.replace("to_k", "to_q")]
170
+ q_merge = models[i].state_dict()[k.replace("to_k", "to_q")]
171
+ scale = relative_norm(q_merge, q_base)
172
+ tensor = tensor.to(device) / scale
173
+ del scale
174
+ elif layer_type == "to_q":
175
+ scale = relative_norm(tensor, m0)
176
+ tensor = tensor.to(device) * scale
177
+ del scale
178
+ slice_mask = (
179
+ sets == i
180
+ ).bool()
181
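+ # dare-ties sparsification: randomly drop a drop_rate share of the donor's
+ # delta weights, resolving sign conflicts against the base with "sum"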
+ new_tensor = dare_ties_sparsification(
182
+ model_a_param=m0,
183
+ model_b_param=tensor,
184
+ drop_rate=norm_ratio,
185
+ ties="sum",
186
+ rescale="off",
187
+ device=device,
188
+ **config)
189
+ # slerp between the base tensor and the dare-ties sparsified donor tensor
+ new_tensor = merge_tensors("slerp", m0, new_tensor, ratio)
190
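+ # write this donor's slice into the result; other slices stay as-is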
+ result = torch.where(slice_mask, new_tensor, result)
191
+ del new_tensor, slice_mask
192
+
193
+ result_dict[k] = result
194
+ # end of merge
195
+
196
+ log.info(
197
+ f"done merge saving to file: {model_dst}"
198
+ )
199
+ out_model = (
200
+ transformers.AutoModelForCausalLM.from_pretrained(
201
+ model_dst,
202
+ **config
203
+ )
204
+ )
205
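+ # swap in the merged tensors so save_pretrained() serializes them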
+ out_model.state_dict = lambda: result_dict
206
+ out_model.save_pretrained(model_dst)
207
+
208
+
209
+ def run():
210
+ question = (
211
+ "why is the sky blue?"
212
+ )
213
+ log.info(f"merging models and asking the question: {question}")
214
+ model_src = "TinyLlama/TinyLlama-1.1B-intermediate-step-1431k-3T"
215
+ model_dst = "matlok/tinyllama-cinder-openhermes-32k"
216
+ device = "cuda"
217
+ config = {
218
+ 'torch_dtype': torch.float16,
219
+ 'low_cpu_mem_usage': False,
220
+ 'trust_remote_code': True,
221
+ }
222
+ models = [
223
+ model_src,
224
+ "Doctor-Shotgun/TinyLlama-1.1B-32k-Instruct",
225
+ "Doctor-Shotgun/TinyLlama-1.1B-32k",
226
+ "Tensoic/TinyLlama-1.1B-3T-openhermes",
227
+ "Josephgflowers/TinyLlama-3T-Cinder-v1.3",
228
+ ]
229
+ merge_model_with_ties(
230
+ models=models,
231
+ model_dst=model_dst
232
+ )
233
+ log.info(f"loading newly-created file: {model_dst}")
234
+ model = transformers.AutoModelForCausalLM.from_pretrained(
235
+ model_dst,
236
+ **config
237
+ )
238
+ log.info(
239
+ f"loaded new model file: {model_dst} "
240
+ f"asking question: {question} "
241
+ )
242
+ run_text_test(
243
+ model=model,
244
+ tokenizer_path=model_src,
245
+ question=question,
246
+ device=device,
247
+ )
248
+ log.info(f"done loading new model: {model} file: {model_dst}")
249
+
250
+
251
+ if __name__ == "__main__":
252
+ run()