|
--- |
|
license: unknown |
|
--- |
|
|
|
## Merging AI Models like Lego Blocks |
|
|
|
This model was created by merging the following Hugging Face TinyLlama models using TIES: |
|
|
|
- TinyLlama/TinyLlama-1.1B-intermediate-step-1431k-3T |
|
- Doctor-Shotgun/TinyLlama-1.1B-32k-Instruct |
|
- Doctor-Shotgun/TinyLlama-1.1B-32k |
|
- Tensoic/TinyLlama-1.1B-3T-openhermes |
|
- Josephgflowers/TinyLlama-3T-Cinder-v1.3 |
|
|
|
## How do I fine-tune this model? |
|
|
|
### Fine-tuning using Hugging Face SFTTrainer |
|
|
|
- [Fine-tuning using Hugging Face SFTTrainer](https://colab.research.google.com/drive/1PEQyJO1-f6j0S_XJ8DV50NkpzasXkrzd?usp=sharing) |
|
|
|
### Fine-tuning using Unsloth |
|
|
|
As of 2024-02-07, Unsloth could not be used here because of pip install issues; others may have better luck in the future: |
|
|
|
- [Alpaca + TinyLlama + RoPE Scaling full example.ipynb](https://colab.research.google.com/drive/1AZghoNBQaMDgWJpi4RbffGM1h6raLUj9?usp=sharing) |
|
|
|
## How do I generate my own model merges? |
|
|
|
Before this will work, you need to set up a [Hugging Face User Access Token](https://huggingface.co/settings/tokens): |
|
|
|
If you're using the command line you can use: |
|
|
|
```sh |
|
huggingface-cli login |
|
``` |
|
|
|
```sh |
|
time ./run-tiny-merge.py |
|
``` |
|
|
|
### What's this code doing? |
|
|
|
Here's the latest version: |
|
|
|
```python3 |
|
#!/usr/bin/env python3 |
|
|
|
import os |
|
import transformers |
|
import torch |
|
import logging |
|
from ddare.merge import merge_tensors |
|
from ddare.tensor import ( |
|
dare_ties_sparsification, |
|
relative_norm, |
|
divide_tensor_into_sets, |
|
) |
|
from ddare.util import get_device |
|
import re |
|
from typing import Dict, Tuple, List |
|
|
|
|
|
# Configure root logging at INFO; the functions in this script log through `log`. |
logging.basicConfig(level=logging.INFO) |
|
log = logging.getLogger(__name__) |
|
|
|
|
|
def get_models(
    models: List[str],
    trust_remote_code: bool,
):
    """
    Load every requested checkpoint as a causal language model.

    Each entry is loaded with ``AutoModelForCausalLM.from_pretrained``
    using float16 weights and ``low_cpu_mem_usage`` disabled.

    :param models: model names to download
    :param trust_remote_code: forwarded to ``from_pretrained``; only
        enable it for repositories you trust
    :return: the loaded models, in the same order as ``models``
    """
    load_kwargs = dict(
        torch_dtype=torch.float16,
        low_cpu_mem_usage=False,
        trust_remote_code=trust_remote_code,
    )
    num_models = len(models)
    loaded_models = []
    for midx, model_path in enumerate(models):
        log.info(
            f"loading model={midx + 1}/{num_models} "
            f"model={model_path} "
        )
        loaded = transformers.AutoModelForCausalLM.from_pretrained(
            model_path, **load_kwargs
        )
        loaded_models.append(loaded)
    return loaded_models
|
|
|
|
|
def pm(
    model,
):
    """
    pretty print model

    Logs the number of ``state_dict`` entries, then one line per entry
    with its index, key, shape, dtype, device and contiguity.

    :param model: show me the model (must expose ``state_dict()``)
    """
    # state_dict() builds a new mapping on every call, so fetch it once
    # instead of once per key inside the loop.
    state = model.state_dict()
    log.info(f"model keys={len(state)}")
    for i, (k, tensor) in enumerate(state.items()):
        log.info(
            f"{i:3d} {k} shape={tensor.shape} "
            f"type={tensor.dtype} dev={tensor.device} "
            f"contig={tensor.is_contiguous()}"
        )
|
|
|
|
|
def run_text_test(
    model,
    tokenizer_path: str,
    question: str,
    device: str = "cuda",
):
    """
    run a question on the model, log the answer and return the
    tokenizer that was used

    :param model: initialized model
    :param tokenizer_path: tokenizer path/name
    :param question: what are you asking?
    :param device: torch device to run on (default "cuda")
    :return: the tokenizer loaded from ``tokenizer_path``
    """
    base_model = model.to(device)
    log.info(f"loading tokenizer={tokenizer_path}")
    # No torch_dtype here: that option belongs to model loading, not to
    # the tokenizer.
    tokenizer = transformers.AutoTokenizer.from_pretrained(
        tokenizer_path,
    )

    inputs = tokenizer(question, return_tensors="pt").to(device)
    # NOTE(review): this context manager lives under torch.backends.cuda
    # and disables the math kernel; behavior with a non-CUDA device has
    # not been verified here.
    with torch.backends.cuda.sdp_kernel(
        enable_flash=True,
        enable_math=False,
        enable_mem_efficient=True,
    ):
        outputs = base_model.generate(
            **inputs,
            max_new_tokens=256,
        )
    answer = tokenizer.decode(outputs[0], skip_special_tokens=True)
    log.info(
        "\n"
        "----------"
        "\n"
        f"tokenizer={tokenizer}\n "
        f"question:\n{question}\n"
        f"answer:\n{answer}\n"
        "----------"
    )
    # The model is already on `device`, so no second .to(device) is needed.
    return tokenizer
|
|
|
|
|
# Matches per-block parameter names such as
# "model.layers.12.self_attn.q_proj.weight". The dots are escaped so that
# only a literal "." separates the components; compiled once at import.
_LAYER_KEY_PATTERN = re.compile(r"model\.layers\.(\d+)\.(.+)")

# Parameters that live outside the numbered blocks.
_TOP_LEVEL_LAYER_NAMES = {
    "model.norm.weight": "norm",
    "model.embed_tokens.weight": "embed",
    "lm_head.weight": "head",
}


def get_layer_type(key: str) -> Tuple[int, str]:
    """
    get the layer type

    :param key: name of the layer (a state_dict key)
    :return: layer id and name; the id is -1 for keys outside
        ``model.layers.<n>.`` and the name is "unknown" when the key
        is not recognized at all
    """
    m = _LAYER_KEY_PATTERN.match(key)
    if m is not None:
        return int(m.group(1)), m.group(2)

    name = _TOP_LEVEL_LAYER_NAMES.get(key)
    if name is not None:
        return -1, name

    log.info(f"Unknown key {key}")
    return -1, "unknown"
|
|
|
|
|
def merge_model_with_ties(
    models: List[str],
    model_dst: str,
    trust_remote_code: bool = True,
):
    """
    merge the list of models into one model
    called model_dst

    The first entry of ``models`` is the base; every following entry is
    a donor. For each parameter of the base, the elements are assigned
    to one set per donor (via ``divide_tensor_into_sets``) and the
    elements of set ``i`` are replaced by the merge of the base with
    donor ``i``. The merged weights are saved with ``save_pretrained``.

    :param models: list of models to merge (base first, at least two)
    :param model_dst: name of the new model
    :param trust_remote_code: are you sure? True/False
    :raises ValueError: if fewer than two models are given
    """
    if len(models) < 2:
        raise ValueError(
            "need a base model and at least one model to merge, "
            f"got {len(models)}"
        )

    loaded_models = get_models(
        models=models,
        trust_remote_code=trust_remote_code,
    )
    config = {}
    result_dict: Dict[str, torch.Tensor] = {}
    device = get_device()

    # state_dict() builds a new mapping on every call; snapshot each model
    # once instead of rebuilding it for every key.
    state_dicts = [model.state_dict() for model in loaded_models]
    base_state = state_dicts[0]
    donor_states = state_dicts[1:]
    num_keys = len(base_state)

    for k, m0 in base_state.items():
        _, layer_type = get_layer_type(k)
        result = m0.clone()
        # one set per donor model (was hard-coded to 4)
        sets = divide_tensor_into_sets(
            tensor=m0, n_sets=len(donor_states)
        )

        # build a ratio
        # NOTE(review): get_layer_type returns names such as
        # "self_attn.q_proj.weight", so the "to_q"/"to_k"/"to_v" entries
        # (and the matching branches below) do not appear to trigger for
        # these models; the logs show ratio=0.5 for every key.
        ratio = {
            "to_q": 0.0,
            "to_k": 0.0,
            "to_v": 0.0,
        }.get(layer_type, 0.5)

        norm_ratio = 0.68
        log.info(
            f"model={k} {num_keys} shape={m0.shape} "
            f"dtype={m0.dtype} {m0.device} "
            f"ratio={ratio} "
            f"contig={m0.is_contiguous()} "
            f"norm={norm_ratio}"
        )

        # for all tensors
        for i, donor_state in enumerate(donor_states):
            tensor = donor_state[k]
            if layer_type == "to_k":
                # Scale by the relative norm of the matching query weights.
                # The donor's own state dict is used here; the original
                # indexed models[i], which is off by one relative to the
                # donor list (models[1:]).
                q_key = k.replace("to_k", "to_q")
                scale = relative_norm(
                    donor_state[q_key], base_state[q_key]
                )
                tensor = tensor.to(device) / scale
            elif layer_type == "to_q":
                scale = relative_norm(tensor, m0)
                tensor = tensor.to(device) * scale

            slice_mask = (sets == i).bool()
            # NOTE(review): the DARE-TIES result below is immediately
            # overwritten by the slerp merge, so only the slerp result
            # reaches `result`. Kept as-is to preserve the produced
            # weights; confirm which of the two was intended.
            new_tensor = dare_ties_sparsification(
                model_a_param=m0,
                model_b_param=tensor,
                drop_rate=norm_ratio,
                ties="sum",
                rescale="off",
                device=device,
                **config,
            )
            new_tensor = merge_tensors("slerp", m0, tensor, ratio)
            result = torch.where(slice_mask, new_tensor, result)
            del new_tensor, slice_mask

        result_dict[k] = result
    # end of merge

    log.info(f"done merge saving to file: {model_dst}")
    # NOTE(review): model_dst is loaded before it is saved, so it must
    # already resolve (local directory or hub repo) when this runs.
    out_model = transformers.AutoModelForCausalLM.from_pretrained(
        model_dst, **config
    )
    # Make save_pretrained serialize the merged tensors.
    out_model.state_dict = lambda: result_dict
    out_model.save_pretrained(model_dst)
|
|
|
|
|
def run():
    """
    run the merge and upload the model and tokenizer

    Steps: merge the hard-coded model list into ``model_dst``, reload
    the result, ask it a test question, remove the local output
    directory, then push the model and the source tokenizer to the hub.

    This requires having the Hugging Face token
    set before it will work:
    ```huggingface-cli login```
    """
    # Local import: only the cleanup step below needs it.
    import shutil

    question = "why is the sky blue?"
    log.info(
        f"merging models and asking the question: {question}"
    )
    model_src = "TinyLlama/TinyLlama-1.1B-intermediate-step-1431k-3T"
    model_dst = "matlok/tinyllama-cinder-openhermes-32k"
    device = "cuda"
    config = {
        "torch_dtype": torch.float16,
        "low_cpu_mem_usage": False,
        "trust_remote_code": True,
    }
    models = [
        model_src,
        "Doctor-Shotgun/TinyLlama-1.1B-32k-Instruct",
        "Doctor-Shotgun/TinyLlama-1.1B-32k",
        "Tensoic/TinyLlama-1.1B-3T-openhermes",
        "Josephgflowers/TinyLlama-3T-Cinder-v1.3",
    ]
    merge_model_with_ties(
        models=models, model_dst=model_dst
    )
    log.info(f"loading newly-created file: {model_dst}")
    model = transformers.AutoModelForCausalLM.from_pretrained(
        model_dst, **config
    )
    log.info(
        f"loaded new model file: {model_dst} "
        f"asking question: {question} "
    )
    run_text_test(
        model=model,
        tokenizer_path=model_src,
        question=question,
        device=device,
    )

    # clean the temp merge dir
    # remove model dir to prevent issues with the tokenizer upload
    # (best-effort, like the previous `rm -rf`, but without a shell)
    model_org = model_dst.split("/")[0]
    shutil.rmtree(model_org, ignore_errors=True)

    log.info(f"uploading model: {model_dst}")
    model.push_to_hub(model_dst)

    log.info(f"uploading src tokenizer: {model_src}")
    # reload tokenizer to save it and found on:
    # https://colab.research.google.com/drive/1PEQyJO1-f6j0S_XJ8DV50NkpzasXkrzd?usp=sharing#scrollTo=QQn30cRtAZ-P
    tokenizer = transformers.AutoTokenizer.from_pretrained(
        model_src, trust_remote_code=True
    )
    # https://huggingface.co/docs/transformers/model_sharing#use-the-pushtohub-function
    # tokenizer.push_to_hub("my-awesome-model")
    tokenizer.push_to_hub(model_dst)
    log.info(
        f"done loading new model: {model} "
        f"file: {model_dst}"
    )
|
|
|
|
|
# Run the merge only when this file is executed as a script, not when imported. |
if __name__ == "__main__": |
|
run() |
|
``` |
|
|
|
### Logs |
|
|
|
Here are the logs from the code above: |
|
|
|
``` |
|
time ./run-tiny-merge.py |
|
Total VRAM 12282 MB, total RAM 85434 MB |
|
Set vram state to: NORMAL_VRAM |
|
Device: cuda:0 NVIDIA GeForce RTX 4070 Ti : native |
|
VAE dtype: torch.bfloat16 |
|
INFO:__main__:merging models and asking the question: why is the sky blue? |
|
INFO:__main__:loading model=1/5 model=TinyLlama/TinyLlama-1.1B-intermediate-step-1431k-3T |
|
config.json: 100%|█████████████████████████████████████| 560/560 [00:00<00:00, 5.23MB/s] |
|
model.safetensors: 100%|███████████████████████████| 4.40G/4.40G [00:48<00:00, 90.2MB/s] |
|
generation_config.json: 100%|███████████████████████████| 129/129 [00:00<00:00, 721kB/s] |
|
INFO:__main__:loading model=2/5 model=Doctor-Shotgun/TinyLlama-1.1B-32k-Instruct |
|
config.json: 100%|█████████████████████████████████████| 695/695 [00:00<00:00, 3.04MB/s] |
|
pytorch_model.bin: 100%|███████████████████████████| 2.20G/2.20G [00:23<00:00, 92.6MB/s] |
|
generation_config.json: 100%|███████████████████████████| 129/129 [00:00<00:00, 566kB/s] |
|
INFO:__main__:loading model=3/5 model=Doctor-Shotgun/TinyLlama-1.1B-32k |
|
config.json: 100%|█████████████████████████████████████| 686/686 [00:00<00:00, 3.57MB/s] |
|
model.safetensors: 100%|███████████████████████████| 2.20G/2.20G [00:24<00:00, 90.5MB/s] |
|
generation_config.json: 100%|██████████████████████████| 124/124 [00:00<00:00, 1.80MB/s] |
|
INFO:__main__:loading model=4/5 model=Tensoic/TinyLlama-1.1B-3T-openhermes |
|
config.json: 100%|█████████████████████████████████████| 702/702 [00:00<00:00, 2.97MB/s] |
|
pytorch_model.bin: 100%|███████████████████████████| 2.20G/2.20G [00:23<00:00, 92.7MB/s] |
|
generation_config.json: 100%|███████████████████████████| 124/124 [00:00<00:00, 671kB/s] |
|
INFO:__main__:loading model=5/5 model=Josephgflowers/TinyLlama-3T-Cinder-v1.3 |
|
config.json: 100%|█████████████████████████████████████| 713/713 [00:00<00:00, 9.35MB/s] |
|
model.safetensors: 100%|███████████████████████████| 2.20G/2.20G [00:24<00:00, 91.5MB/s] |
|
generation_config.json: 100%|██████████████████████████| 138/138 [00:00<00:00, 1.86MB/s] |
|
INFO:__main__:model=model.embed_tokens.weight 201 shape=torch.Size([32000, 2048]) dtype=torch.float16 cpu ratio=0.5 contig=True norm=0.68 |
|
INFO:__main__:model=model.layers.0.self_attn.q_proj.weight 201 shape=torch.Size([2048, 2048]) dtype=torch.float16 cpu ratio=0.5 contig=True norm=0.68 |
|
INFO:__main__:model=model.layers.0.self_attn.k_proj.weight 201 shape=torch.Size([256, 2048]) dtype=torch.float16 cpu ratio=0.5 contig=True norm=0.68 |
|
INFO:__main__:model=model.layers.0.self_attn.v_proj.weight 201 shape=torch.Size([256, 2048]) dtype=torch.float16 cpu ratio=0.5 contig=True norm=0.68 |
|
INFO:__main__:model=model.layers.0.self_attn.o_proj.weight 201 shape=torch.Size([2048, 2048]) dtype=torch.float16 cpu ratio=0.5 contig=True norm=0.68 |
|
INFO:__main__:model=model.layers.0.mlp.gate_proj.weight 201 shape=torch.Size([5632, 2048]) dtype=torch.float16 cpu ratio=0.5 contig=True norm=0.68 |
|
INFO:__main__:model=model.layers.0.mlp.up_proj.weight 201 shape=torch.Size([5632, 2048]) dtype=torch.float16 cpu ratio=0.5 contig=True norm=0.68 |
|
INFO:__main__:model=model.layers.0.mlp.down_proj.weight 201 shape=torch.Size([2048, 5632]) dtype=torch.float16 cpu ratio=0.5 contig=True norm=0.68 |
|
INFO:__main__:model=model.layers.0.input_layernorm.weight 201 shape=torch.Size([2048]) dtype=torch.float16 cpu ratio=0.5 contig=True norm=0.68 |
|
INFO:__main__:model=model.layers.0.post_attention_layernorm.weight 201 shape=torch.Size([2048]) dtype=torch.float16 cpu ratio=0.5 contig=True norm=0.68 |
|
INFO:__main__:model=model.layers.1.self_attn.q_proj.weight 201 shape=torch.Size([2048, 2048]) dtype=torch.float16 cpu ratio=0.5 contig=True norm=0.68 |
|
INFO:__main__:model=model.layers.1.self_attn.k_proj.weight 201 shape=torch.Size([256, 2048]) dtype=torch.float16 cpu ratio=0.5 contig=True norm=0.68 |
|
INFO:__main__:model=model.layers.1.self_attn.v_proj.weight 201 shape=torch.Size([256, 2048]) dtype=torch.float16 cpu ratio=0.5 contig=True norm=0.68 |
|
INFO:__main__:model=model.layers.1.self_attn.o_proj.weight 201 shape=torch.Size([2048, 2048]) dtype=torch.float16 cpu ratio=0.5 contig=True norm=0.68 |
|
INFO:__main__:model=model.layers.1.mlp.gate_proj.weight 201 shape=torch.Size([5632, 2048]) dtype=torch.float16 cpu ratio=0.5 contig=True norm=0.68 |
|
INFO:__main__:model=model.layers.1.mlp.up_proj.weight 201 shape=torch.Size([5632, 2048]) dtype=torch.float16 cpu ratio=0.5 contig=True norm=0.68 |
|
INFO:__main__:model=model.layers.1.mlp.down_proj.weight 201 shape=torch.Size([2048, 5632]) dtype=torch.float16 cpu ratio=0.5 contig=True norm=0.68 |
|
INFO:__main__:model=model.layers.1.input_layernorm.weight 201 shape=torch.Size([2048]) dtype=torch.float16 cpu ratio=0.5 contig=True norm=0.68 |
|
INFO:__main__:model=model.layers.1.post_attention_layernorm.weight 201 shape=torch.Size([2048]) dtype=torch.float16 cpu ratio=0.5 contig=True norm=0.68 |
|
INFO:__main__:model=model.layers.2.self_attn.q_proj.weight 201 shape=torch.Size([2048, 2048]) dtype=torch.float16 cpu ratio=0.5 contig=True norm=0.68 |
|
INFO:__main__:model=model.layers.2.self_attn.k_proj.weight 201 shape=torch.Size([256, 2048]) dtype=torch.float16 cpu ratio=0.5 contig=True norm=0.68 |
|
INFO:__main__:model=model.layers.2.self_attn.v_proj.weight 201 shape=torch.Size([256, 2048]) dtype=torch.float16 cpu ratio=0.5 contig=True norm=0.68 |
|
INFO:__main__:model=model.layers.2.self_attn.o_proj.weight 201 shape=torch.Size([2048, 2048]) dtype=torch.float16 cpu ratio=0.5 contig=True norm=0.68 |
|
INFO:__main__:model=model.layers.2.mlp.gate_proj.weight 201 shape=torch.Size([5632, 2048]) dtype=torch.float16 cpu ratio=0.5 contig=True norm=0.68 |
|
INFO:__main__:model=model.layers.2.mlp.up_proj.weight 201 shape=torch.Size([5632, 2048]) dtype=torch.float16 cpu ratio=0.5 contig=True norm=0.68 |
|
INFO:__main__:model=model.layers.2.mlp.down_proj.weight 201 shape=torch.Size([2048, 5632]) dtype=torch.float16 cpu ratio=0.5 contig=True norm=0.68 |
|
INFO:__main__:model=model.layers.2.input_layernorm.weight 201 shape=torch.Size([2048]) dtype=torch.float16 cpu ratio=0.5 contig=True norm=0.68 |
|
INFO:__main__:model=model.layers.2.post_attention_layernorm.weight 201 shape=torch.Size([2048]) dtype=torch.float16 cpu ratio=0.5 contig=True norm=0.68 |
|
INFO:__main__:model=model.layers.3.self_attn.q_proj.weight 201 shape=torch.Size([2048, 2048]) dtype=torch.float16 cpu ratio=0.5 contig=True norm=0.68 |
|
INFO:__main__:model=model.layers.3.self_attn.k_proj.weight 201 shape=torch.Size([256, 2048]) dtype=torch.float16 cpu ratio=0.5 contig=True norm=0.68 |
|
INFO:__main__:model=model.layers.3.self_attn.v_proj.weight 201 shape=torch.Size([256, 2048]) dtype=torch.float16 cpu ratio=0.5 contig=True norm=0.68 |
|
INFO:__main__:model=model.layers.3.self_attn.o_proj.weight 201 shape=torch.Size([2048, 2048]) dtype=torch.float16 cpu ratio=0.5 contig=True norm=0.68 |
|
INFO:__main__:model=model.layers.3.mlp.gate_proj.weight 201 shape=torch.Size([5632, 2048]) dtype=torch.float16 cpu ratio=0.5 contig=True norm=0.68 |
|
INFO:__main__:model=model.layers.3.mlp.up_proj.weight 201 shape=torch.Size([5632, 2048]) dtype=torch.float16 cpu ratio=0.5 contig=True norm=0.68 |
|
INFO:__main__:model=model.layers.3.mlp.down_proj.weight 201 shape=torch.Size([2048, 5632]) dtype=torch.float16 cpu ratio=0.5 contig=True norm=0.68 |
|
INFO:__main__:model=model.layers.3.input_layernorm.weight 201 shape=torch.Size([2048]) dtype=torch.float16 cpu ratio=0.5 contig=True norm=0.68 |
|
INFO:__main__:model=model.layers.3.post_attention_layernorm.weight 201 shape=torch.Size([2048]) dtype=torch.float16 cpu ratio=0.5 contig=True norm=0.68 |
|
INFO:__main__:model=model.layers.4.self_attn.q_proj.weight 201 shape=torch.Size([2048, 2048]) dtype=torch.float16 cpu ratio=0.5 contig=True norm=0.68 |
|
INFO:__main__:model=model.layers.4.self_attn.k_proj.weight 201 shape=torch.Size([256, 2048]) dtype=torch.float16 cpu ratio=0.5 contig=True norm=0.68 |
|
INFO:__main__:model=model.layers.4.self_attn.v_proj.weight 201 shape=torch.Size([256, 2048]) dtype=torch.float16 cpu ratio=0.5 contig=True norm=0.68 |
|
INFO:__main__:model=model.layers.4.self_attn.o_proj.weight 201 shape=torch.Size([2048, 2048]) dtype=torch.float16 cpu ratio=0.5 contig=True norm=0.68 |
|
INFO:__main__:model=model.layers.4.mlp.gate_proj.weight 201 shape=torch.Size([5632, 2048]) dtype=torch.float16 cpu ratio=0.5 contig=True norm=0.68 |
|
INFO:__main__:model=model.layers.4.mlp.up_proj.weight 201 shape=torch.Size([5632, 2048]) dtype=torch.float16 cpu ratio=0.5 contig=True norm=0.68 |
|
INFO:__main__:model=model.layers.4.mlp.down_proj.weight 201 shape=torch.Size([2048, 5632]) dtype=torch.float16 cpu ratio=0.5 contig=True norm=0.68 |
|
INFO:__main__:model=model.layers.4.input_layernorm.weight 201 shape=torch.Size([2048]) dtype=torch.float16 cpu ratio=0.5 contig=True norm=0.68 |
|
INFO:__main__:model=model.layers.4.post_attention_layernorm.weight 201 shape=torch.Size([2048]) dtype=torch.float16 cpu ratio=0.5 contig=True norm=0.68 |
|
INFO:__main__:model=model.layers.5.self_attn.q_proj.weight 201 shape=torch.Size([2048, 2048]) dtype=torch.float16 cpu ratio=0.5 contig=True norm=0.68 |
|
INFO:__main__:model=model.layers.5.self_attn.k_proj.weight 201 shape=torch.Size([256, 2048]) dtype=torch.float16 cpu ratio=0.5 contig=True norm=0.68 |
|
INFO:__main__:model=model.layers.5.self_attn.v_proj.weight 201 shape=torch.Size([256, 2048]) dtype=torch.float16 cpu ratio=0.5 contig=True norm=0.68 |
|
INFO:__main__:model=model.layers.5.self_attn.o_proj.weight 201 shape=torch.Size([2048, 2048]) dtype=torch.float16 cpu ratio=0.5 contig=True norm=0.68 |
|
INFO:__main__:model=model.layers.5.mlp.gate_proj.weight 201 shape=torch.Size([5632, 2048]) dtype=torch.float16 cpu ratio=0.5 contig=True norm=0.68 |
|
INFO:__main__:model=model.layers.5.mlp.up_proj.weight 201 shape=torch.Size([5632, 2048]) dtype=torch.float16 cpu ratio=0.5 contig=True norm=0.68 |
|
INFO:__main__:model=model.layers.5.mlp.down_proj.weight 201 shape=torch.Size([2048, 5632]) dtype=torch.float16 cpu ratio=0.5 contig=True norm=0.68 |
|
INFO:__main__:model=model.layers.5.input_layernorm.weight 201 shape=torch.Size([2048]) dtype=torch.float16 cpu ratio=0.5 contig=True norm=0.68 |
|
INFO:__main__:model=model.layers.5.post_attention_layernorm.weight 201 shape=torch.Size([2048]) dtype=torch.float16 cpu ratio=0.5 contig=True norm=0.68 |
|
INFO:__main__:model=model.layers.6.self_attn.q_proj.weight 201 shape=torch.Size([2048, 2048]) dtype=torch.float16 cpu ratio=0.5 contig=True norm=0.68 |
|
INFO:__main__:model=model.layers.6.self_attn.k_proj.weight 201 shape=torch.Size([256, 2048]) dtype=torch.float16 cpu ratio=0.5 contig=True norm=0.68 |
|
INFO:__main__:model=model.layers.6.self_attn.v_proj.weight 201 shape=torch.Size([256, 2048]) dtype=torch.float16 cpu ratio=0.5 contig=True norm=0.68 |
|
INFO:__main__:model=model.layers.6.self_attn.o_proj.weight 201 shape=torch.Size([2048, 2048]) dtype=torch.float16 cpu ratio=0.5 contig=True norm=0.68 |
|
INFO:__main__:model=model.layers.6.mlp.gate_proj.weight 201 shape=torch.Size([5632, 2048]) dtype=torch.float16 cpu ratio=0.5 contig=True norm=0.68 |
|
INFO:__main__:model=model.layers.6.mlp.up_proj.weight 201 shape=torch.Size([5632, 2048]) dtype=torch.float16 cpu ratio=0.5 contig=True norm=0.68 |
|
INFO:__main__:model=model.layers.6.mlp.down_proj.weight 201 shape=torch.Size([2048, 5632]) dtype=torch.float16 cpu ratio=0.5 contig=True norm=0.68 |
|
INFO:__main__:model=model.layers.6.input_layernorm.weight 201 shape=torch.Size([2048]) dtype=torch.float16 cpu ratio=0.5 contig=True norm=0.68 |
|
INFO:__main__:model=model.layers.6.post_attention_layernorm.weight 201 shape=torch.Size([2048]) dtype=torch.float16 cpu ratio=0.5 contig=True norm=0.68 |
|
INFO:__main__:model=model.layers.7.self_attn.q_proj.weight 201 shape=torch.Size([2048, 2048]) dtype=torch.float16 cpu ratio=0.5 contig=True norm=0.68 |
|
INFO:__main__:model=model.layers.7.self_attn.k_proj.weight 201 shape=torch.Size([256, 2048]) dtype=torch.float16 cpu ratio=0.5 contig=True norm=0.68 |
|
INFO:__main__:model=model.layers.7.self_attn.v_proj.weight 201 shape=torch.Size([256, 2048]) dtype=torch.float16 cpu ratio=0.5 contig=True norm=0.68 |
|
INFO:__main__:model=model.layers.7.self_attn.o_proj.weight 201 shape=torch.Size([2048, 2048]) dtype=torch.float16 cpu ratio=0.5 contig=True norm=0.68 |
|
INFO:__main__:model=model.layers.7.mlp.gate_proj.weight 201 shape=torch.Size([5632, 2048]) dtype=torch.float16 cpu ratio=0.5 contig=True norm=0.68 |
|
INFO:__main__:model=model.layers.7.mlp.up_proj.weight 201 shape=torch.Size([5632, 2048]) dtype=torch.float16 cpu ratio=0.5 contig=True norm=0.68 |
|
INFO:__main__:model=model.layers.7.mlp.down_proj.weight 201 shape=torch.Size([2048, 5632]) dtype=torch.float16 cpu ratio=0.5 contig=True norm=0.68 |
|
INFO:__main__:model=model.layers.7.input_layernorm.weight 201 shape=torch.Size([2048]) dtype=torch.float16 cpu ratio=0.5 contig=True norm=0.68 |
|
INFO:__main__:model=model.layers.7.post_attention_layernorm.weight 201 shape=torch.Size([2048]) dtype=torch.float16 cpu ratio=0.5 contig=True norm=0.68 |
|
INFO:__main__:model=model.layers.8.self_attn.q_proj.weight 201 shape=torch.Size([2048, 2048]) dtype=torch.float16 cpu ratio=0.5 contig=True norm=0.68 |
|
INFO:__main__:model=model.layers.8.self_attn.k_proj.weight 201 shape=torch.Size([256, 2048]) dtype=torch.float16 cpu ratio=0.5 contig=True norm=0.68 |
|
INFO:__main__:model=model.layers.8.self_attn.v_proj.weight 201 shape=torch.Size([256, 2048]) dtype=torch.float16 cpu ratio=0.5 contig=True norm=0.68 |
|
INFO:__main__:model=model.layers.8.self_attn.o_proj.weight 201 shape=torch.Size([2048, 2048]) dtype=torch.float16 cpu ratio=0.5 contig=True norm=0.68 |
|
INFO:__main__:model=model.layers.8.mlp.gate_proj.weight 201 shape=torch.Size([5632, 2048]) dtype=torch.float16 cpu ratio=0.5 contig=True norm=0.68 |
|
INFO:__main__:model=model.layers.8.mlp.up_proj.weight 201 shape=torch.Size([5632, 2048]) dtype=torch.float16 cpu ratio=0.5 contig=True norm=0.68 |
|
INFO:__main__:model=model.layers.8.mlp.down_proj.weight 201 shape=torch.Size([2048, 5632]) dtype=torch.float16 cpu ratio=0.5 contig=True norm=0.68 |
|
INFO:__main__:model=model.layers.8.input_layernorm.weight 201 shape=torch.Size([2048]) dtype=torch.float16 cpu ratio=0.5 contig=True norm=0.68 |
|
INFO:__main__:model=model.layers.8.post_attention_layernorm.weight 201 shape=torch.Size([2048]) dtype=torch.float16 cpu ratio=0.5 contig=True norm=0.68 |
|
INFO:__main__:model=model.layers.9.self_attn.q_proj.weight 201 shape=torch.Size([2048, 2048]) dtype=torch.float16 cpu ratio=0.5 contig=True norm=0.68 |
|
INFO:__main__:model=model.layers.9.self_attn.k_proj.weight 201 shape=torch.Size([256, 2048]) dtype=torch.float16 cpu ratio=0.5 contig=True norm=0.68 |
|
INFO:__main__:model=model.layers.9.self_attn.v_proj.weight 201 shape=torch.Size([256, 2048]) dtype=torch.float16 cpu ratio=0.5 contig=True norm=0.68 |
|
INFO:__main__:model=model.layers.9.self_attn.o_proj.weight 201 shape=torch.Size([2048, 2048]) dtype=torch.float16 cpu ratio=0.5 contig=True norm=0.68 |
|
INFO:__main__:model=model.layers.9.mlp.gate_proj.weight 201 shape=torch.Size([5632, 2048]) dtype=torch.float16 cpu ratio=0.5 contig=True norm=0.68 |
|
INFO:__main__:model=model.layers.9.mlp.up_proj.weight 201 shape=torch.Size([5632, 2048]) dtype=torch.float16 cpu ratio=0.5 contig=True norm=0.68 |
|
INFO:__main__:model=model.layers.9.mlp.down_proj.weight 201 shape=torch.Size([2048, 5632]) dtype=torch.float16 cpu ratio=0.5 contig=True norm=0.68 |
|
INFO:__main__:model=model.layers.9.input_layernorm.weight 201 shape=torch.Size([2048]) dtype=torch.float16 cpu ratio=0.5 contig=True norm=0.68 |
|
INFO:__main__:model=model.layers.9.post_attention_layernorm.weight 201 shape=torch.Size([2048]) dtype=torch.float16 cpu ratio=0.5 contig=True norm=0.68 |
|
INFO:__main__:model=model.layers.10.self_attn.q_proj.weight 201 shape=torch.Size([2048, 2048]) dtype=torch.float16 cpu ratio=0.5 contig=True norm=0.68 |
|
INFO:__main__:model=model.layers.10.self_attn.k_proj.weight 201 shape=torch.Size([256, 2048]) dtype=torch.float16 cpu ratio=0.5 contig=True norm=0.68 |
|
INFO:__main__:model=model.layers.10.self_attn.v_proj.weight 201 shape=torch.Size([256, 2048]) dtype=torch.float16 cpu ratio=0.5 contig=True norm=0.68 |
|
INFO:__main__:model=model.layers.10.self_attn.o_proj.weight 201 shape=torch.Size([2048, 2048]) dtype=torch.float16 cpu ratio=0.5 contig=True norm=0.68 |
|
INFO:__main__:model=model.layers.10.mlp.gate_proj.weight 201 shape=torch.Size([5632, 2048]) dtype=torch.float16 cpu ratio=0.5 contig=True norm=0.68 |
|
INFO:__main__:model=model.layers.10.mlp.up_proj.weight 201 shape=torch.Size([5632, 2048]) dtype=torch.float16 cpu ratio=0.5 contig=True norm=0.68 |
|
INFO:__main__:model=model.layers.10.mlp.down_proj.weight 201 shape=torch.Size([2048, 5632]) dtype=torch.float16 cpu ratio=0.5 contig=True norm=0.68 |
|
INFO:__main__:model=model.layers.10.input_layernorm.weight 201 shape=torch.Size([2048]) dtype=torch.float16 cpu ratio=0.5 contig=True norm=0.68 |
|
INFO:__main__:model=model.layers.10.post_attention_layernorm.weight 201 shape=torch.Size([2048]) dtype=torch.float16 cpu ratio=0.5 contig=True norm=0.68 |
|
INFO:__main__:model=model.layers.11.self_attn.q_proj.weight 201 shape=torch.Size([2048, 2048]) dtype=torch.float16 cpu ratio=0.5 contig=True norm=0.68 |
|
INFO:__main__:model=model.layers.11.self_attn.k_proj.weight 201 shape=torch.Size([256, 2048]) dtype=torch.float16 cpu ratio=0.5 contig=True norm=0.68 |
|
INFO:__main__:model=model.layers.11.self_attn.v_proj.weight 201 shape=torch.Size([256, 2048]) dtype=torch.float16 cpu ratio=0.5 contig=True norm=0.68 |
|
INFO:__main__:model=model.layers.11.self_attn.o_proj.weight 201 shape=torch.Size([2048, 2048]) dtype=torch.float16 cpu ratio=0.5 contig=True norm=0.68 |
|
INFO:__main__:model=model.layers.11.mlp.gate_proj.weight 201 shape=torch.Size([5632, 2048]) dtype=torch.float16 cpu ratio=0.5 contig=True norm=0.68 |
|
INFO:__main__:model=model.layers.11.mlp.up_proj.weight 201 shape=torch.Size([5632, 2048]) dtype=torch.float16 cpu ratio=0.5 contig=True norm=0.68 |
|
INFO:__main__:model=model.layers.11.mlp.down_proj.weight 201 shape=torch.Size([2048, 5632]) dtype=torch.float16 cpu ratio=0.5 contig=True norm=0.68 |
|
INFO:__main__:model=model.layers.11.input_layernorm.weight 201 shape=torch.Size([2048]) dtype=torch.float16 cpu ratio=0.5 contig=True norm=0.68 |
|
INFO:__main__:model=model.layers.11.post_attention_layernorm.weight 201 shape=torch.Size([2048]) dtype=torch.float16 cpu ratio=0.5 contig=True norm=0.68 |
|
INFO:__main__:model=model.layers.12.self_attn.q_proj.weight 201 shape=torch.Size([2048, 2048]) dtype=torch.float16 cpu ratio=0.5 contig=True norm=0.68 |
|
INFO:__main__:model=model.layers.12.self_attn.k_proj.weight 201 shape=torch.Size([256, 2048]) dtype=torch.float16 cpu ratio=0.5 contig=True norm=0.68 |
|
INFO:__main__:model=model.layers.12.self_attn.v_proj.weight 201 shape=torch.Size([256, 2048]) dtype=torch.float16 cpu ratio=0.5 contig=True norm=0.68 |
|
INFO:__main__:model=model.layers.12.self_attn.o_proj.weight 201 shape=torch.Size([2048, 2048]) dtype=torch.float16 cpu ratio=0.5 contig=True norm=0.68 |
|
INFO:__main__:model=model.layers.12.mlp.gate_proj.weight 201 shape=torch.Size([5632, 2048]) dtype=torch.float16 cpu ratio=0.5 contig=True norm=0.68 |
|
INFO:__main__:model=model.layers.12.mlp.up_proj.weight 201 shape=torch.Size([5632, 2048]) dtype=torch.float16 cpu ratio=0.5 contig=True norm=0.68 |
|
INFO:__main__:model=model.layers.12.mlp.down_proj.weight 201 shape=torch.Size([2048, 5632]) dtype=torch.float16 cpu ratio=0.5 contig=True norm=0.68 |
|
INFO:__main__:model=model.layers.12.input_layernorm.weight 201 shape=torch.Size([2048]) dtype=torch.float16 cpu ratio=0.5 contig=True norm=0.68 |
|
INFO:__main__:model=model.layers.12.post_attention_layernorm.weight 201 shape=torch.Size([2048]) dtype=torch.float16 cpu ratio=0.5 contig=True norm=0.68 |
|
INFO:__main__:model=model.layers.13.self_attn.q_proj.weight 201 shape=torch.Size([2048, 2048]) dtype=torch.float16 cpu ratio=0.5 contig=True norm=0.68 |
|
INFO:__main__:model=model.layers.13.self_attn.k_proj.weight 201 shape=torch.Size([256, 2048]) dtype=torch.float16 cpu ratio=0.5 contig=True norm=0.68 |
|
INFO:__main__:model=model.layers.13.self_attn.v_proj.weight 201 shape=torch.Size([256, 2048]) dtype=torch.float16 cpu ratio=0.5 contig=True norm=0.68 |
|
INFO:__main__:model=model.layers.13.self_attn.o_proj.weight 201 shape=torch.Size([2048, 2048]) dtype=torch.float16 cpu ratio=0.5 contig=True norm=0.68 |
|
INFO:__main__:model=model.layers.13.mlp.gate_proj.weight 201 shape=torch.Size([5632, 2048]) dtype=torch.float16 cpu ratio=0.5 contig=True norm=0.68 |
|
INFO:__main__:model=model.layers.13.mlp.up_proj.weight 201 shape=torch.Size([5632, 2048]) dtype=torch.float16 cpu ratio=0.5 contig=True norm=0.68 |
|
INFO:__main__:model=model.layers.13.mlp.down_proj.weight 201 shape=torch.Size([2048, 5632]) dtype=torch.float16 cpu ratio=0.5 contig=True norm=0.68 |
|
INFO:__main__:model=model.layers.13.input_layernorm.weight 201 shape=torch.Size([2048]) dtype=torch.float16 cpu ratio=0.5 contig=True norm=0.68 |
|
INFO:__main__:model=model.layers.13.post_attention_layernorm.weight 201 shape=torch.Size([2048]) dtype=torch.float16 cpu ratio=0.5 contig=True norm=0.68 |
|
INFO:__main__:model=model.layers.14.self_attn.q_proj.weight 201 shape=torch.Size([2048, 2048]) dtype=torch.float16 cpu ratio=0.5 contig=True norm=0.68 |
|
INFO:__main__:model=model.layers.14.self_attn.k_proj.weight 201 shape=torch.Size([256, 2048]) dtype=torch.float16 cpu ratio=0.5 contig=True norm=0.68 |
|
INFO:__main__:model=model.layers.14.self_attn.v_proj.weight 201 shape=torch.Size([256, 2048]) dtype=torch.float16 cpu ratio=0.5 contig=True norm=0.68 |
|
INFO:__main__:model=model.layers.14.self_attn.o_proj.weight 201 shape=torch.Size([2048, 2048]) dtype=torch.float16 cpu ratio=0.5 contig=True norm=0.68 |
|
INFO:__main__:model=model.layers.14.mlp.gate_proj.weight 201 shape=torch.Size([5632, 2048]) dtype=torch.float16 cpu ratio=0.5 contig=True norm=0.68 |
|
INFO:__main__:model=model.layers.14.mlp.up_proj.weight 201 shape=torch.Size([5632, 2048]) dtype=torch.float16 cpu ratio=0.5 contig=True norm=0.68 |
|
INFO:__main__:model=model.layers.14.mlp.down_proj.weight 201 shape=torch.Size([2048, 5632]) dtype=torch.float16 cpu ratio=0.5 contig=True norm=0.68 |
|
INFO:__main__:model=model.layers.14.input_layernorm.weight 201 shape=torch.Size([2048]) dtype=torch.float16 cpu ratio=0.5 contig=True norm=0.68 |
|
INFO:__main__:model=model.layers.14.post_attention_layernorm.weight 201 shape=torch.Size([2048]) dtype=torch.float16 cpu ratio=0.5 contig=True norm=0.68 |
|
INFO:__main__:model=model.layers.15.self_attn.q_proj.weight 201 shape=torch.Size([2048, 2048]) dtype=torch.float16 cpu ratio=0.5 contig=True norm=0.68 |
|
INFO:__main__:model=model.layers.15.self_attn.k_proj.weight 201 shape=torch.Size([256, 2048]) dtype=torch.float16 cpu ratio=0.5 contig=True norm=0.68 |
|
INFO:__main__:model=model.layers.15.self_attn.v_proj.weight 201 shape=torch.Size([256, 2048]) dtype=torch.float16 cpu ratio=0.5 contig=True norm=0.68 |
|
INFO:__main__:model=model.layers.15.self_attn.o_proj.weight 201 shape=torch.Size([2048, 2048]) dtype=torch.float16 cpu ratio=0.5 contig=True norm=0.68 |
|
INFO:__main__:model=model.layers.15.mlp.gate_proj.weight 201 shape=torch.Size([5632, 2048]) dtype=torch.float16 cpu ratio=0.5 contig=True norm=0.68 |
|
INFO:__main__:model=model.layers.15.mlp.up_proj.weight 201 shape=torch.Size([5632, 2048]) dtype=torch.float16 cpu ratio=0.5 contig=True norm=0.68 |
|
INFO:__main__:model=model.layers.15.mlp.down_proj.weight 201 shape=torch.Size([2048, 5632]) dtype=torch.float16 cpu ratio=0.5 contig=True norm=0.68 |
|
INFO:__main__:model=model.layers.15.input_layernorm.weight 201 shape=torch.Size([2048]) dtype=torch.float16 cpu ratio=0.5 contig=True norm=0.68 |
|
INFO:__main__:model=model.layers.15.post_attention_layernorm.weight 201 shape=torch.Size([2048]) dtype=torch.float16 cpu ratio=0.5 contig=True norm=0.68 |
|
INFO:__main__:model=model.layers.16.self_attn.q_proj.weight 201 shape=torch.Size([2048, 2048]) dtype=torch.float16 cpu ratio=0.5 contig=True norm=0.68 |
|
INFO:__main__:model=model.layers.16.self_attn.k_proj.weight 201 shape=torch.Size([256, 2048]) dtype=torch.float16 cpu ratio=0.5 contig=True norm=0.68 |
|
INFO:__main__:model=model.layers.16.self_attn.v_proj.weight 201 shape=torch.Size([256, 2048]) dtype=torch.float16 cpu ratio=0.5 contig=True norm=0.68 |
|
INFO:__main__:model=model.layers.16.self_attn.o_proj.weight 201 shape=torch.Size([2048, 2048]) dtype=torch.float16 cpu ratio=0.5 contig=True norm=0.68 |
|
INFO:__main__:model=model.layers.16.mlp.gate_proj.weight 201 shape=torch.Size([5632, 2048]) dtype=torch.float16 cpu ratio=0.5 contig=True norm=0.68 |
|
INFO:__main__:model=model.layers.16.mlp.up_proj.weight 201 shape=torch.Size([5632, 2048]) dtype=torch.float16 cpu ratio=0.5 contig=True norm=0.68 |
|
INFO:__main__:model=model.layers.16.mlp.down_proj.weight 201 shape=torch.Size([2048, 5632]) dtype=torch.float16 cpu ratio=0.5 contig=True norm=0.68 |
|
INFO:__main__:model=model.layers.16.input_layernorm.weight 201 shape=torch.Size([2048]) dtype=torch.float16 cpu ratio=0.5 contig=True norm=0.68 |
|
INFO:__main__:model=model.layers.16.post_attention_layernorm.weight 201 shape=torch.Size([2048]) dtype=torch.float16 cpu ratio=0.5 contig=True norm=0.68 |
|
INFO:__main__:model=model.layers.17.self_attn.q_proj.weight 201 shape=torch.Size([2048, 2048]) dtype=torch.float16 cpu ratio=0.5 contig=True norm=0.68 |
|
INFO:__main__:model=model.layers.17.self_attn.k_proj.weight 201 shape=torch.Size([256, 2048]) dtype=torch.float16 cpu ratio=0.5 contig=True norm=0.68 |
|
INFO:__main__:model=model.layers.17.self_attn.v_proj.weight 201 shape=torch.Size([256, 2048]) dtype=torch.float16 cpu ratio=0.5 contig=True norm=0.68 |
|
INFO:__main__:model=model.layers.17.self_attn.o_proj.weight 201 shape=torch.Size([2048, 2048]) dtype=torch.float16 cpu ratio=0.5 contig=True norm=0.68 |
|
INFO:__main__:model=model.layers.17.mlp.gate_proj.weight 201 shape=torch.Size([5632, 2048]) dtype=torch.float16 cpu ratio=0.5 contig=True norm=0.68 |
|
INFO:__main__:model=model.layers.17.mlp.up_proj.weight 201 shape=torch.Size([5632, 2048]) dtype=torch.float16 cpu ratio=0.5 contig=True norm=0.68 |
|
INFO:__main__:model=model.layers.17.mlp.down_proj.weight 201 shape=torch.Size([2048, 5632]) dtype=torch.float16 cpu ratio=0.5 contig=True norm=0.68 |
|
INFO:__main__:model=model.layers.17.input_layernorm.weight 201 shape=torch.Size([2048]) dtype=torch.float16 cpu ratio=0.5 contig=True norm=0.68 |
|
INFO:__main__:model=model.layers.17.post_attention_layernorm.weight 201 shape=torch.Size([2048]) dtype=torch.float16 cpu ratio=0.5 contig=True norm=0.68 |
|
INFO:__main__:model=model.layers.18.self_attn.q_proj.weight 201 shape=torch.Size([2048, 2048]) dtype=torch.float16 cpu ratio=0.5 contig=True norm=0.68 |
|
INFO:__main__:model=model.layers.18.self_attn.k_proj.weight 201 shape=torch.Size([256, 2048]) dtype=torch.float16 cpu ratio=0.5 contig=True norm=0.68 |
|
INFO:__main__:model=model.layers.18.self_attn.v_proj.weight 201 shape=torch.Size([256, 2048]) dtype=torch.float16 cpu ratio=0.5 contig=True norm=0.68 |
|
INFO:__main__:model=model.layers.18.self_attn.o_proj.weight 201 shape=torch.Size([2048, 2048]) dtype=torch.float16 cpu ratio=0.5 contig=True norm=0.68 |
|
INFO:__main__:model=model.layers.18.mlp.gate_proj.weight 201 shape=torch.Size([5632, 2048]) dtype=torch.float16 cpu ratio=0.5 contig=True norm=0.68 |
|
INFO:__main__:model=model.layers.18.mlp.up_proj.weight 201 shape=torch.Size([5632, 2048]) dtype=torch.float16 cpu ratio=0.5 contig=True norm=0.68 |
|
INFO:__main__:model=model.layers.18.mlp.down_proj.weight 201 shape=torch.Size([2048, 5632]) dtype=torch.float16 cpu ratio=0.5 contig=True norm=0.68 |
|
INFO:__main__:model=model.layers.18.input_layernorm.weight 201 shape=torch.Size([2048]) dtype=torch.float16 cpu ratio=0.5 contig=True norm=0.68 |
|
INFO:__main__:model=model.layers.18.post_attention_layernorm.weight 201 shape=torch.Size([2048]) dtype=torch.float16 cpu ratio=0.5 contig=True norm=0.68 |
|
INFO:__main__:model=model.layers.19.self_attn.q_proj.weight 201 shape=torch.Size([2048, 2048]) dtype=torch.float16 cpu ratio=0.5 contig=True norm=0.68 |
|
INFO:__main__:model=model.layers.19.self_attn.k_proj.weight 201 shape=torch.Size([256, 2048]) dtype=torch.float16 cpu ratio=0.5 contig=True norm=0.68 |
|
INFO:__main__:model=model.layers.19.self_attn.v_proj.weight 201 shape=torch.Size([256, 2048]) dtype=torch.float16 cpu ratio=0.5 contig=True norm=0.68 |
|
INFO:__main__:model=model.layers.19.self_attn.o_proj.weight 201 shape=torch.Size([2048, 2048]) dtype=torch.float16 cpu ratio=0.5 contig=True norm=0.68 |
|
INFO:__main__:model=model.layers.19.mlp.gate_proj.weight 201 shape=torch.Size([5632, 2048]) dtype=torch.float16 cpu ratio=0.5 contig=True norm=0.68 |
|
INFO:__main__:model=model.layers.19.mlp.up_proj.weight 201 shape=torch.Size([5632, 2048]) dtype=torch.float16 cpu ratio=0.5 contig=True norm=0.68 |
|
INFO:__main__:model=model.layers.19.mlp.down_proj.weight 201 shape=torch.Size([2048, 5632]) dtype=torch.float16 cpu ratio=0.5 contig=True norm=0.68 |
|
INFO:__main__:model=model.layers.19.input_layernorm.weight 201 shape=torch.Size([2048]) dtype=torch.float16 cpu ratio=0.5 contig=True norm=0.68 |
|
INFO:__main__:model=model.layers.19.post_attention_layernorm.weight 201 shape=torch.Size([2048]) dtype=torch.float16 cpu ratio=0.5 contig=True norm=0.68 |
|
INFO:__main__:model=model.layers.20.self_attn.q_proj.weight 201 shape=torch.Size([2048, 2048]) dtype=torch.float16 cpu ratio=0.5 contig=True norm=0.68 |
|
INFO:__main__:model=model.layers.20.self_attn.k_proj.weight 201 shape=torch.Size([256, 2048]) dtype=torch.float16 cpu ratio=0.5 contig=True norm=0.68 |
|
INFO:__main__:model=model.layers.20.self_attn.v_proj.weight 201 shape=torch.Size([256, 2048]) dtype=torch.float16 cpu ratio=0.5 contig=True norm=0.68 |
|
INFO:__main__:model=model.layers.20.self_attn.o_proj.weight 201 shape=torch.Size([2048, 2048]) dtype=torch.float16 cpu ratio=0.5 contig=True norm=0.68 |
|
INFO:__main__:model=model.layers.20.mlp.gate_proj.weight 201 shape=torch.Size([5632, 2048]) dtype=torch.float16 cpu ratio=0.5 contig=True norm=0.68 |
|
INFO:__main__:model=model.layers.20.mlp.up_proj.weight 201 shape=torch.Size([5632, 2048]) dtype=torch.float16 cpu ratio=0.5 contig=True norm=0.68 |
|
INFO:__main__:model=model.layers.20.mlp.down_proj.weight 201 shape=torch.Size([2048, 5632]) dtype=torch.float16 cpu ratio=0.5 contig=True norm=0.68 |
|
INFO:__main__:model=model.layers.20.input_layernorm.weight 201 shape=torch.Size([2048]) dtype=torch.float16 cpu ratio=0.5 contig=True norm=0.68 |
|
INFO:__main__:model=model.layers.20.post_attention_layernorm.weight 201 shape=torch.Size([2048]) dtype=torch.float16 cpu ratio=0.5 contig=True norm=0.68 |
|
INFO:__main__:model=model.layers.21.self_attn.q_proj.weight 201 shape=torch.Size([2048, 2048]) dtype=torch.float16 cpu ratio=0.5 contig=True norm=0.68 |
|
INFO:__main__:model=model.layers.21.self_attn.k_proj.weight 201 shape=torch.Size([256, 2048]) dtype=torch.float16 cpu ratio=0.5 contig=True norm=0.68 |
|
INFO:__main__:model=model.layers.21.self_attn.v_proj.weight 201 shape=torch.Size([256, 2048]) dtype=torch.float16 cpu ratio=0.5 contig=True norm=0.68 |
|
INFO:__main__:model=model.layers.21.self_attn.o_proj.weight 201 shape=torch.Size([2048, 2048]) dtype=torch.float16 cpu ratio=0.5 contig=True norm=0.68 |
|
INFO:__main__:model=model.layers.21.mlp.gate_proj.weight 201 shape=torch.Size([5632, 2048]) dtype=torch.float16 cpu ratio=0.5 contig=True norm=0.68 |
|
INFO:__main__:model=model.layers.21.mlp.up_proj.weight 201 shape=torch.Size([5632, 2048]) dtype=torch.float16 cpu ratio=0.5 contig=True norm=0.68 |
|
INFO:__main__:model=model.layers.21.mlp.down_proj.weight 201 shape=torch.Size([2048, 5632]) dtype=torch.float16 cpu ratio=0.5 contig=True norm=0.68 |
|
INFO:__main__:model=model.layers.21.input_layernorm.weight 201 shape=torch.Size([2048]) dtype=torch.float16 cpu ratio=0.5 contig=True norm=0.68 |
|
INFO:__main__:model=model.layers.21.post_attention_layernorm.weight 201 shape=torch.Size([2048]) dtype=torch.float16 cpu ratio=0.5 contig=True norm=0.68 |
|
INFO:__main__:model=model.norm.weight 201 shape=torch.Size([2048]) dtype=torch.float16 cpu ratio=0.5 contig=True norm=0.68 |
|
INFO:__main__:model=lm_head.weight 201 shape=torch.Size([32000, 2048]) dtype=torch.float16 cpu ratio=0.5 contig=True norm=0.68 |
|
INFO:__main__:done merge saving to file: matlok/tinyllama-cinder-openhermes-32k |
|
config.json: 100%|βββββββββββββββββββββββββββββββββββββ| 724/724 [00:00<00:00, 7.75MB/s] |
|
model.safetensors: 100%|βββββββββββββββββββββββββββ| 2.20G/2.20G [00:23<00:00, 91.8MB/s] |
|
generation_config.json: 100%|ββββββββββββββββββββββββββ| 133/133 [00:00<00:00, 1.58MB/s] |
|
INFO:__main__:loading newly-created file: matlok/tinyllama-cinder-openhermes-32k |
|
INFO:__main__:loaded new model file: matlok/tinyllama-cinder-openhermes-32k asking question: why is the sky blue? |
|
INFO:__main__:loading tokenizer=TinyLlama/TinyLlama-1.1B-intermediate-step-1431k-3T |
|
tokenizer_config.json: 100%|βββββββββββββββββββββββββββ| 776/776 [00:00<00:00, 8.26MB/s] |
|
tokenizer.model: 100%|βββββββββββββββββββββββββββββββ| 500k/500k [00:00<00:00, 64.6MB/s] |
|
tokenizer.json: 100%|ββββββββββββββββββββββββββββββ| 1.84M/1.84M [00:01<00:00, 1.57MB/s] |
|
special_tokens_map.json: 100%|βββββββββββββββββββββββββ| 414/414 [00:00<00:00, 2.47MB/s] |
|
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation. |
|
INFO:__main__: |
|
---------- |
|
tokenizer=LlamaTokenizerFast(name_or_path='TinyLlama/TinyLlama-1.1B-intermediate-step-1431k-3T', vocab_size=32000, model_max_length=1000000000000000019884624838656, is_fast=True, padding_side='right', truncation_side='right', special_tokens={'bos_token': '<s>', 'eos_token': '</s>', 'unk_token': '<unk>'}, clean_up_tokenization_spaces=False), added_tokens_decoder={ |
|
0: AddedToken("<unk>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True), |
|
1: AddedToken("<s>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True), |
|
2: AddedToken("</s>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True), |
|
} |
|
question: |
|
why is the sky blue? |
|
answer: |
|
why is the sky blue? |
|
Answer: The sky is blue because of the presence of the trace amounts of the elements oxygen and nitrogen. These elements are present in the atmosphere in very small amounts. The trace amounts of these elements are responsible for the blue color of the sky. |
|
Why is the sky blue? |
|
Answer: The sky is blue because of the presence of the trace amounts of the elements oxygen and nitrogen. These elements are present in the atmosphere in very small amounts. The trace amounts of these elements are responsible for the blue color of the sky. |
|
Why is the sky blue? |
|
Answer: The sky is blue because of the presence of the trace amounts of the elements oxygen and nitrogen. These elements are present in the atmosphere in very small amounts. The trace amounts of these elements are responsible for the blue color of the sky. |
|
Why is the sky blue? |
|
Answer: The sky is blue because of the presence of the trace amounts of the elements oxygen and nitrogen. These elements are present in the atmosphere in very small amounts. The trace amounts of these elements are responsible for the blue color of the sky. |
|
Why is the sky blue? |
|
Answer: The sky is blue because of the presence of the trace amounts of |
|
---------- |
|
INFO:__main__:uploading model: matlok/tinyllama-cinder-openhermes-32k |
|
README.md: 100%|ββββββββββββββββββββββββββββββββββββ| 45.6k/45.6k [00:00<00:00, 297MB/s] |
|
model.safetensors: 100%|βββββββββββββββββββββββββββ| 2.20G/2.20G [01:18<00:00, 28.0MB/s] |
|
INFO:__main__:uploading src tokenizer: TinyLlama/TinyLlama-1.1B-intermediate-step-1431k-3T |
|
INFO:__main__:done loading new model: LlamaForCausalLM( |
|
(model): LlamaModel( |
|
(embed_tokens): Embedding(32000, 2048) |
|
(layers): ModuleList( |
|
(0-21): 22 x LlamaDecoderLayer( |
|
(self_attn): LlamaSdpaAttention( |
|
(q_proj): Linear(in_features=2048, out_features=2048, bias=False) |
|
(k_proj): Linear(in_features=2048, out_features=256, bias=False) |
|
(v_proj): Linear(in_features=2048, out_features=256, bias=False) |
|
(o_proj): Linear(in_features=2048, out_features=2048, bias=False) |
|
(rotary_emb): LlamaRotaryEmbedding() |
|
) |
|
(mlp): LlamaMLP( |
|
(gate_proj): Linear(in_features=2048, out_features=5632, bias=False) |
|
(up_proj): Linear(in_features=2048, out_features=5632, bias=False) |
|
(down_proj): Linear(in_features=5632, out_features=2048, bias=False) |
|
(act_fn): SiLU() |
|
) |
|
(input_layernorm): LlamaRMSNorm() |
|
(post_attention_layernorm): LlamaRMSNorm() |
|
) |
|
) |
|
(norm): LlamaRMSNorm() |
|
) |
|
(lm_head): Linear(in_features=2048, out_features=32000, bias=False) |
|
) file: matlok/tinyllama-cinder-openhermes-32k |
|
|
|
real 4m44.626s |
|
user 2m54.434s |
|
sys 0m25.981s |
|
``` |
|
|
|
### Acknowledgements |
|
|
|
- The code sample above was modified from [this very helpful GitHub gist](https://gist.github.com/maldevide/08829eada04ad9bd78e46c1a3787d42b) |
|
- [Fine tuning example](https://colab.research.google.com/drive/1PEQyJO1-f6j0S_XJ8DV50NkpzasXkrzd?usp=sharing) |
|
- [CodeLlama example](https://huggingface.co/collections/mlabonne/codellama-6509bc68c2d4c8fc379ee87f) |
|
|