fixing newline char in logs/readme

17917cf 9 months ago

45.3 kB

	---
	license: unknown
	---

	## Merging AI Models like Lego Blocks

	This model was merged with the following HuggingFace TinyLlama models using ties:

	- TinyLlama/TinyLlama-1.1B-intermediate-step-1431k-3T
	- Doctor-Shotgun/TinyLlama-1.1B-32k-Instruct
	- Doctor-Shotgun/TinyLlama-1.1B-32k
	- Tensoic/TinyLlama-1.1B-3T-openhermes
	- Josephgflowers/TinyLlama-3T-Cinder-v1.3

	## How do I fine-tune this model?

	Please refer to the Unsloth fine-tuning guide for:

	- [Alpaca + TinyLlama + RoPE Scaling full example.ipynb](https://colab.research.google.com/drive/1AZghoNBQaMDgWJpi4RbffGM1h6raLUj9?usp=sharing)

	## How do I generate my own model merges?

	Here's [the standalone python script](https://huggingface.co/matlok/tinyllama-cinder-openhermes-32k/blob/main/run-tiny-merge.py) used with logs below:

	```python3
	#!/usr/bin/env python3

	import transformers
	import torch
	import logging
	from ddare.merge import merge_tensors
	from ddare.tensor import dare_ties_sparsification, relative_norm, divide_tensor_into_sets
	from ddare.util import get_device
	import re
	from typing import Dict, Tuple, List

	# If you want to fine-tune, here's an example Unsloth fine tuning guide for:
	# Alpaca + TinyLlama + RoPE Scaling full example.ipynb
	# https://colab.research.google.com/drive/1AZghoNBQaMDgWJpi4RbffGM1h6raLUj9?usp=sharing#scrollTo=LjY75GoYUCB8

	# code here was refactored from gist: https://gist.github.com/maldevide/08829eada04ad9bd78e46c1a3787d42b

	logging.basicConfig(level=logging.INFO)
	log = logging.getLogger(__name__)


	def get_models(
	models: List[str],
	trust_remote_code: bool,
	):
	config = {
	'torch_dtype': torch.float16,
	'low_cpu_mem_usage': False,
	'trust_remote_code': trust_remote_code,
	}
	loaded_models = []
	num_models = len(models)
	for midx, model_path in enumerate(models):
	log.info(
	f"loading model={midx + 1}/{num_models} "
	f"model={model_path} "
	)
	loaded_models.append(
	transformers.AutoModelForCausalLM.from_pretrained(
	model_path,
	**config
	)
	)
	return loaded_models


	def pm(
	model,
	):
	keys = model.state_dict().keys()
	log.info(f"model keys={len(keys)}")
	for i, k in enumerate(keys):
	tensor = model.state_dict()[k]
	log.info(
	f"{i:3d} {k} shape={tensor.shape} "
	f"type={tensor.dtype} dev={tensor.device} "
	f"contig={tensor.is_contiguous()}")


	def run_text_test(
	model,
	tokenizer_path,
	question: str,
	device: str = "cuda",
	):
	base_model = model.to(device)
	log.info(
	f"loading tokenizer={tokenizer_path}"
	)
	tokenizer = transformers.AutoTokenizer.from_pretrained(
	tokenizer_path,
	torch_dtype=torch.float16,
	)

	inputs = tokenizer(
	question,
	return_tensors="pt"
	).to(device)
	with torch.backends.cuda.sdp_kernel(
	enable_flash=True,
	enable_math=False,
	enable_mem_efficient=False
	):
	outputs = base_model.generate(
	**inputs,
	max_new_tokens=1000,
	)
	answer = tokenizer.decode(outputs[0], skip_special_tokens=True)
	log.info(
	"\n"
	"----------"
	f"tokenizer={tokenizer}\n "
	f"question:\n{question}\n"
	f"answer:\n{answer}\n"
	"----------"
	)
	base_model = base_model.to(device)


	def get_layer_type(
	key: str
	) -> Tuple[int, str]:
	matcher = re.compile(r"model.layers.(\d+).(.+)")
	m = matcher.match(key)
	if m is None:
	if "model.norm.weight" == key:
	return -1, "norm"
	if "model.embed_tokens.weight" == key:
	return -1, "embed"
	if "lm_head.weight" == key:
	return -1, "head"
	log.info(f"Unknown key {key}")
	return -1, "unknown"
	return int(m.group(1)), m.group(2)


	def merge_model_with_ties(
	models: List[str],
	model_dst: str,
	trust_remote_code: bool = True
	):
	models = get_models(
	models=models,
	trust_remote_code=trust_remote_code,
	)
	config = {}
	result_dict: Dict[str, torch.Tensor] = {}
	device = get_device()
	keys = models[0].state_dict().keys()
	num_keys = len(keys)
	for k in keys:
	block, layer_type = get_layer_type(k)
	m0: torch.Tensor = models[0].state_dict()[k]
	result = m0.clone()
	sets = divide_tensor_into_sets(tensor=m0, n_sets=4)

	# get the src layers to merge
	m = [
	models[1].state_dict()[k],
	models[2].state_dict()[k],
	models[3].state_dict()[k],
	models[4].state_dict()[k],
	]

	# build a ratio
	ratio = {
	'to_q': 0.0,
	'to_k': 0.0,
	'to_v': 0.0,
	}.get(layer_type, .5)

	norm_ratio = 0.68
	log.info(
	f"model={k} {num_keys} shape={m0.shape} "
	f"dtype={m0.dtype} {m0.device} "
	f"raio={ratio} "
	f"contig={m0.is_contiguous()} "
	f"norm={norm_ratio}")

	# for all tensors
	for i, tensor in enumerate(m):
	if layer_type == "to_k":
	# Get to_q key
	q_base = models[0].state_dict()[k.replace("to_k", "to_q")]
	q_merge = models[i].state_dict()[k.replace("to_k", "to_q")]
	scale = relative_norm(q_merge, q_base)
	tensor = tensor.to(device) / scale
	del scale
	elif layer_type == "to_q":
	scale = relative_norm(tensor, m0)
	tensor = tensor.to(device) * scale
	del scale
	slice_mask = (
	sets == i
	).bool()
	new_tensor = dare_ties_sparsification(
	model_a_param=m0,
	model_b_param=tensor,
	drop_rate=norm_ratio,
	ties="sum",
	rescale="off",
	device=device,
	**config)
	new_tensor = merge_tensors("slerp", m0, tensor, ratio)
	result = torch.where(slice_mask, new_tensor, result)
	del new_tensor, slice_mask

	result_dict[k] = result
	# end of merge

	log.info(
	f"done merge saving to file: {model_dst}"
	)
	out_model = (
	transformers.AutoModelForCausalLM.from_pretrained(
	model_dst,
	**config
	)
	)
	out_model.state_dict = lambda: result_dict
	out_model.save_pretrained(model_dst)


	def run():
	question = (
	"why is the sky blue?"
	)
	log.info(f"merging models and asking the question: {question}")
	model_src = "TinyLlama/TinyLlama-1.1B-intermediate-step-1431k-3T"
	model_dst = "matlok/tinyllama-cinder-openhermes-32k"
	device = "cuda"
	config = {
	'torch_dtype': torch.float16,
	'low_cpu_mem_usage': False,
	'trust_remote_code': True,
	}
	models = [
	model_src,
	"Doctor-Shotgun/TinyLlama-1.1B-32k-Instruct",
	"Doctor-Shotgun/TinyLlama-1.1B-32k",
	"Tensoic/TinyLlama-1.1B-3T-openhermes",
	"Josephgflowers/TinyLlama-3T-Cinder-v1.3",
	]
	merge_model_with_ties(
	models=models,
	model_dst=model_dst
	)
	log.info(f"loading newly-created file: {model_dst}")
	model = transformers.AutoModelForCausalLM.from_pretrained(
	model_dst,
	**config
	)
	log.info(
	f"loaded new model file: {model_dst} "
	f"asking question: {question} "
	)
	run_text_test(
	model=model,
	tokenizer_path=model_src,
	question=question,
	device=device,
	)
	log.info(f"done loading new model: {model} file: {model_dst}")


	if __name__ == "__main__":
	run()
	```

	### Logs

	Here's the logs from the code above:

	```
	Total VRAM 12282 MB, total RAM 85434 MB
	Set vram state to: NORMAL_VRAM
	Device: cuda:0 NVIDIA GeForce RTX 4070 Ti : native
	VAE dtype: torch.bfloat16
	INFO:__main__:merging models and asking the question: why is the sky blue?
	INFO:__main__:loading model=1/5 model=TinyLlama/TinyLlama-1.1B-intermediate-step-1431k-3T
	INFO:__main__:loading model=2/5 model=Doctor-Shotgun/TinyLlama-1.1B-32k-Instruct
	/d/venvs/dev/lib/python3.11/site-packages/torch/_utils.py:831: UserWarning: TypedStorage is deprecated. It will be removed in the future and UntypedStorage will be the only storage class. This should only matter to you if you are using storages directly. To access UntypedStorage directly, use tensor.untyped_storage() instead of tensor.storage()
	return self.fget.__get__(instance, owner)()
	INFO:__main__:loading model=3/5 model=Doctor-Shotgun/TinyLlama-1.1B-32k
	INFO:__main__:loading model=4/5 model=Tensoic/TinyLlama-1.1B-3T-openhermes
	INFO:__main__:loading model=5/5 model=Josephgflowers/TinyLlama-3T-Cinder-v1.3
	INFO:__main__:model=model.embed_tokens.weight 201 shape=torch.Size([32000, 2048]) dtype=torch.float16 cpu raio=0.5 contig=True norm=0.68
	INFO:__main__:model=model.layers.0.self_attn.q_proj.weight 201 shape=torch.Size([2048, 2048]) dtype=torch.float16 cpu raio=0.5 contig=True norm=0.68
	INFO:__main__:model=model.layers.0.self_attn.k_proj.weight 201 shape=torch.Size([256, 2048]) dtype=torch.float16 cpu raio=0.5 contig=True norm=0.68
	INFO:__main__:model=model.layers.0.self_attn.v_proj.weight 201 shape=torch.Size([256, 2048]) dtype=torch.float16 cpu raio=0.5 contig=True norm=0.68
	INFO:__main__:model=model.layers.0.self_attn.o_proj.weight 201 shape=torch.Size([2048, 2048]) dtype=torch.float16 cpu raio=0.5 contig=True norm=0.68
	INFO:__main__:model=model.layers.0.mlp.gate_proj.weight 201 shape=torch.Size([5632, 2048]) dtype=torch.float16 cpu raio=0.5 contig=True norm=0.68
	INFO:__main__:model=model.layers.0.mlp.up_proj.weight 201 shape=torch.Size([5632, 2048]) dtype=torch.float16 cpu raio=0.5 contig=True norm=0.68
	INFO:__main__:model=model.layers.0.mlp.down_proj.weight 201 shape=torch.Size([2048, 5632]) dtype=torch.float16 cpu raio=0.5 contig=True norm=0.68
	INFO:__main__:model=model.layers.0.input_layernorm.weight 201 shape=torch.Size([2048]) dtype=torch.float16 cpu raio=0.5 contig=True norm=0.68
	INFO:__main__:model=model.layers.0.post_attention_layernorm.weight 201 shape=torch.Size([2048]) dtype=torch.float16 cpu raio=0.5 contig=True norm=0.68
	INFO:__main__:model=model.layers.1.self_attn.q_proj.weight 201 shape=torch.Size([2048, 2048]) dtype=torch.float16 cpu raio=0.5 contig=True norm=0.68
	INFO:__main__:model=model.layers.1.self_attn.k_proj.weight 201 shape=torch.Size([256, 2048]) dtype=torch.float16 cpu raio=0.5 contig=True norm=0.68
	INFO:__main__:model=model.layers.1.self_attn.v_proj.weight 201 shape=torch.Size([256, 2048]) dtype=torch.float16 cpu raio=0.5 contig=True norm=0.68
	INFO:__main__:model=model.layers.1.self_attn.o_proj.weight 201 shape=torch.Size([2048, 2048]) dtype=torch.float16 cpu raio=0.5 contig=True norm=0.68
	INFO:__main__:model=model.layers.1.mlp.gate_proj.weight 201 shape=torch.Size([5632, 2048]) dtype=torch.float16 cpu raio=0.5 contig=True norm=0.68
	INFO:__main__:model=model.layers.1.mlp.up_proj.weight 201 shape=torch.Size([5632, 2048]) dtype=torch.float16 cpu raio=0.5 contig=True norm=0.68
	INFO:__main__:model=model.layers.1.mlp.down_proj.weight 201 shape=torch.Size([2048, 5632]) dtype=torch.float16 cpu raio=0.5 contig=True norm=0.68
	INFO:__main__:model=model.layers.1.input_layernorm.weight 201 shape=torch.Size([2048]) dtype=torch.float16 cpu raio=0.5 contig=True norm=0.68
	INFO:__main__:model=model.layers.1.post_attention_layernorm.weight 201 shape=torch.Size([2048]) dtype=torch.float16 cpu raio=0.5 contig=True norm=0.68
	INFO:__main__:model=model.layers.2.self_attn.q_proj.weight 201 shape=torch.Size([2048, 2048]) dtype=torch.float16 cpu raio=0.5 contig=True norm=0.68
	INFO:__main__:model=model.layers.2.self_attn.k_proj.weight 201 shape=torch.Size([256, 2048]) dtype=torch.float16 cpu raio=0.5 contig=True norm=0.68
	INFO:__main__:model=model.layers.2.self_attn.v_proj.weight 201 shape=torch.Size([256, 2048]) dtype=torch.float16 cpu raio=0.5 contig=True norm=0.68
	INFO:__main__:model=model.layers.2.self_attn.o_proj.weight 201 shape=torch.Size([2048, 2048]) dtype=torch.float16 cpu raio=0.5 contig=True norm=0.68
	INFO:__main__:model=model.layers.2.mlp.gate_proj.weight 201 shape=torch.Size([5632, 2048]) dtype=torch.float16 cpu raio=0.5 contig=True norm=0.68
	INFO:__main__:model=model.layers.2.mlp.up_proj.weight 201 shape=torch.Size([5632, 2048]) dtype=torch.float16 cpu raio=0.5 contig=True norm=0.68
	INFO:__main__:model=model.layers.2.mlp.down_proj.weight 201 shape=torch.Size([2048, 5632]) dtype=torch.float16 cpu raio=0.5 contig=True norm=0.68
	INFO:__main__:model=model.layers.2.input_layernorm.weight 201 shape=torch.Size([2048]) dtype=torch.float16 cpu raio=0.5 contig=True norm=0.68
	INFO:__main__:model=model.layers.2.post_attention_layernorm.weight 201 shape=torch.Size([2048]) dtype=torch.float16 cpu raio=0.5 contig=True norm=0.68
	INFO:__main__:model=model.layers.3.self_attn.q_proj.weight 201 shape=torch.Size([2048, 2048]) dtype=torch.float16 cpu raio=0.5 contig=True norm=0.68
	INFO:__main__:model=model.layers.3.self_attn.k_proj.weight 201 shape=torch.Size([256, 2048]) dtype=torch.float16 cpu raio=0.5 contig=True norm=0.68
	INFO:__main__:model=model.layers.3.self_attn.v_proj.weight 201 shape=torch.Size([256, 2048]) dtype=torch.float16 cpu raio=0.5 contig=True norm=0.68
	INFO:__main__:model=model.layers.3.self_attn.o_proj.weight 201 shape=torch.Size([2048, 2048]) dtype=torch.float16 cpu raio=0.5 contig=True norm=0.68
	INFO:__main__:model=model.layers.3.mlp.gate_proj.weight 201 shape=torch.Size([5632, 2048]) dtype=torch.float16 cpu raio=0.5 contig=True norm=0.68
	INFO:__main__:model=model.layers.3.mlp.up_proj.weight 201 shape=torch.Size([5632, 2048]) dtype=torch.float16 cpu raio=0.5 contig=True norm=0.68
	INFO:__main__:model=model.layers.3.mlp.down_proj.weight 201 shape=torch.Size([2048, 5632]) dtype=torch.float16 cpu raio=0.5 contig=True norm=0.68
	INFO:__main__:model=model.layers.3.input_layernorm.weight 201 shape=torch.Size([2048]) dtype=torch.float16 cpu raio=0.5 contig=True norm=0.68
	INFO:__main__:model=model.layers.3.post_attention_layernorm.weight 201 shape=torch.Size([2048]) dtype=torch.float16 cpu raio=0.5 contig=True norm=0.68
	INFO:__main__:model=model.layers.4.self_attn.q_proj.weight 201 shape=torch.Size([2048, 2048]) dtype=torch.float16 cpu raio=0.5 contig=True norm=0.68
	INFO:__main__:model=model.layers.4.self_attn.k_proj.weight 201 shape=torch.Size([256, 2048]) dtype=torch.float16 cpu raio=0.5 contig=True norm=0.68
	INFO:__main__:model=model.layers.4.self_attn.v_proj.weight 201 shape=torch.Size([256, 2048]) dtype=torch.float16 cpu raio=0.5 contig=True norm=0.68
	INFO:__main__:model=model.layers.4.self_attn.o_proj.weight 201 shape=torch.Size([2048, 2048]) dtype=torch.float16 cpu raio=0.5 contig=True norm=0.68
	INFO:__main__:model=model.layers.4.mlp.gate_proj.weight 201 shape=torch.Size([5632, 2048]) dtype=torch.float16 cpu raio=0.5 contig=True norm=0.68
	INFO:__main__:model=model.layers.4.mlp.up_proj.weight 201 shape=torch.Size([5632, 2048]) dtype=torch.float16 cpu raio=0.5 contig=True norm=0.68
	INFO:__main__:model=model.layers.4.mlp.down_proj.weight 201 shape=torch.Size([2048, 5632]) dtype=torch.float16 cpu raio=0.5 contig=True norm=0.68
	INFO:__main__:model=model.layers.4.input_layernorm.weight 201 shape=torch.Size([2048]) dtype=torch.float16 cpu raio=0.5 contig=True norm=0.68
	INFO:__main__:model=model.layers.4.post_attention_layernorm.weight 201 shape=torch.Size([2048]) dtype=torch.float16 cpu raio=0.5 contig=True norm=0.68
	INFO:__main__:model=model.layers.5.self_attn.q_proj.weight 201 shape=torch.Size([2048, 2048]) dtype=torch.float16 cpu raio=0.5 contig=True norm=0.68
	INFO:__main__:model=model.layers.5.self_attn.k_proj.weight 201 shape=torch.Size([256, 2048]) dtype=torch.float16 cpu raio=0.5 contig=True norm=0.68
	INFO:__main__:model=model.layers.5.self_attn.v_proj.weight 201 shape=torch.Size([256, 2048]) dtype=torch.float16 cpu raio=0.5 contig=True norm=0.68
	INFO:__main__:model=model.layers.5.self_attn.o_proj.weight 201 shape=torch.Size([2048, 2048]) dtype=torch.float16 cpu raio=0.5 contig=True norm=0.68
	INFO:__main__:model=model.layers.5.mlp.gate_proj.weight 201 shape=torch.Size([5632, 2048]) dtype=torch.float16 cpu raio=0.5 contig=True norm=0.68
	INFO:__main__:model=model.layers.5.mlp.up_proj.weight 201 shape=torch.Size([5632, 2048]) dtype=torch.float16 cpu raio=0.5 contig=True norm=0.68
	INFO:__main__:model=model.layers.5.mlp.down_proj.weight 201 shape=torch.Size([2048, 5632]) dtype=torch.float16 cpu raio=0.5 contig=True norm=0.68
	INFO:__main__:model=model.layers.5.input_layernorm.weight 201 shape=torch.Size([2048]) dtype=torch.float16 cpu raio=0.5 contig=True norm=0.68
	INFO:__main__:model=model.layers.5.post_attention_layernorm.weight 201 shape=torch.Size([2048]) dtype=torch.float16 cpu raio=0.5 contig=True norm=0.68
	INFO:__main__:model=model.layers.6.self_attn.q_proj.weight 201 shape=torch.Size([2048, 2048]) dtype=torch.float16 cpu raio=0.5 contig=True norm=0.68
	INFO:__main__:model=model.layers.6.self_attn.k_proj.weight 201 shape=torch.Size([256, 2048]) dtype=torch.float16 cpu raio=0.5 contig=True norm=0.68
	INFO:__main__:model=model.layers.6.self_attn.v_proj.weight 201 shape=torch.Size([256, 2048]) dtype=torch.float16 cpu raio=0.5 contig=True norm=0.68
	INFO:__main__:model=model.layers.6.self_attn.o_proj.weight 201 shape=torch.Size([2048, 2048]) dtype=torch.float16 cpu raio=0.5 contig=True norm=0.68
	INFO:__main__:model=model.layers.6.mlp.gate_proj.weight 201 shape=torch.Size([5632, 2048]) dtype=torch.float16 cpu raio=0.5 contig=True norm=0.68
	INFO:__main__:model=model.layers.6.mlp.up_proj.weight 201 shape=torch.Size([5632, 2048]) dtype=torch.float16 cpu raio=0.5 contig=True norm=0.68
	INFO:__main__:model=model.layers.6.mlp.down_proj.weight 201 shape=torch.Size([2048, 5632]) dtype=torch.float16 cpu raio=0.5 contig=True norm=0.68
	INFO:__main__:model=model.layers.6.input_layernorm.weight 201 shape=torch.Size([2048]) dtype=torch.float16 cpu raio=0.5 contig=True norm=0.68
	INFO:__main__:model=model.layers.6.post_attention_layernorm.weight 201 shape=torch.Size([2048]) dtype=torch.float16 cpu raio=0.5 contig=True norm=0.68
	INFO:__main__:model=model.layers.7.self_attn.q_proj.weight 201 shape=torch.Size([2048, 2048]) dtype=torch.float16 cpu raio=0.5 contig=True norm=0.68
	INFO:__main__:model=model.layers.7.self_attn.k_proj.weight 201 shape=torch.Size([256, 2048]) dtype=torch.float16 cpu raio=0.5 contig=True norm=0.68
	INFO:__main__:model=model.layers.7.self_attn.v_proj.weight 201 shape=torch.Size([256, 2048]) dtype=torch.float16 cpu raio=0.5 contig=True norm=0.68
	INFO:__main__:model=model.layers.7.self_attn.o_proj.weight 201 shape=torch.Size([2048, 2048]) dtype=torch.float16 cpu raio=0.5 contig=True norm=0.68
	INFO:__main__:model=model.layers.7.mlp.gate_proj.weight 201 shape=torch.Size([5632, 2048]) dtype=torch.float16 cpu raio=0.5 contig=True norm=0.68
	INFO:__main__:model=model.layers.7.mlp.up_proj.weight 201 shape=torch.Size([5632, 2048]) dtype=torch.float16 cpu raio=0.5 contig=True norm=0.68
	INFO:__main__:model=model.layers.7.mlp.down_proj.weight 201 shape=torch.Size([2048, 5632]) dtype=torch.float16 cpu raio=0.5 contig=True norm=0.68
	INFO:__main__:model=model.layers.7.input_layernorm.weight 201 shape=torch.Size([2048]) dtype=torch.float16 cpu raio=0.5 contig=True norm=0.68
	INFO:__main__:model=model.layers.7.post_attention_layernorm.weight 201 shape=torch.Size([2048]) dtype=torch.float16 cpu raio=0.5 contig=True norm=0.68
	INFO:__main__:model=model.layers.8.self_attn.q_proj.weight 201 shape=torch.Size([2048, 2048]) dtype=torch.float16 cpu raio=0.5 contig=True norm=0.68
	INFO:__main__:model=model.layers.8.self_attn.k_proj.weight 201 shape=torch.Size([256, 2048]) dtype=torch.float16 cpu raio=0.5 contig=True norm=0.68
	INFO:__main__:model=model.layers.8.self_attn.v_proj.weight 201 shape=torch.Size([256, 2048]) dtype=torch.float16 cpu raio=0.5 contig=True norm=0.68
	INFO:__main__:model=model.layers.8.self_attn.o_proj.weight 201 shape=torch.Size([2048, 2048]) dtype=torch.float16 cpu raio=0.5 contig=True norm=0.68
	INFO:__main__:model=model.layers.8.mlp.gate_proj.weight 201 shape=torch.Size([5632, 2048]) dtype=torch.float16 cpu raio=0.5 contig=True norm=0.68
	INFO:__main__:model=model.layers.8.mlp.up_proj.weight 201 shape=torch.Size([5632, 2048]) dtype=torch.float16 cpu raio=0.5 contig=True norm=0.68
	INFO:__main__:model=model.layers.8.mlp.down_proj.weight 201 shape=torch.Size([2048, 5632]) dtype=torch.float16 cpu raio=0.5 contig=True norm=0.68
	INFO:__main__:model=model.layers.8.input_layernorm.weight 201 shape=torch.Size([2048]) dtype=torch.float16 cpu raio=0.5 contig=True norm=0.68
	INFO:__main__:model=model.layers.8.post_attention_layernorm.weight 201 shape=torch.Size([2048]) dtype=torch.float16 cpu raio=0.5 contig=True norm=0.68
	INFO:__main__:model=model.layers.9.self_attn.q_proj.weight 201 shape=torch.Size([2048, 2048]) dtype=torch.float16 cpu raio=0.5 contig=True norm=0.68
	INFO:__main__:model=model.layers.9.self_attn.k_proj.weight 201 shape=torch.Size([256, 2048]) dtype=torch.float16 cpu raio=0.5 contig=True norm=0.68
	INFO:__main__:model=model.layers.9.self_attn.v_proj.weight 201 shape=torch.Size([256, 2048]) dtype=torch.float16 cpu raio=0.5 contig=True norm=0.68
	INFO:__main__:model=model.layers.9.self_attn.o_proj.weight 201 shape=torch.Size([2048, 2048]) dtype=torch.float16 cpu raio=0.5 contig=True norm=0.68
	INFO:__main__:model=model.layers.9.mlp.gate_proj.weight 201 shape=torch.Size([5632, 2048]) dtype=torch.float16 cpu raio=0.5 contig=True norm=0.68
	INFO:__main__:model=model.layers.9.mlp.up_proj.weight 201 shape=torch.Size([5632, 2048]) dtype=torch.float16 cpu raio=0.5 contig=True norm=0.68
	INFO:__main__:model=model.layers.9.mlp.down_proj.weight 201 shape=torch.Size([2048, 5632]) dtype=torch.float16 cpu raio=0.5 contig=True norm=0.68
	INFO:__main__:model=model.layers.9.input_layernorm.weight 201 shape=torch.Size([2048]) dtype=torch.float16 cpu raio=0.5 contig=True norm=0.68
	INFO:__main__:model=model.layers.9.post_attention_layernorm.weight 201 shape=torch.Size([2048]) dtype=torch.float16 cpu raio=0.5 contig=True norm=0.68
	INFO:__main__:model=model.layers.10.self_attn.q_proj.weight 201 shape=torch.Size([2048, 2048]) dtype=torch.float16 cpu raio=0.5 contig=True norm=0.68
	INFO:__main__:model=model.layers.10.self_attn.k_proj.weight 201 shape=torch.Size([256, 2048]) dtype=torch.float16 cpu raio=0.5 contig=True norm=0.68
	INFO:__main__:model=model.layers.10.self_attn.v_proj.weight 201 shape=torch.Size([256, 2048]) dtype=torch.float16 cpu raio=0.5 contig=True norm=0.68
	INFO:__main__:model=model.layers.10.self_attn.o_proj.weight 201 shape=torch.Size([2048, 2048]) dtype=torch.float16 cpu raio=0.5 contig=True norm=0.68
	INFO:__main__:model=model.layers.10.mlp.gate_proj.weight 201 shape=torch.Size([5632, 2048]) dtype=torch.float16 cpu raio=0.5 contig=True norm=0.68
	INFO:__main__:model=model.layers.10.mlp.up_proj.weight 201 shape=torch.Size([5632, 2048]) dtype=torch.float16 cpu raio=0.5 contig=True norm=0.68
	INFO:__main__:model=model.layers.10.mlp.down_proj.weight 201 shape=torch.Size([2048, 5632]) dtype=torch.float16 cpu raio=0.5 contig=True norm=0.68
	INFO:__main__:model=model.layers.10.input_layernorm.weight 201 shape=torch.Size([2048]) dtype=torch.float16 cpu raio=0.5 contig=True norm=0.68
	INFO:__main__:model=model.layers.10.post_attention_layernorm.weight 201 shape=torch.Size([2048]) dtype=torch.float16 cpu raio=0.5 contig=True norm=0.68
	INFO:__main__:model=model.layers.11.self_attn.q_proj.weight 201 shape=torch.Size([2048, 2048]) dtype=torch.float16 cpu raio=0.5 contig=True norm=0.68
	INFO:__main__:model=model.layers.11.self_attn.k_proj.weight 201 shape=torch.Size([256, 2048]) dtype=torch.float16 cpu raio=0.5 contig=True norm=0.68
	INFO:__main__:model=model.layers.11.self_attn.v_proj.weight 201 shape=torch.Size([256, 2048]) dtype=torch.float16 cpu raio=0.5 contig=True norm=0.68
	INFO:__main__:model=model.layers.11.self_attn.o_proj.weight 201 shape=torch.Size([2048, 2048]) dtype=torch.float16 cpu raio=0.5 contig=True norm=0.68
	INFO:__main__:model=model.layers.11.mlp.gate_proj.weight 201 shape=torch.Size([5632, 2048]) dtype=torch.float16 cpu raio=0.5 contig=True norm=0.68
	INFO:__main__:model=model.layers.11.mlp.up_proj.weight 201 shape=torch.Size([5632, 2048]) dtype=torch.float16 cpu raio=0.5 contig=True norm=0.68
	INFO:__main__:model=model.layers.11.mlp.down_proj.weight 201 shape=torch.Size([2048, 5632]) dtype=torch.float16 cpu raio=0.5 contig=True norm=0.68
	INFO:__main__:model=model.layers.11.input_layernorm.weight 201 shape=torch.Size([2048]) dtype=torch.float16 cpu raio=0.5 contig=True norm=0.68
	INFO:__main__:model=model.layers.11.post_attention_layernorm.weight 201 shape=torch.Size([2048]) dtype=torch.float16 cpu raio=0.5 contig=True norm=0.68
	INFO:__main__:model=model.layers.12.self_attn.q_proj.weight 201 shape=torch.Size([2048, 2048]) dtype=torch.float16 cpu raio=0.5 contig=True norm=0.68
	INFO:__main__:model=model.layers.12.self_attn.k_proj.weight 201 shape=torch.Size([256, 2048]) dtype=torch.float16 cpu raio=0.5 contig=True norm=0.68
	INFO:__main__:model=model.layers.12.self_attn.v_proj.weight 201 shape=torch.Size([256, 2048]) dtype=torch.float16 cpu raio=0.5 contig=True norm=0.68
	INFO:__main__:model=model.layers.12.self_attn.o_proj.weight 201 shape=torch.Size([2048, 2048]) dtype=torch.float16 cpu raio=0.5 contig=True norm=0.68
	INFO:__main__:model=model.layers.12.mlp.gate_proj.weight 201 shape=torch.Size([5632, 2048]) dtype=torch.float16 cpu raio=0.5 contig=True norm=0.68
	INFO:__main__:model=model.layers.12.mlp.up_proj.weight 201 shape=torch.Size([5632, 2048]) dtype=torch.float16 cpu raio=0.5 contig=True norm=0.68
	INFO:__main__:model=model.layers.12.mlp.down_proj.weight 201 shape=torch.Size([2048, 5632]) dtype=torch.float16 cpu raio=0.5 contig=True norm=0.68
	INFO:__main__:model=model.layers.12.input_layernorm.weight 201 shape=torch.Size([2048]) dtype=torch.float16 cpu raio=0.5 contig=True norm=0.68
	INFO:__main__:model=model.layers.12.post_attention_layernorm.weight 201 shape=torch.Size([2048]) dtype=torch.float16 cpu raio=0.5 contig=True norm=0.68
	INFO:__main__:model=model.layers.13.self_attn.q_proj.weight 201 shape=torch.Size([2048, 2048]) dtype=torch.float16 cpu raio=0.5 contig=True norm=0.68
	INFO:__main__:model=model.layers.13.self_attn.k_proj.weight 201 shape=torch.Size([256, 2048]) dtype=torch.float16 cpu raio=0.5 contig=True norm=0.68
	INFO:__main__:model=model.layers.13.self_attn.v_proj.weight 201 shape=torch.Size([256, 2048]) dtype=torch.float16 cpu raio=0.5 contig=True norm=0.68
	INFO:__main__:model=model.layers.13.self_attn.o_proj.weight 201 shape=torch.Size([2048, 2048]) dtype=torch.float16 cpu raio=0.5 contig=True norm=0.68
	INFO:__main__:model=model.layers.13.mlp.gate_proj.weight 201 shape=torch.Size([5632, 2048]) dtype=torch.float16 cpu raio=0.5 contig=True norm=0.68
	INFO:__main__:model=model.layers.13.mlp.up_proj.weight 201 shape=torch.Size([5632, 2048]) dtype=torch.float16 cpu raio=0.5 contig=True norm=0.68
	INFO:__main__:model=model.layers.13.mlp.down_proj.weight 201 shape=torch.Size([2048, 5632]) dtype=torch.float16 cpu raio=0.5 contig=True norm=0.68
	INFO:__main__:model=model.layers.13.input_layernorm.weight 201 shape=torch.Size([2048]) dtype=torch.float16 cpu raio=0.5 contig=True norm=0.68
	INFO:__main__:model=model.layers.13.post_attention_layernorm.weight 201 shape=torch.Size([2048]) dtype=torch.float16 cpu raio=0.5 contig=True norm=0.68
	INFO:__main__:model=model.layers.14.self_attn.q_proj.weight 201 shape=torch.Size([2048, 2048]) dtype=torch.float16 cpu raio=0.5 contig=True norm=0.68
	INFO:__main__:model=model.layers.14.self_attn.k_proj.weight 201 shape=torch.Size([256, 2048]) dtype=torch.float16 cpu raio=0.5 contig=True norm=0.68
	INFO:__main__:model=model.layers.14.self_attn.v_proj.weight 201 shape=torch.Size([256, 2048]) dtype=torch.float16 cpu raio=0.5 contig=True norm=0.68
	INFO:__main__:model=model.layers.14.self_attn.o_proj.weight 201 shape=torch.Size([2048, 2048]) dtype=torch.float16 cpu raio=0.5 contig=True norm=0.68
	INFO:__main__:model=model.layers.14.mlp.gate_proj.weight 201 shape=torch.Size([5632, 2048]) dtype=torch.float16 cpu raio=0.5 contig=True norm=0.68
	INFO:__main__:model=model.layers.14.mlp.up_proj.weight 201 shape=torch.Size([5632, 2048]) dtype=torch.float16 cpu raio=0.5 contig=True norm=0.68
	INFO:__main__:model=model.layers.14.mlp.down_proj.weight 201 shape=torch.Size([2048, 5632]) dtype=torch.float16 cpu raio=0.5 contig=True norm=0.68
	INFO:__main__:model=model.layers.14.input_layernorm.weight 201 shape=torch.Size([2048]) dtype=torch.float16 cpu raio=0.5 contig=True norm=0.68
	INFO:__main__:model=model.layers.14.post_attention_layernorm.weight 201 shape=torch.Size([2048]) dtype=torch.float16 cpu raio=0.5 contig=True norm=0.68
	INFO:__main__:model=model.layers.15.self_attn.q_proj.weight 201 shape=torch.Size([2048, 2048]) dtype=torch.float16 cpu raio=0.5 contig=True norm=0.68
	INFO:__main__:model=model.layers.15.self_attn.k_proj.weight 201 shape=torch.Size([256, 2048]) dtype=torch.float16 cpu raio=0.5 contig=True norm=0.68
	INFO:__main__:model=model.layers.15.self_attn.v_proj.weight 201 shape=torch.Size([256, 2048]) dtype=torch.float16 cpu raio=0.5 contig=True norm=0.68
	INFO:__main__:model=model.layers.15.self_attn.o_proj.weight 201 shape=torch.Size([2048, 2048]) dtype=torch.float16 cpu raio=0.5 contig=True norm=0.68
	INFO:__main__:model=model.layers.15.mlp.gate_proj.weight 201 shape=torch.Size([5632, 2048]) dtype=torch.float16 cpu raio=0.5 contig=True norm=0.68
	INFO:__main__:model=model.layers.15.mlp.up_proj.weight 201 shape=torch.Size([5632, 2048]) dtype=torch.float16 cpu raio=0.5 contig=True norm=0.68
	INFO:__main__:model=model.layers.15.mlp.down_proj.weight 201 shape=torch.Size([2048, 5632]) dtype=torch.float16 cpu raio=0.5 contig=True norm=0.68
	INFO:__main__:model=model.layers.15.input_layernorm.weight 201 shape=torch.Size([2048]) dtype=torch.float16 cpu raio=0.5 contig=True norm=0.68
	INFO:__main__:model=model.layers.15.post_attention_layernorm.weight 201 shape=torch.Size([2048]) dtype=torch.float16 cpu raio=0.5 contig=True norm=0.68
	INFO:__main__:model=model.layers.16.self_attn.q_proj.weight 201 shape=torch.Size([2048, 2048]) dtype=torch.float16 cpu raio=0.5 contig=True norm=0.68
	INFO:__main__:model=model.layers.16.self_attn.k_proj.weight 201 shape=torch.Size([256, 2048]) dtype=torch.float16 cpu raio=0.5 contig=True norm=0.68
	INFO:__main__:model=model.layers.16.self_attn.v_proj.weight 201 shape=torch.Size([256, 2048]) dtype=torch.float16 cpu raio=0.5 contig=True norm=0.68
	INFO:__main__:model=model.layers.16.self_attn.o_proj.weight 201 shape=torch.Size([2048, 2048]) dtype=torch.float16 cpu raio=0.5 contig=True norm=0.68
	INFO:__main__:model=model.layers.16.mlp.gate_proj.weight 201 shape=torch.Size([5632, 2048]) dtype=torch.float16 cpu raio=0.5 contig=True norm=0.68
	INFO:__main__:model=model.layers.16.mlp.up_proj.weight 201 shape=torch.Size([5632, 2048]) dtype=torch.float16 cpu raio=0.5 contig=True norm=0.68
	INFO:__main__:model=model.layers.16.mlp.down_proj.weight 201 shape=torch.Size([2048, 5632]) dtype=torch.float16 cpu raio=0.5 contig=True norm=0.68
	INFO:__main__:model=model.layers.16.input_layernorm.weight 201 shape=torch.Size([2048]) dtype=torch.float16 cpu raio=0.5 contig=True norm=0.68
	INFO:__main__:model=model.layers.16.post_attention_layernorm.weight 201 shape=torch.Size([2048]) dtype=torch.float16 cpu raio=0.5 contig=True norm=0.68
	INFO:__main__:model=model.layers.17.self_attn.q_proj.weight 201 shape=torch.Size([2048, 2048]) dtype=torch.float16 cpu raio=0.5 contig=True norm=0.68
	INFO:__main__:model=model.layers.17.self_attn.k_proj.weight 201 shape=torch.Size([256, 2048]) dtype=torch.float16 cpu raio=0.5 contig=True norm=0.68
	INFO:__main__:model=model.layers.17.self_attn.v_proj.weight 201 shape=torch.Size([256, 2048]) dtype=torch.float16 cpu raio=0.5 contig=True norm=0.68
	INFO:__main__:model=model.layers.17.self_attn.o_proj.weight 201 shape=torch.Size([2048, 2048]) dtype=torch.float16 cpu raio=0.5 contig=True norm=0.68
	INFO:__main__:model=model.layers.17.mlp.gate_proj.weight 201 shape=torch.Size([5632, 2048]) dtype=torch.float16 cpu raio=0.5 contig=True norm=0.68
	INFO:__main__:model=model.layers.17.mlp.up_proj.weight 201 shape=torch.Size([5632, 2048]) dtype=torch.float16 cpu raio=0.5 contig=True norm=0.68
	INFO:__main__:model=model.layers.17.mlp.down_proj.weight 201 shape=torch.Size([2048, 5632]) dtype=torch.float16 cpu raio=0.5 contig=True norm=0.68
	INFO:__main__:model=model.layers.17.input_layernorm.weight 201 shape=torch.Size([2048]) dtype=torch.float16 cpu raio=0.5 contig=True norm=0.68
	INFO:__main__:model=model.layers.17.post_attention_layernorm.weight 201 shape=torch.Size([2048]) dtype=torch.float16 cpu raio=0.5 contig=True norm=0.68
	INFO:__main__:model=model.layers.18.self_attn.q_proj.weight 201 shape=torch.Size([2048, 2048]) dtype=torch.float16 cpu raio=0.5 contig=True norm=0.68
	INFO:__main__:model=model.layers.18.self_attn.k_proj.weight 201 shape=torch.Size([256, 2048]) dtype=torch.float16 cpu raio=0.5 contig=True norm=0.68
	INFO:__main__:model=model.layers.18.self_attn.v_proj.weight 201 shape=torch.Size([256, 2048]) dtype=torch.float16 cpu raio=0.5 contig=True norm=0.68
	INFO:__main__:model=model.layers.18.self_attn.o_proj.weight 201 shape=torch.Size([2048, 2048]) dtype=torch.float16 cpu raio=0.5 contig=True norm=0.68
	INFO:__main__:model=model.layers.18.mlp.gate_proj.weight 201 shape=torch.Size([5632, 2048]) dtype=torch.float16 cpu raio=0.5 contig=True norm=0.68
	INFO:__main__:model=model.layers.18.mlp.up_proj.weight 201 shape=torch.Size([5632, 2048]) dtype=torch.float16 cpu raio=0.5 contig=True norm=0.68
	INFO:__main__:model=model.layers.18.mlp.down_proj.weight 201 shape=torch.Size([2048, 5632]) dtype=torch.float16 cpu raio=0.5 contig=True norm=0.68
	INFO:__main__:model=model.layers.18.input_layernorm.weight 201 shape=torch.Size([2048]) dtype=torch.float16 cpu raio=0.5 contig=True norm=0.68
	INFO:__main__:model=model.layers.18.post_attention_layernorm.weight 201 shape=torch.Size([2048]) dtype=torch.float16 cpu raio=0.5 contig=True norm=0.68
	INFO:__main__:model=model.layers.19.self_attn.q_proj.weight 201 shape=torch.Size([2048, 2048]) dtype=torch.float16 cpu raio=0.5 contig=True norm=0.68
	INFO:__main__:model=model.layers.19.self_attn.k_proj.weight 201 shape=torch.Size([256, 2048]) dtype=torch.float16 cpu raio=0.5 contig=True norm=0.68
	INFO:__main__:model=model.layers.19.self_attn.v_proj.weight 201 shape=torch.Size([256, 2048]) dtype=torch.float16 cpu raio=0.5 contig=True norm=0.68
	INFO:__main__:model=model.layers.19.self_attn.o_proj.weight 201 shape=torch.Size([2048, 2048]) dtype=torch.float16 cpu raio=0.5 contig=True norm=0.68
	INFO:__main__:model=model.layers.19.mlp.gate_proj.weight 201 shape=torch.Size([5632, 2048]) dtype=torch.float16 cpu raio=0.5 contig=True norm=0.68
	INFO:__main__:model=model.layers.19.mlp.up_proj.weight 201 shape=torch.Size([5632, 2048]) dtype=torch.float16 cpu raio=0.5 contig=True norm=0.68
	INFO:__main__:model=model.layers.19.mlp.down_proj.weight 201 shape=torch.Size([2048, 5632]) dtype=torch.float16 cpu raio=0.5 contig=True norm=0.68
	INFO:__main__:model=model.layers.19.input_layernorm.weight 201 shape=torch.Size([2048]) dtype=torch.float16 cpu raio=0.5 contig=True norm=0.68
	INFO:__main__:model=model.layers.19.post_attention_layernorm.weight 201 shape=torch.Size([2048]) dtype=torch.float16 cpu raio=0.5 contig=True norm=0.68
	INFO:__main__:model=model.layers.20.self_attn.q_proj.weight 201 shape=torch.Size([2048, 2048]) dtype=torch.float16 cpu raio=0.5 contig=True norm=0.68
	INFO:__main__:model=model.layers.20.self_attn.k_proj.weight 201 shape=torch.Size([256, 2048]) dtype=torch.float16 cpu raio=0.5 contig=True norm=0.68
	INFO:__main__:model=model.layers.20.self_attn.v_proj.weight 201 shape=torch.Size([256, 2048]) dtype=torch.float16 cpu raio=0.5 contig=True norm=0.68
	INFO:__main__:model=model.layers.20.self_attn.o_proj.weight 201 shape=torch.Size([2048, 2048]) dtype=torch.float16 cpu raio=0.5 contig=True norm=0.68
	INFO:__main__:model=model.layers.20.mlp.gate_proj.weight 201 shape=torch.Size([5632, 2048]) dtype=torch.float16 cpu raio=0.5 contig=True norm=0.68
	INFO:__main__:model=model.layers.20.mlp.up_proj.weight 201 shape=torch.Size([5632, 2048]) dtype=torch.float16 cpu raio=0.5 contig=True norm=0.68
	INFO:__main__:model=model.layers.20.mlp.down_proj.weight 201 shape=torch.Size([2048, 5632]) dtype=torch.float16 cpu raio=0.5 contig=True norm=0.68
	INFO:__main__:model=model.layers.20.input_layernorm.weight 201 shape=torch.Size([2048]) dtype=torch.float16 cpu raio=0.5 contig=True norm=0.68
	INFO:__main__:model=model.layers.20.post_attention_layernorm.weight 201 shape=torch.Size([2048]) dtype=torch.float16 cpu raio=0.5 contig=True norm=0.68
	INFO:__main__:model=model.layers.21.self_attn.q_proj.weight 201 shape=torch.Size([2048, 2048]) dtype=torch.float16 cpu raio=0.5 contig=True norm=0.68
	INFO:__main__:model=model.layers.21.self_attn.k_proj.weight 201 shape=torch.Size([256, 2048]) dtype=torch.float16 cpu raio=0.5 contig=True norm=0.68
	INFO:__main__:model=model.layers.21.self_attn.v_proj.weight 201 shape=torch.Size([256, 2048]) dtype=torch.float16 cpu raio=0.5 contig=True norm=0.68
	INFO:__main__:model=model.layers.21.self_attn.o_proj.weight 201 shape=torch.Size([2048, 2048]) dtype=torch.float16 cpu raio=0.5 contig=True norm=0.68
	INFO:__main__:model=model.layers.21.mlp.gate_proj.weight 201 shape=torch.Size([5632, 2048]) dtype=torch.float16 cpu raio=0.5 contig=True norm=0.68
	INFO:__main__:model=model.layers.21.mlp.up_proj.weight 201 shape=torch.Size([5632, 2048]) dtype=torch.float16 cpu raio=0.5 contig=True norm=0.68
	INFO:__main__:model=model.layers.21.mlp.down_proj.weight 201 shape=torch.Size([2048, 5632]) dtype=torch.float16 cpu raio=0.5 contig=True norm=0.68
	INFO:__main__:model=model.layers.21.input_layernorm.weight 201 shape=torch.Size([2048]) dtype=torch.float16 cpu raio=0.5 contig=True norm=0.68
	INFO:__main__:model=model.layers.21.post_attention_layernorm.weight 201 shape=torch.Size([2048]) dtype=torch.float16 cpu raio=0.5 contig=True norm=0.68
	INFO:__main__:model=model.norm.weight 201 shape=torch.Size([2048]) dtype=torch.float16 cpu raio=0.5 contig=True norm=0.68
	INFO:__main__:model=lm_head.weight 201 shape=torch.Size([32000, 2048]) dtype=torch.float16 cpu raio=0.5 contig=True norm=0.68
	INFO:__main__:done merge saving to file: matlok/tinyllama-cinder-openhermes-32k
	INFO:__main__:loading newly-created file: matlok/tinyllama-cinder-openhermes-32k
	INFO:__main__:loaded new model file: matlok/tinyllama-cinder-openhermes-32k asking question: why is the sky blue?
	INFO:__main__:loading tokenizer=TinyLlama/TinyLlama-1.1B-intermediate-step-1431k-3T
	Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
	INFO:__main__:
	----------
	tokenizer=LlamaTokenizerFast(name_or_path='TinyLlama/TinyLlama-1.1B-intermediate-step-1431k-3T', vocab_size=32000, model_max_length=1000000000000000019884624838656, is_fast=True, padding_side='right', truncation_side='right', special_tokens={'bos_token': '<s>', 'eos_token': '</s>', 'unk_token': '<unk>'}, clean_up_tokenization_spaces=False), added_tokens_decoder={
	0: AddedToken("<unk>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	1: AddedToken("<s>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	2: AddedToken("</s>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	}
	question:
	why is the sky blue?
	answer:
	why is the sky blue?
	The sky is blue because it is made up of the colors of the visible spectrum. The visible spectrum is a range of colors that can be seen with the naked eye. The colors in the visible spectrum are made up of light waves that are shorter than the wavelengths of the visible light. The shorter wavelengths of light are absorbed more easily by the atmosphere, which is why the sky is blue.
	What is the color of the sky?
	The color of the sky is blue. This is because the visible spectrum is made up of the colors of the blue and violet parts of the spectrum. The blue part of the spectrum is made up of light waves that are shorter than the wavelengths of the visible light. The violet part of the spectrum is made up of light waves that are longer than the wavelengths of the visible light.
	What is the color of the sky in the winter?
	The color of the sky in the winter is usually a deep blue. This is because the visible spectrum is made up of the colors of the blue and violet parts of the spectrum. The blue part of the spectrum is made up of light waves that are shorter than the wavelengths of the visible light. The violet part of the spectrum is made up of light waves that are longer than the wavelengths of the visible light.
	What is the color of the sky in the summer?
	The color of the sky in the summer is usually a bright yellow. This is because the visible spectrum is made up of the colors of the yellow and orange parts of the spectrum. The yellow part of the spectrum is made up of light waves that are shorter than the wavelengths of the visible light. The orange part of the spectrum is made up of light waves that are longer than the wavelengths of the visible light.
	What is the color of the sky in the spring?
	The color of the sky in the spring is usually a bright green. This is because the visible spectrum is made up of the colors of the green and blue parts of the spectrum. The green part of the spectrum is made up of light waves that are shorter than the wavelengths of the visible light. The blue part of the spectrum is made up of light waves that are longer than the wavelengths of the visible light.
	What is the color of the sky in the fall?
	The color of the sky in the fall is usually a deep red. This is because the visible spectrum is made up of the colors of the red and orange parts of the spectrum. The red part of the spectrum is made up of light waves that are shorter than the wavelengths of the visible light. The orange part of the spectrum is made up of light waves that are longer than the wavelengths of the visible light.
	What is the color of the sky in the winter?
	The color of the sky in the winter is usually a deep blue. This is because the visible spectrum is made up of the colors of the blue and violet parts of the spectrum. The blue part of the spectrum is made up of light waves that are shorter than the wavelengths of the visible light. The violet part of the spectrum is made up of light waves that are longer than the wavelengths of the visible light.
	What is the color of the sky in the summer?
	The color of the sky in the summer is usually a bright yellow. This is because the visible spectrum is made up of the colors of the yellow and orange parts of the spectrum. The yellow part of the spectrum is made up of light waves that are shorter than the wavelengths of the visible light. The orange part of the spectrum is made up of light waves that are longer than the wavelengths of the visible light.
	What is the color of the sky in the spring?
	The color of the sky in the spring is usually a bright green. This is because the visible spectrum is made up of the colors of the green and blue parts of the spectrum. The green part of the spectrum is made up of light waves that are shorter than the wavelengths of the visible light. The blue part of the spectrum is made up of light waves that are longer than the wavelengths of the visible light.
	What is the color of the sky in the fall?
	The color of the sky in the fall is usually a deep red. This is because the visible spectrum is made up of the colors of the red and orange parts of the spectrum. The red part of the spectrum is made up of light waves that are shorter than the wavelengths of the visible light. The orange part of the spectrum is made up of light waves that are longer than the wavelengths of the visible light.
	What is the color of the
	----------
	INFO:__main__:done loading new model: LlamaForCausalLM(
	(model): LlamaModel(
	(embed_tokens): Embedding(32000, 2048)
	(layers): ModuleList(
	(0-21): 22 x LlamaDecoderLayer(
	(self_attn): LlamaSdpaAttention(
	(q_proj): Linear(in_features=2048, out_features=2048, bias=False)
	(k_proj): Linear(in_features=2048, out_features=256, bias=False)
	(v_proj): Linear(in_features=2048, out_features=256, bias=False)
	(o_proj): Linear(in_features=2048, out_features=2048, bias=False)
	(rotary_emb): LlamaRotaryEmbedding()
	)
	(mlp): LlamaMLP(
	(gate_proj): Linear(in_features=2048, out_features=5632, bias=False)
	(up_proj): Linear(in_features=2048, out_features=5632, bias=False)
	(down_proj): Linear(in_features=5632, out_features=2048, bias=False)
	(act_fn): SiLU()
	)
	(input_layernorm): LlamaRMSNorm()
	(post_attention_layernorm): LlamaRMSNorm()
	)
	)
	(norm): LlamaRMSNorm()
	)
	(lm_head): Linear(in_features=2048, out_features=32000, bias=False)
	) file: matlok/tinyllama-cinder-openhermes-32k

	real 0m49.612s
	user 3m2.617s
	sys 0m14.655s
	```

	Note: code sample above was modified from [this very helpful GitHub gist](https://gist.github.com/maldevide/08829eada04ad9bd78e46c1a3787d42b)