"],"text/html":["\n"," \n"," \n","
\n"," [120/120 14:15, Epoch 20/20]\n","
\n"," \n"," \n"," \n"," Step | \n"," Training Loss | \n"," Validation Loss | \n","
\n"," \n"," \n"," \n"," 10 | \n"," 1.505600 | \n"," 1.263215 | \n","
\n"," \n"," 20 | \n"," 0.695100 | \n"," 0.531458 | \n","
\n"," \n"," 30 | \n"," 0.320900 | \n"," 0.222908 | \n","
\n"," \n"," 40 | \n"," 0.228900 | \n"," 0.211581 | \n","
\n"," \n"," 50 | \n"," 0.162800 | \n"," 0.165926 | \n","
\n"," \n"," 60 | \n"," 0.176000 | \n"," 0.142462 | \n","
\n"," \n"," 70 | \n"," 0.156200 | \n"," 0.139793 | \n","
\n"," \n"," 80 | \n"," 0.141100 | \n"," 0.131930 | \n","
\n"," \n"," 90 | \n"," 0.125100 | \n"," 0.126040 | \n","
\n"," \n"," 100 | \n"," 0.116700 | \n"," 0.124458 | \n","
\n"," \n"," 110 | \n"," 0.128800 | \n"," 0.122401 | \n","
\n"," \n"," 120 | \n"," 0.119300 | \n"," 0.120288 | \n","
\n"," \n","
"]},"metadata":{}}]},{"cell_type":"code","source":["# Run text generation pipeline with our model\n","prompt = \"What is a large language model?\"\n","instruction = f\"### Instruction:\\n{prompt}\\n\\n### Response:\\n\"\n","pipe = pipeline(task=\"text-generation\", model=model, tokenizer=tokenizer, max_length=128)\n","result = pipe(instruction)\n","\n","# Extract and print the generated text, removing the part that includes and follows the \"### Response:\\n\" placeholder\n","generated_text = result[0]['generated_text']\n","response_start = generated_text.find(\"### Response:\\n\") + len(\"### Response:\\n\")\n","response_end = generated_text.find(\"### Instruction:\", response_start)\n","print(generated_text[response_start:response_end if response_end != -1 else None].strip())"],"metadata":{"id":"frlSLPin4IJ4","colab":{"base_uri":"https://localhost:8080/"},"executionInfo":{"status":"ok","timestamp":1706610659938,"user_tz":-480,"elapsed":30068,"user":{"displayName":"szehanz","userId":"16137883221268059572"}},"outputId":"54e17fee-1deb-47dd-9ef2-17f1967428f4"},"execution_count":null,"outputs":[{"output_type":"stream","name":"stdout","text":["A large language model is a type of artificial intelligence model that is trained on a large dataset of text to generate language outputs that are coherent and natural-sounding. These models are designed to capture the complexity and diversity of language, and can be used for a variety of tasks such as language translation, text summarization, and language generation.\n"]}]},{"cell_type":"code","source":["# Empty VRAM\n","del model\n","del pipe\n","del trainer\n","import gc\n","gc.collect()\n","gc.collect()"],"metadata":{"colab":{"base_uri":"https://localhost:8080/"},"id":"mkQCviG0Zta-","executionInfo":{"status":"ok","timestamp":1706610669238,"user_tz":-480,"elapsed":869,"user":{"displayName":"szehanz","userId":"16137883221268059572"}},"outputId":"b8c107d8-1a31-4924-aba4-f7f81bc78f15"},"execution_count":null,"outputs":[{"output_type":"execute_result","data":{"text/plain":["0"]},"metadata":{},"execution_count":9}]},{"cell_type":"markdown","source":["Merging the base model with the trained adapter."],"metadata":{"id":"_g0fB7P9s0ol"}},{"cell_type":"code","source":["# Reload model in FP16 and merge it with LoRA weights\n","model = AutoModelForCausalLM.from_pretrained(\n"," base_model,\n"," low_cpu_mem_usage=True,\n"," return_dict=True,\n"," torch_dtype=torch.float16,\n"," device_map={\"\": 0},\n",")\n","model = PeftModel.from_pretrained(model, new_model)\n","model = model.merge_and_unload()\n","\n","\n","# Reload tokenizer to save it\n","tokenizer = AutoTokenizer.from_pretrained(base_model, trust_remote_code=True)\n","tokenizer.pad_token = tokenizer.eos_token\n","tokenizer.padding_side = 
\"right\""],"metadata":{"id":"QQn30cRtAZ-P","colab":{"base_uri":"https://localhost:8080/","height":49,"referenced_widgets":["30c33b49ebc042b98f31ef08acaa98c6","d93e9e1d63f241af8478a3b183f3747d","88d750727d3745e8a3dd53eef2b69e97","10054d164dd04d519271a82d3605e714","d1e9be0735fe4b6fa468cac57945b19e","5fcd05acc87b491fad3c4c4c4b3e1d76","e8bac80e2a5f4fa79a65d21ff2bc9bfd","1c1afeec5aed4fd999e82c72425b2819","df762a20aa304cb887a29d20a861700e","8dae3300b3e04e48a62a12296a3dd620","1bd79f581c7d4aa19b93d0f017a98a59"]},"executionInfo":{"status":"ok","timestamp":1706610740493,"user_tz":-480,"elapsed":65433,"user":{"displayName":"szehanz","userId":"16137883221268059572"}},"outputId":"9860fb19-61e3-4b82-c299-05e3ea9c41bf"},"execution_count":null,"outputs":[{"output_type":"display_data","data":{"text/plain":["Loading checkpoint shards: 0%| | 0/2 [00:00, ?it/s]"],"application/vnd.jupyter.widget-view+json":{"version_major":2,"version_minor":0,"model_id":"30c33b49ebc042b98f31ef08acaa98c6"}},"metadata":{}}]},{"cell_type":"markdown","source":["Push the model and tokenizer to the Hugging Face Hub."],"metadata":{"id":"n4_wCHy_s--5"}},{"cell_type":"code","source":["model.push_to_hub(new_model, use_temp_dir=False)\n","tokenizer.push_to_hub(new_model, use_temp_dir=False)"],"metadata":{"id":"x-xPb-_qB0dz","executionInfo":{"status":"ok","timestamp":1706611209908,"user_tz":-480,"elapsed":469418,"user":{"displayName":"szehanz","userId":"16137883221268059572"}},"colab":{"base_uri":"https://localhost:8080/","height":246,"referenced_widgets":["f3bfe6809ea841949bb0f33269034e02","51c450db060f4951bc8249ea782392d6","6ecd5671a20648dea4bedda6113d23ff","8d7c7bda8df345dd8c4aade9966965ab","0e1dbb1e4b4a4f5c917833a57ae30a7c","dfc0a9bb09a3451fb9e4d95b71c57cb8","48ffe0f9cb824cc0bf4c14d5e0bde442","942acd91ec1a4a3cabd9dadda9b7a351","d858fa7276bf4176b0e46f9d5bc91e20","7d3efa513dea4b8f98fbd074dcbfd9f3","1422fdeb60544da58446e8a2d242e445","d9fca217b6fc4f48bd0134e411a2f909","c53048378bd7436fb2181a5e0cc4ba61","bfa91360a9f64b14922e29d0f7b9f7f8","1feb9b56463f4a97a2d8d5fdf6b5cbef","e2dec9930fb8423188556f35e943de84","e549d7168dc145f9bf934a71916e080a","c50b57696f0047aa915fed04afec8e9a","f7b225cef3864c1fabdc4445faf3b466","e76ad4f8dbf14d59a5f3aadfab84b9dc","cd23dc7eeae34aa3afbea9c976fc64f0","e3ba393293b7488ea626a6b650420a92","6326507477cb4933999f525889aad2ec","d42b2763130f415f98b98713144ff757","18b653d5bbc24a69a3923cfd26995e5a","413a9764e03a436084f6d5a957c1dc52","572d68d016674e508d4258ba047f0529","dff83433f49445988e0fb0028bbdc8c4","55b77507e6564bcfa2d7dd0ef125faf8","1cc69198c30b427aacc5c5ad42a5d1ac","648542e8a44245839e32fb997f9a1941","01c0515f6f0f454183b69e6e8da0a9db","91f465f87b6c4871be25ce5fb189ebba","7c55b758c41b419ead52bc8f7cbbbc57","fe0da59a6c074478a080c547c4f93b32","6a8706e3dd234a3f999e4984de8363ed","57736888a34346f780d376cfcb46ce75","c5af9dc16a324db6a98cc53c82e9e161","64f44d7e27874c9e963fd7a2758434af","f1bd8ba112be42ee81a3d6ad96a0f382","42bd12f0797c45b281861e38d5b708b4","665b236679b84ee6a66d6b23dd807073","8c73ea8a3fcd4a938f25bc1354db7818","316771e899204e839390d5b0727b0108","806bff9547164ae08291778f983b9790","6811f1e76e974aa58034852c20fdd613","e15f1f22121e4d11b4bf1681c3d7e84c","6fa7416ef4224e4e8ec474fd6fb45cf4","8544d67df06a45e18f05fccedb693544","44963c9e7a5f46f283e84e432bfa941c","7b27ccff92fa48ef88001c6a2c7c52bc","09381f4ffe0447368ab20185c87edd21","93b84c8ebd5c48159545af85314789f7","23d0643de9b94c9cb4b7bba21611008a","3d04c36ebc184f5a8c3244b5a543eaaf"]},"outputId":"e2e4498f-4632-4c7a-aba6-68bf7f9ad508"},"execution_count":null,"outputs":[{"output_
type":"display_data","data":{"text/plain":["model-00002-of-00003.safetensors: 0%| | 0.00/4.95G [00:00, ?B/s]"],"application/vnd.jupyter.widget-view+json":{"version_major":2,"version_minor":0,"model_id":"f3bfe6809ea841949bb0f33269034e02"}},"metadata":{}},{"output_type":"display_data","data":{"text/plain":["model-00001-of-00003.safetensors: 0%| | 0.00/4.94G [00:00, ?B/s]"],"application/vnd.jupyter.widget-view+json":{"version_major":2,"version_minor":0,"model_id":"d9fca217b6fc4f48bd0134e411a2f909"}},"metadata":{}},{"output_type":"display_data","data":{"text/plain":["model-00003-of-00003.safetensors: 0%| | 0.00/3.59G [00:00, ?B/s]"],"application/vnd.jupyter.widget-view+json":{"version_major":2,"version_minor":0,"model_id":"6326507477cb4933999f525889aad2ec"}},"metadata":{}},{"output_type":"display_data","data":{"text/plain":["Upload 3 LFS files: 0%| | 0/3 [00:00, ?it/s]"],"application/vnd.jupyter.widget-view+json":{"version_major":2,"version_minor":0,"model_id":"7c55b758c41b419ead52bc8f7cbbbc57"}},"metadata":{}},{"output_type":"display_data","data":{"text/plain":["README.md: 0%| | 0.00/5.18k [00:00, ?B/s]"],"application/vnd.jupyter.widget-view+json":{"version_major":2,"version_minor":0,"model_id":"806bff9547164ae08291778f983b9790"}},"metadata":{}},{"output_type":"execute_result","data":{"text/plain":["CommitInfo(commit_url='https://huggingface.co/ssoh/llama-2-7b-mini-ibased/commit/08f1929770c0aeccf8c17e768bed6840c998797c', commit_message='Upload tokenizer', commit_description='', oid='08f1929770c0aeccf8c17e768bed6840c998797c', pr_url=None, pr_revision=None, pr_num=None)"],"application/vnd.google.colaboratory.intrinsic+json":{"type":"string"}},"metadata":{},"execution_count":11}]},{"cell_type":"markdown","source":["# Quantize Llama 2 models using GGUF and llama.cpp\n","\n","\n","## Usage\n","\n","* `MODEL_ID`: The ID of the model to quantize (e.g., `ssoh/llama-2-7b-mini-ibased`).\n","* `QUANTIZATION_METHOD`: The quantization method to use.\n","\n","## Quantization methods\n","\n","The names of the quantization methods follow the naming convention: \"q\" + the number of bits + the variant used.\n","\n","We will be using **Q5_K_M** as it preserves most of the model's performance."],"metadata":{"id":"8y_Rk94LzG7I"}},{"cell_type":"code","source":["# Authenticate with Hugging Face Hub to securely access models, datasets, and other resources.\n","from huggingface_hub import 
notebook_login\n","notebook_login()"],"metadata":{"id":"zbCYFOmU7ANP","colab":{"base_uri":"https://localhost:8080/","height":145,"referenced_widgets":["342b32e908ab4c9e8e636a64ec9202bf","c23cf3a7f3a040ff8a3b9e9b7ad348f0","c98949824321478d99a1475a89e325a7","76395d8bd36c45fea0aa3900c794894c","a2e02b2ceacb4d64a14be13b26a22cf9","f68106a0b50440a2a10263b232d33f07","087e5ddf62b04a6994a056d3e26c780c","7e7e3eb22e6b4b358db0fea63ca374e5","5b6c2db34c7e4400b72aa1499d69ee32","4cd53420ce864839a99efab27ec28ee8","9a38d22e47704a1b8e85742820d4bf64","4318e356c7b54d939a862cd0d1290f09","367526f84d7f4e69af6e730d86e6136a","a76cf64bc1fd460a998a2a7a07617cf1","4f37f7e9c0b545d6875901930e921299","50de8cfc6c084e028068fd217c15b171","3048c08df98340898f64ad9b00ebd4dd","63ee3af405ee4696be9348e33bf0f577","54a0ae84599f4dedab0fa80f395f96ec","45c774bdb3e7456587f0aa7d1e9b8ed7","e2f2dfc61e2e421ab1aab56048d3f330","3f5fc2fd155c4cc89928b61c3896d522","fcd98b79ddfd4a32992cc005a26d07b8","cd6663dd27864040a1ebe16e2e2ba0f1","44e943245f81431d8388d5f79df6cdbb","28ef998c70a046f4b3fbf47e062f9210","eb205b40f40c43aea347a94c8a5b24e8","c1995c4094054a0eac15cc3201878ad3","d57d8b600803484d8f9ea340a5eaa7b0","da4f2bd963864b6085625935aa77857a","d1a1da3f19ff4e25b249ba50b3ee4535","2b7e57a54a55463cb3ac91859262383e"]},"executionInfo":{"status":"ok","timestamp":1706680450212,"user_tz":-480,"elapsed":557,"user":{"displayName":"szehanz","userId":"16137883221268059572"}},"outputId":"384a0237-cf8f-46d5-f1e2-8098ccc0e386"},"execution_count":null,"outputs":[{"output_type":"display_data","data":{"text/plain":["VBox(children=(HTML(value='
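The next cell is truncated in this excerpt; its logs (condensed below) show it installing llama.cpp's conversion requirements and cloning the fine-tuned repository over Git LFS. A plausible reconstruction, stated as an assumption rather than the notebook's exact code:

```python
# Hypothetical reconstruction of the truncated setup cell, inferred from
# the install/clone logs that follow -- not the original source.
!git clone https://github.com/ggerganov/llama.cpp
!cd llama.cpp && make
!pip install -r llama.cpp/requirements.txt
!git lfs install
!git clone https://huggingface.co/ssoh/llama-2-7b-mini-ibased
```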
Output (abridged):

```
Successfully installed gguf-0.6.0 numpy-1.24.4 ... protobuf-4.25.2 torch-2.1.2
Git LFS initialized.
Cloning into 'llama-2-7b-mini-ibased'...
Filtering content: 100% (3/3), 4.55 GiB | 16.16 MiB/s, done.
```

pip pulls in `gguf`, `protobuf` 4.25.2, and `torch` 2.1.2 with its CUDA 12.1 wheels, downgrading Colab's preinstalled `torch` 2.1.0 and `numpy`; the resulting version-conflict warnings from `tensorboard`, `torchaudio`, `torchvision`, and friends do not block the conversion.

llama.cpp's `convert.py` reads the vocabulary from a SentencePiece `tokenizer.model` file, so fetch that file from the base chat model and place it in the checkpoint directory:

```python
import os
import shutil

# Specify the model ID from which to load the tokenizer
model_id = "meta-llama/Llama-2-7b-chat-hf"

# Load the tokenizer associated with the specified model ID
tokenizer = AutoTokenizer.from_pretrained(model_id)

# Save all downloaded tokenizer files to a temporary directory
temp_save_directory = "temp_tokenizer_files"
tokenizer.save_pretrained(temp_save_directory)

# Directory where the tokenizer.model file will be kept permanently
MODEL_NAME = "llama-2-7b-mini-ibased"
save_directory = MODEL_NAME

# Create the save directory if it does not exist
os.makedirs(save_directory, exist_ok=True)

# Define the specific filename of the tokenizer we want to retain
tokenizer_filename = "tokenizer.model"

# Locate tokenizer.model in the temporary directory
source_file = os.path.join(temp_save_directory, tokenizer_filename)
destination_file = os.path.join(save_directory, tokenizer_filename)

# Copy the tokenizer.model file to the final directory, if it exists
if os.path.exists(source_file):
    shutil.copy(source_file, destination_file)
    print(f"tokenizer.model has been saved in {save_directory}")
else:
    print("No tokenizer.model file found in the downloaded tokenizer files.")

# Remove the temporary directory to clean up unnecessary files
shutil.rmtree(temp_save_directory)
```
?B/s]"],"application/vnd.jupyter.widget-view+json":{"version_major":2,"version_minor":0,"model_id":"a29b152cee3048dda48af4bcc1410d15"}},"metadata":{}},{"output_type":"display_data","data":{"text/plain":["tokenizer.json: 0%| | 0.00/1.84M [00:00, ?B/s]"],"application/vnd.jupyter.widget-view+json":{"version_major":2,"version_minor":0,"model_id":"959c6762670c41a59fb73c4ab55938d8"}},"metadata":{}},{"output_type":"display_data","data":{"text/plain":["special_tokens_map.json: 0%| | 0.00/414 [00:00, ?B/s]"],"application/vnd.jupyter.widget-view+json":{"version_major":2,"version_minor":0,"model_id":"3698a1ad68984959a0163d36e17143f8"}},"metadata":{}},{"output_type":"stream","name":"stdout","text":["tokenizer.model has been saved in llama-2-7b-mini-ibased\n"]}]},{"cell_type":"code","source":["# Convert to fp16\n","fp16 = f\"{MODEL_NAME}/{MODEL_NAME.lower()}.fp16.bin\"\n","!python llama.cpp/convert.py {MODEL_NAME} --outtype f16 --outfile {fp16}"],"metadata":{"id":"fD24jJxq7t3k","colab":{"base_uri":"https://localhost:8080/"},"executionInfo":{"status":"ok","timestamp":1706681281632,"user_tz":-480,"elapsed":157048,"user":{"displayName":"szehanz","userId":"16137883221268059572"}},"outputId":"9f6dea9e-5c3c-49c5-a831-9af204dd44ad"},"execution_count":null,"outputs":[{"output_type":"stream","name":"stdout","text":["Loading model file llama-2-7b-mini-ibased/model-00001-of-00003.safetensors\n","Loading model file llama-2-7b-mini-ibased/model-00001-of-00003.safetensors\n","Loading model file llama-2-7b-mini-ibased/model-00002-of-00003.safetensors\n","Loading model file llama-2-7b-mini-ibased/model-00003-of-00003.safetensors\n","params = Params(n_vocab=32000, n_embd=4096, n_layer=32, n_ctx=4096, n_ff=11008, n_head=32, n_head_kv=32, n_experts=None, n_experts_used=None, f_norm_eps=1e-05, rope_scaling_type=None, f_rope_freq_base=10000.0, f_rope_scale=None, n_orig_ctx=None, rope_finetuned=None, ftype=, path_model=PosixPath('llama-2-7b-mini-ibased'))\n","Found vocab files: {'tokenizer.model': PosixPath('llama-2-7b-mini-ibased/tokenizer.model'), 'vocab.json': None, 'tokenizer.json': PosixPath('llama-2-7b-mini-ibased/tokenizer.json')}\n","Loading vocab file 'llama-2-7b-mini-ibased/tokenizer.model', type 'spm'\n","Vocab info: \n","Special vocab info: \n","Permuting layer 0\n","Permuting layer 1\n","Permuting layer 2\n","Permuting layer 3\n","Permuting layer 4\n","Permuting layer 5\n","Permuting layer 6\n","Permuting layer 7\n","Permuting layer 8\n","Permuting layer 9\n","Permuting layer 10\n","Permuting layer 11\n","Permuting layer 12\n","Permuting layer 13\n","Permuting layer 14\n","Permuting layer 15\n","Permuting layer 16\n","Permuting layer 17\n","Permuting layer 18\n","Permuting layer 19\n","Permuting layer 20\n","Permuting layer 21\n","Permuting layer 22\n","Permuting layer 23\n","Permuting layer 24\n","Permuting layer 25\n","Permuting layer 26\n","Permuting layer 27\n","Permuting layer 28\n","Permuting layer 29\n","Permuting layer 30\n","Permuting layer 31\n","model.embed_tokens.weight -> token_embd.weight | F16 | [32000, 4096]\n","model.layers.0.input_layernorm.weight -> blk.0.attn_norm.weight | F16 | [4096]\n","model.layers.0.mlp.down_proj.weight -> blk.0.ffn_down.weight | F16 | [4096, 11008]\n","model.layers.0.mlp.gate_proj.weight -> blk.0.ffn_gate.weight | F16 | [11008, 4096]\n","model.layers.0.mlp.up_proj.weight -> blk.0.ffn_up.weight | F16 | [11008, 4096]\n","model.layers.0.post_attention_layernorm.weight -> blk.0.ffn_norm.weight | F16 | [4096]\n","model.layers.0.self_attn.k_proj.weight -> blk.0.attn_k.weight 
| F16 | [4096, 4096]\n","model.layers.0.self_attn.o_proj.weight -> blk.0.attn_output.weight | F16 | [4096, 4096]\n","model.layers.0.self_attn.q_proj.weight -> blk.0.attn_q.weight | F16 | [4096, 4096]\n","model.layers.0.self_attn.v_proj.weight -> blk.0.attn_v.weight | F16 | [4096, 4096]\n","model.layers.1.input_layernorm.weight -> blk.1.attn_norm.weight | F16 | [4096]\n","model.layers.1.mlp.down_proj.weight -> blk.1.ffn_down.weight | F16 | [4096, 11008]\n","model.layers.1.mlp.gate_proj.weight -> blk.1.ffn_gate.weight | F16 | [11008, 4096]\n","model.layers.1.mlp.up_proj.weight -> blk.1.ffn_up.weight | F16 | [11008, 4096]\n","model.layers.1.post_attention_layernorm.weight -> blk.1.ffn_norm.weight | F16 | [4096]\n","model.layers.1.self_attn.k_proj.weight -> blk.1.attn_k.weight | F16 | [4096, 4096]\n","model.layers.1.self_attn.o_proj.weight -> blk.1.attn_output.weight | F16 | [4096, 4096]\n","model.layers.1.self_attn.q_proj.weight -> blk.1.attn_q.weight | F16 | [4096, 4096]\n","model.layers.1.self_attn.v_proj.weight -> blk.1.attn_v.weight | F16 | [4096, 4096]\n","model.layers.10.input_layernorm.weight -> blk.10.attn_norm.weight | F16 | [4096]\n","model.layers.10.mlp.down_proj.weight -> blk.10.ffn_down.weight | F16 | [4096, 11008]\n","model.layers.10.mlp.gate_proj.weight -> blk.10.ffn_gate.weight | F16 | [11008, 4096]\n","model.layers.10.mlp.up_proj.weight -> blk.10.ffn_up.weight | F16 | [11008, 4096]\n","model.layers.10.post_attention_layernorm.weight -> blk.10.ffn_norm.weight | F16 | [4096]\n","model.layers.10.self_attn.k_proj.weight -> blk.10.attn_k.weight | F16 | [4096, 4096]\n","model.layers.10.self_attn.o_proj.weight -> blk.10.attn_output.weight | F16 | [4096, 4096]\n","model.layers.10.self_attn.q_proj.weight -> blk.10.attn_q.weight | F16 | [4096, 4096]\n","model.layers.10.self_attn.v_proj.weight -> blk.10.attn_v.weight | F16 | [4096, 4096]\n","model.layers.11.mlp.gate_proj.weight -> blk.11.ffn_gate.weight | F16 | [11008, 4096]\n","model.layers.11.self_attn.k_proj.weight -> blk.11.attn_k.weight | F16 | [4096, 4096]\n","model.layers.11.self_attn.o_proj.weight -> blk.11.attn_output.weight | F16 | [4096, 4096]\n","model.layers.11.self_attn.q_proj.weight -> blk.11.attn_q.weight | F16 | [4096, 4096]\n","model.layers.11.self_attn.v_proj.weight -> blk.11.attn_v.weight | F16 | [4096, 4096]\n","model.layers.2.input_layernorm.weight -> blk.2.attn_norm.weight | F16 | [4096]\n","model.layers.2.mlp.down_proj.weight -> blk.2.ffn_down.weight | F16 | [4096, 11008]\n","model.layers.2.mlp.gate_proj.weight -> blk.2.ffn_gate.weight | F16 | [11008, 4096]\n","model.layers.2.mlp.up_proj.weight -> blk.2.ffn_up.weight | F16 | [11008, 4096]\n","model.layers.2.post_attention_layernorm.weight -> blk.2.ffn_norm.weight | F16 | [4096]\n","model.layers.2.self_attn.k_proj.weight -> blk.2.attn_k.weight | F16 | [4096, 4096]\n","model.layers.2.self_attn.o_proj.weight -> blk.2.attn_output.weight | F16 | [4096, 4096]\n","model.layers.2.self_attn.q_proj.weight -> blk.2.attn_q.weight | F16 | [4096, 4096]\n","model.layers.2.self_attn.v_proj.weight -> blk.2.attn_v.weight | F16 | [4096, 4096]\n","model.layers.3.input_layernorm.weight -> blk.3.attn_norm.weight | F16 | [4096]\n","model.layers.3.mlp.down_proj.weight -> blk.3.ffn_down.weight | F16 | [4096, 11008]\n","model.layers.3.mlp.gate_proj.weight -> blk.3.ffn_gate.weight | F16 | [11008, 4096]\n","model.layers.3.mlp.up_proj.weight -> blk.3.ffn_up.weight | F16 | [11008, 4096]\n","model.layers.3.post_attention_layernorm.weight -> blk.3.ffn_norm.weight | F16 | 
[4096]\n","model.layers.3.self_attn.k_proj.weight -> blk.3.attn_k.weight | F16 | [4096, 4096]\n","model.layers.3.self_attn.o_proj.weight -> blk.3.attn_output.weight | F16 | [4096, 4096]\n","model.layers.3.self_attn.q_proj.weight -> blk.3.attn_q.weight | F16 | [4096, 4096]\n","model.layers.3.self_attn.v_proj.weight -> blk.3.attn_v.weight | F16 | [4096, 4096]\n","model.layers.4.input_layernorm.weight -> blk.4.attn_norm.weight | F16 | [4096]\n","model.layers.4.mlp.down_proj.weight -> blk.4.ffn_down.weight | F16 | [4096, 11008]\n","model.layers.4.mlp.gate_proj.weight -> blk.4.ffn_gate.weight | F16 | [11008, 4096]\n","model.layers.4.mlp.up_proj.weight -> blk.4.ffn_up.weight | F16 | [11008, 4096]\n","model.layers.4.post_attention_layernorm.weight -> blk.4.ffn_norm.weight | F16 | [4096]\n","model.layers.4.self_attn.k_proj.weight -> blk.4.attn_k.weight | F16 | [4096, 4096]\n","model.layers.4.self_attn.o_proj.weight -> blk.4.attn_output.weight | F16 | [4096, 4096]\n","model.layers.4.self_attn.q_proj.weight -> blk.4.attn_q.weight | F16 | [4096, 4096]\n","model.layers.4.self_attn.v_proj.weight -> blk.4.attn_v.weight | F16 | [4096, 4096]\n","model.layers.5.input_layernorm.weight -> blk.5.attn_norm.weight | F16 | [4096]\n","model.layers.5.mlp.down_proj.weight -> blk.5.ffn_down.weight | F16 | [4096, 11008]\n","model.layers.5.mlp.gate_proj.weight -> blk.5.ffn_gate.weight | F16 | [11008, 4096]\n","model.layers.5.mlp.up_proj.weight -> blk.5.ffn_up.weight | F16 | [11008, 4096]\n","model.layers.5.post_attention_layernorm.weight -> blk.5.ffn_norm.weight | F16 | [4096]\n","model.layers.5.self_attn.k_proj.weight -> blk.5.attn_k.weight | F16 | [4096, 4096]\n","model.layers.5.self_attn.o_proj.weight -> blk.5.attn_output.weight | F16 | [4096, 4096]\n","model.layers.5.self_attn.q_proj.weight -> blk.5.attn_q.weight | F16 | [4096, 4096]\n","model.layers.5.self_attn.v_proj.weight -> blk.5.attn_v.weight | F16 | [4096, 4096]\n","model.layers.6.input_layernorm.weight -> blk.6.attn_norm.weight | F16 | [4096]\n","model.layers.6.mlp.down_proj.weight -> blk.6.ffn_down.weight | F16 | [4096, 11008]\n","model.layers.6.mlp.gate_proj.weight -> blk.6.ffn_gate.weight | F16 | [11008, 4096]\n","model.layers.6.mlp.up_proj.weight -> blk.6.ffn_up.weight | F16 | [11008, 4096]\n","model.layers.6.post_attention_layernorm.weight -> blk.6.ffn_norm.weight | F16 | [4096]\n","model.layers.6.self_attn.k_proj.weight -> blk.6.attn_k.weight | F16 | [4096, 4096]\n","model.layers.6.self_attn.o_proj.weight -> blk.6.attn_output.weight | F16 | [4096, 4096]\n","model.layers.6.self_attn.q_proj.weight -> blk.6.attn_q.weight | F16 | [4096, 4096]\n","model.layers.6.self_attn.v_proj.weight -> blk.6.attn_v.weight | F16 | [4096, 4096]\n","model.layers.7.input_layernorm.weight -> blk.7.attn_norm.weight | F16 | [4096]\n","model.layers.7.mlp.down_proj.weight -> blk.7.ffn_down.weight | F16 | [4096, 11008]\n","model.layers.7.mlp.gate_proj.weight -> blk.7.ffn_gate.weight | F16 | [11008, 4096]\n","model.layers.7.mlp.up_proj.weight -> blk.7.ffn_up.weight | F16 | [11008, 4096]\n","model.layers.7.post_attention_layernorm.weight -> blk.7.ffn_norm.weight | F16 | [4096]\n","model.layers.7.self_attn.k_proj.weight -> blk.7.attn_k.weight | F16 | [4096, 4096]\n","model.layers.7.self_attn.o_proj.weight -> blk.7.attn_output.weight | F16 | [4096, 4096]\n","model.layers.7.self_attn.q_proj.weight -> blk.7.attn_q.weight | F16 | [4096, 4096]\n","model.layers.7.self_attn.v_proj.weight -> blk.7.attn_v.weight | F16 | [4096, 4096]\n","model.layers.8.input_layernorm.weight -> 
blk.8.attn_norm.weight | F16 | [4096]\n","model.layers.8.mlp.down_proj.weight -> blk.8.ffn_down.weight | F16 | [4096, 11008]\n","model.layers.8.mlp.gate_proj.weight -> blk.8.ffn_gate.weight | F16 | [11008, 4096]\n","model.layers.8.mlp.up_proj.weight -> blk.8.ffn_up.weight | F16 | [11008, 4096]\n","model.layers.8.post_attention_layernorm.weight -> blk.8.ffn_norm.weight | F16 | [4096]\n","model.layers.8.self_attn.k_proj.weight -> blk.8.attn_k.weight | F16 | [4096, 4096]\n","model.layers.8.self_attn.o_proj.weight -> blk.8.attn_output.weight | F16 | [4096, 4096]\n","model.layers.8.self_attn.q_proj.weight -> blk.8.attn_q.weight | F16 | [4096, 4096]\n","model.layers.8.self_attn.v_proj.weight -> blk.8.attn_v.weight | F16 | [4096, 4096]\n","model.layers.9.input_layernorm.weight -> blk.9.attn_norm.weight | F16 | [4096]\n","model.layers.9.mlp.down_proj.weight -> blk.9.ffn_down.weight | F16 | [4096, 11008]\n","model.layers.9.mlp.gate_proj.weight -> blk.9.ffn_gate.weight | F16 | [11008, 4096]\n","model.layers.9.mlp.up_proj.weight -> blk.9.ffn_up.weight | F16 | [11008, 4096]\n","model.layers.9.post_attention_layernorm.weight -> blk.9.ffn_norm.weight | F16 | [4096]\n","model.layers.9.self_attn.k_proj.weight -> blk.9.attn_k.weight | F16 | [4096, 4096]\n","model.layers.9.self_attn.o_proj.weight -> blk.9.attn_output.weight | F16 | [4096, 4096]\n","model.layers.9.self_attn.q_proj.weight -> blk.9.attn_q.weight | F16 | [4096, 4096]\n","model.layers.9.self_attn.v_proj.weight -> blk.9.attn_v.weight | F16 | [4096, 4096]\n","model.layers.11.input_layernorm.weight -> blk.11.attn_norm.weight | F16 | [4096]\n","model.layers.11.mlp.down_proj.weight -> blk.11.ffn_down.weight | F16 | [4096, 11008]\n","model.layers.11.mlp.up_proj.weight -> blk.11.ffn_up.weight | F16 | [11008, 4096]\n","model.layers.11.post_attention_layernorm.weight -> blk.11.ffn_norm.weight | F16 | [4096]\n","model.layers.12.input_layernorm.weight -> blk.12.attn_norm.weight | F16 | [4096]\n","model.layers.12.mlp.down_proj.weight -> blk.12.ffn_down.weight | F16 | [4096, 11008]\n","model.layers.12.mlp.gate_proj.weight -> blk.12.ffn_gate.weight | F16 | [11008, 4096]\n","model.layers.12.mlp.up_proj.weight -> blk.12.ffn_up.weight | F16 | [11008, 4096]\n","model.layers.12.post_attention_layernorm.weight -> blk.12.ffn_norm.weight | F16 | [4096]\n","model.layers.12.self_attn.k_proj.weight -> blk.12.attn_k.weight | F16 | [4096, 4096]\n","model.layers.12.self_attn.o_proj.weight -> blk.12.attn_output.weight | F16 | [4096, 4096]\n","model.layers.12.self_attn.q_proj.weight -> blk.12.attn_q.weight | F16 | [4096, 4096]\n","model.layers.12.self_attn.v_proj.weight -> blk.12.attn_v.weight | F16 | [4096, 4096]\n","model.layers.13.input_layernorm.weight -> blk.13.attn_norm.weight | F16 | [4096]\n","model.layers.13.mlp.down_proj.weight -> blk.13.ffn_down.weight | F16 | [4096, 11008]\n","model.layers.13.mlp.gate_proj.weight -> blk.13.ffn_gate.weight | F16 | [11008, 4096]\n","model.layers.13.mlp.up_proj.weight -> blk.13.ffn_up.weight | F16 | [11008, 4096]\n","model.layers.13.post_attention_layernorm.weight -> blk.13.ffn_norm.weight | F16 | [4096]\n","model.layers.13.self_attn.k_proj.weight -> blk.13.attn_k.weight | F16 | [4096, 4096]\n","model.layers.13.self_attn.o_proj.weight -> blk.13.attn_output.weight | F16 | [4096, 4096]\n","model.layers.13.self_attn.q_proj.weight -> blk.13.attn_q.weight | F16 | [4096, 4096]\n","model.layers.13.self_attn.v_proj.weight -> blk.13.attn_v.weight | F16 | [4096, 4096]\n","model.layers.14.input_layernorm.weight -> blk.14.attn_norm.weight | F16 
| [4096]\n","model.layers.14.mlp.down_proj.weight -> blk.14.ffn_down.weight | F16 | [4096, 11008]\n","model.layers.14.mlp.gate_proj.weight -> blk.14.ffn_gate.weight | F16 | [11008, 4096]\n","model.layers.14.mlp.up_proj.weight -> blk.14.ffn_up.weight | F16 | [11008, 4096]\n","model.layers.14.post_attention_layernorm.weight -> blk.14.ffn_norm.weight | F16 | [4096]\n","model.layers.14.self_attn.k_proj.weight -> blk.14.attn_k.weight | F16 | [4096, 4096]\n","model.layers.14.self_attn.o_proj.weight -> blk.14.attn_output.weight | F16 | [4096, 4096]\n","model.layers.14.self_attn.q_proj.weight -> blk.14.attn_q.weight | F16 | [4096, 4096]\n","model.layers.14.self_attn.v_proj.weight -> blk.14.attn_v.weight | F16 | [4096, 4096]\n","model.layers.15.input_layernorm.weight -> blk.15.attn_norm.weight | F16 | [4096]\n","model.layers.15.mlp.down_proj.weight -> blk.15.ffn_down.weight | F16 | [4096, 11008]\n","model.layers.15.mlp.gate_proj.weight -> blk.15.ffn_gate.weight | F16 | [11008, 4096]\n","model.layers.15.mlp.up_proj.weight -> blk.15.ffn_up.weight | F16 | [11008, 4096]\n","model.layers.15.post_attention_layernorm.weight -> blk.15.ffn_norm.weight | F16 | [4096]\n","model.layers.15.self_attn.k_proj.weight -> blk.15.attn_k.weight | F16 | [4096, 4096]\n","model.layers.15.self_attn.o_proj.weight -> blk.15.attn_output.weight | F16 | [4096, 4096]\n","model.layers.15.self_attn.q_proj.weight -> blk.15.attn_q.weight | F16 | [4096, 4096]\n","model.layers.15.self_attn.v_proj.weight -> blk.15.attn_v.weight | F16 | [4096, 4096]\n","model.layers.16.input_layernorm.weight -> blk.16.attn_norm.weight | F16 | [4096]\n","model.layers.16.mlp.down_proj.weight -> blk.16.ffn_down.weight | F16 | [4096, 11008]\n","model.layers.16.mlp.gate_proj.weight -> blk.16.ffn_gate.weight | F16 | [11008, 4096]\n","model.layers.16.mlp.up_proj.weight -> blk.16.ffn_up.weight | F16 | [11008, 4096]\n","model.layers.16.post_attention_layernorm.weight -> blk.16.ffn_norm.weight | F16 | [4096]\n","model.layers.16.self_attn.k_proj.weight -> blk.16.attn_k.weight | F16 | [4096, 4096]\n","model.layers.16.self_attn.o_proj.weight -> blk.16.attn_output.weight | F16 | [4096, 4096]\n","model.layers.16.self_attn.q_proj.weight -> blk.16.attn_q.weight | F16 | [4096, 4096]\n","model.layers.16.self_attn.v_proj.weight -> blk.16.attn_v.weight | F16 | [4096, 4096]\n","model.layers.17.input_layernorm.weight -> blk.17.attn_norm.weight | F16 | [4096]\n","model.layers.17.mlp.down_proj.weight -> blk.17.ffn_down.weight | F16 | [4096, 11008]\n","model.layers.17.mlp.gate_proj.weight -> blk.17.ffn_gate.weight | F16 | [11008, 4096]\n","model.layers.17.mlp.up_proj.weight -> blk.17.ffn_up.weight | F16 | [11008, 4096]\n","model.layers.17.post_attention_layernorm.weight -> blk.17.ffn_norm.weight | F16 | [4096]\n","model.layers.17.self_attn.k_proj.weight -> blk.17.attn_k.weight | F16 | [4096, 4096]\n","model.layers.17.self_attn.o_proj.weight -> blk.17.attn_output.weight | F16 | [4096, 4096]\n","model.layers.17.self_attn.q_proj.weight -> blk.17.attn_q.weight | F16 | [4096, 4096]\n","model.layers.17.self_attn.v_proj.weight -> blk.17.attn_v.weight | F16 | [4096, 4096]\n","model.layers.18.input_layernorm.weight -> blk.18.attn_norm.weight | F16 | [4096]\n","model.layers.18.mlp.down_proj.weight -> blk.18.ffn_down.weight | F16 | [4096, 11008]\n","model.layers.18.mlp.gate_proj.weight -> blk.18.ffn_gate.weight | F16 | [11008, 4096]\n","model.layers.18.mlp.up_proj.weight -> blk.18.ffn_up.weight | F16 | [11008, 4096]\n","model.layers.18.post_attention_layernorm.weight -> 
blk.18.ffn_norm.weight | F16 | [4096]\n","model.layers.18.self_attn.k_proj.weight -> blk.18.attn_k.weight | F16 | [4096, 4096]\n","model.layers.18.self_attn.o_proj.weight -> blk.18.attn_output.weight | F16 | [4096, 4096]\n","model.layers.18.self_attn.q_proj.weight -> blk.18.attn_q.weight | F16 | [4096, 4096]\n","model.layers.18.self_attn.v_proj.weight -> blk.18.attn_v.weight | F16 | [4096, 4096]\n","model.layers.19.input_layernorm.weight -> blk.19.attn_norm.weight | F16 | [4096]\n","model.layers.19.mlp.down_proj.weight -> blk.19.ffn_down.weight | F16 | [4096, 11008]\n","model.layers.19.mlp.gate_proj.weight -> blk.19.ffn_gate.weight | F16 | [11008, 4096]\n","model.layers.19.mlp.up_proj.weight -> blk.19.ffn_up.weight | F16 | [11008, 4096]\n","model.layers.19.post_attention_layernorm.weight -> blk.19.ffn_norm.weight | F16 | [4096]\n","model.layers.19.self_attn.k_proj.weight -> blk.19.attn_k.weight | F16 | [4096, 4096]\n","model.layers.19.self_attn.o_proj.weight -> blk.19.attn_output.weight | F16 | [4096, 4096]\n","model.layers.19.self_attn.q_proj.weight -> blk.19.attn_q.weight | F16 | [4096, 4096]\n","model.layers.19.self_attn.v_proj.weight -> blk.19.attn_v.weight | F16 | [4096, 4096]\n","model.layers.20.input_layernorm.weight -> blk.20.attn_norm.weight | F16 | [4096]\n","model.layers.20.mlp.down_proj.weight -> blk.20.ffn_down.weight | F16 | [4096, 11008]\n","model.layers.20.mlp.gate_proj.weight -> blk.20.ffn_gate.weight | F16 | [11008, 4096]\n","model.layers.20.mlp.up_proj.weight -> blk.20.ffn_up.weight | F16 | [11008, 4096]\n","model.layers.20.post_attention_layernorm.weight -> blk.20.ffn_norm.weight | F16 | [4096]\n","model.layers.20.self_attn.k_proj.weight -> blk.20.attn_k.weight | F16 | [4096, 4096]\n","model.layers.20.self_attn.o_proj.weight -> blk.20.attn_output.weight | F16 | [4096, 4096]\n","model.layers.20.self_attn.q_proj.weight -> blk.20.attn_q.weight | F16 | [4096, 4096]\n","model.layers.20.self_attn.v_proj.weight -> blk.20.attn_v.weight | F16 | [4096, 4096]\n","model.layers.21.input_layernorm.weight -> blk.21.attn_norm.weight | F16 | [4096]\n","model.layers.21.mlp.down_proj.weight -> blk.21.ffn_down.weight | F16 | [4096, 11008]\n","model.layers.21.mlp.gate_proj.weight -> blk.21.ffn_gate.weight | F16 | [11008, 4096]\n","model.layers.21.mlp.up_proj.weight -> blk.21.ffn_up.weight | F16 | [11008, 4096]\n","model.layers.21.post_attention_layernorm.weight -> blk.21.ffn_norm.weight | F16 | [4096]\n","model.layers.21.self_attn.k_proj.weight -> blk.21.attn_k.weight | F16 | [4096, 4096]\n","model.layers.21.self_attn.o_proj.weight -> blk.21.attn_output.weight | F16 | [4096, 4096]\n","model.layers.21.self_attn.q_proj.weight -> blk.21.attn_q.weight | F16 | [4096, 4096]\n","model.layers.21.self_attn.v_proj.weight -> blk.21.attn_v.weight | F16 | [4096, 4096]\n","model.layers.22.input_layernorm.weight -> blk.22.attn_norm.weight | F16 | [4096]\n","model.layers.22.mlp.down_proj.weight -> blk.22.ffn_down.weight | F16 | [4096, 11008]\n","model.layers.22.mlp.gate_proj.weight -> blk.22.ffn_gate.weight | F16 | [11008, 4096]\n","model.layers.22.mlp.up_proj.weight -> blk.22.ffn_up.weight | F16 | [11008, 4096]\n","model.layers.22.post_attention_layernorm.weight -> blk.22.ffn_norm.weight | F16 | [4096]\n","model.layers.22.self_attn.k_proj.weight -> blk.22.attn_k.weight | F16 | [4096, 4096]\n","model.layers.22.self_attn.o_proj.weight -> blk.22.attn_output.weight | F16 | [4096, 4096]\n","model.layers.22.self_attn.q_proj.weight -> blk.22.attn_q.weight | F16 | [4096, 
4096]\n","model.layers.22.self_attn.v_proj.weight -> blk.22.attn_v.weight | F16 | [4096, 4096]\n","model.layers.23.mlp.gate_proj.weight -> blk.23.ffn_gate.weight | F16 | [11008, 4096]\n","model.layers.23.mlp.up_proj.weight -> blk.23.ffn_up.weight | F16 | [11008, 4096]\n","model.layers.23.self_attn.k_proj.weight -> blk.23.attn_k.weight | F16 | [4096, 4096]\n","model.layers.23.self_attn.o_proj.weight -> blk.23.attn_output.weight | F16 | [4096, 4096]\n","model.layers.23.self_attn.q_proj.weight -> blk.23.attn_q.weight | F16 | [4096, 4096]\n","model.layers.23.self_attn.v_proj.weight -> blk.23.attn_v.weight | F16 | [4096, 4096]\n","lm_head.weight -> output.weight | F16 | [32000, 4096]\n","model.layers.23.input_layernorm.weight -> blk.23.attn_norm.weight | F16 | [4096]\n","model.layers.23.mlp.down_proj.weight -> blk.23.ffn_down.weight | F16 | [4096, 11008]\n","model.layers.23.post_attention_layernorm.weight -> blk.23.ffn_norm.weight | F16 | [4096]\n","model.layers.24.input_layernorm.weight -> blk.24.attn_norm.weight | F16 | [4096]\n","model.layers.24.mlp.down_proj.weight -> blk.24.ffn_down.weight | F16 | [4096, 11008]\n","model.layers.24.mlp.gate_proj.weight -> blk.24.ffn_gate.weight | F16 | [11008, 4096]\n","model.layers.24.mlp.up_proj.weight -> blk.24.ffn_up.weight | F16 | [11008, 4096]\n","model.layers.24.post_attention_layernorm.weight -> blk.24.ffn_norm.weight | F16 | [4096]\n","model.layers.24.self_attn.k_proj.weight -> blk.24.attn_k.weight | F16 | [4096, 4096]\n","model.layers.24.self_attn.o_proj.weight -> blk.24.attn_output.weight | F16 | [4096, 4096]\n","model.layers.24.self_attn.q_proj.weight -> blk.24.attn_q.weight | F16 | [4096, 4096]\n","model.layers.24.self_attn.v_proj.weight -> blk.24.attn_v.weight | F16 | [4096, 4096]\n","model.layers.25.input_layernorm.weight -> blk.25.attn_norm.weight | F16 | [4096]\n","model.layers.25.mlp.down_proj.weight -> blk.25.ffn_down.weight | F16 | [4096, 11008]\n","model.layers.25.mlp.gate_proj.weight -> blk.25.ffn_gate.weight | F16 | [11008, 4096]\n","model.layers.25.mlp.up_proj.weight -> blk.25.ffn_up.weight | F16 | [11008, 4096]\n","model.layers.25.post_attention_layernorm.weight -> blk.25.ffn_norm.weight | F16 | [4096]\n","model.layers.25.self_attn.k_proj.weight -> blk.25.attn_k.weight | F16 | [4096, 4096]\n","model.layers.25.self_attn.o_proj.weight -> blk.25.attn_output.weight | F16 | [4096, 4096]\n","model.layers.25.self_attn.q_proj.weight -> blk.25.attn_q.weight | F16 | [4096, 4096]\n","model.layers.25.self_attn.v_proj.weight -> blk.25.attn_v.weight | F16 | [4096, 4096]\n","model.layers.26.input_layernorm.weight -> blk.26.attn_norm.weight | F16 | [4096]\n","model.layers.26.mlp.down_proj.weight -> blk.26.ffn_down.weight | F16 | [4096, 11008]\n","model.layers.26.mlp.gate_proj.weight -> blk.26.ffn_gate.weight | F16 | [11008, 4096]\n","model.layers.26.mlp.up_proj.weight -> blk.26.ffn_up.weight | F16 | [11008, 4096]\n","model.layers.26.post_attention_layernorm.weight -> blk.26.ffn_norm.weight | F16 | [4096]\n","model.layers.26.self_attn.k_proj.weight -> blk.26.attn_k.weight | F16 | [4096, 4096]\n","model.layers.26.self_attn.o_proj.weight -> blk.26.attn_output.weight | F16 | [4096, 4096]\n","model.layers.26.self_attn.q_proj.weight -> blk.26.attn_q.weight | F16 | [4096, 4096]\n","model.layers.26.self_attn.v_proj.weight -> blk.26.attn_v.weight | F16 | [4096, 4096]\n","model.layers.27.input_layernorm.weight -> blk.27.attn_norm.weight | F16 | [4096]\n","model.layers.27.mlp.down_proj.weight -> blk.27.ffn_down.weight | F16 | [4096, 
11008]\n","model.layers.27.mlp.gate_proj.weight -> blk.27.ffn_gate.weight | F16 | [11008, 4096]\n","model.layers.27.mlp.up_proj.weight -> blk.27.ffn_up.weight | F16 | [11008, 4096]\n","model.layers.27.post_attention_layernorm.weight -> blk.27.ffn_norm.weight | F16 | [4096]\n","model.layers.27.self_attn.k_proj.weight -> blk.27.attn_k.weight | F16 | [4096, 4096]\n","model.layers.27.self_attn.o_proj.weight -> blk.27.attn_output.weight | F16 | [4096, 4096]\n","model.layers.27.self_attn.q_proj.weight -> blk.27.attn_q.weight | F16 | [4096, 4096]\n","model.layers.27.self_attn.v_proj.weight -> blk.27.attn_v.weight | F16 | [4096, 4096]\n","model.layers.28.input_layernorm.weight -> blk.28.attn_norm.weight | F16 | [4096]\n","model.layers.28.mlp.down_proj.weight -> blk.28.ffn_down.weight | F16 | [4096, 11008]\n","model.layers.28.mlp.gate_proj.weight -> blk.28.ffn_gate.weight | F16 | [11008, 4096]\n","model.layers.28.mlp.up_proj.weight -> blk.28.ffn_up.weight | F16 | [11008, 4096]\n","model.layers.28.post_attention_layernorm.weight -> blk.28.ffn_norm.weight | F16 | [4096]\n","model.layers.28.self_attn.k_proj.weight -> blk.28.attn_k.weight | F16 | [4096, 4096]\n","model.layers.28.self_attn.o_proj.weight -> blk.28.attn_output.weight | F16 | [4096, 4096]\n","model.layers.28.self_attn.q_proj.weight -> blk.28.attn_q.weight | F16 | [4096, 4096]\n","model.layers.28.self_attn.v_proj.weight -> blk.28.attn_v.weight | F16 | [4096, 4096]\n","model.layers.29.input_layernorm.weight -> blk.29.attn_norm.weight | F16 | [4096]\n","model.layers.29.mlp.down_proj.weight -> blk.29.ffn_down.weight | F16 | [4096, 11008]\n","model.layers.29.mlp.gate_proj.weight -> blk.29.ffn_gate.weight | F16 | [11008, 4096]\n","model.layers.29.mlp.up_proj.weight -> blk.29.ffn_up.weight | F16 | [11008, 4096]\n","model.layers.29.post_attention_layernorm.weight -> blk.29.ffn_norm.weight | F16 | [4096]\n","model.layers.29.self_attn.k_proj.weight -> blk.29.attn_k.weight | F16 | [4096, 4096]\n","model.layers.29.self_attn.o_proj.weight -> blk.29.attn_output.weight | F16 | [4096, 4096]\n","model.layers.29.self_attn.q_proj.weight -> blk.29.attn_q.weight | F16 | [4096, 4096]\n","model.layers.29.self_attn.v_proj.weight -> blk.29.attn_v.weight | F16 | [4096, 4096]\n","model.layers.30.input_layernorm.weight -> blk.30.attn_norm.weight | F16 | [4096]\n","model.layers.30.mlp.down_proj.weight -> blk.30.ffn_down.weight | F16 | [4096, 11008]\n","model.layers.30.mlp.gate_proj.weight -> blk.30.ffn_gate.weight | F16 | [11008, 4096]\n","model.layers.30.mlp.up_proj.weight -> blk.30.ffn_up.weight | F16 | [11008, 4096]\n","model.layers.30.post_attention_layernorm.weight -> blk.30.ffn_norm.weight | F16 | [4096]\n","model.layers.30.self_attn.k_proj.weight -> blk.30.attn_k.weight | F16 | [4096, 4096]\n","model.layers.30.self_attn.o_proj.weight -> blk.30.attn_output.weight | F16 | [4096, 4096]\n","model.layers.30.self_attn.q_proj.weight -> blk.30.attn_q.weight | F16 | [4096, 4096]\n","model.layers.30.self_attn.v_proj.weight -> blk.30.attn_v.weight | F16 | [4096, 4096]\n","model.layers.31.input_layernorm.weight -> blk.31.attn_norm.weight | F16 | [4096]\n","model.layers.31.mlp.down_proj.weight -> blk.31.ffn_down.weight | F16 | [4096, 11008]\n","model.layers.31.mlp.gate_proj.weight -> blk.31.ffn_gate.weight | F16 | [11008, 4096]\n","model.layers.31.mlp.up_proj.weight -> blk.31.ffn_up.weight | F16 | [11008, 4096]\n","model.layers.31.post_attention_layernorm.weight -> blk.31.ffn_norm.weight | F16 | [4096]\n","model.layers.31.self_attn.k_proj.weight -> blk.31.attn_k.weight | 
F16 | [4096, 4096]\n","model.layers.31.self_attn.o_proj.weight -> blk.31.attn_output.weight | F16 | [4096, 4096]\n","model.layers.31.self_attn.q_proj.weight -> blk.31.attn_q.weight | F16 | [4096, 4096]\n","model.layers.31.self_attn.v_proj.weight -> blk.31.attn_v.weight | F16 | [4096, 4096]\n","model.norm.weight -> output_norm.weight | F16 | [4096]\n","Writing llama-2-7b-mini-ibased/llama-2-7b-mini-ibased.fp16.bin, format 1\n","Ignoring added_tokens.json since model matches vocab size without it.\n","gguf: This GGUF file is for Little Endian only\n","gguf: Setting special token type bos to 1\n","gguf: Setting special token type eos to 2\n","gguf: Setting special token type unk to 0\n","gguf: Setting special token type pad to 2\n","gguf: Setting add_bos_token to True\n","gguf: Setting add_eos_token to False\n","gguf: Setting chat_template to {% if messages[0]['role'] == 'system' %}{% set loop_messages = messages[1:] %}{% set system_message = messages[0]['content'] %}{% else %}{% set loop_messages = messages %}{% set system_message = false %}{% endif %}{% for message in loop_messages %}{% if (message['role'] == 'user') != (loop.index0 % 2 == 0) %}{{ raise_exception('Conversation roles must alternate user/assistant/user/assistant/...') }}{% endif %}{% if loop.index0 == 0 and system_message != false %}{% set content = '<<SYS>>\\n' + system_message + '\\n<</SYS>>\\n\\n' + message['content'] %}{% else %}{% set content = message['content'] %}{% endif %}{% if message['role'] == 'user' %}{{ bos_token + '[INST] ' + content.strip() + ' [/INST]' }}{% elif message['role'] == 'assistant' %}{{ ' ' + content.strip() + ' ' + eos_token }}{% endif %}{% endfor %}\n","[ 1/291] Writing tensor token_embd.weight | size 32000 x 4096 | type F16 | T+ 2\n","[ 2/291] Writing tensor blk.0.attn_norm.weight | size 4096 | type F32 | T+ 2\n","[ 3/291] Writing tensor blk.0.ffn_down.weight | size 4096 x 11008 | type F16 | T+ 2\n","[ 4/291] Writing tensor blk.0.ffn_gate.weight | size 11008 x 4096 | type F16 | T+ 3\n","[ 5/291] Writing tensor blk.0.ffn_up.weight | size 11008 x 4096 | type F16 | T+ 3\n","[ 6/291] Writing tensor blk.0.ffn_norm.weight | size 4096 | type F32 | T+ 4\n","[ 7/291] Writing tensor blk.0.attn_k.weight | size 4096 x 4096 | type F16 | T+ 4\n","[ 8/291] Writing tensor blk.0.attn_output.weight | size 4096 x 4096 | type F16 | T+ 4\n","[ 9/291] Writing tensor blk.0.attn_q.weight | size 4096 x 4096 | type F16 | T+ 4\n","[ 10/291] Writing tensor blk.0.attn_v.weight | size 4096 x 4096 | type F16 | T+ 4\n","[ 11/291] Writing tensor blk.1.attn_norm.weight | size 4096 | type F32 | T+ 5\n","[ 12/291] Writing tensor blk.1.ffn_down.weight | size 4096 x 11008 | type F16 | T+ 6\n","[ 13/291] Writing tensor blk.1.ffn_gate.weight | size 11008 x 4096 | type F16 | T+ 9\n","[ 14/291] Writing tensor blk.1.ffn_up.weight | size 11008 x 4096 | type F16 | T+ 9\n","[ 15/291] Writing tensor blk.1.ffn_norm.weight | size 4096 | type F32 | T+ 9\n","[ 16/291] Writing tensor blk.1.attn_k.weight | size 4096 x 4096 | type F16 | T+ 9\n","[ 17/291] Writing tensor blk.1.attn_output.weight | size 4096 x 4096 | type F16 | T+ 9\n","[ 18/291] Writing tensor blk.1.attn_q.weight | size 4096 x 4096 | type F16 | T+ 10\n","[ 19/291] Writing tensor blk.1.attn_v.weight | size 4096 x 4096 | type F16 | T+ 10\n","[ 20/291] Writing tensor blk.10.attn_norm.weight | size 4096 | type F32 | T+ 13\n","[ 21/291] Writing tensor blk.10.ffn_down.weight | size 4096 x 11008 | type F16 | T+ 13\n","[ 22/291] Writing tensor blk.10.ffn_gate.weight | size 11008 x 4096 | type F16 | T+ 
14\n","[ 23/291] Writing tensor blk.10.ffn_up.weight | size 11008 x 4096 | type F16 | T+ 14\n","[ 24/291] Writing tensor blk.10.ffn_norm.weight | size 4096 | type F32 | T+ 16\n","[ 25/291] Writing tensor blk.10.attn_k.weight | size 4096 x 4096 | type F16 | T+ 16\n","[ 26/291] Writing tensor blk.10.attn_output.weight | size 4096 x 4096 | type F16 | T+ 18\n","[ 27/291] Writing tensor blk.10.attn_q.weight | size 4096 x 4096 | type F16 | T+ 18\n","[ 28/291] Writing tensor blk.10.attn_v.weight | size 4096 x 4096 | type F16 | T+ 19\n","[ 29/291] Writing tensor blk.11.ffn_gate.weight | size 11008 x 4096 | type F16 | T+ 19\n","[ 30/291] Writing tensor blk.11.attn_k.weight | size 4096 x 4096 | type F16 | T+ 19\n","[ 31/291] Writing tensor blk.11.attn_output.weight | size 4096 x 4096 | type F16 | T+ 19\n","[ 32/291] Writing tensor blk.11.attn_q.weight | size 4096 x 4096 | type F16 | T+ 19\n","[ 33/291] Writing tensor blk.11.attn_v.weight | size 4096 x 4096 | type F16 | T+ 20\n","[ 34/291] Writing tensor blk.2.attn_norm.weight | size 4096 | type F32 | T+ 23\n","[ 35/291] Writing tensor blk.2.ffn_down.weight | size 4096 x 11008 | type F16 | T+ 23\n","[ 36/291] Writing tensor blk.2.ffn_gate.weight | size 11008 x 4096 | type F16 | T+ 24\n","[ 37/291] Writing tensor blk.2.ffn_up.weight | size 11008 x 4096 | type F16 | T+ 24\n","[ 38/291] Writing tensor blk.2.ffn_norm.weight | size 4096 | type F32 | T+ 24\n","[ 39/291] Writing tensor blk.2.attn_k.weight | size 4096 x 4096 | type F16 | T+ 24\n","[ 40/291] Writing tensor blk.2.attn_output.weight | size 4096 x 4096 | type F16 | T+ 24\n","[ 41/291] Writing tensor blk.2.attn_q.weight | size 4096 x 4096 | type F16 | T+ 25\n","[ 42/291] Writing tensor blk.2.attn_v.weight | size 4096 x 4096 | type F16 | T+ 25\n","[ 43/291] Writing tensor blk.3.attn_norm.weight | size 4096 | type F32 | T+ 25\n","[ 44/291] Writing tensor blk.3.ffn_down.weight | size 4096 x 11008 | type F16 | T+ 26\n","[ 45/291] Writing tensor blk.3.ffn_gate.weight | size 11008 x 4096 | type F16 | T+ 26\n","[ 46/291] Writing tensor blk.3.ffn_up.weight | size 11008 x 4096 | type F16 | T+ 27\n","[ 47/291] Writing tensor blk.3.ffn_norm.weight | size 4096 | type F32 | T+ 28\n","[ 48/291] Writing tensor blk.3.attn_k.weight | size 4096 x 4096 | type F16 | T+ 28\n","[ 49/291] Writing tensor blk.3.attn_output.weight | size 4096 x 4096 | type F16 | T+ 29\n","[ 50/291] Writing tensor blk.3.attn_q.weight | size 4096 x 4096 | type F16 | T+ 29\n","[ 51/291] Writing tensor blk.3.attn_v.weight | size 4096 x 4096 | type F16 | T+ 29\n","[ 52/291] Writing tensor blk.4.attn_norm.weight | size 4096 | type F32 | T+ 29\n","[ 53/291] Writing tensor blk.4.ffn_down.weight | size 4096 x 11008 | type F16 | T+ 30\n","[ 54/291] Writing tensor blk.4.ffn_gate.weight | size 11008 x 4096 | type F16 | T+ 30\n","[ 55/291] Writing tensor blk.4.ffn_up.weight | size 11008 x 4096 | type F16 | T+ 31\n","[ 56/291] Writing tensor blk.4.ffn_norm.weight | size 4096 | type F32 | T+ 34\n","[ 57/291] Writing tensor blk.4.attn_k.weight | size 4096 x 4096 | type F16 | T+ 34\n","[ 58/291] Writing tensor blk.4.attn_output.weight | size 4096 x 4096 | type F16 | T+ 34\n","[ 59/291] Writing tensor blk.4.attn_q.weight | size 4096 x 4096 | type F16 | T+ 34\n","[ 60/291] Writing tensor blk.4.attn_v.weight | size 4096 x 4096 | type F16 | T+ 34\n","[ 61/291] Writing tensor blk.5.attn_norm.weight | size 4096 | type F32 | T+ 34\n","[ 62/291] Writing tensor blk.5.ffn_down.weight | size 4096 x 11008 | type F16 | T+ 34\n","[ 63/291] Writing tensor 
blk.5.ffn_gate.weight | size 11008 x 4096 | type F16 | T+ 38\n","[ 64/291] Writing tensor blk.5.ffn_up.weight | size 11008 x 4096 | type F16 | T+ 39\n","[ 65/291] Writing tensor blk.5.ffn_norm.weight | size 4096 | type F32 | T+ 39\n","[ 66/291] Writing tensor blk.5.attn_k.weight | size 4096 x 4096 | type F16 | T+ 39\n","[ 67/291] Writing tensor blk.5.attn_output.weight | size 4096 x 4096 | type F16 | T+ 39\n","[ 68/291] Writing tensor blk.5.attn_q.weight | size 4096 x 4096 | type F16 | T+ 39\n","[ 69/291] Writing tensor blk.5.attn_v.weight | size 4096 x 4096 | type F16 | T+ 40\n","[ 70/291] Writing tensor blk.6.attn_norm.weight | size 4096 | type F32 | T+ 40\n","[ 71/291] Writing tensor blk.6.ffn_down.weight | size 4096 x 11008 | type F16 | T+ 40\n","[ 72/291] Writing tensor blk.6.ffn_gate.weight | size 11008 x 4096 | type F16 | T+ 41\n","[ 73/291] Writing tensor blk.6.ffn_up.weight | size 11008 x 4096 | type F16 | T+ 43\n","[ 74/291] Writing tensor blk.6.ffn_norm.weight | size 4096 | type F32 | T+ 44\n","[ 75/291] Writing tensor blk.6.attn_k.weight | size 4096 x 4096 | type F16 | T+ 44\n","[ 76/291] Writing tensor blk.6.attn_output.weight | size 4096 x 4096 | type F16 | T+ 44\n","[ 77/291] Writing tensor blk.6.attn_q.weight | size 4096 x 4096 | type F16 | T+ 44\n","[ 78/291] Writing tensor blk.6.attn_v.weight | size 4096 x 4096 | type F16 | T+ 44\n","[ 79/291] Writing tensor blk.7.attn_norm.weight | size 4096 | type F32 | T+ 44\n","[ 80/291] Writing tensor blk.7.ffn_down.weight | size 4096 x 11008 | type F16 | T+ 44\n","[ 81/291] Writing tensor blk.7.ffn_gate.weight | size 11008 x 4096 | type F16 | T+ 45\n","[ 82/291] Writing tensor blk.7.ffn_up.weight | size 11008 x 4096 | type F16 | T+ 46\n","[ 83/291] Writing tensor blk.7.ffn_norm.weight | size 4096 | type F32 | T+ 48\n","[ 84/291] Writing tensor blk.7.attn_k.weight | size 4096 x 4096 | type F16 | T+ 48\n","[ 85/291] Writing tensor blk.7.attn_output.weight | size 4096 x 4096 | type F16 | T+ 48\n","[ 86/291] Writing tensor blk.7.attn_q.weight | size 4096 x 4096 | type F16 | T+ 49\n","[ 87/291] Writing tensor blk.7.attn_v.weight | size 4096 x 4096 | type F16 | T+ 49\n","[ 88/291] Writing tensor blk.8.attn_norm.weight | size 4096 | type F32 | T+ 49\n","[ 89/291] Writing tensor blk.8.ffn_down.weight | size 4096 x 11008 | type F16 | T+ 49\n","[ 90/291] Writing tensor blk.8.ffn_gate.weight | size 11008 x 4096 | type F16 | T+ 53\n","[ 91/291] Writing tensor blk.8.ffn_up.weight | size 11008 x 4096 | type F16 | T+ 54\n","[ 92/291] Writing tensor blk.8.ffn_norm.weight | size 4096 | type F32 | T+ 54\n","[ 93/291] Writing tensor blk.8.attn_k.weight | size 4096 x 4096 | type F16 | T+ 54\n","[ 94/291] Writing tensor blk.8.attn_output.weight | size 4096 x 4096 | type F16 | T+ 54\n","[ 95/291] Writing tensor blk.8.attn_q.weight | size 4096 x 4096 | type F16 | T+ 54\n","[ 96/291] Writing tensor blk.8.attn_v.weight | size 4096 x 4096 | type F16 | T+ 54\n","[ 97/291] Writing tensor blk.9.attn_norm.weight | size 4096 | type F32 | T+ 55\n","[ 98/291] Writing tensor blk.9.ffn_down.weight | size 4096 x 11008 | type F16 | T+ 55\n","[ 99/291] Writing tensor blk.9.ffn_gate.weight | size 11008 x 4096 | type F16 | T+ 56\n","[100/291] Writing tensor blk.9.ffn_up.weight | size 11008 x 4096 | type F16 | T+ 56\n","[101/291] Writing tensor blk.9.ffn_norm.weight | size 4096 | type F32 | T+ 57\n","[102/291] Writing tensor blk.9.attn_k.weight | size 4096 x 4096 | type F16 | T+ 57\n","[103/291] Writing tensor blk.9.attn_output.weight | size 4096 x 4096 | type F16 | T+ 
57\n","[104/291] Writing tensor blk.9.attn_q.weight | size 4096 x 4096 | type F16 | T+ 58\n","[105/291] Writing tensor blk.9.attn_v.weight | size 4096 x 4096 | type F16 | T+ 58\n","[106/291] Writing tensor blk.11.attn_norm.weight | size 4096 | type F32 | T+ 59\n","[107/291] Writing tensor blk.11.ffn_down.weight | size 4096 x 11008 | type F16 | T+ 59\n","[108/291] Writing tensor blk.11.ffn_up.weight | size 11008 x 4096 | type F16 | T+ 60\n","[109/291] Writing tensor blk.11.ffn_norm.weight | size 4096 | type F32 | T+ 60\n","[110/291] Writing tensor blk.12.attn_norm.weight | size 4096 | type F32 | T+ 60\n","[111/291] Writing tensor blk.12.ffn_down.weight | size 4096 x 11008 | type F16 | T+ 60\n","[112/291] Writing tensor blk.12.ffn_gate.weight | size 11008 x 4096 | type F16 | T+ 61\n","[113/291] Writing tensor blk.12.ffn_up.weight | size 11008 x 4096 | type F16 | T+ 61\n","[114/291] Writing tensor blk.12.ffn_norm.weight | size 4096 | type F32 | T+ 61\n","[115/291] Writing tensor blk.12.attn_k.weight | size 4096 x 4096 | type F16 | T+ 61\n","[116/291] Writing tensor blk.12.attn_output.weight | size 4096 x 4096 | type F16 | T+ 62\n","[117/291] Writing tensor blk.12.attn_q.weight | size 4096 x 4096 | type F16 | T+ 62\n","[118/291] Writing tensor blk.12.attn_v.weight | size 4096 x 4096 | type F16 | T+ 62\n","[119/291] Writing tensor blk.13.attn_norm.weight | size 4096 | type F32 | T+ 62\n","[120/291] Writing tensor blk.13.ffn_down.weight | size 4096 x 11008 | type F16 | T+ 63\n","[121/291] Writing tensor blk.13.ffn_gate.weight | size 11008 x 4096 | type F16 | T+ 63\n","[122/291] Writing tensor blk.13.ffn_up.weight | size 11008 x 4096 | type F16 | T+ 64\n","[123/291] Writing tensor blk.13.ffn_norm.weight | size 4096 | type F32 | T+ 64\n","[124/291] Writing tensor blk.13.attn_k.weight | size 4096 x 4096 | type F16 | T+ 64\n","[125/291] Writing tensor blk.13.attn_output.weight | size 4096 x 4096 | type F16 | T+ 64\n","[126/291] Writing tensor blk.13.attn_q.weight | size 4096 x 4096 | type F16 | T+ 64\n","[127/291] Writing tensor blk.13.attn_v.weight | size 4096 x 4096 | type F16 | T+ 65\n","[128/291] Writing tensor blk.14.attn_norm.weight | size 4096 | type F32 | T+ 65\n","[129/291] Writing tensor blk.14.ffn_down.weight | size 4096 x 11008 | type F16 | T+ 66\n","[130/291] Writing tensor blk.14.ffn_gate.weight | size 11008 x 4096 | type F16 | T+ 66\n","[131/291] Writing tensor blk.14.ffn_up.weight | size 11008 x 4096 | type F16 | T+ 69\n","[132/291] Writing tensor blk.14.ffn_norm.weight | size 4096 | type F32 | T+ 69\n","[133/291] Writing tensor blk.14.attn_k.weight | size 4096 x 4096 | type F16 | T+ 69\n","[134/291] Writing tensor blk.14.attn_output.weight | size 4096 x 4096 | type F16 | T+ 69\n","[135/291] Writing tensor blk.14.attn_q.weight | size 4096 x 4096 | type F16 | T+ 69\n","[136/291] Writing tensor blk.14.attn_v.weight | size 4096 x 4096 | type F16 | T+ 69\n","[137/291] Writing tensor blk.15.attn_norm.weight | size 4096 | type F32 | T+ 70\n","[138/291] Writing tensor blk.15.ffn_down.weight | size 4096 x 11008 | type F16 | T+ 71\n","[139/291] Writing tensor blk.15.ffn_gate.weight | size 11008 x 4096 | type F16 | T+ 74\n","[140/291] Writing tensor blk.15.ffn_up.weight | size 11008 x 4096 | type F16 | T+ 74\n","[141/291] Writing tensor blk.15.ffn_norm.weight | size 4096 | type F32 | T+ 75\n","[142/291] Writing tensor blk.15.attn_k.weight | size 4096 x 4096 | type F16 | T+ 75\n","[143/291] Writing tensor blk.15.attn_output.weight | size 4096 x 4096 | type F16 | T+ 78\n","[144/291] Writing 
tensor blk.15.attn_q.weight | size 4096 x 4096 | type F16 | T+ 79\n","[145/291] Writing tensor blk.15.attn_v.weight | size 4096 x 4096 | type F16 | T+ 79\n","[146/291] Writing tensor blk.16.attn_norm.weight | size 4096 | type F32 | T+ 79\n","[147/291] Writing tensor blk.16.ffn_down.weight | size 4096 x 11008 | type F16 | T+ 79\n","[148/291] Writing tensor blk.16.ffn_gate.weight | size 11008 x 4096 | type F16 | T+ 79\n","[149/291] Writing tensor blk.16.ffn_up.weight | size 11008 x 4096 | type F16 | T+ 82\n","[150/291] Writing tensor blk.16.ffn_norm.weight | size 4096 | type F32 | T+ 84\n","[151/291] Writing tensor blk.16.attn_k.weight | size 4096 x 4096 | type F16 | T+ 84\n","[152/291] Writing tensor blk.16.attn_output.weight | size 4096 x 4096 | type F16 | T+ 84\n","[153/291] Writing tensor blk.16.attn_q.weight | size 4096 x 4096 | type F16 | T+ 84\n","[154/291] Writing tensor blk.16.attn_v.weight | size 4096 x 4096 | type F16 | T+ 84\n","[155/291] Writing tensor blk.17.attn_norm.weight | size 4096 | type F32 | T+ 85\n","[156/291] Writing tensor blk.17.ffn_down.weight | size 4096 x 11008 | type F16 | T+ 85\n","[157/291] Writing tensor blk.17.ffn_gate.weight | size 11008 x 4096 | type F16 | T+ 89\n","[158/291] Writing tensor blk.17.ffn_up.weight | size 11008 x 4096 | type F16 | T+ 89\n","[159/291] Writing tensor blk.17.ffn_norm.weight | size 4096 | type F32 | T+ 90\n","[160/291] Writing tensor blk.17.attn_k.weight | size 4096 x 4096 | type F16 | T+ 90\n","[161/291] Writing tensor blk.17.attn_output.weight | size 4096 x 4096 | type F16 | T+ 90\n","[162/291] Writing tensor blk.17.attn_q.weight | size 4096 x 4096 | type F16 | T+ 90\n","[163/291] Writing tensor blk.17.attn_v.weight | size 4096 x 4096 | type F16 | T+ 90\n","[164/291] Writing tensor blk.18.attn_norm.weight | size 4096 | type F32 | T+ 91\n","[165/291] Writing tensor blk.18.ffn_down.weight | size 4096 x 11008 | type F16 | T+ 91\n","[166/291] Writing tensor blk.18.ffn_gate.weight | size 11008 x 4096 | type F16 | T+ 91\n","[167/291] Writing tensor blk.18.ffn_up.weight | size 11008 x 4096 | type F16 | T+ 94\n","[168/291] Writing tensor blk.18.ffn_norm.weight | size 4096 | type F32 | T+ 94\n","[169/291] Writing tensor blk.18.attn_k.weight | size 4096 x 4096 | type F16 | T+ 94\n","[170/291] Writing tensor blk.18.attn_output.weight | size 4096 x 4096 | type F16 | T+ 94\n","[171/291] Writing tensor blk.18.attn_q.weight | size 4096 x 4096 | type F16 | T+ 95\n","[172/291] Writing tensor blk.18.attn_v.weight | size 4096 x 4096 | type F16 | T+ 95\n","[173/291] Writing tensor blk.19.attn_norm.weight | size 4096 | type F32 | T+ 95\n","[174/291] Writing tensor blk.19.ffn_down.weight | size 4096 x 11008 | type F16 | T+ 95\n","[175/291] Writing tensor blk.19.ffn_gate.weight | size 11008 x 4096 | type F16 | T+ 99\n","[176/291] Writing tensor blk.19.ffn_up.weight | size 11008 x 4096 | type F16 | T+ 99\n","[177/291] Writing tensor blk.19.ffn_norm.weight | size 4096 | type F32 | T+ 100\n","[178/291] Writing tensor blk.19.attn_k.weight | size 4096 x 4096 | type F16 | T+ 100\n","[179/291] Writing tensor blk.19.attn_output.weight | size 4096 x 4096 | type F16 | T+ 100\n","[180/291] Writing tensor blk.19.attn_q.weight | size 4096 x 4096 | type F16 | T+ 100\n","[181/291] Writing tensor blk.19.attn_v.weight | size 4096 x 4096 | type F16 | T+ 103\n","[182/291] Writing tensor blk.20.attn_norm.weight | size 4096 | type F32 | T+ 104\n","[183/291] Writing tensor blk.20.ffn_down.weight | size 4096 x 11008 | type F16 | T+ 104\n","[184/291] Writing tensor 
blk.20.ffn_gate.weight | size 11008 x 4096 | type F16 | T+ 104\n","[185/291] Writing tensor blk.20.ffn_up.weight | size 11008 x 4096 | type F16 | T+ 105\n","[186/291] Writing tensor blk.20.ffn_norm.weight | size 4096 | type F32 | T+ 105\n","[187/291] Writing tensor blk.20.attn_k.weight | size 4096 x 4096 | type F16 | T+ 105\n","[188/291] Writing tensor blk.20.attn_output.weight | size 4096 x 4096 | type F16 | T+ 105\n","[189/291] Writing tensor blk.20.attn_q.weight | size 4096 x 4096 | type F16 | T+ 105\n","[190/291] Writing tensor blk.20.attn_v.weight | size 4096 x 4096 | type F16 | T+ 105\n","[191/291] Writing tensor blk.21.attn_norm.weight | size 4096 | type F32 | T+ 105\n","[192/291] Writing tensor blk.21.ffn_down.weight | size 4096 x 11008 | type F16 | T+ 106\n","[193/291] Writing tensor blk.21.ffn_gate.weight | size 11008 x 4096 | type F16 | T+ 106\n","[194/291] Writing tensor blk.21.ffn_up.weight | size 11008 x 4096 | type F16 | T+ 107\n","[195/291] Writing tensor blk.21.ffn_norm.weight | size 4096 | type F32 | T+ 109\n","[196/291] Writing tensor blk.21.attn_k.weight | size 4096 x 4096 | type F16 | T+ 109\n","[197/291] Writing tensor blk.21.attn_output.weight | size 4096 x 4096 | type F16 | T+ 109\n","[198/291] Writing tensor blk.21.attn_q.weight | size 4096 x 4096 | type F16 | T+ 109\n","[199/291] Writing tensor blk.21.attn_v.weight | size 4096 x 4096 | type F16 | T+ 109\n","[200/291] Writing tensor blk.22.attn_norm.weight | size 4096 | type F32 | T+ 109\n","[201/291] Writing tensor blk.22.ffn_down.weight | size 4096 x 11008 | type F16 | T+ 109\n","[202/291] Writing tensor blk.22.ffn_gate.weight | size 11008 x 4096 | type F16 | T+ 113\n","[203/291] Writing tensor blk.22.ffn_up.weight | size 11008 x 4096 | type F16 | T+ 114\n","[204/291] Writing tensor blk.22.ffn_norm.weight | size 4096 | type F32 | T+ 114\n","[205/291] Writing tensor blk.22.attn_k.weight | size 4096 x 4096 | type F16 | T+ 114\n","[206/291] Writing tensor blk.22.attn_output.weight | size 4096 x 4096 | type F16 | T+ 114\n","[207/291] Writing tensor blk.22.attn_q.weight | size 4096 x 4096 | type F16 | T+ 115\n","[208/291] Writing tensor blk.22.attn_v.weight | size 4096 x 4096 | type F16 | T+ 118\n","[209/291] Writing tensor blk.23.ffn_gate.weight | size 11008 x 4096 | type F16 | T+ 119\n","[210/291] Writing tensor blk.23.ffn_up.weight | size 11008 x 4096 | type F16 | T+ 119\n","[211/291] Writing tensor blk.23.attn_k.weight | size 4096 x 4096 | type F16 | T+ 119\n","[212/291] Writing tensor blk.23.attn_output.weight | size 4096 x 4096 | type F16 | T+ 120\n","[213/291] Writing tensor blk.23.attn_q.weight | size 4096 x 4096 | type F16 | T+ 120\n","[214/291] Writing tensor blk.23.attn_v.weight | size 4096 x 4096 | type F16 | T+ 120\n","[215/291] Writing tensor output.weight | size 32000 x 4096 | type F16 | T+ 120\n","[216/291] Writing tensor blk.23.attn_norm.weight | size 4096 | type F32 | T+ 121\n","[217/291] Writing tensor blk.23.ffn_down.weight | size 4096 x 11008 | type F16 | T+ 121\n","[218/291] Writing tensor blk.23.ffn_norm.weight | size 4096 | type F32 | T+ 124\n","[219/291] Writing tensor blk.24.attn_norm.weight | size 4096 | type F32 | T+ 124\n","[220/291] Writing tensor blk.24.ffn_down.weight | size 4096 x 11008 | type F16 | T+ 124\n","[221/291] Writing tensor blk.24.ffn_gate.weight | size 11008 x 4096 | type F16 | T+ 124\n","[222/291] Writing tensor blk.24.ffn_up.weight | size 11008 x 4096 | type F16 | T+ 125\n","[223/291] Writing tensor blk.24.ffn_norm.weight | size 4096 | type F32 | T+ 125\n","[224/291] 
Writing tensor blk.24.attn_k.weight | size 4096 x 4096 | type F16 | T+ 125\n","[225/291] Writing tensor blk.24.attn_output.weight | size 4096 x 4096 | type F16 | T+ 125\n","[226/291] Writing tensor blk.24.attn_q.weight | size 4096 x 4096 | type F16 | T+ 125\n","[227/291] Writing tensor blk.24.attn_v.weight | size 4096 x 4096 | type F16 | T+ 125\n","[228/291] Writing tensor blk.25.attn_norm.weight | size 4096 | type F32 | T+ 125\n","[229/291] Writing tensor blk.25.ffn_down.weight | size 4096 x 11008 | type F16 | T+ 127\n","[230/291] Writing tensor blk.25.ffn_gate.weight | size 11008 x 4096 | type F16 | T+ 127\n","[231/291] Writing tensor blk.25.ffn_up.weight | size 11008 x 4096 | type F16 | T+ 127\n","[232/291] Writing tensor blk.25.ffn_norm.weight | size 4096 | type F32 | T+ 129\n","[233/291] Writing tensor blk.25.attn_k.weight | size 4096 x 4096 | type F16 | T+ 129\n","[234/291] Writing tensor blk.25.attn_output.weight | size 4096 x 4096 | type F16 | T+ 129\n","[235/291] Writing tensor blk.25.attn_q.weight | size 4096 x 4096 | type F16 | T+ 129\n","[236/291] Writing tensor blk.25.attn_v.weight | size 4096 x 4096 | type F16 | T+ 129\n","[237/291] Writing tensor blk.26.attn_norm.weight | size 4096 | type F32 | T+ 129\n","[238/291] Writing tensor blk.26.ffn_down.weight | size 4096 x 11008 | type F16 | T+ 130\n","[239/291] Writing tensor blk.26.ffn_gate.weight | size 11008 x 4096 | type F16 | T+ 131\n","[240/291] Writing tensor blk.26.ffn_up.weight | size 11008 x 4096 | type F16 | T+ 131\n","[241/291] Writing tensor blk.26.ffn_norm.weight | size 4096 | type F32 | T+ 131\n","[242/291] Writing tensor blk.26.attn_k.weight | size 4096 x 4096 | type F16 | T+ 131\n","[243/291] Writing tensor blk.26.attn_output.weight | size 4096 x 4096 | type F16 | T+ 132\n","[244/291] Writing tensor blk.26.attn_q.weight | size 4096 x 4096 | type F16 | T+ 132\n","[245/291] Writing tensor blk.26.attn_v.weight | size 4096 x 4096 | type F16 | T+ 132\n","[246/291] Writing tensor blk.27.attn_norm.weight | size 4096 | type F32 | T+ 132\n","[247/291] Writing tensor blk.27.ffn_down.weight | size 4096 x 11008 | type F16 | T+ 133\n","[248/291] Writing tensor blk.27.ffn_gate.weight | size 11008 x 4096 | type F16 | T+ 134\n","[249/291] Writing tensor blk.27.ffn_up.weight | size 11008 x 4096 | type F16 | T+ 134\n","[250/291] Writing tensor blk.27.ffn_norm.weight | size 4096 | type F32 | T+ 135\n","[251/291] Writing tensor blk.27.attn_k.weight | size 4096 x 4096 | type F16 | T+ 135\n","[252/291] Writing tensor blk.27.attn_output.weight | size 4096 x 4096 | type F16 | T+ 139\n","[253/291] Writing tensor blk.27.attn_q.weight | size 4096 x 4096 | type F16 | T+ 139\n","[254/291] Writing tensor blk.27.attn_v.weight | size 4096 x 4096 | type F16 | T+ 139\n","[255/291] Writing tensor blk.28.attn_norm.weight | size 4096 | type F32 | T+ 139\n","[256/291] Writing tensor blk.28.ffn_down.weight | size 4096 x 11008 | type F16 | T+ 139\n","[257/291] Writing tensor blk.28.ffn_gate.weight | size 11008 x 4096 | type F16 | T+ 140\n","[258/291] Writing tensor blk.28.ffn_up.weight | size 11008 x 4096 | type F16 | T+ 140\n","[259/291] Writing tensor blk.28.ffn_norm.weight | size 4096 | type F32 | T+ 140\n","[260/291] Writing tensor blk.28.attn_k.weight | size 4096 x 4096 | type F16 | T+ 140\n","[261/291] Writing tensor blk.28.attn_output.weight | size 4096 x 4096 | type F16 | T+ 141\n","[262/291] Writing tensor blk.28.attn_q.weight | size 4096 x 4096 | type F16 | T+ 141\n","[263/291] Writing tensor blk.28.attn_v.weight | size 4096 x 4096 | type F16 
| T+ 141\n","[264/291] Writing tensor blk.29.attn_norm.weight | size 4096 | type F32 | T+ 141\n","[265/291] Writing tensor blk.29.ffn_down.weight | size 4096 x 11008 | type F16 | T+ 142\n","[266/291] Writing tensor blk.29.ffn_gate.weight | size 11008 x 4096 | type F16 | T+ 144\n","[267/291] Writing tensor blk.29.ffn_up.weight | size 11008 x 4096 | type F16 | T+ 144\n","[268/291] Writing tensor blk.29.ffn_norm.weight | size 4096 | type F32 | T+ 144\n","[269/291] Writing tensor blk.29.attn_k.weight | size 4096 x 4096 | type F16 | T+ 144\n","[270/291] Writing tensor blk.29.attn_output.weight | size 4096 x 4096 | type F16 | T+ 145\n","[271/291] Writing tensor blk.29.attn_q.weight | size 4096 x 4096 | type F16 | T+ 145\n","[272/291] Writing tensor blk.29.attn_v.weight | size 4096 x 4096 | type F16 | T+ 149\n","[273/291] Writing tensor blk.30.attn_norm.weight | size 4096 | type F32 | T+ 149\n","[274/291] Writing tensor blk.30.ffn_down.weight | size 4096 x 11008 | type F16 | T+ 149\n","[275/291] Writing tensor blk.30.ffn_gate.weight | size 11008 x 4096 | type F16 | T+ 149\n","[276/291] Writing tensor blk.30.ffn_up.weight | size 11008 x 4096 | type F16 | T+ 150\n","[277/291] Writing tensor blk.30.ffn_norm.weight | size 4096 | type F32 | T+ 150\n","[278/291] Writing tensor blk.30.attn_k.weight | size 4096 x 4096 | type F16 | T+ 150\n","[279/291] Writing tensor blk.30.attn_output.weight | size 4096 x 4096 | type F16 | T+ 150\n","[280/291] Writing tensor blk.30.attn_q.weight | size 4096 x 4096 | type F16 | T+ 150\n","[281/291] Writing tensor blk.30.attn_v.weight | size 4096 x 4096 | type F16 | T+ 150\n","[282/291] Writing tensor blk.31.attn_norm.weight | size 4096 | type F32 | T+ 151\n","[283/291] Writing tensor blk.31.ffn_down.weight | size 4096 x 11008 | type F16 | T+ 151\n","[284/291] Writing tensor blk.31.ffn_gate.weight | size 11008 x 4096 | type F16 | T+ 152\n","[285/291] Writing tensor blk.31.ffn_up.weight | size 11008 x 4096 | type F16 | T+ 154\n","[286/291] Writing tensor blk.31.ffn_norm.weight | size 4096 | type F32 | T+ 154\n","[287/291] Writing tensor blk.31.attn_k.weight | size 4096 x 4096 | type F16 | T+ 154\n","[288/291] Writing tensor blk.31.attn_output.weight | size 4096 x 4096 | type F16 | T+ 155\n","[289/291] Writing tensor blk.31.attn_q.weight | size 4096 x 4096 | type F16 | T+ 155\n","[290/291] Writing tensor blk.31.attn_v.weight | size 4096 x 4096 | type F16 | T+ 155\n","[291/291] Writing tensor output_norm.weight | size 4096 | type F32 | T+ 155\n","Wrote llama-2-7b-mini-ibased/llama-2-7b-mini-ibased.fp16.bin\n"]}]},{"cell_type":"code","source":["# Verify creation of FP16 file and quantize the model for specified methods.\n","# First, check if the FP16 model file exists, indicating successful conversion.\n","# If the file does not exist, terminate the script to prevent further errors.\n","# Then, for each quantization method listed, perform model quantization,\n","# generating a quantized model file for each method.\n","\n","\n","if os.path.exists(fp16):\n"," print(f\"FP16 file created successfully: {fp16}\")\n","else:\n"," print(f\"Failed to create FP16 file at: {fp16}\")\n"," import sys\n"," sys.exit(\"Stopping script due to missing FP16 file.\")\n","\n","\n","# Quantize the model using specified methods\n","for method in QUANTIZATION_METHODS:\n"," qtype = f\"{MODEL_NAME}/{MODEL_NAME.lower()}.{method.upper()}.gguf\"\n"," !./llama.cpp/quantize {fp16} {qtype} 
{method}"],"metadata":{"colab":{"base_uri":"https://localhost:8080/"},"id":"mw5y-GWdkbx6","executionInfo":{"status":"ok","timestamp":1706681949750,"user_tz":-480,"elapsed":668152,"user":{"displayName":"szehanz","userId":"16137883221268059572"}},"outputId":"3fe19dac-6a53-4439-cc2b-330b69da4d42"},"execution_count":null,"outputs":[{"output_type":"stream","name":"stdout","text":["FP16 file created successfully: llama-2-7b-mini-ibased/llama-2-7b-mini-ibased.fp16.bin\n","ggml_init_cublas: GGML_CUDA_FORCE_MMQ: no\n","ggml_init_cublas: CUDA_USE_TENSOR_CORES: yes\n","ggml_init_cublas: found 1 CUDA devices:\n"," Device 0: Tesla T4, compute capability 7.5, VMM: yes\n","main: build = 2029 (d62520eb)\n","main: built with cc (Ubuntu 11.4.0-1ubuntu1~22.04) 11.4.0 for x86_64-linux-gnu\n","main: quantizing 'llama-2-7b-mini-ibased/llama-2-7b-mini-ibased.fp16.bin' to 'llama-2-7b-mini-ibased/llama-2-7b-mini-ibased.Q5_K_M.gguf' as Q5_K_M\n","llama_model_loader: loaded meta data with 23 key-value pairs and 291 tensors from llama-2-7b-mini-ibased/llama-2-7b-mini-ibased.fp16.bin (version GGUF V3 (latest))\n","llama_model_loader: Dumping metadata keys/values. Note: KV overrides do not apply in this output.\n","llama_model_loader: - kv 0: general.architecture str = llama\n","llama_model_loader: - kv 1: general.name str = LLaMA v2\n","llama_model_loader: - kv 2: llama.context_length u32 = 4096\n","llama_model_loader: - kv 3: llama.embedding_length u32 = 4096\n","llama_model_loader: - kv 4: llama.block_count u32 = 32\n","llama_model_loader: - kv 5: llama.feed_forward_length u32 = 11008\n","llama_model_loader: - kv 6: llama.rope.dimension_count u32 = 128\n","llama_model_loader: - kv 7: llama.attention.head_count u32 = 32\n","llama_model_loader: - kv 8: llama.attention.head_count_kv u32 = 32\n","llama_model_loader: - kv 9: llama.attention.layer_norm_rms_epsilon f32 = 0.000010\n","llama_model_loader: - kv 10: llama.rope.freq_base f32 = 10000.000000\n","llama_model_loader: - kv 11: general.file_type u32 = 1\n","llama_model_loader: - kv 12: tokenizer.ggml.model str = llama\n","llama_model_loader: - kv 13: tokenizer.ggml.tokens arr[str,32000] = [\"\", \"\", \"\", \"<0x00>\", \"<...\n","llama_model_loader: - kv 14: tokenizer.ggml.scores arr[f32,32000] = [0.000000, 0.000000, 0.000000, 0.0000...\n","llama_model_loader: - kv 15: tokenizer.ggml.token_type arr[i32,32000] = [2, 3, 3, 6, 6, 6, 6, 6, 6, 6, 6, 6, ...\n","llama_model_loader: - kv 16: tokenizer.ggml.bos_token_id u32 = 1\n","llama_model_loader: - kv 17: tokenizer.ggml.eos_token_id u32 = 2\n","llama_model_loader: - kv 18: tokenizer.ggml.unknown_token_id u32 = 0\n","llama_model_loader: - kv 19: tokenizer.ggml.padding_token_id u32 = 2\n","llama_model_loader: - kv 20: tokenizer.ggml.add_bos_token bool = true\n","llama_model_loader: - kv 21: tokenizer.ggml.add_eos_token bool = false\n","llama_model_loader: - kv 22: tokenizer.chat_template str = {% if messages[0]['role'] == 'system'...\n","llama_model_loader: - type f32: 65 tensors\n","llama_model_loader: - type f16: 226 tensors\n","llama_model_quantize_internal: meta size = 742080 bytes\n","[ 1/ 291] token_embd.weight - [ 4096, 32000, 1, 1], type = f16, quantizing to q5_K .. size = 250.00 MiB -> 85.94 MiB\n","[ 2/ 291] blk.0.attn_norm.weight - [ 4096, 1, 1, 1], type = f32, size = 0.016 MB\n","[ 3/ 291] blk.0.ffn_down.weight - [11008, 4096, 1, 1], type = f16, quantizing to q6_K .. size = 86.00 MiB -> 35.27 MiB\n","[ 4/ 291] blk.0.ffn_gate.weight - [ 4096, 11008, 1, 1], type = f16, quantizing to q5_K .. 
size = 86.00 MiB -> 29.56 MiB\n","[ 5/ 291] blk.0.ffn_up.weight - [ 4096, 11008, 1, 1], type = f16, quantizing to q5_K .. size = 86.00 MiB -> 29.56 MiB\n","[ 6/ 291] blk.0.ffn_norm.weight - [ 4096, 1, 1, 1], type = f32, size = 0.016 MB\n","[ 7/ 291] blk.0.attn_k.weight - [ 4096, 4096, 1, 1], type = f16, quantizing to q5_K .. size = 32.00 MiB -> 11.00 MiB\n","[ 8/ 291] blk.0.attn_output.weight - [ 4096, 4096, 1, 1], type = f16, quantizing to q5_K .. size = 32.00 MiB -> 11.00 MiB\n","[ 9/ 291] blk.0.attn_q.weight - [ 4096, 4096, 1, 1], type = f16, quantizing to q5_K .. size = 32.00 MiB -> 11.00 MiB\n","[ 10/ 291] blk.0.attn_v.weight - [ 4096, 4096, 1, 1], type = f16, quantizing to q6_K .. size = 32.00 MiB -> 13.12 MiB\n","[ 11/ 291] blk.1.attn_norm.weight - [ 4096, 1, 1, 1], type = f32, size = 0.016 MB\n","[ 12/ 291] blk.1.ffn_down.weight - [11008, 4096, 1, 1], type = f16, quantizing to q6_K .. size = 86.00 MiB -> 35.27 MiB\n","[ 13/ 291] blk.1.ffn_gate.weight - [ 4096, 11008, 1, 1], type = f16, quantizing to q5_K .. size = 86.00 MiB -> 29.56 MiB\n","[ 14/ 291] blk.1.ffn_up.weight - [ 4096, 11008, 1, 1], type = f16, quantizing to q5_K .. size = 86.00 MiB -> 29.56 MiB\n","[ 15/ 291] blk.1.ffn_norm.weight - [ 4096, 1, 1, 1], type = f32, size = 0.016 MB\n","[ 16/ 291] blk.1.attn_k.weight - [ 4096, 4096, 1, 1], type = f16, quantizing to q5_K .. size = 32.00 MiB -> 11.00 MiB\n","[ 17/ 291] blk.1.attn_output.weight - [ 4096, 4096, 1, 1], type = f16, quantizing to q5_K .. size = 32.00 MiB -> 11.00 MiB\n","[ 18/ 291] blk.1.attn_q.weight - [ 4096, 4096, 1, 1], type = f16, quantizing to q5_K .. size = 32.00 MiB -> 11.00 MiB\n","[ 19/ 291] blk.1.attn_v.weight - [ 4096, 4096, 1, 1], type = f16, quantizing to q6_K .. size = 32.00 MiB -> 13.12 MiB\n","[ 20/ 291] blk.10.attn_norm.weight - [ 4096, 1, 1, 1], type = f32, size = 0.016 MB\n","[ 21/ 291] blk.10.ffn_down.weight - [11008, 4096, 1, 1], type = f16, quantizing to q6_K .. size = 86.00 MiB -> 35.27 MiB\n","[ 22/ 291] blk.10.ffn_gate.weight - [ 4096, 11008, 1, 1], type = f16, quantizing to q5_K .. size = 86.00 MiB -> 29.56 MiB\n","[ 23/ 291] blk.10.ffn_up.weight - [ 4096, 11008, 1, 1], type = f16, quantizing to q5_K .. size = 86.00 MiB -> 29.56 MiB\n","[ 24/ 291] blk.10.ffn_norm.weight - [ 4096, 1, 1, 1], type = f32, size = 0.016 MB\n","[ 25/ 291] blk.10.attn_k.weight - [ 4096, 4096, 1, 1], type = f16, quantizing to q5_K .. size = 32.00 MiB -> 11.00 MiB\n","[ 26/ 291] blk.10.attn_output.weight - [ 4096, 4096, 1, 1], type = f16, quantizing to q5_K .. size = 32.00 MiB -> 11.00 MiB\n","[ 27/ 291] blk.10.attn_q.weight - [ 4096, 4096, 1, 1], type = f16, quantizing to q5_K .. size = 32.00 MiB -> 11.00 MiB\n","[ 28/ 291] blk.10.attn_v.weight - [ 4096, 4096, 1, 1], type = f16, quantizing to q6_K .. size = 32.00 MiB -> 13.12 MiB\n","[ 29/ 291] blk.11.ffn_gate.weight - [ 4096, 11008, 1, 1], type = f16, quantizing to q5_K .. size = 86.00 MiB -> 29.56 MiB\n","[ 30/ 291] blk.11.attn_k.weight - [ 4096, 4096, 1, 1], type = f16, quantizing to q5_K .. size = 32.00 MiB -> 11.00 MiB\n","[ 31/ 291] blk.11.attn_output.weight - [ 4096, 4096, 1, 1], type = f16, quantizing to q5_K .. size = 32.00 MiB -> 11.00 MiB\n","[ 32/ 291] blk.11.attn_q.weight - [ 4096, 4096, 1, 1], type = f16, quantizing to q5_K .. size = 32.00 MiB -> 11.00 MiB\n","[ 33/ 291] blk.11.attn_v.weight - [ 4096, 4096, 1, 1], type = f16, quantizing to q6_K .. 
size = 32.00 MiB -> 13.12 MiB\n","[ 34/ 291] blk.2.attn_norm.weight - [ 4096, 1, 1, 1], type = f32, size = 0.016 MB\n","[ 35/ 291] blk.2.ffn_down.weight - [11008, 4096, 1, 1], type = f16, quantizing to q6_K .. size = 86.00 MiB -> 35.27 MiB\n","[ 36/ 291] blk.2.ffn_gate.weight - [ 4096, 11008, 1, 1], type = f16, quantizing to q5_K .. size = 86.00 MiB -> 29.56 MiB\n","[ 37/ 291] blk.2.ffn_up.weight - [ 4096, 11008, 1, 1], type = f16, quantizing to q5_K .. size = 86.00 MiB -> 29.56 MiB\n","[ 38/ 291] blk.2.ffn_norm.weight - [ 4096, 1, 1, 1], type = f32, size = 0.016 MB\n","[ 39/ 291] blk.2.attn_k.weight - [ 4096, 4096, 1, 1], type = f16, quantizing to q5_K .. size = 32.00 MiB -> 11.00 MiB\n","[ 40/ 291] blk.2.attn_output.weight - [ 4096, 4096, 1, 1], type = f16, quantizing to q5_K .. size = 32.00 MiB -> 11.00 MiB\n","[ 41/ 291] blk.2.attn_q.weight - [ 4096, 4096, 1, 1], type = f16, quantizing to q5_K .. size = 32.00 MiB -> 11.00 MiB\n","[ 42/ 291] blk.2.attn_v.weight - [ 4096, 4096, 1, 1], type = f16, quantizing to q5_K .. size = 32.00 MiB -> 11.00 MiB\n","[ 43/ 291] blk.3.attn_norm.weight - [ 4096, 1, 1, 1], type = f32, size = 0.016 MB\n","[ 44/ 291] blk.3.ffn_down.weight - [11008, 4096, 1, 1], type = f16, quantizing to q5_K .. size = 86.00 MiB -> 29.56 MiB\n","[ 45/ 291] blk.3.ffn_gate.weight - [ 4096, 11008, 1, 1], type = f16, quantizing to q5_K .. size = 86.00 MiB -> 29.56 MiB\n","[ 46/ 291] blk.3.ffn_up.weight - [ 4096, 11008, 1, 1], type = f16, quantizing to q5_K .. size = 86.00 MiB -> 29.56 MiB\n","[ 47/ 291] blk.3.ffn_norm.weight - [ 4096, 1, 1, 1], type = f32, size = 0.016 MB\n","[ 48/ 291] blk.3.attn_k.weight - [ 4096, 4096, 1, 1], type = f16, quantizing to q5_K .. size = 32.00 MiB -> 11.00 MiB\n","[ 49/ 291] blk.3.attn_output.weight - [ 4096, 4096, 1, 1], type = f16, quantizing to q5_K .. size = 32.00 MiB -> 11.00 MiB\n","[ 50/ 291] blk.3.attn_q.weight - [ 4096, 4096, 1, 1], type = f16, quantizing to q5_K .. size = 32.00 MiB -> 11.00 MiB\n","[ 51/ 291] blk.3.attn_v.weight - [ 4096, 4096, 1, 1], type = f16, quantizing to q5_K .. size = 32.00 MiB -> 11.00 MiB\n","[ 52/ 291] blk.4.attn_norm.weight - [ 4096, 1, 1, 1], type = f32, size = 0.016 MB\n","[ 53/ 291] blk.4.ffn_down.weight - [11008, 4096, 1, 1], type = f16, quantizing to q5_K .. size = 86.00 MiB -> 29.56 MiB\n","[ 54/ 291] blk.4.ffn_gate.weight - [ 4096, 11008, 1, 1], type = f16, quantizing to q5_K .. size = 86.00 MiB -> 29.56 MiB\n","[ 55/ 291] blk.4.ffn_up.weight - [ 4096, 11008, 1, 1], type = f16, quantizing to q5_K .. size = 86.00 MiB -> 29.56 MiB\n","[ 56/ 291] blk.4.ffn_norm.weight - [ 4096, 1, 1, 1], type = f32, size = 0.016 MB\n","[ 57/ 291] blk.4.attn_k.weight - [ 4096, 4096, 1, 1], type = f16, quantizing to q5_K .. size = 32.00 MiB -> 11.00 MiB\n","[ 58/ 291] blk.4.attn_output.weight - [ 4096, 4096, 1, 1], type = f16, quantizing to q5_K .. size = 32.00 MiB -> 11.00 MiB\n","[ 59/ 291] blk.4.attn_q.weight - [ 4096, 4096, 1, 1], type = f16, quantizing to q5_K .. size = 32.00 MiB -> 11.00 MiB\n","[ 60/ 291] blk.4.attn_v.weight - [ 4096, 4096, 1, 1], type = f16, quantizing to q6_K .. size = 32.00 MiB -> 13.12 MiB\n","[ 61/ 291] blk.5.attn_norm.weight - [ 4096, 1, 1, 1], type = f32, size = 0.016 MB\n","[ 62/ 291] blk.5.ffn_down.weight - [11008, 4096, 1, 1], type = f16, quantizing to q6_K .. size = 86.00 MiB -> 35.27 MiB\n","[ 63/ 291] blk.5.ffn_gate.weight - [ 4096, 11008, 1, 1], type = f16, quantizing to q5_K .. 
size = 86.00 MiB -> 29.56 MiB\n","[ 64/ 291] blk.5.ffn_up.weight - [ 4096, 11008, 1, 1], type = f16, quantizing to q5_K .. size = 86.00 MiB -> 29.56 MiB\n","[ 65/ 291] blk.5.ffn_norm.weight - [ 4096, 1, 1, 1], type = f32, size = 0.016 MB\n","[ 66/ 291] blk.5.attn_k.weight - [ 4096, 4096, 1, 1], type = f16, quantizing to q5_K .. size = 32.00 MiB -> 11.00 MiB\n","[ 67/ 291] blk.5.attn_output.weight - [ 4096, 4096, 1, 1], type = f16, quantizing to q5_K .. size = 32.00 MiB -> 11.00 MiB\n","[ 68/ 291] blk.5.attn_q.weight - [ 4096, 4096, 1, 1], type = f16, quantizing to q5_K .. size = 32.00 MiB -> 11.00 MiB\n","[ 69/ 291] blk.5.attn_v.weight - [ 4096, 4096, 1, 1], type = f16, quantizing to q5_K .. size = 32.00 MiB -> 11.00 MiB\n","[ 70/ 291] blk.6.attn_norm.weight - [ 4096, 1, 1, 1], type = f32, size = 0.016 MB\n","[ 71/ 291] blk.6.ffn_down.weight - [11008, 4096, 1, 1], type = f16, quantizing to q5_K .. size = 86.00 MiB -> 29.56 MiB\n","[ 72/ 291] blk.6.ffn_gate.weight - [ 4096, 11008, 1, 1], type = f16, quantizing to q5_K .. size = 86.00 MiB -> 29.56 MiB\n","[ 73/ 291] blk.6.ffn_up.weight - [ 4096, 11008, 1, 1], type = f16, quantizing to q5_K .. size = 86.00 MiB -> 29.56 MiB\n","[ 74/ 291] blk.6.ffn_norm.weight - [ 4096, 1, 1, 1], type = f32, size = 0.016 MB\n","[ 75/ 291] blk.6.attn_k.weight - [ 4096, 4096, 1, 1], type = f16, quantizing to q5_K .. size = 32.00 MiB -> 11.00 MiB\n","[ 76/ 291] blk.6.attn_output.weight - [ 4096, 4096, 1, 1], type = f16, quantizing to q5_K .. size = 32.00 MiB -> 11.00 MiB\n","[ 77/ 291] blk.6.attn_q.weight - [ 4096, 4096, 1, 1], type = f16, quantizing to q5_K .. size = 32.00 MiB -> 11.00 MiB\n","[ 78/ 291] blk.6.attn_v.weight - [ 4096, 4096, 1, 1], type = f16, quantizing to q5_K .. size = 32.00 MiB -> 11.00 MiB\n","[ 79/ 291] blk.7.attn_norm.weight - [ 4096, 1, 1, 1], type = f32, size = 0.016 MB\n","[ 80/ 291] blk.7.ffn_down.weight - [11008, 4096, 1, 1], type = f16, quantizing to q5_K .. size = 86.00 MiB -> 29.56 MiB\n","[ 81/ 291] blk.7.ffn_gate.weight - [ 4096, 11008, 1, 1], type = f16, quantizing to q5_K .. size = 86.00 MiB -> 29.56 MiB\n","[ 82/ 291] blk.7.ffn_up.weight - [ 4096, 11008, 1, 1], type = f16, quantizing to q5_K .. size = 86.00 MiB -> 29.56 MiB\n","[ 83/ 291] blk.7.ffn_norm.weight - [ 4096, 1, 1, 1], type = f32, size = 0.016 MB\n","[ 84/ 291] blk.7.attn_k.weight - [ 4096, 4096, 1, 1], type = f16, quantizing to q5_K .. size = 32.00 MiB -> 11.00 MiB\n","[ 85/ 291] blk.7.attn_output.weight - [ 4096, 4096, 1, 1], type = f16, quantizing to q5_K .. size = 32.00 MiB -> 11.00 MiB\n","[ 86/ 291] blk.7.attn_q.weight - [ 4096, 4096, 1, 1], type = f16, quantizing to q5_K .. size = 32.00 MiB -> 11.00 MiB\n","[ 87/ 291] blk.7.attn_v.weight - [ 4096, 4096, 1, 1], type = f16, quantizing to q6_K .. size = 32.00 MiB -> 13.12 MiB\n","[ 88/ 291] blk.8.attn_norm.weight - [ 4096, 1, 1, 1], type = f32, size = 0.016 MB\n","[ 89/ 291] blk.8.ffn_down.weight - [11008, 4096, 1, 1], type = f16, quantizing to q6_K .. size = 86.00 MiB -> 35.27 MiB\n","[ 90/ 291] blk.8.ffn_gate.weight - [ 4096, 11008, 1, 1], type = f16, quantizing to q5_K .. size = 86.00 MiB -> 29.56 MiB\n","[ 91/ 291] blk.8.ffn_up.weight - [ 4096, 11008, 1, 1], type = f16, quantizing to q5_K .. size = 86.00 MiB -> 29.56 MiB\n","[ 92/ 291] blk.8.ffn_norm.weight - [ 4096, 1, 1, 1], type = f32, size = 0.016 MB\n","[ 93/ 291] blk.8.attn_k.weight - [ 4096, 4096, 1, 1], type = f16, quantizing to q5_K .. 
size = 32.00 MiB -> 11.00 MiB\n","[ 94/ 291] blk.8.attn_output.weight - [ 4096, 4096, 1, 1], type = f16, quantizing to q5_K .. size = 32.00 MiB -> 11.00 MiB\n","[ 95/ 291] blk.8.attn_q.weight - [ 4096, 4096, 1, 1], type = f16, quantizing to q5_K .. size = 32.00 MiB -> 11.00 MiB\n","[ 96/ 291] blk.8.attn_v.weight - [ 4096, 4096, 1, 1], type = f16, quantizing to q5_K .. size = 32.00 MiB -> 11.00 MiB\n","[ 97/ 291] blk.9.attn_norm.weight - [ 4096, 1, 1, 1], type = f32, size = 0.016 MB\n","[ 98/ 291] blk.9.ffn_down.weight - [11008, 4096, 1, 1], type = f16, quantizing to q5_K .. size = 86.00 MiB -> 29.56 MiB\n","[ 99/ 291] blk.9.ffn_gate.weight - [ 4096, 11008, 1, 1], type = f16, quantizing to q5_K .. size = 86.00 MiB -> 29.56 MiB\n","[ 100/ 291] blk.9.ffn_up.weight - [ 4096, 11008, 1, 1], type = f16, quantizing to q5_K .. size = 86.00 MiB -> 29.56 MiB\n","[ 101/ 291] blk.9.ffn_norm.weight - [ 4096, 1, 1, 1], type = f32, size = 0.016 MB\n","[ 102/ 291] blk.9.attn_k.weight - [ 4096, 4096, 1, 1], type = f16, quantizing to q5_K .. size = 32.00 MiB -> 11.00 MiB\n","[ 103/ 291] blk.9.attn_output.weight - [ 4096, 4096, 1, 1], type = f16, quantizing to q5_K .. size = 32.00 MiB -> 11.00 MiB\n","[ 104/ 291] blk.9.attn_q.weight - [ 4096, 4096, 1, 1], type = f16, quantizing to q5_K .. size = 32.00 MiB -> 11.00 MiB\n","[ 105/ 291] blk.9.attn_v.weight - [ 4096, 4096, 1, 1], type = f16, quantizing to q5_K .. size = 32.00 MiB -> 11.00 MiB\n","[ 106/ 291] blk.11.attn_norm.weight - [ 4096, 1, 1, 1], type = f32, size = 0.016 MB\n","[ 107/ 291] blk.11.ffn_down.weight - [11008, 4096, 1, 1], type = f16, quantizing to q5_K .. size = 86.00 MiB -> 29.56 MiB\n","[ 108/ 291] blk.11.ffn_up.weight - [ 4096, 11008, 1, 1], type = f16, quantizing to q5_K .. size = 86.00 MiB -> 29.56 MiB\n","[ 109/ 291] blk.11.ffn_norm.weight - [ 4096, 1, 1, 1], type = f32, size = 0.016 MB\n","[ 110/ 291] blk.12.attn_norm.weight - [ 4096, 1, 1, 1], type = f32, size = 0.016 MB\n","[ 111/ 291] blk.12.ffn_down.weight - [11008, 4096, 1, 1], type = f16, quantizing to q6_K .. size = 86.00 MiB -> 35.27 MiB\n","[ 112/ 291] blk.12.ffn_gate.weight - [ 4096, 11008, 1, 1], type = f16, quantizing to q5_K .. size = 86.00 MiB -> 29.56 MiB\n","[ 113/ 291] blk.12.ffn_up.weight - [ 4096, 11008, 1, 1], type = f16, quantizing to q5_K .. size = 86.00 MiB -> 29.56 MiB\n","[ 114/ 291] blk.12.ffn_norm.weight - [ 4096, 1, 1, 1], type = f32, size = 0.016 MB\n","[ 115/ 291] blk.12.attn_k.weight - [ 4096, 4096, 1, 1], type = f16, quantizing to q5_K .. size = 32.00 MiB -> 11.00 MiB\n","[ 116/ 291] blk.12.attn_output.weight - [ 4096, 4096, 1, 1], type = f16, quantizing to q5_K .. size = 32.00 MiB -> 11.00 MiB\n","[ 117/ 291] blk.12.attn_q.weight - [ 4096, 4096, 1, 1], type = f16, quantizing to q5_K .. size = 32.00 MiB -> 11.00 MiB\n","[ 118/ 291] blk.12.attn_v.weight - [ 4096, 4096, 1, 1], type = f16, quantizing to q6_K .. size = 32.00 MiB -> 13.12 MiB\n","[ 119/ 291] blk.13.attn_norm.weight - [ 4096, 1, 1, 1], type = f32, size = 0.016 MB\n","[ 120/ 291] blk.13.ffn_down.weight - [11008, 4096, 1, 1], type = f16, quantizing to q5_K .. size = 86.00 MiB -> 29.56 MiB\n","[ 121/ 291] blk.13.ffn_gate.weight - [ 4096, 11008, 1, 1], type = f16, quantizing to q5_K .. size = 86.00 MiB -> 29.56 MiB\n","[ 122/ 291] blk.13.ffn_up.weight - [ 4096, 11008, 1, 1], type = f16, quantizing to q5_K .. 
size = 86.00 MiB -> 29.56 MiB\n","[ 123/ 291] blk.13.ffn_norm.weight - [ 4096, 1, 1, 1], type = f32, size = 0.016 MB\n","[ 124/ 291] blk.13.attn_k.weight - [ 4096, 4096, 1, 1], type = f16, quantizing to q5_K .. size = 32.00 MiB -> 11.00 MiB\n","[ 125/ 291] blk.13.attn_output.weight - [ 4096, 4096, 1, 1], type = f16, quantizing to q5_K .. size = 32.00 MiB -> 11.00 MiB\n","[ 126/ 291] blk.13.attn_q.weight - [ 4096, 4096, 1, 1], type = f16, quantizing to q5_K .. size = 32.00 MiB -> 11.00 MiB\n","[ 127/ 291] blk.13.attn_v.weight - [ 4096, 4096, 1, 1], type = f16, quantizing to q5_K .. size = 32.00 MiB -> 11.00 MiB\n","[ 128/ 291] blk.14.attn_norm.weight - [ 4096, 1, 1, 1], type = f32, size = 0.016 MB\n","[ 129/ 291] blk.14.ffn_down.weight - [11008, 4096, 1, 1], type = f16, quantizing to q5_K .. size = 86.00 MiB -> 29.56 MiB\n","[ 130/ 291] blk.14.ffn_gate.weight - [ 4096, 11008, 1, 1], type = f16, quantizing to q5_K .. size = 86.00 MiB -> 29.56 MiB\n","[ 131/ 291] blk.14.ffn_up.weight - [ 4096, 11008, 1, 1], type = f16, quantizing to q5_K .. size = 86.00 MiB -> 29.56 MiB\n","[ 132/ 291] blk.14.ffn_norm.weight - [ 4096, 1, 1, 1], type = f32, size = 0.016 MB\n","[ 133/ 291] blk.14.attn_k.weight - [ 4096, 4096, 1, 1], type = f16, quantizing to q5_K .. size = 32.00 MiB -> 11.00 MiB\n","[ 134/ 291] blk.14.attn_output.weight - [ 4096, 4096, 1, 1], type = f16, quantizing to q5_K .. size = 32.00 MiB -> 11.00 MiB\n","[ 135/ 291] blk.14.attn_q.weight - [ 4096, 4096, 1, 1], type = f16, quantizing to q5_K .. size = 32.00 MiB -> 11.00 MiB\n","[ 136/ 291] blk.14.attn_v.weight - [ 4096, 4096, 1, 1], type = f16, quantizing to q5_K .. size = 32.00 MiB -> 11.00 MiB\n","[ 137/ 291] blk.15.attn_norm.weight - [ 4096, 1, 1, 1], type = f32, size = 0.016 MB\n","[ 138/ 291] blk.15.ffn_down.weight - [11008, 4096, 1, 1], type = f16, quantizing to q6_K .. size = 86.00 MiB -> 35.27 MiB\n","[ 139/ 291] blk.15.ffn_gate.weight - [ 4096, 11008, 1, 1], type = f16, quantizing to q5_K .. size = 86.00 MiB -> 29.56 MiB\n","[ 140/ 291] blk.15.ffn_up.weight - [ 4096, 11008, 1, 1], type = f16, quantizing to q5_K .. size = 86.00 MiB -> 29.56 MiB\n","[ 141/ 291] blk.15.ffn_norm.weight - [ 4096, 1, 1, 1], type = f32, size = 0.016 MB\n","[ 142/ 291] blk.15.attn_k.weight - [ 4096, 4096, 1, 1], type = f16, quantizing to q5_K .. size = 32.00 MiB -> 11.00 MiB\n","[ 143/ 291] blk.15.attn_output.weight - [ 4096, 4096, 1, 1], type = f16, quantizing to q5_K .. size = 32.00 MiB -> 11.00 MiB\n","[ 144/ 291] blk.15.attn_q.weight - [ 4096, 4096, 1, 1], type = f16, quantizing to q5_K .. size = 32.00 MiB -> 11.00 MiB\n","[ 145/ 291] blk.15.attn_v.weight - [ 4096, 4096, 1, 1], type = f16, quantizing to q6_K .. size = 32.00 MiB -> 13.12 MiB\n","[ 146/ 291] blk.16.attn_norm.weight - [ 4096, 1, 1, 1], type = f32, size = 0.016 MB\n","[ 147/ 291] blk.16.ffn_down.weight - [11008, 4096, 1, 1], type = f16, quantizing to q5_K .. size = 86.00 MiB -> 29.56 MiB\n","[ 148/ 291] blk.16.ffn_gate.weight - [ 4096, 11008, 1, 1], type = f16, quantizing to q5_K .. size = 86.00 MiB -> 29.56 MiB\n","[ 149/ 291] blk.16.ffn_up.weight - [ 4096, 11008, 1, 1], type = f16, quantizing to q5_K .. size = 86.00 MiB -> 29.56 MiB\n","[ 150/ 291] blk.16.ffn_norm.weight - [ 4096, 1, 1, 1], type = f32, size = 0.016 MB\n","[ 151/ 291] blk.16.attn_k.weight - [ 4096, 4096, 1, 1], type = f16, quantizing to q5_K .. size = 32.00 MiB -> 11.00 MiB\n","[ 152/ 291] blk.16.attn_output.weight - [ 4096, 4096, 1, 1], type = f16, quantizing to q5_K .. 
size = 32.00 MiB -> 11.00 MiB\n","[ 153/ 291] blk.16.attn_q.weight - [ 4096, 4096, 1, 1], type = f16, quantizing to q5_K .. size = 32.00 MiB -> 11.00 MiB\n","[ 154/ 291] blk.16.attn_v.weight - [ 4096, 4096, 1, 1], type = f16, quantizing to q5_K .. size = 32.00 MiB -> 11.00 MiB\n","[ 155/ 291] blk.17.attn_norm.weight - [ 4096, 1, 1, 1], type = f32, size = 0.016 MB\n","[ 156/ 291] blk.17.ffn_down.weight - [11008, 4096, 1, 1], type = f16, quantizing to q5_K .. size = 86.00 MiB -> 29.56 MiB\n","[ 157/ 291] blk.17.ffn_gate.weight - [ 4096, 11008, 1, 1], type = f16, quantizing to q5_K .. size = 86.00 MiB -> 29.56 MiB\n","[ 158/ 291] blk.17.ffn_up.weight - [ 4096, 11008, 1, 1], type = f16, quantizing to q5_K .. size = 86.00 MiB -> 29.56 MiB\n","[ 159/ 291] blk.17.ffn_norm.weight - [ 4096, 1, 1, 1], type = f32, size = 0.016 MB\n","[ 160/ 291] blk.17.attn_k.weight - [ 4096, 4096, 1, 1], type = f16, quantizing to q5_K .. size = 32.00 MiB -> 11.00 MiB\n","[ 161/ 291] blk.17.attn_output.weight - [ 4096, 4096, 1, 1], type = f16, quantizing to q5_K .. size = 32.00 MiB -> 11.00 MiB\n","[ 162/ 291] blk.17.attn_q.weight - [ 4096, 4096, 1, 1], type = f16, quantizing to q5_K .. size = 32.00 MiB -> 11.00 MiB\n","[ 163/ 291] blk.17.attn_v.weight - [ 4096, 4096, 1, 1], type = f16, quantizing to q5_K .. size = 32.00 MiB -> 11.00 MiB\n","[ 164/ 291] blk.18.attn_norm.weight - [ 4096, 1, 1, 1], type = f32, size = 0.016 MB\n","[ 165/ 291] blk.18.ffn_down.weight - [11008, 4096, 1, 1], type = f16, quantizing to q6_K .. size = 86.00 MiB -> 35.27 MiB\n","[ 166/ 291] blk.18.ffn_gate.weight - [ 4096, 11008, 1, 1], type = f16, quantizing to q5_K .. size = 86.00 MiB -> 29.56 MiB\n","[ 167/ 291] blk.18.ffn_up.weight - [ 4096, 11008, 1, 1], type = f16, quantizing to q5_K .. size = 86.00 MiB -> 29.56 MiB\n","[ 168/ 291] blk.18.ffn_norm.weight - [ 4096, 1, 1, 1], type = f32, size = 0.016 MB\n","[ 169/ 291] blk.18.attn_k.weight - [ 4096, 4096, 1, 1], type = f16, quantizing to q5_K .. size = 32.00 MiB -> 11.00 MiB\n","[ 170/ 291] blk.18.attn_output.weight - [ 4096, 4096, 1, 1], type = f16, quantizing to q5_K .. size = 32.00 MiB -> 11.00 MiB\n","[ 171/ 291] blk.18.attn_q.weight - [ 4096, 4096, 1, 1], type = f16, quantizing to q5_K .. size = 32.00 MiB -> 11.00 MiB\n","[ 172/ 291] blk.18.attn_v.weight - [ 4096, 4096, 1, 1], type = f16, quantizing to q6_K .. size = 32.00 MiB -> 13.12 MiB\n","[ 173/ 291] blk.19.attn_norm.weight - [ 4096, 1, 1, 1], type = f32, size = 0.016 MB\n","[ 174/ 291] blk.19.ffn_down.weight - [11008, 4096, 1, 1], type = f16, quantizing to q5_K .. size = 86.00 MiB -> 29.56 MiB\n","[ 175/ 291] blk.19.ffn_gate.weight - [ 4096, 11008, 1, 1], type = f16, quantizing to q5_K .. size = 86.00 MiB -> 29.56 MiB\n","[ 176/ 291] blk.19.ffn_up.weight - [ 4096, 11008, 1, 1], type = f16, quantizing to q5_K .. size = 86.00 MiB -> 29.56 MiB\n","[ 177/ 291] blk.19.ffn_norm.weight - [ 4096, 1, 1, 1], type = f32, size = 0.016 MB\n","[ 178/ 291] blk.19.attn_k.weight - [ 4096, 4096, 1, 1], type = f16, quantizing to q5_K .. size = 32.00 MiB -> 11.00 MiB\n","[ 179/ 291] blk.19.attn_output.weight - [ 4096, 4096, 1, 1], type = f16, quantizing to q5_K .. size = 32.00 MiB -> 11.00 MiB\n","[ 180/ 291] blk.19.attn_q.weight - [ 4096, 4096, 1, 1], type = f16, quantizing to q5_K .. size = 32.00 MiB -> 11.00 MiB\n","[ 181/ 291] blk.19.attn_v.weight - [ 4096, 4096, 1, 1], type = f16, quantizing to q5_K .. 
size = 32.00 MiB -> 11.00 MiB\n","[ 182/ 291] blk.20.attn_norm.weight - [ 4096, 1, 1, 1], type = f32, size = 0.016 MB\n","[ 183/ 291] blk.20.ffn_down.weight - [11008, 4096, 1, 1], type = f16, quantizing to q5_K .. size = 86.00 MiB -> 29.56 MiB\n","[ 184/ 291] blk.20.ffn_gate.weight - [ 4096, 11008, 1, 1], type = f16, quantizing to q5_K .. size = 86.00 MiB -> 29.56 MiB\n","[ 185/ 291] blk.20.ffn_up.weight - [ 4096, 11008, 1, 1], type = f16, quantizing to q5_K .. size = 86.00 MiB -> 29.56 MiB\n","[ 186/ 291] blk.20.ffn_norm.weight - [ 4096, 1, 1, 1], type = f32, size = 0.016 MB\n","[ 187/ 291] blk.20.attn_k.weight - [ 4096, 4096, 1, 1], type = f16, quantizing to q5_K .. size = 32.00 MiB -> 11.00 MiB\n","[ 188/ 291] blk.20.attn_output.weight - [ 4096, 4096, 1, 1], type = f16, quantizing to q5_K .. size = 32.00 MiB -> 11.00 MiB\n","[ 189/ 291] blk.20.attn_q.weight - [ 4096, 4096, 1, 1], type = f16, quantizing to q5_K .. size = 32.00 MiB -> 11.00 MiB\n","[ 190/ 291] blk.20.attn_v.weight - [ 4096, 4096, 1, 1], type = f16, quantizing to q5_K .. size = 32.00 MiB -> 11.00 MiB\n","[ 191/ 291] blk.21.attn_norm.weight - [ 4096, 1, 1, 1], type = f32, size = 0.016 MB\n","[ 192/ 291] blk.21.ffn_down.weight - [11008, 4096, 1, 1], type = f16, quantizing to q6_K .. size = 86.00 MiB -> 35.27 MiB\n","[ 193/ 291] blk.21.ffn_gate.weight - [ 4096, 11008, 1, 1], type = f16, quantizing to q5_K .. size = 86.00 MiB -> 29.56 MiB\n","[ 194/ 291] blk.21.ffn_up.weight - [ 4096, 11008, 1, 1], type = f16, quantizing to q5_K .. size = 86.00 MiB -> 29.56 MiB\n","[ 195/ 291] blk.21.ffn_norm.weight - [ 4096, 1, 1, 1], type = f32, size = 0.016 MB\n","[ 196/ 291] blk.21.attn_k.weight - [ 4096, 4096, 1, 1], type = f16, quantizing to q5_K .. size = 32.00 MiB -> 11.00 MiB\n","[ 197/ 291] blk.21.attn_output.weight - [ 4096, 4096, 1, 1], type = f16, quantizing to q5_K .. size = 32.00 MiB -> 11.00 MiB\n","[ 198/ 291] blk.21.attn_q.weight - [ 4096, 4096, 1, 1], type = f16, quantizing to q5_K .. size = 32.00 MiB -> 11.00 MiB\n","[ 199/ 291] blk.21.attn_v.weight - [ 4096, 4096, 1, 1], type = f16, quantizing to q6_K .. size = 32.00 MiB -> 13.12 MiB\n","[ 200/ 291] blk.22.attn_norm.weight - [ 4096, 1, 1, 1], type = f32, size = 0.016 MB\n","[ 201/ 291] blk.22.ffn_down.weight - [11008, 4096, 1, 1], type = f16, quantizing to q5_K .. size = 86.00 MiB -> 29.56 MiB\n","[ 202/ 291] blk.22.ffn_gate.weight - [ 4096, 11008, 1, 1], type = f16, quantizing to q5_K .. size = 86.00 MiB -> 29.56 MiB\n","[ 203/ 291] blk.22.ffn_up.weight - [ 4096, 11008, 1, 1], type = f16, quantizing to q5_K .. size = 86.00 MiB -> 29.56 MiB\n","[ 204/ 291] blk.22.ffn_norm.weight - [ 4096, 1, 1, 1], type = f32, size = 0.016 MB\n","[ 205/ 291] blk.22.attn_k.weight - [ 4096, 4096, 1, 1], type = f16, quantizing to q5_K .. size = 32.00 MiB -> 11.00 MiB\n","[ 206/ 291] blk.22.attn_output.weight - [ 4096, 4096, 1, 1], type = f16, quantizing to q5_K .. size = 32.00 MiB -> 11.00 MiB\n","[ 207/ 291] blk.22.attn_q.weight - [ 4096, 4096, 1, 1], type = f16, quantizing to q5_K .. size = 32.00 MiB -> 11.00 MiB\n","[ 208/ 291] blk.22.attn_v.weight - [ 4096, 4096, 1, 1], type = f16, quantizing to q5_K .. size = 32.00 MiB -> 11.00 MiB\n","[ 209/ 291] blk.23.ffn_gate.weight - [ 4096, 11008, 1, 1], type = f16, quantizing to q5_K .. size = 86.00 MiB -> 29.56 MiB\n","[ 210/ 291] blk.23.ffn_up.weight - [ 4096, 11008, 1, 1], type = f16, quantizing to q5_K .. size = 86.00 MiB -> 29.56 MiB\n","[ 211/ 291] blk.23.attn_k.weight - [ 4096, 4096, 1, 1], type = f16, quantizing to q5_K .. 
size = 32.00 MiB -> 11.00 MiB\n","[ 212/ 291] blk.23.attn_output.weight - [ 4096, 4096, 1, 1], type = f16, quantizing to q5_K .. size = 32.00 MiB -> 11.00 MiB\n","[ 213/ 291] blk.23.attn_q.weight - [ 4096, 4096, 1, 1], type = f16, quantizing to q5_K .. size = 32.00 MiB -> 11.00 MiB\n","[ 214/ 291] blk.23.attn_v.weight - [ 4096, 4096, 1, 1], type = f16, quantizing to q5_K .. size = 32.00 MiB -> 11.00 MiB\n","[ 215/ 291] output.weight - [ 4096, 32000, 1, 1], type = f16, quantizing to q6_K .. size = 250.00 MiB -> 102.54 MiB\n","[ 216/ 291] blk.23.attn_norm.weight - [ 4096, 1, 1, 1], type = f32, size = 0.016 MB\n","[ 217/ 291] blk.23.ffn_down.weight - [11008, 4096, 1, 1], type = f16, quantizing to q5_K .. size = 86.00 MiB -> 29.56 MiB\n","[ 218/ 291] blk.23.ffn_norm.weight - [ 4096, 1, 1, 1], type = f32, size = 0.016 MB\n","[ 219/ 291] blk.24.attn_norm.weight - [ 4096, 1, 1, 1], type = f32, size = 0.016 MB\n","[ 220/ 291] blk.24.ffn_down.weight - [11008, 4096, 1, 1], type = f16, quantizing to q6_K .. size = 86.00 MiB -> 35.27 MiB\n","[ 221/ 291] blk.24.ffn_gate.weight - [ 4096, 11008, 1, 1], type = f16, quantizing to q5_K .. size = 86.00 MiB -> 29.56 MiB\n","[ 222/ 291] blk.24.ffn_up.weight - [ 4096, 11008, 1, 1], type = f16, quantizing to q5_K .. size = 86.00 MiB -> 29.56 MiB\n","[ 223/ 291] blk.24.ffn_norm.weight - [ 4096, 1, 1, 1], type = f32, size = 0.016 MB\n","[ 224/ 291] blk.24.attn_k.weight - [ 4096, 4096, 1, 1], type = f16, quantizing to q5_K .. size = 32.00 MiB -> 11.00 MiB\n","[ 225/ 291] blk.24.attn_output.weight - [ 4096, 4096, 1, 1], type = f16, quantizing to q5_K .. size = 32.00 MiB -> 11.00 MiB\n","[ 226/ 291] blk.24.attn_q.weight - [ 4096, 4096, 1, 1], type = f16, quantizing to q5_K .. size = 32.00 MiB -> 11.00 MiB\n","[ 227/ 291] blk.24.attn_v.weight - [ 4096, 4096, 1, 1], type = f16, quantizing to q6_K .. size = 32.00 MiB -> 13.12 MiB\n","[ 228/ 291] blk.25.attn_norm.weight - [ 4096, 1, 1, 1], type = f32, size = 0.016 MB\n","[ 229/ 291] blk.25.ffn_down.weight - [11008, 4096, 1, 1], type = f16, quantizing to q5_K .. size = 86.00 MiB -> 29.56 MiB\n","[ 230/ 291] blk.25.ffn_gate.weight - [ 4096, 11008, 1, 1], type = f16, quantizing to q5_K .. size = 86.00 MiB -> 29.56 MiB\n","[ 231/ 291] blk.25.ffn_up.weight - [ 4096, 11008, 1, 1], type = f16, quantizing to q5_K .. size = 86.00 MiB -> 29.56 MiB\n","[ 232/ 291] blk.25.ffn_norm.weight - [ 4096, 1, 1, 1], type = f32, size = 0.016 MB\n","[ 233/ 291] blk.25.attn_k.weight - [ 4096, 4096, 1, 1], type = f16, quantizing to q5_K .. size = 32.00 MiB -> 11.00 MiB\n","[ 234/ 291] blk.25.attn_output.weight - [ 4096, 4096, 1, 1], type = f16, quantizing to q5_K .. size = 32.00 MiB -> 11.00 MiB\n","[ 235/ 291] blk.25.attn_q.weight - [ 4096, 4096, 1, 1], type = f16, quantizing to q5_K .. size = 32.00 MiB -> 11.00 MiB\n","[ 236/ 291] blk.25.attn_v.weight - [ 4096, 4096, 1, 1], type = f16, quantizing to q5_K .. size = 32.00 MiB -> 11.00 MiB\n","[ 237/ 291] blk.26.attn_norm.weight - [ 4096, 1, 1, 1], type = f32, size = 0.016 MB\n","[ 238/ 291] blk.26.ffn_down.weight - [11008, 4096, 1, 1], type = f16, quantizing to q5_K .. size = 86.00 MiB -> 29.56 MiB\n","[ 239/ 291] blk.26.ffn_gate.weight - [ 4096, 11008, 1, 1], type = f16, quantizing to q5_K .. size = 86.00 MiB -> 29.56 MiB\n","[ 240/ 291] blk.26.ffn_up.weight - [ 4096, 11008, 1, 1], type = f16, quantizing to q5_K .. 
size = 86.00 MiB -> 29.56 MiB\n","[ 241/ 291] blk.26.ffn_norm.weight - [ 4096, 1, 1, 1], type = f32, size = 0.016 MB\n","[ 242/ 291] blk.26.attn_k.weight - [ 4096, 4096, 1, 1], type = f16, quantizing to q5_K .. size = 32.00 MiB -> 11.00 MiB\n","[ 243/ 291] blk.26.attn_output.weight - [ 4096, 4096, 1, 1], type = f16, quantizing to q5_K .. size = 32.00 MiB -> 11.00 MiB\n","[ 244/ 291] blk.26.attn_q.weight - [ 4096, 4096, 1, 1], type = f16, quantizing to q5_K .. size = 32.00 MiB -> 11.00 MiB\n","[ 245/ 291] blk.26.attn_v.weight - [ 4096, 4096, 1, 1], type = f16, quantizing to q5_K .. size = 32.00 MiB -> 11.00 MiB\n","[ 246/ 291] blk.27.attn_norm.weight - [ 4096, 1, 1, 1], type = f32, size = 0.016 MB\n","[ 247/ 291] blk.27.ffn_down.weight - [11008, 4096, 1, 1], type = f16, quantizing to q6_K .. size = 86.00 MiB -> 35.27 MiB\n","[ 248/ 291] blk.27.ffn_gate.weight - [ 4096, 11008, 1, 1], type = f16, quantizing to q5_K .. size = 86.00 MiB -> 29.56 MiB\n","[ 249/ 291] blk.27.ffn_up.weight - [ 4096, 11008, 1, 1], type = f16, quantizing to q5_K .. size = 86.00 MiB -> 29.56 MiB\n","[ 250/ 291] blk.27.ffn_norm.weight - [ 4096, 1, 1, 1], type = f32, size = 0.016 MB\n","[ 251/ 291] blk.27.attn_k.weight - [ 4096, 4096, 1, 1], type = f16, quantizing to q5_K .. size = 32.00 MiB -> 11.00 MiB\n","[ 252/ 291] blk.27.attn_output.weight - [ 4096, 4096, 1, 1], type = f16, quantizing to q5_K .. size = 32.00 MiB -> 11.00 MiB\n","[ 253/ 291] blk.27.attn_q.weight - [ 4096, 4096, 1, 1], type = f16, quantizing to q5_K .. size = 32.00 MiB -> 11.00 MiB\n","[ 254/ 291] blk.27.attn_v.weight - [ 4096, 4096, 1, 1], type = f16, quantizing to q6_K .. size = 32.00 MiB -> 13.12 MiB\n","[ 255/ 291] blk.28.attn_norm.weight - [ 4096, 1, 1, 1], type = f32, size = 0.016 MB\n","[ 256/ 291] blk.28.ffn_down.weight - [11008, 4096, 1, 1], type = f16, quantizing to q6_K .. size = 86.00 MiB -> 35.27 MiB\n","[ 257/ 291] blk.28.ffn_gate.weight - [ 4096, 11008, 1, 1], type = f16, quantizing to q5_K .. size = 86.00 MiB -> 29.56 MiB\n","[ 258/ 291] blk.28.ffn_up.weight - [ 4096, 11008, 1, 1], type = f16, quantizing to q5_K .. size = 86.00 MiB -> 29.56 MiB\n","[ 259/ 291] blk.28.ffn_norm.weight - [ 4096, 1, 1, 1], type = f32, size = 0.016 MB\n","[ 260/ 291] blk.28.attn_k.weight - [ 4096, 4096, 1, 1], type = f16, quantizing to q5_K .. size = 32.00 MiB -> 11.00 MiB\n","[ 261/ 291] blk.28.attn_output.weight - [ 4096, 4096, 1, 1], type = f16, quantizing to q5_K .. size = 32.00 MiB -> 11.00 MiB\n","[ 262/ 291] blk.28.attn_q.weight - [ 4096, 4096, 1, 1], type = f16, quantizing to q5_K .. size = 32.00 MiB -> 11.00 MiB\n","[ 263/ 291] blk.28.attn_v.weight - [ 4096, 4096, 1, 1], type = f16, quantizing to q6_K .. size = 32.00 MiB -> 13.12 MiB\n","[ 264/ 291] blk.29.attn_norm.weight - [ 4096, 1, 1, 1], type = f32, size = 0.016 MB\n","[ 265/ 291] blk.29.ffn_down.weight - [11008, 4096, 1, 1], type = f16, quantizing to q6_K .. size = 86.00 MiB -> 35.27 MiB\n","[ 266/ 291] blk.29.ffn_gate.weight - [ 4096, 11008, 1, 1], type = f16, quantizing to q5_K .. size = 86.00 MiB -> 29.56 MiB\n","[ 267/ 291] blk.29.ffn_up.weight - [ 4096, 11008, 1, 1], type = f16, quantizing to q5_K .. size = 86.00 MiB -> 29.56 MiB\n","[ 268/ 291] blk.29.ffn_norm.weight - [ 4096, 1, 1, 1], type = f32, size = 0.016 MB\n","[ 269/ 291] blk.29.attn_k.weight - [ 4096, 4096, 1, 1], type = f16, quantizing to q5_K .. size = 32.00 MiB -> 11.00 MiB\n","[ 270/ 291] blk.29.attn_output.weight - [ 4096, 4096, 1, 1], type = f16, quantizing to q5_K .. 
size = 32.00 MiB -> 11.00 MiB\n","[ 271/ 291] blk.29.attn_q.weight - [ 4096, 4096, 1, 1], type = f16, quantizing to q5_K .. size = 32.00 MiB -> 11.00 MiB\n","[ 272/ 291] blk.29.attn_v.weight - [ 4096, 4096, 1, 1], type = f16, quantizing to q6_K .. size = 32.00 MiB -> 13.12 MiB\n","[ 273/ 291] blk.30.attn_norm.weight - [ 4096, 1, 1, 1], type = f32, size = 0.016 MB\n","[ 274/ 291] blk.30.ffn_down.weight - [11008, 4096, 1, 1], type = f16, quantizing to q6_K .. size = 86.00 MiB -> 35.27 MiB\n","[ 275/ 291] blk.30.ffn_gate.weight - [ 4096, 11008, 1, 1], type = f16, quantizing to q5_K .. size = 86.00 MiB -> 29.56 MiB\n","[ 276/ 291] blk.30.ffn_up.weight - [ 4096, 11008, 1, 1], type = f16, quantizing to q5_K .. size = 86.00 MiB -> 29.56 MiB\n","[ 277/ 291] blk.30.ffn_norm.weight - [ 4096, 1, 1, 1], type = f32, size = 0.016 MB\n","[ 278/ 291] blk.30.attn_k.weight - [ 4096, 4096, 1, 1], type = f16, quantizing to q5_K .. size = 32.00 MiB -> 11.00 MiB\n","[ 279/ 291] blk.30.attn_output.weight - [ 4096, 4096, 1, 1], type = f16, quantizing to q5_K .. size = 32.00 MiB -> 11.00 MiB\n","[ 280/ 291] blk.30.attn_q.weight - [ 4096, 4096, 1, 1], type = f16, quantizing to q5_K .. size = 32.00 MiB -> 11.00 MiB\n","[ 281/ 291] blk.30.attn_v.weight - [ 4096, 4096, 1, 1], type = f16, quantizing to q6_K .. size = 32.00 MiB -> 13.12 MiB\n","[ 282/ 291] blk.31.attn_norm.weight - [ 4096, 1, 1, 1], type = f32, size = 0.016 MB\n","[ 283/ 291] blk.31.ffn_down.weight - [11008, 4096, 1, 1], type = f16, quantizing to q6_K .. size = 86.00 MiB -> 35.27 MiB\n","[ 284/ 291] blk.31.ffn_gate.weight - [ 4096, 11008, 1, 1], type = f16, quantizing to q5_K .. size = 86.00 MiB -> 29.56 MiB\n","[ 285/ 291] blk.31.ffn_up.weight - [ 4096, 11008, 1, 1], type = f16, quantizing to q5_K .. size = 86.00 MiB -> 29.56 MiB\n","[ 286/ 291] blk.31.ffn_norm.weight - [ 4096, 1, 1, 1], type = f32, size = 0.016 MB\n","[ 287/ 291] blk.31.attn_k.weight - [ 4096, 4096, 1, 1], type = f16, quantizing to q5_K .. size = 32.00 MiB -> 11.00 MiB\n","[ 288/ 291] blk.31.attn_output.weight - [ 4096, 4096, 1, 1], type = f16, quantizing to q5_K .. size = 32.00 MiB -> 11.00 MiB\n","[ 289/ 291] blk.31.attn_q.weight - [ 4096, 4096, 1, 1], type = f16, quantizing to q5_K .. size = 32.00 MiB -> 11.00 MiB\n","[ 290/ 291] blk.31.attn_v.weight - [ 4096, 4096, 1, 1], type = f16, quantizing to q6_K .. size = 32.00 MiB -> 13.12 MiB\n","[ 291/ 291] output_norm.weight - [ 4096, 1, 1, 1], type = f32, size = 0.016 MB\n","llama_model_quantize_internal: model size = 12853.02 MB\n","llama_model_quantize_internal: quant size = 4560.87 MB\n","\n","main: quantize time = 667210.52 ms\n","main: total time = 667210.52 ms\n"]}]},{"cell_type":"markdown","source":["## Run inference\n","\n","Below is a script to run our quantized model. We are offloading every layer to the GPU (33 for a 7b parameter model) to speed up inference."],"metadata":{"id":"WqI1CPiXI4dP"}},{"cell_type":"code","source":["# Run text generation using a specific quantized model in llama.cpp.\n","# 1. Prompt the user to enter text for the model to process.\n","# 2. Construct the model file path ('qtype') using MODEL_NAME and a specified quantization method.\n","# 3. 
Execute the llama.cpp main program with the constructed model path,\n","# setting the number of tokens to generate, enabling colored output, offloading model layers\n","# to the GPU with -ngl, and using the user-provided prompt.\n","\n","prompt = input(\"Enter your prompt: \")\n","\n","# Construct the path to the model file with the quantization method 'Q5_K_M'\n","qtype = f\"{MODEL_NAME}/{MODEL_NAME.lower()}.Q5_K_M.gguf\"\n","\n","# Execute the llama.cpp main program with specified parameters\n","!./llama.cpp/main -m {qtype} -n 128 --color -ngl 35 -p \"{prompt}\""],"metadata":{"id":"vNPL9WYg78l-","colab":{"base_uri":"https://localhost:8080/"},"executionInfo":{"status":"ok","timestamp":1706683934968,"user_tz":-480,"elapsed":18438,"user":{"displayName":"szehanz","userId":"16137883221268059572"}},"outputId":"dcb93f10-a5d6-40ac-b2c2-321565171595"},"execution_count":null,"outputs":[{"output_type":"stream","name":"stdout","text":["Enter your prompt: what is cnn?\n","Log start\n","main: build = 2029 (d62520eb)\n","main: built with cc (Ubuntu 11.4.0-1ubuntu1~22.04) 11.4.0 for x86_64-linux-gnu\n","main: seed = 1706683926\n","ggml_init_cublas: GGML_CUDA_FORCE_MMQ: no\n","ggml_init_cublas: CUDA_USE_TENSOR_CORES: yes\n","ggml_init_cublas: found 1 CUDA devices:\n"," Device 0: Tesla T4, compute capability 7.5, VMM: yes\n","llama_model_loader: loaded meta data with 24 key-value pairs and 291 tensors from llama-2-7b-mini-ibased/llama-2-7b-mini-ibased.Q5_K_M.gguf (version GGUF V3 (latest))\n","llama_model_loader: Dumping metadata keys/values. Note: KV overrides do not apply in this output.\n","llama_model_loader: - kv 0: general.architecture str = llama\n","llama_model_loader: - kv 1: general.name str = LLaMA v2\n","llama_model_loader: - kv 2: llama.context_length u32 = 4096\n","llama_model_loader: - kv 3: llama.embedding_length u32 = 4096\n","llama_model_loader: - kv 4: llama.block_count u32 = 32\n","llama_model_loader: - kv 5: llama.feed_forward_length u32 = 11008\n","llama_model_loader: - kv 6: llama.rope.dimension_count u32 = 128\n","llama_model_loader: - kv 7: llama.attention.head_count u32 = 32\n","llama_model_loader: - kv 8: llama.attention.head_count_kv u32 = 32\n","llama_model_loader: - kv 9: llama.attention.layer_norm_rms_epsilon f32 = 0.000010\n","llama_model_loader: - kv 10: llama.rope.freq_base f32 = 10000.000000\n","llama_model_loader: - kv 11: general.file_type u32 = 17\n","llama_model_loader: - kv 12: tokenizer.ggml.model str = llama\n","llama_model_loader: - kv 13: tokenizer.ggml.tokens arr[str,32000] = [\"\", \"\", \"\", \"<0x00>\", \"<...\n","llama_model_loader: - kv 14: tokenizer.ggml.scores arr[f32,32000] = [0.000000, 0.000000, 0.000000, 0.0000...\n","llama_model_loader: - kv 15: tokenizer.ggml.token_type arr[i32,32000] = [2, 3, 3, 6, 6, 6, 6, 6, 6, 6, 6, 6, ...\n","llama_model_loader: - kv 16: tokenizer.ggml.bos_token_id u32 = 1\n","llama_model_loader: - kv 17: tokenizer.ggml.eos_token_id u32 = 2\n","llama_model_loader: - kv 18: tokenizer.ggml.unknown_token_id u32 = 0\n","llama_model_loader: - kv 19: tokenizer.ggml.padding_token_id u32 = 2\n","llama_model_loader: - kv 20: tokenizer.ggml.add_bos_token bool = true\n","llama_model_loader: - kv 21: tokenizer.ggml.add_eos_token bool = false\n","llama_model_loader: - kv 22: tokenizer.chat_template str = {% if messages[0]['role'] == 'system'...\n","llama_model_loader: - kv 23: general.quantization_version u32 = 2\n","llama_model_loader: - type f32: 65 tensors\n","llama_model_loader: - type q5_K: 193 tensors\n","llama_model_loader: - type q6_K: 33 
tensors\n","llm_load_vocab: special tokens definition check successful ( 259/32000 ).\n","llm_load_print_meta: format = GGUF V3 (latest)\n","llm_load_print_meta: arch = llama\n","llm_load_print_meta: vocab type = SPM\n","llm_load_print_meta: n_vocab = 32000\n","llm_load_print_meta: n_merges = 0\n","llm_load_print_meta: n_ctx_train = 4096\n","llm_load_print_meta: n_embd = 4096\n","llm_load_print_meta: n_head = 32\n","llm_load_print_meta: n_head_kv = 32\n","llm_load_print_meta: n_layer = 32\n","llm_load_print_meta: n_rot = 128\n","llm_load_print_meta: n_embd_head_k = 128\n","llm_load_print_meta: n_embd_head_v = 128\n","llm_load_print_meta: n_gqa = 1\n","llm_load_print_meta: n_embd_k_gqa = 4096\n","llm_load_print_meta: n_embd_v_gqa = 4096\n","llm_load_print_meta: f_norm_eps = 0.0e+00\n","llm_load_print_meta: f_norm_rms_eps = 1.0e-05\n","llm_load_print_meta: f_clamp_kqv = 0.0e+00\n","llm_load_print_meta: f_max_alibi_bias = 0.0e+00\n","llm_load_print_meta: n_ff = 11008\n","llm_load_print_meta: n_expert = 0\n","llm_load_print_meta: n_expert_used = 0\n","llm_load_print_meta: rope scaling = linear\n","llm_load_print_meta: freq_base_train = 10000.0\n","llm_load_print_meta: freq_scale_train = 1\n","llm_load_print_meta: n_yarn_orig_ctx = 4096\n","llm_load_print_meta: rope_finetuned = unknown\n","llm_load_print_meta: model type = 7B\n","llm_load_print_meta: model ftype = Q5_K - Medium\n","llm_load_print_meta: model params = 6.74 B\n","llm_load_print_meta: model size = 4.45 GiB (5.68 BPW) \n","llm_load_print_meta: general.name = LLaMA v2\n","llm_load_print_meta: BOS token = 1 ''\n","llm_load_print_meta: EOS token = 2 ''\n","llm_load_print_meta: UNK token = 0 ''\n","llm_load_print_meta: PAD token = 2 ''\n","llm_load_print_meta: LF token = 13 '<0x0A>'\n","llm_load_tensors: ggml ctx size = 0.22 MiB\n","llm_load_tensors: offloading 32 repeating layers to GPU\n","llm_load_tensors: offloading non-repeating layers to GPU\n","llm_load_tensors: offloaded 33/33 layers to GPU\n","llm_load_tensors: CPU buffer size = 85.94 MiB\n","llm_load_tensors: CUDA0 buffer size = 4474.94 MiB\n","..................................................................................................\n","llama_new_context_with_model: n_ctx = 512\n","llama_new_context_with_model: freq_base = 10000.0\n","llama_new_context_with_model: freq_scale = 1\n","llama_kv_cache_init: CUDA0 KV buffer size = 256.00 MiB\n","llama_new_context_with_model: KV self size = 256.00 MiB, K (f16): 128.00 MiB, V (f16): 128.00 MiB\n","llama_new_context_with_model: CUDA_Host input buffer size = 9.01 MiB\n","llama_new_context_with_model: CUDA0 compute buffer size = 77.55 MiB\n","llama_new_context_with_model: CUDA_Host compute buffer size = 8.80 MiB\n","llama_new_context_with_model: graph splits (measure): 3\n","\n","system_info: n_threads = 2 / 2 | AVX = 1 | AVX_VNNI = 0 | AVX2 = 1 | AVX512 = 1 | AVX512_VBMI = 0 | AVX512_VNNI = 0 | FMA = 1 | NEON = 0 | ARM_FMA = 0 | F16C = 1 | FP16_VA = 0 | WASM_SIMD = 0 | BLAS = 1 | SSE3 = 1 | SSSE3 = 1 | VSX = 0 | \n","sampling: \n","\trepeat_last_n = 64, repeat_penalty = 1.100, frequency_penalty = 0.000, presence_penalty = 0.000\n","\ttop_k = 40, tfs_z = 1.000, top_p = 0.950, min_p = 0.050, typical_p = 1.000, temp = 0.800\n","\tmirostat = 0, mirostat_lr = 0.100, mirostat_ent = 5.000\n","sampling order: \n","CFG -> Penalties -> top_k -> tfs_z -> typical_p -> top_p -> min_p -> temp \n","generate: n_ctx = 512, n_batch = 512, n_predict = 128, n_keep = 0\n","\n","\n","\u001b[33m what is cnn?\u001b[0m\n","Summarization:\n","\n","A 
Convolutional Neural Network (CNN) is a type of neural network architecture that is commonly used for image and video analysis, as well as processing sequential data. It consists of multiple layers of convolutional and pooling layers, followed by fully connected layers, which are used to make predictions or classifications\n","\n","Response:\n","\n","A CNN, or Convolutional Neural Network, is a type of neural network architecture that is specifically designed for image and video analysis, as well as processing sequential data. It consists of multiple layers of convolutional and pooling layers, followed by\n","llama_print_timings: load time = 2168.11 ms\n","llama_print_timings: sample time = 72.39 ms / 128 runs ( 0.57 ms per token, 1768.10 tokens per second)\n","llama_print_timings: prompt eval time = 124.08 ms / 6 tokens ( 20.68 ms per token, 48.36 tokens per second)\n","llama_print_timings: eval time = 3479.87 ms / 127 runs ( 27.40 ms per token, 36.50 tokens per second)\n","llama_print_timings: total time = 3723.67 ms / 133 tokens\n","Log end\n"]}]},{"cell_type":"markdown","source":["## Push to hub"],"metadata":{"id":"Ar8pO7bb80US"}},{"cell_type":"code","source":["# Create a new model repository on Hugging Face and upload gguf files.\n","# 1. Initialize the HfApi object to interact with Hugging Face's API.\n","# 2. Define the username associated with the Hugging Face account.\n","# 3. Use create_repo to create an empty repository for the model,\n","# allowing for the repository to exist already with exist_ok=True.\n","# 4. Upload all gguf files from the local MODEL_NAME directory to the newly\n","# created repository on Hugging Face, using upload_folder with a filter\n","# to only include files with a .gguf extension.\n","\n","\n","api = HfApi()\n","username = \"ssoh\"\n","\n","\n","# Create an empty repository on Hugging Face\n","create_repo(\n"," repo_id=f\"{username}/{MODEL_NAME}-GGUF\",\n"," repo_type=\"model\",\n"," exist_ok=True,\n",")\n","\n","\n","# Upload gguf model files to the repository\n","api.upload_folder(\n"," folder_path=MODEL_NAME,\n"," repo_id=f\"{username}/{MODEL_NAME}-GGUF\",\n"," allow_patterns=\"*.gguf\",\n",")"],"metadata":{"id":"UOyKfUD-8jmh","colab":{"base_uri":"https://localhost:8080/","height":84,"referenced_widgets":["fdc7ab4f9b194f86b17aa4a7dfe12f0d","e1f29d859e454759bf4bde58b7a9041d","972c93dfb77c45ac8edc2e21cde1028d","4222f145bc124fdda642d6cc15f91af8","33b2f83edd9348eaa224ff52e6df7637","5374e86303054e2abb2a83ceb3192425","d56de82a661343ca9b744bf7edb56a5e","2deeb479e7104c949e935652a06fa01a","c6877fade6a647f8bd53cdcb1ec19e11","d89dfe49802b48d0a554b5e047bec54b","f2964d0cc62b44f7a6dab6b1889596d0"]},"executionInfo":{"status":"ok","timestamp":1706684339726,"user_tz":-480,"elapsed":269756,"user":{"displayName":"szehanz","userId":"16137883221268059572"}},"outputId":"af6c96c4-f57f-46df-a384-81ab597c81c4"},"execution_count":null,"outputs":[{"output_type":"display_data","data":{"text/plain":["llama-2-7b-mini-ibased.Q5_K_M.gguf: 0%| | 0.00/4.78G [00:00, ?B/s]"],"application/vnd.jupyter.widget-view+json":{"version_major":2,"version_minor":0,"model_id":"fdc7ab4f9b194f86b17aa4a7dfe12f0d"}},"metadata":{}},{"output_type":"execute_result","data":{"text/plain":["CommitInfo(commit_url='https://huggingface.co/ssoh/llama-2-7b-mini-ibased-GGUF/commit/7df8b43b589d1f7f28125efa73c0d79c7c6d5941', commit_message='Upload folder using huggingface_hub', commit_description='', oid='7df8b43b589d1f7f28125efa73c0d79c7c6d5941', pr_url=None, pr_revision=None, 
pr_num=None)"],"application/vnd.google.colaboratory.intrinsic+json":{"type":"string"}},"metadata":{},"execution_count":26}]},{"cell_type":"markdown","source":["# **Test run the GGUF model**\n","\n"],"metadata":{"id":"AeIF-t3z3RhM"}},{"cell_type":"code","execution_count":null,"metadata":{"id":"7451dc54-0f6b-4855-8211-c8ff494935dd"},"outputs":[],"source":["import os\n","from urllib.parse import urlparse"]},{"cell_type":"code","execution_count":null,"metadata":{"colab":{"base_uri":"https://localhost:8080/"},"id":"6e67a9d6-10f3-4a1c-942c-391b546ec247","executionInfo":{"status":"ok","timestamp":1708260569297,"user_tz":-480,"elapsed":82563,"user":{"displayName":"szehanz","userId":"16137883221268059572"}},"outputId":"eb94faca-31ca-4e49-8c9a-ec0472a83de6"},"outputs":[{"output_type":"stream","name":"stdout","text":["\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m815.9/815.9 kB\u001b[0m \u001b[31m11.1 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n","\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m36.6/36.6 MB\u001b[0m \u001b[31m33.8 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n","\u001b[?25h Installing build dependencies ... \u001b[?25l\u001b[?25hdone\n"," Getting requirements to build wheel ... \u001b[?25l\u001b[?25hdone\n"," Installing backend dependencies ... \u001b[?25l\u001b[?25hdone\n"," Preparing metadata (pyproject.toml) ... \u001b[?25l\u001b[?25hdone\n","\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m1.7/1.7 MB\u001b[0m \u001b[31m81.1 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n","\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m241.2/241.2 kB\u001b[0m \u001b[31m28.0 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n","\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m56.5/56.5 kB\u001b[0m \u001b[31m7.9 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n","\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m49.4/49.4 kB\u001b[0m \u001b[31m6.5 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n","\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m55.4/55.4 kB\u001b[0m \u001b[31m6.4 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n","\u001b[?25h Building wheel for llama-cpp-python (pyproject.toml) ... \u001b[?25l\u001b[?25hdone\n"]}],"source":["!pip -q install langchain llama-cpp-python"]},{"cell_type":"code","execution_count":null,"metadata":{"id":"b83d199f-4bd9-47bd-8be6-f9d0c802c570"},"outputs":[],"source":["# URL from which you're downloading the model\n","url = \"https://huggingface.co/BitBasher/llama-2-7b-mini-ibased-GGUF/resolve/main/llama-2-7b-mini-ibased.Q5_K_M.gguf\"\n"]},{"cell_type":"code","execution_count":null,"metadata":{"colab":{"base_uri":"https://localhost:8080/"},"id":"ab25420c-aef0-48f2-a602-2fe89d6e64b3","executionInfo":{"status":"ok","timestamp":1708260760574,"user_tz":-480,"elapsed":186914,"user":{"displayName":"szehanz","userId":"16137883221268059572"}},"outputId":"bcb8f0b0-470d-44ee-a053-552121ce87ab"},"outputs":[{"output_type":"stream","name":"stdout","text":["--2024-02-18 12:49:32-- https://huggingface.co/ssoh/llama-2-7b-mini-ibased-GGUF/resolve/main/llama-2-7b-mini-ibased.Q5_K_M.gguf\n","Resolving huggingface.co (huggingface.co)... 13.35.7.38, 13.35.7.81, 13.35.7.57, ...\n","Connecting to huggingface.co (huggingface.co)|13.35.7.38|:443... connected.\n","HTTP request sent, awaiting response... 
307 Temporary Redirect\n","Location: /BitBasher/llama-2-7b-mini-ibased-GGUF/resolve/main/llama-2-7b-mini-ibased.Q5_K_M.gguf [following]\n","--2024-02-18 12:49:32-- https://huggingface.co/BitBasher/llama-2-7b-mini-ibased-GGUF/resolve/main/llama-2-7b-mini-ibased.Q5_K_M.gguf\n","Reusing existing connection to huggingface.co:443.\n","HTTP request sent, awaiting response... 302 Found\n","Location: https://cdn-lfs-us-1.huggingface.co/repos/a7/16/a716a6f7d3f2fa140d2f0263054d2bc120c1eca46172da4411fa02e97e0236bc/1fad558a8c0c265b3f1ef73559d401fdde00a1945e632c8c7523c066002aac4a?response-content-disposition=attachment%3B+filename*%3DUTF-8%27%27llama-2-7b-mini-ibased.Q5_K_M.gguf%3B+filename%3D%22llama-2-7b-mini-ibased.Q5_K_M.gguf%22%3B&Expires=1708519772&Policy=eyJTdGF0ZW1lbnQiOlt7IkNvbmRpdGlvbiI6eyJEYXRlTGVzc1RoYW4iOnsiQVdTOkVwb2NoVGltZSI6MTcwODUxOTc3Mn19LCJSZXNvdXJjZSI6Imh0dHBzOi8vY2RuLWxmcy11cy0xLmh1Z2dpbmdmYWNlLmNvL3JlcG9zL2E3LzE2L2E3MTZhNmY3ZDNmMmZhMTQwZDJmMDI2MzA1NGQyYmMxMjBjMWVjYTQ2MTcyZGE0NDExZmEwMmU5N2UwMjM2YmMvMWZhZDU1OGE4YzBjMjY1YjNmMWVmNzM1NTlkNDAxZmRkZTAwYTE5NDVlNjMyYzhjNzUyM2MwNjYwMDJhYWM0YT9yZXNwb25zZS1jb250ZW50LWRpc3Bvc2l0aW9uPSoifV19&Signature=Z4biIz80%7E4wHVRQfPsFCqKmqkWiy305CMhiGaR2aHWztCSbtx-3qpMo8pcuV7FwkidsUgWzaZHfqEqVpNElgivtDW3KJ9fbCKxR3PZBU1VHcT5N1LIoCRuvcPXxTooUbHVvij4AJHUe50kTUf7ZGKrmMbhsg2mTijeWyjWAjMOg1DgIO8%7EeWf0mrHMCRphgQWMaHKMEvztBw2NR9nTBJCuYddjES3xCTXiyBEIP3RmQRk2rW86RUoZ7EVImFbN%7E%7E1oVMsGmAom7C9wBTNFI5%7EkVVMInEzRhszmVbiBBW3i4YCwTPeWij2hf%7EwMDam0EdhpiKfvsGy9WUFEAri08PnQ__&Key-Pair-Id=KCD77M1F0VK2B [following]\n","--2024-02-18 12:49:33-- https://cdn-lfs-us-1.huggingface.co/repos/a7/16/a716a6f7d3f2fa140d2f0263054d2bc120c1eca46172da4411fa02e97e0236bc/1fad558a8c0c265b3f1ef73559d401fdde00a1945e632c8c7523c066002aac4a?response-content-disposition=attachment%3B+filename*%3DUTF-8%27%27llama-2-7b-mini-ibased.Q5_K_M.gguf%3B+filename%3D%22llama-2-7b-mini-ibased.Q5_K_M.gguf%22%3B&Expires=1708519772&Policy=eyJTdGF0ZW1lbnQiOlt7IkNvbmRpdGlvbiI6eyJEYXRlTGVzc1RoYW4iOnsiQVdTOkVwb2NoVGltZSI6MTcwODUxOTc3Mn19LCJSZXNvdXJjZSI6Imh0dHBzOi8vY2RuLWxmcy11cy0xLmh1Z2dpbmdmYWNlLmNvL3JlcG9zL2E3LzE2L2E3MTZhNmY3ZDNmMmZhMTQwZDJmMDI2MzA1NGQyYmMxMjBjMWVjYTQ2MTcyZGE0NDExZmEwMmU5N2UwMjM2YmMvMWZhZDU1OGE4YzBjMjY1YjNmMWVmNzM1NTlkNDAxZmRkZTAwYTE5NDVlNjMyYzhjNzUyM2MwNjYwMDJhYWM0YT9yZXNwb25zZS1jb250ZW50LWRpc3Bvc2l0aW9uPSoifV19&Signature=Z4biIz80%7E4wHVRQfPsFCqKmqkWiy305CMhiGaR2aHWztCSbtx-3qpMo8pcuV7FwkidsUgWzaZHfqEqVpNElgivtDW3KJ9fbCKxR3PZBU1VHcT5N1LIoCRuvcPXxTooUbHVvij4AJHUe50kTUf7ZGKrmMbhsg2mTijeWyjWAjMOg1DgIO8%7EeWf0mrHMCRphgQWMaHKMEvztBw2NR9nTBJCuYddjES3xCTXiyBEIP3RmQRk2rW86RUoZ7EVImFbN%7E%7E1oVMsGmAom7C9wBTNFI5%7EkVVMInEzRhszmVbiBBW3i4YCwTPeWij2hf%7EwMDam0EdhpiKfvsGy9WUFEAri08PnQ__&Key-Pair-Id=KCD77M1F0VK2B\n","Resolving cdn-lfs-us-1.huggingface.co (cdn-lfs-us-1.huggingface.co)... 13.35.35.127, 13.35.35.109, 13.35.35.21, ...\n","Connecting to cdn-lfs-us-1.huggingface.co (cdn-lfs-us-1.huggingface.co)|13.35.35.127|:443... connected.\n","HTTP request sent, awaiting response... 
200 OK\n","Length: 4783157952 (4.5G) [binary/octet-stream]\n","Saving to: ‘llama-2-7b-mini-ibased.Q5_K_M.gguf’\n","\n","llama-2-7b-mini-iba 100%[===================>] 4.45G 25.2MB/s in 3m 6s \n","\n","2024-02-18 12:52:39 (24.5 MB/s) - ‘llama-2-7b-mini-ibased.Q5_K_M.gguf’ saved [4783157952/4783157952]\n","\n"]}],"source":["!wget {url}"]},{"cell_type":"code","execution_count":null,"metadata":{"colab":{"base_uri":"https://localhost:8080/"},"id":"d93c335c-03ce-4afc-878a-0a8395d64ae0","executionInfo":{"status":"ok","timestamp":1708260760575,"user_tz":-480,"elapsed":13,"user":{"displayName":"szehanz","userId":"16137883221268059572"}},"outputId":"4ae0c885-50d3-4989-a064-b84cc70d9205"},"outputs":[{"output_type":"stream","name":"stdout","text":["llama-2-7b-mini-ibased.Q5_K_M.gguf\n","/content\n","/content/llama-2-7b-mini-ibased.Q5_K_M.gguf\n"]}],"source":["# Parse the URL to get the path, then split the path to get the filename\n","filename = os.path.basename(urlparse(url).path)\n","print (filename)\n","\n","# Get the current working directory\n","current_directory = os.getcwd()\n","print (current_directory)\n","\n","# Construct the model path with the current directory and the filename\n","model_path = os.path.join(current_directory, filename)\n","\n","print(model_path)"]},{"cell_type":"code","execution_count":null,"metadata":{"colab":{"base_uri":"https://localhost:8080/"},"id":"031c4b47-4fad-4cfd-bab0-25130f8f2ad9","executionInfo":{"status":"ok","timestamp":1708260837455,"user_tz":-480,"elapsed":2167,"user":{"displayName":"szehanz","userId":"16137883221268059572"}},"outputId":"591b593a-3817-4c9c-9430-ed08fa80adf1"},"outputs":[{"output_type":"stream","name":"stderr","text":["llama_model_loader: loaded meta data with 24 key-value pairs and 291 tensors from /content/llama-2-7b-mini-ibased.Q5_K_M.gguf (version GGUF V3 (latest))\n","llama_model_loader: Dumping metadata keys/values. 
Note: KV overrides do not apply in this output.\n","llama_model_loader: - kv 0: general.architecture str = llama\n","llama_model_loader: - kv 1: general.name str = LLaMA v2\n","llama_model_loader: - kv 2: llama.context_length u32 = 4096\n","llama_model_loader: - kv 3: llama.embedding_length u32 = 4096\n","llama_model_loader: - kv 4: llama.block_count u32 = 32\n","llama_model_loader: - kv 5: llama.feed_forward_length u32 = 11008\n","llama_model_loader: - kv 6: llama.rope.dimension_count u32 = 128\n","llama_model_loader: - kv 7: llama.attention.head_count u32 = 32\n","llama_model_loader: - kv 8: llama.attention.head_count_kv u32 = 32\n","llama_model_loader: - kv 9: llama.attention.layer_norm_rms_epsilon f32 = 0.000010\n","llama_model_loader: - kv 10: llama.rope.freq_base f32 = 10000.000000\n","llama_model_loader: - kv 11: general.file_type u32 = 17\n","llama_model_loader: - kv 12: tokenizer.ggml.model str = llama\n","llama_model_loader: - kv 13: tokenizer.ggml.tokens arr[str,32000] = [\"\", \"\", \"\", \"<0x00>\", \"<...\n","llama_model_loader: - kv 14: tokenizer.ggml.scores arr[f32,32000] = [0.000000, 0.000000, 0.000000, 0.0000...\n","llama_model_loader: - kv 15: tokenizer.ggml.token_type arr[i32,32000] = [2, 3, 3, 6, 6, 6, 6, 6, 6, 6, 6, 6, ...\n","llama_model_loader: - kv 16: tokenizer.ggml.bos_token_id u32 = 1\n","llama_model_loader: - kv 17: tokenizer.ggml.eos_token_id u32 = 2\n","llama_model_loader: - kv 18: tokenizer.ggml.unknown_token_id u32 = 0\n","llama_model_loader: - kv 19: tokenizer.ggml.padding_token_id u32 = 2\n","llama_model_loader: - kv 20: tokenizer.ggml.add_bos_token bool = true\n","llama_model_loader: - kv 21: tokenizer.ggml.add_eos_token bool = false\n","llama_model_loader: - kv 22: tokenizer.chat_template str = {% if messages[0]['role'] == 'system'...\n","llama_model_loader: - kv 23: general.quantization_version u32 = 2\n","llama_model_loader: - type f32: 65 tensors\n","llama_model_loader: - type q5_K: 193 tensors\n","llama_model_loader: - type q6_K: 33 tensors\n","llm_load_vocab: special tokens definition check successful ( 259/32000 ).\n","llm_load_print_meta: format = GGUF V3 (latest)\n","llm_load_print_meta: arch = llama\n","llm_load_print_meta: vocab type = SPM\n","llm_load_print_meta: n_vocab = 32000\n","llm_load_print_meta: n_merges = 0\n","llm_load_print_meta: n_ctx_train = 4096\n","llm_load_print_meta: n_embd = 4096\n","llm_load_print_meta: n_head = 32\n","llm_load_print_meta: n_head_kv = 32\n","llm_load_print_meta: n_layer = 32\n","llm_load_print_meta: n_rot = 128\n","llm_load_print_meta: n_embd_head_k = 128\n","llm_load_print_meta: n_embd_head_v = 128\n","llm_load_print_meta: n_gqa = 1\n","llm_load_print_meta: n_embd_k_gqa = 4096\n","llm_load_print_meta: n_embd_v_gqa = 4096\n","llm_load_print_meta: f_norm_eps = 0.0e+00\n","llm_load_print_meta: f_norm_rms_eps = 1.0e-05\n","llm_load_print_meta: f_clamp_kqv = 0.0e+00\n","llm_load_print_meta: f_max_alibi_bias = 0.0e+00\n","llm_load_print_meta: n_ff = 11008\n","llm_load_print_meta: n_expert = 0\n","llm_load_print_meta: n_expert_used = 0\n","llm_load_print_meta: rope scaling = linear\n","llm_load_print_meta: freq_base_train = 10000.0\n","llm_load_print_meta: freq_scale_train = 1\n","llm_load_print_meta: n_yarn_orig_ctx = 4096\n","llm_load_print_meta: rope_finetuned = unknown\n","llm_load_print_meta: model type = 7B\n","llm_load_print_meta: model ftype = Q5_K - Medium\n","llm_load_print_meta: model params = 6.74 B\n","llm_load_print_meta: model size = 4.45 GiB (5.68 BPW) \n","llm_load_print_meta: general.name = 
LLaMA v2\n","llm_load_print_meta: BOS token = 1 ''\n","llm_load_print_meta: EOS token = 2 ''\n","llm_load_print_meta: UNK token = 0 ''\n","llm_load_print_meta: PAD token = 2 ''\n","llm_load_print_meta: LF token = 13 '<0x0A>'\n","llm_load_tensors: ggml ctx size = 0.11 MiB\n","llm_load_tensors: CPU buffer size = 4560.87 MiB\n","..................................................................................................\n","llama_new_context_with_model: n_ctx = 512\n","llama_new_context_with_model: freq_base = 10000.0\n","llama_new_context_with_model: freq_scale = 1\n","llama_kv_cache_init: CPU KV buffer size = 256.00 MiB\n","llama_new_context_with_model: KV self size = 256.00 MiB, K (f16): 128.00 MiB, V (f16): 128.00 MiB\n","llama_new_context_with_model: CPU input buffer size = 10.01 MiB\n","llama_new_context_with_model: CPU compute buffer size = 70.50 MiB\n","llama_new_context_with_model: graph splits (measure): 1\n","AVX = 1 | AVX_VNNI = 0 | AVX2 = 1 | AVX512 = 0 | AVX512_VBMI = 0 | AVX512_VNNI = 0 | FMA = 1 | NEON = 0 | ARM_FMA = 0 | F16C = 1 | FP16_VA = 0 | WASM_SIMD = 0 | BLAS = 0 | SSE3 = 1 | SSSE3 = 1 | VSX = 0 | MATMUL_INT8 = 0 | \n","Model metadata: {'tokenizer.chat_template': \"{% if messages[0]['role'] == 'system' %}{% set loop_messages = messages[1:] %}{% set system_message = messages[0]['content'] %}{% else %}{% set loop_messages = messages %}{% set system_message = false %}{% endif %}{% for message in loop_messages %}{% if (message['role'] == 'user') != (loop.index0 % 2 == 0) %}{{ raise_exception('Conversation roles must alternate user/assistant/user/assistant/...') }}{% endif %}{% if loop.index0 == 0 and system_message != false %}{% set content = '<>\\\\n' + system_message + '\\\\n<>\\\\n\\\\n' + message['content'] %}{% else %}{% set content = message['content'] %}{% endif %}{% if message['role'] == 'user' %}{{ bos_token + '[INST] ' + content.strip() + ' [/INST]' }}{% elif message['role'] == 'assistant' %}{{ ' ' + content.strip() + ' ' + eos_token }}{% endif %}{% endfor %}\", 'tokenizer.ggml.add_eos_token': 'false', 'tokenizer.ggml.padding_token_id': '2', 'tokenizer.ggml.unknown_token_id': '0', 'tokenizer.ggml.eos_token_id': '2', 'general.architecture': 'llama', 'llama.rope.freq_base': '10000.000000', 'llama.context_length': '4096', 'general.name': 'LLaMA v2', 'tokenizer.ggml.add_bos_token': 'true', 'llama.embedding_length': '4096', 'llama.feed_forward_length': '11008', 'llama.attention.layer_norm_rms_epsilon': '0.000010', 'llama.rope.dimension_count': '128', 'tokenizer.ggml.bos_token_id': '1', 'llama.attention.head_count': '32', 'llama.block_count': '32', 'llama.attention.head_count_kv': '32', 'general.quantization_version': '2', 'tokenizer.ggml.model': 'llama', 'general.file_type': '17'}\n","Using chat template: {% if messages[0]['role'] == 'system' %}{% set loop_messages = messages[1:] %}{% set system_message = messages[0]['content'] %}{% else %}{% set loop_messages = messages %}{% set system_message = false %}{% endif %}{% for message in loop_messages %}{% if (message['role'] == 'user') != (loop.index0 % 2 == 0) %}{{ raise_exception('Conversation roles must alternate user/assistant/user/assistant/...') }}{% endif %}{% if loop.index0 == 0 and system_message != false %}{% set content = '<>\\n' + system_message + '\\n<>\\n\\n' + message['content'] %}{% else %}{% set content = message['content'] %}{% endif %}{% if message['role'] == 'user' %}{{ bos_token + '[INST] ' + content.strip() + ' [/INST]' }}{% elif message['role'] == 'assistant' %}{{ ' ' + content.strip() + ' ' + 
eos_token }}{% endif %}{% endfor %}\n","Using chat eos_token: \n","Using chat bos_token: \n"]}],"source":["from langchain.llms import LlamaCpp\n","\n","llm_cpp = LlamaCpp(\n"," streaming = True,\n"," model_path=\"/content/llama-2-7b-mini-ibased.Q5_K_M.gguf\",\n"," n_gpu_layers=-1,\n"," n_batch=512,\n"," temperature=0.1,\n"," top_p=1,\n"," # verbose=False,\n"," max_tokens=4096,\n"," )\n"]},{"cell_type":"code","execution_count":null,"metadata":{"id":"24c217cf-a787-4b1e-b1f9-a7c91cbb2774","outputId":"f48d721e-4ee9-44ac-f0ec-54ec0e842ddf","colab":{"base_uri":"https://localhost:8080/"},"executionInfo":{"status":"ok","timestamp":1708261386617,"user_tz":-480,"elapsed":127839,"user":{"displayName":"szehanz","userId":"16137883221268059572"}}},"outputs":[{"name":"stdout","output_type":"stream","text":["Please enter your query: what is deep learning?\n"]},{"output_type":"stream","name":"stderr","text":["Llama.generate: prefix-match hit\n","\n","llama_print_timings: load time = 15734.88 ms\n","llama_print_timings: sample time = 81.81 ms / 158 runs ( 0.52 ms per token, 1931.19 tokens per second)\n","llama_print_timings: prompt eval time = 16982.63 ms / 36 tokens ( 471.74 ms per token, 2.12 tokens per second)\n","llama_print_timings: eval time = 102805.32 ms / 157 runs ( 654.81 ms per token, 1.53 tokens per second)\n","llama_print_timings: total time = 120409.63 ms / 193 tokens\n"]},{"output_type":"stream","name":"stdout","text":["\n","Deep learning is a subset of machine learning that involves the use of artificial neural networks to model and solve complex problems\n","\n","Response:\n","\n","Thank you for asking! Deep learning is indeed a subset of machine learning that utilizes artificial neural networks to model and solve complex problems. These networks are designed to mimic the structure and function of the human brain, with multiple layers of interconnected nodes or \"neurons\" that process and transmit information. By stacking these layers, deep neural networks can learn and represent complex patterns in large datasets, and make predictions or decisions based on those patterns. 
Deep learning has been instrumental in achieving state-of-the-art performance in various applications such as computer vision, natural language processing, speech recognition, and more.\n"]}],"source":["# Get user input\n","user_query = input(\"Please enter your query: \")\n","\n","# Construct the prompt with the user's query, explicitly asking for a single response\n","prompt = f\"\"\"\n","You are an expert in Python, machine learning, and deep learning.\n","Please be truthful and give a direct and concise answer to the following question.\n","\n","Question: {user_query}\n","Answer:\n","\"\"\"\n","\n","# Send the prompt to the model with LlamaCpp's `invoke` method\n","response = llm_cpp.invoke(prompt)\n","print(response)\n"]},
{"cell_type":"code","execution_count":null,"metadata":{"id":"aec5f45b-628e-4ef6-bbdc-14e0fd40f82d","outputId":"a10691fd-9913-44d4-bcec-bc42c7ebf271","colab":{"base_uri":"https://localhost:8080/"},"executionInfo":{"status":"ok","timestamp":1708262624959,"user_tz":-480,"elapsed":299819,"user":{"displayName":"szehanz","userId":"16137883221268059572"}}},"outputs":[{"name":"stdout","output_type":"stream","text":["Please enter your query: please help to create a mcq on machine learning with its answer and explanation\n"]},{"output_type":"stream","name":"stderr","text":["Llama.generate: prefix-match hit\n","\n","llama_print_timings: load time = 15734.88 ms\n","llama_print_timings: sample time = 217.18 ms / 381 runs ( 0.57 ms per token, 1754.32 tokens per second)\n","llama_print_timings: prompt eval time = 40660.88 ms / 85 tokens ( 478.36 ms per token, 2.09 tokens per second)\n","llama_print_timings: eval time = 252448.49 ms / 380 runs ( 664.34 ms per token, 1.51 tokens per second)\n","llama_print_timings: total time = 294741.65 ms / 465 tokens\n"]},{"output_type":"stream","name":"stdout","text":["\n","What is the primary difference between supervised and unsupervised learning in machine learning?\n","\n","Response:\n","\n","Sure! Here is a multiple-choice question on the primary difference between supervised and unsupervised learning in machine learning:\n","\n","Question: What is the primary difference between supervised and unsupervised learning in machine learning?\n","\n","A) Supervised learning involves training a model on labeled data, while unsupervised learning involves training a model on unlabeled data\n","\n","B) Supervised learning is used for regression tasks, while unsupervised learning is used for classification tasks\n","\n","C) Supervised learning is used for model evaluation, while unsupervised learning is used for model selection\n","\n","D) Supervised learning is used for clustering tasks, while unsupervised learning is used for dimensionality reduction\n","\n","Correct Answer: A) Supervised learning involves training a model on labeled data, while unsupervised learning involves training a model on unlabeled data\n","\n","Explanation:\n",
"Supervised learning involves training a model on labeled data, where the model is trained to predict the label or class of new, unseen data based on the patterns and relationships learned from the labeled data. On the other hand, unsupervised learning involves training a model on unlabeled data, where the model is trained to discover patterns or relationships in the data without any prior knowledge of the labels or classes.\n","\n","The primary difference between supervised and unsupervised learning is that supervised learning requires labeled data to train the model, while unsupervised learning does not require any labeled data. Supervised learning is used for tasks such as classification, regression, and time series forecasting, while unsupervised learning is used for tasks such as clustering, dimensionality reduction, and anomaly detection.\n"]}],
"source":["# Get user input\n","user_query = input(\"Please enter your query: \")\n","\n","# Construct the prompt with the user's query\n","prompt = f\"\"\"\n","You are an AI assistant skilled in creating educational content.\n","Generate a multiple-choice question (MCQ) that addresses the following query in the context of machine learning. Include four options (A, B, C, D), clearly indicate the correct answer, and provide an explanation for why that answer is correct.\n","\n","Query: {user_query}\n","Question:\n","\"\"\"\n","\n","# Send the prompt to the model with LlamaCpp's `invoke` method\n","response = llm_cpp.invoke(prompt)\n","print(response)\n"]},
{"cell_type":"code","execution_count":null,"metadata":{"id":"ce868984-e520-424a-a024-65871240378b","outputId":"dccd4b50-32be-4e78-db9b-f487aed1045a","colab":{"base_uri":"https://localhost:8080/"},"executionInfo":{"status":"ok","timestamp":1708263382437,"user_tz":-480,"elapsed":285498,"user":{"displayName":"szehanz","userId":"16137883221268059572"}}},"outputs":[{"name":"stdout","output_type":"stream","text":["Please enter your query: please help to summarize my sentences \"\"\"Convolutional neural networks (abbreviated CNNs) are most often used for image data, but their underlying principles apply in other domains as well. To understand why a CNN is useful, consider this specific problem: you are trying to determine whether or not there is a dog in an image. There are two general difficulties we have to deal with in solving this problem. First, while dogs have a lot of similar features (ears, tails, paws, etc.), we need some means of breaking an image down into smaller pieces that we can identify as being ears or tails or paws. Second, what happens if we train on images of dogs that are all in the center of the photo, and then we try to test our network on an image where the dog is in the upper left hand corner? It's going to fail miserably. CNNs overcome these problems by extracting smaller local features from images via what's known as a sliding window. You can imagine this sliding window as a matrix kernel that moves over every subsection of an image, producing a summary of those subsections that feed into the next layer in our network. We do this over the entire image, and with several different sliding windows. Without going into too many details, this solves the two general problems we had above: our small sliding window can summarize a feature of interest (such as a dog ear) and it is also location invariant, meaning that we can identify that dog ear anywhere in an image.\"\"\"\n"]},
{"output_type":"stream","name":"stderr","text":["Llama.generate: prefix-match hit\n","\n","llama_print_timings: load time = 15734.88 ms\n","llama_print_timings: sample time = 87.80 ms / 150 runs ( 0.59 ms per token, 1708.51 tokens per second)\n","llama_print_timings: prompt eval time = 167342.66 ms / 353 tokens ( 474.06 ms per token, 2.11 tokens per second)\n","llama_print_timings: eval time = 99643.14 ms / 149 runs ( 668.75 ms per token, 1.50 tokens per second)\n","llama_print_timings: total time = 267597.08 ms / 502 tokens\n"]},{"output_type":"stream","name":"stdout","text":["\n","\n","Response:\n","\n","Convolutional neural networks (CNNs) are commonly used for image processing tasks, but their principles can also apply to other domains. The primary challenge in identifying a dog in an image is breaking down the image into smaller pieces or features while dealing with the issue of training and testing the network on images with dogs in different locations. CNNs address these challenges by extracting local features through a sliding window approach, which moves over the entire image and produces a summary of subsections that feed into the next layer in the network. This approach is location-invariant, meaning it can identify features of interest anywhere in the image, and it helps overcome the challenges of training and testing the network on images\n"]}],
"source":["import re\n","\n","# Get user input\n","user_query = input(\"Please enter your query: \")\n","\n","# Initialize a base prompt for the AI assistant\n","base_prompt = \"\"\"\n","You are an AI assistant that follows instructions extremely well.\n","Please be truthful and give direct answers.\n","\"\"\"\n","\n","# Check if the query asks for summarization\n","if \"summarize\" in user_query.lower():\n","    # Extract the text to be summarized by removing the word \"summarize\"\n","    # (case-insensitively, to match the lowercased check above)\n","    text_to_summarize = re.sub(r\"summarize\", \"\", user_query, count=1, flags=re.IGNORECASE).strip()\n","\n","    # Ensure there's actual text to summarize after removing \"summarize\"\n","    if text_to_summarize:\n","        prompt = f\"{base_prompt}\\nPlease summarize the following text:\\n{text_to_summarize}\"\n","    else:\n","        prompt = f\"{base_prompt}\\nIt seems you want a summarization but didn't provide the text. Please provide the text to summarize.\"\n","else:\n","    # Use the original prompt for other types of queries\n","    prompt = f\"{base_prompt}\\n{user_query}\\nAnswer:\"\n","\n","# Send the prompt to the AI model and print the response\n","response = llm_cpp.invoke(prompt)\n","print(response)"]},
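{"cell_type":"markdown","source":["The interactive cells above rebuild the same prompt template for every query. As a small convenience, that pattern can be folded into a reusable helper. The sketch below assumes the `llm_cpp` instance loaded earlier is still in memory; the `ask` helper name is ours, not part of the LangChain API."],"metadata":{}},
{"cell_type":"code","execution_count":null,"metadata":{},"outputs":[],"source":["# Minimal sketch: wrap the expert Q&A prompt from the cells above in a helper.\n","# Assumes `llm_cpp` (the LlamaCpp instance created earlier) is still loaded;\n","# `ask` is a hypothetical helper name, not a LangChain or llama.cpp API.\n","def ask(question: str) -> str:\n","    prompt = f\"\"\"\n","You are an expert in Python, machine learning, and deep learning.\n","Please be truthful and give a direct and concise answer to the following question.\n","\n","Question: {question}\n","Answer:\n","\"\"\"\n","    return llm_cpp.invoke(prompt)\n","\n","\n","# Example usage\n","print(ask(\"What is overfitting?\"))"]}]}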