{ "nbformat": 4, "nbformat_minor": 0, "metadata": { "colab": { "provenance": [], "gpuType": "T4" }, "kernelspec": { "name": "python3", "display_name": "Python 3" }, "language_info": { "name": "python" }, "accelerator": "GPU" }, "cells": [ { "cell_type": "code", "source": [ "# @title # ⚡ Imat-AutoGGUF\n", "\n", "# @markdown Made by https://huggingface.co/Virt-io\n", "\n", "# @markdown Edited https://github.com/mlabonne/llm-course LazyMergekit to work with Imatrix\n", "\n", "# @markdown\n", "\n", "# @markdown The `token` corresponds to the name of the secret that stores your [Hugging Face access token](https://huggingface.co/settings/tokens) in Colab.\n", "\n", "# @markdown ---\n", "\n", "# @markdown ### ⚡ Quantization parameters\n", "MODEL_ID = \"TinyLlama/TinyLlama-1.1B-Chat-v1.0\" # @param {type:\"string\"}\n", "IMATRIX_OPTION = 'Imatrix' # @param [\"Imatrix\", \"Imatrix-RP\"]\n", "if IMATRIX_OPTION == \"Imatrix\":\n", " IMATRIX = f\"Google-Colab-Imatrix-GGUF/Imatrix/imatrix.txt\"\n", "if IMATRIX_OPTION == \"Imatrix-RP\":\n", " IMATRIX = f\"Google-Colab-Imatrix-GGUF/Imatrix/imatrix-with-rp-data.txt\"\n", "print(IMATRIX)\n", "QUANTIZATION_METHODS = \"IQ4_NL, Q8_0\" # @param {type:\"string\"}\n", "QUANTIZATION_METHODS = QUANTIZATION_METHODS.replace(\" \", \"\").split(\",\")\n", "\n", "# @markdown ---\n", "\n", "# @markdown ### 🤗 Hugging Face Hub\n", "username = \"Virt-io\" # @param {type:\"string\"}\n", "token = \"HF_TOKEN\" # @param {type:\"string\"}\n", "\n", "MODEL_NAME = MODEL_ID.split('/')[-1]\n", "\n", "# Git clone llamacpp\n", "!git clone https://github.com/ggerganov/llama.cpp\n", "!cd llama.cpp && git pull\n", "\n", "# Download model\n", "!git lfs install\n", "!git clone https://huggingface.co/{MODEL_ID}\n", "\n", "# Download Imatrix\n", "!git clone https://huggingface.co/Virt-io/Google-Colab-Imatrix-GGUF\n", "\n", "# Install python dependencies and reload instance\n", "!pip install -r llama.cpp/requirements/requirements-convert.txt\n", "\n", "# Build llamacpp\n", "!cd llama.cpp && make clean && LLAMA_CUBLAS=1 LLAMA_CUDA_FORCE_MMQ=1 LLAMA_LTO=1 LLAMA_CUDA_DMMV_X=64 LLAMA_CUDA_MMV_Y=4 LLAMA_CUDA_KQUANTS_ITER=2 LLAMA_CUDA_F16=1 LLAMA_CUDA_DMMV_F16=1 make -j16\n", "\n", "# Convert to fp16\n", "fp16 = f\"{MODEL_NAME}/{MODEL_NAME.lower()}.fp16.gguf\"\n", "!python llama.cpp/convert.py {MODEL_NAME} --outtype f16 --outfile {fp16}\n", "\n", "# Run imatrix\n", "imat_dat = f\"{fp16}.{IMATRIX_OPTION}.dat\"\n", "!./llama.cpp/imatrix -ngl 100 -c 512 -b 512 --model {fp16} -f {IMATRIX} -o {imat_dat}\n", "\n", "# Quantize the model for each method in the QUANTIZATION_METHODS list\n", "for method in QUANTIZATION_METHODS:\n", " qtype = f\"{MODEL_NAME}/{MODEL_NAME.lower()}.{method.upper()}.gguf\"\n", " !./llama.cpp/quantize --imatrix {imat_dat} {fp16} {qtype} {method}" ], "metadata": { "id": "fD24jJxq7t3k" }, "execution_count": null, "outputs": [] }, { "cell_type": "code", "source": [ "# @markdown Upload to HF\n", "!pip install -q huggingface_hub\n", "from huggingface_hub import create_repo, HfApi\n", "from google.colab import userdata, runtime\n", "\n", "# Defined in the secrets tab in Google Colab\n", "hf_token = userdata.get(token)\n", "api = HfApi()\n", "\n", "# Create empty repo\n", "create_repo(\n", " repo_id = f\"{username}/{MODEL_NAME}-GGUF\",\n", " repo_type=\"model\",\n", " exist_ok=True,\n", " token=hf_token\n", ")\n", "\n", "# Upload gguf files\n", "api.upload_folder(\n", " folder_path=MODEL_NAME,\n", " repo_id=f\"{username}/{MODEL_NAME}-GGUF\",\n", " allow_patterns=[\"*.gguf\", 
\"*.fp16.gguf\", \"*.dat\", \"*.md\"],\n", " token=hf_token\n", ")\n", "\n", "# Kill runtime\n", "# runtime.unassign()" ], "metadata": { "id": "F7Q8_Y1_e3BX" }, "execution_count": null, "outputs": [] } ] }