{
  "nbformat": 4,
  "nbformat_minor": 0,
  "metadata": {
    "colab": {
      "provenance": [],
      "gpuType": "T4"
    },
    "kernelspec": {
      "name": "python3",
      "display_name": "Python 3"
    },
    "language_info": {
      "name": "python"
    },
    "accelerator": "GPU"
  },
  "cells": [
    {
      "cell_type": "code",
      "source": [
        "# @title # ⚡ Imat-AutoGGUF\n",
        "\n",
        "# @markdown Made by https://huggingface.co/Virt-io\n",
        "\n",
        "# @markdown Edited https://github.com/mlabonne/llm-course LazyMergekit to work with Imatrix\n",
        "\n",
        "# @markdown\n",
        "\n",
        "# @markdown The `token` corresponds to the name of the secret that stores your [Hugging Face access token](https://huggingface.co/settings/tokens) in Colab.\n",
        "\n",
        "# @markdown ---\n",
        "\n",
        "# @markdown ### ⚡ Quantization parameters\n",
        "MODEL_ID = \"TinyLlama/TinyLlama-1.1B-Chat-v1.0\" # @param {type:\"string\"}\n",
        "IMATRIX_OPTION = 'Imatrix' # @param [\"Imatrix\", \"Imatrix-RP\", \"Imatrix-RP-Extended\"]\n",
        "if IMATRIX_OPTION == \"Imatrix\":\n",
        "  IMATRIX = f\"Google-Colab-Imatrix-GGUF/Imatrix/imatrix.txt\"\n",
        "if IMATRIX_OPTION == \"Imatrix-RP\":\n",
        "  IMATRIX = f\"Google-Colab-Imatrix-GGUF/Imatrix/imatrix-with-rp-data.txt\"\n",
        "if IMATRIX_OPTION == \"Imatrix-RP-Extended\":\n",
        "  IMATRIX = f\"Google-Colab-Imatrix-GGUF/Imatrix/imatrix-rp-extended.txt\"\n",
        "print(IMATRIX)\n",
        "QUANTIZATION_METHODS = \"IQ4_NL, Q8_0\" # @param {type:\"string\"}\n",
        "QUANTIZATION_METHODS = QUANTIZATION_METHODS.replace(\" \", \"\").split(\",\")\n",
        "\n",
        "# @markdown ---\n",
        "\n",
        "# @markdown ### 🤗 Hugging Face Hub\n",
        "username = \"Virt-io\" # @param {type:\"string\"}\n",
        "token = \"HF_TOKEN\" # @param {type:\"string\"}\n",
        "\n",
        "MODEL_NAME = MODEL_ID.split('/')[-1]\n",
        "\n",
        "# Git clone llamacpp\n",
        "!git clone https://github.com/ggerganov/llama.cpp\n",
        "!cd llama.cpp && git pull\n",
        "\n",
        "# Download model\n",
        "!git lfs install\n",
        "!git clone https://huggingface.co/{MODEL_ID}\n",
        "\n",
        "# Download Imatrix\n",
        "!git clone https://huggingface.co/Virt-io/Google-Colab-Imatrix-GGUF\n",
        "\n",
        "# Install python dependencies and reload instance\n",
        "!pip install -r llama.cpp/requirements/requirements-convert.txt\n",
        "\n",
        "# Build llamacpp\n",
        "!cd llama.cpp && make clean && LLAMA_CUDA=1 LLAMA_LTO=1 LLAMA_CUDA_DMMV_X=64 LLAMA_CUDA_MMV_Y=4 LLAMA_CUDA_KQUANTS_ITER=2 LLAMA_CUDA_F16=1 LLAMA_CUDA_DMMV_F16=1 make -j16\n",
        "\n",
        "# Convert to fp16\n",
        "fp16 = f\"{MODEL_NAME}/{MODEL_NAME.lower()}.fp16.gguf\"\n",
        "!python llama.cpp/convert.py {MODEL_NAME} --outtype f16 --outfile {fp16}\n",
        "\n",
        "# Run imatrix\n",
        "imat_dat = f\"{fp16}.{IMATRIX_OPTION}.dat\"\n",
        "!./llama.cpp/imatrix -ngl 100 -c 512 -b 512 --model {fp16} -f {IMATRIX} -o {imat_dat}\n",
        "\n",
        "# Quantize the model for each method in the QUANTIZATION_METHODS list\n",
        "for method in QUANTIZATION_METHODS:\n",
        "  qtype = f\"{MODEL_NAME}/{MODEL_NAME.lower()}.{method.upper()}.gguf\"\n",
        "  !./llama.cpp/quantize --imatrix {imat_dat} {fp16} {qtype} {method}"
      ],
      "metadata": {
        "id": "fD24jJxq7t3k"
      },
      "execution_count": null,
      "outputs": []
    },
    {
      "cell_type": "code",
      "source": [
        "# @markdown Upload to HF\n",
        "!pip install -q huggingface_hub\n",
        "from huggingface_hub import create_repo, HfApi\n",
        "from google.colab import userdata, runtime\n",
        "\n",
        "# Defined in the secrets tab in Google Colab\n",
        "hf_token = userdata.get(token)\n",
        "api = HfApi()\n",
        "\n",
        "# Create empty repo\n",
        "create_repo(\n",
        "    repo_id = f\"{username}/{MODEL_NAME}-GGUF\",\n",
        "    repo_type=\"model\",\n",
        "    exist_ok=True,\n",
        "    token=hf_token\n",
        ")\n",
        "\n",
        "# Upload gguf files\n",
        "api.upload_folder(\n",
        "    folder_path=MODEL_NAME,\n",
        "    repo_id=f\"{username}/{MODEL_NAME}-GGUF\",\n",
        "    allow_patterns=[\"*.gguf\", \"*.fp16.gguf\", \"*.dat\", \"*.md\"],\n",
        "    token=hf_token\n",
        ")\n",
        "\n",
        "# Kill runtime\n",
        "runtime.unassign()"
      ],
      "metadata": {
        "id": "F7Q8_Y1_e3BX"
      },
      "execution_count": null,
      "outputs": []
    }
  ]
}