{
"cells": [
{
"cell_type": "code",
"execution_count": 1,
"id": "da47e672",
"metadata": {},
"outputs": [],
"source": [
"# !pip3 install https://github.com/casper-hansen/AutoAWQ/releases/download/v0.1.6/autoawq-0.1.6+cu118-cp310-cp310-linux_x86_64.whl"
]
},
{
"cell_type": "code",
"execution_count": 2,
"id": "27063032",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Tue Nov 7 14:32:21 2023 \r\n",
"+-----------------------------------------------------------------------------+\r\n",
"| NVIDIA-SMI 525.85.12 Driver Version: 525.85.12 CUDA Version: 12.0 |\r\n",
"|-------------------------------+----------------------+----------------------+\r\n",
"| GPU Name Persistence-M| Bus-Id Disp.A | Volatile Uncorr. ECC |\r\n",
"| Fan Temp Perf Pwr:Usage/Cap| Memory-Usage | GPU-Util Compute M. |\r\n",
"| | | MIG M. |\r\n",
"|===============================+======================+======================|\r\n",
"| 0 NVIDIA A100 80G... On | 00000001:00:00.0 Off | 0 |\r\n",
"| N/A 37C P0 66W / 300W | 5536MiB / 81920MiB | 0% Default |\r\n",
"| | | Disabled |\r\n",
"+-------------------------------+----------------------+----------------------+\r\n",
" \r\n",
"+-----------------------------------------------------------------------------+\r\n",
"| Processes: |\r\n",
"| GPU GI CI PID Type Process name GPU Memory |\r\n",
"| ID ID Usage |\r\n",
"|=============================================================================|\r\n",
"+-----------------------------------------------------------------------------+\r\n"
]
}
],
"source": [
"!nvidia-smi"
]
},
{
"cell_type": "code",
"execution_count": 3,
"id": "1bde5916",
"metadata": {
"scrolled": true
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"[2023-11-07 14:32:32,101] [INFO] [real_accelerator.py:133:get_accelerator] Setting ds_accelerator to cuda (auto detect)\n"
]
}
],
"source": [
"from awq import AutoAWQForCausalLM\n",
"from transformers import AutoConfig, AwqConfig, AutoTokenizer, AutoModelForCausalLM\n",
"import torch\n",
"\n",
"model_path = 'mesolitica/malaysian-llama2-7b-32k-instructions'"
]
},
{
"cell_type": "code",
"execution_count": 4,
"id": "c658280e",
"metadata": {},
"outputs": [],
"source": [
"# !pip3 install transformers==4.35.0"
]
},
{
"cell_type": "code",
"execution_count": 5,
"id": "803a0c91",
"metadata": {},
"outputs": [],
"source": [
"!rm -rf test"
]
},
{
"cell_type": "code",
"execution_count": 6,
"id": "838ddb85",
"metadata": {},
"outputs": [
{
"data": {
"application/vnd.jupyter.widget-view+json": {
"model_id": "f7b2ffd3bc464a598567299228d8966b",
"version_major": 2,
"version_minor": 0
},
"text/plain": [
"Loading checkpoint shards: 0%| | 0/2 [00:00, ?it/s]"
]
},
"metadata": {},
"output_type": "display_data"
}
],
"source": [
"model = AutoModelForCausalLM.from_pretrained(model_path, torch_dtype = torch.bfloat16)"
]
},
{
"cell_type": "code",
"execution_count": 7,
"id": "637b41e1",
"metadata": {},
"outputs": [],
"source": [
"model.save_pretrained('./test', safe_serialization = False)"
]
},
{
"cell_type": "code",
"execution_count": 8,
"id": "417dbbf5",
"metadata": {},
"outputs": [
{
"data": {
"application/vnd.jupyter.widget-view+json": {
"model_id": "13af4d1d7ddf4bae900710fcf9a9d775",
"version_major": 2,
"version_minor": 0
},
"text/plain": [
"Loading checkpoint shards: 0%| | 0/3 [00:00, ?it/s]"
]
},
"metadata": {},
"output_type": "display_data"
}
],
"source": [
"model = AutoAWQForCausalLM.from_pretrained('./test')"
]
},
{
"cell_type": "code",
"execution_count": 10,
"id": "212056b5",
"metadata": {
"scrolled": true
},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"AWQ: 100%|██████████| 32/32 [08:38<00:00, 16.21s/it]\n"
]
}
],
"source": [
"quant_path = 'malaysian-llama2-7b-32k-instructions-awq'\n",
"quant_config = { \"zero_point\": True, \"q_group_size\": 128, \"w_bit\": 4, \"version\": \"GEMM\" }\n",
"\n",
"tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)\n",
"model.quantize(tokenizer, quant_config=quant_config, calib_data = 'mesolitica/malaysian-calibration')"
]
},
{
"cell_type": "code",
"execution_count": 11,
"id": "77e03f18",
"metadata": {
"scrolled": true
},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"WARNING:root:`quant_config.json` is being deprecated in the future in favor of quantization_config in config.json.\n"
]
},
{
"data": {
"text/plain": [
"('malaysian-llama2-7b-32k-instructions-awq/tokenizer_config.json',\n",
" 'malaysian-llama2-7b-32k-instructions-awq/special_tokens_map.json',\n",
" 'malaysian-llama2-7b-32k-instructions-awq/tokenizer.model',\n",
" 'malaysian-llama2-7b-32k-instructions-awq/added_tokens.json',\n",
" 'malaysian-llama2-7b-32k-instructions-awq/tokenizer.json')"
]
},
"execution_count": 11,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"model.save_quantized(quant_path, safetensors = False)\n",
"tokenizer.save_pretrained(quant_path)"
]
},
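{
"cell_type": "code",
"execution_count": null,
"id": "0fa1c3e9",
"metadata": {},
"outputs": [],
"source": [
"# Optional sanity check, not part of the original run: compare the on-disk size\n",
"# of the bf16 dump against the 4-bit AWQ output, and (commented out) reload the\n",
"# quantized weights. `from_quantized` / `fuse_layers` follow the AutoAWQ 0.1.x\n",
"# API; adjust if your version differs.\n",
"import os\n",
"\n",
"def dir_size_gb(path):\n",
"    total = 0\n",
"    for root, _, files in os.walk(path):\n",
"        total += sum(os.path.getsize(os.path.join(root, f)) for f in files)\n",
"    return total / 1e9\n",
"\n",
"print('bf16 dump:', round(dir_size_gb('./test'), 2), 'GB')\n",
"print('AWQ 4-bit:', round(dir_size_gb(quant_path), 2), 'GB')\n",
"\n",
"# quantized = AutoAWQForCausalLM.from_quantized(quant_path, fuse_layers=True)"
]
},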
{
"cell_type": "code",
"execution_count": 12,
"id": "fd35b057",
"metadata": {
"scrolled": false
},
"outputs": [
{
"data": {
"application/vnd.jupyter.widget-view+json": {
"model_id": "021e6f72e5594b4995338e27cfcc3a05",
"version_major": 2,
"version_minor": 0
},
"text/plain": [
"tokenizer.model: 0%| | 0.00/500k [00:00, ?B/s]"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"data": {
"text/plain": [
"CommitInfo(commit_url='https://huggingface.co/mesolitica/malaysian-llama2-7b-32k-instructions-AWQ/commit/ea465a1be780a5091d89685d69ec7146ba0d69e4', commit_message='Upload tokenizer', commit_description='', oid='ea465a1be780a5091d89685d69ec7146ba0d69e4', pr_url=None, pr_revision=None, pr_num=None)"
]
},
"execution_count": 12,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"tokenizer.push_to_hub('mesolitica/malaysian-llama2-7b-32k-instructions-AWQ')"
]
},
{
"cell_type": "code",
"execution_count": 14,
"id": "816dacc8",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"CommitInfo(commit_url='https://huggingface.co/mesolitica/malaysian-llama2-7b-32k-instructions-AWQ/commit/69be7a3e995592db52910fe2e848e85dc2637ad3', commit_message='Upload config', commit_description='', oid='69be7a3e995592db52910fe2e848e85dc2637ad3', pr_url=None, pr_revision=None, pr_num=None)"
]
},
"execution_count": 14,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"quantization_config = AwqConfig(\n",
" bits=quant_config['w_bit'],\n",
" group_size=quant_config['q_group_size'],\n",
" zero_point=quant_config['zero_point'],\n",
" backend='autoawq',\n",
" version=quant_config['version'].lower(),\n",
")\n",
"\n",
"config = AutoConfig.from_pretrained(model_path)\n",
"config.quantization_config = quantization_config\n",
"\n",
"config.push_to_hub('mesolitica/malaysian-llama2-7b-32k-instructions-AWQ')"
]
},
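{
"cell_type": "code",
"execution_count": null,
"id": "7d2b9c41",
"metadata": {},
"outputs": [],
"source": [
"# Quick round-trip check (a sketch): pull the config back from the Hub and\n",
"# confirm the AWQ block landed under `quantization_config` in config.json.\n",
"cfg = AutoConfig.from_pretrained('mesolitica/malaysian-llama2-7b-32k-instructions-AWQ')\n",
"print(cfg.quantization_config)"
]
},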
{
"cell_type": "code",
"execution_count": 16,
"id": "846835fa",
"metadata": {},
"outputs": [],
"source": [
"from huggingface_hub import HfApi\n",
"\n",
"api = HfApi()"
]
},
{
"cell_type": "code",
"execution_count": 17,
"id": "f8c2bef7",
"metadata": {},
"outputs": [
{
"data": {
"application/vnd.jupyter.widget-view+json": {
"model_id": "c84977a07bc84c708cf3b9eea3672dda",
"version_major": 2,
"version_minor": 0
},
"text/plain": [
"pytorch_model.bin: 0%| | 0.00/3.89G [00:00, ?B/s]"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"data": {
"text/plain": [
"'https://huggingface.co/mesolitica/malaysian-llama2-7b-32k-instructions-AWQ/blob/main/pytorch_model.bin'"
]
},
"execution_count": 17,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"api.upload_file(\n",
" path_or_fileobj='malaysian-llama2-7b-32k-instructions-awq/pytorch_model.bin',\n",
" path_in_repo=\"pytorch_model.bin\",\n",
" repo_id='mesolitica/malaysian-llama2-7b-32k-instructions-AWQ',\n",
" repo_type=\"model\",\n",
")"
]
},
{
"cell_type": "code",
"execution_count": 18,
"id": "b6b0f30f",
"metadata": {
"scrolled": true
},
"outputs": [
{
"data": {
"application/vnd.jupyter.widget-view+json": {
"model_id": "cfd383ecaf3f42689d5e6e158a2b1a06",
"version_major": 2,
"version_minor": 0
},
"text/plain": [
"Downloading (…)lve/main/config.json: 0%| | 0.00/870 [00:00, ?B/s]"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"You have loaded an AWQ model on CPU and have a CUDA device available, make sure to set your model on a GPU device in order to run your model.\n"
]
},
{
"data": {
"application/vnd.jupyter.widget-view+json": {
"model_id": "c88703bd0b8b40c5b25a4d8eb1fdfbe4",
"version_major": 2,
"version_minor": 0
},
"text/plain": [
"Downloading pytorch_model.bin: 0%| | 0.00/3.89G [00:00, ?B/s]"
]
},
"metadata": {},
"output_type": "display_data"
}
],
"source": [
"quantized_model = AutoModelForCausalLM.from_pretrained('mesolitica/malaysian-llama2-7b-32k-instructions-AWQ')\n",
"_ = quantized_model.cuda()"
]
},
{
"cell_type": "code",
"execution_count": 21,
"id": "698cd4c9",
"metadata": {},
"outputs": [],
"source": [
"def parse_llama_chat(messages):\n",
"\n",
" system = messages[0]['content']\n",
" user_query = messages[-1]['content']\n",
"\n",
" users, assistants = [], []\n",
" for q in messages[1:-1]:\n",
" if q['role'] == 'user':\n",
" users.append(q['content'])\n",
" elif q['role'] == 'assistant':\n",
" assistants.append(q['content'])\n",
"\n",
" texts = [f'[INST] <>\\n{system}\\n<>\\n\\n']\n",
" for u, a in zip(users, assistants):\n",
" texts.append(f'{u.strip()} [/INST] {a.strip()} [INST] ')\n",
" texts.append(f'{user_query.strip()} [/INST]')\n",
" prompt = ''.join(texts).strip()\n",
" return prompt"
]
},
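{
"cell_type": "code",
"execution_count": null,
"id": "3e9ac2f7",
"metadata": {},
"outputs": [],
"source": [
"# Worked example (illustrative only, made-up conversation): run the parser on a\n",
"# multi-turn exchange so the <<SYS>> / [INST] layout is visible before tokenization.\n",
"example = [\n",
"    {'role': 'system', 'content': 'awak adalah AI yang mampu jawab segala soalan'},\n",
"    {'role': 'user', 'content': 'hai'},\n",
"    {'role': 'assistant', 'content': 'hai, ada apa saya boleh bantu?'},\n",
"    {'role': 'user', 'content': 'kwsp tu apa'},\n",
"]\n",
"print(parse_llama_chat(example))"
]
},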
{
"cell_type": "code",
"execution_count": 22,
"id": "63315893",
"metadata": {},
"outputs": [],
"source": [
"messages = [\n",
" {'role': 'system', 'content': 'awak adalah AI yang mampu jawab segala soalan'},\n",
" {'role': 'user', 'content': 'kwsp tu apa'}\n",
"]\n",
"prompt = parse_llama_chat(messages)\n",
"inputs = tokenizer([prompt], return_tensors='pt', add_special_tokens=False).to('cuda')"
]
},
{
"cell_type": "code",
"execution_count": 24,
"id": "8a3c15d8",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"CPU times: user 7.54 s, sys: 3.84 ms, total: 7.54 s\n",
"Wall time: 7.54 s\n"
]
},
{
"data": {
"text/plain": [
"' [INST] <>\\nawak adalah AI yang mampu jawab segala soalan\\n<>\\n\\nkwsp tu apa [/INST] KWSP adalah singkatan bagi \"Kumpulan Wang Simpanan Pekerja\", yang merujuk kepada skim simpanan persaraan yang dilaksanakan di Malaysia yang bertujuan untuk menyediakan dana persaraan untuk pekerja dan majikan. Program ini memerlukan majikan untuk menyumbang sejumlah wang bagi pihak pekerja, dan pekerja dikehendaki menyumbang sejumlah yang sama bagi pihak mereka sendiri. Dana ini dikumpulkan dalam dana berasingan dan dikawal selia oleh kerajaan. KWSP menyediakan faedah persaraan kepada ahlinya, seperti pengeluaran, pelaburan, dan pencen. Skim ini terkenal kerana tadbir urusnya yang baik dan reputasinya sebagai salah satu dana simpanan persaraan terbesar dan paling dipercayai di Asia. '"
]
},
"execution_count": 24,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"%%time\n",
"\n",
"generate_kwargs = dict(\n",
" inputs,\n",
" max_new_tokens=1024,\n",
" top_p=0.95,\n",
" top_k=50,\n",
" temperature=0.9,\n",
" do_sample=True,\n",
" num_beams=1,\n",
")\n",
"r = quantized_model.generate(**generate_kwargs)\n",
"tokenizer.decode(r[0])"
]
},
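{
"cell_type": "code",
"execution_count": null,
"id": "b4d81a26",
"metadata": {},
"outputs": [],
"source": [
"# Streaming variant (a sketch, same sampling knobs as above): transformers'\n",
"# TextStreamer prints tokens as they are generated instead of waiting for the\n",
"# whole sequence to finish.\n",
"from transformers import TextStreamer\n",
"\n",
"streamer = TextStreamer(tokenizer, skip_prompt=True)\n",
"_ = quantized_model.generate(\n",
"    **inputs,\n",
"    streamer=streamer,\n",
"    max_new_tokens=256,\n",
"    do_sample=True,\n",
"    temperature=0.9,\n",
"    top_p=0.95,\n",
")"
]
},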
{
"cell_type": "code",
"execution_count": null,
"id": "d73d43a0",
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3 (ipykernel)",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.10.12"
}
},
"nbformat": 4,
"nbformat_minor": 5
}