huseinzol05 committed
Commit d8f6328
1 Parent(s): 05288b8

Upload autoawq-llama2-13b.ipynb

Files changed (1):
  1. autoawq-llama2-13b.ipynb +488 -0
autoawq-llama2-13b.ipynb ADDED
@@ -0,0 +1,488 @@
+ {
+ "cells": [
+ {
+ "cell_type": "code",
+ "execution_count": 6,
+ "id": "da47e672",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# !pip3 install https://github.com/casper-hansen/AutoAWQ/releases/download/v0.1.6/autoawq-0.1.6+cu118-cp310-cp310-linux_x86_64.whl"
+ ]
+ },
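The wheel pinned above targets CUDA 11.8 and CPython 3.10 (the `cu118-cp310` tags) and will not install on other combinations. A hedged alternative, assuming a compatible build is published on PyPI, is to install the same release directly:

```python
# Sketch: install AutoAWQ 0.1.6 from PyPI instead of the pinned GitHub wheel.
# Assumption: PyPI carries a build matching your CUDA/Python; if not, pick the
# right wheel from https://github.com/casper-hansen/AutoAWQ/releases.
# !pip3 install autoawq==0.1.6
```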
+ {
+ "cell_type": "code",
+ "execution_count": 7,
+ "id": "27063032",
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "Tue Nov 7 15:30:49 2023 \r\n",
+ "+-----------------------------------------------------------------------------+\r\n",
+ "| NVIDIA-SMI 525.85.12 Driver Version: 525.85.12 CUDA Version: 12.0 |\r\n",
+ "|-------------------------------+----------------------+----------------------+\r\n",
+ "| GPU Name Persistence-M| Bus-Id Disp.A | Volatile Uncorr. ECC |\r\n",
+ "| Fan Temp Perf Pwr:Usage/Cap| Memory-Usage | GPU-Util Compute M. |\r\n",
+ "| | | MIG M. |\r\n",
+ "|===============================+======================+======================|\r\n",
+ "| 0 NVIDIA A100 80G... On | 00000001:00:00.0 Off | 0 |\r\n",
+ "| N/A 45C P0 49W / 300W | 4MiB / 81920MiB | 0% Default |\r\n",
+ "| | | Disabled |\r\n",
+ "+-------------------------------+----------------------+----------------------+\r\n",
+ " \r\n",
+ "+-----------------------------------------------------------------------------+\r\n",
+ "| Processes: |\r\n",
+ "| GPU GI CI PID Type Process name GPU Memory |\r\n",
+ "| ID ID Usage |\r\n",
+ "|=============================================================================|\r\n",
+ "| No running processes found |\r\n",
+ "+-----------------------------------------------------------------------------+\r\n"
+ ]
+ }
+ ],
+ "source": [
+ "!nvidia-smi"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "1bde5916",
+ "metadata": {
+ "scrolled": true
+ },
+ "outputs": [],
+ "source": [
+ "from awq import AutoAWQForCausalLM\n",
+ "from transformers import AutoConfig, AwqConfig, AutoTokenizer, AutoModelForCausalLM\n",
+ "import torch\n",
+ "\n",
+ "model_path = 'mesolitica/malaysian-llama2-13b-32k-instructions'"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "c658280e",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# !pip3 install transformers==4.35.0"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "803a0c91",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "!rm -rf test"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 9,
+ "id": "838ddb85",
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "application/vnd.jupyter.widget-view+json": {
+ "model_id": "f99e5a0f2eb9406293008093078468bc",
+ "version_major": 2,
+ "version_minor": 0
+ },
+ "text/plain": [
+ "Loading checkpoint shards: 0%| | 0/3 [00:00<?, ?it/s]"
+ ]
+ },
+ "metadata": {},
+ "output_type": "display_data"
+ }
+ ],
+ "source": [
+ "model = AutoModelForCausalLM.from_pretrained(model_path, torch_dtype = torch.bfloat16)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 10,
+ "id": "637b41e1",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "model.save_pretrained('./test', safe_serialization = False)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 11,
+ "id": "417dbbf5",
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "application/vnd.jupyter.widget-view+json": {
+ "model_id": "180037c27f734363891a24a866177783",
+ "version_major": 2,
+ "version_minor": 0
+ },
+ "text/plain": [
+ "Loading checkpoint shards: 0%| | 0/6 [00:00<?, ?it/s]"
+ ]
+ },
+ "metadata": {},
+ "output_type": "display_data"
+ }
+ ],
+ "source": [
+ "model = AutoAWQForCausalLM.from_pretrained('./test')"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 12,
+ "id": "212056b5",
+ "metadata": {
+ "scrolled": true
+ },
+ "outputs": [
+ {
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ "AWQ: 100%|██████████| 40/40 [15:49<00:00, 23.73s/it]\n"
+ ]
+ }
+ ],
+ "source": [
+ "quant_path = 'malaysian-llama2-13b-32k-instructions-awq'\n",
+ "quant_config = { \"zero_point\": True, \"q_group_size\": 128, \"w_bit\": 4, \"version\": \"GEMM\" }\n",
+ "\n",
+ "tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)\n",
+ "model.quantize(tokenizer, quant_config=quant_config, calib_data = 'mesolitica/malaysian-calibration')"
+ ]
+ },
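For readers new to AWQ, the `quant_config` used above is worth unpacking. This annotated restatement keeps the notebook's values; the comments describe AutoAWQ's usual semantics rather than anything stated here, and `calib_data` names the Hugging Face dataset whose text drives the activation-aware calibration:

```python
# Annotated copy of the quantization settings from the cell above.
quant_config = {
    "zero_point": True,    # asymmetric quantization: a zero point is stored per group
    "q_group_size": 128,   # weights are quantized in groups of 128 input channels
    "w_bit": 4,            # 4-bit integer weights; activations stay in half precision
    "version": "GEMM",     # GEMM kernel variant (AutoAWQ also ships GEMV)
}
```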
+ {
+ "cell_type": "code",
+ "execution_count": 13,
+ "id": "77e03f18",
+ "metadata": {
+ "scrolled": true
+ },
+ "outputs": [
+ {
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ "WARNING:root:`quant_config.json` is being deprecated in the future in favor of quantization_config in config.json.\n"
+ ]
+ },
+ {
+ "data": {
+ "text/plain": [
+ "('malaysian-llama2-13b-32k-instructions-awq/tokenizer_config.json',\n",
+ " 'malaysian-llama2-13b-32k-instructions-awq/special_tokens_map.json',\n",
+ " 'malaysian-llama2-13b-32k-instructions-awq/tokenizer.model',\n",
+ " 'malaysian-llama2-13b-32k-instructions-awq/added_tokens.json',\n",
+ " 'malaysian-llama2-13b-32k-instructions-awq/tokenizer.json')"
+ ]
+ },
+ "execution_count": 13,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "model.save_quantized(quant_path, safetensors = False)\n",
+ "tokenizer.save_pretrained(quant_path)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 14,
+ "id": "fd35b057",
+ "metadata": {
+ "scrolled": false
+ },
+ "outputs": [
+ {
+ "data": {
+ "application/vnd.jupyter.widget-view+json": {
+ "model_id": "58f54f49cb1c4fa197c83927ea1ff8c6",
+ "version_major": 2,
+ "version_minor": 0
+ },
+ "text/plain": [
+ "tokenizer.model: 0%| | 0.00/500k [00:00<?, ?B/s]"
+ ]
+ },
+ "metadata": {},
+ "output_type": "display_data"
+ },
+ {
+ "data": {
+ "text/plain": [
+ "CommitInfo(commit_url='https://huggingface.co/mesolitica/malaysian-llama2-13b-32k-instructions-AWQ/commit/e45742e819bb91811c9df8bc45c49ef4ce8bc240', commit_message='Upload tokenizer', commit_description='', oid='e45742e819bb91811c9df8bc45c49ef4ce8bc240', pr_url=None, pr_revision=None, pr_num=None)"
+ ]
+ },
+ "execution_count": 14,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "tokenizer.push_to_hub('mesolitica/malaysian-llama2-13b-32k-instructions-AWQ')"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 15,
+ "id": "816dacc8",
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "CommitInfo(commit_url='https://huggingface.co/mesolitica/malaysian-llama2-13b-32k-instructions-AWQ/commit/84db1390a86f31ef76c6ec11436c09a2f84d055a', commit_message='Upload config', commit_description='', oid='84db1390a86f31ef76c6ec11436c09a2f84d055a', pr_url=None, pr_revision=None, pr_num=None)"
+ ]
+ },
+ "execution_count": 15,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "quantization_config = AwqConfig(\n",
+ " bits=quant_config['w_bit'],\n",
+ " group_size=quant_config['q_group_size'],\n",
+ " zero_point=quant_config['zero_point'],\n",
+ " backend='autoawq',\n",
+ " version=quant_config['version'].lower(),\n",
+ ")\n",
+ "\n",
+ "config = AutoConfig.from_pretrained(model_path)\n",
+ "config.quantization_config = quantization_config\n",
+ "\n",
+ "config.push_to_hub('mesolitica/malaysian-llama2-13b-32k-instructions-AWQ')"
+ ]
+ },
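`save_quantized` still writes AutoAWQ's legacy `quant_config.json` (hence the deprecation warning two cells up), while plain `transformers` reads a `quantization_config` block from `config.json`. The cell above bridges the two by serializing an `AwqConfig` into the pushed config, so the repo later loads through `AutoModelForCausalLM` directly. As a sketch (exact keys come from `AwqConfig`'s serialization, not from this notebook), the pushed `config.json` gains roughly:

```python
# Approximate shape of the "quantization_config" entry added to config.json.
quantization_config = {
    "quant_method": "awq",
    "bits": 4,
    "group_size": 128,
    "zero_point": True,
    "backend": "autoawq",
    "version": "gemm",
}
```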
+ {
+ "cell_type": "code",
+ "execution_count": 16,
+ "id": "846835fa",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "from huggingface_hub import HfApi\n",
+ "\n",
+ "api = HfApi()"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 17,
+ "id": "f8c2bef7",
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "application/vnd.jupyter.widget-view+json": {
+ "model_id": "1761caf19c82498db586096cf2fac148",
+ "version_major": 2,
+ "version_minor": 0
+ },
+ "text/plain": [
+ "pytorch_model.bin: 0%| | 0.00/7.25G [00:00<?, ?B/s]"
+ ]
+ },
+ "metadata": {},
+ "output_type": "display_data"
+ },
+ {
+ "data": {
+ "text/plain": [
+ "'https://huggingface.co/mesolitica/malaysian-llama2-13b-32k-instructions-AWQ/blob/main/pytorch_model.bin'"
+ ]
+ },
+ "execution_count": 17,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "api.upload_file(\n",
+ " path_or_fileobj='malaysian-llama2-13b-32k-instructions-awq/pytorch_model.bin',\n",
+ " path_in_repo=\"pytorch_model.bin\",\n",
+ " repo_id='mesolitica/malaysian-llama2-13b-32k-instructions-AWQ',\n",
+ " repo_type=\"model\",\n",
+ ")"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 18,
+ "id": "b6b0f30f",
+ "metadata": {
+ "scrolled": true
+ },
+ "outputs": [
+ {
+ "data": {
+ "application/vnd.jupyter.widget-view+json": {
+ "model_id": "e36d6b5c58dc4e129719dd9ed4f18372",
+ "version_major": 2,
+ "version_minor": 0
+ },
+ "text/plain": [
+ "Downloading (…)lve/main/config.json: 0%| | 0.00/871 [00:00<?, ?B/s]"
+ ]
+ },
+ "metadata": {},
+ "output_type": "display_data"
+ },
+ {
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ "You have loaded an AWQ model on CPU and have a CUDA device available, make sure to set your model on a GPU device in order to run your model.\n"
+ ]
+ },
+ {
+ "data": {
+ "application/vnd.jupyter.widget-view+json": {
+ "model_id": "c6162d87ba45442e8809f94ab7d85a8b",
+ "version_major": 2,
+ "version_minor": 0
+ },
+ "text/plain": [
+ "Downloading pytorch_model.bin: 0%| | 0.00/7.25G [00:00<?, ?B/s]"
+ ]
+ },
+ "metadata": {},
+ "output_type": "display_data"
+ }
+ ],
+ "source": [
+ "quantized_model = AutoModelForCausalLM.from_pretrained('mesolitica/malaysian-llama2-13b-32k-instructions-AWQ')\n",
+ "_ = quantized_model.cuda()"
+ ]
+ },
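The stderr warning above appears because the AWQ model is materialized on CPU before `.cuda()` moves it. A minimal variant that avoids the round trip, assuming `accelerate` is installed so `device_map` is honoured, loads the weights straight onto the GPU:

```python
# Sketch: place the AWQ weights on the GPU at load time (requires accelerate).
quantized_model = AutoModelForCausalLM.from_pretrained(
    'mesolitica/malaysian-llama2-13b-32k-instructions-AWQ',
    device_map='cuda:0',
)
```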
+ {
+ "cell_type": "code",
+ "execution_count": 19,
+ "id": "698cd4c9",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "def parse_llama_chat(messages):\n",
+ "\n",
+ " system = messages[0]['content']\n",
+ " user_query = messages[-1]['content']\n",
+ "\n",
+ " users, assistants = [], []\n",
+ " for q in messages[1:-1]:\n",
+ " if q['role'] == 'user':\n",
+ " users.append(q['content'])\n",
+ " elif q['role'] == 'assistant':\n",
+ " assistants.append(q['content'])\n",
+ "\n",
+ " texts = [f'<s>[INST] <<SYS>>\\n{system}\\n<</SYS>>\\n\\n']\n",
+ " for u, a in zip(users, assistants):\n",
+ " texts.append(f'{u.strip()} [/INST] {a.strip()} </s><s>[INST] ')\n",
+ " texts.append(f'{user_query.strip()} [/INST]')\n",
+ " prompt = ''.join(texts).strip()\n",
+ " return prompt"
+ ]
+ },
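`parse_llama_chat` reproduces the standard Llama-2 chat template: the system prompt ("you are an AI that can answer any question", in Malay) sits in a `<<SYS>>` block inside the first `[INST]`, each completed user/assistant turn is closed with `</s><s>[INST] `, and the prompt ends at `[/INST]` so the model continues with the assistant's reply. A quick illustration with a hypothetical two-turn history:

```python
# Hypothetical multi-turn history showing the template parse_llama_chat builds.
messages = [
    {'role': 'system', 'content': 'awak adalah AI yang mampu jawab segala soalan'},
    {'role': 'user', 'content': 'hai'},
    {'role': 'assistant', 'content': 'hai, apa yang boleh saya bantu?'},
    {'role': 'user', 'content': 'kwsp tu apa'},
]
print(parse_llama_chat(messages))
# <s>[INST] <<SYS>>
# awak adalah AI yang mampu jawab segala soalan
# <</SYS>>
#
# hai [/INST] hai, apa yang boleh saya bantu? </s><s>[INST] kwsp tu apa [/INST]
```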
+ {
+ "cell_type": "code",
+ "execution_count": 20,
+ "id": "63315893",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "messages = [\n",
+ " {'role': 'system', 'content': 'awak adalah AI yang mampu jawab segala soalan'},\n",
+ " {'role': 'user', 'content': 'kwsp tu apa'}\n",
+ "]\n",
+ "prompt = parse_llama_chat(messages)\n",
+ "inputs = tokenizer([prompt], return_tensors='pt', add_special_tokens=False).to('cuda')"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 21,
+ "id": "8a3c15d8",
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "CPU times: user 1.04 s, sys: 15.7 ms, total: 1.06 s\n",
+ "Wall time: 1.06 s\n"
+ ]
+ },
+ {
+ "data": {
+ "text/plain": [
+ "'<s> [INST] <<SYS>>\\nawak adalah AI yang mampu jawab segala soalan\\n<</SYS>>\\n\\nkwsp tu apa [/INST] KWSP merupakan singkatan kepada Kumpulan Wang Simpanan Pekerja. </s>'"
+ ]
+ },
+ "execution_count": 21,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "%%time\n",
+ "\n",
+ "generate_kwargs = dict(\n",
+ " inputs,\n",
+ " max_new_tokens=1024,\n",
+ " top_p=0.95,\n",
+ " top_k=50,\n",
+ " temperature=0.9,\n",
+ " do_sample=True,\n",
+ " num_beams=1,\n",
+ ")\n",
+ "r = quantized_model.generate(**generate_kwargs)\n",
+ "tokenizer.decode(r[0])"
+ ]
+ },
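The cell above samples with `top_p`/`top_k`/`temperature` and decodes only after generation finishes; "kwsp tu apa" asks what KWSP is, and the quantized model correctly expands the acronym (Kumpulan Wang Simpanan Pekerja, the Employees Provident Fund). An optional variant, assuming the same `inputs` and model, streams tokens as they are produced via transformers' `TextStreamer`:

```python
# Optional: print tokens as they are generated instead of decoding at the end.
from transformers import TextStreamer

streamer = TextStreamer(tokenizer, skip_prompt=True)
_ = quantized_model.generate(
    **inputs,
    max_new_tokens=1024,
    do_sample=True,
    top_p=0.95,
    top_k=50,
    temperature=0.9,
    streamer=streamer,
)
```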
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "d73d43a0",
+ "metadata": {},
+ "outputs": [],
+ "source": []
+ }
+ ],
+ "metadata": {
+ "kernelspec": {
+ "display_name": "Python 3 (ipykernel)",
+ "language": "python",
+ "name": "python3"
+ },
+ "language_info": {
+ "codemirror_mode": {
+ "name": "ipython",
+ "version": 3
+ },
+ "file_extension": ".py",
+ "mimetype": "text/x-python",
+ "name": "python",
+ "nbconvert_exporter": "python",
+ "pygments_lexer": "ipython3",
+ "version": "3.10.12"
+ }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 5
+ }