from transformers import AutoModelForCausalLM, AutoTokenizer, GPTQConfig, AwqConfig
from huggingface_hub import notebook_login, HfApi
from peft import PeftModel, PeftConfig
from optimum.gptq import GPTQQuantizer, load_quantized_model
from accelerate import Accelerator
import torch
model_id = "mistralai/Mistral-Nemo-Instruct-2407"
quant_dataset = "c4"
gptq_repo = "Granther/Mistral-Nemo-Instruct-GPTQ"
awq_repo = ""
# GPTQ
gptq_dir = "gptq/"
# AWQ
awq_dir = "awq/"
accelerator = Accelerator()

tokenizer = AutoTokenizer.from_pretrained(model_id)
# Load the full-precision base model; quantization happens below
model = AutoModelForCausalLM.from_pretrained(model_id, ignore_mismatched_sizes=True)
# prepare() moves the model onto the available device; the tokenizer passes through unchanged
model, tokenizer = accelerator.prepare(model, tokenizer)
quantizer = GPTQQuantizer(
    bits=4,
    dataset=quant_dataset,  # calibration dataset ("c4")
    group_size=64,          # number of weights sharing one set of quantization parameters
    desc_act=True,          # activation-order quantization: better perplexity, slower inference
    sym=True,               # symmetric quantization
    true_sequential=True,   # quantize the sub-modules inside each block one after another
    # block_name_to_quantize="layers.0",
    tokenizer=tokenizer,
)
print("Made it to quant_model")
quantized_model = quantizer.quantize_model(model, tokenizer=tokenizer)
tokenizer.save_pretrained(gptq_dir)
#gptq_config.save_pretrained(gptq_dir)
quantized_model.save_pretrained(gptq_dir)
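
# --- Optional follow-up (sketch, not part of the original run) -------------------
# The imports of load_quantized_model, notebook_login and HfApi, plus the gptq_repo
# id above, suggest the checkpoint is meant to be reloaded and uploaded afterwards.
# Per the optimum docs, a saved GPTQ model can be reloaded into an empty-weight copy
# of the base model, and the saved folder pushed to the Hub, roughly like this:
#
#   from accelerate import init_empty_weights
#   with init_empty_weights():
#       empty_model = AutoModelForCausalLM.from_pretrained(model_id, torch_dtype=torch.float16)
#   empty_model.tie_weights()
#   reloaded = load_quantized_model(empty_model, save_folder=gptq_dir, device_map="auto")
#
#   notebook_login()
#   api = HfApi()
#   api.create_repo(gptq_repo, exist_ok=True)
#   api.upload_folder(folder_path=gptq_dir, repo_id=gptq_repo, repo_type="model")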