from transformers import AutoModelForCausalLM, AutoTokenizer, GPTQConfig, AwqConfig
from huggingface_hub import notebook_login, HfApi
from peft import PeftModel, PeftConfig
from optimum.gptq import GPTQQuantizer, load_quantized_model
from accelerate import Accelerator
import torch
model_id = "mistralai/Mistral-Nemo-Instruct-2407"
quant_dataset = "c4"
gptq_repo = "Granther/Mistral-Nemo-Instruct-GPTQ"
awq_repo = ""

# GPTQ output directory
gptq_dir = "gptq/"
# AWQ output directory
awq_dir = "awq/"
accelerator = Accelerator()

tokenizer = AutoTokenizer.from_pretrained(model_id)
model = AutoModelForCausalLM.from_pretrained(model_id, ignore_mismatched_sizes=True)
# prepare() places the model on the available device(s); the tokenizer is not a torch
# module, so it passes through unchanged
model, tokenizer = accelerator.prepare(model, tokenizer)
quantizer = GPTQQuantizer(bits=4,
                          dataset=quant_dataset,
                          group_size=64,        # number of weights that share one set of quantization parameters
                          desc_act=True,        # activation-order quantization: better perplexity, slower inference
                          sym=True,             # symmetric quantization
                          true_sequential=True, # quantize the modules inside each block sequentially
                          #block_name_to_quantize="layers.0",
                          tokenizer=tokenizer)
print("Made it to quant_model") | |
quantized_model = quantizer.quantize_model(model, tokenizer=tokenizer) | |
tokenizer.save_pretrained(gptq_dir) | |
#gptq_config.save_pretrained(gptq_dir) | |
quantized_model.save_pretrained(gptq_dir) |
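
# --- Sanity-check the export (sketch) ---
# load_quantized_model is imported above but never used; the snippet below is one way to
# reload the GPTQ weights from gptq_dir and confirm they generate, following the pattern
# in optimum's GPTQ usage guide. It assumes the GPTQ config was written alongside the
# weights (optimum's quantizer.save(model, gptq_dir) is the documented way to ensure this).
# Treat it as an illustration, not part of the original workflow.
from accelerate import init_empty_weights

with init_empty_weights():
    empty_model = AutoModelForCausalLM.from_pretrained(model_id, torch_dtype=torch.float16)
empty_model.tie_weights()

reloaded = load_quantized_model(empty_model, save_folder=gptq_dir, device_map="auto")

device = next(reloaded.parameters()).device
inputs = tokenizer("Hello, my name is", return_tensors="pt").to(device)
print(tokenizer.decode(reloaded.generate(**inputs, max_new_tokens=20)[0]))

# Optional: push the exported folder to the hub repo defined above
# (assumes the repo exists and notebook_login() has been run)
#HfApi().upload_folder(folder_path=gptq_dir, repo_id=gptq_repo)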
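
# --- AWQ pass (sketch, not in the original script) ---
# awq_dir and awq_repo are defined above but never used. A common way to produce AWQ
# weights is the separate autoawq package (pip install autoawq); transformers' AwqConfig
# only covers loading such checkpoints. The quant_config keys and values below are
# assumptions based on autoawq's documented defaults.
from awq import AutoAWQForCausalLM

awq_model = AutoAWQForCausalLM.from_pretrained(model_id, low_cpu_mem_usage=True)
awq_tokenizer = AutoTokenizer.from_pretrained(model_id)

awq_quant_config = {"zero_point": True, "q_group_size": 128, "w_bit": 4, "version": "GEMM"}
awq_model.quantize(awq_tokenizer, quant_config=awq_quant_config)

awq_model.save_quantized(awq_dir)
awq_tokenizer.save_pretrained(awq_dir)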