"""Quantize the decoder layers of a LLaMA-style causal LM with QuIP.

For every decoder layer the q/k/v, o, up/gate and down projections are
quantized from precomputed Hessians and the resulting artifacts are written
under ``--save_path``.  ``main`` then reloads the quantized layers, reports a
per-layer activation error and the devset perplexity of the original and the
quantized model.

NOTE(review): the text this file was recovered from was collapsed onto a few
lines and had a number of tokens missing (artifact file suffixes, the weight
scale expressions, the ``glog.info`` / ``torch.save`` call heads, several
``.to(...)`` operands).  Every spot that had to be filled in is tagged
"restored" below; confirm each one against the Hessian producer and the
checkpoint loader before relying on it.
"""
import argparse
import math
import datetime
import time
import os
import gc
from tqdm import tqdm
import copy

# Set before torch is imported (ordering preserved from the original).
os.environ['PYTORCH_CUDA_ALLOC_CONF'] = 'max_split_size_mb:512'

import torch
import torch.multiprocessing as mp
from torch import nn, optim
from transformers import AutoTokenizer, AutoModelForCausalLM
from datasets import load_dataset

from lib import codebook, utils
from lib.algo import quip, preprocess, outlier_channel_split as ocs

import glog

parser = argparse.ArgumentParser()
parser.add_argument('--seed', default=0, type=int)
parser.add_argument('--num_cpu_threads', default=8, type=int)
parser.add_argument('--batch_size', default=8, type=int)
parser.add_argument('--devset_size', default=64, type=int)
parser.add_argument('--ctx_size', default=2048, type=int)
parser.add_argument('--save_path', type=str)
parser.add_argument('--hessian_path', type=str)
parser.add_argument('--base_model', default='meta-llama/Llama-2-70b-hf', type=str)
parser.add_argument('--sigma_reg', default=1e-2, type=float)
parser.add_argument('--sigma_reg2', default=1e-2, type=float)
parser.add_argument('--incoh_mode', default='had', type=str, choices=['had', 'kron'])
parser.add_argument('--lora_rank', default=0, type=int, help='if <=0 then turned off')
parser.add_argument('--scale_override', default=-1, type=float)
parser.add_argument('--codebook', default='D4', type=str)
parser.add_argument('--quip_tune_iters', default=10, type=int)
parser.add_argument('--remove_mean', action='store_true')
parser.add_argument('--outlier_channel_split', action='store_true')
parser.add_argument('--ocs_down_size', default=2**15, type=int)
parser.add_argument('--use_fp64', action='store_true')
parser.add_argument('--full_svd', action='store_true')
parser.add_argument('--no_use_buffered', action='store_true')
parser.add_argument('--q_buffer_size', default=2, type=int)
parser.add_argument('--rescale_WH', action='store_true')
parser.add_argument('--sample_proc', default=1, type=int)


def _weight_scale(weight, dtype_):
    """Return the scalar scale a weight matrix is divided by before quantization.

    NOTE(review): restored.  The defining expression was missing from the
    source; the root-mean-square of the weight is assumed here.  The only
    things the surrounding code establishes are that the scale is a tensor
    (``.cpu()`` is called on it) and that weights are divided by it before
    quantization and multiplied by it afterwards.
    """
    return weight.to(dtype_).square().mean().sqrt().to(dtype_)


def _load_hessian(path):
    """Load a saved Hessian record from ``path`` and return ``(H, mu, n)``."""
    H_data = torch.load(path, map_location=torch.device('cpu'))
    H = utils.flat_to_sym(H_data['flatH'], H_data['n'])
    return H, H_data['mu'], H_data['n']


def quantize_kqv(layer, idx, cb, args, device='cpu', check_only=False):
    """Quantize the stacked q/k/v projection weights of ``layer`` in place.

    If the artifact for this layer already exists it is loaded instead of
    recomputed; with ``check_only=True`` the function returns immediately in
    that case without touching the layer.

    NOTE(review): ``mu`` is only bound when the weights are quantized from
    scratch, so ``--remove_mean`` combined with a cached artifact raises
    ``NameError``.  Behaviour left as in the original.
    """
    dtype_ = torch.float64 if args.use_fp64 else torch.float32
    # NOTE(review): restored suffix; the visible source used the bare
    # '{save_path}/{idx}' for all four projections, which would collide.
    hatw_path = f'{args.save_path}/{idx}_qkv.pt'
    W_q = layer.self_attn.q_proj.weight
    W_k = layer.self_attn.k_proj.weight
    W_v = layer.self_attn.v_proj.weight
    W_q_scale = _weight_scale(W_q, dtype_)  # restored
    W_k_scale = _weight_scale(W_k, dtype_)  # restored
    W_v_scale = _weight_scale(W_v, dtype_)  # restored

    if os.path.exists(hatw_path):
        if check_only:
            return
        hatW = utils.load_quip(hatw_path, cb, args, device)
        glog.info(f'loaded saved hatW from {hatw_path}')
    else:
        # NOTE(review): restored suffix on the Hessian file name.
        H, mu, n = _load_hessian(f'{args.hessian_path}/{idx}_qkv.pt')
        # NOTE(review): restored operands (W.to(dtype_)).
        W_qkv = torch.vstack((
            W_q.to(dtype_) / W_q_scale,
            W_k.to(dtype_) / W_k_scale,
            W_v.to(dtype_) / W_v_scale,
        )).to(dtype_)
        H, mu = preprocess.basic_preprocess(H, mu, n, args)
        hatW, attr = quip.quantize(H, W_qkv, args.lora_rank, cb, args, device)
        attr.update({
            'W_q_scale': W_q_scale.cpu(),
            'W_k_scale': W_k_scale.cpu(),
            'W_v_scale': W_v_scale.cpu(),
        })
        torch.save(attr, hatw_path)  # restored call head
        # NOTE(review): third argument restored as H.to(dtype_).
        utils.show_metrics(hatW, W_qkv, H.to(dtype_), f'layer {idx} qkv')
        utils.clean()

    # hatW holds q, k and v stacked row-wise, in that order.
    q_end = W_q.shape[0]
    k_end = q_end + W_k.shape[0]
    v_end = k_end + W_v.shape[0]
    W_q_next = (hatW[0:q_end, :] * W_q_scale).half()
    W_k_next = (hatW[q_end:k_end, :] * W_k_scale).half()
    W_v_next = (hatW[k_end:v_end, :] * W_v_scale).half()

    if args.remove_mean:
        # NOTE(review): restored operands (W.to(dtype_) / W_next.to(dtype_)).
        layer.self_attn.q_proj.bias = nn.Parameter(
            (W_q.to(dtype_) @ mu - W_q_next.to(dtype_) @ mu).half())
        layer.self_attn.k_proj.bias = nn.Parameter(
            (W_k.to(dtype_) @ mu - W_k_next.to(dtype_) @ mu).half())
        layer.self_attn.v_proj.bias = nn.Parameter(
            (W_v.to(dtype_) @ mu - W_v_next.to(dtype_) @ mu).half())

    W_q.copy_(W_q_next)
    W_k.copy_(W_k_next)
    W_v.copy_(W_v_next)


def quantize_o(layer, idx, cb, args, device='cpu', check_only=False):
    """Quantize the attention output projection weight of ``layer`` in place.

    Cached-artifact and ``check_only`` handling match ``quantize_kqv``.

    NOTE(review): ``--remove_mean`` with a cached artifact hits an unbound
    ``mu`` (see the else-branch); left as in the original.
    """
    dtype_ = torch.float64 if args.use_fp64 else torch.float32
    hatw_path = f'{args.save_path}/{idx}_o.pt'  # restored suffix
    W_o = layer.self_attn.o_proj.weight
    W_o_scale = _weight_scale(W_o, dtype_)  # restored

    if os.path.exists(hatw_path):
        if check_only:
            return
        hatW = utils.load_quip(hatw_path, cb, args, device)
        glog.info(f'loading saved hatW from {hatw_path}')
    else:
        H, mu, n = _load_hessian(f'{args.hessian_path}/{idx}_o.pt')  # restored suffix
        W_orig = W_o.to(dtype_) / W_o_scale  # restored operand
        H, mu = preprocess.basic_preprocess(H, mu, n, args)
        hatW, attr = quip.quantize(H, W_orig, args.lora_rank, cb, args, device)
        attr.update({'W_o_scale': W_o_scale})
        torch.save(attr, hatw_path)  # restored call head
        utils.show_metrics(hatW, W_orig, H.to(dtype_), f'layer {idx} o')  # restored 3rd arg
        utils.clean()

    W_o_next = (hatW * W_o_scale).half()

    if args.remove_mean:
        layer.self_attn.o_proj.bias = nn.Parameter(
            (W_o.to(dtype_) @ mu - W_o_next.to(dtype_) @ mu).half())  # restored operands

    W_o.copy_(W_o_next)


def quantize_up(layer, idx, cb, args, device='cpu', check_only=False):
    """Quantize the stacked MLP up/gate projection weights of ``layer`` in place.

    Cached-artifact and ``check_only`` handling match ``quantize_kqv``.

    NOTE(review): ``--remove_mean`` with a cached artifact hits an unbound
    ``mu`` (see the else-branch); left as in the original.
    """
    dtype_ = torch.float64 if args.use_fp64 else torch.float32
    hatw_path = f'{args.save_path}/{idx}_up.pt'  # restored suffix
    W_up = layer.mlp.up_proj.weight
    W_gate = layer.mlp.gate_proj.weight
    W_up_scale = _weight_scale(W_up, dtype_)  # restored
    W_gate_scale = _weight_scale(W_gate, dtype_)  # restored

    if os.path.exists(hatw_path):
        if check_only:
            return
        glog.info(f'loading saved hatW from {hatw_path}')
        hatW = utils.load_quip(hatw_path, cb, args, device)
    else:
        H, mu, n = _load_hessian(f'{args.hessian_path}/{idx}_up.pt')  # restored suffix
        # NOTE(review): restored operands (W.to(dtype_)).
        W_upgate = torch.vstack((
            W_up.to(dtype_) / W_up_scale,
            W_gate.to(dtype_) / W_gate_scale,
        )).to(dtype_)
        H, mu = preprocess.basic_preprocess(H, mu, n, args)
        hatW, attr = quip.quantize(H, W_upgate, args.lora_rank, cb, args, device)
        attr.update({
            'W_up_scale': W_up_scale,
            'W_gate_scale': W_gate_scale,
        })
        torch.save(attr, hatw_path)  # restored call head
        utils.show_metrics(hatW, W_upgate, H.to(dtype_), f'layer {idx} up')  # restored 3rd arg
        utils.clean()

    # hatW holds up and gate stacked row-wise, in that order.
    up_end = W_up.shape[0]
    gate_end = up_end + W_gate.shape[0]
    W_up_next = (hatW[0:up_end, :] * W_up_scale).half()
    W_gate_next = (hatW[up_end:gate_end, :] * W_gate_scale).half()

    if args.remove_mean:
        # NOTE(review): restored operands.
        layer.mlp.up_proj.bias = nn.Parameter(
            (W_up.to(dtype_) @ mu - W_up_next.to(dtype_) @ mu).half())
        layer.mlp.gate_proj.bias = nn.Parameter(
            (W_gate.to(dtype_) @ mu - W_gate_next.to(dtype_) @ mu).half())

    W_up.copy_(W_up_next)
    W_gate.copy_(W_gate_next)


def quantize_down(layer, idx, cb, args, device='cpu', check_only=False):
    """Quantize the MLP down projection weight of ``layer`` in place.

    With ``--outlier_channel_split`` the weight, Hessian and mean are first
    split to ``args.ocs_down_size`` channels, and the quantized weight is fused
    back with ``ocs.fuse_W`` before being copied into the layer.
    Cached-artifact and ``check_only`` handling match ``quantize_kqv``.

    NOTE(review): ``--remove_mean`` with a cached artifact hits an unbound
    ``mu`` (see the else-branch); left as in the original.
    """
    dtype_ = torch.float64 if args.use_fp64 else torch.float32
    hatw_path = f'{args.save_path}/{idx}_down.pt'  # restored suffix
    W_down = layer.mlp.down_proj.weight
    # Computed on the unsplit weight, before any outlier channel split.
    W_down_scale = _weight_scale(W_down, dtype_)  # restored

    if os.path.exists(hatw_path):
        if check_only:
            return
        glog.info(f'loading saved hatW from {hatw_path}')
        hatW = utils.load_quip(hatw_path, cb, args, device)
        if args.outlier_channel_split:
            extra_inds = torch.load(hatw_path)['ocs_extra_inds']
    else:
        H, mu, n = _load_hessian(f'{args.hessian_path}/{idx}_down.pt')  # restored suffix
        if args.outlier_channel_split:
            # outlier channel split to next power of two
            glog.info(f'outlier channel splitting to {args.ocs_down_size}')
            W_down, H, mu, extra_inds, dupe_inds = ocs.outlier_channel_split(
                W_down, H, mu, args.ocs_down_size)
            n = args.ocs_down_size
            utils.clean()
        W_orig = W_down.to(dtype_) / W_down_scale  # restored operand
        H, mu = preprocess.basic_preprocess(H, mu, n, args)
        hatW, attr = quip.quantize(H, W_orig, args.lora_rank, cb, args, device)
        attr.update({'W_down_scale': W_down_scale})
        if args.outlier_channel_split:
            attr['ocs_extra_inds'] = extra_inds
            attr['ocs_dupe_inds'] = dupe_inds
        torch.save(attr, hatw_path)  # restored call head
        utils.show_metrics(hatW, W_orig, H.to(dtype_), f'layer {idx} down')  # restored 3rd arg
        utils.clean()

    W_down_next = (hatW * W_down_scale).half()

    if args.remove_mean:
        layer.mlp.down_proj.bias = nn.Parameter(
            (W_down.to(dtype_) @ mu - W_down_next.to(dtype_) @ mu).half())  # restored operands

    if args.outlier_channel_split:
        # fuse back outlier channel split
        W_down_next = ocs.fuse_W(W_down_next, extra_inds)

    # W_down may have been rebound to the split weight above, so write through
    # the layer attribute rather than the local name.
    layer.mlp.down_proj.weight.copy_(W_down_next)


def quantize_layer(layer, idx, cb, args, device='cpu', return_layer=False):
    """Quantize all four projection groups of one decoder layer.

    check_only=not return_layer -> If we are not returning the layer just check
    if it has been quantized already. Otherwise, load it for returning.

    Returns ``layer`` when ``return_layer`` is true, otherwise ``None``.
    """
    torch.manual_seed(idx)
    torch.set_grad_enabled(False)

    utils.clean()
    for quantize_part in (quantize_kqv, quantize_o, quantize_up, quantize_down):
        quantize_part(layer, idx, cb, args, device, check_only=not return_layer)
        utils.clean()

    glog.info(f'finished layer {idx}')
    if return_layer:
        return layer


def quantize_layer_queue(in_q, cb, args, device):
    """Worker loop: quantize ``(layer, idx)`` items from ``in_q`` until ``None`` arrives."""
    # iter(callable, sentinel) stops as soon as in_q.get() returns None.
    for next_item in iter(in_q.get, None):
        quantize_layer(*next_item, cb, args, device, False)


def _forward_layer_batched(layer, emb, args, device, position_ids, attention_mask):
    """Run ``layer`` over ``emb`` one batch at a time, overwriting ``emb`` in place."""
    batch_size = args.batch_size
    for j in range(args.devset_size // batch_size):
        rows = slice(batch_size * j, batch_size * (j + 1))
        emb[rows] = layer(
            emb[rows].to(device),
            position_ids=position_ids,
            attention_mask=attention_mask,
            use_cache=False,
            output_attentions=False)[0].cpu()


def _devset_perplexity(hidden, devset, lm_head, norm, vocab_size, args, device, dtype_, desc):
    """Return exp(mean next-token cross-entropy) over the devset batches.

    ``hidden`` holds the final decoder hidden states for ``devset``; they are
    passed through ``norm`` and ``lm_head`` batch by batch.

    Raises:
        ValueError: if ``devset_size // batch_size`` is zero.
    """
    batch_size = args.batch_size
    num_batches = args.devset_size // batch_size
    if num_batches == 0:
        raise ValueError('devset_size must be at least batch_size to compute perplexity')

    loss_fct = nn.CrossEntropyLoss().to(device)
    acc = 0.0
    for i in tqdm(range(num_batches), desc=desc):
        rows = slice(batch_size * i, batch_size * (i + 1))
        shift_logits = lm_head(
            norm(hidden[rows].to(device).to(dtype_)))[..., :-1, :].contiguous().view(
                -1, vocab_size)
        shift_labels = devset[rows, 1:].contiguous().view(-1).to(device)
        acc += loss_fct(shift_logits, shift_labels)
    # Average over the number of batches actually accumulated; the original
    # divided by num_batches + 1, which understated the mean loss.
    return (acc / num_batches).exp()


def main(args):
    """Quantize every decoder layer, then report activation error and perplexity."""
    dtype_ = torch.float64 if args.use_fp64 else torch.float32

    cb = codebook.get_codebook(args.codebook)

    model = AutoModelForCausalLM.from_pretrained(args.base_model,
                                                 torch_dtype='auto',
                                                 low_cpu_mem_usage=True)

    # save configs
    all_config = {'quant_args': args, 'model_config': model.config}
    all_config['model_config'].update({
        'quip_params': {
            'outlier_channel_split': args.outlier_channel_split,
            'lora_rank': args.lora_rank,
            'rescale_WH': args.rescale_WH,
            'codebook': args.codebook,
            'codebook_version': cb.version,
            'codesz': cb.codesz,
            'idx_dtype': str(cb.idx_dtype),
            'fused': True,
            'packsz': cb.packsz,
        }
    })
    if args.outlier_channel_split:
        all_config['model_config'].quip_params['ocs_down_size'] = args.ocs_down_size
    # NOTE(review): restored call head and file name ('config.pt'); the visible
    # source only kept ", os.path.join(args.save_path, ''))".
    torch.save(all_config, os.path.join(args.save_path, 'config.pt'))

    tokenizer = AutoTokenizer.from_pretrained(args.base_model)
    tokenizer.pad_token = tokenizer.eos_token
    glog.info('loaded model')

    dataset = load_dataset('togethercomputer/RedPajama-Data-1T-Sample', split='train')
    devset = utils.sample_devset(dataset, tokenizer, args.devset_size, args.ctx_size,
                                 args.sample_proc)
    glog.info('loaded dataset and devset')

    # Reduce cpu memory consumption at the expense of latency. Tune as needed
    nproc = torch.cuda.device_count()

    if nproc > 1:
        # If we only have one process run the serial version
        # and calculate activation errors too
        layer_q = mp.Queue(maxsize=args.q_buffer_size)

        quantize_procs = []
        for i in range(nproc):
            p = mp.Process(target=quantize_layer_queue, args=(layer_q, cb, args, i))
            p.start()
            quantize_procs.append(p)
        for layer_idx, layer in enumerate(model.model.layers):
            layer_q.put((copy.deepcopy(layer), layer_idx))
        # One sentinel per worker so that every worker loop terminates.
        for _ in quantize_procs:
            layer_q.put(None)
        for p in quantize_procs:
            p.join()

        # NOTE(review): nesting of this message could not be recovered from the
        # collapsed source; it is placed inside the multi-process branch.
        glog.info('done quantizing')

    # do the rest of the stuff on gpu 0
    device = 0

    # load quantized layers from disk and calculate activation errors
    orig_emb = model.model.embed_tokens(devset)
    quant_emb = orig_emb.clone()

    position_ids = torch.arange(args.ctx_size, dtype=torch.int32)[None, :].to(device) + \
        torch.zeros(args.batch_size, args.ctx_size, dtype=torch.int32).to(device)

    # Only models whose config exposes sliding_window get the extra keyword.
    mask_kwargs = {}
    if hasattr(model.config, 'sliding_window'):
        mask_kwargs['sliding_window'] = model.config.sliding_window
    attention_mask = model.model._prepare_decoder_attention_mask(
        torch.ones(args.batch_size, args.ctx_size, dtype=torch.bool),
        (args.batch_size, args.ctx_size),
        quant_emb[0:args.batch_size],
        0,
        **mask_kwargs).to(device)

    layers = model.model.layers
    for i in range(len(layers)):
        # Advance the reference activations through the unquantized layer.
        layers[i] = layers[i].to(device)
        _forward_layer_batched(layers[i], orig_emb, args, device, position_ids, attention_mask)
        layers[i] = layers[i].cpu()

        # Quantize (or load the cached artifacts for) the layer, then advance
        # the quantized-model activations through it.
        layers[i] = quantize_layer(layers[i], i, cb, args, device=device,
                                   return_layer=True).to(device)
        _forward_layer_batched(layers[i], quant_emb, args, device, position_ids, attention_mask)
        layers[i] = layers[i].cpu()
        layers[i] = None  # drop the layer once both activation sets have passed it

        # NOTE(review): restored.  Only "( - / ( -, 1))).square().sum()" was
        # left in the source; reconstructed as squared error relative to the
        # variance of the reference activations.
        act_error = (quant_emb.to(dtype_) - orig_emb.to(dtype_)).square().sum() / \
            (orig_emb.to(dtype_) - orig_emb.to(dtype_).mean((0, 1))).square().sum()
        glog.info(f'layer {i} activation error {act_error}')

    glog.info('calculating perplexity on devset')

    # NOTE(review): restored; the right-hand sides were missing.  The hidden
    # states are moved to `device` and cast to `dtype_` before being fed to
    # these modules, so both are placed there as well.
    lm_head = model.lm_head.to(dtype_).to(device)
    norm = model.model.norm.to(dtype_).to(device)

    perplexity = _devset_perplexity(orig_emb, devset, lm_head, norm,
                                    model.config.vocab_size, args, device, dtype_,
                                    desc='original model perplexity')
    glog.info(f'original model perplexity: {perplexity}')

    perplexity = _devset_perplexity(quant_emb, devset, lm_head, norm,
                                    model.config.vocab_size, args, device, dtype_,
                                    desc='quantized model perplexity')
    glog.info(f'quantized model perplexity: {perplexity}')


if __name__ == '__main__':
    torch.set_grad_enabled(False)
    mp.set_start_method('spawn')
    args = parser.parse_args()
    torch.set_num_threads(args.num_cpu_threads)
    torch.manual_seed(args.seed)
    os.makedirs(args.save_path, exist_ok=True)
    main(args)