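# GPTQ quantization of LLaMA models: capture calibration activations, quantize the
# decoder layers sequentially, optionally evaluate perplexity, and pack/save the
# quantized checkpoint.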
import argparse
import time

import numpy as np
import torch
import torch.nn as nn

import quant
from gptq import GPTQ
from datautils import get_loaders
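
# Recursively collect all Conv2d/Linear submodules of `module`, keyed by their
# dotted module path (e.g. 'self_attn.q_proj').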
def find_layers(module, layers=[nn.Conv2d, nn.Linear], name=''):
    if type(module) in layers:
        return {name: module}
    res = {}
    for name1, child in module.named_children():
        res.update(find_layers(child, layers=layers, name=name + '.' + name1 if name != '' else name1))
    return res
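
# Load the HF LLaMA checkpoint in fp16, skipping the random weight initialization
# (the weights are overwritten by the checkpoint anyway) to speed up loading.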
def get_llama(model):

    def skip(*args, **kwargs):
        pass

    torch.nn.init.kaiming_uniform_ = skip
    torch.nn.init.uniform_ = skip
    torch.nn.init.normal_ = skip
    from transformers import LlamaForCausalLM
    model = LlamaForCausalLM.from_pretrained(model, torch_dtype=torch.float16)
    model.seqlen = 2048
    return model
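
# GPTQ quantization proper: capture layer-0 inputs from the calibration dataloader,
# then quantize each decoder layer in turn, feeding the quantized outputs forward
# as inputs to the next layer.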
@torch.no_grad()  # calibration only, no gradients needed
def llama_sequential(model, dataloader, dev):
    print('Starting ...')

    use_cache = model.config.use_cache
    model.config.use_cache = False
    layers = model.model.layers

    model.model.embed_tokens = model.model.embed_tokens.to(dev)
    model.model.norm = model.model.norm.to(dev)
    layers[0] = layers[0].to(dev)

    dtype = next(iter(model.parameters())).dtype
    inps = torch.zeros((args.nsamples, model.seqlen, model.config.hidden_size), dtype=dtype, device=dev)
    cache = {'i': 0, 'attention_mask': None}
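
    # Catcher wraps the first decoder layer: it records the layer inputs (plus the
    # attention mask and position ids) for each calibration sample, then raises to
    # abort the rest of the forward pass.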
    class Catcher(nn.Module):

        def __init__(self, module):
            super().__init__()
            self.module = module

        def forward(self, inp, **kwargs):
            inps[cache['i']] = inp
            cache['i'] += 1
            cache['attention_mask'] = kwargs['attention_mask']
            cache['position_ids'] = kwargs['position_ids']
            raise ValueError
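
    # Run every calibration batch through the embeddings + layer 0 only, then move
    # those modules back to the CPU.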
    layers[0] = Catcher(layers[0])
    for batch in dataloader:
        try:
            model(batch[0].to(dev))
        except ValueError:
            pass
    layers[0] = layers[0].module

    layers[0] = layers[0].cpu()
    model.model.embed_tokens = model.model.embed_tokens.cpu()
    model.model.norm = model.model.norm.cpu()
    torch.cuda.empty_cache()
    outs = torch.zeros_like(inps)
    attention_mask = cache['attention_mask']
    position_ids = cache['position_ids']

    print('Ready.')

    quantizers = {}
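
    # Quantize one decoder layer at a time, moving only the current layer to the GPU.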
    for i in range(len(layers)):
        print(f'Quantizing layer {i+1}/{len(layers)}..')
        print('+------------------+--------------+------------+-----------+-------+')
        print('|       name       | weight_error | fp_inp_SNR | q_inp_SNR |  time |')
        print('+==================+==============+============+===========+=======+')

        layer = layers[i].to(dev)
        full = find_layers(layer)
        if args.true_sequential:
            sequential = [['self_attn.k_proj', 'self_attn.v_proj', 'self_attn.q_proj'], ['self_attn.o_proj'], ['mlp.up_proj', 'mlp.gate_proj'], ['mlp.down_proj']]
        else:
            sequential = [list(full.keys())]
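
        # Quantize each group of projections; with --true-sequential the groups
        # follow the order in which the layer actually uses them.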
        for names in sequential:
            subset = {n: full[n] for n in names}

            gptq = {}
            for name in subset:
                gptq[name] = GPTQ(subset[name])
                gptq[name].quantizer.configure(args.wbits, perchannel=True, mse=False)

            def add_batch(name):

                def tmp(_, inp, out):
                    gptq[name].add_batch(inp[0].data, out.data)

                return tmp

            handles = []
            for name in subset:
                handles.append(subset[name].register_forward_hook(add_batch(name)))
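
            # Run the calibration samples through the layer so the hooks accumulate
            # the per-module input statistics, then quantize each module.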
            for j in range(args.nsamples):
                outs[j] = layer(inps[j].unsqueeze(0), attention_mask=attention_mask, position_ids=position_ids)[0]
            for h in handles:
                h.remove()

            for name in subset:
                scale, zero, g_idx, error = gptq[name].fasterquant(percdamp=args.percdamp, groupsize=args.groupsize, actorder=args.act_order, name=name)
                quantizers['model.layers.%d.%s' % (i, name)] = (gptq[name].quantizer.cpu(), scale.cpu(), zero.cpu(), g_idx.cpu(), args.wbits, args.groupsize)
                gptq[name].free()
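
        # Re-run the calibration samples through the now-quantized layer so the next
        # layer is calibrated on quantized activations.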
        for j in range(args.nsamples):
            outs[j] = layer(inps[j].unsqueeze(0), attention_mask=attention_mask, position_ids=position_ids)[0]

        layers[i] = layer.cpu()
        del layer
        del gptq
        torch.cuda.empty_cache()

        inps, outs = outs, inps
        print('+------------------+--------------+------------+-----------+-------+')
        print('\n')

    model.config.use_cache = use_cache

    return quantizers
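
# Perplexity evaluation: run the test set through the model layer by layer
# (optionally applying a round-to-nearest baseline with --nearest) and report
# perplexity.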
@torch.no_grad()  # evaluation only, no gradients needed
def llama_eval(model, testenc, dev):
    print('Evaluating ...')

    testenc = testenc.input_ids
    nsamples = testenc.numel() // model.seqlen

    use_cache = model.config.use_cache
    model.config.use_cache = False
    layers = model.model.layers

    model.model.embed_tokens = model.model.embed_tokens.to(dev)
    layers[0] = layers[0].to(dev)

    dtype = next(iter(model.parameters())).dtype
    inps = torch.zeros((nsamples, model.seqlen, model.config.hidden_size), dtype=dtype, device=dev)
    cache = {'i': 0, 'attention_mask': None}
    class Catcher(nn.Module):

        def __init__(self, module):
            super().__init__()
            self.module = module

        def forward(self, inp, **kwargs):
            inps[cache['i']] = inp
            cache['i'] += 1
            cache['attention_mask'] = kwargs['attention_mask']
            cache['position_ids'] = kwargs['position_ids']
            raise ValueError

    layers[0] = Catcher(layers[0])
    for i in range(nsamples):
        batch = testenc[:, (i * model.seqlen):((i + 1) * model.seqlen)].to(dev)
        try:
            model(batch)
        except ValueError:
            pass
    layers[0] = layers[0].module

    layers[0] = layers[0].cpu()
    model.model.embed_tokens = model.model.embed_tokens.cpu()
    torch.cuda.empty_cache()

    outs = torch.zeros_like(inps)
    attention_mask = cache['attention_mask']
    position_ids = cache['position_ids']
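
    # Push the captured hidden states through every decoder layer; with --nearest,
    # first replace each weight matrix by its round-to-nearest quantization.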
    for i in range(len(layers)):
        print(i)
        layer = layers[i].to(dev)

        if args.nearest:
            subset = find_layers(layer)
            for name in subset:
                quantizer = quant.Quantizer()
                quantizer.configure(args.wbits, perchannel=True, sym=args.sym, mse=False)
                W = subset[name].weight.data
                quantizer.find_params(W, weight=True)
                subset[name].weight.data = quantizer.quantize(W).to(next(iter(layer.parameters())).dtype)

        for j in range(nsamples):
            outs[j] = layer(inps[j].unsqueeze(0), attention_mask=attention_mask, position_ids=position_ids)[0]
        layers[i] = layer.cpu()
        del layer
        torch.cuda.empty_cache()
        inps, outs = outs, inps
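
    # Apply the final norm and LM head, then compute perplexity over the test set.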
    if model.model.norm is not None:
        model.model.norm = model.model.norm.to(dev)
    model.lm_head = model.lm_head.to(dev)

    testenc = testenc.to(dev)
    nlls = []
    for i in range(nsamples):
        hidden_states = inps[i].unsqueeze(0)
        if model.model.norm is not None:
            hidden_states = model.model.norm(hidden_states)
        lm_logits = model.lm_head(hidden_states)
        shift_logits = lm_logits[:, :-1, :].contiguous()
        shift_labels = testenc[:, (i * model.seqlen):((i + 1) * model.seqlen)][:, 1:]
        loss_fct = nn.CrossEntropyLoss()
        loss = loss_fct(shift_logits.view(-1, shift_logits.size(-1)), shift_labels.view(-1))
        neg_log_likelihood = loss.float() * model.seqlen
        nlls.append(neg_log_likelihood)
    ppl = torch.exp(torch.stack(nlls).sum() / (nsamples * model.seqlen))
    print(ppl.item())

    model.config.use_cache = use_cache
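
# Replace the calibrated Linear modules with packed QuantLinear modules so the
# checkpoint can be saved in the compressed format.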
# TODO: perform packing on GPU
def llama_pack(model, quantizers, wbits, groupsize):
    layers = find_layers(model)
    layers = {n: layers[n] for n in quantizers}
    quant.make_quant_linear(model, quantizers, wbits, groupsize)
    qlayers = find_layers(model, [quant.QuantLinear])
    print('Packing ...')
    for name in qlayers:
        print(name)
        quantizers[name], scale, zero, g_idx, _, _ = quantizers[name]
        qlayers[name].pack(layers[name], scale, zero, g_idx)
    print('Done.')
    return model
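
# Rebuild a LLaMA model with QuantLinear layers and load a previously saved
# quantized checkpoint (either `.pt` or `.safetensors`).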
def load_quant(model, checkpoint, wbits, groupsize=-1, fused_mlp=True, eval=True, warmup_autotune=True):
    from transformers import LlamaConfig, LlamaForCausalLM, modeling_utils
    config = LlamaConfig.from_pretrained(model)

    def noop(*args, **kwargs):
        pass

    # Skip random weight initialization; the weights come from the checkpoint.
    torch.nn.init.kaiming_uniform_ = noop
    torch.nn.init.uniform_ = noop
    torch.nn.init.normal_ = noop

    torch.set_default_dtype(torch.half)
    modeling_utils._init_weights = False
    model = LlamaForCausalLM(config)
    torch.set_default_dtype(torch.float)
    if eval:
        model = model.eval()

    # Swap every Linear layer (except the LM head) for a QuantLinear of the
    # requested bit width before loading the packed weights.
    layers = find_layers(model)
    for name in ['lm_head']:
        if name in layers:
            del layers[name]
    quant.make_quant_linear(model, layers, wbits, groupsize)
    del layers

    print('Loading model ...')
    if checkpoint.endswith('.safetensors'):
        from safetensors.torch import load_file as safe_load
        model.load_state_dict(safe_load(checkpoint))
    else:
        model.load_state_dict(torch.load(checkpoint))

    quant.make_quant_attn(model)
    if eval and fused_mlp:
        quant.make_fused_mlp(model)

    if warmup_autotune:
        quant.autotune_warmup_linear(model, transpose=not (eval))
        if eval and fused_mlp:
            quant.autotune_warmup_fused(model)

    model.seqlen = 2048
    print('Done.')

    return model

if __name__ == '__main__':

    parser = argparse.ArgumentParser()

    parser.add_argument('model', type=str, help='llama model to load')
    parser.add_argument('dataset', type=str, choices=['wikitext2', 'ptb', 'c4'], help='Where to extract calibration data from.')
    parser.add_argument('--seed', type=int, default=0, help='Seed for sampling the calibration data.')
    parser.add_argument('--nsamples', type=int, default=128, help='Number of calibration data samples.')
    parser.add_argument('--percdamp', type=float, default=.01, help='Percent of the average Hessian diagonal to use for dampening.')
    parser.add_argument('--nearest', action='store_true', help='Whether to run the round-to-nearest baseline.')
    parser.add_argument('--wbits', type=int, default=16, choices=[2, 3, 4, 8, 16], help='#bits to use for quantization; use 16 for evaluating base model.')
    parser.add_argument('--groupsize', type=int, default=-1, help='Groupsize to use for quantization; default uses full row.')
    parser.add_argument('--sym', action='store_true', help='Whether to perform symmetric quantization.')
    parser.add_argument('--eval', action='store_true', help='Evaluate the quantized model.')
    parser.add_argument('--new-eval', action='store_true', help='Whether to use the new PTB and C4 eval.')
    parser.add_argument('--save', type=str, default='', help='Save quantized checkpoint under this name.')
    parser.add_argument('--save_safetensors', type=str, default='', help='Save quantized `.safetensors` checkpoint under this name.')
    parser.add_argument('--quant-directory', type=str, default=None, help='Directory to export quantization parameters to in TOML format. `None` means no export (default).')
    parser.add_argument('--act-order', action='store_true', help='Whether to apply the activation order GPTQ heuristic')
    parser.add_argument('--true-sequential', action='store_true', help='Whether to run in true sequential mode.')

    args = parser.parse_args()
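
    # Load the fp16 model, quantize if requested, optionally evaluate, then save.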
    DEV = torch.device('cuda:0')
    gpu_dist = []

    model = get_llama(args.model)
    model.eval()

    dataloader, testloader = get_loaders(args.dataset, nsamples=args.nsamples, seed=args.seed, model=args.model, seqlen=model.seqlen)

    if args.wbits < 16:
        tick = time.time()
        quantizers = llama_sequential(model, dataloader, DEV)
        print(time.time() - tick)

    if args.eval:
        datasets = ['wikitext2', 'ptb', 'c4']
        if args.new_eval:
            datasets = ['wikitext2', 'ptb-new', 'c4-new']
        for dataset in datasets:
            dataloader, testloader = get_loaders(dataset, seed=args.seed, model=args.model, seqlen=model.seqlen)
            print(dataset)
            llama_eval(model, testloader, DEV)

    if args.save:
        llama_pack(model, quantizers, args.wbits, args.groupsize)
        torch.save(model.state_dict(), args.save)
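
    # Minimal handling for --save_safetensors (otherwise unused above), assuming the
    # same pack-then-save flow as --save; safetensors requires contiguous, unshared
    # tensors, hence the clone().contiguous().
    if args.save_safetensors:
        llama_pack(model, quantizers, args.wbits, args.groupsize)
        from safetensors.torch import save_file as safe_save
        state_dict = model.state_dict()
        state_dict = {k: v.clone().contiguous() for k, v in state_dict.items()}
        safe_save(state_dict, args.save_safetensors)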

# bash : CUDA_VISIBLE_DEVICES=0 proxychains python quant_llama.py ../model/llama7b_hf wikitext2 --wbits 4 --groupsize 128 --save llama7b-4bit-128g.pt