# Source-viewer metadata (not part of the recipe): file size 1,611 bytes; revisions c5ee7b9, ce24af6.
# Compression recipe with a single stage (`test_stage`). Its `obcq_modifiers`
# group configures three modifiers: LogarithmicEqualizationModifier,
# QuantizationModifier and SparseGPTModifier.
#
# Strings prefixed with `re:` look like regular expressions matched against
# module names -- NOTE(review): confirm against the consuming library's docs.
test_stage:
  obcq_modifiers:
    LogarithmicEqualizationModifier:
      # Each entry is a pair: [list of layer patterns, one layer-norm pattern].
      # Presumably the listed projections are the layers fed by that norm
      # layer -- TODO confirm against the modifier's documentation.
      mappings:
        - [["re:.*q_proj", "re:.*k_proj", "re:.*v_proj"], "re:.*input_layernorm"]
        - [["re:.*gate_proj", "re:.*up_proj"], "re:.*post_attention_layernorm"]

    QuantizationModifier:
      # Entries listed here are excluded from quantization. The list mixes
      # class-like names with fully qualified layer paths.
      ignore:
        # These operations don't make sense to quantize
        - LlamaRotaryEmbedding
        - LlamaRMSNorm
        - SiLUActivation
        - MatMulOutput_QK
        - MatMulOutput_PV
        # Skip quantizing the layers with the most sensitive activations
        - model.layers.21.mlp.down_proj
        - model.layers.7.mlp.down_proj
        - model.layers.2.mlp.down_proj
        - model.layers.8.self_attn.q_proj
        - model.layers.8.self_attn.k_proj
      post_oneshot_calibration: true
      # Per-type quantization settings, keyed by the name they apply to.
      scheme_overrides:
        # Enable channelwise quantization for better accuracy
        Linear:
          weights:
            num_bits: 8
            symmetric: true
            strategy: channel
        MatMulLeftInput_QK:
          input_activations:
            num_bits: 8
            symmetric: true
        MatMulLeftInput_PV:
          input_activations:
            num_bits: 8
            symmetric: true
        # For the embeddings, only weight-quantization makes sense
        Embedding:
          # Explicit null: no activation-quantization settings for embeddings.
          input_activations: null
          weights:
            num_bits: 8
            symmetric: false

    SparseGPTModifier:
      sparsity: 0.5
      block_size: 128
      sequential_update: true
      quantize: true
      percdamp: 0.01
      # Kept quoted: a bare 0:0 can be read as a sexagesimal integer by
      # YAML 1.1 loaders. Presumably "0:0" selects an unstructured mask --
      # TODO confirm.
      mask_structure: "0:0"
      # Double-quoted, so `\\` is a YAML escape for one backslash; the loaded
      # value is `re:model.layers.\d*$`.
      targets: ["re:model.layers.\\d*$"]