stan-hua
/

Llama-3.1-8B-Instruct-LC-SmoothQuant-GPTQ-W4A16

Text Generation

text-generation-inference

Inference Endpoints

compressed-tensors

Model card · Files and versions · Community

stan-hua committed 2 days ago

Commit

c97a362

•

1 Parent(s): 44b4b40

Push folder to HuggingFace Hub

Files changed (2) hide show

config.json +40 -1
recipe.yaml +8 -0

config.json CHANGED Viewed

@@ -23,6 +23,45 @@
   "num_hidden_layers": 32,
   "num_key_value_heads": 8,
   "pretraining_tp": 1,
   "rms_norm_eps": 1e-05,
   "rope_scaling": {
     "factor": 8.0,
@@ -37,4 +76,4 @@
   "transformers_version": "4.45.2",
   "use_cache": true,
   "vocab_size": 128256
-}

   "num_hidden_layers": 32,
   "num_key_value_heads": 8,
   "pretraining_tp": 1,
+  "quantization_config": {
+    "config_groups": {
+      "group_0": {
+        "input_activations": null,
+        "output_activations": null,
+        "targets": [
+          "Linear"
+        ],
+        "weights": {
+          "actorder": null,
+          "block_structure": null,
+          "dynamic": false,
+          "group_size": 128,
+          "num_bits": 4,
+          "observer": "minmax",
+          "observer_kwargs": {},
+          "strategy": "group",
+          "symmetric": true,
+          "type": "int"
+        }
+      }
+    },
+    "format": "pack-quantized",
+    "global_compression_ratio": 1.8917232374233346,
+    "ignore": [
+      "lm_head"
+    ],
+    "kv_cache_scheme": null,
+    "quant_method": "compressed-tensors",
+    "quantization_status": "compressed",
+    "sparsity_config": {
+      "format": "dense",
+      "global_sparsity": 0.14809091252120618,
+      "ignore": null,
+      "registry_requires_subclass": false,
+      "sparsity_structure": "unstructured",
+      "targets": null
+    }
+  },
   "rms_norm_eps": 1e-05,
   "rope_scaling": {
     "factor": 8.0,
   "transformers_version": "4.45.2",
   "use_cache": true,
   "vocab_size": 128256
+}

recipe.yaml ADDED Viewed

	@@ -0,0 +1,8 @@

+DEFAULT_stage:
+  DEFAULT_modifiers:
+    SmoothQuantModifier: {smoothing_strength: 0.8}
+    GPTQModifier:
+      targets: Linear
+      dampening_frac: 0.01
+      ignore: [lm_head]
+      scheme: W4A16