Initial commit of the model files.

Browse files

Files changed (8) hide show

.gitattributes +1 -0
README.md +159 -3
config.json +34 -0
generation_config.json +7 -0
model.safetensors +3 -0
special_tokens_map.json +23 -0
tokenizer.json +0 -0
tokenizer_config.json +43 -0

.gitattributes CHANGED Viewed

@@ -33,3 +33,4 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
 *.zip filter=lfs diff=lfs merge=lfs -text
 *.zst filter=lfs diff=lfs merge=lfs -text
 *tfevents* filter=lfs diff=lfs merge=lfs -text

 *.zip filter=lfs diff=lfs merge=lfs -text
 *.zst filter=lfs diff=lfs merge=lfs -text
 *tfevents* filter=lfs diff=lfs merge=lfs -text
+model.safetensors filter=lfs diff=lfs merge=lfs -text

README.md CHANGED Viewed

@@ -1,3 +1,159 @@
----
-license: apache-2.0
----

+---
+license: apache-2.0
+inference: false
+---
+# MegaBeam-Mistral-7B-300k-AWQ Model
+MegaBeam-Mistral-7B-300k-AWQ is a version of the [MegaBeam-Mistral-7B-300k](https://huggingface.co/amazon/MegaBeam-Mistral-7B-300k) model that was
+quantized using the AWQ method developed by [Lin et al. (2023)](https://arxiv.org/abs/2306.00978).
+The MegaBeam-Mistral-7B-300k-AWQ models are approximately **70% smaller** than the original MegaBeam-Mistral-7B-300k model whilst maintaining comparable performance.
+Please refer to the [original MegaBeam-Mistral-7B-300k model card](https://huggingface.co/amazon/MegaBeam-Mistral-7B-300k) for details about the model
+preparation and training processes.
+## MegaBeam-Mistral-7B-300k-AWQ Variants
+| Branch | Approx. Model Size | `q_group_size` | `w_bit` | `version` |
+|--------|---:|---------------:|--------:|-----------|
+| [main](https://huggingface.co/aws-prototyping/MegaBeam-Mistral-7B-300k-AWQ/tree/main) | 3.9 GB | 128 | 4 | GEMM |
+| [MegaBeam-Mistral-7B-300k-AWQ-64g-4b-GEMM](https://huggingface.co/aws-prototyping/MegaBeam-Mistral-7B-300k-AWQ/tree/MegaBeam-Mistral-7B-300k-AWQ-64g-4b-GEMM) | 4.0 GB | 64 | 4 | GEMM |
+| [MegaBeam-Mistral-7B-300k-AWQ-32g-4b-GEMM](https://huggingface.co/aws-prototyping/MegaBeam-Mistral-7B-300k-AWQ/tree/MegaBeam-Mistral-7B-300k-AWQ-32g-4b-GEMM) | 4.3 GB | 32 | 4 | GEMM |
+## Dependencies
+- [`autoawq==0.2.5`](https://pypi.org/project/autoawq/0.2.5/) – [AutoAWQ](https://github.com/casper-hansen/AutoAWQ) was used to quantize the MegaBeam-Mistral-7B-300k model.
+- [`vllm==0.4.2`](https://pypi.org/project/vllm/0.4.2/) – [vLLM](https://github.com/vllm-project/vllm) was used to host models for benchmarking.
+## Evaluations
+### InfiniteBench
+This benchmark was developed by [Zhang et al. (2024)](https://arxiv.org/abs/2402.13718) and is available from https://github.com/OpenBMB/InfiniteBench.
+See the [original MegaBeam-Mistral-7B-300k model card](https://huggingface.co/amazon/MegaBeam-Mistral-7B-300k)
+for more details.
+| Task Name        | MegaBeam-Mistral-7B-300k-AWQ | MegaBeam-Mistral-7B-300k | Mistral-7B-Instruct-v0.2 | Llama-3-8B-Instruct-262k | Llama3-70B-1M | GPT-4-1106-preview | YaRN-Mistral-7B | Kimi-Chat | Claude 2 | Yi-6B-200K | Yi-34B-200K | Chatglm3-6B-128K |
+|------------------|------------------------------|--------------------------|--------------------------|--------------------------|---------------|--------------------|-----------------|-----------|----------|------------|-------------|------------------|
+| Retrieve.PassKey | 100%                         | 100%                     | 75.76%                   | 98.30%                   | 81.35%        | 100%               | 92.71%          | 98.14%    | 97.80%   | 100.00%    | 100.00%     | 92.20%           |
+| Retrieve.Number  | 92.7%                        | 96.10%                   | 25.25%                   | 97.79%                   | 97.62%        | 100%               | 56.61%          | 95.42%    | 98.14%   | 94.92%     | 100.00%     | 80.68%           |
+| Retrieve.KV      | 0%                           | 0%                       | 0%                       | 3.40%                    | 3%            | 89.00%             | < 5%            | 53.60%    | 65.40%   | < 5%       | < 5%        | < 5%             |
+| En.Sum           | 29.05%                       | 29.39%                   | 22.13%                   | 16.40%                   | 20.72%        | 14.73%             | 9.09%           | 17.93%    | 14.45%   | < 5%       | < 5%        | < 5%             |
+| En.QA            | 15.69%                       | 14.93%                   | 4.93%                    | 13.20%                   | 16.52%        | 22.22%             | 9.55%           | 16.52%    | 11.97%   | 9.20%      | 12.17%      | < 5%             |
+| En.MC            | 48.91%                       | 51.52%                   | 7.80%                    | 50.65%                   | 62%           | 67.25%             | 27.95%          | 72.49%    | 62.88%   | 36.68%     | 38.43%      | 10.48%           |
+| En.Dia           | 11.50%                       | 9.50%                    | 3.50%                    | 1%                       | 12.50%        | 8.50%              | 7.50%           | 11.50%    | 46.50%   | < 5%       | < 5%        | < 5%             |
+| Zh.QA            | 10.53%                       | 10.71%                   | 3.43%                    | 19.02%                   | 26%           | 25.96%             | 14.43%          | 17.93%    | 9.64%    | 15.07%     | 13.61%      | < 5%             |
+| Code.Debug       | 21.83%                       | 27.41%                   | 11.60%                   | 22.08%                   | 23.85%        | 39.59%             | < 5%            | 18.02%    | < 5%     | < 5%       | < 5%        | < 5%             |
+| Code.Run         | 1.25%                        | 1.75%                    | 0.25%                    | 0%                       | 0%            | 23.25%             | < 5%            | < 5%      | < 5%     | < 5%       | < 5%        | < 5%             |
+| Math.Calc        | 0%                           | 0%                       | 0%                       | 0%                       | 0%            | < 5%               | < 5%            | < 5%      | < 5%     | < 5%       | < 5%        | < 5%             |
+| Math.Find        | 20.57%                       | 24.28%                   | 26.28%                   | 15.40%                   | 30%           | 60.00%             | 17.14%          | 12.57%    | 32.29%   | < 5%       | 25.71%      | 7.71%            |
+| **Average**      | 29.34%                       | 30.70%                   | 15.08%                   | 28.10%                   | 31.13%        | 46.08%             | 20.41%          | 34.93%    | 37.21%   | 22.78%     | 25.41%      | 17.59%           |
+### Long Context
+The following benchmark results are shown as _accuracy_ (%) values, unless stated otherwise.
+#### Topic Retrieval
+See https://lmsys.org/blog/2023-06-29-longchat/
+| Model Name                                         |   n_topics=05 |   n_topics=10 |   n_topics=15 |   n_topics=20 |   n_topics=25 |
+|:---------------------------------------------------|--------------:|--------------:|--------------:|--------------:|--------------:|
+| _n_tokens_ (approx.) =         | _3048_ | _5966_ | _8903_ | _11832_ | _14757_ |
+| MegaBeam-Mistral-7B-300k                                        |           100 |           100 |           100 |           100 |            100 |
+| **MegaBeam-Mistral-7B-300k-AWQ**           |          **100** |          **100** |          **100**|          **100** |           **100** |
+| **MegaBeam-Mistral-7B-300k-AWQ-64g-4b-GEMM**            |          **100** |          **100** |          **100**|          **100** |           **98** |
+| **MegaBeam-Mistral-7B-300k-AWQ-32g-4b-GEMM**            |          **100** |          **100** |          **100**|          **100** |           **98** |
+#### [Line Retrieval](https://lmsys.org/blog/2023-06-29-longchat/#longeval-results)
+See https://lmsys.org/blog/2023-06-29-longchat/#longeval-results
+| Model Name                                         |   n_lines=200 |   n_lines=300 |   n_lines=400 |   n_lines=500 |   n_lines=600 |   n_lines=680 |
+|:----------|-------------:|-------------:|------------:|-----------:|-----------:|-----------:|
+| _n_tokens_ (approx.) =         | _4317_ | _6415_ | _8510_ | _10610_ | _12698_ | _14373_ |
+| MegaBeam-Mistral-7B-300k                                        |           98 |            98 |            92 |            98 |            90 |            90 |
+| **MegaBeam-Mistral-7B-300k-AWQ**           |           **96**|           **94**|           **88** |           **80** |           **70**|           **62** |
+| **MegaBeam-Mistral-7B-300k-AWQ-64g-4b-GEMM**            |           **100**|           **98**|           **96** |           **96** |           **90**|           **94** |
+| **MegaBeam-Mistral-7B-300k-AWQ-32g-4b-GEMM**            |           **98**|           **98**|           **82** |           **96** |           **92**|           **90** |
+#### Pass Key Retrieval
+See https://github.com/epfml/landmark-attention/blob/main/llama/run_test.py#L101
+| Model Name                               |   n_garbage=12000 |   n_garbage=20000 |   n_garbage=31000 |   n_garbage=38000 |   n_garbage=45000 | n_garbage=60000 |
+|:----------|-------------:|-------------:|------------:|-----------:|-----------:|-----------:|
+| _n_tokens_ (approx.) =         | _3272_ | _5405_ | _8338_ | _10205_ | _12071_ | _16072_ |
+| MegaBeam-Mistral-7B-300k                              |               100 |               100 |               100 |               100 |               100 | 100|
+| **MegaBeam-Mistral-7B-300k-AWQ** |              **100** |             **100**|              **100**|              **100** |              **100**| **100**|
+| **MegaBeam-Mistral-7B-300k-AWQ-64g-4b-GEMM**  |              **100** |             **100**|              **100**|              **100** |              **100**| **100**|
+| **MegaBeam-Mistral-7B-300k-AWQ-32g-4b-GEMM**  |              **100** |             **100**|              **100**|              **100** |              **100**| **100**|
+#### QuALITY (Question Answering with Long Input Texts, Yes!)
+See https://nyu-mll.github.io/quality/
+|Model Name| Test set Accuracy | Hard subset Accuracy|
+|:----------|-------------:|-------------:|
+| MegaBeam-Mistral-7B-300k                              |   53.2 |       72 |
+| **MegaBeam-Mistral-7B-300k-AWQ** |  **51.3** |      **71.3** |
+| **MegaBeam-Mistral-7B-300k-AWQ-64g-4b-GEMM**  |  **52.4** |      **72.1** |
+| **MegaBeam-Mistral-7B-300k-AWQ-32g-4b-GEMM**  |  **53.1** |      **71.3** |
+## Usage
+## Inference via vLLM HTTP Host
+### Launch Host
+```bash
+python -m vllm.entrypoints.openai.api_server \
+    --model aws-prototyping/MegaBeam-Mistral-7B-300k-AWQ \
+    --quantization awq
+```
+### Query Host
+```bash
+curl -X POST http://localhost:8000/v1/completions \
+    -H "Content-Type: application/json" \
+    -d '{ "model": "aws-prototyping/MegaBeam-Mistral-7B-300k-AWQ",
+          "prompt": "[INST] What are the main challenges to support a long context for LLM? [/INST]",
+          "temperature": 0,
+          "echo": false
+    }'
+```
+## Inference via [vLLM Offline Inference](https://docs.vllm.ai/en/latest/getting_started/examples/offline_inference.html)
+```python
+from vllm import LLM, SamplingParams
+prompts = [
+   "[INST] What are the main challenges to support a long context for LLM? [/INST]",
+]
+sampling_params = SamplingParams(temperature=0, max_tokens=100)
+llm = LLM(model="aws-prototyping/MegaBeam-Mistral-7B-300k-AWQ")
+outputs = llm.generate(prompts, sampling_params)
+# Print the outputs.
+for output in outputs:
+    prompt = output.prompt
+    generated_text = output.outputs[0].text
+    print(f"Prompt: {prompt!r}, Generated text: {generated_text!r}")
+```
+## License
+Apache 2.0
+## Limitations
+Before using the MegaBeam-Mistral-7B-300k-AWQ model, it is important to perform your own
+independent assessment and take measures to ensure that your use complies
+with your own specific quality control practices and standards, and with
+the local rules, laws, regulations, licenses and terms
+that apply to you and your content.

config.json ADDED Viewed

	@@ -0,0 +1,34 @@

+{
+  "_name_or_path": "aws-prototyping/MegaBeam-Mistral-7B-300k-AWQ",
+  "architectures": [
+    "MistralForCausalLM"
+  ],
+  "attention_dropout": 0.0,
+  "bos_token_id": 1,
+  "eos_token_id": 2,
+  "hidden_act": "silu",
+  "hidden_size": 4096,
+  "initializer_range": 0.02,
+  "intermediate_size": 14336,
+  "max_position_embeddings": 288800,
+  "model_type": "mistral",
+  "num_attention_heads": 32,
+  "num_hidden_layers": 32,
+  "num_key_value_heads": 8,
+  "quantization_config": {
+    "bits": 4,
+    "group_size": 128,
+    "modules_to_not_convert": null,
+    "quant_method": "awq",
+    "version": "gemm",
+    "zero_point": true
+  },
+  "rms_norm_eps": 1e-05,
+  "rope_theta": 25000000.0,
+  "sliding_window": null,
+  "tie_word_embeddings": false,
+  "torch_dtype": "float16",
+  "transformers_version": "4.41.2",
+  "use_cache": true,
+  "vocab_size": 32000
+}

generation_config.json ADDED Viewed

	@@ -0,0 +1,7 @@

+{
+  "_from_model_config": true,
+  "bos_token_id": 1,
+  "do_sample": true,
+  "eos_token_id": 2,
+  "transformers_version": "4.41.2"
+}

model.safetensors ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:a2b626ba3dcb50bea165b4925f9bb7e3e4b7c2ef8a0deb17ed2e04790fd36f9f
+size 4150880232

special_tokens_map.json ADDED Viewed

	@@ -0,0 +1,23 @@

+{
+  "bos_token": {
+    "content": "<s>",
+    "lstrip": false,
+    "normalized": false,
+    "rstrip": false,
+    "single_word": false
+  },
+  "eos_token": {
+    "content": "</s>",
+    "lstrip": false,
+    "normalized": false,
+    "rstrip": false,
+    "single_word": false
+  },
+  "unk_token": {
+    "content": "<unk>",
+    "lstrip": false,
+    "normalized": false,
+    "rstrip": false,
+    "single_word": false
+  }
+}

tokenizer.json ADDED Viewed

The diff for this file is too large to render. See raw diff

tokenizer_config.json ADDED Viewed

	@@ -0,0 +1,43 @@

+{
+  "add_bos_token": true,
+  "add_eos_token": false,
+  "added_tokens_decoder": {
+    "0": {
+      "content": "<unk>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "1": {
+      "content": "<s>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "2": {
+      "content": "</s>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    }
+  },
+  "additional_special_tokens": [],
+  "bos_token": "<s>",
+  "chat_template": "{{ bos_token }}{% for message in messages %}{% if (message['role'] == 'user') != (loop.index0 % 2 == 0) %}{{ raise_exception('Conversation roles must alternate user/assistant/user/assistant/...') }}{% endif %}{% if message['role'] == 'user' %}{{ '[INST] ' + message['content'] + ' [/INST]' }}{% elif message['role'] == 'assistant' %}{{ message['content'] + eos_token}}{% else %}{{ raise_exception('Only user and assistant roles are supported!') }}{% endif %}{% endfor %}",
+  "clean_up_tokenization_spaces": false,
+  "eos_token": "</s>",
+  "legacy": true,
+  "model_max_length": 1000000000000000019884624838656,
+  "pad_token": null,
+  "sp_model_kwargs": {},
+  "spaces_between_special_tokens": false,
+  "tokenizer_class": "LlamaTokenizer",
+  "unk_token": "<unk>",
+  "use_default_system_prompt": false
+}