Commit 70b93f5
Parent(s): 8cf2f51

Updating with the correct notebooks

Files changed:
- 01_1_TGI-launcher.ipynb (+12, -25)
- 01_2_TGI-benchmark.ipynb (+22, -90)
01_1_TGI-launcher.ipynb CHANGED

@@ -7,7 +7,7 @@
      "source": [
       "Here we can see the different settings for TGI. Be sure to read through them and decide which settings are most important for your use-case.\n",
       "\n",
-      "Here are some of the most important ones
+      "Here are some of the most important ones:\n",
       "- `--model-id`\n",
       "- `--quantize` Most of the time you want to quantize to save memory\n",
       "- `--max-input-tokens` We will use a high number for RAG and a low number for Chat\n",
@@ -15,7 +15,7 @@
       "- `--max-batch-size` \n",
       "- `--max-batch-total-tokens`\n",
       "\n",
-      "These are
+      "These are changed because we are on spaces, and dont want to conflict with the spaces server:\n",
       "- `--hostname`\n",
       "- `--port`"
      ]
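The settings documented in this cell are only a subset; the launcher can print the full list itself. A minimal sketch for dumping it from a notebook cell, assuming the text-generation-launcher binary is installed and on the PATH:

```python
# Print every flag text-generation-launcher accepts, with its documentation.
# Assumes the TGI launcher binary is installed and on the PATH.
import subprocess

result = subprocess.run(
    ["text-generation-launcher", "--help"],
    capture_output=True,
    text=True,
)
print(result.stdout)
```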
@@ -143,43 +143,30 @@
      "id": "598435df-0cdb-411b-96c2-3930fc222f8e",
      "metadata": {},
      "source": [
-      "We can launch directly from the notebook since we dont need the command to be interactive
+      "We can launch directly from the notebook since we dont need the command to be interactive.\n",
+      "\n",
+      "We will just be using defaults in this notebook as the intent is to understand the benchmark tool."
      ]
     },
     {
      "cell_type": "code",
-     "execution_count":
-     "id": "
-     "metadata": {
-
-     },
-     "outputs": [
-      {
-       "name": "stdout",
-       "output_type": "stream",
-       "text": [
-        "zsh:1: command not found: text-generation-launcher\n"
-       ]
-      }
-     ],
+     "execution_count": null,
+     "id": "de3b9694-d417-4d17-a28e-e481703344ec",
+     "metadata": {},
+     "outputs": [],
      "source": [
       "!RUST_BACKTRACE=1 \\\n",
       "text-generation-launcher \\\n",
       "--model-id astronomer/Llama-3-8B-Instruct-GPTQ-8-Bit \\\n",
       "--quantize gptq \\\n",
-      "--max-input-tokens 3000 \\\n",
-      "--max-total-tokens 3300 \\\n",
-      "--max-batch-size 5 \\\n",
-      "--max-total-tokens 100000 \\\n",
       "--hostname 0.0.0.0 \\\n",
-      "--port 1337 # We need to change it from the default to play well with spaces \n"
-      "\n"
+      "--port 1337 # We need to change it from the default to play well with spaces \n"
      ]
     },
     {
      "cell_type": "code",
      "execution_count": null,
-     "id": "
+     "id": "9de093ba-b39a-4881-ab09-106de58dcd16",
      "metadata": {},
      "outputs": [],
      "source": []
@@ -201,7 +188,7 @@
      "name": "python",
      "nbconvert_exporter": "python",
      "pygments_lexer": "ipython3",
-     "version": "3.
+     "version": "3.9.5"
     }
    },
    "nbformat": 4,
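With the launch cell above no longer interactive, it is easy to move on before the model has finished loading. A minimal readiness probe, assuming the server from the command above is listening on port 1337 and that the installed TGI version serves its usual `/health` route:

```python
# Poll the TGI server launched above until it reports healthy.
# Assumes --hostname 0.0.0.0 and --port 1337, as in the diff above.
import time

import requests

def wait_for_tgi(url="http://localhost:1337/health", timeout=300):
    deadline = time.time() + timeout
    while time.time() < deadline:
        try:
            if requests.get(url, timeout=5).status_code == 200:
                return True
        except requests.exceptions.ConnectionError:
            pass  # server process is still up but loading weights
        time.sleep(5)
    return False

print("TGI ready:", wait_for_tgi())
```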
01_2_TGI-benchmark.ipynb CHANGED

@@ -6,10 +6,7 @@
      "metadata": {},
      "source": [
       "# Introduction\n",
-      "This notebook is to show how to launch the TGI Benchmark tool. \n"
-      "\n",
-      "## Warning\n",
-      "Please note that the TGI Benchmark tool is designed to work in a terminal, not a jupyter notebook. This means you will need to copy/paste the command in a jupyter terminal tab. I am putting them here for convenience."
+      "This notebook is to show how to launch the TGI Benchmark tool. \n"
      ]
     },
     {
@@ -19,13 +16,17 @@
      "source": [
       "Here we can see the different settings for TGI Benchmark. \n",
       "\n",
-      "Here are some of the
+      "Here are some of the more important ones:\n",
       "\n",
       "- `--tokenizer-name` This is required so the tool knows what tokenizer to use\n",
       "- `--batch-size` This is important for load testing. We should use more and more values to see what happens to throughput and latency\n",
       "- `--sequence-length` AKA input tokens, it is important to match your use-case needs\n",
       "- `--decode-length` AKA output tokens, it is important to match your use-case needs\n",
-      "- `--runs`
+      "- `--runs` 10 is the default\n",
+      "\n",
+      "<blockquote style=\"border-left: 5px solid #80CBC4; background: #263238; color: #CFD8DC; padding: 0.5em 1em; margin: 1em 0;\">\n",
+      " <strong>💡 Tip:</strong> Use a low number for <code style=\"background: #37474F; color: #FFFFFF; padding: 2px 4px; border-radius: 4px;\">--runs</code> when you are exploring but a higher number as you finalize to get more precise statistics\n",
+      "</blockquote>\n"
      ]
     },
     {
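Since `--sequence-length` and `--decode-length` should match the use case, it is better to measure a representative prompt than to guess. A minimal sketch using the same tokenizer the benchmark targets, assuming the repo ships its tokenizer files; the prompt string is a hypothetical stand-in for real traffic:

```python
# Count tokens in a representative prompt to pick --sequence-length.
# The prompt below is a hypothetical placeholder; sample real traffic instead.
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained(
    "astronomer/Llama-3-8B-Instruct-GPTQ-8-Bit"
)
prompt = "Summarize the following support ticket: ..."
n_input_tokens = len(tokenizer(prompt)["input_ids"])
print(f"A matching setting would be roughly --sequence-length {n_input_tokens}")
```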
@@ -96,103 +97,34 @@
      "metadata": {},
      "source": [
       "Here is an example command. Notice that I add the batch sizes of interest repeatedly to make sure all of them are used by the benchmark tool.\n",
-      "```bash\n",
       "\n",
+      "<blockquote style=\"border-left: 5px solid #FFAB91; background: #37474F; color: #FFCCBC; padding: 0.5em 1em; margin: 1em 0;\">\n",
+      " <strong>⚠️ Warning:</strong> Please note that the TGI Benchmark tool is designed to work in a terminal, not a jupyter notebook. This means you will need to copy/paste the command in a jupyter terminal tab. I am putting them here for convenience.\n",
+      "</blockquote>\n",
+      "\n",
+      "```bash\n",
       "text-generation-benchmark \\\n",
       "--tokenizer-name astronomer/Llama-3-8B-Instruct-GPTQ-8-Bit \\\n",
-      "--sequence-length
-      "--decode-length
+      "--sequence-length 70 \\\n",
+      "--decode-length 50 \\\n",
       "--batch-size 1 \\\n",
       "--batch-size 2 \\\n",
-      "--batch-size 3 \\\n",
       "--batch-size 4 \\\n",
-      "--batch-size
+      "--batch-size 8 \\\n",
+      "--batch-size 16 \\\n",
+      "--batch-size 32 \\\n",
+      "--batch-size 64 \\\n",
+      "--batch-size 128 \n",
       "```"
      ]
     },
     {
      "cell_type": "code",
      "execution_count": null,
-     "id": "
+     "id": "13ac475b-44e1-47e4-85ce-def2db6879c9",
      "metadata": {},
      "outputs": [],
-     "source": [
-      "import subprocess\n",
-      "import time\n",
-      "\n",
-      "def launch_server(tokens):\n",
-      "    try:\n",
-      "        command = [\n",
-      "            'text-generation-launcher',\n",
-      "            '--model-id', 'astronomer/Llama-3-8B-Instruct-GPTQ-8-Bit',\n",
-      "            '--quantize', 'gptq',\n",
-      "            '--max-input-tokens', '3000',\n",
-      "            '--max-batch-size', '5',\n",
-      "            '--max-batch-total-tokens', f'{tokens}',\n",
-      "            '--hostname', '0.0.0.0',\n",
-      "            '--port', '1337',\n",
-      "        ]\n",
-      "\n",
-      "        # Launch the subprocess with a text command (without shell=True for safety)\n",
-      "        process = subprocess.Popen(command, stdout=subprocess.PIPE, stderr=subprocess.PIPE, text=True)\n",
-      "\n",
-      "        # Set a time limit to wait for the process to stabilize\n",
-      "        time_limit = 120\n",
-      "        start_time = time.time()\n",
-      "\n",
-      "        while time.time() - start_time < time_limit:\n",
-      "            output = process.stdout.readline()\n",
-      "            if \"Connected\" in output:\n",
-      "                print(f\"Success message found for {tokens} tokens.\")\n",
-      "                process.terminate()  # Gracefully terminate if successful\n",
-      "                return True\n",
-      "            if \"RuntimeError\" in output or \"OutOfMemory\" in output:\n",
-      "                print(f\"Failure message found for {tokens} tokens.\")\n",
-      "                process.terminate()\n",
-      "                return False\n",
-      "\n",
-      "        # If no specific message was found but the process is still running\n",
-      "        if process.poll() is None:\n",
-      "            print(f\"No specific message but process is still running for {tokens} tokens.\")\n",
-      "            process.terminate()\n",
-      "            return True\n",
-      "        else:\n",
-      "            return False\n",
-      "    except Exception as e:\n",
-      "        print(f\"Error launching server with {tokens} tokens: {e}\")\n",
-      "        return False"
-     ]
-    },
-    {
-     "cell_type": "code",
-     "execution_count": null,
-     "id": "6622e91a-dc01-4401-8036-f1bf3c85d655",
-     "metadata": {},
-     "outputs": [],
-     "source": [
-      "%%time\n",
-      "def find_max_prefill_tokens():\n",
-      "    low, high = 0, 1_000_000  # Adjust the upper bound as needed\n",
-      "    best_valid = 0\n",
-      "\n",
-      "    while low <= high:\n",
-      "        mid = (low + high) // 2\n",
-      "        print(f\"Testing with {mid} max-total-tokens...\")\n",
-      "\n",
-      "        if launch_server(mid):\n",
-      "            print(f\"Success with {mid} max-total-tokens.\")\n",
-      "            best_valid = mid  # Update best known valid count\n",
-      "            low = mid + 1\n",
-      "        else:\n",
-      "            print(f\"Failed with {mid} max-total-tokens.\")\n",
-      "            high = mid - 1\n",
-      "\n",
-      "    print(f\"Maximum manageable prefill tokens: {best_valid}\")\n",
-      "    return best_valid\n",
-      "\n",
-      "# Call the function\n",
-      "max_tokens = find_max_prefill_tokens()"
-     ]
+     "source": []
     }
    ],
    "metadata": {
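The cells removed above probed TGI launches while bisecting `--max-batch-total-tokens`. The search itself is only a few lines once the probe is factored out; a minimal sketch, where `launch_server(tokens) -> bool` is the probe from the removed cell (it starts TGI with the given token budget and returns True on a clean start, False on an OOM or runtime error):

```python
# Binary search for the largest --max-batch-total-tokens that still boots,
# assuming feasibility is monotonic in the token budget.
def find_max_batch_total_tokens(launch_server, low=0, high=1_000_000):
    best_valid = 0
    while low <= high:
        mid = (low + high) // 2
        if launch_server(mid):
            best_valid = mid   # mid fits in memory; remember it, search higher
            low = mid + 1
        else:
            high = mid - 1     # mid fails; search lower
    return best_valid
```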
@@ -211,7 +143,7 @@
      "name": "python",
      "nbconvert_exporter": "python",
      "pygments_lexer": "ipython3",
-     "version": "3.
+     "version": "3.9.5"
     }
    },
    "nbformat": 4,