alexmarques committed on
Commit
8cd5791
1 Parent(s): 28e887b

Update README.md

Browse files
Files changed (1) hide show
  1. README.md +82 -21
README.md CHANGED
@@ -155,54 +155,84 @@ This version of the lm-evaluation-harness includes versions of ARC-Challenge, GS
155
  <td><strong>Recovery</strong>
156
  </td>
157
  </tr>
 
 
 
 
 
 
 
 
 
 
158
  <tr>
159
  <td>MMLU-cot (0-shot)
160
  </td>
161
- <td>40.17
162
  </td>
163
- <td>39.40
164
  </td>
165
- <td>98.1%
166
  </td>
167
  </tr>
168
  <tr>
169
  <td>ARC Challenge (0-shot)
170
  </td>
171
- <td>58.02
172
  </td>
173
- <td>56.74
174
  </td>
175
- <td>97.8%
176
  </td>
177
  </tr>
178
  <tr>
179
  <td>GSM-8K-cot (8-shot, strict-match)
180
  </td>
181
- <td>46.63
182
  </td>
183
- <td>45.49
184
  </td>
185
- <td>97.6%
186
  </td>
187
  </tr>
188
  <tr>
189
  <td>Winogrande (5-shot)
190
  </td>
191
- <td>61.96
192
  </td>
193
- <td>61.88
194
  </td>
195
- <td>99.9%
196
  </td>
197
  </tr>
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
198
  <tr>
199
  <td><strong>Average</strong>
200
  </td>
201
- <td><strong>51.70</strong>
202
  </td>
203
- <td><strong>50.88</strong>
204
  </td>
205
- <td><strong>98.41%</strong>
206
  </td>
207
  </tr>
208
  </table>
@@ -212,11 +242,23 @@ This version of the lm-evaluation-harness includes versions of ARC-Challenge, GS
212
  The results were obtained using the following commands:
213
 
214
 
215
- #### MMLU-cot
 
 
 
 
 
 
 
 
 
 
 
 
216
  ```
217
  lm_eval \
218
  --model vllm \
219
- --model_args pretrained="neuralmagic/Llama-3.2-1B-Instruct-FP8-dynamic",dtype=auto,add_bos_token=False,max_model_len=4096,tensor_parallel_size=1 \
220
  --tasks mmlu_cot_0shot_llama_3.1_instruct \
221
  --apply_chat_template \
222
  --num_fewshot 0 \
@@ -227,7 +269,7 @@ lm_eval \
227
  ```
228
  lm_eval \
229
  --model vllm \
230
- --model_args pretrained="neuralmagic/Llama-3.2-1B-Instruct-FP8-dynamic",dtype=auto,add_bos_token=False,max_model_len=4096,tensor_parallel_size=1 \
231
  --tasks arc_challenge_llama_3.1_instruct \
232
  --apply_chat_template \
233
  --num_fewshot 0 \
@@ -238,21 +280,40 @@ lm_eval \
238
  ```
239
  lm_eval \
240
  --model vllm \
241
- --model_args pretrained="neuralmagic/Llama-3.2-1B-Instruct-FP8-dynamic",dtype=auto,add_bos_token=False,max_model_len=4096,tensor_parallel_size=1 \
242
  --tasks gsm8k_cot_llama_3.1_instruct \
243
- --apply_chat_template \
244
  --fewshot_as_multiturn \
 
245
  --num_fewshot 8 \
246
  --batch_size auto
247
  ```
248
 
 
 
 
 
 
 
 
 
 
 
249
  #### Winogrande
250
  ```
251
  lm_eval \
252
  --model vllm \
253
- --model_args pretrained="neuralmagic/Llama-3.2-1B-Instruct-FP8-dynamic",dtype=auto,add_bos_token=False,max_model_len=4096,tensor_parallel_size=1 \
254
  --tasks winogrande \
255
  --num_fewshot 5 \
256
  --batch_size auto
257
  ```
258
 
 
 
 
 
 
 
 
 
 
 
155
  <td><strong>Recovery</strong>
156
  </td>
157
  </tr>
158
+ <tr>
159
+ <td>MMLU (5-shot)
160
+ </td>
161
+ <td>47.66
162
+ </td>
163
+ <td>47.55
164
+ </td>
165
+ <td>99.8%
166
+ </td>
167
+ </tr>
168
  <tr>
169
  <td>MMLU-cot (0-shot)
170
  </td>
171
+ <td>47.10
172
  </td>
173
+ <td>46.79
174
  </td>
175
+ <td>99.3%
176
  </td>
177
  </tr>
178
  <tr>
179
  <td>ARC Challenge (0-shot)
180
  </td>
181
+ <td>58.36
182
  </td>
183
+ <td>57.25
184
  </td>
185
+ <td>98.1%
186
  </td>
187
  </tr>
188
  <tr>
189
  <td>GSM-8K-cot (8-shot, strict-match)
190
  </td>
191
+ <td>45.72
192
  </td>
193
+ <td>45.94
194
  </td>
195
+ <td>100.5%
196
  </td>
197
  </tr>
198
  <tr>
199
  <td>Winogrande (5-shot)
200
  </td>
201
+ <td>62.27
202
  </td>
203
+ <td>61.40
204
  </td>
205
+ <td>98.6%
206
  </td>
207
  </tr>
208
218
+ <tr>
219
+ <td>TruthfulQA (0-shot, mc2)
220
+ </td>
221
+ <td>43.52
222
+ </td>
223
+ <td>44.23
224
+ </td>
225
+ <td>101.6%
226
+ </td>
227
+ </tr>
228
  <tr>
229
  <td><strong>Average</strong>
230
  </td>
231
+ <td><strong>52.24</strong>
232
  </td>
233
+ <td><strong>52.02</strong>
234
  </td>
235
+ <td><strong>99.7%</strong>
236
  </td>
237
  </tr>
238
  </table>
 
242
  The results were obtained using the following commands:
243
 
244
 
245
+ #### MMLU
246
+ ```
247
+ lm_eval \
248
+ --model vllm \
249
+ --model_args pretrained="neuralmagic/Meta-Llama-3.1-8B-Instruct-quantized.w8a8",dtype=auto,add_bos_token=True,max_model_len=3850,max_gen_toks=10,tensor_parallel_size=1 \
250
+ --tasks mmlu_llama_3.1_instruct \
251
+ --fewshot_as_multiturn \
252
+ --apply_chat_template \
253
+ --num_fewshot 5 \
254
+ --batch_size auto
255
+ ```
256
+
257
+ #### MMLU-CoT
258
  ```
259
  lm_eval \
260
  --model vllm \
261
+ --model_args pretrained="neuralmagic/Meta-Llama-3.1-8B-Instruct-quantized.w8a8",dtype=auto,max_model_len=4064,max_gen_toks=1024,tensor_parallel_size=1 \
262
  --tasks mmlu_cot_0shot_llama_3.1_instruct \
263
  --apply_chat_template \
264
  --num_fewshot 0 \
 
269
  ```
270
  lm_eval \
271
  --model vllm \
272
+ --model_args pretrained="neuralmagic/Meta-Llama-3.1-8B-Instruct-quantized.w8a8",dtype=auto,max_model_len=3940,max_gen_toks=100,tensor_parallel_size=1 \
273
  --tasks arc_challenge_llama_3.1_instruct \
274
  --apply_chat_template \
275
  --num_fewshot 0 \
 
280
  ```
281
  lm_eval \
282
  --model vllm \
283
+ --model_args pretrained="neuralmagic/Meta-Llama-3.1-8B-Instruct-quantized.w8a8",dtype=auto,max_model_len=4096,max_gen_toks=1024,tensor_parallel_size=1 \
284
  --tasks gsm8k_cot_llama_3.1_instruct \
 
285
  --fewshot_as_multiturn \
286
+ --apply_chat_template \
287
  --num_fewshot 8 \
288
  --batch_size auto
289
  ```
290
 
291
+ #### Hellaswag
292
+ ```
293
+ lm_eval \
294
+ --model vllm \
295
+ --model_args pretrained="neuralmagic/Meta-Llama-3.1-8B-Instruct-quantized.w8a8",dtype=auto,add_bos_token=True,max_model_len=4096,tensor_parallel_size=1 \
296
+ --tasks hellaswag \
297
+ --num_fewshot 10 \
298
+ --batch_size auto
299
+ ```
300
+
301
  #### Winogrande
302
  ```
303
  lm_eval \
304
  --model vllm \
305
+ --model_args pretrained="neuralmagic/Meta-Llama-3.1-8B-Instruct-quantized.w8a8",dtype=auto,add_bos_token=True,max_model_len=4096,tensor_parallel_size=1 \
306
  --tasks winogrande \
307
  --num_fewshot 5 \
308
  --batch_size auto
309
  ```
310
 
311
+ #### TruthfulQA
312
+ ```
313
+ lm_eval \
314
+ --model vllm \
315
+ --model_args pretrained="neuralmagic/Meta-Llama-3.1-8B-Instruct-quantized.w8a8",dtype=auto,add_bos_token=True,max_model_len=4096,tensor_parallel_size=1 \
316
+ --tasks truthfulqa \
317
+ --num_fewshot 0 \
318
+ --batch_size auto
319
+ ```