alexmarques committed on
Commit
422a385
1 Parent(s): b6c9b9c

Update README.md

Browse files
Files changed (1) hide show
  1. README.md +21 -21
README.md CHANGED
@@ -148,71 +148,71 @@ This version of the lm-evaluation-harness includes versions of ARC-Challenge and
148
  <tr>
149
  <td>MMLU (5-shot)
150
  </td>
151
- <td>67.94
152
  </td>
153
- <td>68.09
154
  </td>
155
- <td>100.2%
156
  </td>
157
  </tr>
158
  <tr>
159
  <td>ARC Challenge (0-shot)
160
  </td>
161
- <td>83.19
162
  </td>
163
- <td>82.68
164
  </td>
165
- <td>99.4%
166
  </td>
167
  </tr>
168
  <tr>
169
  <td>GSM-8K (CoT, 8-shot, strict-match)
170
  </td>
171
- <td>82.79
172
  </td>
173
- <td>82.64
174
  </td>
175
- <td>99.8%
176
  </td>
177
  </tr>
178
  <tr>
179
  <td>Hellaswag (10-shot)
180
  </td>
181
- <td>80.01
182
  </td>
183
- <td>80.21
184
  </td>
185
- <td>100.3%
186
  </td>
187
  </tr>
188
  <tr>
189
  <td>Winogrande (5-shot)
190
  </td>
191
- <td>77.90
192
  </td>
193
- <td>77.27
194
  </td>
195
- <td>99.2%
196
  </td>
197
  </tr>
198
  <tr>
199
  <td>TruthfulQA (0-shot, mc2)
200
  </td>
201
- <td>54.04
202
  </td>
203
- <td>54.15
204
  </td>
205
- <td>100.2%
206
  </td>
207
  </tr>
208
  <tr>
209
  <td><strong>Average</strong>
210
  </td>
211
- <td><strong>74.31</strong>
212
  </td>
213
- <td><strong>74.17</strong>
214
  </td>
215
- <td><strong>99.8%</strong>
216
  </td>
217
  </tr>
218
  </table>
 
148
  <tr>
149
  <td>MMLU (5-shot)
150
  </td>
151
+ <td>82.21
152
  </td>
153
+ <td>82.12
154
  </td>
155
+ <td>99.9%
156
  </td>
157
  </tr>
158
  <tr>
159
  <td>ARC Challenge (0-shot)
160
  </td>
161
+ <td>95.05
162
  </td>
163
+ <td>93.60
164
  </td>
165
+ <td>98.5%
166
  </td>
167
  </tr>
168
  <tr>
169
  <td>GSM-8K (CoT, 8-shot, strict-match)
170
  </td>
171
+ <td>93.10
172
  </td>
173
+ <td>92.27
174
  </td>
175
+ <td>99.1%
176
  </td>
177
  </tr>
178
  <tr>
179
  <td>Hellaswag (10-shot)
180
  </td>
181
+ <td>86.40
182
  </td>
183
+ <td>86.11
184
  </td>
185
+ <td>99.7%
186
  </td>
187
  </tr>
188
  <tr>
189
  <td>Winogrande (5-shot)
190
  </td>
191
+ <td>85.00
192
  </td>
193
+ <td>84.14
194
  </td>
195
+ <td>99.0%
196
  </td>
197
  </tr>
198
  <tr>
199
  <td>TruthfulQA (0-shot, mc2)
200
  </td>
201
+ <td>59.83
202
  </td>
203
+ <td>58.90
204
  </td>
205
+ <td>98.5%
206
  </td>
207
  </tr>
208
  <tr>
209
  <td><strong>Average</strong>
210
  </td>
211
+ <td><strong>83.60</strong>
212
  </td>
213
+ <td><strong>82.86</strong>
214
  </td>
215
+ <td><strong>99.1%</strong>
216
  </td>
217
  </tr>
218
  </table>