Isaak Carter Augustus committed
Commit 0caf7af
1 Parent(s): f6673f4

Update README.md

Files changed (1)
  1. README.md +385 -0
README.md CHANGED
@@ -38,6 +38,391 @@ parameters:
  dtype: bfloat16
  ```
 
+ ## Evaluation
+
+ ```json
+ {
+     "all": {
+         "acc": 0.6312165296664113,
+         "acc_stderr": 0.03236370559394293,
+         "acc_norm": 0.6324439925872714,
+         "acc_norm_stderr": 0.033019786616359854,
+         "mc1": 0.39657282741738065,
+         "mc1_stderr": 0.017124930942023518,
+         "mc2": 0.5688038233837539,
+         "mc2_stderr": 0.015263125204118244
+     },
+     "harness|arc:challenge|25": {
+         "acc": 0.6126279863481229,
+         "acc_stderr": 0.014235872487909869,
+         "acc_norm": 0.6348122866894198,
+         "acc_norm_stderr": 0.014070265519268802
+     },
+     "harness|hellaswag|10": {
+         "acc": 0.643397729535949,
+         "acc_stderr": 0.00478016987333285,
+         "acc_norm": 0.8378809002190799,
+         "acc_norm_stderr": 0.0036780679944244735
+     },
+     "harness|hendrycksTest-abstract_algebra|5": {
+         "acc": 0.32,
+         "acc_stderr": 0.046882617226215034,
+         "acc_norm": 0.32,
+         "acc_norm_stderr": 0.046882617226215034
+     },
+     "harness|hendrycksTest-anatomy|5": {
+         "acc": 0.6074074074074074,
+         "acc_stderr": 0.0421850621536888,
+         "acc_norm": 0.6074074074074074,
+         "acc_norm_stderr": 0.0421850621536888
+     },
+     "harness|hendrycksTest-astronomy|5": {
+         "acc": 0.6907894736842105,
+         "acc_stderr": 0.037610708698674805,
+         "acc_norm": 0.6907894736842105,
+         "acc_norm_stderr": 0.037610708698674805
+     },
+     "harness|hendrycksTest-business_ethics|5": {
+         "acc": 0.61,
+         "acc_stderr": 0.04902071300001975,
+         "acc_norm": 0.61,
+         "acc_norm_stderr": 0.04902071300001975
+     },
+     "harness|hendrycksTest-clinical_knowledge|5": {
+         "acc": 0.6754716981132075,
+         "acc_stderr": 0.02881561571343211,
+         "acc_norm": 0.6754716981132075,
+         "acc_norm_stderr": 0.02881561571343211
+     },
+     "harness|hendrycksTest-college_biology|5": {
+         "acc": 0.7291666666666666,
+         "acc_stderr": 0.03716177437566017,
+         "acc_norm": 0.7291666666666666,
+         "acc_norm_stderr": 0.03716177437566017
+     },
+     "harness|hendrycksTest-college_chemistry|5": {
+         "acc": 0.47,
+         "acc_stderr": 0.05016135580465919,
+         "acc_norm": 0.47,
+         "acc_norm_stderr": 0.05016135580465919
+     },
+     "harness|hendrycksTest-college_computer_science|5": {
+         "acc": 0.48,
+         "acc_stderr": 0.050211673156867795,
+         "acc_norm": 0.48,
+         "acc_norm_stderr": 0.050211673156867795
+     },
+     "harness|hendrycksTest-college_mathematics|5": {
+         "acc": 0.34,
+         "acc_stderr": 0.04760952285695235,
+         "acc_norm": 0.34,
+         "acc_norm_stderr": 0.04760952285695235
+     },
+     "harness|hendrycksTest-college_medicine|5": {
+         "acc": 0.6011560693641619,
+         "acc_stderr": 0.037336266553835096,
+         "acc_norm": 0.6011560693641619,
+         "acc_norm_stderr": 0.037336266553835096
+     },
+     "harness|hendrycksTest-college_physics|5": {
+         "acc": 0.29411764705882354,
+         "acc_stderr": 0.04533838195929775,
+         "acc_norm": 0.29411764705882354,
+         "acc_norm_stderr": 0.04533838195929775
+     },
+     "harness|hendrycksTest-computer_security|5": {
+         "acc": 0.72,
+         "acc_stderr": 0.045126085985421276,
+         "acc_norm": 0.72,
+         "acc_norm_stderr": 0.045126085985421276
+     },
+     "harness|hendrycksTest-conceptual_physics|5": {
+         "acc": 0.5659574468085107,
+         "acc_stderr": 0.03240038086792747,
+         "acc_norm": 0.5659574468085107,
+         "acc_norm_stderr": 0.03240038086792747
+     },
+     "harness|hendrycksTest-econometrics|5": {
+         "acc": 0.5,
+         "acc_stderr": 0.047036043419179864,
+         "acc_norm": 0.5,
+         "acc_norm_stderr": 0.047036043419179864
+     },
+     "harness|hendrycksTest-electrical_engineering|5": {
+         "acc": 0.5448275862068965,
+         "acc_stderr": 0.04149886942192117,
+         "acc_norm": 0.5448275862068965,
+         "acc_norm_stderr": 0.04149886942192117
+     },
+     "harness|hendrycksTest-elementary_mathematics|5": {
+         "acc": 0.4021164021164021,
+         "acc_stderr": 0.02525303255499769,
+         "acc_norm": 0.4021164021164021,
+         "acc_norm_stderr": 0.02525303255499769
+     },
+     "harness|hendrycksTest-formal_logic|5": {
+         "acc": 0.42063492063492064,
+         "acc_stderr": 0.04415438226743744,
+         "acc_norm": 0.42063492063492064,
+         "acc_norm_stderr": 0.04415438226743744
+     },
+     "harness|hendrycksTest-global_facts|5": {
+         "acc": 0.39,
+         "acc_stderr": 0.04902071300001975,
+         "acc_norm": 0.39,
+         "acc_norm_stderr": 0.04902071300001975
+     },
+     "harness|hendrycksTest-high_school_biology|5": {
+         "acc": 0.7774193548387097,
+         "acc_stderr": 0.02366421667164251,
+         "acc_norm": 0.7774193548387097,
+         "acc_norm_stderr": 0.02366421667164251
+     },
+     "harness|hendrycksTest-high_school_chemistry|5": {
+         "acc": 0.4876847290640394,
+         "acc_stderr": 0.035169204442208966,
+         "acc_norm": 0.4876847290640394,
+         "acc_norm_stderr": 0.035169204442208966
+     },
+     "harness|hendrycksTest-high_school_computer_science|5": {
+         "acc": 0.68,
+         "acc_stderr": 0.04688261722621505,
+         "acc_norm": 0.68,
+         "acc_norm_stderr": 0.04688261722621505
+     },
+     "harness|hendrycksTest-high_school_european_history|5": {
+         "acc": 0.7818181818181819,
+         "acc_stderr": 0.03225078108306289,
+         "acc_norm": 0.7818181818181819,
+         "acc_norm_stderr": 0.03225078108306289
+     },
+     "harness|hendrycksTest-high_school_geography|5": {
+         "acc": 0.803030303030303,
+         "acc_stderr": 0.02833560973246336,
+         "acc_norm": 0.803030303030303,
+         "acc_norm_stderr": 0.02833560973246336
+     },
+     "harness|hendrycksTest-high_school_government_and_politics|5": {
+         "acc": 0.8549222797927462,
+         "acc_stderr": 0.025416343096306433,
+         "acc_norm": 0.8549222797927462,
+         "acc_norm_stderr": 0.025416343096306433
+     },
+     "harness|hendrycksTest-high_school_macroeconomics|5": {
+         "acc": 0.6435897435897436,
+         "acc_stderr": 0.02428314052946731,
+         "acc_norm": 0.6435897435897436,
+         "acc_norm_stderr": 0.02428314052946731
+     },
+     "harness|hendrycksTest-high_school_mathematics|5": {
+         "acc": 0.32592592592592595,
+         "acc_stderr": 0.028578348365473072,
+         "acc_norm": 0.32592592592592595,
+         "acc_norm_stderr": 0.028578348365473072
+     },
+     "harness|hendrycksTest-high_school_microeconomics|5": {
+         "acc": 0.6638655462184874,
+         "acc_stderr": 0.030684737115135367,
+         "acc_norm": 0.6638655462184874,
+         "acc_norm_stderr": 0.030684737115135367
+     },
+     "harness|hendrycksTest-high_school_physics|5": {
+         "acc": 0.31788079470198677,
+         "acc_stderr": 0.038020397601079024,
+         "acc_norm": 0.31788079470198677,
+         "acc_norm_stderr": 0.038020397601079024
+     },
+     "harness|hendrycksTest-high_school_psychology|5": {
+         "acc": 0.8220183486238533,
+         "acc_stderr": 0.01639943636661289,
+         "acc_norm": 0.8220183486238533,
+         "acc_norm_stderr": 0.01639943636661289
+     },
+     "harness|hendrycksTest-high_school_statistics|5": {
+         "acc": 0.5185185185185185,
+         "acc_stderr": 0.034076320938540516,
+         "acc_norm": 0.5185185185185185,
+         "acc_norm_stderr": 0.034076320938540516
+     },
+     "harness|hendrycksTest-high_school_us_history|5": {
+         "acc": 0.803921568627451,
+         "acc_stderr": 0.027865942286639318,
+         "acc_norm": 0.803921568627451,
+         "acc_norm_stderr": 0.027865942286639318
+     },
+     "harness|hendrycksTest-high_school_world_history|5": {
+         "acc": 0.7974683544303798,
+         "acc_stderr": 0.026160568246601453,
+         "acc_norm": 0.7974683544303798,
+         "acc_norm_stderr": 0.026160568246601453
+     },
+     "harness|hendrycksTest-human_aging|5": {
+         "acc": 0.6995515695067265,
+         "acc_stderr": 0.03076935200822914,
+         "acc_norm": 0.6995515695067265,
+         "acc_norm_stderr": 0.03076935200822914
+     },
+     "harness|hendrycksTest-human_sexuality|5": {
+         "acc": 0.7480916030534351,
+         "acc_stderr": 0.03807387116306085,
+         "acc_norm": 0.7480916030534351,
+         "acc_norm_stderr": 0.03807387116306085
+     },
+     "harness|hendrycksTest-international_law|5": {
+         "acc": 0.8016528925619835,
+         "acc_stderr": 0.036401182719909456,
+         "acc_norm": 0.8016528925619835,
+         "acc_norm_stderr": 0.036401182719909456
+     },
+     "harness|hendrycksTest-jurisprudence|5": {
+         "acc": 0.8055555555555556,
+         "acc_stderr": 0.038260763248848646,
+         "acc_norm": 0.8055555555555556,
+         "acc_norm_stderr": 0.038260763248848646
+     },
+     "harness|hendrycksTest-logical_fallacies|5": {
+         "acc": 0.754601226993865,
+         "acc_stderr": 0.03380939813943354,
+         "acc_norm": 0.754601226993865,
+         "acc_norm_stderr": 0.03380939813943354
+     },
+     "harness|hendrycksTest-machine_learning|5": {
+         "acc": 0.44642857142857145,
+         "acc_stderr": 0.04718471485219588,
+         "acc_norm": 0.44642857142857145,
+         "acc_norm_stderr": 0.04718471485219588
+     },
+     "harness|hendrycksTest-management|5": {
+         "acc": 0.7961165048543689,
+         "acc_stderr": 0.039891398595317706,
+         "acc_norm": 0.7961165048543689,
+         "acc_norm_stderr": 0.039891398595317706
+     },
+     "harness|hendrycksTest-marketing|5": {
+         "acc": 0.8589743589743589,
+         "acc_stderr": 0.02280138253459754,
+         "acc_norm": 0.8589743589743589,
+         "acc_norm_stderr": 0.02280138253459754
+     },
+     "harness|hendrycksTest-medical_genetics|5": {
+         "acc": 0.73,
+         "acc_stderr": 0.044619604333847394,
+         "acc_norm": 0.73,
+         "acc_norm_stderr": 0.044619604333847394
+     },
+     "harness|hendrycksTest-miscellaneous|5": {
+         "acc": 0.8084291187739464,
+         "acc_stderr": 0.014072859310451949,
+         "acc_norm": 0.8084291187739464,
+         "acc_norm_stderr": 0.014072859310451949
+     },
+     "harness|hendrycksTest-moral_disputes|5": {
+         "acc": 0.7312138728323699,
+         "acc_stderr": 0.023868003262500104,
+         "acc_norm": 0.7312138728323699,
+         "acc_norm_stderr": 0.023868003262500104
+     },
+     "harness|hendrycksTest-moral_scenarios|5": {
+         "acc": 0.24916201117318434,
+         "acc_stderr": 0.014465893829859924,
+         "acc_norm": 0.24916201117318434,
+         "acc_norm_stderr": 0.014465893829859924
+     },
+     "harness|hendrycksTest-nutrition|5": {
+         "acc": 0.7124183006535948,
+         "acc_stderr": 0.02591780611714716,
+         "acc_norm": 0.7124183006535948,
+         "acc_norm_stderr": 0.02591780611714716
+     },
+     "harness|hendrycksTest-philosophy|5": {
+         "acc": 0.7106109324758842,
+         "acc_stderr": 0.025755865922632945,
+         "acc_norm": 0.7106109324758842,
+         "acc_norm_stderr": 0.025755865922632945
+     },
+     "harness|hendrycksTest-prehistory|5": {
+         "acc": 0.6975308641975309,
+         "acc_stderr": 0.02555765398186806,
+         "acc_norm": 0.6975308641975309,
+         "acc_norm_stderr": 0.02555765398186806
+     },
+     "harness|hendrycksTest-professional_accounting|5": {
+         "acc": 0.49645390070921985,
+         "acc_stderr": 0.02982674915328092,
+         "acc_norm": 0.49645390070921985,
+         "acc_norm_stderr": 0.02982674915328092
+     },
+     "harness|hendrycksTest-professional_law|5": {
+         "acc": 0.4745762711864407,
+         "acc_stderr": 0.01275371692910101,
+         "acc_norm": 0.4745762711864407,
+         "acc_norm_stderr": 0.01275371692910101
+     },
+     "harness|hendrycksTest-professional_medicine|5": {
+         "acc": 0.6507352941176471,
+         "acc_stderr": 0.028959755196824862,
+         "acc_norm": 0.6507352941176471,
+         "acc_norm_stderr": 0.028959755196824862
+     },
+     "harness|hendrycksTest-professional_psychology|5": {
+         "acc": 0.6323529411764706,
+         "acc_stderr": 0.019506291693954843,
+         "acc_norm": 0.6323529411764706,
+         "acc_norm_stderr": 0.019506291693954843
+     },
+     "harness|hendrycksTest-public_relations|5": {
+         "acc": 0.6363636363636364,
+         "acc_stderr": 0.046075820907199756,
+         "acc_norm": 0.6363636363636364,
+         "acc_norm_stderr": 0.046075820907199756
+     },
+     "harness|hendrycksTest-security_studies|5": {
+         "acc": 0.7183673469387755,
+         "acc_stderr": 0.028795185574291293,
+         "acc_norm": 0.7183673469387755,
+         "acc_norm_stderr": 0.028795185574291293
+     },
+     "harness|hendrycksTest-sociology|5": {
+         "acc": 0.835820895522388,
+         "acc_stderr": 0.026193923544454125,
+         "acc_norm": 0.835820895522388,
+         "acc_norm_stderr": 0.026193923544454125
+     },
+     "harness|hendrycksTest-us_foreign_policy|5": {
+         "acc": 0.87,
+         "acc_stderr": 0.033799766898963086,
+         "acc_norm": 0.87,
+         "acc_norm_stderr": 0.033799766898963086
+     },
+     "harness|hendrycksTest-virology|5": {
+         "acc": 0.5180722891566265,
+         "acc_stderr": 0.03889951252827216,
+         "acc_norm": 0.5180722891566265,
+         "acc_norm_stderr": 0.03889951252827216
+     },
+     "harness|hendrycksTest-world_religions|5": {
+         "acc": 0.8187134502923976,
+         "acc_stderr": 0.029547741687640038,
+         "acc_norm": 0.8187134502923976,
+         "acc_norm_stderr": 0.029547741687640038
+     },
+     "harness|truthfulqa:mc|0": {
+         "mc1": 0.39657282741738065,
+         "mc1_stderr": 0.017124930942023518,
+         "mc2": 0.5688038233837539,
+         "mc2_stderr": 0.015263125204118244
+     },
+     "harness|winogrande|5": {
+         "acc": 0.7963693764798737,
+         "acc_stderr": 0.011317798781626918
+     },
+     "harness|gsm8k|5": {
+         "acc": 0.6103108415466262,
+         "acc_stderr": 0.01343312323611072
+     }
+ }
+ ```
+
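+ The task keys follow the EleutherAI lm-evaluation-harness naming used by the Open LLM Leaderboard: `harness|<task>|<n>` denotes an n-shot run (25-shot ARC-Challenge, 10-shot HellaSwag, 5-shot MMLU subjects, and so on). As a minimal sketch of how these numbers aggregate, assuming the JSON above is saved to a hypothetical `results.json`, the leaderboard-style per-benchmark scores and their average can be recomputed with the standard library:
+
+ ```python
+ import json
+ import statistics
+
+ # Hypothetical file holding the JSON block above.
+ with open("results.json") as f:
+     results = json.load(f)
+
+ # MMLU is reported per subject; average the 57 "hendrycksTest" subtasks.
+ mmlu_accs = [
+     v["acc"] for k, v in results.items()
+     if k.startswith("harness|hendrycksTest-")
+ ]
+
+ # Per-benchmark fields follow Open LLM Leaderboard conventions:
+ # acc_norm for ARC/HellaSwag, mc2 for TruthfulQA, acc otherwise.
+ benchmarks = {
+     "ARC (25-shot)": results["harness|arc:challenge|25"]["acc_norm"],
+     "HellaSwag (10-shot)": results["harness|hellaswag|10"]["acc_norm"],
+     "MMLU (5-shot)": statistics.mean(mmlu_accs),
+     "TruthfulQA (0-shot)": results["harness|truthfulqa:mc|0"]["mc2"],
+     "Winogrande (5-shot)": results["harness|winogrande|5"]["acc"],
+     "GSM8K (5-shot)": results["harness|gsm8k|5"]["acc"],
+ }
+ for name, score in benchmarks.items():
+     print(f"{name}: {score:.4f}")
+ print(f"Average: {statistics.mean(benchmarks.values()):.4f}")
+ ```
+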
  ## 💻 Usage
 
  ```python