Ramikan-BR commited on
Commit
0fae5dc
1 Parent(s): 303c73a

Upload 2 files

Browse files
results_2024-05-27T06-04-08.229332.json ADDED
@@ -0,0 +1,1409 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "config_general": {
3
+ "lighteval_sha": "494ee12240e716e804ae9ea834f84a2c864c07ca",
4
+ "num_few_shot_default": 0,
5
+ "num_fewshot_seeds": 1,
6
+ "override_batch_size": 1,
7
+ "max_samples": null,
8
+ "job_id": "",
9
+ "start_time": 2370587.736800548,
10
+ "end_time": 2372824.830827315,
11
+ "total_evaluation_time_secondes": "2237.0940267667174",
12
+ "model_name": "Ramikan-BR/tinyllama-coder-py-v12",
13
+ "model_sha": "5835856d42314f549c92bb77eb9ca3e44edd1cda",
14
+ "model_dtype": "torch.float16",
15
+ "model_size": "2.05 GB"
16
+ },
17
+ "results": {
18
+ "harness|arc:challenge|25": {
19
+ "acc": 0.2858361774744027,
20
+ "acc_stderr": 0.01320319608853737,
21
+ "acc_norm": 0.3199658703071672,
22
+ "acc_norm_stderr": 0.013631345807016193
23
+ },
24
+ "harness|hellaswag|10": {
25
+ "acc": 0.41276638119896436,
26
+ "acc_stderr": 0.004913253031155681,
27
+ "acc_norm": 0.5361481776538538,
28
+ "acc_norm_stderr": 0.004976724124850562
29
+ },
30
+ "harness|hendrycksTest-abstract_algebra|5": {
31
+ "acc": 0.23,
32
+ "acc_stderr": 0.04229525846816505,
33
+ "acc_norm": 0.23,
34
+ "acc_norm_stderr": 0.04229525846816505
35
+ },
36
+ "harness|hendrycksTest-anatomy|5": {
37
+ "acc": 0.2814814814814815,
38
+ "acc_stderr": 0.038850042458002526,
39
+ "acc_norm": 0.2814814814814815,
40
+ "acc_norm_stderr": 0.038850042458002526
41
+ },
42
+ "harness|hendrycksTest-astronomy|5": {
43
+ "acc": 0.17763157894736842,
44
+ "acc_stderr": 0.031103182383123387,
45
+ "acc_norm": 0.17763157894736842,
46
+ "acc_norm_stderr": 0.031103182383123387
47
+ },
48
+ "harness|hendrycksTest-business_ethics|5": {
49
+ "acc": 0.26,
50
+ "acc_stderr": 0.044084400227680794,
51
+ "acc_norm": 0.26,
52
+ "acc_norm_stderr": 0.044084400227680794
53
+ },
54
+ "harness|hendrycksTest-clinical_knowledge|5": {
55
+ "acc": 0.24528301886792453,
56
+ "acc_stderr": 0.026480357179895685,
57
+ "acc_norm": 0.24528301886792453,
58
+ "acc_norm_stderr": 0.026480357179895685
59
+ },
60
+ "harness|hendrycksTest-college_biology|5": {
61
+ "acc": 0.2361111111111111,
62
+ "acc_stderr": 0.03551446610810826,
63
+ "acc_norm": 0.2361111111111111,
64
+ "acc_norm_stderr": 0.03551446610810826
65
+ },
66
+ "harness|hendrycksTest-college_chemistry|5": {
67
+ "acc": 0.39,
68
+ "acc_stderr": 0.04902071300001974,
69
+ "acc_norm": 0.39,
70
+ "acc_norm_stderr": 0.04902071300001974
71
+ },
72
+ "harness|hendrycksTest-college_computer_science|5": {
73
+ "acc": 0.33,
74
+ "acc_stderr": 0.04725815626252604,
75
+ "acc_norm": 0.33,
76
+ "acc_norm_stderr": 0.04725815626252604
77
+ },
78
+ "harness|hendrycksTest-college_mathematics|5": {
79
+ "acc": 0.33,
80
+ "acc_stderr": 0.047258156262526045,
81
+ "acc_norm": 0.33,
82
+ "acc_norm_stderr": 0.047258156262526045
83
+ },
84
+ "harness|hendrycksTest-college_medicine|5": {
85
+ "acc": 0.2138728323699422,
86
+ "acc_stderr": 0.03126511206173043,
87
+ "acc_norm": 0.2138728323699422,
88
+ "acc_norm_stderr": 0.03126511206173043
89
+ },
90
+ "harness|hendrycksTest-college_physics|5": {
91
+ "acc": 0.21568627450980393,
92
+ "acc_stderr": 0.04092563958237655,
93
+ "acc_norm": 0.21568627450980393,
94
+ "acc_norm_stderr": 0.04092563958237655
95
+ },
96
+ "harness|hendrycksTest-computer_security|5": {
97
+ "acc": 0.31,
98
+ "acc_stderr": 0.04648231987117316,
99
+ "acc_norm": 0.31,
100
+ "acc_norm_stderr": 0.04648231987117316
101
+ },
102
+ "harness|hendrycksTest-conceptual_physics|5": {
103
+ "acc": 0.251063829787234,
104
+ "acc_stderr": 0.02834696377716245,
105
+ "acc_norm": 0.251063829787234,
106
+ "acc_norm_stderr": 0.02834696377716245
107
+ },
108
+ "harness|hendrycksTest-econometrics|5": {
109
+ "acc": 0.2894736842105263,
110
+ "acc_stderr": 0.04266339443159394,
111
+ "acc_norm": 0.2894736842105263,
112
+ "acc_norm_stderr": 0.04266339443159394
113
+ },
114
+ "harness|hendrycksTest-electrical_engineering|5": {
115
+ "acc": 0.21379310344827587,
116
+ "acc_stderr": 0.03416520447747548,
117
+ "acc_norm": 0.21379310344827587,
118
+ "acc_norm_stderr": 0.03416520447747548
119
+ },
120
+ "harness|hendrycksTest-elementary_mathematics|5": {
121
+ "acc": 0.2222222222222222,
122
+ "acc_stderr": 0.021411684393694203,
123
+ "acc_norm": 0.2222222222222222,
124
+ "acc_norm_stderr": 0.021411684393694203
125
+ },
126
+ "harness|hendrycksTest-formal_logic|5": {
127
+ "acc": 0.21428571428571427,
128
+ "acc_stderr": 0.03670066451047181,
129
+ "acc_norm": 0.21428571428571427,
130
+ "acc_norm_stderr": 0.03670066451047181
131
+ },
132
+ "harness|hendrycksTest-global_facts|5": {
133
+ "acc": 0.3,
134
+ "acc_stderr": 0.046056618647183814,
135
+ "acc_norm": 0.3,
136
+ "acc_norm_stderr": 0.046056618647183814
137
+ },
138
+ "harness|hendrycksTest-high_school_biology|5": {
139
+ "acc": 0.2064516129032258,
140
+ "acc_stderr": 0.023025899617188726,
141
+ "acc_norm": 0.2064516129032258,
142
+ "acc_norm_stderr": 0.023025899617188726
143
+ },
144
+ "harness|hendrycksTest-high_school_chemistry|5": {
145
+ "acc": 0.1625615763546798,
146
+ "acc_stderr": 0.025960300064605597,
147
+ "acc_norm": 0.1625615763546798,
148
+ "acc_norm_stderr": 0.025960300064605597
149
+ },
150
+ "harness|hendrycksTest-high_school_computer_science|5": {
151
+ "acc": 0.3,
152
+ "acc_stderr": 0.046056618647183814,
153
+ "acc_norm": 0.3,
154
+ "acc_norm_stderr": 0.046056618647183814
155
+ },
156
+ "harness|hendrycksTest-high_school_european_history|5": {
157
+ "acc": 0.21212121212121213,
158
+ "acc_stderr": 0.03192271569548299,
159
+ "acc_norm": 0.21212121212121213,
160
+ "acc_norm_stderr": 0.03192271569548299
161
+ },
162
+ "harness|hendrycksTest-high_school_geography|5": {
163
+ "acc": 0.20707070707070707,
164
+ "acc_stderr": 0.028869778460267063,
165
+ "acc_norm": 0.20707070707070707,
166
+ "acc_norm_stderr": 0.028869778460267063
167
+ },
168
+ "harness|hendrycksTest-high_school_government_and_politics|5": {
169
+ "acc": 0.26424870466321243,
170
+ "acc_stderr": 0.031821550509166484,
171
+ "acc_norm": 0.26424870466321243,
172
+ "acc_norm_stderr": 0.031821550509166484
173
+ },
174
+ "harness|hendrycksTest-high_school_macroeconomics|5": {
175
+ "acc": 0.32051282051282054,
176
+ "acc_stderr": 0.023661296393964273,
177
+ "acc_norm": 0.32051282051282054,
178
+ "acc_norm_stderr": 0.023661296393964273
179
+ },
180
+ "harness|hendrycksTest-high_school_mathematics|5": {
181
+ "acc": 0.2814814814814815,
182
+ "acc_stderr": 0.027420019350945277,
183
+ "acc_norm": 0.2814814814814815,
184
+ "acc_norm_stderr": 0.027420019350945277
185
+ },
186
+ "harness|hendrycksTest-high_school_microeconomics|5": {
187
+ "acc": 0.22268907563025211,
188
+ "acc_stderr": 0.027025433498882367,
189
+ "acc_norm": 0.22268907563025211,
190
+ "acc_norm_stderr": 0.027025433498882367
191
+ },
192
+ "harness|hendrycksTest-high_school_physics|5": {
193
+ "acc": 0.2847682119205298,
194
+ "acc_stderr": 0.03684881521389023,
195
+ "acc_norm": 0.2847682119205298,
196
+ "acc_norm_stderr": 0.03684881521389023
197
+ },
198
+ "harness|hendrycksTest-high_school_psychology|5": {
199
+ "acc": 0.24954128440366974,
200
+ "acc_stderr": 0.01855389762950161,
201
+ "acc_norm": 0.24954128440366974,
202
+ "acc_norm_stderr": 0.01855389762950161
203
+ },
204
+ "harness|hendrycksTest-high_school_statistics|5": {
205
+ "acc": 0.4722222222222222,
206
+ "acc_stderr": 0.0340470532865388,
207
+ "acc_norm": 0.4722222222222222,
208
+ "acc_norm_stderr": 0.0340470532865388
209
+ },
210
+ "harness|hendrycksTest-high_school_us_history|5": {
211
+ "acc": 0.25980392156862747,
212
+ "acc_stderr": 0.030778554678693264,
213
+ "acc_norm": 0.25980392156862747,
214
+ "acc_norm_stderr": 0.030778554678693264
215
+ },
216
+ "harness|hendrycksTest-high_school_world_history|5": {
217
+ "acc": 0.2616033755274262,
218
+ "acc_stderr": 0.028609516716994934,
219
+ "acc_norm": 0.2616033755274262,
220
+ "acc_norm_stderr": 0.028609516716994934
221
+ },
222
+ "harness|hendrycksTest-human_aging|5": {
223
+ "acc": 0.3004484304932735,
224
+ "acc_stderr": 0.030769352008229143,
225
+ "acc_norm": 0.3004484304932735,
226
+ "acc_norm_stderr": 0.030769352008229143
227
+ },
228
+ "harness|hendrycksTest-human_sexuality|5": {
229
+ "acc": 0.26717557251908397,
230
+ "acc_stderr": 0.038808483010823944,
231
+ "acc_norm": 0.26717557251908397,
232
+ "acc_norm_stderr": 0.038808483010823944
233
+ },
234
+ "harness|hendrycksTest-international_law|5": {
235
+ "acc": 0.24793388429752067,
236
+ "acc_stderr": 0.03941897526516302,
237
+ "acc_norm": 0.24793388429752067,
238
+ "acc_norm_stderr": 0.03941897526516302
239
+ },
240
+ "harness|hendrycksTest-jurisprudence|5": {
241
+ "acc": 0.25925925925925924,
242
+ "acc_stderr": 0.042365112580946336,
243
+ "acc_norm": 0.25925925925925924,
244
+ "acc_norm_stderr": 0.042365112580946336
245
+ },
246
+ "harness|hendrycksTest-logical_fallacies|5": {
247
+ "acc": 0.2883435582822086,
248
+ "acc_stderr": 0.035590395316173425,
249
+ "acc_norm": 0.2883435582822086,
250
+ "acc_norm_stderr": 0.035590395316173425
251
+ },
252
+ "harness|hendrycksTest-machine_learning|5": {
253
+ "acc": 0.25892857142857145,
254
+ "acc_stderr": 0.041577515398656284,
255
+ "acc_norm": 0.25892857142857145,
256
+ "acc_norm_stderr": 0.041577515398656284
257
+ },
258
+ "harness|hendrycksTest-management|5": {
259
+ "acc": 0.1650485436893204,
260
+ "acc_stderr": 0.036756688322331886,
261
+ "acc_norm": 0.1650485436893204,
262
+ "acc_norm_stderr": 0.036756688322331886
263
+ },
264
+ "harness|hendrycksTest-marketing|5": {
265
+ "acc": 0.3247863247863248,
266
+ "acc_stderr": 0.030679022765498835,
267
+ "acc_norm": 0.3247863247863248,
268
+ "acc_norm_stderr": 0.030679022765498835
269
+ },
270
+ "harness|hendrycksTest-medical_genetics|5": {
271
+ "acc": 0.29,
272
+ "acc_stderr": 0.045604802157206845,
273
+ "acc_norm": 0.29,
274
+ "acc_norm_stderr": 0.045604802157206845
275
+ },
276
+ "harness|hendrycksTest-miscellaneous|5": {
277
+ "acc": 0.280970625798212,
278
+ "acc_stderr": 0.01607312785122126,
279
+ "acc_norm": 0.280970625798212,
280
+ "acc_norm_stderr": 0.01607312785122126
281
+ },
282
+ "harness|hendrycksTest-moral_disputes|5": {
283
+ "acc": 0.24855491329479767,
284
+ "acc_stderr": 0.023267528432100174,
285
+ "acc_norm": 0.24855491329479767,
286
+ "acc_norm_stderr": 0.023267528432100174
287
+ },
288
+ "harness|hendrycksTest-moral_scenarios|5": {
289
+ "acc": 0.25027932960893856,
290
+ "acc_stderr": 0.01448750085285041,
291
+ "acc_norm": 0.25027932960893856,
292
+ "acc_norm_stderr": 0.01448750085285041
293
+ },
294
+ "harness|hendrycksTest-nutrition|5": {
295
+ "acc": 0.21895424836601307,
296
+ "acc_stderr": 0.02367908986180772,
297
+ "acc_norm": 0.21895424836601307,
298
+ "acc_norm_stderr": 0.02367908986180772
299
+ },
300
+ "harness|hendrycksTest-philosophy|5": {
301
+ "acc": 0.2733118971061093,
302
+ "acc_stderr": 0.025311765975426115,
303
+ "acc_norm": 0.2733118971061093,
304
+ "acc_norm_stderr": 0.025311765975426115
305
+ },
306
+ "harness|hendrycksTest-prehistory|5": {
307
+ "acc": 0.19135802469135801,
308
+ "acc_stderr": 0.021887704613396158,
309
+ "acc_norm": 0.19135802469135801,
310
+ "acc_norm_stderr": 0.021887704613396158
311
+ },
312
+ "harness|hendrycksTest-professional_accounting|5": {
313
+ "acc": 0.24113475177304963,
314
+ "acc_stderr": 0.02551873104953776,
315
+ "acc_norm": 0.24113475177304963,
316
+ "acc_norm_stderr": 0.02551873104953776
317
+ },
318
+ "harness|hendrycksTest-professional_law|5": {
319
+ "acc": 0.24445893089960888,
320
+ "acc_stderr": 0.010976425013113897,
321
+ "acc_norm": 0.24445893089960888,
322
+ "acc_norm_stderr": 0.010976425013113897
323
+ },
324
+ "harness|hendrycksTest-professional_medicine|5": {
325
+ "acc": 0.4485294117647059,
326
+ "acc_stderr": 0.030211479609121593,
327
+ "acc_norm": 0.4485294117647059,
328
+ "acc_norm_stderr": 0.030211479609121593
329
+ },
330
+ "harness|hendrycksTest-professional_psychology|5": {
331
+ "acc": 0.21568627450980393,
332
+ "acc_stderr": 0.01663931935031326,
333
+ "acc_norm": 0.21568627450980393,
334
+ "acc_norm_stderr": 0.01663931935031326
335
+ },
336
+ "harness|hendrycksTest-public_relations|5": {
337
+ "acc": 0.23636363636363636,
338
+ "acc_stderr": 0.04069306319721376,
339
+ "acc_norm": 0.23636363636363636,
340
+ "acc_norm_stderr": 0.04069306319721376
341
+ },
342
+ "harness|hendrycksTest-security_studies|5": {
343
+ "acc": 0.2571428571428571,
344
+ "acc_stderr": 0.02797982353874455,
345
+ "acc_norm": 0.2571428571428571,
346
+ "acc_norm_stderr": 0.02797982353874455
347
+ },
348
+ "harness|hendrycksTest-sociology|5": {
349
+ "acc": 0.2537313432835821,
350
+ "acc_stderr": 0.030769444967296018,
351
+ "acc_norm": 0.2537313432835821,
352
+ "acc_norm_stderr": 0.030769444967296018
353
+ },
354
+ "harness|hendrycksTest-us_foreign_policy|5": {
355
+ "acc": 0.23,
356
+ "acc_stderr": 0.04229525846816506,
357
+ "acc_norm": 0.23,
358
+ "acc_norm_stderr": 0.04229525846816506
359
+ },
360
+ "harness|hendrycksTest-virology|5": {
361
+ "acc": 0.26506024096385544,
362
+ "acc_stderr": 0.03436024037944967,
363
+ "acc_norm": 0.26506024096385544,
364
+ "acc_norm_stderr": 0.03436024037944967
365
+ },
366
+ "harness|hendrycksTest-world_religions|5": {
367
+ "acc": 0.3216374269005848,
368
+ "acc_stderr": 0.03582529442573122,
369
+ "acc_norm": 0.3216374269005848,
370
+ "acc_norm_stderr": 0.03582529442573122
371
+ },
372
+ "harness|truthfulqa:mc|0": {
373
+ "mc1": 0.25458996328029376,
374
+ "mc1_stderr": 0.015250117079156496,
375
+ "mc2": 0.4091007666951377,
376
+ "mc2_stderr": 0.014365367143474025
377
+ },
378
+ "harness|winogrande|5": {
379
+ "acc": 0.5706393054459353,
380
+ "acc_stderr": 0.013911537499969163
381
+ },
382
+ "harness|gsm8k|5": {
383
+ "acc": 0.014404852160727824,
384
+ "acc_stderr": 0.0032820559171369444
385
+ },
386
+ "all": {
387
+ "acc": 0.26686435788728485,
388
+ "acc_stderr": 0.031169507718254618,
389
+ "acc_norm": 0.2686642908950062,
390
+ "acc_norm_stderr": 0.03194301691878982,
391
+ "mc1": 0.25458996328029376,
392
+ "mc1_stderr": 0.015250117079156496,
393
+ "mc2": 0.4091007666951377,
394
+ "mc2_stderr": 0.014365367143474025
395
+ }
396
+ },
397
+ "versions": {
398
+ "all": 0,
399
+ "harness|arc:challenge|25": 0,
400
+ "harness|gsm8k|5": 0,
401
+ "harness|hellaswag|10": 0,
402
+ "harness|hendrycksTest-abstract_algebra|5": 1,
403
+ "harness|hendrycksTest-anatomy|5": 1,
404
+ "harness|hendrycksTest-astronomy|5": 1,
405
+ "harness|hendrycksTest-business_ethics|5": 1,
406
+ "harness|hendrycksTest-clinical_knowledge|5": 1,
407
+ "harness|hendrycksTest-college_biology|5": 1,
408
+ "harness|hendrycksTest-college_chemistry|5": 1,
409
+ "harness|hendrycksTest-college_computer_science|5": 1,
410
+ "harness|hendrycksTest-college_mathematics|5": 1,
411
+ "harness|hendrycksTest-college_medicine|5": 1,
412
+ "harness|hendrycksTest-college_physics|5": 1,
413
+ "harness|hendrycksTest-computer_security|5": 1,
414
+ "harness|hendrycksTest-conceptual_physics|5": 1,
415
+ "harness|hendrycksTest-econometrics|5": 1,
416
+ "harness|hendrycksTest-electrical_engineering|5": 1,
417
+ "harness|hendrycksTest-elementary_mathematics|5": 1,
418
+ "harness|hendrycksTest-formal_logic|5": 1,
419
+ "harness|hendrycksTest-global_facts|5": 1,
420
+ "harness|hendrycksTest-high_school_biology|5": 1,
421
+ "harness|hendrycksTest-high_school_chemistry|5": 1,
422
+ "harness|hendrycksTest-high_school_computer_science|5": 1,
423
+ "harness|hendrycksTest-high_school_european_history|5": 1,
424
+ "harness|hendrycksTest-high_school_geography|5": 1,
425
+ "harness|hendrycksTest-high_school_government_and_politics|5": 1,
426
+ "harness|hendrycksTest-high_school_macroeconomics|5": 1,
427
+ "harness|hendrycksTest-high_school_mathematics|5": 1,
428
+ "harness|hendrycksTest-high_school_microeconomics|5": 1,
429
+ "harness|hendrycksTest-high_school_physics|5": 1,
430
+ "harness|hendrycksTest-high_school_psychology|5": 1,
431
+ "harness|hendrycksTest-high_school_statistics|5": 1,
432
+ "harness|hendrycksTest-high_school_us_history|5": 1,
433
+ "harness|hendrycksTest-high_school_world_history|5": 1,
434
+ "harness|hendrycksTest-human_aging|5": 1,
435
+ "harness|hendrycksTest-human_sexuality|5": 1,
436
+ "harness|hendrycksTest-international_law|5": 1,
437
+ "harness|hendrycksTest-jurisprudence|5": 1,
438
+ "harness|hendrycksTest-logical_fallacies|5": 1,
439
+ "harness|hendrycksTest-machine_learning|5": 1,
440
+ "harness|hendrycksTest-management|5": 1,
441
+ "harness|hendrycksTest-marketing|5": 1,
442
+ "harness|hendrycksTest-medical_genetics|5": 1,
443
+ "harness|hendrycksTest-miscellaneous|5": 1,
444
+ "harness|hendrycksTest-moral_disputes|5": 1,
445
+ "harness|hendrycksTest-moral_scenarios|5": 1,
446
+ "harness|hendrycksTest-nutrition|5": 1,
447
+ "harness|hendrycksTest-philosophy|5": 1,
448
+ "harness|hendrycksTest-prehistory|5": 1,
449
+ "harness|hendrycksTest-professional_accounting|5": 1,
450
+ "harness|hendrycksTest-professional_law|5": 1,
451
+ "harness|hendrycksTest-professional_medicine|5": 1,
452
+ "harness|hendrycksTest-professional_psychology|5": 1,
453
+ "harness|hendrycksTest-public_relations|5": 1,
454
+ "harness|hendrycksTest-security_studies|5": 1,
455
+ "harness|hendrycksTest-sociology|5": 1,
456
+ "harness|hendrycksTest-us_foreign_policy|5": 1,
457
+ "harness|hendrycksTest-virology|5": 1,
458
+ "harness|hendrycksTest-world_religions|5": 1,
459
+ "harness|truthfulqa:mc|0": 1,
460
+ "harness|winogrande|5": 0
461
+ },
462
+ "config_tasks": {
463
+ "harness|arc:challenge": "LM Harness task",
464
+ "harness|gsm8k": "LM Harness task",
465
+ "harness|hellaswag": "LM Harness task",
466
+ "harness|hendrycksTest-abstract_algebra": "LM Harness task",
467
+ "harness|hendrycksTest-anatomy": "LM Harness task",
468
+ "harness|hendrycksTest-astronomy": "LM Harness task",
469
+ "harness|hendrycksTest-business_ethics": "LM Harness task",
470
+ "harness|hendrycksTest-clinical_knowledge": "LM Harness task",
471
+ "harness|hendrycksTest-college_biology": "LM Harness task",
472
+ "harness|hendrycksTest-college_chemistry": "LM Harness task",
473
+ "harness|hendrycksTest-college_computer_science": "LM Harness task",
474
+ "harness|hendrycksTest-college_mathematics": "LM Harness task",
475
+ "harness|hendrycksTest-college_medicine": "LM Harness task",
476
+ "harness|hendrycksTest-college_physics": "LM Harness task",
477
+ "harness|hendrycksTest-computer_security": "LM Harness task",
478
+ "harness|hendrycksTest-conceptual_physics": "LM Harness task",
479
+ "harness|hendrycksTest-econometrics": "LM Harness task",
480
+ "harness|hendrycksTest-electrical_engineering": "LM Harness task",
481
+ "harness|hendrycksTest-elementary_mathematics": "LM Harness task",
482
+ "harness|hendrycksTest-formal_logic": "LM Harness task",
483
+ "harness|hendrycksTest-global_facts": "LM Harness task",
484
+ "harness|hendrycksTest-high_school_biology": "LM Harness task",
485
+ "harness|hendrycksTest-high_school_chemistry": "LM Harness task",
486
+ "harness|hendrycksTest-high_school_computer_science": "LM Harness task",
487
+ "harness|hendrycksTest-high_school_european_history": "LM Harness task",
488
+ "harness|hendrycksTest-high_school_geography": "LM Harness task",
489
+ "harness|hendrycksTest-high_school_government_and_politics": "LM Harness task",
490
+ "harness|hendrycksTest-high_school_macroeconomics": "LM Harness task",
491
+ "harness|hendrycksTest-high_school_mathematics": "LM Harness task",
492
+ "harness|hendrycksTest-high_school_microeconomics": "LM Harness task",
493
+ "harness|hendrycksTest-high_school_physics": "LM Harness task",
494
+ "harness|hendrycksTest-high_school_psychology": "LM Harness task",
495
+ "harness|hendrycksTest-high_school_statistics": "LM Harness task",
496
+ "harness|hendrycksTest-high_school_us_history": "LM Harness task",
497
+ "harness|hendrycksTest-high_school_world_history": "LM Harness task",
498
+ "harness|hendrycksTest-human_aging": "LM Harness task",
499
+ "harness|hendrycksTest-human_sexuality": "LM Harness task",
500
+ "harness|hendrycksTest-international_law": "LM Harness task",
501
+ "harness|hendrycksTest-jurisprudence": "LM Harness task",
502
+ "harness|hendrycksTest-logical_fallacies": "LM Harness task",
503
+ "harness|hendrycksTest-machine_learning": "LM Harness task",
504
+ "harness|hendrycksTest-management": "LM Harness task",
505
+ "harness|hendrycksTest-marketing": "LM Harness task",
506
+ "harness|hendrycksTest-medical_genetics": "LM Harness task",
507
+ "harness|hendrycksTest-miscellaneous": "LM Harness task",
508
+ "harness|hendrycksTest-moral_disputes": "LM Harness task",
509
+ "harness|hendrycksTest-moral_scenarios": "LM Harness task",
510
+ "harness|hendrycksTest-nutrition": "LM Harness task",
511
+ "harness|hendrycksTest-philosophy": "LM Harness task",
512
+ "harness|hendrycksTest-prehistory": "LM Harness task",
513
+ "harness|hendrycksTest-professional_accounting": "LM Harness task",
514
+ "harness|hendrycksTest-professional_law": "LM Harness task",
515
+ "harness|hendrycksTest-professional_medicine": "LM Harness task",
516
+ "harness|hendrycksTest-professional_psychology": "LM Harness task",
517
+ "harness|hendrycksTest-public_relations": "LM Harness task",
518
+ "harness|hendrycksTest-security_studies": "LM Harness task",
519
+ "harness|hendrycksTest-sociology": "LM Harness task",
520
+ "harness|hendrycksTest-us_foreign_policy": "LM Harness task",
521
+ "harness|hendrycksTest-virology": "LM Harness task",
522
+ "harness|hendrycksTest-world_religions": "LM Harness task",
523
+ "harness|truthfulqa:mc": "LM Harness task",
524
+ "harness|winogrande": "LM Harness task"
525
+ },
526
+ "summary_tasks": {
527
+ "harness|arc:challenge|25": {
528
+ "hashes": {
529
+ "hash_examples": "17b0cae357c0259e",
530
+ "hash_full_prompts": "045cbb916e5145c6",
531
+ "hash_input_tokens": "ca48d52265c0051f",
532
+ "hash_cont_tokens": "e8abf848493b50f7"
533
+ },
534
+ "truncated": 0,
535
+ "non_truncated": 1172,
536
+ "padded": 4687,
537
+ "non_padded": 0,
538
+ "effective_few_shots": 25.0,
539
+ "num_truncated_few_shots": 0
540
+ },
541
+ "harness|hellaswag|10": {
542
+ "hashes": {
543
+ "hash_examples": "e1768ecb99d7ecf0",
544
+ "hash_full_prompts": "0b4c16983130f84f",
545
+ "hash_input_tokens": "4975ded0ed31f702",
546
+ "hash_cont_tokens": "9fe0a5c42e1532db"
547
+ },
548
+ "truncated": 0,
549
+ "non_truncated": 10042,
550
+ "padded": 40019,
551
+ "non_padded": 149,
552
+ "effective_few_shots": 10.0,
553
+ "num_truncated_few_shots": 0
554
+ },
555
+ "harness|hendrycksTest-abstract_algebra|5": {
556
+ "hashes": {
557
+ "hash_examples": "280f9f325b40559a",
558
+ "hash_full_prompts": "2f776a367d23aea2",
559
+ "hash_input_tokens": "8ff523ec326d5d55",
560
+ "hash_cont_tokens": "50421e30bef398f9"
561
+ },
562
+ "truncated": 0,
563
+ "non_truncated": 100,
564
+ "padded": 400,
565
+ "non_padded": 0,
566
+ "effective_few_shots": 5.0,
567
+ "num_truncated_few_shots": 0
568
+ },
569
+ "harness|hendrycksTest-anatomy|5": {
570
+ "hashes": {
571
+ "hash_examples": "2f83a4f1cab4ba18",
572
+ "hash_full_prompts": "516f74bef25df620",
573
+ "hash_input_tokens": "742bd6a389a8ef40",
574
+ "hash_cont_tokens": "f11971a765cb609f"
575
+ },
576
+ "truncated": 0,
577
+ "non_truncated": 135,
578
+ "padded": 540,
579
+ "non_padded": 0,
580
+ "effective_few_shots": 5.0,
581
+ "num_truncated_few_shots": 0
582
+ },
583
+ "harness|hendrycksTest-astronomy|5": {
584
+ "hashes": {
585
+ "hash_examples": "7d587b908da4d762",
586
+ "hash_full_prompts": "faf4e80f65de93ca",
587
+ "hash_input_tokens": "aa9743839c83bd9f",
588
+ "hash_cont_tokens": "440a970fadecdc7b"
589
+ },
590
+ "truncated": 0,
591
+ "non_truncated": 152,
592
+ "padded": 608,
593
+ "non_padded": 0,
594
+ "effective_few_shots": 5.0,
595
+ "num_truncated_few_shots": 0
596
+ },
597
+ "harness|hendrycksTest-business_ethics|5": {
598
+ "hashes": {
599
+ "hash_examples": "33e51740670de686",
600
+ "hash_full_prompts": "db01c3ef8e1479d4",
601
+ "hash_input_tokens": "60f6ed52e2a2987a",
602
+ "hash_cont_tokens": "50421e30bef398f9"
603
+ },
604
+ "truncated": 0,
605
+ "non_truncated": 100,
606
+ "padded": 400,
607
+ "non_padded": 0,
608
+ "effective_few_shots": 5.0,
609
+ "num_truncated_few_shots": 0
610
+ },
611
+ "harness|hendrycksTest-clinical_knowledge|5": {
612
+ "hashes": {
613
+ "hash_examples": "f3366dbe7eefffa4",
614
+ "hash_full_prompts": "49654f71d94b65c3",
615
+ "hash_input_tokens": "6080d9f3c5930be0",
616
+ "hash_cont_tokens": "7ecd60c25b9bfe5b"
617
+ },
618
+ "truncated": 0,
619
+ "non_truncated": 265,
620
+ "padded": 1060,
621
+ "non_padded": 0,
622
+ "effective_few_shots": 5.0,
623
+ "num_truncated_few_shots": 0
624
+ },
625
+ "harness|hendrycksTest-college_biology|5": {
626
+ "hashes": {
627
+ "hash_examples": "ca2b6753a0193e7f",
628
+ "hash_full_prompts": "2b460b75f1fdfefd",
629
+ "hash_input_tokens": "873319724ad65589",
630
+ "hash_cont_tokens": "875cde3af7a0ee14"
631
+ },
632
+ "truncated": 0,
633
+ "non_truncated": 144,
634
+ "padded": 564,
635
+ "non_padded": 12,
636
+ "effective_few_shots": 5.0,
637
+ "num_truncated_few_shots": 0
638
+ },
639
+ "harness|hendrycksTest-college_chemistry|5": {
640
+ "hashes": {
641
+ "hash_examples": "22ff85f1d34f42d1",
642
+ "hash_full_prompts": "242c9be6da583e95",
643
+ "hash_input_tokens": "8366d04d12b154a7",
644
+ "hash_cont_tokens": "50421e30bef398f9"
645
+ },
646
+ "truncated": 0,
647
+ "non_truncated": 100,
648
+ "padded": 400,
649
+ "non_padded": 0,
650
+ "effective_few_shots": 5.0,
651
+ "num_truncated_few_shots": 0
652
+ },
653
+ "harness|hendrycksTest-college_computer_science|5": {
654
+ "hashes": {
655
+ "hash_examples": "30318289d717a5cf",
656
+ "hash_full_prompts": "ed2bdb4e87c4b371",
657
+ "hash_input_tokens": "1724a282fb269fd7",
658
+ "hash_cont_tokens": "50421e30bef398f9"
659
+ },
660
+ "truncated": 0,
661
+ "non_truncated": 100,
662
+ "padded": 400,
663
+ "non_padded": 0,
664
+ "effective_few_shots": 5.0,
665
+ "num_truncated_few_shots": 0
666
+ },
667
+ "harness|hendrycksTest-college_mathematics|5": {
668
+ "hashes": {
669
+ "hash_examples": "4944d1f0b6b5d911",
670
+ "hash_full_prompts": "770bc4281c973190",
671
+ "hash_input_tokens": "b7aa815781eae172",
672
+ "hash_cont_tokens": "50421e30bef398f9"
673
+ },
674
+ "truncated": 0,
675
+ "non_truncated": 100,
676
+ "padded": 400,
677
+ "non_padded": 0,
678
+ "effective_few_shots": 5.0,
679
+ "num_truncated_few_shots": 0
680
+ },
681
+ "harness|hendrycksTest-college_medicine|5": {
682
+ "hashes": {
683
+ "hash_examples": "dd69cc33381275af",
684
+ "hash_full_prompts": "ad2a53e5250ab46e",
685
+ "hash_input_tokens": "0003d13e86bc8c1a",
686
+ "hash_cont_tokens": "702fb6d82ff0d6ac"
687
+ },
688
+ "truncated": 0,
689
+ "non_truncated": 173,
690
+ "padded": 692,
691
+ "non_padded": 0,
692
+ "effective_few_shots": 5.0,
693
+ "num_truncated_few_shots": 0
694
+ },
695
+ "harness|hendrycksTest-college_physics|5": {
696
+ "hashes": {
697
+ "hash_examples": "875dd26d22655b0d",
698
+ "hash_full_prompts": "833a0d7b55aed500",
699
+ "hash_input_tokens": "32b28762dd077c78",
700
+ "hash_cont_tokens": "f7b8097afc16a47c"
701
+ },
702
+ "truncated": 0,
703
+ "non_truncated": 102,
704
+ "padded": 404,
705
+ "non_padded": 4,
706
+ "effective_few_shots": 5.0,
707
+ "num_truncated_few_shots": 0
708
+ },
709
+ "harness|hendrycksTest-computer_security|5": {
710
+ "hashes": {
711
+ "hash_examples": "006451eedc0ededb",
712
+ "hash_full_prompts": "94034c97e85d8f46",
713
+ "hash_input_tokens": "19dd0e1895125d49",
714
+ "hash_cont_tokens": "50421e30bef398f9"
715
+ },
716
+ "truncated": 0,
717
+ "non_truncated": 100,
718
+ "padded": 400,
719
+ "non_padded": 0,
720
+ "effective_few_shots": 5.0,
721
+ "num_truncated_few_shots": 0
722
+ },
723
+ "harness|hendrycksTest-conceptual_physics|5": {
724
+ "hashes": {
725
+ "hash_examples": "8874ece872d2ca4c",
726
+ "hash_full_prompts": "e40d15a34640d6fa",
727
+ "hash_input_tokens": "761c7ce187b3338a",
728
+ "hash_cont_tokens": "aa0e8bc655f2f641"
729
+ },
730
+ "truncated": 0,
731
+ "non_truncated": 235,
732
+ "padded": 940,
733
+ "non_padded": 0,
734
+ "effective_few_shots": 5.0,
735
+ "num_truncated_few_shots": 0
736
+ },
737
+ "harness|hendrycksTest-econometrics|5": {
738
+ "hashes": {
739
+ "hash_examples": "64d3623b0bfaa43f",
740
+ "hash_full_prompts": "612f340fae41338d",
741
+ "hash_input_tokens": "dae74024ebc12b2b",
742
+ "hash_cont_tokens": "b1cc6e7e9fcd3827"
743
+ },
744
+ "truncated": 0,
745
+ "non_truncated": 114,
746
+ "padded": 456,
747
+ "non_padded": 0,
748
+ "effective_few_shots": 5.0,
749
+ "num_truncated_few_shots": 0
750
+ },
751
+ "harness|hendrycksTest-electrical_engineering|5": {
752
+ "hashes": {
753
+ "hash_examples": "e98f51780c674d7e",
754
+ "hash_full_prompts": "10275b312d812ae6",
755
+ "hash_input_tokens": "5fa8050688a246ed",
756
+ "hash_cont_tokens": "2425a3f084a591ef"
757
+ },
758
+ "truncated": 0,
759
+ "non_truncated": 145,
760
+ "padded": 580,
761
+ "non_padded": 0,
762
+ "effective_few_shots": 5.0,
763
+ "num_truncated_few_shots": 0
764
+ },
765
+ "harness|hendrycksTest-elementary_mathematics|5": {
766
+ "hashes": {
767
+ "hash_examples": "fc48208a5ac1c0ce",
768
+ "hash_full_prompts": "5ec274c6c82aca23",
769
+ "hash_input_tokens": "2da3f8d7d1515cc6",
770
+ "hash_cont_tokens": "bd87bf0c060fd925"
771
+ },
772
+ "truncated": 0,
773
+ "non_truncated": 378,
774
+ "padded": 1512,
775
+ "non_padded": 0,
776
+ "effective_few_shots": 5.0,
777
+ "num_truncated_few_shots": 0
778
+ },
779
+ "harness|hendrycksTest-formal_logic|5": {
780
+ "hashes": {
781
+ "hash_examples": "5a6525665f63ea72",
782
+ "hash_full_prompts": "07b92638c4a6b500",
783
+ "hash_input_tokens": "907de61bbe46dada",
784
+ "hash_cont_tokens": "eb8932890e0605db"
785
+ },
786
+ "truncated": 0,
787
+ "non_truncated": 126,
788
+ "padded": 504,
789
+ "non_padded": 0,
790
+ "effective_few_shots": 5.0,
791
+ "num_truncated_few_shots": 0
792
+ },
793
+ "harness|hendrycksTest-global_facts|5": {
794
+ "hashes": {
795
+ "hash_examples": "371d70d743b2b89b",
796
+ "hash_full_prompts": "332fdee50a1921b4",
797
+ "hash_input_tokens": "d7549fe9ac133643",
798
+ "hash_cont_tokens": "50421e30bef398f9"
799
+ },
800
+ "truncated": 0,
801
+ "non_truncated": 100,
802
+ "padded": 400,
803
+ "non_padded": 0,
804
+ "effective_few_shots": 5.0,
805
+ "num_truncated_few_shots": 0
806
+ },
807
+ "harness|hendrycksTest-high_school_biology|5": {
808
+ "hashes": {
809
+ "hash_examples": "a79e1018b1674052",
810
+ "hash_full_prompts": "e624e26ede922561",
811
+ "hash_input_tokens": "b449ae8cd622fb96",
812
+ "hash_cont_tokens": "1ddcb86d28cde266"
813
+ },
814
+ "truncated": 0,
815
+ "non_truncated": 310,
816
+ "padded": 1240,
817
+ "non_padded": 0,
818
+ "effective_few_shots": 5.0,
819
+ "num_truncated_few_shots": 0
820
+ },
821
+ "harness|hendrycksTest-high_school_chemistry|5": {
822
+ "hashes": {
823
+ "hash_examples": "44bfc25c389f0e03",
824
+ "hash_full_prompts": "0e3e5f5d9246482a",
825
+ "hash_input_tokens": "a447bd1574b5e26c",
826
+ "hash_cont_tokens": "176c8dcff38c5f8f"
827
+ },
828
+ "truncated": 0,
829
+ "non_truncated": 203,
830
+ "padded": 812,
831
+ "non_padded": 0,
832
+ "effective_few_shots": 5.0,
833
+ "num_truncated_few_shots": 0
834
+ },
835
+ "harness|hendrycksTest-high_school_computer_science|5": {
836
+ "hashes": {
837
+ "hash_examples": "8b8cdb1084f24169",
838
+ "hash_full_prompts": "c00487e67c1813cc",
839
+ "hash_input_tokens": "56312a0c3d85ae90",
840
+ "hash_cont_tokens": "50421e30bef398f9"
841
+ },
842
+ "truncated": 0,
843
+ "non_truncated": 100,
844
+ "padded": 400,
845
+ "non_padded": 0,
846
+ "effective_few_shots": 5.0,
847
+ "num_truncated_few_shots": 0
848
+ },
849
+ "harness|hendrycksTest-high_school_european_history|5": {
850
+ "hashes": {
851
+ "hash_examples": "11cd32d0ef440171",
852
+ "hash_full_prompts": "318f4513c537c6bf",
853
+ "hash_input_tokens": "5002f4ac8b1562ca",
854
+ "hash_cont_tokens": "674fc454bdc5ac93"
855
+ },
856
+ "truncated": 0,
857
+ "non_truncated": 165,
858
+ "padded": 656,
859
+ "non_padded": 4,
860
+ "effective_few_shots": 5.0,
861
+ "num_truncated_few_shots": 0
862
+ },
863
+ "harness|hendrycksTest-high_school_geography|5": {
864
+ "hashes": {
865
+ "hash_examples": "b60019b9e80b642f",
866
+ "hash_full_prompts": "ee5789fcc1a81b1e",
867
+ "hash_input_tokens": "b4f9efd054b0149d",
868
+ "hash_cont_tokens": "03a5012b916274ea"
869
+ },
870
+ "truncated": 0,
871
+ "non_truncated": 198,
872
+ "padded": 792,
873
+ "non_padded": 0,
874
+ "effective_few_shots": 5.0,
875
+ "num_truncated_few_shots": 0
876
+ },
877
+ "harness|hendrycksTest-high_school_government_and_politics|5": {
878
+ "hashes": {
879
+ "hash_examples": "d221ec983d143dc3",
880
+ "hash_full_prompts": "ac42d888e1ce1155",
881
+ "hash_input_tokens": "6e010d01707b5a01",
882
+ "hash_cont_tokens": "873d2aab226ba1d8"
883
+ },
884
+ "truncated": 0,
885
+ "non_truncated": 193,
886
+ "padded": 772,
887
+ "non_padded": 0,
888
+ "effective_few_shots": 5.0,
889
+ "num_truncated_few_shots": 0
890
+ },
891
+ "harness|hendrycksTest-high_school_macroeconomics|5": {
892
+ "hashes": {
893
+ "hash_examples": "59c2915cacfd3fbb",
894
+ "hash_full_prompts": "c6bd9d25158abd0e",
895
+ "hash_input_tokens": "fc1f6e824ba386d7",
896
+ "hash_cont_tokens": "c583432ad27fcfe0"
897
+ },
898
+ "truncated": 0,
899
+ "non_truncated": 390,
900
+ "padded": 1560,
901
+ "non_padded": 0,
902
+ "effective_few_shots": 5.0,
903
+ "num_truncated_few_shots": 0
904
+ },
905
+ "harness|hendrycksTest-high_school_mathematics|5": {
906
+ "hashes": {
907
+ "hash_examples": "1f8ac897608de342",
908
+ "hash_full_prompts": "5d88f41fc2d643a8",
909
+ "hash_input_tokens": "3a485a40c8432ece",
910
+ "hash_cont_tokens": "d7907b61bcb8c123"
911
+ },
912
+ "truncated": 0,
913
+ "non_truncated": 270,
914
+ "padded": 1080,
915
+ "non_padded": 0,
916
+ "effective_few_shots": 5.0,
917
+ "num_truncated_few_shots": 0
918
+ },
919
+ "harness|hendrycksTest-high_school_microeconomics|5": {
920
+ "hashes": {
921
+ "hash_examples": "ead6a0f2f6c83370",
922
+ "hash_full_prompts": "bfc393381298609e",
923
+ "hash_input_tokens": "a7dd9ca4bbda3752",
924
+ "hash_cont_tokens": "f47f041de50333b9"
925
+ },
926
+ "truncated": 0,
927
+ "non_truncated": 238,
928
+ "padded": 952,
929
+ "non_padded": 0,
930
+ "effective_few_shots": 5.0,
931
+ "num_truncated_few_shots": 0
932
+ },
933
+ "harness|hendrycksTest-high_school_physics|5": {
934
+ "hashes": {
935
+ "hash_examples": "c3f2025990afec64",
936
+ "hash_full_prompts": "fc78b4997e436734",
937
+ "hash_input_tokens": "d7ea631399a73865",
938
+ "hash_cont_tokens": "0d56317b3e5eedb5"
939
+ },
940
+ "truncated": 0,
941
+ "non_truncated": 151,
942
+ "padded": 604,
943
+ "non_padded": 0,
944
+ "effective_few_shots": 5.0,
945
+ "num_truncated_few_shots": 0
946
+ },
947
+ "harness|hendrycksTest-high_school_psychology|5": {
948
+ "hashes": {
949
+ "hash_examples": "21f8aab618f6d636",
950
+ "hash_full_prompts": "d5c76aa40b9dbc43",
951
+ "hash_input_tokens": "d12816cf88146011",
952
+ "hash_cont_tokens": "09ba1243e7390c0f"
953
+ },
954
+ "truncated": 0,
955
+ "non_truncated": 545,
956
+ "padded": 2180,
957
+ "non_padded": 0,
958
+ "effective_few_shots": 5.0,
959
+ "num_truncated_few_shots": 0
960
+ },
961
+ "harness|hendrycksTest-high_school_statistics|5": {
962
+ "hashes": {
963
+ "hash_examples": "2386a60a11fc5de3",
964
+ "hash_full_prompts": "4c5c8be5aafac432",
965
+ "hash_input_tokens": "9763ecaef4814c21",
966
+ "hash_cont_tokens": "9cc29889c3d3f77d"
967
+ },
968
+ "truncated": 0,
969
+ "non_truncated": 216,
970
+ "padded": 864,
971
+ "non_padded": 0,
972
+ "effective_few_shots": 5.0,
973
+ "num_truncated_few_shots": 0
974
+ },
975
+ "harness|hendrycksTest-high_school_us_history|5": {
976
+ "hashes": {
977
+ "hash_examples": "74961543be40f04f",
978
+ "hash_full_prompts": "5d5ca4840131ba21",
979
+ "hash_input_tokens": "c639cce12a46ebad",
980
+ "hash_cont_tokens": "cdd0b3dc06d933e5"
981
+ },
982
+ "truncated": 0,
983
+ "non_truncated": 204,
984
+ "padded": 816,
985
+ "non_padded": 0,
986
+ "effective_few_shots": 5.0,
987
+ "num_truncated_few_shots": 0
988
+ },
989
+ "harness|hendrycksTest-high_school_world_history|5": {
990
+ "hashes": {
991
+ "hash_examples": "2ad2f6b7198b2234",
992
+ "hash_full_prompts": "11845057459afd72",
993
+ "hash_input_tokens": "b9762065cce6f3a6",
994
+ "hash_cont_tokens": "e02816433ff28daf"
995
+ },
996
+ "truncated": 0,
997
+ "non_truncated": 237,
998
+ "padded": 948,
999
+ "non_padded": 0,
1000
+ "effective_few_shots": 5.0,
1001
+ "num_truncated_few_shots": 0
1002
+ },
1003
+ "harness|hendrycksTest-human_aging|5": {
1004
+ "hashes": {
1005
+ "hash_examples": "1a7199dc733e779b",
1006
+ "hash_full_prompts": "756b9096b8eaf892",
1007
+ "hash_input_tokens": "84157fee0b6d0f3c",
1008
+ "hash_cont_tokens": "142a4a8a1138a214"
1009
+ },
1010
+ "truncated": 0,
1011
+ "non_truncated": 223,
1012
+ "padded": 892,
1013
+ "non_padded": 0,
1014
+ "effective_few_shots": 5.0,
1015
+ "num_truncated_few_shots": 0
1016
+ },
1017
+ "harness|hendrycksTest-human_sexuality|5": {
1018
+ "hashes": {
1019
+ "hash_examples": "7acb8fdad97f88a6",
1020
+ "hash_full_prompts": "731a52ff15b8cfdb",
1021
+ "hash_input_tokens": "ade303e1ae3c016f",
1022
+ "hash_cont_tokens": "bc54813e809b796d"
1023
+ },
1024
+ "truncated": 0,
1025
+ "non_truncated": 131,
1026
+ "padded": 524,
1027
+ "non_padded": 0,
1028
+ "effective_few_shots": 5.0,
1029
+ "num_truncated_few_shots": 0
1030
+ },
1031
+ "harness|hendrycksTest-international_law|5": {
1032
+ "hashes": {
1033
+ "hash_examples": "1300bfd0dfc59114",
1034
+ "hash_full_prompts": "db2aefbff5eec996",
1035
+ "hash_input_tokens": "e5482e1c23c23d35",
1036
+ "hash_cont_tokens": "8ea8c5ff76a15bca"
1037
+ },
1038
+ "truncated": 0,
1039
+ "non_truncated": 121,
1040
+ "padded": 484,
1041
+ "non_padded": 0,
1042
+ "effective_few_shots": 5.0,
1043
+ "num_truncated_few_shots": 0
1044
+ },
1045
+ "harness|hendrycksTest-jurisprudence|5": {
1046
+ "hashes": {
1047
+ "hash_examples": "083b1e4904c48dc2",
1048
+ "hash_full_prompts": "0f89ee3fe03d6a21",
1049
+ "hash_input_tokens": "4415eeb9bad0507b",
1050
+ "hash_cont_tokens": "e3a8cd951b6e3469"
1051
+ },
1052
+ "truncated": 0,
1053
+ "non_truncated": 108,
1054
+ "padded": 432,
1055
+ "non_padded": 0,
1056
+ "effective_few_shots": 5.0,
1057
+ "num_truncated_few_shots": 0
1058
+ },
1059
+ "harness|hendrycksTest-logical_fallacies|5": {
1060
+ "hashes": {
1061
+ "hash_examples": "709128f9926a634c",
1062
+ "hash_full_prompts": "98a04b1f8f841069",
1063
+ "hash_input_tokens": "e6b5271422ecbaa8",
1064
+ "hash_cont_tokens": "3e9e0bdc248fd88a"
1065
+ },
1066
+ "truncated": 0,
1067
+ "non_truncated": 163,
1068
+ "padded": 644,
1069
+ "non_padded": 8,
1070
+ "effective_few_shots": 5.0,
1071
+ "num_truncated_few_shots": 0
1072
+ },
1073
+ "harness|hendrycksTest-machine_learning|5": {
1074
+ "hashes": {
1075
+ "hash_examples": "88f22a636029ae47",
1076
+ "hash_full_prompts": "2e1c8d4b1e0cc921",
1077
+ "hash_input_tokens": "e719cb83196977d8",
1078
+ "hash_cont_tokens": "55b12fb138c6a064"
1079
+ },
1080
+ "truncated": 0,
1081
+ "non_truncated": 112,
1082
+ "padded": 448,
1083
+ "non_padded": 0,
1084
+ "effective_few_shots": 5.0,
1085
+ "num_truncated_few_shots": 0
1086
+ },
1087
+ "harness|hendrycksTest-management|5": {
1088
+ "hashes": {
1089
+ "hash_examples": "8c8a1e07a2151dca",
1090
+ "hash_full_prompts": "f51611f514b265b0",
1091
+ "hash_input_tokens": "155da0e62b39e804",
1092
+ "hash_cont_tokens": "a01d6d39a83c4597"
1093
+ },
1094
+ "truncated": 0,
1095
+ "non_truncated": 103,
1096
+ "padded": 412,
1097
+ "non_padded": 0,
1098
+ "effective_few_shots": 5.0,
1099
+ "num_truncated_few_shots": 0
1100
+ },
1101
+ "harness|hendrycksTest-marketing|5": {
1102
+ "hashes": {
1103
+ "hash_examples": "2668953431f91e96",
1104
+ "hash_full_prompts": "77562bef997c7650",
1105
+ "hash_input_tokens": "38466c242259e6d3",
1106
+ "hash_cont_tokens": "6aeaed4d823c98aa"
1107
+ },
1108
+ "truncated": 0,
1109
+ "non_truncated": 234,
1110
+ "padded": 932,
1111
+ "non_padded": 4,
1112
+ "effective_few_shots": 5.0,
1113
+ "num_truncated_few_shots": 0
1114
+ },
1115
+ "harness|hendrycksTest-medical_genetics|5": {
1116
+ "hashes": {
1117
+ "hash_examples": "9c2dda34a2ea4fd2",
1118
+ "hash_full_prompts": "202139046daa118f",
1119
+ "hash_input_tokens": "0dd129e92538a7f6",
1120
+ "hash_cont_tokens": "50421e30bef398f9"
1121
+ },
1122
+ "truncated": 0,
1123
+ "non_truncated": 100,
1124
+ "padded": 400,
1125
+ "non_padded": 0,
1126
+ "effective_few_shots": 5.0,
1127
+ "num_truncated_few_shots": 0
1128
+ },
1129
+ "harness|hendrycksTest-miscellaneous|5": {
1130
+ "hashes": {
1131
+ "hash_examples": "41adb694024809c2",
1132
+ "hash_full_prompts": "bffec9fc237bcf93",
1133
+ "hash_input_tokens": "d108a883fc3e022f",
1134
+ "hash_cont_tokens": "9b0ab02a64603081"
1135
+ },
1136
+ "truncated": 0,
1137
+ "non_truncated": 783,
1138
+ "padded": 3132,
1139
+ "non_padded": 0,
1140
+ "effective_few_shots": 5.0,
1141
+ "num_truncated_few_shots": 0
1142
+ },
1143
+ "harness|hendrycksTest-moral_disputes|5": {
1144
+ "hashes": {
1145
+ "hash_examples": "3171c13ba3c594c4",
1146
+ "hash_full_prompts": "170831fc36f1d59e",
1147
+ "hash_input_tokens": "0e7b7df82884a2d5",
1148
+ "hash_cont_tokens": "3b8bbe9108e55ce9"
1149
+ },
1150
+ "truncated": 0,
1151
+ "non_truncated": 346,
1152
+ "padded": 1364,
1153
+ "non_padded": 20,
1154
+ "effective_few_shots": 5.0,
1155
+ "num_truncated_few_shots": 0
1156
+ },
1157
+ "harness|hendrycksTest-moral_scenarios|5": {
1158
+ "hashes": {
1159
+ "hash_examples": "9873e077e83e0546",
1160
+ "hash_full_prompts": "08f4ceba3131a068",
1161
+ "hash_input_tokens": "7c220f5613cd8426",
1162
+ "hash_cont_tokens": "3e9bfc0362e97330"
1163
+ },
1164
+ "truncated": 0,
1165
+ "non_truncated": 895,
1166
+ "padded": 3580,
1167
+ "non_padded": 0,
1168
+ "effective_few_shots": 5.0,
1169
+ "num_truncated_few_shots": 0
1170
+ },
1171
+ "harness|hendrycksTest-nutrition|5": {
1172
+ "hashes": {
1173
+ "hash_examples": "7db1d8142ec14323",
1174
+ "hash_full_prompts": "4c0e68e3586cb453",
1175
+ "hash_input_tokens": "35de1609a9a763a9",
1176
+ "hash_cont_tokens": "23b2dc6ee2da4cfc"
1177
+ },
1178
+ "truncated": 0,
1179
+ "non_truncated": 306,
1180
+ "padded": 1224,
1181
+ "non_padded": 0,
1182
+ "effective_few_shots": 5.0,
1183
+ "num_truncated_few_shots": 0
1184
+ },
1185
+ "harness|hendrycksTest-philosophy|5": {
1186
+ "hashes": {
1187
+ "hash_examples": "9b455b7d72811cc8",
1188
+ "hash_full_prompts": "e467f822d8a0d3ff",
1189
+ "hash_input_tokens": "a1dcfa9c80490d06",
1190
+ "hash_cont_tokens": "9f6ff69d23a48783"
1191
+ },
1192
+ "truncated": 0,
1193
+ "non_truncated": 311,
1194
+ "padded": 1244,
1195
+ "non_padded": 0,
1196
+ "effective_few_shots": 5.0,
1197
+ "num_truncated_few_shots": 0
1198
+ },
1199
+ "harness|hendrycksTest-prehistory|5": {
1200
+ "hashes": {
1201
+ "hash_examples": "8be90d0f538f1560",
1202
+ "hash_full_prompts": "152187949bcd0921",
1203
+ "hash_input_tokens": "a091cf645d2415e0",
1204
+ "hash_cont_tokens": "d6458d743d875837"
1205
+ },
1206
+ "truncated": 0,
1207
+ "non_truncated": 324,
1208
+ "padded": 1296,
1209
+ "non_padded": 0,
1210
+ "effective_few_shots": 5.0,
1211
+ "num_truncated_few_shots": 0
1212
+ },
1213
+ "harness|hendrycksTest-professional_accounting|5": {
1214
+ "hashes": {
1215
+ "hash_examples": "8d377597916cd07e",
1216
+ "hash_full_prompts": "0eb7345d6144ee0d",
1217
+ "hash_input_tokens": "e9df32a33f85290c",
1218
+ "hash_cont_tokens": "922a195f53a35662"
1219
+ },
1220
+ "truncated": 0,
1221
+ "non_truncated": 282,
1222
+ "padded": 1128,
1223
+ "non_padded": 0,
1224
+ "effective_few_shots": 5.0,
1225
+ "num_truncated_few_shots": 0
1226
+ },
1227
+ "harness|hendrycksTest-professional_law|5": {
1228
+ "hashes": {
1229
+ "hash_examples": "cd9dbc52b3c932d6",
1230
+ "hash_full_prompts": "36ac764272bfb182",
1231
+ "hash_input_tokens": "c9f7583fff66d361",
1232
+ "hash_cont_tokens": "2e590029ef41fbcd"
1233
+ },
1234
+ "truncated": 0,
1235
+ "non_truncated": 1534,
1236
+ "padded": 6136,
1237
+ "non_padded": 0,
1238
+ "effective_few_shots": 5.0,
1239
+ "num_truncated_few_shots": 0
1240
+ },
1241
+ "harness|hendrycksTest-professional_medicine|5": {
1242
+ "hashes": {
1243
+ "hash_examples": "b20e4e816c1e383e",
1244
+ "hash_full_prompts": "7b8d69ea2acaf2f7",
1245
+ "hash_input_tokens": "40a933f829116f8d",
1246
+ "hash_cont_tokens": "7cfee54dbddd5a98"
1247
+ },
1248
+ "truncated": 0,
1249
+ "non_truncated": 272,
1250
+ "padded": 1088,
1251
+ "non_padded": 0,
1252
+ "effective_few_shots": 5.0,
1253
+ "num_truncated_few_shots": 0
1254
+ },
1255
+ "harness|hendrycksTest-professional_psychology|5": {
1256
+ "hashes": {
1257
+ "hash_examples": "d45b73b22f9cc039",
1258
+ "hash_full_prompts": "fe8937e9ffc99771",
1259
+ "hash_input_tokens": "0f6a92c3a2062b48",
1260
+ "hash_cont_tokens": "a86677b2a45c20e1"
1261
+ },
1262
+ "truncated": 0,
1263
+ "non_truncated": 612,
1264
+ "padded": 2448,
1265
+ "non_padded": 0,
1266
+ "effective_few_shots": 5.0,
1267
+ "num_truncated_few_shots": 0
1268
+ },
1269
+ "harness|hendrycksTest-public_relations|5": {
1270
+ "hashes": {
1271
+ "hash_examples": "0d25072e1761652a",
1272
+ "hash_full_prompts": "f9adc39cfa9f42ba",
1273
+ "hash_input_tokens": "29a08e9bfbe9b2f0",
1274
+ "hash_cont_tokens": "0d756ccaae031757"
1275
+ },
1276
+ "truncated": 0,
1277
+ "non_truncated": 110,
1278
+ "padded": 440,
1279
+ "non_padded": 0,
1280
+ "effective_few_shots": 5.0,
1281
+ "num_truncated_few_shots": 0
1282
+ },
1283
+ "harness|hendrycksTest-security_studies|5": {
1284
+ "hashes": {
1285
+ "hash_examples": "62bb8197e63d60d4",
1286
+ "hash_full_prompts": "869c9c3ae196b7c3",
1287
+ "hash_input_tokens": "32a03f1f22a6e103",
1288
+ "hash_cont_tokens": "b2229bc2cfbf594b"
1289
+ },
1290
+ "truncated": 0,
1291
+ "non_truncated": 245,
1292
+ "padded": 980,
1293
+ "non_padded": 0,
1294
+ "effective_few_shots": 5.0,
1295
+ "num_truncated_few_shots": 0
1296
+ },
1297
+ "harness|hendrycksTest-sociology|5": {
1298
+ "hashes": {
1299
+ "hash_examples": "e7959df87dea8672",
1300
+ "hash_full_prompts": "1a1fc00e17b3a52a",
1301
+ "hash_input_tokens": "1de5c52d2b2831d7",
1302
+ "hash_cont_tokens": "c3a3bdfd177eed5b"
1303
+ },
1304
+ "truncated": 0,
1305
+ "non_truncated": 201,
1306
+ "padded": 800,
1307
+ "non_padded": 4,
1308
+ "effective_few_shots": 5.0,
1309
+ "num_truncated_few_shots": 0
1310
+ },
1311
+ "harness|hendrycksTest-us_foreign_policy|5": {
1312
+ "hashes": {
1313
+ "hash_examples": "4a56a01ddca44dca",
1314
+ "hash_full_prompts": "0c7a7081c71c07b6",
1315
+ "hash_input_tokens": "add924961f7f4146",
1316
+ "hash_cont_tokens": "50421e30bef398f9"
1317
+ },
1318
+ "truncated": 0,
1319
+ "non_truncated": 100,
1320
+ "padded": 400,
1321
+ "non_padded": 0,
1322
+ "effective_few_shots": 5.0,
1323
+ "num_truncated_few_shots": 0
1324
+ },
1325
+ "harness|hendrycksTest-virology|5": {
1326
+ "hashes": {
1327
+ "hash_examples": "451cc86a8c4f4fe9",
1328
+ "hash_full_prompts": "01e95325d8b738e4",
1329
+ "hash_input_tokens": "e0653601c466b1bc",
1330
+ "hash_cont_tokens": "af8b3658088cb37f"
1331
+ },
1332
+ "truncated": 0,
1333
+ "non_truncated": 166,
1334
+ "padded": 664,
1335
+ "non_padded": 0,
1336
+ "effective_few_shots": 5.0,
1337
+ "num_truncated_few_shots": 0
1338
+ },
1339
+ "harness|hendrycksTest-world_religions|5": {
1340
+ "hashes": {
1341
+ "hash_examples": "3b29cfaf1a81c379",
1342
+ "hash_full_prompts": "e0d79a15083dfdff",
1343
+ "hash_input_tokens": "ac600d612445156d",
1344
+ "hash_cont_tokens": "060118bef6de4e0a"
1345
+ },
1346
+ "truncated": 0,
1347
+ "non_truncated": 171,
1348
+ "padded": 684,
1349
+ "non_padded": 0,
1350
+ "effective_few_shots": 5.0,
1351
+ "num_truncated_few_shots": 0
1352
+ },
1353
+ "harness|truthfulqa:mc|0": {
1354
+ "hashes": {
1355
+ "hash_examples": "23176c0531c7b867",
1356
+ "hash_full_prompts": "36a6d90e75d92d4a",
1357
+ "hash_input_tokens": "a03ce28b7fd06aa7",
1358
+ "hash_cont_tokens": "f5da56a132aab151"
1359
+ },
1360
+ "truncated": 0,
1361
+ "non_truncated": 817,
1362
+ "padded": 9996,
1363
+ "non_padded": 0,
1364
+ "effective_few_shots": 0.0,
1365
+ "num_truncated_few_shots": 0
1366
+ },
1367
+ "harness|winogrande|5": {
1368
+ "hashes": {
1369
+ "hash_examples": "aada0a176fd81218",
1370
+ "hash_full_prompts": "c8655cbd12de8409",
1371
+ "hash_input_tokens": "72067255e368e24e",
1372
+ "hash_cont_tokens": "f08975ad6f2d5864"
1373
+ },
1374
+ "truncated": 0,
1375
+ "non_truncated": 1267,
1376
+ "padded": 2534,
1377
+ "non_padded": 0,
1378
+ "effective_few_shots": 5.0,
1379
+ "num_truncated_few_shots": 0
1380
+ },
1381
+ "harness|gsm8k|5": {
1382
+ "hashes": {
1383
+ "hash_examples": "4c0843a5d99bcfdc",
1384
+ "hash_full_prompts": "41d55e83abc0e02d",
1385
+ "hash_input_tokens": "bda342e47b5099b2",
1386
+ "hash_cont_tokens": "e4101d08d98273ca"
1387
+ },
1388
+ "truncated": 0,
1389
+ "non_truncated": 1319,
1390
+ "padded": 0,
1391
+ "non_padded": 1319,
1392
+ "effective_few_shots": 5.0,
1393
+ "num_truncated_few_shots": 0
1394
+ }
1395
+ },
1396
+ "summary_general": {
1397
+ "hashes": {
1398
+ "hash_examples": "3b7fa57a057f9415",
1399
+ "hash_full_prompts": "63615fc50fc9417c",
1400
+ "hash_input_tokens": "a8fa53915153e1db",
1401
+ "hash_cont_tokens": "c3c012687e8b60d2"
1402
+ },
1403
+ "truncated": 0,
1404
+ "non_truncated": 28659,
1405
+ "padded": 113348,
1406
+ "non_padded": 1524,
1407
+ "num_truncated_few_shots": 0
1408
+ }
1409
+ }
results_2024-05-27T06-04-08.229332.parquet ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:0daf00c9bab17dedff430aa060f6a247564190f4471e3222f4065e643e7d516c
3
+ size 708892