shubhrapandit commited on
Commit
5108448
1 Parent(s): e88c378

Update model files and evals

Browse files
arc_challenge.json CHANGED
@@ -1,20 +1,20 @@
1
  {
2
  "results": {
3
  "arc_challenge": {
4
- "acc": 0.454778156996587,
5
- "acc_stderr": 0.014551507060836353,
6
- "acc_norm": 0.4735494880546075,
7
- "acc_norm_stderr": 0.014590931358120172
8
  }
9
  },
10
  "versions": {
11
  "arc_challenge": 0
12
  },
13
  "config": {
14
- "model": "sparseml",
15
- "model_args": "pretrained=/cerebras/experiments/spft-retrained_sparse70_llama2_DATAUpdated_KDFalse_GCTrue_LR1e-4_E6-/combined/,trust_remote_code=True",
16
  "num_fewshot": 25,
17
- "batch_size": "16",
18
  "batch_sizes": [],
19
  "device": "cuda:0",
20
  "no_cache": true,
 
1
  {
2
  "results": {
3
  "arc_challenge": {
4
+ "acc": 0.4803754266211604,
5
+ "acc_stderr": 0.014600132075947094,
6
+ "acc_norm": 0.5025597269624573,
7
+ "acc_norm_stderr": 0.014611199329843784
8
  }
9
  },
10
  "versions": {
11
  "arc_challenge": 0
12
  },
13
  "config": {
14
+ "model": "hf",
15
+ "model_args": "pretrained=/nm/drive1/shubhra/cerebras/experiments/spft-cerebras_llama2_sparse50_45B_platypus_dolphin_KDFalse_GCTrue_LR1e-4_E2/combined/,trust_remote_code=True,dtype=bfloat16",
16
  "num_fewshot": 25,
17
+ "batch_size": "32",
18
  "batch_sizes": [],
19
  "device": "cuda:0",
20
  "no_cache": true,
config.json CHANGED
@@ -1,5 +1,5 @@
1
  {
2
- "_name_or_path": "neuralmagic/Llama-2-7b-pruned70-retrained-instruct",
3
  "architectures": [
4
  "LlamaForCausalLM"
5
  ],
@@ -22,8 +22,8 @@
22
  "rope_theta": 10000.0,
23
  "tie_word_embeddings": false,
24
  "tokenizer_class": "LlamaTokenizerFast",
25
- "torch_dtype": "bfloat16",
26
- "transformers_version": "4.40.0",
27
  "use_cache": true,
28
  "vocab_size": 32000
29
  }
 
1
  {
2
+ "_name_or_path": "/nm/drive1/shubhra/cerebras/llama2_7B_sparse50_45B_retrained/",
3
  "architectures": [
4
  "LlamaForCausalLM"
5
  ],
 
22
  "rope_theta": 10000.0,
23
  "tie_word_embeddings": false,
24
  "tokenizer_class": "LlamaTokenizerFast",
25
+ "torch_dtype": "float32",
26
+ "transformers_version": "4.39.3",
27
  "use_cache": true,
28
  "vocab_size": 32000
29
  }
generation_config.json CHANGED
@@ -2,5 +2,5 @@
2
  "_from_model_config": true,
3
  "bos_token_id": 1,
4
  "eos_token_id": 2,
5
- "transformers_version": "4.40.0"
6
  }
 
2
  "_from_model_config": true,
3
  "bos_token_id": 1,
4
  "eos_token_id": 2,
5
+ "transformers_version": "4.39.3"
6
  }
gsm8k.json CHANGED
@@ -1,18 +1,18 @@
1
  {
2
  "results": {
3
  "gsm8k": {
4
- "acc": 0.1425322213798332,
5
- "acc_stderr": 0.009629588445673814
6
  }
7
  },
8
  "versions": {
9
  "gsm8k": 0
10
  },
11
  "config": {
12
- "model": "sparseml",
13
- "model_args": "pretrained=/cerebras/experiments/spft-retrained_sparse70_llama2_DATAUpdated_KDFalse_GCTrue_LR1e-4_E6-/combined/,trust_remote_code=True",
14
  "num_fewshot": 5,
15
- "batch_size": "8",
16
  "batch_sizes": [],
17
  "device": "cuda:0",
18
  "no_cache": true,
 
1
  {
2
  "results": {
3
  "gsm8k": {
4
+ "acc": 0.18119787717968158,
5
+ "acc_stderr": 0.010609827611527357
6
  }
7
  },
8
  "versions": {
9
  "gsm8k": 0
10
  },
11
  "config": {
12
+ "model": "hf",
13
+ "model_args": "pretrained=/nm/drive1/shubhra/cerebras/experiments/spft-cerebras_llama2_sparse50_45B_platypus_dolphin_KDFalse_GCTrue_LR1e-4_E2/combined/,trust_remote_code=True,dtype=bfloat16",
14
  "num_fewshot": 5,
15
+ "batch_size": "128",
16
  "batch_sizes": [],
17
  "device": "cuda:0",
18
  "no_cache": true,
hellaswag.json CHANGED
@@ -1,20 +1,20 @@
1
  {
2
  "results": {
3
  "hellaswag": {
4
- "acc": 0.5796654052977495,
5
- "acc_stderr": 0.004926038197714527,
6
- "acc_norm": 0.7720573590918144,
7
- "acc_norm_stderr": 0.004186480645315563
8
  }
9
  },
10
  "versions": {
11
  "hellaswag": 0
12
  },
13
  "config": {
14
- "model": "sparseml",
15
- "model_args": "pretrained=/cerebras/experiments/spft-retrained_sparse70_llama2_DATAUpdated_KDFalse_GCTrue_LR1e-4_E6-/combined/,trust_remote_code=True",
16
  "num_fewshot": 10,
17
- "batch_size": "16",
18
  "batch_sizes": [],
19
  "device": "cuda:0",
20
  "no_cache": true,
 
1
  {
2
  "results": {
3
  "hellaswag": {
4
+ "acc": 0.5884285998805019,
5
+ "acc_stderr": 0.004911125101064641,
6
+ "acc_norm": 0.784106751643099,
7
+ "acc_norm_stderr": 0.004105997149954853
8
  }
9
  },
10
  "versions": {
11
  "hellaswag": 0
12
  },
13
  "config": {
14
+ "model": "hf",
15
+ "model_args": "pretrained=/nm/drive1/shubhra/cerebras/experiments/spft-cerebras_llama2_sparse50_45B_platypus_dolphin_KDFalse_GCTrue_LR1e-4_E2/combined/,trust_remote_code=True,dtype=bfloat16",
16
  "num_fewshot": 10,
17
+ "batch_size": "48",
18
  "batch_sizes": [],
19
  "device": "cuda:0",
20
  "no_cache": true,
mmlu.json CHANGED
@@ -1,346 +1,346 @@
1
  {
2
  "results": {
3
  "hendrycksTest-abstract_algebra": {
4
- "acc": 0.28,
5
- "acc_stderr": 0.04512608598542129,
6
- "acc_norm": 0.28,
7
- "acc_norm_stderr": 0.04512608598542129
8
  },
9
  "hendrycksTest-anatomy": {
10
- "acc": 0.45925925925925926,
11
- "acc_stderr": 0.04304979692464242,
12
- "acc_norm": 0.45925925925925926,
13
- "acc_norm_stderr": 0.04304979692464242
14
  },
15
  "hendrycksTest-astronomy": {
16
- "acc": 0.47368421052631576,
17
- "acc_stderr": 0.040633027314866725,
18
- "acc_norm": 0.47368421052631576,
19
- "acc_norm_stderr": 0.040633027314866725
20
  },
21
  "hendrycksTest-business_ethics": {
22
- "acc": 0.45,
23
- "acc_stderr": 0.05,
24
- "acc_norm": 0.45,
25
- "acc_norm_stderr": 0.05
26
  },
27
  "hendrycksTest-clinical_knowledge": {
28
- "acc": 0.4339622641509434,
29
- "acc_stderr": 0.030503292013342596,
30
- "acc_norm": 0.4339622641509434,
31
- "acc_norm_stderr": 0.030503292013342596
32
  },
33
  "hendrycksTest-college_biology": {
34
- "acc": 0.3680555555555556,
35
- "acc_stderr": 0.040329990539607195,
36
- "acc_norm": 0.3680555555555556,
37
- "acc_norm_stderr": 0.040329990539607195
38
  },
39
  "hendrycksTest-college_chemistry": {
40
  "acc": 0.33,
41
- "acc_stderr": 0.04725815626252605,
42
  "acc_norm": 0.33,
43
- "acc_norm_stderr": 0.04725815626252605
44
  },
45
  "hendrycksTest-college_computer_science": {
46
- "acc": 0.35,
47
- "acc_stderr": 0.047937248544110196,
48
- "acc_norm": 0.35,
49
- "acc_norm_stderr": 0.047937248544110196
50
  },
51
  "hendrycksTest-college_mathematics": {
52
- "acc": 0.2,
53
- "acc_stderr": 0.040201512610368466,
54
- "acc_norm": 0.2,
55
- "acc_norm_stderr": 0.040201512610368466
56
  },
57
  "hendrycksTest-college_medicine": {
58
- "acc": 0.3930635838150289,
59
- "acc_stderr": 0.0372424959581773,
60
- "acc_norm": 0.3930635838150289,
61
- "acc_norm_stderr": 0.0372424959581773
62
  },
63
  "hendrycksTest-college_physics": {
64
- "acc": 0.18627450980392157,
65
- "acc_stderr": 0.038739587141493524,
66
- "acc_norm": 0.18627450980392157,
67
- "acc_norm_stderr": 0.038739587141493524
68
  },
69
  "hendrycksTest-computer_security": {
70
- "acc": 0.59,
71
- "acc_stderr": 0.049431107042371025,
72
- "acc_norm": 0.59,
73
- "acc_norm_stderr": 0.049431107042371025
74
  },
75
  "hendrycksTest-conceptual_physics": {
76
- "acc": 0.37872340425531914,
77
- "acc_stderr": 0.03170995606040655,
78
- "acc_norm": 0.37872340425531914,
79
- "acc_norm_stderr": 0.03170995606040655
80
  },
81
  "hendrycksTest-econometrics": {
82
- "acc": 0.2894736842105263,
83
- "acc_stderr": 0.04266339443159394,
84
- "acc_norm": 0.2894736842105263,
85
- "acc_norm_stderr": 0.04266339443159394
86
  },
87
  "hendrycksTest-electrical_engineering": {
88
- "acc": 0.4206896551724138,
89
- "acc_stderr": 0.0411391498118926,
90
- "acc_norm": 0.4206896551724138,
91
- "acc_norm_stderr": 0.0411391498118926
92
  },
93
  "hendrycksTest-elementary_mathematics": {
94
- "acc": 0.30158730158730157,
95
- "acc_stderr": 0.02363697599610179,
96
- "acc_norm": 0.30158730158730157,
97
- "acc_norm_stderr": 0.02363697599610179
98
  },
99
  "hendrycksTest-formal_logic": {
100
- "acc": 0.25396825396825395,
101
- "acc_stderr": 0.03893259610604674,
102
- "acc_norm": 0.25396825396825395,
103
- "acc_norm_stderr": 0.03893259610604674
104
  },
105
  "hendrycksTest-global_facts": {
106
- "acc": 0.41,
107
- "acc_stderr": 0.049431107042371025,
108
- "acc_norm": 0.41,
109
- "acc_norm_stderr": 0.049431107042371025
110
  },
111
  "hendrycksTest-high_school_biology": {
112
- "acc": 0.4806451612903226,
113
- "acc_stderr": 0.02842268740431211,
114
- "acc_norm": 0.4806451612903226,
115
- "acc_norm_stderr": 0.02842268740431211
116
  },
117
  "hendrycksTest-high_school_chemistry": {
118
- "acc": 0.33004926108374383,
119
- "acc_stderr": 0.033085304262282574,
120
- "acc_norm": 0.33004926108374383,
121
- "acc_norm_stderr": 0.033085304262282574
122
  },
123
  "hendrycksTest-high_school_computer_science": {
124
- "acc": 0.4,
125
- "acc_stderr": 0.049236596391733084,
126
- "acc_norm": 0.4,
127
- "acc_norm_stderr": 0.049236596391733084
128
  },
129
  "hendrycksTest-high_school_european_history": {
130
- "acc": 0.5151515151515151,
131
- "acc_stderr": 0.03902551007374448,
132
- "acc_norm": 0.5151515151515151,
133
- "acc_norm_stderr": 0.03902551007374448
134
  },
135
  "hendrycksTest-high_school_geography": {
136
- "acc": 0.4444444444444444,
137
- "acc_stderr": 0.035402943770953675,
138
- "acc_norm": 0.4444444444444444,
139
- "acc_norm_stderr": 0.035402943770953675
140
  },
141
  "hendrycksTest-high_school_government_and_politics": {
142
- "acc": 0.5906735751295337,
143
- "acc_stderr": 0.03548608168860806,
144
- "acc_norm": 0.5906735751295337,
145
- "acc_norm_stderr": 0.03548608168860806
146
  },
147
  "hendrycksTest-high_school_macroeconomics": {
148
- "acc": 0.36666666666666664,
149
- "acc_stderr": 0.024433016466052455,
150
- "acc_norm": 0.36666666666666664,
151
- "acc_norm_stderr": 0.024433016466052455
152
  },
153
  "hendrycksTest-high_school_mathematics": {
154
- "acc": 0.27037037037037037,
155
- "acc_stderr": 0.02708037281514568,
156
- "acc_norm": 0.27037037037037037,
157
- "acc_norm_stderr": 0.02708037281514568
158
  },
159
  "hendrycksTest-high_school_microeconomics": {
160
- "acc": 0.38235294117647056,
161
- "acc_stderr": 0.03156663099215416,
162
- "acc_norm": 0.38235294117647056,
163
- "acc_norm_stderr": 0.03156663099215416
164
  },
165
  "hendrycksTest-high_school_physics": {
166
- "acc": 0.32450331125827814,
167
- "acc_stderr": 0.03822746937658754,
168
- "acc_norm": 0.32450331125827814,
169
- "acc_norm_stderr": 0.03822746937658754
170
  },
171
  "hendrycksTest-high_school_psychology": {
172
- "acc": 0.5027522935779817,
173
- "acc_stderr": 0.021436998359765317,
174
- "acc_norm": 0.5027522935779817,
175
- "acc_norm_stderr": 0.021436998359765317
176
  },
177
  "hendrycksTest-high_school_statistics": {
178
- "acc": 0.33796296296296297,
179
- "acc_stderr": 0.03225941352631295,
180
- "acc_norm": 0.33796296296296297,
181
- "acc_norm_stderr": 0.03225941352631295
182
  },
183
  "hendrycksTest-high_school_us_history": {
184
- "acc": 0.5637254901960784,
185
- "acc_stderr": 0.034806931384570396,
186
- "acc_norm": 0.5637254901960784,
187
- "acc_norm_stderr": 0.034806931384570396
188
  },
189
  "hendrycksTest-high_school_world_history": {
190
- "acc": 0.6329113924050633,
191
- "acc_stderr": 0.03137624072561619,
192
- "acc_norm": 0.6329113924050633,
193
- "acc_norm_stderr": 0.03137624072561619
194
  },
195
  "hendrycksTest-human_aging": {
196
- "acc": 0.4439461883408072,
197
- "acc_stderr": 0.03334625674242728,
198
- "acc_norm": 0.4439461883408072,
199
- "acc_norm_stderr": 0.03334625674242728
200
  },
201
  "hendrycksTest-human_sexuality": {
202
- "acc": 0.4351145038167939,
203
- "acc_stderr": 0.04348208051644858,
204
- "acc_norm": 0.4351145038167939,
205
- "acc_norm_stderr": 0.04348208051644858
206
  },
207
  "hendrycksTest-international_law": {
208
- "acc": 0.5619834710743802,
209
- "acc_stderr": 0.045291468044357915,
210
- "acc_norm": 0.5619834710743802,
211
- "acc_norm_stderr": 0.045291468044357915
212
  },
213
  "hendrycksTest-jurisprudence": {
214
- "acc": 0.48148148148148145,
215
  "acc_stderr": 0.04830366024635331,
216
- "acc_norm": 0.48148148148148145,
217
  "acc_norm_stderr": 0.04830366024635331
218
  },
219
  "hendrycksTest-logical_fallacies": {
220
- "acc": 0.4539877300613497,
221
- "acc_stderr": 0.0391170190467718,
222
- "acc_norm": 0.4539877300613497,
223
- "acc_norm_stderr": 0.0391170190467718
224
  },
225
  "hendrycksTest-machine_learning": {
226
- "acc": 0.2857142857142857,
227
- "acc_stderr": 0.04287858751340456,
228
- "acc_norm": 0.2857142857142857,
229
- "acc_norm_stderr": 0.04287858751340456
230
  },
231
  "hendrycksTest-management": {
232
- "acc": 0.49514563106796117,
233
- "acc_stderr": 0.049505043821289195,
234
- "acc_norm": 0.49514563106796117,
235
- "acc_norm_stderr": 0.049505043821289195
236
  },
237
  "hendrycksTest-marketing": {
238
- "acc": 0.594017094017094,
239
- "acc_stderr": 0.03217180182641087,
240
- "acc_norm": 0.594017094017094,
241
- "acc_norm_stderr": 0.03217180182641087
242
  },
243
  "hendrycksTest-medical_genetics": {
244
- "acc": 0.44,
245
- "acc_stderr": 0.04988876515698589,
246
- "acc_norm": 0.44,
247
- "acc_norm_stderr": 0.04988876515698589
248
  },
249
  "hendrycksTest-miscellaneous": {
250
- "acc": 0.5530012771392082,
251
- "acc_stderr": 0.017779225233394223,
252
- "acc_norm": 0.5530012771392082,
253
- "acc_norm_stderr": 0.017779225233394223
254
  },
255
  "hendrycksTest-moral_disputes": {
256
- "acc": 0.4479768786127168,
257
- "acc_stderr": 0.026772990653361816,
258
- "acc_norm": 0.4479768786127168,
259
- "acc_norm_stderr": 0.026772990653361816
260
  },
261
  "hendrycksTest-moral_scenarios": {
262
- "acc": 0.2558659217877095,
263
- "acc_stderr": 0.014593620923210728,
264
- "acc_norm": 0.2558659217877095,
265
- "acc_norm_stderr": 0.014593620923210728
266
  },
267
  "hendrycksTest-nutrition": {
268
- "acc": 0.49019607843137253,
269
- "acc_stderr": 0.02862441255016795,
270
- "acc_norm": 0.49019607843137253,
271
- "acc_norm_stderr": 0.02862441255016795
272
  },
273
  "hendrycksTest-philosophy": {
274
- "acc": 0.5401929260450161,
275
- "acc_stderr": 0.028306190403305696,
276
- "acc_norm": 0.5401929260450161,
277
- "acc_norm_stderr": 0.028306190403305696
278
  },
279
  "hendrycksTest-prehistory": {
280
- "acc": 0.49074074074074076,
281
- "acc_stderr": 0.027815973433878014,
282
- "acc_norm": 0.49074074074074076,
283
- "acc_norm_stderr": 0.027815973433878014
284
  },
285
  "hendrycksTest-professional_accounting": {
286
- "acc": 0.31560283687943264,
287
- "acc_stderr": 0.02772498944950931,
288
- "acc_norm": 0.31560283687943264,
289
- "acc_norm_stderr": 0.02772498944950931
290
  },
291
  "hendrycksTest-professional_law": {
292
- "acc": 0.3409387222946545,
293
- "acc_stderr": 0.01210681720306721,
294
- "acc_norm": 0.3409387222946545,
295
- "acc_norm_stderr": 0.01210681720306721
296
  },
297
  "hendrycksTest-professional_medicine": {
298
- "acc": 0.39338235294117646,
299
- "acc_stderr": 0.029674288281311172,
300
- "acc_norm": 0.39338235294117646,
301
- "acc_norm_stderr": 0.029674288281311172
302
  },
303
  "hendrycksTest-professional_psychology": {
304
- "acc": 0.4215686274509804,
305
- "acc_stderr": 0.01997742260022747,
306
- "acc_norm": 0.4215686274509804,
307
- "acc_norm_stderr": 0.01997742260022747
308
  },
309
  "hendrycksTest-public_relations": {
310
- "acc": 0.42727272727272725,
311
- "acc_stderr": 0.04738198703545483,
312
- "acc_norm": 0.42727272727272725,
313
- "acc_norm_stderr": 0.04738198703545483
314
  },
315
  "hendrycksTest-security_studies": {
316
- "acc": 0.43673469387755104,
317
- "acc_stderr": 0.031751952375833226,
318
- "acc_norm": 0.43673469387755104,
319
- "acc_norm_stderr": 0.031751952375833226
320
  },
321
  "hendrycksTest-sociology": {
322
- "acc": 0.5572139303482587,
323
- "acc_stderr": 0.03512310964123936,
324
- "acc_norm": 0.5572139303482587,
325
- "acc_norm_stderr": 0.03512310964123936
326
  },
327
  "hendrycksTest-us_foreign_policy": {
328
- "acc": 0.6,
329
- "acc_stderr": 0.049236596391733084,
330
- "acc_norm": 0.6,
331
- "acc_norm_stderr": 0.049236596391733084
332
  },
333
  "hendrycksTest-virology": {
334
- "acc": 0.3855421686746988,
335
- "acc_stderr": 0.037891344246115496,
336
- "acc_norm": 0.3855421686746988,
337
- "acc_norm_stderr": 0.037891344246115496
338
  },
339
  "hendrycksTest-world_religions": {
340
- "acc": 0.5672514619883041,
341
- "acc_stderr": 0.03799978644370607,
342
- "acc_norm": 0.5672514619883041,
343
- "acc_norm_stderr": 0.03799978644370607
344
  }
345
  },
346
  "versions": {
@@ -403,10 +403,10 @@
403
  "hendrycksTest-world_religions": 1
404
  },
405
  "config": {
406
- "model": "sparseml",
407
- "model_args": "pretrained=/cerebras/experiments/spft-retrained_sparse70_llama2_DATAUpdated_KDFalse_GCTrue_LR1e-4_E6-/combined/,trust_remote_code=True",
408
  "num_fewshot": 5,
409
- "batch_size": "6",
410
  "batch_sizes": [],
411
  "device": "cuda:0",
412
  "no_cache": true,
 
1
  {
2
  "results": {
3
  "hendrycksTest-abstract_algebra": {
4
+ "acc": 0.25,
5
+ "acc_stderr": 0.04351941398892446,
6
+ "acc_norm": 0.25,
7
+ "acc_norm_stderr": 0.04351941398892446
8
  },
9
  "hendrycksTest-anatomy": {
10
+ "acc": 0.4666666666666667,
11
+ "acc_stderr": 0.043097329010363554,
12
+ "acc_norm": 0.4666666666666667,
13
+ "acc_norm_stderr": 0.043097329010363554
14
  },
15
  "hendrycksTest-astronomy": {
16
+ "acc": 0.5263157894736842,
17
+ "acc_stderr": 0.04063302731486671,
18
+ "acc_norm": 0.5263157894736842,
19
+ "acc_norm_stderr": 0.04063302731486671
20
  },
21
  "hendrycksTest-business_ethics": {
22
+ "acc": 0.52,
23
+ "acc_stderr": 0.050211673156867795,
24
+ "acc_norm": 0.52,
25
+ "acc_norm_stderr": 0.050211673156867795
26
  },
27
  "hendrycksTest-clinical_knowledge": {
28
+ "acc": 0.4867924528301887,
29
+ "acc_stderr": 0.030762134874500482,
30
+ "acc_norm": 0.4867924528301887,
31
+ "acc_norm_stderr": 0.030762134874500482
32
  },
33
  "hendrycksTest-college_biology": {
34
+ "acc": 0.4513888888888889,
35
+ "acc_stderr": 0.04161402398403279,
36
+ "acc_norm": 0.4513888888888889,
37
+ "acc_norm_stderr": 0.04161402398403279
38
  },
39
  "hendrycksTest-college_chemistry": {
40
  "acc": 0.33,
41
+ "acc_stderr": 0.047258156262526045,
42
  "acc_norm": 0.33,
43
+ "acc_norm_stderr": 0.047258156262526045
44
  },
45
  "hendrycksTest-college_computer_science": {
46
+ "acc": 0.37,
47
+ "acc_stderr": 0.04852365870939099,
48
+ "acc_norm": 0.37,
49
+ "acc_norm_stderr": 0.04852365870939099
50
  },
51
  "hendrycksTest-college_mathematics": {
52
+ "acc": 0.27,
53
+ "acc_stderr": 0.044619604333847394,
54
+ "acc_norm": 0.27,
55
+ "acc_norm_stderr": 0.044619604333847394
56
  },
57
  "hendrycksTest-college_medicine": {
58
+ "acc": 0.3988439306358382,
59
+ "acc_stderr": 0.03733626655383509,
60
+ "acc_norm": 0.3988439306358382,
61
+ "acc_norm_stderr": 0.03733626655383509
62
  },
63
  "hendrycksTest-college_physics": {
64
+ "acc": 0.23529411764705882,
65
+ "acc_stderr": 0.042207736591714534,
66
+ "acc_norm": 0.23529411764705882,
67
+ "acc_norm_stderr": 0.042207736591714534
68
  },
69
  "hendrycksTest-computer_security": {
70
+ "acc": 0.62,
71
+ "acc_stderr": 0.048783173121456316,
72
+ "acc_norm": 0.62,
73
+ "acc_norm_stderr": 0.048783173121456316
74
  },
75
  "hendrycksTest-conceptual_physics": {
76
+ "acc": 0.3659574468085106,
77
+ "acc_stderr": 0.03148955829745529,
78
+ "acc_norm": 0.3659574468085106,
79
+ "acc_norm_stderr": 0.03148955829745529
80
  },
81
  "hendrycksTest-econometrics": {
82
+ "acc": 0.2807017543859649,
83
+ "acc_stderr": 0.042270544512322,
84
+ "acc_norm": 0.2807017543859649,
85
+ "acc_norm_stderr": 0.042270544512322
86
  },
87
  "hendrycksTest-electrical_engineering": {
88
+ "acc": 0.41379310344827586,
89
+ "acc_stderr": 0.04104269211806232,
90
+ "acc_norm": 0.41379310344827586,
91
+ "acc_norm_stderr": 0.04104269211806232
92
  },
93
  "hendrycksTest-elementary_mathematics": {
94
+ "acc": 0.2962962962962963,
95
+ "acc_stderr": 0.023517294335963283,
96
+ "acc_norm": 0.2962962962962963,
97
+ "acc_norm_stderr": 0.023517294335963283
98
  },
99
  "hendrycksTest-formal_logic": {
100
+ "acc": 0.2619047619047619,
101
+ "acc_stderr": 0.0393253768039287,
102
+ "acc_norm": 0.2619047619047619,
103
+ "acc_norm_stderr": 0.0393253768039287
104
  },
105
  "hendrycksTest-global_facts": {
106
+ "acc": 0.34,
107
+ "acc_stderr": 0.047609522856952365,
108
+ "acc_norm": 0.34,
109
+ "acc_norm_stderr": 0.047609522856952365
110
  },
111
  "hendrycksTest-high_school_biology": {
112
+ "acc": 0.5193548387096775,
113
+ "acc_stderr": 0.028422687404312107,
114
+ "acc_norm": 0.5193548387096775,
115
+ "acc_norm_stderr": 0.028422687404312107
116
  },
117
  "hendrycksTest-high_school_chemistry": {
118
+ "acc": 0.3448275862068966,
119
+ "acc_stderr": 0.03344283744280458,
120
+ "acc_norm": 0.3448275862068966,
121
+ "acc_norm_stderr": 0.03344283744280458
122
  },
123
  "hendrycksTest-high_school_computer_science": {
124
+ "acc": 0.44,
125
+ "acc_stderr": 0.04988876515698589,
126
+ "acc_norm": 0.44,
127
+ "acc_norm_stderr": 0.04988876515698589
128
  },
129
  "hendrycksTest-high_school_european_history": {
130
+ "acc": 0.5696969696969697,
131
+ "acc_stderr": 0.03866225962879077,
132
+ "acc_norm": 0.5696969696969697,
133
+ "acc_norm_stderr": 0.03866225962879077
134
  },
135
  "hendrycksTest-high_school_geography": {
136
+ "acc": 0.5050505050505051,
137
+ "acc_stderr": 0.035621707606254015,
138
+ "acc_norm": 0.5050505050505051,
139
+ "acc_norm_stderr": 0.035621707606254015
140
  },
141
  "hendrycksTest-high_school_government_and_politics": {
142
+ "acc": 0.6528497409326425,
143
+ "acc_stderr": 0.03435696168361356,
144
+ "acc_norm": 0.6528497409326425,
145
+ "acc_norm_stderr": 0.03435696168361356
146
  },
147
  "hendrycksTest-high_school_macroeconomics": {
148
+ "acc": 0.40512820512820513,
149
+ "acc_stderr": 0.024890471769938145,
150
+ "acc_norm": 0.40512820512820513,
151
+ "acc_norm_stderr": 0.024890471769938145
152
  },
153
  "hendrycksTest-high_school_mathematics": {
154
+ "acc": 0.2518518518518518,
155
+ "acc_stderr": 0.026466117538959916,
156
+ "acc_norm": 0.2518518518518518,
157
+ "acc_norm_stderr": 0.026466117538959916
158
  },
159
  "hendrycksTest-high_school_microeconomics": {
160
+ "acc": 0.42857142857142855,
161
+ "acc_stderr": 0.032145368597886394,
162
+ "acc_norm": 0.42857142857142855,
163
+ "acc_norm_stderr": 0.032145368597886394
164
  },
165
  "hendrycksTest-high_school_physics": {
166
+ "acc": 0.271523178807947,
167
+ "acc_stderr": 0.03631329803969654,
168
+ "acc_norm": 0.271523178807947,
169
+ "acc_norm_stderr": 0.03631329803969654
170
  },
171
  "hendrycksTest-high_school_psychology": {
172
+ "acc": 0.6,
173
+ "acc_stderr": 0.021004201260420078,
174
+ "acc_norm": 0.6,
175
+ "acc_norm_stderr": 0.021004201260420078
176
  },
177
  "hendrycksTest-high_school_statistics": {
178
+ "acc": 0.37962962962962965,
179
+ "acc_stderr": 0.03309682581119035,
180
+ "acc_norm": 0.37962962962962965,
181
+ "acc_norm_stderr": 0.03309682581119035
182
  },
183
  "hendrycksTest-high_school_us_history": {
184
+ "acc": 0.5882352941176471,
185
+ "acc_stderr": 0.03454236585380608,
186
+ "acc_norm": 0.5882352941176471,
187
+ "acc_norm_stderr": 0.03454236585380608
188
  },
189
  "hendrycksTest-high_school_world_history": {
190
+ "acc": 0.6666666666666666,
191
+ "acc_stderr": 0.0306858205966108,
192
+ "acc_norm": 0.6666666666666666,
193
+ "acc_norm_stderr": 0.0306858205966108
194
  },
195
  "hendrycksTest-human_aging": {
196
+ "acc": 0.57847533632287,
197
+ "acc_stderr": 0.033141902221106564,
198
+ "acc_norm": 0.57847533632287,
199
+ "acc_norm_stderr": 0.033141902221106564
200
  },
201
  "hendrycksTest-human_sexuality": {
202
+ "acc": 0.549618320610687,
203
+ "acc_stderr": 0.04363643698524779,
204
+ "acc_norm": 0.549618320610687,
205
+ "acc_norm_stderr": 0.04363643698524779
206
  },
207
  "hendrycksTest-international_law": {
208
+ "acc": 0.6859504132231405,
209
+ "acc_stderr": 0.04236964753041018,
210
+ "acc_norm": 0.6859504132231405,
211
+ "acc_norm_stderr": 0.04236964753041018
212
  },
213
  "hendrycksTest-jurisprudence": {
214
+ "acc": 0.5185185185185185,
215
  "acc_stderr": 0.04830366024635331,
216
+ "acc_norm": 0.5185185185185185,
217
  "acc_norm_stderr": 0.04830366024635331
218
  },
219
  "hendrycksTest-logical_fallacies": {
220
+ "acc": 0.49079754601226994,
221
+ "acc_stderr": 0.03927705600787443,
222
+ "acc_norm": 0.49079754601226994,
223
+ "acc_norm_stderr": 0.03927705600787443
224
  },
225
  "hendrycksTest-machine_learning": {
226
+ "acc": 0.33035714285714285,
227
+ "acc_stderr": 0.04464285714285714,
228
+ "acc_norm": 0.33035714285714285,
229
+ "acc_norm_stderr": 0.04464285714285714
230
  },
231
  "hendrycksTest-management": {
232
+ "acc": 0.6116504854368932,
233
+ "acc_stderr": 0.048257293373563895,
234
+ "acc_norm": 0.6116504854368932,
235
+ "acc_norm_stderr": 0.048257293373563895
236
  },
237
  "hendrycksTest-marketing": {
238
+ "acc": 0.6538461538461539,
239
+ "acc_stderr": 0.0311669573672359,
240
+ "acc_norm": 0.6538461538461539,
241
+ "acc_norm_stderr": 0.0311669573672359
242
  },
243
  "hendrycksTest-medical_genetics": {
244
+ "acc": 0.45,
245
+ "acc_stderr": 0.05,
246
+ "acc_norm": 0.45,
247
+ "acc_norm_stderr": 0.05
248
  },
249
  "hendrycksTest-miscellaneous": {
250
+ "acc": 0.6475095785440613,
251
+ "acc_stderr": 0.01708415024408138,
252
+ "acc_norm": 0.6475095785440613,
253
+ "acc_norm_stderr": 0.01708415024408138
254
  },
255
  "hendrycksTest-moral_disputes": {
256
+ "acc": 0.5346820809248555,
257
+ "acc_stderr": 0.026854257928258875,
258
+ "acc_norm": 0.5346820809248555,
259
+ "acc_norm_stderr": 0.026854257928258875
260
  },
261
  "hendrycksTest-moral_scenarios": {
262
+ "acc": 0.2837988826815642,
263
+ "acc_stderr": 0.015078358970751764,
264
+ "acc_norm": 0.2837988826815642,
265
+ "acc_norm_stderr": 0.015078358970751764
266
  },
267
  "hendrycksTest-nutrition": {
268
+ "acc": 0.48366013071895425,
269
+ "acc_stderr": 0.028614624752805413,
270
+ "acc_norm": 0.48366013071895425,
271
+ "acc_norm_stderr": 0.028614624752805413
272
  },
273
  "hendrycksTest-philosophy": {
274
+ "acc": 0.5562700964630225,
275
+ "acc_stderr": 0.02821768355665231,
276
+ "acc_norm": 0.5562700964630225,
277
+ "acc_norm_stderr": 0.02821768355665231
278
  },
279
  "hendrycksTest-prehistory": {
280
+ "acc": 0.5370370370370371,
281
+ "acc_stderr": 0.027744313443376536,
282
+ "acc_norm": 0.5370370370370371,
283
+ "acc_norm_stderr": 0.027744313443376536
284
  },
285
  "hendrycksTest-professional_accounting": {
286
+ "acc": 0.36524822695035464,
287
+ "acc_stderr": 0.028723863853281285,
288
+ "acc_norm": 0.36524822695035464,
289
+ "acc_norm_stderr": 0.028723863853281285
290
  },
291
  "hendrycksTest-professional_law": {
292
+ "acc": 0.37614080834419816,
293
+ "acc_stderr": 0.012372214430599816,
294
+ "acc_norm": 0.37614080834419816,
295
+ "acc_norm_stderr": 0.012372214430599816
296
  },
297
  "hendrycksTest-professional_medicine": {
298
+ "acc": 0.44485294117647056,
299
+ "acc_stderr": 0.030187532060329387,
300
+ "acc_norm": 0.44485294117647056,
301
+ "acc_norm_stderr": 0.030187532060329387
302
  },
303
  "hendrycksTest-professional_psychology": {
304
+ "acc": 0.4411764705882353,
305
+ "acc_stderr": 0.020087362076702857,
306
+ "acc_norm": 0.4411764705882353,
307
+ "acc_norm_stderr": 0.020087362076702857
308
  },
309
  "hendrycksTest-public_relations": {
310
+ "acc": 0.509090909090909,
311
+ "acc_stderr": 0.0478833976870286,
312
+ "acc_norm": 0.509090909090909,
313
+ "acc_norm_stderr": 0.0478833976870286
314
  },
315
  "hendrycksTest-security_studies": {
316
+ "acc": 0.5224489795918368,
317
+ "acc_stderr": 0.031976941187136725,
318
+ "acc_norm": 0.5224489795918368,
319
+ "acc_norm_stderr": 0.031976941187136725
320
  },
321
  "hendrycksTest-sociology": {
322
+ "acc": 0.6119402985074627,
323
+ "acc_stderr": 0.0344578996436275,
324
+ "acc_norm": 0.6119402985074627,
325
+ "acc_norm_stderr": 0.0344578996436275
326
  },
327
  "hendrycksTest-us_foreign_policy": {
328
+ "acc": 0.73,
329
+ "acc_stderr": 0.0446196043338474,
330
+ "acc_norm": 0.73,
331
+ "acc_norm_stderr": 0.0446196043338474
332
  },
333
  "hendrycksTest-virology": {
334
+ "acc": 0.39759036144578314,
335
+ "acc_stderr": 0.038099730845402184,
336
+ "acc_norm": 0.39759036144578314,
337
+ "acc_norm_stderr": 0.038099730845402184
338
  },
339
  "hendrycksTest-world_religions": {
340
+ "acc": 0.7076023391812866,
341
+ "acc_stderr": 0.03488647713457922,
342
+ "acc_norm": 0.7076023391812866,
343
+ "acc_norm_stderr": 0.03488647713457922
344
  }
345
  },
346
  "versions": {
 
403
  "hendrycksTest-world_religions": 1
404
  },
405
  "config": {
406
+ "model": "hf",
407
+ "model_args": "pretrained=/nm/drive1/shubhra/cerebras/experiments/spft-cerebras_llama2_sparse50_45B_platypus_dolphin_KDFalse_GCTrue_LR1e-4_E2/combined/,trust_remote_code=True,dtype=bfloat16",
408
  "num_fewshot": 5,
409
+ "batch_size": "16",
410
  "batch_sizes": [],
411
  "device": "cuda:0",
412
  "no_cache": true,
model-00001-of-00003.safetensors → model-00001-of-00006.safetensors RENAMED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:f4b70ffe94cfaaf4a68be9455c874896012b098690531fbbd78c9da473b1e178
3
- size 4938985352
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:cc6ab460d335ac2b3d3e5752150098cf80eeff0910e1e1849ef9d079d6b4494e
3
+ size 4840396416
model-00002-of-00003.safetensors → model-00002-of-00006.safetensors RENAMED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:76b52076bfa75355cf80593a6261f53d8e4e0ae85ba17903a900b59d20c406f3
3
- size 4947390880
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:2fe349b4c785be7e5fff726b688ea6461116bae4ec1ee0c32dce2b8f94f6fa2f
3
+ size 4857206856
model-00003-of-00003.safetensors → model-00003-of-00006.safetensors RENAMED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:1aa2d308ea6dacbe3c77e273165c284f9d860dbacac40a39aaf35ce3c49dc034
3
- size 3590488816
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:beca86c80d997da03ebaa699a361d53ebd11b9a133a3bf29d39c04434f65ffff
3
+ size 4857206904
model-00004-of-00006.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:def437a470e94ef337a8b9c0a008320f0610f40cdbc29e097cc3f2111958a214
3
+ size 4857206904
model-00005-of-00006.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:627a6137a9269e9b26dae8c3596d252aed6a317e295b244291278f4100a523c9
3
+ size 4857206904
model-00006-of-00006.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:6e1211f02d8f8de62a565671261d73f598e3a1b4363ffd9e46f20a3b8ca1e3f4
3
+ size 2684472112
model.safetensors.index.json CHANGED
@@ -1,298 +1,298 @@
1
  {
2
  "metadata": {
3
- "total_size": 13476831232
4
  },
5
  "weight_map": {
6
- "lm_head.weight": "model-00003-of-00003.safetensors",
7
- "model.embed_tokens.weight": "model-00001-of-00003.safetensors",
8
- "model.layers.0.input_layernorm.weight": "model-00001-of-00003.safetensors",
9
- "model.layers.0.mlp.down_proj.weight": "model-00001-of-00003.safetensors",
10
- "model.layers.0.mlp.gate_proj.weight": "model-00001-of-00003.safetensors",
11
- "model.layers.0.mlp.up_proj.weight": "model-00001-of-00003.safetensors",
12
- "model.layers.0.post_attention_layernorm.weight": "model-00001-of-00003.safetensors",
13
- "model.layers.0.self_attn.k_proj.weight": "model-00001-of-00003.safetensors",
14
- "model.layers.0.self_attn.o_proj.weight": "model-00001-of-00003.safetensors",
15
- "model.layers.0.self_attn.q_proj.weight": "model-00001-of-00003.safetensors",
16
- "model.layers.0.self_attn.v_proj.weight": "model-00001-of-00003.safetensors",
17
- "model.layers.1.input_layernorm.weight": "model-00001-of-00003.safetensors",
18
- "model.layers.1.mlp.down_proj.weight": "model-00001-of-00003.safetensors",
19
- "model.layers.1.mlp.gate_proj.weight": "model-00001-of-00003.safetensors",
20
- "model.layers.1.mlp.up_proj.weight": "model-00001-of-00003.safetensors",
21
- "model.layers.1.post_attention_layernorm.weight": "model-00001-of-00003.safetensors",
22
- "model.layers.1.self_attn.k_proj.weight": "model-00001-of-00003.safetensors",
23
- "model.layers.1.self_attn.o_proj.weight": "model-00001-of-00003.safetensors",
24
- "model.layers.1.self_attn.q_proj.weight": "model-00001-of-00003.safetensors",
25
- "model.layers.1.self_attn.v_proj.weight": "model-00001-of-00003.safetensors",
26
- "model.layers.10.input_layernorm.weight": "model-00001-of-00003.safetensors",
27
- "model.layers.10.mlp.down_proj.weight": "model-00001-of-00003.safetensors",
28
- "model.layers.10.mlp.gate_proj.weight": "model-00001-of-00003.safetensors",
29
- "model.layers.10.mlp.up_proj.weight": "model-00001-of-00003.safetensors",
30
- "model.layers.10.post_attention_layernorm.weight": "model-00001-of-00003.safetensors",
31
- "model.layers.10.self_attn.k_proj.weight": "model-00001-of-00003.safetensors",
32
- "model.layers.10.self_attn.o_proj.weight": "model-00001-of-00003.safetensors",
33
- "model.layers.10.self_attn.q_proj.weight": "model-00001-of-00003.safetensors",
34
- "model.layers.10.self_attn.v_proj.weight": "model-00001-of-00003.safetensors",
35
- "model.layers.11.input_layernorm.weight": "model-00002-of-00003.safetensors",
36
- "model.layers.11.mlp.down_proj.weight": "model-00002-of-00003.safetensors",
37
- "model.layers.11.mlp.gate_proj.weight": "model-00001-of-00003.safetensors",
38
- "model.layers.11.mlp.up_proj.weight": "model-00002-of-00003.safetensors",
39
- "model.layers.11.post_attention_layernorm.weight": "model-00002-of-00003.safetensors",
40
- "model.layers.11.self_attn.k_proj.weight": "model-00001-of-00003.safetensors",
41
- "model.layers.11.self_attn.o_proj.weight": "model-00001-of-00003.safetensors",
42
- "model.layers.11.self_attn.q_proj.weight": "model-00001-of-00003.safetensors",
43
- "model.layers.11.self_attn.v_proj.weight": "model-00001-of-00003.safetensors",
44
- "model.layers.12.input_layernorm.weight": "model-00002-of-00003.safetensors",
45
- "model.layers.12.mlp.down_proj.weight": "model-00002-of-00003.safetensors",
46
- "model.layers.12.mlp.gate_proj.weight": "model-00002-of-00003.safetensors",
47
- "model.layers.12.mlp.up_proj.weight": "model-00002-of-00003.safetensors",
48
- "model.layers.12.post_attention_layernorm.weight": "model-00002-of-00003.safetensors",
49
- "model.layers.12.self_attn.k_proj.weight": "model-00002-of-00003.safetensors",
50
- "model.layers.12.self_attn.o_proj.weight": "model-00002-of-00003.safetensors",
51
- "model.layers.12.self_attn.q_proj.weight": "model-00002-of-00003.safetensors",
52
- "model.layers.12.self_attn.v_proj.weight": "model-00002-of-00003.safetensors",
53
- "model.layers.13.input_layernorm.weight": "model-00002-of-00003.safetensors",
54
- "model.layers.13.mlp.down_proj.weight": "model-00002-of-00003.safetensors",
55
- "model.layers.13.mlp.gate_proj.weight": "model-00002-of-00003.safetensors",
56
- "model.layers.13.mlp.up_proj.weight": "model-00002-of-00003.safetensors",
57
- "model.layers.13.post_attention_layernorm.weight": "model-00002-of-00003.safetensors",
58
- "model.layers.13.self_attn.k_proj.weight": "model-00002-of-00003.safetensors",
59
- "model.layers.13.self_attn.o_proj.weight": "model-00002-of-00003.safetensors",
60
- "model.layers.13.self_attn.q_proj.weight": "model-00002-of-00003.safetensors",
61
- "model.layers.13.self_attn.v_proj.weight": "model-00002-of-00003.safetensors",
62
- "model.layers.14.input_layernorm.weight": "model-00002-of-00003.safetensors",
63
- "model.layers.14.mlp.down_proj.weight": "model-00002-of-00003.safetensors",
64
- "model.layers.14.mlp.gate_proj.weight": "model-00002-of-00003.safetensors",
65
- "model.layers.14.mlp.up_proj.weight": "model-00002-of-00003.safetensors",
66
- "model.layers.14.post_attention_layernorm.weight": "model-00002-of-00003.safetensors",
67
- "model.layers.14.self_attn.k_proj.weight": "model-00002-of-00003.safetensors",
68
- "model.layers.14.self_attn.o_proj.weight": "model-00002-of-00003.safetensors",
69
- "model.layers.14.self_attn.q_proj.weight": "model-00002-of-00003.safetensors",
70
- "model.layers.14.self_attn.v_proj.weight": "model-00002-of-00003.safetensors",
71
- "model.layers.15.input_layernorm.weight": "model-00002-of-00003.safetensors",
72
- "model.layers.15.mlp.down_proj.weight": "model-00002-of-00003.safetensors",
73
- "model.layers.15.mlp.gate_proj.weight": "model-00002-of-00003.safetensors",
74
- "model.layers.15.mlp.up_proj.weight": "model-00002-of-00003.safetensors",
75
- "model.layers.15.post_attention_layernorm.weight": "model-00002-of-00003.safetensors",
76
- "model.layers.15.self_attn.k_proj.weight": "model-00002-of-00003.safetensors",
77
- "model.layers.15.self_attn.o_proj.weight": "model-00002-of-00003.safetensors",
78
- "model.layers.15.self_attn.q_proj.weight": "model-00002-of-00003.safetensors",
79
- "model.layers.15.self_attn.v_proj.weight": "model-00002-of-00003.safetensors",
80
- "model.layers.16.input_layernorm.weight": "model-00002-of-00003.safetensors",
81
- "model.layers.16.mlp.down_proj.weight": "model-00002-of-00003.safetensors",
82
- "model.layers.16.mlp.gate_proj.weight": "model-00002-of-00003.safetensors",
83
- "model.layers.16.mlp.up_proj.weight": "model-00002-of-00003.safetensors",
84
- "model.layers.16.post_attention_layernorm.weight": "model-00002-of-00003.safetensors",
85
- "model.layers.16.self_attn.k_proj.weight": "model-00002-of-00003.safetensors",
86
- "model.layers.16.self_attn.o_proj.weight": "model-00002-of-00003.safetensors",
87
- "model.layers.16.self_attn.q_proj.weight": "model-00002-of-00003.safetensors",
88
- "model.layers.16.self_attn.v_proj.weight": "model-00002-of-00003.safetensors",
89
- "model.layers.17.input_layernorm.weight": "model-00002-of-00003.safetensors",
90
- "model.layers.17.mlp.down_proj.weight": "model-00002-of-00003.safetensors",
91
- "model.layers.17.mlp.gate_proj.weight": "model-00002-of-00003.safetensors",
92
- "model.layers.17.mlp.up_proj.weight": "model-00002-of-00003.safetensors",
93
- "model.layers.17.post_attention_layernorm.weight": "model-00002-of-00003.safetensors",
94
- "model.layers.17.self_attn.k_proj.weight": "model-00002-of-00003.safetensors",
95
- "model.layers.17.self_attn.o_proj.weight": "model-00002-of-00003.safetensors",
96
- "model.layers.17.self_attn.q_proj.weight": "model-00002-of-00003.safetensors",
97
- "model.layers.17.self_attn.v_proj.weight": "model-00002-of-00003.safetensors",
98
- "model.layers.18.input_layernorm.weight": "model-00002-of-00003.safetensors",
99
- "model.layers.18.mlp.down_proj.weight": "model-00002-of-00003.safetensors",
100
- "model.layers.18.mlp.gate_proj.weight": "model-00002-of-00003.safetensors",
101
- "model.layers.18.mlp.up_proj.weight": "model-00002-of-00003.safetensors",
102
- "model.layers.18.post_attention_layernorm.weight": "model-00002-of-00003.safetensors",
103
- "model.layers.18.self_attn.k_proj.weight": "model-00002-of-00003.safetensors",
104
- "model.layers.18.self_attn.o_proj.weight": "model-00002-of-00003.safetensors",
105
- "model.layers.18.self_attn.q_proj.weight": "model-00002-of-00003.safetensors",
106
- "model.layers.18.self_attn.v_proj.weight": "model-00002-of-00003.safetensors",
107
- "model.layers.19.input_layernorm.weight": "model-00002-of-00003.safetensors",
108
- "model.layers.19.mlp.down_proj.weight": "model-00002-of-00003.safetensors",
109
- "model.layers.19.mlp.gate_proj.weight": "model-00002-of-00003.safetensors",
110
- "model.layers.19.mlp.up_proj.weight": "model-00002-of-00003.safetensors",
111
- "model.layers.19.post_attention_layernorm.weight": "model-00002-of-00003.safetensors",
112
- "model.layers.19.self_attn.k_proj.weight": "model-00002-of-00003.safetensors",
113
- "model.layers.19.self_attn.o_proj.weight": "model-00002-of-00003.safetensors",
114
- "model.layers.19.self_attn.q_proj.weight": "model-00002-of-00003.safetensors",
115
- "model.layers.19.self_attn.v_proj.weight": "model-00002-of-00003.safetensors",
116
- "model.layers.2.input_layernorm.weight": "model-00001-of-00003.safetensors",
117
- "model.layers.2.mlp.down_proj.weight": "model-00001-of-00003.safetensors",
118
- "model.layers.2.mlp.gate_proj.weight": "model-00001-of-00003.safetensors",
119
- "model.layers.2.mlp.up_proj.weight": "model-00001-of-00003.safetensors",
120
- "model.layers.2.post_attention_layernorm.weight": "model-00001-of-00003.safetensors",
121
- "model.layers.2.self_attn.k_proj.weight": "model-00001-of-00003.safetensors",
122
- "model.layers.2.self_attn.o_proj.weight": "model-00001-of-00003.safetensors",
123
- "model.layers.2.self_attn.q_proj.weight": "model-00001-of-00003.safetensors",
124
- "model.layers.2.self_attn.v_proj.weight": "model-00001-of-00003.safetensors",
125
- "model.layers.20.input_layernorm.weight": "model-00002-of-00003.safetensors",
126
- "model.layers.20.mlp.down_proj.weight": "model-00002-of-00003.safetensors",
127
- "model.layers.20.mlp.gate_proj.weight": "model-00002-of-00003.safetensors",
128
- "model.layers.20.mlp.up_proj.weight": "model-00002-of-00003.safetensors",
129
- "model.layers.20.post_attention_layernorm.weight": "model-00002-of-00003.safetensors",
130
- "model.layers.20.self_attn.k_proj.weight": "model-00002-of-00003.safetensors",
131
- "model.layers.20.self_attn.o_proj.weight": "model-00002-of-00003.safetensors",
132
- "model.layers.20.self_attn.q_proj.weight": "model-00002-of-00003.safetensors",
133
- "model.layers.20.self_attn.v_proj.weight": "model-00002-of-00003.safetensors",
134
- "model.layers.21.input_layernorm.weight": "model-00002-of-00003.safetensors",
135
- "model.layers.21.mlp.down_proj.weight": "model-00002-of-00003.safetensors",
136
- "model.layers.21.mlp.gate_proj.weight": "model-00002-of-00003.safetensors",
137
- "model.layers.21.mlp.up_proj.weight": "model-00002-of-00003.safetensors",
138
- "model.layers.21.post_attention_layernorm.weight": "model-00002-of-00003.safetensors",
139
- "model.layers.21.self_attn.k_proj.weight": "model-00002-of-00003.safetensors",
140
- "model.layers.21.self_attn.o_proj.weight": "model-00002-of-00003.safetensors",
141
- "model.layers.21.self_attn.q_proj.weight": "model-00002-of-00003.safetensors",
142
- "model.layers.21.self_attn.v_proj.weight": "model-00002-of-00003.safetensors",
143
- "model.layers.22.input_layernorm.weight": "model-00002-of-00003.safetensors",
144
- "model.layers.22.mlp.down_proj.weight": "model-00002-of-00003.safetensors",
145
- "model.layers.22.mlp.gate_proj.weight": "model-00002-of-00003.safetensors",
146
- "model.layers.22.mlp.up_proj.weight": "model-00002-of-00003.safetensors",
147
- "model.layers.22.post_attention_layernorm.weight": "model-00002-of-00003.safetensors",
148
- "model.layers.22.self_attn.k_proj.weight": "model-00002-of-00003.safetensors",
149
- "model.layers.22.self_attn.o_proj.weight": "model-00002-of-00003.safetensors",
150
- "model.layers.22.self_attn.q_proj.weight": "model-00002-of-00003.safetensors",
151
- "model.layers.22.self_attn.v_proj.weight": "model-00002-of-00003.safetensors",
152
- "model.layers.23.input_layernorm.weight": "model-00003-of-00003.safetensors",
153
- "model.layers.23.mlp.down_proj.weight": "model-00003-of-00003.safetensors",
154
- "model.layers.23.mlp.gate_proj.weight": "model-00002-of-00003.safetensors",
155
- "model.layers.23.mlp.up_proj.weight": "model-00002-of-00003.safetensors",
156
- "model.layers.23.post_attention_layernorm.weight": "model-00003-of-00003.safetensors",
157
- "model.layers.23.self_attn.k_proj.weight": "model-00002-of-00003.safetensors",
158
- "model.layers.23.self_attn.o_proj.weight": "model-00002-of-00003.safetensors",
159
- "model.layers.23.self_attn.q_proj.weight": "model-00002-of-00003.safetensors",
160
- "model.layers.23.self_attn.v_proj.weight": "model-00002-of-00003.safetensors",
161
- "model.layers.24.input_layernorm.weight": "model-00003-of-00003.safetensors",
162
- "model.layers.24.mlp.down_proj.weight": "model-00003-of-00003.safetensors",
163
- "model.layers.24.mlp.gate_proj.weight": "model-00003-of-00003.safetensors",
164
- "model.layers.24.mlp.up_proj.weight": "model-00003-of-00003.safetensors",
165
- "model.layers.24.post_attention_layernorm.weight": "model-00003-of-00003.safetensors",
166
- "model.layers.24.self_attn.k_proj.weight": "model-00003-of-00003.safetensors",
167
- "model.layers.24.self_attn.o_proj.weight": "model-00003-of-00003.safetensors",
168
- "model.layers.24.self_attn.q_proj.weight": "model-00003-of-00003.safetensors",
169
- "model.layers.24.self_attn.v_proj.weight": "model-00003-of-00003.safetensors",
170
- "model.layers.25.input_layernorm.weight": "model-00003-of-00003.safetensors",
171
- "model.layers.25.mlp.down_proj.weight": "model-00003-of-00003.safetensors",
172
- "model.layers.25.mlp.gate_proj.weight": "model-00003-of-00003.safetensors",
173
- "model.layers.25.mlp.up_proj.weight": "model-00003-of-00003.safetensors",
174
- "model.layers.25.post_attention_layernorm.weight": "model-00003-of-00003.safetensors",
175
- "model.layers.25.self_attn.k_proj.weight": "model-00003-of-00003.safetensors",
176
- "model.layers.25.self_attn.o_proj.weight": "model-00003-of-00003.safetensors",
177
- "model.layers.25.self_attn.q_proj.weight": "model-00003-of-00003.safetensors",
178
- "model.layers.25.self_attn.v_proj.weight": "model-00003-of-00003.safetensors",
179
- "model.layers.26.input_layernorm.weight": "model-00003-of-00003.safetensors",
180
- "model.layers.26.mlp.down_proj.weight": "model-00003-of-00003.safetensors",
181
- "model.layers.26.mlp.gate_proj.weight": "model-00003-of-00003.safetensors",
182
- "model.layers.26.mlp.up_proj.weight": "model-00003-of-00003.safetensors",
183
- "model.layers.26.post_attention_layernorm.weight": "model-00003-of-00003.safetensors",
184
- "model.layers.26.self_attn.k_proj.weight": "model-00003-of-00003.safetensors",
185
- "model.layers.26.self_attn.o_proj.weight": "model-00003-of-00003.safetensors",
186
- "model.layers.26.self_attn.q_proj.weight": "model-00003-of-00003.safetensors",
187
- "model.layers.26.self_attn.v_proj.weight": "model-00003-of-00003.safetensors",
188
- "model.layers.27.input_layernorm.weight": "model-00003-of-00003.safetensors",
189
- "model.layers.27.mlp.down_proj.weight": "model-00003-of-00003.safetensors",
190
- "model.layers.27.mlp.gate_proj.weight": "model-00003-of-00003.safetensors",
191
- "model.layers.27.mlp.up_proj.weight": "model-00003-of-00003.safetensors",
192
- "model.layers.27.post_attention_layernorm.weight": "model-00003-of-00003.safetensors",
193
- "model.layers.27.self_attn.k_proj.weight": "model-00003-of-00003.safetensors",
194
- "model.layers.27.self_attn.o_proj.weight": "model-00003-of-00003.safetensors",
195
- "model.layers.27.self_attn.q_proj.weight": "model-00003-of-00003.safetensors",
196
- "model.layers.27.self_attn.v_proj.weight": "model-00003-of-00003.safetensors",
197
- "model.layers.28.input_layernorm.weight": "model-00003-of-00003.safetensors",
198
- "model.layers.28.mlp.down_proj.weight": "model-00003-of-00003.safetensors",
199
- "model.layers.28.mlp.gate_proj.weight": "model-00003-of-00003.safetensors",
200
- "model.layers.28.mlp.up_proj.weight": "model-00003-of-00003.safetensors",
201
- "model.layers.28.post_attention_layernorm.weight": "model-00003-of-00003.safetensors",
202
- "model.layers.28.self_attn.k_proj.weight": "model-00003-of-00003.safetensors",
203
- "model.layers.28.self_attn.o_proj.weight": "model-00003-of-00003.safetensors",
204
- "model.layers.28.self_attn.q_proj.weight": "model-00003-of-00003.safetensors",
205
- "model.layers.28.self_attn.v_proj.weight": "model-00003-of-00003.safetensors",
206
- "model.layers.29.input_layernorm.weight": "model-00003-of-00003.safetensors",
207
- "model.layers.29.mlp.down_proj.weight": "model-00003-of-00003.safetensors",
208
- "model.layers.29.mlp.gate_proj.weight": "model-00003-of-00003.safetensors",
209
- "model.layers.29.mlp.up_proj.weight": "model-00003-of-00003.safetensors",
210
- "model.layers.29.post_attention_layernorm.weight": "model-00003-of-00003.safetensors",
211
- "model.layers.29.self_attn.k_proj.weight": "model-00003-of-00003.safetensors",
212
- "model.layers.29.self_attn.o_proj.weight": "model-00003-of-00003.safetensors",
213
- "model.layers.29.self_attn.q_proj.weight": "model-00003-of-00003.safetensors",
214
- "model.layers.29.self_attn.v_proj.weight": "model-00003-of-00003.safetensors",
215
- "model.layers.3.input_layernorm.weight": "model-00001-of-00003.safetensors",
216
- "model.layers.3.mlp.down_proj.weight": "model-00001-of-00003.safetensors",
217
- "model.layers.3.mlp.gate_proj.weight": "model-00001-of-00003.safetensors",
218
- "model.layers.3.mlp.up_proj.weight": "model-00001-of-00003.safetensors",
219
- "model.layers.3.post_attention_layernorm.weight": "model-00001-of-00003.safetensors",
220
- "model.layers.3.self_attn.k_proj.weight": "model-00001-of-00003.safetensors",
221
- "model.layers.3.self_attn.o_proj.weight": "model-00001-of-00003.safetensors",
222
- "model.layers.3.self_attn.q_proj.weight": "model-00001-of-00003.safetensors",
223
- "model.layers.3.self_attn.v_proj.weight": "model-00001-of-00003.safetensors",
224
- "model.layers.30.input_layernorm.weight": "model-00003-of-00003.safetensors",
225
- "model.layers.30.mlp.down_proj.weight": "model-00003-of-00003.safetensors",
226
- "model.layers.30.mlp.gate_proj.weight": "model-00003-of-00003.safetensors",
227
- "model.layers.30.mlp.up_proj.weight": "model-00003-of-00003.safetensors",
228
- "model.layers.30.post_attention_layernorm.weight": "model-00003-of-00003.safetensors",
229
- "model.layers.30.self_attn.k_proj.weight": "model-00003-of-00003.safetensors",
230
- "model.layers.30.self_attn.o_proj.weight": "model-00003-of-00003.safetensors",
231
- "model.layers.30.self_attn.q_proj.weight": "model-00003-of-00003.safetensors",
232
- "model.layers.30.self_attn.v_proj.weight": "model-00003-of-00003.safetensors",
233
- "model.layers.31.input_layernorm.weight": "model-00003-of-00003.safetensors",
234
- "model.layers.31.mlp.down_proj.weight": "model-00003-of-00003.safetensors",
235
- "model.layers.31.mlp.gate_proj.weight": "model-00003-of-00003.safetensors",
236
- "model.layers.31.mlp.up_proj.weight": "model-00003-of-00003.safetensors",
237
- "model.layers.31.post_attention_layernorm.weight": "model-00003-of-00003.safetensors",
238
- "model.layers.31.self_attn.k_proj.weight": "model-00003-of-00003.safetensors",
239
- "model.layers.31.self_attn.o_proj.weight": "model-00003-of-00003.safetensors",
240
- "model.layers.31.self_attn.q_proj.weight": "model-00003-of-00003.safetensors",
241
- "model.layers.31.self_attn.v_proj.weight": "model-00003-of-00003.safetensors",
242
- "model.layers.4.input_layernorm.weight": "model-00001-of-00003.safetensors",
243
- "model.layers.4.mlp.down_proj.weight": "model-00001-of-00003.safetensors",
244
- "model.layers.4.mlp.gate_proj.weight": "model-00001-of-00003.safetensors",
245
- "model.layers.4.mlp.up_proj.weight": "model-00001-of-00003.safetensors",
246
- "model.layers.4.post_attention_layernorm.weight": "model-00001-of-00003.safetensors",
247
- "model.layers.4.self_attn.k_proj.weight": "model-00001-of-00003.safetensors",
248
- "model.layers.4.self_attn.o_proj.weight": "model-00001-of-00003.safetensors",
249
- "model.layers.4.self_attn.q_proj.weight": "model-00001-of-00003.safetensors",
250
- "model.layers.4.self_attn.v_proj.weight": "model-00001-of-00003.safetensors",
251
- "model.layers.5.input_layernorm.weight": "model-00001-of-00003.safetensors",
252
- "model.layers.5.mlp.down_proj.weight": "model-00001-of-00003.safetensors",
253
- "model.layers.5.mlp.gate_proj.weight": "model-00001-of-00003.safetensors",
254
- "model.layers.5.mlp.up_proj.weight": "model-00001-of-00003.safetensors",
255
- "model.layers.5.post_attention_layernorm.weight": "model-00001-of-00003.safetensors",
256
- "model.layers.5.self_attn.k_proj.weight": "model-00001-of-00003.safetensors",
257
- "model.layers.5.self_attn.o_proj.weight": "model-00001-of-00003.safetensors",
258
- "model.layers.5.self_attn.q_proj.weight": "model-00001-of-00003.safetensors",
259
- "model.layers.5.self_attn.v_proj.weight": "model-00001-of-00003.safetensors",
260
- "model.layers.6.input_layernorm.weight": "model-00001-of-00003.safetensors",
261
- "model.layers.6.mlp.down_proj.weight": "model-00001-of-00003.safetensors",
262
- "model.layers.6.mlp.gate_proj.weight": "model-00001-of-00003.safetensors",
263
- "model.layers.6.mlp.up_proj.weight": "model-00001-of-00003.safetensors",
264
- "model.layers.6.post_attention_layernorm.weight": "model-00001-of-00003.safetensors",
265
- "model.layers.6.self_attn.k_proj.weight": "model-00001-of-00003.safetensors",
266
- "model.layers.6.self_attn.o_proj.weight": "model-00001-of-00003.safetensors",
267
- "model.layers.6.self_attn.q_proj.weight": "model-00001-of-00003.safetensors",
268
- "model.layers.6.self_attn.v_proj.weight": "model-00001-of-00003.safetensors",
269
- "model.layers.7.input_layernorm.weight": "model-00001-of-00003.safetensors",
270
- "model.layers.7.mlp.down_proj.weight": "model-00001-of-00003.safetensors",
271
- "model.layers.7.mlp.gate_proj.weight": "model-00001-of-00003.safetensors",
272
- "model.layers.7.mlp.up_proj.weight": "model-00001-of-00003.safetensors",
273
- "model.layers.7.post_attention_layernorm.weight": "model-00001-of-00003.safetensors",
274
- "model.layers.7.self_attn.k_proj.weight": "model-00001-of-00003.safetensors",
275
- "model.layers.7.self_attn.o_proj.weight": "model-00001-of-00003.safetensors",
276
- "model.layers.7.self_attn.q_proj.weight": "model-00001-of-00003.safetensors",
277
- "model.layers.7.self_attn.v_proj.weight": "model-00001-of-00003.safetensors",
278
- "model.layers.8.input_layernorm.weight": "model-00001-of-00003.safetensors",
279
- "model.layers.8.mlp.down_proj.weight": "model-00001-of-00003.safetensors",
280
- "model.layers.8.mlp.gate_proj.weight": "model-00001-of-00003.safetensors",
281
- "model.layers.8.mlp.up_proj.weight": "model-00001-of-00003.safetensors",
282
- "model.layers.8.post_attention_layernorm.weight": "model-00001-of-00003.safetensors",
283
- "model.layers.8.self_attn.k_proj.weight": "model-00001-of-00003.safetensors",
284
- "model.layers.8.self_attn.o_proj.weight": "model-00001-of-00003.safetensors",
285
- "model.layers.8.self_attn.q_proj.weight": "model-00001-of-00003.safetensors",
286
- "model.layers.8.self_attn.v_proj.weight": "model-00001-of-00003.safetensors",
287
- "model.layers.9.input_layernorm.weight": "model-00001-of-00003.safetensors",
288
- "model.layers.9.mlp.down_proj.weight": "model-00001-of-00003.safetensors",
289
- "model.layers.9.mlp.gate_proj.weight": "model-00001-of-00003.safetensors",
290
- "model.layers.9.mlp.up_proj.weight": "model-00001-of-00003.safetensors",
291
- "model.layers.9.post_attention_layernorm.weight": "model-00001-of-00003.safetensors",
292
- "model.layers.9.self_attn.k_proj.weight": "model-00001-of-00003.safetensors",
293
- "model.layers.9.self_attn.o_proj.weight": "model-00001-of-00003.safetensors",
294
- "model.layers.9.self_attn.q_proj.weight": "model-00001-of-00003.safetensors",
295
- "model.layers.9.self_attn.v_proj.weight": "model-00001-of-00003.safetensors",
296
- "model.norm.weight": "model-00003-of-00003.safetensors"
297
  }
298
  }
 
1
  {
2
  "metadata": {
3
+ "total_size": 26953662464
4
  },
5
  "weight_map": {
6
+ "lm_head.weight": "model-00006-of-00006.safetensors",
7
+ "model.embed_tokens.weight": "model-00001-of-00006.safetensors",
8
+ "model.layers.0.input_layernorm.weight": "model-00001-of-00006.safetensors",
9
+ "model.layers.0.mlp.down_proj.weight": "model-00001-of-00006.safetensors",
10
+ "model.layers.0.mlp.gate_proj.weight": "model-00001-of-00006.safetensors",
11
+ "model.layers.0.mlp.up_proj.weight": "model-00001-of-00006.safetensors",
12
+ "model.layers.0.post_attention_layernorm.weight": "model-00001-of-00006.safetensors",
13
+ "model.layers.0.self_attn.k_proj.weight": "model-00001-of-00006.safetensors",
14
+ "model.layers.0.self_attn.o_proj.weight": "model-00001-of-00006.safetensors",
15
+ "model.layers.0.self_attn.q_proj.weight": "model-00001-of-00006.safetensors",
16
+ "model.layers.0.self_attn.v_proj.weight": "model-00001-of-00006.safetensors",
17
+ "model.layers.1.input_layernorm.weight": "model-00001-of-00006.safetensors",
18
+ "model.layers.1.mlp.down_proj.weight": "model-00001-of-00006.safetensors",
19
+ "model.layers.1.mlp.gate_proj.weight": "model-00001-of-00006.safetensors",
20
+ "model.layers.1.mlp.up_proj.weight": "model-00001-of-00006.safetensors",
21
+ "model.layers.1.post_attention_layernorm.weight": "model-00001-of-00006.safetensors",
22
+ "model.layers.1.self_attn.k_proj.weight": "model-00001-of-00006.safetensors",
23
+ "model.layers.1.self_attn.o_proj.weight": "model-00001-of-00006.safetensors",
24
+ "model.layers.1.self_attn.q_proj.weight": "model-00001-of-00006.safetensors",
25
+ "model.layers.1.self_attn.v_proj.weight": "model-00001-of-00006.safetensors",
26
+ "model.layers.10.input_layernorm.weight": "model-00002-of-00006.safetensors",
27
+ "model.layers.10.mlp.down_proj.weight": "model-00002-of-00006.safetensors",
28
+ "model.layers.10.mlp.gate_proj.weight": "model-00002-of-00006.safetensors",
29
+ "model.layers.10.mlp.up_proj.weight": "model-00002-of-00006.safetensors",
30
+ "model.layers.10.post_attention_layernorm.weight": "model-00002-of-00006.safetensors",
31
+ "model.layers.10.self_attn.k_proj.weight": "model-00002-of-00006.safetensors",
32
+ "model.layers.10.self_attn.o_proj.weight": "model-00002-of-00006.safetensors",
33
+ "model.layers.10.self_attn.q_proj.weight": "model-00002-of-00006.safetensors",
34
+ "model.layers.10.self_attn.v_proj.weight": "model-00002-of-00006.safetensors",
35
+ "model.layers.11.input_layernorm.weight": "model-00003-of-00006.safetensors",
36
+ "model.layers.11.mlp.down_proj.weight": "model-00003-of-00006.safetensors",
37
+ "model.layers.11.mlp.gate_proj.weight": "model-00003-of-00006.safetensors",
38
+ "model.layers.11.mlp.up_proj.weight": "model-00003-of-00006.safetensors",
39
+ "model.layers.11.post_attention_layernorm.weight": "model-00003-of-00006.safetensors",
40
+ "model.layers.11.self_attn.k_proj.weight": "model-00002-of-00006.safetensors",
41
+ "model.layers.11.self_attn.o_proj.weight": "model-00002-of-00006.safetensors",
42
+ "model.layers.11.self_attn.q_proj.weight": "model-00002-of-00006.safetensors",
43
+ "model.layers.11.self_attn.v_proj.weight": "model-00002-of-00006.safetensors",
44
+ "model.layers.12.input_layernorm.weight": "model-00003-of-00006.safetensors",
45
+ "model.layers.12.mlp.down_proj.weight": "model-00003-of-00006.safetensors",
46
+ "model.layers.12.mlp.gate_proj.weight": "model-00003-of-00006.safetensors",
47
+ "model.layers.12.mlp.up_proj.weight": "model-00003-of-00006.safetensors",
48
+ "model.layers.12.post_attention_layernorm.weight": "model-00003-of-00006.safetensors",
49
+ "model.layers.12.self_attn.k_proj.weight": "model-00003-of-00006.safetensors",
50
+ "model.layers.12.self_attn.o_proj.weight": "model-00003-of-00006.safetensors",
51
+ "model.layers.12.self_attn.q_proj.weight": "model-00003-of-00006.safetensors",
52
+ "model.layers.12.self_attn.v_proj.weight": "model-00003-of-00006.safetensors",
53
+ "model.layers.13.input_layernorm.weight": "model-00003-of-00006.safetensors",
54
+ "model.layers.13.mlp.down_proj.weight": "model-00003-of-00006.safetensors",
55
+ "model.layers.13.mlp.gate_proj.weight": "model-00003-of-00006.safetensors",
56
+ "model.layers.13.mlp.up_proj.weight": "model-00003-of-00006.safetensors",
57
+ "model.layers.13.post_attention_layernorm.weight": "model-00003-of-00006.safetensors",
58
+ "model.layers.13.self_attn.k_proj.weight": "model-00003-of-00006.safetensors",
59
+ "model.layers.13.self_attn.o_proj.weight": "model-00003-of-00006.safetensors",
60
+ "model.layers.13.self_attn.q_proj.weight": "model-00003-of-00006.safetensors",
61
+ "model.layers.13.self_attn.v_proj.weight": "model-00003-of-00006.safetensors",
62
+ "model.layers.14.input_layernorm.weight": "model-00003-of-00006.safetensors",
63
+ "model.layers.14.mlp.down_proj.weight": "model-00003-of-00006.safetensors",
64
+ "model.layers.14.mlp.gate_proj.weight": "model-00003-of-00006.safetensors",
65
+ "model.layers.14.mlp.up_proj.weight": "model-00003-of-00006.safetensors",
66
+ "model.layers.14.post_attention_layernorm.weight": "model-00003-of-00006.safetensors",
67
+ "model.layers.14.self_attn.k_proj.weight": "model-00003-of-00006.safetensors",
68
+ "model.layers.14.self_attn.o_proj.weight": "model-00003-of-00006.safetensors",
69
+ "model.layers.14.self_attn.q_proj.weight": "model-00003-of-00006.safetensors",
70
+ "model.layers.14.self_attn.v_proj.weight": "model-00003-of-00006.safetensors",
71
+ "model.layers.15.input_layernorm.weight": "model-00003-of-00006.safetensors",
72
+ "model.layers.15.mlp.down_proj.weight": "model-00003-of-00006.safetensors",
73
+ "model.layers.15.mlp.gate_proj.weight": "model-00003-of-00006.safetensors",
74
+ "model.layers.15.mlp.up_proj.weight": "model-00003-of-00006.safetensors",
75
+ "model.layers.15.post_attention_layernorm.weight": "model-00003-of-00006.safetensors",
76
+ "model.layers.15.self_attn.k_proj.weight": "model-00003-of-00006.safetensors",
77
+ "model.layers.15.self_attn.o_proj.weight": "model-00003-of-00006.safetensors",
78
+ "model.layers.15.self_attn.q_proj.weight": "model-00003-of-00006.safetensors",
79
+ "model.layers.15.self_attn.v_proj.weight": "model-00003-of-00006.safetensors",
80
+ "model.layers.16.input_layernorm.weight": "model-00003-of-00006.safetensors",
81
+ "model.layers.16.mlp.down_proj.weight": "model-00003-of-00006.safetensors",
82
+ "model.layers.16.mlp.gate_proj.weight": "model-00003-of-00006.safetensors",
83
+ "model.layers.16.mlp.up_proj.weight": "model-00003-of-00006.safetensors",
84
+ "model.layers.16.post_attention_layernorm.weight": "model-00003-of-00006.safetensors",
85
+ "model.layers.16.self_attn.k_proj.weight": "model-00003-of-00006.safetensors",
86
+ "model.layers.16.self_attn.o_proj.weight": "model-00003-of-00006.safetensors",
87
+ "model.layers.16.self_attn.q_proj.weight": "model-00003-of-00006.safetensors",
88
+ "model.layers.16.self_attn.v_proj.weight": "model-00003-of-00006.safetensors",
89
+ "model.layers.17.input_layernorm.weight": "model-00004-of-00006.safetensors",
90
+ "model.layers.17.mlp.down_proj.weight": "model-00004-of-00006.safetensors",
91
+ "model.layers.17.mlp.gate_proj.weight": "model-00004-of-00006.safetensors",
92
+ "model.layers.17.mlp.up_proj.weight": "model-00004-of-00006.safetensors",
93
+ "model.layers.17.post_attention_layernorm.weight": "model-00004-of-00006.safetensors",
94
+ "model.layers.17.self_attn.k_proj.weight": "model-00003-of-00006.safetensors",
95
+ "model.layers.17.self_attn.o_proj.weight": "model-00003-of-00006.safetensors",
96
+ "model.layers.17.self_attn.q_proj.weight": "model-00003-of-00006.safetensors",
97
+ "model.layers.17.self_attn.v_proj.weight": "model-00003-of-00006.safetensors",
98
+ "model.layers.18.input_layernorm.weight": "model-00004-of-00006.safetensors",
99
+ "model.layers.18.mlp.down_proj.weight": "model-00004-of-00006.safetensors",
100
+ "model.layers.18.mlp.gate_proj.weight": "model-00004-of-00006.safetensors",
101
+ "model.layers.18.mlp.up_proj.weight": "model-00004-of-00006.safetensors",
102
+ "model.layers.18.post_attention_layernorm.weight": "model-00004-of-00006.safetensors",
103
+ "model.layers.18.self_attn.k_proj.weight": "model-00004-of-00006.safetensors",
104
+ "model.layers.18.self_attn.o_proj.weight": "model-00004-of-00006.safetensors",
105
+ "model.layers.18.self_attn.q_proj.weight": "model-00004-of-00006.safetensors",
106
+ "model.layers.18.self_attn.v_proj.weight": "model-00004-of-00006.safetensors",
107
+ "model.layers.19.input_layernorm.weight": "model-00004-of-00006.safetensors",
108
+ "model.layers.19.mlp.down_proj.weight": "model-00004-of-00006.safetensors",
109
+ "model.layers.19.mlp.gate_proj.weight": "model-00004-of-00006.safetensors",
110
+ "model.layers.19.mlp.up_proj.weight": "model-00004-of-00006.safetensors",
111
+ "model.layers.19.post_attention_layernorm.weight": "model-00004-of-00006.safetensors",
112
+ "model.layers.19.self_attn.k_proj.weight": "model-00004-of-00006.safetensors",
113
+ "model.layers.19.self_attn.o_proj.weight": "model-00004-of-00006.safetensors",
114
+ "model.layers.19.self_attn.q_proj.weight": "model-00004-of-00006.safetensors",
115
+ "model.layers.19.self_attn.v_proj.weight": "model-00004-of-00006.safetensors",
116
+ "model.layers.2.input_layernorm.weight": "model-00001-of-00006.safetensors",
117
+ "model.layers.2.mlp.down_proj.weight": "model-00001-of-00006.safetensors",
118
+ "model.layers.2.mlp.gate_proj.weight": "model-00001-of-00006.safetensors",
119
+ "model.layers.2.mlp.up_proj.weight": "model-00001-of-00006.safetensors",
120
+ "model.layers.2.post_attention_layernorm.weight": "model-00001-of-00006.safetensors",
121
+ "model.layers.2.self_attn.k_proj.weight": "model-00001-of-00006.safetensors",
122
+ "model.layers.2.self_attn.o_proj.weight": "model-00001-of-00006.safetensors",
123
+ "model.layers.2.self_attn.q_proj.weight": "model-00001-of-00006.safetensors",
124
+ "model.layers.2.self_attn.v_proj.weight": "model-00001-of-00006.safetensors",
125
+ "model.layers.20.input_layernorm.weight": "model-00004-of-00006.safetensors",
126
+ "model.layers.20.mlp.down_proj.weight": "model-00004-of-00006.safetensors",
127
+ "model.layers.20.mlp.gate_proj.weight": "model-00004-of-00006.safetensors",
128
+ "model.layers.20.mlp.up_proj.weight": "model-00004-of-00006.safetensors",
129
+ "model.layers.20.post_attention_layernorm.weight": "model-00004-of-00006.safetensors",
130
+ "model.layers.20.self_attn.k_proj.weight": "model-00004-of-00006.safetensors",
131
+ "model.layers.20.self_attn.o_proj.weight": "model-00004-of-00006.safetensors",
132
+ "model.layers.20.self_attn.q_proj.weight": "model-00004-of-00006.safetensors",
133
+ "model.layers.20.self_attn.v_proj.weight": "model-00004-of-00006.safetensors",
134
+ "model.layers.21.input_layernorm.weight": "model-00004-of-00006.safetensors",
135
+ "model.layers.21.mlp.down_proj.weight": "model-00004-of-00006.safetensors",
136
+ "model.layers.21.mlp.gate_proj.weight": "model-00004-of-00006.safetensors",
137
+ "model.layers.21.mlp.up_proj.weight": "model-00004-of-00006.safetensors",
138
+ "model.layers.21.post_attention_layernorm.weight": "model-00004-of-00006.safetensors",
139
+ "model.layers.21.self_attn.k_proj.weight": "model-00004-of-00006.safetensors",
140
+ "model.layers.21.self_attn.o_proj.weight": "model-00004-of-00006.safetensors",
141
+ "model.layers.21.self_attn.q_proj.weight": "model-00004-of-00006.safetensors",
142
+ "model.layers.21.self_attn.v_proj.weight": "model-00004-of-00006.safetensors",
143
+ "model.layers.22.input_layernorm.weight": "model-00004-of-00006.safetensors",
144
+ "model.layers.22.mlp.down_proj.weight": "model-00004-of-00006.safetensors",
145
+ "model.layers.22.mlp.gate_proj.weight": "model-00004-of-00006.safetensors",
146
+ "model.layers.22.mlp.up_proj.weight": "model-00004-of-00006.safetensors",
147
+ "model.layers.22.post_attention_layernorm.weight": "model-00004-of-00006.safetensors",
148
+ "model.layers.22.self_attn.k_proj.weight": "model-00004-of-00006.safetensors",
149
+ "model.layers.22.self_attn.o_proj.weight": "model-00004-of-00006.safetensors",
150
+ "model.layers.22.self_attn.q_proj.weight": "model-00004-of-00006.safetensors",
151
+ "model.layers.22.self_attn.v_proj.weight": "model-00004-of-00006.safetensors",
152
+ "model.layers.23.input_layernorm.weight": "model-00005-of-00006.safetensors",
153
+ "model.layers.23.mlp.down_proj.weight": "model-00005-of-00006.safetensors",
154
+ "model.layers.23.mlp.gate_proj.weight": "model-00005-of-00006.safetensors",
155
+ "model.layers.23.mlp.up_proj.weight": "model-00005-of-00006.safetensors",
156
+ "model.layers.23.post_attention_layernorm.weight": "model-00005-of-00006.safetensors",
157
+ "model.layers.23.self_attn.k_proj.weight": "model-00004-of-00006.safetensors",
158
+ "model.layers.23.self_attn.o_proj.weight": "model-00004-of-00006.safetensors",
159
+ "model.layers.23.self_attn.q_proj.weight": "model-00004-of-00006.safetensors",
160
+ "model.layers.23.self_attn.v_proj.weight": "model-00004-of-00006.safetensors",
161
+ "model.layers.24.input_layernorm.weight": "model-00005-of-00006.safetensors",
162
+ "model.layers.24.mlp.down_proj.weight": "model-00005-of-00006.safetensors",
163
+ "model.layers.24.mlp.gate_proj.weight": "model-00005-of-00006.safetensors",
164
+ "model.layers.24.mlp.up_proj.weight": "model-00005-of-00006.safetensors",
165
+ "model.layers.24.post_attention_layernorm.weight": "model-00005-of-00006.safetensors",
166
+ "model.layers.24.self_attn.k_proj.weight": "model-00005-of-00006.safetensors",
167
+ "model.layers.24.self_attn.o_proj.weight": "model-00005-of-00006.safetensors",
168
+ "model.layers.24.self_attn.q_proj.weight": "model-00005-of-00006.safetensors",
169
+ "model.layers.24.self_attn.v_proj.weight": "model-00005-of-00006.safetensors",
170
+ "model.layers.25.input_layernorm.weight": "model-00005-of-00006.safetensors",
171
+ "model.layers.25.mlp.down_proj.weight": "model-00005-of-00006.safetensors",
172
+ "model.layers.25.mlp.gate_proj.weight": "model-00005-of-00006.safetensors",
173
+ "model.layers.25.mlp.up_proj.weight": "model-00005-of-00006.safetensors",
174
+ "model.layers.25.post_attention_layernorm.weight": "model-00005-of-00006.safetensors",
175
+ "model.layers.25.self_attn.k_proj.weight": "model-00005-of-00006.safetensors",
176
+ "model.layers.25.self_attn.o_proj.weight": "model-00005-of-00006.safetensors",
177
+ "model.layers.25.self_attn.q_proj.weight": "model-00005-of-00006.safetensors",
178
+ "model.layers.25.self_attn.v_proj.weight": "model-00005-of-00006.safetensors",
179
+ "model.layers.26.input_layernorm.weight": "model-00005-of-00006.safetensors",
180
+ "model.layers.26.mlp.down_proj.weight": "model-00005-of-00006.safetensors",
181
+ "model.layers.26.mlp.gate_proj.weight": "model-00005-of-00006.safetensors",
182
+ "model.layers.26.mlp.up_proj.weight": "model-00005-of-00006.safetensors",
183
+ "model.layers.26.post_attention_layernorm.weight": "model-00005-of-00006.safetensors",
184
+ "model.layers.26.self_attn.k_proj.weight": "model-00005-of-00006.safetensors",
185
+ "model.layers.26.self_attn.o_proj.weight": "model-00005-of-00006.safetensors",
186
+ "model.layers.26.self_attn.q_proj.weight": "model-00005-of-00006.safetensors",
187
+ "model.layers.26.self_attn.v_proj.weight": "model-00005-of-00006.safetensors",
188
+ "model.layers.27.input_layernorm.weight": "model-00005-of-00006.safetensors",
189
+ "model.layers.27.mlp.down_proj.weight": "model-00005-of-00006.safetensors",
190
+ "model.layers.27.mlp.gate_proj.weight": "model-00005-of-00006.safetensors",
191
+ "model.layers.27.mlp.up_proj.weight": "model-00005-of-00006.safetensors",
192
+ "model.layers.27.post_attention_layernorm.weight": "model-00005-of-00006.safetensors",
193
+ "model.layers.27.self_attn.k_proj.weight": "model-00005-of-00006.safetensors",
194
+ "model.layers.27.self_attn.o_proj.weight": "model-00005-of-00006.safetensors",
195
+ "model.layers.27.self_attn.q_proj.weight": "model-00005-of-00006.safetensors",
196
+ "model.layers.27.self_attn.v_proj.weight": "model-00005-of-00006.safetensors",
197
+ "model.layers.28.input_layernorm.weight": "model-00005-of-00006.safetensors",
198
+ "model.layers.28.mlp.down_proj.weight": "model-00005-of-00006.safetensors",
199
+ "model.layers.28.mlp.gate_proj.weight": "model-00005-of-00006.safetensors",
200
+ "model.layers.28.mlp.up_proj.weight": "model-00005-of-00006.safetensors",
201
+ "model.layers.28.post_attention_layernorm.weight": "model-00005-of-00006.safetensors",
202
+ "model.layers.28.self_attn.k_proj.weight": "model-00005-of-00006.safetensors",
203
+ "model.layers.28.self_attn.o_proj.weight": "model-00005-of-00006.safetensors",
204
+ "model.layers.28.self_attn.q_proj.weight": "model-00005-of-00006.safetensors",
205
+ "model.layers.28.self_attn.v_proj.weight": "model-00005-of-00006.safetensors",
206
+ "model.layers.29.input_layernorm.weight": "model-00006-of-00006.safetensors",
207
+ "model.layers.29.mlp.down_proj.weight": "model-00006-of-00006.safetensors",
208
+ "model.layers.29.mlp.gate_proj.weight": "model-00006-of-00006.safetensors",
209
+ "model.layers.29.mlp.up_proj.weight": "model-00006-of-00006.safetensors",
210
+ "model.layers.29.post_attention_layernorm.weight": "model-00006-of-00006.safetensors",
211
+ "model.layers.29.self_attn.k_proj.weight": "model-00005-of-00006.safetensors",
212
+ "model.layers.29.self_attn.o_proj.weight": "model-00005-of-00006.safetensors",
213
+ "model.layers.29.self_attn.q_proj.weight": "model-00005-of-00006.safetensors",
214
+ "model.layers.29.self_attn.v_proj.weight": "model-00005-of-00006.safetensors",
215
+ "model.layers.3.input_layernorm.weight": "model-00001-of-00006.safetensors",
216
+ "model.layers.3.mlp.down_proj.weight": "model-00001-of-00006.safetensors",
217
+ "model.layers.3.mlp.gate_proj.weight": "model-00001-of-00006.safetensors",
218
+ "model.layers.3.mlp.up_proj.weight": "model-00001-of-00006.safetensors",
219
+ "model.layers.3.post_attention_layernorm.weight": "model-00001-of-00006.safetensors",
220
+ "model.layers.3.self_attn.k_proj.weight": "model-00001-of-00006.safetensors",
221
+ "model.layers.3.self_attn.o_proj.weight": "model-00001-of-00006.safetensors",
222
+ "model.layers.3.self_attn.q_proj.weight": "model-00001-of-00006.safetensors",
223
+ "model.layers.3.self_attn.v_proj.weight": "model-00001-of-00006.safetensors",
224
+ "model.layers.30.input_layernorm.weight": "model-00006-of-00006.safetensors",
225
+ "model.layers.30.mlp.down_proj.weight": "model-00006-of-00006.safetensors",
226
+ "model.layers.30.mlp.gate_proj.weight": "model-00006-of-00006.safetensors",
227
+ "model.layers.30.mlp.up_proj.weight": "model-00006-of-00006.safetensors",
228
+ "model.layers.30.post_attention_layernorm.weight": "model-00006-of-00006.safetensors",
229
+ "model.layers.30.self_attn.k_proj.weight": "model-00006-of-00006.safetensors",
230
+ "model.layers.30.self_attn.o_proj.weight": "model-00006-of-00006.safetensors",
231
+ "model.layers.30.self_attn.q_proj.weight": "model-00006-of-00006.safetensors",
232
+ "model.layers.30.self_attn.v_proj.weight": "model-00006-of-00006.safetensors",
233
+ "model.layers.31.input_layernorm.weight": "model-00006-of-00006.safetensors",
234
+ "model.layers.31.mlp.down_proj.weight": "model-00006-of-00006.safetensors",
235
+ "model.layers.31.mlp.gate_proj.weight": "model-00006-of-00006.safetensors",
236
+ "model.layers.31.mlp.up_proj.weight": "model-00006-of-00006.safetensors",
237
+ "model.layers.31.post_attention_layernorm.weight": "model-00006-of-00006.safetensors",
238
+ "model.layers.31.self_attn.k_proj.weight": "model-00006-of-00006.safetensors",
239
+ "model.layers.31.self_attn.o_proj.weight": "model-00006-of-00006.safetensors",
240
+ "model.layers.31.self_attn.q_proj.weight": "model-00006-of-00006.safetensors",
241
+ "model.layers.31.self_attn.v_proj.weight": "model-00006-of-00006.safetensors",
242
+ "model.layers.4.input_layernorm.weight": "model-00001-of-00006.safetensors",
243
+ "model.layers.4.mlp.down_proj.weight": "model-00001-of-00006.safetensors",
244
+ "model.layers.4.mlp.gate_proj.weight": "model-00001-of-00006.safetensors",
245
+ "model.layers.4.mlp.up_proj.weight": "model-00001-of-00006.safetensors",
246
+ "model.layers.4.post_attention_layernorm.weight": "model-00001-of-00006.safetensors",
247
+ "model.layers.4.self_attn.k_proj.weight": "model-00001-of-00006.safetensors",
248
+ "model.layers.4.self_attn.o_proj.weight": "model-00001-of-00006.safetensors",
249
+ "model.layers.4.self_attn.q_proj.weight": "model-00001-of-00006.safetensors",
250
+ "model.layers.4.self_attn.v_proj.weight": "model-00001-of-00006.safetensors",
251
+ "model.layers.5.input_layernorm.weight": "model-00002-of-00006.safetensors",
252
+ "model.layers.5.mlp.down_proj.weight": "model-00002-of-00006.safetensors",
253
+ "model.layers.5.mlp.gate_proj.weight": "model-00002-of-00006.safetensors",
254
+ "model.layers.5.mlp.up_proj.weight": "model-00002-of-00006.safetensors",
255
+ "model.layers.5.post_attention_layernorm.weight": "model-00002-of-00006.safetensors",
256
+ "model.layers.5.self_attn.k_proj.weight": "model-00001-of-00006.safetensors",
257
+ "model.layers.5.self_attn.o_proj.weight": "model-00001-of-00006.safetensors",
258
+ "model.layers.5.self_attn.q_proj.weight": "model-00001-of-00006.safetensors",
259
+ "model.layers.5.self_attn.v_proj.weight": "model-00001-of-00006.safetensors",
260
+ "model.layers.6.input_layernorm.weight": "model-00002-of-00006.safetensors",
261
+ "model.layers.6.mlp.down_proj.weight": "model-00002-of-00006.safetensors",
262
+ "model.layers.6.mlp.gate_proj.weight": "model-00002-of-00006.safetensors",
263
+ "model.layers.6.mlp.up_proj.weight": "model-00002-of-00006.safetensors",
264
+ "model.layers.6.post_attention_layernorm.weight": "model-00002-of-00006.safetensors",
265
+ "model.layers.6.self_attn.k_proj.weight": "model-00002-of-00006.safetensors",
266
+ "model.layers.6.self_attn.o_proj.weight": "model-00002-of-00006.safetensors",
267
+ "model.layers.6.self_attn.q_proj.weight": "model-00002-of-00006.safetensors",
268
+ "model.layers.6.self_attn.v_proj.weight": "model-00002-of-00006.safetensors",
269
+ "model.layers.7.input_layernorm.weight": "model-00002-of-00006.safetensors",
270
+ "model.layers.7.mlp.down_proj.weight": "model-00002-of-00006.safetensors",
271
+ "model.layers.7.mlp.gate_proj.weight": "model-00002-of-00006.safetensors",
272
+ "model.layers.7.mlp.up_proj.weight": "model-00002-of-00006.safetensors",
273
+ "model.layers.7.post_attention_layernorm.weight": "model-00002-of-00006.safetensors",
274
+ "model.layers.7.self_attn.k_proj.weight": "model-00002-of-00006.safetensors",
275
+ "model.layers.7.self_attn.o_proj.weight": "model-00002-of-00006.safetensors",
276
+ "model.layers.7.self_attn.q_proj.weight": "model-00002-of-00006.safetensors",
277
+ "model.layers.7.self_attn.v_proj.weight": "model-00002-of-00006.safetensors",
278
+ "model.layers.8.input_layernorm.weight": "model-00002-of-00006.safetensors",
279
+ "model.layers.8.mlp.down_proj.weight": "model-00002-of-00006.safetensors",
280
+ "model.layers.8.mlp.gate_proj.weight": "model-00002-of-00006.safetensors",
281
+ "model.layers.8.mlp.up_proj.weight": "model-00002-of-00006.safetensors",
282
+ "model.layers.8.post_attention_layernorm.weight": "model-00002-of-00006.safetensors",
283
+ "model.layers.8.self_attn.k_proj.weight": "model-00002-of-00006.safetensors",
284
+ "model.layers.8.self_attn.o_proj.weight": "model-00002-of-00006.safetensors",
285
+ "model.layers.8.self_attn.q_proj.weight": "model-00002-of-00006.safetensors",
286
+ "model.layers.8.self_attn.v_proj.weight": "model-00002-of-00006.safetensors",
287
+ "model.layers.9.input_layernorm.weight": "model-00002-of-00006.safetensors",
288
+ "model.layers.9.mlp.down_proj.weight": "model-00002-of-00006.safetensors",
289
+ "model.layers.9.mlp.gate_proj.weight": "model-00002-of-00006.safetensors",
290
+ "model.layers.9.mlp.up_proj.weight": "model-00002-of-00006.safetensors",
291
+ "model.layers.9.post_attention_layernorm.weight": "model-00002-of-00006.safetensors",
292
+ "model.layers.9.self_attn.k_proj.weight": "model-00002-of-00006.safetensors",
293
+ "model.layers.9.self_attn.o_proj.weight": "model-00002-of-00006.safetensors",
294
+ "model.layers.9.self_attn.q_proj.weight": "model-00002-of-00006.safetensors",
295
+ "model.layers.9.self_attn.v_proj.weight": "model-00002-of-00006.safetensors",
296
+ "model.norm.weight": "model-00006-of-00006.safetensors"
297
  }
298
  }
truthfulqa_mc.json CHANGED
@@ -1,20 +1,20 @@
1
  {
2
  "results": {
3
  "truthfulqa_mc": {
4
- "mc1": 0.2631578947368421,
5
- "mc1_stderr": 0.015415241740237024,
6
- "mc2": 0.4225443383057251,
7
- "mc2_stderr": 0.014864051479547237
8
  }
9
  },
10
  "versions": {
11
  "truthfulqa_mc": 1
12
  },
13
  "config": {
14
- "model": "sparseml",
15
- "model_args": "pretrained=/cerebras/experiments/spft-retrained_sparse70_llama2_DATAUpdated_KDFalse_GCTrue_LR1e-4_E6-/combined/,trust_remote_code=True",
16
  "num_fewshot": 0,
17
- "batch_size": "64",
18
  "batch_sizes": [],
19
  "device": "cuda:0",
20
  "no_cache": true,
 
1
  {
2
  "results": {
3
  "truthfulqa_mc": {
4
+ "mc1": 0.2839657282741738,
5
+ "mc1_stderr": 0.015785370858396725,
6
+ "mc2": 0.4316910007581044,
7
+ "mc2_stderr": 0.014766457929501058
8
  }
9
  },
10
  "versions": {
11
  "truthfulqa_mc": 1
12
  },
13
  "config": {
14
+ "model": "hf",
15
+ "model_args": "pretrained=/nm/drive1/shubhra/cerebras/experiments/spft-cerebras_llama2_sparse50_45B_platypus_dolphin_KDFalse_GCTrue_LR1e-4_E2/combined/,trust_remote_code=True,dtype=bfloat16",
16
  "num_fewshot": 0,
17
+ "batch_size": "128",
18
  "batch_sizes": [],
19
  "device": "cuda:0",
20
  "no_cache": true,
winogrande.json CHANGED
@@ -1,20 +1,20 @@
1
  {
2
  "results": {
3
  "winogrande": {
4
- "acc": 0.7190213101815311,
5
- "acc_stderr": 0.012632541095875824
6
  }
7
  },
8
  "versions": {
9
  "winogrande": 0
10
  },
11
  "config": {
12
- "model": "sparseml",
13
- "model_args": "pretrained=/cerebras/experiments/spft-retrained_sparse70_llama2_DATAUpdated_KDFalse_GCTrue_LR1e-4_E6-/combined/,trust_remote_code=True",
14
  "num_fewshot": 5,
15
- "batch_size": "64",
16
  "batch_sizes": [],
17
- "device": null,
18
  "no_cache": true,
19
  "limit": null,
20
  "bootstrap_iters": 100000,
 
1
  {
2
  "results": {
3
  "winogrande": {
4
+ "acc": 0.7324388318863457,
5
+ "acc_stderr": 0.012441718456893009
6
  }
7
  },
8
  "versions": {
9
  "winogrande": 0
10
  },
11
  "config": {
12
+ "model": "hf",
13
+ "model_args": "pretrained=/nm/drive1/shubhra/cerebras/experiments/spft-cerebras_llama2_sparse50_45B_platypus_dolphin_KDFalse_GCTrue_LR1e-4_E2/combined/,trust_remote_code=True,dtype=bfloat16",
14
  "num_fewshot": 5,
15
+ "batch_size": "128",
16
  "batch_sizes": [],
17
+ "device": "cuda:0",
18
  "no_cache": true,
19
  "limit": null,
20
  "bootstrap_iters": 100000,