shubhrapandit committed on
Commit
e05bde7
1 Parent(s): d87d9ba

Update model files and evals

Browse files
arc_challenge.json CHANGED
@@ -1,20 +1,20 @@
1
  {
2
  "results": {
3
  "arc_challenge": {
4
- "acc": 0.48378839590443684,
5
- "acc_stderr": 0.014603708567414945,
6
- "acc_norm": 0.507679180887372,
7
- "acc_norm_stderr": 0.014609667440892574
8
  }
9
  },
10
  "versions": {
11
  "arc_challenge": 0
12
  },
13
  "config": {
14
- "model": "sparseml",
15
- "model_args": "pretrained=/nm/drive1/shubhra/cerebras/experiments/spft-retrained_sparse50_llama2_DATAUpdated_KDFalse_GCTrue_LR1e-4_E2-/combined/,trust_remote_code=True",
16
  "num_fewshot": 25,
17
- "batch_size": "16",
18
  "batch_sizes": [],
19
  "device": "cuda:0",
20
  "no_cache": true,
 
1
  {
2
  "results": {
3
  "arc_challenge": {
4
+ "acc": 0.4803754266211604,
5
+ "acc_stderr": 0.014600132075947094,
6
+ "acc_norm": 0.5025597269624573,
7
+ "acc_norm_stderr": 0.014611199329843784
8
  }
9
  },
10
  "versions": {
11
  "arc_challenge": 0
12
  },
13
  "config": {
14
+ "model": "hf",
15
+ "model_args": "pretrained=/nm/drive1/shubhra/cerebras/experiments/spft-cerebras_llama2_sparse50_45B_platypus_dolphin_KDFalse_GCTrue_LR1e-4_E2/combined/,trust_remote_code=True,dtype=bfloat16",
16
  "num_fewshot": 25,
17
+ "batch_size": "32",
18
  "batch_sizes": [],
19
  "device": "cuda:0",
20
  "no_cache": true,
config.json CHANGED
@@ -1,5 +1,5 @@
1
  {
2
- "_name_or_path": "neuralmagic/Llama-2-7b-pruned50-retrained-instruct",
3
  "architectures": [
4
  "LlamaForCausalLM"
5
  ],
@@ -22,8 +22,8 @@
22
  "rope_theta": 10000.0,
23
  "tie_word_embeddings": false,
24
  "tokenizer_class": "LlamaTokenizerFast",
25
- "torch_dtype": "bfloat16",
26
- "transformers_version": "4.40.0",
27
  "use_cache": true,
28
  "vocab_size": 32000
29
  }
 
1
  {
2
+ "_name_or_path": "/nm/drive1/shubhra/cerebras/llama2_7B_sparse50_45B_retrained/",
3
  "architectures": [
4
  "LlamaForCausalLM"
5
  ],
 
22
  "rope_theta": 10000.0,
23
  "tie_word_embeddings": false,
24
  "tokenizer_class": "LlamaTokenizerFast",
25
+ "torch_dtype": "float32",
26
+ "transformers_version": "4.39.3",
27
  "use_cache": true,
28
  "vocab_size": 32000
29
  }
generation_config.json CHANGED
@@ -2,5 +2,5 @@
2
  "_from_model_config": true,
3
  "bos_token_id": 1,
4
  "eos_token_id": 2,
5
- "transformers_version": "4.40.0"
6
  }
 
2
  "_from_model_config": true,
3
  "bos_token_id": 1,
4
  "eos_token_id": 2,
5
+ "transformers_version": "4.39.3"
6
  }
gsm8k.json CHANGED
@@ -1,18 +1,18 @@
1
  {
2
  "results": {
3
  "gsm8k": {
4
- "acc": 0.16376042456406367,
5
- "acc_stderr": 0.010193237214420945
6
  }
7
  },
8
  "versions": {
9
  "gsm8k": 0
10
  },
11
  "config": {
12
- "model": "sparseml",
13
- "model_args": "pretrained=/nm/drive1/shubhra/cerebras/experiments/spft-retrained_sparse50_llama2_DATAUpdated_KDFalse_GCTrue_LR1e-4_E2-/combined/,trust_remote_code=True",
14
  "num_fewshot": 5,
15
- "batch_size": "8",
16
  "batch_sizes": [],
17
  "device": "cuda:0",
18
  "no_cache": true,
 
1
  {
2
  "results": {
3
  "gsm8k": {
4
+ "acc": 0.18119787717968158,
5
+ "acc_stderr": 0.010609827611527357
6
  }
7
  },
8
  "versions": {
9
  "gsm8k": 0
10
  },
11
  "config": {
12
+ "model": "hf",
13
+ "model_args": "pretrained=/nm/drive1/shubhra/cerebras/experiments/spft-cerebras_llama2_sparse50_45B_platypus_dolphin_KDFalse_GCTrue_LR1e-4_E2/combined/,trust_remote_code=True,dtype=bfloat16",
14
  "num_fewshot": 5,
15
+ "batch_size": "128",
16
  "batch_sizes": [],
17
  "device": "cuda:0",
18
  "no_cache": true,
hellaswag.json CHANGED
@@ -1,20 +1,20 @@
1
  {
2
  "results": {
3
  "hellaswag": {
4
- "acc": 0.5931089424417447,
5
- "acc_stderr": 0.0049025025147385985,
6
- "acc_norm": 0.7885879306910973,
7
- "acc_norm_stderr": 0.004074754687134514
8
  }
9
  },
10
  "versions": {
11
  "hellaswag": 0
12
  },
13
  "config": {
14
- "model": "sparseml",
15
- "model_args": "pretrained=/nm/drive1/shubhra/cerebras/experiments/spft-retrained_sparse50_llama2_DATAUpdated_KDFalse_GCTrue_LR1e-4_E2-/combined/,trust_remote_code=True",
16
  "num_fewshot": 10,
17
- "batch_size": "16",
18
  "batch_sizes": [],
19
  "device": "cuda:0",
20
  "no_cache": true,
 
1
  {
2
  "results": {
3
  "hellaswag": {
4
+ "acc": 0.5884285998805019,
5
+ "acc_stderr": 0.004911125101064641,
6
+ "acc_norm": 0.784106751643099,
7
+ "acc_norm_stderr": 0.004105997149954853
8
  }
9
  },
10
  "versions": {
11
  "hellaswag": 0
12
  },
13
  "config": {
14
+ "model": "hf",
15
+ "model_args": "pretrained=/nm/drive1/shubhra/cerebras/experiments/spft-cerebras_llama2_sparse50_45B_platypus_dolphin_KDFalse_GCTrue_LR1e-4_E2/combined/,trust_remote_code=True,dtype=bfloat16",
16
  "num_fewshot": 10,
17
+ "batch_size": "48",
18
  "batch_sizes": [],
19
  "device": "cuda:0",
20
  "no_cache": true,
mmlu.json CHANGED
@@ -1,34 +1,34 @@
1
  {
2
  "results": {
3
  "hendrycksTest-abstract_algebra": {
4
- "acc": 0.27,
5
- "acc_stderr": 0.0446196043338474,
6
- "acc_norm": 0.27,
7
- "acc_norm_stderr": 0.0446196043338474
8
  },
9
  "hendrycksTest-anatomy": {
10
- "acc": 0.4444444444444444,
11
- "acc_stderr": 0.04292596718256981,
12
- "acc_norm": 0.4444444444444444,
13
- "acc_norm_stderr": 0.04292596718256981
14
  },
15
  "hendrycksTest-astronomy": {
16
- "acc": 0.4473684210526316,
17
- "acc_stderr": 0.04046336883978251,
18
- "acc_norm": 0.4473684210526316,
19
- "acc_norm_stderr": 0.04046336883978251
20
  },
21
  "hendrycksTest-business_ethics": {
22
- "acc": 0.43,
23
- "acc_stderr": 0.04975698519562428,
24
- "acc_norm": 0.43,
25
- "acc_norm_stderr": 0.04975698519562428
26
  },
27
  "hendrycksTest-clinical_knowledge": {
28
- "acc": 0.4528301886792453,
29
- "acc_stderr": 0.030635627957961823,
30
- "acc_norm": 0.4528301886792453,
31
- "acc_norm_stderr": 0.030635627957961823
32
  },
33
  "hendrycksTest-college_biology": {
34
  "acc": 0.4513888888888889,
@@ -37,88 +37,88 @@
37
  "acc_norm_stderr": 0.04161402398403279
38
  },
39
  "hendrycksTest-college_chemistry": {
 
 
 
 
 
 
40
  "acc": 0.37,
41
  "acc_stderr": 0.04852365870939099,
42
  "acc_norm": 0.37,
43
  "acc_norm_stderr": 0.04852365870939099
44
  },
45
- "hendrycksTest-college_computer_science": {
46
- "acc": 0.35,
47
- "acc_stderr": 0.0479372485441102,
48
- "acc_norm": 0.35,
49
- "acc_norm_stderr": 0.0479372485441102
50
- },
51
  "hendrycksTest-college_mathematics": {
52
- "acc": 0.23,
53
- "acc_stderr": 0.04229525846816508,
54
- "acc_norm": 0.23,
55
- "acc_norm_stderr": 0.04229525846816508
56
  },
57
  "hendrycksTest-college_medicine": {
58
- "acc": 0.3699421965317919,
59
- "acc_stderr": 0.036812296333943194,
60
- "acc_norm": 0.3699421965317919,
61
- "acc_norm_stderr": 0.036812296333943194
62
  },
63
  "hendrycksTest-college_physics": {
64
- "acc": 0.20588235294117646,
65
- "acc_stderr": 0.04023382273617746,
66
- "acc_norm": 0.20588235294117646,
67
- "acc_norm_stderr": 0.04023382273617746
68
  },
69
  "hendrycksTest-computer_security": {
70
- "acc": 0.59,
71
- "acc_stderr": 0.049431107042371025,
72
- "acc_norm": 0.59,
73
- "acc_norm_stderr": 0.049431107042371025
74
  },
75
  "hendrycksTest-conceptual_physics": {
76
- "acc": 0.39148936170212767,
77
- "acc_stderr": 0.03190701242326812,
78
- "acc_norm": 0.39148936170212767,
79
- "acc_norm_stderr": 0.03190701242326812
80
  },
81
  "hendrycksTest-econometrics": {
82
- "acc": 0.2719298245614035,
83
- "acc_stderr": 0.04185774424022056,
84
- "acc_norm": 0.2719298245614035,
85
- "acc_norm_stderr": 0.04185774424022056
86
  },
87
  "hendrycksTest-electrical_engineering": {
88
- "acc": 0.42758620689655175,
89
- "acc_stderr": 0.04122737111370333,
90
- "acc_norm": 0.42758620689655175,
91
- "acc_norm_stderr": 0.04122737111370333
92
  },
93
  "hendrycksTest-elementary_mathematics": {
94
- "acc": 0.3148148148148148,
95
- "acc_stderr": 0.02391998416404773,
96
- "acc_norm": 0.3148148148148148,
97
- "acc_norm_stderr": 0.02391998416404773
98
  },
99
  "hendrycksTest-formal_logic": {
100
- "acc": 0.2777777777777778,
101
- "acc_stderr": 0.04006168083848878,
102
- "acc_norm": 0.2777777777777778,
103
- "acc_norm_stderr": 0.04006168083848878
104
  },
105
  "hendrycksTest-global_facts": {
106
- "acc": 0.29,
107
- "acc_stderr": 0.045604802157206845,
108
- "acc_norm": 0.29,
109
- "acc_norm_stderr": 0.045604802157206845
110
  },
111
  "hendrycksTest-high_school_biology": {
112
- "acc": 0.5,
113
- "acc_stderr": 0.028444006199428714,
114
- "acc_norm": 0.5,
115
- "acc_norm_stderr": 0.028444006199428714
116
  },
117
  "hendrycksTest-high_school_chemistry": {
118
- "acc": 0.31527093596059114,
119
- "acc_stderr": 0.03269080871970186,
120
- "acc_norm": 0.31527093596059114,
121
- "acc_norm_stderr": 0.03269080871970186
122
  },
123
  "hendrycksTest-high_school_computer_science": {
124
  "acc": 0.44,
@@ -127,64 +127,64 @@
127
  "acc_norm_stderr": 0.04988876515698589
128
  },
129
  "hendrycksTest-high_school_european_history": {
130
- "acc": 0.593939393939394,
131
- "acc_stderr": 0.03834816355401181,
132
- "acc_norm": 0.593939393939394,
133
- "acc_norm_stderr": 0.03834816355401181
134
  },
135
  "hendrycksTest-high_school_geography": {
136
- "acc": 0.48484848484848486,
137
- "acc_stderr": 0.03560716516531061,
138
- "acc_norm": 0.48484848484848486,
139
- "acc_norm_stderr": 0.03560716516531061
140
  },
141
  "hendrycksTest-high_school_government_and_politics": {
142
- "acc": 0.6269430051813472,
143
- "acc_stderr": 0.03490205592048573,
144
- "acc_norm": 0.6269430051813472,
145
- "acc_norm_stderr": 0.03490205592048573
146
  },
147
  "hendrycksTest-high_school_macroeconomics": {
148
- "acc": 0.3871794871794872,
149
- "acc_stderr": 0.02469721693087894,
150
- "acc_norm": 0.3871794871794872,
151
- "acc_norm_stderr": 0.02469721693087894
152
  },
153
  "hendrycksTest-high_school_mathematics": {
154
- "acc": 0.2851851851851852,
155
- "acc_stderr": 0.027528599210340496,
156
- "acc_norm": 0.2851851851851852,
157
- "acc_norm_stderr": 0.027528599210340496
158
  },
159
  "hendrycksTest-high_school_microeconomics": {
160
- "acc": 0.42436974789915966,
161
- "acc_stderr": 0.032104790510157764,
162
- "acc_norm": 0.42436974789915966,
163
- "acc_norm_stderr": 0.032104790510157764
164
  },
165
  "hendrycksTest-high_school_physics": {
166
- "acc": 0.31788079470198677,
167
- "acc_stderr": 0.03802039760107903,
168
- "acc_norm": 0.31788079470198677,
169
- "acc_norm_stderr": 0.03802039760107903
170
  },
171
  "hendrycksTest-high_school_psychology": {
172
- "acc": 0.5743119266055046,
173
- "acc_stderr": 0.0211992359724708,
174
- "acc_norm": 0.5743119266055046,
175
- "acc_norm_stderr": 0.0211992359724708
176
  },
177
  "hendrycksTest-high_school_statistics": {
178
- "acc": 0.35648148148148145,
179
- "acc_stderr": 0.03266478331527272,
180
- "acc_norm": 0.35648148148148145,
181
- "acc_norm_stderr": 0.03266478331527272
182
  },
183
  "hendrycksTest-high_school_us_history": {
184
- "acc": 0.6029411764705882,
185
- "acc_stderr": 0.0343413116471913,
186
- "acc_norm": 0.6029411764705882,
187
- "acc_norm_stderr": 0.0343413116471913
188
  },
189
  "hendrycksTest-high_school_world_history": {
190
  "acc": 0.6666666666666666,
@@ -193,154 +193,154 @@
193
  "acc_norm_stderr": 0.0306858205966108
194
  },
195
  "hendrycksTest-human_aging": {
196
- "acc": 0.5426008968609866,
197
- "acc_stderr": 0.033435777055830646,
198
- "acc_norm": 0.5426008968609866,
199
- "acc_norm_stderr": 0.033435777055830646
200
  },
201
  "hendrycksTest-human_sexuality": {
202
- "acc": 0.4732824427480916,
203
- "acc_stderr": 0.04379024936553894,
204
- "acc_norm": 0.4732824427480916,
205
- "acc_norm_stderr": 0.04379024936553894
206
  },
207
  "hendrycksTest-international_law": {
208
- "acc": 0.6776859504132231,
209
- "acc_stderr": 0.04266416363352167,
210
- "acc_norm": 0.6776859504132231,
211
- "acc_norm_stderr": 0.04266416363352167
212
  },
213
  "hendrycksTest-jurisprudence": {
214
- "acc": 0.5370370370370371,
215
- "acc_stderr": 0.04820403072760627,
216
- "acc_norm": 0.5370370370370371,
217
- "acc_norm_stderr": 0.04820403072760627
218
  },
219
  "hendrycksTest-logical_fallacies": {
220
- "acc": 0.4601226993865031,
221
- "acc_stderr": 0.03915857291436971,
222
- "acc_norm": 0.4601226993865031,
223
- "acc_norm_stderr": 0.03915857291436971
224
  },
225
  "hendrycksTest-machine_learning": {
226
- "acc": 0.35714285714285715,
227
- "acc_stderr": 0.04547960999764376,
228
- "acc_norm": 0.35714285714285715,
229
- "acc_norm_stderr": 0.04547960999764376
230
  },
231
  "hendrycksTest-management": {
232
- "acc": 0.5825242718446602,
233
- "acc_stderr": 0.048828405482122375,
234
- "acc_norm": 0.5825242718446602,
235
- "acc_norm_stderr": 0.048828405482122375
236
  },
237
  "hendrycksTest-marketing": {
238
- "acc": 0.6367521367521367,
239
- "acc_stderr": 0.03150712523091264,
240
- "acc_norm": 0.6367521367521367,
241
- "acc_norm_stderr": 0.03150712523091264
242
  },
243
  "hendrycksTest-medical_genetics": {
244
- "acc": 0.43,
245
- "acc_stderr": 0.04975698519562428,
246
- "acc_norm": 0.43,
247
- "acc_norm_stderr": 0.04975698519562428
248
  },
249
  "hendrycksTest-miscellaneous": {
250
- "acc": 0.6245210727969349,
251
- "acc_stderr": 0.017316613197182786,
252
- "acc_norm": 0.6245210727969349,
253
- "acc_norm_stderr": 0.017316613197182786
254
  },
255
  "hendrycksTest-moral_disputes": {
256
- "acc": 0.5086705202312138,
257
- "acc_stderr": 0.026915047355369818,
258
- "acc_norm": 0.5086705202312138,
259
- "acc_norm_stderr": 0.026915047355369818
260
  },
261
  "hendrycksTest-moral_scenarios": {
262
- "acc": 0.24134078212290502,
263
- "acc_stderr": 0.014310999547961447,
264
- "acc_norm": 0.24134078212290502,
265
- "acc_norm_stderr": 0.014310999547961447
266
  },
267
  "hendrycksTest-nutrition": {
268
- "acc": 0.4869281045751634,
269
- "acc_stderr": 0.028620130800700246,
270
- "acc_norm": 0.4869281045751634,
271
- "acc_norm_stderr": 0.028620130800700246
272
  },
273
  "hendrycksTest-philosophy": {
274
- "acc": 0.5466237942122186,
275
- "acc_stderr": 0.028274359854894238,
276
- "acc_norm": 0.5466237942122186,
277
- "acc_norm_stderr": 0.028274359854894238
278
  },
279
  "hendrycksTest-prehistory": {
280
- "acc": 0.5061728395061729,
281
- "acc_stderr": 0.027818623962583295,
282
- "acc_norm": 0.5061728395061729,
283
- "acc_norm_stderr": 0.027818623962583295
284
  },
285
  "hendrycksTest-professional_accounting": {
286
- "acc": 0.3617021276595745,
287
- "acc_stderr": 0.0286638201471995,
288
- "acc_norm": 0.3617021276595745,
289
- "acc_norm_stderr": 0.0286638201471995
290
  },
291
  "hendrycksTest-professional_law": {
292
- "acc": 0.3741851368970013,
293
- "acc_stderr": 0.012359335618172061,
294
- "acc_norm": 0.3741851368970013,
295
- "acc_norm_stderr": 0.012359335618172061
296
  },
297
  "hendrycksTest-professional_medicine": {
298
- "acc": 0.45588235294117646,
299
- "acc_stderr": 0.030254372573976687,
300
- "acc_norm": 0.45588235294117646,
301
- "acc_norm_stderr": 0.030254372573976687
302
  },
303
  "hendrycksTest-professional_psychology": {
304
- "acc": 0.43300653594771243,
305
- "acc_stderr": 0.020045442473324227,
306
- "acc_norm": 0.43300653594771243,
307
- "acc_norm_stderr": 0.020045442473324227
308
  },
309
  "hendrycksTest-public_relations": {
310
- "acc": 0.4818181818181818,
311
- "acc_stderr": 0.04785964010794916,
312
- "acc_norm": 0.4818181818181818,
313
- "acc_norm_stderr": 0.04785964010794916
314
  },
315
  "hendrycksTest-security_studies": {
316
- "acc": 0.5020408163265306,
317
- "acc_stderr": 0.0320089533497105,
318
- "acc_norm": 0.5020408163265306,
319
- "acc_norm_stderr": 0.0320089533497105
320
  },
321
  "hendrycksTest-sociology": {
322
- "acc": 0.5671641791044776,
323
- "acc_stderr": 0.03503490923673281,
324
- "acc_norm": 0.5671641791044776,
325
- "acc_norm_stderr": 0.03503490923673281
326
  },
327
  "hendrycksTest-us_foreign_policy": {
328
- "acc": 0.7,
329
- "acc_stderr": 0.046056618647183814,
330
- "acc_norm": 0.7,
331
- "acc_norm_stderr": 0.046056618647183814
332
  },
333
  "hendrycksTest-virology": {
334
- "acc": 0.3795180722891566,
335
- "acc_stderr": 0.03777798822748018,
336
- "acc_norm": 0.3795180722891566,
337
- "acc_norm_stderr": 0.03777798822748018
338
  },
339
  "hendrycksTest-world_religions": {
340
- "acc": 0.6783625730994152,
341
- "acc_stderr": 0.03582529442573122,
342
- "acc_norm": 0.6783625730994152,
343
- "acc_norm_stderr": 0.03582529442573122
344
  }
345
  },
346
  "versions": {
@@ -403,10 +403,10 @@
403
  "hendrycksTest-world_religions": 1
404
  },
405
  "config": {
406
- "model": "sparseml",
407
- "model_args": "pretrained=/nm/drive1/shubhra/cerebras/experiments/spft-retrained_sparse50_llama2_DATAUpdated_KDFalse_GCTrue_LR1e-4_E2-/combined/,trust_remote_code=True",
408
  "num_fewshot": 5,
409
- "batch_size": "6",
410
  "batch_sizes": [],
411
  "device": "cuda:0",
412
  "no_cache": true,
 
1
  {
2
  "results": {
3
  "hendrycksTest-abstract_algebra": {
4
+ "acc": 0.25,
5
+ "acc_stderr": 0.04351941398892446,
6
+ "acc_norm": 0.25,
7
+ "acc_norm_stderr": 0.04351941398892446
8
  },
9
  "hendrycksTest-anatomy": {
10
+ "acc": 0.4666666666666667,
11
+ "acc_stderr": 0.043097329010363554,
12
+ "acc_norm": 0.4666666666666667,
13
+ "acc_norm_stderr": 0.043097329010363554
14
  },
15
  "hendrycksTest-astronomy": {
16
+ "acc": 0.5263157894736842,
17
+ "acc_stderr": 0.04063302731486671,
18
+ "acc_norm": 0.5263157894736842,
19
+ "acc_norm_stderr": 0.04063302731486671
20
  },
21
  "hendrycksTest-business_ethics": {
22
+ "acc": 0.52,
23
+ "acc_stderr": 0.050211673156867795,
24
+ "acc_norm": 0.52,
25
+ "acc_norm_stderr": 0.050211673156867795
26
  },
27
  "hendrycksTest-clinical_knowledge": {
28
+ "acc": 0.4867924528301887,
29
+ "acc_stderr": 0.030762134874500482,
30
+ "acc_norm": 0.4867924528301887,
31
+ "acc_norm_stderr": 0.030762134874500482
32
  },
33
  "hendrycksTest-college_biology": {
34
  "acc": 0.4513888888888889,
 
37
  "acc_norm_stderr": 0.04161402398403279
38
  },
39
  "hendrycksTest-college_chemistry": {
40
+ "acc": 0.33,
41
+ "acc_stderr": 0.047258156262526045,
42
+ "acc_norm": 0.33,
43
+ "acc_norm_stderr": 0.047258156262526045
44
+ },
45
+ "hendrycksTest-college_computer_science": {
46
  "acc": 0.37,
47
  "acc_stderr": 0.04852365870939099,
48
  "acc_norm": 0.37,
49
  "acc_norm_stderr": 0.04852365870939099
50
  },
 
 
 
 
 
 
51
  "hendrycksTest-college_mathematics": {
52
+ "acc": 0.27,
53
+ "acc_stderr": 0.044619604333847394,
54
+ "acc_norm": 0.27,
55
+ "acc_norm_stderr": 0.044619604333847394
56
  },
57
  "hendrycksTest-college_medicine": {
58
+ "acc": 0.3988439306358382,
59
+ "acc_stderr": 0.03733626655383509,
60
+ "acc_norm": 0.3988439306358382,
61
+ "acc_norm_stderr": 0.03733626655383509
62
  },
63
  "hendrycksTest-college_physics": {
64
+ "acc": 0.23529411764705882,
65
+ "acc_stderr": 0.042207736591714534,
66
+ "acc_norm": 0.23529411764705882,
67
+ "acc_norm_stderr": 0.042207736591714534
68
  },
69
  "hendrycksTest-computer_security": {
70
+ "acc": 0.62,
71
+ "acc_stderr": 0.048783173121456316,
72
+ "acc_norm": 0.62,
73
+ "acc_norm_stderr": 0.048783173121456316
74
  },
75
  "hendrycksTest-conceptual_physics": {
76
+ "acc": 0.3659574468085106,
77
+ "acc_stderr": 0.03148955829745529,
78
+ "acc_norm": 0.3659574468085106,
79
+ "acc_norm_stderr": 0.03148955829745529
80
  },
81
  "hendrycksTest-econometrics": {
82
+ "acc": 0.2807017543859649,
83
+ "acc_stderr": 0.042270544512322,
84
+ "acc_norm": 0.2807017543859649,
85
+ "acc_norm_stderr": 0.042270544512322
86
  },
87
  "hendrycksTest-electrical_engineering": {
88
+ "acc": 0.41379310344827586,
89
+ "acc_stderr": 0.04104269211806232,
90
+ "acc_norm": 0.41379310344827586,
91
+ "acc_norm_stderr": 0.04104269211806232
92
  },
93
  "hendrycksTest-elementary_mathematics": {
94
+ "acc": 0.2962962962962963,
95
+ "acc_stderr": 0.023517294335963283,
96
+ "acc_norm": 0.2962962962962963,
97
+ "acc_norm_stderr": 0.023517294335963283
98
  },
99
  "hendrycksTest-formal_logic": {
100
+ "acc": 0.2619047619047619,
101
+ "acc_stderr": 0.0393253768039287,
102
+ "acc_norm": 0.2619047619047619,
103
+ "acc_norm_stderr": 0.0393253768039287
104
  },
105
  "hendrycksTest-global_facts": {
106
+ "acc": 0.34,
107
+ "acc_stderr": 0.047609522856952365,
108
+ "acc_norm": 0.34,
109
+ "acc_norm_stderr": 0.047609522856952365
110
  },
111
  "hendrycksTest-high_school_biology": {
112
+ "acc": 0.5193548387096775,
113
+ "acc_stderr": 0.028422687404312107,
114
+ "acc_norm": 0.5193548387096775,
115
+ "acc_norm_stderr": 0.028422687404312107
116
  },
117
  "hendrycksTest-high_school_chemistry": {
118
+ "acc": 0.3448275862068966,
119
+ "acc_stderr": 0.03344283744280458,
120
+ "acc_norm": 0.3448275862068966,
121
+ "acc_norm_stderr": 0.03344283744280458
122
  },
123
  "hendrycksTest-high_school_computer_science": {
124
  "acc": 0.44,
 
127
  "acc_norm_stderr": 0.04988876515698589
128
  },
129
  "hendrycksTest-high_school_european_history": {
130
+ "acc": 0.5696969696969697,
131
+ "acc_stderr": 0.03866225962879077,
132
+ "acc_norm": 0.5696969696969697,
133
+ "acc_norm_stderr": 0.03866225962879077
134
  },
135
  "hendrycksTest-high_school_geography": {
136
+ "acc": 0.5050505050505051,
137
+ "acc_stderr": 0.035621707606254015,
138
+ "acc_norm": 0.5050505050505051,
139
+ "acc_norm_stderr": 0.035621707606254015
140
  },
141
  "hendrycksTest-high_school_government_and_politics": {
142
+ "acc": 0.6528497409326425,
143
+ "acc_stderr": 0.03435696168361356,
144
+ "acc_norm": 0.6528497409326425,
145
+ "acc_norm_stderr": 0.03435696168361356
146
  },
147
  "hendrycksTest-high_school_macroeconomics": {
148
+ "acc": 0.40512820512820513,
149
+ "acc_stderr": 0.024890471769938145,
150
+ "acc_norm": 0.40512820512820513,
151
+ "acc_norm_stderr": 0.024890471769938145
152
  },
153
  "hendrycksTest-high_school_mathematics": {
154
+ "acc": 0.2518518518518518,
155
+ "acc_stderr": 0.026466117538959916,
156
+ "acc_norm": 0.2518518518518518,
157
+ "acc_norm_stderr": 0.026466117538959916
158
  },
159
  "hendrycksTest-high_school_microeconomics": {
160
+ "acc": 0.42857142857142855,
161
+ "acc_stderr": 0.032145368597886394,
162
+ "acc_norm": 0.42857142857142855,
163
+ "acc_norm_stderr": 0.032145368597886394
164
  },
165
  "hendrycksTest-high_school_physics": {
166
+ "acc": 0.271523178807947,
167
+ "acc_stderr": 0.03631329803969654,
168
+ "acc_norm": 0.271523178807947,
169
+ "acc_norm_stderr": 0.03631329803969654
170
  },
171
  "hendrycksTest-high_school_psychology": {
172
+ "acc": 0.6,
173
+ "acc_stderr": 0.021004201260420078,
174
+ "acc_norm": 0.6,
175
+ "acc_norm_stderr": 0.021004201260420078
176
  },
177
  "hendrycksTest-high_school_statistics": {
178
+ "acc": 0.37962962962962965,
179
+ "acc_stderr": 0.03309682581119035,
180
+ "acc_norm": 0.37962962962962965,
181
+ "acc_norm_stderr": 0.03309682581119035
182
  },
183
  "hendrycksTest-high_school_us_history": {
184
+ "acc": 0.5882352941176471,
185
+ "acc_stderr": 0.03454236585380608,
186
+ "acc_norm": 0.5882352941176471,
187
+ "acc_norm_stderr": 0.03454236585380608
188
  },
189
  "hendrycksTest-high_school_world_history": {
190
  "acc": 0.6666666666666666,
 
193
  "acc_norm_stderr": 0.0306858205966108
194
  },
195
  "hendrycksTest-human_aging": {
196
+ "acc": 0.57847533632287,
197
+ "acc_stderr": 0.033141902221106564,
198
+ "acc_norm": 0.57847533632287,
199
+ "acc_norm_stderr": 0.033141902221106564
200
  },
201
  "hendrycksTest-human_sexuality": {
202
+ "acc": 0.549618320610687,
203
+ "acc_stderr": 0.04363643698524779,
204
+ "acc_norm": 0.549618320610687,
205
+ "acc_norm_stderr": 0.04363643698524779
206
  },
207
  "hendrycksTest-international_law": {
208
+ "acc": 0.6859504132231405,
209
+ "acc_stderr": 0.04236964753041018,
210
+ "acc_norm": 0.6859504132231405,
211
+ "acc_norm_stderr": 0.04236964753041018
212
  },
213
  "hendrycksTest-jurisprudence": {
214
+ "acc": 0.5185185185185185,
215
+ "acc_stderr": 0.04830366024635331,
216
+ "acc_norm": 0.5185185185185185,
217
+ "acc_norm_stderr": 0.04830366024635331
218
  },
219
  "hendrycksTest-logical_fallacies": {
220
+ "acc": 0.49079754601226994,
221
+ "acc_stderr": 0.03927705600787443,
222
+ "acc_norm": 0.49079754601226994,
223
+ "acc_norm_stderr": 0.03927705600787443
224
  },
225
  "hendrycksTest-machine_learning": {
226
+ "acc": 0.33035714285714285,
227
+ "acc_stderr": 0.04464285714285714,
228
+ "acc_norm": 0.33035714285714285,
229
+ "acc_norm_stderr": 0.04464285714285714
230
  },
231
  "hendrycksTest-management": {
232
+ "acc": 0.6116504854368932,
233
+ "acc_stderr": 0.048257293373563895,
234
+ "acc_norm": 0.6116504854368932,
235
+ "acc_norm_stderr": 0.048257293373563895
236
  },
237
  "hendrycksTest-marketing": {
238
+ "acc": 0.6538461538461539,
239
+ "acc_stderr": 0.0311669573672359,
240
+ "acc_norm": 0.6538461538461539,
241
+ "acc_norm_stderr": 0.0311669573672359
242
  },
243
  "hendrycksTest-medical_genetics": {
244
+ "acc": 0.45,
245
+ "acc_stderr": 0.05,
246
+ "acc_norm": 0.45,
247
+ "acc_norm_stderr": 0.05
248
  },
249
  "hendrycksTest-miscellaneous": {
250
+ "acc": 0.6475095785440613,
251
+ "acc_stderr": 0.01708415024408138,
252
+ "acc_norm": 0.6475095785440613,
253
+ "acc_norm_stderr": 0.01708415024408138
254
  },
255
  "hendrycksTest-moral_disputes": {
256
+ "acc": 0.5346820809248555,
257
+ "acc_stderr": 0.026854257928258875,
258
+ "acc_norm": 0.5346820809248555,
259
+ "acc_norm_stderr": 0.026854257928258875
260
  },
261
  "hendrycksTest-moral_scenarios": {
262
+ "acc": 0.2837988826815642,
263
+ "acc_stderr": 0.015078358970751764,
264
+ "acc_norm": 0.2837988826815642,
265
+ "acc_norm_stderr": 0.015078358970751764
266
  },
267
  "hendrycksTest-nutrition": {
268
+ "acc": 0.48366013071895425,
269
+ "acc_stderr": 0.028614624752805413,
270
+ "acc_norm": 0.48366013071895425,
271
+ "acc_norm_stderr": 0.028614624752805413
272
  },
273
  "hendrycksTest-philosophy": {
274
+ "acc": 0.5562700964630225,
275
+ "acc_stderr": 0.02821768355665231,
276
+ "acc_norm": 0.5562700964630225,
277
+ "acc_norm_stderr": 0.02821768355665231
278
  },
279
  "hendrycksTest-prehistory": {
280
+ "acc": 0.5370370370370371,
281
+ "acc_stderr": 0.027744313443376536,
282
+ "acc_norm": 0.5370370370370371,
283
+ "acc_norm_stderr": 0.027744313443376536
284
  },
285
  "hendrycksTest-professional_accounting": {
286
+ "acc": 0.36524822695035464,
287
+ "acc_stderr": 0.028723863853281285,
288
+ "acc_norm": 0.36524822695035464,
289
+ "acc_norm_stderr": 0.028723863853281285
290
  },
291
  "hendrycksTest-professional_law": {
292
+ "acc": 0.37614080834419816,
293
+ "acc_stderr": 0.012372214430599816,
294
+ "acc_norm": 0.37614080834419816,
295
+ "acc_norm_stderr": 0.012372214430599816
296
  },
297
  "hendrycksTest-professional_medicine": {
298
+ "acc": 0.44485294117647056,
299
+ "acc_stderr": 0.030187532060329387,
300
+ "acc_norm": 0.44485294117647056,
301
+ "acc_norm_stderr": 0.030187532060329387
302
  },
303
  "hendrycksTest-professional_psychology": {
304
+ "acc": 0.4411764705882353,
305
+ "acc_stderr": 0.020087362076702857,
306
+ "acc_norm": 0.4411764705882353,
307
+ "acc_norm_stderr": 0.020087362076702857
308
  },
309
  "hendrycksTest-public_relations": {
310
+ "acc": 0.509090909090909,
311
+ "acc_stderr": 0.0478833976870286,
312
+ "acc_norm": 0.509090909090909,
313
+ "acc_norm_stderr": 0.0478833976870286
314
  },
315
  "hendrycksTest-security_studies": {
316
+ "acc": 0.5224489795918368,
317
+ "acc_stderr": 0.031976941187136725,
318
+ "acc_norm": 0.5224489795918368,
319
+ "acc_norm_stderr": 0.031976941187136725
320
  },
321
  "hendrycksTest-sociology": {
322
+ "acc": 0.6119402985074627,
323
+ "acc_stderr": 0.0344578996436275,
324
+ "acc_norm": 0.6119402985074627,
325
+ "acc_norm_stderr": 0.0344578996436275
326
  },
327
  "hendrycksTest-us_foreign_policy": {
328
+ "acc": 0.73,
329
+ "acc_stderr": 0.0446196043338474,
330
+ "acc_norm": 0.73,
331
+ "acc_norm_stderr": 0.0446196043338474
332
  },
333
  "hendrycksTest-virology": {
334
+ "acc": 0.39759036144578314,
335
+ "acc_stderr": 0.038099730845402184,
336
+ "acc_norm": 0.39759036144578314,
337
+ "acc_norm_stderr": 0.038099730845402184
338
  },
339
  "hendrycksTest-world_religions": {
340
+ "acc": 0.7076023391812866,
341
+ "acc_stderr": 0.03488647713457922,
342
+ "acc_norm": 0.7076023391812866,
343
+ "acc_norm_stderr": 0.03488647713457922
344
  }
345
  },
346
  "versions": {
 
403
  "hendrycksTest-world_religions": 1
404
  },
405
  "config": {
406
+ "model": "hf",
407
+ "model_args": "pretrained=/nm/drive1/shubhra/cerebras/experiments/spft-cerebras_llama2_sparse50_45B_platypus_dolphin_KDFalse_GCTrue_LR1e-4_E2/combined/,trust_remote_code=True,dtype=bfloat16",
408
  "num_fewshot": 5,
409
+ "batch_size": "16",
410
  "batch_sizes": [],
411
  "device": "cuda:0",
412
  "no_cache": true,
model-00001-of-00003.safetensors → model-00001-of-00006.safetensors RENAMED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:2d2fe7879171b4321ff52544971723aa65d51ada176478b69173b19d194d8960
3
- size 4938985352
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:cc6ab460d335ac2b3d3e5752150098cf80eeff0910e1e1849ef9d079d6b4494e
3
+ size 4840396416
model-00002-of-00003.safetensors → model-00002-of-00006.safetensors RENAMED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:7cf60185456d79839e30277ed84c856a2e857ab728b1ed825696097aaf8cdc0e
3
- size 4947390880
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:2fe349b4c785be7e5fff726b688ea6461116bae4ec1ee0c32dce2b8f94f6fa2f
3
+ size 4857206856
model-00003-of-00003.safetensors → model-00003-of-00006.safetensors RENAMED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:41ab9c2be4a3262d6f1ea90ca6b945b47e0f0fc6b6e2056e0bcfdf17ddee9140
3
- size 3590488816
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:beca86c80d997da03ebaa699a361d53ebd11b9a133a3bf29d39c04434f65ffff
3
+ size 4857206904
model-00004-of-00006.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:def437a470e94ef337a8b9c0a008320f0610f40cdbc29e097cc3f2111958a214
3
+ size 4857206904
model-00005-of-00006.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:627a6137a9269e9b26dae8c3596d252aed6a317e295b244291278f4100a523c9
3
+ size 4857206904
model-00006-of-00006.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:6e1211f02d8f8de62a565671261d73f598e3a1b4363ffd9e46f20a3b8ca1e3f4
3
+ size 2684472112
model.safetensors.index.json CHANGED
@@ -1,298 +1,298 @@
1
  {
2
  "metadata": {
3
- "total_size": 13476831232
4
  },
5
  "weight_map": {
6
- "lm_head.weight": "model-00003-of-00003.safetensors",
7
- "model.embed_tokens.weight": "model-00001-of-00003.safetensors",
8
- "model.layers.0.input_layernorm.weight": "model-00001-of-00003.safetensors",
9
- "model.layers.0.mlp.down_proj.weight": "model-00001-of-00003.safetensors",
10
- "model.layers.0.mlp.gate_proj.weight": "model-00001-of-00003.safetensors",
11
- "model.layers.0.mlp.up_proj.weight": "model-00001-of-00003.safetensors",
12
- "model.layers.0.post_attention_layernorm.weight": "model-00001-of-00003.safetensors",
13
- "model.layers.0.self_attn.k_proj.weight": "model-00001-of-00003.safetensors",
14
- "model.layers.0.self_attn.o_proj.weight": "model-00001-of-00003.safetensors",
15
- "model.layers.0.self_attn.q_proj.weight": "model-00001-of-00003.safetensors",
16
- "model.layers.0.self_attn.v_proj.weight": "model-00001-of-00003.safetensors",
17
- "model.layers.1.input_layernorm.weight": "model-00001-of-00003.safetensors",
18
- "model.layers.1.mlp.down_proj.weight": "model-00001-of-00003.safetensors",
19
- "model.layers.1.mlp.gate_proj.weight": "model-00001-of-00003.safetensors",
20
- "model.layers.1.mlp.up_proj.weight": "model-00001-of-00003.safetensors",
21
- "model.layers.1.post_attention_layernorm.weight": "model-00001-of-00003.safetensors",
22
- "model.layers.1.self_attn.k_proj.weight": "model-00001-of-00003.safetensors",
23
- "model.layers.1.self_attn.o_proj.weight": "model-00001-of-00003.safetensors",
24
- "model.layers.1.self_attn.q_proj.weight": "model-00001-of-00003.safetensors",
25
- "model.layers.1.self_attn.v_proj.weight": "model-00001-of-00003.safetensors",
26
- "model.layers.10.input_layernorm.weight": "model-00001-of-00003.safetensors",
27
- "model.layers.10.mlp.down_proj.weight": "model-00001-of-00003.safetensors",
28
- "model.layers.10.mlp.gate_proj.weight": "model-00001-of-00003.safetensors",
29
- "model.layers.10.mlp.up_proj.weight": "model-00001-of-00003.safetensors",
30
- "model.layers.10.post_attention_layernorm.weight": "model-00001-of-00003.safetensors",
31
- "model.layers.10.self_attn.k_proj.weight": "model-00001-of-00003.safetensors",
32
- "model.layers.10.self_attn.o_proj.weight": "model-00001-of-00003.safetensors",
33
- "model.layers.10.self_attn.q_proj.weight": "model-00001-of-00003.safetensors",
34
- "model.layers.10.self_attn.v_proj.weight": "model-00001-of-00003.safetensors",
35
- "model.layers.11.input_layernorm.weight": "model-00002-of-00003.safetensors",
36
- "model.layers.11.mlp.down_proj.weight": "model-00002-of-00003.safetensors",
37
- "model.layers.11.mlp.gate_proj.weight": "model-00001-of-00003.safetensors",
38
- "model.layers.11.mlp.up_proj.weight": "model-00002-of-00003.safetensors",
39
- "model.layers.11.post_attention_layernorm.weight": "model-00002-of-00003.safetensors",
40
- "model.layers.11.self_attn.k_proj.weight": "model-00001-of-00003.safetensors",
41
- "model.layers.11.self_attn.o_proj.weight": "model-00001-of-00003.safetensors",
42
- "model.layers.11.self_attn.q_proj.weight": "model-00001-of-00003.safetensors",
43
- "model.layers.11.self_attn.v_proj.weight": "model-00001-of-00003.safetensors",
44
- "model.layers.12.input_layernorm.weight": "model-00002-of-00003.safetensors",
45
- "model.layers.12.mlp.down_proj.weight": "model-00002-of-00003.safetensors",
46
- "model.layers.12.mlp.gate_proj.weight": "model-00002-of-00003.safetensors",
47
- "model.layers.12.mlp.up_proj.weight": "model-00002-of-00003.safetensors",
48
- "model.layers.12.post_attention_layernorm.weight": "model-00002-of-00003.safetensors",
49
- "model.layers.12.self_attn.k_proj.weight": "model-00002-of-00003.safetensors",
50
- "model.layers.12.self_attn.o_proj.weight": "model-00002-of-00003.safetensors",
51
- "model.layers.12.self_attn.q_proj.weight": "model-00002-of-00003.safetensors",
52
- "model.layers.12.self_attn.v_proj.weight": "model-00002-of-00003.safetensors",
53
- "model.layers.13.input_layernorm.weight": "model-00002-of-00003.safetensors",
54
- "model.layers.13.mlp.down_proj.weight": "model-00002-of-00003.safetensors",
55
- "model.layers.13.mlp.gate_proj.weight": "model-00002-of-00003.safetensors",
56
- "model.layers.13.mlp.up_proj.weight": "model-00002-of-00003.safetensors",
57
- "model.layers.13.post_attention_layernorm.weight": "model-00002-of-00003.safetensors",
58
- "model.layers.13.self_attn.k_proj.weight": "model-00002-of-00003.safetensors",
59
- "model.layers.13.self_attn.o_proj.weight": "model-00002-of-00003.safetensors",
60
- "model.layers.13.self_attn.q_proj.weight": "model-00002-of-00003.safetensors",
61
- "model.layers.13.self_attn.v_proj.weight": "model-00002-of-00003.safetensors",
62
- "model.layers.14.input_layernorm.weight": "model-00002-of-00003.safetensors",
63
- "model.layers.14.mlp.down_proj.weight": "model-00002-of-00003.safetensors",
64
- "model.layers.14.mlp.gate_proj.weight": "model-00002-of-00003.safetensors",
65
- "model.layers.14.mlp.up_proj.weight": "model-00002-of-00003.safetensors",
66
- "model.layers.14.post_attention_layernorm.weight": "model-00002-of-00003.safetensors",
67
- "model.layers.14.self_attn.k_proj.weight": "model-00002-of-00003.safetensors",
68
- "model.layers.14.self_attn.o_proj.weight": "model-00002-of-00003.safetensors",
69
- "model.layers.14.self_attn.q_proj.weight": "model-00002-of-00003.safetensors",
70
- "model.layers.14.self_attn.v_proj.weight": "model-00002-of-00003.safetensors",
71
- "model.layers.15.input_layernorm.weight": "model-00002-of-00003.safetensors",
72
- "model.layers.15.mlp.down_proj.weight": "model-00002-of-00003.safetensors",
73
- "model.layers.15.mlp.gate_proj.weight": "model-00002-of-00003.safetensors",
74
- "model.layers.15.mlp.up_proj.weight": "model-00002-of-00003.safetensors",
75
- "model.layers.15.post_attention_layernorm.weight": "model-00002-of-00003.safetensors",
76
- "model.layers.15.self_attn.k_proj.weight": "model-00002-of-00003.safetensors",
77
- "model.layers.15.self_attn.o_proj.weight": "model-00002-of-00003.safetensors",
78
- "model.layers.15.self_attn.q_proj.weight": "model-00002-of-00003.safetensors",
79
- "model.layers.15.self_attn.v_proj.weight": "model-00002-of-00003.safetensors",
80
- "model.layers.16.input_layernorm.weight": "model-00002-of-00003.safetensors",
81
- "model.layers.16.mlp.down_proj.weight": "model-00002-of-00003.safetensors",
82
- "model.layers.16.mlp.gate_proj.weight": "model-00002-of-00003.safetensors",
83
- "model.layers.16.mlp.up_proj.weight": "model-00002-of-00003.safetensors",
84
- "model.layers.16.post_attention_layernorm.weight": "model-00002-of-00003.safetensors",
85
- "model.layers.16.self_attn.k_proj.weight": "model-00002-of-00003.safetensors",
86
- "model.layers.16.self_attn.o_proj.weight": "model-00002-of-00003.safetensors",
87
- "model.layers.16.self_attn.q_proj.weight": "model-00002-of-00003.safetensors",
88
- "model.layers.16.self_attn.v_proj.weight": "model-00002-of-00003.safetensors",
89
- "model.layers.17.input_layernorm.weight": "model-00002-of-00003.safetensors",
90
- "model.layers.17.mlp.down_proj.weight": "model-00002-of-00003.safetensors",
91
- "model.layers.17.mlp.gate_proj.weight": "model-00002-of-00003.safetensors",
92
- "model.layers.17.mlp.up_proj.weight": "model-00002-of-00003.safetensors",
93
- "model.layers.17.post_attention_layernorm.weight": "model-00002-of-00003.safetensors",
94
- "model.layers.17.self_attn.k_proj.weight": "model-00002-of-00003.safetensors",
95
- "model.layers.17.self_attn.o_proj.weight": "model-00002-of-00003.safetensors",
96
- "model.layers.17.self_attn.q_proj.weight": "model-00002-of-00003.safetensors",
97
- "model.layers.17.self_attn.v_proj.weight": "model-00002-of-00003.safetensors",
98
- "model.layers.18.input_layernorm.weight": "model-00002-of-00003.safetensors",
99
- "model.layers.18.mlp.down_proj.weight": "model-00002-of-00003.safetensors",
100
- "model.layers.18.mlp.gate_proj.weight": "model-00002-of-00003.safetensors",
101
- "model.layers.18.mlp.up_proj.weight": "model-00002-of-00003.safetensors",
102
- "model.layers.18.post_attention_layernorm.weight": "model-00002-of-00003.safetensors",
103
- "model.layers.18.self_attn.k_proj.weight": "model-00002-of-00003.safetensors",
104
- "model.layers.18.self_attn.o_proj.weight": "model-00002-of-00003.safetensors",
105
- "model.layers.18.self_attn.q_proj.weight": "model-00002-of-00003.safetensors",
106
- "model.layers.18.self_attn.v_proj.weight": "model-00002-of-00003.safetensors",
107
- "model.layers.19.input_layernorm.weight": "model-00002-of-00003.safetensors",
108
- "model.layers.19.mlp.down_proj.weight": "model-00002-of-00003.safetensors",
109
- "model.layers.19.mlp.gate_proj.weight": "model-00002-of-00003.safetensors",
110
- "model.layers.19.mlp.up_proj.weight": "model-00002-of-00003.safetensors",
111
- "model.layers.19.post_attention_layernorm.weight": "model-00002-of-00003.safetensors",
112
- "model.layers.19.self_attn.k_proj.weight": "model-00002-of-00003.safetensors",
113
- "model.layers.19.self_attn.o_proj.weight": "model-00002-of-00003.safetensors",
114
- "model.layers.19.self_attn.q_proj.weight": "model-00002-of-00003.safetensors",
115
- "model.layers.19.self_attn.v_proj.weight": "model-00002-of-00003.safetensors",
116
- "model.layers.2.input_layernorm.weight": "model-00001-of-00003.safetensors",
117
- "model.layers.2.mlp.down_proj.weight": "model-00001-of-00003.safetensors",
118
- "model.layers.2.mlp.gate_proj.weight": "model-00001-of-00003.safetensors",
119
- "model.layers.2.mlp.up_proj.weight": "model-00001-of-00003.safetensors",
120
- "model.layers.2.post_attention_layernorm.weight": "model-00001-of-00003.safetensors",
121
- "model.layers.2.self_attn.k_proj.weight": "model-00001-of-00003.safetensors",
122
- "model.layers.2.self_attn.o_proj.weight": "model-00001-of-00003.safetensors",
123
- "model.layers.2.self_attn.q_proj.weight": "model-00001-of-00003.safetensors",
124
- "model.layers.2.self_attn.v_proj.weight": "model-00001-of-00003.safetensors",
125
- "model.layers.20.input_layernorm.weight": "model-00002-of-00003.safetensors",
126
- "model.layers.20.mlp.down_proj.weight": "model-00002-of-00003.safetensors",
127
- "model.layers.20.mlp.gate_proj.weight": "model-00002-of-00003.safetensors",
128
- "model.layers.20.mlp.up_proj.weight": "model-00002-of-00003.safetensors",
129
- "model.layers.20.post_attention_layernorm.weight": "model-00002-of-00003.safetensors",
130
- "model.layers.20.self_attn.k_proj.weight": "model-00002-of-00003.safetensors",
131
- "model.layers.20.self_attn.o_proj.weight": "model-00002-of-00003.safetensors",
132
- "model.layers.20.self_attn.q_proj.weight": "model-00002-of-00003.safetensors",
133
- "model.layers.20.self_attn.v_proj.weight": "model-00002-of-00003.safetensors",
134
- "model.layers.21.input_layernorm.weight": "model-00002-of-00003.safetensors",
135
- "model.layers.21.mlp.down_proj.weight": "model-00002-of-00003.safetensors",
136
- "model.layers.21.mlp.gate_proj.weight": "model-00002-of-00003.safetensors",
137
- "model.layers.21.mlp.up_proj.weight": "model-00002-of-00003.safetensors",
138
- "model.layers.21.post_attention_layernorm.weight": "model-00002-of-00003.safetensors",
139
- "model.layers.21.self_attn.k_proj.weight": "model-00002-of-00003.safetensors",
140
- "model.layers.21.self_attn.o_proj.weight": "model-00002-of-00003.safetensors",
141
- "model.layers.21.self_attn.q_proj.weight": "model-00002-of-00003.safetensors",
142
- "model.layers.21.self_attn.v_proj.weight": "model-00002-of-00003.safetensors",
143
- "model.layers.22.input_layernorm.weight": "model-00002-of-00003.safetensors",
144
- "model.layers.22.mlp.down_proj.weight": "model-00002-of-00003.safetensors",
145
- "model.layers.22.mlp.gate_proj.weight": "model-00002-of-00003.safetensors",
146
- "model.layers.22.mlp.up_proj.weight": "model-00002-of-00003.safetensors",
147
- "model.layers.22.post_attention_layernorm.weight": "model-00002-of-00003.safetensors",
148
- "model.layers.22.self_attn.k_proj.weight": "model-00002-of-00003.safetensors",
149
- "model.layers.22.self_attn.o_proj.weight": "model-00002-of-00003.safetensors",
150
- "model.layers.22.self_attn.q_proj.weight": "model-00002-of-00003.safetensors",
151
- "model.layers.22.self_attn.v_proj.weight": "model-00002-of-00003.safetensors",
152
- "model.layers.23.input_layernorm.weight": "model-00003-of-00003.safetensors",
153
- "model.layers.23.mlp.down_proj.weight": "model-00003-of-00003.safetensors",
154
- "model.layers.23.mlp.gate_proj.weight": "model-00002-of-00003.safetensors",
155
- "model.layers.23.mlp.up_proj.weight": "model-00002-of-00003.safetensors",
156
- "model.layers.23.post_attention_layernorm.weight": "model-00003-of-00003.safetensors",
157
- "model.layers.23.self_attn.k_proj.weight": "model-00002-of-00003.safetensors",
158
- "model.layers.23.self_attn.o_proj.weight": "model-00002-of-00003.safetensors",
159
- "model.layers.23.self_attn.q_proj.weight": "model-00002-of-00003.safetensors",
160
- "model.layers.23.self_attn.v_proj.weight": "model-00002-of-00003.safetensors",
161
- "model.layers.24.input_layernorm.weight": "model-00003-of-00003.safetensors",
162
- "model.layers.24.mlp.down_proj.weight": "model-00003-of-00003.safetensors",
163
- "model.layers.24.mlp.gate_proj.weight": "model-00003-of-00003.safetensors",
164
- "model.layers.24.mlp.up_proj.weight": "model-00003-of-00003.safetensors",
165
- "model.layers.24.post_attention_layernorm.weight": "model-00003-of-00003.safetensors",
166
- "model.layers.24.self_attn.k_proj.weight": "model-00003-of-00003.safetensors",
167
- "model.layers.24.self_attn.o_proj.weight": "model-00003-of-00003.safetensors",
168
- "model.layers.24.self_attn.q_proj.weight": "model-00003-of-00003.safetensors",
169
- "model.layers.24.self_attn.v_proj.weight": "model-00003-of-00003.safetensors",
170
- "model.layers.25.input_layernorm.weight": "model-00003-of-00003.safetensors",
171
- "model.layers.25.mlp.down_proj.weight": "model-00003-of-00003.safetensors",
172
- "model.layers.25.mlp.gate_proj.weight": "model-00003-of-00003.safetensors",
173
- "model.layers.25.mlp.up_proj.weight": "model-00003-of-00003.safetensors",
174
- "model.layers.25.post_attention_layernorm.weight": "model-00003-of-00003.safetensors",
175
- "model.layers.25.self_attn.k_proj.weight": "model-00003-of-00003.safetensors",
176
- "model.layers.25.self_attn.o_proj.weight": "model-00003-of-00003.safetensors",
177
- "model.layers.25.self_attn.q_proj.weight": "model-00003-of-00003.safetensors",
178
- "model.layers.25.self_attn.v_proj.weight": "model-00003-of-00003.safetensors",
179
- "model.layers.26.input_layernorm.weight": "model-00003-of-00003.safetensors",
180
- "model.layers.26.mlp.down_proj.weight": "model-00003-of-00003.safetensors",
181
- "model.layers.26.mlp.gate_proj.weight": "model-00003-of-00003.safetensors",
182
- "model.layers.26.mlp.up_proj.weight": "model-00003-of-00003.safetensors",
183
- "model.layers.26.post_attention_layernorm.weight": "model-00003-of-00003.safetensors",
184
- "model.layers.26.self_attn.k_proj.weight": "model-00003-of-00003.safetensors",
185
- "model.layers.26.self_attn.o_proj.weight": "model-00003-of-00003.safetensors",
186
- "model.layers.26.self_attn.q_proj.weight": "model-00003-of-00003.safetensors",
187
- "model.layers.26.self_attn.v_proj.weight": "model-00003-of-00003.safetensors",
188
- "model.layers.27.input_layernorm.weight": "model-00003-of-00003.safetensors",
189
- "model.layers.27.mlp.down_proj.weight": "model-00003-of-00003.safetensors",
190
- "model.layers.27.mlp.gate_proj.weight": "model-00003-of-00003.safetensors",
191
- "model.layers.27.mlp.up_proj.weight": "model-00003-of-00003.safetensors",
192
- "model.layers.27.post_attention_layernorm.weight": "model-00003-of-00003.safetensors",
193
- "model.layers.27.self_attn.k_proj.weight": "model-00003-of-00003.safetensors",
194
- "model.layers.27.self_attn.o_proj.weight": "model-00003-of-00003.safetensors",
195
- "model.layers.27.self_attn.q_proj.weight": "model-00003-of-00003.safetensors",
196
- "model.layers.27.self_attn.v_proj.weight": "model-00003-of-00003.safetensors",
197
- "model.layers.28.input_layernorm.weight": "model-00003-of-00003.safetensors",
198
- "model.layers.28.mlp.down_proj.weight": "model-00003-of-00003.safetensors",
199
- "model.layers.28.mlp.gate_proj.weight": "model-00003-of-00003.safetensors",
200
- "model.layers.28.mlp.up_proj.weight": "model-00003-of-00003.safetensors",
201
- "model.layers.28.post_attention_layernorm.weight": "model-00003-of-00003.safetensors",
202
- "model.layers.28.self_attn.k_proj.weight": "model-00003-of-00003.safetensors",
203
- "model.layers.28.self_attn.o_proj.weight": "model-00003-of-00003.safetensors",
204
- "model.layers.28.self_attn.q_proj.weight": "model-00003-of-00003.safetensors",
205
- "model.layers.28.self_attn.v_proj.weight": "model-00003-of-00003.safetensors",
206
- "model.layers.29.input_layernorm.weight": "model-00003-of-00003.safetensors",
207
- "model.layers.29.mlp.down_proj.weight": "model-00003-of-00003.safetensors",
208
- "model.layers.29.mlp.gate_proj.weight": "model-00003-of-00003.safetensors",
209
- "model.layers.29.mlp.up_proj.weight": "model-00003-of-00003.safetensors",
210
- "model.layers.29.post_attention_layernorm.weight": "model-00003-of-00003.safetensors",
211
- "model.layers.29.self_attn.k_proj.weight": "model-00003-of-00003.safetensors",
212
- "model.layers.29.self_attn.o_proj.weight": "model-00003-of-00003.safetensors",
213
- "model.layers.29.self_attn.q_proj.weight": "model-00003-of-00003.safetensors",
214
- "model.layers.29.self_attn.v_proj.weight": "model-00003-of-00003.safetensors",
215
- "model.layers.3.input_layernorm.weight": "model-00001-of-00003.safetensors",
216
- "model.layers.3.mlp.down_proj.weight": "model-00001-of-00003.safetensors",
217
- "model.layers.3.mlp.gate_proj.weight": "model-00001-of-00003.safetensors",
218
- "model.layers.3.mlp.up_proj.weight": "model-00001-of-00003.safetensors",
219
- "model.layers.3.post_attention_layernorm.weight": "model-00001-of-00003.safetensors",
220
- "model.layers.3.self_attn.k_proj.weight": "model-00001-of-00003.safetensors",
221
- "model.layers.3.self_attn.o_proj.weight": "model-00001-of-00003.safetensors",
222
- "model.layers.3.self_attn.q_proj.weight": "model-00001-of-00003.safetensors",
223
- "model.layers.3.self_attn.v_proj.weight": "model-00001-of-00003.safetensors",
224
- "model.layers.30.input_layernorm.weight": "model-00003-of-00003.safetensors",
225
- "model.layers.30.mlp.down_proj.weight": "model-00003-of-00003.safetensors",
226
- "model.layers.30.mlp.gate_proj.weight": "model-00003-of-00003.safetensors",
227
- "model.layers.30.mlp.up_proj.weight": "model-00003-of-00003.safetensors",
228
- "model.layers.30.post_attention_layernorm.weight": "model-00003-of-00003.safetensors",
229
- "model.layers.30.self_attn.k_proj.weight": "model-00003-of-00003.safetensors",
230
- "model.layers.30.self_attn.o_proj.weight": "model-00003-of-00003.safetensors",
231
- "model.layers.30.self_attn.q_proj.weight": "model-00003-of-00003.safetensors",
232
- "model.layers.30.self_attn.v_proj.weight": "model-00003-of-00003.safetensors",
233
- "model.layers.31.input_layernorm.weight": "model-00003-of-00003.safetensors",
234
- "model.layers.31.mlp.down_proj.weight": "model-00003-of-00003.safetensors",
235
- "model.layers.31.mlp.gate_proj.weight": "model-00003-of-00003.safetensors",
236
- "model.layers.31.mlp.up_proj.weight": "model-00003-of-00003.safetensors",
237
- "model.layers.31.post_attention_layernorm.weight": "model-00003-of-00003.safetensors",
238
- "model.layers.31.self_attn.k_proj.weight": "model-00003-of-00003.safetensors",
239
- "model.layers.31.self_attn.o_proj.weight": "model-00003-of-00003.safetensors",
240
- "model.layers.31.self_attn.q_proj.weight": "model-00003-of-00003.safetensors",
241
- "model.layers.31.self_attn.v_proj.weight": "model-00003-of-00003.safetensors",
242
- "model.layers.4.input_layernorm.weight": "model-00001-of-00003.safetensors",
243
- "model.layers.4.mlp.down_proj.weight": "model-00001-of-00003.safetensors",
244
- "model.layers.4.mlp.gate_proj.weight": "model-00001-of-00003.safetensors",
245
- "model.layers.4.mlp.up_proj.weight": "model-00001-of-00003.safetensors",
246
- "model.layers.4.post_attention_layernorm.weight": "model-00001-of-00003.safetensors",
247
- "model.layers.4.self_attn.k_proj.weight": "model-00001-of-00003.safetensors",
248
- "model.layers.4.self_attn.o_proj.weight": "model-00001-of-00003.safetensors",
249
- "model.layers.4.self_attn.q_proj.weight": "model-00001-of-00003.safetensors",
250
- "model.layers.4.self_attn.v_proj.weight": "model-00001-of-00003.safetensors",
251
- "model.layers.5.input_layernorm.weight": "model-00001-of-00003.safetensors",
252
- "model.layers.5.mlp.down_proj.weight": "model-00001-of-00003.safetensors",
253
- "model.layers.5.mlp.gate_proj.weight": "model-00001-of-00003.safetensors",
254
- "model.layers.5.mlp.up_proj.weight": "model-00001-of-00003.safetensors",
255
- "model.layers.5.post_attention_layernorm.weight": "model-00001-of-00003.safetensors",
256
- "model.layers.5.self_attn.k_proj.weight": "model-00001-of-00003.safetensors",
257
- "model.layers.5.self_attn.o_proj.weight": "model-00001-of-00003.safetensors",
258
- "model.layers.5.self_attn.q_proj.weight": "model-00001-of-00003.safetensors",
259
- "model.layers.5.self_attn.v_proj.weight": "model-00001-of-00003.safetensors",
260
- "model.layers.6.input_layernorm.weight": "model-00001-of-00003.safetensors",
261
- "model.layers.6.mlp.down_proj.weight": "model-00001-of-00003.safetensors",
262
- "model.layers.6.mlp.gate_proj.weight": "model-00001-of-00003.safetensors",
263
- "model.layers.6.mlp.up_proj.weight": "model-00001-of-00003.safetensors",
264
- "model.layers.6.post_attention_layernorm.weight": "model-00001-of-00003.safetensors",
265
- "model.layers.6.self_attn.k_proj.weight": "model-00001-of-00003.safetensors",
266
- "model.layers.6.self_attn.o_proj.weight": "model-00001-of-00003.safetensors",
267
- "model.layers.6.self_attn.q_proj.weight": "model-00001-of-00003.safetensors",
268
- "model.layers.6.self_attn.v_proj.weight": "model-00001-of-00003.safetensors",
269
- "model.layers.7.input_layernorm.weight": "model-00001-of-00003.safetensors",
270
- "model.layers.7.mlp.down_proj.weight": "model-00001-of-00003.safetensors",
271
- "model.layers.7.mlp.gate_proj.weight": "model-00001-of-00003.safetensors",
272
- "model.layers.7.mlp.up_proj.weight": "model-00001-of-00003.safetensors",
273
- "model.layers.7.post_attention_layernorm.weight": "model-00001-of-00003.safetensors",
274
- "model.layers.7.self_attn.k_proj.weight": "model-00001-of-00003.safetensors",
275
- "model.layers.7.self_attn.o_proj.weight": "model-00001-of-00003.safetensors",
276
- "model.layers.7.self_attn.q_proj.weight": "model-00001-of-00003.safetensors",
277
- "model.layers.7.self_attn.v_proj.weight": "model-00001-of-00003.safetensors",
278
- "model.layers.8.input_layernorm.weight": "model-00001-of-00003.safetensors",
279
- "model.layers.8.mlp.down_proj.weight": "model-00001-of-00003.safetensors",
280
- "model.layers.8.mlp.gate_proj.weight": "model-00001-of-00003.safetensors",
281
- "model.layers.8.mlp.up_proj.weight": "model-00001-of-00003.safetensors",
282
- "model.layers.8.post_attention_layernorm.weight": "model-00001-of-00003.safetensors",
283
- "model.layers.8.self_attn.k_proj.weight": "model-00001-of-00003.safetensors",
284
- "model.layers.8.self_attn.o_proj.weight": "model-00001-of-00003.safetensors",
285
- "model.layers.8.self_attn.q_proj.weight": "model-00001-of-00003.safetensors",
286
- "model.layers.8.self_attn.v_proj.weight": "model-00001-of-00003.safetensors",
287
- "model.layers.9.input_layernorm.weight": "model-00001-of-00003.safetensors",
288
- "model.layers.9.mlp.down_proj.weight": "model-00001-of-00003.safetensors",
289
- "model.layers.9.mlp.gate_proj.weight": "model-00001-of-00003.safetensors",
290
- "model.layers.9.mlp.up_proj.weight": "model-00001-of-00003.safetensors",
291
- "model.layers.9.post_attention_layernorm.weight": "model-00001-of-00003.safetensors",
292
- "model.layers.9.self_attn.k_proj.weight": "model-00001-of-00003.safetensors",
293
- "model.layers.9.self_attn.o_proj.weight": "model-00001-of-00003.safetensors",
294
- "model.layers.9.self_attn.q_proj.weight": "model-00001-of-00003.safetensors",
295
- "model.layers.9.self_attn.v_proj.weight": "model-00001-of-00003.safetensors",
296
- "model.norm.weight": "model-00003-of-00003.safetensors"
297
  }
298
  }
 
1
  {
2
  "metadata": {
3
+ "total_size": 26953662464
4
  },
5
  "weight_map": {
6
+ "lm_head.weight": "model-00006-of-00006.safetensors",
7
+ "model.embed_tokens.weight": "model-00001-of-00006.safetensors",
8
+ "model.layers.0.input_layernorm.weight": "model-00001-of-00006.safetensors",
9
+ "model.layers.0.mlp.down_proj.weight": "model-00001-of-00006.safetensors",
10
+ "model.layers.0.mlp.gate_proj.weight": "model-00001-of-00006.safetensors",
11
+ "model.layers.0.mlp.up_proj.weight": "model-00001-of-00006.safetensors",
12
+ "model.layers.0.post_attention_layernorm.weight": "model-00001-of-00006.safetensors",
13
+ "model.layers.0.self_attn.k_proj.weight": "model-00001-of-00006.safetensors",
14
+ "model.layers.0.self_attn.o_proj.weight": "model-00001-of-00006.safetensors",
15
+ "model.layers.0.self_attn.q_proj.weight": "model-00001-of-00006.safetensors",
16
+ "model.layers.0.self_attn.v_proj.weight": "model-00001-of-00006.safetensors",
17
+ "model.layers.1.input_layernorm.weight": "model-00001-of-00006.safetensors",
18
+ "model.layers.1.mlp.down_proj.weight": "model-00001-of-00006.safetensors",
19
+ "model.layers.1.mlp.gate_proj.weight": "model-00001-of-00006.safetensors",
20
+ "model.layers.1.mlp.up_proj.weight": "model-00001-of-00006.safetensors",
21
+ "model.layers.1.post_attention_layernorm.weight": "model-00001-of-00006.safetensors",
22
+ "model.layers.1.self_attn.k_proj.weight": "model-00001-of-00006.safetensors",
23
+ "model.layers.1.self_attn.o_proj.weight": "model-00001-of-00006.safetensors",
24
+ "model.layers.1.self_attn.q_proj.weight": "model-00001-of-00006.safetensors",
25
+ "model.layers.1.self_attn.v_proj.weight": "model-00001-of-00006.safetensors",
26
+ "model.layers.10.input_layernorm.weight": "model-00002-of-00006.safetensors",
27
+ "model.layers.10.mlp.down_proj.weight": "model-00002-of-00006.safetensors",
28
+ "model.layers.10.mlp.gate_proj.weight": "model-00002-of-00006.safetensors",
29
+ "model.layers.10.mlp.up_proj.weight": "model-00002-of-00006.safetensors",
30
+ "model.layers.10.post_attention_layernorm.weight": "model-00002-of-00006.safetensors",
31
+ "model.layers.10.self_attn.k_proj.weight": "model-00002-of-00006.safetensors",
32
+ "model.layers.10.self_attn.o_proj.weight": "model-00002-of-00006.safetensors",
33
+ "model.layers.10.self_attn.q_proj.weight": "model-00002-of-00006.safetensors",
34
+ "model.layers.10.self_attn.v_proj.weight": "model-00002-of-00006.safetensors",
35
+ "model.layers.11.input_layernorm.weight": "model-00003-of-00006.safetensors",
36
+ "model.layers.11.mlp.down_proj.weight": "model-00003-of-00006.safetensors",
37
+ "model.layers.11.mlp.gate_proj.weight": "model-00003-of-00006.safetensors",
38
+ "model.layers.11.mlp.up_proj.weight": "model-00003-of-00006.safetensors",
39
+ "model.layers.11.post_attention_layernorm.weight": "model-00003-of-00006.safetensors",
40
+ "model.layers.11.self_attn.k_proj.weight": "model-00002-of-00006.safetensors",
41
+ "model.layers.11.self_attn.o_proj.weight": "model-00002-of-00006.safetensors",
42
+ "model.layers.11.self_attn.q_proj.weight": "model-00002-of-00006.safetensors",
43
+ "model.layers.11.self_attn.v_proj.weight": "model-00002-of-00006.safetensors",
44
+ "model.layers.12.input_layernorm.weight": "model-00003-of-00006.safetensors",
45
+ "model.layers.12.mlp.down_proj.weight": "model-00003-of-00006.safetensors",
46
+ "model.layers.12.mlp.gate_proj.weight": "model-00003-of-00006.safetensors",
47
+ "model.layers.12.mlp.up_proj.weight": "model-00003-of-00006.safetensors",
48
+ "model.layers.12.post_attention_layernorm.weight": "model-00003-of-00006.safetensors",
49
+ "model.layers.12.self_attn.k_proj.weight": "model-00003-of-00006.safetensors",
50
+ "model.layers.12.self_attn.o_proj.weight": "model-00003-of-00006.safetensors",
51
+ "model.layers.12.self_attn.q_proj.weight": "model-00003-of-00006.safetensors",
52
+ "model.layers.12.self_attn.v_proj.weight": "model-00003-of-00006.safetensors",
53
+ "model.layers.13.input_layernorm.weight": "model-00003-of-00006.safetensors",
54
+ "model.layers.13.mlp.down_proj.weight": "model-00003-of-00006.safetensors",
55
+ "model.layers.13.mlp.gate_proj.weight": "model-00003-of-00006.safetensors",
56
+ "model.layers.13.mlp.up_proj.weight": "model-00003-of-00006.safetensors",
57
+ "model.layers.13.post_attention_layernorm.weight": "model-00003-of-00006.safetensors",
58
+ "model.layers.13.self_attn.k_proj.weight": "model-00003-of-00006.safetensors",
59
+ "model.layers.13.self_attn.o_proj.weight": "model-00003-of-00006.safetensors",
60
+ "model.layers.13.self_attn.q_proj.weight": "model-00003-of-00006.safetensors",
61
+ "model.layers.13.self_attn.v_proj.weight": "model-00003-of-00006.safetensors",
62
+ "model.layers.14.input_layernorm.weight": "model-00003-of-00006.safetensors",
63
+ "model.layers.14.mlp.down_proj.weight": "model-00003-of-00006.safetensors",
64
+ "model.layers.14.mlp.gate_proj.weight": "model-00003-of-00006.safetensors",
65
+ "model.layers.14.mlp.up_proj.weight": "model-00003-of-00006.safetensors",
66
+ "model.layers.14.post_attention_layernorm.weight": "model-00003-of-00006.safetensors",
67
+ "model.layers.14.self_attn.k_proj.weight": "model-00003-of-00006.safetensors",
68
+ "model.layers.14.self_attn.o_proj.weight": "model-00003-of-00006.safetensors",
69
+ "model.layers.14.self_attn.q_proj.weight": "model-00003-of-00006.safetensors",
70
+ "model.layers.14.self_attn.v_proj.weight": "model-00003-of-00006.safetensors",
71
+ "model.layers.15.input_layernorm.weight": "model-00003-of-00006.safetensors",
72
+ "model.layers.15.mlp.down_proj.weight": "model-00003-of-00006.safetensors",
73
+ "model.layers.15.mlp.gate_proj.weight": "model-00003-of-00006.safetensors",
74
+ "model.layers.15.mlp.up_proj.weight": "model-00003-of-00006.safetensors",
75
+ "model.layers.15.post_attention_layernorm.weight": "model-00003-of-00006.safetensors",
76
+ "model.layers.15.self_attn.k_proj.weight": "model-00003-of-00006.safetensors",
77
+ "model.layers.15.self_attn.o_proj.weight": "model-00003-of-00006.safetensors",
78
+ "model.layers.15.self_attn.q_proj.weight": "model-00003-of-00006.safetensors",
79
+ "model.layers.15.self_attn.v_proj.weight": "model-00003-of-00006.safetensors",
80
+ "model.layers.16.input_layernorm.weight": "model-00003-of-00006.safetensors",
81
+ "model.layers.16.mlp.down_proj.weight": "model-00003-of-00006.safetensors",
82
+ "model.layers.16.mlp.gate_proj.weight": "model-00003-of-00006.safetensors",
83
+ "model.layers.16.mlp.up_proj.weight": "model-00003-of-00006.safetensors",
84
+ "model.layers.16.post_attention_layernorm.weight": "model-00003-of-00006.safetensors",
85
+ "model.layers.16.self_attn.k_proj.weight": "model-00003-of-00006.safetensors",
86
+ "model.layers.16.self_attn.o_proj.weight": "model-00003-of-00006.safetensors",
87
+ "model.layers.16.self_attn.q_proj.weight": "model-00003-of-00006.safetensors",
88
+ "model.layers.16.self_attn.v_proj.weight": "model-00003-of-00006.safetensors",
89
+ "model.layers.17.input_layernorm.weight": "model-00004-of-00006.safetensors",
90
+ "model.layers.17.mlp.down_proj.weight": "model-00004-of-00006.safetensors",
91
+ "model.layers.17.mlp.gate_proj.weight": "model-00004-of-00006.safetensors",
92
+ "model.layers.17.mlp.up_proj.weight": "model-00004-of-00006.safetensors",
93
+ "model.layers.17.post_attention_layernorm.weight": "model-00004-of-00006.safetensors",
94
+ "model.layers.17.self_attn.k_proj.weight": "model-00003-of-00006.safetensors",
95
+ "model.layers.17.self_attn.o_proj.weight": "model-00003-of-00006.safetensors",
96
+ "model.layers.17.self_attn.q_proj.weight": "model-00003-of-00006.safetensors",
97
+ "model.layers.17.self_attn.v_proj.weight": "model-00003-of-00006.safetensors",
98
+ "model.layers.18.input_layernorm.weight": "model-00004-of-00006.safetensors",
99
+ "model.layers.18.mlp.down_proj.weight": "model-00004-of-00006.safetensors",
100
+ "model.layers.18.mlp.gate_proj.weight": "model-00004-of-00006.safetensors",
101
+ "model.layers.18.mlp.up_proj.weight": "model-00004-of-00006.safetensors",
102
+ "model.layers.18.post_attention_layernorm.weight": "model-00004-of-00006.safetensors",
103
+ "model.layers.18.self_attn.k_proj.weight": "model-00004-of-00006.safetensors",
104
+ "model.layers.18.self_attn.o_proj.weight": "model-00004-of-00006.safetensors",
105
+ "model.layers.18.self_attn.q_proj.weight": "model-00004-of-00006.safetensors",
106
+ "model.layers.18.self_attn.v_proj.weight": "model-00004-of-00006.safetensors",
107
+ "model.layers.19.input_layernorm.weight": "model-00004-of-00006.safetensors",
108
+ "model.layers.19.mlp.down_proj.weight": "model-00004-of-00006.safetensors",
109
+ "model.layers.19.mlp.gate_proj.weight": "model-00004-of-00006.safetensors",
110
+ "model.layers.19.mlp.up_proj.weight": "model-00004-of-00006.safetensors",
111
+ "model.layers.19.post_attention_layernorm.weight": "model-00004-of-00006.safetensors",
112
+ "model.layers.19.self_attn.k_proj.weight": "model-00004-of-00006.safetensors",
113
+ "model.layers.19.self_attn.o_proj.weight": "model-00004-of-00006.safetensors",
114
+ "model.layers.19.self_attn.q_proj.weight": "model-00004-of-00006.safetensors",
115
+ "model.layers.19.self_attn.v_proj.weight": "model-00004-of-00006.safetensors",
116
+ "model.layers.2.input_layernorm.weight": "model-00001-of-00006.safetensors",
117
+ "model.layers.2.mlp.down_proj.weight": "model-00001-of-00006.safetensors",
118
+ "model.layers.2.mlp.gate_proj.weight": "model-00001-of-00006.safetensors",
119
+ "model.layers.2.mlp.up_proj.weight": "model-00001-of-00006.safetensors",
120
+ "model.layers.2.post_attention_layernorm.weight": "model-00001-of-00006.safetensors",
121
+ "model.layers.2.self_attn.k_proj.weight": "model-00001-of-00006.safetensors",
122
+ "model.layers.2.self_attn.o_proj.weight": "model-00001-of-00006.safetensors",
123
+ "model.layers.2.self_attn.q_proj.weight": "model-00001-of-00006.safetensors",
124
+ "model.layers.2.self_attn.v_proj.weight": "model-00001-of-00006.safetensors",
125
+ "model.layers.20.input_layernorm.weight": "model-00004-of-00006.safetensors",
126
+ "model.layers.20.mlp.down_proj.weight": "model-00004-of-00006.safetensors",
127
+ "model.layers.20.mlp.gate_proj.weight": "model-00004-of-00006.safetensors",
128
+ "model.layers.20.mlp.up_proj.weight": "model-00004-of-00006.safetensors",
129
+ "model.layers.20.post_attention_layernorm.weight": "model-00004-of-00006.safetensors",
130
+ "model.layers.20.self_attn.k_proj.weight": "model-00004-of-00006.safetensors",
131
+ "model.layers.20.self_attn.o_proj.weight": "model-00004-of-00006.safetensors",
132
+ "model.layers.20.self_attn.q_proj.weight": "model-00004-of-00006.safetensors",
133
+ "model.layers.20.self_attn.v_proj.weight": "model-00004-of-00006.safetensors",
134
+ "model.layers.21.input_layernorm.weight": "model-00004-of-00006.safetensors",
135
+ "model.layers.21.mlp.down_proj.weight": "model-00004-of-00006.safetensors",
136
+ "model.layers.21.mlp.gate_proj.weight": "model-00004-of-00006.safetensors",
137
+ "model.layers.21.mlp.up_proj.weight": "model-00004-of-00006.safetensors",
138
+ "model.layers.21.post_attention_layernorm.weight": "model-00004-of-00006.safetensors",
139
+ "model.layers.21.self_attn.k_proj.weight": "model-00004-of-00006.safetensors",
140
+ "model.layers.21.self_attn.o_proj.weight": "model-00004-of-00006.safetensors",
141
+ "model.layers.21.self_attn.q_proj.weight": "model-00004-of-00006.safetensors",
142
+ "model.layers.21.self_attn.v_proj.weight": "model-00004-of-00006.safetensors",
143
+ "model.layers.22.input_layernorm.weight": "model-00004-of-00006.safetensors",
144
+ "model.layers.22.mlp.down_proj.weight": "model-00004-of-00006.safetensors",
145
+ "model.layers.22.mlp.gate_proj.weight": "model-00004-of-00006.safetensors",
146
+ "model.layers.22.mlp.up_proj.weight": "model-00004-of-00006.safetensors",
147
+ "model.layers.22.post_attention_layernorm.weight": "model-00004-of-00006.safetensors",
148
+ "model.layers.22.self_attn.k_proj.weight": "model-00004-of-00006.safetensors",
149
+ "model.layers.22.self_attn.o_proj.weight": "model-00004-of-00006.safetensors",
150
+ "model.layers.22.self_attn.q_proj.weight": "model-00004-of-00006.safetensors",
151
+ "model.layers.22.self_attn.v_proj.weight": "model-00004-of-00006.safetensors",
152
+ "model.layers.23.input_layernorm.weight": "model-00005-of-00006.safetensors",
153
+ "model.layers.23.mlp.down_proj.weight": "model-00005-of-00006.safetensors",
154
+ "model.layers.23.mlp.gate_proj.weight": "model-00005-of-00006.safetensors",
155
+ "model.layers.23.mlp.up_proj.weight": "model-00005-of-00006.safetensors",
156
+ "model.layers.23.post_attention_layernorm.weight": "model-00005-of-00006.safetensors",
157
+ "model.layers.23.self_attn.k_proj.weight": "model-00004-of-00006.safetensors",
158
+ "model.layers.23.self_attn.o_proj.weight": "model-00004-of-00006.safetensors",
159
+ "model.layers.23.self_attn.q_proj.weight": "model-00004-of-00006.safetensors",
160
+ "model.layers.23.self_attn.v_proj.weight": "model-00004-of-00006.safetensors",
161
+ "model.layers.24.input_layernorm.weight": "model-00005-of-00006.safetensors",
162
+ "model.layers.24.mlp.down_proj.weight": "model-00005-of-00006.safetensors",
163
+ "model.layers.24.mlp.gate_proj.weight": "model-00005-of-00006.safetensors",
164
+ "model.layers.24.mlp.up_proj.weight": "model-00005-of-00006.safetensors",
165
+ "model.layers.24.post_attention_layernorm.weight": "model-00005-of-00006.safetensors",
166
+ "model.layers.24.self_attn.k_proj.weight": "model-00005-of-00006.safetensors",
167
+ "model.layers.24.self_attn.o_proj.weight": "model-00005-of-00006.safetensors",
168
+ "model.layers.24.self_attn.q_proj.weight": "model-00005-of-00006.safetensors",
169
+ "model.layers.24.self_attn.v_proj.weight": "model-00005-of-00006.safetensors",
170
+ "model.layers.25.input_layernorm.weight": "model-00005-of-00006.safetensors",
171
+ "model.layers.25.mlp.down_proj.weight": "model-00005-of-00006.safetensors",
172
+ "model.layers.25.mlp.gate_proj.weight": "model-00005-of-00006.safetensors",
173
+ "model.layers.25.mlp.up_proj.weight": "model-00005-of-00006.safetensors",
174
+ "model.layers.25.post_attention_layernorm.weight": "model-00005-of-00006.safetensors",
175
+ "model.layers.25.self_attn.k_proj.weight": "model-00005-of-00006.safetensors",
176
+ "model.layers.25.self_attn.o_proj.weight": "model-00005-of-00006.safetensors",
177
+ "model.layers.25.self_attn.q_proj.weight": "model-00005-of-00006.safetensors",
178
+ "model.layers.25.self_attn.v_proj.weight": "model-00005-of-00006.safetensors",
179
+ "model.layers.26.input_layernorm.weight": "model-00005-of-00006.safetensors",
180
+ "model.layers.26.mlp.down_proj.weight": "model-00005-of-00006.safetensors",
181
+ "model.layers.26.mlp.gate_proj.weight": "model-00005-of-00006.safetensors",
182
+ "model.layers.26.mlp.up_proj.weight": "model-00005-of-00006.safetensors",
183
+ "model.layers.26.post_attention_layernorm.weight": "model-00005-of-00006.safetensors",
184
+ "model.layers.26.self_attn.k_proj.weight": "model-00005-of-00006.safetensors",
185
+ "model.layers.26.self_attn.o_proj.weight": "model-00005-of-00006.safetensors",
186
+ "model.layers.26.self_attn.q_proj.weight": "model-00005-of-00006.safetensors",
187
+ "model.layers.26.self_attn.v_proj.weight": "model-00005-of-00006.safetensors",
188
+ "model.layers.27.input_layernorm.weight": "model-00005-of-00006.safetensors",
189
+ "model.layers.27.mlp.down_proj.weight": "model-00005-of-00006.safetensors",
190
+ "model.layers.27.mlp.gate_proj.weight": "model-00005-of-00006.safetensors",
191
+ "model.layers.27.mlp.up_proj.weight": "model-00005-of-00006.safetensors",
192
+ "model.layers.27.post_attention_layernorm.weight": "model-00005-of-00006.safetensors",
193
+ "model.layers.27.self_attn.k_proj.weight": "model-00005-of-00006.safetensors",
194
+ "model.layers.27.self_attn.o_proj.weight": "model-00005-of-00006.safetensors",
195
+ "model.layers.27.self_attn.q_proj.weight": "model-00005-of-00006.safetensors",
196
+ "model.layers.27.self_attn.v_proj.weight": "model-00005-of-00006.safetensors",
197
+ "model.layers.28.input_layernorm.weight": "model-00005-of-00006.safetensors",
198
+ "model.layers.28.mlp.down_proj.weight": "model-00005-of-00006.safetensors",
199
+ "model.layers.28.mlp.gate_proj.weight": "model-00005-of-00006.safetensors",
200
+ "model.layers.28.mlp.up_proj.weight": "model-00005-of-00006.safetensors",
201
+ "model.layers.28.post_attention_layernorm.weight": "model-00005-of-00006.safetensors",
202
+ "model.layers.28.self_attn.k_proj.weight": "model-00005-of-00006.safetensors",
203
+ "model.layers.28.self_attn.o_proj.weight": "model-00005-of-00006.safetensors",
204
+ "model.layers.28.self_attn.q_proj.weight": "model-00005-of-00006.safetensors",
205
+ "model.layers.28.self_attn.v_proj.weight": "model-00005-of-00006.safetensors",
206
+ "model.layers.29.input_layernorm.weight": "model-00006-of-00006.safetensors",
207
+ "model.layers.29.mlp.down_proj.weight": "model-00006-of-00006.safetensors",
208
+ "model.layers.29.mlp.gate_proj.weight": "model-00006-of-00006.safetensors",
209
+ "model.layers.29.mlp.up_proj.weight": "model-00006-of-00006.safetensors",
210
+ "model.layers.29.post_attention_layernorm.weight": "model-00006-of-00006.safetensors",
211
+ "model.layers.29.self_attn.k_proj.weight": "model-00005-of-00006.safetensors",
212
+ "model.layers.29.self_attn.o_proj.weight": "model-00005-of-00006.safetensors",
213
+ "model.layers.29.self_attn.q_proj.weight": "model-00005-of-00006.safetensors",
214
+ "model.layers.29.self_attn.v_proj.weight": "model-00005-of-00006.safetensors",
215
+ "model.layers.3.input_layernorm.weight": "model-00001-of-00006.safetensors",
216
+ "model.layers.3.mlp.down_proj.weight": "model-00001-of-00006.safetensors",
217
+ "model.layers.3.mlp.gate_proj.weight": "model-00001-of-00006.safetensors",
218
+ "model.layers.3.mlp.up_proj.weight": "model-00001-of-00006.safetensors",
219
+ "model.layers.3.post_attention_layernorm.weight": "model-00001-of-00006.safetensors",
220
+ "model.layers.3.self_attn.k_proj.weight": "model-00001-of-00006.safetensors",
221
+ "model.layers.3.self_attn.o_proj.weight": "model-00001-of-00006.safetensors",
222
+ "model.layers.3.self_attn.q_proj.weight": "model-00001-of-00006.safetensors",
223
+ "model.layers.3.self_attn.v_proj.weight": "model-00001-of-00006.safetensors",
224
+ "model.layers.30.input_layernorm.weight": "model-00006-of-00006.safetensors",
225
+ "model.layers.30.mlp.down_proj.weight": "model-00006-of-00006.safetensors",
226
+ "model.layers.30.mlp.gate_proj.weight": "model-00006-of-00006.safetensors",
227
+ "model.layers.30.mlp.up_proj.weight": "model-00006-of-00006.safetensors",
228
+ "model.layers.30.post_attention_layernorm.weight": "model-00006-of-00006.safetensors",
229
+ "model.layers.30.self_attn.k_proj.weight": "model-00006-of-00006.safetensors",
230
+ "model.layers.30.self_attn.o_proj.weight": "model-00006-of-00006.safetensors",
231
+ "model.layers.30.self_attn.q_proj.weight": "model-00006-of-00006.safetensors",
232
+ "model.layers.30.self_attn.v_proj.weight": "model-00006-of-00006.safetensors",
233
+ "model.layers.31.input_layernorm.weight": "model-00006-of-00006.safetensors",
234
+ "model.layers.31.mlp.down_proj.weight": "model-00006-of-00006.safetensors",
235
+ "model.layers.31.mlp.gate_proj.weight": "model-00006-of-00006.safetensors",
236
+ "model.layers.31.mlp.up_proj.weight": "model-00006-of-00006.safetensors",
237
+ "model.layers.31.post_attention_layernorm.weight": "model-00006-of-00006.safetensors",
238
+ "model.layers.31.self_attn.k_proj.weight": "model-00006-of-00006.safetensors",
239
+ "model.layers.31.self_attn.o_proj.weight": "model-00006-of-00006.safetensors",
240
+ "model.layers.31.self_attn.q_proj.weight": "model-00006-of-00006.safetensors",
241
+ "model.layers.31.self_attn.v_proj.weight": "model-00006-of-00006.safetensors",
242
+ "model.layers.4.input_layernorm.weight": "model-00001-of-00006.safetensors",
243
+ "model.layers.4.mlp.down_proj.weight": "model-00001-of-00006.safetensors",
244
+ "model.layers.4.mlp.gate_proj.weight": "model-00001-of-00006.safetensors",
245
+ "model.layers.4.mlp.up_proj.weight": "model-00001-of-00006.safetensors",
246
+ "model.layers.4.post_attention_layernorm.weight": "model-00001-of-00006.safetensors",
247
+ "model.layers.4.self_attn.k_proj.weight": "model-00001-of-00006.safetensors",
248
+ "model.layers.4.self_attn.o_proj.weight": "model-00001-of-00006.safetensors",
249
+ "model.layers.4.self_attn.q_proj.weight": "model-00001-of-00006.safetensors",
250
+ "model.layers.4.self_attn.v_proj.weight": "model-00001-of-00006.safetensors",
251
+ "model.layers.5.input_layernorm.weight": "model-00002-of-00006.safetensors",
252
+ "model.layers.5.mlp.down_proj.weight": "model-00002-of-00006.safetensors",
253
+ "model.layers.5.mlp.gate_proj.weight": "model-00002-of-00006.safetensors",
254
+ "model.layers.5.mlp.up_proj.weight": "model-00002-of-00006.safetensors",
255
+ "model.layers.5.post_attention_layernorm.weight": "model-00002-of-00006.safetensors",
256
+ "model.layers.5.self_attn.k_proj.weight": "model-00001-of-00006.safetensors",
257
+ "model.layers.5.self_attn.o_proj.weight": "model-00001-of-00006.safetensors",
258
+ "model.layers.5.self_attn.q_proj.weight": "model-00001-of-00006.safetensors",
259
+ "model.layers.5.self_attn.v_proj.weight": "model-00001-of-00006.safetensors",
260
+ "model.layers.6.input_layernorm.weight": "model-00002-of-00006.safetensors",
261
+ "model.layers.6.mlp.down_proj.weight": "model-00002-of-00006.safetensors",
262
+ "model.layers.6.mlp.gate_proj.weight": "model-00002-of-00006.safetensors",
263
+ "model.layers.6.mlp.up_proj.weight": "model-00002-of-00006.safetensors",
264
+ "model.layers.6.post_attention_layernorm.weight": "model-00002-of-00006.safetensors",
265
+ "model.layers.6.self_attn.k_proj.weight": "model-00002-of-00006.safetensors",
266
+ "model.layers.6.self_attn.o_proj.weight": "model-00002-of-00006.safetensors",
267
+ "model.layers.6.self_attn.q_proj.weight": "model-00002-of-00006.safetensors",
268
+ "model.layers.6.self_attn.v_proj.weight": "model-00002-of-00006.safetensors",
269
+ "model.layers.7.input_layernorm.weight": "model-00002-of-00006.safetensors",
270
+ "model.layers.7.mlp.down_proj.weight": "model-00002-of-00006.safetensors",
271
+ "model.layers.7.mlp.gate_proj.weight": "model-00002-of-00006.safetensors",
272
+ "model.layers.7.mlp.up_proj.weight": "model-00002-of-00006.safetensors",
273
+ "model.layers.7.post_attention_layernorm.weight": "model-00002-of-00006.safetensors",
274
+ "model.layers.7.self_attn.k_proj.weight": "model-00002-of-00006.safetensors",
275
+ "model.layers.7.self_attn.o_proj.weight": "model-00002-of-00006.safetensors",
276
+ "model.layers.7.self_attn.q_proj.weight": "model-00002-of-00006.safetensors",
277
+ "model.layers.7.self_attn.v_proj.weight": "model-00002-of-00006.safetensors",
278
+ "model.layers.8.input_layernorm.weight": "model-00002-of-00006.safetensors",
279
+ "model.layers.8.mlp.down_proj.weight": "model-00002-of-00006.safetensors",
280
+ "model.layers.8.mlp.gate_proj.weight": "model-00002-of-00006.safetensors",
281
+ "model.layers.8.mlp.up_proj.weight": "model-00002-of-00006.safetensors",
282
+ "model.layers.8.post_attention_layernorm.weight": "model-00002-of-00006.safetensors",
283
+ "model.layers.8.self_attn.k_proj.weight": "model-00002-of-00006.safetensors",
284
+ "model.layers.8.self_attn.o_proj.weight": "model-00002-of-00006.safetensors",
285
+ "model.layers.8.self_attn.q_proj.weight": "model-00002-of-00006.safetensors",
286
+ "model.layers.8.self_attn.v_proj.weight": "model-00002-of-00006.safetensors",
287
+ "model.layers.9.input_layernorm.weight": "model-00002-of-00006.safetensors",
288
+ "model.layers.9.mlp.down_proj.weight": "model-00002-of-00006.safetensors",
289
+ "model.layers.9.mlp.gate_proj.weight": "model-00002-of-00006.safetensors",
290
+ "model.layers.9.mlp.up_proj.weight": "model-00002-of-00006.safetensors",
291
+ "model.layers.9.post_attention_layernorm.weight": "model-00002-of-00006.safetensors",
292
+ "model.layers.9.self_attn.k_proj.weight": "model-00002-of-00006.safetensors",
293
+ "model.layers.9.self_attn.o_proj.weight": "model-00002-of-00006.safetensors",
294
+ "model.layers.9.self_attn.q_proj.weight": "model-00002-of-00006.safetensors",
295
+ "model.layers.9.self_attn.v_proj.weight": "model-00002-of-00006.safetensors",
296
+ "model.norm.weight": "model-00006-of-00006.safetensors"
297
  }
298
  }
truthfulqa_mc.json CHANGED
@@ -1,20 +1,20 @@
1
  {
2
  "results": {
3
  "truthfulqa_mc": {
4
- "mc1": 0.29008567931456547,
5
- "mc1_stderr": 0.01588623687420952,
6
- "mc2": 0.44404926558063107,
7
- "mc2_stderr": 0.015030682044375404
8
  }
9
  },
10
  "versions": {
11
  "truthfulqa_mc": 1
12
  },
13
  "config": {
14
- "model": "sparseml",
15
- "model_args": "pretrained=/nm/drive1/shubhra/cerebras/experiments/spft-retrained_sparse50_llama2_DATAUpdated_KDFalse_GCTrue_LR1e-4_E2-/combined/,trust_remote_code=True",
16
  "num_fewshot": 0,
17
- "batch_size": "64",
18
  "batch_sizes": [],
19
  "device": "cuda:0",
20
  "no_cache": true,
 
1
  {
2
  "results": {
3
  "truthfulqa_mc": {
4
+ "mc1": 0.2839657282741738,
5
+ "mc1_stderr": 0.015785370858396725,
6
+ "mc2": 0.4316910007581044,
7
+ "mc2_stderr": 0.014766457929501058
8
  }
9
  },
10
  "versions": {
11
  "truthfulqa_mc": 1
12
  },
13
  "config": {
14
+ "model": "hf",
15
+ "model_args": "pretrained=/nm/drive1/shubhra/cerebras/experiments/spft-cerebras_llama2_sparse50_45B_platypus_dolphin_KDFalse_GCTrue_LR1e-4_E2/combined/,trust_remote_code=True,dtype=bfloat16",
16
  "num_fewshot": 0,
17
+ "batch_size": "128",
18
  "batch_sizes": [],
19
  "device": "cuda:0",
20
  "no_cache": true,
winogrande.json CHANGED
@@ -1,20 +1,20 @@
1
  {
2
  "results": {
3
  "winogrande": {
4
- "acc": 0.7261247040252565,
5
- "acc_stderr": 0.012533292732620292
6
  }
7
  },
8
  "versions": {
9
  "winogrande": 0
10
  },
11
  "config": {
12
- "model": "sparseml",
13
- "model_args": "pretrained=/nm/drive1/shubhra/cerebras/experiments/spft-retrained_sparse50_llama2_DATAUpdated_KDFalse_GCTrue_LR1e-4_E2-/combined/,trust_remote_code=True",
14
  "num_fewshot": 5,
15
- "batch_size": "64",
16
  "batch_sizes": [],
17
- "device": null,
18
  "no_cache": true,
19
  "limit": null,
20
  "bootstrap_iters": 100000,
 
1
  {
2
  "results": {
3
  "winogrande": {
4
+ "acc": 0.7324388318863457,
5
+ "acc_stderr": 0.012441718456893009
6
  }
7
  },
8
  "versions": {
9
  "winogrande": 0
10
  },
11
  "config": {
12
+ "model": "hf",
13
+ "model_args": "pretrained=/nm/drive1/shubhra/cerebras/experiments/spft-cerebras_llama2_sparse50_45B_platypus_dolphin_KDFalse_GCTrue_LR1e-4_E2/combined/,trust_remote_code=True,dtype=bfloat16",
14
  "num_fewshot": 5,
15
+ "batch_size": "128",
16
  "batch_sizes": [],
17
+ "device": "cuda:0",
18
  "no_cache": true,
19
  "limit": null,
20
  "bootstrap_iters": 100000,