wcyat commited on
Commit
393b3f4
1 Parent(s): 1d4db4d

Training in progress, step 500, checkpoint

Browse files
last-checkpoint/config.json CHANGED
@@ -9,16 +9,8 @@
9
  "hidden_act": "gelu",
10
  "hidden_dropout_prob": 0.1,
11
  "hidden_size": 768,
12
- "id2label": {
13
- "0": "non-suicide",
14
- "1": "suicide"
15
- },
16
  "initializer_range": 0.02,
17
  "intermediate_size": 3072,
18
- "label2id": {
19
- "non-suicide": 0,
20
- "suicide": 1
21
- },
22
  "layer_norm_eps": 1e-12,
23
  "max_position_embeddings": 512,
24
  "model_type": "bert",
@@ -33,7 +25,7 @@
33
  "position_embedding_type": "absolute",
34
  "problem_type": "single_label_classification",
35
  "torch_dtype": "float32",
36
- "transformers_version": "4.41.1",
37
  "type_vocab_size": 2,
38
  "use_cache": true,
39
  "vocab_size": 21628
 
9
  "hidden_act": "gelu",
10
  "hidden_dropout_prob": 0.1,
11
  "hidden_size": 768,
 
 
 
 
12
  "initializer_range": 0.02,
13
  "intermediate_size": 3072,
 
 
 
 
14
  "layer_norm_eps": 1e-12,
15
  "max_position_embeddings": 512,
16
  "model_type": "bert",
 
25
  "position_embedding_type": "absolute",
26
  "problem_type": "single_label_classification",
27
  "torch_dtype": "float32",
28
+ "transformers_version": "4.41.2",
29
  "type_vocab_size": 2,
30
  "use_cache": true,
31
  "vocab_size": 21628
last-checkpoint/model.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:b0cae98f2984553c6f685f9e8e28167815d01df53d3174ff7b9c94d82e636094
3
  size 410636248
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:e6e214c04195e499ad3c1f1101a010a1d7e5fbaf3db25a7310fd72a63494bbb2
3
  size 410636248
last-checkpoint/optimizer.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:5c519129fe07356101db3da3276238cfbfe3764e3ce9cfb856d2128fe131bc3e
3
  size 821393658
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:a5a60bc7661f70bd637dcf01a52362f3f33ca1187127a2d80be61d3e4cfa6474
3
  size 821393658
last-checkpoint/rng_state.pth CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:347fd2016e0db3c1f0b839dd747224fc90878594d016b9aee34d4fc23c72b0d4
3
  size 14244
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:3b2be66fc445ad5b9b95c74f00e2829a3ab00fd604aa9b735f80f31e8d67a8a3
3
  size 14244
last-checkpoint/scheduler.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:db1fe5f37124dbfc634cbe41dc7f546d88bc742d43b693450f047c48ecddd6a7
3
  size 1064
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:e3ab3fae6bcbaddd22accfbc804c2c716cc71d2097adfaaa313faeb8455aa736
3
  size 1064
last-checkpoint/trainer_state.json CHANGED
@@ -1,7 +1,7 @@
1
  {
2
- "best_metric": 0.00016613505431450903,
3
  "best_model_checkpoint": "./results/checkpoint-500",
4
- "epoch": 4.62962962962963,
5
  "eval_steps": 20,
6
  "global_step": 500,
7
  "is_hyper_param_search": false,
@@ -9,408 +9,408 @@
9
  "is_world_process_zero": true,
10
  "log_history": [
11
  {
12
- "epoch": 0.18518518518518517,
13
- "grad_norm": 15.828068733215332,
14
- "learning_rate": 1.925925925925926e-05,
15
- "loss": 0.3468,
16
  "step": 20
17
  },
18
  {
19
- "epoch": 0.18518518518518517,
20
- "eval_accuracy": 0.9019607843137255,
21
- "eval_loss": 0.20517832040786743,
22
- "eval_runtime": 0.628,
23
- "eval_samples_per_second": 81.206,
24
- "eval_steps_per_second": 20.699,
25
  "step": 20
26
  },
27
  {
28
- "epoch": 0.37037037037037035,
29
- "grad_norm": 9.892717361450195,
30
- "learning_rate": 1.851851851851852e-05,
31
- "loss": 0.2089,
32
  "step": 40
33
  },
34
  {
35
- "epoch": 0.37037037037037035,
36
- "eval_accuracy": 0.9215686274509803,
37
- "eval_loss": 0.12760986387729645,
38
- "eval_runtime": 0.6405,
39
- "eval_samples_per_second": 79.628,
40
- "eval_steps_per_second": 20.297,
41
  "step": 40
42
  },
43
  {
44
- "epoch": 0.5555555555555556,
45
- "grad_norm": 0.21783480048179626,
46
- "learning_rate": 1.7777777777777777e-05,
47
- "loss": 0.214,
48
  "step": 60
49
  },
50
  {
51
- "epoch": 0.5555555555555556,
52
- "eval_accuracy": 0.9803921568627451,
53
- "eval_loss": 0.02514837123453617,
54
- "eval_runtime": 0.632,
55
- "eval_samples_per_second": 80.694,
56
- "eval_steps_per_second": 20.569,
57
  "step": 60
58
  },
59
  {
60
- "epoch": 0.7407407407407407,
61
- "grad_norm": 0.05077819526195526,
62
- "learning_rate": 1.7037037037037038e-05,
63
- "loss": 0.0807,
64
  "step": 80
65
  },
66
  {
67
- "epoch": 0.7407407407407407,
68
- "eval_accuracy": 0.9803921568627451,
69
- "eval_loss": 0.016518110409379005,
70
- "eval_runtime": 0.629,
71
- "eval_samples_per_second": 81.079,
72
- "eval_steps_per_second": 20.667,
73
  "step": 80
74
  },
75
  {
76
- "epoch": 0.9259259259259259,
77
- "grad_norm": 0.04394271969795227,
78
- "learning_rate": 1.6296296296296297e-05,
79
- "loss": 0.1064,
80
  "step": 100
81
  },
82
  {
83
- "epoch": 0.9259259259259259,
84
- "eval_accuracy": 1.0,
85
- "eval_loss": 0.0030651080887764692,
86
- "eval_runtime": 0.6365,
87
- "eval_samples_per_second": 80.124,
88
- "eval_steps_per_second": 20.424,
89
  "step": 100
90
  },
91
  {
92
- "epoch": 1.1111111111111112,
93
- "grad_norm": 0.08775356411933899,
94
- "learning_rate": 1.555555555555556e-05,
95
- "loss": 0.0134,
96
  "step": 120
97
  },
98
  {
99
- "epoch": 1.1111111111111112,
100
- "eval_accuracy": 1.0,
101
- "eval_loss": 0.002504199743270874,
102
- "eval_runtime": 0.6271,
103
- "eval_samples_per_second": 81.321,
104
- "eval_steps_per_second": 20.729,
105
  "step": 120
106
  },
107
  {
108
- "epoch": 1.2962962962962963,
109
- "grad_norm": 0.028400583192706108,
110
- "learning_rate": 1.4814814814814815e-05,
111
- "loss": 0.0349,
112
  "step": 140
113
  },
114
  {
115
- "epoch": 1.2962962962962963,
116
- "eval_accuracy": 1.0,
117
- "eval_loss": 0.0006583676440641284,
118
- "eval_runtime": 0.6233,
119
- "eval_samples_per_second": 81.817,
120
- "eval_steps_per_second": 20.855,
121
  "step": 140
122
  },
123
  {
124
- "epoch": 1.4814814814814814,
125
- "grad_norm": 0.00876949355006218,
126
- "learning_rate": 1.4074074074074075e-05,
127
- "loss": 0.0819,
128
  "step": 160
129
  },
130
  {
131
- "epoch": 1.4814814814814814,
132
- "eval_accuracy": 0.9607843137254902,
133
- "eval_loss": 0.07063630223274231,
134
- "eval_runtime": 0.6218,
135
- "eval_samples_per_second": 82.021,
136
- "eval_steps_per_second": 20.907,
137
  "step": 160
138
  },
139
  {
140
- "epoch": 1.6666666666666665,
141
- "grad_norm": 0.014950844459235668,
142
- "learning_rate": 1.3333333333333333e-05,
143
- "loss": 0.0586,
144
  "step": 180
145
  },
146
  {
147
- "epoch": 1.6666666666666665,
148
- "eval_accuracy": 1.0,
149
- "eval_loss": 0.0005163149326108396,
150
- "eval_runtime": 0.6311,
151
- "eval_samples_per_second": 80.809,
152
- "eval_steps_per_second": 20.598,
153
  "step": 180
154
  },
155
  {
156
- "epoch": 1.8518518518518519,
157
- "grad_norm": 0.01813225820660591,
158
- "learning_rate": 1.2592592592592593e-05,
159
- "loss": 0.0368,
160
  "step": 200
161
  },
162
  {
163
- "epoch": 1.8518518518518519,
164
- "eval_accuracy": 1.0,
165
- "eval_loss": 0.0004809926904272288,
166
- "eval_runtime": 0.6213,
167
- "eval_samples_per_second": 82.09,
168
- "eval_steps_per_second": 20.925,
169
  "step": 200
170
  },
171
  {
172
- "epoch": 2.037037037037037,
173
- "grad_norm": 0.018220530822873116,
174
- "learning_rate": 1.1851851851851852e-05,
175
- "loss": 0.0485,
176
  "step": 220
177
  },
178
  {
179
- "epoch": 2.037037037037037,
180
- "eval_accuracy": 1.0,
181
- "eval_loss": 0.003055064007639885,
182
- "eval_runtime": 0.6259,
183
- "eval_samples_per_second": 81.483,
184
- "eval_steps_per_second": 20.77,
185
  "step": 220
186
  },
187
  {
188
- "epoch": 2.2222222222222223,
189
- "grad_norm": 0.012761042453348637,
190
- "learning_rate": 1.1111111111111113e-05,
191
- "loss": 0.017,
192
  "step": 240
193
  },
194
  {
195
- "epoch": 2.2222222222222223,
196
- "eval_accuracy": 1.0,
197
- "eval_loss": 0.0003475048579275608,
198
- "eval_runtime": 0.6263,
199
- "eval_samples_per_second": 81.436,
200
- "eval_steps_per_second": 20.758,
201
  "step": 240
202
  },
203
  {
204
- "epoch": 2.4074074074074074,
205
- "grad_norm": 0.012553339824080467,
206
- "learning_rate": 1.037037037037037e-05,
207
- "loss": 0.0017,
208
  "step": 260
209
  },
210
  {
211
- "epoch": 2.4074074074074074,
212
- "eval_accuracy": 1.0,
213
- "eval_loss": 0.0020730593241751194,
214
- "eval_runtime": 0.623,
215
- "eval_samples_per_second": 81.864,
216
- "eval_steps_per_second": 20.867,
217
  "step": 260
218
  },
219
  {
220
- "epoch": 2.5925925925925926,
221
- "grad_norm": 0.009718580171465874,
222
- "learning_rate": 9.62962962962963e-06,
223
- "loss": 0.0004,
224
  "step": 280
225
  },
226
  {
227
- "epoch": 2.5925925925925926,
228
- "eval_accuracy": 1.0,
229
- "eval_loss": 0.0004216369998175651,
230
- "eval_runtime": 0.6294,
231
- "eval_samples_per_second": 81.028,
232
- "eval_steps_per_second": 20.654,
233
  "step": 280
234
  },
235
  {
236
- "epoch": 2.7777777777777777,
237
- "grad_norm": 0.009635565802454948,
238
- "learning_rate": 8.888888888888888e-06,
239
- "loss": 0.0444,
240
  "step": 300
241
  },
242
  {
243
- "epoch": 2.7777777777777777,
244
- "eval_accuracy": 1.0,
245
- "eval_loss": 0.00025334919337183237,
246
- "eval_runtime": 0.6299,
247
- "eval_samples_per_second": 80.96,
248
- "eval_steps_per_second": 20.637,
249
  "step": 300
250
  },
251
  {
252
- "epoch": 2.962962962962963,
253
- "grad_norm": 0.005175964906811714,
254
- "learning_rate": 8.148148148148148e-06,
255
- "loss": 0.0005,
256
  "step": 320
257
  },
258
  {
259
- "epoch": 2.962962962962963,
260
- "eval_accuracy": 1.0,
261
- "eval_loss": 0.00028179946821182966,
262
- "eval_runtime": 0.6242,
263
- "eval_samples_per_second": 81.709,
264
- "eval_steps_per_second": 20.828,
265
  "step": 320
266
  },
267
  {
268
- "epoch": 3.148148148148148,
269
- "grad_norm": 0.008158649317920208,
270
- "learning_rate": 7.4074074074074075e-06,
271
- "loss": 0.0003,
272
  "step": 340
273
  },
274
  {
275
- "epoch": 3.148148148148148,
276
- "eval_accuracy": 1.0,
277
- "eval_loss": 0.0002456614456605166,
278
- "eval_runtime": 0.6258,
279
- "eval_samples_per_second": 81.502,
280
- "eval_steps_per_second": 20.775,
281
  "step": 340
282
  },
283
  {
284
- "epoch": 3.3333333333333335,
285
- "grad_norm": 0.004232426173985004,
286
- "learning_rate": 6.666666666666667e-06,
287
- "loss": 0.0238,
288
  "step": 360
289
  },
290
  {
291
- "epoch": 3.3333333333333335,
292
- "eval_accuracy": 1.0,
293
- "eval_loss": 0.00026214588433504105,
294
- "eval_runtime": 0.6234,
295
- "eval_samples_per_second": 81.814,
296
- "eval_steps_per_second": 20.855,
297
  "step": 360
298
  },
299
  {
300
- "epoch": 3.5185185185185186,
301
- "grad_norm": 0.004335971549153328,
302
- "learning_rate": 5.925925925925926e-06,
303
- "loss": 0.0003,
304
  "step": 380
305
  },
306
  {
307
- "epoch": 3.5185185185185186,
308
- "eval_accuracy": 1.0,
309
- "eval_loss": 0.0002457990194670856,
310
- "eval_runtime": 0.6226,
311
- "eval_samples_per_second": 81.91,
312
- "eval_steps_per_second": 20.879,
313
  "step": 380
314
  },
315
  {
316
- "epoch": 3.7037037037037037,
317
- "grad_norm": 0.007228133734315634,
318
- "learning_rate": 5.185185185185185e-06,
319
- "loss": 0.0003,
320
  "step": 400
321
  },
322
  {
323
- "epoch": 3.7037037037037037,
324
- "eval_accuracy": 1.0,
325
- "eval_loss": 0.0002311759744770825,
326
- "eval_runtime": 0.6205,
327
- "eval_samples_per_second": 82.185,
328
- "eval_steps_per_second": 20.949,
329
  "step": 400
330
  },
331
  {
332
- "epoch": 3.888888888888889,
333
- "grad_norm": 0.004542021080851555,
334
- "learning_rate": 4.444444444444444e-06,
335
- "loss": 0.0003,
336
  "step": 420
337
  },
338
  {
339
- "epoch": 3.888888888888889,
340
- "eval_accuracy": 1.0,
341
- "eval_loss": 0.0002200859016738832,
342
- "eval_runtime": 0.6236,
343
- "eval_samples_per_second": 81.782,
344
- "eval_steps_per_second": 20.846,
345
  "step": 420
346
  },
347
  {
348
- "epoch": 4.074074074074074,
349
- "grad_norm": 0.004315598402172327,
350
- "learning_rate": 3.7037037037037037e-06,
351
- "loss": 0.0007,
352
  "step": 440
353
  },
354
  {
355
- "epoch": 4.074074074074074,
356
- "eval_accuracy": 1.0,
357
- "eval_loss": 0.0001800174795789644,
358
- "eval_runtime": 0.6201,
359
- "eval_samples_per_second": 82.247,
360
- "eval_steps_per_second": 20.965,
361
  "step": 440
362
  },
363
  {
364
- "epoch": 4.2592592592592595,
365
- "grad_norm": 0.014702328480780125,
366
- "learning_rate": 2.962962962962963e-06,
367
- "loss": 0.0002,
368
  "step": 460
369
  },
370
  {
371
- "epoch": 4.2592592592592595,
372
- "eval_accuracy": 1.0,
373
- "eval_loss": 0.00017176676192320883,
374
- "eval_runtime": 0.6211,
375
- "eval_samples_per_second": 82.115,
376
- "eval_steps_per_second": 20.931,
377
  "step": 460
378
  },
379
  {
380
- "epoch": 4.444444444444445,
381
- "grad_norm": 0.005943182855844498,
382
- "learning_rate": 2.222222222222222e-06,
383
- "loss": 0.0003,
384
  "step": 480
385
  },
386
  {
387
- "epoch": 4.444444444444445,
388
- "eval_accuracy": 1.0,
389
- "eval_loss": 0.00016821030294522643,
390
- "eval_runtime": 0.6229,
391
- "eval_samples_per_second": 81.877,
392
- "eval_steps_per_second": 20.871,
393
  "step": 480
394
  },
395
  {
396
- "epoch": 4.62962962962963,
397
- "grad_norm": 0.036252692341804504,
398
- "learning_rate": 1.4814814814814815e-06,
399
- "loss": 0.0002,
400
  "step": 500
401
  },
402
  {
403
- "epoch": 4.62962962962963,
404
- "eval_accuracy": 1.0,
405
- "eval_loss": 0.00016613505431450903,
406
- "eval_runtime": 0.627,
407
- "eval_samples_per_second": 81.342,
408
- "eval_steps_per_second": 20.734,
409
  "step": 500
410
  }
411
  ],
412
  "logging_steps": 20,
413
- "max_steps": 540,
414
  "num_input_tokens_seen": 0,
415
  "num_train_epochs": 5,
416
  "save_steps": 500,
@@ -426,7 +426,7 @@
426
  "attributes": {}
427
  }
428
  },
429
- "total_flos": 411116162887800.0,
430
  "train_batch_size": 4,
431
  "trial_name": null,
432
  "trial_params": null
 
1
  {
2
+ "best_metric": 0.11589507758617401,
3
  "best_model_checkpoint": "./results/checkpoint-500",
4
+ "epoch": 2.4038461538461537,
5
  "eval_steps": 20,
6
  "global_step": 500,
7
  "is_hyper_param_search": false,
 
9
  "is_world_process_zero": true,
10
  "log_history": [
11
  {
12
+ "epoch": 0.09615384615384616,
13
+ "grad_norm": 2.879941701889038,
14
+ "learning_rate": 1.9615384615384617e-05,
15
+ "loss": 0.5169,
16
  "step": 20
17
  },
18
  {
19
+ "epoch": 0.09615384615384616,
20
+ "eval_accuracy": 0.8064516129032258,
21
+ "eval_loss": 0.43645337224006653,
22
+ "eval_runtime": 2.8523,
23
+ "eval_samples_per_second": 32.606,
24
+ "eval_steps_per_second": 8.414,
25
  "step": 20
26
  },
27
  {
28
+ "epoch": 0.19230769230769232,
29
+ "grad_norm": 0.7589161396026611,
30
+ "learning_rate": 1.923076923076923e-05,
31
+ "loss": 0.3487,
32
  "step": 40
33
  },
34
  {
35
+ "epoch": 0.19230769230769232,
36
+ "eval_accuracy": 0.8602150537634409,
37
+ "eval_loss": 0.32320985198020935,
38
+ "eval_runtime": 2.9393,
39
+ "eval_samples_per_second": 31.641,
40
+ "eval_steps_per_second": 8.165,
41
  "step": 40
42
  },
43
  {
44
+ "epoch": 0.28846153846153844,
45
+ "grad_norm": 22.88661766052246,
46
+ "learning_rate": 1.8846153846153846e-05,
47
+ "loss": 0.1785,
48
  "step": 60
49
  },
50
  {
51
+ "epoch": 0.28846153846153844,
52
+ "eval_accuracy": 0.946236559139785,
53
+ "eval_loss": 0.1469801366329193,
54
+ "eval_runtime": 3.241,
55
+ "eval_samples_per_second": 28.695,
56
+ "eval_steps_per_second": 7.405,
57
  "step": 60
58
  },
59
  {
60
+ "epoch": 0.38461538461538464,
61
+ "grad_norm": 0.04444374889135361,
62
+ "learning_rate": 1.8461538461538465e-05,
63
+ "loss": 0.192,
64
  "step": 80
65
  },
66
  {
67
+ "epoch": 0.38461538461538464,
68
+ "eval_accuracy": 0.956989247311828,
69
+ "eval_loss": 0.12639252841472626,
70
+ "eval_runtime": 2.8312,
71
+ "eval_samples_per_second": 32.849,
72
+ "eval_steps_per_second": 8.477,
73
  "step": 80
74
  },
75
  {
76
+ "epoch": 0.4807692307692308,
77
+ "grad_norm": 1.9381160736083984,
78
+ "learning_rate": 1.807692307692308e-05,
79
+ "loss": 0.0329,
80
  "step": 100
81
  },
82
  {
83
+ "epoch": 0.4807692307692308,
84
+ "eval_accuracy": 0.956989247311828,
85
+ "eval_loss": 0.18568147718906403,
86
+ "eval_runtime": 2.7388,
87
+ "eval_samples_per_second": 33.956,
88
+ "eval_steps_per_second": 8.763,
89
  "step": 100
90
  },
91
  {
92
+ "epoch": 0.5769230769230769,
93
+ "grad_norm": 0.0944330021739006,
94
+ "learning_rate": 1.7692307692307694e-05,
95
+ "loss": 0.1432,
96
  "step": 120
97
  },
98
  {
99
+ "epoch": 0.5769230769230769,
100
+ "eval_accuracy": 0.9247311827956989,
101
+ "eval_loss": 0.2023310661315918,
102
+ "eval_runtime": 2.6839,
103
+ "eval_samples_per_second": 34.651,
104
+ "eval_steps_per_second": 8.942,
105
  "step": 120
106
  },
107
  {
108
+ "epoch": 0.6730769230769231,
109
+ "grad_norm": 36.735008239746094,
110
+ "learning_rate": 1.730769230769231e-05,
111
+ "loss": 0.3458,
112
  "step": 140
113
  },
114
  {
115
+ "epoch": 0.6730769230769231,
116
+ "eval_accuracy": 0.967741935483871,
117
+ "eval_loss": 0.12693293392658234,
118
+ "eval_runtime": 2.6852,
119
+ "eval_samples_per_second": 34.634,
120
+ "eval_steps_per_second": 8.938,
121
  "step": 140
122
  },
123
  {
124
+ "epoch": 0.7692307692307693,
125
+ "grad_norm": 13.383593559265137,
126
+ "learning_rate": 1.6923076923076924e-05,
127
+ "loss": 0.234,
128
  "step": 160
129
  },
130
  {
131
+ "epoch": 0.7692307692307693,
132
+ "eval_accuracy": 0.946236559139785,
133
+ "eval_loss": 0.15099573135375977,
134
+ "eval_runtime": 2.7092,
135
+ "eval_samples_per_second": 34.327,
136
+ "eval_steps_per_second": 8.859,
137
  "step": 160
138
  },
139
  {
140
+ "epoch": 0.8653846153846154,
141
+ "grad_norm": 0.04881865903735161,
142
+ "learning_rate": 1.653846153846154e-05,
143
+ "loss": 0.179,
144
  "step": 180
145
  },
146
  {
147
+ "epoch": 0.8653846153846154,
148
+ "eval_accuracy": 0.967741935483871,
149
+ "eval_loss": 0.11019934713840485,
150
+ "eval_runtime": 2.7609,
151
+ "eval_samples_per_second": 33.685,
152
+ "eval_steps_per_second": 8.693,
153
  "step": 180
154
  },
155
  {
156
+ "epoch": 0.9615384615384616,
157
+ "grad_norm": 0.1799299567937851,
158
+ "learning_rate": 1.6153846153846154e-05,
159
+ "loss": 0.1873,
160
  "step": 200
161
  },
162
  {
163
+ "epoch": 0.9615384615384616,
164
+ "eval_accuracy": 0.956989247311828,
165
+ "eval_loss": 0.11402961611747742,
166
+ "eval_runtime": 2.7766,
167
+ "eval_samples_per_second": 33.494,
168
+ "eval_steps_per_second": 8.644,
169
  "step": 200
170
  },
171
  {
172
+ "epoch": 1.0576923076923077,
173
+ "grad_norm": 0.45117759704589844,
174
+ "learning_rate": 1.576923076923077e-05,
175
+ "loss": 0.1519,
176
  "step": 220
177
  },
178
  {
179
+ "epoch": 1.0576923076923077,
180
+ "eval_accuracy": 0.946236559139785,
181
+ "eval_loss": 0.2102302461862564,
182
+ "eval_runtime": 2.7619,
183
+ "eval_samples_per_second": 33.672,
184
+ "eval_steps_per_second": 8.69,
185
  "step": 220
186
  },
187
  {
188
+ "epoch": 1.1538461538461537,
189
+ "grad_norm": 55.769256591796875,
190
+ "learning_rate": 1.5384615384615387e-05,
191
+ "loss": 0.0453,
192
  "step": 240
193
  },
194
  {
195
+ "epoch": 1.1538461538461537,
196
+ "eval_accuracy": 0.9247311827956989,
197
+ "eval_loss": 0.31499457359313965,
198
+ "eval_runtime": 2.7752,
199
+ "eval_samples_per_second": 33.511,
200
+ "eval_steps_per_second": 8.648,
201
  "step": 240
202
  },
203
  {
204
+ "epoch": 1.25,
205
+ "grad_norm": 0.028473777696490288,
206
+ "learning_rate": 1.5000000000000002e-05,
207
+ "loss": 0.0341,
208
  "step": 260
209
  },
210
  {
211
+ "epoch": 1.25,
212
+ "eval_accuracy": 0.956989247311828,
213
+ "eval_loss": 0.1401311606168747,
214
+ "eval_runtime": 2.7614,
215
+ "eval_samples_per_second": 33.678,
216
+ "eval_steps_per_second": 8.691,
217
  "step": 260
218
  },
219
  {
220
+ "epoch": 1.3461538461538463,
221
+ "grad_norm": 0.04881782829761505,
222
+ "learning_rate": 1.4615384615384615e-05,
223
+ "loss": 0.0316,
224
  "step": 280
225
  },
226
  {
227
+ "epoch": 1.3461538461538463,
228
+ "eval_accuracy": 0.967741935483871,
229
+ "eval_loss": 0.16341440379619598,
230
+ "eval_runtime": 2.7702,
231
+ "eval_samples_per_second": 33.572,
232
+ "eval_steps_per_second": 8.664,
233
  "step": 280
234
  },
235
  {
236
+ "epoch": 1.4423076923076923,
237
+ "grad_norm": 0.040301088243722916,
238
+ "learning_rate": 1.4230769230769232e-05,
239
+ "loss": 0.1082,
240
  "step": 300
241
  },
242
  {
243
+ "epoch": 1.4423076923076923,
244
+ "eval_accuracy": 0.978494623655914,
245
+ "eval_loss": 0.10686944425106049,
246
+ "eval_runtime": 2.7502,
247
+ "eval_samples_per_second": 33.816,
248
+ "eval_steps_per_second": 8.727,
249
  "step": 300
250
  },
251
  {
252
+ "epoch": 1.5384615384615383,
253
+ "grad_norm": 27.52263641357422,
254
+ "learning_rate": 1.3846153846153847e-05,
255
+ "loss": 0.0954,
256
  "step": 320
257
  },
258
  {
259
+ "epoch": 1.5384615384615383,
260
+ "eval_accuracy": 0.978494623655914,
261
+ "eval_loss": 0.08186855167150497,
262
+ "eval_runtime": 2.7311,
263
+ "eval_samples_per_second": 34.052,
264
+ "eval_steps_per_second": 8.788,
265
  "step": 320
266
  },
267
  {
268
+ "epoch": 1.6346153846153846,
269
+ "grad_norm": 0.013332587666809559,
270
+ "learning_rate": 1.3461538461538463e-05,
271
+ "loss": 0.0472,
272
  "step": 340
273
  },
274
  {
275
+ "epoch": 1.6346153846153846,
276
+ "eval_accuracy": 0.967741935483871,
277
+ "eval_loss": 0.1686173677444458,
278
+ "eval_runtime": 2.733,
279
+ "eval_samples_per_second": 34.029,
280
+ "eval_steps_per_second": 8.782,
281
  "step": 340
282
  },
283
  {
284
+ "epoch": 1.7307692307692308,
285
+ "grad_norm": 0.05663549154996872,
286
+ "learning_rate": 1.3076923076923078e-05,
287
+ "loss": 0.1563,
288
  "step": 360
289
  },
290
  {
291
+ "epoch": 1.7307692307692308,
292
+ "eval_accuracy": 0.978494623655914,
293
+ "eval_loss": 0.037865716964006424,
294
+ "eval_runtime": 2.7489,
295
+ "eval_samples_per_second": 33.832,
296
+ "eval_steps_per_second": 8.731,
297
  "step": 360
298
  },
299
  {
300
+ "epoch": 1.8269230769230769,
301
+ "grad_norm": 8.364692687988281,
302
+ "learning_rate": 1.2692307692307693e-05,
303
+ "loss": 0.1812,
304
  "step": 380
305
  },
306
  {
307
+ "epoch": 1.8269230769230769,
308
+ "eval_accuracy": 0.967741935483871,
309
+ "eval_loss": 0.12184549123048782,
310
+ "eval_runtime": 2.7376,
311
+ "eval_samples_per_second": 33.971,
312
+ "eval_steps_per_second": 8.767,
313
  "step": 380
314
  },
315
  {
316
+ "epoch": 1.9230769230769231,
317
+ "grad_norm": 0.173508420586586,
318
+ "learning_rate": 1.230769230769231e-05,
319
+ "loss": 0.1276,
320
  "step": 400
321
  },
322
  {
323
+ "epoch": 1.9230769230769231,
324
+ "eval_accuracy": 0.989247311827957,
325
+ "eval_loss": 0.07845792919397354,
326
+ "eval_runtime": 2.7605,
327
+ "eval_samples_per_second": 33.689,
328
+ "eval_steps_per_second": 8.694,
329
  "step": 400
330
  },
331
  {
332
+ "epoch": 2.019230769230769,
333
+ "grad_norm": 0.009747051633894444,
334
+ "learning_rate": 1.1923076923076925e-05,
335
+ "loss": 0.0772,
336
  "step": 420
337
  },
338
  {
339
+ "epoch": 2.019230769230769,
340
+ "eval_accuracy": 0.989247311827957,
341
+ "eval_loss": 0.07875293493270874,
342
+ "eval_runtime": 2.7654,
343
+ "eval_samples_per_second": 33.63,
344
+ "eval_steps_per_second": 8.679,
345
  "step": 420
346
  },
347
  {
348
+ "epoch": 2.1153846153846154,
349
+ "grad_norm": 0.017300551757216454,
350
+ "learning_rate": 1.1538461538461538e-05,
351
+ "loss": 0.0022,
352
  "step": 440
353
  },
354
  {
355
+ "epoch": 2.1153846153846154,
356
+ "eval_accuracy": 0.956989247311828,
357
+ "eval_loss": 0.102827288210392,
358
+ "eval_runtime": 2.769,
359
+ "eval_samples_per_second": 33.587,
360
+ "eval_steps_per_second": 8.667,
361
  "step": 440
362
  },
363
  {
364
+ "epoch": 2.2115384615384617,
365
+ "grad_norm": 0.019045885652303696,
366
+ "learning_rate": 1.1153846153846154e-05,
367
+ "loss": 0.0011,
368
  "step": 460
369
  },
370
  {
371
+ "epoch": 2.2115384615384617,
372
+ "eval_accuracy": 0.956989247311828,
373
+ "eval_loss": 0.15620151162147522,
374
+ "eval_runtime": 2.7371,
375
+ "eval_samples_per_second": 33.977,
376
+ "eval_steps_per_second": 8.768,
377
  "step": 460
378
  },
379
  {
380
+ "epoch": 2.3076923076923075,
381
+ "grad_norm": 0.005280839279294014,
382
+ "learning_rate": 1.076923076923077e-05,
383
+ "loss": 0.076,
384
  "step": 480
385
  },
386
  {
387
+ "epoch": 2.3076923076923075,
388
+ "eval_accuracy": 0.978494623655914,
389
+ "eval_loss": 0.12186174839735031,
390
+ "eval_runtime": 2.7523,
391
+ "eval_samples_per_second": 33.789,
392
+ "eval_steps_per_second": 8.72,
393
  "step": 480
394
  },
395
  {
396
+ "epoch": 2.4038461538461537,
397
+ "grad_norm": 0.01168849878013134,
398
+ "learning_rate": 1.0384615384615386e-05,
399
+ "loss": 0.0012,
400
  "step": 500
401
  },
402
  {
403
+ "epoch": 2.4038461538461537,
404
+ "eval_accuracy": 0.956989247311828,
405
+ "eval_loss": 0.11589507758617401,
406
+ "eval_runtime": 2.7518,
407
+ "eval_samples_per_second": 33.796,
408
+ "eval_steps_per_second": 8.721,
409
  "step": 500
410
  }
411
  ],
412
  "logging_steps": 20,
413
+ "max_steps": 1040,
414
  "num_input_tokens_seen": 0,
415
  "num_train_epochs": 5,
416
  "save_steps": 500,
 
426
  "attributes": {}
427
  }
428
  },
429
+ "total_flos": 489118826915220.0,
430
  "train_batch_size": 4,
431
  "trial_name": null,
432
  "trial_params": null
last-checkpoint/training_args.bin CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:82998ebf4315f777d4afb520b9ab57fc977bde0f37ef3c7e7370b36f34e8a485
3
  size 5112
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:27fa1e1529bd8193e84d4f9fcaca6c5833806923edc0d3e169c3b554b74ed8ed
3
  size 5112