GlycerinLOL committed on
Commit
08b60dd
1 Parent(s): 12ab2d3

End of training

Browse files
Files changed (3) hide show
  1. all_results.json +5 -5
  2. train_results.json +5 -5
  3. trainer_state.json +352 -82
all_results.json CHANGED
@@ -1,7 +1,7 @@
1
  {
2
- "epoch": 3.99,
3
- "train_loss": 1.6275942337818634,
4
- "train_runtime": 5933.9925,
5
- "train_samples_per_second": 33.704,
6
- "train_steps_per_second": 0.263
7
  }
 
1
  {
2
+ "epoch": 16.0,
3
+ "train_loss": 1.1567621652086957,
4
+ "train_runtime": 40538.4288,
5
+ "train_samples_per_second": 19.734,
6
+ "train_steps_per_second": 0.206
7
  }
train_results.json CHANGED
@@ -1,7 +1,7 @@
1
  {
2
- "epoch": 3.99,
3
- "train_loss": 1.6275942337818634,
4
- "train_runtime": 5933.9925,
5
- "train_samples_per_second": 33.704,
6
- "train_steps_per_second": 0.263
7
  }
 
1
  {
2
+ "epoch": 16.0,
3
+ "train_loss": 1.1567621652086957,
4
+ "train_runtime": 40538.4288,
5
+ "train_samples_per_second": 19.734,
6
+ "train_steps_per_second": 0.206
7
  }
trainer_state.json CHANGED
@@ -1,112 +1,382 @@
1
  {
2
  "best_metric": null,
3
  "best_model_checkpoint": null,
4
- "epoch": 3.9923224568138194,
5
  "eval_steps": 500,
6
- "global_step": 1560,
7
  "is_hyper_param_search": false,
8
  "is_local_process_zero": true,
9
  "is_world_process_zero": true,
10
  "log_history": [
11
  {
12
- "epoch": 1.0,
13
- "eval_f1": 0.8966,
14
- "eval_gen_len": 19.970909090909092,
15
- "eval_loss": 1.5708835124969482,
16
- "eval_precision": 0.9093,
17
- "eval_recall": 0.8846,
18
- "eval_rouge1": 0.4119,
19
- "eval_rouge2": 0.2002,
20
- "eval_rougeL": 0.3529,
21
- "eval_rougeLsum": 0.3527,
22
- "eval_runtime": 231.9368,
23
- "eval_samples_per_second": 11.857,
24
- "eval_steps_per_second": 0.742,
25
- "step": 390
26
- },
27
- {
28
- "epoch": 1.28,
29
- "learning_rate": 1.3589743589743592e-05,
30
- "loss": 1.8155,
31
  "step": 500
32
  },
33
  {
34
- "epoch": 2.0,
35
- "eval_f1": 0.9003,
36
- "eval_gen_len": 19.918545454545455,
37
- "eval_loss": 1.5360783338546753,
38
- "eval_precision": 0.9123,
39
- "eval_recall": 0.8889,
40
- "eval_rouge1": 0.4331,
41
- "eval_rouge2": 0.2157,
42
- "eval_rougeL": 0.3717,
43
- "eval_rougeLsum": 0.3717,
44
- "eval_runtime": 228.1491,
45
- "eval_samples_per_second": 12.054,
46
- "eval_steps_per_second": 0.754,
47
- "step": 781
48
- },
49
- {
50
- "epoch": 2.56,
51
- "learning_rate": 7.17948717948718e-06,
52
- "loss": 1.5875,
53
  "step": 1000
54
  },
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
55
  {
56
  "epoch": 3.0,
57
- "eval_f1": 0.899,
58
- "eval_gen_len": 19.954545454545453,
59
- "eval_loss": 1.5030012130737305,
60
- "eval_precision": 0.9117,
61
- "eval_recall": 0.8871,
62
- "eval_rouge1": 0.4263,
63
- "eval_rouge2": 0.2129,
64
- "eval_rougeL": 0.3671,
65
- "eval_rougeLsum": 0.3673,
66
- "eval_runtime": 228.8181,
67
- "eval_samples_per_second": 12.018,
68
- "eval_steps_per_second": 0.752,
69
- "step": 1172
70
  },
71
  {
72
  "epoch": 3.84,
73
- "learning_rate": 7.692307692307694e-07,
74
- "loss": 1.4978,
75
- "step": 1500
76
  },
77
  {
78
- "epoch": 3.99,
79
- "eval_f1": 0.9002,
80
- "eval_gen_len": 19.925454545454546,
81
- "eval_loss": 1.499870777130127,
82
- "eval_precision": 0.9125,
83
- "eval_recall": 0.8885,
84
- "eval_rouge1": 0.4331,
85
  "eval_rouge2": 0.2164,
86
- "eval_rougeL": 0.3724,
87
- "eval_rougeLsum": 0.3725,
88
- "eval_runtime": 229.5833,
89
- "eval_samples_per_second": 11.978,
90
- "eval_steps_per_second": 0.749,
91
- "step": 1560
92
- },
93
- {
94
- "epoch": 3.99,
95
- "step": 1560,
96
- "total_flos": 2.163117129644114e+17,
97
- "train_loss": 1.6275942337818634,
98
- "train_runtime": 5933.9925,
99
- "train_samples_per_second": 33.704,
100
- "train_steps_per_second": 0.263
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
101
  }
102
  ],
103
  "logging_steps": 500,
104
- "max_steps": 1560,
105
  "num_input_tokens_seen": 0,
106
- "num_train_epochs": 4,
107
  "save_steps": 500,
108
- "total_flos": 2.163117129644114e+17,
109
- "train_batch_size": 32,
110
  "trial_name": null,
111
  "trial_params": null
112
  }
 
1
  {
2
  "best_metric": null,
3
  "best_model_checkpoint": null,
4
+ "epoch": 16.0,
5
  "eval_steps": 500,
6
+ "global_step": 8336,
7
  "is_hyper_param_search": false,
8
  "is_local_process_zero": true,
9
  "is_world_process_zero": true,
10
  "log_history": [
11
  {
12
+ "epoch": 0.96,
13
+ "learning_rate": 1.880038387715931e-05,
14
+ "loss": 1.836,
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
15
  "step": 500
16
  },
17
  {
18
+ "epoch": 1.0,
19
+ "eval_f1": 0.8971,
20
+ "eval_gen_len": 19.974545454545453,
21
+ "eval_loss": 1.5560153722763062,
22
+ "eval_precision": 0.9105,
23
+ "eval_recall": 0.8843,
24
+ "eval_rouge1": 0.4155,
25
+ "eval_rouge2": 0.2028,
26
+ "eval_rougeL": 0.3561,
27
+ "eval_rougeLsum": 0.3559,
28
+ "eval_runtime": 315.2437,
29
+ "eval_samples_per_second": 8.723,
30
+ "eval_steps_per_second": 0.546,
31
+ "step": 521
32
+ },
33
+ {
34
+ "epoch": 1.92,
35
+ "learning_rate": 1.760076775431862e-05,
36
+ "loss": 1.5951,
37
  "step": 1000
38
  },
39
+ {
40
+ "epoch": 2.0,
41
+ "eval_f1": 0.8997,
42
+ "eval_gen_len": 19.93527272727273,
43
+ "eval_loss": 1.5003960132598877,
44
+ "eval_precision": 0.9115,
45
+ "eval_recall": 0.8886,
46
+ "eval_rouge1": 0.4333,
47
+ "eval_rouge2": 0.2136,
48
+ "eval_rougeL": 0.3695,
49
+ "eval_rougeLsum": 0.3694,
50
+ "eval_runtime": 311.8452,
51
+ "eval_samples_per_second": 8.818,
52
+ "eval_steps_per_second": 0.552,
53
+ "step": 1042
54
+ },
55
+ {
56
+ "epoch": 2.88,
57
+ "learning_rate": 1.6401151631477927e-05,
58
+ "loss": 1.469,
59
+ "step": 1500
60
+ },
61
  {
62
  "epoch": 3.0,
63
+ "eval_f1": 0.9001,
64
+ "eval_gen_len": 19.938545454545455,
65
+ "eval_loss": 1.4690784215927124,
66
+ "eval_precision": 0.912,
67
+ "eval_recall": 0.8888,
68
+ "eval_rouge1": 0.4355,
69
+ "eval_rouge2": 0.2176,
70
+ "eval_rougeL": 0.3729,
71
+ "eval_rougeLsum": 0.3728,
72
+ "eval_runtime": 312.4642,
73
+ "eval_samples_per_second": 8.801,
74
+ "eval_steps_per_second": 0.55,
75
+ "step": 1563
76
  },
77
  {
78
  "epoch": 3.84,
79
+ "learning_rate": 1.5201535508637238e-05,
80
+ "loss": 1.373,
81
+ "step": 2000
82
  },
83
  {
84
+ "epoch": 4.0,
85
+ "eval_f1": 0.9003,
86
+ "eval_gen_len": 19.964727272727274,
87
+ "eval_loss": 1.4657667875289917,
88
+ "eval_precision": 0.9137,
89
+ "eval_recall": 0.8877,
90
+ "eval_rouge1": 0.4311,
91
  "eval_rouge2": 0.2164,
92
+ "eval_rougeL": 0.3706,
93
+ "eval_rougeLsum": 0.3704,
94
+ "eval_runtime": 313.2326,
95
+ "eval_samples_per_second": 8.779,
96
+ "eval_steps_per_second": 0.549,
97
+ "step": 2084
98
+ },
99
+ {
100
+ "epoch": 4.8,
101
+ "learning_rate": 1.4001919385796546e-05,
102
+ "loss": 1.2902,
103
+ "step": 2500
104
+ },
105
+ {
106
+ "epoch": 5.0,
107
+ "eval_f1": 0.9008,
108
+ "eval_gen_len": 19.94981818181818,
109
+ "eval_loss": 1.4541645050048828,
110
+ "eval_precision": 0.9136,
111
+ "eval_recall": 0.8887,
112
+ "eval_rouge1": 0.4368,
113
+ "eval_rouge2": 0.2218,
114
+ "eval_rougeL": 0.3762,
115
+ "eval_rougeLsum": 0.376,
116
+ "eval_runtime": 313.1455,
117
+ "eval_samples_per_second": 8.782,
118
+ "eval_steps_per_second": 0.549,
119
+ "step": 2605
120
+ },
121
+ {
122
+ "epoch": 5.76,
123
+ "learning_rate": 1.2802303262955855e-05,
124
+ "loss": 1.222,
125
+ "step": 3000
126
+ },
127
+ {
128
+ "epoch": 6.0,
129
+ "eval_f1": 0.9018,
130
+ "eval_gen_len": 19.942545454545453,
131
+ "eval_loss": 1.458353042602539,
132
+ "eval_precision": 0.914,
133
+ "eval_recall": 0.8902,
134
+ "eval_rouge1": 0.4407,
135
+ "eval_rouge2": 0.223,
136
+ "eval_rougeL": 0.3802,
137
+ "eval_rougeLsum": 0.3798,
138
+ "eval_runtime": 312.4439,
139
+ "eval_samples_per_second": 8.802,
140
+ "eval_steps_per_second": 0.55,
141
+ "step": 3126
142
+ },
143
+ {
144
+ "epoch": 6.72,
145
+ "learning_rate": 1.1602687140115163e-05,
146
+ "loss": 1.1655,
147
+ "step": 3500
148
+ },
149
+ {
150
+ "epoch": 7.0,
151
+ "eval_f1": 0.9019,
152
+ "eval_gen_len": 19.932727272727274,
153
+ "eval_loss": 1.4708688259124756,
154
+ "eval_precision": 0.9145,
155
+ "eval_recall": 0.89,
156
+ "eval_rouge1": 0.4404,
157
+ "eval_rouge2": 0.2246,
158
+ "eval_rougeL": 0.3806,
159
+ "eval_rougeLsum": 0.3803,
160
+ "eval_runtime": 313.9664,
161
+ "eval_samples_per_second": 8.759,
162
+ "eval_steps_per_second": 0.548,
163
+ "step": 3647
164
+ },
165
+ {
166
+ "epoch": 7.68,
167
+ "learning_rate": 1.0403071017274472e-05,
168
+ "loss": 1.11,
169
+ "step": 4000
170
+ },
171
+ {
172
+ "epoch": 8.0,
173
+ "eval_f1": 0.9026,
174
+ "eval_gen_len": 19.908363636363635,
175
+ "eval_loss": 1.47238028049469,
176
+ "eval_precision": 0.9153,
177
+ "eval_recall": 0.8906,
178
+ "eval_rouge1": 0.4435,
179
+ "eval_rouge2": 0.2269,
180
+ "eval_rougeL": 0.383,
181
+ "eval_rougeLsum": 0.3828,
182
+ "eval_runtime": 312.3634,
183
+ "eval_samples_per_second": 8.804,
184
+ "eval_steps_per_second": 0.551,
185
+ "step": 4168
186
+ },
187
+ {
188
+ "epoch": 8.64,
189
+ "learning_rate": 9.203454894433782e-06,
190
+ "loss": 1.0629,
191
+ "step": 4500
192
+ },
193
+ {
194
+ "epoch": 9.0,
195
+ "eval_f1": 0.9028,
196
+ "eval_gen_len": 19.928,
197
+ "eval_loss": 1.485286831855774,
198
+ "eval_precision": 0.9155,
199
+ "eval_recall": 0.8908,
200
+ "eval_rouge1": 0.4431,
201
+ "eval_rouge2": 0.2273,
202
+ "eval_rougeL": 0.3832,
203
+ "eval_rougeLsum": 0.383,
204
+ "eval_runtime": 312.2978,
205
+ "eval_samples_per_second": 8.806,
206
+ "eval_steps_per_second": 0.551,
207
+ "step": 4689
208
+ },
209
+ {
210
+ "epoch": 9.6,
211
+ "learning_rate": 8.003838771593091e-06,
212
+ "loss": 1.023,
213
+ "step": 5000
214
+ },
215
+ {
216
+ "epoch": 10.0,
217
+ "eval_f1": 0.9021,
218
+ "eval_gen_len": 19.944,
219
+ "eval_loss": 1.503290057182312,
220
+ "eval_precision": 0.9152,
221
+ "eval_recall": 0.8897,
222
+ "eval_rouge1": 0.4409,
223
+ "eval_rouge2": 0.2247,
224
+ "eval_rougeL": 0.3819,
225
+ "eval_rougeLsum": 0.3818,
226
+ "eval_runtime": 312.2524,
227
+ "eval_samples_per_second": 8.807,
228
+ "eval_steps_per_second": 0.551,
229
+ "step": 5210
230
+ },
231
+ {
232
+ "epoch": 10.56,
233
+ "learning_rate": 6.8042226487524e-06,
234
+ "loss": 0.9862,
235
+ "step": 5500
236
+ },
237
+ {
238
+ "epoch": 11.0,
239
+ "eval_f1": 0.9034,
240
+ "eval_gen_len": 19.912363636363636,
241
+ "eval_loss": 1.5074084997177124,
242
+ "eval_precision": 0.9158,
243
+ "eval_recall": 0.8916,
244
+ "eval_rouge1": 0.4479,
245
+ "eval_rouge2": 0.2278,
246
+ "eval_rougeL": 0.3862,
247
+ "eval_rougeLsum": 0.386,
248
+ "eval_runtime": 313.5934,
249
+ "eval_samples_per_second": 8.769,
250
+ "eval_steps_per_second": 0.548,
251
+ "step": 5731
252
+ },
253
+ {
254
+ "epoch": 11.52,
255
+ "learning_rate": 5.6046065259117085e-06,
256
+ "loss": 0.957,
257
+ "step": 6000
258
+ },
259
+ {
260
+ "epoch": 12.0,
261
+ "eval_f1": 0.903,
262
+ "eval_gen_len": 19.90327272727273,
263
+ "eval_loss": 1.518417239189148,
264
+ "eval_precision": 0.9159,
265
+ "eval_recall": 0.8909,
266
+ "eval_rouge1": 0.4461,
267
+ "eval_rouge2": 0.2264,
268
+ "eval_rougeL": 0.3846,
269
+ "eval_rougeLsum": 0.3847,
270
+ "eval_runtime": 314.9612,
271
+ "eval_samples_per_second": 8.731,
272
+ "eval_steps_per_second": 0.546,
273
+ "step": 6252
274
+ },
275
+ {
276
+ "epoch": 12.48,
277
+ "learning_rate": 4.404990403071018e-06,
278
+ "loss": 0.9315,
279
+ "step": 6500
280
+ },
281
+ {
282
+ "epoch": 13.0,
283
+ "eval_f1": 0.9031,
284
+ "eval_gen_len": 19.908363636363635,
285
+ "eval_loss": 1.5269190073013306,
286
+ "eval_precision": 0.9156,
287
+ "eval_recall": 0.8912,
288
+ "eval_rouge1": 0.4473,
289
+ "eval_rouge2": 0.2284,
290
+ "eval_rougeL": 0.386,
291
+ "eval_rougeLsum": 0.3858,
292
+ "eval_runtime": 311.2352,
293
+ "eval_samples_per_second": 8.836,
294
+ "eval_steps_per_second": 0.553,
295
+ "step": 6773
296
+ },
297
+ {
298
+ "epoch": 13.44,
299
+ "learning_rate": 3.2053742802303266e-06,
300
+ "loss": 0.9093,
301
+ "step": 7000
302
+ },
303
+ {
304
+ "epoch": 14.0,
305
+ "eval_f1": 0.9029,
306
+ "eval_gen_len": 19.913454545454545,
307
+ "eval_loss": 1.5310986042022705,
308
+ "eval_precision": 0.9155,
309
+ "eval_recall": 0.8909,
310
+ "eval_rouge1": 0.4453,
311
+ "eval_rouge2": 0.2273,
312
+ "eval_rougeL": 0.3846,
313
+ "eval_rougeLsum": 0.3843,
314
+ "eval_runtime": 313.2169,
315
+ "eval_samples_per_second": 8.78,
316
+ "eval_steps_per_second": 0.549,
317
+ "step": 7294
318
+ },
319
+ {
320
+ "epoch": 14.4,
321
+ "learning_rate": 2.0057581573896352e-06,
322
+ "loss": 0.8927,
323
+ "step": 7500
324
+ },
325
+ {
326
+ "epoch": 15.0,
327
+ "eval_f1": 0.9029,
328
+ "eval_gen_len": 19.906545454545455,
329
+ "eval_loss": 1.5351076126098633,
330
+ "eval_precision": 0.9156,
331
+ "eval_recall": 0.8909,
332
+ "eval_rouge1": 0.4457,
333
+ "eval_rouge2": 0.2267,
334
+ "eval_rougeL": 0.3842,
335
+ "eval_rougeLsum": 0.384,
336
+ "eval_runtime": 314.8443,
337
+ "eval_samples_per_second": 8.734,
338
+ "eval_steps_per_second": 0.546,
339
+ "step": 7815
340
+ },
341
+ {
342
+ "epoch": 15.36,
343
+ "learning_rate": 8.061420345489445e-07,
344
+ "loss": 0.8773,
345
+ "step": 8000
346
+ },
347
+ {
348
+ "epoch": 16.0,
349
+ "eval_f1": 0.9034,
350
+ "eval_gen_len": 19.900727272727273,
351
+ "eval_loss": 1.5434002876281738,
352
+ "eval_precision": 0.9159,
353
+ "eval_recall": 0.8916,
354
+ "eval_rouge1": 0.4476,
355
+ "eval_rouge2": 0.2292,
356
+ "eval_rougeL": 0.3868,
357
+ "eval_rougeLsum": 0.3865,
358
+ "eval_runtime": 313.163,
359
+ "eval_samples_per_second": 8.781,
360
+ "eval_steps_per_second": 0.549,
361
+ "step": 8336
362
+ },
363
+ {
364
+ "epoch": 16.0,
365
+ "step": 8336,
366
+ "total_flos": 1.7130441774590853e+18,
367
+ "train_loss": 1.1567621652086957,
368
+ "train_runtime": 40538.4288,
369
+ "train_samples_per_second": 19.734,
370
+ "train_steps_per_second": 0.206
371
  }
372
  ],
373
  "logging_steps": 500,
374
+ "max_steps": 8336,
375
  "num_input_tokens_seen": 0,
376
+ "num_train_epochs": 16,
377
  "save_steps": 500,
378
+ "total_flos": 1.7130441774590853e+18,
379
+ "train_batch_size": 24,
380
  "trial_name": null,
381
  "trial_params": null
382
  }