learn3r committed on
Commit
81cba4b
1 Parent(s): 39f2060

End of training

Browse files
Files changed (5) hide show
  1. README.md +14 -2
  2. all_results.json +18 -0
  3. eval_results.json +13 -0
  4. train_results.json +8 -0
  5. trainer_state.json +960 -0
README.md CHANGED
@@ -3,11 +3,23 @@ license: apache-2.0
3
  base_model: google/long-t5-tglobal-xl
4
  tags:
5
  - generated_from_trainer
 
 
6
  metrics:
7
  - rouge
8
  model-index:
9
  - name: longt5_xl_sfd_bp_15
10
- results: []
 
 
 
 
 
 
 
 
 
 
11
  ---
12
 
13
  <!-- This model card has been generated automatically according to the information the Trainer had access to. You
@@ -15,7 +27,7 @@ should probably proofread and complete it, then remove this comment. -->
15
 
16
  # longt5_xl_sfd_bp_15
17
 
18
- This model is a fine-tuned version of [google/long-t5-tglobal-xl](https://huggingface.co/google/long-t5-tglobal-xl) on an unknown dataset.
19
  It achieves the following results on the evaluation set:
20
  - Loss: 2.5840
21
  - Rouge1: 29.7482
 
3
  base_model: google/long-t5-tglobal-xl
4
  tags:
5
  - generated_from_trainer
6
+ datasets:
7
+ - learn3r/summ_screen_fd_bp
8
  metrics:
9
  - rouge
10
  model-index:
11
  - name: longt5_xl_sfd_bp_15
12
+ results:
13
+ - task:
14
+ name: Summarization
15
+ type: summarization
16
+ dataset:
17
+ name: learn3r/summ_screen_fd_bp
18
+ type: learn3r/summ_screen_fd_bp
19
+ metrics:
20
+ - name: Rouge1
21
+ type: rouge
22
+ value: 29.7482
23
  ---
24
 
25
  <!-- This model card has been generated automatically according to the information the Trainer had access to. You
 
27
 
28
  # longt5_xl_sfd_bp_15
29
 
30
+ This model is a fine-tuned version of [google/long-t5-tglobal-xl](https://huggingface.co/google/long-t5-tglobal-xl) on the learn3r/summ_screen_fd_bp dataset.
31
  It achieves the following results on the evaluation set:
32
  - Loss: 2.5840
33
  - Rouge1: 29.7482
all_results.json ADDED
@@ -0,0 +1,18 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "epoch": 14.61,
3
+ "eval_gen_len": 503.5769230769231,
4
+ "eval_loss": 2.5840089321136475,
5
+ "eval_rouge1": 29.7482,
6
+ "eval_rouge2": 12.0072,
7
+ "eval_rougeL": 21.348,
8
+ "eval_rougeLsum": 28.5849,
9
+ "eval_runtime": 1799.7554,
10
+ "eval_samples": 338,
11
+ "eval_samples_per_second": 0.188,
12
+ "eval_steps_per_second": 0.024,
13
+ "train_loss": 0.9784720075981957,
14
+ "train_runtime": 78497.3694,
15
+ "train_samples": 3673,
16
+ "train_samples_per_second": 0.702,
17
+ "train_steps_per_second": 0.003
18
+ }
eval_results.json ADDED
@@ -0,0 +1,13 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "epoch": 14.61,
3
+ "eval_gen_len": 503.5769230769231,
4
+ "eval_loss": 2.5840089321136475,
5
+ "eval_rouge1": 29.7482,
6
+ "eval_rouge2": 12.0072,
7
+ "eval_rougeL": 21.348,
8
+ "eval_rougeLsum": 28.5849,
9
+ "eval_runtime": 1799.7554,
10
+ "eval_samples": 338,
11
+ "eval_samples_per_second": 0.188,
12
+ "eval_steps_per_second": 0.024
13
+ }
train_results.json ADDED
@@ -0,0 +1,8 @@
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "epoch": 14.61,
3
+ "train_loss": 0.9784720075981957,
4
+ "train_runtime": 78497.3694,
5
+ "train_samples": 3673,
6
+ "train_samples_per_second": 0.702,
7
+ "train_steps_per_second": 0.003
8
+ }
trainer_state.json ADDED
@@ -0,0 +1,960 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "best_metric": 25.2438,
3
+ "best_model_checkpoint": "/exports/eddie/scratch/s1970716/models/longt5_xl_sfd_bp_15/checkpoint-201",
4
+ "epoch": 14.608695652173914,
5
+ "eval_steps": 500,
6
+ "global_step": 210,
7
+ "is_hyper_param_search": false,
8
+ "is_local_process_zero": true,
9
+ "is_world_process_zero": true,
10
+ "log_history": [
11
+ {
12
+ "epoch": 0.14,
13
+ "grad_norm": 4.218067169189453,
14
+ "learning_rate": 0.001,
15
+ "loss": 2.9668,
16
+ "step": 2
17
+ },
18
+ {
19
+ "epoch": 0.28,
20
+ "grad_norm": 200.38365173339844,
21
+ "learning_rate": 0.001,
22
+ "loss": 3.3198,
23
+ "step": 4
24
+ },
25
+ {
26
+ "epoch": 0.42,
27
+ "grad_norm": 74.55081176757812,
28
+ "learning_rate": 0.001,
29
+ "loss": 2.6874,
30
+ "step": 6
31
+ },
32
+ {
33
+ "epoch": 0.56,
34
+ "grad_norm": 19.85554313659668,
35
+ "learning_rate": 0.001,
36
+ "loss": 2.3138,
37
+ "step": 8
38
+ },
39
+ {
40
+ "epoch": 0.7,
41
+ "grad_norm": 41.041751861572266,
42
+ "learning_rate": 0.001,
43
+ "loss": 2.0222,
44
+ "step": 10
45
+ },
46
+ {
47
+ "epoch": 0.83,
48
+ "grad_norm": 3.789278507232666,
49
+ "learning_rate": 0.001,
50
+ "loss": 1.8363,
51
+ "step": 12
52
+ },
53
+ {
54
+ "epoch": 0.97,
55
+ "grad_norm": 5.874878883361816,
56
+ "learning_rate": 0.001,
57
+ "loss": 2.5763,
58
+ "step": 14
59
+ },
60
+ {
61
+ "epoch": 0.97,
62
+ "eval_gen_len": 509.64792899408286,
63
+ "eval_loss": 2.541541814804077,
64
+ "eval_rouge1": 10.6052,
65
+ "eval_rouge2": 1.4494,
66
+ "eval_rougeL": 10.4593,
67
+ "eval_rougeLsum": 10.4801,
68
+ "eval_runtime": 1798.047,
69
+ "eval_samples_per_second": 0.188,
70
+ "eval_steps_per_second": 0.024,
71
+ "step": 14
72
+ },
73
+ {
74
+ "epoch": 1.11,
75
+ "grad_norm": 1.338165521621704,
76
+ "learning_rate": 0.001,
77
+ "loss": 2.4441,
78
+ "step": 16
79
+ },
80
+ {
81
+ "epoch": 1.25,
82
+ "grad_norm": 3.755629777908325,
83
+ "learning_rate": 0.001,
84
+ "loss": 2.258,
85
+ "step": 18
86
+ },
87
+ {
88
+ "epoch": 1.39,
89
+ "grad_norm": 6.490938663482666,
90
+ "learning_rate": 0.001,
91
+ "loss": 3.0147,
92
+ "step": 20
93
+ },
94
+ {
95
+ "epoch": 1.53,
96
+ "grad_norm": 5.593593597412109,
97
+ "learning_rate": 0.001,
98
+ "loss": 2.4724,
99
+ "step": 22
100
+ },
101
+ {
102
+ "epoch": 1.67,
103
+ "grad_norm": 1.0521235466003418,
104
+ "learning_rate": 0.001,
105
+ "loss": 2.023,
106
+ "step": 24
107
+ },
108
+ {
109
+ "epoch": 1.81,
110
+ "grad_norm": 12.585270881652832,
111
+ "learning_rate": 0.001,
112
+ "loss": 2.223,
113
+ "step": 26
114
+ },
115
+ {
116
+ "epoch": 1.95,
117
+ "grad_norm": 78.1630630493164,
118
+ "learning_rate": 0.001,
119
+ "loss": 1.8998,
120
+ "step": 28
121
+ },
122
+ {
123
+ "epoch": 1.95,
124
+ "eval_gen_len": 511.0,
125
+ "eval_loss": 1.739753246307373,
126
+ "eval_rouge1": 16.7989,
127
+ "eval_rouge2": 4.1457,
128
+ "eval_rougeL": 16.4049,
129
+ "eval_rougeLsum": 15.1803,
130
+ "eval_runtime": 1798.9905,
131
+ "eval_samples_per_second": 0.188,
132
+ "eval_steps_per_second": 0.024,
133
+ "step": 28
134
+ },
135
+ {
136
+ "epoch": 2.09,
137
+ "grad_norm": 0.7915446758270264,
138
+ "learning_rate": 0.001,
139
+ "loss": 1.8375,
140
+ "step": 30
141
+ },
142
+ {
143
+ "epoch": 2.23,
144
+ "grad_norm": 1.5996413230895996,
145
+ "learning_rate": 0.001,
146
+ "loss": 2.0326,
147
+ "step": 32
148
+ },
149
+ {
150
+ "epoch": 2.37,
151
+ "grad_norm": 1.0431970357894897,
152
+ "learning_rate": 0.001,
153
+ "loss": 2.1242,
154
+ "step": 34
155
+ },
156
+ {
157
+ "epoch": 2.5,
158
+ "grad_norm": 0.5979584455490112,
159
+ "learning_rate": 0.001,
160
+ "loss": 2.0047,
161
+ "step": 36
162
+ },
163
+ {
164
+ "epoch": 2.64,
165
+ "grad_norm": 0.28407618403434753,
166
+ "learning_rate": 0.001,
167
+ "loss": 1.7317,
168
+ "step": 38
169
+ },
170
+ {
171
+ "epoch": 2.78,
172
+ "grad_norm": 0.3217169940471649,
173
+ "learning_rate": 0.001,
174
+ "loss": 1.694,
175
+ "step": 40
176
+ },
177
+ {
178
+ "epoch": 2.92,
179
+ "grad_norm": 0.33735284209251404,
180
+ "learning_rate": 0.001,
181
+ "loss": 1.6403,
182
+ "step": 42
183
+ },
184
+ {
185
+ "epoch": 2.99,
186
+ "eval_gen_len": 511.0,
187
+ "eval_loss": 1.5456656217575073,
188
+ "eval_rouge1": 18.4716,
189
+ "eval_rouge2": 5.4633,
190
+ "eval_rougeL": 17.1393,
191
+ "eval_rougeLsum": 16.9242,
192
+ "eval_runtime": 1798.0277,
193
+ "eval_samples_per_second": 0.188,
194
+ "eval_steps_per_second": 0.024,
195
+ "step": 43
196
+ },
197
+ {
198
+ "epoch": 3.06,
199
+ "grad_norm": 0.20745837688446045,
200
+ "learning_rate": 0.001,
201
+ "loss": 1.5256,
202
+ "step": 44
203
+ },
204
+ {
205
+ "epoch": 3.2,
206
+ "grad_norm": 0.24278272688388824,
207
+ "learning_rate": 0.001,
208
+ "loss": 1.4077,
209
+ "step": 46
210
+ },
211
+ {
212
+ "epoch": 3.34,
213
+ "grad_norm": 3.5210845470428467,
214
+ "learning_rate": 0.001,
215
+ "loss": 1.4244,
216
+ "step": 48
217
+ },
218
+ {
219
+ "epoch": 3.48,
220
+ "grad_norm": 0.31759026646614075,
221
+ "learning_rate": 0.001,
222
+ "loss": 1.3542,
223
+ "step": 50
224
+ },
225
+ {
226
+ "epoch": 3.62,
227
+ "grad_norm": 0.2855791449546814,
228
+ "learning_rate": 0.001,
229
+ "loss": 1.3873,
230
+ "step": 52
231
+ },
232
+ {
233
+ "epoch": 3.76,
234
+ "grad_norm": 0.30171895027160645,
235
+ "learning_rate": 0.001,
236
+ "loss": 1.4693,
237
+ "step": 54
238
+ },
239
+ {
240
+ "epoch": 3.9,
241
+ "grad_norm": 0.28778406977653503,
242
+ "learning_rate": 0.001,
243
+ "loss": 1.5012,
244
+ "step": 56
245
+ },
246
+ {
247
+ "epoch": 3.97,
248
+ "eval_gen_len": 511.0,
249
+ "eval_loss": 1.5736442804336548,
250
+ "eval_rouge1": 18.2259,
251
+ "eval_rouge2": 5.3524,
252
+ "eval_rougeL": 17.0162,
253
+ "eval_rougeLsum": 16.7948,
254
+ "eval_runtime": 1799.9735,
255
+ "eval_samples_per_second": 0.188,
256
+ "eval_steps_per_second": 0.024,
257
+ "step": 57
258
+ },
259
+ {
260
+ "epoch": 4.03,
261
+ "grad_norm": 0.27410924434661865,
262
+ "learning_rate": 0.001,
263
+ "loss": 1.3865,
264
+ "step": 58
265
+ },
266
+ {
267
+ "epoch": 4.17,
268
+ "grad_norm": 0.2398337870836258,
269
+ "learning_rate": 0.001,
270
+ "loss": 1.198,
271
+ "step": 60
272
+ },
273
+ {
274
+ "epoch": 4.31,
275
+ "grad_norm": 0.24380528926849365,
276
+ "learning_rate": 0.001,
277
+ "loss": 1.1965,
278
+ "step": 62
279
+ },
280
+ {
281
+ "epoch": 4.45,
282
+ "grad_norm": 0.28130125999450684,
283
+ "learning_rate": 0.001,
284
+ "loss": 1.2576,
285
+ "step": 64
286
+ },
287
+ {
288
+ "epoch": 4.59,
289
+ "grad_norm": 0.22549273073673248,
290
+ "learning_rate": 0.001,
291
+ "loss": 1.2108,
292
+ "step": 66
293
+ },
294
+ {
295
+ "epoch": 4.73,
296
+ "grad_norm": 0.3336837589740753,
297
+ "learning_rate": 0.001,
298
+ "loss": 1.23,
299
+ "step": 68
300
+ },
301
+ {
302
+ "epoch": 4.87,
303
+ "grad_norm": 0.39294493198394775,
304
+ "learning_rate": 0.001,
305
+ "loss": 1.248,
306
+ "step": 70
307
+ },
308
+ {
309
+ "epoch": 4.94,
310
+ "eval_gen_len": 511.0,
311
+ "eval_loss": 1.5482468605041504,
312
+ "eval_rouge1": 20.8275,
313
+ "eval_rouge2": 6.7412,
314
+ "eval_rougeL": 18.0859,
315
+ "eval_rougeLsum": 19.3113,
316
+ "eval_runtime": 1798.5715,
317
+ "eval_samples_per_second": 0.188,
318
+ "eval_steps_per_second": 0.024,
319
+ "step": 71
320
+ },
321
+ {
322
+ "epoch": 5.01,
323
+ "grad_norm": 0.3731052577495575,
324
+ "learning_rate": 0.001,
325
+ "loss": 1.2523,
326
+ "step": 72
327
+ },
328
+ {
329
+ "epoch": 5.15,
330
+ "grad_norm": 0.33552882075309753,
331
+ "learning_rate": 0.001,
332
+ "loss": 1.0577,
333
+ "step": 74
334
+ },
335
+ {
336
+ "epoch": 5.29,
337
+ "grad_norm": 0.3163793087005615,
338
+ "learning_rate": 0.001,
339
+ "loss": 1.0478,
340
+ "step": 76
341
+ },
342
+ {
343
+ "epoch": 5.43,
344
+ "grad_norm": 0.21926109492778778,
345
+ "learning_rate": 0.001,
346
+ "loss": 1.0127,
347
+ "step": 78
348
+ },
349
+ {
350
+ "epoch": 5.57,
351
+ "grad_norm": 0.24710944294929504,
352
+ "learning_rate": 0.001,
353
+ "loss": 1.0042,
354
+ "step": 80
355
+ },
356
+ {
357
+ "epoch": 5.7,
358
+ "grad_norm": 0.2397957742214203,
359
+ "learning_rate": 0.001,
360
+ "loss": 1.0332,
361
+ "step": 82
362
+ },
363
+ {
364
+ "epoch": 5.84,
365
+ "grad_norm": 0.21428123116493225,
366
+ "learning_rate": 0.001,
367
+ "loss": 1.022,
368
+ "step": 84
369
+ },
370
+ {
371
+ "epoch": 5.98,
372
+ "grad_norm": 0.2227003127336502,
373
+ "learning_rate": 0.001,
374
+ "loss": 1.0176,
375
+ "step": 86
376
+ },
377
+ {
378
+ "epoch": 5.98,
379
+ "eval_gen_len": 510.6775147928994,
380
+ "eval_loss": 1.625435709953308,
381
+ "eval_rouge1": 21.1937,
382
+ "eval_rouge2": 6.8813,
383
+ "eval_rougeL": 18.411,
384
+ "eval_rougeLsum": 19.8577,
385
+ "eval_runtime": 1798.5872,
386
+ "eval_samples_per_second": 0.188,
387
+ "eval_steps_per_second": 0.024,
388
+ "step": 86
389
+ },
390
+ {
391
+ "epoch": 6.12,
392
+ "grad_norm": 0.2959192991256714,
393
+ "learning_rate": 0.001,
394
+ "loss": 0.879,
395
+ "step": 88
396
+ },
397
+ {
398
+ "epoch": 6.26,
399
+ "grad_norm": 0.33006206154823303,
400
+ "learning_rate": 0.001,
401
+ "loss": 0.8812,
402
+ "step": 90
403
+ },
404
+ {
405
+ "epoch": 6.4,
406
+ "grad_norm": 0.34284549951553345,
407
+ "learning_rate": 0.001,
408
+ "loss": 0.8742,
409
+ "step": 92
410
+ },
411
+ {
412
+ "epoch": 6.54,
413
+ "grad_norm": 0.4311819076538086,
414
+ "learning_rate": 0.001,
415
+ "loss": 0.8357,
416
+ "step": 94
417
+ },
418
+ {
419
+ "epoch": 6.68,
420
+ "grad_norm": 0.5699031352996826,
421
+ "learning_rate": 0.001,
422
+ "loss": 0.8721,
423
+ "step": 96
424
+ },
425
+ {
426
+ "epoch": 6.82,
427
+ "grad_norm": 0.39324450492858887,
428
+ "learning_rate": 0.001,
429
+ "loss": 0.8739,
430
+ "step": 98
431
+ },
432
+ {
433
+ "epoch": 6.96,
434
+ "grad_norm": 0.3442493677139282,
435
+ "learning_rate": 0.001,
436
+ "loss": 0.8472,
437
+ "step": 100
438
+ },
439
+ {
440
+ "epoch": 6.96,
441
+ "eval_gen_len": 479.9704142011834,
442
+ "eval_loss": 1.6212307214736938,
443
+ "eval_rouge1": 26.1873,
444
+ "eval_rouge2": 9.1581,
445
+ "eval_rougeL": 20.393,
446
+ "eval_rougeLsum": 24.1393,
447
+ "eval_runtime": 1802.4729,
448
+ "eval_samples_per_second": 0.188,
449
+ "eval_steps_per_second": 0.024,
450
+ "step": 100
451
+ },
452
+ {
453
+ "epoch": 7.1,
454
+ "grad_norm": 0.2600483298301697,
455
+ "learning_rate": 0.001,
456
+ "loss": 0.7568,
457
+ "step": 102
458
+ },
459
+ {
460
+ "epoch": 7.23,
461
+ "grad_norm": 0.28727108240127563,
462
+ "learning_rate": 0.001,
463
+ "loss": 0.6971,
464
+ "step": 104
465
+ },
466
+ {
467
+ "epoch": 7.37,
468
+ "grad_norm": 0.3065392076969147,
469
+ "learning_rate": 0.001,
470
+ "loss": 0.6918,
471
+ "step": 106
472
+ },
473
+ {
474
+ "epoch": 7.51,
475
+ "grad_norm": 0.427791029214859,
476
+ "learning_rate": 0.001,
477
+ "loss": 0.6902,
478
+ "step": 108
479
+ },
480
+ {
481
+ "epoch": 7.65,
482
+ "grad_norm": 0.48664093017578125,
483
+ "learning_rate": 0.001,
484
+ "loss": 0.7415,
485
+ "step": 110
486
+ },
487
+ {
488
+ "epoch": 7.79,
489
+ "grad_norm": 0.2857199013233185,
490
+ "learning_rate": 0.001,
491
+ "loss": 0.7442,
492
+ "step": 112
493
+ },
494
+ {
495
+ "epoch": 7.93,
496
+ "grad_norm": 0.24586661159992218,
497
+ "learning_rate": 0.001,
498
+ "loss": 0.7242,
499
+ "step": 114
500
+ },
501
+ {
502
+ "epoch": 8.0,
503
+ "eval_gen_len": 506.9112426035503,
504
+ "eval_loss": 1.723126769065857,
505
+ "eval_rouge1": 23.5881,
506
+ "eval_rouge2": 7.8961,
507
+ "eval_rougeL": 18.7014,
508
+ "eval_rougeLsum": 22.2999,
509
+ "eval_runtime": 1807.8192,
510
+ "eval_samples_per_second": 0.187,
511
+ "eval_steps_per_second": 0.024,
512
+ "step": 115
513
+ },
514
+ {
515
+ "epoch": 8.07,
516
+ "grad_norm": 0.21033655107021332,
517
+ "learning_rate": 0.001,
518
+ "loss": 0.6797,
519
+ "step": 116
520
+ },
521
+ {
522
+ "epoch": 8.21,
523
+ "grad_norm": 0.22591687738895416,
524
+ "learning_rate": 0.001,
525
+ "loss": 0.5446,
526
+ "step": 118
527
+ },
528
+ {
529
+ "epoch": 8.35,
530
+ "grad_norm": 0.20658165216445923,
531
+ "learning_rate": 0.001,
532
+ "loss": 0.5545,
533
+ "step": 120
534
+ },
535
+ {
536
+ "epoch": 8.49,
537
+ "grad_norm": 0.29855239391326904,
538
+ "learning_rate": 0.001,
539
+ "loss": 0.6124,
540
+ "step": 122
541
+ },
542
+ {
543
+ "epoch": 8.63,
544
+ "grad_norm": 0.3976292312145233,
545
+ "learning_rate": 0.001,
546
+ "loss": 0.6052,
547
+ "step": 124
548
+ },
549
+ {
550
+ "epoch": 8.77,
551
+ "grad_norm": 0.27770739793777466,
552
+ "learning_rate": 0.001,
553
+ "loss": 0.5755,
554
+ "step": 126
555
+ },
556
+ {
557
+ "epoch": 8.9,
558
+ "grad_norm": 0.2741471529006958,
559
+ "learning_rate": 0.001,
560
+ "loss": 0.5876,
561
+ "step": 128
562
+ },
563
+ {
564
+ "epoch": 8.97,
565
+ "eval_gen_len": 451.698224852071,
566
+ "eval_loss": 1.9400925636291504,
567
+ "eval_rouge1": 32.1851,
568
+ "eval_rouge2": 12.6426,
569
+ "eval_rougeL": 22.8358,
570
+ "eval_rougeLsum": 30.6718,
571
+ "eval_runtime": 1805.3092,
572
+ "eval_samples_per_second": 0.187,
573
+ "eval_steps_per_second": 0.024,
574
+ "step": 129
575
+ },
576
+ {
577
+ "epoch": 9.04,
578
+ "grad_norm": 0.25768765807151794,
579
+ "learning_rate": 0.001,
580
+ "loss": 0.5517,
581
+ "step": 130
582
+ },
583
+ {
584
+ "epoch": 9.18,
585
+ "grad_norm": 0.203142449259758,
586
+ "learning_rate": 0.001,
587
+ "loss": 0.4295,
588
+ "step": 132
589
+ },
590
+ {
591
+ "epoch": 9.32,
592
+ "grad_norm": 0.29351434111595154,
593
+ "learning_rate": 0.001,
594
+ "loss": 0.493,
595
+ "step": 134
596
+ },
597
+ {
598
+ "epoch": 9.46,
599
+ "grad_norm": 0.23967808485031128,
600
+ "learning_rate": 0.001,
601
+ "loss": 0.4877,
602
+ "step": 136
603
+ },
604
+ {
605
+ "epoch": 9.6,
606
+ "grad_norm": 0.21488718688488007,
607
+ "learning_rate": 0.001,
608
+ "loss": 0.4943,
609
+ "step": 138
610
+ },
611
+ {
612
+ "epoch": 9.74,
613
+ "grad_norm": 0.20587602257728577,
614
+ "learning_rate": 0.001,
615
+ "loss": 0.4729,
616
+ "step": 140
617
+ },
618
+ {
619
+ "epoch": 9.88,
620
+ "grad_norm": 0.2094978392124176,
621
+ "learning_rate": 0.001,
622
+ "loss": 0.4756,
623
+ "step": 142
624
+ },
625
+ {
626
+ "epoch": 9.95,
627
+ "eval_gen_len": 455.594674556213,
628
+ "eval_loss": 1.9001177549362183,
629
+ "eval_rouge1": 31.353,
630
+ "eval_rouge2": 12.994,
631
+ "eval_rougeL": 23.1542,
632
+ "eval_rougeLsum": 29.8375,
633
+ "eval_runtime": 1806.0454,
634
+ "eval_samples_per_second": 0.187,
635
+ "eval_steps_per_second": 0.024,
636
+ "step": 143
637
+ },
638
+ {
639
+ "epoch": 10.02,
640
+ "grad_norm": 0.2443789541721344,
641
+ "learning_rate": 0.001,
642
+ "loss": 0.4707,
643
+ "step": 144
644
+ },
645
+ {
646
+ "epoch": 10.16,
647
+ "grad_norm": 0.21666786074638367,
648
+ "learning_rate": 0.001,
649
+ "loss": 0.3612,
650
+ "step": 146
651
+ },
652
+ {
653
+ "epoch": 10.3,
654
+ "grad_norm": 0.20268017053604126,
655
+ "learning_rate": 0.001,
656
+ "loss": 0.3739,
657
+ "step": 148
658
+ },
659
+ {
660
+ "epoch": 10.43,
661
+ "grad_norm": 0.22428925335407257,
662
+ "learning_rate": 0.001,
663
+ "loss": 0.382,
664
+ "step": 150
665
+ },
666
+ {
667
+ "epoch": 10.57,
668
+ "grad_norm": 0.21844923496246338,
669
+ "learning_rate": 0.001,
670
+ "loss": 0.3623,
671
+ "step": 152
672
+ },
673
+ {
674
+ "epoch": 10.71,
675
+ "grad_norm": 0.2675388753414154,
676
+ "learning_rate": 0.001,
677
+ "loss": 0.3674,
678
+ "step": 154
679
+ },
680
+ {
681
+ "epoch": 10.85,
682
+ "grad_norm": 0.2905120849609375,
683
+ "learning_rate": 0.001,
684
+ "loss": 0.39,
685
+ "step": 156
686
+ },
687
+ {
688
+ "epoch": 10.99,
689
+ "grad_norm": 0.27420204877853394,
690
+ "learning_rate": 0.001,
691
+ "loss": 0.4042,
692
+ "step": 158
693
+ },
694
+ {
695
+ "epoch": 10.99,
696
+ "eval_gen_len": 497.53550295857985,
697
+ "eval_loss": 2.1294684410095215,
698
+ "eval_rouge1": 28.6425,
699
+ "eval_rouge2": 11.8399,
700
+ "eval_rougeL": 21.3847,
701
+ "eval_rougeLsum": 27.0508,
702
+ "eval_runtime": 1807.4153,
703
+ "eval_samples_per_second": 0.187,
704
+ "eval_steps_per_second": 0.024,
705
+ "step": 158
706
+ },
707
+ {
708
+ "epoch": 11.13,
709
+ "grad_norm": 0.26691916584968567,
710
+ "learning_rate": 0.001,
711
+ "loss": 0.3127,
712
+ "step": 160
713
+ },
714
+ {
715
+ "epoch": 11.27,
716
+ "grad_norm": 0.3042663335800171,
717
+ "learning_rate": 0.001,
718
+ "loss": 0.305,
719
+ "step": 162
720
+ },
721
+ {
722
+ "epoch": 11.41,
723
+ "grad_norm": 0.26255106925964355,
724
+ "learning_rate": 0.001,
725
+ "loss": 0.3133,
726
+ "step": 164
727
+ },
728
+ {
729
+ "epoch": 11.55,
730
+ "grad_norm": 0.23816817998886108,
731
+ "learning_rate": 0.001,
732
+ "loss": 0.3118,
733
+ "step": 166
734
+ },
735
+ {
736
+ "epoch": 11.69,
737
+ "grad_norm": 0.22553777694702148,
738
+ "learning_rate": 0.001,
739
+ "loss": 0.3073,
740
+ "step": 168
741
+ },
742
+ {
743
+ "epoch": 11.83,
744
+ "grad_norm": 0.2234884351491928,
745
+ "learning_rate": 0.001,
746
+ "loss": 0.3346,
747
+ "step": 170
748
+ },
749
+ {
750
+ "epoch": 11.97,
751
+ "grad_norm": 0.18143154680728912,
752
+ "learning_rate": 0.001,
753
+ "loss": 0.3292,
754
+ "step": 172
755
+ },
756
+ {
757
+ "epoch": 11.97,
758
+ "eval_gen_len": 478.81065088757396,
759
+ "eval_loss": 2.244086503982544,
760
+ "eval_rouge1": 31.8393,
761
+ "eval_rouge2": 13.1308,
762
+ "eval_rougeL": 22.135,
763
+ "eval_rougeLsum": 30.5866,
764
+ "eval_runtime": 1798.3958,
765
+ "eval_samples_per_second": 0.188,
766
+ "eval_steps_per_second": 0.024,
767
+ "step": 172
768
+ },
769
+ {
770
+ "epoch": 12.1,
771
+ "grad_norm": 0.24745677411556244,
772
+ "learning_rate": 0.001,
773
+ "loss": 0.2539,
774
+ "step": 174
775
+ },
776
+ {
777
+ "epoch": 12.24,
778
+ "grad_norm": 0.26513755321502686,
779
+ "learning_rate": 0.001,
780
+ "loss": 0.2588,
781
+ "step": 176
782
+ },
783
+ {
784
+ "epoch": 12.38,
785
+ "grad_norm": 0.20156317949295044,
786
+ "learning_rate": 0.001,
787
+ "loss": 0.2537,
788
+ "step": 178
789
+ },
790
+ {
791
+ "epoch": 12.52,
792
+ "grad_norm": 0.21362556517124176,
793
+ "learning_rate": 0.001,
794
+ "loss": 0.2812,
795
+ "step": 180
796
+ },
797
+ {
798
+ "epoch": 12.66,
799
+ "grad_norm": 0.5383086800575256,
800
+ "learning_rate": 0.001,
801
+ "loss": 0.2594,
802
+ "step": 182
803
+ },
804
+ {
805
+ "epoch": 12.8,
806
+ "grad_norm": 0.2891131639480591,
807
+ "learning_rate": 0.001,
808
+ "loss": 0.2629,
809
+ "step": 184
810
+ },
811
+ {
812
+ "epoch": 12.94,
813
+ "grad_norm": 0.265836238861084,
814
+ "learning_rate": 0.001,
815
+ "loss": 0.2812,
816
+ "step": 186
817
+ },
818
+ {
819
+ "epoch": 12.94,
820
+ "eval_gen_len": 429.99112426035504,
821
+ "eval_loss": 2.3464245796203613,
822
+ "eval_rouge1": 34.4102,
823
+ "eval_rouge2": 14.3607,
824
+ "eval_rougeL": 23.8634,
825
+ "eval_rougeLsum": 32.9732,
826
+ "eval_runtime": 1798.2194,
827
+ "eval_samples_per_second": 0.188,
828
+ "eval_steps_per_second": 0.024,
829
+ "step": 186
830
+ },
831
+ {
832
+ "epoch": 13.08,
833
+ "grad_norm": 0.2541401982307434,
834
+ "learning_rate": 0.001,
835
+ "loss": 0.2283,
836
+ "step": 188
837
+ },
838
+ {
839
+ "epoch": 13.22,
840
+ "grad_norm": 9.848714828491211,
841
+ "learning_rate": 0.001,
842
+ "loss": 0.206,
843
+ "step": 190
844
+ },
845
+ {
846
+ "epoch": 13.36,
847
+ "grad_norm": 0.4088878333568573,
848
+ "learning_rate": 0.001,
849
+ "loss": 0.2014,
850
+ "step": 192
851
+ },
852
+ {
853
+ "epoch": 13.5,
854
+ "grad_norm": 0.4533099830150604,
855
+ "learning_rate": 0.001,
856
+ "loss": 0.2292,
857
+ "step": 194
858
+ },
859
+ {
860
+ "epoch": 13.63,
861
+ "grad_norm": 0.28066885471343994,
862
+ "learning_rate": 0.001,
863
+ "loss": 0.2202,
864
+ "step": 196
865
+ },
866
+ {
867
+ "epoch": 13.77,
868
+ "grad_norm": 0.38810494542121887,
869
+ "learning_rate": 0.001,
870
+ "loss": 0.2278,
871
+ "step": 198
872
+ },
873
+ {
874
+ "epoch": 13.91,
875
+ "grad_norm": 0.2568497657775879,
876
+ "learning_rate": 0.001,
877
+ "loss": 0.2443,
878
+ "step": 200
879
+ },
880
+ {
881
+ "epoch": 13.98,
882
+ "eval_gen_len": 392.53846153846155,
883
+ "eval_loss": 2.2002713680267334,
884
+ "eval_rouge1": 34.8239,
885
+ "eval_rouge2": 14.8042,
886
+ "eval_rougeL": 25.2438,
887
+ "eval_rougeLsum": 33.0469,
888
+ "eval_runtime": 1797.5392,
889
+ "eval_samples_per_second": 0.188,
890
+ "eval_steps_per_second": 0.024,
891
+ "step": 201
892
+ },
893
+ {
894
+ "epoch": 14.05,
895
+ "grad_norm": 0.279291570186615,
896
+ "learning_rate": 0.001,
897
+ "loss": 0.2362,
898
+ "step": 202
899
+ },
900
+ {
901
+ "epoch": 14.19,
902
+ "grad_norm": 0.18151430785655975,
903
+ "learning_rate": 0.001,
904
+ "loss": 0.1807,
905
+ "step": 204
906
+ },
907
+ {
908
+ "epoch": 14.33,
909
+ "grad_norm": 0.2227843850851059,
910
+ "learning_rate": 0.001,
911
+ "loss": 0.1708,
912
+ "step": 206
913
+ },
914
+ {
915
+ "epoch": 14.47,
916
+ "grad_norm": 0.2937067151069641,
917
+ "learning_rate": 0.001,
918
+ "loss": 0.1818,
919
+ "step": 208
920
+ },
921
+ {
922
+ "epoch": 14.61,
923
+ "grad_norm": 0.3238927125930786,
924
+ "learning_rate": 0.001,
925
+ "loss": 0.1958,
926
+ "step": 210
927
+ },
928
+ {
929
+ "epoch": 14.61,
930
+ "eval_gen_len": 503.5769230769231,
931
+ "eval_loss": 2.5840089321136475,
932
+ "eval_rouge1": 29.7482,
933
+ "eval_rouge2": 12.0072,
934
+ "eval_rougeL": 21.348,
935
+ "eval_rougeLsum": 28.5849,
936
+ "eval_runtime": 1799.2535,
937
+ "eval_samples_per_second": 0.188,
938
+ "eval_steps_per_second": 0.024,
939
+ "step": 210
940
+ },
941
+ {
942
+ "epoch": 14.61,
943
+ "step": 210,
944
+ "total_flos": 3.6715210940733604e+18,
945
+ "train_loss": 0.9784720075981957,
946
+ "train_runtime": 78497.3694,
947
+ "train_samples_per_second": 0.702,
948
+ "train_steps_per_second": 0.003
949
+ }
950
+ ],
951
+ "logging_steps": 2,
952
+ "max_steps": 210,
953
+ "num_input_tokens_seen": 0,
954
+ "num_train_epochs": 15,
955
+ "save_steps": 500,
956
+ "total_flos": 3.6715210940733604e+18,
957
+ "train_batch_size": 8,
958
+ "trial_name": null,
959
+ "trial_params": null
960
+ }