longt5_xl_sfd_bp_15 / trainer_state.json
learn3r's picture
End of training
81cba4b verified
{
"best_metric": 25.2438,
"best_model_checkpoint": "/exports/eddie/scratch/s1970716/models/longt5_xl_sfd_bp_15/checkpoint-201",
"epoch": 14.608695652173914,
"eval_steps": 500,
"global_step": 210,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.14,
"grad_norm": 4.218067169189453,
"learning_rate": 0.001,
"loss": 2.9668,
"step": 2
},
{
"epoch": 0.28,
"grad_norm": 200.38365173339844,
"learning_rate": 0.001,
"loss": 3.3198,
"step": 4
},
{
"epoch": 0.42,
"grad_norm": 74.55081176757812,
"learning_rate": 0.001,
"loss": 2.6874,
"step": 6
},
{
"epoch": 0.56,
"grad_norm": 19.85554313659668,
"learning_rate": 0.001,
"loss": 2.3138,
"step": 8
},
{
"epoch": 0.7,
"grad_norm": 41.041751861572266,
"learning_rate": 0.001,
"loss": 2.0222,
"step": 10
},
{
"epoch": 0.83,
"grad_norm": 3.789278507232666,
"learning_rate": 0.001,
"loss": 1.8363,
"step": 12
},
{
"epoch": 0.97,
"grad_norm": 5.874878883361816,
"learning_rate": 0.001,
"loss": 2.5763,
"step": 14
},
{
"epoch": 0.97,
"eval_gen_len": 509.64792899408286,
"eval_loss": 2.541541814804077,
"eval_rouge1": 10.6052,
"eval_rouge2": 1.4494,
"eval_rougeL": 10.4593,
"eval_rougeLsum": 10.4801,
"eval_runtime": 1798.047,
"eval_samples_per_second": 0.188,
"eval_steps_per_second": 0.024,
"step": 14
},
{
"epoch": 1.11,
"grad_norm": 1.338165521621704,
"learning_rate": 0.001,
"loss": 2.4441,
"step": 16
},
{
"epoch": 1.25,
"grad_norm": 3.755629777908325,
"learning_rate": 0.001,
"loss": 2.258,
"step": 18
},
{
"epoch": 1.39,
"grad_norm": 6.490938663482666,
"learning_rate": 0.001,
"loss": 3.0147,
"step": 20
},
{
"epoch": 1.53,
"grad_norm": 5.593593597412109,
"learning_rate": 0.001,
"loss": 2.4724,
"step": 22
},
{
"epoch": 1.67,
"grad_norm": 1.0521235466003418,
"learning_rate": 0.001,
"loss": 2.023,
"step": 24
},
{
"epoch": 1.81,
"grad_norm": 12.585270881652832,
"learning_rate": 0.001,
"loss": 2.223,
"step": 26
},
{
"epoch": 1.95,
"grad_norm": 78.1630630493164,
"learning_rate": 0.001,
"loss": 1.8998,
"step": 28
},
{
"epoch": 1.95,
"eval_gen_len": 511.0,
"eval_loss": 1.739753246307373,
"eval_rouge1": 16.7989,
"eval_rouge2": 4.1457,
"eval_rougeL": 16.4049,
"eval_rougeLsum": 15.1803,
"eval_runtime": 1798.9905,
"eval_samples_per_second": 0.188,
"eval_steps_per_second": 0.024,
"step": 28
},
{
"epoch": 2.09,
"grad_norm": 0.7915446758270264,
"learning_rate": 0.001,
"loss": 1.8375,
"step": 30
},
{
"epoch": 2.23,
"grad_norm": 1.5996413230895996,
"learning_rate": 0.001,
"loss": 2.0326,
"step": 32
},
{
"epoch": 2.37,
"grad_norm": 1.0431970357894897,
"learning_rate": 0.001,
"loss": 2.1242,
"step": 34
},
{
"epoch": 2.5,
"grad_norm": 0.5979584455490112,
"learning_rate": 0.001,
"loss": 2.0047,
"step": 36
},
{
"epoch": 2.64,
"grad_norm": 0.28407618403434753,
"learning_rate": 0.001,
"loss": 1.7317,
"step": 38
},
{
"epoch": 2.78,
"grad_norm": 0.3217169940471649,
"learning_rate": 0.001,
"loss": 1.694,
"step": 40
},
{
"epoch": 2.92,
"grad_norm": 0.33735284209251404,
"learning_rate": 0.001,
"loss": 1.6403,
"step": 42
},
{
"epoch": 2.99,
"eval_gen_len": 511.0,
"eval_loss": 1.5456656217575073,
"eval_rouge1": 18.4716,
"eval_rouge2": 5.4633,
"eval_rougeL": 17.1393,
"eval_rougeLsum": 16.9242,
"eval_runtime": 1798.0277,
"eval_samples_per_second": 0.188,
"eval_steps_per_second": 0.024,
"step": 43
},
{
"epoch": 3.06,
"grad_norm": 0.20745837688446045,
"learning_rate": 0.001,
"loss": 1.5256,
"step": 44
},
{
"epoch": 3.2,
"grad_norm": 0.24278272688388824,
"learning_rate": 0.001,
"loss": 1.4077,
"step": 46
},
{
"epoch": 3.34,
"grad_norm": 3.5210845470428467,
"learning_rate": 0.001,
"loss": 1.4244,
"step": 48
},
{
"epoch": 3.48,
"grad_norm": 0.31759026646614075,
"learning_rate": 0.001,
"loss": 1.3542,
"step": 50
},
{
"epoch": 3.62,
"grad_norm": 0.2855791449546814,
"learning_rate": 0.001,
"loss": 1.3873,
"step": 52
},
{
"epoch": 3.76,
"grad_norm": 0.30171895027160645,
"learning_rate": 0.001,
"loss": 1.4693,
"step": 54
},
{
"epoch": 3.9,
"grad_norm": 0.28778406977653503,
"learning_rate": 0.001,
"loss": 1.5012,
"step": 56
},
{
"epoch": 3.97,
"eval_gen_len": 511.0,
"eval_loss": 1.5736442804336548,
"eval_rouge1": 18.2259,
"eval_rouge2": 5.3524,
"eval_rougeL": 17.0162,
"eval_rougeLsum": 16.7948,
"eval_runtime": 1799.9735,
"eval_samples_per_second": 0.188,
"eval_steps_per_second": 0.024,
"step": 57
},
{
"epoch": 4.03,
"grad_norm": 0.27410924434661865,
"learning_rate": 0.001,
"loss": 1.3865,
"step": 58
},
{
"epoch": 4.17,
"grad_norm": 0.2398337870836258,
"learning_rate": 0.001,
"loss": 1.198,
"step": 60
},
{
"epoch": 4.31,
"grad_norm": 0.24380528926849365,
"learning_rate": 0.001,
"loss": 1.1965,
"step": 62
},
{
"epoch": 4.45,
"grad_norm": 0.28130125999450684,
"learning_rate": 0.001,
"loss": 1.2576,
"step": 64
},
{
"epoch": 4.59,
"grad_norm": 0.22549273073673248,
"learning_rate": 0.001,
"loss": 1.2108,
"step": 66
},
{
"epoch": 4.73,
"grad_norm": 0.3336837589740753,
"learning_rate": 0.001,
"loss": 1.23,
"step": 68
},
{
"epoch": 4.87,
"grad_norm": 0.39294493198394775,
"learning_rate": 0.001,
"loss": 1.248,
"step": 70
},
{
"epoch": 4.94,
"eval_gen_len": 511.0,
"eval_loss": 1.5482468605041504,
"eval_rouge1": 20.8275,
"eval_rouge2": 6.7412,
"eval_rougeL": 18.0859,
"eval_rougeLsum": 19.3113,
"eval_runtime": 1798.5715,
"eval_samples_per_second": 0.188,
"eval_steps_per_second": 0.024,
"step": 71
},
{
"epoch": 5.01,
"grad_norm": 0.3731052577495575,
"learning_rate": 0.001,
"loss": 1.2523,
"step": 72
},
{
"epoch": 5.15,
"grad_norm": 0.33552882075309753,
"learning_rate": 0.001,
"loss": 1.0577,
"step": 74
},
{
"epoch": 5.29,
"grad_norm": 0.3163793087005615,
"learning_rate": 0.001,
"loss": 1.0478,
"step": 76
},
{
"epoch": 5.43,
"grad_norm": 0.21926109492778778,
"learning_rate": 0.001,
"loss": 1.0127,
"step": 78
},
{
"epoch": 5.57,
"grad_norm": 0.24710944294929504,
"learning_rate": 0.001,
"loss": 1.0042,
"step": 80
},
{
"epoch": 5.7,
"grad_norm": 0.2397957742214203,
"learning_rate": 0.001,
"loss": 1.0332,
"step": 82
},
{
"epoch": 5.84,
"grad_norm": 0.21428123116493225,
"learning_rate": 0.001,
"loss": 1.022,
"step": 84
},
{
"epoch": 5.98,
"grad_norm": 0.2227003127336502,
"learning_rate": 0.001,
"loss": 1.0176,
"step": 86
},
{
"epoch": 5.98,
"eval_gen_len": 510.6775147928994,
"eval_loss": 1.625435709953308,
"eval_rouge1": 21.1937,
"eval_rouge2": 6.8813,
"eval_rougeL": 18.411,
"eval_rougeLsum": 19.8577,
"eval_runtime": 1798.5872,
"eval_samples_per_second": 0.188,
"eval_steps_per_second": 0.024,
"step": 86
},
{
"epoch": 6.12,
"grad_norm": 0.2959192991256714,
"learning_rate": 0.001,
"loss": 0.879,
"step": 88
},
{
"epoch": 6.26,
"grad_norm": 0.33006206154823303,
"learning_rate": 0.001,
"loss": 0.8812,
"step": 90
},
{
"epoch": 6.4,
"grad_norm": 0.34284549951553345,
"learning_rate": 0.001,
"loss": 0.8742,
"step": 92
},
{
"epoch": 6.54,
"grad_norm": 0.4311819076538086,
"learning_rate": 0.001,
"loss": 0.8357,
"step": 94
},
{
"epoch": 6.68,
"grad_norm": 0.5699031352996826,
"learning_rate": 0.001,
"loss": 0.8721,
"step": 96
},
{
"epoch": 6.82,
"grad_norm": 0.39324450492858887,
"learning_rate": 0.001,
"loss": 0.8739,
"step": 98
},
{
"epoch": 6.96,
"grad_norm": 0.3442493677139282,
"learning_rate": 0.001,
"loss": 0.8472,
"step": 100
},
{
"epoch": 6.96,
"eval_gen_len": 479.9704142011834,
"eval_loss": 1.6212307214736938,
"eval_rouge1": 26.1873,
"eval_rouge2": 9.1581,
"eval_rougeL": 20.393,
"eval_rougeLsum": 24.1393,
"eval_runtime": 1802.4729,
"eval_samples_per_second": 0.188,
"eval_steps_per_second": 0.024,
"step": 100
},
{
"epoch": 7.1,
"grad_norm": 0.2600483298301697,
"learning_rate": 0.001,
"loss": 0.7568,
"step": 102
},
{
"epoch": 7.23,
"grad_norm": 0.28727108240127563,
"learning_rate": 0.001,
"loss": 0.6971,
"step": 104
},
{
"epoch": 7.37,
"grad_norm": 0.3065392076969147,
"learning_rate": 0.001,
"loss": 0.6918,
"step": 106
},
{
"epoch": 7.51,
"grad_norm": 0.427791029214859,
"learning_rate": 0.001,
"loss": 0.6902,
"step": 108
},
{
"epoch": 7.65,
"grad_norm": 0.48664093017578125,
"learning_rate": 0.001,
"loss": 0.7415,
"step": 110
},
{
"epoch": 7.79,
"grad_norm": 0.2857199013233185,
"learning_rate": 0.001,
"loss": 0.7442,
"step": 112
},
{
"epoch": 7.93,
"grad_norm": 0.24586661159992218,
"learning_rate": 0.001,
"loss": 0.7242,
"step": 114
},
{
"epoch": 8.0,
"eval_gen_len": 506.9112426035503,
"eval_loss": 1.723126769065857,
"eval_rouge1": 23.5881,
"eval_rouge2": 7.8961,
"eval_rougeL": 18.7014,
"eval_rougeLsum": 22.2999,
"eval_runtime": 1807.8192,
"eval_samples_per_second": 0.187,
"eval_steps_per_second": 0.024,
"step": 115
},
{
"epoch": 8.07,
"grad_norm": 0.21033655107021332,
"learning_rate": 0.001,
"loss": 0.6797,
"step": 116
},
{
"epoch": 8.21,
"grad_norm": 0.22591687738895416,
"learning_rate": 0.001,
"loss": 0.5446,
"step": 118
},
{
"epoch": 8.35,
"grad_norm": 0.20658165216445923,
"learning_rate": 0.001,
"loss": 0.5545,
"step": 120
},
{
"epoch": 8.49,
"grad_norm": 0.29855239391326904,
"learning_rate": 0.001,
"loss": 0.6124,
"step": 122
},
{
"epoch": 8.63,
"grad_norm": 0.3976292312145233,
"learning_rate": 0.001,
"loss": 0.6052,
"step": 124
},
{
"epoch": 8.77,
"grad_norm": 0.27770739793777466,
"learning_rate": 0.001,
"loss": 0.5755,
"step": 126
},
{
"epoch": 8.9,
"grad_norm": 0.2741471529006958,
"learning_rate": 0.001,
"loss": 0.5876,
"step": 128
},
{
"epoch": 8.97,
"eval_gen_len": 451.698224852071,
"eval_loss": 1.9400925636291504,
"eval_rouge1": 32.1851,
"eval_rouge2": 12.6426,
"eval_rougeL": 22.8358,
"eval_rougeLsum": 30.6718,
"eval_runtime": 1805.3092,
"eval_samples_per_second": 0.187,
"eval_steps_per_second": 0.024,
"step": 129
},
{
"epoch": 9.04,
"grad_norm": 0.25768765807151794,
"learning_rate": 0.001,
"loss": 0.5517,
"step": 130
},
{
"epoch": 9.18,
"grad_norm": 0.203142449259758,
"learning_rate": 0.001,
"loss": 0.4295,
"step": 132
},
{
"epoch": 9.32,
"grad_norm": 0.29351434111595154,
"learning_rate": 0.001,
"loss": 0.493,
"step": 134
},
{
"epoch": 9.46,
"grad_norm": 0.23967808485031128,
"learning_rate": 0.001,
"loss": 0.4877,
"step": 136
},
{
"epoch": 9.6,
"grad_norm": 0.21488718688488007,
"learning_rate": 0.001,
"loss": 0.4943,
"step": 138
},
{
"epoch": 9.74,
"grad_norm": 0.20587602257728577,
"learning_rate": 0.001,
"loss": 0.4729,
"step": 140
},
{
"epoch": 9.88,
"grad_norm": 0.2094978392124176,
"learning_rate": 0.001,
"loss": 0.4756,
"step": 142
},
{
"epoch": 9.95,
"eval_gen_len": 455.594674556213,
"eval_loss": 1.9001177549362183,
"eval_rouge1": 31.353,
"eval_rouge2": 12.994,
"eval_rougeL": 23.1542,
"eval_rougeLsum": 29.8375,
"eval_runtime": 1806.0454,
"eval_samples_per_second": 0.187,
"eval_steps_per_second": 0.024,
"step": 143
},
{
"epoch": 10.02,
"grad_norm": 0.2443789541721344,
"learning_rate": 0.001,
"loss": 0.4707,
"step": 144
},
{
"epoch": 10.16,
"grad_norm": 0.21666786074638367,
"learning_rate": 0.001,
"loss": 0.3612,
"step": 146
},
{
"epoch": 10.3,
"grad_norm": 0.20268017053604126,
"learning_rate": 0.001,
"loss": 0.3739,
"step": 148
},
{
"epoch": 10.43,
"grad_norm": 0.22428925335407257,
"learning_rate": 0.001,
"loss": 0.382,
"step": 150
},
{
"epoch": 10.57,
"grad_norm": 0.21844923496246338,
"learning_rate": 0.001,
"loss": 0.3623,
"step": 152
},
{
"epoch": 10.71,
"grad_norm": 0.2675388753414154,
"learning_rate": 0.001,
"loss": 0.3674,
"step": 154
},
{
"epoch": 10.85,
"grad_norm": 0.2905120849609375,
"learning_rate": 0.001,
"loss": 0.39,
"step": 156
},
{
"epoch": 10.99,
"grad_norm": 0.27420204877853394,
"learning_rate": 0.001,
"loss": 0.4042,
"step": 158
},
{
"epoch": 10.99,
"eval_gen_len": 497.53550295857985,
"eval_loss": 2.1294684410095215,
"eval_rouge1": 28.6425,
"eval_rouge2": 11.8399,
"eval_rougeL": 21.3847,
"eval_rougeLsum": 27.0508,
"eval_runtime": 1807.4153,
"eval_samples_per_second": 0.187,
"eval_steps_per_second": 0.024,
"step": 158
},
{
"epoch": 11.13,
"grad_norm": 0.26691916584968567,
"learning_rate": 0.001,
"loss": 0.3127,
"step": 160
},
{
"epoch": 11.27,
"grad_norm": 0.3042663335800171,
"learning_rate": 0.001,
"loss": 0.305,
"step": 162
},
{
"epoch": 11.41,
"grad_norm": 0.26255106925964355,
"learning_rate": 0.001,
"loss": 0.3133,
"step": 164
},
{
"epoch": 11.55,
"grad_norm": 0.23816817998886108,
"learning_rate": 0.001,
"loss": 0.3118,
"step": 166
},
{
"epoch": 11.69,
"grad_norm": 0.22553777694702148,
"learning_rate": 0.001,
"loss": 0.3073,
"step": 168
},
{
"epoch": 11.83,
"grad_norm": 0.2234884351491928,
"learning_rate": 0.001,
"loss": 0.3346,
"step": 170
},
{
"epoch": 11.97,
"grad_norm": 0.18143154680728912,
"learning_rate": 0.001,
"loss": 0.3292,
"step": 172
},
{
"epoch": 11.97,
"eval_gen_len": 478.81065088757396,
"eval_loss": 2.244086503982544,
"eval_rouge1": 31.8393,
"eval_rouge2": 13.1308,
"eval_rougeL": 22.135,
"eval_rougeLsum": 30.5866,
"eval_runtime": 1798.3958,
"eval_samples_per_second": 0.188,
"eval_steps_per_second": 0.024,
"step": 172
},
{
"epoch": 12.1,
"grad_norm": 0.24745677411556244,
"learning_rate": 0.001,
"loss": 0.2539,
"step": 174
},
{
"epoch": 12.24,
"grad_norm": 0.26513755321502686,
"learning_rate": 0.001,
"loss": 0.2588,
"step": 176
},
{
"epoch": 12.38,
"grad_norm": 0.20156317949295044,
"learning_rate": 0.001,
"loss": 0.2537,
"step": 178
},
{
"epoch": 12.52,
"grad_norm": 0.21362556517124176,
"learning_rate": 0.001,
"loss": 0.2812,
"step": 180
},
{
"epoch": 12.66,
"grad_norm": 0.5383086800575256,
"learning_rate": 0.001,
"loss": 0.2594,
"step": 182
},
{
"epoch": 12.8,
"grad_norm": 0.2891131639480591,
"learning_rate": 0.001,
"loss": 0.2629,
"step": 184
},
{
"epoch": 12.94,
"grad_norm": 0.265836238861084,
"learning_rate": 0.001,
"loss": 0.2812,
"step": 186
},
{
"epoch": 12.94,
"eval_gen_len": 429.99112426035504,
"eval_loss": 2.3464245796203613,
"eval_rouge1": 34.4102,
"eval_rouge2": 14.3607,
"eval_rougeL": 23.8634,
"eval_rougeLsum": 32.9732,
"eval_runtime": 1798.2194,
"eval_samples_per_second": 0.188,
"eval_steps_per_second": 0.024,
"step": 186
},
{
"epoch": 13.08,
"grad_norm": 0.2541401982307434,
"learning_rate": 0.001,
"loss": 0.2283,
"step": 188
},
{
"epoch": 13.22,
"grad_norm": 9.848714828491211,
"learning_rate": 0.001,
"loss": 0.206,
"step": 190
},
{
"epoch": 13.36,
"grad_norm": 0.4088878333568573,
"learning_rate": 0.001,
"loss": 0.2014,
"step": 192
},
{
"epoch": 13.5,
"grad_norm": 0.4533099830150604,
"learning_rate": 0.001,
"loss": 0.2292,
"step": 194
},
{
"epoch": 13.63,
"grad_norm": 0.28066885471343994,
"learning_rate": 0.001,
"loss": 0.2202,
"step": 196
},
{
"epoch": 13.77,
"grad_norm": 0.38810494542121887,
"learning_rate": 0.001,
"loss": 0.2278,
"step": 198
},
{
"epoch": 13.91,
"grad_norm": 0.2568497657775879,
"learning_rate": 0.001,
"loss": 0.2443,
"step": 200
},
{
"epoch": 13.98,
"eval_gen_len": 392.53846153846155,
"eval_loss": 2.2002713680267334,
"eval_rouge1": 34.8239,
"eval_rouge2": 14.8042,
"eval_rougeL": 25.2438,
"eval_rougeLsum": 33.0469,
"eval_runtime": 1797.5392,
"eval_samples_per_second": 0.188,
"eval_steps_per_second": 0.024,
"step": 201
},
{
"epoch": 14.05,
"grad_norm": 0.279291570186615,
"learning_rate": 0.001,
"loss": 0.2362,
"step": 202
},
{
"epoch": 14.19,
"grad_norm": 0.18151430785655975,
"learning_rate": 0.001,
"loss": 0.1807,
"step": 204
},
{
"epoch": 14.33,
"grad_norm": 0.2227843850851059,
"learning_rate": 0.001,
"loss": 0.1708,
"step": 206
},
{
"epoch": 14.47,
"grad_norm": 0.2937067151069641,
"learning_rate": 0.001,
"loss": 0.1818,
"step": 208
},
{
"epoch": 14.61,
"grad_norm": 0.3238927125930786,
"learning_rate": 0.001,
"loss": 0.1958,
"step": 210
},
{
"epoch": 14.61,
"eval_gen_len": 503.5769230769231,
"eval_loss": 2.5840089321136475,
"eval_rouge1": 29.7482,
"eval_rouge2": 12.0072,
"eval_rougeL": 21.348,
"eval_rougeLsum": 28.5849,
"eval_runtime": 1799.2535,
"eval_samples_per_second": 0.188,
"eval_steps_per_second": 0.024,
"step": 210
},
{
"epoch": 14.61,
"step": 210,
"total_flos": 3.6715210940733604e+18,
"train_loss": 0.9784720075981957,
"train_runtime": 78497.3694,
"train_samples_per_second": 0.702,
"train_steps_per_second": 0.003
}
],
"logging_steps": 2,
"max_steps": 210,
"num_input_tokens_seen": 0,
"num_train_epochs": 15,
"save_steps": 500,
"total_flos": 3.6715210940733604e+18,
"train_batch_size": 8,
"trial_name": null,
"trial_params": null
}