|
{ |
|
"best_metric": 25.2438, |
|
"best_model_checkpoint": "/exports/eddie/scratch/s1970716/models/longt5_xl_sfd_bp_15/checkpoint-201", |
|
"epoch": 14.608695652173914, |
|
"eval_steps": 500, |
|
"global_step": 210, |
|
"is_hyper_param_search": false, |
|
"is_local_process_zero": true, |
|
"is_world_process_zero": true, |
|
"log_history": [ |
|
{ |
|
"epoch": 0.14, |
|
"grad_norm": 4.218067169189453, |
|
"learning_rate": 0.001, |
|
"loss": 2.9668, |
|
"step": 2 |
|
}, |
|
{ |
|
"epoch": 0.28, |
|
"grad_norm": 200.38365173339844, |
|
"learning_rate": 0.001, |
|
"loss": 3.3198, |
|
"step": 4 |
|
}, |
|
{ |
|
"epoch": 0.42, |
|
"grad_norm": 74.55081176757812, |
|
"learning_rate": 0.001, |
|
"loss": 2.6874, |
|
"step": 6 |
|
}, |
|
{ |
|
"epoch": 0.56, |
|
"grad_norm": 19.85554313659668, |
|
"learning_rate": 0.001, |
|
"loss": 2.3138, |
|
"step": 8 |
|
}, |
|
{ |
|
"epoch": 0.7, |
|
"grad_norm": 41.041751861572266, |
|
"learning_rate": 0.001, |
|
"loss": 2.0222, |
|
"step": 10 |
|
}, |
|
{ |
|
"epoch": 0.83, |
|
"grad_norm": 3.789278507232666, |
|
"learning_rate": 0.001, |
|
"loss": 1.8363, |
|
"step": 12 |
|
}, |
|
{ |
|
"epoch": 0.97, |
|
"grad_norm": 5.874878883361816, |
|
"learning_rate": 0.001, |
|
"loss": 2.5763, |
|
"step": 14 |
|
}, |
|
{ |
|
"epoch": 0.97, |
|
"eval_gen_len": 509.64792899408286, |
|
"eval_loss": 2.541541814804077, |
|
"eval_rouge1": 10.6052, |
|
"eval_rouge2": 1.4494, |
|
"eval_rougeL": 10.4593, |
|
"eval_rougeLsum": 10.4801, |
|
"eval_runtime": 1798.047, |
|
"eval_samples_per_second": 0.188, |
|
"eval_steps_per_second": 0.024, |
|
"step": 14 |
|
}, |
|
{ |
|
"epoch": 1.11, |
|
"grad_norm": 1.338165521621704, |
|
"learning_rate": 0.001, |
|
"loss": 2.4441, |
|
"step": 16 |
|
}, |
|
{ |
|
"epoch": 1.25, |
|
"grad_norm": 3.755629777908325, |
|
"learning_rate": 0.001, |
|
"loss": 2.258, |
|
"step": 18 |
|
}, |
|
{ |
|
"epoch": 1.39, |
|
"grad_norm": 6.490938663482666, |
|
"learning_rate": 0.001, |
|
"loss": 3.0147, |
|
"step": 20 |
|
}, |
|
{ |
|
"epoch": 1.53, |
|
"grad_norm": 5.593593597412109, |
|
"learning_rate": 0.001, |
|
"loss": 2.4724, |
|
"step": 22 |
|
}, |
|
{ |
|
"epoch": 1.67, |
|
"grad_norm": 1.0521235466003418, |
|
"learning_rate": 0.001, |
|
"loss": 2.023, |
|
"step": 24 |
|
}, |
|
{ |
|
"epoch": 1.81, |
|
"grad_norm": 12.585270881652832, |
|
"learning_rate": 0.001, |
|
"loss": 2.223, |
|
"step": 26 |
|
}, |
|
{ |
|
"epoch": 1.95, |
|
"grad_norm": 78.1630630493164, |
|
"learning_rate": 0.001, |
|
"loss": 1.8998, |
|
"step": 28 |
|
}, |
|
{ |
|
"epoch": 1.95, |
|
"eval_gen_len": 511.0, |
|
"eval_loss": 1.739753246307373, |
|
"eval_rouge1": 16.7989, |
|
"eval_rouge2": 4.1457, |
|
"eval_rougeL": 16.4049, |
|
"eval_rougeLsum": 15.1803, |
|
"eval_runtime": 1798.9905, |
|
"eval_samples_per_second": 0.188, |
|
"eval_steps_per_second": 0.024, |
|
"step": 28 |
|
}, |
|
{ |
|
"epoch": 2.09, |
|
"grad_norm": 0.7915446758270264, |
|
"learning_rate": 0.001, |
|
"loss": 1.8375, |
|
"step": 30 |
|
}, |
|
{ |
|
"epoch": 2.23, |
|
"grad_norm": 1.5996413230895996, |
|
"learning_rate": 0.001, |
|
"loss": 2.0326, |
|
"step": 32 |
|
}, |
|
{ |
|
"epoch": 2.37, |
|
"grad_norm": 1.0431970357894897, |
|
"learning_rate": 0.001, |
|
"loss": 2.1242, |
|
"step": 34 |
|
}, |
|
{ |
|
"epoch": 2.5, |
|
"grad_norm": 0.5979584455490112, |
|
"learning_rate": 0.001, |
|
"loss": 2.0047, |
|
"step": 36 |
|
}, |
|
{ |
|
"epoch": 2.64, |
|
"grad_norm": 0.28407618403434753, |
|
"learning_rate": 0.001, |
|
"loss": 1.7317, |
|
"step": 38 |
|
}, |
|
{ |
|
"epoch": 2.78, |
|
"grad_norm": 0.3217169940471649, |
|
"learning_rate": 0.001, |
|
"loss": 1.694, |
|
"step": 40 |
|
}, |
|
{ |
|
"epoch": 2.92, |
|
"grad_norm": 0.33735284209251404, |
|
"learning_rate": 0.001, |
|
"loss": 1.6403, |
|
"step": 42 |
|
}, |
|
{ |
|
"epoch": 2.99, |
|
"eval_gen_len": 511.0, |
|
"eval_loss": 1.5456656217575073, |
|
"eval_rouge1": 18.4716, |
|
"eval_rouge2": 5.4633, |
|
"eval_rougeL": 17.1393, |
|
"eval_rougeLsum": 16.9242, |
|
"eval_runtime": 1798.0277, |
|
"eval_samples_per_second": 0.188, |
|
"eval_steps_per_second": 0.024, |
|
"step": 43 |
|
}, |
|
{ |
|
"epoch": 3.06, |
|
"grad_norm": 0.20745837688446045, |
|
"learning_rate": 0.001, |
|
"loss": 1.5256, |
|
"step": 44 |
|
}, |
|
{ |
|
"epoch": 3.2, |
|
"grad_norm": 0.24278272688388824, |
|
"learning_rate": 0.001, |
|
"loss": 1.4077, |
|
"step": 46 |
|
}, |
|
{ |
|
"epoch": 3.34, |
|
"grad_norm": 3.5210845470428467, |
|
"learning_rate": 0.001, |
|
"loss": 1.4244, |
|
"step": 48 |
|
}, |
|
{ |
|
"epoch": 3.48, |
|
"grad_norm": 0.31759026646614075, |
|
"learning_rate": 0.001, |
|
"loss": 1.3542, |
|
"step": 50 |
|
}, |
|
{ |
|
"epoch": 3.62, |
|
"grad_norm": 0.2855791449546814, |
|
"learning_rate": 0.001, |
|
"loss": 1.3873, |
|
"step": 52 |
|
}, |
|
{ |
|
"epoch": 3.76, |
|
"grad_norm": 0.30171895027160645, |
|
"learning_rate": 0.001, |
|
"loss": 1.4693, |
|
"step": 54 |
|
}, |
|
{ |
|
"epoch": 3.9, |
|
"grad_norm": 0.28778406977653503, |
|
"learning_rate": 0.001, |
|
"loss": 1.5012, |
|
"step": 56 |
|
}, |
|
{ |
|
"epoch": 3.97, |
|
"eval_gen_len": 511.0, |
|
"eval_loss": 1.5736442804336548, |
|
"eval_rouge1": 18.2259, |
|
"eval_rouge2": 5.3524, |
|
"eval_rougeL": 17.0162, |
|
"eval_rougeLsum": 16.7948, |
|
"eval_runtime": 1799.9735, |
|
"eval_samples_per_second": 0.188, |
|
"eval_steps_per_second": 0.024, |
|
"step": 57 |
|
}, |
|
{ |
|
"epoch": 4.03, |
|
"grad_norm": 0.27410924434661865, |
|
"learning_rate": 0.001, |
|
"loss": 1.3865, |
|
"step": 58 |
|
}, |
|
{ |
|
"epoch": 4.17, |
|
"grad_norm": 0.2398337870836258, |
|
"learning_rate": 0.001, |
|
"loss": 1.198, |
|
"step": 60 |
|
}, |
|
{ |
|
"epoch": 4.31, |
|
"grad_norm": 0.24380528926849365, |
|
"learning_rate": 0.001, |
|
"loss": 1.1965, |
|
"step": 62 |
|
}, |
|
{ |
|
"epoch": 4.45, |
|
"grad_norm": 0.28130125999450684, |
|
"learning_rate": 0.001, |
|
"loss": 1.2576, |
|
"step": 64 |
|
}, |
|
{ |
|
"epoch": 4.59, |
|
"grad_norm": 0.22549273073673248, |
|
"learning_rate": 0.001, |
|
"loss": 1.2108, |
|
"step": 66 |
|
}, |
|
{ |
|
"epoch": 4.73, |
|
"grad_norm": 0.3336837589740753, |
|
"learning_rate": 0.001, |
|
"loss": 1.23, |
|
"step": 68 |
|
}, |
|
{ |
|
"epoch": 4.87, |
|
"grad_norm": 0.39294493198394775, |
|
"learning_rate": 0.001, |
|
"loss": 1.248, |
|
"step": 70 |
|
}, |
|
{ |
|
"epoch": 4.94, |
|
"eval_gen_len": 511.0, |
|
"eval_loss": 1.5482468605041504, |
|
"eval_rouge1": 20.8275, |
|
"eval_rouge2": 6.7412, |
|
"eval_rougeL": 18.0859, |
|
"eval_rougeLsum": 19.3113, |
|
"eval_runtime": 1798.5715, |
|
"eval_samples_per_second": 0.188, |
|
"eval_steps_per_second": 0.024, |
|
"step": 71 |
|
}, |
|
{ |
|
"epoch": 5.01, |
|
"grad_norm": 0.3731052577495575, |
|
"learning_rate": 0.001, |
|
"loss": 1.2523, |
|
"step": 72 |
|
}, |
|
{ |
|
"epoch": 5.15, |
|
"grad_norm": 0.33552882075309753, |
|
"learning_rate": 0.001, |
|
"loss": 1.0577, |
|
"step": 74 |
|
}, |
|
{ |
|
"epoch": 5.29, |
|
"grad_norm": 0.3163793087005615, |
|
"learning_rate": 0.001, |
|
"loss": 1.0478, |
|
"step": 76 |
|
}, |
|
{ |
|
"epoch": 5.43, |
|
"grad_norm": 0.21926109492778778, |
|
"learning_rate": 0.001, |
|
"loss": 1.0127, |
|
"step": 78 |
|
}, |
|
{ |
|
"epoch": 5.57, |
|
"grad_norm": 0.24710944294929504, |
|
"learning_rate": 0.001, |
|
"loss": 1.0042, |
|
"step": 80 |
|
}, |
|
{ |
|
"epoch": 5.7, |
|
"grad_norm": 0.2397957742214203, |
|
"learning_rate": 0.001, |
|
"loss": 1.0332, |
|
"step": 82 |
|
}, |
|
{ |
|
"epoch": 5.84, |
|
"grad_norm": 0.21428123116493225, |
|
"learning_rate": 0.001, |
|
"loss": 1.022, |
|
"step": 84 |
|
}, |
|
{ |
|
"epoch": 5.98, |
|
"grad_norm": 0.2227003127336502, |
|
"learning_rate": 0.001, |
|
"loss": 1.0176, |
|
"step": 86 |
|
}, |
|
{ |
|
"epoch": 5.98, |
|
"eval_gen_len": 510.6775147928994, |
|
"eval_loss": 1.625435709953308, |
|
"eval_rouge1": 21.1937, |
|
"eval_rouge2": 6.8813, |
|
"eval_rougeL": 18.411, |
|
"eval_rougeLsum": 19.8577, |
|
"eval_runtime": 1798.5872, |
|
"eval_samples_per_second": 0.188, |
|
"eval_steps_per_second": 0.024, |
|
"step": 86 |
|
}, |
|
{ |
|
"epoch": 6.12, |
|
"grad_norm": 0.2959192991256714, |
|
"learning_rate": 0.001, |
|
"loss": 0.879, |
|
"step": 88 |
|
}, |
|
{ |
|
"epoch": 6.26, |
|
"grad_norm": 0.33006206154823303, |
|
"learning_rate": 0.001, |
|
"loss": 0.8812, |
|
"step": 90 |
|
}, |
|
{ |
|
"epoch": 6.4, |
|
"grad_norm": 0.34284549951553345, |
|
"learning_rate": 0.001, |
|
"loss": 0.8742, |
|
"step": 92 |
|
}, |
|
{ |
|
"epoch": 6.54, |
|
"grad_norm": 0.4311819076538086, |
|
"learning_rate": 0.001, |
|
"loss": 0.8357, |
|
"step": 94 |
|
}, |
|
{ |
|
"epoch": 6.68, |
|
"grad_norm": 0.5699031352996826, |
|
"learning_rate": 0.001, |
|
"loss": 0.8721, |
|
"step": 96 |
|
}, |
|
{ |
|
"epoch": 6.82, |
|
"grad_norm": 0.39324450492858887, |
|
"learning_rate": 0.001, |
|
"loss": 0.8739, |
|
"step": 98 |
|
}, |
|
{ |
|
"epoch": 6.96, |
|
"grad_norm": 0.3442493677139282, |
|
"learning_rate": 0.001, |
|
"loss": 0.8472, |
|
"step": 100 |
|
}, |
|
{ |
|
"epoch": 6.96, |
|
"eval_gen_len": 479.9704142011834, |
|
"eval_loss": 1.6212307214736938, |
|
"eval_rouge1": 26.1873, |
|
"eval_rouge2": 9.1581, |
|
"eval_rougeL": 20.393, |
|
"eval_rougeLsum": 24.1393, |
|
"eval_runtime": 1802.4729, |
|
"eval_samples_per_second": 0.188, |
|
"eval_steps_per_second": 0.024, |
|
"step": 100 |
|
}, |
|
{ |
|
"epoch": 7.1, |
|
"grad_norm": 0.2600483298301697, |
|
"learning_rate": 0.001, |
|
"loss": 0.7568, |
|
"step": 102 |
|
}, |
|
{ |
|
"epoch": 7.23, |
|
"grad_norm": 0.28727108240127563, |
|
"learning_rate": 0.001, |
|
"loss": 0.6971, |
|
"step": 104 |
|
}, |
|
{ |
|
"epoch": 7.37, |
|
"grad_norm": 0.3065392076969147, |
|
"learning_rate": 0.001, |
|
"loss": 0.6918, |
|
"step": 106 |
|
}, |
|
{ |
|
"epoch": 7.51, |
|
"grad_norm": 0.427791029214859, |
|
"learning_rate": 0.001, |
|
"loss": 0.6902, |
|
"step": 108 |
|
}, |
|
{ |
|
"epoch": 7.65, |
|
"grad_norm": 0.48664093017578125, |
|
"learning_rate": 0.001, |
|
"loss": 0.7415, |
|
"step": 110 |
|
}, |
|
{ |
|
"epoch": 7.79, |
|
"grad_norm": 0.2857199013233185, |
|
"learning_rate": 0.001, |
|
"loss": 0.7442, |
|
"step": 112 |
|
}, |
|
{ |
|
"epoch": 7.93, |
|
"grad_norm": 0.24586661159992218, |
|
"learning_rate": 0.001, |
|
"loss": 0.7242, |
|
"step": 114 |
|
}, |
|
{ |
|
"epoch": 8.0, |
|
"eval_gen_len": 506.9112426035503, |
|
"eval_loss": 1.723126769065857, |
|
"eval_rouge1": 23.5881, |
|
"eval_rouge2": 7.8961, |
|
"eval_rougeL": 18.7014, |
|
"eval_rougeLsum": 22.2999, |
|
"eval_runtime": 1807.8192, |
|
"eval_samples_per_second": 0.187, |
|
"eval_steps_per_second": 0.024, |
|
"step": 115 |
|
}, |
|
{ |
|
"epoch": 8.07, |
|
"grad_norm": 0.21033655107021332, |
|
"learning_rate": 0.001, |
|
"loss": 0.6797, |
|
"step": 116 |
|
}, |
|
{ |
|
"epoch": 8.21, |
|
"grad_norm": 0.22591687738895416, |
|
"learning_rate": 0.001, |
|
"loss": 0.5446, |
|
"step": 118 |
|
}, |
|
{ |
|
"epoch": 8.35, |
|
"grad_norm": 0.20658165216445923, |
|
"learning_rate": 0.001, |
|
"loss": 0.5545, |
|
"step": 120 |
|
}, |
|
{ |
|
"epoch": 8.49, |
|
"grad_norm": 0.29855239391326904, |
|
"learning_rate": 0.001, |
|
"loss": 0.6124, |
|
"step": 122 |
|
}, |
|
{ |
|
"epoch": 8.63, |
|
"grad_norm": 0.3976292312145233, |
|
"learning_rate": 0.001, |
|
"loss": 0.6052, |
|
"step": 124 |
|
}, |
|
{ |
|
"epoch": 8.77, |
|
"grad_norm": 0.27770739793777466, |
|
"learning_rate": 0.001, |
|
"loss": 0.5755, |
|
"step": 126 |
|
}, |
|
{ |
|
"epoch": 8.9, |
|
"grad_norm": 0.2741471529006958, |
|
"learning_rate": 0.001, |
|
"loss": 0.5876, |
|
"step": 128 |
|
}, |
|
{ |
|
"epoch": 8.97, |
|
"eval_gen_len": 451.698224852071, |
|
"eval_loss": 1.9400925636291504, |
|
"eval_rouge1": 32.1851, |
|
"eval_rouge2": 12.6426, |
|
"eval_rougeL": 22.8358, |
|
"eval_rougeLsum": 30.6718, |
|
"eval_runtime": 1805.3092, |
|
"eval_samples_per_second": 0.187, |
|
"eval_steps_per_second": 0.024, |
|
"step": 129 |
|
}, |
|
{ |
|
"epoch": 9.04, |
|
"grad_norm": 0.25768765807151794, |
|
"learning_rate": 0.001, |
|
"loss": 0.5517, |
|
"step": 130 |
|
}, |
|
{ |
|
"epoch": 9.18, |
|
"grad_norm": 0.203142449259758, |
|
"learning_rate": 0.001, |
|
"loss": 0.4295, |
|
"step": 132 |
|
}, |
|
{ |
|
"epoch": 9.32, |
|
"grad_norm": 0.29351434111595154, |
|
"learning_rate": 0.001, |
|
"loss": 0.493, |
|
"step": 134 |
|
}, |
|
{ |
|
"epoch": 9.46, |
|
"grad_norm": 0.23967808485031128, |
|
"learning_rate": 0.001, |
|
"loss": 0.4877, |
|
"step": 136 |
|
}, |
|
{ |
|
"epoch": 9.6, |
|
"grad_norm": 0.21488718688488007, |
|
"learning_rate": 0.001, |
|
"loss": 0.4943, |
|
"step": 138 |
|
}, |
|
{ |
|
"epoch": 9.74, |
|
"grad_norm": 0.20587602257728577, |
|
"learning_rate": 0.001, |
|
"loss": 0.4729, |
|
"step": 140 |
|
}, |
|
{ |
|
"epoch": 9.88, |
|
"grad_norm": 0.2094978392124176, |
|
"learning_rate": 0.001, |
|
"loss": 0.4756, |
|
"step": 142 |
|
}, |
|
{ |
|
"epoch": 9.95, |
|
"eval_gen_len": 455.594674556213, |
|
"eval_loss": 1.9001177549362183, |
|
"eval_rouge1": 31.353, |
|
"eval_rouge2": 12.994, |
|
"eval_rougeL": 23.1542, |
|
"eval_rougeLsum": 29.8375, |
|
"eval_runtime": 1806.0454, |
|
"eval_samples_per_second": 0.187, |
|
"eval_steps_per_second": 0.024, |
|
"step": 143 |
|
}, |
|
{ |
|
"epoch": 10.02, |
|
"grad_norm": 0.2443789541721344, |
|
"learning_rate": 0.001, |
|
"loss": 0.4707, |
|
"step": 144 |
|
}, |
|
{ |
|
"epoch": 10.16, |
|
"grad_norm": 0.21666786074638367, |
|
"learning_rate": 0.001, |
|
"loss": 0.3612, |
|
"step": 146 |
|
}, |
|
{ |
|
"epoch": 10.3, |
|
"grad_norm": 0.20268017053604126, |
|
"learning_rate": 0.001, |
|
"loss": 0.3739, |
|
"step": 148 |
|
}, |
|
{ |
|
"epoch": 10.43, |
|
"grad_norm": 0.22428925335407257, |
|
"learning_rate": 0.001, |
|
"loss": 0.382, |
|
"step": 150 |
|
}, |
|
{ |
|
"epoch": 10.57, |
|
"grad_norm": 0.21844923496246338, |
|
"learning_rate": 0.001, |
|
"loss": 0.3623, |
|
"step": 152 |
|
}, |
|
{ |
|
"epoch": 10.71, |
|
"grad_norm": 0.2675388753414154, |
|
"learning_rate": 0.001, |
|
"loss": 0.3674, |
|
"step": 154 |
|
}, |
|
{ |
|
"epoch": 10.85, |
|
"grad_norm": 0.2905120849609375, |
|
"learning_rate": 0.001, |
|
"loss": 0.39, |
|
"step": 156 |
|
}, |
|
{ |
|
"epoch": 10.99, |
|
"grad_norm": 0.27420204877853394, |
|
"learning_rate": 0.001, |
|
"loss": 0.4042, |
|
"step": 158 |
|
}, |
|
{ |
|
"epoch": 10.99, |
|
"eval_gen_len": 497.53550295857985, |
|
"eval_loss": 2.1294684410095215, |
|
"eval_rouge1": 28.6425, |
|
"eval_rouge2": 11.8399, |
|
"eval_rougeL": 21.3847, |
|
"eval_rougeLsum": 27.0508, |
|
"eval_runtime": 1807.4153, |
|
"eval_samples_per_second": 0.187, |
|
"eval_steps_per_second": 0.024, |
|
"step": 158 |
|
}, |
|
{ |
|
"epoch": 11.13, |
|
"grad_norm": 0.26691916584968567, |
|
"learning_rate": 0.001, |
|
"loss": 0.3127, |
|
"step": 160 |
|
}, |
|
{ |
|
"epoch": 11.27, |
|
"grad_norm": 0.3042663335800171, |
|
"learning_rate": 0.001, |
|
"loss": 0.305, |
|
"step": 162 |
|
}, |
|
{ |
|
"epoch": 11.41, |
|
"grad_norm": 0.26255106925964355, |
|
"learning_rate": 0.001, |
|
"loss": 0.3133, |
|
"step": 164 |
|
}, |
|
{ |
|
"epoch": 11.55, |
|
"grad_norm": 0.23816817998886108, |
|
"learning_rate": 0.001, |
|
"loss": 0.3118, |
|
"step": 166 |
|
}, |
|
{ |
|
"epoch": 11.69, |
|
"grad_norm": 0.22553777694702148, |
|
"learning_rate": 0.001, |
|
"loss": 0.3073, |
|
"step": 168 |
|
}, |
|
{ |
|
"epoch": 11.83, |
|
"grad_norm": 0.2234884351491928, |
|
"learning_rate": 0.001, |
|
"loss": 0.3346, |
|
"step": 170 |
|
}, |
|
{ |
|
"epoch": 11.97, |
|
"grad_norm": 0.18143154680728912, |
|
"learning_rate": 0.001, |
|
"loss": 0.3292, |
|
"step": 172 |
|
}, |
|
{ |
|
"epoch": 11.97, |
|
"eval_gen_len": 478.81065088757396, |
|
"eval_loss": 2.244086503982544, |
|
"eval_rouge1": 31.8393, |
|
"eval_rouge2": 13.1308, |
|
"eval_rougeL": 22.135, |
|
"eval_rougeLsum": 30.5866, |
|
"eval_runtime": 1798.3958, |
|
"eval_samples_per_second": 0.188, |
|
"eval_steps_per_second": 0.024, |
|
"step": 172 |
|
}, |
|
{ |
|
"epoch": 12.1, |
|
"grad_norm": 0.24745677411556244, |
|
"learning_rate": 0.001, |
|
"loss": 0.2539, |
|
"step": 174 |
|
}, |
|
{ |
|
"epoch": 12.24, |
|
"grad_norm": 0.26513755321502686, |
|
"learning_rate": 0.001, |
|
"loss": 0.2588, |
|
"step": 176 |
|
}, |
|
{ |
|
"epoch": 12.38, |
|
"grad_norm": 0.20156317949295044, |
|
"learning_rate": 0.001, |
|
"loss": 0.2537, |
|
"step": 178 |
|
}, |
|
{ |
|
"epoch": 12.52, |
|
"grad_norm": 0.21362556517124176, |
|
"learning_rate": 0.001, |
|
"loss": 0.2812, |
|
"step": 180 |
|
}, |
|
{ |
|
"epoch": 12.66, |
|
"grad_norm": 0.5383086800575256, |
|
"learning_rate": 0.001, |
|
"loss": 0.2594, |
|
"step": 182 |
|
}, |
|
{ |
|
"epoch": 12.8, |
|
"grad_norm": 0.2891131639480591, |
|
"learning_rate": 0.001, |
|
"loss": 0.2629, |
|
"step": 184 |
|
}, |
|
{ |
|
"epoch": 12.94, |
|
"grad_norm": 0.265836238861084, |
|
"learning_rate": 0.001, |
|
"loss": 0.2812, |
|
"step": 186 |
|
}, |
|
{ |
|
"epoch": 12.94, |
|
"eval_gen_len": 429.99112426035504, |
|
"eval_loss": 2.3464245796203613, |
|
"eval_rouge1": 34.4102, |
|
"eval_rouge2": 14.3607, |
|
"eval_rougeL": 23.8634, |
|
"eval_rougeLsum": 32.9732, |
|
"eval_runtime": 1798.2194, |
|
"eval_samples_per_second": 0.188, |
|
"eval_steps_per_second": 0.024, |
|
"step": 186 |
|
}, |
|
{ |
|
"epoch": 13.08, |
|
"grad_norm": 0.2541401982307434, |
|
"learning_rate": 0.001, |
|
"loss": 0.2283, |
|
"step": 188 |
|
}, |
|
{ |
|
"epoch": 13.22, |
|
"grad_norm": 9.848714828491211, |
|
"learning_rate": 0.001, |
|
"loss": 0.206, |
|
"step": 190 |
|
}, |
|
{ |
|
"epoch": 13.36, |
|
"grad_norm": 0.4088878333568573, |
|
"learning_rate": 0.001, |
|
"loss": 0.2014, |
|
"step": 192 |
|
}, |
|
{ |
|
"epoch": 13.5, |
|
"grad_norm": 0.4533099830150604, |
|
"learning_rate": 0.001, |
|
"loss": 0.2292, |
|
"step": 194 |
|
}, |
|
{ |
|
"epoch": 13.63, |
|
"grad_norm": 0.28066885471343994, |
|
"learning_rate": 0.001, |
|
"loss": 0.2202, |
|
"step": 196 |
|
}, |
|
{ |
|
"epoch": 13.77, |
|
"grad_norm": 0.38810494542121887, |
|
"learning_rate": 0.001, |
|
"loss": 0.2278, |
|
"step": 198 |
|
}, |
|
{ |
|
"epoch": 13.91, |
|
"grad_norm": 0.2568497657775879, |
|
"learning_rate": 0.001, |
|
"loss": 0.2443, |
|
"step": 200 |
|
}, |
|
{ |
|
"epoch": 13.98, |
|
"eval_gen_len": 392.53846153846155, |
|
"eval_loss": 2.2002713680267334, |
|
"eval_rouge1": 34.8239, |
|
"eval_rouge2": 14.8042, |
|
"eval_rougeL": 25.2438, |
|
"eval_rougeLsum": 33.0469, |
|
"eval_runtime": 1797.5392, |
|
"eval_samples_per_second": 0.188, |
|
"eval_steps_per_second": 0.024, |
|
"step": 201 |
|
}, |
|
{ |
|
"epoch": 14.05, |
|
"grad_norm": 0.279291570186615, |
|
"learning_rate": 0.001, |
|
"loss": 0.2362, |
|
"step": 202 |
|
}, |
|
{ |
|
"epoch": 14.19, |
|
"grad_norm": 0.18151430785655975, |
|
"learning_rate": 0.001, |
|
"loss": 0.1807, |
|
"step": 204 |
|
}, |
|
{ |
|
"epoch": 14.33, |
|
"grad_norm": 0.2227843850851059, |
|
"learning_rate": 0.001, |
|
"loss": 0.1708, |
|
"step": 206 |
|
}, |
|
{ |
|
"epoch": 14.47, |
|
"grad_norm": 0.2937067151069641, |
|
"learning_rate": 0.001, |
|
"loss": 0.1818, |
|
"step": 208 |
|
}, |
|
{ |
|
"epoch": 14.61, |
|
"grad_norm": 0.3238927125930786, |
|
"learning_rate": 0.001, |
|
"loss": 0.1958, |
|
"step": 210 |
|
}, |
|
{ |
|
"epoch": 14.61, |
|
"eval_gen_len": 503.5769230769231, |
|
"eval_loss": 2.5840089321136475, |
|
"eval_rouge1": 29.7482, |
|
"eval_rouge2": 12.0072, |
|
"eval_rougeL": 21.348, |
|
"eval_rougeLsum": 28.5849, |
|
"eval_runtime": 1799.2535, |
|
"eval_samples_per_second": 0.188, |
|
"eval_steps_per_second": 0.024, |
|
"step": 210 |
|
}, |
|
{ |
|
"epoch": 14.61, |
|
"step": 210, |
|
"total_flos": 3.6715210940733604e+18, |
|
"train_loss": 0.9784720075981957, |
|
"train_runtime": 78497.3694, |
|
"train_samples_per_second": 0.702, |
|
"train_steps_per_second": 0.003 |
|
} |
|
], |
|
"logging_steps": 2, |
|
"max_steps": 210, |
|
"num_input_tokens_seen": 0, |
|
"num_train_epochs": 15, |
|
"save_steps": 500, |
|
"total_flos": 3.6715210940733604e+18, |
|
"train_batch_size": 8, |
|
"trial_name": null, |
|
"trial_params": null |
|
} |
|
|