ncbateman's picture
Training in progress, step 520, checkpoint
1943437 verified
{
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 4.964200477326969,
"eval_steps": 52,
"global_step": 520,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.00954653937947494,
"grad_norm": 0.541497528553009,
"learning_rate": 2.0000000000000003e-06,
"loss": 0.8864,
"step": 1
},
{
"epoch": 0.00954653937947494,
"eval_loss": 0.975593626499176,
"eval_runtime": 12.7833,
"eval_samples_per_second": 13.846,
"eval_steps_per_second": 1.799,
"step": 1
},
{
"epoch": 0.01909307875894988,
"grad_norm": 0.7477102875709534,
"learning_rate": 4.000000000000001e-06,
"loss": 0.9382,
"step": 2
},
{
"epoch": 0.028639618138424822,
"grad_norm": 0.8567990064620972,
"learning_rate": 6e-06,
"loss": 1.0294,
"step": 3
},
{
"epoch": 0.03818615751789976,
"grad_norm": 0.737090528011322,
"learning_rate": 8.000000000000001e-06,
"loss": 0.9398,
"step": 4
},
{
"epoch": 0.0477326968973747,
"grad_norm": 0.6384573578834534,
"learning_rate": 1e-05,
"loss": 0.9345,
"step": 5
},
{
"epoch": 0.057279236276849645,
"grad_norm": 0.7606269121170044,
"learning_rate": 1.2e-05,
"loss": 0.941,
"step": 6
},
{
"epoch": 0.06682577565632458,
"grad_norm": 0.7251742482185364,
"learning_rate": 1.4000000000000001e-05,
"loss": 0.958,
"step": 7
},
{
"epoch": 0.07637231503579953,
"grad_norm": 0.6773186922073364,
"learning_rate": 1.6000000000000003e-05,
"loss": 0.9318,
"step": 8
},
{
"epoch": 0.08591885441527446,
"grad_norm": 0.8379804491996765,
"learning_rate": 1.8e-05,
"loss": 0.9919,
"step": 9
},
{
"epoch": 0.0954653937947494,
"grad_norm": 0.9668668508529663,
"learning_rate": 2e-05,
"loss": 1.1819,
"step": 10
},
{
"epoch": 0.10501193317422435,
"grad_norm": 0.5556192398071289,
"learning_rate": 2.2000000000000003e-05,
"loss": 0.9474,
"step": 11
},
{
"epoch": 0.11455847255369929,
"grad_norm": 0.6156389117240906,
"learning_rate": 2.4e-05,
"loss": 0.9212,
"step": 12
},
{
"epoch": 0.12410501193317422,
"grad_norm": 0.6029731035232544,
"learning_rate": 2.6000000000000002e-05,
"loss": 0.8982,
"step": 13
},
{
"epoch": 0.13365155131264916,
"grad_norm": 0.6002732515335083,
"learning_rate": 2.8000000000000003e-05,
"loss": 0.9832,
"step": 14
},
{
"epoch": 0.1431980906921241,
"grad_norm": 0.6266693472862244,
"learning_rate": 3e-05,
"loss": 0.9485,
"step": 15
},
{
"epoch": 0.15274463007159905,
"grad_norm": 0.5493901371955872,
"learning_rate": 3.2000000000000005e-05,
"loss": 0.9013,
"step": 16
},
{
"epoch": 0.162291169451074,
"grad_norm": 0.5467607378959656,
"learning_rate": 3.4000000000000007e-05,
"loss": 0.8975,
"step": 17
},
{
"epoch": 0.1718377088305489,
"grad_norm": 0.556983232498169,
"learning_rate": 3.6e-05,
"loss": 1.1175,
"step": 18
},
{
"epoch": 0.18138424821002386,
"grad_norm": 0.4399558901786804,
"learning_rate": 3.8e-05,
"loss": 0.8975,
"step": 19
},
{
"epoch": 0.1909307875894988,
"grad_norm": 0.4225325882434845,
"learning_rate": 4e-05,
"loss": 0.8186,
"step": 20
},
{
"epoch": 0.20047732696897375,
"grad_norm": 0.38200807571411133,
"learning_rate": 4.2e-05,
"loss": 0.82,
"step": 21
},
{
"epoch": 0.2100238663484487,
"grad_norm": 0.3150412440299988,
"learning_rate": 4.4000000000000006e-05,
"loss": 0.844,
"step": 22
},
{
"epoch": 0.21957040572792363,
"grad_norm": 0.3351333737373352,
"learning_rate": 4.600000000000001e-05,
"loss": 0.9004,
"step": 23
},
{
"epoch": 0.22911694510739858,
"grad_norm": 0.3316827416419983,
"learning_rate": 4.8e-05,
"loss": 0.8965,
"step": 24
},
{
"epoch": 0.2386634844868735,
"grad_norm": 0.31210237741470337,
"learning_rate": 5e-05,
"loss": 0.9084,
"step": 25
},
{
"epoch": 0.24821002386634844,
"grad_norm": 0.33221927285194397,
"learning_rate": 5.2000000000000004e-05,
"loss": 0.9448,
"step": 26
},
{
"epoch": 0.2577565632458234,
"grad_norm": 0.35407039523124695,
"learning_rate": 5.4000000000000005e-05,
"loss": 0.82,
"step": 27
},
{
"epoch": 0.26730310262529833,
"grad_norm": 0.34163302183151245,
"learning_rate": 5.6000000000000006e-05,
"loss": 0.8843,
"step": 28
},
{
"epoch": 0.27684964200477324,
"grad_norm": 0.3265129029750824,
"learning_rate": 5.8e-05,
"loss": 0.8376,
"step": 29
},
{
"epoch": 0.2863961813842482,
"grad_norm": 0.38681215047836304,
"learning_rate": 6e-05,
"loss": 0.9903,
"step": 30
},
{
"epoch": 0.29594272076372313,
"grad_norm": 0.3341940641403198,
"learning_rate": 6.2e-05,
"loss": 0.8054,
"step": 31
},
{
"epoch": 0.3054892601431981,
"grad_norm": 0.3521149158477783,
"learning_rate": 6.400000000000001e-05,
"loss": 0.8931,
"step": 32
},
{
"epoch": 0.315035799522673,
"grad_norm": 0.3392002284526825,
"learning_rate": 6.6e-05,
"loss": 0.7776,
"step": 33
},
{
"epoch": 0.324582338902148,
"grad_norm": 0.3627275824546814,
"learning_rate": 6.800000000000001e-05,
"loss": 0.7253,
"step": 34
},
{
"epoch": 0.3341288782816229,
"grad_norm": 0.33696770668029785,
"learning_rate": 7e-05,
"loss": 0.9011,
"step": 35
},
{
"epoch": 0.3436754176610978,
"grad_norm": 0.3550478518009186,
"learning_rate": 7.2e-05,
"loss": 0.8064,
"step": 36
},
{
"epoch": 0.3532219570405728,
"grad_norm": 0.3183474838733673,
"learning_rate": 7.4e-05,
"loss": 0.8094,
"step": 37
},
{
"epoch": 0.3627684964200477,
"grad_norm": 0.33637434244155884,
"learning_rate": 7.6e-05,
"loss": 0.8637,
"step": 38
},
{
"epoch": 0.3723150357995227,
"grad_norm": 0.34323257207870483,
"learning_rate": 7.800000000000001e-05,
"loss": 0.8717,
"step": 39
},
{
"epoch": 0.3818615751789976,
"grad_norm": 0.290461003780365,
"learning_rate": 8e-05,
"loss": 0.8501,
"step": 40
},
{
"epoch": 0.3914081145584726,
"grad_norm": 0.35435885190963745,
"learning_rate": 8.2e-05,
"loss": 0.8446,
"step": 41
},
{
"epoch": 0.4009546539379475,
"grad_norm": 0.30319270491600037,
"learning_rate": 8.4e-05,
"loss": 0.8175,
"step": 42
},
{
"epoch": 0.4105011933174224,
"grad_norm": 0.28563714027404785,
"learning_rate": 8.6e-05,
"loss": 0.7933,
"step": 43
},
{
"epoch": 0.4200477326968974,
"grad_norm": 0.26857540011405945,
"learning_rate": 8.800000000000001e-05,
"loss": 0.7177,
"step": 44
},
{
"epoch": 0.4295942720763723,
"grad_norm": 0.27898675203323364,
"learning_rate": 9e-05,
"loss": 0.7506,
"step": 45
},
{
"epoch": 0.43914081145584727,
"grad_norm": 0.2959842383861542,
"learning_rate": 9.200000000000001e-05,
"loss": 0.859,
"step": 46
},
{
"epoch": 0.4486873508353222,
"grad_norm": 0.2996789515018463,
"learning_rate": 9.4e-05,
"loss": 0.7429,
"step": 47
},
{
"epoch": 0.45823389021479716,
"grad_norm": 0.2459433376789093,
"learning_rate": 9.6e-05,
"loss": 0.6911,
"step": 48
},
{
"epoch": 0.4677804295942721,
"grad_norm": 0.318551629781723,
"learning_rate": 9.8e-05,
"loss": 0.8618,
"step": 49
},
{
"epoch": 0.477326968973747,
"grad_norm": 0.30586713552474976,
"learning_rate": 0.0001,
"loss": 0.8546,
"step": 50
},
{
"epoch": 0.48687350835322196,
"grad_norm": 0.33441683650016785,
"learning_rate": 9.999888302765345e-05,
"loss": 0.8177,
"step": 51
},
{
"epoch": 0.4964200477326969,
"grad_norm": 0.3031998872756958,
"learning_rate": 9.99955321605189e-05,
"loss": 0.8585,
"step": 52
},
{
"epoch": 0.4964200477326969,
"eval_loss": 0.8117080926895142,
"eval_runtime": 12.9734,
"eval_samples_per_second": 13.643,
"eval_steps_per_second": 1.773,
"step": 52
},
{
"epoch": 0.5059665871121718,
"grad_norm": 0.2897348403930664,
"learning_rate": 9.99899475483094e-05,
"loss": 0.731,
"step": 53
},
{
"epoch": 0.5155131264916468,
"grad_norm": 0.29636526107788086,
"learning_rate": 9.99821294405392e-05,
"loss": 0.8064,
"step": 54
},
{
"epoch": 0.5250596658711217,
"grad_norm": 0.3027283847332001,
"learning_rate": 9.997207818651274e-05,
"loss": 0.7737,
"step": 55
},
{
"epoch": 0.5346062052505967,
"grad_norm": 0.2626173794269562,
"learning_rate": 9.995979423530892e-05,
"loss": 0.793,
"step": 56
},
{
"epoch": 0.5441527446300716,
"grad_norm": 0.2591354548931122,
"learning_rate": 9.99452781357611e-05,
"loss": 0.7764,
"step": 57
},
{
"epoch": 0.5536992840095465,
"grad_norm": 0.3225250244140625,
"learning_rate": 9.992853053643257e-05,
"loss": 0.8997,
"step": 58
},
{
"epoch": 0.5632458233890215,
"grad_norm": 0.29806965589523315,
"learning_rate": 9.99095521855875e-05,
"loss": 0.73,
"step": 59
},
{
"epoch": 0.5727923627684964,
"grad_norm": 0.28511664271354675,
"learning_rate": 9.988834393115767e-05,
"loss": 0.7074,
"step": 60
},
{
"epoch": 0.5823389021479713,
"grad_norm": 0.2897747755050659,
"learning_rate": 9.986490672070437e-05,
"loss": 0.8004,
"step": 61
},
{
"epoch": 0.5918854415274463,
"grad_norm": 0.31389617919921875,
"learning_rate": 9.983924160137625e-05,
"loss": 0.7397,
"step": 62
},
{
"epoch": 0.6014319809069213,
"grad_norm": 0.2554086744785309,
"learning_rate": 9.98113497198625e-05,
"loss": 0.8052,
"step": 63
},
{
"epoch": 0.6109785202863962,
"grad_norm": 0.2977796494960785,
"learning_rate": 9.978123232234147e-05,
"loss": 0.7742,
"step": 64
},
{
"epoch": 0.6205250596658711,
"grad_norm": 0.2979027330875397,
"learning_rate": 9.974889075442521e-05,
"loss": 0.7169,
"step": 65
},
{
"epoch": 0.630071599045346,
"grad_norm": 0.29229047894477844,
"learning_rate": 9.971432646109919e-05,
"loss": 0.8076,
"step": 66
},
{
"epoch": 0.639618138424821,
"grad_norm": 0.29552674293518066,
"learning_rate": 9.967754098665778e-05,
"loss": 0.7533,
"step": 67
},
{
"epoch": 0.649164677804296,
"grad_norm": 0.2626473605632782,
"learning_rate": 9.963853597463533e-05,
"loss": 0.7669,
"step": 68
},
{
"epoch": 0.6587112171837709,
"grad_norm": 0.28999584913253784,
"learning_rate": 9.959731316773259e-05,
"loss": 0.8175,
"step": 69
},
{
"epoch": 0.6682577565632458,
"grad_norm": 0.3006598949432373,
"learning_rate": 9.9553874407739e-05,
"loss": 0.822,
"step": 70
},
{
"epoch": 0.6778042959427207,
"grad_norm": 0.3051898181438446,
"learning_rate": 9.950822163545032e-05,
"loss": 0.8513,
"step": 71
},
{
"epoch": 0.6873508353221957,
"grad_norm": 0.3299737572669983,
"learning_rate": 9.946035689058188e-05,
"loss": 0.8559,
"step": 72
},
{
"epoch": 0.6968973747016707,
"grad_norm": 0.2913152575492859,
"learning_rate": 9.941028231167756e-05,
"loss": 0.7742,
"step": 73
},
{
"epoch": 0.7064439140811456,
"grad_norm": 0.2927692234516144,
"learning_rate": 9.935800013601414e-05,
"loss": 0.8335,
"step": 74
},
{
"epoch": 0.7159904534606205,
"grad_norm": 0.28141114115715027,
"learning_rate": 9.930351269950143e-05,
"loss": 0.7976,
"step": 75
},
{
"epoch": 0.7255369928400954,
"grad_norm": 0.2617853581905365,
"learning_rate": 9.924682243657779e-05,
"loss": 0.7707,
"step": 76
},
{
"epoch": 0.7350835322195705,
"grad_norm": 0.3233207166194916,
"learning_rate": 9.918793188010147e-05,
"loss": 0.8698,
"step": 77
},
{
"epoch": 0.7446300715990454,
"grad_norm": 0.29364439845085144,
"learning_rate": 9.91268436612374e-05,
"loss": 0.79,
"step": 78
},
{
"epoch": 0.7541766109785203,
"grad_norm": 0.3051166832447052,
"learning_rate": 9.906356050933961e-05,
"loss": 0.864,
"step": 79
},
{
"epoch": 0.7637231503579952,
"grad_norm": 0.2952063977718353,
"learning_rate": 9.899808525182935e-05,
"loss": 0.7312,
"step": 80
},
{
"epoch": 0.7732696897374701,
"grad_norm": 0.2865234911441803,
"learning_rate": 9.893042081406867e-05,
"loss": 0.7051,
"step": 81
},
{
"epoch": 0.7828162291169452,
"grad_norm": 0.3515304625034332,
"learning_rate": 9.886057021922982e-05,
"loss": 0.917,
"step": 82
},
{
"epoch": 0.7923627684964201,
"grad_norm": 0.2626941204071045,
"learning_rate": 9.878853658816014e-05,
"loss": 0.7131,
"step": 83
},
{
"epoch": 0.801909307875895,
"grad_norm": 0.276617169380188,
"learning_rate": 9.871432313924255e-05,
"loss": 0.7683,
"step": 84
},
{
"epoch": 0.8114558472553699,
"grad_norm": 0.2965279519557953,
"learning_rate": 9.863793318825186e-05,
"loss": 0.686,
"step": 85
},
{
"epoch": 0.8210023866348448,
"grad_norm": 0.3024349510669708,
"learning_rate": 9.85593701482066e-05,
"loss": 0.8203,
"step": 86
},
{
"epoch": 0.8305489260143198,
"grad_norm": 0.31996044516563416,
"learning_rate": 9.847863752921649e-05,
"loss": 0.8381,
"step": 87
},
{
"epoch": 0.8400954653937948,
"grad_norm": 0.2804871201515198,
"learning_rate": 9.839573893832563e-05,
"loss": 0.8378,
"step": 88
},
{
"epoch": 0.8496420047732697,
"grad_norm": 0.3115004301071167,
"learning_rate": 9.831067807935137e-05,
"loss": 0.7382,
"step": 89
},
{
"epoch": 0.8591885441527446,
"grad_norm": 0.3251977264881134,
"learning_rate": 9.822345875271883e-05,
"loss": 0.8506,
"step": 90
},
{
"epoch": 0.8687350835322196,
"grad_norm": 0.32227852940559387,
"learning_rate": 9.813408485529103e-05,
"loss": 0.7768,
"step": 91
},
{
"epoch": 0.8782816229116945,
"grad_norm": 0.33089524507522583,
"learning_rate": 9.804256038019481e-05,
"loss": 0.7763,
"step": 92
},
{
"epoch": 0.8878281622911695,
"grad_norm": 0.3890259563922882,
"learning_rate": 9.794888941664253e-05,
"loss": 0.9264,
"step": 93
},
{
"epoch": 0.8973747016706444,
"grad_norm": 0.31058862805366516,
"learning_rate": 9.785307614974921e-05,
"loss": 0.8139,
"step": 94
},
{
"epoch": 0.9069212410501193,
"grad_norm": 0.2780233323574066,
"learning_rate": 9.775512486034563e-05,
"loss": 0.8274,
"step": 95
},
{
"epoch": 0.9164677804295943,
"grad_norm": 0.3168707489967346,
"learning_rate": 9.765503992478704e-05,
"loss": 0.8425,
"step": 96
},
{
"epoch": 0.9260143198090692,
"grad_norm": 0.3951367139816284,
"learning_rate": 9.755282581475769e-05,
"loss": 0.6909,
"step": 97
},
{
"epoch": 0.9355608591885441,
"grad_norm": 0.3271735608577728,
"learning_rate": 9.744848709707091e-05,
"loss": 0.7677,
"step": 98
},
{
"epoch": 0.9451073985680191,
"grad_norm": 0.26784732937812805,
"learning_rate": 9.734202843346522e-05,
"loss": 0.6579,
"step": 99
},
{
"epoch": 0.954653937947494,
"grad_norm": 0.3045744001865387,
"learning_rate": 9.723345458039594e-05,
"loss": 0.738,
"step": 100
},
{
"epoch": 0.964200477326969,
"grad_norm": 0.30207037925720215,
"learning_rate": 9.712277038882273e-05,
"loss": 0.7435,
"step": 101
},
{
"epoch": 0.9737470167064439,
"grad_norm": 0.26012739539146423,
"learning_rate": 9.700998080399287e-05,
"loss": 0.6577,
"step": 102
},
{
"epoch": 0.9832935560859188,
"grad_norm": 0.3377532660961151,
"learning_rate": 9.689509086522019e-05,
"loss": 0.8357,
"step": 103
},
{
"epoch": 0.9928400954653938,
"grad_norm": 0.2794972360134125,
"learning_rate": 9.67781057056601e-05,
"loss": 0.7169,
"step": 104
},
{
"epoch": 0.9928400954653938,
"eval_loss": 0.7859669327735901,
"eval_runtime": 13.0799,
"eval_samples_per_second": 13.532,
"eval_steps_per_second": 1.758,
"step": 104
},
{
"epoch": 1.0023866348448687,
"grad_norm": 0.34849369525909424,
"learning_rate": 9.665903055208014e-05,
"loss": 0.7555,
"step": 105
},
{
"epoch": 1.0119331742243436,
"grad_norm": 0.32931214570999146,
"learning_rate": 9.653787072462643e-05,
"loss": 0.8744,
"step": 106
},
{
"epoch": 1.0214797136038185,
"grad_norm": 0.2845034897327423,
"learning_rate": 9.641463163658605e-05,
"loss": 0.7014,
"step": 107
},
{
"epoch": 1.0310262529832936,
"grad_norm": 0.3317318260669708,
"learning_rate": 9.628931879414517e-05,
"loss": 0.6797,
"step": 108
},
{
"epoch": 1.0405727923627686,
"grad_norm": 0.3143399655818939,
"learning_rate": 9.616193779614294e-05,
"loss": 0.7855,
"step": 109
},
{
"epoch": 1.0501193317422435,
"grad_norm": 0.3227895498275757,
"learning_rate": 9.603249433382144e-05,
"loss": 0.8448,
"step": 110
},
{
"epoch": 1.0596658711217184,
"grad_norm": 0.29557734727859497,
"learning_rate": 9.590099419057141e-05,
"loss": 0.7718,
"step": 111
},
{
"epoch": 1.0692124105011933,
"grad_norm": 0.2954796254634857,
"learning_rate": 9.57674432416738e-05,
"loss": 0.7213,
"step": 112
},
{
"epoch": 1.0787589498806682,
"grad_norm": 0.33134913444519043,
"learning_rate": 9.563184745403724e-05,
"loss": 0.7673,
"step": 113
},
{
"epoch": 1.0883054892601431,
"grad_norm": 0.2836027145385742,
"learning_rate": 9.549421288593157e-05,
"loss": 0.6618,
"step": 114
},
{
"epoch": 1.097852028639618,
"grad_norm": 0.27743393182754517,
"learning_rate": 9.535454568671704e-05,
"loss": 0.5814,
"step": 115
},
{
"epoch": 1.107398568019093,
"grad_norm": 0.30395206809043884,
"learning_rate": 9.521285209656962e-05,
"loss": 0.7574,
"step": 116
},
{
"epoch": 1.1169451073985681,
"grad_norm": 0.3082098364830017,
"learning_rate": 9.506913844620218e-05,
"loss": 0.6795,
"step": 117
},
{
"epoch": 1.126491646778043,
"grad_norm": 0.34204941987991333,
"learning_rate": 9.492341115658167e-05,
"loss": 0.7607,
"step": 118
},
{
"epoch": 1.136038186157518,
"grad_norm": 0.2831597924232483,
"learning_rate": 9.477567673864216e-05,
"loss": 0.7202,
"step": 119
},
{
"epoch": 1.1455847255369929,
"grad_norm": 0.31007006764411926,
"learning_rate": 9.462594179299406e-05,
"loss": 0.7508,
"step": 120
},
{
"epoch": 1.1551312649164678,
"grad_norm": 0.36465707421302795,
"learning_rate": 9.44742130096291e-05,
"loss": 0.7767,
"step": 121
},
{
"epoch": 1.1646778042959427,
"grad_norm": 0.30296140909194946,
"learning_rate": 9.43204971676215e-05,
"loss": 0.7016,
"step": 122
},
{
"epoch": 1.1742243436754176,
"grad_norm": 0.30213695764541626,
"learning_rate": 9.416480113482504e-05,
"loss": 0.7981,
"step": 123
},
{
"epoch": 1.1837708830548925,
"grad_norm": 0.3844444155693054,
"learning_rate": 9.400713186756625e-05,
"loss": 0.8803,
"step": 124
},
{
"epoch": 1.1933174224343674,
"grad_norm": 0.35131993889808655,
"learning_rate": 9.384749641033359e-05,
"loss": 0.7987,
"step": 125
},
{
"epoch": 1.2028639618138426,
"grad_norm": 0.32640498876571655,
"learning_rate": 9.368590189546269e-05,
"loss": 0.7141,
"step": 126
},
{
"epoch": 1.2124105011933175,
"grad_norm": 0.33494001626968384,
"learning_rate": 9.352235554281774e-05,
"loss": 0.7661,
"step": 127
},
{
"epoch": 1.2219570405727924,
"grad_norm": 0.3069153130054474,
"learning_rate": 9.335686465946887e-05,
"loss": 0.7727,
"step": 128
},
{
"epoch": 1.2315035799522673,
"grad_norm": 0.34017837047576904,
"learning_rate": 9.31894366393657e-05,
"loss": 0.712,
"step": 129
},
{
"epoch": 1.2410501193317423,
"grad_norm": 0.3549365997314453,
"learning_rate": 9.302007896300698e-05,
"loss": 0.7778,
"step": 130
},
{
"epoch": 1.2505966587112172,
"grad_norm": 0.36418265104293823,
"learning_rate": 9.284879919710632e-05,
"loss": 0.7858,
"step": 131
},
{
"epoch": 1.260143198090692,
"grad_norm": 0.35619720816612244,
"learning_rate": 9.267560499425424e-05,
"loss": 0.7254,
"step": 132
},
{
"epoch": 1.269689737470167,
"grad_norm": 0.3609873652458191,
"learning_rate": 9.250050409257611e-05,
"loss": 0.702,
"step": 133
},
{
"epoch": 1.279236276849642,
"grad_norm": 0.33549803495407104,
"learning_rate": 9.232350431538656e-05,
"loss": 0.753,
"step": 134
},
{
"epoch": 1.288782816229117,
"grad_norm": 0.3622001111507416,
"learning_rate": 9.214461357083985e-05,
"loss": 0.7751,
"step": 135
},
{
"epoch": 1.2983293556085918,
"grad_norm": 0.36122390627861023,
"learning_rate": 9.196383985157656e-05,
"loss": 0.7293,
"step": 136
},
{
"epoch": 1.307875894988067,
"grad_norm": 0.3313043713569641,
"learning_rate": 9.17811912343665e-05,
"loss": 0.7719,
"step": 137
},
{
"epoch": 1.3174224343675418,
"grad_norm": 0.420242041349411,
"learning_rate": 9.159667587974785e-05,
"loss": 0.8639,
"step": 138
},
{
"epoch": 1.3269689737470167,
"grad_norm": 0.3835316598415375,
"learning_rate": 9.141030203166256e-05,
"loss": 0.7759,
"step": 139
},
{
"epoch": 1.3365155131264916,
"grad_norm": 0.3210572898387909,
"learning_rate": 9.122207801708802e-05,
"loss": 0.7146,
"step": 140
},
{
"epoch": 1.3460620525059666,
"grad_norm": 0.3777942955493927,
"learning_rate": 9.103201224566498e-05,
"loss": 0.6785,
"step": 141
},
{
"epoch": 1.3556085918854415,
"grad_norm": 0.36900511384010315,
"learning_rate": 9.084011320932189e-05,
"loss": 0.7334,
"step": 142
},
{
"epoch": 1.3651551312649164,
"grad_norm": 0.34692367911338806,
"learning_rate": 9.064638948189538e-05,
"loss": 0.8239,
"step": 143
},
{
"epoch": 1.3747016706443915,
"grad_norm": 0.3698657155036926,
"learning_rate": 9.045084971874738e-05,
"loss": 0.816,
"step": 144
},
{
"epoch": 1.3842482100238662,
"grad_norm": 0.3782055974006653,
"learning_rate": 9.025350265637815e-05,
"loss": 0.7642,
"step": 145
},
{
"epoch": 1.3937947494033414,
"grad_norm": 0.32594063878059387,
"learning_rate": 9.005435711203618e-05,
"loss": 0.6679,
"step": 146
},
{
"epoch": 1.4033412887828163,
"grad_norm": 0.31659746170043945,
"learning_rate": 8.985342198332407e-05,
"loss": 0.6733,
"step": 147
},
{
"epoch": 1.4128878281622912,
"grad_norm": 0.3700142502784729,
"learning_rate": 8.965070624780116e-05,
"loss": 0.7628,
"step": 148
},
{
"epoch": 1.422434367541766,
"grad_norm": 0.36097973585128784,
"learning_rate": 8.944621896258225e-05,
"loss": 0.823,
"step": 149
},
{
"epoch": 1.431980906921241,
"grad_norm": 0.34358176589012146,
"learning_rate": 8.923996926393305e-05,
"loss": 0.8027,
"step": 150
},
{
"epoch": 1.441527446300716,
"grad_norm": 0.3484276831150055,
"learning_rate": 8.903196636686197e-05,
"loss": 0.7654,
"step": 151
},
{
"epoch": 1.4510739856801909,
"grad_norm": 0.3297461271286011,
"learning_rate": 8.882221956470836e-05,
"loss": 0.7309,
"step": 152
},
{
"epoch": 1.460620525059666,
"grad_norm": 0.337780237197876,
"learning_rate": 8.861073822872734e-05,
"loss": 0.6041,
"step": 153
},
{
"epoch": 1.4701670644391407,
"grad_norm": 0.4128996431827545,
"learning_rate": 8.839753180767108e-05,
"loss": 0.755,
"step": 154
},
{
"epoch": 1.4797136038186158,
"grad_norm": 0.39525777101516724,
"learning_rate": 8.818260982736661e-05,
"loss": 0.893,
"step": 155
},
{
"epoch": 1.4892601431980907,
"grad_norm": 0.364863783121109,
"learning_rate": 8.79659818902903e-05,
"loss": 0.7558,
"step": 156
},
{
"epoch": 1.4892601431980907,
"eval_loss": 0.7723400592803955,
"eval_runtime": 12.9849,
"eval_samples_per_second": 13.631,
"eval_steps_per_second": 1.771,
"step": 156
},
{
"epoch": 1.4988066825775657,
"grad_norm": 0.3498753309249878,
"learning_rate": 8.774765767513875e-05,
"loss": 0.6726,
"step": 157
},
{
"epoch": 1.5083532219570406,
"grad_norm": 0.3801003694534302,
"learning_rate": 8.752764693639638e-05,
"loss": 0.8054,
"step": 158
},
{
"epoch": 1.5178997613365155,
"grad_norm": 0.33022618293762207,
"learning_rate": 8.730595950389968e-05,
"loss": 0.6711,
"step": 159
},
{
"epoch": 1.5274463007159904,
"grad_norm": 0.3503524363040924,
"learning_rate": 8.708260528239788e-05,
"loss": 0.695,
"step": 160
},
{
"epoch": 1.5369928400954653,
"grad_norm": 0.3124283254146576,
"learning_rate": 8.685759425111056e-05,
"loss": 0.6797,
"step": 161
},
{
"epoch": 1.5465393794749405,
"grad_norm": 0.3534424602985382,
"learning_rate": 8.663093646328166e-05,
"loss": 0.7181,
"step": 162
},
{
"epoch": 1.5560859188544152,
"grad_norm": 0.40379753708839417,
"learning_rate": 8.640264204573047e-05,
"loss": 0.7923,
"step": 163
},
{
"epoch": 1.5656324582338903,
"grad_norm": 0.4083033502101898,
"learning_rate": 8.617272119839903e-05,
"loss": 0.7997,
"step": 164
},
{
"epoch": 1.575178997613365,
"grad_norm": 0.3316013216972351,
"learning_rate": 8.594118419389647e-05,
"loss": 0.7135,
"step": 165
},
{
"epoch": 1.5847255369928401,
"grad_norm": 0.3352660834789276,
"learning_rate": 8.570804137704003e-05,
"loss": 0.6358,
"step": 166
},
{
"epoch": 1.594272076372315,
"grad_norm": 0.34296879172325134,
"learning_rate": 8.547330316439291e-05,
"loss": 0.683,
"step": 167
},
{
"epoch": 1.60381861575179,
"grad_norm": 0.362801730632782,
"learning_rate": 8.523698004379877e-05,
"loss": 0.7637,
"step": 168
},
{
"epoch": 1.6133651551312649,
"grad_norm": 0.3877812623977661,
"learning_rate": 8.499908257391323e-05,
"loss": 0.6848,
"step": 169
},
{
"epoch": 1.6229116945107398,
"grad_norm": 0.3417890965938568,
"learning_rate": 8.475962138373213e-05,
"loss": 0.6743,
"step": 170
},
{
"epoch": 1.632458233890215,
"grad_norm": 0.3690805435180664,
"learning_rate": 8.451860717211653e-05,
"loss": 0.717,
"step": 171
},
{
"epoch": 1.6420047732696896,
"grad_norm": 0.39610129594802856,
"learning_rate": 8.427605070731482e-05,
"loss": 0.831,
"step": 172
},
{
"epoch": 1.6515513126491648,
"grad_norm": 0.3383614718914032,
"learning_rate": 8.403196282648156e-05,
"loss": 0.6713,
"step": 173
},
{
"epoch": 1.6610978520286395,
"grad_norm": 0.4015936553478241,
"learning_rate": 8.378635443519327e-05,
"loss": 0.8089,
"step": 174
},
{
"epoch": 1.6706443914081146,
"grad_norm": 0.34833744168281555,
"learning_rate": 8.353923650696118e-05,
"loss": 0.6678,
"step": 175
},
{
"epoch": 1.6801909307875895,
"grad_norm": 0.35956260561943054,
"learning_rate": 8.329062008274098e-05,
"loss": 0.751,
"step": 176
},
{
"epoch": 1.6897374701670644,
"grad_norm": 0.35701537132263184,
"learning_rate": 8.304051627043953e-05,
"loss": 0.6618,
"step": 177
},
{
"epoch": 1.6992840095465394,
"grad_norm": 0.2929876446723938,
"learning_rate": 8.278893624441848e-05,
"loss": 0.647,
"step": 178
},
{
"epoch": 1.7088305489260143,
"grad_norm": 0.3412924110889435,
"learning_rate": 8.253589124499512e-05,
"loss": 0.6297,
"step": 179
},
{
"epoch": 1.7183770883054894,
"grad_norm": 0.3739149272441864,
"learning_rate": 8.228139257794012e-05,
"loss": 0.7222,
"step": 180
},
{
"epoch": 1.727923627684964,
"grad_norm": 0.327097624540329,
"learning_rate": 8.202545161397242e-05,
"loss": 0.7254,
"step": 181
},
{
"epoch": 1.7374701670644392,
"grad_norm": 0.4074615240097046,
"learning_rate": 8.176807978825118e-05,
"loss": 0.8142,
"step": 182
},
{
"epoch": 1.747016706443914,
"grad_norm": 0.36452674865722656,
"learning_rate": 8.150928859986488e-05,
"loss": 0.732,
"step": 183
},
{
"epoch": 1.756563245823389,
"grad_norm": 0.41651418805122375,
"learning_rate": 8.124908961131759e-05,
"loss": 0.8458,
"step": 184
},
{
"epoch": 1.766109785202864,
"grad_norm": 0.3859712779521942,
"learning_rate": 8.098749444801224e-05,
"loss": 0.852,
"step": 185
},
{
"epoch": 1.775656324582339,
"grad_norm": 0.3324730694293976,
"learning_rate": 8.072451479773143e-05,
"loss": 0.6524,
"step": 186
},
{
"epoch": 1.7852028639618138,
"grad_norm": 0.35326018929481506,
"learning_rate": 8.0460162410115e-05,
"loss": 0.6616,
"step": 187
},
{
"epoch": 1.7947494033412887,
"grad_norm": 0.3914497494697571,
"learning_rate": 8.019444909613522e-05,
"loss": 0.8023,
"step": 188
},
{
"epoch": 1.8042959427207639,
"grad_norm": 0.35949957370758057,
"learning_rate": 7.992738672756909e-05,
"loss": 0.6108,
"step": 189
},
{
"epoch": 1.8138424821002386,
"grad_norm": 0.366787314414978,
"learning_rate": 7.965898723646776e-05,
"loss": 0.8388,
"step": 190
},
{
"epoch": 1.8233890214797137,
"grad_norm": 0.3832429349422455,
"learning_rate": 7.938926261462366e-05,
"loss": 0.7082,
"step": 191
},
{
"epoch": 1.8329355608591884,
"grad_norm": 0.3898661732673645,
"learning_rate": 7.911822491303452e-05,
"loss": 0.7633,
"step": 192
},
{
"epoch": 1.8424821002386635,
"grad_norm": 0.3598467707633972,
"learning_rate": 7.884588624136504e-05,
"loss": 0.6604,
"step": 193
},
{
"epoch": 1.8520286396181385,
"grad_norm": 0.33222588896751404,
"learning_rate": 7.857225876740584e-05,
"loss": 0.7332,
"step": 194
},
{
"epoch": 1.8615751789976134,
"grad_norm": 0.410185307264328,
"learning_rate": 7.829735471652978e-05,
"loss": 0.8056,
"step": 195
},
{
"epoch": 1.8711217183770883,
"grad_norm": 0.41406041383743286,
"learning_rate": 7.802118637114573e-05,
"loss": 0.8366,
"step": 196
},
{
"epoch": 1.8806682577565632,
"grad_norm": 0.4121192693710327,
"learning_rate": 7.774376607014995e-05,
"loss": 0.8089,
"step": 197
},
{
"epoch": 1.8902147971360383,
"grad_norm": 0.3749670684337616,
"learning_rate": 7.746510620837459e-05,
"loss": 0.7294,
"step": 198
},
{
"epoch": 1.899761336515513,
"grad_norm": 0.42995452880859375,
"learning_rate": 7.718521923603404e-05,
"loss": 0.7776,
"step": 199
},
{
"epoch": 1.9093078758949882,
"grad_norm": 0.36848926544189453,
"learning_rate": 7.690411765816864e-05,
"loss": 0.7549,
"step": 200
},
{
"epoch": 1.9188544152744629,
"grad_norm": 0.40055760741233826,
"learning_rate": 7.662181403408593e-05,
"loss": 0.6901,
"step": 201
},
{
"epoch": 1.928400954653938,
"grad_norm": 0.38324177265167236,
"learning_rate": 7.633832097679958e-05,
"loss": 0.8203,
"step": 202
},
{
"epoch": 1.937947494033413,
"grad_norm": 0.31426626443862915,
"learning_rate": 7.605365115246581e-05,
"loss": 0.7143,
"step": 203
},
{
"epoch": 1.9474940334128878,
"grad_norm": 0.3708311915397644,
"learning_rate": 7.576781727981749e-05,
"loss": 0.7329,
"step": 204
},
{
"epoch": 1.9570405727923628,
"grad_norm": 0.3401066064834595,
"learning_rate": 7.548083212959588e-05,
"loss": 0.6827,
"step": 205
},
{
"epoch": 1.9665871121718377,
"grad_norm": 0.32935282588005066,
"learning_rate": 7.519270852398001e-05,
"loss": 0.6928,
"step": 206
},
{
"epoch": 1.9761336515513126,
"grad_norm": 0.30615073442459106,
"learning_rate": 7.490345933601395e-05,
"loss": 0.6326,
"step": 207
},
{
"epoch": 1.9856801909307875,
"grad_norm": 0.37611639499664307,
"learning_rate": 7.461309748903139e-05,
"loss": 0.7814,
"step": 208
},
{
"epoch": 1.9856801909307875,
"eval_loss": 0.7655227780342102,
"eval_runtime": 12.9629,
"eval_samples_per_second": 13.654,
"eval_steps_per_second": 1.774,
"step": 208
},
{
"epoch": 1.9952267303102627,
"grad_norm": 0.4345564842224121,
"learning_rate": 7.432163595607851e-05,
"loss": 0.8073,
"step": 209
},
{
"epoch": 2.0047732696897373,
"grad_norm": 0.3374365270137787,
"learning_rate": 7.402908775933419e-05,
"loss": 0.6948,
"step": 210
},
{
"epoch": 2.0143198090692125,
"grad_norm": 0.34650278091430664,
"learning_rate": 7.373546596952829e-05,
"loss": 0.6353,
"step": 211
},
{
"epoch": 2.023866348448687,
"grad_norm": 0.40124908089637756,
"learning_rate": 7.344078370535757e-05,
"loss": 0.7312,
"step": 212
},
{
"epoch": 2.0334128878281623,
"grad_norm": 0.3434641361236572,
"learning_rate": 7.314505413289964e-05,
"loss": 0.6418,
"step": 213
},
{
"epoch": 2.042959427207637,
"grad_norm": 0.36048009991645813,
"learning_rate": 7.284829046502468e-05,
"loss": 0.7244,
"step": 214
},
{
"epoch": 2.052505966587112,
"grad_norm": 0.37354418635368347,
"learning_rate": 7.255050596080509e-05,
"loss": 0.7364,
"step": 215
},
{
"epoch": 2.0620525059665873,
"grad_norm": 0.3802691698074341,
"learning_rate": 7.225171392492316e-05,
"loss": 0.7152,
"step": 216
},
{
"epoch": 2.071599045346062,
"grad_norm": 0.3605879545211792,
"learning_rate": 7.195192770707654e-05,
"loss": 0.6506,
"step": 217
},
{
"epoch": 2.081145584725537,
"grad_norm": 0.3946097791194916,
"learning_rate": 7.165116070138183e-05,
"loss": 0.715,
"step": 218
},
{
"epoch": 2.090692124105012,
"grad_norm": 0.35205793380737305,
"learning_rate": 7.134942634577614e-05,
"loss": 0.7169,
"step": 219
},
{
"epoch": 2.100238663484487,
"grad_norm": 0.4253597557544708,
"learning_rate": 7.104673812141675e-05,
"loss": 0.7926,
"step": 220
},
{
"epoch": 2.1097852028639617,
"grad_norm": 0.43062886595726013,
"learning_rate": 7.07431095520787e-05,
"loss": 0.6923,
"step": 221
},
{
"epoch": 2.119331742243437,
"grad_norm": 0.38732174038887024,
"learning_rate": 7.04385542035506e-05,
"loss": 0.6641,
"step": 222
},
{
"epoch": 2.128878281622912,
"grad_norm": 0.37736237049102783,
"learning_rate": 7.013308568302854e-05,
"loss": 0.6602,
"step": 223
},
{
"epoch": 2.1384248210023866,
"grad_norm": 0.3910059928894043,
"learning_rate": 6.982671763850814e-05,
"loss": 0.5671,
"step": 224
},
{
"epoch": 2.1479713603818618,
"grad_norm": 0.4084903597831726,
"learning_rate": 6.951946375817474e-05,
"loss": 0.7144,
"step": 225
},
{
"epoch": 2.1575178997613365,
"grad_norm": 0.46185556054115295,
"learning_rate": 6.921133776979186e-05,
"loss": 0.7373,
"step": 226
},
{
"epoch": 2.1670644391408116,
"grad_norm": 0.38187095522880554,
"learning_rate": 6.890235344008781e-05,
"loss": 0.6753,
"step": 227
},
{
"epoch": 2.1766109785202863,
"grad_norm": 0.43176108598709106,
"learning_rate": 6.859252457414067e-05,
"loss": 0.683,
"step": 228
},
{
"epoch": 2.1861575178997614,
"grad_norm": 0.43367186188697815,
"learning_rate": 6.828186501476144e-05,
"loss": 0.7548,
"step": 229
},
{
"epoch": 2.195704057279236,
"grad_norm": 0.3524869978427887,
"learning_rate": 6.797038864187564e-05,
"loss": 0.6032,
"step": 230
},
{
"epoch": 2.2052505966587113,
"grad_norm": 0.40967708826065063,
"learning_rate": 6.765810937190306e-05,
"loss": 0.6378,
"step": 231
},
{
"epoch": 2.214797136038186,
"grad_norm": 0.3816625773906708,
"learning_rate": 6.734504115713604e-05,
"loss": 0.623,
"step": 232
},
{
"epoch": 2.224343675417661,
"grad_norm": 0.5187081694602966,
"learning_rate": 6.703119798511612e-05,
"loss": 0.838,
"step": 233
},
{
"epoch": 2.2338902147971362,
"grad_norm": 0.4348510503768921,
"learning_rate": 6.67165938780091e-05,
"loss": 0.7311,
"step": 234
},
{
"epoch": 2.243436754176611,
"grad_norm": 0.3842396140098572,
"learning_rate": 6.640124289197845e-05,
"loss": 0.6527,
"step": 235
},
{
"epoch": 2.252983293556086,
"grad_norm": 0.4316810965538025,
"learning_rate": 6.608515911655744e-05,
"loss": 0.7794,
"step": 236
},
{
"epoch": 2.2625298329355608,
"grad_norm": 0.3772410452365875,
"learning_rate": 6.576835667401953e-05,
"loss": 0.6369,
"step": 237
},
{
"epoch": 2.272076372315036,
"grad_norm": 0.4141497015953064,
"learning_rate": 6.545084971874738e-05,
"loss": 0.6993,
"step": 238
},
{
"epoch": 2.2816229116945106,
"grad_norm": 0.41354265809059143,
"learning_rate": 6.513265243660057e-05,
"loss": 0.6389,
"step": 239
},
{
"epoch": 2.2911694510739857,
"grad_norm": 0.3867475986480713,
"learning_rate": 6.481377904428171e-05,
"loss": 0.6306,
"step": 240
},
{
"epoch": 2.300715990453461,
"grad_norm": 0.40222805738449097,
"learning_rate": 6.449424378870123e-05,
"loss": 0.6857,
"step": 241
},
{
"epoch": 2.3102625298329356,
"grad_norm": 0.4215107858181,
"learning_rate": 6.41740609463409e-05,
"loss": 0.7309,
"step": 242
},
{
"epoch": 2.3198090692124103,
"grad_norm": 0.4149893820285797,
"learning_rate": 6.385324482261597e-05,
"loss": 0.6562,
"step": 243
},
{
"epoch": 2.3293556085918854,
"grad_norm": 0.4119661748409271,
"learning_rate": 6.353180975123595e-05,
"loss": 0.7544,
"step": 244
},
{
"epoch": 2.3389021479713605,
"grad_norm": 0.32324427366256714,
"learning_rate": 6.320977009356431e-05,
"loss": 0.5994,
"step": 245
},
{
"epoch": 2.3484486873508352,
"grad_norm": 0.4508344531059265,
"learning_rate": 6.288714023797672e-05,
"loss": 0.7047,
"step": 246
},
{
"epoch": 2.3579952267303104,
"grad_norm": 0.3957417905330658,
"learning_rate": 6.256393459921824e-05,
"loss": 0.6364,
"step": 247
},
{
"epoch": 2.367541766109785,
"grad_norm": 0.4180348813533783,
"learning_rate": 6.224016761775933e-05,
"loss": 0.6511,
"step": 248
},
{
"epoch": 2.37708830548926,
"grad_norm": 0.46107926964759827,
"learning_rate": 6.191585375915055e-05,
"loss": 0.6736,
"step": 249
},
{
"epoch": 2.386634844868735,
"grad_norm": 0.43949881196022034,
"learning_rate": 6.159100751337642e-05,
"loss": 0.6639,
"step": 250
},
{
"epoch": 2.39618138424821,
"grad_norm": 0.4427139461040497,
"learning_rate": 6.126564339420784e-05,
"loss": 0.6581,
"step": 251
},
{
"epoch": 2.405727923627685,
"grad_norm": 0.4241901636123657,
"learning_rate": 6.093977593855375e-05,
"loss": 0.6738,
"step": 252
},
{
"epoch": 2.41527446300716,
"grad_norm": 0.41828441619873047,
"learning_rate": 6.061341970581165e-05,
"loss": 0.685,
"step": 253
},
{
"epoch": 2.424821002386635,
"grad_norm": 0.4712134599685669,
"learning_rate": 6.028658927721697e-05,
"loss": 0.6853,
"step": 254
},
{
"epoch": 2.4343675417661097,
"grad_norm": 0.47678568959236145,
"learning_rate": 5.99592992551918e-05,
"loss": 0.673,
"step": 255
},
{
"epoch": 2.443914081145585,
"grad_norm": 0.46318480372428894,
"learning_rate": 5.9631564262692274e-05,
"loss": 0.688,
"step": 256
},
{
"epoch": 2.4534606205250595,
"grad_norm": 0.4256531000137329,
"learning_rate": 5.930339894255532e-05,
"loss": 0.6521,
"step": 257
},
{
"epoch": 2.4630071599045347,
"grad_norm": 0.39636510610580444,
"learning_rate": 5.897481795684446e-05,
"loss": 0.6713,
"step": 258
},
{
"epoch": 2.4725536992840094,
"grad_norm": 0.497344434261322,
"learning_rate": 5.8645835986194676e-05,
"loss": 0.7745,
"step": 259
},
{
"epoch": 2.4821002386634845,
"grad_norm": 0.4814034104347229,
"learning_rate": 5.831646772915651e-05,
"loss": 0.6849,
"step": 260
},
{
"epoch": 2.4821002386634845,
"eval_loss": 0.7669724225997925,
"eval_runtime": 13.0227,
"eval_samples_per_second": 13.592,
"eval_steps_per_second": 1.766,
"step": 260
},
{
"epoch": 2.491646778042959,
"grad_norm": 0.4661683142185211,
"learning_rate": 5.7986727901539374e-05,
"loss": 0.7284,
"step": 261
},
{
"epoch": 2.5011933174224343,
"grad_norm": 0.4617115557193756,
"learning_rate": 5.7656631235754014e-05,
"loss": 0.7026,
"step": 262
},
{
"epoch": 2.5107398568019095,
"grad_norm": 0.48742860555648804,
"learning_rate": 5.732619248015434e-05,
"loss": 0.6964,
"step": 263
},
{
"epoch": 2.520286396181384,
"grad_norm": 0.4442538917064667,
"learning_rate": 5.699542639837844e-05,
"loss": 0.6804,
"step": 264
},
{
"epoch": 2.5298329355608593,
"grad_norm": 0.49106183648109436,
"learning_rate": 5.666434776868895e-05,
"loss": 0.6865,
"step": 265
},
{
"epoch": 2.539379474940334,
"grad_norm": 0.4551374316215515,
"learning_rate": 5.633297138331285e-05,
"loss": 0.7748,
"step": 266
},
{
"epoch": 2.548926014319809,
"grad_norm": 0.413185715675354,
"learning_rate": 5.6001312047780486e-05,
"loss": 0.5999,
"step": 267
},
{
"epoch": 2.558472553699284,
"grad_norm": 0.44866228103637695,
"learning_rate": 5.5669384580264104e-05,
"loss": 0.6102,
"step": 268
},
{
"epoch": 2.568019093078759,
"grad_norm": 0.45962679386138916,
"learning_rate": 5.533720381091582e-05,
"loss": 0.7214,
"step": 269
},
{
"epoch": 2.577565632458234,
"grad_norm": 0.45130231976509094,
"learning_rate": 5.5004784581204927e-05,
"loss": 0.7123,
"step": 270
},
{
"epoch": 2.587112171837709,
"grad_norm": 0.44015398621559143,
"learning_rate": 5.467214174325493e-05,
"loss": 0.6339,
"step": 271
},
{
"epoch": 2.5966587112171835,
"grad_norm": 0.45853525400161743,
"learning_rate": 5.4339290159179875e-05,
"loss": 0.7224,
"step": 272
},
{
"epoch": 2.6062052505966586,
"grad_norm": 0.45820721983909607,
"learning_rate": 5.400624470042037e-05,
"loss": 0.638,
"step": 273
},
{
"epoch": 2.615751789976134,
"grad_norm": 0.4199580252170563,
"learning_rate": 5.367302024707911e-05,
"loss": 0.6548,
"step": 274
},
{
"epoch": 2.6252983293556085,
"grad_norm": 0.5202221870422363,
"learning_rate": 5.3339631687256084e-05,
"loss": 0.7328,
"step": 275
},
{
"epoch": 2.6348448687350836,
"grad_norm": 0.41012313961982727,
"learning_rate": 5.300609391638336e-05,
"loss": 0.6315,
"step": 276
},
{
"epoch": 2.6443914081145583,
"grad_norm": 0.5391274094581604,
"learning_rate": 5.267242183655961e-05,
"loss": 0.7476,
"step": 277
},
{
"epoch": 2.6539379474940334,
"grad_norm": 0.4461412727832794,
"learning_rate": 5.233863035588426e-05,
"loss": 0.6223,
"step": 278
},
{
"epoch": 2.663484486873508,
"grad_norm": 0.4335575997829437,
"learning_rate": 5.200473438779146e-05,
"loss": 0.6816,
"step": 279
},
{
"epoch": 2.6730310262529833,
"grad_norm": 0.44996485114097595,
"learning_rate": 5.167074885038373e-05,
"loss": 0.6554,
"step": 280
},
{
"epoch": 2.6825775656324584,
"grad_norm": 0.4875689744949341,
"learning_rate": 5.133668866576544e-05,
"loss": 0.7889,
"step": 281
},
{
"epoch": 2.692124105011933,
"grad_norm": 0.45980238914489746,
"learning_rate": 5.1002568759376134e-05,
"loss": 0.6615,
"step": 282
},
{
"epoch": 2.7016706443914082,
"grad_norm": 0.43575319647789,
"learning_rate": 5.0668404059323635e-05,
"loss": 0.6595,
"step": 283
},
{
"epoch": 2.711217183770883,
"grad_norm": 0.438365638256073,
"learning_rate": 5.033420949571712e-05,
"loss": 0.6354,
"step": 284
},
{
"epoch": 2.720763723150358,
"grad_norm": 0.4110269546508789,
"learning_rate": 5e-05,
"loss": 0.6814,
"step": 285
},
{
"epoch": 2.7303102625298328,
"grad_norm": 0.4653089642524719,
"learning_rate": 4.96657905042829e-05,
"loss": 0.7375,
"step": 286
},
{
"epoch": 2.739856801909308,
"grad_norm": 0.45703983306884766,
"learning_rate": 4.933159594067637e-05,
"loss": 0.7464,
"step": 287
},
{
"epoch": 2.749403341288783,
"grad_norm": 0.40393638610839844,
"learning_rate": 4.899743124062388e-05,
"loss": 0.6091,
"step": 288
},
{
"epoch": 2.7589498806682577,
"grad_norm": 0.468176007270813,
"learning_rate": 4.866331133423456e-05,
"loss": 0.7052,
"step": 289
},
{
"epoch": 2.7684964200477324,
"grad_norm": 0.4771003723144531,
"learning_rate": 4.832925114961629e-05,
"loss": 0.7071,
"step": 290
},
{
"epoch": 2.7780429594272076,
"grad_norm": 0.4809763729572296,
"learning_rate": 4.799526561220855e-05,
"loss": 0.7289,
"step": 291
},
{
"epoch": 2.7875894988066827,
"grad_norm": 0.46262210607528687,
"learning_rate": 4.7661369644115755e-05,
"loss": 0.6031,
"step": 292
},
{
"epoch": 2.7971360381861574,
"grad_norm": 0.48082613945007324,
"learning_rate": 4.73275781634404e-05,
"loss": 0.7008,
"step": 293
},
{
"epoch": 2.8066825775656326,
"grad_norm": 0.5203619003295898,
"learning_rate": 4.6993906083616654e-05,
"loss": 0.6972,
"step": 294
},
{
"epoch": 2.8162291169451072,
"grad_norm": 0.5060526728630066,
"learning_rate": 4.666036831274392e-05,
"loss": 0.844,
"step": 295
},
{
"epoch": 2.8257756563245824,
"grad_norm": 0.5173178911209106,
"learning_rate": 4.63269797529209e-05,
"loss": 0.711,
"step": 296
},
{
"epoch": 2.835322195704057,
"grad_norm": 0.40481331944465637,
"learning_rate": 4.5993755299579626e-05,
"loss": 0.6849,
"step": 297
},
{
"epoch": 2.844868735083532,
"grad_norm": 0.5248334407806396,
"learning_rate": 4.566070984082013e-05,
"loss": 0.8126,
"step": 298
},
{
"epoch": 2.8544152744630074,
"grad_norm": 0.469752699136734,
"learning_rate": 4.5327858256745073e-05,
"loss": 0.7265,
"step": 299
},
{
"epoch": 2.863961813842482,
"grad_norm": 0.4676867425441742,
"learning_rate": 4.4995215418795085e-05,
"loss": 0.7859,
"step": 300
},
{
"epoch": 2.873508353221957,
"grad_norm": 0.43738093972206116,
"learning_rate": 4.466279618908419e-05,
"loss": 0.5927,
"step": 301
},
{
"epoch": 2.883054892601432,
"grad_norm": 0.4657769799232483,
"learning_rate": 4.433061541973591e-05,
"loss": 0.6947,
"step": 302
},
{
"epoch": 2.892601431980907,
"grad_norm": 0.44069626927375793,
"learning_rate": 4.399868795221951e-05,
"loss": 0.6984,
"step": 303
},
{
"epoch": 2.9021479713603817,
"grad_norm": 0.3833423852920532,
"learning_rate": 4.366702861668716e-05,
"loss": 0.5624,
"step": 304
},
{
"epoch": 2.911694510739857,
"grad_norm": 0.42630308866500854,
"learning_rate": 4.333565223131107e-05,
"loss": 0.5505,
"step": 305
},
{
"epoch": 2.921241050119332,
"grad_norm": 0.46541622281074524,
"learning_rate": 4.300457360162158e-05,
"loss": 0.7071,
"step": 306
},
{
"epoch": 2.9307875894988067,
"grad_norm": 0.41140827536582947,
"learning_rate": 4.267380751984568e-05,
"loss": 0.5786,
"step": 307
},
{
"epoch": 2.9403341288782814,
"grad_norm": 0.42655548453330994,
"learning_rate": 4.2343368764246e-05,
"loss": 0.5977,
"step": 308
},
{
"epoch": 2.9498806682577565,
"grad_norm": 0.45397818088531494,
"learning_rate": 4.201327209846065e-05,
"loss": 0.6592,
"step": 309
},
{
"epoch": 2.9594272076372317,
"grad_norm": 0.5159749984741211,
"learning_rate": 4.1683532270843504e-05,
"loss": 0.7322,
"step": 310
},
{
"epoch": 2.9689737470167064,
"grad_norm": 0.4518764615058899,
"learning_rate": 4.135416401380535e-05,
"loss": 0.796,
"step": 311
},
{
"epoch": 2.9785202863961815,
"grad_norm": 0.47323092818260193,
"learning_rate": 4.102518204315555e-05,
"loss": 0.7456,
"step": 312
},
{
"epoch": 2.9785202863961815,
"eval_loss": 0.7645925879478455,
"eval_runtime": 13.0059,
"eval_samples_per_second": 13.609,
"eval_steps_per_second": 1.768,
"step": 312
},
{
"epoch": 2.988066825775656,
"grad_norm": 0.49233347177505493,
"learning_rate": 4.069660105744469e-05,
"loss": 0.6878,
"step": 313
},
{
"epoch": 2.9976133651551313,
"grad_norm": 0.4664769470691681,
"learning_rate": 4.036843573730774e-05,
"loss": 0.7379,
"step": 314
},
{
"epoch": 3.007159904534606,
"grad_norm": 0.39737001061439514,
"learning_rate": 4.0040700744808204e-05,
"loss": 0.6706,
"step": 315
},
{
"epoch": 3.016706443914081,
"grad_norm": 0.46519792079925537,
"learning_rate": 3.971341072278302e-05,
"loss": 0.6626,
"step": 316
},
{
"epoch": 3.0262529832935563,
"grad_norm": 0.4189288914203644,
"learning_rate": 3.938658029418837e-05,
"loss": 0.6465,
"step": 317
},
{
"epoch": 3.035799522673031,
"grad_norm": 0.5005217790603638,
"learning_rate": 3.9060224061446245e-05,
"loss": 0.6552,
"step": 318
},
{
"epoch": 3.045346062052506,
"grad_norm": 0.4551185965538025,
"learning_rate": 3.873435660579217e-05,
"loss": 0.5767,
"step": 319
},
{
"epoch": 3.054892601431981,
"grad_norm": 0.44225507974624634,
"learning_rate": 3.840899248662358e-05,
"loss": 0.6387,
"step": 320
},
{
"epoch": 3.064439140811456,
"grad_norm": 0.5424272418022156,
"learning_rate": 3.808414624084946e-05,
"loss": 0.6887,
"step": 321
},
{
"epoch": 3.0739856801909307,
"grad_norm": 0.4525783658027649,
"learning_rate": 3.77598323822407e-05,
"loss": 0.6612,
"step": 322
},
{
"epoch": 3.083532219570406,
"grad_norm": 0.4461750388145447,
"learning_rate": 3.7436065400781774e-05,
"loss": 0.6418,
"step": 323
},
{
"epoch": 3.0930787589498805,
"grad_norm": 0.524474561214447,
"learning_rate": 3.7112859762023314e-05,
"loss": 0.6662,
"step": 324
},
{
"epoch": 3.1026252983293556,
"grad_norm": 0.5077919960021973,
"learning_rate": 3.6790229906435705e-05,
"loss": 0.6368,
"step": 325
},
{
"epoch": 3.1121718377088303,
"grad_norm": 0.5111605525016785,
"learning_rate": 3.646819024876406e-05,
"loss": 0.6471,
"step": 326
},
{
"epoch": 3.1217183770883055,
"grad_norm": 0.47017544507980347,
"learning_rate": 3.614675517738405e-05,
"loss": 0.6412,
"step": 327
},
{
"epoch": 3.1312649164677806,
"grad_norm": 0.5017347931861877,
"learning_rate": 3.5825939053659116e-05,
"loss": 0.6719,
"step": 328
},
{
"epoch": 3.1408114558472553,
"grad_norm": 0.4742693305015564,
"learning_rate": 3.550575621129878e-05,
"loss": 0.6389,
"step": 329
},
{
"epoch": 3.1503579952267304,
"grad_norm": 0.48239150643348694,
"learning_rate": 3.5186220955718306e-05,
"loss": 0.6071,
"step": 330
},
{
"epoch": 3.159904534606205,
"grad_norm": 0.461012065410614,
"learning_rate": 3.486734756339943e-05,
"loss": 0.5409,
"step": 331
},
{
"epoch": 3.1694510739856803,
"grad_norm": 0.5519300103187561,
"learning_rate": 3.4549150281252636e-05,
"loss": 0.6617,
"step": 332
},
{
"epoch": 3.178997613365155,
"grad_norm": 0.4272817373275757,
"learning_rate": 3.423164332598049e-05,
"loss": 0.4943,
"step": 333
},
{
"epoch": 3.18854415274463,
"grad_norm": 0.5129975080490112,
"learning_rate": 3.391484088344257e-05,
"loss": 0.6569,
"step": 334
},
{
"epoch": 3.1980906921241052,
"grad_norm": 0.5243316292762756,
"learning_rate": 3.3598757108021546e-05,
"loss": 0.6659,
"step": 335
},
{
"epoch": 3.20763723150358,
"grad_norm": 0.4862256348133087,
"learning_rate": 3.3283406121990915e-05,
"loss": 0.6414,
"step": 336
},
{
"epoch": 3.217183770883055,
"grad_norm": 0.5197688937187195,
"learning_rate": 3.2968802014883874e-05,
"loss": 0.6751,
"step": 337
},
{
"epoch": 3.2267303102625298,
"grad_norm": 0.5474854111671448,
"learning_rate": 3.265495884286397e-05,
"loss": 0.7531,
"step": 338
},
{
"epoch": 3.236276849642005,
"grad_norm": 0.47088584303855896,
"learning_rate": 3.234189062809695e-05,
"loss": 0.6319,
"step": 339
},
{
"epoch": 3.2458233890214796,
"grad_norm": 0.4922322630882263,
"learning_rate": 3.202961135812437e-05,
"loss": 0.6297,
"step": 340
},
{
"epoch": 3.2553699284009547,
"grad_norm": 0.5747278332710266,
"learning_rate": 3.1718134985238567e-05,
"loss": 0.6269,
"step": 341
},
{
"epoch": 3.2649164677804294,
"grad_norm": 0.5492677092552185,
"learning_rate": 3.1407475425859345e-05,
"loss": 0.7295,
"step": 342
},
{
"epoch": 3.2744630071599046,
"grad_norm": 0.487875759601593,
"learning_rate": 3.109764655991221e-05,
"loss": 0.6013,
"step": 343
},
{
"epoch": 3.2840095465393793,
"grad_norm": 0.5148051381111145,
"learning_rate": 3.078866223020815e-05,
"loss": 0.7156,
"step": 344
},
{
"epoch": 3.2935560859188544,
"grad_norm": 0.4531029760837555,
"learning_rate": 3.0480536241825263e-05,
"loss": 0.581,
"step": 345
},
{
"epoch": 3.3031026252983295,
"grad_norm": 0.4673330783843994,
"learning_rate": 3.0173282361491868e-05,
"loss": 0.5355,
"step": 346
},
{
"epoch": 3.3126491646778042,
"grad_norm": 0.4933515191078186,
"learning_rate": 2.9866914316971477e-05,
"loss": 0.5584,
"step": 347
},
{
"epoch": 3.3221957040572794,
"grad_norm": 0.5033287405967712,
"learning_rate": 2.9561445796449415e-05,
"loss": 0.6598,
"step": 348
},
{
"epoch": 3.331742243436754,
"grad_norm": 0.5585161447525024,
"learning_rate": 2.925689044792132e-05,
"loss": 0.7268,
"step": 349
},
{
"epoch": 3.341288782816229,
"grad_norm": 0.4780008792877197,
"learning_rate": 2.895326187858326e-05,
"loss": 0.6363,
"step": 350
},
{
"epoch": 3.350835322195704,
"grad_norm": 0.5018965005874634,
"learning_rate": 2.865057365422386e-05,
"loss": 0.6098,
"step": 351
},
{
"epoch": 3.360381861575179,
"grad_norm": 0.4768741726875305,
"learning_rate": 2.8348839298618178e-05,
"loss": 0.6504,
"step": 352
},
{
"epoch": 3.369928400954654,
"grad_norm": 0.47006916999816895,
"learning_rate": 2.8048072292923465e-05,
"loss": 0.7122,
"step": 353
},
{
"epoch": 3.379474940334129,
"grad_norm": 0.5839781165122986,
"learning_rate": 2.7748286075076835e-05,
"loss": 0.688,
"step": 354
},
{
"epoch": 3.389021479713604,
"grad_norm": 0.48642516136169434,
"learning_rate": 2.74494940391949e-05,
"loss": 0.5968,
"step": 355
},
{
"epoch": 3.3985680190930787,
"grad_norm": 0.5563519597053528,
"learning_rate": 2.7151709534975324e-05,
"loss": 0.6904,
"step": 356
},
{
"epoch": 3.408114558472554,
"grad_norm": 0.489467591047287,
"learning_rate": 2.685494586710038e-05,
"loss": 0.6965,
"step": 357
},
{
"epoch": 3.4176610978520285,
"grad_norm": 0.5098894238471985,
"learning_rate": 2.655921629464245e-05,
"loss": 0.5593,
"step": 358
},
{
"epoch": 3.4272076372315037,
"grad_norm": 0.5460035800933838,
"learning_rate": 2.626453403047172e-05,
"loss": 0.7281,
"step": 359
},
{
"epoch": 3.4367541766109784,
"grad_norm": 0.5174149870872498,
"learning_rate": 2.5970912240665813e-05,
"loss": 0.6505,
"step": 360
},
{
"epoch": 3.4463007159904535,
"grad_norm": 0.5922746658325195,
"learning_rate": 2.5678364043921504e-05,
"loss": 0.7072,
"step": 361
},
{
"epoch": 3.455847255369928,
"grad_norm": 0.501711368560791,
"learning_rate": 2.5386902510968625e-05,
"loss": 0.7021,
"step": 362
},
{
"epoch": 3.4653937947494033,
"grad_norm": 0.551275372505188,
"learning_rate": 2.5096540663986067e-05,
"loss": 0.6635,
"step": 363
},
{
"epoch": 3.4749403341288785,
"grad_norm": 0.510543942451477,
"learning_rate": 2.4807291476019995e-05,
"loss": 0.6037,
"step": 364
},
{
"epoch": 3.4749403341288785,
"eval_loss": 0.7706080079078674,
"eval_runtime": 12.9841,
"eval_samples_per_second": 13.632,
"eval_steps_per_second": 1.771,
"step": 364
},
{
"epoch": 3.484486873508353,
"grad_norm": 0.5204513072967529,
"learning_rate": 2.4519167870404125e-05,
"loss": 0.6473,
"step": 365
},
{
"epoch": 3.4940334128878283,
"grad_norm": 0.5144868493080139,
"learning_rate": 2.4232182720182522e-05,
"loss": 0.5738,
"step": 366
},
{
"epoch": 3.503579952267303,
"grad_norm": 0.4935360848903656,
"learning_rate": 2.3946348847534194e-05,
"loss": 0.6041,
"step": 367
},
{
"epoch": 3.513126491646778,
"grad_norm": 0.5450278520584106,
"learning_rate": 2.3661679023200422e-05,
"loss": 0.6976,
"step": 368
},
{
"epoch": 3.522673031026253,
"grad_norm": 0.5161455869674683,
"learning_rate": 2.337818596591408e-05,
"loss": 0.7566,
"step": 369
},
{
"epoch": 3.532219570405728,
"grad_norm": 0.47960028052330017,
"learning_rate": 2.3095882341831372e-05,
"loss": 0.6354,
"step": 370
},
{
"epoch": 3.541766109785203,
"grad_norm": 0.46854427456855774,
"learning_rate": 2.281478076396596e-05,
"loss": 0.6636,
"step": 371
},
{
"epoch": 3.551312649164678,
"grad_norm": 0.49230995774269104,
"learning_rate": 2.2534893791625406e-05,
"loss": 0.5336,
"step": 372
},
{
"epoch": 3.5608591885441525,
"grad_norm": 0.475827693939209,
"learning_rate": 2.2256233929850044e-05,
"loss": 0.6397,
"step": 373
},
{
"epoch": 3.5704057279236276,
"grad_norm": 0.5518814921379089,
"learning_rate": 2.197881362885426e-05,
"loss": 0.6751,
"step": 374
},
{
"epoch": 3.579952267303103,
"grad_norm": 0.5297905802726746,
"learning_rate": 2.1702645283470236e-05,
"loss": 0.6379,
"step": 375
},
{
"epoch": 3.5894988066825775,
"grad_norm": 0.5582184195518494,
"learning_rate": 2.1427741232594184e-05,
"loss": 0.6456,
"step": 376
},
{
"epoch": 3.5990453460620526,
"grad_norm": 0.6382442712783813,
"learning_rate": 2.115411375863497e-05,
"loss": 0.7669,
"step": 377
},
{
"epoch": 3.6085918854415273,
"grad_norm": 0.5114546418190002,
"learning_rate": 2.0881775086965495e-05,
"loss": 0.6209,
"step": 378
},
{
"epoch": 3.6181384248210025,
"grad_norm": 0.5362977981567383,
"learning_rate": 2.061073738537635e-05,
"loss": 0.5689,
"step": 379
},
{
"epoch": 3.627684964200477,
"grad_norm": 0.5577702522277832,
"learning_rate": 2.0341012763532243e-05,
"loss": 0.68,
"step": 380
},
{
"epoch": 3.6372315035799523,
"grad_norm": 0.4712672233581543,
"learning_rate": 2.0072613272430923e-05,
"loss": 0.6012,
"step": 381
},
{
"epoch": 3.6467780429594274,
"grad_norm": 0.5355048179626465,
"learning_rate": 1.9805550903864774e-05,
"loss": 0.6409,
"step": 382
},
{
"epoch": 3.656324582338902,
"grad_norm": 0.5459690093994141,
"learning_rate": 1.9539837589885024e-05,
"loss": 0.615,
"step": 383
},
{
"epoch": 3.665871121718377,
"grad_norm": 0.5169499516487122,
"learning_rate": 1.9275485202268572e-05,
"loss": 0.5431,
"step": 384
},
{
"epoch": 3.675417661097852,
"grad_norm": 0.4908435344696045,
"learning_rate": 1.9012505551987765e-05,
"loss": 0.5995,
"step": 385
},
{
"epoch": 3.684964200477327,
"grad_norm": 0.5475894808769226,
"learning_rate": 1.875091038868243e-05,
"loss": 0.5709,
"step": 386
},
{
"epoch": 3.694510739856802,
"grad_norm": 0.4647465944290161,
"learning_rate": 1.8490711400135118e-05,
"loss": 0.6245,
"step": 387
},
{
"epoch": 3.704057279236277,
"grad_norm": 0.5150587558746338,
"learning_rate": 1.823192021174882e-05,
"loss": 0.6999,
"step": 388
},
{
"epoch": 3.713603818615752,
"grad_norm": 0.5281825661659241,
"learning_rate": 1.7974548386027585e-05,
"loss": 0.6953,
"step": 389
},
{
"epoch": 3.7231503579952268,
"grad_norm": 0.5862619876861572,
"learning_rate": 1.771860742205988e-05,
"loss": 0.7626,
"step": 390
},
{
"epoch": 3.7326968973747015,
"grad_norm": 0.5895105600357056,
"learning_rate": 1.746410875500488e-05,
"loss": 0.7215,
"step": 391
},
{
"epoch": 3.7422434367541766,
"grad_norm": 0.5397690534591675,
"learning_rate": 1.7211063755581525e-05,
"loss": 0.6534,
"step": 392
},
{
"epoch": 3.7517899761336517,
"grad_norm": 0.5140255689620972,
"learning_rate": 1.695948372956047e-05,
"loss": 0.6316,
"step": 393
},
{
"epoch": 3.7613365155131264,
"grad_norm": 0.555587887763977,
"learning_rate": 1.6709379917259028e-05,
"loss": 0.5928,
"step": 394
},
{
"epoch": 3.7708830548926016,
"grad_norm": 0.5376729965209961,
"learning_rate": 1.646076349303884e-05,
"loss": 0.7066,
"step": 395
},
{
"epoch": 3.7804295942720763,
"grad_norm": 0.5037944912910461,
"learning_rate": 1.621364556480675e-05,
"loss": 0.7138,
"step": 396
},
{
"epoch": 3.7899761336515514,
"grad_norm": 0.5480453372001648,
"learning_rate": 1.596803717351845e-05,
"loss": 0.6835,
"step": 397
},
{
"epoch": 3.799522673031026,
"grad_norm": 0.5427130460739136,
"learning_rate": 1.5723949292685192e-05,
"loss": 0.645,
"step": 398
},
{
"epoch": 3.8090692124105012,
"grad_norm": 0.5332582592964172,
"learning_rate": 1.548139282788349e-05,
"loss": 0.6536,
"step": 399
},
{
"epoch": 3.8186157517899764,
"grad_norm": 0.45773863792419434,
"learning_rate": 1.5240378616267886e-05,
"loss": 0.4132,
"step": 400
},
{
"epoch": 3.828162291169451,
"grad_norm": 0.5269774794578552,
"learning_rate": 1.5000917426086768e-05,
"loss": 0.709,
"step": 401
},
{
"epoch": 3.8377088305489258,
"grad_norm": 0.5353370904922485,
"learning_rate": 1.4763019956201252e-05,
"loss": 0.6245,
"step": 402
},
{
"epoch": 3.847255369928401,
"grad_norm": 0.5651338696479797,
"learning_rate": 1.452669683560709e-05,
"loss": 0.7354,
"step": 403
},
{
"epoch": 3.856801909307876,
"grad_norm": 0.5225863456726074,
"learning_rate": 1.4291958622959973e-05,
"loss": 0.6269,
"step": 404
},
{
"epoch": 3.8663484486873507,
"grad_norm": 0.4741249680519104,
"learning_rate": 1.4058815806103542e-05,
"loss": 0.6367,
"step": 405
},
{
"epoch": 3.875894988066826,
"grad_norm": 0.4927036166191101,
"learning_rate": 1.3827278801600979e-05,
"loss": 0.5917,
"step": 406
},
{
"epoch": 3.8854415274463006,
"grad_norm": 0.528948187828064,
"learning_rate": 1.3597357954269535e-05,
"loss": 0.6245,
"step": 407
},
{
"epoch": 3.8949880668257757,
"grad_norm": 0.499723881483078,
"learning_rate": 1.3369063536718345e-05,
"loss": 0.6612,
"step": 408
},
{
"epoch": 3.9045346062052504,
"grad_norm": 0.5520623326301575,
"learning_rate": 1.3142405748889457e-05,
"loss": 0.6276,
"step": 409
},
{
"epoch": 3.9140811455847255,
"grad_norm": 0.519705593585968,
"learning_rate": 1.2917394717602121e-05,
"loss": 0.6639,
"step": 410
},
{
"epoch": 3.9236276849642007,
"grad_norm": 0.5106028318405151,
"learning_rate": 1.2694040496100318e-05,
"loss": 0.6402,
"step": 411
},
{
"epoch": 3.9331742243436754,
"grad_norm": 0.5074647665023804,
"learning_rate": 1.2472353063603625e-05,
"loss": 0.6424,
"step": 412
},
{
"epoch": 3.9427207637231505,
"grad_norm": 0.594458281993866,
"learning_rate": 1.2252342324861272e-05,
"loss": 0.6677,
"step": 413
},
{
"epoch": 3.952267303102625,
"grad_norm": 0.5148235559463501,
"learning_rate": 1.2034018109709716e-05,
"loss": 0.5508,
"step": 414
},
{
"epoch": 3.9618138424821003,
"grad_norm": 0.5786213278770447,
"learning_rate": 1.1817390172633403e-05,
"loss": 0.5669,
"step": 415
},
{
"epoch": 3.971360381861575,
"grad_norm": 0.5178529024124146,
"learning_rate": 1.1602468192328936e-05,
"loss": 0.6335,
"step": 416
},
{
"epoch": 3.971360381861575,
"eval_loss": 0.7703084945678711,
"eval_runtime": 13.1275,
"eval_samples_per_second": 13.483,
"eval_steps_per_second": 1.752,
"step": 416
},
{
"epoch": 3.98090692124105,
"grad_norm": 0.6362994313240051,
"learning_rate": 1.1389261771272663e-05,
"loss": 0.667,
"step": 417
},
{
"epoch": 3.9904534606205253,
"grad_norm": 0.46682727336883545,
"learning_rate": 1.117778043529164e-05,
"loss": 0.5675,
"step": 418
},
{
"epoch": 4.0,
"grad_norm": 0.5248848795890808,
"learning_rate": 1.096803363313803e-05,
"loss": 0.6431,
"step": 419
},
{
"epoch": 4.009546539379475,
"grad_norm": 0.4764450192451477,
"learning_rate": 1.0760030736066951e-05,
"loss": 0.5765,
"step": 420
},
{
"epoch": 4.01909307875895,
"grad_norm": 0.5138590335845947,
"learning_rate": 1.055378103741777e-05,
"loss": 0.5938,
"step": 421
},
{
"epoch": 4.028639618138425,
"grad_norm": 0.560528039932251,
"learning_rate": 1.034929375219884e-05,
"loss": 0.7372,
"step": 422
},
{
"epoch": 4.0381861575179,
"grad_norm": 0.5144191384315491,
"learning_rate": 1.0146578016675934e-05,
"loss": 0.6935,
"step": 423
},
{
"epoch": 4.047732696897374,
"grad_norm": 0.4730660915374756,
"learning_rate": 9.945642887963841e-06,
"loss": 0.5619,
"step": 424
},
{
"epoch": 4.05727923627685,
"grad_norm": 0.5379086136817932,
"learning_rate": 9.746497343621857e-06,
"loss": 0.5955,
"step": 425
},
{
"epoch": 4.066825775656325,
"grad_norm": 0.46496352553367615,
"learning_rate": 9.549150281252633e-06,
"loss": 0.5427,
"step": 426
},
{
"epoch": 4.076372315035799,
"grad_norm": 0.4938548803329468,
"learning_rate": 9.353610518104611e-06,
"loss": 0.6966,
"step": 427
},
{
"epoch": 4.085918854415274,
"grad_norm": 0.5221308469772339,
"learning_rate": 9.159886790678124e-06,
"loss": 0.6781,
"step": 428
},
{
"epoch": 4.09546539379475,
"grad_norm": 0.4903409779071808,
"learning_rate": 8.967987754335022e-06,
"loss": 0.6055,
"step": 429
},
{
"epoch": 4.105011933174224,
"grad_norm": 0.5302107930183411,
"learning_rate": 8.777921982911996e-06,
"loss": 0.5903,
"step": 430
},
{
"epoch": 4.114558472553699,
"grad_norm": 0.4461494982242584,
"learning_rate": 8.589697968337446e-06,
"loss": 0.5499,
"step": 431
},
{
"epoch": 4.124105011933175,
"grad_norm": 0.6021783351898193,
"learning_rate": 8.40332412025216e-06,
"loss": 0.6363,
"step": 432
},
{
"epoch": 4.133651551312649,
"grad_norm": 0.5821279883384705,
"learning_rate": 8.218808765633512e-06,
"loss": 0.5479,
"step": 433
},
{
"epoch": 4.143198090692124,
"grad_norm": 0.5231468677520752,
"learning_rate": 8.036160148423449e-06,
"loss": 0.6066,
"step": 434
},
{
"epoch": 4.152744630071599,
"grad_norm": 0.5418100357055664,
"learning_rate": 7.85538642916015e-06,
"loss": 0.6153,
"step": 435
},
{
"epoch": 4.162291169451074,
"grad_norm": 0.5310184359550476,
"learning_rate": 7.676495684613432e-06,
"loss": 0.541,
"step": 436
},
{
"epoch": 4.171837708830549,
"grad_norm": 0.5160778164863586,
"learning_rate": 7.499495907423887e-06,
"loss": 0.6041,
"step": 437
},
{
"epoch": 4.181384248210024,
"grad_norm": 0.5340070128440857,
"learning_rate": 7.324395005745771e-06,
"loss": 0.6006,
"step": 438
},
{
"epoch": 4.190930787589499,
"grad_norm": 0.5116534233093262,
"learning_rate": 7.151200802893682e-06,
"loss": 0.6093,
"step": 439
},
{
"epoch": 4.200477326968974,
"grad_norm": 0.5754937529563904,
"learning_rate": 6.979921036993042e-06,
"loss": 0.6219,
"step": 440
},
{
"epoch": 4.210023866348449,
"grad_norm": 0.6205330491065979,
"learning_rate": 6.810563360634298e-06,
"loss": 0.6091,
"step": 441
},
{
"epoch": 4.219570405727923,
"grad_norm": 0.47774946689605713,
"learning_rate": 6.643135340531137e-06,
"loss": 0.5943,
"step": 442
},
{
"epoch": 4.229116945107399,
"grad_norm": 0.5198448896408081,
"learning_rate": 6.477644457182275e-06,
"loss": 0.711,
"step": 443
},
{
"epoch": 4.238663484486874,
"grad_norm": 0.548129141330719,
"learning_rate": 6.314098104537325e-06,
"loss": 0.5444,
"step": 444
},
{
"epoch": 4.248210023866348,
"grad_norm": 0.549915611743927,
"learning_rate": 6.152503589666425e-06,
"loss": 0.6128,
"step": 445
},
{
"epoch": 4.257756563245824,
"grad_norm": 0.473899245262146,
"learning_rate": 5.992868132433754e-06,
"loss": 0.5761,
"step": 446
},
{
"epoch": 4.2673031026252985,
"grad_norm": 0.6241645216941833,
"learning_rate": 5.835198865174956e-06,
"loss": 0.6581,
"step": 447
},
{
"epoch": 4.276849642004773,
"grad_norm": 0.5191392302513123,
"learning_rate": 5.679502832378497e-06,
"loss": 0.6428,
"step": 448
},
{
"epoch": 4.286396181384248,
"grad_norm": 0.5194590091705322,
"learning_rate": 5.5257869903709015e-06,
"loss": 0.6437,
"step": 449
},
{
"epoch": 4.2959427207637235,
"grad_norm": 0.565933108329773,
"learning_rate": 5.374058207005944e-06,
"loss": 0.6357,
"step": 450
},
{
"epoch": 4.305489260143198,
"grad_norm": 0.5410757064819336,
"learning_rate": 5.224323261357844e-06,
"loss": 0.6178,
"step": 451
},
{
"epoch": 4.315035799522673,
"grad_norm": 0.5989289283752441,
"learning_rate": 5.0765888434183454e-06,
"loss": 0.6333,
"step": 452
},
{
"epoch": 4.324582338902148,
"grad_norm": 0.48727279901504517,
"learning_rate": 4.930861553797822e-06,
"loss": 0.5238,
"step": 453
},
{
"epoch": 4.334128878281623,
"grad_norm": 0.5582761764526367,
"learning_rate": 4.7871479034303835e-06,
"loss": 0.5644,
"step": 454
},
{
"epoch": 4.343675417661098,
"grad_norm": 0.5856003165245056,
"learning_rate": 4.645454313282965e-06,
"loss": 0.7528,
"step": 455
},
{
"epoch": 4.353221957040573,
"grad_norm": 0.5278400778770447,
"learning_rate": 4.505787114068432e-06,
"loss": 0.5257,
"step": 456
},
{
"epoch": 4.362768496420047,
"grad_norm": 0.555724024772644,
"learning_rate": 4.3681525459627614e-06,
"loss": 0.6286,
"step": 457
},
{
"epoch": 4.372315035799523,
"grad_norm": 0.5980591773986816,
"learning_rate": 4.232556758326212e-06,
"loss": 0.7107,
"step": 458
},
{
"epoch": 4.3818615751789975,
"grad_norm": 0.5622268915176392,
"learning_rate": 4.099005809428596e-06,
"loss": 0.6305,
"step": 459
},
{
"epoch": 4.391408114558472,
"grad_norm": 0.5708304643630981,
"learning_rate": 3.967505666178556e-06,
"loss": 0.5354,
"step": 460
},
{
"epoch": 4.400954653937948,
"grad_norm": 0.5751599073410034,
"learning_rate": 3.838062203857074e-06,
"loss": 0.5867,
"step": 461
},
{
"epoch": 4.4105011933174225,
"grad_norm": 0.5867375135421753,
"learning_rate": 3.7106812058548377e-06,
"loss": 0.5848,
"step": 462
},
{
"epoch": 4.420047732696897,
"grad_norm": 0.5069125294685364,
"learning_rate": 3.5853683634139434e-06,
"loss": 0.5737,
"step": 463
},
{
"epoch": 4.429594272076372,
"grad_norm": 0.5603401064872742,
"learning_rate": 3.462129275373577e-06,
"loss": 0.6771,
"step": 464
},
{
"epoch": 4.4391408114558475,
"grad_norm": 0.4407503306865692,
"learning_rate": 3.340969447919873e-06,
"loss": 0.4828,
"step": 465
},
{
"epoch": 4.448687350835322,
"grad_norm": 0.5868078470230103,
"learning_rate": 3.2218942943399112e-06,
"loss": 0.6198,
"step": 466
},
{
"epoch": 4.458233890214797,
"grad_norm": 0.5815831422805786,
"learning_rate": 3.104909134779821e-06,
"loss": 0.6983,
"step": 467
},
{
"epoch": 4.4677804295942725,
"grad_norm": 0.5096269845962524,
"learning_rate": 2.9900191960071544e-06,
"loss": 0.5835,
"step": 468
},
{
"epoch": 4.4677804295942725,
"eval_loss": 0.774871826171875,
"eval_runtime": 13.0266,
"eval_samples_per_second": 13.588,
"eval_steps_per_second": 1.766,
"step": 468
},
{
"epoch": 4.477326968973747,
"grad_norm": 0.5428768396377563,
"learning_rate": 2.877229611177268e-06,
"loss": 0.6312,
"step": 469
},
{
"epoch": 4.486873508353222,
"grad_norm": 0.6652801632881165,
"learning_rate": 2.7665454196040664e-06,
"loss": 0.564,
"step": 470
},
{
"epoch": 4.4964200477326965,
"grad_norm": 0.6429077982902527,
"learning_rate": 2.6579715665347893e-06,
"loss": 0.6679,
"step": 471
},
{
"epoch": 4.505966587112172,
"grad_norm": 0.5690730810165405,
"learning_rate": 2.5515129029290986e-06,
"loss": 0.615,
"step": 472
},
{
"epoch": 4.515513126491647,
"grad_norm": 0.6044589877128601,
"learning_rate": 2.4471741852423237e-06,
"loss": 0.6089,
"step": 473
},
{
"epoch": 4.5250596658711215,
"grad_norm": 0.5761138200759888,
"learning_rate": 2.34496007521296e-06,
"loss": 0.6444,
"step": 474
},
{
"epoch": 4.534606205250597,
"grad_norm": 0.6119807958602905,
"learning_rate": 2.2448751396543787e-06,
"loss": 0.7381,
"step": 475
},
{
"epoch": 4.544152744630072,
"grad_norm": 0.5442492961883545,
"learning_rate": 2.1469238502507925e-06,
"loss": 0.6084,
"step": 476
},
{
"epoch": 4.5536992840095465,
"grad_norm": 0.5115097761154175,
"learning_rate": 2.0511105833574683e-06,
"loss": 0.5036,
"step": 477
},
{
"epoch": 4.563245823389021,
"grad_norm": 0.5502949953079224,
"learning_rate": 1.957439619805196e-06,
"loss": 0.6587,
"step": 478
},
{
"epoch": 4.572792362768497,
"grad_norm": 0.5296341776847839,
"learning_rate": 1.865915144708985e-06,
"loss": 0.6043,
"step": 479
},
{
"epoch": 4.5823389021479715,
"grad_norm": 0.5244660377502441,
"learning_rate": 1.7765412472811771e-06,
"loss": 0.7407,
"step": 480
},
{
"epoch": 4.591885441527446,
"grad_norm": 0.4937599301338196,
"learning_rate": 1.6893219206486233e-06,
"loss": 0.653,
"step": 481
},
{
"epoch": 4.601431980906922,
"grad_norm": 0.5165035128593445,
"learning_rate": 1.6042610616743781e-06,
"loss": 0.6745,
"step": 482
},
{
"epoch": 4.610978520286396,
"grad_norm": 0.5757156014442444,
"learning_rate": 1.5213624707835273e-06,
"loss": 0.7117,
"step": 483
},
{
"epoch": 4.620525059665871,
"grad_norm": 0.5235234498977661,
"learning_rate": 1.4406298517934069e-06,
"loss": 0.5699,
"step": 484
},
{
"epoch": 4.630071599045346,
"grad_norm": 0.5536040663719177,
"learning_rate": 1.3620668117481472e-06,
"loss": 0.6527,
"step": 485
},
{
"epoch": 4.6396181384248205,
"grad_norm": 0.5768601894378662,
"learning_rate": 1.2856768607574566e-06,
"loss": 0.7039,
"step": 486
},
{
"epoch": 4.649164677804296,
"grad_norm": 0.5293567776679993,
"learning_rate": 1.2114634118398638e-06,
"loss": 0.5016,
"step": 487
},
{
"epoch": 4.658711217183771,
"grad_norm": 0.4922030568122864,
"learning_rate": 1.1394297807701737e-06,
"loss": 0.6378,
"step": 488
},
{
"epoch": 4.6682577565632455,
"grad_norm": 0.5279848575592041,
"learning_rate": 1.0695791859313298e-06,
"loss": 0.5854,
"step": 489
},
{
"epoch": 4.677804295942721,
"grad_norm": 0.5893853902816772,
"learning_rate": 1.0019147481706625e-06,
"loss": 0.6481,
"step": 490
},
{
"epoch": 4.687350835322196,
"grad_norm": 0.5379951000213623,
"learning_rate": 9.364394906603901e-07,
"loss": 0.5932,
"step": 491
},
{
"epoch": 4.6968973747016705,
"grad_norm": 0.5492742657661438,
"learning_rate": 8.731563387626096e-07,
"loss": 0.6475,
"step": 492
},
{
"epoch": 4.706443914081145,
"grad_norm": 0.596684455871582,
"learning_rate": 8.120681198985292e-07,
"loss": 0.6257,
"step": 493
},
{
"epoch": 4.715990453460621,
"grad_norm": 0.5613247156143188,
"learning_rate": 7.531775634222138e-07,
"loss": 0.6233,
"step": 494
},
{
"epoch": 4.725536992840095,
"grad_norm": 0.5937960743904114,
"learning_rate": 6.964873004985717e-07,
"loss": 0.6734,
"step": 495
},
{
"epoch": 4.73508353221957,
"grad_norm": 0.5525433421134949,
"learning_rate": 6.419998639858538e-07,
"loss": 0.5601,
"step": 496
},
{
"epoch": 4.744630071599046,
"grad_norm": 0.5674152374267578,
"learning_rate": 5.897176883224442e-07,
"loss": 0.6594,
"step": 497
},
{
"epoch": 4.75417661097852,
"grad_norm": 0.5498500466346741,
"learning_rate": 5.396431094181198e-07,
"loss": 0.616,
"step": 498
},
{
"epoch": 4.763723150357995,
"grad_norm": 0.5305196046829224,
"learning_rate": 4.917783645496888e-07,
"loss": 0.4995,
"step": 499
},
{
"epoch": 4.77326968973747,
"grad_norm": 0.5790063738822937,
"learning_rate": 4.461255922609986e-07,
"loss": 0.5518,
"step": 500
},
{
"epoch": 4.782816229116945,
"grad_norm": 0.548302412033081,
"learning_rate": 4.0268683226741265e-07,
"loss": 0.6202,
"step": 501
},
{
"epoch": 4.79236276849642,
"grad_norm": 0.548893928527832,
"learning_rate": 3.6146402536468283e-07,
"loss": 0.6218,
"step": 502
},
{
"epoch": 4.801909307875895,
"grad_norm": 0.5890070199966431,
"learning_rate": 3.2245901334221895e-07,
"loss": 0.6638,
"step": 503
},
{
"epoch": 4.81145584725537,
"grad_norm": 0.4871584475040436,
"learning_rate": 2.856735389008269e-07,
"loss": 0.6107,
"step": 504
},
{
"epoch": 4.821002386634845,
"grad_norm": 0.5432624816894531,
"learning_rate": 2.511092455747932e-07,
"loss": 0.583,
"step": 505
},
{
"epoch": 4.83054892601432,
"grad_norm": 0.5359986424446106,
"learning_rate": 2.1876767765853234e-07,
"loss": 0.5368,
"step": 506
},
{
"epoch": 4.840095465393794,
"grad_norm": 0.5359886288642883,
"learning_rate": 1.8865028013751452e-07,
"loss": 0.6259,
"step": 507
},
{
"epoch": 4.84964200477327,
"grad_norm": 0.5111921429634094,
"learning_rate": 1.6075839862374488e-07,
"loss": 0.5609,
"step": 508
},
{
"epoch": 4.859188544152745,
"grad_norm": 0.6437258124351501,
"learning_rate": 1.3509327929563942e-07,
"loss": 0.6395,
"step": 509
},
{
"epoch": 4.868735083532219,
"grad_norm": 0.5992398262023926,
"learning_rate": 1.1165606884234181e-07,
"loss": 0.6546,
"step": 510
},
{
"epoch": 4.878281622911695,
"grad_norm": 0.5831811428070068,
"learning_rate": 9.044781441249207e-08,
"loss": 0.609,
"step": 511
},
{
"epoch": 4.88782816229117,
"grad_norm": 0.5561614632606506,
"learning_rate": 7.146946356743067e-08,
"loss": 0.6699,
"step": 512
},
{
"epoch": 4.897374701670644,
"grad_norm": 0.5337750911712646,
"learning_rate": 5.472186423889358e-08,
"loss": 0.5614,
"step": 513
},
{
"epoch": 4.906921241050119,
"grad_norm": 0.41179969906806946,
"learning_rate": 4.020576469108139e-08,
"loss": 0.4451,
"step": 514
},
{
"epoch": 4.916467780429595,
"grad_norm": 0.4820442795753479,
"learning_rate": 2.792181348726941e-08,
"loss": 0.5897,
"step": 515
},
{
"epoch": 4.926014319809069,
"grad_norm": 0.5927594900131226,
"learning_rate": 1.7870559460814173e-08,
"loss": 0.6788,
"step": 516
},
{
"epoch": 4.935560859188544,
"grad_norm": 0.5302107334136963,
"learning_rate": 1.0052451690617527e-08,
"loss": 0.6105,
"step": 517
},
{
"epoch": 4.945107398568019,
"grad_norm": 0.5596168041229248,
"learning_rate": 4.46783948109819e-09,
"loss": 0.5537,
"step": 518
},
{
"epoch": 4.954653937947494,
"grad_norm": 0.5655501484870911,
"learning_rate": 1.1169723465487281e-09,
"loss": 0.6915,
"step": 519
},
{
"epoch": 4.964200477326969,
"grad_norm": 0.5537259578704834,
"learning_rate": 0.0,
"loss": 0.7157,
"step": 520
},
{
"epoch": 4.964200477326969,
"eval_loss": 0.7753176689147949,
"eval_runtime": 12.9676,
"eval_samples_per_second": 13.649,
"eval_steps_per_second": 1.774,
"step": 520
}
],
"logging_steps": 1,
"max_steps": 520,
"num_input_tokens_seen": 0,
"num_train_epochs": 5,
"save_steps": 20,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": true
},
"attributes": {}
}
},
"total_flos": 4.0257258893869056e+17,
"train_batch_size": 4,
"trial_name": null,
"trial_params": null
}