olierlarge2 / trainer_state.json
Commit 4f7c9ec (verified) by Jaredquek: "Upload folder using huggingface_hub"
{
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 1.9972690572310614,
"eval_steps": 500,
"global_step": 50532,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.003955070400253125,
"grad_norm": 0.3985116183757782,
"learning_rate": 1.5e-06,
"loss": 2.3445,
"step": 100
},
{
"epoch": 0.00791014080050625,
"grad_norm": 0.5253750681877136,
"learning_rate": 3e-06,
"loss": 2.2315,
"step": 200
},
{
"epoch": 0.011865211200759373,
"grad_norm": 0.7505590319633484,
"learning_rate": 4.5e-06,
"loss": 2.0777,
"step": 300
},
{
"epoch": 0.0158202816010125,
"grad_norm": 0.7067473530769348,
"learning_rate": 6e-06,
"loss": 2.0165,
"step": 400
},
{
"epoch": 0.019775352001265623,
"grad_norm": 0.8964686393737793,
"learning_rate": 7.5e-06,
"loss": 1.9872,
"step": 500
},
{
"epoch": 0.023730422401518746,
"grad_norm": 1.0233873128890991,
"learning_rate": 9e-06,
"loss": 1.9621,
"step": 600
},
{
"epoch": 0.02768549280177187,
"grad_norm": 1.0008875131607056,
"learning_rate": 1.05e-05,
"loss": 1.9519,
"step": 700
},
{
"epoch": 0.031640563202025,
"grad_norm": 0.9941542148590088,
"learning_rate": 1.2e-05,
"loss": 1.922,
"step": 800
},
{
"epoch": 0.03559563360227812,
"grad_norm": 1.0296485424041748,
"learning_rate": 1.3500000000000001e-05,
"loss": 1.9305,
"step": 900
},
{
"epoch": 0.039550704002531245,
"grad_norm": 1.056834101676941,
"learning_rate": 1.5e-05,
"loss": 1.9199,
"step": 1000
},
{
"epoch": 0.04350577440278437,
"grad_norm": 1.1616301536560059,
"learning_rate": 1.4999976437535872e-05,
"loss": 1.9058,
"step": 1100
},
{
"epoch": 0.04746084480303749,
"grad_norm": 1.0753331184387207,
"learning_rate": 1.4999905750291538e-05,
"loss": 1.9321,
"step": 1200
},
{
"epoch": 0.05141591520329062,
"grad_norm": 1.3542801141738892,
"learning_rate": 1.4999787938711148e-05,
"loss": 1.8942,
"step": 1300
},
{
"epoch": 0.05537098560354374,
"grad_norm": 1.2391093969345093,
"learning_rate": 1.499962300353495e-05,
"loss": 1.8677,
"step": 1400
},
{
"epoch": 0.05932605600379687,
"grad_norm": 1.0844732522964478,
"learning_rate": 1.4999410945799291e-05,
"loss": 1.8723,
"step": 1500
},
{
"epoch": 0.06328112640405,
"grad_norm": 1.0814001560211182,
"learning_rate": 1.499915176683659e-05,
"loss": 1.8715,
"step": 1600
},
{
"epoch": 0.06723619680430312,
"grad_norm": 1.2003209590911865,
"learning_rate": 1.4998845468275357e-05,
"loss": 1.8739,
"step": 1700
},
{
"epoch": 0.07119126720455624,
"grad_norm": 1.2625234127044678,
"learning_rate": 1.4998492052040163e-05,
"loss": 1.8475,
"step": 1800
},
{
"epoch": 0.07514633760480936,
"grad_norm": 1.144286870956421,
"learning_rate": 1.499809152035164e-05,
"loss": 1.8517,
"step": 1900
},
{
"epoch": 0.07910140800506249,
"grad_norm": 1.2154852151870728,
"learning_rate": 1.4997643875726454e-05,
"loss": 1.8262,
"step": 2000
},
{
"epoch": 0.08305647840531562,
"grad_norm": 1.389930009841919,
"learning_rate": 1.4997149120977304e-05,
"loss": 1.8419,
"step": 2100
},
{
"epoch": 0.08701154880556874,
"grad_norm": 1.265163779258728,
"learning_rate": 1.4996607259212892e-05,
"loss": 1.8344,
"step": 2200
},
{
"epoch": 0.09096661920582186,
"grad_norm": 1.1876202821731567,
"learning_rate": 1.4996018293837914e-05,
"loss": 1.8356,
"step": 2300
},
{
"epoch": 0.09492168960607499,
"grad_norm": 1.2677972316741943,
"learning_rate": 1.4995382228553028e-05,
"loss": 1.8463,
"step": 2400
},
{
"epoch": 0.09887676000632811,
"grad_norm": 1.206444501876831,
"learning_rate": 1.4994699067354838e-05,
"loss": 1.8256,
"step": 2500
},
{
"epoch": 0.10283183040658124,
"grad_norm": 1.2112213373184204,
"learning_rate": 1.4993968814535867e-05,
"loss": 1.8627,
"step": 2600
},
{
"epoch": 0.10678690080683437,
"grad_norm": 1.1587677001953125,
"learning_rate": 1.4993191474684532e-05,
"loss": 1.8458,
"step": 2700
},
{
"epoch": 0.11074197120708748,
"grad_norm": 1.2365622520446777,
"learning_rate": 1.4992367052685107e-05,
"loss": 1.8383,
"step": 2800
},
{
"epoch": 0.11469704160734061,
"grad_norm": 1.179803490638733,
"learning_rate": 1.4991495553717708e-05,
"loss": 1.8305,
"step": 2900
},
{
"epoch": 0.11865211200759374,
"grad_norm": 1.336897850036621,
"learning_rate": 1.499057698325824e-05,
"loss": 1.8381,
"step": 3000
},
{
"epoch": 0.12260718240784686,
"grad_norm": 1.346336007118225,
"learning_rate": 1.498961134707838e-05,
"loss": 1.8467,
"step": 3100
},
{
"epoch": 0.1265622528081,
"grad_norm": 1.2718663215637207,
"learning_rate": 1.4988598651245534e-05,
"loss": 1.8076,
"step": 3200
},
{
"epoch": 0.1305173232083531,
"grad_norm": 1.1807332038879395,
"learning_rate": 1.4987538902122799e-05,
"loss": 1.8212,
"step": 3300
},
{
"epoch": 0.13447239360860624,
"grad_norm": 1.4749420881271362,
"learning_rate": 1.4986432106368917e-05,
"loss": 1.8039,
"step": 3400
},
{
"epoch": 0.13842746400885936,
"grad_norm": 1.2670525312423706,
"learning_rate": 1.4985278270938247e-05,
"loss": 1.8137,
"step": 3500
},
{
"epoch": 0.14238253440911247,
"grad_norm": 1.5255069732666016,
"learning_rate": 1.4984077403080711e-05,
"loss": 1.8161,
"step": 3600
},
{
"epoch": 0.1463376048093656,
"grad_norm": 1.1995693445205688,
"learning_rate": 1.4982829510341751e-05,
"loss": 1.8104,
"step": 3700
},
{
"epoch": 0.15029267520961873,
"grad_norm": 1.3007076978683472,
"learning_rate": 1.4981534600562279e-05,
"loss": 1.7952,
"step": 3800
},
{
"epoch": 0.15424774560987187,
"grad_norm": 1.4348576068878174,
"learning_rate": 1.4980192681878635e-05,
"loss": 1.819,
"step": 3900
},
{
"epoch": 0.15820281601012498,
"grad_norm": 1.3245015144348145,
"learning_rate": 1.4978803762722526e-05,
"loss": 1.8043,
"step": 4000
},
{
"epoch": 0.1621578864103781,
"grad_norm": 1.3621593713760376,
"learning_rate": 1.4977367851820984e-05,
"loss": 1.7992,
"step": 4100
},
{
"epoch": 0.16611295681063123,
"grad_norm": 1.2361946105957031,
"learning_rate": 1.4975884958196297e-05,
"loss": 1.8179,
"step": 4200
},
{
"epoch": 0.17006802721088435,
"grad_norm": 1.5746525526046753,
"learning_rate": 1.4974355091165972e-05,
"loss": 1.8045,
"step": 4300
},
{
"epoch": 0.1740230976111375,
"grad_norm": 1.4326754808425903,
"learning_rate": 1.497277826034265e-05,
"loss": 1.8155,
"step": 4400
},
{
"epoch": 0.1779781680113906,
"grad_norm": 1.3772553205490112,
"learning_rate": 1.4971154475634081e-05,
"loss": 1.7838,
"step": 4500
},
{
"epoch": 0.18193323841164372,
"grad_norm": 1.4580802917480469,
"learning_rate": 1.4969483747243023e-05,
"loss": 1.7997,
"step": 4600
},
{
"epoch": 0.18588830881189686,
"grad_norm": 1.2635383605957031,
"learning_rate": 1.4967766085667204e-05,
"loss": 1.8091,
"step": 4700
},
{
"epoch": 0.18984337921214997,
"grad_norm": 1.523247241973877,
"learning_rate": 1.496600150169925e-05,
"loss": 1.8086,
"step": 4800
},
{
"epoch": 0.1937984496124031,
"grad_norm": 1.4835641384124756,
"learning_rate": 1.496419000642661e-05,
"loss": 1.8001,
"step": 4900
},
{
"epoch": 0.19775352001265623,
"grad_norm": 1.3758567571640015,
"learning_rate": 1.4962331611231496e-05,
"loss": 1.7773,
"step": 5000
},
{
"epoch": 0.20170859041290934,
"grad_norm": 1.343885898590088,
"learning_rate": 1.4960426327790808e-05,
"loss": 1.7884,
"step": 5100
},
{
"epoch": 0.20566366081316248,
"grad_norm": 1.4712055921554565,
"learning_rate": 1.4958474168076061e-05,
"loss": 1.7904,
"step": 5200
},
{
"epoch": 0.2096187312134156,
"grad_norm": 1.3729618787765503,
"learning_rate": 1.4956475144353305e-05,
"loss": 1.7883,
"step": 5300
},
{
"epoch": 0.21357380161366873,
"grad_norm": 1.4087861776351929,
"learning_rate": 1.4954429269183049e-05,
"loss": 1.7764,
"step": 5400
},
{
"epoch": 0.21752887201392185,
"grad_norm": 1.359567642211914,
"learning_rate": 1.4952336555420194e-05,
"loss": 1.7522,
"step": 5500
},
{
"epoch": 0.22148394241417496,
"grad_norm": 1.5321180820465088,
"learning_rate": 1.4950197016213935e-05,
"loss": 1.7858,
"step": 5600
},
{
"epoch": 0.2254390128144281,
"grad_norm": 1.4172133207321167,
"learning_rate": 1.4948010665007694e-05,
"loss": 1.7889,
"step": 5700
},
{
"epoch": 0.22939408321468122,
"grad_norm": 1.389819622039795,
"learning_rate": 1.4945777515539018e-05,
"loss": 1.7787,
"step": 5800
},
{
"epoch": 0.23334915361493436,
"grad_norm": 1.319344162940979,
"learning_rate": 1.4943497581839515e-05,
"loss": 1.7832,
"step": 5900
},
{
"epoch": 0.23730422401518747,
"grad_norm": 1.3472243547439575,
"learning_rate": 1.4941170878234739e-05,
"loss": 1.7708,
"step": 6000
},
{
"epoch": 0.24125929441544058,
"grad_norm": 1.319574236869812,
"learning_rate": 1.4938797419344127e-05,
"loss": 1.8013,
"step": 6100
},
{
"epoch": 0.24521436481569373,
"grad_norm": 1.3281052112579346,
"learning_rate": 1.4936377220080886e-05,
"loss": 1.7657,
"step": 6200
},
{
"epoch": 0.24916943521594684,
"grad_norm": 1.4613900184631348,
"learning_rate": 1.4933910295651914e-05,
"loss": 1.7955,
"step": 6300
},
{
"epoch": 0.2531245056162,
"grad_norm": 1.6248208284378052,
"learning_rate": 1.4931396661557699e-05,
"loss": 1.7775,
"step": 6400
},
{
"epoch": 0.2570795760164531,
"grad_norm": 1.4050931930541992,
"learning_rate": 1.492883633359221e-05,
"loss": 1.749,
"step": 6500
},
{
"epoch": 0.2610346464167062,
"grad_norm": 1.3959672451019287,
"learning_rate": 1.4926229327842822e-05,
"loss": 1.7735,
"step": 6600
},
{
"epoch": 0.2649897168169593,
"grad_norm": 1.3407930135726929,
"learning_rate": 1.4923575660690197e-05,
"loss": 1.7685,
"step": 6700
},
{
"epoch": 0.2689447872172125,
"grad_norm": 1.4229165315628052,
"learning_rate": 1.4920875348808181e-05,
"loss": 1.769,
"step": 6800
},
{
"epoch": 0.2728998576174656,
"grad_norm": 1.3997846841812134,
"learning_rate": 1.4918128409163712e-05,
"loss": 1.7804,
"step": 6900
},
{
"epoch": 0.2768549280177187,
"grad_norm": 1.7216688394546509,
"learning_rate": 1.4915334859016699e-05,
"loss": 1.7699,
"step": 7000
},
{
"epoch": 0.28080999841797183,
"grad_norm": 1.4955641031265259,
"learning_rate": 1.491249471591992e-05,
"loss": 1.7615,
"step": 7100
},
{
"epoch": 0.28476506881822494,
"grad_norm": 1.631832480430603,
"learning_rate": 1.4909607997718917e-05,
"loss": 1.7708,
"step": 7200
},
{
"epoch": 0.2887201392184781,
"grad_norm": 1.4372748136520386,
"learning_rate": 1.4906674722551872e-05,
"loss": 1.7618,
"step": 7300
},
{
"epoch": 0.2926752096187312,
"grad_norm": 1.3430101871490479,
"learning_rate": 1.4903694908849506e-05,
"loss": 1.7734,
"step": 7400
},
{
"epoch": 0.29663028001898434,
"grad_norm": 1.4826927185058594,
"learning_rate": 1.4900668575334953e-05,
"loss": 1.7679,
"step": 7500
},
{
"epoch": 0.30058535041923745,
"grad_norm": 1.4105191230773926,
"learning_rate": 1.4897595741023642e-05,
"loss": 1.7666,
"step": 7600
},
{
"epoch": 0.30454042081949056,
"grad_norm": 1.3381356000900269,
"learning_rate": 1.4894476425223191e-05,
"loss": 1.7697,
"step": 7700
},
{
"epoch": 0.30849549121974373,
"grad_norm": 1.3745373487472534,
"learning_rate": 1.4891310647533266e-05,
"loss": 1.7707,
"step": 7800
},
{
"epoch": 0.31245056161999685,
"grad_norm": 1.3524316549301147,
"learning_rate": 1.488809842784548e-05,
"loss": 1.7515,
"step": 7900
},
{
"epoch": 0.31640563202024996,
"grad_norm": 1.4299299716949463,
"learning_rate": 1.4884839786343242e-05,
"loss": 1.7799,
"step": 8000
},
{
"epoch": 0.3203607024205031,
"grad_norm": 1.4132308959960938,
"learning_rate": 1.4881534743501656e-05,
"loss": 1.7258,
"step": 8100
},
{
"epoch": 0.3243157728207562,
"grad_norm": 1.4797372817993164,
"learning_rate": 1.4878183320087377e-05,
"loss": 1.7657,
"step": 8200
},
{
"epoch": 0.32827084322100936,
"grad_norm": 1.3737105131149292,
"learning_rate": 1.4874785537158479e-05,
"loss": 1.7845,
"step": 8300
},
{
"epoch": 0.33222591362126247,
"grad_norm": 1.4676398038864136,
"learning_rate": 1.4871341416064337e-05,
"loss": 1.7652,
"step": 8400
},
{
"epoch": 0.3361809840215156,
"grad_norm": 1.3782434463500977,
"learning_rate": 1.4867850978445476e-05,
"loss": 1.7516,
"step": 8500
},
{
"epoch": 0.3401360544217687,
"grad_norm": 1.5370761156082153,
"learning_rate": 1.4864314246233448e-05,
"loss": 1.75,
"step": 8600
},
{
"epoch": 0.3440911248220218,
"grad_norm": 1.4677528142929077,
"learning_rate": 1.486073124165068e-05,
"loss": 1.7518,
"step": 8700
},
{
"epoch": 0.348046195222275,
"grad_norm": 1.4215251207351685,
"learning_rate": 1.4857101987210359e-05,
"loss": 1.7634,
"step": 8800
},
{
"epoch": 0.3520012656225281,
"grad_norm": 1.4959337711334229,
"learning_rate": 1.4853426505716261e-05,
"loss": 1.7491,
"step": 8900
},
{
"epoch": 0.3559563360227812,
"grad_norm": 1.4005351066589355,
"learning_rate": 1.4849704820262627e-05,
"loss": 1.7713,
"step": 9000
},
{
"epoch": 0.3599114064230343,
"grad_norm": 1.4689812660217285,
"learning_rate": 1.484593695423401e-05,
"loss": 1.7448,
"step": 9100
},
{
"epoch": 0.36386647682328743,
"grad_norm": 1.5371148586273193,
"learning_rate": 1.4842122931305133e-05,
"loss": 1.7452,
"step": 9200
},
{
"epoch": 0.3678215472235406,
"grad_norm": 1.4465723037719727,
"learning_rate": 1.4838262775440741e-05,
"loss": 1.7452,
"step": 9300
},
{
"epoch": 0.3717766176237937,
"grad_norm": 1.5890401601791382,
"learning_rate": 1.4834356510895436e-05,
"loss": 1.737,
"step": 9400
},
{
"epoch": 0.37573168802404683,
"grad_norm": 1.4862806797027588,
"learning_rate": 1.4830404162213549e-05,
"loss": 1.7426,
"step": 9500
},
{
"epoch": 0.37968675842429994,
"grad_norm": 1.5449295043945312,
"learning_rate": 1.4826405754228963e-05,
"loss": 1.7379,
"step": 9600
},
{
"epoch": 0.38364182882455306,
"grad_norm": 1.5151877403259277,
"learning_rate": 1.482236131206497e-05,
"loss": 1.7269,
"step": 9700
},
{
"epoch": 0.3875968992248062,
"grad_norm": 1.600046157836914,
"learning_rate": 1.4818270861134113e-05,
"loss": 1.7556,
"step": 9800
},
{
"epoch": 0.39155196962505934,
"grad_norm": 1.4293779134750366,
"learning_rate": 1.4814134427138015e-05,
"loss": 1.7368,
"step": 9900
},
{
"epoch": 0.39550704002531245,
"grad_norm": 1.378175973892212,
"learning_rate": 1.4809952036067231e-05,
"loss": 1.7405,
"step": 10000
},
{
"epoch": 0.39946211042556556,
"grad_norm": 1.417622447013855,
"learning_rate": 1.4805723714201079e-05,
"loss": 1.7484,
"step": 10100
},
{
"epoch": 0.4034171808258187,
"grad_norm": 1.5106312036514282,
"learning_rate": 1.4801449488107477e-05,
"loss": 1.7218,
"step": 10200
},
{
"epoch": 0.40737225122607185,
"grad_norm": 1.5248609781265259,
"learning_rate": 1.4797129384642768e-05,
"loss": 1.7328,
"step": 10300
},
{
"epoch": 0.41132732162632496,
"grad_norm": 1.4607023000717163,
"learning_rate": 1.4792763430951562e-05,
"loss": 1.7131,
"step": 10400
},
{
"epoch": 0.4152823920265781,
"grad_norm": 1.4600701332092285,
"learning_rate": 1.4788351654466556e-05,
"loss": 1.7418,
"step": 10500
},
{
"epoch": 0.4192374624268312,
"grad_norm": 1.3468823432922363,
"learning_rate": 1.4783894082908377e-05,
"loss": 1.7649,
"step": 10600
},
{
"epoch": 0.4231925328270843,
"grad_norm": 1.5118048191070557,
"learning_rate": 1.4779390744285386e-05,
"loss": 1.7233,
"step": 10700
},
{
"epoch": 0.42714760322733747,
"grad_norm": 1.5199166536331177,
"learning_rate": 1.4774841666893515e-05,
"loss": 1.7238,
"step": 10800
},
{
"epoch": 0.4311026736275906,
"grad_norm": 1.6537836790084839,
"learning_rate": 1.4770246879316097e-05,
"loss": 1.7216,
"step": 10900
},
{
"epoch": 0.4350577440278437,
"grad_norm": 1.37918221950531,
"learning_rate": 1.4765606410423666e-05,
"loss": 1.7481,
"step": 11000
},
{
"epoch": 0.4390128144280968,
"grad_norm": 1.526502013206482,
"learning_rate": 1.4760920289373791e-05,
"loss": 1.7141,
"step": 11100
},
{
"epoch": 0.4429678848283499,
"grad_norm": 1.3577282428741455,
"learning_rate": 1.4756188545610884e-05,
"loss": 1.7507,
"step": 11200
},
{
"epoch": 0.4469229552286031,
"grad_norm": 1.557986855506897,
"learning_rate": 1.475141120886603e-05,
"loss": 1.7103,
"step": 11300
},
{
"epoch": 0.4508780256288562,
"grad_norm": 1.638221025466919,
"learning_rate": 1.474658830915678e-05,
"loss": 1.7363,
"step": 11400
},
{
"epoch": 0.4548330960291093,
"grad_norm": 1.472142219543457,
"learning_rate": 1.474171987678697e-05,
"loss": 1.7331,
"step": 11500
},
{
"epoch": 0.45878816642936243,
"grad_norm": 1.4680249691009521,
"learning_rate": 1.4736805942346542e-05,
"loss": 1.7273,
"step": 11600
},
{
"epoch": 0.46274323682961555,
"grad_norm": 1.4165573120117188,
"learning_rate": 1.4731846536711337e-05,
"loss": 1.7159,
"step": 11700
},
{
"epoch": 0.4666983072298687,
"grad_norm": 2.1816458702087402,
"learning_rate": 1.4726841691042902e-05,
"loss": 1.7236,
"step": 11800
},
{
"epoch": 0.4706533776301218,
"grad_norm": 1.5376547574996948,
"learning_rate": 1.4721791436788307e-05,
"loss": 1.7227,
"step": 11900
},
{
"epoch": 0.47460844803037494,
"grad_norm": 1.6850054264068604,
"learning_rate": 1.4716695805679932e-05,
"loss": 1.7116,
"step": 12000
},
{
"epoch": 0.47856351843062805,
"grad_norm": 1.7338590621948242,
"learning_rate": 1.471155482973528e-05,
"loss": 1.7129,
"step": 12100
},
{
"epoch": 0.48251858883088117,
"grad_norm": 1.4183164834976196,
"learning_rate": 1.4706368541256762e-05,
"loss": 1.7267,
"step": 12200
},
{
"epoch": 0.48647365923113434,
"grad_norm": 1.7117156982421875,
"learning_rate": 1.4701136972831513e-05,
"loss": 1.7149,
"step": 12300
},
{
"epoch": 0.49042872963138745,
"grad_norm": 1.4747951030731201,
"learning_rate": 1.4695860157331169e-05,
"loss": 1.7218,
"step": 12400
},
{
"epoch": 0.49438380003164056,
"grad_norm": 1.6341221332550049,
"learning_rate": 1.4690538127911672e-05,
"loss": 1.7331,
"step": 12500
},
{
"epoch": 0.4983388704318937,
"grad_norm": 1.4981880187988281,
"learning_rate": 1.4685170918013054e-05,
"loss": 1.7182,
"step": 12600
},
{
"epoch": 0.5022939408321468,
"grad_norm": 1.5774872303009033,
"learning_rate": 1.4679758561359232e-05,
"loss": 1.7154,
"step": 12700
},
{
"epoch": 0.5062490112324,
"grad_norm": 1.5503437519073486,
"learning_rate": 1.4674301091957795e-05,
"loss": 1.716,
"step": 12800
},
{
"epoch": 0.5102040816326531,
"grad_norm": 1.5208927392959595,
"learning_rate": 1.4668798544099795e-05,
"loss": 1.7041,
"step": 12900
},
{
"epoch": 0.5141591520329062,
"grad_norm": 1.8089638948440552,
"learning_rate": 1.4663250952359516e-05,
"loss": 1.7276,
"step": 13000
},
{
"epoch": 0.5181142224331593,
"grad_norm": 1.5653834342956543,
"learning_rate": 1.4657658351594275e-05,
"loss": 1.7164,
"step": 13100
},
{
"epoch": 0.5220692928334124,
"grad_norm": 1.7017031908035278,
"learning_rate": 1.4652020776944194e-05,
"loss": 1.7053,
"step": 13200
},
{
"epoch": 0.5260243632336655,
"grad_norm": 1.6849620342254639,
"learning_rate": 1.4646338263831977e-05,
"loss": 1.7134,
"step": 13300
},
{
"epoch": 0.5299794336339186,
"grad_norm": 1.8098126649856567,
"learning_rate": 1.4640610847962699e-05,
"loss": 1.7158,
"step": 13400
},
{
"epoch": 0.5339345040341718,
"grad_norm": 1.7234479188919067,
"learning_rate": 1.4634838565323563e-05,
"loss": 1.7229,
"step": 13500
},
{
"epoch": 0.537889574434425,
"grad_norm": 1.35356867313385,
"learning_rate": 1.4629021452183695e-05,
"loss": 1.715,
"step": 13600
},
{
"epoch": 0.5418446448346781,
"grad_norm": 1.5286564826965332,
"learning_rate": 1.4623159545093895e-05,
"loss": 1.7011,
"step": 13700
},
{
"epoch": 0.5457997152349312,
"grad_norm": 1.5586360692977905,
"learning_rate": 1.4617252880886427e-05,
"loss": 1.6978,
"step": 13800
},
{
"epoch": 0.5497547856351843,
"grad_norm": 1.5301753282546997,
"learning_rate": 1.461130149667477e-05,
"loss": 1.6984,
"step": 13900
},
{
"epoch": 0.5537098560354374,
"grad_norm": 1.6551586389541626,
"learning_rate": 1.4605305429853402e-05,
"loss": 1.6935,
"step": 14000
},
{
"epoch": 0.5576649264356905,
"grad_norm": 1.522283911705017,
"learning_rate": 1.4599264718097552e-05,
"loss": 1.6795,
"step": 14100
},
{
"epoch": 0.5616199968359437,
"grad_norm": 1.519173502922058,
"learning_rate": 1.4593179399362967e-05,
"loss": 1.6948,
"step": 14200
},
{
"epoch": 0.5655750672361968,
"grad_norm": 1.582780122756958,
"learning_rate": 1.4587049511885675e-05,
"loss": 1.7168,
"step": 14300
},
{
"epoch": 0.5695301376364499,
"grad_norm": 1.5130764245986938,
"learning_rate": 1.458087509418174e-05,
"loss": 1.7049,
"step": 14400
},
{
"epoch": 0.573485208036703,
"grad_norm": 1.581992268562317,
"learning_rate": 1.4574656185047033e-05,
"loss": 1.695,
"step": 14500
},
{
"epoch": 0.5774402784369562,
"grad_norm": 1.4675225019454956,
"learning_rate": 1.456839282355697e-05,
"loss": 1.7015,
"step": 14600
},
{
"epoch": 0.5813953488372093,
"grad_norm": 1.5948406457901,
"learning_rate": 1.4562085049066282e-05,
"loss": 1.7129,
"step": 14700
},
{
"epoch": 0.5853504192374624,
"grad_norm": 1.8901729583740234,
"learning_rate": 1.4555732901208756e-05,
"loss": 1.7062,
"step": 14800
},
{
"epoch": 0.5893054896377156,
"grad_norm": 1.6940269470214844,
"learning_rate": 1.4549336419896993e-05,
"loss": 1.7025,
"step": 14900
},
{
"epoch": 0.5932605600379687,
"grad_norm": 1.5160539150238037,
"learning_rate": 1.454289564532216e-05,
"loss": 1.688,
"step": 15000
},
{
"epoch": 0.5972156304382218,
"grad_norm": 1.6424893140792847,
"learning_rate": 1.4536410617953726e-05,
"loss": 1.696,
"step": 15100
},
{
"epoch": 0.6011707008384749,
"grad_norm": 1.492990493774414,
"learning_rate": 1.4529881378539218e-05,
"loss": 1.6768,
"step": 15200
},
{
"epoch": 0.605125771238728,
"grad_norm": 1.7309181690216064,
"learning_rate": 1.452330796810396e-05,
"loss": 1.6972,
"step": 15300
},
{
"epoch": 0.6090808416389811,
"grad_norm": 1.684484601020813,
"learning_rate": 1.451669042795082e-05,
"loss": 1.6903,
"step": 15400
},
{
"epoch": 0.6130359120392342,
"grad_norm": 1.5465792417526245,
"learning_rate": 1.4510028799659944e-05,
"loss": 1.714,
"step": 15500
},
{
"epoch": 0.6169909824394875,
"grad_norm": 1.8257033824920654,
"learning_rate": 1.4503323125088501e-05,
"loss": 1.6894,
"step": 15600
},
{
"epoch": 0.6209460528397406,
"grad_norm": 1.5299944877624512,
"learning_rate": 1.4496573446370414e-05,
"loss": 1.6944,
"step": 15700
},
{
"epoch": 0.6249011232399937,
"grad_norm": 1.7090293169021606,
"learning_rate": 1.44897798059161e-05,
"loss": 1.6878,
"step": 15800
},
{
"epoch": 0.6288561936402468,
"grad_norm": 1.690470576286316,
"learning_rate": 1.4482942246412203e-05,
"loss": 1.6807,
"step": 15900
},
{
"epoch": 0.6328112640404999,
"grad_norm": 1.8261181116104126,
"learning_rate": 1.4476060810821319e-05,
"loss": 1.6887,
"step": 16000
},
{
"epoch": 0.636766334440753,
"grad_norm": 1.5878318548202515,
"learning_rate": 1.4469135542381741e-05,
"loss": 1.6618,
"step": 16100
},
{
"epoch": 0.6407214048410061,
"grad_norm": 1.5003888607025146,
"learning_rate": 1.4462166484607167e-05,
"loss": 1.6734,
"step": 16200
},
{
"epoch": 0.6446764752412593,
"grad_norm": 1.7296781539916992,
"learning_rate": 1.445515368128645e-05,
"loss": 1.6712,
"step": 16300
},
{
"epoch": 0.6486315456415124,
"grad_norm": 1.6283060312271118,
"learning_rate": 1.4448097176483299e-05,
"loss": 1.6963,
"step": 16400
},
{
"epoch": 0.6525866160417655,
"grad_norm": 1.5867258310317993,
"learning_rate": 1.444099701453602e-05,
"loss": 1.6834,
"step": 16500
},
{
"epoch": 0.6565416864420187,
"grad_norm": 1.8763879537582397,
"learning_rate": 1.4433853240057229e-05,
"loss": 1.6811,
"step": 16600
},
{
"epoch": 0.6604967568422718,
"grad_norm": 1.5323275327682495,
"learning_rate": 1.4426665897933574e-05,
"loss": 1.6778,
"step": 16700
},
{
"epoch": 0.6644518272425249,
"grad_norm": 1.581667184829712,
"learning_rate": 1.4419435033325455e-05,
"loss": 1.6926,
"step": 16800
},
{
"epoch": 0.668406897642778,
"grad_norm": 1.6673179864883423,
"learning_rate": 1.441216069166673e-05,
"loss": 1.6806,
"step": 16900
},
{
"epoch": 0.6723619680430312,
"grad_norm": 1.8026336431503296,
"learning_rate": 1.4404842918664446e-05,
"loss": 1.6829,
"step": 17000
},
{
"epoch": 0.6763170384432843,
"grad_norm": 1.6094428300857544,
"learning_rate": 1.4397481760298542e-05,
"loss": 1.6763,
"step": 17100
},
{
"epoch": 0.6802721088435374,
"grad_norm": 1.565843105316162,
"learning_rate": 1.4390077262821559e-05,
"loss": 1.659,
"step": 17200
},
{
"epoch": 0.6842271792437905,
"grad_norm": 1.7567963600158691,
"learning_rate": 1.4382629472758346e-05,
"loss": 1.666,
"step": 17300
},
{
"epoch": 0.6881822496440436,
"grad_norm": 1.591693639755249,
"learning_rate": 1.4375138436905786e-05,
"loss": 1.6666,
"step": 17400
},
{
"epoch": 0.6921373200442967,
"grad_norm": 1.638576865196228,
"learning_rate": 1.436760420233248e-05,
"loss": 1.6554,
"step": 17500
},
{
"epoch": 0.69609239044455,
"grad_norm": 1.7055751085281372,
"learning_rate": 1.4360026816378462e-05,
"loss": 1.671,
"step": 17600
},
{
"epoch": 0.7000474608448031,
"grad_norm": 1.6867974996566772,
"learning_rate": 1.4352406326654905e-05,
"loss": 1.6722,
"step": 17700
},
{
"epoch": 0.7040025312450562,
"grad_norm": 1.7862675189971924,
"learning_rate": 1.4344742781043809e-05,
"loss": 1.6965,
"step": 17800
},
{
"epoch": 0.7079576016453093,
"grad_norm": 1.7989298105239868,
"learning_rate": 1.4337036227697715e-05,
"loss": 1.6762,
"step": 17900
},
{
"epoch": 0.7119126720455624,
"grad_norm": 1.7017799615859985,
"learning_rate": 1.4329286715039392e-05,
"loss": 1.6614,
"step": 18000
},
{
"epoch": 0.7158677424458155,
"grad_norm": 1.6665624380111694,
"learning_rate": 1.4321494291761537e-05,
"loss": 1.662,
"step": 18100
},
{
"epoch": 0.7198228128460686,
"grad_norm": 1.9826371669769287,
"learning_rate": 1.4313659006826468e-05,
"loss": 1.6638,
"step": 18200
},
{
"epoch": 0.7237778832463218,
"grad_norm": 1.7711797952651978,
"learning_rate": 1.430578090946582e-05,
"loss": 1.6625,
"step": 18300
},
{
"epoch": 0.7277329536465749,
"grad_norm": 1.8430758714675903,
"learning_rate": 1.4297860049180223e-05,
"loss": 1.6824,
"step": 18400
},
{
"epoch": 0.731688024046828,
"grad_norm": 1.8522826433181763,
"learning_rate": 1.4289896475739012e-05,
"loss": 1.6599,
"step": 18500
},
{
"epoch": 0.7356430944470812,
"grad_norm": 1.6422381401062012,
"learning_rate": 1.4281890239179897e-05,
"loss": 1.6584,
"step": 18600
},
{
"epoch": 0.7395981648473343,
"grad_norm": 1.7232320308685303,
"learning_rate": 1.4273841389808653e-05,
"loss": 1.6604,
"step": 18700
},
{
"epoch": 0.7435532352475874,
"grad_norm": 1.6723573207855225,
"learning_rate": 1.4265749978198805e-05,
"loss": 1.6654,
"step": 18800
},
{
"epoch": 0.7475083056478405,
"grad_norm": 1.776376485824585,
"learning_rate": 1.4257616055191316e-05,
"loss": 1.672,
"step": 18900
},
{
"epoch": 0.7514633760480937,
"grad_norm": 1.9001519680023193,
"learning_rate": 1.4249439671894253e-05,
"loss": 1.6473,
"step": 19000
},
{
"epoch": 0.7554184464483468,
"grad_norm": 1.5602456331253052,
"learning_rate": 1.4241220879682484e-05,
"loss": 1.6558,
"step": 19100
},
{
"epoch": 0.7593735168485999,
"grad_norm": 1.6195124387741089,
"learning_rate": 1.423295973019734e-05,
"loss": 1.6704,
"step": 19200
},
{
"epoch": 0.763328587248853,
"grad_norm": 1.7354437112808228,
"learning_rate": 1.4224656275346295e-05,
"loss": 1.6697,
"step": 19300
},
{
"epoch": 0.7672836576491061,
"grad_norm": 1.6655138731002808,
"learning_rate": 1.4216310567302648e-05,
"loss": 1.6567,
"step": 19400
},
{
"epoch": 0.7712387280493592,
"grad_norm": 1.9168803691864014,
"learning_rate": 1.4207922658505184e-05,
"loss": 1.6398,
"step": 19500
},
{
"epoch": 0.7751937984496124,
"grad_norm": 1.7918130159378052,
"learning_rate": 1.4199492601657848e-05,
"loss": 1.654,
"step": 19600
},
{
"epoch": 0.7791488688498656,
"grad_norm": 1.7283716201782227,
"learning_rate": 1.4191020449729417e-05,
"loss": 1.6534,
"step": 19700
},
{
"epoch": 0.7831039392501187,
"grad_norm": 1.860144019126892,
"learning_rate": 1.4182506255953167e-05,
"loss": 1.6553,
"step": 19800
},
{
"epoch": 0.7870590096503718,
"grad_norm": 1.7320619821548462,
"learning_rate": 1.4173950073826531e-05,
"loss": 1.6586,
"step": 19900
},
{
"epoch": 0.7910140800506249,
"grad_norm": 1.7704521417617798,
"learning_rate": 1.4165351957110772e-05,
"loss": 1.6599,
"step": 20000
},
{
"epoch": 0.794969150450878,
"grad_norm": 2.051400661468506,
"learning_rate": 1.4156711959830644e-05,
"loss": 1.6695,
"step": 20100
},
{
"epoch": 0.7989242208511311,
"grad_norm": 1.7863457202911377,
"learning_rate": 1.4148030136274043e-05,
"loss": 1.6538,
"step": 20200
},
{
"epoch": 0.8028792912513842,
"grad_norm": 1.85243558883667,
"learning_rate": 1.413930654099168e-05,
"loss": 1.63,
"step": 20300
},
{
"epoch": 0.8068343616516374,
"grad_norm": 1.7953428030014038,
"learning_rate": 1.413054122879673e-05,
"loss": 1.6374,
"step": 20400
},
{
"epoch": 0.8107894320518905,
"grad_norm": 1.8957959413528442,
"learning_rate": 1.4121734254764482e-05,
"loss": 1.6445,
"step": 20500
},
{
"epoch": 0.8147445024521437,
"grad_norm": 1.7762993574142456,
"learning_rate": 1.4112885674232011e-05,
"loss": 1.6503,
"step": 20600
},
{
"epoch": 0.8186995728523968,
"grad_norm": 1.723813772201538,
"learning_rate": 1.410399554279781e-05,
"loss": 1.6416,
"step": 20700
},
{
"epoch": 0.8226546432526499,
"grad_norm": 1.9667476415634155,
"learning_rate": 1.4095063916321456e-05,
"loss": 1.6297,
"step": 20800
},
{
"epoch": 0.826609713652903,
"grad_norm": 1.8855000734329224,
"learning_rate": 1.4086090850923246e-05,
"loss": 1.6684,
"step": 20900
},
{
"epoch": 0.8305647840531561,
"grad_norm": 1.8669531345367432,
"learning_rate": 1.4077076402983857e-05,
"loss": 1.6344,
"step": 21000
},
{
"epoch": 0.8345198544534093,
"grad_norm": 2.0968127250671387,
"learning_rate": 1.4068020629143985e-05,
"loss": 1.6524,
"step": 21100
},
{
"epoch": 0.8384749248536624,
"grad_norm": 1.712428092956543,
"learning_rate": 1.4058923586303988e-05,
"loss": 1.6333,
"step": 21200
},
{
"epoch": 0.8424299952539155,
"grad_norm": 1.7560315132141113,
"learning_rate": 1.4049785331623534e-05,
"loss": 1.6337,
"step": 21300
},
{
"epoch": 0.8463850656541686,
"grad_norm": 1.875779390335083,
"learning_rate": 1.4040605922521231e-05,
"loss": 1.6328,
"step": 21400
},
{
"epoch": 0.8503401360544217,
"grad_norm": 1.9142519235610962,
"learning_rate": 1.403138541667428e-05,
"loss": 1.6333,
"step": 21500
},
{
"epoch": 0.8542952064546749,
"grad_norm": 2.263770580291748,
"learning_rate": 1.4022123872018107e-05,
"loss": 1.635,
"step": 21600
},
{
"epoch": 0.858250276854928,
"grad_norm": 1.838889479637146,
"learning_rate": 1.4012821346745995e-05,
"loss": 1.6328,
"step": 21700
},
{
"epoch": 0.8622053472551812,
"grad_norm": 1.9877907037734985,
"learning_rate": 1.400347789930872e-05,
"loss": 1.6427,
"step": 21800
},
{
"epoch": 0.8661604176554343,
"grad_norm": 1.9885168075561523,
"learning_rate": 1.399409358841419e-05,
"loss": 1.637,
"step": 21900
},
{
"epoch": 0.8701154880556874,
"grad_norm": 1.8428804874420166,
"learning_rate": 1.3984668473027065e-05,
"loss": 1.6377,
"step": 22000
},
{
"epoch": 0.8740705584559405,
"grad_norm": 1.9875133037567139,
"learning_rate": 1.39752026123684e-05,
"loss": 1.6248,
"step": 22100
},
{
"epoch": 0.8780256288561936,
"grad_norm": 2.0525448322296143,
"learning_rate": 1.3965696065915262e-05,
"loss": 1.6501,
"step": 22200
},
{
"epoch": 0.8819806992564467,
"grad_norm": 1.9695172309875488,
"learning_rate": 1.3956148893400357e-05,
"loss": 1.6308,
"step": 22300
},
{
"epoch": 0.8859357696566998,
"grad_norm": 2.084592580795288,
"learning_rate": 1.3946561154811664e-05,
"loss": 1.6335,
"step": 22400
},
{
"epoch": 0.889890840056953,
"grad_norm": 1.7602378129959106,
"learning_rate": 1.3936932910392048e-05,
"loss": 1.6195,
"step": 22500
},
{
"epoch": 0.8938459104572062,
"grad_norm": 2.0157277584075928,
"learning_rate": 1.3927264220638889e-05,
"loss": 1.6395,
"step": 22600
},
{
"epoch": 0.8978009808574593,
"grad_norm": 2.184307336807251,
"learning_rate": 1.391755514630369e-05,
"loss": 1.6448,
"step": 22700
},
{
"epoch": 0.9017560512577124,
"grad_norm": 1.9747377634048462,
"learning_rate": 1.390780574839171e-05,
"loss": 1.6302,
"step": 22800
},
{
"epoch": 0.9057111216579655,
"grad_norm": 2.1203644275665283,
"learning_rate": 1.3898016088161575e-05,
"loss": 1.6447,
"step": 22900
},
{
"epoch": 0.9096661920582186,
"grad_norm": 2.0279908180236816,
"learning_rate": 1.3888186227124885e-05,
"loss": 1.622,
"step": 23000
},
{
"epoch": 0.9136212624584718,
"grad_norm": 1.9809517860412598,
"learning_rate": 1.3878316227045846e-05,
"loss": 1.6189,
"step": 23100
},
{
"epoch": 0.9175763328587249,
"grad_norm": 2.1499814987182617,
"learning_rate": 1.386840614994086e-05,
"loss": 1.6192,
"step": 23200
},
{
"epoch": 0.921531403258978,
"grad_norm": 1.9230985641479492,
"learning_rate": 1.3858456058078148e-05,
"loss": 1.6251,
"step": 23300
},
{
"epoch": 0.9254864736592311,
"grad_norm": 2.1623005867004395,
"learning_rate": 1.3848466013977365e-05,
"loss": 1.6145,
"step": 23400
},
{
"epoch": 0.9294415440594842,
"grad_norm": 2.0579729080200195,
"learning_rate": 1.3838436080409188e-05,
"loss": 1.628,
"step": 23500
},
{
"epoch": 0.9333966144597374,
"grad_norm": 1.9566960334777832,
"learning_rate": 1.3828366320394937e-05,
"loss": 1.6317,
"step": 23600
},
{
"epoch": 0.9373516848599905,
"grad_norm": 1.9498804807662964,
"learning_rate": 1.3818256797206177e-05,
"loss": 1.6107,
"step": 23700
},
{
"epoch": 0.9413067552602437,
"grad_norm": 1.8530848026275635,
"learning_rate": 1.3808107574364312e-05,
"loss": 1.6186,
"step": 23800
},
{
"epoch": 0.9452618256604968,
"grad_norm": 2.0309464931488037,
"learning_rate": 1.3797918715640197e-05,
"loss": 1.6208,
"step": 23900
},
{
"epoch": 0.9492168960607499,
"grad_norm": 2.0447559356689453,
"learning_rate": 1.3787690285053732e-05,
"loss": 1.6226,
"step": 24000
},
{
"epoch": 0.953171966461003,
"grad_norm": 2.198214292526245,
"learning_rate": 1.3777422346873453e-05,
"loss": 1.5954,
"step": 24100
},
{
"epoch": 0.9571270368612561,
"grad_norm": 2.0797901153564453,
"learning_rate": 1.3767114965616143e-05,
"loss": 1.6378,
"step": 24200
},
{
"epoch": 0.9610821072615092,
"grad_norm": 2.361177444458008,
"learning_rate": 1.3756768206046418e-05,
"loss": 1.6207,
"step": 24300
},
{
"epoch": 0.9650371776617623,
"grad_norm": 2.194758653640747,
"learning_rate": 1.3746382133176314e-05,
"loss": 1.6147,
"step": 24400
},
{
"epoch": 0.9689922480620154,
"grad_norm": 2.1259610652923584,
"learning_rate": 1.3735956812264893e-05,
"loss": 1.583,
"step": 24500
},
{
"epoch": 0.9729473184622687,
"grad_norm": 2.084428548812866,
"learning_rate": 1.372549230881782e-05,
"loss": 1.6257,
"step": 24600
},
{
"epoch": 0.9769023888625218,
"grad_norm": 2.0704309940338135,
"learning_rate": 1.3714988688586958e-05,
"loss": 1.6062,
"step": 24700
},
{
"epoch": 0.9808574592627749,
"grad_norm": 1.9661308526992798,
"learning_rate": 1.3704446017569953e-05,
"loss": 1.6164,
"step": 24800
},
{
"epoch": 0.984812529663028,
"grad_norm": 2.1788337230682373,
"learning_rate": 1.3693864362009821e-05,
"loss": 1.6188,
"step": 24900
},
{
"epoch": 0.9887676000632811,
"grad_norm": 1.9492045640945435,
"learning_rate": 1.3683243788394534e-05,
"loss": 1.609,
"step": 25000
},
{
"epoch": 0.9927226704635342,
"grad_norm": 2.2324581146240234,
"learning_rate": 1.3672584363456587e-05,
"loss": 1.6058,
"step": 25100
},
{
"epoch": 0.9966777408637874,
"grad_norm": 2.14666485786438,
"learning_rate": 1.3661886154172602e-05,
"loss": 1.6059,
"step": 25200
},
{
"epoch": 1.0006328112640406,
"grad_norm": 2.1909172534942627,
"learning_rate": 1.3651149227762893e-05,
"loss": 1.6006,
"step": 25300
},
{
"epoch": 1.0025726272461015,
"grad_norm": 2.143050193786621,
"learning_rate": 1.3640373651691044e-05,
"loss": 1.5379,
"step": 25400
},
{
"epoch": 1.0065305153170268,
"grad_norm": 2.6964924335479736,
"learning_rate": 1.3629559493663487e-05,
"loss": 1.507,
"step": 25500
},
{
"epoch": 1.0104884033879522,
"grad_norm": 2.556349992752075,
"learning_rate": 1.361870682162908e-05,
"loss": 1.5266,
"step": 25600
},
{
"epoch": 1.0144462914588774,
"grad_norm": 2.2218785285949707,
"learning_rate": 1.3607815703778673e-05,
"loss": 1.4867,
"step": 25700
},
{
"epoch": 1.0184041795298029,
"grad_norm": 2.5236029624938965,
"learning_rate": 1.3596886208544687e-05,
"loss": 1.4879,
"step": 25800
},
{
"epoch": 1.0223620676007283,
"grad_norm": 2.5998125076293945,
"learning_rate": 1.3585918404600679e-05,
"loss": 1.524,
"step": 25900
},
{
"epoch": 1.0263199556716536,
"grad_norm": 2.6749346256256104,
"learning_rate": 1.3574912360860912e-05,
"loss": 1.5148,
"step": 26000
},
{
"epoch": 1.030277843742579,
"grad_norm": 2.4651994705200195,
"learning_rate": 1.3563868146479921e-05,
"loss": 1.4905,
"step": 26100
},
{
"epoch": 1.0342357318135043,
"grad_norm": 2.4166133403778076,
"learning_rate": 1.3552785830852084e-05,
"loss": 1.5155,
"step": 26200
},
{
"epoch": 1.0381936198844297,
"grad_norm": 2.2493224143981934,
"learning_rate": 1.3541665483611175e-05,
"loss": 1.515,
"step": 26300
},
{
"epoch": 1.042151507955355,
"grad_norm": 2.7555856704711914,
"learning_rate": 1.3530507174629938e-05,
"loss": 1.5136,
"step": 26400
},
{
"epoch": 1.0461093960262804,
"grad_norm": 2.7666208744049072,
"learning_rate": 1.3519310974019639e-05,
"loss": 1.4917,
"step": 26500
},
{
"epoch": 1.0500672840972056,
"grad_norm": 2.7659711837768555,
"learning_rate": 1.3508076952129634e-05,
"loss": 1.5076,
"step": 26600
},
{
"epoch": 1.054025172168131,
"grad_norm": 2.5507092475891113,
"learning_rate": 1.3496805179546919e-05,
"loss": 1.5052,
"step": 26700
},
{
"epoch": 1.0579830602390565,
"grad_norm": 2.302542209625244,
"learning_rate": 1.3485495727095687e-05,
"loss": 1.5034,
"step": 26800
},
{
"epoch": 1.0619409483099818,
"grad_norm": 2.578275203704834,
"learning_rate": 1.3474148665836894e-05,
"loss": 1.4886,
"step": 26900
},
{
"epoch": 1.0658988363809072,
"grad_norm": 2.354796886444092,
"learning_rate": 1.3462764067067799e-05,
"loss": 1.506,
"step": 27000
},
{
"epoch": 1.0698567244518324,
"grad_norm": 2.5606160163879395,
"learning_rate": 1.345134200232152e-05,
"loss": 1.4931,
"step": 27100
},
{
"epoch": 1.073814612522758,
"grad_norm": 2.46881365776062,
"learning_rate": 1.343988254336659e-05,
"loss": 1.503,
"step": 27200
},
{
"epoch": 1.0777725005936831,
"grad_norm": 2.657731771469116,
"learning_rate": 1.3428385762206498e-05,
"loss": 1.5064,
"step": 27300
},
{
"epoch": 1.0817303886646086,
"grad_norm": 2.3708932399749756,
"learning_rate": 1.3416851731079244e-05,
"loss": 1.4943,
"step": 27400
},
{
"epoch": 1.0856882767355338,
"grad_norm": 2.6182353496551514,
"learning_rate": 1.340528052245688e-05,
"loss": 1.5001,
"step": 27500
},
{
"epoch": 1.0896461648064593,
"grad_norm": 2.7265477180480957,
"learning_rate": 1.3393672209045055e-05,
"loss": 1.4794,
"step": 27600
},
{
"epoch": 1.0936040528773847,
"grad_norm": 2.6186697483062744,
"learning_rate": 1.3382026863782559e-05,
"loss": 1.4878,
"step": 27700
},
{
"epoch": 1.09756194094831,
"grad_norm": 3.3754959106445312,
"learning_rate": 1.3370344559840868e-05,
"loss": 1.4769,
"step": 27800
},
{
"epoch": 1.1015198290192354,
"grad_norm": 3.0805869102478027,
"learning_rate": 1.3358625370623684e-05,
"loss": 1.5098,
"step": 27900
},
{
"epoch": 1.1054777170901606,
"grad_norm": 2.626561403274536,
"learning_rate": 1.334686936976646e-05,
"loss": 1.4965,
"step": 28000
},
{
"epoch": 1.109435605161086,
"grad_norm": 2.6148223876953125,
"learning_rate": 1.333507663113596e-05,
"loss": 1.4973,
"step": 28100
},
{
"epoch": 1.1133934932320113,
"grad_norm": 2.9437952041625977,
"learning_rate": 1.3323247228829781e-05,
"loss": 1.4861,
"step": 28200
},
{
"epoch": 1.1173513813029368,
"grad_norm": 2.7327873706817627,
"learning_rate": 1.3311381237175882e-05,
"loss": 1.4725,
"step": 28300
},
{
"epoch": 1.121309269373862,
"grad_norm": 2.8548924922943115,
"learning_rate": 1.3299478730732134e-05,
"loss": 1.4825,
"step": 28400
},
{
"epoch": 1.1252671574447874,
"grad_norm": 2.638568878173828,
"learning_rate": 1.3287539784285839e-05,
"loss": 1.4715,
"step": 28500
},
{
"epoch": 1.129225045515713,
"grad_norm": 3.1752021312713623,
"learning_rate": 1.327556447285326e-05,
"loss": 1.4749,
"step": 28600
},
{
"epoch": 1.1331829335866381,
"grad_norm": 2.8398923873901367,
"learning_rate": 1.3263552871679156e-05,
"loss": 1.4901,
"step": 28700
},
{
"epoch": 1.1371408216575636,
"grad_norm": 2.708963632583618,
"learning_rate": 1.3251505056236312e-05,
"loss": 1.4805,
"step": 28800
},
{
"epoch": 1.1410987097284888,
"grad_norm": 2.9168691635131836,
"learning_rate": 1.3239421102225049e-05,
"loss": 1.4653,
"step": 28900
},
{
"epoch": 1.1450565977994143,
"grad_norm": 2.523481845855713,
"learning_rate": 1.322730108557276e-05,
"loss": 1.4822,
"step": 29000
},
{
"epoch": 1.1490144858703395,
"grad_norm": 3.2788479328155518,
"learning_rate": 1.3215145082433436e-05,
"loss": 1.4685,
"step": 29100
},
{
"epoch": 1.152972373941265,
"grad_norm": 2.765491485595703,
"learning_rate": 1.3202953169187181e-05,
"loss": 1.4878,
"step": 29200
},
{
"epoch": 1.1569302620121902,
"grad_norm": 2.9480185508728027,
"learning_rate": 1.3190725422439734e-05,
"loss": 1.4607,
"step": 29300
},
{
"epoch": 1.1608881500831156,
"grad_norm": 2.9779725074768066,
"learning_rate": 1.3178461919021984e-05,
"loss": 1.4724,
"step": 29400
},
{
"epoch": 1.164846038154041,
"grad_norm": 2.768763780593872,
"learning_rate": 1.3166162735989497e-05,
"loss": 1.4989,
"step": 29500
},
{
"epoch": 1.1688039262249663,
"grad_norm": 3.2208807468414307,
"learning_rate": 1.3153827950622019e-05,
"loss": 1.4695,
"step": 29600
},
{
"epoch": 1.1727618142958918,
"grad_norm": 2.7532846927642822,
"learning_rate": 1.3141457640423002e-05,
"loss": 1.4841,
"step": 29700
},
{
"epoch": 1.176719702366817,
"grad_norm": 3.399897575378418,
"learning_rate": 1.3129051883119107e-05,
"loss": 1.4627,
"step": 29800
},
{
"epoch": 1.1806775904377425,
"grad_norm": 2.892542600631714,
"learning_rate": 1.311661075665973e-05,
"loss": 1.4754,
"step": 29900
},
{
"epoch": 1.1846354785086677,
"grad_norm": 2.6261606216430664,
"learning_rate": 1.310413433921649e-05,
"loss": 1.4847,
"step": 30000
},
{
"epoch": 1.1885933665795931,
"grad_norm": 2.6923210620880127,
"learning_rate": 1.3091622709182762e-05,
"loss": 1.4722,
"step": 30100
},
{
"epoch": 1.1925512546505184,
"grad_norm": 3.0266880989074707,
"learning_rate": 1.3079075945173164e-05,
"loss": 1.4922,
"step": 30200
},
{
"epoch": 1.1965091427214438,
"grad_norm": 2.9252758026123047,
"learning_rate": 1.306649412602308e-05,
"loss": 1.4692,
"step": 30300
},
{
"epoch": 1.2004670307923693,
"grad_norm": 2.757887125015259,
"learning_rate": 1.305387733078815e-05,
"loss": 1.465,
"step": 30400
},
{
"epoch": 1.2044249188632945,
"grad_norm": 3.536576271057129,
"learning_rate": 1.304122563874379e-05,
"loss": 1.4625,
"step": 30500
},
{
"epoch": 1.20838280693422,
"grad_norm": 2.765883684158325,
"learning_rate": 1.3028539129384668e-05,
"loss": 1.4509,
"step": 30600
},
{
"epoch": 1.2123406950051452,
"grad_norm": 2.8738183975219727,
"learning_rate": 1.3015817882424235e-05,
"loss": 1.4421,
"step": 30700
},
{
"epoch": 1.2162985830760706,
"grad_norm": 2.8780033588409424,
"learning_rate": 1.3003061977794207e-05,
"loss": 1.4749,
"step": 30800
},
{
"epoch": 1.2202564711469959,
"grad_norm": 3.499163866043091,
"learning_rate": 1.2990271495644059e-05,
"loss": 1.4564,
"step": 30900
},
{
"epoch": 1.2242143592179213,
"grad_norm": 2.900714159011841,
"learning_rate": 1.297744651634053e-05,
"loss": 1.4642,
"step": 31000
},
{
"epoch": 1.2281722472888466,
"grad_norm": 2.939680337905884,
"learning_rate": 1.2964587120467122e-05,
"loss": 1.4512,
"step": 31100
},
{
"epoch": 1.232130135359772,
"grad_norm": 3.2539045810699463,
"learning_rate": 1.2951693388823577e-05,
"loss": 1.4478,
"step": 31200
},
{
"epoch": 1.2360880234306975,
"grad_norm": 3.0612869262695312,
"learning_rate": 1.293876540242539e-05,
"loss": 1.4674,
"step": 31300
},
{
"epoch": 1.2400459115016227,
"grad_norm": 2.9142935276031494,
"learning_rate": 1.2925803242503287e-05,
"loss": 1.4704,
"step": 31400
},
{
"epoch": 1.2440037995725481,
"grad_norm": 3.0163750648498535,
"learning_rate": 1.291280699050271e-05,
"loss": 1.4433,
"step": 31500
},
{
"epoch": 1.2479616876434734,
"grad_norm": 2.712174654006958,
"learning_rate": 1.289977672808332e-05,
"loss": 1.4637,
"step": 31600
},
{
"epoch": 1.2519195757143988,
"grad_norm": 2.7295100688934326,
"learning_rate": 1.2886712537118475e-05,
"loss": 1.4599,
"step": 31700
},
{
"epoch": 1.2558774637853243,
"grad_norm": 3.3285765647888184,
"learning_rate": 1.2873614499694717e-05,
"loss": 1.4432,
"step": 31800
},
{
"epoch": 1.2598353518562495,
"grad_norm": 3.4205710887908936,
"learning_rate": 1.2860482698111254e-05,
"loss": 1.4571,
"step": 31900
},
{
"epoch": 1.2637932399271747,
"grad_norm": 2.865621328353882,
"learning_rate": 1.2847317214879451e-05,
"loss": 1.4434,
"step": 32000
},
{
"epoch": 1.2677511279981002,
"grad_norm": 2.7961373329162598,
"learning_rate": 1.2834118132722296e-05,
"loss": 1.4305,
"step": 32100
},
{
"epoch": 1.2717090160690256,
"grad_norm": 2.86441707611084,
"learning_rate": 1.2820885534573903e-05,
"loss": 1.4592,
"step": 32200
},
{
"epoch": 1.2756669041399509,
"grad_norm": 3.3199241161346436,
"learning_rate": 1.2807619503578964e-05,
"loss": 1.451,
"step": 32300
},
{
"epoch": 1.2796247922108763,
"grad_norm": 2.9922525882720947,
"learning_rate": 1.2794320123092248e-05,
"loss": 1.4447,
"step": 32400
},
{
"epoch": 1.2835826802818016,
"grad_norm": 2.9650540351867676,
"learning_rate": 1.2780987476678072e-05,
"loss": 1.4492,
"step": 32500
},
{
"epoch": 1.287540568352727,
"grad_norm": 3.2158126831054688,
"learning_rate": 1.2767621648109765e-05,
"loss": 1.428,
"step": 32600
},
{
"epoch": 1.2914984564236525,
"grad_norm": 3.462463617324829,
"learning_rate": 1.275422272136916e-05,
"loss": 1.4355,
"step": 32700
},
{
"epoch": 1.2954563444945777,
"grad_norm": 2.9805209636688232,
"learning_rate": 1.2740790780646048e-05,
"loss": 1.4153,
"step": 32800
},
{
"epoch": 1.299414232565503,
"grad_norm": 3.3206562995910645,
"learning_rate": 1.2727325910337665e-05,
"loss": 1.425,
"step": 32900
},
{
"epoch": 1.3033721206364284,
"grad_norm": 3.306009531021118,
"learning_rate": 1.2713828195048149e-05,
"loss": 1.4398,
"step": 33000
},
{
"epoch": 1.3073300087073538,
"grad_norm": 3.6652069091796875,
"learning_rate": 1.2700297719588015e-05,
"loss": 1.4126,
"step": 33100
},
{
"epoch": 1.311287896778279,
"grad_norm": 3.067331314086914,
"learning_rate": 1.268673456897362e-05,
"loss": 1.4453,
"step": 33200
},
{
"epoch": 1.3152457848492045,
"grad_norm": 3.4072649478912354,
"learning_rate": 1.2673138828426633e-05,
"loss": 1.4195,
"step": 33300
},
{
"epoch": 1.3192036729201297,
"grad_norm": 3.721276044845581,
"learning_rate": 1.2659510583373492e-05,
"loss": 1.4308,
"step": 33400
},
{
"epoch": 1.3231615609910552,
"grad_norm": 3.381657361984253,
"learning_rate": 1.2645849919444875e-05,
"loss": 1.4102,
"step": 33500
},
{
"epoch": 1.3271194490619806,
"grad_norm": 3.2021045684814453,
"learning_rate": 1.2632156922475153e-05,
"loss": 1.4248,
"step": 33600
},
{
"epoch": 1.3310773371329059,
"grad_norm": 3.0373260974884033,
"learning_rate": 1.2618431678501862e-05,
"loss": 1.4156,
"step": 33700
},
{
"epoch": 1.335035225203831,
"grad_norm": 2.7702269554138184,
"learning_rate": 1.2604674273765154e-05,
"loss": 1.4511,
"step": 33800
},
{
"epoch": 1.3389931132747566,
"grad_norm": 3.3153131008148193,
"learning_rate": 1.2590884794707254e-05,
"loss": 1.4523,
"step": 33900
},
{
"epoch": 1.342951001345682,
"grad_norm": 3.249516248703003,
"learning_rate": 1.2577063327971927e-05,
"loss": 1.4225,
"step": 34000
},
{
"epoch": 1.3469088894166072,
"grad_norm": 3.879835367202759,
"learning_rate": 1.2563209960403921e-05,
"loss": 1.4248,
"step": 34100
},
{
"epoch": 1.3508667774875327,
"grad_norm": 3.548116445541382,
"learning_rate": 1.2549324779048432e-05,
"loss": 1.4248,
"step": 34200
},
{
"epoch": 1.354824665558458,
"grad_norm": 3.109065294265747,
"learning_rate": 1.253540787115055e-05,
"loss": 1.4269,
"step": 34300
},
{
"epoch": 1.3587825536293834,
"grad_norm": 3.3330225944519043,
"learning_rate": 1.2521459324154708e-05,
"loss": 1.4354,
"step": 34400
},
{
"epoch": 1.3627404417003088,
"grad_norm": 3.0380284786224365,
"learning_rate": 1.2507479225704149e-05,
"loss": 1.3966,
"step": 34500
},
{
"epoch": 1.366698329771234,
"grad_norm": 3.3855364322662354,
"learning_rate": 1.2493467663640356e-05,
"loss": 1.402,
"step": 34600
},
{
"epoch": 1.3706562178421593,
"grad_norm": 3.3429582118988037,
"learning_rate": 1.247942472600251e-05,
"loss": 1.4315,
"step": 34700
},
{
"epoch": 1.3746141059130847,
"grad_norm": 3.0505242347717285,
"learning_rate": 1.2465350501026931e-05,
"loss": 1.425,
"step": 34800
},
{
"epoch": 1.3785719939840102,
"grad_norm": 3.4241063594818115,
"learning_rate": 1.245124507714654e-05,
"loss": 1.4179,
"step": 34900
},
{
"epoch": 1.3825298820549354,
"grad_norm": 3.458108901977539,
"learning_rate": 1.2437108542990274e-05,
"loss": 1.4133,
"step": 35000
},
{
"epoch": 1.3864877701258609,
"grad_norm": 3.9022340774536133,
"learning_rate": 1.2422940987382556e-05,
"loss": 1.4112,
"step": 35100
},
{
"epoch": 1.3904456581967861,
"grad_norm": 3.5141968727111816,
"learning_rate": 1.240874249934273e-05,
"loss": 1.4453,
"step": 35200
},
{
"epoch": 1.3944035462677116,
"grad_norm": 3.4254074096679688,
"learning_rate": 1.2394513168084485e-05,
"loss": 1.4096,
"step": 35300
},
{
"epoch": 1.398361434338637,
"grad_norm": 3.460205316543579,
"learning_rate": 1.2380253083015321e-05,
"loss": 1.4145,
"step": 35400
},
{
"epoch": 1.4023193224095623,
"grad_norm": 3.7515103816986084,
"learning_rate": 1.236596233373597e-05,
"loss": 1.4132,
"step": 35500
},
{
"epoch": 1.4062772104804875,
"grad_norm": 3.2565503120422363,
"learning_rate": 1.2351641010039833e-05,
"loss": 1.3945,
"step": 35600
},
{
"epoch": 1.410235098551413,
"grad_norm": 3.754737138748169,
"learning_rate": 1.2337289201912429e-05,
"loss": 1.381,
"step": 35700
},
{
"epoch": 1.4141929866223384,
"grad_norm": 3.7933449745178223,
"learning_rate": 1.2322906999530811e-05,
"loss": 1.3943,
"step": 35800
},
{
"epoch": 1.4181508746932636,
"grad_norm": 3.108177900314331,
"learning_rate": 1.2308494493263014e-05,
"loss": 1.4127,
"step": 35900
},
{
"epoch": 1.422108762764189,
"grad_norm": 3.393486499786377,
"learning_rate": 1.2294051773667482e-05,
"loss": 1.3921,
"step": 36000
},
{
"epoch": 1.4260666508351143,
"grad_norm": 3.9485793113708496,
"learning_rate": 1.22795789314925e-05,
"loss": 1.42,
"step": 36100
},
{
"epoch": 1.4300245389060398,
"grad_norm": 3.353940725326538,
"learning_rate": 1.2265076057675615e-05,
"loss": 1.412,
"step": 36200
},
{
"epoch": 1.4339824269769652,
"grad_norm": 3.861928939819336,
"learning_rate": 1.2250543243343082e-05,
"loss": 1.3952,
"step": 36300
},
{
"epoch": 1.4379403150478904,
"grad_norm": 2.9782791137695312,
"learning_rate": 1.2235980579809283e-05,
"loss": 1.3872,
"step": 36400
},
{
"epoch": 1.4418982031188157,
"grad_norm": 3.552558660507202,
"learning_rate": 1.2221388158576142e-05,
"loss": 1.3855,
"step": 36500
},
{
"epoch": 1.4458560911897411,
"grad_norm": 3.034158229827881,
"learning_rate": 1.2206766071332568e-05,
"loss": 1.4028,
"step": 36600
},
{
"epoch": 1.4498139792606666,
"grad_norm": 3.669677495956421,
"learning_rate": 1.219211440995387e-05,
"loss": 1.3865,
"step": 36700
},
{
"epoch": 1.4537718673315918,
"grad_norm": 3.4838759899139404,
"learning_rate": 1.2177433266501182e-05,
"loss": 1.3987,
"step": 36800
},
{
"epoch": 1.4577297554025173,
"grad_norm": 3.4691314697265625,
"learning_rate": 1.2162722733220877e-05,
"loss": 1.3939,
"step": 36900
},
{
"epoch": 1.4616876434734425,
"grad_norm": 3.5910284519195557,
"learning_rate": 1.2147982902544004e-05,
"loss": 1.3972,
"step": 37000
},
{
"epoch": 1.465645531544368,
"grad_norm": 3.2121059894561768,
"learning_rate": 1.2133213867085686e-05,
"loss": 1.3762,
"step": 37100
},
{
"epoch": 1.4696034196152934,
"grad_norm": 3.8289687633514404,
"learning_rate": 1.2118415719644557e-05,
"loss": 1.4032,
"step": 37200
},
{
"epoch": 1.4735613076862186,
"grad_norm": 3.597191095352173,
"learning_rate": 1.2103588553202167e-05,
"loss": 1.3925,
"step": 37300
},
{
"epoch": 1.4775191957571439,
"grad_norm": 3.4253151416778564,
"learning_rate": 1.2088732460922407e-05,
"loss": 1.3715,
"step": 37400
},
{
"epoch": 1.4814770838280693,
"grad_norm": 3.624340772628784,
"learning_rate": 1.2073847536150912e-05,
"loss": 1.387,
"step": 37500
},
{
"epoch": 1.4854349718989948,
"grad_norm": 3.2783761024475098,
"learning_rate": 1.2058933872414484e-05,
"loss": 1.3837,
"step": 37600
},
{
"epoch": 1.48939285996992,
"grad_norm": 3.377274990081787,
"learning_rate": 1.2043991563420501e-05,
"loss": 1.3515,
"step": 37700
},
{
"epoch": 1.4933507480408454,
"grad_norm": 3.676497459411621,
"learning_rate": 1.2029020703056327e-05,
"loss": 1.3647,
"step": 37800
},
{
"epoch": 1.4973086361117707,
"grad_norm": 4.441483020782471,
"learning_rate": 1.2014021385388727e-05,
"loss": 1.3594,
"step": 37900
},
{
"epoch": 1.5012665241826961,
"grad_norm": 4.011296272277832,
"learning_rate": 1.1998993704663267e-05,
"loss": 1.3855,
"step": 38000
},
{
"epoch": 1.5052244122536216,
"grad_norm": 3.3986339569091797,
"learning_rate": 1.1983937755303735e-05,
"loss": 1.3549,
"step": 38100
},
{
"epoch": 1.5091823003245468,
"grad_norm": 3.2860589027404785,
"learning_rate": 1.1968853631911532e-05,
"loss": 1.3481,
"step": 38200
},
{
"epoch": 1.513140188395472,
"grad_norm": 4.623264789581299,
"learning_rate": 1.1953741429265089e-05,
"loss": 1.3609,
"step": 38300
},
{
"epoch": 1.5170980764663975,
"grad_norm": 3.3357603549957275,
"learning_rate": 1.1938601242319269e-05,
"loss": 1.373,
"step": 38400
},
{
"epoch": 1.521055964537323,
"grad_norm": 3.6516709327697754,
"learning_rate": 1.1923433166204768e-05,
"loss": 1.3568,
"step": 38500
},
{
"epoch": 1.5250138526082484,
"grad_norm": 4.045721530914307,
"learning_rate": 1.1908237296227522e-05,
"loss": 1.3419,
"step": 38600
},
{
"epoch": 1.5289717406791736,
"grad_norm": 3.331871271133423,
"learning_rate": 1.1893013727868098e-05,
"loss": 1.3575,
"step": 38700
},
{
"epoch": 1.5329296287500989,
"grad_norm": 3.959519624710083,
"learning_rate": 1.1877762556781109e-05,
"loss": 1.3464,
"step": 38800
},
{
"epoch": 1.5368875168210243,
"grad_norm": 4.424190998077393,
"learning_rate": 1.1862483878794596e-05,
"loss": 1.3593,
"step": 38900
},
{
"epoch": 1.5408454048919498,
"grad_norm": 4.052654266357422,
"learning_rate": 1.1847177789909441e-05,
"loss": 1.3474,
"step": 39000
},
{
"epoch": 1.544803292962875,
"grad_norm": 3.552598237991333,
"learning_rate": 1.1831844386298758e-05,
"loss": 1.346,
"step": 39100
},
{
"epoch": 1.5487611810338002,
"grad_norm": 3.979213237762451,
"learning_rate": 1.1816483764307286e-05,
"loss": 1.3557,
"step": 39200
},
{
"epoch": 1.5527190691047257,
"grad_norm": 4.073390960693359,
"learning_rate": 1.1801096020450786e-05,
"loss": 1.3658,
"step": 39300
},
{
"epoch": 1.5566769571756511,
"grad_norm": 4.211179256439209,
"learning_rate": 1.1785681251415431e-05,
"loss": 1.346,
"step": 39400
},
{
"epoch": 1.5606348452465766,
"grad_norm": 3.6185340881347656,
"learning_rate": 1.177023955405721e-05,
"loss": 1.3686,
"step": 39500
},
{
"epoch": 1.5645927333175018,
"grad_norm": 3.7389111518859863,
"learning_rate": 1.1754771025401307e-05,
"loss": 1.3536,
"step": 39600
},
{
"epoch": 1.568550621388427,
"grad_norm": 4.2574357986450195,
"learning_rate": 1.1739275762641494e-05,
"loss": 1.352,
"step": 39700
},
{
"epoch": 1.5725085094593525,
"grad_norm": 3.516805410385132,
"learning_rate": 1.1723753863139529e-05,
"loss": 1.3411,
"step": 39800
},
{
"epoch": 1.576466397530278,
"grad_norm": 3.5958383083343506,
"learning_rate": 1.1708205424424521e-05,
"loss": 1.3433,
"step": 39900
},
{
"epoch": 1.5804242856012032,
"grad_norm": 3.995814323425293,
"learning_rate": 1.1692630544192354e-05,
"loss": 1.3529,
"step": 40000
},
{
"epoch": 1.5843821736721284,
"grad_norm": 3.817218780517578,
"learning_rate": 1.1677029320305041e-05,
"loss": 1.3469,
"step": 40100
},
{
"epoch": 1.5883400617430539,
"grad_norm": 4.439276695251465,
"learning_rate": 1.1661401850790119e-05,
"loss": 1.3466,
"step": 40200
},
{
"epoch": 1.5922979498139793,
"grad_norm": 3.7181553840637207,
"learning_rate": 1.1645748233840044e-05,
"loss": 1.3476,
"step": 40300
},
{
"epoch": 1.5962558378849048,
"grad_norm": 10.218334197998047,
"learning_rate": 1.1630068567811557e-05,
"loss": 1.3602,
"step": 40400
},
{
"epoch": 1.60021372595583,
"grad_norm": 4.133950710296631,
"learning_rate": 1.1614362951225075e-05,
"loss": 1.3485,
"step": 40500
},
{
"epoch": 1.6041716140267552,
"grad_norm": 4.007839202880859,
"learning_rate": 1.1598631482764074e-05,
"loss": 1.3594,
"step": 40600
},
{
"epoch": 1.6081295020976807,
"grad_norm": 4.194820404052734,
"learning_rate": 1.1582874261274463e-05,
"loss": 1.3383,
"step": 40700
},
{
"epoch": 1.6120873901686061,
"grad_norm": 4.193638801574707,
"learning_rate": 1.1567091385763965e-05,
"loss": 1.3715,
"step": 40800
},
{
"epoch": 1.6160452782395314,
"grad_norm": 3.5800745487213135,
"learning_rate": 1.15512829554015e-05,
"loss": 1.3494,
"step": 40900
},
{
"epoch": 1.6200031663104566,
"grad_norm": 4.548177719116211,
"learning_rate": 1.1535449069516552e-05,
"loss": 1.3442,
"step": 41000
},
{
"epoch": 1.623961054381382,
"grad_norm": 3.7496066093444824,
"learning_rate": 1.1519589827598553e-05,
"loss": 1.329,
"step": 41100
},
{
"epoch": 1.6279189424523075,
"grad_norm": 4.815052032470703,
"learning_rate": 1.1503705329296252e-05,
"loss": 1.32,
"step": 41200
},
{
"epoch": 1.631876830523233,
"grad_norm": 4.1927103996276855,
"learning_rate": 1.14877956744171e-05,
"loss": 1.3172,
"step": 41300
},
{
"epoch": 1.6358347185941582,
"grad_norm": 3.9431440830230713,
"learning_rate": 1.1471860962926604e-05,
"loss": 1.3271,
"step": 41400
},
{
"epoch": 1.6397926066650834,
"grad_norm": 4.615567684173584,
"learning_rate": 1.1455901294947722e-05,
"loss": 1.348,
"step": 41500
},
{
"epoch": 1.6437504947360089,
"grad_norm": 4.151221752166748,
"learning_rate": 1.143991677076021e-05,
"loss": 1.3336,
"step": 41600
},
{
"epoch": 1.6477083828069343,
"grad_norm": 4.409358978271484,
"learning_rate": 1.142390749080001e-05,
"loss": 1.3254,
"step": 41700
},
{
"epoch": 1.6516662708778596,
"grad_norm": 4.490970134735107,
"learning_rate": 1.140787355565861e-05,
"loss": 1.3303,
"step": 41800
},
{
"epoch": 1.6556241589487848,
"grad_norm": 4.116312026977539,
"learning_rate": 1.1391815066082418e-05,
"loss": 1.3315,
"step": 41900
},
{
"epoch": 1.6595820470197102,
"grad_norm": 4.251399517059326,
"learning_rate": 1.1375732122972124e-05,
"loss": 1.3243,
"step": 42000
},
{
"epoch": 1.6635399350906357,
"grad_norm": 4.184506416320801,
"learning_rate": 1.1359624827382062e-05,
"loss": 1.3226,
"step": 42100
},
{
"epoch": 1.6674978231615611,
"grad_norm": 4.628664970397949,
"learning_rate": 1.134349328051959e-05,
"loss": 1.3399,
"step": 42200
},
{
"epoch": 1.6714557112324864,
"grad_norm": 4.529860496520996,
"learning_rate": 1.132733758374444e-05,
"loss": 1.2967,
"step": 42300
},
{
"epoch": 1.6754135993034116,
"grad_norm": 4.5048699378967285,
"learning_rate": 1.1311157838568083e-05,
"loss": 1.3255,
"step": 42400
},
{
"epoch": 1.679371487374337,
"grad_norm": 4.321528911590576,
"learning_rate": 1.1294954146653094e-05,
"loss": 1.311,
"step": 42500
},
{
"epoch": 1.6833293754452625,
"grad_norm": 4.919022083282471,
"learning_rate": 1.1278726609812523e-05,
"loss": 1.3219,
"step": 42600
},
{
"epoch": 1.6872872635161877,
"grad_norm": 4.146111965179443,
"learning_rate": 1.126247533000923e-05,
"loss": 1.298,
"step": 42700
},
{
"epoch": 1.691245151587113,
"grad_norm": 4.071747779846191,
"learning_rate": 1.1246200409355271e-05,
"loss": 1.313,
"step": 42800
},
{
"epoch": 1.6952030396580384,
"grad_norm": 3.8871426582336426,
"learning_rate": 1.1229901950111245e-05,
"loss": 1.3176,
"step": 42900
},
{
"epoch": 1.6991609277289639,
"grad_norm": 3.9479401111602783,
"learning_rate": 1.1213580054685644e-05,
"loss": 1.3112,
"step": 43000
},
{
"epoch": 1.7031188157998893,
"grad_norm": 4.039346694946289,
"learning_rate": 1.1197234825634222e-05,
"loss": 1.3109,
"step": 43100
},
{
"epoch": 1.7070767038708146,
"grad_norm": 4.356393814086914,
"learning_rate": 1.1180866365659346e-05,
"loss": 1.3202,
"step": 43200
},
{
"epoch": 1.7110345919417398,
"grad_norm": 4.11832857131958,
"learning_rate": 1.1164474777609351e-05,
"loss": 1.313,
"step": 43300
},
{
"epoch": 1.7149924800126652,
"grad_norm": 4.101129531860352,
"learning_rate": 1.1148060164477887e-05,
"loss": 1.2968,
"step": 43400
},
{
"epoch": 1.7189503680835907,
"grad_norm": 3.728778600692749,
"learning_rate": 1.1131622629403289e-05,
"loss": 1.2869,
"step": 43500
},
{
"epoch": 1.722908256154516,
"grad_norm": 3.846654176712036,
"learning_rate": 1.1115162275667909e-05,
"loss": 1.3054,
"step": 43600
},
{
"epoch": 1.7268661442254412,
"grad_norm": 4.967803478240967,
"learning_rate": 1.1098679206697474e-05,
"loss": 1.3165,
"step": 43700
},
{
"epoch": 1.7308240322963666,
"grad_norm": 4.610755443572998,
"learning_rate": 1.1082173526060454e-05,
"loss": 1.3234,
"step": 43800
},
{
"epoch": 1.734781920367292,
"grad_norm": 4.377742290496826,
"learning_rate": 1.1065645337467375e-05,
"loss": 1.3261,
"step": 43900
},
{
"epoch": 1.7387398084382175,
"grad_norm": 5.010995864868164,
"learning_rate": 1.1049094744770201e-05,
"loss": 1.3026,
"step": 44000
},
{
"epoch": 1.7426976965091427,
"grad_norm": 4.31913423538208,
"learning_rate": 1.1032521851961665e-05,
"loss": 1.2697,
"step": 44100
},
{
"epoch": 1.746655584580068,
"grad_norm": 4.2657060623168945,
"learning_rate": 1.1015926763174617e-05,
"loss": 1.3101,
"step": 44200
},
{
"epoch": 1.7506134726509934,
"grad_norm": 3.801684617996216,
"learning_rate": 1.0999309582681372e-05,
"loss": 1.268,
"step": 44300
},
{
"epoch": 1.7545713607219189,
"grad_norm": 4.505929946899414,
"learning_rate": 1.0982670414893057e-05,
"loss": 1.2957,
"step": 44400
},
{
"epoch": 1.7585292487928441,
"grad_norm": 3.837562084197998,
"learning_rate": 1.0966009364358948e-05,
"loss": 1.3078,
"step": 44500
},
{
"epoch": 1.7624871368637693,
"grad_norm": 4.854923248291016,
"learning_rate": 1.0949326535765823e-05,
"loss": 1.3119,
"step": 44600
},
{
"epoch": 1.7664450249346948,
"grad_norm": 5.621912002563477,
"learning_rate": 1.0932622033937294e-05,
"loss": 1.2847,
"step": 44700
},
{
"epoch": 1.7704029130056202,
"grad_norm": 4.009350776672363,
"learning_rate": 1.0915895963833152e-05,
"loss": 1.3006,
"step": 44800
},
{
"epoch": 1.7743608010765457,
"grad_norm": 3.6888113021850586,
"learning_rate": 1.0899148430548716e-05,
"loss": 1.3032,
"step": 44900
},
{
"epoch": 1.778318689147471,
"grad_norm": 4.511534690856934,
"learning_rate": 1.0882379539314155e-05,
"loss": 1.2615,
"step": 45000
},
{
"epoch": 1.7822765772183962,
"grad_norm": 5.086874485015869,
"learning_rate": 1.0865589395493845e-05,
"loss": 1.2634,
"step": 45100
},
{
"epoch": 1.7862344652893216,
"grad_norm": 4.865400791168213,
"learning_rate": 1.0848778104585692e-05,
"loss": 1.2876,
"step": 45200
},
{
"epoch": 1.790192353360247,
"grad_norm": 4.537430763244629,
"learning_rate": 1.0831945772220487e-05,
"loss": 1.2764,
"step": 45300
},
{
"epoch": 1.7941502414311723,
"grad_norm": 4.448334693908691,
"learning_rate": 1.0815092504161214e-05,
"loss": 1.2814,
"step": 45400
},
{
"epoch": 1.7981081295020975,
"grad_norm": 4.650451183319092,
"learning_rate": 1.0798218406302422e-05,
"loss": 1.2819,
"step": 45500
},
{
"epoch": 1.802066017573023,
"grad_norm": 4.031219005584717,
"learning_rate": 1.0781323584669524e-05,
"loss": 1.2729,
"step": 45600
},
{
"epoch": 1.8060239056439484,
"grad_norm": 4.477336883544922,
"learning_rate": 1.0764408145418157e-05,
"loss": 1.2586,
"step": 45700
},
{
"epoch": 1.8099817937148739,
"grad_norm": 4.118893146514893,
"learning_rate": 1.0747472194833506e-05,
"loss": 1.2591,
"step": 45800
},
{
"epoch": 1.8139396817857991,
"grad_norm": 4.766265392303467,
"learning_rate": 1.073051583932963e-05,
"loss": 1.2693,
"step": 45900
},
{
"epoch": 1.8178975698567243,
"grad_norm": 5.545733451843262,
"learning_rate": 1.0713539185448795e-05,
"loss": 1.2691,
"step": 46000
},
{
"epoch": 1.8218554579276498,
"grad_norm": 4.723430633544922,
"learning_rate": 1.069654233986082e-05,
"loss": 1.2582,
"step": 46100
},
{
"epoch": 1.8258133459985753,
"grad_norm": 4.899106025695801,
"learning_rate": 1.0679525409362387e-05,
"loss": 1.2802,
"step": 46200
},
{
"epoch": 1.8297712340695005,
"grad_norm": 4.531938552856445,
"learning_rate": 1.066248850087638e-05,
"loss": 1.2683,
"step": 46300
},
{
"epoch": 1.8337291221404257,
"grad_norm": 4.174386501312256,
"learning_rate": 1.0645431721451212e-05,
"loss": 1.2538,
"step": 46400
},
{
"epoch": 1.8376870102113512,
"grad_norm": 4.827451229095459,
"learning_rate": 1.0628355178260147e-05,
"loss": 1.2571,
"step": 46500
},
{
"epoch": 1.8416448982822766,
"grad_norm": 4.749929904937744,
"learning_rate": 1.0611258978600638e-05,
"loss": 1.2759,
"step": 46600
},
{
"epoch": 1.845602786353202,
"grad_norm": 4.659051418304443,
"learning_rate": 1.0594143229893643e-05,
"loss": 1.2698,
"step": 46700
},
{
"epoch": 1.8495606744241273,
"grad_norm": 4.492525100708008,
"learning_rate": 1.057700803968295e-05,
"loss": 1.2583,
"step": 46800
},
{
"epoch": 1.8535185624950525,
"grad_norm": 4.04518461227417,
"learning_rate": 1.0559853515634509e-05,
"loss": 1.268,
"step": 46900
},
{
"epoch": 1.857476450565978,
"grad_norm": 5.029372215270996,
"learning_rate": 1.054267976553575e-05,
"loss": 1.238,
"step": 47000
},
{
"epoch": 1.8614343386369034,
"grad_norm": 4.29434871673584,
"learning_rate": 1.05254868972949e-05,
"loss": 1.2431,
"step": 47100
},
{
"epoch": 1.8653922267078287,
"grad_norm": 4.874353408813477,
"learning_rate": 1.050827501894032e-05,
"loss": 1.2339,
"step": 47200
},
{
"epoch": 1.869350114778754,
"grad_norm": 4.865941047668457,
"learning_rate": 1.0491044238619817e-05,
"loss": 1.2427,
"step": 47300
},
{
"epoch": 1.8733080028496794,
"grad_norm": 4.548977375030518,
"learning_rate": 1.0473794664599957e-05,
"loss": 1.2586,
"step": 47400
},
{
"epoch": 1.8772658909206048,
"grad_norm": 5.584561824798584,
"learning_rate": 1.0456526405265402e-05,
"loss": 1.2388,
"step": 47500
},
{
"epoch": 1.8812237789915303,
"grad_norm": 4.570620536804199,
"learning_rate": 1.0439239569118215e-05,
"loss": 1.2499,
"step": 47600
},
{
"epoch": 1.8851816670624555,
"grad_norm": 4.9521379470825195,
"learning_rate": 1.0421934264777186e-05,
"loss": 1.2486,
"step": 47700
},
{
"epoch": 1.8891395551333807,
"grad_norm": 4.744478702545166,
"learning_rate": 1.0404610600977141e-05,
"loss": 1.2428,
"step": 47800
},
{
"epoch": 1.8930974432043062,
"grad_norm": 4.681623935699463,
"learning_rate": 1.0387268686568275e-05,
"loss": 1.2577,
"step": 47900
},
{
"epoch": 1.8970553312752316,
"grad_norm": 4.406890392303467,
"learning_rate": 1.0369908630515445e-05,
"loss": 1.2367,
"step": 48000
},
{
"epoch": 1.9010132193461569,
"grad_norm": 4.650542259216309,
"learning_rate": 1.0352530541897507e-05,
"loss": 1.243,
"step": 48100
},
{
"epoch": 1.904971107417082,
"grad_norm": 5.0188164710998535,
"learning_rate": 1.0335134529906619e-05,
"loss": 1.2222,
"step": 48200
},
{
"epoch": 1.9089289954880075,
"grad_norm": 4.498706340789795,
"learning_rate": 1.0317720703847554e-05,
"loss": 1.2508,
"step": 48300
},
{
"epoch": 1.912886883558933,
"grad_norm": 4.909203052520752,
"learning_rate": 1.0300289173137021e-05,
"loss": 1.2241,
"step": 48400
},
{
"epoch": 1.9168447716298584,
"grad_norm": 4.85788631439209,
"learning_rate": 1.0282840047302967e-05,
"loss": 1.2268,
"step": 48500
},
{
"epoch": 1.9208026597007837,
"grad_norm": 4.537557601928711,
"learning_rate": 1.0265373435983907e-05,
"loss": 1.2515,
"step": 48600
},
{
"epoch": 1.924760547771709,
"grad_norm": 4.660990238189697,
"learning_rate": 1.0247889448928208e-05,
"loss": 1.2531,
"step": 48700
},
{
"epoch": 1.9287184358426344,
"grad_norm": 4.9931511878967285,
"learning_rate": 1.0230388195993424e-05,
"loss": 1.2336,
"step": 48800
},
{
"epoch": 1.9326763239135598,
"grad_norm": 4.516580581665039,
"learning_rate": 1.0212869787145594e-05,
"loss": 1.2292,
"step": 48900
},
{
"epoch": 1.936634211984485,
"grad_norm": 4.5982255935668945,
"learning_rate": 1.0195334332458552e-05,
"loss": 1.2417,
"step": 49000
},
{
"epoch": 1.9405921000554103,
"grad_norm": 5.071137428283691,
"learning_rate": 1.0177781942113238e-05,
"loss": 1.2194,
"step": 49100
},
{
"epoch": 1.9445499881263357,
"grad_norm": 4.76341438293457,
"learning_rate": 1.0160212726397001e-05,
"loss": 1.2207,
"step": 49200
},
{
"epoch": 1.9485078761972612,
"grad_norm": 4.945827484130859,
"learning_rate": 1.0142626795702916e-05,
"loss": 1.208,
"step": 49300
},
{
"epoch": 1.9524657642681866,
"grad_norm": 5.068126678466797,
"learning_rate": 1.0125024260529075e-05,
"loss": 1.2372,
"step": 49400
},
{
"epoch": 1.9564236523391119,
"grad_norm": 5.238717079162598,
"learning_rate": 1.010740523147791e-05,
"loss": 1.2287,
"step": 49500
},
{
"epoch": 1.960381540410037,
"grad_norm": 4.586404800415039,
"learning_rate": 1.008976981925548e-05,
"loss": 1.2023,
"step": 49600
},
{
"epoch": 1.9643394284809625,
"grad_norm": 5.284154415130615,
"learning_rate": 1.0072118134670792e-05,
"loss": 1.2254,
"step": 49700
},
{
"epoch": 1.968297316551888,
"grad_norm": 4.639484882354736,
"learning_rate": 1.0054450288635098e-05,
"loss": 1.2298,
"step": 49800
},
{
"epoch": 1.9722552046228132,
"grad_norm": 5.642242908477783,
"learning_rate": 1.003676639216119e-05,
"loss": 1.2196,
"step": 49900
},
{
"epoch": 1.9762130926937385,
"grad_norm": 5.445943355560303,
"learning_rate": 1.0019066556362718e-05,
"loss": 1.2035,
"step": 50000
},
{
"epoch": 1.980170980764664,
"grad_norm": 5.127743244171143,
"learning_rate": 1.000135089245348e-05,
"loss": 1.2281,
"step": 50100
},
{
"epoch": 1.9841288688355894,
"grad_norm": 4.978102684020996,
"learning_rate": 9.98361951174673e-06,
"loss": 1.22,
"step": 50200
},
{
"epoch": 1.9880867569065148,
"grad_norm": 4.681839466094971,
"learning_rate": 9.965872525654468e-06,
"loss": 1.2011,
"step": 50300
},
{
"epoch": 1.99204464497744,
"grad_norm": 5.249551296234131,
"learning_rate": 9.948110045686763e-06,
"loss": 1.1931,
"step": 50400
},
{
"epoch": 1.9960025330483653,
"grad_norm": 4.80012845993042,
"learning_rate": 9.930332183451022e-06,
"loss": 1.2282,
"step": 50500
}
],
"logging_steps": 100,
"max_steps": 126330,
"num_input_tokens_seen": 0,
"num_train_epochs": 5,
"save_steps": 25266,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": false
},
"attributes": {}
}
},
"total_flos": 5.9517716845075366e+19,
"train_batch_size": 2,
"trial_name": null,
"trial_params": null
}
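
The trailing fields above tie together arithmetically: "max_steps" (126330) over "num_train_epochs" (5) gives 25266 optimizer steps per epoch, which matches "save_steps" and the final logged epoch of roughly 1.997 at global step 50532. The short Python sketch below is an illustration only, not part of the checkpoint; it assumes this JSON has been saved locally as trainer_state.json and uses only the standard library to read those fields and pull out the logged loss curve.

# Minimal sketch (assumption: file saved as trainer_state.json in the working directory)
import json

with open("trainer_state.json") as f:
    state = json.load(f)

# Each log_history entry records epoch, grad_norm, learning_rate, loss, step.
logged = [e for e in state["log_history"] if "loss" in e]
steps = [e["step"] for e in logged]
losses = [e["loss"] for e in logged]

steps_per_epoch = state["max_steps"] // state["num_train_epochs"]  # 126330 // 5 = 25266
print("steps per epoch:", steps_per_epoch)                          # matches save_steps
print("last logged step:", steps[-1], "loss:", losses[-1])          # 50500, 1.2282
print("approx. epochs completed:", steps[-1] / steps_per_epoch)     # about 1.999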