{ "best_metric": null, "best_model_checkpoint": null, "epoch": 1.9972690572310614, "eval_steps": 500, "global_step": 50532, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.003955070400253125, "grad_norm": 0.3985116183757782, "learning_rate": 1.5e-06, "loss": 2.3445, "step": 100 }, { "epoch": 0.00791014080050625, "grad_norm": 0.5253750681877136, "learning_rate": 3e-06, "loss": 2.2315, "step": 200 }, { "epoch": 0.011865211200759373, "grad_norm": 0.7505590319633484, "learning_rate": 4.5e-06, "loss": 2.0777, "step": 300 }, { "epoch": 0.0158202816010125, "grad_norm": 0.7067473530769348, "learning_rate": 6e-06, "loss": 2.0165, "step": 400 }, { "epoch": 0.019775352001265623, "grad_norm": 0.8964686393737793, "learning_rate": 7.5e-06, "loss": 1.9872, "step": 500 }, { "epoch": 0.023730422401518746, "grad_norm": 1.0233873128890991, "learning_rate": 9e-06, "loss": 1.9621, "step": 600 }, { "epoch": 0.02768549280177187, "grad_norm": 1.0008875131607056, "learning_rate": 1.05e-05, "loss": 1.9519, "step": 700 }, { "epoch": 0.031640563202025, "grad_norm": 0.9941542148590088, "learning_rate": 1.2e-05, "loss": 1.922, "step": 800 }, { "epoch": 0.03559563360227812, "grad_norm": 1.0296485424041748, "learning_rate": 1.3500000000000001e-05, "loss": 1.9305, "step": 900 }, { "epoch": 0.039550704002531245, "grad_norm": 1.056834101676941, "learning_rate": 1.5e-05, "loss": 1.9199, "step": 1000 }, { "epoch": 0.04350577440278437, "grad_norm": 1.1616301536560059, "learning_rate": 1.4999976437535872e-05, "loss": 1.9058, "step": 1100 }, { "epoch": 0.04746084480303749, "grad_norm": 1.0753331184387207, "learning_rate": 1.4999905750291538e-05, "loss": 1.9321, "step": 1200 }, { "epoch": 0.05141591520329062, "grad_norm": 1.3542801141738892, "learning_rate": 1.4999787938711148e-05, "loss": 1.8942, "step": 1300 }, { "epoch": 0.05537098560354374, "grad_norm": 1.2391093969345093, "learning_rate": 1.499962300353495e-05, "loss": 1.8677, "step": 1400 }, { "epoch": 0.05932605600379687, "grad_norm": 1.0844732522964478, "learning_rate": 1.4999410945799291e-05, "loss": 1.8723, "step": 1500 }, { "epoch": 0.06328112640405, "grad_norm": 1.0814001560211182, "learning_rate": 1.499915176683659e-05, "loss": 1.8715, "step": 1600 }, { "epoch": 0.06723619680430312, "grad_norm": 1.2003209590911865, "learning_rate": 1.4998845468275357e-05, "loss": 1.8739, "step": 1700 }, { "epoch": 0.07119126720455624, "grad_norm": 1.2625234127044678, "learning_rate": 1.4998492052040163e-05, "loss": 1.8475, "step": 1800 }, { "epoch": 0.07514633760480936, "grad_norm": 1.144286870956421, "learning_rate": 1.499809152035164e-05, "loss": 1.8517, "step": 1900 }, { "epoch": 0.07910140800506249, "grad_norm": 1.2154852151870728, "learning_rate": 1.4997643875726454e-05, "loss": 1.8262, "step": 2000 }, { "epoch": 0.08305647840531562, "grad_norm": 1.389930009841919, "learning_rate": 1.4997149120977304e-05, "loss": 1.8419, "step": 2100 }, { "epoch": 0.08701154880556874, "grad_norm": 1.265163779258728, "learning_rate": 1.4996607259212892e-05, "loss": 1.8344, "step": 2200 }, { "epoch": 0.09096661920582186, "grad_norm": 1.1876202821731567, "learning_rate": 1.4996018293837914e-05, "loss": 1.8356, "step": 2300 }, { "epoch": 0.09492168960607499, "grad_norm": 1.2677972316741943, "learning_rate": 1.4995382228553028e-05, "loss": 1.8463, "step": 2400 }, { "epoch": 0.09887676000632811, "grad_norm": 1.206444501876831, "learning_rate": 1.4994699067354838e-05, "loss": 1.8256, "step": 2500 }, { "epoch": 0.10283183040658124, "grad_norm": 1.2112213373184204, "learning_rate": 1.4993968814535867e-05, "loss": 1.8627, "step": 2600 }, { "epoch": 0.10678690080683437, "grad_norm": 1.1587677001953125, "learning_rate": 1.4993191474684532e-05, "loss": 1.8458, "step": 2700 }, { "epoch": 0.11074197120708748, "grad_norm": 1.2365622520446777, "learning_rate": 1.4992367052685107e-05, "loss": 1.8383, "step": 2800 }, { "epoch": 0.11469704160734061, "grad_norm": 1.179803490638733, "learning_rate": 1.4991495553717708e-05, "loss": 1.8305, "step": 2900 }, { "epoch": 0.11865211200759374, "grad_norm": 1.336897850036621, "learning_rate": 1.499057698325824e-05, "loss": 1.8381, "step": 3000 }, { "epoch": 0.12260718240784686, "grad_norm": 1.346336007118225, "learning_rate": 1.498961134707838e-05, "loss": 1.8467, "step": 3100 }, { "epoch": 0.1265622528081, "grad_norm": 1.2718663215637207, "learning_rate": 1.4988598651245534e-05, "loss": 1.8076, "step": 3200 }, { "epoch": 0.1305173232083531, "grad_norm": 1.1807332038879395, "learning_rate": 1.4987538902122799e-05, "loss": 1.8212, "step": 3300 }, { "epoch": 0.13447239360860624, "grad_norm": 1.4749420881271362, "learning_rate": 1.4986432106368917e-05, "loss": 1.8039, "step": 3400 }, { "epoch": 0.13842746400885936, "grad_norm": 1.2670525312423706, "learning_rate": 1.4985278270938247e-05, "loss": 1.8137, "step": 3500 }, { "epoch": 0.14238253440911247, "grad_norm": 1.5255069732666016, "learning_rate": 1.4984077403080711e-05, "loss": 1.8161, "step": 3600 }, { "epoch": 0.1463376048093656, "grad_norm": 1.1995693445205688, "learning_rate": 1.4982829510341751e-05, "loss": 1.8104, "step": 3700 }, { "epoch": 0.15029267520961873, "grad_norm": 1.3007076978683472, "learning_rate": 1.4981534600562279e-05, "loss": 1.7952, "step": 3800 }, { "epoch": 0.15424774560987187, "grad_norm": 1.4348576068878174, "learning_rate": 1.4980192681878635e-05, "loss": 1.819, "step": 3900 }, { "epoch": 0.15820281601012498, "grad_norm": 1.3245015144348145, "learning_rate": 1.4978803762722526e-05, "loss": 1.8043, "step": 4000 }, { "epoch": 0.1621578864103781, "grad_norm": 1.3621593713760376, "learning_rate": 1.4977367851820984e-05, "loss": 1.7992, "step": 4100 }, { "epoch": 0.16611295681063123, "grad_norm": 1.2361946105957031, "learning_rate": 1.4975884958196297e-05, "loss": 1.8179, "step": 4200 }, { "epoch": 0.17006802721088435, "grad_norm": 1.5746525526046753, "learning_rate": 1.4974355091165972e-05, "loss": 1.8045, "step": 4300 }, { "epoch": 0.1740230976111375, "grad_norm": 1.4326754808425903, "learning_rate": 1.497277826034265e-05, "loss": 1.8155, "step": 4400 }, { "epoch": 0.1779781680113906, "grad_norm": 1.3772553205490112, "learning_rate": 1.4971154475634081e-05, "loss": 1.7838, "step": 4500 }, { "epoch": 0.18193323841164372, "grad_norm": 1.4580802917480469, "learning_rate": 1.4969483747243023e-05, "loss": 1.7997, "step": 4600 }, { "epoch": 0.18588830881189686, "grad_norm": 1.2635383605957031, "learning_rate": 1.4967766085667204e-05, "loss": 1.8091, "step": 4700 }, { "epoch": 0.18984337921214997, "grad_norm": 1.523247241973877, "learning_rate": 1.496600150169925e-05, "loss": 1.8086, "step": 4800 }, { "epoch": 0.1937984496124031, "grad_norm": 1.4835641384124756, "learning_rate": 1.496419000642661e-05, "loss": 1.8001, "step": 4900 }, { "epoch": 0.19775352001265623, "grad_norm": 1.3758567571640015, "learning_rate": 1.4962331611231496e-05, "loss": 1.7773, "step": 5000 }, { "epoch": 0.20170859041290934, "grad_norm": 1.343885898590088, "learning_rate": 1.4960426327790808e-05, "loss": 1.7884, "step": 5100 }, { "epoch": 0.20566366081316248, "grad_norm": 1.4712055921554565, "learning_rate": 1.4958474168076061e-05, "loss": 1.7904, "step": 5200 }, { "epoch": 0.2096187312134156, "grad_norm": 1.3729618787765503, "learning_rate": 1.4956475144353305e-05, "loss": 1.7883, "step": 5300 }, { "epoch": 0.21357380161366873, "grad_norm": 1.4087861776351929, "learning_rate": 1.4954429269183049e-05, "loss": 1.7764, "step": 5400 }, { "epoch": 0.21752887201392185, "grad_norm": 1.359567642211914, "learning_rate": 1.4952336555420194e-05, "loss": 1.7522, "step": 5500 }, { "epoch": 0.22148394241417496, "grad_norm": 1.5321180820465088, "learning_rate": 1.4950197016213935e-05, "loss": 1.7858, "step": 5600 }, { "epoch": 0.2254390128144281, "grad_norm": 1.4172133207321167, "learning_rate": 1.4948010665007694e-05, "loss": 1.7889, "step": 5700 }, { "epoch": 0.22939408321468122, "grad_norm": 1.389819622039795, "learning_rate": 1.4945777515539018e-05, "loss": 1.7787, "step": 5800 }, { "epoch": 0.23334915361493436, "grad_norm": 1.319344162940979, "learning_rate": 1.4943497581839515e-05, "loss": 1.7832, "step": 5900 }, { "epoch": 0.23730422401518747, "grad_norm": 1.3472243547439575, "learning_rate": 1.4941170878234739e-05, "loss": 1.7708, "step": 6000 }, { "epoch": 0.24125929441544058, "grad_norm": 1.319574236869812, "learning_rate": 1.4938797419344127e-05, "loss": 1.8013, "step": 6100 }, { "epoch": 0.24521436481569373, "grad_norm": 1.3281052112579346, "learning_rate": 1.4936377220080886e-05, "loss": 1.7657, "step": 6200 }, { "epoch": 0.24916943521594684, "grad_norm": 1.4613900184631348, "learning_rate": 1.4933910295651914e-05, "loss": 1.7955, "step": 6300 }, { "epoch": 0.2531245056162, "grad_norm": 1.6248208284378052, "learning_rate": 1.4931396661557699e-05, "loss": 1.7775, "step": 6400 }, { "epoch": 0.2570795760164531, "grad_norm": 1.4050931930541992, "learning_rate": 1.492883633359221e-05, "loss": 1.749, "step": 6500 }, { "epoch": 0.2610346464167062, "grad_norm": 1.3959672451019287, "learning_rate": 1.4926229327842822e-05, "loss": 1.7735, "step": 6600 }, { "epoch": 0.2649897168169593, "grad_norm": 1.3407930135726929, "learning_rate": 1.4923575660690197e-05, "loss": 1.7685, "step": 6700 }, { "epoch": 0.2689447872172125, "grad_norm": 1.4229165315628052, "learning_rate": 1.4920875348808181e-05, "loss": 1.769, "step": 6800 }, { "epoch": 0.2728998576174656, "grad_norm": 1.3997846841812134, "learning_rate": 1.4918128409163712e-05, "loss": 1.7804, "step": 6900 }, { "epoch": 0.2768549280177187, "grad_norm": 1.7216688394546509, "learning_rate": 1.4915334859016699e-05, "loss": 1.7699, "step": 7000 }, { "epoch": 0.28080999841797183, "grad_norm": 1.4955641031265259, "learning_rate": 1.491249471591992e-05, "loss": 1.7615, "step": 7100 }, { "epoch": 0.28476506881822494, "grad_norm": 1.631832480430603, "learning_rate": 1.4909607997718917e-05, "loss": 1.7708, "step": 7200 }, { "epoch": 0.2887201392184781, "grad_norm": 1.4372748136520386, "learning_rate": 1.4906674722551872e-05, "loss": 1.7618, "step": 7300 }, { "epoch": 0.2926752096187312, "grad_norm": 1.3430101871490479, "learning_rate": 1.4903694908849506e-05, "loss": 1.7734, "step": 7400 }, { "epoch": 0.29663028001898434, "grad_norm": 1.4826927185058594, "learning_rate": 1.4900668575334953e-05, "loss": 1.7679, "step": 7500 }, { "epoch": 0.30058535041923745, "grad_norm": 1.4105191230773926, "learning_rate": 1.4897595741023642e-05, "loss": 1.7666, "step": 7600 }, { "epoch": 0.30454042081949056, "grad_norm": 1.3381356000900269, "learning_rate": 1.4894476425223191e-05, "loss": 1.7697, "step": 7700 }, { "epoch": 0.30849549121974373, "grad_norm": 1.3745373487472534, "learning_rate": 1.4891310647533266e-05, "loss": 1.7707, "step": 7800 }, { "epoch": 0.31245056161999685, "grad_norm": 1.3524316549301147, "learning_rate": 1.488809842784548e-05, "loss": 1.7515, "step": 7900 }, { "epoch": 0.31640563202024996, "grad_norm": 1.4299299716949463, "learning_rate": 1.4884839786343242e-05, "loss": 1.7799, "step": 8000 }, { "epoch": 0.3203607024205031, "grad_norm": 1.4132308959960938, "learning_rate": 1.4881534743501656e-05, "loss": 1.7258, "step": 8100 }, { "epoch": 0.3243157728207562, "grad_norm": 1.4797372817993164, "learning_rate": 1.4878183320087377e-05, "loss": 1.7657, "step": 8200 }, { "epoch": 0.32827084322100936, "grad_norm": 1.3737105131149292, "learning_rate": 1.4874785537158479e-05, "loss": 1.7845, "step": 8300 }, { "epoch": 0.33222591362126247, "grad_norm": 1.4676398038864136, "learning_rate": 1.4871341416064337e-05, "loss": 1.7652, "step": 8400 }, { "epoch": 0.3361809840215156, "grad_norm": 1.3782434463500977, "learning_rate": 1.4867850978445476e-05, "loss": 1.7516, "step": 8500 }, { "epoch": 0.3401360544217687, "grad_norm": 1.5370761156082153, "learning_rate": 1.4864314246233448e-05, "loss": 1.75, "step": 8600 }, { "epoch": 0.3440911248220218, "grad_norm": 1.4677528142929077, "learning_rate": 1.486073124165068e-05, "loss": 1.7518, "step": 8700 }, { "epoch": 0.348046195222275, "grad_norm": 1.4215251207351685, "learning_rate": 1.4857101987210359e-05, "loss": 1.7634, "step": 8800 }, { "epoch": 0.3520012656225281, "grad_norm": 1.4959337711334229, "learning_rate": 1.4853426505716261e-05, "loss": 1.7491, "step": 8900 }, { "epoch": 0.3559563360227812, "grad_norm": 1.4005351066589355, "learning_rate": 1.4849704820262627e-05, "loss": 1.7713, "step": 9000 }, { "epoch": 0.3599114064230343, "grad_norm": 1.4689812660217285, "learning_rate": 1.484593695423401e-05, "loss": 1.7448, "step": 9100 }, { "epoch": 0.36386647682328743, "grad_norm": 1.5371148586273193, "learning_rate": 1.4842122931305133e-05, "loss": 1.7452, "step": 9200 }, { "epoch": 0.3678215472235406, "grad_norm": 1.4465723037719727, "learning_rate": 1.4838262775440741e-05, "loss": 1.7452, "step": 9300 }, { "epoch": 0.3717766176237937, "grad_norm": 1.5890401601791382, "learning_rate": 1.4834356510895436e-05, "loss": 1.737, "step": 9400 }, { "epoch": 0.37573168802404683, "grad_norm": 1.4862806797027588, "learning_rate": 1.4830404162213549e-05, "loss": 1.7426, "step": 9500 }, { "epoch": 0.37968675842429994, "grad_norm": 1.5449295043945312, "learning_rate": 1.4826405754228963e-05, "loss": 1.7379, "step": 9600 }, { "epoch": 0.38364182882455306, "grad_norm": 1.5151877403259277, "learning_rate": 1.482236131206497e-05, "loss": 1.7269, "step": 9700 }, { "epoch": 0.3875968992248062, "grad_norm": 1.600046157836914, "learning_rate": 1.4818270861134113e-05, "loss": 1.7556, "step": 9800 }, { "epoch": 0.39155196962505934, "grad_norm": 1.4293779134750366, "learning_rate": 1.4814134427138015e-05, "loss": 1.7368, "step": 9900 }, { "epoch": 0.39550704002531245, "grad_norm": 1.378175973892212, "learning_rate": 1.4809952036067231e-05, "loss": 1.7405, "step": 10000 }, { "epoch": 0.39946211042556556, "grad_norm": 1.417622447013855, "learning_rate": 1.4805723714201079e-05, "loss": 1.7484, "step": 10100 }, { "epoch": 0.4034171808258187, "grad_norm": 1.5106312036514282, "learning_rate": 1.4801449488107477e-05, "loss": 1.7218, "step": 10200 }, { "epoch": 0.40737225122607185, "grad_norm": 1.5248609781265259, "learning_rate": 1.4797129384642768e-05, "loss": 1.7328, "step": 10300 }, { "epoch": 0.41132732162632496, "grad_norm": 1.4607023000717163, "learning_rate": 1.4792763430951562e-05, "loss": 1.7131, "step": 10400 }, { "epoch": 0.4152823920265781, "grad_norm": 1.4600701332092285, "learning_rate": 1.4788351654466556e-05, "loss": 1.7418, "step": 10500 }, { "epoch": 0.4192374624268312, "grad_norm": 1.3468823432922363, "learning_rate": 1.4783894082908377e-05, "loss": 1.7649, "step": 10600 }, { "epoch": 0.4231925328270843, "grad_norm": 1.5118048191070557, "learning_rate": 1.4779390744285386e-05, "loss": 1.7233, "step": 10700 }, { "epoch": 0.42714760322733747, "grad_norm": 1.5199166536331177, "learning_rate": 1.4774841666893515e-05, "loss": 1.7238, "step": 10800 }, { "epoch": 0.4311026736275906, "grad_norm": 1.6537836790084839, "learning_rate": 1.4770246879316097e-05, "loss": 1.7216, "step": 10900 }, { "epoch": 0.4350577440278437, "grad_norm": 1.37918221950531, "learning_rate": 1.4765606410423666e-05, "loss": 1.7481, "step": 11000 }, { "epoch": 0.4390128144280968, "grad_norm": 1.526502013206482, "learning_rate": 1.4760920289373791e-05, "loss": 1.7141, "step": 11100 }, { "epoch": 0.4429678848283499, "grad_norm": 1.3577282428741455, "learning_rate": 1.4756188545610884e-05, "loss": 1.7507, "step": 11200 }, { "epoch": 0.4469229552286031, "grad_norm": 1.557986855506897, "learning_rate": 1.475141120886603e-05, "loss": 1.7103, "step": 11300 }, { "epoch": 0.4508780256288562, "grad_norm": 1.638221025466919, "learning_rate": 1.474658830915678e-05, "loss": 1.7363, "step": 11400 }, { "epoch": 0.4548330960291093, "grad_norm": 1.472142219543457, "learning_rate": 1.474171987678697e-05, "loss": 1.7331, "step": 11500 }, { "epoch": 0.45878816642936243, "grad_norm": 1.4680249691009521, "learning_rate": 1.4736805942346542e-05, "loss": 1.7273, "step": 11600 }, { "epoch": 0.46274323682961555, "grad_norm": 1.4165573120117188, "learning_rate": 1.4731846536711337e-05, "loss": 1.7159, "step": 11700 }, { "epoch": 0.4666983072298687, "grad_norm": 2.1816458702087402, "learning_rate": 1.4726841691042902e-05, "loss": 1.7236, "step": 11800 }, { "epoch": 0.4706533776301218, "grad_norm": 1.5376547574996948, "learning_rate": 1.4721791436788307e-05, "loss": 1.7227, "step": 11900 }, { "epoch": 0.47460844803037494, "grad_norm": 1.6850054264068604, "learning_rate": 1.4716695805679932e-05, "loss": 1.7116, "step": 12000 }, { "epoch": 0.47856351843062805, "grad_norm": 1.7338590621948242, "learning_rate": 1.471155482973528e-05, "loss": 1.7129, "step": 12100 }, { "epoch": 0.48251858883088117, "grad_norm": 1.4183164834976196, "learning_rate": 1.4706368541256762e-05, "loss": 1.7267, "step": 12200 }, { "epoch": 0.48647365923113434, "grad_norm": 1.7117156982421875, "learning_rate": 1.4701136972831513e-05, "loss": 1.7149, "step": 12300 }, { "epoch": 0.49042872963138745, "grad_norm": 1.4747951030731201, "learning_rate": 1.4695860157331169e-05, "loss": 1.7218, "step": 12400 }, { "epoch": 0.49438380003164056, "grad_norm": 1.6341221332550049, "learning_rate": 1.4690538127911672e-05, "loss": 1.7331, "step": 12500 }, { "epoch": 0.4983388704318937, "grad_norm": 1.4981880187988281, "learning_rate": 1.4685170918013054e-05, "loss": 1.7182, "step": 12600 }, { "epoch": 0.5022939408321468, "grad_norm": 1.5774872303009033, "learning_rate": 1.4679758561359232e-05, "loss": 1.7154, "step": 12700 }, { "epoch": 0.5062490112324, "grad_norm": 1.5503437519073486, "learning_rate": 1.4674301091957795e-05, "loss": 1.716, "step": 12800 }, { "epoch": 0.5102040816326531, "grad_norm": 1.5208927392959595, "learning_rate": 1.4668798544099795e-05, "loss": 1.7041, "step": 12900 }, { "epoch": 0.5141591520329062, "grad_norm": 1.8089638948440552, "learning_rate": 1.4663250952359516e-05, "loss": 1.7276, "step": 13000 }, { "epoch": 0.5181142224331593, "grad_norm": 1.5653834342956543, "learning_rate": 1.4657658351594275e-05, "loss": 1.7164, "step": 13100 }, { "epoch": 0.5220692928334124, "grad_norm": 1.7017031908035278, "learning_rate": 1.4652020776944194e-05, "loss": 1.7053, "step": 13200 }, { "epoch": 0.5260243632336655, "grad_norm": 1.6849620342254639, "learning_rate": 1.4646338263831977e-05, "loss": 1.7134, "step": 13300 }, { "epoch": 0.5299794336339186, "grad_norm": 1.8098126649856567, "learning_rate": 1.4640610847962699e-05, "loss": 1.7158, "step": 13400 }, { "epoch": 0.5339345040341718, "grad_norm": 1.7234479188919067, "learning_rate": 1.4634838565323563e-05, "loss": 1.7229, "step": 13500 }, { "epoch": 0.537889574434425, "grad_norm": 1.35356867313385, "learning_rate": 1.4629021452183695e-05, "loss": 1.715, "step": 13600 }, { "epoch": 0.5418446448346781, "grad_norm": 1.5286564826965332, "learning_rate": 1.4623159545093895e-05, "loss": 1.7011, "step": 13700 }, { "epoch": 0.5457997152349312, "grad_norm": 1.5586360692977905, "learning_rate": 1.4617252880886427e-05, "loss": 1.6978, "step": 13800 }, { "epoch": 0.5497547856351843, "grad_norm": 1.5301753282546997, "learning_rate": 1.461130149667477e-05, "loss": 1.6984, "step": 13900 }, { "epoch": 0.5537098560354374, "grad_norm": 1.6551586389541626, "learning_rate": 1.4605305429853402e-05, "loss": 1.6935, "step": 14000 }, { "epoch": 0.5576649264356905, "grad_norm": 1.522283911705017, "learning_rate": 1.4599264718097552e-05, "loss": 1.6795, "step": 14100 }, { "epoch": 0.5616199968359437, "grad_norm": 1.519173502922058, "learning_rate": 1.4593179399362967e-05, "loss": 1.6948, "step": 14200 }, { "epoch": 0.5655750672361968, "grad_norm": 1.582780122756958, "learning_rate": 1.4587049511885675e-05, "loss": 1.7168, "step": 14300 }, { "epoch": 0.5695301376364499, "grad_norm": 1.5130764245986938, "learning_rate": 1.458087509418174e-05, "loss": 1.7049, "step": 14400 }, { "epoch": 0.573485208036703, "grad_norm": 1.581992268562317, "learning_rate": 1.4574656185047033e-05, "loss": 1.695, "step": 14500 }, { "epoch": 0.5774402784369562, "grad_norm": 1.4675225019454956, "learning_rate": 1.456839282355697e-05, "loss": 1.7015, "step": 14600 }, { "epoch": 0.5813953488372093, "grad_norm": 1.5948406457901, "learning_rate": 1.4562085049066282e-05, "loss": 1.7129, "step": 14700 }, { "epoch": 0.5853504192374624, "grad_norm": 1.8901729583740234, "learning_rate": 1.4555732901208756e-05, "loss": 1.7062, "step": 14800 }, { "epoch": 0.5893054896377156, "grad_norm": 1.6940269470214844, "learning_rate": 1.4549336419896993e-05, "loss": 1.7025, "step": 14900 }, { "epoch": 0.5932605600379687, "grad_norm": 1.5160539150238037, "learning_rate": 1.454289564532216e-05, "loss": 1.688, "step": 15000 }, { "epoch": 0.5972156304382218, "grad_norm": 1.6424893140792847, "learning_rate": 1.4536410617953726e-05, "loss": 1.696, "step": 15100 }, { "epoch": 0.6011707008384749, "grad_norm": 1.492990493774414, "learning_rate": 1.4529881378539218e-05, "loss": 1.6768, "step": 15200 }, { "epoch": 0.605125771238728, "grad_norm": 1.7309181690216064, "learning_rate": 1.452330796810396e-05, "loss": 1.6972, "step": 15300 }, { "epoch": 0.6090808416389811, "grad_norm": 1.684484601020813, "learning_rate": 1.451669042795082e-05, "loss": 1.6903, "step": 15400 }, { "epoch": 0.6130359120392342, "grad_norm": 1.5465792417526245, "learning_rate": 1.4510028799659944e-05, "loss": 1.714, "step": 15500 }, { "epoch": 0.6169909824394875, "grad_norm": 1.8257033824920654, "learning_rate": 1.4503323125088501e-05, "loss": 1.6894, "step": 15600 }, { "epoch": 0.6209460528397406, "grad_norm": 1.5299944877624512, "learning_rate": 1.4496573446370414e-05, "loss": 1.6944, "step": 15700 }, { "epoch": 0.6249011232399937, "grad_norm": 1.7090293169021606, "learning_rate": 1.44897798059161e-05, "loss": 1.6878, "step": 15800 }, { "epoch": 0.6288561936402468, "grad_norm": 1.690470576286316, "learning_rate": 1.4482942246412203e-05, "loss": 1.6807, "step": 15900 }, { "epoch": 0.6328112640404999, "grad_norm": 1.8261181116104126, "learning_rate": 1.4476060810821319e-05, "loss": 1.6887, "step": 16000 }, { "epoch": 0.636766334440753, "grad_norm": 1.5878318548202515, "learning_rate": 1.4469135542381741e-05, "loss": 1.6618, "step": 16100 }, { "epoch": 0.6407214048410061, "grad_norm": 1.5003888607025146, "learning_rate": 1.4462166484607167e-05, "loss": 1.6734, "step": 16200 }, { "epoch": 0.6446764752412593, "grad_norm": 1.7296781539916992, "learning_rate": 1.445515368128645e-05, "loss": 1.6712, "step": 16300 }, { "epoch": 0.6486315456415124, "grad_norm": 1.6283060312271118, "learning_rate": 1.4448097176483299e-05, "loss": 1.6963, "step": 16400 }, { "epoch": 0.6525866160417655, "grad_norm": 1.5867258310317993, "learning_rate": 1.444099701453602e-05, "loss": 1.6834, "step": 16500 }, { "epoch": 0.6565416864420187, "grad_norm": 1.8763879537582397, "learning_rate": 1.4433853240057229e-05, "loss": 1.6811, "step": 16600 }, { "epoch": 0.6604967568422718, "grad_norm": 1.5323275327682495, "learning_rate": 1.4426665897933574e-05, "loss": 1.6778, "step": 16700 }, { "epoch": 0.6644518272425249, "grad_norm": 1.581667184829712, "learning_rate": 1.4419435033325455e-05, "loss": 1.6926, "step": 16800 }, { "epoch": 0.668406897642778, "grad_norm": 1.6673179864883423, "learning_rate": 1.441216069166673e-05, "loss": 1.6806, "step": 16900 }, { "epoch": 0.6723619680430312, "grad_norm": 1.8026336431503296, "learning_rate": 1.4404842918664446e-05, "loss": 1.6829, "step": 17000 }, { "epoch": 0.6763170384432843, "grad_norm": 1.6094428300857544, "learning_rate": 1.4397481760298542e-05, "loss": 1.6763, "step": 17100 }, { "epoch": 0.6802721088435374, "grad_norm": 1.565843105316162, "learning_rate": 1.4390077262821559e-05, "loss": 1.659, "step": 17200 }, { "epoch": 0.6842271792437905, "grad_norm": 1.7567963600158691, "learning_rate": 1.4382629472758346e-05, "loss": 1.666, "step": 17300 }, { "epoch": 0.6881822496440436, "grad_norm": 1.591693639755249, "learning_rate": 1.4375138436905786e-05, "loss": 1.6666, "step": 17400 }, { "epoch": 0.6921373200442967, "grad_norm": 1.638576865196228, "learning_rate": 1.436760420233248e-05, "loss": 1.6554, "step": 17500 }, { "epoch": 0.69609239044455, "grad_norm": 1.7055751085281372, "learning_rate": 1.4360026816378462e-05, "loss": 1.671, "step": 17600 }, { "epoch": 0.7000474608448031, "grad_norm": 1.6867974996566772, "learning_rate": 1.4352406326654905e-05, "loss": 1.6722, "step": 17700 }, { "epoch": 0.7040025312450562, "grad_norm": 1.7862675189971924, "learning_rate": 1.4344742781043809e-05, "loss": 1.6965, "step": 17800 }, { "epoch": 0.7079576016453093, "grad_norm": 1.7989298105239868, "learning_rate": 1.4337036227697715e-05, "loss": 1.6762, "step": 17900 }, { "epoch": 0.7119126720455624, "grad_norm": 1.7017799615859985, "learning_rate": 1.4329286715039392e-05, "loss": 1.6614, "step": 18000 }, { "epoch": 0.7158677424458155, "grad_norm": 1.6665624380111694, "learning_rate": 1.4321494291761537e-05, "loss": 1.662, "step": 18100 }, { "epoch": 0.7198228128460686, "grad_norm": 1.9826371669769287, "learning_rate": 1.4313659006826468e-05, "loss": 1.6638, "step": 18200 }, { "epoch": 0.7237778832463218, "grad_norm": 1.7711797952651978, "learning_rate": 1.430578090946582e-05, "loss": 1.6625, "step": 18300 }, { "epoch": 0.7277329536465749, "grad_norm": 1.8430758714675903, "learning_rate": 1.4297860049180223e-05, "loss": 1.6824, "step": 18400 }, { "epoch": 0.731688024046828, "grad_norm": 1.8522826433181763, "learning_rate": 1.4289896475739012e-05, "loss": 1.6599, "step": 18500 }, { "epoch": 0.7356430944470812, "grad_norm": 1.6422381401062012, "learning_rate": 1.4281890239179897e-05, "loss": 1.6584, "step": 18600 }, { "epoch": 0.7395981648473343, "grad_norm": 1.7232320308685303, "learning_rate": 1.4273841389808653e-05, "loss": 1.6604, "step": 18700 }, { "epoch": 0.7435532352475874, "grad_norm": 1.6723573207855225, "learning_rate": 1.4265749978198805e-05, "loss": 1.6654, "step": 18800 }, { "epoch": 0.7475083056478405, "grad_norm": 1.776376485824585, "learning_rate": 1.4257616055191316e-05, "loss": 1.672, "step": 18900 }, { "epoch": 0.7514633760480937, "grad_norm": 1.9001519680023193, "learning_rate": 1.4249439671894253e-05, "loss": 1.6473, "step": 19000 }, { "epoch": 0.7554184464483468, "grad_norm": 1.5602456331253052, "learning_rate": 1.4241220879682484e-05, "loss": 1.6558, "step": 19100 }, { "epoch": 0.7593735168485999, "grad_norm": 1.6195124387741089, "learning_rate": 1.423295973019734e-05, "loss": 1.6704, "step": 19200 }, { "epoch": 0.763328587248853, "grad_norm": 1.7354437112808228, "learning_rate": 1.4224656275346295e-05, "loss": 1.6697, "step": 19300 }, { "epoch": 0.7672836576491061, "grad_norm": 1.6655138731002808, "learning_rate": 1.4216310567302648e-05, "loss": 1.6567, "step": 19400 }, { "epoch": 0.7712387280493592, "grad_norm": 1.9168803691864014, "learning_rate": 1.4207922658505184e-05, "loss": 1.6398, "step": 19500 }, { "epoch": 0.7751937984496124, "grad_norm": 1.7918130159378052, "learning_rate": 1.4199492601657848e-05, "loss": 1.654, "step": 19600 }, { "epoch": 0.7791488688498656, "grad_norm": 1.7283716201782227, "learning_rate": 1.4191020449729417e-05, "loss": 1.6534, "step": 19700 }, { "epoch": 0.7831039392501187, "grad_norm": 1.860144019126892, "learning_rate": 1.4182506255953167e-05, "loss": 1.6553, "step": 19800 }, { "epoch": 0.7870590096503718, "grad_norm": 1.7320619821548462, "learning_rate": 1.4173950073826531e-05, "loss": 1.6586, "step": 19900 }, { "epoch": 0.7910140800506249, "grad_norm": 1.7704521417617798, "learning_rate": 1.4165351957110772e-05, "loss": 1.6599, "step": 20000 }, { "epoch": 0.794969150450878, "grad_norm": 2.051400661468506, "learning_rate": 1.4156711959830644e-05, "loss": 1.6695, "step": 20100 }, { "epoch": 0.7989242208511311, "grad_norm": 1.7863457202911377, "learning_rate": 1.4148030136274043e-05, "loss": 1.6538, "step": 20200 }, { "epoch": 0.8028792912513842, "grad_norm": 1.85243558883667, "learning_rate": 1.413930654099168e-05, "loss": 1.63, "step": 20300 }, { "epoch": 0.8068343616516374, "grad_norm": 1.7953428030014038, "learning_rate": 1.413054122879673e-05, "loss": 1.6374, "step": 20400 }, { "epoch": 0.8107894320518905, "grad_norm": 1.8957959413528442, "learning_rate": 1.4121734254764482e-05, "loss": 1.6445, "step": 20500 }, { "epoch": 0.8147445024521437, "grad_norm": 1.7762993574142456, "learning_rate": 1.4112885674232011e-05, "loss": 1.6503, "step": 20600 }, { "epoch": 0.8186995728523968, "grad_norm": 1.723813772201538, "learning_rate": 1.410399554279781e-05, "loss": 1.6416, "step": 20700 }, { "epoch": 0.8226546432526499, "grad_norm": 1.9667476415634155, "learning_rate": 1.4095063916321456e-05, "loss": 1.6297, "step": 20800 }, { "epoch": 0.826609713652903, "grad_norm": 1.8855000734329224, "learning_rate": 1.4086090850923246e-05, "loss": 1.6684, "step": 20900 }, { "epoch": 0.8305647840531561, "grad_norm": 1.8669531345367432, "learning_rate": 1.4077076402983857e-05, "loss": 1.6344, "step": 21000 }, { "epoch": 0.8345198544534093, "grad_norm": 2.0968127250671387, "learning_rate": 1.4068020629143985e-05, "loss": 1.6524, "step": 21100 }, { "epoch": 0.8384749248536624, "grad_norm": 1.712428092956543, "learning_rate": 1.4058923586303988e-05, "loss": 1.6333, "step": 21200 }, { "epoch": 0.8424299952539155, "grad_norm": 1.7560315132141113, "learning_rate": 1.4049785331623534e-05, "loss": 1.6337, "step": 21300 }, { "epoch": 0.8463850656541686, "grad_norm": 1.875779390335083, "learning_rate": 1.4040605922521231e-05, "loss": 1.6328, "step": 21400 }, { "epoch": 0.8503401360544217, "grad_norm": 1.9142519235610962, "learning_rate": 1.403138541667428e-05, "loss": 1.6333, "step": 21500 }, { "epoch": 0.8542952064546749, "grad_norm": 2.263770580291748, "learning_rate": 1.4022123872018107e-05, "loss": 1.635, "step": 21600 }, { "epoch": 0.858250276854928, "grad_norm": 1.838889479637146, "learning_rate": 1.4012821346745995e-05, "loss": 1.6328, "step": 21700 }, { "epoch": 0.8622053472551812, "grad_norm": 1.9877907037734985, "learning_rate": 1.400347789930872e-05, "loss": 1.6427, "step": 21800 }, { "epoch": 0.8661604176554343, "grad_norm": 1.9885168075561523, "learning_rate": 1.399409358841419e-05, "loss": 1.637, "step": 21900 }, { "epoch": 0.8701154880556874, "grad_norm": 1.8428804874420166, "learning_rate": 1.3984668473027065e-05, "loss": 1.6377, "step": 22000 }, { "epoch": 0.8740705584559405, "grad_norm": 1.9875133037567139, "learning_rate": 1.39752026123684e-05, "loss": 1.6248, "step": 22100 }, { "epoch": 0.8780256288561936, "grad_norm": 2.0525448322296143, "learning_rate": 1.3965696065915262e-05, "loss": 1.6501, "step": 22200 }, { "epoch": 0.8819806992564467, "grad_norm": 1.9695172309875488, "learning_rate": 1.3956148893400357e-05, "loss": 1.6308, "step": 22300 }, { "epoch": 0.8859357696566998, "grad_norm": 2.084592580795288, "learning_rate": 1.3946561154811664e-05, "loss": 1.6335, "step": 22400 }, { "epoch": 0.889890840056953, "grad_norm": 1.7602378129959106, "learning_rate": 1.3936932910392048e-05, "loss": 1.6195, "step": 22500 }, { "epoch": 0.8938459104572062, "grad_norm": 2.0157277584075928, "learning_rate": 1.3927264220638889e-05, "loss": 1.6395, "step": 22600 }, { "epoch": 0.8978009808574593, "grad_norm": 2.184307336807251, "learning_rate": 1.391755514630369e-05, "loss": 1.6448, "step": 22700 }, { "epoch": 0.9017560512577124, "grad_norm": 1.9747377634048462, "learning_rate": 1.390780574839171e-05, "loss": 1.6302, "step": 22800 }, { "epoch": 0.9057111216579655, "grad_norm": 2.1203644275665283, "learning_rate": 1.3898016088161575e-05, "loss": 1.6447, "step": 22900 }, { "epoch": 0.9096661920582186, "grad_norm": 2.0279908180236816, "learning_rate": 1.3888186227124885e-05, "loss": 1.622, "step": 23000 }, { "epoch": 0.9136212624584718, "grad_norm": 1.9809517860412598, "learning_rate": 1.3878316227045846e-05, "loss": 1.6189, "step": 23100 }, { "epoch": 0.9175763328587249, "grad_norm": 2.1499814987182617, "learning_rate": 1.386840614994086e-05, "loss": 1.6192, "step": 23200 }, { "epoch": 0.921531403258978, "grad_norm": 1.9230985641479492, "learning_rate": 1.3858456058078148e-05, "loss": 1.6251, "step": 23300 }, { "epoch": 0.9254864736592311, "grad_norm": 2.1623005867004395, "learning_rate": 1.3848466013977365e-05, "loss": 1.6145, "step": 23400 }, { "epoch": 0.9294415440594842, "grad_norm": 2.0579729080200195, "learning_rate": 1.3838436080409188e-05, "loss": 1.628, "step": 23500 }, { "epoch": 0.9333966144597374, "grad_norm": 1.9566960334777832, "learning_rate": 1.3828366320394937e-05, "loss": 1.6317, "step": 23600 }, { "epoch": 0.9373516848599905, "grad_norm": 1.9498804807662964, "learning_rate": 1.3818256797206177e-05, "loss": 1.6107, "step": 23700 }, { "epoch": 0.9413067552602437, "grad_norm": 1.8530848026275635, "learning_rate": 1.3808107574364312e-05, "loss": 1.6186, "step": 23800 }, { "epoch": 0.9452618256604968, "grad_norm": 2.0309464931488037, "learning_rate": 1.3797918715640197e-05, "loss": 1.6208, "step": 23900 }, { "epoch": 0.9492168960607499, "grad_norm": 2.0447559356689453, "learning_rate": 1.3787690285053732e-05, "loss": 1.6226, "step": 24000 }, { "epoch": 0.953171966461003, "grad_norm": 2.198214292526245, "learning_rate": 1.3777422346873453e-05, "loss": 1.5954, "step": 24100 }, { "epoch": 0.9571270368612561, "grad_norm": 2.0797901153564453, "learning_rate": 1.3767114965616143e-05, "loss": 1.6378, "step": 24200 }, { "epoch": 0.9610821072615092, "grad_norm": 2.361177444458008, "learning_rate": 1.3756768206046418e-05, "loss": 1.6207, "step": 24300 }, { "epoch": 0.9650371776617623, "grad_norm": 2.194758653640747, "learning_rate": 1.3746382133176314e-05, "loss": 1.6147, "step": 24400 }, { "epoch": 0.9689922480620154, "grad_norm": 2.1259610652923584, "learning_rate": 1.3735956812264893e-05, "loss": 1.583, "step": 24500 }, { "epoch": 0.9729473184622687, "grad_norm": 2.084428548812866, "learning_rate": 1.372549230881782e-05, "loss": 1.6257, "step": 24600 }, { "epoch": 0.9769023888625218, "grad_norm": 2.0704309940338135, "learning_rate": 1.3714988688586958e-05, "loss": 1.6062, "step": 24700 }, { "epoch": 0.9808574592627749, "grad_norm": 1.9661308526992798, "learning_rate": 1.3704446017569953e-05, "loss": 1.6164, "step": 24800 }, { "epoch": 0.984812529663028, "grad_norm": 2.1788337230682373, "learning_rate": 1.3693864362009821e-05, "loss": 1.6188, "step": 24900 }, { "epoch": 0.9887676000632811, "grad_norm": 1.9492045640945435, "learning_rate": 1.3683243788394534e-05, "loss": 1.609, "step": 25000 }, { "epoch": 0.9927226704635342, "grad_norm": 2.2324581146240234, "learning_rate": 1.3672584363456587e-05, "loss": 1.6058, "step": 25100 }, { "epoch": 0.9966777408637874, "grad_norm": 2.14666485786438, "learning_rate": 1.3661886154172602e-05, "loss": 1.6059, "step": 25200 }, { "epoch": 1.0006328112640406, "grad_norm": 2.1909172534942627, "learning_rate": 1.3651149227762893e-05, "loss": 1.6006, "step": 25300 }, { "epoch": 1.0025726272461015, "grad_norm": 2.143050193786621, "learning_rate": 1.3640373651691044e-05, "loss": 1.5379, "step": 25400 }, { "epoch": 1.0065305153170268, "grad_norm": 2.6964924335479736, "learning_rate": 1.3629559493663487e-05, "loss": 1.507, "step": 25500 }, { "epoch": 1.0104884033879522, "grad_norm": 2.556349992752075, "learning_rate": 1.361870682162908e-05, "loss": 1.5266, "step": 25600 }, { "epoch": 1.0144462914588774, "grad_norm": 2.2218785285949707, "learning_rate": 1.3607815703778673e-05, "loss": 1.4867, "step": 25700 }, { "epoch": 1.0184041795298029, "grad_norm": 2.5236029624938965, "learning_rate": 1.3596886208544687e-05, "loss": 1.4879, "step": 25800 }, { "epoch": 1.0223620676007283, "grad_norm": 2.5998125076293945, "learning_rate": 1.3585918404600679e-05, "loss": 1.524, "step": 25900 }, { "epoch": 1.0263199556716536, "grad_norm": 2.6749346256256104, "learning_rate": 1.3574912360860912e-05, "loss": 1.5148, "step": 26000 }, { "epoch": 1.030277843742579, "grad_norm": 2.4651994705200195, "learning_rate": 1.3563868146479921e-05, "loss": 1.4905, "step": 26100 }, { "epoch": 1.0342357318135043, "grad_norm": 2.4166133403778076, "learning_rate": 1.3552785830852084e-05, "loss": 1.5155, "step": 26200 }, { "epoch": 1.0381936198844297, "grad_norm": 2.2493224143981934, "learning_rate": 1.3541665483611175e-05, "loss": 1.515, "step": 26300 }, { "epoch": 1.042151507955355, "grad_norm": 2.7555856704711914, "learning_rate": 1.3530507174629938e-05, "loss": 1.5136, "step": 26400 }, { "epoch": 1.0461093960262804, "grad_norm": 2.7666208744049072, "learning_rate": 1.3519310974019639e-05, "loss": 1.4917, "step": 26500 }, { "epoch": 1.0500672840972056, "grad_norm": 2.7659711837768555, "learning_rate": 1.3508076952129634e-05, "loss": 1.5076, "step": 26600 }, { "epoch": 1.054025172168131, "grad_norm": 2.5507092475891113, "learning_rate": 1.3496805179546919e-05, "loss": 1.5052, "step": 26700 }, { "epoch": 1.0579830602390565, "grad_norm": 2.302542209625244, "learning_rate": 1.3485495727095687e-05, "loss": 1.5034, "step": 26800 }, { "epoch": 1.0619409483099818, "grad_norm": 2.578275203704834, "learning_rate": 1.3474148665836894e-05, "loss": 1.4886, "step": 26900 }, { "epoch": 1.0658988363809072, "grad_norm": 2.354796886444092, "learning_rate": 1.3462764067067799e-05, "loss": 1.506, "step": 27000 }, { "epoch": 1.0698567244518324, "grad_norm": 2.5606160163879395, "learning_rate": 1.345134200232152e-05, "loss": 1.4931, "step": 27100 }, { "epoch": 1.073814612522758, "grad_norm": 2.46881365776062, "learning_rate": 1.343988254336659e-05, "loss": 1.503, "step": 27200 }, { "epoch": 1.0777725005936831, "grad_norm": 2.657731771469116, "learning_rate": 1.3428385762206498e-05, "loss": 1.5064, "step": 27300 }, { "epoch": 1.0817303886646086, "grad_norm": 2.3708932399749756, "learning_rate": 1.3416851731079244e-05, "loss": 1.4943, "step": 27400 }, { "epoch": 1.0856882767355338, "grad_norm": 2.6182353496551514, "learning_rate": 1.340528052245688e-05, "loss": 1.5001, "step": 27500 }, { "epoch": 1.0896461648064593, "grad_norm": 2.7265477180480957, "learning_rate": 1.3393672209045055e-05, "loss": 1.4794, "step": 27600 }, { "epoch": 1.0936040528773847, "grad_norm": 2.6186697483062744, "learning_rate": 1.3382026863782559e-05, "loss": 1.4878, "step": 27700 }, { "epoch": 1.09756194094831, "grad_norm": 3.3754959106445312, "learning_rate": 1.3370344559840868e-05, "loss": 1.4769, "step": 27800 }, { "epoch": 1.1015198290192354, "grad_norm": 3.0805869102478027, "learning_rate": 1.3358625370623684e-05, "loss": 1.5098, "step": 27900 }, { "epoch": 1.1054777170901606, "grad_norm": 2.626561403274536, "learning_rate": 1.334686936976646e-05, "loss": 1.4965, "step": 28000 }, { "epoch": 1.109435605161086, "grad_norm": 2.6148223876953125, "learning_rate": 1.333507663113596e-05, "loss": 1.4973, "step": 28100 }, { "epoch": 1.1133934932320113, "grad_norm": 2.9437952041625977, "learning_rate": 1.3323247228829781e-05, "loss": 1.4861, "step": 28200 }, { "epoch": 1.1173513813029368, "grad_norm": 2.7327873706817627, "learning_rate": 1.3311381237175882e-05, "loss": 1.4725, "step": 28300 }, { "epoch": 1.121309269373862, "grad_norm": 2.8548924922943115, "learning_rate": 1.3299478730732134e-05, "loss": 1.4825, "step": 28400 }, { "epoch": 1.1252671574447874, "grad_norm": 2.638568878173828, "learning_rate": 1.3287539784285839e-05, "loss": 1.4715, "step": 28500 }, { "epoch": 1.129225045515713, "grad_norm": 3.1752021312713623, "learning_rate": 1.327556447285326e-05, "loss": 1.4749, "step": 28600 }, { "epoch": 1.1331829335866381, "grad_norm": 2.8398923873901367, "learning_rate": 1.3263552871679156e-05, "loss": 1.4901, "step": 28700 }, { "epoch": 1.1371408216575636, "grad_norm": 2.708963632583618, "learning_rate": 1.3251505056236312e-05, "loss": 1.4805, "step": 28800 }, { "epoch": 1.1410987097284888, "grad_norm": 2.9168691635131836, "learning_rate": 1.3239421102225049e-05, "loss": 1.4653, "step": 28900 }, { "epoch": 1.1450565977994143, "grad_norm": 2.523481845855713, "learning_rate": 1.322730108557276e-05, "loss": 1.4822, "step": 29000 }, { "epoch": 1.1490144858703395, "grad_norm": 3.2788479328155518, "learning_rate": 1.3215145082433436e-05, "loss": 1.4685, "step": 29100 }, { "epoch": 1.152972373941265, "grad_norm": 2.765491485595703, "learning_rate": 1.3202953169187181e-05, "loss": 1.4878, "step": 29200 }, { "epoch": 1.1569302620121902, "grad_norm": 2.9480185508728027, "learning_rate": 1.3190725422439734e-05, "loss": 1.4607, "step": 29300 }, { "epoch": 1.1608881500831156, "grad_norm": 2.9779725074768066, "learning_rate": 1.3178461919021984e-05, "loss": 1.4724, "step": 29400 }, { "epoch": 1.164846038154041, "grad_norm": 2.768763780593872, "learning_rate": 1.3166162735989497e-05, "loss": 1.4989, "step": 29500 }, { "epoch": 1.1688039262249663, "grad_norm": 3.2208807468414307, "learning_rate": 1.3153827950622019e-05, "loss": 1.4695, "step": 29600 }, { "epoch": 1.1727618142958918, "grad_norm": 2.7532846927642822, "learning_rate": 1.3141457640423002e-05, "loss": 1.4841, "step": 29700 }, { "epoch": 1.176719702366817, "grad_norm": 3.399897575378418, "learning_rate": 1.3129051883119107e-05, "loss": 1.4627, "step": 29800 }, { "epoch": 1.1806775904377425, "grad_norm": 2.892542600631714, "learning_rate": 1.311661075665973e-05, "loss": 1.4754, "step": 29900 }, { "epoch": 1.1846354785086677, "grad_norm": 2.6261606216430664, "learning_rate": 1.310413433921649e-05, "loss": 1.4847, "step": 30000 }, { "epoch": 1.1885933665795931, "grad_norm": 2.6923210620880127, "learning_rate": 1.3091622709182762e-05, "loss": 1.4722, "step": 30100 }, { "epoch": 1.1925512546505184, "grad_norm": 3.0266880989074707, "learning_rate": 1.3079075945173164e-05, "loss": 1.4922, "step": 30200 }, { "epoch": 1.1965091427214438, "grad_norm": 2.9252758026123047, "learning_rate": 1.306649412602308e-05, "loss": 1.4692, "step": 30300 }, { "epoch": 1.2004670307923693, "grad_norm": 2.757887125015259, "learning_rate": 1.305387733078815e-05, "loss": 1.465, "step": 30400 }, { "epoch": 1.2044249188632945, "grad_norm": 3.536576271057129, "learning_rate": 1.304122563874379e-05, "loss": 1.4625, "step": 30500 }, { "epoch": 1.20838280693422, "grad_norm": 2.765883684158325, "learning_rate": 1.3028539129384668e-05, "loss": 1.4509, "step": 30600 }, { "epoch": 1.2123406950051452, "grad_norm": 2.8738183975219727, "learning_rate": 1.3015817882424235e-05, "loss": 1.4421, "step": 30700 }, { "epoch": 1.2162985830760706, "grad_norm": 2.8780033588409424, "learning_rate": 1.3003061977794207e-05, "loss": 1.4749, "step": 30800 }, { "epoch": 1.2202564711469959, "grad_norm": 3.499163866043091, "learning_rate": 1.2990271495644059e-05, "loss": 1.4564, "step": 30900 }, { "epoch": 1.2242143592179213, "grad_norm": 2.900714159011841, "learning_rate": 1.297744651634053e-05, "loss": 1.4642, "step": 31000 }, { "epoch": 1.2281722472888466, "grad_norm": 2.939680337905884, "learning_rate": 1.2964587120467122e-05, "loss": 1.4512, "step": 31100 }, { "epoch": 1.232130135359772, "grad_norm": 3.2539045810699463, "learning_rate": 1.2951693388823577e-05, "loss": 1.4478, "step": 31200 }, { "epoch": 1.2360880234306975, "grad_norm": 3.0612869262695312, "learning_rate": 1.293876540242539e-05, "loss": 1.4674, "step": 31300 }, { "epoch": 1.2400459115016227, "grad_norm": 2.9142935276031494, "learning_rate": 1.2925803242503287e-05, "loss": 1.4704, "step": 31400 }, { "epoch": 1.2440037995725481, "grad_norm": 3.0163750648498535, "learning_rate": 1.291280699050271e-05, "loss": 1.4433, "step": 31500 }, { "epoch": 1.2479616876434734, "grad_norm": 2.712174654006958, "learning_rate": 1.289977672808332e-05, "loss": 1.4637, "step": 31600 }, { "epoch": 1.2519195757143988, "grad_norm": 2.7295100688934326, "learning_rate": 1.2886712537118475e-05, "loss": 1.4599, "step": 31700 }, { "epoch": 1.2558774637853243, "grad_norm": 3.3285765647888184, "learning_rate": 1.2873614499694717e-05, "loss": 1.4432, "step": 31800 }, { "epoch": 1.2598353518562495, "grad_norm": 3.4205710887908936, "learning_rate": 1.2860482698111254e-05, "loss": 1.4571, "step": 31900 }, { "epoch": 1.2637932399271747, "grad_norm": 2.865621328353882, "learning_rate": 1.2847317214879451e-05, "loss": 1.4434, "step": 32000 }, { "epoch": 1.2677511279981002, "grad_norm": 2.7961373329162598, "learning_rate": 1.2834118132722296e-05, "loss": 1.4305, "step": 32100 }, { "epoch": 1.2717090160690256, "grad_norm": 2.86441707611084, "learning_rate": 1.2820885534573903e-05, "loss": 1.4592, "step": 32200 }, { "epoch": 1.2756669041399509, "grad_norm": 3.3199241161346436, "learning_rate": 1.2807619503578964e-05, "loss": 1.451, "step": 32300 }, { "epoch": 1.2796247922108763, "grad_norm": 2.9922525882720947, "learning_rate": 1.2794320123092248e-05, "loss": 1.4447, "step": 32400 }, { "epoch": 1.2835826802818016, "grad_norm": 2.9650540351867676, "learning_rate": 1.2780987476678072e-05, "loss": 1.4492, "step": 32500 }, { "epoch": 1.287540568352727, "grad_norm": 3.2158126831054688, "learning_rate": 1.2767621648109765e-05, "loss": 1.428, "step": 32600 }, { "epoch": 1.2914984564236525, "grad_norm": 3.462463617324829, "learning_rate": 1.275422272136916e-05, "loss": 1.4355, "step": 32700 }, { "epoch": 1.2954563444945777, "grad_norm": 2.9805209636688232, "learning_rate": 1.2740790780646048e-05, "loss": 1.4153, "step": 32800 }, { "epoch": 1.299414232565503, "grad_norm": 3.3206562995910645, "learning_rate": 1.2727325910337665e-05, "loss": 1.425, "step": 32900 }, { "epoch": 1.3033721206364284, "grad_norm": 3.306009531021118, "learning_rate": 1.2713828195048149e-05, "loss": 1.4398, "step": 33000 }, { "epoch": 1.3073300087073538, "grad_norm": 3.6652069091796875, "learning_rate": 1.2700297719588015e-05, "loss": 1.4126, "step": 33100 }, { "epoch": 1.311287896778279, "grad_norm": 3.067331314086914, "learning_rate": 1.268673456897362e-05, "loss": 1.4453, "step": 33200 }, { "epoch": 1.3152457848492045, "grad_norm": 3.4072649478912354, "learning_rate": 1.2673138828426633e-05, "loss": 1.4195, "step": 33300 }, { "epoch": 1.3192036729201297, "grad_norm": 3.721276044845581, "learning_rate": 1.2659510583373492e-05, "loss": 1.4308, "step": 33400 }, { "epoch": 1.3231615609910552, "grad_norm": 3.381657361984253, "learning_rate": 1.2645849919444875e-05, "loss": 1.4102, "step": 33500 }, { "epoch": 1.3271194490619806, "grad_norm": 3.2021045684814453, "learning_rate": 1.2632156922475153e-05, "loss": 1.4248, "step": 33600 }, { "epoch": 1.3310773371329059, "grad_norm": 3.0373260974884033, "learning_rate": 1.2618431678501862e-05, "loss": 1.4156, "step": 33700 }, { "epoch": 1.335035225203831, "grad_norm": 2.7702269554138184, "learning_rate": 1.2604674273765154e-05, "loss": 1.4511, "step": 33800 }, { "epoch": 1.3389931132747566, "grad_norm": 3.3153131008148193, "learning_rate": 1.2590884794707254e-05, "loss": 1.4523, "step": 33900 }, { "epoch": 1.342951001345682, "grad_norm": 3.249516248703003, "learning_rate": 1.2577063327971927e-05, "loss": 1.4225, "step": 34000 }, { "epoch": 1.3469088894166072, "grad_norm": 3.879835367202759, "learning_rate": 1.2563209960403921e-05, "loss": 1.4248, "step": 34100 }, { "epoch": 1.3508667774875327, "grad_norm": 3.548116445541382, "learning_rate": 1.2549324779048432e-05, "loss": 1.4248, "step": 34200 }, { "epoch": 1.354824665558458, "grad_norm": 3.109065294265747, "learning_rate": 1.253540787115055e-05, "loss": 1.4269, "step": 34300 }, { "epoch": 1.3587825536293834, "grad_norm": 3.3330225944519043, "learning_rate": 1.2521459324154708e-05, "loss": 1.4354, "step": 34400 }, { "epoch": 1.3627404417003088, "grad_norm": 3.0380284786224365, "learning_rate": 1.2507479225704149e-05, "loss": 1.3966, "step": 34500 }, { "epoch": 1.366698329771234, "grad_norm": 3.3855364322662354, "learning_rate": 1.2493467663640356e-05, "loss": 1.402, "step": 34600 }, { "epoch": 1.3706562178421593, "grad_norm": 3.3429582118988037, "learning_rate": 1.247942472600251e-05, "loss": 1.4315, "step": 34700 }, { "epoch": 1.3746141059130847, "grad_norm": 3.0505242347717285, "learning_rate": 1.2465350501026931e-05, "loss": 1.425, "step": 34800 }, { "epoch": 1.3785719939840102, "grad_norm": 3.4241063594818115, "learning_rate": 1.245124507714654e-05, "loss": 1.4179, "step": 34900 }, { "epoch": 1.3825298820549354, "grad_norm": 3.458108901977539, "learning_rate": 1.2437108542990274e-05, "loss": 1.4133, "step": 35000 }, { "epoch": 1.3864877701258609, "grad_norm": 3.9022340774536133, "learning_rate": 1.2422940987382556e-05, "loss": 1.4112, "step": 35100 }, { "epoch": 1.3904456581967861, "grad_norm": 3.5141968727111816, "learning_rate": 1.240874249934273e-05, "loss": 1.4453, "step": 35200 }, { "epoch": 1.3944035462677116, "grad_norm": 3.4254074096679688, "learning_rate": 1.2394513168084485e-05, "loss": 1.4096, "step": 35300 }, { "epoch": 1.398361434338637, "grad_norm": 3.460205316543579, "learning_rate": 1.2380253083015321e-05, "loss": 1.4145, "step": 35400 }, { "epoch": 1.4023193224095623, "grad_norm": 3.7515103816986084, "learning_rate": 1.236596233373597e-05, "loss": 1.4132, "step": 35500 }, { "epoch": 1.4062772104804875, "grad_norm": 3.2565503120422363, "learning_rate": 1.2351641010039833e-05, "loss": 1.3945, "step": 35600 }, { "epoch": 1.410235098551413, "grad_norm": 3.754737138748169, "learning_rate": 1.2337289201912429e-05, "loss": 1.381, "step": 35700 }, { "epoch": 1.4141929866223384, "grad_norm": 3.7933449745178223, "learning_rate": 1.2322906999530811e-05, "loss": 1.3943, "step": 35800 }, { "epoch": 1.4181508746932636, "grad_norm": 3.108177900314331, "learning_rate": 1.2308494493263014e-05, "loss": 1.4127, "step": 35900 }, { "epoch": 1.422108762764189, "grad_norm": 3.393486499786377, "learning_rate": 1.2294051773667482e-05, "loss": 1.3921, "step": 36000 }, { "epoch": 1.4260666508351143, "grad_norm": 3.9485793113708496, "learning_rate": 1.22795789314925e-05, "loss": 1.42, "step": 36100 }, { "epoch": 1.4300245389060398, "grad_norm": 3.353940725326538, "learning_rate": 1.2265076057675615e-05, "loss": 1.412, "step": 36200 }, { "epoch": 1.4339824269769652, "grad_norm": 3.861928939819336, "learning_rate": 1.2250543243343082e-05, "loss": 1.3952, "step": 36300 }, { "epoch": 1.4379403150478904, "grad_norm": 2.9782791137695312, "learning_rate": 1.2235980579809283e-05, "loss": 1.3872, "step": 36400 }, { "epoch": 1.4418982031188157, "grad_norm": 3.552558660507202, "learning_rate": 1.2221388158576142e-05, "loss": 1.3855, "step": 36500 }, { "epoch": 1.4458560911897411, "grad_norm": 3.034158229827881, "learning_rate": 1.2206766071332568e-05, "loss": 1.4028, "step": 36600 }, { "epoch": 1.4498139792606666, "grad_norm": 3.669677495956421, "learning_rate": 1.219211440995387e-05, "loss": 1.3865, "step": 36700 }, { "epoch": 1.4537718673315918, "grad_norm": 3.4838759899139404, "learning_rate": 1.2177433266501182e-05, "loss": 1.3987, "step": 36800 }, { "epoch": 1.4577297554025173, "grad_norm": 3.4691314697265625, "learning_rate": 1.2162722733220877e-05, "loss": 1.3939, "step": 36900 }, { "epoch": 1.4616876434734425, "grad_norm": 3.5910284519195557, "learning_rate": 1.2147982902544004e-05, "loss": 1.3972, "step": 37000 }, { "epoch": 1.465645531544368, "grad_norm": 3.2121059894561768, "learning_rate": 1.2133213867085686e-05, "loss": 1.3762, "step": 37100 }, { "epoch": 1.4696034196152934, "grad_norm": 3.8289687633514404, "learning_rate": 1.2118415719644557e-05, "loss": 1.4032, "step": 37200 }, { "epoch": 1.4735613076862186, "grad_norm": 3.597191095352173, "learning_rate": 1.2103588553202167e-05, "loss": 1.3925, "step": 37300 }, { "epoch": 1.4775191957571439, "grad_norm": 3.4253151416778564, "learning_rate": 1.2088732460922407e-05, "loss": 1.3715, "step": 37400 }, { "epoch": 1.4814770838280693, "grad_norm": 3.624340772628784, "learning_rate": 1.2073847536150912e-05, "loss": 1.387, "step": 37500 }, { "epoch": 1.4854349718989948, "grad_norm": 3.2783761024475098, "learning_rate": 1.2058933872414484e-05, "loss": 1.3837, "step": 37600 }, { "epoch": 1.48939285996992, "grad_norm": 3.377274990081787, "learning_rate": 1.2043991563420501e-05, "loss": 1.3515, "step": 37700 }, { "epoch": 1.4933507480408454, "grad_norm": 3.676497459411621, "learning_rate": 1.2029020703056327e-05, "loss": 1.3647, "step": 37800 }, { "epoch": 1.4973086361117707, "grad_norm": 4.441483020782471, "learning_rate": 1.2014021385388727e-05, "loss": 1.3594, "step": 37900 }, { "epoch": 1.5012665241826961, "grad_norm": 4.011296272277832, "learning_rate": 1.1998993704663267e-05, "loss": 1.3855, "step": 38000 }, { "epoch": 1.5052244122536216, "grad_norm": 3.3986339569091797, "learning_rate": 1.1983937755303735e-05, "loss": 1.3549, "step": 38100 }, { "epoch": 1.5091823003245468, "grad_norm": 3.2860589027404785, "learning_rate": 1.1968853631911532e-05, "loss": 1.3481, "step": 38200 }, { "epoch": 1.513140188395472, "grad_norm": 4.623264789581299, "learning_rate": 1.1953741429265089e-05, "loss": 1.3609, "step": 38300 }, { "epoch": 1.5170980764663975, "grad_norm": 3.3357603549957275, "learning_rate": 1.1938601242319269e-05, "loss": 1.373, "step": 38400 }, { "epoch": 1.521055964537323, "grad_norm": 3.6516709327697754, "learning_rate": 1.1923433166204768e-05, "loss": 1.3568, "step": 38500 }, { "epoch": 1.5250138526082484, "grad_norm": 4.045721530914307, "learning_rate": 1.1908237296227522e-05, "loss": 1.3419, "step": 38600 }, { "epoch": 1.5289717406791736, "grad_norm": 3.331871271133423, "learning_rate": 1.1893013727868098e-05, "loss": 1.3575, "step": 38700 }, { "epoch": 1.5329296287500989, "grad_norm": 3.959519624710083, "learning_rate": 1.1877762556781109e-05, "loss": 1.3464, "step": 38800 }, { "epoch": 1.5368875168210243, "grad_norm": 4.424190998077393, "learning_rate": 1.1862483878794596e-05, "loss": 1.3593, "step": 38900 }, { "epoch": 1.5408454048919498, "grad_norm": 4.052654266357422, "learning_rate": 1.1847177789909441e-05, "loss": 1.3474, "step": 39000 }, { "epoch": 1.544803292962875, "grad_norm": 3.552598237991333, "learning_rate": 1.1831844386298758e-05, "loss": 1.346, "step": 39100 }, { "epoch": 1.5487611810338002, "grad_norm": 3.979213237762451, "learning_rate": 1.1816483764307286e-05, "loss": 1.3557, "step": 39200 }, { "epoch": 1.5527190691047257, "grad_norm": 4.073390960693359, "learning_rate": 1.1801096020450786e-05, "loss": 1.3658, "step": 39300 }, { "epoch": 1.5566769571756511, "grad_norm": 4.211179256439209, "learning_rate": 1.1785681251415431e-05, "loss": 1.346, "step": 39400 }, { "epoch": 1.5606348452465766, "grad_norm": 3.6185340881347656, "learning_rate": 1.177023955405721e-05, "loss": 1.3686, "step": 39500 }, { "epoch": 1.5645927333175018, "grad_norm": 3.7389111518859863, "learning_rate": 1.1754771025401307e-05, "loss": 1.3536, "step": 39600 }, { "epoch": 1.568550621388427, "grad_norm": 4.2574357986450195, "learning_rate": 1.1739275762641494e-05, "loss": 1.352, "step": 39700 }, { "epoch": 1.5725085094593525, "grad_norm": 3.516805410385132, "learning_rate": 1.1723753863139529e-05, "loss": 1.3411, "step": 39800 }, { "epoch": 1.576466397530278, "grad_norm": 3.5958383083343506, "learning_rate": 1.1708205424424521e-05, "loss": 1.3433, "step": 39900 }, { "epoch": 1.5804242856012032, "grad_norm": 3.995814323425293, "learning_rate": 1.1692630544192354e-05, "loss": 1.3529, "step": 40000 }, { "epoch": 1.5843821736721284, "grad_norm": 3.817218780517578, "learning_rate": 1.1677029320305041e-05, "loss": 1.3469, "step": 40100 }, { "epoch": 1.5883400617430539, "grad_norm": 4.439276695251465, "learning_rate": 1.1661401850790119e-05, "loss": 1.3466, "step": 40200 }, { "epoch": 1.5922979498139793, "grad_norm": 3.7181553840637207, "learning_rate": 1.1645748233840044e-05, "loss": 1.3476, "step": 40300 }, { "epoch": 1.5962558378849048, "grad_norm": 10.218334197998047, "learning_rate": 1.1630068567811557e-05, "loss": 1.3602, "step": 40400 }, { "epoch": 1.60021372595583, "grad_norm": 4.133950710296631, "learning_rate": 1.1614362951225075e-05, "loss": 1.3485, "step": 40500 }, { "epoch": 1.6041716140267552, "grad_norm": 4.007839202880859, "learning_rate": 1.1598631482764074e-05, "loss": 1.3594, "step": 40600 }, { "epoch": 1.6081295020976807, "grad_norm": 4.194820404052734, "learning_rate": 1.1582874261274463e-05, "loss": 1.3383, "step": 40700 }, { "epoch": 1.6120873901686061, "grad_norm": 4.193638801574707, "learning_rate": 1.1567091385763965e-05, "loss": 1.3715, "step": 40800 }, { "epoch": 1.6160452782395314, "grad_norm": 3.5800745487213135, "learning_rate": 1.15512829554015e-05, "loss": 1.3494, "step": 40900 }, { "epoch": 1.6200031663104566, "grad_norm": 4.548177719116211, "learning_rate": 1.1535449069516552e-05, "loss": 1.3442, "step": 41000 }, { "epoch": 1.623961054381382, "grad_norm": 3.7496066093444824, "learning_rate": 1.1519589827598553e-05, "loss": 1.329, "step": 41100 }, { "epoch": 1.6279189424523075, "grad_norm": 4.815052032470703, "learning_rate": 1.1503705329296252e-05, "loss": 1.32, "step": 41200 }, { "epoch": 1.631876830523233, "grad_norm": 4.1927103996276855, "learning_rate": 1.14877956744171e-05, "loss": 1.3172, "step": 41300 }, { "epoch": 1.6358347185941582, "grad_norm": 3.9431440830230713, "learning_rate": 1.1471860962926604e-05, "loss": 1.3271, "step": 41400 }, { "epoch": 1.6397926066650834, "grad_norm": 4.615567684173584, "learning_rate": 1.1455901294947722e-05, "loss": 1.348, "step": 41500 }, { "epoch": 1.6437504947360089, "grad_norm": 4.151221752166748, "learning_rate": 1.143991677076021e-05, "loss": 1.3336, "step": 41600 }, { "epoch": 1.6477083828069343, "grad_norm": 4.409358978271484, "learning_rate": 1.142390749080001e-05, "loss": 1.3254, "step": 41700 }, { "epoch": 1.6516662708778596, "grad_norm": 4.490970134735107, "learning_rate": 1.140787355565861e-05, "loss": 1.3303, "step": 41800 }, { "epoch": 1.6556241589487848, "grad_norm": 4.116312026977539, "learning_rate": 1.1391815066082418e-05, "loss": 1.3315, "step": 41900 }, { "epoch": 1.6595820470197102, "grad_norm": 4.251399517059326, "learning_rate": 1.1375732122972124e-05, "loss": 1.3243, "step": 42000 }, { "epoch": 1.6635399350906357, "grad_norm": 4.184506416320801, "learning_rate": 1.1359624827382062e-05, "loss": 1.3226, "step": 42100 }, { "epoch": 1.6674978231615611, "grad_norm": 4.628664970397949, "learning_rate": 1.134349328051959e-05, "loss": 1.3399, "step": 42200 }, { "epoch": 1.6714557112324864, "grad_norm": 4.529860496520996, "learning_rate": 1.132733758374444e-05, "loss": 1.2967, "step": 42300 }, { "epoch": 1.6754135993034116, "grad_norm": 4.5048699378967285, "learning_rate": 1.1311157838568083e-05, "loss": 1.3255, "step": 42400 }, { "epoch": 1.679371487374337, "grad_norm": 4.321528911590576, "learning_rate": 1.1294954146653094e-05, "loss": 1.311, "step": 42500 }, { "epoch": 1.6833293754452625, "grad_norm": 4.919022083282471, "learning_rate": 1.1278726609812523e-05, "loss": 1.3219, "step": 42600 }, { "epoch": 1.6872872635161877, "grad_norm": 4.146111965179443, "learning_rate": 1.126247533000923e-05, "loss": 1.298, "step": 42700 }, { "epoch": 1.691245151587113, "grad_norm": 4.071747779846191, "learning_rate": 1.1246200409355271e-05, "loss": 1.313, "step": 42800 }, { "epoch": 1.6952030396580384, "grad_norm": 3.8871426582336426, "learning_rate": 1.1229901950111245e-05, "loss": 1.3176, "step": 42900 }, { "epoch": 1.6991609277289639, "grad_norm": 3.9479401111602783, "learning_rate": 1.1213580054685644e-05, "loss": 1.3112, "step": 43000 }, { "epoch": 1.7031188157998893, "grad_norm": 4.039346694946289, "learning_rate": 1.1197234825634222e-05, "loss": 1.3109, "step": 43100 }, { "epoch": 1.7070767038708146, "grad_norm": 4.356393814086914, "learning_rate": 1.1180866365659346e-05, "loss": 1.3202, "step": 43200 }, { "epoch": 1.7110345919417398, "grad_norm": 4.11832857131958, "learning_rate": 1.1164474777609351e-05, "loss": 1.313, "step": 43300 }, { "epoch": 1.7149924800126652, "grad_norm": 4.101129531860352, "learning_rate": 1.1148060164477887e-05, "loss": 1.2968, "step": 43400 }, { "epoch": 1.7189503680835907, "grad_norm": 3.728778600692749, "learning_rate": 1.1131622629403289e-05, "loss": 1.2869, "step": 43500 }, { "epoch": 1.722908256154516, "grad_norm": 3.846654176712036, "learning_rate": 1.1115162275667909e-05, "loss": 1.3054, "step": 43600 }, { "epoch": 1.7268661442254412, "grad_norm": 4.967803478240967, "learning_rate": 1.1098679206697474e-05, "loss": 1.3165, "step": 43700 }, { "epoch": 1.7308240322963666, "grad_norm": 4.610755443572998, "learning_rate": 1.1082173526060454e-05, "loss": 1.3234, "step": 43800 }, { "epoch": 1.734781920367292, "grad_norm": 4.377742290496826, "learning_rate": 1.1065645337467375e-05, "loss": 1.3261, "step": 43900 }, { "epoch": 1.7387398084382175, "grad_norm": 5.010995864868164, "learning_rate": 1.1049094744770201e-05, "loss": 1.3026, "step": 44000 }, { "epoch": 1.7426976965091427, "grad_norm": 4.31913423538208, "learning_rate": 1.1032521851961665e-05, "loss": 1.2697, "step": 44100 }, { "epoch": 1.746655584580068, "grad_norm": 4.2657060623168945, "learning_rate": 1.1015926763174617e-05, "loss": 1.3101, "step": 44200 }, { "epoch": 1.7506134726509934, "grad_norm": 3.801684617996216, "learning_rate": 1.0999309582681372e-05, "loss": 1.268, "step": 44300 }, { "epoch": 1.7545713607219189, "grad_norm": 4.505929946899414, "learning_rate": 1.0982670414893057e-05, "loss": 1.2957, "step": 44400 }, { "epoch": 1.7585292487928441, "grad_norm": 3.837562084197998, "learning_rate": 1.0966009364358948e-05, "loss": 1.3078, "step": 44500 }, { "epoch": 1.7624871368637693, "grad_norm": 4.854923248291016, "learning_rate": 1.0949326535765823e-05, "loss": 1.3119, "step": 44600 }, { "epoch": 1.7664450249346948, "grad_norm": 5.621912002563477, "learning_rate": 1.0932622033937294e-05, "loss": 1.2847, "step": 44700 }, { "epoch": 1.7704029130056202, "grad_norm": 4.009350776672363, "learning_rate": 1.0915895963833152e-05, "loss": 1.3006, "step": 44800 }, { "epoch": 1.7743608010765457, "grad_norm": 3.6888113021850586, "learning_rate": 1.0899148430548716e-05, "loss": 1.3032, "step": 44900 }, { "epoch": 1.778318689147471, "grad_norm": 4.511534690856934, "learning_rate": 1.0882379539314155e-05, "loss": 1.2615, "step": 45000 }, { "epoch": 1.7822765772183962, "grad_norm": 5.086874485015869, "learning_rate": 1.0865589395493845e-05, "loss": 1.2634, "step": 45100 }, { "epoch": 1.7862344652893216, "grad_norm": 4.865400791168213, "learning_rate": 1.0848778104585692e-05, "loss": 1.2876, "step": 45200 }, { "epoch": 1.790192353360247, "grad_norm": 4.537430763244629, "learning_rate": 1.0831945772220487e-05, "loss": 1.2764, "step": 45300 }, { "epoch": 1.7941502414311723, "grad_norm": 4.448334693908691, "learning_rate": 1.0815092504161214e-05, "loss": 1.2814, "step": 45400 }, { "epoch": 1.7981081295020975, "grad_norm": 4.650451183319092, "learning_rate": 1.0798218406302422e-05, "loss": 1.2819, "step": 45500 }, { "epoch": 1.802066017573023, "grad_norm": 4.031219005584717, "learning_rate": 1.0781323584669524e-05, "loss": 1.2729, "step": 45600 }, { "epoch": 1.8060239056439484, "grad_norm": 4.477336883544922, "learning_rate": 1.0764408145418157e-05, "loss": 1.2586, "step": 45700 }, { "epoch": 1.8099817937148739, "grad_norm": 4.118893146514893, "learning_rate": 1.0747472194833506e-05, "loss": 1.2591, "step": 45800 }, { "epoch": 1.8139396817857991, "grad_norm": 4.766265392303467, "learning_rate": 1.073051583932963e-05, "loss": 1.2693, "step": 45900 }, { "epoch": 1.8178975698567243, "grad_norm": 5.545733451843262, "learning_rate": 1.0713539185448795e-05, "loss": 1.2691, "step": 46000 }, { "epoch": 1.8218554579276498, "grad_norm": 4.723430633544922, "learning_rate": 1.069654233986082e-05, "loss": 1.2582, "step": 46100 }, { "epoch": 1.8258133459985753, "grad_norm": 4.899106025695801, "learning_rate": 1.0679525409362387e-05, "loss": 1.2802, "step": 46200 }, { "epoch": 1.8297712340695005, "grad_norm": 4.531938552856445, "learning_rate": 1.066248850087638e-05, "loss": 1.2683, "step": 46300 }, { "epoch": 1.8337291221404257, "grad_norm": 4.174386501312256, "learning_rate": 1.0645431721451212e-05, "loss": 1.2538, "step": 46400 }, { "epoch": 1.8376870102113512, "grad_norm": 4.827451229095459, "learning_rate": 1.0628355178260147e-05, "loss": 1.2571, "step": 46500 }, { "epoch": 1.8416448982822766, "grad_norm": 4.749929904937744, "learning_rate": 1.0611258978600638e-05, "loss": 1.2759, "step": 46600 }, { "epoch": 1.845602786353202, "grad_norm": 4.659051418304443, "learning_rate": 1.0594143229893643e-05, "loss": 1.2698, "step": 46700 }, { "epoch": 1.8495606744241273, "grad_norm": 4.492525100708008, "learning_rate": 1.057700803968295e-05, "loss": 1.2583, "step": 46800 }, { "epoch": 1.8535185624950525, "grad_norm": 4.04518461227417, "learning_rate": 1.0559853515634509e-05, "loss": 1.268, "step": 46900 }, { "epoch": 1.857476450565978, "grad_norm": 5.029372215270996, "learning_rate": 1.054267976553575e-05, "loss": 1.238, "step": 47000 }, { "epoch": 1.8614343386369034, "grad_norm": 4.29434871673584, "learning_rate": 1.05254868972949e-05, "loss": 1.2431, "step": 47100 }, { "epoch": 1.8653922267078287, "grad_norm": 4.874353408813477, "learning_rate": 1.050827501894032e-05, "loss": 1.2339, "step": 47200 }, { "epoch": 1.869350114778754, "grad_norm": 4.865941047668457, "learning_rate": 1.0491044238619817e-05, "loss": 1.2427, "step": 47300 }, { "epoch": 1.8733080028496794, "grad_norm": 4.548977375030518, "learning_rate": 1.0473794664599957e-05, "loss": 1.2586, "step": 47400 }, { "epoch": 1.8772658909206048, "grad_norm": 5.584561824798584, "learning_rate": 1.0456526405265402e-05, "loss": 1.2388, "step": 47500 }, { "epoch": 1.8812237789915303, "grad_norm": 4.570620536804199, "learning_rate": 1.0439239569118215e-05, "loss": 1.2499, "step": 47600 }, { "epoch": 1.8851816670624555, "grad_norm": 4.9521379470825195, "learning_rate": 1.0421934264777186e-05, "loss": 1.2486, "step": 47700 }, { "epoch": 1.8891395551333807, "grad_norm": 4.744478702545166, "learning_rate": 1.0404610600977141e-05, "loss": 1.2428, "step": 47800 }, { "epoch": 1.8930974432043062, "grad_norm": 4.681623935699463, "learning_rate": 1.0387268686568275e-05, "loss": 1.2577, "step": 47900 }, { "epoch": 1.8970553312752316, "grad_norm": 4.406890392303467, "learning_rate": 1.0369908630515445e-05, "loss": 1.2367, "step": 48000 }, { "epoch": 1.9010132193461569, "grad_norm": 4.650542259216309, "learning_rate": 1.0352530541897507e-05, "loss": 1.243, "step": 48100 }, { "epoch": 1.904971107417082, "grad_norm": 5.0188164710998535, "learning_rate": 1.0335134529906619e-05, "loss": 1.2222, "step": 48200 }, { "epoch": 1.9089289954880075, "grad_norm": 4.498706340789795, "learning_rate": 1.0317720703847554e-05, "loss": 1.2508, "step": 48300 }, { "epoch": 1.912886883558933, "grad_norm": 4.909203052520752, "learning_rate": 1.0300289173137021e-05, "loss": 1.2241, "step": 48400 }, { "epoch": 1.9168447716298584, "grad_norm": 4.85788631439209, "learning_rate": 1.0282840047302967e-05, "loss": 1.2268, "step": 48500 }, { "epoch": 1.9208026597007837, "grad_norm": 4.537557601928711, "learning_rate": 1.0265373435983907e-05, "loss": 1.2515, "step": 48600 }, { "epoch": 1.924760547771709, "grad_norm": 4.660990238189697, "learning_rate": 1.0247889448928208e-05, "loss": 1.2531, "step": 48700 }, { "epoch": 1.9287184358426344, "grad_norm": 4.9931511878967285, "learning_rate": 1.0230388195993424e-05, "loss": 1.2336, "step": 48800 }, { "epoch": 1.9326763239135598, "grad_norm": 4.516580581665039, "learning_rate": 1.0212869787145594e-05, "loss": 1.2292, "step": 48900 }, { "epoch": 1.936634211984485, "grad_norm": 4.5982255935668945, "learning_rate": 1.0195334332458552e-05, "loss": 1.2417, "step": 49000 }, { "epoch": 1.9405921000554103, "grad_norm": 5.071137428283691, "learning_rate": 1.0177781942113238e-05, "loss": 1.2194, "step": 49100 }, { "epoch": 1.9445499881263357, "grad_norm": 4.76341438293457, "learning_rate": 1.0160212726397001e-05, "loss": 1.2207, "step": 49200 }, { "epoch": 1.9485078761972612, "grad_norm": 4.945827484130859, "learning_rate": 1.0142626795702916e-05, "loss": 1.208, "step": 49300 }, { "epoch": 1.9524657642681866, "grad_norm": 5.068126678466797, "learning_rate": 1.0125024260529075e-05, "loss": 1.2372, "step": 49400 }, { "epoch": 1.9564236523391119, "grad_norm": 5.238717079162598, "learning_rate": 1.010740523147791e-05, "loss": 1.2287, "step": 49500 }, { "epoch": 1.960381540410037, "grad_norm": 4.586404800415039, "learning_rate": 1.008976981925548e-05, "loss": 1.2023, "step": 49600 }, { "epoch": 1.9643394284809625, "grad_norm": 5.284154415130615, "learning_rate": 1.0072118134670792e-05, "loss": 1.2254, "step": 49700 }, { "epoch": 1.968297316551888, "grad_norm": 4.639484882354736, "learning_rate": 1.0054450288635098e-05, "loss": 1.2298, "step": 49800 }, { "epoch": 1.9722552046228132, "grad_norm": 5.642242908477783, "learning_rate": 1.003676639216119e-05, "loss": 1.2196, "step": 49900 }, { "epoch": 1.9762130926937385, "grad_norm": 5.445943355560303, "learning_rate": 1.0019066556362718e-05, "loss": 1.2035, "step": 50000 }, { "epoch": 1.980170980764664, "grad_norm": 5.127743244171143, "learning_rate": 1.000135089245348e-05, "loss": 1.2281, "step": 50100 }, { "epoch": 1.9841288688355894, "grad_norm": 4.978102684020996, "learning_rate": 9.98361951174673e-06, "loss": 1.22, "step": 50200 }, { "epoch": 1.9880867569065148, "grad_norm": 4.681839466094971, "learning_rate": 9.965872525654468e-06, "loss": 1.2011, "step": 50300 }, { "epoch": 1.99204464497744, "grad_norm": 5.249551296234131, "learning_rate": 9.948110045686763e-06, "loss": 1.1931, "step": 50400 }, { "epoch": 1.9960025330483653, "grad_norm": 4.80012845993042, "learning_rate": 9.930332183451022e-06, "loss": 1.2282, "step": 50500 } ], "logging_steps": 100, "max_steps": 126330, "num_input_tokens_seen": 0, "num_train_epochs": 5, "save_steps": 25266, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 5.9517716845075366e+19, "train_batch_size": 2, "trial_name": null, "trial_params": null }