|
{ |
|
"best_metric": null, |
|
"best_model_checkpoint": null, |
|
"epoch": 9.974380871050384, |
|
"eval_steps": 500, |
|
"global_step": 2920, |
|
"is_hyper_param_search": false, |
|
"is_local_process_zero": true, |
|
"is_world_process_zero": true, |
|
"log_history": [ |
|
{ |
|
"epoch": 0.0034158838599487617, |
|
"grad_norm": 3.8125, |
|
"learning_rate": 6.849315068493151e-07, |
|
"loss": 3.0658, |
|
"step": 1 |
|
}, |
|
{ |
|
"epoch": 0.017079419299743808, |
|
"grad_norm": 3.921875, |
|
"learning_rate": 3.4246575342465754e-06, |
|
"loss": 3.0727, |
|
"step": 5 |
|
}, |
|
{ |
|
"epoch": 0.034158838599487616, |
|
"grad_norm": 4.59375, |
|
"learning_rate": 6.849315068493151e-06, |
|
"loss": 3.0381, |
|
"step": 10 |
|
}, |
|
{ |
|
"epoch": 0.05123825789923143, |
|
"grad_norm": 3.375, |
|
"learning_rate": 1.0273972602739726e-05, |
|
"loss": 2.9796, |
|
"step": 15 |
|
}, |
|
{ |
|
"epoch": 0.06831767719897523, |
|
"grad_norm": 2.546875, |
|
"learning_rate": 1.3698630136986302e-05, |
|
"loss": 2.8478, |
|
"step": 20 |
|
}, |
|
{ |
|
"epoch": 0.08539709649871904, |
|
"grad_norm": 2.375, |
|
"learning_rate": 1.7123287671232875e-05, |
|
"loss": 2.7142, |
|
"step": 25 |
|
}, |
|
{ |
|
"epoch": 0.10247651579846286, |
|
"grad_norm": 1.4375, |
|
"learning_rate": 2.0547945205479453e-05, |
|
"loss": 2.5273, |
|
"step": 30 |
|
}, |
|
{ |
|
"epoch": 0.11955593509820667, |
|
"grad_norm": 5.34375, |
|
"learning_rate": 2.3972602739726026e-05, |
|
"loss": 2.3905, |
|
"step": 35 |
|
}, |
|
{ |
|
"epoch": 0.13663535439795046, |
|
"grad_norm": 1.53125, |
|
"learning_rate": 2.7397260273972603e-05, |
|
"loss": 2.2615, |
|
"step": 40 |
|
}, |
|
{ |
|
"epoch": 0.1537147736976943, |
|
"grad_norm": 21.875, |
|
"learning_rate": 3.082191780821918e-05, |
|
"loss": 2.1359, |
|
"step": 45 |
|
}, |
|
{ |
|
"epoch": 0.1707941929974381, |
|
"grad_norm": 15.6875, |
|
"learning_rate": 3.424657534246575e-05, |
|
"loss": 2.0159, |
|
"step": 50 |
|
}, |
|
{ |
|
"epoch": 0.18787361229718189, |
|
"grad_norm": 0.921875, |
|
"learning_rate": 3.767123287671233e-05, |
|
"loss": 1.8994, |
|
"step": 55 |
|
}, |
|
{ |
|
"epoch": 0.2049530315969257, |
|
"grad_norm": 0.8203125, |
|
"learning_rate": 4.1095890410958905e-05, |
|
"loss": 1.7873, |
|
"step": 60 |
|
}, |
|
{ |
|
"epoch": 0.2220324508966695, |
|
"grad_norm": 0.8671875, |
|
"learning_rate": 4.452054794520548e-05, |
|
"loss": 1.6691, |
|
"step": 65 |
|
}, |
|
{ |
|
"epoch": 0.23911187019641333, |
|
"grad_norm": 0.7109375, |
|
"learning_rate": 4.794520547945205e-05, |
|
"loss": 1.5889, |
|
"step": 70 |
|
}, |
|
{ |
|
"epoch": 0.2561912894961571, |
|
"grad_norm": 0.8984375, |
|
"learning_rate": 5.136986301369864e-05, |
|
"loss": 1.5329, |
|
"step": 75 |
|
}, |
|
{ |
|
"epoch": 0.27327070879590093, |
|
"grad_norm": 0.5390625, |
|
"learning_rate": 5.479452054794521e-05, |
|
"loss": 1.4749, |
|
"step": 80 |
|
}, |
|
{ |
|
"epoch": 0.29035012809564475, |
|
"grad_norm": 0.31640625, |
|
"learning_rate": 5.821917808219178e-05, |
|
"loss": 1.438, |
|
"step": 85 |
|
}, |
|
{ |
|
"epoch": 0.3074295473953886, |
|
"grad_norm": 0.404296875, |
|
"learning_rate": 6.164383561643835e-05, |
|
"loss": 1.395, |
|
"step": 90 |
|
}, |
|
{ |
|
"epoch": 0.32450896669513235, |
|
"grad_norm": 0.2578125, |
|
"learning_rate": 6.506849315068494e-05, |
|
"loss": 1.3653, |
|
"step": 95 |
|
}, |
|
{ |
|
"epoch": 0.3415883859948762, |
|
"grad_norm": 0.46875, |
|
"learning_rate": 6.84931506849315e-05, |
|
"loss": 1.3329, |
|
"step": 100 |
|
}, |
|
{ |
|
"epoch": 0.35866780529462, |
|
"grad_norm": 0.5078125, |
|
"learning_rate": 7.191780821917809e-05, |
|
"loss": 1.3221, |
|
"step": 105 |
|
}, |
|
{ |
|
"epoch": 0.37574722459436377, |
|
"grad_norm": 0.6015625, |
|
"learning_rate": 7.534246575342466e-05, |
|
"loss": 1.3048, |
|
"step": 110 |
|
}, |
|
{ |
|
"epoch": 0.3928266438941076, |
|
"grad_norm": 0.69140625, |
|
"learning_rate": 7.876712328767124e-05, |
|
"loss": 1.2884, |
|
"step": 115 |
|
}, |
|
{ |
|
"epoch": 0.4099060631938514, |
|
"grad_norm": 0.408203125, |
|
"learning_rate": 8.219178082191781e-05, |
|
"loss": 1.2687, |
|
"step": 120 |
|
}, |
|
{ |
|
"epoch": 0.4269854824935952, |
|
"grad_norm": 0.388671875, |
|
"learning_rate": 8.561643835616438e-05, |
|
"loss": 1.2502, |
|
"step": 125 |
|
}, |
|
{ |
|
"epoch": 0.444064901793339, |
|
"grad_norm": 0.5859375, |
|
"learning_rate": 8.904109589041096e-05, |
|
"loss": 1.2416, |
|
"step": 130 |
|
}, |
|
{ |
|
"epoch": 0.46114432109308284, |
|
"grad_norm": 0.3828125, |
|
"learning_rate": 9.246575342465755e-05, |
|
"loss": 1.2345, |
|
"step": 135 |
|
}, |
|
{ |
|
"epoch": 0.47822374039282667, |
|
"grad_norm": 0.7890625, |
|
"learning_rate": 9.58904109589041e-05, |
|
"loss": 1.2381, |
|
"step": 140 |
|
}, |
|
{ |
|
"epoch": 0.49530315969257044, |
|
"grad_norm": 0.5625, |
|
"learning_rate": 9.931506849315069e-05, |
|
"loss": 1.2236, |
|
"step": 145 |
|
}, |
|
{ |
|
"epoch": 0.5123825789923142, |
|
"grad_norm": 0.5703125, |
|
"learning_rate": 0.00010273972602739728, |
|
"loss": 1.2102, |
|
"step": 150 |
|
}, |
|
{ |
|
"epoch": 0.5294619982920581, |
|
"grad_norm": 1.015625, |
|
"learning_rate": 0.00010616438356164384, |
|
"loss": 1.203, |
|
"step": 155 |
|
}, |
|
{ |
|
"epoch": 0.5465414175918019, |
|
"grad_norm": 0.33984375, |
|
"learning_rate": 0.00010958904109589041, |
|
"loss": 1.2011, |
|
"step": 160 |
|
}, |
|
{ |
|
"epoch": 0.5636208368915457, |
|
"grad_norm": 0.267578125, |
|
"learning_rate": 0.000113013698630137, |
|
"loss": 1.193, |
|
"step": 165 |
|
}, |
|
{ |
|
"epoch": 0.5807002561912895, |
|
"grad_norm": 0.6171875, |
|
"learning_rate": 0.00011643835616438356, |
|
"loss": 1.1933, |
|
"step": 170 |
|
}, |
|
{ |
|
"epoch": 0.5977796754910333, |
|
"grad_norm": 0.41015625, |
|
"learning_rate": 0.00011986301369863014, |
|
"loss": 1.1774, |
|
"step": 175 |
|
}, |
|
{ |
|
"epoch": 0.6148590947907772, |
|
"grad_norm": 0.734375, |
|
"learning_rate": 0.0001232876712328767, |
|
"loss": 1.177, |
|
"step": 180 |
|
}, |
|
{ |
|
"epoch": 0.6319385140905209, |
|
"grad_norm": 0.9609375, |
|
"learning_rate": 0.0001267123287671233, |
|
"loss": 1.1705, |
|
"step": 185 |
|
}, |
|
{ |
|
"epoch": 0.6490179333902647, |
|
"grad_norm": 0.44140625, |
|
"learning_rate": 0.00013013698630136988, |
|
"loss": 1.1612, |
|
"step": 190 |
|
}, |
|
{ |
|
"epoch": 0.6660973526900086, |
|
"grad_norm": 0.44140625, |
|
"learning_rate": 0.00013356164383561644, |
|
"loss": 1.167, |
|
"step": 195 |
|
}, |
|
{ |
|
"epoch": 0.6831767719897524, |
|
"grad_norm": 0.353515625, |
|
"learning_rate": 0.000136986301369863, |
|
"loss": 1.1616, |
|
"step": 200 |
|
}, |
|
{ |
|
"epoch": 0.7002561912894961, |
|
"grad_norm": 1.3203125, |
|
"learning_rate": 0.0001404109589041096, |
|
"loss": 1.1553, |
|
"step": 205 |
|
}, |
|
{ |
|
"epoch": 0.71733561058924, |
|
"grad_norm": 0.322265625, |
|
"learning_rate": 0.00014383561643835618, |
|
"loss": 1.1475, |
|
"step": 210 |
|
}, |
|
{ |
|
"epoch": 0.7344150298889838, |
|
"grad_norm": 0.51953125, |
|
"learning_rate": 0.00014726027397260274, |
|
"loss": 1.1482, |
|
"step": 215 |
|
}, |
|
{ |
|
"epoch": 0.7514944491887275, |
|
"grad_norm": 0.671875, |
|
"learning_rate": 0.00015068493150684933, |
|
"loss": 1.1427, |
|
"step": 220 |
|
}, |
|
{ |
|
"epoch": 0.7685738684884714, |
|
"grad_norm": 1.1171875, |
|
"learning_rate": 0.00015410958904109589, |
|
"loss": 1.1441, |
|
"step": 225 |
|
}, |
|
{ |
|
"epoch": 0.7856532877882152, |
|
"grad_norm": 0.6640625, |
|
"learning_rate": 0.00015753424657534247, |
|
"loss": 1.1336, |
|
"step": 230 |
|
}, |
|
{ |
|
"epoch": 0.802732707087959, |
|
"grad_norm": 0.70703125, |
|
"learning_rate": 0.00016095890410958906, |
|
"loss": 1.1315, |
|
"step": 235 |
|
}, |
|
{ |
|
"epoch": 0.8198121263877028, |
|
"grad_norm": 1.0859375, |
|
"learning_rate": 0.00016438356164383562, |
|
"loss": 1.1316, |
|
"step": 240 |
|
}, |
|
{ |
|
"epoch": 0.8368915456874466, |
|
"grad_norm": 1.8671875, |
|
"learning_rate": 0.0001678082191780822, |
|
"loss": 1.1381, |
|
"step": 245 |
|
}, |
|
{ |
|
"epoch": 0.8539709649871904, |
|
"grad_norm": 1.0703125, |
|
"learning_rate": 0.00017123287671232877, |
|
"loss": 1.1376, |
|
"step": 250 |
|
}, |
|
{ |
|
"epoch": 0.8710503842869343, |
|
"grad_norm": 1.328125, |
|
"learning_rate": 0.00017465753424657536, |
|
"loss": 1.1309, |
|
"step": 255 |
|
}, |
|
{ |
|
"epoch": 0.888129803586678, |
|
"grad_norm": 1.375, |
|
"learning_rate": 0.00017808219178082192, |
|
"loss": 1.1282, |
|
"step": 260 |
|
}, |
|
{ |
|
"epoch": 0.9052092228864219, |
|
"grad_norm": 5.78125, |
|
"learning_rate": 0.0001815068493150685, |
|
"loss": 1.1414, |
|
"step": 265 |
|
}, |
|
{ |
|
"epoch": 0.9222886421861657, |
|
"grad_norm": 0.47265625, |
|
"learning_rate": 0.0001849315068493151, |
|
"loss": 1.1403, |
|
"step": 270 |
|
}, |
|
{ |
|
"epoch": 0.9393680614859095, |
|
"grad_norm": 0.326171875, |
|
"learning_rate": 0.00018835616438356165, |
|
"loss": 1.1352, |
|
"step": 275 |
|
}, |
|
{ |
|
"epoch": 0.9564474807856533, |
|
"grad_norm": 0.31640625, |
|
"learning_rate": 0.0001917808219178082, |
|
"loss": 1.1201, |
|
"step": 280 |
|
}, |
|
{ |
|
"epoch": 0.9735269000853971, |
|
"grad_norm": 0.3359375, |
|
"learning_rate": 0.0001952054794520548, |
|
"loss": 1.1145, |
|
"step": 285 |
|
}, |
|
{ |
|
"epoch": 0.9906063193851409, |
|
"grad_norm": 0.31640625, |
|
"learning_rate": 0.00019863013698630139, |
|
"loss": 1.1174, |
|
"step": 290 |
|
}, |
|
{ |
|
"epoch": 0.9974380871050385, |
|
"eval_loss": 2.4481546878814697, |
|
"eval_runtime": 0.5643, |
|
"eval_samples_per_second": 17.72, |
|
"eval_steps_per_second": 1.772, |
|
"step": 292 |
|
}, |
|
{ |
|
"epoch": 1.0076857386848848, |
|
"grad_norm": 0.41796875, |
|
"learning_rate": 0.00019999935692582106, |
|
"loss": 1.1057, |
|
"step": 295 |
|
}, |
|
{ |
|
"epoch": 1.0247651579846284, |
|
"grad_norm": 0.314453125, |
|
"learning_rate": 0.00019999542705801296, |
|
"loss": 1.0972, |
|
"step": 300 |
|
}, |
|
{ |
|
"epoch": 1.0418445772843723, |
|
"grad_norm": 0.412109375, |
|
"learning_rate": 0.00019998792472605885, |
|
"loss": 1.1012, |
|
"step": 305 |
|
}, |
|
{ |
|
"epoch": 1.0589239965841162, |
|
"grad_norm": 0.32421875, |
|
"learning_rate": 0.00019997685019798912, |
|
"loss": 1.0859, |
|
"step": 310 |
|
}, |
|
{ |
|
"epoch": 1.0760034158838598, |
|
"grad_norm": 0.578125, |
|
"learning_rate": 0.00019996220386945537, |
|
"loss": 1.0973, |
|
"step": 315 |
|
}, |
|
{ |
|
"epoch": 1.0930828351836037, |
|
"grad_norm": 0.53515625, |
|
"learning_rate": 0.00019994398626371643, |
|
"loss": 1.0961, |
|
"step": 320 |
|
}, |
|
{ |
|
"epoch": 1.1101622544833476, |
|
"grad_norm": 0.271484375, |
|
"learning_rate": 0.0001999221980316194, |
|
"loss": 1.0901, |
|
"step": 325 |
|
}, |
|
{ |
|
"epoch": 1.1272416737830913, |
|
"grad_norm": 0.546875, |
|
"learning_rate": 0.00019989683995157677, |
|
"loss": 1.0761, |
|
"step": 330 |
|
}, |
|
{ |
|
"epoch": 1.1443210930828351, |
|
"grad_norm": 0.54296875, |
|
"learning_rate": 0.0001998679129295382, |
|
"loss": 1.082, |
|
"step": 335 |
|
}, |
|
{ |
|
"epoch": 1.161400512382579, |
|
"grad_norm": 0.44921875, |
|
"learning_rate": 0.0001998354179989585, |
|
"loss": 1.0788, |
|
"step": 340 |
|
}, |
|
{ |
|
"epoch": 1.1784799316823227, |
|
"grad_norm": 0.263671875, |
|
"learning_rate": 0.00019979935632076048, |
|
"loss": 1.0745, |
|
"step": 345 |
|
}, |
|
{ |
|
"epoch": 1.1955593509820666, |
|
"grad_norm": 0.302734375, |
|
"learning_rate": 0.00019975972918329356, |
|
"loss": 1.0775, |
|
"step": 350 |
|
}, |
|
{ |
|
"epoch": 1.2126387702818104, |
|
"grad_norm": 0.28515625, |
|
"learning_rate": 0.0001997165380022878, |
|
"loss": 1.0761, |
|
"step": 355 |
|
}, |
|
{ |
|
"epoch": 1.2297181895815543, |
|
"grad_norm": 0.3046875, |
|
"learning_rate": 0.00019966978432080316, |
|
"loss": 1.0789, |
|
"step": 360 |
|
}, |
|
{ |
|
"epoch": 1.246797608881298, |
|
"grad_norm": 0.2451171875, |
|
"learning_rate": 0.00019961946980917456, |
|
"loss": 1.0762, |
|
"step": 365 |
|
}, |
|
{ |
|
"epoch": 1.2638770281810419, |
|
"grad_norm": 0.46875, |
|
"learning_rate": 0.00019956559626495212, |
|
"loss": 1.0748, |
|
"step": 370 |
|
}, |
|
{ |
|
"epoch": 1.2809564474807855, |
|
"grad_norm": 0.2734375, |
|
"learning_rate": 0.00019950816561283685, |
|
"loss": 1.0756, |
|
"step": 375 |
|
}, |
|
{ |
|
"epoch": 1.2980358667805294, |
|
"grad_norm": 0.36328125, |
|
"learning_rate": 0.00019944717990461207, |
|
"loss": 1.0694, |
|
"step": 380 |
|
}, |
|
{ |
|
"epoch": 1.3151152860802733, |
|
"grad_norm": 0.259765625, |
|
"learning_rate": 0.00019938264131907, |
|
"loss": 1.0654, |
|
"step": 385 |
|
}, |
|
{ |
|
"epoch": 1.332194705380017, |
|
"grad_norm": 0.375, |
|
"learning_rate": 0.00019931455216193382, |
|
"loss": 1.0645, |
|
"step": 390 |
|
}, |
|
{ |
|
"epoch": 1.3492741246797608, |
|
"grad_norm": 0.64453125, |
|
"learning_rate": 0.00019924291486577559, |
|
"loss": 1.0613, |
|
"step": 395 |
|
}, |
|
{ |
|
"epoch": 1.3663535439795047, |
|
"grad_norm": 0.392578125, |
|
"learning_rate": 0.000199167731989929, |
|
"loss": 1.0689, |
|
"step": 400 |
|
}, |
|
{ |
|
"epoch": 1.3834329632792486, |
|
"grad_norm": 0.39453125, |
|
"learning_rate": 0.00019908900622039822, |
|
"loss": 1.065, |
|
"step": 405 |
|
}, |
|
{ |
|
"epoch": 1.4005123825789922, |
|
"grad_norm": 0.33203125, |
|
"learning_rate": 0.00019900674036976173, |
|
"loss": 1.0668, |
|
"step": 410 |
|
}, |
|
{ |
|
"epoch": 1.4175918018787361, |
|
"grad_norm": 0.32421875, |
|
"learning_rate": 0.0001989209373770719, |
|
"loss": 1.0628, |
|
"step": 415 |
|
}, |
|
{ |
|
"epoch": 1.43467122117848, |
|
"grad_norm": 0.421875, |
|
"learning_rate": 0.00019883160030775016, |
|
"loss": 1.0617, |
|
"step": 420 |
|
}, |
|
{ |
|
"epoch": 1.4517506404782237, |
|
"grad_norm": 0.294921875, |
|
"learning_rate": 0.00019873873235347719, |
|
"loss": 1.0598, |
|
"step": 425 |
|
}, |
|
{ |
|
"epoch": 1.4688300597779675, |
|
"grad_norm": 0.3046875, |
|
"learning_rate": 0.00019864233683207906, |
|
"loss": 1.0536, |
|
"step": 430 |
|
}, |
|
{ |
|
"epoch": 1.4859094790777114, |
|
"grad_norm": 0.251953125, |
|
"learning_rate": 0.0001985424171874087, |
|
"loss": 1.0565, |
|
"step": 435 |
|
}, |
|
{ |
|
"epoch": 1.5029888983774553, |
|
"grad_norm": 0.333984375, |
|
"learning_rate": 0.00019843897698922284, |
|
"loss": 1.0613, |
|
"step": 440 |
|
}, |
|
{ |
|
"epoch": 1.520068317677199, |
|
"grad_norm": 0.32421875, |
|
"learning_rate": 0.0001983320199330545, |
|
"loss": 1.0517, |
|
"step": 445 |
|
}, |
|
{ |
|
"epoch": 1.5371477369769426, |
|
"grad_norm": 0.404296875, |
|
"learning_rate": 0.00019822154984008088, |
|
"loss": 1.0589, |
|
"step": 450 |
|
}, |
|
{ |
|
"epoch": 1.5542271562766867, |
|
"grad_norm": 0.353515625, |
|
"learning_rate": 0.00019810757065698688, |
|
"loss": 1.0517, |
|
"step": 455 |
|
}, |
|
{ |
|
"epoch": 1.5713065755764304, |
|
"grad_norm": 0.373046875, |
|
"learning_rate": 0.0001979900864558242, |
|
"loss": 1.0547, |
|
"step": 460 |
|
}, |
|
{ |
|
"epoch": 1.588385994876174, |
|
"grad_norm": 0.408203125, |
|
"learning_rate": 0.0001978691014338658, |
|
"loss": 1.0537, |
|
"step": 465 |
|
}, |
|
{ |
|
"epoch": 1.6054654141759181, |
|
"grad_norm": 0.48046875, |
|
"learning_rate": 0.00019774461991345577, |
|
"loss": 1.0459, |
|
"step": 470 |
|
}, |
|
{ |
|
"epoch": 1.6225448334756618, |
|
"grad_norm": 0.73046875, |
|
"learning_rate": 0.0001976166463418552, |
|
"loss": 1.0477, |
|
"step": 475 |
|
}, |
|
{ |
|
"epoch": 1.6396242527754057, |
|
"grad_norm": 0.6796875, |
|
"learning_rate": 0.00019748518529108316, |
|
"loss": 1.0472, |
|
"step": 480 |
|
}, |
|
{ |
|
"epoch": 1.6567036720751496, |
|
"grad_norm": 0.58984375, |
|
"learning_rate": 0.0001973502414577533, |
|
"loss": 1.0521, |
|
"step": 485 |
|
}, |
|
{ |
|
"epoch": 1.6737830913748932, |
|
"grad_norm": 0.375, |
|
"learning_rate": 0.00019721181966290613, |
|
"loss": 1.0394, |
|
"step": 490 |
|
}, |
|
{ |
|
"epoch": 1.690862510674637, |
|
"grad_norm": 0.474609375, |
|
"learning_rate": 0.00019706992485183684, |
|
"loss": 1.0328, |
|
"step": 495 |
|
}, |
|
{ |
|
"epoch": 1.707941929974381, |
|
"grad_norm": 0.302734375, |
|
"learning_rate": 0.00019692456209391846, |
|
"loss": 1.0382, |
|
"step": 500 |
|
}, |
|
{ |
|
"epoch": 1.7250213492741246, |
|
"grad_norm": 0.294921875, |
|
"learning_rate": 0.00019677573658242087, |
|
"loss": 1.0418, |
|
"step": 505 |
|
}, |
|
{ |
|
"epoch": 1.7421007685738685, |
|
"grad_norm": 0.3984375, |
|
"learning_rate": 0.0001966234536343253, |
|
"loss": 1.0416, |
|
"step": 510 |
|
}, |
|
{ |
|
"epoch": 1.7591801878736124, |
|
"grad_norm": 0.294921875, |
|
"learning_rate": 0.0001964677186901342, |
|
"loss": 1.0399, |
|
"step": 515 |
|
}, |
|
{ |
|
"epoch": 1.776259607173356, |
|
"grad_norm": 0.30078125, |
|
"learning_rate": 0.00019630853731367713, |
|
"loss": 1.0404, |
|
"step": 520 |
|
}, |
|
{ |
|
"epoch": 1.7933390264731, |
|
"grad_norm": 0.365234375, |
|
"learning_rate": 0.00019614591519191165, |
|
"loss": 1.0349, |
|
"step": 525 |
|
}, |
|
{ |
|
"epoch": 1.8104184457728438, |
|
"grad_norm": 0.39453125, |
|
"learning_rate": 0.00019597985813472052, |
|
"loss": 1.0303, |
|
"step": 530 |
|
}, |
|
{ |
|
"epoch": 1.8274978650725875, |
|
"grad_norm": 0.353515625, |
|
"learning_rate": 0.00019581037207470382, |
|
"loss": 1.0318, |
|
"step": 535 |
|
}, |
|
{ |
|
"epoch": 1.8445772843723314, |
|
"grad_norm": 0.3984375, |
|
"learning_rate": 0.0001956374630669672, |
|
"loss": 1.0386, |
|
"step": 540 |
|
}, |
|
{ |
|
"epoch": 1.8616567036720753, |
|
"grad_norm": 0.359375, |
|
"learning_rate": 0.00019546113728890541, |
|
"loss": 1.0252, |
|
"step": 545 |
|
}, |
|
{ |
|
"epoch": 1.878736122971819, |
|
"grad_norm": 0.3046875, |
|
"learning_rate": 0.00019528140103998177, |
|
"loss": 1.0329, |
|
"step": 550 |
|
}, |
|
{ |
|
"epoch": 1.8958155422715628, |
|
"grad_norm": 0.265625, |
|
"learning_rate": 0.00019509826074150298, |
|
"loss": 1.0385, |
|
"step": 555 |
|
}, |
|
{ |
|
"epoch": 1.9128949615713067, |
|
"grad_norm": 0.2734375, |
|
"learning_rate": 0.00019491172293638968, |
|
"loss": 1.0322, |
|
"step": 560 |
|
}, |
|
{ |
|
"epoch": 1.9299743808710503, |
|
"grad_norm": 0.25, |
|
"learning_rate": 0.00019472179428894288, |
|
"loss": 1.0296, |
|
"step": 565 |
|
}, |
|
{ |
|
"epoch": 1.9470538001707942, |
|
"grad_norm": 0.28515625, |
|
"learning_rate": 0.0001945284815846057, |
|
"loss": 1.0434, |
|
"step": 570 |
|
}, |
|
{ |
|
"epoch": 1.964133219470538, |
|
"grad_norm": 0.2890625, |
|
"learning_rate": 0.00019433179172972102, |
|
"loss": 1.0228, |
|
"step": 575 |
|
}, |
|
{ |
|
"epoch": 1.9812126387702818, |
|
"grad_norm": 0.25390625, |
|
"learning_rate": 0.00019413173175128473, |
|
"loss": 1.0274, |
|
"step": 580 |
|
}, |
|
{ |
|
"epoch": 1.9982920580700256, |
|
"grad_norm": 0.4765625, |
|
"learning_rate": 0.00019392830879669463, |
|
"loss": 1.0252, |
|
"step": 585 |
|
}, |
|
{ |
|
"epoch": 1.9982920580700256, |
|
"eval_loss": 2.451388120651245, |
|
"eval_runtime": 0.5458, |
|
"eval_samples_per_second": 18.323, |
|
"eval_steps_per_second": 1.832, |
|
"step": 585 |
|
}, |
|
{ |
|
"epoch": 2.0153714773697695, |
|
"grad_norm": 0.474609375, |
|
"learning_rate": 0.00019372153013349523, |
|
"loss": 1.0051, |
|
"step": 590 |
|
}, |
|
{ |
|
"epoch": 2.032450896669513, |
|
"grad_norm": 0.365234375, |
|
"learning_rate": 0.00019351140314911795, |
|
"loss": 1.0105, |
|
"step": 595 |
|
}, |
|
{ |
|
"epoch": 2.049530315969257, |
|
"grad_norm": 0.416015625, |
|
"learning_rate": 0.00019329793535061723, |
|
"loss": 1.0135, |
|
"step": 600 |
|
}, |
|
{ |
|
"epoch": 2.066609735269001, |
|
"grad_norm": 0.2890625, |
|
"learning_rate": 0.00019308113436440242, |
|
"loss": 1.0062, |
|
"step": 605 |
|
}, |
|
{ |
|
"epoch": 2.0836891545687446, |
|
"grad_norm": 0.48828125, |
|
"learning_rate": 0.0001928610079359652, |
|
"loss": 1.0019, |
|
"step": 610 |
|
}, |
|
{ |
|
"epoch": 2.1007685738684883, |
|
"grad_norm": 0.330078125, |
|
"learning_rate": 0.00019263756392960294, |
|
"loss": 1.0048, |
|
"step": 615 |
|
}, |
|
{ |
|
"epoch": 2.1178479931682324, |
|
"grad_norm": 0.271484375, |
|
"learning_rate": 0.00019241081032813772, |
|
"loss": 1.0094, |
|
"step": 620 |
|
}, |
|
{ |
|
"epoch": 2.134927412467976, |
|
"grad_norm": 0.265625, |
|
"learning_rate": 0.00019218075523263104, |
|
"loss": 1.0014, |
|
"step": 625 |
|
}, |
|
{ |
|
"epoch": 2.1520068317677197, |
|
"grad_norm": 0.333984375, |
|
"learning_rate": 0.00019194740686209464, |
|
"loss": 1.0085, |
|
"step": 630 |
|
}, |
|
{ |
|
"epoch": 2.1690862510674638, |
|
"grad_norm": 0.40625, |
|
"learning_rate": 0.0001917107735531966, |
|
"loss": 1.0014, |
|
"step": 635 |
|
}, |
|
{ |
|
"epoch": 2.1861656703672074, |
|
"grad_norm": 0.455078125, |
|
"learning_rate": 0.0001914708637599636, |
|
"loss": 1.0056, |
|
"step": 640 |
|
}, |
|
{ |
|
"epoch": 2.203245089666951, |
|
"grad_norm": 0.46875, |
|
"learning_rate": 0.00019122768605347892, |
|
"loss": 0.998, |
|
"step": 645 |
|
}, |
|
{ |
|
"epoch": 2.220324508966695, |
|
"grad_norm": 0.265625, |
|
"learning_rate": 0.00019098124912157632, |
|
"loss": 1.0007, |
|
"step": 650 |
|
}, |
|
{ |
|
"epoch": 2.237403928266439, |
|
"grad_norm": 0.244140625, |
|
"learning_rate": 0.00019073156176852935, |
|
"loss": 1.0046, |
|
"step": 655 |
|
}, |
|
{ |
|
"epoch": 2.2544833475661825, |
|
"grad_norm": 0.30859375, |
|
"learning_rate": 0.00019047863291473717, |
|
"loss": 1.0084, |
|
"step": 660 |
|
}, |
|
{ |
|
"epoch": 2.2715627668659266, |
|
"grad_norm": 0.333984375, |
|
"learning_rate": 0.00019022247159640557, |
|
"loss": 1.0006, |
|
"step": 665 |
|
}, |
|
{ |
|
"epoch": 2.2886421861656703, |
|
"grad_norm": 0.49609375, |
|
"learning_rate": 0.00018996308696522433, |
|
"loss": 1.0057, |
|
"step": 670 |
|
}, |
|
{ |
|
"epoch": 2.305721605465414, |
|
"grad_norm": 0.396484375, |
|
"learning_rate": 0.00018970048828804016, |
|
"loss": 1.0019, |
|
"step": 675 |
|
}, |
|
{ |
|
"epoch": 2.322801024765158, |
|
"grad_norm": 0.443359375, |
|
"learning_rate": 0.0001894346849465257, |
|
"loss": 1.0054, |
|
"step": 680 |
|
}, |
|
{ |
|
"epoch": 2.3398804440649017, |
|
"grad_norm": 0.390625, |
|
"learning_rate": 0.0001891656864368442, |
|
"loss": 1.0021, |
|
"step": 685 |
|
}, |
|
{ |
|
"epoch": 2.3569598633646454, |
|
"grad_norm": 0.28515625, |
|
"learning_rate": 0.00018889350236931055, |
|
"loss": 0.9956, |
|
"step": 690 |
|
}, |
|
{ |
|
"epoch": 2.3740392826643895, |
|
"grad_norm": 0.412109375, |
|
"learning_rate": 0.00018861814246804755, |
|
"loss": 1.0063, |
|
"step": 695 |
|
}, |
|
{ |
|
"epoch": 2.391118701964133, |
|
"grad_norm": 0.27734375, |
|
"learning_rate": 0.00018833961657063885, |
|
"loss": 1.0013, |
|
"step": 700 |
|
}, |
|
{ |
|
"epoch": 2.408198121263877, |
|
"grad_norm": 0.255859375, |
|
"learning_rate": 0.00018805793462777734, |
|
"loss": 0.9951, |
|
"step": 705 |
|
}, |
|
{ |
|
"epoch": 2.425277540563621, |
|
"grad_norm": 0.2490234375, |
|
"learning_rate": 0.0001877731067029096, |
|
"loss": 1.0019, |
|
"step": 710 |
|
}, |
|
{ |
|
"epoch": 2.4423569598633645, |
|
"grad_norm": 0.306640625, |
|
"learning_rate": 0.00018748514297187648, |
|
"loss": 0.995, |
|
"step": 715 |
|
}, |
|
{ |
|
"epoch": 2.4594363791631086, |
|
"grad_norm": 0.31640625, |
|
"learning_rate": 0.00018719405372254948, |
|
"loss": 1.002, |
|
"step": 720 |
|
}, |
|
{ |
|
"epoch": 2.4765157984628523, |
|
"grad_norm": 0.392578125, |
|
"learning_rate": 0.00018689984935446317, |
|
"loss": 0.9942, |
|
"step": 725 |
|
}, |
|
{ |
|
"epoch": 2.493595217762596, |
|
"grad_norm": 0.34765625, |
|
"learning_rate": 0.00018660254037844388, |
|
"loss": 1.0012, |
|
"step": 730 |
|
}, |
|
{ |
|
"epoch": 2.5106746370623396, |
|
"grad_norm": 0.3203125, |
|
"learning_rate": 0.00018630213741623383, |
|
"loss": 1.002, |
|
"step": 735 |
|
}, |
|
{ |
|
"epoch": 2.5277540563620837, |
|
"grad_norm": 0.263671875, |
|
"learning_rate": 0.00018599865120011192, |
|
"loss": 0.9975, |
|
"step": 740 |
|
}, |
|
{ |
|
"epoch": 2.5448334756618274, |
|
"grad_norm": 0.373046875, |
|
"learning_rate": 0.00018569209257251026, |
|
"loss": 0.9996, |
|
"step": 745 |
|
}, |
|
{ |
|
"epoch": 2.561912894961571, |
|
"grad_norm": 0.306640625, |
|
"learning_rate": 0.00018538247248562674, |
|
"loss": 1.0001, |
|
"step": 750 |
|
}, |
|
{ |
|
"epoch": 2.578992314261315, |
|
"grad_norm": 0.345703125, |
|
"learning_rate": 0.00018506980200103375, |
|
"loss": 0.9954, |
|
"step": 755 |
|
}, |
|
{ |
|
"epoch": 2.596071733561059, |
|
"grad_norm": 0.265625, |
|
"learning_rate": 0.00018475409228928312, |
|
"loss": 0.9945, |
|
"step": 760 |
|
}, |
|
{ |
|
"epoch": 2.6131511528608025, |
|
"grad_norm": 0.267578125, |
|
"learning_rate": 0.00018443535462950688, |
|
"loss": 0.9918, |
|
"step": 765 |
|
}, |
|
{ |
|
"epoch": 2.6302305721605466, |
|
"grad_norm": 0.375, |
|
"learning_rate": 0.0001841136004090144, |
|
"loss": 1.0007, |
|
"step": 770 |
|
}, |
|
{ |
|
"epoch": 2.6473099914602902, |
|
"grad_norm": 0.251953125, |
|
"learning_rate": 0.00018378884112288542, |
|
"loss": 1.0026, |
|
"step": 775 |
|
}, |
|
{ |
|
"epoch": 2.664389410760034, |
|
"grad_norm": 0.26953125, |
|
"learning_rate": 0.00018346108837355972, |
|
"loss": 0.995, |
|
"step": 780 |
|
}, |
|
{ |
|
"epoch": 2.681468830059778, |
|
"grad_norm": 0.37109375, |
|
"learning_rate": 0.0001831303538704221, |
|
"loss": 0.9916, |
|
"step": 785 |
|
}, |
|
{ |
|
"epoch": 2.6985482493595216, |
|
"grad_norm": 0.275390625, |
|
"learning_rate": 0.00018279664942938447, |
|
"loss": 0.9902, |
|
"step": 790 |
|
}, |
|
{ |
|
"epoch": 2.7156276686592657, |
|
"grad_norm": 0.2333984375, |
|
"learning_rate": 0.00018245998697246352, |
|
"loss": 1.0003, |
|
"step": 795 |
|
}, |
|
{ |
|
"epoch": 2.7327070879590094, |
|
"grad_norm": 0.28125, |
|
"learning_rate": 0.00018212037852735486, |
|
"loss": 0.9933, |
|
"step": 800 |
|
}, |
|
{ |
|
"epoch": 2.749786507258753, |
|
"grad_norm": 0.283203125, |
|
"learning_rate": 0.00018177783622700327, |
|
"loss": 0.9934, |
|
"step": 805 |
|
}, |
|
{ |
|
"epoch": 2.766865926558497, |
|
"grad_norm": 0.232421875, |
|
"learning_rate": 0.0001814323723091692, |
|
"loss": 0.9887, |
|
"step": 810 |
|
}, |
|
{ |
|
"epoch": 2.783945345858241, |
|
"grad_norm": 0.28125, |
|
"learning_rate": 0.00018108399911599167, |
|
"loss": 0.995, |
|
"step": 815 |
|
}, |
|
{ |
|
"epoch": 2.8010247651579845, |
|
"grad_norm": 0.30078125, |
|
"learning_rate": 0.00018073272909354727, |
|
"loss": 0.9897, |
|
"step": 820 |
|
}, |
|
{ |
|
"epoch": 2.8181041844577286, |
|
"grad_norm": 0.27734375, |
|
"learning_rate": 0.00018037857479140547, |
|
"loss": 0.9923, |
|
"step": 825 |
|
}, |
|
{ |
|
"epoch": 2.8351836037574722, |
|
"grad_norm": 0.31640625, |
|
"learning_rate": 0.00018002154886218033, |
|
"loss": 0.9877, |
|
"step": 830 |
|
}, |
|
{ |
|
"epoch": 2.852263023057216, |
|
"grad_norm": 0.294921875, |
|
"learning_rate": 0.00017966166406107846, |
|
"loss": 0.9936, |
|
"step": 835 |
|
}, |
|
{ |
|
"epoch": 2.86934244235696, |
|
"grad_norm": 0.28125, |
|
"learning_rate": 0.00017929893324544332, |
|
"loss": 0.9931, |
|
"step": 840 |
|
}, |
|
{ |
|
"epoch": 2.8864218616567037, |
|
"grad_norm": 0.359375, |
|
"learning_rate": 0.00017893336937429581, |
|
"loss": 0.992, |
|
"step": 845 |
|
}, |
|
{ |
|
"epoch": 2.9035012809564473, |
|
"grad_norm": 0.26171875, |
|
"learning_rate": 0.00017856498550787144, |
|
"loss": 0.9896, |
|
"step": 850 |
|
}, |
|
{ |
|
"epoch": 2.9205807002561914, |
|
"grad_norm": 0.2451171875, |
|
"learning_rate": 0.0001781937948071536, |
|
"loss": 0.9899, |
|
"step": 855 |
|
}, |
|
{ |
|
"epoch": 2.937660119555935, |
|
"grad_norm": 0.283203125, |
|
"learning_rate": 0.00017781981053340337, |
|
"loss": 0.9869, |
|
"step": 860 |
|
}, |
|
{ |
|
"epoch": 2.9547395388556787, |
|
"grad_norm": 0.43359375, |
|
"learning_rate": 0.00017744304604768588, |
|
"loss": 0.9865, |
|
"step": 865 |
|
}, |
|
{ |
|
"epoch": 2.971818958155423, |
|
"grad_norm": 0.423828125, |
|
"learning_rate": 0.00017706351481039284, |
|
"loss": 0.9885, |
|
"step": 870 |
|
}, |
|
{ |
|
"epoch": 2.9888983774551665, |
|
"grad_norm": 0.3125, |
|
"learning_rate": 0.00017668123038076163, |
|
"loss": 0.988, |
|
"step": 875 |
|
}, |
|
{ |
|
"epoch": 2.9991460290350127, |
|
"eval_loss": 2.468273639678955, |
|
"eval_runtime": 0.5562, |
|
"eval_samples_per_second": 17.978, |
|
"eval_steps_per_second": 1.798, |
|
"step": 878 |
|
}, |
|
{ |
|
"epoch": 3.00597779675491, |
|
"grad_norm": 0.2392578125, |
|
"learning_rate": 0.00017629620641639103, |
|
"loss": 0.9741, |
|
"step": 880 |
|
}, |
|
{ |
|
"epoch": 3.0230572160546543, |
|
"grad_norm": 0.337890625, |
|
"learning_rate": 0.00017590845667275312, |
|
"loss": 0.9621, |
|
"step": 885 |
|
}, |
|
{ |
|
"epoch": 3.040136635354398, |
|
"grad_norm": 0.390625, |
|
"learning_rate": 0.00017551799500270198, |
|
"loss": 0.968, |
|
"step": 890 |
|
}, |
|
{ |
|
"epoch": 3.0572160546541416, |
|
"grad_norm": 0.326171875, |
|
"learning_rate": 0.00017512483535597867, |
|
"loss": 0.9683, |
|
"step": 895 |
|
}, |
|
{ |
|
"epoch": 3.0742954739538857, |
|
"grad_norm": 0.25, |
|
"learning_rate": 0.00017472899177871297, |
|
"loss": 0.9671, |
|
"step": 900 |
|
}, |
|
{ |
|
"epoch": 3.0913748932536294, |
|
"grad_norm": 0.2431640625, |
|
"learning_rate": 0.0001743304784129214, |
|
"loss": 0.9563, |
|
"step": 905 |
|
}, |
|
{ |
|
"epoch": 3.108454312553373, |
|
"grad_norm": 0.294921875, |
|
"learning_rate": 0.00017392930949600217, |
|
"loss": 0.9678, |
|
"step": 910 |
|
}, |
|
{ |
|
"epoch": 3.125533731853117, |
|
"grad_norm": 0.240234375, |
|
"learning_rate": 0.0001735254993602264, |
|
"loss": 0.9594, |
|
"step": 915 |
|
}, |
|
{ |
|
"epoch": 3.1426131511528608, |
|
"grad_norm": 0.283203125, |
|
"learning_rate": 0.00017311906243222614, |
|
"loss": 0.9691, |
|
"step": 920 |
|
}, |
|
{ |
|
"epoch": 3.1596925704526044, |
|
"grad_norm": 0.2578125, |
|
"learning_rate": 0.0001727100132324789, |
|
"loss": 0.9694, |
|
"step": 925 |
|
}, |
|
{ |
|
"epoch": 3.1767719897523485, |
|
"grad_norm": 0.29296875, |
|
"learning_rate": 0.00017229836637478902, |
|
"loss": 0.9678, |
|
"step": 930 |
|
}, |
|
{ |
|
"epoch": 3.193851409052092, |
|
"grad_norm": 0.2470703125, |
|
"learning_rate": 0.00017188413656576534, |
|
"loss": 0.972, |
|
"step": 935 |
|
}, |
|
{ |
|
"epoch": 3.210930828351836, |
|
"grad_norm": 0.267578125, |
|
"learning_rate": 0.00017146733860429612, |
|
"loss": 0.9661, |
|
"step": 940 |
|
}, |
|
{ |
|
"epoch": 3.22801024765158, |
|
"grad_norm": 0.2578125, |
|
"learning_rate": 0.00017104798738101993, |
|
"loss": 0.9567, |
|
"step": 945 |
|
}, |
|
{ |
|
"epoch": 3.2450896669513236, |
|
"grad_norm": 0.28515625, |
|
"learning_rate": 0.00017062609787779403, |
|
"loss": 0.9605, |
|
"step": 950 |
|
}, |
|
{ |
|
"epoch": 3.2621690862510673, |
|
"grad_norm": 0.259765625, |
|
"learning_rate": 0.00017020168516715894, |
|
"loss": 0.9678, |
|
"step": 955 |
|
}, |
|
{ |
|
"epoch": 3.2792485055508114, |
|
"grad_norm": 0.271484375, |
|
"learning_rate": 0.00016977476441179992, |
|
"loss": 0.961, |
|
"step": 960 |
|
}, |
|
{ |
|
"epoch": 3.296327924850555, |
|
"grad_norm": 0.298828125, |
|
"learning_rate": 0.00016934535086400538, |
|
"loss": 0.9657, |
|
"step": 965 |
|
}, |
|
{ |
|
"epoch": 3.313407344150299, |
|
"grad_norm": 0.2734375, |
|
"learning_rate": 0.0001689134598651219, |
|
"loss": 0.9601, |
|
"step": 970 |
|
}, |
|
{ |
|
"epoch": 3.330486763450043, |
|
"grad_norm": 0.2314453125, |
|
"learning_rate": 0.00016847910684500615, |
|
"loss": 0.9652, |
|
"step": 975 |
|
}, |
|
{ |
|
"epoch": 3.3475661827497865, |
|
"grad_norm": 0.298828125, |
|
"learning_rate": 0.0001680423073214737, |
|
"loss": 0.9755, |
|
"step": 980 |
|
}, |
|
{ |
|
"epoch": 3.3646456020495306, |
|
"grad_norm": 0.42578125, |
|
"learning_rate": 0.0001676030768997445, |
|
"loss": 0.9641, |
|
"step": 985 |
|
}, |
|
{ |
|
"epoch": 3.381725021349274, |
|
"grad_norm": 0.6875, |
|
"learning_rate": 0.00016716143127188548, |
|
"loss": 0.9652, |
|
"step": 990 |
|
}, |
|
{ |
|
"epoch": 3.398804440649018, |
|
"grad_norm": 0.400390625, |
|
"learning_rate": 0.0001667173862162499, |
|
"loss": 0.9647, |
|
"step": 995 |
|
}, |
|
{ |
|
"epoch": 3.415883859948762, |
|
"grad_norm": 0.2890625, |
|
"learning_rate": 0.00016627095759691362, |
|
"loss": 0.9689, |
|
"step": 1000 |
|
}, |
|
{ |
|
"epoch": 3.4329632792485056, |
|
"grad_norm": 0.318359375, |
|
"learning_rate": 0.0001658221613631083, |
|
"loss": 0.9656, |
|
"step": 1005 |
|
}, |
|
{ |
|
"epoch": 3.4500426985482493, |
|
"grad_norm": 0.29296875, |
|
"learning_rate": 0.0001653710135486518, |
|
"loss": 0.9601, |
|
"step": 1010 |
|
}, |
|
{ |
|
"epoch": 3.4671221178479934, |
|
"grad_norm": 0.34765625, |
|
"learning_rate": 0.00016491753027137498, |
|
"loss": 0.9669, |
|
"step": 1015 |
|
}, |
|
{ |
|
"epoch": 3.484201537147737, |
|
"grad_norm": 0.302734375, |
|
"learning_rate": 0.00016446172773254629, |
|
"loss": 0.9606, |
|
"step": 1020 |
|
}, |
|
{ |
|
"epoch": 3.5012809564474807, |
|
"grad_norm": 0.26953125, |
|
"learning_rate": 0.00016400362221629264, |
|
"loss": 0.9693, |
|
"step": 1025 |
|
}, |
|
{ |
|
"epoch": 3.518360375747225, |
|
"grad_norm": 0.32421875, |
|
"learning_rate": 0.00016354323008901776, |
|
"loss": 0.9631, |
|
"step": 1030 |
|
}, |
|
{ |
|
"epoch": 3.5354397950469685, |
|
"grad_norm": 0.34375, |
|
"learning_rate": 0.0001630805677988175, |
|
"loss": 0.9601, |
|
"step": 1035 |
|
}, |
|
{ |
|
"epoch": 3.552519214346712, |
|
"grad_norm": 0.24609375, |
|
"learning_rate": 0.0001626156518748922, |
|
"loss": 0.9593, |
|
"step": 1040 |
|
}, |
|
{ |
|
"epoch": 3.5695986336464562, |
|
"grad_norm": 0.21875, |
|
"learning_rate": 0.00016214849892695602, |
|
"loss": 0.9611, |
|
"step": 1045 |
|
}, |
|
{ |
|
"epoch": 3.5866780529462, |
|
"grad_norm": 0.283203125, |
|
"learning_rate": 0.00016167912564464383, |
|
"loss": 0.966, |
|
"step": 1050 |
|
}, |
|
{ |
|
"epoch": 3.6037574722459436, |
|
"grad_norm": 0.267578125, |
|
"learning_rate": 0.00016120754879691464, |
|
"loss": 0.9651, |
|
"step": 1055 |
|
}, |
|
{ |
|
"epoch": 3.6208368915456877, |
|
"grad_norm": 0.330078125, |
|
"learning_rate": 0.0001607337852314527, |
|
"loss": 0.9591, |
|
"step": 1060 |
|
}, |
|
{ |
|
"epoch": 3.6379163108454313, |
|
"grad_norm": 0.27734375, |
|
"learning_rate": 0.00016025785187406553, |
|
"loss": 0.9578, |
|
"step": 1065 |
|
}, |
|
{ |
|
"epoch": 3.654995730145175, |
|
"grad_norm": 0.369140625, |
|
"learning_rate": 0.0001597797657280792, |
|
"loss": 0.9691, |
|
"step": 1070 |
|
}, |
|
{ |
|
"epoch": 3.672075149444919, |
|
"grad_norm": 0.25, |
|
"learning_rate": 0.00015929954387373103, |
|
"loss": 0.9591, |
|
"step": 1075 |
|
}, |
|
{ |
|
"epoch": 3.6891545687446627, |
|
"grad_norm": 0.478515625, |
|
"learning_rate": 0.00015881720346755905, |
|
"loss": 0.9629, |
|
"step": 1080 |
|
}, |
|
{ |
|
"epoch": 3.7062339880444064, |
|
"grad_norm": 0.267578125, |
|
"learning_rate": 0.00015833276174178937, |
|
"loss": 0.9564, |
|
"step": 1085 |
|
}, |
|
{ |
|
"epoch": 3.7233134073441505, |
|
"grad_norm": 0.2578125, |
|
"learning_rate": 0.00015784623600372042, |
|
"loss": 0.9644, |
|
"step": 1090 |
|
}, |
|
{ |
|
"epoch": 3.740392826643894, |
|
"grad_norm": 0.3203125, |
|
"learning_rate": 0.0001573576436351046, |
|
"loss": 0.9592, |
|
"step": 1095 |
|
}, |
|
{ |
|
"epoch": 3.757472245943638, |
|
"grad_norm": 0.2890625, |
|
"learning_rate": 0.00015686700209152738, |
|
"loss": 0.9709, |
|
"step": 1100 |
|
}, |
|
{ |
|
"epoch": 3.774551665243382, |
|
"grad_norm": 1.3828125, |
|
"learning_rate": 0.00015637432890178353, |
|
"loss": 0.9658, |
|
"step": 1105 |
|
}, |
|
{ |
|
"epoch": 3.7916310845431256, |
|
"grad_norm": 0.314453125, |
|
"learning_rate": 0.00015587964166725095, |
|
"loss": 0.9621, |
|
"step": 1110 |
|
}, |
|
{ |
|
"epoch": 3.8087105038428692, |
|
"grad_norm": 0.2353515625, |
|
"learning_rate": 0.00015538295806126205, |
|
"loss": 0.9648, |
|
"step": 1115 |
|
}, |
|
{ |
|
"epoch": 3.8257899231426133, |
|
"grad_norm": 0.306640625, |
|
"learning_rate": 0.00015488429582847192, |
|
"loss": 0.9647, |
|
"step": 1120 |
|
}, |
|
{ |
|
"epoch": 3.842869342442357, |
|
"grad_norm": 0.27734375, |
|
"learning_rate": 0.0001543836727842248, |
|
"loss": 0.9569, |
|
"step": 1125 |
|
}, |
|
{ |
|
"epoch": 3.8599487617421007, |
|
"grad_norm": 0.318359375, |
|
"learning_rate": 0.00015388110681391725, |
|
"loss": 0.9615, |
|
"step": 1130 |
|
}, |
|
{ |
|
"epoch": 3.8770281810418448, |
|
"grad_norm": 0.2412109375, |
|
"learning_rate": 0.00015337661587235953, |
|
"loss": 0.9561, |
|
"step": 1135 |
|
}, |
|
{ |
|
"epoch": 3.8941076003415884, |
|
"grad_norm": 0.2275390625, |
|
"learning_rate": 0.0001528702179831338, |
|
"loss": 0.9686, |
|
"step": 1140 |
|
}, |
|
{ |
|
"epoch": 3.911187019641332, |
|
"grad_norm": 0.28515625, |
|
"learning_rate": 0.00015236193123795041, |
|
"loss": 0.959, |
|
"step": 1145 |
|
}, |
|
{ |
|
"epoch": 3.928266438941076, |
|
"grad_norm": 0.25, |
|
"learning_rate": 0.00015185177379600152, |
|
"loss": 0.9545, |
|
"step": 1150 |
|
}, |
|
{ |
|
"epoch": 3.94534585824082, |
|
"grad_norm": 0.322265625, |
|
"learning_rate": 0.00015133976388331227, |
|
"loss": 0.9626, |
|
"step": 1155 |
|
}, |
|
{ |
|
"epoch": 3.9624252775405635, |
|
"grad_norm": 0.26171875, |
|
"learning_rate": 0.00015082591979208976, |
|
"loss": 0.9595, |
|
"step": 1160 |
|
}, |
|
{ |
|
"epoch": 3.9795046968403076, |
|
"grad_norm": 0.24609375, |
|
"learning_rate": 0.00015031025988006936, |
|
"loss": 0.959, |
|
"step": 1165 |
|
}, |
|
{ |
|
"epoch": 3.9965841161400513, |
|
"grad_norm": 0.28515625, |
|
"learning_rate": 0.000149792802569859, |
|
"loss": 0.9741, |
|
"step": 1170 |
|
}, |
|
{ |
|
"epoch": 4.0, |
|
"eval_loss": 2.4999709129333496, |
|
"eval_runtime": 0.5563, |
|
"eval_samples_per_second": 17.977, |
|
"eval_steps_per_second": 1.798, |
|
"step": 1171 |
|
}, |
|
{ |
|
"epoch": 4.013663535439795, |
|
"grad_norm": 0.419921875, |
|
"learning_rate": 0.00014927356634828094, |
|
"loss": 0.943, |
|
"step": 1175 |
|
}, |
|
{ |
|
"epoch": 4.030742954739539, |
|
"grad_norm": 0.314453125, |
|
"learning_rate": 0.00014875256976571135, |
|
"loss": 0.9301, |
|
"step": 1180 |
|
}, |
|
{ |
|
"epoch": 4.047822374039282, |
|
"grad_norm": 0.25, |
|
"learning_rate": 0.00014822983143541752, |
|
"loss": 0.9339, |
|
"step": 1185 |
|
}, |
|
{ |
|
"epoch": 4.064901793339026, |
|
"grad_norm": 0.3125, |
|
"learning_rate": 0.0001477053700328929, |
|
"loss": 0.9284, |
|
"step": 1190 |
|
}, |
|
{ |
|
"epoch": 4.0819812126387705, |
|
"grad_norm": 0.30078125, |
|
"learning_rate": 0.00014717920429518984, |
|
"loss": 0.9403, |
|
"step": 1195 |
|
}, |
|
{ |
|
"epoch": 4.099060631938514, |
|
"grad_norm": 0.3203125, |
|
"learning_rate": 0.00014665135302025035, |
|
"loss": 0.936, |
|
"step": 1200 |
|
}, |
|
{ |
|
"epoch": 4.116140051238258, |
|
"grad_norm": 0.306640625, |
|
"learning_rate": 0.00014612183506623432, |
|
"loss": 0.9361, |
|
"step": 1205 |
|
}, |
|
{ |
|
"epoch": 4.133219470538002, |
|
"grad_norm": 0.28515625, |
|
"learning_rate": 0.00014559066935084588, |
|
"loss": 0.9354, |
|
"step": 1210 |
|
}, |
|
{ |
|
"epoch": 4.150298889837745, |
|
"grad_norm": 0.380859375, |
|
"learning_rate": 0.0001450578748506576, |
|
"loss": 0.9339, |
|
"step": 1215 |
|
}, |
|
{ |
|
"epoch": 4.167378309137489, |
|
"grad_norm": 0.248046875, |
|
"learning_rate": 0.00014452347060043237, |
|
"loss": 0.9319, |
|
"step": 1220 |
|
}, |
|
{ |
|
"epoch": 4.184457728437233, |
|
"grad_norm": 0.330078125, |
|
"learning_rate": 0.00014398747569244354, |
|
"loss": 0.9403, |
|
"step": 1225 |
|
}, |
|
{ |
|
"epoch": 4.2015371477369765, |
|
"grad_norm": 0.2578125, |
|
"learning_rate": 0.00014344990927579268, |
|
"loss": 0.9368, |
|
"step": 1230 |
|
}, |
|
{ |
|
"epoch": 4.218616567036721, |
|
"grad_norm": 0.294921875, |
|
"learning_rate": 0.00014291079055572554, |
|
"loss": 0.9327, |
|
"step": 1235 |
|
}, |
|
{ |
|
"epoch": 4.235695986336465, |
|
"grad_norm": 0.24609375, |
|
"learning_rate": 0.0001423701387929459, |
|
"loss": 0.942, |
|
"step": 1240 |
|
}, |
|
{ |
|
"epoch": 4.252775405636209, |
|
"grad_norm": 0.296875, |
|
"learning_rate": 0.0001418279733029274, |
|
"loss": 0.9416, |
|
"step": 1245 |
|
}, |
|
{ |
|
"epoch": 4.269854824935952, |
|
"grad_norm": 0.357421875, |
|
"learning_rate": 0.0001412843134552235, |
|
"loss": 0.9365, |
|
"step": 1250 |
|
}, |
|
{ |
|
"epoch": 4.286934244235696, |
|
"grad_norm": 0.2421875, |
|
"learning_rate": 0.00014073917867277557, |
|
"loss": 0.9334, |
|
"step": 1255 |
|
}, |
|
{ |
|
"epoch": 4.304013663535439, |
|
"grad_norm": 0.322265625, |
|
"learning_rate": 0.00014019258843121893, |
|
"loss": 0.9374, |
|
"step": 1260 |
|
}, |
|
{ |
|
"epoch": 4.3210930828351835, |
|
"grad_norm": 0.34765625, |
|
"learning_rate": 0.0001396445622581869, |
|
"loss": 0.9309, |
|
"step": 1265 |
|
}, |
|
{ |
|
"epoch": 4.3381725021349276, |
|
"grad_norm": 0.2421875, |
|
"learning_rate": 0.0001390951197326134, |
|
"loss": 0.9256, |
|
"step": 1270 |
|
}, |
|
{ |
|
"epoch": 4.355251921434672, |
|
"grad_norm": 0.29296875, |
|
"learning_rate": 0.00013854428048403324, |
|
"loss": 0.9336, |
|
"step": 1275 |
|
}, |
|
{ |
|
"epoch": 4.372331340734415, |
|
"grad_norm": 0.275390625, |
|
"learning_rate": 0.00013799206419188103, |
|
"loss": 0.9441, |
|
"step": 1280 |
|
}, |
|
{ |
|
"epoch": 4.389410760034159, |
|
"grad_norm": 0.232421875, |
|
"learning_rate": 0.00013743849058478808, |
|
"loss": 0.938, |
|
"step": 1285 |
|
}, |
|
{ |
|
"epoch": 4.406490179333902, |
|
"grad_norm": 0.3515625, |
|
"learning_rate": 0.00013688357943987732, |
|
"loss": 0.9389, |
|
"step": 1290 |
|
}, |
|
{ |
|
"epoch": 4.423569598633646, |
|
"grad_norm": 0.29296875, |
|
"learning_rate": 0.00013632735058205706, |
|
"loss": 0.945, |
|
"step": 1295 |
|
}, |
|
{ |
|
"epoch": 4.44064901793339, |
|
"grad_norm": 0.232421875, |
|
"learning_rate": 0.0001357698238833126, |
|
"loss": 0.9378, |
|
"step": 1300 |
|
}, |
|
{ |
|
"epoch": 4.4577284372331345, |
|
"grad_norm": 0.279296875, |
|
"learning_rate": 0.00013521101926199607, |
|
"loss": 0.9378, |
|
"step": 1305 |
|
}, |
|
{ |
|
"epoch": 4.474807856532878, |
|
"grad_norm": 0.291015625, |
|
"learning_rate": 0.0001346509566821153, |
|
"loss": 0.9409, |
|
"step": 1310 |
|
}, |
|
{ |
|
"epoch": 4.491887275832622, |
|
"grad_norm": 0.3125, |
|
"learning_rate": 0.00013408965615262008, |
|
"loss": 0.9363, |
|
"step": 1315 |
|
}, |
|
{ |
|
"epoch": 4.508966695132365, |
|
"grad_norm": 0.265625, |
|
"learning_rate": 0.00013352713772668765, |
|
"loss": 0.9413, |
|
"step": 1320 |
|
}, |
|
{ |
|
"epoch": 4.526046114432109, |
|
"grad_norm": 0.2470703125, |
|
"learning_rate": 0.00013296342150100605, |
|
"loss": 0.9509, |
|
"step": 1325 |
|
}, |
|
{ |
|
"epoch": 4.543125533731853, |
|
"grad_norm": 0.24609375, |
|
"learning_rate": 0.00013239852761505626, |
|
"loss": 0.9361, |
|
"step": 1330 |
|
}, |
|
{ |
|
"epoch": 4.560204953031597, |
|
"grad_norm": 0.255859375, |
|
"learning_rate": 0.00013183247625039282, |
|
"loss": 0.9366, |
|
"step": 1335 |
|
}, |
|
{ |
|
"epoch": 4.577284372331341, |
|
"grad_norm": 0.24609375, |
|
"learning_rate": 0.00013126528762992247, |
|
"loss": 0.9381, |
|
"step": 1340 |
|
}, |
|
{ |
|
"epoch": 4.594363791631085, |
|
"grad_norm": 0.275390625, |
|
"learning_rate": 0.000130696982017182, |
|
"loss": 0.9394, |
|
"step": 1345 |
|
}, |
|
{ |
|
"epoch": 4.611443210930828, |
|
"grad_norm": 0.294921875, |
|
"learning_rate": 0.00013012757971561415, |
|
"loss": 0.9363, |
|
"step": 1350 |
|
}, |
|
{ |
|
"epoch": 4.628522630230572, |
|
"grad_norm": 0.291015625, |
|
"learning_rate": 0.00012955710106784214, |
|
"loss": 0.9323, |
|
"step": 1355 |
|
}, |
|
{ |
|
"epoch": 4.645602049530316, |
|
"grad_norm": 0.294921875, |
|
"learning_rate": 0.00012898556645494325, |
|
"loss": 0.9387, |
|
"step": 1360 |
|
}, |
|
{ |
|
"epoch": 4.66268146883006, |
|
"grad_norm": 0.291015625, |
|
"learning_rate": 0.00012841299629572032, |
|
"loss": 0.935, |
|
"step": 1365 |
|
}, |
|
{ |
|
"epoch": 4.679760888129803, |
|
"grad_norm": 0.359375, |
|
"learning_rate": 0.0001278394110459724, |
|
"loss": 0.9446, |
|
"step": 1370 |
|
}, |
|
{ |
|
"epoch": 4.6968403074295475, |
|
"grad_norm": 0.255859375, |
|
"learning_rate": 0.000127264831197764, |
|
"loss": 0.9372, |
|
"step": 1375 |
|
}, |
|
{ |
|
"epoch": 4.713919726729291, |
|
"grad_norm": 0.275390625, |
|
"learning_rate": 0.0001266892772786929, |
|
"loss": 0.9363, |
|
"step": 1380 |
|
}, |
|
{ |
|
"epoch": 4.730999146029035, |
|
"grad_norm": 0.26171875, |
|
"learning_rate": 0.00012611276985115678, |
|
"loss": 0.9394, |
|
"step": 1385 |
|
}, |
|
{ |
|
"epoch": 4.748078565328779, |
|
"grad_norm": 0.33984375, |
|
"learning_rate": 0.0001255353295116187, |
|
"loss": 0.9438, |
|
"step": 1390 |
|
}, |
|
{ |
|
"epoch": 4.765157984628523, |
|
"grad_norm": 0.267578125, |
|
"learning_rate": 0.00012495697688987112, |
|
"loss": 0.942, |
|
"step": 1395 |
|
}, |
|
{ |
|
"epoch": 4.782237403928266, |
|
"grad_norm": 0.326171875, |
|
"learning_rate": 0.00012437773264829897, |
|
"loss": 0.9436, |
|
"step": 1400 |
|
}, |
|
{ |
|
"epoch": 4.79931682322801, |
|
"grad_norm": 0.2890625, |
|
"learning_rate": 0.0001237976174811414, |
|
"loss": 0.9403, |
|
"step": 1405 |
|
}, |
|
{ |
|
"epoch": 4.816396242527754, |
|
"grad_norm": 0.2578125, |
|
"learning_rate": 0.00012321665211375256, |
|
"loss": 0.9361, |
|
"step": 1410 |
|
}, |
|
{ |
|
"epoch": 4.833475661827498, |
|
"grad_norm": 0.234375, |
|
"learning_rate": 0.00012263485730186103, |
|
"loss": 0.9404, |
|
"step": 1415 |
|
}, |
|
{ |
|
"epoch": 4.850555081127242, |
|
"grad_norm": 0.283203125, |
|
"learning_rate": 0.00012205225383082843, |
|
"loss": 0.9409, |
|
"step": 1420 |
|
}, |
|
{ |
|
"epoch": 4.867634500426986, |
|
"grad_norm": 0.2421875, |
|
"learning_rate": 0.0001214688625149066, |
|
"loss": 0.9351, |
|
"step": 1425 |
|
}, |
|
{ |
|
"epoch": 4.884713919726729, |
|
"grad_norm": 0.2890625, |
|
"learning_rate": 0.00012088470419649432, |
|
"loss": 0.938, |
|
"step": 1430 |
|
}, |
|
{ |
|
"epoch": 4.901793339026473, |
|
"grad_norm": 0.236328125, |
|
"learning_rate": 0.00012029979974539234, |
|
"loss": 0.9425, |
|
"step": 1435 |
|
}, |
|
{ |
|
"epoch": 4.918872758326217, |
|
"grad_norm": 0.240234375, |
|
"learning_rate": 0.00011971417005805818, |
|
"loss": 0.9372, |
|
"step": 1440 |
|
}, |
|
{ |
|
"epoch": 4.9359521776259605, |
|
"grad_norm": 0.2451171875, |
|
"learning_rate": 0.00011912783605685913, |
|
"loss": 0.9399, |
|
"step": 1445 |
|
}, |
|
{ |
|
"epoch": 4.953031596925705, |
|
"grad_norm": 0.251953125, |
|
"learning_rate": 0.0001185408186893251, |
|
"loss": 0.9385, |
|
"step": 1450 |
|
}, |
|
{ |
|
"epoch": 4.970111016225449, |
|
"grad_norm": 0.318359375, |
|
"learning_rate": 0.0001179531389274001, |
|
"loss": 0.9311, |
|
"step": 1455 |
|
}, |
|
{ |
|
"epoch": 4.987190435525192, |
|
"grad_norm": 0.2470703125, |
|
"learning_rate": 0.00011736481776669306, |
|
"loss": 0.9342, |
|
"step": 1460 |
|
}, |
|
{ |
|
"epoch": 4.997438087105039, |
|
"eval_loss": 2.5202550888061523, |
|
"eval_runtime": 0.5609, |
|
"eval_samples_per_second": 17.829, |
|
"eval_steps_per_second": 1.783, |
|
"step": 1463 |
|
}, |
|
{ |
|
"epoch": 5.004269854824936, |
|
"grad_norm": 0.263671875, |
|
"learning_rate": 0.00011677587622572763, |
|
"loss": 0.9354, |
|
"step": 1465 |
|
}, |
|
{ |
|
"epoch": 5.02134927412468, |
|
"grad_norm": 0.28515625, |
|
"learning_rate": 0.00011618633534519141, |
|
"loss": 0.9194, |
|
"step": 1470 |
|
}, |
|
{ |
|
"epoch": 5.038428693424423, |
|
"grad_norm": 0.3046875, |
|
"learning_rate": 0.00011559621618718414, |
|
"loss": 0.9073, |
|
"step": 1475 |
|
}, |
|
{ |
|
"epoch": 5.0555081127241674, |
|
"grad_norm": 0.30078125, |
|
"learning_rate": 0.00011500553983446527, |
|
"loss": 0.9146, |
|
"step": 1480 |
|
}, |
|
{ |
|
"epoch": 5.0725875320239115, |
|
"grad_norm": 0.24609375, |
|
"learning_rate": 0.00011441432738970072, |
|
"loss": 0.9098, |
|
"step": 1485 |
|
}, |
|
{ |
|
"epoch": 5.089666951323655, |
|
"grad_norm": 0.291015625, |
|
"learning_rate": 0.00011382259997470899, |
|
"loss": 0.9135, |
|
"step": 1490 |
|
}, |
|
{ |
|
"epoch": 5.106746370623399, |
|
"grad_norm": 0.294921875, |
|
"learning_rate": 0.00011323037872970657, |
|
"loss": 0.9174, |
|
"step": 1495 |
|
}, |
|
{ |
|
"epoch": 5.123825789923143, |
|
"grad_norm": 0.240234375, |
|
"learning_rate": 0.00011263768481255264, |
|
"loss": 0.9155, |
|
"step": 1500 |
|
}, |
|
{ |
|
"epoch": 5.140905209222886, |
|
"grad_norm": 0.25, |
|
"learning_rate": 0.00011204453939799315, |
|
"loss": 0.9115, |
|
"step": 1505 |
|
}, |
|
{ |
|
"epoch": 5.15798462852263, |
|
"grad_norm": 0.3046875, |
|
"learning_rate": 0.00011145096367690444, |
|
"loss": 0.9112, |
|
"step": 1510 |
|
}, |
|
{ |
|
"epoch": 5.175064047822374, |
|
"grad_norm": 0.26953125, |
|
"learning_rate": 0.0001108569788555361, |
|
"loss": 0.9142, |
|
"step": 1515 |
|
}, |
|
{ |
|
"epoch": 5.192143467122118, |
|
"grad_norm": 0.251953125, |
|
"learning_rate": 0.00011026260615475333, |
|
"loss": 0.9116, |
|
"step": 1520 |
|
}, |
|
{ |
|
"epoch": 5.209222886421862, |
|
"grad_norm": 0.2490234375, |
|
"learning_rate": 0.00010966786680927874, |
|
"loss": 0.9141, |
|
"step": 1525 |
|
}, |
|
{ |
|
"epoch": 5.226302305721606, |
|
"grad_norm": 0.25, |
|
"learning_rate": 0.00010907278206693395, |
|
"loss": 0.9168, |
|
"step": 1530 |
|
}, |
|
{ |
|
"epoch": 5.243381725021349, |
|
"grad_norm": 0.30078125, |
|
"learning_rate": 0.00010847737318788013, |
|
"loss": 0.9216, |
|
"step": 1535 |
|
}, |
|
{ |
|
"epoch": 5.260461144321093, |
|
"grad_norm": 0.283203125, |
|
"learning_rate": 0.00010788166144385888, |
|
"loss": 0.9167, |
|
"step": 1540 |
|
}, |
|
{ |
|
"epoch": 5.277540563620837, |
|
"grad_norm": 0.25390625, |
|
"learning_rate": 0.0001072856681174318, |
|
"loss": 0.9155, |
|
"step": 1545 |
|
}, |
|
{ |
|
"epoch": 5.2946199829205804, |
|
"grad_norm": 0.2734375, |
|
"learning_rate": 0.00010668941450122055, |
|
"loss": 0.9218, |
|
"step": 1550 |
|
}, |
|
{ |
|
"epoch": 5.3116994022203246, |
|
"grad_norm": 0.259765625, |
|
"learning_rate": 0.00010609292189714586, |
|
"loss": 0.9132, |
|
"step": 1555 |
|
}, |
|
{ |
|
"epoch": 5.328778821520069, |
|
"grad_norm": 0.25, |
|
"learning_rate": 0.0001054962116156667, |
|
"loss": 0.9181, |
|
"step": 1560 |
|
}, |
|
{ |
|
"epoch": 5.345858240819812, |
|
"grad_norm": 0.2392578125, |
|
"learning_rate": 0.0001048993049750188, |
|
"loss": 0.9151, |
|
"step": 1565 |
|
}, |
|
{ |
|
"epoch": 5.362937660119556, |
|
"grad_norm": 0.2490234375, |
|
"learning_rate": 0.00010430222330045304, |
|
"loss": 0.9096, |
|
"step": 1570 |
|
}, |
|
{ |
|
"epoch": 5.3800170794193, |
|
"grad_norm": 0.2353515625, |
|
"learning_rate": 0.0001037049879234737, |
|
"loss": 0.9183, |
|
"step": 1575 |
|
}, |
|
{ |
|
"epoch": 5.397096498719043, |
|
"grad_norm": 0.26171875, |
|
"learning_rate": 0.0001031076201810762, |
|
"loss": 0.9151, |
|
"step": 1580 |
|
}, |
|
{ |
|
"epoch": 5.414175918018787, |
|
"grad_norm": 0.2890625, |
|
"learning_rate": 0.00010251014141498484, |
|
"loss": 0.9205, |
|
"step": 1585 |
|
}, |
|
{ |
|
"epoch": 5.4312553373185315, |
|
"grad_norm": 0.294921875, |
|
"learning_rate": 0.00010191257297089052, |
|
"loss": 0.9114, |
|
"step": 1590 |
|
}, |
|
{ |
|
"epoch": 5.448334756618275, |
|
"grad_norm": 0.2392578125, |
|
"learning_rate": 0.00010131493619768788, |
|
"loss": 0.9148, |
|
"step": 1595 |
|
}, |
|
{ |
|
"epoch": 5.465414175918019, |
|
"grad_norm": 0.244140625, |
|
"learning_rate": 0.00010071725244671282, |
|
"loss": 0.9202, |
|
"step": 1600 |
|
}, |
|
{ |
|
"epoch": 5.482493595217763, |
|
"grad_norm": 0.259765625, |
|
"learning_rate": 0.00010011954307097942, |
|
"loss": 0.9216, |
|
"step": 1605 |
|
}, |
|
{ |
|
"epoch": 5.499573014517506, |
|
"grad_norm": 0.251953125, |
|
"learning_rate": 9.952182942441733e-05, |
|
"loss": 0.9206, |
|
"step": 1610 |
|
}, |
|
{ |
|
"epoch": 5.51665243381725, |
|
"grad_norm": 0.265625, |
|
"learning_rate": 9.892413286110886e-05, |
|
"loss": 0.9193, |
|
"step": 1615 |
|
}, |
|
{ |
|
"epoch": 5.533731853116994, |
|
"grad_norm": 0.2431640625, |
|
"learning_rate": 9.83264747345259e-05, |
|
"loss": 0.9195, |
|
"step": 1620 |
|
}, |
|
{ |
|
"epoch": 5.5508112724167376, |
|
"grad_norm": 0.28515625, |
|
"learning_rate": 9.772887639676707e-05, |
|
"loss": 0.9178, |
|
"step": 1625 |
|
}, |
|
{ |
|
"epoch": 5.567890691716482, |
|
"grad_norm": 0.2373046875, |
|
"learning_rate": 9.713135919779515e-05, |
|
"loss": 0.9174, |
|
"step": 1630 |
|
}, |
|
{ |
|
"epoch": 5.584970111016226, |
|
"grad_norm": 0.2890625, |
|
"learning_rate": 9.653394448467399e-05, |
|
"loss": 0.9194, |
|
"step": 1635 |
|
}, |
|
{ |
|
"epoch": 5.602049530315969, |
|
"grad_norm": 0.2578125, |
|
"learning_rate": 9.593665360080599e-05, |
|
"loss": 0.9192, |
|
"step": 1640 |
|
}, |
|
{ |
|
"epoch": 5.619128949615713, |
|
"grad_norm": 0.28515625, |
|
"learning_rate": 9.533950788516974e-05, |
|
"loss": 0.9154, |
|
"step": 1645 |
|
}, |
|
{ |
|
"epoch": 5.636208368915457, |
|
"grad_norm": 0.255859375, |
|
"learning_rate": 9.474252867155732e-05, |
|
"loss": 0.9142, |
|
"step": 1650 |
|
}, |
|
{ |
|
"epoch": 5.6532877882152, |
|
"grad_norm": 0.267578125, |
|
"learning_rate": 9.414573728781247e-05, |
|
"loss": 0.9101, |
|
"step": 1655 |
|
}, |
|
{ |
|
"epoch": 5.6703672075149445, |
|
"grad_norm": 0.28125, |
|
"learning_rate": 9.354915505506839e-05, |
|
"loss": 0.9158, |
|
"step": 1660 |
|
}, |
|
{ |
|
"epoch": 5.687446626814689, |
|
"grad_norm": 0.2578125, |
|
"learning_rate": 9.295280328698604e-05, |
|
"loss": 0.9181, |
|
"step": 1665 |
|
}, |
|
{ |
|
"epoch": 5.704526046114432, |
|
"grad_norm": 0.306640625, |
|
"learning_rate": 9.235670328899293e-05, |
|
"loss": 0.9138, |
|
"step": 1670 |
|
}, |
|
{ |
|
"epoch": 5.721605465414176, |
|
"grad_norm": 0.26171875, |
|
"learning_rate": 9.176087635752156e-05, |
|
"loss": 0.9119, |
|
"step": 1675 |
|
}, |
|
{ |
|
"epoch": 5.73868488471392, |
|
"grad_norm": 0.26171875, |
|
"learning_rate": 9.116534377924883e-05, |
|
"loss": 0.9213, |
|
"step": 1680 |
|
}, |
|
{ |
|
"epoch": 5.755764304013663, |
|
"grad_norm": 0.251953125, |
|
"learning_rate": 9.057012683033555e-05, |
|
"loss": 0.9177, |
|
"step": 1685 |
|
}, |
|
{ |
|
"epoch": 5.772843723313407, |
|
"grad_norm": 0.298828125, |
|
"learning_rate": 8.997524677566627e-05, |
|
"loss": 0.9217, |
|
"step": 1690 |
|
}, |
|
{ |
|
"epoch": 5.789923142613151, |
|
"grad_norm": 0.236328125, |
|
"learning_rate": 8.938072486808952e-05, |
|
"loss": 0.9167, |
|
"step": 1695 |
|
}, |
|
{ |
|
"epoch": 5.807002561912895, |
|
"grad_norm": 0.2392578125, |
|
"learning_rate": 8.878658234765858e-05, |
|
"loss": 0.9207, |
|
"step": 1700 |
|
}, |
|
{ |
|
"epoch": 5.824081981212639, |
|
"grad_norm": 0.365234375, |
|
"learning_rate": 8.81928404408726e-05, |
|
"loss": 0.9173, |
|
"step": 1705 |
|
}, |
|
{ |
|
"epoch": 5.841161400512383, |
|
"grad_norm": 0.298828125, |
|
"learning_rate": 8.759952035991844e-05, |
|
"loss": 0.9192, |
|
"step": 1710 |
|
}, |
|
{ |
|
"epoch": 5.858240819812126, |
|
"grad_norm": 0.26171875, |
|
"learning_rate": 8.70066433019125e-05, |
|
"loss": 0.9178, |
|
"step": 1715 |
|
}, |
|
{ |
|
"epoch": 5.87532023911187, |
|
"grad_norm": 0.2373046875, |
|
"learning_rate": 8.641423044814374e-05, |
|
"loss": 0.9246, |
|
"step": 1720 |
|
}, |
|
{ |
|
"epoch": 5.892399658411614, |
|
"grad_norm": 0.255859375, |
|
"learning_rate": 8.582230296331686e-05, |
|
"loss": 0.9187, |
|
"step": 1725 |
|
}, |
|
{ |
|
"epoch": 5.9094790777113575, |
|
"grad_norm": 0.240234375, |
|
"learning_rate": 8.5230881994796e-05, |
|
"loss": 0.9211, |
|
"step": 1730 |
|
}, |
|
{ |
|
"epoch": 5.926558497011102, |
|
"grad_norm": 0.265625, |
|
"learning_rate": 8.463998867184952e-05, |
|
"loss": 0.9194, |
|
"step": 1735 |
|
}, |
|
{ |
|
"epoch": 5.943637916310846, |
|
"grad_norm": 0.2490234375, |
|
"learning_rate": 8.404964410489485e-05, |
|
"loss": 0.9215, |
|
"step": 1740 |
|
}, |
|
{ |
|
"epoch": 5.960717335610589, |
|
"grad_norm": 0.2412109375, |
|
"learning_rate": 8.34598693847444e-05, |
|
"loss": 0.9144, |
|
"step": 1745 |
|
}, |
|
{ |
|
"epoch": 5.977796754910333, |
|
"grad_norm": 0.287109375, |
|
"learning_rate": 8.287068558185225e-05, |
|
"loss": 0.9175, |
|
"step": 1750 |
|
}, |
|
{ |
|
"epoch": 5.994876174210077, |
|
"grad_norm": 0.2314453125, |
|
"learning_rate": 8.228211374556103e-05, |
|
"loss": 0.9201, |
|
"step": 1755 |
|
}, |
|
{ |
|
"epoch": 5.998292058070025, |
|
"eval_loss": 2.5518593788146973, |
|
"eval_runtime": 0.5597, |
|
"eval_samples_per_second": 17.868, |
|
"eval_steps_per_second": 1.787, |
|
"step": 1756 |
|
}, |
|
{ |
|
"epoch": 6.01195559350982, |
|
"grad_norm": 0.2490234375, |
|
"learning_rate": 8.169417490335007e-05, |
|
"loss": 0.9044, |
|
"step": 1760 |
|
}, |
|
{ |
|
"epoch": 6.029035012809564, |
|
"grad_norm": 0.265625, |
|
"learning_rate": 8.110689006008434e-05, |
|
"loss": 0.8914, |
|
"step": 1765 |
|
}, |
|
{ |
|
"epoch": 6.0461144321093085, |
|
"grad_norm": 0.263671875, |
|
"learning_rate": 8.052028019726371e-05, |
|
"loss": 0.896, |
|
"step": 1770 |
|
}, |
|
{ |
|
"epoch": 6.063193851409052, |
|
"grad_norm": 0.26953125, |
|
"learning_rate": 7.993436627227368e-05, |
|
"loss": 0.9072, |
|
"step": 1775 |
|
}, |
|
{ |
|
"epoch": 6.080273270708796, |
|
"grad_norm": 0.248046875, |
|
"learning_rate": 7.934916921763628e-05, |
|
"loss": 0.8999, |
|
"step": 1780 |
|
}, |
|
{ |
|
"epoch": 6.09735269000854, |
|
"grad_norm": 0.25, |
|
"learning_rate": 7.876470994026254e-05, |
|
"loss": 0.8991, |
|
"step": 1785 |
|
}, |
|
{ |
|
"epoch": 6.114432109308283, |
|
"grad_norm": 0.259765625, |
|
"learning_rate": 7.818100932070546e-05, |
|
"loss": 0.9065, |
|
"step": 1790 |
|
}, |
|
{ |
|
"epoch": 6.131511528608027, |
|
"grad_norm": 0.251953125, |
|
"learning_rate": 7.759808821241406e-05, |
|
"loss": 0.8899, |
|
"step": 1795 |
|
}, |
|
{ |
|
"epoch": 6.148590947907771, |
|
"grad_norm": 0.24609375, |
|
"learning_rate": 7.701596744098818e-05, |
|
"loss": 0.8956, |
|
"step": 1800 |
|
}, |
|
{ |
|
"epoch": 6.165670367207515, |
|
"grad_norm": 0.26171875, |
|
"learning_rate": 7.643466780343479e-05, |
|
"loss": 0.8964, |
|
"step": 1805 |
|
}, |
|
{ |
|
"epoch": 6.182749786507259, |
|
"grad_norm": 0.2470703125, |
|
"learning_rate": 7.585421006742463e-05, |
|
"loss": 0.8985, |
|
"step": 1810 |
|
}, |
|
{ |
|
"epoch": 6.199829205807003, |
|
"grad_norm": 0.251953125, |
|
"learning_rate": 7.527461497055061e-05, |
|
"loss": 0.8979, |
|
"step": 1815 |
|
}, |
|
{ |
|
"epoch": 6.216908625106746, |
|
"grad_norm": 0.26953125, |
|
"learning_rate": 7.469590321958662e-05, |
|
"loss": 0.9014, |
|
"step": 1820 |
|
}, |
|
{ |
|
"epoch": 6.23398804440649, |
|
"grad_norm": 0.25, |
|
"learning_rate": 7.411809548974792e-05, |
|
"loss": 0.9059, |
|
"step": 1825 |
|
}, |
|
{ |
|
"epoch": 6.251067463706234, |
|
"grad_norm": 0.23828125, |
|
"learning_rate": 7.354121242395254e-05, |
|
"loss": 0.903, |
|
"step": 1830 |
|
}, |
|
{ |
|
"epoch": 6.268146883005977, |
|
"grad_norm": 0.267578125, |
|
"learning_rate": 7.296527463208358e-05, |
|
"loss": 0.8955, |
|
"step": 1835 |
|
}, |
|
{ |
|
"epoch": 6.2852263023057215, |
|
"grad_norm": 0.2578125, |
|
"learning_rate": 7.239030269025311e-05, |
|
"loss": 0.8991, |
|
"step": 1840 |
|
}, |
|
{ |
|
"epoch": 6.302305721605466, |
|
"grad_norm": 0.255859375, |
|
"learning_rate": 7.1816317140067e-05, |
|
"loss": 0.9014, |
|
"step": 1845 |
|
}, |
|
{ |
|
"epoch": 6.319385140905209, |
|
"grad_norm": 0.244140625, |
|
"learning_rate": 7.124333848789091e-05, |
|
"loss": 0.9015, |
|
"step": 1850 |
|
}, |
|
{ |
|
"epoch": 6.336464560204953, |
|
"grad_norm": 0.251953125, |
|
"learning_rate": 7.067138720411795e-05, |
|
"loss": 0.9039, |
|
"step": 1855 |
|
}, |
|
{ |
|
"epoch": 6.353543979504697, |
|
"grad_norm": 0.263671875, |
|
"learning_rate": 7.010048372243698e-05, |
|
"loss": 0.8993, |
|
"step": 1860 |
|
}, |
|
{ |
|
"epoch": 6.37062339880444, |
|
"grad_norm": 0.2734375, |
|
"learning_rate": 6.953064843910296e-05, |
|
"loss": 0.908, |
|
"step": 1865 |
|
}, |
|
{ |
|
"epoch": 6.387702818104184, |
|
"grad_norm": 0.26953125, |
|
"learning_rate": 6.8961901712208e-05, |
|
"loss": 0.9021, |
|
"step": 1870 |
|
}, |
|
{ |
|
"epoch": 6.4047822374039285, |
|
"grad_norm": 0.240234375, |
|
"learning_rate": 6.839426386095425e-05, |
|
"loss": 0.9002, |
|
"step": 1875 |
|
}, |
|
{ |
|
"epoch": 6.421861656703672, |
|
"grad_norm": 0.279296875, |
|
"learning_rate": 6.782775516492771e-05, |
|
"loss": 0.9007, |
|
"step": 1880 |
|
}, |
|
{ |
|
"epoch": 6.438941076003416, |
|
"grad_norm": 0.265625, |
|
"learning_rate": 6.726239586337408e-05, |
|
"loss": 0.8959, |
|
"step": 1885 |
|
}, |
|
{ |
|
"epoch": 6.45602049530316, |
|
"grad_norm": 0.2470703125, |
|
"learning_rate": 6.669820615447522e-05, |
|
"loss": 0.9078, |
|
"step": 1890 |
|
}, |
|
{ |
|
"epoch": 6.473099914602903, |
|
"grad_norm": 0.2490234375, |
|
"learning_rate": 6.613520619462803e-05, |
|
"loss": 0.8996, |
|
"step": 1895 |
|
}, |
|
{ |
|
"epoch": 6.490179333902647, |
|
"grad_norm": 0.251953125, |
|
"learning_rate": 6.5573416097724e-05, |
|
"loss": 0.9023, |
|
"step": 1900 |
|
}, |
|
{ |
|
"epoch": 6.507258753202391, |
|
"grad_norm": 0.259765625, |
|
"learning_rate": 6.50128559344307e-05, |
|
"loss": 0.9004, |
|
"step": 1905 |
|
}, |
|
{ |
|
"epoch": 6.5243381725021345, |
|
"grad_norm": 0.244140625, |
|
"learning_rate": 6.445354573147484e-05, |
|
"loss": 0.9088, |
|
"step": 1910 |
|
}, |
|
{ |
|
"epoch": 6.541417591801879, |
|
"grad_norm": 0.2412109375, |
|
"learning_rate": 6.389550547092661e-05, |
|
"loss": 0.8937, |
|
"step": 1915 |
|
}, |
|
{ |
|
"epoch": 6.558497011101623, |
|
"grad_norm": 0.26171875, |
|
"learning_rate": 6.333875508948593e-05, |
|
"loss": 0.906, |
|
"step": 1920 |
|
}, |
|
{ |
|
"epoch": 6.575576430401366, |
|
"grad_norm": 0.2431640625, |
|
"learning_rate": 6.278331447777021e-05, |
|
"loss": 0.9062, |
|
"step": 1925 |
|
}, |
|
{ |
|
"epoch": 6.59265584970111, |
|
"grad_norm": 0.251953125, |
|
"learning_rate": 6.22292034796035e-05, |
|
"loss": 0.9004, |
|
"step": 1930 |
|
}, |
|
{ |
|
"epoch": 6.609735269000854, |
|
"grad_norm": 0.2490234375, |
|
"learning_rate": 6.167644189130794e-05, |
|
"loss": 0.8995, |
|
"step": 1935 |
|
}, |
|
{ |
|
"epoch": 6.626814688300598, |
|
"grad_norm": 0.2412109375, |
|
"learning_rate": 6.112504946099604e-05, |
|
"loss": 0.9011, |
|
"step": 1940 |
|
}, |
|
{ |
|
"epoch": 6.6438941076003415, |
|
"grad_norm": 0.248046875, |
|
"learning_rate": 6.057504588786556e-05, |
|
"loss": 0.8957, |
|
"step": 1945 |
|
}, |
|
{ |
|
"epoch": 6.660973526900086, |
|
"grad_norm": 0.2578125, |
|
"learning_rate": 6.0026450821495536e-05, |
|
"loss": 0.909, |
|
"step": 1950 |
|
}, |
|
{ |
|
"epoch": 6.678052946199829, |
|
"grad_norm": 0.2373046875, |
|
"learning_rate": 5.947928386114428e-05, |
|
"loss": 0.8996, |
|
"step": 1955 |
|
}, |
|
{ |
|
"epoch": 6.695132365499573, |
|
"grad_norm": 0.34375, |
|
"learning_rate": 5.8933564555049105e-05, |
|
"loss": 0.9072, |
|
"step": 1960 |
|
}, |
|
{ |
|
"epoch": 6.712211784799317, |
|
"grad_norm": 0.2578125, |
|
"learning_rate": 5.838931239972824e-05, |
|
"loss": 0.9022, |
|
"step": 1965 |
|
}, |
|
{ |
|
"epoch": 6.729291204099061, |
|
"grad_norm": 0.251953125, |
|
"learning_rate": 5.784654683928391e-05, |
|
"loss": 0.9009, |
|
"step": 1970 |
|
}, |
|
{ |
|
"epoch": 6.746370623398804, |
|
"grad_norm": 0.244140625, |
|
"learning_rate": 5.730528726470792e-05, |
|
"loss": 0.8999, |
|
"step": 1975 |
|
}, |
|
{ |
|
"epoch": 6.763450042698548, |
|
"grad_norm": 0.2490234375, |
|
"learning_rate": 5.6765553013188766e-05, |
|
"loss": 0.9002, |
|
"step": 1980 |
|
}, |
|
{ |
|
"epoch": 6.780529461998292, |
|
"grad_norm": 0.26171875, |
|
"learning_rate": 5.622736336742087e-05, |
|
"loss": 0.8965, |
|
"step": 1985 |
|
}, |
|
{ |
|
"epoch": 6.797608881298036, |
|
"grad_norm": 0.314453125, |
|
"learning_rate": 5.5690737554915604e-05, |
|
"loss": 0.9015, |
|
"step": 1990 |
|
}, |
|
{ |
|
"epoch": 6.81468830059778, |
|
"grad_norm": 0.244140625, |
|
"learning_rate": 5.5155694747314504e-05, |
|
"loss": 0.9105, |
|
"step": 1995 |
|
}, |
|
{ |
|
"epoch": 6.831767719897524, |
|
"grad_norm": 0.244140625, |
|
"learning_rate": 5.462225405970401e-05, |
|
"loss": 0.8978, |
|
"step": 2000 |
|
}, |
|
{ |
|
"epoch": 6.848847139197267, |
|
"grad_norm": 0.263671875, |
|
"learning_rate": 5.4090434549933064e-05, |
|
"loss": 0.8999, |
|
"step": 2005 |
|
}, |
|
{ |
|
"epoch": 6.865926558497011, |
|
"grad_norm": 0.2734375, |
|
"learning_rate": 5.3560255217931785e-05, |
|
"loss": 0.8988, |
|
"step": 2010 |
|
}, |
|
{ |
|
"epoch": 6.8830059777967545, |
|
"grad_norm": 0.2412109375, |
|
"learning_rate": 5.303173500503289e-05, |
|
"loss": 0.9055, |
|
"step": 2015 |
|
}, |
|
{ |
|
"epoch": 6.900085397096499, |
|
"grad_norm": 0.248046875, |
|
"learning_rate": 5.2504892793295e-05, |
|
"loss": 0.8991, |
|
"step": 2020 |
|
}, |
|
{ |
|
"epoch": 6.917164816396243, |
|
"grad_norm": 0.236328125, |
|
"learning_rate": 5.197974740482785e-05, |
|
"loss": 0.8997, |
|
"step": 2025 |
|
}, |
|
{ |
|
"epoch": 6.934244235695987, |
|
"grad_norm": 0.2392578125, |
|
"learning_rate": 5.145631760112022e-05, |
|
"loss": 0.8983, |
|
"step": 2030 |
|
}, |
|
{ |
|
"epoch": 6.95132365499573, |
|
"grad_norm": 0.232421875, |
|
"learning_rate": 5.093462208236931e-05, |
|
"loss": 0.9038, |
|
"step": 2035 |
|
}, |
|
{ |
|
"epoch": 6.968403074295474, |
|
"grad_norm": 0.2451171875, |
|
"learning_rate": 5.041467948681269e-05, |
|
"loss": 0.8978, |
|
"step": 2040 |
|
}, |
|
{ |
|
"epoch": 6.985482493595217, |
|
"grad_norm": 0.240234375, |
|
"learning_rate": 4.989650839006279e-05, |
|
"loss": 0.9054, |
|
"step": 2045 |
|
}, |
|
{ |
|
"epoch": 6.999146029035013, |
|
"eval_loss": 2.5763192176818848, |
|
"eval_runtime": 0.5559, |
|
"eval_samples_per_second": 17.989, |
|
"eval_steps_per_second": 1.799, |
|
"step": 2049 |
|
}, |
|
{ |
|
"epoch": 7.002561912894961, |
|
"grad_norm": 0.2431640625, |
|
"learning_rate": 4.9380127304442634e-05, |
|
"loss": 0.8953, |
|
"step": 2050 |
|
}, |
|
{ |
|
"epoch": 7.0196413321947055, |
|
"grad_norm": 0.24609375, |
|
"learning_rate": 4.886555467832512e-05, |
|
"loss": 0.893, |
|
"step": 2055 |
|
}, |
|
{ |
|
"epoch": 7.036720751494449, |
|
"grad_norm": 0.2451171875, |
|
"learning_rate": 4.835280889547351e-05, |
|
"loss": 0.8885, |
|
"step": 2060 |
|
}, |
|
{ |
|
"epoch": 7.053800170794193, |
|
"grad_norm": 0.251953125, |
|
"learning_rate": 4.7841908274384616e-05, |
|
"loss": 0.8916, |
|
"step": 2065 |
|
}, |
|
{ |
|
"epoch": 7.070879590093937, |
|
"grad_norm": 0.2421875, |
|
"learning_rate": 4.733287106763481e-05, |
|
"loss": 0.8906, |
|
"step": 2070 |
|
}, |
|
{ |
|
"epoch": 7.08795900939368, |
|
"grad_norm": 0.2451171875, |
|
"learning_rate": 4.6825715461227284e-05, |
|
"loss": 0.8876, |
|
"step": 2075 |
|
}, |
|
{ |
|
"epoch": 7.105038428693424, |
|
"grad_norm": 0.2470703125, |
|
"learning_rate": 4.6320459573942856e-05, |
|
"loss": 0.8908, |
|
"step": 2080 |
|
}, |
|
{ |
|
"epoch": 7.122117847993168, |
|
"grad_norm": 0.240234375, |
|
"learning_rate": 4.581712145669239e-05, |
|
"loss": 0.8887, |
|
"step": 2085 |
|
}, |
|
{ |
|
"epoch": 7.1391972672929125, |
|
"grad_norm": 0.2451171875, |
|
"learning_rate": 4.531571909187197e-05, |
|
"loss": 0.886, |
|
"step": 2090 |
|
}, |
|
{ |
|
"epoch": 7.156276686592656, |
|
"grad_norm": 0.2431640625, |
|
"learning_rate": 4.481627039272056e-05, |
|
"loss": 0.8883, |
|
"step": 2095 |
|
}, |
|
{ |
|
"epoch": 7.1733561058924, |
|
"grad_norm": 0.25390625, |
|
"learning_rate": 4.431879320267972e-05, |
|
"loss": 0.8922, |
|
"step": 2100 |
|
}, |
|
{ |
|
"epoch": 7.190435525192143, |
|
"grad_norm": 0.244140625, |
|
"learning_rate": 4.38233052947565e-05, |
|
"loss": 0.8825, |
|
"step": 2105 |
|
}, |
|
{ |
|
"epoch": 7.207514944491887, |
|
"grad_norm": 0.2353515625, |
|
"learning_rate": 4.332982437088825e-05, |
|
"loss": 0.8856, |
|
"step": 2110 |
|
}, |
|
{ |
|
"epoch": 7.224594363791631, |
|
"grad_norm": 0.248046875, |
|
"learning_rate": 4.2838368061310276e-05, |
|
"loss": 0.8929, |
|
"step": 2115 |
|
}, |
|
{ |
|
"epoch": 7.241673783091375, |
|
"grad_norm": 0.2431640625, |
|
"learning_rate": 4.2348953923925916e-05, |
|
"loss": 0.8977, |
|
"step": 2120 |
|
}, |
|
{ |
|
"epoch": 7.2587532023911185, |
|
"grad_norm": 0.2392578125, |
|
"learning_rate": 4.186159944367936e-05, |
|
"loss": 0.8855, |
|
"step": 2125 |
|
}, |
|
{ |
|
"epoch": 7.275832621690863, |
|
"grad_norm": 0.2421875, |
|
"learning_rate": 4.137632203193086e-05, |
|
"loss": 0.8837, |
|
"step": 2130 |
|
}, |
|
{ |
|
"epoch": 7.292912040990606, |
|
"grad_norm": 0.251953125, |
|
"learning_rate": 4.0893139025834806e-05, |
|
"loss": 0.8927, |
|
"step": 2135 |
|
}, |
|
{ |
|
"epoch": 7.30999146029035, |
|
"grad_norm": 0.25, |
|
"learning_rate": 4.041206768772022e-05, |
|
"loss": 0.8902, |
|
"step": 2140 |
|
}, |
|
{ |
|
"epoch": 7.327070879590094, |
|
"grad_norm": 0.2451171875, |
|
"learning_rate": 3.993312520447414e-05, |
|
"loss": 0.8904, |
|
"step": 2145 |
|
}, |
|
{ |
|
"epoch": 7.344150298889838, |
|
"grad_norm": 0.2451171875, |
|
"learning_rate": 3.9456328686927525e-05, |
|
"loss": 0.8885, |
|
"step": 2150 |
|
}, |
|
{ |
|
"epoch": 7.361229718189581, |
|
"grad_norm": 0.2451171875, |
|
"learning_rate": 3.898169516924398e-05, |
|
"loss": 0.8945, |
|
"step": 2155 |
|
}, |
|
{ |
|
"epoch": 7.3783091374893255, |
|
"grad_norm": 0.240234375, |
|
"learning_rate": 3.850924160831115e-05, |
|
"loss": 0.892, |
|
"step": 2160 |
|
}, |
|
{ |
|
"epoch": 7.395388556789069, |
|
"grad_norm": 0.2451171875, |
|
"learning_rate": 3.803898488313501e-05, |
|
"loss": 0.8933, |
|
"step": 2165 |
|
}, |
|
{ |
|
"epoch": 7.412467976088813, |
|
"grad_norm": 0.244140625, |
|
"learning_rate": 3.757094179423672e-05, |
|
"loss": 0.892, |
|
"step": 2170 |
|
}, |
|
{ |
|
"epoch": 7.429547395388557, |
|
"grad_norm": 0.2392578125, |
|
"learning_rate": 3.710512906305248e-05, |
|
"loss": 0.8905, |
|
"step": 2175 |
|
}, |
|
{ |
|
"epoch": 7.446626814688301, |
|
"grad_norm": 0.2373046875, |
|
"learning_rate": 3.6641563331336125e-05, |
|
"loss": 0.888, |
|
"step": 2180 |
|
}, |
|
{ |
|
"epoch": 7.463706233988044, |
|
"grad_norm": 0.244140625, |
|
"learning_rate": 3.618026116056456e-05, |
|
"loss": 0.8847, |
|
"step": 2185 |
|
}, |
|
{ |
|
"epoch": 7.480785653287788, |
|
"grad_norm": 0.26171875, |
|
"learning_rate": 3.5721239031346066e-05, |
|
"loss": 0.8867, |
|
"step": 2190 |
|
}, |
|
{ |
|
"epoch": 7.497865072587532, |
|
"grad_norm": 0.251953125, |
|
"learning_rate": 3.5264513342831615e-05, |
|
"loss": 0.8894, |
|
"step": 2195 |
|
}, |
|
{ |
|
"epoch": 7.514944491887276, |
|
"grad_norm": 0.2431640625, |
|
"learning_rate": 3.4810100412128747e-05, |
|
"loss": 0.8894, |
|
"step": 2200 |
|
}, |
|
{ |
|
"epoch": 7.53202391118702, |
|
"grad_norm": 0.248046875, |
|
"learning_rate": 3.435801647371897e-05, |
|
"loss": 0.8922, |
|
"step": 2205 |
|
}, |
|
{ |
|
"epoch": 7.549103330486764, |
|
"grad_norm": 0.25, |
|
"learning_rate": 3.3908277678877445e-05, |
|
"loss": 0.8934, |
|
"step": 2210 |
|
}, |
|
{ |
|
"epoch": 7.566182749786507, |
|
"grad_norm": 0.240234375, |
|
"learning_rate": 3.346090009509613e-05, |
|
"loss": 0.8865, |
|
"step": 2215 |
|
}, |
|
{ |
|
"epoch": 7.583262169086251, |
|
"grad_norm": 0.244140625, |
|
"learning_rate": 3.3015899705509734e-05, |
|
"loss": 0.8889, |
|
"step": 2220 |
|
}, |
|
{ |
|
"epoch": 7.600341588385994, |
|
"grad_norm": 0.248046875, |
|
"learning_rate": 3.257329240832454e-05, |
|
"loss": 0.886, |
|
"step": 2225 |
|
}, |
|
{ |
|
"epoch": 7.6174210076857385, |
|
"grad_norm": 0.244140625, |
|
"learning_rate": 3.21330940162508e-05, |
|
"loss": 0.8875, |
|
"step": 2230 |
|
}, |
|
{ |
|
"epoch": 7.634500426985483, |
|
"grad_norm": 0.2392578125, |
|
"learning_rate": 3.169532025593729e-05, |
|
"loss": 0.8863, |
|
"step": 2235 |
|
}, |
|
{ |
|
"epoch": 7.651579846285227, |
|
"grad_norm": 0.2490234375, |
|
"learning_rate": 3.125998676740987e-05, |
|
"loss": 0.8945, |
|
"step": 2240 |
|
}, |
|
{ |
|
"epoch": 7.66865926558497, |
|
"grad_norm": 0.248046875, |
|
"learning_rate": 3.0827109103512643e-05, |
|
"loss": 0.888, |
|
"step": 2245 |
|
}, |
|
{ |
|
"epoch": 7.685738684884714, |
|
"grad_norm": 0.2470703125, |
|
"learning_rate": 3.0396702729352023e-05, |
|
"loss": 0.895, |
|
"step": 2250 |
|
}, |
|
{ |
|
"epoch": 7.702818104184458, |
|
"grad_norm": 0.2470703125, |
|
"learning_rate": 2.996878302174472e-05, |
|
"loss": 0.89, |
|
"step": 2255 |
|
}, |
|
{ |
|
"epoch": 7.719897523484201, |
|
"grad_norm": 0.26953125, |
|
"learning_rate": 2.9543365268667867e-05, |
|
"loss": 0.8868, |
|
"step": 2260 |
|
}, |
|
{ |
|
"epoch": 7.736976942783945, |
|
"grad_norm": 0.26171875, |
|
"learning_rate": 2.9120464668713188e-05, |
|
"loss": 0.8944, |
|
"step": 2265 |
|
}, |
|
{ |
|
"epoch": 7.7540563620836895, |
|
"grad_norm": 0.2392578125, |
|
"learning_rate": 2.8700096330544012e-05, |
|
"loss": 0.8946, |
|
"step": 2270 |
|
}, |
|
{ |
|
"epoch": 7.771135781383433, |
|
"grad_norm": 0.267578125, |
|
"learning_rate": 2.828227527235513e-05, |
|
"loss": 0.8926, |
|
"step": 2275 |
|
}, |
|
{ |
|
"epoch": 7.788215200683177, |
|
"grad_norm": 0.2578125, |
|
"learning_rate": 2.7867016421336776e-05, |
|
"loss": 0.8984, |
|
"step": 2280 |
|
}, |
|
{ |
|
"epoch": 7.805294619982921, |
|
"grad_norm": 0.26171875, |
|
"learning_rate": 2.7454334613140864e-05, |
|
"loss": 0.8874, |
|
"step": 2285 |
|
}, |
|
{ |
|
"epoch": 7.822374039282664, |
|
"grad_norm": 0.2431640625, |
|
"learning_rate": 2.7044244591351232e-05, |
|
"loss": 0.892, |
|
"step": 2290 |
|
}, |
|
{ |
|
"epoch": 7.839453458582408, |
|
"grad_norm": 0.248046875, |
|
"learning_rate": 2.6636761006956955e-05, |
|
"loss": 0.8936, |
|
"step": 2295 |
|
}, |
|
{ |
|
"epoch": 7.856532877882152, |
|
"grad_norm": 0.24609375, |
|
"learning_rate": 2.6231898417828603e-05, |
|
"loss": 0.8856, |
|
"step": 2300 |
|
}, |
|
{ |
|
"epoch": 7.873612297181896, |
|
"grad_norm": 0.25, |
|
"learning_rate": 2.582967128819851e-05, |
|
"loss": 0.8886, |
|
"step": 2305 |
|
}, |
|
{ |
|
"epoch": 7.89069171648164, |
|
"grad_norm": 0.2353515625, |
|
"learning_rate": 2.5430093988143778e-05, |
|
"loss": 0.8891, |
|
"step": 2310 |
|
}, |
|
{ |
|
"epoch": 7.907771135781384, |
|
"grad_norm": 0.2421875, |
|
"learning_rate": 2.5033180793072986e-05, |
|
"loss": 0.8808, |
|
"step": 2315 |
|
}, |
|
{ |
|
"epoch": 7.924850555081127, |
|
"grad_norm": 0.2392578125, |
|
"learning_rate": 2.4638945883216235e-05, |
|
"loss": 0.8868, |
|
"step": 2320 |
|
}, |
|
{ |
|
"epoch": 7.941929974380871, |
|
"grad_norm": 0.2470703125, |
|
"learning_rate": 2.4247403343118335e-05, |
|
"loss": 0.8934, |
|
"step": 2325 |
|
}, |
|
{ |
|
"epoch": 7.959009393680615, |
|
"grad_norm": 0.2392578125, |
|
"learning_rate": 2.385856716113587e-05, |
|
"loss": 0.8878, |
|
"step": 2330 |
|
}, |
|
{ |
|
"epoch": 7.976088812980358, |
|
"grad_norm": 0.2421875, |
|
"learning_rate": 2.3472451228937253e-05, |
|
"loss": 0.8913, |
|
"step": 2335 |
|
}, |
|
{ |
|
"epoch": 7.9931682322801025, |
|
"grad_norm": 0.2373046875, |
|
"learning_rate": 2.3089069341006565e-05, |
|
"loss": 0.8902, |
|
"step": 2340 |
|
}, |
|
{ |
|
"epoch": 8.0, |
|
"eval_loss": 2.592200994491577, |
|
"eval_runtime": 0.5427, |
|
"eval_samples_per_second": 18.425, |
|
"eval_steps_per_second": 1.843, |
|
"step": 2342 |
|
}, |
|
{ |
|
"epoch": 8.010247651579846, |
|
"grad_norm": 0.236328125, |
|
"learning_rate": 2.2708435194150634e-05, |
|
"loss": 0.8945, |
|
"step": 2345 |
|
}, |
|
{ |
|
"epoch": 8.02732707087959, |
|
"grad_norm": 0.2392578125, |
|
"learning_rate": 2.2330562387009745e-05, |
|
"loss": 0.8833, |
|
"step": 2350 |
|
}, |
|
{ |
|
"epoch": 8.044406490179334, |
|
"grad_norm": 0.2412109375, |
|
"learning_rate": 2.1955464419571782e-05, |
|
"loss": 0.8823, |
|
"step": 2355 |
|
}, |
|
{ |
|
"epoch": 8.061485909479078, |
|
"grad_norm": 0.2373046875, |
|
"learning_rate": 2.1583154692689976e-05, |
|
"loss": 0.8874, |
|
"step": 2360 |
|
}, |
|
{ |
|
"epoch": 8.078565328778822, |
|
"grad_norm": 0.2470703125, |
|
"learning_rate": 2.121364650760408e-05, |
|
"loss": 0.8743, |
|
"step": 2365 |
|
}, |
|
{ |
|
"epoch": 8.095644748078564, |
|
"grad_norm": 0.251953125, |
|
"learning_rate": 2.08469530654652e-05, |
|
"loss": 0.8872, |
|
"step": 2370 |
|
}, |
|
{ |
|
"epoch": 8.112724167378309, |
|
"grad_norm": 0.2470703125, |
|
"learning_rate": 2.048308746686417e-05, |
|
"loss": 0.8936, |
|
"step": 2375 |
|
}, |
|
{ |
|
"epoch": 8.129803586678053, |
|
"grad_norm": 0.236328125, |
|
"learning_rate": 2.0122062711363532e-05, |
|
"loss": 0.8818, |
|
"step": 2380 |
|
}, |
|
{ |
|
"epoch": 8.146883005977797, |
|
"grad_norm": 0.25, |
|
"learning_rate": 1.9763891697032978e-05, |
|
"loss": 0.887, |
|
"step": 2385 |
|
}, |
|
{ |
|
"epoch": 8.163962425277541, |
|
"grad_norm": 0.248046875, |
|
"learning_rate": 1.9408587219988805e-05, |
|
"loss": 0.884, |
|
"step": 2390 |
|
}, |
|
{ |
|
"epoch": 8.181041844577285, |
|
"grad_norm": 0.2421875, |
|
"learning_rate": 1.9056161973936513e-05, |
|
"loss": 0.8892, |
|
"step": 2395 |
|
}, |
|
{ |
|
"epoch": 8.198121263877027, |
|
"grad_norm": 0.240234375, |
|
"learning_rate": 1.8706628549717452e-05, |
|
"loss": 0.8883, |
|
"step": 2400 |
|
}, |
|
{ |
|
"epoch": 8.215200683176771, |
|
"grad_norm": 0.26953125, |
|
"learning_rate": 1.835999943485892e-05, |
|
"loss": 0.8802, |
|
"step": 2405 |
|
}, |
|
{ |
|
"epoch": 8.232280102476516, |
|
"grad_norm": 0.2451171875, |
|
"learning_rate": 1.8016287013128018e-05, |
|
"loss": 0.8886, |
|
"step": 2410 |
|
}, |
|
{ |
|
"epoch": 8.24935952177626, |
|
"grad_norm": 0.2392578125, |
|
"learning_rate": 1.767550356408938e-05, |
|
"loss": 0.8784, |
|
"step": 2415 |
|
}, |
|
{ |
|
"epoch": 8.266438941076004, |
|
"grad_norm": 0.24609375, |
|
"learning_rate": 1.7337661262666294e-05, |
|
"loss": 0.8897, |
|
"step": 2420 |
|
}, |
|
{ |
|
"epoch": 8.283518360375748, |
|
"grad_norm": 0.2412109375, |
|
"learning_rate": 1.7002772178705716e-05, |
|
"loss": 0.8844, |
|
"step": 2425 |
|
}, |
|
{ |
|
"epoch": 8.30059777967549, |
|
"grad_norm": 0.2451171875, |
|
"learning_rate": 1.6670848276547334e-05, |
|
"loss": 0.8856, |
|
"step": 2430 |
|
}, |
|
{ |
|
"epoch": 8.317677198975234, |
|
"grad_norm": 0.2421875, |
|
"learning_rate": 1.6341901414595705e-05, |
|
"loss": 0.8762, |
|
"step": 2435 |
|
}, |
|
{ |
|
"epoch": 8.334756618274978, |
|
"grad_norm": 0.2412109375, |
|
"learning_rate": 1.601594334489702e-05, |
|
"loss": 0.8802, |
|
"step": 2440 |
|
}, |
|
{ |
|
"epoch": 8.351836037574722, |
|
"grad_norm": 0.2451171875, |
|
"learning_rate": 1.5692985712719e-05, |
|
"loss": 0.8939, |
|
"step": 2445 |
|
}, |
|
{ |
|
"epoch": 8.368915456874467, |
|
"grad_norm": 0.23828125, |
|
"learning_rate": 1.5373040056134814e-05, |
|
"loss": 0.8804, |
|
"step": 2450 |
|
}, |
|
{ |
|
"epoch": 8.38599487617421, |
|
"grad_norm": 0.240234375, |
|
"learning_rate": 1.5056117805611115e-05, |
|
"loss": 0.8806, |
|
"step": 2455 |
|
}, |
|
{ |
|
"epoch": 8.403074295473953, |
|
"grad_norm": 0.24609375, |
|
"learning_rate": 1.474223028359939e-05, |
|
"loss": 0.8856, |
|
"step": 2460 |
|
}, |
|
{ |
|
"epoch": 8.420153714773697, |
|
"grad_norm": 0.2470703125, |
|
"learning_rate": 1.4431388704131632e-05, |
|
"loss": 0.8791, |
|
"step": 2465 |
|
}, |
|
{ |
|
"epoch": 8.437233134073441, |
|
"grad_norm": 0.236328125, |
|
"learning_rate": 1.4123604172419713e-05, |
|
"loss": 0.8874, |
|
"step": 2470 |
|
}, |
|
{ |
|
"epoch": 8.454312553373185, |
|
"grad_norm": 0.2412109375, |
|
"learning_rate": 1.3818887684458426e-05, |
|
"loss": 0.8827, |
|
"step": 2475 |
|
}, |
|
{ |
|
"epoch": 8.47139197267293, |
|
"grad_norm": 0.24609375, |
|
"learning_rate": 1.3517250126632986e-05, |
|
"loss": 0.8847, |
|
"step": 2480 |
|
}, |
|
{ |
|
"epoch": 8.488471391972674, |
|
"grad_norm": 0.2470703125, |
|
"learning_rate": 1.321870227532971e-05, |
|
"loss": 0.8826, |
|
"step": 2485 |
|
}, |
|
{ |
|
"epoch": 8.505550811272418, |
|
"grad_norm": 0.24609375, |
|
"learning_rate": 1.292325479655131e-05, |
|
"loss": 0.89, |
|
"step": 2490 |
|
}, |
|
{ |
|
"epoch": 8.52263023057216, |
|
"grad_norm": 0.234375, |
|
"learning_rate": 1.263091824553574e-05, |
|
"loss": 0.8904, |
|
"step": 2495 |
|
}, |
|
{ |
|
"epoch": 8.539709649871904, |
|
"grad_norm": 0.2392578125, |
|
"learning_rate": 1.2341703066379074e-05, |
|
"loss": 0.888, |
|
"step": 2500 |
|
}, |
|
{ |
|
"epoch": 8.556789069171648, |
|
"grad_norm": 0.236328125, |
|
"learning_rate": 1.205561959166237e-05, |
|
"loss": 0.8841, |
|
"step": 2505 |
|
}, |
|
{ |
|
"epoch": 8.573868488471392, |
|
"grad_norm": 0.2412109375, |
|
"learning_rate": 1.1772678042082607e-05, |
|
"loss": 0.8856, |
|
"step": 2510 |
|
}, |
|
{ |
|
"epoch": 8.590947907771136, |
|
"grad_norm": 0.2431640625, |
|
"learning_rate": 1.149288852608743e-05, |
|
"loss": 0.8871, |
|
"step": 2515 |
|
}, |
|
{ |
|
"epoch": 8.608027327070879, |
|
"grad_norm": 0.2412109375, |
|
"learning_rate": 1.1216261039514087e-05, |
|
"loss": 0.8817, |
|
"step": 2520 |
|
}, |
|
{ |
|
"epoch": 8.625106746370623, |
|
"grad_norm": 0.244140625, |
|
"learning_rate": 1.094280546523231e-05, |
|
"loss": 0.8825, |
|
"step": 2525 |
|
}, |
|
{ |
|
"epoch": 8.642186165670367, |
|
"grad_norm": 0.23828125, |
|
"learning_rate": 1.0672531572791178e-05, |
|
"loss": 0.8922, |
|
"step": 2530 |
|
}, |
|
{ |
|
"epoch": 8.659265584970111, |
|
"grad_norm": 0.2421875, |
|
"learning_rate": 1.0405449018070168e-05, |
|
"loss": 0.8879, |
|
"step": 2535 |
|
}, |
|
{ |
|
"epoch": 8.676345004269855, |
|
"grad_norm": 0.24609375, |
|
"learning_rate": 1.0141567342934132e-05, |
|
"loss": 0.885, |
|
"step": 2540 |
|
}, |
|
{ |
|
"epoch": 8.6934244235696, |
|
"grad_norm": 0.25390625, |
|
"learning_rate": 9.880895974892412e-06, |
|
"loss": 0.8886, |
|
"step": 2545 |
|
}, |
|
{ |
|
"epoch": 8.710503842869343, |
|
"grad_norm": 0.2431640625, |
|
"learning_rate": 9.623444226762035e-06, |
|
"loss": 0.8805, |
|
"step": 2550 |
|
}, |
|
{ |
|
"epoch": 8.727583262169086, |
|
"grad_norm": 0.23828125, |
|
"learning_rate": 9.369221296335006e-06, |
|
"loss": 0.8866, |
|
"step": 2555 |
|
}, |
|
{ |
|
"epoch": 8.74466268146883, |
|
"grad_norm": 0.2353515625, |
|
"learning_rate": 9.118236266049707e-06, |
|
"loss": 0.8811, |
|
"step": 2560 |
|
}, |
|
{ |
|
"epoch": 8.761742100768574, |
|
"grad_norm": 0.248046875, |
|
"learning_rate": 8.870498102666402e-06, |
|
"loss": 0.8849, |
|
"step": 2565 |
|
}, |
|
{ |
|
"epoch": 8.778821520068318, |
|
"grad_norm": 0.2353515625, |
|
"learning_rate": 8.626015656946895e-06, |
|
"loss": 0.8857, |
|
"step": 2570 |
|
}, |
|
{ |
|
"epoch": 8.795900939368062, |
|
"grad_norm": 0.2373046875, |
|
"learning_rate": 8.384797663338306e-06, |
|
"loss": 0.8833, |
|
"step": 2575 |
|
}, |
|
{ |
|
"epoch": 8.812980358667804, |
|
"grad_norm": 0.2392578125, |
|
"learning_rate": 8.146852739661105e-06, |
|
"loss": 0.885, |
|
"step": 2580 |
|
}, |
|
{ |
|
"epoch": 8.830059777967548, |
|
"grad_norm": 0.23828125, |
|
"learning_rate": 7.91218938680104e-06, |
|
"loss": 0.8861, |
|
"step": 2585 |
|
}, |
|
{ |
|
"epoch": 8.847139197267293, |
|
"grad_norm": 0.24609375, |
|
"learning_rate": 7.6808159884057e-06, |
|
"loss": 0.88, |
|
"step": 2590 |
|
}, |
|
{ |
|
"epoch": 8.864218616567037, |
|
"grad_norm": 0.2451171875, |
|
"learning_rate": 7.45274081058478e-06, |
|
"loss": 0.8794, |
|
"step": 2595 |
|
}, |
|
{ |
|
"epoch": 8.88129803586678, |
|
"grad_norm": 0.236328125, |
|
"learning_rate": 7.2279720016148244e-06, |
|
"loss": 0.8801, |
|
"step": 2600 |
|
}, |
|
{ |
|
"epoch": 8.898377455166525, |
|
"grad_norm": 0.2373046875, |
|
"learning_rate": 7.0065175916482095e-06, |
|
"loss": 0.8818, |
|
"step": 2605 |
|
}, |
|
{ |
|
"epoch": 8.915456874466269, |
|
"grad_norm": 0.23828125, |
|
"learning_rate": 6.788385492426053e-06, |
|
"loss": 0.8856, |
|
"step": 2610 |
|
}, |
|
{ |
|
"epoch": 8.932536293766011, |
|
"grad_norm": 0.2412109375, |
|
"learning_rate": 6.573583496995816e-06, |
|
"loss": 0.8887, |
|
"step": 2615 |
|
}, |
|
{ |
|
"epoch": 8.949615713065755, |
|
"grad_norm": 0.2412109375, |
|
"learning_rate": 6.36211927943271e-06, |
|
"loss": 0.8778, |
|
"step": 2620 |
|
}, |
|
{ |
|
"epoch": 8.9666951323655, |
|
"grad_norm": 0.244140625, |
|
"learning_rate": 6.1540003945655286e-06, |
|
"loss": 0.8906, |
|
"step": 2625 |
|
}, |
|
{ |
|
"epoch": 8.983774551665244, |
|
"grad_norm": 0.2421875, |
|
"learning_rate": 5.949234277706861e-06, |
|
"loss": 0.8818, |
|
"step": 2630 |
|
}, |
|
{ |
|
"epoch": 8.997438087105039, |
|
"eval_loss": 2.5981767177581787, |
|
"eval_runtime": 0.5523, |
|
"eval_samples_per_second": 18.106, |
|
"eval_steps_per_second": 1.811, |
|
"step": 2634 |
|
}, |
|
{ |
|
"epoch": 9.000853970964988, |
|
"grad_norm": 0.2431640625, |
|
"learning_rate": 5.74782824438731e-06, |
|
"loss": 0.8865, |
|
"step": 2635 |
|
}, |
|
{ |
|
"epoch": 9.017933390264732, |
|
"grad_norm": 0.2392578125, |
|
"learning_rate": 5.549789490094304e-06, |
|
"loss": 0.8846, |
|
"step": 2640 |
|
}, |
|
{ |
|
"epoch": 9.035012809564474, |
|
"grad_norm": 0.2373046875, |
|
"learning_rate": 5.355125090014845e-06, |
|
"loss": 0.8845, |
|
"step": 2645 |
|
}, |
|
{ |
|
"epoch": 9.052092228864218, |
|
"grad_norm": 0.466796875, |
|
"learning_rate": 5.163841998782837e-06, |
|
"loss": 0.8852, |
|
"step": 2650 |
|
}, |
|
{ |
|
"epoch": 9.069171648163962, |
|
"grad_norm": 0.23828125, |
|
"learning_rate": 4.975947050230712e-06, |
|
"loss": 0.8831, |
|
"step": 2655 |
|
}, |
|
{ |
|
"epoch": 9.086251067463706, |
|
"grad_norm": 0.234375, |
|
"learning_rate": 4.79144695714504e-06, |
|
"loss": 0.8838, |
|
"step": 2660 |
|
}, |
|
{ |
|
"epoch": 9.10333048676345, |
|
"grad_norm": 0.2392578125, |
|
"learning_rate": 4.610348311026958e-06, |
|
"loss": 0.8892, |
|
"step": 2665 |
|
}, |
|
{ |
|
"epoch": 9.120409906063195, |
|
"grad_norm": 0.2373046875, |
|
"learning_rate": 4.432657581856525e-06, |
|
"loss": 0.882, |
|
"step": 2670 |
|
}, |
|
{ |
|
"epoch": 9.137489325362937, |
|
"grad_norm": 0.2470703125, |
|
"learning_rate": 4.25838111786162e-06, |
|
"loss": 0.8839, |
|
"step": 2675 |
|
}, |
|
{ |
|
"epoch": 9.154568744662681, |
|
"grad_norm": 0.2431640625, |
|
"learning_rate": 4.087525145291204e-06, |
|
"loss": 0.8854, |
|
"step": 2680 |
|
}, |
|
{ |
|
"epoch": 9.171648163962425, |
|
"grad_norm": 0.244140625, |
|
"learning_rate": 3.920095768192722e-06, |
|
"loss": 0.8823, |
|
"step": 2685 |
|
}, |
|
{ |
|
"epoch": 9.18872758326217, |
|
"grad_norm": 0.2353515625, |
|
"learning_rate": 3.7560989681941992e-06, |
|
"loss": 0.883, |
|
"step": 2690 |
|
}, |
|
{ |
|
"epoch": 9.205807002561913, |
|
"grad_norm": 0.2353515625, |
|
"learning_rate": 3.595540604290437e-06, |
|
"loss": 0.8795, |
|
"step": 2695 |
|
}, |
|
{ |
|
"epoch": 9.222886421861658, |
|
"grad_norm": 0.23828125, |
|
"learning_rate": 3.4384264126337328e-06, |
|
"loss": 0.8868, |
|
"step": 2700 |
|
}, |
|
{ |
|
"epoch": 9.2399658411614, |
|
"grad_norm": 0.255859375, |
|
"learning_rate": 3.284762006328945e-06, |
|
"loss": 0.8884, |
|
"step": 2705 |
|
}, |
|
{ |
|
"epoch": 9.257045260461144, |
|
"grad_norm": 0.267578125, |
|
"learning_rate": 3.1345528752329212e-06, |
|
"loss": 0.8819, |
|
"step": 2710 |
|
}, |
|
{ |
|
"epoch": 9.274124679760888, |
|
"grad_norm": 0.23828125, |
|
"learning_rate": 2.9878043857584415e-06, |
|
"loss": 0.8893, |
|
"step": 2715 |
|
}, |
|
{ |
|
"epoch": 9.291204099060632, |
|
"grad_norm": 0.23828125, |
|
"learning_rate": 2.8445217806824077e-06, |
|
"loss": 0.8805, |
|
"step": 2720 |
|
}, |
|
{ |
|
"epoch": 9.308283518360376, |
|
"grad_norm": 0.2373046875, |
|
"learning_rate": 2.704710178958603e-06, |
|
"loss": 0.8796, |
|
"step": 2725 |
|
}, |
|
{ |
|
"epoch": 9.32536293766012, |
|
"grad_norm": 0.2392578125, |
|
"learning_rate": 2.5683745755348044e-06, |
|
"loss": 0.8853, |
|
"step": 2730 |
|
}, |
|
{ |
|
"epoch": 9.342442356959863, |
|
"grad_norm": 0.2373046875, |
|
"learning_rate": 2.435519841174272e-06, |
|
"loss": 0.8844, |
|
"step": 2735 |
|
}, |
|
{ |
|
"epoch": 9.359521776259607, |
|
"grad_norm": 0.255859375, |
|
"learning_rate": 2.30615072228183e-06, |
|
"loss": 0.8838, |
|
"step": 2740 |
|
}, |
|
{ |
|
"epoch": 9.376601195559351, |
|
"grad_norm": 0.24609375, |
|
"learning_rate": 2.180271840734216e-06, |
|
"loss": 0.8895, |
|
"step": 2745 |
|
}, |
|
{ |
|
"epoch": 9.393680614859095, |
|
"grad_norm": 0.24609375, |
|
"learning_rate": 2.057887693714988e-06, |
|
"loss": 0.876, |
|
"step": 2750 |
|
}, |
|
{ |
|
"epoch": 9.410760034158839, |
|
"grad_norm": 0.2373046875, |
|
"learning_rate": 1.9390026535538674e-06, |
|
"loss": 0.8831, |
|
"step": 2755 |
|
}, |
|
{ |
|
"epoch": 9.427839453458583, |
|
"grad_norm": 0.2451171875, |
|
"learning_rate": 1.8236209675705274e-06, |
|
"loss": 0.8851, |
|
"step": 2760 |
|
}, |
|
{ |
|
"epoch": 9.444918872758326, |
|
"grad_norm": 0.236328125, |
|
"learning_rate": 1.7117467579228053e-06, |
|
"loss": 0.876, |
|
"step": 2765 |
|
}, |
|
{ |
|
"epoch": 9.46199829205807, |
|
"grad_norm": 0.2421875, |
|
"learning_rate": 1.6033840214595308e-06, |
|
"loss": 0.879, |
|
"step": 2770 |
|
}, |
|
{ |
|
"epoch": 9.479077711357814, |
|
"grad_norm": 0.240234375, |
|
"learning_rate": 1.4985366295776404e-06, |
|
"loss": 0.8899, |
|
"step": 2775 |
|
}, |
|
{ |
|
"epoch": 9.496157130657558, |
|
"grad_norm": 0.232421875, |
|
"learning_rate": 1.397208328083921e-06, |
|
"loss": 0.8836, |
|
"step": 2780 |
|
}, |
|
{ |
|
"epoch": 9.513236549957302, |
|
"grad_norm": 0.2373046875, |
|
"learning_rate": 1.2994027370611173e-06, |
|
"loss": 0.8905, |
|
"step": 2785 |
|
}, |
|
{ |
|
"epoch": 9.530315969257046, |
|
"grad_norm": 0.2392578125, |
|
"learning_rate": 1.205123350738746e-06, |
|
"loss": 0.8875, |
|
"step": 2790 |
|
}, |
|
{ |
|
"epoch": 9.547395388556788, |
|
"grad_norm": 0.2470703125, |
|
"learning_rate": 1.114373537368063e-06, |
|
"loss": 0.8838, |
|
"step": 2795 |
|
}, |
|
{ |
|
"epoch": 9.564474807856532, |
|
"grad_norm": 0.234375, |
|
"learning_rate": 1.0271565391018922e-06, |
|
"loss": 0.8807, |
|
"step": 2800 |
|
}, |
|
{ |
|
"epoch": 9.581554227156277, |
|
"grad_norm": 0.2470703125, |
|
"learning_rate": 9.434754718787409e-07, |
|
"loss": 0.875, |
|
"step": 2805 |
|
}, |
|
{ |
|
"epoch": 9.59863364645602, |
|
"grad_norm": 0.25390625, |
|
"learning_rate": 8.633333253113995e-07, |
|
"loss": 0.8845, |
|
"step": 2810 |
|
}, |
|
{ |
|
"epoch": 9.615713065755765, |
|
"grad_norm": 0.23828125, |
|
"learning_rate": 7.867329625802833e-07, |
|
"loss": 0.88, |
|
"step": 2815 |
|
}, |
|
{ |
|
"epoch": 9.632792485055509, |
|
"grad_norm": 0.236328125, |
|
"learning_rate": 7.136771203310245e-07, |
|
"loss": 0.8794, |
|
"step": 2820 |
|
}, |
|
{ |
|
"epoch": 9.649871904355251, |
|
"grad_norm": 0.2373046875, |
|
"learning_rate": 6.441684085767396e-07, |
|
"loss": 0.8894, |
|
"step": 2825 |
|
}, |
|
{ |
|
"epoch": 9.666951323654995, |
|
"grad_norm": 0.2431640625, |
|
"learning_rate": 5.782093106048159e-07, |
|
"loss": 0.8803, |
|
"step": 2830 |
|
}, |
|
{ |
|
"epoch": 9.68403074295474, |
|
"grad_norm": 0.251953125, |
|
"learning_rate": 5.158021828881032e-07, |
|
"loss": 0.8844, |
|
"step": 2835 |
|
}, |
|
{ |
|
"epoch": 9.701110162254484, |
|
"grad_norm": 0.2373046875, |
|
"learning_rate": 4.569492550008603e-07, |
|
"loss": 0.8835, |
|
"step": 2840 |
|
}, |
|
{ |
|
"epoch": 9.718189581554228, |
|
"grad_norm": 0.2373046875, |
|
"learning_rate": 4.016526295389622e-07, |
|
"loss": 0.8832, |
|
"step": 2845 |
|
}, |
|
{ |
|
"epoch": 9.735269000853972, |
|
"grad_norm": 0.2421875, |
|
"learning_rate": 3.49914282044872e-07, |
|
"loss": 0.879, |
|
"step": 2850 |
|
}, |
|
{ |
|
"epoch": 9.752348420153714, |
|
"grad_norm": 0.23828125, |
|
"learning_rate": 3.017360609370301e-07, |
|
"loss": 0.8805, |
|
"step": 2855 |
|
}, |
|
{ |
|
"epoch": 9.769427839453458, |
|
"grad_norm": 0.2373046875, |
|
"learning_rate": 2.5711968744382974e-07, |
|
"loss": 0.8853, |
|
"step": 2860 |
|
}, |
|
{ |
|
"epoch": 9.786507258753202, |
|
"grad_norm": 0.2412109375, |
|
"learning_rate": 2.1606675554209922e-07, |
|
"loss": 0.8901, |
|
"step": 2865 |
|
}, |
|
{ |
|
"epoch": 9.803586678052946, |
|
"grad_norm": 0.2392578125, |
|
"learning_rate": 1.7857873190019192e-07, |
|
"loss": 0.8816, |
|
"step": 2870 |
|
}, |
|
{ |
|
"epoch": 9.82066609735269, |
|
"grad_norm": 0.2353515625, |
|
"learning_rate": 1.446569558255395e-07, |
|
"loss": 0.8823, |
|
"step": 2875 |
|
}, |
|
{ |
|
"epoch": 9.837745516652435, |
|
"grad_norm": 0.23828125, |
|
"learning_rate": 1.143026392168789e-07, |
|
"loss": 0.8837, |
|
"step": 2880 |
|
}, |
|
{ |
|
"epoch": 9.854824935952177, |
|
"grad_norm": 0.2333984375, |
|
"learning_rate": 8.751686652084256e-08, |
|
"loss": 0.8835, |
|
"step": 2885 |
|
}, |
|
{ |
|
"epoch": 9.871904355251921, |
|
"grad_norm": 0.2392578125, |
|
"learning_rate": 6.430059469334504e-08, |
|
"loss": 0.8839, |
|
"step": 2890 |
|
}, |
|
{ |
|
"epoch": 9.888983774551665, |
|
"grad_norm": 0.2373046875, |
|
"learning_rate": 4.465465316529915e-08, |
|
"loss": 0.8863, |
|
"step": 2895 |
|
}, |
|
{ |
|
"epoch": 9.90606319385141, |
|
"grad_norm": 0.2412109375, |
|
"learning_rate": 2.8579743813006432e-08, |
|
"loss": 0.8822, |
|
"step": 2900 |
|
}, |
|
{ |
|
"epoch": 9.923142613151153, |
|
"grad_norm": 0.244140625, |
|
"learning_rate": 1.6076440933099345e-08, |
|
"loss": 0.8817, |
|
"step": 2905 |
|
}, |
|
{ |
|
"epoch": 9.940222032450897, |
|
"grad_norm": 0.23828125, |
|
"learning_rate": 7.145191222035497e-09, |
|
"loss": 0.8827, |
|
"step": 2910 |
|
}, |
|
{ |
|
"epoch": 9.95730145175064, |
|
"grad_norm": 0.2412109375, |
|
"learning_rate": 1.7863137600993008e-09, |
|
"loss": 0.881, |
|
"step": 2915 |
|
}, |
|
{ |
|
"epoch": 9.974380871050384, |
|
"grad_norm": 0.2421875, |
|
"learning_rate": 0.0, |
|
"loss": 0.8852, |
|
"step": 2920 |
|
}, |
|
{ |
|
"epoch": 9.974380871050384, |
|
"eval_loss": 2.5989506244659424, |
|
"eval_runtime": 0.5586, |
|
"eval_samples_per_second": 17.903, |
|
"eval_steps_per_second": 1.79, |
|
"step": 2920 |
|
}, |
|
{ |
|
"epoch": 9.974380871050384, |
|
"step": 2920, |
|
"total_flos": 3.4809256003093135e+18, |
|
"train_loss": 0.9919237802289936, |
|
"train_runtime": 34991.5416, |
|
"train_samples_per_second": 8.027, |
|
"train_steps_per_second": 0.083 |
|
} |
|
], |
|
"logging_steps": 5, |
|
"max_steps": 2920, |
|
"num_input_tokens_seen": 0, |
|
"num_train_epochs": 10, |
|
"save_steps": 100, |
|
"stateful_callbacks": { |
|
"TrainerControl": { |
|
"args": { |
|
"should_epoch_stop": false, |
|
"should_evaluate": false, |
|
"should_log": false, |
|
"should_save": true, |
|
"should_training_stop": false |
|
}, |
|
"attributes": {} |
|
} |
|
}, |
|
"total_flos": 3.4809256003093135e+18, |
|
"train_batch_size": 8, |
|
"trial_name": null, |
|
"trial_params": null |
|
} |
|
|