{
  "best_metric": null,
  "best_model_checkpoint": null,
  "epoch": 9.974380871050384,
  "eval_steps": 500,
  "global_step": 2920,
  "is_hyper_param_search": false,
  "is_local_process_zero": true,
  "is_world_process_zero": true,
  "log_history": [
    {
      "epoch": 0.0034158838599487617,
      "grad_norm": 3.8125,
      "learning_rate": 6.849315068493151e-07,
      "loss": 3.0658,
      "step": 1
    },
    {
      "epoch": 0.017079419299743808,
      "grad_norm": 3.921875,
      "learning_rate": 3.4246575342465754e-06,
      "loss": 3.0727,
      "step": 5
    },
    {
      "epoch": 0.034158838599487616,
      "grad_norm": 4.59375,
      "learning_rate": 6.849315068493151e-06,
      "loss": 3.0381,
      "step": 10
    },
    {
      "epoch": 0.05123825789923143,
      "grad_norm": 3.375,
      "learning_rate": 1.0273972602739726e-05,
      "loss": 2.9796,
      "step": 15
    },
    {
      "epoch": 0.06831767719897523,
      "grad_norm": 2.546875,
      "learning_rate": 1.3698630136986302e-05,
      "loss": 2.8478,
      "step": 20
    },
    {
      "epoch": 0.08539709649871904,
      "grad_norm": 2.375,
      "learning_rate": 1.7123287671232875e-05,
      "loss": 2.7142,
      "step": 25
    },
    {
      "epoch": 0.10247651579846286,
      "grad_norm": 1.4375,
      "learning_rate": 2.0547945205479453e-05,
      "loss": 2.5273,
      "step": 30
    },
    {
      "epoch": 0.11955593509820667,
      "grad_norm": 5.34375,
      "learning_rate": 2.3972602739726026e-05,
      "loss": 2.3905,
      "step": 35
    },
    {
      "epoch": 0.13663535439795046,
      "grad_norm": 1.53125,
      "learning_rate": 2.7397260273972603e-05,
      "loss": 2.2615,
      "step": 40
    },
    {
      "epoch": 0.1537147736976943,
      "grad_norm": 21.875,
      "learning_rate": 3.082191780821918e-05,
      "loss": 2.1359,
      "step": 45
    },
    {
      "epoch": 0.1707941929974381,
      "grad_norm": 15.6875,
      "learning_rate": 3.424657534246575e-05,
      "loss": 2.0159,
      "step": 50
    },
    {
      "epoch": 0.18787361229718189,
      "grad_norm": 0.921875,
      "learning_rate": 3.767123287671233e-05,
      "loss": 1.8994,
      "step": 55
    },
    {
      "epoch": 0.2049530315969257,
      "grad_norm": 0.8203125,
      "learning_rate": 4.1095890410958905e-05,
      "loss": 1.7873,
      "step": 60
    },
    {
      "epoch": 0.2220324508966695,
      "grad_norm": 0.8671875,
      "learning_rate": 4.452054794520548e-05,
      "loss": 1.6691,
      "step": 65
    },
    {
      "epoch": 0.23911187019641333,
      "grad_norm": 0.7109375,
      "learning_rate": 4.794520547945205e-05,
      "loss": 1.5889,
      "step": 70
    },
    {
      "epoch": 0.2561912894961571,
      "grad_norm": 0.8984375,
      "learning_rate": 5.136986301369864e-05,
      "loss": 1.5329,
      "step": 75
    },
    {
      "epoch": 0.27327070879590093,
      "grad_norm": 0.5390625,
      "learning_rate": 5.479452054794521e-05,
      "loss": 1.4749,
      "step": 80
    },
    {
      "epoch": 0.29035012809564475,
      "grad_norm": 0.31640625,
      "learning_rate": 5.821917808219178e-05,
      "loss": 1.438,
      "step": 85
    },
    {
      "epoch": 0.3074295473953886,
      "grad_norm": 0.404296875,
      "learning_rate": 6.164383561643835e-05,
      "loss": 1.395,
      "step": 90
    },
    {
      "epoch": 0.32450896669513235,
      "grad_norm": 0.2578125,
      "learning_rate": 6.506849315068494e-05,
      "loss": 1.3653,
      "step": 95
    },
    {
      "epoch": 0.3415883859948762,
      "grad_norm": 0.46875,
      "learning_rate": 6.84931506849315e-05,
      "loss": 1.3329,
      "step": 100
    },
    {
      "epoch": 0.35866780529462,
      "grad_norm": 0.5078125,
      "learning_rate": 7.191780821917809e-05,
      "loss": 1.3221,
      "step": 105
    },
    {
      "epoch": 0.37574722459436377,
      "grad_norm": 0.6015625,
      "learning_rate": 7.534246575342466e-05,
      "loss": 1.3048,
      "step": 110
    },
    {
      "epoch": 0.3928266438941076,
      "grad_norm": 0.69140625,
      "learning_rate": 7.876712328767124e-05,
      "loss": 1.2884,
      "step": 115
    },
    {
      "epoch": 0.4099060631938514,
      "grad_norm": 0.408203125,
      "learning_rate": 8.219178082191781e-05,
      "loss": 1.2687,
      "step": 120
    },
    {
      "epoch": 0.4269854824935952,
      "grad_norm": 0.388671875,
      "learning_rate": 8.561643835616438e-05,
      "loss": 1.2502,
      "step": 125
    },
    {
      "epoch": 0.444064901793339,
      "grad_norm": 0.5859375,
      "learning_rate": 8.904109589041096e-05,
      "loss": 1.2416,
      "step": 130
    },
    {
      "epoch": 0.46114432109308284,
      "grad_norm": 0.3828125,
      "learning_rate": 9.246575342465755e-05,
      "loss": 1.2345,
      "step": 135
    },
    {
      "epoch": 0.47822374039282667,
      "grad_norm": 0.7890625,
      "learning_rate": 9.58904109589041e-05,
      "loss": 1.2381,
      "step": 140
    },
    {
      "epoch": 0.49530315969257044,
      "grad_norm": 0.5625,
      "learning_rate": 9.931506849315069e-05,
      "loss": 1.2236,
      "step": 145
    },
    {
      "epoch": 0.5123825789923142,
      "grad_norm": 0.5703125,
      "learning_rate": 0.00010273972602739728,
      "loss": 1.2102,
      "step": 150
    },
    {
      "epoch": 0.5294619982920581,
      "grad_norm": 1.015625,
      "learning_rate": 0.00010616438356164384,
      "loss": 1.203,
      "step": 155
    },
    {
      "epoch": 0.5465414175918019,
      "grad_norm": 0.33984375,
      "learning_rate": 0.00010958904109589041,
      "loss": 1.2011,
      "step": 160
    },
    {
      "epoch": 0.5636208368915457,
      "grad_norm": 0.267578125,
      "learning_rate": 0.000113013698630137,
      "loss": 1.193,
      "step": 165
    },
    {
      "epoch": 0.5807002561912895,
      "grad_norm": 0.6171875,
      "learning_rate": 0.00011643835616438356,
      "loss": 1.1933,
      "step": 170
    },
    {
      "epoch": 0.5977796754910333,
      "grad_norm": 0.41015625,
      "learning_rate": 0.00011986301369863014,
      "loss": 1.1774,
      "step": 175
    },
    {
      "epoch": 0.6148590947907772,
      "grad_norm": 0.734375,
      "learning_rate": 0.0001232876712328767,
      "loss": 1.177,
      "step": 180
    },
    {
      "epoch": 0.6319385140905209,
      "grad_norm": 0.9609375,
      "learning_rate": 0.0001267123287671233,
      "loss": 1.1705,
      "step": 185
    },
    {
      "epoch": 0.6490179333902647,
      "grad_norm": 0.44140625,
      "learning_rate": 0.00013013698630136988,
      "loss": 1.1612,
      "step": 190
    },
    {
      "epoch": 0.6660973526900086,
      "grad_norm": 0.44140625,
      "learning_rate": 0.00013356164383561644,
      "loss": 1.167,
      "step": 195
    },
    {
      "epoch": 0.6831767719897524,
      "grad_norm": 0.353515625,
      "learning_rate": 0.000136986301369863,
      "loss": 1.1616,
      "step": 200
    },
    {
      "epoch": 0.7002561912894961,
      "grad_norm": 1.3203125,
      "learning_rate": 0.0001404109589041096,
      "loss": 1.1553,
      "step": 205
    },
    {
      "epoch": 0.71733561058924,
      "grad_norm": 0.322265625,
      "learning_rate": 0.00014383561643835618,
      "loss": 1.1475,
      "step": 210
    },
    {
      "epoch": 0.7344150298889838,
      "grad_norm": 0.51953125,
      "learning_rate": 0.00014726027397260274,
      "loss": 1.1482,
      "step": 215
    },
    {
      "epoch": 0.7514944491887275,
      "grad_norm": 0.671875,
      "learning_rate": 0.00015068493150684933,
      "loss": 1.1427,
      "step": 220
    },
    {
      "epoch": 0.7685738684884714,
      "grad_norm": 1.1171875,
      "learning_rate": 0.00015410958904109589,
      "loss": 1.1441,
      "step": 225
    },
    {
      "epoch": 0.7856532877882152,
      "grad_norm": 0.6640625,
      "learning_rate": 0.00015753424657534247,
      "loss": 1.1336,
      "step": 230
    },
    {
      "epoch": 0.802732707087959,
      "grad_norm": 0.70703125,
      "learning_rate": 0.00016095890410958906,
      "loss": 1.1315,
      "step": 235
    },
    {
      "epoch": 0.8198121263877028,
      "grad_norm": 1.0859375,
      "learning_rate": 0.00016438356164383562,
      "loss": 1.1316,
      "step": 240
    },
    {
      "epoch": 0.8368915456874466,
      "grad_norm": 1.8671875,
      "learning_rate": 0.0001678082191780822,
      "loss": 1.1381,
      "step": 245
    },
    {
      "epoch": 0.8539709649871904,
      "grad_norm": 1.0703125,
      "learning_rate": 0.00017123287671232877,
      "loss": 1.1376,
      "step": 250
    },
    {
      "epoch": 0.8710503842869343,
      "grad_norm": 1.328125,
      "learning_rate": 0.00017465753424657536,
      "loss": 1.1309,
      "step": 255
    },
    {
      "epoch": 0.888129803586678,
      "grad_norm": 1.375,
      "learning_rate": 0.00017808219178082192,
      "loss": 1.1282,
      "step": 260
    },
    {
      "epoch": 0.9052092228864219,
      "grad_norm": 5.78125,
      "learning_rate": 0.0001815068493150685,
      "loss": 1.1414,
      "step": 265
    },
    {
      "epoch": 0.9222886421861657,
      "grad_norm": 0.47265625,
      "learning_rate": 0.0001849315068493151,
      "loss": 1.1403,
      "step": 270
    },
    {
      "epoch": 0.9393680614859095,
      "grad_norm": 0.326171875,
      "learning_rate": 0.00018835616438356165,
      "loss": 1.1352,
      "step": 275
    },
    {
      "epoch": 0.9564474807856533,
      "grad_norm": 0.31640625,
      "learning_rate": 0.0001917808219178082,
      "loss": 1.1201,
      "step": 280
    },
    {
      "epoch": 0.9735269000853971,
      "grad_norm": 0.3359375,
      "learning_rate": 0.0001952054794520548,
      "loss": 1.1145,
      "step": 285
    },
    {
      "epoch": 0.9906063193851409,
      "grad_norm": 0.31640625,
      "learning_rate": 0.00019863013698630139,
      "loss": 1.1174,
      "step": 290
    },
    {
      "epoch": 0.9974380871050385,
      "eval_loss": 2.4481546878814697,
      "eval_runtime": 0.5643,
      "eval_samples_per_second": 17.72,
      "eval_steps_per_second": 1.772,
      "step": 292
    },
    {
      "epoch": 1.0076857386848848,
      "grad_norm": 0.41796875,
      "learning_rate": 0.00019999935692582106,
      "loss": 1.1057,
      "step": 295
    },
    {
      "epoch": 1.0247651579846284,
      "grad_norm": 0.314453125,
      "learning_rate": 0.00019999542705801296,
      "loss": 1.0972,
      "step": 300
    },
    {
      "epoch": 1.0418445772843723,
      "grad_norm": 0.412109375,
      "learning_rate": 0.00019998792472605885,
      "loss": 1.1012,
      "step": 305
    },
    {
      "epoch": 1.0589239965841162,
      "grad_norm": 0.32421875,
      "learning_rate": 0.00019997685019798912,
      "loss": 1.0859,
      "step": 310
    },
    {
      "epoch": 1.0760034158838598,
      "grad_norm": 0.578125,
      "learning_rate": 0.00019996220386945537,
      "loss": 1.0973,
      "step": 315
    },
    {
      "epoch": 1.0930828351836037,
      "grad_norm": 0.53515625,
      "learning_rate": 0.00019994398626371643,
      "loss": 1.0961,
      "step": 320
    },
    {
      "epoch": 1.1101622544833476,
      "grad_norm": 0.271484375,
      "learning_rate": 0.0001999221980316194,
      "loss": 1.0901,
      "step": 325
    },
    {
      "epoch": 1.1272416737830913,
      "grad_norm": 0.546875,
      "learning_rate": 0.00019989683995157677,
      "loss": 1.0761,
      "step": 330
    },
    {
      "epoch": 1.1443210930828351,
      "grad_norm": 0.54296875,
      "learning_rate": 0.0001998679129295382,
      "loss": 1.082,
      "step": 335
    },
    {
      "epoch": 1.161400512382579,
      "grad_norm": 0.44921875,
      "learning_rate": 0.0001998354179989585,
      "loss": 1.0788,
      "step": 340
    },
    {
      "epoch": 1.1784799316823227,
      "grad_norm": 0.263671875,
      "learning_rate": 0.00019979935632076048,
      "loss": 1.0745,
      "step": 345
    },
    {
      "epoch": 1.1955593509820666,
      "grad_norm": 0.302734375,
      "learning_rate": 0.00019975972918329356,
      "loss": 1.0775,
      "step": 350
    },
    {
      "epoch": 1.2126387702818104,
      "grad_norm": 0.28515625,
      "learning_rate": 0.0001997165380022878,
      "loss": 1.0761,
      "step": 355
    },
    {
      "epoch": 1.2297181895815543,
      "grad_norm": 0.3046875,
      "learning_rate": 0.00019966978432080316,
      "loss": 1.0789,
      "step": 360
    },
    {
      "epoch": 1.246797608881298,
      "grad_norm": 0.2451171875,
      "learning_rate": 0.00019961946980917456,
      "loss": 1.0762,
      "step": 365
    },
    {
      "epoch": 1.2638770281810419,
      "grad_norm": 0.46875,
      "learning_rate": 0.00019956559626495212,
      "loss": 1.0748,
      "step": 370
    },
    {
      "epoch": 1.2809564474807855,
      "grad_norm": 0.2734375,
      "learning_rate": 0.00019950816561283685,
      "loss": 1.0756,
      "step": 375
    },
    {
      "epoch": 1.2980358667805294,
      "grad_norm": 0.36328125,
      "learning_rate": 0.00019944717990461207,
      "loss": 1.0694,
      "step": 380
    },
    {
      "epoch": 1.3151152860802733,
      "grad_norm": 0.259765625,
      "learning_rate": 0.00019938264131907,
      "loss": 1.0654,
      "step": 385
    },
    {
      "epoch": 1.332194705380017,
      "grad_norm": 0.375,
      "learning_rate": 0.00019931455216193382,
      "loss": 1.0645,
      "step": 390
    },
    {
      "epoch": 1.3492741246797608,
      "grad_norm": 0.64453125,
      "learning_rate": 0.00019924291486577559,
      "loss": 1.0613,
      "step": 395
    },
    {
      "epoch": 1.3663535439795047,
      "grad_norm": 0.392578125,
      "learning_rate": 0.000199167731989929,
      "loss": 1.0689,
      "step": 400
    },
    {
      "epoch": 1.3834329632792486,
      "grad_norm": 0.39453125,
      "learning_rate": 0.00019908900622039822,
      "loss": 1.065,
      "step": 405
    },
    {
      "epoch": 1.4005123825789922,
      "grad_norm": 0.33203125,
      "learning_rate": 0.00019900674036976173,
      "loss": 1.0668,
      "step": 410
    },
    {
      "epoch": 1.4175918018787361,
      "grad_norm": 0.32421875,
      "learning_rate": 0.0001989209373770719,
      "loss": 1.0628,
      "step": 415
    },
    {
      "epoch": 1.43467122117848,
      "grad_norm": 0.421875,
      "learning_rate": 0.00019883160030775016,
      "loss": 1.0617,
      "step": 420
    },
    {
      "epoch": 1.4517506404782237,
      "grad_norm": 0.294921875,
      "learning_rate": 0.00019873873235347719,
      "loss": 1.0598,
      "step": 425
    },
    {
      "epoch": 1.4688300597779675,
      "grad_norm": 0.3046875,
      "learning_rate": 0.00019864233683207906,
      "loss": 1.0536,
      "step": 430
    },
    {
      "epoch": 1.4859094790777114,
      "grad_norm": 0.251953125,
      "learning_rate": 0.0001985424171874087,
      "loss": 1.0565,
      "step": 435
    },
    {
      "epoch": 1.5029888983774553,
      "grad_norm": 0.333984375,
      "learning_rate": 0.00019843897698922284,
      "loss": 1.0613,
      "step": 440
    },
    {
      "epoch": 1.520068317677199,
      "grad_norm": 0.32421875,
      "learning_rate": 0.0001983320199330545,
      "loss": 1.0517,
      "step": 445
    },
    {
      "epoch": 1.5371477369769426,
      "grad_norm": 0.404296875,
      "learning_rate": 0.00019822154984008088,
      "loss": 1.0589,
      "step": 450
    },
    {
      "epoch": 1.5542271562766867,
      "grad_norm": 0.353515625,
      "learning_rate": 0.00019810757065698688,
      "loss": 1.0517,
      "step": 455
    },
    {
      "epoch": 1.5713065755764304,
      "grad_norm": 0.373046875,
      "learning_rate": 0.0001979900864558242,
      "loss": 1.0547,
      "step": 460
    },
    {
      "epoch": 1.588385994876174,
      "grad_norm": 0.408203125,
      "learning_rate": 0.0001978691014338658,
      "loss": 1.0537,
      "step": 465
    },
    {
      "epoch": 1.6054654141759181,
      "grad_norm": 0.48046875,
      "learning_rate": 0.00019774461991345577,
      "loss": 1.0459,
      "step": 470
    },
    {
      "epoch": 1.6225448334756618,
      "grad_norm": 0.73046875,
      "learning_rate": 0.0001976166463418552,
      "loss": 1.0477,
      "step": 475
    },
    {
      "epoch": 1.6396242527754057,
      "grad_norm": 0.6796875,
      "learning_rate": 0.00019748518529108316,
      "loss": 1.0472,
      "step": 480
    },
    {
      "epoch": 1.6567036720751496,
      "grad_norm": 0.58984375,
      "learning_rate": 0.0001973502414577533,
      "loss": 1.0521,
      "step": 485
    },
    {
      "epoch": 1.6737830913748932,
      "grad_norm": 0.375,
      "learning_rate": 0.00019721181966290613,
      "loss": 1.0394,
      "step": 490
    },
    {
      "epoch": 1.690862510674637,
      "grad_norm": 0.474609375,
      "learning_rate": 0.00019706992485183684,
      "loss": 1.0328,
      "step": 495
    },
    {
      "epoch": 1.707941929974381,
      "grad_norm": 0.302734375,
      "learning_rate": 0.00019692456209391846,
      "loss": 1.0382,
      "step": 500
    },
    {
      "epoch": 1.7250213492741246,
      "grad_norm": 0.294921875,
      "learning_rate": 0.00019677573658242087,
      "loss": 1.0418,
      "step": 505
    },
    {
      "epoch": 1.7421007685738685,
      "grad_norm": 0.3984375,
      "learning_rate": 0.0001966234536343253,
      "loss": 1.0416,
      "step": 510
    },
    {
      "epoch": 1.7591801878736124,
      "grad_norm": 0.294921875,
      "learning_rate": 0.0001964677186901342,
      "loss": 1.0399,
      "step": 515
    },
    {
      "epoch": 1.776259607173356,
      "grad_norm": 0.30078125,
      "learning_rate": 0.00019630853731367713,
      "loss": 1.0404,
      "step": 520
    },
    {
      "epoch": 1.7933390264731,
      "grad_norm": 0.365234375,
      "learning_rate": 0.00019614591519191165,
      "loss": 1.0349,
      "step": 525
    },
    {
      "epoch": 1.8104184457728438,
      "grad_norm": 0.39453125,
      "learning_rate": 0.00019597985813472052,
      "loss": 1.0303,
      "step": 530
    },
    {
      "epoch": 1.8274978650725875,
      "grad_norm": 0.353515625,
      "learning_rate": 0.00019581037207470382,
      "loss": 1.0318,
      "step": 535
    },
    {
      "epoch": 1.8445772843723314,
      "grad_norm": 0.3984375,
      "learning_rate": 0.0001956374630669672,
      "loss": 1.0386,
      "step": 540
    },
    {
      "epoch": 1.8616567036720753,
      "grad_norm": 0.359375,
      "learning_rate": 0.00019546113728890541,
      "loss": 1.0252,
      "step": 545
    },
    {
      "epoch": 1.878736122971819,
      "grad_norm": 0.3046875,
      "learning_rate": 0.00019528140103998177,
      "loss": 1.0329,
      "step": 550
    },
    {
      "epoch": 1.8958155422715628,
      "grad_norm": 0.265625,
      "learning_rate": 0.00019509826074150298,
      "loss": 1.0385,
      "step": 555
    },
    {
      "epoch": 1.9128949615713067,
      "grad_norm": 0.2734375,
      "learning_rate": 0.00019491172293638968,
      "loss": 1.0322,
      "step": 560
    },
    {
      "epoch": 1.9299743808710503,
      "grad_norm": 0.25,
      "learning_rate": 0.00019472179428894288,
      "loss": 1.0296,
      "step": 565
    },
    {
      "epoch": 1.9470538001707942,
      "grad_norm": 0.28515625,
      "learning_rate": 0.0001945284815846057,
      "loss": 1.0434,
      "step": 570
    },
    {
      "epoch": 1.964133219470538,
      "grad_norm": 0.2890625,
      "learning_rate": 0.00019433179172972102,
      "loss": 1.0228,
      "step": 575
    },
    {
      "epoch": 1.9812126387702818,
      "grad_norm": 0.25390625,
      "learning_rate": 0.00019413173175128473,
      "loss": 1.0274,
      "step": 580
    },
    {
      "epoch": 1.9982920580700256,
      "grad_norm": 0.4765625,
      "learning_rate": 0.00019392830879669463,
      "loss": 1.0252,
      "step": 585
    },
    {
      "epoch": 1.9982920580700256,
      "eval_loss": 2.451388120651245,
      "eval_runtime": 0.5458,
      "eval_samples_per_second": 18.323,
      "eval_steps_per_second": 1.832,
      "step": 585
    },
    {
      "epoch": 2.0153714773697695,
      "grad_norm": 0.474609375,
      "learning_rate": 0.00019372153013349523,
      "loss": 1.0051,
      "step": 590
    },
    {
      "epoch": 2.032450896669513,
      "grad_norm": 0.365234375,
      "learning_rate": 0.00019351140314911795,
      "loss": 1.0105,
      "step": 595
    },
    {
      "epoch": 2.049530315969257,
      "grad_norm": 0.416015625,
      "learning_rate": 0.00019329793535061723,
      "loss": 1.0135,
      "step": 600
    },
    {
      "epoch": 2.066609735269001,
      "grad_norm": 0.2890625,
      "learning_rate": 0.00019308113436440242,
      "loss": 1.0062,
      "step": 605
    },
    {
      "epoch": 2.0836891545687446,
      "grad_norm": 0.48828125,
      "learning_rate": 0.0001928610079359652,
      "loss": 1.0019,
      "step": 610
    },
    {
      "epoch": 2.1007685738684883,
      "grad_norm": 0.330078125,
      "learning_rate": 0.00019263756392960294,
      "loss": 1.0048,
      "step": 615
    },
    {
      "epoch": 2.1178479931682324,
      "grad_norm": 0.271484375,
      "learning_rate": 0.00019241081032813772,
      "loss": 1.0094,
      "step": 620
    },
    {
      "epoch": 2.134927412467976,
      "grad_norm": 0.265625,
      "learning_rate": 0.00019218075523263104,
      "loss": 1.0014,
      "step": 625
    },
    {
      "epoch": 2.1520068317677197,
      "grad_norm": 0.333984375,
      "learning_rate": 0.00019194740686209464,
      "loss": 1.0085,
      "step": 630
    },
    {
      "epoch": 2.1690862510674638,
      "grad_norm": 0.40625,
      "learning_rate": 0.0001917107735531966,
      "loss": 1.0014,
      "step": 635
    },
    {
      "epoch": 2.1861656703672074,
      "grad_norm": 0.455078125,
      "learning_rate": 0.0001914708637599636,
      "loss": 1.0056,
      "step": 640
    },
    {
      "epoch": 2.203245089666951,
      "grad_norm": 0.46875,
      "learning_rate": 0.00019122768605347892,
      "loss": 0.998,
      "step": 645
    },
    {
      "epoch": 2.220324508966695,
      "grad_norm": 0.265625,
      "learning_rate": 0.00019098124912157632,
      "loss": 1.0007,
      "step": 650
    },
    {
      "epoch": 2.237403928266439,
      "grad_norm": 0.244140625,
      "learning_rate": 0.00019073156176852935,
      "loss": 1.0046,
      "step": 655
    },
    {
      "epoch": 2.2544833475661825,
      "grad_norm": 0.30859375,
      "learning_rate": 0.00019047863291473717,
      "loss": 1.0084,
      "step": 660
    },
    {
      "epoch": 2.2715627668659266,
      "grad_norm": 0.333984375,
      "learning_rate": 0.00019022247159640557,
      "loss": 1.0006,
      "step": 665
    },
    {
      "epoch": 2.2886421861656703,
      "grad_norm": 0.49609375,
      "learning_rate": 0.00018996308696522433,
      "loss": 1.0057,
      "step": 670
    },
    {
      "epoch": 2.305721605465414,
      "grad_norm": 0.396484375,
      "learning_rate": 0.00018970048828804016,
      "loss": 1.0019,
      "step": 675
    },
    {
      "epoch": 2.322801024765158,
      "grad_norm": 0.443359375,
      "learning_rate": 0.0001894346849465257,
      "loss": 1.0054,
      "step": 680
    },
    {
      "epoch": 2.3398804440649017,
      "grad_norm": 0.390625,
      "learning_rate": 0.0001891656864368442,
      "loss": 1.0021,
      "step": 685
    },
    {
      "epoch": 2.3569598633646454,
      "grad_norm": 0.28515625,
      "learning_rate": 0.00018889350236931055,
      "loss": 0.9956,
      "step": 690
    },
    {
      "epoch": 2.3740392826643895,
      "grad_norm": 0.412109375,
      "learning_rate": 0.00018861814246804755,
      "loss": 1.0063,
      "step": 695
    },
    {
      "epoch": 2.391118701964133,
      "grad_norm": 0.27734375,
      "learning_rate": 0.00018833961657063885,
      "loss": 1.0013,
      "step": 700
    },
    {
      "epoch": 2.408198121263877,
      "grad_norm": 0.255859375,
      "learning_rate": 0.00018805793462777734,
      "loss": 0.9951,
      "step": 705
    },
    {
      "epoch": 2.425277540563621,
      "grad_norm": 0.2490234375,
      "learning_rate": 0.0001877731067029096,
      "loss": 1.0019,
      "step": 710
    },
    {
      "epoch": 2.4423569598633645,
      "grad_norm": 0.306640625,
      "learning_rate": 0.00018748514297187648,
      "loss": 0.995,
      "step": 715
    },
    {
      "epoch": 2.4594363791631086,
      "grad_norm": 0.31640625,
      "learning_rate": 0.00018719405372254948,
      "loss": 1.002,
      "step": 720
    },
    {
      "epoch": 2.4765157984628523,
      "grad_norm": 0.392578125,
      "learning_rate": 0.00018689984935446317,
      "loss": 0.9942,
      "step": 725
    },
    {
      "epoch": 2.493595217762596,
      "grad_norm": 0.34765625,
      "learning_rate": 0.00018660254037844388,
      "loss": 1.0012,
      "step": 730
    },
    {
      "epoch": 2.5106746370623396,
      "grad_norm": 0.3203125,
      "learning_rate": 0.00018630213741623383,
      "loss": 1.002,
      "step": 735
    },
    {
      "epoch": 2.5277540563620837,
      "grad_norm": 0.263671875,
      "learning_rate": 0.00018599865120011192,
      "loss": 0.9975,
      "step": 740
    },
    {
      "epoch": 2.5448334756618274,
      "grad_norm": 0.373046875,
      "learning_rate": 0.00018569209257251026,
      "loss": 0.9996,
      "step": 745
    },
    {
      "epoch": 2.561912894961571,
      "grad_norm": 0.306640625,
      "learning_rate": 0.00018538247248562674,
      "loss": 1.0001,
      "step": 750
    },
    {
      "epoch": 2.578992314261315,
      "grad_norm": 0.345703125,
      "learning_rate": 0.00018506980200103375,
      "loss": 0.9954,
      "step": 755
    },
    {
      "epoch": 2.596071733561059,
      "grad_norm": 0.265625,
      "learning_rate": 0.00018475409228928312,
      "loss": 0.9945,
      "step": 760
    },
    {
      "epoch": 2.6131511528608025,
      "grad_norm": 0.267578125,
      "learning_rate": 0.00018443535462950688,
      "loss": 0.9918,
      "step": 765
    },
    {
      "epoch": 2.6302305721605466,
      "grad_norm": 0.375,
      "learning_rate": 0.0001841136004090144,
      "loss": 1.0007,
      "step": 770
    },
    {
      "epoch": 2.6473099914602902,
      "grad_norm": 0.251953125,
      "learning_rate": 0.00018378884112288542,
      "loss": 1.0026,
      "step": 775
    },
    {
      "epoch": 2.664389410760034,
      "grad_norm": 0.26953125,
      "learning_rate": 0.00018346108837355972,
      "loss": 0.995,
      "step": 780
    },
    {
      "epoch": 2.681468830059778,
      "grad_norm": 0.37109375,
      "learning_rate": 0.0001831303538704221,
      "loss": 0.9916,
      "step": 785
    },
    {
      "epoch": 2.6985482493595216,
      "grad_norm": 0.275390625,
      "learning_rate": 0.00018279664942938447,
      "loss": 0.9902,
      "step": 790
    },
    {
      "epoch": 2.7156276686592657,
      "grad_norm": 0.2333984375,
      "learning_rate": 0.00018245998697246352,
      "loss": 1.0003,
      "step": 795
    },
    {
      "epoch": 2.7327070879590094,
      "grad_norm": 0.28125,
      "learning_rate": 0.00018212037852735486,
      "loss": 0.9933,
      "step": 800
    },
    {
      "epoch": 2.749786507258753,
      "grad_norm": 0.283203125,
      "learning_rate": 0.00018177783622700327,
      "loss": 0.9934,
      "step": 805
    },
    {
      "epoch": 2.766865926558497,
      "grad_norm": 0.232421875,
      "learning_rate": 0.0001814323723091692,
      "loss": 0.9887,
      "step": 810
    },
    {
      "epoch": 2.783945345858241,
      "grad_norm": 0.28125,
      "learning_rate": 0.00018108399911599167,
      "loss": 0.995,
      "step": 815
    },
    {
      "epoch": 2.8010247651579845,
      "grad_norm": 0.30078125,
      "learning_rate": 0.00018073272909354727,
      "loss": 0.9897,
      "step": 820
    },
    {
      "epoch": 2.8181041844577286,
      "grad_norm": 0.27734375,
      "learning_rate": 0.00018037857479140547,
      "loss": 0.9923,
      "step": 825
    },
    {
      "epoch": 2.8351836037574722,
      "grad_norm": 0.31640625,
      "learning_rate": 0.00018002154886218033,
      "loss": 0.9877,
      "step": 830
    },
    {
      "epoch": 2.852263023057216,
      "grad_norm": 0.294921875,
      "learning_rate": 0.00017966166406107846,
      "loss": 0.9936,
      "step": 835
    },
    {
      "epoch": 2.86934244235696,
      "grad_norm": 0.28125,
      "learning_rate": 0.00017929893324544332,
      "loss": 0.9931,
      "step": 840
    },
    {
      "epoch": 2.8864218616567037,
      "grad_norm": 0.359375,
      "learning_rate": 0.00017893336937429581,
      "loss": 0.992,
      "step": 845
    },
    {
      "epoch": 2.9035012809564473,
      "grad_norm": 0.26171875,
      "learning_rate": 0.00017856498550787144,
      "loss": 0.9896,
      "step": 850
    },
    {
      "epoch": 2.9205807002561914,
      "grad_norm": 0.2451171875,
      "learning_rate": 0.0001781937948071536,
      "loss": 0.9899,
      "step": 855
    },
    {
      "epoch": 2.937660119555935,
      "grad_norm": 0.283203125,
      "learning_rate": 0.00017781981053340337,
      "loss": 0.9869,
      "step": 860
    },
    {
      "epoch": 2.9547395388556787,
      "grad_norm": 0.43359375,
      "learning_rate": 0.00017744304604768588,
      "loss": 0.9865,
      "step": 865
    },
    {
      "epoch": 2.971818958155423,
      "grad_norm": 0.423828125,
      "learning_rate": 0.00017706351481039284,
      "loss": 0.9885,
      "step": 870
    },
    {
      "epoch": 2.9888983774551665,
      "grad_norm": 0.3125,
      "learning_rate": 0.00017668123038076163,
      "loss": 0.988,
      "step": 875
    },
    {
      "epoch": 2.9991460290350127,
      "eval_loss": 2.468273639678955,
      "eval_runtime": 0.5562,
      "eval_samples_per_second": 17.978,
      "eval_steps_per_second": 1.798,
      "step": 878
    },
    {
      "epoch": 3.00597779675491,
      "grad_norm": 0.2392578125,
      "learning_rate": 0.00017629620641639103,
      "loss": 0.9741,
      "step": 880
    },
    {
      "epoch": 3.0230572160546543,
      "grad_norm": 0.337890625,
      "learning_rate": 0.00017590845667275312,
      "loss": 0.9621,
      "step": 885
    },
    {
      "epoch": 3.040136635354398,
      "grad_norm": 0.390625,
      "learning_rate": 0.00017551799500270198,
      "loss": 0.968,
      "step": 890
    },
    {
      "epoch": 3.0572160546541416,
      "grad_norm": 0.326171875,
      "learning_rate": 0.00017512483535597867,
      "loss": 0.9683,
      "step": 895
    },
    {
      "epoch": 3.0742954739538857,
      "grad_norm": 0.25,
      "learning_rate": 0.00017472899177871297,
      "loss": 0.9671,
      "step": 900
    },
    {
      "epoch": 3.0913748932536294,
      "grad_norm": 0.2431640625,
      "learning_rate": 0.0001743304784129214,
      "loss": 0.9563,
      "step": 905
    },
    {
      "epoch": 3.108454312553373,
      "grad_norm": 0.294921875,
      "learning_rate": 0.00017392930949600217,
      "loss": 0.9678,
      "step": 910
    },
    {
      "epoch": 3.125533731853117,
      "grad_norm": 0.240234375,
      "learning_rate": 0.0001735254993602264,
      "loss": 0.9594,
      "step": 915
    },
    {
      "epoch": 3.1426131511528608,
      "grad_norm": 0.283203125,
      "learning_rate": 0.00017311906243222614,
      "loss": 0.9691,
      "step": 920
    },
    {
      "epoch": 3.1596925704526044,
      "grad_norm": 0.2578125,
      "learning_rate": 0.0001727100132324789,
      "loss": 0.9694,
      "step": 925
    },
    {
      "epoch": 3.1767719897523485,
      "grad_norm": 0.29296875,
      "learning_rate": 0.00017229836637478902,
      "loss": 0.9678,
      "step": 930
    },
    {
      "epoch": 3.193851409052092,
      "grad_norm": 0.2470703125,
      "learning_rate": 0.00017188413656576534,
      "loss": 0.972,
      "step": 935
    },
    {
      "epoch": 3.210930828351836,
      "grad_norm": 0.267578125,
      "learning_rate": 0.00017146733860429612,
      "loss": 0.9661,
      "step": 940
    },
    {
      "epoch": 3.22801024765158,
      "grad_norm": 0.2578125,
      "learning_rate": 0.00017104798738101993,
      "loss": 0.9567,
      "step": 945
    },
    {
      "epoch": 3.2450896669513236,
      "grad_norm": 0.28515625,
      "learning_rate": 0.00017062609787779403,
      "loss": 0.9605,
      "step": 950
    },
    {
      "epoch": 3.2621690862510673,
      "grad_norm": 0.259765625,
      "learning_rate": 0.00017020168516715894,
      "loss": 0.9678,
      "step": 955
    },
    {
      "epoch": 3.2792485055508114,
      "grad_norm": 0.271484375,
      "learning_rate": 0.00016977476441179992,
      "loss": 0.961,
      "step": 960
    },
    {
      "epoch": 3.296327924850555,
      "grad_norm": 0.298828125,
      "learning_rate": 0.00016934535086400538,
      "loss": 0.9657,
      "step": 965
    },
    {
      "epoch": 3.313407344150299,
      "grad_norm": 0.2734375,
      "learning_rate": 0.0001689134598651219,
      "loss": 0.9601,
      "step": 970
    },
    {
      "epoch": 3.330486763450043,
      "grad_norm": 0.2314453125,
      "learning_rate": 0.00016847910684500615,
      "loss": 0.9652,
      "step": 975
    },
    {
      "epoch": 3.3475661827497865,
      "grad_norm": 0.298828125,
      "learning_rate": 0.0001680423073214737,
      "loss": 0.9755,
      "step": 980
    },
    {
      "epoch": 3.3646456020495306,
      "grad_norm": 0.42578125,
      "learning_rate": 0.0001676030768997445,
      "loss": 0.9641,
      "step": 985
    },
    {
      "epoch": 3.381725021349274,
      "grad_norm": 0.6875,
      "learning_rate": 0.00016716143127188548,
      "loss": 0.9652,
      "step": 990
    },
    {
      "epoch": 3.398804440649018,
      "grad_norm": 0.400390625,
      "learning_rate": 0.0001667173862162499,
      "loss": 0.9647,
      "step": 995
    },
    {
      "epoch": 3.415883859948762,
      "grad_norm": 0.2890625,
      "learning_rate": 0.00016627095759691362,
      "loss": 0.9689,
      "step": 1000
    },
    {
      "epoch": 3.4329632792485056,
      "grad_norm": 0.318359375,
      "learning_rate": 0.0001658221613631083,
      "loss": 0.9656,
      "step": 1005
    },
    {
      "epoch": 3.4500426985482493,
      "grad_norm": 0.29296875,
      "learning_rate": 0.0001653710135486518,
      "loss": 0.9601,
      "step": 1010
    },
    {
      "epoch": 3.4671221178479934,
      "grad_norm": 0.34765625,
      "learning_rate": 0.00016491753027137498,
      "loss": 0.9669,
      "step": 1015
    },
    {
      "epoch": 3.484201537147737,
      "grad_norm": 0.302734375,
      "learning_rate": 0.00016446172773254629,
      "loss": 0.9606,
      "step": 1020
    },
    {
      "epoch": 3.5012809564474807,
      "grad_norm": 0.26953125,
      "learning_rate": 0.00016400362221629264,
      "loss": 0.9693,
      "step": 1025
    },
    {
      "epoch": 3.518360375747225,
      "grad_norm": 0.32421875,
      "learning_rate": 0.00016354323008901776,
      "loss": 0.9631,
      "step": 1030
    },
    {
      "epoch": 3.5354397950469685,
      "grad_norm": 0.34375,
      "learning_rate": 0.0001630805677988175,
      "loss": 0.9601,
      "step": 1035
    },
    {
      "epoch": 3.552519214346712,
      "grad_norm": 0.24609375,
      "learning_rate": 0.0001626156518748922,
      "loss": 0.9593,
      "step": 1040
    },
    {
      "epoch": 3.5695986336464562,
      "grad_norm": 0.21875,
      "learning_rate": 0.00016214849892695602,
      "loss": 0.9611,
      "step": 1045
    },
    {
      "epoch": 3.5866780529462,
      "grad_norm": 0.283203125,
      "learning_rate": 0.00016167912564464383,
      "loss": 0.966,
      "step": 1050
    },
    {
      "epoch": 3.6037574722459436,
      "grad_norm": 0.267578125,
      "learning_rate": 0.00016120754879691464,
      "loss": 0.9651,
      "step": 1055
    },
    {
      "epoch": 3.6208368915456877,
      "grad_norm": 0.330078125,
      "learning_rate": 0.0001607337852314527,
      "loss": 0.9591,
      "step": 1060
    },
    {
      "epoch": 3.6379163108454313,
      "grad_norm": 0.27734375,
      "learning_rate": 0.00016025785187406553,
      "loss": 0.9578,
      "step": 1065
    },
    {
      "epoch": 3.654995730145175,
      "grad_norm": 0.369140625,
      "learning_rate": 0.0001597797657280792,
      "loss": 0.9691,
      "step": 1070
    },
    {
      "epoch": 3.672075149444919,
      "grad_norm": 0.25,
      "learning_rate": 0.00015929954387373103,
      "loss": 0.9591,
      "step": 1075
    },
    {
      "epoch": 3.6891545687446627,
      "grad_norm": 0.478515625,
      "learning_rate": 0.00015881720346755905,
      "loss": 0.9629,
      "step": 1080
    },
    {
      "epoch": 3.7062339880444064,
      "grad_norm": 0.267578125,
      "learning_rate": 0.00015833276174178937,
      "loss": 0.9564,
      "step": 1085
    },
    {
      "epoch": 3.7233134073441505,
      "grad_norm": 0.2578125,
      "learning_rate": 0.00015784623600372042,
      "loss": 0.9644,
      "step": 1090
    },
    {
      "epoch": 3.740392826643894,
      "grad_norm": 0.3203125,
      "learning_rate": 0.0001573576436351046,
      "loss": 0.9592,
      "step": 1095
    },
    {
      "epoch": 3.757472245943638,
      "grad_norm": 0.2890625,
      "learning_rate": 0.00015686700209152738,
      "loss": 0.9709,
      "step": 1100
    },
    {
      "epoch": 3.774551665243382,
      "grad_norm": 1.3828125,
      "learning_rate": 0.00015637432890178353,
      "loss": 0.9658,
      "step": 1105
    },
    {
      "epoch": 3.7916310845431256,
      "grad_norm": 0.314453125,
      "learning_rate": 0.00015587964166725095,
      "loss": 0.9621,
      "step": 1110
    },
    {
      "epoch": 3.8087105038428692,
      "grad_norm": 0.2353515625,
      "learning_rate": 0.00015538295806126205,
      "loss": 0.9648,
      "step": 1115
    },
    {
      "epoch": 3.8257899231426133,
      "grad_norm": 0.306640625,
      "learning_rate": 0.00015488429582847192,
      "loss": 0.9647,
      "step": 1120
    },
    {
      "epoch": 3.842869342442357,
      "grad_norm": 0.27734375,
      "learning_rate": 0.0001543836727842248,
      "loss": 0.9569,
      "step": 1125
    },
    {
      "epoch": 3.8599487617421007,
      "grad_norm": 0.318359375,
      "learning_rate": 0.00015388110681391725,
      "loss": 0.9615,
      "step": 1130
    },
    {
      "epoch": 3.8770281810418448,
      "grad_norm": 0.2412109375,
      "learning_rate": 0.00015337661587235953,
      "loss": 0.9561,
      "step": 1135
    },
    {
      "epoch": 3.8941076003415884,
      "grad_norm": 0.2275390625,
      "learning_rate": 0.0001528702179831338,
      "loss": 0.9686,
      "step": 1140
    },
    {
      "epoch": 3.911187019641332,
      "grad_norm": 0.28515625,
      "learning_rate": 0.00015236193123795041,
      "loss": 0.959,
      "step": 1145
    },
    {
      "epoch": 3.928266438941076,
      "grad_norm": 0.25,
      "learning_rate": 0.00015185177379600152,
      "loss": 0.9545,
      "step": 1150
    },
    {
      "epoch": 3.94534585824082,
      "grad_norm": 0.322265625,
      "learning_rate": 0.00015133976388331227,
      "loss": 0.9626,
      "step": 1155
    },
    {
      "epoch": 3.9624252775405635,
      "grad_norm": 0.26171875,
      "learning_rate": 0.00015082591979208976,
      "loss": 0.9595,
      "step": 1160
    },
    {
      "epoch": 3.9795046968403076,
      "grad_norm": 0.24609375,
      "learning_rate": 0.00015031025988006936,
      "loss": 0.959,
      "step": 1165
    },
    {
      "epoch": 3.9965841161400513,
      "grad_norm": 0.28515625,
      "learning_rate": 0.000149792802569859,
      "loss": 0.9741,
      "step": 1170
    },
    {
      "epoch": 4.0,
      "eval_loss": 2.4999709129333496,
      "eval_runtime": 0.5563,
      "eval_samples_per_second": 17.977,
      "eval_steps_per_second": 1.798,
      "step": 1171
    },
    {
      "epoch": 4.013663535439795,
      "grad_norm": 0.419921875,
      "learning_rate": 0.00014927356634828094,
      "loss": 0.943,
      "step": 1175
    },
    {
      "epoch": 4.030742954739539,
      "grad_norm": 0.314453125,
      "learning_rate": 0.00014875256976571135,
      "loss": 0.9301,
      "step": 1180
    },
    {
      "epoch": 4.047822374039282,
      "grad_norm": 0.25,
      "learning_rate": 0.00014822983143541752,
      "loss": 0.9339,
      "step": 1185
    },
    {
      "epoch": 4.064901793339026,
      "grad_norm": 0.3125,
      "learning_rate": 0.0001477053700328929,
      "loss": 0.9284,
      "step": 1190
    },
    {
      "epoch": 4.0819812126387705,
      "grad_norm": 0.30078125,
      "learning_rate": 0.00014717920429518984,
      "loss": 0.9403,
      "step": 1195
    },
    {
      "epoch": 4.099060631938514,
      "grad_norm": 0.3203125,
      "learning_rate": 0.00014665135302025035,
      "loss": 0.936,
      "step": 1200
    },
    {
      "epoch": 4.116140051238258,
      "grad_norm": 0.306640625,
      "learning_rate": 0.00014612183506623432,
      "loss": 0.9361,
      "step": 1205
    },
    {
      "epoch": 4.133219470538002,
      "grad_norm": 0.28515625,
      "learning_rate": 0.00014559066935084588,
      "loss": 0.9354,
      "step": 1210
    },
    {
      "epoch": 4.150298889837745,
      "grad_norm": 0.380859375,
      "learning_rate": 0.0001450578748506576,
      "loss": 0.9339,
      "step": 1215
    },
    {
      "epoch": 4.167378309137489,
      "grad_norm": 0.248046875,
      "learning_rate": 0.00014452347060043237,
      "loss": 0.9319,
      "step": 1220
    },
    {
      "epoch": 4.184457728437233,
      "grad_norm": 0.330078125,
      "learning_rate": 0.00014398747569244354,
      "loss": 0.9403,
      "step": 1225
    },
    {
      "epoch": 4.2015371477369765,
      "grad_norm": 0.2578125,
      "learning_rate": 0.00014344990927579268,
      "loss": 0.9368,
      "step": 1230
    },
    {
      "epoch": 4.218616567036721,
      "grad_norm": 0.294921875,
      "learning_rate": 0.00014291079055572554,
      "loss": 0.9327,
      "step": 1235
    },
    {
      "epoch": 4.235695986336465,
      "grad_norm": 0.24609375,
      "learning_rate": 0.0001423701387929459,
      "loss": 0.942,
      "step": 1240
    },
    {
      "epoch": 4.252775405636209,
      "grad_norm": 0.296875,
      "learning_rate": 0.0001418279733029274,
      "loss": 0.9416,
      "step": 1245
    },
    {
      "epoch": 4.269854824935952,
      "grad_norm": 0.357421875,
      "learning_rate": 0.0001412843134552235,
      "loss": 0.9365,
      "step": 1250
    },
    {
      "epoch": 4.286934244235696,
      "grad_norm": 0.2421875,
      "learning_rate": 0.00014073917867277557,
      "loss": 0.9334,
      "step": 1255
    },
    {
      "epoch": 4.304013663535439,
      "grad_norm": 0.322265625,
      "learning_rate": 0.00014019258843121893,
      "loss": 0.9374,
      "step": 1260
    },
    {
      "epoch": 4.3210930828351835,
      "grad_norm": 0.34765625,
      "learning_rate": 0.0001396445622581869,
      "loss": 0.9309,
      "step": 1265
    },
    {
      "epoch": 4.3381725021349276,
      "grad_norm": 0.2421875,
      "learning_rate": 0.0001390951197326134,
      "loss": 0.9256,
      "step": 1270
    },
    {
      "epoch": 4.355251921434672,
      "grad_norm": 0.29296875,
      "learning_rate": 0.00013854428048403324,
      "loss": 0.9336,
      "step": 1275
    },
    {
      "epoch": 4.372331340734415,
      "grad_norm": 0.275390625,
      "learning_rate": 0.00013799206419188103,
      "loss": 0.9441,
      "step": 1280
    },
    {
      "epoch": 4.389410760034159,
      "grad_norm": 0.232421875,
      "learning_rate": 0.00013743849058478808,
      "loss": 0.938,
      "step": 1285
    },
    {
      "epoch": 4.406490179333902,
      "grad_norm": 0.3515625,
      "learning_rate": 0.00013688357943987732,
      "loss": 0.9389,
      "step": 1290
    },
    {
      "epoch": 4.423569598633646,
      "grad_norm": 0.29296875,
      "learning_rate": 0.00013632735058205706,
      "loss": 0.945,
      "step": 1295
    },
    {
      "epoch": 4.44064901793339,
      "grad_norm": 0.232421875,
      "learning_rate": 0.0001357698238833126,
      "loss": 0.9378,
      "step": 1300
    },
    {
      "epoch": 4.4577284372331345,
      "grad_norm": 0.279296875,
      "learning_rate": 0.00013521101926199607,
      "loss": 0.9378,
      "step": 1305
    },
    {
      "epoch": 4.474807856532878,
      "grad_norm": 0.291015625,
      "learning_rate": 0.0001346509566821153,
      "loss": 0.9409,
      "step": 1310
    },
    {
      "epoch": 4.491887275832622,
      "grad_norm": 0.3125,
      "learning_rate": 0.00013408965615262008,
      "loss": 0.9363,
      "step": 1315
    },
    {
      "epoch": 4.508966695132365,
      "grad_norm": 0.265625,
      "learning_rate": 0.00013352713772668765,
      "loss": 0.9413,
      "step": 1320
    },
    {
      "epoch": 4.526046114432109,
      "grad_norm": 0.2470703125,
      "learning_rate": 0.00013296342150100605,
      "loss": 0.9509,
      "step": 1325
    },
    {
      "epoch": 4.543125533731853,
      "grad_norm": 0.24609375,
      "learning_rate": 0.00013239852761505626,
      "loss": 0.9361,
      "step": 1330
    },
    {
      "epoch": 4.560204953031597,
      "grad_norm": 0.255859375,
      "learning_rate": 0.00013183247625039282,
      "loss": 0.9366,
      "step": 1335
    },
    {
      "epoch": 4.577284372331341,
      "grad_norm": 0.24609375,
      "learning_rate": 0.00013126528762992247,
      "loss": 0.9381,
      "step": 1340
    },
    {
      "epoch": 4.594363791631085,
      "grad_norm": 0.275390625,
      "learning_rate": 0.000130696982017182,
      "loss": 0.9394,
      "step": 1345
    },
    {
      "epoch": 4.611443210930828,
      "grad_norm": 0.294921875,
      "learning_rate": 0.00013012757971561415,
      "loss": 0.9363,
      "step": 1350
    },
    {
      "epoch": 4.628522630230572,
      "grad_norm": 0.291015625,
      "learning_rate": 0.00012955710106784214,
      "loss": 0.9323,
      "step": 1355
    },
    {
      "epoch": 4.645602049530316,
      "grad_norm": 0.294921875,
      "learning_rate": 0.00012898556645494325,
      "loss": 0.9387,
      "step": 1360
    },
    {
      "epoch": 4.66268146883006,
      "grad_norm": 0.291015625,
      "learning_rate": 0.00012841299629572032,
      "loss": 0.935,
      "step": 1365
    },
    {
      "epoch": 4.679760888129803,
      "grad_norm": 0.359375,
      "learning_rate": 0.0001278394110459724,
      "loss": 0.9446,
      "step": 1370
    },
    {
      "epoch": 4.6968403074295475,
      "grad_norm": 0.255859375,
      "learning_rate": 0.000127264831197764,
      "loss": 0.9372,
      "step": 1375
    },
    {
      "epoch": 4.713919726729291,
      "grad_norm": 0.275390625,
      "learning_rate": 0.0001266892772786929,
      "loss": 0.9363,
      "step": 1380
    },
    {
      "epoch": 4.730999146029035,
      "grad_norm": 0.26171875,
      "learning_rate": 0.00012611276985115678,
      "loss": 0.9394,
      "step": 1385
    },
    {
      "epoch": 4.748078565328779,
      "grad_norm": 0.33984375,
      "learning_rate": 0.0001255353295116187,
      "loss": 0.9438,
      "step": 1390
    },
    {
      "epoch": 4.765157984628523,
      "grad_norm": 0.267578125,
      "learning_rate": 0.00012495697688987112,
      "loss": 0.942,
      "step": 1395
    },
    {
      "epoch": 4.782237403928266,
      "grad_norm": 0.326171875,
      "learning_rate": 0.00012437773264829897,
      "loss": 0.9436,
      "step": 1400
    },
    {
      "epoch": 4.79931682322801,
      "grad_norm": 0.2890625,
      "learning_rate": 0.0001237976174811414,
      "loss": 0.9403,
      "step": 1405
    },
    {
      "epoch": 4.816396242527754,
      "grad_norm": 0.2578125,
      "learning_rate": 0.00012321665211375256,
      "loss": 0.9361,
      "step": 1410
    },
    {
      "epoch": 4.833475661827498,
      "grad_norm": 0.234375,
      "learning_rate": 0.00012263485730186103,
      "loss": 0.9404,
      "step": 1415
    },
    {
      "epoch": 4.850555081127242,
      "grad_norm": 0.283203125,
      "learning_rate": 0.00012205225383082843,
      "loss": 0.9409,
      "step": 1420
    },
    {
      "epoch": 4.867634500426986,
      "grad_norm": 0.2421875,
      "learning_rate": 0.0001214688625149066,
      "loss": 0.9351,
      "step": 1425
    },
    {
      "epoch": 4.884713919726729,
      "grad_norm": 0.2890625,
      "learning_rate": 0.00012088470419649432,
      "loss": 0.938,
      "step": 1430
    },
    {
      "epoch": 4.901793339026473,
      "grad_norm": 0.236328125,
      "learning_rate": 0.00012029979974539234,
      "loss": 0.9425,
      "step": 1435
    },
    {
      "epoch": 4.918872758326217,
      "grad_norm": 0.240234375,
      "learning_rate": 0.00011971417005805818,
      "loss": 0.9372,
      "step": 1440
    },
    {
      "epoch": 4.9359521776259605,
      "grad_norm": 0.2451171875,
      "learning_rate": 0.00011912783605685913,
      "loss": 0.9399,
      "step": 1445
    },
    {
      "epoch": 4.953031596925705,
      "grad_norm": 0.251953125,
      "learning_rate": 0.0001185408186893251,
      "loss": 0.9385,
      "step": 1450
    },
    {
      "epoch": 4.970111016225449,
      "grad_norm": 0.318359375,
      "learning_rate": 0.0001179531389274001,
      "loss": 0.9311,
      "step": 1455
    },
    {
      "epoch": 4.987190435525192,
      "grad_norm": 0.2470703125,
      "learning_rate": 0.00011736481776669306,
      "loss": 0.9342,
      "step": 1460
    },
    {
      "epoch": 4.997438087105039,
      "eval_loss": 2.5202550888061523,
      "eval_runtime": 0.5609,
      "eval_samples_per_second": 17.829,
      "eval_steps_per_second": 1.783,
      "step": 1463
    },
    {
      "epoch": 5.004269854824936,
      "grad_norm": 0.263671875,
      "learning_rate": 0.00011677587622572763,
      "loss": 0.9354,
      "step": 1465
    },
    {
      "epoch": 5.02134927412468,
      "grad_norm": 0.28515625,
      "learning_rate": 0.00011618633534519141,
      "loss": 0.9194,
      "step": 1470
    },
    {
      "epoch": 5.038428693424423,
      "grad_norm": 0.3046875,
      "learning_rate": 0.00011559621618718414,
      "loss": 0.9073,
      "step": 1475
    },
    {
      "epoch": 5.0555081127241674,
      "grad_norm": 0.30078125,
      "learning_rate": 0.00011500553983446527,
      "loss": 0.9146,
      "step": 1480
    },
    {
      "epoch": 5.0725875320239115,
      "grad_norm": 0.24609375,
      "learning_rate": 0.00011441432738970072,
      "loss": 0.9098,
      "step": 1485
    },
    {
      "epoch": 5.089666951323655,
      "grad_norm": 0.291015625,
      "learning_rate": 0.00011382259997470899,
      "loss": 0.9135,
      "step": 1490
    },
    {
      "epoch": 5.106746370623399,
      "grad_norm": 0.294921875,
      "learning_rate": 0.00011323037872970657,
      "loss": 0.9174,
      "step": 1495
    },
    {
      "epoch": 5.123825789923143,
      "grad_norm": 0.240234375,
      "learning_rate": 0.00011263768481255264,
      "loss": 0.9155,
      "step": 1500
    },
    {
      "epoch": 5.140905209222886,
      "grad_norm": 0.25,
      "learning_rate": 0.00011204453939799315,
      "loss": 0.9115,
      "step": 1505
    },
    {
      "epoch": 5.15798462852263,
      "grad_norm": 0.3046875,
      "learning_rate": 0.00011145096367690444,
      "loss": 0.9112,
      "step": 1510
    },
    {
      "epoch": 5.175064047822374,
      "grad_norm": 0.26953125,
      "learning_rate": 0.0001108569788555361,
      "loss": 0.9142,
      "step": 1515
    },
    {
      "epoch": 5.192143467122118,
      "grad_norm": 0.251953125,
      "learning_rate": 0.00011026260615475333,
      "loss": 0.9116,
      "step": 1520
    },
    {
      "epoch": 5.209222886421862,
      "grad_norm": 0.2490234375,
      "learning_rate": 0.00010966786680927874,
      "loss": 0.9141,
      "step": 1525
    },
    {
      "epoch": 5.226302305721606,
      "grad_norm": 0.25,
      "learning_rate": 0.00010907278206693395,
      "loss": 0.9168,
      "step": 1530
    },
    {
      "epoch": 5.243381725021349,
      "grad_norm": 0.30078125,
      "learning_rate": 0.00010847737318788013,
      "loss": 0.9216,
      "step": 1535
    },
    {
      "epoch": 5.260461144321093,
      "grad_norm": 0.283203125,
      "learning_rate": 0.00010788166144385888,
      "loss": 0.9167,
      "step": 1540
    },
    {
      "epoch": 5.277540563620837,
      "grad_norm": 0.25390625,
      "learning_rate": 0.0001072856681174318,
      "loss": 0.9155,
      "step": 1545
    },
    {
      "epoch": 5.2946199829205804,
      "grad_norm": 0.2734375,
      "learning_rate": 0.00010668941450122055,
      "loss": 0.9218,
      "step": 1550
    },
    {
      "epoch": 5.3116994022203246,
      "grad_norm": 0.259765625,
      "learning_rate": 0.00010609292189714586,
      "loss": 0.9132,
      "step": 1555
    },
    {
      "epoch": 5.328778821520069,
      "grad_norm": 0.25,
      "learning_rate": 0.0001054962116156667,
      "loss": 0.9181,
      "step": 1560
    },
    {
      "epoch": 5.345858240819812,
      "grad_norm": 0.2392578125,
      "learning_rate": 0.0001048993049750188,
      "loss": 0.9151,
      "step": 1565
    },
    {
      "epoch": 5.362937660119556,
      "grad_norm": 0.2490234375,
      "learning_rate": 0.00010430222330045304,
      "loss": 0.9096,
      "step": 1570
    },
    {
      "epoch": 5.3800170794193,
      "grad_norm": 0.2353515625,
      "learning_rate": 0.0001037049879234737,
      "loss": 0.9183,
      "step": 1575
    },
    {
      "epoch": 5.397096498719043,
      "grad_norm": 0.26171875,
      "learning_rate": 0.0001031076201810762,
      "loss": 0.9151,
      "step": 1580
    },
    {
      "epoch": 5.414175918018787,
      "grad_norm": 0.2890625,
      "learning_rate": 0.00010251014141498484,
      "loss": 0.9205,
      "step": 1585
    },
    {
      "epoch": 5.4312553373185315,
      "grad_norm": 0.294921875,
      "learning_rate": 0.00010191257297089052,
      "loss": 0.9114,
      "step": 1590
    },
    {
      "epoch": 5.448334756618275,
      "grad_norm": 0.2392578125,
      "learning_rate": 0.00010131493619768788,
      "loss": 0.9148,
      "step": 1595
    },
    {
      "epoch": 5.465414175918019,
      "grad_norm": 0.244140625,
      "learning_rate": 0.00010071725244671282,
      "loss": 0.9202,
      "step": 1600
    },
    {
      "epoch": 5.482493595217763,
      "grad_norm": 0.259765625,
      "learning_rate": 0.00010011954307097942,
      "loss": 0.9216,
      "step": 1605
    },
    {
      "epoch": 5.499573014517506,
      "grad_norm": 0.251953125,
      "learning_rate": 9.952182942441733e-05,
      "loss": 0.9206,
      "step": 1610
    },
    {
      "epoch": 5.51665243381725,
      "grad_norm": 0.265625,
      "learning_rate": 9.892413286110886e-05,
      "loss": 0.9193,
      "step": 1615
    },
    {
      "epoch": 5.533731853116994,
      "grad_norm": 0.2431640625,
      "learning_rate": 9.83264747345259e-05,
      "loss": 0.9195,
      "step": 1620
    },
    {
      "epoch": 5.5508112724167376,
      "grad_norm": 0.28515625,
      "learning_rate": 9.772887639676707e-05,
      "loss": 0.9178,
      "step": 1625
    },
    {
      "epoch": 5.567890691716482,
      "grad_norm": 0.2373046875,
      "learning_rate": 9.713135919779515e-05,
      "loss": 0.9174,
      "step": 1630
    },
    {
      "epoch": 5.584970111016226,
      "grad_norm": 0.2890625,
      "learning_rate": 9.653394448467399e-05,
      "loss": 0.9194,
      "step": 1635
    },
    {
      "epoch": 5.602049530315969,
      "grad_norm": 0.2578125,
      "learning_rate": 9.593665360080599e-05,
      "loss": 0.9192,
      "step": 1640
    },
    {
      "epoch": 5.619128949615713,
      "grad_norm": 0.28515625,
      "learning_rate": 9.533950788516974e-05,
      "loss": 0.9154,
      "step": 1645
    },
    {
      "epoch": 5.636208368915457,
      "grad_norm": 0.255859375,
      "learning_rate": 9.474252867155732e-05,
      "loss": 0.9142,
      "step": 1650
    },
    {
      "epoch": 5.6532877882152,
      "grad_norm": 0.267578125,
      "learning_rate": 9.414573728781247e-05,
      "loss": 0.9101,
      "step": 1655
    },
    {
      "epoch": 5.6703672075149445,
      "grad_norm": 0.28125,
      "learning_rate": 9.354915505506839e-05,
      "loss": 0.9158,
      "step": 1660
    },
    {
      "epoch": 5.687446626814689,
      "grad_norm": 0.2578125,
      "learning_rate": 9.295280328698604e-05,
      "loss": 0.9181,
      "step": 1665
    },
    {
      "epoch": 5.704526046114432,
      "grad_norm": 0.306640625,
      "learning_rate": 9.235670328899293e-05,
      "loss": 0.9138,
      "step": 1670
    },
    {
      "epoch": 5.721605465414176,
      "grad_norm": 0.26171875,
      "learning_rate": 9.176087635752156e-05,
      "loss": 0.9119,
      "step": 1675
    },
    {
      "epoch": 5.73868488471392,
      "grad_norm": 0.26171875,
      "learning_rate": 9.116534377924883e-05,
      "loss": 0.9213,
      "step": 1680
    },
    {
      "epoch": 5.755764304013663,
      "grad_norm": 0.251953125,
      "learning_rate": 9.057012683033555e-05,
      "loss": 0.9177,
      "step": 1685
    },
    {
      "epoch": 5.772843723313407,
      "grad_norm": 0.298828125,
      "learning_rate": 8.997524677566627e-05,
      "loss": 0.9217,
      "step": 1690
    },
    {
      "epoch": 5.789923142613151,
      "grad_norm": 0.236328125,
      "learning_rate": 8.938072486808952e-05,
      "loss": 0.9167,
      "step": 1695
    },
    {
      "epoch": 5.807002561912895,
      "grad_norm": 0.2392578125,
      "learning_rate": 8.878658234765858e-05,
      "loss": 0.9207,
      "step": 1700
    },
    {
      "epoch": 5.824081981212639,
      "grad_norm": 0.365234375,
      "learning_rate": 8.81928404408726e-05,
      "loss": 0.9173,
      "step": 1705
    },
    {
      "epoch": 5.841161400512383,
      "grad_norm": 0.298828125,
      "learning_rate": 8.759952035991844e-05,
      "loss": 0.9192,
      "step": 1710
    },
    {
      "epoch": 5.858240819812126,
      "grad_norm": 0.26171875,
      "learning_rate": 8.70066433019125e-05,
      "loss": 0.9178,
      "step": 1715
    },
    {
      "epoch": 5.87532023911187,
      "grad_norm": 0.2373046875,
      "learning_rate": 8.641423044814374e-05,
      "loss": 0.9246,
      "step": 1720
    },
    {
      "epoch": 5.892399658411614,
      "grad_norm": 0.255859375,
      "learning_rate": 8.582230296331686e-05,
      "loss": 0.9187,
      "step": 1725
    },
    {
      "epoch": 5.9094790777113575,
      "grad_norm": 0.240234375,
      "learning_rate": 8.5230881994796e-05,
      "loss": 0.9211,
      "step": 1730
    },
    {
      "epoch": 5.926558497011102,
      "grad_norm": 0.265625,
      "learning_rate": 8.463998867184952e-05,
      "loss": 0.9194,
      "step": 1735
    },
    {
      "epoch": 5.943637916310846,
      "grad_norm": 0.2490234375,
      "learning_rate": 8.404964410489485e-05,
      "loss": 0.9215,
      "step": 1740
    },
    {
      "epoch": 5.960717335610589,
      "grad_norm": 0.2412109375,
      "learning_rate": 8.34598693847444e-05,
      "loss": 0.9144,
      "step": 1745
    },
    {
      "epoch": 5.977796754910333,
      "grad_norm": 0.287109375,
      "learning_rate": 8.287068558185225e-05,
      "loss": 0.9175,
      "step": 1750
    },
    {
      "epoch": 5.994876174210077,
      "grad_norm": 0.2314453125,
      "learning_rate": 8.228211374556103e-05,
      "loss": 0.9201,
      "step": 1755
    },
    {
      "epoch": 5.998292058070025,
      "eval_loss": 2.5518593788146973,
      "eval_runtime": 0.5597,
      "eval_samples_per_second": 17.868,
      "eval_steps_per_second": 1.787,
      "step": 1756
    },
    {
      "epoch": 6.01195559350982,
      "grad_norm": 0.2490234375,
      "learning_rate": 8.169417490335007e-05,
      "loss": 0.9044,
      "step": 1760
    },
    {
      "epoch": 6.029035012809564,
      "grad_norm": 0.265625,
      "learning_rate": 8.110689006008434e-05,
      "loss": 0.8914,
      "step": 1765
    },
    {
      "epoch": 6.0461144321093085,
      "grad_norm": 0.263671875,
      "learning_rate": 8.052028019726371e-05,
      "loss": 0.896,
      "step": 1770
    },
    {
      "epoch": 6.063193851409052,
      "grad_norm": 0.26953125,
      "learning_rate": 7.993436627227368e-05,
      "loss": 0.9072,
      "step": 1775
    },
    {
      "epoch": 6.080273270708796,
      "grad_norm": 0.248046875,
      "learning_rate": 7.934916921763628e-05,
      "loss": 0.8999,
      "step": 1780
    },
    {
      "epoch": 6.09735269000854,
      "grad_norm": 0.25,
      "learning_rate": 7.876470994026254e-05,
      "loss": 0.8991,
      "step": 1785
    },
    {
      "epoch": 6.114432109308283,
      "grad_norm": 0.259765625,
      "learning_rate": 7.818100932070546e-05,
      "loss": 0.9065,
      "step": 1790
    },
    {
      "epoch": 6.131511528608027,
      "grad_norm": 0.251953125,
      "learning_rate": 7.759808821241406e-05,
      "loss": 0.8899,
      "step": 1795
    },
    {
      "epoch": 6.148590947907771,
      "grad_norm": 0.24609375,
      "learning_rate": 7.701596744098818e-05,
      "loss": 0.8956,
      "step": 1800
    },
    {
      "epoch": 6.165670367207515,
      "grad_norm": 0.26171875,
      "learning_rate": 7.643466780343479e-05,
      "loss": 0.8964,
      "step": 1805
    },
    {
      "epoch": 6.182749786507259,
      "grad_norm": 0.2470703125,
      "learning_rate": 7.585421006742463e-05,
      "loss": 0.8985,
      "step": 1810
    },
    {
      "epoch": 6.199829205807003,
      "grad_norm": 0.251953125,
      "learning_rate": 7.527461497055061e-05,
      "loss": 0.8979,
      "step": 1815
    },
    {
      "epoch": 6.216908625106746,
      "grad_norm": 0.26953125,
      "learning_rate": 7.469590321958662e-05,
      "loss": 0.9014,
      "step": 1820
    },
    {
      "epoch": 6.23398804440649,
      "grad_norm": 0.25,
      "learning_rate": 7.411809548974792e-05,
      "loss": 0.9059,
      "step": 1825
    },
    {
      "epoch": 6.251067463706234,
      "grad_norm": 0.23828125,
      "learning_rate": 7.354121242395254e-05,
      "loss": 0.903,
      "step": 1830
    },
    {
      "epoch": 6.268146883005977,
      "grad_norm": 0.267578125,
      "learning_rate": 7.296527463208358e-05,
      "loss": 0.8955,
      "step": 1835
    },
    {
      "epoch": 6.2852263023057215,
      "grad_norm": 0.2578125,
      "learning_rate": 7.239030269025311e-05,
      "loss": 0.8991,
      "step": 1840
    },
    {
      "epoch": 6.302305721605466,
      "grad_norm": 0.255859375,
      "learning_rate": 7.1816317140067e-05,
      "loss": 0.9014,
      "step": 1845
    },
    {
      "epoch": 6.319385140905209,
      "grad_norm": 0.244140625,
      "learning_rate": 7.124333848789091e-05,
      "loss": 0.9015,
      "step": 1850
    },
    {
      "epoch": 6.336464560204953,
      "grad_norm": 0.251953125,
      "learning_rate": 7.067138720411795e-05,
      "loss": 0.9039,
      "step": 1855
    },
    {
      "epoch": 6.353543979504697,
      "grad_norm": 0.263671875,
      "learning_rate": 7.010048372243698e-05,
      "loss": 0.8993,
      "step": 1860
    },
    {
      "epoch": 6.37062339880444,
      "grad_norm": 0.2734375,
      "learning_rate": 6.953064843910296e-05,
      "loss": 0.908,
      "step": 1865
    },
    {
      "epoch": 6.387702818104184,
      "grad_norm": 0.26953125,
      "learning_rate": 6.8961901712208e-05,
      "loss": 0.9021,
      "step": 1870
    },
    {
      "epoch": 6.4047822374039285,
      "grad_norm": 0.240234375,
      "learning_rate": 6.839426386095425e-05,
      "loss": 0.9002,
      "step": 1875
    },
    {
      "epoch": 6.421861656703672,
      "grad_norm": 0.279296875,
      "learning_rate": 6.782775516492771e-05,
      "loss": 0.9007,
      "step": 1880
    },
    {
      "epoch": 6.438941076003416,
      "grad_norm": 0.265625,
      "learning_rate": 6.726239586337408e-05,
      "loss": 0.8959,
      "step": 1885
    },
    {
      "epoch": 6.45602049530316,
      "grad_norm": 0.2470703125,
      "learning_rate": 6.669820615447522e-05,
      "loss": 0.9078,
      "step": 1890
    },
    {
      "epoch": 6.473099914602903,
      "grad_norm": 0.2490234375,
      "learning_rate": 6.613520619462803e-05,
      "loss": 0.8996,
      "step": 1895
    },
    {
      "epoch": 6.490179333902647,
      "grad_norm": 0.251953125,
      "learning_rate": 6.5573416097724e-05,
      "loss": 0.9023,
      "step": 1900
    },
    {
      "epoch": 6.507258753202391,
      "grad_norm": 0.259765625,
      "learning_rate": 6.50128559344307e-05,
      "loss": 0.9004,
      "step": 1905
    },
    {
      "epoch": 6.5243381725021345,
      "grad_norm": 0.244140625,
      "learning_rate": 6.445354573147484e-05,
      "loss": 0.9088,
      "step": 1910
    },
    {
      "epoch": 6.541417591801879,
      "grad_norm": 0.2412109375,
      "learning_rate": 6.389550547092661e-05,
      "loss": 0.8937,
      "step": 1915
    },
    {
      "epoch": 6.558497011101623,
      "grad_norm": 0.26171875,
      "learning_rate": 6.333875508948593e-05,
      "loss": 0.906,
      "step": 1920
    },
    {
      "epoch": 6.575576430401366,
      "grad_norm": 0.2431640625,
      "learning_rate": 6.278331447777021e-05,
      "loss": 0.9062,
      "step": 1925
    },
    {
      "epoch": 6.59265584970111,
      "grad_norm": 0.251953125,
      "learning_rate": 6.22292034796035e-05,
      "loss": 0.9004,
      "step": 1930
    },
    {
      "epoch": 6.609735269000854,
      "grad_norm": 0.2490234375,
      "learning_rate": 6.167644189130794e-05,
      "loss": 0.8995,
      "step": 1935
    },
    {
      "epoch": 6.626814688300598,
      "grad_norm": 0.2412109375,
      "learning_rate": 6.112504946099604e-05,
      "loss": 0.9011,
      "step": 1940
    },
    {
      "epoch": 6.6438941076003415,
      "grad_norm": 0.248046875,
      "learning_rate": 6.057504588786556e-05,
      "loss": 0.8957,
      "step": 1945
    },
    {
      "epoch": 6.660973526900086,
      "grad_norm": 0.2578125,
      "learning_rate": 6.0026450821495536e-05,
      "loss": 0.909,
      "step": 1950
    },
    {
      "epoch": 6.678052946199829,
      "grad_norm": 0.2373046875,
      "learning_rate": 5.947928386114428e-05,
      "loss": 0.8996,
      "step": 1955
    },
    {
      "epoch": 6.695132365499573,
      "grad_norm": 0.34375,
      "learning_rate": 5.8933564555049105e-05,
      "loss": 0.9072,
      "step": 1960
    },
    {
      "epoch": 6.712211784799317,
      "grad_norm": 0.2578125,
      "learning_rate": 5.838931239972824e-05,
      "loss": 0.9022,
      "step": 1965
    },
    {
      "epoch": 6.729291204099061,
      "grad_norm": 0.251953125,
      "learning_rate": 5.784654683928391e-05,
      "loss": 0.9009,
      "step": 1970
    },
    {
      "epoch": 6.746370623398804,
      "grad_norm": 0.244140625,
      "learning_rate": 5.730528726470792e-05,
      "loss": 0.8999,
      "step": 1975
    },
    {
      "epoch": 6.763450042698548,
      "grad_norm": 0.2490234375,
      "learning_rate": 5.6765553013188766e-05,
      "loss": 0.9002,
      "step": 1980
    },
    {
      "epoch": 6.780529461998292,
      "grad_norm": 0.26171875,
      "learning_rate": 5.622736336742087e-05,
      "loss": 0.8965,
      "step": 1985
    },
    {
      "epoch": 6.797608881298036,
      "grad_norm": 0.314453125,
      "learning_rate": 5.5690737554915604e-05,
      "loss": 0.9015,
      "step": 1990
    },
    {
      "epoch": 6.81468830059778,
      "grad_norm": 0.244140625,
      "learning_rate": 5.5155694747314504e-05,
      "loss": 0.9105,
      "step": 1995
    },
    {
      "epoch": 6.831767719897524,
      "grad_norm": 0.244140625,
      "learning_rate": 5.462225405970401e-05,
      "loss": 0.8978,
      "step": 2000
    },
    {
      "epoch": 6.848847139197267,
      "grad_norm": 0.263671875,
      "learning_rate": 5.4090434549933064e-05,
      "loss": 0.8999,
      "step": 2005
    },
    {
      "epoch": 6.865926558497011,
      "grad_norm": 0.2734375,
      "learning_rate": 5.3560255217931785e-05,
      "loss": 0.8988,
      "step": 2010
    },
    {
      "epoch": 6.8830059777967545,
      "grad_norm": 0.2412109375,
      "learning_rate": 5.303173500503289e-05,
      "loss": 0.9055,
      "step": 2015
    },
    {
      "epoch": 6.900085397096499,
      "grad_norm": 0.248046875,
      "learning_rate": 5.2504892793295e-05,
      "loss": 0.8991,
      "step": 2020
    },
    {
      "epoch": 6.917164816396243,
      "grad_norm": 0.236328125,
      "learning_rate": 5.197974740482785e-05,
      "loss": 0.8997,
      "step": 2025
    },
    {
      "epoch": 6.934244235695987,
      "grad_norm": 0.2392578125,
      "learning_rate": 5.145631760112022e-05,
      "loss": 0.8983,
      "step": 2030
    },
    {
      "epoch": 6.95132365499573,
      "grad_norm": 0.232421875,
      "learning_rate": 5.093462208236931e-05,
      "loss": 0.9038,
      "step": 2035
    },
    {
      "epoch": 6.968403074295474,
      "grad_norm": 0.2451171875,
      "learning_rate": 5.041467948681269e-05,
      "loss": 0.8978,
      "step": 2040
    },
    {
      "epoch": 6.985482493595217,
      "grad_norm": 0.240234375,
      "learning_rate": 4.989650839006279e-05,
      "loss": 0.9054,
      "step": 2045
    },
    {
      "epoch": 6.999146029035013,
      "eval_loss": 2.5763192176818848,
      "eval_runtime": 0.5559,
      "eval_samples_per_second": 17.989,
      "eval_steps_per_second": 1.799,
      "step": 2049
    },
    {
      "epoch": 7.002561912894961,
      "grad_norm": 0.2431640625,
      "learning_rate": 4.9380127304442634e-05,
      "loss": 0.8953,
      "step": 2050
    },
    {
      "epoch": 7.0196413321947055,
      "grad_norm": 0.24609375,
      "learning_rate": 4.886555467832512e-05,
      "loss": 0.893,
      "step": 2055
    },
    {
      "epoch": 7.036720751494449,
      "grad_norm": 0.2451171875,
      "learning_rate": 4.835280889547351e-05,
      "loss": 0.8885,
      "step": 2060
    },
    {
      "epoch": 7.053800170794193,
      "grad_norm": 0.251953125,
      "learning_rate": 4.7841908274384616e-05,
      "loss": 0.8916,
      "step": 2065
    },
    {
      "epoch": 7.070879590093937,
      "grad_norm": 0.2421875,
      "learning_rate": 4.733287106763481e-05,
      "loss": 0.8906,
      "step": 2070
    },
    {
      "epoch": 7.08795900939368,
      "grad_norm": 0.2451171875,
      "learning_rate": 4.6825715461227284e-05,
      "loss": 0.8876,
      "step": 2075
    },
    {
      "epoch": 7.105038428693424,
      "grad_norm": 0.2470703125,
      "learning_rate": 4.6320459573942856e-05,
      "loss": 0.8908,
      "step": 2080
    },
    {
      "epoch": 7.122117847993168,
      "grad_norm": 0.240234375,
      "learning_rate": 4.581712145669239e-05,
      "loss": 0.8887,
      "step": 2085
    },
    {
      "epoch": 7.1391972672929125,
      "grad_norm": 0.2451171875,
      "learning_rate": 4.531571909187197e-05,
      "loss": 0.886,
      "step": 2090
    },
    {
      "epoch": 7.156276686592656,
      "grad_norm": 0.2431640625,
      "learning_rate": 4.481627039272056e-05,
      "loss": 0.8883,
      "step": 2095
    },
    {
      "epoch": 7.1733561058924,
      "grad_norm": 0.25390625,
      "learning_rate": 4.431879320267972e-05,
      "loss": 0.8922,
      "step": 2100
    },
    {
      "epoch": 7.190435525192143,
      "grad_norm": 0.244140625,
      "learning_rate": 4.38233052947565e-05,
      "loss": 0.8825,
      "step": 2105
    },
    {
      "epoch": 7.207514944491887,
      "grad_norm": 0.2353515625,
      "learning_rate": 4.332982437088825e-05,
      "loss": 0.8856,
      "step": 2110
    },
    {
      "epoch": 7.224594363791631,
      "grad_norm": 0.248046875,
      "learning_rate": 4.2838368061310276e-05,
      "loss": 0.8929,
      "step": 2115
    },
    {
      "epoch": 7.241673783091375,
      "grad_norm": 0.2431640625,
      "learning_rate": 4.2348953923925916e-05,
      "loss": 0.8977,
      "step": 2120
    },
    {
      "epoch": 7.2587532023911185,
      "grad_norm": 0.2392578125,
      "learning_rate": 4.186159944367936e-05,
      "loss": 0.8855,
      "step": 2125
    },
    {
      "epoch": 7.275832621690863,
      "grad_norm": 0.2421875,
      "learning_rate": 4.137632203193086e-05,
      "loss": 0.8837,
      "step": 2130
    },
    {
      "epoch": 7.292912040990606,
      "grad_norm": 0.251953125,
      "learning_rate": 4.0893139025834806e-05,
      "loss": 0.8927,
      "step": 2135
    },
    {
      "epoch": 7.30999146029035,
      "grad_norm": 0.25,
      "learning_rate": 4.041206768772022e-05,
      "loss": 0.8902,
      "step": 2140
    },
    {
      "epoch": 7.327070879590094,
      "grad_norm": 0.2451171875,
      "learning_rate": 3.993312520447414e-05,
      "loss": 0.8904,
      "step": 2145
    },
    {
      "epoch": 7.344150298889838,
      "grad_norm": 0.2451171875,
      "learning_rate": 3.9456328686927525e-05,
      "loss": 0.8885,
      "step": 2150
    },
    {
      "epoch": 7.361229718189581,
      "grad_norm": 0.2451171875,
      "learning_rate": 3.898169516924398e-05,
      "loss": 0.8945,
      "step": 2155
    },
    {
      "epoch": 7.3783091374893255,
      "grad_norm": 0.240234375,
      "learning_rate": 3.850924160831115e-05,
      "loss": 0.892,
      "step": 2160
    },
    {
      "epoch": 7.395388556789069,
      "grad_norm": 0.2451171875,
      "learning_rate": 3.803898488313501e-05,
      "loss": 0.8933,
      "step": 2165
    },
    {
      "epoch": 7.412467976088813,
      "grad_norm": 0.244140625,
      "learning_rate": 3.757094179423672e-05,
      "loss": 0.892,
      "step": 2170
    },
    {
      "epoch": 7.429547395388557,
      "grad_norm": 0.2392578125,
      "learning_rate": 3.710512906305248e-05,
      "loss": 0.8905,
      "step": 2175
    },
    {
      "epoch": 7.446626814688301,
      "grad_norm": 0.2373046875,
      "learning_rate": 3.6641563331336125e-05,
      "loss": 0.888,
      "step": 2180
    },
    {
      "epoch": 7.463706233988044,
      "grad_norm": 0.244140625,
      "learning_rate": 3.618026116056456e-05,
      "loss": 0.8847,
      "step": 2185
    },
    {
      "epoch": 7.480785653287788,
      "grad_norm": 0.26171875,
      "learning_rate": 3.5721239031346066e-05,
      "loss": 0.8867,
      "step": 2190
    },
    {
      "epoch": 7.497865072587532,
      "grad_norm": 0.251953125,
      "learning_rate": 3.5264513342831615e-05,
      "loss": 0.8894,
      "step": 2195
    },
    {
      "epoch": 7.514944491887276,
      "grad_norm": 0.2431640625,
      "learning_rate": 3.4810100412128747e-05,
      "loss": 0.8894,
      "step": 2200
    },
    {
      "epoch": 7.53202391118702,
      "grad_norm": 0.248046875,
      "learning_rate": 3.435801647371897e-05,
      "loss": 0.8922,
      "step": 2205
    },
    {
      "epoch": 7.549103330486764,
      "grad_norm": 0.25,
      "learning_rate": 3.3908277678877445e-05,
      "loss": 0.8934,
      "step": 2210
    },
    {
      "epoch": 7.566182749786507,
      "grad_norm": 0.240234375,
      "learning_rate": 3.346090009509613e-05,
      "loss": 0.8865,
      "step": 2215
    },
    {
      "epoch": 7.583262169086251,
      "grad_norm": 0.244140625,
      "learning_rate": 3.3015899705509734e-05,
      "loss": 0.8889,
      "step": 2220
    },
    {
      "epoch": 7.600341588385994,
      "grad_norm": 0.248046875,
      "learning_rate": 3.257329240832454e-05,
      "loss": 0.886,
      "step": 2225
    },
    {
      "epoch": 7.6174210076857385,
      "grad_norm": 0.244140625,
      "learning_rate": 3.21330940162508e-05,
      "loss": 0.8875,
      "step": 2230
    },
    {
      "epoch": 7.634500426985483,
      "grad_norm": 0.2392578125,
      "learning_rate": 3.169532025593729e-05,
      "loss": 0.8863,
      "step": 2235
    },
    {
      "epoch": 7.651579846285227,
      "grad_norm": 0.2490234375,
      "learning_rate": 3.125998676740987e-05,
      "loss": 0.8945,
      "step": 2240
    },
    {
      "epoch": 7.66865926558497,
      "grad_norm": 0.248046875,
      "learning_rate": 3.0827109103512643e-05,
      "loss": 0.888,
      "step": 2245
    },
    {
      "epoch": 7.685738684884714,
      "grad_norm": 0.2470703125,
      "learning_rate": 3.0396702729352023e-05,
      "loss": 0.895,
      "step": 2250
    },
    {
      "epoch": 7.702818104184458,
      "grad_norm": 0.2470703125,
      "learning_rate": 2.996878302174472e-05,
      "loss": 0.89,
      "step": 2255
    },
    {
      "epoch": 7.719897523484201,
      "grad_norm": 0.26953125,
      "learning_rate": 2.9543365268667867e-05,
      "loss": 0.8868,
      "step": 2260
    },
    {
      "epoch": 7.736976942783945,
      "grad_norm": 0.26171875,
      "learning_rate": 2.9120464668713188e-05,
      "loss": 0.8944,
      "step": 2265
    },
    {
      "epoch": 7.7540563620836895,
      "grad_norm": 0.2392578125,
      "learning_rate": 2.8700096330544012e-05,
      "loss": 0.8946,
      "step": 2270
    },
    {
      "epoch": 7.771135781383433,
      "grad_norm": 0.267578125,
      "learning_rate": 2.828227527235513e-05,
      "loss": 0.8926,
      "step": 2275
    },
    {
      "epoch": 7.788215200683177,
      "grad_norm": 0.2578125,
      "learning_rate": 2.7867016421336776e-05,
      "loss": 0.8984,
      "step": 2280
    },
    {
      "epoch": 7.805294619982921,
      "grad_norm": 0.26171875,
      "learning_rate": 2.7454334613140864e-05,
      "loss": 0.8874,
      "step": 2285
    },
    {
      "epoch": 7.822374039282664,
      "grad_norm": 0.2431640625,
      "learning_rate": 2.7044244591351232e-05,
      "loss": 0.892,
      "step": 2290
    },
    {
      "epoch": 7.839453458582408,
      "grad_norm": 0.248046875,
      "learning_rate": 2.6636761006956955e-05,
      "loss": 0.8936,
      "step": 2295
    },
    {
      "epoch": 7.856532877882152,
      "grad_norm": 0.24609375,
      "learning_rate": 2.6231898417828603e-05,
      "loss": 0.8856,
      "step": 2300
    },
    {
      "epoch": 7.873612297181896,
      "grad_norm": 0.25,
      "learning_rate": 2.582967128819851e-05,
      "loss": 0.8886,
      "step": 2305
    },
    {
      "epoch": 7.89069171648164,
      "grad_norm": 0.2353515625,
      "learning_rate": 2.5430093988143778e-05,
      "loss": 0.8891,
      "step": 2310
    },
    {
      "epoch": 7.907771135781384,
      "grad_norm": 0.2421875,
      "learning_rate": 2.5033180793072986e-05,
      "loss": 0.8808,
      "step": 2315
    },
    {
      "epoch": 7.924850555081127,
      "grad_norm": 0.2392578125,
      "learning_rate": 2.4638945883216235e-05,
      "loss": 0.8868,
      "step": 2320
    },
    {
      "epoch": 7.941929974380871,
      "grad_norm": 0.2470703125,
      "learning_rate": 2.4247403343118335e-05,
      "loss": 0.8934,
      "step": 2325
    },
    {
      "epoch": 7.959009393680615,
      "grad_norm": 0.2392578125,
      "learning_rate": 2.385856716113587e-05,
      "loss": 0.8878,
      "step": 2330
    },
    {
      "epoch": 7.976088812980358,
      "grad_norm": 0.2421875,
      "learning_rate": 2.3472451228937253e-05,
      "loss": 0.8913,
      "step": 2335
    },
    {
      "epoch": 7.9931682322801025,
      "grad_norm": 0.2373046875,
      "learning_rate": 2.3089069341006565e-05,
      "loss": 0.8902,
      "step": 2340
    },
    {
      "epoch": 8.0,
      "eval_loss": 2.592200994491577,
      "eval_runtime": 0.5427,
      "eval_samples_per_second": 18.425,
      "eval_steps_per_second": 1.843,
      "step": 2342
    },
    {
      "epoch": 8.010247651579846,
      "grad_norm": 0.236328125,
      "learning_rate": 2.2708435194150634e-05,
      "loss": 0.8945,
      "step": 2345
    },
    {
      "epoch": 8.02732707087959,
      "grad_norm": 0.2392578125,
      "learning_rate": 2.2330562387009745e-05,
      "loss": 0.8833,
      "step": 2350
    },
    {
      "epoch": 8.044406490179334,
      "grad_norm": 0.2412109375,
      "learning_rate": 2.1955464419571782e-05,
      "loss": 0.8823,
      "step": 2355
    },
    {
      "epoch": 8.061485909479078,
      "grad_norm": 0.2373046875,
      "learning_rate": 2.1583154692689976e-05,
      "loss": 0.8874,
      "step": 2360
    },
    {
      "epoch": 8.078565328778822,
      "grad_norm": 0.2470703125,
      "learning_rate": 2.121364650760408e-05,
      "loss": 0.8743,
      "step": 2365
    },
    {
      "epoch": 8.095644748078564,
      "grad_norm": 0.251953125,
      "learning_rate": 2.08469530654652e-05,
      "loss": 0.8872,
      "step": 2370
    },
    {
      "epoch": 8.112724167378309,
      "grad_norm": 0.2470703125,
      "learning_rate": 2.048308746686417e-05,
      "loss": 0.8936,
      "step": 2375
    },
    {
      "epoch": 8.129803586678053,
      "grad_norm": 0.236328125,
      "learning_rate": 2.0122062711363532e-05,
      "loss": 0.8818,
      "step": 2380
    },
    {
      "epoch": 8.146883005977797,
      "grad_norm": 0.25,
      "learning_rate": 1.9763891697032978e-05,
      "loss": 0.887,
      "step": 2385
    },
    {
      "epoch": 8.163962425277541,
      "grad_norm": 0.248046875,
      "learning_rate": 1.9408587219988805e-05,
      "loss": 0.884,
      "step": 2390
    },
    {
      "epoch": 8.181041844577285,
      "grad_norm": 0.2421875,
      "learning_rate": 1.9056161973936513e-05,
      "loss": 0.8892,
      "step": 2395
    },
    {
      "epoch": 8.198121263877027,
      "grad_norm": 0.240234375,
      "learning_rate": 1.8706628549717452e-05,
      "loss": 0.8883,
      "step": 2400
    },
    {
      "epoch": 8.215200683176771,
      "grad_norm": 0.26953125,
      "learning_rate": 1.835999943485892e-05,
      "loss": 0.8802,
      "step": 2405
    },
    {
      "epoch": 8.232280102476516,
      "grad_norm": 0.2451171875,
      "learning_rate": 1.8016287013128018e-05,
      "loss": 0.8886,
      "step": 2410
    },
    {
      "epoch": 8.24935952177626,
      "grad_norm": 0.2392578125,
      "learning_rate": 1.767550356408938e-05,
      "loss": 0.8784,
      "step": 2415
    },
    {
      "epoch": 8.266438941076004,
      "grad_norm": 0.24609375,
      "learning_rate": 1.7337661262666294e-05,
      "loss": 0.8897,
      "step": 2420
    },
    {
      "epoch": 8.283518360375748,
      "grad_norm": 0.2412109375,
      "learning_rate": 1.7002772178705716e-05,
      "loss": 0.8844,
      "step": 2425
    },
    {
      "epoch": 8.30059777967549,
      "grad_norm": 0.2451171875,
      "learning_rate": 1.6670848276547334e-05,
      "loss": 0.8856,
      "step": 2430
    },
    {
      "epoch": 8.317677198975234,
      "grad_norm": 0.2421875,
      "learning_rate": 1.6341901414595705e-05,
      "loss": 0.8762,
      "step": 2435
    },
    {
      "epoch": 8.334756618274978,
      "grad_norm": 0.2412109375,
      "learning_rate": 1.601594334489702e-05,
      "loss": 0.8802,
      "step": 2440
    },
    {
      "epoch": 8.351836037574722,
      "grad_norm": 0.2451171875,
      "learning_rate": 1.5692985712719e-05,
      "loss": 0.8939,
      "step": 2445
    },
    {
      "epoch": 8.368915456874467,
      "grad_norm": 0.23828125,
      "learning_rate": 1.5373040056134814e-05,
      "loss": 0.8804,
      "step": 2450
    },
    {
      "epoch": 8.38599487617421,
      "grad_norm": 0.240234375,
      "learning_rate": 1.5056117805611115e-05,
      "loss": 0.8806,
      "step": 2455
    },
    {
      "epoch": 8.403074295473953,
      "grad_norm": 0.24609375,
      "learning_rate": 1.474223028359939e-05,
      "loss": 0.8856,
      "step": 2460
    },
    {
      "epoch": 8.420153714773697,
      "grad_norm": 0.2470703125,
      "learning_rate": 1.4431388704131632e-05,
      "loss": 0.8791,
      "step": 2465
    },
    {
      "epoch": 8.437233134073441,
      "grad_norm": 0.236328125,
      "learning_rate": 1.4123604172419713e-05,
      "loss": 0.8874,
      "step": 2470
    },
    {
      "epoch": 8.454312553373185,
      "grad_norm": 0.2412109375,
      "learning_rate": 1.3818887684458426e-05,
      "loss": 0.8827,
      "step": 2475
    },
    {
      "epoch": 8.47139197267293,
      "grad_norm": 0.24609375,
      "learning_rate": 1.3517250126632986e-05,
      "loss": 0.8847,
      "step": 2480
    },
    {
      "epoch": 8.488471391972674,
      "grad_norm": 0.2470703125,
      "learning_rate": 1.321870227532971e-05,
      "loss": 0.8826,
      "step": 2485
    },
    {
      "epoch": 8.505550811272418,
      "grad_norm": 0.24609375,
      "learning_rate": 1.292325479655131e-05,
      "loss": 0.89,
      "step": 2490
    },
    {
      "epoch": 8.52263023057216,
      "grad_norm": 0.234375,
      "learning_rate": 1.263091824553574e-05,
      "loss": 0.8904,
      "step": 2495
    },
    {
      "epoch": 8.539709649871904,
      "grad_norm": 0.2392578125,
      "learning_rate": 1.2341703066379074e-05,
      "loss": 0.888,
      "step": 2500
    },
    {
      "epoch": 8.556789069171648,
      "grad_norm": 0.236328125,
      "learning_rate": 1.205561959166237e-05,
      "loss": 0.8841,
      "step": 2505
    },
    {
      "epoch": 8.573868488471392,
      "grad_norm": 0.2412109375,
      "learning_rate": 1.1772678042082607e-05,
      "loss": 0.8856,
      "step": 2510
    },
    {
      "epoch": 8.590947907771136,
      "grad_norm": 0.2431640625,
      "learning_rate": 1.149288852608743e-05,
      "loss": 0.8871,
      "step": 2515
    },
    {
      "epoch": 8.608027327070879,
      "grad_norm": 0.2412109375,
      "learning_rate": 1.1216261039514087e-05,
      "loss": 0.8817,
      "step": 2520
    },
    {
      "epoch": 8.625106746370623,
      "grad_norm": 0.244140625,
      "learning_rate": 1.094280546523231e-05,
      "loss": 0.8825,
      "step": 2525
    },
    {
      "epoch": 8.642186165670367,
      "grad_norm": 0.23828125,
      "learning_rate": 1.0672531572791178e-05,
      "loss": 0.8922,
      "step": 2530
    },
    {
      "epoch": 8.659265584970111,
      "grad_norm": 0.2421875,
      "learning_rate": 1.0405449018070168e-05,
      "loss": 0.8879,
      "step": 2535
    },
    {
      "epoch": 8.676345004269855,
      "grad_norm": 0.24609375,
      "learning_rate": 1.0141567342934132e-05,
      "loss": 0.885,
      "step": 2540
    },
    {
      "epoch": 8.6934244235696,
      "grad_norm": 0.25390625,
      "learning_rate": 9.880895974892412e-06,
      "loss": 0.8886,
      "step": 2545
    },
    {
      "epoch": 8.710503842869343,
      "grad_norm": 0.2431640625,
      "learning_rate": 9.623444226762035e-06,
      "loss": 0.8805,
      "step": 2550
    },
    {
      "epoch": 8.727583262169086,
      "grad_norm": 0.23828125,
      "learning_rate": 9.369221296335006e-06,
      "loss": 0.8866,
      "step": 2555
    },
    {
      "epoch": 8.74466268146883,
      "grad_norm": 0.2353515625,
      "learning_rate": 9.118236266049707e-06,
      "loss": 0.8811,
      "step": 2560
    },
    {
      "epoch": 8.761742100768574,
      "grad_norm": 0.248046875,
      "learning_rate": 8.870498102666402e-06,
      "loss": 0.8849,
      "step": 2565
    },
    {
      "epoch": 8.778821520068318,
      "grad_norm": 0.2353515625,
      "learning_rate": 8.626015656946895e-06,
      "loss": 0.8857,
      "step": 2570
    },
    {
      "epoch": 8.795900939368062,
      "grad_norm": 0.2373046875,
      "learning_rate": 8.384797663338306e-06,
      "loss": 0.8833,
      "step": 2575
    },
    {
      "epoch": 8.812980358667804,
      "grad_norm": 0.2392578125,
      "learning_rate": 8.146852739661105e-06,
      "loss": 0.885,
      "step": 2580
    },
    {
      "epoch": 8.830059777967548,
      "grad_norm": 0.23828125,
      "learning_rate": 7.91218938680104e-06,
      "loss": 0.8861,
      "step": 2585
    },
    {
      "epoch": 8.847139197267293,
      "grad_norm": 0.24609375,
      "learning_rate": 7.6808159884057e-06,
      "loss": 0.88,
      "step": 2590
    },
    {
      "epoch": 8.864218616567037,
      "grad_norm": 0.2451171875,
      "learning_rate": 7.45274081058478e-06,
      "loss": 0.8794,
      "step": 2595
    },
    {
      "epoch": 8.88129803586678,
      "grad_norm": 0.236328125,
      "learning_rate": 7.2279720016148244e-06,
      "loss": 0.8801,
      "step": 2600
    },
    {
      "epoch": 8.898377455166525,
      "grad_norm": 0.2373046875,
      "learning_rate": 7.0065175916482095e-06,
      "loss": 0.8818,
      "step": 2605
    },
    {
      "epoch": 8.915456874466269,
      "grad_norm": 0.23828125,
      "learning_rate": 6.788385492426053e-06,
      "loss": 0.8856,
      "step": 2610
    },
    {
      "epoch": 8.932536293766011,
      "grad_norm": 0.2412109375,
      "learning_rate": 6.573583496995816e-06,
      "loss": 0.8887,
      "step": 2615
    },
    {
      "epoch": 8.949615713065755,
      "grad_norm": 0.2412109375,
      "learning_rate": 6.36211927943271e-06,
      "loss": 0.8778,
      "step": 2620
    },
    {
      "epoch": 8.9666951323655,
      "grad_norm": 0.244140625,
      "learning_rate": 6.1540003945655286e-06,
      "loss": 0.8906,
      "step": 2625
    },
    {
      "epoch": 8.983774551665244,
      "grad_norm": 0.2421875,
      "learning_rate": 5.949234277706861e-06,
      "loss": 0.8818,
      "step": 2630
    },
    {
      "epoch": 8.997438087105039,
      "eval_loss": 2.5981767177581787,
      "eval_runtime": 0.5523,
      "eval_samples_per_second": 18.106,
      "eval_steps_per_second": 1.811,
      "step": 2634
    },
    {
      "epoch": 9.000853970964988,
      "grad_norm": 0.2431640625,
      "learning_rate": 5.74782824438731e-06,
      "loss": 0.8865,
      "step": 2635
    },
    {
      "epoch": 9.017933390264732,
      "grad_norm": 0.2392578125,
      "learning_rate": 5.549789490094304e-06,
      "loss": 0.8846,
      "step": 2640
    },
    {
      "epoch": 9.035012809564474,
      "grad_norm": 0.2373046875,
      "learning_rate": 5.355125090014845e-06,
      "loss": 0.8845,
      "step": 2645
    },
    {
      "epoch": 9.052092228864218,
      "grad_norm": 0.466796875,
      "learning_rate": 5.163841998782837e-06,
      "loss": 0.8852,
      "step": 2650
    },
    {
      "epoch": 9.069171648163962,
      "grad_norm": 0.23828125,
      "learning_rate": 4.975947050230712e-06,
      "loss": 0.8831,
      "step": 2655
    },
    {
      "epoch": 9.086251067463706,
      "grad_norm": 0.234375,
      "learning_rate": 4.79144695714504e-06,
      "loss": 0.8838,
      "step": 2660
    },
    {
      "epoch": 9.10333048676345,
      "grad_norm": 0.2392578125,
      "learning_rate": 4.610348311026958e-06,
      "loss": 0.8892,
      "step": 2665
    },
    {
      "epoch": 9.120409906063195,
      "grad_norm": 0.2373046875,
      "learning_rate": 4.432657581856525e-06,
      "loss": 0.882,
      "step": 2670
    },
    {
      "epoch": 9.137489325362937,
      "grad_norm": 0.2470703125,
      "learning_rate": 4.25838111786162e-06,
      "loss": 0.8839,
      "step": 2675
    },
    {
      "epoch": 9.154568744662681,
      "grad_norm": 0.2431640625,
      "learning_rate": 4.087525145291204e-06,
      "loss": 0.8854,
      "step": 2680
    },
    {
      "epoch": 9.171648163962425,
      "grad_norm": 0.244140625,
      "learning_rate": 3.920095768192722e-06,
      "loss": 0.8823,
      "step": 2685
    },
    {
      "epoch": 9.18872758326217,
      "grad_norm": 0.2353515625,
      "learning_rate": 3.7560989681941992e-06,
      "loss": 0.883,
      "step": 2690
    },
    {
      "epoch": 9.205807002561913,
      "grad_norm": 0.2353515625,
      "learning_rate": 3.595540604290437e-06,
      "loss": 0.8795,
      "step": 2695
    },
    {
      "epoch": 9.222886421861658,
      "grad_norm": 0.23828125,
      "learning_rate": 3.4384264126337328e-06,
      "loss": 0.8868,
      "step": 2700
    },
    {
      "epoch": 9.2399658411614,
      "grad_norm": 0.255859375,
      "learning_rate": 3.284762006328945e-06,
      "loss": 0.8884,
      "step": 2705
    },
    {
      "epoch": 9.257045260461144,
      "grad_norm": 0.267578125,
      "learning_rate": 3.1345528752329212e-06,
      "loss": 0.8819,
      "step": 2710
    },
    {
      "epoch": 9.274124679760888,
      "grad_norm": 0.23828125,
      "learning_rate": 2.9878043857584415e-06,
      "loss": 0.8893,
      "step": 2715
    },
    {
      "epoch": 9.291204099060632,
      "grad_norm": 0.23828125,
      "learning_rate": 2.8445217806824077e-06,
      "loss": 0.8805,
      "step": 2720
    },
    {
      "epoch": 9.308283518360376,
      "grad_norm": 0.2373046875,
      "learning_rate": 2.704710178958603e-06,
      "loss": 0.8796,
      "step": 2725
    },
    {
      "epoch": 9.32536293766012,
      "grad_norm": 0.2392578125,
      "learning_rate": 2.5683745755348044e-06,
      "loss": 0.8853,
      "step": 2730
    },
    {
      "epoch": 9.342442356959863,
      "grad_norm": 0.2373046875,
      "learning_rate": 2.435519841174272e-06,
      "loss": 0.8844,
      "step": 2735
    },
    {
      "epoch": 9.359521776259607,
      "grad_norm": 0.255859375,
      "learning_rate": 2.30615072228183e-06,
      "loss": 0.8838,
      "step": 2740
    },
    {
      "epoch": 9.376601195559351,
      "grad_norm": 0.24609375,
      "learning_rate": 2.180271840734216e-06,
      "loss": 0.8895,
      "step": 2745
    },
    {
      "epoch": 9.393680614859095,
      "grad_norm": 0.24609375,
      "learning_rate": 2.057887693714988e-06,
      "loss": 0.876,
      "step": 2750
    },
    {
      "epoch": 9.410760034158839,
      "grad_norm": 0.2373046875,
      "learning_rate": 1.9390026535538674e-06,
      "loss": 0.8831,
      "step": 2755
    },
    {
      "epoch": 9.427839453458583,
      "grad_norm": 0.2451171875,
      "learning_rate": 1.8236209675705274e-06,
      "loss": 0.8851,
      "step": 2760
    },
    {
      "epoch": 9.444918872758326,
      "grad_norm": 0.236328125,
      "learning_rate": 1.7117467579228053e-06,
      "loss": 0.876,
      "step": 2765
    },
    {
      "epoch": 9.46199829205807,
      "grad_norm": 0.2421875,
      "learning_rate": 1.6033840214595308e-06,
      "loss": 0.879,
      "step": 2770
    },
    {
      "epoch": 9.479077711357814,
      "grad_norm": 0.240234375,
      "learning_rate": 1.4985366295776404e-06,
      "loss": 0.8899,
      "step": 2775
    },
    {
      "epoch": 9.496157130657558,
      "grad_norm": 0.232421875,
      "learning_rate": 1.397208328083921e-06,
      "loss": 0.8836,
      "step": 2780
    },
    {
      "epoch": 9.513236549957302,
      "grad_norm": 0.2373046875,
      "learning_rate": 1.2994027370611173e-06,
      "loss": 0.8905,
      "step": 2785
    },
    {
      "epoch": 9.530315969257046,
      "grad_norm": 0.2392578125,
      "learning_rate": 1.205123350738746e-06,
      "loss": 0.8875,
      "step": 2790
    },
    {
      "epoch": 9.547395388556788,
      "grad_norm": 0.2470703125,
      "learning_rate": 1.114373537368063e-06,
      "loss": 0.8838,
      "step": 2795
    },
    {
      "epoch": 9.564474807856532,
      "grad_norm": 0.234375,
      "learning_rate": 1.0271565391018922e-06,
      "loss": 0.8807,
      "step": 2800
    },
    {
      "epoch": 9.581554227156277,
      "grad_norm": 0.2470703125,
      "learning_rate": 9.434754718787409e-07,
      "loss": 0.875,
      "step": 2805
    },
    {
      "epoch": 9.59863364645602,
      "grad_norm": 0.25390625,
      "learning_rate": 8.633333253113995e-07,
      "loss": 0.8845,
      "step": 2810
    },
    {
      "epoch": 9.615713065755765,
      "grad_norm": 0.23828125,
      "learning_rate": 7.867329625802833e-07,
      "loss": 0.88,
      "step": 2815
    },
    {
      "epoch": 9.632792485055509,
      "grad_norm": 0.236328125,
      "learning_rate": 7.136771203310245e-07,
      "loss": 0.8794,
      "step": 2820
    },
    {
      "epoch": 9.649871904355251,
      "grad_norm": 0.2373046875,
      "learning_rate": 6.441684085767396e-07,
      "loss": 0.8894,
      "step": 2825
    },
    {
      "epoch": 9.666951323654995,
      "grad_norm": 0.2431640625,
      "learning_rate": 5.782093106048159e-07,
      "loss": 0.8803,
      "step": 2830
    },
    {
      "epoch": 9.68403074295474,
      "grad_norm": 0.251953125,
      "learning_rate": 5.158021828881032e-07,
      "loss": 0.8844,
      "step": 2835
    },
    {
      "epoch": 9.701110162254484,
      "grad_norm": 0.2373046875,
      "learning_rate": 4.569492550008603e-07,
      "loss": 0.8835,
      "step": 2840
    },
    {
      "epoch": 9.718189581554228,
      "grad_norm": 0.2373046875,
      "learning_rate": 4.016526295389622e-07,
      "loss": 0.8832,
      "step": 2845
    },
    {
      "epoch": 9.735269000853972,
      "grad_norm": 0.2421875,
      "learning_rate": 3.49914282044872e-07,
      "loss": 0.879,
      "step": 2850
    },
    {
      "epoch": 9.752348420153714,
      "grad_norm": 0.23828125,
      "learning_rate": 3.017360609370301e-07,
      "loss": 0.8805,
      "step": 2855
    },
    {
      "epoch": 9.769427839453458,
      "grad_norm": 0.2373046875,
      "learning_rate": 2.5711968744382974e-07,
      "loss": 0.8853,
      "step": 2860
    },
    {
      "epoch": 9.786507258753202,
      "grad_norm": 0.2412109375,
      "learning_rate": 2.1606675554209922e-07,
      "loss": 0.8901,
      "step": 2865
    },
    {
      "epoch": 9.803586678052946,
      "grad_norm": 0.2392578125,
      "learning_rate": 1.7857873190019192e-07,
      "loss": 0.8816,
      "step": 2870
    },
    {
      "epoch": 9.82066609735269,
      "grad_norm": 0.2353515625,
      "learning_rate": 1.446569558255395e-07,
      "loss": 0.8823,
      "step": 2875
    },
    {
      "epoch": 9.837745516652435,
      "grad_norm": 0.23828125,
      "learning_rate": 1.143026392168789e-07,
      "loss": 0.8837,
      "step": 2880
    },
    {
      "epoch": 9.854824935952177,
      "grad_norm": 0.2333984375,
      "learning_rate": 8.751686652084256e-08,
      "loss": 0.8835,
      "step": 2885
    },
    {
      "epoch": 9.871904355251921,
      "grad_norm": 0.2392578125,
      "learning_rate": 6.430059469334504e-08,
      "loss": 0.8839,
      "step": 2890
    },
    {
      "epoch": 9.888983774551665,
      "grad_norm": 0.2373046875,
      "learning_rate": 4.465465316529915e-08,
      "loss": 0.8863,
      "step": 2895
    },
    {
      "epoch": 9.90606319385141,
      "grad_norm": 0.2412109375,
      "learning_rate": 2.8579743813006432e-08,
      "loss": 0.8822,
      "step": 2900
    },
    {
      "epoch": 9.923142613151153,
      "grad_norm": 0.244140625,
      "learning_rate": 1.6076440933099345e-08,
      "loss": 0.8817,
      "step": 2905
    },
    {
      "epoch": 9.940222032450897,
      "grad_norm": 0.23828125,
      "learning_rate": 7.145191222035497e-09,
      "loss": 0.8827,
      "step": 2910
    },
    {
      "epoch": 9.95730145175064,
      "grad_norm": 0.2412109375,
      "learning_rate": 1.7863137600993008e-09,
      "loss": 0.881,
      "step": 2915
    },
    {
      "epoch": 9.974380871050384,
      "grad_norm": 0.2421875,
      "learning_rate": 0.0,
      "loss": 0.8852,
      "step": 2920
    },
    {
      "epoch": 9.974380871050384,
      "eval_loss": 2.5989506244659424,
      "eval_runtime": 0.5586,
      "eval_samples_per_second": 17.903,
      "eval_steps_per_second": 1.79,
      "step": 2920
    },
    {
      "epoch": 9.974380871050384,
      "step": 2920,
      "total_flos": 3.4809256003093135e+18,
      "train_loss": 0.9919237802289936,
      "train_runtime": 34991.5416,
      "train_samples_per_second": 8.027,
      "train_steps_per_second": 0.083
    }
  ],
  "logging_steps": 5,
  "max_steps": 2920,
  "num_input_tokens_seen": 0,
  "num_train_epochs": 10,
  "save_steps": 100,
  "stateful_callbacks": {
    "TrainerControl": {
      "args": {
        "should_epoch_stop": false,
        "should_evaluate": false,
        "should_log": false,
        "should_save": true,
        "should_training_stop": false
      },
      "attributes": {}
    }
  },
  "total_flos": 3.4809256003093135e+18,
  "train_batch_size": 8,
  "trial_name": null,
  "trial_params": null
}