diff --git "a/trainer_state.json" "b/trainer_state.json"
new file mode 100644--- /dev/null
+++ "b/trainer_state.json"
@@ -0,0 +1,6704 @@
+{
+  "best_metric": 0.5016890168190002,
+  "best_model_checkpoint": "data/hansken_human_hql_v3/checkpoint-2345",
+  "epoch": 10.0,
+  "eval_steps": 500,
+  "global_step": 4690,
+  "is_hyper_param_search": false,
+  "is_local_process_zero": true,
+  "is_world_process_zero": true,
+  "log_history": [
+    {
+      "epoch": 0.0021321961620469083,
+      "grad_norm": 1.0516366958618164,
+      "learning_rate": 1.4214641080312722e-07,
+      "loss": 1.9389,
+      "step": 1
+    },
+    {
+      "epoch": 0.010660980810234541,
+      "grad_norm": 0.9856139421463013,
+      "learning_rate": 7.107320540156362e-07,
+      "loss": 2.0398,
+      "step": 5
+    },
+    {
+      "epoch": 0.021321961620469083,
+      "grad_norm": 1.0568891763687134,
+      "learning_rate": 1.4214641080312723e-06,
+      "loss": 2.0618,
+      "step": 10
+    },
+    {
+      "epoch": 0.031982942430703626,
+      "grad_norm": 0.9998515844345093,
+      "learning_rate": 2.132196162046908e-06,
+      "loss": 2.0543,
+      "step": 15
+    },
+    {
+      "epoch": 0.042643923240938165,
+      "grad_norm": 1.004911184310913,
+      "learning_rate": 2.8429282160625447e-06,
+      "loss": 1.9997,
+      "step": 20
+    },
+    {
+      "epoch": 0.053304904051172705,
+      "grad_norm": 0.9931671619415283,
+      "learning_rate": 3.553660270078181e-06,
+      "loss": 1.9913,
+      "step": 25
+    },
+    {
+      "epoch": 0.06396588486140725,
+      "grad_norm": 0.9859012365341187,
+      "learning_rate": 4.264392324093816e-06,
+      "loss": 1.9729,
+      "step": 30
+    },
+    {
+      "epoch": 0.07462686567164178,
+      "grad_norm": 1.0391347408294678,
+      "learning_rate": 4.975124378109453e-06,
+      "loss": 1.9434,
+      "step": 35
+    },
+    {
+      "epoch": 0.08528784648187633,
+      "grad_norm": 0.8275197744369507,
+      "learning_rate": 5.685856432125089e-06,
+      "loss": 1.9092,
+      "step": 40
+    },
+    {
+      "epoch": 0.09594882729211088,
+      "grad_norm": 0.7102633714675903,
+      "learning_rate": 6.396588486140726e-06,
+      "loss": 1.8488,
+      "step": 45
+    },
+    {
+      "epoch": 0.10660980810234541,
+      "grad_norm": 0.6521381735801697,
+      "learning_rate": 7.107320540156362e-06,
+      "loss": 1.8673,
+      "step": 50
+    },
+    {
+      "epoch": 0.11727078891257996,
+      "grad_norm": 0.5477872490882874,
+      "learning_rate": 7.818052594171997e-06,
+      "loss": 1.7758,
+      "step": 55
+    },
+    {
+      "epoch": 0.1279317697228145,
+      "grad_norm": 0.49889788031578064,
+      "learning_rate": 8.528784648187633e-06,
+      "loss": 1.7453,
+      "step": 60
+    },
+    {
+      "epoch": 0.13859275053304904,
+      "grad_norm": 0.5726047158241272,
+      "learning_rate": 9.23951670220327e-06,
+      "loss": 1.7635,
+      "step": 65
+    },
+    {
+      "epoch": 0.14925373134328357,
+      "grad_norm": 0.4760012924671173,
+      "learning_rate": 9.950248756218906e-06,
+      "loss": 1.7027,
+      "step": 70
+    },
+    {
+      "epoch": 0.15991471215351813,
+      "grad_norm": 0.4642033278942108,
+      "learning_rate": 1.0660980810234541e-05,
+      "loss": 1.7086,
+      "step": 75
+    },
+    {
+      "epoch": 0.17057569296375266,
+      "grad_norm": 0.42560943961143494,
+      "learning_rate": 1.1371712864250179e-05,
+      "loss": 1.638,
+      "step": 80
+    },
+    {
+      "epoch": 0.1812366737739872,
+      "grad_norm": 0.4680778384208679,
+      "learning_rate": 1.2082444918265814e-05,
+      "loss": 1.6029,
+      "step": 85
+    },
+    {
+      "epoch": 0.19189765458422176,
+      "grad_norm": 0.4264519214630127,
+      "learning_rate": 1.2793176972281452e-05,
+      "loss": 1.4899,
+      "step": 90
+    },
+    {
+      "epoch": 0.2025586353944563,
+      "grad_norm": 0.41101664304733276,
+      "learning_rate": 1.3503909026297087e-05,
+      "loss": 1.4997,
+      "step": 95
+    },
+    {
+      "epoch": 0.21321961620469082,
+      "grad_norm": 0.34257784485816956,
+      "learning_rate": 1.4214641080312725e-05,
+      "loss": 1.4734,
+      "step": 100
+    },
+    {
+      "epoch": 0.22388059701492538,
+      "grad_norm": 0.34164702892303467,
+      "learning_rate": 1.4925373134328357e-05,
+      "loss": 1.4341,
+      "step": 105
+    },
+    {
+      "epoch": 0.2345415778251599,
+      "grad_norm": 0.3285938501358032,
+      "learning_rate": 1.5636105188343994e-05,
+      "loss": 1.4293,
+      "step": 110
+    },
+    {
+      "epoch": 0.24520255863539445,
+      "grad_norm": 0.33409905433654785,
+      "learning_rate": 1.634683724235963e-05,
+      "loss": 1.3792,
+      "step": 115
+    },
+    {
+      "epoch": 0.255863539445629,
+      "grad_norm": 0.3385579288005829,
+      "learning_rate": 1.7057569296375266e-05,
+      "loss": 1.3811,
+      "step": 120
+    },
+    {
+      "epoch": 0.26652452025586354,
+      "grad_norm": 0.35849225521087646,
+      "learning_rate": 1.7768301350390903e-05,
+      "loss": 1.3217,
+      "step": 125
+    },
+    {
+      "epoch": 0.2771855010660981,
+      "grad_norm": 0.3905642330646515,
+      "learning_rate": 1.847903340440654e-05,
+      "loss": 1.2792,
+      "step": 130
+    },
+    {
+      "epoch": 0.2878464818763326,
+      "grad_norm": 0.45816823840141296,
+      "learning_rate": 1.9189765458422178e-05,
+      "loss": 1.268,
+      "step": 135
+    },
+    {
+      "epoch": 0.29850746268656714,
+      "grad_norm": 0.42841047048568726,
+      "learning_rate": 1.990049751243781e-05,
+      "loss": 1.1999,
+      "step": 140
+    },
+    {
+      "epoch": 0.3091684434968017,
+      "grad_norm": 0.42461100220680237,
+      "learning_rate": 2.061122956645345e-05,
+      "loss": 1.1908,
+      "step": 145
+    },
+    {
+      "epoch": 0.31982942430703626,
+      "grad_norm": 0.3846851885318756,
+      "learning_rate": 2.1321961620469083e-05,
+      "loss": 1.0417,
+      "step": 150
+    },
+    {
+      "epoch": 0.3304904051172708,
+      "grad_norm": 0.35793304443359375,
+      "learning_rate": 2.203269367448472e-05,
+      "loss": 1.0804,
+      "step": 155
+    },
+    {
+      "epoch": 0.3411513859275053,
+      "grad_norm": 0.3422033488750458,
+      "learning_rate": 2.2743425728500358e-05,
+      "loss": 1.0433,
+      "step": 160
+    },
+    {
+      "epoch": 0.35181236673773986,
+      "grad_norm": 0.34404265880584717,
+      "learning_rate": 2.345415778251599e-05,
+      "loss": 1.0823,
+      "step": 165
+    },
+    {
+      "epoch": 0.3624733475479744,
+      "grad_norm": 0.31916388869285583,
+      "learning_rate": 2.416488983653163e-05,
+      "loss": 1.001,
+      "step": 170
+    },
+    {
+      "epoch": 0.373134328358209,
+      "grad_norm": 0.33065563440322876,
+      "learning_rate": 2.4875621890547266e-05,
+      "loss": 0.9698,
+      "step": 175
+    },
+    {
+      "epoch": 0.3837953091684435,
+      "grad_norm": 0.34518882632255554,
+      "learning_rate": 2.5586353944562904e-05,
+      "loss": 0.9731,
+      "step": 180
+    },
+    {
+      "epoch": 0.39445628997867804,
+      "grad_norm": 0.31844091415405273,
+      "learning_rate": 2.6297085998578534e-05,
+      "loss": 0.9293,
+      "step": 185
+    },
+    {
+      "epoch": 0.4051172707889126,
+      "grad_norm": 0.32537004351615906,
+      "learning_rate": 2.7007818052594175e-05,
+      "loss": 0.9306,
+      "step": 190
+    },
+    {
+      "epoch": 0.4157782515991471,
+      "grad_norm": 0.38439956307411194,
+      "learning_rate": 2.771855010660981e-05,
+      "loss": 0.8915,
+      "step": 195
+    },
+    {
+      "epoch": 0.42643923240938164,
+      "grad_norm": 0.3455168306827545,
+      "learning_rate": 2.842928216062545e-05,
+      "loss": 0.903,
+      "step": 200
+    },
+    {
+      "epoch": 0.43710021321961623,
+      "grad_norm": 0.36652979254722595,
+      "learning_rate": 2.914001421464108e-05,
+      "loss": 0.8468,
+      "step": 205
+    },
+    {
+      "epoch": 0.44776119402985076,
+      "grad_norm": 0.35580819845199585,
+      "learning_rate": 2.9850746268656714e-05,
+      "loss": 0.8467,
+      "step": 210
+    },
+    {
+      "epoch": 0.4584221748400853,
+      "grad_norm": 0.3748577833175659,
+      "learning_rate": 3.056147832267235e-05,
+      "loss": 0.8037,
+      "step": 215
+    },
+    {
+      "epoch": 0.4690831556503198,
+      "grad_norm": 0.3399907052516937,
+      "learning_rate": 3.127221037668799e-05,
+      "loss": 0.8525,
+      "step": 220
+    },
+    {
+      "epoch": 0.47974413646055436,
+      "grad_norm": 0.39041897654533386,
+      "learning_rate": 3.1982942430703626e-05,
+      "loss": 0.8672,
+      "step": 225
+    },
+    {
+      "epoch": 0.4904051172707889,
+      "grad_norm": 0.37930938601493835,
+      "learning_rate": 3.269367448471926e-05,
+      "loss": 0.7967,
+      "step": 230
+    },
+    {
+      "epoch": 0.5010660980810234,
+      "grad_norm": 0.4009639024734497,
+      "learning_rate": 3.34044065387349e-05,
+      "loss": 0.8134,
+      "step": 235
+    },
+    {
+      "epoch": 0.511727078891258,
+      "grad_norm": 0.4189032018184662,
+      "learning_rate": 3.411513859275053e-05,
+      "loss": 0.791,
+      "step": 240
+    },
+    {
+      "epoch": 0.5223880597014925,
+      "grad_norm": 0.3848344385623932,
+      "learning_rate": 3.4825870646766175e-05,
+      "loss": 0.8183,
+      "step": 245
+    },
+    {
+      "epoch": 0.5330490405117271,
+      "grad_norm": 0.41223597526550293,
+      "learning_rate": 3.5536602700781806e-05,
+      "loss": 0.7668,
+      "step": 250
+    },
+    {
+      "epoch": 0.5437100213219617,
+      "grad_norm": 0.4024832844734192,
+      "learning_rate": 3.624733475479744e-05,
+      "loss": 0.7819,
+      "step": 255
+    },
+    {
+      "epoch": 0.5543710021321961,
+      "grad_norm": 0.3832787871360779,
+      "learning_rate": 3.695806680881308e-05,
+      "loss": 0.7693,
+      "step": 260
+    },
+    {
+      "epoch": 0.5650319829424307,
+      "grad_norm": 0.4266470670700073,
+      "learning_rate": 3.766879886282871e-05,
+      "loss": 0.795,
+      "step": 265
+    },
+    {
+      "epoch": 0.5756929637526652,
+      "grad_norm": 0.47055262327194214,
+      "learning_rate": 3.8379530916844355e-05,
+      "loss": 0.7752,
+      "step": 270
+    },
+    {
+      "epoch": 0.5863539445628998,
+      "grad_norm": 0.420669823884964,
+      "learning_rate": 3.9090262970859986e-05,
+      "loss": 0.7691,
+      "step": 275
+    },
+    {
+      "epoch": 0.5970149253731343,
+      "grad_norm": 0.4140627384185791,
+      "learning_rate": 3.980099502487562e-05,
+      "loss": 0.7385,
+      "step": 280
+    },
+    {
+      "epoch": 0.6076759061833689,
+      "grad_norm": 0.4674805998802185,
+      "learning_rate": 4.051172707889126e-05,
+      "loss": 0.7668,
+      "step": 285
+    },
+    {
+      "epoch": 0.6183368869936035,
+      "grad_norm": 0.45881038904190063,
+      "learning_rate": 4.12224591329069e-05,
+      "loss": 0.7777,
+      "step": 290
+    },
+    {
+      "epoch": 0.6289978678038379,
+      "grad_norm": 0.4218686819076538,
+      "learning_rate": 4.1933191186922535e-05,
+      "loss": 0.7106,
+      "step": 295
+    },
+    {
+      "epoch": 0.6396588486140725,
+      "grad_norm": 0.43359580636024475,
+      "learning_rate": 4.2643923240938166e-05,
+      "loss": 0.7076,
+      "step": 300
+    },
+    {
+      "epoch": 0.650319829424307,
+      "grad_norm": 0.42106226086616516,
+      "learning_rate": 4.33546552949538e-05,
+      "loss": 0.7353,
+      "step": 305
+    },
+    {
+      "epoch": 0.6609808102345416,
+      "grad_norm": 0.4189695715904236,
+      "learning_rate": 4.406538734896944e-05,
+      "loss": 0.698,
+      "step": 310
+    },
+    {
+      "epoch": 0.6716417910447762,
+      "grad_norm": 0.45314905047416687,
+      "learning_rate": 4.477611940298508e-05,
+      "loss": 0.7356,
+      "step": 315
+    },
+    {
+      "epoch": 0.6823027718550106,
+      "grad_norm": 0.46034571528434753,
+      "learning_rate": 4.5486851457000715e-05,
+      "loss": 0.7397,
+      "step": 320
+    },
+    {
+      "epoch": 0.6929637526652452,
+      "grad_norm": 0.44907087087631226,
+      "learning_rate": 4.619758351101635e-05,
+      "loss": 0.7326,
+      "step": 325
+    },
+    {
+      "epoch": 0.7036247334754797,
+      "grad_norm": 0.46258679032325745,
+      "learning_rate": 4.690831556503198e-05,
+      "loss": 0.6663,
+      "step": 330
+    },
+    {
+      "epoch": 0.7142857142857143,
+      "grad_norm": 0.446308434009552,
+      "learning_rate": 4.761904761904762e-05,
+      "loss": 0.6941,
+      "step": 335
+    },
+    {
+      "epoch": 0.7249466950959488,
+      "grad_norm": 0.40378594398498535,
+      "learning_rate": 4.832977967306326e-05,
+      "loss": 0.7174,
+      "step": 340
+    },
+    {
+      "epoch": 0.7356076759061834,
+      "grad_norm": 0.39717379212379456,
+      "learning_rate": 4.904051172707889e-05,
+      "loss": 0.659,
+      "step": 345
+    },
+    {
+      "epoch": 0.746268656716418,
+      "grad_norm": 0.4855833053588867,
+      "learning_rate": 4.975124378109453e-05,
+      "loss": 0.6762,
+      "step": 350
+    },
+    {
+      "epoch": 0.7569296375266524,
+      "grad_norm": 0.47973328828811646,
+      "learning_rate": 5.046197583511016e-05,
+      "loss": 0.6782,
+      "step": 355
+    },
+    {
+      "epoch": 0.767590618336887,
+      "grad_norm": 0.4429256319999695,
+      "learning_rate": 5.117270788912581e-05,
+      "loss": 0.6634,
+      "step": 360
+    },
+    {
+      "epoch": 0.7782515991471215,
+      "grad_norm": 0.44692516326904297,
+      "learning_rate": 5.1883439943141444e-05,
+      "loss": 0.6792,
+      "step": 365
+    },
+    {
+      "epoch": 0.7889125799573561,
+      "grad_norm": 0.4430787265300751,
+      "learning_rate": 5.259417199715707e-05,
+      "loss": 0.6416,
+      "step": 370
+    },
+    {
+      "epoch": 0.7995735607675906,
+      "grad_norm": 0.4461454451084137,
+      "learning_rate": 5.330490405117271e-05,
+      "loss": 0.7013,
+      "step": 375
+    },
+    {
+      "epoch": 0.8102345415778252,
+      "grad_norm": 0.526995837688446,
+      "learning_rate": 5.401563610518835e-05,
+      "loss": 0.6396,
+      "step": 380
+    },
+    {
+      "epoch": 0.8208955223880597,
+      "grad_norm": 0.4485580623149872,
+      "learning_rate": 5.472636815920398e-05,
+      "loss": 0.6307,
+      "step": 385
+    },
+    {
+      "epoch": 0.8315565031982942,
+      "grad_norm": 0.45416155457496643,
+      "learning_rate": 5.543710021321962e-05,
+      "loss": 0.6361,
+      "step": 390
+    },
+    {
+      "epoch": 0.8422174840085288,
+      "grad_norm": 0.4746207296848297,
+      "learning_rate": 5.6147832267235255e-05,
+      "loss": 0.641,
+      "step": 395
+    },
+    {
+      "epoch": 0.8528784648187633,
+      "grad_norm": 0.4466172456741333,
+      "learning_rate": 5.68585643212509e-05,
+      "loss": 0.643,
+      "step": 400
+    },
+    {
+      "epoch": 0.8635394456289979,
+      "grad_norm": 0.46807265281677246,
+      "learning_rate": 5.756929637526652e-05,
+      "loss": 0.6258,
+      "step": 405
+    },
+    {
+      "epoch": 0.8742004264392325,
+      "grad_norm": 0.46169164776802063,
+      "learning_rate": 5.828002842928216e-05,
+      "loss": 0.6212,
+      "step": 410
+    },
+    {
+      "epoch": 0.8848614072494669,
+      "grad_norm": 0.47564077377319336,
+      "learning_rate": 5.8990760483297804e-05,
+      "loss": 0.6369,
+      "step": 415
+    },
+    {
+      "epoch": 0.8955223880597015,
+      "grad_norm": 0.4582447409629822,
+      "learning_rate": 5.970149253731343e-05,
+      "loss": 0.6086,
+      "step": 420
+    },
+    {
+      "epoch": 0.906183368869936,
+      "grad_norm": 0.5161389708518982,
+      "learning_rate": 6.041222459132907e-05,
+      "loss": 0.6529,
+      "step": 425
+    },
+    {
+      "epoch": 0.9168443496801706,
+      "grad_norm": 0.47045719623565674,
+      "learning_rate": 6.11229566453447e-05,
+      "loss": 0.6119,
+      "step": 430
+    },
+    {
+      "epoch": 0.9275053304904051,
+      "grad_norm": 0.5950572490692139,
+      "learning_rate": 6.183368869936035e-05,
+      "loss": 0.6259,
+      "step": 435
+    },
+    {
+      "epoch": 0.9381663113006397,
+      "grad_norm": 0.5470284223556519,
+      "learning_rate": 6.254442075337598e-05,
+      "loss": 0.6282,
+      "step": 440
+    },
+    {
+      "epoch": 0.9488272921108742,
+      "grad_norm": 0.5164011716842651,
+      "learning_rate": 6.325515280739162e-05,
+      "loss": 0.6399,
+      "step": 445
+    },
+    {
+      "epoch": 0.9594882729211087,
+      "grad_norm": 0.4264001250267029,
+      "learning_rate": 6.396588486140725e-05,
+      "loss": 0.6405,
+      "step": 450
+    },
+    {
+      "epoch": 0.9701492537313433,
+      "grad_norm": 0.4878412187099457,
+      "learning_rate": 6.46766169154229e-05,
+      "loss": 0.6548,
+      "step": 455
+    },
+    {
+      "epoch": 0.9808102345415778,
+      "grad_norm": 0.47677186131477356,
+      "learning_rate": 6.538734896943853e-05,
+      "loss": 0.6506,
+      "step": 460
+    },
+    {
+      "epoch": 0.9914712153518124,
+      "grad_norm": 0.4687974452972412,
+      "learning_rate": 6.609808102345416e-05,
+      "loss": 0.6267,
+      "step": 465
+    },
+    {
+      "epoch": 1.0,
+      "eval_loss": 0.6078405976295471,
+      "eval_runtime": 377.5565,
+      "eval_samples_per_second": 1.091,
+      "eval_steps_per_second": 1.091,
+      "step": 469
+    },
+    {
+      "epoch": 1.0021321961620469,
+      "grad_norm": 0.4401796758174896,
+      "learning_rate": 6.68088130774698e-05,
+      "loss": 0.5968,
+      "step": 470
+    },
+    {
+      "epoch": 1.0127931769722816,
+      "grad_norm": 0.8371634483337402,
+      "learning_rate": 6.751954513148543e-05,
+      "loss": 0.5923,
+      "step": 475
+    },
+    {
+      "epoch": 1.023454157782516,
+      "grad_norm": 0.49846479296684265,
+      "learning_rate": 6.823027718550106e-05,
+      "loss": 0.6835,
+      "step": 480
+    },
+    {
+      "epoch": 1.0341151385927505,
+      "grad_norm": 0.5845323801040649,
+      "learning_rate": 6.89410092395167e-05,
+      "loss": 0.5906,
+      "step": 485
+    },
+    {
+      "epoch": 1.044776119402985,
+      "grad_norm": 0.5639384984970093,
+      "learning_rate": 6.965174129353235e-05,
+      "loss": 0.5881,
+      "step": 490
+    },
+    {
+      "epoch": 1.0554371002132197,
+      "grad_norm": 0.5082396268844604,
+      "learning_rate": 7.036247334754798e-05,
+      "loss": 0.6224,
+      "step": 495
+    },
+    {
+      "epoch": 1.0660980810234542,
+      "grad_norm": 0.5611528158187866,
+      "learning_rate": 7.107320540156361e-05,
+      "loss": 0.5643,
+      "step": 500
+    },
+    {
+      "epoch": 1.0767590618336886,
+      "grad_norm": 0.7102047801017761,
+      "learning_rate": 7.178393745557926e-05,
+      "loss": 0.5814,
+      "step": 505
+    },
+    {
+      "epoch": 1.0874200426439233,
+      "grad_norm": 0.46847936511039734,
+      "learning_rate": 7.249466950959489e-05,
+      "loss": 0.5642,
+      "step": 510
+    },
+    {
+      "epoch": 1.0980810234541578,
+      "grad_norm": 0.47119173407554626,
+      "learning_rate": 7.320540156361052e-05,
+      "loss": 0.5674,
+      "step": 515
+    },
+    {
+      "epoch": 1.1087420042643923,
+      "grad_norm": 1.0005890130996704,
+      "learning_rate": 7.391613361762616e-05,
+      "loss": 0.5949,
+      "step": 520
+    },
+    {
+      "epoch": 1.1194029850746268,
+      "grad_norm": 0.7785916924476624,
+      "learning_rate": 7.46268656716418e-05,
+      "loss": 0.5643,
+      "step": 525
+    },
+    {
+      "epoch": 1.1300639658848615,
+      "grad_norm": 0.6393773555755615,
+      "learning_rate": 7.533759772565742e-05,
+      "loss": 0.5886,
+      "step": 530
+    },
+    {
+      "epoch": 1.140724946695096,
+      "grad_norm": 0.6369247436523438,
+      "learning_rate": 7.604832977967307e-05,
+      "loss": 0.58,
+      "step": 535
+    },
+    {
+      "epoch": 1.1513859275053304,
+      "grad_norm": 0.48704272508621216,
+      "learning_rate": 7.675906183368871e-05,
+      "loss": 0.6125,
+      "step": 540
+    },
+    {
+      "epoch": 1.1620469083155651,
+      "grad_norm": 0.5542349219322205,
+      "learning_rate": 7.746979388770433e-05,
+      "loss": 0.5688,
+      "step": 545
+    },
+    {
+      "epoch": 1.1727078891257996,
+      "grad_norm": 0.4632197618484497,
+      "learning_rate": 7.818052594171997e-05,
+      "loss": 0.5727,
+      "step": 550
+    },
+    {
+      "epoch": 1.183368869936034,
+      "grad_norm": 0.40735307335853577,
+      "learning_rate": 7.889125799573562e-05,
+      "loss": 0.5704,
+      "step": 555
+    },
+    {
+      "epoch": 1.1940298507462686,
+      "grad_norm": 0.45803022384643555,
+      "learning_rate": 7.960199004975125e-05,
+      "loss": 0.6041,
+      "step": 560
+    },
+    {
+      "epoch": 1.2046908315565032,
+      "grad_norm": 0.47275593876838684,
+      "learning_rate": 8.031272210376688e-05,
+      "loss": 0.5476,
+      "step": 565
+    },
+    {
+      "epoch": 1.2153518123667377,
+      "grad_norm": 0.4402256906032562,
+      "learning_rate": 8.102345415778252e-05,
+      "loss": 0.6101,
+      "step": 570
+    },
+    {
+      "epoch": 1.2260127931769722,
+      "grad_norm": 0.4577506184577942,
+      "learning_rate": 8.173418621179815e-05,
+      "loss": 0.6021,
+      "step": 575
+    },
+    {
+      "epoch": 1.236673773987207,
+      "grad_norm": 0.4695811867713928,
+      "learning_rate": 8.24449182658138e-05,
+      "loss": 0.5843,
+      "step": 580
+    },
+    {
+      "epoch": 1.2473347547974414,
+      "grad_norm": 0.5012730360031128,
+      "learning_rate": 8.315565031982943e-05,
+      "loss": 0.5963,
+      "step": 585
+    },
+    {
+      "epoch": 1.2579957356076759,
+      "grad_norm": 0.4261506199836731,
+      "learning_rate": 8.386638237384507e-05,
+      "loss": 0.5608,
+      "step": 590
+    },
+    {
+      "epoch": 1.2686567164179103,
+      "grad_norm": 0.48886266350746155,
+      "learning_rate": 8.45771144278607e-05,
+      "loss": 0.5768,
+      "step": 595
+    },
+    {
+      "epoch": 1.279317697228145,
+      "grad_norm": 0.4756333529949188,
+      "learning_rate": 8.528784648187633e-05,
+      "loss": 0.5581,
+      "step": 600
+    },
+    {
+      "epoch": 1.2899786780383795,
+      "grad_norm": 0.4242517054080963,
+      "learning_rate": 8.599857853589198e-05,
+      "loss": 0.5436,
+      "step": 605
+    },
+    {
+      "epoch": 1.3006396588486142,
+      "grad_norm": 0.44590556621551514,
+      "learning_rate": 8.67093105899076e-05,
+      "loss": 0.5821,
+      "step": 610
+    },
+    {
+      "epoch": 1.3113006396588487,
+      "grad_norm": 0.4373833239078522,
+      "learning_rate": 8.742004264392325e-05,
+      "loss": 0.544,
+      "step": 615
+    },
+    {
+      "epoch": 1.3219616204690832,
+      "grad_norm": 0.42627617716789246,
+      "learning_rate": 8.813077469793888e-05,
+      "loss": 0.5417,
+      "step": 620
+    },
+    {
+      "epoch": 1.3326226012793176,
+      "grad_norm": 0.516544759273529,
+      "learning_rate": 8.884150675195451e-05,
+      "loss": 0.573,
+      "step": 625
+    },
+    {
+      "epoch": 1.3432835820895521,
+      "grad_norm": 0.4419044256210327,
+      "learning_rate": 8.955223880597016e-05,
+      "loss": 0.5523,
+      "step": 630
+    },
+    {
+      "epoch": 1.3539445628997868,
+      "grad_norm": 0.4533810019493103,
+      "learning_rate": 9.026297085998579e-05,
+      "loss": 0.5372,
+      "step": 635
+    },
+    {
+      "epoch": 1.3646055437100213,
+      "grad_norm": 0.4296520948410034,
+      "learning_rate": 9.097370291400143e-05,
+      "loss": 0.5742,
+      "step": 640
+    },
+    {
+      "epoch": 1.375266524520256,
+      "grad_norm": 0.4285917282104492,
+      "learning_rate": 9.168443496801706e-05,
+      "loss": 0.5577,
+      "step": 645
+    },
+    {
+      "epoch": 1.3859275053304905,
+      "grad_norm": 0.41438210010528564,
+      "learning_rate": 9.23951670220327e-05,
+      "loss": 0.5659,
+      "step": 650
+    },
+    {
+      "epoch": 1.396588486140725,
+      "grad_norm": 0.43702948093414307,
+      "learning_rate": 9.310589907604834e-05,
+      "loss": 0.5425,
+      "step": 655
+    },
+    {
+      "epoch": 1.4072494669509594,
+      "grad_norm": 0.520577609539032,
+      "learning_rate": 9.381663113006397e-05,
+      "loss": 0.5624,
+      "step": 660
+    },
+    {
+      "epoch": 1.417910447761194,
+      "grad_norm": 0.451948881149292,
+      "learning_rate": 9.452736318407961e-05,
+      "loss": 0.5598,
+      "step": 665
+    },
+    {
+      "epoch": 1.4285714285714286,
+      "grad_norm": 0.4748338460922241,
+      "learning_rate": 9.523809523809524e-05,
+      "loss": 0.6579,
+      "step": 670
+    },
+    {
+      "epoch": 1.439232409381663,
+      "grad_norm": 0.4351726472377777,
+      "learning_rate": 9.594882729211087e-05,
+      "loss": 0.541,
+      "step": 675
+    },
+    {
+      "epoch": 1.4498933901918978,
+      "grad_norm": 0.4322686493396759,
+      "learning_rate": 9.665955934612652e-05,
+      "loss": 0.5941,
+      "step": 680
+    },
+    {
+      "epoch": 1.4605543710021323,
+      "grad_norm": 0.43369051814079285,
+      "learning_rate": 9.737029140014216e-05,
+      "loss": 0.5862,
+      "step": 685
+    },
+    {
+      "epoch": 1.4712153518123667,
+      "grad_norm": 0.5028679966926575,
+      "learning_rate": 9.808102345415778e-05,
+      "loss": 0.5444,
+      "step": 690
+    },
+    {
+      "epoch": 1.4818763326226012,
+      "grad_norm": 0.4060784578323364,
+      "learning_rate": 9.879175550817342e-05,
+      "loss": 0.549,
+      "step": 695
+    },
+    {
+      "epoch": 1.4925373134328357,
+      "grad_norm": 0.4283974766731262,
+      "learning_rate": 9.950248756218906e-05,
+      "loss": 0.5474,
+      "step": 700
+    },
+    {
+      "epoch": 1.5031982942430704,
+      "grad_norm": 0.3743923008441925,
+      "learning_rate": 0.0001002132196162047,
+      "loss": 0.5394,
+      "step": 705
+    },
+    {
+      "epoch": 1.5138592750533049,
+      "grad_norm": 0.44469088315963745,
+      "learning_rate": 0.00010092395167022033,
+      "loss": 0.5563,
+      "step": 710
+    },
+    {
+      "epoch": 1.5245202558635396,
+      "grad_norm": 0.43209415674209595,
+      "learning_rate": 0.00010163468372423597,
+      "loss": 0.5803,
+      "step": 715
+    },
+    {
+      "epoch": 1.535181236673774,
+      "grad_norm": 0.4075677990913391,
+      "learning_rate": 0.00010234541577825161,
+      "loss": 0.5369,
+      "step": 720
+    },
+    {
+      "epoch": 1.5458422174840085,
+      "grad_norm": 0.4084095358848572,
+      "learning_rate": 0.00010305614783226724,
+      "loss": 0.5687,
+      "step": 725
+    },
+    {
+      "epoch": 1.556503198294243,
+      "grad_norm": 0.4053703248500824,
+      "learning_rate": 0.00010376687988628289,
+      "loss": 0.5301,
+      "step": 730
+    },
+    {
+      "epoch": 1.5671641791044775,
+      "grad_norm": 0.46452564001083374,
+      "learning_rate": 0.0001044776119402985,
+      "loss": 0.5823,
+      "step": 735
+    },
+    {
+      "epoch": 1.5778251599147122,
+      "grad_norm": 0.4020977020263672,
+      "learning_rate": 0.00010518834399431414,
+      "loss": 0.5463,
+      "step": 740
+    },
+    {
+      "epoch": 1.5884861407249466,
+      "grad_norm": 0.3993551433086395,
+      "learning_rate": 0.00010589907604832978,
+      "loss": 0.5551,
+      "step": 745
+    },
+    {
+      "epoch": 1.5991471215351813,
+      "grad_norm": 0.4211786985397339,
+      "learning_rate": 0.00010660980810234542,
+      "loss": 0.5607,
+      "step": 750
+    },
+    {
+      "epoch": 1.6098081023454158,
+      "grad_norm": 0.4241097867488861,
+      "learning_rate": 0.00010732054015636106,
+      "loss": 0.5402,
+      "step": 755
+    },
+    {
+      "epoch": 1.6204690831556503,
+      "grad_norm": 0.3934391736984253,
+      "learning_rate": 0.0001080312722103767,
+      "loss": 0.5618,
+      "step": 760
+    },
+    {
+      "epoch": 1.6311300639658848,
+      "grad_norm": 0.37157073616981506,
+      "learning_rate": 0.00010874200426439234,
+      "loss": 0.5232,
+      "step": 765
+    },
+    {
+      "epoch": 1.6417910447761193,
+      "grad_norm": 0.4151962399482727,
+      "learning_rate": 0.00010945273631840796,
+      "loss": 0.563,
+      "step": 770
+    },
+    {
+      "epoch": 1.652452025586354,
+      "grad_norm": 0.42233771085739136,
+      "learning_rate": 0.00011016346837242359,
+      "loss": 0.5667,
+      "step": 775
+    },
+    {
+      "epoch": 1.6631130063965884,
+      "grad_norm": 0.3891717493534088,
+      "learning_rate": 0.00011087420042643924,
+      "loss": 0.582,
+      "step": 780
+    },
+    {
+      "epoch": 1.6737739872068231,
+      "grad_norm": 0.4017283618450165,
+      "learning_rate": 0.00011158493248045488,
+      "loss": 0.5386,
+      "step": 785
+    },
+    {
+      "epoch": 1.6844349680170576,
+      "grad_norm": 0.4058316648006439,
+      "learning_rate": 0.00011229566453447051,
+      "loss": 0.5357,
+      "step": 790
+    },
+    {
+      "epoch": 1.695095948827292,
+      "grad_norm": 0.38968625664711,
+      "learning_rate": 0.00011300639658848615,
+      "loss": 0.527,
+      "step": 795
+    },
+    {
+      "epoch": 1.7057569296375266,
+      "grad_norm": 0.4108840525150299,
+      "learning_rate": 0.0001137171286425018,
+      "loss": 0.5347,
+      "step": 800
+    },
+    {
+      "epoch": 1.716417910447761,
+      "grad_norm": 0.37222376465797424,
+      "learning_rate": 0.00011442786069651741,
+      "loss": 0.524,
+      "step": 805
+    },
+    {
+      "epoch": 1.7270788912579957,
+      "grad_norm": 0.4046708047389984,
+      "learning_rate": 0.00011513859275053305,
+      "loss": 0.5096,
+      "step": 810
+    },
+    {
+      "epoch": 1.7377398720682304,
+      "grad_norm": 0.37089455127716064,
+      "learning_rate": 0.00011584932480454869,
+      "loss": 0.5316,
+      "step": 815
+    },
+    {
+      "epoch": 1.748400852878465,
+      "grad_norm": 0.3895399272441864,
+      "learning_rate": 0.00011656005685856432,
+      "loss": 0.5274,
+      "step": 820
+    },
+    {
+      "epoch": 1.7590618336886994,
+      "grad_norm": 0.3956606984138489,
+      "learning_rate": 0.00011727078891257996,
+      "loss": 0.5395,
+      "step": 825
+    },
+    {
+      "epoch": 1.7697228144989339,
+      "grad_norm": 0.4023361802101135,
+      "learning_rate": 0.00011798152096659561,
+      "loss": 0.53,
+      "step": 830
+    },
+    {
+      "epoch": 1.7803837953091683,
+      "grad_norm": 0.39323511719703674,
+      "learning_rate": 0.00011869225302061124,
+      "loss": 0.5341,
+      "step": 835
+    },
+    {
+      "epoch": 1.7910447761194028,
+      "grad_norm": 0.3870689868927002,
+      "learning_rate": 0.00011940298507462686,
+      "loss": 0.5268,
+      "step": 840
+    },
+    {
+      "epoch": 1.8017057569296375,
+      "grad_norm": 0.39864471554756165,
+      "learning_rate": 0.0001201137171286425,
+      "loss": 0.5754,
+      "step": 845
+    },
+    {
+      "epoch": 1.8123667377398722,
+      "grad_norm": 0.413980633020401,
+      "learning_rate": 0.00012082444918265814,
+      "loss": 0.5274,
+      "step": 850
+    },
+    {
+      "epoch": 1.8230277185501067,
+      "grad_norm": 0.3994651138782501,
+      "learning_rate": 0.00012153518123667377,
+      "loss": 0.5313,
+      "step": 855
+    },
+    {
+      "epoch": 1.8336886993603412,
+      "grad_norm": 0.4106079041957855,
+      "learning_rate": 0.0001222459132906894,
+      "loss": 0.5293,
+      "step": 860
+    },
+    {
+      "epoch": 1.8443496801705757,
+      "grad_norm": 0.38014471530914307,
+      "learning_rate": 0.00012295664534470505,
+      "loss": 0.5313,
+      "step": 865
+    },
+    {
+      "epoch": 1.8550106609808101,
+      "grad_norm": 0.3477731943130493,
+      "learning_rate": 0.0001236673773987207,
+      "loss": 0.5499,
+      "step": 870
+    },
+    {
+      "epoch": 1.8656716417910446,
+      "grad_norm": 0.3609556555747986,
+      "learning_rate": 0.0001243781094527363,
+      "loss": 0.5195,
+      "step": 875
+    },
+    {
+      "epoch": 1.8763326226012793,
+      "grad_norm": 0.3532927334308624,
+      "learning_rate": 0.00012508884150675195,
+      "loss": 0.5233,
+      "step": 880
+    },
+    {
+      "epoch": 1.886993603411514,
+      "grad_norm": 0.3663487434387207,
+      "learning_rate": 0.0001257995735607676,
+      "loss": 0.5129,
+      "step": 885
+    },
+    {
+      "epoch": 1.8976545842217485,
+      "grad_norm": 0.35837364196777344,
+      "learning_rate": 0.00012651030561478324,
+      "loss": 0.5106,
+      "step": 890
+    },
+    {
+      "epoch": 1.908315565031983,
+      "grad_norm": 0.38498660922050476,
+      "learning_rate": 0.00012722103766879886,
+      "loss": 0.5216,
+      "step": 895
+    },
+    {
+      "epoch": 1.9189765458422174,
+      "grad_norm": 0.3501322269439697,
+      "learning_rate": 0.0001279317697228145,
+      "loss": 0.54,
+      "step": 900
+    },
+    {
+      "epoch": 1.929637526652452,
+      "grad_norm": 0.34796684980392456,
+      "learning_rate": 0.00012864250177683015,
+      "loss": 0.5165,
+      "step": 905
+    },
+    {
+      "epoch": 1.9402985074626866,
+      "grad_norm": 0.46670106053352356,
+      "learning_rate": 0.0001293532338308458,
+      "loss": 0.5437,
+      "step": 910
+    },
+    {
+      "epoch": 1.950959488272921,
+      "grad_norm": 0.3535880148410797,
+      "learning_rate": 0.0001300639658848614,
+      "loss": 0.5561,
+      "step": 915
+    },
+    {
+      "epoch": 1.9616204690831558,
+      "grad_norm": 0.3591325283050537,
+      "learning_rate": 0.00013077469793887705,
+      "loss": 0.5193,
+      "step": 920
+    },
+    {
+      "epoch": 1.9722814498933903,
+      "grad_norm": 0.4969016909599304,
+      "learning_rate": 0.00013148542999289267,
+      "loss": 0.526,
+      "step": 925
+    },
+    {
+      "epoch": 1.9829424307036247,
+      "grad_norm": 0.3567504584789276,
+      "learning_rate": 0.00013219616204690831,
+      "loss": 0.5063,
+      "step": 930
+    },
+    {
+      "epoch": 1.9936034115138592,
+      "grad_norm": 0.3647787272930145,
+      "learning_rate": 0.00013290689410092396,
+      "loss": 0.5094,
+      "step": 935
+    },
+    {
+      "epoch": 2.0,
+      "eval_loss": 0.5335173606872559,
+      "eval_runtime": 377.8765,
+      "eval_samples_per_second": 1.09,
+      "eval_steps_per_second": 1.09,
+      "step": 938
+    },
+    {
+      "epoch": 2.0042643923240937,
+      "grad_norm": 0.34923797845840454,
+      "learning_rate": 0.0001336176261549396,
+      "loss": 0.5126,
+      "step": 940
+    },
+    {
+      "epoch": 2.014925373134328,
+      "grad_norm": 0.4439273476600647,
+      "learning_rate": 0.00013432835820895525,
+      "loss": 0.5349,
+      "step": 945
+    },
+    {
+      "epoch": 2.025586353944563,
+      "grad_norm": 0.35956764221191406,
+      "learning_rate": 0.00013503909026297086,
+      "loss": 0.493,
+      "step": 950
+    },
+    {
+      "epoch": 2.0362473347547976,
+      "grad_norm": 0.3677864074707031,
+      "learning_rate": 0.0001357498223169865,
+      "loss": 0.523,
+      "step": 955
+    },
+    {
+      "epoch": 2.046908315565032,
+      "grad_norm": 0.3486590087413788,
+      "learning_rate": 0.00013646055437100213,
+      "loss": 0.5322,
+      "step": 960
+    },
+    {
+      "epoch": 2.0575692963752665,
+      "grad_norm": 0.3785991072654724,
+      "learning_rate": 0.00013717128642501777,
+      "loss": 0.4903,
+      "step": 965
+    },
+    {
+      "epoch": 2.068230277185501,
+      "grad_norm": 0.3422692120075226,
+      "learning_rate": 0.0001378820184790334,
+      "loss": 0.5356,
+      "step": 970
+    },
+    {
+      "epoch": 2.0788912579957355,
+      "grad_norm": 0.41184964776039124,
+      "learning_rate": 0.00013859275053304906,
+      "loss": 0.4969,
+      "step": 975
+    },
+    {
+      "epoch": 2.08955223880597,
+      "grad_norm": 0.34267646074295044,
+      "learning_rate": 0.0001393034825870647,
+      "loss": 0.5113,
+      "step": 980
+    },
+    {
+      "epoch": 2.100213219616205,
+      "grad_norm": 0.38112279772758484,
+      "learning_rate": 0.00014001421464108032,
+      "loss": 0.4793,
+      "step": 985
+    },
+    {
+      "epoch": 2.1108742004264394,
+      "grad_norm": 0.33497291803359985,
+      "learning_rate": 0.00014072494669509596,
+      "loss": 0.5185,
+      "step": 990
+    },
+    {
+      "epoch": 2.121535181236674,
+      "grad_norm": 0.37100210785865784,
+      "learning_rate": 0.00014143567874911158,
+      "loss": 0.5024,
+      "step": 995
+    },
+    {
+      "epoch": 2.1321961620469083,
+      "grad_norm": 0.3079771101474762,
+      "learning_rate": 0.00014214641080312722,
+      "loss": 0.5066,
+      "step": 1000
+    },
+    {
+      "epoch": 2.142857142857143,
+      "grad_norm": 0.3615591824054718,
+      "learning_rate": 0.00014285714285714287,
+      "loss": 0.5157,
+      "step": 1005
+    },
+    {
+      "epoch": 2.1535181236673773,
+      "grad_norm": 0.3394719958305359,
+      "learning_rate": 0.0001435678749111585,
+      "loss": 0.4906,
+      "step": 1010
+    },
+    {
+      "epoch": 2.1641791044776117,
+      "grad_norm": 0.4234224557876587,
+      "learning_rate": 0.00014427860696517416,
+      "loss": 0.5015,
+      "step": 1015
+    },
+    {
+      "epoch": 2.1748400852878467,
+      "grad_norm": 0.3535841107368469,
+      "learning_rate": 0.00014498933901918977,
+      "loss": 0.5107,
+      "step": 1020
+    },
+    {
+      "epoch": 2.185501066098081,
+      "grad_norm": 0.41673514246940613,
+      "learning_rate": 0.0001457000710732054,
+      "loss": 0.505,
+      "step": 1025
+    },
+    {
+      "epoch": 2.1961620469083156,
+      "grad_norm": 0.3521960973739624,
+      "learning_rate": 0.00014641080312722103,
+      "loss": 0.5339,
+      "step": 1030
+    },
+    {
+      "epoch": 2.20682302771855,
+      "grad_norm": 0.341727614402771,
+      "learning_rate": 0.00014712153518123668,
+      "loss": 0.4897,
+      "step": 1035
+    },
+    {
+      "epoch": 2.2174840085287846,
+      "grad_norm": 0.32079800963401794,
+      "learning_rate": 0.00014783226723525232,
+      "loss": 0.5049,
+      "step": 1040
+    },
+    {
+      "epoch": 2.228144989339019,
+      "grad_norm": 0.34027552604675293,
+      "learning_rate": 0.00014854299928926797,
+      "loss": 0.4993,
+      "step": 1045
+    },
+    {
+      "epoch": 2.2388059701492535,
+      "grad_norm": 0.34183624386787415,
+      "learning_rate": 0.0001492537313432836,
+      "loss": 0.51,
+      "step": 1050
+    },
+    {
+      "epoch": 2.2494669509594885,
+      "grad_norm": 0.31983354687690735,
+      "learning_rate": 0.00014996446339729923,
+      "loss": 0.5084,
+      "step": 1055
+    },
+    {
+      "epoch": 2.260127931769723,
+      "grad_norm": 0.3631596565246582,
+      "learning_rate": 0.00015067519545131484,
+      "loss": 0.4986,
+      "step": 1060
+    },
+    {
+      "epoch": 2.2707889125799574,
+      "grad_norm": 0.32126784324645996,
+      "learning_rate": 0.0001513859275053305,
+      "loss": 0.4832,
+      "step": 1065
+    },
+    {
+      "epoch": 2.281449893390192,
+      "grad_norm": 0.3390761911869049,
+      "learning_rate": 0.00015209665955934613,
+      "loss": 0.4972,
+      "step": 1070
+    },
+    {
+      "epoch": 2.2921108742004264,
+      "grad_norm": 0.3330533504486084,
+      "learning_rate": 0.00015280739161336178,
+      "loss": 0.4772,
+      "step": 1075
+    },
+    {
+      "epoch": 2.302771855010661,
+      "grad_norm": 0.3619351089000702,
+      "learning_rate": 0.00015351812366737742,
+      "loss": 0.5141,
+      "step": 1080
+    },
+    {
+      "epoch": 2.3134328358208958,
+      "grad_norm": 0.3252182602882385,
+      "learning_rate": 0.00015422885572139304,
+      "loss": 0.5056,
+      "step": 1085
+    },
+    {
+      "epoch": 2.3240938166311302,
+      "grad_norm": 0.3745068311691284,
+      "learning_rate": 0.00015493958777540866,
+      "loss": 0.5395,
+      "step": 1090
+    },
+    {
+      "epoch": 2.3347547974413647,
+      "grad_norm": 0.38191962242126465,
+      "learning_rate": 0.0001556503198294243,
+      "loss": 0.4865,
+      "step": 1095
+    },
+    {
+      "epoch": 2.345415778251599,
+      "grad_norm": 0.32218611240386963,
+      "learning_rate": 0.00015636105188343994,
+      "loss": 0.4955,
+      "step": 1100
+    },
+    {
+      "epoch": 2.3560767590618337,
+      "grad_norm": 0.32240140438079834,
+      "learning_rate": 0.0001570717839374556,
+      "loss": 0.4972,
+      "step": 1105
+    },
+    {
+      "epoch": 2.366737739872068,
+      "grad_norm": 0.37284377217292786,
+      "learning_rate": 0.00015778251599147123,
+      "loss": 0.4874,
+      "step": 1110
+    },
+    {
+      "epoch": 2.3773987206823026,
+      "grad_norm": 0.350769579410553,
+      "learning_rate": 0.00015849324804548688,
+      "loss": 0.4931,
+      "step": 1115
+    },
+    {
+      "epoch": 2.388059701492537,
+      "grad_norm": 0.3309812843799591,
+      "learning_rate": 0.0001592039800995025,
+      "loss": 0.5103,
+      "step": 1120
+    },
+    {
+      "epoch": 2.398720682302772,
+      "grad_norm": 0.3497963547706604,
+      "learning_rate": 0.0001599147121535181,
+      "loss": 0.4864,
+      "step": 1125
+    },
+    {
+      "epoch": 2.4093816631130065,
+      "grad_norm": 0.3567025661468506,
+      "learning_rate": 0.00016062544420753375,
+      "loss": 0.5461,
+      "step": 1130
+    },
+    {
+      "epoch": 2.420042643923241,
+      "grad_norm": 0.5213941931724548,
+      "learning_rate": 0.0001613361762615494,
+      "loss": 0.5138,
+      "step": 1135
+    },
+    {
+      "epoch": 2.4307036247334755,
+      "grad_norm": 0.32027000188827515,
+      "learning_rate": 0.00016204690831556504,
+      "loss": 0.5078,
+      "step": 1140
+    },
+    {
+      "epoch": 2.44136460554371,
+      "grad_norm": 0.37092500925064087,
+      "learning_rate": 0.00016275764036958069,
+      "loss": 0.4903,
+      "step": 1145
+    },
+    {
+      "epoch": 2.4520255863539444,
+      "grad_norm": 0.35545867681503296,
+      "learning_rate": 0.0001634683724235963,
+      "loss": 0.5131,
+      "step": 1150
+    },
+    {
+      "epoch": 2.4626865671641793,
+      "grad_norm": 0.3277740776538849,
+      "learning_rate": 0.00016417910447761195,
+      "loss": 0.4814,
+      "step": 1155
+    },
+    {
+      "epoch": 2.473347547974414,
+      "grad_norm": 0.3226880133152008,
+      "learning_rate": 0.0001648898365316276,
+      "loss": 0.4944,
+      "step": 1160
+    },
+    {
+      "epoch": 2.4840085287846483,
+      "grad_norm": 0.3283137381076813,
+      "learning_rate": 0.0001656005685856432,
+      "loss": 0.5058,
+      "step": 1165
+    },
+    {
+      "epoch": 2.4946695095948828,
+      "grad_norm": 0.38707828521728516,
+      "learning_rate": 0.00016631130063965885,
+      "loss": 0.5108,
+      "step": 1170
+    },
+    {
+      "epoch": 2.5053304904051172,
+      "grad_norm": 0.3053881824016571,
+      "learning_rate": 0.0001670220326936745,
+      "loss": 0.4751,
+      "step": 1175
+    },
+    {
+      "epoch": 2.5159914712153517,
+      "grad_norm": 0.29871490597724915,
+      "learning_rate": 0.00016773276474769014,
+      "loss": 0.4848,
+      "step": 1180
+    },
+    {
+      "epoch": 2.526652452025586,
+      "grad_norm": 0.3135201930999756,
+      "learning_rate": 0.00016844349680170576,
+      "loss": 0.4852,
+      "step": 1185
+    },
+    {
+      "epoch": 2.5373134328358207,
+      "grad_norm": 0.31287622451782227,
+      "learning_rate": 0.0001691542288557214,
+      "loss": 0.4804,
+      "step": 1190
+    },
+    {
+      "epoch": 2.5479744136460556,
+      "grad_norm": 0.30184197425842285,
+      "learning_rate": 0.00016986496090973705,
+      "loss": 0.5006,
+      "step": 1195
+    },
+    {
+      "epoch": 2.55863539445629,
+      "grad_norm": 0.29948562383651733,
+      "learning_rate": 0.00017057569296375266,
+      "loss": 0.4934,
+      "step": 1200
+    },
+    {
+      "epoch": 2.5692963752665245,
+      "grad_norm": 0.29258280992507935,
+      "learning_rate": 0.0001712864250177683,
+      "loss": 0.4887,
+      "step": 1205
+    },
+    {
+      "epoch": 2.579957356076759,
+      "grad_norm": 0.29767826199531555,
+      "learning_rate": 0.00017199715707178395,
+      "loss": 0.4958,
+      "step": 1210
+    },
+    {
+      "epoch": 2.5906183368869935,
+      "grad_norm": 0.29649823904037476,
+      "learning_rate": 0.0001727078891257996,
+      "loss": 0.51,
+      "step": 1215
+    },
+    {
+      "epoch": 2.6012793176972284,
+      "grad_norm": 0.30332130193710327,
+      "learning_rate": 0.0001734186211798152,
+      "loss": 0.4954,
+      "step": 1220
+    },
+    {
+      "epoch": 2.611940298507463,
+      "grad_norm": 0.3551209270954132,
+      "learning_rate": 0.00017412935323383086,
+      "loss": 0.5088,
+      "step": 1225
+    },
+    {
+      "epoch": 2.6226012793176974,
+      "grad_norm": 0.33677777647972107,
+      "learning_rate": 0.0001748400852878465,
+      "loss": 0.5248,
+      "step": 1230
+    },
+    {
+      "epoch": 2.633262260127932,
+      "grad_norm": 0.29216548800468445,
+      "learning_rate": 0.00017555081734186212,
+      "loss": 0.4954,
+      "step": 1235
+    },
+    {
+      "epoch": 2.6439232409381663,
+      "grad_norm": 0.32732442021369934,
+      "learning_rate": 0.00017626154939587776,
+      "loss": 0.5048,
+      "step": 1240
+    },
+    {
+      "epoch": 2.654584221748401,
+      "grad_norm": 0.29788029193878174,
+      "learning_rate": 0.0001769722814498934,
+      "loss": 0.5056,
+      "step": 1245
+    },
+    {
+      "epoch": 2.6652452025586353,
+      "grad_norm": 0.3407440185546875,
+      "learning_rate": 0.00017768301350390902,
+      "loss": 0.5385,
+      "step": 1250
+    },
+    {
+      "epoch": 2.6759061833688698,
+      "grad_norm": 0.2790848910808563,
+      "learning_rate": 0.00017839374555792467,
+      "loss": 0.5014,
+      "step": 1255
+    },
+    {
+      "epoch": 2.6865671641791042,
+      "grad_norm": 0.30173078179359436,
+      "learning_rate": 0.0001791044776119403,
+      "loss": 0.5118,
+      "step": 1260
+    },
+    {
+      "epoch": 2.697228144989339,
+      "grad_norm": 0.2736753821372986,
+      "learning_rate": 0.00017981520966595596,
+      "loss": 0.5018,
+      "step": 1265
+    },
+    {
+      "epoch": 2.7078891257995736,
+      "grad_norm": 0.2970294952392578,
+      "learning_rate": 0.00018052594171997157,
+      "loss": 0.4966,
+      "step": 1270
+    },
+    {
+      "epoch": 2.718550106609808,
+      "grad_norm": 0.2721494138240814,
+      "learning_rate": 0.00018123667377398722,
+      "loss": 0.4746,
+      "step": 1275
+    },
+    {
+      "epoch": 2.7292110874200426,
+      "grad_norm": 0.29144713282585144,
+      "learning_rate": 0.00018194740582800286,
+      "loss": 0.4739,
+      "step": 1280
+    },
+    {
+      "epoch": 2.739872068230277,
+      "grad_norm": 0.3217550814151764,
+      "learning_rate": 0.00018265813788201848,
+      "loss": 0.4868,
+      "step": 1285
+    },
+    {
+      "epoch": 2.750533049040512,
+      "grad_norm": 0.25847169756889343,
+      "learning_rate": 0.00018336886993603412,
+      "loss": 0.4664,
+      "step": 1290
+    },
+    {
+      "epoch": 2.7611940298507465,
+      "grad_norm": 0.2917424142360687,
+      "learning_rate": 0.00018407960199004977,
+      "loss": 0.4659,
+      "step": 1295
+    },
+    {
+      "epoch": 2.771855010660981,
+      "grad_norm": 0.29807865619659424,
+      "learning_rate": 0.0001847903340440654,
+      "loss": 0.4838,
+      "step": 1300
+    },
+    {
+      "epoch": 2.7825159914712154,
+      "grad_norm": 0.28630420565605164,
+      "learning_rate": 0.00018550106609808103,
+      "loss": 0.4658,
+      "step": 1305
+    },
+    {
+      "epoch": 2.79317697228145,
+      "grad_norm": 0.2946392595767975,
+      "learning_rate": 0.00018621179815209667,
+      "loss": 0.5037,
+      "step": 1310
+    },
+    {
+      "epoch": 2.8038379530916844,
+      "grad_norm": 0.38894176483154297,
+      "learning_rate": 0.0001869225302061123,
+      "loss": 0.525,
+      "step": 1315
+    },
+    {
+      "epoch": 2.814498933901919,
+      "grad_norm": 0.28793737292289734,
+      "learning_rate": 0.00018763326226012793,
+      "loss": 0.5238,
+      "step": 1320
+    },
+    {
+      "epoch": 2.8251599147121533,
+      "grad_norm": 0.3103950023651123,
+      "learning_rate": 0.00018834399431414358,
+      "loss": 0.4932,
+      "step": 1325
+    },
+    {
+      "epoch": 2.835820895522388,
+      "grad_norm": 0.2969878017902374,
+      "learning_rate": 0.00018905472636815922,
+      "loss": 0.4807,
+      "step": 1330
+    },
+    {
+      "epoch": 2.8464818763326227,
+      "grad_norm": 0.2937600612640381,
+      "learning_rate": 0.00018976545842217486,
+      "loss": 0.4862,
+      "step": 1335
+    },
+    {
+      "epoch": 2.857142857142857,
+      "grad_norm": 0.2892070710659027,
+      "learning_rate": 0.00019047619047619048,
+      "loss": 0.526,
+      "step": 1340
+    },
+    {
+      "epoch": 2.8678038379530917,
+      "grad_norm": 0.28446847200393677,
+      "learning_rate": 0.00019118692253020613,
+      "loss": 0.4846,
+      "step": 1345
+    },
+    {
+      "epoch": 2.878464818763326,
+      "grad_norm": 0.2877322733402252,
+      "learning_rate": 0.00019189765458422174,
+      "loss": 0.4759,
+      "step": 1350
+    },
+    {
+      "epoch": 2.8891257995735606,
+      "grad_norm": 0.2837788462638855,
+      "learning_rate": 0.0001926083866382374,
+      "loss": 0.4894,
+      "step": 1355
+    },
+    {
+      "epoch": 2.8997867803837956,
+      "grad_norm": 0.3020360469818115,
+      "learning_rate": 0.00019331911869225303,
+      "loss": 0.4936,
+      "step": 1360
+    },
+    {
+      "epoch": 2.91044776119403,
+      "grad_norm": 0.28344911336898804,
+      "learning_rate": 0.00019402985074626867,
+      "loss": 0.4881,
+      "step": 1365
+    },
+    {
+      "epoch": 2.9211087420042645,
+      "grad_norm": 0.2753186821937561,
+      "learning_rate": 0.00019474058280028432,
+      "loss": 0.4826,
+      "step": 1370
+    },
+    {
+      "epoch": 2.931769722814499,
+      "grad_norm": 0.2922317385673523,
+      "learning_rate": 0.00019545131485429994,
+      "loss": 0.4759,
+      "step": 1375
+    },
+    {
+      "epoch": 2.9424307036247335,
+      "grad_norm": 0.3179524540901184,
+      "learning_rate": 0.00019616204690831555,
+      "loss": 0.4883,
+      "step": 1380
+    },
+    {
+      "epoch": 2.953091684434968,
+      "grad_norm": 0.2944222688674927,
+      "learning_rate": 0.0001968727789623312,
+      "loss": 0.4804,
+      "step": 1385
+    },
+    {
+      "epoch": 2.9637526652452024,
+      "grad_norm": 0.2687291204929352,
+      "learning_rate": 0.00019758351101634684,
+      "loss": 0.4891,
+      "step": 1390
+    },
+    {
+      "epoch": 2.974413646055437,
+      "grad_norm": 0.25935596227645874,
+      "learning_rate": 0.00019829424307036249,
+      "loss": 0.4902,
+      "step": 1395
+    },
+    {
+      "epoch": 2.9850746268656714,
+      "grad_norm": 0.30086612701416016,
+      "learning_rate": 0.00019900497512437813,
+      "loss": 0.4942,
+      "step": 1400
+    },
+    {
+      "epoch": 2.9957356076759063,
+      "grad_norm": 0.2930257022380829,
+      "learning_rate": 0.00019971570717839377,
+      "loss": 0.513,
+      "step": 1405
+    },
+    {
+      "epoch": 3.0,
+      "eval_loss": 0.5142309069633484,
+      "eval_runtime": 377.5199,
+      "eval_samples_per_second": 1.091,
+      "eval_steps_per_second": 1.091,
+      "step": 1407
+    },
+    {
+      "epoch": 3.0063965884861408,
+      "grad_norm": 0.28208208084106445,
+      "learning_rate": 0.00019999997230259856,
+      "loss": 0.467,
+      "step": 1410
+    },
+    {
+      "epoch": 3.0170575692963753,
+      "grad_norm": 0.290385365486145,
+      "learning_rate": 0.00019999980304075655,
+      "loss": 0.44,
+      "step": 1415
+    },
+    {
+      "epoch": 3.0277185501066097,
+      "grad_norm": 0.27436771988868713,
+      "learning_rate": 0.00019999947990477788,
+      "loss": 0.4876,
+      "step": 1420
+    },
+    {
+      "epoch": 3.038379530916844,
+      "grad_norm": 0.2883841395378113,
+      "learning_rate": 0.00019999900289515975,
+      "loss": 0.4509,
+      "step": 1425
+    },
+    {
+      "epoch": 3.0490405117270787,
+      "grad_norm": 0.279857337474823,
+      "learning_rate": 0.00019999837201263622,
+      "loss": 0.4431,
+      "step": 1430
+    },
+    {
+      "epoch": 3.0597014925373136,
+      "grad_norm": 0.31563228368759155,
+      "learning_rate": 0.000199997587258178,
+      "loss": 0.4789,
+      "step": 1435
+    },
+    {
+      "epoch": 3.070362473347548,
+      "grad_norm": 0.302135169506073,
+      "learning_rate": 0.00019999664863299267,
+      "loss": 0.4685,
+      "step": 1440
+    },
+    {
+      "epoch": 3.0810234541577826,
+      "grad_norm": 0.2668147385120392,
+      "learning_rate": 0.00019999555613852449,
+      "loss": 0.4361,
+      "step": 1445
+    },
+    {
+      "epoch": 3.091684434968017,
+      "grad_norm": 0.28701773285865784,
+      "learning_rate": 0.00019999430977645457,
+      "loss": 0.4417,
+      "step": 1450
+    },
+    {
+      "epoch": 3.1023454157782515,
+      "grad_norm": 0.2622893154621124,
+      "learning_rate": 0.00019999290954870073,
+      "loss": 0.4524,
+      "step": 1455
+    },
+    {
+      "epoch": 3.113006396588486,
+      "grad_norm": 0.2776693105697632,
+      "learning_rate": 0.00019999135545741755,
+      "loss": 0.463,
+      "step": 1460
+    },
+    {
+      "epoch": 3.1236673773987205,
+      "grad_norm": 0.26774516701698303,
+      "learning_rate": 0.00019998964750499637,
+      "loss": 0.4732,
+      "step": 1465
+    },
+    {
+      "epoch": 3.1343283582089554,
+      "grad_norm": 0.26958051323890686,
+      "learning_rate": 0.0001999877856940653,
+      "loss": 0.4517,
+      "step": 1470
+    },
+    {
+      "epoch": 3.14498933901919,
+      "grad_norm": 0.2604299485683441,
+      "learning_rate": 0.00019998577002748924,
+      "loss": 0.4476,
+      "step": 1475
+    },
+    {
+      "epoch": 3.1556503198294243,
+      "grad_norm": 1.0628249645233154,
+      "learning_rate": 0.00019998360050836974,
+      "loss": 0.4542,
+      "step": 1480
+    },
+    {
+      "epoch": 3.166311300639659,
+      "grad_norm": 0.26215219497680664,
+      "learning_rate": 0.0001999812771400451,
+      "loss": 0.4608,
+      "step": 1485
+    },
+    {
+      "epoch": 3.1769722814498933,
+      "grad_norm": 0.2745310068130493,
+      "learning_rate": 0.00019997879992609047,
+      "loss": 0.4532,
+      "step": 1490
+    },
+    {
+      "epoch": 3.1876332622601278,
+      "grad_norm": 0.3186289072036743,
+      "learning_rate": 0.0001999761688703176,
+      "loss": 0.4854,
+      "step": 1495
+    },
+    {
+      "epoch": 3.1982942430703627,
+      "grad_norm": 0.2697219252586365,
+      "learning_rate": 0.000199973383976775,
+      "loss": 0.4759,
+      "step": 1500
+    },
+    {
+      "epoch": 3.208955223880597,
+      "grad_norm": 0.32173436880111694,
+      "learning_rate": 0.00019997044524974799,
+      "loss": 0.47,
+      "step": 1505
+    },
+    {
+      "epoch": 3.2196162046908317,
+      "grad_norm": 0.28551211953163147,
+      "learning_rate": 0.00019996735269375843,
+      "loss": 0.4537,
+      "step": 1510
+    },
+    {
+      "epoch": 3.230277185501066,
+      "grad_norm": 0.2618770897388458,
+      "learning_rate": 0.00019996410631356498,
+      "loss": 0.455,
+      "step": 1515
+    },
+    {
+      "epoch": 3.2409381663113006,
+      "grad_norm": 0.3189204931259155,
+      "learning_rate": 0.00019996070611416305,
+      "loss": 0.4869,
+      "step": 1520
+    },
+    {
+      "epoch": 3.251599147121535,
+      "grad_norm": 0.2555652856826782,
+      "learning_rate": 0.00019995715210078464,
+      "loss": 0.4582,
+      "step": 1525
+    },
+    {
+      "epoch": 3.2622601279317696,
+      "grad_norm": 0.45129457116127014,
+      "learning_rate": 0.00019995344427889845,
+      "loss": 0.5055,
+      "step": 1530
+    },
+    {
+      "epoch": 3.272921108742004,
+      "grad_norm": 0.2851119637489319,
+      "learning_rate": 0.0001999495826542099,
+      "loss": 0.4495,
+      "step": 1535
+    },
+    {
+      "epoch": 3.283582089552239,
+      "grad_norm": 0.4647831916809082,
+      "learning_rate": 0.00019994556723266103,
+      "loss": 0.4442,
+      "step": 1540
+    },
+    {
+      "epoch": 3.2942430703624734,
+      "grad_norm": 0.28650426864624023,
+      "learning_rate": 0.00019994139802043055,
+      "loss": 0.488,
+      "step": 1545
+    },
+    {
+      "epoch": 3.304904051172708,
+      "grad_norm": 0.2804616093635559,
+      "learning_rate": 0.0001999370750239338,
+      "loss": 0.4538,
+      "step": 1550
+    },
+    {
+      "epoch": 3.3155650319829424,
+      "grad_norm": 0.2778622508049011,
+      "learning_rate": 0.0001999325982498228,
+      "loss": 0.4468,
+      "step": 1555
+    },
+    {
+      "epoch": 3.326226012793177,
+      "grad_norm": 0.26577600836753845,
+      "learning_rate": 0.00019992796770498616,
+      "loss": 0.4805,
+      "step": 1560
+    },
+    {
+      "epoch": 3.3368869936034113,
+      "grad_norm": 0.25679486989974976,
+      "learning_rate": 0.00019992318339654905,
+      "loss": 0.4648,
+      "step": 1565
+    },
+    {
+      "epoch": 3.3475479744136463,
+      "grad_norm": 0.263921856880188,
+      "learning_rate": 0.00019991824533187335,
+      "loss": 0.4638,
+      "step": 1570
+    },
+    {
+      "epoch": 3.3582089552238807,
+      "grad_norm": 0.25445836782455444,
+      "learning_rate": 0.00019991315351855748,
+      "loss": 0.4395,
+      "step": 1575
+    },
+    {
+      "epoch": 3.368869936034115,
+      "grad_norm": 0.2354278415441513,
+      "learning_rate": 0.0001999079079644364,
+      "loss": 0.487,
+      "step": 1580
+    },
+    {
+      "epoch": 3.3795309168443497,
+      "grad_norm": 0.2561117708683014,
+      "learning_rate": 0.0001999025086775817,
+      "loss": 0.4562,
+      "step": 1585
+    },
+    {
+      "epoch": 3.390191897654584,
+      "grad_norm": 0.3330647349357605,
+      "learning_rate": 0.00019989695566630152,
+      "loss": 0.4445,
+      "step": 1590
+    },
+    {
+      "epoch": 3.4008528784648187,
+      "grad_norm": 0.26299235224723816,
+      "learning_rate": 0.00019989124893914046,
+      "loss": 0.4488,
+      "step": 1595
+    },
+    {
+      "epoch": 3.411513859275053,
+      "grad_norm": 0.299434095621109,
+      "learning_rate": 0.0001998853885048798,
+      "loss": 0.4563,
+      "step": 1600
+    },
+    {
+      "epoch": 3.4221748400852876,
+      "grad_norm": 0.23711760342121124,
+      "learning_rate": 0.0001998793743725372,
+      "loss": 0.4473,
+      "step": 1605
+    },
+    {
+      "epoch": 3.4328358208955225,
+      "grad_norm": 0.24863874912261963,
+      "learning_rate": 0.00019987320655136693,
+      "loss": 0.4574,
+      "step": 1610
+    },
+    {
+      "epoch": 3.443496801705757,
+      "grad_norm": 0.24471955001354218,
+      "learning_rate": 0.00019986688505085957,
+      "loss": 0.4665,
+      "step": 1615
+    },
+    {
+      "epoch": 3.4541577825159915,
+      "grad_norm": 0.2540249526500702,
+      "learning_rate": 0.00019986040988074238,
+      "loss": 0.4689,
+      "step": 1620
+    },
+    {
+      "epoch": 3.464818763326226,
+      "grad_norm": 0.2666712701320648,
+      "learning_rate": 0.00019985378105097902,
+      "loss": 0.4477,
+      "step": 1625
+    },
+    {
+      "epoch": 3.4754797441364604,
+      "grad_norm": 0.27709081768989563,
+      "learning_rate": 0.0001998469985717695,
+      "loss": 0.4403,
+      "step": 1630
+    },
+    {
+      "epoch": 3.486140724946695,
+      "grad_norm": 0.27587834000587463,
+      "learning_rate": 0.00019984006245355037,
+      "loss": 0.4565,
+      "step": 1635
+    },
+    {
+      "epoch": 3.49680170575693,
+      "grad_norm": 0.22859402000904083,
+      "learning_rate": 0.00019983297270699448,
+      "loss": 0.4514,
+      "step": 1640
+    },
+    {
+      "epoch": 3.5074626865671643,
+      "grad_norm": 0.3489368259906769,
+      "learning_rate": 0.00019982572934301122,
+      "loss": 0.4727,
+      "step": 1645
+    },
+    {
+      "epoch": 3.518123667377399,
+      "grad_norm": 0.2632017135620117,
+      "learning_rate": 0.00019981833237274618,
+      "loss": 0.4415,
+      "step": 1650
+    },
+    {
+      "epoch": 3.5287846481876333,
+      "grad_norm": 0.27099326252937317,
+      "learning_rate": 0.00019981078180758154,
+      "loss": 0.4489,
+      "step": 1655
+    },
+    {
+      "epoch": 3.5394456289978677,
+      "grad_norm": 0.2415977120399475,
+      "learning_rate": 0.00019980307765913552,
+      "loss": 0.4764,
+      "step": 1660
+    },
+    {
+      "epoch": 3.550106609808102,
+      "grad_norm": 0.23986046016216278,
+      "learning_rate": 0.000199795219939263,
+      "loss": 0.4458,
+      "step": 1665
+    },
+    {
+      "epoch": 3.5607675906183367,
+      "grad_norm": 0.28455114364624023,
+      "learning_rate": 0.00019978720866005488,
+      "loss": 0.4846,
+      "step": 1670
+    },
+    {
+      "epoch": 3.571428571428571,
+      "grad_norm": 0.2913159430027008,
+      "learning_rate": 0.0001997790438338385,
+      "loss": 0.4547,
+      "step": 1675
+    },
+    {
+      "epoch": 3.582089552238806,
+      "grad_norm": 0.25150275230407715,
+      "learning_rate": 0.0001997707254731775,
+      "loss": 0.4599,
+      "step": 1680
+    },
+    {
+      "epoch": 3.5927505330490406,
+      "grad_norm": 0.23482745885849,
+      "learning_rate": 0.00019976225359087164,
+      "loss": 0.4315,
+      "step": 1685
+    },
+    {
+      "epoch": 3.603411513859275,
+      "grad_norm": 0.23308737576007843,
+      "learning_rate": 0.00019975362819995703,
+      "loss": 0.449,
+      "step": 1690
+    },
+    {
+      "epoch": 3.6140724946695095,
+      "grad_norm": 0.2528814375400543,
+      "learning_rate": 0.00019974484931370592,
+      "loss": 0.4392,
+      "step": 1695
+    },
+    {
+      "epoch": 3.624733475479744,
+      "grad_norm": 0.25079530477523804,
+      "learning_rate": 0.00019973591694562678,
+      "loss": 0.4536,
+      "step": 1700
+    },
+    {
+      "epoch": 3.635394456289979,
+      "grad_norm": 0.2929099202156067,
+      "learning_rate": 0.00019972683110946421,
+      "loss": 0.4426,
+      "step": 1705
+    },
+    {
+      "epoch": 3.6460554371002134,
+      "grad_norm": 0.23356157541275024,
+      "learning_rate": 0.00019971759181919903,
+      "loss": 0.4602,
+      "step": 1710
+    },
+    {
+      "epoch": 3.656716417910448,
+      "grad_norm": 0.3128319978713989,
+      "learning_rate": 0.00019970819908904814,
+      "loss": 0.4629,
+      "step": 1715
+    },
+    {
+      "epoch": 3.6673773987206824,
+      "grad_norm": 0.23164990544319153,
+      "learning_rate": 0.00019969865293346454,
+      "loss": 0.4662,
+      "step": 1720
+    },
+    {
+      "epoch": 3.678038379530917,
+      "grad_norm": 0.43762582540512085,
+      "learning_rate": 0.00019968895336713733,
+      "loss": 0.4685,
+      "step": 1725
+    },
+    {
+      "epoch": 3.6886993603411513,
+      "grad_norm": 0.34830760955810547,
+      "learning_rate": 0.00019967910040499164,
+      "loss": 0.4504,
+      "step": 1730
+    },
+    {
+      "epoch": 3.699360341151386,
+      "grad_norm": 0.2538786828517914,
+      "learning_rate": 0.00019966909406218868,
+      "loss": 0.4967,
+      "step": 1735
+    },
+    {
+      "epoch": 3.7100213219616203,
+      "grad_norm": 0.23103195428848267,
+      "learning_rate": 0.0001996589343541257,
+      "loss": 0.4556,
+      "step": 1740
+    },
+    {
+      "epoch": 3.7206823027718547,
+      "grad_norm": 0.2618430554866791,
+      "learning_rate": 0.0001996486212964358,
+      "loss": 0.4453,
+      "step": 1745
+    },
+    {
+      "epoch": 3.7313432835820897,
+      "grad_norm": 0.23393474519252777,
+      "learning_rate": 0.00019963815490498817,
+      "loss": 0.4613,
+      "step": 1750
+    },
+    {
+      "epoch": 3.742004264392324,
+      "grad_norm": 0.2798391282558441,
+      "learning_rate": 0.00019962753519588798,
+      "loss": 0.4668,
+      "step": 1755
+    },
+    {
+      "epoch": 3.7526652452025586,
+      "grad_norm": 0.24927425384521484,
+      "learning_rate": 0.00019961676218547617,
+      "loss": 0.4424,
+      "step": 1760
+    },
+    {
+      "epoch": 3.763326226012793,
+      "grad_norm": 0.2537556290626526,
+      "learning_rate": 0.00019960583589032966,
+      "loss": 0.4413,
+      "step": 1765
+    },
+    {
+      "epoch": 3.7739872068230276,
+      "grad_norm": 0.2401181310415268,
+      "learning_rate": 0.00019959475632726128,
+      "loss": 0.4365,
+      "step": 1770
+    },
+    {
+      "epoch": 3.7846481876332625,
+      "grad_norm": 0.22927629947662354,
+      "learning_rate": 0.00019958352351331956,
+      "loss": 0.4455,
+      "step": 1775
+    },
+    {
+      "epoch": 3.795309168443497,
+      "grad_norm": 0.21933622658252716,
+      "learning_rate": 0.00019957213746578902,
+      "loss": 0.4661,
+      "step": 1780
+    },
+    {
+      "epoch": 3.8059701492537314,
+      "grad_norm": 0.28884589672088623,
+      "learning_rate": 0.00019956059820218982,
+      "loss": 0.4931,
+      "step": 1785
+    },
+    {
+      "epoch": 3.816631130063966,
+      "grad_norm": 0.2619436979293823,
+      "learning_rate": 0.00019954890574027797,
+      "loss": 0.4446,
+      "step": 1790
+    },
+    {
+      "epoch": 3.8272921108742004,
+      "grad_norm": 0.22175399959087372,
+      "learning_rate": 0.00019953706009804512,
+      "loss": 0.4482,
+      "step": 1795
+    },
+    {
+      "epoch": 3.837953091684435,
+      "grad_norm": 0.23060369491577148,
+      "learning_rate": 0.00019952506129371873,
+      "loss": 0.451,
+      "step": 1800
+    },
+    {
+      "epoch": 3.8486140724946694,
+      "grad_norm": 0.2313724309206009,
+      "learning_rate": 0.0001995129093457619,
+      "loss": 0.4496,
+      "step": 1805
+    },
+    {
+      "epoch": 3.859275053304904,
+      "grad_norm": 0.23518264293670654,
+      "learning_rate": 0.00019950060427287335,
+      "loss": 0.4581,
+      "step": 1810
+    },
+    {
+      "epoch": 3.8699360341151388,
+      "grad_norm": 0.22398614883422852,
+      "learning_rate": 0.00019948814609398746,
+      "loss": 0.4382,
+      "step": 1815
+    },
+    {
+      "epoch": 3.8805970149253732,
+      "grad_norm": 0.21408702433109283,
+      "learning_rate": 0.00019947553482827418,
+      "loss": 0.4517,
+      "step": 1820
+    },
+    {
+      "epoch": 3.8912579957356077,
+      "grad_norm": 0.26791512966156006,
+      "learning_rate": 0.00019946277049513904,
+      "loss": 0.4671,
+      "step": 1825
+    },
+    {
+      "epoch": 3.901918976545842,
+      "grad_norm": 0.37972912192344666,
+      "learning_rate": 0.00019944985311422304,
+      "loss": 0.4665,
+      "step": 1830
+    },
+    {
+      "epoch": 3.9125799573560767,
+      "grad_norm": 0.2744680941104889,
+      "learning_rate": 0.00019943678270540276,
+      "loss": 0.4627,
+      "step": 1835
+    },
+    {
+      "epoch": 3.923240938166311,
+      "grad_norm": 0.3253777325153351,
+      "learning_rate": 0.00019942355928879023,
+      "loss": 0.468,
+      "step": 1840
+    },
+    {
+      "epoch": 3.933901918976546,
+      "grad_norm": 0.32431936264038086,
+      "learning_rate": 0.00019941018288473285,
+      "loss": 0.4497,
+      "step": 1845
+    },
+    {
+      "epoch": 3.9445628997867805,
+      "grad_norm": 0.2247323989868164,
+      "learning_rate": 0.00019939665351381355,
+      "loss": 0.4444,
+      "step": 1850
+    },
+    {
+      "epoch": 3.955223880597015,
+      "grad_norm": 0.35610342025756836,
+      "learning_rate": 0.00019938297119685054,
+      "loss": 0.4563,
+      "step": 1855
+    },
+    {
+      "epoch": 3.9658848614072495,
+      "grad_norm": 0.2513818144798279,
+      "learning_rate": 0.00019936913595489743,
+      "loss": 0.442,
+      "step": 1860
+    },
+    {
+      "epoch": 3.976545842217484,
+      "grad_norm": 0.3135777711868286,
+      "learning_rate": 0.0001993551478092431,
+      "loss": 0.4377,
+      "step": 1865
+    },
+    {
+      "epoch": 3.9872068230277184,
+      "grad_norm": 0.24127310514450073,
+      "learning_rate": 0.0001993410067814118,
+      "loss": 0.4478,
+      "step": 1870
+    },
+    {
+      "epoch": 3.997867803837953,
+      "grad_norm": 0.23388491570949554,
+      "learning_rate": 0.00019932671289316282,
+      "loss": 0.4306,
+      "step": 1875
+    },
+    {
+      "epoch": 4.0,
+      "eval_loss": 0.5043795108795166,
+      "eval_runtime": 377.5601,
+      "eval_samples_per_second": 1.091,
+      "eval_steps_per_second": 1.091,
+      "step": 1876
+    },
+    {
+      "epoch": 4.008528784648187,
+      "grad_norm": 0.3674967288970947,
+      "learning_rate": 0.0001993122661664909,
+      "loss": 0.4371,
+      "step": 1880
+    },
+    {
+      "epoch": 4.019189765458422,
+      "grad_norm": 0.2773316204547882,
+      "learning_rate": 0.00019929766662362585,
+      "loss": 0.4043,
+      "step": 1885
+    },
+    {
+      "epoch": 4.029850746268656,
+      "grad_norm": 0.2394101619720459,
+      "learning_rate": 0.00019928291428703262,
+      "loss": 0.413,
+      "step": 1890
+    },
+    {
+      "epoch": 4.040511727078891,
+      "grad_norm": 0.23238113522529602,
+      "learning_rate": 0.00019926800917941128,
+      "loss": 0.4021,
+      "step": 1895
+    },
+    {
+      "epoch": 4.051172707889126,
+      "grad_norm": 0.22244401276111603,
+      "learning_rate": 0.000199252951323697,
+      "loss": 0.4101,
+      "step": 1900
+    },
+    {
+      "epoch": 4.061833688699361,
+      "grad_norm": 0.24964463710784912,
+      "learning_rate": 0.00019923774074306,
+      "loss": 0.4123,
+      "step": 1905
+    },
+    {
+      "epoch": 4.072494669509595,
+      "grad_norm": 0.23066940903663635,
+      "learning_rate": 0.00019922237746090537,
+      "loss": 0.4267,
+      "step": 1910
+    },
+    {
+      "epoch": 4.08315565031983,
+      "grad_norm": 0.23452460765838623,
+      "learning_rate": 0.00019920686150087336,
+      "loss": 0.4223,
+      "step": 1915
+    },
+    {
+      "epoch": 4.093816631130064,
+      "grad_norm": 0.3032955527305603,
+      "learning_rate": 0.00019919119288683908,
+      "loss": 0.432,
+      "step": 1920
+    },
+    {
+      "epoch": 4.104477611940299,
+      "grad_norm": 0.3310707211494446,
+      "learning_rate": 0.00019917537164291244,
+      "loss": 0.42,
+      "step": 1925
+    },
+    {
+      "epoch": 4.115138592750533,
+      "grad_norm": 0.24135416746139526,
+      "learning_rate": 0.00019915939779343838,
+      "loss": 0.4289,
+      "step": 1930
+    },
+    {
+      "epoch": 4.1257995735607675,
+      "grad_norm": 0.23443254828453064,
+      "learning_rate": 0.00019914327136299651,
+      "loss": 0.4216,
+      "step": 1935
+    },
+    {
+      "epoch": 4.136460554371002,
+      "grad_norm": 0.3196619749069214,
+      "learning_rate": 0.0001991269923764013,
+      "loss": 0.4387,
+      "step": 1940
+    },
+    {
+      "epoch": 4.1471215351812365,
+      "grad_norm": 0.2881762981414795,
+      "learning_rate": 0.00019911056085870197,
+      "loss": 0.4176,
+      "step": 1945
+    },
+    {
+      "epoch": 4.157782515991471,
+      "grad_norm": 0.25249961018562317,
+      "learning_rate": 0.00019909397683518242,
+      "loss": 0.4221,
+      "step": 1950
+    },
+    {
+      "epoch": 4.1684434968017055,
+      "grad_norm": 0.22756356000900269,
+      "learning_rate": 0.00019907724033136118,
+      "loss": 0.413,
+      "step": 1955
+    },
+    {
+      "epoch": 4.17910447761194,
+      "grad_norm": 0.24332334101200104,
+      "learning_rate": 0.0001990603513729915,
+      "loss": 0.4218,
+      "step": 1960
+    },
+    {
+      "epoch": 4.189765458422174,
+      "grad_norm": 0.23593220114707947,
+      "learning_rate": 0.00019904330998606116,
+      "loss": 0.4114,
+      "step": 1965
+    },
+    {
+      "epoch": 4.20042643923241,
+      "grad_norm": 0.266313374042511,
+      "learning_rate": 0.00019902611619679252,
+      "loss": 0.4309,
+      "step": 1970
+    },
+    {
+      "epoch": 4.211087420042644,
+      "grad_norm": 0.3359983563423157,
+      "learning_rate": 0.00019900877003164235,
+      "loss": 0.4339,
+      "step": 1975
+    },
+    {
+      "epoch": 4.221748400852879,
+      "grad_norm": 0.22711415588855743,
+      "learning_rate": 0.00019899127151730206,
+      "loss": 0.4165,
+      "step": 1980
+    },
+    {
+      "epoch": 4.232409381663113,
+      "grad_norm": 0.2225334793329239,
+      "learning_rate": 0.00019897362068069732,
+      "loss": 0.4094,
+      "step": 1985
+    },
+    {
+      "epoch": 4.243070362473348,
+      "grad_norm": 0.2701500356197357,
+      "learning_rate": 0.0001989558175489883,
+      "loss": 0.4239,
+      "step": 1990
+    },
+    {
+      "epoch": 4.253731343283582,
+      "grad_norm": 0.2480495721101761,
+      "learning_rate": 0.00019893786214956945,
+      "loss": 0.4137,
+      "step": 1995
+    },
+    {
+      "epoch": 4.264392324093817,
+      "grad_norm": 0.22299885749816895,
+      "learning_rate": 0.00019891975451006953,
+      "loss": 0.4273,
+      "step": 2000
+    },
+    {
+      "epoch": 4.275053304904051,
+      "grad_norm": 0.2259630262851715,
+      "learning_rate": 0.0001989014946583516,
+      "loss": 0.4223,
+      "step": 2005
+    },
+    {
+      "epoch": 4.285714285714286,
+      "grad_norm": 0.3351574242115021,
+      "learning_rate": 0.00019888308262251285,
+      "loss": 0.4483,
+      "step": 2010
+    },
+    {
+      "epoch": 4.29637526652452,
+      "grad_norm": 0.21363438665866852,
+      "learning_rate": 0.0001988645184308848,
+      "loss": 0.4138,
+      "step": 2015
+    },
+    {
+      "epoch": 4.3070362473347545,
+      "grad_norm": 0.2409023493528366,
+      "learning_rate": 0.00019884580211203287,
+      "loss": 0.4166,
+      "step": 2020
+    },
+    {
+      "epoch": 4.317697228144989,
+      "grad_norm": 0.24684803187847137,
+      "learning_rate": 0.00019882693369475675,
+      "loss": 0.4089,
+      "step": 2025
+    },
+    {
+      "epoch": 4.3283582089552235,
+      "grad_norm": 0.24175861477851868,
+      "learning_rate": 0.0001988079132080901,
+      "loss": 0.4169,
+      "step": 2030
+    },
+    {
+      "epoch": 4.339019189765459,
+      "grad_norm": 0.3582640290260315,
+      "learning_rate": 0.00019878874068130062,
+      "loss": 0.4207,
+      "step": 2035
+    },
+    {
+      "epoch": 4.349680170575693,
+      "grad_norm": 0.23563334345817566,
+      "learning_rate": 0.00019876941614388992,
+      "loss": 0.4056,
+      "step": 2040
+    },
+    {
+      "epoch": 4.360341151385928,
+      "grad_norm": 0.24959246814250946,
+      "learning_rate": 0.0001987499396255935,
+      "loss": 0.4152,
+      "step": 2045
+    },
+    {
+      "epoch": 4.371002132196162,
+      "grad_norm": 0.2378864586353302,
+      "learning_rate": 0.00019873031115638073,
+      "loss": 0.428,
+      "step": 2050
+    },
+    {
+      "epoch": 4.381663113006397,
+      "grad_norm": 0.25769662857055664,
+      "learning_rate": 0.00019871053076645488,
+      "loss": 0.4273,
+      "step": 2055
+    },
+    {
+      "epoch": 4.392324093816631,
+      "grad_norm": 0.2148350328207016,
+      "learning_rate": 0.0001986905984862528,
+      "loss": 0.4341,
+      "step": 2060
+    },
+    {
+      "epoch": 4.402985074626866,
+      "grad_norm": 0.22630667686462402,
+      "learning_rate": 0.0001986705143464453,
+      "loss": 0.43,
+      "step": 2065
+    },
+    {
+      "epoch": 4.4136460554371,
+      "grad_norm": 0.23718136548995972,
+      "learning_rate": 0.00019865027837793665,
+      "loss": 0.4193,
+      "step": 2070
+    },
+    {
+      "epoch": 4.424307036247335,
+      "grad_norm": 0.26240232586860657,
+      "learning_rate": 0.00019862989061186483,
+      "loss": 0.4327,
+      "step": 2075
+    },
+    {
+      "epoch": 4.434968017057569,
+      "grad_norm": 0.21503274142742157,
+      "learning_rate": 0.0001986093510796015,
+      "loss": 0.4208,
+      "step": 2080
+    },
+    {
+      "epoch": 4.445628997867804,
+      "grad_norm": 0.31747710704803467,
+      "learning_rate": 0.0001985886598127516,
+      "loss": 0.4348,
+      "step": 2085
+    },
+    {
+      "epoch": 4.456289978678038,
+      "grad_norm": 0.24618090689182281,
+      "learning_rate": 0.00019856781684315382,
+      "loss": 0.4247,
+      "step": 2090
+    },
+    {
+      "epoch": 4.466950959488273,
+      "grad_norm": 0.33112359046936035,
+      "learning_rate": 0.00019854682220288013,
+      "loss": 0.4175,
+      "step": 2095
+    },
+    {
+      "epoch": 4.477611940298507,
+      "grad_norm": 0.23943935334682465,
+      "learning_rate": 0.0001985256759242359,
+      "loss": 0.4271,
+      "step": 2100
+    },
+    {
+      "epoch": 4.4882729211087415,
+      "grad_norm": 0.24192848801612854,
+      "learning_rate": 0.00019850437803975988,
+      "loss": 0.4221,
+      "step": 2105
+    },
+    {
+      "epoch": 4.498933901918977,
+      "grad_norm": 0.22631579637527466,
+      "learning_rate": 0.00019848292858222401,
+      "loss": 0.4233,
+      "step": 2110
+    },
+    {
+      "epoch": 4.509594882729211,
+      "grad_norm": 0.23344965279102325,
+      "learning_rate": 0.00019846132758463356,
+      "loss": 0.4161,
+      "step": 2115
+    },
+    {
+      "epoch": 4.520255863539446,
+      "grad_norm": 0.22698044776916504,
+      "learning_rate": 0.000198439575080227,
+      "loss": 0.4112,
+      "step": 2120
+    },
+    {
+      "epoch": 4.53091684434968,
+      "grad_norm": 0.3037104308605194,
+      "learning_rate": 0.00019841767110247575,
+      "loss": 0.4362,
+      "step": 2125
+    },
+    {
+      "epoch": 4.541577825159915,
+      "grad_norm": 0.24173210561275482,
+      "learning_rate": 0.00019839561568508454,
+      "loss": 0.4223,
+      "step": 2130
+    },
+    {
+      "epoch": 4.552238805970149,
+      "grad_norm": 0.2352645844221115,
+      "learning_rate": 0.00019837340886199096,
+      "loss": 0.4274,
+      "step": 2135
+    },
+    {
+      "epoch": 4.562899786780384,
+      "grad_norm": 0.2779860496520996,
+      "learning_rate": 0.0001983510506673657,
+      "loss": 0.4316,
+      "step": 2140
+    },
+    {
+      "epoch": 4.573560767590618,
+      "grad_norm": 0.24002455174922943,
+      "learning_rate": 0.0001983285411356122,
+      "loss": 0.4159,
+      "step": 2145
+    },
+    {
+      "epoch": 4.584221748400853,
+      "grad_norm": 0.22028042376041412,
+      "learning_rate": 0.00019830588030136698,
+      "loss": 0.4296,
+      "step": 2150
+    },
+    {
+      "epoch": 4.594882729211087,
+      "grad_norm": 0.3180830776691437,
+      "learning_rate": 0.0001982830681994992,
+      "loss": 0.4339,
+      "step": 2155
+    },
+    {
+      "epoch": 4.605543710021322,
+      "grad_norm": 0.2228025496006012,
+      "learning_rate": 0.00019826010486511091,
+      "loss": 0.4149,
+      "step": 2160
+    },
+    {
+      "epoch": 4.616204690831556,
+      "grad_norm": 0.2128361463546753,
+      "learning_rate": 0.00019823699033353677,
+      "loss": 0.4126,
+      "step": 2165
+    },
+    {
+      "epoch": 4.6268656716417915,
+      "grad_norm": 0.2322179228067398,
+      "learning_rate": 0.00019821372464034416,
+      "loss": 0.4128,
+      "step": 2170
+    },
+    {
+      "epoch": 4.637526652452026,
+      "grad_norm": 0.30600860714912415,
+      "learning_rate": 0.00019819030782133304,
+      "loss": 0.414,
+      "step": 2175
+    },
+    {
+      "epoch": 4.6481876332622605,
+      "grad_norm": 0.22045232355594635,
+      "learning_rate": 0.00019816673991253586,
+      "loss": 0.409,
+      "step": 2180
+    },
+    {
+      "epoch": 4.658848614072495,
+      "grad_norm": 0.2302045375108719,
+      "learning_rate": 0.00019814302095021768,
+      "loss": 0.4199,
+      "step": 2185
+    },
+    {
+      "epoch": 4.669509594882729,
+      "grad_norm": 0.22577248513698578,
+      "learning_rate": 0.00019811915097087587,
+      "loss": 0.4058,
+      "step": 2190
+    },
+    {
+      "epoch": 4.680170575692964,
+      "grad_norm": 0.6790816187858582,
+      "learning_rate": 0.00019809513001124024,
+      "loss": 0.4356,
+      "step": 2195
+    },
+    {
+      "epoch": 4.690831556503198,
+      "grad_norm": 0.2510231137275696,
+      "learning_rate": 0.00019807095810827293,
+      "loss": 0.4062,
+      "step": 2200
+    },
+    {
+      "epoch": 4.701492537313433,
+      "grad_norm": 0.24071648716926575,
+      "learning_rate": 0.00019804663529916826,
+      "loss": 0.4282,
+      "step": 2205
+    },
+    {
+      "epoch": 4.712153518123667,
+      "grad_norm": 0.2886710464954376,
+      "learning_rate": 0.00019802216162135287,
+      "loss": 0.4254,
+      "step": 2210
+    },
+    {
+      "epoch": 4.722814498933902,
+      "grad_norm": 0.2941761910915375,
+      "learning_rate": 0.0001979975371124855,
+      "loss": 0.4343,
+      "step": 2215
+    },
+    {
+      "epoch": 4.733475479744136,
+      "grad_norm": 0.2591281533241272,
+      "learning_rate": 0.00019797276181045693,
+      "loss": 0.4165,
+      "step": 2220
+    },
+    {
+      "epoch": 4.744136460554371,
+      "grad_norm": 0.2245703637599945,
+      "learning_rate": 0.00019794783575339004,
+      "loss": 0.4112,
+      "step": 2225
+    },
+    {
+      "epoch": 4.754797441364605,
+      "grad_norm": 0.48405957221984863,
+      "learning_rate": 0.00019792275897963967,
+      "loss": 0.4279,
+      "step": 2230
+    },
+    {
+      "epoch": 4.76545842217484,
+      "grad_norm": 0.22091209888458252,
+      "learning_rate": 0.00019789753152779258,
+      "loss": 0.4371,
+      "step": 2235
+    },
+    {
+      "epoch": 4.776119402985074,
+      "grad_norm": 0.23672465980052948,
+      "learning_rate": 0.00019787215343666732,
+      "loss": 0.4166,
+      "step": 2240
+    },
+    {
+      "epoch": 4.786780383795309,
+      "grad_norm": 0.43999361991882324,
+      "learning_rate": 0.0001978466247453143,
+      "loss": 0.4167,
+      "step": 2245
+    },
+    {
+      "epoch": 4.797441364605544,
+      "grad_norm": 0.2732659578323364,
+      "learning_rate": 0.0001978209454930157,
+      "loss": 0.4326,
+      "step": 2250
+    },
+    {
+      "epoch": 4.8081023454157785,
+      "grad_norm": 0.27667996287345886,
+      "learning_rate": 0.00019779511571928527,
+      "loss": 0.4192,
+      "step": 2255
+    },
+    {
+      "epoch": 4.818763326226013,
+      "grad_norm": 0.24479329586029053,
+      "learning_rate": 0.00019776913546386843,
+      "loss": 0.4158,
+      "step": 2260
+    },
+    {
+      "epoch": 4.8294243070362475,
+      "grad_norm": 0.21344681084156036,
+      "learning_rate": 0.0001977430047667422,
+      "loss": 0.4112,
+      "step": 2265
+    },
+    {
+      "epoch": 4.840085287846482,
+      "grad_norm": 0.24819132685661316,
+      "learning_rate": 0.00019771672366811503,
+      "loss": 0.414,
+      "step": 2270
+    },
+    {
+      "epoch": 4.850746268656716,
+      "grad_norm": 0.2435145080089569,
+      "learning_rate": 0.00019769029220842677,
+      "loss": 0.4172,
+      "step": 2275
+    },
+    {
+      "epoch": 4.861407249466951,
+      "grad_norm": 0.21831800043582916,
+      "learning_rate": 0.0001976637104283487,
+      "loss": 0.4168,
+      "step": 2280
+    },
+    {
+      "epoch": 4.872068230277185,
+      "grad_norm": 0.3001014292240143,
+      "learning_rate": 0.00019763697836878343,
+      "loss": 0.4271,
+      "step": 2285
+    },
+    {
+      "epoch": 4.88272921108742,
+      "grad_norm": 0.3473288118839264,
+      "learning_rate": 0.00019761009607086472,
+      "loss": 0.4256,
+      "step": 2290
+    },
+    {
+      "epoch": 4.893390191897654,
+      "grad_norm": 0.2094939649105072,
+      "learning_rate": 0.00019758306357595755,
+      "loss": 0.4207,
+      "step": 2295
+    },
+    {
+      "epoch": 4.904051172707889,
+      "grad_norm": 0.224636048078537,
+      "learning_rate": 0.00019755588092565805,
+      "loss": 0.4214,
+      "step": 2300
+    },
+    {
+      "epoch": 4.914712153518123,
+      "grad_norm": 0.22260229289531708,
+      "learning_rate": 0.00019752854816179336,
+      "loss": 0.4226,
+      "step": 2305
+    },
+    {
+      "epoch": 4.925373134328359,
+      "grad_norm": 0.21004381775856018,
+      "learning_rate": 0.0001975010653264216,
+      "loss": 0.414,
+      "step": 2310
+    },
+    {
+      "epoch": 4.936034115138593,
+      "grad_norm": 0.2120514214038849,
+      "learning_rate": 0.00019747343246183185,
+      "loss": 0.4152,
+      "step": 2315
+    },
+    {
+      "epoch": 4.946695095948828,
+      "grad_norm": 0.2152203619480133,
+      "learning_rate": 0.00019744564961054402,
+      "loss": 0.4159,
+      "step": 2320
+    },
+    {
+      "epoch": 4.957356076759062,
+      "grad_norm": 0.22371242940425873,
+      "learning_rate": 0.0001974177168153088,
+      "loss": 0.4095,
+      "step": 2325
+    },
+    {
+      "epoch": 4.968017057569297,
+      "grad_norm": 0.21865862607955933,
+      "learning_rate": 0.00019738963411910766,
+      "loss": 0.4261,
+      "step": 2330
+    },
+    {
+      "epoch": 4.978678038379531,
+      "grad_norm": 0.3230665326118469,
+      "learning_rate": 0.0001973614015651527,
+      "loss": 0.4116,
+      "step": 2335
+    },
+    {
+      "epoch": 4.9893390191897655,
+      "grad_norm": 0.21557492017745972,
+      "learning_rate": 0.00019733301919688651,
+      "loss": 0.4161,
+      "step": 2340
+    },
+    {
+      "epoch": 5.0,
+      "grad_norm": 0.21153585612773895,
+      "learning_rate": 0.00019730448705798239,
+      "loss": 0.4128,
+      "step": 2345
+    },
+    {
+      "epoch": 5.0,
+      "eval_loss": 0.5016890168190002,
+      "eval_runtime": 377.5434,
+      "eval_samples_per_second": 1.091,
+      "eval_steps_per_second": 1.091,
+      "step": 2345
+    },
+    {
+      "epoch": 5.0106609808102345,
+      "grad_norm": 0.20196357369422913,
+      "learning_rate": 0.000197275805192344,
+      "loss": 0.3909,
+      "step": 2350
+    },
+    {
+      "epoch": 5.021321961620469,
+      "grad_norm": 0.2446993738412857,
+      "learning_rate": 0.00019724697364410535,
+      "loss": 0.3876,
+      "step": 2355
+    },
+    {
+      "epoch": 5.031982942430703,
+      "grad_norm": 0.22501204907894135,
+      "learning_rate": 0.00019721799245763088,
+      "loss": 0.3882,
+      "step": 2360
+    },
+    {
+      "epoch": 5.042643923240938,
+      "grad_norm": 0.23419953882694244,
+      "learning_rate": 0.0001971888616775152,
+      "loss": 0.3786,
+      "step": 2365
+    },
+    {
+      "epoch": 5.053304904051172,
+      "grad_norm": 0.23151536285877228,
+      "learning_rate": 0.00019715958134858315,
+      "loss": 0.3925,
+      "step": 2370
+    },
+    {
+      "epoch": 5.063965884861407,
+      "grad_norm": 0.23873166739940643,
+      "learning_rate": 0.00019713015151588966,
+      "loss": 0.3927,
+      "step": 2375
+    },
+    {
+      "epoch": 5.074626865671641,
+      "grad_norm": 0.23083342611789703,
+      "learning_rate": 0.00019710057222471967,
+      "loss": 0.3836,
+      "step": 2380
+    },
+    {
+      "epoch": 5.085287846481877,
+      "grad_norm": 0.22406326234340668,
+      "learning_rate": 0.00019707084352058827,
+      "loss": 0.389,
+      "step": 2385
+    },
+    {
+      "epoch": 5.095948827292111,
+      "grad_norm": 0.37570300698280334,
+      "learning_rate": 0.00019704096544924022,
+      "loss": 0.3999,
+      "step": 2390
+    },
+    {
+      "epoch": 5.106609808102346,
+      "grad_norm": 0.21594493091106415,
+      "learning_rate": 0.0001970109380566503,
+      "loss": 0.38,
+      "step": 2395
+    },
+    {
+      "epoch": 5.11727078891258,
+      "grad_norm": 0.2725168466567993,
+      "learning_rate": 0.00019698076138902298,
+      "loss": 0.3848,
+      "step": 2400
+    },
+    {
+      "epoch": 5.127931769722815,
+      "grad_norm": 0.2510855495929718,
+      "learning_rate": 0.00019695043549279243,
+      "loss": 0.3859,
+      "step": 2405
+    },
+    {
+      "epoch": 5.138592750533049,
+      "grad_norm": 0.23722735047340393,
+      "learning_rate": 0.00019691996041462244,
+      "loss": 0.3876,
+      "step": 2410
+    },
+    {
+      "epoch": 5.149253731343284,
+      "grad_norm": 0.35469353199005127,
+      "learning_rate": 0.00019688933620140637,
+      "loss": 0.3863,
+      "step": 2415
+    },
+    {
+      "epoch": 5.159914712153518,
+      "grad_norm": 0.23087090253829956,
+      "learning_rate": 0.0001968585629002671,
+      "loss": 0.3898,
+      "step": 2420
+    },
+    {
+      "epoch": 5.1705756929637525,
+      "grad_norm": 0.21194830536842346,
+      "learning_rate": 0.00019682764055855683,
+      "loss": 0.3832,
+      "step": 2425
+    },
+    {
+      "epoch": 5.181236673773987,
+      "grad_norm": 0.23261596262454987,
+      "learning_rate": 0.00019679656922385715,
+      "loss": 0.3895,
+      "step": 2430
+    },
+    {
+      "epoch": 5.1918976545842215,
+      "grad_norm": 0.24160555005073547,
+      "learning_rate": 0.0001967653489439789,
+      "loss": 0.391,
+      "step": 2435
+    },
+    {
+      "epoch": 5.202558635394456,
+      "grad_norm": 0.23709999024868011,
+      "learning_rate": 0.00019673397976696216,
+      "loss": 0.3904,
+      "step": 2440
+    },
+    {
+      "epoch": 5.21321961620469,
+      "grad_norm": 0.2529030740261078,
+      "learning_rate": 0.00019670246174107597,
+      "loss": 0.3853,
+      "step": 2445
+    },
+    {
+      "epoch": 5.223880597014926,
+      "grad_norm": 0.22068992257118225,
+      "learning_rate": 0.0001966707949148186,
+      "loss": 0.3791,
+      "step": 2450
+    },
+    {
+      "epoch": 5.23454157782516,
+      "grad_norm": 0.23219233751296997,
+      "learning_rate": 0.00019663897933691718,
+      "loss": 0.3904,
+      "step": 2455
+    },
+    {
+      "epoch": 5.245202558635395,
+      "grad_norm": 0.25079360604286194,
+      "learning_rate": 0.00019660701505632772,
+      "loss": 0.3995,
+      "step": 2460
+    },
+    {
+      "epoch": 5.255863539445629,
+      "grad_norm": 0.2510697841644287,
+      "learning_rate": 0.00019657490212223515,
+      "loss": 0.3861,
+      "step": 2465
+    },
+    {
+      "epoch": 5.266524520255864,
+      "grad_norm": 0.25218454003334045,
+      "learning_rate": 0.000196542640584053,
+      "loss": 0.3878,
+      "step": 2470
+    },
+    {
+      "epoch": 5.277185501066098,
+      "grad_norm": 0.21124300360679626,
+      "learning_rate": 0.00019651023049142356,
+      "loss": 0.3881,
+      "step": 2475
+    },
+    {
+      "epoch": 5.287846481876333,
+      "grad_norm": 0.23286496102809906,
+      "learning_rate": 0.0001964776718942177,
+      "loss": 0.3893,
+      "step": 2480
+    },
+    {
+      "epoch": 5.298507462686567,
+      "grad_norm": 0.2385607361793518,
+      "learning_rate": 0.00019644496484253474,
+      "loss": 0.381,
+      "step": 2485
+    },
+    {
+      "epoch": 5.309168443496802,
+      "grad_norm": 0.22742030024528503,
+      "learning_rate": 0.00019641210938670247,
+      "loss": 0.393,
+      "step": 2490
+    },
+    {
+      "epoch": 5.319829424307036,
+      "grad_norm": 0.22051115334033966,
+      "learning_rate": 0.00019637910557727706,
+      "loss": 0.3933,
+      "step": 2495
+    },
+    {
+      "epoch": 5.330490405117271,
+      "grad_norm": 0.23317855596542358,
+      "learning_rate": 0.00019634595346504293,
+      "loss": 0.3877,
+      "step": 2500
+    },
+    {
+      "epoch": 5.341151385927505,
+      "grad_norm": 0.23425228893756866,
+      "learning_rate": 0.00019631265310101272,
+      "loss": 0.4158,
+      "step": 2505
+    },
+    {
+      "epoch": 5.3518123667377395,
+      "grad_norm": 0.25701725482940674,
+      "learning_rate": 0.00019627920453642715,
+      "loss": 0.3835,
+      "step": 2510
+    },
+    {
+      "epoch": 5.362473347547974,
+      "grad_norm": 0.23093344271183014,
+      "learning_rate": 0.00019624560782275505,
+      "loss": 0.3846,
+      "step": 2515
+    },
+    {
+      "epoch": 5.373134328358209,
+      "grad_norm": 0.2600732147693634,
+      "learning_rate": 0.00019621186301169315,
+      "loss": 0.3917,
+      "step": 2520
+    },
+    {
+      "epoch": 5.383795309168444,
+      "grad_norm": 0.2647717595100403,
+      "learning_rate": 0.00019617797015516607,
+      "loss": 0.3938,
+      "step": 2525
+    },
+    {
+      "epoch": 5.394456289978678,
+      "grad_norm": 0.24304771423339844,
+      "learning_rate": 0.0001961439293053263,
+      "loss": 0.3925,
+      "step": 2530
+    },
+    {
+      "epoch": 5.405117270788913,
+      "grad_norm": 0.2271909862756729,
+      "learning_rate": 0.00019610974051455398,
+      "loss": 0.3878,
+      "step": 2535
+    },
+    {
+      "epoch": 5.415778251599147,
+      "grad_norm": 0.22085613012313843,
+      "learning_rate": 0.00019607540383545692,
+      "loss": 0.4025,
+      "step": 2540
+    },
+    {
+      "epoch": 5.426439232409382,
+      "grad_norm": 0.2830078899860382,
+      "learning_rate": 0.0001960409193208705,
+      "loss": 0.3935,
+      "step": 2545
+    },
+    {
+      "epoch": 5.437100213219616,
+      "grad_norm": 0.37187430262565613,
+      "learning_rate": 0.00019600628702385751,
+      "loss": 0.3896,
+      "step": 2550
+    },
+    {
+      "epoch": 5.447761194029851,
+      "grad_norm": 0.23631027340888977,
+      "learning_rate": 0.00019597150699770835,
+      "loss": 0.3911,
+      "step": 2555
+    },
+    {
+      "epoch": 5.458422174840085,
+      "grad_norm": 0.224113330245018,
+      "learning_rate": 0.00019593657929594044,
+      "loss": 0.3876,
+      "step": 2560
+    },
+    {
+      "epoch": 5.46908315565032,
+      "grad_norm": 0.29911914467811584,
+      "learning_rate": 0.00019590150397229866,
+      "loss": 0.3966,
+      "step": 2565
+    },
+    {
+      "epoch": 5.479744136460554,
+      "grad_norm": 0.22963348031044006,
+      "learning_rate": 0.000195866281080755,
+      "loss": 0.3931,
+      "step": 2570
+    },
+    {
+      "epoch": 5.490405117270789,
+      "grad_norm": 0.24756336212158203,
+      "learning_rate": 0.0001958309106755084,
+      "loss": 0.3827,
+      "step": 2575
+    },
+    {
+      "epoch": 5.501066098081023,
+      "grad_norm": 0.22494661808013916,
+      "learning_rate": 0.00019579539281098493,
+      "loss": 0.3884,
+      "step": 2580
+    },
+    {
+      "epoch": 5.5117270788912585,
+      "grad_norm": 0.2217581868171692,
+      "learning_rate": 0.00019575972754183748,
+      "loss": 0.3954,
+      "step": 2585
+    },
+    {
+      "epoch": 5.522388059701493,
+      "grad_norm": 0.22264057397842407,
+      "learning_rate": 0.0001957239149229458,
+      "loss": 0.3925,
+      "step": 2590
+    },
+    {
+      "epoch": 5.533049040511727,
+      "grad_norm": 0.24900676310062408,
+      "learning_rate": 0.00019568795500941635,
+      "loss": 0.3938,
+      "step": 2595
+    },
+    {
+      "epoch": 5.543710021321962,
+      "grad_norm": 0.22802846133708954,
+      "learning_rate": 0.00019565184785658223,
+      "loss": 0.3903,
+      "step": 2600
+    },
+    {
+      "epoch": 5.554371002132196,
+      "grad_norm": 0.2182716578245163,
+      "learning_rate": 0.00019561559352000317,
+      "loss": 0.3929,
+      "step": 2605
+    },
+    {
+      "epoch": 5.565031982942431,
+      "grad_norm": 0.23668424785137177,
+      "learning_rate": 0.00019557919205546526,
+      "loss": 0.3815,
+      "step": 2610
+    },
+    {
+      "epoch": 5.575692963752665,
+      "grad_norm": 0.22820915281772614,
+      "learning_rate": 0.0001955426435189811,
+      "loss": 0.3937,
+      "step": 2615
+    },
+    {
+      "epoch": 5.5863539445629,
+      "grad_norm": 0.21698084473609924,
+      "learning_rate": 0.00019550594796678952,
+      "loss": 0.3925,
+      "step": 2620
+    },
+    {
+      "epoch": 5.597014925373134,
+      "grad_norm": 0.22192837297916412,
+      "learning_rate": 0.00019546910545535558,
+      "loss": 0.3858,
+      "step": 2625
+    },
+    {
+      "epoch": 5.607675906183369,
+      "grad_norm": 0.22095522284507751,
+      "learning_rate": 0.00019543211604137052,
+      "loss": 0.3863,
+      "step": 2630
+    },
+    {
+      "epoch": 5.618336886993603,
+      "grad_norm": 0.22427357733249664,
+      "learning_rate": 0.0001953949797817516,
+      "loss": 0.3836,
+      "step": 2635
+    },
+    {
+      "epoch": 5.628997867803838,
+      "grad_norm": 0.23269647359848022,
+      "learning_rate": 0.00019535769673364203,
+      "loss": 0.3913,
+      "step": 2640
+    },
+    {
+      "epoch": 5.639658848614072,
+      "grad_norm": 0.21933898329734802,
+      "learning_rate": 0.00019532026695441083,
+      "loss": 0.3948,
+      "step": 2645
+    },
+    {
+      "epoch": 5.650319829424307,
+      "grad_norm": 0.227766752243042,
+      "learning_rate": 0.00019528269050165297,
+      "loss": 0.3861,
+      "step": 2650
+    },
+    {
+      "epoch": 5.660980810234541,
+      "grad_norm": 0.22262893617153168,
+      "learning_rate": 0.00019524496743318891,
+      "loss": 0.3921,
+      "step": 2655
+    },
+    {
+      "epoch": 5.6716417910447765,
+      "grad_norm": 0.28188657760620117,
+      "learning_rate": 0.00019520709780706486,
+      "loss": 0.3802,
+      "step": 2660
+    },
+    {
+      "epoch": 5.682302771855011,
+      "grad_norm": 0.22414395213127136,
+      "learning_rate": 0.00019516908168155245,
+      "loss": 0.3858,
+      "step": 2665
+    },
+    {
+      "epoch": 5.6929637526652455,
+      "grad_norm": 0.222300723195076,
+      "learning_rate": 0.00019513091911514885,
+      "loss": 0.3886,
+      "step": 2670
+    },
+    {
+      "epoch": 5.70362473347548,
+      "grad_norm": 0.2155119776725769,
+      "learning_rate": 0.00019509261016657643,
+      "loss": 0.3948,
+      "step": 2675
+    },
+    {
+      "epoch": 5.714285714285714,
+      "grad_norm": 0.23029391467571259,
+      "learning_rate": 0.0001950541548947829,
+      "loss": 0.3915,
+      "step": 2680
+    },
+    {
+      "epoch": 5.724946695095949,
+      "grad_norm": 0.23538485169410706,
+      "learning_rate": 0.0001950155533589411,
+      "loss": 0.4005,
+      "step": 2685
+    },
+    {
+      "epoch": 5.735607675906183,
+      "grad_norm": 0.249455988407135,
+      "learning_rate": 0.00019497680561844893,
+      "loss": 0.386,
+      "step": 2690
+    },
+    {
+      "epoch": 5.746268656716418,
+      "grad_norm": 0.21184088289737701,
+      "learning_rate": 0.00019493791173292923,
+      "loss": 0.3931,
+      "step": 2695
+    },
+    {
+      "epoch": 5.756929637526652,
+      "grad_norm": 0.21931645274162292,
+      "learning_rate": 0.00019489887176222975,
+      "loss": 0.3981,
+      "step": 2700
+    },
+    {
+      "epoch": 5.767590618336887,
+      "grad_norm": 0.2259492725133896,
+      "learning_rate": 0.00019485968576642308,
+      "loss": 0.3848,
+      "step": 2705
+    },
+    {
+      "epoch": 5.778251599147121,
+      "grad_norm": 0.23413480818271637,
+      "learning_rate": 0.00019482035380580638,
+      "loss": 0.3875,
+      "step": 2710
+    },
+    {
+      "epoch": 5.788912579957356,
+      "grad_norm": 0.22880232334136963,
+      "learning_rate": 0.00019478087594090155,
+      "loss": 0.3838,
+      "step": 2715
+    },
+    {
+      "epoch": 5.79957356076759,
+      "grad_norm": 0.22865185141563416,
+      "learning_rate": 0.00019474125223245488,
+      "loss": 0.3855,
+      "step": 2720
+    },
+    {
+      "epoch": 5.810234541577826,
+      "grad_norm": 0.24277456104755402,
+      "learning_rate": 0.00019470148274143713,
+      "loss": 0.3938,
+      "step": 2725
+    },
+    {
+      "epoch": 5.82089552238806,
+      "grad_norm": 0.2189398854970932,
+      "learning_rate": 0.00019466156752904343,
+      "loss": 0.4008,
+      "step": 2730
+    },
+    {
+      "epoch": 5.8315565031982945,
+      "grad_norm": 0.21893605589866638,
+      "learning_rate": 0.00019462150665669302,
+      "loss": 0.3874,
+      "step": 2735
+    },
+    {
+      "epoch": 5.842217484008529,
+      "grad_norm": 0.23077057301998138,
+      "learning_rate": 0.00019458130018602945,
+      "loss": 0.3929,
+      "step": 2740
+    },
+    {
+      "epoch": 5.8528784648187635,
+      "grad_norm": 0.2599683701992035,
+      "learning_rate": 0.00019454094817892008,
+      "loss": 0.3892,
+      "step": 2745
+    },
+    {
+      "epoch": 5.863539445628998,
+      "grad_norm": 0.22645121812820435,
+      "learning_rate": 0.00019450045069745642,
+      "loss": 0.3913,
+      "step": 2750
+    },
+    {
+      "epoch": 5.8742004264392325,
+      "grad_norm": 0.22834275662899017,
+      "learning_rate": 0.00019445980780395368,
+      "loss": 0.3958,
+      "step": 2755
+    },
+    {
+      "epoch": 5.884861407249467,
+      "grad_norm": 0.24456727504730225,
+      "learning_rate": 0.00019441901956095093,
+      "loss": 0.3939,
+      "step": 2760
+    },
+    {
+      "epoch": 5.895522388059701,
+      "grad_norm": 0.21773149073123932,
+      "learning_rate": 0.00019437808603121087,
+      "loss": 0.3988,
+      "step": 2765
+    },
+    {
+      "epoch": 5.906183368869936,
+      "grad_norm": 0.21768063306808472,
+      "learning_rate": 0.00019433700727771965,
+      "loss": 0.3894,
+      "step": 2770
+    },
+    {
+      "epoch": 5.91684434968017,
+      "grad_norm": 0.2415178418159485,
+      "learning_rate": 0.00019429578336368708,
+      "loss": 0.3931,
+      "step": 2775
+    },
+    {
+      "epoch": 5.927505330490405,
+      "grad_norm": 0.21271879971027374,
+      "learning_rate": 0.00019425441435254616,
+      "loss": 0.3957,
+      "step": 2780
+    },
+    {
+      "epoch": 5.938166311300639,
+      "grad_norm": 0.21745960414409637,
+      "learning_rate": 0.00019421290030795322,
+      "loss": 0.3948,
+      "step": 2785
+    },
+    {
+      "epoch": 5.948827292110874,
+      "grad_norm": 0.22035416960716248,
+      "learning_rate": 0.0001941712412937878,
+      "loss": 0.3922,
+      "step": 2790
+    },
+    {
+      "epoch": 5.959488272921108,
+      "grad_norm": 0.20828816294670105,
+      "learning_rate": 0.00019412943737415246,
+      "loss": 0.3976,
+      "step": 2795
+    },
+    {
+      "epoch": 5.970149253731344,
+      "grad_norm": 0.19749729335308075,
+      "learning_rate": 0.00019408748861337273,
+      "loss": 0.3994,
+      "step": 2800
+    },
+    {
+      "epoch": 5.980810234541578,
+      "grad_norm": 0.20768584311008453,
+      "learning_rate": 0.00019404539507599707,
+      "loss": 0.3869,
+      "step": 2805
+    },
+    {
+      "epoch": 5.991471215351813,
+      "grad_norm": 0.2182578146457672,
+      "learning_rate": 0.00019400315682679663,
+      "loss": 0.3924,
+      "step": 2810
+    },
+    {
+      "epoch": 6.0,
+      "eval_loss": 0.5093127489089966,
+      "eval_runtime": 377.4947,
+      "eval_samples_per_second": 1.091,
+      "eval_steps_per_second": 1.091,
+      "step": 2814
+    },
+    {
+      "epoch": 6.002132196162047,
+      "grad_norm": 0.21125191450119019,
+      "learning_rate": 0.0001939607739307653,
+      "loss": 0.3874,
+      "step": 2815
+    },
+    {
+      "epoch": 6.0127931769722816,
+      "grad_norm": 0.31068113446235657,
+      "learning_rate": 0.0001939182464531195,
+      "loss": 0.3704,
+      "step": 2820
+    },
+    {
+      "epoch": 6.023454157782516,
+      "grad_norm": 0.23276059329509735,
+      "learning_rate": 0.00019387557445929823,
+      "loss": 0.353,
+      "step": 2825
+    },
+    {
+      "epoch": 6.0341151385927505,
+      "grad_norm": 0.25309714674949646,
+      "learning_rate": 0.00019383275801496268,
+      "loss": 0.3494,
+      "step": 2830
+    },
+    {
+      "epoch": 6.044776119402985,
+      "grad_norm": 0.2310338020324707,
+      "learning_rate": 0.00019378979718599645,
+      "loss": 0.3534,
+      "step": 2835
+    },
+    {
+      "epoch": 6.0554371002132195,
+      "grad_norm": 0.23623259365558624,
+      "learning_rate": 0.00019374669203850532,
+      "loss": 0.3513,
+      "step": 2840
+    },
+    {
+      "epoch": 6.066098081023454,
+      "grad_norm": 0.2299884408712387,
+      "learning_rate": 0.00019370344263881702,
+      "loss": 0.3534,
+      "step": 2845
+    },
+    {
+      "epoch": 6.076759061833688,
+      "grad_norm": 0.5613902807235718,
+      "learning_rate": 0.0001936600490534814,
+      "loss": 0.3615,
+      "step": 2850
+    },
+    {
+      "epoch": 6.087420042643923,
+      "grad_norm": 0.22940614819526672,
+      "learning_rate": 0.00019361651134927003,
+      "loss": 0.3522,
+      "step": 2855
+    },
+    {
+      "epoch": 6.098081023454157,
+      "grad_norm": 0.22831672430038452,
+      "learning_rate": 0.0001935728295931763,
+      "loss": 0.3523,
+      "step": 2860
+    },
+    {
+      "epoch": 6.108742004264393,
+      "grad_norm": 0.23445968329906464,
+      "learning_rate": 0.00019352900385241536,
+      "loss": 0.369,
+      "step": 2865
+    },
+    {
+      "epoch": 6.119402985074627,
+      "grad_norm": 0.2444639503955841,
+      "learning_rate": 0.0001934850341944237,
+      "loss": 0.355,
+      "step": 2870
+    },
+    {
+      "epoch": 6.130063965884862,
+      "grad_norm": 0.2400490790605545,
+      "learning_rate": 0.00019344092068685948,
+      "loss": 0.3625,
+      "step": 2875
+    },
+    {
+      "epoch": 6.140724946695096,
+      "grad_norm": 0.2361455261707306,
+      "learning_rate": 0.00019339666339760207,
+      "loss": 0.3649,
+      "step": 2880
+    },
+    {
+      "epoch": 6.151385927505331,
+      "grad_norm": 0.26625874638557434,
+      "learning_rate": 0.00019335226239475215,
+      "loss": 0.3572,
+      "step": 2885
+    },
+    {
+      "epoch": 6.162046908315565,
+      "grad_norm": 0.2775781750679016,
+      "learning_rate": 0.0001933077177466315,
+      "loss": 0.3446,
+      "step": 2890
+    },
+    {
+      "epoch": 6.1727078891258,
+      "grad_norm": 0.25833654403686523,
+      "learning_rate": 0.00019326302952178294,
+      "loss": 0.3624,
+      "step": 2895
+    },
+    {
+      "epoch": 6.183368869936034,
+      "grad_norm": 0.2403610199689865,
+      "learning_rate": 0.00019321819778897023,
+      "loss": 0.3578,
+      "step": 2900
+    },
+    {
+      "epoch": 6.1940298507462686,
+      "grad_norm": 0.2580753266811371,
+      "learning_rate": 0.00019317322261717794,
+      "loss": 0.3536,
+      "step": 2905
+    },
+    {
+      "epoch": 6.204690831556503,
+      "grad_norm": 0.2725096046924591,
+      "learning_rate": 0.0001931281040756114,
+      "loss": 0.3689,
+      "step": 2910
+    },
+    {
+      "epoch": 6.2153518123667375,
+      "grad_norm": 0.27059614658355713,
+      "learning_rate": 0.00019308284223369646,
+      "loss": 0.3656,
+      "step": 2915
+    },
+    {
+      "epoch": 6.226012793176972,
+      "grad_norm": 0.24707560241222382,
+      "learning_rate": 0.00019303743716107957,
+      "loss": 0.3682,
+      "step": 2920
+    },
+    {
+      "epoch": 6.2366737739872065,
+      "grad_norm": 0.23825524747371674,
+      "learning_rate": 0.00019299188892762752,
+      "loss": 0.3578,
+      "step": 2925
+    },
+    {
+      "epoch": 6.247334754797441,
+      "grad_norm": 0.24557247757911682,
+      "learning_rate": 0.00019294619760342737,
+      "loss": 0.3624,
+      "step": 2930
+    },
+    {
+      "epoch": 6.257995735607676,
+      "grad_norm": 0.2559678256511688,
+      "learning_rate": 0.00019290036325878644,
+      "loss": 0.3693,
+      "step": 2935
+    },
+    {
+      "epoch": 6.268656716417911,
+      "grad_norm": 0.25294074416160583,
+      "learning_rate": 0.00019285438596423204,
+      "loss": 0.3651,
+      "step": 2940
+    },
+    {
+      "epoch": 6.279317697228145,
+      "grad_norm": 0.24387520551681519,
+      "learning_rate": 0.00019280826579051147,
+      "loss": 0.3589,
+      "step": 2945
+    },
+    {
+      "epoch": 6.28997867803838,
+      "grad_norm": 0.22580432891845703,
+      "learning_rate": 0.0001927620028085919,
+      "loss": 0.3703,
+      "step": 2950
+    },
+    {
+      "epoch": 6.300639658848614,
+      "grad_norm": 0.24953973293304443,
+      "learning_rate": 0.00019271559708966023,
+      "loss": 0.3606,
+      "step": 2955
+    },
+    {
+      "epoch": 6.311300639658849,
+      "grad_norm": 0.2454618364572525,
+      "learning_rate": 0.000192669048705123,
+      "loss": 0.362,
+      "step": 2960
+    },
+    {
+      "epoch": 6.321961620469083,
+      "grad_norm": 0.2393016368150711,
+      "learning_rate": 0.00019262235772660627,
+      "loss": 0.3695,
+      "step": 2965
+    },
+    {
+      "epoch": 6.332622601279318,
+      "grad_norm": 0.2463667392730713,
+      "learning_rate": 0.00019257552422595554,
+      "loss": 0.3658,
+      "step": 2970
+    },
+    {
+      "epoch": 6.343283582089552,
+      "grad_norm": 0.24116967618465424,
+      "learning_rate": 0.00019252854827523557,
+      "loss": 0.3671,
+      "step": 2975
+    },
+    {
+      "epoch": 6.353944562899787,
+      "grad_norm": 0.2345789670944214,
+      "learning_rate": 0.00019248142994673036,
+      "loss": 0.368,
+      "step": 2980
+    },
+    {
+      "epoch": 6.364605543710021,
+      "grad_norm": 0.26505357027053833,
+      "learning_rate": 0.000192434169312943,
+      "loss": 0.3695,
+      "step": 2985
+    },
+    {
+      "epoch": 6.3752665245202556,
+      "grad_norm": 0.2504933476448059,
+      "learning_rate": 0.00019238676644659546,
+      "loss": 0.3605,
+      "step": 2990
+    },
+    {
+      "epoch": 6.38592750533049,
+      "grad_norm": 0.24889980256557465,
+      "learning_rate": 0.0001923392214206287,
+      "loss": 0.3684,
+      "step": 2995
+    },
+    {
+      "epoch": 6.396588486140725,
+      "grad_norm": 0.2319326400756836,
+      "learning_rate": 0.00019229153430820232,
+      "loss": 0.3621,
+      "step": 3000
+    },
+    {
+      "epoch": 6.40724946695096,
+      "grad_norm": 0.2329808622598648,
+      "learning_rate": 0.00019224370518269458,
+      "loss": 0.3649,
+      "step": 3005
+    },
+    {
+      "epoch": 6.417910447761194,
+      "grad_norm": 0.2565195560455322,
+      "learning_rate": 0.00019219573411770235,
+      "loss": 0.3602,
+      "step": 3010
+    },
+    {
+      "epoch": 6.428571428571429,
+      "grad_norm": 0.24189329147338867,
+      "learning_rate": 0.00019214762118704076,
+      "loss": 0.3691,
+      "step": 3015
+    },
+    {
+      "epoch": 6.439232409381663,
+      "grad_norm": 0.2512595057487488,
+      "learning_rate": 0.0001920993664647434,
+      "loss": 0.364,
+      "step": 3020
+    },
+    {
+      "epoch": 6.449893390191898,
+      "grad_norm": 0.24277447164058685,
+      "learning_rate": 0.00019205097002506185,
+      "loss": 0.3732,
+      "step": 3025
+    },
+    {
+      "epoch": 6.460554371002132,
+      "grad_norm": 0.242990642786026,
+      "learning_rate": 0.00019200243194246594,
+      "loss": 0.3674,
+      "step": 3030
+    },
+    {
+      "epoch": 6.471215351812367,
+      "grad_norm": 0.23621074855327606,
+      "learning_rate": 0.00019195375229164334,
+      "loss": 0.3599,
+      "step": 3035
+    },
+    {
+      "epoch": 6.481876332622601,
+      "grad_norm": 0.26253125071525574,
+      "learning_rate": 0.0001919049311474996,
+      "loss": 0.3708,
+      "step": 3040
+    },
+    {
+      "epoch": 6.492537313432836,
+      "grad_norm": 0.2214423567056656,
+      "learning_rate": 0.000191855968585158,
+      "loss": 0.3612,
+      "step": 3045
+    },
+    {
+      "epoch": 6.50319829424307,
+      "grad_norm": 0.24866749346256256,
+      "learning_rate": 0.00019180686467995935,
+      "loss": 0.3682,
+      "step": 3050
+    },
+    {
+      "epoch": 6.513859275053305,
+      "grad_norm": 0.2474697232246399,
+      "learning_rate": 0.00019175761950746204,
+      "loss": 0.354,
+      "step": 3055
+    },
+    {
+      "epoch": 6.524520255863539,
+      "grad_norm": 0.26961109042167664,
+      "learning_rate": 0.00019170823314344185,
+      "loss": 0.3708,
+      "step": 3060
+    },
+    {
+      "epoch": 6.535181236673774,
+      "grad_norm": 0.2510351538658142,
+      "learning_rate": 0.0001916587056638917,
+      "loss": 0.3667,
+      "step": 3065
+    },
+    {
+      "epoch": 6.545842217484008,
+      "grad_norm": 0.24457301199436188,
+      "learning_rate": 0.00019160903714502173,
+      "loss": 0.3679,
+      "step": 3070
+    },
+    {
+      "epoch": 6.556503198294243,
+      "grad_norm": 0.23988381028175354,
+      "learning_rate": 0.00019155922766325918,
+      "loss": 0.3608,
+      "step": 3075
+    },
+    {
+      "epoch": 6.567164179104478,
+      "grad_norm": 0.2317483127117157,
+      "learning_rate": 0.000191509277295248,
+      "loss": 0.3761,
+      "step": 3080
+    },
+    {
+      "epoch": 6.577825159914712,
+      "grad_norm": 0.2614232301712036,
+      "learning_rate": 0.0001914591861178491,
+      "loss": 0.3606,
+      "step": 3085
+    },
+    {
+      "epoch": 6.588486140724947,
+      "grad_norm": 0.24253317713737488,
+      "learning_rate": 0.00019140895420813997,
+      "loss": 0.362,
+      "step": 3090
+    },
+    {
+      "epoch": 6.599147121535181,
+      "grad_norm": 0.2507173418998718,
+      "learning_rate": 0.00019135858164341473,
+      "loss": 0.3594,
+      "step": 3095
+    },
+    {
+      "epoch": 6.609808102345416,
+      "grad_norm": 0.23574085533618927,
+      "learning_rate": 0.0001913080685011838,
+      "loss": 0.3661,
+      "step": 3100
+    },
+    {
+      "epoch": 6.62046908315565,
+      "grad_norm": 0.2325553447008133,
+      "learning_rate": 0.00019125741485917405,
+      "loss": 0.3756,
+      "step": 3105
+    },
+    {
+      "epoch": 6.631130063965885,
+      "grad_norm": 0.2191423624753952,
+      "learning_rate": 0.00019120662079532853,
+      "loss": 0.354,
+      "step": 3110
+    },
+    {
+      "epoch": 6.641791044776119,
+      "grad_norm": 0.21787339448928833,
+      "learning_rate": 0.00019115568638780622,
+      "loss": 0.3657,
+      "step": 3115
+    },
+    {
+      "epoch": 6.652452025586354,
+      "grad_norm": 0.21904399991035461,
+      "learning_rate": 0.0001911046117149822,
+      "loss": 0.367,
+      "step": 3120
+    },
+    {
+      "epoch": 6.663113006396588,
+      "grad_norm": 0.23119735717773438,
+      "learning_rate": 0.00019105339685544735,
+      "loss": 0.3646,
+      "step": 3125
+    },
+    {
+      "epoch": 6.673773987206823,
+      "grad_norm": 0.24613478779792786,
+      "learning_rate": 0.00019100204188800827,
+      "loss": 0.3682,
+      "step": 3130
+    },
+    {
+      "epoch": 6.684434968017058,
+      "grad_norm": 0.2366684079170227,
+      "learning_rate": 0.00019095054689168705,
+      "loss": 0.3714,
+      "step": 3135
+    },
+    {
+      "epoch": 6.6950959488272925,
+      "grad_norm": 0.2413744032382965,
+      "learning_rate": 0.0001908989119457214,
+      "loss": 0.3682,
+      "step": 3140
+    },
+    {
+      "epoch": 6.705756929637527,
+      "grad_norm": 0.23421700298786163,
+      "learning_rate": 0.00019084713712956428,
+      "loss": 0.3639,
+      "step": 3145
+    },
+    {
+      "epoch": 6.7164179104477615,
+      "grad_norm": 0.23423875868320465,
+      "learning_rate": 0.00019079522252288386,
+      "loss": 0.3655,
+      "step": 3150
+    },
+    {
+      "epoch": 6.727078891257996,
+      "grad_norm": 0.23802149295806885,
+      "learning_rate": 0.00019074316820556352,
+      "loss": 0.3708,
+      "step": 3155
+    },
+    {
+      "epoch": 6.73773987206823,
+      "grad_norm": 0.25665974617004395,
+      "learning_rate": 0.00019069097425770154,
+      "loss": 0.3762,
+      "step": 3160
+    },
+    {
+      "epoch": 6.748400852878465,
+      "grad_norm": 0.23551535606384277,
+      "learning_rate": 0.00019063864075961098,
+      "loss": 0.3687,
+      "step": 3165
+    },
+    {
+      "epoch": 6.759061833688699,
+      "grad_norm": 0.24098068475723267,
+      "learning_rate": 0.00019058616779181982,
+      "loss": 0.3659,
+      "step": 3170
+    },
+    {
+      "epoch": 6.769722814498934,
+      "grad_norm": 0.22562439739704132,
+      "learning_rate": 0.0001905335554350705,
+      "loss": 0.3724,
+      "step": 3175
+    },
+    {
+      "epoch": 6.780383795309168,
+      "grad_norm": 0.224997878074646,
+      "learning_rate": 0.00019048080377031995,
+      "loss": 0.3705,
+      "step": 3180
+    },
+    {
+      "epoch": 6.791044776119403,
+      "grad_norm": 0.2575388252735138,
+      "learning_rate": 0.00019042791287873957,
+      "loss": 0.3611,
+      "step": 3185
+    },
+    {
+      "epoch": 6.801705756929637,
+      "grad_norm": 0.231009379029274,
+      "learning_rate": 0.0001903748828417149,
+      "loss": 0.3653,
+      "step": 3190
+    },
+    {
+      "epoch": 6.812366737739872,
+      "grad_norm": 0.23769618570804596,
+      "learning_rate": 0.0001903217137408456,
+      "loss": 0.3615,
+      "step": 3195
+    },
+    {
+      "epoch": 6.823027718550106,
+      "grad_norm": 0.23301640152931213,
+      "learning_rate": 0.00019026840565794536,
+      "loss": 0.366,
+      "step": 3200
+    },
+    {
+      "epoch": 6.833688699360341,
+      "grad_norm": 0.2212369292974472,
+      "learning_rate": 0.00019021495867504163,
+      "loss": 0.3632,
+      "step": 3205
+    },
+    {
+      "epoch": 6.844349680170575,
+      "grad_norm": 0.23795363306999207,
+      "learning_rate": 0.0001901613728743757,
+      "loss": 0.3681,
+      "step": 3210
+    },
+    {
+      "epoch": 6.855010660980811,
+      "grad_norm": 0.24354343116283417,
+      "learning_rate": 0.00019010764833840243,
+      "loss": 0.3695,
+      "step": 3215
+    },
+    {
+      "epoch": 6.865671641791045,
+      "grad_norm": 0.24145299196243286,
+      "learning_rate": 0.00019005378514979008,
+      "loss": 0.3667,
+      "step": 3220
+    },
+    {
+      "epoch": 6.8763326226012795,
+      "grad_norm": 0.24070268869400024,
+      "learning_rate": 0.0001899997833914204,
+      "loss": 0.3693,
+      "step": 3225
+    },
+    {
+      "epoch": 6.886993603411514,
+      "grad_norm": 0.22578920423984528,
+      "learning_rate": 0.00018994564314638832,
+      "loss": 0.3692,
+      "step": 3230
+    },
+    {
+      "epoch": 6.8976545842217485,
+      "grad_norm": 0.22691179811954498,
+      "learning_rate": 0.00018989136449800174,
+      "loss": 0.3766,
+      "step": 3235
+    },
+    {
+      "epoch": 6.908315565031983,
+      "grad_norm": 0.2194678634405136,
+      "learning_rate": 0.0001898369475297817,
+      "loss": 0.3668,
+      "step": 3240
+    },
+    {
+      "epoch": 6.918976545842217,
+      "grad_norm": 0.22618421912193298,
+      "learning_rate": 0.000189782392325462,
+      "loss": 0.3592,
+      "step": 3245
+    },
+    {
+      "epoch": 6.929637526652452,
+      "grad_norm": 0.2549285292625427,
+      "learning_rate": 0.0001897276989689891,
+      "loss": 0.3653,
+      "step": 3250
+    },
+    {
+      "epoch": 6.940298507462686,
+      "grad_norm": 0.23101598024368286,
+      "learning_rate": 0.00018967286754452214,
+      "loss": 0.3569,
+      "step": 3255
+    },
+    {
+      "epoch": 6.950959488272921,
+      "grad_norm": 0.2506960332393646,
+      "learning_rate": 0.00018961789813643268,
+      "loss": 0.3633,
+      "step": 3260
+    },
+    {
+      "epoch": 6.961620469083155,
+      "grad_norm": 0.2284671515226364,
+      "learning_rate": 0.00018956279082930455,
+      "loss": 0.3624,
+      "step": 3265
+    },
+    {
+      "epoch": 6.97228144989339,
+      "grad_norm": 0.22146272659301758,
+      "learning_rate": 0.00018950754570793384,
+      "loss": 0.37,
+      "step": 3270
+    },
+    {
+      "epoch": 6.982942430703625,
+      "grad_norm": 0.2425510585308075,
+      "learning_rate": 0.00018945216285732864,
+      "loss": 0.366,
+      "step": 3275
+    },
+    {
+      "epoch": 6.99360341151386,
+      "grad_norm": 0.2304454892873764,
+      "learning_rate": 0.00018939664236270907,
+      "loss": 0.3684,
+      "step": 3280
+    },
+    {
+      "epoch": 7.0,
+      "eval_loss": 0.5168320536613464,
+      "eval_runtime": 377.6098,
+      "eval_samples_per_second": 1.091,
+      "eval_steps_per_second": 1.091,
+      "step": 3283
+    },
+    {
+      "epoch": 7.004264392324094,
+      "grad_norm": 0.2056385576725006,
+      "learning_rate": 0.00018934098430950692,
+      "loss": 0.3479,
+      "step": 3285
+    },
+    {
+      "epoch": 7.014925373134329,
+      "grad_norm": 0.2757323086261749,
+      "learning_rate": 0.0001892851887833657,
+      "loss": 0.333,
+      "step": 3290
+    },
+    {
+      "epoch": 7.025586353944563,
+      "grad_norm": 0.25871726870536804,
+      "learning_rate": 0.00018922925587014046,
+      "loss": 0.3212,
+      "step": 3295
+    },
+    {
+      "epoch": 7.036247334754798,
+      "grad_norm": 0.2494359016418457,
+      "learning_rate": 0.00018917318565589772,
+      "loss": 0.3248,
+      "step": 3300
+    },
+    {
+      "epoch": 7.046908315565032,
+      "grad_norm": 0.2385275512933731,
+      "learning_rate": 0.00018911697822691516,
+      "loss": 0.3189,
+      "step": 3305
+    },
+    {
+      "epoch": 7.0575692963752665,
+      "grad_norm": 0.2520158588886261,
+      "learning_rate": 0.00018906063366968165,
+      "loss": 0.3268,
+      "step": 3310
+    },
+    {
+      "epoch": 7.068230277185501,
+      "grad_norm": 0.25822409987449646,
+      "learning_rate": 0.00018900415207089708,
+      "loss": 0.3169,
+      "step": 3315
+    },
+    {
+      "epoch": 7.0788912579957355,
+      "grad_norm": 0.2619076669216156,
+      "learning_rate": 0.00018894753351747214,
+      "loss": 0.3279,
+      "step": 3320
+    },
+    {
+      "epoch": 7.08955223880597,
+      "grad_norm": 0.30978551506996155,
+      "learning_rate": 0.0001888907780965284,
+      "loss": 0.327,
+      "step": 3325
+    },
+    {
+      "epoch": 7.100213219616204,
+      "grad_norm": 0.25372347235679626,
+      "learning_rate": 0.00018883388589539785,
+      "loss": 0.3254,
+      "step": 3330
+    },
+    {
+      "epoch": 7.110874200426439,
+      "grad_norm": 0.27630311250686646,
+      "learning_rate": 0.0001887768570016231,
+      "loss": 0.3291,
+      "step": 3335
+    },
+    {
+      "epoch": 7.121535181236673,
+      "grad_norm": 0.2716643810272217,
+      "learning_rate": 0.00018871969150295706,
+      "loss": 0.3241,
+      "step": 3340
+    },
+    {
+      "epoch": 7.132196162046908,
+      "grad_norm": 0.2678888440132141,
+      "learning_rate": 0.00018866238948736278,
+      "loss": 0.3304,
+      "step": 3345
+    },
+    {
+      "epoch": 7.142857142857143,
+      "grad_norm": 0.2532709240913391,
+      "learning_rate": 0.00018860495104301345,
+      "loss": 0.3331,
+      "step": 3350
+    },
+    {
+      "epoch": 7.153518123667378,
+      "grad_norm": 0.2671636939048767,
+      "learning_rate": 0.0001885473762582921,
+      "loss": 0.3315,
+      "step": 3355
+    },
+    {
+      "epoch": 7.164179104477612,
+      "grad_norm": 0.2550068497657776,
+      "learning_rate": 0.00018848966522179168,
+      "loss": 0.3306,
+      "step": 3360
+    },
+    {
+      "epoch": 7.174840085287847,
+      "grad_norm": 0.2700331211090088,
+      "learning_rate": 0.00018843181802231465,
+      "loss": 0.329,
+      "step": 3365
+    },
+    {
+      "epoch": 7.185501066098081,
+      "grad_norm": 0.26168689131736755,
+      "learning_rate": 0.00018837383474887314,
+      "loss": 0.3327,
+      "step": 3370
+    },
+    {
+      "epoch": 7.196162046908316,
+      "grad_norm": 0.24964787065982819,
+      "learning_rate": 0.00018831571549068852,
+      "loss": 0.3353,
+      "step": 3375
+    },
+    {
+      "epoch": 7.20682302771855,
+      "grad_norm": 0.2676330804824829,
+      "learning_rate": 0.00018825746033719149,
+      "loss": 0.3316,
+      "step": 3380
+    },
+    {
+      "epoch": 7.217484008528785,
+      "grad_norm": 0.25253960490226746,
+      "learning_rate": 0.0001881990693780219,
+      "loss": 0.3316,
+      "step": 3385
+    },
+    {
+      "epoch": 7.228144989339019,
+      "grad_norm": 0.257114440202713,
+      "learning_rate": 0.0001881405427030284,
+      "loss": 0.3307,
+      "step": 3390
+    },
+    {
+      "epoch": 7.2388059701492535,
+      "grad_norm": 0.25102248787879944,
+      "learning_rate": 0.00018808188040226868,
+      "loss": 0.3348,
+      "step": 3395
+    },
+    {
+      "epoch": 7.249466950959488,
+      "grad_norm": 0.25489816069602966,
+      "learning_rate": 0.000188023082566009,
+      "loss": 0.3342,
+      "step": 3400
+    },
+    {
+      "epoch": 7.2601279317697225,
+      "grad_norm": 0.27044063806533813,
+      "learning_rate": 0.00018796414928472417,
+      "loss": 0.3391,
+      "step": 3405
+    },
+    {
+      "epoch": 7.270788912579957,
+      "grad_norm": 0.26209956407546997,
+      "learning_rate": 0.00018790508064909746,
+      "loss": 0.3318,
+      "step": 3410
+    },
+    {
+      "epoch": 7.281449893390192,
+      "grad_norm": 0.25549113750457764,
+      "learning_rate": 0.00018784587675002045,
+      "loss": 0.3322,
+      "step": 3415
+    },
+    {
+      "epoch": 7.292110874200427,
+      "grad_norm": 0.26465660333633423,
+      "learning_rate": 0.00018778653767859274,
+      "loss": 0.3319,
+      "step": 3420
+    },
+    {
+      "epoch": 7.302771855010661,
+      "grad_norm": 0.2753106951713562,
+      "learning_rate": 0.00018772706352612203,
+      "loss": 0.3329,
+      "step": 3425
+    },
+    {
+      "epoch": 7.313432835820896,
+      "grad_norm": 0.2526467740535736,
+      "learning_rate": 0.00018766745438412384,
+      "loss": 0.3311,
+      "step": 3430
+    },
+    {
+      "epoch": 7.32409381663113,
+      "grad_norm": 0.2626464068889618,
+      "learning_rate": 0.00018760771034432138,
+      "loss": 0.3318,
+      "step": 3435
+    },
+    {
+      "epoch": 7.334754797441365,
+      "grad_norm": 0.2631151080131531,
+      "learning_rate": 0.0001875478314986455,
+      "loss": 0.3453,
+      "step": 3440
+    },
+    {
+      "epoch": 7.345415778251599,
+      "grad_norm": 0.25757527351379395,
+      "learning_rate": 0.0001874878179392344,
+      "loss": 0.3373,
+      "step": 3445
+    },
+    {
+      "epoch": 7.356076759061834,
+      "grad_norm": 0.2395113706588745,
+      "learning_rate": 0.0001874276697584336,
+      "loss": 0.331,
+      "step": 3450
+    },
+    {
+      "epoch": 7.366737739872068,
+      "grad_norm": 0.2804111838340759,
+      "learning_rate": 0.0001873673870487958,
+      "loss": 0.3378,
+      "step": 3455
+    },
+    {
+      "epoch": 7.377398720682303,
+      "grad_norm": 0.24439595639705658,
+      "learning_rate": 0.00018730696990308069,
+      "loss": 0.3381,
+      "step": 3460
+    },
+    {
+      "epoch": 7.388059701492537,
+      "grad_norm": 0.270958811044693,
+      "learning_rate": 0.00018724641841425478,
+      "loss": 0.3418,
+      "step": 3465
+    },
+    {
+      "epoch": 7.398720682302772,
+      "grad_norm": 0.2635878324508667,
+      "learning_rate": 0.0001871857326754914,
+      "loss": 0.3433,
+      "step": 3470
+    },
+    {
+      "epoch": 7.409381663113006,
+      "grad_norm": 0.24128612875938416,
+      "learning_rate": 0.00018712491278017032,
+      "loss": 0.3395,
+      "step": 3475
+    },
+    {
+      "epoch": 7.4200426439232405,
+      "grad_norm": 0.2588317096233368,
+      "learning_rate": 0.00018706395882187783,
+      "loss": 0.3415,
+      "step": 3480
+    },
+    {
+      "epoch": 7.430703624733475,
+      "grad_norm": 0.2590773105621338,
+      "learning_rate": 0.0001870028708944065,
+      "loss": 0.3392,
+      "step": 3485
+    },
+    {
+      "epoch": 7.44136460554371,
+      "grad_norm": 0.25688695907592773,
+      "learning_rate": 0.00018694164909175505,
+      "loss": 0.3385,
+      "step": 3490
+    },
+    {
+      "epoch": 7.452025586353945,
+      "grad_norm": 0.23704120516777039,
+      "learning_rate": 0.00018688029350812817,
+      "loss": 0.3356,
+      "step": 3495
+    },
+    {
+      "epoch": 7.462686567164179,
+      "grad_norm": 0.2817398011684418,
+      "learning_rate": 0.00018681880423793642,
+      "loss": 0.3368,
+      "step": 3500
+    },
+    {
+      "epoch": 7.473347547974414,
+      "grad_norm": 0.2590171694755554,
+      "learning_rate": 0.00018675718137579607,
+      "loss": 0.3382,
+      "step": 3505
+    },
+    {
+      "epoch": 7.484008528784648,
+      "grad_norm": 0.2843134105205536,
+      "learning_rate": 0.00018669542501652896,
+      "loss": 0.3304,
+      "step": 3510
+    },
+    {
+      "epoch": 7.494669509594883,
+      "grad_norm": 0.25284621119499207,
+      "learning_rate": 0.00018663353525516234,
+      "loss": 0.3337,
+      "step": 3515
+    },
+    {
+      "epoch": 7.505330490405117,
+      "grad_norm": 0.24715737998485565,
+      "learning_rate": 0.00018657151218692873,
+      "loss": 0.3373,
+      "step": 3520
+    },
+    {
+      "epoch": 7.515991471215352,
+      "grad_norm": 0.28074926137924194,
+      "learning_rate": 0.0001865093559072658,
+      "loss": 0.3376,
+      "step": 3525
+    },
+    {
+      "epoch": 7.526652452025586,
+      "grad_norm": 0.2531152367591858,
+      "learning_rate": 0.00018644706651181614,
+      "loss": 0.3329,
+      "step": 3530
+    },
+    {
+      "epoch": 7.537313432835821,
+      "grad_norm": 0.27217596769332886,
+      "learning_rate": 0.00018638464409642723,
+      "loss": 0.3486,
+      "step": 3535
+    },
+    {
+      "epoch": 7.547974413646055,
+      "grad_norm": 0.2517159581184387,
+      "learning_rate": 0.0001863220887571512,
+      "loss": 0.343,
+      "step": 3540
+    },
+    {
+      "epoch": 7.55863539445629,
+      "grad_norm": 0.2538190484046936,
+      "learning_rate": 0.00018625940059024477,
+      "loss": 0.3343,
+      "step": 3545
+    },
+    {
+      "epoch": 7.569296375266525,
+      "grad_norm": 0.26679527759552,
+      "learning_rate": 0.00018619657969216893,
+      "loss": 0.348,
+      "step": 3550
+    },
+    {
+      "epoch": 7.5799573560767595,
+      "grad_norm": 0.24433985352516174,
+      "learning_rate": 0.00018613362615958905,
+      "loss": 0.3455,
+      "step": 3555
+    },
+    {
+      "epoch": 7.590618336886994,
+      "grad_norm": 0.2719508111476898,
+      "learning_rate": 0.0001860705400893745,
+      "loss": 0.3414,
+      "step": 3560
+    },
+    {
+      "epoch": 7.601279317697228,
+      "grad_norm": 0.2666242718696594,
+      "learning_rate": 0.00018600732157859863,
+      "loss": 0.3384,
+      "step": 3565
+    },
+    {
+      "epoch": 7.611940298507463,
+      "grad_norm": 0.24249517917633057,
+      "learning_rate": 0.00018594397072453856,
+      "loss": 0.339,
+      "step": 3570
+    },
+    {
+      "epoch": 7.622601279317697,
+      "grad_norm": 0.2475687712430954,
+      "learning_rate": 0.00018588048762467502,
+      "loss": 0.3428,
+      "step": 3575
+    },
+    {
+      "epoch": 7.633262260127932,
+      "grad_norm": 0.2500527799129486,
+      "learning_rate": 0.00018581687237669234,
+      "loss": 0.3332,
+      "step": 3580
+    },
+    {
+      "epoch": 7.643923240938166,
+      "grad_norm": 0.2528587281703949,
+      "learning_rate": 0.0001857531250784781,
+      "loss": 0.3429,
+      "step": 3585
+    },
+    {
+      "epoch": 7.654584221748401,
+      "grad_norm": 0.2627830505371094,
+      "learning_rate": 0.0001856892458281231,
+      "loss": 0.3396,
+      "step": 3590
+    },
+    {
+      "epoch": 7.665245202558635,
+      "grad_norm": 0.2573624849319458,
+      "learning_rate": 0.00018562523472392118,
+      "loss": 0.3391,
+      "step": 3595
+    },
+    {
+      "epoch": 7.67590618336887,
+      "grad_norm": 0.2411065399646759,
+      "learning_rate": 0.0001855610918643691,
+      "loss": 0.3384,
+      "step": 3600
+    },
+    {
+      "epoch": 7.686567164179104,
+      "grad_norm": 0.2589527666568756,
+      "learning_rate": 0.00018549681734816623,
+      "loss": 0.3429,
+      "step": 3605
+    },
+    {
+      "epoch": 7.697228144989339,
+      "grad_norm": 0.2436107099056244,
+      "learning_rate": 0.00018543241127421474,
+      "loss": 0.3435,
+      "step": 3610
+    },
+    {
+      "epoch": 7.707889125799573,
+      "grad_norm": 0.272020161151886,
+      "learning_rate": 0.00018536787374161902,
+      "loss": 0.3418,
+      "step": 3615
+    },
+    {
+      "epoch": 7.718550106609808,
+      "grad_norm": 0.26080530881881714,
+      "learning_rate": 0.00018530320484968588,
+      "loss": 0.3367,
+      "step": 3620
+    },
+    {
+      "epoch": 7.729211087420042,
+      "grad_norm": 0.2503691613674164,
+      "learning_rate": 0.0001852384046979242,
+      "loss": 0.3367,
+      "step": 3625
+    },
+    {
+      "epoch": 7.7398720682302775,
+      "grad_norm": 0.26822352409362793,
+      "learning_rate": 0.0001851734733860449,
+      "loss": 0.3498,
+      "step": 3630
+    },
+    {
+      "epoch": 7.750533049040512,
+      "grad_norm": 0.28552523255348206,
+      "learning_rate": 0.00018510841101396062,
+      "loss": 0.3406,
+      "step": 3635
+    },
+    {
+      "epoch": 7.7611940298507465,
+      "grad_norm": 0.2446276843547821,
+      "learning_rate": 0.0001850432176817857,
+      "loss": 0.3465,
+      "step": 3640
+    },
+    {
+      "epoch": 7.771855010660981,
+      "grad_norm": 0.24052871763706207,
+      "learning_rate": 0.00018497789348983606,
+      "loss": 0.3434,
+      "step": 3645
+    },
+    {
+      "epoch": 7.782515991471215,
+      "grad_norm": 0.23899152874946594,
+      "learning_rate": 0.00018491243853862893,
+      "loss": 0.3365,
+      "step": 3650
+    },
+    {
+      "epoch": 7.79317697228145,
+      "grad_norm": 0.24732346832752228,
+      "learning_rate": 0.00018484685292888278,
+      "loss": 0.3382,
+      "step": 3655
+    },
+    {
+      "epoch": 7.803837953091684,
+      "grad_norm": 0.2519215941429138,
+      "learning_rate": 0.00018478113676151703,
+      "loss": 0.3463,
+      "step": 3660
+    },
+    {
+      "epoch": 7.814498933901919,
+      "grad_norm": 0.24091705679893494,
+      "learning_rate": 0.00018471529013765209,
+      "loss": 0.3404,
+      "step": 3665
+    },
+    {
+      "epoch": 7.825159914712153,
+      "grad_norm": 0.2794884443283081,
+      "learning_rate": 0.0001846493131586091,
+      "loss": 0.3469,
+      "step": 3670
+    },
+    {
+      "epoch": 7.835820895522388,
+      "grad_norm": 0.24296560883522034,
+      "learning_rate": 0.00018458320592590975,
+      "loss": 0.3434,
+      "step": 3675
+    },
+    {
+      "epoch": 7.846481876332622,
+      "grad_norm": 0.24800756573677063,
+      "learning_rate": 0.00018451696854127617,
+      "loss": 0.3384,
+      "step": 3680
+    },
+    {
+      "epoch": 7.857142857142857,
+      "grad_norm": 0.2350349873304367,
+      "learning_rate": 0.0001844506011066308,
+      "loss": 0.3428,
+      "step": 3685
+    },
+    {
+      "epoch": 7.867803837953092,
+      "grad_norm": 0.2573322355747223,
+      "learning_rate": 0.0001843841037240961,
+      "loss": 0.3463,
+      "step": 3690
+    },
+    {
+      "epoch": 7.878464818763327,
+      "grad_norm": 0.256381630897522,
+      "learning_rate": 0.00018431747649599463,
+      "loss": 0.3397,
+      "step": 3695
+    },
+    {
+      "epoch": 7.889125799573561,
+      "grad_norm": 0.23707297444343567,
+      "learning_rate": 0.0001842507195248486,
+      "loss": 0.3437,
+      "step": 3700
+    },
+    {
+      "epoch": 7.899786780383796,
+      "grad_norm": 0.24699944257736206,
+      "learning_rate": 0.00018418383291337988,
+      "loss": 0.3398,
+      "step": 3705
+    },
+    {
+      "epoch": 7.91044776119403,
+      "grad_norm": 0.25237977504730225,
+      "learning_rate": 0.00018411681676450999,
+      "loss": 0.3409,
+      "step": 3710
+    },
+    {
+      "epoch": 7.9211087420042645,
+      "grad_norm": 0.2656485438346863,
+      "learning_rate": 0.00018404967118135955,
+      "loss": 0.3487,
+      "step": 3715
+    },
+    {
+      "epoch": 7.931769722814499,
+      "grad_norm": 0.23709309101104736,
+      "learning_rate": 0.0001839823962672485,
+      "loss": 0.3398,
+      "step": 3720
+    },
+    {
+      "epoch": 7.9424307036247335,
+      "grad_norm": 0.24946698546409607,
+      "learning_rate": 0.00018391499212569573,
+      "loss": 0.3459,
+      "step": 3725
+    },
+    {
+      "epoch": 7.953091684434968,
+      "grad_norm": 0.2608436346054077,
+      "learning_rate": 0.00018384745886041898,
+      "loss": 0.3394,
+      "step": 3730
+    },
+    {
+      "epoch": 7.963752665245202,
+      "grad_norm": 0.2503463625907898,
+      "learning_rate": 0.00018377979657533468,
+      "loss": 0.3436,
+      "step": 3735
+    },
+    {
+      "epoch": 7.974413646055437,
+      "grad_norm": 0.2556673586368561,
+      "learning_rate": 0.0001837120053745578,
+      "loss": 0.3519,
+      "step": 3740
+    },
+    {
+      "epoch": 7.985074626865671,
+      "grad_norm": 0.24612018465995789,
+      "learning_rate": 0.0001836440853624017,
+      "loss": 0.3388,
+      "step": 3745
+    },
+    {
+      "epoch": 7.995735607675906,
+      "grad_norm": 0.26963427662849426,
+      "learning_rate": 0.00018357603664337786,
+      "loss": 0.3403,
+      "step": 3750
+    },
+    {
+      "epoch": 8.0,
+      "eval_loss": 0.5337910056114197,
+      "eval_runtime": 377.6371,
+      "eval_samples_per_second": 1.091,
+      "eval_steps_per_second": 1.091,
+      "step": 3752
+    },
+    {
+      "epoch": 8.00639658848614,
+      "grad_norm": 0.2208224982023239,
+      "learning_rate": 0.00018350785932219588,
+      "loss": 0.3081,
+      "step": 3755
+    },
+    {
+      "epoch": 8.017057569296375,
+      "grad_norm": 0.30632683634757996,
+      "learning_rate": 0.00018343955350376325,
+      "loss": 0.2978,
+      "step": 3760
+    },
+    {
+      "epoch": 8.02771855010661,
+      "grad_norm": 0.25390052795410156,
+      "learning_rate": 0.00018337111929318516,
+      "loss": 0.2948,
+      "step": 3765
+    },
+    {
+      "epoch": 8.038379530916844,
+      "grad_norm": 0.296369731426239,
+      "learning_rate": 0.00018330255679576438,
+      "loss": 0.2963,
+      "step": 3770
+    },
+    {
+      "epoch": 8.049040511727078,
+      "grad_norm": 0.2958175837993622,
+      "learning_rate": 0.00018323386611700105,
+      "loss": 0.2905,
+      "step": 3775
+    },
+    {
+      "epoch": 8.059701492537313,
+      "grad_norm": 0.2595365047454834,
+      "learning_rate": 0.00018316504736259255,
+      "loss": 0.2918,
+      "step": 3780
+    },
+    {
+      "epoch": 8.070362473347547,
+      "grad_norm": 0.2825353145599365,
+      "learning_rate": 0.00018309610063843337,
+      "loss": 0.3,
+      "step": 3785
+    },
+    {
+      "epoch": 8.081023454157782,
+      "grad_norm": 0.2677433490753174,
+      "learning_rate": 0.00018302702605061492,
+      "loss": 0.2964,
+      "step": 3790
+    },
+    {
+      "epoch": 8.091684434968018,
+      "grad_norm": 0.28075000643730164,
+      "learning_rate": 0.00018295782370542532,
+      "loss": 0.2979,
+      "step": 3795
+    },
+    {
+      "epoch": 8.102345415778252,
+      "grad_norm": 0.2629709243774414,
+      "learning_rate": 0.00018288849370934926,
+      "loss": 0.3005,
+      "step": 3800
+    },
+    {
+      "epoch": 8.113006396588487,
+      "grad_norm": 0.2850215435028076,
+      "learning_rate": 0.00018281903616906796,
+      "loss": 0.2976,
+      "step": 3805
+    },
+    {
+      "epoch": 8.123667377398721,
+      "grad_norm": 0.29631924629211426,
+      "learning_rate": 0.0001827494511914587,
+      "loss": 0.2938,
+      "step": 3810
+    },
+    {
+      "epoch": 8.134328358208956,
+      "grad_norm": 0.26315709948539734,
+      "learning_rate": 0.00018267973888359509,
+      "loss": 0.3021,
+      "step": 3815
+    },
+    {
+      "epoch": 8.14498933901919,
+      "grad_norm": 0.30577051639556885,
+      "learning_rate": 0.0001826098993527465,
+      "loss": 0.2996,
+      "step": 3820
+    },
+    {
+      "epoch": 8.155650319829425,
+      "grad_norm": 0.2897678315639496,
+      "learning_rate": 0.0001825399327063781,
+      "loss": 0.3048,
+      "step": 3825
+    },
+    {
+      "epoch": 8.16631130063966,
+      "grad_norm": 0.3003354072570801,
+      "learning_rate": 0.00018246983905215075,
+      "loss": 0.3075,
+      "step": 3830
+    },
+    {
+      "epoch": 8.176972281449894,
+      "grad_norm": 0.28864815831184387,
+      "learning_rate": 0.00018239961849792055,
+      "loss": 0.3091,
+      "step": 3835
+    },
+    {
+      "epoch": 8.187633262260128,
+      "grad_norm": 0.28102535009384155,
+      "learning_rate": 0.0001823292711517391,
+      "loss": 0.2969,
+      "step": 3840
+    },
+    {
+      "epoch": 8.198294243070363,
+      "grad_norm": 0.2669455409049988,
+      "learning_rate": 0.00018225879712185293,
+      "loss": 0.3061,
+      "step": 3845
+    },
+    {
+      "epoch": 8.208955223880597,
+      "grad_norm": 0.2893795669078827,
+      "learning_rate": 0.00018218819651670356,
+      "loss": 0.3003,
+      "step": 3850
+    },
+    {
+      "epoch": 8.219616204690832,
+      "grad_norm": 0.31041857600212097,
+      "learning_rate": 0.00018211746944492727,
+      "loss": 0.3069,
+      "step": 3855
+    },
+    {
+      "epoch": 8.230277185501066,
+      "grad_norm": 0.2678110599517822,
+      "learning_rate": 0.000182046616015355,
+      "loss": 0.3023,
+      "step": 3860
+    },
+    {
+      "epoch": 8.2409381663113,
+      "grad_norm": 0.3051944375038147,
+      "learning_rate": 0.00018197563633701196,
+      "loss": 0.3095,
+      "step": 3865
+    },
+    {
+      "epoch": 8.251599147121535,
+      "grad_norm": 0.267646461725235,
+      "learning_rate": 0.00018190453051911782,
+      "loss": 0.3047,
+      "step": 3870
+    },
+    {
+      "epoch": 8.26226012793177,
+      "grad_norm": 0.27988821268081665,
+      "learning_rate": 0.00018183329867108624,
+      "loss": 0.3132,
+      "step": 3875
+    },
+    {
+      "epoch": 8.272921108742004,
+      "grad_norm": 0.293363094329834,
+      "learning_rate": 0.0001817619409025248,
+      "loss": 0.3054,
+      "step": 3880
+    },
+    {
+      "epoch": 8.283582089552239,
+      "grad_norm": 0.28679507970809937,
+      "learning_rate": 0.00018169045732323492,
+      "loss": 0.3049,
+      "step": 3885
+    },
+    {
+      "epoch": 8.294243070362473,
+      "grad_norm": 0.28792116045951843,
+      "learning_rate": 0.0001816188480432115,
+      "loss": 0.3112,
+      "step": 3890
+    },
+    {
+      "epoch": 8.304904051172707,
+      "grad_norm": 0.2938394844532013,
+      "learning_rate": 0.00018154711317264297,
+      "loss": 0.3101,
+      "step": 3895
+    },
+    {
+      "epoch": 8.315565031982942,
+      "grad_norm": 0.2776646316051483,
+      "learning_rate": 0.00018147525282191093,
+      "loss": 0.3046,
+      "step": 3900
+    },
+    {
+      "epoch": 8.326226012793176,
+      "grad_norm": 0.2619486153125763,
+      "learning_rate": 0.00018140326710159007,
+      "loss": 0.3066,
+      "step": 3905
+    },
+    {
+      "epoch": 8.336886993603411,
+      "grad_norm": 0.2895703911781311,
+      "learning_rate": 0.00018133115612244807,
+      "loss": 0.3122,
+      "step": 3910
+    },
+    {
+      "epoch": 8.347547974413645,
+      "grad_norm": 0.2928364872932434,
+      "learning_rate": 0.00018125891999544525,
+      "loss": 0.303,
+      "step": 3915
+    },
+    {
+      "epoch": 8.35820895522388,
+      "grad_norm": 0.27352485060691833,
+      "learning_rate": 0.00018118655883173456,
+      "loss": 0.301,
+      "step": 3920
+    },
+    {
+      "epoch": 8.368869936034114,
+      "grad_norm": 0.3004440665245056,
+      "learning_rate": 0.00018111407274266136,
+      "loss": 0.3084,
+      "step": 3925
+    },
+    {
+      "epoch": 8.379530916844349,
+      "grad_norm": 0.26515400409698486,
+      "learning_rate": 0.00018104146183976316,
+      "loss": 0.3052,
+      "step": 3930
+    },
+    {
+      "epoch": 8.390191897654585,
+      "grad_norm": 0.29159972071647644,
+      "learning_rate": 0.00018096872623476963,
+      "loss": 0.3018,
+      "step": 3935
+    },
+    {
+      "epoch": 8.40085287846482,
+      "grad_norm": 0.31077924370765686,
+      "learning_rate": 0.00018089586603960224,
+      "loss": 0.3139,
+      "step": 3940
+    },
+    {
+      "epoch": 8.411513859275054,
+      "grad_norm": 0.2826644480228424,
+      "learning_rate": 0.00018082288136637422,
+      "loss": 0.2955,
+      "step": 3945
+    },
+    {
+      "epoch": 8.422174840085288,
+      "grad_norm": 0.2825087308883667,
+      "learning_rate": 0.00018074977232739031,
+      "loss": 0.3127,
+      "step": 3950
+    },
+    {
+      "epoch": 8.432835820895523,
+      "grad_norm": 0.2901898920536041,
+      "learning_rate": 0.0001806765390351467,
+      "loss": 0.3099,
+      "step": 3955
+    },
+    {
+      "epoch": 8.443496801705757,
+      "grad_norm": 0.28308314085006714,
+      "learning_rate": 0.00018060318160233063,
+      "loss": 0.3122,
+      "step": 3960
+    },
+    {
+      "epoch": 8.454157782515992,
+      "grad_norm": 0.26890453696250916,
+      "learning_rate": 0.00018052970014182046,
+      "loss": 0.3156,
+      "step": 3965
+    },
+    {
+      "epoch": 8.464818763326226,
+      "grad_norm": 0.2962822914123535,
+      "learning_rate": 0.00018045609476668545,
+      "loss": 0.3184,
+      "step": 3970
+    },
+    {
+      "epoch": 8.47547974413646,
+      "grad_norm": 0.2848854959011078,
+      "learning_rate": 0.00018038236559018533,
+      "loss": 0.309,
+      "step": 3975
+    },
+    {
+      "epoch": 8.486140724946695,
+      "grad_norm": 0.3047114312648773,
+      "learning_rate": 0.00018030851272577051,
+      "loss": 0.3118,
+      "step": 3980
+    },
+    {
+      "epoch": 8.49680170575693,
+      "grad_norm": 0.28175976872444153,
+      "learning_rate": 0.00018023453628708173,
+      "loss": 0.3074,
+      "step": 3985
+    },
+    {
+      "epoch": 8.507462686567164,
+      "grad_norm": 0.27742594480514526,
+      "learning_rate": 0.00018016043638794974,
+      "loss": 0.3127,
+      "step": 3990
+    },
+    {
+      "epoch": 8.518123667377399,
+      "grad_norm": 0.28773581981658936,
+      "learning_rate": 0.0001800862131423954,
+      "loss": 0.3057,
+      "step": 3995
+    },
+    {
+      "epoch": 8.528784648187633,
+      "grad_norm": 0.2765009105205536,
+      "learning_rate": 0.00018001186666462927,
+      "loss": 0.3128,
+      "step": 4000
+    },
+    {
+      "epoch": 8.539445628997868,
+      "grad_norm": 0.2800111174583435,
+      "learning_rate": 0.00017993739706905162,
+      "loss": 0.3096,
+      "step": 4005
+    },
+    {
+      "epoch": 8.550106609808102,
+      "grad_norm": 0.30302369594573975,
+      "learning_rate": 0.00017986280447025209,
+      "loss": 0.3016,
+      "step": 4010
+    },
+    {
+      "epoch": 8.560767590618337,
+      "grad_norm": 0.2798007130622864,
+      "learning_rate": 0.0001797880889830096,
+      "loss": 0.3061,
+      "step": 4015
+    },
+    {
+      "epoch": 8.571428571428571,
+      "grad_norm": 0.29015523195266724,
+      "learning_rate": 0.00017971325072229226,
+      "loss": 0.3134,
+      "step": 4020
+    },
+    {
+      "epoch": 8.582089552238806,
+      "grad_norm": 0.3815457820892334,
+      "learning_rate": 0.00017963828980325697,
+      "loss": 0.3131,
+      "step": 4025
+    },
+    {
+      "epoch": 8.59275053304904,
+      "grad_norm": 0.2907319664955139,
+      "learning_rate": 0.00017956320634124944,
+      "loss": 0.314,
+      "step": 4030
+    },
+    {
+      "epoch": 8.603411513859275,
+      "grad_norm": 0.29612481594085693,
+      "learning_rate": 0.00017948800045180393,
+      "loss": 0.3168,
+      "step": 4035
+    },
+    {
+      "epoch": 8.614072494669509,
+      "grad_norm": 0.2797704339027405,
+      "learning_rate": 0.00017941267225064306,
+      "loss": 0.3144,
+      "step": 4040
+    },
+    {
+      "epoch": 8.624733475479744,
+      "grad_norm": 0.27811723947525024,
+      "learning_rate": 0.00017933722185367774,
+      "loss": 0.303,
+      "step": 4045
+    },
+    {
+      "epoch": 8.635394456289978,
+      "grad_norm": 0.2933618724346161,
+      "learning_rate": 0.00017926164937700676,
+      "loss": 0.3097,
+      "step": 4050
+    },
+    {
+      "epoch": 8.646055437100213,
+      "grad_norm": 0.282921701669693,
+      "learning_rate": 0.0001791859549369169,
+      "loss": 0.3104,
+      "step": 4055
+    },
+    {
+      "epoch": 8.656716417910447,
+      "grad_norm": 0.2758900225162506,
+      "learning_rate": 0.00017911013864988252,
+      "loss": 0.3108,
+      "step": 4060
+    },
+    {
+      "epoch": 8.667377398720681,
+      "grad_norm": 0.2904449999332428,
+      "learning_rate": 0.00017903420063256555,
+      "loss": 0.3209,
+      "step": 4065
+    },
+    {
+      "epoch": 8.678038379530918,
+      "grad_norm": 0.28849634528160095,
+      "learning_rate": 0.00017895814100181515,
+      "loss": 0.3055,
+      "step": 4070
+    },
+    {
+      "epoch": 8.688699360341152,
+      "grad_norm": 0.2709294259548187,
+      "learning_rate": 0.0001788819598746677,
+      "loss": 0.3167,
+      "step": 4075
+    },
+    {
+      "epoch": 8.699360341151387,
+      "grad_norm": 0.28200262784957886,
+      "learning_rate": 0.0001788056573683464,
+      "loss": 0.307,
+      "step": 4080
+    },
+    {
+      "epoch": 8.710021321961621,
+      "grad_norm": 0.27431854605674744,
+      "learning_rate": 0.00017872923360026137,
+      "loss": 0.3163,
+      "step": 4085
+    },
+    {
+      "epoch": 8.720682302771856,
+      "grad_norm": 0.28479164838790894,
+      "learning_rate": 0.00017865268868800925,
+      "loss": 0.3257,
+      "step": 4090
+    },
+    {
+      "epoch": 8.73134328358209,
+      "grad_norm": 0.2959545850753784,
+      "learning_rate": 0.00017857602274937308,
+      "loss": 0.3138,
+      "step": 4095
+    },
+    {
+      "epoch": 8.742004264392325,
+      "grad_norm": 0.270533949136734,
+      "learning_rate": 0.00017849923590232213,
+      "loss": 0.3182,
+      "step": 4100
+    },
+    {
+      "epoch": 8.752665245202559,
+      "grad_norm": 0.26438501477241516,
+      "learning_rate": 0.0001784223282650118,
+      "loss": 0.3084,
+      "step": 4105
+    },
+    {
+      "epoch": 8.763326226012794,
+      "grad_norm": 0.2890710234642029,
+      "learning_rate": 0.00017834529995578317,
+      "loss": 0.3093,
+      "step": 4110
+    },
+    {
+      "epoch": 8.773987206823028,
+      "grad_norm": 0.2725368142127991,
+      "learning_rate": 0.0001782681510931632,
+      "loss": 0.3185,
+      "step": 4115
+    },
+    {
+      "epoch": 8.784648187633262,
+      "grad_norm": 0.2648097276687622,
+      "learning_rate": 0.00017819088179586427,
+      "loss": 0.3126,
+      "step": 4120
+    },
+    {
+      "epoch": 8.795309168443497,
+      "grad_norm": 0.27868813276290894,
+      "learning_rate": 0.00017811349218278407,
+      "loss": 0.3157,
+      "step": 4125
+    },
+    {
+      "epoch": 8.805970149253731,
+      "grad_norm": 0.3133993446826935,
+      "learning_rate": 0.00017803598237300537,
+      "loss": 0.3128,
+      "step": 4130
+    },
+    {
+      "epoch": 8.816631130063966,
+      "grad_norm": 0.270416796207428,
+      "learning_rate": 0.00017795835248579606,
+      "loss": 0.3087,
+      "step": 4135
+    },
+    {
+      "epoch": 8.8272921108742,
+      "grad_norm": 0.299452543258667,
+      "learning_rate": 0.00017788060264060864,
+      "loss": 0.3126,
+      "step": 4140
+    },
+    {
+      "epoch": 8.837953091684435,
+      "grad_norm": 0.2789115607738495,
+      "learning_rate": 0.00017780273295708025,
+      "loss": 0.3149,
+      "step": 4145
+    },
+    {
+      "epoch": 8.84861407249467,
+      "grad_norm": 0.2616700828075409,
+      "learning_rate": 0.0001777247435550324,
+      "loss": 0.3151,
+      "step": 4150
+    },
+    {
+      "epoch": 8.859275053304904,
+      "grad_norm": 0.2998231053352356,
+      "learning_rate": 0.0001776466345544709,
+      "loss": 0.3143,
+      "step": 4155
+    },
+    {
+      "epoch": 8.869936034115138,
+      "grad_norm": 0.2851693034172058,
+      "learning_rate": 0.00017756840607558553,
+      "loss": 0.3153,
+      "step": 4160
+    },
+    {
+      "epoch": 8.880597014925373,
+      "grad_norm": 0.2862933874130249,
+      "learning_rate": 0.00017749005823874988,
+      "loss": 0.3124,
+      "step": 4165
+    },
+    {
+      "epoch": 8.891257995735607,
+      "grad_norm": 0.29242345690727234,
+      "learning_rate": 0.00017741159116452132,
+      "loss": 0.3137,
+      "step": 4170
+    },
+    {
+      "epoch": 8.901918976545842,
+      "grad_norm": 0.3226570188999176,
+      "learning_rate": 0.00017733300497364054,
+      "loss": 0.3168,
+      "step": 4175
+    },
+    {
+      "epoch": 8.912579957356076,
+      "grad_norm": 0.31018882989883423,
+      "learning_rate": 0.00017725429978703163,
+      "loss": 0.3162,
+      "step": 4180
+    },
+    {
+      "epoch": 8.92324093816631,
+      "grad_norm": 0.30581411719322205,
+      "learning_rate": 0.00017717547572580178,
+      "loss": 0.3166,
+      "step": 4185
+    },
+    {
+      "epoch": 8.933901918976545,
+      "grad_norm": 0.27954214811325073,
+      "learning_rate": 0.00017709653291124103,
+      "loss": 0.3175,
+      "step": 4190
+    },
+    {
+      "epoch": 8.94456289978678,
+      "grad_norm": 0.2803252041339874,
+      "learning_rate": 0.00017701747146482222,
+      "loss": 0.3228,
+      "step": 4195
+    },
+    {
+      "epoch": 8.955223880597014,
+      "grad_norm": 0.27694806456565857,
+      "learning_rate": 0.00017693829150820068,
+      "loss": 0.3152,
+      "step": 4200
+    },
+    {
+      "epoch": 8.96588486140725,
+      "grad_norm": 0.2755722403526306,
+      "learning_rate": 0.00017685899316321422,
+      "loss": 0.3105,
+      "step": 4205
+    },
+    {
+      "epoch": 8.976545842217483,
+      "grad_norm": 0.26287201046943665,
+      "learning_rate": 0.00017677957655188258,
+      "loss": 0.3146,
+      "step": 4210
+    },
+    {
+      "epoch": 8.98720682302772,
+      "grad_norm": 0.2679538428783417,
+      "learning_rate": 0.00017670004179640774,
+      "loss": 0.3196,
+      "step": 4215
+    },
+    {
+      "epoch": 8.997867803837954,
+      "grad_norm": 0.2998240292072296,
+      "learning_rate": 0.0001766203890191733,
+      "loss": 0.311,
+      "step": 4220
+    },
+    {
+      "epoch": 9.0,
+      "eval_loss": 0.556614875793457,
+      "eval_runtime": 377.56,
+      "eval_samples_per_second": 1.091,
+      "eval_steps_per_second": 1.091,
+      "step": 4221
+    },
+    {
+      "epoch": 9.008528784648188,
+      "grad_norm": 0.2680657207965851,
+      "learning_rate": 0.00017654061834274453,
+      "loss": 0.2787,
+      "step": 4225
+    },
+    {
+      "epoch": 9.019189765458423,
+      "grad_norm": 0.28186333179473877,
+      "learning_rate": 0.00017646072988986816,
+      "loss": 0.2668,
+      "step": 4230
+    },
+    {
+      "epoch": 9.029850746268657,
+      "grad_norm": 0.3159712255001068,
+      "learning_rate": 0.00017638072378347203,
+      "loss": 0.2681,
+      "step": 4235
+    },
+    {
+      "epoch": 9.040511727078892,
+      "grad_norm": 0.29439476132392883,
+      "learning_rate": 0.00017630060014666514,
+      "loss": 0.2644,
+      "step": 4240
+    },
+    {
+      "epoch": 9.051172707889126,
+      "grad_norm": 0.27110064029693604,
+      "learning_rate": 0.00017622035910273726,
+      "loss": 0.2645,
+      "step": 4245
+    },
+    {
+      "epoch": 9.06183368869936,
+      "grad_norm": 0.3253141939640045,
+      "learning_rate": 0.00017614000077515886,
+      "loss": 0.2668,
+      "step": 4250
+    },
+    {
+      "epoch": 9.072494669509595,
+      "grad_norm": 0.27271440625190735,
+      "learning_rate": 0.00017605952528758085,
+      "loss": 0.2636,
+      "step": 4255
+    },
+    {
+      "epoch": 9.08315565031983,
+      "grad_norm": 0.3024181127548218,
+      "learning_rate": 0.00017597893276383446,
+      "loss": 0.2651,
+      "step": 4260
+    },
+    {
+      "epoch": 9.093816631130064,
+      "grad_norm": 0.29704058170318604,
+      "learning_rate": 0.00017589822332793098,
+      "loss": 0.2705,
+      "step": 4265
+    },
+    {
+      "epoch": 9.104477611940299,
+      "grad_norm": 0.3102332055568695,
+      "learning_rate": 0.0001758173971040616,
+      "loss": 0.2645,
+      "step": 4270
+    },
+    {
+      "epoch": 9.115138592750533,
+      "grad_norm": 0.28398755192756653,
+      "learning_rate": 0.00017573645421659715,
+      "loss": 0.2695,
+      "step": 4275
+    },
+    {
+      "epoch": 9.125799573560768,
+      "grad_norm": 0.3188519775867462,
+      "learning_rate": 0.00017565539479008814,
+      "loss": 0.272,
+      "step": 4280
+    },
+    {
+      "epoch": 9.136460554371002,
+      "grad_norm": 0.30803632736206055,
+      "learning_rate": 0.0001755742189492643,
+      "loss": 0.268,
+      "step": 4285
+    },
+    {
+      "epoch": 9.147121535181236,
+      "grad_norm": 0.3042227327823639,
+      "learning_rate": 0.00017549292681903444,
+      "loss": 0.2659,
+      "step": 4290
+    },
+    {
+      "epoch": 9.157782515991471,
+      "grad_norm": 0.3055075407028198,
+      "learning_rate": 0.00017541151852448644,
+      "loss": 0.2705,
+      "step": 4295
+    },
+    {
+      "epoch": 9.168443496801705,
+      "grad_norm": 0.3084838092327118,
+      "learning_rate": 0.00017532999419088682,
+      "loss": 0.2711,
+      "step": 4300
+    },
+    {
+      "epoch": 9.17910447761194,
+      "grad_norm": 0.3110904395580292,
+      "learning_rate": 0.00017524835394368065,
+      "loss": 0.2678,
+      "step": 4305
+    },
+    {
+      "epoch": 9.189765458422174,
+      "grad_norm": 0.3138080835342407,
+      "learning_rate": 0.0001751665979084915,
+      "loss": 0.2715,
+      "step": 4310
+    },
+    {
+      "epoch": 9.200426439232409,
+      "grad_norm": 0.2787773609161377,
+      "learning_rate": 0.00017508472621112093,
+      "loss": 0.2764,
+      "step": 4315
+    },
+    {
+      "epoch": 9.211087420042643,
+      "grad_norm": 0.31073546409606934,
+      "learning_rate": 0.0001750027389775486,
+      "loss": 0.2745,
+      "step": 4320
+    },
+    {
+      "epoch": 9.221748400852878,
+      "grad_norm": 0.3100415766239166,
+      "learning_rate": 0.00017492063633393188,
+      "loss": 0.2731,
+      "step": 4325
+    },
+    {
+      "epoch": 9.232409381663112,
+      "grad_norm": 0.300081342458725,
+      "learning_rate": 0.00017483841840660577,
+      "loss": 0.2711,
+      "step": 4330
+    },
+    {
+      "epoch": 9.243070362473347,
+      "grad_norm": 0.31163203716278076,
+      "learning_rate": 0.0001747560853220826,
+      "loss": 0.2786,
+      "step": 4335
+    },
+    {
+      "epoch": 9.253731343283581,
+      "grad_norm": 0.33607375621795654,
+      "learning_rate": 0.00017467363720705204,
+      "loss": 0.2728,
+      "step": 4340
+    },
+    {
+      "epoch": 9.264392324093816,
+      "grad_norm": 0.300729900598526,
+      "learning_rate": 0.0001745910741883806,
+      "loss": 0.2749,
+      "step": 4345
+    },
+    {
+      "epoch": 9.275053304904052,
+      "grad_norm": 0.3036794364452362,
+      "learning_rate": 0.00017450839639311162,
+      "loss": 0.2726,
+      "step": 4350
+    },
+    {
+      "epoch": 9.285714285714286,
+      "grad_norm": 0.32798221707344055,
+      "learning_rate": 0.00017442560394846516,
+      "loss": 0.2752,
+      "step": 4355
+    },
+    {
+      "epoch": 9.296375266524521,
+      "grad_norm": 0.2973875105381012,
+      "learning_rate": 0.00017434269698183763,
+      "loss": 0.2743,
+      "step": 4360
+    },
+    {
+      "epoch": 9.307036247334755,
+      "grad_norm": 0.3339863717556,
+      "learning_rate": 0.00017425967562080167,
+      "loss": 0.2766,
+      "step": 4365
+    },
+    {
+      "epoch": 9.31769722814499,
+      "grad_norm": 0.30738508701324463,
+      "learning_rate": 0.00017417653999310585,
+      "loss": 0.2728,
+      "step": 4370
+    },
+    {
+      "epoch": 9.328358208955224,
+      "grad_norm": 0.3430582284927368,
+      "learning_rate": 0.0001740932902266747,
+      "loss": 0.2744,
+      "step": 4375
+    },
+    {
+      "epoch": 9.339019189765459,
+      "grad_norm": 0.2887689769268036,
+      "learning_rate": 0.00017400992644960842,
+      "loss": 0.2772,
+      "step": 4380
+    },
+    {
+      "epoch": 9.349680170575693,
+      "grad_norm": 0.3249075412750244,
+      "learning_rate": 0.0001739264487901824,
+      "loss": 0.2757,
+      "step": 4385
+    },
+    {
+      "epoch": 9.360341151385928,
+      "grad_norm": 0.31958818435668945,
+      "learning_rate": 0.00017384285737684753,
+      "loss": 0.2744,
+      "step": 4390
+    },
+    {
+      "epoch": 9.371002132196162,
+      "grad_norm": 0.31824401021003723,
+      "learning_rate": 0.0001737591523382296,
+      "loss": 0.2809,
+      "step": 4395
+    },
+    {
+      "epoch": 9.381663113006397,
+      "grad_norm": 0.3125913143157959,
+      "learning_rate": 0.00017367533380312924,
+      "loss": 0.276,
+      "step": 4400
+    },
+    {
+      "epoch": 9.392324093816631,
+      "grad_norm": 0.32215094566345215,
+      "learning_rate": 0.0001735914019005218,
+      "loss": 0.2746,
+      "step": 4405
+    },
+    {
+      "epoch": 9.402985074626866,
+      "grad_norm": 0.3145129382610321,
+      "learning_rate": 0.00017350735675955697,
+      "loss": 0.2818,
+      "step": 4410
+    },
+    {
+      "epoch": 9.4136460554371,
+      "grad_norm": 0.3180083930492401,
+      "learning_rate": 0.0001734231985095588,
+      "loss": 0.2782,
+      "step": 4415
+    },
+    {
+      "epoch": 9.424307036247335,
+      "grad_norm": 0.307829350233078,
+      "learning_rate": 0.00017333892728002527,
+      "loss": 0.2744,
+      "step": 4420
+    },
+    {
+      "epoch": 9.43496801705757,
+      "grad_norm": 0.3098660111427307,
+      "learning_rate": 0.00017325454320062832,
+      "loss": 0.2794,
+      "step": 4425
+    },
+    {
+      "epoch": 9.445628997867804,
+      "grad_norm": 0.2991037666797638,
+      "learning_rate": 0.0001731700464012134,
+      "loss": 0.2778,
+      "step": 4430
+    },
+    {
+      "epoch": 9.456289978678038,
+      "grad_norm": 0.3197588622570038,
+      "learning_rate": 0.0001730854370117996,
+      "loss": 0.2764,
+      "step": 4435
+    },
+    {
+      "epoch": 9.466950959488273,
+      "grad_norm": 0.31818678975105286,
+      "learning_rate": 0.00017300071516257904,
+      "loss": 0.2754,
+      "step": 4440
+    },
+    {
+      "epoch": 9.477611940298507,
+      "grad_norm": 0.3030422031879425,
+      "learning_rate": 0.000172915880983917,
+      "loss": 0.2795,
+      "step": 4445
+    },
+    {
+      "epoch": 9.488272921108742,
+      "grad_norm": 0.304565966129303,
+      "learning_rate": 0.00017283093460635166,
+      "loss": 0.2837,
+      "step": 4450
+    },
+    {
+      "epoch": 9.498933901918976,
+      "grad_norm": 0.3034186363220215,
+      "learning_rate": 0.00017274587616059376,
+      "loss": 0.2768,
+      "step": 4455
+    },
+    {
+      "epoch": 9.50959488272921,
+      "grad_norm": 0.30095112323760986,
+      "learning_rate": 0.00017266070577752647,
+      "loss": 0.2786,
+      "step": 4460
+    },
+    {
+      "epoch": 9.520255863539445,
+      "grad_norm": 0.3102254271507263,
+      "learning_rate": 0.0001725754235882053,
+      "loss": 0.2776,
+      "step": 4465
+    },
+    {
+      "epoch": 9.53091684434968,
+      "grad_norm": 0.2985278367996216,
+      "learning_rate": 0.00017249002972385765,
+      "loss": 0.2784,
+      "step": 4470
+    },
+    {
+      "epoch": 9.541577825159914,
+      "grad_norm": 0.32831713557243347,
+      "learning_rate": 0.00017240452431588294,
+      "loss": 0.2869,
+      "step": 4475
+    },
+    {
+      "epoch": 9.552238805970148,
+      "grad_norm": 0.3177868127822876,
+      "learning_rate": 0.0001723189074958521,
+      "loss": 0.2784,
+      "step": 4480
+    },
+    {
+      "epoch": 9.562899786780385,
+      "grad_norm": 0.3071228265762329,
+      "learning_rate": 0.00017223317939550753,
+      "loss": 0.2804,
+      "step": 4485
+    },
+    {
+      "epoch": 9.57356076759062,
+      "grad_norm": 0.3183000981807709,
+      "learning_rate": 0.00017214734014676288,
+      "loss": 0.2799,
+      "step": 4490
+    },
+    {
+      "epoch": 9.584221748400854,
+      "grad_norm": 0.33166825771331787,
+      "learning_rate": 0.00017206138988170281,
+      "loss": 0.2828,
+      "step": 4495
+    },
+    {
+      "epoch": 9.594882729211088,
+      "grad_norm": 0.3132229149341583,
+      "learning_rate": 0.0001719753287325828,
+      "loss": 0.279,
+      "step": 4500
+    },
+    {
+      "epoch": 9.605543710021323,
+      "grad_norm": 0.3281535506248474,
+      "learning_rate": 0.00017188915683182896,
+      "loss": 0.2767,
+      "step": 4505
+    },
+    {
+      "epoch": 9.616204690831557,
+      "grad_norm": 0.31389063596725464,
+      "learning_rate": 0.00017180287431203781,
+      "loss": 0.2851,
+      "step": 4510
+    },
+    {
+      "epoch": 9.626865671641792,
+      "grad_norm": 0.315807580947876,
+      "learning_rate": 0.00017171648130597612,
+      "loss": 0.2816,
+      "step": 4515
+    },
+    {
+      "epoch": 9.637526652452026,
+      "grad_norm": 0.3103027939796448,
+      "learning_rate": 0.0001716299779465806,
+      "loss": 0.2797,
+      "step": 4520
+    },
+    {
+      "epoch": 9.64818763326226,
+      "grad_norm": 0.3018797039985657,
+      "learning_rate": 0.00017154336436695785,
+      "loss": 0.2827,
+      "step": 4525
+    },
+    {
+      "epoch": 9.658848614072495,
+      "grad_norm": 0.3306185007095337,
+      "learning_rate": 0.00017145664070038406,
+      "loss": 0.2861,
+      "step": 4530
+    },
+    {
+      "epoch": 9.66950959488273,
+      "grad_norm": 0.3151242434978485,
+      "learning_rate": 0.0001713698070803047,
+      "loss": 0.2855,
+      "step": 4535
+    },
+    {
+      "epoch": 9.680170575692964,
+      "grad_norm": 0.3073995113372803,
+      "learning_rate": 0.0001712828636403346,
+      "loss": 0.2825,
+      "step": 4540
+    },
+    {
+      "epoch": 9.690831556503198,
+      "grad_norm": 0.31615933775901794,
+      "learning_rate": 0.00017119581051425742,
+      "loss": 0.2791,
+      "step": 4545
+    },
+    {
+      "epoch": 9.701492537313433,
+      "grad_norm": 0.3101312816143036,
+      "learning_rate": 0.0001711086478360257,
+      "loss": 0.287,
+      "step": 4550
+    },
+    {
+      "epoch": 9.712153518123667,
+      "grad_norm": 0.3094468116760254,
+      "learning_rate": 0.00017102137573976058,
+      "loss": 0.2804,
+      "step": 4555
+    },
+    {
+      "epoch": 9.722814498933902,
+      "grad_norm": 0.33349186182022095,
+      "learning_rate": 0.00017093399435975142,
+      "loss": 0.2773,
+      "step": 4560
+    },
+    {
+      "epoch": 9.733475479744136,
+      "grad_norm": 0.2954055368900299,
+      "learning_rate": 0.00017084650383045587,
+      "loss": 0.2762,
+      "step": 4565
+    },
+    {
+      "epoch": 9.74413646055437,
+      "grad_norm": 0.2962237000465393,
+      "learning_rate": 0.0001707589042864995,
+      "loss": 0.2861,
+      "step": 4570
+    },
+    {
+      "epoch": 9.754797441364605,
+      "grad_norm": 0.3323478698730469,
+      "learning_rate": 0.00017067119586267556,
+      "loss": 0.2861,
+      "step": 4575
+    },
+    {
+      "epoch": 9.76545842217484,
+      "grad_norm": 0.2926410138607025,
+      "learning_rate": 0.000170583378693945,
+      "loss": 0.2817,
+      "step": 4580
+    },
+    {
+      "epoch": 9.776119402985074,
+      "grad_norm": 0.3227819502353668,
+      "learning_rate": 0.0001704954529154359,
+      "loss": 0.2884,
+      "step": 4585
+    },
+    {
+      "epoch": 9.786780383795309,
+      "grad_norm": 0.32089999318122864,
+      "learning_rate": 0.00017040741866244358,
+      "loss": 0.2881,
+      "step": 4590
+    },
+    {
+      "epoch": 9.797441364605543,
+      "grad_norm": 0.3188937306404114,
+      "learning_rate": 0.0001703192760704303,
+      "loss": 0.2855,
+      "step": 4595
+    },
+    {
+      "epoch": 9.808102345415778,
+      "grad_norm": 0.3184082508087158,
+      "learning_rate": 0.00017023102527502496,
+      "loss": 0.2842,
+      "step": 4600
+    },
+    {
+      "epoch": 9.818763326226012,
+      "grad_norm": 0.2914822995662689,
+      "learning_rate": 0.00017014266641202292,
+      "loss": 0.274,
+      "step": 4605
+    },
+    {
+      "epoch": 9.829424307036247,
+      "grad_norm": 0.33117881417274475,
+      "learning_rate": 0.00017005419961738593,
+      "loss": 0.2888,
+      "step": 4610
+    },
+    {
+      "epoch": 9.840085287846481,
+      "grad_norm": 0.32017573714256287,
+      "learning_rate": 0.0001699656250272418,
+      "loss": 0.2785,
+      "step": 4615
+    },
+    {
+      "epoch": 9.850746268656717,
+      "grad_norm": 0.29259586334228516,
+      "learning_rate": 0.00016987694277788417,
+      "loss": 0.2888,
+      "step": 4620
+    },
+    {
+      "epoch": 9.86140724946695,
+      "grad_norm": 0.29314401745796204,
+      "learning_rate": 0.00016978815300577234,
+      "loss": 0.2826,
+      "step": 4625
+    },
+    {
+      "epoch": 9.872068230277186,
+      "grad_norm": 0.3312009572982788,
+      "learning_rate": 0.00016969925584753108,
+      "loss": 0.2828,
+      "step": 4630
+    },
+    {
+      "epoch": 9.88272921108742,
+      "grad_norm": 0.31798672676086426,
+      "learning_rate": 0.00016961025143995037,
+      "loss": 0.2777,
+      "step": 4635
+    },
+    {
+      "epoch": 9.893390191897655,
+      "grad_norm": 0.2987801134586334,
+      "learning_rate": 0.00016952113991998527,
+      "loss": 0.2818,
+      "step": 4640
+    },
+    {
+      "epoch": 9.90405117270789,
+      "grad_norm": 0.3148316442966461,
+      "learning_rate": 0.00016943192142475564,
+      "loss": 0.2853,
+      "step": 4645
+    },
+    {
+      "epoch": 9.914712153518124,
+      "grad_norm": 0.3207818269729614,
+      "learning_rate": 0.00016934259609154592,
+      "loss": 0.2835,
+      "step": 4650
+    },
+    {
+      "epoch": 9.925373134328359,
+      "grad_norm": 0.29595887660980225,
+      "learning_rate": 0.000169253164057805,
+      "loss": 0.2845,
+      "step": 4655
+    },
+    {
+      "epoch": 9.936034115138593,
+      "grad_norm": 0.2958875894546509,
+      "learning_rate": 0.00016916362546114585,
+      "loss": 0.2793,
+      "step": 4660
+    },
+    {
+      "epoch": 9.946695095948828,
+      "grad_norm": 0.2999938726425171,
+      "learning_rate": 0.00016907398043934557,
+      "loss": 0.2794,
+      "step": 4665
+    },
+    {
+      "epoch": 9.957356076759062,
+      "grad_norm": 0.29154959321022034,
+      "learning_rate": 0.00016898422913034486,
+      "loss": 0.2891,
+      "step": 4670
+    },
+    {
+      "epoch": 9.968017057569297,
+      "grad_norm": 0.30298835039138794,
+      "learning_rate": 0.0001688943716722481,
+      "loss": 0.2859,
+      "step": 4675
+    },
+    {
+      "epoch": 9.978678038379531,
+      "grad_norm": 0.3251824975013733,
+      "learning_rate": 0.00016880440820332291,
+      "loss": 0.283,
+      "step": 4680
+    },
+    {
+      "epoch": 9.989339019189766,
+      "grad_norm": 0.29153597354888916,
+      "learning_rate": 0.0001687143388620001,
+      "loss": 0.2871,
+      "step": 4685
+    },
+    {
+      "epoch": 10.0,
+      "grad_norm": 0.3233014643192291,
+      "learning_rate": 0.0001686241637868734,
+      "loss": 0.2853,
+      "step": 4690
+    },
+    {
+      "epoch": 10.0,
+      "eval_loss": 0.5920408368110657,
+      "eval_runtime": 377.5422,
+      "eval_samples_per_second": 1.091,
+      "eval_steps_per_second": 1.091,
+      "step": 4690
+    },
+    {
+      "epoch": 10.0,
+      "step": 4690,
+      "total_flos": 3.4794514845867704e+18,
+      "train_loss": 0.46453510172077334,
+      "train_runtime": 112907.655,
+      "train_samples_per_second": 0.997,
+      "train_steps_per_second": 0.125
+    }
+  ],
+  "logging_steps": 5,
+  "max_steps": 14070,
+  "num_input_tokens_seen": 0,
+  "num_train_epochs": 30,
+  "save_steps": 50,
+  "stateful_callbacks": {
+    "EarlyStoppingCallback": {
+      "args": {
+        "early_stopping_patience": 5,
+        "early_stopping_threshold": 0.0
+      },
+      "attributes": {
+        "early_stopping_patience_counter": 0
+      }
+    },
+    "TrainerControl": {
+      "args": {
+        "should_epoch_stop": false,
+        "should_evaluate": false,
+        "should_log": false,
+        "should_save": true,
+        "should_training_stop": true
+      },
+      "attributes": {}
+    }
+  },
+  "total_flos": 3.4794514845867704e+18,
+  "train_batch_size": 1,
+  "trial_name": null,
+  "trial_params": null
+}