hansken_human_hql_v3 / trainer_state.json
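This is a standard Hugging Face Transformers trainer_state.json: the top-level fields record the best evaluation metric and checkpoint, and log_history interleaves per-step training records (loss, grad_norm, learning_rate) with per-epoch evaluation records (eval_loss, eval_runtime, step). The Python sketch below shows one way to read that structure; it assumes the complete file has been downloaded locally as trainer_state.json (the listing here is truncated), and the path and print formatting are illustrative, not part of the log.

import json

# Load the full trainer state (assumed local path; adjust as needed).
with open("trainer_state.json") as f:
    state = json.load(f)

print("best_metric:", state["best_metric"])
print("best_model_checkpoint:", state["best_model_checkpoint"])

# Evaluation entries in log_history carry "eval_loss"; training entries carry
# "loss", "grad_norm" and "learning_rate" instead.
for entry in state["log_history"]:
    if "eval_loss" in entry:
        print(f"epoch {entry['epoch']:.0f}  step {entry['step']}  "
              f"eval_loss {entry['eval_loss']:.4f}")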
{
"best_metric": 0.5016890168190002,
"best_model_checkpoint": "data/hansken_human_hql_v3/checkpoint-2345",
"epoch": 10.0,
"eval_steps": 500,
"global_step": 4690,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.0021321961620469083,
"grad_norm": 1.0516366958618164,
"learning_rate": 1.4214641080312722e-07,
"loss": 1.9389,
"step": 1
},
{
"epoch": 0.010660980810234541,
"grad_norm": 0.9856139421463013,
"learning_rate": 7.107320540156362e-07,
"loss": 2.0398,
"step": 5
},
{
"epoch": 0.021321961620469083,
"grad_norm": 1.0568891763687134,
"learning_rate": 1.4214641080312723e-06,
"loss": 2.0618,
"step": 10
},
{
"epoch": 0.031982942430703626,
"grad_norm": 0.9998515844345093,
"learning_rate": 2.132196162046908e-06,
"loss": 2.0543,
"step": 15
},
{
"epoch": 0.042643923240938165,
"grad_norm": 1.004911184310913,
"learning_rate": 2.8429282160625447e-06,
"loss": 1.9997,
"step": 20
},
{
"epoch": 0.053304904051172705,
"grad_norm": 0.9931671619415283,
"learning_rate": 3.553660270078181e-06,
"loss": 1.9913,
"step": 25
},
{
"epoch": 0.06396588486140725,
"grad_norm": 0.9859012365341187,
"learning_rate": 4.264392324093816e-06,
"loss": 1.9729,
"step": 30
},
{
"epoch": 0.07462686567164178,
"grad_norm": 1.0391347408294678,
"learning_rate": 4.975124378109453e-06,
"loss": 1.9434,
"step": 35
},
{
"epoch": 0.08528784648187633,
"grad_norm": 0.8275197744369507,
"learning_rate": 5.685856432125089e-06,
"loss": 1.9092,
"step": 40
},
{
"epoch": 0.09594882729211088,
"grad_norm": 0.7102633714675903,
"learning_rate": 6.396588486140726e-06,
"loss": 1.8488,
"step": 45
},
{
"epoch": 0.10660980810234541,
"grad_norm": 0.6521381735801697,
"learning_rate": 7.107320540156362e-06,
"loss": 1.8673,
"step": 50
},
{
"epoch": 0.11727078891257996,
"grad_norm": 0.5477872490882874,
"learning_rate": 7.818052594171997e-06,
"loss": 1.7758,
"step": 55
},
{
"epoch": 0.1279317697228145,
"grad_norm": 0.49889788031578064,
"learning_rate": 8.528784648187633e-06,
"loss": 1.7453,
"step": 60
},
{
"epoch": 0.13859275053304904,
"grad_norm": 0.5726047158241272,
"learning_rate": 9.23951670220327e-06,
"loss": 1.7635,
"step": 65
},
{
"epoch": 0.14925373134328357,
"grad_norm": 0.4760012924671173,
"learning_rate": 9.950248756218906e-06,
"loss": 1.7027,
"step": 70
},
{
"epoch": 0.15991471215351813,
"grad_norm": 0.4642033278942108,
"learning_rate": 1.0660980810234541e-05,
"loss": 1.7086,
"step": 75
},
{
"epoch": 0.17057569296375266,
"grad_norm": 0.42560943961143494,
"learning_rate": 1.1371712864250179e-05,
"loss": 1.638,
"step": 80
},
{
"epoch": 0.1812366737739872,
"grad_norm": 0.4680778384208679,
"learning_rate": 1.2082444918265814e-05,
"loss": 1.6029,
"step": 85
},
{
"epoch": 0.19189765458422176,
"grad_norm": 0.4264519214630127,
"learning_rate": 1.2793176972281452e-05,
"loss": 1.4899,
"step": 90
},
{
"epoch": 0.2025586353944563,
"grad_norm": 0.41101664304733276,
"learning_rate": 1.3503909026297087e-05,
"loss": 1.4997,
"step": 95
},
{
"epoch": 0.21321961620469082,
"grad_norm": 0.34257784485816956,
"learning_rate": 1.4214641080312725e-05,
"loss": 1.4734,
"step": 100
},
{
"epoch": 0.22388059701492538,
"grad_norm": 0.34164702892303467,
"learning_rate": 1.4925373134328357e-05,
"loss": 1.4341,
"step": 105
},
{
"epoch": 0.2345415778251599,
"grad_norm": 0.3285938501358032,
"learning_rate": 1.5636105188343994e-05,
"loss": 1.4293,
"step": 110
},
{
"epoch": 0.24520255863539445,
"grad_norm": 0.33409905433654785,
"learning_rate": 1.634683724235963e-05,
"loss": 1.3792,
"step": 115
},
{
"epoch": 0.255863539445629,
"grad_norm": 0.3385579288005829,
"learning_rate": 1.7057569296375266e-05,
"loss": 1.3811,
"step": 120
},
{
"epoch": 0.26652452025586354,
"grad_norm": 0.35849225521087646,
"learning_rate": 1.7768301350390903e-05,
"loss": 1.3217,
"step": 125
},
{
"epoch": 0.2771855010660981,
"grad_norm": 0.3905642330646515,
"learning_rate": 1.847903340440654e-05,
"loss": 1.2792,
"step": 130
},
{
"epoch": 0.2878464818763326,
"grad_norm": 0.45816823840141296,
"learning_rate": 1.9189765458422178e-05,
"loss": 1.268,
"step": 135
},
{
"epoch": 0.29850746268656714,
"grad_norm": 0.42841047048568726,
"learning_rate": 1.990049751243781e-05,
"loss": 1.1999,
"step": 140
},
{
"epoch": 0.3091684434968017,
"grad_norm": 0.42461100220680237,
"learning_rate": 2.061122956645345e-05,
"loss": 1.1908,
"step": 145
},
{
"epoch": 0.31982942430703626,
"grad_norm": 0.3846851885318756,
"learning_rate": 2.1321961620469083e-05,
"loss": 1.0417,
"step": 150
},
{
"epoch": 0.3304904051172708,
"grad_norm": 0.35793304443359375,
"learning_rate": 2.203269367448472e-05,
"loss": 1.0804,
"step": 155
},
{
"epoch": 0.3411513859275053,
"grad_norm": 0.3422033488750458,
"learning_rate": 2.2743425728500358e-05,
"loss": 1.0433,
"step": 160
},
{
"epoch": 0.35181236673773986,
"grad_norm": 0.34404265880584717,
"learning_rate": 2.345415778251599e-05,
"loss": 1.0823,
"step": 165
},
{
"epoch": 0.3624733475479744,
"grad_norm": 0.31916388869285583,
"learning_rate": 2.416488983653163e-05,
"loss": 1.001,
"step": 170
},
{
"epoch": 0.373134328358209,
"grad_norm": 0.33065563440322876,
"learning_rate": 2.4875621890547266e-05,
"loss": 0.9698,
"step": 175
},
{
"epoch": 0.3837953091684435,
"grad_norm": 0.34518882632255554,
"learning_rate": 2.5586353944562904e-05,
"loss": 0.9731,
"step": 180
},
{
"epoch": 0.39445628997867804,
"grad_norm": 0.31844091415405273,
"learning_rate": 2.6297085998578534e-05,
"loss": 0.9293,
"step": 185
},
{
"epoch": 0.4051172707889126,
"grad_norm": 0.32537004351615906,
"learning_rate": 2.7007818052594175e-05,
"loss": 0.9306,
"step": 190
},
{
"epoch": 0.4157782515991471,
"grad_norm": 0.38439956307411194,
"learning_rate": 2.771855010660981e-05,
"loss": 0.8915,
"step": 195
},
{
"epoch": 0.42643923240938164,
"grad_norm": 0.3455168306827545,
"learning_rate": 2.842928216062545e-05,
"loss": 0.903,
"step": 200
},
{
"epoch": 0.43710021321961623,
"grad_norm": 0.36652979254722595,
"learning_rate": 2.914001421464108e-05,
"loss": 0.8468,
"step": 205
},
{
"epoch": 0.44776119402985076,
"grad_norm": 0.35580819845199585,
"learning_rate": 2.9850746268656714e-05,
"loss": 0.8467,
"step": 210
},
{
"epoch": 0.4584221748400853,
"grad_norm": 0.3748577833175659,
"learning_rate": 3.056147832267235e-05,
"loss": 0.8037,
"step": 215
},
{
"epoch": 0.4690831556503198,
"grad_norm": 0.3399907052516937,
"learning_rate": 3.127221037668799e-05,
"loss": 0.8525,
"step": 220
},
{
"epoch": 0.47974413646055436,
"grad_norm": 0.39041897654533386,
"learning_rate": 3.1982942430703626e-05,
"loss": 0.8672,
"step": 225
},
{
"epoch": 0.4904051172707889,
"grad_norm": 0.37930938601493835,
"learning_rate": 3.269367448471926e-05,
"loss": 0.7967,
"step": 230
},
{
"epoch": 0.5010660980810234,
"grad_norm": 0.4009639024734497,
"learning_rate": 3.34044065387349e-05,
"loss": 0.8134,
"step": 235
},
{
"epoch": 0.511727078891258,
"grad_norm": 0.4189032018184662,
"learning_rate": 3.411513859275053e-05,
"loss": 0.791,
"step": 240
},
{
"epoch": 0.5223880597014925,
"grad_norm": 0.3848344385623932,
"learning_rate": 3.4825870646766175e-05,
"loss": 0.8183,
"step": 245
},
{
"epoch": 0.5330490405117271,
"grad_norm": 0.41223597526550293,
"learning_rate": 3.5536602700781806e-05,
"loss": 0.7668,
"step": 250
},
{
"epoch": 0.5437100213219617,
"grad_norm": 0.4024832844734192,
"learning_rate": 3.624733475479744e-05,
"loss": 0.7819,
"step": 255
},
{
"epoch": 0.5543710021321961,
"grad_norm": 0.3832787871360779,
"learning_rate": 3.695806680881308e-05,
"loss": 0.7693,
"step": 260
},
{
"epoch": 0.5650319829424307,
"grad_norm": 0.4266470670700073,
"learning_rate": 3.766879886282871e-05,
"loss": 0.795,
"step": 265
},
{
"epoch": 0.5756929637526652,
"grad_norm": 0.47055262327194214,
"learning_rate": 3.8379530916844355e-05,
"loss": 0.7752,
"step": 270
},
{
"epoch": 0.5863539445628998,
"grad_norm": 0.420669823884964,
"learning_rate": 3.9090262970859986e-05,
"loss": 0.7691,
"step": 275
},
{
"epoch": 0.5970149253731343,
"grad_norm": 0.4140627384185791,
"learning_rate": 3.980099502487562e-05,
"loss": 0.7385,
"step": 280
},
{
"epoch": 0.6076759061833689,
"grad_norm": 0.4674805998802185,
"learning_rate": 4.051172707889126e-05,
"loss": 0.7668,
"step": 285
},
{
"epoch": 0.6183368869936035,
"grad_norm": 0.45881038904190063,
"learning_rate": 4.12224591329069e-05,
"loss": 0.7777,
"step": 290
},
{
"epoch": 0.6289978678038379,
"grad_norm": 0.4218686819076538,
"learning_rate": 4.1933191186922535e-05,
"loss": 0.7106,
"step": 295
},
{
"epoch": 0.6396588486140725,
"grad_norm": 0.43359580636024475,
"learning_rate": 4.2643923240938166e-05,
"loss": 0.7076,
"step": 300
},
{
"epoch": 0.650319829424307,
"grad_norm": 0.42106226086616516,
"learning_rate": 4.33546552949538e-05,
"loss": 0.7353,
"step": 305
},
{
"epoch": 0.6609808102345416,
"grad_norm": 0.4189695715904236,
"learning_rate": 4.406538734896944e-05,
"loss": 0.698,
"step": 310
},
{
"epoch": 0.6716417910447762,
"grad_norm": 0.45314905047416687,
"learning_rate": 4.477611940298508e-05,
"loss": 0.7356,
"step": 315
},
{
"epoch": 0.6823027718550106,
"grad_norm": 0.46034571528434753,
"learning_rate": 4.5486851457000715e-05,
"loss": 0.7397,
"step": 320
},
{
"epoch": 0.6929637526652452,
"grad_norm": 0.44907087087631226,
"learning_rate": 4.619758351101635e-05,
"loss": 0.7326,
"step": 325
},
{
"epoch": 0.7036247334754797,
"grad_norm": 0.46258679032325745,
"learning_rate": 4.690831556503198e-05,
"loss": 0.6663,
"step": 330
},
{
"epoch": 0.7142857142857143,
"grad_norm": 0.446308434009552,
"learning_rate": 4.761904761904762e-05,
"loss": 0.6941,
"step": 335
},
{
"epoch": 0.7249466950959488,
"grad_norm": 0.40378594398498535,
"learning_rate": 4.832977967306326e-05,
"loss": 0.7174,
"step": 340
},
{
"epoch": 0.7356076759061834,
"grad_norm": 0.39717379212379456,
"learning_rate": 4.904051172707889e-05,
"loss": 0.659,
"step": 345
},
{
"epoch": 0.746268656716418,
"grad_norm": 0.4855833053588867,
"learning_rate": 4.975124378109453e-05,
"loss": 0.6762,
"step": 350
},
{
"epoch": 0.7569296375266524,
"grad_norm": 0.47973328828811646,
"learning_rate": 5.046197583511016e-05,
"loss": 0.6782,
"step": 355
},
{
"epoch": 0.767590618336887,
"grad_norm": 0.4429256319999695,
"learning_rate": 5.117270788912581e-05,
"loss": 0.6634,
"step": 360
},
{
"epoch": 0.7782515991471215,
"grad_norm": 0.44692516326904297,
"learning_rate": 5.1883439943141444e-05,
"loss": 0.6792,
"step": 365
},
{
"epoch": 0.7889125799573561,
"grad_norm": 0.4430787265300751,
"learning_rate": 5.259417199715707e-05,
"loss": 0.6416,
"step": 370
},
{
"epoch": 0.7995735607675906,
"grad_norm": 0.4461454451084137,
"learning_rate": 5.330490405117271e-05,
"loss": 0.7013,
"step": 375
},
{
"epoch": 0.8102345415778252,
"grad_norm": 0.526995837688446,
"learning_rate": 5.401563610518835e-05,
"loss": 0.6396,
"step": 380
},
{
"epoch": 0.8208955223880597,
"grad_norm": 0.4485580623149872,
"learning_rate": 5.472636815920398e-05,
"loss": 0.6307,
"step": 385
},
{
"epoch": 0.8315565031982942,
"grad_norm": 0.45416155457496643,
"learning_rate": 5.543710021321962e-05,
"loss": 0.6361,
"step": 390
},
{
"epoch": 0.8422174840085288,
"grad_norm": 0.4746207296848297,
"learning_rate": 5.6147832267235255e-05,
"loss": 0.641,
"step": 395
},
{
"epoch": 0.8528784648187633,
"grad_norm": 0.4466172456741333,
"learning_rate": 5.68585643212509e-05,
"loss": 0.643,
"step": 400
},
{
"epoch": 0.8635394456289979,
"grad_norm": 0.46807265281677246,
"learning_rate": 5.756929637526652e-05,
"loss": 0.6258,
"step": 405
},
{
"epoch": 0.8742004264392325,
"grad_norm": 0.46169164776802063,
"learning_rate": 5.828002842928216e-05,
"loss": 0.6212,
"step": 410
},
{
"epoch": 0.8848614072494669,
"grad_norm": 0.47564077377319336,
"learning_rate": 5.8990760483297804e-05,
"loss": 0.6369,
"step": 415
},
{
"epoch": 0.8955223880597015,
"grad_norm": 0.4582447409629822,
"learning_rate": 5.970149253731343e-05,
"loss": 0.6086,
"step": 420
},
{
"epoch": 0.906183368869936,
"grad_norm": 0.5161389708518982,
"learning_rate": 6.041222459132907e-05,
"loss": 0.6529,
"step": 425
},
{
"epoch": 0.9168443496801706,
"grad_norm": 0.47045719623565674,
"learning_rate": 6.11229566453447e-05,
"loss": 0.6119,
"step": 430
},
{
"epoch": 0.9275053304904051,
"grad_norm": 0.5950572490692139,
"learning_rate": 6.183368869936035e-05,
"loss": 0.6259,
"step": 435
},
{
"epoch": 0.9381663113006397,
"grad_norm": 0.5470284223556519,
"learning_rate": 6.254442075337598e-05,
"loss": 0.6282,
"step": 440
},
{
"epoch": 0.9488272921108742,
"grad_norm": 0.5164011716842651,
"learning_rate": 6.325515280739162e-05,
"loss": 0.6399,
"step": 445
},
{
"epoch": 0.9594882729211087,
"grad_norm": 0.4264001250267029,
"learning_rate": 6.396588486140725e-05,
"loss": 0.6405,
"step": 450
},
{
"epoch": 0.9701492537313433,
"grad_norm": 0.4878412187099457,
"learning_rate": 6.46766169154229e-05,
"loss": 0.6548,
"step": 455
},
{
"epoch": 0.9808102345415778,
"grad_norm": 0.47677186131477356,
"learning_rate": 6.538734896943853e-05,
"loss": 0.6506,
"step": 460
},
{
"epoch": 0.9914712153518124,
"grad_norm": 0.4687974452972412,
"learning_rate": 6.609808102345416e-05,
"loss": 0.6267,
"step": 465
},
{
"epoch": 1.0,
"eval_loss": 0.6078405976295471,
"eval_runtime": 377.5565,
"eval_samples_per_second": 1.091,
"eval_steps_per_second": 1.091,
"step": 469
},
{
"epoch": 1.0021321961620469,
"grad_norm": 0.4401796758174896,
"learning_rate": 6.68088130774698e-05,
"loss": 0.5968,
"step": 470
},
{
"epoch": 1.0127931769722816,
"grad_norm": 0.8371634483337402,
"learning_rate": 6.751954513148543e-05,
"loss": 0.5923,
"step": 475
},
{
"epoch": 1.023454157782516,
"grad_norm": 0.49846479296684265,
"learning_rate": 6.823027718550106e-05,
"loss": 0.6835,
"step": 480
},
{
"epoch": 1.0341151385927505,
"grad_norm": 0.5845323801040649,
"learning_rate": 6.89410092395167e-05,
"loss": 0.5906,
"step": 485
},
{
"epoch": 1.044776119402985,
"grad_norm": 0.5639384984970093,
"learning_rate": 6.965174129353235e-05,
"loss": 0.5881,
"step": 490
},
{
"epoch": 1.0554371002132197,
"grad_norm": 0.5082396268844604,
"learning_rate": 7.036247334754798e-05,
"loss": 0.6224,
"step": 495
},
{
"epoch": 1.0660980810234542,
"grad_norm": 0.5611528158187866,
"learning_rate": 7.107320540156361e-05,
"loss": 0.5643,
"step": 500
},
{
"epoch": 1.0767590618336886,
"grad_norm": 0.7102047801017761,
"learning_rate": 7.178393745557926e-05,
"loss": 0.5814,
"step": 505
},
{
"epoch": 1.0874200426439233,
"grad_norm": 0.46847936511039734,
"learning_rate": 7.249466950959489e-05,
"loss": 0.5642,
"step": 510
},
{
"epoch": 1.0980810234541578,
"grad_norm": 0.47119173407554626,
"learning_rate": 7.320540156361052e-05,
"loss": 0.5674,
"step": 515
},
{
"epoch": 1.1087420042643923,
"grad_norm": 1.0005890130996704,
"learning_rate": 7.391613361762616e-05,
"loss": 0.5949,
"step": 520
},
{
"epoch": 1.1194029850746268,
"grad_norm": 0.7785916924476624,
"learning_rate": 7.46268656716418e-05,
"loss": 0.5643,
"step": 525
},
{
"epoch": 1.1300639658848615,
"grad_norm": 0.6393773555755615,
"learning_rate": 7.533759772565742e-05,
"loss": 0.5886,
"step": 530
},
{
"epoch": 1.140724946695096,
"grad_norm": 0.6369247436523438,
"learning_rate": 7.604832977967307e-05,
"loss": 0.58,
"step": 535
},
{
"epoch": 1.1513859275053304,
"grad_norm": 0.48704272508621216,
"learning_rate": 7.675906183368871e-05,
"loss": 0.6125,
"step": 540
},
{
"epoch": 1.1620469083155651,
"grad_norm": 0.5542349219322205,
"learning_rate": 7.746979388770433e-05,
"loss": 0.5688,
"step": 545
},
{
"epoch": 1.1727078891257996,
"grad_norm": 0.4632197618484497,
"learning_rate": 7.818052594171997e-05,
"loss": 0.5727,
"step": 550
},
{
"epoch": 1.183368869936034,
"grad_norm": 0.40735307335853577,
"learning_rate": 7.889125799573562e-05,
"loss": 0.5704,
"step": 555
},
{
"epoch": 1.1940298507462686,
"grad_norm": 0.45803022384643555,
"learning_rate": 7.960199004975125e-05,
"loss": 0.6041,
"step": 560
},
{
"epoch": 1.2046908315565032,
"grad_norm": 0.47275593876838684,
"learning_rate": 8.031272210376688e-05,
"loss": 0.5476,
"step": 565
},
{
"epoch": 1.2153518123667377,
"grad_norm": 0.4402256906032562,
"learning_rate": 8.102345415778252e-05,
"loss": 0.6101,
"step": 570
},
{
"epoch": 1.2260127931769722,
"grad_norm": 0.4577506184577942,
"learning_rate": 8.173418621179815e-05,
"loss": 0.6021,
"step": 575
},
{
"epoch": 1.236673773987207,
"grad_norm": 0.4695811867713928,
"learning_rate": 8.24449182658138e-05,
"loss": 0.5843,
"step": 580
},
{
"epoch": 1.2473347547974414,
"grad_norm": 0.5012730360031128,
"learning_rate": 8.315565031982943e-05,
"loss": 0.5963,
"step": 585
},
{
"epoch": 1.2579957356076759,
"grad_norm": 0.4261506199836731,
"learning_rate": 8.386638237384507e-05,
"loss": 0.5608,
"step": 590
},
{
"epoch": 1.2686567164179103,
"grad_norm": 0.48886266350746155,
"learning_rate": 8.45771144278607e-05,
"loss": 0.5768,
"step": 595
},
{
"epoch": 1.279317697228145,
"grad_norm": 0.4756333529949188,
"learning_rate": 8.528784648187633e-05,
"loss": 0.5581,
"step": 600
},
{
"epoch": 1.2899786780383795,
"grad_norm": 0.4242517054080963,
"learning_rate": 8.599857853589198e-05,
"loss": 0.5436,
"step": 605
},
{
"epoch": 1.3006396588486142,
"grad_norm": 0.44590556621551514,
"learning_rate": 8.67093105899076e-05,
"loss": 0.5821,
"step": 610
},
{
"epoch": 1.3113006396588487,
"grad_norm": 0.4373833239078522,
"learning_rate": 8.742004264392325e-05,
"loss": 0.544,
"step": 615
},
{
"epoch": 1.3219616204690832,
"grad_norm": 0.42627617716789246,
"learning_rate": 8.813077469793888e-05,
"loss": 0.5417,
"step": 620
},
{
"epoch": 1.3326226012793176,
"grad_norm": 0.516544759273529,
"learning_rate": 8.884150675195451e-05,
"loss": 0.573,
"step": 625
},
{
"epoch": 1.3432835820895521,
"grad_norm": 0.4419044256210327,
"learning_rate": 8.955223880597016e-05,
"loss": 0.5523,
"step": 630
},
{
"epoch": 1.3539445628997868,
"grad_norm": 0.4533810019493103,
"learning_rate": 9.026297085998579e-05,
"loss": 0.5372,
"step": 635
},
{
"epoch": 1.3646055437100213,
"grad_norm": 0.4296520948410034,
"learning_rate": 9.097370291400143e-05,
"loss": 0.5742,
"step": 640
},
{
"epoch": 1.375266524520256,
"grad_norm": 0.4285917282104492,
"learning_rate": 9.168443496801706e-05,
"loss": 0.5577,
"step": 645
},
{
"epoch": 1.3859275053304905,
"grad_norm": 0.41438210010528564,
"learning_rate": 9.23951670220327e-05,
"loss": 0.5659,
"step": 650
},
{
"epoch": 1.396588486140725,
"grad_norm": 0.43702948093414307,
"learning_rate": 9.310589907604834e-05,
"loss": 0.5425,
"step": 655
},
{
"epoch": 1.4072494669509594,
"grad_norm": 0.520577609539032,
"learning_rate": 9.381663113006397e-05,
"loss": 0.5624,
"step": 660
},
{
"epoch": 1.417910447761194,
"grad_norm": 0.451948881149292,
"learning_rate": 9.452736318407961e-05,
"loss": 0.5598,
"step": 665
},
{
"epoch": 1.4285714285714286,
"grad_norm": 0.4748338460922241,
"learning_rate": 9.523809523809524e-05,
"loss": 0.6579,
"step": 670
},
{
"epoch": 1.439232409381663,
"grad_norm": 0.4351726472377777,
"learning_rate": 9.594882729211087e-05,
"loss": 0.541,
"step": 675
},
{
"epoch": 1.4498933901918978,
"grad_norm": 0.4322686493396759,
"learning_rate": 9.665955934612652e-05,
"loss": 0.5941,
"step": 680
},
{
"epoch": 1.4605543710021323,
"grad_norm": 0.43369051814079285,
"learning_rate": 9.737029140014216e-05,
"loss": 0.5862,
"step": 685
},
{
"epoch": 1.4712153518123667,
"grad_norm": 0.5028679966926575,
"learning_rate": 9.808102345415778e-05,
"loss": 0.5444,
"step": 690
},
{
"epoch": 1.4818763326226012,
"grad_norm": 0.4060784578323364,
"learning_rate": 9.879175550817342e-05,
"loss": 0.549,
"step": 695
},
{
"epoch": 1.4925373134328357,
"grad_norm": 0.4283974766731262,
"learning_rate": 9.950248756218906e-05,
"loss": 0.5474,
"step": 700
},
{
"epoch": 1.5031982942430704,
"grad_norm": 0.3743923008441925,
"learning_rate": 0.0001002132196162047,
"loss": 0.5394,
"step": 705
},
{
"epoch": 1.5138592750533049,
"grad_norm": 0.44469088315963745,
"learning_rate": 0.00010092395167022033,
"loss": 0.5563,
"step": 710
},
{
"epoch": 1.5245202558635396,
"grad_norm": 0.43209415674209595,
"learning_rate": 0.00010163468372423597,
"loss": 0.5803,
"step": 715
},
{
"epoch": 1.535181236673774,
"grad_norm": 0.4075677990913391,
"learning_rate": 0.00010234541577825161,
"loss": 0.5369,
"step": 720
},
{
"epoch": 1.5458422174840085,
"grad_norm": 0.4084095358848572,
"learning_rate": 0.00010305614783226724,
"loss": 0.5687,
"step": 725
},
{
"epoch": 1.556503198294243,
"grad_norm": 0.4053703248500824,
"learning_rate": 0.00010376687988628289,
"loss": 0.5301,
"step": 730
},
{
"epoch": 1.5671641791044775,
"grad_norm": 0.46452564001083374,
"learning_rate": 0.0001044776119402985,
"loss": 0.5823,
"step": 735
},
{
"epoch": 1.5778251599147122,
"grad_norm": 0.4020977020263672,
"learning_rate": 0.00010518834399431414,
"loss": 0.5463,
"step": 740
},
{
"epoch": 1.5884861407249466,
"grad_norm": 0.3993551433086395,
"learning_rate": 0.00010589907604832978,
"loss": 0.5551,
"step": 745
},
{
"epoch": 1.5991471215351813,
"grad_norm": 0.4211786985397339,
"learning_rate": 0.00010660980810234542,
"loss": 0.5607,
"step": 750
},
{
"epoch": 1.6098081023454158,
"grad_norm": 0.4241097867488861,
"learning_rate": 0.00010732054015636106,
"loss": 0.5402,
"step": 755
},
{
"epoch": 1.6204690831556503,
"grad_norm": 0.3934391736984253,
"learning_rate": 0.0001080312722103767,
"loss": 0.5618,
"step": 760
},
{
"epoch": 1.6311300639658848,
"grad_norm": 0.37157073616981506,
"learning_rate": 0.00010874200426439234,
"loss": 0.5232,
"step": 765
},
{
"epoch": 1.6417910447761193,
"grad_norm": 0.4151962399482727,
"learning_rate": 0.00010945273631840796,
"loss": 0.563,
"step": 770
},
{
"epoch": 1.652452025586354,
"grad_norm": 0.42233771085739136,
"learning_rate": 0.00011016346837242359,
"loss": 0.5667,
"step": 775
},
{
"epoch": 1.6631130063965884,
"grad_norm": 0.3891717493534088,
"learning_rate": 0.00011087420042643924,
"loss": 0.582,
"step": 780
},
{
"epoch": 1.6737739872068231,
"grad_norm": 0.4017283618450165,
"learning_rate": 0.00011158493248045488,
"loss": 0.5386,
"step": 785
},
{
"epoch": 1.6844349680170576,
"grad_norm": 0.4058316648006439,
"learning_rate": 0.00011229566453447051,
"loss": 0.5357,
"step": 790
},
{
"epoch": 1.695095948827292,
"grad_norm": 0.38968625664711,
"learning_rate": 0.00011300639658848615,
"loss": 0.527,
"step": 795
},
{
"epoch": 1.7057569296375266,
"grad_norm": 0.4108840525150299,
"learning_rate": 0.0001137171286425018,
"loss": 0.5347,
"step": 800
},
{
"epoch": 1.716417910447761,
"grad_norm": 0.37222376465797424,
"learning_rate": 0.00011442786069651741,
"loss": 0.524,
"step": 805
},
{
"epoch": 1.7270788912579957,
"grad_norm": 0.4046708047389984,
"learning_rate": 0.00011513859275053305,
"loss": 0.5096,
"step": 810
},
{
"epoch": 1.7377398720682304,
"grad_norm": 0.37089455127716064,
"learning_rate": 0.00011584932480454869,
"loss": 0.5316,
"step": 815
},
{
"epoch": 1.748400852878465,
"grad_norm": 0.3895399272441864,
"learning_rate": 0.00011656005685856432,
"loss": 0.5274,
"step": 820
},
{
"epoch": 1.7590618336886994,
"grad_norm": 0.3956606984138489,
"learning_rate": 0.00011727078891257996,
"loss": 0.5395,
"step": 825
},
{
"epoch": 1.7697228144989339,
"grad_norm": 0.4023361802101135,
"learning_rate": 0.00011798152096659561,
"loss": 0.53,
"step": 830
},
{
"epoch": 1.7803837953091683,
"grad_norm": 0.39323511719703674,
"learning_rate": 0.00011869225302061124,
"loss": 0.5341,
"step": 835
},
{
"epoch": 1.7910447761194028,
"grad_norm": 0.3870689868927002,
"learning_rate": 0.00011940298507462686,
"loss": 0.5268,
"step": 840
},
{
"epoch": 1.8017057569296375,
"grad_norm": 0.39864471554756165,
"learning_rate": 0.0001201137171286425,
"loss": 0.5754,
"step": 845
},
{
"epoch": 1.8123667377398722,
"grad_norm": 0.413980633020401,
"learning_rate": 0.00012082444918265814,
"loss": 0.5274,
"step": 850
},
{
"epoch": 1.8230277185501067,
"grad_norm": 0.3994651138782501,
"learning_rate": 0.00012153518123667377,
"loss": 0.5313,
"step": 855
},
{
"epoch": 1.8336886993603412,
"grad_norm": 0.4106079041957855,
"learning_rate": 0.0001222459132906894,
"loss": 0.5293,
"step": 860
},
{
"epoch": 1.8443496801705757,
"grad_norm": 0.38014471530914307,
"learning_rate": 0.00012295664534470505,
"loss": 0.5313,
"step": 865
},
{
"epoch": 1.8550106609808101,
"grad_norm": 0.3477731943130493,
"learning_rate": 0.0001236673773987207,
"loss": 0.5499,
"step": 870
},
{
"epoch": 1.8656716417910446,
"grad_norm": 0.3609556555747986,
"learning_rate": 0.0001243781094527363,
"loss": 0.5195,
"step": 875
},
{
"epoch": 1.8763326226012793,
"grad_norm": 0.3532927334308624,
"learning_rate": 0.00012508884150675195,
"loss": 0.5233,
"step": 880
},
{
"epoch": 1.886993603411514,
"grad_norm": 0.3663487434387207,
"learning_rate": 0.0001257995735607676,
"loss": 0.5129,
"step": 885
},
{
"epoch": 1.8976545842217485,
"grad_norm": 0.35837364196777344,
"learning_rate": 0.00012651030561478324,
"loss": 0.5106,
"step": 890
},
{
"epoch": 1.908315565031983,
"grad_norm": 0.38498660922050476,
"learning_rate": 0.00012722103766879886,
"loss": 0.5216,
"step": 895
},
{
"epoch": 1.9189765458422174,
"grad_norm": 0.3501322269439697,
"learning_rate": 0.0001279317697228145,
"loss": 0.54,
"step": 900
},
{
"epoch": 1.929637526652452,
"grad_norm": 0.34796684980392456,
"learning_rate": 0.00012864250177683015,
"loss": 0.5165,
"step": 905
},
{
"epoch": 1.9402985074626866,
"grad_norm": 0.46670106053352356,
"learning_rate": 0.0001293532338308458,
"loss": 0.5437,
"step": 910
},
{
"epoch": 1.950959488272921,
"grad_norm": 0.3535880148410797,
"learning_rate": 0.0001300639658848614,
"loss": 0.5561,
"step": 915
},
{
"epoch": 1.9616204690831558,
"grad_norm": 0.3591325283050537,
"learning_rate": 0.00013077469793887705,
"loss": 0.5193,
"step": 920
},
{
"epoch": 1.9722814498933903,
"grad_norm": 0.4969016909599304,
"learning_rate": 0.00013148542999289267,
"loss": 0.526,
"step": 925
},
{
"epoch": 1.9829424307036247,
"grad_norm": 0.3567504584789276,
"learning_rate": 0.00013219616204690831,
"loss": 0.5063,
"step": 930
},
{
"epoch": 1.9936034115138592,
"grad_norm": 0.3647787272930145,
"learning_rate": 0.00013290689410092396,
"loss": 0.5094,
"step": 935
},
{
"epoch": 2.0,
"eval_loss": 0.5335173606872559,
"eval_runtime": 377.8765,
"eval_samples_per_second": 1.09,
"eval_steps_per_second": 1.09,
"step": 938
},
{
"epoch": 2.0042643923240937,
"grad_norm": 0.34923797845840454,
"learning_rate": 0.0001336176261549396,
"loss": 0.5126,
"step": 940
},
{
"epoch": 2.014925373134328,
"grad_norm": 0.4439273476600647,
"learning_rate": 0.00013432835820895525,
"loss": 0.5349,
"step": 945
},
{
"epoch": 2.025586353944563,
"grad_norm": 0.35956764221191406,
"learning_rate": 0.00013503909026297086,
"loss": 0.493,
"step": 950
},
{
"epoch": 2.0362473347547976,
"grad_norm": 0.3677864074707031,
"learning_rate": 0.0001357498223169865,
"loss": 0.523,
"step": 955
},
{
"epoch": 2.046908315565032,
"grad_norm": 0.3486590087413788,
"learning_rate": 0.00013646055437100213,
"loss": 0.5322,
"step": 960
},
{
"epoch": 2.0575692963752665,
"grad_norm": 0.3785991072654724,
"learning_rate": 0.00013717128642501777,
"loss": 0.4903,
"step": 965
},
{
"epoch": 2.068230277185501,
"grad_norm": 0.3422692120075226,
"learning_rate": 0.0001378820184790334,
"loss": 0.5356,
"step": 970
},
{
"epoch": 2.0788912579957355,
"grad_norm": 0.41184964776039124,
"learning_rate": 0.00013859275053304906,
"loss": 0.4969,
"step": 975
},
{
"epoch": 2.08955223880597,
"grad_norm": 0.34267646074295044,
"learning_rate": 0.0001393034825870647,
"loss": 0.5113,
"step": 980
},
{
"epoch": 2.100213219616205,
"grad_norm": 0.38112279772758484,
"learning_rate": 0.00014001421464108032,
"loss": 0.4793,
"step": 985
},
{
"epoch": 2.1108742004264394,
"grad_norm": 0.33497291803359985,
"learning_rate": 0.00014072494669509596,
"loss": 0.5185,
"step": 990
},
{
"epoch": 2.121535181236674,
"grad_norm": 0.37100210785865784,
"learning_rate": 0.00014143567874911158,
"loss": 0.5024,
"step": 995
},
{
"epoch": 2.1321961620469083,
"grad_norm": 0.3079771101474762,
"learning_rate": 0.00014214641080312722,
"loss": 0.5066,
"step": 1000
},
{
"epoch": 2.142857142857143,
"grad_norm": 0.3615591824054718,
"learning_rate": 0.00014285714285714287,
"loss": 0.5157,
"step": 1005
},
{
"epoch": 2.1535181236673773,
"grad_norm": 0.3394719958305359,
"learning_rate": 0.0001435678749111585,
"loss": 0.4906,
"step": 1010
},
{
"epoch": 2.1641791044776117,
"grad_norm": 0.4234224557876587,
"learning_rate": 0.00014427860696517416,
"loss": 0.5015,
"step": 1015
},
{
"epoch": 2.1748400852878467,
"grad_norm": 0.3535841107368469,
"learning_rate": 0.00014498933901918977,
"loss": 0.5107,
"step": 1020
},
{
"epoch": 2.185501066098081,
"grad_norm": 0.41673514246940613,
"learning_rate": 0.0001457000710732054,
"loss": 0.505,
"step": 1025
},
{
"epoch": 2.1961620469083156,
"grad_norm": 0.3521960973739624,
"learning_rate": 0.00014641080312722103,
"loss": 0.5339,
"step": 1030
},
{
"epoch": 2.20682302771855,
"grad_norm": 0.341727614402771,
"learning_rate": 0.00014712153518123668,
"loss": 0.4897,
"step": 1035
},
{
"epoch": 2.2174840085287846,
"grad_norm": 0.32079800963401794,
"learning_rate": 0.00014783226723525232,
"loss": 0.5049,
"step": 1040
},
{
"epoch": 2.228144989339019,
"grad_norm": 0.34027552604675293,
"learning_rate": 0.00014854299928926797,
"loss": 0.4993,
"step": 1045
},
{
"epoch": 2.2388059701492535,
"grad_norm": 0.34183624386787415,
"learning_rate": 0.0001492537313432836,
"loss": 0.51,
"step": 1050
},
{
"epoch": 2.2494669509594885,
"grad_norm": 0.31983354687690735,
"learning_rate": 0.00014996446339729923,
"loss": 0.5084,
"step": 1055
},
{
"epoch": 2.260127931769723,
"grad_norm": 0.3631596565246582,
"learning_rate": 0.00015067519545131484,
"loss": 0.4986,
"step": 1060
},
{
"epoch": 2.2707889125799574,
"grad_norm": 0.32126784324645996,
"learning_rate": 0.0001513859275053305,
"loss": 0.4832,
"step": 1065
},
{
"epoch": 2.281449893390192,
"grad_norm": 0.3390761911869049,
"learning_rate": 0.00015209665955934613,
"loss": 0.4972,
"step": 1070
},
{
"epoch": 2.2921108742004264,
"grad_norm": 0.3330533504486084,
"learning_rate": 0.00015280739161336178,
"loss": 0.4772,
"step": 1075
},
{
"epoch": 2.302771855010661,
"grad_norm": 0.3619351089000702,
"learning_rate": 0.00015351812366737742,
"loss": 0.5141,
"step": 1080
},
{
"epoch": 2.3134328358208958,
"grad_norm": 0.3252182602882385,
"learning_rate": 0.00015422885572139304,
"loss": 0.5056,
"step": 1085
},
{
"epoch": 2.3240938166311302,
"grad_norm": 0.3745068311691284,
"learning_rate": 0.00015493958777540866,
"loss": 0.5395,
"step": 1090
},
{
"epoch": 2.3347547974413647,
"grad_norm": 0.38191962242126465,
"learning_rate": 0.0001556503198294243,
"loss": 0.4865,
"step": 1095
},
{
"epoch": 2.345415778251599,
"grad_norm": 0.32218611240386963,
"learning_rate": 0.00015636105188343994,
"loss": 0.4955,
"step": 1100
},
{
"epoch": 2.3560767590618337,
"grad_norm": 0.32240140438079834,
"learning_rate": 0.0001570717839374556,
"loss": 0.4972,
"step": 1105
},
{
"epoch": 2.366737739872068,
"grad_norm": 0.37284377217292786,
"learning_rate": 0.00015778251599147123,
"loss": 0.4874,
"step": 1110
},
{
"epoch": 2.3773987206823026,
"grad_norm": 0.350769579410553,
"learning_rate": 0.00015849324804548688,
"loss": 0.4931,
"step": 1115
},
{
"epoch": 2.388059701492537,
"grad_norm": 0.3309812843799591,
"learning_rate": 0.0001592039800995025,
"loss": 0.5103,
"step": 1120
},
{
"epoch": 2.398720682302772,
"grad_norm": 0.3497963547706604,
"learning_rate": 0.0001599147121535181,
"loss": 0.4864,
"step": 1125
},
{
"epoch": 2.4093816631130065,
"grad_norm": 0.3567025661468506,
"learning_rate": 0.00016062544420753375,
"loss": 0.5461,
"step": 1130
},
{
"epoch": 2.420042643923241,
"grad_norm": 0.5213941931724548,
"learning_rate": 0.0001613361762615494,
"loss": 0.5138,
"step": 1135
},
{
"epoch": 2.4307036247334755,
"grad_norm": 0.32027000188827515,
"learning_rate": 0.00016204690831556504,
"loss": 0.5078,
"step": 1140
},
{
"epoch": 2.44136460554371,
"grad_norm": 0.37092500925064087,
"learning_rate": 0.00016275764036958069,
"loss": 0.4903,
"step": 1145
},
{
"epoch": 2.4520255863539444,
"grad_norm": 0.35545867681503296,
"learning_rate": 0.0001634683724235963,
"loss": 0.5131,
"step": 1150
},
{
"epoch": 2.4626865671641793,
"grad_norm": 0.3277740776538849,
"learning_rate": 0.00016417910447761195,
"loss": 0.4814,
"step": 1155
},
{
"epoch": 2.473347547974414,
"grad_norm": 0.3226880133152008,
"learning_rate": 0.0001648898365316276,
"loss": 0.4944,
"step": 1160
},
{
"epoch": 2.4840085287846483,
"grad_norm": 0.3283137381076813,
"learning_rate": 0.0001656005685856432,
"loss": 0.5058,
"step": 1165
},
{
"epoch": 2.4946695095948828,
"grad_norm": 0.38707828521728516,
"learning_rate": 0.00016631130063965885,
"loss": 0.5108,
"step": 1170
},
{
"epoch": 2.5053304904051172,
"grad_norm": 0.3053881824016571,
"learning_rate": 0.0001670220326936745,
"loss": 0.4751,
"step": 1175
},
{
"epoch": 2.5159914712153517,
"grad_norm": 0.29871490597724915,
"learning_rate": 0.00016773276474769014,
"loss": 0.4848,
"step": 1180
},
{
"epoch": 2.526652452025586,
"grad_norm": 0.3135201930999756,
"learning_rate": 0.00016844349680170576,
"loss": 0.4852,
"step": 1185
},
{
"epoch": 2.5373134328358207,
"grad_norm": 0.31287622451782227,
"learning_rate": 0.0001691542288557214,
"loss": 0.4804,
"step": 1190
},
{
"epoch": 2.5479744136460556,
"grad_norm": 0.30184197425842285,
"learning_rate": 0.00016986496090973705,
"loss": 0.5006,
"step": 1195
},
{
"epoch": 2.55863539445629,
"grad_norm": 0.29948562383651733,
"learning_rate": 0.00017057569296375266,
"loss": 0.4934,
"step": 1200
},
{
"epoch": 2.5692963752665245,
"grad_norm": 0.29258280992507935,
"learning_rate": 0.0001712864250177683,
"loss": 0.4887,
"step": 1205
},
{
"epoch": 2.579957356076759,
"grad_norm": 0.29767826199531555,
"learning_rate": 0.00017199715707178395,
"loss": 0.4958,
"step": 1210
},
{
"epoch": 2.5906183368869935,
"grad_norm": 0.29649823904037476,
"learning_rate": 0.0001727078891257996,
"loss": 0.51,
"step": 1215
},
{
"epoch": 2.6012793176972284,
"grad_norm": 0.30332130193710327,
"learning_rate": 0.0001734186211798152,
"loss": 0.4954,
"step": 1220
},
{
"epoch": 2.611940298507463,
"grad_norm": 0.3551209270954132,
"learning_rate": 0.00017412935323383086,
"loss": 0.5088,
"step": 1225
},
{
"epoch": 2.6226012793176974,
"grad_norm": 0.33677777647972107,
"learning_rate": 0.0001748400852878465,
"loss": 0.5248,
"step": 1230
},
{
"epoch": 2.633262260127932,
"grad_norm": 0.29216548800468445,
"learning_rate": 0.00017555081734186212,
"loss": 0.4954,
"step": 1235
},
{
"epoch": 2.6439232409381663,
"grad_norm": 0.32732442021369934,
"learning_rate": 0.00017626154939587776,
"loss": 0.5048,
"step": 1240
},
{
"epoch": 2.654584221748401,
"grad_norm": 0.29788029193878174,
"learning_rate": 0.0001769722814498934,
"loss": 0.5056,
"step": 1245
},
{
"epoch": 2.6652452025586353,
"grad_norm": 0.3407440185546875,
"learning_rate": 0.00017768301350390902,
"loss": 0.5385,
"step": 1250
},
{
"epoch": 2.6759061833688698,
"grad_norm": 0.2790848910808563,
"learning_rate": 0.00017839374555792467,
"loss": 0.5014,
"step": 1255
},
{
"epoch": 2.6865671641791042,
"grad_norm": 0.30173078179359436,
"learning_rate": 0.0001791044776119403,
"loss": 0.5118,
"step": 1260
},
{
"epoch": 2.697228144989339,
"grad_norm": 0.2736753821372986,
"learning_rate": 0.00017981520966595596,
"loss": 0.5018,
"step": 1265
},
{
"epoch": 2.7078891257995736,
"grad_norm": 0.2970294952392578,
"learning_rate": 0.00018052594171997157,
"loss": 0.4966,
"step": 1270
},
{
"epoch": 2.718550106609808,
"grad_norm": 0.2721494138240814,
"learning_rate": 0.00018123667377398722,
"loss": 0.4746,
"step": 1275
},
{
"epoch": 2.7292110874200426,
"grad_norm": 0.29144713282585144,
"learning_rate": 0.00018194740582800286,
"loss": 0.4739,
"step": 1280
},
{
"epoch": 2.739872068230277,
"grad_norm": 0.3217550814151764,
"learning_rate": 0.00018265813788201848,
"loss": 0.4868,
"step": 1285
},
{
"epoch": 2.750533049040512,
"grad_norm": 0.25847169756889343,
"learning_rate": 0.00018336886993603412,
"loss": 0.4664,
"step": 1290
},
{
"epoch": 2.7611940298507465,
"grad_norm": 0.2917424142360687,
"learning_rate": 0.00018407960199004977,
"loss": 0.4659,
"step": 1295
},
{
"epoch": 2.771855010660981,
"grad_norm": 0.29807865619659424,
"learning_rate": 0.0001847903340440654,
"loss": 0.4838,
"step": 1300
},
{
"epoch": 2.7825159914712154,
"grad_norm": 0.28630420565605164,
"learning_rate": 0.00018550106609808103,
"loss": 0.4658,
"step": 1305
},
{
"epoch": 2.79317697228145,
"grad_norm": 0.2946392595767975,
"learning_rate": 0.00018621179815209667,
"loss": 0.5037,
"step": 1310
},
{
"epoch": 2.8038379530916844,
"grad_norm": 0.38894176483154297,
"learning_rate": 0.0001869225302061123,
"loss": 0.525,
"step": 1315
},
{
"epoch": 2.814498933901919,
"grad_norm": 0.28793737292289734,
"learning_rate": 0.00018763326226012793,
"loss": 0.5238,
"step": 1320
},
{
"epoch": 2.8251599147121533,
"grad_norm": 0.3103950023651123,
"learning_rate": 0.00018834399431414358,
"loss": 0.4932,
"step": 1325
},
{
"epoch": 2.835820895522388,
"grad_norm": 0.2969878017902374,
"learning_rate": 0.00018905472636815922,
"loss": 0.4807,
"step": 1330
},
{
"epoch": 2.8464818763326227,
"grad_norm": 0.2937600612640381,
"learning_rate": 0.00018976545842217486,
"loss": 0.4862,
"step": 1335
},
{
"epoch": 2.857142857142857,
"grad_norm": 0.2892070710659027,
"learning_rate": 0.00019047619047619048,
"loss": 0.526,
"step": 1340
},
{
"epoch": 2.8678038379530917,
"grad_norm": 0.28446847200393677,
"learning_rate": 0.00019118692253020613,
"loss": 0.4846,
"step": 1345
},
{
"epoch": 2.878464818763326,
"grad_norm": 0.2877322733402252,
"learning_rate": 0.00019189765458422174,
"loss": 0.4759,
"step": 1350
},
{
"epoch": 2.8891257995735606,
"grad_norm": 0.2837788462638855,
"learning_rate": 0.0001926083866382374,
"loss": 0.4894,
"step": 1355
},
{
"epoch": 2.8997867803837956,
"grad_norm": 0.3020360469818115,
"learning_rate": 0.00019331911869225303,
"loss": 0.4936,
"step": 1360
},
{
"epoch": 2.91044776119403,
"grad_norm": 0.28344911336898804,
"learning_rate": 0.00019402985074626867,
"loss": 0.4881,
"step": 1365
},
{
"epoch": 2.9211087420042645,
"grad_norm": 0.2753186821937561,
"learning_rate": 0.00019474058280028432,
"loss": 0.4826,
"step": 1370
},
{
"epoch": 2.931769722814499,
"grad_norm": 0.2922317385673523,
"learning_rate": 0.00019545131485429994,
"loss": 0.4759,
"step": 1375
},
{
"epoch": 2.9424307036247335,
"grad_norm": 0.3179524540901184,
"learning_rate": 0.00019616204690831555,
"loss": 0.4883,
"step": 1380
},
{
"epoch": 2.953091684434968,
"grad_norm": 0.2944222688674927,
"learning_rate": 0.0001968727789623312,
"loss": 0.4804,
"step": 1385
},
{
"epoch": 2.9637526652452024,
"grad_norm": 0.2687291204929352,
"learning_rate": 0.00019758351101634684,
"loss": 0.4891,
"step": 1390
},
{
"epoch": 2.974413646055437,
"grad_norm": 0.25935596227645874,
"learning_rate": 0.00019829424307036249,
"loss": 0.4902,
"step": 1395
},
{
"epoch": 2.9850746268656714,
"grad_norm": 0.30086612701416016,
"learning_rate": 0.00019900497512437813,
"loss": 0.4942,
"step": 1400
},
{
"epoch": 2.9957356076759063,
"grad_norm": 0.2930257022380829,
"learning_rate": 0.00019971570717839377,
"loss": 0.513,
"step": 1405
},
{
"epoch": 3.0,
"eval_loss": 0.5142309069633484,
"eval_runtime": 377.5199,
"eval_samples_per_second": 1.091,
"eval_steps_per_second": 1.091,
"step": 1407
},
{
"epoch": 3.0063965884861408,
"grad_norm": 0.28208208084106445,
"learning_rate": 0.00019999997230259856,
"loss": 0.467,
"step": 1410
},
{
"epoch": 3.0170575692963753,
"grad_norm": 0.290385365486145,
"learning_rate": 0.00019999980304075655,
"loss": 0.44,
"step": 1415
},
{
"epoch": 3.0277185501066097,
"grad_norm": 0.27436771988868713,
"learning_rate": 0.00019999947990477788,
"loss": 0.4876,
"step": 1420
},
{
"epoch": 3.038379530916844,
"grad_norm": 0.2883841395378113,
"learning_rate": 0.00019999900289515975,
"loss": 0.4509,
"step": 1425
},
{
"epoch": 3.0490405117270787,
"grad_norm": 0.279857337474823,
"learning_rate": 0.00019999837201263622,
"loss": 0.4431,
"step": 1430
},
{
"epoch": 3.0597014925373136,
"grad_norm": 0.31563228368759155,
"learning_rate": 0.000199997587258178,
"loss": 0.4789,
"step": 1435
},
{
"epoch": 3.070362473347548,
"grad_norm": 0.302135169506073,
"learning_rate": 0.00019999664863299267,
"loss": 0.4685,
"step": 1440
},
{
"epoch": 3.0810234541577826,
"grad_norm": 0.2668147385120392,
"learning_rate": 0.00019999555613852449,
"loss": 0.4361,
"step": 1445
},
{
"epoch": 3.091684434968017,
"grad_norm": 0.28701773285865784,
"learning_rate": 0.00019999430977645457,
"loss": 0.4417,
"step": 1450
},
{
"epoch": 3.1023454157782515,
"grad_norm": 0.2622893154621124,
"learning_rate": 0.00019999290954870073,
"loss": 0.4524,
"step": 1455
},
{
"epoch": 3.113006396588486,
"grad_norm": 0.2776693105697632,
"learning_rate": 0.00019999135545741755,
"loss": 0.463,
"step": 1460
},
{
"epoch": 3.1236673773987205,
"grad_norm": 0.26774516701698303,
"learning_rate": 0.00019998964750499637,
"loss": 0.4732,
"step": 1465
},
{
"epoch": 3.1343283582089554,
"grad_norm": 0.26958051323890686,
"learning_rate": 0.0001999877856940653,
"loss": 0.4517,
"step": 1470
},
{
"epoch": 3.14498933901919,
"grad_norm": 0.2604299485683441,
"learning_rate": 0.00019998577002748924,
"loss": 0.4476,
"step": 1475
},
{
"epoch": 3.1556503198294243,
"grad_norm": 1.0628249645233154,
"learning_rate": 0.00019998360050836974,
"loss": 0.4542,
"step": 1480
},
{
"epoch": 3.166311300639659,
"grad_norm": 0.26215219497680664,
"learning_rate": 0.0001999812771400451,
"loss": 0.4608,
"step": 1485
},
{
"epoch": 3.1769722814498933,
"grad_norm": 0.2745310068130493,
"learning_rate": 0.00019997879992609047,
"loss": 0.4532,
"step": 1490
},
{
"epoch": 3.1876332622601278,
"grad_norm": 0.3186289072036743,
"learning_rate": 0.0001999761688703176,
"loss": 0.4854,
"step": 1495
},
{
"epoch": 3.1982942430703627,
"grad_norm": 0.2697219252586365,
"learning_rate": 0.000199973383976775,
"loss": 0.4759,
"step": 1500
},
{
"epoch": 3.208955223880597,
"grad_norm": 0.32173436880111694,
"learning_rate": 0.00019997044524974799,
"loss": 0.47,
"step": 1505
},
{
"epoch": 3.2196162046908317,
"grad_norm": 0.28551211953163147,
"learning_rate": 0.00019996735269375843,
"loss": 0.4537,
"step": 1510
},
{
"epoch": 3.230277185501066,
"grad_norm": 0.2618770897388458,
"learning_rate": 0.00019996410631356498,
"loss": 0.455,
"step": 1515
},
{
"epoch": 3.2409381663113006,
"grad_norm": 0.3189204931259155,
"learning_rate": 0.00019996070611416305,
"loss": 0.4869,
"step": 1520
},
{
"epoch": 3.251599147121535,
"grad_norm": 0.2555652856826782,
"learning_rate": 0.00019995715210078464,
"loss": 0.4582,
"step": 1525
},
{
"epoch": 3.2622601279317696,
"grad_norm": 0.45129457116127014,
"learning_rate": 0.00019995344427889845,
"loss": 0.5055,
"step": 1530
},
{
"epoch": 3.272921108742004,
"grad_norm": 0.2851119637489319,
"learning_rate": 0.0001999495826542099,
"loss": 0.4495,
"step": 1535
},
{
"epoch": 3.283582089552239,
"grad_norm": 0.4647831916809082,
"learning_rate": 0.00019994556723266103,
"loss": 0.4442,
"step": 1540
},
{
"epoch": 3.2942430703624734,
"grad_norm": 0.28650426864624023,
"learning_rate": 0.00019994139802043055,
"loss": 0.488,
"step": 1545
},
{
"epoch": 3.304904051172708,
"grad_norm": 0.2804616093635559,
"learning_rate": 0.0001999370750239338,
"loss": 0.4538,
"step": 1550
},
{
"epoch": 3.3155650319829424,
"grad_norm": 0.2778622508049011,
"learning_rate": 0.0001999325982498228,
"loss": 0.4468,
"step": 1555
},
{
"epoch": 3.326226012793177,
"grad_norm": 0.26577600836753845,
"learning_rate": 0.00019992796770498616,
"loss": 0.4805,
"step": 1560
},
{
"epoch": 3.3368869936034113,
"grad_norm": 0.25679486989974976,
"learning_rate": 0.00019992318339654905,
"loss": 0.4648,
"step": 1565
},
{
"epoch": 3.3475479744136463,
"grad_norm": 0.263921856880188,
"learning_rate": 0.00019991824533187335,
"loss": 0.4638,
"step": 1570
},
{
"epoch": 3.3582089552238807,
"grad_norm": 0.25445836782455444,
"learning_rate": 0.00019991315351855748,
"loss": 0.4395,
"step": 1575
},
{
"epoch": 3.368869936034115,
"grad_norm": 0.2354278415441513,
"learning_rate": 0.0001999079079644364,
"loss": 0.487,
"step": 1580
},
{
"epoch": 3.3795309168443497,
"grad_norm": 0.2561117708683014,
"learning_rate": 0.0001999025086775817,
"loss": 0.4562,
"step": 1585
},
{
"epoch": 3.390191897654584,
"grad_norm": 0.3330647349357605,
"learning_rate": 0.00019989695566630152,
"loss": 0.4445,
"step": 1590
},
{
"epoch": 3.4008528784648187,
"grad_norm": 0.26299235224723816,
"learning_rate": 0.00019989124893914046,
"loss": 0.4488,
"step": 1595
},
{
"epoch": 3.411513859275053,
"grad_norm": 0.299434095621109,
"learning_rate": 0.0001998853885048798,
"loss": 0.4563,
"step": 1600
},
{
"epoch": 3.4221748400852876,
"grad_norm": 0.23711760342121124,
"learning_rate": 0.0001998793743725372,
"loss": 0.4473,
"step": 1605
},
{
"epoch": 3.4328358208955225,
"grad_norm": 0.24863874912261963,
"learning_rate": 0.00019987320655136693,
"loss": 0.4574,
"step": 1610
},
{
"epoch": 3.443496801705757,
"grad_norm": 0.24471955001354218,
"learning_rate": 0.00019986688505085957,
"loss": 0.4665,
"step": 1615
},
{
"epoch": 3.4541577825159915,
"grad_norm": 0.2540249526500702,
"learning_rate": 0.00019986040988074238,
"loss": 0.4689,
"step": 1620
},
{
"epoch": 3.464818763326226,
"grad_norm": 0.2666712701320648,
"learning_rate": 0.00019985378105097902,
"loss": 0.4477,
"step": 1625
},
{
"epoch": 3.4754797441364604,
"grad_norm": 0.27709081768989563,
"learning_rate": 0.0001998469985717695,
"loss": 0.4403,
"step": 1630
},
{
"epoch": 3.486140724946695,
"grad_norm": 0.27587834000587463,
"learning_rate": 0.00019984006245355037,
"loss": 0.4565,
"step": 1635
},
{
"epoch": 3.49680170575693,
"grad_norm": 0.22859402000904083,
"learning_rate": 0.00019983297270699448,
"loss": 0.4514,
"step": 1640
},
{
"epoch": 3.5074626865671643,
"grad_norm": 0.3489368259906769,
"learning_rate": 0.00019982572934301122,
"loss": 0.4727,
"step": 1645
},
{
"epoch": 3.518123667377399,
"grad_norm": 0.2632017135620117,
"learning_rate": 0.00019981833237274618,
"loss": 0.4415,
"step": 1650
},
{
"epoch": 3.5287846481876333,
"grad_norm": 0.27099326252937317,
"learning_rate": 0.00019981078180758154,
"loss": 0.4489,
"step": 1655
},
{
"epoch": 3.5394456289978677,
"grad_norm": 0.2415977120399475,
"learning_rate": 0.00019980307765913552,
"loss": 0.4764,
"step": 1660
},
{
"epoch": 3.550106609808102,
"grad_norm": 0.23986046016216278,
"learning_rate": 0.000199795219939263,
"loss": 0.4458,
"step": 1665
},
{
"epoch": 3.5607675906183367,
"grad_norm": 0.28455114364624023,
"learning_rate": 0.00019978720866005488,
"loss": 0.4846,
"step": 1670
},
{
"epoch": 3.571428571428571,
"grad_norm": 0.2913159430027008,
"learning_rate": 0.0001997790438338385,
"loss": 0.4547,
"step": 1675
},
{
"epoch": 3.582089552238806,
"grad_norm": 0.25150275230407715,
"learning_rate": 0.0001997707254731775,
"loss": 0.4599,
"step": 1680
},
{
"epoch": 3.5927505330490406,
"grad_norm": 0.23482745885849,
"learning_rate": 0.00019976225359087164,
"loss": 0.4315,
"step": 1685
},
{
"epoch": 3.603411513859275,
"grad_norm": 0.23308737576007843,
"learning_rate": 0.00019975362819995703,
"loss": 0.449,
"step": 1690
},
{
"epoch": 3.6140724946695095,
"grad_norm": 0.2528814375400543,
"learning_rate": 0.00019974484931370592,
"loss": 0.4392,
"step": 1695
},
{
"epoch": 3.624733475479744,
"grad_norm": 0.25079530477523804,
"learning_rate": 0.00019973591694562678,
"loss": 0.4536,
"step": 1700
},
{
"epoch": 3.635394456289979,
"grad_norm": 0.2929099202156067,
"learning_rate": 0.00019972683110946421,
"loss": 0.4426,
"step": 1705
},
{
"epoch": 3.6460554371002134,
"grad_norm": 0.23356157541275024,
"learning_rate": 0.00019971759181919903,
"loss": 0.4602,
"step": 1710
},
{
"epoch": 3.656716417910448,
"grad_norm": 0.3128319978713989,
"learning_rate": 0.00019970819908904814,
"loss": 0.4629,
"step": 1715
},
{
"epoch": 3.6673773987206824,
"grad_norm": 0.23164990544319153,
"learning_rate": 0.00019969865293346454,
"loss": 0.4662,
"step": 1720
},
{
"epoch": 3.678038379530917,
"grad_norm": 0.43762582540512085,
"learning_rate": 0.00019968895336713733,
"loss": 0.4685,
"step": 1725
},
{
"epoch": 3.6886993603411513,
"grad_norm": 0.34830760955810547,
"learning_rate": 0.00019967910040499164,
"loss": 0.4504,
"step": 1730
},
{
"epoch": 3.699360341151386,
"grad_norm": 0.2538786828517914,
"learning_rate": 0.00019966909406218868,
"loss": 0.4967,
"step": 1735
},
{
"epoch": 3.7100213219616203,
"grad_norm": 0.23103195428848267,
"learning_rate": 0.0001996589343541257,
"loss": 0.4556,
"step": 1740
},
{
"epoch": 3.7206823027718547,
"grad_norm": 0.2618430554866791,
"learning_rate": 0.0001996486212964358,
"loss": 0.4453,
"step": 1745
},
{
"epoch": 3.7313432835820897,
"grad_norm": 0.23393474519252777,
"learning_rate": 0.00019963815490498817,
"loss": 0.4613,
"step": 1750
},
{
"epoch": 3.742004264392324,
"grad_norm": 0.2798391282558441,
"learning_rate": 0.00019962753519588798,
"loss": 0.4668,
"step": 1755
},
{
"epoch": 3.7526652452025586,
"grad_norm": 0.24927425384521484,
"learning_rate": 0.00019961676218547617,
"loss": 0.4424,
"step": 1760
},
{
"epoch": 3.763326226012793,
"grad_norm": 0.2537556290626526,
"learning_rate": 0.00019960583589032966,
"loss": 0.4413,
"step": 1765
},
{
"epoch": 3.7739872068230276,
"grad_norm": 0.2401181310415268,
"learning_rate": 0.00019959475632726128,
"loss": 0.4365,
"step": 1770
},
{
"epoch": 3.7846481876332625,
"grad_norm": 0.22927629947662354,
"learning_rate": 0.00019958352351331956,
"loss": 0.4455,
"step": 1775
},
{
"epoch": 3.795309168443497,
"grad_norm": 0.21933622658252716,
"learning_rate": 0.00019957213746578902,
"loss": 0.4661,
"step": 1780
},
{
"epoch": 3.8059701492537314,
"grad_norm": 0.28884589672088623,
"learning_rate": 0.00019956059820218982,
"loss": 0.4931,
"step": 1785
},
{
"epoch": 3.816631130063966,
"grad_norm": 0.2619436979293823,
"learning_rate": 0.00019954890574027797,
"loss": 0.4446,
"step": 1790
},
{
"epoch": 3.8272921108742004,
"grad_norm": 0.22175399959087372,
"learning_rate": 0.00019953706009804512,
"loss": 0.4482,
"step": 1795
},
{
"epoch": 3.837953091684435,
"grad_norm": 0.23060369491577148,
"learning_rate": 0.00019952506129371873,
"loss": 0.451,
"step": 1800
},
{
"epoch": 3.8486140724946694,
"grad_norm": 0.2313724309206009,
"learning_rate": 0.0001995129093457619,
"loss": 0.4496,
"step": 1805
},
{
"epoch": 3.859275053304904,
"grad_norm": 0.23518264293670654,
"learning_rate": 0.00019950060427287335,
"loss": 0.4581,
"step": 1810
},
{
"epoch": 3.8699360341151388,
"grad_norm": 0.22398614883422852,
"learning_rate": 0.00019948814609398746,
"loss": 0.4382,
"step": 1815
},
{
"epoch": 3.8805970149253732,
"grad_norm": 0.21408702433109283,
"learning_rate": 0.00019947553482827418,
"loss": 0.4517,
"step": 1820
},
{
"epoch": 3.8912579957356077,
"grad_norm": 0.26791512966156006,
"learning_rate": 0.00019946277049513904,
"loss": 0.4671,
"step": 1825
},
{
"epoch": 3.901918976545842,
"grad_norm": 0.37972912192344666,
"learning_rate": 0.00019944985311422304,
"loss": 0.4665,
"step": 1830
},
{
"epoch": 3.9125799573560767,
"grad_norm": 0.2744680941104889,
"learning_rate": 0.00019943678270540276,
"loss": 0.4627,
"step": 1835
},
{
"epoch": 3.923240938166311,
"grad_norm": 0.3253777325153351,
"learning_rate": 0.00019942355928879023,
"loss": 0.468,
"step": 1840
},
{
"epoch": 3.933901918976546,
"grad_norm": 0.32431936264038086,
"learning_rate": 0.00019941018288473285,
"loss": 0.4497,
"step": 1845
},
{
"epoch": 3.9445628997867805,
"grad_norm": 0.2247323989868164,
"learning_rate": 0.00019939665351381355,
"loss": 0.4444,
"step": 1850
},
{
"epoch": 3.955223880597015,
"grad_norm": 0.35610342025756836,
"learning_rate": 0.00019938297119685054,
"loss": 0.4563,
"step": 1855
},
{
"epoch": 3.9658848614072495,
"grad_norm": 0.2513818144798279,
"learning_rate": 0.00019936913595489743,
"loss": 0.442,
"step": 1860
},
{
"epoch": 3.976545842217484,
"grad_norm": 0.3135777711868286,
"learning_rate": 0.0001993551478092431,
"loss": 0.4377,
"step": 1865
},
{
"epoch": 3.9872068230277184,
"grad_norm": 0.24127310514450073,
"learning_rate": 0.0001993410067814118,
"loss": 0.4478,
"step": 1870
},
{
"epoch": 3.997867803837953,
"grad_norm": 0.23388491570949554,
"learning_rate": 0.00019932671289316282,
"loss": 0.4306,
"step": 1875
},
{
"epoch": 4.0,
"eval_loss": 0.5043795108795166,
"eval_runtime": 377.5601,
"eval_samples_per_second": 1.091,
"eval_steps_per_second": 1.091,
"step": 1876
},
{
"epoch": 4.008528784648187,
"grad_norm": 0.3674967288970947,
"learning_rate": 0.0001993122661664909,
"loss": 0.4371,
"step": 1880
},
{
"epoch": 4.019189765458422,
"grad_norm": 0.2773316204547882,
"learning_rate": 0.00019929766662362585,
"loss": 0.4043,
"step": 1885
},
{
"epoch": 4.029850746268656,
"grad_norm": 0.2394101619720459,
"learning_rate": 0.00019928291428703262,
"loss": 0.413,
"step": 1890
},
{
"epoch": 4.040511727078891,
"grad_norm": 0.23238113522529602,
"learning_rate": 0.00019926800917941128,
"loss": 0.4021,
"step": 1895
},
{
"epoch": 4.051172707889126,
"grad_norm": 0.22244401276111603,
"learning_rate": 0.000199252951323697,
"loss": 0.4101,
"step": 1900
},
{
"epoch": 4.061833688699361,
"grad_norm": 0.24964463710784912,
"learning_rate": 0.00019923774074306,
"loss": 0.4123,
"step": 1905
},
{
"epoch": 4.072494669509595,
"grad_norm": 0.23066940903663635,
"learning_rate": 0.00019922237746090537,
"loss": 0.4267,
"step": 1910
},
{
"epoch": 4.08315565031983,
"grad_norm": 0.23452460765838623,
"learning_rate": 0.00019920686150087336,
"loss": 0.4223,
"step": 1915
},
{
"epoch": 4.093816631130064,
"grad_norm": 0.3032955527305603,
"learning_rate": 0.00019919119288683908,
"loss": 0.432,
"step": 1920
},
{
"epoch": 4.104477611940299,
"grad_norm": 0.3310707211494446,
"learning_rate": 0.00019917537164291244,
"loss": 0.42,
"step": 1925
},
{
"epoch": 4.115138592750533,
"grad_norm": 0.24135416746139526,
"learning_rate": 0.00019915939779343838,
"loss": 0.4289,
"step": 1930
},
{
"epoch": 4.1257995735607675,
"grad_norm": 0.23443254828453064,
"learning_rate": 0.00019914327136299651,
"loss": 0.4216,
"step": 1935
},
{
"epoch": 4.136460554371002,
"grad_norm": 0.3196619749069214,
"learning_rate": 0.0001991269923764013,
"loss": 0.4387,
"step": 1940
},
{
"epoch": 4.1471215351812365,
"grad_norm": 0.2881762981414795,
"learning_rate": 0.00019911056085870197,
"loss": 0.4176,
"step": 1945
},
{
"epoch": 4.157782515991471,
"grad_norm": 0.25249961018562317,
"learning_rate": 0.00019909397683518242,
"loss": 0.4221,
"step": 1950
},
{
"epoch": 4.1684434968017055,
"grad_norm": 0.22756356000900269,
"learning_rate": 0.00019907724033136118,
"loss": 0.413,
"step": 1955
},
{
"epoch": 4.17910447761194,
"grad_norm": 0.24332334101200104,
"learning_rate": 0.0001990603513729915,
"loss": 0.4218,
"step": 1960
},
{
"epoch": 4.189765458422174,
"grad_norm": 0.23593220114707947,
"learning_rate": 0.00019904330998606116,
"loss": 0.4114,
"step": 1965
},
{
"epoch": 4.20042643923241,
"grad_norm": 0.266313374042511,
"learning_rate": 0.00019902611619679252,
"loss": 0.4309,
"step": 1970
},
{
"epoch": 4.211087420042644,
"grad_norm": 0.3359983563423157,
"learning_rate": 0.00019900877003164235,
"loss": 0.4339,
"step": 1975
},
{
"epoch": 4.221748400852879,
"grad_norm": 0.22711415588855743,
"learning_rate": 0.00019899127151730206,
"loss": 0.4165,
"step": 1980
},
{
"epoch": 4.232409381663113,
"grad_norm": 0.2225334793329239,
"learning_rate": 0.00019897362068069732,
"loss": 0.4094,
"step": 1985
},
{
"epoch": 4.243070362473348,
"grad_norm": 0.2701500356197357,
"learning_rate": 0.0001989558175489883,
"loss": 0.4239,
"step": 1990
},
{
"epoch": 4.253731343283582,
"grad_norm": 0.2480495721101761,
"learning_rate": 0.00019893786214956945,
"loss": 0.4137,
"step": 1995
},
{
"epoch": 4.264392324093817,
"grad_norm": 0.22299885749816895,
"learning_rate": 0.00019891975451006953,
"loss": 0.4273,
"step": 2000
},
{
"epoch": 4.275053304904051,
"grad_norm": 0.2259630262851715,
"learning_rate": 0.0001989014946583516,
"loss": 0.4223,
"step": 2005
},
{
"epoch": 4.285714285714286,
"grad_norm": 0.3351574242115021,
"learning_rate": 0.00019888308262251285,
"loss": 0.4483,
"step": 2010
},
{
"epoch": 4.29637526652452,
"grad_norm": 0.21363438665866852,
"learning_rate": 0.0001988645184308848,
"loss": 0.4138,
"step": 2015
},
{
"epoch": 4.3070362473347545,
"grad_norm": 0.2409023493528366,
"learning_rate": 0.00019884580211203287,
"loss": 0.4166,
"step": 2020
},
{
"epoch": 4.317697228144989,
"grad_norm": 0.24684803187847137,
"learning_rate": 0.00019882693369475675,
"loss": 0.4089,
"step": 2025
},
{
"epoch": 4.3283582089552235,
"grad_norm": 0.24175861477851868,
"learning_rate": 0.0001988079132080901,
"loss": 0.4169,
"step": 2030
},
{
"epoch": 4.339019189765459,
"grad_norm": 0.3582640290260315,
"learning_rate": 0.00019878874068130062,
"loss": 0.4207,
"step": 2035
},
{
"epoch": 4.349680170575693,
"grad_norm": 0.23563334345817566,
"learning_rate": 0.00019876941614388992,
"loss": 0.4056,
"step": 2040
},
{
"epoch": 4.360341151385928,
"grad_norm": 0.24959246814250946,
"learning_rate": 0.0001987499396255935,
"loss": 0.4152,
"step": 2045
},
{
"epoch": 4.371002132196162,
"grad_norm": 0.2378864586353302,
"learning_rate": 0.00019873031115638073,
"loss": 0.428,
"step": 2050
},
{
"epoch": 4.381663113006397,
"grad_norm": 0.25769662857055664,
"learning_rate": 0.00019871053076645488,
"loss": 0.4273,
"step": 2055
},
{
"epoch": 4.392324093816631,
"grad_norm": 0.2148350328207016,
"learning_rate": 0.0001986905984862528,
"loss": 0.4341,
"step": 2060
},
{
"epoch": 4.402985074626866,
"grad_norm": 0.22630667686462402,
"learning_rate": 0.0001986705143464453,
"loss": 0.43,
"step": 2065
},
{
"epoch": 4.4136460554371,
"grad_norm": 0.23718136548995972,
"learning_rate": 0.00019865027837793665,
"loss": 0.4193,
"step": 2070
},
{
"epoch": 4.424307036247335,
"grad_norm": 0.26240232586860657,
"learning_rate": 0.00019862989061186483,
"loss": 0.4327,
"step": 2075
},
{
"epoch": 4.434968017057569,
"grad_norm": 0.21503274142742157,
"learning_rate": 0.0001986093510796015,
"loss": 0.4208,
"step": 2080
},
{
"epoch": 4.445628997867804,
"grad_norm": 0.31747710704803467,
"learning_rate": 0.0001985886598127516,
"loss": 0.4348,
"step": 2085
},
{
"epoch": 4.456289978678038,
"grad_norm": 0.24618090689182281,
"learning_rate": 0.00019856781684315382,
"loss": 0.4247,
"step": 2090
},
{
"epoch": 4.466950959488273,
"grad_norm": 0.33112359046936035,
"learning_rate": 0.00019854682220288013,
"loss": 0.4175,
"step": 2095
},
{
"epoch": 4.477611940298507,
"grad_norm": 0.23943935334682465,
"learning_rate": 0.0001985256759242359,
"loss": 0.4271,
"step": 2100
},
{
"epoch": 4.4882729211087415,
"grad_norm": 0.24192848801612854,
"learning_rate": 0.00019850437803975988,
"loss": 0.4221,
"step": 2105
},
{
"epoch": 4.498933901918977,
"grad_norm": 0.22631579637527466,
"learning_rate": 0.00019848292858222401,
"loss": 0.4233,
"step": 2110
},
{
"epoch": 4.509594882729211,
"grad_norm": 0.23344965279102325,
"learning_rate": 0.00019846132758463356,
"loss": 0.4161,
"step": 2115
},
{
"epoch": 4.520255863539446,
"grad_norm": 0.22698044776916504,
"learning_rate": 0.000198439575080227,
"loss": 0.4112,
"step": 2120
},
{
"epoch": 4.53091684434968,
"grad_norm": 0.3037104308605194,
"learning_rate": 0.00019841767110247575,
"loss": 0.4362,
"step": 2125
},
{
"epoch": 4.541577825159915,
"grad_norm": 0.24173210561275482,
"learning_rate": 0.00019839561568508454,
"loss": 0.4223,
"step": 2130
},
{
"epoch": 4.552238805970149,
"grad_norm": 0.2352645844221115,
"learning_rate": 0.00019837340886199096,
"loss": 0.4274,
"step": 2135
},
{
"epoch": 4.562899786780384,
"grad_norm": 0.2779860496520996,
"learning_rate": 0.0001983510506673657,
"loss": 0.4316,
"step": 2140
},
{
"epoch": 4.573560767590618,
"grad_norm": 0.24002455174922943,
"learning_rate": 0.0001983285411356122,
"loss": 0.4159,
"step": 2145
},
{
"epoch": 4.584221748400853,
"grad_norm": 0.22028042376041412,
"learning_rate": 0.00019830588030136698,
"loss": 0.4296,
"step": 2150
},
{
"epoch": 4.594882729211087,
"grad_norm": 0.3180830776691437,
"learning_rate": 0.0001982830681994992,
"loss": 0.4339,
"step": 2155
},
{
"epoch": 4.605543710021322,
"grad_norm": 0.2228025496006012,
"learning_rate": 0.00019826010486511091,
"loss": 0.4149,
"step": 2160
},
{
"epoch": 4.616204690831556,
"grad_norm": 0.2128361463546753,
"learning_rate": 0.00019823699033353677,
"loss": 0.4126,
"step": 2165
},
{
"epoch": 4.6268656716417915,
"grad_norm": 0.2322179228067398,
"learning_rate": 0.00019821372464034416,
"loss": 0.4128,
"step": 2170
},
{
"epoch": 4.637526652452026,
"grad_norm": 0.30600860714912415,
"learning_rate": 0.00019819030782133304,
"loss": 0.414,
"step": 2175
},
{
"epoch": 4.6481876332622605,
"grad_norm": 0.22045232355594635,
"learning_rate": 0.00019816673991253586,
"loss": 0.409,
"step": 2180
},
{
"epoch": 4.658848614072495,
"grad_norm": 0.2302045375108719,
"learning_rate": 0.00019814302095021768,
"loss": 0.4199,
"step": 2185
},
{
"epoch": 4.669509594882729,
"grad_norm": 0.22577248513698578,
"learning_rate": 0.00019811915097087587,
"loss": 0.4058,
"step": 2190
},
{
"epoch": 4.680170575692964,
"grad_norm": 0.6790816187858582,
"learning_rate": 0.00019809513001124024,
"loss": 0.4356,
"step": 2195
},
{
"epoch": 4.690831556503198,
"grad_norm": 0.2510231137275696,
"learning_rate": 0.00019807095810827293,
"loss": 0.4062,
"step": 2200
},
{
"epoch": 4.701492537313433,
"grad_norm": 0.24071648716926575,
"learning_rate": 0.00019804663529916826,
"loss": 0.4282,
"step": 2205
},
{
"epoch": 4.712153518123667,
"grad_norm": 0.2886710464954376,
"learning_rate": 0.00019802216162135287,
"loss": 0.4254,
"step": 2210
},
{
"epoch": 4.722814498933902,
"grad_norm": 0.2941761910915375,
"learning_rate": 0.0001979975371124855,
"loss": 0.4343,
"step": 2215
},
{
"epoch": 4.733475479744136,
"grad_norm": 0.2591281533241272,
"learning_rate": 0.00019797276181045693,
"loss": 0.4165,
"step": 2220
},
{
"epoch": 4.744136460554371,
"grad_norm": 0.2245703637599945,
"learning_rate": 0.00019794783575339004,
"loss": 0.4112,
"step": 2225
},
{
"epoch": 4.754797441364605,
"grad_norm": 0.48405957221984863,
"learning_rate": 0.00019792275897963967,
"loss": 0.4279,
"step": 2230
},
{
"epoch": 4.76545842217484,
"grad_norm": 0.22091209888458252,
"learning_rate": 0.00019789753152779258,
"loss": 0.4371,
"step": 2235
},
{
"epoch": 4.776119402985074,
"grad_norm": 0.23672465980052948,
"learning_rate": 0.00019787215343666732,
"loss": 0.4166,
"step": 2240
},
{
"epoch": 4.786780383795309,
"grad_norm": 0.43999361991882324,
"learning_rate": 0.0001978466247453143,
"loss": 0.4167,
"step": 2245
},
{
"epoch": 4.797441364605544,
"grad_norm": 0.2732659578323364,
"learning_rate": 0.0001978209454930157,
"loss": 0.4326,
"step": 2250
},
{
"epoch": 4.8081023454157785,
"grad_norm": 0.27667996287345886,
"learning_rate": 0.00019779511571928527,
"loss": 0.4192,
"step": 2255
},
{
"epoch": 4.818763326226013,
"grad_norm": 0.24479329586029053,
"learning_rate": 0.00019776913546386843,
"loss": 0.4158,
"step": 2260
},
{
"epoch": 4.8294243070362475,
"grad_norm": 0.21344681084156036,
"learning_rate": 0.0001977430047667422,
"loss": 0.4112,
"step": 2265
},
{
"epoch": 4.840085287846482,
"grad_norm": 0.24819132685661316,
"learning_rate": 0.00019771672366811503,
"loss": 0.414,
"step": 2270
},
{
"epoch": 4.850746268656716,
"grad_norm": 0.2435145080089569,
"learning_rate": 0.00019769029220842677,
"loss": 0.4172,
"step": 2275
},
{
"epoch": 4.861407249466951,
"grad_norm": 0.21831800043582916,
"learning_rate": 0.0001976637104283487,
"loss": 0.4168,
"step": 2280
},
{
"epoch": 4.872068230277185,
"grad_norm": 0.3001014292240143,
"learning_rate": 0.00019763697836878343,
"loss": 0.4271,
"step": 2285
},
{
"epoch": 4.88272921108742,
"grad_norm": 0.3473288118839264,
"learning_rate": 0.00019761009607086472,
"loss": 0.4256,
"step": 2290
},
{
"epoch": 4.893390191897654,
"grad_norm": 0.2094939649105072,
"learning_rate": 0.00019758306357595755,
"loss": 0.4207,
"step": 2295
},
{
"epoch": 4.904051172707889,
"grad_norm": 0.224636048078537,
"learning_rate": 0.00019755588092565805,
"loss": 0.4214,
"step": 2300
},
{
"epoch": 4.914712153518123,
"grad_norm": 0.22260229289531708,
"learning_rate": 0.00019752854816179336,
"loss": 0.4226,
"step": 2305
},
{
"epoch": 4.925373134328359,
"grad_norm": 0.21004381775856018,
"learning_rate": 0.0001975010653264216,
"loss": 0.414,
"step": 2310
},
{
"epoch": 4.936034115138593,
"grad_norm": 0.2120514214038849,
"learning_rate": 0.00019747343246183185,
"loss": 0.4152,
"step": 2315
},
{
"epoch": 4.946695095948828,
"grad_norm": 0.2152203619480133,
"learning_rate": 0.00019744564961054402,
"loss": 0.4159,
"step": 2320
},
{
"epoch": 4.957356076759062,
"grad_norm": 0.22371242940425873,
"learning_rate": 0.0001974177168153088,
"loss": 0.4095,
"step": 2325
},
{
"epoch": 4.968017057569297,
"grad_norm": 0.21865862607955933,
"learning_rate": 0.00019738963411910766,
"loss": 0.4261,
"step": 2330
},
{
"epoch": 4.978678038379531,
"grad_norm": 0.3230665326118469,
"learning_rate": 0.0001973614015651527,
"loss": 0.4116,
"step": 2335
},
{
"epoch": 4.9893390191897655,
"grad_norm": 0.21557492017745972,
"learning_rate": 0.00019733301919688651,
"loss": 0.4161,
"step": 2340
},
{
"epoch": 5.0,
"grad_norm": 0.21153585612773895,
"learning_rate": 0.00019730448705798239,
"loss": 0.4128,
"step": 2345
},
{
"epoch": 5.0,
"eval_loss": 0.5016890168190002,
"eval_runtime": 377.5434,
"eval_samples_per_second": 1.091,
"eval_steps_per_second": 1.091,
"step": 2345
},
{
"epoch": 5.0106609808102345,
"grad_norm": 0.20196357369422913,
"learning_rate": 0.000197275805192344,
"loss": 0.3909,
"step": 2350
},
{
"epoch": 5.021321961620469,
"grad_norm": 0.2446993738412857,
"learning_rate": 0.00019724697364410535,
"loss": 0.3876,
"step": 2355
},
{
"epoch": 5.031982942430703,
"grad_norm": 0.22501204907894135,
"learning_rate": 0.00019721799245763088,
"loss": 0.3882,
"step": 2360
},
{
"epoch": 5.042643923240938,
"grad_norm": 0.23419953882694244,
"learning_rate": 0.0001971888616775152,
"loss": 0.3786,
"step": 2365
},
{
"epoch": 5.053304904051172,
"grad_norm": 0.23151536285877228,
"learning_rate": 0.00019715958134858315,
"loss": 0.3925,
"step": 2370
},
{
"epoch": 5.063965884861407,
"grad_norm": 0.23873166739940643,
"learning_rate": 0.00019713015151588966,
"loss": 0.3927,
"step": 2375
},
{
"epoch": 5.074626865671641,
"grad_norm": 0.23083342611789703,
"learning_rate": 0.00019710057222471967,
"loss": 0.3836,
"step": 2380
},
{
"epoch": 5.085287846481877,
"grad_norm": 0.22406326234340668,
"learning_rate": 0.00019707084352058827,
"loss": 0.389,
"step": 2385
},
{
"epoch": 5.095948827292111,
"grad_norm": 0.37570300698280334,
"learning_rate": 0.00019704096544924022,
"loss": 0.3999,
"step": 2390
},
{
"epoch": 5.106609808102346,
"grad_norm": 0.21594493091106415,
"learning_rate": 0.0001970109380566503,
"loss": 0.38,
"step": 2395
},
{
"epoch": 5.11727078891258,
"grad_norm": 0.2725168466567993,
"learning_rate": 0.00019698076138902298,
"loss": 0.3848,
"step": 2400
},
{
"epoch": 5.127931769722815,
"grad_norm": 0.2510855495929718,
"learning_rate": 0.00019695043549279243,
"loss": 0.3859,
"step": 2405
},
{
"epoch": 5.138592750533049,
"grad_norm": 0.23722735047340393,
"learning_rate": 0.00019691996041462244,
"loss": 0.3876,
"step": 2410
},
{
"epoch": 5.149253731343284,
"grad_norm": 0.35469353199005127,
"learning_rate": 0.00019688933620140637,
"loss": 0.3863,
"step": 2415
},
{
"epoch": 5.159914712153518,
"grad_norm": 0.23087090253829956,
"learning_rate": 0.0001968585629002671,
"loss": 0.3898,
"step": 2420
},
{
"epoch": 5.1705756929637525,
"grad_norm": 0.21194830536842346,
"learning_rate": 0.00019682764055855683,
"loss": 0.3832,
"step": 2425
},
{
"epoch": 5.181236673773987,
"grad_norm": 0.23261596262454987,
"learning_rate": 0.00019679656922385715,
"loss": 0.3895,
"step": 2430
},
{
"epoch": 5.1918976545842215,
"grad_norm": 0.24160555005073547,
"learning_rate": 0.0001967653489439789,
"loss": 0.391,
"step": 2435
},
{
"epoch": 5.202558635394456,
"grad_norm": 0.23709999024868011,
"learning_rate": 0.00019673397976696216,
"loss": 0.3904,
"step": 2440
},
{
"epoch": 5.21321961620469,
"grad_norm": 0.2529030740261078,
"learning_rate": 0.00019670246174107597,
"loss": 0.3853,
"step": 2445
},
{
"epoch": 5.223880597014926,
"grad_norm": 0.22068992257118225,
"learning_rate": 0.0001966707949148186,
"loss": 0.3791,
"step": 2450
},
{
"epoch": 5.23454157782516,
"grad_norm": 0.23219233751296997,
"learning_rate": 0.00019663897933691718,
"loss": 0.3904,
"step": 2455
},
{
"epoch": 5.245202558635395,
"grad_norm": 0.25079360604286194,
"learning_rate": 0.00019660701505632772,
"loss": 0.3995,
"step": 2460
},
{
"epoch": 5.255863539445629,
"grad_norm": 0.2510697841644287,
"learning_rate": 0.00019657490212223515,
"loss": 0.3861,
"step": 2465
},
{
"epoch": 5.266524520255864,
"grad_norm": 0.25218454003334045,
"learning_rate": 0.000196542640584053,
"loss": 0.3878,
"step": 2470
},
{
"epoch": 5.277185501066098,
"grad_norm": 0.21124300360679626,
"learning_rate": 0.00019651023049142356,
"loss": 0.3881,
"step": 2475
},
{
"epoch": 5.287846481876333,
"grad_norm": 0.23286496102809906,
"learning_rate": 0.0001964776718942177,
"loss": 0.3893,
"step": 2480
},
{
"epoch": 5.298507462686567,
"grad_norm": 0.2385607361793518,
"learning_rate": 0.00019644496484253474,
"loss": 0.381,
"step": 2485
},
{
"epoch": 5.309168443496802,
"grad_norm": 0.22742030024528503,
"learning_rate": 0.00019641210938670247,
"loss": 0.393,
"step": 2490
},
{
"epoch": 5.319829424307036,
"grad_norm": 0.22051115334033966,
"learning_rate": 0.00019637910557727706,
"loss": 0.3933,
"step": 2495
},
{
"epoch": 5.330490405117271,
"grad_norm": 0.23317855596542358,
"learning_rate": 0.00019634595346504293,
"loss": 0.3877,
"step": 2500
},
{
"epoch": 5.341151385927505,
"grad_norm": 0.23425228893756866,
"learning_rate": 0.00019631265310101272,
"loss": 0.4158,
"step": 2505
},
{
"epoch": 5.3518123667377395,
"grad_norm": 0.25701725482940674,
"learning_rate": 0.00019627920453642715,
"loss": 0.3835,
"step": 2510
},
{
"epoch": 5.362473347547974,
"grad_norm": 0.23093344271183014,
"learning_rate": 0.00019624560782275505,
"loss": 0.3846,
"step": 2515
},
{
"epoch": 5.373134328358209,
"grad_norm": 0.2600732147693634,
"learning_rate": 0.00019621186301169315,
"loss": 0.3917,
"step": 2520
},
{
"epoch": 5.383795309168444,
"grad_norm": 0.2647717595100403,
"learning_rate": 0.00019617797015516607,
"loss": 0.3938,
"step": 2525
},
{
"epoch": 5.394456289978678,
"grad_norm": 0.24304771423339844,
"learning_rate": 0.0001961439293053263,
"loss": 0.3925,
"step": 2530
},
{
"epoch": 5.405117270788913,
"grad_norm": 0.2271909862756729,
"learning_rate": 0.00019610974051455398,
"loss": 0.3878,
"step": 2535
},
{
"epoch": 5.415778251599147,
"grad_norm": 0.22085613012313843,
"learning_rate": 0.00019607540383545692,
"loss": 0.4025,
"step": 2540
},
{
"epoch": 5.426439232409382,
"grad_norm": 0.2830078899860382,
"learning_rate": 0.0001960409193208705,
"loss": 0.3935,
"step": 2545
},
{
"epoch": 5.437100213219616,
"grad_norm": 0.37187430262565613,
"learning_rate": 0.00019600628702385751,
"loss": 0.3896,
"step": 2550
},
{
"epoch": 5.447761194029851,
"grad_norm": 0.23631027340888977,
"learning_rate": 0.00019597150699770835,
"loss": 0.3911,
"step": 2555
},
{
"epoch": 5.458422174840085,
"grad_norm": 0.224113330245018,
"learning_rate": 0.00019593657929594044,
"loss": 0.3876,
"step": 2560
},
{
"epoch": 5.46908315565032,
"grad_norm": 0.29911914467811584,
"learning_rate": 0.00019590150397229866,
"loss": 0.3966,
"step": 2565
},
{
"epoch": 5.479744136460554,
"grad_norm": 0.22963348031044006,
"learning_rate": 0.000195866281080755,
"loss": 0.3931,
"step": 2570
},
{
"epoch": 5.490405117270789,
"grad_norm": 0.24756336212158203,
"learning_rate": 0.0001958309106755084,
"loss": 0.3827,
"step": 2575
},
{
"epoch": 5.501066098081023,
"grad_norm": 0.22494661808013916,
"learning_rate": 0.00019579539281098493,
"loss": 0.3884,
"step": 2580
},
{
"epoch": 5.5117270788912585,
"grad_norm": 0.2217581868171692,
"learning_rate": 0.00019575972754183748,
"loss": 0.3954,
"step": 2585
},
{
"epoch": 5.522388059701493,
"grad_norm": 0.22264057397842407,
"learning_rate": 0.0001957239149229458,
"loss": 0.3925,
"step": 2590
},
{
"epoch": 5.533049040511727,
"grad_norm": 0.24900676310062408,
"learning_rate": 0.00019568795500941635,
"loss": 0.3938,
"step": 2595
},
{
"epoch": 5.543710021321962,
"grad_norm": 0.22802846133708954,
"learning_rate": 0.00019565184785658223,
"loss": 0.3903,
"step": 2600
},
{
"epoch": 5.554371002132196,
"grad_norm": 0.2182716578245163,
"learning_rate": 0.00019561559352000317,
"loss": 0.3929,
"step": 2605
},
{
"epoch": 5.565031982942431,
"grad_norm": 0.23668424785137177,
"learning_rate": 0.00019557919205546526,
"loss": 0.3815,
"step": 2610
},
{
"epoch": 5.575692963752665,
"grad_norm": 0.22820915281772614,
"learning_rate": 0.0001955426435189811,
"loss": 0.3937,
"step": 2615
},
{
"epoch": 5.5863539445629,
"grad_norm": 0.21698084473609924,
"learning_rate": 0.00019550594796678952,
"loss": 0.3925,
"step": 2620
},
{
"epoch": 5.597014925373134,
"grad_norm": 0.22192837297916412,
"learning_rate": 0.00019546910545535558,
"loss": 0.3858,
"step": 2625
},
{
"epoch": 5.607675906183369,
"grad_norm": 0.22095522284507751,
"learning_rate": 0.00019543211604137052,
"loss": 0.3863,
"step": 2630
},
{
"epoch": 5.618336886993603,
"grad_norm": 0.22427357733249664,
"learning_rate": 0.0001953949797817516,
"loss": 0.3836,
"step": 2635
},
{
"epoch": 5.628997867803838,
"grad_norm": 0.23269647359848022,
"learning_rate": 0.00019535769673364203,
"loss": 0.3913,
"step": 2640
},
{
"epoch": 5.639658848614072,
"grad_norm": 0.21933898329734802,
"learning_rate": 0.00019532026695441083,
"loss": 0.3948,
"step": 2645
},
{
"epoch": 5.650319829424307,
"grad_norm": 0.227766752243042,
"learning_rate": 0.00019528269050165297,
"loss": 0.3861,
"step": 2650
},
{
"epoch": 5.660980810234541,
"grad_norm": 0.22262893617153168,
"learning_rate": 0.00019524496743318891,
"loss": 0.3921,
"step": 2655
},
{
"epoch": 5.6716417910447765,
"grad_norm": 0.28188657760620117,
"learning_rate": 0.00019520709780706486,
"loss": 0.3802,
"step": 2660
},
{
"epoch": 5.682302771855011,
"grad_norm": 0.22414395213127136,
"learning_rate": 0.00019516908168155245,
"loss": 0.3858,
"step": 2665
},
{
"epoch": 5.6929637526652455,
"grad_norm": 0.222300723195076,
"learning_rate": 0.00019513091911514885,
"loss": 0.3886,
"step": 2670
},
{
"epoch": 5.70362473347548,
"grad_norm": 0.2155119776725769,
"learning_rate": 0.00019509261016657643,
"loss": 0.3948,
"step": 2675
},
{
"epoch": 5.714285714285714,
"grad_norm": 0.23029391467571259,
"learning_rate": 0.0001950541548947829,
"loss": 0.3915,
"step": 2680
},
{
"epoch": 5.724946695095949,
"grad_norm": 0.23538485169410706,
"learning_rate": 0.0001950155533589411,
"loss": 0.4005,
"step": 2685
},
{
"epoch": 5.735607675906183,
"grad_norm": 0.249455988407135,
"learning_rate": 0.00019497680561844893,
"loss": 0.386,
"step": 2690
},
{
"epoch": 5.746268656716418,
"grad_norm": 0.21184088289737701,
"learning_rate": 0.00019493791173292923,
"loss": 0.3931,
"step": 2695
},
{
"epoch": 5.756929637526652,
"grad_norm": 0.21931645274162292,
"learning_rate": 0.00019489887176222975,
"loss": 0.3981,
"step": 2700
},
{
"epoch": 5.767590618336887,
"grad_norm": 0.2259492725133896,
"learning_rate": 0.00019485968576642308,
"loss": 0.3848,
"step": 2705
},
{
"epoch": 5.778251599147121,
"grad_norm": 0.23413480818271637,
"learning_rate": 0.00019482035380580638,
"loss": 0.3875,
"step": 2710
},
{
"epoch": 5.788912579957356,
"grad_norm": 0.22880232334136963,
"learning_rate": 0.00019478087594090155,
"loss": 0.3838,
"step": 2715
},
{
"epoch": 5.79957356076759,
"grad_norm": 0.22865185141563416,
"learning_rate": 0.00019474125223245488,
"loss": 0.3855,
"step": 2720
},
{
"epoch": 5.810234541577826,
"grad_norm": 0.24277456104755402,
"learning_rate": 0.00019470148274143713,
"loss": 0.3938,
"step": 2725
},
{
"epoch": 5.82089552238806,
"grad_norm": 0.2189398854970932,
"learning_rate": 0.00019466156752904343,
"loss": 0.4008,
"step": 2730
},
{
"epoch": 5.8315565031982945,
"grad_norm": 0.21893605589866638,
"learning_rate": 0.00019462150665669302,
"loss": 0.3874,
"step": 2735
},
{
"epoch": 5.842217484008529,
"grad_norm": 0.23077057301998138,
"learning_rate": 0.00019458130018602945,
"loss": 0.3929,
"step": 2740
},
{
"epoch": 5.8528784648187635,
"grad_norm": 0.2599683701992035,
"learning_rate": 0.00019454094817892008,
"loss": 0.3892,
"step": 2745
},
{
"epoch": 5.863539445628998,
"grad_norm": 0.22645121812820435,
"learning_rate": 0.00019450045069745642,
"loss": 0.3913,
"step": 2750
},
{
"epoch": 5.8742004264392325,
"grad_norm": 0.22834275662899017,
"learning_rate": 0.00019445980780395368,
"loss": 0.3958,
"step": 2755
},
{
"epoch": 5.884861407249467,
"grad_norm": 0.24456727504730225,
"learning_rate": 0.00019441901956095093,
"loss": 0.3939,
"step": 2760
},
{
"epoch": 5.895522388059701,
"grad_norm": 0.21773149073123932,
"learning_rate": 0.00019437808603121087,
"loss": 0.3988,
"step": 2765
},
{
"epoch": 5.906183368869936,
"grad_norm": 0.21768063306808472,
"learning_rate": 0.00019433700727771965,
"loss": 0.3894,
"step": 2770
},
{
"epoch": 5.91684434968017,
"grad_norm": 0.2415178418159485,
"learning_rate": 0.00019429578336368708,
"loss": 0.3931,
"step": 2775
},
{
"epoch": 5.927505330490405,
"grad_norm": 0.21271879971027374,
"learning_rate": 0.00019425441435254616,
"loss": 0.3957,
"step": 2780
},
{
"epoch": 5.938166311300639,
"grad_norm": 0.21745960414409637,
"learning_rate": 0.00019421290030795322,
"loss": 0.3948,
"step": 2785
},
{
"epoch": 5.948827292110874,
"grad_norm": 0.22035416960716248,
"learning_rate": 0.0001941712412937878,
"loss": 0.3922,
"step": 2790
},
{
"epoch": 5.959488272921108,
"grad_norm": 0.20828816294670105,
"learning_rate": 0.00019412943737415246,
"loss": 0.3976,
"step": 2795
},
{
"epoch": 5.970149253731344,
"grad_norm": 0.19749729335308075,
"learning_rate": 0.00019408748861337273,
"loss": 0.3994,
"step": 2800
},
{
"epoch": 5.980810234541578,
"grad_norm": 0.20768584311008453,
"learning_rate": 0.00019404539507599707,
"loss": 0.3869,
"step": 2805
},
{
"epoch": 5.991471215351813,
"grad_norm": 0.2182578146457672,
"learning_rate": 0.00019400315682679663,
"loss": 0.3924,
"step": 2810
},
{
"epoch": 6.0,
"eval_loss": 0.5093127489089966,
"eval_runtime": 377.4947,
"eval_samples_per_second": 1.091,
"eval_steps_per_second": 1.091,
"step": 2814
},
{
"epoch": 6.002132196162047,
"grad_norm": 0.21125191450119019,
"learning_rate": 0.0001939607739307653,
"loss": 0.3874,
"step": 2815
},
{
"epoch": 6.0127931769722816,
"grad_norm": 0.31068113446235657,
"learning_rate": 0.0001939182464531195,
"loss": 0.3704,
"step": 2820
},
{
"epoch": 6.023454157782516,
"grad_norm": 0.23276059329509735,
"learning_rate": 0.00019387557445929823,
"loss": 0.353,
"step": 2825
},
{
"epoch": 6.0341151385927505,
"grad_norm": 0.25309714674949646,
"learning_rate": 0.00019383275801496268,
"loss": 0.3494,
"step": 2830
},
{
"epoch": 6.044776119402985,
"grad_norm": 0.2310338020324707,
"learning_rate": 0.00019378979718599645,
"loss": 0.3534,
"step": 2835
},
{
"epoch": 6.0554371002132195,
"grad_norm": 0.23623259365558624,
"learning_rate": 0.00019374669203850532,
"loss": 0.3513,
"step": 2840
},
{
"epoch": 6.066098081023454,
"grad_norm": 0.2299884408712387,
"learning_rate": 0.00019370344263881702,
"loss": 0.3534,
"step": 2845
},
{
"epoch": 6.076759061833688,
"grad_norm": 0.5613902807235718,
"learning_rate": 0.0001936600490534814,
"loss": 0.3615,
"step": 2850
},
{
"epoch": 6.087420042643923,
"grad_norm": 0.22940614819526672,
"learning_rate": 0.00019361651134927003,
"loss": 0.3522,
"step": 2855
},
{
"epoch": 6.098081023454157,
"grad_norm": 0.22831672430038452,
"learning_rate": 0.0001935728295931763,
"loss": 0.3523,
"step": 2860
},
{
"epoch": 6.108742004264393,
"grad_norm": 0.23445968329906464,
"learning_rate": 0.00019352900385241536,
"loss": 0.369,
"step": 2865
},
{
"epoch": 6.119402985074627,
"grad_norm": 0.2444639503955841,
"learning_rate": 0.0001934850341944237,
"loss": 0.355,
"step": 2870
},
{
"epoch": 6.130063965884862,
"grad_norm": 0.2400490790605545,
"learning_rate": 0.00019344092068685948,
"loss": 0.3625,
"step": 2875
},
{
"epoch": 6.140724946695096,
"grad_norm": 0.2361455261707306,
"learning_rate": 0.00019339666339760207,
"loss": 0.3649,
"step": 2880
},
{
"epoch": 6.151385927505331,
"grad_norm": 0.26625874638557434,
"learning_rate": 0.00019335226239475215,
"loss": 0.3572,
"step": 2885
},
{
"epoch": 6.162046908315565,
"grad_norm": 0.2775781750679016,
"learning_rate": 0.0001933077177466315,
"loss": 0.3446,
"step": 2890
},
{
"epoch": 6.1727078891258,
"grad_norm": 0.25833654403686523,
"learning_rate": 0.00019326302952178294,
"loss": 0.3624,
"step": 2895
},
{
"epoch": 6.183368869936034,
"grad_norm": 0.2403610199689865,
"learning_rate": 0.00019321819778897023,
"loss": 0.3578,
"step": 2900
},
{
"epoch": 6.1940298507462686,
"grad_norm": 0.2580753266811371,
"learning_rate": 0.00019317322261717794,
"loss": 0.3536,
"step": 2905
},
{
"epoch": 6.204690831556503,
"grad_norm": 0.2725096046924591,
"learning_rate": 0.0001931281040756114,
"loss": 0.3689,
"step": 2910
},
{
"epoch": 6.2153518123667375,
"grad_norm": 0.27059614658355713,
"learning_rate": 0.00019308284223369646,
"loss": 0.3656,
"step": 2915
},
{
"epoch": 6.226012793176972,
"grad_norm": 0.24707560241222382,
"learning_rate": 0.00019303743716107957,
"loss": 0.3682,
"step": 2920
},
{
"epoch": 6.2366737739872065,
"grad_norm": 0.23825524747371674,
"learning_rate": 0.00019299188892762752,
"loss": 0.3578,
"step": 2925
},
{
"epoch": 6.247334754797441,
"grad_norm": 0.24557247757911682,
"learning_rate": 0.00019294619760342737,
"loss": 0.3624,
"step": 2930
},
{
"epoch": 6.257995735607676,
"grad_norm": 0.2559678256511688,
"learning_rate": 0.00019290036325878644,
"loss": 0.3693,
"step": 2935
},
{
"epoch": 6.268656716417911,
"grad_norm": 0.25294074416160583,
"learning_rate": 0.00019285438596423204,
"loss": 0.3651,
"step": 2940
},
{
"epoch": 6.279317697228145,
"grad_norm": 0.24387520551681519,
"learning_rate": 0.00019280826579051147,
"loss": 0.3589,
"step": 2945
},
{
"epoch": 6.28997867803838,
"grad_norm": 0.22580432891845703,
"learning_rate": 0.0001927620028085919,
"loss": 0.3703,
"step": 2950
},
{
"epoch": 6.300639658848614,
"grad_norm": 0.24953973293304443,
"learning_rate": 0.00019271559708966023,
"loss": 0.3606,
"step": 2955
},
{
"epoch": 6.311300639658849,
"grad_norm": 0.2454618364572525,
"learning_rate": 0.000192669048705123,
"loss": 0.362,
"step": 2960
},
{
"epoch": 6.321961620469083,
"grad_norm": 0.2393016368150711,
"learning_rate": 0.00019262235772660627,
"loss": 0.3695,
"step": 2965
},
{
"epoch": 6.332622601279318,
"grad_norm": 0.2463667392730713,
"learning_rate": 0.00019257552422595554,
"loss": 0.3658,
"step": 2970
},
{
"epoch": 6.343283582089552,
"grad_norm": 0.24116967618465424,
"learning_rate": 0.00019252854827523557,
"loss": 0.3671,
"step": 2975
},
{
"epoch": 6.353944562899787,
"grad_norm": 0.2345789670944214,
"learning_rate": 0.00019248142994673036,
"loss": 0.368,
"step": 2980
},
{
"epoch": 6.364605543710021,
"grad_norm": 0.26505357027053833,
"learning_rate": 0.000192434169312943,
"loss": 0.3695,
"step": 2985
},
{
"epoch": 6.3752665245202556,
"grad_norm": 0.2504933476448059,
"learning_rate": 0.00019238676644659546,
"loss": 0.3605,
"step": 2990
},
{
"epoch": 6.38592750533049,
"grad_norm": 0.24889980256557465,
"learning_rate": 0.0001923392214206287,
"loss": 0.3684,
"step": 2995
},
{
"epoch": 6.396588486140725,
"grad_norm": 0.2319326400756836,
"learning_rate": 0.00019229153430820232,
"loss": 0.3621,
"step": 3000
},
{
"epoch": 6.40724946695096,
"grad_norm": 0.2329808622598648,
"learning_rate": 0.00019224370518269458,
"loss": 0.3649,
"step": 3005
},
{
"epoch": 6.417910447761194,
"grad_norm": 0.2565195560455322,
"learning_rate": 0.00019219573411770235,
"loss": 0.3602,
"step": 3010
},
{
"epoch": 6.428571428571429,
"grad_norm": 0.24189329147338867,
"learning_rate": 0.00019214762118704076,
"loss": 0.3691,
"step": 3015
},
{
"epoch": 6.439232409381663,
"grad_norm": 0.2512595057487488,
"learning_rate": 0.0001920993664647434,
"loss": 0.364,
"step": 3020
},
{
"epoch": 6.449893390191898,
"grad_norm": 0.24277447164058685,
"learning_rate": 0.00019205097002506185,
"loss": 0.3732,
"step": 3025
},
{
"epoch": 6.460554371002132,
"grad_norm": 0.242990642786026,
"learning_rate": 0.00019200243194246594,
"loss": 0.3674,
"step": 3030
},
{
"epoch": 6.471215351812367,
"grad_norm": 0.23621074855327606,
"learning_rate": 0.00019195375229164334,
"loss": 0.3599,
"step": 3035
},
{
"epoch": 6.481876332622601,
"grad_norm": 0.26253125071525574,
"learning_rate": 0.0001919049311474996,
"loss": 0.3708,
"step": 3040
},
{
"epoch": 6.492537313432836,
"grad_norm": 0.2214423567056656,
"learning_rate": 0.000191855968585158,
"loss": 0.3612,
"step": 3045
},
{
"epoch": 6.50319829424307,
"grad_norm": 0.24866749346256256,
"learning_rate": 0.00019180686467995935,
"loss": 0.3682,
"step": 3050
},
{
"epoch": 6.513859275053305,
"grad_norm": 0.2474697232246399,
"learning_rate": 0.00019175761950746204,
"loss": 0.354,
"step": 3055
},
{
"epoch": 6.524520255863539,
"grad_norm": 0.26961109042167664,
"learning_rate": 0.00019170823314344185,
"loss": 0.3708,
"step": 3060
},
{
"epoch": 6.535181236673774,
"grad_norm": 0.2510351538658142,
"learning_rate": 0.0001916587056638917,
"loss": 0.3667,
"step": 3065
},
{
"epoch": 6.545842217484008,
"grad_norm": 0.24457301199436188,
"learning_rate": 0.00019160903714502173,
"loss": 0.3679,
"step": 3070
},
{
"epoch": 6.556503198294243,
"grad_norm": 0.23988381028175354,
"learning_rate": 0.00019155922766325918,
"loss": 0.3608,
"step": 3075
},
{
"epoch": 6.567164179104478,
"grad_norm": 0.2317483127117157,
"learning_rate": 0.000191509277295248,
"loss": 0.3761,
"step": 3080
},
{
"epoch": 6.577825159914712,
"grad_norm": 0.2614232301712036,
"learning_rate": 0.0001914591861178491,
"loss": 0.3606,
"step": 3085
},
{
"epoch": 6.588486140724947,
"grad_norm": 0.24253317713737488,
"learning_rate": 0.00019140895420813997,
"loss": 0.362,
"step": 3090
},
{
"epoch": 6.599147121535181,
"grad_norm": 0.2507173418998718,
"learning_rate": 0.00019135858164341473,
"loss": 0.3594,
"step": 3095
},
{
"epoch": 6.609808102345416,
"grad_norm": 0.23574085533618927,
"learning_rate": 0.0001913080685011838,
"loss": 0.3661,
"step": 3100
},
{
"epoch": 6.62046908315565,
"grad_norm": 0.2325553447008133,
"learning_rate": 0.00019125741485917405,
"loss": 0.3756,
"step": 3105
},
{
"epoch": 6.631130063965885,
"grad_norm": 0.2191423624753952,
"learning_rate": 0.00019120662079532853,
"loss": 0.354,
"step": 3110
},
{
"epoch": 6.641791044776119,
"grad_norm": 0.21787339448928833,
"learning_rate": 0.00019115568638780622,
"loss": 0.3657,
"step": 3115
},
{
"epoch": 6.652452025586354,
"grad_norm": 0.21904399991035461,
"learning_rate": 0.0001911046117149822,
"loss": 0.367,
"step": 3120
},
{
"epoch": 6.663113006396588,
"grad_norm": 0.23119735717773438,
"learning_rate": 0.00019105339685544735,
"loss": 0.3646,
"step": 3125
},
{
"epoch": 6.673773987206823,
"grad_norm": 0.24613478779792786,
"learning_rate": 0.00019100204188800827,
"loss": 0.3682,
"step": 3130
},
{
"epoch": 6.684434968017058,
"grad_norm": 0.2366684079170227,
"learning_rate": 0.00019095054689168705,
"loss": 0.3714,
"step": 3135
},
{
"epoch": 6.6950959488272925,
"grad_norm": 0.2413744032382965,
"learning_rate": 0.0001908989119457214,
"loss": 0.3682,
"step": 3140
},
{
"epoch": 6.705756929637527,
"grad_norm": 0.23421700298786163,
"learning_rate": 0.00019084713712956428,
"loss": 0.3639,
"step": 3145
},
{
"epoch": 6.7164179104477615,
"grad_norm": 0.23423875868320465,
"learning_rate": 0.00019079522252288386,
"loss": 0.3655,
"step": 3150
},
{
"epoch": 6.727078891257996,
"grad_norm": 0.23802149295806885,
"learning_rate": 0.00019074316820556352,
"loss": 0.3708,
"step": 3155
},
{
"epoch": 6.73773987206823,
"grad_norm": 0.25665974617004395,
"learning_rate": 0.00019069097425770154,
"loss": 0.3762,
"step": 3160
},
{
"epoch": 6.748400852878465,
"grad_norm": 0.23551535606384277,
"learning_rate": 0.00019063864075961098,
"loss": 0.3687,
"step": 3165
},
{
"epoch": 6.759061833688699,
"grad_norm": 0.24098068475723267,
"learning_rate": 0.00019058616779181982,
"loss": 0.3659,
"step": 3170
},
{
"epoch": 6.769722814498934,
"grad_norm": 0.22562439739704132,
"learning_rate": 0.0001905335554350705,
"loss": 0.3724,
"step": 3175
},
{
"epoch": 6.780383795309168,
"grad_norm": 0.224997878074646,
"learning_rate": 0.00019048080377031995,
"loss": 0.3705,
"step": 3180
},
{
"epoch": 6.791044776119403,
"grad_norm": 0.2575388252735138,
"learning_rate": 0.00019042791287873957,
"loss": 0.3611,
"step": 3185
},
{
"epoch": 6.801705756929637,
"grad_norm": 0.231009379029274,
"learning_rate": 0.0001903748828417149,
"loss": 0.3653,
"step": 3190
},
{
"epoch": 6.812366737739872,
"grad_norm": 0.23769618570804596,
"learning_rate": 0.0001903217137408456,
"loss": 0.3615,
"step": 3195
},
{
"epoch": 6.823027718550106,
"grad_norm": 0.23301640152931213,
"learning_rate": 0.00019026840565794536,
"loss": 0.366,
"step": 3200
},
{
"epoch": 6.833688699360341,
"grad_norm": 0.2212369292974472,
"learning_rate": 0.00019021495867504163,
"loss": 0.3632,
"step": 3205
},
{
"epoch": 6.844349680170575,
"grad_norm": 0.23795363306999207,
"learning_rate": 0.0001901613728743757,
"loss": 0.3681,
"step": 3210
},
{
"epoch": 6.855010660980811,
"grad_norm": 0.24354343116283417,
"learning_rate": 0.00019010764833840243,
"loss": 0.3695,
"step": 3215
},
{
"epoch": 6.865671641791045,
"grad_norm": 0.24145299196243286,
"learning_rate": 0.00019005378514979008,
"loss": 0.3667,
"step": 3220
},
{
"epoch": 6.8763326226012795,
"grad_norm": 0.24070268869400024,
"learning_rate": 0.0001899997833914204,
"loss": 0.3693,
"step": 3225
},
{
"epoch": 6.886993603411514,
"grad_norm": 0.22578920423984528,
"learning_rate": 0.00018994564314638832,
"loss": 0.3692,
"step": 3230
},
{
"epoch": 6.8976545842217485,
"grad_norm": 0.22691179811954498,
"learning_rate": 0.00018989136449800174,
"loss": 0.3766,
"step": 3235
},
{
"epoch": 6.908315565031983,
"grad_norm": 0.2194678634405136,
"learning_rate": 0.0001898369475297817,
"loss": 0.3668,
"step": 3240
},
{
"epoch": 6.918976545842217,
"grad_norm": 0.22618421912193298,
"learning_rate": 0.000189782392325462,
"loss": 0.3592,
"step": 3245
},
{
"epoch": 6.929637526652452,
"grad_norm": 0.2549285292625427,
"learning_rate": 0.0001897276989689891,
"loss": 0.3653,
"step": 3250
},
{
"epoch": 6.940298507462686,
"grad_norm": 0.23101598024368286,
"learning_rate": 0.00018967286754452214,
"loss": 0.3569,
"step": 3255
},
{
"epoch": 6.950959488272921,
"grad_norm": 0.2506960332393646,
"learning_rate": 0.00018961789813643268,
"loss": 0.3633,
"step": 3260
},
{
"epoch": 6.961620469083155,
"grad_norm": 0.2284671515226364,
"learning_rate": 0.00018956279082930455,
"loss": 0.3624,
"step": 3265
},
{
"epoch": 6.97228144989339,
"grad_norm": 0.22146272659301758,
"learning_rate": 0.00018950754570793384,
"loss": 0.37,
"step": 3270
},
{
"epoch": 6.982942430703625,
"grad_norm": 0.2425510585308075,
"learning_rate": 0.00018945216285732864,
"loss": 0.366,
"step": 3275
},
{
"epoch": 6.99360341151386,
"grad_norm": 0.2304454892873764,
"learning_rate": 0.00018939664236270907,
"loss": 0.3684,
"step": 3280
},
{
"epoch": 7.0,
"eval_loss": 0.5168320536613464,
"eval_runtime": 377.6098,
"eval_samples_per_second": 1.091,
"eval_steps_per_second": 1.091,
"step": 3283
},
{
"epoch": 7.004264392324094,
"grad_norm": 0.2056385576725006,
"learning_rate": 0.00018934098430950692,
"loss": 0.3479,
"step": 3285
},
{
"epoch": 7.014925373134329,
"grad_norm": 0.2757323086261749,
"learning_rate": 0.0001892851887833657,
"loss": 0.333,
"step": 3290
},
{
"epoch": 7.025586353944563,
"grad_norm": 0.25871726870536804,
"learning_rate": 0.00018922925587014046,
"loss": 0.3212,
"step": 3295
},
{
"epoch": 7.036247334754798,
"grad_norm": 0.2494359016418457,
"learning_rate": 0.00018917318565589772,
"loss": 0.3248,
"step": 3300
},
{
"epoch": 7.046908315565032,
"grad_norm": 0.2385275512933731,
"learning_rate": 0.00018911697822691516,
"loss": 0.3189,
"step": 3305
},
{
"epoch": 7.0575692963752665,
"grad_norm": 0.2520158588886261,
"learning_rate": 0.00018906063366968165,
"loss": 0.3268,
"step": 3310
},
{
"epoch": 7.068230277185501,
"grad_norm": 0.25822409987449646,
"learning_rate": 0.00018900415207089708,
"loss": 0.3169,
"step": 3315
},
{
"epoch": 7.0788912579957355,
"grad_norm": 0.2619076669216156,
"learning_rate": 0.00018894753351747214,
"loss": 0.3279,
"step": 3320
},
{
"epoch": 7.08955223880597,
"grad_norm": 0.30978551506996155,
"learning_rate": 0.0001888907780965284,
"loss": 0.327,
"step": 3325
},
{
"epoch": 7.100213219616204,
"grad_norm": 0.25372347235679626,
"learning_rate": 0.00018883388589539785,
"loss": 0.3254,
"step": 3330
},
{
"epoch": 7.110874200426439,
"grad_norm": 0.27630311250686646,
"learning_rate": 0.0001887768570016231,
"loss": 0.3291,
"step": 3335
},
{
"epoch": 7.121535181236673,
"grad_norm": 0.2716643810272217,
"learning_rate": 0.00018871969150295706,
"loss": 0.3241,
"step": 3340
},
{
"epoch": 7.132196162046908,
"grad_norm": 0.2678888440132141,
"learning_rate": 0.00018866238948736278,
"loss": 0.3304,
"step": 3345
},
{
"epoch": 7.142857142857143,
"grad_norm": 0.2532709240913391,
"learning_rate": 0.00018860495104301345,
"loss": 0.3331,
"step": 3350
},
{
"epoch": 7.153518123667378,
"grad_norm": 0.2671636939048767,
"learning_rate": 0.0001885473762582921,
"loss": 0.3315,
"step": 3355
},
{
"epoch": 7.164179104477612,
"grad_norm": 0.2550068497657776,
"learning_rate": 0.00018848966522179168,
"loss": 0.3306,
"step": 3360
},
{
"epoch": 7.174840085287847,
"grad_norm": 0.2700331211090088,
"learning_rate": 0.00018843181802231465,
"loss": 0.329,
"step": 3365
},
{
"epoch": 7.185501066098081,
"grad_norm": 0.26168689131736755,
"learning_rate": 0.00018837383474887314,
"loss": 0.3327,
"step": 3370
},
{
"epoch": 7.196162046908316,
"grad_norm": 0.24964787065982819,
"learning_rate": 0.00018831571549068852,
"loss": 0.3353,
"step": 3375
},
{
"epoch": 7.20682302771855,
"grad_norm": 0.2676330804824829,
"learning_rate": 0.00018825746033719149,
"loss": 0.3316,
"step": 3380
},
{
"epoch": 7.217484008528785,
"grad_norm": 0.25253960490226746,
"learning_rate": 0.0001881990693780219,
"loss": 0.3316,
"step": 3385
},
{
"epoch": 7.228144989339019,
"grad_norm": 0.257114440202713,
"learning_rate": 0.0001881405427030284,
"loss": 0.3307,
"step": 3390
},
{
"epoch": 7.2388059701492535,
"grad_norm": 0.25102248787879944,
"learning_rate": 0.00018808188040226868,
"loss": 0.3348,
"step": 3395
},
{
"epoch": 7.249466950959488,
"grad_norm": 0.25489816069602966,
"learning_rate": 0.000188023082566009,
"loss": 0.3342,
"step": 3400
},
{
"epoch": 7.2601279317697225,
"grad_norm": 0.27044063806533813,
"learning_rate": 0.00018796414928472417,
"loss": 0.3391,
"step": 3405
},
{
"epoch": 7.270788912579957,
"grad_norm": 0.26209956407546997,
"learning_rate": 0.00018790508064909746,
"loss": 0.3318,
"step": 3410
},
{
"epoch": 7.281449893390192,
"grad_norm": 0.25549113750457764,
"learning_rate": 0.00018784587675002045,
"loss": 0.3322,
"step": 3415
},
{
"epoch": 7.292110874200427,
"grad_norm": 0.26465660333633423,
"learning_rate": 0.00018778653767859274,
"loss": 0.3319,
"step": 3420
},
{
"epoch": 7.302771855010661,
"grad_norm": 0.2753106951713562,
"learning_rate": 0.00018772706352612203,
"loss": 0.3329,
"step": 3425
},
{
"epoch": 7.313432835820896,
"grad_norm": 0.2526467740535736,
"learning_rate": 0.00018766745438412384,
"loss": 0.3311,
"step": 3430
},
{
"epoch": 7.32409381663113,
"grad_norm": 0.2626464068889618,
"learning_rate": 0.00018760771034432138,
"loss": 0.3318,
"step": 3435
},
{
"epoch": 7.334754797441365,
"grad_norm": 0.2631151080131531,
"learning_rate": 0.0001875478314986455,
"loss": 0.3453,
"step": 3440
},
{
"epoch": 7.345415778251599,
"grad_norm": 0.25757527351379395,
"learning_rate": 0.0001874878179392344,
"loss": 0.3373,
"step": 3445
},
{
"epoch": 7.356076759061834,
"grad_norm": 0.2395113706588745,
"learning_rate": 0.0001874276697584336,
"loss": 0.331,
"step": 3450
},
{
"epoch": 7.366737739872068,
"grad_norm": 0.2804111838340759,
"learning_rate": 0.0001873673870487958,
"loss": 0.3378,
"step": 3455
},
{
"epoch": 7.377398720682303,
"grad_norm": 0.24439595639705658,
"learning_rate": 0.00018730696990308069,
"loss": 0.3381,
"step": 3460
},
{
"epoch": 7.388059701492537,
"grad_norm": 0.270958811044693,
"learning_rate": 0.00018724641841425478,
"loss": 0.3418,
"step": 3465
},
{
"epoch": 7.398720682302772,
"grad_norm": 0.2635878324508667,
"learning_rate": 0.0001871857326754914,
"loss": 0.3433,
"step": 3470
},
{
"epoch": 7.409381663113006,
"grad_norm": 0.24128612875938416,
"learning_rate": 0.00018712491278017032,
"loss": 0.3395,
"step": 3475
},
{
"epoch": 7.4200426439232405,
"grad_norm": 0.2588317096233368,
"learning_rate": 0.00018706395882187783,
"loss": 0.3415,
"step": 3480
},
{
"epoch": 7.430703624733475,
"grad_norm": 0.2590773105621338,
"learning_rate": 0.0001870028708944065,
"loss": 0.3392,
"step": 3485
},
{
"epoch": 7.44136460554371,
"grad_norm": 0.25688695907592773,
"learning_rate": 0.00018694164909175505,
"loss": 0.3385,
"step": 3490
},
{
"epoch": 7.452025586353945,
"grad_norm": 0.23704120516777039,
"learning_rate": 0.00018688029350812817,
"loss": 0.3356,
"step": 3495
},
{
"epoch": 7.462686567164179,
"grad_norm": 0.2817398011684418,
"learning_rate": 0.00018681880423793642,
"loss": 0.3368,
"step": 3500
},
{
"epoch": 7.473347547974414,
"grad_norm": 0.2590171694755554,
"learning_rate": 0.00018675718137579607,
"loss": 0.3382,
"step": 3505
},
{
"epoch": 7.484008528784648,
"grad_norm": 0.2843134105205536,
"learning_rate": 0.00018669542501652896,
"loss": 0.3304,
"step": 3510
},
{
"epoch": 7.494669509594883,
"grad_norm": 0.25284621119499207,
"learning_rate": 0.00018663353525516234,
"loss": 0.3337,
"step": 3515
},
{
"epoch": 7.505330490405117,
"grad_norm": 0.24715737998485565,
"learning_rate": 0.00018657151218692873,
"loss": 0.3373,
"step": 3520
},
{
"epoch": 7.515991471215352,
"grad_norm": 0.28074926137924194,
"learning_rate": 0.0001865093559072658,
"loss": 0.3376,
"step": 3525
},
{
"epoch": 7.526652452025586,
"grad_norm": 0.2531152367591858,
"learning_rate": 0.00018644706651181614,
"loss": 0.3329,
"step": 3530
},
{
"epoch": 7.537313432835821,
"grad_norm": 0.27217596769332886,
"learning_rate": 0.00018638464409642723,
"loss": 0.3486,
"step": 3535
},
{
"epoch": 7.547974413646055,
"grad_norm": 0.2517159581184387,
"learning_rate": 0.0001863220887571512,
"loss": 0.343,
"step": 3540
},
{
"epoch": 7.55863539445629,
"grad_norm": 0.2538190484046936,
"learning_rate": 0.00018625940059024477,
"loss": 0.3343,
"step": 3545
},
{
"epoch": 7.569296375266525,
"grad_norm": 0.26679527759552,
"learning_rate": 0.00018619657969216893,
"loss": 0.348,
"step": 3550
},
{
"epoch": 7.5799573560767595,
"grad_norm": 0.24433985352516174,
"learning_rate": 0.00018613362615958905,
"loss": 0.3455,
"step": 3555
},
{
"epoch": 7.590618336886994,
"grad_norm": 0.2719508111476898,
"learning_rate": 0.0001860705400893745,
"loss": 0.3414,
"step": 3560
},
{
"epoch": 7.601279317697228,
"grad_norm": 0.2666242718696594,
"learning_rate": 0.00018600732157859863,
"loss": 0.3384,
"step": 3565
},
{
"epoch": 7.611940298507463,
"grad_norm": 0.24249517917633057,
"learning_rate": 0.00018594397072453856,
"loss": 0.339,
"step": 3570
},
{
"epoch": 7.622601279317697,
"grad_norm": 0.2475687712430954,
"learning_rate": 0.00018588048762467502,
"loss": 0.3428,
"step": 3575
},
{
"epoch": 7.633262260127932,
"grad_norm": 0.2500527799129486,
"learning_rate": 0.00018581687237669234,
"loss": 0.3332,
"step": 3580
},
{
"epoch": 7.643923240938166,
"grad_norm": 0.2528587281703949,
"learning_rate": 0.0001857531250784781,
"loss": 0.3429,
"step": 3585
},
{
"epoch": 7.654584221748401,
"grad_norm": 0.2627830505371094,
"learning_rate": 0.0001856892458281231,
"loss": 0.3396,
"step": 3590
},
{
"epoch": 7.665245202558635,
"grad_norm": 0.2573624849319458,
"learning_rate": 0.00018562523472392118,
"loss": 0.3391,
"step": 3595
},
{
"epoch": 7.67590618336887,
"grad_norm": 0.2411065399646759,
"learning_rate": 0.0001855610918643691,
"loss": 0.3384,
"step": 3600
},
{
"epoch": 7.686567164179104,
"grad_norm": 0.2589527666568756,
"learning_rate": 0.00018549681734816623,
"loss": 0.3429,
"step": 3605
},
{
"epoch": 7.697228144989339,
"grad_norm": 0.2436107099056244,
"learning_rate": 0.00018543241127421474,
"loss": 0.3435,
"step": 3610
},
{
"epoch": 7.707889125799573,
"grad_norm": 0.272020161151886,
"learning_rate": 0.00018536787374161902,
"loss": 0.3418,
"step": 3615
},
{
"epoch": 7.718550106609808,
"grad_norm": 0.26080530881881714,
"learning_rate": 0.00018530320484968588,
"loss": 0.3367,
"step": 3620
},
{
"epoch": 7.729211087420042,
"grad_norm": 0.2503691613674164,
"learning_rate": 0.0001852384046979242,
"loss": 0.3367,
"step": 3625
},
{
"epoch": 7.7398720682302775,
"grad_norm": 0.26822352409362793,
"learning_rate": 0.0001851734733860449,
"loss": 0.3498,
"step": 3630
},
{
"epoch": 7.750533049040512,
"grad_norm": 0.28552523255348206,
"learning_rate": 0.00018510841101396062,
"loss": 0.3406,
"step": 3635
},
{
"epoch": 7.7611940298507465,
"grad_norm": 0.2446276843547821,
"learning_rate": 0.0001850432176817857,
"loss": 0.3465,
"step": 3640
},
{
"epoch": 7.771855010660981,
"grad_norm": 0.24052871763706207,
"learning_rate": 0.00018497789348983606,
"loss": 0.3434,
"step": 3645
},
{
"epoch": 7.782515991471215,
"grad_norm": 0.23899152874946594,
"learning_rate": 0.00018491243853862893,
"loss": 0.3365,
"step": 3650
},
{
"epoch": 7.79317697228145,
"grad_norm": 0.24732346832752228,
"learning_rate": 0.00018484685292888278,
"loss": 0.3382,
"step": 3655
},
{
"epoch": 7.803837953091684,
"grad_norm": 0.2519215941429138,
"learning_rate": 0.00018478113676151703,
"loss": 0.3463,
"step": 3660
},
{
"epoch": 7.814498933901919,
"grad_norm": 0.24091705679893494,
"learning_rate": 0.00018471529013765209,
"loss": 0.3404,
"step": 3665
},
{
"epoch": 7.825159914712153,
"grad_norm": 0.2794884443283081,
"learning_rate": 0.0001846493131586091,
"loss": 0.3469,
"step": 3670
},
{
"epoch": 7.835820895522388,
"grad_norm": 0.24296560883522034,
"learning_rate": 0.00018458320592590975,
"loss": 0.3434,
"step": 3675
},
{
"epoch": 7.846481876332622,
"grad_norm": 0.24800756573677063,
"learning_rate": 0.00018451696854127617,
"loss": 0.3384,
"step": 3680
},
{
"epoch": 7.857142857142857,
"grad_norm": 0.2350349873304367,
"learning_rate": 0.0001844506011066308,
"loss": 0.3428,
"step": 3685
},
{
"epoch": 7.867803837953092,
"grad_norm": 0.2573322355747223,
"learning_rate": 0.0001843841037240961,
"loss": 0.3463,
"step": 3690
},
{
"epoch": 7.878464818763327,
"grad_norm": 0.256381630897522,
"learning_rate": 0.00018431747649599463,
"loss": 0.3397,
"step": 3695
},
{
"epoch": 7.889125799573561,
"grad_norm": 0.23707297444343567,
"learning_rate": 0.0001842507195248486,
"loss": 0.3437,
"step": 3700
},
{
"epoch": 7.899786780383796,
"grad_norm": 0.24699944257736206,
"learning_rate": 0.00018418383291337988,
"loss": 0.3398,
"step": 3705
},
{
"epoch": 7.91044776119403,
"grad_norm": 0.25237977504730225,
"learning_rate": 0.00018411681676450999,
"loss": 0.3409,
"step": 3710
},
{
"epoch": 7.9211087420042645,
"grad_norm": 0.2656485438346863,
"learning_rate": 0.00018404967118135955,
"loss": 0.3487,
"step": 3715
},
{
"epoch": 7.931769722814499,
"grad_norm": 0.23709309101104736,
"learning_rate": 0.0001839823962672485,
"loss": 0.3398,
"step": 3720
},
{
"epoch": 7.9424307036247335,
"grad_norm": 0.24946698546409607,
"learning_rate": 0.00018391499212569573,
"loss": 0.3459,
"step": 3725
},
{
"epoch": 7.953091684434968,
"grad_norm": 0.2608436346054077,
"learning_rate": 0.00018384745886041898,
"loss": 0.3394,
"step": 3730
},
{
"epoch": 7.963752665245202,
"grad_norm": 0.2503463625907898,
"learning_rate": 0.00018377979657533468,
"loss": 0.3436,
"step": 3735
},
{
"epoch": 7.974413646055437,
"grad_norm": 0.2556673586368561,
"learning_rate": 0.0001837120053745578,
"loss": 0.3519,
"step": 3740
},
{
"epoch": 7.985074626865671,
"grad_norm": 0.24612018465995789,
"learning_rate": 0.0001836440853624017,
"loss": 0.3388,
"step": 3745
},
{
"epoch": 7.995735607675906,
"grad_norm": 0.26963427662849426,
"learning_rate": 0.00018357603664337786,
"loss": 0.3403,
"step": 3750
},
{
"epoch": 8.0,
"eval_loss": 0.5337910056114197,
"eval_runtime": 377.6371,
"eval_samples_per_second": 1.091,
"eval_steps_per_second": 1.091,
"step": 3752
},
{
"epoch": 8.00639658848614,
"grad_norm": 0.2208224982023239,
"learning_rate": 0.00018350785932219588,
"loss": 0.3081,
"step": 3755
},
{
"epoch": 8.017057569296375,
"grad_norm": 0.30632683634757996,
"learning_rate": 0.00018343955350376325,
"loss": 0.2978,
"step": 3760
},
{
"epoch": 8.02771855010661,
"grad_norm": 0.25390052795410156,
"learning_rate": 0.00018337111929318516,
"loss": 0.2948,
"step": 3765
},
{
"epoch": 8.038379530916844,
"grad_norm": 0.296369731426239,
"learning_rate": 0.00018330255679576438,
"loss": 0.2963,
"step": 3770
},
{
"epoch": 8.049040511727078,
"grad_norm": 0.2958175837993622,
"learning_rate": 0.00018323386611700105,
"loss": 0.2905,
"step": 3775
},
{
"epoch": 8.059701492537313,
"grad_norm": 0.2595365047454834,
"learning_rate": 0.00018316504736259255,
"loss": 0.2918,
"step": 3780
},
{
"epoch": 8.070362473347547,
"grad_norm": 0.2825353145599365,
"learning_rate": 0.00018309610063843337,
"loss": 0.3,
"step": 3785
},
{
"epoch": 8.081023454157782,
"grad_norm": 0.2677433490753174,
"learning_rate": 0.00018302702605061492,
"loss": 0.2964,
"step": 3790
},
{
"epoch": 8.091684434968018,
"grad_norm": 0.28075000643730164,
"learning_rate": 0.00018295782370542532,
"loss": 0.2979,
"step": 3795
},
{
"epoch": 8.102345415778252,
"grad_norm": 0.2629709243774414,
"learning_rate": 0.00018288849370934926,
"loss": 0.3005,
"step": 3800
},
{
"epoch": 8.113006396588487,
"grad_norm": 0.2850215435028076,
"learning_rate": 0.00018281903616906796,
"loss": 0.2976,
"step": 3805
},
{
"epoch": 8.123667377398721,
"grad_norm": 0.29631924629211426,
"learning_rate": 0.0001827494511914587,
"loss": 0.2938,
"step": 3810
},
{
"epoch": 8.134328358208956,
"grad_norm": 0.26315709948539734,
"learning_rate": 0.00018267973888359509,
"loss": 0.3021,
"step": 3815
},
{
"epoch": 8.14498933901919,
"grad_norm": 0.30577051639556885,
"learning_rate": 0.0001826098993527465,
"loss": 0.2996,
"step": 3820
},
{
"epoch": 8.155650319829425,
"grad_norm": 0.2897678315639496,
"learning_rate": 0.0001825399327063781,
"loss": 0.3048,
"step": 3825
},
{
"epoch": 8.16631130063966,
"grad_norm": 0.3003354072570801,
"learning_rate": 0.00018246983905215075,
"loss": 0.3075,
"step": 3830
},
{
"epoch": 8.176972281449894,
"grad_norm": 0.28864815831184387,
"learning_rate": 0.00018239961849792055,
"loss": 0.3091,
"step": 3835
},
{
"epoch": 8.187633262260128,
"grad_norm": 0.28102535009384155,
"learning_rate": 0.0001823292711517391,
"loss": 0.2969,
"step": 3840
},
{
"epoch": 8.198294243070363,
"grad_norm": 0.2669455409049988,
"learning_rate": 0.00018225879712185293,
"loss": 0.3061,
"step": 3845
},
{
"epoch": 8.208955223880597,
"grad_norm": 0.2893795669078827,
"learning_rate": 0.00018218819651670356,
"loss": 0.3003,
"step": 3850
},
{
"epoch": 8.219616204690832,
"grad_norm": 0.31041857600212097,
"learning_rate": 0.00018211746944492727,
"loss": 0.3069,
"step": 3855
},
{
"epoch": 8.230277185501066,
"grad_norm": 0.2678110599517822,
"learning_rate": 0.000182046616015355,
"loss": 0.3023,
"step": 3860
},
{
"epoch": 8.2409381663113,
"grad_norm": 0.3051944375038147,
"learning_rate": 0.00018197563633701196,
"loss": 0.3095,
"step": 3865
},
{
"epoch": 8.251599147121535,
"grad_norm": 0.267646461725235,
"learning_rate": 0.00018190453051911782,
"loss": 0.3047,
"step": 3870
},
{
"epoch": 8.26226012793177,
"grad_norm": 0.27988821268081665,
"learning_rate": 0.00018183329867108624,
"loss": 0.3132,
"step": 3875
},
{
"epoch": 8.272921108742004,
"grad_norm": 0.293363094329834,
"learning_rate": 0.0001817619409025248,
"loss": 0.3054,
"step": 3880
},
{
"epoch": 8.283582089552239,
"grad_norm": 0.28679507970809937,
"learning_rate": 0.00018169045732323492,
"loss": 0.3049,
"step": 3885
},
{
"epoch": 8.294243070362473,
"grad_norm": 0.28792116045951843,
"learning_rate": 0.0001816188480432115,
"loss": 0.3112,
"step": 3890
},
{
"epoch": 8.304904051172707,
"grad_norm": 0.2938394844532013,
"learning_rate": 0.00018154711317264297,
"loss": 0.3101,
"step": 3895
},
{
"epoch": 8.315565031982942,
"grad_norm": 0.2776646316051483,
"learning_rate": 0.00018147525282191093,
"loss": 0.3046,
"step": 3900
},
{
"epoch": 8.326226012793176,
"grad_norm": 0.2619486153125763,
"learning_rate": 0.00018140326710159007,
"loss": 0.3066,
"step": 3905
},
{
"epoch": 8.336886993603411,
"grad_norm": 0.2895703911781311,
"learning_rate": 0.00018133115612244807,
"loss": 0.3122,
"step": 3910
},
{
"epoch": 8.347547974413645,
"grad_norm": 0.2928364872932434,
"learning_rate": 0.00018125891999544525,
"loss": 0.303,
"step": 3915
},
{
"epoch": 8.35820895522388,
"grad_norm": 0.27352485060691833,
"learning_rate": 0.00018118655883173456,
"loss": 0.301,
"step": 3920
},
{
"epoch": 8.368869936034114,
"grad_norm": 0.3004440665245056,
"learning_rate": 0.00018111407274266136,
"loss": 0.3084,
"step": 3925
},
{
"epoch": 8.379530916844349,
"grad_norm": 0.26515400409698486,
"learning_rate": 0.00018104146183976316,
"loss": 0.3052,
"step": 3930
},
{
"epoch": 8.390191897654585,
"grad_norm": 0.29159972071647644,
"learning_rate": 0.00018096872623476963,
"loss": 0.3018,
"step": 3935
},
{
"epoch": 8.40085287846482,
"grad_norm": 0.31077924370765686,
"learning_rate": 0.00018089586603960224,
"loss": 0.3139,
"step": 3940
},
{
"epoch": 8.411513859275054,
"grad_norm": 0.2826644480228424,
"learning_rate": 0.00018082288136637422,
"loss": 0.2955,
"step": 3945
},
{
"epoch": 8.422174840085288,
"grad_norm": 0.2825087308883667,
"learning_rate": 0.00018074977232739031,
"loss": 0.3127,
"step": 3950
},
{
"epoch": 8.432835820895523,
"grad_norm": 0.2901898920536041,
"learning_rate": 0.0001806765390351467,
"loss": 0.3099,
"step": 3955
},
{
"epoch": 8.443496801705757,
"grad_norm": 0.28308314085006714,
"learning_rate": 0.00018060318160233063,
"loss": 0.3122,
"step": 3960
},
{
"epoch": 8.454157782515992,
"grad_norm": 0.26890453696250916,
"learning_rate": 0.00018052970014182046,
"loss": 0.3156,
"step": 3965
},
{
"epoch": 8.464818763326226,
"grad_norm": 0.2962822914123535,
"learning_rate": 0.00018045609476668545,
"loss": 0.3184,
"step": 3970
},
{
"epoch": 8.47547974413646,
"grad_norm": 0.2848854959011078,
"learning_rate": 0.00018038236559018533,
"loss": 0.309,
"step": 3975
},
{
"epoch": 8.486140724946695,
"grad_norm": 0.3047114312648773,
"learning_rate": 0.00018030851272577051,
"loss": 0.3118,
"step": 3980
},
{
"epoch": 8.49680170575693,
"grad_norm": 0.28175976872444153,
"learning_rate": 0.00018023453628708173,
"loss": 0.3074,
"step": 3985
},
{
"epoch": 8.507462686567164,
"grad_norm": 0.27742594480514526,
"learning_rate": 0.00018016043638794974,
"loss": 0.3127,
"step": 3990
},
{
"epoch": 8.518123667377399,
"grad_norm": 0.28773581981658936,
"learning_rate": 0.0001800862131423954,
"loss": 0.3057,
"step": 3995
},
{
"epoch": 8.528784648187633,
"grad_norm": 0.2765009105205536,
"learning_rate": 0.00018001186666462927,
"loss": 0.3128,
"step": 4000
},
{
"epoch": 8.539445628997868,
"grad_norm": 0.2800111174583435,
"learning_rate": 0.00017993739706905162,
"loss": 0.3096,
"step": 4005
},
{
"epoch": 8.550106609808102,
"grad_norm": 0.30302369594573975,
"learning_rate": 0.00017986280447025209,
"loss": 0.3016,
"step": 4010
},
{
"epoch": 8.560767590618337,
"grad_norm": 0.2798007130622864,
"learning_rate": 0.0001797880889830096,
"loss": 0.3061,
"step": 4015
},
{
"epoch": 8.571428571428571,
"grad_norm": 0.29015523195266724,
"learning_rate": 0.00017971325072229226,
"loss": 0.3134,
"step": 4020
},
{
"epoch": 8.582089552238806,
"grad_norm": 0.3815457820892334,
"learning_rate": 0.00017963828980325697,
"loss": 0.3131,
"step": 4025
},
{
"epoch": 8.59275053304904,
"grad_norm": 0.2907319664955139,
"learning_rate": 0.00017956320634124944,
"loss": 0.314,
"step": 4030
},
{
"epoch": 8.603411513859275,
"grad_norm": 0.29612481594085693,
"learning_rate": 0.00017948800045180393,
"loss": 0.3168,
"step": 4035
},
{
"epoch": 8.614072494669509,
"grad_norm": 0.2797704339027405,
"learning_rate": 0.00017941267225064306,
"loss": 0.3144,
"step": 4040
},
{
"epoch": 8.624733475479744,
"grad_norm": 0.27811723947525024,
"learning_rate": 0.00017933722185367774,
"loss": 0.303,
"step": 4045
},
{
"epoch": 8.635394456289978,
"grad_norm": 0.2933618724346161,
"learning_rate": 0.00017926164937700676,
"loss": 0.3097,
"step": 4050
},
{
"epoch": 8.646055437100213,
"grad_norm": 0.282921701669693,
"learning_rate": 0.0001791859549369169,
"loss": 0.3104,
"step": 4055
},
{
"epoch": 8.656716417910447,
"grad_norm": 0.2758900225162506,
"learning_rate": 0.00017911013864988252,
"loss": 0.3108,
"step": 4060
},
{
"epoch": 8.667377398720681,
"grad_norm": 0.2904449999332428,
"learning_rate": 0.00017903420063256555,
"loss": 0.3209,
"step": 4065
},
{
"epoch": 8.678038379530918,
"grad_norm": 0.28849634528160095,
"learning_rate": 0.00017895814100181515,
"loss": 0.3055,
"step": 4070
},
{
"epoch": 8.688699360341152,
"grad_norm": 0.2709294259548187,
"learning_rate": 0.0001788819598746677,
"loss": 0.3167,
"step": 4075
},
{
"epoch": 8.699360341151387,
"grad_norm": 0.28200262784957886,
"learning_rate": 0.0001788056573683464,
"loss": 0.307,
"step": 4080
},
{
"epoch": 8.710021321961621,
"grad_norm": 0.27431854605674744,
"learning_rate": 0.00017872923360026137,
"loss": 0.3163,
"step": 4085
},
{
"epoch": 8.720682302771856,
"grad_norm": 0.28479164838790894,
"learning_rate": 0.00017865268868800925,
"loss": 0.3257,
"step": 4090
},
{
"epoch": 8.73134328358209,
"grad_norm": 0.2959545850753784,
"learning_rate": 0.00017857602274937308,
"loss": 0.3138,
"step": 4095
},
{
"epoch": 8.742004264392325,
"grad_norm": 0.270533949136734,
"learning_rate": 0.00017849923590232213,
"loss": 0.3182,
"step": 4100
},
{
"epoch": 8.752665245202559,
"grad_norm": 0.26438501477241516,
"learning_rate": 0.0001784223282650118,
"loss": 0.3084,
"step": 4105
},
{
"epoch": 8.763326226012794,
"grad_norm": 0.2890710234642029,
"learning_rate": 0.00017834529995578317,
"loss": 0.3093,
"step": 4110
},
{
"epoch": 8.773987206823028,
"grad_norm": 0.2725368142127991,
"learning_rate": 0.0001782681510931632,
"loss": 0.3185,
"step": 4115
},
{
"epoch": 8.784648187633262,
"grad_norm": 0.2648097276687622,
"learning_rate": 0.00017819088179586427,
"loss": 0.3126,
"step": 4120
},
{
"epoch": 8.795309168443497,
"grad_norm": 0.27868813276290894,
"learning_rate": 0.00017811349218278407,
"loss": 0.3157,
"step": 4125
},
{
"epoch": 8.805970149253731,
"grad_norm": 0.3133993446826935,
"learning_rate": 0.00017803598237300537,
"loss": 0.3128,
"step": 4130
},
{
"epoch": 8.816631130063966,
"grad_norm": 0.270416796207428,
"learning_rate": 0.00017795835248579606,
"loss": 0.3087,
"step": 4135
},
{
"epoch": 8.8272921108742,
"grad_norm": 0.299452543258667,
"learning_rate": 0.00017788060264060864,
"loss": 0.3126,
"step": 4140
},
{
"epoch": 8.837953091684435,
"grad_norm": 0.2789115607738495,
"learning_rate": 0.00017780273295708025,
"loss": 0.3149,
"step": 4145
},
{
"epoch": 8.84861407249467,
"grad_norm": 0.2616700828075409,
"learning_rate": 0.0001777247435550324,
"loss": 0.3151,
"step": 4150
},
{
"epoch": 8.859275053304904,
"grad_norm": 0.2998231053352356,
"learning_rate": 0.0001776466345544709,
"loss": 0.3143,
"step": 4155
},
{
"epoch": 8.869936034115138,
"grad_norm": 0.2851693034172058,
"learning_rate": 0.00017756840607558553,
"loss": 0.3153,
"step": 4160
},
{
"epoch": 8.880597014925373,
"grad_norm": 0.2862933874130249,
"learning_rate": 0.00017749005823874988,
"loss": 0.3124,
"step": 4165
},
{
"epoch": 8.891257995735607,
"grad_norm": 0.29242345690727234,
"learning_rate": 0.00017741159116452132,
"loss": 0.3137,
"step": 4170
},
{
"epoch": 8.901918976545842,
"grad_norm": 0.3226570188999176,
"learning_rate": 0.00017733300497364054,
"loss": 0.3168,
"step": 4175
},
{
"epoch": 8.912579957356076,
"grad_norm": 0.31018882989883423,
"learning_rate": 0.00017725429978703163,
"loss": 0.3162,
"step": 4180
},
{
"epoch": 8.92324093816631,
"grad_norm": 0.30581411719322205,
"learning_rate": 0.00017717547572580178,
"loss": 0.3166,
"step": 4185
},
{
"epoch": 8.933901918976545,
"grad_norm": 0.27954214811325073,
"learning_rate": 0.00017709653291124103,
"loss": 0.3175,
"step": 4190
},
{
"epoch": 8.94456289978678,
"grad_norm": 0.2803252041339874,
"learning_rate": 0.00017701747146482222,
"loss": 0.3228,
"step": 4195
},
{
"epoch": 8.955223880597014,
"grad_norm": 0.27694806456565857,
"learning_rate": 0.00017693829150820068,
"loss": 0.3152,
"step": 4200
},
{
"epoch": 8.96588486140725,
"grad_norm": 0.2755722403526306,
"learning_rate": 0.00017685899316321422,
"loss": 0.3105,
"step": 4205
},
{
"epoch": 8.976545842217483,
"grad_norm": 0.26287201046943665,
"learning_rate": 0.00017677957655188258,
"loss": 0.3146,
"step": 4210
},
{
"epoch": 8.98720682302772,
"grad_norm": 0.2679538428783417,
"learning_rate": 0.00017670004179640774,
"loss": 0.3196,
"step": 4215
},
{
"epoch": 8.997867803837954,
"grad_norm": 0.2998240292072296,
"learning_rate": 0.0001766203890191733,
"loss": 0.311,
"step": 4220
},
{
"epoch": 9.0,
"eval_loss": 0.556614875793457,
"eval_runtime": 377.56,
"eval_samples_per_second": 1.091,
"eval_steps_per_second": 1.091,
"step": 4221
},
{
"epoch": 9.008528784648188,
"grad_norm": 0.2680657207965851,
"learning_rate": 0.00017654061834274453,
"loss": 0.2787,
"step": 4225
},
{
"epoch": 9.019189765458423,
"grad_norm": 0.28186333179473877,
"learning_rate": 0.00017646072988986816,
"loss": 0.2668,
"step": 4230
},
{
"epoch": 9.029850746268657,
"grad_norm": 0.3159712255001068,
"learning_rate": 0.00017638072378347203,
"loss": 0.2681,
"step": 4235
},
{
"epoch": 9.040511727078892,
"grad_norm": 0.29439476132392883,
"learning_rate": 0.00017630060014666514,
"loss": 0.2644,
"step": 4240
},
{
"epoch": 9.051172707889126,
"grad_norm": 0.27110064029693604,
"learning_rate": 0.00017622035910273726,
"loss": 0.2645,
"step": 4245
},
{
"epoch": 9.06183368869936,
"grad_norm": 0.3253141939640045,
"learning_rate": 0.00017614000077515886,
"loss": 0.2668,
"step": 4250
},
{
"epoch": 9.072494669509595,
"grad_norm": 0.27271440625190735,
"learning_rate": 0.00017605952528758085,
"loss": 0.2636,
"step": 4255
},
{
"epoch": 9.08315565031983,
"grad_norm": 0.3024181127548218,
"learning_rate": 0.00017597893276383446,
"loss": 0.2651,
"step": 4260
},
{
"epoch": 9.093816631130064,
"grad_norm": 0.29704058170318604,
"learning_rate": 0.00017589822332793098,
"loss": 0.2705,
"step": 4265
},
{
"epoch": 9.104477611940299,
"grad_norm": 0.3102332055568695,
"learning_rate": 0.0001758173971040616,
"loss": 0.2645,
"step": 4270
},
{
"epoch": 9.115138592750533,
"grad_norm": 0.28398755192756653,
"learning_rate": 0.00017573645421659715,
"loss": 0.2695,
"step": 4275
},
{
"epoch": 9.125799573560768,
"grad_norm": 0.3188519775867462,
"learning_rate": 0.00017565539479008814,
"loss": 0.272,
"step": 4280
},
{
"epoch": 9.136460554371002,
"grad_norm": 0.30803632736206055,
"learning_rate": 0.0001755742189492643,
"loss": 0.268,
"step": 4285
},
{
"epoch": 9.147121535181236,
"grad_norm": 0.3042227327823639,
"learning_rate": 0.00017549292681903444,
"loss": 0.2659,
"step": 4290
},
{
"epoch": 9.157782515991471,
"grad_norm": 0.3055075407028198,
"learning_rate": 0.00017541151852448644,
"loss": 0.2705,
"step": 4295
},
{
"epoch": 9.168443496801705,
"grad_norm": 0.3084838092327118,
"learning_rate": 0.00017532999419088682,
"loss": 0.2711,
"step": 4300
},
{
"epoch": 9.17910447761194,
"grad_norm": 0.3110904395580292,
"learning_rate": 0.00017524835394368065,
"loss": 0.2678,
"step": 4305
},
{
"epoch": 9.189765458422174,
"grad_norm": 0.3138080835342407,
"learning_rate": 0.0001751665979084915,
"loss": 0.2715,
"step": 4310
},
{
"epoch": 9.200426439232409,
"grad_norm": 0.2787773609161377,
"learning_rate": 0.00017508472621112093,
"loss": 0.2764,
"step": 4315
},
{
"epoch": 9.211087420042643,
"grad_norm": 0.31073546409606934,
"learning_rate": 0.0001750027389775486,
"loss": 0.2745,
"step": 4320
},
{
"epoch": 9.221748400852878,
"grad_norm": 0.3100415766239166,
"learning_rate": 0.00017492063633393188,
"loss": 0.2731,
"step": 4325
},
{
"epoch": 9.232409381663112,
"grad_norm": 0.300081342458725,
"learning_rate": 0.00017483841840660577,
"loss": 0.2711,
"step": 4330
},
{
"epoch": 9.243070362473347,
"grad_norm": 0.31163203716278076,
"learning_rate": 0.0001747560853220826,
"loss": 0.2786,
"step": 4335
},
{
"epoch": 9.253731343283581,
"grad_norm": 0.33607375621795654,
"learning_rate": 0.00017467363720705204,
"loss": 0.2728,
"step": 4340
},
{
"epoch": 9.264392324093816,
"grad_norm": 0.300729900598526,
"learning_rate": 0.0001745910741883806,
"loss": 0.2749,
"step": 4345
},
{
"epoch": 9.275053304904052,
"grad_norm": 0.3036794364452362,
"learning_rate": 0.00017450839639311162,
"loss": 0.2726,
"step": 4350
},
{
"epoch": 9.285714285714286,
"grad_norm": 0.32798221707344055,
"learning_rate": 0.00017442560394846516,
"loss": 0.2752,
"step": 4355
},
{
"epoch": 9.296375266524521,
"grad_norm": 0.2973875105381012,
"learning_rate": 0.00017434269698183763,
"loss": 0.2743,
"step": 4360
},
{
"epoch": 9.307036247334755,
"grad_norm": 0.3339863717556,
"learning_rate": 0.00017425967562080167,
"loss": 0.2766,
"step": 4365
},
{
"epoch": 9.31769722814499,
"grad_norm": 0.30738508701324463,
"learning_rate": 0.00017417653999310585,
"loss": 0.2728,
"step": 4370
},
{
"epoch": 9.328358208955224,
"grad_norm": 0.3430582284927368,
"learning_rate": 0.0001740932902266747,
"loss": 0.2744,
"step": 4375
},
{
"epoch": 9.339019189765459,
"grad_norm": 0.2887689769268036,
"learning_rate": 0.00017400992644960842,
"loss": 0.2772,
"step": 4380
},
{
"epoch": 9.349680170575693,
"grad_norm": 0.3249075412750244,
"learning_rate": 0.0001739264487901824,
"loss": 0.2757,
"step": 4385
},
{
"epoch": 9.360341151385928,
"grad_norm": 0.31958818435668945,
"learning_rate": 0.00017384285737684753,
"loss": 0.2744,
"step": 4390
},
{
"epoch": 9.371002132196162,
"grad_norm": 0.31824401021003723,
"learning_rate": 0.0001737591523382296,
"loss": 0.2809,
"step": 4395
},
{
"epoch": 9.381663113006397,
"grad_norm": 0.3125913143157959,
"learning_rate": 0.00017367533380312924,
"loss": 0.276,
"step": 4400
},
{
"epoch": 9.392324093816631,
"grad_norm": 0.32215094566345215,
"learning_rate": 0.0001735914019005218,
"loss": 0.2746,
"step": 4405
},
{
"epoch": 9.402985074626866,
"grad_norm": 0.3145129382610321,
"learning_rate": 0.00017350735675955697,
"loss": 0.2818,
"step": 4410
},
{
"epoch": 9.4136460554371,
"grad_norm": 0.3180083930492401,
"learning_rate": 0.0001734231985095588,
"loss": 0.2782,
"step": 4415
},
{
"epoch": 9.424307036247335,
"grad_norm": 0.307829350233078,
"learning_rate": 0.00017333892728002527,
"loss": 0.2744,
"step": 4420
},
{
"epoch": 9.43496801705757,
"grad_norm": 0.3098660111427307,
"learning_rate": 0.00017325454320062832,
"loss": 0.2794,
"step": 4425
},
{
"epoch": 9.445628997867804,
"grad_norm": 0.2991037666797638,
"learning_rate": 0.0001731700464012134,
"loss": 0.2778,
"step": 4430
},
{
"epoch": 9.456289978678038,
"grad_norm": 0.3197588622570038,
"learning_rate": 0.0001730854370117996,
"loss": 0.2764,
"step": 4435
},
{
"epoch": 9.466950959488273,
"grad_norm": 0.31818678975105286,
"learning_rate": 0.00017300071516257904,
"loss": 0.2754,
"step": 4440
},
{
"epoch": 9.477611940298507,
"grad_norm": 0.3030422031879425,
"learning_rate": 0.000172915880983917,
"loss": 0.2795,
"step": 4445
},
{
"epoch": 9.488272921108742,
"grad_norm": 0.304565966129303,
"learning_rate": 0.00017283093460635166,
"loss": 0.2837,
"step": 4450
},
{
"epoch": 9.498933901918976,
"grad_norm": 0.3034186363220215,
"learning_rate": 0.00017274587616059376,
"loss": 0.2768,
"step": 4455
},
{
"epoch": 9.50959488272921,
"grad_norm": 0.30095112323760986,
"learning_rate": 0.00017266070577752647,
"loss": 0.2786,
"step": 4460
},
{
"epoch": 9.520255863539445,
"grad_norm": 0.3102254271507263,
"learning_rate": 0.0001725754235882053,
"loss": 0.2776,
"step": 4465
},
{
"epoch": 9.53091684434968,
"grad_norm": 0.2985278367996216,
"learning_rate": 0.00017249002972385765,
"loss": 0.2784,
"step": 4470
},
{
"epoch": 9.541577825159914,
"grad_norm": 0.32831713557243347,
"learning_rate": 0.00017240452431588294,
"loss": 0.2869,
"step": 4475
},
{
"epoch": 9.552238805970148,
"grad_norm": 0.3177868127822876,
"learning_rate": 0.0001723189074958521,
"loss": 0.2784,
"step": 4480
},
{
"epoch": 9.562899786780385,
"grad_norm": 0.3071228265762329,
"learning_rate": 0.00017223317939550753,
"loss": 0.2804,
"step": 4485
},
{
"epoch": 9.57356076759062,
"grad_norm": 0.3183000981807709,
"learning_rate": 0.00017214734014676288,
"loss": 0.2799,
"step": 4490
},
{
"epoch": 9.584221748400854,
"grad_norm": 0.33166825771331787,
"learning_rate": 0.00017206138988170281,
"loss": 0.2828,
"step": 4495
},
{
"epoch": 9.594882729211088,
"grad_norm": 0.3132229149341583,
"learning_rate": 0.0001719753287325828,
"loss": 0.279,
"step": 4500
},
{
"epoch": 9.605543710021323,
"grad_norm": 0.3281535506248474,
"learning_rate": 0.00017188915683182896,
"loss": 0.2767,
"step": 4505
},
{
"epoch": 9.616204690831557,
"grad_norm": 0.31389063596725464,
"learning_rate": 0.00017180287431203781,
"loss": 0.2851,
"step": 4510
},
{
"epoch": 9.626865671641792,
"grad_norm": 0.315807580947876,
"learning_rate": 0.00017171648130597612,
"loss": 0.2816,
"step": 4515
},
{
"epoch": 9.637526652452026,
"grad_norm": 0.3103027939796448,
"learning_rate": 0.0001716299779465806,
"loss": 0.2797,
"step": 4520
},
{
"epoch": 9.64818763326226,
"grad_norm": 0.3018797039985657,
"learning_rate": 0.00017154336436695785,
"loss": 0.2827,
"step": 4525
},
{
"epoch": 9.658848614072495,
"grad_norm": 0.3306185007095337,
"learning_rate": 0.00017145664070038406,
"loss": 0.2861,
"step": 4530
},
{
"epoch": 9.66950959488273,
"grad_norm": 0.3151242434978485,
"learning_rate": 0.0001713698070803047,
"loss": 0.2855,
"step": 4535
},
{
"epoch": 9.680170575692964,
"grad_norm": 0.3073995113372803,
"learning_rate": 0.0001712828636403346,
"loss": 0.2825,
"step": 4540
},
{
"epoch": 9.690831556503198,
"grad_norm": 0.31615933775901794,
"learning_rate": 0.00017119581051425742,
"loss": 0.2791,
"step": 4545
},
{
"epoch": 9.701492537313433,
"grad_norm": 0.3101312816143036,
"learning_rate": 0.0001711086478360257,
"loss": 0.287,
"step": 4550
},
{
"epoch": 9.712153518123667,
"grad_norm": 0.3094468116760254,
"learning_rate": 0.00017102137573976058,
"loss": 0.2804,
"step": 4555
},
{
"epoch": 9.722814498933902,
"grad_norm": 0.33349186182022095,
"learning_rate": 0.00017093399435975142,
"loss": 0.2773,
"step": 4560
},
{
"epoch": 9.733475479744136,
"grad_norm": 0.2954055368900299,
"learning_rate": 0.00017084650383045587,
"loss": 0.2762,
"step": 4565
},
{
"epoch": 9.74413646055437,
"grad_norm": 0.2962237000465393,
"learning_rate": 0.0001707589042864995,
"loss": 0.2861,
"step": 4570
},
{
"epoch": 9.754797441364605,
"grad_norm": 0.3323478698730469,
"learning_rate": 0.00017067119586267556,
"loss": 0.2861,
"step": 4575
},
{
"epoch": 9.76545842217484,
"grad_norm": 0.2926410138607025,
"learning_rate": 0.000170583378693945,
"loss": 0.2817,
"step": 4580
},
{
"epoch": 9.776119402985074,
"grad_norm": 0.3227819502353668,
"learning_rate": 0.0001704954529154359,
"loss": 0.2884,
"step": 4585
},
{
"epoch": 9.786780383795309,
"grad_norm": 0.32089999318122864,
"learning_rate": 0.00017040741866244358,
"loss": 0.2881,
"step": 4590
},
{
"epoch": 9.797441364605543,
"grad_norm": 0.3188937306404114,
"learning_rate": 0.0001703192760704303,
"loss": 0.2855,
"step": 4595
},
{
"epoch": 9.808102345415778,
"grad_norm": 0.3184082508087158,
"learning_rate": 0.00017023102527502496,
"loss": 0.2842,
"step": 4600
},
{
"epoch": 9.818763326226012,
"grad_norm": 0.2914822995662689,
"learning_rate": 0.00017014266641202292,
"loss": 0.274,
"step": 4605
},
{
"epoch": 9.829424307036247,
"grad_norm": 0.33117881417274475,
"learning_rate": 0.00017005419961738593,
"loss": 0.2888,
"step": 4610
},
{
"epoch": 9.840085287846481,
"grad_norm": 0.32017573714256287,
"learning_rate": 0.0001699656250272418,
"loss": 0.2785,
"step": 4615
},
{
"epoch": 9.850746268656717,
"grad_norm": 0.29259586334228516,
"learning_rate": 0.00016987694277788417,
"loss": 0.2888,
"step": 4620
},
{
"epoch": 9.86140724946695,
"grad_norm": 0.29314401745796204,
"learning_rate": 0.00016978815300577234,
"loss": 0.2826,
"step": 4625
},
{
"epoch": 9.872068230277186,
"grad_norm": 0.3312009572982788,
"learning_rate": 0.00016969925584753108,
"loss": 0.2828,
"step": 4630
},
{
"epoch": 9.88272921108742,
"grad_norm": 0.31798672676086426,
"learning_rate": 0.00016961025143995037,
"loss": 0.2777,
"step": 4635
},
{
"epoch": 9.893390191897655,
"grad_norm": 0.2987801134586334,
"learning_rate": 0.00016952113991998527,
"loss": 0.2818,
"step": 4640
},
{
"epoch": 9.90405117270789,
"grad_norm": 0.3148316442966461,
"learning_rate": 0.00016943192142475564,
"loss": 0.2853,
"step": 4645
},
{
"epoch": 9.914712153518124,
"grad_norm": 0.3207818269729614,
"learning_rate": 0.00016934259609154592,
"loss": 0.2835,
"step": 4650
},
{
"epoch": 9.925373134328359,
"grad_norm": 0.29595887660980225,
"learning_rate": 0.000169253164057805,
"loss": 0.2845,
"step": 4655
},
{
"epoch": 9.936034115138593,
"grad_norm": 0.2958875894546509,
"learning_rate": 0.00016916362546114585,
"loss": 0.2793,
"step": 4660
},
{
"epoch": 9.946695095948828,
"grad_norm": 0.2999938726425171,
"learning_rate": 0.00016907398043934557,
"loss": 0.2794,
"step": 4665
},
{
"epoch": 9.957356076759062,
"grad_norm": 0.29154959321022034,
"learning_rate": 0.00016898422913034486,
"loss": 0.2891,
"step": 4670
},
{
"epoch": 9.968017057569297,
"grad_norm": 0.30298835039138794,
"learning_rate": 0.0001688943716722481,
"loss": 0.2859,
"step": 4675
},
{
"epoch": 9.978678038379531,
"grad_norm": 0.3251824975013733,
"learning_rate": 0.00016880440820332291,
"loss": 0.283,
"step": 4680
},
{
"epoch": 9.989339019189766,
"grad_norm": 0.29153597354888916,
"learning_rate": 0.0001687143388620001,
"loss": 0.2871,
"step": 4685
},
{
"epoch": 10.0,
"grad_norm": 0.3233014643192291,
"learning_rate": 0.0001686241637868734,
"loss": 0.2853,
"step": 4690
},
{
"epoch": 10.0,
"eval_loss": 0.5920408368110657,
"eval_runtime": 377.5422,
"eval_samples_per_second": 1.091,
"eval_steps_per_second": 1.091,
"step": 4690
},
{
"epoch": 10.0,
"step": 4690,
"total_flos": 3.4794514845867704e+18,
"train_loss": 0.46453510172077334,
"train_runtime": 112907.655,
"train_samples_per_second": 0.997,
"train_steps_per_second": 0.125
}
],
"logging_steps": 5,
"max_steps": 14070,
"num_input_tokens_seen": 0,
"num_train_epochs": 30,
"save_steps": 50,
"stateful_callbacks": {
"EarlyStoppingCallback": {
"args": {
"early_stopping_patience": 5,
"early_stopping_threshold": 0.0
},
"attributes": {
"early_stopping_patience_counter": 0
}
},
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": true
},
"attributes": {}
}
},
"total_flos": 3.4794514845867704e+18,
"train_batch_size": 1,
"trial_name": null,
"trial_params": null
}