ECGFTLlava / trainer_state.json
Geohunterr's picture
Upload 9 files
e29071a verified
{
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 3.0,
"eval_steps": 500,
"global_step": 3105,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.004830917874396135,
"grad_norm": 0.4590633809566498,
"learning_rate": 1.3986464711569449e-05,
"loss": 3.9553,
"step": 5
},
{
"epoch": 0.00966183574879227,
"grad_norm": 0.562667965888977,
"learning_rate": 1.396390589751853e-05,
"loss": 3.8462,
"step": 10
},
{
"epoch": 0.014492753623188406,
"grad_norm": 0.5178580284118652,
"learning_rate": 1.3941347083467613e-05,
"loss": 3.7563,
"step": 15
},
{
"epoch": 0.01932367149758454,
"grad_norm": 0.549095869064331,
"learning_rate": 1.3918788269416693e-05,
"loss": 3.5661,
"step": 20
},
{
"epoch": 0.024154589371980676,
"grad_norm": 0.5863974094390869,
"learning_rate": 1.3896229455365775e-05,
"loss": 3.3998,
"step": 25
},
{
"epoch": 0.028985507246376812,
"grad_norm": 0.6717987060546875,
"learning_rate": 1.3873670641314857e-05,
"loss": 3.2327,
"step": 30
},
{
"epoch": 0.033816425120772944,
"grad_norm": 0.6742504835128784,
"learning_rate": 1.3851111827263939e-05,
"loss": 3.0419,
"step": 35
},
{
"epoch": 0.03864734299516908,
"grad_norm": 0.7488217949867249,
"learning_rate": 1.382855301321302e-05,
"loss": 2.8092,
"step": 40
},
{
"epoch": 0.043478260869565216,
"grad_norm": 0.7981260418891907,
"learning_rate": 1.3805994199162101e-05,
"loss": 2.5637,
"step": 45
},
{
"epoch": 0.04830917874396135,
"grad_norm": 0.7837016582489014,
"learning_rate": 1.3783435385111182e-05,
"loss": 2.311,
"step": 50
},
{
"epoch": 0.05314009661835749,
"grad_norm": 0.8491142988204956,
"learning_rate": 1.3760876571060263e-05,
"loss": 2.0814,
"step": 55
},
{
"epoch": 0.057971014492753624,
"grad_norm": 0.7609491348266602,
"learning_rate": 1.3738317757009345e-05,
"loss": 1.7811,
"step": 60
},
{
"epoch": 0.06280193236714976,
"grad_norm": 0.7346836924552917,
"learning_rate": 1.3715758942958427e-05,
"loss": 1.5668,
"step": 65
},
{
"epoch": 0.06763285024154589,
"grad_norm": 0.7201610803604126,
"learning_rate": 1.369320012890751e-05,
"loss": 1.3152,
"step": 70
},
{
"epoch": 0.07246376811594203,
"grad_norm": 0.6400141716003418,
"learning_rate": 1.3670641314856591e-05,
"loss": 1.0742,
"step": 75
},
{
"epoch": 0.07729468599033816,
"grad_norm": 0.38841813802719116,
"learning_rate": 1.3648082500805672e-05,
"loss": 0.9316,
"step": 80
},
{
"epoch": 0.0821256038647343,
"grad_norm": 0.4104098677635193,
"learning_rate": 1.3625523686754754e-05,
"loss": 0.8594,
"step": 85
},
{
"epoch": 0.08695652173913043,
"grad_norm": 0.5033922791481018,
"learning_rate": 1.3602964872703834e-05,
"loss": 0.7873,
"step": 90
},
{
"epoch": 0.09178743961352658,
"grad_norm": 0.3223589360713959,
"learning_rate": 1.3580406058652916e-05,
"loss": 0.7265,
"step": 95
},
{
"epoch": 0.0966183574879227,
"grad_norm": 0.31837838888168335,
"learning_rate": 1.3557847244601998e-05,
"loss": 0.7056,
"step": 100
},
{
"epoch": 0.10144927536231885,
"grad_norm": 0.35547807812690735,
"learning_rate": 1.353528843055108e-05,
"loss": 0.6684,
"step": 105
},
{
"epoch": 0.10628019323671498,
"grad_norm": 0.3624265789985657,
"learning_rate": 1.351272961650016e-05,
"loss": 0.6424,
"step": 110
},
{
"epoch": 0.1111111111111111,
"grad_norm": 0.37934672832489014,
"learning_rate": 1.3490170802449242e-05,
"loss": 0.6473,
"step": 115
},
{
"epoch": 0.11594202898550725,
"grad_norm": 0.43373095989227295,
"learning_rate": 1.3467611988398324e-05,
"loss": 0.6108,
"step": 120
},
{
"epoch": 0.12077294685990338,
"grad_norm": 0.46837344765663147,
"learning_rate": 1.3445053174347406e-05,
"loss": 0.5718,
"step": 125
},
{
"epoch": 0.12560386473429952,
"grad_norm": 0.4655485153198242,
"learning_rate": 1.3422494360296488e-05,
"loss": 0.5618,
"step": 130
},
{
"epoch": 0.13043478260869565,
"grad_norm": 0.5438677072525024,
"learning_rate": 1.3399935546245569e-05,
"loss": 0.5834,
"step": 135
},
{
"epoch": 0.13526570048309178,
"grad_norm": 0.5986974239349365,
"learning_rate": 1.3377376732194649e-05,
"loss": 0.51,
"step": 140
},
{
"epoch": 0.14009661835748793,
"grad_norm": 0.7286536693572998,
"learning_rate": 1.3354817918143731e-05,
"loss": 0.4704,
"step": 145
},
{
"epoch": 0.14492753623188406,
"grad_norm": 0.9337557554244995,
"learning_rate": 1.3332259104092813e-05,
"loss": 0.4379,
"step": 150
},
{
"epoch": 0.1497584541062802,
"grad_norm": 1.0971410274505615,
"learning_rate": 1.3309700290041895e-05,
"loss": 0.3994,
"step": 155
},
{
"epoch": 0.15458937198067632,
"grad_norm": 1.5142974853515625,
"learning_rate": 1.3287141475990977e-05,
"loss": 0.3397,
"step": 160
},
{
"epoch": 0.15942028985507245,
"grad_norm": 0.5735320448875427,
"learning_rate": 1.3264582661940057e-05,
"loss": 0.3047,
"step": 165
},
{
"epoch": 0.1642512077294686,
"grad_norm": 0.31310656666755676,
"learning_rate": 1.324202384788914e-05,
"loss": 0.2761,
"step": 170
},
{
"epoch": 0.16908212560386474,
"grad_norm": 0.3159743845462799,
"learning_rate": 1.3219465033838221e-05,
"loss": 0.2584,
"step": 175
},
{
"epoch": 0.17391304347826086,
"grad_norm": 0.2747821509838104,
"learning_rate": 1.3196906219787303e-05,
"loss": 0.2696,
"step": 180
},
{
"epoch": 0.178743961352657,
"grad_norm": 0.26819175481796265,
"learning_rate": 1.3174347405736383e-05,
"loss": 0.2633,
"step": 185
},
{
"epoch": 0.18357487922705315,
"grad_norm": 0.27827367186546326,
"learning_rate": 1.3151788591685465e-05,
"loss": 0.2587,
"step": 190
},
{
"epoch": 0.18840579710144928,
"grad_norm": 0.30673256516456604,
"learning_rate": 1.3129229777634546e-05,
"loss": 0.2749,
"step": 195
},
{
"epoch": 0.1932367149758454,
"grad_norm": 0.28767552971839905,
"learning_rate": 1.3106670963583628e-05,
"loss": 0.2527,
"step": 200
},
{
"epoch": 0.19806763285024154,
"grad_norm": 0.2788391709327698,
"learning_rate": 1.308411214953271e-05,
"loss": 0.2548,
"step": 205
},
{
"epoch": 0.2028985507246377,
"grad_norm": 0.26774516701698303,
"learning_rate": 1.3061553335481792e-05,
"loss": 0.2426,
"step": 210
},
{
"epoch": 0.20772946859903382,
"grad_norm": 0.3280729651451111,
"learning_rate": 1.3038994521430874e-05,
"loss": 0.2343,
"step": 215
},
{
"epoch": 0.21256038647342995,
"grad_norm": 0.28210124373435974,
"learning_rate": 1.3016435707379956e-05,
"loss": 0.2385,
"step": 220
},
{
"epoch": 0.21739130434782608,
"grad_norm": 0.2706020176410675,
"learning_rate": 1.2993876893329036e-05,
"loss": 0.2418,
"step": 225
},
{
"epoch": 0.2222222222222222,
"grad_norm": 0.2814071476459503,
"learning_rate": 1.2971318079278118e-05,
"loss": 0.2309,
"step": 230
},
{
"epoch": 0.22705314009661837,
"grad_norm": 0.311310738325119,
"learning_rate": 1.2948759265227198e-05,
"loss": 0.239,
"step": 235
},
{
"epoch": 0.2318840579710145,
"grad_norm": 0.31521573662757874,
"learning_rate": 1.292620045117628e-05,
"loss": 0.2271,
"step": 240
},
{
"epoch": 0.23671497584541062,
"grad_norm": 0.3365338146686554,
"learning_rate": 1.2903641637125362e-05,
"loss": 0.2418,
"step": 245
},
{
"epoch": 0.24154589371980675,
"grad_norm": 0.32416385412216187,
"learning_rate": 1.2881082823074444e-05,
"loss": 0.2282,
"step": 250
},
{
"epoch": 0.2463768115942029,
"grad_norm": 0.3272862434387207,
"learning_rate": 1.2858524009023525e-05,
"loss": 0.2319,
"step": 255
},
{
"epoch": 0.25120772946859904,
"grad_norm": 0.34287795424461365,
"learning_rate": 1.2835965194972607e-05,
"loss": 0.2529,
"step": 260
},
{
"epoch": 0.2560386473429952,
"grad_norm": 0.338498055934906,
"learning_rate": 1.2813406380921689e-05,
"loss": 0.2216,
"step": 265
},
{
"epoch": 0.2608695652173913,
"grad_norm": 0.34228625893592834,
"learning_rate": 1.279084756687077e-05,
"loss": 0.2306,
"step": 270
},
{
"epoch": 0.26570048309178745,
"grad_norm": 0.38022157549858093,
"learning_rate": 1.2768288752819853e-05,
"loss": 0.2091,
"step": 275
},
{
"epoch": 0.27053140096618356,
"grad_norm": 0.35013625025749207,
"learning_rate": 1.2745729938768935e-05,
"loss": 0.2219,
"step": 280
},
{
"epoch": 0.2753623188405797,
"grad_norm": 0.3915255665779114,
"learning_rate": 1.2723171124718013e-05,
"loss": 0.202,
"step": 285
},
{
"epoch": 0.28019323671497587,
"grad_norm": 0.4278201758861542,
"learning_rate": 1.2700612310667095e-05,
"loss": 0.2223,
"step": 290
},
{
"epoch": 0.28502415458937197,
"grad_norm": 0.4377511441707611,
"learning_rate": 1.2678053496616177e-05,
"loss": 0.2001,
"step": 295
},
{
"epoch": 0.2898550724637681,
"grad_norm": 0.44731107354164124,
"learning_rate": 1.265549468256526e-05,
"loss": 0.1884,
"step": 300
},
{
"epoch": 0.2946859903381642,
"grad_norm": 0.4644255042076111,
"learning_rate": 1.2632935868514341e-05,
"loss": 0.195,
"step": 305
},
{
"epoch": 0.2995169082125604,
"grad_norm": 0.46685394644737244,
"learning_rate": 1.2610377054463423e-05,
"loss": 0.1867,
"step": 310
},
{
"epoch": 0.30434782608695654,
"grad_norm": 0.484323650598526,
"learning_rate": 1.2587818240412503e-05,
"loss": 0.1855,
"step": 315
},
{
"epoch": 0.30917874396135264,
"grad_norm": 0.4667232632637024,
"learning_rate": 1.2565259426361585e-05,
"loss": 0.1823,
"step": 320
},
{
"epoch": 0.3140096618357488,
"grad_norm": 0.5028926134109497,
"learning_rate": 1.2542700612310667e-05,
"loss": 0.1726,
"step": 325
},
{
"epoch": 0.3188405797101449,
"grad_norm": 0.5125951766967773,
"learning_rate": 1.252014179825975e-05,
"loss": 0.1709,
"step": 330
},
{
"epoch": 0.32367149758454106,
"grad_norm": 0.4960808753967285,
"learning_rate": 1.249758298420883e-05,
"loss": 0.1539,
"step": 335
},
{
"epoch": 0.3285024154589372,
"grad_norm": 0.42437031865119934,
"learning_rate": 1.2475024170157912e-05,
"loss": 0.1484,
"step": 340
},
{
"epoch": 0.3333333333333333,
"grad_norm": 0.32370465993881226,
"learning_rate": 1.2452465356106992e-05,
"loss": 0.128,
"step": 345
},
{
"epoch": 0.33816425120772947,
"grad_norm": 0.2940502166748047,
"learning_rate": 1.2429906542056074e-05,
"loss": 0.1152,
"step": 350
},
{
"epoch": 0.34299516908212563,
"grad_norm": 0.3140239715576172,
"learning_rate": 1.2407347728005156e-05,
"loss": 0.1466,
"step": 355
},
{
"epoch": 0.34782608695652173,
"grad_norm": 0.26790526509284973,
"learning_rate": 1.2384788913954238e-05,
"loss": 0.1148,
"step": 360
},
{
"epoch": 0.3526570048309179,
"grad_norm": 0.276149183511734,
"learning_rate": 1.236223009990332e-05,
"loss": 0.0996,
"step": 365
},
{
"epoch": 0.357487922705314,
"grad_norm": 0.24558521807193756,
"learning_rate": 1.2339671285852402e-05,
"loss": 0.105,
"step": 370
},
{
"epoch": 0.36231884057971014,
"grad_norm": 0.31819969415664673,
"learning_rate": 1.2317112471801482e-05,
"loss": 0.0968,
"step": 375
},
{
"epoch": 0.3671497584541063,
"grad_norm": 0.28777143359184265,
"learning_rate": 1.2294553657750564e-05,
"loss": 0.1029,
"step": 380
},
{
"epoch": 0.3719806763285024,
"grad_norm": 0.26321807503700256,
"learning_rate": 1.2271994843699645e-05,
"loss": 0.1018,
"step": 385
},
{
"epoch": 0.37681159420289856,
"grad_norm": 0.23004086315631866,
"learning_rate": 1.2249436029648727e-05,
"loss": 0.0912,
"step": 390
},
{
"epoch": 0.38164251207729466,
"grad_norm": 0.21847793459892273,
"learning_rate": 1.2226877215597809e-05,
"loss": 0.0913,
"step": 395
},
{
"epoch": 0.3864734299516908,
"grad_norm": 0.19117720425128937,
"learning_rate": 1.220431840154689e-05,
"loss": 0.0895,
"step": 400
},
{
"epoch": 0.391304347826087,
"grad_norm": 0.2535386085510254,
"learning_rate": 1.2181759587495971e-05,
"loss": 0.1103,
"step": 405
},
{
"epoch": 0.3961352657004831,
"grad_norm": 0.26542574167251587,
"learning_rate": 1.2159200773445053e-05,
"loss": 0.0923,
"step": 410
},
{
"epoch": 0.40096618357487923,
"grad_norm": 0.20197734236717224,
"learning_rate": 1.2136641959394135e-05,
"loss": 0.0931,
"step": 415
},
{
"epoch": 0.4057971014492754,
"grad_norm": 0.20673911273479462,
"learning_rate": 1.2114083145343217e-05,
"loss": 0.1138,
"step": 420
},
{
"epoch": 0.4106280193236715,
"grad_norm": 0.24391110241413116,
"learning_rate": 1.2091524331292299e-05,
"loss": 0.094,
"step": 425
},
{
"epoch": 0.41545893719806765,
"grad_norm": 0.2456451952457428,
"learning_rate": 1.2068965517241379e-05,
"loss": 0.1078,
"step": 430
},
{
"epoch": 0.42028985507246375,
"grad_norm": 0.29903218150138855,
"learning_rate": 1.204640670319046e-05,
"loss": 0.0999,
"step": 435
},
{
"epoch": 0.4251207729468599,
"grad_norm": 0.17596346139907837,
"learning_rate": 1.2023847889139541e-05,
"loss": 0.0995,
"step": 440
},
{
"epoch": 0.42995169082125606,
"grad_norm": 0.14841659367084503,
"learning_rate": 1.2001289075088623e-05,
"loss": 0.0934,
"step": 445
},
{
"epoch": 0.43478260869565216,
"grad_norm": 0.18399696052074432,
"learning_rate": 1.1978730261037705e-05,
"loss": 0.0967,
"step": 450
},
{
"epoch": 0.4396135265700483,
"grad_norm": 0.1746302992105484,
"learning_rate": 1.1956171446986787e-05,
"loss": 0.0947,
"step": 455
},
{
"epoch": 0.4444444444444444,
"grad_norm": 0.2423829287290573,
"learning_rate": 1.1933612632935868e-05,
"loss": 0.0936,
"step": 460
},
{
"epoch": 0.4492753623188406,
"grad_norm": 0.15260176360607147,
"learning_rate": 1.191105381888495e-05,
"loss": 0.1034,
"step": 465
},
{
"epoch": 0.45410628019323673,
"grad_norm": 0.2334187626838684,
"learning_rate": 1.1888495004834032e-05,
"loss": 0.1053,
"step": 470
},
{
"epoch": 0.45893719806763283,
"grad_norm": 0.19356365501880646,
"learning_rate": 1.1865936190783114e-05,
"loss": 0.0842,
"step": 475
},
{
"epoch": 0.463768115942029,
"grad_norm": 0.20395216345787048,
"learning_rate": 1.1843377376732194e-05,
"loss": 0.0792,
"step": 480
},
{
"epoch": 0.46859903381642515,
"grad_norm": 0.1807161122560501,
"learning_rate": 1.1820818562681276e-05,
"loss": 0.0882,
"step": 485
},
{
"epoch": 0.47342995169082125,
"grad_norm": 0.16710110008716583,
"learning_rate": 1.1798259748630358e-05,
"loss": 0.0822,
"step": 490
},
{
"epoch": 0.4782608695652174,
"grad_norm": 0.1776697188615799,
"learning_rate": 1.1775700934579438e-05,
"loss": 0.0922,
"step": 495
},
{
"epoch": 0.4830917874396135,
"grad_norm": 0.21817447245121002,
"learning_rate": 1.175314212052852e-05,
"loss": 0.0949,
"step": 500
},
{
"epoch": 0.48792270531400966,
"grad_norm": 0.20692448318004608,
"learning_rate": 1.1730583306477602e-05,
"loss": 0.0907,
"step": 505
},
{
"epoch": 0.4927536231884058,
"grad_norm": 0.1886768341064453,
"learning_rate": 1.1708024492426684e-05,
"loss": 0.0858,
"step": 510
},
{
"epoch": 0.4975845410628019,
"grad_norm": 0.19374988973140717,
"learning_rate": 1.1685465678375766e-05,
"loss": 0.084,
"step": 515
},
{
"epoch": 0.5024154589371981,
"grad_norm": 0.1982010304927826,
"learning_rate": 1.1662906864324847e-05,
"loss": 0.087,
"step": 520
},
{
"epoch": 0.5072463768115942,
"grad_norm": 0.292267769575119,
"learning_rate": 1.1640348050273929e-05,
"loss": 0.0918,
"step": 525
},
{
"epoch": 0.5120772946859904,
"grad_norm": 0.19581086933612823,
"learning_rate": 1.1617789236223009e-05,
"loss": 0.1012,
"step": 530
},
{
"epoch": 0.5169082125603864,
"grad_norm": 0.1730077862739563,
"learning_rate": 1.159523042217209e-05,
"loss": 0.0853,
"step": 535
},
{
"epoch": 0.5217391304347826,
"grad_norm": 0.20485533773899078,
"learning_rate": 1.1572671608121173e-05,
"loss": 0.093,
"step": 540
},
{
"epoch": 0.5265700483091788,
"grad_norm": 0.2086704820394516,
"learning_rate": 1.1550112794070255e-05,
"loss": 0.0945,
"step": 545
},
{
"epoch": 0.5314009661835749,
"grad_norm": 0.15911467373371124,
"learning_rate": 1.1527553980019335e-05,
"loss": 0.1034,
"step": 550
},
{
"epoch": 0.5362318840579711,
"grad_norm": 0.2168796807527542,
"learning_rate": 1.1504995165968417e-05,
"loss": 0.0945,
"step": 555
},
{
"epoch": 0.5410628019323671,
"grad_norm": 0.20228448510169983,
"learning_rate": 1.1482436351917499e-05,
"loss": 0.1029,
"step": 560
},
{
"epoch": 0.5458937198067633,
"grad_norm": 0.2441129982471466,
"learning_rate": 1.1459877537866581e-05,
"loss": 0.0906,
"step": 565
},
{
"epoch": 0.5507246376811594,
"grad_norm": 0.22443729639053345,
"learning_rate": 1.1437318723815663e-05,
"loss": 0.0994,
"step": 570
},
{
"epoch": 0.5555555555555556,
"grad_norm": 0.18132899701595306,
"learning_rate": 1.1414759909764745e-05,
"loss": 0.0938,
"step": 575
},
{
"epoch": 0.5603864734299517,
"grad_norm": 0.19448505342006683,
"learning_rate": 1.1392201095713824e-05,
"loss": 0.0835,
"step": 580
},
{
"epoch": 0.5652173913043478,
"grad_norm": 0.23075686395168304,
"learning_rate": 1.1369642281662906e-05,
"loss": 0.0983,
"step": 585
},
{
"epoch": 0.5700483091787439,
"grad_norm": 0.22883069515228271,
"learning_rate": 1.1347083467611988e-05,
"loss": 0.0787,
"step": 590
},
{
"epoch": 0.5748792270531401,
"grad_norm": 0.23262719810009003,
"learning_rate": 1.132452465356107e-05,
"loss": 0.0939,
"step": 595
},
{
"epoch": 0.5797101449275363,
"grad_norm": 0.20541128516197205,
"learning_rate": 1.1301965839510152e-05,
"loss": 0.0776,
"step": 600
},
{
"epoch": 0.5845410628019324,
"grad_norm": 0.21663478016853333,
"learning_rate": 1.1279407025459234e-05,
"loss": 0.0918,
"step": 605
},
{
"epoch": 0.5893719806763285,
"grad_norm": 0.22586220502853394,
"learning_rate": 1.1256848211408314e-05,
"loss": 0.0824,
"step": 610
},
{
"epoch": 0.5942028985507246,
"grad_norm": 0.1860446035861969,
"learning_rate": 1.1234289397357396e-05,
"loss": 0.0853,
"step": 615
},
{
"epoch": 0.5990338164251208,
"grad_norm": 0.195932075381279,
"learning_rate": 1.1211730583306478e-05,
"loss": 0.0818,
"step": 620
},
{
"epoch": 0.6038647342995169,
"grad_norm": 0.19570867717266083,
"learning_rate": 1.118917176925556e-05,
"loss": 0.0859,
"step": 625
},
{
"epoch": 0.6086956521739131,
"grad_norm": 0.16349905729293823,
"learning_rate": 1.116661295520464e-05,
"loss": 0.0938,
"step": 630
},
{
"epoch": 0.6135265700483091,
"grad_norm": 0.1926320493221283,
"learning_rate": 1.1144054141153722e-05,
"loss": 0.0846,
"step": 635
},
{
"epoch": 0.6183574879227053,
"grad_norm": 0.19020161032676697,
"learning_rate": 1.1121495327102803e-05,
"loss": 0.086,
"step": 640
},
{
"epoch": 0.6231884057971014,
"grad_norm": 0.20265896618366241,
"learning_rate": 1.1098936513051885e-05,
"loss": 0.0793,
"step": 645
},
{
"epoch": 0.6280193236714976,
"grad_norm": 0.17398878931999207,
"learning_rate": 1.1076377699000967e-05,
"loss": 0.09,
"step": 650
},
{
"epoch": 0.6328502415458938,
"grad_norm": 0.19005955755710602,
"learning_rate": 1.1053818884950049e-05,
"loss": 0.0792,
"step": 655
},
{
"epoch": 0.6376811594202898,
"grad_norm": 0.18029935657978058,
"learning_rate": 1.103126007089913e-05,
"loss": 0.0923,
"step": 660
},
{
"epoch": 0.642512077294686,
"grad_norm": 0.1881086826324463,
"learning_rate": 1.1008701256848212e-05,
"loss": 0.0936,
"step": 665
},
{
"epoch": 0.6473429951690821,
"grad_norm": 0.269255667924881,
"learning_rate": 1.0986142442797293e-05,
"loss": 0.0916,
"step": 670
},
{
"epoch": 0.6521739130434783,
"grad_norm": 0.18038909137248993,
"learning_rate": 1.0963583628746373e-05,
"loss": 0.0855,
"step": 675
},
{
"epoch": 0.6570048309178744,
"grad_norm": 0.17990528047084808,
"learning_rate": 1.0941024814695455e-05,
"loss": 0.0926,
"step": 680
},
{
"epoch": 0.6618357487922706,
"grad_norm": 0.2431405931711197,
"learning_rate": 1.0918466000644537e-05,
"loss": 0.0917,
"step": 685
},
{
"epoch": 0.6666666666666666,
"grad_norm": 0.23243603110313416,
"learning_rate": 1.0895907186593619e-05,
"loss": 0.1046,
"step": 690
},
{
"epoch": 0.6714975845410628,
"grad_norm": 0.20667722821235657,
"learning_rate": 1.0873348372542701e-05,
"loss": 0.0952,
"step": 695
},
{
"epoch": 0.6763285024154589,
"grad_norm": 0.20045587420463562,
"learning_rate": 1.0850789558491781e-05,
"loss": 0.0839,
"step": 700
},
{
"epoch": 0.6811594202898551,
"grad_norm": 0.15829257667064667,
"learning_rate": 1.0828230744440863e-05,
"loss": 0.0931,
"step": 705
},
{
"epoch": 0.6859903381642513,
"grad_norm": 0.18778935074806213,
"learning_rate": 1.0805671930389945e-05,
"loss": 0.083,
"step": 710
},
{
"epoch": 0.6908212560386473,
"grad_norm": 0.1949867457151413,
"learning_rate": 1.0783113116339027e-05,
"loss": 0.0797,
"step": 715
},
{
"epoch": 0.6956521739130435,
"grad_norm": 0.3177832365036011,
"learning_rate": 1.076055430228811e-05,
"loss": 0.081,
"step": 720
},
{
"epoch": 0.7004830917874396,
"grad_norm": 0.1714804619550705,
"learning_rate": 1.073799548823719e-05,
"loss": 0.0908,
"step": 725
},
{
"epoch": 0.7053140096618358,
"grad_norm": 0.25471800565719604,
"learning_rate": 1.071543667418627e-05,
"loss": 0.0988,
"step": 730
},
{
"epoch": 0.7101449275362319,
"grad_norm": 0.21141599118709564,
"learning_rate": 1.0692877860135352e-05,
"loss": 0.0944,
"step": 735
},
{
"epoch": 0.714975845410628,
"grad_norm": 0.17371544241905212,
"learning_rate": 1.0670319046084434e-05,
"loss": 0.0836,
"step": 740
},
{
"epoch": 0.7198067632850241,
"grad_norm": 0.19493460655212402,
"learning_rate": 1.0647760232033516e-05,
"loss": 0.0855,
"step": 745
},
{
"epoch": 0.7246376811594203,
"grad_norm": 0.25241127610206604,
"learning_rate": 1.0625201417982598e-05,
"loss": 0.0844,
"step": 750
},
{
"epoch": 0.7294685990338164,
"grad_norm": 0.2515096664428711,
"learning_rate": 1.060264260393168e-05,
"loss": 0.0883,
"step": 755
},
{
"epoch": 0.7342995169082126,
"grad_norm": 0.15292327105998993,
"learning_rate": 1.058008378988076e-05,
"loss": 0.0792,
"step": 760
},
{
"epoch": 0.7391304347826086,
"grad_norm": 0.20495273172855377,
"learning_rate": 1.0557524975829842e-05,
"loss": 0.0789,
"step": 765
},
{
"epoch": 0.7439613526570048,
"grad_norm": 0.261168897151947,
"learning_rate": 1.0534966161778924e-05,
"loss": 0.0832,
"step": 770
},
{
"epoch": 0.748792270531401,
"grad_norm": 0.28218600153923035,
"learning_rate": 1.0512407347728004e-05,
"loss": 0.1046,
"step": 775
},
{
"epoch": 0.7536231884057971,
"grad_norm": 0.1737246960401535,
"learning_rate": 1.0489848533677086e-05,
"loss": 0.0722,
"step": 780
},
{
"epoch": 0.7584541062801933,
"grad_norm": 0.24183641374111176,
"learning_rate": 1.0467289719626168e-05,
"loss": 0.0752,
"step": 785
},
{
"epoch": 0.7632850241545893,
"grad_norm": 0.23685990273952484,
"learning_rate": 1.0444730905575249e-05,
"loss": 0.1037,
"step": 790
},
{
"epoch": 0.7681159420289855,
"grad_norm": 0.22956091165542603,
"learning_rate": 1.042217209152433e-05,
"loss": 0.0761,
"step": 795
},
{
"epoch": 0.7729468599033816,
"grad_norm": 0.18922095000743866,
"learning_rate": 1.0399613277473413e-05,
"loss": 0.0885,
"step": 800
},
{
"epoch": 0.7777777777777778,
"grad_norm": 0.18391458690166473,
"learning_rate": 1.0377054463422495e-05,
"loss": 0.0859,
"step": 805
},
{
"epoch": 0.782608695652174,
"grad_norm": 0.27890563011169434,
"learning_rate": 1.0354495649371577e-05,
"loss": 0.0924,
"step": 810
},
{
"epoch": 0.7874396135265701,
"grad_norm": 0.22491532564163208,
"learning_rate": 1.0331936835320657e-05,
"loss": 0.0811,
"step": 815
},
{
"epoch": 0.7922705314009661,
"grad_norm": 0.21809989213943481,
"learning_rate": 1.0309378021269739e-05,
"loss": 0.0784,
"step": 820
},
{
"epoch": 0.7971014492753623,
"grad_norm": 0.27180778980255127,
"learning_rate": 1.028681920721882e-05,
"loss": 0.088,
"step": 825
},
{
"epoch": 0.8019323671497585,
"grad_norm": 0.22717216610908508,
"learning_rate": 1.0264260393167901e-05,
"loss": 0.0755,
"step": 830
},
{
"epoch": 0.8067632850241546,
"grad_norm": 0.19013768434524536,
"learning_rate": 1.0241701579116983e-05,
"loss": 0.0782,
"step": 835
},
{
"epoch": 0.8115942028985508,
"grad_norm": 0.2028125375509262,
"learning_rate": 1.0219142765066065e-05,
"loss": 0.1034,
"step": 840
},
{
"epoch": 0.8164251207729468,
"grad_norm": 0.24243703484535217,
"learning_rate": 1.0196583951015146e-05,
"loss": 0.0899,
"step": 845
},
{
"epoch": 0.821256038647343,
"grad_norm": 0.21742011606693268,
"learning_rate": 1.0174025136964228e-05,
"loss": 0.0898,
"step": 850
},
{
"epoch": 0.8260869565217391,
"grad_norm": 0.2000913769006729,
"learning_rate": 1.015146632291331e-05,
"loss": 0.0828,
"step": 855
},
{
"epoch": 0.8309178743961353,
"grad_norm": 0.1902933269739151,
"learning_rate": 1.0128907508862392e-05,
"loss": 0.0934,
"step": 860
},
{
"epoch": 0.8357487922705314,
"grad_norm": 0.20363092422485352,
"learning_rate": 1.0106348694811474e-05,
"loss": 0.081,
"step": 865
},
{
"epoch": 0.8405797101449275,
"grad_norm": 0.2238474041223526,
"learning_rate": 1.0083789880760556e-05,
"loss": 0.0963,
"step": 870
},
{
"epoch": 0.8454106280193237,
"grad_norm": 0.19188345968723297,
"learning_rate": 1.0061231066709634e-05,
"loss": 0.0809,
"step": 875
},
{
"epoch": 0.8502415458937198,
"grad_norm": 0.18286921083927155,
"learning_rate": 1.0038672252658716e-05,
"loss": 0.0891,
"step": 880
},
{
"epoch": 0.855072463768116,
"grad_norm": 0.19798459112644196,
"learning_rate": 1.0016113438607798e-05,
"loss": 0.0789,
"step": 885
},
{
"epoch": 0.8599033816425121,
"grad_norm": 0.1937275230884552,
"learning_rate": 9.99355462455688e-06,
"loss": 0.0748,
"step": 890
},
{
"epoch": 0.8647342995169082,
"grad_norm": 0.2399519830942154,
"learning_rate": 9.970995810505962e-06,
"loss": 0.0941,
"step": 895
},
{
"epoch": 0.8695652173913043,
"grad_norm": 0.2435486763715744,
"learning_rate": 9.948436996455044e-06,
"loss": 0.078,
"step": 900
},
{
"epoch": 0.8743961352657005,
"grad_norm": 0.22818566858768463,
"learning_rate": 9.925878182404124e-06,
"loss": 0.0813,
"step": 905
},
{
"epoch": 0.8792270531400966,
"grad_norm": 0.19992083311080933,
"learning_rate": 9.903319368353206e-06,
"loss": 0.0757,
"step": 910
},
{
"epoch": 0.8840579710144928,
"grad_norm": 0.24121499061584473,
"learning_rate": 9.880760554302288e-06,
"loss": 0.0878,
"step": 915
},
{
"epoch": 0.8888888888888888,
"grad_norm": 0.2414121776819229,
"learning_rate": 9.85820174025137e-06,
"loss": 0.076,
"step": 920
},
{
"epoch": 0.893719806763285,
"grad_norm": 0.17777179181575775,
"learning_rate": 9.83564292620045e-06,
"loss": 0.0903,
"step": 925
},
{
"epoch": 0.8985507246376812,
"grad_norm": 0.23024319112300873,
"learning_rate": 9.813084112149533e-06,
"loss": 0.0965,
"step": 930
},
{
"epoch": 0.9033816425120773,
"grad_norm": 0.20664696395397186,
"learning_rate": 9.790525298098613e-06,
"loss": 0.0707,
"step": 935
},
{
"epoch": 0.9082125603864735,
"grad_norm": 0.1725015491247177,
"learning_rate": 9.767966484047695e-06,
"loss": 0.0821,
"step": 940
},
{
"epoch": 0.9130434782608695,
"grad_norm": 0.2138936072587967,
"learning_rate": 9.745407669996777e-06,
"loss": 0.087,
"step": 945
},
{
"epoch": 0.9178743961352657,
"grad_norm": 0.24879959225654602,
"learning_rate": 9.722848855945859e-06,
"loss": 0.0782,
"step": 950
},
{
"epoch": 0.9227053140096618,
"grad_norm": 0.24507424235343933,
"learning_rate": 9.700290041894941e-06,
"loss": 0.0976,
"step": 955
},
{
"epoch": 0.927536231884058,
"grad_norm": 0.21825656294822693,
"learning_rate": 9.677731227844023e-06,
"loss": 0.0846,
"step": 960
},
{
"epoch": 0.9323671497584541,
"grad_norm": 0.22634956240653992,
"learning_rate": 9.655172413793103e-06,
"loss": 0.0833,
"step": 965
},
{
"epoch": 0.9371980676328503,
"grad_norm": 0.20103132724761963,
"learning_rate": 9.632613599742184e-06,
"loss": 0.0739,
"step": 970
},
{
"epoch": 0.9420289855072463,
"grad_norm": 0.19459068775177002,
"learning_rate": 9.610054785691266e-06,
"loss": 0.0841,
"step": 975
},
{
"epoch": 0.9468599033816425,
"grad_norm": 0.18598385155200958,
"learning_rate": 9.587495971640348e-06,
"loss": 0.0734,
"step": 980
},
{
"epoch": 0.9516908212560387,
"grad_norm": 0.24302643537521362,
"learning_rate": 9.56493715758943e-06,
"loss": 0.0893,
"step": 985
},
{
"epoch": 0.9565217391304348,
"grad_norm": 0.23758938908576965,
"learning_rate": 9.542378343538512e-06,
"loss": 0.0763,
"step": 990
},
{
"epoch": 0.961352657004831,
"grad_norm": 0.2180752158164978,
"learning_rate": 9.519819529487592e-06,
"loss": 0.0792,
"step": 995
},
{
"epoch": 0.966183574879227,
"grad_norm": 0.22509507834911346,
"learning_rate": 9.497260715436674e-06,
"loss": 0.0682,
"step": 1000
},
{
"epoch": 0.9710144927536232,
"grad_norm": 0.197494238615036,
"learning_rate": 9.474701901385756e-06,
"loss": 0.0903,
"step": 1005
},
{
"epoch": 0.9758454106280193,
"grad_norm": 0.1817607879638672,
"learning_rate": 9.452143087334838e-06,
"loss": 0.0846,
"step": 1010
},
{
"epoch": 0.9806763285024155,
"grad_norm": 0.19075438380241394,
"learning_rate": 9.42958427328392e-06,
"loss": 0.0846,
"step": 1015
},
{
"epoch": 0.9855072463768116,
"grad_norm": 0.15087321400642395,
"learning_rate": 9.407025459233e-06,
"loss": 0.0753,
"step": 1020
},
{
"epoch": 0.9903381642512077,
"grad_norm": 0.2226846069097519,
"learning_rate": 9.38446664518208e-06,
"loss": 0.0666,
"step": 1025
},
{
"epoch": 0.9951690821256038,
"grad_norm": 0.30765634775161743,
"learning_rate": 9.361907831131162e-06,
"loss": 0.0779,
"step": 1030
},
{
"epoch": 1.0,
"grad_norm": 0.4031631052494049,
"learning_rate": 9.339349017080244e-06,
"loss": 0.095,
"step": 1035
},
{
"epoch": 1.0,
"eval_runtime": 338.4602,
"eval_samples_per_second": 3.055,
"eval_steps_per_second": 0.384,
"step": 1035
},
{
"epoch": 1.0048309178743962,
"grad_norm": 0.2173592448234558,
"learning_rate": 9.316790203029326e-06,
"loss": 0.0803,
"step": 1040
},
{
"epoch": 1.0096618357487923,
"grad_norm": 0.22241808474063873,
"learning_rate": 9.294231388978408e-06,
"loss": 0.0909,
"step": 1045
},
{
"epoch": 1.0144927536231885,
"grad_norm": 0.2699296474456787,
"learning_rate": 9.27167257492749e-06,
"loss": 0.0842,
"step": 1050
},
{
"epoch": 1.0193236714975846,
"grad_norm": 0.27080684900283813,
"learning_rate": 9.24911376087657e-06,
"loss": 0.0837,
"step": 1055
},
{
"epoch": 1.0241545893719808,
"grad_norm": 0.1808546930551529,
"learning_rate": 9.226554946825653e-06,
"loss": 0.0816,
"step": 1060
},
{
"epoch": 1.0289855072463767,
"grad_norm": 0.19763918220996857,
"learning_rate": 9.203996132774735e-06,
"loss": 0.0705,
"step": 1065
},
{
"epoch": 1.0338164251207729,
"grad_norm": 0.21294108033180237,
"learning_rate": 9.181437318723815e-06,
"loss": 0.0726,
"step": 1070
},
{
"epoch": 1.038647342995169,
"grad_norm": 0.19769993424415588,
"learning_rate": 9.158878504672897e-06,
"loss": 0.0743,
"step": 1075
},
{
"epoch": 1.0434782608695652,
"grad_norm": 0.23708152770996094,
"learning_rate": 9.136319690621979e-06,
"loss": 0.0785,
"step": 1080
},
{
"epoch": 1.0483091787439613,
"grad_norm": 0.232899010181427,
"learning_rate": 9.11376087657106e-06,
"loss": 0.093,
"step": 1085
},
{
"epoch": 1.0531400966183575,
"grad_norm": 0.267478883266449,
"learning_rate": 9.091202062520141e-06,
"loss": 0.0901,
"step": 1090
},
{
"epoch": 1.0579710144927537,
"grad_norm": 0.23761190474033356,
"learning_rate": 9.068643248469223e-06,
"loss": 0.0898,
"step": 1095
},
{
"epoch": 1.0628019323671498,
"grad_norm": 0.19679813086986542,
"learning_rate": 9.046084434418305e-06,
"loss": 0.0877,
"step": 1100
},
{
"epoch": 1.067632850241546,
"grad_norm": 0.20915761590003967,
"learning_rate": 9.023525620367387e-06,
"loss": 0.0731,
"step": 1105
},
{
"epoch": 1.0724637681159421,
"grad_norm": 0.18718890845775604,
"learning_rate": 9.000966806316468e-06,
"loss": 0.0885,
"step": 1110
},
{
"epoch": 1.077294685990338,
"grad_norm": 0.29885435104370117,
"learning_rate": 8.97840799226555e-06,
"loss": 0.0861,
"step": 1115
},
{
"epoch": 1.0821256038647342,
"grad_norm": 0.16953594982624054,
"learning_rate": 8.95584917821463e-06,
"loss": 0.0862,
"step": 1120
},
{
"epoch": 1.0869565217391304,
"grad_norm": 0.21629682183265686,
"learning_rate": 8.933290364163712e-06,
"loss": 0.1022,
"step": 1125
},
{
"epoch": 1.0917874396135265,
"grad_norm": 0.26614615321159363,
"learning_rate": 8.910731550112794e-06,
"loss": 0.0825,
"step": 1130
},
{
"epoch": 1.0966183574879227,
"grad_norm": 0.2642468810081482,
"learning_rate": 8.888172736061876e-06,
"loss": 0.0796,
"step": 1135
},
{
"epoch": 1.1014492753623188,
"grad_norm": 0.16877882182598114,
"learning_rate": 8.865613922010956e-06,
"loss": 0.0726,
"step": 1140
},
{
"epoch": 1.106280193236715,
"grad_norm": 0.2619246542453766,
"learning_rate": 8.843055107960038e-06,
"loss": 0.0835,
"step": 1145
},
{
"epoch": 1.1111111111111112,
"grad_norm": 0.2424723505973816,
"learning_rate": 8.82049629390912e-06,
"loss": 0.0896,
"step": 1150
},
{
"epoch": 1.1159420289855073,
"grad_norm": 0.20973582565784454,
"learning_rate": 8.797937479858202e-06,
"loss": 0.0751,
"step": 1155
},
{
"epoch": 1.1207729468599035,
"grad_norm": 0.23418009281158447,
"learning_rate": 8.775378665807284e-06,
"loss": 0.077,
"step": 1160
},
{
"epoch": 1.1256038647342996,
"grad_norm": 0.3117668032646179,
"learning_rate": 8.752819851756366e-06,
"loss": 0.0769,
"step": 1165
},
{
"epoch": 1.1304347826086956,
"grad_norm": 0.25092753767967224,
"learning_rate": 8.730261037705446e-06,
"loss": 0.0729,
"step": 1170
},
{
"epoch": 1.1352657004830917,
"grad_norm": 0.1926090270280838,
"learning_rate": 8.707702223654527e-06,
"loss": 0.0772,
"step": 1175
},
{
"epoch": 1.1400966183574879,
"grad_norm": 0.27212995290756226,
"learning_rate": 8.685143409603609e-06,
"loss": 0.0712,
"step": 1180
},
{
"epoch": 1.144927536231884,
"grad_norm": 0.2097581923007965,
"learning_rate": 8.66258459555269e-06,
"loss": 0.0767,
"step": 1185
},
{
"epoch": 1.1497584541062802,
"grad_norm": 0.2765638828277588,
"learning_rate": 8.640025781501773e-06,
"loss": 0.0885,
"step": 1190
},
{
"epoch": 1.1545893719806763,
"grad_norm": 0.28414320945739746,
"learning_rate": 8.617466967450855e-06,
"loss": 0.0631,
"step": 1195
},
{
"epoch": 1.1594202898550725,
"grad_norm": 0.21230548620224,
"learning_rate": 8.594908153399935e-06,
"loss": 0.0848,
"step": 1200
},
{
"epoch": 1.1642512077294687,
"grad_norm": 0.1870320439338684,
"learning_rate": 8.572349339349017e-06,
"loss": 0.0724,
"step": 1205
},
{
"epoch": 1.1690821256038648,
"grad_norm": 0.23322801291942596,
"learning_rate": 8.549790525298099e-06,
"loss": 0.0846,
"step": 1210
},
{
"epoch": 1.1739130434782608,
"grad_norm": 0.22248071432113647,
"learning_rate": 8.52723171124718e-06,
"loss": 0.0624,
"step": 1215
},
{
"epoch": 1.178743961352657,
"grad_norm": 0.196117103099823,
"learning_rate": 8.504672897196261e-06,
"loss": 0.093,
"step": 1220
},
{
"epoch": 1.183574879227053,
"grad_norm": 0.2212802767753601,
"learning_rate": 8.482114083145343e-06,
"loss": 0.0853,
"step": 1225
},
{
"epoch": 1.1884057971014492,
"grad_norm": 0.17421841621398926,
"learning_rate": 8.459555269094424e-06,
"loss": 0.0747,
"step": 1230
},
{
"epoch": 1.1932367149758454,
"grad_norm": 0.2645537853240967,
"learning_rate": 8.436996455043506e-06,
"loss": 0.0859,
"step": 1235
},
{
"epoch": 1.1980676328502415,
"grad_norm": 0.27182498574256897,
"learning_rate": 8.414437640992588e-06,
"loss": 0.0943,
"step": 1240
},
{
"epoch": 1.2028985507246377,
"grad_norm": 0.20389291644096375,
"learning_rate": 8.39187882694167e-06,
"loss": 0.0783,
"step": 1245
},
{
"epoch": 1.2077294685990339,
"grad_norm": 0.3193868398666382,
"learning_rate": 8.369320012890752e-06,
"loss": 0.0917,
"step": 1250
},
{
"epoch": 1.21256038647343,
"grad_norm": 0.2852030098438263,
"learning_rate": 8.346761198839834e-06,
"loss": 0.077,
"step": 1255
},
{
"epoch": 1.2173913043478262,
"grad_norm": 0.256452739238739,
"learning_rate": 8.324202384788914e-06,
"loss": 0.0804,
"step": 1260
},
{
"epoch": 1.2222222222222223,
"grad_norm": 0.209047332406044,
"learning_rate": 8.301643570737994e-06,
"loss": 0.0928,
"step": 1265
},
{
"epoch": 1.2270531400966185,
"grad_norm": 0.21215900778770447,
"learning_rate": 8.279084756687076e-06,
"loss": 0.0788,
"step": 1270
},
{
"epoch": 1.2318840579710144,
"grad_norm": 0.15550634264945984,
"learning_rate": 8.256525942636158e-06,
"loss": 0.0804,
"step": 1275
},
{
"epoch": 1.2367149758454106,
"grad_norm": 0.16960662603378296,
"learning_rate": 8.23396712858524e-06,
"loss": 0.0737,
"step": 1280
},
{
"epoch": 1.2415458937198067,
"grad_norm": 0.20484741032123566,
"learning_rate": 8.211408314534322e-06,
"loss": 0.0794,
"step": 1285
},
{
"epoch": 1.2463768115942029,
"grad_norm": 0.24889996647834778,
"learning_rate": 8.188849500483402e-06,
"loss": 0.0903,
"step": 1290
},
{
"epoch": 1.251207729468599,
"grad_norm": 0.23695576190948486,
"learning_rate": 8.166290686432484e-06,
"loss": 0.0826,
"step": 1295
},
{
"epoch": 1.2560386473429952,
"grad_norm": 0.23449349403381348,
"learning_rate": 8.143731872381566e-06,
"loss": 0.0922,
"step": 1300
},
{
"epoch": 1.2608695652173914,
"grad_norm": 0.2362452745437622,
"learning_rate": 8.121173058330648e-06,
"loss": 0.0716,
"step": 1305
},
{
"epoch": 1.2657004830917875,
"grad_norm": 0.33280622959136963,
"learning_rate": 8.09861424427973e-06,
"loss": 0.0909,
"step": 1310
},
{
"epoch": 1.2705314009661834,
"grad_norm": 0.22267523407936096,
"learning_rate": 8.07605543022881e-06,
"loss": 0.0816,
"step": 1315
},
{
"epoch": 1.2753623188405796,
"grad_norm": 0.23176385462284088,
"learning_rate": 8.053496616177891e-06,
"loss": 0.091,
"step": 1320
},
{
"epoch": 1.2801932367149758,
"grad_norm": 0.21951176226139069,
"learning_rate": 8.030937802126973e-06,
"loss": 0.0752,
"step": 1325
},
{
"epoch": 1.285024154589372,
"grad_norm": 0.19361701607704163,
"learning_rate": 8.008378988076055e-06,
"loss": 0.0731,
"step": 1330
},
{
"epoch": 1.289855072463768,
"grad_norm": 0.2284880429506302,
"learning_rate": 7.985820174025137e-06,
"loss": 0.0821,
"step": 1335
},
{
"epoch": 1.2946859903381642,
"grad_norm": 0.28775539994239807,
"learning_rate": 7.963261359974219e-06,
"loss": 0.0865,
"step": 1340
},
{
"epoch": 1.2995169082125604,
"grad_norm": 0.22133222222328186,
"learning_rate": 7.940702545923301e-06,
"loss": 0.0722,
"step": 1345
},
{
"epoch": 1.3043478260869565,
"grad_norm": 0.2120644450187683,
"learning_rate": 7.918143731872381e-06,
"loss": 0.0642,
"step": 1350
},
{
"epoch": 1.3091787439613527,
"grad_norm": 0.2922479212284088,
"learning_rate": 7.895584917821463e-06,
"loss": 0.0804,
"step": 1355
},
{
"epoch": 1.3140096618357489,
"grad_norm": 0.2302795797586441,
"learning_rate": 7.873026103770545e-06,
"loss": 0.0726,
"step": 1360
},
{
"epoch": 1.318840579710145,
"grad_norm": 0.28763264417648315,
"learning_rate": 7.850467289719626e-06,
"loss": 0.0976,
"step": 1365
},
{
"epoch": 1.3236714975845412,
"grad_norm": 0.2106347233057022,
"learning_rate": 7.827908475668708e-06,
"loss": 0.0744,
"step": 1370
},
{
"epoch": 1.3285024154589373,
"grad_norm": 0.23215855658054352,
"learning_rate": 7.80534966161779e-06,
"loss": 0.0916,
"step": 1375
},
{
"epoch": 1.3333333333333333,
"grad_norm": 0.20885543525218964,
"learning_rate": 7.78279084756687e-06,
"loss": 0.0774,
"step": 1380
},
{
"epoch": 1.3381642512077294,
"grad_norm": 0.20533576607704163,
"learning_rate": 7.760232033515952e-06,
"loss": 0.0751,
"step": 1385
},
{
"epoch": 1.3429951690821256,
"grad_norm": 0.20719490945339203,
"learning_rate": 7.737673219465034e-06,
"loss": 0.0816,
"step": 1390
},
{
"epoch": 1.3478260869565217,
"grad_norm": 0.19761165976524353,
"learning_rate": 7.715114405414116e-06,
"loss": 0.0804,
"step": 1395
},
{
"epoch": 1.3526570048309179,
"grad_norm": 0.20369771122932434,
"learning_rate": 7.692555591363198e-06,
"loss": 0.0845,
"step": 1400
},
{
"epoch": 1.357487922705314,
"grad_norm": 0.20887012779712677,
"learning_rate": 7.669996777312278e-06,
"loss": 0.0704,
"step": 1405
},
{
"epoch": 1.3623188405797102,
"grad_norm": 0.29784587025642395,
"learning_rate": 7.64743796326136e-06,
"loss": 0.0866,
"step": 1410
},
{
"epoch": 1.3671497584541064,
"grad_norm": 0.31036221981048584,
"learning_rate": 7.62487914921044e-06,
"loss": 0.0862,
"step": 1415
},
{
"epoch": 1.3719806763285023,
"grad_norm": 0.25198647379875183,
"learning_rate": 7.602320335159522e-06,
"loss": 0.0825,
"step": 1420
},
{
"epoch": 1.3768115942028984,
"grad_norm": 0.24515630304813385,
"learning_rate": 7.579761521108604e-06,
"loss": 0.0787,
"step": 1425
},
{
"epoch": 1.3816425120772946,
"grad_norm": 0.22536733746528625,
"learning_rate": 7.5572027070576855e-06,
"loss": 0.0928,
"step": 1430
},
{
"epoch": 1.3864734299516908,
"grad_norm": 0.23405781388282776,
"learning_rate": 7.5346438930067675e-06,
"loss": 0.0917,
"step": 1435
},
{
"epoch": 1.391304347826087,
"grad_norm": 0.24243396520614624,
"learning_rate": 7.5120850789558495e-06,
"loss": 0.0675,
"step": 1440
},
{
"epoch": 1.396135265700483,
"grad_norm": 0.2637854814529419,
"learning_rate": 7.489526264904931e-06,
"loss": 0.0863,
"step": 1445
},
{
"epoch": 1.4009661835748792,
"grad_norm": 0.2491244375705719,
"learning_rate": 7.466967450854013e-06,
"loss": 0.0808,
"step": 1450
},
{
"epoch": 1.4057971014492754,
"grad_norm": 0.23132705688476562,
"learning_rate": 7.444408636803095e-06,
"loss": 0.0797,
"step": 1455
},
{
"epoch": 1.4106280193236715,
"grad_norm": 0.2987098693847656,
"learning_rate": 7.421849822752176e-06,
"loss": 0.0766,
"step": 1460
},
{
"epoch": 1.4154589371980677,
"grad_norm": 0.23995457589626312,
"learning_rate": 7.399291008701256e-06,
"loss": 0.0764,
"step": 1465
},
{
"epoch": 1.4202898550724639,
"grad_norm": 0.21818973124027252,
"learning_rate": 7.376732194650338e-06,
"loss": 0.09,
"step": 1470
},
{
"epoch": 1.42512077294686,
"grad_norm": 0.19304029643535614,
"learning_rate": 7.354173380599419e-06,
"loss": 0.0759,
"step": 1475
},
{
"epoch": 1.4299516908212562,
"grad_norm": 0.26081785559654236,
"learning_rate": 7.331614566548501e-06,
"loss": 0.0781,
"step": 1480
},
{
"epoch": 1.434782608695652,
"grad_norm": 0.23940761387348175,
"learning_rate": 7.309055752497583e-06,
"loss": 0.085,
"step": 1485
},
{
"epoch": 1.4396135265700483,
"grad_norm": 0.21909761428833008,
"learning_rate": 7.286496938446664e-06,
"loss": 0.0815,
"step": 1490
},
{
"epoch": 1.4444444444444444,
"grad_norm": 0.16527162492275238,
"learning_rate": 7.263938124395746e-06,
"loss": 0.0698,
"step": 1495
},
{
"epoch": 1.4492753623188406,
"grad_norm": 0.21258555352687836,
"learning_rate": 7.241379310344828e-06,
"loss": 0.0806,
"step": 1500
},
{
"epoch": 1.4541062801932367,
"grad_norm": 0.18572719395160675,
"learning_rate": 7.2188204962939095e-06,
"loss": 0.0757,
"step": 1505
},
{
"epoch": 1.458937198067633,
"grad_norm": 0.16916704177856445,
"learning_rate": 7.19626168224299e-06,
"loss": 0.07,
"step": 1510
},
{
"epoch": 1.463768115942029,
"grad_norm": 0.289044588804245,
"learning_rate": 7.173702868192072e-06,
"loss": 0.0656,
"step": 1515
},
{
"epoch": 1.4685990338164252,
"grad_norm": 0.27173757553100586,
"learning_rate": 7.151144054141153e-06,
"loss": 0.0704,
"step": 1520
},
{
"epoch": 1.4734299516908211,
"grad_norm": 0.2929324209690094,
"learning_rate": 7.128585240090235e-06,
"loss": 0.0833,
"step": 1525
},
{
"epoch": 1.4782608695652173,
"grad_norm": 0.2387627214193344,
"learning_rate": 7.106026426039317e-06,
"loss": 0.075,
"step": 1530
},
{
"epoch": 1.4830917874396135,
"grad_norm": 0.3277483582496643,
"learning_rate": 7.083467611988398e-06,
"loss": 0.074,
"step": 1535
},
{
"epoch": 1.4879227053140096,
"grad_norm": 0.23673392832279205,
"learning_rate": 7.06090879793748e-06,
"loss": 0.0697,
"step": 1540
},
{
"epoch": 1.4927536231884058,
"grad_norm": 0.19109922647476196,
"learning_rate": 7.038349983886562e-06,
"loss": 0.0775,
"step": 1545
},
{
"epoch": 1.497584541062802,
"grad_norm": 0.2344091832637787,
"learning_rate": 7.015791169835643e-06,
"loss": 0.0644,
"step": 1550
},
{
"epoch": 1.502415458937198,
"grad_norm": 0.28420698642730713,
"learning_rate": 6.993232355784724e-06,
"loss": 0.0935,
"step": 1555
},
{
"epoch": 1.5072463768115942,
"grad_norm": 0.2632888853549957,
"learning_rate": 6.970673541733806e-06,
"loss": 0.083,
"step": 1560
},
{
"epoch": 1.5120772946859904,
"grad_norm": 0.2461112141609192,
"learning_rate": 6.9481147276828875e-06,
"loss": 0.0729,
"step": 1565
},
{
"epoch": 1.5169082125603865,
"grad_norm": 0.2015853226184845,
"learning_rate": 6.9255559136319695e-06,
"loss": 0.0836,
"step": 1570
},
{
"epoch": 1.5217391304347827,
"grad_norm": 0.2409069985151291,
"learning_rate": 6.902997099581051e-06,
"loss": 0.0797,
"step": 1575
},
{
"epoch": 1.5265700483091789,
"grad_norm": 0.2014143019914627,
"learning_rate": 6.880438285530132e-06,
"loss": 0.09,
"step": 1580
},
{
"epoch": 1.531400966183575,
"grad_norm": 0.2173725664615631,
"learning_rate": 6.857879471479214e-06,
"loss": 0.0648,
"step": 1585
},
{
"epoch": 1.5362318840579712,
"grad_norm": 0.20185904204845428,
"learning_rate": 6.835320657428296e-06,
"loss": 0.0924,
"step": 1590
},
{
"epoch": 1.541062801932367,
"grad_norm": 0.29456228017807007,
"learning_rate": 6.812761843377377e-06,
"loss": 0.0764,
"step": 1595
},
{
"epoch": 1.5458937198067633,
"grad_norm": 0.22320301830768585,
"learning_rate": 6.790203029326458e-06,
"loss": 0.0754,
"step": 1600
},
{
"epoch": 1.5507246376811594,
"grad_norm": 0.2032977044582367,
"learning_rate": 6.76764421527554e-06,
"loss": 0.089,
"step": 1605
},
{
"epoch": 1.5555555555555556,
"grad_norm": 0.24341309070587158,
"learning_rate": 6.745085401224621e-06,
"loss": 0.0767,
"step": 1610
},
{
"epoch": 1.5603864734299517,
"grad_norm": 0.22675780951976776,
"learning_rate": 6.722526587173703e-06,
"loss": 0.0811,
"step": 1615
},
{
"epoch": 1.5652173913043477,
"grad_norm": 0.2980429232120514,
"learning_rate": 6.699967773122784e-06,
"loss": 0.0714,
"step": 1620
},
{
"epoch": 1.5700483091787438,
"grad_norm": 0.2221527248620987,
"learning_rate": 6.6774089590718655e-06,
"loss": 0.0811,
"step": 1625
},
{
"epoch": 1.57487922705314,
"grad_norm": 0.29102587699890137,
"learning_rate": 6.6548501450209474e-06,
"loss": 0.0717,
"step": 1630
},
{
"epoch": 1.5797101449275361,
"grad_norm": 0.24565882980823517,
"learning_rate": 6.632291330970029e-06,
"loss": 0.0688,
"step": 1635
},
{
"epoch": 1.5845410628019323,
"grad_norm": 0.2056146264076233,
"learning_rate": 6.609732516919111e-06,
"loss": 0.0739,
"step": 1640
},
{
"epoch": 1.5893719806763285,
"grad_norm": 0.25777336955070496,
"learning_rate": 6.587173702868192e-06,
"loss": 0.0746,
"step": 1645
},
{
"epoch": 1.5942028985507246,
"grad_norm": 0.20640453696250916,
"learning_rate": 6.564614888817273e-06,
"loss": 0.0757,
"step": 1650
},
{
"epoch": 1.5990338164251208,
"grad_norm": 0.16480913758277893,
"learning_rate": 6.542056074766355e-06,
"loss": 0.0752,
"step": 1655
},
{
"epoch": 1.603864734299517,
"grad_norm": 0.23693595826625824,
"learning_rate": 6.519497260715437e-06,
"loss": 0.0813,
"step": 1660
},
{
"epoch": 1.608695652173913,
"grad_norm": 0.24152866005897522,
"learning_rate": 6.496938446664518e-06,
"loss": 0.0784,
"step": 1665
},
{
"epoch": 1.6135265700483092,
"grad_norm": 0.23890602588653564,
"learning_rate": 6.474379632613599e-06,
"loss": 0.0813,
"step": 1670
},
{
"epoch": 1.6183574879227054,
"grad_norm": 0.2686842679977417,
"learning_rate": 6.451820818562681e-06,
"loss": 0.0833,
"step": 1675
},
{
"epoch": 1.6231884057971016,
"grad_norm": 0.2103358954191208,
"learning_rate": 6.429262004511762e-06,
"loss": 0.0906,
"step": 1680
},
{
"epoch": 1.6280193236714977,
"grad_norm": 0.23938271403312683,
"learning_rate": 6.406703190460844e-06,
"loss": 0.0721,
"step": 1685
},
{
"epoch": 1.6328502415458939,
"grad_norm": 0.1797400861978531,
"learning_rate": 6.384144376409926e-06,
"loss": 0.0678,
"step": 1690
},
{
"epoch": 1.6376811594202898,
"grad_norm": 0.23905880749225616,
"learning_rate": 6.361585562359007e-06,
"loss": 0.0886,
"step": 1695
},
{
"epoch": 1.642512077294686,
"grad_norm": 0.19138076901435852,
"learning_rate": 6.339026748308089e-06,
"loss": 0.0705,
"step": 1700
},
{
"epoch": 1.6473429951690821,
"grad_norm": 0.19759757816791534,
"learning_rate": 6.3164679342571706e-06,
"loss": 0.0772,
"step": 1705
},
{
"epoch": 1.6521739130434783,
"grad_norm": 0.22951267659664154,
"learning_rate": 6.293909120206252e-06,
"loss": 0.0701,
"step": 1710
},
{
"epoch": 1.6570048309178744,
"grad_norm": 0.3317079246044159,
"learning_rate": 6.271350306155334e-06,
"loss": 0.0838,
"step": 1715
},
{
"epoch": 1.6618357487922706,
"grad_norm": 0.2875089645385742,
"learning_rate": 6.248791492104415e-06,
"loss": 0.0711,
"step": 1720
},
{
"epoch": 1.6666666666666665,
"grad_norm": 0.22365209460258484,
"learning_rate": 6.226232678053496e-06,
"loss": 0.0913,
"step": 1725
},
{
"epoch": 1.6714975845410627,
"grad_norm": 0.26004156470298767,
"learning_rate": 6.203673864002578e-06,
"loss": 0.0749,
"step": 1730
},
{
"epoch": 1.6763285024154588,
"grad_norm": 0.24029529094696045,
"learning_rate": 6.18111504995166e-06,
"loss": 0.0872,
"step": 1735
},
{
"epoch": 1.681159420289855,
"grad_norm": 0.2503759562969208,
"learning_rate": 6.158556235900741e-06,
"loss": 0.0662,
"step": 1740
},
{
"epoch": 1.6859903381642511,
"grad_norm": 0.24961721897125244,
"learning_rate": 6.135997421849822e-06,
"loss": 0.0805,
"step": 1745
},
{
"epoch": 1.6908212560386473,
"grad_norm": 0.20291025936603546,
"learning_rate": 6.113438607798904e-06,
"loss": 0.0723,
"step": 1750
},
{
"epoch": 1.6956521739130435,
"grad_norm": 0.24923092126846313,
"learning_rate": 6.0908797937479854e-06,
"loss": 0.0766,
"step": 1755
},
{
"epoch": 1.7004830917874396,
"grad_norm": 0.3006664514541626,
"learning_rate": 6.068320979697067e-06,
"loss": 0.0767,
"step": 1760
},
{
"epoch": 1.7053140096618358,
"grad_norm": 0.22034914791584015,
"learning_rate": 6.045762165646149e-06,
"loss": 0.0716,
"step": 1765
},
{
"epoch": 1.710144927536232,
"grad_norm": 0.22951188683509827,
"learning_rate": 6.02320335159523e-06,
"loss": 0.0713,
"step": 1770
},
{
"epoch": 1.714975845410628,
"grad_norm": 0.22270874679088593,
"learning_rate": 6.000644537544312e-06,
"loss": 0.0671,
"step": 1775
},
{
"epoch": 1.7198067632850242,
"grad_norm": 0.23195502161979675,
"learning_rate": 5.978085723493394e-06,
"loss": 0.0864,
"step": 1780
},
{
"epoch": 1.7246376811594204,
"grad_norm": 0.2421010136604309,
"learning_rate": 5.955526909442475e-06,
"loss": 0.0886,
"step": 1785
},
{
"epoch": 1.7294685990338166,
"grad_norm": 0.20693883299827576,
"learning_rate": 5.932968095391557e-06,
"loss": 0.0715,
"step": 1790
},
{
"epoch": 1.7342995169082127,
"grad_norm": 0.32137101888656616,
"learning_rate": 5.910409281340638e-06,
"loss": 0.0639,
"step": 1795
},
{
"epoch": 1.7391304347826086,
"grad_norm": 0.21108365058898926,
"learning_rate": 5.887850467289719e-06,
"loss": 0.0786,
"step": 1800
},
{
"epoch": 1.7439613526570048,
"grad_norm": 0.2952270805835724,
"learning_rate": 5.865291653238801e-06,
"loss": 0.0641,
"step": 1805
},
{
"epoch": 1.748792270531401,
"grad_norm": 0.26709944009780884,
"learning_rate": 5.842732839187883e-06,
"loss": 0.0698,
"step": 1810
},
{
"epoch": 1.7536231884057971,
"grad_norm": 0.30126988887786865,
"learning_rate": 5.820174025136964e-06,
"loss": 0.0773,
"step": 1815
},
{
"epoch": 1.7584541062801933,
"grad_norm": 0.2402152717113495,
"learning_rate": 5.797615211086045e-06,
"loss": 0.0778,
"step": 1820
},
{
"epoch": 1.7632850241545892,
"grad_norm": 0.19652244448661804,
"learning_rate": 5.775056397035127e-06,
"loss": 0.082,
"step": 1825
},
{
"epoch": 1.7681159420289854,
"grad_norm": 0.21389204263687134,
"learning_rate": 5.7524975829842086e-06,
"loss": 0.0727,
"step": 1830
},
{
"epoch": 1.7729468599033815,
"grad_norm": 0.2189796268939972,
"learning_rate": 5.7299387689332905e-06,
"loss": 0.0757,
"step": 1835
},
{
"epoch": 1.7777777777777777,
"grad_norm": 0.28000935912132263,
"learning_rate": 5.7073799548823725e-06,
"loss": 0.0803,
"step": 1840
},
{
"epoch": 1.7826086956521738,
"grad_norm": 0.24566881358623505,
"learning_rate": 5.684821140831453e-06,
"loss": 0.0815,
"step": 1845
},
{
"epoch": 1.78743961352657,
"grad_norm": 0.22037634253501892,
"learning_rate": 5.662262326780535e-06,
"loss": 0.0871,
"step": 1850
},
{
"epoch": 1.7922705314009661,
"grad_norm": 0.1990278661251068,
"learning_rate": 5.639703512729617e-06,
"loss": 0.0756,
"step": 1855
},
{
"epoch": 1.7971014492753623,
"grad_norm": 0.3180176615715027,
"learning_rate": 5.617144698678698e-06,
"loss": 0.0735,
"step": 1860
},
{
"epoch": 1.8019323671497585,
"grad_norm": 0.2075718492269516,
"learning_rate": 5.59458588462778e-06,
"loss": 0.0665,
"step": 1865
},
{
"epoch": 1.8067632850241546,
"grad_norm": 0.2611768841743469,
"learning_rate": 5.572027070576861e-06,
"loss": 0.0873,
"step": 1870
},
{
"epoch": 1.8115942028985508,
"grad_norm": 0.22146160900592804,
"learning_rate": 5.549468256525942e-06,
"loss": 0.0638,
"step": 1875
},
{
"epoch": 1.816425120772947,
"grad_norm": 0.29287296533584595,
"learning_rate": 5.526909442475024e-06,
"loss": 0.0812,
"step": 1880
},
{
"epoch": 1.821256038647343,
"grad_norm": 0.2280767410993576,
"learning_rate": 5.504350628424106e-06,
"loss": 0.0766,
"step": 1885
},
{
"epoch": 1.8260869565217392,
"grad_norm": 0.20453138649463654,
"learning_rate": 5.4817918143731865e-06,
"loss": 0.0748,
"step": 1890
},
{
"epoch": 1.8309178743961354,
"grad_norm": 0.2855188250541687,
"learning_rate": 5.4592330003222685e-06,
"loss": 0.0901,
"step": 1895
},
{
"epoch": 1.8357487922705316,
"grad_norm": 0.21556098759174347,
"learning_rate": 5.4366741862713505e-06,
"loss": 0.0735,
"step": 1900
},
{
"epoch": 1.8405797101449275,
"grad_norm": 0.3091937303543091,
"learning_rate": 5.414115372220432e-06,
"loss": 0.0636,
"step": 1905
},
{
"epoch": 1.8454106280193237,
"grad_norm": 0.2939262390136719,
"learning_rate": 5.391556558169514e-06,
"loss": 0.0753,
"step": 1910
},
{
"epoch": 1.8502415458937198,
"grad_norm": 0.2101174294948578,
"learning_rate": 5.368997744118595e-06,
"loss": 0.0714,
"step": 1915
},
{
"epoch": 1.855072463768116,
"grad_norm": 0.2570497691631317,
"learning_rate": 5.346438930067676e-06,
"loss": 0.0877,
"step": 1920
},
{
"epoch": 1.8599033816425121,
"grad_norm": 0.2754373848438263,
"learning_rate": 5.323880116016758e-06,
"loss": 0.0729,
"step": 1925
},
{
"epoch": 1.864734299516908,
"grad_norm": 0.2952544391155243,
"learning_rate": 5.30132130196584e-06,
"loss": 0.0714,
"step": 1930
},
{
"epoch": 1.8695652173913042,
"grad_norm": 0.2360425889492035,
"learning_rate": 5.278762487914921e-06,
"loss": 0.0711,
"step": 1935
},
{
"epoch": 1.8743961352657004,
"grad_norm": 0.22847935557365417,
"learning_rate": 5.256203673864002e-06,
"loss": 0.07,
"step": 1940
},
{
"epoch": 1.8792270531400965,
"grad_norm": 0.26060476899147034,
"learning_rate": 5.233644859813084e-06,
"loss": 0.086,
"step": 1945
},
{
"epoch": 1.8840579710144927,
"grad_norm": 0.28593048453330994,
"learning_rate": 5.211086045762165e-06,
"loss": 0.0782,
"step": 1950
},
{
"epoch": 1.8888888888888888,
"grad_norm": 0.2553214430809021,
"learning_rate": 5.188527231711247e-06,
"loss": 0.0689,
"step": 1955
},
{
"epoch": 1.893719806763285,
"grad_norm": 0.38168102502822876,
"learning_rate": 5.1659684176603285e-06,
"loss": 0.0917,
"step": 1960
},
{
"epoch": 1.8985507246376812,
"grad_norm": 0.22879190742969513,
"learning_rate": 5.14340960360941e-06,
"loss": 0.0833,
"step": 1965
},
{
"epoch": 1.9033816425120773,
"grad_norm": 0.19676880538463593,
"learning_rate": 5.120850789558492e-06,
"loss": 0.0594,
"step": 1970
},
{
"epoch": 1.9082125603864735,
"grad_norm": 0.36660292744636536,
"learning_rate": 5.098291975507573e-06,
"loss": 0.0932,
"step": 1975
},
{
"epoch": 1.9130434782608696,
"grad_norm": 0.23486468195915222,
"learning_rate": 5.075733161456655e-06,
"loss": 0.0941,
"step": 1980
},
{
"epoch": 1.9178743961352658,
"grad_norm": 0.2950279414653778,
"learning_rate": 5.053174347405737e-06,
"loss": 0.0796,
"step": 1985
},
{
"epoch": 1.922705314009662,
"grad_norm": 0.1995108425617218,
"learning_rate": 5.030615533354817e-06,
"loss": 0.0766,
"step": 1990
},
{
"epoch": 1.927536231884058,
"grad_norm": 0.3509507179260254,
"learning_rate": 5.008056719303899e-06,
"loss": 0.0718,
"step": 1995
},
{
"epoch": 1.9323671497584543,
"grad_norm": 0.22868584096431732,
"learning_rate": 4.985497905252981e-06,
"loss": 0.0724,
"step": 2000
},
{
"epoch": 1.9371980676328504,
"grad_norm": 0.270059734582901,
"learning_rate": 4.962939091202062e-06,
"loss": 0.0761,
"step": 2005
},
{
"epoch": 1.9420289855072463,
"grad_norm": 0.24437829852104187,
"learning_rate": 4.940380277151144e-06,
"loss": 0.0729,
"step": 2010
},
{
"epoch": 1.9468599033816425,
"grad_norm": 0.24446424841880798,
"learning_rate": 4.917821463100225e-06,
"loss": 0.0648,
"step": 2015
},
{
"epoch": 1.9516908212560387,
"grad_norm": 0.21626543998718262,
"learning_rate": 4.8952626490493065e-06,
"loss": 0.0739,
"step": 2020
},
{
"epoch": 1.9565217391304348,
"grad_norm": 0.20689117908477783,
"learning_rate": 4.8727038349983885e-06,
"loss": 0.0701,
"step": 2025
},
{
"epoch": 1.961352657004831,
"grad_norm": 0.2660706043243408,
"learning_rate": 4.8501450209474705e-06,
"loss": 0.0571,
"step": 2030
},
{
"epoch": 1.966183574879227,
"grad_norm": 0.24084658920764923,
"learning_rate": 4.827586206896552e-06,
"loss": 0.0764,
"step": 2035
},
{
"epoch": 1.971014492753623,
"grad_norm": 0.2771299481391907,
"learning_rate": 4.805027392845633e-06,
"loss": 0.0738,
"step": 2040
},
{
"epoch": 1.9758454106280192,
"grad_norm": 0.2248222976922989,
"learning_rate": 4.782468578794715e-06,
"loss": 0.0774,
"step": 2045
},
{
"epoch": 1.9806763285024154,
"grad_norm": 0.22526535391807556,
"learning_rate": 4.759909764743796e-06,
"loss": 0.0678,
"step": 2050
},
{
"epoch": 1.9855072463768115,
"grad_norm": 0.21107898652553558,
"learning_rate": 4.737350950692878e-06,
"loss": 0.1011,
"step": 2055
},
{
"epoch": 1.9903381642512077,
"grad_norm": 0.22934384644031525,
"learning_rate": 4.71479213664196e-06,
"loss": 0.0715,
"step": 2060
},
{
"epoch": 1.9951690821256038,
"grad_norm": 0.2517627775669098,
"learning_rate": 4.69223332259104e-06,
"loss": 0.0796,
"step": 2065
},
{
"epoch": 2.0,
"grad_norm": 0.40475329756736755,
"learning_rate": 4.669674508540122e-06,
"loss": 0.0919,
"step": 2070
},
{
"epoch": 2.0,
"eval_runtime": 339.1035,
"eval_samples_per_second": 3.049,
"eval_steps_per_second": 0.383,
"step": 2070
},
{
"epoch": 2.004830917874396,
"grad_norm": 0.23014891147613525,
"learning_rate": 4.647115694489204e-06,
"loss": 0.0721,
"step": 2075
},
{
"epoch": 2.0096618357487923,
"grad_norm": 0.292595773935318,
"learning_rate": 4.624556880438285e-06,
"loss": 0.0797,
"step": 2080
},
{
"epoch": 2.0144927536231885,
"grad_norm": 0.2784234583377838,
"learning_rate": 4.601998066387367e-06,
"loss": 0.0783,
"step": 2085
},
{
"epoch": 2.0193236714975846,
"grad_norm": 0.21615320444107056,
"learning_rate": 4.5794392523364485e-06,
"loss": 0.0794,
"step": 2090
},
{
"epoch": 2.024154589371981,
"grad_norm": 0.30054816603660583,
"learning_rate": 4.55688043828553e-06,
"loss": 0.078,
"step": 2095
},
{
"epoch": 2.028985507246377,
"grad_norm": 0.21918036043643951,
"learning_rate": 4.534321624234612e-06,
"loss": 0.0706,
"step": 2100
},
{
"epoch": 2.033816425120773,
"grad_norm": 0.22675025463104248,
"learning_rate": 4.511762810183694e-06,
"loss": 0.0578,
"step": 2105
},
{
"epoch": 2.0386473429951693,
"grad_norm": 0.3500133454799652,
"learning_rate": 4.489203996132775e-06,
"loss": 0.077,
"step": 2110
},
{
"epoch": 2.0434782608695654,
"grad_norm": 0.2782948315143585,
"learning_rate": 4.466645182081856e-06,
"loss": 0.0747,
"step": 2115
},
{
"epoch": 2.0483091787439616,
"grad_norm": 0.3685343265533447,
"learning_rate": 4.444086368030938e-06,
"loss": 0.0775,
"step": 2120
},
{
"epoch": 2.0531400966183573,
"grad_norm": 0.26994946599006653,
"learning_rate": 4.421527553980019e-06,
"loss": 0.076,
"step": 2125
},
{
"epoch": 2.0579710144927534,
"grad_norm": 0.2926693856716156,
"learning_rate": 4.398968739929101e-06,
"loss": 0.0797,
"step": 2130
},
{
"epoch": 2.0628019323671496,
"grad_norm": 0.26841118931770325,
"learning_rate": 4.376409925878183e-06,
"loss": 0.0733,
"step": 2135
},
{
"epoch": 2.0676328502415457,
"grad_norm": 0.25837743282318115,
"learning_rate": 4.353851111827263e-06,
"loss": 0.0572,
"step": 2140
},
{
"epoch": 2.072463768115942,
"grad_norm": 0.23347356915473938,
"learning_rate": 4.331292297776345e-06,
"loss": 0.0824,
"step": 2145
},
{
"epoch": 2.077294685990338,
"grad_norm": 0.31139683723449707,
"learning_rate": 4.308733483725427e-06,
"loss": 0.0801,
"step": 2150
},
{
"epoch": 2.082125603864734,
"grad_norm": 0.33561673760414124,
"learning_rate": 4.2861746696745085e-06,
"loss": 0.0816,
"step": 2155
},
{
"epoch": 2.0869565217391304,
"grad_norm": 0.2744121551513672,
"learning_rate": 4.26361585562359e-06,
"loss": 0.0709,
"step": 2160
},
{
"epoch": 2.0917874396135265,
"grad_norm": 0.29332056641578674,
"learning_rate": 4.241057041572672e-06,
"loss": 0.0768,
"step": 2165
},
{
"epoch": 2.0966183574879227,
"grad_norm": 0.26820820569992065,
"learning_rate": 4.218498227521753e-06,
"loss": 0.0854,
"step": 2170
},
{
"epoch": 2.101449275362319,
"grad_norm": 0.3563501536846161,
"learning_rate": 4.195939413470835e-06,
"loss": 0.0829,
"step": 2175
},
{
"epoch": 2.106280193236715,
"grad_norm": 0.35537421703338623,
"learning_rate": 4.173380599419917e-06,
"loss": 0.0763,
"step": 2180
},
{
"epoch": 2.111111111111111,
"grad_norm": 0.2760440707206726,
"learning_rate": 4.150821785368997e-06,
"loss": 0.092,
"step": 2185
},
{
"epoch": 2.1159420289855073,
"grad_norm": 0.21750731766223907,
"learning_rate": 4.128262971318079e-06,
"loss": 0.0756,
"step": 2190
},
{
"epoch": 2.1207729468599035,
"grad_norm": 0.2815890610218048,
"learning_rate": 4.105704157267161e-06,
"loss": 0.0844,
"step": 2195
},
{
"epoch": 2.1256038647342996,
"grad_norm": 0.20408152043819427,
"learning_rate": 4.083145343216242e-06,
"loss": 0.0603,
"step": 2200
},
{
"epoch": 2.130434782608696,
"grad_norm": 0.2452622503042221,
"learning_rate": 4.060586529165324e-06,
"loss": 0.0767,
"step": 2205
},
{
"epoch": 2.135265700483092,
"grad_norm": 0.3027113080024719,
"learning_rate": 4.038027715114405e-06,
"loss": 0.0716,
"step": 2210
},
{
"epoch": 2.140096618357488,
"grad_norm": 0.23567864298820496,
"learning_rate": 4.0154689010634865e-06,
"loss": 0.0845,
"step": 2215
},
{
"epoch": 2.1449275362318843,
"grad_norm": 0.28407129645347595,
"learning_rate": 3.9929100870125685e-06,
"loss": 0.0784,
"step": 2220
},
{
"epoch": 2.14975845410628,
"grad_norm": 0.28088170289993286,
"learning_rate": 3.9703512729616505e-06,
"loss": 0.0771,
"step": 2225
},
{
"epoch": 2.154589371980676,
"grad_norm": 0.3641108274459839,
"learning_rate": 3.947792458910732e-06,
"loss": 0.0791,
"step": 2230
},
{
"epoch": 2.1594202898550723,
"grad_norm": 0.23423610627651215,
"learning_rate": 3.925233644859813e-06,
"loss": 0.0735,
"step": 2235
},
{
"epoch": 2.1642512077294684,
"grad_norm": 0.21887804567813873,
"learning_rate": 3.902674830808895e-06,
"loss": 0.0795,
"step": 2240
},
{
"epoch": 2.1690821256038646,
"grad_norm": 0.24810364842414856,
"learning_rate": 3.880116016757976e-06,
"loss": 0.076,
"step": 2245
},
{
"epoch": 2.1739130434782608,
"grad_norm": 0.217853844165802,
"learning_rate": 3.857557202707058e-06,
"loss": 0.0794,
"step": 2250
},
{
"epoch": 2.178743961352657,
"grad_norm": 0.28543898463249207,
"learning_rate": 3.834998388656139e-06,
"loss": 0.0707,
"step": 2255
},
{
"epoch": 2.183574879227053,
"grad_norm": 0.2932458221912384,
"learning_rate": 3.81243957460522e-06,
"loss": 0.0715,
"step": 2260
},
{
"epoch": 2.1884057971014492,
"grad_norm": 0.3077555000782013,
"learning_rate": 3.789880760554302e-06,
"loss": 0.0756,
"step": 2265
},
{
"epoch": 2.1932367149758454,
"grad_norm": 0.295901358127594,
"learning_rate": 3.7673219465033837e-06,
"loss": 0.0785,
"step": 2270
},
{
"epoch": 2.1980676328502415,
"grad_norm": 0.2174501270055771,
"learning_rate": 3.7447631324524653e-06,
"loss": 0.0578,
"step": 2275
},
{
"epoch": 2.2028985507246377,
"grad_norm": 0.2652744948863983,
"learning_rate": 3.7222043184015473e-06,
"loss": 0.0579,
"step": 2280
},
{
"epoch": 2.207729468599034,
"grad_norm": 0.34323185682296753,
"learning_rate": 3.699645504350628e-06,
"loss": 0.072,
"step": 2285
},
{
"epoch": 2.21256038647343,
"grad_norm": 0.3072277903556824,
"learning_rate": 3.6770866902997096e-06,
"loss": 0.0676,
"step": 2290
},
{
"epoch": 2.217391304347826,
"grad_norm": 0.27712109684944153,
"learning_rate": 3.6545278762487916e-06,
"loss": 0.0699,
"step": 2295
},
{
"epoch": 2.2222222222222223,
"grad_norm": 0.2862177789211273,
"learning_rate": 3.631969062197873e-06,
"loss": 0.0643,
"step": 2300
},
{
"epoch": 2.2270531400966185,
"grad_norm": 0.2914809286594391,
"learning_rate": 3.6094102481469547e-06,
"loss": 0.0702,
"step": 2305
},
{
"epoch": 2.2318840579710146,
"grad_norm": 0.19755889475345612,
"learning_rate": 3.586851434096036e-06,
"loss": 0.0817,
"step": 2310
},
{
"epoch": 2.236714975845411,
"grad_norm": 0.25922340154647827,
"learning_rate": 3.5642926200451175e-06,
"loss": 0.0602,
"step": 2315
},
{
"epoch": 2.241545893719807,
"grad_norm": 0.30358242988586426,
"learning_rate": 3.541733805994199e-06,
"loss": 0.0725,
"step": 2320
},
{
"epoch": 2.246376811594203,
"grad_norm": 0.2505339980125427,
"learning_rate": 3.519174991943281e-06,
"loss": 0.079,
"step": 2325
},
{
"epoch": 2.2512077294685993,
"grad_norm": 0.2911323308944702,
"learning_rate": 3.496616177892362e-06,
"loss": 0.0673,
"step": 2330
},
{
"epoch": 2.2560386473429954,
"grad_norm": 0.3253360092639923,
"learning_rate": 3.4740573638414437e-06,
"loss": 0.0776,
"step": 2335
},
{
"epoch": 2.260869565217391,
"grad_norm": 0.2546384036540985,
"learning_rate": 3.4514985497905253e-06,
"loss": 0.0689,
"step": 2340
},
{
"epoch": 2.2657004830917873,
"grad_norm": 0.29095250368118286,
"learning_rate": 3.428939735739607e-06,
"loss": 0.0812,
"step": 2345
},
{
"epoch": 2.2705314009661834,
"grad_norm": 0.29789912700653076,
"learning_rate": 3.4063809216886884e-06,
"loss": 0.087,
"step": 2350
},
{
"epoch": 2.2753623188405796,
"grad_norm": 0.23721112310886383,
"learning_rate": 3.38382210763777e-06,
"loss": 0.075,
"step": 2355
},
{
"epoch": 2.2801932367149758,
"grad_norm": 0.2618652284145355,
"learning_rate": 3.3612632935868516e-06,
"loss": 0.0781,
"step": 2360
},
{
"epoch": 2.285024154589372,
"grad_norm": 0.3185523748397827,
"learning_rate": 3.3387044795359327e-06,
"loss": 0.0865,
"step": 2365
},
{
"epoch": 2.289855072463768,
"grad_norm": 0.30211564898490906,
"learning_rate": 3.3161456654850143e-06,
"loss": 0.0755,
"step": 2370
},
{
"epoch": 2.2946859903381642,
"grad_norm": 0.18218393623828888,
"learning_rate": 3.293586851434096e-06,
"loss": 0.0695,
"step": 2375
},
{
"epoch": 2.2995169082125604,
"grad_norm": 0.20001597702503204,
"learning_rate": 3.2710280373831774e-06,
"loss": 0.0744,
"step": 2380
},
{
"epoch": 2.3043478260869565,
"grad_norm": 0.37984150648117065,
"learning_rate": 3.248469223332259e-06,
"loss": 0.0585,
"step": 2385
},
{
"epoch": 2.3091787439613527,
"grad_norm": 0.31228166818618774,
"learning_rate": 3.2259104092813406e-06,
"loss": 0.0731,
"step": 2390
},
{
"epoch": 2.314009661835749,
"grad_norm": 0.27851906418800354,
"learning_rate": 3.203351595230422e-06,
"loss": 0.0767,
"step": 2395
},
{
"epoch": 2.318840579710145,
"grad_norm": 0.22976937890052795,
"learning_rate": 3.1807927811795033e-06,
"loss": 0.0738,
"step": 2400
},
{
"epoch": 2.323671497584541,
"grad_norm": 0.24843037128448486,
"learning_rate": 3.1582339671285853e-06,
"loss": 0.0792,
"step": 2405
},
{
"epoch": 2.3285024154589373,
"grad_norm": 0.23123487830162048,
"learning_rate": 3.135675153077667e-06,
"loss": 0.0752,
"step": 2410
},
{
"epoch": 2.3333333333333335,
"grad_norm": 0.23363561928272247,
"learning_rate": 3.113116339026748e-06,
"loss": 0.0693,
"step": 2415
},
{
"epoch": 2.3381642512077296,
"grad_norm": 0.2371598780155182,
"learning_rate": 3.09055752497583e-06,
"loss": 0.0781,
"step": 2420
},
{
"epoch": 2.342995169082126,
"grad_norm": 0.320534884929657,
"learning_rate": 3.067998710924911e-06,
"loss": 0.0635,
"step": 2425
},
{
"epoch": 2.3478260869565215,
"grad_norm": 0.2920200824737549,
"learning_rate": 3.0454398968739927e-06,
"loss": 0.0771,
"step": 2430
},
{
"epoch": 2.3526570048309177,
"grad_norm": 0.32089921832084656,
"learning_rate": 3.0228810828230747e-06,
"loss": 0.0733,
"step": 2435
},
{
"epoch": 2.357487922705314,
"grad_norm": 0.2733156979084015,
"learning_rate": 3.000322268772156e-06,
"loss": 0.0681,
"step": 2440
},
{
"epoch": 2.36231884057971,
"grad_norm": 0.24564507603645325,
"learning_rate": 2.9777634547212374e-06,
"loss": 0.0771,
"step": 2445
},
{
"epoch": 2.367149758454106,
"grad_norm": 0.24026136100292206,
"learning_rate": 2.955204640670319e-06,
"loss": 0.0748,
"step": 2450
},
{
"epoch": 2.3719806763285023,
"grad_norm": 0.20703287422657013,
"learning_rate": 2.9326458266194006e-06,
"loss": 0.0688,
"step": 2455
},
{
"epoch": 2.3768115942028984,
"grad_norm": 0.18269629776477814,
"learning_rate": 2.910087012568482e-06,
"loss": 0.0728,
"step": 2460
},
{
"epoch": 2.3816425120772946,
"grad_norm": 0.3421408236026764,
"learning_rate": 2.8875281985175637e-06,
"loss": 0.0679,
"step": 2465
},
{
"epoch": 2.3864734299516908,
"grad_norm": 0.4087986350059509,
"learning_rate": 2.8649693844666453e-06,
"loss": 0.0791,
"step": 2470
},
{
"epoch": 2.391304347826087,
"grad_norm": 0.2629115879535675,
"learning_rate": 2.8424105704157264e-06,
"loss": 0.074,
"step": 2475
},
{
"epoch": 2.396135265700483,
"grad_norm": 0.2295183390378952,
"learning_rate": 2.8198517563648084e-06,
"loss": 0.0739,
"step": 2480
},
{
"epoch": 2.4009661835748792,
"grad_norm": 0.31765657663345337,
"learning_rate": 2.79729294231389e-06,
"loss": 0.0708,
"step": 2485
},
{
"epoch": 2.4057971014492754,
"grad_norm": 0.31528520584106445,
"learning_rate": 2.774734128262971e-06,
"loss": 0.0673,
"step": 2490
},
{
"epoch": 2.4106280193236715,
"grad_norm": 0.2358902543783188,
"learning_rate": 2.752175314212053e-06,
"loss": 0.0543,
"step": 2495
},
{
"epoch": 2.4154589371980677,
"grad_norm": 0.2725466787815094,
"learning_rate": 2.7296165001611343e-06,
"loss": 0.0703,
"step": 2500
},
{
"epoch": 2.420289855072464,
"grad_norm": 0.24531903862953186,
"learning_rate": 2.707057686110216e-06,
"loss": 0.0715,
"step": 2505
},
{
"epoch": 2.42512077294686,
"grad_norm": 0.29307085275650024,
"learning_rate": 2.6844988720592974e-06,
"loss": 0.0752,
"step": 2510
},
{
"epoch": 2.429951690821256,
"grad_norm": 0.2959176003932953,
"learning_rate": 2.661940058008379e-06,
"loss": 0.0685,
"step": 2515
},
{
"epoch": 2.4347826086956523,
"grad_norm": 0.2573854923248291,
"learning_rate": 2.6393812439574605e-06,
"loss": 0.0664,
"step": 2520
},
{
"epoch": 2.4396135265700485,
"grad_norm": 0.3154689371585846,
"learning_rate": 2.616822429906542e-06,
"loss": 0.0615,
"step": 2525
},
{
"epoch": 2.4444444444444446,
"grad_norm": 0.21446138620376587,
"learning_rate": 2.5942636158556237e-06,
"loss": 0.0635,
"step": 2530
},
{
"epoch": 2.449275362318841,
"grad_norm": 0.3040371537208557,
"learning_rate": 2.571704801804705e-06,
"loss": 0.0788,
"step": 2535
},
{
"epoch": 2.454106280193237,
"grad_norm": 0.2636314034461975,
"learning_rate": 2.5491459877537864e-06,
"loss": 0.072,
"step": 2540
},
{
"epoch": 2.4589371980676327,
"grad_norm": 0.26327863335609436,
"learning_rate": 2.5265871737028684e-06,
"loss": 0.0777,
"step": 2545
},
{
"epoch": 2.463768115942029,
"grad_norm": 0.28980839252471924,
"learning_rate": 2.5040283596519495e-06,
"loss": 0.0694,
"step": 2550
},
{
"epoch": 2.468599033816425,
"grad_norm": 0.2889906167984009,
"learning_rate": 2.481469545601031e-06,
"loss": 0.0703,
"step": 2555
},
{
"epoch": 2.473429951690821,
"grad_norm": 0.2539612650871277,
"learning_rate": 2.4589107315501127e-06,
"loss": 0.0894,
"step": 2560
},
{
"epoch": 2.4782608695652173,
"grad_norm": 0.25100603699684143,
"learning_rate": 2.4363519174991943e-06,
"loss": 0.0649,
"step": 2565
},
{
"epoch": 2.4830917874396135,
"grad_norm": 0.24855615198612213,
"learning_rate": 2.413793103448276e-06,
"loss": 0.0687,
"step": 2570
},
{
"epoch": 2.4879227053140096,
"grad_norm": 0.2766883671283722,
"learning_rate": 2.3912342893973574e-06,
"loss": 0.0712,
"step": 2575
},
{
"epoch": 2.4927536231884058,
"grad_norm": 0.24230973422527313,
"learning_rate": 2.368675475346439e-06,
"loss": 0.0792,
"step": 2580
},
{
"epoch": 2.497584541062802,
"grad_norm": 0.2981168031692505,
"learning_rate": 2.34611666129552e-06,
"loss": 0.0724,
"step": 2585
},
{
"epoch": 2.502415458937198,
"grad_norm": 0.26249799132347107,
"learning_rate": 2.323557847244602e-06,
"loss": 0.0727,
"step": 2590
},
{
"epoch": 2.5072463768115942,
"grad_norm": 0.23193541169166565,
"learning_rate": 2.3009990331936837e-06,
"loss": 0.0658,
"step": 2595
},
{
"epoch": 2.5120772946859904,
"grad_norm": 0.3478648364543915,
"learning_rate": 2.278440219142765e-06,
"loss": 0.0766,
"step": 2600
},
{
"epoch": 2.5169082125603865,
"grad_norm": 0.2009768933057785,
"learning_rate": 2.255881405091847e-06,
"loss": 0.0735,
"step": 2605
},
{
"epoch": 2.5217391304347827,
"grad_norm": 0.2750122547149658,
"learning_rate": 2.233322591040928e-06,
"loss": 0.0778,
"step": 2610
},
{
"epoch": 2.526570048309179,
"grad_norm": 0.22165286540985107,
"learning_rate": 2.2107637769900095e-06,
"loss": 0.0656,
"step": 2615
},
{
"epoch": 2.531400966183575,
"grad_norm": 0.26584914326667786,
"learning_rate": 2.1882049629390915e-06,
"loss": 0.0723,
"step": 2620
},
{
"epoch": 2.536231884057971,
"grad_norm": 0.30248183012008667,
"learning_rate": 2.1656461488881727e-06,
"loss": 0.0647,
"step": 2625
},
{
"epoch": 2.541062801932367,
"grad_norm": 0.2667482793331146,
"learning_rate": 2.1430873348372542e-06,
"loss": 0.0694,
"step": 2630
},
{
"epoch": 2.545893719806763,
"grad_norm": 0.2767150402069092,
"learning_rate": 2.120528520786336e-06,
"loss": 0.0818,
"step": 2635
},
{
"epoch": 2.550724637681159,
"grad_norm": 0.30463531613349915,
"learning_rate": 2.0979697067354174e-06,
"loss": 0.0684,
"step": 2640
},
{
"epoch": 2.5555555555555554,
"grad_norm": 0.2667052447795868,
"learning_rate": 2.0754108926844985e-06,
"loss": 0.068,
"step": 2645
},
{
"epoch": 2.5603864734299515,
"grad_norm": 0.37567076086997986,
"learning_rate": 2.0528520786335805e-06,
"loss": 0.0578,
"step": 2650
},
{
"epoch": 2.5652173913043477,
"grad_norm": 0.24227222800254822,
"learning_rate": 2.030293264582662e-06,
"loss": 0.0748,
"step": 2655
},
{
"epoch": 2.570048309178744,
"grad_norm": 0.3247409760951996,
"learning_rate": 2.0077344505317432e-06,
"loss": 0.073,
"step": 2660
},
{
"epoch": 2.57487922705314,
"grad_norm": 0.30261141061782837,
"learning_rate": 1.9851756364808252e-06,
"loss": 0.0722,
"step": 2665
},
{
"epoch": 2.579710144927536,
"grad_norm": 0.2872192859649658,
"learning_rate": 1.9626168224299064e-06,
"loss": 0.0728,
"step": 2670
},
{
"epoch": 2.5845410628019323,
"grad_norm": 0.3606136441230774,
"learning_rate": 1.940058008378988e-06,
"loss": 0.0735,
"step": 2675
},
{
"epoch": 2.5893719806763285,
"grad_norm": 0.21871723234653473,
"learning_rate": 1.9174991943280695e-06,
"loss": 0.0682,
"step": 2680
},
{
"epoch": 2.5942028985507246,
"grad_norm": 0.2941882312297821,
"learning_rate": 1.894940380277151e-06,
"loss": 0.0722,
"step": 2685
},
{
"epoch": 2.5990338164251208,
"grad_norm": 0.31706181168556213,
"learning_rate": 1.8723815662262327e-06,
"loss": 0.0698,
"step": 2690
},
{
"epoch": 2.603864734299517,
"grad_norm": 0.25599217414855957,
"learning_rate": 1.849822752175314e-06,
"loss": 0.0691,
"step": 2695
},
{
"epoch": 2.608695652173913,
"grad_norm": 0.2954462468624115,
"learning_rate": 1.8272639381243958e-06,
"loss": 0.0831,
"step": 2700
},
{
"epoch": 2.6135265700483092,
"grad_norm": 0.31768399477005005,
"learning_rate": 1.8047051240734774e-06,
"loss": 0.0684,
"step": 2705
},
{
"epoch": 2.6183574879227054,
"grad_norm": 0.2380971759557724,
"learning_rate": 1.7821463100225587e-06,
"loss": 0.0604,
"step": 2710
},
{
"epoch": 2.6231884057971016,
"grad_norm": 0.2857172191143036,
"learning_rate": 1.7595874959716405e-06,
"loss": 0.0648,
"step": 2715
},
{
"epoch": 2.6280193236714977,
"grad_norm": 0.2866944968700409,
"learning_rate": 1.7370286819207219e-06,
"loss": 0.067,
"step": 2720
},
{
"epoch": 2.632850241545894,
"grad_norm": 0.3259107172489166,
"learning_rate": 1.7144698678698034e-06,
"loss": 0.0789,
"step": 2725
},
{
"epoch": 2.63768115942029,
"grad_norm": 0.23563902080059052,
"learning_rate": 1.691911053818885e-06,
"loss": 0.0826,
"step": 2730
},
{
"epoch": 2.642512077294686,
"grad_norm": 0.33754512667655945,
"learning_rate": 1.6693522397679664e-06,
"loss": 0.0756,
"step": 2735
},
{
"epoch": 2.6473429951690823,
"grad_norm": 0.22349333763122559,
"learning_rate": 1.646793425717048e-06,
"loss": 0.0773,
"step": 2740
},
{
"epoch": 2.6521739130434785,
"grad_norm": 0.42616990208625793,
"learning_rate": 1.6242346116661295e-06,
"loss": 0.0676,
"step": 2745
},
{
"epoch": 2.6570048309178746,
"grad_norm": 0.27920448780059814,
"learning_rate": 1.601675797615211e-06,
"loss": 0.07,
"step": 2750
},
{
"epoch": 2.661835748792271,
"grad_norm": 0.34114235639572144,
"learning_rate": 1.5791169835642926e-06,
"loss": 0.0807,
"step": 2755
},
{
"epoch": 2.6666666666666665,
"grad_norm": 0.2515537142753601,
"learning_rate": 1.556558169513374e-06,
"loss": 0.0739,
"step": 2760
},
{
"epoch": 2.6714975845410627,
"grad_norm": 0.24267147481441498,
"learning_rate": 1.5339993554624556e-06,
"loss": 0.0727,
"step": 2765
},
{
"epoch": 2.676328502415459,
"grad_norm": 0.290988564491272,
"learning_rate": 1.5114405414115374e-06,
"loss": 0.0739,
"step": 2770
},
{
"epoch": 2.681159420289855,
"grad_norm": 0.3821360766887665,
"learning_rate": 1.4888817273606187e-06,
"loss": 0.0792,
"step": 2775
},
{
"epoch": 2.685990338164251,
"grad_norm": 0.284109890460968,
"learning_rate": 1.4663229133097003e-06,
"loss": 0.0767,
"step": 2780
},
{
"epoch": 2.6908212560386473,
"grad_norm": 0.303076833486557,
"learning_rate": 1.4437640992587819e-06,
"loss": 0.0714,
"step": 2785
},
{
"epoch": 2.6956521739130435,
"grad_norm": 0.37678495049476624,
"learning_rate": 1.4212052852078632e-06,
"loss": 0.0555,
"step": 2790
},
{
"epoch": 2.7004830917874396,
"grad_norm": 0.23108994960784912,
"learning_rate": 1.398646471156945e-06,
"loss": 0.0833,
"step": 2795
},
{
"epoch": 2.7053140096618358,
"grad_norm": 0.3246385157108307,
"learning_rate": 1.3760876571060266e-06,
"loss": 0.076,
"step": 2800
},
{
"epoch": 2.710144927536232,
"grad_norm": 0.2140025794506073,
"learning_rate": 1.353528843055108e-06,
"loss": 0.0791,
"step": 2805
},
{
"epoch": 2.714975845410628,
"grad_norm": 0.2923656404018402,
"learning_rate": 1.3309700290041895e-06,
"loss": 0.0892,
"step": 2810
},
{
"epoch": 2.7198067632850242,
"grad_norm": 0.2978055775165558,
"learning_rate": 1.308411214953271e-06,
"loss": 0.0647,
"step": 2815
},
{
"epoch": 2.7246376811594204,
"grad_norm": 0.2982514500617981,
"learning_rate": 1.2858524009023524e-06,
"loss": 0.0677,
"step": 2820
},
{
"epoch": 2.7294685990338166,
"grad_norm": 0.2721270024776459,
"learning_rate": 1.2632935868514342e-06,
"loss": 0.0633,
"step": 2825
},
{
"epoch": 2.7342995169082127,
"grad_norm": 0.2582114636898041,
"learning_rate": 1.2407347728005156e-06,
"loss": 0.0721,
"step": 2830
},
{
"epoch": 2.7391304347826084,
"grad_norm": 0.2242422103881836,
"learning_rate": 1.2181759587495971e-06,
"loss": 0.0694,
"step": 2835
},
{
"epoch": 2.7439613526570046,
"grad_norm": 0.2729090750217438,
"learning_rate": 1.1956171446986787e-06,
"loss": 0.0726,
"step": 2840
},
{
"epoch": 2.7487922705314007,
"grad_norm": 0.34203121066093445,
"learning_rate": 1.17305833064776e-06,
"loss": 0.0796,
"step": 2845
},
{
"epoch": 2.753623188405797,
"grad_norm": 0.30749765038490295,
"learning_rate": 1.1504995165968418e-06,
"loss": 0.07,
"step": 2850
},
{
"epoch": 2.758454106280193,
"grad_norm": 0.3750080168247223,
"learning_rate": 1.1279407025459234e-06,
"loss": 0.08,
"step": 2855
},
{
"epoch": 2.763285024154589,
"grad_norm": 0.32321617007255554,
"learning_rate": 1.1053818884950048e-06,
"loss": 0.082,
"step": 2860
},
{
"epoch": 2.7681159420289854,
"grad_norm": 0.25304415822029114,
"learning_rate": 1.0828230744440863e-06,
"loss": 0.076,
"step": 2865
},
{
"epoch": 2.7729468599033815,
"grad_norm": 0.30696550011634827,
"learning_rate": 1.060264260393168e-06,
"loss": 0.0703,
"step": 2870
},
{
"epoch": 2.7777777777777777,
"grad_norm": 0.3218288719654083,
"learning_rate": 1.0377054463422493e-06,
"loss": 0.0696,
"step": 2875
},
{
"epoch": 2.782608695652174,
"grad_norm": 0.2573774755001068,
"learning_rate": 1.015146632291331e-06,
"loss": 0.0711,
"step": 2880
},
{
"epoch": 2.78743961352657,
"grad_norm": 0.3438413143157959,
"learning_rate": 9.925878182404126e-07,
"loss": 0.0805,
"step": 2885
},
{
"epoch": 2.792270531400966,
"grad_norm": 0.3613496422767639,
"learning_rate": 9.70029004189494e-07,
"loss": 0.0742,
"step": 2890
},
{
"epoch": 2.7971014492753623,
"grad_norm": 0.2860325276851654,
"learning_rate": 9.474701901385755e-07,
"loss": 0.0735,
"step": 2895
},
{
"epoch": 2.8019323671497585,
"grad_norm": 0.240507572889328,
"learning_rate": 9.24911376087657e-07,
"loss": 0.0677,
"step": 2900
},
{
"epoch": 2.8067632850241546,
"grad_norm": 0.28737547993659973,
"learning_rate": 9.023525620367387e-07,
"loss": 0.0666,
"step": 2905
},
{
"epoch": 2.8115942028985508,
"grad_norm": 0.34197041392326355,
"learning_rate": 8.797937479858203e-07,
"loss": 0.0799,
"step": 2910
},
{
"epoch": 2.816425120772947,
"grad_norm": 0.326251745223999,
"learning_rate": 8.572349339349017e-07,
"loss": 0.0691,
"step": 2915
},
{
"epoch": 2.821256038647343,
"grad_norm": 0.42289331555366516,
"learning_rate": 8.346761198839832e-07,
"loss": 0.0746,
"step": 2920
},
{
"epoch": 2.8260869565217392,
"grad_norm": 0.28735774755477905,
"learning_rate": 8.121173058330648e-07,
"loss": 0.0782,
"step": 2925
},
{
"epoch": 2.8309178743961354,
"grad_norm": 0.29395702481269836,
"learning_rate": 7.895584917821463e-07,
"loss": 0.08,
"step": 2930
},
{
"epoch": 2.8357487922705316,
"grad_norm": 0.3306836187839508,
"learning_rate": 7.669996777312278e-07,
"loss": 0.0869,
"step": 2935
},
{
"epoch": 2.8405797101449277,
"grad_norm": 0.2740659713745117,
"learning_rate": 7.444408636803094e-07,
"loss": 0.064,
"step": 2940
},
{
"epoch": 2.845410628019324,
"grad_norm": 0.28304237127304077,
"learning_rate": 7.218820496293909e-07,
"loss": 0.0769,
"step": 2945
},
{
"epoch": 2.85024154589372,
"grad_norm": 0.3081373870372772,
"learning_rate": 6.993232355784725e-07,
"loss": 0.0783,
"step": 2950
},
{
"epoch": 2.855072463768116,
"grad_norm": 0.3063504099845886,
"learning_rate": 6.76764421527554e-07,
"loss": 0.0643,
"step": 2955
},
{
"epoch": 2.8599033816425123,
"grad_norm": 0.2641620635986328,
"learning_rate": 6.542056074766355e-07,
"loss": 0.0658,
"step": 2960
},
{
"epoch": 2.864734299516908,
"grad_norm": 0.3239176869392395,
"learning_rate": 6.316467934257171e-07,
"loss": 0.0677,
"step": 2965
},
{
"epoch": 2.869565217391304,
"grad_norm": 0.23815782368183136,
"learning_rate": 6.090879793747986e-07,
"loss": 0.0686,
"step": 2970
},
{
"epoch": 2.8743961352657004,
"grad_norm": 0.26518934965133667,
"learning_rate": 5.8652916532388e-07,
"loss": 0.073,
"step": 2975
},
{
"epoch": 2.8792270531400965,
"grad_norm": 0.2455345243215561,
"learning_rate": 5.639703512729617e-07,
"loss": 0.0664,
"step": 2980
},
{
"epoch": 2.8840579710144927,
"grad_norm": 0.2730591893196106,
"learning_rate": 5.414115372220432e-07,
"loss": 0.0745,
"step": 2985
},
{
"epoch": 2.888888888888889,
"grad_norm": 0.3046686351299286,
"learning_rate": 5.188527231711246e-07,
"loss": 0.0637,
"step": 2990
},
{
"epoch": 2.893719806763285,
"grad_norm": 0.26765045523643494,
"learning_rate": 4.962939091202063e-07,
"loss": 0.0818,
"step": 2995
},
{
"epoch": 2.898550724637681,
"grad_norm": 0.2611401677131653,
"learning_rate": 4.7373509506928777e-07,
"loss": 0.0871,
"step": 3000
},
{
"epoch": 2.9033816425120773,
"grad_norm": 0.3256029486656189,
"learning_rate": 4.5117628101836934e-07,
"loss": 0.0772,
"step": 3005
},
{
"epoch": 2.9082125603864735,
"grad_norm": 0.3779186010360718,
"learning_rate": 4.2861746696745086e-07,
"loss": 0.0709,
"step": 3010
},
{
"epoch": 2.9130434782608696,
"grad_norm": 0.248891681432724,
"learning_rate": 4.060586529165324e-07,
"loss": 0.0836,
"step": 3015
},
{
"epoch": 2.917874396135266,
"grad_norm": 0.27647843956947327,
"learning_rate": 3.834998388656139e-07,
"loss": 0.0636,
"step": 3020
},
{
"epoch": 2.922705314009662,
"grad_norm": 0.28876233100891113,
"learning_rate": 3.6094102481469546e-07,
"loss": 0.0648,
"step": 3025
},
{
"epoch": 2.927536231884058,
"grad_norm": 0.26836660504341125,
"learning_rate": 3.38382210763777e-07,
"loss": 0.0726,
"step": 3030
},
{
"epoch": 2.9323671497584543,
"grad_norm": 0.2655857503414154,
"learning_rate": 3.1582339671285855e-07,
"loss": 0.0736,
"step": 3035
},
{
"epoch": 2.9371980676328504,
"grad_norm": 0.30681997537612915,
"learning_rate": 2.9326458266194e-07,
"loss": 0.0688,
"step": 3040
},
{
"epoch": 2.942028985507246,
"grad_norm": 0.3034045994281769,
"learning_rate": 2.707057686110216e-07,
"loss": 0.0611,
"step": 3045
},
{
"epoch": 2.9468599033816423,
"grad_norm": 0.24807259440422058,
"learning_rate": 2.4814695456010315e-07,
"loss": 0.0782,
"step": 3050
},
{
"epoch": 2.9516908212560384,
"grad_norm": 0.34220463037490845,
"learning_rate": 2.2558814050918467e-07,
"loss": 0.0751,
"step": 3055
},
{
"epoch": 2.9565217391304346,
"grad_norm": 0.2882407009601593,
"learning_rate": 2.030293264582662e-07,
"loss": 0.0686,
"step": 3060
},
{
"epoch": 2.9613526570048307,
"grad_norm": 0.31148266792297363,
"learning_rate": 1.8047051240734773e-07,
"loss": 0.0668,
"step": 3065
},
{
"epoch": 2.966183574879227,
"grad_norm": 0.2847365736961365,
"learning_rate": 1.5791169835642927e-07,
"loss": 0.0785,
"step": 3070
},
{
"epoch": 2.971014492753623,
"grad_norm": 0.2872695028781891,
"learning_rate": 1.353528843055108e-07,
"loss": 0.0723,
"step": 3075
},
{
"epoch": 2.975845410628019,
"grad_norm": 0.24350111186504364,
"learning_rate": 1.1279407025459234e-07,
"loss": 0.0669,
"step": 3080
},
{
"epoch": 2.9806763285024154,
"grad_norm": 0.2746003270149231,
"learning_rate": 9.023525620367387e-08,
"loss": 0.0814,
"step": 3085
},
{
"epoch": 2.9855072463768115,
"grad_norm": 0.255521684885025,
"learning_rate": 6.76764421527554e-08,
"loss": 0.0774,
"step": 3090
},
{
"epoch": 2.9903381642512077,
"grad_norm": 0.35289525985717773,
"learning_rate": 4.511762810183693e-08,
"loss": 0.0645,
"step": 3095
},
{
"epoch": 2.995169082125604,
"grad_norm": 0.279884934425354,
"learning_rate": 2.2558814050918466e-08,
"loss": 0.0738,
"step": 3100
},
{
"epoch": 3.0,
"grad_norm": 0.4045591652393341,
"learning_rate": 0.0,
"loss": 0.0711,
"step": 3105
},
{
"epoch": 3.0,
"eval_runtime": 338.6263,
"eval_samples_per_second": 3.054,
"eval_steps_per_second": 0.384,
"step": 3105
}
],
"logging_steps": 5,
"max_steps": 3105,
"num_input_tokens_seen": 0,
"num_train_epochs": 3,
"save_steps": 500,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": true
},
"attributes": {}
}
},
"total_flos": 1.1255832139272192e+17,
"train_batch_size": 8,
"trial_name": null,
"trial_params": null
}