{ "best_metric": null, "best_model_checkpoint": null, "epoch": 3.0, "eval_steps": 500, "global_step": 3105, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.004830917874396135, "grad_norm": 0.4590633809566498, "learning_rate": 1.3986464711569449e-05, "loss": 3.9553, "step": 5 }, { "epoch": 0.00966183574879227, "grad_norm": 0.562667965888977, "learning_rate": 1.396390589751853e-05, "loss": 3.8462, "step": 10 }, { "epoch": 0.014492753623188406, "grad_norm": 0.5178580284118652, "learning_rate": 1.3941347083467613e-05, "loss": 3.7563, "step": 15 }, { "epoch": 0.01932367149758454, "grad_norm": 0.549095869064331, "learning_rate": 1.3918788269416693e-05, "loss": 3.5661, "step": 20 }, { "epoch": 0.024154589371980676, "grad_norm": 0.5863974094390869, "learning_rate": 1.3896229455365775e-05, "loss": 3.3998, "step": 25 }, { "epoch": 0.028985507246376812, "grad_norm": 0.6717987060546875, "learning_rate": 1.3873670641314857e-05, "loss": 3.2327, "step": 30 }, { "epoch": 0.033816425120772944, "grad_norm": 0.6742504835128784, "learning_rate": 1.3851111827263939e-05, "loss": 3.0419, "step": 35 }, { "epoch": 0.03864734299516908, "grad_norm": 0.7488217949867249, "learning_rate": 1.382855301321302e-05, "loss": 2.8092, "step": 40 }, { "epoch": 0.043478260869565216, "grad_norm": 0.7981260418891907, "learning_rate": 1.3805994199162101e-05, "loss": 2.5637, "step": 45 }, { "epoch": 0.04830917874396135, "grad_norm": 0.7837016582489014, "learning_rate": 1.3783435385111182e-05, "loss": 2.311, "step": 50 }, { "epoch": 0.05314009661835749, "grad_norm": 0.8491142988204956, "learning_rate": 1.3760876571060263e-05, "loss": 2.0814, "step": 55 }, { "epoch": 0.057971014492753624, "grad_norm": 0.7609491348266602, "learning_rate": 1.3738317757009345e-05, "loss": 1.7811, "step": 60 }, { "epoch": 0.06280193236714976, "grad_norm": 0.7346836924552917, "learning_rate": 1.3715758942958427e-05, "loss": 1.5668, "step": 65 }, { "epoch": 0.06763285024154589, "grad_norm": 0.7201610803604126, "learning_rate": 1.369320012890751e-05, "loss": 1.3152, "step": 70 }, { "epoch": 0.07246376811594203, "grad_norm": 0.6400141716003418, "learning_rate": 1.3670641314856591e-05, "loss": 1.0742, "step": 75 }, { "epoch": 0.07729468599033816, "grad_norm": 0.38841813802719116, "learning_rate": 1.3648082500805672e-05, "loss": 0.9316, "step": 80 }, { "epoch": 0.0821256038647343, "grad_norm": 0.4104098677635193, "learning_rate": 1.3625523686754754e-05, "loss": 0.8594, "step": 85 }, { "epoch": 0.08695652173913043, "grad_norm": 0.5033922791481018, "learning_rate": 1.3602964872703834e-05, "loss": 0.7873, "step": 90 }, { "epoch": 0.09178743961352658, "grad_norm": 0.3223589360713959, "learning_rate": 1.3580406058652916e-05, "loss": 0.7265, "step": 95 }, { "epoch": 0.0966183574879227, "grad_norm": 0.31837838888168335, "learning_rate": 1.3557847244601998e-05, "loss": 0.7056, "step": 100 }, { "epoch": 0.10144927536231885, "grad_norm": 0.35547807812690735, "learning_rate": 1.353528843055108e-05, "loss": 0.6684, "step": 105 }, { "epoch": 0.10628019323671498, "grad_norm": 0.3624265789985657, "learning_rate": 1.351272961650016e-05, "loss": 0.6424, "step": 110 }, { "epoch": 0.1111111111111111, "grad_norm": 0.37934672832489014, "learning_rate": 1.3490170802449242e-05, "loss": 0.6473, "step": 115 }, { "epoch": 0.11594202898550725, "grad_norm": 0.43373095989227295, "learning_rate": 1.3467611988398324e-05, "loss": 0.6108, "step": 120 }, { "epoch": 0.12077294685990338, "grad_norm": 0.46837344765663147, "learning_rate": 1.3445053174347406e-05, "loss": 0.5718, "step": 125 }, { "epoch": 0.12560386473429952, "grad_norm": 0.4655485153198242, "learning_rate": 1.3422494360296488e-05, "loss": 0.5618, "step": 130 }, { "epoch": 0.13043478260869565, "grad_norm": 0.5438677072525024, "learning_rate": 1.3399935546245569e-05, "loss": 0.5834, "step": 135 }, { "epoch": 0.13526570048309178, "grad_norm": 0.5986974239349365, "learning_rate": 1.3377376732194649e-05, "loss": 0.51, "step": 140 }, { "epoch": 0.14009661835748793, "grad_norm": 0.7286536693572998, "learning_rate": 1.3354817918143731e-05, "loss": 0.4704, "step": 145 }, { "epoch": 0.14492753623188406, "grad_norm": 0.9337557554244995, "learning_rate": 1.3332259104092813e-05, "loss": 0.4379, "step": 150 }, { "epoch": 0.1497584541062802, "grad_norm": 1.0971410274505615, "learning_rate": 1.3309700290041895e-05, "loss": 0.3994, "step": 155 }, { "epoch": 0.15458937198067632, "grad_norm": 1.5142974853515625, "learning_rate": 1.3287141475990977e-05, "loss": 0.3397, "step": 160 }, { "epoch": 0.15942028985507245, "grad_norm": 0.5735320448875427, "learning_rate": 1.3264582661940057e-05, "loss": 0.3047, "step": 165 }, { "epoch": 0.1642512077294686, "grad_norm": 0.31310656666755676, "learning_rate": 1.324202384788914e-05, "loss": 0.2761, "step": 170 }, { "epoch": 0.16908212560386474, "grad_norm": 0.3159743845462799, "learning_rate": 1.3219465033838221e-05, "loss": 0.2584, "step": 175 }, { "epoch": 0.17391304347826086, "grad_norm": 0.2747821509838104, "learning_rate": 1.3196906219787303e-05, "loss": 0.2696, "step": 180 }, { "epoch": 0.178743961352657, "grad_norm": 0.26819175481796265, "learning_rate": 1.3174347405736383e-05, "loss": 0.2633, "step": 185 }, { "epoch": 0.18357487922705315, "grad_norm": 0.27827367186546326, "learning_rate": 1.3151788591685465e-05, "loss": 0.2587, "step": 190 }, { "epoch": 0.18840579710144928, "grad_norm": 0.30673256516456604, "learning_rate": 1.3129229777634546e-05, "loss": 0.2749, "step": 195 }, { "epoch": 0.1932367149758454, "grad_norm": 0.28767552971839905, "learning_rate": 1.3106670963583628e-05, "loss": 0.2527, "step": 200 }, { "epoch": 0.19806763285024154, "grad_norm": 0.2788391709327698, "learning_rate": 1.308411214953271e-05, "loss": 0.2548, "step": 205 }, { "epoch": 0.2028985507246377, "grad_norm": 0.26774516701698303, "learning_rate": 1.3061553335481792e-05, "loss": 0.2426, "step": 210 }, { "epoch": 0.20772946859903382, "grad_norm": 0.3280729651451111, "learning_rate": 1.3038994521430874e-05, "loss": 0.2343, "step": 215 }, { "epoch": 0.21256038647342995, "grad_norm": 0.28210124373435974, "learning_rate": 1.3016435707379956e-05, "loss": 0.2385, "step": 220 }, { "epoch": 0.21739130434782608, "grad_norm": 0.2706020176410675, "learning_rate": 1.2993876893329036e-05, "loss": 0.2418, "step": 225 }, { "epoch": 0.2222222222222222, "grad_norm": 0.2814071476459503, "learning_rate": 1.2971318079278118e-05, "loss": 0.2309, "step": 230 }, { "epoch": 0.22705314009661837, "grad_norm": 0.311310738325119, "learning_rate": 1.2948759265227198e-05, "loss": 0.239, "step": 235 }, { "epoch": 0.2318840579710145, "grad_norm": 0.31521573662757874, "learning_rate": 1.292620045117628e-05, "loss": 0.2271, "step": 240 }, { "epoch": 0.23671497584541062, "grad_norm": 0.3365338146686554, "learning_rate": 1.2903641637125362e-05, "loss": 0.2418, "step": 245 }, { "epoch": 0.24154589371980675, "grad_norm": 0.32416385412216187, "learning_rate": 1.2881082823074444e-05, "loss": 0.2282, "step": 250 }, { "epoch": 0.2463768115942029, "grad_norm": 0.3272862434387207, "learning_rate": 1.2858524009023525e-05, "loss": 0.2319, "step": 255 }, { "epoch": 0.25120772946859904, "grad_norm": 0.34287795424461365, "learning_rate": 1.2835965194972607e-05, "loss": 0.2529, "step": 260 }, { "epoch": 0.2560386473429952, "grad_norm": 0.338498055934906, "learning_rate": 1.2813406380921689e-05, "loss": 0.2216, "step": 265 }, { "epoch": 0.2608695652173913, "grad_norm": 0.34228625893592834, "learning_rate": 1.279084756687077e-05, "loss": 0.2306, "step": 270 }, { "epoch": 0.26570048309178745, "grad_norm": 0.38022157549858093, "learning_rate": 1.2768288752819853e-05, "loss": 0.2091, "step": 275 }, { "epoch": 0.27053140096618356, "grad_norm": 0.35013625025749207, "learning_rate": 1.2745729938768935e-05, "loss": 0.2219, "step": 280 }, { "epoch": 0.2753623188405797, "grad_norm": 0.3915255665779114, "learning_rate": 1.2723171124718013e-05, "loss": 0.202, "step": 285 }, { "epoch": 0.28019323671497587, "grad_norm": 0.4278201758861542, "learning_rate": 1.2700612310667095e-05, "loss": 0.2223, "step": 290 }, { "epoch": 0.28502415458937197, "grad_norm": 0.4377511441707611, "learning_rate": 1.2678053496616177e-05, "loss": 0.2001, "step": 295 }, { "epoch": 0.2898550724637681, "grad_norm": 0.44731107354164124, "learning_rate": 1.265549468256526e-05, "loss": 0.1884, "step": 300 }, { "epoch": 0.2946859903381642, "grad_norm": 0.4644255042076111, "learning_rate": 1.2632935868514341e-05, "loss": 0.195, "step": 305 }, { "epoch": 0.2995169082125604, "grad_norm": 0.46685394644737244, "learning_rate": 1.2610377054463423e-05, "loss": 0.1867, "step": 310 }, { "epoch": 0.30434782608695654, "grad_norm": 0.484323650598526, "learning_rate": 1.2587818240412503e-05, "loss": 0.1855, "step": 315 }, { "epoch": 0.30917874396135264, "grad_norm": 0.4667232632637024, "learning_rate": 1.2565259426361585e-05, "loss": 0.1823, "step": 320 }, { "epoch": 0.3140096618357488, "grad_norm": 0.5028926134109497, "learning_rate": 1.2542700612310667e-05, "loss": 0.1726, "step": 325 }, { "epoch": 0.3188405797101449, "grad_norm": 0.5125951766967773, "learning_rate": 1.252014179825975e-05, "loss": 0.1709, "step": 330 }, { "epoch": 0.32367149758454106, "grad_norm": 0.4960808753967285, "learning_rate": 1.249758298420883e-05, "loss": 0.1539, "step": 335 }, { "epoch": 0.3285024154589372, "grad_norm": 0.42437031865119934, "learning_rate": 1.2475024170157912e-05, "loss": 0.1484, "step": 340 }, { "epoch": 0.3333333333333333, "grad_norm": 0.32370465993881226, "learning_rate": 1.2452465356106992e-05, "loss": 0.128, "step": 345 }, { "epoch": 0.33816425120772947, "grad_norm": 0.2940502166748047, "learning_rate": 1.2429906542056074e-05, "loss": 0.1152, "step": 350 }, { "epoch": 0.34299516908212563, "grad_norm": 0.3140239715576172, "learning_rate": 1.2407347728005156e-05, "loss": 0.1466, "step": 355 }, { "epoch": 0.34782608695652173, "grad_norm": 0.26790526509284973, "learning_rate": 1.2384788913954238e-05, "loss": 0.1148, "step": 360 }, { "epoch": 0.3526570048309179, "grad_norm": 0.276149183511734, "learning_rate": 1.236223009990332e-05, "loss": 0.0996, "step": 365 }, { "epoch": 0.357487922705314, "grad_norm": 0.24558521807193756, "learning_rate": 1.2339671285852402e-05, "loss": 0.105, "step": 370 }, { "epoch": 0.36231884057971014, "grad_norm": 0.31819969415664673, "learning_rate": 1.2317112471801482e-05, "loss": 0.0968, "step": 375 }, { "epoch": 0.3671497584541063, "grad_norm": 0.28777143359184265, "learning_rate": 1.2294553657750564e-05, "loss": 0.1029, "step": 380 }, { "epoch": 0.3719806763285024, "grad_norm": 0.26321807503700256, "learning_rate": 1.2271994843699645e-05, "loss": 0.1018, "step": 385 }, { "epoch": 0.37681159420289856, "grad_norm": 0.23004086315631866, "learning_rate": 1.2249436029648727e-05, "loss": 0.0912, "step": 390 }, { "epoch": 0.38164251207729466, "grad_norm": 0.21847793459892273, "learning_rate": 1.2226877215597809e-05, "loss": 0.0913, "step": 395 }, { "epoch": 0.3864734299516908, "grad_norm": 0.19117720425128937, "learning_rate": 1.220431840154689e-05, "loss": 0.0895, "step": 400 }, { "epoch": 0.391304347826087, "grad_norm": 0.2535386085510254, "learning_rate": 1.2181759587495971e-05, "loss": 0.1103, "step": 405 }, { "epoch": 0.3961352657004831, "grad_norm": 0.26542574167251587, "learning_rate": 1.2159200773445053e-05, "loss": 0.0923, "step": 410 }, { "epoch": 0.40096618357487923, "grad_norm": 0.20197734236717224, "learning_rate": 1.2136641959394135e-05, "loss": 0.0931, "step": 415 }, { "epoch": 0.4057971014492754, "grad_norm": 0.20673911273479462, "learning_rate": 1.2114083145343217e-05, "loss": 0.1138, "step": 420 }, { "epoch": 0.4106280193236715, "grad_norm": 0.24391110241413116, "learning_rate": 1.2091524331292299e-05, "loss": 0.094, "step": 425 }, { "epoch": 0.41545893719806765, "grad_norm": 0.2456451952457428, "learning_rate": 1.2068965517241379e-05, "loss": 0.1078, "step": 430 }, { "epoch": 0.42028985507246375, "grad_norm": 0.29903218150138855, "learning_rate": 1.204640670319046e-05, "loss": 0.0999, "step": 435 }, { "epoch": 0.4251207729468599, "grad_norm": 0.17596346139907837, "learning_rate": 1.2023847889139541e-05, "loss": 0.0995, "step": 440 }, { "epoch": 0.42995169082125606, "grad_norm": 0.14841659367084503, "learning_rate": 1.2001289075088623e-05, "loss": 0.0934, "step": 445 }, { "epoch": 0.43478260869565216, "grad_norm": 0.18399696052074432, "learning_rate": 1.1978730261037705e-05, "loss": 0.0967, "step": 450 }, { "epoch": 0.4396135265700483, "grad_norm": 0.1746302992105484, "learning_rate": 1.1956171446986787e-05, "loss": 0.0947, "step": 455 }, { "epoch": 0.4444444444444444, "grad_norm": 0.2423829287290573, "learning_rate": 1.1933612632935868e-05, "loss": 0.0936, "step": 460 }, { "epoch": 0.4492753623188406, "grad_norm": 0.15260176360607147, "learning_rate": 1.191105381888495e-05, "loss": 0.1034, "step": 465 }, { "epoch": 0.45410628019323673, "grad_norm": 0.2334187626838684, "learning_rate": 1.1888495004834032e-05, "loss": 0.1053, "step": 470 }, { "epoch": 0.45893719806763283, "grad_norm": 0.19356365501880646, "learning_rate": 1.1865936190783114e-05, "loss": 0.0842, "step": 475 }, { "epoch": 0.463768115942029, "grad_norm": 0.20395216345787048, "learning_rate": 1.1843377376732194e-05, "loss": 0.0792, "step": 480 }, { "epoch": 0.46859903381642515, "grad_norm": 0.1807161122560501, "learning_rate": 1.1820818562681276e-05, "loss": 0.0882, "step": 485 }, { "epoch": 0.47342995169082125, "grad_norm": 0.16710110008716583, "learning_rate": 1.1798259748630358e-05, "loss": 0.0822, "step": 490 }, { "epoch": 0.4782608695652174, "grad_norm": 0.1776697188615799, "learning_rate": 1.1775700934579438e-05, "loss": 0.0922, "step": 495 }, { "epoch": 0.4830917874396135, "grad_norm": 0.21817447245121002, "learning_rate": 1.175314212052852e-05, "loss": 0.0949, "step": 500 }, { "epoch": 0.48792270531400966, "grad_norm": 0.20692448318004608, "learning_rate": 1.1730583306477602e-05, "loss": 0.0907, "step": 505 }, { "epoch": 0.4927536231884058, "grad_norm": 0.1886768341064453, "learning_rate": 1.1708024492426684e-05, "loss": 0.0858, "step": 510 }, { "epoch": 0.4975845410628019, "grad_norm": 0.19374988973140717, "learning_rate": 1.1685465678375766e-05, "loss": 0.084, "step": 515 }, { "epoch": 0.5024154589371981, "grad_norm": 0.1982010304927826, "learning_rate": 1.1662906864324847e-05, "loss": 0.087, "step": 520 }, { "epoch": 0.5072463768115942, "grad_norm": 0.292267769575119, "learning_rate": 1.1640348050273929e-05, "loss": 0.0918, "step": 525 }, { "epoch": 0.5120772946859904, "grad_norm": 0.19581086933612823, "learning_rate": 1.1617789236223009e-05, "loss": 0.1012, "step": 530 }, { "epoch": 0.5169082125603864, "grad_norm": 0.1730077862739563, "learning_rate": 1.159523042217209e-05, "loss": 0.0853, "step": 535 }, { "epoch": 0.5217391304347826, "grad_norm": 0.20485533773899078, "learning_rate": 1.1572671608121173e-05, "loss": 0.093, "step": 540 }, { "epoch": 0.5265700483091788, "grad_norm": 0.2086704820394516, "learning_rate": 1.1550112794070255e-05, "loss": 0.0945, "step": 545 }, { "epoch": 0.5314009661835749, "grad_norm": 0.15911467373371124, "learning_rate": 1.1527553980019335e-05, "loss": 0.1034, "step": 550 }, { "epoch": 0.5362318840579711, "grad_norm": 0.2168796807527542, "learning_rate": 1.1504995165968417e-05, "loss": 0.0945, "step": 555 }, { "epoch": 0.5410628019323671, "grad_norm": 0.20228448510169983, "learning_rate": 1.1482436351917499e-05, "loss": 0.1029, "step": 560 }, { "epoch": 0.5458937198067633, "grad_norm": 0.2441129982471466, "learning_rate": 1.1459877537866581e-05, "loss": 0.0906, "step": 565 }, { "epoch": 0.5507246376811594, "grad_norm": 0.22443729639053345, "learning_rate": 1.1437318723815663e-05, "loss": 0.0994, "step": 570 }, { "epoch": 0.5555555555555556, "grad_norm": 0.18132899701595306, "learning_rate": 1.1414759909764745e-05, "loss": 0.0938, "step": 575 }, { "epoch": 0.5603864734299517, "grad_norm": 0.19448505342006683, "learning_rate": 1.1392201095713824e-05, "loss": 0.0835, "step": 580 }, { "epoch": 0.5652173913043478, "grad_norm": 0.23075686395168304, "learning_rate": 1.1369642281662906e-05, "loss": 0.0983, "step": 585 }, { "epoch": 0.5700483091787439, "grad_norm": 0.22883069515228271, "learning_rate": 1.1347083467611988e-05, "loss": 0.0787, "step": 590 }, { "epoch": 0.5748792270531401, "grad_norm": 0.23262719810009003, "learning_rate": 1.132452465356107e-05, "loss": 0.0939, "step": 595 }, { "epoch": 0.5797101449275363, "grad_norm": 0.20541128516197205, "learning_rate": 1.1301965839510152e-05, "loss": 0.0776, "step": 600 }, { "epoch": 0.5845410628019324, "grad_norm": 0.21663478016853333, "learning_rate": 1.1279407025459234e-05, "loss": 0.0918, "step": 605 }, { "epoch": 0.5893719806763285, "grad_norm": 0.22586220502853394, "learning_rate": 1.1256848211408314e-05, "loss": 0.0824, "step": 610 }, { "epoch": 0.5942028985507246, "grad_norm": 0.1860446035861969, "learning_rate": 1.1234289397357396e-05, "loss": 0.0853, "step": 615 }, { "epoch": 0.5990338164251208, "grad_norm": 0.195932075381279, "learning_rate": 1.1211730583306478e-05, "loss": 0.0818, "step": 620 }, { "epoch": 0.6038647342995169, "grad_norm": 0.19570867717266083, "learning_rate": 1.118917176925556e-05, "loss": 0.0859, "step": 625 }, { "epoch": 0.6086956521739131, "grad_norm": 0.16349905729293823, "learning_rate": 1.116661295520464e-05, "loss": 0.0938, "step": 630 }, { "epoch": 0.6135265700483091, "grad_norm": 0.1926320493221283, "learning_rate": 1.1144054141153722e-05, "loss": 0.0846, "step": 635 }, { "epoch": 0.6183574879227053, "grad_norm": 0.19020161032676697, "learning_rate": 1.1121495327102803e-05, "loss": 0.086, "step": 640 }, { "epoch": 0.6231884057971014, "grad_norm": 0.20265896618366241, "learning_rate": 1.1098936513051885e-05, "loss": 0.0793, "step": 645 }, { "epoch": 0.6280193236714976, "grad_norm": 0.17398878931999207, "learning_rate": 1.1076377699000967e-05, "loss": 0.09, "step": 650 }, { "epoch": 0.6328502415458938, "grad_norm": 0.19005955755710602, "learning_rate": 1.1053818884950049e-05, "loss": 0.0792, "step": 655 }, { "epoch": 0.6376811594202898, "grad_norm": 0.18029935657978058, "learning_rate": 1.103126007089913e-05, "loss": 0.0923, "step": 660 }, { "epoch": 0.642512077294686, "grad_norm": 0.1881086826324463, "learning_rate": 1.1008701256848212e-05, "loss": 0.0936, "step": 665 }, { "epoch": 0.6473429951690821, "grad_norm": 0.269255667924881, "learning_rate": 1.0986142442797293e-05, "loss": 0.0916, "step": 670 }, { "epoch": 0.6521739130434783, "grad_norm": 0.18038909137248993, "learning_rate": 1.0963583628746373e-05, "loss": 0.0855, "step": 675 }, { "epoch": 0.6570048309178744, "grad_norm": 0.17990528047084808, "learning_rate": 1.0941024814695455e-05, "loss": 0.0926, "step": 680 }, { "epoch": 0.6618357487922706, "grad_norm": 0.2431405931711197, "learning_rate": 1.0918466000644537e-05, "loss": 0.0917, "step": 685 }, { "epoch": 0.6666666666666666, "grad_norm": 0.23243603110313416, "learning_rate": 1.0895907186593619e-05, "loss": 0.1046, "step": 690 }, { "epoch": 0.6714975845410628, "grad_norm": 0.20667722821235657, "learning_rate": 1.0873348372542701e-05, "loss": 0.0952, "step": 695 }, { "epoch": 0.6763285024154589, "grad_norm": 0.20045587420463562, "learning_rate": 1.0850789558491781e-05, "loss": 0.0839, "step": 700 }, { "epoch": 0.6811594202898551, "grad_norm": 0.15829257667064667, "learning_rate": 1.0828230744440863e-05, "loss": 0.0931, "step": 705 }, { "epoch": 0.6859903381642513, "grad_norm": 0.18778935074806213, "learning_rate": 1.0805671930389945e-05, "loss": 0.083, "step": 710 }, { "epoch": 0.6908212560386473, "grad_norm": 0.1949867457151413, "learning_rate": 1.0783113116339027e-05, "loss": 0.0797, "step": 715 }, { "epoch": 0.6956521739130435, "grad_norm": 0.3177832365036011, "learning_rate": 1.076055430228811e-05, "loss": 0.081, "step": 720 }, { "epoch": 0.7004830917874396, "grad_norm": 0.1714804619550705, "learning_rate": 1.073799548823719e-05, "loss": 0.0908, "step": 725 }, { "epoch": 0.7053140096618358, "grad_norm": 0.25471800565719604, "learning_rate": 1.071543667418627e-05, "loss": 0.0988, "step": 730 }, { "epoch": 0.7101449275362319, "grad_norm": 0.21141599118709564, "learning_rate": 1.0692877860135352e-05, "loss": 0.0944, "step": 735 }, { "epoch": 0.714975845410628, "grad_norm": 0.17371544241905212, "learning_rate": 1.0670319046084434e-05, "loss": 0.0836, "step": 740 }, { "epoch": 0.7198067632850241, "grad_norm": 0.19493460655212402, "learning_rate": 1.0647760232033516e-05, "loss": 0.0855, "step": 745 }, { "epoch": 0.7246376811594203, "grad_norm": 0.25241127610206604, "learning_rate": 1.0625201417982598e-05, "loss": 0.0844, "step": 750 }, { "epoch": 0.7294685990338164, "grad_norm": 0.2515096664428711, "learning_rate": 1.060264260393168e-05, "loss": 0.0883, "step": 755 }, { "epoch": 0.7342995169082126, "grad_norm": 0.15292327105998993, "learning_rate": 1.058008378988076e-05, "loss": 0.0792, "step": 760 }, { "epoch": 0.7391304347826086, "grad_norm": 0.20495273172855377, "learning_rate": 1.0557524975829842e-05, "loss": 0.0789, "step": 765 }, { "epoch": 0.7439613526570048, "grad_norm": 0.261168897151947, "learning_rate": 1.0534966161778924e-05, "loss": 0.0832, "step": 770 }, { "epoch": 0.748792270531401, "grad_norm": 0.28218600153923035, "learning_rate": 1.0512407347728004e-05, "loss": 0.1046, "step": 775 }, { "epoch": 0.7536231884057971, "grad_norm": 0.1737246960401535, "learning_rate": 1.0489848533677086e-05, "loss": 0.0722, "step": 780 }, { "epoch": 0.7584541062801933, "grad_norm": 0.24183641374111176, "learning_rate": 1.0467289719626168e-05, "loss": 0.0752, "step": 785 }, { "epoch": 0.7632850241545893, "grad_norm": 0.23685990273952484, "learning_rate": 1.0444730905575249e-05, "loss": 0.1037, "step": 790 }, { "epoch": 0.7681159420289855, "grad_norm": 0.22956091165542603, "learning_rate": 1.042217209152433e-05, "loss": 0.0761, "step": 795 }, { "epoch": 0.7729468599033816, "grad_norm": 0.18922095000743866, "learning_rate": 1.0399613277473413e-05, "loss": 0.0885, "step": 800 }, { "epoch": 0.7777777777777778, "grad_norm": 0.18391458690166473, "learning_rate": 1.0377054463422495e-05, "loss": 0.0859, "step": 805 }, { "epoch": 0.782608695652174, "grad_norm": 0.27890563011169434, "learning_rate": 1.0354495649371577e-05, "loss": 0.0924, "step": 810 }, { "epoch": 0.7874396135265701, "grad_norm": 0.22491532564163208, "learning_rate": 1.0331936835320657e-05, "loss": 0.0811, "step": 815 }, { "epoch": 0.7922705314009661, "grad_norm": 0.21809989213943481, "learning_rate": 1.0309378021269739e-05, "loss": 0.0784, "step": 820 }, { "epoch": 0.7971014492753623, "grad_norm": 0.27180778980255127, "learning_rate": 1.028681920721882e-05, "loss": 0.088, "step": 825 }, { "epoch": 0.8019323671497585, "grad_norm": 0.22717216610908508, "learning_rate": 1.0264260393167901e-05, "loss": 0.0755, "step": 830 }, { "epoch": 0.8067632850241546, "grad_norm": 0.19013768434524536, "learning_rate": 1.0241701579116983e-05, "loss": 0.0782, "step": 835 }, { "epoch": 0.8115942028985508, "grad_norm": 0.2028125375509262, "learning_rate": 1.0219142765066065e-05, "loss": 0.1034, "step": 840 }, { "epoch": 0.8164251207729468, "grad_norm": 0.24243703484535217, "learning_rate": 1.0196583951015146e-05, "loss": 0.0899, "step": 845 }, { "epoch": 0.821256038647343, "grad_norm": 0.21742011606693268, "learning_rate": 1.0174025136964228e-05, "loss": 0.0898, "step": 850 }, { "epoch": 0.8260869565217391, "grad_norm": 0.2000913769006729, "learning_rate": 1.015146632291331e-05, "loss": 0.0828, "step": 855 }, { "epoch": 0.8309178743961353, "grad_norm": 0.1902933269739151, "learning_rate": 1.0128907508862392e-05, "loss": 0.0934, "step": 860 }, { "epoch": 0.8357487922705314, "grad_norm": 0.20363092422485352, "learning_rate": 1.0106348694811474e-05, "loss": 0.081, "step": 865 }, { "epoch": 0.8405797101449275, "grad_norm": 0.2238474041223526, "learning_rate": 1.0083789880760556e-05, "loss": 0.0963, "step": 870 }, { "epoch": 0.8454106280193237, "grad_norm": 0.19188345968723297, "learning_rate": 1.0061231066709634e-05, "loss": 0.0809, "step": 875 }, { "epoch": 0.8502415458937198, "grad_norm": 0.18286921083927155, "learning_rate": 1.0038672252658716e-05, "loss": 0.0891, "step": 880 }, { "epoch": 0.855072463768116, "grad_norm": 0.19798459112644196, "learning_rate": 1.0016113438607798e-05, "loss": 0.0789, "step": 885 }, { "epoch": 0.8599033816425121, "grad_norm": 0.1937275230884552, "learning_rate": 9.99355462455688e-06, "loss": 0.0748, "step": 890 }, { "epoch": 0.8647342995169082, "grad_norm": 0.2399519830942154, "learning_rate": 9.970995810505962e-06, "loss": 0.0941, "step": 895 }, { "epoch": 0.8695652173913043, "grad_norm": 0.2435486763715744, "learning_rate": 9.948436996455044e-06, "loss": 0.078, "step": 900 }, { "epoch": 0.8743961352657005, "grad_norm": 0.22818566858768463, "learning_rate": 9.925878182404124e-06, "loss": 0.0813, "step": 905 }, { "epoch": 0.8792270531400966, "grad_norm": 0.19992083311080933, "learning_rate": 9.903319368353206e-06, "loss": 0.0757, "step": 910 }, { "epoch": 0.8840579710144928, "grad_norm": 0.24121499061584473, "learning_rate": 9.880760554302288e-06, "loss": 0.0878, "step": 915 }, { "epoch": 0.8888888888888888, "grad_norm": 0.2414121776819229, "learning_rate": 9.85820174025137e-06, "loss": 0.076, "step": 920 }, { "epoch": 0.893719806763285, "grad_norm": 0.17777179181575775, "learning_rate": 9.83564292620045e-06, "loss": 0.0903, "step": 925 }, { "epoch": 0.8985507246376812, "grad_norm": 0.23024319112300873, "learning_rate": 9.813084112149533e-06, "loss": 0.0965, "step": 930 }, { "epoch": 0.9033816425120773, "grad_norm": 0.20664696395397186, "learning_rate": 9.790525298098613e-06, "loss": 0.0707, "step": 935 }, { "epoch": 0.9082125603864735, "grad_norm": 0.1725015491247177, "learning_rate": 9.767966484047695e-06, "loss": 0.0821, "step": 940 }, { "epoch": 0.9130434782608695, "grad_norm": 0.2138936072587967, "learning_rate": 9.745407669996777e-06, "loss": 0.087, "step": 945 }, { "epoch": 0.9178743961352657, "grad_norm": 0.24879959225654602, "learning_rate": 9.722848855945859e-06, "loss": 0.0782, "step": 950 }, { "epoch": 0.9227053140096618, "grad_norm": 0.24507424235343933, "learning_rate": 9.700290041894941e-06, "loss": 0.0976, "step": 955 }, { "epoch": 0.927536231884058, "grad_norm": 0.21825656294822693, "learning_rate": 9.677731227844023e-06, "loss": 0.0846, "step": 960 }, { "epoch": 0.9323671497584541, "grad_norm": 0.22634956240653992, "learning_rate": 9.655172413793103e-06, "loss": 0.0833, "step": 965 }, { "epoch": 0.9371980676328503, "grad_norm": 0.20103132724761963, "learning_rate": 9.632613599742184e-06, "loss": 0.0739, "step": 970 }, { "epoch": 0.9420289855072463, "grad_norm": 0.19459068775177002, "learning_rate": 9.610054785691266e-06, "loss": 0.0841, "step": 975 }, { "epoch": 0.9468599033816425, "grad_norm": 0.18598385155200958, "learning_rate": 9.587495971640348e-06, "loss": 0.0734, "step": 980 }, { "epoch": 0.9516908212560387, "grad_norm": 0.24302643537521362, "learning_rate": 9.56493715758943e-06, "loss": 0.0893, "step": 985 }, { "epoch": 0.9565217391304348, "grad_norm": 0.23758938908576965, "learning_rate": 9.542378343538512e-06, "loss": 0.0763, "step": 990 }, { "epoch": 0.961352657004831, "grad_norm": 0.2180752158164978, "learning_rate": 9.519819529487592e-06, "loss": 0.0792, "step": 995 }, { "epoch": 0.966183574879227, "grad_norm": 0.22509507834911346, "learning_rate": 9.497260715436674e-06, "loss": 0.0682, "step": 1000 }, { "epoch": 0.9710144927536232, "grad_norm": 0.197494238615036, "learning_rate": 9.474701901385756e-06, "loss": 0.0903, "step": 1005 }, { "epoch": 0.9758454106280193, "grad_norm": 0.1817607879638672, "learning_rate": 9.452143087334838e-06, "loss": 0.0846, "step": 1010 }, { "epoch": 0.9806763285024155, "grad_norm": 0.19075438380241394, "learning_rate": 9.42958427328392e-06, "loss": 0.0846, "step": 1015 }, { "epoch": 0.9855072463768116, "grad_norm": 0.15087321400642395, "learning_rate": 9.407025459233e-06, "loss": 0.0753, "step": 1020 }, { "epoch": 0.9903381642512077, "grad_norm": 0.2226846069097519, "learning_rate": 9.38446664518208e-06, "loss": 0.0666, "step": 1025 }, { "epoch": 0.9951690821256038, "grad_norm": 0.30765634775161743, "learning_rate": 9.361907831131162e-06, "loss": 0.0779, "step": 1030 }, { "epoch": 1.0, "grad_norm": 0.4031631052494049, "learning_rate": 9.339349017080244e-06, "loss": 0.095, "step": 1035 }, { "epoch": 1.0, "eval_runtime": 338.4602, "eval_samples_per_second": 3.055, "eval_steps_per_second": 0.384, "step": 1035 }, { "epoch": 1.0048309178743962, "grad_norm": 0.2173592448234558, "learning_rate": 9.316790203029326e-06, "loss": 0.0803, "step": 1040 }, { "epoch": 1.0096618357487923, "grad_norm": 0.22241808474063873, "learning_rate": 9.294231388978408e-06, "loss": 0.0909, "step": 1045 }, { "epoch": 1.0144927536231885, "grad_norm": 0.2699296474456787, "learning_rate": 9.27167257492749e-06, "loss": 0.0842, "step": 1050 }, { "epoch": 1.0193236714975846, "grad_norm": 0.27080684900283813, "learning_rate": 9.24911376087657e-06, "loss": 0.0837, "step": 1055 }, { "epoch": 1.0241545893719808, "grad_norm": 0.1808546930551529, "learning_rate": 9.226554946825653e-06, "loss": 0.0816, "step": 1060 }, { "epoch": 1.0289855072463767, "grad_norm": 0.19763918220996857, "learning_rate": 9.203996132774735e-06, "loss": 0.0705, "step": 1065 }, { "epoch": 1.0338164251207729, "grad_norm": 0.21294108033180237, "learning_rate": 9.181437318723815e-06, "loss": 0.0726, "step": 1070 }, { "epoch": 1.038647342995169, "grad_norm": 0.19769993424415588, "learning_rate": 9.158878504672897e-06, "loss": 0.0743, "step": 1075 }, { "epoch": 1.0434782608695652, "grad_norm": 0.23708152770996094, "learning_rate": 9.136319690621979e-06, "loss": 0.0785, "step": 1080 }, { "epoch": 1.0483091787439613, "grad_norm": 0.232899010181427, "learning_rate": 9.11376087657106e-06, "loss": 0.093, "step": 1085 }, { "epoch": 1.0531400966183575, "grad_norm": 0.267478883266449, "learning_rate": 9.091202062520141e-06, "loss": 0.0901, "step": 1090 }, { "epoch": 1.0579710144927537, "grad_norm": 0.23761190474033356, "learning_rate": 9.068643248469223e-06, "loss": 0.0898, "step": 1095 }, { "epoch": 1.0628019323671498, "grad_norm": 0.19679813086986542, "learning_rate": 9.046084434418305e-06, "loss": 0.0877, "step": 1100 }, { "epoch": 1.067632850241546, "grad_norm": 0.20915761590003967, "learning_rate": 9.023525620367387e-06, "loss": 0.0731, "step": 1105 }, { "epoch": 1.0724637681159421, "grad_norm": 0.18718890845775604, "learning_rate": 9.000966806316468e-06, "loss": 0.0885, "step": 1110 }, { "epoch": 1.077294685990338, "grad_norm": 0.29885435104370117, "learning_rate": 8.97840799226555e-06, "loss": 0.0861, "step": 1115 }, { "epoch": 1.0821256038647342, "grad_norm": 0.16953594982624054, "learning_rate": 8.95584917821463e-06, "loss": 0.0862, "step": 1120 }, { "epoch": 1.0869565217391304, "grad_norm": 0.21629682183265686, "learning_rate": 8.933290364163712e-06, "loss": 0.1022, "step": 1125 }, { "epoch": 1.0917874396135265, "grad_norm": 0.26614615321159363, "learning_rate": 8.910731550112794e-06, "loss": 0.0825, "step": 1130 }, { "epoch": 1.0966183574879227, "grad_norm": 0.2642468810081482, "learning_rate": 8.888172736061876e-06, "loss": 0.0796, "step": 1135 }, { "epoch": 1.1014492753623188, "grad_norm": 0.16877882182598114, "learning_rate": 8.865613922010956e-06, "loss": 0.0726, "step": 1140 }, { "epoch": 1.106280193236715, "grad_norm": 0.2619246542453766, "learning_rate": 8.843055107960038e-06, "loss": 0.0835, "step": 1145 }, { "epoch": 1.1111111111111112, "grad_norm": 0.2424723505973816, "learning_rate": 8.82049629390912e-06, "loss": 0.0896, "step": 1150 }, { "epoch": 1.1159420289855073, "grad_norm": 0.20973582565784454, "learning_rate": 8.797937479858202e-06, "loss": 0.0751, "step": 1155 }, { "epoch": 1.1207729468599035, "grad_norm": 0.23418009281158447, "learning_rate": 8.775378665807284e-06, "loss": 0.077, "step": 1160 }, { "epoch": 1.1256038647342996, "grad_norm": 0.3117668032646179, "learning_rate": 8.752819851756366e-06, "loss": 0.0769, "step": 1165 }, { "epoch": 1.1304347826086956, "grad_norm": 0.25092753767967224, "learning_rate": 8.730261037705446e-06, "loss": 0.0729, "step": 1170 }, { "epoch": 1.1352657004830917, "grad_norm": 0.1926090270280838, "learning_rate": 8.707702223654527e-06, "loss": 0.0772, "step": 1175 }, { "epoch": 1.1400966183574879, "grad_norm": 0.27212995290756226, "learning_rate": 8.685143409603609e-06, "loss": 0.0712, "step": 1180 }, { "epoch": 1.144927536231884, "grad_norm": 0.2097581923007965, "learning_rate": 8.66258459555269e-06, "loss": 0.0767, "step": 1185 }, { "epoch": 1.1497584541062802, "grad_norm": 0.2765638828277588, "learning_rate": 8.640025781501773e-06, "loss": 0.0885, "step": 1190 }, { "epoch": 1.1545893719806763, "grad_norm": 0.28414320945739746, "learning_rate": 8.617466967450855e-06, "loss": 0.0631, "step": 1195 }, { "epoch": 1.1594202898550725, "grad_norm": 0.21230548620224, "learning_rate": 8.594908153399935e-06, "loss": 0.0848, "step": 1200 }, { "epoch": 1.1642512077294687, "grad_norm": 0.1870320439338684, "learning_rate": 8.572349339349017e-06, "loss": 0.0724, "step": 1205 }, { "epoch": 1.1690821256038648, "grad_norm": 0.23322801291942596, "learning_rate": 8.549790525298099e-06, "loss": 0.0846, "step": 1210 }, { "epoch": 1.1739130434782608, "grad_norm": 0.22248071432113647, "learning_rate": 8.52723171124718e-06, "loss": 0.0624, "step": 1215 }, { "epoch": 1.178743961352657, "grad_norm": 0.196117103099823, "learning_rate": 8.504672897196261e-06, "loss": 0.093, "step": 1220 }, { "epoch": 1.183574879227053, "grad_norm": 0.2212802767753601, "learning_rate": 8.482114083145343e-06, "loss": 0.0853, "step": 1225 }, { "epoch": 1.1884057971014492, "grad_norm": 0.17421841621398926, "learning_rate": 8.459555269094424e-06, "loss": 0.0747, "step": 1230 }, { "epoch": 1.1932367149758454, "grad_norm": 0.2645537853240967, "learning_rate": 8.436996455043506e-06, "loss": 0.0859, "step": 1235 }, { "epoch": 1.1980676328502415, "grad_norm": 0.27182498574256897, "learning_rate": 8.414437640992588e-06, "loss": 0.0943, "step": 1240 }, { "epoch": 1.2028985507246377, "grad_norm": 0.20389291644096375, "learning_rate": 8.39187882694167e-06, "loss": 0.0783, "step": 1245 }, { "epoch": 1.2077294685990339, "grad_norm": 0.3193868398666382, "learning_rate": 8.369320012890752e-06, "loss": 0.0917, "step": 1250 }, { "epoch": 1.21256038647343, "grad_norm": 0.2852030098438263, "learning_rate": 8.346761198839834e-06, "loss": 0.077, "step": 1255 }, { "epoch": 1.2173913043478262, "grad_norm": 0.256452739238739, "learning_rate": 8.324202384788914e-06, "loss": 0.0804, "step": 1260 }, { "epoch": 1.2222222222222223, "grad_norm": 0.209047332406044, "learning_rate": 8.301643570737994e-06, "loss": 0.0928, "step": 1265 }, { "epoch": 1.2270531400966185, "grad_norm": 0.21215900778770447, "learning_rate": 8.279084756687076e-06, "loss": 0.0788, "step": 1270 }, { "epoch": 1.2318840579710144, "grad_norm": 0.15550634264945984, "learning_rate": 8.256525942636158e-06, "loss": 0.0804, "step": 1275 }, { "epoch": 1.2367149758454106, "grad_norm": 0.16960662603378296, "learning_rate": 8.23396712858524e-06, "loss": 0.0737, "step": 1280 }, { "epoch": 1.2415458937198067, "grad_norm": 0.20484741032123566, "learning_rate": 8.211408314534322e-06, "loss": 0.0794, "step": 1285 }, { "epoch": 1.2463768115942029, "grad_norm": 0.24889996647834778, "learning_rate": 8.188849500483402e-06, "loss": 0.0903, "step": 1290 }, { "epoch": 1.251207729468599, "grad_norm": 0.23695576190948486, "learning_rate": 8.166290686432484e-06, "loss": 0.0826, "step": 1295 }, { "epoch": 1.2560386473429952, "grad_norm": 0.23449349403381348, "learning_rate": 8.143731872381566e-06, "loss": 0.0922, "step": 1300 }, { "epoch": 1.2608695652173914, "grad_norm": 0.2362452745437622, "learning_rate": 8.121173058330648e-06, "loss": 0.0716, "step": 1305 }, { "epoch": 1.2657004830917875, "grad_norm": 0.33280622959136963, "learning_rate": 8.09861424427973e-06, "loss": 0.0909, "step": 1310 }, { "epoch": 1.2705314009661834, "grad_norm": 0.22267523407936096, "learning_rate": 8.07605543022881e-06, "loss": 0.0816, "step": 1315 }, { "epoch": 1.2753623188405796, "grad_norm": 0.23176385462284088, "learning_rate": 8.053496616177891e-06, "loss": 0.091, "step": 1320 }, { "epoch": 1.2801932367149758, "grad_norm": 0.21951176226139069, "learning_rate": 8.030937802126973e-06, "loss": 0.0752, "step": 1325 }, { "epoch": 1.285024154589372, "grad_norm": 0.19361701607704163, "learning_rate": 8.008378988076055e-06, "loss": 0.0731, "step": 1330 }, { "epoch": 1.289855072463768, "grad_norm": 0.2284880429506302, "learning_rate": 7.985820174025137e-06, "loss": 0.0821, "step": 1335 }, { "epoch": 1.2946859903381642, "grad_norm": 0.28775539994239807, "learning_rate": 7.963261359974219e-06, "loss": 0.0865, "step": 1340 }, { "epoch": 1.2995169082125604, "grad_norm": 0.22133222222328186, "learning_rate": 7.940702545923301e-06, "loss": 0.0722, "step": 1345 }, { "epoch": 1.3043478260869565, "grad_norm": 0.2120644450187683, "learning_rate": 7.918143731872381e-06, "loss": 0.0642, "step": 1350 }, { "epoch": 1.3091787439613527, "grad_norm": 0.2922479212284088, "learning_rate": 7.895584917821463e-06, "loss": 0.0804, "step": 1355 }, { "epoch": 1.3140096618357489, "grad_norm": 0.2302795797586441, "learning_rate": 7.873026103770545e-06, "loss": 0.0726, "step": 1360 }, { "epoch": 1.318840579710145, "grad_norm": 0.28763264417648315, "learning_rate": 7.850467289719626e-06, "loss": 0.0976, "step": 1365 }, { "epoch": 1.3236714975845412, "grad_norm": 0.2106347233057022, "learning_rate": 7.827908475668708e-06, "loss": 0.0744, "step": 1370 }, { "epoch": 1.3285024154589373, "grad_norm": 0.23215855658054352, "learning_rate": 7.80534966161779e-06, "loss": 0.0916, "step": 1375 }, { "epoch": 1.3333333333333333, "grad_norm": 0.20885543525218964, "learning_rate": 7.78279084756687e-06, "loss": 0.0774, "step": 1380 }, { "epoch": 1.3381642512077294, "grad_norm": 0.20533576607704163, "learning_rate": 7.760232033515952e-06, "loss": 0.0751, "step": 1385 }, { "epoch": 1.3429951690821256, "grad_norm": 0.20719490945339203, "learning_rate": 7.737673219465034e-06, "loss": 0.0816, "step": 1390 }, { "epoch": 1.3478260869565217, "grad_norm": 0.19761165976524353, "learning_rate": 7.715114405414116e-06, "loss": 0.0804, "step": 1395 }, { "epoch": 1.3526570048309179, "grad_norm": 0.20369771122932434, "learning_rate": 7.692555591363198e-06, "loss": 0.0845, "step": 1400 }, { "epoch": 1.357487922705314, "grad_norm": 0.20887012779712677, "learning_rate": 7.669996777312278e-06, "loss": 0.0704, "step": 1405 }, { "epoch": 1.3623188405797102, "grad_norm": 0.29784587025642395, "learning_rate": 7.64743796326136e-06, "loss": 0.0866, "step": 1410 }, { "epoch": 1.3671497584541064, "grad_norm": 0.31036221981048584, "learning_rate": 7.62487914921044e-06, "loss": 0.0862, "step": 1415 }, { "epoch": 1.3719806763285023, "grad_norm": 0.25198647379875183, "learning_rate": 7.602320335159522e-06, "loss": 0.0825, "step": 1420 }, { "epoch": 1.3768115942028984, "grad_norm": 0.24515630304813385, "learning_rate": 7.579761521108604e-06, "loss": 0.0787, "step": 1425 }, { "epoch": 1.3816425120772946, "grad_norm": 0.22536733746528625, "learning_rate": 7.5572027070576855e-06, "loss": 0.0928, "step": 1430 }, { "epoch": 1.3864734299516908, "grad_norm": 0.23405781388282776, "learning_rate": 7.5346438930067675e-06, "loss": 0.0917, "step": 1435 }, { "epoch": 1.391304347826087, "grad_norm": 0.24243396520614624, "learning_rate": 7.5120850789558495e-06, "loss": 0.0675, "step": 1440 }, { "epoch": 1.396135265700483, "grad_norm": 0.2637854814529419, "learning_rate": 7.489526264904931e-06, "loss": 0.0863, "step": 1445 }, { "epoch": 1.4009661835748792, "grad_norm": 0.2491244375705719, "learning_rate": 7.466967450854013e-06, "loss": 0.0808, "step": 1450 }, { "epoch": 1.4057971014492754, "grad_norm": 0.23132705688476562, "learning_rate": 7.444408636803095e-06, "loss": 0.0797, "step": 1455 }, { "epoch": 1.4106280193236715, "grad_norm": 0.2987098693847656, "learning_rate": 7.421849822752176e-06, "loss": 0.0766, "step": 1460 }, { "epoch": 1.4154589371980677, "grad_norm": 0.23995457589626312, "learning_rate": 7.399291008701256e-06, "loss": 0.0764, "step": 1465 }, { "epoch": 1.4202898550724639, "grad_norm": 0.21818973124027252, "learning_rate": 7.376732194650338e-06, "loss": 0.09, "step": 1470 }, { "epoch": 1.42512077294686, "grad_norm": 0.19304029643535614, "learning_rate": 7.354173380599419e-06, "loss": 0.0759, "step": 1475 }, { "epoch": 1.4299516908212562, "grad_norm": 0.26081785559654236, "learning_rate": 7.331614566548501e-06, "loss": 0.0781, "step": 1480 }, { "epoch": 1.434782608695652, "grad_norm": 0.23940761387348175, "learning_rate": 7.309055752497583e-06, "loss": 0.085, "step": 1485 }, { "epoch": 1.4396135265700483, "grad_norm": 0.21909761428833008, "learning_rate": 7.286496938446664e-06, "loss": 0.0815, "step": 1490 }, { "epoch": 1.4444444444444444, "grad_norm": 0.16527162492275238, "learning_rate": 7.263938124395746e-06, "loss": 0.0698, "step": 1495 }, { "epoch": 1.4492753623188406, "grad_norm": 0.21258555352687836, "learning_rate": 7.241379310344828e-06, "loss": 0.0806, "step": 1500 }, { "epoch": 1.4541062801932367, "grad_norm": 0.18572719395160675, "learning_rate": 7.2188204962939095e-06, "loss": 0.0757, "step": 1505 }, { "epoch": 1.458937198067633, "grad_norm": 0.16916704177856445, "learning_rate": 7.19626168224299e-06, "loss": 0.07, "step": 1510 }, { "epoch": 1.463768115942029, "grad_norm": 0.289044588804245, "learning_rate": 7.173702868192072e-06, "loss": 0.0656, "step": 1515 }, { "epoch": 1.4685990338164252, "grad_norm": 0.27173757553100586, "learning_rate": 7.151144054141153e-06, "loss": 0.0704, "step": 1520 }, { "epoch": 1.4734299516908211, "grad_norm": 0.2929324209690094, "learning_rate": 7.128585240090235e-06, "loss": 0.0833, "step": 1525 }, { "epoch": 1.4782608695652173, "grad_norm": 0.2387627214193344, "learning_rate": 7.106026426039317e-06, "loss": 0.075, "step": 1530 }, { "epoch": 1.4830917874396135, "grad_norm": 0.3277483582496643, "learning_rate": 7.083467611988398e-06, "loss": 0.074, "step": 1535 }, { "epoch": 1.4879227053140096, "grad_norm": 0.23673392832279205, "learning_rate": 7.06090879793748e-06, "loss": 0.0697, "step": 1540 }, { "epoch": 1.4927536231884058, "grad_norm": 0.19109922647476196, "learning_rate": 7.038349983886562e-06, "loss": 0.0775, "step": 1545 }, { "epoch": 1.497584541062802, "grad_norm": 0.2344091832637787, "learning_rate": 7.015791169835643e-06, "loss": 0.0644, "step": 1550 }, { "epoch": 1.502415458937198, "grad_norm": 0.28420698642730713, "learning_rate": 6.993232355784724e-06, "loss": 0.0935, "step": 1555 }, { "epoch": 1.5072463768115942, "grad_norm": 0.2632888853549957, "learning_rate": 6.970673541733806e-06, "loss": 0.083, "step": 1560 }, { "epoch": 1.5120772946859904, "grad_norm": 0.2461112141609192, "learning_rate": 6.9481147276828875e-06, "loss": 0.0729, "step": 1565 }, { "epoch": 1.5169082125603865, "grad_norm": 0.2015853226184845, "learning_rate": 6.9255559136319695e-06, "loss": 0.0836, "step": 1570 }, { "epoch": 1.5217391304347827, "grad_norm": 0.2409069985151291, "learning_rate": 6.902997099581051e-06, "loss": 0.0797, "step": 1575 }, { "epoch": 1.5265700483091789, "grad_norm": 0.2014143019914627, "learning_rate": 6.880438285530132e-06, "loss": 0.09, "step": 1580 }, { "epoch": 1.531400966183575, "grad_norm": 0.2173725664615631, "learning_rate": 6.857879471479214e-06, "loss": 0.0648, "step": 1585 }, { "epoch": 1.5362318840579712, "grad_norm": 0.20185904204845428, "learning_rate": 6.835320657428296e-06, "loss": 0.0924, "step": 1590 }, { "epoch": 1.541062801932367, "grad_norm": 0.29456228017807007, "learning_rate": 6.812761843377377e-06, "loss": 0.0764, "step": 1595 }, { "epoch": 1.5458937198067633, "grad_norm": 0.22320301830768585, "learning_rate": 6.790203029326458e-06, "loss": 0.0754, "step": 1600 }, { "epoch": 1.5507246376811594, "grad_norm": 0.2032977044582367, "learning_rate": 6.76764421527554e-06, "loss": 0.089, "step": 1605 }, { "epoch": 1.5555555555555556, "grad_norm": 0.24341309070587158, "learning_rate": 6.745085401224621e-06, "loss": 0.0767, "step": 1610 }, { "epoch": 1.5603864734299517, "grad_norm": 0.22675780951976776, "learning_rate": 6.722526587173703e-06, "loss": 0.0811, "step": 1615 }, { "epoch": 1.5652173913043477, "grad_norm": 0.2980429232120514, "learning_rate": 6.699967773122784e-06, "loss": 0.0714, "step": 1620 }, { "epoch": 1.5700483091787438, "grad_norm": 0.2221527248620987, "learning_rate": 6.6774089590718655e-06, "loss": 0.0811, "step": 1625 }, { "epoch": 1.57487922705314, "grad_norm": 0.29102587699890137, "learning_rate": 6.6548501450209474e-06, "loss": 0.0717, "step": 1630 }, { "epoch": 1.5797101449275361, "grad_norm": 0.24565882980823517, "learning_rate": 6.632291330970029e-06, "loss": 0.0688, "step": 1635 }, { "epoch": 1.5845410628019323, "grad_norm": 0.2056146264076233, "learning_rate": 6.609732516919111e-06, "loss": 0.0739, "step": 1640 }, { "epoch": 1.5893719806763285, "grad_norm": 0.25777336955070496, "learning_rate": 6.587173702868192e-06, "loss": 0.0746, "step": 1645 }, { "epoch": 1.5942028985507246, "grad_norm": 0.20640453696250916, "learning_rate": 6.564614888817273e-06, "loss": 0.0757, "step": 1650 }, { "epoch": 1.5990338164251208, "grad_norm": 0.16480913758277893, "learning_rate": 6.542056074766355e-06, "loss": 0.0752, "step": 1655 }, { "epoch": 1.603864734299517, "grad_norm": 0.23693595826625824, "learning_rate": 6.519497260715437e-06, "loss": 0.0813, "step": 1660 }, { "epoch": 1.608695652173913, "grad_norm": 0.24152866005897522, "learning_rate": 6.496938446664518e-06, "loss": 0.0784, "step": 1665 }, { "epoch": 1.6135265700483092, "grad_norm": 0.23890602588653564, "learning_rate": 6.474379632613599e-06, "loss": 0.0813, "step": 1670 }, { "epoch": 1.6183574879227054, "grad_norm": 0.2686842679977417, "learning_rate": 6.451820818562681e-06, "loss": 0.0833, "step": 1675 }, { "epoch": 1.6231884057971016, "grad_norm": 0.2103358954191208, "learning_rate": 6.429262004511762e-06, "loss": 0.0906, "step": 1680 }, { "epoch": 1.6280193236714977, "grad_norm": 0.23938271403312683, "learning_rate": 6.406703190460844e-06, "loss": 0.0721, "step": 1685 }, { "epoch": 1.6328502415458939, "grad_norm": 0.1797400861978531, "learning_rate": 6.384144376409926e-06, "loss": 0.0678, "step": 1690 }, { "epoch": 1.6376811594202898, "grad_norm": 0.23905880749225616, "learning_rate": 6.361585562359007e-06, "loss": 0.0886, "step": 1695 }, { "epoch": 1.642512077294686, "grad_norm": 0.19138076901435852, "learning_rate": 6.339026748308089e-06, "loss": 0.0705, "step": 1700 }, { "epoch": 1.6473429951690821, "grad_norm": 0.19759757816791534, "learning_rate": 6.3164679342571706e-06, "loss": 0.0772, "step": 1705 }, { "epoch": 1.6521739130434783, "grad_norm": 0.22951267659664154, "learning_rate": 6.293909120206252e-06, "loss": 0.0701, "step": 1710 }, { "epoch": 1.6570048309178744, "grad_norm": 0.3317079246044159, "learning_rate": 6.271350306155334e-06, "loss": 0.0838, "step": 1715 }, { "epoch": 1.6618357487922706, "grad_norm": 0.2875089645385742, "learning_rate": 6.248791492104415e-06, "loss": 0.0711, "step": 1720 }, { "epoch": 1.6666666666666665, "grad_norm": 0.22365209460258484, "learning_rate": 6.226232678053496e-06, "loss": 0.0913, "step": 1725 }, { "epoch": 1.6714975845410627, "grad_norm": 0.26004156470298767, "learning_rate": 6.203673864002578e-06, "loss": 0.0749, "step": 1730 }, { "epoch": 1.6763285024154588, "grad_norm": 0.24029529094696045, "learning_rate": 6.18111504995166e-06, "loss": 0.0872, "step": 1735 }, { "epoch": 1.681159420289855, "grad_norm": 0.2503759562969208, "learning_rate": 6.158556235900741e-06, "loss": 0.0662, "step": 1740 }, { "epoch": 1.6859903381642511, "grad_norm": 0.24961721897125244, "learning_rate": 6.135997421849822e-06, "loss": 0.0805, "step": 1745 }, { "epoch": 1.6908212560386473, "grad_norm": 0.20291025936603546, "learning_rate": 6.113438607798904e-06, "loss": 0.0723, "step": 1750 }, { "epoch": 1.6956521739130435, "grad_norm": 0.24923092126846313, "learning_rate": 6.0908797937479854e-06, "loss": 0.0766, "step": 1755 }, { "epoch": 1.7004830917874396, "grad_norm": 0.3006664514541626, "learning_rate": 6.068320979697067e-06, "loss": 0.0767, "step": 1760 }, { "epoch": 1.7053140096618358, "grad_norm": 0.22034914791584015, "learning_rate": 6.045762165646149e-06, "loss": 0.0716, "step": 1765 }, { "epoch": 1.710144927536232, "grad_norm": 0.22951188683509827, "learning_rate": 6.02320335159523e-06, "loss": 0.0713, "step": 1770 }, { "epoch": 1.714975845410628, "grad_norm": 0.22270874679088593, "learning_rate": 6.000644537544312e-06, "loss": 0.0671, "step": 1775 }, { "epoch": 1.7198067632850242, "grad_norm": 0.23195502161979675, "learning_rate": 5.978085723493394e-06, "loss": 0.0864, "step": 1780 }, { "epoch": 1.7246376811594204, "grad_norm": 0.2421010136604309, "learning_rate": 5.955526909442475e-06, "loss": 0.0886, "step": 1785 }, { "epoch": 1.7294685990338166, "grad_norm": 0.20693883299827576, "learning_rate": 5.932968095391557e-06, "loss": 0.0715, "step": 1790 }, { "epoch": 1.7342995169082127, "grad_norm": 0.32137101888656616, "learning_rate": 5.910409281340638e-06, "loss": 0.0639, "step": 1795 }, { "epoch": 1.7391304347826086, "grad_norm": 0.21108365058898926, "learning_rate": 5.887850467289719e-06, "loss": 0.0786, "step": 1800 }, { "epoch": 1.7439613526570048, "grad_norm": 0.2952270805835724, "learning_rate": 5.865291653238801e-06, "loss": 0.0641, "step": 1805 }, { "epoch": 1.748792270531401, "grad_norm": 0.26709944009780884, "learning_rate": 5.842732839187883e-06, "loss": 0.0698, "step": 1810 }, { "epoch": 1.7536231884057971, "grad_norm": 0.30126988887786865, "learning_rate": 5.820174025136964e-06, "loss": 0.0773, "step": 1815 }, { "epoch": 1.7584541062801933, "grad_norm": 0.2402152717113495, "learning_rate": 5.797615211086045e-06, "loss": 0.0778, "step": 1820 }, { "epoch": 1.7632850241545892, "grad_norm": 0.19652244448661804, "learning_rate": 5.775056397035127e-06, "loss": 0.082, "step": 1825 }, { "epoch": 1.7681159420289854, "grad_norm": 0.21389204263687134, "learning_rate": 5.7524975829842086e-06, "loss": 0.0727, "step": 1830 }, { "epoch": 1.7729468599033815, "grad_norm": 0.2189796268939972, "learning_rate": 5.7299387689332905e-06, "loss": 0.0757, "step": 1835 }, { "epoch": 1.7777777777777777, "grad_norm": 0.28000935912132263, "learning_rate": 5.7073799548823725e-06, "loss": 0.0803, "step": 1840 }, { "epoch": 1.7826086956521738, "grad_norm": 0.24566881358623505, "learning_rate": 5.684821140831453e-06, "loss": 0.0815, "step": 1845 }, { "epoch": 1.78743961352657, "grad_norm": 0.22037634253501892, "learning_rate": 5.662262326780535e-06, "loss": 0.0871, "step": 1850 }, { "epoch": 1.7922705314009661, "grad_norm": 0.1990278661251068, "learning_rate": 5.639703512729617e-06, "loss": 0.0756, "step": 1855 }, { "epoch": 1.7971014492753623, "grad_norm": 0.3180176615715027, "learning_rate": 5.617144698678698e-06, "loss": 0.0735, "step": 1860 }, { "epoch": 1.8019323671497585, "grad_norm": 0.2075718492269516, "learning_rate": 5.59458588462778e-06, "loss": 0.0665, "step": 1865 }, { "epoch": 1.8067632850241546, "grad_norm": 0.2611768841743469, "learning_rate": 5.572027070576861e-06, "loss": 0.0873, "step": 1870 }, { "epoch": 1.8115942028985508, "grad_norm": 0.22146160900592804, "learning_rate": 5.549468256525942e-06, "loss": 0.0638, "step": 1875 }, { "epoch": 1.816425120772947, "grad_norm": 0.29287296533584595, "learning_rate": 5.526909442475024e-06, "loss": 0.0812, "step": 1880 }, { "epoch": 1.821256038647343, "grad_norm": 0.2280767410993576, "learning_rate": 5.504350628424106e-06, "loss": 0.0766, "step": 1885 }, { "epoch": 1.8260869565217392, "grad_norm": 0.20453138649463654, "learning_rate": 5.4817918143731865e-06, "loss": 0.0748, "step": 1890 }, { "epoch": 1.8309178743961354, "grad_norm": 0.2855188250541687, "learning_rate": 5.4592330003222685e-06, "loss": 0.0901, "step": 1895 }, { "epoch": 1.8357487922705316, "grad_norm": 0.21556098759174347, "learning_rate": 5.4366741862713505e-06, "loss": 0.0735, "step": 1900 }, { "epoch": 1.8405797101449275, "grad_norm": 0.3091937303543091, "learning_rate": 5.414115372220432e-06, "loss": 0.0636, "step": 1905 }, { "epoch": 1.8454106280193237, "grad_norm": 0.2939262390136719, "learning_rate": 5.391556558169514e-06, "loss": 0.0753, "step": 1910 }, { "epoch": 1.8502415458937198, "grad_norm": 0.2101174294948578, "learning_rate": 5.368997744118595e-06, "loss": 0.0714, "step": 1915 }, { "epoch": 1.855072463768116, "grad_norm": 0.2570497691631317, "learning_rate": 5.346438930067676e-06, "loss": 0.0877, "step": 1920 }, { "epoch": 1.8599033816425121, "grad_norm": 0.2754373848438263, "learning_rate": 5.323880116016758e-06, "loss": 0.0729, "step": 1925 }, { "epoch": 1.864734299516908, "grad_norm": 0.2952544391155243, "learning_rate": 5.30132130196584e-06, "loss": 0.0714, "step": 1930 }, { "epoch": 1.8695652173913042, "grad_norm": 0.2360425889492035, "learning_rate": 5.278762487914921e-06, "loss": 0.0711, "step": 1935 }, { "epoch": 1.8743961352657004, "grad_norm": 0.22847935557365417, "learning_rate": 5.256203673864002e-06, "loss": 0.07, "step": 1940 }, { "epoch": 1.8792270531400965, "grad_norm": 0.26060476899147034, "learning_rate": 5.233644859813084e-06, "loss": 0.086, "step": 1945 }, { "epoch": 1.8840579710144927, "grad_norm": 0.28593048453330994, "learning_rate": 5.211086045762165e-06, "loss": 0.0782, "step": 1950 }, { "epoch": 1.8888888888888888, "grad_norm": 0.2553214430809021, "learning_rate": 5.188527231711247e-06, "loss": 0.0689, "step": 1955 }, { "epoch": 1.893719806763285, "grad_norm": 0.38168102502822876, "learning_rate": 5.1659684176603285e-06, "loss": 0.0917, "step": 1960 }, { "epoch": 1.8985507246376812, "grad_norm": 0.22879190742969513, "learning_rate": 5.14340960360941e-06, "loss": 0.0833, "step": 1965 }, { "epoch": 1.9033816425120773, "grad_norm": 0.19676880538463593, "learning_rate": 5.120850789558492e-06, "loss": 0.0594, "step": 1970 }, { "epoch": 1.9082125603864735, "grad_norm": 0.36660292744636536, "learning_rate": 5.098291975507573e-06, "loss": 0.0932, "step": 1975 }, { "epoch": 1.9130434782608696, "grad_norm": 0.23486468195915222, "learning_rate": 5.075733161456655e-06, "loss": 0.0941, "step": 1980 }, { "epoch": 1.9178743961352658, "grad_norm": 0.2950279414653778, "learning_rate": 5.053174347405737e-06, "loss": 0.0796, "step": 1985 }, { "epoch": 1.922705314009662, "grad_norm": 0.1995108425617218, "learning_rate": 5.030615533354817e-06, "loss": 0.0766, "step": 1990 }, { "epoch": 1.927536231884058, "grad_norm": 0.3509507179260254, "learning_rate": 5.008056719303899e-06, "loss": 0.0718, "step": 1995 }, { "epoch": 1.9323671497584543, "grad_norm": 0.22868584096431732, "learning_rate": 4.985497905252981e-06, "loss": 0.0724, "step": 2000 }, { "epoch": 1.9371980676328504, "grad_norm": 0.270059734582901, "learning_rate": 4.962939091202062e-06, "loss": 0.0761, "step": 2005 }, { "epoch": 1.9420289855072463, "grad_norm": 0.24437829852104187, "learning_rate": 4.940380277151144e-06, "loss": 0.0729, "step": 2010 }, { "epoch": 1.9468599033816425, "grad_norm": 0.24446424841880798, "learning_rate": 4.917821463100225e-06, "loss": 0.0648, "step": 2015 }, { "epoch": 1.9516908212560387, "grad_norm": 0.21626543998718262, "learning_rate": 4.8952626490493065e-06, "loss": 0.0739, "step": 2020 }, { "epoch": 1.9565217391304348, "grad_norm": 0.20689117908477783, "learning_rate": 4.8727038349983885e-06, "loss": 0.0701, "step": 2025 }, { "epoch": 1.961352657004831, "grad_norm": 0.2660706043243408, "learning_rate": 4.8501450209474705e-06, "loss": 0.0571, "step": 2030 }, { "epoch": 1.966183574879227, "grad_norm": 0.24084658920764923, "learning_rate": 4.827586206896552e-06, "loss": 0.0764, "step": 2035 }, { "epoch": 1.971014492753623, "grad_norm": 0.2771299481391907, "learning_rate": 4.805027392845633e-06, "loss": 0.0738, "step": 2040 }, { "epoch": 1.9758454106280192, "grad_norm": 0.2248222976922989, "learning_rate": 4.782468578794715e-06, "loss": 0.0774, "step": 2045 }, { "epoch": 1.9806763285024154, "grad_norm": 0.22526535391807556, "learning_rate": 4.759909764743796e-06, "loss": 0.0678, "step": 2050 }, { "epoch": 1.9855072463768115, "grad_norm": 0.21107898652553558, "learning_rate": 4.737350950692878e-06, "loss": 0.1011, "step": 2055 }, { "epoch": 1.9903381642512077, "grad_norm": 0.22934384644031525, "learning_rate": 4.71479213664196e-06, "loss": 0.0715, "step": 2060 }, { "epoch": 1.9951690821256038, "grad_norm": 0.2517627775669098, "learning_rate": 4.69223332259104e-06, "loss": 0.0796, "step": 2065 }, { "epoch": 2.0, "grad_norm": 0.40475329756736755, "learning_rate": 4.669674508540122e-06, "loss": 0.0919, "step": 2070 }, { "epoch": 2.0, "eval_runtime": 339.1035, "eval_samples_per_second": 3.049, "eval_steps_per_second": 0.383, "step": 2070 }, { "epoch": 2.004830917874396, "grad_norm": 0.23014891147613525, "learning_rate": 4.647115694489204e-06, "loss": 0.0721, "step": 2075 }, { "epoch": 2.0096618357487923, "grad_norm": 0.292595773935318, "learning_rate": 4.624556880438285e-06, "loss": 0.0797, "step": 2080 }, { "epoch": 2.0144927536231885, "grad_norm": 0.2784234583377838, "learning_rate": 4.601998066387367e-06, "loss": 0.0783, "step": 2085 }, { "epoch": 2.0193236714975846, "grad_norm": 0.21615320444107056, "learning_rate": 4.5794392523364485e-06, "loss": 0.0794, "step": 2090 }, { "epoch": 2.024154589371981, "grad_norm": 0.30054816603660583, "learning_rate": 4.55688043828553e-06, "loss": 0.078, "step": 2095 }, { "epoch": 2.028985507246377, "grad_norm": 0.21918036043643951, "learning_rate": 4.534321624234612e-06, "loss": 0.0706, "step": 2100 }, { "epoch": 2.033816425120773, "grad_norm": 0.22675025463104248, "learning_rate": 4.511762810183694e-06, "loss": 0.0578, "step": 2105 }, { "epoch": 2.0386473429951693, "grad_norm": 0.3500133454799652, "learning_rate": 4.489203996132775e-06, "loss": 0.077, "step": 2110 }, { "epoch": 2.0434782608695654, "grad_norm": 0.2782948315143585, "learning_rate": 4.466645182081856e-06, "loss": 0.0747, "step": 2115 }, { "epoch": 2.0483091787439616, "grad_norm": 0.3685343265533447, "learning_rate": 4.444086368030938e-06, "loss": 0.0775, "step": 2120 }, { "epoch": 2.0531400966183573, "grad_norm": 0.26994946599006653, "learning_rate": 4.421527553980019e-06, "loss": 0.076, "step": 2125 }, { "epoch": 2.0579710144927534, "grad_norm": 0.2926693856716156, "learning_rate": 4.398968739929101e-06, "loss": 0.0797, "step": 2130 }, { "epoch": 2.0628019323671496, "grad_norm": 0.26841118931770325, "learning_rate": 4.376409925878183e-06, "loss": 0.0733, "step": 2135 }, { "epoch": 2.0676328502415457, "grad_norm": 0.25837743282318115, "learning_rate": 4.353851111827263e-06, "loss": 0.0572, "step": 2140 }, { "epoch": 2.072463768115942, "grad_norm": 0.23347356915473938, "learning_rate": 4.331292297776345e-06, "loss": 0.0824, "step": 2145 }, { "epoch": 2.077294685990338, "grad_norm": 0.31139683723449707, "learning_rate": 4.308733483725427e-06, "loss": 0.0801, "step": 2150 }, { "epoch": 2.082125603864734, "grad_norm": 0.33561673760414124, "learning_rate": 4.2861746696745085e-06, "loss": 0.0816, "step": 2155 }, { "epoch": 2.0869565217391304, "grad_norm": 0.2744121551513672, "learning_rate": 4.26361585562359e-06, "loss": 0.0709, "step": 2160 }, { "epoch": 2.0917874396135265, "grad_norm": 0.29332056641578674, "learning_rate": 4.241057041572672e-06, "loss": 0.0768, "step": 2165 }, { "epoch": 2.0966183574879227, "grad_norm": 0.26820820569992065, "learning_rate": 4.218498227521753e-06, "loss": 0.0854, "step": 2170 }, { "epoch": 2.101449275362319, "grad_norm": 0.3563501536846161, "learning_rate": 4.195939413470835e-06, "loss": 0.0829, "step": 2175 }, { "epoch": 2.106280193236715, "grad_norm": 0.35537421703338623, "learning_rate": 4.173380599419917e-06, "loss": 0.0763, "step": 2180 }, { "epoch": 2.111111111111111, "grad_norm": 0.2760440707206726, "learning_rate": 4.150821785368997e-06, "loss": 0.092, "step": 2185 }, { "epoch": 2.1159420289855073, "grad_norm": 0.21750731766223907, "learning_rate": 4.128262971318079e-06, "loss": 0.0756, "step": 2190 }, { "epoch": 2.1207729468599035, "grad_norm": 0.2815890610218048, "learning_rate": 4.105704157267161e-06, "loss": 0.0844, "step": 2195 }, { "epoch": 2.1256038647342996, "grad_norm": 0.20408152043819427, "learning_rate": 4.083145343216242e-06, "loss": 0.0603, "step": 2200 }, { "epoch": 2.130434782608696, "grad_norm": 0.2452622503042221, "learning_rate": 4.060586529165324e-06, "loss": 0.0767, "step": 2205 }, { "epoch": 2.135265700483092, "grad_norm": 0.3027113080024719, "learning_rate": 4.038027715114405e-06, "loss": 0.0716, "step": 2210 }, { "epoch": 2.140096618357488, "grad_norm": 0.23567864298820496, "learning_rate": 4.0154689010634865e-06, "loss": 0.0845, "step": 2215 }, { "epoch": 2.1449275362318843, "grad_norm": 0.28407129645347595, "learning_rate": 3.9929100870125685e-06, "loss": 0.0784, "step": 2220 }, { "epoch": 2.14975845410628, "grad_norm": 0.28088170289993286, "learning_rate": 3.9703512729616505e-06, "loss": 0.0771, "step": 2225 }, { "epoch": 2.154589371980676, "grad_norm": 0.3641108274459839, "learning_rate": 3.947792458910732e-06, "loss": 0.0791, "step": 2230 }, { "epoch": 2.1594202898550723, "grad_norm": 0.23423610627651215, "learning_rate": 3.925233644859813e-06, "loss": 0.0735, "step": 2235 }, { "epoch": 2.1642512077294684, "grad_norm": 0.21887804567813873, "learning_rate": 3.902674830808895e-06, "loss": 0.0795, "step": 2240 }, { "epoch": 2.1690821256038646, "grad_norm": 0.24810364842414856, "learning_rate": 3.880116016757976e-06, "loss": 0.076, "step": 2245 }, { "epoch": 2.1739130434782608, "grad_norm": 0.217853844165802, "learning_rate": 3.857557202707058e-06, "loss": 0.0794, "step": 2250 }, { "epoch": 2.178743961352657, "grad_norm": 0.28543898463249207, "learning_rate": 3.834998388656139e-06, "loss": 0.0707, "step": 2255 }, { "epoch": 2.183574879227053, "grad_norm": 0.2932458221912384, "learning_rate": 3.81243957460522e-06, "loss": 0.0715, "step": 2260 }, { "epoch": 2.1884057971014492, "grad_norm": 0.3077555000782013, "learning_rate": 3.789880760554302e-06, "loss": 0.0756, "step": 2265 }, { "epoch": 2.1932367149758454, "grad_norm": 0.295901358127594, "learning_rate": 3.7673219465033837e-06, "loss": 0.0785, "step": 2270 }, { "epoch": 2.1980676328502415, "grad_norm": 0.2174501270055771, "learning_rate": 3.7447631324524653e-06, "loss": 0.0578, "step": 2275 }, { "epoch": 2.2028985507246377, "grad_norm": 0.2652744948863983, "learning_rate": 3.7222043184015473e-06, "loss": 0.0579, "step": 2280 }, { "epoch": 2.207729468599034, "grad_norm": 0.34323185682296753, "learning_rate": 3.699645504350628e-06, "loss": 0.072, "step": 2285 }, { "epoch": 2.21256038647343, "grad_norm": 0.3072277903556824, "learning_rate": 3.6770866902997096e-06, "loss": 0.0676, "step": 2290 }, { "epoch": 2.217391304347826, "grad_norm": 0.27712109684944153, "learning_rate": 3.6545278762487916e-06, "loss": 0.0699, "step": 2295 }, { "epoch": 2.2222222222222223, "grad_norm": 0.2862177789211273, "learning_rate": 3.631969062197873e-06, "loss": 0.0643, "step": 2300 }, { "epoch": 2.2270531400966185, "grad_norm": 0.2914809286594391, "learning_rate": 3.6094102481469547e-06, "loss": 0.0702, "step": 2305 }, { "epoch": 2.2318840579710146, "grad_norm": 0.19755889475345612, "learning_rate": 3.586851434096036e-06, "loss": 0.0817, "step": 2310 }, { "epoch": 2.236714975845411, "grad_norm": 0.25922340154647827, "learning_rate": 3.5642926200451175e-06, "loss": 0.0602, "step": 2315 }, { "epoch": 2.241545893719807, "grad_norm": 0.30358242988586426, "learning_rate": 3.541733805994199e-06, "loss": 0.0725, "step": 2320 }, { "epoch": 2.246376811594203, "grad_norm": 0.2505339980125427, "learning_rate": 3.519174991943281e-06, "loss": 0.079, "step": 2325 }, { "epoch": 2.2512077294685993, "grad_norm": 0.2911323308944702, "learning_rate": 3.496616177892362e-06, "loss": 0.0673, "step": 2330 }, { "epoch": 2.2560386473429954, "grad_norm": 0.3253360092639923, "learning_rate": 3.4740573638414437e-06, "loss": 0.0776, "step": 2335 }, { "epoch": 2.260869565217391, "grad_norm": 0.2546384036540985, "learning_rate": 3.4514985497905253e-06, "loss": 0.0689, "step": 2340 }, { "epoch": 2.2657004830917873, "grad_norm": 0.29095250368118286, "learning_rate": 3.428939735739607e-06, "loss": 0.0812, "step": 2345 }, { "epoch": 2.2705314009661834, "grad_norm": 0.29789912700653076, "learning_rate": 3.4063809216886884e-06, "loss": 0.087, "step": 2350 }, { "epoch": 2.2753623188405796, "grad_norm": 0.23721112310886383, "learning_rate": 3.38382210763777e-06, "loss": 0.075, "step": 2355 }, { "epoch": 2.2801932367149758, "grad_norm": 0.2618652284145355, "learning_rate": 3.3612632935868516e-06, "loss": 0.0781, "step": 2360 }, { "epoch": 2.285024154589372, "grad_norm": 0.3185523748397827, "learning_rate": 3.3387044795359327e-06, "loss": 0.0865, "step": 2365 }, { "epoch": 2.289855072463768, "grad_norm": 0.30211564898490906, "learning_rate": 3.3161456654850143e-06, "loss": 0.0755, "step": 2370 }, { "epoch": 2.2946859903381642, "grad_norm": 0.18218393623828888, "learning_rate": 3.293586851434096e-06, "loss": 0.0695, "step": 2375 }, { "epoch": 2.2995169082125604, "grad_norm": 0.20001597702503204, "learning_rate": 3.2710280373831774e-06, "loss": 0.0744, "step": 2380 }, { "epoch": 2.3043478260869565, "grad_norm": 0.37984150648117065, "learning_rate": 3.248469223332259e-06, "loss": 0.0585, "step": 2385 }, { "epoch": 2.3091787439613527, "grad_norm": 0.31228166818618774, "learning_rate": 3.2259104092813406e-06, "loss": 0.0731, "step": 2390 }, { "epoch": 2.314009661835749, "grad_norm": 0.27851906418800354, "learning_rate": 3.203351595230422e-06, "loss": 0.0767, "step": 2395 }, { "epoch": 2.318840579710145, "grad_norm": 0.22976937890052795, "learning_rate": 3.1807927811795033e-06, "loss": 0.0738, "step": 2400 }, { "epoch": 2.323671497584541, "grad_norm": 0.24843037128448486, "learning_rate": 3.1582339671285853e-06, "loss": 0.0792, "step": 2405 }, { "epoch": 2.3285024154589373, "grad_norm": 0.23123487830162048, "learning_rate": 3.135675153077667e-06, "loss": 0.0752, "step": 2410 }, { "epoch": 2.3333333333333335, "grad_norm": 0.23363561928272247, "learning_rate": 3.113116339026748e-06, "loss": 0.0693, "step": 2415 }, { "epoch": 2.3381642512077296, "grad_norm": 0.2371598780155182, "learning_rate": 3.09055752497583e-06, "loss": 0.0781, "step": 2420 }, { "epoch": 2.342995169082126, "grad_norm": 0.320534884929657, "learning_rate": 3.067998710924911e-06, "loss": 0.0635, "step": 2425 }, { "epoch": 2.3478260869565215, "grad_norm": 0.2920200824737549, "learning_rate": 3.0454398968739927e-06, "loss": 0.0771, "step": 2430 }, { "epoch": 2.3526570048309177, "grad_norm": 0.32089921832084656, "learning_rate": 3.0228810828230747e-06, "loss": 0.0733, "step": 2435 }, { "epoch": 2.357487922705314, "grad_norm": 0.2733156979084015, "learning_rate": 3.000322268772156e-06, "loss": 0.0681, "step": 2440 }, { "epoch": 2.36231884057971, "grad_norm": 0.24564507603645325, "learning_rate": 2.9777634547212374e-06, "loss": 0.0771, "step": 2445 }, { "epoch": 2.367149758454106, "grad_norm": 0.24026136100292206, "learning_rate": 2.955204640670319e-06, "loss": 0.0748, "step": 2450 }, { "epoch": 2.3719806763285023, "grad_norm": 0.20703287422657013, "learning_rate": 2.9326458266194006e-06, "loss": 0.0688, "step": 2455 }, { "epoch": 2.3768115942028984, "grad_norm": 0.18269629776477814, "learning_rate": 2.910087012568482e-06, "loss": 0.0728, "step": 2460 }, { "epoch": 2.3816425120772946, "grad_norm": 0.3421408236026764, "learning_rate": 2.8875281985175637e-06, "loss": 0.0679, "step": 2465 }, { "epoch": 2.3864734299516908, "grad_norm": 0.4087986350059509, "learning_rate": 2.8649693844666453e-06, "loss": 0.0791, "step": 2470 }, { "epoch": 2.391304347826087, "grad_norm": 0.2629115879535675, "learning_rate": 2.8424105704157264e-06, "loss": 0.074, "step": 2475 }, { "epoch": 2.396135265700483, "grad_norm": 0.2295183390378952, "learning_rate": 2.8198517563648084e-06, "loss": 0.0739, "step": 2480 }, { "epoch": 2.4009661835748792, "grad_norm": 0.31765657663345337, "learning_rate": 2.79729294231389e-06, "loss": 0.0708, "step": 2485 }, { "epoch": 2.4057971014492754, "grad_norm": 0.31528520584106445, "learning_rate": 2.774734128262971e-06, "loss": 0.0673, "step": 2490 }, { "epoch": 2.4106280193236715, "grad_norm": 0.2358902543783188, "learning_rate": 2.752175314212053e-06, "loss": 0.0543, "step": 2495 }, { "epoch": 2.4154589371980677, "grad_norm": 0.2725466787815094, "learning_rate": 2.7296165001611343e-06, "loss": 0.0703, "step": 2500 }, { "epoch": 2.420289855072464, "grad_norm": 0.24531903862953186, "learning_rate": 2.707057686110216e-06, "loss": 0.0715, "step": 2505 }, { "epoch": 2.42512077294686, "grad_norm": 0.29307085275650024, "learning_rate": 2.6844988720592974e-06, "loss": 0.0752, "step": 2510 }, { "epoch": 2.429951690821256, "grad_norm": 0.2959176003932953, "learning_rate": 2.661940058008379e-06, "loss": 0.0685, "step": 2515 }, { "epoch": 2.4347826086956523, "grad_norm": 0.2573854923248291, "learning_rate": 2.6393812439574605e-06, "loss": 0.0664, "step": 2520 }, { "epoch": 2.4396135265700485, "grad_norm": 0.3154689371585846, "learning_rate": 2.616822429906542e-06, "loss": 0.0615, "step": 2525 }, { "epoch": 2.4444444444444446, "grad_norm": 0.21446138620376587, "learning_rate": 2.5942636158556237e-06, "loss": 0.0635, "step": 2530 }, { "epoch": 2.449275362318841, "grad_norm": 0.3040371537208557, "learning_rate": 2.571704801804705e-06, "loss": 0.0788, "step": 2535 }, { "epoch": 2.454106280193237, "grad_norm": 0.2636314034461975, "learning_rate": 2.5491459877537864e-06, "loss": 0.072, "step": 2540 }, { "epoch": 2.4589371980676327, "grad_norm": 0.26327863335609436, "learning_rate": 2.5265871737028684e-06, "loss": 0.0777, "step": 2545 }, { "epoch": 2.463768115942029, "grad_norm": 0.28980839252471924, "learning_rate": 2.5040283596519495e-06, "loss": 0.0694, "step": 2550 }, { "epoch": 2.468599033816425, "grad_norm": 0.2889906167984009, "learning_rate": 2.481469545601031e-06, "loss": 0.0703, "step": 2555 }, { "epoch": 2.473429951690821, "grad_norm": 0.2539612650871277, "learning_rate": 2.4589107315501127e-06, "loss": 0.0894, "step": 2560 }, { "epoch": 2.4782608695652173, "grad_norm": 0.25100603699684143, "learning_rate": 2.4363519174991943e-06, "loss": 0.0649, "step": 2565 }, { "epoch": 2.4830917874396135, "grad_norm": 0.24855615198612213, "learning_rate": 2.413793103448276e-06, "loss": 0.0687, "step": 2570 }, { "epoch": 2.4879227053140096, "grad_norm": 0.2766883671283722, "learning_rate": 2.3912342893973574e-06, "loss": 0.0712, "step": 2575 }, { "epoch": 2.4927536231884058, "grad_norm": 0.24230973422527313, "learning_rate": 2.368675475346439e-06, "loss": 0.0792, "step": 2580 }, { "epoch": 2.497584541062802, "grad_norm": 0.2981168031692505, "learning_rate": 2.34611666129552e-06, "loss": 0.0724, "step": 2585 }, { "epoch": 2.502415458937198, "grad_norm": 0.26249799132347107, "learning_rate": 2.323557847244602e-06, "loss": 0.0727, "step": 2590 }, { "epoch": 2.5072463768115942, "grad_norm": 0.23193541169166565, "learning_rate": 2.3009990331936837e-06, "loss": 0.0658, "step": 2595 }, { "epoch": 2.5120772946859904, "grad_norm": 0.3478648364543915, "learning_rate": 2.278440219142765e-06, "loss": 0.0766, "step": 2600 }, { "epoch": 2.5169082125603865, "grad_norm": 0.2009768933057785, "learning_rate": 2.255881405091847e-06, "loss": 0.0735, "step": 2605 }, { "epoch": 2.5217391304347827, "grad_norm": 0.2750122547149658, "learning_rate": 2.233322591040928e-06, "loss": 0.0778, "step": 2610 }, { "epoch": 2.526570048309179, "grad_norm": 0.22165286540985107, "learning_rate": 2.2107637769900095e-06, "loss": 0.0656, "step": 2615 }, { "epoch": 2.531400966183575, "grad_norm": 0.26584914326667786, "learning_rate": 2.1882049629390915e-06, "loss": 0.0723, "step": 2620 }, { "epoch": 2.536231884057971, "grad_norm": 0.30248183012008667, "learning_rate": 2.1656461488881727e-06, "loss": 0.0647, "step": 2625 }, { "epoch": 2.541062801932367, "grad_norm": 0.2667482793331146, "learning_rate": 2.1430873348372542e-06, "loss": 0.0694, "step": 2630 }, { "epoch": 2.545893719806763, "grad_norm": 0.2767150402069092, "learning_rate": 2.120528520786336e-06, "loss": 0.0818, "step": 2635 }, { "epoch": 2.550724637681159, "grad_norm": 0.30463531613349915, "learning_rate": 2.0979697067354174e-06, "loss": 0.0684, "step": 2640 }, { "epoch": 2.5555555555555554, "grad_norm": 0.2667052447795868, "learning_rate": 2.0754108926844985e-06, "loss": 0.068, "step": 2645 }, { "epoch": 2.5603864734299515, "grad_norm": 0.37567076086997986, "learning_rate": 2.0528520786335805e-06, "loss": 0.0578, "step": 2650 }, { "epoch": 2.5652173913043477, "grad_norm": 0.24227222800254822, "learning_rate": 2.030293264582662e-06, "loss": 0.0748, "step": 2655 }, { "epoch": 2.570048309178744, "grad_norm": 0.3247409760951996, "learning_rate": 2.0077344505317432e-06, "loss": 0.073, "step": 2660 }, { "epoch": 2.57487922705314, "grad_norm": 0.30261141061782837, "learning_rate": 1.9851756364808252e-06, "loss": 0.0722, "step": 2665 }, { "epoch": 2.579710144927536, "grad_norm": 0.2872192859649658, "learning_rate": 1.9626168224299064e-06, "loss": 0.0728, "step": 2670 }, { "epoch": 2.5845410628019323, "grad_norm": 0.3606136441230774, "learning_rate": 1.940058008378988e-06, "loss": 0.0735, "step": 2675 }, { "epoch": 2.5893719806763285, "grad_norm": 0.21871723234653473, "learning_rate": 1.9174991943280695e-06, "loss": 0.0682, "step": 2680 }, { "epoch": 2.5942028985507246, "grad_norm": 0.2941882312297821, "learning_rate": 1.894940380277151e-06, "loss": 0.0722, "step": 2685 }, { "epoch": 2.5990338164251208, "grad_norm": 0.31706181168556213, "learning_rate": 1.8723815662262327e-06, "loss": 0.0698, "step": 2690 }, { "epoch": 2.603864734299517, "grad_norm": 0.25599217414855957, "learning_rate": 1.849822752175314e-06, "loss": 0.0691, "step": 2695 }, { "epoch": 2.608695652173913, "grad_norm": 0.2954462468624115, "learning_rate": 1.8272639381243958e-06, "loss": 0.0831, "step": 2700 }, { "epoch": 2.6135265700483092, "grad_norm": 0.31768399477005005, "learning_rate": 1.8047051240734774e-06, "loss": 0.0684, "step": 2705 }, { "epoch": 2.6183574879227054, "grad_norm": 0.2380971759557724, "learning_rate": 1.7821463100225587e-06, "loss": 0.0604, "step": 2710 }, { "epoch": 2.6231884057971016, "grad_norm": 0.2857172191143036, "learning_rate": 1.7595874959716405e-06, "loss": 0.0648, "step": 2715 }, { "epoch": 2.6280193236714977, "grad_norm": 0.2866944968700409, "learning_rate": 1.7370286819207219e-06, "loss": 0.067, "step": 2720 }, { "epoch": 2.632850241545894, "grad_norm": 0.3259107172489166, "learning_rate": 1.7144698678698034e-06, "loss": 0.0789, "step": 2725 }, { "epoch": 2.63768115942029, "grad_norm": 0.23563902080059052, "learning_rate": 1.691911053818885e-06, "loss": 0.0826, "step": 2730 }, { "epoch": 2.642512077294686, "grad_norm": 0.33754512667655945, "learning_rate": 1.6693522397679664e-06, "loss": 0.0756, "step": 2735 }, { "epoch": 2.6473429951690823, "grad_norm": 0.22349333763122559, "learning_rate": 1.646793425717048e-06, "loss": 0.0773, "step": 2740 }, { "epoch": 2.6521739130434785, "grad_norm": 0.42616990208625793, "learning_rate": 1.6242346116661295e-06, "loss": 0.0676, "step": 2745 }, { "epoch": 2.6570048309178746, "grad_norm": 0.27920448780059814, "learning_rate": 1.601675797615211e-06, "loss": 0.07, "step": 2750 }, { "epoch": 2.661835748792271, "grad_norm": 0.34114235639572144, "learning_rate": 1.5791169835642926e-06, "loss": 0.0807, "step": 2755 }, { "epoch": 2.6666666666666665, "grad_norm": 0.2515537142753601, "learning_rate": 1.556558169513374e-06, "loss": 0.0739, "step": 2760 }, { "epoch": 2.6714975845410627, "grad_norm": 0.24267147481441498, "learning_rate": 1.5339993554624556e-06, "loss": 0.0727, "step": 2765 }, { "epoch": 2.676328502415459, "grad_norm": 0.290988564491272, "learning_rate": 1.5114405414115374e-06, "loss": 0.0739, "step": 2770 }, { "epoch": 2.681159420289855, "grad_norm": 0.3821360766887665, "learning_rate": 1.4888817273606187e-06, "loss": 0.0792, "step": 2775 }, { "epoch": 2.685990338164251, "grad_norm": 0.284109890460968, "learning_rate": 1.4663229133097003e-06, "loss": 0.0767, "step": 2780 }, { "epoch": 2.6908212560386473, "grad_norm": 0.303076833486557, "learning_rate": 1.4437640992587819e-06, "loss": 0.0714, "step": 2785 }, { "epoch": 2.6956521739130435, "grad_norm": 0.37678495049476624, "learning_rate": 1.4212052852078632e-06, "loss": 0.0555, "step": 2790 }, { "epoch": 2.7004830917874396, "grad_norm": 0.23108994960784912, "learning_rate": 1.398646471156945e-06, "loss": 0.0833, "step": 2795 }, { "epoch": 2.7053140096618358, "grad_norm": 0.3246385157108307, "learning_rate": 1.3760876571060266e-06, "loss": 0.076, "step": 2800 }, { "epoch": 2.710144927536232, "grad_norm": 0.2140025794506073, "learning_rate": 1.353528843055108e-06, "loss": 0.0791, "step": 2805 }, { "epoch": 2.714975845410628, "grad_norm": 0.2923656404018402, "learning_rate": 1.3309700290041895e-06, "loss": 0.0892, "step": 2810 }, { "epoch": 2.7198067632850242, "grad_norm": 0.2978055775165558, "learning_rate": 1.308411214953271e-06, "loss": 0.0647, "step": 2815 }, { "epoch": 2.7246376811594204, "grad_norm": 0.2982514500617981, "learning_rate": 1.2858524009023524e-06, "loss": 0.0677, "step": 2820 }, { "epoch": 2.7294685990338166, "grad_norm": 0.2721270024776459, "learning_rate": 1.2632935868514342e-06, "loss": 0.0633, "step": 2825 }, { "epoch": 2.7342995169082127, "grad_norm": 0.2582114636898041, "learning_rate": 1.2407347728005156e-06, "loss": 0.0721, "step": 2830 }, { "epoch": 2.7391304347826084, "grad_norm": 0.2242422103881836, "learning_rate": 1.2181759587495971e-06, "loss": 0.0694, "step": 2835 }, { "epoch": 2.7439613526570046, "grad_norm": 0.2729090750217438, "learning_rate": 1.1956171446986787e-06, "loss": 0.0726, "step": 2840 }, { "epoch": 2.7487922705314007, "grad_norm": 0.34203121066093445, "learning_rate": 1.17305833064776e-06, "loss": 0.0796, "step": 2845 }, { "epoch": 2.753623188405797, "grad_norm": 0.30749765038490295, "learning_rate": 1.1504995165968418e-06, "loss": 0.07, "step": 2850 }, { "epoch": 2.758454106280193, "grad_norm": 0.3750080168247223, "learning_rate": 1.1279407025459234e-06, "loss": 0.08, "step": 2855 }, { "epoch": 2.763285024154589, "grad_norm": 0.32321617007255554, "learning_rate": 1.1053818884950048e-06, "loss": 0.082, "step": 2860 }, { "epoch": 2.7681159420289854, "grad_norm": 0.25304415822029114, "learning_rate": 1.0828230744440863e-06, "loss": 0.076, "step": 2865 }, { "epoch": 2.7729468599033815, "grad_norm": 0.30696550011634827, "learning_rate": 1.060264260393168e-06, "loss": 0.0703, "step": 2870 }, { "epoch": 2.7777777777777777, "grad_norm": 0.3218288719654083, "learning_rate": 1.0377054463422493e-06, "loss": 0.0696, "step": 2875 }, { "epoch": 2.782608695652174, "grad_norm": 0.2573774755001068, "learning_rate": 1.015146632291331e-06, "loss": 0.0711, "step": 2880 }, { "epoch": 2.78743961352657, "grad_norm": 0.3438413143157959, "learning_rate": 9.925878182404126e-07, "loss": 0.0805, "step": 2885 }, { "epoch": 2.792270531400966, "grad_norm": 0.3613496422767639, "learning_rate": 9.70029004189494e-07, "loss": 0.0742, "step": 2890 }, { "epoch": 2.7971014492753623, "grad_norm": 0.2860325276851654, "learning_rate": 9.474701901385755e-07, "loss": 0.0735, "step": 2895 }, { "epoch": 2.8019323671497585, "grad_norm": 0.240507572889328, "learning_rate": 9.24911376087657e-07, "loss": 0.0677, "step": 2900 }, { "epoch": 2.8067632850241546, "grad_norm": 0.28737547993659973, "learning_rate": 9.023525620367387e-07, "loss": 0.0666, "step": 2905 }, { "epoch": 2.8115942028985508, "grad_norm": 0.34197041392326355, "learning_rate": 8.797937479858203e-07, "loss": 0.0799, "step": 2910 }, { "epoch": 2.816425120772947, "grad_norm": 0.326251745223999, "learning_rate": 8.572349339349017e-07, "loss": 0.0691, "step": 2915 }, { "epoch": 2.821256038647343, "grad_norm": 0.42289331555366516, "learning_rate": 8.346761198839832e-07, "loss": 0.0746, "step": 2920 }, { "epoch": 2.8260869565217392, "grad_norm": 0.28735774755477905, "learning_rate": 8.121173058330648e-07, "loss": 0.0782, "step": 2925 }, { "epoch": 2.8309178743961354, "grad_norm": 0.29395702481269836, "learning_rate": 7.895584917821463e-07, "loss": 0.08, "step": 2930 }, { "epoch": 2.8357487922705316, "grad_norm": 0.3306836187839508, "learning_rate": 7.669996777312278e-07, "loss": 0.0869, "step": 2935 }, { "epoch": 2.8405797101449277, "grad_norm": 0.2740659713745117, "learning_rate": 7.444408636803094e-07, "loss": 0.064, "step": 2940 }, { "epoch": 2.845410628019324, "grad_norm": 0.28304237127304077, "learning_rate": 7.218820496293909e-07, "loss": 0.0769, "step": 2945 }, { "epoch": 2.85024154589372, "grad_norm": 0.3081373870372772, "learning_rate": 6.993232355784725e-07, "loss": 0.0783, "step": 2950 }, { "epoch": 2.855072463768116, "grad_norm": 0.3063504099845886, "learning_rate": 6.76764421527554e-07, "loss": 0.0643, "step": 2955 }, { "epoch": 2.8599033816425123, "grad_norm": 0.2641620635986328, "learning_rate": 6.542056074766355e-07, "loss": 0.0658, "step": 2960 }, { "epoch": 2.864734299516908, "grad_norm": 0.3239176869392395, "learning_rate": 6.316467934257171e-07, "loss": 0.0677, "step": 2965 }, { "epoch": 2.869565217391304, "grad_norm": 0.23815782368183136, "learning_rate": 6.090879793747986e-07, "loss": 0.0686, "step": 2970 }, { "epoch": 2.8743961352657004, "grad_norm": 0.26518934965133667, "learning_rate": 5.8652916532388e-07, "loss": 0.073, "step": 2975 }, { "epoch": 2.8792270531400965, "grad_norm": 0.2455345243215561, "learning_rate": 5.639703512729617e-07, "loss": 0.0664, "step": 2980 }, { "epoch": 2.8840579710144927, "grad_norm": 0.2730591893196106, "learning_rate": 5.414115372220432e-07, "loss": 0.0745, "step": 2985 }, { "epoch": 2.888888888888889, "grad_norm": 0.3046686351299286, "learning_rate": 5.188527231711246e-07, "loss": 0.0637, "step": 2990 }, { "epoch": 2.893719806763285, "grad_norm": 0.26765045523643494, "learning_rate": 4.962939091202063e-07, "loss": 0.0818, "step": 2995 }, { "epoch": 2.898550724637681, "grad_norm": 0.2611401677131653, "learning_rate": 4.7373509506928777e-07, "loss": 0.0871, "step": 3000 }, { "epoch": 2.9033816425120773, "grad_norm": 0.3256029486656189, "learning_rate": 4.5117628101836934e-07, "loss": 0.0772, "step": 3005 }, { "epoch": 2.9082125603864735, "grad_norm": 0.3779186010360718, "learning_rate": 4.2861746696745086e-07, "loss": 0.0709, "step": 3010 }, { "epoch": 2.9130434782608696, "grad_norm": 0.248891681432724, "learning_rate": 4.060586529165324e-07, "loss": 0.0836, "step": 3015 }, { "epoch": 2.917874396135266, "grad_norm": 0.27647843956947327, "learning_rate": 3.834998388656139e-07, "loss": 0.0636, "step": 3020 }, { "epoch": 2.922705314009662, "grad_norm": 0.28876233100891113, "learning_rate": 3.6094102481469546e-07, "loss": 0.0648, "step": 3025 }, { "epoch": 2.927536231884058, "grad_norm": 0.26836660504341125, "learning_rate": 3.38382210763777e-07, "loss": 0.0726, "step": 3030 }, { "epoch": 2.9323671497584543, "grad_norm": 0.2655857503414154, "learning_rate": 3.1582339671285855e-07, "loss": 0.0736, "step": 3035 }, { "epoch": 2.9371980676328504, "grad_norm": 0.30681997537612915, "learning_rate": 2.9326458266194e-07, "loss": 0.0688, "step": 3040 }, { "epoch": 2.942028985507246, "grad_norm": 0.3034045994281769, "learning_rate": 2.707057686110216e-07, "loss": 0.0611, "step": 3045 }, { "epoch": 2.9468599033816423, "grad_norm": 0.24807259440422058, "learning_rate": 2.4814695456010315e-07, "loss": 0.0782, "step": 3050 }, { "epoch": 2.9516908212560384, "grad_norm": 0.34220463037490845, "learning_rate": 2.2558814050918467e-07, "loss": 0.0751, "step": 3055 }, { "epoch": 2.9565217391304346, "grad_norm": 0.2882407009601593, "learning_rate": 2.030293264582662e-07, "loss": 0.0686, "step": 3060 }, { "epoch": 2.9613526570048307, "grad_norm": 0.31148266792297363, "learning_rate": 1.8047051240734773e-07, "loss": 0.0668, "step": 3065 }, { "epoch": 2.966183574879227, "grad_norm": 0.2847365736961365, "learning_rate": 1.5791169835642927e-07, "loss": 0.0785, "step": 3070 }, { "epoch": 2.971014492753623, "grad_norm": 0.2872695028781891, "learning_rate": 1.353528843055108e-07, "loss": 0.0723, "step": 3075 }, { "epoch": 2.975845410628019, "grad_norm": 0.24350111186504364, "learning_rate": 1.1279407025459234e-07, "loss": 0.0669, "step": 3080 }, { "epoch": 2.9806763285024154, "grad_norm": 0.2746003270149231, "learning_rate": 9.023525620367387e-08, "loss": 0.0814, "step": 3085 }, { "epoch": 2.9855072463768115, "grad_norm": 0.255521684885025, "learning_rate": 6.76764421527554e-08, "loss": 0.0774, "step": 3090 }, { "epoch": 2.9903381642512077, "grad_norm": 0.35289525985717773, "learning_rate": 4.511762810183693e-08, "loss": 0.0645, "step": 3095 }, { "epoch": 2.995169082125604, "grad_norm": 0.279884934425354, "learning_rate": 2.2558814050918466e-08, "loss": 0.0738, "step": 3100 }, { "epoch": 3.0, "grad_norm": 0.4045591652393341, "learning_rate": 0.0, "loss": 0.0711, "step": 3105 }, { "epoch": 3.0, "eval_runtime": 338.6263, "eval_samples_per_second": 3.054, "eval_steps_per_second": 0.384, "step": 3105 } ], "logging_steps": 5, "max_steps": 3105, "num_input_tokens_seen": 0, "num_train_epochs": 3, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 1.1255832139272192e+17, "train_batch_size": 8, "trial_name": null, "trial_params": null }