{ "best_metric": null, "best_model_checkpoint": null, "epoch": 1.0, "eval_steps": 500, "global_step": 2181, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0004585052728106373, "grad_norm": 25.35940676221757, "learning_rate": 4.5662100456621004e-08, "loss": 1.4356, "step": 1 }, { "epoch": 0.0022925263640531865, "grad_norm": 23.277460508193656, "learning_rate": 2.2831050228310502e-07, "loss": 1.4178, "step": 5 }, { "epoch": 0.004585052728106373, "grad_norm": 15.396159390081614, "learning_rate": 4.5662100456621004e-07, "loss": 1.3928, "step": 10 }, { "epoch": 0.0068775790921595595, "grad_norm": 9.927996187561872, "learning_rate": 6.849315068493151e-07, "loss": 1.2487, "step": 15 }, { "epoch": 0.009170105456212746, "grad_norm": 8.936136397262343, "learning_rate": 9.132420091324201e-07, "loss": 1.1467, "step": 20 }, { "epoch": 0.011462631820265932, "grad_norm": 3.9419002716272007, "learning_rate": 1.1415525114155251e-06, "loss": 1.0321, "step": 25 }, { "epoch": 0.013755158184319119, "grad_norm": 3.2824292809209212, "learning_rate": 1.3698630136986302e-06, "loss": 0.9911, "step": 30 }, { "epoch": 0.016047684548372305, "grad_norm": 3.198808731865913, "learning_rate": 1.5981735159817353e-06, "loss": 0.9499, "step": 35 }, { "epoch": 0.018340210912425492, "grad_norm": 3.200026153105945, "learning_rate": 1.8264840182648401e-06, "loss": 0.9394, "step": 40 }, { "epoch": 0.02063273727647868, "grad_norm": 3.1015042038551264, "learning_rate": 2.0547945205479454e-06, "loss": 0.9374, "step": 45 }, { "epoch": 0.022925263640531865, "grad_norm": 3.0638884680066116, "learning_rate": 2.2831050228310503e-06, "loss": 0.9366, "step": 50 }, { "epoch": 0.02521779000458505, "grad_norm": 3.1218708697344337, "learning_rate": 2.511415525114155e-06, "loss": 0.9072, "step": 55 }, { "epoch": 0.027510316368638238, "grad_norm": 3.030931859384564, "learning_rate": 2.7397260273972604e-06, "loss": 0.896, "step": 60 }, { "epoch": 0.029802842732691424, "grad_norm": 3.183215428730836, "learning_rate": 2.9680365296803653e-06, "loss": 0.904, "step": 65 }, { "epoch": 0.03209536909674461, "grad_norm": 3.1193991823217884, "learning_rate": 3.1963470319634706e-06, "loss": 0.8992, "step": 70 }, { "epoch": 0.0343878954607978, "grad_norm": 3.1562480345048662, "learning_rate": 3.4246575342465754e-06, "loss": 0.9008, "step": 75 }, { "epoch": 0.036680421824850984, "grad_norm": 3.1106379275365263, "learning_rate": 3.6529680365296803e-06, "loss": 0.8835, "step": 80 }, { "epoch": 0.03897294818890417, "grad_norm": 3.1659334626442455, "learning_rate": 3.881278538812785e-06, "loss": 0.8798, "step": 85 }, { "epoch": 0.04126547455295736, "grad_norm": 3.1010027836059533, "learning_rate": 4.109589041095891e-06, "loss": 0.879, "step": 90 }, { "epoch": 0.04355800091701054, "grad_norm": 3.3519588401192273, "learning_rate": 4.337899543378996e-06, "loss": 0.8615, "step": 95 }, { "epoch": 0.04585052728106373, "grad_norm": 3.049285908948199, "learning_rate": 4.566210045662101e-06, "loss": 0.8529, "step": 100 }, { "epoch": 0.048143053645116916, "grad_norm": 3.109756439871898, "learning_rate": 4.7945205479452054e-06, "loss": 0.8654, "step": 105 }, { "epoch": 0.0504355800091701, "grad_norm": 3.1513505710159335, "learning_rate": 5.02283105022831e-06, "loss": 0.8663, "step": 110 }, { "epoch": 0.05272810637322329, "grad_norm": 3.1767156567086614, "learning_rate": 5.251141552511416e-06, "loss": 0.8613, "step": 115 }, { "epoch": 0.055020632737276476, "grad_norm": 3.453537287264967, "learning_rate": 5.479452054794521e-06, "loss": 0.8771, "step": 120 }, { "epoch": 0.05731315910132966, "grad_norm": 3.013155684535603, "learning_rate": 5.7077625570776266e-06, "loss": 0.8473, "step": 125 }, { "epoch": 0.05960568546538285, "grad_norm": 3.425642520518735, "learning_rate": 5.936073059360731e-06, "loss": 0.8521, "step": 130 }, { "epoch": 0.061898211829436035, "grad_norm": 3.031927176672884, "learning_rate": 6.164383561643836e-06, "loss": 0.84, "step": 135 }, { "epoch": 0.06419073819348922, "grad_norm": 3.239390421336056, "learning_rate": 6.392694063926941e-06, "loss": 0.859, "step": 140 }, { "epoch": 0.06648326455754242, "grad_norm": 3.017820442924467, "learning_rate": 6.621004566210046e-06, "loss": 0.86, "step": 145 }, { "epoch": 0.0687757909215956, "grad_norm": 3.0002036905279503, "learning_rate": 6.849315068493151e-06, "loss": 0.8525, "step": 150 }, { "epoch": 0.07106831728564879, "grad_norm": 3.1828998491124016, "learning_rate": 7.077625570776257e-06, "loss": 0.8433, "step": 155 }, { "epoch": 0.07336084364970197, "grad_norm": 3.087610569097963, "learning_rate": 7.305936073059361e-06, "loss": 0.8361, "step": 160 }, { "epoch": 0.07565337001375516, "grad_norm": 3.115099552868115, "learning_rate": 7.534246575342466e-06, "loss": 0.8436, "step": 165 }, { "epoch": 0.07794589637780834, "grad_norm": 3.1551201699069282, "learning_rate": 7.76255707762557e-06, "loss": 0.8311, "step": 170 }, { "epoch": 0.08023842274186153, "grad_norm": 3.2013023977541617, "learning_rate": 7.990867579908676e-06, "loss": 0.8244, "step": 175 }, { "epoch": 0.08253094910591471, "grad_norm": 3.1031180959674716, "learning_rate": 8.219178082191782e-06, "loss": 0.8362, "step": 180 }, { "epoch": 0.08482347546996791, "grad_norm": 3.056534274967503, "learning_rate": 8.447488584474887e-06, "loss": 0.827, "step": 185 }, { "epoch": 0.08711600183402109, "grad_norm": 2.8738007240926016, "learning_rate": 8.675799086757991e-06, "loss": 0.8264, "step": 190 }, { "epoch": 0.08940852819807428, "grad_norm": 2.9833947743009044, "learning_rate": 8.904109589041097e-06, "loss": 0.8364, "step": 195 }, { "epoch": 0.09170105456212746, "grad_norm": 3.0590617698737606, "learning_rate": 9.132420091324201e-06, "loss": 0.8385, "step": 200 }, { "epoch": 0.09399358092618065, "grad_norm": 2.9544649860589964, "learning_rate": 9.360730593607307e-06, "loss": 0.8306, "step": 205 }, { "epoch": 0.09628610729023383, "grad_norm": 3.156467119939513, "learning_rate": 9.589041095890411e-06, "loss": 0.812, "step": 210 }, { "epoch": 0.09857863365428703, "grad_norm": 3.241792877196348, "learning_rate": 9.817351598173517e-06, "loss": 0.8098, "step": 215 }, { "epoch": 0.1008711600183402, "grad_norm": 3.329896188306964, "learning_rate": 9.999993590241675e-06, "loss": 0.8321, "step": 220 }, { "epoch": 0.1031636863823934, "grad_norm": 2.961456684151267, "learning_rate": 9.999769250425817e-06, "loss": 0.8296, "step": 225 }, { "epoch": 0.10545621274644658, "grad_norm": 3.0123856993460723, "learning_rate": 9.999224439127452e-06, "loss": 0.8223, "step": 230 }, { "epoch": 0.10774873911049977, "grad_norm": 3.1722352404227263, "learning_rate": 9.998359191267488e-06, "loss": 0.8183, "step": 235 }, { "epoch": 0.11004126547455295, "grad_norm": 3.339283823835408, "learning_rate": 9.997173562305937e-06, "loss": 0.812, "step": 240 }, { "epoch": 0.11233379183860615, "grad_norm": 3.051005936600519, "learning_rate": 9.995667628238362e-06, "loss": 0.8159, "step": 245 }, { "epoch": 0.11462631820265932, "grad_norm": 3.621892868476315, "learning_rate": 9.993841485591e-06, "loss": 0.8265, "step": 250 }, { "epoch": 0.11691884456671252, "grad_norm": 3.1501195933267727, "learning_rate": 9.991695251414584e-06, "loss": 0.7829, "step": 255 }, { "epoch": 0.1192113709307657, "grad_norm": 3.2077051728198436, "learning_rate": 9.989229063276829e-06, "loss": 0.8061, "step": 260 }, { "epoch": 0.12150389729481889, "grad_norm": 2.813867856532736, "learning_rate": 9.986443079253628e-06, "loss": 0.8088, "step": 265 }, { "epoch": 0.12379642365887207, "grad_norm": 2.953479405448006, "learning_rate": 9.983337477918904e-06, "loss": 0.8013, "step": 270 }, { "epoch": 0.12608895002292525, "grad_norm": 2.9765536692485752, "learning_rate": 9.979912458333179e-06, "loss": 0.8112, "step": 275 }, { "epoch": 0.12838147638697844, "grad_norm": 2.9261553011693313, "learning_rate": 9.976168240030804e-06, "loss": 0.797, "step": 280 }, { "epoch": 0.13067400275103164, "grad_norm": 2.7549890848982668, "learning_rate": 9.972105063005895e-06, "loss": 0.8047, "step": 285 }, { "epoch": 0.13296652911508483, "grad_norm": 2.783923747108222, "learning_rate": 9.96772318769694e-06, "loss": 0.8045, "step": 290 }, { "epoch": 0.13525905547913802, "grad_norm": 2.922181282361273, "learning_rate": 9.96302289497012e-06, "loss": 0.7891, "step": 295 }, { "epoch": 0.1375515818431912, "grad_norm": 2.8387565382348807, "learning_rate": 9.958004486101293e-06, "loss": 0.7756, "step": 300 }, { "epoch": 0.13984410820724438, "grad_norm": 2.869327340764152, "learning_rate": 9.952668282756692e-06, "loss": 0.8027, "step": 305 }, { "epoch": 0.14213663457129758, "grad_norm": 2.874303723785054, "learning_rate": 9.947014626972298e-06, "loss": 0.7826, "step": 310 }, { "epoch": 0.14442916093535077, "grad_norm": 2.737834462358364, "learning_rate": 9.941043881131928e-06, "loss": 0.7702, "step": 315 }, { "epoch": 0.14672168729940394, "grad_norm": 2.858629644409334, "learning_rate": 9.934756427943996e-06, "loss": 0.7761, "step": 320 }, { "epoch": 0.14901421366345713, "grad_norm": 2.941702373835629, "learning_rate": 9.92815267041699e-06, "loss": 0.7778, "step": 325 }, { "epoch": 0.15130674002751032, "grad_norm": 2.832449171435636, "learning_rate": 9.921233031833639e-06, "loss": 0.7747, "step": 330 }, { "epoch": 0.15359926639156352, "grad_norm": 2.838327247569131, "learning_rate": 9.913997955723777e-06, "loss": 0.7798, "step": 335 }, { "epoch": 0.15589179275561668, "grad_norm": 3.0053878829121357, "learning_rate": 9.90644790583592e-06, "loss": 0.7504, "step": 340 }, { "epoch": 0.15818431911966988, "grad_norm": 2.737407601036532, "learning_rate": 9.898583366107539e-06, "loss": 0.7655, "step": 345 }, { "epoch": 0.16047684548372307, "grad_norm": 3.0259958169837717, "learning_rate": 9.890404840634037e-06, "loss": 0.7582, "step": 350 }, { "epoch": 0.16276937184777626, "grad_norm": 2.804766086619055, "learning_rate": 9.881912853636445e-06, "loss": 0.7747, "step": 355 }, { "epoch": 0.16506189821182943, "grad_norm": 2.7915942235581785, "learning_rate": 9.873107949427815e-06, "loss": 0.7584, "step": 360 }, { "epoch": 0.16735442457588262, "grad_norm": 2.8708773578370588, "learning_rate": 9.863990692378333e-06, "loss": 0.7538, "step": 365 }, { "epoch": 0.16964695093993581, "grad_norm": 2.8372441642155097, "learning_rate": 9.854561666879148e-06, "loss": 0.7457, "step": 370 }, { "epoch": 0.171939477303989, "grad_norm": 2.7820083192682197, "learning_rate": 9.844821477304904e-06, "loss": 0.775, "step": 375 }, { "epoch": 0.17423200366804217, "grad_norm": 2.6780715561867066, "learning_rate": 9.834770747975015e-06, "loss": 0.7442, "step": 380 }, { "epoch": 0.17652453003209537, "grad_norm": 2.7545319149727763, "learning_rate": 9.824410123113634e-06, "loss": 0.7416, "step": 385 }, { "epoch": 0.17881705639614856, "grad_norm": 2.6402444423405225, "learning_rate": 9.813740266808375e-06, "loss": 0.7362, "step": 390 }, { "epoch": 0.18110958276020175, "grad_norm": 2.730909608534738, "learning_rate": 9.802761862967731e-06, "loss": 0.7252, "step": 395 }, { "epoch": 0.18340210912425492, "grad_norm": 2.9284254959639355, "learning_rate": 9.791475615277248e-06, "loss": 0.7453, "step": 400 }, { "epoch": 0.1856946354883081, "grad_norm": 2.790088757652803, "learning_rate": 9.779882247154419e-06, "loss": 0.7344, "step": 405 }, { "epoch": 0.1879871618523613, "grad_norm": 2.725250925456166, "learning_rate": 9.76798250170231e-06, "loss": 0.7246, "step": 410 }, { "epoch": 0.1902796882164145, "grad_norm": 2.667869321574359, "learning_rate": 9.755777141661937e-06, "loss": 0.7193, "step": 415 }, { "epoch": 0.19257221458046767, "grad_norm": 2.5119646512097997, "learning_rate": 9.743266949363368e-06, "loss": 0.7402, "step": 420 }, { "epoch": 0.19486474094452086, "grad_norm": 2.847215415311532, "learning_rate": 9.730452726675583e-06, "loss": 0.7173, "step": 425 }, { "epoch": 0.19715726730857405, "grad_norm": 2.779126735326216, "learning_rate": 9.717335294955078e-06, "loss": 0.7157, "step": 430 }, { "epoch": 0.19944979367262725, "grad_norm": 3.4561646981046454, "learning_rate": 9.703915494993215e-06, "loss": 0.7312, "step": 435 }, { "epoch": 0.2017423200366804, "grad_norm": 2.7730394910581913, "learning_rate": 9.690194186962326e-06, "loss": 0.7335, "step": 440 }, { "epoch": 0.2040348464007336, "grad_norm": 2.859201150645261, "learning_rate": 9.676172250360583e-06, "loss": 0.7383, "step": 445 }, { "epoch": 0.2063273727647868, "grad_norm": 2.9209175577350313, "learning_rate": 9.66185058395563e-06, "loss": 0.7263, "step": 450 }, { "epoch": 0.20861989912884, "grad_norm": 2.704547531489439, "learning_rate": 9.647230105726963e-06, "loss": 0.7143, "step": 455 }, { "epoch": 0.21091242549289316, "grad_norm": 2.670951446360455, "learning_rate": 9.632311752807097e-06, "loss": 0.7307, "step": 460 }, { "epoch": 0.21320495185694635, "grad_norm": 3.2268092839390485, "learning_rate": 9.617096481421498e-06, "loss": 0.6985, "step": 465 }, { "epoch": 0.21549747822099954, "grad_norm": 2.939723635315935, "learning_rate": 9.601585266827288e-06, "loss": 0.7181, "step": 470 }, { "epoch": 0.21779000458505274, "grad_norm": 2.7240300289732082, "learning_rate": 9.58577910325074e-06, "loss": 0.7079, "step": 475 }, { "epoch": 0.2200825309491059, "grad_norm": 2.7348057628577815, "learning_rate": 9.569679003823542e-06, "loss": 0.7063, "step": 480 }, { "epoch": 0.2223750573131591, "grad_norm": 2.6209148336683894, "learning_rate": 9.55328600051787e-06, "loss": 0.7019, "step": 485 }, { "epoch": 0.2246675836772123, "grad_norm": 2.7094717894075093, "learning_rate": 9.536601144080224e-06, "loss": 0.6933, "step": 490 }, { "epoch": 0.22696011004126548, "grad_norm": 2.6005478056383393, "learning_rate": 9.5196255039641e-06, "loss": 0.7008, "step": 495 }, { "epoch": 0.22925263640531865, "grad_norm": 2.9435017052734933, "learning_rate": 9.502360168261424e-06, "loss": 0.7168, "step": 500 }, { "epoch": 0.23154516276937184, "grad_norm": 15.281241231781962, "learning_rate": 9.48480624363281e-06, "loss": 0.6968, "step": 505 }, { "epoch": 0.23383768913342504, "grad_norm": 2.803746155734926, "learning_rate": 9.46696485523664e-06, "loss": 0.7176, "step": 510 }, { "epoch": 0.23613021549747823, "grad_norm": 2.9572910983459275, "learning_rate": 9.448837146656924e-06, "loss": 0.6983, "step": 515 }, { "epoch": 0.2384227418615314, "grad_norm": 2.66575290909559, "learning_rate": 9.430424279830014e-06, "loss": 0.679, "step": 520 }, { "epoch": 0.2407152682255846, "grad_norm": 2.6071015601683056, "learning_rate": 9.411727434970121e-06, "loss": 0.6796, "step": 525 }, { "epoch": 0.24300779458963778, "grad_norm": 2.6190152299969975, "learning_rate": 9.392747810493675e-06, "loss": 0.6922, "step": 530 }, { "epoch": 0.24530032095369098, "grad_norm": 2.9035286162764624, "learning_rate": 9.373486622942494e-06, "loss": 0.6881, "step": 535 }, { "epoch": 0.24759284731774414, "grad_norm": 2.722112266367375, "learning_rate": 9.353945106905822e-06, "loss": 0.691, "step": 540 }, { "epoch": 0.24988537368179733, "grad_norm": 2.8551591177378173, "learning_rate": 9.334124514941185e-06, "loss": 0.6786, "step": 545 }, { "epoch": 0.2521779000458505, "grad_norm": 2.789372421806793, "learning_rate": 9.314026117494116e-06, "loss": 0.6965, "step": 550 }, { "epoch": 0.2544704264099037, "grad_norm": 2.943178087845294, "learning_rate": 9.29365120281671e-06, "loss": 0.6734, "step": 555 }, { "epoch": 0.2567629527739569, "grad_norm": 2.9269593678262678, "learning_rate": 9.273001076885059e-06, "loss": 0.6567, "step": 560 }, { "epoch": 0.2590554791380101, "grad_norm": 2.7577714835234457, "learning_rate": 9.252077063315545e-06, "loss": 0.6628, "step": 565 }, { "epoch": 0.2613480055020633, "grad_norm": 2.595587224144848, "learning_rate": 9.230880503279991e-06, "loss": 0.6593, "step": 570 }, { "epoch": 0.26364053186611647, "grad_norm": 2.6421320876444425, "learning_rate": 9.209412755419703e-06, "loss": 0.6616, "step": 575 }, { "epoch": 0.26593305823016966, "grad_norm": 2.5889083746551487, "learning_rate": 9.18767519575838e-06, "loss": 0.6574, "step": 580 }, { "epoch": 0.26822558459422285, "grad_norm": 2.644361824371662, "learning_rate": 9.165669217613919e-06, "loss": 0.6631, "step": 585 }, { "epoch": 0.27051811095827605, "grad_norm": 2.7328270481402166, "learning_rate": 9.143396231509102e-06, "loss": 0.6591, "step": 590 }, { "epoch": 0.2728106373223292, "grad_norm": 2.6202953814608247, "learning_rate": 9.12085766508119e-06, "loss": 0.6465, "step": 595 }, { "epoch": 0.2751031636863824, "grad_norm": 2.688621083531908, "learning_rate": 9.098054962990415e-06, "loss": 0.6678, "step": 600 }, { "epoch": 0.2773956900504356, "grad_norm": 2.684577688850206, "learning_rate": 9.074989586827375e-06, "loss": 0.6478, "step": 605 }, { "epoch": 0.27968821641448877, "grad_norm": 2.6991742230220708, "learning_rate": 9.05166301501936e-06, "loss": 0.6575, "step": 610 }, { "epoch": 0.28198074277854196, "grad_norm": 2.8422733898390353, "learning_rate": 9.028076742735583e-06, "loss": 0.6606, "step": 615 }, { "epoch": 0.28427326914259515, "grad_norm": 3.3111069999457174, "learning_rate": 9.004232281791341e-06, "loss": 0.6501, "step": 620 }, { "epoch": 0.28656579550664835, "grad_norm": 2.8352207612326676, "learning_rate": 8.980131160551118e-06, "loss": 0.6497, "step": 625 }, { "epoch": 0.28885832187070154, "grad_norm": 2.622577509095012, "learning_rate": 8.955774923830618e-06, "loss": 0.6265, "step": 630 }, { "epoch": 0.2911508482347547, "grad_norm": 2.6180287881898363, "learning_rate": 8.931165132797747e-06, "loss": 0.6397, "step": 635 }, { "epoch": 0.29344337459880787, "grad_norm": 2.7463986227282713, "learning_rate": 8.906303364872545e-06, "loss": 0.6668, "step": 640 }, { "epoch": 0.29573590096286106, "grad_norm": 2.6468423935127254, "learning_rate": 8.881191213626084e-06, "loss": 0.6393, "step": 645 }, { "epoch": 0.29802842732691426, "grad_norm": 2.6005030935816245, "learning_rate": 8.855830288678311e-06, "loss": 0.644, "step": 650 }, { "epoch": 0.30032095369096745, "grad_norm": 2.7192686848560554, "learning_rate": 8.83022221559489e-06, "loss": 0.6479, "step": 655 }, { "epoch": 0.30261348005502064, "grad_norm": 2.673457233400223, "learning_rate": 8.804368635783002e-06, "loss": 0.6384, "step": 660 }, { "epoch": 0.30490600641907384, "grad_norm": 2.850654385793331, "learning_rate": 8.778271206386135e-06, "loss": 0.6456, "step": 665 }, { "epoch": 0.30719853278312703, "grad_norm": 2.6958806241423643, "learning_rate": 8.751931600177863e-06, "loss": 0.6025, "step": 670 }, { "epoch": 0.30949105914718017, "grad_norm": 2.764991202053115, "learning_rate": 8.725351505454631e-06, "loss": 0.6194, "step": 675 }, { "epoch": 0.31178358551123336, "grad_norm": 2.6590991144561906, "learning_rate": 8.69853262592754e-06, "loss": 0.6348, "step": 680 }, { "epoch": 0.31407611187528656, "grad_norm": 2.708732600879308, "learning_rate": 8.671476680613134e-06, "loss": 0.6411, "step": 685 }, { "epoch": 0.31636863823933975, "grad_norm": 2.5456418831079457, "learning_rate": 8.644185403723231e-06, "loss": 0.6138, "step": 690 }, { "epoch": 0.31866116460339294, "grad_norm": 2.903106819651818, "learning_rate": 8.616660544553754e-06, "loss": 0.6237, "step": 695 }, { "epoch": 0.32095369096744614, "grad_norm": 2.7280408027219942, "learning_rate": 8.588903867372607e-06, "loss": 0.6138, "step": 700 }, { "epoch": 0.32324621733149933, "grad_norm": 2.886662280669305, "learning_rate": 8.560917151306594e-06, "loss": 0.6066, "step": 705 }, { "epoch": 0.3255387436955525, "grad_norm": 2.6016420791711994, "learning_rate": 8.53270219022738e-06, "loss": 0.6126, "step": 710 }, { "epoch": 0.32783127005960566, "grad_norm": 2.5696831024854827, "learning_rate": 8.50426079263651e-06, "loss": 0.6191, "step": 715 }, { "epoch": 0.33012379642365886, "grad_norm": 2.789642739261612, "learning_rate": 8.475594781549483e-06, "loss": 0.6171, "step": 720 }, { "epoch": 0.33241632278771205, "grad_norm": 2.662350967821026, "learning_rate": 8.446705994378913e-06, "loss": 0.6262, "step": 725 }, { "epoch": 0.33470884915176524, "grad_norm": 2.749133969632543, "learning_rate": 8.417596282816742e-06, "loss": 0.6084, "step": 730 }, { "epoch": 0.33700137551581844, "grad_norm": 2.8389384155162736, "learning_rate": 8.388267512715565e-06, "loss": 0.6089, "step": 735 }, { "epoch": 0.33929390187987163, "grad_norm": 2.6423715957870115, "learning_rate": 8.358721563969027e-06, "loss": 0.5912, "step": 740 }, { "epoch": 0.3415864282439248, "grad_norm": 2.582427374014035, "learning_rate": 8.328960330391325e-06, "loss": 0.6015, "step": 745 }, { "epoch": 0.343878954607978, "grad_norm": 2.5641005198848763, "learning_rate": 8.298985719595824e-06, "loss": 0.6127, "step": 750 }, { "epoch": 0.34617148097203115, "grad_norm": 2.573968171901929, "learning_rate": 8.268799652872786e-06, "loss": 0.6108, "step": 755 }, { "epoch": 0.34846400733608435, "grad_norm": 2.555840575858041, "learning_rate": 8.23840406506621e-06, "loss": 0.6013, "step": 760 }, { "epoch": 0.35075653370013754, "grad_norm": 2.608505400595271, "learning_rate": 8.207800904449829e-06, "loss": 0.5868, "step": 765 }, { "epoch": 0.35304906006419073, "grad_norm": 2.564041005915397, "learning_rate": 8.176992132602221e-06, "loss": 0.5935, "step": 770 }, { "epoch": 0.3553415864282439, "grad_norm": 2.835188198766609, "learning_rate": 8.145979724281079e-06, "loss": 0.577, "step": 775 }, { "epoch": 0.3576341127922971, "grad_norm": 2.624154236961289, "learning_rate": 8.114765667296628e-06, "loss": 0.5807, "step": 780 }, { "epoch": 0.3599266391563503, "grad_norm": 2.803920892055745, "learning_rate": 8.083351962384234e-06, "loss": 0.5827, "step": 785 }, { "epoch": 0.3622191655204035, "grad_norm": 2.7453769474392438, "learning_rate": 8.051740623076132e-06, "loss": 0.5743, "step": 790 }, { "epoch": 0.36451169188445665, "grad_norm": 2.642012832230722, "learning_rate": 8.019933675572389e-06, "loss": 0.5924, "step": 795 }, { "epoch": 0.36680421824850984, "grad_norm": 2.5959618878893496, "learning_rate": 7.987933158611013e-06, "loss": 0.5765, "step": 800 }, { "epoch": 0.36909674461256303, "grad_norm": 2.6981842811728107, "learning_rate": 7.95574112333729e-06, "loss": 0.5636, "step": 805 }, { "epoch": 0.3713892709766162, "grad_norm": 2.7155825019244246, "learning_rate": 7.923359633172299e-06, "loss": 0.5676, "step": 810 }, { "epoch": 0.3736817973406694, "grad_norm": 2.722727252289237, "learning_rate": 7.890790763680658e-06, "loss": 0.5849, "step": 815 }, { "epoch": 0.3759743237047226, "grad_norm": 2.5941959497564073, "learning_rate": 7.85803660243749e-06, "loss": 0.582, "step": 820 }, { "epoch": 0.3782668500687758, "grad_norm": 2.448527666302428, "learning_rate": 7.8250992488946e-06, "loss": 0.586, "step": 825 }, { "epoch": 0.380559376432829, "grad_norm": 2.786081596311819, "learning_rate": 7.791980814245931e-06, "loss": 0.5547, "step": 830 }, { "epoch": 0.38285190279688214, "grad_norm": 2.6225345564151237, "learning_rate": 7.758683421292217e-06, "loss": 0.5562, "step": 835 }, { "epoch": 0.38514442916093533, "grad_norm": 2.495977821656378, "learning_rate": 7.72520920430493e-06, "loss": 0.5728, "step": 840 }, { "epoch": 0.3874369555249885, "grad_norm": 2.5523314447232535, "learning_rate": 7.691560308889478e-06, "loss": 0.5748, "step": 845 }, { "epoch": 0.3897294818890417, "grad_norm": 2.702511447586494, "learning_rate": 7.657738891847679e-06, "loss": 0.5651, "step": 850 }, { "epoch": 0.3920220082530949, "grad_norm": 2.6729070020445533, "learning_rate": 7.623747121039512e-06, "loss": 0.5716, "step": 855 }, { "epoch": 0.3943145346171481, "grad_norm": 2.7351708064638665, "learning_rate": 7.589587175244162e-06, "loss": 0.565, "step": 860 }, { "epoch": 0.3966070609812013, "grad_norm": 2.5916997954156638, "learning_rate": 7.555261244020371e-06, "loss": 0.5691, "step": 865 }, { "epoch": 0.3988995873452545, "grad_norm": 2.4806248685486407, "learning_rate": 7.520771527566093e-06, "loss": 0.5672, "step": 870 }, { "epoch": 0.40119211370930763, "grad_norm": 2.691711711440267, "learning_rate": 7.486120236577464e-06, "loss": 0.5555, "step": 875 }, { "epoch": 0.4034846400733608, "grad_norm": 2.6506103202422797, "learning_rate": 7.451309592107104e-06, "loss": 0.5548, "step": 880 }, { "epoch": 0.405777166437414, "grad_norm": 2.5210545941984983, "learning_rate": 7.416341825421755e-06, "loss": 0.573, "step": 885 }, { "epoch": 0.4080696928014672, "grad_norm": 2.7103495153803627, "learning_rate": 7.381219177859257e-06, "loss": 0.5428, "step": 890 }, { "epoch": 0.4103622191655204, "grad_norm": 2.5223081344987826, "learning_rate": 7.345943900684896e-06, "loss": 0.5605, "step": 895 }, { "epoch": 0.4126547455295736, "grad_norm": 2.5684242617186364, "learning_rate": 7.310518254947092e-06, "loss": 0.5432, "step": 900 }, { "epoch": 0.4149472718936268, "grad_norm": 2.8905063764239327, "learning_rate": 7.274944511332479e-06, "loss": 0.5355, "step": 905 }, { "epoch": 0.41723979825768, "grad_norm": 2.7288840976281543, "learning_rate": 7.239224950020359e-06, "loss": 0.5583, "step": 910 }, { "epoch": 0.4195323246217332, "grad_norm": 2.573090270715344, "learning_rate": 7.203361860536544e-06, "loss": 0.5528, "step": 915 }, { "epoch": 0.4218248509857863, "grad_norm": 2.7074335935753897, "learning_rate": 7.167357541606613e-06, "loss": 0.5457, "step": 920 }, { "epoch": 0.4241173773498395, "grad_norm": 2.6225623425429614, "learning_rate": 7.131214301008564e-06, "loss": 0.5405, "step": 925 }, { "epoch": 0.4264099037138927, "grad_norm": 2.638186367850455, "learning_rate": 7.094934455424889e-06, "loss": 0.5457, "step": 930 }, { "epoch": 0.4287024300779459, "grad_norm": 2.663625944879504, "learning_rate": 7.058520330294087e-06, "loss": 0.5499, "step": 935 }, { "epoch": 0.4309949564419991, "grad_norm": 2.594656111210185, "learning_rate": 7.021974259661607e-06, "loss": 0.5471, "step": 940 }, { "epoch": 0.4332874828060523, "grad_norm": 2.558300587882855, "learning_rate": 6.985298586030241e-06, "loss": 0.5465, "step": 945 }, { "epoch": 0.4355800091701055, "grad_norm": 2.6435075817238425, "learning_rate": 6.948495660209983e-06, "loss": 0.5331, "step": 950 }, { "epoch": 0.43787253553415867, "grad_norm": 2.494991656905618, "learning_rate": 6.9115678411673345e-06, "loss": 0.5371, "step": 955 }, { "epoch": 0.4401650618982118, "grad_norm": 2.4881542600695643, "learning_rate": 6.8745174958741164e-06, "loss": 0.5329, "step": 960 }, { "epoch": 0.442457588262265, "grad_norm": 2.552409503690461, "learning_rate": 6.837346999155743e-06, "loss": 0.532, "step": 965 }, { "epoch": 0.4447501146263182, "grad_norm": 2.4970182042863445, "learning_rate": 6.800058733539003e-06, "loss": 0.5376, "step": 970 }, { "epoch": 0.4470426409903714, "grad_norm": 2.468594629574796, "learning_rate": 6.762655089099353e-06, "loss": 0.513, "step": 975 }, { "epoch": 0.4493351673544246, "grad_norm": 2.5797501981324453, "learning_rate": 6.725138463307714e-06, "loss": 0.5408, "step": 980 }, { "epoch": 0.4516276937184778, "grad_norm": 2.8482359445979246, "learning_rate": 6.687511260876799e-06, "loss": 0.5189, "step": 985 }, { "epoch": 0.45392022008253097, "grad_norm": 2.6612518014120816, "learning_rate": 6.649775893606982e-06, "loss": 0.5318, "step": 990 }, { "epoch": 0.45621274644658416, "grad_norm": 2.5372082111080347, "learning_rate": 6.611934780231704e-06, "loss": 0.5076, "step": 995 }, { "epoch": 0.4585052728106373, "grad_norm": 2.4460238122171916, "learning_rate": 6.573990346262445e-06, "loss": 0.5028, "step": 1000 }, { "epoch": 0.4607977991746905, "grad_norm": 2.5523381259232747, "learning_rate": 6.535945023833249e-06, "loss": 0.5188, "step": 1005 }, { "epoch": 0.4630903255387437, "grad_norm": 2.6717883324323104, "learning_rate": 6.497801251544833e-06, "loss": 0.5137, "step": 1010 }, { "epoch": 0.4653828519027969, "grad_norm": 2.4441200104866763, "learning_rate": 6.459561474308278e-06, "loss": 0.513, "step": 1015 }, { "epoch": 0.4676753782668501, "grad_norm": 2.4626953473958046, "learning_rate": 6.421228143188325e-06, "loss": 0.5241, "step": 1020 }, { "epoch": 0.46996790463090327, "grad_norm": 2.414799048761899, "learning_rate": 6.382803715246254e-06, "loss": 0.5265, "step": 1025 }, { "epoch": 0.47226043099495646, "grad_norm": 2.661888186403354, "learning_rate": 6.344290653382408e-06, "loss": 0.5122, "step": 1030 }, { "epoch": 0.47455295735900965, "grad_norm": 2.705613301623184, "learning_rate": 6.305691426178316e-06, "loss": 0.5076, "step": 1035 }, { "epoch": 0.4768454837230628, "grad_norm": 2.5901180556298007, "learning_rate": 6.267008507738472e-06, "loss": 0.5309, "step": 1040 }, { "epoch": 0.479138010087116, "grad_norm": 2.5393961483789345, "learning_rate": 6.228244377531747e-06, "loss": 0.506, "step": 1045 }, { "epoch": 0.4814305364511692, "grad_norm": 2.5959034041763154, "learning_rate": 6.189401520232464e-06, "loss": 0.5065, "step": 1050 }, { "epoch": 0.48372306281522237, "grad_norm": 2.6419168193929963, "learning_rate": 6.150482425561135e-06, "loss": 0.5189, "step": 1055 }, { "epoch": 0.48601558917927556, "grad_norm": 2.58024430648069, "learning_rate": 6.11148958812488e-06, "loss": 0.5071, "step": 1060 }, { "epoch": 0.48830811554332876, "grad_norm": 2.4501378891077987, "learning_rate": 6.072425507257528e-06, "loss": 0.5033, "step": 1065 }, { "epoch": 0.49060064190738195, "grad_norm": 2.783006969507733, "learning_rate": 6.033292686859414e-06, "loss": 0.4955, "step": 1070 }, { "epoch": 0.49289316827143514, "grad_norm": 2.428894458608491, "learning_rate": 5.99409363523689e-06, "loss": 0.4973, "step": 1075 }, { "epoch": 0.4951856946354883, "grad_norm": 2.7389561374869342, "learning_rate": 5.9548308649415486e-06, "loss": 0.5051, "step": 1080 }, { "epoch": 0.4974782209995415, "grad_norm": 2.5456232835838124, "learning_rate": 5.91550689260917e-06, "loss": 0.4935, "step": 1085 }, { "epoch": 0.49977074736359467, "grad_norm": 2.6057045786417685, "learning_rate": 5.876124238798424e-06, "loss": 0.501, "step": 1090 }, { "epoch": 0.5020632737276479, "grad_norm": 2.4695060680872873, "learning_rate": 5.836685427829296e-06, "loss": 0.5032, "step": 1095 }, { "epoch": 0.504355800091701, "grad_norm": 2.3783397469941376, "learning_rate": 5.797192987621293e-06, "loss": 0.4985, "step": 1100 }, { "epoch": 0.5066483264557542, "grad_norm": 2.491153548859691, "learning_rate": 5.7576494495314105e-06, "loss": 0.5043, "step": 1105 }, { "epoch": 0.5089408528198074, "grad_norm": 2.6062141152111673, "learning_rate": 5.718057348191874e-06, "loss": 0.4868, "step": 1110 }, { "epoch": 0.5112333791838606, "grad_norm": 2.5012205713207405, "learning_rate": 5.678419221347687e-06, "loss": 0.4979, "step": 1115 }, { "epoch": 0.5135259055479138, "grad_norm": 2.609877005241944, "learning_rate": 5.638737609693953e-06, "loss": 0.495, "step": 1120 }, { "epoch": 0.515818431911967, "grad_norm": 2.684672446431491, "learning_rate": 5.599015056713037e-06, "loss": 0.4823, "step": 1125 }, { "epoch": 0.5181109582760202, "grad_norm": 2.4771534112729228, "learning_rate": 5.559254108511531e-06, "loss": 0.5016, "step": 1130 }, { "epoch": 0.5204034846400734, "grad_norm": 2.46810743209868, "learning_rate": 5.519457313657056e-06, "loss": 0.4896, "step": 1135 }, { "epoch": 0.5226960110041265, "grad_norm": 2.5795208204825983, "learning_rate": 5.479627223014902e-06, "loss": 0.4886, "step": 1140 }, { "epoch": 0.5249885373681797, "grad_norm": 2.434086073989824, "learning_rate": 5.439766389584527e-06, "loss": 0.4865, "step": 1145 }, { "epoch": 0.5272810637322329, "grad_norm": 2.4538097489169934, "learning_rate": 5.399877368335922e-06, "loss": 0.4914, "step": 1150 }, { "epoch": 0.5295735900962861, "grad_norm": 2.5415775013932063, "learning_rate": 5.359962716045836e-06, "loss": 0.4936, "step": 1155 }, { "epoch": 0.5318661164603393, "grad_norm": 2.56697946552087, "learning_rate": 5.3200249911338986e-06, "loss": 0.4894, "step": 1160 }, { "epoch": 0.5341586428243925, "grad_norm": 2.572922499741503, "learning_rate": 5.280066753498632e-06, "loss": 0.4794, "step": 1165 }, { "epoch": 0.5364511691884457, "grad_norm": 2.623599926005301, "learning_rate": 5.240090564353365e-06, "loss": 0.4959, "step": 1170 }, { "epoch": 0.5387436955524989, "grad_norm": 2.4231120561633324, "learning_rate": 5.200098986062072e-06, "loss": 0.4753, "step": 1175 }, { "epoch": 0.5410362219165521, "grad_norm": 2.5196186316057108, "learning_rate": 5.160094581975127e-06, "loss": 0.4783, "step": 1180 }, { "epoch": 0.5433287482806052, "grad_norm": 2.527690400984075, "learning_rate": 5.1200799162650035e-06, "loss": 0.4916, "step": 1185 }, { "epoch": 0.5456212746446584, "grad_norm": 2.6015322908629415, "learning_rate": 5.080057553761917e-06, "loss": 0.4738, "step": 1190 }, { "epoch": 0.5479138010087116, "grad_norm": 2.3467602506879786, "learning_rate": 5.040030059789426e-06, "loss": 0.476, "step": 1195 }, { "epoch": 0.5502063273727648, "grad_norm": 2.570425940808593, "learning_rate": 5e-06, "loss": 0.4903, "step": 1200 }, { "epoch": 0.552498853736818, "grad_norm": 2.5543989632263284, "learning_rate": 4.9599699402105755e-06, "loss": 0.4673, "step": 1205 }, { "epoch": 0.5547913801008711, "grad_norm": 2.5213973685823277, "learning_rate": 4.919942446238085e-06, "loss": 0.4693, "step": 1210 }, { "epoch": 0.5570839064649243, "grad_norm": 2.4952425404718075, "learning_rate": 4.879920083734997e-06, "loss": 0.4692, "step": 1215 }, { "epoch": 0.5593764328289775, "grad_norm": 2.5419193115674776, "learning_rate": 4.839905418024875e-06, "loss": 0.4814, "step": 1220 }, { "epoch": 0.5616689591930307, "grad_norm": 2.558303192571574, "learning_rate": 4.7999010139379295e-06, "loss": 0.4698, "step": 1225 }, { "epoch": 0.5639614855570839, "grad_norm": 2.4678859101946315, "learning_rate": 4.759909435646636e-06, "loss": 0.4896, "step": 1230 }, { "epoch": 0.5662540119211371, "grad_norm": 2.6716519633665783, "learning_rate": 4.719933246501369e-06, "loss": 0.4852, "step": 1235 }, { "epoch": 0.5685465382851903, "grad_norm": 2.4330925797194807, "learning_rate": 4.679975008866103e-06, "loss": 0.4554, "step": 1240 }, { "epoch": 0.5708390646492435, "grad_norm": 2.437937005459216, "learning_rate": 4.640037283954165e-06, "loss": 0.4598, "step": 1245 }, { "epoch": 0.5731315910132967, "grad_norm": 2.413361545021729, "learning_rate": 4.6001226316640804e-06, "loss": 0.4739, "step": 1250 }, { "epoch": 0.5754241173773499, "grad_norm": 2.3552453394422503, "learning_rate": 4.5602336104154745e-06, "loss": 0.4646, "step": 1255 }, { "epoch": 0.5777166437414031, "grad_norm": 2.623470049632146, "learning_rate": 4.520372776985101e-06, "loss": 0.4579, "step": 1260 }, { "epoch": 0.5800091701054562, "grad_norm": 2.4219278336672874, "learning_rate": 4.480542686342946e-06, "loss": 0.4613, "step": 1265 }, { "epoch": 0.5823016964695094, "grad_norm": 2.517369439139374, "learning_rate": 4.440745891488471e-06, "loss": 0.4523, "step": 1270 }, { "epoch": 0.5845942228335625, "grad_norm": 2.501700820037027, "learning_rate": 4.400984943286965e-06, "loss": 0.4671, "step": 1275 }, { "epoch": 0.5868867491976157, "grad_norm": 2.4011689731614605, "learning_rate": 4.361262390306049e-06, "loss": 0.4527, "step": 1280 }, { "epoch": 0.5891792755616689, "grad_norm": 2.5994696717863706, "learning_rate": 4.321580778652316e-06, "loss": 0.4493, "step": 1285 }, { "epoch": 0.5914718019257221, "grad_norm": 2.491956972995198, "learning_rate": 4.2819426518081265e-06, "loss": 0.456, "step": 1290 }, { "epoch": 0.5937643282897753, "grad_norm": 2.4353572047335996, "learning_rate": 4.2423505504685894e-06, "loss": 0.4611, "step": 1295 }, { "epoch": 0.5960568546538285, "grad_norm": 2.4904358458702944, "learning_rate": 4.202807012378707e-06, "loss": 0.4546, "step": 1300 }, { "epoch": 0.5983493810178817, "grad_norm": 2.4617619082762636, "learning_rate": 4.163314572170704e-06, "loss": 0.458, "step": 1305 }, { "epoch": 0.6006419073819349, "grad_norm": 2.354023280982333, "learning_rate": 4.123875761201576e-06, "loss": 0.4433, "step": 1310 }, { "epoch": 0.6029344337459881, "grad_norm": 2.540723557518342, "learning_rate": 4.08449310739083e-06, "loss": 0.4484, "step": 1315 }, { "epoch": 0.6052269601100413, "grad_norm": 2.4043887566981446, "learning_rate": 4.045169135058452e-06, "loss": 0.4416, "step": 1320 }, { "epoch": 0.6075194864740945, "grad_norm": 2.481355244310724, "learning_rate": 4.0059063647631105e-06, "loss": 0.4645, "step": 1325 }, { "epoch": 0.6098120128381477, "grad_norm": 2.499493147862873, "learning_rate": 3.966707313140587e-06, "loss": 0.4542, "step": 1330 }, { "epoch": 0.6121045392022009, "grad_norm": 2.5034183191594477, "learning_rate": 3.927574492742473e-06, "loss": 0.4465, "step": 1335 }, { "epoch": 0.6143970655662541, "grad_norm": 2.450159706952634, "learning_rate": 3.888510411875121e-06, "loss": 0.4451, "step": 1340 }, { "epoch": 0.6166895919303071, "grad_norm": 2.437273107870038, "learning_rate": 3.849517574438866e-06, "loss": 0.4393, "step": 1345 }, { "epoch": 0.6189821182943603, "grad_norm": 2.4867270897195164, "learning_rate": 3.8105984797675364e-06, "loss": 0.4369, "step": 1350 }, { "epoch": 0.6212746446584135, "grad_norm": 2.4474532182002156, "learning_rate": 3.771755622468254e-06, "loss": 0.4459, "step": 1355 }, { "epoch": 0.6235671710224667, "grad_norm": 2.3883568752400737, "learning_rate": 3.7329914922615283e-06, "loss": 0.4414, "step": 1360 }, { "epoch": 0.6258596973865199, "grad_norm": 2.323604786191338, "learning_rate": 3.6943085738216855e-06, "loss": 0.4294, "step": 1365 }, { "epoch": 0.6281522237505731, "grad_norm": 2.5364327673030553, "learning_rate": 3.655709346617593e-06, "loss": 0.4482, "step": 1370 }, { "epoch": 0.6304447501146263, "grad_norm": 2.528211312039227, "learning_rate": 3.6171962847537466e-06, "loss": 0.4483, "step": 1375 }, { "epoch": 0.6327372764786795, "grad_norm": 2.4014535334880533, "learning_rate": 3.5787718568116764e-06, "loss": 0.4479, "step": 1380 }, { "epoch": 0.6350298028427327, "grad_norm": 2.6961239350559687, "learning_rate": 3.540438525691723e-06, "loss": 0.4375, "step": 1385 }, { "epoch": 0.6373223292067859, "grad_norm": 2.4568407427026027, "learning_rate": 3.502198748455169e-06, "loss": 0.4461, "step": 1390 }, { "epoch": 0.6396148555708391, "grad_norm": 2.444432290321262, "learning_rate": 3.464054976166753e-06, "loss": 0.4409, "step": 1395 }, { "epoch": 0.6419073819348923, "grad_norm": 2.3930367223498927, "learning_rate": 3.4260096537375553e-06, "loss": 0.433, "step": 1400 }, { "epoch": 0.6441999082989455, "grad_norm": 2.431394532574176, "learning_rate": 3.3880652197682974e-06, "loss": 0.4229, "step": 1405 }, { "epoch": 0.6464924346629987, "grad_norm": 2.434581693659057, "learning_rate": 3.3502241063930196e-06, "loss": 0.4389, "step": 1410 }, { "epoch": 0.6487849610270519, "grad_norm": 2.3993499417107156, "learning_rate": 3.3124887391232026e-06, "loss": 0.4219, "step": 1415 }, { "epoch": 0.651077487391105, "grad_norm": 2.476740652860741, "learning_rate": 3.2748615366922864e-06, "loss": 0.427, "step": 1420 }, { "epoch": 0.6533700137551581, "grad_norm": 2.507048548706466, "learning_rate": 3.2373449109006476e-06, "loss": 0.4341, "step": 1425 }, { "epoch": 0.6556625401192113, "grad_norm": 2.418497030941838, "learning_rate": 3.1999412664609986e-06, "loss": 0.4329, "step": 1430 }, { "epoch": 0.6579550664832645, "grad_norm": 2.4312888314629144, "learning_rate": 3.162653000844259e-06, "loss": 0.4227, "step": 1435 }, { "epoch": 0.6602475928473177, "grad_norm": 2.353877004261892, "learning_rate": 3.1254825041258852e-06, "loss": 0.4302, "step": 1440 }, { "epoch": 0.6625401192113709, "grad_norm": 2.381814531488306, "learning_rate": 3.0884321588326668e-06, "loss": 0.4376, "step": 1445 }, { "epoch": 0.6648326455754241, "grad_norm": 2.4501307973874287, "learning_rate": 3.051504339790019e-06, "loss": 0.4254, "step": 1450 }, { "epoch": 0.6671251719394773, "grad_norm": 2.459251255110059, "learning_rate": 3.0147014139697596e-06, "loss": 0.4263, "step": 1455 }, { "epoch": 0.6694176983035305, "grad_norm": 2.5254030222294466, "learning_rate": 2.978025740338396e-06, "loss": 0.4195, "step": 1460 }, { "epoch": 0.6717102246675837, "grad_norm": 2.2951603398964235, "learning_rate": 2.9414796697059155e-06, "loss": 0.4129, "step": 1465 }, { "epoch": 0.6740027510316369, "grad_norm": 2.364236291272217, "learning_rate": 2.905065544575114e-06, "loss": 0.4197, "step": 1470 }, { "epoch": 0.6762952773956901, "grad_norm": 2.4601102682369205, "learning_rate": 2.8687856989914393e-06, "loss": 0.4234, "step": 1475 }, { "epoch": 0.6785878037597433, "grad_norm": 2.686432591416178, "learning_rate": 2.8326424583933878e-06, "loss": 0.4223, "step": 1480 }, { "epoch": 0.6808803301237965, "grad_norm": 2.3448228852350788, "learning_rate": 2.796638139463456e-06, "loss": 0.4149, "step": 1485 }, { "epoch": 0.6831728564878496, "grad_norm": 2.317745266155718, "learning_rate": 2.7607750499796426e-06, "loss": 0.4161, "step": 1490 }, { "epoch": 0.6854653828519028, "grad_norm": 2.3719922106424725, "learning_rate": 2.725055488667522e-06, "loss": 0.4275, "step": 1495 }, { "epoch": 0.687757909215956, "grad_norm": 2.4553896347366746, "learning_rate": 2.689481745052908e-06, "loss": 0.3954, "step": 1500 }, { "epoch": 0.6900504355800092, "grad_norm": 2.471280707724599, "learning_rate": 2.6540560993151045e-06, "loss": 0.408, "step": 1505 }, { "epoch": 0.6923429619440623, "grad_norm": 2.375550619652342, "learning_rate": 2.6187808221407433e-06, "loss": 0.4091, "step": 1510 }, { "epoch": 0.6946354883081155, "grad_norm": 2.3794291144670865, "learning_rate": 2.5836581745782474e-06, "loss": 0.4203, "step": 1515 }, { "epoch": 0.6969280146721687, "grad_norm": 2.3959254909604133, "learning_rate": 2.5486904078928954e-06, "loss": 0.4019, "step": 1520 }, { "epoch": 0.6992205410362219, "grad_norm": 2.4572132670378593, "learning_rate": 2.5138797634225358e-06, "loss": 0.4025, "step": 1525 }, { "epoch": 0.7015130674002751, "grad_norm": 2.567664513817577, "learning_rate": 2.4792284724339077e-06, "loss": 0.4096, "step": 1530 }, { "epoch": 0.7038055937643283, "grad_norm": 2.473854002398598, "learning_rate": 2.4447387559796306e-06, "loss": 0.4129, "step": 1535 }, { "epoch": 0.7060981201283815, "grad_norm": 2.2347261984430844, "learning_rate": 2.410412824755839e-06, "loss": 0.4147, "step": 1540 }, { "epoch": 0.7083906464924347, "grad_norm": 2.45007211279529, "learning_rate": 2.3762528789604887e-06, "loss": 0.4159, "step": 1545 }, { "epoch": 0.7106831728564879, "grad_norm": 2.57319881552059, "learning_rate": 2.3422611081523215e-06, "loss": 0.4044, "step": 1550 }, { "epoch": 0.712975699220541, "grad_norm": 2.40694698697041, "learning_rate": 2.3084396911105233e-06, "loss": 0.3888, "step": 1555 }, { "epoch": 0.7152682255845942, "grad_norm": 2.6193951641238837, "learning_rate": 2.274790795695071e-06, "loss": 0.4186, "step": 1560 }, { "epoch": 0.7175607519486474, "grad_norm": 2.3915420788033686, "learning_rate": 2.2413165787077844e-06, "loss": 0.4105, "step": 1565 }, { "epoch": 0.7198532783127006, "grad_norm": 2.4922945082662706, "learning_rate": 2.20801918575407e-06, "loss": 0.41, "step": 1570 }, { "epoch": 0.7221458046767538, "grad_norm": 2.361018492853961, "learning_rate": 2.1749007511054005e-06, "loss": 0.4075, "step": 1575 }, { "epoch": 0.724438331040807, "grad_norm": 2.453915782459234, "learning_rate": 2.1419633975625113e-06, "loss": 0.4123, "step": 1580 }, { "epoch": 0.7267308574048602, "grad_norm": 2.2558599145275458, "learning_rate": 2.109209236319342e-06, "loss": 0.3971, "step": 1585 }, { "epoch": 0.7290233837689133, "grad_norm": 2.3942282865574103, "learning_rate": 2.076640366827703e-06, "loss": 0.4012, "step": 1590 }, { "epoch": 0.7313159101329665, "grad_norm": 2.4100293351001714, "learning_rate": 2.04425887666271e-06, "loss": 0.3926, "step": 1595 }, { "epoch": 0.7336084364970197, "grad_norm": 2.5693096989442927, "learning_rate": 2.0120668413889877e-06, "loss": 0.4021, "step": 1600 }, { "epoch": 0.7359009628610729, "grad_norm": 2.513834629858347, "learning_rate": 1.980066324427613e-06, "loss": 0.3926, "step": 1605 }, { "epoch": 0.7381934892251261, "grad_norm": 2.500502153829468, "learning_rate": 1.9482593769238695e-06, "loss": 0.3932, "step": 1610 }, { "epoch": 0.7404860155891793, "grad_norm": 2.2943690678553827, "learning_rate": 1.916648037615767e-06, "loss": 0.3961, "step": 1615 }, { "epoch": 0.7427785419532325, "grad_norm": 2.4947450845729904, "learning_rate": 1.8852343327033717e-06, "loss": 0.3918, "step": 1620 }, { "epoch": 0.7450710683172856, "grad_norm": 2.475640064869192, "learning_rate": 1.854020275718924e-06, "loss": 0.3953, "step": 1625 }, { "epoch": 0.7473635946813388, "grad_norm": 2.380898479266151, "learning_rate": 1.8230078673977802e-06, "loss": 0.3767, "step": 1630 }, { "epoch": 0.749656121045392, "grad_norm": 2.3124836007659444, "learning_rate": 1.7921990955501705e-06, "loss": 0.386, "step": 1635 }, { "epoch": 0.7519486474094452, "grad_norm": 2.3942291132445375, "learning_rate": 1.7615959349337914e-06, "loss": 0.3964, "step": 1640 }, { "epoch": 0.7542411737734984, "grad_norm": 2.4125792225674614, "learning_rate": 1.731200347127217e-06, "loss": 0.3918, "step": 1645 }, { "epoch": 0.7565337001375516, "grad_norm": 2.4570540617910788, "learning_rate": 1.7010142804041785e-06, "loss": 0.4012, "step": 1650 }, { "epoch": 0.7588262265016048, "grad_norm": 2.3060832536528006, "learning_rate": 1.6710396696086768e-06, "loss": 0.4026, "step": 1655 }, { "epoch": 0.761118752865658, "grad_norm": 2.357410070095031, "learning_rate": 1.6412784360309753e-06, "loss": 0.3876, "step": 1660 }, { "epoch": 0.7634112792297112, "grad_norm": 2.5569987658890434, "learning_rate": 1.611732487284437e-06, "loss": 0.3875, "step": 1665 }, { "epoch": 0.7657038055937643, "grad_norm": 2.5367416684876805, "learning_rate": 1.5824037171832595e-06, "loss": 0.3923, "step": 1670 }, { "epoch": 0.7679963319578175, "grad_norm": 2.370553404803813, "learning_rate": 1.5532940056210882e-06, "loss": 0.3916, "step": 1675 }, { "epoch": 0.7702888583218707, "grad_norm": 2.445473374507484, "learning_rate": 1.524405218450517e-06, "loss": 0.4005, "step": 1680 }, { "epoch": 0.7725813846859239, "grad_norm": 2.416383451707918, "learning_rate": 1.4957392073634912e-06, "loss": 0.385, "step": 1685 }, { "epoch": 0.774873911049977, "grad_norm": 2.4307180782279976, "learning_rate": 1.4672978097726204e-06, "loss": 0.3857, "step": 1690 }, { "epoch": 0.7771664374140302, "grad_norm": 2.4572760495599795, "learning_rate": 1.439082848693406e-06, "loss": 0.3916, "step": 1695 }, { "epoch": 0.7794589637780834, "grad_norm": 2.408412846059606, "learning_rate": 1.4110961326273936e-06, "loss": 0.3908, "step": 1700 }, { "epoch": 0.7817514901421366, "grad_norm": 2.6601098763821596, "learning_rate": 1.3833394554462477e-06, "loss": 0.3859, "step": 1705 }, { "epoch": 0.7840440165061898, "grad_norm": 2.520675032421566, "learning_rate": 1.35581459627677e-06, "loss": 0.3936, "step": 1710 }, { "epoch": 0.786336542870243, "grad_norm": 2.257467358094596, "learning_rate": 1.3285233193868663e-06, "loss": 0.3799, "step": 1715 }, { "epoch": 0.7886290692342962, "grad_norm": 2.327829634660073, "learning_rate": 1.3014673740724615e-06, "loss": 0.3876, "step": 1720 }, { "epoch": 0.7909215955983494, "grad_norm": 2.366347981314184, "learning_rate": 1.2746484945453691e-06, "loss": 0.3829, "step": 1725 }, { "epoch": 0.7932141219624026, "grad_norm": 2.391058577508851, "learning_rate": 1.2480683998221365e-06, "loss": 0.3825, "step": 1730 }, { "epoch": 0.7955066483264558, "grad_norm": 2.470899547865623, "learning_rate": 1.221728793613865e-06, "loss": 0.3895, "step": 1735 }, { "epoch": 0.797799174690509, "grad_norm": 2.399551521415764, "learning_rate": 1.1956313642169974e-06, "loss": 0.3846, "step": 1740 }, { "epoch": 0.8000917010545622, "grad_norm": 2.463312952219633, "learning_rate": 1.1697777844051105e-06, "loss": 0.3788, "step": 1745 }, { "epoch": 0.8023842274186153, "grad_norm": 2.4348320894873092, "learning_rate": 1.1441697113216893e-06, "loss": 0.3803, "step": 1750 }, { "epoch": 0.8046767537826685, "grad_norm": 2.385545108416876, "learning_rate": 1.1188087863739173e-06, "loss": 0.3859, "step": 1755 }, { "epoch": 0.8069692801467216, "grad_norm": 2.4484362721344195, "learning_rate": 1.0936966351274554e-06, "loss": 0.3739, "step": 1760 }, { "epoch": 0.8092618065107748, "grad_norm": 2.4361451039130317, "learning_rate": 1.0688348672022547e-06, "loss": 0.4012, "step": 1765 }, { "epoch": 0.811554332874828, "grad_norm": 2.5671935693516192, "learning_rate": 1.0442250761693829e-06, "loss": 0.3717, "step": 1770 }, { "epoch": 0.8138468592388812, "grad_norm": 2.3910678127476475, "learning_rate": 1.0198688394488837e-06, "loss": 0.3824, "step": 1775 }, { "epoch": 0.8161393856029344, "grad_norm": 2.4337476998865237, "learning_rate": 9.957677182086611e-07, "loss": 0.3754, "step": 1780 }, { "epoch": 0.8184319119669876, "grad_norm": 2.3930303860053055, "learning_rate": 9.719232572644189e-07, "loss": 0.3814, "step": 1785 }, { "epoch": 0.8207244383310408, "grad_norm": 2.4070725664187194, "learning_rate": 9.483369849806401e-07, "loss": 0.3681, "step": 1790 }, { "epoch": 0.823016964695094, "grad_norm": 2.4234654890940277, "learning_rate": 9.250104131726256e-07, "loss": 0.3748, "step": 1795 }, { "epoch": 0.8253094910591472, "grad_norm": 2.4405075201633486, "learning_rate": 9.019450370095867e-07, "loss": 0.3852, "step": 1800 }, { "epoch": 0.8276020174232004, "grad_norm": 2.4157009817816535, "learning_rate": 8.791423349188111e-07, "loss": 0.3738, "step": 1805 }, { "epoch": 0.8298945437872536, "grad_norm": 2.3817117068747695, "learning_rate": 8.566037684908985e-07, "loss": 0.3774, "step": 1810 }, { "epoch": 0.8321870701513068, "grad_norm": 2.643862606121901, "learning_rate": 8.343307823860819e-07, "loss": 0.3747, "step": 1815 }, { "epoch": 0.83447959651536, "grad_norm": 2.415451666660326, "learning_rate": 8.123248042416209e-07, "loss": 0.3807, "step": 1820 }, { "epoch": 0.8367721228794132, "grad_norm": 2.367699275816763, "learning_rate": 7.905872445802976e-07, "loss": 0.3819, "step": 1825 }, { "epoch": 0.8390646492434664, "grad_norm": 2.401428866906129, "learning_rate": 7.691194967200099e-07, "loss": 0.3773, "step": 1830 }, { "epoch": 0.8413571756075194, "grad_norm": 2.3851132017870444, "learning_rate": 7.47922936684457e-07, "loss": 0.3848, "step": 1835 }, { "epoch": 0.8436497019715726, "grad_norm": 2.334920050986847, "learning_rate": 7.269989231149432e-07, "loss": 0.3646, "step": 1840 }, { "epoch": 0.8459422283356258, "grad_norm": 2.302533584527464, "learning_rate": 7.063487971832922e-07, "loss": 0.3719, "step": 1845 }, { "epoch": 0.848234754699679, "grad_norm": 2.4631469089449443, "learning_rate": 6.85973882505886e-07, "loss": 0.3951, "step": 1850 }, { "epoch": 0.8505272810637322, "grad_norm": 2.4860937019904426, "learning_rate": 6.658754850588161e-07, "loss": 0.3877, "step": 1855 }, { "epoch": 0.8528198074277854, "grad_norm": 2.366824744001058, "learning_rate": 6.460548930941801e-07, "loss": 0.3711, "step": 1860 }, { "epoch": 0.8551123337918386, "grad_norm": 2.587488334709295, "learning_rate": 6.265133770575066e-07, "loss": 0.366, "step": 1865 }, { "epoch": 0.8574048601558918, "grad_norm": 2.4606917803825072, "learning_rate": 6.072521895063255e-07, "loss": 0.3818, "step": 1870 }, { "epoch": 0.859697386519945, "grad_norm": 2.4967563072720576, "learning_rate": 5.882725650298787e-07, "loss": 0.3804, "step": 1875 }, { "epoch": 0.8619899128839982, "grad_norm": 2.4902108475668214, "learning_rate": 5.695757201699875e-07, "loss": 0.3751, "step": 1880 }, { "epoch": 0.8642824392480514, "grad_norm": 2.3545990508632713, "learning_rate": 5.511628533430769e-07, "loss": 0.3887, "step": 1885 }, { "epoch": 0.8665749656121046, "grad_norm": 2.4583864322248363, "learning_rate": 5.330351447633603e-07, "loss": 0.3846, "step": 1890 }, { "epoch": 0.8688674919761578, "grad_norm": 2.558178264129578, "learning_rate": 5.151937563671889e-07, "loss": 0.3761, "step": 1895 }, { "epoch": 0.871160018340211, "grad_norm": 2.4125538046249133, "learning_rate": 4.976398317385767e-07, "loss": 0.3789, "step": 1900 }, { "epoch": 0.8734525447042641, "grad_norm": 2.5261586438137718, "learning_rate": 4.803744960358992e-07, "loss": 0.3692, "step": 1905 }, { "epoch": 0.8757450710683173, "grad_norm": 2.5343814063203913, "learning_rate": 4.633988559197761e-07, "loss": 0.3741, "step": 1910 }, { "epoch": 0.8780375974323704, "grad_norm": 2.5455270767430305, "learning_rate": 4.4671399948213233e-07, "loss": 0.3742, "step": 1915 }, { "epoch": 0.8803301237964236, "grad_norm": 2.4299267640638442, "learning_rate": 4.3032099617645874e-07, "loss": 0.3793, "step": 1920 }, { "epoch": 0.8826226501604768, "grad_norm": 2.5350282869807215, "learning_rate": 4.1422089674926113e-07, "loss": 0.3708, "step": 1925 }, { "epoch": 0.88491517652453, "grad_norm": 2.4052098639642745, "learning_rate": 3.984147331727128e-07, "loss": 0.3815, "step": 1930 }, { "epoch": 0.8872077028885832, "grad_norm": 2.440029806154777, "learning_rate": 3.829035185785035e-07, "loss": 0.3559, "step": 1935 }, { "epoch": 0.8895002292526364, "grad_norm": 2.4757422584836783, "learning_rate": 3.676882471929044e-07, "loss": 0.3724, "step": 1940 }, { "epoch": 0.8917927556166896, "grad_norm": 2.405181037542438, "learning_rate": 3.527698942730384e-07, "loss": 0.3678, "step": 1945 }, { "epoch": 0.8940852819807428, "grad_norm": 2.477077740628022, "learning_rate": 3.3814941604437155e-07, "loss": 0.3696, "step": 1950 }, { "epoch": 0.896377808344796, "grad_norm": 2.594645970360135, "learning_rate": 3.2382774963941823e-07, "loss": 0.3689, "step": 1955 }, { "epoch": 0.8986703347088492, "grad_norm": 2.4996924524050526, "learning_rate": 3.0980581303767576e-07, "loss": 0.3641, "step": 1960 }, { "epoch": 0.9009628610729024, "grad_norm": 2.5351766412057364, "learning_rate": 2.9608450500678566e-07, "loss": 0.3736, "step": 1965 }, { "epoch": 0.9032553874369555, "grad_norm": 2.4812985119515374, "learning_rate": 2.826647050449216e-07, "loss": 0.3652, "step": 1970 }, { "epoch": 0.9055479138010087, "grad_norm": 2.4498300583099506, "learning_rate": 2.69547273324417e-07, "loss": 0.3653, "step": 1975 }, { "epoch": 0.9078404401650619, "grad_norm": 2.546961383266402, "learning_rate": 2.5673305063663335e-07, "loss": 0.3723, "step": 1980 }, { "epoch": 0.9101329665291151, "grad_norm": 2.34777660611532, "learning_rate": 2.442228583380646e-07, "loss": 0.3596, "step": 1985 }, { "epoch": 0.9124254928931683, "grad_norm": 2.410870301241545, "learning_rate": 2.3201749829769083e-07, "loss": 0.3783, "step": 1990 }, { "epoch": 0.9147180192572214, "grad_norm": 2.519326020963244, "learning_rate": 2.201177528455828e-07, "loss": 0.3739, "step": 1995 }, { "epoch": 0.9170105456212746, "grad_norm": 2.4872058403028574, "learning_rate": 2.085243847227525e-07, "loss": 0.3768, "step": 2000 }, { "epoch": 0.9193030719853278, "grad_norm": 2.4175176965392544, "learning_rate": 1.9723813703227013e-07, "loss": 0.3794, "step": 2005 }, { "epoch": 0.921595598349381, "grad_norm": 2.514035461894725, "learning_rate": 1.8625973319162605e-07, "loss": 0.3656, "step": 2010 }, { "epoch": 0.9238881247134342, "grad_norm": 2.4532676789082166, "learning_rate": 1.7558987688636675e-07, "loss": 0.361, "step": 2015 }, { "epoch": 0.9261806510774874, "grad_norm": 2.580005311393483, "learning_rate": 1.652292520249865e-07, "loss": 0.369, "step": 2020 }, { "epoch": 0.9284731774415406, "grad_norm": 2.359368965829793, "learning_rate": 1.5517852269509692e-07, "loss": 0.3571, "step": 2025 }, { "epoch": 0.9307657038055938, "grad_norm": 2.4993672807867178, "learning_rate": 1.4543833312085365e-07, "loss": 0.3588, "step": 2030 }, { "epoch": 0.933058230169647, "grad_norm": 2.41149322411576, "learning_rate": 1.360093076216673e-07, "loss": 0.3705, "step": 2035 }, { "epoch": 0.9353507565337001, "grad_norm": 2.474736948512413, "learning_rate": 1.2689205057218602e-07, "loss": 0.361, "step": 2040 }, { "epoch": 0.9376432828977533, "grad_norm": 2.3336360044904736, "learning_rate": 1.1808714636355634e-07, "loss": 0.3568, "step": 2045 }, { "epoch": 0.9399358092618065, "grad_norm": 2.566200023951429, "learning_rate": 1.0959515936596387e-07, "loss": 0.3783, "step": 2050 }, { "epoch": 0.9422283356258597, "grad_norm": 2.5160190954507264, "learning_rate": 1.014166338924627e-07, "loss": 0.372, "step": 2055 }, { "epoch": 0.9445208619899129, "grad_norm": 2.509256348018165, "learning_rate": 9.355209416408051e-08, "loss": 0.3853, "step": 2060 }, { "epoch": 0.9468133883539661, "grad_norm": 2.5224442995349152, "learning_rate": 8.600204427622438e-08, "loss": 0.365, "step": 2065 }, { "epoch": 0.9491059147180193, "grad_norm": 2.4001792608745602, "learning_rate": 7.876696816636276e-08, "loss": 0.3736, "step": 2070 }, { "epoch": 0.9513984410820725, "grad_norm": 2.4422332203602553, "learning_rate": 7.184732958301078e-08, "loss": 0.3651, "step": 2075 }, { "epoch": 0.9536909674461256, "grad_norm": 2.471890892444078, "learning_rate": 6.524357205600518e-08, "loss": 0.3624, "step": 2080 }, { "epoch": 0.9559834938101788, "grad_norm": 2.523417346804641, "learning_rate": 5.895611886807317e-08, "loss": 0.369, "step": 2085 }, { "epoch": 0.958276020174232, "grad_norm": 2.4584360575665776, "learning_rate": 5.2985373027702455e-08, "loss": 0.363, "step": 2090 }, { "epoch": 0.9605685465382852, "grad_norm": 2.467603595232153, "learning_rate": 4.733171724330854e-08, "loss": 0.3814, "step": 2095 }, { "epoch": 0.9628610729023384, "grad_norm": 2.5238201533198072, "learning_rate": 4.19955138987066e-08, "loss": 0.369, "step": 2100 }, { "epoch": 0.9651535992663915, "grad_norm": 2.5600424647957807, "learning_rate": 3.697710502988006e-08, "loss": 0.3652, "step": 2105 }, { "epoch": 0.9674461256304447, "grad_norm": 2.475992842961113, "learning_rate": 3.2276812303060346e-08, "loss": 0.3741, "step": 2110 }, { "epoch": 0.9697386519944979, "grad_norm": 2.4735410644370606, "learning_rate": 2.7894936994106724e-08, "loss": 0.3571, "step": 2115 }, { "epoch": 0.9720311783585511, "grad_norm": 2.384962513457078, "learning_rate": 2.383175996919673e-08, "loss": 0.3654, "step": 2120 }, { "epoch": 0.9743237047226043, "grad_norm": 2.4369560907719414, "learning_rate": 2.008754166682225e-08, "loss": 0.3614, "step": 2125 }, { "epoch": 0.9766162310866575, "grad_norm": 2.334334624814976, "learning_rate": 1.6662522081097308e-08, "loss": 0.3598, "step": 2130 }, { "epoch": 0.9789087574507107, "grad_norm": 2.515966550970349, "learning_rate": 1.3556920746373714e-08, "loss": 0.3539, "step": 2135 }, { "epoch": 0.9812012838147639, "grad_norm": 2.4578356166282704, "learning_rate": 1.0770936723171199e-08, "loss": 0.3684, "step": 2140 }, { "epoch": 0.9834938101788171, "grad_norm": 2.534561019356648, "learning_rate": 8.304748585417077e-09, "loss": 0.3629, "step": 2145 }, { "epoch": 0.9857863365428703, "grad_norm": 2.4815228224834254, "learning_rate": 6.158514409000393e-09, "loss": 0.3617, "step": 2150 }, { "epoch": 0.9880788629069235, "grad_norm": 2.520302407708297, "learning_rate": 4.332371761638921e-09, "loss": 0.3716, "step": 2155 }, { "epoch": 0.9903713892709766, "grad_norm": 2.939805778253569, "learning_rate": 2.8264376940634332e-09, "loss": 0.3685, "step": 2160 }, { "epoch": 0.9926639156350298, "grad_norm": 2.6736093020039484, "learning_rate": 1.640808732513155e-09, "loss": 0.3724, "step": 2165 }, { "epoch": 0.994956441999083, "grad_norm": 2.3884833213363144, "learning_rate": 7.755608725490415e-10, "loss": 0.354, "step": 2170 }, { "epoch": 0.9972489683631361, "grad_norm": 2.378189457774983, "learning_rate": 2.307495741843413e-10, "loss": 0.356, "step": 2175 }, { "epoch": 0.9995414947271893, "grad_norm": 2.543796138070999, "learning_rate": 6.4097583263311725e-12, "loss": 0.3664, "step": 2180 }, { "epoch": 1.0, "eval_runtime": 2.6844, "eval_samples_per_second": 3.725, "eval_steps_per_second": 1.118, "step": 2181 }, { "epoch": 1.0, "step": 2181, "total_flos": 228328514519040.0, "train_loss": 0.5532772317904499, "train_runtime": 23497.974, "train_samples_per_second": 1.484, "train_steps_per_second": 0.093 } ], "logging_steps": 5, "max_steps": 2181, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 100, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 228328514519040.0, "train_batch_size": 4, "trial_name": null, "trial_params": null }