{ "best_metric": null, "best_model_checkpoint": null, "epoch": 1.0, "eval_steps": 500, "global_step": 2187, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0004572473708276177, "grad_norm": 8.096893602247533, "learning_rate": 4.5662100456621004e-08, "loss": 4.0564, "step": 1 }, { "epoch": 0.002286236854138089, "grad_norm": 9.362922497604572, "learning_rate": 2.2831050228310502e-07, "loss": 4.0286, "step": 5 }, { "epoch": 0.004572473708276178, "grad_norm": 8.436200820951127, "learning_rate": 4.5662100456621004e-07, "loss": 4.0215, "step": 10 }, { "epoch": 0.006858710562414266, "grad_norm": 8.311166463949595, "learning_rate": 6.849315068493151e-07, "loss": 3.995, "step": 15 }, { "epoch": 0.009144947416552356, "grad_norm": 8.314737539002055, "learning_rate": 9.132420091324201e-07, "loss": 4.0089, "step": 20 }, { "epoch": 0.011431184270690443, "grad_norm": 8.207038819761589, "learning_rate": 1.1415525114155251e-06, "loss": 4.0363, "step": 25 }, { "epoch": 0.013717421124828532, "grad_norm": 7.559088257570073, "learning_rate": 1.3698630136986302e-06, "loss": 3.9854, "step": 30 }, { "epoch": 0.01600365797896662, "grad_norm": 7.1469550333759315, "learning_rate": 1.5981735159817353e-06, "loss": 3.9172, "step": 35 }, { "epoch": 0.01828989483310471, "grad_norm": 5.971779564925808, "learning_rate": 1.8264840182648401e-06, "loss": 3.7833, "step": 40 }, { "epoch": 0.0205761316872428, "grad_norm": 5.297232176162252, "learning_rate": 2.0547945205479454e-06, "loss": 3.7258, "step": 45 }, { "epoch": 0.022862368541380886, "grad_norm": 4.217560181118984, "learning_rate": 2.2831050228310503e-06, "loss": 3.6069, "step": 50 }, { "epoch": 0.025148605395518976, "grad_norm": 3.786598130029432, "learning_rate": 2.511415525114155e-06, "loss": 3.553, "step": 55 }, { "epoch": 0.027434842249657063, "grad_norm": 2.6863763381878782, "learning_rate": 2.7397260273972604e-06, "loss": 3.4564, "step": 60 }, { "epoch": 0.029721079103795154, "grad_norm": 2.322206779034821, "learning_rate": 2.9680365296803653e-06, "loss": 3.331, "step": 65 }, { "epoch": 0.03200731595793324, "grad_norm": 1.7328570105860337, "learning_rate": 3.1963470319634706e-06, "loss": 3.2806, "step": 70 }, { "epoch": 0.03429355281207133, "grad_norm": 1.4846651112766411, "learning_rate": 3.4246575342465754e-06, "loss": 3.2356, "step": 75 }, { "epoch": 0.03657978966620942, "grad_norm": 1.1848731903024705, "learning_rate": 3.6529680365296803e-06, "loss": 3.1934, "step": 80 }, { "epoch": 0.038866026520347506, "grad_norm": 1.0381920877926754, "learning_rate": 3.881278538812785e-06, "loss": 3.1245, "step": 85 }, { "epoch": 0.0411522633744856, "grad_norm": 0.9141218079482407, "learning_rate": 4.109589041095891e-06, "loss": 3.0468, "step": 90 }, { "epoch": 0.04343850022862369, "grad_norm": 1.0161753499176187, "learning_rate": 4.337899543378996e-06, "loss": 3.0329, "step": 95 }, { "epoch": 0.04572473708276177, "grad_norm": 0.8996453649527762, "learning_rate": 4.566210045662101e-06, "loss": 2.9958, "step": 100 }, { "epoch": 0.04801097393689986, "grad_norm": 0.8082797335102323, "learning_rate": 4.7945205479452054e-06, "loss": 2.9651, "step": 105 }, { "epoch": 0.05029721079103795, "grad_norm": 0.7090431405422901, "learning_rate": 5.02283105022831e-06, "loss": 2.927, "step": 110 }, { "epoch": 0.05258344764517604, "grad_norm": 1.2265537925663061, "learning_rate": 5.251141552511416e-06, "loss": 2.862, "step": 115 }, { "epoch": 0.05486968449931413, "grad_norm": 0.650571444620453, "learning_rate": 5.479452054794521e-06, "loss": 2.857, "step": 120 }, { "epoch": 0.05715592135345222, "grad_norm": 0.7089485111846239, "learning_rate": 5.7077625570776266e-06, "loss": 2.8209, "step": 125 }, { "epoch": 0.05944215820759031, "grad_norm": 0.5922494361050838, "learning_rate": 5.936073059360731e-06, "loss": 2.8037, "step": 130 }, { "epoch": 0.06172839506172839, "grad_norm": 0.5597217919230902, "learning_rate": 6.164383561643836e-06, "loss": 2.7487, "step": 135 }, { "epoch": 0.06401463191586648, "grad_norm": 0.6045746583730743, "learning_rate": 6.392694063926941e-06, "loss": 2.6981, "step": 140 }, { "epoch": 0.06630086877000457, "grad_norm": 0.6479924774135967, "learning_rate": 6.621004566210046e-06, "loss": 2.7036, "step": 145 }, { "epoch": 0.06858710562414266, "grad_norm": 0.760723993748018, "learning_rate": 6.849315068493151e-06, "loss": 2.6821, "step": 150 }, { "epoch": 0.07087334247828075, "grad_norm": 0.5889973577684341, "learning_rate": 7.077625570776257e-06, "loss": 2.6882, "step": 155 }, { "epoch": 0.07315957933241884, "grad_norm": 0.6201384588278992, "learning_rate": 7.305936073059361e-06, "loss": 2.6441, "step": 160 }, { "epoch": 0.07544581618655692, "grad_norm": 0.645862573214957, "learning_rate": 7.534246575342466e-06, "loss": 2.5878, "step": 165 }, { "epoch": 0.07773205304069501, "grad_norm": 0.880791499233313, "learning_rate": 7.76255707762557e-06, "loss": 2.5665, "step": 170 }, { "epoch": 0.0800182898948331, "grad_norm": 0.7581098091472079, "learning_rate": 7.990867579908676e-06, "loss": 2.5423, "step": 175 }, { "epoch": 0.0823045267489712, "grad_norm": 0.7502504535360037, "learning_rate": 8.219178082191782e-06, "loss": 2.5348, "step": 180 }, { "epoch": 0.08459076360310928, "grad_norm": 0.9587325899501735, "learning_rate": 8.447488584474887e-06, "loss": 2.4652, "step": 185 }, { "epoch": 0.08687700045724737, "grad_norm": 1.0327228370595574, "learning_rate": 8.675799086757991e-06, "loss": 2.4066, "step": 190 }, { "epoch": 0.08916323731138547, "grad_norm": 0.8853835960264104, "learning_rate": 8.904109589041097e-06, "loss": 2.3642, "step": 195 }, { "epoch": 0.09144947416552354, "grad_norm": 1.0446953486337078, "learning_rate": 9.132420091324201e-06, "loss": 2.3237, "step": 200 }, { "epoch": 0.09373571101966163, "grad_norm": 1.1013758488210148, "learning_rate": 9.360730593607307e-06, "loss": 2.2331, "step": 205 }, { "epoch": 0.09602194787379972, "grad_norm": 1.2192543249794923, "learning_rate": 9.589041095890411e-06, "loss": 2.1264, "step": 210 }, { "epoch": 0.09830818472793781, "grad_norm": 1.3533953895273099, "learning_rate": 9.817351598173517e-06, "loss": 2.0554, "step": 215 }, { "epoch": 0.1005944215820759, "grad_norm": 1.1876482609404326, "learning_rate": 9.999993629265979e-06, "loss": 1.9859, "step": 220 }, { "epoch": 0.102880658436214, "grad_norm": 1.1847416528253172, "learning_rate": 9.999770655279843e-06, "loss": 1.8986, "step": 225 }, { "epoch": 0.10516689529035209, "grad_norm": 1.3137466650624998, "learning_rate": 9.999229160826947e-06, "loss": 1.8, "step": 230 }, { "epoch": 0.10745313214449016, "grad_norm": 1.830150495140023, "learning_rate": 9.998369180404283e-06, "loss": 1.7138, "step": 235 }, { "epoch": 0.10973936899862825, "grad_norm": 1.1159850299398295, "learning_rate": 9.997190768798639e-06, "loss": 1.6867, "step": 240 }, { "epoch": 0.11202560585276634, "grad_norm": 0.9727694366367986, "learning_rate": 9.995694001083103e-06, "loss": 1.6469, "step": 245 }, { "epoch": 0.11431184270690443, "grad_norm": 1.135743426814773, "learning_rate": 9.993878972612276e-06, "loss": 1.5607, "step": 250 }, { "epoch": 0.11659807956104253, "grad_norm": 1.0363437963731608, "learning_rate": 9.991745799016206e-06, "loss": 1.5332, "step": 255 }, { "epoch": 0.11888431641518062, "grad_norm": 1.018006180331875, "learning_rate": 9.989294616193018e-06, "loss": 1.4962, "step": 260 }, { "epoch": 0.1211705532693187, "grad_norm": 0.9493951106581935, "learning_rate": 9.986525580300253e-06, "loss": 1.4403, "step": 265 }, { "epoch": 0.12345679012345678, "grad_norm": 4.150830186272059, "learning_rate": 9.983438867744923e-06, "loss": 1.4382, "step": 270 }, { "epoch": 0.12574302697759487, "grad_norm": 0.8458476848705546, "learning_rate": 9.980034675172274e-06, "loss": 1.4248, "step": 275 }, { "epoch": 0.12802926383173296, "grad_norm": 1.8171861028727991, "learning_rate": 9.976313219453255e-06, "loss": 1.4055, "step": 280 }, { "epoch": 0.13031550068587106, "grad_norm": 0.7389926811741014, "learning_rate": 9.972274737670702e-06, "loss": 1.4033, "step": 285 }, { "epoch": 0.13260173754000915, "grad_norm": 0.8834746515415843, "learning_rate": 9.967919487104237e-06, "loss": 1.3724, "step": 290 }, { "epoch": 0.13488797439414724, "grad_norm": 0.8166186304734012, "learning_rate": 9.963247745213876e-06, "loss": 1.3721, "step": 295 }, { "epoch": 0.13717421124828533, "grad_norm": 0.6771475216933378, "learning_rate": 9.958259809622353e-06, "loss": 1.3555, "step": 300 }, { "epoch": 0.13946044810242342, "grad_norm": 0.60525762012324, "learning_rate": 9.952955998096155e-06, "loss": 1.36, "step": 305 }, { "epoch": 0.1417466849565615, "grad_norm": 0.6126617626167846, "learning_rate": 9.94733664852529e-06, "loss": 1.353, "step": 310 }, { "epoch": 0.1440329218106996, "grad_norm": 0.6630794657190928, "learning_rate": 9.941402118901743e-06, "loss": 1.3359, "step": 315 }, { "epoch": 0.1463191586648377, "grad_norm": 0.6758533351396738, "learning_rate": 9.935152787296689e-06, "loss": 1.3402, "step": 320 }, { "epoch": 0.14860539551897575, "grad_norm": 0.739719330356037, "learning_rate": 9.928589051836392e-06, "loss": 1.3346, "step": 325 }, { "epoch": 0.15089163237311384, "grad_norm": 0.7258290118963521, "learning_rate": 9.921711330676848e-06, "loss": 1.3356, "step": 330 }, { "epoch": 0.15317786922725193, "grad_norm": 0.6274092924270468, "learning_rate": 9.91452006197715e-06, "loss": 1.3362, "step": 335 }, { "epoch": 0.15546410608139002, "grad_norm": 0.768028072114212, "learning_rate": 9.907015703871558e-06, "loss": 1.3214, "step": 340 }, { "epoch": 0.15775034293552812, "grad_norm": 0.7738373400419118, "learning_rate": 9.899198734440335e-06, "loss": 1.331, "step": 345 }, { "epoch": 0.1600365797896662, "grad_norm": 0.6855410863811031, "learning_rate": 9.891069651679273e-06, "loss": 1.3142, "step": 350 }, { "epoch": 0.1623228166438043, "grad_norm": 0.6405023247699122, "learning_rate": 9.882628973467972e-06, "loss": 1.3171, "step": 355 }, { "epoch": 0.1646090534979424, "grad_norm": 0.6764400756880153, "learning_rate": 9.873877237536854e-06, "loss": 1.3189, "step": 360 }, { "epoch": 0.16689529035208048, "grad_norm": 0.6298462983903607, "learning_rate": 9.86481500143289e-06, "loss": 1.3059, "step": 365 }, { "epoch": 0.16918152720621857, "grad_norm": 0.6606697771559132, "learning_rate": 9.855442842484101e-06, "loss": 1.3267, "step": 370 }, { "epoch": 0.17146776406035666, "grad_norm": 0.5895037669135822, "learning_rate": 9.84576135776276e-06, "loss": 1.3057, "step": 375 }, { "epoch": 0.17375400091449475, "grad_norm": 0.5762405642901876, "learning_rate": 9.835771164047365e-06, "loss": 1.3016, "step": 380 }, { "epoch": 0.17604023776863284, "grad_norm": 0.6301891918568133, "learning_rate": 9.825472897783344e-06, "loss": 1.3046, "step": 385 }, { "epoch": 0.17832647462277093, "grad_norm": 0.6189017845225122, "learning_rate": 9.814867215042503e-06, "loss": 1.3089, "step": 390 }, { "epoch": 0.18061271147690902, "grad_norm": 0.6279515665165573, "learning_rate": 9.803954791481239e-06, "loss": 1.3011, "step": 395 }, { "epoch": 0.18289894833104708, "grad_norm": 0.6380039476156935, "learning_rate": 9.792736322297489e-06, "loss": 1.2758, "step": 400 }, { "epoch": 0.18518518518518517, "grad_norm": 0.7506004279154695, "learning_rate": 9.781212522186442e-06, "loss": 1.312, "step": 405 }, { "epoch": 0.18747142203932327, "grad_norm": 0.7054181242720778, "learning_rate": 9.769384125295012e-06, "loss": 1.3112, "step": 410 }, { "epoch": 0.18975765889346136, "grad_norm": 0.5797880483237029, "learning_rate": 9.757251885175063e-06, "loss": 1.2998, "step": 415 }, { "epoch": 0.19204389574759945, "grad_norm": 0.6040659600524477, "learning_rate": 9.744816574735405e-06, "loss": 1.3018, "step": 420 }, { "epoch": 0.19433013260173754, "grad_norm": 0.7044299546094256, "learning_rate": 9.732078986192552e-06, "loss": 1.2818, "step": 425 }, { "epoch": 0.19661636945587563, "grad_norm": 0.567841572649114, "learning_rate": 9.719039931020258e-06, "loss": 1.2733, "step": 430 }, { "epoch": 0.19890260631001372, "grad_norm": 0.5378351616772565, "learning_rate": 9.705700239897809e-06, "loss": 1.2861, "step": 435 }, { "epoch": 0.2011888431641518, "grad_norm": 0.5372339490006793, "learning_rate": 9.692060762657118e-06, "loss": 1.2821, "step": 440 }, { "epoch": 0.2034750800182899, "grad_norm": 0.6353680076674888, "learning_rate": 9.678122368228571e-06, "loss": 1.2643, "step": 445 }, { "epoch": 0.205761316872428, "grad_norm": 0.6263499547366734, "learning_rate": 9.66388594458568e-06, "loss": 1.2826, "step": 450 }, { "epoch": 0.20804755372656608, "grad_norm": 0.6119180746423146, "learning_rate": 9.649352398688506e-06, "loss": 1.2856, "step": 455 }, { "epoch": 0.21033379058070417, "grad_norm": 0.6640618234127624, "learning_rate": 9.634522656425885e-06, "loss": 1.2765, "step": 460 }, { "epoch": 0.21262002743484226, "grad_norm": 0.6253602428713037, "learning_rate": 9.619397662556434e-06, "loss": 1.2661, "step": 465 }, { "epoch": 0.21490626428898033, "grad_norm": 0.6463257272674591, "learning_rate": 9.603978380648375e-06, "loss": 1.2838, "step": 470 }, { "epoch": 0.21719250114311842, "grad_norm": 0.6916869993480118, "learning_rate": 9.588265793018141e-06, "loss": 1.2785, "step": 475 }, { "epoch": 0.2194787379972565, "grad_norm": 0.578420093141111, "learning_rate": 9.572260900667794e-06, "loss": 1.2627, "step": 480 }, { "epoch": 0.2217649748513946, "grad_norm": 0.6016744117162259, "learning_rate": 9.555964723221258e-06, "loss": 1.2672, "step": 485 }, { "epoch": 0.2240512117055327, "grad_norm": 0.6325422647436533, "learning_rate": 9.539378298859365e-06, "loss": 1.2667, "step": 490 }, { "epoch": 0.22633744855967078, "grad_norm": 0.674420764332063, "learning_rate": 9.522502684253709e-06, "loss": 1.2601, "step": 495 }, { "epoch": 0.22862368541380887, "grad_norm": 0.6942742236531446, "learning_rate": 9.505338954499332e-06, "loss": 1.275, "step": 500 }, { "epoch": 0.23090992226794696, "grad_norm": 0.5661617220667517, "learning_rate": 9.487888203046232e-06, "loss": 1.2683, "step": 505 }, { "epoch": 0.23319615912208505, "grad_norm": 0.6389133947347537, "learning_rate": 9.4701515416297e-06, "loss": 1.2659, "step": 510 }, { "epoch": 0.23548239597622314, "grad_norm": 0.561786602813537, "learning_rate": 9.452130100199504e-06, "loss": 1.2664, "step": 515 }, { "epoch": 0.23776863283036123, "grad_norm": 0.5666699221383189, "learning_rate": 9.433825026847891e-06, "loss": 1.2573, "step": 520 }, { "epoch": 0.24005486968449932, "grad_norm": 0.6718711112993888, "learning_rate": 9.415237487736452e-06, "loss": 1.2545, "step": 525 }, { "epoch": 0.2423411065386374, "grad_norm": 0.5637527283960878, "learning_rate": 9.396368667021835e-06, "loss": 1.2723, "step": 530 }, { "epoch": 0.2446273433927755, "grad_norm": 0.583426898925874, "learning_rate": 9.377219766780288e-06, "loss": 1.2473, "step": 535 }, { "epoch": 0.24691358024691357, "grad_norm": 0.7422622561747031, "learning_rate": 9.3577920069311e-06, "loss": 1.2609, "step": 540 }, { "epoch": 0.24919981710105166, "grad_norm": 0.7536416453907702, "learning_rate": 9.338086625158867e-06, "loss": 1.2655, "step": 545 }, { "epoch": 0.25148605395518975, "grad_norm": 0.5911621999933799, "learning_rate": 9.318104876834652e-06, "loss": 1.2652, "step": 550 }, { "epoch": 0.25377229080932784, "grad_norm": 0.6482915887304207, "learning_rate": 9.297848034936007e-06, "loss": 1.2488, "step": 555 }, { "epoch": 0.25605852766346593, "grad_norm": 0.7813862221549358, "learning_rate": 9.277317389965871e-06, "loss": 1.2678, "step": 560 }, { "epoch": 0.258344764517604, "grad_norm": 0.601959447185496, "learning_rate": 9.256514249870366e-06, "loss": 1.2549, "step": 565 }, { "epoch": 0.2606310013717421, "grad_norm": 0.5439593292691556, "learning_rate": 9.235439939955458e-06, "loss": 1.2311, "step": 570 }, { "epoch": 0.2629172382258802, "grad_norm": 0.6462948109732727, "learning_rate": 9.214095802802533e-06, "loss": 1.2605, "step": 575 }, { "epoch": 0.2652034750800183, "grad_norm": 0.6523908850821281, "learning_rate": 9.192483198182876e-06, "loss": 1.2577, "step": 580 }, { "epoch": 0.2674897119341564, "grad_norm": 0.6285230592028435, "learning_rate": 9.170603502971017e-06, "loss": 1.233, "step": 585 }, { "epoch": 0.2697759487882945, "grad_norm": 0.5990676661488948, "learning_rate": 9.148458111057043e-06, "loss": 1.2444, "step": 590 }, { "epoch": 0.27206218564243256, "grad_norm": 0.5443537881683997, "learning_rate": 9.12604843325778e-06, "loss": 1.2282, "step": 595 }, { "epoch": 0.27434842249657065, "grad_norm": 0.5804764131758829, "learning_rate": 9.103375897226919e-06, "loss": 1.253, "step": 600 }, { "epoch": 0.27663465935070874, "grad_norm": 0.5905170219986889, "learning_rate": 9.080441947364065e-06, "loss": 1.2472, "step": 605 }, { "epoch": 0.27892089620484684, "grad_norm": 0.6003218456115103, "learning_rate": 9.057248044722718e-06, "loss": 1.2421, "step": 610 }, { "epoch": 0.2812071330589849, "grad_norm": 0.5683857920528798, "learning_rate": 9.033795666917191e-06, "loss": 1.2551, "step": 615 }, { "epoch": 0.283493369913123, "grad_norm": 0.5908776822300396, "learning_rate": 9.010086308028487e-06, "loss": 1.2375, "step": 620 }, { "epoch": 0.2857796067672611, "grad_norm": 0.6118010788168986, "learning_rate": 8.986121478509096e-06, "loss": 1.2347, "step": 625 }, { "epoch": 0.2880658436213992, "grad_norm": 0.5787813457678733, "learning_rate": 8.961902705086785e-06, "loss": 1.2395, "step": 630 }, { "epoch": 0.2903520804755373, "grad_norm": 0.6290839595278495, "learning_rate": 8.937431530667329e-06, "loss": 1.2263, "step": 635 }, { "epoch": 0.2926383173296754, "grad_norm": 0.5459763353494508, "learning_rate": 8.912709514236218e-06, "loss": 1.2285, "step": 640 }, { "epoch": 0.29492455418381347, "grad_norm": 0.6301840515917086, "learning_rate": 8.887738230759334e-06, "loss": 1.2374, "step": 645 }, { "epoch": 0.2972107910379515, "grad_norm": 0.5413584040020849, "learning_rate": 8.862519271082624e-06, "loss": 1.2505, "step": 650 }, { "epoch": 0.2994970278920896, "grad_norm": 0.5979355091788396, "learning_rate": 8.83705424183074e-06, "loss": 1.2238, "step": 655 }, { "epoch": 0.3017832647462277, "grad_norm": 0.6873493941298675, "learning_rate": 8.811344765304698e-06, "loss": 1.2262, "step": 660 }, { "epoch": 0.3040695016003658, "grad_norm": 0.6699975954695512, "learning_rate": 8.785392479378522e-06, "loss": 1.23, "step": 665 }, { "epoch": 0.30635573845450387, "grad_norm": 0.6860546025784545, "learning_rate": 8.759199037394888e-06, "loss": 1.2424, "step": 670 }, { "epoch": 0.30864197530864196, "grad_norm": 0.7598573834174616, "learning_rate": 8.732766108059814e-06, "loss": 1.2138, "step": 675 }, { "epoch": 0.31092821216278005, "grad_norm": 0.723323270057115, "learning_rate": 8.70609537533634e-06, "loss": 1.2373, "step": 680 }, { "epoch": 0.31321444901691814, "grad_norm": 0.6170455054157933, "learning_rate": 8.679188538337248e-06, "loss": 1.2257, "step": 685 }, { "epoch": 0.31550068587105623, "grad_norm": 0.7413957440287698, "learning_rate": 8.652047311216823e-06, "loss": 1.2075, "step": 690 }, { "epoch": 0.3177869227251943, "grad_norm": 0.7424365012242525, "learning_rate": 8.62467342306164e-06, "loss": 1.2238, "step": 695 }, { "epoch": 0.3200731595793324, "grad_norm": 0.8566227798899636, "learning_rate": 8.597068617780419e-06, "loss": 1.2278, "step": 700 }, { "epoch": 0.3223593964334705, "grad_norm": 0.647075376724737, "learning_rate": 8.569234653992916e-06, "loss": 1.2407, "step": 705 }, { "epoch": 0.3246456332876086, "grad_norm": 0.6249088936722902, "learning_rate": 8.541173304917895e-06, "loss": 1.2231, "step": 710 }, { "epoch": 0.3269318701417467, "grad_norm": 0.70817264277616, "learning_rate": 8.512886358260162e-06, "loss": 1.2345, "step": 715 }, { "epoch": 0.3292181069958848, "grad_norm": 0.5956107721750036, "learning_rate": 8.484375616096658e-06, "loss": 1.225, "step": 720 }, { "epoch": 0.33150434385002286, "grad_norm": 0.6062042871270218, "learning_rate": 8.455642894761684e-06, "loss": 1.2185, "step": 725 }, { "epoch": 0.33379058070416096, "grad_norm": 0.66611343630398, "learning_rate": 8.426690024731161e-06, "loss": 1.2171, "step": 730 }, { "epoch": 0.33607681755829905, "grad_norm": 0.6006939272932527, "learning_rate": 8.39751885050603e-06, "loss": 1.2168, "step": 735 }, { "epoch": 0.33836305441243714, "grad_norm": 0.5888998376074026, "learning_rate": 8.36813123049474e-06, "loss": 1.2447, "step": 740 }, { "epoch": 0.3406492912665752, "grad_norm": 0.6170255283448466, "learning_rate": 8.338529036894855e-06, "loss": 1.2386, "step": 745 }, { "epoch": 0.3429355281207133, "grad_norm": 0.6592250171561639, "learning_rate": 8.308714155573785e-06, "loss": 1.2095, "step": 750 }, { "epoch": 0.3452217649748514, "grad_norm": 0.5948350472440084, "learning_rate": 8.278688485948634e-06, "loss": 1.2204, "step": 755 }, { "epoch": 0.3475080018289895, "grad_norm": 0.6884759018973265, "learning_rate": 8.248453940865204e-06, "loss": 1.2205, "step": 760 }, { "epoch": 0.3497942386831276, "grad_norm": 0.5629453296642776, "learning_rate": 8.218012446476128e-06, "loss": 1.2087, "step": 765 }, { "epoch": 0.3520804755372657, "grad_norm": 0.5703699859674032, "learning_rate": 8.187365942118162e-06, "loss": 1.2038, "step": 770 }, { "epoch": 0.35436671239140377, "grad_norm": 0.5758055939006159, "learning_rate": 8.156516380188635e-06, "loss": 1.2015, "step": 775 }, { "epoch": 0.35665294924554186, "grad_norm": 0.6814380489670292, "learning_rate": 8.125465726021068e-06, "loss": 1.2267, "step": 780 }, { "epoch": 0.35893918609967995, "grad_norm": 0.58819101648096, "learning_rate": 8.09421595775997e-06, "loss": 1.2065, "step": 785 }, { "epoch": 0.36122542295381804, "grad_norm": 0.599220106737159, "learning_rate": 8.062769066234807e-06, "loss": 1.2084, "step": 790 }, { "epoch": 0.3635116598079561, "grad_norm": 0.5687079813226833, "learning_rate": 8.031127054833192e-06, "loss": 1.2311, "step": 795 }, { "epoch": 0.36579789666209417, "grad_norm": 0.6076443328436887, "learning_rate": 7.999291939373232e-06, "loss": 1.209, "step": 800 }, { "epoch": 0.36808413351623226, "grad_norm": 0.5767468288489239, "learning_rate": 7.967265747975124e-06, "loss": 1.2153, "step": 805 }, { "epoch": 0.37037037037037035, "grad_norm": 0.6275130557605428, "learning_rate": 7.93505052093194e-06, "loss": 1.2206, "step": 810 }, { "epoch": 0.37265660722450844, "grad_norm": 0.5920904031157348, "learning_rate": 7.90264831057965e-06, "loss": 1.2149, "step": 815 }, { "epoch": 0.37494284407864653, "grad_norm": 0.5841477404583847, "learning_rate": 7.870061181166372e-06, "loss": 1.2134, "step": 820 }, { "epoch": 0.3772290809327846, "grad_norm": 0.545565275285448, "learning_rate": 7.837291208720867e-06, "loss": 1.2185, "step": 825 }, { "epoch": 0.3795153177869227, "grad_norm": 0.6183231148929101, "learning_rate": 7.804340480920274e-06, "loss": 1.2064, "step": 830 }, { "epoch": 0.3818015546410608, "grad_norm": 0.5801259298558049, "learning_rate": 7.771211096957125e-06, "loss": 1.2049, "step": 835 }, { "epoch": 0.3840877914951989, "grad_norm": 0.579347207611424, "learning_rate": 7.737905167405596e-06, "loss": 1.2185, "step": 840 }, { "epoch": 0.386374028349337, "grad_norm": 0.6262921976973932, "learning_rate": 7.704424814087056e-06, "loss": 1.2137, "step": 845 }, { "epoch": 0.3886602652034751, "grad_norm": 0.6070706881138944, "learning_rate": 7.670772169934902e-06, "loss": 1.2177, "step": 850 }, { "epoch": 0.39094650205761317, "grad_norm": 0.5688216055326876, "learning_rate": 7.636949378858647e-06, "loss": 1.2016, "step": 855 }, { "epoch": 0.39323273891175126, "grad_norm": 0.6166249078020826, "learning_rate": 7.602958595607375e-06, "loss": 1.1957, "step": 860 }, { "epoch": 0.39551897576588935, "grad_norm": 0.5778886288472463, "learning_rate": 7.568801985632439e-06, "loss": 1.2105, "step": 865 }, { "epoch": 0.39780521262002744, "grad_norm": 0.6732218435967291, "learning_rate": 7.5344817249495195e-06, "loss": 1.2047, "step": 870 }, { "epoch": 0.40009144947416553, "grad_norm": 0.672208759556888, "learning_rate": 7.500000000000001e-06, "loss": 1.1854, "step": 875 }, { "epoch": 0.4023776863283036, "grad_norm": 0.6180565492464766, "learning_rate": 7.465359007511667e-06, "loss": 1.185, "step": 880 }, { "epoch": 0.4046639231824417, "grad_norm": 0.6266745151721254, "learning_rate": 7.430560954358764e-06, "loss": 1.2082, "step": 885 }, { "epoch": 0.4069501600365798, "grad_norm": 0.6163182978581346, "learning_rate": 7.395608057421406e-06, "loss": 1.2194, "step": 890 }, { "epoch": 0.4092363968907179, "grad_norm": 0.6262674693601461, "learning_rate": 7.360502543444339e-06, "loss": 1.2188, "step": 895 }, { "epoch": 0.411522633744856, "grad_norm": 0.5549642780561265, "learning_rate": 7.325246648895089e-06, "loss": 1.1986, "step": 900 }, { "epoch": 0.41380887059899407, "grad_norm": 0.5540368046559051, "learning_rate": 7.289842619821475e-06, "loss": 1.2175, "step": 905 }, { "epoch": 0.41609510745313216, "grad_norm": 0.587023330497459, "learning_rate": 7.254292711708529e-06, "loss": 1.2029, "step": 910 }, { "epoch": 0.41838134430727025, "grad_norm": 0.5513581130094706, "learning_rate": 7.218599189334799e-06, "loss": 1.2009, "step": 915 }, { "epoch": 0.42066758116140834, "grad_norm": 0.7237520794327035, "learning_rate": 7.182764326628068e-06, "loss": 1.2063, "step": 920 }, { "epoch": 0.42295381801554643, "grad_norm": 0.5476819110298711, "learning_rate": 7.146790406520491e-06, "loss": 1.2107, "step": 925 }, { "epoch": 0.4252400548696845, "grad_norm": 0.5753924094787153, "learning_rate": 7.1106797208031554e-06, "loss": 1.2133, "step": 930 }, { "epoch": 0.4275262917238226, "grad_norm": 0.6489054914059448, "learning_rate": 7.0744345699800755e-06, "loss": 1.1991, "step": 935 }, { "epoch": 0.42981252857796065, "grad_norm": 0.6239602498665449, "learning_rate": 7.038057263121639e-06, "loss": 1.1937, "step": 940 }, { "epoch": 0.43209876543209874, "grad_norm": 0.5954140813357963, "learning_rate": 7.001550117717499e-06, "loss": 1.2092, "step": 945 }, { "epoch": 0.43438500228623683, "grad_norm": 0.5953175778315464, "learning_rate": 6.9649154595289326e-06, "loss": 1.1957, "step": 950 }, { "epoch": 0.4366712391403749, "grad_norm": 0.6030938627687562, "learning_rate": 6.92815562244068e-06, "loss": 1.1827, "step": 955 }, { "epoch": 0.438957475994513, "grad_norm": 0.6882999466791362, "learning_rate": 6.891272948312251e-06, "loss": 1.2102, "step": 960 }, { "epoch": 0.4412437128486511, "grad_norm": 0.6080281045836577, "learning_rate": 6.854269786828741e-06, "loss": 1.2093, "step": 965 }, { "epoch": 0.4435299497027892, "grad_norm": 0.756192409869553, "learning_rate": 6.817148495351131e-06, "loss": 1.2159, "step": 970 }, { "epoch": 0.4458161865569273, "grad_norm": 0.5892520162590819, "learning_rate": 6.779911438766117e-06, "loss": 1.193, "step": 975 }, { "epoch": 0.4481024234110654, "grad_norm": 0.6265917897470434, "learning_rate": 6.742560989335438e-06, "loss": 1.1951, "step": 980 }, { "epoch": 0.45038866026520347, "grad_norm": 0.5927415516536023, "learning_rate": 6.705099526544757e-06, "loss": 1.1973, "step": 985 }, { "epoch": 0.45267489711934156, "grad_norm": 0.5602604942191215, "learning_rate": 6.667529436952064e-06, "loss": 1.1945, "step": 990 }, { "epoch": 0.45496113397347965, "grad_norm": 0.751574883051813, "learning_rate": 6.629853114035643e-06, "loss": 1.2134, "step": 995 }, { "epoch": 0.45724737082761774, "grad_norm": 0.6000318274839507, "learning_rate": 6.5920729580415795e-06, "loss": 1.2104, "step": 1000 }, { "epoch": 0.45953360768175583, "grad_norm": 0.5783065549399249, "learning_rate": 6.554191375830861e-06, "loss": 1.2016, "step": 1005 }, { "epoch": 0.4618198445358939, "grad_norm": 0.5751980188798808, "learning_rate": 6.516210780726032e-06, "loss": 1.1794, "step": 1010 }, { "epoch": 0.464106081390032, "grad_norm": 0.6096335885035103, "learning_rate": 6.478133592357455e-06, "loss": 1.1816, "step": 1015 }, { "epoch": 0.4663923182441701, "grad_norm": 0.5848690144740822, "learning_rate": 6.43996223650916e-06, "loss": 1.1735, "step": 1020 }, { "epoch": 0.4686785550983082, "grad_norm": 0.6273777569367492, "learning_rate": 6.401699144964306e-06, "loss": 1.1864, "step": 1025 }, { "epoch": 0.4709647919524463, "grad_norm": 0.5772389229176554, "learning_rate": 6.3633467553502625e-06, "loss": 1.1953, "step": 1030 }, { "epoch": 0.4732510288065844, "grad_norm": 0.6320660706578101, "learning_rate": 6.32490751098331e-06, "loss": 1.1778, "step": 1035 }, { "epoch": 0.47553726566072246, "grad_norm": 0.628014857385664, "learning_rate": 6.286383860712982e-06, "loss": 1.1978, "step": 1040 }, { "epoch": 0.47782350251486055, "grad_norm": 0.6165011857453245, "learning_rate": 6.247778258766069e-06, "loss": 1.1783, "step": 1045 }, { "epoch": 0.48010973936899864, "grad_norm": 0.6680859473813631, "learning_rate": 6.209093164590253e-06, "loss": 1.1883, "step": 1050 }, { "epoch": 0.48239597622313674, "grad_norm": 0.6230269069079273, "learning_rate": 6.170331042697425e-06, "loss": 1.1923, "step": 1055 }, { "epoch": 0.4846822130772748, "grad_norm": 0.6472681484163015, "learning_rate": 6.131494362506693e-06, "loss": 1.1826, "step": 1060 }, { "epoch": 0.4869684499314129, "grad_norm": 0.6799978087591872, "learning_rate": 6.09258559818704e-06, "loss": 1.1829, "step": 1065 }, { "epoch": 0.489254686785551, "grad_norm": 0.5617426984448537, "learning_rate": 6.053607228499719e-06, "loss": 1.1941, "step": 1070 }, { "epoch": 0.4915409236396891, "grad_norm": 0.6444058153599652, "learning_rate": 6.014561736640334e-06, "loss": 1.2, "step": 1075 }, { "epoch": 0.49382716049382713, "grad_norm": 0.6016265988080601, "learning_rate": 5.975451610080643e-06, "loss": 1.1655, "step": 1080 }, { "epoch": 0.4961133973479652, "grad_norm": 0.7053148286233416, "learning_rate": 5.936279340410082e-06, "loss": 1.172, "step": 1085 }, { "epoch": 0.4983996342021033, "grad_norm": 0.5586357561653685, "learning_rate": 5.8970474231770445e-06, "loss": 1.1922, "step": 1090 }, { "epoch": 0.5006858710562414, "grad_norm": 0.7895760074140119, "learning_rate": 5.857758357729892e-06, "loss": 1.1839, "step": 1095 }, { "epoch": 0.5029721079103795, "grad_norm": 0.7313666592611404, "learning_rate": 5.8184146470577265e-06, "loss": 1.1813, "step": 1100 }, { "epoch": 0.5052583447645176, "grad_norm": 0.6067591576327228, "learning_rate": 5.779018797630934e-06, "loss": 1.1855, "step": 1105 }, { "epoch": 0.5075445816186557, "grad_norm": 0.6144330199450508, "learning_rate": 5.739573319241505e-06, "loss": 1.1924, "step": 1110 }, { "epoch": 0.5098308184727938, "grad_norm": 0.6075048668745815, "learning_rate": 5.7000807248431466e-06, "loss": 1.1783, "step": 1115 }, { "epoch": 0.5121170553269319, "grad_norm": 0.6763365315316732, "learning_rate": 5.66054353039118e-06, "loss": 1.1873, "step": 1120 }, { "epoch": 0.51440329218107, "grad_norm": 0.652936999197392, "learning_rate": 5.620964254682267e-06, "loss": 1.2019, "step": 1125 }, { "epoch": 0.516689529035208, "grad_norm": 0.7510930690144121, "learning_rate": 5.58134541919394e-06, "loss": 1.1863, "step": 1130 }, { "epoch": 0.5189757658893461, "grad_norm": 0.7485282723991191, "learning_rate": 5.5416895479239665e-06, "loss": 1.1878, "step": 1135 }, { "epoch": 0.5212620027434842, "grad_norm": 0.6650793765929232, "learning_rate": 5.501999167229554e-06, "loss": 1.1844, "step": 1140 }, { "epoch": 0.5235482395976223, "grad_norm": 0.6617004106280673, "learning_rate": 5.4622768056664e-06, "loss": 1.1819, "step": 1145 }, { "epoch": 0.5258344764517604, "grad_norm": 0.639306148093516, "learning_rate": 5.42252499382761e-06, "loss": 1.1844, "step": 1150 }, { "epoch": 0.5281207133058985, "grad_norm": 0.590573720499581, "learning_rate": 5.38274626418248e-06, "loss": 1.1848, "step": 1155 }, { "epoch": 0.5304069501600366, "grad_norm": 0.625235396788826, "learning_rate": 5.3429431509151515e-06, "loss": 1.1904, "step": 1160 }, { "epoch": 0.5326931870141747, "grad_norm": 0.5840052674712635, "learning_rate": 5.303118189763187e-06, "loss": 1.1829, "step": 1165 }, { "epoch": 0.5349794238683128, "grad_norm": 0.5940842973816081, "learning_rate": 5.263273917856e-06, "loss": 1.1774, "step": 1170 }, { "epoch": 0.5372656607224509, "grad_norm": 0.5991239115995499, "learning_rate": 5.22341287355324e-06, "loss": 1.1857, "step": 1175 }, { "epoch": 0.539551897576589, "grad_norm": 0.6248756548437343, "learning_rate": 5.183537596283075e-06, "loss": 1.1799, "step": 1180 }, { "epoch": 0.541838134430727, "grad_norm": 0.6023807247895316, "learning_rate": 5.143650626380417e-06, "loss": 1.1858, "step": 1185 }, { "epoch": 0.5441243712848651, "grad_norm": 0.6101959497751839, "learning_rate": 5.103754504925071e-06, "loss": 1.1961, "step": 1190 }, { "epoch": 0.5464106081390032, "grad_norm": 0.569676114190435, "learning_rate": 5.06385177357987e-06, "loss": 1.1766, "step": 1195 }, { "epoch": 0.5486968449931413, "grad_norm": 0.5819652008689743, "learning_rate": 5.023944974428739e-06, "loss": 1.1734, "step": 1200 }, { "epoch": 0.5509830818472794, "grad_norm": 0.5661449507234365, "learning_rate": 4.9840366498147495e-06, "loss": 1.1908, "step": 1205 }, { "epoch": 0.5532693187014175, "grad_norm": 0.6109491726102372, "learning_rate": 4.944129342178156e-06, "loss": 1.1784, "step": 1210 }, { "epoch": 0.5555555555555556, "grad_norm": 0.5811074689104263, "learning_rate": 4.90422559389443e-06, "loss": 1.1746, "step": 1215 }, { "epoch": 0.5578417924096937, "grad_norm": 0.6060458081756667, "learning_rate": 4.864327947112281e-06, "loss": 1.195, "step": 1220 }, { "epoch": 0.5601280292638318, "grad_norm": 0.6226718536570417, "learning_rate": 4.82443894359171e-06, "loss": 1.1786, "step": 1225 }, { "epoch": 0.5624142661179699, "grad_norm": 0.5995864510713481, "learning_rate": 4.784561124542088e-06, "loss": 1.1791, "step": 1230 }, { "epoch": 0.5647005029721079, "grad_norm": 0.5701958838449743, "learning_rate": 4.744697030460248e-06, "loss": 1.1647, "step": 1235 }, { "epoch": 0.566986739826246, "grad_norm": 0.6293939505655973, "learning_rate": 4.7048492009686525e-06, "loss": 1.1692, "step": 1240 }, { "epoch": 0.5692729766803841, "grad_norm": 0.6850447194966206, "learning_rate": 4.6650201746535926e-06, "loss": 1.1673, "step": 1245 }, { "epoch": 0.5715592135345222, "grad_norm": 0.6040120516739561, "learning_rate": 4.625212488903467e-06, "loss": 1.1834, "step": 1250 }, { "epoch": 0.5738454503886603, "grad_norm": 0.5686706476550618, "learning_rate": 4.585428679747133e-06, "loss": 1.1716, "step": 1255 }, { "epoch": 0.5761316872427984, "grad_norm": 0.5946931657837966, "learning_rate": 4.545671281692331e-06, "loss": 1.1705, "step": 1260 }, { "epoch": 0.5784179240969365, "grad_norm": 0.6120143356512502, "learning_rate": 4.505942827564242e-06, "loss": 1.1807, "step": 1265 }, { "epoch": 0.5807041609510746, "grad_norm": 0.6341171747185648, "learning_rate": 4.466245848344106e-06, "loss": 1.1839, "step": 1270 }, { "epoch": 0.5829903978052127, "grad_norm": 0.6494090868678567, "learning_rate": 4.426582873007999e-06, "loss": 1.1684, "step": 1275 }, { "epoch": 0.5852766346593508, "grad_norm": 0.6252524175950205, "learning_rate": 4.386956428365701e-06, "loss": 1.1878, "step": 1280 }, { "epoch": 0.5875628715134888, "grad_norm": 0.5911175497758677, "learning_rate": 4.347369038899744e-06, "loss": 1.1828, "step": 1285 }, { "epoch": 0.5898491083676269, "grad_norm": 0.5988939599453593, "learning_rate": 4.307823226604555e-06, "loss": 1.1735, "step": 1290 }, { "epoch": 0.5921353452217649, "grad_norm": 0.5813355536422021, "learning_rate": 4.2683215108258145e-06, "loss": 1.1706, "step": 1295 }, { "epoch": 0.594421582075903, "grad_norm": 0.6208043705991068, "learning_rate": 4.228866408099945e-06, "loss": 1.1907, "step": 1300 }, { "epoch": 0.5967078189300411, "grad_norm": 0.6512006631857741, "learning_rate": 4.189460431993788e-06, "loss": 1.1951, "step": 1305 }, { "epoch": 0.5989940557841792, "grad_norm": 0.5845471180993255, "learning_rate": 4.150106092944475e-06, "loss": 1.1717, "step": 1310 }, { "epoch": 0.6012802926383173, "grad_norm": 0.5949045334275538, "learning_rate": 4.110805898099492e-06, "loss": 1.1833, "step": 1315 }, { "epoch": 0.6035665294924554, "grad_norm": 0.5971913414181261, "learning_rate": 4.071562351156966e-06, "loss": 1.1786, "step": 1320 }, { "epoch": 0.6058527663465935, "grad_norm": 0.6178601149254982, "learning_rate": 4.032377952206148e-06, "loss": 1.1793, "step": 1325 }, { "epoch": 0.6081390032007316, "grad_norm": 0.6046188006147395, "learning_rate": 3.993255197568154e-06, "loss": 1.169, "step": 1330 }, { "epoch": 0.6104252400548696, "grad_norm": 0.5919458656130715, "learning_rate": 3.954196579636918e-06, "loss": 1.1692, "step": 1335 }, { "epoch": 0.6127114769090077, "grad_norm": 0.5727049539306068, "learning_rate": 3.91520458672042e-06, "loss": 1.1747, "step": 1340 }, { "epoch": 0.6149977137631458, "grad_norm": 0.6040809405921704, "learning_rate": 3.876281702882156e-06, "loss": 1.1935, "step": 1345 }, { "epoch": 0.6172839506172839, "grad_norm": 0.5747789602798682, "learning_rate": 3.837430407782896e-06, "loss": 1.175, "step": 1350 }, { "epoch": 0.619570187471422, "grad_norm": 0.6001909994942644, "learning_rate": 3.7986531765226965e-06, "loss": 1.1718, "step": 1355 }, { "epoch": 0.6218564243255601, "grad_norm": 0.5499338552551708, "learning_rate": 3.759952479483232e-06, "loss": 1.1615, "step": 1360 }, { "epoch": 0.6241426611796982, "grad_norm": 0.62697610396954, "learning_rate": 3.7213307821704115e-06, "loss": 1.1616, "step": 1365 }, { "epoch": 0.6264288980338363, "grad_norm": 0.637904015143814, "learning_rate": 3.6827905450573022e-06, "loss": 1.1784, "step": 1370 }, { "epoch": 0.6287151348879744, "grad_norm": 0.6235229612947039, "learning_rate": 3.6443342234273905e-06, "loss": 1.1674, "step": 1375 }, { "epoch": 0.6310013717421125, "grad_norm": 0.744429415227132, "learning_rate": 3.6059642672181537e-06, "loss": 1.1678, "step": 1380 }, { "epoch": 0.6332876085962506, "grad_norm": 0.5903117671660288, "learning_rate": 3.5676831208649887e-06, "loss": 1.1661, "step": 1385 }, { "epoch": 0.6355738454503886, "grad_norm": 0.5977435348831742, "learning_rate": 3.5294932231454838e-06, "loss": 1.1655, "step": 1390 }, { "epoch": 0.6378600823045267, "grad_norm": 0.6262251229258455, "learning_rate": 3.4913970070240388e-06, "loss": 1.1827, "step": 1395 }, { "epoch": 0.6401463191586648, "grad_norm": 0.6039362156672261, "learning_rate": 3.4533968994968913e-06, "loss": 1.162, "step": 1400 }, { "epoch": 0.6424325560128029, "grad_norm": 0.610471777862986, "learning_rate": 3.41549532143748e-06, "loss": 1.1719, "step": 1405 }, { "epoch": 0.644718792866941, "grad_norm": 0.6124948412563855, "learning_rate": 3.3776946874422268e-06, "loss": 1.161, "step": 1410 }, { "epoch": 0.6470050297210791, "grad_norm": 0.596054515528405, "learning_rate": 3.3399974056767095e-06, "loss": 1.1677, "step": 1415 }, { "epoch": 0.6492912665752172, "grad_norm": 0.6199519548446956, "learning_rate": 3.30240587772224e-06, "loss": 1.1731, "step": 1420 }, { "epoch": 0.6515775034293553, "grad_norm": 0.6123382818220521, "learning_rate": 3.2649224984228756e-06, "loss": 1.1751, "step": 1425 }, { "epoch": 0.6538637402834934, "grad_norm": 0.6521756883889377, "learning_rate": 3.227549655732843e-06, "loss": 1.1746, "step": 1430 }, { "epoch": 0.6561499771376315, "grad_norm": 0.6292502440238857, "learning_rate": 3.19028973056441e-06, "loss": 1.1796, "step": 1435 }, { "epoch": 0.6584362139917695, "grad_norm": 0.7223300006546375, "learning_rate": 3.153145096636211e-06, "loss": 1.1769, "step": 1440 }, { "epoch": 0.6607224508459076, "grad_norm": 0.6123252900962536, "learning_rate": 3.1161181203220146e-06, "loss": 1.1798, "step": 1445 }, { "epoch": 0.6630086877000457, "grad_norm": 0.6176590524451245, "learning_rate": 3.079211160499975e-06, "loss": 1.1628, "step": 1450 }, { "epoch": 0.6652949245541838, "grad_norm": 0.6851380779593121, "learning_rate": 3.0424265684023556e-06, "loss": 1.1621, "step": 1455 }, { "epoch": 0.6675811614083219, "grad_norm": 0.6135186798564677, "learning_rate": 3.0057666874657365e-06, "loss": 1.1817, "step": 1460 }, { "epoch": 0.66986739826246, "grad_norm": 0.6162664151552476, "learning_rate": 2.9692338531817205e-06, "loss": 1.1621, "step": 1465 }, { "epoch": 0.6721536351165981, "grad_norm": 0.6209879083469707, "learning_rate": 2.9328303929481507e-06, "loss": 1.1788, "step": 1470 }, { "epoch": 0.6744398719707362, "grad_norm": 0.6564960801220917, "learning_rate": 2.8965586259208295e-06, "loss": 1.1497, "step": 1475 }, { "epoch": 0.6767261088248743, "grad_norm": 0.6100366044161921, "learning_rate": 2.860420862865787e-06, "loss": 1.1641, "step": 1480 }, { "epoch": 0.6790123456790124, "grad_norm": 0.6401282278697755, "learning_rate": 2.82441940601205e-06, "loss": 1.1647, "step": 1485 }, { "epoch": 0.6812985825331505, "grad_norm": 0.5948814066139619, "learning_rate": 2.7885565489049948e-06, "loss": 1.1862, "step": 1490 }, { "epoch": 0.6835848193872885, "grad_norm": 0.575891260626997, "learning_rate": 2.7528345762602125e-06, "loss": 1.149, "step": 1495 }, { "epoch": 0.6858710562414266, "grad_norm": 0.6321328549868929, "learning_rate": 2.7172557638179674e-06, "loss": 1.1722, "step": 1500 }, { "epoch": 0.6881572930955647, "grad_norm": 0.620537429422375, "learning_rate": 2.681822378198221e-06, "loss": 1.1667, "step": 1505 }, { "epoch": 0.6904435299497028, "grad_norm": 0.5916688359774108, "learning_rate": 2.6465366767562162e-06, "loss": 1.1742, "step": 1510 }, { "epoch": 0.6927297668038409, "grad_norm": 0.649532932905328, "learning_rate": 2.611400907438685e-06, "loss": 1.1664, "step": 1515 }, { "epoch": 0.695016003657979, "grad_norm": 0.5887639490410209, "learning_rate": 2.5764173086406306e-06, "loss": 1.1684, "step": 1520 }, { "epoch": 0.6973022405121171, "grad_norm": 0.5909674256777088, "learning_rate": 2.5415881090627227e-06, "loss": 1.1681, "step": 1525 }, { "epoch": 0.6995884773662552, "grad_norm": 0.6669572713903603, "learning_rate": 2.506915527569318e-06, "loss": 1.1692, "step": 1530 }, { "epoch": 0.7018747142203933, "grad_norm": 0.6291006193664693, "learning_rate": 2.472401773047107e-06, "loss": 1.1707, "step": 1535 }, { "epoch": 0.7041609510745314, "grad_norm": 0.6241336853751712, "learning_rate": 2.438049044264382e-06, "loss": 1.1763, "step": 1540 }, { "epoch": 0.7064471879286695, "grad_norm": 0.6233093811845397, "learning_rate": 2.4038595297309712e-06, "loss": 1.1595, "step": 1545 }, { "epoch": 0.7087334247828075, "grad_norm": 0.6099376654855213, "learning_rate": 2.3698354075588105e-06, "loss": 1.1815, "step": 1550 }, { "epoch": 0.7110196616369456, "grad_norm": 0.608739940642273, "learning_rate": 2.3359788453231723e-06, "loss": 1.1558, "step": 1555 }, { "epoch": 0.7133058984910837, "grad_norm": 0.6060804682823651, "learning_rate": 2.3022919999245964e-06, "loss": 1.1737, "step": 1560 }, { "epoch": 0.7155921353452218, "grad_norm": 0.6554029837627439, "learning_rate": 2.2687770174514674e-06, "loss": 1.1763, "step": 1565 }, { "epoch": 0.7178783721993599, "grad_norm": 0.6199763037940721, "learning_rate": 2.23543603304329e-06, "loss": 1.1668, "step": 1570 }, { "epoch": 0.720164609053498, "grad_norm": 0.7002533112076955, "learning_rate": 2.20227117075468e-06, "loss": 1.1717, "step": 1575 }, { "epoch": 0.7224508459076361, "grad_norm": 0.5685258465602809, "learning_rate": 2.1692845434200323e-06, "loss": 1.1793, "step": 1580 }, { "epoch": 0.7247370827617741, "grad_norm": 0.5988803647429354, "learning_rate": 2.136478252518924e-06, "loss": 1.1762, "step": 1585 }, { "epoch": 0.7270233196159122, "grad_norm": 0.6220944262982843, "learning_rate": 2.103854388042243e-06, "loss": 1.1732, "step": 1590 }, { "epoch": 0.7293095564700502, "grad_norm": 0.5872374752551915, "learning_rate": 2.071415028359026e-06, "loss": 1.1653, "step": 1595 }, { "epoch": 0.7315957933241883, "grad_norm": 0.6315378201627972, "learning_rate": 2.0391622400840665e-06, "loss": 1.1631, "step": 1600 }, { "epoch": 0.7338820301783264, "grad_norm": 0.6166479295990325, "learning_rate": 2.0070980779462513e-06, "loss": 1.1632, "step": 1605 }, { "epoch": 0.7361682670324645, "grad_norm": 0.6082820756952414, "learning_rate": 1.975224584657648e-06, "loss": 1.1609, "step": 1610 }, { "epoch": 0.7384545038866026, "grad_norm": 0.5711567863660318, "learning_rate": 1.943543790783392e-06, "loss": 1.1629, "step": 1615 }, { "epoch": 0.7407407407407407, "grad_norm": 0.5934876997772376, "learning_rate": 1.9120577146123125e-06, "loss": 1.1711, "step": 1620 }, { "epoch": 0.7430269775948788, "grad_norm": 0.6044258229955937, "learning_rate": 1.8807683620283496e-06, "loss": 1.1792, "step": 1625 }, { "epoch": 0.7453132144490169, "grad_norm": 0.6414108282805848, "learning_rate": 1.8496777263827775e-06, "loss": 1.1909, "step": 1630 }, { "epoch": 0.747599451303155, "grad_norm": 0.5928077840962543, "learning_rate": 1.8187877883672024e-06, "loss": 1.177, "step": 1635 }, { "epoch": 0.7498856881572931, "grad_norm": 0.5674967348667851, "learning_rate": 1.7881005158873826e-06, "loss": 1.1698, "step": 1640 }, { "epoch": 0.7521719250114312, "grad_norm": 0.6190325214784786, "learning_rate": 1.757617863937865e-06, "loss": 1.1564, "step": 1645 }, { "epoch": 0.7544581618655692, "grad_norm": 0.5994621485851359, "learning_rate": 1.7273417744774323e-06, "loss": 1.1682, "step": 1650 }, { "epoch": 0.7567443987197073, "grad_norm": 0.6486512119864596, "learning_rate": 1.6972741763053835e-06, "loss": 1.1695, "step": 1655 }, { "epoch": 0.7590306355738454, "grad_norm": 0.6124244446703457, "learning_rate": 1.6674169849386606e-06, "loss": 1.1735, "step": 1660 }, { "epoch": 0.7613168724279835, "grad_norm": 0.6215393083401685, "learning_rate": 1.6377721024898214e-06, "loss": 1.1611, "step": 1665 }, { "epoch": 0.7636031092821216, "grad_norm": 0.6379465283211975, "learning_rate": 1.608341417545849e-06, "loss": 1.1481, "step": 1670 }, { "epoch": 0.7658893461362597, "grad_norm": 0.5646658898706897, "learning_rate": 1.5791268050478487e-06, "loss": 1.1732, "step": 1675 }, { "epoch": 0.7681755829903978, "grad_norm": 0.6028441016085894, "learning_rate": 1.5501301261715896e-06, "loss": 1.1703, "step": 1680 }, { "epoch": 0.7704618198445359, "grad_norm": 0.6313316478647917, "learning_rate": 1.5213532282089466e-06, "loss": 1.1631, "step": 1685 }, { "epoch": 0.772748056698674, "grad_norm": 0.600237347487572, "learning_rate": 1.4927979444502028e-06, "loss": 1.1642, "step": 1690 }, { "epoch": 0.7750342935528121, "grad_norm": 0.5957448361281138, "learning_rate": 1.4644660940672628e-06, "loss": 1.1668, "step": 1695 }, { "epoch": 0.7773205304069501, "grad_norm": 0.5872437663700951, "learning_rate": 1.4363594819977606e-06, "loss": 1.1707, "step": 1700 }, { "epoch": 0.7796067672610882, "grad_norm": 0.7075549655922131, "learning_rate": 1.4084798988300684e-06, "loss": 1.1723, "step": 1705 }, { "epoch": 0.7818930041152263, "grad_norm": 0.6203199463017092, "learning_rate": 1.3808291206892232e-06, "loss": 1.1668, "step": 1710 }, { "epoch": 0.7841792409693644, "grad_norm": 0.5759538308213393, "learning_rate": 1.3534089091237757e-06, "loss": 1.1598, "step": 1715 }, { "epoch": 0.7864654778235025, "grad_norm": 0.5942123152988342, "learning_rate": 1.3262210109935719e-06, "loss": 1.1699, "step": 1720 }, { "epoch": 0.7887517146776406, "grad_norm": 0.6597153339968819, "learning_rate": 1.2992671583584587e-06, "loss": 1.163, "step": 1725 }, { "epoch": 0.7910379515317787, "grad_norm": 0.5994756887911626, "learning_rate": 1.2725490683679458e-06, "loss": 1.1797, "step": 1730 }, { "epoch": 0.7933241883859168, "grad_norm": 0.5942174681280669, "learning_rate": 1.2460684431518055e-06, "loss": 1.1649, "step": 1735 }, { "epoch": 0.7956104252400549, "grad_norm": 0.5884403788886147, "learning_rate": 1.2198269697116416e-06, "loss": 1.1627, "step": 1740 }, { "epoch": 0.797896662094193, "grad_norm": 0.5917506875732326, "learning_rate": 1.1938263198134087e-06, "loss": 1.1729, "step": 1745 }, { "epoch": 0.8001828989483311, "grad_norm": 0.5689945244963683, "learning_rate": 1.168068149880912e-06, "loss": 1.1639, "step": 1750 }, { "epoch": 0.8024691358024691, "grad_norm": 0.5945700377730089, "learning_rate": 1.1425541008902852e-06, "loss": 1.1616, "step": 1755 }, { "epoch": 0.8047553726566072, "grad_norm": 0.5960318855848052, "learning_rate": 1.1172857982654445e-06, "loss": 1.1796, "step": 1760 }, { "epoch": 0.8070416095107453, "grad_norm": 0.606906781862042, "learning_rate": 1.092264851774536e-06, "loss": 1.1524, "step": 1765 }, { "epoch": 0.8093278463648834, "grad_norm": 0.6686014083887466, "learning_rate": 1.067492855427385e-06, "loss": 1.1681, "step": 1770 }, { "epoch": 0.8116140832190215, "grad_norm": 0.6637295349703526, "learning_rate": 1.0429713873739505e-06, "loss": 1.1603, "step": 1775 }, { "epoch": 0.8139003200731596, "grad_norm": 0.5937746781646984, "learning_rate": 1.0187020098037759e-06, "loss": 1.1577, "step": 1780 }, { "epoch": 0.8161865569272977, "grad_norm": 0.6154438358761861, "learning_rate": 9.946862688464753e-07, "loss": 1.1596, "step": 1785 }, { "epoch": 0.8184727937814358, "grad_norm": 0.6511739287376433, "learning_rate": 9.709256944732343e-07, "loss": 1.1707, "step": 1790 }, { "epoch": 0.8207590306355739, "grad_norm": 0.6174881374069865, "learning_rate": 9.474218003993275e-07, "loss": 1.1775, "step": 1795 }, { "epoch": 0.823045267489712, "grad_norm": 0.5791204684491382, "learning_rate": 9.241760839877023e-07, "loss": 1.1571, "step": 1800 }, { "epoch": 0.82533150434385, "grad_norm": 0.6464260391976697, "learning_rate": 9.011900261535767e-07, "loss": 1.1713, "step": 1805 }, { "epoch": 0.8276177411979881, "grad_norm": 0.6102288143326278, "learning_rate": 8.784650912700909e-07, "loss": 1.1654, "step": 1810 }, { "epoch": 0.8299039780521262, "grad_norm": 0.6226743471510658, "learning_rate": 8.560027270750276e-07, "loss": 1.1655, "step": 1815 }, { "epoch": 0.8321902149062643, "grad_norm": 0.6079710775307922, "learning_rate": 8.338043645785698e-07, "loss": 1.1669, "step": 1820 }, { "epoch": 0.8344764517604024, "grad_norm": 0.6077180347148399, "learning_rate": 8.118714179721404e-07, "loss": 1.1529, "step": 1825 }, { "epoch": 0.8367626886145405, "grad_norm": 0.6420590181680129, "learning_rate": 7.902052845383112e-07, "loss": 1.1662, "step": 1830 }, { "epoch": 0.8390489254686786, "grad_norm": 0.5675937752707487, "learning_rate": 7.6880734456178e-07, "loss": 1.1638, "step": 1835 }, { "epoch": 0.8413351623228167, "grad_norm": 0.5963600943686237, "learning_rate": 7.476789612414414e-07, "loss": 1.1648, "step": 1840 }, { "epoch": 0.8436213991769548, "grad_norm": 0.6248451529177521, "learning_rate": 7.268214806035423e-07, "loss": 1.1704, "step": 1845 }, { "epoch": 0.8459076360310929, "grad_norm": 0.6582130785897107, "learning_rate": 7.062362314159211e-07, "loss": 1.1716, "step": 1850 }, { "epoch": 0.848193872885231, "grad_norm": 0.6104979563533071, "learning_rate": 6.859245251033697e-07, "loss": 1.1551, "step": 1855 }, { "epoch": 0.850480109739369, "grad_norm": 0.6291505363028616, "learning_rate": 6.658876556640781e-07, "loss": 1.1606, "step": 1860 }, { "epoch": 0.8527663465935071, "grad_norm": 0.626351910055198, "learning_rate": 6.461268995871967e-07, "loss": 1.1648, "step": 1865 }, { "epoch": 0.8550525834476452, "grad_norm": 0.5991977091276379, "learning_rate": 6.266435157715222e-07, "loss": 1.1403, "step": 1870 }, { "epoch": 0.8573388203017832, "grad_norm": 0.6133109082285381, "learning_rate": 6.074387454452891e-07, "loss": 1.1578, "step": 1875 }, { "epoch": 0.8596250571559213, "grad_norm": 0.6062420232877472, "learning_rate": 5.885138120870965e-07, "loss": 1.1422, "step": 1880 }, { "epoch": 0.8619112940100594, "grad_norm": 0.5920619164293491, "learning_rate": 5.698699213479697e-07, "loss": 1.1503, "step": 1885 }, { "epoch": 0.8641975308641975, "grad_norm": 0.6179934405963249, "learning_rate": 5.515082609745465e-07, "loss": 1.1728, "step": 1890 }, { "epoch": 0.8664837677183356, "grad_norm": 0.6191884681224713, "learning_rate": 5.334300007334065e-07, "loss": 1.1514, "step": 1895 }, { "epoch": 0.8687700045724737, "grad_norm": 0.6148818189812965, "learning_rate": 5.156362923365587e-07, "loss": 1.1772, "step": 1900 }, { "epoch": 0.8710562414266118, "grad_norm": 0.5927964681781609, "learning_rate": 4.981282693680584e-07, "loss": 1.1747, "step": 1905 }, { "epoch": 0.8733424782807498, "grad_norm": 0.630038523819453, "learning_rate": 4.80907047211796e-07, "loss": 1.1638, "step": 1910 }, { "epoch": 0.8756287151348879, "grad_norm": 0.5822419290829026, "learning_rate": 4.639737229804403e-07, "loss": 1.1667, "step": 1915 }, { "epoch": 0.877914951989026, "grad_norm": 0.6169634205827448, "learning_rate": 4.473293754455399e-07, "loss": 1.1695, "step": 1920 }, { "epoch": 0.8802011888431641, "grad_norm": 0.5892947845386679, "learning_rate": 4.3097506496880325e-07, "loss": 1.1684, "step": 1925 }, { "epoch": 0.8824874256973022, "grad_norm": 0.6796811793089527, "learning_rate": 4.149118334345403e-07, "loss": 1.1604, "step": 1930 }, { "epoch": 0.8847736625514403, "grad_norm": 0.5951100132603444, "learning_rate": 3.9914070418329123e-07, "loss": 1.1632, "step": 1935 }, { "epoch": 0.8870598994055784, "grad_norm": 0.6710610553022762, "learning_rate": 3.836626819466338e-07, "loss": 1.1455, "step": 1940 }, { "epoch": 0.8893461362597165, "grad_norm": 0.6128779790737046, "learning_rate": 3.684787527831707e-07, "loss": 1.1609, "step": 1945 }, { "epoch": 0.8916323731138546, "grad_norm": 0.5800567298586133, "learning_rate": 3.53589884015712e-07, "loss": 1.1636, "step": 1950 }, { "epoch": 0.8939186099679927, "grad_norm": 0.5600191099569565, "learning_rate": 3.3899702416965166e-07, "loss": 1.1721, "step": 1955 }, { "epoch": 0.8962048468221308, "grad_norm": 0.5964683215562515, "learning_rate": 3.247011029125391e-07, "loss": 1.1508, "step": 1960 }, { "epoch": 0.8984910836762688, "grad_norm": 0.6125213377358303, "learning_rate": 3.1070303099485055e-07, "loss": 1.1716, "step": 1965 }, { "epoch": 0.9007773205304069, "grad_norm": 0.5812964318078312, "learning_rate": 2.9700370019197287e-07, "loss": 1.1495, "step": 1970 }, { "epoch": 0.903063557384545, "grad_norm": 0.5947330421470328, "learning_rate": 2.8360398324738415e-07, "loss": 1.1446, "step": 1975 }, { "epoch": 0.9053497942386831, "grad_norm": 0.5936630268160432, "learning_rate": 2.7050473381706186e-07, "loss": 1.1519, "step": 1980 }, { "epoch": 0.9076360310928212, "grad_norm": 0.6228979256825669, "learning_rate": 2.577067864150906e-07, "loss": 1.1688, "step": 1985 }, { "epoch": 0.9099222679469593, "grad_norm": 0.6500515468078818, "learning_rate": 2.452109563605065e-07, "loss": 1.1718, "step": 1990 }, { "epoch": 0.9122085048010974, "grad_norm": 0.568112374463465, "learning_rate": 2.330180397253473e-07, "loss": 1.169, "step": 1995 }, { "epoch": 0.9144947416552355, "grad_norm": 0.6014335143268985, "learning_rate": 2.2112881328394287e-07, "loss": 1.1556, "step": 2000 }, { "epoch": 0.9167809785093736, "grad_norm": 0.5814781144236604, "learning_rate": 2.0954403446342753e-07, "loss": 1.1688, "step": 2005 }, { "epoch": 0.9190672153635117, "grad_norm": 0.6269697024329176, "learning_rate": 1.9826444129548317e-07, "loss": 1.1791, "step": 2010 }, { "epoch": 0.9213534522176497, "grad_norm": 0.5793724546294099, "learning_rate": 1.8729075236932903e-07, "loss": 1.1736, "step": 2015 }, { "epoch": 0.9236396890717878, "grad_norm": 0.5757028817840649, "learning_rate": 1.7662366678593502e-07, "loss": 1.1674, "step": 2020 }, { "epoch": 0.9259259259259259, "grad_norm": 0.6383512892284545, "learning_rate": 1.6626386411348783e-07, "loss": 1.1725, "step": 2025 }, { "epoch": 0.928212162780064, "grad_norm": 0.6064267969457637, "learning_rate": 1.56212004344099e-07, "loss": 1.1596, "step": 2030 }, { "epoch": 0.9304983996342021, "grad_norm": 0.6046327277263103, "learning_rate": 1.4646872785175182e-07, "loss": 1.1616, "step": 2035 }, { "epoch": 0.9327846364883402, "grad_norm": 0.611959733363112, "learning_rate": 1.3703465535151505e-07, "loss": 1.1614, "step": 2040 }, { "epoch": 0.9350708733424783, "grad_norm": 0.6153837948383357, "learning_rate": 1.2791038785999243e-07, "loss": 1.1494, "step": 2045 }, { "epoch": 0.9373571101966164, "grad_norm": 0.5507733416769363, "learning_rate": 1.1909650665703265e-07, "loss": 1.1331, "step": 2050 }, { "epoch": 0.9396433470507545, "grad_norm": 0.5787602661155832, "learning_rate": 1.1059357324870456e-07, "loss": 1.1548, "step": 2055 }, { "epoch": 0.9419295839048926, "grad_norm": 0.5848374134615248, "learning_rate": 1.024021293315175e-07, "loss": 1.1628, "step": 2060 }, { "epoch": 0.9442158207590307, "grad_norm": 0.585861722501522, "learning_rate": 9.452269675791603e-08, "loss": 1.1424, "step": 2065 }, { "epoch": 0.9465020576131687, "grad_norm": 0.5870866242087308, "learning_rate": 8.69557775030344e-08, "loss": 1.181, "step": 2070 }, { "epoch": 0.9487882944673068, "grad_norm": 0.5917858310575264, "learning_rate": 7.970185363271432e-08, "loss": 1.1564, "step": 2075 }, { "epoch": 0.9510745313214449, "grad_norm": 0.6272259568011471, "learning_rate": 7.276138727279669e-08, "loss": 1.1659, "step": 2080 }, { "epoch": 0.953360768175583, "grad_norm": 0.607366888512829, "learning_rate": 6.613482057968023e-08, "loss": 1.1612, "step": 2085 }, { "epoch": 0.9556470050297211, "grad_norm": 0.61579614820576, "learning_rate": 5.982257571215178e-08, "loss": 1.1644, "step": 2090 }, { "epoch": 0.9579332418838592, "grad_norm": 0.6162342496797737, "learning_rate": 5.382505480449274e-08, "loss": 1.1439, "step": 2095 }, { "epoch": 0.9602194787379973, "grad_norm": 0.5880335959078453, "learning_rate": 4.814263994086077e-08, "loss": 1.1405, "step": 2100 }, { "epoch": 0.9625057155921354, "grad_norm": 0.5978901392727579, "learning_rate": 4.2775693130948094e-08, "loss": 1.1792, "step": 2105 }, { "epoch": 0.9647919524462735, "grad_norm": 0.5725207858399001, "learning_rate": 3.772455628691829e-08, "loss": 1.1679, "step": 2110 }, { "epoch": 0.9670781893004116, "grad_norm": 0.6126681514493614, "learning_rate": 3.2989551201624836e-08, "loss": 1.1621, "step": 2115 }, { "epoch": 0.9693644261545497, "grad_norm": 0.6026354249744876, "learning_rate": 2.857097952810972e-08, "loss": 1.1728, "step": 2120 }, { "epoch": 0.9716506630086877, "grad_norm": 0.5876159431495082, "learning_rate": 2.4469122760388264e-08, "loss": 1.1552, "step": 2125 }, { "epoch": 0.9739368998628258, "grad_norm": 0.5795939734314318, "learning_rate": 2.0684242215511797e-08, "loss": 1.1586, "step": 2130 }, { "epoch": 0.9762231367169639, "grad_norm": 0.6100064497073957, "learning_rate": 1.7216579016925415e-08, "loss": 1.1585, "step": 2135 }, { "epoch": 0.978509373571102, "grad_norm": 0.6410024148442394, "learning_rate": 1.4066354079101396e-08, "loss": 1.1576, "step": 2140 }, { "epoch": 0.9807956104252401, "grad_norm": 0.5946394925998356, "learning_rate": 1.1233768093468766e-08, "loss": 1.1565, "step": 2145 }, { "epoch": 0.9830818472793782, "grad_norm": 0.5993080705042445, "learning_rate": 8.719001515627434e-09, "loss": 1.1649, "step": 2150 }, { "epoch": 0.9853680841335163, "grad_norm": 0.5857680491868433, "learning_rate": 6.5222145538501595e-09, "loss": 1.176, "step": 2155 }, { "epoch": 0.9876543209876543, "grad_norm": 0.6157142971328977, "learning_rate": 4.643547158878492e-09, "loss": 1.146, "step": 2160 }, { "epoch": 0.9899405578417924, "grad_norm": 0.6005659801135901, "learning_rate": 3.0831190150054646e-09, "loss": 1.1607, "step": 2165 }, { "epoch": 0.9922267946959304, "grad_norm": 0.5963682235084494, "learning_rate": 1.8410295324505778e-09, "loss": 1.1668, "step": 2170 }, { "epoch": 0.9945130315500685, "grad_norm": 0.649218390898171, "learning_rate": 9.173578410281992e-10, "loss": 1.1602, "step": 2175 }, { "epoch": 0.9967992684042066, "grad_norm": 0.612662110275474, "learning_rate": 3.1216278510493027e-10, "loss": 1.1596, "step": 2180 }, { "epoch": 0.9990855052583447, "grad_norm": 0.6025732837303296, "learning_rate": 2.548291985149387e-11, "loss": 1.147, "step": 2185 }, { "epoch": 1.0, "eval_runtime": 4.0833, "eval_samples_per_second": 2.449, "eval_steps_per_second": 0.735, "step": 2187 }, { "epoch": 1.0, "step": 2187, "total_flos": 9703359095242752.0, "train_loss": 0.0, "train_runtime": 0.009, "train_samples_per_second": 3870652.356, "train_steps_per_second": 241991.844 } ], "logging_steps": 5, "max_steps": 2187, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 100, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 9703359095242752.0, "train_batch_size": 4, "trial_name": null, "trial_params": null }