{ "best_metric": null, "best_model_checkpoint": null, "epoch": 1.9985964912280703, "eval_steps": 623, "global_step": 2492, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0008020050125313284, "grad_norm": 0.09629630297422409, "learning_rate": 0.000499999801338539, "loss": 0.8462, "step": 1 }, { "epoch": 0.0040100250626566416, "grad_norm": 0.09335128962993622, "learning_rate": 0.0004999950334792631, "loss": 0.9167, "step": 5 }, { "epoch": 0.008020050125313283, "grad_norm": 0.0814199447631836, "learning_rate": 0.0004999801341143832, "loss": 0.8982, "step": 10 }, { "epoch": 0.012030075187969926, "grad_norm": 0.0830962136387825, "learning_rate": 0.0004999553024973442, "loss": 0.8568, "step": 15 }, { "epoch": 0.016040100250626566, "grad_norm": 0.07535357028245926, "learning_rate": 0.00049992053961476, "loss": 0.8423, "step": 20 }, { "epoch": 0.020050125313283207, "grad_norm": 0.08100854605436325, "learning_rate": 0.0004998758468478354, "loss": 0.822, "step": 25 }, { "epoch": 0.02406015037593985, "grad_norm": 0.0720616802573204, "learning_rate": 0.0004998212259723108, "loss": 0.801, "step": 30 }, { "epoch": 0.028070175438596492, "grad_norm": 0.07352820038795471, "learning_rate": 0.0004997566791583916, "loss": 0.819, "step": 35 }, { "epoch": 0.03208020050125313, "grad_norm": 0.07747522741556168, "learning_rate": 0.0004996822089706628, "loss": 0.7917, "step": 40 }, { "epoch": 0.03609022556390978, "grad_norm": 0.0775732770562172, "learning_rate": 0.0004995978183679864, "loss": 0.8007, "step": 45 }, { "epoch": 0.040100250626566414, "grad_norm": 0.08353012800216675, "learning_rate": 0.0004995035107033833, "loss": 0.7939, "step": 50 }, { "epoch": 0.04411027568922306, "grad_norm": 0.09015482664108276, "learning_rate": 0.0004993992897239018, "loss": 0.7844, "step": 55 }, { "epoch": 0.0481203007518797, "grad_norm": 0.08346273750066757, "learning_rate": 0.0004992851595704668, "loss": 0.7805, "step": 60 }, { "epoch": 0.05213032581453634, "grad_norm": 0.08374553918838501, "learning_rate": 0.0004991611247777166, "loss": 0.7758, "step": 65 }, { "epoch": 0.056140350877192984, "grad_norm": 0.0855209082365036, "learning_rate": 0.0004990271902738223, "loss": 0.7799, "step": 70 }, { "epoch": 0.06015037593984962, "grad_norm": 0.08624681085348129, "learning_rate": 0.0004988833613802917, "loss": 0.7461, "step": 75 }, { "epoch": 0.06416040100250626, "grad_norm": 0.08796326816082001, "learning_rate": 0.0004987296438117581, "loss": 0.7181, "step": 80 }, { "epoch": 0.06817042606516291, "grad_norm": 0.08574078977108002, "learning_rate": 0.0004985660436757538, "loss": 0.7209, "step": 85 }, { "epoch": 0.07218045112781955, "grad_norm": 0.09403970092535019, "learning_rate": 0.0004983925674724662, "loss": 0.7297, "step": 90 }, { "epoch": 0.0761904761904762, "grad_norm": 0.1051216572523117, "learning_rate": 0.0004982092220944808, "loss": 0.7077, "step": 95 }, { "epoch": 0.08020050125313283, "grad_norm": 0.10198145359754562, "learning_rate": 0.0004980160148265066, "loss": 0.7369, "step": 100 }, { "epoch": 0.08421052631578947, "grad_norm": 0.10444864630699158, "learning_rate": 0.0004978129533450866, "loss": 0.7044, "step": 105 }, { "epoch": 0.08822055137844612, "grad_norm": 0.09810823947191238, "learning_rate": 0.0004976000457182935, "loss": 0.7055, "step": 110 }, { "epoch": 0.09223057644110276, "grad_norm": 0.10619106888771057, "learning_rate": 0.0004973773004054081, "loss": 0.6951, "step": 115 }, { "epoch": 0.0962406015037594, "grad_norm": 0.11827559024095535, "learning_rate": 0.0004971447262565846, "loss": 0.7285, "step": 120 }, { "epoch": 0.10025062656641603, "grad_norm": 0.09872109442949295, "learning_rate": 0.0004969023325124974, "loss": 0.7065, "step": 125 }, { "epoch": 0.10426065162907268, "grad_norm": 0.11186440289020538, "learning_rate": 0.0004966501288039749, "loss": 0.6663, "step": 130 }, { "epoch": 0.10827067669172932, "grad_norm": 0.10218355059623718, "learning_rate": 0.0004963881251516169, "loss": 0.6763, "step": 135 }, { "epoch": 0.11228070175438597, "grad_norm": 0.11396008729934692, "learning_rate": 0.0004961163319653958, "loss": 0.6704, "step": 140 }, { "epoch": 0.11629072681704261, "grad_norm": 0.10969991981983185, "learning_rate": 0.0004958347600442437, "loss": 0.6779, "step": 145 }, { "epoch": 0.12030075187969924, "grad_norm": 0.11111714690923691, "learning_rate": 0.0004955434205756226, "loss": 0.6835, "step": 150 }, { "epoch": 0.12431077694235589, "grad_norm": 0.11582209914922714, "learning_rate": 0.000495242325135081, "loss": 0.6966, "step": 155 }, { "epoch": 0.12832080200501253, "grad_norm": 0.11664141714572906, "learning_rate": 0.0004949314856857926, "loss": 0.641, "step": 160 }, { "epoch": 0.13233082706766916, "grad_norm": 0.11299869418144226, "learning_rate": 0.000494610914578082, "loss": 0.6349, "step": 165 }, { "epoch": 0.13634085213032582, "grad_norm": 0.10604275017976761, "learning_rate": 0.0004942806245489338, "loss": 0.6498, "step": 170 }, { "epoch": 0.14035087719298245, "grad_norm": 0.11253874003887177, "learning_rate": 0.000493940628721486, "loss": 0.6536, "step": 175 }, { "epoch": 0.1443609022556391, "grad_norm": 0.12596091628074646, "learning_rate": 0.0004935909406045095, "loss": 0.6348, "step": 180 }, { "epoch": 0.14837092731829574, "grad_norm": 0.12171223014593124, "learning_rate": 0.0004932315740918703, "loss": 0.64, "step": 185 }, { "epoch": 0.1523809523809524, "grad_norm": 0.12628182768821716, "learning_rate": 0.0004928625434619785, "loss": 0.6225, "step": 190 }, { "epoch": 0.15639097744360902, "grad_norm": 0.12631121277809143, "learning_rate": 0.0004924838633772201, "loss": 0.6197, "step": 195 }, { "epoch": 0.16040100250626566, "grad_norm": 0.12037768959999084, "learning_rate": 0.0004920955488833753, "loss": 0.6201, "step": 200 }, { "epoch": 0.1644110275689223, "grad_norm": 0.11302505433559418, "learning_rate": 0.0004916976154090198, "loss": 0.5972, "step": 205 }, { "epoch": 0.16842105263157894, "grad_norm": 0.15172865986824036, "learning_rate": 0.0004912900787649124, "loss": 0.6322, "step": 210 }, { "epoch": 0.1724310776942356, "grad_norm": 0.12461714446544647, "learning_rate": 0.0004908729551433668, "loss": 0.5681, "step": 215 }, { "epoch": 0.17644110275689223, "grad_norm": 0.11686723679304123, "learning_rate": 0.0004904462611176079, "loss": 0.6112, "step": 220 }, { "epoch": 0.18045112781954886, "grad_norm": 0.12082964181900024, "learning_rate": 0.0004900100136411134, "loss": 0.59, "step": 225 }, { "epoch": 0.18446115288220552, "grad_norm": 0.14786957204341888, "learning_rate": 0.0004895642300469405, "loss": 0.6008, "step": 230 }, { "epoch": 0.18847117794486215, "grad_norm": 0.11940759420394897, "learning_rate": 0.0004891089280470369, "loss": 0.5781, "step": 235 }, { "epoch": 0.1924812030075188, "grad_norm": 0.12067493796348572, "learning_rate": 0.0004886441257315373, "loss": 0.5963, "step": 240 }, { "epoch": 0.19649122807017544, "grad_norm": 0.12630167603492737, "learning_rate": 0.0004881698415680442, "loss": 0.5767, "step": 245 }, { "epoch": 0.20050125313283207, "grad_norm": 0.12695525586605072, "learning_rate": 0.00048768609440089474, "loss": 0.6039, "step": 250 }, { "epoch": 0.20451127819548873, "grad_norm": 0.13445709645748138, "learning_rate": 0.00048719290345041166, "loss": 0.6087, "step": 255 }, { "epoch": 0.20852130325814536, "grad_norm": 0.13432475924491882, "learning_rate": 0.0004866902883121397, "loss": 0.5491, "step": 260 }, { "epoch": 0.21253132832080202, "grad_norm": 0.15156829357147217, "learning_rate": 0.0004861782689560667, "loss": 0.5492, "step": 265 }, { "epoch": 0.21654135338345865, "grad_norm": 0.13093040883541107, "learning_rate": 0.0004856568657258308, "loss": 0.6045, "step": 270 }, { "epoch": 0.22055137844611528, "grad_norm": 0.1290189027786255, "learning_rate": 0.00048512609933791165, "loss": 0.5433, "step": 275 }, { "epoch": 0.22456140350877193, "grad_norm": 0.13374298810958862, "learning_rate": 0.00048458599088080736, "loss": 0.5575, "step": 280 }, { "epoch": 0.22857142857142856, "grad_norm": 0.1266585886478424, "learning_rate": 0.00048403656181419686, "loss": 0.5474, "step": 285 }, { "epoch": 0.23258145363408522, "grad_norm": 0.13824708759784698, "learning_rate": 0.00048347783396808687, "loss": 0.5465, "step": 290 }, { "epoch": 0.23659147869674185, "grad_norm": 0.12988659739494324, "learning_rate": 0.00048290982954194485, "loss": 0.5605, "step": 295 }, { "epoch": 0.24060150375939848, "grad_norm": 0.13609863817691803, "learning_rate": 0.0004823325711038169, "loss": 0.5812, "step": 300 }, { "epoch": 0.24461152882205514, "grad_norm": 0.14271323382854462, "learning_rate": 0.00048174608158943114, "loss": 0.5485, "step": 305 }, { "epoch": 0.24862155388471177, "grad_norm": 0.13438940048217773, "learning_rate": 0.0004811503843012861, "loss": 0.5325, "step": 310 }, { "epoch": 0.25263157894736843, "grad_norm": 0.138273686170578, "learning_rate": 0.0004805455029077255, "loss": 0.5283, "step": 315 }, { "epoch": 0.25664160401002506, "grad_norm": 0.15901249647140503, "learning_rate": 0.0004799314614419969, "loss": 0.5717, "step": 320 }, { "epoch": 0.2606516290726817, "grad_norm": 0.14647658169269562, "learning_rate": 0.0004793082843012979, "loss": 0.5378, "step": 325 }, { "epoch": 0.2646616541353383, "grad_norm": 0.1300746202468872, "learning_rate": 0.0004786759962458059, "loss": 0.5183, "step": 330 }, { "epoch": 0.268671679197995, "grad_norm": 0.13711751997470856, "learning_rate": 0.000478034622397695, "loss": 0.5442, "step": 335 }, { "epoch": 0.27268170426065164, "grad_norm": 0.150499626994133, "learning_rate": 0.0004773841882401372, "loss": 0.524, "step": 340 }, { "epoch": 0.27669172932330827, "grad_norm": 0.13912473618984222, "learning_rate": 0.00047672471961629037, "loss": 0.5412, "step": 345 }, { "epoch": 0.2807017543859649, "grad_norm": 0.1455031782388687, "learning_rate": 0.0004760562427282712, "loss": 0.5321, "step": 350 }, { "epoch": 0.2847117794486215, "grad_norm": 0.12485536932945251, "learning_rate": 0.0004753787841361145, "loss": 0.5113, "step": 355 }, { "epoch": 0.2887218045112782, "grad_norm": 0.13250643014907837, "learning_rate": 0.00047469237075671724, "loss": 0.5402, "step": 360 }, { "epoch": 0.29273182957393484, "grad_norm": 0.13680434226989746, "learning_rate": 0.00047399702986276984, "loss": 0.5751, "step": 365 }, { "epoch": 0.2967418546365915, "grad_norm": 0.14655643701553345, "learning_rate": 0.0004732927890816719, "loss": 0.5371, "step": 370 }, { "epoch": 0.3007518796992481, "grad_norm": 0.14064128696918488, "learning_rate": 0.00047257967639443513, "loss": 0.531, "step": 375 }, { "epoch": 0.3047619047619048, "grad_norm": 0.14199021458625793, "learning_rate": 0.00047185772013457096, "loss": 0.4934, "step": 380 }, { "epoch": 0.3087719298245614, "grad_norm": 0.14177988469600677, "learning_rate": 0.00047112694898696533, "loss": 0.5249, "step": 385 }, { "epoch": 0.31278195488721805, "grad_norm": 0.14135052263736725, "learning_rate": 0.00047038739198673876, "loss": 0.5272, "step": 390 }, { "epoch": 0.3167919799498747, "grad_norm": 0.12976044416427612, "learning_rate": 0.00046963907851809263, "loss": 0.5263, "step": 395 }, { "epoch": 0.3208020050125313, "grad_norm": 0.15720346570014954, "learning_rate": 0.0004688820383131418, "loss": 0.4858, "step": 400 }, { "epoch": 0.324812030075188, "grad_norm": 0.14797399938106537, "learning_rate": 0.0004681163014507334, "loss": 0.5201, "step": 405 }, { "epoch": 0.3288220551378446, "grad_norm": 0.15095673501491547, "learning_rate": 0.0004673418983552513, "loss": 0.5366, "step": 410 }, { "epoch": 0.33283208020050126, "grad_norm": 0.16217216849327087, "learning_rate": 0.00046655885979540783, "loss": 0.4836, "step": 415 }, { "epoch": 0.3368421052631579, "grad_norm": 0.14743471145629883, "learning_rate": 0.00046576721688302105, "loss": 0.5067, "step": 420 }, { "epoch": 0.3408521303258145, "grad_norm": 0.15485595166683197, "learning_rate": 0.00046496700107177835, "loss": 0.4614, "step": 425 }, { "epoch": 0.3448621553884712, "grad_norm": 0.15458199381828308, "learning_rate": 0.0004641582441559872, "loss": 0.4897, "step": 430 }, { "epoch": 0.34887218045112783, "grad_norm": 0.13934871554374695, "learning_rate": 0.00046334097826931144, "loss": 0.522, "step": 435 }, { "epoch": 0.35288220551378446, "grad_norm": 0.1473606377840042, "learning_rate": 0.00046251523588349485, "loss": 0.4851, "step": 440 }, { "epoch": 0.3568922305764411, "grad_norm": 0.14684675633907318, "learning_rate": 0.00046168104980707104, "loss": 0.4888, "step": 445 }, { "epoch": 0.3609022556390977, "grad_norm": 0.15129096806049347, "learning_rate": 0.0004608384531840595, "loss": 0.5236, "step": 450 }, { "epoch": 0.3649122807017544, "grad_norm": 0.15553347766399384, "learning_rate": 0.00045998747949264895, "loss": 0.4791, "step": 455 }, { "epoch": 0.36892230576441104, "grad_norm": 0.16991403698921204, "learning_rate": 0.00045912816254386747, "loss": 0.4672, "step": 460 }, { "epoch": 0.37293233082706767, "grad_norm": 0.15001340210437775, "learning_rate": 0.0004582605364802385, "loss": 0.4989, "step": 465 }, { "epoch": 0.3769423558897243, "grad_norm": 0.15301191806793213, "learning_rate": 0.00045738463577442467, "loss": 0.4885, "step": 470 }, { "epoch": 0.38095238095238093, "grad_norm": 0.1510782241821289, "learning_rate": 0.00045650049522785827, "loss": 0.4827, "step": 475 }, { "epoch": 0.3849624060150376, "grad_norm": 0.14636896550655365, "learning_rate": 0.0004556081499693581, "loss": 0.4791, "step": 480 }, { "epoch": 0.38897243107769425, "grad_norm": 0.1476137489080429, "learning_rate": 0.00045470763545373395, "loss": 0.4709, "step": 485 }, { "epoch": 0.3929824561403509, "grad_norm": 0.15410684049129486, "learning_rate": 0.000453798987460378, "loss": 0.4794, "step": 490 }, { "epoch": 0.3969924812030075, "grad_norm": 0.14843297004699707, "learning_rate": 0.00045288224209184315, "loss": 0.5077, "step": 495 }, { "epoch": 0.40100250626566414, "grad_norm": 0.1490759700536728, "learning_rate": 0.00045195743577240835, "loss": 0.464, "step": 500 }, { "epoch": 0.4050125313283208, "grad_norm": 0.17478294670581818, "learning_rate": 0.0004510246052466317, "loss": 0.4754, "step": 505 }, { "epoch": 0.40902255639097745, "grad_norm": 0.15610414743423462, "learning_rate": 0.0004500837875778905, "loss": 0.488, "step": 510 }, { "epoch": 0.4130325814536341, "grad_norm": 0.15166425704956055, "learning_rate": 0.00044913502014690837, "loss": 0.4772, "step": 515 }, { "epoch": 0.4170426065162907, "grad_norm": 0.14028780162334442, "learning_rate": 0.0004481783406502703, "loss": 0.5013, "step": 520 }, { "epoch": 0.42105263157894735, "grad_norm": 0.17099718749523163, "learning_rate": 0.0004472137870989247, "loss": 0.4811, "step": 525 }, { "epoch": 0.42506265664160403, "grad_norm": 0.1719314604997635, "learning_rate": 0.0004462413978166734, "loss": 0.4735, "step": 530 }, { "epoch": 0.42907268170426066, "grad_norm": 0.17192725837230682, "learning_rate": 0.0004452612114386485, "loss": 0.4945, "step": 535 }, { "epoch": 0.4330827067669173, "grad_norm": 0.1921142190694809, "learning_rate": 0.000444273266909778, "loss": 0.4975, "step": 540 }, { "epoch": 0.4370927318295739, "grad_norm": 0.16015449166297913, "learning_rate": 0.00044327760348323767, "loss": 0.4566, "step": 545 }, { "epoch": 0.44110275689223055, "grad_norm": 0.15686671435832977, "learning_rate": 0.000442274260718892, "loss": 0.449, "step": 550 }, { "epoch": 0.44511278195488724, "grad_norm": 0.1583927422761917, "learning_rate": 0.0004412632784817221, "loss": 0.4738, "step": 555 }, { "epoch": 0.44912280701754387, "grad_norm": 0.1498887687921524, "learning_rate": 0.00044024469694024196, "loss": 0.4416, "step": 560 }, { "epoch": 0.4531328320802005, "grad_norm": 0.16318589448928833, "learning_rate": 0.0004392185565649024, "loss": 0.4527, "step": 565 }, { "epoch": 0.45714285714285713, "grad_norm": 0.14862380921840668, "learning_rate": 0.000438184898126483, "loss": 0.4613, "step": 570 }, { "epoch": 0.46115288220551376, "grad_norm": 0.17321060597896576, "learning_rate": 0.00043714376269447233, "loss": 0.4587, "step": 575 }, { "epoch": 0.46516290726817044, "grad_norm": 0.15772369503974915, "learning_rate": 0.0004360951916354361, "loss": 0.4546, "step": 580 }, { "epoch": 0.4691729323308271, "grad_norm": 0.1607331931591034, "learning_rate": 0.0004350392266113736, "loss": 0.4836, "step": 585 }, { "epoch": 0.4731829573934837, "grad_norm": 0.16422398388385773, "learning_rate": 0.0004339759095780623, "loss": 0.446, "step": 590 }, { "epoch": 0.47719298245614034, "grad_norm": 0.15711261332035065, "learning_rate": 0.00043290528278339097, "loss": 0.4308, "step": 595 }, { "epoch": 0.48120300751879697, "grad_norm": 0.15732958912849426, "learning_rate": 0.0004318273887656811, "loss": 0.4687, "step": 600 }, { "epoch": 0.48521303258145365, "grad_norm": 0.15295931696891785, "learning_rate": 0.00043074227035199656, "loss": 0.4474, "step": 605 }, { "epoch": 0.4892230576441103, "grad_norm": 0.16534899175167084, "learning_rate": 0.00042964997065644204, "loss": 0.4552, "step": 610 }, { "epoch": 0.4932330827067669, "grad_norm": 0.15630511939525604, "learning_rate": 0.00042855053307845056, "loss": 0.4842, "step": 615 }, { "epoch": 0.49724310776942354, "grad_norm": 0.1624716818332672, "learning_rate": 0.0004274440013010581, "loss": 0.4228, "step": 620 }, { "epoch": 0.49964912280701756, "eval_loss": 0.33371835947036743, "eval_runtime": 8.4047, "eval_samples_per_second": 6.544, "eval_steps_per_second": 0.833, "step": 623 }, { "epoch": 0.5012531328320802, "grad_norm": 0.16876687109470367, "learning_rate": 0.0004263304192891689, "loss": 0.4628, "step": 625 }, { "epoch": 0.5052631578947369, "grad_norm": 0.16050459444522858, "learning_rate": 0.00042520983128780824, "loss": 0.422, "step": 630 }, { "epoch": 0.5092731829573934, "grad_norm": 0.1509382724761963, "learning_rate": 0.00042408228182036446, "loss": 0.4473, "step": 635 }, { "epoch": 0.5132832080200501, "grad_norm": 0.16160839796066284, "learning_rate": 0.00042294781568682, "loss": 0.4264, "step": 640 }, { "epoch": 0.5172932330827068, "grad_norm": 0.1524878293275833, "learning_rate": 0.0004218064779619715, "loss": 0.4332, "step": 645 }, { "epoch": 0.5213032581453634, "grad_norm": 0.18008528649806976, "learning_rate": 0.00042065831399363884, "loss": 0.4365, "step": 650 }, { "epoch": 0.5253132832080201, "grad_norm": 0.16481877863407135, "learning_rate": 0.0004195033694008632, "loss": 0.4417, "step": 655 }, { "epoch": 0.5293233082706766, "grad_norm": 0.16098377108573914, "learning_rate": 0.00041834169007209487, "loss": 0.4459, "step": 660 }, { "epoch": 0.5333333333333333, "grad_norm": 0.15441514551639557, "learning_rate": 0.0004171733221633695, "loss": 0.4178, "step": 665 }, { "epoch": 0.53734335839599, "grad_norm": 0.15172994136810303, "learning_rate": 0.00041599831209647475, "loss": 0.4261, "step": 670 }, { "epoch": 0.5413533834586466, "grad_norm": 0.16424188017845154, "learning_rate": 0.0004148167065571055, "loss": 0.4375, "step": 675 }, { "epoch": 0.5453634085213033, "grad_norm": 0.1634334772825241, "learning_rate": 0.0004136285524930091, "loss": 0.4453, "step": 680 }, { "epoch": 0.5493734335839598, "grad_norm": 0.1613541692495346, "learning_rate": 0.00041243389711211997, "loss": 0.4835, "step": 685 }, { "epoch": 0.5533834586466165, "grad_norm": 0.16017889976501465, "learning_rate": 0.00041123278788068375, "loss": 0.4245, "step": 690 }, { "epoch": 0.5573934837092732, "grad_norm": 0.1616198718547821, "learning_rate": 0.0004100252725213718, "loss": 0.4757, "step": 695 }, { "epoch": 0.5614035087719298, "grad_norm": 0.15503592789173126, "learning_rate": 0.00040881139901138467, "loss": 0.438, "step": 700 }, { "epoch": 0.5654135338345865, "grad_norm": 0.15209273993968964, "learning_rate": 0.000407591215580546, "loss": 0.4258, "step": 705 }, { "epoch": 0.569423558897243, "grad_norm": 0.16087529063224792, "learning_rate": 0.0004063647707093864, "loss": 0.4319, "step": 710 }, { "epoch": 0.5734335839598997, "grad_norm": 0.16061115264892578, "learning_rate": 0.0004051321131272167, "loss": 0.4562, "step": 715 }, { "epoch": 0.5774436090225564, "grad_norm": 0.15838468074798584, "learning_rate": 0.0004038932918101927, "loss": 0.4171, "step": 720 }, { "epoch": 0.581453634085213, "grad_norm": 0.16260045766830444, "learning_rate": 0.00040264835597936846, "loss": 0.4197, "step": 725 }, { "epoch": 0.5854636591478697, "grad_norm": 0.20412659645080566, "learning_rate": 0.0004013973550987408, "loss": 0.4399, "step": 730 }, { "epoch": 0.5894736842105263, "grad_norm": 0.16249172389507294, "learning_rate": 0.0004001403388732842, "loss": 0.4141, "step": 735 }, { "epoch": 0.593483709273183, "grad_norm": 0.16174912452697754, "learning_rate": 0.0003988773572469761, "loss": 0.4235, "step": 740 }, { "epoch": 0.5974937343358396, "grad_norm": 0.16679826378822327, "learning_rate": 0.0003976084604008119, "loss": 0.4078, "step": 745 }, { "epoch": 0.6015037593984962, "grad_norm": 0.1594962477684021, "learning_rate": 0.0003963336987508115, "loss": 0.4229, "step": 750 }, { "epoch": 0.6055137844611529, "grad_norm": 0.1605675369501114, "learning_rate": 0.00039505312294601635, "loss": 0.4084, "step": 755 }, { "epoch": 0.6095238095238096, "grad_norm": 0.16422423720359802, "learning_rate": 0.0003937667838664768, "loss": 0.3926, "step": 760 }, { "epoch": 0.6135338345864662, "grad_norm": 0.15119189023971558, "learning_rate": 0.00039247473262123036, "loss": 0.3998, "step": 765 }, { "epoch": 0.6175438596491228, "grad_norm": 0.1493472158908844, "learning_rate": 0.0003911770205462717, "loss": 0.3965, "step": 770 }, { "epoch": 0.6215538847117794, "grad_norm": 0.16699016094207764, "learning_rate": 0.00038987369920251213, "loss": 0.4089, "step": 775 }, { "epoch": 0.6255639097744361, "grad_norm": 0.15896373987197876, "learning_rate": 0.00038856482037373155, "loss": 0.3785, "step": 780 }, { "epoch": 0.6295739348370928, "grad_norm": 0.16796669363975525, "learning_rate": 0.0003872504360645206, "loss": 0.3973, "step": 785 }, { "epoch": 0.6335839598997494, "grad_norm": 0.14833569526672363, "learning_rate": 0.0003859305984982149, "loss": 0.4149, "step": 790 }, { "epoch": 0.637593984962406, "grad_norm": 0.1650949865579605, "learning_rate": 0.0003846053601148194, "loss": 0.4253, "step": 795 }, { "epoch": 0.6416040100250626, "grad_norm": 0.15514247119426727, "learning_rate": 0.00038327477356892546, "loss": 0.3654, "step": 800 }, { "epoch": 0.6456140350877193, "grad_norm": 0.16105593740940094, "learning_rate": 0.0003819388917276185, "loss": 0.4399, "step": 805 }, { "epoch": 0.649624060150376, "grad_norm": 0.15280301868915558, "learning_rate": 0.00038059776766837753, "loss": 0.4166, "step": 810 }, { "epoch": 0.6536340852130326, "grad_norm": 0.16363288462162018, "learning_rate": 0.000379251454676966, "loss": 0.3977, "step": 815 }, { "epoch": 0.6576441102756893, "grad_norm": 0.16015923023223877, "learning_rate": 0.00037790000624531505, "loss": 0.3918, "step": 820 }, { "epoch": 0.6616541353383458, "grad_norm": 0.14514611661434174, "learning_rate": 0.0003765434760693981, "loss": 0.3801, "step": 825 }, { "epoch": 0.6656641604010025, "grad_norm": 0.16084541380405426, "learning_rate": 0.0003751819180470969, "loss": 0.4123, "step": 830 }, { "epoch": 0.6696741854636592, "grad_norm": 0.14666476845741272, "learning_rate": 0.000373815386276061, "loss": 0.3712, "step": 835 }, { "epoch": 0.6736842105263158, "grad_norm": 0.16997689008712769, "learning_rate": 0.0003724439350515571, "loss": 0.3999, "step": 840 }, { "epoch": 0.6776942355889725, "grad_norm": 0.15905480086803436, "learning_rate": 0.000371067618864313, "loss": 0.4068, "step": 845 }, { "epoch": 0.681704260651629, "grad_norm": 0.1562589704990387, "learning_rate": 0.0003696864923983516, "loss": 0.3843, "step": 850 }, { "epoch": 0.6857142857142857, "grad_norm": 0.16755416989326477, "learning_rate": 0.00036830061052881897, "loss": 0.4426, "step": 855 }, { "epoch": 0.6897243107769424, "grad_norm": 0.14882604777812958, "learning_rate": 0.00036691002831980323, "loss": 0.3939, "step": 860 }, { "epoch": 0.693734335839599, "grad_norm": 0.16580046713352203, "learning_rate": 0.0003655148010221476, "loss": 0.3823, "step": 865 }, { "epoch": 0.6977443609022557, "grad_norm": 0.15855063498020172, "learning_rate": 0.00036411498407125435, "loss": 0.3907, "step": 870 }, { "epoch": 0.7017543859649122, "grad_norm": 0.1665424108505249, "learning_rate": 0.00036271063308488294, "loss": 0.4133, "step": 875 }, { "epoch": 0.7057644110275689, "grad_norm": 0.1685834378004074, "learning_rate": 0.0003613018038609398, "loss": 0.4359, "step": 880 }, { "epoch": 0.7097744360902256, "grad_norm": 0.15352065861225128, "learning_rate": 0.0003598885523752611, "loss": 0.4037, "step": 885 }, { "epoch": 0.7137844611528822, "grad_norm": 0.17349396646022797, "learning_rate": 0.00035847093477938953, "loss": 0.411, "step": 890 }, { "epoch": 0.7177944862155389, "grad_norm": 0.15559545159339905, "learning_rate": 0.0003570490073983425, "loss": 0.3817, "step": 895 }, { "epoch": 0.7218045112781954, "grad_norm": 0.16843955218791962, "learning_rate": 0.00035562282672837477, "loss": 0.4162, "step": 900 }, { "epoch": 0.7258145363408521, "grad_norm": 0.17087852954864502, "learning_rate": 0.00035419244943473305, "loss": 0.412, "step": 905 }, { "epoch": 0.7298245614035088, "grad_norm": 0.16542008519172668, "learning_rate": 0.00035275793234940545, "loss": 0.421, "step": 910 }, { "epoch": 0.7338345864661654, "grad_norm": 0.17914004623889923, "learning_rate": 0.0003513193324688627, "loss": 0.3759, "step": 915 }, { "epoch": 0.7378446115288221, "grad_norm": 0.16926446557044983, "learning_rate": 0.00034987670695179403, "loss": 0.3988, "step": 920 }, { "epoch": 0.7418546365914787, "grad_norm": 0.15922464430332184, "learning_rate": 0.00034843011311683566, "loss": 0.3641, "step": 925 }, { "epoch": 0.7458646616541353, "grad_norm": 0.16068434715270996, "learning_rate": 0.00034697960844029383, "loss": 0.3746, "step": 930 }, { "epoch": 0.749874686716792, "grad_norm": 0.18251362442970276, "learning_rate": 0.0003455252505538611, "loss": 0.4308, "step": 935 }, { "epoch": 0.7538847117794486, "grad_norm": 0.1559171974658966, "learning_rate": 0.0003440670972423262, "loss": 0.4045, "step": 940 }, { "epoch": 0.7578947368421053, "grad_norm": 0.17260219156742096, "learning_rate": 0.00034260520644127846, "loss": 0.3642, "step": 945 }, { "epoch": 0.7619047619047619, "grad_norm": 0.1719062477350235, "learning_rate": 0.0003411396362348056, "loss": 0.3973, "step": 950 }, { "epoch": 0.7659147869674185, "grad_norm": 0.16649049520492554, "learning_rate": 0.0003396704448531863, "loss": 0.4031, "step": 955 }, { "epoch": 0.7699248120300752, "grad_norm": 0.15968754887580872, "learning_rate": 0.00033819769067057617, "loss": 0.3648, "step": 960 }, { "epoch": 0.7739348370927318, "grad_norm": 0.18287472426891327, "learning_rate": 0.0003367214322026888, "loss": 0.3921, "step": 965 }, { "epoch": 0.7779448621553885, "grad_norm": 0.1621864289045334, "learning_rate": 0.00033524172810447055, "loss": 0.3956, "step": 970 }, { "epoch": 0.7819548872180451, "grad_norm": 0.1772567480802536, "learning_rate": 0.00033375863716777007, "loss": 0.3745, "step": 975 }, { "epoch": 0.7859649122807018, "grad_norm": 0.17318162322044373, "learning_rate": 0.0003322722183190025, "loss": 0.3924, "step": 980 }, { "epoch": 0.7899749373433584, "grad_norm": 0.17136339843273163, "learning_rate": 0.0003307825306168082, "loss": 0.3897, "step": 985 }, { "epoch": 0.793984962406015, "grad_norm": 0.15933868288993835, "learning_rate": 0.00032928963324970595, "loss": 0.3723, "step": 990 }, { "epoch": 0.7979949874686717, "grad_norm": 0.1778361201286316, "learning_rate": 0.0003277935855337417, "loss": 0.3822, "step": 995 }, { "epoch": 0.8020050125313283, "grad_norm": 0.15671303868293762, "learning_rate": 0.00032629444691013144, "loss": 0.3805, "step": 1000 }, { "epoch": 0.806015037593985, "grad_norm": 0.16916193068027496, "learning_rate": 0.0003247922769428998, "loss": 0.3931, "step": 1005 }, { "epoch": 0.8100250626566416, "grad_norm": 0.15111452341079712, "learning_rate": 0.0003232871353165129, "loss": 0.3398, "step": 1010 }, { "epoch": 0.8140350877192982, "grad_norm": 0.14872141182422638, "learning_rate": 0.0003217790818335077, "loss": 0.3541, "step": 1015 }, { "epoch": 0.8180451127819549, "grad_norm": 0.1576554924249649, "learning_rate": 0.00032026817641211524, "loss": 0.3612, "step": 1020 }, { "epoch": 0.8220551378446115, "grad_norm": 0.16731838881969452, "learning_rate": 0.0003187544790838805, "loss": 0.3742, "step": 1025 }, { "epoch": 0.8260651629072682, "grad_norm": 0.17792581021785736, "learning_rate": 0.0003172380499912768, "loss": 0.3835, "step": 1030 }, { "epoch": 0.8300751879699249, "grad_norm": 0.15929646790027618, "learning_rate": 0.0003157189493853164, "loss": 0.3547, "step": 1035 }, { "epoch": 0.8340852130325814, "grad_norm": 0.17030423879623413, "learning_rate": 0.00031419723762315656, "loss": 0.3833, "step": 1040 }, { "epoch": 0.8380952380952381, "grad_norm": 0.16632360219955444, "learning_rate": 0.0003126729751657015, "loss": 0.3851, "step": 1045 }, { "epoch": 0.8421052631578947, "grad_norm": 0.16034843027591705, "learning_rate": 0.0003111462225752, "loss": 0.3457, "step": 1050 }, { "epoch": 0.8461152882205514, "grad_norm": 0.1584872305393219, "learning_rate": 0.0003096170405128393, "loss": 0.3627, "step": 1055 }, { "epoch": 0.8501253132832081, "grad_norm": 0.15513049066066742, "learning_rate": 0.0003080854897363348, "loss": 0.3571, "step": 1060 }, { "epoch": 0.8541353383458646, "grad_norm": 0.18553143739700317, "learning_rate": 0.000306551631097516, "loss": 0.3844, "step": 1065 }, { "epoch": 0.8581453634085213, "grad_norm": 0.17072825133800507, "learning_rate": 0.00030501552553990885, "loss": 0.3968, "step": 1070 }, { "epoch": 0.8621553884711779, "grad_norm": 0.17334377765655518, "learning_rate": 0.00030347723409631413, "loss": 0.3759, "step": 1075 }, { "epoch": 0.8661654135338346, "grad_norm": 0.16847221553325653, "learning_rate": 0.00030193681788638274, "loss": 0.3437, "step": 1080 }, { "epoch": 0.8701754385964913, "grad_norm": 0.17665381729602814, "learning_rate": 0.000300394338114187, "loss": 0.3691, "step": 1085 }, { "epoch": 0.8741854636591478, "grad_norm": 0.17721857130527496, "learning_rate": 0.00029884985606578907, "loss": 0.3776, "step": 1090 }, { "epoch": 0.8781954887218045, "grad_norm": 0.14506922662258148, "learning_rate": 0.0002973034331068061, "loss": 0.3329, "step": 1095 }, { "epoch": 0.8822055137844611, "grad_norm": 0.14164331555366516, "learning_rate": 0.0002957551306799715, "loss": 0.3296, "step": 1100 }, { "epoch": 0.8862155388471178, "grad_norm": 0.17468836903572083, "learning_rate": 0.0002942050103026942, "loss": 0.3814, "step": 1105 }, { "epoch": 0.8902255639097745, "grad_norm": 0.1707754135131836, "learning_rate": 0.0002926531335646141, "loss": 0.3288, "step": 1110 }, { "epoch": 0.894235588972431, "grad_norm": 0.1548461616039276, "learning_rate": 0.0002910995621251554, "loss": 0.3837, "step": 1115 }, { "epoch": 0.8982456140350877, "grad_norm": 0.18606321513652802, "learning_rate": 0.000289544357711076, "loss": 0.411, "step": 1120 }, { "epoch": 0.9022556390977443, "grad_norm": 0.1885378062725067, "learning_rate": 0.00028798758211401586, "loss": 0.3965, "step": 1125 }, { "epoch": 0.906265664160401, "grad_norm": 0.15879781544208527, "learning_rate": 0.00028642929718804126, "loss": 0.3217, "step": 1130 }, { "epoch": 0.9102756892230577, "grad_norm": 0.16221250593662262, "learning_rate": 0.00028486956484718735, "loss": 0.3431, "step": 1135 }, { "epoch": 0.9142857142857143, "grad_norm": 0.15938705205917358, "learning_rate": 0.0002833084470629983, "loss": 0.3645, "step": 1140 }, { "epoch": 0.9182957393483709, "grad_norm": 0.18234948813915253, "learning_rate": 0.0002817460058620647, "loss": 0.3696, "step": 1145 }, { "epoch": 0.9223057644110275, "grad_norm": 0.17734579741954803, "learning_rate": 0.0002801823033235598, "loss": 0.3939, "step": 1150 }, { "epoch": 0.9263157894736842, "grad_norm": 0.17581012845039368, "learning_rate": 0.00027861740157677204, "loss": 0.3857, "step": 1155 }, { "epoch": 0.9303258145363409, "grad_norm": 0.16594088077545166, "learning_rate": 0.00027705136279863754, "loss": 0.3784, "step": 1160 }, { "epoch": 0.9343358395989975, "grad_norm": 0.17413030564785004, "learning_rate": 0.0002754842492112685, "loss": 0.3791, "step": 1165 }, { "epoch": 0.9383458646616541, "grad_norm": 0.16821810603141785, "learning_rate": 0.0002739161230794822, "loss": 0.3896, "step": 1170 }, { "epoch": 0.9423558897243107, "grad_norm": 0.18337774276733398, "learning_rate": 0.000272347046708326, "loss": 0.3745, "step": 1175 }, { "epoch": 0.9463659147869674, "grad_norm": 0.15810732543468475, "learning_rate": 0.00027077708244060283, "loss": 0.3576, "step": 1180 }, { "epoch": 0.9503759398496241, "grad_norm": 0.16330723464488983, "learning_rate": 0.00026920629265439326, "loss": 0.3641, "step": 1185 }, { "epoch": 0.9543859649122807, "grad_norm": 0.168942391872406, "learning_rate": 0.0002676347397605777, "loss": 0.36, "step": 1190 }, { "epoch": 0.9583959899749374, "grad_norm": 0.1779577136039734, "learning_rate": 0.0002660624862003566, "loss": 0.3644, "step": 1195 }, { "epoch": 0.9624060150375939, "grad_norm": 0.175537571310997, "learning_rate": 0.00026448959444276896, "loss": 0.3766, "step": 1200 }, { "epoch": 0.9664160401002506, "grad_norm": 0.17511993646621704, "learning_rate": 0.0002629161269822113, "loss": 0.3636, "step": 1205 }, { "epoch": 0.9704260651629073, "grad_norm": 0.17956364154815674, "learning_rate": 0.00026134214633595347, "loss": 0.3916, "step": 1210 }, { "epoch": 0.9744360902255639, "grad_norm": 0.15933747589588165, "learning_rate": 0.000259767715041656, "loss": 0.3316, "step": 1215 }, { "epoch": 0.9784461152882206, "grad_norm": 0.18015708029270172, "learning_rate": 0.000258192895654884, "loss": 0.3791, "step": 1220 }, { "epoch": 0.9824561403508771, "grad_norm": 0.15591025352478027, "learning_rate": 0.00025661775074662275, "loss": 0.3222, "step": 1225 }, { "epoch": 0.9864661654135338, "grad_norm": 0.1800549179315567, "learning_rate": 0.0002550423429007909, "loss": 0.3282, "step": 1230 }, { "epoch": 0.9904761904761905, "grad_norm": 0.1630546748638153, "learning_rate": 0.0002534667347117544, "loss": 0.3659, "step": 1235 }, { "epoch": 0.9944862155388471, "grad_norm": 0.169642373919487, "learning_rate": 0.0002518909887818393, "loss": 0.3409, "step": 1240 }, { "epoch": 0.9984962406015038, "grad_norm": 0.1650785356760025, "learning_rate": 0.00025031516771884416, "loss": 0.3632, "step": 1245 }, { "epoch": 0.9992982456140351, "eval_loss": 0.2687419354915619, "eval_runtime": 8.3871, "eval_samples_per_second": 6.558, "eval_steps_per_second": 0.835, "step": 1246 }, { "epoch": 1.0025062656641603, "grad_norm": 0.1833638846874237, "learning_rate": 0.000248739334133553, "loss": 0.3377, "step": 1250 }, { "epoch": 1.006516290726817, "grad_norm": 0.16308297216892242, "learning_rate": 0.00024716355063724707, "loss": 0.2999, "step": 1255 }, { "epoch": 1.0105263157894737, "grad_norm": 0.16679681837558746, "learning_rate": 0.00024558787983921783, "loss": 0.2986, "step": 1260 }, { "epoch": 1.0145363408521304, "grad_norm": 0.17338265478610992, "learning_rate": 0.0002440123843442788, "loss": 0.2845, "step": 1265 }, { "epoch": 1.0185463659147869, "grad_norm": 0.16227693855762482, "learning_rate": 0.00024243712675027827, "loss": 0.3051, "step": 1270 }, { "epoch": 1.0225563909774436, "grad_norm": 0.15637050569057465, "learning_rate": 0.00024086216964561236, "loss": 0.2896, "step": 1275 }, { "epoch": 1.0265664160401002, "grad_norm": 0.1684030145406723, "learning_rate": 0.00023928757560673808, "loss": 0.272, "step": 1280 }, { "epoch": 1.030576441102757, "grad_norm": 0.1560041755437851, "learning_rate": 0.00023771340719568688, "loss": 0.2795, "step": 1285 }, { "epoch": 1.0345864661654136, "grad_norm": 0.15775898098945618, "learning_rate": 0.00023613972695757919, "loss": 0.2721, "step": 1290 }, { "epoch": 1.03859649122807, "grad_norm": 0.17277134954929352, "learning_rate": 0.00023456659741813944, "loss": 0.2726, "step": 1295 }, { "epoch": 1.0426065162907268, "grad_norm": 0.15520408749580383, "learning_rate": 0.00023299408108121134, "loss": 0.2681, "step": 1300 }, { "epoch": 1.0466165413533834, "grad_norm": 0.16784396767616272, "learning_rate": 0.00023142224042627495, "loss": 0.2817, "step": 1305 }, { "epoch": 1.0506265664160401, "grad_norm": 0.16616085171699524, "learning_rate": 0.00022985113790596393, "loss": 0.2863, "step": 1310 }, { "epoch": 1.0546365914786968, "grad_norm": 0.15373535454273224, "learning_rate": 0.00022828083594358416, "loss": 0.2837, "step": 1315 }, { "epoch": 1.0586466165413533, "grad_norm": 0.15861855447292328, "learning_rate": 0.00022671139693063386, "loss": 0.2878, "step": 1320 }, { "epoch": 1.06265664160401, "grad_norm": 0.15624883770942688, "learning_rate": 0.00022514288322432414, "loss": 0.2952, "step": 1325 }, { "epoch": 1.0666666666666667, "grad_norm": 0.17166998982429504, "learning_rate": 0.0002235753571451018, "loss": 0.2957, "step": 1330 }, { "epoch": 1.0706766917293233, "grad_norm": 0.15910623967647552, "learning_rate": 0.00022200888097417305, "loss": 0.2954, "step": 1335 }, { "epoch": 1.07468671679198, "grad_norm": 0.154972106218338, "learning_rate": 0.00022044351695102907, "loss": 0.2624, "step": 1340 }, { "epoch": 1.0786967418546367, "grad_norm": 0.16736608743667603, "learning_rate": 0.00021887932727097278, "loss": 0.2677, "step": 1345 }, { "epoch": 1.0827067669172932, "grad_norm": 0.17113475501537323, "learning_rate": 0.00021731637408264822, "loss": 0.2613, "step": 1350 }, { "epoch": 1.0867167919799499, "grad_norm": 0.17451037466526031, "learning_rate": 0.00021575471948557062, "loss": 0.291, "step": 1355 }, { "epoch": 1.0907268170426065, "grad_norm": 0.15798042714595795, "learning_rate": 0.00021419442552765948, "loss": 0.2628, "step": 1360 }, { "epoch": 1.0947368421052632, "grad_norm": 0.17537885904312134, "learning_rate": 0.00021263555420277335, "loss": 0.2764, "step": 1365 }, { "epoch": 1.09874686716792, "grad_norm": 0.14623847603797913, "learning_rate": 0.00021107816744824617, "loss": 0.29, "step": 1370 }, { "epoch": 1.1027568922305764, "grad_norm": 0.17818230390548706, "learning_rate": 0.00020952232714242685, "loss": 0.2866, "step": 1375 }, { "epoch": 1.106766917293233, "grad_norm": 0.16421560943126678, "learning_rate": 0.00020796809510222048, "loss": 0.2718, "step": 1380 }, { "epoch": 1.1107769423558898, "grad_norm": 0.16824981570243835, "learning_rate": 0.00020641553308063247, "loss": 0.2976, "step": 1385 }, { "epoch": 1.1147869674185464, "grad_norm": 0.15799076855182648, "learning_rate": 0.00020486470276431446, "loss": 0.2934, "step": 1390 }, { "epoch": 1.1187969924812031, "grad_norm": 0.18710538744926453, "learning_rate": 0.000203315665771114, "loss": 0.2792, "step": 1395 }, { "epoch": 1.1228070175438596, "grad_norm": 0.18664434552192688, "learning_rate": 0.00020176848364762578, "loss": 0.3125, "step": 1400 }, { "epoch": 1.1268170426065163, "grad_norm": 0.14843712747097015, "learning_rate": 0.00020022321786674664, "loss": 0.2636, "step": 1405 }, { "epoch": 1.130827067669173, "grad_norm": 0.18187515437602997, "learning_rate": 0.000198679929825233, "loss": 0.3186, "step": 1410 }, { "epoch": 1.1348370927318296, "grad_norm": 0.15874765813350677, "learning_rate": 0.0001971386808412612, "loss": 0.2578, "step": 1415 }, { "epoch": 1.1388471177944863, "grad_norm": 0.17650403082370758, "learning_rate": 0.00019559953215199168, "loss": 0.3012, "step": 1420 }, { "epoch": 1.1428571428571428, "grad_norm": 0.17228610813617706, "learning_rate": 0.0001940625449111354, "loss": 0.2666, "step": 1425 }, { "epoch": 1.1468671679197995, "grad_norm": 0.15302354097366333, "learning_rate": 0.00019252778018652447, "loss": 0.272, "step": 1430 }, { "epoch": 1.1508771929824562, "grad_norm": 0.17536306381225586, "learning_rate": 0.0001909952989576855, "loss": 0.3183, "step": 1435 }, { "epoch": 1.1548872180451129, "grad_norm": 0.15911608934402466, "learning_rate": 0.00018946516211341695, "loss": 0.3022, "step": 1440 }, { "epoch": 1.1588972431077695, "grad_norm": 0.15123997628688812, "learning_rate": 0.00018793743044936972, "loss": 0.289, "step": 1445 }, { "epoch": 1.162907268170426, "grad_norm": 0.16768620908260345, "learning_rate": 0.00018641216466563183, "loss": 0.2694, "step": 1450 }, { "epoch": 1.1669172932330827, "grad_norm": 0.16061028838157654, "learning_rate": 0.00018488942536431618, "loss": 0.2824, "step": 1455 }, { "epoch": 1.1709273182957394, "grad_norm": 0.15412503480911255, "learning_rate": 0.00018336927304715356, "loss": 0.3025, "step": 1460 }, { "epoch": 1.174937343358396, "grad_norm": 0.17830166220664978, "learning_rate": 0.0001818517681130879, "loss": 0.3046, "step": 1465 }, { "epoch": 1.1789473684210527, "grad_norm": 0.17978624999523163, "learning_rate": 0.00018033697085587696, "loss": 0.2966, "step": 1470 }, { "epoch": 1.1829573934837092, "grad_norm": 0.1743180751800537, "learning_rate": 0.00017882494146169678, "loss": 0.3072, "step": 1475 }, { "epoch": 1.186967418546366, "grad_norm": 0.16470114886760712, "learning_rate": 0.00017731574000674996, "loss": 0.2797, "step": 1480 }, { "epoch": 1.1909774436090226, "grad_norm": 0.18040384352207184, "learning_rate": 0.0001758094264548792, "loss": 0.3093, "step": 1485 }, { "epoch": 1.1949874686716793, "grad_norm": 0.17426797747612, "learning_rate": 0.00017430606065518435, "loss": 0.318, "step": 1490 }, { "epoch": 1.198997493734336, "grad_norm": 0.1632205694913864, "learning_rate": 0.00017280570233964485, "loss": 0.2691, "step": 1495 }, { "epoch": 1.2030075187969924, "grad_norm": 0.1813742071390152, "learning_rate": 0.00017130841112074602, "loss": 0.2769, "step": 1500 }, { "epoch": 1.207017543859649, "grad_norm": 0.17400313913822174, "learning_rate": 0.00016981424648911112, "loss": 0.2723, "step": 1505 }, { "epoch": 1.2110275689223058, "grad_norm": 0.15530717372894287, "learning_rate": 0.0001683232678111371, "loss": 0.2971, "step": 1510 }, { "epoch": 1.2150375939849625, "grad_norm": 0.17954862117767334, "learning_rate": 0.00016683553432663616, "loss": 0.2866, "step": 1515 }, { "epoch": 1.2190476190476192, "grad_norm": 0.16163484752178192, "learning_rate": 0.0001653511051464819, "loss": 0.2557, "step": 1520 }, { "epoch": 1.2230576441102756, "grad_norm": 0.16374574601650238, "learning_rate": 0.00016387003925026077, "loss": 0.2956, "step": 1525 }, { "epoch": 1.2270676691729323, "grad_norm": 0.17792537808418274, "learning_rate": 0.0001623923954839287, "loss": 0.2777, "step": 1530 }, { "epoch": 1.231077694235589, "grad_norm": 0.18563567101955414, "learning_rate": 0.00016091823255747292, "loss": 0.301, "step": 1535 }, { "epoch": 1.2350877192982457, "grad_norm": 0.15728415548801422, "learning_rate": 0.00015944760904257942, "loss": 0.2784, "step": 1540 }, { "epoch": 1.2390977443609024, "grad_norm": 0.15651676058769226, "learning_rate": 0.00015798058337030551, "loss": 0.2876, "step": 1545 }, { "epoch": 1.2431077694235588, "grad_norm": 0.1799081265926361, "learning_rate": 0.00015651721382875874, "loss": 0.2919, "step": 1550 }, { "epoch": 1.2471177944862155, "grad_norm": 0.18436437845230103, "learning_rate": 0.00015505755856078048, "loss": 0.2934, "step": 1555 }, { "epoch": 1.2511278195488722, "grad_norm": 0.16269451379776, "learning_rate": 0.00015360167556163583, "loss": 0.2719, "step": 1560 }, { "epoch": 1.2551378446115289, "grad_norm": 0.15247991681098938, "learning_rate": 0.0001521496226767098, "loss": 0.2678, "step": 1565 }, { "epoch": 1.2591478696741856, "grad_norm": 0.1753372848033905, "learning_rate": 0.0001507014575992085, "loss": 0.3065, "step": 1570 }, { "epoch": 1.263157894736842, "grad_norm": 0.16919946670532227, "learning_rate": 0.0001492572378678669, "loss": 0.2659, "step": 1575 }, { "epoch": 1.2671679197994987, "grad_norm": 0.17600032687187195, "learning_rate": 0.00014781702086466314, "loss": 0.2987, "step": 1580 }, { "epoch": 1.2711779448621554, "grad_norm": 0.15909601747989655, "learning_rate": 0.000146380863812538, "loss": 0.2811, "step": 1585 }, { "epoch": 1.275187969924812, "grad_norm": 0.165226012468338, "learning_rate": 0.00014494882377312168, "loss": 0.2682, "step": 1590 }, { "epoch": 1.2791979949874688, "grad_norm": 0.17594917118549347, "learning_rate": 0.00014352095764446675, "loss": 0.2602, "step": 1595 }, { "epoch": 1.2832080200501252, "grad_norm": 0.154309943318367, "learning_rate": 0.0001420973221587869, "loss": 0.2579, "step": 1600 }, { "epoch": 1.287218045112782, "grad_norm": 0.16517868638038635, "learning_rate": 0.00014067797388020353, "loss": 0.2929, "step": 1605 }, { "epoch": 1.2912280701754386, "grad_norm": 0.1717553734779358, "learning_rate": 0.00013926296920249796, "loss": 0.2849, "step": 1610 }, { "epoch": 1.2952380952380953, "grad_norm": 0.16944153606891632, "learning_rate": 0.0001378523643468708, "loss": 0.2527, "step": 1615 }, { "epoch": 1.299248120300752, "grad_norm": 0.16444195806980133, "learning_rate": 0.00013644621535970804, "loss": 0.2665, "step": 1620 }, { "epoch": 1.3032581453634084, "grad_norm": 0.17226529121398926, "learning_rate": 0.0001350445781103547, "loss": 0.2758, "step": 1625 }, { "epoch": 1.3072681704260651, "grad_norm": 0.18174055218696594, "learning_rate": 0.0001336475082888943, "loss": 0.2831, "step": 1630 }, { "epoch": 1.3112781954887218, "grad_norm": 0.16900789737701416, "learning_rate": 0.00013225506140393696, "loss": 0.2882, "step": 1635 }, { "epoch": 1.3152882205513785, "grad_norm": 0.16686789691448212, "learning_rate": 0.00013086729278041304, "loss": 0.2556, "step": 1640 }, { "epoch": 1.3192982456140352, "grad_norm": 0.16731151938438416, "learning_rate": 0.00012948425755737592, "loss": 0.2665, "step": 1645 }, { "epoch": 1.3233082706766917, "grad_norm": 0.17180107533931732, "learning_rate": 0.00012810601068581056, "loss": 0.3001, "step": 1650 }, { "epoch": 1.3273182957393483, "grad_norm": 0.1701911836862564, "learning_rate": 0.0001267326069264501, "loss": 0.2977, "step": 1655 }, { "epoch": 1.331328320802005, "grad_norm": 0.19208161532878876, "learning_rate": 0.0001253641008476007, "loss": 0.2752, "step": 1660 }, { "epoch": 1.3353383458646617, "grad_norm": 0.15200825035572052, "learning_rate": 0.00012400054682297298, "loss": 0.2875, "step": 1665 }, { "epoch": 1.3393483709273184, "grad_norm": 0.17044006288051605, "learning_rate": 0.00012264199902952154, "loss": 0.2501, "step": 1670 }, { "epoch": 1.3433583959899749, "grad_norm": 0.16576853394508362, "learning_rate": 0.00012128851144529257, "loss": 0.2758, "step": 1675 }, { "epoch": 1.3473684210526315, "grad_norm": 0.17262569069862366, "learning_rate": 0.00011994013784727947, "loss": 0.2661, "step": 1680 }, { "epoch": 1.3513784461152882, "grad_norm": 0.161615252494812, "learning_rate": 0.00011859693180928574, "loss": 0.2689, "step": 1685 }, { "epoch": 1.355388471177945, "grad_norm": 0.16010525822639465, "learning_rate": 0.00011725894669979639, "loss": 0.2623, "step": 1690 }, { "epoch": 1.3593984962406016, "grad_norm": 0.1819092184305191, "learning_rate": 0.00011592623567985783, "loss": 0.2667, "step": 1695 }, { "epoch": 1.363408521303258, "grad_norm": 0.16817572712898254, "learning_rate": 0.00011459885170096537, "loss": 0.2756, "step": 1700 }, { "epoch": 1.3674185463659148, "grad_norm": 0.17827191948890686, "learning_rate": 0.00011327684750295958, "loss": 0.3166, "step": 1705 }, { "epoch": 1.3714285714285714, "grad_norm": 0.15747743844985962, "learning_rate": 0.00011196027561193045, "loss": 0.2952, "step": 1710 }, { "epoch": 1.3754385964912281, "grad_norm": 0.17462944984436035, "learning_rate": 0.00011064918833813073, "loss": 0.2754, "step": 1715 }, { "epoch": 1.3794486215538848, "grad_norm": 0.16550135612487793, "learning_rate": 0.00010934363777389752, "loss": 0.2658, "step": 1720 }, { "epoch": 1.3834586466165413, "grad_norm": 0.19006794691085815, "learning_rate": 0.00010804367579158256, "loss": 0.2679, "step": 1725 }, { "epoch": 1.387468671679198, "grad_norm": 0.2023632973432541, "learning_rate": 0.00010674935404149075, "loss": 0.2739, "step": 1730 }, { "epoch": 1.3914786967418546, "grad_norm": 0.17994622886180878, "learning_rate": 0.00010546072394982872, "loss": 0.2585, "step": 1735 }, { "epoch": 1.3954887218045113, "grad_norm": 0.1834658533334732, "learning_rate": 0.00010417783671666114, "loss": 0.2567, "step": 1740 }, { "epoch": 1.399498746867168, "grad_norm": 0.16949954628944397, "learning_rate": 0.00010290074331387617, "loss": 0.2462, "step": 1745 }, { "epoch": 1.4035087719298245, "grad_norm": 0.17096513509750366, "learning_rate": 0.00010162949448316089, "loss": 0.2818, "step": 1750 }, { "epoch": 1.4075187969924812, "grad_norm": 0.16146983206272125, "learning_rate": 0.00010036414073398479, "loss": 0.2691, "step": 1755 }, { "epoch": 1.4115288220551379, "grad_norm": 0.15868982672691345, "learning_rate": 9.910473234159286e-05, "loss": 0.2857, "step": 1760 }, { "epoch": 1.4155388471177945, "grad_norm": 0.1619461625814438, "learning_rate": 9.785131934500818e-05, "loss": 0.2807, "step": 1765 }, { "epoch": 1.4195488721804512, "grad_norm": 0.1557190865278244, "learning_rate": 9.660395154504401e-05, "loss": 0.2548, "step": 1770 }, { "epoch": 1.4235588972431077, "grad_norm": 0.17061957716941833, "learning_rate": 9.536267850232472e-05, "loss": 0.2801, "step": 1775 }, { "epoch": 1.4275689223057644, "grad_norm": 0.17641028761863708, "learning_rate": 9.412754953531663e-05, "loss": 0.2924, "step": 1780 }, { "epoch": 1.431578947368421, "grad_norm": 0.15920333564281464, "learning_rate": 9.289861371836885e-05, "loss": 0.2563, "step": 1785 }, { "epoch": 1.4355889724310777, "grad_norm": 0.15198518335819244, "learning_rate": 9.167591987976312e-05, "loss": 0.2357, "step": 1790 }, { "epoch": 1.4395989974937344, "grad_norm": 0.1758808046579361, "learning_rate": 9.045951659977397e-05, "loss": 0.2535, "step": 1795 }, { "epoch": 1.443609022556391, "grad_norm": 0.18127872049808502, "learning_rate": 8.924945220873823e-05, "loss": 0.2888, "step": 1800 }, { "epoch": 1.4476190476190476, "grad_norm": 0.1601109355688095, "learning_rate": 8.804577478513492e-05, "loss": 0.2528, "step": 1805 }, { "epoch": 1.4516290726817043, "grad_norm": 0.16406309604644775, "learning_rate": 8.684853215367522e-05, "loss": 0.2766, "step": 1810 }, { "epoch": 1.455639097744361, "grad_norm": 0.158805251121521, "learning_rate": 8.565777188340207e-05, "loss": 0.2747, "step": 1815 }, { "epoch": 1.4596491228070176, "grad_norm": 0.17480885982513428, "learning_rate": 8.44735412857999e-05, "loss": 0.3079, "step": 1820 }, { "epoch": 1.463659147869674, "grad_norm": 0.1617252379655838, "learning_rate": 8.329588741291535e-05, "loss": 0.2701, "step": 1825 }, { "epoch": 1.4676691729323308, "grad_norm": 0.1652277261018753, "learning_rate": 8.212485705548756e-05, "loss": 0.25, "step": 1830 }, { "epoch": 1.4716791979949875, "grad_norm": 0.16213206946849823, "learning_rate": 8.096049674108877e-05, "loss": 0.2683, "step": 1835 }, { "epoch": 1.4756892230576442, "grad_norm": 0.167341947555542, "learning_rate": 7.980285273227633e-05, "loss": 0.2724, "step": 1840 }, { "epoch": 1.4796992481203008, "grad_norm": 0.1590610146522522, "learning_rate": 7.865197102475388e-05, "loss": 0.2518, "step": 1845 }, { "epoch": 1.4837092731829573, "grad_norm": 0.16797657310962677, "learning_rate": 7.75078973455445e-05, "loss": 0.2651, "step": 1850 }, { "epoch": 1.487719298245614, "grad_norm": 0.17504721879959106, "learning_rate": 7.637067715117327e-05, "loss": 0.3017, "step": 1855 }, { "epoch": 1.4917293233082707, "grad_norm": 0.16186365485191345, "learning_rate": 7.52403556258617e-05, "loss": 0.2344, "step": 1860 }, { "epoch": 1.4957393483709274, "grad_norm": 0.17528776824474335, "learning_rate": 7.41169776797322e-05, "loss": 0.264, "step": 1865 }, { "epoch": 1.4989473684210526, "eval_loss": 0.2417936623096466, "eval_runtime": 8.3922, "eval_samples_per_second": 6.554, "eval_steps_per_second": 0.834, "step": 1869 }, { "epoch": 1.499749373433584, "grad_norm": 0.1840500384569168, "learning_rate": 7.300058794702352e-05, "loss": 0.27, "step": 1870 }, { "epoch": 1.5037593984962405, "grad_norm": 0.1910168081521988, "learning_rate": 7.189123078431784e-05, "loss": 0.2823, "step": 1875 }, { "epoch": 1.5077694235588972, "grad_norm": 0.16170190274715424, "learning_rate": 7.078895026877804e-05, "loss": 0.253, "step": 1880 }, { "epoch": 1.511779448621554, "grad_norm": 0.17806147038936615, "learning_rate": 6.969379019639635e-05, "loss": 0.2836, "step": 1885 }, { "epoch": 1.5157894736842106, "grad_norm": 0.15123490989208221, "learning_rate": 6.860579408025436e-05, "loss": 0.2495, "step": 1890 }, { "epoch": 1.5197994987468673, "grad_norm": 0.16695557534694672, "learning_rate": 6.752500514879437e-05, "loss": 0.2773, "step": 1895 }, { "epoch": 1.5238095238095237, "grad_norm": 0.16708271205425262, "learning_rate": 6.645146634410151e-05, "loss": 0.252, "step": 1900 }, { "epoch": 1.5278195488721804, "grad_norm": 0.1510598361492157, "learning_rate": 6.538522032019759e-05, "loss": 0.2373, "step": 1905 }, { "epoch": 1.531829573934837, "grad_norm": 0.1686723679304123, "learning_rate": 6.432630944134654e-05, "loss": 0.2743, "step": 1910 }, { "epoch": 1.5358395989974938, "grad_norm": 0.16884025931358337, "learning_rate": 6.327477578037106e-05, "loss": 0.2673, "step": 1915 }, { "epoch": 1.5398496240601505, "grad_norm": 0.16477200388908386, "learning_rate": 6.223066111698111e-05, "loss": 0.2713, "step": 1920 }, { "epoch": 1.543859649122807, "grad_norm": 0.18342123925685883, "learning_rate": 6.119400693611357e-05, "loss": 0.2791, "step": 1925 }, { "epoch": 1.5478696741854636, "grad_norm": 0.16829492151737213, "learning_rate": 6.0164854426284447e-05, "loss": 0.2702, "step": 1930 }, { "epoch": 1.5518796992481203, "grad_norm": 0.16608929634094238, "learning_rate": 5.914324447795186e-05, "loss": 0.2741, "step": 1935 }, { "epoch": 1.555889724310777, "grad_norm": 0.16408754885196686, "learning_rate": 5.8129217681891886e-05, "loss": 0.2661, "step": 1940 }, { "epoch": 1.5598997493734337, "grad_norm": 0.1709819734096527, "learning_rate": 5.7122814327585316e-05, "loss": 0.2557, "step": 1945 }, { "epoch": 1.5639097744360901, "grad_norm": 0.187583327293396, "learning_rate": 5.612407440161721e-05, "loss": 0.2623, "step": 1950 }, { "epoch": 1.5679197994987468, "grad_norm": 0.16979658603668213, "learning_rate": 5.513303758608804e-05, "loss": 0.2615, "step": 1955 }, { "epoch": 1.5719298245614035, "grad_norm": 0.13367925584316254, "learning_rate": 5.414974325703686e-05, "loss": 0.2276, "step": 1960 }, { "epoch": 1.5759398496240602, "grad_norm": 0.17686474323272705, "learning_rate": 5.317423048287717e-05, "loss": 0.2444, "step": 1965 }, { "epoch": 1.5799498746867169, "grad_norm": 0.17005467414855957, "learning_rate": 5.220653802284439e-05, "loss": 0.2581, "step": 1970 }, { "epoch": 1.5839598997493733, "grad_norm": 0.1664769947528839, "learning_rate": 5.124670432545578e-05, "loss": 0.2628, "step": 1975 }, { "epoch": 1.58796992481203, "grad_norm": 0.17945070564746857, "learning_rate": 5.0294767526983006e-05, "loss": 0.276, "step": 1980 }, { "epoch": 1.5919799498746867, "grad_norm": 0.1758703887462616, "learning_rate": 4.9350765449936915e-05, "loss": 0.2744, "step": 1985 }, { "epoch": 1.5959899749373434, "grad_norm": 0.1651277393102646, "learning_rate": 4.84147356015647e-05, "loss": 0.2872, "step": 1990 }, { "epoch": 1.6, "grad_norm": 0.16701501607894897, "learning_rate": 4.748671517235948e-05, "loss": 0.2676, "step": 1995 }, { "epoch": 1.6040100250626566, "grad_norm": 0.16929472982883453, "learning_rate": 4.656674103458291e-05, "loss": 0.2498, "step": 2000 }, { "epoch": 1.6080200501253132, "grad_norm": 0.15207117795944214, "learning_rate": 4.5654849740800145e-05, "loss": 0.26, "step": 2005 }, { "epoch": 1.61203007518797, "grad_norm": 0.15770703554153442, "learning_rate": 4.47510775224273e-05, "loss": 0.2644, "step": 2010 }, { "epoch": 1.6160401002506266, "grad_norm": 0.16916294395923615, "learning_rate": 4.38554602882921e-05, "loss": 0.3026, "step": 2015 }, { "epoch": 1.6200501253132833, "grad_norm": 0.16087712347507477, "learning_rate": 4.2968033623206974e-05, "loss": 0.2632, "step": 2020 }, { "epoch": 1.6240601503759398, "grad_norm": 0.18064051866531372, "learning_rate": 4.2088832786555456e-05, "loss": 0.2654, "step": 2025 }, { "epoch": 1.6280701754385964, "grad_norm": 0.17459240555763245, "learning_rate": 4.121789271089113e-05, "loss": 0.276, "step": 2030 }, { "epoch": 1.6320802005012531, "grad_norm": 0.15657363831996918, "learning_rate": 4.0355248000549434e-05, "loss": 0.241, "step": 2035 }, { "epoch": 1.6360902255639098, "grad_norm": 0.1620418131351471, "learning_rate": 3.950093293027318e-05, "loss": 0.2821, "step": 2040 }, { "epoch": 1.6401002506265665, "grad_norm": 0.18404340744018555, "learning_rate": 3.865498144385049e-05, "loss": 0.2935, "step": 2045 }, { "epoch": 1.644110275689223, "grad_norm": 0.16540312767028809, "learning_rate": 3.7817427152766094e-05, "loss": 0.2583, "step": 2050 }, { "epoch": 1.6481203007518797, "grad_norm": 0.16649490594863892, "learning_rate": 3.6988303334866054e-05, "loss": 0.2774, "step": 2055 }, { "epoch": 1.6521303258145363, "grad_norm": 0.1626511514186859, "learning_rate": 3.6167642933035486e-05, "loss": 0.2467, "step": 2060 }, { "epoch": 1.656140350877193, "grad_norm": 0.16354456543922424, "learning_rate": 3.535547855388963e-05, "loss": 0.2544, "step": 2065 }, { "epoch": 1.6601503759398497, "grad_norm": 0.16961799561977386, "learning_rate": 3.455184246647822e-05, "loss": 0.2389, "step": 2070 }, { "epoch": 1.6641604010025062, "grad_norm": 0.17093317210674286, "learning_rate": 3.375676660100366e-05, "loss": 0.2767, "step": 2075 }, { "epoch": 1.6681704260651629, "grad_norm": 0.1904112696647644, "learning_rate": 3.297028254755221e-05, "loss": 0.29, "step": 2080 }, { "epoch": 1.6721804511278195, "grad_norm": 0.18005013465881348, "learning_rate": 3.219242155483868e-05, "loss": 0.249, "step": 2085 }, { "epoch": 1.6761904761904762, "grad_norm": 0.1755589246749878, "learning_rate": 3.142321452896504e-05, "loss": 0.2956, "step": 2090 }, { "epoch": 1.680200501253133, "grad_norm": 0.15832673013210297, "learning_rate": 3.0662692032192516e-05, "loss": 0.2561, "step": 2095 }, { "epoch": 1.6842105263157894, "grad_norm": 0.15485481917858124, "learning_rate": 2.9910884281727225e-05, "loss": 0.2502, "step": 2100 }, { "epoch": 1.688220551378446, "grad_norm": 0.16627748310565948, "learning_rate": 2.916782114851918e-05, "loss": 0.2939, "step": 2105 }, { "epoch": 1.6922305764411028, "grad_norm": 0.17444783449172974, "learning_rate": 2.843353215607619e-05, "loss": 0.2731, "step": 2110 }, { "epoch": 1.6962406015037594, "grad_norm": 0.17445623874664307, "learning_rate": 2.7708046479290316e-05, "loss": 0.2752, "step": 2115 }, { "epoch": 1.7002506265664161, "grad_norm": 0.17704740166664124, "learning_rate": 2.6991392943278874e-05, "loss": 0.2423, "step": 2120 }, { "epoch": 1.7042606516290726, "grad_norm": 0.16555817425251007, "learning_rate": 2.6283600022238925e-05, "loss": 0.2811, "step": 2125 }, { "epoch": 1.7082706766917293, "grad_norm": 0.1633770912885666, "learning_rate": 2.5584695838316323e-05, "loss": 0.2463, "step": 2130 }, { "epoch": 1.712280701754386, "grad_norm": 0.17787782847881317, "learning_rate": 2.4894708160488057e-05, "loss": 0.266, "step": 2135 }, { "epoch": 1.7162907268170426, "grad_norm": 0.15796466171741486, "learning_rate": 2.4213664403458903e-05, "loss": 0.2574, "step": 2140 }, { "epoch": 1.7203007518796993, "grad_norm": 0.1675783395767212, "learning_rate": 2.3541591626572518e-05, "loss": 0.251, "step": 2145 }, { "epoch": 1.7243107769423558, "grad_norm": 0.17418669164180756, "learning_rate": 2.287851653273587e-05, "loss": 0.2552, "step": 2150 }, { "epoch": 1.7283208020050125, "grad_norm": 0.17587542533874512, "learning_rate": 2.222446546735868e-05, "loss": 0.272, "step": 2155 }, { "epoch": 1.7323308270676692, "grad_norm": 0.17269235849380493, "learning_rate": 2.1579464417306266e-05, "loss": 0.2586, "step": 2160 }, { "epoch": 1.7363408521303259, "grad_norm": 0.1612161546945572, "learning_rate": 2.0943539009867423e-05, "loss": 0.2783, "step": 2165 }, { "epoch": 1.7403508771929825, "grad_norm": 0.15821614861488342, "learning_rate": 2.0316714511736002e-05, "loss": 0.2634, "step": 2170 }, { "epoch": 1.744360902255639, "grad_norm": 0.17940925061702728, "learning_rate": 1.9699015828006788e-05, "loss": 0.2632, "step": 2175 }, { "epoch": 1.7483709273182957, "grad_norm": 0.1658666729927063, "learning_rate": 1.909046750118648e-05, "loss": 0.2666, "step": 2180 }, { "epoch": 1.7523809523809524, "grad_norm": 0.16111963987350464, "learning_rate": 1.849109371021815e-05, "loss": 0.2561, "step": 2185 }, { "epoch": 1.756390977443609, "grad_norm": 0.17400680482387543, "learning_rate": 1.7900918269520672e-05, "loss": 0.2499, "step": 2190 }, { "epoch": 1.7604010025062657, "grad_norm": 0.16988424956798553, "learning_rate": 1.73199646280425e-05, "loss": 0.2719, "step": 2195 }, { "epoch": 1.7644110275689222, "grad_norm": 0.16524524986743927, "learning_rate": 1.6748255868330148e-05, "loss": 0.2296, "step": 2200 }, { "epoch": 1.768421052631579, "grad_norm": 0.18039652705192566, "learning_rate": 1.6185814705610925e-05, "loss": 0.2677, "step": 2205 }, { "epoch": 1.7724310776942356, "grad_norm": 0.1674271821975708, "learning_rate": 1.5632663486890407e-05, "loss": 0.2644, "step": 2210 }, { "epoch": 1.7764411027568923, "grad_norm": 0.1731635481119156, "learning_rate": 1.5088824190064521e-05, "loss": 0.2676, "step": 2215 }, { "epoch": 1.780451127819549, "grad_norm": 0.1482301503419876, "learning_rate": 1.455431842304647e-05, "loss": 0.2497, "step": 2220 }, { "epoch": 1.7844611528822054, "grad_norm": 0.16372230648994446, "learning_rate": 1.4029167422908107e-05, "loss": 0.266, "step": 2225 }, { "epoch": 1.788471177944862, "grad_norm": 0.17107130587100983, "learning_rate": 1.3513392055036072e-05, "loss": 0.279, "step": 2230 }, { "epoch": 1.7924812030075188, "grad_norm": 0.15426160395145416, "learning_rate": 1.3007012812302737e-05, "loss": 0.2512, "step": 2235 }, { "epoch": 1.7964912280701755, "grad_norm": 0.16106005012989044, "learning_rate": 1.25100498142523e-05, "loss": 0.2639, "step": 2240 }, { "epoch": 1.8005012531328322, "grad_norm": 0.16801631450653076, "learning_rate": 1.2022522806301005e-05, "loss": 0.2504, "step": 2245 }, { "epoch": 1.8045112781954886, "grad_norm": 0.15076345205307007, "learning_rate": 1.1544451158952807e-05, "loss": 0.2516, "step": 2250 }, { "epoch": 1.8085213032581455, "grad_norm": 0.16535955667495728, "learning_rate": 1.1075853867029694e-05, "loss": 0.2541, "step": 2255 }, { "epoch": 1.812531328320802, "grad_norm": 0.14483940601348877, "learning_rate": 1.0616749548917059e-05, "loss": 0.2392, "step": 2260 }, { "epoch": 1.8165413533834587, "grad_norm": 0.1692323386669159, "learning_rate": 1.0167156445823777e-05, "loss": 0.2535, "step": 2265 }, { "epoch": 1.8205513784461154, "grad_norm": 0.15716001391410828, "learning_rate": 9.727092421057627e-06, "loss": 0.2369, "step": 2270 }, { "epoch": 1.8245614035087718, "grad_norm": 0.14877931773662567, "learning_rate": 9.296574959315463e-06, "loss": 0.2565, "step": 2275 }, { "epoch": 1.8285714285714287, "grad_norm": 0.1587458699941635, "learning_rate": 8.875621165988474e-06, "loss": 0.269, "step": 2280 }, { "epoch": 1.8325814536340852, "grad_norm": 0.17435844242572784, "learning_rate": 8.46424776648258e-06, "loss": 0.2456, "step": 2285 }, { "epoch": 1.8365914786967419, "grad_norm": 0.16262808442115784, "learning_rate": 8.06247110555397e-06, "loss": 0.2582, "step": 2290 }, { "epoch": 1.8406015037593986, "grad_norm": 0.15774986147880554, "learning_rate": 7.670307146659588e-06, "loss": 0.2713, "step": 2295 }, { "epoch": 1.844611528822055, "grad_norm": 0.16883234679698944, "learning_rate": 7.287771471322951e-06, "loss": 0.2732, "step": 2300 }, { "epoch": 1.848621553884712, "grad_norm": 0.16868174076080322, "learning_rate": 6.91487927851489e-06, "loss": 0.2495, "step": 2305 }, { "epoch": 1.8526315789473684, "grad_norm": 0.16309338808059692, "learning_rate": 6.551645384049898e-06, "loss": 0.2684, "step": 2310 }, { "epoch": 1.856641604010025, "grad_norm": 0.1636061668395996, "learning_rate": 6.198084219997374e-06, "loss": 0.2668, "step": 2315 }, { "epoch": 1.8606516290726818, "grad_norm": 0.17537201941013336, "learning_rate": 5.854209834108087e-06, "loss": 0.2743, "step": 2320 }, { "epoch": 1.8646616541353382, "grad_norm": 0.18073910474777222, "learning_rate": 5.5200358892561755e-06, "loss": 0.2738, "step": 2325 }, { "epoch": 1.8686716791979952, "grad_norm": 0.16718195378780365, "learning_rate": 5.195575662896301e-06, "loss": 0.2668, "step": 2330 }, { "epoch": 1.8726817042606516, "grad_norm": 0.15906399488449097, "learning_rate": 4.880842046536021e-06, "loss": 0.2522, "step": 2335 }, { "epoch": 1.8766917293233083, "grad_norm": 0.1543634831905365, "learning_rate": 4.575847545223499e-06, "loss": 0.2325, "step": 2340 }, { "epoch": 1.880701754385965, "grad_norm": 0.15531331300735474, "learning_rate": 4.2806042770509316e-06, "loss": 0.2503, "step": 2345 }, { "epoch": 1.8847117794486214, "grad_norm": 0.17484238743782043, "learning_rate": 3.9951239726728485e-06, "loss": 0.2491, "step": 2350 }, { "epoch": 1.8887218045112784, "grad_norm": 0.15897777676582336, "learning_rate": 3.719417974839989e-06, "loss": 0.2708, "step": 2355 }, { "epoch": 1.8927318295739348, "grad_norm": 0.16040709614753723, "learning_rate": 3.453497237948855e-06, "loss": 0.2655, "step": 2360 }, { "epoch": 1.8967418546365915, "grad_norm": 0.14991337060928345, "learning_rate": 3.1973723276062517e-06, "loss": 0.2275, "step": 2365 }, { "epoch": 1.9007518796992482, "grad_norm": 0.1545797735452652, "learning_rate": 2.9510534202096263e-06, "loss": 0.2576, "step": 2370 }, { "epoch": 1.9047619047619047, "grad_norm": 0.16641472280025482, "learning_rate": 2.71455030254264e-06, "loss": 0.2607, "step": 2375 }, { "epoch": 1.9087719298245616, "grad_norm": 0.1749178171157837, "learning_rate": 2.487872371386424e-06, "loss": 0.2679, "step": 2380 }, { "epoch": 1.912781954887218, "grad_norm": 0.1886933445930481, "learning_rate": 2.271028633146127e-06, "loss": 0.2631, "step": 2385 }, { "epoch": 1.9167919799498747, "grad_norm": 0.15673983097076416, "learning_rate": 2.064027703493149e-06, "loss": 0.2502, "step": 2390 }, { "epoch": 1.9208020050125314, "grad_norm": 0.16062475740909576, "learning_rate": 1.866877807022771e-06, "loss": 0.2295, "step": 2395 }, { "epoch": 1.9248120300751879, "grad_norm": 0.16452881693840027, "learning_rate": 1.6795867769273665e-06, "loss": 0.2585, "step": 2400 }, { "epoch": 1.9288220551378448, "grad_norm": 0.17597217857837677, "learning_rate": 1.50216205468523e-06, "loss": 0.2586, "step": 2405 }, { "epoch": 1.9328320802005012, "grad_norm": 0.1579117476940155, "learning_rate": 1.3346106897648424e-06, "loss": 0.2442, "step": 2410 }, { "epoch": 1.936842105263158, "grad_norm": 0.16751398146152496, "learning_rate": 1.1769393393448457e-06, "loss": 0.2649, "step": 2415 }, { "epoch": 1.9408521303258146, "grad_norm": 0.17506803572177887, "learning_rate": 1.0291542680494758e-06, "loss": 0.2709, "step": 2420 }, { "epoch": 1.944862155388471, "grad_norm": 0.17089371383190155, "learning_rate": 8.912613476997067e-07, "loss": 0.2739, "step": 2425 }, { "epoch": 1.948872180451128, "grad_norm": 0.16731411218643188, "learning_rate": 7.632660570799366e-07, "loss": 0.2617, "step": 2430 }, { "epoch": 1.9528822055137844, "grad_norm": 0.15917718410491943, "learning_rate": 6.451734817202736e-07, "loss": 0.2783, "step": 2435 }, { "epoch": 1.9568922305764411, "grad_norm": 0.17604759335517883, "learning_rate": 5.369883136945309e-07, "loss": 0.2538, "step": 2440 }, { "epoch": 1.9609022556390978, "grad_norm": 0.1530042290687561, "learning_rate": 4.387148514337358e-07, "loss": 0.2652, "step": 2445 }, { "epoch": 1.9649122807017543, "grad_norm": 0.15754002332687378, "learning_rate": 3.503569995554068e-07, "loss": 0.2615, "step": 2450 }, { "epoch": 1.9689223057644112, "grad_norm": 0.1827680766582489, "learning_rate": 2.7191826870834325e-07, "loss": 0.2742, "step": 2455 }, { "epoch": 1.9729323308270676, "grad_norm": 0.15967601537704468, "learning_rate": 2.034017754332651e-07, "loss": 0.2466, "step": 2460 }, { "epoch": 1.9769423558897243, "grad_norm": 0.17333175241947174, "learning_rate": 1.4481024203877313e-07, "loss": 0.2764, "step": 2465 }, { "epoch": 1.980952380952381, "grad_norm": 0.16165874898433685, "learning_rate": 9.6145996493463e-08, "loss": 0.2568, "step": 2470 }, { "epoch": 1.9849624060150375, "grad_norm": 0.16772541403770447, "learning_rate": 5.7410972333193924e-08, "loss": 0.2407, "step": 2475 }, { "epoch": 1.9889724310776944, "grad_norm": 0.14310236275196075, "learning_rate": 2.8606708584344445e-08, "loss": 0.2468, "step": 2480 }, { "epoch": 1.9929824561403509, "grad_norm": 0.1455429494380951, "learning_rate": 9.734349702722467e-09, "loss": 0.2635, "step": 2485 }, { "epoch": 1.9969924812030075, "grad_norm": 0.16766951978206635, "learning_rate": 7.94645528018334e-10, "loss": 0.2535, "step": 2490 }, { "epoch": 1.9985964912280703, "eval_loss": 0.22910664975643158, "eval_runtime": 8.4374, "eval_samples_per_second": 6.519, "eval_steps_per_second": 0.83, "step": 2492 }, { "epoch": 1.9985964912280703, "step": 2492, "total_flos": 8.025299762167153e+17, "train_loss": 0.37913749282088366, "train_runtime": 28381.1385, "train_samples_per_second": 2.812, "train_steps_per_second": 0.088 }, { "epoch": 1.9985964912280703, "eval_loss": 0.22910664975643158, "eval_runtime": 8.378, "eval_samples_per_second": 6.565, "eval_steps_per_second": 0.836, "step": 2492 } ], "logging_steps": 5, "max_steps": 2492, "num_input_tokens_seen": 0, "num_train_epochs": 2, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": false, "should_training_stop": false }, "attributes": {} } }, "total_flos": 8.025299762167153e+17, "train_batch_size": 2, "trial_name": null, "trial_params": null }