{ "best_metric": null, "best_model_checkpoint": null, "epoch": 3.540606328833813, "eval_steps": 500, "global_step": 36000, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 9.835017580093924e-05, "grad_norm": 3.649945020675659, "learning_rate": 3.9292730844793716e-08, "loss": 1.0527, "step": 1 }, { "epoch": 0.0009835017580093924, "grad_norm": 11.982413291931152, "learning_rate": 3.9292730844793716e-07, "loss": 1.8048, "step": 10 }, { "epoch": 0.0019670035160187847, "grad_norm": 3.4894094467163086, "learning_rate": 7.858546168958743e-07, "loss": 1.7688, "step": 20 }, { "epoch": 0.0029505052740281773, "grad_norm": 3.314408779144287, "learning_rate": 1.1787819253438115e-06, "loss": 1.8063, "step": 30 }, { "epoch": 0.003934007032037569, "grad_norm": 3.1056833267211914, "learning_rate": 1.5717092337917486e-06, "loss": 2.0023, "step": 40 }, { "epoch": 0.004917508790046962, "grad_norm": 4.661435604095459, "learning_rate": 1.9646365422396858e-06, "loss": 2.06, "step": 50 }, { "epoch": 0.005901010548056355, "grad_norm": 2.0999605655670166, "learning_rate": 2.357563850687623e-06, "loss": 1.7816, "step": 60 }, { "epoch": 0.006884512306065747, "grad_norm": 5.44950008392334, "learning_rate": 2.7504911591355604e-06, "loss": 1.789, "step": 70 }, { "epoch": 0.007868014064075139, "grad_norm": 5.730320453643799, "learning_rate": 3.1434184675834973e-06, "loss": 1.5076, "step": 80 }, { "epoch": 0.008851515822084531, "grad_norm": 10.488595008850098, "learning_rate": 3.5363457760314346e-06, "loss": 1.6972, "step": 90 }, { "epoch": 0.009835017580093924, "grad_norm": 6.945803642272949, "learning_rate": 3.9292730844793715e-06, "loss": 1.343, "step": 100 }, { "epoch": 0.010818519338103317, "grad_norm": 6.9783830642700195, "learning_rate": 4.322200392927308e-06, "loss": 1.0908, "step": 110 }, { "epoch": 0.01180202109611271, "grad_norm": 2.9272944927215576, "learning_rate": 4.715127701375246e-06, "loss": 0.8392, "step": 120 }, { "epoch": 0.012785522854122102, "grad_norm": 3.644608736038208, "learning_rate": 5.108055009823183e-06, "loss": 0.6187, "step": 130 }, { "epoch": 0.013769024612131494, "grad_norm": 2.0485031604766846, "learning_rate": 5.500982318271121e-06, "loss": 0.7187, "step": 140 }, { "epoch": 0.014752526370140887, "grad_norm": 1.5527009963989258, "learning_rate": 5.893909626719058e-06, "loss": 0.536, "step": 150 }, { "epoch": 0.015736028128150278, "grad_norm": 1.9112921953201294, "learning_rate": 6.286836935166995e-06, "loss": 0.509, "step": 160 }, { "epoch": 0.01671952988615967, "grad_norm": 2.1568541526794434, "learning_rate": 6.6797642436149315e-06, "loss": 0.4402, "step": 170 }, { "epoch": 0.017703031644169063, "grad_norm": 0.6417804956436157, "learning_rate": 7.072691552062869e-06, "loss": 0.4267, "step": 180 }, { "epoch": 0.018686533402178455, "grad_norm": 1.3143049478530884, "learning_rate": 7.465618860510806e-06, "loss": 0.4131, "step": 190 }, { "epoch": 0.019670035160187848, "grad_norm": 1.5052763223648071, "learning_rate": 7.858546168958743e-06, "loss": 0.4092, "step": 200 }, { "epoch": 0.02065353691819724, "grad_norm": 1.0570193529129028, "learning_rate": 8.25147347740668e-06, "loss": 0.4324, "step": 210 }, { "epoch": 0.021637038676206633, "grad_norm": 1.710619330406189, "learning_rate": 8.644400785854617e-06, "loss": 0.4052, "step": 220 }, { "epoch": 0.022620540434216026, "grad_norm": 1.0562747716903687, "learning_rate": 9.037328094302554e-06, "loss": 0.5058, "step": 230 }, { "epoch": 0.02360404219222542, "grad_norm": 1.6072394847869873, "learning_rate": 9.430255402750492e-06, "loss": 0.5282, "step": 240 }, { "epoch": 0.02458754395023481, "grad_norm": 0.5480562448501587, "learning_rate": 9.82318271119843e-06, "loss": 0.3765, "step": 250 }, { "epoch": 0.025571045708244203, "grad_norm": 0.7658659219741821, "learning_rate": 1.0216110019646366e-05, "loss": 0.3856, "step": 260 }, { "epoch": 0.026554547466253596, "grad_norm": 1.19699227809906, "learning_rate": 1.0609037328094303e-05, "loss": 0.414, "step": 270 }, { "epoch": 0.02753804922426299, "grad_norm": 1.9998314380645752, "learning_rate": 1.1001964636542242e-05, "loss": 0.4558, "step": 280 }, { "epoch": 0.02852155098227238, "grad_norm": 0.4776442050933838, "learning_rate": 1.1394891944990178e-05, "loss": 0.4623, "step": 290 }, { "epoch": 0.029505052740281774, "grad_norm": 0.5863878726959229, "learning_rate": 1.1787819253438115e-05, "loss": 0.317, "step": 300 }, { "epoch": 0.030488554498291166, "grad_norm": 0.7499637007713318, "learning_rate": 1.2180746561886052e-05, "loss": 0.4085, "step": 310 }, { "epoch": 0.031472056256300555, "grad_norm": 0.7671499848365784, "learning_rate": 1.257367387033399e-05, "loss": 0.2961, "step": 320 }, { "epoch": 0.03245555801430995, "grad_norm": 0.7907651662826538, "learning_rate": 1.2966601178781926e-05, "loss": 0.3862, "step": 330 }, { "epoch": 0.03343905977231934, "grad_norm": 0.6451826691627502, "learning_rate": 1.3359528487229863e-05, "loss": 0.4217, "step": 340 }, { "epoch": 0.03442256153032874, "grad_norm": 0.42799535393714905, "learning_rate": 1.3752455795677802e-05, "loss": 0.3331, "step": 350 }, { "epoch": 0.035406063288338126, "grad_norm": 0.5752848386764526, "learning_rate": 1.4145383104125738e-05, "loss": 0.3665, "step": 360 }, { "epoch": 0.03638956504634752, "grad_norm": 0.5795350074768066, "learning_rate": 1.4538310412573675e-05, "loss": 0.3811, "step": 370 }, { "epoch": 0.03737306680435691, "grad_norm": 1.5207698345184326, "learning_rate": 1.4931237721021612e-05, "loss": 0.3787, "step": 380 }, { "epoch": 0.03835656856236631, "grad_norm": 1.3819210529327393, "learning_rate": 1.532416502946955e-05, "loss": 0.4208, "step": 390 }, { "epoch": 0.039340070320375696, "grad_norm": 0.7597386837005615, "learning_rate": 1.5717092337917486e-05, "loss": 0.335, "step": 400 }, { "epoch": 0.04032357207838509, "grad_norm": 0.39009803533554077, "learning_rate": 1.6110019646365423e-05, "loss": 0.3017, "step": 410 }, { "epoch": 0.04130707383639448, "grad_norm": 0.6308045983314514, "learning_rate": 1.650294695481336e-05, "loss": 0.3232, "step": 420 }, { "epoch": 0.04229057559440388, "grad_norm": 0.8969710469245911, "learning_rate": 1.6895874263261297e-05, "loss": 0.3076, "step": 430 }, { "epoch": 0.043274077352413266, "grad_norm": 0.6697237491607666, "learning_rate": 1.7288801571709234e-05, "loss": 0.3273, "step": 440 }, { "epoch": 0.04425757911042266, "grad_norm": 0.6771184802055359, "learning_rate": 1.768172888015717e-05, "loss": 0.3012, "step": 450 }, { "epoch": 0.04524108086843205, "grad_norm": 0.32029828429222107, "learning_rate": 1.8074656188605107e-05, "loss": 0.273, "step": 460 }, { "epoch": 0.04622458262644145, "grad_norm": 0.388177752494812, "learning_rate": 1.8467583497053048e-05, "loss": 0.3686, "step": 470 }, { "epoch": 0.04720808438445084, "grad_norm": 1.197744607925415, "learning_rate": 1.8860510805500985e-05, "loss": 0.3402, "step": 480 }, { "epoch": 0.04819158614246023, "grad_norm": 0.7969218492507935, "learning_rate": 1.925343811394892e-05, "loss": 0.3683, "step": 490 }, { "epoch": 0.04917508790046962, "grad_norm": 1.7514092922210693, "learning_rate": 1.964636542239686e-05, "loss": 0.3258, "step": 500 }, { "epoch": 0.04917508790046962, "eval_loss": 0.2306613028049469, "eval_runtime": 17.398, "eval_samples_per_second": 2.874, "eval_steps_per_second": 1.437, "step": 500 }, { "epoch": 0.05015858965847902, "grad_norm": 0.8328887820243835, "learning_rate": 1.999960259110599e-05, "loss": 0.3144, "step": 510 }, { "epoch": 0.05114209141648841, "grad_norm": 0.6266557574272156, "learning_rate": 1.999562850216588e-05, "loss": 0.3344, "step": 520 }, { "epoch": 0.052125593174497796, "grad_norm": 0.6225624084472656, "learning_rate": 1.999165441322577e-05, "loss": 0.2535, "step": 530 }, { "epoch": 0.05310909493250719, "grad_norm": 1.2749239206314087, "learning_rate": 1.998768032428566e-05, "loss": 0.2902, "step": 540 }, { "epoch": 0.05409259669051658, "grad_norm": 1.1305932998657227, "learning_rate": 1.998370623534555e-05, "loss": 0.333, "step": 550 }, { "epoch": 0.05507609844852598, "grad_norm": 0.9573833346366882, "learning_rate": 1.997973214640544e-05, "loss": 0.2581, "step": 560 }, { "epoch": 0.056059600206535366, "grad_norm": 0.5128825306892395, "learning_rate": 1.997575805746533e-05, "loss": 0.2887, "step": 570 }, { "epoch": 0.05704310196454476, "grad_norm": 0.887127697467804, "learning_rate": 1.9971783968525218e-05, "loss": 0.4077, "step": 580 }, { "epoch": 0.05802660372255415, "grad_norm": 2.3436291217803955, "learning_rate": 1.9967809879585108e-05, "loss": 0.2335, "step": 590 }, { "epoch": 0.05901010548056355, "grad_norm": 1.4168339967727661, "learning_rate": 1.9963835790644998e-05, "loss": 0.4166, "step": 600 }, { "epoch": 0.05999360723857294, "grad_norm": 0.6777006387710571, "learning_rate": 1.9959861701704887e-05, "loss": 0.3611, "step": 610 }, { "epoch": 0.06097710899658233, "grad_norm": 1.5377174615859985, "learning_rate": 1.9955887612764774e-05, "loss": 0.3877, "step": 620 }, { "epoch": 0.06196061075459172, "grad_norm": 1.4269481897354126, "learning_rate": 1.9951913523824663e-05, "loss": 0.3423, "step": 630 }, { "epoch": 0.06294411251260111, "grad_norm": 0.7028114795684814, "learning_rate": 1.9947939434884553e-05, "loss": 0.4658, "step": 640 }, { "epoch": 0.06392761427061051, "grad_norm": 0.6338093280792236, "learning_rate": 1.9943965345944446e-05, "loss": 0.3796, "step": 650 }, { "epoch": 0.0649111160286199, "grad_norm": 0.8471394777297974, "learning_rate": 1.9939991257004336e-05, "loss": 0.3291, "step": 660 }, { "epoch": 0.06589461778662929, "grad_norm": 1.587775707244873, "learning_rate": 1.9936017168064222e-05, "loss": 0.367, "step": 670 }, { "epoch": 0.06687811954463868, "grad_norm": 0.6585375070571899, "learning_rate": 1.993204307912411e-05, "loss": 0.3023, "step": 680 }, { "epoch": 0.06786162130264808, "grad_norm": 0.8488882184028625, "learning_rate": 1.9928068990184e-05, "loss": 0.3238, "step": 690 }, { "epoch": 0.06884512306065747, "grad_norm": 0.6817759275436401, "learning_rate": 1.992409490124389e-05, "loss": 0.228, "step": 700 }, { "epoch": 0.06982862481866686, "grad_norm": 1.6214189529418945, "learning_rate": 1.992012081230378e-05, "loss": 0.3284, "step": 710 }, { "epoch": 0.07081212657667625, "grad_norm": 1.3868869543075562, "learning_rate": 1.991614672336367e-05, "loss": 0.3909, "step": 720 }, { "epoch": 0.07179562833468565, "grad_norm": 1.374632716178894, "learning_rate": 1.991217263442356e-05, "loss": 0.3086, "step": 730 }, { "epoch": 0.07277913009269504, "grad_norm": 1.1037342548370361, "learning_rate": 1.990819854548345e-05, "loss": 0.4078, "step": 740 }, { "epoch": 0.07376263185070443, "grad_norm": 1.1040786504745483, "learning_rate": 1.990422445654334e-05, "loss": 0.3532, "step": 750 }, { "epoch": 0.07474613360871382, "grad_norm": 1.2953940629959106, "learning_rate": 1.990025036760323e-05, "loss": 0.3954, "step": 760 }, { "epoch": 0.07572963536672322, "grad_norm": 1.4143564701080322, "learning_rate": 1.989627627866312e-05, "loss": 0.4358, "step": 770 }, { "epoch": 0.07671313712473261, "grad_norm": 1.4138742685317993, "learning_rate": 1.9892302189723008e-05, "loss": 0.3119, "step": 780 }, { "epoch": 0.077696638882742, "grad_norm": 0.6395207643508911, "learning_rate": 1.9888328100782898e-05, "loss": 0.3049, "step": 790 }, { "epoch": 0.07868014064075139, "grad_norm": 1.8704555034637451, "learning_rate": 1.9884354011842784e-05, "loss": 0.368, "step": 800 }, { "epoch": 0.07966364239876078, "grad_norm": 0.8261243104934692, "learning_rate": 1.9880379922902677e-05, "loss": 0.3592, "step": 810 }, { "epoch": 0.08064714415677018, "grad_norm": 2.373087167739868, "learning_rate": 1.9876405833962567e-05, "loss": 0.2079, "step": 820 }, { "epoch": 0.08163064591477957, "grad_norm": 0.4923926591873169, "learning_rate": 1.9872431745022456e-05, "loss": 0.3669, "step": 830 }, { "epoch": 0.08261414767278896, "grad_norm": 2.16957688331604, "learning_rate": 1.9868457656082346e-05, "loss": 0.4158, "step": 840 }, { "epoch": 0.08359764943079835, "grad_norm": 1.1719224452972412, "learning_rate": 1.9864483567142236e-05, "loss": 0.3503, "step": 850 }, { "epoch": 0.08458115118880775, "grad_norm": 1.3015501499176025, "learning_rate": 1.9860509478202122e-05, "loss": 0.3569, "step": 860 }, { "epoch": 0.08556465294681714, "grad_norm": 1.0028891563415527, "learning_rate": 1.9856535389262012e-05, "loss": 0.3994, "step": 870 }, { "epoch": 0.08654815470482653, "grad_norm": 1.8605300188064575, "learning_rate": 1.98525613003219e-05, "loss": 0.382, "step": 880 }, { "epoch": 0.08753165646283592, "grad_norm": 0.8204103708267212, "learning_rate": 1.9848587211381794e-05, "loss": 0.2986, "step": 890 }, { "epoch": 0.08851515822084532, "grad_norm": 0.7822405695915222, "learning_rate": 1.9844613122441684e-05, "loss": 0.231, "step": 900 }, { "epoch": 0.08949865997885471, "grad_norm": 0.5924950242042542, "learning_rate": 1.984063903350157e-05, "loss": 0.3128, "step": 910 }, { "epoch": 0.0904821617368641, "grad_norm": 1.5780599117279053, "learning_rate": 1.983666494456146e-05, "loss": 0.2309, "step": 920 }, { "epoch": 0.09146566349487349, "grad_norm": 1.5509686470031738, "learning_rate": 1.983269085562135e-05, "loss": 0.3227, "step": 930 }, { "epoch": 0.0924491652528829, "grad_norm": 1.5279356241226196, "learning_rate": 1.982871676668124e-05, "loss": 0.3902, "step": 940 }, { "epoch": 0.09343266701089228, "grad_norm": 1.8690674304962158, "learning_rate": 1.982474267774113e-05, "loss": 0.2989, "step": 950 }, { "epoch": 0.09441616876890167, "grad_norm": 1.274799108505249, "learning_rate": 1.982076858880102e-05, "loss": 0.1906, "step": 960 }, { "epoch": 0.09539967052691106, "grad_norm": 0.8589666485786438, "learning_rate": 1.981679449986091e-05, "loss": 0.3481, "step": 970 }, { "epoch": 0.09638317228492047, "grad_norm": 1.0235803127288818, "learning_rate": 1.9812820410920798e-05, "loss": 0.3185, "step": 980 }, { "epoch": 0.09736667404292985, "grad_norm": 1.4823609590530396, "learning_rate": 1.9808846321980688e-05, "loss": 0.4237, "step": 990 }, { "epoch": 0.09835017580093924, "grad_norm": 1.3744195699691772, "learning_rate": 1.9804872233040577e-05, "loss": 0.2549, "step": 1000 }, { "epoch": 0.09835017580093924, "eval_loss": 0.20229017734527588, "eval_runtime": 16.1384, "eval_samples_per_second": 3.098, "eval_steps_per_second": 1.549, "step": 1000 }, { "epoch": 0.09933367755894863, "grad_norm": 2.249887466430664, "learning_rate": 1.9800898144100467e-05, "loss": 0.4879, "step": 1010 }, { "epoch": 0.10031717931695804, "grad_norm": 1.4358183145523071, "learning_rate": 1.9796924055160357e-05, "loss": 0.3384, "step": 1020 }, { "epoch": 0.10130068107496742, "grad_norm": 0.7850083708763123, "learning_rate": 1.9792949966220246e-05, "loss": 0.4727, "step": 1030 }, { "epoch": 0.10228418283297681, "grad_norm": 1.24857759475708, "learning_rate": 1.9788975877280133e-05, "loss": 0.2594, "step": 1040 }, { "epoch": 0.1032676845909862, "grad_norm": 0.6814334392547607, "learning_rate": 1.9785001788340022e-05, "loss": 0.3108, "step": 1050 }, { "epoch": 0.10425118634899559, "grad_norm": 0.8959739208221436, "learning_rate": 1.9781027699399915e-05, "loss": 0.3643, "step": 1060 }, { "epoch": 0.105234688107005, "grad_norm": 1.1151331663131714, "learning_rate": 1.9777053610459805e-05, "loss": 0.4537, "step": 1070 }, { "epoch": 0.10621818986501438, "grad_norm": 0.7030103802680969, "learning_rate": 1.9773079521519695e-05, "loss": 0.2592, "step": 1080 }, { "epoch": 0.10720169162302377, "grad_norm": 0.9018835425376892, "learning_rate": 1.9769105432579584e-05, "loss": 0.3518, "step": 1090 }, { "epoch": 0.10818519338103316, "grad_norm": 0.46352618932724, "learning_rate": 1.976513134363947e-05, "loss": 0.3255, "step": 1100 }, { "epoch": 0.10916869513904257, "grad_norm": 1.0427846908569336, "learning_rate": 1.976115725469936e-05, "loss": 0.3765, "step": 1110 }, { "epoch": 0.11015219689705195, "grad_norm": 0.5563482642173767, "learning_rate": 1.975718316575925e-05, "loss": 0.2775, "step": 1120 }, { "epoch": 0.11113569865506134, "grad_norm": 1.104644536972046, "learning_rate": 1.975320907681914e-05, "loss": 0.2524, "step": 1130 }, { "epoch": 0.11211920041307073, "grad_norm": 0.8505045175552368, "learning_rate": 1.9749234987879033e-05, "loss": 0.3474, "step": 1140 }, { "epoch": 0.11310270217108014, "grad_norm": 0.6132721304893494, "learning_rate": 1.974526089893892e-05, "loss": 0.2291, "step": 1150 }, { "epoch": 0.11408620392908952, "grad_norm": 1.056565284729004, "learning_rate": 1.974128680999881e-05, "loss": 0.3622, "step": 1160 }, { "epoch": 0.11506970568709891, "grad_norm": 1.2748444080352783, "learning_rate": 1.97373127210587e-05, "loss": 0.3838, "step": 1170 }, { "epoch": 0.1160532074451083, "grad_norm": 0.8781066536903381, "learning_rate": 1.9733338632118588e-05, "loss": 0.3436, "step": 1180 }, { "epoch": 0.1170367092031177, "grad_norm": 1.7326856851577759, "learning_rate": 1.9729364543178478e-05, "loss": 0.3075, "step": 1190 }, { "epoch": 0.1180202109611271, "grad_norm": 1.2784039974212646, "learning_rate": 1.9725390454238367e-05, "loss": 0.4899, "step": 1200 }, { "epoch": 0.11900371271913648, "grad_norm": 2.0357894897460938, "learning_rate": 1.9721416365298257e-05, "loss": 0.3078, "step": 1210 }, { "epoch": 0.11998721447714587, "grad_norm": 0.5481784343719482, "learning_rate": 1.9717442276358147e-05, "loss": 0.2839, "step": 1220 }, { "epoch": 0.12097071623515528, "grad_norm": 1.6970115900039673, "learning_rate": 1.9713468187418036e-05, "loss": 0.3257, "step": 1230 }, { "epoch": 0.12195421799316467, "grad_norm": 1.4209277629852295, "learning_rate": 1.9709494098477926e-05, "loss": 0.4143, "step": 1240 }, { "epoch": 0.12293771975117405, "grad_norm": 1.3740328550338745, "learning_rate": 1.9705520009537816e-05, "loss": 0.3004, "step": 1250 }, { "epoch": 0.12392122150918344, "grad_norm": 1.2223352193832397, "learning_rate": 1.9701545920597705e-05, "loss": 0.2664, "step": 1260 }, { "epoch": 0.12490472326719285, "grad_norm": 1.5225772857666016, "learning_rate": 1.9697571831657595e-05, "loss": 0.2847, "step": 1270 }, { "epoch": 0.12588822502520222, "grad_norm": 1.2129939794540405, "learning_rate": 1.969359774271748e-05, "loss": 0.3743, "step": 1280 }, { "epoch": 0.12687172678321162, "grad_norm": 1.3428657054901123, "learning_rate": 1.968962365377737e-05, "loss": 0.2888, "step": 1290 }, { "epoch": 0.12785522854122103, "grad_norm": 1.0801212787628174, "learning_rate": 1.9685649564837264e-05, "loss": 0.2564, "step": 1300 }, { "epoch": 0.1288387302992304, "grad_norm": 0.9565796256065369, "learning_rate": 1.9681675475897154e-05, "loss": 0.2988, "step": 1310 }, { "epoch": 0.1298222320572398, "grad_norm": 1.2454416751861572, "learning_rate": 1.9677701386957043e-05, "loss": 0.2863, "step": 1320 }, { "epoch": 0.13080573381524918, "grad_norm": 0.6081139445304871, "learning_rate": 1.9673727298016933e-05, "loss": 0.2704, "step": 1330 }, { "epoch": 0.13178923557325858, "grad_norm": 0.44817450642585754, "learning_rate": 1.966975320907682e-05, "loss": 0.3165, "step": 1340 }, { "epoch": 0.132772737331268, "grad_norm": 1.6914284229278564, "learning_rate": 1.966577912013671e-05, "loss": 0.286, "step": 1350 }, { "epoch": 0.13375623908927736, "grad_norm": 0.9408580660820007, "learning_rate": 1.96618050311966e-05, "loss": 0.2856, "step": 1360 }, { "epoch": 0.13473974084728677, "grad_norm": 0.3792386054992676, "learning_rate": 1.9657830942256488e-05, "loss": 0.3859, "step": 1370 }, { "epoch": 0.13572324260529617, "grad_norm": 0.9228833913803101, "learning_rate": 1.965385685331638e-05, "loss": 0.2832, "step": 1380 }, { "epoch": 0.13670674436330554, "grad_norm": 1.593618392944336, "learning_rate": 1.9649882764376268e-05, "loss": 0.2249, "step": 1390 }, { "epoch": 0.13769024612131495, "grad_norm": 1.7434206008911133, "learning_rate": 1.9645908675436157e-05, "loss": 0.2834, "step": 1400 }, { "epoch": 0.13867374787932432, "grad_norm": 1.7918550968170166, "learning_rate": 1.9641934586496047e-05, "loss": 0.3431, "step": 1410 }, { "epoch": 0.13965724963733372, "grad_norm": 1.3712127208709717, "learning_rate": 1.9637960497555937e-05, "loss": 0.3033, "step": 1420 }, { "epoch": 0.14064075139534313, "grad_norm": 2.3793137073516846, "learning_rate": 1.9633986408615826e-05, "loss": 0.3397, "step": 1430 }, { "epoch": 0.1416242531533525, "grad_norm": 0.6082277894020081, "learning_rate": 1.9630012319675716e-05, "loss": 0.2817, "step": 1440 }, { "epoch": 0.1426077549113619, "grad_norm": 1.1378542184829712, "learning_rate": 1.9626038230735606e-05, "loss": 0.4318, "step": 1450 }, { "epoch": 0.1435912566693713, "grad_norm": 0.5686970353126526, "learning_rate": 1.9622064141795495e-05, "loss": 0.2451, "step": 1460 }, { "epoch": 0.14457475842738068, "grad_norm": 0.6625441312789917, "learning_rate": 1.9618090052855385e-05, "loss": 0.2514, "step": 1470 }, { "epoch": 0.1455582601853901, "grad_norm": 0.542182445526123, "learning_rate": 1.9614115963915275e-05, "loss": 0.3149, "step": 1480 }, { "epoch": 0.14654176194339946, "grad_norm": 0.3858255445957184, "learning_rate": 1.9610141874975164e-05, "loss": 0.2722, "step": 1490 }, { "epoch": 0.14752526370140887, "grad_norm": 0.773241400718689, "learning_rate": 1.9606167786035054e-05, "loss": 0.2475, "step": 1500 }, { "epoch": 0.14752526370140887, "eval_loss": 0.19610656797885895, "eval_runtime": 19.5568, "eval_samples_per_second": 2.557, "eval_steps_per_second": 1.278, "step": 1500 }, { "epoch": 0.14850876545941827, "grad_norm": 0.9091148376464844, "learning_rate": 1.9602193697094944e-05, "loss": 0.3697, "step": 1510 }, { "epoch": 0.14949226721742764, "grad_norm": 1.2830495834350586, "learning_rate": 1.959821960815483e-05, "loss": 0.3447, "step": 1520 }, { "epoch": 0.15047576897543705, "grad_norm": 1.6588833332061768, "learning_rate": 1.959424551921472e-05, "loss": 0.2362, "step": 1530 }, { "epoch": 0.15145927073344645, "grad_norm": 1.5319061279296875, "learning_rate": 1.9590271430274613e-05, "loss": 0.2416, "step": 1540 }, { "epoch": 0.15244277249145582, "grad_norm": 0.7510396838188171, "learning_rate": 1.9586297341334502e-05, "loss": 0.2539, "step": 1550 }, { "epoch": 0.15342627424946523, "grad_norm": 1.2139660120010376, "learning_rate": 1.9582323252394392e-05, "loss": 0.3512, "step": 1560 }, { "epoch": 0.1544097760074746, "grad_norm": 1.344325304031372, "learning_rate": 1.957834916345428e-05, "loss": 0.2667, "step": 1570 }, { "epoch": 0.155393277765484, "grad_norm": 1.6625233888626099, "learning_rate": 1.9574375074514168e-05, "loss": 0.2317, "step": 1580 }, { "epoch": 0.1563767795234934, "grad_norm": 1.9059685468673706, "learning_rate": 1.9570400985574057e-05, "loss": 0.2561, "step": 1590 }, { "epoch": 0.15736028128150278, "grad_norm": 0.7797982692718506, "learning_rate": 1.9566426896633947e-05, "loss": 0.3168, "step": 1600 }, { "epoch": 0.1583437830395122, "grad_norm": 0.5057277679443359, "learning_rate": 1.9562452807693837e-05, "loss": 0.2496, "step": 1610 }, { "epoch": 0.15932728479752156, "grad_norm": 1.5704916715621948, "learning_rate": 1.955847871875373e-05, "loss": 0.3539, "step": 1620 }, { "epoch": 0.16031078655553097, "grad_norm": 1.0662189722061157, "learning_rate": 1.9554504629813616e-05, "loss": 0.2697, "step": 1630 }, { "epoch": 0.16129428831354037, "grad_norm": 1.3776342868804932, "learning_rate": 1.9550530540873506e-05, "loss": 0.3037, "step": 1640 }, { "epoch": 0.16227779007154974, "grad_norm": 1.1762497425079346, "learning_rate": 1.9546556451933395e-05, "loss": 0.2372, "step": 1650 }, { "epoch": 0.16326129182955915, "grad_norm": 0.9527464509010315, "learning_rate": 1.9542582362993285e-05, "loss": 0.3551, "step": 1660 }, { "epoch": 0.16424479358756855, "grad_norm": 1.6634376049041748, "learning_rate": 1.9538608274053175e-05, "loss": 0.3816, "step": 1670 }, { "epoch": 0.16522829534557792, "grad_norm": 1.6701467037200928, "learning_rate": 1.9534634185113064e-05, "loss": 0.2199, "step": 1680 }, { "epoch": 0.16621179710358733, "grad_norm": 1.0209531784057617, "learning_rate": 1.9530660096172954e-05, "loss": 0.3532, "step": 1690 }, { "epoch": 0.1671952988615967, "grad_norm": 0.8011378049850464, "learning_rate": 1.9526686007232844e-05, "loss": 0.3637, "step": 1700 }, { "epoch": 0.1681788006196061, "grad_norm": 0.581768274307251, "learning_rate": 1.9522711918292733e-05, "loss": 0.3278, "step": 1710 }, { "epoch": 0.1691623023776155, "grad_norm": 1.9274336099624634, "learning_rate": 1.9518737829352623e-05, "loss": 0.236, "step": 1720 }, { "epoch": 0.17014580413562488, "grad_norm": 1.341434359550476, "learning_rate": 1.9514763740412513e-05, "loss": 0.2646, "step": 1730 }, { "epoch": 0.1711293058936343, "grad_norm": 1.8525949716567993, "learning_rate": 1.9510789651472402e-05, "loss": 0.3428, "step": 1740 }, { "epoch": 0.1721128076516437, "grad_norm": 1.5868077278137207, "learning_rate": 1.9506815562532292e-05, "loss": 0.3762, "step": 1750 }, { "epoch": 0.17309630940965307, "grad_norm": 0.9035124778747559, "learning_rate": 1.950284147359218e-05, "loss": 0.275, "step": 1760 }, { "epoch": 0.17407981116766247, "grad_norm": 1.7473918199539185, "learning_rate": 1.9498867384652068e-05, "loss": 0.2784, "step": 1770 }, { "epoch": 0.17506331292567184, "grad_norm": 1.0584453344345093, "learning_rate": 1.9494893295711958e-05, "loss": 0.3078, "step": 1780 }, { "epoch": 0.17604681468368125, "grad_norm": 0.5479601621627808, "learning_rate": 1.949091920677185e-05, "loss": 0.3793, "step": 1790 }, { "epoch": 0.17703031644169065, "grad_norm": 0.8641221523284912, "learning_rate": 1.948694511783174e-05, "loss": 0.4181, "step": 1800 }, { "epoch": 0.17801381819970002, "grad_norm": 0.9480550289154053, "learning_rate": 1.948297102889163e-05, "loss": 0.4344, "step": 1810 }, { "epoch": 0.17899731995770943, "grad_norm": 1.129366397857666, "learning_rate": 1.9478996939951516e-05, "loss": 0.3321, "step": 1820 }, { "epoch": 0.17998082171571883, "grad_norm": 1.2757154703140259, "learning_rate": 1.9475022851011406e-05, "loss": 0.3701, "step": 1830 }, { "epoch": 0.1809643234737282, "grad_norm": 1.2727177143096924, "learning_rate": 1.9471048762071296e-05, "loss": 0.2436, "step": 1840 }, { "epoch": 0.1819478252317376, "grad_norm": 2.6235814094543457, "learning_rate": 1.9467074673131185e-05, "loss": 0.2863, "step": 1850 }, { "epoch": 0.18293132698974698, "grad_norm": 2.299156904220581, "learning_rate": 1.9463100584191075e-05, "loss": 0.3019, "step": 1860 }, { "epoch": 0.1839148287477564, "grad_norm": 0.4710278809070587, "learning_rate": 1.9459126495250965e-05, "loss": 0.2553, "step": 1870 }, { "epoch": 0.1848983305057658, "grad_norm": 0.47247934341430664, "learning_rate": 1.9455152406310854e-05, "loss": 0.3571, "step": 1880 }, { "epoch": 0.18588183226377517, "grad_norm": 0.878594160079956, "learning_rate": 1.9451178317370744e-05, "loss": 0.1848, "step": 1890 }, { "epoch": 0.18686533402178457, "grad_norm": 0.7783956527709961, "learning_rate": 1.9447204228430634e-05, "loss": 0.2342, "step": 1900 }, { "epoch": 0.18784883577979394, "grad_norm": 0.9626131057739258, "learning_rate": 1.9443230139490523e-05, "loss": 0.2647, "step": 1910 }, { "epoch": 0.18883233753780335, "grad_norm": 1.0547096729278564, "learning_rate": 1.9439256050550413e-05, "loss": 0.2782, "step": 1920 }, { "epoch": 0.18981583929581275, "grad_norm": 1.5676674842834473, "learning_rate": 1.9435281961610303e-05, "loss": 0.3556, "step": 1930 }, { "epoch": 0.19079934105382212, "grad_norm": 0.7233918309211731, "learning_rate": 1.9431307872670192e-05, "loss": 0.2339, "step": 1940 }, { "epoch": 0.19178284281183153, "grad_norm": 2.287632465362549, "learning_rate": 1.9427333783730082e-05, "loss": 0.273, "step": 1950 }, { "epoch": 0.19276634456984093, "grad_norm": 0.5366862416267395, "learning_rate": 1.942335969478997e-05, "loss": 0.2995, "step": 1960 }, { "epoch": 0.1937498463278503, "grad_norm": 1.5294963121414185, "learning_rate": 1.941938560584986e-05, "loss": 0.2755, "step": 1970 }, { "epoch": 0.1947333480858597, "grad_norm": 1.1591711044311523, "learning_rate": 1.941541151690975e-05, "loss": 0.3569, "step": 1980 }, { "epoch": 0.19571684984386908, "grad_norm": 1.1852513551712036, "learning_rate": 1.941143742796964e-05, "loss": 0.3179, "step": 1990 }, { "epoch": 0.1967003516018785, "grad_norm": 1.4614847898483276, "learning_rate": 1.9407463339029527e-05, "loss": 0.1913, "step": 2000 }, { "epoch": 0.1967003516018785, "eval_loss": 0.18594306707382202, "eval_runtime": 18.3957, "eval_samples_per_second": 2.718, "eval_steps_per_second": 1.359, "step": 2000 }, { "epoch": 0.1976838533598879, "grad_norm": 2.192962884902954, "learning_rate": 1.9403489250089417e-05, "loss": 0.2024, "step": 2010 }, { "epoch": 0.19866735511789727, "grad_norm": 0.4256015419960022, "learning_rate": 1.9399515161149306e-05, "loss": 0.2221, "step": 2020 }, { "epoch": 0.19965085687590667, "grad_norm": 1.269654393196106, "learning_rate": 1.93955410722092e-05, "loss": 0.2569, "step": 2030 }, { "epoch": 0.20063435863391607, "grad_norm": 1.8245949745178223, "learning_rate": 1.939156698326909e-05, "loss": 0.3545, "step": 2040 }, { "epoch": 0.20161786039192545, "grad_norm": 1.3251378536224365, "learning_rate": 1.938759289432898e-05, "loss": 0.257, "step": 2050 }, { "epoch": 0.20260136214993485, "grad_norm": 1.2136321067810059, "learning_rate": 1.9383618805388865e-05, "loss": 0.2392, "step": 2060 }, { "epoch": 0.20358486390794422, "grad_norm": 1.2105964422225952, "learning_rate": 1.9379644716448755e-05, "loss": 0.4406, "step": 2070 }, { "epoch": 0.20456836566595363, "grad_norm": 0.9377761483192444, "learning_rate": 1.9375670627508644e-05, "loss": 0.2832, "step": 2080 }, { "epoch": 0.20555186742396303, "grad_norm": 1.6811449527740479, "learning_rate": 1.9371696538568534e-05, "loss": 0.3275, "step": 2090 }, { "epoch": 0.2065353691819724, "grad_norm": 1.4369457960128784, "learning_rate": 1.9367722449628424e-05, "loss": 0.3204, "step": 2100 }, { "epoch": 0.2075188709399818, "grad_norm": 0.8904029130935669, "learning_rate": 1.9363748360688313e-05, "loss": 0.2644, "step": 2110 }, { "epoch": 0.20850237269799118, "grad_norm": 0.8172305822372437, "learning_rate": 1.9359774271748203e-05, "loss": 0.2975, "step": 2120 }, { "epoch": 0.2094858744560006, "grad_norm": 1.6503185033798218, "learning_rate": 1.9355800182808093e-05, "loss": 0.2737, "step": 2130 }, { "epoch": 0.21046937621401, "grad_norm": 2.1273269653320312, "learning_rate": 1.9351826093867982e-05, "loss": 0.302, "step": 2140 }, { "epoch": 0.21145287797201937, "grad_norm": 1.331870436668396, "learning_rate": 1.9347852004927872e-05, "loss": 0.3874, "step": 2150 }, { "epoch": 0.21243637973002877, "grad_norm": 0.8876633048057556, "learning_rate": 1.934387791598776e-05, "loss": 0.2727, "step": 2160 }, { "epoch": 0.21341988148803817, "grad_norm": 1.1674466133117676, "learning_rate": 1.933990382704765e-05, "loss": 0.3417, "step": 2170 }, { "epoch": 0.21440338324604755, "grad_norm": 1.6989260911941528, "learning_rate": 1.933592973810754e-05, "loss": 0.387, "step": 2180 }, { "epoch": 0.21538688500405695, "grad_norm": 1.3016669750213623, "learning_rate": 1.933195564916743e-05, "loss": 0.3226, "step": 2190 }, { "epoch": 0.21637038676206632, "grad_norm": 0.5776589512825012, "learning_rate": 1.932798156022732e-05, "loss": 0.2674, "step": 2200 }, { "epoch": 0.21735388852007573, "grad_norm": 0.7415688037872314, "learning_rate": 1.932400747128721e-05, "loss": 0.2484, "step": 2210 }, { "epoch": 0.21833739027808513, "grad_norm": 1.439495325088501, "learning_rate": 1.93200333823471e-05, "loss": 0.3143, "step": 2220 }, { "epoch": 0.2193208920360945, "grad_norm": 2.2038726806640625, "learning_rate": 1.931605929340699e-05, "loss": 0.2642, "step": 2230 }, { "epoch": 0.2203043937941039, "grad_norm": 0.8493722081184387, "learning_rate": 1.9312085204466875e-05, "loss": 0.2897, "step": 2240 }, { "epoch": 0.2212878955521133, "grad_norm": 2.087738037109375, "learning_rate": 1.9308111115526765e-05, "loss": 0.3154, "step": 2250 }, { "epoch": 0.2222713973101227, "grad_norm": 1.0796573162078857, "learning_rate": 1.9304137026586655e-05, "loss": 0.2283, "step": 2260 }, { "epoch": 0.2232548990681321, "grad_norm": 1.657051920890808, "learning_rate": 1.9300162937646548e-05, "loss": 0.2957, "step": 2270 }, { "epoch": 0.22423840082614147, "grad_norm": 0.8360177278518677, "learning_rate": 1.9296188848706438e-05, "loss": 0.2801, "step": 2280 }, { "epoch": 0.22522190258415087, "grad_norm": 0.4915439784526825, "learning_rate": 1.9292214759766327e-05, "loss": 0.2689, "step": 2290 }, { "epoch": 0.22620540434216027, "grad_norm": 0.8634927868843079, "learning_rate": 1.9288240670826213e-05, "loss": 0.2637, "step": 2300 }, { "epoch": 0.22718890610016965, "grad_norm": 0.7675266265869141, "learning_rate": 1.9284266581886103e-05, "loss": 0.3316, "step": 2310 }, { "epoch": 0.22817240785817905, "grad_norm": 1.2641220092773438, "learning_rate": 1.9280292492945993e-05, "loss": 0.3831, "step": 2320 }, { "epoch": 0.22915590961618845, "grad_norm": 1.1142165660858154, "learning_rate": 1.9276318404005882e-05, "loss": 0.324, "step": 2330 }, { "epoch": 0.23013941137419783, "grad_norm": 1.5316585302352905, "learning_rate": 1.9272344315065772e-05, "loss": 0.3329, "step": 2340 }, { "epoch": 0.23112291313220723, "grad_norm": 2.2005603313446045, "learning_rate": 1.9268370226125662e-05, "loss": 0.4578, "step": 2350 }, { "epoch": 0.2321064148902166, "grad_norm": 0.6494570374488831, "learning_rate": 1.926439613718555e-05, "loss": 0.2819, "step": 2360 }, { "epoch": 0.233089916648226, "grad_norm": 1.4297327995300293, "learning_rate": 1.926042204824544e-05, "loss": 0.2775, "step": 2370 }, { "epoch": 0.2340734184062354, "grad_norm": 1.6414613723754883, "learning_rate": 1.925644795930533e-05, "loss": 0.3015, "step": 2380 }, { "epoch": 0.2350569201642448, "grad_norm": 1.6433988809585571, "learning_rate": 1.925247387036522e-05, "loss": 0.2282, "step": 2390 }, { "epoch": 0.2360404219222542, "grad_norm": 1.0170117616653442, "learning_rate": 1.924849978142511e-05, "loss": 0.2157, "step": 2400 }, { "epoch": 0.23702392368026357, "grad_norm": 0.700157880783081, "learning_rate": 1.9244525692485e-05, "loss": 0.3026, "step": 2410 }, { "epoch": 0.23800742543827297, "grad_norm": 1.600882887840271, "learning_rate": 1.924055160354489e-05, "loss": 0.2904, "step": 2420 }, { "epoch": 0.23899092719628237, "grad_norm": 1.2791134119033813, "learning_rate": 1.923657751460478e-05, "loss": 0.1481, "step": 2430 }, { "epoch": 0.23997442895429175, "grad_norm": 0.642951488494873, "learning_rate": 1.923260342566467e-05, "loss": 0.2284, "step": 2440 }, { "epoch": 0.24095793071230115, "grad_norm": 1.4791512489318848, "learning_rate": 1.922862933672456e-05, "loss": 0.4285, "step": 2450 }, { "epoch": 0.24194143247031055, "grad_norm": 0.9878162145614624, "learning_rate": 1.9224655247784448e-05, "loss": 0.2794, "step": 2460 }, { "epoch": 0.24292493422831993, "grad_norm": 1.7200347185134888, "learning_rate": 1.9220681158844338e-05, "loss": 0.2782, "step": 2470 }, { "epoch": 0.24390843598632933, "grad_norm": 0.8549171686172485, "learning_rate": 1.9216707069904224e-05, "loss": 0.3878, "step": 2480 }, { "epoch": 0.2448919377443387, "grad_norm": 0.695548951625824, "learning_rate": 1.9212732980964114e-05, "loss": 0.3212, "step": 2490 }, { "epoch": 0.2458754395023481, "grad_norm": 0.5230987668037415, "learning_rate": 1.9208758892024003e-05, "loss": 0.2651, "step": 2500 }, { "epoch": 0.2458754395023481, "eval_loss": 0.1891545057296753, "eval_runtime": 15.2446, "eval_samples_per_second": 3.28, "eval_steps_per_second": 1.64, "step": 2500 }, { "epoch": 0.2468589412603575, "grad_norm": 1.109619379043579, "learning_rate": 1.9204784803083893e-05, "loss": 0.3115, "step": 2510 }, { "epoch": 0.2478424430183669, "grad_norm": 1.729632019996643, "learning_rate": 1.9200810714143786e-05, "loss": 0.3126, "step": 2520 }, { "epoch": 0.2488259447763763, "grad_norm": 2.7448902130126953, "learning_rate": 1.9196836625203676e-05, "loss": 0.3002, "step": 2530 }, { "epoch": 0.2498094465343857, "grad_norm": 1.259028434753418, "learning_rate": 1.9192862536263562e-05, "loss": 0.2244, "step": 2540 }, { "epoch": 0.25079294829239507, "grad_norm": 0.6467012763023376, "learning_rate": 1.9188888447323452e-05, "loss": 0.2983, "step": 2550 }, { "epoch": 0.25177645005040444, "grad_norm": 1.3998416662216187, "learning_rate": 1.918491435838334e-05, "loss": 0.3287, "step": 2560 }, { "epoch": 0.2527599518084139, "grad_norm": 1.7507133483886719, "learning_rate": 1.918094026944323e-05, "loss": 0.286, "step": 2570 }, { "epoch": 0.25374345356642325, "grad_norm": 2.0538573265075684, "learning_rate": 1.917696618050312e-05, "loss": 0.3549, "step": 2580 }, { "epoch": 0.2547269553244326, "grad_norm": 0.7538926005363464, "learning_rate": 1.917299209156301e-05, "loss": 0.309, "step": 2590 }, { "epoch": 0.25571045708244206, "grad_norm": 0.7768136858940125, "learning_rate": 1.91690180026229e-05, "loss": 0.3403, "step": 2600 }, { "epoch": 0.25669395884045143, "grad_norm": 1.5854865312576294, "learning_rate": 1.916504391368279e-05, "loss": 0.3446, "step": 2610 }, { "epoch": 0.2576774605984608, "grad_norm": 0.8464900851249695, "learning_rate": 1.916106982474268e-05, "loss": 0.2657, "step": 2620 }, { "epoch": 0.25866096235647024, "grad_norm": 0.8218358755111694, "learning_rate": 1.915709573580257e-05, "loss": 0.253, "step": 2630 }, { "epoch": 0.2596444641144796, "grad_norm": 1.4568731784820557, "learning_rate": 1.915312164686246e-05, "loss": 0.2408, "step": 2640 }, { "epoch": 0.260627965872489, "grad_norm": 1.706364631652832, "learning_rate": 1.914914755792235e-05, "loss": 0.2368, "step": 2650 }, { "epoch": 0.26161146763049836, "grad_norm": 0.566818356513977, "learning_rate": 1.9145173468982238e-05, "loss": 0.3488, "step": 2660 }, { "epoch": 0.2625949693885078, "grad_norm": 0.9257858991622925, "learning_rate": 1.9141199380042124e-05, "loss": 0.2329, "step": 2670 }, { "epoch": 0.26357847114651717, "grad_norm": 1.2466871738433838, "learning_rate": 1.9137225291102017e-05, "loss": 0.428, "step": 2680 }, { "epoch": 0.26456197290452654, "grad_norm": 0.7756865620613098, "learning_rate": 1.9133251202161907e-05, "loss": 0.3775, "step": 2690 }, { "epoch": 0.265545474662536, "grad_norm": 0.9694525003433228, "learning_rate": 1.9129277113221797e-05, "loss": 0.252, "step": 2700 }, { "epoch": 0.26652897642054535, "grad_norm": 1.9040061235427856, "learning_rate": 1.9125303024281686e-05, "loss": 0.2924, "step": 2710 }, { "epoch": 0.2675124781785547, "grad_norm": 1.2722103595733643, "learning_rate": 1.9121328935341573e-05, "loss": 0.2348, "step": 2720 }, { "epoch": 0.26849597993656416, "grad_norm": 0.8166412115097046, "learning_rate": 1.9117354846401462e-05, "loss": 0.2093, "step": 2730 }, { "epoch": 0.26947948169457353, "grad_norm": 0.6143937110900879, "learning_rate": 1.9113380757461352e-05, "loss": 0.2024, "step": 2740 }, { "epoch": 0.2704629834525829, "grad_norm": 1.0523463487625122, "learning_rate": 1.910940666852124e-05, "loss": 0.3963, "step": 2750 }, { "epoch": 0.27144648521059234, "grad_norm": 1.7578366994857788, "learning_rate": 1.9105432579581135e-05, "loss": 0.3699, "step": 2760 }, { "epoch": 0.2724299869686017, "grad_norm": 0.3840101659297943, "learning_rate": 1.9101458490641024e-05, "loss": 0.3475, "step": 2770 }, { "epoch": 0.2734134887266111, "grad_norm": 1.2118396759033203, "learning_rate": 1.909748440170091e-05, "loss": 0.2155, "step": 2780 }, { "epoch": 0.2743969904846205, "grad_norm": 1.1841282844543457, "learning_rate": 1.90935103127608e-05, "loss": 0.3818, "step": 2790 }, { "epoch": 0.2753804922426299, "grad_norm": 0.46255505084991455, "learning_rate": 1.908953622382069e-05, "loss": 0.3608, "step": 2800 }, { "epoch": 0.27636399400063927, "grad_norm": 1.2726696729660034, "learning_rate": 1.908556213488058e-05, "loss": 0.191, "step": 2810 }, { "epoch": 0.27734749575864864, "grad_norm": 2.2195935249328613, "learning_rate": 1.908158804594047e-05, "loss": 0.2622, "step": 2820 }, { "epoch": 0.2783309975166581, "grad_norm": 0.9706044793128967, "learning_rate": 1.907761395700036e-05, "loss": 0.2811, "step": 2830 }, { "epoch": 0.27931449927466745, "grad_norm": 1.3599534034729004, "learning_rate": 1.907363986806025e-05, "loss": 0.398, "step": 2840 }, { "epoch": 0.2802980010326768, "grad_norm": 0.937558114528656, "learning_rate": 1.9069665779120138e-05, "loss": 0.3319, "step": 2850 }, { "epoch": 0.28128150279068626, "grad_norm": 1.9157116413116455, "learning_rate": 1.9065691690180028e-05, "loss": 0.2936, "step": 2860 }, { "epoch": 0.28226500454869563, "grad_norm": 1.3096779584884644, "learning_rate": 1.9061717601239918e-05, "loss": 0.3654, "step": 2870 }, { "epoch": 0.283248506306705, "grad_norm": 1.887032151222229, "learning_rate": 1.9057743512299807e-05, "loss": 0.364, "step": 2880 }, { "epoch": 0.28423200806471444, "grad_norm": 1.9582734107971191, "learning_rate": 1.9053769423359697e-05, "loss": 0.2359, "step": 2890 }, { "epoch": 0.2852155098227238, "grad_norm": 1.3817726373672485, "learning_rate": 1.9049795334419587e-05, "loss": 0.2605, "step": 2900 }, { "epoch": 0.2861990115807332, "grad_norm": 1.1931792497634888, "learning_rate": 1.9045821245479473e-05, "loss": 0.3133, "step": 2910 }, { "epoch": 0.2871825133387426, "grad_norm": 1.1820030212402344, "learning_rate": 1.9041847156539366e-05, "loss": 0.3351, "step": 2920 }, { "epoch": 0.288166015096752, "grad_norm": 1.4011718034744263, "learning_rate": 1.9037873067599256e-05, "loss": 0.2993, "step": 2930 }, { "epoch": 0.28914951685476137, "grad_norm": 1.7731127738952637, "learning_rate": 1.9033898978659145e-05, "loss": 0.304, "step": 2940 }, { "epoch": 0.29013301861277074, "grad_norm": 1.2245922088623047, "learning_rate": 1.9029924889719035e-05, "loss": 0.3627, "step": 2950 }, { "epoch": 0.2911165203707802, "grad_norm": 1.4606093168258667, "learning_rate": 1.902595080077892e-05, "loss": 0.3513, "step": 2960 }, { "epoch": 0.29210002212878955, "grad_norm": 1.4176182746887207, "learning_rate": 1.902197671183881e-05, "loss": 0.3211, "step": 2970 }, { "epoch": 0.2930835238867989, "grad_norm": 0.8106399178504944, "learning_rate": 1.90180026228987e-05, "loss": 0.3057, "step": 2980 }, { "epoch": 0.29406702564480836, "grad_norm": 1.1231039762496948, "learning_rate": 1.901402853395859e-05, "loss": 0.3033, "step": 2990 }, { "epoch": 0.29505052740281773, "grad_norm": 0.3606371581554413, "learning_rate": 1.9010054445018483e-05, "loss": 0.2825, "step": 3000 }, { "epoch": 0.29505052740281773, "eval_loss": 0.18925747275352478, "eval_runtime": 18.2369, "eval_samples_per_second": 2.742, "eval_steps_per_second": 1.371, "step": 3000 }, { "epoch": 0.2960340291608271, "grad_norm": 0.6441012620925903, "learning_rate": 1.9006080356078373e-05, "loss": 0.2882, "step": 3010 }, { "epoch": 0.29701753091883654, "grad_norm": 1.2217457294464111, "learning_rate": 1.900210626713826e-05, "loss": 0.2755, "step": 3020 }, { "epoch": 0.2980010326768459, "grad_norm": 1.9655147790908813, "learning_rate": 1.899813217819815e-05, "loss": 0.2335, "step": 3030 }, { "epoch": 0.2989845344348553, "grad_norm": 0.8391982913017273, "learning_rate": 1.899415808925804e-05, "loss": 0.2076, "step": 3040 }, { "epoch": 0.2999680361928647, "grad_norm": 1.5566798448562622, "learning_rate": 1.8990184000317928e-05, "loss": 0.2677, "step": 3050 }, { "epoch": 0.3009515379508741, "grad_norm": 1.858270525932312, "learning_rate": 1.8986209911377818e-05, "loss": 0.4234, "step": 3060 }, { "epoch": 0.30193503970888347, "grad_norm": 1.1900537014007568, "learning_rate": 1.8982235822437708e-05, "loss": 0.2877, "step": 3070 }, { "epoch": 0.3029185414668929, "grad_norm": 2.3156630992889404, "learning_rate": 1.8978261733497597e-05, "loss": 0.4231, "step": 3080 }, { "epoch": 0.3039020432249023, "grad_norm": 1.8294111490249634, "learning_rate": 1.8974287644557487e-05, "loss": 0.2523, "step": 3090 }, { "epoch": 0.30488554498291165, "grad_norm": 1.9268901348114014, "learning_rate": 1.8970313555617377e-05, "loss": 0.2829, "step": 3100 }, { "epoch": 0.305869046740921, "grad_norm": 1.6466792821884155, "learning_rate": 1.8966339466677266e-05, "loss": 0.2802, "step": 3110 }, { "epoch": 0.30685254849893046, "grad_norm": 0.9668562412261963, "learning_rate": 1.8962365377737156e-05, "loss": 0.2447, "step": 3120 }, { "epoch": 0.30783605025693983, "grad_norm": 1.3234363794326782, "learning_rate": 1.8958391288797045e-05, "loss": 0.3151, "step": 3130 }, { "epoch": 0.3088195520149492, "grad_norm": 0.7863141894340515, "learning_rate": 1.8954417199856935e-05, "loss": 0.2525, "step": 3140 }, { "epoch": 0.30980305377295864, "grad_norm": 1.0784416198730469, "learning_rate": 1.895044311091682e-05, "loss": 0.3044, "step": 3150 }, { "epoch": 0.310786555530968, "grad_norm": 1.47318696975708, "learning_rate": 1.8946469021976714e-05, "loss": 0.2453, "step": 3160 }, { "epoch": 0.3117700572889774, "grad_norm": 0.8541170954704285, "learning_rate": 1.8942494933036604e-05, "loss": 0.2838, "step": 3170 }, { "epoch": 0.3127535590469868, "grad_norm": 1.8033347129821777, "learning_rate": 1.8938520844096494e-05, "loss": 0.2572, "step": 3180 }, { "epoch": 0.3137370608049962, "grad_norm": 0.6662768125534058, "learning_rate": 1.8934546755156383e-05, "loss": 0.2875, "step": 3190 }, { "epoch": 0.31472056256300557, "grad_norm": 1.5999656915664673, "learning_rate": 1.893057266621627e-05, "loss": 0.3892, "step": 3200 }, { "epoch": 0.315704064321015, "grad_norm": 1.12599778175354, "learning_rate": 1.892659857727616e-05, "loss": 0.3817, "step": 3210 }, { "epoch": 0.3166875660790244, "grad_norm": 0.5991163849830627, "learning_rate": 1.892262448833605e-05, "loss": 0.3791, "step": 3220 }, { "epoch": 0.31767106783703375, "grad_norm": 2.04829478263855, "learning_rate": 1.891865039939594e-05, "loss": 0.2998, "step": 3230 }, { "epoch": 0.3186545695950431, "grad_norm": 1.1123751401901245, "learning_rate": 1.8914676310455832e-05, "loss": 0.2266, "step": 3240 }, { "epoch": 0.31963807135305256, "grad_norm": 2.765881299972534, "learning_rate": 1.891070222151572e-05, "loss": 0.3068, "step": 3250 }, { "epoch": 0.32062157311106193, "grad_norm": 0.9850254058837891, "learning_rate": 1.8906728132575608e-05, "loss": 0.1555, "step": 3260 }, { "epoch": 0.3216050748690713, "grad_norm": 0.6333345174789429, "learning_rate": 1.8902754043635497e-05, "loss": 0.2767, "step": 3270 }, { "epoch": 0.32258857662708074, "grad_norm": 1.5807948112487793, "learning_rate": 1.8898779954695387e-05, "loss": 0.4006, "step": 3280 }, { "epoch": 0.3235720783850901, "grad_norm": 0.7924546003341675, "learning_rate": 1.8894805865755277e-05, "loss": 0.3417, "step": 3290 }, { "epoch": 0.3245555801430995, "grad_norm": 0.6605357527732849, "learning_rate": 1.8890831776815166e-05, "loss": 0.2169, "step": 3300 }, { "epoch": 0.3255390819011089, "grad_norm": 1.03006112575531, "learning_rate": 1.8886857687875056e-05, "loss": 0.3255, "step": 3310 }, { "epoch": 0.3265225836591183, "grad_norm": 1.244502305984497, "learning_rate": 1.8882883598934946e-05, "loss": 0.2502, "step": 3320 }, { "epoch": 0.32750608541712767, "grad_norm": 2.606926679611206, "learning_rate": 1.8878909509994835e-05, "loss": 0.344, "step": 3330 }, { "epoch": 0.3284895871751371, "grad_norm": 0.2963784337043762, "learning_rate": 1.8874935421054725e-05, "loss": 0.371, "step": 3340 }, { "epoch": 0.3294730889331465, "grad_norm": 1.087789535522461, "learning_rate": 1.8870961332114615e-05, "loss": 0.2594, "step": 3350 }, { "epoch": 0.33045659069115585, "grad_norm": 0.8091050386428833, "learning_rate": 1.8866987243174504e-05, "loss": 0.3005, "step": 3360 }, { "epoch": 0.3314400924491653, "grad_norm": 0.6955603957176208, "learning_rate": 1.8863013154234394e-05, "loss": 0.2407, "step": 3370 }, { "epoch": 0.33242359420717466, "grad_norm": 1.2869349718093872, "learning_rate": 1.8859039065294284e-05, "loss": 0.2972, "step": 3380 }, { "epoch": 0.33340709596518403, "grad_norm": 1.0467299222946167, "learning_rate": 1.885506497635417e-05, "loss": 0.3742, "step": 3390 }, { "epoch": 0.3343905977231934, "grad_norm": 1.2822316884994507, "learning_rate": 1.885109088741406e-05, "loss": 0.2625, "step": 3400 }, { "epoch": 0.33537409948120284, "grad_norm": 1.278634786605835, "learning_rate": 1.8847116798473953e-05, "loss": 0.3095, "step": 3410 }, { "epoch": 0.3363576012392122, "grad_norm": 0.4241173267364502, "learning_rate": 1.8843142709533842e-05, "loss": 0.3163, "step": 3420 }, { "epoch": 0.3373411029972216, "grad_norm": 1.171540379524231, "learning_rate": 1.8839168620593732e-05, "loss": 0.3011, "step": 3430 }, { "epoch": 0.338324604755231, "grad_norm": 1.1309036016464233, "learning_rate": 1.883519453165362e-05, "loss": 0.3909, "step": 3440 }, { "epoch": 0.3393081065132404, "grad_norm": 1.4988631010055542, "learning_rate": 1.8831220442713508e-05, "loss": 0.3911, "step": 3450 }, { "epoch": 0.34029160827124977, "grad_norm": 0.8498252630233765, "learning_rate": 1.8827246353773398e-05, "loss": 0.2321, "step": 3460 }, { "epoch": 0.3412751100292592, "grad_norm": 3.627526044845581, "learning_rate": 1.8823272264833287e-05, "loss": 0.3124, "step": 3470 }, { "epoch": 0.3422586117872686, "grad_norm": 0.695372998714447, "learning_rate": 1.8819298175893177e-05, "loss": 0.3233, "step": 3480 }, { "epoch": 0.34324211354527795, "grad_norm": 0.9676603078842163, "learning_rate": 1.881532408695307e-05, "loss": 0.2751, "step": 3490 }, { "epoch": 0.3442256153032874, "grad_norm": 2.4137444496154785, "learning_rate": 1.8811349998012956e-05, "loss": 0.239, "step": 3500 }, { "epoch": 0.3442256153032874, "eval_loss": 0.18515843152999878, "eval_runtime": 18.8557, "eval_samples_per_second": 2.652, "eval_steps_per_second": 1.326, "step": 3500 }, { "epoch": 0.34520911706129676, "grad_norm": 0.8635812401771545, "learning_rate": 1.8807375909072846e-05, "loss": 0.2727, "step": 3510 }, { "epoch": 0.34619261881930613, "grad_norm": 0.8130320906639099, "learning_rate": 1.8803401820132736e-05, "loss": 0.3389, "step": 3520 }, { "epoch": 0.3471761205773155, "grad_norm": 0.8832435011863708, "learning_rate": 1.8799427731192625e-05, "loss": 0.3392, "step": 3530 }, { "epoch": 0.34815962233532494, "grad_norm": 0.41087064146995544, "learning_rate": 1.8795453642252515e-05, "loss": 0.3043, "step": 3540 }, { "epoch": 0.3491431240933343, "grad_norm": 0.865750253200531, "learning_rate": 1.8791479553312405e-05, "loss": 0.3286, "step": 3550 }, { "epoch": 0.3501266258513437, "grad_norm": 2.180868625640869, "learning_rate": 1.8787505464372294e-05, "loss": 0.2891, "step": 3560 }, { "epoch": 0.3511101276093531, "grad_norm": 1.0887324810028076, "learning_rate": 1.8783531375432184e-05, "loss": 0.3794, "step": 3570 }, { "epoch": 0.3520936293673625, "grad_norm": 1.2184492349624634, "learning_rate": 1.8779557286492074e-05, "loss": 0.2399, "step": 3580 }, { "epoch": 0.35307713112537187, "grad_norm": 1.5976436138153076, "learning_rate": 1.8775583197551963e-05, "loss": 0.321, "step": 3590 }, { "epoch": 0.3540606328833813, "grad_norm": 0.7627467513084412, "learning_rate": 1.8771609108611853e-05, "loss": 0.368, "step": 3600 }, { "epoch": 0.3550441346413907, "grad_norm": 0.7713624238967896, "learning_rate": 1.8767635019671743e-05, "loss": 0.3409, "step": 3610 }, { "epoch": 0.35602763639940005, "grad_norm": 0.6090192198753357, "learning_rate": 1.8763660930731632e-05, "loss": 0.236, "step": 3620 }, { "epoch": 0.3570111381574095, "grad_norm": 1.4962300062179565, "learning_rate": 1.875968684179152e-05, "loss": 0.1852, "step": 3630 }, { "epoch": 0.35799463991541886, "grad_norm": 1.274659276008606, "learning_rate": 1.8755712752851408e-05, "loss": 0.2773, "step": 3640 }, { "epoch": 0.35897814167342823, "grad_norm": 1.3212956190109253, "learning_rate": 1.87517386639113e-05, "loss": 0.221, "step": 3650 }, { "epoch": 0.35996164343143766, "grad_norm": 1.0321497917175293, "learning_rate": 1.874776457497119e-05, "loss": 0.3394, "step": 3660 }, { "epoch": 0.36094514518944704, "grad_norm": 0.7226223349571228, "learning_rate": 1.874379048603108e-05, "loss": 0.246, "step": 3670 }, { "epoch": 0.3619286469474564, "grad_norm": 2.584559917449951, "learning_rate": 1.8739816397090967e-05, "loss": 0.1398, "step": 3680 }, { "epoch": 0.3629121487054658, "grad_norm": 0.9170656800270081, "learning_rate": 1.8735842308150857e-05, "loss": 0.1937, "step": 3690 }, { "epoch": 0.3638956504634752, "grad_norm": 1.994689702987671, "learning_rate": 1.8731868219210746e-05, "loss": 0.3006, "step": 3700 }, { "epoch": 0.3648791522214846, "grad_norm": 1.0057986974716187, "learning_rate": 1.8727894130270636e-05, "loss": 0.2274, "step": 3710 }, { "epoch": 0.36586265397949397, "grad_norm": 0.529000461101532, "learning_rate": 1.8723920041330526e-05, "loss": 0.2544, "step": 3720 }, { "epoch": 0.3668461557375034, "grad_norm": 2.1913716793060303, "learning_rate": 1.871994595239042e-05, "loss": 0.2118, "step": 3730 }, { "epoch": 0.3678296574955128, "grad_norm": 1.2990810871124268, "learning_rate": 1.8715971863450305e-05, "loss": 0.3832, "step": 3740 }, { "epoch": 0.36881315925352215, "grad_norm": 1.4489165544509888, "learning_rate": 1.8711997774510195e-05, "loss": 0.3792, "step": 3750 }, { "epoch": 0.3697966610115316, "grad_norm": 1.727668285369873, "learning_rate": 1.8708023685570084e-05, "loss": 0.3826, "step": 3760 }, { "epoch": 0.37078016276954096, "grad_norm": 1.3508563041687012, "learning_rate": 1.8704049596629974e-05, "loss": 0.3114, "step": 3770 }, { "epoch": 0.37176366452755033, "grad_norm": 2.063981533050537, "learning_rate": 1.8700075507689864e-05, "loss": 0.1861, "step": 3780 }, { "epoch": 0.37274716628555976, "grad_norm": 1.491122841835022, "learning_rate": 1.8696101418749753e-05, "loss": 0.315, "step": 3790 }, { "epoch": 0.37373066804356914, "grad_norm": 1.6444443464279175, "learning_rate": 1.8692127329809643e-05, "loss": 0.3538, "step": 3800 }, { "epoch": 0.3747141698015785, "grad_norm": 1.7396812438964844, "learning_rate": 1.8688153240869533e-05, "loss": 0.2084, "step": 3810 }, { "epoch": 0.3756976715595879, "grad_norm": 1.6162070035934448, "learning_rate": 1.8684179151929422e-05, "loss": 0.3024, "step": 3820 }, { "epoch": 0.3766811733175973, "grad_norm": 1.6525781154632568, "learning_rate": 1.8680205062989312e-05, "loss": 0.3075, "step": 3830 }, { "epoch": 0.3776646750756067, "grad_norm": 0.7809441089630127, "learning_rate": 1.86762309740492e-05, "loss": 0.33, "step": 3840 }, { "epoch": 0.37864817683361607, "grad_norm": 1.2998336553573608, "learning_rate": 1.867225688510909e-05, "loss": 0.2978, "step": 3850 }, { "epoch": 0.3796316785916255, "grad_norm": 0.5490562319755554, "learning_rate": 1.866828279616898e-05, "loss": 0.2625, "step": 3860 }, { "epoch": 0.3806151803496349, "grad_norm": 1.808087944984436, "learning_rate": 1.8664308707228867e-05, "loss": 0.1814, "step": 3870 }, { "epoch": 0.38159868210764425, "grad_norm": 1.9151523113250732, "learning_rate": 1.8660334618288757e-05, "loss": 0.2716, "step": 3880 }, { "epoch": 0.3825821838656537, "grad_norm": 1.526478886604309, "learning_rate": 1.865636052934865e-05, "loss": 0.2755, "step": 3890 }, { "epoch": 0.38356568562366306, "grad_norm": 0.8774662017822266, "learning_rate": 1.865238644040854e-05, "loss": 0.3243, "step": 3900 }, { "epoch": 0.38454918738167243, "grad_norm": 1.8767133951187134, "learning_rate": 1.864841235146843e-05, "loss": 0.3164, "step": 3910 }, { "epoch": 0.38553268913968186, "grad_norm": 1.6160063743591309, "learning_rate": 1.8644438262528315e-05, "loss": 0.34, "step": 3920 }, { "epoch": 0.38651619089769124, "grad_norm": 0.7355303764343262, "learning_rate": 1.8640464173588205e-05, "loss": 0.3179, "step": 3930 }, { "epoch": 0.3874996926557006, "grad_norm": 0.8210369348526001, "learning_rate": 1.8636490084648095e-05, "loss": 0.2245, "step": 3940 }, { "epoch": 0.38848319441371, "grad_norm": 2.0436363220214844, "learning_rate": 1.8632515995707984e-05, "loss": 0.3267, "step": 3950 }, { "epoch": 0.3894666961717194, "grad_norm": 1.3316550254821777, "learning_rate": 1.8628541906767874e-05, "loss": 0.2627, "step": 3960 }, { "epoch": 0.3904501979297288, "grad_norm": 2.4543538093566895, "learning_rate": 1.8624567817827767e-05, "loss": 0.3055, "step": 3970 }, { "epoch": 0.39143369968773817, "grad_norm": 1.4763861894607544, "learning_rate": 1.8620593728887653e-05, "loss": 0.224, "step": 3980 }, { "epoch": 0.3924172014457476, "grad_norm": 2.1367290019989014, "learning_rate": 1.8616619639947543e-05, "loss": 0.2971, "step": 3990 }, { "epoch": 0.393400703203757, "grad_norm": 0.5473704934120178, "learning_rate": 1.8612645551007433e-05, "loss": 0.1911, "step": 4000 }, { "epoch": 0.393400703203757, "eval_loss": 0.18661783635616302, "eval_runtime": 18.7545, "eval_samples_per_second": 2.666, "eval_steps_per_second": 1.333, "step": 4000 }, { "epoch": 0.39438420496176635, "grad_norm": 0.6994380950927734, "learning_rate": 1.8608671462067322e-05, "loss": 0.2789, "step": 4010 }, { "epoch": 0.3953677067197758, "grad_norm": 0.8082060813903809, "learning_rate": 1.8604697373127212e-05, "loss": 0.2897, "step": 4020 }, { "epoch": 0.39635120847778516, "grad_norm": 1.6137787103652954, "learning_rate": 1.8600723284187102e-05, "loss": 0.3562, "step": 4030 }, { "epoch": 0.39733471023579453, "grad_norm": 0.4196963310241699, "learning_rate": 1.859674919524699e-05, "loss": 0.1581, "step": 4040 }, { "epoch": 0.39831821199380396, "grad_norm": 1.2669254541397095, "learning_rate": 1.859277510630688e-05, "loss": 0.2467, "step": 4050 }, { "epoch": 0.39930171375181334, "grad_norm": 0.7445014715194702, "learning_rate": 1.858880101736677e-05, "loss": 0.3687, "step": 4060 }, { "epoch": 0.4002852155098227, "grad_norm": 1.2076808214187622, "learning_rate": 1.858482692842666e-05, "loss": 0.2203, "step": 4070 }, { "epoch": 0.40126871726783214, "grad_norm": 1.9768540859222412, "learning_rate": 1.858085283948655e-05, "loss": 0.2688, "step": 4080 }, { "epoch": 0.4022522190258415, "grad_norm": 1.1903109550476074, "learning_rate": 1.857687875054644e-05, "loss": 0.2881, "step": 4090 }, { "epoch": 0.4032357207838509, "grad_norm": 1.5276085138320923, "learning_rate": 1.857290466160633e-05, "loss": 0.2557, "step": 4100 }, { "epoch": 0.40421922254186027, "grad_norm": 2.2157998085021973, "learning_rate": 1.8568930572666216e-05, "loss": 0.298, "step": 4110 }, { "epoch": 0.4052027242998697, "grad_norm": 1.0562926530838013, "learning_rate": 1.8564956483726105e-05, "loss": 0.2623, "step": 4120 }, { "epoch": 0.4061862260578791, "grad_norm": 2.135159730911255, "learning_rate": 1.8560982394785995e-05, "loss": 0.2718, "step": 4130 }, { "epoch": 0.40716972781588845, "grad_norm": 1.4359196424484253, "learning_rate": 1.8557008305845888e-05, "loss": 0.2623, "step": 4140 }, { "epoch": 0.4081532295738979, "grad_norm": 0.8158294558525085, "learning_rate": 1.8553034216905778e-05, "loss": 0.2514, "step": 4150 }, { "epoch": 0.40913673133190726, "grad_norm": 1.8092743158340454, "learning_rate": 1.8549060127965664e-05, "loss": 0.5151, "step": 4160 }, { "epoch": 0.41012023308991663, "grad_norm": 1.847460389137268, "learning_rate": 1.8545086039025554e-05, "loss": 0.1929, "step": 4170 }, { "epoch": 0.41110373484792606, "grad_norm": 1.5607435703277588, "learning_rate": 1.8541111950085443e-05, "loss": 0.2819, "step": 4180 }, { "epoch": 0.41208723660593544, "grad_norm": 0.7382811903953552, "learning_rate": 1.8537137861145333e-05, "loss": 0.2232, "step": 4190 }, { "epoch": 0.4130707383639448, "grad_norm": 1.0930057764053345, "learning_rate": 1.8533163772205223e-05, "loss": 0.2593, "step": 4200 }, { "epoch": 0.41405424012195424, "grad_norm": 4.021615505218506, "learning_rate": 1.8529189683265112e-05, "loss": 0.3129, "step": 4210 }, { "epoch": 0.4150377418799636, "grad_norm": 0.39899012446403503, "learning_rate": 1.8525215594325002e-05, "loss": 0.2396, "step": 4220 }, { "epoch": 0.416021243637973, "grad_norm": 1.7783839702606201, "learning_rate": 1.852124150538489e-05, "loss": 0.2734, "step": 4230 }, { "epoch": 0.41700474539598237, "grad_norm": 1.0922350883483887, "learning_rate": 1.851726741644478e-05, "loss": 0.2591, "step": 4240 }, { "epoch": 0.4179882471539918, "grad_norm": 1.4296014308929443, "learning_rate": 1.851329332750467e-05, "loss": 0.2808, "step": 4250 }, { "epoch": 0.4189717489120012, "grad_norm": 1.9061700105667114, "learning_rate": 1.850931923856456e-05, "loss": 0.3218, "step": 4260 }, { "epoch": 0.41995525067001055, "grad_norm": 2.600834608078003, "learning_rate": 1.850534514962445e-05, "loss": 0.3231, "step": 4270 }, { "epoch": 0.42093875242802, "grad_norm": 1.5851668119430542, "learning_rate": 1.850137106068434e-05, "loss": 0.3265, "step": 4280 }, { "epoch": 0.42192225418602936, "grad_norm": 2.13275146484375, "learning_rate": 1.849739697174423e-05, "loss": 0.2575, "step": 4290 }, { "epoch": 0.42290575594403873, "grad_norm": 1.6749345064163208, "learning_rate": 1.849342288280412e-05, "loss": 0.3271, "step": 4300 }, { "epoch": 0.42388925770204816, "grad_norm": 0.5474580526351929, "learning_rate": 1.848944879386401e-05, "loss": 0.2267, "step": 4310 }, { "epoch": 0.42487275946005754, "grad_norm": 0.8282570242881775, "learning_rate": 1.84854747049239e-05, "loss": 0.321, "step": 4320 }, { "epoch": 0.4258562612180669, "grad_norm": 0.44433537125587463, "learning_rate": 1.848150061598379e-05, "loss": 0.297, "step": 4330 }, { "epoch": 0.42683976297607634, "grad_norm": 1.6867755651474, "learning_rate": 1.8477526527043678e-05, "loss": 0.3186, "step": 4340 }, { "epoch": 0.4278232647340857, "grad_norm": 1.451416254043579, "learning_rate": 1.8473552438103564e-05, "loss": 0.2344, "step": 4350 }, { "epoch": 0.4288067664920951, "grad_norm": 0.8302897214889526, "learning_rate": 1.8469578349163454e-05, "loss": 0.2629, "step": 4360 }, { "epoch": 0.4297902682501045, "grad_norm": 1.7050567865371704, "learning_rate": 1.8465604260223344e-05, "loss": 0.3174, "step": 4370 }, { "epoch": 0.4307737700081139, "grad_norm": 1.5906550884246826, "learning_rate": 1.8461630171283237e-05, "loss": 0.3394, "step": 4380 }, { "epoch": 0.4317572717661233, "grad_norm": 0.8430159091949463, "learning_rate": 1.8457656082343126e-05, "loss": 0.3178, "step": 4390 }, { "epoch": 0.43274077352413265, "grad_norm": 0.9703214168548584, "learning_rate": 1.8453681993403013e-05, "loss": 0.2287, "step": 4400 }, { "epoch": 0.4337242752821421, "grad_norm": 0.5622348189353943, "learning_rate": 1.8449707904462902e-05, "loss": 0.298, "step": 4410 }, { "epoch": 0.43470777704015146, "grad_norm": 1.1415741443634033, "learning_rate": 1.8445733815522792e-05, "loss": 0.1874, "step": 4420 }, { "epoch": 0.43569127879816083, "grad_norm": 1.2584352493286133, "learning_rate": 1.844175972658268e-05, "loss": 0.307, "step": 4430 }, { "epoch": 0.43667478055617026, "grad_norm": 1.9526134729385376, "learning_rate": 1.843778563764257e-05, "loss": 0.1829, "step": 4440 }, { "epoch": 0.43765828231417964, "grad_norm": 2.4884955883026123, "learning_rate": 1.843381154870246e-05, "loss": 0.256, "step": 4450 }, { "epoch": 0.438641784072189, "grad_norm": 0.5439223647117615, "learning_rate": 1.842983745976235e-05, "loss": 0.2438, "step": 4460 }, { "epoch": 0.43962528583019844, "grad_norm": 2.7485218048095703, "learning_rate": 1.842586337082224e-05, "loss": 0.288, "step": 4470 }, { "epoch": 0.4406087875882078, "grad_norm": 0.49018505215644836, "learning_rate": 1.842188928188213e-05, "loss": 0.237, "step": 4480 }, { "epoch": 0.4415922893462172, "grad_norm": 2.2588608264923096, "learning_rate": 1.841791519294202e-05, "loss": 0.2408, "step": 4490 }, { "epoch": 0.4425757911042266, "grad_norm": 1.4313758611679077, "learning_rate": 1.841394110400191e-05, "loss": 0.2917, "step": 4500 }, { "epoch": 0.4425757911042266, "eval_loss": 0.17805011570453644, "eval_runtime": 18.5759, "eval_samples_per_second": 2.692, "eval_steps_per_second": 1.346, "step": 4500 }, { "epoch": 0.443559292862236, "grad_norm": 1.1929224729537964, "learning_rate": 1.84099670150618e-05, "loss": 0.2371, "step": 4510 }, { "epoch": 0.4445427946202454, "grad_norm": 1.5835803747177124, "learning_rate": 1.840599292612169e-05, "loss": 0.273, "step": 4520 }, { "epoch": 0.44552629637825475, "grad_norm": 1.1826454401016235, "learning_rate": 1.8402018837181578e-05, "loss": 0.2249, "step": 4530 }, { "epoch": 0.4465097981362642, "grad_norm": 0.6066482663154602, "learning_rate": 1.8398044748241468e-05, "loss": 0.2553, "step": 4540 }, { "epoch": 0.44749329989427356, "grad_norm": 0.774614691734314, "learning_rate": 1.8394070659301358e-05, "loss": 0.1926, "step": 4550 }, { "epoch": 0.44847680165228293, "grad_norm": 1.106279969215393, "learning_rate": 1.8390096570361247e-05, "loss": 0.2636, "step": 4560 }, { "epoch": 0.44946030341029236, "grad_norm": 1.2709283828735352, "learning_rate": 1.8386122481421137e-05, "loss": 0.2895, "step": 4570 }, { "epoch": 0.45044380516830174, "grad_norm": 0.7416197657585144, "learning_rate": 1.8382148392481027e-05, "loss": 0.2984, "step": 4580 }, { "epoch": 0.4514273069263111, "grad_norm": 1.7245205640792847, "learning_rate": 1.8378174303540913e-05, "loss": 0.2593, "step": 4590 }, { "epoch": 0.45241080868432054, "grad_norm": 1.1603763103485107, "learning_rate": 1.8374200214600802e-05, "loss": 0.3668, "step": 4600 }, { "epoch": 0.4533943104423299, "grad_norm": 4.167576313018799, "learning_rate": 1.8370226125660692e-05, "loss": 0.2337, "step": 4610 }, { "epoch": 0.4543778122003393, "grad_norm": 2.9600322246551514, "learning_rate": 1.8366252036720585e-05, "loss": 0.2843, "step": 4620 }, { "epoch": 0.4553613139583487, "grad_norm": 0.6669513583183289, "learning_rate": 1.8362277947780475e-05, "loss": 0.2346, "step": 4630 }, { "epoch": 0.4563448157163581, "grad_norm": 1.5543279647827148, "learning_rate": 1.835830385884036e-05, "loss": 0.2542, "step": 4640 }, { "epoch": 0.4573283174743675, "grad_norm": 1.5348154306411743, "learning_rate": 1.835432976990025e-05, "loss": 0.268, "step": 4650 }, { "epoch": 0.4583118192323769, "grad_norm": 2.105473518371582, "learning_rate": 1.835035568096014e-05, "loss": 0.2747, "step": 4660 }, { "epoch": 0.4592953209903863, "grad_norm": 1.7820807695388794, "learning_rate": 1.834638159202003e-05, "loss": 0.2834, "step": 4670 }, { "epoch": 0.46027882274839566, "grad_norm": 1.1379271745681763, "learning_rate": 1.834240750307992e-05, "loss": 0.2521, "step": 4680 }, { "epoch": 0.46126232450640503, "grad_norm": 1.2687854766845703, "learning_rate": 1.833843341413981e-05, "loss": 0.304, "step": 4690 }, { "epoch": 0.46224582626441446, "grad_norm": 0.5946239829063416, "learning_rate": 1.83344593251997e-05, "loss": 0.3317, "step": 4700 }, { "epoch": 0.46322932802242384, "grad_norm": 1.261626958847046, "learning_rate": 1.833048523625959e-05, "loss": 0.3162, "step": 4710 }, { "epoch": 0.4642128297804332, "grad_norm": 2.5874712467193604, "learning_rate": 1.832651114731948e-05, "loss": 0.2443, "step": 4720 }, { "epoch": 0.46519633153844264, "grad_norm": 0.868843138217926, "learning_rate": 1.8322537058379368e-05, "loss": 0.2325, "step": 4730 }, { "epoch": 0.466179833296452, "grad_norm": 3.2576451301574707, "learning_rate": 1.8318562969439258e-05, "loss": 0.3519, "step": 4740 }, { "epoch": 0.4671633350544614, "grad_norm": 1.2640223503112793, "learning_rate": 1.8314588880499147e-05, "loss": 0.2424, "step": 4750 }, { "epoch": 0.4681468368124708, "grad_norm": 2.574295997619629, "learning_rate": 1.8310614791559037e-05, "loss": 0.32, "step": 4760 }, { "epoch": 0.4691303385704802, "grad_norm": 1.2373532056808472, "learning_rate": 1.8306640702618927e-05, "loss": 0.3316, "step": 4770 }, { "epoch": 0.4701138403284896, "grad_norm": 1.2597739696502686, "learning_rate": 1.8302666613678816e-05, "loss": 0.32, "step": 4780 }, { "epoch": 0.471097342086499, "grad_norm": 0.7831315994262695, "learning_rate": 1.8298692524738706e-05, "loss": 0.2963, "step": 4790 }, { "epoch": 0.4720808438445084, "grad_norm": 0.567329466342926, "learning_rate": 1.8294718435798596e-05, "loss": 0.1686, "step": 4800 }, { "epoch": 0.47306434560251776, "grad_norm": 1.0442653894424438, "learning_rate": 1.8290744346858485e-05, "loss": 0.1783, "step": 4810 }, { "epoch": 0.47404784736052713, "grad_norm": 1.4119641780853271, "learning_rate": 1.8286770257918375e-05, "loss": 0.2443, "step": 4820 }, { "epoch": 0.47503134911853656, "grad_norm": 2.7391903400421143, "learning_rate": 1.828279616897826e-05, "loss": 0.3288, "step": 4830 }, { "epoch": 0.47601485087654594, "grad_norm": 2.836953639984131, "learning_rate": 1.827882208003815e-05, "loss": 0.2084, "step": 4840 }, { "epoch": 0.4769983526345553, "grad_norm": 0.7899764180183411, "learning_rate": 1.827484799109804e-05, "loss": 0.2799, "step": 4850 }, { "epoch": 0.47798185439256474, "grad_norm": 1.3200337886810303, "learning_rate": 1.8270873902157934e-05, "loss": 0.1918, "step": 4860 }, { "epoch": 0.4789653561505741, "grad_norm": 0.9252869486808777, "learning_rate": 1.8266899813217823e-05, "loss": 0.3871, "step": 4870 }, { "epoch": 0.4799488579085835, "grad_norm": 0.5087369084358215, "learning_rate": 1.826292572427771e-05, "loss": 0.2481, "step": 4880 }, { "epoch": 0.4809323596665929, "grad_norm": 0.6862156987190247, "learning_rate": 1.82589516353376e-05, "loss": 0.313, "step": 4890 }, { "epoch": 0.4819158614246023, "grad_norm": 2.2030715942382812, "learning_rate": 1.825497754639749e-05, "loss": 0.2758, "step": 4900 }, { "epoch": 0.4828993631826117, "grad_norm": 1.4650092124938965, "learning_rate": 1.825100345745738e-05, "loss": 0.2371, "step": 4910 }, { "epoch": 0.4838828649406211, "grad_norm": 1.1407153606414795, "learning_rate": 1.824702936851727e-05, "loss": 0.2681, "step": 4920 }, { "epoch": 0.4848663666986305, "grad_norm": 0.8102698922157288, "learning_rate": 1.8243055279577158e-05, "loss": 0.3045, "step": 4930 }, { "epoch": 0.48584986845663986, "grad_norm": 2.678874969482422, "learning_rate": 1.8239081190637048e-05, "loss": 0.296, "step": 4940 }, { "epoch": 0.4868333702146493, "grad_norm": 2.3300135135650635, "learning_rate": 1.8235107101696937e-05, "loss": 0.3185, "step": 4950 }, { "epoch": 0.48781687197265866, "grad_norm": 0.7017766833305359, "learning_rate": 1.8231133012756827e-05, "loss": 0.2241, "step": 4960 }, { "epoch": 0.48880037373066804, "grad_norm": 0.9197968244552612, "learning_rate": 1.8227158923816717e-05, "loss": 0.2869, "step": 4970 }, { "epoch": 0.4897838754886774, "grad_norm": 1.4702986478805542, "learning_rate": 1.8223184834876606e-05, "loss": 0.2739, "step": 4980 }, { "epoch": 0.49076737724668684, "grad_norm": 0.45094242691993713, "learning_rate": 1.8219210745936496e-05, "loss": 0.2225, "step": 4990 }, { "epoch": 0.4917508790046962, "grad_norm": 2.3170652389526367, "learning_rate": 1.8215236656996386e-05, "loss": 0.2958, "step": 5000 }, { "epoch": 0.4917508790046962, "eval_loss": 0.17952224612236023, "eval_runtime": 16.9014, "eval_samples_per_second": 2.958, "eval_steps_per_second": 1.479, "step": 5000 }, { "epoch": 0.4927343807627056, "grad_norm": 0.6945057511329651, "learning_rate": 1.8211262568056275e-05, "loss": 0.2397, "step": 5010 }, { "epoch": 0.493717882520715, "grad_norm": 1.9027206897735596, "learning_rate": 1.820728847911616e-05, "loss": 0.2174, "step": 5020 }, { "epoch": 0.4947013842787244, "grad_norm": 0.959458589553833, "learning_rate": 1.8203314390176055e-05, "loss": 0.2665, "step": 5030 }, { "epoch": 0.4956848860367338, "grad_norm": 2.319744348526001, "learning_rate": 1.8199340301235944e-05, "loss": 0.2563, "step": 5040 }, { "epoch": 0.4966683877947432, "grad_norm": 2.804980754852295, "learning_rate": 1.8195366212295834e-05, "loss": 0.2734, "step": 5050 }, { "epoch": 0.4976518895527526, "grad_norm": 1.1066958904266357, "learning_rate": 1.8191392123355724e-05, "loss": 0.3222, "step": 5060 }, { "epoch": 0.49863539131076196, "grad_norm": 1.7555015087127686, "learning_rate": 1.818741803441561e-05, "loss": 0.2398, "step": 5070 }, { "epoch": 0.4996188930687714, "grad_norm": 3.245117425918579, "learning_rate": 1.81834439454755e-05, "loss": 0.176, "step": 5080 }, { "epoch": 0.5006023948267807, "grad_norm": 1.5190283060073853, "learning_rate": 1.817946985653539e-05, "loss": 0.2128, "step": 5090 }, { "epoch": 0.5015858965847901, "grad_norm": 1.867402195930481, "learning_rate": 1.817549576759528e-05, "loss": 0.3745, "step": 5100 }, { "epoch": 0.5025693983427996, "grad_norm": 1.7303107976913452, "learning_rate": 1.8171521678655172e-05, "loss": 0.2556, "step": 5110 }, { "epoch": 0.5035529001008089, "grad_norm": 1.0047192573547363, "learning_rate": 1.8167547589715058e-05, "loss": 0.2547, "step": 5120 }, { "epoch": 0.5045364018588183, "grad_norm": 1.7902576923370361, "learning_rate": 1.8163573500774948e-05, "loss": 0.2026, "step": 5130 }, { "epoch": 0.5055199036168277, "grad_norm": 1.2141826152801514, "learning_rate": 1.8159599411834838e-05, "loss": 0.3075, "step": 5140 }, { "epoch": 0.5065034053748371, "grad_norm": 1.0879597663879395, "learning_rate": 1.8155625322894727e-05, "loss": 0.3035, "step": 5150 }, { "epoch": 0.5074869071328465, "grad_norm": 0.6125375032424927, "learning_rate": 1.8151651233954617e-05, "loss": 0.1975, "step": 5160 }, { "epoch": 0.5084704088908559, "grad_norm": 1.361868143081665, "learning_rate": 1.8147677145014507e-05, "loss": 0.4418, "step": 5170 }, { "epoch": 0.5094539106488652, "grad_norm": 2.4404656887054443, "learning_rate": 1.8143703056074396e-05, "loss": 0.2808, "step": 5180 }, { "epoch": 0.5104374124068747, "grad_norm": 3.8543829917907715, "learning_rate": 1.8139728967134286e-05, "loss": 0.1814, "step": 5190 }, { "epoch": 0.5114209141648841, "grad_norm": 1.682686448097229, "learning_rate": 1.8135754878194176e-05, "loss": 0.2773, "step": 5200 }, { "epoch": 0.5124044159228934, "grad_norm": 0.599181056022644, "learning_rate": 1.8131780789254065e-05, "loss": 0.1515, "step": 5210 }, { "epoch": 0.5133879176809029, "grad_norm": 1.6492522954940796, "learning_rate": 1.8127806700313955e-05, "loss": 0.3122, "step": 5220 }, { "epoch": 0.5143714194389123, "grad_norm": 1.381274938583374, "learning_rate": 1.8123832611373845e-05, "loss": 0.2715, "step": 5230 }, { "epoch": 0.5153549211969216, "grad_norm": 3.652162790298462, "learning_rate": 1.8119858522433734e-05, "loss": 0.361, "step": 5240 }, { "epoch": 0.516338422954931, "grad_norm": 0.7934686541557312, "learning_rate": 1.8115884433493624e-05, "loss": 0.2311, "step": 5250 }, { "epoch": 0.5173219247129405, "grad_norm": 1.178621530532837, "learning_rate": 1.811191034455351e-05, "loss": 0.3368, "step": 5260 }, { "epoch": 0.5183054264709498, "grad_norm": 0.9035441279411316, "learning_rate": 1.8107936255613403e-05, "loss": 0.2595, "step": 5270 }, { "epoch": 0.5192889282289592, "grad_norm": 3.2949626445770264, "learning_rate": 1.8103962166673293e-05, "loss": 0.3289, "step": 5280 }, { "epoch": 0.5202724299869687, "grad_norm": 2.0482380390167236, "learning_rate": 1.8099988077733183e-05, "loss": 0.2056, "step": 5290 }, { "epoch": 0.521255931744978, "grad_norm": 0.5833696722984314, "learning_rate": 1.8096013988793072e-05, "loss": 0.334, "step": 5300 }, { "epoch": 0.5222394335029874, "grad_norm": 0.6023494005203247, "learning_rate": 1.809203989985296e-05, "loss": 0.3494, "step": 5310 }, { "epoch": 0.5232229352609967, "grad_norm": 0.8255459666252136, "learning_rate": 1.8088065810912848e-05, "loss": 0.2537, "step": 5320 }, { "epoch": 0.5242064370190062, "grad_norm": 1.5937799215316772, "learning_rate": 1.8084091721972738e-05, "loss": 0.2484, "step": 5330 }, { "epoch": 0.5251899387770156, "grad_norm": 0.9566192030906677, "learning_rate": 1.8080117633032628e-05, "loss": 0.2696, "step": 5340 }, { "epoch": 0.5261734405350249, "grad_norm": 2.102485179901123, "learning_rate": 1.807614354409252e-05, "loss": 0.3206, "step": 5350 }, { "epoch": 0.5271569422930343, "grad_norm": 1.0212339162826538, "learning_rate": 1.8072169455152407e-05, "loss": 0.2393, "step": 5360 }, { "epoch": 0.5281404440510438, "grad_norm": 0.5305758118629456, "learning_rate": 1.8068195366212297e-05, "loss": 0.2471, "step": 5370 }, { "epoch": 0.5291239458090531, "grad_norm": 0.722772479057312, "learning_rate": 1.8064221277272186e-05, "loss": 0.3649, "step": 5380 }, { "epoch": 0.5301074475670625, "grad_norm": 2.7323200702667236, "learning_rate": 1.8060247188332076e-05, "loss": 0.2667, "step": 5390 }, { "epoch": 0.531090949325072, "grad_norm": 0.9532175660133362, "learning_rate": 1.8056273099391966e-05, "loss": 0.3844, "step": 5400 }, { "epoch": 0.5320744510830813, "grad_norm": 1.4109523296356201, "learning_rate": 1.8052299010451855e-05, "loss": 0.3101, "step": 5410 }, { "epoch": 0.5330579528410907, "grad_norm": 1.0147038698196411, "learning_rate": 1.8048324921511745e-05, "loss": 0.2475, "step": 5420 }, { "epoch": 0.5340414545991001, "grad_norm": 1.0827841758728027, "learning_rate": 1.8044350832571635e-05, "loss": 0.2254, "step": 5430 }, { "epoch": 0.5350249563571094, "grad_norm": 0.37398141622543335, "learning_rate": 1.8040376743631524e-05, "loss": 0.2033, "step": 5440 }, { "epoch": 0.5360084581151189, "grad_norm": 0.6746113300323486, "learning_rate": 1.8036402654691414e-05, "loss": 0.3645, "step": 5450 }, { "epoch": 0.5369919598731283, "grad_norm": 1.3214387893676758, "learning_rate": 1.8032428565751304e-05, "loss": 0.1994, "step": 5460 }, { "epoch": 0.5379754616311376, "grad_norm": 2.1967718601226807, "learning_rate": 1.8028454476811193e-05, "loss": 0.2416, "step": 5470 }, { "epoch": 0.5389589633891471, "grad_norm": 2.4310414791107178, "learning_rate": 1.8024480387871083e-05, "loss": 0.4078, "step": 5480 }, { "epoch": 0.5399424651471565, "grad_norm": 0.9052999019622803, "learning_rate": 1.8020506298930972e-05, "loss": 0.3281, "step": 5490 }, { "epoch": 0.5409259669051658, "grad_norm": 2.1289238929748535, "learning_rate": 1.801653220999086e-05, "loss": 0.252, "step": 5500 }, { "epoch": 0.5409259669051658, "eval_loss": 0.18257686495780945, "eval_runtime": 16.2375, "eval_samples_per_second": 3.079, "eval_steps_per_second": 1.54, "step": 5500 }, { "epoch": 0.5419094686631752, "grad_norm": 1.422521948814392, "learning_rate": 1.8012558121050752e-05, "loss": 0.2931, "step": 5510 }, { "epoch": 0.5428929704211847, "grad_norm": 0.9348315000534058, "learning_rate": 1.800858403211064e-05, "loss": 0.2606, "step": 5520 }, { "epoch": 0.543876472179194, "grad_norm": 1.98324453830719, "learning_rate": 1.800460994317053e-05, "loss": 0.194, "step": 5530 }, { "epoch": 0.5448599739372034, "grad_norm": 1.9873881340026855, "learning_rate": 1.800063585423042e-05, "loss": 0.2718, "step": 5540 }, { "epoch": 0.5458434756952129, "grad_norm": 0.650570809841156, "learning_rate": 1.7996661765290307e-05, "loss": 0.1921, "step": 5550 }, { "epoch": 0.5468269774532222, "grad_norm": 1.0361882448196411, "learning_rate": 1.7992687676350197e-05, "loss": 0.1932, "step": 5560 }, { "epoch": 0.5478104792112316, "grad_norm": 1.8897120952606201, "learning_rate": 1.7988713587410086e-05, "loss": 0.254, "step": 5570 }, { "epoch": 0.548793980969241, "grad_norm": 1.37480890750885, "learning_rate": 1.7984739498469976e-05, "loss": 0.2603, "step": 5580 }, { "epoch": 0.5497774827272504, "grad_norm": 1.5733188390731812, "learning_rate": 1.798076540952987e-05, "loss": 0.3241, "step": 5590 }, { "epoch": 0.5507609844852598, "grad_norm": 1.8521591424942017, "learning_rate": 1.7976791320589755e-05, "loss": 0.3151, "step": 5600 }, { "epoch": 0.5517444862432691, "grad_norm": 1.0588831901550293, "learning_rate": 1.7972817231649645e-05, "loss": 0.1527, "step": 5610 }, { "epoch": 0.5527279880012785, "grad_norm": 0.48786550760269165, "learning_rate": 1.7968843142709535e-05, "loss": 0.2082, "step": 5620 }, { "epoch": 0.553711489759288, "grad_norm": 1.9939007759094238, "learning_rate": 1.7964869053769424e-05, "loss": 0.2558, "step": 5630 }, { "epoch": 0.5546949915172973, "grad_norm": 1.3654463291168213, "learning_rate": 1.7960894964829314e-05, "loss": 0.2808, "step": 5640 }, { "epoch": 0.5556784932753067, "grad_norm": 1.640133261680603, "learning_rate": 1.7956920875889204e-05, "loss": 0.3643, "step": 5650 }, { "epoch": 0.5566619950333161, "grad_norm": 1.6016790866851807, "learning_rate": 1.7952946786949093e-05, "loss": 0.2268, "step": 5660 }, { "epoch": 0.5576454967913255, "grad_norm": 1.3572251796722412, "learning_rate": 1.7948972698008983e-05, "loss": 0.3403, "step": 5670 }, { "epoch": 0.5586289985493349, "grad_norm": 1.4128433465957642, "learning_rate": 1.7944998609068873e-05, "loss": 0.332, "step": 5680 }, { "epoch": 0.5596125003073443, "grad_norm": 1.1187504529953003, "learning_rate": 1.7941024520128762e-05, "loss": 0.2393, "step": 5690 }, { "epoch": 0.5605960020653536, "grad_norm": 1.141859531402588, "learning_rate": 1.7937050431188652e-05, "loss": 0.2431, "step": 5700 }, { "epoch": 0.5615795038233631, "grad_norm": 0.8871030211448669, "learning_rate": 1.7933076342248542e-05, "loss": 0.3159, "step": 5710 }, { "epoch": 0.5625630055813725, "grad_norm": 1.2970197200775146, "learning_rate": 1.792910225330843e-05, "loss": 0.2678, "step": 5720 }, { "epoch": 0.5635465073393818, "grad_norm": 1.7814615964889526, "learning_rate": 1.792512816436832e-05, "loss": 0.2298, "step": 5730 }, { "epoch": 0.5645300090973913, "grad_norm": 1.900589108467102, "learning_rate": 1.7921154075428207e-05, "loss": 0.3047, "step": 5740 }, { "epoch": 0.5655135108554007, "grad_norm": 0.9044830203056335, "learning_rate": 1.7917179986488097e-05, "loss": 0.3199, "step": 5750 }, { "epoch": 0.56649701261341, "grad_norm": 1.0888880491256714, "learning_rate": 1.791320589754799e-05, "loss": 0.2272, "step": 5760 }, { "epoch": 0.5674805143714194, "grad_norm": 4.100197792053223, "learning_rate": 1.790923180860788e-05, "loss": 0.322, "step": 5770 }, { "epoch": 0.5684640161294289, "grad_norm": 0.7008423209190369, "learning_rate": 1.790525771966777e-05, "loss": 0.2291, "step": 5780 }, { "epoch": 0.5694475178874382, "grad_norm": 1.2595419883728027, "learning_rate": 1.7901283630727656e-05, "loss": 0.2018, "step": 5790 }, { "epoch": 0.5704310196454476, "grad_norm": 1.9947835206985474, "learning_rate": 1.7897309541787545e-05, "loss": 0.3107, "step": 5800 }, { "epoch": 0.571414521403457, "grad_norm": 0.5234070420265198, "learning_rate": 1.7893335452847435e-05, "loss": 0.2079, "step": 5810 }, { "epoch": 0.5723980231614664, "grad_norm": 2.341140031814575, "learning_rate": 1.7889361363907325e-05, "loss": 0.2779, "step": 5820 }, { "epoch": 0.5733815249194758, "grad_norm": 1.9745830297470093, "learning_rate": 1.7885387274967214e-05, "loss": 0.2857, "step": 5830 }, { "epoch": 0.5743650266774852, "grad_norm": 2.5059823989868164, "learning_rate": 1.7881413186027104e-05, "loss": 0.2123, "step": 5840 }, { "epoch": 0.5753485284354946, "grad_norm": 1.8198421001434326, "learning_rate": 1.7877439097086994e-05, "loss": 0.2633, "step": 5850 }, { "epoch": 0.576332030193504, "grad_norm": 1.688056230545044, "learning_rate": 1.7873465008146883e-05, "loss": 0.2896, "step": 5860 }, { "epoch": 0.5773155319515134, "grad_norm": 0.9039472341537476, "learning_rate": 1.7869490919206773e-05, "loss": 0.2343, "step": 5870 }, { "epoch": 0.5782990337095227, "grad_norm": 1.2950299978256226, "learning_rate": 1.7865516830266663e-05, "loss": 0.2592, "step": 5880 }, { "epoch": 0.5792825354675322, "grad_norm": 2.150629997253418, "learning_rate": 1.7861542741326552e-05, "loss": 0.3204, "step": 5890 }, { "epoch": 0.5802660372255415, "grad_norm": 2.5115997791290283, "learning_rate": 1.7857568652386442e-05, "loss": 0.2205, "step": 5900 }, { "epoch": 0.5812495389835509, "grad_norm": 2.0928642749786377, "learning_rate": 1.785359456344633e-05, "loss": 0.233, "step": 5910 }, { "epoch": 0.5822330407415603, "grad_norm": 3.6651504039764404, "learning_rate": 1.784962047450622e-05, "loss": 0.3126, "step": 5920 }, { "epoch": 0.5832165424995697, "grad_norm": 1.2650208473205566, "learning_rate": 1.784564638556611e-05, "loss": 0.3493, "step": 5930 }, { "epoch": 0.5842000442575791, "grad_norm": 2.006185531616211, "learning_rate": 1.7841672296626e-05, "loss": 0.2449, "step": 5940 }, { "epoch": 0.5851835460155885, "grad_norm": 1.3783490657806396, "learning_rate": 1.783769820768589e-05, "loss": 0.3871, "step": 5950 }, { "epoch": 0.5861670477735978, "grad_norm": 2.5335588455200195, "learning_rate": 1.783372411874578e-05, "loss": 0.3549, "step": 5960 }, { "epoch": 0.5871505495316073, "grad_norm": 2.9504337310791016, "learning_rate": 1.782975002980567e-05, "loss": 0.3744, "step": 5970 }, { "epoch": 0.5881340512896167, "grad_norm": 1.9796600341796875, "learning_rate": 1.7825775940865556e-05, "loss": 0.321, "step": 5980 }, { "epoch": 0.589117553047626, "grad_norm": 2.798205614089966, "learning_rate": 1.7821801851925446e-05, "loss": 0.238, "step": 5990 }, { "epoch": 0.5901010548056355, "grad_norm": 1.6959587335586548, "learning_rate": 1.781782776298534e-05, "loss": 0.3073, "step": 6000 }, { "epoch": 0.5901010548056355, "eval_loss": 0.18480060994625092, "eval_runtime": 19.2678, "eval_samples_per_second": 2.595, "eval_steps_per_second": 1.297, "step": 6000 }, { "epoch": 0.5910845565636449, "grad_norm": 2.3862385749816895, "learning_rate": 1.7813853674045228e-05, "loss": 0.3129, "step": 6010 }, { "epoch": 0.5920680583216542, "grad_norm": 1.5851318836212158, "learning_rate": 1.7809879585105118e-05, "loss": 0.2187, "step": 6020 }, { "epoch": 0.5930515600796636, "grad_norm": 1.9704920053482056, "learning_rate": 1.7805905496165004e-05, "loss": 0.2699, "step": 6030 }, { "epoch": 0.5940350618376731, "grad_norm": 0.945516049861908, "learning_rate": 1.7801931407224894e-05, "loss": 0.2358, "step": 6040 }, { "epoch": 0.5950185635956824, "grad_norm": 1.1421146392822266, "learning_rate": 1.7797957318284784e-05, "loss": 0.2533, "step": 6050 }, { "epoch": 0.5960020653536918, "grad_norm": 0.9118548035621643, "learning_rate": 1.7793983229344673e-05, "loss": 0.2338, "step": 6060 }, { "epoch": 0.5969855671117013, "grad_norm": 1.8159881830215454, "learning_rate": 1.7790009140404563e-05, "loss": 0.2702, "step": 6070 }, { "epoch": 0.5979690688697106, "grad_norm": 2.794337272644043, "learning_rate": 1.7786035051464453e-05, "loss": 0.2871, "step": 6080 }, { "epoch": 0.59895257062772, "grad_norm": 1.0487737655639648, "learning_rate": 1.7782060962524342e-05, "loss": 0.2411, "step": 6090 }, { "epoch": 0.5999360723857294, "grad_norm": 0.22618043422698975, "learning_rate": 1.7778086873584232e-05, "loss": 0.2489, "step": 6100 }, { "epoch": 0.6009195741437388, "grad_norm": 1.117783546447754, "learning_rate": 1.777411278464412e-05, "loss": 0.2851, "step": 6110 }, { "epoch": 0.6019030759017482, "grad_norm": 0.869219183921814, "learning_rate": 1.777013869570401e-05, "loss": 0.2425, "step": 6120 }, { "epoch": 0.6028865776597576, "grad_norm": 1.8589662313461304, "learning_rate": 1.77661646067639e-05, "loss": 0.3514, "step": 6130 }, { "epoch": 0.6038700794177669, "grad_norm": 0.949151873588562, "learning_rate": 1.776219051782379e-05, "loss": 0.2864, "step": 6140 }, { "epoch": 0.6048535811757764, "grad_norm": 0.8056336045265198, "learning_rate": 1.775821642888368e-05, "loss": 0.2232, "step": 6150 }, { "epoch": 0.6058370829337858, "grad_norm": 1.1552402973175049, "learning_rate": 1.775424233994357e-05, "loss": 0.2317, "step": 6160 }, { "epoch": 0.6068205846917951, "grad_norm": 1.5513499975204468, "learning_rate": 1.775026825100346e-05, "loss": 0.2486, "step": 6170 }, { "epoch": 0.6078040864498045, "grad_norm": 0.8391079306602478, "learning_rate": 1.774629416206335e-05, "loss": 0.1522, "step": 6180 }, { "epoch": 0.6087875882078139, "grad_norm": 4.214269161224365, "learning_rate": 1.774232007312324e-05, "loss": 0.3992, "step": 6190 }, { "epoch": 0.6097710899658233, "grad_norm": 1.27689790725708, "learning_rate": 1.773834598418313e-05, "loss": 0.2539, "step": 6200 }, { "epoch": 0.6107545917238327, "grad_norm": 3.4815011024475098, "learning_rate": 1.7734371895243018e-05, "loss": 0.3174, "step": 6210 }, { "epoch": 0.611738093481842, "grad_norm": 0.5101733803749084, "learning_rate": 1.7730397806302904e-05, "loss": 0.2232, "step": 6220 }, { "epoch": 0.6127215952398515, "grad_norm": 1.796726942062378, "learning_rate": 1.7726423717362794e-05, "loss": 0.0914, "step": 6230 }, { "epoch": 0.6137050969978609, "grad_norm": 1.0965911149978638, "learning_rate": 1.7722449628422687e-05, "loss": 0.3303, "step": 6240 }, { "epoch": 0.6146885987558702, "grad_norm": 1.8056279420852661, "learning_rate": 1.7718475539482577e-05, "loss": 0.2987, "step": 6250 }, { "epoch": 0.6156721005138797, "grad_norm": 3.1960597038269043, "learning_rate": 1.7714501450542467e-05, "loss": 0.1829, "step": 6260 }, { "epoch": 0.6166556022718891, "grad_norm": 0.9879154562950134, "learning_rate": 1.7710527361602353e-05, "loss": 0.2514, "step": 6270 }, { "epoch": 0.6176391040298984, "grad_norm": 1.268825888633728, "learning_rate": 1.7706553272662242e-05, "loss": 0.3541, "step": 6280 }, { "epoch": 0.6186226057879078, "grad_norm": 0.49746277928352356, "learning_rate": 1.7702579183722132e-05, "loss": 0.35, "step": 6290 }, { "epoch": 0.6196061075459173, "grad_norm": 1.2664997577667236, "learning_rate": 1.7698605094782022e-05, "loss": 0.3302, "step": 6300 }, { "epoch": 0.6205896093039266, "grad_norm": 2.224809169769287, "learning_rate": 1.769463100584191e-05, "loss": 0.265, "step": 6310 }, { "epoch": 0.621573111061936, "grad_norm": 0.7661302089691162, "learning_rate": 1.76906569169018e-05, "loss": 0.163, "step": 6320 }, { "epoch": 0.6225566128199455, "grad_norm": 2.224334478378296, "learning_rate": 1.768668282796169e-05, "loss": 0.2592, "step": 6330 }, { "epoch": 0.6235401145779548, "grad_norm": 1.4434432983398438, "learning_rate": 1.768270873902158e-05, "loss": 0.1518, "step": 6340 }, { "epoch": 0.6245236163359642, "grad_norm": 0.9845334887504578, "learning_rate": 1.767873465008147e-05, "loss": 0.275, "step": 6350 }, { "epoch": 0.6255071180939736, "grad_norm": 0.7070971131324768, "learning_rate": 1.767476056114136e-05, "loss": 0.203, "step": 6360 }, { "epoch": 0.626490619851983, "grad_norm": 1.071750521659851, "learning_rate": 1.767078647220125e-05, "loss": 0.4071, "step": 6370 }, { "epoch": 0.6274741216099924, "grad_norm": 0.7713740468025208, "learning_rate": 1.766681238326114e-05, "loss": 0.2863, "step": 6380 }, { "epoch": 0.6284576233680018, "grad_norm": 2.0365185737609863, "learning_rate": 1.766283829432103e-05, "loss": 0.1939, "step": 6390 }, { "epoch": 0.6294411251260111, "grad_norm": 1.9870047569274902, "learning_rate": 1.765886420538092e-05, "loss": 0.183, "step": 6400 }, { "epoch": 0.6304246268840206, "grad_norm": 0.9188165068626404, "learning_rate": 1.7654890116440808e-05, "loss": 0.2881, "step": 6410 }, { "epoch": 0.63140812864203, "grad_norm": 1.0348327159881592, "learning_rate": 1.7650916027500698e-05, "loss": 0.3133, "step": 6420 }, { "epoch": 0.6323916304000393, "grad_norm": 0.5961625576019287, "learning_rate": 1.7646941938560587e-05, "loss": 0.3248, "step": 6430 }, { "epoch": 0.6333751321580487, "grad_norm": 1.9765043258666992, "learning_rate": 1.7642967849620477e-05, "loss": 0.261, "step": 6440 }, { "epoch": 0.6343586339160582, "grad_norm": 1.0637900829315186, "learning_rate": 1.7638993760680367e-05, "loss": 0.3368, "step": 6450 }, { "epoch": 0.6353421356740675, "grad_norm": 1.868017554283142, "learning_rate": 1.7635019671740253e-05, "loss": 0.303, "step": 6460 }, { "epoch": 0.6363256374320769, "grad_norm": 1.0811495780944824, "learning_rate": 1.7631045582800143e-05, "loss": 0.3333, "step": 6470 }, { "epoch": 0.6373091391900862, "grad_norm": 1.6021066904067993, "learning_rate": 1.7627071493860036e-05, "loss": 0.2341, "step": 6480 }, { "epoch": 0.6382926409480957, "grad_norm": 0.8280308246612549, "learning_rate": 1.7623097404919925e-05, "loss": 0.2931, "step": 6490 }, { "epoch": 0.6392761427061051, "grad_norm": 0.4079805016517639, "learning_rate": 1.7619123315979815e-05, "loss": 0.2615, "step": 6500 }, { "epoch": 0.6392761427061051, "eval_loss": 0.16756634414196014, "eval_runtime": 21.0382, "eval_samples_per_second": 2.377, "eval_steps_per_second": 1.188, "step": 6500 }, { "epoch": 0.6402596444641144, "grad_norm": 1.9855930805206299, "learning_rate": 1.76151492270397e-05, "loss": 0.3118, "step": 6510 }, { "epoch": 0.6412431462221239, "grad_norm": 2.828648090362549, "learning_rate": 1.761117513809959e-05, "loss": 0.2595, "step": 6520 }, { "epoch": 0.6422266479801333, "grad_norm": 1.2599165439605713, "learning_rate": 1.760720104915948e-05, "loss": 0.2549, "step": 6530 }, { "epoch": 0.6432101497381426, "grad_norm": 2.565375804901123, "learning_rate": 1.760322696021937e-05, "loss": 0.4036, "step": 6540 }, { "epoch": 0.644193651496152, "grad_norm": 1.0683103799819946, "learning_rate": 1.759925287127926e-05, "loss": 0.2982, "step": 6550 }, { "epoch": 0.6451771532541615, "grad_norm": 0.5626850128173828, "learning_rate": 1.759527878233915e-05, "loss": 0.2816, "step": 6560 }, { "epoch": 0.6461606550121708, "grad_norm": 2.2032320499420166, "learning_rate": 1.759130469339904e-05, "loss": 0.2405, "step": 6570 }, { "epoch": 0.6471441567701802, "grad_norm": 1.1257829666137695, "learning_rate": 1.758733060445893e-05, "loss": 0.3195, "step": 6580 }, { "epoch": 0.6481276585281897, "grad_norm": 1.7821928262710571, "learning_rate": 1.758335651551882e-05, "loss": 0.2793, "step": 6590 }, { "epoch": 0.649111160286199, "grad_norm": 1.9086956977844238, "learning_rate": 1.757938242657871e-05, "loss": 0.2243, "step": 6600 }, { "epoch": 0.6500946620442084, "grad_norm": 0.624267041683197, "learning_rate": 1.7575408337638598e-05, "loss": 0.2452, "step": 6610 }, { "epoch": 0.6510781638022178, "grad_norm": 0.9951871037483215, "learning_rate": 1.7571434248698488e-05, "loss": 0.3167, "step": 6620 }, { "epoch": 0.6520616655602272, "grad_norm": 0.869670033454895, "learning_rate": 1.7567460159758377e-05, "loss": 0.2729, "step": 6630 }, { "epoch": 0.6530451673182366, "grad_norm": 2.440999984741211, "learning_rate": 1.7563486070818264e-05, "loss": 0.2323, "step": 6640 }, { "epoch": 0.654028669076246, "grad_norm": 2.6656980514526367, "learning_rate": 1.7559511981878157e-05, "loss": 0.2467, "step": 6650 }, { "epoch": 0.6550121708342553, "grad_norm": 1.7781723737716675, "learning_rate": 1.7555537892938046e-05, "loss": 0.3012, "step": 6660 }, { "epoch": 0.6559956725922648, "grad_norm": 2.4312820434570312, "learning_rate": 1.7551563803997936e-05, "loss": 0.3306, "step": 6670 }, { "epoch": 0.6569791743502742, "grad_norm": 1.759849190711975, "learning_rate": 1.7547589715057826e-05, "loss": 0.2786, "step": 6680 }, { "epoch": 0.6579626761082835, "grad_norm": 3.491177558898926, "learning_rate": 1.7543615626117715e-05, "loss": 0.2349, "step": 6690 }, { "epoch": 0.658946177866293, "grad_norm": 0.7771234512329102, "learning_rate": 1.75396415371776e-05, "loss": 0.2444, "step": 6700 }, { "epoch": 0.6599296796243024, "grad_norm": 0.4858251214027405, "learning_rate": 1.753566744823749e-05, "loss": 0.3178, "step": 6710 }, { "epoch": 0.6609131813823117, "grad_norm": 0.627582848072052, "learning_rate": 1.753169335929738e-05, "loss": 0.1489, "step": 6720 }, { "epoch": 0.6618966831403211, "grad_norm": 0.4715290665626526, "learning_rate": 1.7527719270357274e-05, "loss": 0.3177, "step": 6730 }, { "epoch": 0.6628801848983306, "grad_norm": 2.229329824447632, "learning_rate": 1.7523745181417164e-05, "loss": 0.3776, "step": 6740 }, { "epoch": 0.6638636866563399, "grad_norm": 1.317760944366455, "learning_rate": 1.751977109247705e-05, "loss": 0.284, "step": 6750 }, { "epoch": 0.6648471884143493, "grad_norm": 0.655670166015625, "learning_rate": 1.751579700353694e-05, "loss": 0.3467, "step": 6760 }, { "epoch": 0.6658306901723586, "grad_norm": 0.7603923678398132, "learning_rate": 1.751182291459683e-05, "loss": 0.2619, "step": 6770 }, { "epoch": 0.6668141919303681, "grad_norm": 1.0717917680740356, "learning_rate": 1.750784882565672e-05, "loss": 0.1964, "step": 6780 }, { "epoch": 0.6677976936883775, "grad_norm": 2.8406436443328857, "learning_rate": 1.750387473671661e-05, "loss": 0.2389, "step": 6790 }, { "epoch": 0.6687811954463868, "grad_norm": 1.8166040182113647, "learning_rate": 1.7499900647776498e-05, "loss": 0.2813, "step": 6800 }, { "epoch": 0.6697646972043962, "grad_norm": 0.37853848934173584, "learning_rate": 1.7495926558836388e-05, "loss": 0.3369, "step": 6810 }, { "epoch": 0.6707481989624057, "grad_norm": 1.757017731666565, "learning_rate": 1.7491952469896278e-05, "loss": 0.1964, "step": 6820 }, { "epoch": 0.671731700720415, "grad_norm": 0.8635709881782532, "learning_rate": 1.7487978380956167e-05, "loss": 0.2284, "step": 6830 }, { "epoch": 0.6727152024784244, "grad_norm": 1.8809075355529785, "learning_rate": 1.7484004292016057e-05, "loss": 0.3185, "step": 6840 }, { "epoch": 0.6736987042364339, "grad_norm": 2.1071417331695557, "learning_rate": 1.7480030203075947e-05, "loss": 0.2415, "step": 6850 }, { "epoch": 0.6746822059944432, "grad_norm": 1.8228193521499634, "learning_rate": 1.7476056114135836e-05, "loss": 0.2727, "step": 6860 }, { "epoch": 0.6756657077524526, "grad_norm": 1.8638427257537842, "learning_rate": 1.7472082025195726e-05, "loss": 0.3047, "step": 6870 }, { "epoch": 0.676649209510462, "grad_norm": 0.2602676749229431, "learning_rate": 1.7468107936255612e-05, "loss": 0.3469, "step": 6880 }, { "epoch": 0.6776327112684714, "grad_norm": 0.9284095764160156, "learning_rate": 1.7464133847315505e-05, "loss": 0.2939, "step": 6890 }, { "epoch": 0.6786162130264808, "grad_norm": 2.8218131065368652, "learning_rate": 1.7460159758375395e-05, "loss": 0.2497, "step": 6900 }, { "epoch": 0.6795997147844902, "grad_norm": 1.4904463291168213, "learning_rate": 1.7456185669435285e-05, "loss": 0.4435, "step": 6910 }, { "epoch": 0.6805832165424995, "grad_norm": 0.9148056507110596, "learning_rate": 1.7452211580495174e-05, "loss": 0.2398, "step": 6920 }, { "epoch": 0.681566718300509, "grad_norm": 1.0198630094528198, "learning_rate": 1.7448237491555064e-05, "loss": 0.3254, "step": 6930 }, { "epoch": 0.6825502200585184, "grad_norm": 1.9961555004119873, "learning_rate": 1.744426340261495e-05, "loss": 0.2865, "step": 6940 }, { "epoch": 0.6835337218165277, "grad_norm": 1.7275830507278442, "learning_rate": 1.744028931367484e-05, "loss": 0.2833, "step": 6950 }, { "epoch": 0.6845172235745371, "grad_norm": 1.0084987878799438, "learning_rate": 1.743631522473473e-05, "loss": 0.283, "step": 6960 }, { "epoch": 0.6855007253325466, "grad_norm": 2.165138006210327, "learning_rate": 1.7432341135794623e-05, "loss": 0.3183, "step": 6970 }, { "epoch": 0.6864842270905559, "grad_norm": 1.2501497268676758, "learning_rate": 1.7428367046854512e-05, "loss": 0.2878, "step": 6980 }, { "epoch": 0.6874677288485653, "grad_norm": 1.473513126373291, "learning_rate": 1.74243929579144e-05, "loss": 0.2705, "step": 6990 }, { "epoch": 0.6884512306065748, "grad_norm": 2.8227286338806152, "learning_rate": 1.7420418868974288e-05, "loss": 0.2592, "step": 7000 }, { "epoch": 0.6884512306065748, "eval_loss": 0.17142616212368011, "eval_runtime": 17.0015, "eval_samples_per_second": 2.941, "eval_steps_per_second": 1.47, "step": 7000 }, { "epoch": 0.6894347323645841, "grad_norm": 1.9842414855957031, "learning_rate": 1.7416444780034178e-05, "loss": 0.2982, "step": 7010 }, { "epoch": 0.6904182341225935, "grad_norm": 1.5404380559921265, "learning_rate": 1.7412470691094067e-05, "loss": 0.2632, "step": 7020 }, { "epoch": 0.6914017358806029, "grad_norm": 3.1655123233795166, "learning_rate": 1.7408496602153957e-05, "loss": 0.3676, "step": 7030 }, { "epoch": 0.6923852376386123, "grad_norm": 2.0858147144317627, "learning_rate": 1.7404522513213847e-05, "loss": 0.2786, "step": 7040 }, { "epoch": 0.6933687393966217, "grad_norm": 1.6330032348632812, "learning_rate": 1.7400548424273736e-05, "loss": 0.3458, "step": 7050 }, { "epoch": 0.694352241154631, "grad_norm": 1.110235571861267, "learning_rate": 1.7396574335333626e-05, "loss": 0.2221, "step": 7060 }, { "epoch": 0.6953357429126404, "grad_norm": 1.8372716903686523, "learning_rate": 1.7392600246393516e-05, "loss": 0.3086, "step": 7070 }, { "epoch": 0.6963192446706499, "grad_norm": 1.990007996559143, "learning_rate": 1.7388626157453405e-05, "loss": 0.3037, "step": 7080 }, { "epoch": 0.6973027464286592, "grad_norm": 1.34180748462677, "learning_rate": 1.7384652068513295e-05, "loss": 0.2019, "step": 7090 }, { "epoch": 0.6982862481866686, "grad_norm": 1.4167652130126953, "learning_rate": 1.7380677979573185e-05, "loss": 0.2586, "step": 7100 }, { "epoch": 0.699269749944678, "grad_norm": 2.2565560340881348, "learning_rate": 1.7376703890633074e-05, "loss": 0.2973, "step": 7110 }, { "epoch": 0.7002532517026874, "grad_norm": 2.2221834659576416, "learning_rate": 1.737272980169296e-05, "loss": 0.2, "step": 7120 }, { "epoch": 0.7012367534606968, "grad_norm": 1.9819475412368774, "learning_rate": 1.7368755712752854e-05, "loss": 0.2872, "step": 7130 }, { "epoch": 0.7022202552187062, "grad_norm": 2.585352897644043, "learning_rate": 1.7364781623812743e-05, "loss": 0.2436, "step": 7140 }, { "epoch": 0.7032037569767156, "grad_norm": 1.5775268077850342, "learning_rate": 1.7360807534872633e-05, "loss": 0.2617, "step": 7150 }, { "epoch": 0.704187258734725, "grad_norm": 0.8917414546012878, "learning_rate": 1.7356833445932523e-05, "loss": 0.1701, "step": 7160 }, { "epoch": 0.7051707604927344, "grad_norm": 1.4610384702682495, "learning_rate": 1.7352859356992412e-05, "loss": 0.267, "step": 7170 }, { "epoch": 0.7061542622507437, "grad_norm": 1.2714320421218872, "learning_rate": 1.73488852680523e-05, "loss": 0.2911, "step": 7180 }, { "epoch": 0.7071377640087532, "grad_norm": 1.6755682229995728, "learning_rate": 1.734491117911219e-05, "loss": 0.2565, "step": 7190 }, { "epoch": 0.7081212657667626, "grad_norm": 0.6051779985427856, "learning_rate": 1.7340937090172078e-05, "loss": 0.1993, "step": 7200 }, { "epoch": 0.7091047675247719, "grad_norm": 0.8265473246574402, "learning_rate": 1.733696300123197e-05, "loss": 0.2547, "step": 7210 }, { "epoch": 0.7100882692827813, "grad_norm": 1.1671370267868042, "learning_rate": 1.733298891229186e-05, "loss": 0.2217, "step": 7220 }, { "epoch": 0.7110717710407908, "grad_norm": 2.788594961166382, "learning_rate": 1.7329014823351747e-05, "loss": 0.2691, "step": 7230 }, { "epoch": 0.7120552727988001, "grad_norm": 2.4674558639526367, "learning_rate": 1.7325040734411637e-05, "loss": 0.3065, "step": 7240 }, { "epoch": 0.7130387745568095, "grad_norm": 1.8941340446472168, "learning_rate": 1.7321066645471526e-05, "loss": 0.2059, "step": 7250 }, { "epoch": 0.714022276314819, "grad_norm": 1.2387781143188477, "learning_rate": 1.7317092556531416e-05, "loss": 0.2265, "step": 7260 }, { "epoch": 0.7150057780728283, "grad_norm": 2.167513370513916, "learning_rate": 1.7313118467591306e-05, "loss": 0.3098, "step": 7270 }, { "epoch": 0.7159892798308377, "grad_norm": 1.3851529359817505, "learning_rate": 1.7309144378651195e-05, "loss": 0.2835, "step": 7280 }, { "epoch": 0.7169727815888471, "grad_norm": 2.2308197021484375, "learning_rate": 1.7305170289711085e-05, "loss": 0.2906, "step": 7290 }, { "epoch": 0.7179562833468565, "grad_norm": 2.374825954437256, "learning_rate": 1.7301196200770975e-05, "loss": 0.3629, "step": 7300 }, { "epoch": 0.7189397851048659, "grad_norm": 1.3319405317306519, "learning_rate": 1.7297222111830864e-05, "loss": 0.2441, "step": 7310 }, { "epoch": 0.7199232868628753, "grad_norm": 2.5196781158447266, "learning_rate": 1.7293248022890754e-05, "loss": 0.2844, "step": 7320 }, { "epoch": 0.7209067886208846, "grad_norm": 0.5706827640533447, "learning_rate": 1.7289273933950644e-05, "loss": 0.3429, "step": 7330 }, { "epoch": 0.7218902903788941, "grad_norm": 0.982522189617157, "learning_rate": 1.7285299845010533e-05, "loss": 0.2678, "step": 7340 }, { "epoch": 0.7228737921369034, "grad_norm": 1.3022701740264893, "learning_rate": 1.7281325756070423e-05, "loss": 0.2514, "step": 7350 }, { "epoch": 0.7238572938949128, "grad_norm": 2.1632461547851562, "learning_rate": 1.727735166713031e-05, "loss": 0.2359, "step": 7360 }, { "epoch": 0.7248407956529223, "grad_norm": 2.1435694694519043, "learning_rate": 1.72733775781902e-05, "loss": 0.1974, "step": 7370 }, { "epoch": 0.7258242974109316, "grad_norm": 2.0765726566314697, "learning_rate": 1.7269403489250092e-05, "loss": 0.2958, "step": 7380 }, { "epoch": 0.726807799168941, "grad_norm": 0.7352653741836548, "learning_rate": 1.726542940030998e-05, "loss": 0.3014, "step": 7390 }, { "epoch": 0.7277913009269504, "grad_norm": 1.3814529180526733, "learning_rate": 1.726145531136987e-05, "loss": 0.3172, "step": 7400 }, { "epoch": 0.7287748026849598, "grad_norm": 0.7797669768333435, "learning_rate": 1.725748122242976e-05, "loss": 0.2484, "step": 7410 }, { "epoch": 0.7297583044429692, "grad_norm": 1.010933518409729, "learning_rate": 1.7253507133489647e-05, "loss": 0.2504, "step": 7420 }, { "epoch": 0.7307418062009786, "grad_norm": 0.7638196349143982, "learning_rate": 1.7249533044549537e-05, "loss": 0.2663, "step": 7430 }, { "epoch": 0.7317253079589879, "grad_norm": 1.047466516494751, "learning_rate": 1.7245558955609427e-05, "loss": 0.1963, "step": 7440 }, { "epoch": 0.7327088097169974, "grad_norm": 0.6840983033180237, "learning_rate": 1.7241584866669316e-05, "loss": 0.2936, "step": 7450 }, { "epoch": 0.7336923114750068, "grad_norm": 0.9364534020423889, "learning_rate": 1.723761077772921e-05, "loss": 0.2979, "step": 7460 }, { "epoch": 0.7346758132330161, "grad_norm": 1.0363335609436035, "learning_rate": 1.7233636688789096e-05, "loss": 0.3094, "step": 7470 }, { "epoch": 0.7356593149910255, "grad_norm": 1.6205226182937622, "learning_rate": 1.7229662599848985e-05, "loss": 0.2576, "step": 7480 }, { "epoch": 0.736642816749035, "grad_norm": 2.2230446338653564, "learning_rate": 1.7225688510908875e-05, "loss": 0.4179, "step": 7490 }, { "epoch": 0.7376263185070443, "grad_norm": 2.2729926109313965, "learning_rate": 1.7221714421968765e-05, "loss": 0.2167, "step": 7500 }, { "epoch": 0.7376263185070443, "eval_loss": 0.17984023690223694, "eval_runtime": 19.5642, "eval_samples_per_second": 2.556, "eval_steps_per_second": 1.278, "step": 7500 }, { "epoch": 0.7386098202650537, "grad_norm": 1.362148404121399, "learning_rate": 1.7217740333028654e-05, "loss": 0.244, "step": 7510 }, { "epoch": 0.7395933220230632, "grad_norm": 3.1289501190185547, "learning_rate": 1.7213766244088544e-05, "loss": 0.3236, "step": 7520 }, { "epoch": 0.7405768237810725, "grad_norm": 1.7559800148010254, "learning_rate": 1.7209792155148434e-05, "loss": 0.3897, "step": 7530 }, { "epoch": 0.7415603255390819, "grad_norm": 1.110167145729065, "learning_rate": 1.7205818066208323e-05, "loss": 0.3277, "step": 7540 }, { "epoch": 0.7425438272970913, "grad_norm": 1.0508075952529907, "learning_rate": 1.7201843977268213e-05, "loss": 0.3637, "step": 7550 }, { "epoch": 0.7435273290551007, "grad_norm": 1.4188263416290283, "learning_rate": 1.7197869888328103e-05, "loss": 0.2367, "step": 7560 }, { "epoch": 0.7445108308131101, "grad_norm": 2.6133649349212646, "learning_rate": 1.7193895799387992e-05, "loss": 0.207, "step": 7570 }, { "epoch": 0.7454943325711195, "grad_norm": 0.7872998118400574, "learning_rate": 1.7189921710447882e-05, "loss": 0.3007, "step": 7580 }, { "epoch": 0.7464778343291288, "grad_norm": 1.397812843322754, "learning_rate": 1.718594762150777e-05, "loss": 0.213, "step": 7590 }, { "epoch": 0.7474613360871383, "grad_norm": 1.035705804824829, "learning_rate": 1.7181973532567658e-05, "loss": 0.3404, "step": 7600 }, { "epoch": 0.7484448378451477, "grad_norm": 1.8265693187713623, "learning_rate": 1.7177999443627548e-05, "loss": 0.3536, "step": 7610 }, { "epoch": 0.749428339603157, "grad_norm": 0.547683596611023, "learning_rate": 1.717402535468744e-05, "loss": 0.2403, "step": 7620 }, { "epoch": 0.7504118413611665, "grad_norm": 1.1466445922851562, "learning_rate": 1.717005126574733e-05, "loss": 0.284, "step": 7630 }, { "epoch": 0.7513953431191758, "grad_norm": 1.1045866012573242, "learning_rate": 1.716607717680722e-05, "loss": 0.3185, "step": 7640 }, { "epoch": 0.7523788448771852, "grad_norm": 0.5115216374397278, "learning_rate": 1.716210308786711e-05, "loss": 0.395, "step": 7650 }, { "epoch": 0.7533623466351946, "grad_norm": 0.9218469858169556, "learning_rate": 1.7158128998926996e-05, "loss": 0.2298, "step": 7660 }, { "epoch": 0.754345848393204, "grad_norm": 0.998163640499115, "learning_rate": 1.7154154909986886e-05, "loss": 0.2902, "step": 7670 }, { "epoch": 0.7553293501512134, "grad_norm": 1.475361704826355, "learning_rate": 1.7150180821046775e-05, "loss": 0.3588, "step": 7680 }, { "epoch": 0.7563128519092228, "grad_norm": 0.8406019806861877, "learning_rate": 1.7146206732106665e-05, "loss": 0.2207, "step": 7690 }, { "epoch": 0.7572963536672321, "grad_norm": 1.2818818092346191, "learning_rate": 1.7142232643166558e-05, "loss": 0.1986, "step": 7700 }, { "epoch": 0.7582798554252416, "grad_norm": 0.9673545956611633, "learning_rate": 1.7138258554226444e-05, "loss": 0.1709, "step": 7710 }, { "epoch": 0.759263357183251, "grad_norm": 0.7116090655326843, "learning_rate": 1.7134284465286334e-05, "loss": 0.2115, "step": 7720 }, { "epoch": 0.7602468589412603, "grad_norm": 1.4711158275604248, "learning_rate": 1.7130310376346224e-05, "loss": 0.2365, "step": 7730 }, { "epoch": 0.7612303606992697, "grad_norm": 0.7362675070762634, "learning_rate": 1.7126336287406113e-05, "loss": 0.2748, "step": 7740 }, { "epoch": 0.7622138624572792, "grad_norm": 1.4853023290634155, "learning_rate": 1.7122362198466003e-05, "loss": 0.1993, "step": 7750 }, { "epoch": 0.7631973642152885, "grad_norm": 4.937195777893066, "learning_rate": 1.7118388109525893e-05, "loss": 0.3454, "step": 7760 }, { "epoch": 0.7641808659732979, "grad_norm": 1.7255171537399292, "learning_rate": 1.7114414020585782e-05, "loss": 0.3258, "step": 7770 }, { "epoch": 0.7651643677313074, "grad_norm": 2.4765617847442627, "learning_rate": 1.7110439931645672e-05, "loss": 0.3158, "step": 7780 }, { "epoch": 0.7661478694893167, "grad_norm": 0.8955258131027222, "learning_rate": 1.710646584270556e-05, "loss": 0.3209, "step": 7790 }, { "epoch": 0.7671313712473261, "grad_norm": 2.678504467010498, "learning_rate": 1.710249175376545e-05, "loss": 0.2641, "step": 7800 }, { "epoch": 0.7681148730053355, "grad_norm": 2.017256736755371, "learning_rate": 1.709851766482534e-05, "loss": 0.318, "step": 7810 }, { "epoch": 0.7690983747633449, "grad_norm": 1.0673086643218994, "learning_rate": 1.709454357588523e-05, "loss": 0.1347, "step": 7820 }, { "epoch": 0.7700818765213543, "grad_norm": 2.0851361751556396, "learning_rate": 1.709056948694512e-05, "loss": 0.2498, "step": 7830 }, { "epoch": 0.7710653782793637, "grad_norm": 2.2323834896087646, "learning_rate": 1.7086595398005006e-05, "loss": 0.3145, "step": 7840 }, { "epoch": 0.772048880037373, "grad_norm": 1.902326226234436, "learning_rate": 1.7082621309064896e-05, "loss": 0.1788, "step": 7850 }, { "epoch": 0.7730323817953825, "grad_norm": 1.267497181892395, "learning_rate": 1.707864722012479e-05, "loss": 0.2404, "step": 7860 }, { "epoch": 0.7740158835533919, "grad_norm": 2.428555727005005, "learning_rate": 1.707467313118468e-05, "loss": 0.2913, "step": 7870 }, { "epoch": 0.7749993853114012, "grad_norm": 1.689639925956726, "learning_rate": 1.707069904224457e-05, "loss": 0.1982, "step": 7880 }, { "epoch": 0.7759828870694107, "grad_norm": 3.5605409145355225, "learning_rate": 1.7066724953304458e-05, "loss": 0.2529, "step": 7890 }, { "epoch": 0.77696638882742, "grad_norm": 6.425320148468018, "learning_rate": 1.7062750864364344e-05, "loss": 0.3126, "step": 7900 }, { "epoch": 0.7779498905854294, "grad_norm": 1.4042447805404663, "learning_rate": 1.7058776775424234e-05, "loss": 0.2954, "step": 7910 }, { "epoch": 0.7789333923434388, "grad_norm": 2.0059497356414795, "learning_rate": 1.7054802686484124e-05, "loss": 0.2812, "step": 7920 }, { "epoch": 0.7799168941014482, "grad_norm": 2.8876333236694336, "learning_rate": 1.7050828597544013e-05, "loss": 0.3512, "step": 7930 }, { "epoch": 0.7809003958594576, "grad_norm": 1.6756712198257446, "learning_rate": 1.7046854508603906e-05, "loss": 0.3118, "step": 7940 }, { "epoch": 0.781883897617467, "grad_norm": 0.9000738263130188, "learning_rate": 1.7042880419663793e-05, "loss": 0.2629, "step": 7950 }, { "epoch": 0.7828673993754763, "grad_norm": 2.579625368118286, "learning_rate": 1.7038906330723682e-05, "loss": 0.2921, "step": 7960 }, { "epoch": 0.7838509011334858, "grad_norm": 0.917523205280304, "learning_rate": 1.7034932241783572e-05, "loss": 0.1828, "step": 7970 }, { "epoch": 0.7848344028914952, "grad_norm": 1.431494951248169, "learning_rate": 1.7030958152843462e-05, "loss": 0.1177, "step": 7980 }, { "epoch": 0.7858179046495045, "grad_norm": 1.6248607635498047, "learning_rate": 1.702698406390335e-05, "loss": 0.3062, "step": 7990 }, { "epoch": 0.786801406407514, "grad_norm": 2.2361698150634766, "learning_rate": 1.702300997496324e-05, "loss": 0.1935, "step": 8000 }, { "epoch": 0.786801406407514, "eval_loss": 0.1714898645877838, "eval_runtime": 18.9887, "eval_samples_per_second": 2.633, "eval_steps_per_second": 1.317, "step": 8000 }, { "epoch": 0.7877849081655234, "grad_norm": 0.6476970911026001, "learning_rate": 1.701903588602313e-05, "loss": 0.3041, "step": 8010 }, { "epoch": 0.7887684099235327, "grad_norm": 0.468868225812912, "learning_rate": 1.701506179708302e-05, "loss": 0.2378, "step": 8020 }, { "epoch": 0.7897519116815421, "grad_norm": 1.6067599058151245, "learning_rate": 1.701108770814291e-05, "loss": 0.2721, "step": 8030 }, { "epoch": 0.7907354134395516, "grad_norm": 1.386991262435913, "learning_rate": 1.70071136192028e-05, "loss": 0.2652, "step": 8040 }, { "epoch": 0.7917189151975609, "grad_norm": 1.6014008522033691, "learning_rate": 1.700313953026269e-05, "loss": 0.2458, "step": 8050 }, { "epoch": 0.7927024169555703, "grad_norm": 1.2447532415390015, "learning_rate": 1.699916544132258e-05, "loss": 0.2731, "step": 8060 }, { "epoch": 0.7936859187135797, "grad_norm": 2.875056743621826, "learning_rate": 1.699519135238247e-05, "loss": 0.3028, "step": 8070 }, { "epoch": 0.7946694204715891, "grad_norm": 2.680910587310791, "learning_rate": 1.6991217263442355e-05, "loss": 0.3273, "step": 8080 }, { "epoch": 0.7956529222295985, "grad_norm": 1.944817304611206, "learning_rate": 1.6987243174502245e-05, "loss": 0.2732, "step": 8090 }, { "epoch": 0.7966364239876079, "grad_norm": 3.19771409034729, "learning_rate": 1.6983269085562134e-05, "loss": 0.1702, "step": 8100 }, { "epoch": 0.7976199257456172, "grad_norm": 0.1668514907360077, "learning_rate": 1.6979294996622027e-05, "loss": 0.3662, "step": 8110 }, { "epoch": 0.7986034275036267, "grad_norm": 1.421762466430664, "learning_rate": 1.6975320907681917e-05, "loss": 0.3515, "step": 8120 }, { "epoch": 0.7995869292616361, "grad_norm": 0.5891507267951965, "learning_rate": 1.6971346818741807e-05, "loss": 0.3687, "step": 8130 }, { "epoch": 0.8005704310196454, "grad_norm": 2.5104644298553467, "learning_rate": 1.6967372729801693e-05, "loss": 0.2561, "step": 8140 }, { "epoch": 0.8015539327776549, "grad_norm": 0.637360692024231, "learning_rate": 1.6963398640861583e-05, "loss": 0.1898, "step": 8150 }, { "epoch": 0.8025374345356643, "grad_norm": 0.8222745656967163, "learning_rate": 1.6959424551921472e-05, "loss": 0.2434, "step": 8160 }, { "epoch": 0.8035209362936736, "grad_norm": 1.6066304445266724, "learning_rate": 1.6955450462981362e-05, "loss": 0.354, "step": 8170 }, { "epoch": 0.804504438051683, "grad_norm": 1.689774751663208, "learning_rate": 1.695147637404125e-05, "loss": 0.3564, "step": 8180 }, { "epoch": 0.8054879398096924, "grad_norm": 1.4014936685562134, "learning_rate": 1.694750228510114e-05, "loss": 0.4074, "step": 8190 }, { "epoch": 0.8064714415677018, "grad_norm": 0.9569060802459717, "learning_rate": 1.694352819616103e-05, "loss": 0.2387, "step": 8200 }, { "epoch": 0.8074549433257112, "grad_norm": 0.9100037813186646, "learning_rate": 1.693955410722092e-05, "loss": 0.2636, "step": 8210 }, { "epoch": 0.8084384450837205, "grad_norm": 0.5279045701026917, "learning_rate": 1.693558001828081e-05, "loss": 0.283, "step": 8220 }, { "epoch": 0.80942194684173, "grad_norm": 1.4399425983428955, "learning_rate": 1.69316059293407e-05, "loss": 0.2874, "step": 8230 }, { "epoch": 0.8104054485997394, "grad_norm": 0.5070255994796753, "learning_rate": 1.692763184040059e-05, "loss": 0.2691, "step": 8240 }, { "epoch": 0.8113889503577487, "grad_norm": 1.517592430114746, "learning_rate": 1.692365775146048e-05, "loss": 0.2214, "step": 8250 }, { "epoch": 0.8123724521157581, "grad_norm": 0.8653010129928589, "learning_rate": 1.691968366252037e-05, "loss": 0.2421, "step": 8260 }, { "epoch": 0.8133559538737676, "grad_norm": 0.9418889284133911, "learning_rate": 1.691570957358026e-05, "loss": 0.2321, "step": 8270 }, { "epoch": 0.8143394556317769, "grad_norm": 1.4153378009796143, "learning_rate": 1.6911735484640148e-05, "loss": 0.2954, "step": 8280 }, { "epoch": 0.8153229573897863, "grad_norm": 1.4692529439926147, "learning_rate": 1.6907761395700038e-05, "loss": 0.3166, "step": 8290 }, { "epoch": 0.8163064591477958, "grad_norm": 1.7206356525421143, "learning_rate": 1.6903787306759928e-05, "loss": 0.2786, "step": 8300 }, { "epoch": 0.8172899609058051, "grad_norm": 2.9285550117492676, "learning_rate": 1.6899813217819817e-05, "loss": 0.2223, "step": 8310 }, { "epoch": 0.8182734626638145, "grad_norm": 1.6375441551208496, "learning_rate": 1.6895839128879704e-05, "loss": 0.4594, "step": 8320 }, { "epoch": 0.8192569644218239, "grad_norm": 1.23320734500885, "learning_rate": 1.6891865039939593e-05, "loss": 0.2349, "step": 8330 }, { "epoch": 0.8202404661798333, "grad_norm": 0.9066193699836731, "learning_rate": 1.6887890950999483e-05, "loss": 0.2678, "step": 8340 }, { "epoch": 0.8212239679378427, "grad_norm": 2.4347150325775146, "learning_rate": 1.6883916862059376e-05, "loss": 0.2813, "step": 8350 }, { "epoch": 0.8222074696958521, "grad_norm": 1.2904943227767944, "learning_rate": 1.6879942773119266e-05, "loss": 0.274, "step": 8360 }, { "epoch": 0.8231909714538614, "grad_norm": 0.9572737812995911, "learning_rate": 1.6875968684179155e-05, "loss": 0.2692, "step": 8370 }, { "epoch": 0.8241744732118709, "grad_norm": 1.2107280492782593, "learning_rate": 1.687199459523904e-05, "loss": 0.2212, "step": 8380 }, { "epoch": 0.8251579749698803, "grad_norm": 0.5360173583030701, "learning_rate": 1.686802050629893e-05, "loss": 0.2378, "step": 8390 }, { "epoch": 0.8261414767278896, "grad_norm": 1.0714260339736938, "learning_rate": 1.686404641735882e-05, "loss": 0.2666, "step": 8400 }, { "epoch": 0.827124978485899, "grad_norm": 1.5305095911026, "learning_rate": 1.686007232841871e-05, "loss": 0.2645, "step": 8410 }, { "epoch": 0.8281084802439085, "grad_norm": 1.4608124494552612, "learning_rate": 1.68560982394786e-05, "loss": 0.374, "step": 8420 }, { "epoch": 0.8290919820019178, "grad_norm": 3.1354146003723145, "learning_rate": 1.685212415053849e-05, "loss": 0.2793, "step": 8430 }, { "epoch": 0.8300754837599272, "grad_norm": 0.5149635076522827, "learning_rate": 1.684815006159838e-05, "loss": 0.3259, "step": 8440 }, { "epoch": 0.8310589855179367, "grad_norm": 1.0487557649612427, "learning_rate": 1.684417597265827e-05, "loss": 0.2507, "step": 8450 }, { "epoch": 0.832042487275946, "grad_norm": 3.3058054447174072, "learning_rate": 1.684020188371816e-05, "loss": 0.3226, "step": 8460 }, { "epoch": 0.8330259890339554, "grad_norm": 1.1440839767456055, "learning_rate": 1.683622779477805e-05, "loss": 0.2999, "step": 8470 }, { "epoch": 0.8340094907919647, "grad_norm": 1.0936781167984009, "learning_rate": 1.6832253705837938e-05, "loss": 0.2374, "step": 8480 }, { "epoch": 0.8349929925499742, "grad_norm": 2.1155974864959717, "learning_rate": 1.6828279616897828e-05, "loss": 0.2307, "step": 8490 }, { "epoch": 0.8359764943079836, "grad_norm": 0.9060395359992981, "learning_rate": 1.6824305527957718e-05, "loss": 0.2469, "step": 8500 }, { "epoch": 0.8359764943079836, "eval_loss": 0.1610024869441986, "eval_runtime": 16.8772, "eval_samples_per_second": 2.963, "eval_steps_per_second": 1.481, "step": 8500 }, { "epoch": 0.8369599960659929, "grad_norm": 1.9168851375579834, "learning_rate": 1.6820331439017607e-05, "loss": 0.3033, "step": 8510 }, { "epoch": 0.8379434978240023, "grad_norm": 1.245879054069519, "learning_rate": 1.6816357350077497e-05, "loss": 0.3564, "step": 8520 }, { "epoch": 0.8389269995820118, "grad_norm": 2.700672149658203, "learning_rate": 1.6812383261137387e-05, "loss": 0.2612, "step": 8530 }, { "epoch": 0.8399105013400211, "grad_norm": 1.1932783126831055, "learning_rate": 1.6808409172197276e-05, "loss": 0.254, "step": 8540 }, { "epoch": 0.8408940030980305, "grad_norm": 1.1512099504470825, "learning_rate": 1.6804435083257166e-05, "loss": 0.2405, "step": 8550 }, { "epoch": 0.84187750485604, "grad_norm": 0.9437609314918518, "learning_rate": 1.6800460994317052e-05, "loss": 0.2365, "step": 8560 }, { "epoch": 0.8428610066140493, "grad_norm": 2.9077837467193604, "learning_rate": 1.6796486905376942e-05, "loss": 0.3955, "step": 8570 }, { "epoch": 0.8438445083720587, "grad_norm": 2.052456855773926, "learning_rate": 1.679251281643683e-05, "loss": 0.2618, "step": 8580 }, { "epoch": 0.8448280101300681, "grad_norm": 0.6724761128425598, "learning_rate": 1.6788538727496725e-05, "loss": 0.2363, "step": 8590 }, { "epoch": 0.8458115118880775, "grad_norm": 3.1223673820495605, "learning_rate": 1.6784564638556614e-05, "loss": 0.4068, "step": 8600 }, { "epoch": 0.8467950136460869, "grad_norm": 0.6590930819511414, "learning_rate": 1.6780590549616504e-05, "loss": 0.3605, "step": 8610 }, { "epoch": 0.8477785154040963, "grad_norm": 1.586635947227478, "learning_rate": 1.677661646067639e-05, "loss": 0.3407, "step": 8620 }, { "epoch": 0.8487620171621056, "grad_norm": 1.203317403793335, "learning_rate": 1.677264237173628e-05, "loss": 0.2719, "step": 8630 }, { "epoch": 0.8497455189201151, "grad_norm": 1.2657392024993896, "learning_rate": 1.676866828279617e-05, "loss": 0.2217, "step": 8640 }, { "epoch": 0.8507290206781245, "grad_norm": 2.0403079986572266, "learning_rate": 1.676469419385606e-05, "loss": 0.291, "step": 8650 }, { "epoch": 0.8517125224361338, "grad_norm": 1.1299984455108643, "learning_rate": 1.676072010491595e-05, "loss": 0.2155, "step": 8660 }, { "epoch": 0.8526960241941433, "grad_norm": 3.073261260986328, "learning_rate": 1.675674601597584e-05, "loss": 0.2985, "step": 8670 }, { "epoch": 0.8536795259521527, "grad_norm": 2.1931588649749756, "learning_rate": 1.6752771927035728e-05, "loss": 0.2018, "step": 8680 }, { "epoch": 0.854663027710162, "grad_norm": 1.0738897323608398, "learning_rate": 1.6748797838095618e-05, "loss": 0.2533, "step": 8690 }, { "epoch": 0.8556465294681714, "grad_norm": 2.443499803543091, "learning_rate": 1.6744823749155507e-05, "loss": 0.2284, "step": 8700 }, { "epoch": 0.8566300312261809, "grad_norm": 1.1267093420028687, "learning_rate": 1.6740849660215397e-05, "loss": 0.2141, "step": 8710 }, { "epoch": 0.8576135329841902, "grad_norm": 0.6887378096580505, "learning_rate": 1.6736875571275287e-05, "loss": 0.1738, "step": 8720 }, { "epoch": 0.8585970347421996, "grad_norm": 1.0813156366348267, "learning_rate": 1.6732901482335176e-05, "loss": 0.1604, "step": 8730 }, { "epoch": 0.859580536500209, "grad_norm": 2.0310513973236084, "learning_rate": 1.6728927393395066e-05, "loss": 0.3649, "step": 8740 }, { "epoch": 0.8605640382582184, "grad_norm": 1.7131977081298828, "learning_rate": 1.6724953304454956e-05, "loss": 0.2718, "step": 8750 }, { "epoch": 0.8615475400162278, "grad_norm": 1.4873853921890259, "learning_rate": 1.6720979215514845e-05, "loss": 0.3012, "step": 8760 }, { "epoch": 0.8625310417742371, "grad_norm": 2.010711669921875, "learning_rate": 1.6717005126574735e-05, "loss": 0.3212, "step": 8770 }, { "epoch": 0.8635145435322465, "grad_norm": 2.3279435634613037, "learning_rate": 1.6713031037634625e-05, "loss": 0.2521, "step": 8780 }, { "epoch": 0.864498045290256, "grad_norm": 2.6230878829956055, "learning_rate": 1.6709056948694514e-05, "loss": 0.2162, "step": 8790 }, { "epoch": 0.8654815470482653, "grad_norm": 1.1890896558761597, "learning_rate": 1.67050828597544e-05, "loss": 0.2358, "step": 8800 }, { "epoch": 0.8664650488062747, "grad_norm": 0.9919412732124329, "learning_rate": 1.670110877081429e-05, "loss": 0.3321, "step": 8810 }, { "epoch": 0.8674485505642842, "grad_norm": 0.4444110095500946, "learning_rate": 1.669713468187418e-05, "loss": 0.1969, "step": 8820 }, { "epoch": 0.8684320523222935, "grad_norm": 0.6418336629867554, "learning_rate": 1.6693160592934073e-05, "loss": 0.2933, "step": 8830 }, { "epoch": 0.8694155540803029, "grad_norm": 1.0164114236831665, "learning_rate": 1.6689186503993963e-05, "loss": 0.2688, "step": 8840 }, { "epoch": 0.8703990558383123, "grad_norm": 2.1948771476745605, "learning_rate": 1.6685212415053852e-05, "loss": 0.2807, "step": 8850 }, { "epoch": 0.8713825575963217, "grad_norm": 1.2677373886108398, "learning_rate": 1.668123832611374e-05, "loss": 0.2645, "step": 8860 }, { "epoch": 0.8723660593543311, "grad_norm": 2.6350746154785156, "learning_rate": 1.667726423717363e-05, "loss": 0.2108, "step": 8870 }, { "epoch": 0.8733495611123405, "grad_norm": 1.0468815565109253, "learning_rate": 1.6673290148233518e-05, "loss": 0.2295, "step": 8880 }, { "epoch": 0.8743330628703498, "grad_norm": 0.9300365447998047, "learning_rate": 1.6669316059293408e-05, "loss": 0.198, "step": 8890 }, { "epoch": 0.8753165646283593, "grad_norm": 1.8281514644622803, "learning_rate": 1.6665341970353297e-05, "loss": 0.3443, "step": 8900 }, { "epoch": 0.8763000663863687, "grad_norm": 0.6567210555076599, "learning_rate": 1.6661367881413187e-05, "loss": 0.2858, "step": 8910 }, { "epoch": 0.877283568144378, "grad_norm": 6.081761360168457, "learning_rate": 1.6657393792473077e-05, "loss": 0.3481, "step": 8920 }, { "epoch": 0.8782670699023875, "grad_norm": 0.5137054324150085, "learning_rate": 1.6653419703532966e-05, "loss": 0.1864, "step": 8930 }, { "epoch": 0.8792505716603969, "grad_norm": 0.3564288318157196, "learning_rate": 1.6649445614592856e-05, "loss": 0.2181, "step": 8940 }, { "epoch": 0.8802340734184062, "grad_norm": 0.3564998209476471, "learning_rate": 1.6645471525652746e-05, "loss": 0.1426, "step": 8950 }, { "epoch": 0.8812175751764156, "grad_norm": 0.7296929359436035, "learning_rate": 1.6641497436712635e-05, "loss": 0.36, "step": 8960 }, { "epoch": 0.8822010769344251, "grad_norm": 1.9744415283203125, "learning_rate": 1.6637523347772525e-05, "loss": 0.2383, "step": 8970 }, { "epoch": 0.8831845786924344, "grad_norm": 1.7271389961242676, "learning_rate": 1.6633549258832415e-05, "loss": 0.182, "step": 8980 }, { "epoch": 0.8841680804504438, "grad_norm": 1.1774275302886963, "learning_rate": 1.66295751698923e-05, "loss": 0.2283, "step": 8990 }, { "epoch": 0.8851515822084532, "grad_norm": 0.5575772523880005, "learning_rate": 1.6625601080952194e-05, "loss": 0.3044, "step": 9000 }, { "epoch": 0.8851515822084532, "eval_loss": 0.1609308272600174, "eval_runtime": 19.1494, "eval_samples_per_second": 2.611, "eval_steps_per_second": 1.306, "step": 9000 }, { "epoch": 0.8861350839664626, "grad_norm": 2.1011977195739746, "learning_rate": 1.6621626992012084e-05, "loss": 0.264, "step": 9010 }, { "epoch": 0.887118585724472, "grad_norm": 1.9053205251693726, "learning_rate": 1.6617652903071973e-05, "loss": 0.2125, "step": 9020 }, { "epoch": 0.8881020874824814, "grad_norm": 1.034562349319458, "learning_rate": 1.6613678814131863e-05, "loss": 0.1994, "step": 9030 }, { "epoch": 0.8890855892404907, "grad_norm": 0.4264748990535736, "learning_rate": 1.660970472519175e-05, "loss": 0.2756, "step": 9040 }, { "epoch": 0.8900690909985002, "grad_norm": 2.293015241622925, "learning_rate": 1.660573063625164e-05, "loss": 0.2315, "step": 9050 }, { "epoch": 0.8910525927565095, "grad_norm": 1.570285439491272, "learning_rate": 1.660175654731153e-05, "loss": 0.3664, "step": 9060 }, { "epoch": 0.8920360945145189, "grad_norm": 0.8685938119888306, "learning_rate": 1.6597782458371418e-05, "loss": 0.2805, "step": 9070 }, { "epoch": 0.8930195962725284, "grad_norm": 1.1349201202392578, "learning_rate": 1.659380836943131e-05, "loss": 0.3772, "step": 9080 }, { "epoch": 0.8940030980305377, "grad_norm": 2.440044403076172, "learning_rate": 1.65898342804912e-05, "loss": 0.2051, "step": 9090 }, { "epoch": 0.8949865997885471, "grad_norm": 0.9280198216438293, "learning_rate": 1.6585860191551087e-05, "loss": 0.2351, "step": 9100 }, { "epoch": 0.8959701015465565, "grad_norm": 0.7486225962638855, "learning_rate": 1.6581886102610977e-05, "loss": 0.2782, "step": 9110 }, { "epoch": 0.8969536033045659, "grad_norm": 1.4163689613342285, "learning_rate": 1.6577912013670867e-05, "loss": 0.3084, "step": 9120 }, { "epoch": 0.8979371050625753, "grad_norm": 0.8505094647407532, "learning_rate": 1.6573937924730756e-05, "loss": 0.2212, "step": 9130 }, { "epoch": 0.8989206068205847, "grad_norm": 1.43965482711792, "learning_rate": 1.6569963835790646e-05, "loss": 0.2867, "step": 9140 }, { "epoch": 0.899904108578594, "grad_norm": 1.275529384613037, "learning_rate": 1.6565989746850536e-05, "loss": 0.2106, "step": 9150 }, { "epoch": 0.9008876103366035, "grad_norm": 1.2573920488357544, "learning_rate": 1.6562015657910425e-05, "loss": 0.2128, "step": 9160 }, { "epoch": 0.9018711120946129, "grad_norm": 1.113728642463684, "learning_rate": 1.6558041568970315e-05, "loss": 0.3224, "step": 9170 }, { "epoch": 0.9028546138526222, "grad_norm": 2.209773063659668, "learning_rate": 1.6554067480030205e-05, "loss": 0.2121, "step": 9180 }, { "epoch": 0.9038381156106317, "grad_norm": 1.6331260204315186, "learning_rate": 1.6550093391090094e-05, "loss": 0.2636, "step": 9190 }, { "epoch": 0.9048216173686411, "grad_norm": 0.5104572176933289, "learning_rate": 1.6546119302149984e-05, "loss": 0.1854, "step": 9200 }, { "epoch": 0.9058051191266504, "grad_norm": 2.398444414138794, "learning_rate": 1.6542145213209874e-05, "loss": 0.2637, "step": 9210 }, { "epoch": 0.9067886208846598, "grad_norm": 2.7371108531951904, "learning_rate": 1.6538171124269763e-05, "loss": 0.329, "step": 9220 }, { "epoch": 0.9077721226426693, "grad_norm": 0.7008442878723145, "learning_rate": 1.653419703532965e-05, "loss": 0.2298, "step": 9230 }, { "epoch": 0.9087556244006786, "grad_norm": 1.478804349899292, "learning_rate": 1.6530222946389543e-05, "loss": 0.2447, "step": 9240 }, { "epoch": 0.909739126158688, "grad_norm": 2.0109310150146484, "learning_rate": 1.6526248857449432e-05, "loss": 0.3323, "step": 9250 }, { "epoch": 0.9107226279166974, "grad_norm": 0.9319548010826111, "learning_rate": 1.6522274768509322e-05, "loss": 0.3731, "step": 9260 }, { "epoch": 0.9117061296747068, "grad_norm": 0.899541974067688, "learning_rate": 1.651830067956921e-05, "loss": 0.2662, "step": 9270 }, { "epoch": 0.9126896314327162, "grad_norm": 2.4901561737060547, "learning_rate": 1.6514326590629098e-05, "loss": 0.2997, "step": 9280 }, { "epoch": 0.9136731331907256, "grad_norm": 2.7003557682037354, "learning_rate": 1.6510352501688988e-05, "loss": 0.2364, "step": 9290 }, { "epoch": 0.914656634948735, "grad_norm": 1.2085120677947998, "learning_rate": 1.6506378412748877e-05, "loss": 0.2667, "step": 9300 }, { "epoch": 0.9156401367067444, "grad_norm": 0.7746391892433167, "learning_rate": 1.6502404323808767e-05, "loss": 0.3026, "step": 9310 }, { "epoch": 0.9166236384647538, "grad_norm": 1.1951857805252075, "learning_rate": 1.649843023486866e-05, "loss": 0.332, "step": 9320 }, { "epoch": 0.9176071402227631, "grad_norm": 0.9990243315696716, "learning_rate": 1.649445614592855e-05, "loss": 0.2422, "step": 9330 }, { "epoch": 0.9185906419807726, "grad_norm": 1.8502593040466309, "learning_rate": 1.6490482056988436e-05, "loss": 0.3554, "step": 9340 }, { "epoch": 0.9195741437387819, "grad_norm": 1.1711889505386353, "learning_rate": 1.6486507968048325e-05, "loss": 0.2293, "step": 9350 }, { "epoch": 0.9205576454967913, "grad_norm": 2.536191463470459, "learning_rate": 1.6482533879108215e-05, "loss": 0.2577, "step": 9360 }, { "epoch": 0.9215411472548007, "grad_norm": 0.34721025824546814, "learning_rate": 1.6478559790168105e-05, "loss": 0.2612, "step": 9370 }, { "epoch": 0.9225246490128101, "grad_norm": 1.5637190341949463, "learning_rate": 1.6474585701227994e-05, "loss": 0.2713, "step": 9380 }, { "epoch": 0.9235081507708195, "grad_norm": 2.7713522911071777, "learning_rate": 1.6470611612287884e-05, "loss": 0.3551, "step": 9390 }, { "epoch": 0.9244916525288289, "grad_norm": 1.4273444414138794, "learning_rate": 1.6466637523347774e-05, "loss": 0.35, "step": 9400 }, { "epoch": 0.9254751542868382, "grad_norm": 2.1872549057006836, "learning_rate": 1.6462663434407663e-05, "loss": 0.2972, "step": 9410 }, { "epoch": 0.9264586560448477, "grad_norm": 1.7454928159713745, "learning_rate": 1.6458689345467553e-05, "loss": 0.3815, "step": 9420 }, { "epoch": 0.9274421578028571, "grad_norm": 1.8231502771377563, "learning_rate": 1.6454715256527443e-05, "loss": 0.2866, "step": 9430 }, { "epoch": 0.9284256595608664, "grad_norm": 1.008145809173584, "learning_rate": 1.6450741167587332e-05, "loss": 0.1845, "step": 9440 }, { "epoch": 0.9294091613188759, "grad_norm": 1.538116216659546, "learning_rate": 1.6446767078647222e-05, "loss": 0.2184, "step": 9450 }, { "epoch": 0.9303926630768853, "grad_norm": 1.898038387298584, "learning_rate": 1.6442792989707112e-05, "loss": 0.2032, "step": 9460 }, { "epoch": 0.9313761648348946, "grad_norm": 0.3986801505088806, "learning_rate": 1.6438818900766998e-05, "loss": 0.3476, "step": 9470 }, { "epoch": 0.932359666592904, "grad_norm": 1.6517155170440674, "learning_rate": 1.643484481182689e-05, "loss": 0.2283, "step": 9480 }, { "epoch": 0.9333431683509135, "grad_norm": 2.5912892818450928, "learning_rate": 1.643087072288678e-05, "loss": 0.235, "step": 9490 }, { "epoch": 0.9343266701089228, "grad_norm": 3.804778814315796, "learning_rate": 1.642689663394667e-05, "loss": 0.2845, "step": 9500 }, { "epoch": 0.9343266701089228, "eval_loss": 0.16946163773536682, "eval_runtime": 18.2063, "eval_samples_per_second": 2.746, "eval_steps_per_second": 1.373, "step": 9500 }, { "epoch": 0.9353101718669322, "grad_norm": 2.673412799835205, "learning_rate": 1.642292254500656e-05, "loss": 0.33, "step": 9510 }, { "epoch": 0.9362936736249416, "grad_norm": 0.5579766631126404, "learning_rate": 1.6418948456066446e-05, "loss": 0.4, "step": 9520 }, { "epoch": 0.937277175382951, "grad_norm": 1.3179048299789429, "learning_rate": 1.6414974367126336e-05, "loss": 0.28, "step": 9530 }, { "epoch": 0.9382606771409604, "grad_norm": 1.813857078552246, "learning_rate": 1.6411000278186226e-05, "loss": 0.2542, "step": 9540 }, { "epoch": 0.9392441788989698, "grad_norm": 8.707640647888184, "learning_rate": 1.6407026189246115e-05, "loss": 0.2239, "step": 9550 }, { "epoch": 0.9402276806569791, "grad_norm": 0.7323013544082642, "learning_rate": 1.640305210030601e-05, "loss": 0.2907, "step": 9560 }, { "epoch": 0.9412111824149886, "grad_norm": 0.32580333948135376, "learning_rate": 1.6399078011365898e-05, "loss": 0.2646, "step": 9570 }, { "epoch": 0.942194684172998, "grad_norm": 1.0481038093566895, "learning_rate": 1.6395103922425784e-05, "loss": 0.2836, "step": 9580 }, { "epoch": 0.9431781859310073, "grad_norm": 1.2796183824539185, "learning_rate": 1.6391129833485674e-05, "loss": 0.2562, "step": 9590 }, { "epoch": 0.9441616876890168, "grad_norm": 2.2194364070892334, "learning_rate": 1.6387155744545564e-05, "loss": 0.3578, "step": 9600 }, { "epoch": 0.9451451894470262, "grad_norm": 1.8737331628799438, "learning_rate": 1.6383181655605453e-05, "loss": 0.1999, "step": 9610 }, { "epoch": 0.9461286912050355, "grad_norm": 1.1581262350082397, "learning_rate": 1.6379207566665343e-05, "loss": 0.1792, "step": 9620 }, { "epoch": 0.9471121929630449, "grad_norm": 1.0477012395858765, "learning_rate": 1.6375233477725233e-05, "loss": 0.2503, "step": 9630 }, { "epoch": 0.9480956947210543, "grad_norm": 3.4494800567626953, "learning_rate": 1.6371259388785122e-05, "loss": 0.2906, "step": 9640 }, { "epoch": 0.9490791964790637, "grad_norm": 1.5456995964050293, "learning_rate": 1.6367285299845012e-05, "loss": 0.2774, "step": 9650 }, { "epoch": 0.9500626982370731, "grad_norm": 3.0428876876831055, "learning_rate": 1.6363311210904902e-05, "loss": 0.219, "step": 9660 }, { "epoch": 0.9510461999950824, "grad_norm": 0.7785629630088806, "learning_rate": 1.635933712196479e-05, "loss": 0.2467, "step": 9670 }, { "epoch": 0.9520297017530919, "grad_norm": 2.675586462020874, "learning_rate": 1.635536303302468e-05, "loss": 0.2585, "step": 9680 }, { "epoch": 0.9530132035111013, "grad_norm": 2.9050750732421875, "learning_rate": 1.635138894408457e-05, "loss": 0.3041, "step": 9690 }, { "epoch": 0.9539967052691106, "grad_norm": 1.8580806255340576, "learning_rate": 1.634741485514446e-05, "loss": 0.3407, "step": 9700 }, { "epoch": 0.95498020702712, "grad_norm": 1.26399827003479, "learning_rate": 1.6343440766204347e-05, "loss": 0.3217, "step": 9710 }, { "epoch": 0.9559637087851295, "grad_norm": 2.8637497425079346, "learning_rate": 1.6339466677264236e-05, "loss": 0.1938, "step": 9720 }, { "epoch": 0.9569472105431388, "grad_norm": 1.6564754247665405, "learning_rate": 1.633549258832413e-05, "loss": 0.2557, "step": 9730 }, { "epoch": 0.9579307123011482, "grad_norm": 1.5192463397979736, "learning_rate": 1.633151849938402e-05, "loss": 0.3262, "step": 9740 }, { "epoch": 0.9589142140591577, "grad_norm": 5.074928283691406, "learning_rate": 1.632754441044391e-05, "loss": 0.2573, "step": 9750 }, { "epoch": 0.959897715817167, "grad_norm": 0.720064640045166, "learning_rate": 1.6323570321503795e-05, "loss": 0.2451, "step": 9760 }, { "epoch": 0.9608812175751764, "grad_norm": 2.334301233291626, "learning_rate": 1.6319596232563685e-05, "loss": 0.2748, "step": 9770 }, { "epoch": 0.9618647193331858, "grad_norm": 0.672520101070404, "learning_rate": 1.6315622143623574e-05, "loss": 0.2755, "step": 9780 }, { "epoch": 0.9628482210911952, "grad_norm": 1.1749473810195923, "learning_rate": 1.6311648054683464e-05, "loss": 0.3088, "step": 9790 }, { "epoch": 0.9638317228492046, "grad_norm": 1.1671876907348633, "learning_rate": 1.6307673965743354e-05, "loss": 0.3445, "step": 9800 }, { "epoch": 0.964815224607214, "grad_norm": 1.8781317472457886, "learning_rate": 1.6303699876803247e-05, "loss": 0.1894, "step": 9810 }, { "epoch": 0.9657987263652233, "grad_norm": 2.0575361251831055, "learning_rate": 1.6299725787863133e-05, "loss": 0.4052, "step": 9820 }, { "epoch": 0.9667822281232328, "grad_norm": 1.0505220890045166, "learning_rate": 1.6295751698923023e-05, "loss": 0.2261, "step": 9830 }, { "epoch": 0.9677657298812422, "grad_norm": 1.2270594835281372, "learning_rate": 1.6291777609982912e-05, "loss": 0.3129, "step": 9840 }, { "epoch": 0.9687492316392515, "grad_norm": 1.5712100267410278, "learning_rate": 1.6287803521042802e-05, "loss": 0.2792, "step": 9850 }, { "epoch": 0.969732733397261, "grad_norm": 1.1416723728179932, "learning_rate": 1.628382943210269e-05, "loss": 0.1883, "step": 9860 }, { "epoch": 0.9707162351552704, "grad_norm": 2.3748650550842285, "learning_rate": 1.627985534316258e-05, "loss": 0.3079, "step": 9870 }, { "epoch": 0.9716997369132797, "grad_norm": 0.8768253326416016, "learning_rate": 1.627588125422247e-05, "loss": 0.3344, "step": 9880 }, { "epoch": 0.9726832386712891, "grad_norm": 1.2483183145523071, "learning_rate": 1.627190716528236e-05, "loss": 0.3065, "step": 9890 }, { "epoch": 0.9736667404292986, "grad_norm": 0.6402847766876221, "learning_rate": 1.626793307634225e-05, "loss": 0.271, "step": 9900 }, { "epoch": 0.9746502421873079, "grad_norm": 1.9914218187332153, "learning_rate": 1.626395898740214e-05, "loss": 0.1675, "step": 9910 }, { "epoch": 0.9756337439453173, "grad_norm": 1.2543543577194214, "learning_rate": 1.625998489846203e-05, "loss": 0.2689, "step": 9920 }, { "epoch": 0.9766172457033266, "grad_norm": 1.3631449937820435, "learning_rate": 1.625601080952192e-05, "loss": 0.2454, "step": 9930 }, { "epoch": 0.9776007474613361, "grad_norm": 2.1351165771484375, "learning_rate": 1.625203672058181e-05, "loss": 0.2253, "step": 9940 }, { "epoch": 0.9785842492193455, "grad_norm": 0.6863617300987244, "learning_rate": 1.6248062631641695e-05, "loss": 0.3127, "step": 9950 }, { "epoch": 0.9795677509773548, "grad_norm": 1.2482006549835205, "learning_rate": 1.6244088542701585e-05, "loss": 0.3138, "step": 9960 }, { "epoch": 0.9805512527353643, "grad_norm": 1.801457166671753, "learning_rate": 1.6240114453761478e-05, "loss": 0.2613, "step": 9970 }, { "epoch": 0.9815347544933737, "grad_norm": 2.327338457107544, "learning_rate": 1.6236140364821368e-05, "loss": 0.1701, "step": 9980 }, { "epoch": 0.982518256251383, "grad_norm": 0.7098455429077148, "learning_rate": 1.6232166275881257e-05, "loss": 0.3116, "step": 9990 }, { "epoch": 0.9835017580093924, "grad_norm": 1.502175211906433, "learning_rate": 1.6228192186941144e-05, "loss": 0.274, "step": 10000 }, { "epoch": 0.9835017580093924, "eval_loss": 0.16346381604671478, "eval_runtime": 17.3264, "eval_samples_per_second": 2.886, "eval_steps_per_second": 1.443, "step": 10000 }, { "epoch": 0.9844852597674019, "grad_norm": 0.8948222398757935, "learning_rate": 1.6224218098001033e-05, "loss": 0.1495, "step": 10010 }, { "epoch": 0.9854687615254112, "grad_norm": 0.9255267381668091, "learning_rate": 1.6220244009060923e-05, "loss": 0.3118, "step": 10020 }, { "epoch": 0.9864522632834206, "grad_norm": 3.8730742931365967, "learning_rate": 1.6216269920120813e-05, "loss": 0.3908, "step": 10030 }, { "epoch": 0.98743576504143, "grad_norm": 1.2976429462432861, "learning_rate": 1.6212295831180702e-05, "loss": 0.2071, "step": 10040 }, { "epoch": 0.9884192667994394, "grad_norm": 0.9446991682052612, "learning_rate": 1.6208321742240595e-05, "loss": 0.3687, "step": 10050 }, { "epoch": 0.9894027685574488, "grad_norm": 3.9752187728881836, "learning_rate": 1.620434765330048e-05, "loss": 0.2103, "step": 10060 }, { "epoch": 0.9903862703154582, "grad_norm": 2.023470878601074, "learning_rate": 1.620037356436037e-05, "loss": 0.2543, "step": 10070 }, { "epoch": 0.9913697720734675, "grad_norm": 1.043302297592163, "learning_rate": 1.619639947542026e-05, "loss": 0.2258, "step": 10080 }, { "epoch": 0.992353273831477, "grad_norm": 3.0941057205200195, "learning_rate": 1.619242538648015e-05, "loss": 0.3121, "step": 10090 }, { "epoch": 0.9933367755894864, "grad_norm": 1.3577592372894287, "learning_rate": 1.618845129754004e-05, "loss": 0.1366, "step": 10100 }, { "epoch": 0.9943202773474957, "grad_norm": 1.7372103929519653, "learning_rate": 1.618447720859993e-05, "loss": 0.2481, "step": 10110 }, { "epoch": 0.9953037791055052, "grad_norm": 2.0239365100860596, "learning_rate": 1.618050311965982e-05, "loss": 0.3117, "step": 10120 }, { "epoch": 0.9962872808635146, "grad_norm": 2.3687198162078857, "learning_rate": 1.617652903071971e-05, "loss": 0.3065, "step": 10130 }, { "epoch": 0.9972707826215239, "grad_norm": 2.562297821044922, "learning_rate": 1.61725549417796e-05, "loss": 0.2936, "step": 10140 }, { "epoch": 0.9982542843795333, "grad_norm": 1.0834999084472656, "learning_rate": 1.616858085283949e-05, "loss": 0.2266, "step": 10150 }, { "epoch": 0.9992377861375428, "grad_norm": 0.6850488185882568, "learning_rate": 1.6164606763899378e-05, "loss": 0.3046, "step": 10160 }, { "epoch": 1.000221287895552, "grad_norm": 1.008405089378357, "learning_rate": 1.6160632674959268e-05, "loss": 0.307, "step": 10170 }, { "epoch": 1.0012047896535614, "grad_norm": 1.3072564601898193, "learning_rate": 1.6156658586019158e-05, "loss": 0.2647, "step": 10180 }, { "epoch": 1.002188291411571, "grad_norm": 1.2347207069396973, "learning_rate": 1.6152684497079044e-05, "loss": 0.2033, "step": 10190 }, { "epoch": 1.0031717931695803, "grad_norm": 1.1007719039916992, "learning_rate": 1.6148710408138933e-05, "loss": 0.145, "step": 10200 }, { "epoch": 1.0041552949275896, "grad_norm": 0.5624765753746033, "learning_rate": 1.6144736319198826e-05, "loss": 0.1642, "step": 10210 }, { "epoch": 1.0051387966855991, "grad_norm": 2.5909204483032227, "learning_rate": 1.6140762230258716e-05, "loss": 0.2944, "step": 10220 }, { "epoch": 1.0061222984436085, "grad_norm": 1.661992073059082, "learning_rate": 1.6136788141318606e-05, "loss": 0.194, "step": 10230 }, { "epoch": 1.0071058002016178, "grad_norm": 1.6907767057418823, "learning_rate": 1.6132814052378492e-05, "loss": 0.2617, "step": 10240 }, { "epoch": 1.0080893019596273, "grad_norm": 1.1987255811691284, "learning_rate": 1.6128839963438382e-05, "loss": 0.2384, "step": 10250 }, { "epoch": 1.0090728037176366, "grad_norm": 0.7288364171981812, "learning_rate": 1.612486587449827e-05, "loss": 0.244, "step": 10260 }, { "epoch": 1.010056305475646, "grad_norm": 1.0956259965896606, "learning_rate": 1.612089178555816e-05, "loss": 0.2002, "step": 10270 }, { "epoch": 1.0110398072336555, "grad_norm": 1.2467955350875854, "learning_rate": 1.611691769661805e-05, "loss": 0.1787, "step": 10280 }, { "epoch": 1.0120233089916648, "grad_norm": 0.8599863648414612, "learning_rate": 1.6112943607677944e-05, "loss": 0.2282, "step": 10290 }, { "epoch": 1.0130068107496741, "grad_norm": 0.4900651276111603, "learning_rate": 1.610896951873783e-05, "loss": 0.2283, "step": 10300 }, { "epoch": 1.0139903125076837, "grad_norm": 1.299378752708435, "learning_rate": 1.610499542979772e-05, "loss": 0.2481, "step": 10310 }, { "epoch": 1.014973814265693, "grad_norm": 1.2478209733963013, "learning_rate": 1.610102134085761e-05, "loss": 0.2631, "step": 10320 }, { "epoch": 1.0159573160237023, "grad_norm": 1.5120837688446045, "learning_rate": 1.60970472519175e-05, "loss": 0.2013, "step": 10330 }, { "epoch": 1.0169408177817119, "grad_norm": 0.5610188245773315, "learning_rate": 1.609307316297739e-05, "loss": 0.3882, "step": 10340 }, { "epoch": 1.0179243195397212, "grad_norm": 1.6709067821502686, "learning_rate": 1.608909907403728e-05, "loss": 0.2784, "step": 10350 }, { "epoch": 1.0189078212977305, "grad_norm": 0.8348563313484192, "learning_rate": 1.6085124985097168e-05, "loss": 0.2556, "step": 10360 }, { "epoch": 1.01989132305574, "grad_norm": 2.1276087760925293, "learning_rate": 1.6081150896157058e-05, "loss": 0.2456, "step": 10370 }, { "epoch": 1.0208748248137494, "grad_norm": 1.0101721286773682, "learning_rate": 1.6077176807216947e-05, "loss": 0.2188, "step": 10380 }, { "epoch": 1.0218583265717587, "grad_norm": 0.9049150347709656, "learning_rate": 1.6073202718276837e-05, "loss": 0.1653, "step": 10390 }, { "epoch": 1.0228418283297682, "grad_norm": 2.162771224975586, "learning_rate": 1.6069228629336727e-05, "loss": 0.2223, "step": 10400 }, { "epoch": 1.0238253300877775, "grad_norm": 1.646711826324463, "learning_rate": 1.6065254540396616e-05, "loss": 0.2035, "step": 10410 }, { "epoch": 1.0248088318457869, "grad_norm": 0.6415451169013977, "learning_rate": 1.6061280451456506e-05, "loss": 0.3438, "step": 10420 }, { "epoch": 1.0257923336037964, "grad_norm": 0.9765540361404419, "learning_rate": 1.6057306362516392e-05, "loss": 0.2076, "step": 10430 }, { "epoch": 1.0267758353618057, "grad_norm": 0.8718576431274414, "learning_rate": 1.6053332273576282e-05, "loss": 0.2563, "step": 10440 }, { "epoch": 1.027759337119815, "grad_norm": 3.8872036933898926, "learning_rate": 1.6049358184636175e-05, "loss": 0.2637, "step": 10450 }, { "epoch": 1.0287428388778246, "grad_norm": 0.34129858016967773, "learning_rate": 1.6045384095696065e-05, "loss": 0.2198, "step": 10460 }, { "epoch": 1.029726340635834, "grad_norm": 0.7682720422744751, "learning_rate": 1.6041410006755954e-05, "loss": 0.3239, "step": 10470 }, { "epoch": 1.0307098423938432, "grad_norm": 2.776768445968628, "learning_rate": 1.603743591781584e-05, "loss": 0.1555, "step": 10480 }, { "epoch": 1.0316933441518528, "grad_norm": 1.3365027904510498, "learning_rate": 1.603346182887573e-05, "loss": 0.2112, "step": 10490 }, { "epoch": 1.032676845909862, "grad_norm": 1.3282992839813232, "learning_rate": 1.602948773993562e-05, "loss": 0.195, "step": 10500 }, { "epoch": 1.032676845909862, "eval_loss": 0.1750352829694748, "eval_runtime": 16.9966, "eval_samples_per_second": 2.942, "eval_steps_per_second": 1.471, "step": 10500 }, { "epoch": 1.0336603476678714, "grad_norm": 1.3822355270385742, "learning_rate": 1.602551365099551e-05, "loss": 0.1851, "step": 10510 }, { "epoch": 1.034643849425881, "grad_norm": 1.0904524326324463, "learning_rate": 1.60215395620554e-05, "loss": 0.1616, "step": 10520 }, { "epoch": 1.0356273511838903, "grad_norm": 0.7074946165084839, "learning_rate": 1.601756547311529e-05, "loss": 0.2985, "step": 10530 }, { "epoch": 1.0366108529418996, "grad_norm": 0.7284005284309387, "learning_rate": 1.601359138417518e-05, "loss": 0.3503, "step": 10540 }, { "epoch": 1.0375943546999091, "grad_norm": 0.3212083578109741, "learning_rate": 1.600961729523507e-05, "loss": 0.2078, "step": 10550 }, { "epoch": 1.0385778564579184, "grad_norm": 1.210832953453064, "learning_rate": 1.6005643206294958e-05, "loss": 0.242, "step": 10560 }, { "epoch": 1.0395613582159278, "grad_norm": 1.1374540328979492, "learning_rate": 1.6001669117354848e-05, "loss": 0.2138, "step": 10570 }, { "epoch": 1.0405448599739373, "grad_norm": 0.281815767288208, "learning_rate": 1.5997695028414737e-05, "loss": 0.2453, "step": 10580 }, { "epoch": 1.0415283617319466, "grad_norm": 0.8695719242095947, "learning_rate": 1.5993720939474627e-05, "loss": 0.1835, "step": 10590 }, { "epoch": 1.042511863489956, "grad_norm": 1.4869102239608765, "learning_rate": 1.5989746850534517e-05, "loss": 0.2998, "step": 10600 }, { "epoch": 1.0434953652479653, "grad_norm": 2.7580795288085938, "learning_rate": 1.5985772761594406e-05, "loss": 0.3063, "step": 10610 }, { "epoch": 1.0444788670059748, "grad_norm": 0.5467950701713562, "learning_rate": 1.5981798672654296e-05, "loss": 0.2473, "step": 10620 }, { "epoch": 1.0454623687639841, "grad_norm": 1.4874897003173828, "learning_rate": 1.5977824583714186e-05, "loss": 0.2189, "step": 10630 }, { "epoch": 1.0464458705219934, "grad_norm": 1.1254550218582153, "learning_rate": 1.5973850494774075e-05, "loss": 0.2375, "step": 10640 }, { "epoch": 1.047429372280003, "grad_norm": 1.4423521757125854, "learning_rate": 1.5969876405833965e-05, "loss": 0.1031, "step": 10650 }, { "epoch": 1.0484128740380123, "grad_norm": 0.7423785328865051, "learning_rate": 1.5965902316893855e-05, "loss": 0.2293, "step": 10660 }, { "epoch": 1.0493963757960216, "grad_norm": 0.626494824886322, "learning_rate": 1.596192822795374e-05, "loss": 0.1934, "step": 10670 }, { "epoch": 1.0503798775540312, "grad_norm": 1.428344488143921, "learning_rate": 1.595795413901363e-05, "loss": 0.2119, "step": 10680 }, { "epoch": 1.0513633793120405, "grad_norm": 0.5278069376945496, "learning_rate": 1.595398005007352e-05, "loss": 0.3225, "step": 10690 }, { "epoch": 1.0523468810700498, "grad_norm": 0.3313775956630707, "learning_rate": 1.5950005961133413e-05, "loss": 0.1946, "step": 10700 }, { "epoch": 1.0533303828280594, "grad_norm": 1.9315088987350464, "learning_rate": 1.5946031872193303e-05, "loss": 0.2707, "step": 10710 }, { "epoch": 1.0543138845860687, "grad_norm": 0.6008973121643066, "learning_rate": 1.594205778325319e-05, "loss": 0.3136, "step": 10720 }, { "epoch": 1.055297386344078, "grad_norm": 0.7295891046524048, "learning_rate": 1.593808369431308e-05, "loss": 0.2464, "step": 10730 }, { "epoch": 1.0562808881020875, "grad_norm": 1.5947253704071045, "learning_rate": 1.593410960537297e-05, "loss": 0.1516, "step": 10740 }, { "epoch": 1.0572643898600969, "grad_norm": 1.8050662279129028, "learning_rate": 1.5930135516432858e-05, "loss": 0.2702, "step": 10750 }, { "epoch": 1.0582478916181062, "grad_norm": 1.135501742362976, "learning_rate": 1.5926161427492748e-05, "loss": 0.2593, "step": 10760 }, { "epoch": 1.0592313933761157, "grad_norm": 1.1788592338562012, "learning_rate": 1.5922187338552638e-05, "loss": 0.2392, "step": 10770 }, { "epoch": 1.060214895134125, "grad_norm": 1.9654382467269897, "learning_rate": 1.5918213249612527e-05, "loss": 0.2876, "step": 10780 }, { "epoch": 1.0611983968921344, "grad_norm": 0.15102195739746094, "learning_rate": 1.5914239160672417e-05, "loss": 0.244, "step": 10790 }, { "epoch": 1.062181898650144, "grad_norm": 0.8354254961013794, "learning_rate": 1.5910265071732307e-05, "loss": 0.2165, "step": 10800 }, { "epoch": 1.0631654004081532, "grad_norm": 0.7330989837646484, "learning_rate": 1.5906290982792196e-05, "loss": 0.1486, "step": 10810 }, { "epoch": 1.0641489021661625, "grad_norm": 2.2787623405456543, "learning_rate": 1.5902316893852086e-05, "loss": 0.2067, "step": 10820 }, { "epoch": 1.065132403924172, "grad_norm": 0.8515722155570984, "learning_rate": 1.5898342804911976e-05, "loss": 0.2311, "step": 10830 }, { "epoch": 1.0661159056821814, "grad_norm": 2.1706154346466064, "learning_rate": 1.5894368715971865e-05, "loss": 0.1841, "step": 10840 }, { "epoch": 1.0670994074401907, "grad_norm": 3.861100673675537, "learning_rate": 1.5890394627031755e-05, "loss": 0.1702, "step": 10850 }, { "epoch": 1.0680829091982003, "grad_norm": 0.5234455466270447, "learning_rate": 1.5886420538091645e-05, "loss": 0.1791, "step": 10860 }, { "epoch": 1.0690664109562096, "grad_norm": 0.9792625904083252, "learning_rate": 1.5882446449151534e-05, "loss": 0.2743, "step": 10870 }, { "epoch": 1.070049912714219, "grad_norm": 1.8640176057815552, "learning_rate": 1.5878472360211424e-05, "loss": 0.2103, "step": 10880 }, { "epoch": 1.0710334144722284, "grad_norm": 2.448150396347046, "learning_rate": 1.5874498271271314e-05, "loss": 0.3244, "step": 10890 }, { "epoch": 1.0720169162302378, "grad_norm": 1.2162164449691772, "learning_rate": 1.5870524182331203e-05, "loss": 0.3108, "step": 10900 }, { "epoch": 1.073000417988247, "grad_norm": 0.895965039730072, "learning_rate": 1.586655009339109e-05, "loss": 0.2238, "step": 10910 }, { "epoch": 1.0739839197462566, "grad_norm": 3.080109119415283, "learning_rate": 1.586257600445098e-05, "loss": 0.2297, "step": 10920 }, { "epoch": 1.074967421504266, "grad_norm": 0.6591482162475586, "learning_rate": 1.585860191551087e-05, "loss": 0.2226, "step": 10930 }, { "epoch": 1.0759509232622753, "grad_norm": 1.4663394689559937, "learning_rate": 1.5854627826570762e-05, "loss": 0.2442, "step": 10940 }, { "epoch": 1.0769344250202848, "grad_norm": 1.6239320039749146, "learning_rate": 1.585065373763065e-05, "loss": 0.2115, "step": 10950 }, { "epoch": 1.0779179267782941, "grad_norm": 0.2651664912700653, "learning_rate": 1.5846679648690538e-05, "loss": 0.2726, "step": 10960 }, { "epoch": 1.0789014285363034, "grad_norm": 0.9693211913108826, "learning_rate": 1.5842705559750427e-05, "loss": 0.157, "step": 10970 }, { "epoch": 1.079884930294313, "grad_norm": 1.8968473672866821, "learning_rate": 1.5838731470810317e-05, "loss": 0.2652, "step": 10980 }, { "epoch": 1.0808684320523223, "grad_norm": 1.7201496362686157, "learning_rate": 1.5834757381870207e-05, "loss": 0.2528, "step": 10990 }, { "epoch": 1.0818519338103316, "grad_norm": 0.6101740598678589, "learning_rate": 1.5830783292930096e-05, "loss": 0.275, "step": 11000 }, { "epoch": 1.0818519338103316, "eval_loss": 0.1684889793395996, "eval_runtime": 20.0453, "eval_samples_per_second": 2.494, "eval_steps_per_second": 1.247, "step": 11000 }, { "epoch": 1.0828354355683412, "grad_norm": 1.1838539838790894, "learning_rate": 1.5826809203989986e-05, "loss": 0.1932, "step": 11010 }, { "epoch": 1.0838189373263505, "grad_norm": 1.040086030960083, "learning_rate": 1.5822835115049876e-05, "loss": 0.2281, "step": 11020 }, { "epoch": 1.0848024390843598, "grad_norm": 2.302246332168579, "learning_rate": 1.5818861026109765e-05, "loss": 0.1934, "step": 11030 }, { "epoch": 1.0857859408423693, "grad_norm": 1.5548522472381592, "learning_rate": 1.5814886937169655e-05, "loss": 0.211, "step": 11040 }, { "epoch": 1.0867694426003787, "grad_norm": 1.5086666345596313, "learning_rate": 1.5810912848229545e-05, "loss": 0.2239, "step": 11050 }, { "epoch": 1.087752944358388, "grad_norm": 0.8068061470985413, "learning_rate": 1.5806938759289434e-05, "loss": 0.2373, "step": 11060 }, { "epoch": 1.0887364461163975, "grad_norm": 1.569872260093689, "learning_rate": 1.5802964670349324e-05, "loss": 0.2313, "step": 11070 }, { "epoch": 1.0897199478744068, "grad_norm": 1.8688373565673828, "learning_rate": 1.5798990581409214e-05, "loss": 0.2964, "step": 11080 }, { "epoch": 1.0907034496324162, "grad_norm": 1.7373290061950684, "learning_rate": 1.5795016492469103e-05, "loss": 0.2808, "step": 11090 }, { "epoch": 1.0916869513904257, "grad_norm": 1.3782762289047241, "learning_rate": 1.5791042403528993e-05, "loss": 0.2017, "step": 11100 }, { "epoch": 1.092670453148435, "grad_norm": 0.6687967777252197, "learning_rate": 1.5787068314588883e-05, "loss": 0.1564, "step": 11110 }, { "epoch": 1.0936539549064443, "grad_norm": 1.177499532699585, "learning_rate": 1.5783094225648772e-05, "loss": 0.2055, "step": 11120 }, { "epoch": 1.094637456664454, "grad_norm": 1.7758307456970215, "learning_rate": 1.5779120136708662e-05, "loss": 0.308, "step": 11130 }, { "epoch": 1.0956209584224632, "grad_norm": 0.8646954894065857, "learning_rate": 1.5775146047768552e-05, "loss": 0.2851, "step": 11140 }, { "epoch": 1.0966044601804725, "grad_norm": 1.7536402940750122, "learning_rate": 1.5771171958828438e-05, "loss": 0.2581, "step": 11150 }, { "epoch": 1.097587961938482, "grad_norm": 0.7464534640312195, "learning_rate": 1.5767197869888328e-05, "loss": 0.2356, "step": 11160 }, { "epoch": 1.0985714636964914, "grad_norm": 0.9927958846092224, "learning_rate": 1.5763223780948217e-05, "loss": 0.2994, "step": 11170 }, { "epoch": 1.0995549654545007, "grad_norm": 1.3818191289901733, "learning_rate": 1.575924969200811e-05, "loss": 0.2317, "step": 11180 }, { "epoch": 1.1005384672125103, "grad_norm": 1.2731701135635376, "learning_rate": 1.5755275603068e-05, "loss": 0.1586, "step": 11190 }, { "epoch": 1.1015219689705196, "grad_norm": 0.4445424973964691, "learning_rate": 1.5751301514127886e-05, "loss": 0.2629, "step": 11200 }, { "epoch": 1.102505470728529, "grad_norm": 0.8182106614112854, "learning_rate": 1.5747327425187776e-05, "loss": 0.2465, "step": 11210 }, { "epoch": 1.1034889724865384, "grad_norm": 0.32335585355758667, "learning_rate": 1.5743353336247666e-05, "loss": 0.175, "step": 11220 }, { "epoch": 1.1044724742445478, "grad_norm": 1.8322887420654297, "learning_rate": 1.5739379247307555e-05, "loss": 0.2475, "step": 11230 }, { "epoch": 1.105455976002557, "grad_norm": 0.381082683801651, "learning_rate": 1.5735405158367445e-05, "loss": 0.2041, "step": 11240 }, { "epoch": 1.1064394777605666, "grad_norm": 0.8768240809440613, "learning_rate": 1.5731431069427335e-05, "loss": 0.2022, "step": 11250 }, { "epoch": 1.107422979518576, "grad_norm": 1.1599154472351074, "learning_rate": 1.5727456980487224e-05, "loss": 0.2321, "step": 11260 }, { "epoch": 1.1084064812765853, "grad_norm": 0.9594438076019287, "learning_rate": 1.5723482891547114e-05, "loss": 0.1651, "step": 11270 }, { "epoch": 1.1093899830345946, "grad_norm": 0.7346277236938477, "learning_rate": 1.5719508802607004e-05, "loss": 0.209, "step": 11280 }, { "epoch": 1.1103734847926041, "grad_norm": 1.4004619121551514, "learning_rate": 1.5715534713666893e-05, "loss": 0.237, "step": 11290 }, { "epoch": 1.1113569865506134, "grad_norm": 1.0641740560531616, "learning_rate": 1.5711560624726783e-05, "loss": 0.2332, "step": 11300 }, { "epoch": 1.1123404883086228, "grad_norm": 0.6625562906265259, "learning_rate": 1.5707586535786673e-05, "loss": 0.1923, "step": 11310 }, { "epoch": 1.1133239900666323, "grad_norm": 2.6541385650634766, "learning_rate": 1.5703612446846562e-05, "loss": 0.2174, "step": 11320 }, { "epoch": 1.1143074918246416, "grad_norm": 0.5809294581413269, "learning_rate": 1.5699638357906452e-05, "loss": 0.254, "step": 11330 }, { "epoch": 1.115290993582651, "grad_norm": 0.5939483642578125, "learning_rate": 1.5695664268966338e-05, "loss": 0.1861, "step": 11340 }, { "epoch": 1.1162744953406605, "grad_norm": 0.6164649128913879, "learning_rate": 1.569169018002623e-05, "loss": 0.2705, "step": 11350 }, { "epoch": 1.1172579970986698, "grad_norm": 0.39910444617271423, "learning_rate": 1.568771609108612e-05, "loss": 0.258, "step": 11360 }, { "epoch": 1.1182414988566791, "grad_norm": 0.707711935043335, "learning_rate": 1.568374200214601e-05, "loss": 0.1839, "step": 11370 }, { "epoch": 1.1192250006146887, "grad_norm": 0.9458222985267639, "learning_rate": 1.56797679132059e-05, "loss": 0.3005, "step": 11380 }, { "epoch": 1.120208502372698, "grad_norm": 1.1831637620925903, "learning_rate": 1.5675793824265787e-05, "loss": 0.3427, "step": 11390 }, { "epoch": 1.1211920041307073, "grad_norm": 2.5058209896087646, "learning_rate": 1.5671819735325676e-05, "loss": 0.2011, "step": 11400 }, { "epoch": 1.1221755058887168, "grad_norm": 0.7695888876914978, "learning_rate": 1.5667845646385566e-05, "loss": 0.1912, "step": 11410 }, { "epoch": 1.1231590076467262, "grad_norm": 0.6722692251205444, "learning_rate": 1.5663871557445456e-05, "loss": 0.2892, "step": 11420 }, { "epoch": 1.1241425094047355, "grad_norm": 0.46231839060783386, "learning_rate": 1.565989746850535e-05, "loss": 0.1979, "step": 11430 }, { "epoch": 1.125126011162745, "grad_norm": 1.6147507429122925, "learning_rate": 1.5655923379565235e-05, "loss": 0.2097, "step": 11440 }, { "epoch": 1.1261095129207543, "grad_norm": 0.5903246402740479, "learning_rate": 1.5651949290625125e-05, "loss": 0.2322, "step": 11450 }, { "epoch": 1.1270930146787637, "grad_norm": 0.9642149209976196, "learning_rate": 1.5647975201685014e-05, "loss": 0.2188, "step": 11460 }, { "epoch": 1.1280765164367732, "grad_norm": 0.9721670746803284, "learning_rate": 1.5644001112744904e-05, "loss": 0.3213, "step": 11470 }, { "epoch": 1.1290600181947825, "grad_norm": 0.5057275891304016, "learning_rate": 1.5640027023804794e-05, "loss": 0.1826, "step": 11480 }, { "epoch": 1.1300435199527918, "grad_norm": 0.7329443097114563, "learning_rate": 1.5636052934864683e-05, "loss": 0.2357, "step": 11490 }, { "epoch": 1.1310270217108014, "grad_norm": 0.41778743267059326, "learning_rate": 1.5632078845924573e-05, "loss": 0.242, "step": 11500 }, { "epoch": 1.1310270217108014, "eval_loss": 0.16718797385692596, "eval_runtime": 20.7187, "eval_samples_per_second": 2.413, "eval_steps_per_second": 1.207, "step": 11500 }, { "epoch": 1.1320105234688107, "grad_norm": 1.1029562950134277, "learning_rate": 1.5628104756984463e-05, "loss": 0.1791, "step": 11510 }, { "epoch": 1.13299402522682, "grad_norm": 1.1086807250976562, "learning_rate": 1.5624130668044352e-05, "loss": 0.2324, "step": 11520 }, { "epoch": 1.1339775269848296, "grad_norm": 1.1632509231567383, "learning_rate": 1.5620156579104242e-05, "loss": 0.2047, "step": 11530 }, { "epoch": 1.1349610287428389, "grad_norm": 1.6066392660140991, "learning_rate": 1.561618249016413e-05, "loss": 0.1266, "step": 11540 }, { "epoch": 1.1359445305008482, "grad_norm": 1.0749987363815308, "learning_rate": 1.561220840122402e-05, "loss": 0.2094, "step": 11550 }, { "epoch": 1.1369280322588577, "grad_norm": 0.9814296364784241, "learning_rate": 1.560823431228391e-05, "loss": 0.2656, "step": 11560 }, { "epoch": 1.137911534016867, "grad_norm": 2.2510077953338623, "learning_rate": 1.56042602233438e-05, "loss": 0.3081, "step": 11570 }, { "epoch": 1.1388950357748764, "grad_norm": 0.728635847568512, "learning_rate": 1.5600286134403687e-05, "loss": 0.3033, "step": 11580 }, { "epoch": 1.139878537532886, "grad_norm": 1.4196513891220093, "learning_rate": 1.559631204546358e-05, "loss": 0.2354, "step": 11590 }, { "epoch": 1.1408620392908952, "grad_norm": 2.0953516960144043, "learning_rate": 1.559233795652347e-05, "loss": 0.2404, "step": 11600 }, { "epoch": 1.1418455410489046, "grad_norm": 1.5039966106414795, "learning_rate": 1.558836386758336e-05, "loss": 0.1953, "step": 11610 }, { "epoch": 1.142829042806914, "grad_norm": 1.7572118043899536, "learning_rate": 1.558438977864325e-05, "loss": 0.2377, "step": 11620 }, { "epoch": 1.1438125445649234, "grad_norm": 6.402324199676514, "learning_rate": 1.5580415689703135e-05, "loss": 0.3005, "step": 11630 }, { "epoch": 1.1447960463229327, "grad_norm": 1.5109554529190063, "learning_rate": 1.5576441600763025e-05, "loss": 0.1312, "step": 11640 }, { "epoch": 1.1457795480809423, "grad_norm": 0.4686489999294281, "learning_rate": 1.5572467511822915e-05, "loss": 0.2823, "step": 11650 }, { "epoch": 1.1467630498389516, "grad_norm": 3.5495054721832275, "learning_rate": 1.5568493422882804e-05, "loss": 0.19, "step": 11660 }, { "epoch": 1.147746551596961, "grad_norm": 1.636431336402893, "learning_rate": 1.5564519333942697e-05, "loss": 0.1589, "step": 11670 }, { "epoch": 1.1487300533549702, "grad_norm": 0.634340763092041, "learning_rate": 1.5560545245002583e-05, "loss": 0.1228, "step": 11680 }, { "epoch": 1.1497135551129798, "grad_norm": 2.047029972076416, "learning_rate": 1.5556571156062473e-05, "loss": 0.1921, "step": 11690 }, { "epoch": 1.150697056870989, "grad_norm": 1.4832496643066406, "learning_rate": 1.5552597067122363e-05, "loss": 0.2528, "step": 11700 }, { "epoch": 1.1516805586289984, "grad_norm": 1.1205974817276, "learning_rate": 1.5548622978182252e-05, "loss": 0.1315, "step": 11710 }, { "epoch": 1.152664060387008, "grad_norm": 2.786681890487671, "learning_rate": 1.5544648889242142e-05, "loss": 0.3437, "step": 11720 }, { "epoch": 1.1536475621450173, "grad_norm": 2.386262893676758, "learning_rate": 1.5540674800302032e-05, "loss": 0.194, "step": 11730 }, { "epoch": 1.1546310639030266, "grad_norm": 1.2636781930923462, "learning_rate": 1.553670071136192e-05, "loss": 0.2482, "step": 11740 }, { "epoch": 1.1556145656610362, "grad_norm": 1.3071417808532715, "learning_rate": 1.553272662242181e-05, "loss": 0.2084, "step": 11750 }, { "epoch": 1.1565980674190455, "grad_norm": 2.5322470664978027, "learning_rate": 1.55287525334817e-05, "loss": 0.2909, "step": 11760 }, { "epoch": 1.1575815691770548, "grad_norm": 0.9364299178123474, "learning_rate": 1.552477844454159e-05, "loss": 0.2372, "step": 11770 }, { "epoch": 1.1585650709350643, "grad_norm": 1.4364525079727173, "learning_rate": 1.552080435560148e-05, "loss": 0.3222, "step": 11780 }, { "epoch": 1.1595485726930737, "grad_norm": 0.7772247195243835, "learning_rate": 1.551683026666137e-05, "loss": 0.227, "step": 11790 }, { "epoch": 1.160532074451083, "grad_norm": 3.083230972290039, "learning_rate": 1.551285617772126e-05, "loss": 0.1622, "step": 11800 }, { "epoch": 1.1615155762090925, "grad_norm": 0.6908173561096191, "learning_rate": 1.550888208878115e-05, "loss": 0.2731, "step": 11810 }, { "epoch": 1.1624990779671018, "grad_norm": 1.6049355268478394, "learning_rate": 1.5504907999841035e-05, "loss": 0.2115, "step": 11820 }, { "epoch": 1.1634825797251112, "grad_norm": 1.3253471851348877, "learning_rate": 1.550093391090093e-05, "loss": 0.2486, "step": 11830 }, { "epoch": 1.1644660814831207, "grad_norm": 1.939974308013916, "learning_rate": 1.5496959821960818e-05, "loss": 0.2514, "step": 11840 }, { "epoch": 1.16544958324113, "grad_norm": 0.3961417078971863, "learning_rate": 1.5492985733020708e-05, "loss": 0.1514, "step": 11850 }, { "epoch": 1.1664330849991393, "grad_norm": 2.070683002471924, "learning_rate": 1.5489011644080597e-05, "loss": 0.236, "step": 11860 }, { "epoch": 1.1674165867571489, "grad_norm": 2.060924768447876, "learning_rate": 1.5485037555140484e-05, "loss": 0.2529, "step": 11870 }, { "epoch": 1.1684000885151582, "grad_norm": 0.6187996864318848, "learning_rate": 1.5481063466200373e-05, "loss": 0.4012, "step": 11880 }, { "epoch": 1.1693835902731675, "grad_norm": 2.7358853816986084, "learning_rate": 1.5477089377260263e-05, "loss": 0.299, "step": 11890 }, { "epoch": 1.170367092031177, "grad_norm": 1.793433666229248, "learning_rate": 1.5473115288320153e-05, "loss": 0.3304, "step": 11900 }, { "epoch": 1.1713505937891864, "grad_norm": 1.451925277709961, "learning_rate": 1.5469141199380046e-05, "loss": 0.2606, "step": 11910 }, { "epoch": 1.1723340955471957, "grad_norm": 0.632253885269165, "learning_rate": 1.5465167110439932e-05, "loss": 0.2446, "step": 11920 }, { "epoch": 1.1733175973052052, "grad_norm": 1.721764326095581, "learning_rate": 1.5461193021499822e-05, "loss": 0.3533, "step": 11930 }, { "epoch": 1.1743010990632146, "grad_norm": 1.3763706684112549, "learning_rate": 1.545721893255971e-05, "loss": 0.2624, "step": 11940 }, { "epoch": 1.1752846008212239, "grad_norm": 1.520142912864685, "learning_rate": 1.54532448436196e-05, "loss": 0.159, "step": 11950 }, { "epoch": 1.1762681025792334, "grad_norm": 2.0177927017211914, "learning_rate": 1.544927075467949e-05, "loss": 0.2352, "step": 11960 }, { "epoch": 1.1772516043372427, "grad_norm": 1.737296223640442, "learning_rate": 1.544529666573938e-05, "loss": 0.1823, "step": 11970 }, { "epoch": 1.178235106095252, "grad_norm": 1.6785165071487427, "learning_rate": 1.544132257679927e-05, "loss": 0.1911, "step": 11980 }, { "epoch": 1.1792186078532616, "grad_norm": 0.7725062370300293, "learning_rate": 1.543734848785916e-05, "loss": 0.1469, "step": 11990 }, { "epoch": 1.180202109611271, "grad_norm": 1.0442862510681152, "learning_rate": 1.543337439891905e-05, "loss": 0.28, "step": 12000 }, { "epoch": 1.180202109611271, "eval_loss": 0.15947729349136353, "eval_runtime": 19.5412, "eval_samples_per_second": 2.559, "eval_steps_per_second": 1.279, "step": 12000 }, { "epoch": 1.1811856113692802, "grad_norm": 0.7599958181381226, "learning_rate": 1.542940030997894e-05, "loss": 0.2466, "step": 12010 }, { "epoch": 1.1821691131272898, "grad_norm": 1.1370816230773926, "learning_rate": 1.542542622103883e-05, "loss": 0.2842, "step": 12020 }, { "epoch": 1.183152614885299, "grad_norm": 1.1675643920898438, "learning_rate": 1.542145213209872e-05, "loss": 0.2127, "step": 12030 }, { "epoch": 1.1841361166433084, "grad_norm": 1.1006205081939697, "learning_rate": 1.5417478043158608e-05, "loss": 0.2202, "step": 12040 }, { "epoch": 1.185119618401318, "grad_norm": 0.6677995324134827, "learning_rate": 1.5413503954218498e-05, "loss": 0.2997, "step": 12050 }, { "epoch": 1.1861031201593273, "grad_norm": 1.038744568824768, "learning_rate": 1.5409529865278384e-05, "loss": 0.2648, "step": 12060 }, { "epoch": 1.1870866219173366, "grad_norm": 2.356663227081299, "learning_rate": 1.5405555776338277e-05, "loss": 0.1663, "step": 12070 }, { "epoch": 1.1880701236753461, "grad_norm": 1.1240571737289429, "learning_rate": 1.5401581687398167e-05, "loss": 0.2276, "step": 12080 }, { "epoch": 1.1890536254333555, "grad_norm": 1.8712751865386963, "learning_rate": 1.5397607598458056e-05, "loss": 0.2572, "step": 12090 }, { "epoch": 1.1900371271913648, "grad_norm": 0.5862861275672913, "learning_rate": 1.5393633509517946e-05, "loss": 0.1785, "step": 12100 }, { "epoch": 1.1910206289493743, "grad_norm": 1.5912821292877197, "learning_rate": 1.5389659420577832e-05, "loss": 0.3814, "step": 12110 }, { "epoch": 1.1920041307073836, "grad_norm": 0.7632641196250916, "learning_rate": 1.5385685331637722e-05, "loss": 0.1927, "step": 12120 }, { "epoch": 1.192987632465393, "grad_norm": 0.7380467057228088, "learning_rate": 1.538171124269761e-05, "loss": 0.2317, "step": 12130 }, { "epoch": 1.1939711342234025, "grad_norm": 1.1679322719573975, "learning_rate": 1.53777371537575e-05, "loss": 0.2264, "step": 12140 }, { "epoch": 1.1949546359814118, "grad_norm": 1.4404914379119873, "learning_rate": 1.537376306481739e-05, "loss": 0.2184, "step": 12150 }, { "epoch": 1.1959381377394211, "grad_norm": 0.8603619337081909, "learning_rate": 1.536978897587728e-05, "loss": 0.2118, "step": 12160 }, { "epoch": 1.1969216394974307, "grad_norm": 1.1067661046981812, "learning_rate": 1.536581488693717e-05, "loss": 0.3043, "step": 12170 }, { "epoch": 1.19790514125544, "grad_norm": 0.4780625104904175, "learning_rate": 1.536184079799706e-05, "loss": 0.1925, "step": 12180 }, { "epoch": 1.1988886430134493, "grad_norm": 1.2900044918060303, "learning_rate": 1.535786670905695e-05, "loss": 0.2877, "step": 12190 }, { "epoch": 1.1998721447714589, "grad_norm": 0.5605069994926453, "learning_rate": 1.535389262011684e-05, "loss": 0.1849, "step": 12200 }, { "epoch": 1.2008556465294682, "grad_norm": 2.1221747398376465, "learning_rate": 1.534991853117673e-05, "loss": 0.2137, "step": 12210 }, { "epoch": 1.2018391482874775, "grad_norm": 2.1045353412628174, "learning_rate": 1.534594444223662e-05, "loss": 0.197, "step": 12220 }, { "epoch": 1.202822650045487, "grad_norm": 1.5403351783752441, "learning_rate": 1.5341970353296508e-05, "loss": 0.158, "step": 12230 }, { "epoch": 1.2038061518034964, "grad_norm": 0.4863685369491577, "learning_rate": 1.5337996264356398e-05, "loss": 0.2061, "step": 12240 }, { "epoch": 1.2047896535615057, "grad_norm": 0.5963559150695801, "learning_rate": 1.5334022175416288e-05, "loss": 0.2193, "step": 12250 }, { "epoch": 1.2057731553195152, "grad_norm": 1.3666846752166748, "learning_rate": 1.5330048086476177e-05, "loss": 0.2201, "step": 12260 }, { "epoch": 1.2067566570775246, "grad_norm": 1.747308373451233, "learning_rate": 1.5326073997536067e-05, "loss": 0.3071, "step": 12270 }, { "epoch": 1.2077401588355339, "grad_norm": 0.7220170497894287, "learning_rate": 1.5322099908595957e-05, "loss": 0.228, "step": 12280 }, { "epoch": 1.2087236605935434, "grad_norm": 1.7711100578308105, "learning_rate": 1.5318125819655846e-05, "loss": 0.2558, "step": 12290 }, { "epoch": 1.2097071623515527, "grad_norm": 1.0758968591690063, "learning_rate": 1.5314151730715733e-05, "loss": 0.2629, "step": 12300 }, { "epoch": 1.210690664109562, "grad_norm": 1.715317964553833, "learning_rate": 1.5310177641775622e-05, "loss": 0.2422, "step": 12310 }, { "epoch": 1.2116741658675716, "grad_norm": 1.1539181470870972, "learning_rate": 1.5306203552835515e-05, "loss": 0.2245, "step": 12320 }, { "epoch": 1.212657667625581, "grad_norm": 3.100677251815796, "learning_rate": 1.5302229463895405e-05, "loss": 0.3347, "step": 12330 }, { "epoch": 1.2136411693835902, "grad_norm": 2.0572402477264404, "learning_rate": 1.5298255374955295e-05, "loss": 0.2516, "step": 12340 }, { "epoch": 1.2146246711415998, "grad_norm": 2.0834171772003174, "learning_rate": 1.529428128601518e-05, "loss": 0.2509, "step": 12350 }, { "epoch": 1.215608172899609, "grad_norm": 0.7151486277580261, "learning_rate": 1.529030719707507e-05, "loss": 0.1851, "step": 12360 }, { "epoch": 1.2165916746576184, "grad_norm": 2.3506407737731934, "learning_rate": 1.528633310813496e-05, "loss": 0.2926, "step": 12370 }, { "epoch": 1.217575176415628, "grad_norm": 2.338649272918701, "learning_rate": 1.528235901919485e-05, "loss": 0.2026, "step": 12380 }, { "epoch": 1.2185586781736373, "grad_norm": 0.8904348611831665, "learning_rate": 1.527838493025474e-05, "loss": 0.188, "step": 12390 }, { "epoch": 1.2195421799316466, "grad_norm": 1.264668345451355, "learning_rate": 1.527441084131463e-05, "loss": 0.2037, "step": 12400 }, { "epoch": 1.2205256816896561, "grad_norm": 1.8992657661437988, "learning_rate": 1.527043675237452e-05, "loss": 0.2614, "step": 12410 }, { "epoch": 1.2215091834476655, "grad_norm": 0.7366829514503479, "learning_rate": 1.526646266343441e-05, "loss": 0.1882, "step": 12420 }, { "epoch": 1.2224926852056748, "grad_norm": 1.5639548301696777, "learning_rate": 1.5262488574494298e-05, "loss": 0.2037, "step": 12430 }, { "epoch": 1.2234761869636843, "grad_norm": 0.48022663593292236, "learning_rate": 1.5258514485554188e-05, "loss": 0.2173, "step": 12440 }, { "epoch": 1.2244596887216936, "grad_norm": 3.2655975818634033, "learning_rate": 1.5254540396614078e-05, "loss": 0.3362, "step": 12450 }, { "epoch": 1.225443190479703, "grad_norm": 0.9768887162208557, "learning_rate": 1.5250566307673965e-05, "loss": 0.2947, "step": 12460 }, { "epoch": 1.2264266922377123, "grad_norm": 0.5320988297462463, "learning_rate": 1.5246592218733855e-05, "loss": 0.2609, "step": 12470 }, { "epoch": 1.2274101939957218, "grad_norm": 3.5792882442474365, "learning_rate": 1.5242618129793747e-05, "loss": 0.1552, "step": 12480 }, { "epoch": 1.2283936957537311, "grad_norm": 0.6869518756866455, "learning_rate": 1.5238644040853636e-05, "loss": 0.1569, "step": 12490 }, { "epoch": 1.2293771975117405, "grad_norm": 0.6907517313957214, "learning_rate": 1.5234669951913526e-05, "loss": 0.3387, "step": 12500 }, { "epoch": 1.2293771975117405, "eval_loss": 0.16012313961982727, "eval_runtime": 19.5828, "eval_samples_per_second": 2.553, "eval_steps_per_second": 1.277, "step": 12500 }, { "epoch": 1.23036069926975, "grad_norm": 0.5602425932884216, "learning_rate": 1.5230695862973416e-05, "loss": 0.395, "step": 12510 }, { "epoch": 1.2313442010277593, "grad_norm": 1.9788906574249268, "learning_rate": 1.5226721774033303e-05, "loss": 0.1828, "step": 12520 }, { "epoch": 1.2323277027857686, "grad_norm": 1.832395315170288, "learning_rate": 1.5222747685093193e-05, "loss": 0.1319, "step": 12530 }, { "epoch": 1.2333112045437782, "grad_norm": 0.36584335565567017, "learning_rate": 1.5218773596153083e-05, "loss": 0.2585, "step": 12540 }, { "epoch": 1.2342947063017875, "grad_norm": 1.3098914623260498, "learning_rate": 1.5214799507212972e-05, "loss": 0.2438, "step": 12550 }, { "epoch": 1.2352782080597968, "grad_norm": 4.020171165466309, "learning_rate": 1.5210825418272864e-05, "loss": 0.2908, "step": 12560 }, { "epoch": 1.2362617098178064, "grad_norm": 0.9674311280250549, "learning_rate": 1.5206851329332752e-05, "loss": 0.2159, "step": 12570 }, { "epoch": 1.2372452115758157, "grad_norm": 2.408463478088379, "learning_rate": 1.5202877240392641e-05, "loss": 0.1815, "step": 12580 }, { "epoch": 1.238228713333825, "grad_norm": 0.9522561430931091, "learning_rate": 1.5198903151452531e-05, "loss": 0.2076, "step": 12590 }, { "epoch": 1.2392122150918345, "grad_norm": 0.6239114999771118, "learning_rate": 1.519492906251242e-05, "loss": 0.208, "step": 12600 }, { "epoch": 1.2401957168498439, "grad_norm": 1.2748032808303833, "learning_rate": 1.5190954973572309e-05, "loss": 0.2277, "step": 12610 }, { "epoch": 1.2411792186078532, "grad_norm": 2.681368589401245, "learning_rate": 1.5186980884632198e-05, "loss": 0.2248, "step": 12620 }, { "epoch": 1.2421627203658627, "grad_norm": 0.48817378282546997, "learning_rate": 1.5183006795692088e-05, "loss": 0.2876, "step": 12630 }, { "epoch": 1.243146222123872, "grad_norm": 3.0966594219207764, "learning_rate": 1.517903270675198e-05, "loss": 0.2638, "step": 12640 }, { "epoch": 1.2441297238818814, "grad_norm": 2.382673501968384, "learning_rate": 1.5175058617811869e-05, "loss": 0.2204, "step": 12650 }, { "epoch": 1.245113225639891, "grad_norm": 3.4702672958374023, "learning_rate": 1.5171084528871757e-05, "loss": 0.2438, "step": 12660 }, { "epoch": 1.2460967273979002, "grad_norm": 1.1455655097961426, "learning_rate": 1.5167110439931647e-05, "loss": 0.323, "step": 12670 }, { "epoch": 1.2470802291559095, "grad_norm": 1.0415674448013306, "learning_rate": 1.5163136350991536e-05, "loss": 0.1862, "step": 12680 }, { "epoch": 1.248063730913919, "grad_norm": 0.9128134250640869, "learning_rate": 1.5159162262051426e-05, "loss": 0.1964, "step": 12690 }, { "epoch": 1.2490472326719284, "grad_norm": 1.5137019157409668, "learning_rate": 1.5155188173111314e-05, "loss": 0.2638, "step": 12700 }, { "epoch": 1.2500307344299377, "grad_norm": 2.8884811401367188, "learning_rate": 1.5151214084171204e-05, "loss": 0.1713, "step": 12710 }, { "epoch": 1.251014236187947, "grad_norm": 1.3755849599838257, "learning_rate": 1.5147239995231095e-05, "loss": 0.2103, "step": 12720 }, { "epoch": 1.2519977379459566, "grad_norm": 0.6168149709701538, "learning_rate": 1.5143265906290985e-05, "loss": 0.3774, "step": 12730 }, { "epoch": 1.252981239703966, "grad_norm": 0.5615456700325012, "learning_rate": 1.5139291817350874e-05, "loss": 0.1406, "step": 12740 }, { "epoch": 1.2539647414619752, "grad_norm": 1.0317662954330444, "learning_rate": 1.5135317728410764e-05, "loss": 0.352, "step": 12750 }, { "epoch": 1.2549482432199848, "grad_norm": 1.1228874921798706, "learning_rate": 1.5131343639470652e-05, "loss": 0.1881, "step": 12760 }, { "epoch": 1.255931744977994, "grad_norm": 0.7314227223396301, "learning_rate": 1.5127369550530542e-05, "loss": 0.3255, "step": 12770 }, { "epoch": 1.2569152467360034, "grad_norm": 3.771770715713501, "learning_rate": 1.5123395461590431e-05, "loss": 0.1557, "step": 12780 }, { "epoch": 1.257898748494013, "grad_norm": 4.539660453796387, "learning_rate": 1.5119421372650321e-05, "loss": 0.1951, "step": 12790 }, { "epoch": 1.2588822502520223, "grad_norm": 0.7272465825080872, "learning_rate": 1.5115447283710212e-05, "loss": 0.2017, "step": 12800 }, { "epoch": 1.2598657520100316, "grad_norm": 0.36164748668670654, "learning_rate": 1.51114731947701e-05, "loss": 0.268, "step": 12810 }, { "epoch": 1.2608492537680411, "grad_norm": 0.9491702914237976, "learning_rate": 1.510749910582999e-05, "loss": 0.3221, "step": 12820 }, { "epoch": 1.2618327555260505, "grad_norm": 1.4638797044754028, "learning_rate": 1.510352501688988e-05, "loss": 0.2655, "step": 12830 }, { "epoch": 1.2628162572840598, "grad_norm": 0.7647467851638794, "learning_rate": 1.509955092794977e-05, "loss": 0.2605, "step": 12840 }, { "epoch": 1.2637997590420693, "grad_norm": 1.9903299808502197, "learning_rate": 1.5095576839009657e-05, "loss": 0.1408, "step": 12850 }, { "epoch": 1.2647832608000786, "grad_norm": 0.31834790110588074, "learning_rate": 1.5091602750069547e-05, "loss": 0.2054, "step": 12860 }, { "epoch": 1.265766762558088, "grad_norm": 0.9295620918273926, "learning_rate": 1.5087628661129437e-05, "loss": 0.2929, "step": 12870 }, { "epoch": 1.2667502643160975, "grad_norm": 0.9623826146125793, "learning_rate": 1.5083654572189328e-05, "loss": 0.1799, "step": 12880 }, { "epoch": 1.2677337660741068, "grad_norm": 1.424196720123291, "learning_rate": 1.5079680483249218e-05, "loss": 0.2967, "step": 12890 }, { "epoch": 1.2687172678321161, "grad_norm": 3.3792314529418945, "learning_rate": 1.5075706394309106e-05, "loss": 0.1802, "step": 12900 }, { "epoch": 1.2697007695901257, "grad_norm": 0.9537076950073242, "learning_rate": 1.5071732305368995e-05, "loss": 0.1846, "step": 12910 }, { "epoch": 1.270684271348135, "grad_norm": 1.3461072444915771, "learning_rate": 1.5067758216428885e-05, "loss": 0.1985, "step": 12920 }, { "epoch": 1.2716677731061443, "grad_norm": 3.7908403873443604, "learning_rate": 1.5063784127488775e-05, "loss": 0.2591, "step": 12930 }, { "epoch": 1.2726512748641539, "grad_norm": 2.368834972381592, "learning_rate": 1.5059810038548663e-05, "loss": 0.2977, "step": 12940 }, { "epoch": 1.2736347766221632, "grad_norm": 1.240795612335205, "learning_rate": 1.5055835949608552e-05, "loss": 0.222, "step": 12950 }, { "epoch": 1.2746182783801725, "grad_norm": 0.35216814279556274, "learning_rate": 1.5051861860668442e-05, "loss": 0.2719, "step": 12960 }, { "epoch": 1.275601780138182, "grad_norm": 0.6168338656425476, "learning_rate": 1.5047887771728333e-05, "loss": 0.3023, "step": 12970 }, { "epoch": 1.2765852818961914, "grad_norm": 5.514144420623779, "learning_rate": 1.5043913682788223e-05, "loss": 0.2775, "step": 12980 }, { "epoch": 1.2775687836542007, "grad_norm": 1.363470435142517, "learning_rate": 1.5039939593848113e-05, "loss": 0.3031, "step": 12990 }, { "epoch": 1.2785522854122102, "grad_norm": 1.6486947536468506, "learning_rate": 1.5035965504908e-05, "loss": 0.1949, "step": 13000 }, { "epoch": 1.2785522854122102, "eval_loss": 0.16120639443397522, "eval_runtime": 17.3094, "eval_samples_per_second": 2.889, "eval_steps_per_second": 1.444, "step": 13000 }, { "epoch": 1.2795357871702195, "grad_norm": 0.5180084109306335, "learning_rate": 1.503199141596789e-05, "loss": 0.2341, "step": 13010 }, { "epoch": 1.2805192889282289, "grad_norm": 0.9390528798103333, "learning_rate": 1.502801732702778e-05, "loss": 0.2389, "step": 13020 }, { "epoch": 1.2815027906862384, "grad_norm": 2.1658847332000732, "learning_rate": 1.502404323808767e-05, "loss": 0.3606, "step": 13030 }, { "epoch": 1.2824862924442477, "grad_norm": 1.2159450054168701, "learning_rate": 1.5020069149147558e-05, "loss": 0.3057, "step": 13040 }, { "epoch": 1.283469794202257, "grad_norm": 2.1090898513793945, "learning_rate": 1.5016095060207449e-05, "loss": 0.2236, "step": 13050 }, { "epoch": 1.2844532959602666, "grad_norm": 0.6798930168151855, "learning_rate": 1.5012120971267339e-05, "loss": 0.228, "step": 13060 }, { "epoch": 1.285436797718276, "grad_norm": 1.0679746866226196, "learning_rate": 1.5008146882327228e-05, "loss": 0.238, "step": 13070 }, { "epoch": 1.2864202994762852, "grad_norm": 1.5444985628128052, "learning_rate": 1.5004172793387118e-05, "loss": 0.2334, "step": 13080 }, { "epoch": 1.2874038012342948, "grad_norm": 2.783684253692627, "learning_rate": 1.5000198704447006e-05, "loss": 0.2227, "step": 13090 }, { "epoch": 1.288387302992304, "grad_norm": 2.795809745788574, "learning_rate": 1.4996224615506896e-05, "loss": 0.283, "step": 13100 }, { "epoch": 1.2893708047503134, "grad_norm": 0.8713507652282715, "learning_rate": 1.4992250526566785e-05, "loss": 0.2309, "step": 13110 }, { "epoch": 1.290354306508323, "grad_norm": 0.8911621570587158, "learning_rate": 1.4988276437626675e-05, "loss": 0.2284, "step": 13120 }, { "epoch": 1.2913378082663323, "grad_norm": 0.840497612953186, "learning_rate": 1.4984302348686566e-05, "loss": 0.23, "step": 13130 }, { "epoch": 1.2923213100243416, "grad_norm": 1.8198657035827637, "learning_rate": 1.4980328259746454e-05, "loss": 0.2973, "step": 13140 }, { "epoch": 1.2933048117823511, "grad_norm": 2.5215814113616943, "learning_rate": 1.4976354170806344e-05, "loss": 0.3074, "step": 13150 }, { "epoch": 1.2942883135403604, "grad_norm": 1.1055022478103638, "learning_rate": 1.4972380081866234e-05, "loss": 0.1795, "step": 13160 }, { "epoch": 1.2952718152983698, "grad_norm": 1.3594915866851807, "learning_rate": 1.4968405992926123e-05, "loss": 0.1897, "step": 13170 }, { "epoch": 1.2962553170563793, "grad_norm": 0.9490194916725159, "learning_rate": 1.4964431903986011e-05, "loss": 0.208, "step": 13180 }, { "epoch": 1.2972388188143886, "grad_norm": 0.922896146774292, "learning_rate": 1.4960457815045901e-05, "loss": 0.2214, "step": 13190 }, { "epoch": 1.298222320572398, "grad_norm": 1.7510035037994385, "learning_rate": 1.495648372610579e-05, "loss": 0.1776, "step": 13200 }, { "epoch": 1.2992058223304075, "grad_norm": 2.1112654209136963, "learning_rate": 1.4952509637165682e-05, "loss": 0.2239, "step": 13210 }, { "epoch": 1.3001893240884168, "grad_norm": 0.6557284593582153, "learning_rate": 1.4948535548225572e-05, "loss": 0.1871, "step": 13220 }, { "epoch": 1.3011728258464261, "grad_norm": 3.473644733428955, "learning_rate": 1.4944561459285461e-05, "loss": 0.2617, "step": 13230 }, { "epoch": 1.3021563276044357, "grad_norm": 2.699770212173462, "learning_rate": 1.494058737034535e-05, "loss": 0.228, "step": 13240 }, { "epoch": 1.303139829362445, "grad_norm": 1.5528502464294434, "learning_rate": 1.4936613281405239e-05, "loss": 0.3006, "step": 13250 }, { "epoch": 1.3041233311204543, "grad_norm": 1.9565653800964355, "learning_rate": 1.4932639192465129e-05, "loss": 0.2494, "step": 13260 }, { "epoch": 1.3051068328784639, "grad_norm": 0.38119733333587646, "learning_rate": 1.4928665103525018e-05, "loss": 0.1939, "step": 13270 }, { "epoch": 1.3060903346364732, "grad_norm": 2.829664707183838, "learning_rate": 1.4924691014584906e-05, "loss": 0.166, "step": 13280 }, { "epoch": 1.3070738363944825, "grad_norm": 1.1717098951339722, "learning_rate": 1.4920716925644797e-05, "loss": 0.2323, "step": 13290 }, { "epoch": 1.308057338152492, "grad_norm": 1.2100815773010254, "learning_rate": 1.4916742836704687e-05, "loss": 0.3392, "step": 13300 }, { "epoch": 1.3090408399105014, "grad_norm": 1.8113962411880493, "learning_rate": 1.4912768747764577e-05, "loss": 0.2639, "step": 13310 }, { "epoch": 1.3100243416685107, "grad_norm": 3.25687837600708, "learning_rate": 1.4908794658824466e-05, "loss": 0.2449, "step": 13320 }, { "epoch": 1.3110078434265202, "grad_norm": 0.5960381031036377, "learning_rate": 1.4904820569884354e-05, "loss": 0.2679, "step": 13330 }, { "epoch": 1.3119913451845295, "grad_norm": 1.6405853033065796, "learning_rate": 1.4900846480944244e-05, "loss": 0.1678, "step": 13340 }, { "epoch": 1.3129748469425389, "grad_norm": 1.3594471216201782, "learning_rate": 1.4896872392004134e-05, "loss": 0.2544, "step": 13350 }, { "epoch": 1.3139583487005484, "grad_norm": 1.7740055322647095, "learning_rate": 1.4892898303064023e-05, "loss": 0.2556, "step": 13360 }, { "epoch": 1.3149418504585577, "grad_norm": 1.7624731063842773, "learning_rate": 1.4888924214123915e-05, "loss": 0.3219, "step": 13370 }, { "epoch": 1.315925352216567, "grad_norm": 1.0945662260055542, "learning_rate": 1.4884950125183803e-05, "loss": 0.3301, "step": 13380 }, { "epoch": 1.3169088539745766, "grad_norm": 0.43884381651878357, "learning_rate": 1.4880976036243692e-05, "loss": 0.3388, "step": 13390 }, { "epoch": 1.317892355732586, "grad_norm": 0.7153629660606384, "learning_rate": 1.4877001947303582e-05, "loss": 0.1712, "step": 13400 }, { "epoch": 1.3188758574905952, "grad_norm": 2.686660051345825, "learning_rate": 1.4873027858363472e-05, "loss": 0.1999, "step": 13410 }, { "epoch": 1.3198593592486048, "grad_norm": 1.2886070013046265, "learning_rate": 1.486905376942336e-05, "loss": 0.3052, "step": 13420 }, { "epoch": 1.320842861006614, "grad_norm": 1.0095949172973633, "learning_rate": 1.486507968048325e-05, "loss": 0.2075, "step": 13430 }, { "epoch": 1.3218263627646234, "grad_norm": 2.6597988605499268, "learning_rate": 1.4861105591543139e-05, "loss": 0.1725, "step": 13440 }, { "epoch": 1.322809864522633, "grad_norm": 0.8734903335571289, "learning_rate": 1.485713150260303e-05, "loss": 0.3388, "step": 13450 }, { "epoch": 1.3237933662806423, "grad_norm": 0.8990864753723145, "learning_rate": 1.485315741366292e-05, "loss": 0.1874, "step": 13460 }, { "epoch": 1.3247768680386516, "grad_norm": 1.3725947141647339, "learning_rate": 1.484918332472281e-05, "loss": 0.2593, "step": 13470 }, { "epoch": 1.3257603697966611, "grad_norm": 2.846909284591675, "learning_rate": 1.4845209235782698e-05, "loss": 0.3015, "step": 13480 }, { "epoch": 1.3267438715546704, "grad_norm": 0.32054901123046875, "learning_rate": 1.4841235146842587e-05, "loss": 0.1395, "step": 13490 }, { "epoch": 1.3277273733126798, "grad_norm": 1.4633597135543823, "learning_rate": 1.4837261057902477e-05, "loss": 0.312, "step": 13500 }, { "epoch": 1.3277273733126798, "eval_loss": 0.16013847291469574, "eval_runtime": 15.9302, "eval_samples_per_second": 3.139, "eval_steps_per_second": 1.569, "step": 13500 }, { "epoch": 1.3287108750706893, "grad_norm": 1.0678695440292358, "learning_rate": 1.4833286968962367e-05, "loss": 0.153, "step": 13510 }, { "epoch": 1.3296943768286986, "grad_norm": 2.600311040878296, "learning_rate": 1.4829312880022255e-05, "loss": 0.1909, "step": 13520 }, { "epoch": 1.330677878586708, "grad_norm": 0.7847926020622253, "learning_rate": 1.4825338791082146e-05, "loss": 0.3542, "step": 13530 }, { "epoch": 1.3316613803447175, "grad_norm": 1.6524434089660645, "learning_rate": 1.4821364702142036e-05, "loss": 0.1395, "step": 13540 }, { "epoch": 1.3326448821027268, "grad_norm": 0.7375239729881287, "learning_rate": 1.4817390613201925e-05, "loss": 0.2814, "step": 13550 }, { "epoch": 1.3336283838607361, "grad_norm": 1.0905476808547974, "learning_rate": 1.4813416524261815e-05, "loss": 0.1523, "step": 13560 }, { "epoch": 1.3346118856187457, "grad_norm": 1.2427202463150024, "learning_rate": 1.4809442435321703e-05, "loss": 0.2604, "step": 13570 }, { "epoch": 1.335595387376755, "grad_norm": 1.518129825592041, "learning_rate": 1.4805468346381593e-05, "loss": 0.2609, "step": 13580 }, { "epoch": 1.3365788891347643, "grad_norm": 0.9168410897254944, "learning_rate": 1.4801494257441482e-05, "loss": 0.2219, "step": 13590 }, { "epoch": 1.3375623908927738, "grad_norm": 0.8411401510238647, "learning_rate": 1.4797520168501372e-05, "loss": 0.2587, "step": 13600 }, { "epoch": 1.3385458926507832, "grad_norm": 2.603593587875366, "learning_rate": 1.4793546079561263e-05, "loss": 0.1987, "step": 13610 }, { "epoch": 1.3395293944087925, "grad_norm": 1.346815824508667, "learning_rate": 1.4789571990621151e-05, "loss": 0.284, "step": 13620 }, { "epoch": 1.340512896166802, "grad_norm": 1.0927194356918335, "learning_rate": 1.4785597901681041e-05, "loss": 0.2548, "step": 13630 }, { "epoch": 1.3414963979248113, "grad_norm": 1.6921230554580688, "learning_rate": 1.478162381274093e-05, "loss": 0.2953, "step": 13640 }, { "epoch": 1.3424798996828207, "grad_norm": 0.9991982579231262, "learning_rate": 1.477764972380082e-05, "loss": 0.19, "step": 13650 }, { "epoch": 1.3434634014408302, "grad_norm": 1.5114970207214355, "learning_rate": 1.4773675634860708e-05, "loss": 0.2231, "step": 13660 }, { "epoch": 1.3444469031988395, "grad_norm": 1.9252091646194458, "learning_rate": 1.4769701545920598e-05, "loss": 0.2684, "step": 13670 }, { "epoch": 1.3454304049568488, "grad_norm": 0.333156019449234, "learning_rate": 1.4765727456980488e-05, "loss": 0.1842, "step": 13680 }, { "epoch": 1.3464139067148584, "grad_norm": 1.4571133852005005, "learning_rate": 1.4761753368040377e-05, "loss": 0.3031, "step": 13690 }, { "epoch": 1.3473974084728677, "grad_norm": 3.225837469100952, "learning_rate": 1.4757779279100269e-05, "loss": 0.2891, "step": 13700 }, { "epoch": 1.348380910230877, "grad_norm": 0.8479882478713989, "learning_rate": 1.4753805190160158e-05, "loss": 0.2203, "step": 13710 }, { "epoch": 1.3493644119888863, "grad_norm": 1.3609168529510498, "learning_rate": 1.4749831101220046e-05, "loss": 0.3754, "step": 13720 }, { "epoch": 1.350347913746896, "grad_norm": 2.286158561706543, "learning_rate": 1.4745857012279936e-05, "loss": 0.3493, "step": 13730 }, { "epoch": 1.3513314155049052, "grad_norm": 0.7514020204544067, "learning_rate": 1.4741882923339826e-05, "loss": 0.3501, "step": 13740 }, { "epoch": 1.3523149172629145, "grad_norm": 1.157344937324524, "learning_rate": 1.4737908834399715e-05, "loss": 0.3249, "step": 13750 }, { "epoch": 1.353298419020924, "grad_norm": 2.1621084213256836, "learning_rate": 1.4733934745459603e-05, "loss": 0.1955, "step": 13760 }, { "epoch": 1.3542819207789334, "grad_norm": 1.8904935121536255, "learning_rate": 1.4729960656519493e-05, "loss": 0.2264, "step": 13770 }, { "epoch": 1.3552654225369427, "grad_norm": 0.9935181736946106, "learning_rate": 1.4725986567579384e-05, "loss": 0.2989, "step": 13780 }, { "epoch": 1.3562489242949523, "grad_norm": 1.2855091094970703, "learning_rate": 1.4722012478639274e-05, "loss": 0.1557, "step": 13790 }, { "epoch": 1.3572324260529616, "grad_norm": 0.6436872482299805, "learning_rate": 1.4718038389699164e-05, "loss": 0.3041, "step": 13800 }, { "epoch": 1.358215927810971, "grad_norm": 2.33740496635437, "learning_rate": 1.4714064300759052e-05, "loss": 0.3134, "step": 13810 }, { "epoch": 1.3591994295689804, "grad_norm": 0.6652860641479492, "learning_rate": 1.4710090211818941e-05, "loss": 0.232, "step": 13820 }, { "epoch": 1.3601829313269898, "grad_norm": 1.8206769227981567, "learning_rate": 1.4706116122878831e-05, "loss": 0.2781, "step": 13830 }, { "epoch": 1.361166433084999, "grad_norm": 2.324544906616211, "learning_rate": 1.470214203393872e-05, "loss": 0.3082, "step": 13840 }, { "epoch": 1.3621499348430086, "grad_norm": 1.1942061185836792, "learning_rate": 1.4698167944998609e-05, "loss": 0.2314, "step": 13850 }, { "epoch": 1.363133436601018, "grad_norm": 0.7377877831459045, "learning_rate": 1.46941938560585e-05, "loss": 0.1839, "step": 13860 }, { "epoch": 1.3641169383590273, "grad_norm": 1.2359178066253662, "learning_rate": 1.469021976711839e-05, "loss": 0.33, "step": 13870 }, { "epoch": 1.3651004401170366, "grad_norm": 0.2554020583629608, "learning_rate": 1.468624567817828e-05, "loss": 0.2232, "step": 13880 }, { "epoch": 1.3660839418750461, "grad_norm": 1.4700244665145874, "learning_rate": 1.4682271589238169e-05, "loss": 0.2821, "step": 13890 }, { "epoch": 1.3670674436330554, "grad_norm": 2.099269390106201, "learning_rate": 1.4678297500298057e-05, "loss": 0.3381, "step": 13900 }, { "epoch": 1.3680509453910648, "grad_norm": 1.11494779586792, "learning_rate": 1.4674323411357947e-05, "loss": 0.2198, "step": 13910 }, { "epoch": 1.3690344471490743, "grad_norm": 0.775420069694519, "learning_rate": 1.4670349322417836e-05, "loss": 0.2713, "step": 13920 }, { "epoch": 1.3700179489070836, "grad_norm": 0.717750072479248, "learning_rate": 1.4666375233477726e-05, "loss": 0.2709, "step": 13930 }, { "epoch": 1.371001450665093, "grad_norm": 4.715325355529785, "learning_rate": 1.4662401144537617e-05, "loss": 0.3318, "step": 13940 }, { "epoch": 1.3719849524231025, "grad_norm": 1.486010193824768, "learning_rate": 1.4658427055597507e-05, "loss": 0.2406, "step": 13950 }, { "epoch": 1.3729684541811118, "grad_norm": 0.7549448609352112, "learning_rate": 1.4654452966657395e-05, "loss": 0.2218, "step": 13960 }, { "epoch": 1.3739519559391211, "grad_norm": 4.172186374664307, "learning_rate": 1.4650478877717285e-05, "loss": 0.2255, "step": 13970 }, { "epoch": 1.3749354576971307, "grad_norm": 1.9817334413528442, "learning_rate": 1.4646504788777174e-05, "loss": 0.2186, "step": 13980 }, { "epoch": 1.37591895945514, "grad_norm": 0.8794751763343811, "learning_rate": 1.4642530699837064e-05, "loss": 0.225, "step": 13990 }, { "epoch": 1.3769024612131493, "grad_norm": 2.93635630607605, "learning_rate": 1.4638556610896952e-05, "loss": 0.2007, "step": 14000 }, { "epoch": 1.3769024612131493, "eval_loss": 0.1604984849691391, "eval_runtime": 18.6319, "eval_samples_per_second": 2.684, "eval_steps_per_second": 1.342, "step": 14000 }, { "epoch": 1.3778859629711588, "grad_norm": 0.817743182182312, "learning_rate": 1.4634582521956842e-05, "loss": 0.3481, "step": 14010 }, { "epoch": 1.3788694647291682, "grad_norm": 1.1612430810928345, "learning_rate": 1.4630608433016733e-05, "loss": 0.2793, "step": 14020 }, { "epoch": 1.3798529664871775, "grad_norm": 3.0838816165924072, "learning_rate": 1.4626634344076623e-05, "loss": 0.2032, "step": 14030 }, { "epoch": 1.380836468245187, "grad_norm": 0.568408191204071, "learning_rate": 1.4622660255136512e-05, "loss": 0.2087, "step": 14040 }, { "epoch": 1.3818199700031963, "grad_norm": 3.588900327682495, "learning_rate": 1.46186861661964e-05, "loss": 0.2822, "step": 14050 }, { "epoch": 1.3828034717612057, "grad_norm": 1.1956599950790405, "learning_rate": 1.461471207725629e-05, "loss": 0.3036, "step": 14060 }, { "epoch": 1.3837869735192152, "grad_norm": 0.5779142379760742, "learning_rate": 1.461073798831618e-05, "loss": 0.2908, "step": 14070 }, { "epoch": 1.3847704752772245, "grad_norm": 1.5077542066574097, "learning_rate": 1.460676389937607e-05, "loss": 0.2289, "step": 14080 }, { "epoch": 1.3857539770352338, "grad_norm": 1.0276594161987305, "learning_rate": 1.4602789810435957e-05, "loss": 0.3104, "step": 14090 }, { "epoch": 1.3867374787932434, "grad_norm": 1.3784536123275757, "learning_rate": 1.4598815721495848e-05, "loss": 0.2864, "step": 14100 }, { "epoch": 1.3877209805512527, "grad_norm": 0.7437291145324707, "learning_rate": 1.4594841632555738e-05, "loss": 0.236, "step": 14110 }, { "epoch": 1.388704482309262, "grad_norm": 0.7670959234237671, "learning_rate": 1.4590867543615628e-05, "loss": 0.273, "step": 14120 }, { "epoch": 1.3896879840672716, "grad_norm": 1.2429335117340088, "learning_rate": 1.4586893454675517e-05, "loss": 0.21, "step": 14130 }, { "epoch": 1.3906714858252809, "grad_norm": 0.8416498303413391, "learning_rate": 1.4582919365735405e-05, "loss": 0.2271, "step": 14140 }, { "epoch": 1.3916549875832902, "grad_norm": 0.8700377941131592, "learning_rate": 1.4578945276795295e-05, "loss": 0.2903, "step": 14150 }, { "epoch": 1.3926384893412997, "grad_norm": 1.6111570596694946, "learning_rate": 1.4574971187855185e-05, "loss": 0.1538, "step": 14160 }, { "epoch": 1.393621991099309, "grad_norm": 2.0806915760040283, "learning_rate": 1.4570997098915074e-05, "loss": 0.2772, "step": 14170 }, { "epoch": 1.3946054928573184, "grad_norm": 2.6657345294952393, "learning_rate": 1.4567023009974966e-05, "loss": 0.2076, "step": 14180 }, { "epoch": 1.395588994615328, "grad_norm": 0.41977888345718384, "learning_rate": 1.4563048921034855e-05, "loss": 0.1928, "step": 14190 }, { "epoch": 1.3965724963733372, "grad_norm": 2.8233563899993896, "learning_rate": 1.4559074832094743e-05, "loss": 0.309, "step": 14200 }, { "epoch": 1.3975559981313466, "grad_norm": 0.8026371598243713, "learning_rate": 1.4555100743154633e-05, "loss": 0.2731, "step": 14210 }, { "epoch": 1.398539499889356, "grad_norm": 1.8230639696121216, "learning_rate": 1.4551126654214523e-05, "loss": 0.1933, "step": 14220 }, { "epoch": 1.3995230016473654, "grad_norm": 2.1196181774139404, "learning_rate": 1.4547152565274412e-05, "loss": 0.2856, "step": 14230 }, { "epoch": 1.4005065034053747, "grad_norm": 0.863348662853241, "learning_rate": 1.45431784763343e-05, "loss": 0.3251, "step": 14240 }, { "epoch": 1.4014900051633843, "grad_norm": 0.5216766595840454, "learning_rate": 1.453920438739419e-05, "loss": 0.292, "step": 14250 }, { "epoch": 1.4024735069213936, "grad_norm": 1.193357229232788, "learning_rate": 1.4535230298454081e-05, "loss": 0.2078, "step": 14260 }, { "epoch": 1.403457008679403, "grad_norm": 1.630039095878601, "learning_rate": 1.4531256209513971e-05, "loss": 0.26, "step": 14270 }, { "epoch": 1.4044405104374125, "grad_norm": 1.493647813796997, "learning_rate": 1.452728212057386e-05, "loss": 0.1423, "step": 14280 }, { "epoch": 1.4054240121954218, "grad_norm": 1.684227466583252, "learning_rate": 1.4523308031633749e-05, "loss": 0.2289, "step": 14290 }, { "epoch": 1.406407513953431, "grad_norm": 0.8049089908599854, "learning_rate": 1.4519333942693638e-05, "loss": 0.285, "step": 14300 }, { "epoch": 1.4073910157114407, "grad_norm": 2.782198905944824, "learning_rate": 1.4515359853753528e-05, "loss": 0.1941, "step": 14310 }, { "epoch": 1.40837451746945, "grad_norm": 0.9174168109893799, "learning_rate": 1.4511385764813418e-05, "loss": 0.2563, "step": 14320 }, { "epoch": 1.4093580192274593, "grad_norm": 0.38679203391075134, "learning_rate": 1.4507411675873306e-05, "loss": 0.2634, "step": 14330 }, { "epoch": 1.4103415209854688, "grad_norm": 1.094164490699768, "learning_rate": 1.4503437586933197e-05, "loss": 0.2494, "step": 14340 }, { "epoch": 1.4113250227434782, "grad_norm": 3.6802244186401367, "learning_rate": 1.4499463497993087e-05, "loss": 0.2378, "step": 14350 }, { "epoch": 1.4123085245014875, "grad_norm": 2.8969945907592773, "learning_rate": 1.4495489409052976e-05, "loss": 0.246, "step": 14360 }, { "epoch": 1.413292026259497, "grad_norm": 5.990109443664551, "learning_rate": 1.4491515320112866e-05, "loss": 0.2624, "step": 14370 }, { "epoch": 1.4142755280175063, "grad_norm": 0.3921045958995819, "learning_rate": 1.4487541231172754e-05, "loss": 0.1834, "step": 14380 }, { "epoch": 1.4152590297755157, "grad_norm": 0.8760048747062683, "learning_rate": 1.4483567142232644e-05, "loss": 0.2399, "step": 14390 }, { "epoch": 1.4162425315335252, "grad_norm": 0.5012834668159485, "learning_rate": 1.4479593053292533e-05, "loss": 0.1621, "step": 14400 }, { "epoch": 1.4172260332915345, "grad_norm": 2.5045790672302246, "learning_rate": 1.4475618964352423e-05, "loss": 0.2474, "step": 14410 }, { "epoch": 1.4182095350495438, "grad_norm": 0.9351314902305603, "learning_rate": 1.4471644875412314e-05, "loss": 0.1777, "step": 14420 }, { "epoch": 1.4191930368075534, "grad_norm": 1.5870965719223022, "learning_rate": 1.4467670786472204e-05, "loss": 0.3198, "step": 14430 }, { "epoch": 1.4201765385655627, "grad_norm": 1.0936942100524902, "learning_rate": 1.4463696697532092e-05, "loss": 0.2152, "step": 14440 }, { "epoch": 1.421160040323572, "grad_norm": 0.5950337648391724, "learning_rate": 1.4459722608591982e-05, "loss": 0.2609, "step": 14450 }, { "epoch": 1.4221435420815816, "grad_norm": 0.7351011633872986, "learning_rate": 1.4455748519651871e-05, "loss": 0.2023, "step": 14460 }, { "epoch": 1.4231270438395909, "grad_norm": 1.832258939743042, "learning_rate": 1.4451774430711761e-05, "loss": 0.2673, "step": 14470 }, { "epoch": 1.4241105455976002, "grad_norm": 1.5636357069015503, "learning_rate": 1.4447800341771649e-05, "loss": 0.2644, "step": 14480 }, { "epoch": 1.4250940473556097, "grad_norm": 1.3714380264282227, "learning_rate": 1.4443826252831539e-05, "loss": 0.2067, "step": 14490 }, { "epoch": 1.426077549113619, "grad_norm": 3.591639757156372, "learning_rate": 1.4439852163891428e-05, "loss": 0.2843, "step": 14500 }, { "epoch": 1.426077549113619, "eval_loss": 0.1673988401889801, "eval_runtime": 16.9001, "eval_samples_per_second": 2.959, "eval_steps_per_second": 1.479, "step": 14500 }, { "epoch": 1.4270610508716284, "grad_norm": 1.7980598211288452, "learning_rate": 1.443587807495132e-05, "loss": 0.2681, "step": 14510 }, { "epoch": 1.428044552629638, "grad_norm": 1.097467064857483, "learning_rate": 1.443190398601121e-05, "loss": 0.262, "step": 14520 }, { "epoch": 1.4290280543876472, "grad_norm": 3.742469072341919, "learning_rate": 1.4427929897071097e-05, "loss": 0.1493, "step": 14530 }, { "epoch": 1.4300115561456566, "grad_norm": 0.4028743505477905, "learning_rate": 1.4423955808130987e-05, "loss": 0.3007, "step": 14540 }, { "epoch": 1.430995057903666, "grad_norm": 1.2121284008026123, "learning_rate": 1.4419981719190877e-05, "loss": 0.2974, "step": 14550 }, { "epoch": 1.4319785596616754, "grad_norm": 1.3690731525421143, "learning_rate": 1.4416007630250766e-05, "loss": 0.199, "step": 14560 }, { "epoch": 1.4329620614196847, "grad_norm": 1.1825859546661377, "learning_rate": 1.4412033541310654e-05, "loss": 0.2872, "step": 14570 }, { "epoch": 1.4339455631776943, "grad_norm": 2.86308217048645, "learning_rate": 1.4408059452370544e-05, "loss": 0.2329, "step": 14580 }, { "epoch": 1.4349290649357036, "grad_norm": 1.6855119466781616, "learning_rate": 1.4404085363430435e-05, "loss": 0.2582, "step": 14590 }, { "epoch": 1.435912566693713, "grad_norm": 4.255486965179443, "learning_rate": 1.4400111274490325e-05, "loss": 0.345, "step": 14600 }, { "epoch": 1.4368960684517225, "grad_norm": 3.486799478530884, "learning_rate": 1.4396137185550215e-05, "loss": 0.166, "step": 14610 }, { "epoch": 1.4378795702097318, "grad_norm": 0.6261281371116638, "learning_rate": 1.4392163096610103e-05, "loss": 0.1763, "step": 14620 }, { "epoch": 1.438863071967741, "grad_norm": 0.37830328941345215, "learning_rate": 1.4388189007669992e-05, "loss": 0.2208, "step": 14630 }, { "epoch": 1.4398465737257506, "grad_norm": 1.6429592370986938, "learning_rate": 1.4384214918729882e-05, "loss": 0.2523, "step": 14640 }, { "epoch": 1.44083007548376, "grad_norm": 0.754708468914032, "learning_rate": 1.4380240829789772e-05, "loss": 0.281, "step": 14650 }, { "epoch": 1.4418135772417693, "grad_norm": 0.8801028728485107, "learning_rate": 1.437626674084966e-05, "loss": 0.2863, "step": 14660 }, { "epoch": 1.4427970789997788, "grad_norm": 3.150028944015503, "learning_rate": 1.4372292651909553e-05, "loss": 0.2345, "step": 14670 }, { "epoch": 1.4437805807577881, "grad_norm": 0.9762952923774719, "learning_rate": 1.436831856296944e-05, "loss": 0.2124, "step": 14680 }, { "epoch": 1.4447640825157975, "grad_norm": 1.6731175184249878, "learning_rate": 1.436434447402933e-05, "loss": 0.2326, "step": 14690 }, { "epoch": 1.445747584273807, "grad_norm": 2.7856829166412354, "learning_rate": 1.436037038508922e-05, "loss": 0.2434, "step": 14700 }, { "epoch": 1.4467310860318163, "grad_norm": 1.2525384426116943, "learning_rate": 1.435639629614911e-05, "loss": 0.2459, "step": 14710 }, { "epoch": 1.4477145877898256, "grad_norm": 0.6227846741676331, "learning_rate": 1.4352422207208998e-05, "loss": 0.2067, "step": 14720 }, { "epoch": 1.4486980895478352, "grad_norm": 1.1095690727233887, "learning_rate": 1.4348448118268887e-05, "loss": 0.1717, "step": 14730 }, { "epoch": 1.4496815913058445, "grad_norm": 0.937445878982544, "learning_rate": 1.4344474029328777e-05, "loss": 0.1699, "step": 14740 }, { "epoch": 1.4506650930638538, "grad_norm": 0.8587270379066467, "learning_rate": 1.4340499940388668e-05, "loss": 0.2592, "step": 14750 }, { "epoch": 1.4516485948218634, "grad_norm": 1.02701997756958, "learning_rate": 1.4336525851448558e-05, "loss": 0.2234, "step": 14760 }, { "epoch": 1.4526320965798727, "grad_norm": 1.4647469520568848, "learning_rate": 1.4332551762508446e-05, "loss": 0.2237, "step": 14770 }, { "epoch": 1.453615598337882, "grad_norm": 0.657336950302124, "learning_rate": 1.4328577673568336e-05, "loss": 0.1871, "step": 14780 }, { "epoch": 1.4545991000958916, "grad_norm": 1.5731921195983887, "learning_rate": 1.4324603584628225e-05, "loss": 0.2241, "step": 14790 }, { "epoch": 1.4555826018539009, "grad_norm": 2.8147099018096924, "learning_rate": 1.4320629495688115e-05, "loss": 0.2887, "step": 14800 }, { "epoch": 1.4565661036119102, "grad_norm": 1.6455349922180176, "learning_rate": 1.4316655406748003e-05, "loss": 0.2719, "step": 14810 }, { "epoch": 1.4575496053699197, "grad_norm": 1.0510404109954834, "learning_rate": 1.4312681317807892e-05, "loss": 0.293, "step": 14820 }, { "epoch": 1.458533107127929, "grad_norm": 2.582322120666504, "learning_rate": 1.4308707228867784e-05, "loss": 0.1853, "step": 14830 }, { "epoch": 1.4595166088859384, "grad_norm": 1.337494134902954, "learning_rate": 1.4304733139927674e-05, "loss": 0.3178, "step": 14840 }, { "epoch": 1.460500110643948, "grad_norm": 0.4273045063018799, "learning_rate": 1.4300759050987563e-05, "loss": 0.2021, "step": 14850 }, { "epoch": 1.4614836124019572, "grad_norm": 1.0830575227737427, "learning_rate": 1.4296784962047451e-05, "loss": 0.3078, "step": 14860 }, { "epoch": 1.4624671141599666, "grad_norm": 0.7531421184539795, "learning_rate": 1.429281087310734e-05, "loss": 0.2925, "step": 14870 }, { "epoch": 1.4634506159179759, "grad_norm": 1.9662960767745972, "learning_rate": 1.428883678416723e-05, "loss": 0.2291, "step": 14880 }, { "epoch": 1.4644341176759854, "grad_norm": 0.8408644795417786, "learning_rate": 1.428486269522712e-05, "loss": 0.3016, "step": 14890 }, { "epoch": 1.4654176194339947, "grad_norm": 0.5292760729789734, "learning_rate": 1.4280888606287008e-05, "loss": 0.1698, "step": 14900 }, { "epoch": 1.466401121192004, "grad_norm": 1.8510633707046509, "learning_rate": 1.4276914517346901e-05, "loss": 0.2566, "step": 14910 }, { "epoch": 1.4673846229500136, "grad_norm": 0.41463837027549744, "learning_rate": 1.4272940428406789e-05, "loss": 0.1905, "step": 14920 }, { "epoch": 1.468368124708023, "grad_norm": 1.1431360244750977, "learning_rate": 1.4268966339466679e-05, "loss": 0.1914, "step": 14930 }, { "epoch": 1.4693516264660322, "grad_norm": 2.338446617126465, "learning_rate": 1.4264992250526568e-05, "loss": 0.3817, "step": 14940 }, { "epoch": 1.4703351282240418, "grad_norm": 0.3825877606868744, "learning_rate": 1.4261018161586458e-05, "loss": 0.265, "step": 14950 }, { "epoch": 1.471318629982051, "grad_norm": 6.677371025085449, "learning_rate": 1.4257044072646346e-05, "loss": 0.2294, "step": 14960 }, { "epoch": 1.4723021317400604, "grad_norm": 0.9877021312713623, "learning_rate": 1.4253069983706236e-05, "loss": 0.186, "step": 14970 }, { "epoch": 1.47328563349807, "grad_norm": 0.9139722585678101, "learning_rate": 1.4249095894766125e-05, "loss": 0.2549, "step": 14980 }, { "epoch": 1.4742691352560793, "grad_norm": 8.23216438293457, "learning_rate": 1.4245121805826017e-05, "loss": 0.2506, "step": 14990 }, { "epoch": 1.4752526370140886, "grad_norm": 1.6111382246017456, "learning_rate": 1.4241147716885906e-05, "loss": 0.1843, "step": 15000 }, { "epoch": 1.4752526370140886, "eval_loss": 0.15471838414669037, "eval_runtime": 19.6276, "eval_samples_per_second": 2.547, "eval_steps_per_second": 1.274, "step": 15000 }, { "epoch": 1.476236138772098, "grad_norm": 4.984705448150635, "learning_rate": 1.4237173627945794e-05, "loss": 0.2281, "step": 15010 }, { "epoch": 1.4772196405301075, "grad_norm": 0.5729973316192627, "learning_rate": 1.4233199539005684e-05, "loss": 0.1993, "step": 15020 }, { "epoch": 1.4782031422881168, "grad_norm": 1.0867680311203003, "learning_rate": 1.4229225450065574e-05, "loss": 0.2719, "step": 15030 }, { "epoch": 1.479186644046126, "grad_norm": 0.7489296793937683, "learning_rate": 1.4225251361125463e-05, "loss": 0.2843, "step": 15040 }, { "epoch": 1.4801701458041356, "grad_norm": 2.038597345352173, "learning_rate": 1.4221277272185351e-05, "loss": 0.1898, "step": 15050 }, { "epoch": 1.481153647562145, "grad_norm": 0.9547818899154663, "learning_rate": 1.4217303183245241e-05, "loss": 0.2054, "step": 15060 }, { "epoch": 1.4821371493201543, "grad_norm": 0.9691654443740845, "learning_rate": 1.4213329094305132e-05, "loss": 0.2412, "step": 15070 }, { "epoch": 1.4831206510781638, "grad_norm": 0.49495363235473633, "learning_rate": 1.4209355005365022e-05, "loss": 0.2072, "step": 15080 }, { "epoch": 1.4841041528361731, "grad_norm": 1.4077001810073853, "learning_rate": 1.4205380916424912e-05, "loss": 0.3349, "step": 15090 }, { "epoch": 1.4850876545941825, "grad_norm": 2.472428560256958, "learning_rate": 1.42014068274848e-05, "loss": 0.1961, "step": 15100 }, { "epoch": 1.486071156352192, "grad_norm": 2.658175230026245, "learning_rate": 1.419743273854469e-05, "loss": 0.3282, "step": 15110 }, { "epoch": 1.4870546581102013, "grad_norm": 1.4328405857086182, "learning_rate": 1.4193458649604579e-05, "loss": 0.1471, "step": 15120 }, { "epoch": 1.4880381598682106, "grad_norm": 8.422746658325195, "learning_rate": 1.4189484560664469e-05, "loss": 0.3153, "step": 15130 }, { "epoch": 1.4890216616262202, "grad_norm": 1.2293131351470947, "learning_rate": 1.4185510471724357e-05, "loss": 0.1695, "step": 15140 }, { "epoch": 1.4900051633842295, "grad_norm": 1.2073103189468384, "learning_rate": 1.418153638278425e-05, "loss": 0.2651, "step": 15150 }, { "epoch": 1.4909886651422388, "grad_norm": 1.6706494092941284, "learning_rate": 1.4177562293844138e-05, "loss": 0.1824, "step": 15160 }, { "epoch": 1.4919721669002484, "grad_norm": 3.6499946117401123, "learning_rate": 1.4173588204904027e-05, "loss": 0.2299, "step": 15170 }, { "epoch": 1.4929556686582577, "grad_norm": 0.6618145704269409, "learning_rate": 1.4169614115963917e-05, "loss": 0.2916, "step": 15180 }, { "epoch": 1.493939170416267, "grad_norm": 1.2981956005096436, "learning_rate": 1.4165640027023807e-05, "loss": 0.19, "step": 15190 }, { "epoch": 1.4949226721742765, "grad_norm": 2.071259021759033, "learning_rate": 1.4161665938083695e-05, "loss": 0.2375, "step": 15200 }, { "epoch": 1.4959061739322859, "grad_norm": 0.239904522895813, "learning_rate": 1.4157691849143584e-05, "loss": 0.2239, "step": 15210 }, { "epoch": 1.4968896756902952, "grad_norm": 1.6176055669784546, "learning_rate": 1.4153717760203474e-05, "loss": 0.2425, "step": 15220 }, { "epoch": 1.4978731774483047, "grad_norm": 1.9910478591918945, "learning_rate": 1.4149743671263365e-05, "loss": 0.2636, "step": 15230 }, { "epoch": 1.498856679206314, "grad_norm": 2.132188081741333, "learning_rate": 1.4145769582323255e-05, "loss": 0.1761, "step": 15240 }, { "epoch": 1.4998401809643234, "grad_norm": 0.621258020401001, "learning_rate": 1.4141795493383143e-05, "loss": 0.2084, "step": 15250 }, { "epoch": 1.500823682722333, "grad_norm": 0.6594855189323425, "learning_rate": 1.4137821404443033e-05, "loss": 0.131, "step": 15260 }, { "epoch": 1.5018071844803422, "grad_norm": 2.1136739253997803, "learning_rate": 1.4133847315502922e-05, "loss": 0.2408, "step": 15270 }, { "epoch": 1.5027906862383515, "grad_norm": 0.18802011013031006, "learning_rate": 1.4129873226562812e-05, "loss": 0.1964, "step": 15280 }, { "epoch": 1.503774187996361, "grad_norm": 1.3580917119979858, "learning_rate": 1.41258991376227e-05, "loss": 0.3299, "step": 15290 }, { "epoch": 1.5047576897543704, "grad_norm": 0.8832782506942749, "learning_rate": 1.412192504868259e-05, "loss": 0.1468, "step": 15300 }, { "epoch": 1.5057411915123797, "grad_norm": 1.706316590309143, "learning_rate": 1.411795095974248e-05, "loss": 0.2762, "step": 15310 }, { "epoch": 1.5067246932703893, "grad_norm": 0.9284764528274536, "learning_rate": 1.411397687080237e-05, "loss": 0.2199, "step": 15320 }, { "epoch": 1.5077081950283986, "grad_norm": 2.1995420455932617, "learning_rate": 1.411000278186226e-05, "loss": 0.2495, "step": 15330 }, { "epoch": 1.508691696786408, "grad_norm": 0.9569781422615051, "learning_rate": 1.4106028692922148e-05, "loss": 0.2262, "step": 15340 }, { "epoch": 1.5096751985444175, "grad_norm": 2.7542948722839355, "learning_rate": 1.4102054603982038e-05, "loss": 0.3087, "step": 15350 }, { "epoch": 1.5106587003024268, "grad_norm": 0.3515753746032715, "learning_rate": 1.4098080515041928e-05, "loss": 0.1827, "step": 15360 }, { "epoch": 1.511642202060436, "grad_norm": 0.8161625862121582, "learning_rate": 1.4094106426101817e-05, "loss": 0.2475, "step": 15370 }, { "epoch": 1.5126257038184456, "grad_norm": 1.6958364248275757, "learning_rate": 1.4090132337161705e-05, "loss": 0.1875, "step": 15380 }, { "epoch": 1.513609205576455, "grad_norm": 2.8651888370513916, "learning_rate": 1.4086158248221595e-05, "loss": 0.2401, "step": 15390 }, { "epoch": 1.5145927073344643, "grad_norm": 0.4766581058502197, "learning_rate": 1.4082184159281486e-05, "loss": 0.2394, "step": 15400 }, { "epoch": 1.5155762090924738, "grad_norm": 2.744847059249878, "learning_rate": 1.4078210070341376e-05, "loss": 0.2546, "step": 15410 }, { "epoch": 1.5165597108504831, "grad_norm": 1.2522701025009155, "learning_rate": 1.4074235981401266e-05, "loss": 0.2419, "step": 15420 }, { "epoch": 1.5175432126084925, "grad_norm": 0.5592695474624634, "learning_rate": 1.4070261892461155e-05, "loss": 0.2088, "step": 15430 }, { "epoch": 1.518526714366502, "grad_norm": 0.7614436745643616, "learning_rate": 1.4066287803521043e-05, "loss": 0.2493, "step": 15440 }, { "epoch": 1.5195102161245113, "grad_norm": 3.61337947845459, "learning_rate": 1.4062313714580933e-05, "loss": 0.3177, "step": 15450 }, { "epoch": 1.5204937178825206, "grad_norm": 1.7841546535491943, "learning_rate": 1.4058339625640823e-05, "loss": 0.316, "step": 15460 }, { "epoch": 1.5214772196405302, "grad_norm": 0.8750028610229492, "learning_rate": 1.4054365536700712e-05, "loss": 0.2631, "step": 15470 }, { "epoch": 1.5224607213985395, "grad_norm": 1.0509700775146484, "learning_rate": 1.4050391447760604e-05, "loss": 0.1831, "step": 15480 }, { "epoch": 1.5234442231565488, "grad_norm": 1.5600563287734985, "learning_rate": 1.4046417358820492e-05, "loss": 0.1879, "step": 15490 }, { "epoch": 1.5244277249145584, "grad_norm": 1.9055111408233643, "learning_rate": 1.4042443269880381e-05, "loss": 0.2838, "step": 15500 }, { "epoch": 1.5244277249145584, "eval_loss": 0.15059475600719452, "eval_runtime": 18.3111, "eval_samples_per_second": 2.731, "eval_steps_per_second": 1.365, "step": 15500 }, { "epoch": 1.5254112266725677, "grad_norm": 2.142474412918091, "learning_rate": 1.4038469180940271e-05, "loss": 0.1434, "step": 15510 }, { "epoch": 1.526394728430577, "grad_norm": 1.5137566328048706, "learning_rate": 1.403449509200016e-05, "loss": 0.1432, "step": 15520 }, { "epoch": 1.5273782301885865, "grad_norm": 0.8162533044815063, "learning_rate": 1.4030521003060049e-05, "loss": 0.1593, "step": 15530 }, { "epoch": 1.5283617319465959, "grad_norm": 2.5503768920898438, "learning_rate": 1.4026546914119938e-05, "loss": 0.2212, "step": 15540 }, { "epoch": 1.5293452337046052, "grad_norm": 1.5552313327789307, "learning_rate": 1.4022572825179828e-05, "loss": 0.2256, "step": 15550 }, { "epoch": 1.5303287354626147, "grad_norm": 0.6033895611763, "learning_rate": 1.401859873623972e-05, "loss": 0.1807, "step": 15560 }, { "epoch": 1.531312237220624, "grad_norm": 2.3704938888549805, "learning_rate": 1.4014624647299609e-05, "loss": 0.336, "step": 15570 }, { "epoch": 1.5322957389786334, "grad_norm": 0.3030024468898773, "learning_rate": 1.4010650558359497e-05, "loss": 0.2108, "step": 15580 }, { "epoch": 1.533279240736643, "grad_norm": 0.4000900089740753, "learning_rate": 1.4006676469419387e-05, "loss": 0.195, "step": 15590 }, { "epoch": 1.5342627424946522, "grad_norm": 2.0329573154449463, "learning_rate": 1.4002702380479276e-05, "loss": 0.2763, "step": 15600 }, { "epoch": 1.5352462442526615, "grad_norm": 0.7310929298400879, "learning_rate": 1.3998728291539166e-05, "loss": 0.2881, "step": 15610 }, { "epoch": 1.536229746010671, "grad_norm": 1.0867869853973389, "learning_rate": 1.3994754202599054e-05, "loss": 0.3376, "step": 15620 }, { "epoch": 1.5372132477686804, "grad_norm": 1.6135231256484985, "learning_rate": 1.3990780113658943e-05, "loss": 0.1918, "step": 15630 }, { "epoch": 1.5381967495266897, "grad_norm": 1.075868844985962, "learning_rate": 1.3986806024718835e-05, "loss": 0.3447, "step": 15640 }, { "epoch": 1.5391802512846993, "grad_norm": 0.5639548301696777, "learning_rate": 1.3982831935778724e-05, "loss": 0.2486, "step": 15650 }, { "epoch": 1.5401637530427086, "grad_norm": 0.9546343684196472, "learning_rate": 1.3978857846838614e-05, "loss": 0.2489, "step": 15660 }, { "epoch": 1.541147254800718, "grad_norm": 0.4168679118156433, "learning_rate": 1.3974883757898504e-05, "loss": 0.2325, "step": 15670 }, { "epoch": 1.5421307565587274, "grad_norm": 0.40552496910095215, "learning_rate": 1.3970909668958392e-05, "loss": 0.2235, "step": 15680 }, { "epoch": 1.5431142583167368, "grad_norm": 2.4377217292785645, "learning_rate": 1.3966935580018281e-05, "loss": 0.2474, "step": 15690 }, { "epoch": 1.544097760074746, "grad_norm": 1.058454990386963, "learning_rate": 1.3962961491078171e-05, "loss": 0.3348, "step": 15700 }, { "epoch": 1.5450812618327556, "grad_norm": 2.9664440155029297, "learning_rate": 1.395898740213806e-05, "loss": 0.2238, "step": 15710 }, { "epoch": 1.546064763590765, "grad_norm": 1.4083367586135864, "learning_rate": 1.3955013313197952e-05, "loss": 0.2907, "step": 15720 }, { "epoch": 1.5470482653487743, "grad_norm": 1.7369105815887451, "learning_rate": 1.395103922425784e-05, "loss": 0.2887, "step": 15730 }, { "epoch": 1.5480317671067838, "grad_norm": 1.3579238653182983, "learning_rate": 1.394706513531773e-05, "loss": 0.3529, "step": 15740 }, { "epoch": 1.5490152688647931, "grad_norm": 1.0139018297195435, "learning_rate": 1.394309104637762e-05, "loss": 0.1786, "step": 15750 }, { "epoch": 1.5499987706228024, "grad_norm": 2.8200855255126953, "learning_rate": 1.3939116957437509e-05, "loss": 0.2144, "step": 15760 }, { "epoch": 1.550982272380812, "grad_norm": 1.6889891624450684, "learning_rate": 1.3935142868497397e-05, "loss": 0.2217, "step": 15770 }, { "epoch": 1.5519657741388213, "grad_norm": 0.8183413743972778, "learning_rate": 1.3931168779557287e-05, "loss": 0.2242, "step": 15780 }, { "epoch": 1.5529492758968306, "grad_norm": 1.7975190877914429, "learning_rate": 1.3927194690617176e-05, "loss": 0.2008, "step": 15790 }, { "epoch": 1.5539327776548402, "grad_norm": 3.9404609203338623, "learning_rate": 1.3923220601677068e-05, "loss": 0.3022, "step": 15800 }, { "epoch": 1.5549162794128495, "grad_norm": 1.471698522567749, "learning_rate": 1.3919246512736957e-05, "loss": 0.2656, "step": 15810 }, { "epoch": 1.5558997811708588, "grad_norm": 2.58359956741333, "learning_rate": 1.3915272423796845e-05, "loss": 0.2049, "step": 15820 }, { "epoch": 1.5568832829288684, "grad_norm": 1.1108944416046143, "learning_rate": 1.3911298334856735e-05, "loss": 0.3153, "step": 15830 }, { "epoch": 1.5578667846868777, "grad_norm": 2.1699142456054688, "learning_rate": 1.3907324245916625e-05, "loss": 0.1621, "step": 15840 }, { "epoch": 1.558850286444887, "grad_norm": 4.686306476593018, "learning_rate": 1.3903350156976514e-05, "loss": 0.2476, "step": 15850 }, { "epoch": 1.5598337882028965, "grad_norm": 0.7942647933959961, "learning_rate": 1.3899376068036402e-05, "loss": 0.315, "step": 15860 }, { "epoch": 1.5608172899609059, "grad_norm": 3.1741652488708496, "learning_rate": 1.3895401979096292e-05, "loss": 0.1179, "step": 15870 }, { "epoch": 1.5618007917189152, "grad_norm": 0.33635735511779785, "learning_rate": 1.3891427890156183e-05, "loss": 0.2017, "step": 15880 }, { "epoch": 1.5627842934769247, "grad_norm": 2.7269420623779297, "learning_rate": 1.3887453801216073e-05, "loss": 0.2831, "step": 15890 }, { "epoch": 1.563767795234934, "grad_norm": 0.5738785862922668, "learning_rate": 1.3883479712275963e-05, "loss": 0.22, "step": 15900 }, { "epoch": 1.5647512969929434, "grad_norm": 0.5138858556747437, "learning_rate": 1.3879505623335852e-05, "loss": 0.253, "step": 15910 }, { "epoch": 1.565734798750953, "grad_norm": 0.9820569753646851, "learning_rate": 1.387553153439574e-05, "loss": 0.2013, "step": 15920 }, { "epoch": 1.5667183005089622, "grad_norm": 2.7193431854248047, "learning_rate": 1.387155744545563e-05, "loss": 0.1752, "step": 15930 }, { "epoch": 1.5677018022669715, "grad_norm": 1.6706477403640747, "learning_rate": 1.386758335651552e-05, "loss": 0.2432, "step": 15940 }, { "epoch": 1.568685304024981, "grad_norm": 2.1528844833374023, "learning_rate": 1.386360926757541e-05, "loss": 0.158, "step": 15950 }, { "epoch": 1.5696688057829902, "grad_norm": 2.0076661109924316, "learning_rate": 1.38596351786353e-05, "loss": 0.3423, "step": 15960 }, { "epoch": 1.5706523075409997, "grad_norm": 1.225509524345398, "learning_rate": 1.3855661089695189e-05, "loss": 0.2511, "step": 15970 }, { "epoch": 1.5716358092990093, "grad_norm": 2.767773389816284, "learning_rate": 1.3851687000755078e-05, "loss": 0.217, "step": 15980 }, { "epoch": 1.5726193110570184, "grad_norm": 0.551873505115509, "learning_rate": 1.3847712911814968e-05, "loss": 0.1632, "step": 15990 }, { "epoch": 1.573602812815028, "grad_norm": 1.119555115699768, "learning_rate": 1.3843738822874858e-05, "loss": 0.2923, "step": 16000 }, { "epoch": 1.573602812815028, "eval_loss": 0.1566813588142395, "eval_runtime": 20.9844, "eval_samples_per_second": 2.383, "eval_steps_per_second": 1.191, "step": 16000 }, { "epoch": 1.5745863145730374, "grad_norm": 0.5722016096115112, "learning_rate": 1.3839764733934746e-05, "loss": 0.34, "step": 16010 }, { "epoch": 1.5755698163310465, "grad_norm": 2.6336591243743896, "learning_rate": 1.3835790644994635e-05, "loss": 0.1729, "step": 16020 }, { "epoch": 1.576553318089056, "grad_norm": 1.409031629562378, "learning_rate": 1.3831816556054525e-05, "loss": 0.246, "step": 16030 }, { "epoch": 1.5775368198470656, "grad_norm": 0.5853147506713867, "learning_rate": 1.3827842467114416e-05, "loss": 0.1857, "step": 16040 }, { "epoch": 1.5785203216050747, "grad_norm": 0.7682684063911438, "learning_rate": 1.3823868378174306e-05, "loss": 0.2208, "step": 16050 }, { "epoch": 1.5795038233630843, "grad_norm": 0.4091372489929199, "learning_rate": 1.3819894289234194e-05, "loss": 0.3363, "step": 16060 }, { "epoch": 1.5804873251210938, "grad_norm": 0.44479209184646606, "learning_rate": 1.3815920200294084e-05, "loss": 0.2537, "step": 16070 }, { "epoch": 1.581470826879103, "grad_norm": 1.0543726682662964, "learning_rate": 1.3811946111353973e-05, "loss": 0.2711, "step": 16080 }, { "epoch": 1.5824543286371124, "grad_norm": 0.8796023726463318, "learning_rate": 1.3807972022413863e-05, "loss": 0.2295, "step": 16090 }, { "epoch": 1.583437830395122, "grad_norm": 0.8024858832359314, "learning_rate": 1.3803997933473751e-05, "loss": 0.1683, "step": 16100 }, { "epoch": 1.584421332153131, "grad_norm": 1.081507921218872, "learning_rate": 1.380002384453364e-05, "loss": 0.1549, "step": 16110 }, { "epoch": 1.5854048339111406, "grad_norm": 1.3907278776168823, "learning_rate": 1.379604975559353e-05, "loss": 0.2263, "step": 16120 }, { "epoch": 1.5863883356691502, "grad_norm": 1.305191159248352, "learning_rate": 1.3792075666653422e-05, "loss": 0.1947, "step": 16130 }, { "epoch": 1.5873718374271593, "grad_norm": 1.4547632932662964, "learning_rate": 1.3788101577713311e-05, "loss": 0.204, "step": 16140 }, { "epoch": 1.5883553391851688, "grad_norm": 0.6436703205108643, "learning_rate": 1.3784127488773201e-05, "loss": 0.1092, "step": 16150 }, { "epoch": 1.5893388409431783, "grad_norm": 0.6339607834815979, "learning_rate": 1.3780153399833089e-05, "loss": 0.3019, "step": 16160 }, { "epoch": 1.5903223427011874, "grad_norm": 2.8297510147094727, "learning_rate": 1.3776179310892979e-05, "loss": 0.275, "step": 16170 }, { "epoch": 1.591305844459197, "grad_norm": 1.2241812944412231, "learning_rate": 1.3772205221952868e-05, "loss": 0.2319, "step": 16180 }, { "epoch": 1.5922893462172065, "grad_norm": 1.707702875137329, "learning_rate": 1.3768231133012758e-05, "loss": 0.2802, "step": 16190 }, { "epoch": 1.5932728479752156, "grad_norm": 1.518728494644165, "learning_rate": 1.3764257044072646e-05, "loss": 0.2386, "step": 16200 }, { "epoch": 1.5942563497332252, "grad_norm": 0.9995889067649841, "learning_rate": 1.3760282955132537e-05, "loss": 0.1726, "step": 16210 }, { "epoch": 1.5952398514912347, "grad_norm": 1.0202362537384033, "learning_rate": 1.3756308866192427e-05, "loss": 0.2434, "step": 16220 }, { "epoch": 1.5962233532492438, "grad_norm": 0.6778249144554138, "learning_rate": 1.3752334777252317e-05, "loss": 0.2346, "step": 16230 }, { "epoch": 1.5972068550072533, "grad_norm": 2.993239164352417, "learning_rate": 1.3748360688312206e-05, "loss": 0.1847, "step": 16240 }, { "epoch": 1.5981903567652627, "grad_norm": 0.5373549461364746, "learning_rate": 1.3744386599372094e-05, "loss": 0.2517, "step": 16250 }, { "epoch": 1.599173858523272, "grad_norm": 1.3796418905258179, "learning_rate": 1.3740412510431984e-05, "loss": 0.1498, "step": 16260 }, { "epoch": 1.6001573602812815, "grad_norm": 1.9999388456344604, "learning_rate": 1.3736438421491874e-05, "loss": 0.1945, "step": 16270 }, { "epoch": 1.6011408620392908, "grad_norm": 0.6067172884941101, "learning_rate": 1.3732464332551763e-05, "loss": 0.2447, "step": 16280 }, { "epoch": 1.6021243637973002, "grad_norm": 2.563868284225464, "learning_rate": 1.3728490243611655e-05, "loss": 0.2142, "step": 16290 }, { "epoch": 1.6031078655553097, "grad_norm": 2.002779483795166, "learning_rate": 1.3724516154671543e-05, "loss": 0.2112, "step": 16300 }, { "epoch": 1.604091367313319, "grad_norm": 2.3529181480407715, "learning_rate": 1.3720542065731432e-05, "loss": 0.2633, "step": 16310 }, { "epoch": 1.6050748690713283, "grad_norm": 1.228992223739624, "learning_rate": 1.3716567976791322e-05, "loss": 0.1728, "step": 16320 }, { "epoch": 1.606058370829338, "grad_norm": 1.3093105554580688, "learning_rate": 1.3712593887851212e-05, "loss": 0.1955, "step": 16330 }, { "epoch": 1.6070418725873472, "grad_norm": 0.8239392638206482, "learning_rate": 1.37086197989111e-05, "loss": 0.235, "step": 16340 }, { "epoch": 1.6080253743453565, "grad_norm": 1.030205488204956, "learning_rate": 1.370464570997099e-05, "loss": 0.2271, "step": 16350 }, { "epoch": 1.609008876103366, "grad_norm": 0.7238038778305054, "learning_rate": 1.3700671621030879e-05, "loss": 0.1725, "step": 16360 }, { "epoch": 1.6099923778613754, "grad_norm": 0.4815444350242615, "learning_rate": 1.369669753209077e-05, "loss": 0.2095, "step": 16370 }, { "epoch": 1.6109758796193847, "grad_norm": 1.8351434469223022, "learning_rate": 1.369272344315066e-05, "loss": 0.1796, "step": 16380 }, { "epoch": 1.6119593813773943, "grad_norm": 2.1774728298187256, "learning_rate": 1.368874935421055e-05, "loss": 0.2293, "step": 16390 }, { "epoch": 1.6129428831354036, "grad_norm": 0.7291221022605896, "learning_rate": 1.3684775265270437e-05, "loss": 0.1647, "step": 16400 }, { "epoch": 1.613926384893413, "grad_norm": 0.8646822571754456, "learning_rate": 1.3680801176330327e-05, "loss": 0.2258, "step": 16410 }, { "epoch": 1.6149098866514224, "grad_norm": 0.8485159873962402, "learning_rate": 1.3676827087390217e-05, "loss": 0.2776, "step": 16420 }, { "epoch": 1.6158933884094318, "grad_norm": 2.74750018119812, "learning_rate": 1.3672852998450106e-05, "loss": 0.3018, "step": 16430 }, { "epoch": 1.616876890167441, "grad_norm": 1.1387678384780884, "learning_rate": 1.3668878909509994e-05, "loss": 0.2192, "step": 16440 }, { "epoch": 1.6178603919254506, "grad_norm": 0.5467789173126221, "learning_rate": 1.3664904820569886e-05, "loss": 0.2944, "step": 16450 }, { "epoch": 1.61884389368346, "grad_norm": 3.122748851776123, "learning_rate": 1.3660930731629775e-05, "loss": 0.3162, "step": 16460 }, { "epoch": 1.6198273954414693, "grad_norm": 0.3885956108570099, "learning_rate": 1.3656956642689665e-05, "loss": 0.2615, "step": 16470 }, { "epoch": 1.6208108971994788, "grad_norm": 1.7819777727127075, "learning_rate": 1.3652982553749555e-05, "loss": 0.277, "step": 16480 }, { "epoch": 1.6217943989574881, "grad_norm": 1.2958201169967651, "learning_rate": 1.3649008464809443e-05, "loss": 0.2059, "step": 16490 }, { "epoch": 1.6227779007154974, "grad_norm": 1.3207638263702393, "learning_rate": 1.3645034375869332e-05, "loss": 0.1881, "step": 16500 }, { "epoch": 1.6227779007154974, "eval_loss": 0.1571766883134842, "eval_runtime": 16.1033, "eval_samples_per_second": 3.105, "eval_steps_per_second": 1.552, "step": 16500 }, { "epoch": 1.623761402473507, "grad_norm": 1.1741260290145874, "learning_rate": 1.3641060286929222e-05, "loss": 0.3229, "step": 16510 }, { "epoch": 1.6247449042315163, "grad_norm": 2.915156364440918, "learning_rate": 1.3637086197989112e-05, "loss": 0.1764, "step": 16520 }, { "epoch": 1.6257284059895256, "grad_norm": 1.0990588665008545, "learning_rate": 1.3633112109049003e-05, "loss": 0.3262, "step": 16530 }, { "epoch": 1.6267119077475352, "grad_norm": 1.178744912147522, "learning_rate": 1.3629138020108891e-05, "loss": 0.1941, "step": 16540 }, { "epoch": 1.6276954095055445, "grad_norm": 1.4768882989883423, "learning_rate": 1.362516393116878e-05, "loss": 0.2371, "step": 16550 }, { "epoch": 1.6286789112635538, "grad_norm": 0.5752533674240112, "learning_rate": 1.362118984222867e-05, "loss": 0.2492, "step": 16560 }, { "epoch": 1.6296624130215633, "grad_norm": 2.9447243213653564, "learning_rate": 1.361721575328856e-05, "loss": 0.1487, "step": 16570 }, { "epoch": 1.6306459147795727, "grad_norm": 1.0001193284988403, "learning_rate": 1.3613241664348448e-05, "loss": 0.3564, "step": 16580 }, { "epoch": 1.631629416537582, "grad_norm": 1.4185397624969482, "learning_rate": 1.3609267575408338e-05, "loss": 0.2264, "step": 16590 }, { "epoch": 1.6326129182955915, "grad_norm": 2.0910747051239014, "learning_rate": 1.3605293486468227e-05, "loss": 0.1866, "step": 16600 }, { "epoch": 1.6335964200536008, "grad_norm": 4.188348293304443, "learning_rate": 1.3601319397528119e-05, "loss": 0.1481, "step": 16610 }, { "epoch": 1.6345799218116102, "grad_norm": 2.285092830657959, "learning_rate": 1.3597345308588008e-05, "loss": 0.2241, "step": 16620 }, { "epoch": 1.6355634235696197, "grad_norm": 0.2983485162258148, "learning_rate": 1.3593371219647898e-05, "loss": 0.1065, "step": 16630 }, { "epoch": 1.636546925327629, "grad_norm": 1.9504050016403198, "learning_rate": 1.3589397130707786e-05, "loss": 0.1945, "step": 16640 }, { "epoch": 1.6375304270856383, "grad_norm": 1.3591973781585693, "learning_rate": 1.3585423041767676e-05, "loss": 0.26, "step": 16650 }, { "epoch": 1.6385139288436479, "grad_norm": 3.9365203380584717, "learning_rate": 1.3581448952827565e-05, "loss": 0.1286, "step": 16660 }, { "epoch": 1.6394974306016572, "grad_norm": 1.2122793197631836, "learning_rate": 1.3577474863887455e-05, "loss": 0.2407, "step": 16670 }, { "epoch": 1.6404809323596665, "grad_norm": 0.9854438304901123, "learning_rate": 1.3573500774947343e-05, "loss": 0.297, "step": 16680 }, { "epoch": 1.641464434117676, "grad_norm": 1.231350302696228, "learning_rate": 1.3569526686007234e-05, "loss": 0.2017, "step": 16690 }, { "epoch": 1.6424479358756854, "grad_norm": 1.2629377841949463, "learning_rate": 1.3565552597067124e-05, "loss": 0.2411, "step": 16700 }, { "epoch": 1.6434314376336947, "grad_norm": 0.4649147391319275, "learning_rate": 1.3561578508127014e-05, "loss": 0.2721, "step": 16710 }, { "epoch": 1.6444149393917042, "grad_norm": 1.1152622699737549, "learning_rate": 1.3557604419186903e-05, "loss": 0.2632, "step": 16720 }, { "epoch": 1.6453984411497136, "grad_norm": 1.209902286529541, "learning_rate": 1.3553630330246791e-05, "loss": 0.2572, "step": 16730 }, { "epoch": 1.6463819429077229, "grad_norm": 2.158609628677368, "learning_rate": 1.3549656241306681e-05, "loss": 0.2976, "step": 16740 }, { "epoch": 1.6473654446657324, "grad_norm": 0.4169705808162689, "learning_rate": 1.354568215236657e-05, "loss": 0.2098, "step": 16750 }, { "epoch": 1.6483489464237417, "grad_norm": 1.0553632974624634, "learning_rate": 1.354170806342646e-05, "loss": 0.1597, "step": 16760 }, { "epoch": 1.649332448181751, "grad_norm": 0.7214099168777466, "learning_rate": 1.3537733974486352e-05, "loss": 0.2487, "step": 16770 }, { "epoch": 1.6503159499397606, "grad_norm": 1.0439274311065674, "learning_rate": 1.353375988554624e-05, "loss": 0.3632, "step": 16780 }, { "epoch": 1.65129945169777, "grad_norm": 0.5576213598251343, "learning_rate": 1.352978579660613e-05, "loss": 0.1836, "step": 16790 }, { "epoch": 1.6522829534557792, "grad_norm": 1.6675044298171997, "learning_rate": 1.3525811707666019e-05, "loss": 0.2255, "step": 16800 }, { "epoch": 1.6532664552137888, "grad_norm": 0.6567625403404236, "learning_rate": 1.3521837618725909e-05, "loss": 0.2699, "step": 16810 }, { "epoch": 1.654249956971798, "grad_norm": 0.30426979064941406, "learning_rate": 1.3517863529785797e-05, "loss": 0.2056, "step": 16820 }, { "epoch": 1.6552334587298074, "grad_norm": 0.6561160683631897, "learning_rate": 1.3513889440845686e-05, "loss": 0.2077, "step": 16830 }, { "epoch": 1.656216960487817, "grad_norm": 0.47024333477020264, "learning_rate": 1.3509915351905576e-05, "loss": 0.307, "step": 16840 }, { "epoch": 1.6572004622458263, "grad_norm": 2.3148345947265625, "learning_rate": 1.3505941262965467e-05, "loss": 0.2145, "step": 16850 }, { "epoch": 1.6581839640038356, "grad_norm": 1.8510692119598389, "learning_rate": 1.3501967174025357e-05, "loss": 0.2056, "step": 16860 }, { "epoch": 1.6591674657618452, "grad_norm": 5.483177661895752, "learning_rate": 1.3497993085085247e-05, "loss": 0.1826, "step": 16870 }, { "epoch": 1.6601509675198545, "grad_norm": 0.8653757572174072, "learning_rate": 1.3494018996145135e-05, "loss": 0.2596, "step": 16880 }, { "epoch": 1.6611344692778638, "grad_norm": 0.8157027363777161, "learning_rate": 1.3490044907205024e-05, "loss": 0.2814, "step": 16890 }, { "epoch": 1.6621179710358733, "grad_norm": 1.144384503364563, "learning_rate": 1.3486070818264914e-05, "loss": 0.3275, "step": 16900 }, { "epoch": 1.6631014727938827, "grad_norm": 3.486330509185791, "learning_rate": 1.3482096729324804e-05, "loss": 0.2449, "step": 16910 }, { "epoch": 1.664084974551892, "grad_norm": 0.7469828724861145, "learning_rate": 1.3478122640384692e-05, "loss": 0.2374, "step": 16920 }, { "epoch": 1.6650684763099015, "grad_norm": 1.1899664402008057, "learning_rate": 1.3474148551444581e-05, "loss": 0.1911, "step": 16930 }, { "epoch": 1.6660519780679108, "grad_norm": 2.471895694732666, "learning_rate": 1.3470174462504473e-05, "loss": 0.1957, "step": 16940 }, { "epoch": 1.6670354798259202, "grad_norm": 1.0681343078613281, "learning_rate": 1.3466200373564362e-05, "loss": 0.3053, "step": 16950 }, { "epoch": 1.6680189815839297, "grad_norm": 2.1743273735046387, "learning_rate": 1.3462226284624252e-05, "loss": 0.227, "step": 16960 }, { "epoch": 1.669002483341939, "grad_norm": 6.08627462387085, "learning_rate": 1.345825219568414e-05, "loss": 0.2416, "step": 16970 }, { "epoch": 1.6699859850999483, "grad_norm": 0.654421329498291, "learning_rate": 1.345427810674403e-05, "loss": 0.2075, "step": 16980 }, { "epoch": 1.6709694868579579, "grad_norm": 0.7476296424865723, "learning_rate": 1.345030401780392e-05, "loss": 0.1306, "step": 16990 }, { "epoch": 1.6719529886159672, "grad_norm": 0.9483456611633301, "learning_rate": 1.3446329928863809e-05, "loss": 0.3279, "step": 17000 }, { "epoch": 1.6719529886159672, "eval_loss": 0.15918737649917603, "eval_runtime": 18.2512, "eval_samples_per_second": 2.74, "eval_steps_per_second": 1.37, "step": 17000 }, { "epoch": 1.6729364903739765, "grad_norm": 4.026605606079102, "learning_rate": 1.3442355839923697e-05, "loss": 0.2603, "step": 17010 }, { "epoch": 1.673919992131986, "grad_norm": 1.619104266166687, "learning_rate": 1.3438381750983588e-05, "loss": 0.2716, "step": 17020 }, { "epoch": 1.6749034938899954, "grad_norm": 1.8711702823638916, "learning_rate": 1.3434407662043478e-05, "loss": 0.2267, "step": 17030 }, { "epoch": 1.6758869956480047, "grad_norm": 2.9381566047668457, "learning_rate": 1.3430433573103368e-05, "loss": 0.2086, "step": 17040 }, { "epoch": 1.6768704974060142, "grad_norm": 0.5107515454292297, "learning_rate": 1.3426459484163257e-05, "loss": 0.1975, "step": 17050 }, { "epoch": 1.6778539991640236, "grad_norm": 3.57466197013855, "learning_rate": 1.3422485395223145e-05, "loss": 0.2691, "step": 17060 }, { "epoch": 1.6788375009220329, "grad_norm": 1.05047607421875, "learning_rate": 1.3418511306283035e-05, "loss": 0.2854, "step": 17070 }, { "epoch": 1.6798210026800424, "grad_norm": 1.1247413158416748, "learning_rate": 1.3414537217342925e-05, "loss": 0.2301, "step": 17080 }, { "epoch": 1.6808045044380515, "grad_norm": 3.0839147567749023, "learning_rate": 1.3410563128402814e-05, "loss": 0.1918, "step": 17090 }, { "epoch": 1.681788006196061, "grad_norm": 0.8100689053535461, "learning_rate": 1.3406589039462706e-05, "loss": 0.307, "step": 17100 }, { "epoch": 1.6827715079540706, "grad_norm": 1.0803511142730713, "learning_rate": 1.3402614950522595e-05, "loss": 0.2355, "step": 17110 }, { "epoch": 1.6837550097120797, "grad_norm": 1.2122515439987183, "learning_rate": 1.3398640861582483e-05, "loss": 0.2274, "step": 17120 }, { "epoch": 1.6847385114700892, "grad_norm": 0.907745897769928, "learning_rate": 1.3394666772642373e-05, "loss": 0.1913, "step": 17130 }, { "epoch": 1.6857220132280988, "grad_norm": 1.1718151569366455, "learning_rate": 1.3390692683702263e-05, "loss": 0.2933, "step": 17140 }, { "epoch": 1.6867055149861079, "grad_norm": 0.6171371340751648, "learning_rate": 1.3386718594762152e-05, "loss": 0.2956, "step": 17150 }, { "epoch": 1.6876890167441174, "grad_norm": 1.1524289846420288, "learning_rate": 1.338274450582204e-05, "loss": 0.2185, "step": 17160 }, { "epoch": 1.688672518502127, "grad_norm": 0.5245811939239502, "learning_rate": 1.337877041688193e-05, "loss": 0.2736, "step": 17170 }, { "epoch": 1.689656020260136, "grad_norm": 4.1112470626831055, "learning_rate": 1.3374796327941821e-05, "loss": 0.2162, "step": 17180 }, { "epoch": 1.6906395220181456, "grad_norm": 0.631099283695221, "learning_rate": 1.337082223900171e-05, "loss": 0.3262, "step": 17190 }, { "epoch": 1.6916230237761551, "grad_norm": 0.7816455364227295, "learning_rate": 1.33668481500616e-05, "loss": 0.2709, "step": 17200 }, { "epoch": 1.6926065255341642, "grad_norm": 0.8032369017601013, "learning_rate": 1.3362874061121488e-05, "loss": 0.1441, "step": 17210 }, { "epoch": 1.6935900272921738, "grad_norm": 1.126566767692566, "learning_rate": 1.3358899972181378e-05, "loss": 0.1997, "step": 17220 }, { "epoch": 1.6945735290501833, "grad_norm": 0.9024196267127991, "learning_rate": 1.3354925883241268e-05, "loss": 0.2046, "step": 17230 }, { "epoch": 1.6955570308081924, "grad_norm": 1.4062788486480713, "learning_rate": 1.3350951794301157e-05, "loss": 0.3639, "step": 17240 }, { "epoch": 1.696540532566202, "grad_norm": 2.7009520530700684, "learning_rate": 1.3346977705361045e-05, "loss": 0.2141, "step": 17250 }, { "epoch": 1.6975240343242115, "grad_norm": 1.0145210027694702, "learning_rate": 1.3343003616420937e-05, "loss": 0.3401, "step": 17260 }, { "epoch": 1.6985075360822206, "grad_norm": 0.8520883321762085, "learning_rate": 1.3339029527480826e-05, "loss": 0.2577, "step": 17270 }, { "epoch": 1.6994910378402301, "grad_norm": 0.5788146257400513, "learning_rate": 1.3335055438540716e-05, "loss": 0.2418, "step": 17280 }, { "epoch": 1.7004745395982397, "grad_norm": 1.2726303339004517, "learning_rate": 1.3331081349600606e-05, "loss": 0.2434, "step": 17290 }, { "epoch": 1.7014580413562488, "grad_norm": 1.0344926118850708, "learning_rate": 1.3327107260660494e-05, "loss": 0.1686, "step": 17300 }, { "epoch": 1.7024415431142583, "grad_norm": 1.0950030088424683, "learning_rate": 1.3323133171720383e-05, "loss": 0.3009, "step": 17310 }, { "epoch": 1.7034250448722679, "grad_norm": 0.7477503418922424, "learning_rate": 1.3319159082780273e-05, "loss": 0.1959, "step": 17320 }, { "epoch": 1.704408546630277, "grad_norm": 0.9627076983451843, "learning_rate": 1.3315184993840163e-05, "loss": 0.2229, "step": 17330 }, { "epoch": 1.7053920483882865, "grad_norm": 2.0675864219665527, "learning_rate": 1.3311210904900054e-05, "loss": 0.1794, "step": 17340 }, { "epoch": 1.706375550146296, "grad_norm": 0.5318220257759094, "learning_rate": 1.3307236815959944e-05, "loss": 0.1252, "step": 17350 }, { "epoch": 1.7073590519043051, "grad_norm": 1.1632235050201416, "learning_rate": 1.3303262727019832e-05, "loss": 0.2574, "step": 17360 }, { "epoch": 1.7083425536623147, "grad_norm": 0.5292351245880127, "learning_rate": 1.3299288638079721e-05, "loss": 0.2551, "step": 17370 }, { "epoch": 1.709326055420324, "grad_norm": 0.38190001249313354, "learning_rate": 1.3295314549139611e-05, "loss": 0.2473, "step": 17380 }, { "epoch": 1.7103095571783333, "grad_norm": 5.364218711853027, "learning_rate": 1.32913404601995e-05, "loss": 0.3242, "step": 17390 }, { "epoch": 1.7112930589363429, "grad_norm": 1.7767460346221924, "learning_rate": 1.3287366371259389e-05, "loss": 0.2854, "step": 17400 }, { "epoch": 1.7122765606943522, "grad_norm": 0.7090295553207397, "learning_rate": 1.3283392282319278e-05, "loss": 0.1963, "step": 17410 }, { "epoch": 1.7132600624523615, "grad_norm": 0.6008509993553162, "learning_rate": 1.327941819337917e-05, "loss": 0.3349, "step": 17420 }, { "epoch": 1.714243564210371, "grad_norm": 1.0188231468200684, "learning_rate": 1.327544410443906e-05, "loss": 0.2329, "step": 17430 }, { "epoch": 1.7152270659683804, "grad_norm": 2.4888973236083984, "learning_rate": 1.3271470015498949e-05, "loss": 0.2454, "step": 17440 }, { "epoch": 1.7162105677263897, "grad_norm": 1.0194109678268433, "learning_rate": 1.3267495926558837e-05, "loss": 0.1806, "step": 17450 }, { "epoch": 1.7171940694843992, "grad_norm": 1.178078532218933, "learning_rate": 1.3263521837618727e-05, "loss": 0.3933, "step": 17460 }, { "epoch": 1.7181775712424086, "grad_norm": 1.6931121349334717, "learning_rate": 1.3259547748678616e-05, "loss": 0.2191, "step": 17470 }, { "epoch": 1.7191610730004179, "grad_norm": 2.874523639678955, "learning_rate": 1.3255573659738506e-05, "loss": 0.3308, "step": 17480 }, { "epoch": 1.7201445747584274, "grad_norm": 3.4492263793945312, "learning_rate": 1.3251599570798394e-05, "loss": 0.1748, "step": 17490 }, { "epoch": 1.7211280765164367, "grad_norm": 1.1551506519317627, "learning_rate": 1.3247625481858285e-05, "loss": 0.2132, "step": 17500 }, { "epoch": 1.7211280765164367, "eval_loss": 0.15584313869476318, "eval_runtime": 17.0322, "eval_samples_per_second": 2.936, "eval_steps_per_second": 1.468, "step": 17500 }, { "epoch": 1.722111578274446, "grad_norm": 3.204711437225342, "learning_rate": 1.3243651392918175e-05, "loss": 0.2223, "step": 17510 }, { "epoch": 1.7230950800324556, "grad_norm": 1.4167150259017944, "learning_rate": 1.3239677303978065e-05, "loss": 0.2441, "step": 17520 }, { "epoch": 1.724078581790465, "grad_norm": 2.277780294418335, "learning_rate": 1.3235703215037954e-05, "loss": 0.2682, "step": 17530 }, { "epoch": 1.7250620835484742, "grad_norm": 0.6956846714019775, "learning_rate": 1.3231729126097842e-05, "loss": 0.16, "step": 17540 }, { "epoch": 1.7260455853064838, "grad_norm": 0.8625393509864807, "learning_rate": 1.3227755037157732e-05, "loss": 0.2059, "step": 17550 }, { "epoch": 1.727029087064493, "grad_norm": 0.696932852268219, "learning_rate": 1.3223780948217622e-05, "loss": 0.3554, "step": 17560 }, { "epoch": 1.7280125888225024, "grad_norm": 3.09187388420105, "learning_rate": 1.3219806859277511e-05, "loss": 0.197, "step": 17570 }, { "epoch": 1.728996090580512, "grad_norm": 2.1331708431243896, "learning_rate": 1.3215832770337403e-05, "loss": 0.2262, "step": 17580 }, { "epoch": 1.7299795923385213, "grad_norm": 2.1550347805023193, "learning_rate": 1.3211858681397292e-05, "loss": 0.3214, "step": 17590 }, { "epoch": 1.7309630940965306, "grad_norm": 3.0045039653778076, "learning_rate": 1.320788459245718e-05, "loss": 0.1884, "step": 17600 }, { "epoch": 1.7319465958545401, "grad_norm": 1.457900047302246, "learning_rate": 1.320391050351707e-05, "loss": 0.2306, "step": 17610 }, { "epoch": 1.7329300976125495, "grad_norm": 1.152354121208191, "learning_rate": 1.319993641457696e-05, "loss": 0.2643, "step": 17620 }, { "epoch": 1.7339135993705588, "grad_norm": 0.5737890601158142, "learning_rate": 1.319596232563685e-05, "loss": 0.2697, "step": 17630 }, { "epoch": 1.7348971011285683, "grad_norm": 0.5850633978843689, "learning_rate": 1.3191988236696737e-05, "loss": 0.2399, "step": 17640 }, { "epoch": 1.7358806028865776, "grad_norm": 2.2772579193115234, "learning_rate": 1.3188014147756627e-05, "loss": 0.1175, "step": 17650 }, { "epoch": 1.736864104644587, "grad_norm": 2.7282495498657227, "learning_rate": 1.3184040058816518e-05, "loss": 0.2286, "step": 17660 }, { "epoch": 1.7378476064025965, "grad_norm": 1.0198181867599487, "learning_rate": 1.3180065969876408e-05, "loss": 0.2325, "step": 17670 }, { "epoch": 1.7388311081606058, "grad_norm": 1.8783186674118042, "learning_rate": 1.3176091880936298e-05, "loss": 0.1183, "step": 17680 }, { "epoch": 1.7398146099186151, "grad_norm": 2.2488327026367188, "learning_rate": 1.3172117791996186e-05, "loss": 0.2618, "step": 17690 }, { "epoch": 1.7407981116766247, "grad_norm": 5.103484630584717, "learning_rate": 1.3168143703056075e-05, "loss": 0.2685, "step": 17700 }, { "epoch": 1.741781613434634, "grad_norm": 0.5979586839675903, "learning_rate": 1.3164169614115965e-05, "loss": 0.2041, "step": 17710 }, { "epoch": 1.7427651151926433, "grad_norm": 3.733778238296509, "learning_rate": 1.3160195525175855e-05, "loss": 0.1391, "step": 17720 }, { "epoch": 1.7437486169506529, "grad_norm": 1.909714698791504, "learning_rate": 1.3156221436235743e-05, "loss": 0.235, "step": 17730 }, { "epoch": 1.7447321187086622, "grad_norm": 1.2753373384475708, "learning_rate": 1.3152247347295632e-05, "loss": 0.2916, "step": 17740 }, { "epoch": 1.7457156204666715, "grad_norm": 1.276990294456482, "learning_rate": 1.3148273258355524e-05, "loss": 0.2657, "step": 17750 }, { "epoch": 1.746699122224681, "grad_norm": 2.39896297454834, "learning_rate": 1.3144299169415413e-05, "loss": 0.2799, "step": 17760 }, { "epoch": 1.7476826239826904, "grad_norm": 0.5444474220275879, "learning_rate": 1.3140325080475303e-05, "loss": 0.1874, "step": 17770 }, { "epoch": 1.7486661257406997, "grad_norm": 1.2919403314590454, "learning_rate": 1.3136350991535191e-05, "loss": 0.2485, "step": 17780 }, { "epoch": 1.7496496274987092, "grad_norm": 0.9817249774932861, "learning_rate": 1.313237690259508e-05, "loss": 0.1711, "step": 17790 }, { "epoch": 1.7506331292567185, "grad_norm": 0.4421529769897461, "learning_rate": 1.312840281365497e-05, "loss": 0.1969, "step": 17800 }, { "epoch": 1.7516166310147279, "grad_norm": 1.9339884519577026, "learning_rate": 1.312442872471486e-05, "loss": 0.3163, "step": 17810 }, { "epoch": 1.7526001327727374, "grad_norm": 0.7091568112373352, "learning_rate": 1.3120454635774748e-05, "loss": 0.2134, "step": 17820 }, { "epoch": 1.7535836345307467, "grad_norm": 1.4017484188079834, "learning_rate": 1.3116480546834641e-05, "loss": 0.206, "step": 17830 }, { "epoch": 1.754567136288756, "grad_norm": 0.38947057723999023, "learning_rate": 1.3112506457894529e-05, "loss": 0.3262, "step": 17840 }, { "epoch": 1.7555506380467656, "grad_norm": 1.5356085300445557, "learning_rate": 1.3108532368954419e-05, "loss": 0.3551, "step": 17850 }, { "epoch": 1.756534139804775, "grad_norm": 2.8606903553009033, "learning_rate": 1.3104558280014308e-05, "loss": 0.2159, "step": 17860 }, { "epoch": 1.7575176415627842, "grad_norm": 2.0851094722747803, "learning_rate": 1.3100584191074198e-05, "loss": 0.1928, "step": 17870 }, { "epoch": 1.7585011433207938, "grad_norm": 0.6435067057609558, "learning_rate": 1.3096610102134086e-05, "loss": 0.2425, "step": 17880 }, { "epoch": 1.759484645078803, "grad_norm": 0.6339534521102905, "learning_rate": 1.3092636013193976e-05, "loss": 0.1548, "step": 17890 }, { "epoch": 1.7604681468368124, "grad_norm": 2.1207547187805176, "learning_rate": 1.3088661924253865e-05, "loss": 0.2594, "step": 17900 }, { "epoch": 1.761451648594822, "grad_norm": 0.6143303513526917, "learning_rate": 1.3084687835313757e-05, "loss": 0.2998, "step": 17910 }, { "epoch": 1.7624351503528313, "grad_norm": 0.7414078712463379, "learning_rate": 1.3080713746373646e-05, "loss": 0.1891, "step": 17920 }, { "epoch": 1.7634186521108406, "grad_norm": 1.9617842435836792, "learning_rate": 1.3076739657433534e-05, "loss": 0.297, "step": 17930 }, { "epoch": 1.7644021538688501, "grad_norm": 2.322709321975708, "learning_rate": 1.3072765568493424e-05, "loss": 0.2052, "step": 17940 }, { "epoch": 1.7653856556268595, "grad_norm": 1.813644528388977, "learning_rate": 1.3068791479553314e-05, "loss": 0.2478, "step": 17950 }, { "epoch": 1.7663691573848688, "grad_norm": 1.4270058870315552, "learning_rate": 1.3064817390613203e-05, "loss": 0.124, "step": 17960 }, { "epoch": 1.7673526591428783, "grad_norm": 2.472296953201294, "learning_rate": 1.3060843301673091e-05, "loss": 0.2331, "step": 17970 }, { "epoch": 1.7683361609008876, "grad_norm": 2.430737257003784, "learning_rate": 1.305686921273298e-05, "loss": 0.2501, "step": 17980 }, { "epoch": 1.769319662658897, "grad_norm": 0.6140730977058411, "learning_rate": 1.3052895123792872e-05, "loss": 0.1823, "step": 17990 }, { "epoch": 1.7703031644169065, "grad_norm": 4.935455799102783, "learning_rate": 1.3048921034852762e-05, "loss": 0.2335, "step": 18000 }, { "epoch": 1.7703031644169065, "eval_loss": 0.16023403406143188, "eval_runtime": 18.1415, "eval_samples_per_second": 2.756, "eval_steps_per_second": 1.378, "step": 18000 }, { "epoch": 1.7712866661749158, "grad_norm": 3.046405076980591, "learning_rate": 1.3044946945912651e-05, "loss": 0.2097, "step": 18010 }, { "epoch": 1.7722701679329251, "grad_norm": 0.8612200021743774, "learning_rate": 1.304097285697254e-05, "loss": 0.2561, "step": 18020 }, { "epoch": 1.7732536696909347, "grad_norm": 1.075387954711914, "learning_rate": 1.3036998768032429e-05, "loss": 0.2052, "step": 18030 }, { "epoch": 1.774237171448944, "grad_norm": 0.7561732530593872, "learning_rate": 1.3033024679092319e-05, "loss": 0.2157, "step": 18040 }, { "epoch": 1.7752206732069533, "grad_norm": 0.7665427923202515, "learning_rate": 1.3029050590152208e-05, "loss": 0.1883, "step": 18050 }, { "epoch": 1.7762041749649629, "grad_norm": 1.8395278453826904, "learning_rate": 1.3025076501212096e-05, "loss": 0.2507, "step": 18060 }, { "epoch": 1.7771876767229722, "grad_norm": 1.2933135032653809, "learning_rate": 1.302110241227199e-05, "loss": 0.307, "step": 18070 }, { "epoch": 1.7781711784809815, "grad_norm": 0.7886130213737488, "learning_rate": 1.3017128323331877e-05, "loss": 0.2525, "step": 18080 }, { "epoch": 1.779154680238991, "grad_norm": 1.028622031211853, "learning_rate": 1.3013154234391767e-05, "loss": 0.2672, "step": 18090 }, { "epoch": 1.7801381819970004, "grad_norm": 0.8745284080505371, "learning_rate": 1.3009180145451657e-05, "loss": 0.1684, "step": 18100 }, { "epoch": 1.7811216837550097, "grad_norm": 0.5309194922447205, "learning_rate": 1.3005206056511546e-05, "loss": 0.2063, "step": 18110 }, { "epoch": 1.7821051855130192, "grad_norm": 1.1381081342697144, "learning_rate": 1.3001231967571434e-05, "loss": 0.3476, "step": 18120 }, { "epoch": 1.7830886872710285, "grad_norm": 0.6130464673042297, "learning_rate": 1.2997257878631324e-05, "loss": 0.2196, "step": 18130 }, { "epoch": 1.7840721890290379, "grad_norm": 0.6083819270133972, "learning_rate": 1.2993283789691214e-05, "loss": 0.2292, "step": 18140 }, { "epoch": 1.7850556907870474, "grad_norm": 0.9610211253166199, "learning_rate": 1.2989309700751105e-05, "loss": 0.3196, "step": 18150 }, { "epoch": 1.7860391925450567, "grad_norm": 4.28626823425293, "learning_rate": 1.2985335611810995e-05, "loss": 0.1689, "step": 18160 }, { "epoch": 1.787022694303066, "grad_norm": 2.249110698699951, "learning_rate": 1.2981361522870883e-05, "loss": 0.2129, "step": 18170 }, { "epoch": 1.7880061960610756, "grad_norm": 0.44461897015571594, "learning_rate": 1.2977387433930772e-05, "loss": 0.2095, "step": 18180 }, { "epoch": 1.788989697819085, "grad_norm": 0.26538947224617004, "learning_rate": 1.2973413344990662e-05, "loss": 0.2116, "step": 18190 }, { "epoch": 1.7899731995770942, "grad_norm": 2.8765032291412354, "learning_rate": 1.2969439256050552e-05, "loss": 0.2552, "step": 18200 }, { "epoch": 1.7909567013351038, "grad_norm": 1.751949667930603, "learning_rate": 1.296546516711044e-05, "loss": 0.2782, "step": 18210 }, { "epoch": 1.791940203093113, "grad_norm": 1.121596336364746, "learning_rate": 1.296149107817033e-05, "loss": 0.1649, "step": 18220 }, { "epoch": 1.7929237048511224, "grad_norm": 1.0512248277664185, "learning_rate": 1.295751698923022e-05, "loss": 0.3041, "step": 18230 }, { "epoch": 1.793907206609132, "grad_norm": 2.1734120845794678, "learning_rate": 1.295354290029011e-05, "loss": 0.1779, "step": 18240 }, { "epoch": 1.794890708367141, "grad_norm": 1.8944523334503174, "learning_rate": 1.294956881135e-05, "loss": 0.2143, "step": 18250 }, { "epoch": 1.7958742101251506, "grad_norm": 4.825395584106445, "learning_rate": 1.2945594722409888e-05, "loss": 0.2896, "step": 18260 }, { "epoch": 1.7968577118831601, "grad_norm": 0.7082745432853699, "learning_rate": 1.2941620633469778e-05, "loss": 0.234, "step": 18270 }, { "epoch": 1.7978412136411692, "grad_norm": 0.8548723459243774, "learning_rate": 1.2937646544529667e-05, "loss": 0.1127, "step": 18280 }, { "epoch": 1.7988247153991788, "grad_norm": 3.628645181655884, "learning_rate": 1.2933672455589557e-05, "loss": 0.2571, "step": 18290 }, { "epoch": 1.7998082171571883, "grad_norm": 0.721328616142273, "learning_rate": 1.2929698366649445e-05, "loss": 0.3004, "step": 18300 }, { "epoch": 1.8007917189151974, "grad_norm": 1.983008623123169, "learning_rate": 1.2925724277709338e-05, "loss": 0.1744, "step": 18310 }, { "epoch": 1.801775220673207, "grad_norm": 1.1194294691085815, "learning_rate": 1.2921750188769226e-05, "loss": 0.242, "step": 18320 }, { "epoch": 1.8027587224312165, "grad_norm": 0.8498597741127014, "learning_rate": 1.2917776099829116e-05, "loss": 0.1855, "step": 18330 }, { "epoch": 1.8037422241892256, "grad_norm": 1.695459246635437, "learning_rate": 1.2913802010889005e-05, "loss": 0.2498, "step": 18340 }, { "epoch": 1.8047257259472351, "grad_norm": 1.5076769590377808, "learning_rate": 1.2909827921948895e-05, "loss": 0.1511, "step": 18350 }, { "epoch": 1.8057092277052447, "grad_norm": 1.4327051639556885, "learning_rate": 1.2905853833008783e-05, "loss": 0.1891, "step": 18360 }, { "epoch": 1.8066927294632538, "grad_norm": 1.8368661403656006, "learning_rate": 1.2901879744068673e-05, "loss": 0.2104, "step": 18370 }, { "epoch": 1.8076762312212633, "grad_norm": 0.7648179531097412, "learning_rate": 1.2897905655128562e-05, "loss": 0.2628, "step": 18380 }, { "epoch": 1.8086597329792728, "grad_norm": 1.534253478050232, "learning_rate": 1.2893931566188454e-05, "loss": 0.1789, "step": 18390 }, { "epoch": 1.809643234737282, "grad_norm": 1.4189066886901855, "learning_rate": 1.2889957477248343e-05, "loss": 0.2721, "step": 18400 }, { "epoch": 1.8106267364952915, "grad_norm": 2.705341100692749, "learning_rate": 1.2885983388308231e-05, "loss": 0.2601, "step": 18410 }, { "epoch": 1.811610238253301, "grad_norm": 0.8574623465538025, "learning_rate": 1.2882009299368121e-05, "loss": 0.2281, "step": 18420 }, { "epoch": 1.8125937400113101, "grad_norm": 0.766758143901825, "learning_rate": 1.287803521042801e-05, "loss": 0.2576, "step": 18430 }, { "epoch": 1.8135772417693197, "grad_norm": 1.3519665002822876, "learning_rate": 1.28740611214879e-05, "loss": 0.153, "step": 18440 }, { "epoch": 1.8145607435273292, "grad_norm": 0.4131558835506439, "learning_rate": 1.2870087032547788e-05, "loss": 0.2223, "step": 18450 }, { "epoch": 1.8155442452853383, "grad_norm": 2.183131456375122, "learning_rate": 1.2866112943607678e-05, "loss": 0.1797, "step": 18460 }, { "epoch": 1.8165277470433479, "grad_norm": 1.3639419078826904, "learning_rate": 1.286213885466757e-05, "loss": 0.2461, "step": 18470 }, { "epoch": 1.8175112488013574, "grad_norm": 1.5011903047561646, "learning_rate": 1.2858164765727459e-05, "loss": 0.1996, "step": 18480 }, { "epoch": 1.8184947505593665, "grad_norm": 1.1669235229492188, "learning_rate": 1.2854190676787349e-05, "loss": 0.3128, "step": 18490 }, { "epoch": 1.819478252317376, "grad_norm": 1.762707233428955, "learning_rate": 1.2850216587847237e-05, "loss": 0.1563, "step": 18500 }, { "epoch": 1.819478252317376, "eval_loss": 0.12783123552799225, "eval_runtime": 16.797, "eval_samples_per_second": 2.977, "eval_steps_per_second": 1.488, "step": 18500 }, { "epoch": 1.8204617540753856, "grad_norm": 3.2158849239349365, "learning_rate": 1.2846242498907126e-05, "loss": 0.2463, "step": 18510 }, { "epoch": 1.8214452558333947, "grad_norm": 0.165601447224617, "learning_rate": 1.2842268409967016e-05, "loss": 0.1599, "step": 18520 }, { "epoch": 1.8224287575914042, "grad_norm": 1.2064436674118042, "learning_rate": 1.2838294321026906e-05, "loss": 0.251, "step": 18530 }, { "epoch": 1.8234122593494135, "grad_norm": 1.1310498714447021, "learning_rate": 1.2834320232086794e-05, "loss": 0.2921, "step": 18540 }, { "epoch": 1.8243957611074229, "grad_norm": 1.2284313440322876, "learning_rate": 1.2830346143146683e-05, "loss": 0.2508, "step": 18550 }, { "epoch": 1.8253792628654324, "grad_norm": 0.17183829843997955, "learning_rate": 1.2826372054206575e-05, "loss": 0.21, "step": 18560 }, { "epoch": 1.8263627646234417, "grad_norm": 1.699674129486084, "learning_rate": 1.2822397965266464e-05, "loss": 0.2536, "step": 18570 }, { "epoch": 1.827346266381451, "grad_norm": 1.4755967855453491, "learning_rate": 1.2818423876326354e-05, "loss": 0.2296, "step": 18580 }, { "epoch": 1.8283297681394606, "grad_norm": 0.8879458904266357, "learning_rate": 1.2814449787386244e-05, "loss": 0.2436, "step": 18590 }, { "epoch": 1.82931326989747, "grad_norm": 0.39650049805641174, "learning_rate": 1.2810475698446132e-05, "loss": 0.19, "step": 18600 }, { "epoch": 1.8302967716554792, "grad_norm": 3.2891576290130615, "learning_rate": 1.2806501609506021e-05, "loss": 0.2146, "step": 18610 }, { "epoch": 1.8312802734134888, "grad_norm": 0.9302894473075867, "learning_rate": 1.2802527520565911e-05, "loss": 0.2911, "step": 18620 }, { "epoch": 1.832263775171498, "grad_norm": 0.9274962544441223, "learning_rate": 1.27985534316258e-05, "loss": 0.1712, "step": 18630 }, { "epoch": 1.8332472769295074, "grad_norm": 0.8566771149635315, "learning_rate": 1.2794579342685692e-05, "loss": 0.1546, "step": 18640 }, { "epoch": 1.834230778687517, "grad_norm": 2.141951322555542, "learning_rate": 1.279060525374558e-05, "loss": 0.3158, "step": 18650 }, { "epoch": 1.8352142804455263, "grad_norm": 0.6342948079109192, "learning_rate": 1.278663116480547e-05, "loss": 0.3165, "step": 18660 }, { "epoch": 1.8361977822035356, "grad_norm": 0.9059627056121826, "learning_rate": 1.278265707586536e-05, "loss": 0.2401, "step": 18670 }, { "epoch": 1.8371812839615451, "grad_norm": 2.2364370822906494, "learning_rate": 1.2778682986925249e-05, "loss": 0.1763, "step": 18680 }, { "epoch": 1.8381647857195544, "grad_norm": 1.2001042366027832, "learning_rate": 1.2774708897985137e-05, "loss": 0.1694, "step": 18690 }, { "epoch": 1.8391482874775638, "grad_norm": 1.6993484497070312, "learning_rate": 1.2770734809045027e-05, "loss": 0.2158, "step": 18700 }, { "epoch": 1.8401317892355733, "grad_norm": 1.790419578552246, "learning_rate": 1.2766760720104916e-05, "loss": 0.1804, "step": 18710 }, { "epoch": 1.8411152909935826, "grad_norm": 0.9894189834594727, "learning_rate": 1.2762786631164808e-05, "loss": 0.203, "step": 18720 }, { "epoch": 1.842098792751592, "grad_norm": 0.6162654161453247, "learning_rate": 1.2758812542224697e-05, "loss": 0.2229, "step": 18730 }, { "epoch": 1.8430822945096015, "grad_norm": 0.6175249814987183, "learning_rate": 1.2754838453284585e-05, "loss": 0.2179, "step": 18740 }, { "epoch": 1.8440657962676108, "grad_norm": 0.8340634703636169, "learning_rate": 1.2750864364344475e-05, "loss": 0.2586, "step": 18750 }, { "epoch": 1.8450492980256201, "grad_norm": 1.8784235715866089, "learning_rate": 1.2746890275404364e-05, "loss": 0.1957, "step": 18760 }, { "epoch": 1.8460327997836297, "grad_norm": 1.9351670742034912, "learning_rate": 1.2742916186464254e-05, "loss": 0.2829, "step": 18770 }, { "epoch": 1.847016301541639, "grad_norm": 0.6201217770576477, "learning_rate": 1.2738942097524142e-05, "loss": 0.2467, "step": 18780 }, { "epoch": 1.8479998032996483, "grad_norm": 0.5133494734764099, "learning_rate": 1.2734968008584032e-05, "loss": 0.2822, "step": 18790 }, { "epoch": 1.8489833050576578, "grad_norm": 1.161572813987732, "learning_rate": 1.2730993919643923e-05, "loss": 0.2605, "step": 18800 }, { "epoch": 1.8499668068156672, "grad_norm": 2.1235814094543457, "learning_rate": 1.2727019830703813e-05, "loss": 0.2341, "step": 18810 }, { "epoch": 1.8509503085736765, "grad_norm": 3.7511794567108154, "learning_rate": 1.2723045741763702e-05, "loss": 0.2416, "step": 18820 }, { "epoch": 1.851933810331686, "grad_norm": 1.0420117378234863, "learning_rate": 1.2719071652823592e-05, "loss": 0.2211, "step": 18830 }, { "epoch": 1.8529173120896953, "grad_norm": 1.2071075439453125, "learning_rate": 1.271509756388348e-05, "loss": 0.2008, "step": 18840 }, { "epoch": 1.8539008138477047, "grad_norm": 1.5324718952178955, "learning_rate": 1.271112347494337e-05, "loss": 0.2727, "step": 18850 }, { "epoch": 1.8548843156057142, "grad_norm": 2.7754740715026855, "learning_rate": 1.270714938600326e-05, "loss": 0.3221, "step": 18860 }, { "epoch": 1.8558678173637235, "grad_norm": 0.812025249004364, "learning_rate": 1.2703175297063149e-05, "loss": 0.2665, "step": 18870 }, { "epoch": 1.8568513191217328, "grad_norm": 0.7611910104751587, "learning_rate": 1.269920120812304e-05, "loss": 0.2205, "step": 18880 }, { "epoch": 1.8578348208797424, "grad_norm": 1.793511986732483, "learning_rate": 1.2695227119182928e-05, "loss": 0.1648, "step": 18890 }, { "epoch": 1.8588183226377517, "grad_norm": 1.9614449739456177, "learning_rate": 1.2691253030242818e-05, "loss": 0.2559, "step": 18900 }, { "epoch": 1.859801824395761, "grad_norm": 2.4327352046966553, "learning_rate": 1.2687278941302708e-05, "loss": 0.2072, "step": 18910 }, { "epoch": 1.8607853261537706, "grad_norm": 0.6133950352668762, "learning_rate": 1.2683304852362597e-05, "loss": 0.1654, "step": 18920 }, { "epoch": 1.8617688279117799, "grad_norm": 1.5375863313674927, "learning_rate": 1.2679330763422485e-05, "loss": 0.1405, "step": 18930 }, { "epoch": 1.8627523296697892, "grad_norm": 1.1150537729263306, "learning_rate": 1.2675356674482375e-05, "loss": 0.2805, "step": 18940 }, { "epoch": 1.8637358314277988, "grad_norm": 0.5071201920509338, "learning_rate": 1.2671382585542265e-05, "loss": 0.1458, "step": 18950 }, { "epoch": 1.864719333185808, "grad_norm": 0.7502576112747192, "learning_rate": 1.2667408496602156e-05, "loss": 0.1965, "step": 18960 }, { "epoch": 1.8657028349438174, "grad_norm": 0.3296958804130554, "learning_rate": 1.2663434407662046e-05, "loss": 0.2225, "step": 18970 }, { "epoch": 1.866686336701827, "grad_norm": 1.3702101707458496, "learning_rate": 1.2659460318721934e-05, "loss": 0.1995, "step": 18980 }, { "epoch": 1.8676698384598363, "grad_norm": 3.903212070465088, "learning_rate": 1.2655486229781823e-05, "loss": 0.1307, "step": 18990 }, { "epoch": 1.8686533402178456, "grad_norm": 3.3656933307647705, "learning_rate": 1.2651512140841713e-05, "loss": 0.2051, "step": 19000 }, { "epoch": 1.8686533402178456, "eval_loss": 0.1276712268590927, "eval_runtime": 17.9095, "eval_samples_per_second": 2.792, "eval_steps_per_second": 1.396, "step": 19000 }, { "epoch": 1.8696368419758551, "grad_norm": 5.251664161682129, "learning_rate": 1.2647538051901603e-05, "loss": 0.169, "step": 19010 }, { "epoch": 1.8706203437338644, "grad_norm": 1.5694376230239868, "learning_rate": 1.264356396296149e-05, "loss": 0.2338, "step": 19020 }, { "epoch": 1.8716038454918738, "grad_norm": 1.6634079217910767, "learning_rate": 1.263958987402138e-05, "loss": 0.205, "step": 19030 }, { "epoch": 1.8725873472498833, "grad_norm": 0.9844359755516052, "learning_rate": 1.2635615785081272e-05, "loss": 0.1634, "step": 19040 }, { "epoch": 1.8735708490078926, "grad_norm": 3.8357009887695312, "learning_rate": 1.2631641696141161e-05, "loss": 0.3183, "step": 19050 }, { "epoch": 1.874554350765902, "grad_norm": 4.7126946449279785, "learning_rate": 1.2627667607201051e-05, "loss": 0.2224, "step": 19060 }, { "epoch": 1.8755378525239115, "grad_norm": 1.908003568649292, "learning_rate": 1.262369351826094e-05, "loss": 0.1636, "step": 19070 }, { "epoch": 1.8765213542819208, "grad_norm": 0.8002133965492249, "learning_rate": 1.2619719429320829e-05, "loss": 0.1864, "step": 19080 }, { "epoch": 1.8775048560399301, "grad_norm": 1.1409190893173218, "learning_rate": 1.2615745340380718e-05, "loss": 0.2992, "step": 19090 }, { "epoch": 1.8784883577979397, "grad_norm": 0.7023701071739197, "learning_rate": 1.2611771251440608e-05, "loss": 0.2487, "step": 19100 }, { "epoch": 1.879471859555949, "grad_norm": 0.6862336993217468, "learning_rate": 1.2607797162500498e-05, "loss": 0.2431, "step": 19110 }, { "epoch": 1.8804553613139583, "grad_norm": 0.65170818567276, "learning_rate": 1.2603823073560389e-05, "loss": 0.2289, "step": 19120 }, { "epoch": 1.8814388630719678, "grad_norm": 0.6964781880378723, "learning_rate": 1.2599848984620277e-05, "loss": 0.2642, "step": 19130 }, { "epoch": 1.8824223648299772, "grad_norm": 0.9222815632820129, "learning_rate": 1.2595874895680167e-05, "loss": 0.2591, "step": 19140 }, { "epoch": 1.8834058665879865, "grad_norm": 1.1235778331756592, "learning_rate": 1.2591900806740056e-05, "loss": 0.2364, "step": 19150 }, { "epoch": 1.884389368345996, "grad_norm": 1.0253043174743652, "learning_rate": 1.2587926717799946e-05, "loss": 0.1966, "step": 19160 }, { "epoch": 1.8853728701040053, "grad_norm": 2.779137134552002, "learning_rate": 1.2583952628859834e-05, "loss": 0.1278, "step": 19170 }, { "epoch": 1.8863563718620147, "grad_norm": 1.0971506834030151, "learning_rate": 1.2579978539919724e-05, "loss": 0.1466, "step": 19180 }, { "epoch": 1.8873398736200242, "grad_norm": 1.736945390701294, "learning_rate": 1.2576004450979613e-05, "loss": 0.1307, "step": 19190 }, { "epoch": 1.8883233753780335, "grad_norm": 2.4305453300476074, "learning_rate": 1.2572030362039505e-05, "loss": 0.2525, "step": 19200 }, { "epoch": 1.8893068771360428, "grad_norm": 0.7563501596450806, "learning_rate": 1.2568056273099394e-05, "loss": 0.2186, "step": 19210 }, { "epoch": 1.8902903788940524, "grad_norm": 2.958002805709839, "learning_rate": 1.2564082184159282e-05, "loss": 0.2845, "step": 19220 }, { "epoch": 1.8912738806520617, "grad_norm": 2.1089866161346436, "learning_rate": 1.2560108095219172e-05, "loss": 0.2058, "step": 19230 }, { "epoch": 1.892257382410071, "grad_norm": 1.3916528224945068, "learning_rate": 1.2556134006279062e-05, "loss": 0.2509, "step": 19240 }, { "epoch": 1.8932408841680806, "grad_norm": 0.8218703866004944, "learning_rate": 1.2552159917338951e-05, "loss": 0.2344, "step": 19250 }, { "epoch": 1.8942243859260899, "grad_norm": 1.5939429998397827, "learning_rate": 1.254818582839884e-05, "loss": 0.1323, "step": 19260 }, { "epoch": 1.8952078876840992, "grad_norm": 1.384416103363037, "learning_rate": 1.2544211739458729e-05, "loss": 0.3115, "step": 19270 }, { "epoch": 1.8961913894421087, "grad_norm": 1.6949342489242554, "learning_rate": 1.254023765051862e-05, "loss": 0.1937, "step": 19280 }, { "epoch": 1.897174891200118, "grad_norm": 0.34746313095092773, "learning_rate": 1.253626356157851e-05, "loss": 0.2859, "step": 19290 }, { "epoch": 1.8981583929581274, "grad_norm": 0.28613096475601196, "learning_rate": 1.25322894726384e-05, "loss": 0.3169, "step": 19300 }, { "epoch": 1.899141894716137, "grad_norm": 1.5283467769622803, "learning_rate": 1.252831538369829e-05, "loss": 0.1637, "step": 19310 }, { "epoch": 1.9001253964741462, "grad_norm": 3.342489719390869, "learning_rate": 1.2524341294758177e-05, "loss": 0.2558, "step": 19320 }, { "epoch": 1.9011088982321556, "grad_norm": 0.7206294536590576, "learning_rate": 1.2520367205818067e-05, "loss": 0.3589, "step": 19330 }, { "epoch": 1.902092399990165, "grad_norm": 0.2409156858921051, "learning_rate": 1.2516393116877957e-05, "loss": 0.1457, "step": 19340 }, { "epoch": 1.9030759017481744, "grad_norm": 1.1184440851211548, "learning_rate": 1.2512419027937846e-05, "loss": 0.226, "step": 19350 }, { "epoch": 1.9040594035061837, "grad_norm": 1.8674637079238892, "learning_rate": 1.2508444938997734e-05, "loss": 0.2031, "step": 19360 }, { "epoch": 1.9050429052641933, "grad_norm": 0.9980869293212891, "learning_rate": 1.2504470850057626e-05, "loss": 0.2601, "step": 19370 }, { "epoch": 1.9060264070222026, "grad_norm": 1.2215583324432373, "learning_rate": 1.2500496761117515e-05, "loss": 0.1995, "step": 19380 }, { "epoch": 1.907009908780212, "grad_norm": 0.635554850101471, "learning_rate": 1.2496522672177405e-05, "loss": 0.2687, "step": 19390 }, { "epoch": 1.9079934105382215, "grad_norm": 2.1318042278289795, "learning_rate": 1.2492548583237295e-05, "loss": 0.2183, "step": 19400 }, { "epoch": 1.9089769122962306, "grad_norm": 0.2807762622833252, "learning_rate": 1.2488574494297183e-05, "loss": 0.2251, "step": 19410 }, { "epoch": 1.90996041405424, "grad_norm": 2.432757616043091, "learning_rate": 1.2484600405357072e-05, "loss": 0.1734, "step": 19420 }, { "epoch": 1.9109439158122496, "grad_norm": 1.1255124807357788, "learning_rate": 1.2480626316416962e-05, "loss": 0.2215, "step": 19430 }, { "epoch": 1.9119274175702587, "grad_norm": 1.1864866018295288, "learning_rate": 1.2476652227476852e-05, "loss": 0.2485, "step": 19440 }, { "epoch": 1.9129109193282683, "grad_norm": 0.8781852126121521, "learning_rate": 1.2472678138536743e-05, "loss": 0.2639, "step": 19450 }, { "epoch": 1.9138944210862778, "grad_norm": 0.8348128199577332, "learning_rate": 1.2468704049596631e-05, "loss": 0.2793, "step": 19460 }, { "epoch": 1.914877922844287, "grad_norm": 0.407959520816803, "learning_rate": 1.246472996065652e-05, "loss": 0.2604, "step": 19470 }, { "epoch": 1.9158614246022965, "grad_norm": 2.0708372592926025, "learning_rate": 1.246075587171641e-05, "loss": 0.3622, "step": 19480 }, { "epoch": 1.916844926360306, "grad_norm": 1.2978930473327637, "learning_rate": 1.24567817827763e-05, "loss": 0.3135, "step": 19490 }, { "epoch": 1.917828428118315, "grad_norm": 2.307457208633423, "learning_rate": 1.2452807693836188e-05, "loss": 0.2206, "step": 19500 }, { "epoch": 1.917828428118315, "eval_loss": 0.13785213232040405, "eval_runtime": 18.2171, "eval_samples_per_second": 2.745, "eval_steps_per_second": 1.372, "step": 19500 }, { "epoch": 1.9188119298763247, "grad_norm": 0.5169830322265625, "learning_rate": 1.2448833604896077e-05, "loss": 0.273, "step": 19510 }, { "epoch": 1.9197954316343342, "grad_norm": 0.7869198322296143, "learning_rate": 1.2444859515955967e-05, "loss": 0.1729, "step": 19520 }, { "epoch": 1.9207789333923433, "grad_norm": 1.5760027170181274, "learning_rate": 1.2440885427015859e-05, "loss": 0.1508, "step": 19530 }, { "epoch": 1.9217624351503528, "grad_norm": 2.80485200881958, "learning_rate": 1.2436911338075748e-05, "loss": 0.3853, "step": 19540 }, { "epoch": 1.9227459369083624, "grad_norm": 1.0592259168624878, "learning_rate": 1.2432937249135638e-05, "loss": 0.2399, "step": 19550 }, { "epoch": 1.9237294386663715, "grad_norm": 0.8268088698387146, "learning_rate": 1.2428963160195526e-05, "loss": 0.2155, "step": 19560 }, { "epoch": 1.924712940424381, "grad_norm": 1.1707844734191895, "learning_rate": 1.2424989071255415e-05, "loss": 0.2499, "step": 19570 }, { "epoch": 1.9256964421823906, "grad_norm": 1.9008899927139282, "learning_rate": 1.2421014982315305e-05, "loss": 0.2489, "step": 19580 }, { "epoch": 1.9266799439403997, "grad_norm": 0.991098940372467, "learning_rate": 1.2417040893375195e-05, "loss": 0.2994, "step": 19590 }, { "epoch": 1.9276634456984092, "grad_norm": 1.5907323360443115, "learning_rate": 1.2413066804435083e-05, "loss": 0.2677, "step": 19600 }, { "epoch": 1.9286469474564187, "grad_norm": 0.3907619118690491, "learning_rate": 1.2409092715494974e-05, "loss": 0.259, "step": 19610 }, { "epoch": 1.9296304492144278, "grad_norm": 0.6614681482315063, "learning_rate": 1.2405118626554864e-05, "loss": 0.2999, "step": 19620 }, { "epoch": 1.9306139509724374, "grad_norm": 1.7540950775146484, "learning_rate": 1.2401144537614753e-05, "loss": 0.3011, "step": 19630 }, { "epoch": 1.931597452730447, "grad_norm": 2.1542980670928955, "learning_rate": 1.2397170448674643e-05, "loss": 0.1974, "step": 19640 }, { "epoch": 1.932580954488456, "grad_norm": 0.1936643123626709, "learning_rate": 1.2393196359734531e-05, "loss": 0.1426, "step": 19650 }, { "epoch": 1.9335644562464656, "grad_norm": 1.4605008363723755, "learning_rate": 1.238922227079442e-05, "loss": 0.2313, "step": 19660 }, { "epoch": 1.934547958004475, "grad_norm": 0.7753693461418152, "learning_rate": 1.238524818185431e-05, "loss": 0.302, "step": 19670 }, { "epoch": 1.9355314597624842, "grad_norm": 1.482616662979126, "learning_rate": 1.23812740929142e-05, "loss": 0.3001, "step": 19680 }, { "epoch": 1.9365149615204937, "grad_norm": 0.818903386592865, "learning_rate": 1.2377300003974091e-05, "loss": 0.2544, "step": 19690 }, { "epoch": 1.937498463278503, "grad_norm": 0.44427138566970825, "learning_rate": 1.237332591503398e-05, "loss": 0.2739, "step": 19700 }, { "epoch": 1.9384819650365124, "grad_norm": 1.0438803434371948, "learning_rate": 1.2369351826093869e-05, "loss": 0.23, "step": 19710 }, { "epoch": 1.939465466794522, "grad_norm": 2.2877156734466553, "learning_rate": 1.2365377737153759e-05, "loss": 0.2936, "step": 19720 }, { "epoch": 1.9404489685525312, "grad_norm": 0.29466554522514343, "learning_rate": 1.2361403648213648e-05, "loss": 0.2685, "step": 19730 }, { "epoch": 1.9414324703105406, "grad_norm": 1.4913973808288574, "learning_rate": 1.2357429559273536e-05, "loss": 0.2509, "step": 19740 }, { "epoch": 1.94241597206855, "grad_norm": 1.4097399711608887, "learning_rate": 1.2353455470333426e-05, "loss": 0.1971, "step": 19750 }, { "epoch": 1.9433994738265594, "grad_norm": 1.3867621421813965, "learning_rate": 1.2349481381393316e-05, "loss": 0.2442, "step": 19760 }, { "epoch": 1.9443829755845687, "grad_norm": 1.646846890449524, "learning_rate": 1.2345507292453207e-05, "loss": 0.1876, "step": 19770 }, { "epoch": 1.9453664773425783, "grad_norm": 1.509426474571228, "learning_rate": 1.2341533203513097e-05, "loss": 0.32, "step": 19780 }, { "epoch": 1.9463499791005876, "grad_norm": 2.4654855728149414, "learning_rate": 1.2337559114572986e-05, "loss": 0.2244, "step": 19790 }, { "epoch": 1.947333480858597, "grad_norm": 1.9394328594207764, "learning_rate": 1.2333585025632874e-05, "loss": 0.1677, "step": 19800 }, { "epoch": 1.9483169826166065, "grad_norm": 2.2644760608673096, "learning_rate": 1.2329610936692764e-05, "loss": 0.304, "step": 19810 }, { "epoch": 1.9493004843746158, "grad_norm": 0.9643498659133911, "learning_rate": 1.2325636847752654e-05, "loss": 0.2295, "step": 19820 }, { "epoch": 1.950283986132625, "grad_norm": 0.6572133898735046, "learning_rate": 1.2321662758812543e-05, "loss": 0.2946, "step": 19830 }, { "epoch": 1.9512674878906346, "grad_norm": 0.9991806745529175, "learning_rate": 1.2317688669872431e-05, "loss": 0.1694, "step": 19840 }, { "epoch": 1.952250989648644, "grad_norm": 1.4036468267440796, "learning_rate": 1.2313714580932323e-05, "loss": 0.2091, "step": 19850 }, { "epoch": 1.9532344914066533, "grad_norm": 0.4763045012950897, "learning_rate": 1.2309740491992212e-05, "loss": 0.2523, "step": 19860 }, { "epoch": 1.9542179931646628, "grad_norm": 2.5620388984680176, "learning_rate": 1.2305766403052102e-05, "loss": 0.2626, "step": 19870 }, { "epoch": 1.9552014949226721, "grad_norm": 1.4312139749526978, "learning_rate": 1.2301792314111992e-05, "loss": 0.323, "step": 19880 }, { "epoch": 1.9561849966806815, "grad_norm": 0.5666785836219788, "learning_rate": 1.229781822517188e-05, "loss": 0.216, "step": 19890 }, { "epoch": 1.957168498438691, "grad_norm": 2.9127252101898193, "learning_rate": 1.229384413623177e-05, "loss": 0.3055, "step": 19900 }, { "epoch": 1.9581520001967003, "grad_norm": 0.6122851967811584, "learning_rate": 1.2289870047291659e-05, "loss": 0.2277, "step": 19910 }, { "epoch": 1.9591355019547096, "grad_norm": 0.7845405340194702, "learning_rate": 1.2285895958351549e-05, "loss": 0.185, "step": 19920 }, { "epoch": 1.9601190037127192, "grad_norm": 1.7378236055374146, "learning_rate": 1.228192186941144e-05, "loss": 0.2031, "step": 19930 }, { "epoch": 1.9611025054707285, "grad_norm": 1.2192821502685547, "learning_rate": 1.2277947780471328e-05, "loss": 0.1736, "step": 19940 }, { "epoch": 1.9620860072287378, "grad_norm": 0.7177460193634033, "learning_rate": 1.2273973691531218e-05, "loss": 0.2042, "step": 19950 }, { "epoch": 1.9630695089867474, "grad_norm": 0.33699867129325867, "learning_rate": 1.2269999602591107e-05, "loss": 0.1934, "step": 19960 }, { "epoch": 1.9640530107447567, "grad_norm": 0.47641661763191223, "learning_rate": 1.2266025513650997e-05, "loss": 0.233, "step": 19970 }, { "epoch": 1.965036512502766, "grad_norm": 0.7735099792480469, "learning_rate": 1.2262051424710885e-05, "loss": 0.2167, "step": 19980 }, { "epoch": 1.9660200142607756, "grad_norm": 0.8787487149238586, "learning_rate": 1.2258077335770775e-05, "loss": 0.2523, "step": 19990 }, { "epoch": 1.9670035160187849, "grad_norm": 0.5472764372825623, "learning_rate": 1.2254103246830664e-05, "loss": 0.1696, "step": 20000 }, { "epoch": 1.9670035160187849, "eval_loss": 0.1349218338727951, "eval_runtime": 16.6238, "eval_samples_per_second": 3.008, "eval_steps_per_second": 1.504, "step": 20000 }, { "epoch": 1.9679870177767942, "grad_norm": 1.3282941579818726, "learning_rate": 1.2250129157890556e-05, "loss": 0.2454, "step": 20010 }, { "epoch": 1.9689705195348037, "grad_norm": 0.5796499252319336, "learning_rate": 1.2246155068950445e-05, "loss": 0.2414, "step": 20020 }, { "epoch": 1.969954021292813, "grad_norm": 0.5740196704864502, "learning_rate": 1.2242180980010335e-05, "loss": 0.2331, "step": 20030 }, { "epoch": 1.9709375230508224, "grad_norm": 1.5052549839019775, "learning_rate": 1.2238206891070223e-05, "loss": 0.1921, "step": 20040 }, { "epoch": 1.971921024808832, "grad_norm": 1.2629809379577637, "learning_rate": 1.2234232802130113e-05, "loss": 0.198, "step": 20050 }, { "epoch": 1.9729045265668412, "grad_norm": 1.0493677854537964, "learning_rate": 1.2230258713190002e-05, "loss": 0.2535, "step": 20060 }, { "epoch": 1.9738880283248506, "grad_norm": 1.112351417541504, "learning_rate": 1.2226284624249892e-05, "loss": 0.2749, "step": 20070 }, { "epoch": 1.97487153008286, "grad_norm": 0.5583861470222473, "learning_rate": 1.222231053530978e-05, "loss": 0.2528, "step": 20080 }, { "epoch": 1.9758550318408694, "grad_norm": 1.9555214643478394, "learning_rate": 1.221833644636967e-05, "loss": 0.1862, "step": 20090 }, { "epoch": 1.9768385335988787, "grad_norm": 0.671154260635376, "learning_rate": 1.2214362357429561e-05, "loss": 0.1837, "step": 20100 }, { "epoch": 1.9778220353568883, "grad_norm": 1.681707501411438, "learning_rate": 1.221038826848945e-05, "loss": 0.2118, "step": 20110 }, { "epoch": 1.9788055371148976, "grad_norm": 0.433243989944458, "learning_rate": 1.220641417954934e-05, "loss": 0.3268, "step": 20120 }, { "epoch": 1.979789038872907, "grad_norm": 1.1356432437896729, "learning_rate": 1.2202440090609228e-05, "loss": 0.2099, "step": 20130 }, { "epoch": 1.9807725406309165, "grad_norm": 0.8848307728767395, "learning_rate": 1.2198466001669118e-05, "loss": 0.2387, "step": 20140 }, { "epoch": 1.9817560423889258, "grad_norm": 1.2508822679519653, "learning_rate": 1.2194491912729008e-05, "loss": 0.2348, "step": 20150 }, { "epoch": 1.982739544146935, "grad_norm": 1.3926819562911987, "learning_rate": 1.2190517823788897e-05, "loss": 0.1377, "step": 20160 }, { "epoch": 1.9837230459049446, "grad_norm": 1.2700151205062866, "learning_rate": 1.2186543734848785e-05, "loss": 0.217, "step": 20170 }, { "epoch": 1.984706547662954, "grad_norm": 1.615131139755249, "learning_rate": 1.2182569645908677e-05, "loss": 0.2022, "step": 20180 }, { "epoch": 1.9856900494209633, "grad_norm": 0.827870786190033, "learning_rate": 1.2178595556968566e-05, "loss": 0.3, "step": 20190 }, { "epoch": 1.9866735511789728, "grad_norm": 0.9399486780166626, "learning_rate": 1.2174621468028456e-05, "loss": 0.2433, "step": 20200 }, { "epoch": 1.9876570529369821, "grad_norm": 0.638542652130127, "learning_rate": 1.2170647379088346e-05, "loss": 0.2699, "step": 20210 }, { "epoch": 1.9886405546949915, "grad_norm": 0.666766881942749, "learning_rate": 1.2166673290148234e-05, "loss": 0.2616, "step": 20220 }, { "epoch": 1.989624056453001, "grad_norm": 1.5862071514129639, "learning_rate": 1.2162699201208123e-05, "loss": 0.189, "step": 20230 }, { "epoch": 1.9906075582110103, "grad_norm": 1.093273639678955, "learning_rate": 1.2158725112268013e-05, "loss": 0.3595, "step": 20240 }, { "epoch": 1.9915910599690196, "grad_norm": 0.5610359907150269, "learning_rate": 1.2154751023327903e-05, "loss": 0.2155, "step": 20250 }, { "epoch": 1.9925745617270292, "grad_norm": 0.38059747219085693, "learning_rate": 1.2150776934387794e-05, "loss": 0.1547, "step": 20260 }, { "epoch": 1.9935580634850385, "grad_norm": 0.5128862261772156, "learning_rate": 1.2146802845447684e-05, "loss": 0.2517, "step": 20270 }, { "epoch": 1.9945415652430478, "grad_norm": 0.22182230651378632, "learning_rate": 1.2142828756507572e-05, "loss": 0.167, "step": 20280 }, { "epoch": 1.9955250670010574, "grad_norm": 0.8813309073448181, "learning_rate": 1.2138854667567461e-05, "loss": 0.3163, "step": 20290 }, { "epoch": 1.9965085687590667, "grad_norm": 0.5298721194267273, "learning_rate": 1.213488057862735e-05, "loss": 0.2425, "step": 20300 }, { "epoch": 1.997492070517076, "grad_norm": 0.42940881848335266, "learning_rate": 1.213090648968724e-05, "loss": 0.1336, "step": 20310 }, { "epoch": 1.9984755722750855, "grad_norm": 0.486102819442749, "learning_rate": 1.2126932400747128e-05, "loss": 0.3274, "step": 20320 }, { "epoch": 1.9994590740330949, "grad_norm": 1.8005328178405762, "learning_rate": 1.2122958311807018e-05, "loss": 0.2202, "step": 20330 }, { "epoch": 2.000442575791104, "grad_norm": 1.0286778211593628, "learning_rate": 1.211898422286691e-05, "loss": 0.1598, "step": 20340 }, { "epoch": 2.0014260775491137, "grad_norm": 0.1806839257478714, "learning_rate": 1.21150101339268e-05, "loss": 0.1884, "step": 20350 }, { "epoch": 2.002409579307123, "grad_norm": 0.7888972163200378, "learning_rate": 1.2111036044986689e-05, "loss": 0.1484, "step": 20360 }, { "epoch": 2.0033930810651324, "grad_norm": 0.4760337769985199, "learning_rate": 1.2107061956046577e-05, "loss": 0.1861, "step": 20370 }, { "epoch": 2.004376582823142, "grad_norm": 1.431585431098938, "learning_rate": 1.2103087867106466e-05, "loss": 0.2105, "step": 20380 }, { "epoch": 2.005360084581151, "grad_norm": 1.5212070941925049, "learning_rate": 1.2099113778166356e-05, "loss": 0.2318, "step": 20390 }, { "epoch": 2.0063435863391605, "grad_norm": 0.7492786645889282, "learning_rate": 1.2095139689226246e-05, "loss": 0.2358, "step": 20400 }, { "epoch": 2.00732708809717, "grad_norm": 1.1958400011062622, "learning_rate": 1.2091165600286134e-05, "loss": 0.2445, "step": 20410 }, { "epoch": 2.008310589855179, "grad_norm": 1.0838582515716553, "learning_rate": 1.2087191511346025e-05, "loss": 0.1547, "step": 20420 }, { "epoch": 2.0092940916131887, "grad_norm": 2.8028171062469482, "learning_rate": 1.2083217422405915e-05, "loss": 0.1836, "step": 20430 }, { "epoch": 2.0102775933711983, "grad_norm": 0.11121039092540741, "learning_rate": 1.2079243333465804e-05, "loss": 0.2085, "step": 20440 }, { "epoch": 2.0112610951292074, "grad_norm": 0.6349248886108398, "learning_rate": 1.2075269244525694e-05, "loss": 0.1738, "step": 20450 }, { "epoch": 2.012244596887217, "grad_norm": 2.560642957687378, "learning_rate": 1.2071295155585582e-05, "loss": 0.2848, "step": 20460 }, { "epoch": 2.0132280986452264, "grad_norm": 0.6541864275932312, "learning_rate": 1.2067321066645472e-05, "loss": 0.2546, "step": 20470 }, { "epoch": 2.0142116004032355, "grad_norm": 0.8187896609306335, "learning_rate": 1.2063346977705361e-05, "loss": 0.189, "step": 20480 }, { "epoch": 2.015195102161245, "grad_norm": 2.4058218002319336, "learning_rate": 1.2059372888765251e-05, "loss": 0.181, "step": 20490 }, { "epoch": 2.0161786039192546, "grad_norm": 0.8954346776008606, "learning_rate": 1.2055398799825142e-05, "loss": 0.238, "step": 20500 }, { "epoch": 2.0161786039192546, "eval_loss": 0.13197824358940125, "eval_runtime": 18.561, "eval_samples_per_second": 2.694, "eval_steps_per_second": 1.347, "step": 20500 }, { "epoch": 2.0171621056772637, "grad_norm": 1.251834511756897, "learning_rate": 1.2051424710885032e-05, "loss": 0.2142, "step": 20510 }, { "epoch": 2.0181456074352733, "grad_norm": 1.2214981317520142, "learning_rate": 1.204745062194492e-05, "loss": 0.2327, "step": 20520 }, { "epoch": 2.019129109193283, "grad_norm": 1.1138148307800293, "learning_rate": 1.204347653300481e-05, "loss": 0.241, "step": 20530 }, { "epoch": 2.020112610951292, "grad_norm": 1.0483516454696655, "learning_rate": 1.20395024440647e-05, "loss": 0.2041, "step": 20540 }, { "epoch": 2.0210961127093015, "grad_norm": 0.6449570655822754, "learning_rate": 1.2035528355124589e-05, "loss": 0.204, "step": 20550 }, { "epoch": 2.022079614467311, "grad_norm": 0.3355649411678314, "learning_rate": 1.2031554266184477e-05, "loss": 0.236, "step": 20560 }, { "epoch": 2.02306311622532, "grad_norm": 1.418927788734436, "learning_rate": 1.2027580177244367e-05, "loss": 0.2231, "step": 20570 }, { "epoch": 2.0240466179833296, "grad_norm": 1.9252437353134155, "learning_rate": 1.2023606088304258e-05, "loss": 0.1752, "step": 20580 }, { "epoch": 2.025030119741339, "grad_norm": 0.509128749370575, "learning_rate": 1.2019631999364148e-05, "loss": 0.2209, "step": 20590 }, { "epoch": 2.0260136214993483, "grad_norm": 1.4364620447158813, "learning_rate": 1.2015657910424037e-05, "loss": 0.2018, "step": 20600 }, { "epoch": 2.026997123257358, "grad_norm": 0.6519067287445068, "learning_rate": 1.2011683821483925e-05, "loss": 0.2391, "step": 20610 }, { "epoch": 2.0279806250153674, "grad_norm": 0.9485163688659668, "learning_rate": 1.2007709732543815e-05, "loss": 0.1662, "step": 20620 }, { "epoch": 2.0289641267733765, "grad_norm": 0.6042230129241943, "learning_rate": 1.2003735643603705e-05, "loss": 0.1974, "step": 20630 }, { "epoch": 2.029947628531386, "grad_norm": 0.9156279563903809, "learning_rate": 1.1999761554663594e-05, "loss": 0.2873, "step": 20640 }, { "epoch": 2.0309311302893955, "grad_norm": 1.7737699747085571, "learning_rate": 1.1995787465723482e-05, "loss": 0.2187, "step": 20650 }, { "epoch": 2.0319146320474046, "grad_norm": 1.2266044616699219, "learning_rate": 1.1991813376783374e-05, "loss": 0.214, "step": 20660 }, { "epoch": 2.032898133805414, "grad_norm": 1.0398406982421875, "learning_rate": 1.1987839287843263e-05, "loss": 0.1384, "step": 20670 }, { "epoch": 2.0338816355634237, "grad_norm": 0.5955280065536499, "learning_rate": 1.1983865198903153e-05, "loss": 0.3007, "step": 20680 }, { "epoch": 2.034865137321433, "grad_norm": 2.1822447776794434, "learning_rate": 1.1979891109963043e-05, "loss": 0.2657, "step": 20690 }, { "epoch": 2.0358486390794424, "grad_norm": 0.9914331436157227, "learning_rate": 1.197591702102293e-05, "loss": 0.1636, "step": 20700 }, { "epoch": 2.036832140837452, "grad_norm": 0.28163185715675354, "learning_rate": 1.197194293208282e-05, "loss": 0.1733, "step": 20710 }, { "epoch": 2.037815642595461, "grad_norm": 1.0898809432983398, "learning_rate": 1.196796884314271e-05, "loss": 0.128, "step": 20720 }, { "epoch": 2.0387991443534705, "grad_norm": 0.48158377408981323, "learning_rate": 1.19639947542026e-05, "loss": 0.3055, "step": 20730 }, { "epoch": 2.03978264611148, "grad_norm": 0.6361274123191833, "learning_rate": 1.1960020665262491e-05, "loss": 0.2653, "step": 20740 }, { "epoch": 2.040766147869489, "grad_norm": 0.229063481092453, "learning_rate": 1.195604657632238e-05, "loss": 0.2892, "step": 20750 }, { "epoch": 2.0417496496274987, "grad_norm": 0.7816784381866455, "learning_rate": 1.1952072487382269e-05, "loss": 0.2426, "step": 20760 }, { "epoch": 2.0427331513855083, "grad_norm": 1.1882227659225464, "learning_rate": 1.1948098398442158e-05, "loss": 0.2364, "step": 20770 }, { "epoch": 2.0437166531435174, "grad_norm": 1.2545922994613647, "learning_rate": 1.1944124309502048e-05, "loss": 0.2674, "step": 20780 }, { "epoch": 2.044700154901527, "grad_norm": 1.2698802947998047, "learning_rate": 1.1940150220561938e-05, "loss": 0.2482, "step": 20790 }, { "epoch": 2.0456836566595364, "grad_norm": 1.024978756904602, "learning_rate": 1.1936176131621826e-05, "loss": 0.3956, "step": 20800 }, { "epoch": 2.0466671584175455, "grad_norm": 0.33348727226257324, "learning_rate": 1.1932202042681715e-05, "loss": 0.2243, "step": 20810 }, { "epoch": 2.047650660175555, "grad_norm": 2.104206085205078, "learning_rate": 1.1928227953741607e-05, "loss": 0.2308, "step": 20820 }, { "epoch": 2.0486341619335646, "grad_norm": 0.27045074105262756, "learning_rate": 1.1924253864801496e-05, "loss": 0.2153, "step": 20830 }, { "epoch": 2.0496176636915737, "grad_norm": 1.4179080724716187, "learning_rate": 1.1920279775861386e-05, "loss": 0.164, "step": 20840 }, { "epoch": 2.0506011654495833, "grad_norm": 0.8275901079177856, "learning_rate": 1.1916305686921274e-05, "loss": 0.3085, "step": 20850 }, { "epoch": 2.051584667207593, "grad_norm": 0.627906858921051, "learning_rate": 1.1912331597981164e-05, "loss": 0.1773, "step": 20860 }, { "epoch": 2.052568168965602, "grad_norm": 0.8649151921272278, "learning_rate": 1.1908357509041053e-05, "loss": 0.1718, "step": 20870 }, { "epoch": 2.0535516707236114, "grad_norm": 0.573418378829956, "learning_rate": 1.1904383420100943e-05, "loss": 0.2993, "step": 20880 }, { "epoch": 2.054535172481621, "grad_norm": 1.994998812675476, "learning_rate": 1.1900409331160831e-05, "loss": 0.2411, "step": 20890 }, { "epoch": 2.05551867423963, "grad_norm": 0.6588026285171509, "learning_rate": 1.189643524222072e-05, "loss": 0.1765, "step": 20900 }, { "epoch": 2.0565021759976396, "grad_norm": 1.3483917713165283, "learning_rate": 1.1892461153280612e-05, "loss": 0.3692, "step": 20910 }, { "epoch": 2.057485677755649, "grad_norm": 1.204897165298462, "learning_rate": 1.1888487064340502e-05, "loss": 0.1944, "step": 20920 }, { "epoch": 2.0584691795136583, "grad_norm": 0.5633821487426758, "learning_rate": 1.1884512975400391e-05, "loss": 0.2193, "step": 20930 }, { "epoch": 2.059452681271668, "grad_norm": 0.7229523658752441, "learning_rate": 1.188053888646028e-05, "loss": 0.1945, "step": 20940 }, { "epoch": 2.0604361830296773, "grad_norm": 3.430799961090088, "learning_rate": 1.1876564797520169e-05, "loss": 0.131, "step": 20950 }, { "epoch": 2.0614196847876864, "grad_norm": 0.41817110776901245, "learning_rate": 1.1872590708580059e-05, "loss": 0.3194, "step": 20960 }, { "epoch": 2.062403186545696, "grad_norm": 0.5513402223587036, "learning_rate": 1.1868616619639948e-05, "loss": 0.2289, "step": 20970 }, { "epoch": 2.0633866883037055, "grad_norm": 0.4411904513835907, "learning_rate": 1.1864642530699836e-05, "loss": 0.2082, "step": 20980 }, { "epoch": 2.0643701900617146, "grad_norm": 0.8698786497116089, "learning_rate": 1.186066844175973e-05, "loss": 0.199, "step": 20990 }, { "epoch": 2.065353691819724, "grad_norm": 0.9931325912475586, "learning_rate": 1.1856694352819617e-05, "loss": 0.1714, "step": 21000 }, { "epoch": 2.065353691819724, "eval_loss": 0.12633630633354187, "eval_runtime": 19.4678, "eval_samples_per_second": 2.568, "eval_steps_per_second": 1.284, "step": 21000 }, { "epoch": 2.0663371935777337, "grad_norm": 0.9795089960098267, "learning_rate": 1.1852720263879507e-05, "loss": 0.207, "step": 21010 }, { "epoch": 2.067320695335743, "grad_norm": 1.8761729001998901, "learning_rate": 1.1848746174939397e-05, "loss": 0.1822, "step": 21020 }, { "epoch": 2.0683041970937524, "grad_norm": 0.8495930433273315, "learning_rate": 1.1844772085999286e-05, "loss": 0.1868, "step": 21030 }, { "epoch": 2.069287698851762, "grad_norm": 2.16050386428833, "learning_rate": 1.1840797997059174e-05, "loss": 0.1159, "step": 21040 }, { "epoch": 2.070271200609771, "grad_norm": 0.6666444540023804, "learning_rate": 1.1836823908119064e-05, "loss": 0.1658, "step": 21050 }, { "epoch": 2.0712547023677805, "grad_norm": 0.551084041595459, "learning_rate": 1.1832849819178954e-05, "loss": 0.179, "step": 21060 }, { "epoch": 2.07223820412579, "grad_norm": 0.5672010183334351, "learning_rate": 1.1828875730238845e-05, "loss": 0.155, "step": 21070 }, { "epoch": 2.073221705883799, "grad_norm": 0.702887237071991, "learning_rate": 1.1824901641298735e-05, "loss": 0.2037, "step": 21080 }, { "epoch": 2.0742052076418087, "grad_norm": 0.5012316107749939, "learning_rate": 1.1820927552358622e-05, "loss": 0.2157, "step": 21090 }, { "epoch": 2.0751887093998183, "grad_norm": 1.0360255241394043, "learning_rate": 1.1816953463418512e-05, "loss": 0.259, "step": 21100 }, { "epoch": 2.0761722111578274, "grad_norm": 2.1426212787628174, "learning_rate": 1.1812979374478402e-05, "loss": 0.2645, "step": 21110 }, { "epoch": 2.077155712915837, "grad_norm": 0.9235216975212097, "learning_rate": 1.1809005285538291e-05, "loss": 0.1871, "step": 21120 }, { "epoch": 2.0781392146738464, "grad_norm": 2.114724636077881, "learning_rate": 1.180503119659818e-05, "loss": 0.2081, "step": 21130 }, { "epoch": 2.0791227164318555, "grad_norm": 0.5473008751869202, "learning_rate": 1.1801057107658069e-05, "loss": 0.2544, "step": 21140 }, { "epoch": 2.080106218189865, "grad_norm": 0.6295793652534485, "learning_rate": 1.179708301871796e-05, "loss": 0.2624, "step": 21150 }, { "epoch": 2.0810897199478746, "grad_norm": 1.548294186592102, "learning_rate": 1.179310892977785e-05, "loss": 0.2725, "step": 21160 }, { "epoch": 2.0820732217058837, "grad_norm": 1.4170395135879517, "learning_rate": 1.178913484083774e-05, "loss": 0.2087, "step": 21170 }, { "epoch": 2.0830567234638933, "grad_norm": 1.2778583765029907, "learning_rate": 1.1785160751897628e-05, "loss": 0.1622, "step": 21180 }, { "epoch": 2.084040225221903, "grad_norm": 0.39172619581222534, "learning_rate": 1.1781186662957517e-05, "loss": 0.0954, "step": 21190 }, { "epoch": 2.085023726979912, "grad_norm": 2.5332343578338623, "learning_rate": 1.1777212574017407e-05, "loss": 0.2202, "step": 21200 }, { "epoch": 2.0860072287379214, "grad_norm": 0.7690456509590149, "learning_rate": 1.1773238485077297e-05, "loss": 0.2473, "step": 21210 }, { "epoch": 2.0869907304959305, "grad_norm": 0.6942778825759888, "learning_rate": 1.1769264396137185e-05, "loss": 0.249, "step": 21220 }, { "epoch": 2.08797423225394, "grad_norm": 3.2177066802978516, "learning_rate": 1.1765290307197078e-05, "loss": 0.181, "step": 21230 }, { "epoch": 2.0889577340119496, "grad_norm": 0.6732898354530334, "learning_rate": 1.1761316218256966e-05, "loss": 0.1209, "step": 21240 }, { "epoch": 2.0899412357699587, "grad_norm": 1.3652567863464355, "learning_rate": 1.1757342129316855e-05, "loss": 0.1869, "step": 21250 }, { "epoch": 2.0909247375279683, "grad_norm": 1.0012937784194946, "learning_rate": 1.1753368040376745e-05, "loss": 0.231, "step": 21260 }, { "epoch": 2.091908239285978, "grad_norm": 0.48299527168273926, "learning_rate": 1.1749393951436635e-05, "loss": 0.2221, "step": 21270 }, { "epoch": 2.092891741043987, "grad_norm": 0.378261536359787, "learning_rate": 1.1745419862496523e-05, "loss": 0.166, "step": 21280 }, { "epoch": 2.0938752428019964, "grad_norm": 0.937616765499115, "learning_rate": 1.1741445773556412e-05, "loss": 0.1401, "step": 21290 }, { "epoch": 2.094858744560006, "grad_norm": 1.5174037218093872, "learning_rate": 1.1737471684616302e-05, "loss": 0.2337, "step": 21300 }, { "epoch": 2.095842246318015, "grad_norm": 1.0154869556427002, "learning_rate": 1.1733497595676193e-05, "loss": 0.1803, "step": 21310 }, { "epoch": 2.0968257480760246, "grad_norm": 1.1122788190841675, "learning_rate": 1.1729523506736083e-05, "loss": 0.2925, "step": 21320 }, { "epoch": 2.097809249834034, "grad_norm": 0.744653582572937, "learning_rate": 1.1725549417795971e-05, "loss": 0.1841, "step": 21330 }, { "epoch": 2.0987927515920433, "grad_norm": 1.2522811889648438, "learning_rate": 1.172157532885586e-05, "loss": 0.2478, "step": 21340 }, { "epoch": 2.099776253350053, "grad_norm": 2.0904698371887207, "learning_rate": 1.171760123991575e-05, "loss": 0.1925, "step": 21350 }, { "epoch": 2.1007597551080623, "grad_norm": 0.4512757360935211, "learning_rate": 1.171362715097564e-05, "loss": 0.3095, "step": 21360 }, { "epoch": 2.1017432568660714, "grad_norm": 0.8685400485992432, "learning_rate": 1.1709653062035528e-05, "loss": 0.2012, "step": 21370 }, { "epoch": 2.102726758624081, "grad_norm": 1.649245262145996, "learning_rate": 1.1705678973095418e-05, "loss": 0.0771, "step": 21380 }, { "epoch": 2.1037102603820905, "grad_norm": 2.175687789916992, "learning_rate": 1.1701704884155309e-05, "loss": 0.2466, "step": 21390 }, { "epoch": 2.1046937621400996, "grad_norm": 0.8011319637298584, "learning_rate": 1.1697730795215199e-05, "loss": 0.2011, "step": 21400 }, { "epoch": 2.105677263898109, "grad_norm": 0.9571422934532166, "learning_rate": 1.1693756706275088e-05, "loss": 0.2437, "step": 21410 }, { "epoch": 2.1066607656561187, "grad_norm": 0.3107330799102783, "learning_rate": 1.1689782617334976e-05, "loss": 0.2645, "step": 21420 }, { "epoch": 2.107644267414128, "grad_norm": 0.8494362235069275, "learning_rate": 1.1685808528394866e-05, "loss": 0.2183, "step": 21430 }, { "epoch": 2.1086277691721373, "grad_norm": 0.581502377986908, "learning_rate": 1.1681834439454756e-05, "loss": 0.1728, "step": 21440 }, { "epoch": 2.109611270930147, "grad_norm": 1.8993754386901855, "learning_rate": 1.1677860350514645e-05, "loss": 0.1762, "step": 21450 }, { "epoch": 2.110594772688156, "grad_norm": 2.3973135948181152, "learning_rate": 1.1673886261574533e-05, "loss": 0.1829, "step": 21460 }, { "epoch": 2.1115782744461655, "grad_norm": 1.4782636165618896, "learning_rate": 1.1669912172634426e-05, "loss": 0.2052, "step": 21470 }, { "epoch": 2.112561776204175, "grad_norm": 1.6421269178390503, "learning_rate": 1.1665938083694314e-05, "loss": 0.2584, "step": 21480 }, { "epoch": 2.113545277962184, "grad_norm": 0.34578168392181396, "learning_rate": 1.1661963994754204e-05, "loss": 0.2365, "step": 21490 }, { "epoch": 2.1145287797201937, "grad_norm": 0.6166962385177612, "learning_rate": 1.1657989905814094e-05, "loss": 0.2209, "step": 21500 }, { "epoch": 2.1145287797201937, "eval_loss": 0.12207618355751038, "eval_runtime": 19.3719, "eval_samples_per_second": 2.581, "eval_steps_per_second": 1.291, "step": 21500 }, { "epoch": 2.1155122814782032, "grad_norm": 1.2740017175674438, "learning_rate": 1.1654015816873983e-05, "loss": 0.1723, "step": 21510 }, { "epoch": 2.1164957832362123, "grad_norm": 1.4409515857696533, "learning_rate": 1.1650041727933871e-05, "loss": 0.1431, "step": 21520 }, { "epoch": 2.117479284994222, "grad_norm": 0.5763534307479858, "learning_rate": 1.1646067638993761e-05, "loss": 0.2046, "step": 21530 }, { "epoch": 2.1184627867522314, "grad_norm": 1.4775296449661255, "learning_rate": 1.164209355005365e-05, "loss": 0.2873, "step": 21540 }, { "epoch": 2.1194462885102405, "grad_norm": 1.134960412979126, "learning_rate": 1.1638119461113542e-05, "loss": 0.2338, "step": 21550 }, { "epoch": 2.12042979026825, "grad_norm": 1.5541033744812012, "learning_rate": 1.1634145372173432e-05, "loss": 0.2024, "step": 21560 }, { "epoch": 2.1214132920262596, "grad_norm": 0.3558676838874817, "learning_rate": 1.163017128323332e-05, "loss": 0.2318, "step": 21570 }, { "epoch": 2.1223967937842687, "grad_norm": 0.7510431408882141, "learning_rate": 1.162619719429321e-05, "loss": 0.2736, "step": 21580 }, { "epoch": 2.1233802955422783, "grad_norm": 0.7550353407859802, "learning_rate": 1.1622223105353099e-05, "loss": 0.1648, "step": 21590 }, { "epoch": 2.124363797300288, "grad_norm": 2.832073450088501, "learning_rate": 1.1618249016412989e-05, "loss": 0.1297, "step": 21600 }, { "epoch": 2.125347299058297, "grad_norm": 1.1138674020767212, "learning_rate": 1.1614274927472877e-05, "loss": 0.1841, "step": 21610 }, { "epoch": 2.1263308008163064, "grad_norm": 0.8778446912765503, "learning_rate": 1.1610300838532766e-05, "loss": 0.2238, "step": 21620 }, { "epoch": 2.127314302574316, "grad_norm": 1.0984127521514893, "learning_rate": 1.1606326749592658e-05, "loss": 0.1756, "step": 21630 }, { "epoch": 2.128297804332325, "grad_norm": 1.374091386795044, "learning_rate": 1.1602352660652547e-05, "loss": 0.1335, "step": 21640 }, { "epoch": 2.1292813060903346, "grad_norm": 0.6483007669448853, "learning_rate": 1.1598378571712437e-05, "loss": 0.2623, "step": 21650 }, { "epoch": 2.130264807848344, "grad_norm": 0.8100119233131409, "learning_rate": 1.1594404482772325e-05, "loss": 0.1401, "step": 21660 }, { "epoch": 2.1312483096063533, "grad_norm": 0.619308352470398, "learning_rate": 1.1590430393832215e-05, "loss": 0.2544, "step": 21670 }, { "epoch": 2.132231811364363, "grad_norm": 0.19790104031562805, "learning_rate": 1.1586456304892104e-05, "loss": 0.2049, "step": 21680 }, { "epoch": 2.1332153131223723, "grad_norm": 0.6911652684211731, "learning_rate": 1.1582482215951994e-05, "loss": 0.2325, "step": 21690 }, { "epoch": 2.1341988148803814, "grad_norm": 0.9950113296508789, "learning_rate": 1.1578508127011882e-05, "loss": 0.2132, "step": 21700 }, { "epoch": 2.135182316638391, "grad_norm": 1.3821048736572266, "learning_rate": 1.1574534038071772e-05, "loss": 0.2346, "step": 21710 }, { "epoch": 2.1361658183964005, "grad_norm": 0.7430508732795715, "learning_rate": 1.1570559949131663e-05, "loss": 0.2575, "step": 21720 }, { "epoch": 2.1371493201544096, "grad_norm": 2.4440536499023438, "learning_rate": 1.1566585860191553e-05, "loss": 0.2214, "step": 21730 }, { "epoch": 2.138132821912419, "grad_norm": 4.348588466644287, "learning_rate": 1.1562611771251442e-05, "loss": 0.1869, "step": 21740 }, { "epoch": 2.1391163236704287, "grad_norm": 0.4643959403038025, "learning_rate": 1.1558637682311332e-05, "loss": 0.2129, "step": 21750 }, { "epoch": 2.140099825428438, "grad_norm": 1.3620189428329468, "learning_rate": 1.155466359337122e-05, "loss": 0.2255, "step": 21760 }, { "epoch": 2.1410833271864473, "grad_norm": 1.957170844078064, "learning_rate": 1.155068950443111e-05, "loss": 0.1781, "step": 21770 }, { "epoch": 2.142066828944457, "grad_norm": 1.7803796529769897, "learning_rate": 1.1546715415491e-05, "loss": 0.3, "step": 21780 }, { "epoch": 2.143050330702466, "grad_norm": 0.6519695520401001, "learning_rate": 1.1542741326550889e-05, "loss": 0.1946, "step": 21790 }, { "epoch": 2.1440338324604755, "grad_norm": 0.34365659952163696, "learning_rate": 1.153876723761078e-05, "loss": 0.3447, "step": 21800 }, { "epoch": 2.145017334218485, "grad_norm": 1.302046298980713, "learning_rate": 1.1534793148670668e-05, "loss": 0.3466, "step": 21810 }, { "epoch": 2.146000835976494, "grad_norm": 0.7232846617698669, "learning_rate": 1.1530819059730558e-05, "loss": 0.2998, "step": 21820 }, { "epoch": 2.1469843377345037, "grad_norm": 1.635127305984497, "learning_rate": 1.1526844970790448e-05, "loss": 0.1282, "step": 21830 }, { "epoch": 2.1479678394925132, "grad_norm": 1.3019146919250488, "learning_rate": 1.1522870881850337e-05, "loss": 0.2473, "step": 21840 }, { "epoch": 2.1489513412505223, "grad_norm": 0.8861162066459656, "learning_rate": 1.1518896792910225e-05, "loss": 0.1754, "step": 21850 }, { "epoch": 2.149934843008532, "grad_norm": 1.6959495544433594, "learning_rate": 1.1514922703970115e-05, "loss": 0.1867, "step": 21860 }, { "epoch": 2.1509183447665414, "grad_norm": 0.5939361453056335, "learning_rate": 1.1510948615030004e-05, "loss": 0.2335, "step": 21870 }, { "epoch": 2.1519018465245505, "grad_norm": 0.9883837699890137, "learning_rate": 1.1506974526089896e-05, "loss": 0.1448, "step": 21880 }, { "epoch": 2.15288534828256, "grad_norm": 0.3421158194541931, "learning_rate": 1.1503000437149786e-05, "loss": 0.1313, "step": 21890 }, { "epoch": 2.1538688500405696, "grad_norm": 0.8726436495780945, "learning_rate": 1.1499026348209673e-05, "loss": 0.1874, "step": 21900 }, { "epoch": 2.1548523517985787, "grad_norm": 1.3549585342407227, "learning_rate": 1.1495052259269563e-05, "loss": 0.1903, "step": 21910 }, { "epoch": 2.1558358535565882, "grad_norm": 1.9957268238067627, "learning_rate": 1.1491078170329453e-05, "loss": 0.2602, "step": 21920 }, { "epoch": 2.156819355314598, "grad_norm": 0.9010176658630371, "learning_rate": 1.1487104081389342e-05, "loss": 0.2442, "step": 21930 }, { "epoch": 2.157802857072607, "grad_norm": 0.6982910633087158, "learning_rate": 1.148312999244923e-05, "loss": 0.1611, "step": 21940 }, { "epoch": 2.1587863588306164, "grad_norm": 0.8347655534744263, "learning_rate": 1.147915590350912e-05, "loss": 0.2313, "step": 21950 }, { "epoch": 2.159769860588626, "grad_norm": 3.3864970207214355, "learning_rate": 1.1475181814569011e-05, "loss": 0.2354, "step": 21960 }, { "epoch": 2.160753362346635, "grad_norm": 2.0524072647094727, "learning_rate": 1.1471207725628901e-05, "loss": 0.303, "step": 21970 }, { "epoch": 2.1617368641046446, "grad_norm": 0.5293691158294678, "learning_rate": 1.146723363668879e-05, "loss": 0.1661, "step": 21980 }, { "epoch": 2.162720365862654, "grad_norm": 0.9710631966590881, "learning_rate": 1.146325954774868e-05, "loss": 0.1995, "step": 21990 }, { "epoch": 2.1637038676206632, "grad_norm": 4.298491477966309, "learning_rate": 1.1459285458808568e-05, "loss": 0.126, "step": 22000 }, { "epoch": 2.1637038676206632, "eval_loss": 0.12008929252624512, "eval_runtime": 22.1618, "eval_samples_per_second": 2.256, "eval_steps_per_second": 1.128, "step": 22000 }, { "epoch": 2.164687369378673, "grad_norm": 1.042594313621521, "learning_rate": 1.1455311369868458e-05, "loss": 0.0853, "step": 22010 }, { "epoch": 2.1656708711366823, "grad_norm": 0.550438404083252, "learning_rate": 1.1451337280928348e-05, "loss": 0.2328, "step": 22020 }, { "epoch": 2.1666543728946914, "grad_norm": 2.2264037132263184, "learning_rate": 1.1447363191988237e-05, "loss": 0.2319, "step": 22030 }, { "epoch": 2.167637874652701, "grad_norm": 0.7744798064231873, "learning_rate": 1.1443389103048129e-05, "loss": 0.1784, "step": 22040 }, { "epoch": 2.1686213764107105, "grad_norm": 0.25838974118232727, "learning_rate": 1.1439415014108017e-05, "loss": 0.1522, "step": 22050 }, { "epoch": 2.1696048781687196, "grad_norm": 1.460982084274292, "learning_rate": 1.1435440925167906e-05, "loss": 0.1469, "step": 22060 }, { "epoch": 2.170588379926729, "grad_norm": 0.9990485310554504, "learning_rate": 1.1431466836227796e-05, "loss": 0.2747, "step": 22070 }, { "epoch": 2.1715718816847387, "grad_norm": 1.4448250532150269, "learning_rate": 1.1427492747287686e-05, "loss": 0.1212, "step": 22080 }, { "epoch": 2.172555383442748, "grad_norm": 2.9366111755371094, "learning_rate": 1.1423518658347574e-05, "loss": 0.2416, "step": 22090 }, { "epoch": 2.1735388852007573, "grad_norm": 1.9704415798187256, "learning_rate": 1.1419544569407463e-05, "loss": 0.2473, "step": 22100 }, { "epoch": 2.174522386958767, "grad_norm": 0.8068645000457764, "learning_rate": 1.1415570480467353e-05, "loss": 0.216, "step": 22110 }, { "epoch": 2.175505888716776, "grad_norm": 1.2421001195907593, "learning_rate": 1.1411596391527244e-05, "loss": 0.2233, "step": 22120 }, { "epoch": 2.1764893904747855, "grad_norm": 1.0471240282058716, "learning_rate": 1.1407622302587134e-05, "loss": 0.2244, "step": 22130 }, { "epoch": 2.177472892232795, "grad_norm": 0.49878430366516113, "learning_rate": 1.1403648213647022e-05, "loss": 0.23, "step": 22140 }, { "epoch": 2.178456393990804, "grad_norm": 1.0834639072418213, "learning_rate": 1.1399674124706912e-05, "loss": 0.1489, "step": 22150 }, { "epoch": 2.1794398957488137, "grad_norm": 1.5779012441635132, "learning_rate": 1.1395700035766801e-05, "loss": 0.2697, "step": 22160 }, { "epoch": 2.1804233975068232, "grad_norm": 4.6256279945373535, "learning_rate": 1.1391725946826691e-05, "loss": 0.1769, "step": 22170 }, { "epoch": 2.1814068992648323, "grad_norm": 2.0662331581115723, "learning_rate": 1.1387751857886579e-05, "loss": 0.213, "step": 22180 }, { "epoch": 2.182390401022842, "grad_norm": 0.6344393491744995, "learning_rate": 1.1383777768946469e-05, "loss": 0.3054, "step": 22190 }, { "epoch": 2.1833739027808514, "grad_norm": 1.839341640472412, "learning_rate": 1.137980368000636e-05, "loss": 0.1929, "step": 22200 }, { "epoch": 2.1843574045388605, "grad_norm": 2.015293836593628, "learning_rate": 1.137582959106625e-05, "loss": 0.2403, "step": 22210 }, { "epoch": 2.18534090629687, "grad_norm": 0.6259804964065552, "learning_rate": 1.137185550212614e-05, "loss": 0.2405, "step": 22220 }, { "epoch": 2.1863244080548796, "grad_norm": 0.1938798427581787, "learning_rate": 1.1367881413186029e-05, "loss": 0.1978, "step": 22230 }, { "epoch": 2.1873079098128887, "grad_norm": 0.419461190700531, "learning_rate": 1.1363907324245917e-05, "loss": 0.2166, "step": 22240 }, { "epoch": 2.1882914115708982, "grad_norm": 1.0594192743301392, "learning_rate": 1.1359933235305807e-05, "loss": 0.287, "step": 22250 }, { "epoch": 2.189274913328908, "grad_norm": 1.3059344291687012, "learning_rate": 1.1355959146365696e-05, "loss": 0.2602, "step": 22260 }, { "epoch": 2.190258415086917, "grad_norm": 0.9751050472259521, "learning_rate": 1.1351985057425586e-05, "loss": 0.1706, "step": 22270 }, { "epoch": 2.1912419168449264, "grad_norm": 0.6977351903915405, "learning_rate": 1.1348010968485477e-05, "loss": 0.2563, "step": 22280 }, { "epoch": 2.192225418602936, "grad_norm": 0.644167959690094, "learning_rate": 1.1344036879545365e-05, "loss": 0.1823, "step": 22290 }, { "epoch": 2.193208920360945, "grad_norm": 0.7303948998451233, "learning_rate": 1.1340062790605255e-05, "loss": 0.24, "step": 22300 }, { "epoch": 2.1941924221189546, "grad_norm": 1.255916953086853, "learning_rate": 1.1336088701665145e-05, "loss": 0.2991, "step": 22310 }, { "epoch": 2.195175923876964, "grad_norm": 1.121828317642212, "learning_rate": 1.1332114612725034e-05, "loss": 0.1636, "step": 22320 }, { "epoch": 2.1961594256349732, "grad_norm": 0.6688191890716553, "learning_rate": 1.1328140523784922e-05, "loss": 0.2715, "step": 22330 }, { "epoch": 2.197142927392983, "grad_norm": 1.378141164779663, "learning_rate": 1.1324166434844812e-05, "loss": 0.1942, "step": 22340 }, { "epoch": 2.1981264291509923, "grad_norm": 0.4261777698993683, "learning_rate": 1.1320192345904702e-05, "loss": 0.2135, "step": 22350 }, { "epoch": 2.1991099309090014, "grad_norm": 0.3720337450504303, "learning_rate": 1.1316218256964593e-05, "loss": 0.262, "step": 22360 }, { "epoch": 2.200093432667011, "grad_norm": 0.6999090909957886, "learning_rate": 1.1312244168024483e-05, "loss": 0.1933, "step": 22370 }, { "epoch": 2.2010769344250205, "grad_norm": 0.9601486921310425, "learning_rate": 1.130827007908437e-05, "loss": 0.3139, "step": 22380 }, { "epoch": 2.2020604361830296, "grad_norm": 0.9890156984329224, "learning_rate": 1.130429599014426e-05, "loss": 0.217, "step": 22390 }, { "epoch": 2.203043937941039, "grad_norm": 1.0042225122451782, "learning_rate": 1.130032190120415e-05, "loss": 0.29, "step": 22400 }, { "epoch": 2.2040274396990487, "grad_norm": 0.8810020089149475, "learning_rate": 1.129634781226404e-05, "loss": 0.2314, "step": 22410 }, { "epoch": 2.205010941457058, "grad_norm": 3.3902359008789062, "learning_rate": 1.1292373723323928e-05, "loss": 0.3176, "step": 22420 }, { "epoch": 2.2059944432150673, "grad_norm": 0.7750841379165649, "learning_rate": 1.1288399634383817e-05, "loss": 0.2395, "step": 22430 }, { "epoch": 2.206977944973077, "grad_norm": 0.47941309213638306, "learning_rate": 1.1284425545443709e-05, "loss": 0.2485, "step": 22440 }, { "epoch": 2.207961446731086, "grad_norm": 0.6404972076416016, "learning_rate": 1.1280451456503598e-05, "loss": 0.193, "step": 22450 }, { "epoch": 2.2089449484890955, "grad_norm": 0.5544617176055908, "learning_rate": 1.1276477367563488e-05, "loss": 0.2392, "step": 22460 }, { "epoch": 2.209928450247105, "grad_norm": 0.6468740701675415, "learning_rate": 1.1272503278623378e-05, "loss": 0.1146, "step": 22470 }, { "epoch": 2.210911952005114, "grad_norm": 1.14541494846344, "learning_rate": 1.1268529189683266e-05, "loss": 0.1455, "step": 22480 }, { "epoch": 2.2118954537631237, "grad_norm": 1.0405633449554443, "learning_rate": 1.1264555100743155e-05, "loss": 0.2719, "step": 22490 }, { "epoch": 2.2128789555211332, "grad_norm": 0.6183965802192688, "learning_rate": 1.1260581011803045e-05, "loss": 0.3315, "step": 22500 }, { "epoch": 2.2128789555211332, "eval_loss": 0.12000677734613419, "eval_runtime": 16.723, "eval_samples_per_second": 2.99, "eval_steps_per_second": 1.495, "step": 22500 }, { "epoch": 2.2138624572791423, "grad_norm": 0.9129937887191772, "learning_rate": 1.1256606922862935e-05, "loss": 0.2667, "step": 22510 }, { "epoch": 2.214845959037152, "grad_norm": 0.31970441341400146, "learning_rate": 1.1252632833922823e-05, "loss": 0.0376, "step": 22520 }, { "epoch": 2.215829460795161, "grad_norm": 0.9021411538124084, "learning_rate": 1.1248658744982714e-05, "loss": 0.2351, "step": 22530 }, { "epoch": 2.2168129625531705, "grad_norm": 2.631866455078125, "learning_rate": 1.1244684656042604e-05, "loss": 0.2439, "step": 22540 }, { "epoch": 2.21779646431118, "grad_norm": 0.9322601556777954, "learning_rate": 1.1240710567102493e-05, "loss": 0.2018, "step": 22550 }, { "epoch": 2.218779966069189, "grad_norm": 0.3407883942127228, "learning_rate": 1.1236736478162383e-05, "loss": 0.1916, "step": 22560 }, { "epoch": 2.2197634678271987, "grad_norm": 3.0421650409698486, "learning_rate": 1.1232762389222271e-05, "loss": 0.1731, "step": 22570 }, { "epoch": 2.2207469695852082, "grad_norm": 1.467153787612915, "learning_rate": 1.122878830028216e-05, "loss": 0.2077, "step": 22580 }, { "epoch": 2.2217304713432173, "grad_norm": 2.4003944396972656, "learning_rate": 1.122481421134205e-05, "loss": 0.2574, "step": 22590 }, { "epoch": 2.222713973101227, "grad_norm": 0.5600699782371521, "learning_rate": 1.122084012240194e-05, "loss": 0.2947, "step": 22600 }, { "epoch": 2.2236974748592364, "grad_norm": 0.11170317977666855, "learning_rate": 1.1216866033461831e-05, "loss": 0.2289, "step": 22610 }, { "epoch": 2.2246809766172455, "grad_norm": 0.7878932952880859, "learning_rate": 1.121289194452172e-05, "loss": 0.2115, "step": 22620 }, { "epoch": 2.225664478375255, "grad_norm": 1.297023892402649, "learning_rate": 1.1208917855581609e-05, "loss": 0.315, "step": 22630 }, { "epoch": 2.2266479801332646, "grad_norm": 0.8216570615768433, "learning_rate": 1.1204943766641499e-05, "loss": 0.2609, "step": 22640 }, { "epoch": 2.2276314818912737, "grad_norm": 1.1088916063308716, "learning_rate": 1.1200969677701388e-05, "loss": 0.1976, "step": 22650 }, { "epoch": 2.2286149836492832, "grad_norm": 3.3971316814422607, "learning_rate": 1.1196995588761276e-05, "loss": 0.1781, "step": 22660 }, { "epoch": 2.2295984854072928, "grad_norm": 2.520642042160034, "learning_rate": 1.1193021499821166e-05, "loss": 0.217, "step": 22670 }, { "epoch": 2.230581987165302, "grad_norm": 0.9775090217590332, "learning_rate": 1.1189047410881055e-05, "loss": 0.1775, "step": 22680 }, { "epoch": 2.2315654889233114, "grad_norm": 0.7490184307098389, "learning_rate": 1.1185073321940947e-05, "loss": 0.1408, "step": 22690 }, { "epoch": 2.232548990681321, "grad_norm": 3.4259045124053955, "learning_rate": 1.1181099233000837e-05, "loss": 0.1902, "step": 22700 }, { "epoch": 2.23353249243933, "grad_norm": 2.450721502304077, "learning_rate": 1.1177125144060726e-05, "loss": 0.2047, "step": 22710 }, { "epoch": 2.2345159941973396, "grad_norm": 0.7697030305862427, "learning_rate": 1.1173151055120614e-05, "loss": 0.2477, "step": 22720 }, { "epoch": 2.235499495955349, "grad_norm": 1.226370930671692, "learning_rate": 1.1169176966180504e-05, "loss": 0.1745, "step": 22730 }, { "epoch": 2.2364829977133582, "grad_norm": 1.002942681312561, "learning_rate": 1.1165202877240393e-05, "loss": 0.1485, "step": 22740 }, { "epoch": 2.2374664994713678, "grad_norm": 0.6475685834884644, "learning_rate": 1.1161228788300283e-05, "loss": 0.2848, "step": 22750 }, { "epoch": 2.2384500012293773, "grad_norm": 1.8169118165969849, "learning_rate": 1.1157254699360171e-05, "loss": 0.2519, "step": 22760 }, { "epoch": 2.2394335029873864, "grad_norm": 1.4185065031051636, "learning_rate": 1.1153280610420062e-05, "loss": 0.1619, "step": 22770 }, { "epoch": 2.240417004745396, "grad_norm": 0.21674694120883942, "learning_rate": 1.1149306521479952e-05, "loss": 0.2106, "step": 22780 }, { "epoch": 2.2414005065034055, "grad_norm": 1.7250815629959106, "learning_rate": 1.1145332432539842e-05, "loss": 0.2229, "step": 22790 }, { "epoch": 2.2423840082614146, "grad_norm": 0.9569165706634521, "learning_rate": 1.1141358343599731e-05, "loss": 0.2533, "step": 22800 }, { "epoch": 2.243367510019424, "grad_norm": 1.190146803855896, "learning_rate": 1.113738425465962e-05, "loss": 0.2204, "step": 22810 }, { "epoch": 2.2443510117774337, "grad_norm": 2.0811798572540283, "learning_rate": 1.1133410165719509e-05, "loss": 0.2003, "step": 22820 }, { "epoch": 2.2453345135354428, "grad_norm": 0.4453285038471222, "learning_rate": 1.1129436076779399e-05, "loss": 0.1955, "step": 22830 }, { "epoch": 2.2463180152934523, "grad_norm": 0.8300244808197021, "learning_rate": 1.1125461987839288e-05, "loss": 0.1842, "step": 22840 }, { "epoch": 2.247301517051462, "grad_norm": 1.4930312633514404, "learning_rate": 1.112148789889918e-05, "loss": 0.2175, "step": 22850 }, { "epoch": 2.248285018809471, "grad_norm": 0.8433923125267029, "learning_rate": 1.1117513809959068e-05, "loss": 0.2786, "step": 22860 }, { "epoch": 2.2492685205674805, "grad_norm": 0.6747217178344727, "learning_rate": 1.1113539721018957e-05, "loss": 0.2544, "step": 22870 }, { "epoch": 2.25025202232549, "grad_norm": 1.8815783262252808, "learning_rate": 1.1109565632078847e-05, "loss": 0.1326, "step": 22880 }, { "epoch": 2.251235524083499, "grad_norm": 1.1676626205444336, "learning_rate": 1.1105591543138737e-05, "loss": 0.1685, "step": 22890 }, { "epoch": 2.2522190258415087, "grad_norm": 0.9920549392700195, "learning_rate": 1.1101617454198625e-05, "loss": 0.1928, "step": 22900 }, { "epoch": 2.2532025275995182, "grad_norm": 0.5512763857841492, "learning_rate": 1.1097643365258514e-05, "loss": 0.1643, "step": 22910 }, { "epoch": 2.2541860293575273, "grad_norm": 3.966198444366455, "learning_rate": 1.1093669276318404e-05, "loss": 0.1933, "step": 22920 }, { "epoch": 2.255169531115537, "grad_norm": 1.7686697244644165, "learning_rate": 1.1089695187378295e-05, "loss": 0.1953, "step": 22930 }, { "epoch": 2.2561530328735464, "grad_norm": 1.5562306642532349, "learning_rate": 1.1085721098438185e-05, "loss": 0.1546, "step": 22940 }, { "epoch": 2.2571365346315555, "grad_norm": 1.2184745073318481, "learning_rate": 1.1081747009498075e-05, "loss": 0.1791, "step": 22950 }, { "epoch": 2.258120036389565, "grad_norm": 2.333268642425537, "learning_rate": 1.1077772920557963e-05, "loss": 0.2178, "step": 22960 }, { "epoch": 2.2591035381475746, "grad_norm": 1.0209434032440186, "learning_rate": 1.1073798831617852e-05, "loss": 0.2372, "step": 22970 }, { "epoch": 2.2600870399055837, "grad_norm": 0.5352984070777893, "learning_rate": 1.1069824742677742e-05, "loss": 0.2114, "step": 22980 }, { "epoch": 2.2610705416635932, "grad_norm": 0.33463746309280396, "learning_rate": 1.1065850653737632e-05, "loss": 0.1422, "step": 22990 }, { "epoch": 2.2620540434216028, "grad_norm": 1.2513502836227417, "learning_rate": 1.106187656479752e-05, "loss": 0.2779, "step": 23000 }, { "epoch": 2.2620540434216028, "eval_loss": 0.12226882576942444, "eval_runtime": 19.3685, "eval_samples_per_second": 2.582, "eval_steps_per_second": 1.291, "step": 23000 }, { "epoch": 2.263037545179612, "grad_norm": 1.8305885791778564, "learning_rate": 1.1057902475857411e-05, "loss": 0.1699, "step": 23010 }, { "epoch": 2.2640210469376214, "grad_norm": 0.9677567481994629, "learning_rate": 1.10539283869173e-05, "loss": 0.1796, "step": 23020 }, { "epoch": 2.265004548695631, "grad_norm": 0.470826119184494, "learning_rate": 1.104995429797719e-05, "loss": 0.1581, "step": 23030 }, { "epoch": 2.26598805045364, "grad_norm": 0.6805540919303894, "learning_rate": 1.104598020903708e-05, "loss": 0.2149, "step": 23040 }, { "epoch": 2.2669715522116496, "grad_norm": 1.141538381576538, "learning_rate": 1.1042006120096968e-05, "loss": 0.2186, "step": 23050 }, { "epoch": 2.267955053969659, "grad_norm": 0.43806010484695435, "learning_rate": 1.1038032031156858e-05, "loss": 0.2429, "step": 23060 }, { "epoch": 2.2689385557276682, "grad_norm": 0.746082067489624, "learning_rate": 1.1034057942216747e-05, "loss": 0.2457, "step": 23070 }, { "epoch": 2.2699220574856778, "grad_norm": 1.439133882522583, "learning_rate": 1.1030083853276637e-05, "loss": 0.1903, "step": 23080 }, { "epoch": 2.2709055592436873, "grad_norm": 0.9891164302825928, "learning_rate": 1.1026109764336528e-05, "loss": 0.3032, "step": 23090 }, { "epoch": 2.2718890610016964, "grad_norm": 0.4387306869029999, "learning_rate": 1.1022135675396416e-05, "loss": 0.2149, "step": 23100 }, { "epoch": 2.272872562759706, "grad_norm": 0.6124128103256226, "learning_rate": 1.1018161586456306e-05, "loss": 0.1631, "step": 23110 }, { "epoch": 2.2738560645177155, "grad_norm": 0.19758909940719604, "learning_rate": 1.1014187497516196e-05, "loss": 0.1581, "step": 23120 }, { "epoch": 2.2748395662757246, "grad_norm": 1.1576956510543823, "learning_rate": 1.1010213408576085e-05, "loss": 0.1836, "step": 23130 }, { "epoch": 2.275823068033734, "grad_norm": 1.2810118198394775, "learning_rate": 1.1006239319635973e-05, "loss": 0.218, "step": 23140 }, { "epoch": 2.2768065697917437, "grad_norm": 2.966301918029785, "learning_rate": 1.1002265230695863e-05, "loss": 0.2945, "step": 23150 }, { "epoch": 2.2777900715497528, "grad_norm": 0.5344410538673401, "learning_rate": 1.0998291141755753e-05, "loss": 0.2595, "step": 23160 }, { "epoch": 2.2787735733077623, "grad_norm": 1.2781692743301392, "learning_rate": 1.0994317052815644e-05, "loss": 0.1449, "step": 23170 }, { "epoch": 2.279757075065772, "grad_norm": 0.9891101717948914, "learning_rate": 1.0990342963875534e-05, "loss": 0.2795, "step": 23180 }, { "epoch": 2.280740576823781, "grad_norm": 8.33986759185791, "learning_rate": 1.0986368874935423e-05, "loss": 0.1602, "step": 23190 }, { "epoch": 2.2817240785817905, "grad_norm": 2.056553602218628, "learning_rate": 1.0982394785995311e-05, "loss": 0.2283, "step": 23200 }, { "epoch": 2.2827075803398, "grad_norm": 0.5050723552703857, "learning_rate": 1.0978420697055201e-05, "loss": 0.1988, "step": 23210 }, { "epoch": 2.283691082097809, "grad_norm": 0.78752601146698, "learning_rate": 1.097444660811509e-05, "loss": 0.1589, "step": 23220 }, { "epoch": 2.2846745838558187, "grad_norm": 1.4518001079559326, "learning_rate": 1.097047251917498e-05, "loss": 0.1503, "step": 23230 }, { "epoch": 2.285658085613828, "grad_norm": 1.8836430311203003, "learning_rate": 1.0966498430234868e-05, "loss": 0.2008, "step": 23240 }, { "epoch": 2.2866415873718373, "grad_norm": 1.1787099838256836, "learning_rate": 1.096252434129476e-05, "loss": 0.2306, "step": 23250 }, { "epoch": 2.287625089129847, "grad_norm": 1.4644756317138672, "learning_rate": 1.095855025235465e-05, "loss": 0.2355, "step": 23260 }, { "epoch": 2.2886085908878564, "grad_norm": 0.6272462606430054, "learning_rate": 1.0954576163414539e-05, "loss": 0.2345, "step": 23270 }, { "epoch": 2.2895920926458655, "grad_norm": 0.36970436573028564, "learning_rate": 1.0950602074474429e-05, "loss": 0.2368, "step": 23280 }, { "epoch": 2.290575594403875, "grad_norm": 1.6562615633010864, "learning_rate": 1.0946627985534317e-05, "loss": 0.2508, "step": 23290 }, { "epoch": 2.2915590961618846, "grad_norm": 0.40460827946662903, "learning_rate": 1.0942653896594206e-05, "loss": 0.1283, "step": 23300 }, { "epoch": 2.2925425979198937, "grad_norm": 0.5433436632156372, "learning_rate": 1.0938679807654096e-05, "loss": 0.2428, "step": 23310 }, { "epoch": 2.293526099677903, "grad_norm": 1.0284267663955688, "learning_rate": 1.0934705718713986e-05, "loss": 0.2417, "step": 23320 }, { "epoch": 2.2945096014359128, "grad_norm": 0.09268907457590103, "learning_rate": 1.0930731629773874e-05, "loss": 0.1898, "step": 23330 }, { "epoch": 2.295493103193922, "grad_norm": 2.299131155014038, "learning_rate": 1.0926757540833765e-05, "loss": 0.1991, "step": 23340 }, { "epoch": 2.2964766049519314, "grad_norm": 0.7069013118743896, "learning_rate": 1.0922783451893655e-05, "loss": 0.2412, "step": 23350 }, { "epoch": 2.2974601067099405, "grad_norm": 4.045040607452393, "learning_rate": 1.0918809362953544e-05, "loss": 0.1945, "step": 23360 }, { "epoch": 2.29844360846795, "grad_norm": 2.47917103767395, "learning_rate": 1.0914835274013434e-05, "loss": 0.2878, "step": 23370 }, { "epoch": 2.2994271102259596, "grad_norm": 1.6581519842147827, "learning_rate": 1.0910861185073322e-05, "loss": 0.1854, "step": 23380 }, { "epoch": 2.3004106119839687, "grad_norm": 4.0485687255859375, "learning_rate": 1.0906887096133212e-05, "loss": 0.1432, "step": 23390 }, { "epoch": 2.301394113741978, "grad_norm": 1.495073676109314, "learning_rate": 1.0902913007193101e-05, "loss": 0.3493, "step": 23400 }, { "epoch": 2.3023776154999878, "grad_norm": 1.586604118347168, "learning_rate": 1.089893891825299e-05, "loss": 0.1864, "step": 23410 }, { "epoch": 2.303361117257997, "grad_norm": 0.8354415893554688, "learning_rate": 1.0894964829312882e-05, "loss": 0.2969, "step": 23420 }, { "epoch": 2.3043446190160064, "grad_norm": 0.16059598326683044, "learning_rate": 1.0890990740372772e-05, "loss": 0.2807, "step": 23430 }, { "epoch": 2.305328120774016, "grad_norm": 1.2058663368225098, "learning_rate": 1.088701665143266e-05, "loss": 0.1765, "step": 23440 }, { "epoch": 2.306311622532025, "grad_norm": 0.7288966178894043, "learning_rate": 1.088304256249255e-05, "loss": 0.2272, "step": 23450 }, { "epoch": 2.3072951242900346, "grad_norm": 0.13346998393535614, "learning_rate": 1.087906847355244e-05, "loss": 0.2205, "step": 23460 }, { "epoch": 2.308278626048044, "grad_norm": 2.4363770484924316, "learning_rate": 1.0875094384612329e-05, "loss": 0.1356, "step": 23470 }, { "epoch": 2.309262127806053, "grad_norm": 0.868334949016571, "learning_rate": 1.0871120295672217e-05, "loss": 0.1763, "step": 23480 }, { "epoch": 2.3102456295640628, "grad_norm": 1.074905276298523, "learning_rate": 1.0867146206732106e-05, "loss": 0.1909, "step": 23490 }, { "epoch": 2.3112291313220723, "grad_norm": 1.5321420431137085, "learning_rate": 1.0863172117791998e-05, "loss": 0.1557, "step": 23500 }, { "epoch": 2.3112291313220723, "eval_loss": 0.12389986217021942, "eval_runtime": 18.2263, "eval_samples_per_second": 2.743, "eval_steps_per_second": 1.372, "step": 23500 }, { "epoch": 2.3122126330800814, "grad_norm": 0.3417229652404785, "learning_rate": 1.0859198028851887e-05, "loss": 0.0932, "step": 23510 }, { "epoch": 2.313196134838091, "grad_norm": 1.651539921760559, "learning_rate": 1.0855223939911777e-05, "loss": 0.2012, "step": 23520 }, { "epoch": 2.3141796365961005, "grad_norm": 1.1157879829406738, "learning_rate": 1.0851249850971665e-05, "loss": 0.1719, "step": 23530 }, { "epoch": 2.3151631383541096, "grad_norm": 1.6907389163970947, "learning_rate": 1.0847275762031555e-05, "loss": 0.21, "step": 23540 }, { "epoch": 2.316146640112119, "grad_norm": 0.9047884345054626, "learning_rate": 1.0843301673091444e-05, "loss": 0.2338, "step": 23550 }, { "epoch": 2.3171301418701287, "grad_norm": 0.9360957145690918, "learning_rate": 1.0839327584151334e-05, "loss": 0.1679, "step": 23560 }, { "epoch": 2.3181136436281378, "grad_norm": 0.7328473925590515, "learning_rate": 1.0835353495211222e-05, "loss": 0.2031, "step": 23570 }, { "epoch": 2.3190971453861473, "grad_norm": 1.0778604745864868, "learning_rate": 1.0831379406271113e-05, "loss": 0.1219, "step": 23580 }, { "epoch": 2.320080647144157, "grad_norm": 1.1837531328201294, "learning_rate": 1.0827405317331003e-05, "loss": 0.2164, "step": 23590 }, { "epoch": 2.321064148902166, "grad_norm": 0.9001725912094116, "learning_rate": 1.0823431228390893e-05, "loss": 0.2259, "step": 23600 }, { "epoch": 2.3220476506601755, "grad_norm": 1.0018750429153442, "learning_rate": 1.0819457139450782e-05, "loss": 0.2306, "step": 23610 }, { "epoch": 2.323031152418185, "grad_norm": 1.3852676153182983, "learning_rate": 1.081548305051067e-05, "loss": 0.2403, "step": 23620 }, { "epoch": 2.324014654176194, "grad_norm": 2.191063404083252, "learning_rate": 1.081150896157056e-05, "loss": 0.2695, "step": 23630 }, { "epoch": 2.3249981559342037, "grad_norm": 6.096485137939453, "learning_rate": 1.080753487263045e-05, "loss": 0.1783, "step": 23640 }, { "epoch": 2.325981657692213, "grad_norm": 1.0244005918502808, "learning_rate": 1.080356078369034e-05, "loss": 0.2792, "step": 23650 }, { "epoch": 2.3269651594502223, "grad_norm": 0.5512336492538452, "learning_rate": 1.079958669475023e-05, "loss": 0.2371, "step": 23660 }, { "epoch": 2.327948661208232, "grad_norm": 0.7230295538902283, "learning_rate": 1.079561260581012e-05, "loss": 0.1569, "step": 23670 }, { "epoch": 2.3289321629662414, "grad_norm": 0.17145349085330963, "learning_rate": 1.0791638516870008e-05, "loss": 0.1774, "step": 23680 }, { "epoch": 2.3299156647242505, "grad_norm": 1.245060920715332, "learning_rate": 1.0787664427929898e-05, "loss": 0.1762, "step": 23690 }, { "epoch": 2.33089916648226, "grad_norm": 0.9476994276046753, "learning_rate": 1.0783690338989788e-05, "loss": 0.2464, "step": 23700 }, { "epoch": 2.3318826682402696, "grad_norm": 0.9879484176635742, "learning_rate": 1.0779716250049677e-05, "loss": 0.2248, "step": 23710 }, { "epoch": 2.3328661699982787, "grad_norm": 0.7402428388595581, "learning_rate": 1.0775742161109565e-05, "loss": 0.2144, "step": 23720 }, { "epoch": 2.333849671756288, "grad_norm": 2.64520263671875, "learning_rate": 1.0771768072169455e-05, "loss": 0.2686, "step": 23730 }, { "epoch": 2.3348331735142978, "grad_norm": 0.9090415239334106, "learning_rate": 1.0767793983229346e-05, "loss": 0.3184, "step": 23740 }, { "epoch": 2.335816675272307, "grad_norm": 1.2032514810562134, "learning_rate": 1.0763819894289236e-05, "loss": 0.2231, "step": 23750 }, { "epoch": 2.3368001770303164, "grad_norm": 1.3694044351577759, "learning_rate": 1.0759845805349126e-05, "loss": 0.2158, "step": 23760 }, { "epoch": 2.337783678788326, "grad_norm": 0.6778462529182434, "learning_rate": 1.0755871716409014e-05, "loss": 0.23, "step": 23770 }, { "epoch": 2.338767180546335, "grad_norm": 0.8439954519271851, "learning_rate": 1.0751897627468903e-05, "loss": 0.1731, "step": 23780 }, { "epoch": 2.3397506823043446, "grad_norm": 0.3221350312232971, "learning_rate": 1.0747923538528793e-05, "loss": 0.1623, "step": 23790 }, { "epoch": 2.340734184062354, "grad_norm": 1.6610746383666992, "learning_rate": 1.0743949449588683e-05, "loss": 0.2062, "step": 23800 }, { "epoch": 2.341717685820363, "grad_norm": 0.34602779150009155, "learning_rate": 1.073997536064857e-05, "loss": 0.1302, "step": 23810 }, { "epoch": 2.3427011875783728, "grad_norm": 0.7501428723335266, "learning_rate": 1.0736001271708462e-05, "loss": 0.1755, "step": 23820 }, { "epoch": 2.3436846893363823, "grad_norm": 1.4959636926651, "learning_rate": 1.0732027182768352e-05, "loss": 0.3559, "step": 23830 }, { "epoch": 2.3446681910943914, "grad_norm": 0.8919134140014648, "learning_rate": 1.0728053093828241e-05, "loss": 0.1897, "step": 23840 }, { "epoch": 2.345651692852401, "grad_norm": 1.2190852165222168, "learning_rate": 1.0724079004888131e-05, "loss": 0.2574, "step": 23850 }, { "epoch": 2.3466351946104105, "grad_norm": 1.1519238948822021, "learning_rate": 1.0720104915948019e-05, "loss": 0.1902, "step": 23860 }, { "epoch": 2.3476186963684196, "grad_norm": 0.5527150630950928, "learning_rate": 1.0716130827007909e-05, "loss": 0.1775, "step": 23870 }, { "epoch": 2.348602198126429, "grad_norm": 0.6248764991760254, "learning_rate": 1.0712156738067798e-05, "loss": 0.1542, "step": 23880 }, { "epoch": 2.3495856998844387, "grad_norm": 0.46714580059051514, "learning_rate": 1.0708182649127688e-05, "loss": 0.2932, "step": 23890 }, { "epoch": 2.3505692016424478, "grad_norm": 0.5677179098129272, "learning_rate": 1.070420856018758e-05, "loss": 0.2823, "step": 23900 }, { "epoch": 2.3515527034004573, "grad_norm": 0.47147136926651, "learning_rate": 1.0700234471247469e-05, "loss": 0.1622, "step": 23910 }, { "epoch": 2.352536205158467, "grad_norm": 1.528113603591919, "learning_rate": 1.0696260382307357e-05, "loss": 0.2353, "step": 23920 }, { "epoch": 2.353519706916476, "grad_norm": 1.0832566022872925, "learning_rate": 1.0692286293367247e-05, "loss": 0.1539, "step": 23930 }, { "epoch": 2.3545032086744855, "grad_norm": 2.5159566402435303, "learning_rate": 1.0688312204427136e-05, "loss": 0.222, "step": 23940 }, { "epoch": 2.355486710432495, "grad_norm": 0.45711979269981384, "learning_rate": 1.0684338115487026e-05, "loss": 0.2243, "step": 23950 }, { "epoch": 2.356470212190504, "grad_norm": 0.749158501625061, "learning_rate": 1.0680364026546914e-05, "loss": 0.1643, "step": 23960 }, { "epoch": 2.3574537139485137, "grad_norm": 0.9952805042266846, "learning_rate": 1.0676389937606804e-05, "loss": 0.168, "step": 23970 }, { "epoch": 2.358437215706523, "grad_norm": 2.383165121078491, "learning_rate": 1.0672415848666695e-05, "loss": 0.196, "step": 23980 }, { "epoch": 2.3594207174645323, "grad_norm": 1.5254125595092773, "learning_rate": 1.0668441759726585e-05, "loss": 0.1929, "step": 23990 }, { "epoch": 2.360404219222542, "grad_norm": 1.1301990747451782, "learning_rate": 1.0664467670786474e-05, "loss": 0.202, "step": 24000 }, { "epoch": 2.360404219222542, "eval_loss": 0.12581513822078705, "eval_runtime": 20.8817, "eval_samples_per_second": 2.394, "eval_steps_per_second": 1.197, "step": 24000 }, { "epoch": 2.3613877209805514, "grad_norm": 0.46260926127433777, "learning_rate": 1.0660493581846362e-05, "loss": 0.1141, "step": 24010 }, { "epoch": 2.3623712227385605, "grad_norm": 0.6546036005020142, "learning_rate": 1.0656519492906252e-05, "loss": 0.2127, "step": 24020 }, { "epoch": 2.36335472449657, "grad_norm": 0.46774768829345703, "learning_rate": 1.0652545403966142e-05, "loss": 0.264, "step": 24030 }, { "epoch": 2.3643382262545796, "grad_norm": 0.9853132367134094, "learning_rate": 1.0648571315026031e-05, "loss": 0.1887, "step": 24040 }, { "epoch": 2.3653217280125887, "grad_norm": 0.7118104100227356, "learning_rate": 1.064459722608592e-05, "loss": 0.2641, "step": 24050 }, { "epoch": 2.366305229770598, "grad_norm": 2.5412228107452393, "learning_rate": 1.064062313714581e-05, "loss": 0.2682, "step": 24060 }, { "epoch": 2.3672887315286077, "grad_norm": 0.961894154548645, "learning_rate": 1.06366490482057e-05, "loss": 0.1522, "step": 24070 }, { "epoch": 2.368272233286617, "grad_norm": 0.3436634838581085, "learning_rate": 1.063267495926559e-05, "loss": 0.1739, "step": 24080 }, { "epoch": 2.3692557350446264, "grad_norm": 0.6866682171821594, "learning_rate": 1.062870087032548e-05, "loss": 0.191, "step": 24090 }, { "epoch": 2.370239236802636, "grad_norm": 1.4524731636047363, "learning_rate": 1.0624726781385368e-05, "loss": 0.1792, "step": 24100 }, { "epoch": 2.371222738560645, "grad_norm": 1.2320064306259155, "learning_rate": 1.0620752692445257e-05, "loss": 0.2205, "step": 24110 }, { "epoch": 2.3722062403186546, "grad_norm": 1.1350302696228027, "learning_rate": 1.0616778603505147e-05, "loss": 0.2501, "step": 24120 }, { "epoch": 2.373189742076664, "grad_norm": 3.394150495529175, "learning_rate": 1.0612804514565037e-05, "loss": 0.1806, "step": 24130 }, { "epoch": 2.374173243834673, "grad_norm": 0.8134811520576477, "learning_rate": 1.0608830425624925e-05, "loss": 0.255, "step": 24140 }, { "epoch": 2.3751567455926827, "grad_norm": 0.6805379986763, "learning_rate": 1.0604856336684818e-05, "loss": 0.1241, "step": 24150 }, { "epoch": 2.3761402473506923, "grad_norm": 0.5397747159004211, "learning_rate": 1.0600882247744706e-05, "loss": 0.2283, "step": 24160 }, { "epoch": 2.3771237491087014, "grad_norm": 0.2824234664440155, "learning_rate": 1.0596908158804595e-05, "loss": 0.2521, "step": 24170 }, { "epoch": 2.378107250866711, "grad_norm": 2.4280147552490234, "learning_rate": 1.0592934069864485e-05, "loss": 0.2096, "step": 24180 }, { "epoch": 2.3790907526247205, "grad_norm": 0.49068471789360046, "learning_rate": 1.0588959980924375e-05, "loss": 0.2259, "step": 24190 }, { "epoch": 2.3800742543827296, "grad_norm": 1.3167073726654053, "learning_rate": 1.0584985891984262e-05, "loss": 0.1058, "step": 24200 }, { "epoch": 2.381057756140739, "grad_norm": 0.7268620729446411, "learning_rate": 1.0581011803044152e-05, "loss": 0.1939, "step": 24210 }, { "epoch": 2.3820412578987487, "grad_norm": 0.5210294127464294, "learning_rate": 1.0577037714104042e-05, "loss": 0.1967, "step": 24220 }, { "epoch": 2.3830247596567578, "grad_norm": 2.116398811340332, "learning_rate": 1.0573063625163933e-05, "loss": 0.242, "step": 24230 }, { "epoch": 2.3840082614147673, "grad_norm": 0.8582640886306763, "learning_rate": 1.0569089536223823e-05, "loss": 0.1509, "step": 24240 }, { "epoch": 2.384991763172777, "grad_norm": 0.9974415898323059, "learning_rate": 1.056511544728371e-05, "loss": 0.2076, "step": 24250 }, { "epoch": 2.385975264930786, "grad_norm": 1.5299766063690186, "learning_rate": 1.05611413583436e-05, "loss": 0.2533, "step": 24260 }, { "epoch": 2.3869587666887955, "grad_norm": 0.9930517673492432, "learning_rate": 1.055716726940349e-05, "loss": 0.1992, "step": 24270 }, { "epoch": 2.387942268446805, "grad_norm": 0.5722897052764893, "learning_rate": 1.055319318046338e-05, "loss": 0.1644, "step": 24280 }, { "epoch": 2.388925770204814, "grad_norm": 1.3460361957550049, "learning_rate": 1.0549219091523268e-05, "loss": 0.2251, "step": 24290 }, { "epoch": 2.3899092719628237, "grad_norm": 0.635595977306366, "learning_rate": 1.0545245002583157e-05, "loss": 0.2137, "step": 24300 }, { "epoch": 2.390892773720833, "grad_norm": 2.7226004600524902, "learning_rate": 1.0541270913643049e-05, "loss": 0.1122, "step": 24310 }, { "epoch": 2.3918762754788423, "grad_norm": 1.5168453454971313, "learning_rate": 1.0537296824702938e-05, "loss": 0.1795, "step": 24320 }, { "epoch": 2.392859777236852, "grad_norm": 0.39879947900772095, "learning_rate": 1.0533322735762828e-05, "loss": 0.295, "step": 24330 }, { "epoch": 2.3938432789948614, "grad_norm": 0.5160859227180481, "learning_rate": 1.0529348646822716e-05, "loss": 0.1735, "step": 24340 }, { "epoch": 2.3948267807528705, "grad_norm": 1.0614607334136963, "learning_rate": 1.0525374557882606e-05, "loss": 0.2118, "step": 24350 }, { "epoch": 2.39581028251088, "grad_norm": 1.070089340209961, "learning_rate": 1.0521400468942495e-05, "loss": 0.2928, "step": 24360 }, { "epoch": 2.3967937842688896, "grad_norm": 0.6737170815467834, "learning_rate": 1.0517426380002385e-05, "loss": 0.2404, "step": 24370 }, { "epoch": 2.3977772860268987, "grad_norm": 1.382066011428833, "learning_rate": 1.0513452291062273e-05, "loss": 0.3195, "step": 24380 }, { "epoch": 2.398760787784908, "grad_norm": 1.577677607536316, "learning_rate": 1.0509478202122166e-05, "loss": 0.2037, "step": 24390 }, { "epoch": 2.3997442895429177, "grad_norm": 0.7233482003211975, "learning_rate": 1.0505504113182054e-05, "loss": 0.2168, "step": 24400 }, { "epoch": 2.400727791300927, "grad_norm": 1.9310663938522339, "learning_rate": 1.0501530024241944e-05, "loss": 0.1892, "step": 24410 }, { "epoch": 2.4017112930589364, "grad_norm": 0.4090423882007599, "learning_rate": 1.0497555935301833e-05, "loss": 0.2314, "step": 24420 }, { "epoch": 2.402694794816946, "grad_norm": 0.6759452819824219, "learning_rate": 1.0493581846361723e-05, "loss": 0.2013, "step": 24430 }, { "epoch": 2.403678296574955, "grad_norm": 1.5449095964431763, "learning_rate": 1.0489607757421611e-05, "loss": 0.2065, "step": 24440 }, { "epoch": 2.4046617983329646, "grad_norm": 0.8562779426574707, "learning_rate": 1.04856336684815e-05, "loss": 0.0849, "step": 24450 }, { "epoch": 2.405645300090974, "grad_norm": 2.506950855255127, "learning_rate": 1.048165957954139e-05, "loss": 0.2349, "step": 24460 }, { "epoch": 2.406628801848983, "grad_norm": 0.3711108863353729, "learning_rate": 1.0477685490601282e-05, "loss": 0.1949, "step": 24470 }, { "epoch": 2.4076123036069927, "grad_norm": 0.5963961482048035, "learning_rate": 1.0473711401661171e-05, "loss": 0.3249, "step": 24480 }, { "epoch": 2.4085958053650023, "grad_norm": 0.1989096999168396, "learning_rate": 1.046973731272106e-05, "loss": 0.1634, "step": 24490 }, { "epoch": 2.4095793071230114, "grad_norm": 1.052170753479004, "learning_rate": 1.0465763223780949e-05, "loss": 0.1794, "step": 24500 }, { "epoch": 2.4095793071230114, "eval_loss": 0.12587225437164307, "eval_runtime": 16.8866, "eval_samples_per_second": 2.961, "eval_steps_per_second": 1.48, "step": 24500 }, { "epoch": 2.410562808881021, "grad_norm": 0.8879571557044983, "learning_rate": 1.0461789134840839e-05, "loss": 0.2561, "step": 24510 }, { "epoch": 2.4115463106390305, "grad_norm": 0.6819064617156982, "learning_rate": 1.0457815045900728e-05, "loss": 0.2363, "step": 24520 }, { "epoch": 2.4125298123970396, "grad_norm": 0.4369518458843231, "learning_rate": 1.0453840956960616e-05, "loss": 0.2366, "step": 24530 }, { "epoch": 2.413513314155049, "grad_norm": 0.3565225303173065, "learning_rate": 1.0449866868020506e-05, "loss": 0.2178, "step": 24540 }, { "epoch": 2.4144968159130586, "grad_norm": 0.8098437190055847, "learning_rate": 1.0445892779080397e-05, "loss": 0.278, "step": 24550 }, { "epoch": 2.4154803176710677, "grad_norm": 0.873010516166687, "learning_rate": 1.0441918690140287e-05, "loss": 0.314, "step": 24560 }, { "epoch": 2.4164638194290773, "grad_norm": 1.4115465879440308, "learning_rate": 1.0437944601200177e-05, "loss": 0.2881, "step": 24570 }, { "epoch": 2.417447321187087, "grad_norm": 1.7016613483428955, "learning_rate": 1.0433970512260065e-05, "loss": 0.3005, "step": 24580 }, { "epoch": 2.418430822945096, "grad_norm": 0.9918956756591797, "learning_rate": 1.0429996423319954e-05, "loss": 0.2868, "step": 24590 }, { "epoch": 2.4194143247031055, "grad_norm": 0.4580109417438507, "learning_rate": 1.0426022334379844e-05, "loss": 0.192, "step": 24600 }, { "epoch": 2.420397826461115, "grad_norm": 0.97365802526474, "learning_rate": 1.0422048245439734e-05, "loss": 0.2039, "step": 24610 }, { "epoch": 2.421381328219124, "grad_norm": 1.3537076711654663, "learning_rate": 1.0418074156499622e-05, "loss": 0.1981, "step": 24620 }, { "epoch": 2.4223648299771336, "grad_norm": 0.6715251803398132, "learning_rate": 1.0414100067559515e-05, "loss": 0.2591, "step": 24630 }, { "epoch": 2.423348331735143, "grad_norm": 1.817028284072876, "learning_rate": 1.0410125978619403e-05, "loss": 0.181, "step": 24640 }, { "epoch": 2.4243318334931523, "grad_norm": 1.0385745763778687, "learning_rate": 1.0406151889679292e-05, "loss": 0.1753, "step": 24650 }, { "epoch": 2.425315335251162, "grad_norm": 0.4937601089477539, "learning_rate": 1.0402177800739182e-05, "loss": 0.1725, "step": 24660 }, { "epoch": 2.4262988370091714, "grad_norm": 1.1675567626953125, "learning_rate": 1.0398203711799072e-05, "loss": 0.2863, "step": 24670 }, { "epoch": 2.4272823387671805, "grad_norm": 0.7065426111221313, "learning_rate": 1.039422962285896e-05, "loss": 0.2022, "step": 24680 }, { "epoch": 2.42826584052519, "grad_norm": 0.770263671875, "learning_rate": 1.039025553391885e-05, "loss": 0.2201, "step": 24690 }, { "epoch": 2.4292493422831996, "grad_norm": 0.46510329842567444, "learning_rate": 1.0386281444978739e-05, "loss": 0.2175, "step": 24700 }, { "epoch": 2.4302328440412087, "grad_norm": 0.845753014087677, "learning_rate": 1.038230735603863e-05, "loss": 0.2049, "step": 24710 }, { "epoch": 2.431216345799218, "grad_norm": 1.2518194913864136, "learning_rate": 1.037833326709852e-05, "loss": 0.2833, "step": 24720 }, { "epoch": 2.4321998475572277, "grad_norm": 1.889909029006958, "learning_rate": 1.0374359178158408e-05, "loss": 0.1633, "step": 24730 }, { "epoch": 2.433183349315237, "grad_norm": 0.6773773431777954, "learning_rate": 1.0370385089218298e-05, "loss": 0.2636, "step": 24740 }, { "epoch": 2.4341668510732464, "grad_norm": 1.3301688432693481, "learning_rate": 1.0366411000278187e-05, "loss": 0.1885, "step": 24750 }, { "epoch": 2.435150352831256, "grad_norm": 0.4370648264884949, "learning_rate": 1.0362436911338077e-05, "loss": 0.2366, "step": 24760 }, { "epoch": 2.436133854589265, "grad_norm": 1.1172949075698853, "learning_rate": 1.0358462822397965e-05, "loss": 0.2925, "step": 24770 }, { "epoch": 2.4371173563472746, "grad_norm": 0.37216636538505554, "learning_rate": 1.0354488733457855e-05, "loss": 0.2844, "step": 24780 }, { "epoch": 2.438100858105284, "grad_norm": 0.6310292482376099, "learning_rate": 1.0350514644517746e-05, "loss": 0.2222, "step": 24790 }, { "epoch": 2.439084359863293, "grad_norm": 1.4703803062438965, "learning_rate": 1.0346540555577636e-05, "loss": 0.1935, "step": 24800 }, { "epoch": 2.4400678616213027, "grad_norm": 0.8147374987602234, "learning_rate": 1.0342566466637525e-05, "loss": 0.1447, "step": 24810 }, { "epoch": 2.4410513633793123, "grad_norm": 0.6662246584892273, "learning_rate": 1.0338592377697413e-05, "loss": 0.1878, "step": 24820 }, { "epoch": 2.4420348651373214, "grad_norm": 0.7072861194610596, "learning_rate": 1.0334618288757303e-05, "loss": 0.3015, "step": 24830 }, { "epoch": 2.443018366895331, "grad_norm": 1.876746416091919, "learning_rate": 1.0330644199817193e-05, "loss": 0.3142, "step": 24840 }, { "epoch": 2.4440018686533405, "grad_norm": 0.9753235578536987, "learning_rate": 1.0326670110877082e-05, "loss": 0.2081, "step": 24850 }, { "epoch": 2.4449853704113496, "grad_norm": 1.0954594612121582, "learning_rate": 1.032269602193697e-05, "loss": 0.3415, "step": 24860 }, { "epoch": 2.445968872169359, "grad_norm": 1.0371835231781006, "learning_rate": 1.0318721932996863e-05, "loss": 0.23, "step": 24870 }, { "epoch": 2.4469523739273686, "grad_norm": 1.1946139335632324, "learning_rate": 1.0314747844056751e-05, "loss": 0.1863, "step": 24880 }, { "epoch": 2.4479358756853777, "grad_norm": 0.19204482436180115, "learning_rate": 1.0310773755116641e-05, "loss": 0.2889, "step": 24890 }, { "epoch": 2.4489193774433873, "grad_norm": 1.4059542417526245, "learning_rate": 1.030679966617653e-05, "loss": 0.2667, "step": 24900 }, { "epoch": 2.449902879201397, "grad_norm": 0.19449950754642487, "learning_rate": 1.030282557723642e-05, "loss": 0.2618, "step": 24910 }, { "epoch": 2.450886380959406, "grad_norm": 0.6006851196289062, "learning_rate": 1.0298851488296308e-05, "loss": 0.2489, "step": 24920 }, { "epoch": 2.4518698827174155, "grad_norm": 1.953939437866211, "learning_rate": 1.0294877399356198e-05, "loss": 0.226, "step": 24930 }, { "epoch": 2.4528533844754246, "grad_norm": 1.157392144203186, "learning_rate": 1.0290903310416088e-05, "loss": 0.1784, "step": 24940 }, { "epoch": 2.453836886233434, "grad_norm": 0.37648680806159973, "learning_rate": 1.0286929221475977e-05, "loss": 0.2387, "step": 24950 }, { "epoch": 2.4548203879914436, "grad_norm": 1.1377531290054321, "learning_rate": 1.0282955132535869e-05, "loss": 0.1905, "step": 24960 }, { "epoch": 2.4558038897494527, "grad_norm": 1.6910361051559448, "learning_rate": 1.0278981043595757e-05, "loss": 0.1878, "step": 24970 }, { "epoch": 2.4567873915074623, "grad_norm": 2.3196709156036377, "learning_rate": 1.0275006954655646e-05, "loss": 0.2849, "step": 24980 }, { "epoch": 2.457770893265472, "grad_norm": 1.3127903938293457, "learning_rate": 1.0271032865715536e-05, "loss": 0.2607, "step": 24990 }, { "epoch": 2.458754395023481, "grad_norm": 1.0793159008026123, "learning_rate": 1.0267058776775426e-05, "loss": 0.2561, "step": 25000 }, { "epoch": 2.458754395023481, "eval_loss": 0.12595096230506897, "eval_runtime": 18.2255, "eval_samples_per_second": 2.743, "eval_steps_per_second": 1.372, "step": 25000 }, { "epoch": 2.4597378967814905, "grad_norm": 1.686716914176941, "learning_rate": 1.0263084687835313e-05, "loss": 0.175, "step": 25010 }, { "epoch": 2.4607213985395, "grad_norm": 1.6246962547302246, "learning_rate": 1.0259110598895203e-05, "loss": 0.2054, "step": 25020 }, { "epoch": 2.461704900297509, "grad_norm": 0.8834644556045532, "learning_rate": 1.0255136509955093e-05, "loss": 0.264, "step": 25030 }, { "epoch": 2.4626884020555186, "grad_norm": 2.999387264251709, "learning_rate": 1.0251162421014984e-05, "loss": 0.1614, "step": 25040 }, { "epoch": 2.463671903813528, "grad_norm": 0.9904057383537292, "learning_rate": 1.0247188332074874e-05, "loss": 0.177, "step": 25050 }, { "epoch": 2.4646554055715373, "grad_norm": 0.770376980304718, "learning_rate": 1.0243214243134762e-05, "loss": 0.2024, "step": 25060 }, { "epoch": 2.465638907329547, "grad_norm": 2.4670302867889404, "learning_rate": 1.0239240154194651e-05, "loss": 0.0944, "step": 25070 }, { "epoch": 2.4666224090875564, "grad_norm": 0.5328916907310486, "learning_rate": 1.0235266065254541e-05, "loss": 0.2334, "step": 25080 }, { "epoch": 2.4676059108455655, "grad_norm": 0.724310576915741, "learning_rate": 1.023129197631443e-05, "loss": 0.1495, "step": 25090 }, { "epoch": 2.468589412603575, "grad_norm": 0.9128016233444214, "learning_rate": 1.0227317887374319e-05, "loss": 0.159, "step": 25100 }, { "epoch": 2.4695729143615845, "grad_norm": 0.7053813934326172, "learning_rate": 1.0223343798434208e-05, "loss": 0.3856, "step": 25110 }, { "epoch": 2.4705564161195936, "grad_norm": 1.1937212944030762, "learning_rate": 1.02193697094941e-05, "loss": 0.17, "step": 25120 }, { "epoch": 2.471539917877603, "grad_norm": 1.3782202005386353, "learning_rate": 1.021539562055399e-05, "loss": 0.1572, "step": 25130 }, { "epoch": 2.4725234196356127, "grad_norm": 0.8486477136611938, "learning_rate": 1.0211421531613879e-05, "loss": 0.1794, "step": 25140 }, { "epoch": 2.473506921393622, "grad_norm": 0.7491905093193054, "learning_rate": 1.0207447442673769e-05, "loss": 0.1787, "step": 25150 }, { "epoch": 2.4744904231516314, "grad_norm": 0.6677191853523254, "learning_rate": 1.0203473353733657e-05, "loss": 0.1964, "step": 25160 }, { "epoch": 2.475473924909641, "grad_norm": 1.6847872734069824, "learning_rate": 1.0199499264793546e-05, "loss": 0.1782, "step": 25170 }, { "epoch": 2.47645742666765, "grad_norm": 1.3266242742538452, "learning_rate": 1.0195525175853436e-05, "loss": 0.1698, "step": 25180 }, { "epoch": 2.4774409284256595, "grad_norm": 0.22463712096214294, "learning_rate": 1.0191551086913326e-05, "loss": 0.1967, "step": 25190 }, { "epoch": 2.478424430183669, "grad_norm": 1.7434459924697876, "learning_rate": 1.0187576997973217e-05, "loss": 0.3137, "step": 25200 }, { "epoch": 2.479407931941678, "grad_norm": 2.9534683227539062, "learning_rate": 1.0183602909033105e-05, "loss": 0.1166, "step": 25210 }, { "epoch": 2.4803914336996877, "grad_norm": 0.9041188359260559, "learning_rate": 1.0179628820092995e-05, "loss": 0.2327, "step": 25220 }, { "epoch": 2.4813749354576973, "grad_norm": 2.4157469272613525, "learning_rate": 1.0175654731152884e-05, "loss": 0.2136, "step": 25230 }, { "epoch": 2.4823584372157064, "grad_norm": 3.838491201400757, "learning_rate": 1.0171680642212774e-05, "loss": 0.3159, "step": 25240 }, { "epoch": 2.483341938973716, "grad_norm": 2.9579169750213623, "learning_rate": 1.0167706553272662e-05, "loss": 0.2254, "step": 25250 }, { "epoch": 2.4843254407317255, "grad_norm": 2.0895047187805176, "learning_rate": 1.0163732464332552e-05, "loss": 0.2235, "step": 25260 }, { "epoch": 2.4853089424897346, "grad_norm": 0.3538540303707123, "learning_rate": 1.0159758375392441e-05, "loss": 0.254, "step": 25270 }, { "epoch": 2.486292444247744, "grad_norm": 0.8005692958831787, "learning_rate": 1.0155784286452333e-05, "loss": 0.2868, "step": 25280 }, { "epoch": 2.4872759460057536, "grad_norm": 0.44420817494392395, "learning_rate": 1.0151810197512222e-05, "loss": 0.1653, "step": 25290 }, { "epoch": 2.4882594477637627, "grad_norm": 0.8710966110229492, "learning_rate": 1.014783610857211e-05, "loss": 0.1818, "step": 25300 }, { "epoch": 2.4892429495217723, "grad_norm": 2.267538070678711, "learning_rate": 1.0143862019632e-05, "loss": 0.149, "step": 25310 }, { "epoch": 2.490226451279782, "grad_norm": 0.5567754507064819, "learning_rate": 1.013988793069189e-05, "loss": 0.1821, "step": 25320 }, { "epoch": 2.491209953037791, "grad_norm": 0.5361708998680115, "learning_rate": 1.013591384175178e-05, "loss": 0.1408, "step": 25330 }, { "epoch": 2.4921934547958005, "grad_norm": 1.3161852359771729, "learning_rate": 1.0131939752811667e-05, "loss": 0.1775, "step": 25340 }, { "epoch": 2.49317695655381, "grad_norm": 1.63074791431427, "learning_rate": 1.0127965663871557e-05, "loss": 0.1754, "step": 25350 }, { "epoch": 2.494160458311819, "grad_norm": 0.591672956943512, "learning_rate": 1.0123991574931448e-05, "loss": 0.2224, "step": 25360 }, { "epoch": 2.4951439600698286, "grad_norm": 0.8783940672874451, "learning_rate": 1.0120017485991338e-05, "loss": 0.1685, "step": 25370 }, { "epoch": 2.496127461827838, "grad_norm": 1.294005036354065, "learning_rate": 1.0116043397051228e-05, "loss": 0.2783, "step": 25380 }, { "epoch": 2.4971109635858473, "grad_norm": 1.8267871141433716, "learning_rate": 1.0112069308111117e-05, "loss": 0.2339, "step": 25390 }, { "epoch": 2.498094465343857, "grad_norm": 1.058579683303833, "learning_rate": 1.0108095219171005e-05, "loss": 0.1437, "step": 25400 }, { "epoch": 2.4990779671018664, "grad_norm": 0.35047388076782227, "learning_rate": 1.0104121130230895e-05, "loss": 0.1035, "step": 25410 }, { "epoch": 2.5000614688598755, "grad_norm": 0.5837266445159912, "learning_rate": 1.0100147041290785e-05, "loss": 0.1586, "step": 25420 }, { "epoch": 2.501044970617885, "grad_norm": 0.7025007009506226, "learning_rate": 1.0096172952350674e-05, "loss": 0.2133, "step": 25430 }, { "epoch": 2.502028472375894, "grad_norm": 1.1473788022994995, "learning_rate": 1.0092198863410566e-05, "loss": 0.1158, "step": 25440 }, { "epoch": 2.5030119741339036, "grad_norm": 1.7159006595611572, "learning_rate": 1.0088224774470454e-05, "loss": 0.2806, "step": 25450 }, { "epoch": 2.503995475891913, "grad_norm": 2.7162020206451416, "learning_rate": 1.0084250685530343e-05, "loss": 0.2626, "step": 25460 }, { "epoch": 2.5049789776499223, "grad_norm": 0.5796257257461548, "learning_rate": 1.0080276596590233e-05, "loss": 0.2202, "step": 25470 }, { "epoch": 2.505962479407932, "grad_norm": 0.4555465579032898, "learning_rate": 1.0076302507650123e-05, "loss": 0.1799, "step": 25480 }, { "epoch": 2.5069459811659414, "grad_norm": 0.17807602882385254, "learning_rate": 1.007232841871001e-05, "loss": 0.2468, "step": 25490 }, { "epoch": 2.5079294829239505, "grad_norm": 1.4680837392807007, "learning_rate": 1.00683543297699e-05, "loss": 0.2256, "step": 25500 }, { "epoch": 2.5079294829239505, "eval_loss": 0.12521879374980927, "eval_runtime": 18.9765, "eval_samples_per_second": 2.635, "eval_steps_per_second": 1.317, "step": 25500 }, { "epoch": 2.50891298468196, "grad_norm": 0.8120176792144775, "learning_rate": 1.006438024082979e-05, "loss": 0.2246, "step": 25510 }, { "epoch": 2.5098964864399695, "grad_norm": 1.7058813571929932, "learning_rate": 1.0060406151889681e-05, "loss": 0.1898, "step": 25520 }, { "epoch": 2.5108799881979786, "grad_norm": 2.02591609954834, "learning_rate": 1.0056432062949571e-05, "loss": 0.1844, "step": 25530 }, { "epoch": 2.511863489955988, "grad_norm": 0.452986478805542, "learning_rate": 1.0052457974009459e-05, "loss": 0.1278, "step": 25540 }, { "epoch": 2.5128469917139977, "grad_norm": 0.6622622609138489, "learning_rate": 1.0048483885069349e-05, "loss": 0.2361, "step": 25550 }, { "epoch": 2.513830493472007, "grad_norm": 2.510401487350464, "learning_rate": 1.0044509796129238e-05, "loss": 0.2983, "step": 25560 }, { "epoch": 2.5148139952300164, "grad_norm": 1.1683661937713623, "learning_rate": 1.0040535707189128e-05, "loss": 0.2436, "step": 25570 }, { "epoch": 2.515797496988026, "grad_norm": 1.2521823644638062, "learning_rate": 1.0036561618249016e-05, "loss": 0.145, "step": 25580 }, { "epoch": 2.516780998746035, "grad_norm": 1.279665470123291, "learning_rate": 1.0032587529308906e-05, "loss": 0.1753, "step": 25590 }, { "epoch": 2.5177645005040445, "grad_norm": 0.5141640901565552, "learning_rate": 1.0028613440368797e-05, "loss": 0.2364, "step": 25600 }, { "epoch": 2.518748002262054, "grad_norm": 0.45487838983535767, "learning_rate": 1.0024639351428687e-05, "loss": 0.1719, "step": 25610 }, { "epoch": 2.519731504020063, "grad_norm": 0.6889397501945496, "learning_rate": 1.0020665262488576e-05, "loss": 0.2684, "step": 25620 }, { "epoch": 2.5207150057780727, "grad_norm": 1.1064757108688354, "learning_rate": 1.0016691173548466e-05, "loss": 0.2345, "step": 25630 }, { "epoch": 2.5216985075360823, "grad_norm": 1.3356391191482544, "learning_rate": 1.0012717084608354e-05, "loss": 0.1524, "step": 25640 }, { "epoch": 2.5226820092940914, "grad_norm": 1.0441110134124756, "learning_rate": 1.0008742995668244e-05, "loss": 0.1907, "step": 25650 }, { "epoch": 2.523665511052101, "grad_norm": 0.581211268901825, "learning_rate": 1.0004768906728133e-05, "loss": 0.1763, "step": 25660 }, { "epoch": 2.5246490128101104, "grad_norm": 2.296384811401367, "learning_rate": 1.0000794817788023e-05, "loss": 0.291, "step": 25670 }, { "epoch": 2.5256325145681195, "grad_norm": 0.5643019080162048, "learning_rate": 9.996820728847913e-06, "loss": 0.2914, "step": 25680 }, { "epoch": 2.526616016326129, "grad_norm": 1.7198952436447144, "learning_rate": 9.992846639907802e-06, "loss": 0.1675, "step": 25690 }, { "epoch": 2.5275995180841386, "grad_norm": 0.4589093029499054, "learning_rate": 9.98887255096769e-06, "loss": 0.2152, "step": 25700 }, { "epoch": 2.5285830198421477, "grad_norm": 1.0957061052322388, "learning_rate": 9.984898462027582e-06, "loss": 0.1656, "step": 25710 }, { "epoch": 2.5295665216001573, "grad_norm": 1.6807781457901, "learning_rate": 9.980924373087471e-06, "loss": 0.2362, "step": 25720 }, { "epoch": 2.530550023358167, "grad_norm": 0.4425702691078186, "learning_rate": 9.97695028414736e-06, "loss": 0.2482, "step": 25730 }, { "epoch": 2.531533525116176, "grad_norm": 0.26340290904045105, "learning_rate": 9.972976195207249e-06, "loss": 0.1555, "step": 25740 }, { "epoch": 2.5325170268741855, "grad_norm": 0.6684683561325073, "learning_rate": 9.96900210626714e-06, "loss": 0.2472, "step": 25750 }, { "epoch": 2.533500528632195, "grad_norm": 2.2530484199523926, "learning_rate": 9.965028017327028e-06, "loss": 0.1788, "step": 25760 }, { "epoch": 2.534484030390204, "grad_norm": 1.1162317991256714, "learning_rate": 9.961053928386918e-06, "loss": 0.2171, "step": 25770 }, { "epoch": 2.5354675321482136, "grad_norm": 0.19642123579978943, "learning_rate": 9.957079839446808e-06, "loss": 0.1548, "step": 25780 }, { "epoch": 2.536451033906223, "grad_norm": 1.437812328338623, "learning_rate": 9.953105750506697e-06, "loss": 0.3077, "step": 25790 }, { "epoch": 2.5374345356642323, "grad_norm": 0.3670991361141205, "learning_rate": 9.949131661566587e-06, "loss": 0.228, "step": 25800 }, { "epoch": 2.538418037422242, "grad_norm": 0.945225715637207, "learning_rate": 9.945157572626476e-06, "loss": 0.2034, "step": 25810 }, { "epoch": 2.5394015391802514, "grad_norm": 1.6273648738861084, "learning_rate": 9.941183483686364e-06, "loss": 0.1443, "step": 25820 }, { "epoch": 2.5403850409382605, "grad_norm": 0.4249015152454376, "learning_rate": 9.937209394746256e-06, "loss": 0.0696, "step": 25830 }, { "epoch": 2.54136854269627, "grad_norm": 1.2602684497833252, "learning_rate": 9.933235305806145e-06, "loss": 0.2626, "step": 25840 }, { "epoch": 2.5423520444542795, "grad_norm": 0.5752518773078918, "learning_rate": 9.929261216866033e-06, "loss": 0.2839, "step": 25850 }, { "epoch": 2.5433355462122886, "grad_norm": 1.0904006958007812, "learning_rate": 9.925287127925923e-06, "loss": 0.1481, "step": 25860 }, { "epoch": 2.544319047970298, "grad_norm": 0.8281343579292297, "learning_rate": 9.921313038985814e-06, "loss": 0.2193, "step": 25870 }, { "epoch": 2.5453025497283077, "grad_norm": 0.5674927830696106, "learning_rate": 9.917338950045702e-06, "loss": 0.224, "step": 25880 }, { "epoch": 2.546286051486317, "grad_norm": 0.3176279664039612, "learning_rate": 9.913364861105592e-06, "loss": 0.2513, "step": 25890 }, { "epoch": 2.5472695532443264, "grad_norm": 1.221111536026001, "learning_rate": 9.909390772165482e-06, "loss": 0.1335, "step": 25900 }, { "epoch": 2.548253055002336, "grad_norm": 0.720691978931427, "learning_rate": 9.905416683225371e-06, "loss": 0.1934, "step": 25910 }, { "epoch": 2.549236556760345, "grad_norm": 2.970773696899414, "learning_rate": 9.901442594285261e-06, "loss": 0.2014, "step": 25920 }, { "epoch": 2.5502200585183545, "grad_norm": 3.3352341651916504, "learning_rate": 9.89746850534515e-06, "loss": 0.1885, "step": 25930 }, { "epoch": 2.551203560276364, "grad_norm": 1.7905553579330444, "learning_rate": 9.893494416405039e-06, "loss": 0.3061, "step": 25940 }, { "epoch": 2.552187062034373, "grad_norm": 1.478422999382019, "learning_rate": 9.88952032746493e-06, "loss": 0.1855, "step": 25950 }, { "epoch": 2.5531705637923827, "grad_norm": 0.7994058132171631, "learning_rate": 9.88554623852482e-06, "loss": 0.1579, "step": 25960 }, { "epoch": 2.5541540655503923, "grad_norm": 2.159670114517212, "learning_rate": 9.881572149584708e-06, "loss": 0.1243, "step": 25970 }, { "epoch": 2.5551375673084014, "grad_norm": 1.8579719066619873, "learning_rate": 9.877598060644597e-06, "loss": 0.3046, "step": 25980 }, { "epoch": 2.556121069066411, "grad_norm": 2.223876714706421, "learning_rate": 9.873623971704489e-06, "loss": 0.1892, "step": 25990 }, { "epoch": 2.5571045708244204, "grad_norm": 0.31044161319732666, "learning_rate": 9.869649882764377e-06, "loss": 0.166, "step": 26000 }, { "epoch": 2.5571045708244204, "eval_loss": 0.12297271937131882, "eval_runtime": 18.1156, "eval_samples_per_second": 2.76, "eval_steps_per_second": 1.38, "step": 26000 }, { "epoch": 2.5580880725824295, "grad_norm": 1.37006676197052, "learning_rate": 9.865675793824266e-06, "loss": 0.319, "step": 26010 }, { "epoch": 2.559071574340439, "grad_norm": 1.233155369758606, "learning_rate": 9.861701704884156e-06, "loss": 0.2236, "step": 26020 }, { "epoch": 2.5600550760984486, "grad_norm": 1.331598162651062, "learning_rate": 9.857727615944046e-06, "loss": 0.2089, "step": 26030 }, { "epoch": 2.5610385778564577, "grad_norm": 0.8588127493858337, "learning_rate": 9.853753527003935e-06, "loss": 0.2139, "step": 26040 }, { "epoch": 2.5620220796144673, "grad_norm": 0.8715842366218567, "learning_rate": 9.849779438063825e-06, "loss": 0.1915, "step": 26050 }, { "epoch": 2.563005581372477, "grad_norm": 2.4266014099121094, "learning_rate": 9.845805349123713e-06, "loss": 0.2881, "step": 26060 }, { "epoch": 2.563989083130486, "grad_norm": 0.435737282037735, "learning_rate": 9.841831260183604e-06, "loss": 0.2017, "step": 26070 }, { "epoch": 2.5649725848884954, "grad_norm": 0.880113959312439, "learning_rate": 9.837857171243494e-06, "loss": 0.2318, "step": 26080 }, { "epoch": 2.565956086646505, "grad_norm": 0.9128925204277039, "learning_rate": 9.833883082303382e-06, "loss": 0.1924, "step": 26090 }, { "epoch": 2.566939588404514, "grad_norm": 0.4470633268356323, "learning_rate": 9.829908993363272e-06, "loss": 0.1752, "step": 26100 }, { "epoch": 2.5679230901625236, "grad_norm": 0.5922941565513611, "learning_rate": 9.825934904423163e-06, "loss": 0.1703, "step": 26110 }, { "epoch": 2.568906591920533, "grad_norm": 0.4576959013938904, "learning_rate": 9.821960815483051e-06, "loss": 0.2354, "step": 26120 }, { "epoch": 2.5698900936785423, "grad_norm": 1.6791609525680542, "learning_rate": 9.81798672654294e-06, "loss": 0.1089, "step": 26130 }, { "epoch": 2.570873595436552, "grad_norm": 1.9039769172668457, "learning_rate": 9.81401263760283e-06, "loss": 0.2101, "step": 26140 }, { "epoch": 2.5718570971945613, "grad_norm": 0.5855951905250549, "learning_rate": 9.81003854866272e-06, "loss": 0.2774, "step": 26150 }, { "epoch": 2.5728405989525704, "grad_norm": 0.6082465052604675, "learning_rate": 9.80606445972261e-06, "loss": 0.1697, "step": 26160 }, { "epoch": 2.57382410071058, "grad_norm": 1.0260789394378662, "learning_rate": 9.8020903707825e-06, "loss": 0.2786, "step": 26170 }, { "epoch": 2.5748076024685895, "grad_norm": 4.773690700531006, "learning_rate": 9.798116281842387e-06, "loss": 0.2392, "step": 26180 }, { "epoch": 2.5757911042265986, "grad_norm": 1.1605236530303955, "learning_rate": 9.794142192902279e-06, "loss": 0.2532, "step": 26190 }, { "epoch": 2.576774605984608, "grad_norm": 0.7416682243347168, "learning_rate": 9.790168103962168e-06, "loss": 0.259, "step": 26200 }, { "epoch": 2.5777581077426177, "grad_norm": 0.3520563542842865, "learning_rate": 9.786194015022056e-06, "loss": 0.2625, "step": 26210 }, { "epoch": 2.578741609500627, "grad_norm": 0.9534961581230164, "learning_rate": 9.782219926081946e-06, "loss": 0.3266, "step": 26220 }, { "epoch": 2.5797251112586363, "grad_norm": 1.174358606338501, "learning_rate": 9.778245837141837e-06, "loss": 0.2664, "step": 26230 }, { "epoch": 2.580708613016646, "grad_norm": 1.092491626739502, "learning_rate": 9.774271748201725e-06, "loss": 0.1844, "step": 26240 }, { "epoch": 2.581692114774655, "grad_norm": 0.3747186064720154, "learning_rate": 9.770297659261615e-06, "loss": 0.2445, "step": 26250 }, { "epoch": 2.5826756165326645, "grad_norm": 0.16160008311271667, "learning_rate": 9.766323570321505e-06, "loss": 0.1839, "step": 26260 }, { "epoch": 2.583659118290674, "grad_norm": 0.3357384204864502, "learning_rate": 9.762349481381394e-06, "loss": 0.2154, "step": 26270 }, { "epoch": 2.584642620048683, "grad_norm": 1.854767918586731, "learning_rate": 9.758375392441284e-06, "loss": 0.2245, "step": 26280 }, { "epoch": 2.5856261218066927, "grad_norm": 2.4164562225341797, "learning_rate": 9.754401303501174e-06, "loss": 0.1707, "step": 26290 }, { "epoch": 2.5866096235647023, "grad_norm": 0.44090378284454346, "learning_rate": 9.750427214561062e-06, "loss": 0.2257, "step": 26300 }, { "epoch": 2.5875931253227114, "grad_norm": 0.8784712553024292, "learning_rate": 9.746453125620953e-06, "loss": 0.221, "step": 26310 }, { "epoch": 2.588576627080721, "grad_norm": 1.543505311012268, "learning_rate": 9.742479036680843e-06, "loss": 0.2557, "step": 26320 }, { "epoch": 2.5895601288387304, "grad_norm": 3.15250825881958, "learning_rate": 9.73850494774073e-06, "loss": 0.1896, "step": 26330 }, { "epoch": 2.5905436305967395, "grad_norm": 1.497484803199768, "learning_rate": 9.73453085880062e-06, "loss": 0.2873, "step": 26340 }, { "epoch": 2.591527132354749, "grad_norm": 1.0310429334640503, "learning_rate": 9.730556769860512e-06, "loss": 0.2716, "step": 26350 }, { "epoch": 2.5925106341127586, "grad_norm": 1.2639644145965576, "learning_rate": 9.7265826809204e-06, "loss": 0.1936, "step": 26360 }, { "epoch": 2.5934941358707677, "grad_norm": 0.6588050723075867, "learning_rate": 9.72260859198029e-06, "loss": 0.2893, "step": 26370 }, { "epoch": 2.5944776376287773, "grad_norm": 0.252906858921051, "learning_rate": 9.718634503040179e-06, "loss": 0.2306, "step": 26380 }, { "epoch": 2.595461139386787, "grad_norm": 1.3904304504394531, "learning_rate": 9.714660414100069e-06, "loss": 0.125, "step": 26390 }, { "epoch": 2.596444641144796, "grad_norm": 2.613572835922241, "learning_rate": 9.710686325159958e-06, "loss": 0.3689, "step": 26400 }, { "epoch": 2.5974281429028054, "grad_norm": 0.6632049679756165, "learning_rate": 9.706712236219848e-06, "loss": 0.1896, "step": 26410 }, { "epoch": 2.598411644660815, "grad_norm": 2.895082950592041, "learning_rate": 9.702738147279736e-06, "loss": 0.2164, "step": 26420 }, { "epoch": 2.599395146418824, "grad_norm": 0.6230047345161438, "learning_rate": 9.698764058339627e-06, "loss": 0.1824, "step": 26430 }, { "epoch": 2.6003786481768336, "grad_norm": 0.6954286694526672, "learning_rate": 9.694789969399517e-06, "loss": 0.2352, "step": 26440 }, { "epoch": 2.601362149934843, "grad_norm": 0.4880354106426239, "learning_rate": 9.690815880459405e-06, "loss": 0.1676, "step": 26450 }, { "epoch": 2.6023456516928523, "grad_norm": 0.8374060392379761, "learning_rate": 9.686841791519295e-06, "loss": 0.1599, "step": 26460 }, { "epoch": 2.603329153450862, "grad_norm": 1.3754957914352417, "learning_rate": 9.682867702579186e-06, "loss": 0.1887, "step": 26470 }, { "epoch": 2.6043126552088713, "grad_norm": 1.3043149709701538, "learning_rate": 9.678893613639074e-06, "loss": 0.2618, "step": 26480 }, { "epoch": 2.6052961569668804, "grad_norm": 0.8774238228797913, "learning_rate": 9.674919524698964e-06, "loss": 0.1974, "step": 26490 }, { "epoch": 2.60627965872489, "grad_norm": 2.9066178798675537, "learning_rate": 9.670945435758853e-06, "loss": 0.1928, "step": 26500 }, { "epoch": 2.60627965872489, "eval_loss": 0.12362838536500931, "eval_runtime": 16.6693, "eval_samples_per_second": 3.0, "eval_steps_per_second": 1.5, "step": 26500 }, { "epoch": 2.6072631604828995, "grad_norm": 1.5432078838348389, "learning_rate": 9.666971346818743e-06, "loss": 0.2182, "step": 26510 }, { "epoch": 2.6082466622409086, "grad_norm": 0.46753302216529846, "learning_rate": 9.662997257878633e-06, "loss": 0.2579, "step": 26520 }, { "epoch": 2.609230163998918, "grad_norm": 0.6570789217948914, "learning_rate": 9.659023168938522e-06, "loss": 0.2037, "step": 26530 }, { "epoch": 2.6102136657569277, "grad_norm": 0.64068204164505, "learning_rate": 9.65504907999841e-06, "loss": 0.1439, "step": 26540 }, { "epoch": 2.611197167514937, "grad_norm": 0.6301237344741821, "learning_rate": 9.6510749910583e-06, "loss": 0.1773, "step": 26550 }, { "epoch": 2.6121806692729463, "grad_norm": 0.8529449105262756, "learning_rate": 9.647100902118191e-06, "loss": 0.0962, "step": 26560 }, { "epoch": 2.613164171030956, "grad_norm": 1.1946789026260376, "learning_rate": 9.64312681317808e-06, "loss": 0.2033, "step": 26570 }, { "epoch": 2.614147672788965, "grad_norm": 0.19759801030158997, "learning_rate": 9.639152724237969e-06, "loss": 0.2209, "step": 26580 }, { "epoch": 2.6151311745469745, "grad_norm": 5.525092124938965, "learning_rate": 9.635178635297858e-06, "loss": 0.2819, "step": 26590 }, { "epoch": 2.616114676304984, "grad_norm": 2.7954213619232178, "learning_rate": 9.631204546357748e-06, "loss": 0.2135, "step": 26600 }, { "epoch": 2.617098178062993, "grad_norm": 0.28763845562934875, "learning_rate": 9.627230457417638e-06, "loss": 0.1211, "step": 26610 }, { "epoch": 2.6180816798210027, "grad_norm": 0.9569519758224487, "learning_rate": 9.623256368477527e-06, "loss": 0.2459, "step": 26620 }, { "epoch": 2.6190651815790122, "grad_norm": 0.3743652105331421, "learning_rate": 9.619282279537417e-06, "loss": 0.2082, "step": 26630 }, { "epoch": 2.6200486833370213, "grad_norm": 0.7890741229057312, "learning_rate": 9.615308190597307e-06, "loss": 0.1258, "step": 26640 }, { "epoch": 2.621032185095031, "grad_norm": 0.48824724555015564, "learning_rate": 9.611334101657196e-06, "loss": 0.2455, "step": 26650 }, { "epoch": 2.6220156868530404, "grad_norm": 0.6401808261871338, "learning_rate": 9.607360012717084e-06, "loss": 0.1541, "step": 26660 }, { "epoch": 2.6229991886110495, "grad_norm": 0.747993528842926, "learning_rate": 9.603385923776974e-06, "loss": 0.2794, "step": 26670 }, { "epoch": 2.623982690369059, "grad_norm": 0.5995237231254578, "learning_rate": 9.599411834836865e-06, "loss": 0.2389, "step": 26680 }, { "epoch": 2.6249661921270686, "grad_norm": 1.6733250617980957, "learning_rate": 9.595437745896753e-06, "loss": 0.1522, "step": 26690 }, { "epoch": 2.6259496938850777, "grad_norm": 2.178950071334839, "learning_rate": 9.591463656956643e-06, "loss": 0.1937, "step": 26700 }, { "epoch": 2.6269331956430872, "grad_norm": 1.5474810600280762, "learning_rate": 9.587489568016533e-06, "loss": 0.2734, "step": 26710 }, { "epoch": 2.627916697401097, "grad_norm": 1.2631654739379883, "learning_rate": 9.583515479076422e-06, "loss": 0.2732, "step": 26720 }, { "epoch": 2.628900199159106, "grad_norm": 0.35731109976768494, "learning_rate": 9.579541390136312e-06, "loss": 0.1897, "step": 26730 }, { "epoch": 2.6298837009171154, "grad_norm": 0.9533439874649048, "learning_rate": 9.575567301196202e-06, "loss": 0.1651, "step": 26740 }, { "epoch": 2.630867202675125, "grad_norm": 1.1866984367370605, "learning_rate": 9.571593212256091e-06, "loss": 0.1431, "step": 26750 }, { "epoch": 2.631850704433134, "grad_norm": 1.5564192533493042, "learning_rate": 9.567619123315981e-06, "loss": 0.2232, "step": 26760 }, { "epoch": 2.6328342061911436, "grad_norm": 1.2694995403289795, "learning_rate": 9.56364503437587e-06, "loss": 0.1206, "step": 26770 }, { "epoch": 2.633817707949153, "grad_norm": 0.9269311428070068, "learning_rate": 9.559670945435759e-06, "loss": 0.2766, "step": 26780 }, { "epoch": 2.6348012097071623, "grad_norm": 4.775659084320068, "learning_rate": 9.555696856495648e-06, "loss": 0.149, "step": 26790 }, { "epoch": 2.635784711465172, "grad_norm": 1.256229281425476, "learning_rate": 9.55172276755554e-06, "loss": 0.2743, "step": 26800 }, { "epoch": 2.6367682132231813, "grad_norm": 0.9255625009536743, "learning_rate": 9.547748678615428e-06, "loss": 0.2241, "step": 26810 }, { "epoch": 2.6377517149811904, "grad_norm": 1.4762382507324219, "learning_rate": 9.543774589675317e-06, "loss": 0.2827, "step": 26820 }, { "epoch": 2.6387352167392, "grad_norm": 1.1053285598754883, "learning_rate": 9.539800500735207e-06, "loss": 0.1927, "step": 26830 }, { "epoch": 2.6397187184972095, "grad_norm": 0.5553393959999084, "learning_rate": 9.535826411795097e-06, "loss": 0.2214, "step": 26840 }, { "epoch": 2.6407022202552186, "grad_norm": 1.2319566011428833, "learning_rate": 9.531852322854986e-06, "loss": 0.2522, "step": 26850 }, { "epoch": 2.641685722013228, "grad_norm": 2.134190320968628, "learning_rate": 9.527878233914876e-06, "loss": 0.3017, "step": 26860 }, { "epoch": 2.6426692237712377, "grad_norm": 0.22002668678760529, "learning_rate": 9.523904144974766e-06, "loss": 0.1739, "step": 26870 }, { "epoch": 2.643652725529247, "grad_norm": 0.21295656263828278, "learning_rate": 9.519930056034655e-06, "loss": 0.2597, "step": 26880 }, { "epoch": 2.6446362272872563, "grad_norm": 1.9469270706176758, "learning_rate": 9.515955967094545e-06, "loss": 0.1453, "step": 26890 }, { "epoch": 2.645619729045266, "grad_norm": 0.90763258934021, "learning_rate": 9.511981878154433e-06, "loss": 0.2524, "step": 26900 }, { "epoch": 2.646603230803275, "grad_norm": 1.8186426162719727, "learning_rate": 9.508007789214323e-06, "loss": 0.2746, "step": 26910 }, { "epoch": 2.6475867325612845, "grad_norm": 1.2653254270553589, "learning_rate": 9.504033700274214e-06, "loss": 0.2519, "step": 26920 }, { "epoch": 2.648570234319294, "grad_norm": 1.2931697368621826, "learning_rate": 9.500059611334102e-06, "loss": 0.1913, "step": 26930 }, { "epoch": 2.649553736077303, "grad_norm": 0.5535080432891846, "learning_rate": 9.496085522393992e-06, "loss": 0.209, "step": 26940 }, { "epoch": 2.6505372378353127, "grad_norm": 0.6815732717514038, "learning_rate": 9.492111433453881e-06, "loss": 0.1653, "step": 26950 }, { "epoch": 2.6515207395933222, "grad_norm": 0.4440705180168152, "learning_rate": 9.488137344513771e-06, "loss": 0.1697, "step": 26960 }, { "epoch": 2.6525042413513313, "grad_norm": 0.5131239891052246, "learning_rate": 9.48416325557366e-06, "loss": 0.1631, "step": 26970 }, { "epoch": 2.653487743109341, "grad_norm": 0.3114909529685974, "learning_rate": 9.48018916663355e-06, "loss": 0.1937, "step": 26980 }, { "epoch": 2.6544712448673504, "grad_norm": 1.17475426197052, "learning_rate": 9.47621507769344e-06, "loss": 0.1177, "step": 26990 }, { "epoch": 2.6554547466253595, "grad_norm": 2.7243964672088623, "learning_rate": 9.47224098875333e-06, "loss": 0.2956, "step": 27000 }, { "epoch": 2.6554547466253595, "eval_loss": 0.12182632088661194, "eval_runtime": 18.2151, "eval_samples_per_second": 2.745, "eval_steps_per_second": 1.372, "step": 27000 }, { "epoch": 2.656438248383369, "grad_norm": 1.5404561758041382, "learning_rate": 9.46826689981322e-06, "loss": 0.2371, "step": 27010 }, { "epoch": 2.6574217501413786, "grad_norm": 0.9724194407463074, "learning_rate": 9.464292810873107e-06, "loss": 0.1877, "step": 27020 }, { "epoch": 2.6584052518993877, "grad_norm": 1.9115068912506104, "learning_rate": 9.460318721932997e-06, "loss": 0.203, "step": 27030 }, { "epoch": 2.6593887536573972, "grad_norm": 1.4371966123580933, "learning_rate": 9.456344632992888e-06, "loss": 0.2567, "step": 27040 }, { "epoch": 2.660372255415407, "grad_norm": 1.023058295249939, "learning_rate": 9.452370544052776e-06, "loss": 0.1497, "step": 27050 }, { "epoch": 2.661355757173416, "grad_norm": 2.9103660583496094, "learning_rate": 9.448396455112666e-06, "loss": 0.1619, "step": 27060 }, { "epoch": 2.6623392589314254, "grad_norm": 0.5257496237754822, "learning_rate": 9.444422366172556e-06, "loss": 0.1668, "step": 27070 }, { "epoch": 2.663322760689435, "grad_norm": 0.3518742620944977, "learning_rate": 9.440448277232445e-06, "loss": 0.2312, "step": 27080 }, { "epoch": 2.664306262447444, "grad_norm": 0.6371358633041382, "learning_rate": 9.436474188292335e-06, "loss": 0.138, "step": 27090 }, { "epoch": 2.6652897642054536, "grad_norm": 1.0923123359680176, "learning_rate": 9.432500099352225e-06, "loss": 0.1703, "step": 27100 }, { "epoch": 2.666273265963463, "grad_norm": 0.8021963238716125, "learning_rate": 9.428526010412114e-06, "loss": 0.2727, "step": 27110 }, { "epoch": 2.6672567677214722, "grad_norm": 1.092530369758606, "learning_rate": 9.424551921472004e-06, "loss": 0.3269, "step": 27120 }, { "epoch": 2.668240269479482, "grad_norm": 2.668480634689331, "learning_rate": 9.420577832531894e-06, "loss": 0.2236, "step": 27130 }, { "epoch": 2.6692237712374913, "grad_norm": 2.236377239227295, "learning_rate": 9.416603743591782e-06, "loss": 0.2983, "step": 27140 }, { "epoch": 2.6702072729955004, "grad_norm": 1.4047900438308716, "learning_rate": 9.412629654651671e-06, "loss": 0.2974, "step": 27150 }, { "epoch": 2.67119077475351, "grad_norm": 0.9929019808769226, "learning_rate": 9.408655565711563e-06, "loss": 0.1797, "step": 27160 }, { "epoch": 2.6721742765115195, "grad_norm": 0.6414074897766113, "learning_rate": 9.40468147677145e-06, "loss": 0.1573, "step": 27170 }, { "epoch": 2.6731577782695286, "grad_norm": 0.6707764863967896, "learning_rate": 9.40070738783134e-06, "loss": 0.1535, "step": 27180 }, { "epoch": 2.674141280027538, "grad_norm": 1.877697467803955, "learning_rate": 9.39673329889123e-06, "loss": 0.2274, "step": 27190 }, { "epoch": 2.6751247817855477, "grad_norm": 0.78323894739151, "learning_rate": 9.39275920995112e-06, "loss": 0.295, "step": 27200 }, { "epoch": 2.676108283543557, "grad_norm": 0.5460190773010254, "learning_rate": 9.38878512101101e-06, "loss": 0.2139, "step": 27210 }, { "epoch": 2.6770917853015663, "grad_norm": 1.3874690532684326, "learning_rate": 9.384811032070899e-06, "loss": 0.276, "step": 27220 }, { "epoch": 2.678075287059576, "grad_norm": 0.6501830220222473, "learning_rate": 9.380836943130789e-06, "loss": 0.2158, "step": 27230 }, { "epoch": 2.679058788817585, "grad_norm": 1.5910675525665283, "learning_rate": 9.376862854190678e-06, "loss": 0.2682, "step": 27240 }, { "epoch": 2.6800422905755945, "grad_norm": 1.3960938453674316, "learning_rate": 9.372888765250568e-06, "loss": 0.3153, "step": 27250 }, { "epoch": 2.681025792333604, "grad_norm": 0.7218887805938721, "learning_rate": 9.368914676310456e-06, "loss": 0.1431, "step": 27260 }, { "epoch": 2.682009294091613, "grad_norm": 0.6101197600364685, "learning_rate": 9.364940587370346e-06, "loss": 0.2299, "step": 27270 }, { "epoch": 2.6829927958496227, "grad_norm": 1.9330745935440063, "learning_rate": 9.360966498430235e-06, "loss": 0.1978, "step": 27280 }, { "epoch": 2.6839762976076322, "grad_norm": 0.9924318194389343, "learning_rate": 9.356992409490125e-06, "loss": 0.2722, "step": 27290 }, { "epoch": 2.6849597993656413, "grad_norm": 0.8466578722000122, "learning_rate": 9.353018320550015e-06, "loss": 0.3312, "step": 27300 }, { "epoch": 2.685943301123651, "grad_norm": 0.32302579283714294, "learning_rate": 9.349044231609904e-06, "loss": 0.267, "step": 27310 }, { "epoch": 2.6869268028816604, "grad_norm": 0.049537353217601776, "learning_rate": 9.345070142669794e-06, "loss": 0.1174, "step": 27320 }, { "epoch": 2.6879103046396695, "grad_norm": 1.555120587348938, "learning_rate": 9.341096053729684e-06, "loss": 0.1646, "step": 27330 }, { "epoch": 2.688893806397679, "grad_norm": 1.5903842449188232, "learning_rate": 9.337121964789573e-06, "loss": 0.2165, "step": 27340 }, { "epoch": 2.6898773081556886, "grad_norm": 1.6580713987350464, "learning_rate": 9.333147875849463e-06, "loss": 0.1619, "step": 27350 }, { "epoch": 2.6908608099136977, "grad_norm": 1.3948620557785034, "learning_rate": 9.32917378690935e-06, "loss": 0.1949, "step": 27360 }, { "epoch": 2.6918443116717072, "grad_norm": 1.0545307397842407, "learning_rate": 9.325199697969242e-06, "loss": 0.2992, "step": 27370 }, { "epoch": 2.6928278134297168, "grad_norm": 0.4301310181617737, "learning_rate": 9.32122560902913e-06, "loss": 0.1718, "step": 27380 }, { "epoch": 2.693811315187726, "grad_norm": 0.8084037899971008, "learning_rate": 9.31725152008902e-06, "loss": 0.2577, "step": 27390 }, { "epoch": 2.6947948169457354, "grad_norm": 0.48154643177986145, "learning_rate": 9.31327743114891e-06, "loss": 0.1771, "step": 27400 }, { "epoch": 2.695778318703745, "grad_norm": 1.021758794784546, "learning_rate": 9.309303342208799e-06, "loss": 0.2348, "step": 27410 }, { "epoch": 2.696761820461754, "grad_norm": 0.9262601137161255, "learning_rate": 9.305329253268689e-06, "loss": 0.2558, "step": 27420 }, { "epoch": 2.6977453222197636, "grad_norm": 0.8262643218040466, "learning_rate": 9.301355164328578e-06, "loss": 0.2068, "step": 27430 }, { "epoch": 2.6987288239777727, "grad_norm": 1.3612849712371826, "learning_rate": 9.297381075388468e-06, "loss": 0.21, "step": 27440 }, { "epoch": 2.6997123257357822, "grad_norm": 0.6201552748680115, "learning_rate": 9.293406986448358e-06, "loss": 0.1185, "step": 27450 }, { "epoch": 2.700695827493792, "grad_norm": 1.0904179811477661, "learning_rate": 9.289432897508247e-06, "loss": 0.2703, "step": 27460 }, { "epoch": 2.701679329251801, "grad_norm": 1.0167815685272217, "learning_rate": 9.285458808568137e-06, "loss": 0.1428, "step": 27470 }, { "epoch": 2.7026628310098104, "grad_norm": 1.3628720045089722, "learning_rate": 9.281484719628025e-06, "loss": 0.2142, "step": 27480 }, { "epoch": 2.70364633276782, "grad_norm": 0.7690005898475647, "learning_rate": 9.277510630687916e-06, "loss": 0.1802, "step": 27490 }, { "epoch": 2.704629834525829, "grad_norm": 0.9450478553771973, "learning_rate": 9.273536541747804e-06, "loss": 0.2007, "step": 27500 }, { "epoch": 2.704629834525829, "eval_loss": 0.11772012710571289, "eval_runtime": 18.9836, "eval_samples_per_second": 2.634, "eval_steps_per_second": 1.317, "step": 27500 }, { "epoch": 2.7056133362838386, "grad_norm": 11.136773109436035, "learning_rate": 9.269562452807694e-06, "loss": 0.1494, "step": 27510 }, { "epoch": 2.706596838041848, "grad_norm": 1.8330661058425903, "learning_rate": 9.265588363867584e-06, "loss": 0.2296, "step": 27520 }, { "epoch": 2.7075803397998572, "grad_norm": 0.5862045884132385, "learning_rate": 9.261614274927473e-06, "loss": 0.1731, "step": 27530 }, { "epoch": 2.708563841557867, "grad_norm": 1.7953245639801025, "learning_rate": 9.257640185987363e-06, "loss": 0.2048, "step": 27540 }, { "epoch": 2.7095473433158763, "grad_norm": 0.5008899569511414, "learning_rate": 9.253666097047253e-06, "loss": 0.2088, "step": 27550 }, { "epoch": 2.7105308450738854, "grad_norm": 1.6881611347198486, "learning_rate": 9.249692008107142e-06, "loss": 0.2589, "step": 27560 }, { "epoch": 2.711514346831895, "grad_norm": 2.1870365142822266, "learning_rate": 9.245717919167032e-06, "loss": 0.1861, "step": 27570 }, { "epoch": 2.7124978485899045, "grad_norm": 0.7728579640388489, "learning_rate": 9.241743830226922e-06, "loss": 0.2845, "step": 27580 }, { "epoch": 2.7134813503479136, "grad_norm": 0.534707248210907, "learning_rate": 9.237769741286811e-06, "loss": 0.138, "step": 27590 }, { "epoch": 2.714464852105923, "grad_norm": 0.7656206488609314, "learning_rate": 9.2337956523467e-06, "loss": 0.1957, "step": 27600 }, { "epoch": 2.7154483538639327, "grad_norm": 1.1382602453231812, "learning_rate": 9.22982156340659e-06, "loss": 0.1841, "step": 27610 }, { "epoch": 2.716431855621942, "grad_norm": 0.698901355266571, "learning_rate": 9.225847474466479e-06, "loss": 0.1717, "step": 27620 }, { "epoch": 2.7174153573799513, "grad_norm": 1.258329153060913, "learning_rate": 9.221873385526368e-06, "loss": 0.1288, "step": 27630 }, { "epoch": 2.718398859137961, "grad_norm": 0.5331089496612549, "learning_rate": 9.217899296586258e-06, "loss": 0.224, "step": 27640 }, { "epoch": 2.71938236089597, "grad_norm": 0.28912073373794556, "learning_rate": 9.213925207646148e-06, "loss": 0.2152, "step": 27650 }, { "epoch": 2.7203658626539795, "grad_norm": 0.4922606945037842, "learning_rate": 9.209951118706037e-06, "loss": 0.1785, "step": 27660 }, { "epoch": 2.721349364411989, "grad_norm": 0.6296001076698303, "learning_rate": 9.205977029765927e-06, "loss": 0.1462, "step": 27670 }, { "epoch": 2.722332866169998, "grad_norm": 0.6022535562515259, "learning_rate": 9.202002940825817e-06, "loss": 0.1855, "step": 27680 }, { "epoch": 2.7233163679280077, "grad_norm": 1.7227388620376587, "learning_rate": 9.198028851885706e-06, "loss": 0.1792, "step": 27690 }, { "epoch": 2.7242998696860172, "grad_norm": 0.5050339698791504, "learning_rate": 9.194054762945596e-06, "loss": 0.226, "step": 27700 }, { "epoch": 2.7252833714440263, "grad_norm": 0.4780879020690918, "learning_rate": 9.190080674005486e-06, "loss": 0.2302, "step": 27710 }, { "epoch": 2.726266873202036, "grad_norm": 0.1593957096338272, "learning_rate": 9.186106585065374e-06, "loss": 0.1973, "step": 27720 }, { "epoch": 2.727250374960045, "grad_norm": 0.3137963116168976, "learning_rate": 9.182132496125265e-06, "loss": 0.1865, "step": 27730 }, { "epoch": 2.7282338767180545, "grad_norm": 3.1869826316833496, "learning_rate": 9.178158407185153e-06, "loss": 0.2086, "step": 27740 }, { "epoch": 2.729217378476064, "grad_norm": 1.0982507467269897, "learning_rate": 9.174184318245043e-06, "loss": 0.2532, "step": 27750 }, { "epoch": 2.730200880234073, "grad_norm": 2.14682674407959, "learning_rate": 9.170210229304932e-06, "loss": 0.1477, "step": 27760 }, { "epoch": 2.7311843819920827, "grad_norm": 1.5476717948913574, "learning_rate": 9.166236140364822e-06, "loss": 0.1635, "step": 27770 }, { "epoch": 2.7321678837500922, "grad_norm": 1.0337785482406616, "learning_rate": 9.162262051424712e-06, "loss": 0.2289, "step": 27780 }, { "epoch": 2.7331513855081013, "grad_norm": 1.0297173261642456, "learning_rate": 9.158287962484601e-06, "loss": 0.303, "step": 27790 }, { "epoch": 2.734134887266111, "grad_norm": 0.8042473196983337, "learning_rate": 9.154313873544491e-06, "loss": 0.1754, "step": 27800 }, { "epoch": 2.7351183890241204, "grad_norm": 1.1096657514572144, "learning_rate": 9.15033978460438e-06, "loss": 0.2365, "step": 27810 }, { "epoch": 2.7361018907821295, "grad_norm": 0.8313658237457275, "learning_rate": 9.14636569566427e-06, "loss": 0.1505, "step": 27820 }, { "epoch": 2.737085392540139, "grad_norm": 0.7719443440437317, "learning_rate": 9.14239160672416e-06, "loss": 0.1858, "step": 27830 }, { "epoch": 2.7380688942981486, "grad_norm": 0.19261518120765686, "learning_rate": 9.138417517784048e-06, "loss": 0.2037, "step": 27840 }, { "epoch": 2.7390523960561577, "grad_norm": 0.7655139565467834, "learning_rate": 9.13444342884394e-06, "loss": 0.274, "step": 27850 }, { "epoch": 2.7400358978141672, "grad_norm": 0.6452736258506775, "learning_rate": 9.130469339903827e-06, "loss": 0.2163, "step": 27860 }, { "epoch": 2.7410193995721768, "grad_norm": 1.4266693592071533, "learning_rate": 9.126495250963717e-06, "loss": 0.1944, "step": 27870 }, { "epoch": 2.742002901330186, "grad_norm": 1.0404728651046753, "learning_rate": 9.122521162023607e-06, "loss": 0.2191, "step": 27880 }, { "epoch": 2.7429864030881954, "grad_norm": 1.3982700109481812, "learning_rate": 9.118547073083496e-06, "loss": 0.1796, "step": 27890 }, { "epoch": 2.743969904846205, "grad_norm": 1.3127501010894775, "learning_rate": 9.114572984143386e-06, "loss": 0.2794, "step": 27900 }, { "epoch": 2.744953406604214, "grad_norm": 0.7358602285385132, "learning_rate": 9.110598895203276e-06, "loss": 0.3201, "step": 27910 }, { "epoch": 2.7459369083622236, "grad_norm": 1.1799973249435425, "learning_rate": 9.106624806263165e-06, "loss": 0.2373, "step": 27920 }, { "epoch": 2.746920410120233, "grad_norm": 0.52203768491745, "learning_rate": 9.102650717323055e-06, "loss": 0.1438, "step": 27930 }, { "epoch": 2.7479039118782422, "grad_norm": 1.5550827980041504, "learning_rate": 9.098676628382945e-06, "loss": 0.1915, "step": 27940 }, { "epoch": 2.7488874136362518, "grad_norm": 1.1797600984573364, "learning_rate": 9.094702539442834e-06, "loss": 0.1632, "step": 27950 }, { "epoch": 2.7498709153942613, "grad_norm": 0.9325649738311768, "learning_rate": 9.090728450502722e-06, "loss": 0.1879, "step": 27960 }, { "epoch": 2.7508544171522704, "grad_norm": 1.2596094608306885, "learning_rate": 9.086754361562614e-06, "loss": 0.1654, "step": 27970 }, { "epoch": 2.75183791891028, "grad_norm": 0.39892297983169556, "learning_rate": 9.082780272622502e-06, "loss": 0.2034, "step": 27980 }, { "epoch": 2.7528214206682895, "grad_norm": 1.4378695487976074, "learning_rate": 9.078806183682391e-06, "loss": 0.2561, "step": 27990 }, { "epoch": 2.7538049224262986, "grad_norm": 1.9730724096298218, "learning_rate": 9.074832094742281e-06, "loss": 0.2129, "step": 28000 }, { "epoch": 2.7538049224262986, "eval_loss": 0.12233906984329224, "eval_runtime": 18.4862, "eval_samples_per_second": 2.705, "eval_steps_per_second": 1.352, "step": 28000 }, { "epoch": 2.754788424184308, "grad_norm": 1.2149182558059692, "learning_rate": 9.07085800580217e-06, "loss": 0.1312, "step": 28010 }, { "epoch": 2.7557719259423177, "grad_norm": 1.0150631666183472, "learning_rate": 9.06688391686206e-06, "loss": 0.287, "step": 28020 }, { "epoch": 2.7567554277003268, "grad_norm": 0.542308509349823, "learning_rate": 9.06290982792195e-06, "loss": 0.1862, "step": 28030 }, { "epoch": 2.7577389294583363, "grad_norm": 0.3378434479236603, "learning_rate": 9.05893573898184e-06, "loss": 0.2743, "step": 28040 }, { "epoch": 2.758722431216346, "grad_norm": 1.0021579265594482, "learning_rate": 9.05496165004173e-06, "loss": 0.1584, "step": 28050 }, { "epoch": 2.759705932974355, "grad_norm": 0.13331718742847443, "learning_rate": 9.050987561101619e-06, "loss": 0.1916, "step": 28060 }, { "epoch": 2.7606894347323645, "grad_norm": 2.321328639984131, "learning_rate": 9.047013472161509e-06, "loss": 0.175, "step": 28070 }, { "epoch": 2.761672936490374, "grad_norm": 0.6371372938156128, "learning_rate": 9.043039383221397e-06, "loss": 0.174, "step": 28080 }, { "epoch": 2.762656438248383, "grad_norm": 2.9457879066467285, "learning_rate": 9.039065294281286e-06, "loss": 0.2511, "step": 28090 }, { "epoch": 2.7636399400063927, "grad_norm": 0.9190905690193176, "learning_rate": 9.035091205341176e-06, "loss": 0.1659, "step": 28100 }, { "epoch": 2.7646234417644022, "grad_norm": 1.509871482849121, "learning_rate": 9.031117116401066e-06, "loss": 0.1308, "step": 28110 }, { "epoch": 2.7656069435224113, "grad_norm": 0.33418288826942444, "learning_rate": 9.027143027460955e-06, "loss": 0.1684, "step": 28120 }, { "epoch": 2.766590445280421, "grad_norm": 1.953349232673645, "learning_rate": 9.023168938520845e-06, "loss": 0.1792, "step": 28130 }, { "epoch": 2.7675739470384304, "grad_norm": 0.6116806864738464, "learning_rate": 9.019194849580735e-06, "loss": 0.2809, "step": 28140 }, { "epoch": 2.7685574487964395, "grad_norm": 0.8784261345863342, "learning_rate": 9.015220760640624e-06, "loss": 0.1352, "step": 28150 }, { "epoch": 2.769540950554449, "grad_norm": 1.326364278793335, "learning_rate": 9.011246671700514e-06, "loss": 0.2668, "step": 28160 }, { "epoch": 2.7705244523124586, "grad_norm": 0.5899569392204285, "learning_rate": 9.007272582760402e-06, "loss": 0.3221, "step": 28170 }, { "epoch": 2.7715079540704677, "grad_norm": 0.5236835479736328, "learning_rate": 9.003298493820293e-06, "loss": 0.2541, "step": 28180 }, { "epoch": 2.7724914558284772, "grad_norm": 1.8051170110702515, "learning_rate": 8.999324404880183e-06, "loss": 0.2342, "step": 28190 }, { "epoch": 2.7734749575864868, "grad_norm": 0.22561007738113403, "learning_rate": 8.99535031594007e-06, "loss": 0.2332, "step": 28200 }, { "epoch": 2.774458459344496, "grad_norm": 0.8784757256507874, "learning_rate": 8.99137622699996e-06, "loss": 0.1558, "step": 28210 }, { "epoch": 2.7754419611025054, "grad_norm": 0.8380150198936462, "learning_rate": 8.98740213805985e-06, "loss": 0.224, "step": 28220 }, { "epoch": 2.776425462860515, "grad_norm": 0.8617953658103943, "learning_rate": 8.98342804911974e-06, "loss": 0.2752, "step": 28230 }, { "epoch": 2.777408964618524, "grad_norm": 1.576338291168213, "learning_rate": 8.97945396017963e-06, "loss": 0.1994, "step": 28240 }, { "epoch": 2.7783924663765336, "grad_norm": 0.359642893075943, "learning_rate": 8.975479871239519e-06, "loss": 0.1899, "step": 28250 }, { "epoch": 2.779375968134543, "grad_norm": 1.1645811796188354, "learning_rate": 8.971505782299409e-06, "loss": 0.2148, "step": 28260 }, { "epoch": 2.7803594698925522, "grad_norm": 1.116481900215149, "learning_rate": 8.967531693359298e-06, "loss": 0.2202, "step": 28270 }, { "epoch": 2.7813429716505618, "grad_norm": 0.5225256681442261, "learning_rate": 8.963557604419188e-06, "loss": 0.1449, "step": 28280 }, { "epoch": 2.7823264734085713, "grad_norm": 0.782061755657196, "learning_rate": 8.959583515479076e-06, "loss": 0.2793, "step": 28290 }, { "epoch": 2.7833099751665804, "grad_norm": 0.6921595931053162, "learning_rate": 8.955609426538967e-06, "loss": 0.12, "step": 28300 }, { "epoch": 2.78429347692459, "grad_norm": 0.6155844926834106, "learning_rate": 8.951635337598857e-06, "loss": 0.187, "step": 28310 }, { "epoch": 2.7852769786825995, "grad_norm": 0.61292564868927, "learning_rate": 8.947661248658745e-06, "loss": 0.1792, "step": 28320 }, { "epoch": 2.7862604804406086, "grad_norm": 5.320784091949463, "learning_rate": 8.943687159718635e-06, "loss": 0.1971, "step": 28330 }, { "epoch": 2.787243982198618, "grad_norm": 1.5784276723861694, "learning_rate": 8.939713070778524e-06, "loss": 0.3297, "step": 28340 }, { "epoch": 2.7882274839566277, "grad_norm": 0.4494692087173462, "learning_rate": 8.935738981838414e-06, "loss": 0.1365, "step": 28350 }, { "epoch": 2.7892109857146368, "grad_norm": 0.41863226890563965, "learning_rate": 8.931764892898304e-06, "loss": 0.1865, "step": 28360 }, { "epoch": 2.7901944874726463, "grad_norm": 0.506903886795044, "learning_rate": 8.927790803958193e-06, "loss": 0.2627, "step": 28370 }, { "epoch": 2.791177989230656, "grad_norm": 1.0100220441818237, "learning_rate": 8.923816715018083e-06, "loss": 0.2728, "step": 28380 }, { "epoch": 2.792161490988665, "grad_norm": 0.788859486579895, "learning_rate": 8.919842626077973e-06, "loss": 0.2081, "step": 28390 }, { "epoch": 2.7931449927466745, "grad_norm": 0.8792150616645813, "learning_rate": 8.915868537137862e-06, "loss": 0.2177, "step": 28400 }, { "epoch": 2.794128494504684, "grad_norm": 1.0238473415374756, "learning_rate": 8.91189444819775e-06, "loss": 0.2368, "step": 28410 }, { "epoch": 2.795111996262693, "grad_norm": 2.283637046813965, "learning_rate": 8.907920359257642e-06, "loss": 0.2786, "step": 28420 }, { "epoch": 2.7960954980207027, "grad_norm": 0.11248058825731277, "learning_rate": 8.903946270317531e-06, "loss": 0.1421, "step": 28430 }, { "epoch": 2.797078999778712, "grad_norm": 2.3209190368652344, "learning_rate": 8.89997218137742e-06, "loss": 0.1536, "step": 28440 }, { "epoch": 2.7980625015367213, "grad_norm": 0.4981313943862915, "learning_rate": 8.895998092437309e-06, "loss": 0.1468, "step": 28450 }, { "epoch": 2.799046003294731, "grad_norm": 0.35236284136772156, "learning_rate": 8.892024003497199e-06, "loss": 0.2641, "step": 28460 }, { "epoch": 2.8000295050527404, "grad_norm": 4.2277703285217285, "learning_rate": 8.888049914557088e-06, "loss": 0.1715, "step": 28470 }, { "epoch": 2.8010130068107495, "grad_norm": 0.7296504378318787, "learning_rate": 8.884075825616978e-06, "loss": 0.1647, "step": 28480 }, { "epoch": 2.801996508568759, "grad_norm": 2.090623140335083, "learning_rate": 8.880101736676868e-06, "loss": 0.1697, "step": 28490 }, { "epoch": 2.8029800103267686, "grad_norm": 0.14077714085578918, "learning_rate": 8.876127647736757e-06, "loss": 0.1929, "step": 28500 }, { "epoch": 2.8029800103267686, "eval_loss": 0.12228866666555405, "eval_runtime": 20.9307, "eval_samples_per_second": 2.389, "eval_steps_per_second": 1.194, "step": 28500 }, { "epoch": 2.8039635120847777, "grad_norm": 2.1589229106903076, "learning_rate": 8.872153558796647e-06, "loss": 0.2132, "step": 28510 }, { "epoch": 2.804947013842787, "grad_norm": 0.2110229879617691, "learning_rate": 8.868179469856537e-06, "loss": 0.1616, "step": 28520 }, { "epoch": 2.8059305156007968, "grad_norm": 1.5847959518432617, "learning_rate": 8.864205380916425e-06, "loss": 0.1359, "step": 28530 }, { "epoch": 2.806914017358806, "grad_norm": 0.6458785533905029, "learning_rate": 8.860231291976316e-06, "loss": 0.1785, "step": 28540 }, { "epoch": 2.8078975191168154, "grad_norm": 0.5321483016014099, "learning_rate": 8.856257203036206e-06, "loss": 0.2318, "step": 28550 }, { "epoch": 2.808881020874825, "grad_norm": 1.2807509899139404, "learning_rate": 8.852283114096094e-06, "loss": 0.1583, "step": 28560 }, { "epoch": 2.809864522632834, "grad_norm": 0.766925573348999, "learning_rate": 8.848309025155983e-06, "loss": 0.2679, "step": 28570 }, { "epoch": 2.8108480243908436, "grad_norm": 0.6170240044593811, "learning_rate": 8.844334936215873e-06, "loss": 0.2493, "step": 28580 }, { "epoch": 2.811831526148853, "grad_norm": 1.8137459754943848, "learning_rate": 8.840360847275763e-06, "loss": 0.2313, "step": 28590 }, { "epoch": 2.812815027906862, "grad_norm": 1.1472567319869995, "learning_rate": 8.836386758335652e-06, "loss": 0.1704, "step": 28600 }, { "epoch": 2.8137985296648718, "grad_norm": 2.9038665294647217, "learning_rate": 8.832412669395542e-06, "loss": 0.2366, "step": 28610 }, { "epoch": 2.8147820314228813, "grad_norm": 1.5422487258911133, "learning_rate": 8.828438580455432e-06, "loss": 0.0977, "step": 28620 }, { "epoch": 2.8157655331808904, "grad_norm": 1.3508249521255493, "learning_rate": 8.824464491515321e-06, "loss": 0.2698, "step": 28630 }, { "epoch": 2.8167490349389, "grad_norm": 0.6246904134750366, "learning_rate": 8.820490402575211e-06, "loss": 0.1542, "step": 28640 }, { "epoch": 2.8177325366969095, "grad_norm": 0.2572641968727112, "learning_rate": 8.816516313635099e-06, "loss": 0.2488, "step": 28650 }, { "epoch": 2.8187160384549186, "grad_norm": 0.16635353863239288, "learning_rate": 8.81254222469499e-06, "loss": 0.2405, "step": 28660 }, { "epoch": 2.819699540212928, "grad_norm": 1.9076553583145142, "learning_rate": 8.80856813575488e-06, "loss": 0.329, "step": 28670 }, { "epoch": 2.8206830419709377, "grad_norm": 0.9525811672210693, "learning_rate": 8.804594046814768e-06, "loss": 0.1924, "step": 28680 }, { "epoch": 2.8216665437289468, "grad_norm": 1.3024401664733887, "learning_rate": 8.800619957874658e-06, "loss": 0.2242, "step": 28690 }, { "epoch": 2.8226500454869563, "grad_norm": 0.6237722635269165, "learning_rate": 8.796645868934547e-06, "loss": 0.2294, "step": 28700 }, { "epoch": 2.823633547244966, "grad_norm": 0.5489286184310913, "learning_rate": 8.792671779994437e-06, "loss": 0.1815, "step": 28710 }, { "epoch": 2.824617049002975, "grad_norm": 0.7488086819648743, "learning_rate": 8.788697691054327e-06, "loss": 0.1495, "step": 28720 }, { "epoch": 2.8256005507609845, "grad_norm": 1.6106791496276855, "learning_rate": 8.784723602114216e-06, "loss": 0.2004, "step": 28730 }, { "epoch": 2.826584052518994, "grad_norm": 1.1848503351211548, "learning_rate": 8.780749513174106e-06, "loss": 0.3045, "step": 28740 }, { "epoch": 2.827567554277003, "grad_norm": 1.018410563468933, "learning_rate": 8.776775424233996e-06, "loss": 0.2271, "step": 28750 }, { "epoch": 2.8285510560350127, "grad_norm": 0.7283086776733398, "learning_rate": 8.772801335293885e-06, "loss": 0.3003, "step": 28760 }, { "epoch": 2.829534557793022, "grad_norm": 0.8939152359962463, "learning_rate": 8.768827246353773e-06, "loss": 0.2051, "step": 28770 }, { "epoch": 2.8305180595510313, "grad_norm": 1.9083774089813232, "learning_rate": 8.764853157413665e-06, "loss": 0.3075, "step": 28780 }, { "epoch": 2.831501561309041, "grad_norm": 1.497853398323059, "learning_rate": 8.760879068473554e-06, "loss": 0.1993, "step": 28790 }, { "epoch": 2.8324850630670504, "grad_norm": 0.5038604736328125, "learning_rate": 8.756904979533442e-06, "loss": 0.1835, "step": 28800 }, { "epoch": 2.8334685648250595, "grad_norm": 0.49522608518600464, "learning_rate": 8.752930890593332e-06, "loss": 0.2387, "step": 28810 }, { "epoch": 2.834452066583069, "grad_norm": 0.2055412083864212, "learning_rate": 8.748956801653222e-06, "loss": 0.1647, "step": 28820 }, { "epoch": 2.8354355683410786, "grad_norm": 0.1830047070980072, "learning_rate": 8.744982712713111e-06, "loss": 0.1638, "step": 28830 }, { "epoch": 2.8364190700990877, "grad_norm": 2.257323741912842, "learning_rate": 8.741008623773001e-06, "loss": 0.1678, "step": 28840 }, { "epoch": 2.837402571857097, "grad_norm": 1.6618006229400635, "learning_rate": 8.73703453483289e-06, "loss": 0.2156, "step": 28850 }, { "epoch": 2.8383860736151068, "grad_norm": 1.9732855558395386, "learning_rate": 8.73306044589278e-06, "loss": 0.1709, "step": 28860 }, { "epoch": 2.839369575373116, "grad_norm": 0.7155810594558716, "learning_rate": 8.72908635695267e-06, "loss": 0.1784, "step": 28870 }, { "epoch": 2.8403530771311254, "grad_norm": 0.24890826642513275, "learning_rate": 8.72511226801256e-06, "loss": 0.2786, "step": 28880 }, { "epoch": 2.841336578889135, "grad_norm": 1.0565311908721924, "learning_rate": 8.721138179072448e-06, "loss": 0.2135, "step": 28890 }, { "epoch": 2.842320080647144, "grad_norm": 0.3702998161315918, "learning_rate": 8.717164090132337e-06, "loss": 0.2111, "step": 28900 }, { "epoch": 2.8433035824051536, "grad_norm": 0.5531200766563416, "learning_rate": 8.713190001192229e-06, "loss": 0.2772, "step": 28910 }, { "epoch": 2.844287084163163, "grad_norm": 1.44815194606781, "learning_rate": 8.709215912252116e-06, "loss": 0.2009, "step": 28920 }, { "epoch": 2.845270585921172, "grad_norm": 0.26301082968711853, "learning_rate": 8.705241823312006e-06, "loss": 0.2195, "step": 28930 }, { "epoch": 2.8462540876791818, "grad_norm": 5.985122203826904, "learning_rate": 8.701267734371896e-06, "loss": 0.2069, "step": 28940 }, { "epoch": 2.8472375894371913, "grad_norm": 0.3828321099281311, "learning_rate": 8.697293645431785e-06, "loss": 0.1972, "step": 28950 }, { "epoch": 2.8482210911952004, "grad_norm": 1.7138978242874146, "learning_rate": 8.693319556491675e-06, "loss": 0.1522, "step": 28960 }, { "epoch": 2.84920459295321, "grad_norm": 1.8047984838485718, "learning_rate": 8.689345467551565e-06, "loss": 0.1605, "step": 28970 }, { "epoch": 2.8501880947112195, "grad_norm": 1.2494192123413086, "learning_rate": 8.685371378611453e-06, "loss": 0.2755, "step": 28980 }, { "epoch": 2.8511715964692286, "grad_norm": 0.6676988005638123, "learning_rate": 8.681397289671344e-06, "loss": 0.2336, "step": 28990 }, { "epoch": 2.852155098227238, "grad_norm": 1.2020059823989868, "learning_rate": 8.677423200731234e-06, "loss": 0.2366, "step": 29000 }, { "epoch": 2.852155098227238, "eval_loss": 0.12494421750307083, "eval_runtime": 17.4081, "eval_samples_per_second": 2.872, "eval_steps_per_second": 1.436, "step": 29000 }, { "epoch": 2.8531385999852477, "grad_norm": 1.4287397861480713, "learning_rate": 8.673449111791122e-06, "loss": 0.2116, "step": 29010 }, { "epoch": 2.8541221017432568, "grad_norm": 0.8067907691001892, "learning_rate": 8.669475022851011e-06, "loss": 0.1835, "step": 29020 }, { "epoch": 2.8551056035012663, "grad_norm": 0.2560260593891144, "learning_rate": 8.665500933910903e-06, "loss": 0.2746, "step": 29030 }, { "epoch": 2.856089105259276, "grad_norm": 1.1014164686203003, "learning_rate": 8.66152684497079e-06, "loss": 0.1783, "step": 29040 }, { "epoch": 2.857072607017285, "grad_norm": 1.2261013984680176, "learning_rate": 8.65755275603068e-06, "loss": 0.2009, "step": 29050 }, { "epoch": 2.8580561087752945, "grad_norm": 0.8522171974182129, "learning_rate": 8.65357866709057e-06, "loss": 0.2382, "step": 29060 }, { "epoch": 2.859039610533304, "grad_norm": 1.2073774337768555, "learning_rate": 8.64960457815046e-06, "loss": 0.1162, "step": 29070 }, { "epoch": 2.860023112291313, "grad_norm": 3.7264292240142822, "learning_rate": 8.64563048921035e-06, "loss": 0.2941, "step": 29080 }, { "epoch": 2.8610066140493227, "grad_norm": 0.8291231989860535, "learning_rate": 8.641656400270239e-06, "loss": 0.24, "step": 29090 }, { "epoch": 2.861990115807332, "grad_norm": 0.7799243927001953, "learning_rate": 8.637682311330127e-06, "loss": 0.2807, "step": 29100 }, { "epoch": 2.8629736175653413, "grad_norm": 1.201129674911499, "learning_rate": 8.633708222390018e-06, "loss": 0.2662, "step": 29110 }, { "epoch": 2.863957119323351, "grad_norm": 1.764297604560852, "learning_rate": 8.629734133449908e-06, "loss": 0.1968, "step": 29120 }, { "epoch": 2.8649406210813604, "grad_norm": 0.7513929009437561, "learning_rate": 8.625760044509796e-06, "loss": 0.149, "step": 29130 }, { "epoch": 2.8659241228393695, "grad_norm": 0.11042553931474686, "learning_rate": 8.621785955569686e-06, "loss": 0.1718, "step": 29140 }, { "epoch": 2.866907624597379, "grad_norm": 0.49265816807746887, "learning_rate": 8.617811866629577e-06, "loss": 0.2166, "step": 29150 }, { "epoch": 2.8678911263553886, "grad_norm": 0.6122161746025085, "learning_rate": 8.613837777689465e-06, "loss": 0.1956, "step": 29160 }, { "epoch": 2.8688746281133977, "grad_norm": 0.8903423547744751, "learning_rate": 8.609863688749355e-06, "loss": 0.2179, "step": 29170 }, { "epoch": 2.869858129871407, "grad_norm": 1.8324393033981323, "learning_rate": 8.605889599809244e-06, "loss": 0.2135, "step": 29180 }, { "epoch": 2.8708416316294167, "grad_norm": 0.9282931685447693, "learning_rate": 8.601915510869134e-06, "loss": 0.1615, "step": 29190 }, { "epoch": 2.871825133387426, "grad_norm": 0.577796220779419, "learning_rate": 8.597941421929024e-06, "loss": 0.2672, "step": 29200 }, { "epoch": 2.8728086351454354, "grad_norm": 0.5581834316253662, "learning_rate": 8.593967332988913e-06, "loss": 0.1615, "step": 29210 }, { "epoch": 2.873792136903445, "grad_norm": 1.03250253200531, "learning_rate": 8.589993244048801e-06, "loss": 0.1836, "step": 29220 }, { "epoch": 2.874775638661454, "grad_norm": 0.719618558883667, "learning_rate": 8.586019155108693e-06, "loss": 0.3187, "step": 29230 }, { "epoch": 2.8757591404194636, "grad_norm": 0.5816129446029663, "learning_rate": 8.582045066168582e-06, "loss": 0.1661, "step": 29240 }, { "epoch": 2.876742642177473, "grad_norm": 1.870931625366211, "learning_rate": 8.57807097722847e-06, "loss": 0.1292, "step": 29250 }, { "epoch": 2.877726143935482, "grad_norm": 0.6622405052185059, "learning_rate": 8.57409688828836e-06, "loss": 0.3036, "step": 29260 }, { "epoch": 2.8787096456934917, "grad_norm": 1.67839777469635, "learning_rate": 8.570122799348251e-06, "loss": 0.1944, "step": 29270 }, { "epoch": 2.8796931474515013, "grad_norm": 0.7301672697067261, "learning_rate": 8.56614871040814e-06, "loss": 0.3032, "step": 29280 }, { "epoch": 2.8806766492095104, "grad_norm": 1.228724479675293, "learning_rate": 8.562174621468029e-06, "loss": 0.2329, "step": 29290 }, { "epoch": 2.88166015096752, "grad_norm": 0.5540966987609863, "learning_rate": 8.558200532527919e-06, "loss": 0.2967, "step": 29300 }, { "epoch": 2.8826436527255295, "grad_norm": 0.32900357246398926, "learning_rate": 8.554226443587808e-06, "loss": 0.2202, "step": 29310 }, { "epoch": 2.8836271544835386, "grad_norm": 0.3154147267341614, "learning_rate": 8.550252354647698e-06, "loss": 0.2517, "step": 29320 }, { "epoch": 2.884610656241548, "grad_norm": 0.9213051795959473, "learning_rate": 8.546278265707588e-06, "loss": 0.2602, "step": 29330 }, { "epoch": 2.8855941579995577, "grad_norm": 0.6817721724510193, "learning_rate": 8.542304176767476e-06, "loss": 0.2992, "step": 29340 }, { "epoch": 2.8865776597575667, "grad_norm": 1.0806705951690674, "learning_rate": 8.538330087827367e-06, "loss": 0.1915, "step": 29350 }, { "epoch": 2.8875611615155763, "grad_norm": 0.8594111800193787, "learning_rate": 8.534355998887257e-06, "loss": 0.2021, "step": 29360 }, { "epoch": 2.888544663273586, "grad_norm": 0.2863863408565521, "learning_rate": 8.530381909947145e-06, "loss": 0.225, "step": 29370 }, { "epoch": 2.889528165031595, "grad_norm": 1.9358649253845215, "learning_rate": 8.526407821007034e-06, "loss": 0.2276, "step": 29380 }, { "epoch": 2.8905116667896045, "grad_norm": 0.4354264438152313, "learning_rate": 8.522433732066926e-06, "loss": 0.1729, "step": 29390 }, { "epoch": 2.891495168547614, "grad_norm": 2.1319658756256104, "learning_rate": 8.518459643126814e-06, "loss": 0.1549, "step": 29400 }, { "epoch": 2.892478670305623, "grad_norm": 3.555476188659668, "learning_rate": 8.514485554186703e-06, "loss": 0.1514, "step": 29410 }, { "epoch": 2.8934621720636327, "grad_norm": 0.7322472333908081, "learning_rate": 8.510511465246593e-06, "loss": 0.182, "step": 29420 }, { "epoch": 2.894445673821642, "grad_norm": 1.2825742959976196, "learning_rate": 8.506537376306483e-06, "loss": 0.2376, "step": 29430 }, { "epoch": 2.8954291755796513, "grad_norm": 1.2198630571365356, "learning_rate": 8.502563287366372e-06, "loss": 0.2935, "step": 29440 }, { "epoch": 2.896412677337661, "grad_norm": 0.12802644073963165, "learning_rate": 8.498589198426262e-06, "loss": 0.154, "step": 29450 }, { "epoch": 2.8973961790956704, "grad_norm": 1.0437980890274048, "learning_rate": 8.49461510948615e-06, "loss": 0.2054, "step": 29460 }, { "epoch": 2.8983796808536795, "grad_norm": 2.8305885791778564, "learning_rate": 8.490641020546041e-06, "loss": 0.2329, "step": 29470 }, { "epoch": 2.899363182611689, "grad_norm": 0.8940551280975342, "learning_rate": 8.486666931605931e-06, "loss": 0.1698, "step": 29480 }, { "epoch": 2.9003466843696986, "grad_norm": 1.2706600427627563, "learning_rate": 8.482692842665819e-06, "loss": 0.1952, "step": 29490 }, { "epoch": 2.9013301861277077, "grad_norm": 1.8138004541397095, "learning_rate": 8.478718753725709e-06, "loss": 0.2928, "step": 29500 }, { "epoch": 2.9013301861277077, "eval_loss": 0.12123703211545944, "eval_runtime": 17.8571, "eval_samples_per_second": 2.8, "eval_steps_per_second": 1.4, "step": 29500 }, { "epoch": 2.902313687885717, "grad_norm": 1.4499008655548096, "learning_rate": 8.4747446647856e-06, "loss": 0.211, "step": 29510 }, { "epoch": 2.9032971896437267, "grad_norm": 2.5677297115325928, "learning_rate": 8.470770575845488e-06, "loss": 0.2401, "step": 29520 }, { "epoch": 2.904280691401736, "grad_norm": 1.0985254049301147, "learning_rate": 8.466796486905378e-06, "loss": 0.1441, "step": 29530 }, { "epoch": 2.9052641931597454, "grad_norm": 1.7599356174468994, "learning_rate": 8.462822397965267e-06, "loss": 0.164, "step": 29540 }, { "epoch": 2.906247694917755, "grad_norm": 0.7811633944511414, "learning_rate": 8.458848309025157e-06, "loss": 0.2528, "step": 29550 }, { "epoch": 2.907231196675764, "grad_norm": 0.5994932055473328, "learning_rate": 8.454874220085047e-06, "loss": 0.3667, "step": 29560 }, { "epoch": 2.9082146984337736, "grad_norm": 0.3563966453075409, "learning_rate": 8.450900131144936e-06, "loss": 0.1834, "step": 29570 }, { "epoch": 2.909198200191783, "grad_norm": 3.5165040493011475, "learning_rate": 8.446926042204824e-06, "loss": 0.3832, "step": 29580 }, { "epoch": 2.910181701949792, "grad_norm": 2.4149057865142822, "learning_rate": 8.442951953264716e-06, "loss": 0.2664, "step": 29590 }, { "epoch": 2.9111652037078017, "grad_norm": 1.1386704444885254, "learning_rate": 8.438977864324605e-06, "loss": 0.1699, "step": 29600 }, { "epoch": 2.9121487054658113, "grad_norm": 3.486772298812866, "learning_rate": 8.435003775384493e-06, "loss": 0.2228, "step": 29610 }, { "epoch": 2.9131322072238204, "grad_norm": 0.9155600666999817, "learning_rate": 8.431029686444383e-06, "loss": 0.2772, "step": 29620 }, { "epoch": 2.91411570898183, "grad_norm": 0.644264817237854, "learning_rate": 8.427055597504274e-06, "loss": 0.2006, "step": 29630 }, { "epoch": 2.9150992107398395, "grad_norm": 0.813632607460022, "learning_rate": 8.423081508564162e-06, "loss": 0.2644, "step": 29640 }, { "epoch": 2.9160827124978486, "grad_norm": 1.260005235671997, "learning_rate": 8.419107419624052e-06, "loss": 0.132, "step": 29650 }, { "epoch": 2.917066214255858, "grad_norm": 2.324798107147217, "learning_rate": 8.415133330683942e-06, "loss": 0.2186, "step": 29660 }, { "epoch": 2.9180497160138676, "grad_norm": 0.573977530002594, "learning_rate": 8.411159241743831e-06, "loss": 0.184, "step": 29670 }, { "epoch": 2.9190332177718767, "grad_norm": 0.9063766598701477, "learning_rate": 8.407185152803721e-06, "loss": 0.1659, "step": 29680 }, { "epoch": 2.9200167195298863, "grad_norm": 0.5683373808860779, "learning_rate": 8.40321106386361e-06, "loss": 0.281, "step": 29690 }, { "epoch": 2.921000221287896, "grad_norm": 2.299201011657715, "learning_rate": 8.399236974923498e-06, "loss": 0.3064, "step": 29700 }, { "epoch": 2.921983723045905, "grad_norm": 1.1093686819076538, "learning_rate": 8.395262885983388e-06, "loss": 0.2682, "step": 29710 }, { "epoch": 2.9229672248039145, "grad_norm": 0.9980324506759644, "learning_rate": 8.39128879704328e-06, "loss": 0.1977, "step": 29720 }, { "epoch": 2.9239507265619236, "grad_norm": 0.5987958312034607, "learning_rate": 8.387314708103167e-06, "loss": 0.0887, "step": 29730 }, { "epoch": 2.924934228319933, "grad_norm": 1.2359176874160767, "learning_rate": 8.383340619163057e-06, "loss": 0.2705, "step": 29740 }, { "epoch": 2.9259177300779426, "grad_norm": 0.11103789508342743, "learning_rate": 8.379366530222947e-06, "loss": 0.1573, "step": 29750 }, { "epoch": 2.9269012318359517, "grad_norm": 1.2775187492370605, "learning_rate": 8.375392441282836e-06, "loss": 0.1728, "step": 29760 }, { "epoch": 2.9278847335939613, "grad_norm": 0.3069576323032379, "learning_rate": 8.371418352342726e-06, "loss": 0.2152, "step": 29770 }, { "epoch": 2.928868235351971, "grad_norm": 0.18963612616062164, "learning_rate": 8.367444263402616e-06, "loss": 0.1034, "step": 29780 }, { "epoch": 2.92985173710998, "grad_norm": 4.958619594573975, "learning_rate": 8.363470174462505e-06, "loss": 0.2122, "step": 29790 }, { "epoch": 2.9308352388679895, "grad_norm": 1.2674598693847656, "learning_rate": 8.359496085522395e-06, "loss": 0.1352, "step": 29800 }, { "epoch": 2.931818740625999, "grad_norm": 5.292276382446289, "learning_rate": 8.355521996582285e-06, "loss": 0.1786, "step": 29810 }, { "epoch": 2.932802242384008, "grad_norm": 5.913046360015869, "learning_rate": 8.351547907642173e-06, "loss": 0.2726, "step": 29820 }, { "epoch": 2.9337857441420176, "grad_norm": 1.4404199123382568, "learning_rate": 8.347573818702062e-06, "loss": 0.1817, "step": 29830 }, { "epoch": 2.934769245900027, "grad_norm": 0.4281212389469147, "learning_rate": 8.343599729761954e-06, "loss": 0.3099, "step": 29840 }, { "epoch": 2.9357527476580363, "grad_norm": 0.5780307054519653, "learning_rate": 8.339625640821842e-06, "loss": 0.1779, "step": 29850 }, { "epoch": 2.936736249416046, "grad_norm": 0.4119488000869751, "learning_rate": 8.335651551881731e-06, "loss": 0.2055, "step": 29860 }, { "epoch": 2.9377197511740554, "grad_norm": 2.0947678089141846, "learning_rate": 8.331677462941621e-06, "loss": 0.2263, "step": 29870 }, { "epoch": 2.9387032529320645, "grad_norm": 3.156257152557373, "learning_rate": 8.32770337400151e-06, "loss": 0.2569, "step": 29880 }, { "epoch": 2.939686754690074, "grad_norm": 1.3036227226257324, "learning_rate": 8.3237292850614e-06, "loss": 0.1654, "step": 29890 }, { "epoch": 2.9406702564480836, "grad_norm": 1.7258992195129395, "learning_rate": 8.31975519612129e-06, "loss": 0.2657, "step": 29900 }, { "epoch": 2.9416537582060927, "grad_norm": 0.7538513541221619, "learning_rate": 8.31578110718118e-06, "loss": 0.1335, "step": 29910 }, { "epoch": 2.942637259964102, "grad_norm": 0.5512486696243286, "learning_rate": 8.31180701824107e-06, "loss": 0.2118, "step": 29920 }, { "epoch": 2.9436207617221117, "grad_norm": 1.1244486570358276, "learning_rate": 8.307832929300959e-06, "loss": 0.1494, "step": 29930 }, { "epoch": 2.944604263480121, "grad_norm": 2.096811056137085, "learning_rate": 8.303858840360847e-06, "loss": 0.1748, "step": 29940 }, { "epoch": 2.9455877652381304, "grad_norm": 0.6595863103866577, "learning_rate": 8.299884751420737e-06, "loss": 0.2772, "step": 29950 }, { "epoch": 2.94657126699614, "grad_norm": 0.380372554063797, "learning_rate": 8.295910662480628e-06, "loss": 0.2749, "step": 29960 }, { "epoch": 2.947554768754149, "grad_norm": 1.3502016067504883, "learning_rate": 8.291936573540516e-06, "loss": 0.1691, "step": 29970 }, { "epoch": 2.9485382705121586, "grad_norm": 0.8366619348526001, "learning_rate": 8.287962484600406e-06, "loss": 0.1547, "step": 29980 }, { "epoch": 2.949521772270168, "grad_norm": 0.2709037959575653, "learning_rate": 8.283988395660295e-06, "loss": 0.156, "step": 29990 }, { "epoch": 2.950505274028177, "grad_norm": 0.7497245073318481, "learning_rate": 8.280014306720185e-06, "loss": 0.2401, "step": 30000 }, { "epoch": 2.950505274028177, "eval_loss": 0.12118250876665115, "eval_runtime": 20.871, "eval_samples_per_second": 2.396, "eval_steps_per_second": 1.198, "step": 30000 }, { "epoch": 2.9514887757861867, "grad_norm": 0.6676973700523376, "learning_rate": 8.276040217780075e-06, "loss": 0.2575, "step": 30010 }, { "epoch": 2.952472277544196, "grad_norm": 1.6974183320999146, "learning_rate": 8.272066128839964e-06, "loss": 0.2304, "step": 30020 }, { "epoch": 2.9534557793022054, "grad_norm": 1.2901015281677246, "learning_rate": 8.268092039899854e-06, "loss": 0.1907, "step": 30030 }, { "epoch": 2.954439281060215, "grad_norm": 2.2742185592651367, "learning_rate": 8.264117950959744e-06, "loss": 0.1403, "step": 30040 }, { "epoch": 2.955422782818224, "grad_norm": 0.535651445388794, "learning_rate": 8.260143862019633e-06, "loss": 0.2052, "step": 30050 }, { "epoch": 2.9564062845762336, "grad_norm": 1.6494063138961792, "learning_rate": 8.256169773079521e-06, "loss": 0.1801, "step": 30060 }, { "epoch": 2.957389786334243, "grad_norm": 2.04034161567688, "learning_rate": 8.252195684139411e-06, "loss": 0.2169, "step": 30070 }, { "epoch": 2.958373288092252, "grad_norm": 0.3735710680484772, "learning_rate": 8.248221595199302e-06, "loss": 0.1573, "step": 30080 }, { "epoch": 2.9593567898502617, "grad_norm": 0.9145519137382507, "learning_rate": 8.24424750625919e-06, "loss": 0.2631, "step": 30090 }, { "epoch": 2.9603402916082713, "grad_norm": 0.36900460720062256, "learning_rate": 8.24027341731908e-06, "loss": 0.1587, "step": 30100 }, { "epoch": 2.9613237933662804, "grad_norm": 1.695147156715393, "learning_rate": 8.23629932837897e-06, "loss": 0.2926, "step": 30110 }, { "epoch": 2.96230729512429, "grad_norm": 1.1021901369094849, "learning_rate": 8.23232523943886e-06, "loss": 0.1658, "step": 30120 }, { "epoch": 2.9632907968822995, "grad_norm": 2.1732630729675293, "learning_rate": 8.228351150498749e-06, "loss": 0.1942, "step": 30130 }, { "epoch": 2.9642742986403086, "grad_norm": 2.214512586593628, "learning_rate": 8.224377061558639e-06, "loss": 0.1876, "step": 30140 }, { "epoch": 2.965257800398318, "grad_norm": 0.7325847148895264, "learning_rate": 8.220402972618528e-06, "loss": 0.1788, "step": 30150 }, { "epoch": 2.9662413021563276, "grad_norm": 0.7839856743812561, "learning_rate": 8.216428883678418e-06, "loss": 0.195, "step": 30160 }, { "epoch": 2.9672248039143367, "grad_norm": 1.3128502368927002, "learning_rate": 8.212454794738308e-06, "loss": 0.3257, "step": 30170 }, { "epoch": 2.9682083056723463, "grad_norm": 0.9301101565361023, "learning_rate": 8.208480705798196e-06, "loss": 0.3217, "step": 30180 }, { "epoch": 2.969191807430356, "grad_norm": 1.5134687423706055, "learning_rate": 8.204506616858085e-06, "loss": 0.2426, "step": 30190 }, { "epoch": 2.970175309188365, "grad_norm": 0.48188215494155884, "learning_rate": 8.200532527917977e-06, "loss": 0.2246, "step": 30200 }, { "epoch": 2.9711588109463745, "grad_norm": 1.5620793104171753, "learning_rate": 8.196558438977865e-06, "loss": 0.2173, "step": 30210 }, { "epoch": 2.972142312704384, "grad_norm": 0.5938036441802979, "learning_rate": 8.192584350037754e-06, "loss": 0.1634, "step": 30220 }, { "epoch": 2.973125814462393, "grad_norm": 1.9721848964691162, "learning_rate": 8.188610261097644e-06, "loss": 0.1698, "step": 30230 }, { "epoch": 2.9741093162204026, "grad_norm": 1.3379566669464111, "learning_rate": 8.184636172157534e-06, "loss": 0.2143, "step": 30240 }, { "epoch": 2.975092817978412, "grad_norm": 2.263864517211914, "learning_rate": 8.180662083217423e-06, "loss": 0.1574, "step": 30250 }, { "epoch": 2.9760763197364213, "grad_norm": 1.6023887395858765, "learning_rate": 8.176687994277313e-06, "loss": 0.1478, "step": 30260 }, { "epoch": 2.977059821494431, "grad_norm": 0.4679398536682129, "learning_rate": 8.172713905337203e-06, "loss": 0.1792, "step": 30270 }, { "epoch": 2.9780433232524404, "grad_norm": 0.6919494271278381, "learning_rate": 8.168739816397092e-06, "loss": 0.2046, "step": 30280 }, { "epoch": 2.9790268250104495, "grad_norm": 0.637020468711853, "learning_rate": 8.164765727456982e-06, "loss": 0.1326, "step": 30290 }, { "epoch": 2.980010326768459, "grad_norm": 0.6778492331504822, "learning_rate": 8.16079163851687e-06, "loss": 0.1781, "step": 30300 }, { "epoch": 2.9809938285264685, "grad_norm": 1.518211007118225, "learning_rate": 8.15681754957676e-06, "loss": 0.2089, "step": 30310 }, { "epoch": 2.9819773302844776, "grad_norm": 0.9797000885009766, "learning_rate": 8.152843460636651e-06, "loss": 0.2146, "step": 30320 }, { "epoch": 2.982960832042487, "grad_norm": 1.643019199371338, "learning_rate": 8.148869371696539e-06, "loss": 0.3223, "step": 30330 }, { "epoch": 2.9839443338004967, "grad_norm": 1.33889639377594, "learning_rate": 8.144895282756429e-06, "loss": 0.1224, "step": 30340 }, { "epoch": 2.984927835558506, "grad_norm": 1.031048059463501, "learning_rate": 8.140921193816318e-06, "loss": 0.2173, "step": 30350 }, { "epoch": 2.9859113373165154, "grad_norm": 0.3998883068561554, "learning_rate": 8.136947104876208e-06, "loss": 0.1328, "step": 30360 }, { "epoch": 2.986894839074525, "grad_norm": 0.7423034906387329, "learning_rate": 8.132973015936098e-06, "loss": 0.2294, "step": 30370 }, { "epoch": 2.987878340832534, "grad_norm": 1.018033504486084, "learning_rate": 8.128998926995987e-06, "loss": 0.2343, "step": 30380 }, { "epoch": 2.9888618425905435, "grad_norm": 0.8161916732788086, "learning_rate": 8.125024838055877e-06, "loss": 0.3269, "step": 30390 }, { "epoch": 2.989845344348553, "grad_norm": 0.3255046308040619, "learning_rate": 8.121050749115767e-06, "loss": 0.2651, "step": 30400 }, { "epoch": 2.990828846106562, "grad_norm": 0.3091247081756592, "learning_rate": 8.117076660175656e-06, "loss": 0.1769, "step": 30410 }, { "epoch": 2.9918123478645717, "grad_norm": 0.1740453690290451, "learning_rate": 8.113102571235544e-06, "loss": 0.1343, "step": 30420 }, { "epoch": 2.9927958496225813, "grad_norm": 0.5746241807937622, "learning_rate": 8.109128482295434e-06, "loss": 0.2753, "step": 30430 }, { "epoch": 2.9937793513805904, "grad_norm": 0.605728805065155, "learning_rate": 8.105154393355325e-06, "loss": 0.2022, "step": 30440 }, { "epoch": 2.9947628531386, "grad_norm": 0.6295777559280396, "learning_rate": 8.101180304415213e-06, "loss": 0.1666, "step": 30450 }, { "epoch": 2.9957463548966095, "grad_norm": 0.697580873966217, "learning_rate": 8.097206215475103e-06, "loss": 0.1339, "step": 30460 }, { "epoch": 2.9967298566546186, "grad_norm": 0.7362121939659119, "learning_rate": 8.093232126534993e-06, "loss": 0.2896, "step": 30470 }, { "epoch": 2.997713358412628, "grad_norm": 0.18402792513370514, "learning_rate": 8.089258037594882e-06, "loss": 0.1232, "step": 30480 }, { "epoch": 2.9986968601706376, "grad_norm": 1.2359050512313843, "learning_rate": 8.085283948654772e-06, "loss": 0.2461, "step": 30490 }, { "epoch": 2.9996803619286467, "grad_norm": 0.38628000020980835, "learning_rate": 8.081309859714662e-06, "loss": 0.1896, "step": 30500 }, { "epoch": 2.9996803619286467, "eval_loss": 0.1228804811835289, "eval_runtime": 17.7254, "eval_samples_per_second": 2.821, "eval_steps_per_second": 1.41, "step": 30500 }, { "epoch": 3.0006638636866563, "grad_norm": 1.5436768531799316, "learning_rate": 8.077335770774551e-06, "loss": 0.235, "step": 30510 }, { "epoch": 3.001647365444666, "grad_norm": 0.49330833554267883, "learning_rate": 8.073361681834439e-06, "loss": 0.1425, "step": 30520 }, { "epoch": 3.002630867202675, "grad_norm": 0.7838341593742371, "learning_rate": 8.06938759289433e-06, "loss": 0.1806, "step": 30530 }, { "epoch": 3.0036143689606845, "grad_norm": 0.5121620893478394, "learning_rate": 8.065413503954218e-06, "loss": 0.1986, "step": 30540 }, { "epoch": 3.004597870718694, "grad_norm": 1.0584651231765747, "learning_rate": 8.061439415014108e-06, "loss": 0.2191, "step": 30550 }, { "epoch": 3.005581372476703, "grad_norm": 2.7569994926452637, "learning_rate": 8.057465326073998e-06, "loss": 0.1805, "step": 30560 }, { "epoch": 3.0065648742347126, "grad_norm": 1.0442713499069214, "learning_rate": 8.053491237133887e-06, "loss": 0.3119, "step": 30570 }, { "epoch": 3.007548375992722, "grad_norm": 0.8963764309883118, "learning_rate": 8.049517148193777e-06, "loss": 0.1806, "step": 30580 }, { "epoch": 3.0085318777507313, "grad_norm": 1.6401909589767456, "learning_rate": 8.045543059253667e-06, "loss": 0.2019, "step": 30590 }, { "epoch": 3.009515379508741, "grad_norm": 0.45249977707862854, "learning_rate": 8.041568970313556e-06, "loss": 0.1886, "step": 30600 }, { "epoch": 3.0104988812667504, "grad_norm": 0.792337954044342, "learning_rate": 8.037594881373446e-06, "loss": 0.2098, "step": 30610 }, { "epoch": 3.0114823830247595, "grad_norm": 0.5009030103683472, "learning_rate": 8.033620792433336e-06, "loss": 0.2774, "step": 30620 }, { "epoch": 3.012465884782769, "grad_norm": 1.3158507347106934, "learning_rate": 8.029646703493225e-06, "loss": 0.1768, "step": 30630 }, { "epoch": 3.0134493865407785, "grad_norm": 0.5353335738182068, "learning_rate": 8.025672614553113e-06, "loss": 0.3349, "step": 30640 }, { "epoch": 3.0144328882987876, "grad_norm": 0.9227167367935181, "learning_rate": 8.021698525613005e-06, "loss": 0.2027, "step": 30650 }, { "epoch": 3.015416390056797, "grad_norm": 1.4296085834503174, "learning_rate": 8.017724436672893e-06, "loss": 0.1577, "step": 30660 }, { "epoch": 3.0163998918148067, "grad_norm": 1.0844863653182983, "learning_rate": 8.013750347732782e-06, "loss": 0.1652, "step": 30670 }, { "epoch": 3.017383393572816, "grad_norm": 0.8566285371780396, "learning_rate": 8.009776258792672e-06, "loss": 0.2387, "step": 30680 }, { "epoch": 3.0183668953308254, "grad_norm": 0.4086707830429077, "learning_rate": 8.005802169852562e-06, "loss": 0.2151, "step": 30690 }, { "epoch": 3.019350397088835, "grad_norm": 0.4113174080848694, "learning_rate": 8.001828080912451e-06, "loss": 0.1869, "step": 30700 }, { "epoch": 3.020333898846844, "grad_norm": 0.35407760739326477, "learning_rate": 7.997853991972341e-06, "loss": 0.1761, "step": 30710 }, { "epoch": 3.0213174006048535, "grad_norm": 0.7007184624671936, "learning_rate": 7.99387990303223e-06, "loss": 0.2157, "step": 30720 }, { "epoch": 3.022300902362863, "grad_norm": 0.623153567314148, "learning_rate": 7.98990581409212e-06, "loss": 0.2404, "step": 30730 }, { "epoch": 3.023284404120872, "grad_norm": 0.4026603698730469, "learning_rate": 7.98593172515201e-06, "loss": 0.2679, "step": 30740 }, { "epoch": 3.0242679058788817, "grad_norm": 0.36850112676620483, "learning_rate": 7.9819576362119e-06, "loss": 0.1855, "step": 30750 }, { "epoch": 3.0252514076368913, "grad_norm": 0.7698309421539307, "learning_rate": 7.977983547271788e-06, "loss": 0.2256, "step": 30760 }, { "epoch": 3.0262349093949004, "grad_norm": 2.0218682289123535, "learning_rate": 7.974009458331679e-06, "loss": 0.1395, "step": 30770 }, { "epoch": 3.02721841115291, "grad_norm": 0.40838736295700073, "learning_rate": 7.970035369391567e-06, "loss": 0.1798, "step": 30780 }, { "epoch": 3.0282019129109194, "grad_norm": 0.21284779906272888, "learning_rate": 7.966061280451457e-06, "loss": 0.1356, "step": 30790 }, { "epoch": 3.0291854146689285, "grad_norm": 0.8340510725975037, "learning_rate": 7.962087191511346e-06, "loss": 0.287, "step": 30800 }, { "epoch": 3.030168916426938, "grad_norm": 0.8605313301086426, "learning_rate": 7.958113102571236e-06, "loss": 0.1405, "step": 30810 }, { "epoch": 3.0311524181849476, "grad_norm": 1.0435175895690918, "learning_rate": 7.954139013631126e-06, "loss": 0.2611, "step": 30820 }, { "epoch": 3.0321359199429567, "grad_norm": 0.8442772626876831, "learning_rate": 7.950164924691015e-06, "loss": 0.2721, "step": 30830 }, { "epoch": 3.0331194217009663, "grad_norm": 1.565529704093933, "learning_rate": 7.946190835750905e-06, "loss": 0.1798, "step": 30840 }, { "epoch": 3.034102923458976, "grad_norm": 0.5709851384162903, "learning_rate": 7.942216746810795e-06, "loss": 0.1672, "step": 30850 }, { "epoch": 3.035086425216985, "grad_norm": 0.4600856304168701, "learning_rate": 7.938242657870684e-06, "loss": 0.2025, "step": 30860 }, { "epoch": 3.0360699269749944, "grad_norm": 0.6264340877532959, "learning_rate": 7.934268568930574e-06, "loss": 0.1344, "step": 30870 }, { "epoch": 3.037053428733004, "grad_norm": 1.8087103366851807, "learning_rate": 7.930294479990462e-06, "loss": 0.2786, "step": 30880 }, { "epoch": 3.038036930491013, "grad_norm": 3.1393840312957764, "learning_rate": 7.926320391050353e-06, "loss": 0.2822, "step": 30890 }, { "epoch": 3.0390204322490226, "grad_norm": 0.3457108438014984, "learning_rate": 7.922346302110241e-06, "loss": 0.2511, "step": 30900 }, { "epoch": 3.040003934007032, "grad_norm": 0.5391051769256592, "learning_rate": 7.918372213170131e-06, "loss": 0.1501, "step": 30910 }, { "epoch": 3.0409874357650413, "grad_norm": 0.30457252264022827, "learning_rate": 7.91439812423002e-06, "loss": 0.1377, "step": 30920 }, { "epoch": 3.041970937523051, "grad_norm": 0.6017821431159973, "learning_rate": 7.91042403528991e-06, "loss": 0.2246, "step": 30930 }, { "epoch": 3.0429544392810604, "grad_norm": 1.6097629070281982, "learning_rate": 7.9064499463498e-06, "loss": 0.2797, "step": 30940 }, { "epoch": 3.0439379410390695, "grad_norm": 1.6000497341156006, "learning_rate": 7.90247585740969e-06, "loss": 0.2186, "step": 30950 }, { "epoch": 3.044921442797079, "grad_norm": 0.5865890383720398, "learning_rate": 7.89850176846958e-06, "loss": 0.1514, "step": 30960 }, { "epoch": 3.0459049445550885, "grad_norm": 1.072675108909607, "learning_rate": 7.894527679529469e-06, "loss": 0.2242, "step": 30970 }, { "epoch": 3.0468884463130976, "grad_norm": 0.5994366407394409, "learning_rate": 7.890553590589359e-06, "loss": 0.2024, "step": 30980 }, { "epoch": 3.047871948071107, "grad_norm": 0.466334730386734, "learning_rate": 7.886579501649248e-06, "loss": 0.1856, "step": 30990 }, { "epoch": 3.0488554498291167, "grad_norm": 3.52630615234375, "learning_rate": 7.882605412709136e-06, "loss": 0.1959, "step": 31000 }, { "epoch": 3.0488554498291167, "eval_loss": 0.11854755133390427, "eval_runtime": 16.899, "eval_samples_per_second": 2.959, "eval_steps_per_second": 1.479, "step": 31000 }, { "epoch": 3.049838951587126, "grad_norm": 1.0482897758483887, "learning_rate": 7.878631323769028e-06, "loss": 0.2095, "step": 31010 }, { "epoch": 3.0508224533451354, "grad_norm": 1.1640293598175049, "learning_rate": 7.874657234828916e-06, "loss": 0.2858, "step": 31020 }, { "epoch": 3.051805955103145, "grad_norm": 0.5311436653137207, "learning_rate": 7.870683145888805e-06, "loss": 0.1531, "step": 31030 }, { "epoch": 3.052789456861154, "grad_norm": 0.36933091282844543, "learning_rate": 7.866709056948695e-06, "loss": 0.2279, "step": 31040 }, { "epoch": 3.0537729586191635, "grad_norm": 0.9239271879196167, "learning_rate": 7.862734968008585e-06, "loss": 0.1999, "step": 31050 }, { "epoch": 3.054756460377173, "grad_norm": 0.23407602310180664, "learning_rate": 7.858760879068474e-06, "loss": 0.2136, "step": 31060 }, { "epoch": 3.055739962135182, "grad_norm": 0.6105637550354004, "learning_rate": 7.854786790128364e-06, "loss": 0.2174, "step": 31070 }, { "epoch": 3.0567234638931917, "grad_norm": 0.5166285634040833, "learning_rate": 7.850812701188254e-06, "loss": 0.182, "step": 31080 }, { "epoch": 3.0577069656512013, "grad_norm": 1.0382561683654785, "learning_rate": 7.846838612248143e-06, "loss": 0.2865, "step": 31090 }, { "epoch": 3.0586904674092104, "grad_norm": 0.04259077087044716, "learning_rate": 7.842864523308033e-06, "loss": 0.2629, "step": 31100 }, { "epoch": 3.05967396916722, "grad_norm": 0.3611224889755249, "learning_rate": 7.838890434367923e-06, "loss": 0.1513, "step": 31110 }, { "epoch": 3.0606574709252294, "grad_norm": 1.518552303314209, "learning_rate": 7.83491634542781e-06, "loss": 0.203, "step": 31120 }, { "epoch": 3.0616409726832385, "grad_norm": 0.2685289680957794, "learning_rate": 7.830942256487702e-06, "loss": 0.1509, "step": 31130 }, { "epoch": 3.062624474441248, "grad_norm": 0.5196729898452759, "learning_rate": 7.82696816754759e-06, "loss": 0.171, "step": 31140 }, { "epoch": 3.0636079761992576, "grad_norm": 0.7608104944229126, "learning_rate": 7.82299407860748e-06, "loss": 0.2358, "step": 31150 }, { "epoch": 3.0645914779572667, "grad_norm": 0.5399700403213501, "learning_rate": 7.81901998966737e-06, "loss": 0.1741, "step": 31160 }, { "epoch": 3.0655749797152763, "grad_norm": 0.7594351172447205, "learning_rate": 7.815045900727259e-06, "loss": 0.1544, "step": 31170 }, { "epoch": 3.066558481473286, "grad_norm": 0.9618496298789978, "learning_rate": 7.811071811787149e-06, "loss": 0.1786, "step": 31180 }, { "epoch": 3.067541983231295, "grad_norm": 0.7477512359619141, "learning_rate": 7.807097722847038e-06, "loss": 0.1425, "step": 31190 }, { "epoch": 3.0685254849893044, "grad_norm": 0.30735233426094055, "learning_rate": 7.803123633906928e-06, "loss": 0.2515, "step": 31200 }, { "epoch": 3.069508986747314, "grad_norm": 0.758304238319397, "learning_rate": 7.799149544966818e-06, "loss": 0.1892, "step": 31210 }, { "epoch": 3.070492488505323, "grad_norm": 0.4117341637611389, "learning_rate": 7.795175456026707e-06, "loss": 0.1546, "step": 31220 }, { "epoch": 3.0714759902633326, "grad_norm": 1.254814863204956, "learning_rate": 7.791201367086597e-06, "loss": 0.1797, "step": 31230 }, { "epoch": 3.072459492021342, "grad_norm": 1.452399492263794, "learning_rate": 7.787227278146485e-06, "loss": 0.2549, "step": 31240 }, { "epoch": 3.0734429937793513, "grad_norm": 0.6258145570755005, "learning_rate": 7.783253189206376e-06, "loss": 0.1528, "step": 31250 }, { "epoch": 3.074426495537361, "grad_norm": 1.303449273109436, "learning_rate": 7.779279100266264e-06, "loss": 0.1748, "step": 31260 }, { "epoch": 3.0754099972953703, "grad_norm": 1.0126678943634033, "learning_rate": 7.775305011326154e-06, "loss": 0.2219, "step": 31270 }, { "epoch": 3.0763934990533794, "grad_norm": 0.4630603790283203, "learning_rate": 7.771330922386043e-06, "loss": 0.1699, "step": 31280 }, { "epoch": 3.077377000811389, "grad_norm": 1.1373541355133057, "learning_rate": 7.767356833445933e-06, "loss": 0.2667, "step": 31290 }, { "epoch": 3.0783605025693985, "grad_norm": 3.100982427597046, "learning_rate": 7.763382744505823e-06, "loss": 0.1706, "step": 31300 }, { "epoch": 3.0793440043274076, "grad_norm": 0.4552260637283325, "learning_rate": 7.759408655565712e-06, "loss": 0.1831, "step": 31310 }, { "epoch": 3.080327506085417, "grad_norm": 2.0742850303649902, "learning_rate": 7.755434566625602e-06, "loss": 0.1867, "step": 31320 }, { "epoch": 3.0813110078434267, "grad_norm": 0.5648025870323181, "learning_rate": 7.75146047768549e-06, "loss": 0.179, "step": 31330 }, { "epoch": 3.082294509601436, "grad_norm": 0.28768691420555115, "learning_rate": 7.747486388745381e-06, "loss": 0.1708, "step": 31340 }, { "epoch": 3.0832780113594453, "grad_norm": 0.522784411907196, "learning_rate": 7.743512299805271e-06, "loss": 0.192, "step": 31350 }, { "epoch": 3.084261513117455, "grad_norm": 0.7258384227752686, "learning_rate": 7.739538210865159e-06, "loss": 0.2074, "step": 31360 }, { "epoch": 3.085245014875464, "grad_norm": 1.0599561929702759, "learning_rate": 7.735564121925049e-06, "loss": 0.1306, "step": 31370 }, { "epoch": 3.0862285166334735, "grad_norm": 0.6479315161705017, "learning_rate": 7.731590032984938e-06, "loss": 0.3329, "step": 31380 }, { "epoch": 3.087212018391483, "grad_norm": 1.6357970237731934, "learning_rate": 7.727615944044828e-06, "loss": 0.2368, "step": 31390 }, { "epoch": 3.088195520149492, "grad_norm": 1.8428471088409424, "learning_rate": 7.723641855104718e-06, "loss": 0.1648, "step": 31400 }, { "epoch": 3.0891790219075017, "grad_norm": 0.43346959352493286, "learning_rate": 7.719667766164607e-06, "loss": 0.1035, "step": 31410 }, { "epoch": 3.0901625236655113, "grad_norm": 1.0585253238677979, "learning_rate": 7.715693677224497e-06, "loss": 0.1864, "step": 31420 }, { "epoch": 3.0911460254235203, "grad_norm": 0.7270784974098206, "learning_rate": 7.711719588284387e-06, "loss": 0.1715, "step": 31430 }, { "epoch": 3.09212952718153, "grad_norm": 0.19803878664970398, "learning_rate": 7.707745499344276e-06, "loss": 0.1547, "step": 31440 }, { "epoch": 3.0931130289395394, "grad_norm": 0.36690932512283325, "learning_rate": 7.703771410404164e-06, "loss": 0.2647, "step": 31450 }, { "epoch": 3.0940965306975485, "grad_norm": 0.42811933159828186, "learning_rate": 7.699797321464056e-06, "loss": 0.2242, "step": 31460 }, { "epoch": 3.095080032455558, "grad_norm": 0.3193596303462982, "learning_rate": 7.695823232523945e-06, "loss": 0.1819, "step": 31470 }, { "epoch": 3.0960635342135676, "grad_norm": 0.5629414319992065, "learning_rate": 7.691849143583833e-06, "loss": 0.2528, "step": 31480 }, { "epoch": 3.0970470359715767, "grad_norm": 0.21826787292957306, "learning_rate": 7.687875054643723e-06, "loss": 0.1463, "step": 31490 }, { "epoch": 3.0980305377295863, "grad_norm": 1.1460973024368286, "learning_rate": 7.683900965703613e-06, "loss": 0.1669, "step": 31500 }, { "epoch": 3.0980305377295863, "eval_loss": 0.1169167309999466, "eval_runtime": 18.3179, "eval_samples_per_second": 2.73, "eval_steps_per_second": 1.365, "step": 31500 }, { "epoch": 3.099014039487596, "grad_norm": 1.028279423713684, "learning_rate": 7.679926876763502e-06, "loss": 0.2178, "step": 31510 }, { "epoch": 3.099997541245605, "grad_norm": 0.3888244330883026, "learning_rate": 7.675952787823392e-06, "loss": 0.1574, "step": 31520 }, { "epoch": 3.1009810430036144, "grad_norm": 0.4609070420265198, "learning_rate": 7.671978698883282e-06, "loss": 0.1974, "step": 31530 }, { "epoch": 3.101964544761624, "grad_norm": 0.3917083740234375, "learning_rate": 7.668004609943171e-06, "loss": 0.2735, "step": 31540 }, { "epoch": 3.102948046519633, "grad_norm": 1.1195194721221924, "learning_rate": 7.664030521003061e-06, "loss": 0.1768, "step": 31550 }, { "epoch": 3.1039315482776426, "grad_norm": 0.25413259863853455, "learning_rate": 7.66005643206295e-06, "loss": 0.187, "step": 31560 }, { "epoch": 3.104915050035652, "grad_norm": 0.1355896294116974, "learning_rate": 7.656082343122839e-06, "loss": 0.0985, "step": 31570 }, { "epoch": 3.1058985517936613, "grad_norm": 0.3639681339263916, "learning_rate": 7.65210825418273e-06, "loss": 0.2879, "step": 31580 }, { "epoch": 3.106882053551671, "grad_norm": 0.13450194895267487, "learning_rate": 7.64813416524262e-06, "loss": 0.1446, "step": 31590 }, { "epoch": 3.1078655553096803, "grad_norm": 30.96489715576172, "learning_rate": 7.644160076302508e-06, "loss": 0.1567, "step": 31600 }, { "epoch": 3.1088490570676894, "grad_norm": 1.1467623710632324, "learning_rate": 7.640185987362397e-06, "loss": 0.1917, "step": 31610 }, { "epoch": 3.109832558825699, "grad_norm": 2.060237407684326, "learning_rate": 7.636211898422287e-06, "loss": 0.1424, "step": 31620 }, { "epoch": 3.1108160605837085, "grad_norm": 0.38937437534332275, "learning_rate": 7.632237809482177e-06, "loss": 0.1967, "step": 31630 }, { "epoch": 3.1117995623417176, "grad_norm": 0.9515398740768433, "learning_rate": 7.628263720542066e-06, "loss": 0.1264, "step": 31640 }, { "epoch": 3.112783064099727, "grad_norm": 0.6877108216285706, "learning_rate": 7.624289631601955e-06, "loss": 0.1955, "step": 31650 }, { "epoch": 3.1137665658577363, "grad_norm": 0.5784842371940613, "learning_rate": 7.620315542661846e-06, "loss": 0.1444, "step": 31660 }, { "epoch": 3.114750067615746, "grad_norm": 1.5829799175262451, "learning_rate": 7.616341453721735e-06, "loss": 0.2132, "step": 31670 }, { "epoch": 3.1157335693737553, "grad_norm": 0.7843524217605591, "learning_rate": 7.612367364781624e-06, "loss": 0.1878, "step": 31680 }, { "epoch": 3.1167170711317644, "grad_norm": 1.2282633781433105, "learning_rate": 7.608393275841514e-06, "loss": 0.2217, "step": 31690 }, { "epoch": 3.117700572889774, "grad_norm": 0.3797919750213623, "learning_rate": 7.6044191869014035e-06, "loss": 0.119, "step": 31700 }, { "epoch": 3.1186840746477835, "grad_norm": 2.6364216804504395, "learning_rate": 7.600445097961293e-06, "loss": 0.2002, "step": 31710 }, { "epoch": 3.1196675764057926, "grad_norm": 1.5436702966690063, "learning_rate": 7.596471009021183e-06, "loss": 0.1477, "step": 31720 }, { "epoch": 3.120651078163802, "grad_norm": 0.4043786823749542, "learning_rate": 7.592496920081072e-06, "loss": 0.2011, "step": 31730 }, { "epoch": 3.1216345799218117, "grad_norm": 0.8174799084663391, "learning_rate": 7.588522831140962e-06, "loss": 0.1601, "step": 31740 }, { "epoch": 3.122618081679821, "grad_norm": 0.34667864441871643, "learning_rate": 7.584548742200851e-06, "loss": 0.1948, "step": 31750 }, { "epoch": 3.1236015834378303, "grad_norm": 0.3857724070549011, "learning_rate": 7.580574653260741e-06, "loss": 0.118, "step": 31760 }, { "epoch": 3.12458508519584, "grad_norm": 1.493760347366333, "learning_rate": 7.5766005643206294e-06, "loss": 0.2103, "step": 31770 }, { "epoch": 3.125568586953849, "grad_norm": 1.1172994375228882, "learning_rate": 7.57262647538052e-06, "loss": 0.1778, "step": 31780 }, { "epoch": 3.1265520887118585, "grad_norm": 0.6166003346443176, "learning_rate": 7.56865238644041e-06, "loss": 0.152, "step": 31790 }, { "epoch": 3.127535590469868, "grad_norm": 0.5556543469429016, "learning_rate": 7.5646782975002984e-06, "loss": 0.2136, "step": 31800 }, { "epoch": 3.128519092227877, "grad_norm": 1.0468389987945557, "learning_rate": 7.560704208560188e-06, "loss": 0.2625, "step": 31810 }, { "epoch": 3.1295025939858867, "grad_norm": 0.9141632914543152, "learning_rate": 7.556730119620078e-06, "loss": 0.1199, "step": 31820 }, { "epoch": 3.1304860957438962, "grad_norm": 1.8059278726577759, "learning_rate": 7.5527560306799674e-06, "loss": 0.1591, "step": 31830 }, { "epoch": 3.1314695975019053, "grad_norm": 0.9291358590126038, "learning_rate": 7.548781941739857e-06, "loss": 0.1537, "step": 31840 }, { "epoch": 3.132453099259915, "grad_norm": 0.43296074867248535, "learning_rate": 7.544807852799746e-06, "loss": 0.1286, "step": 31850 }, { "epoch": 3.1334366010179244, "grad_norm": 0.7689695954322815, "learning_rate": 7.5408337638596364e-06, "loss": 0.1265, "step": 31860 }, { "epoch": 3.1344201027759335, "grad_norm": 1.3215502500534058, "learning_rate": 7.536859674919525e-06, "loss": 0.1528, "step": 31870 }, { "epoch": 3.135403604533943, "grad_norm": 0.22979998588562012, "learning_rate": 7.532885585979415e-06, "loss": 0.2016, "step": 31880 }, { "epoch": 3.1363871062919526, "grad_norm": 1.030752182006836, "learning_rate": 7.528911497039304e-06, "loss": 0.1128, "step": 31890 }, { "epoch": 3.1373706080499617, "grad_norm": 2.3236682415008545, "learning_rate": 7.524937408099194e-06, "loss": 0.1911, "step": 31900 }, { "epoch": 3.1383541098079712, "grad_norm": 0.23360274732112885, "learning_rate": 7.520963319159084e-06, "loss": 0.1864, "step": 31910 }, { "epoch": 3.139337611565981, "grad_norm": 1.6743212938308716, "learning_rate": 7.516989230218973e-06, "loss": 0.1219, "step": 31920 }, { "epoch": 3.14032111332399, "grad_norm": 0.8622710704803467, "learning_rate": 7.513015141278862e-06, "loss": 0.2372, "step": 31930 }, { "epoch": 3.1413046150819994, "grad_norm": 0.4087425172328949, "learning_rate": 7.509041052338752e-06, "loss": 0.1996, "step": 31940 }, { "epoch": 3.142288116840009, "grad_norm": 1.8322125673294067, "learning_rate": 7.505066963398642e-06, "loss": 0.2281, "step": 31950 }, { "epoch": 3.143271618598018, "grad_norm": 2.583505392074585, "learning_rate": 7.501092874458531e-06, "loss": 0.1595, "step": 31960 }, { "epoch": 3.1442551203560276, "grad_norm": 0.5924628376960754, "learning_rate": 7.49711878551842e-06, "loss": 0.1901, "step": 31970 }, { "epoch": 3.145238622114037, "grad_norm": 1.7466529607772827, "learning_rate": 7.493144696578311e-06, "loss": 0.1545, "step": 31980 }, { "epoch": 3.1462221238720463, "grad_norm": 1.0587272644042969, "learning_rate": 7.4891706076381995e-06, "loss": 0.1402, "step": 31990 }, { "epoch": 3.147205625630056, "grad_norm": 1.4738565683364868, "learning_rate": 7.485196518698089e-06, "loss": 0.2065, "step": 32000 }, { "epoch": 3.147205625630056, "eval_loss": 0.11695942282676697, "eval_runtime": 16.6238, "eval_samples_per_second": 3.008, "eval_steps_per_second": 1.504, "step": 32000 }, { "epoch": 3.1481891273880653, "grad_norm": 1.245446801185608, "learning_rate": 7.481222429757978e-06, "loss": 0.1854, "step": 32010 }, { "epoch": 3.1491726291460744, "grad_norm": 1.756117343902588, "learning_rate": 7.4772483408178685e-06, "loss": 0.2727, "step": 32020 }, { "epoch": 3.150156130904084, "grad_norm": 0.3403267562389374, "learning_rate": 7.473274251877758e-06, "loss": 0.2097, "step": 32030 }, { "epoch": 3.1511396326620935, "grad_norm": 0.45989128947257996, "learning_rate": 7.469300162937647e-06, "loss": 0.1552, "step": 32040 }, { "epoch": 3.1521231344201026, "grad_norm": 1.0242866277694702, "learning_rate": 7.465326073997537e-06, "loss": 0.2169, "step": 32050 }, { "epoch": 3.153106636178112, "grad_norm": 0.25825977325439453, "learning_rate": 7.461351985057426e-06, "loss": 0.1225, "step": 32060 }, { "epoch": 3.1540901379361217, "grad_norm": 1.4743444919586182, "learning_rate": 7.457377896117316e-06, "loss": 0.2738, "step": 32070 }, { "epoch": 3.155073639694131, "grad_norm": 1.8272091150283813, "learning_rate": 7.453403807177206e-06, "loss": 0.2369, "step": 32080 }, { "epoch": 3.1560571414521403, "grad_norm": 0.8361017107963562, "learning_rate": 7.4494297182370945e-06, "loss": 0.1716, "step": 32090 }, { "epoch": 3.15704064321015, "grad_norm": 0.7447807788848877, "learning_rate": 7.445455629296984e-06, "loss": 0.1722, "step": 32100 }, { "epoch": 3.158024144968159, "grad_norm": 3.4965121746063232, "learning_rate": 7.441481540356874e-06, "loss": 0.2161, "step": 32110 }, { "epoch": 3.1590076467261685, "grad_norm": 2.7191264629364014, "learning_rate": 7.4375074514167635e-06, "loss": 0.1452, "step": 32120 }, { "epoch": 3.159991148484178, "grad_norm": 1.3699681758880615, "learning_rate": 7.433533362476652e-06, "loss": 0.2319, "step": 32130 }, { "epoch": 3.160974650242187, "grad_norm": 0.4341868460178375, "learning_rate": 7.429559273536542e-06, "loss": 0.1768, "step": 32140 }, { "epoch": 3.1619581520001967, "grad_norm": 0.7707095146179199, "learning_rate": 7.4255851845964325e-06, "loss": 0.1715, "step": 32150 }, { "epoch": 3.1629416537582062, "grad_norm": 0.3822473883628845, "learning_rate": 7.421611095656321e-06, "loss": 0.1659, "step": 32160 }, { "epoch": 3.1639251555162153, "grad_norm": 1.2275961637496948, "learning_rate": 7.417637006716211e-06, "loss": 0.2094, "step": 32170 }, { "epoch": 3.164908657274225, "grad_norm": 1.0981230735778809, "learning_rate": 7.4136629177761e-06, "loss": 0.1301, "step": 32180 }, { "epoch": 3.1658921590322344, "grad_norm": 0.4137146770954132, "learning_rate": 7.40968882883599e-06, "loss": 0.1893, "step": 32190 }, { "epoch": 3.1668756607902435, "grad_norm": 0.7313489317893982, "learning_rate": 7.40571473989588e-06, "loss": 0.2226, "step": 32200 }, { "epoch": 3.167859162548253, "grad_norm": 0.5336371064186096, "learning_rate": 7.401740650955769e-06, "loss": 0.1276, "step": 32210 }, { "epoch": 3.1688426643062626, "grad_norm": 0.16569511592388153, "learning_rate": 7.3977665620156584e-06, "loss": 0.1372, "step": 32220 }, { "epoch": 3.1698261660642717, "grad_norm": 0.27265745401382446, "learning_rate": 7.393792473075548e-06, "loss": 0.3227, "step": 32230 }, { "epoch": 3.1708096678222812, "grad_norm": 1.2933647632598877, "learning_rate": 7.389818384135438e-06, "loss": 0.163, "step": 32240 }, { "epoch": 3.171793169580291, "grad_norm": 2.2621102333068848, "learning_rate": 7.3858442951953266e-06, "loss": 0.1644, "step": 32250 }, { "epoch": 3.1727766713383, "grad_norm": 0.5217112898826599, "learning_rate": 7.381870206255216e-06, "loss": 0.2593, "step": 32260 }, { "epoch": 3.1737601730963094, "grad_norm": 0.8894764184951782, "learning_rate": 7.377896117315107e-06, "loss": 0.1204, "step": 32270 }, { "epoch": 3.174743674854319, "grad_norm": 1.873124361038208, "learning_rate": 7.3739220283749956e-06, "loss": 0.2319, "step": 32280 }, { "epoch": 3.175727176612328, "grad_norm": 0.8186569213867188, "learning_rate": 7.369947939434885e-06, "loss": 0.1459, "step": 32290 }, { "epoch": 3.1767106783703376, "grad_norm": 0.42005041241645813, "learning_rate": 7.365973850494774e-06, "loss": 0.159, "step": 32300 }, { "epoch": 3.177694180128347, "grad_norm": 1.3418946266174316, "learning_rate": 7.3619997615546646e-06, "loss": 0.2799, "step": 32310 }, { "epoch": 3.1786776818863562, "grad_norm": 1.168924331665039, "learning_rate": 7.358025672614554e-06, "loss": 0.2939, "step": 32320 }, { "epoch": 3.179661183644366, "grad_norm": 0.21440809965133667, "learning_rate": 7.354051583674443e-06, "loss": 0.0732, "step": 32330 }, { "epoch": 3.1806446854023753, "grad_norm": 1.0417389869689941, "learning_rate": 7.350077494734333e-06, "loss": 0.1937, "step": 32340 }, { "epoch": 3.1816281871603844, "grad_norm": 0.9816319346427917, "learning_rate": 7.346103405794222e-06, "loss": 0.1935, "step": 32350 }, { "epoch": 3.182611688918394, "grad_norm": 1.8569186925888062, "learning_rate": 7.342129316854112e-06, "loss": 0.169, "step": 32360 }, { "epoch": 3.1835951906764035, "grad_norm": 1.4092768430709839, "learning_rate": 7.338155227914001e-06, "loss": 0.2134, "step": 32370 }, { "epoch": 3.1845786924344126, "grad_norm": 0.5531893372535706, "learning_rate": 7.3341811389738905e-06, "loss": 0.2163, "step": 32380 }, { "epoch": 3.185562194192422, "grad_norm": 1.8517935276031494, "learning_rate": 7.330207050033781e-06, "loss": 0.1831, "step": 32390 }, { "epoch": 3.1865456959504317, "grad_norm": 1.3389599323272705, "learning_rate": 7.32623296109367e-06, "loss": 0.1613, "step": 32400 }, { "epoch": 3.187529197708441, "grad_norm": 0.3436429798603058, "learning_rate": 7.3222588721535595e-06, "loss": 0.2205, "step": 32410 }, { "epoch": 3.1885126994664503, "grad_norm": 1.0781066417694092, "learning_rate": 7.318284783213448e-06, "loss": 0.121, "step": 32420 }, { "epoch": 3.18949620122446, "grad_norm": 0.9889883995056152, "learning_rate": 7.314310694273339e-06, "loss": 0.1683, "step": 32430 }, { "epoch": 3.190479702982469, "grad_norm": 2.1204090118408203, "learning_rate": 7.310336605333228e-06, "loss": 0.2273, "step": 32440 }, { "epoch": 3.1914632047404785, "grad_norm": 0.8797752857208252, "learning_rate": 7.306362516393117e-06, "loss": 0.1523, "step": 32450 }, { "epoch": 3.192446706498488, "grad_norm": 1.7521774768829346, "learning_rate": 7.302388427453007e-06, "loss": 0.2407, "step": 32460 }, { "epoch": 3.193430208256497, "grad_norm": 1.2867456674575806, "learning_rate": 7.298414338512897e-06, "loss": 0.176, "step": 32470 }, { "epoch": 3.1944137100145067, "grad_norm": 0.11294372379779816, "learning_rate": 7.294440249572786e-06, "loss": 0.1571, "step": 32480 }, { "epoch": 3.1953972117725162, "grad_norm": 1.2366126775741577, "learning_rate": 7.290466160632675e-06, "loss": 0.1813, "step": 32490 }, { "epoch": 3.1963807135305253, "grad_norm": 0.6433688402175903, "learning_rate": 7.286492071692565e-06, "loss": 0.1792, "step": 32500 }, { "epoch": 3.1963807135305253, "eval_loss": 0.11758752912282944, "eval_runtime": 20.8811, "eval_samples_per_second": 2.395, "eval_steps_per_second": 1.197, "step": 32500 }, { "epoch": 3.197364215288535, "grad_norm": 1.1521432399749756, "learning_rate": 7.282517982752455e-06, "loss": 0.2708, "step": 32510 }, { "epoch": 3.1983477170465444, "grad_norm": 0.4797680377960205, "learning_rate": 7.278543893812344e-06, "loss": 0.3051, "step": 32520 }, { "epoch": 3.1993312188045535, "grad_norm": 1.5195132493972778, "learning_rate": 7.274569804872234e-06, "loss": 0.1316, "step": 32530 }, { "epoch": 3.200314720562563, "grad_norm": 0.17302559316158295, "learning_rate": 7.270595715932123e-06, "loss": 0.1391, "step": 32540 }, { "epoch": 3.2012982223205726, "grad_norm": 0.9784829616546631, "learning_rate": 7.266621626992013e-06, "loss": 0.2081, "step": 32550 }, { "epoch": 3.2022817240785817, "grad_norm": 0.8939286470413208, "learning_rate": 7.262647538051902e-06, "loss": 0.1981, "step": 32560 }, { "epoch": 3.2032652258365912, "grad_norm": 0.6267346143722534, "learning_rate": 7.258673449111792e-06, "loss": 0.2875, "step": 32570 }, { "epoch": 3.2042487275946008, "grad_norm": 0.15202075242996216, "learning_rate": 7.254699360171681e-06, "loss": 0.2477, "step": 32580 }, { "epoch": 3.20523222935261, "grad_norm": 0.5904789566993713, "learning_rate": 7.250725271231571e-06, "loss": 0.1588, "step": 32590 }, { "epoch": 3.2062157311106194, "grad_norm": 0.4800198972225189, "learning_rate": 7.246751182291461e-06, "loss": 0.0764, "step": 32600 }, { "epoch": 3.207199232868629, "grad_norm": 0.41733959317207336, "learning_rate": 7.242777093351349e-06, "loss": 0.2756, "step": 32610 }, { "epoch": 3.208182734626638, "grad_norm": 0.09101834148168564, "learning_rate": 7.238803004411239e-06, "loss": 0.1507, "step": 32620 }, { "epoch": 3.2091662363846476, "grad_norm": 0.7980632781982422, "learning_rate": 7.23482891547113e-06, "loss": 0.1985, "step": 32630 }, { "epoch": 3.2101497381426567, "grad_norm": 1.3122634887695312, "learning_rate": 7.230854826531018e-06, "loss": 0.1364, "step": 32640 }, { "epoch": 3.2111332399006662, "grad_norm": 1.0960237979888916, "learning_rate": 7.226880737590908e-06, "loss": 0.1429, "step": 32650 }, { "epoch": 3.212116741658676, "grad_norm": 0.1762075573205948, "learning_rate": 7.222906648650797e-06, "loss": 0.1854, "step": 32660 }, { "epoch": 3.213100243416685, "grad_norm": 0.3500872254371643, "learning_rate": 7.218932559710687e-06, "loss": 0.2285, "step": 32670 }, { "epoch": 3.2140837451746944, "grad_norm": 0.602541446685791, "learning_rate": 7.214958470770576e-06, "loss": 0.2755, "step": 32680 }, { "epoch": 3.215067246932704, "grad_norm": 0.34568265080451965, "learning_rate": 7.210984381830466e-06, "loss": 0.1966, "step": 32690 }, { "epoch": 3.216050748690713, "grad_norm": 0.6943805813789368, "learning_rate": 7.2070102928903556e-06, "loss": 0.2801, "step": 32700 }, { "epoch": 3.2170342504487226, "grad_norm": 1.8825277090072632, "learning_rate": 7.203036203950245e-06, "loss": 0.2246, "step": 32710 }, { "epoch": 3.218017752206732, "grad_norm": 1.251085877418518, "learning_rate": 7.199062115010135e-06, "loss": 0.2346, "step": 32720 }, { "epoch": 3.2190012539647412, "grad_norm": 1.6574790477752686, "learning_rate": 7.195088026070024e-06, "loss": 0.2262, "step": 32730 }, { "epoch": 3.219984755722751, "grad_norm": 0.347114622592926, "learning_rate": 7.191113937129913e-06, "loss": 0.1595, "step": 32740 }, { "epoch": 3.2209682574807603, "grad_norm": 1.9753527641296387, "learning_rate": 7.187139848189804e-06, "loss": 0.2737, "step": 32750 }, { "epoch": 3.2219517592387694, "grad_norm": 0.15236753225326538, "learning_rate": 7.183165759249693e-06, "loss": 0.1592, "step": 32760 }, { "epoch": 3.222935260996779, "grad_norm": 0.43684929609298706, "learning_rate": 7.179191670309582e-06, "loss": 0.1877, "step": 32770 }, { "epoch": 3.2239187627547885, "grad_norm": 0.6682396531105042, "learning_rate": 7.175217581369471e-06, "loss": 0.1321, "step": 32780 }, { "epoch": 3.2249022645127976, "grad_norm": 0.430677205324173, "learning_rate": 7.171243492429362e-06, "loss": 0.2208, "step": 32790 }, { "epoch": 3.225885766270807, "grad_norm": 1.3447238206863403, "learning_rate": 7.1672694034892505e-06, "loss": 0.2048, "step": 32800 }, { "epoch": 3.2268692680288167, "grad_norm": 0.4871419072151184, "learning_rate": 7.16329531454914e-06, "loss": 0.1191, "step": 32810 }, { "epoch": 3.227852769786826, "grad_norm": 0.9699082970619202, "learning_rate": 7.15932122560903e-06, "loss": 0.2104, "step": 32820 }, { "epoch": 3.2288362715448353, "grad_norm": 0.5603752136230469, "learning_rate": 7.1553471366689195e-06, "loss": 0.177, "step": 32830 }, { "epoch": 3.229819773302845, "grad_norm": 0.5944997668266296, "learning_rate": 7.151373047728809e-06, "loss": 0.1743, "step": 32840 }, { "epoch": 3.230803275060854, "grad_norm": 1.1690326929092407, "learning_rate": 7.147398958788698e-06, "loss": 0.2163, "step": 32850 }, { "epoch": 3.2317867768188635, "grad_norm": 0.5509839057922363, "learning_rate": 7.143424869848588e-06, "loss": 0.2276, "step": 32860 }, { "epoch": 3.232770278576873, "grad_norm": 1.3236446380615234, "learning_rate": 7.1394507809084765e-06, "loss": 0.1761, "step": 32870 }, { "epoch": 3.233753780334882, "grad_norm": 0.7778728604316711, "learning_rate": 7.135476691968367e-06, "loss": 0.2241, "step": 32880 }, { "epoch": 3.2347372820928917, "grad_norm": 0.4267124831676483, "learning_rate": 7.131502603028257e-06, "loss": 0.1965, "step": 32890 }, { "epoch": 3.2357207838509012, "grad_norm": 1.4709194898605347, "learning_rate": 7.1275285140881455e-06, "loss": 0.1544, "step": 32900 }, { "epoch": 3.2367042856089103, "grad_norm": 0.23302286863327026, "learning_rate": 7.123554425148035e-06, "loss": 0.1595, "step": 32910 }, { "epoch": 3.23768778736692, "grad_norm": 0.6105418801307678, "learning_rate": 7.119580336207925e-06, "loss": 0.1827, "step": 32920 }, { "epoch": 3.2386712891249294, "grad_norm": 0.40103980898857117, "learning_rate": 7.1156062472678145e-06, "loss": 0.377, "step": 32930 }, { "epoch": 3.2396547908829385, "grad_norm": 0.8761731386184692, "learning_rate": 7.111632158327704e-06, "loss": 0.2274, "step": 32940 }, { "epoch": 3.240638292640948, "grad_norm": 0.181829035282135, "learning_rate": 7.107658069387593e-06, "loss": 0.2274, "step": 32950 }, { "epoch": 3.2416217943989576, "grad_norm": 1.0195627212524414, "learning_rate": 7.1036839804474835e-06, "loss": 0.3015, "step": 32960 }, { "epoch": 3.2426052961569667, "grad_norm": 1.2234985828399658, "learning_rate": 7.099709891507372e-06, "loss": 0.1759, "step": 32970 }, { "epoch": 3.2435887979149762, "grad_norm": 2.0877585411071777, "learning_rate": 7.095735802567262e-06, "loss": 0.1551, "step": 32980 }, { "epoch": 3.2445722996729858, "grad_norm": 1.9813555479049683, "learning_rate": 7.091761713627151e-06, "loss": 0.1677, "step": 32990 }, { "epoch": 3.245555801430995, "grad_norm": 1.2899110317230225, "learning_rate": 7.087787624687041e-06, "loss": 0.2256, "step": 33000 }, { "epoch": 3.245555801430995, "eval_loss": 0.11789312213659286, "eval_runtime": 20.8963, "eval_samples_per_second": 2.393, "eval_steps_per_second": 1.196, "step": 33000 }, { "epoch": 3.2465393031890044, "grad_norm": 0.9657261967658997, "learning_rate": 7.083813535746931e-06, "loss": 0.2636, "step": 33010 }, { "epoch": 3.247522804947014, "grad_norm": 1.7438515424728394, "learning_rate": 7.07983944680682e-06, "loss": 0.2387, "step": 33020 }, { "epoch": 3.248506306705023, "grad_norm": 0.4442780613899231, "learning_rate": 7.075865357866709e-06, "loss": 0.274, "step": 33030 }, { "epoch": 3.2494898084630326, "grad_norm": 0.63568115234375, "learning_rate": 7.071891268926599e-06, "loss": 0.2495, "step": 33040 }, { "epoch": 3.250473310221042, "grad_norm": 0.8722742795944214, "learning_rate": 7.067917179986489e-06, "loss": 0.168, "step": 33050 }, { "epoch": 3.2514568119790512, "grad_norm": 1.4411827325820923, "learning_rate": 7.063943091046378e-06, "loss": 0.2814, "step": 33060 }, { "epoch": 3.2524403137370608, "grad_norm": 0.23856908082962036, "learning_rate": 7.059969002106267e-06, "loss": 0.2155, "step": 33070 }, { "epoch": 3.2534238154950703, "grad_norm": 0.4681982696056366, "learning_rate": 7.055994913166158e-06, "loss": 0.1963, "step": 33080 }, { "epoch": 3.2544073172530794, "grad_norm": 0.9917487502098083, "learning_rate": 7.0520208242260466e-06, "loss": 0.2566, "step": 33090 }, { "epoch": 3.255390819011089, "grad_norm": 0.6508780717849731, "learning_rate": 7.048046735285936e-06, "loss": 0.2226, "step": 33100 }, { "epoch": 3.2563743207690985, "grad_norm": 1.2814282178878784, "learning_rate": 7.044072646345825e-06, "loss": 0.2213, "step": 33110 }, { "epoch": 3.2573578225271076, "grad_norm": 1.1933754682540894, "learning_rate": 7.0400985574057155e-06, "loss": 0.2197, "step": 33120 }, { "epoch": 3.258341324285117, "grad_norm": 0.5371510982513428, "learning_rate": 7.036124468465605e-06, "loss": 0.2021, "step": 33130 }, { "epoch": 3.2593248260431267, "grad_norm": 0.8904058337211609, "learning_rate": 7.032150379525494e-06, "loss": 0.1948, "step": 33140 }, { "epoch": 3.2603083278011358, "grad_norm": 0.3558623790740967, "learning_rate": 7.028176290585384e-06, "loss": 0.2308, "step": 33150 }, { "epoch": 3.2612918295591453, "grad_norm": 0.20771129429340363, "learning_rate": 7.024202201645273e-06, "loss": 0.0968, "step": 33160 }, { "epoch": 3.262275331317155, "grad_norm": 0.3460518419742584, "learning_rate": 7.020228112705163e-06, "loss": 0.1776, "step": 33170 }, { "epoch": 3.263258833075164, "grad_norm": 0.3401819169521332, "learning_rate": 7.016254023765053e-06, "loss": 0.1276, "step": 33180 }, { "epoch": 3.2642423348331735, "grad_norm": 0.787639856338501, "learning_rate": 7.0122799348249415e-06, "loss": 0.2096, "step": 33190 }, { "epoch": 3.265225836591183, "grad_norm": 0.21232843399047852, "learning_rate": 7.008305845884832e-06, "loss": 0.1858, "step": 33200 }, { "epoch": 3.266209338349192, "grad_norm": 0.18599532544612885, "learning_rate": 7.004331756944721e-06, "loss": 0.1921, "step": 33210 }, { "epoch": 3.2671928401072017, "grad_norm": 0.44295501708984375, "learning_rate": 7.0003576680046105e-06, "loss": 0.126, "step": 33220 }, { "epoch": 3.268176341865211, "grad_norm": 0.9706286787986755, "learning_rate": 6.996383579064499e-06, "loss": 0.2774, "step": 33230 }, { "epoch": 3.2691598436232203, "grad_norm": 0.16358977556228638, "learning_rate": 6.99240949012439e-06, "loss": 0.2533, "step": 33240 }, { "epoch": 3.27014334538123, "grad_norm": 2.576416492462158, "learning_rate": 6.9884354011842795e-06, "loss": 0.2855, "step": 33250 }, { "epoch": 3.2711268471392394, "grad_norm": 1.0210784673690796, "learning_rate": 6.984461312244168e-06, "loss": 0.2505, "step": 33260 }, { "epoch": 3.2721103488972485, "grad_norm": 0.8797509670257568, "learning_rate": 6.980487223304058e-06, "loss": 0.2141, "step": 33270 }, { "epoch": 3.273093850655258, "grad_norm": 0.30824482440948486, "learning_rate": 6.976513134363948e-06, "loss": 0.1742, "step": 33280 }, { "epoch": 3.2740773524132676, "grad_norm": 0.39702358841896057, "learning_rate": 6.972539045423837e-06, "loss": 0.1995, "step": 33290 }, { "epoch": 3.2750608541712767, "grad_norm": 2.781437873840332, "learning_rate": 6.968564956483727e-06, "loss": 0.1352, "step": 33300 }, { "epoch": 3.2760443559292862, "grad_norm": 1.4445058107376099, "learning_rate": 6.964590867543616e-06, "loss": 0.1443, "step": 33310 }, { "epoch": 3.2770278576872958, "grad_norm": 0.854621410369873, "learning_rate": 6.960616778603506e-06, "loss": 0.2027, "step": 33320 }, { "epoch": 3.278011359445305, "grad_norm": 1.5687012672424316, "learning_rate": 6.956642689663395e-06, "loss": 0.2498, "step": 33330 }, { "epoch": 3.2789948612033144, "grad_norm": 1.041850209236145, "learning_rate": 6.952668600723285e-06, "loss": 0.1666, "step": 33340 }, { "epoch": 3.279978362961324, "grad_norm": 0.35938170552253723, "learning_rate": 6.948694511783174e-06, "loss": 0.2129, "step": 33350 }, { "epoch": 3.280961864719333, "grad_norm": 0.10888555645942688, "learning_rate": 6.944720422843064e-06, "loss": 0.1266, "step": 33360 }, { "epoch": 3.2819453664773426, "grad_norm": 0.6250441670417786, "learning_rate": 6.940746333902954e-06, "loss": 0.1325, "step": 33370 }, { "epoch": 3.282928868235352, "grad_norm": 1.226815938949585, "learning_rate": 6.936772244962843e-06, "loss": 0.2172, "step": 33380 }, { "epoch": 3.2839123699933612, "grad_norm": 0.16180641949176788, "learning_rate": 6.932798156022732e-06, "loss": 0.1938, "step": 33390 }, { "epoch": 3.2848958717513708, "grad_norm": 0.600370466709137, "learning_rate": 6.928824067082622e-06, "loss": 0.1631, "step": 33400 }, { "epoch": 3.2858793735093803, "grad_norm": 1.6209992170333862, "learning_rate": 6.924849978142512e-06, "loss": 0.1728, "step": 33410 }, { "epoch": 3.2868628752673894, "grad_norm": 1.65892493724823, "learning_rate": 6.9208758892024e-06, "loss": 0.1649, "step": 33420 }, { "epoch": 3.287846377025399, "grad_norm": 0.6686263084411621, "learning_rate": 6.91690180026229e-06, "loss": 0.3185, "step": 33430 }, { "epoch": 3.2888298787834085, "grad_norm": 0.32287952303886414, "learning_rate": 6.912927711322181e-06, "loss": 0.2165, "step": 33440 }, { "epoch": 3.2898133805414176, "grad_norm": 0.7468112111091614, "learning_rate": 6.908953622382069e-06, "loss": 0.2343, "step": 33450 }, { "epoch": 3.290796882299427, "grad_norm": 0.6859676241874695, "learning_rate": 6.904979533441959e-06, "loss": 0.2155, "step": 33460 }, { "epoch": 3.2917803840574367, "grad_norm": 2.146181106567383, "learning_rate": 6.901005444501848e-06, "loss": 0.214, "step": 33470 }, { "epoch": 3.2927638858154458, "grad_norm": 1.0984450578689575, "learning_rate": 6.897031355561738e-06, "loss": 0.2845, "step": 33480 }, { "epoch": 3.2937473875734553, "grad_norm": 0.38485586643218994, "learning_rate": 6.893057266621628e-06, "loss": 0.2578, "step": 33490 }, { "epoch": 3.294730889331465, "grad_norm": 0.7894379496574402, "learning_rate": 6.889083177681517e-06, "loss": 0.1563, "step": 33500 }, { "epoch": 3.294730889331465, "eval_loss": 0.117005854845047, "eval_runtime": 16.1268, "eval_samples_per_second": 3.1, "eval_steps_per_second": 1.55, "step": 33500 }, { "epoch": 3.295714391089474, "grad_norm": 0.5310940742492676, "learning_rate": 6.8851090887414065e-06, "loss": 0.2821, "step": 33510 }, { "epoch": 3.2966978928474835, "grad_norm": 1.7750648260116577, "learning_rate": 6.881134999801296e-06, "loss": 0.2697, "step": 33520 }, { "epoch": 3.297681394605493, "grad_norm": 1.573096513748169, "learning_rate": 6.877160910861186e-06, "loss": 0.2191, "step": 33530 }, { "epoch": 3.298664896363502, "grad_norm": 0.6690852642059326, "learning_rate": 6.873186821921075e-06, "loss": 0.2901, "step": 33540 }, { "epoch": 3.2996483981215117, "grad_norm": 1.3802883625030518, "learning_rate": 6.869212732980964e-06, "loss": 0.2022, "step": 33550 }, { "epoch": 3.300631899879521, "grad_norm": 0.8098796606063843, "learning_rate": 6.865238644040855e-06, "loss": 0.2498, "step": 33560 }, { "epoch": 3.3016154016375303, "grad_norm": 1.7303550243377686, "learning_rate": 6.861264555100744e-06, "loss": 0.2695, "step": 33570 }, { "epoch": 3.30259890339554, "grad_norm": 0.9760664105415344, "learning_rate": 6.857290466160633e-06, "loss": 0.1864, "step": 33580 }, { "epoch": 3.3035824051535494, "grad_norm": 0.6733797788619995, "learning_rate": 6.853316377220522e-06, "loss": 0.1678, "step": 33590 }, { "epoch": 3.3045659069115585, "grad_norm": 1.4998847246170044, "learning_rate": 6.849342288280413e-06, "loss": 0.182, "step": 33600 }, { "epoch": 3.305549408669568, "grad_norm": 0.3813591003417969, "learning_rate": 6.845368199340302e-06, "loss": 0.1512, "step": 33610 }, { "epoch": 3.3065329104275776, "grad_norm": 0.8506468534469604, "learning_rate": 6.841394110400191e-06, "loss": 0.1841, "step": 33620 }, { "epoch": 3.3075164121855867, "grad_norm": 0.5800037980079651, "learning_rate": 6.837420021460081e-06, "loss": 0.1648, "step": 33630 }, { "epoch": 3.308499913943596, "grad_norm": 0.4694751799106598, "learning_rate": 6.8334459325199705e-06, "loss": 0.1034, "step": 33640 }, { "epoch": 3.3094834157016058, "grad_norm": 1.381948471069336, "learning_rate": 6.82947184357986e-06, "loss": 0.1966, "step": 33650 }, { "epoch": 3.310466917459615, "grad_norm": 0.9570664167404175, "learning_rate": 6.825497754639749e-06, "loss": 0.1416, "step": 33660 }, { "epoch": 3.3114504192176244, "grad_norm": 1.3214004039764404, "learning_rate": 6.821523665699639e-06, "loss": 0.2701, "step": 33670 }, { "epoch": 3.312433920975634, "grad_norm": 1.189394235610962, "learning_rate": 6.817549576759528e-06, "loss": 0.322, "step": 33680 }, { "epoch": 3.313417422733643, "grad_norm": 0.7238168716430664, "learning_rate": 6.813575487819418e-06, "loss": 0.1982, "step": 33690 }, { "epoch": 3.3144009244916526, "grad_norm": 0.3914591372013092, "learning_rate": 6.809601398879308e-06, "loss": 0.173, "step": 33700 }, { "epoch": 3.315384426249662, "grad_norm": 0.491703063249588, "learning_rate": 6.8056273099391965e-06, "loss": 0.2151, "step": 33710 }, { "epoch": 3.316367928007671, "grad_norm": 1.746738076210022, "learning_rate": 6.801653220999086e-06, "loss": 0.2345, "step": 33720 }, { "epoch": 3.3173514297656808, "grad_norm": 1.026167869567871, "learning_rate": 6.797679132058977e-06, "loss": 0.2082, "step": 33730 }, { "epoch": 3.3183349315236903, "grad_norm": 1.7243843078613281, "learning_rate": 6.7937050431188654e-06, "loss": 0.2422, "step": 33740 }, { "epoch": 3.3193184332816994, "grad_norm": 2.0129072666168213, "learning_rate": 6.789730954178755e-06, "loss": 0.2314, "step": 33750 }, { "epoch": 3.320301935039709, "grad_norm": 1.2712618112564087, "learning_rate": 6.785756865238644e-06, "loss": 0.2042, "step": 33760 }, { "epoch": 3.3212854367977185, "grad_norm": 0.9794314503669739, "learning_rate": 6.7817827762985344e-06, "loss": 0.2315, "step": 33770 }, { "epoch": 3.3222689385557276, "grad_norm": 0.66265869140625, "learning_rate": 6.777808687358423e-06, "loss": 0.1541, "step": 33780 }, { "epoch": 3.323252440313737, "grad_norm": 0.7355524301528931, "learning_rate": 6.773834598418313e-06, "loss": 0.1472, "step": 33790 }, { "epoch": 3.3242359420717467, "grad_norm": 0.4393594563007355, "learning_rate": 6.769860509478203e-06, "loss": 0.2169, "step": 33800 }, { "epoch": 3.3252194438297558, "grad_norm": 0.4566223621368408, "learning_rate": 6.765886420538092e-06, "loss": 0.2252, "step": 33810 }, { "epoch": 3.3262029455877653, "grad_norm": 0.6023932099342346, "learning_rate": 6.761912331597982e-06, "loss": 0.129, "step": 33820 }, { "epoch": 3.327186447345775, "grad_norm": 0.10506648570299149, "learning_rate": 6.757938242657871e-06, "loss": 0.1242, "step": 33830 }, { "epoch": 3.328169949103784, "grad_norm": 1.2360717058181763, "learning_rate": 6.75396415371776e-06, "loss": 0.2681, "step": 33840 }, { "epoch": 3.3291534508617935, "grad_norm": 0.9308496713638306, "learning_rate": 6.749990064777651e-06, "loss": 0.1293, "step": 33850 }, { "epoch": 3.330136952619803, "grad_norm": 0.4173755645751953, "learning_rate": 6.74601597583754e-06, "loss": 0.2014, "step": 33860 }, { "epoch": 3.331120454377812, "grad_norm": 4.138999938964844, "learning_rate": 6.742041886897429e-06, "loss": 0.2215, "step": 33870 }, { "epoch": 3.3321039561358217, "grad_norm": 0.23688192665576935, "learning_rate": 6.738067797957318e-06, "loss": 0.245, "step": 33880 }, { "epoch": 3.333087457893831, "grad_norm": 0.947866678237915, "learning_rate": 6.734093709017209e-06, "loss": 0.2418, "step": 33890 }, { "epoch": 3.3340709596518403, "grad_norm": 1.1110749244689941, "learning_rate": 6.7301196200770975e-06, "loss": 0.2828, "step": 33900 }, { "epoch": 3.33505446140985, "grad_norm": 1.053632140159607, "learning_rate": 6.726145531136987e-06, "loss": 0.1867, "step": 33910 }, { "epoch": 3.3360379631678594, "grad_norm": 0.5738980174064636, "learning_rate": 6.722171442196877e-06, "loss": 0.1942, "step": 33920 }, { "epoch": 3.3370214649258685, "grad_norm": 0.07501865923404694, "learning_rate": 6.7181973532567665e-06, "loss": 0.1547, "step": 33930 }, { "epoch": 3.338004966683878, "grad_norm": 0.46395015716552734, "learning_rate": 6.714223264316656e-06, "loss": 0.2232, "step": 33940 }, { "epoch": 3.3389884684418876, "grad_norm": 0.21362808346748352, "learning_rate": 6.710249175376545e-06, "loss": 0.2594, "step": 33950 }, { "epoch": 3.3399719701998967, "grad_norm": 1.4730660915374756, "learning_rate": 6.706275086436435e-06, "loss": 0.1954, "step": 33960 }, { "epoch": 3.340955471957906, "grad_norm": 1.2356129884719849, "learning_rate": 6.702300997496325e-06, "loss": 0.2185, "step": 33970 }, { "epoch": 3.3419389737159158, "grad_norm": 1.5479633808135986, "learning_rate": 6.698326908556214e-06, "loss": 0.2533, "step": 33980 }, { "epoch": 3.342922475473925, "grad_norm": 1.2569940090179443, "learning_rate": 6.694352819616104e-06, "loss": 0.2133, "step": 33990 }, { "epoch": 3.3439059772319344, "grad_norm": 0.516693115234375, "learning_rate": 6.6903787306759925e-06, "loss": 0.2356, "step": 34000 }, { "epoch": 3.3439059772319344, "eval_loss": 0.11839893460273743, "eval_runtime": 17.6612, "eval_samples_per_second": 2.831, "eval_steps_per_second": 1.416, "step": 34000 }, { "epoch": 3.344889478989944, "grad_norm": 0.4600711762905121, "learning_rate": 6.686404641735883e-06, "loss": 0.1115, "step": 34010 }, { "epoch": 3.345872980747953, "grad_norm": 0.40922555327415466, "learning_rate": 6.682430552795772e-06, "loss": 0.2602, "step": 34020 }, { "epoch": 3.3468564825059626, "grad_norm": 0.4795723259449005, "learning_rate": 6.6784564638556615e-06, "loss": 0.1489, "step": 34030 }, { "epoch": 3.347839984263972, "grad_norm": 0.5202747583389282, "learning_rate": 6.674482374915551e-06, "loss": 0.1751, "step": 34040 }, { "epoch": 3.348823486021981, "grad_norm": 0.8103330135345459, "learning_rate": 6.670508285975441e-06, "loss": 0.1866, "step": 34050 }, { "epoch": 3.3498069877799908, "grad_norm": 0.6577498316764832, "learning_rate": 6.6665341970353305e-06, "loss": 0.1945, "step": 34060 }, { "epoch": 3.3507904895380003, "grad_norm": 0.5543909668922424, "learning_rate": 6.662560108095219e-06, "loss": 0.0748, "step": 34070 }, { "epoch": 3.3517739912960094, "grad_norm": 0.3950757682323456, "learning_rate": 6.658586019155109e-06, "loss": 0.2112, "step": 34080 }, { "epoch": 3.352757493054019, "grad_norm": 0.6226676106452942, "learning_rate": 6.6546119302149995e-06, "loss": 0.2478, "step": 34090 }, { "epoch": 3.3537409948120285, "grad_norm": 0.7202355265617371, "learning_rate": 6.650637841274888e-06, "loss": 0.2961, "step": 34100 }, { "epoch": 3.3547244965700376, "grad_norm": 2.388300657272339, "learning_rate": 6.646663752334778e-06, "loss": 0.183, "step": 34110 }, { "epoch": 3.355707998328047, "grad_norm": 0.7886066436767578, "learning_rate": 6.642689663394667e-06, "loss": 0.2087, "step": 34120 }, { "epoch": 3.356691500086056, "grad_norm": 0.3806540369987488, "learning_rate": 6.638715574454557e-06, "loss": 0.1824, "step": 34130 }, { "epoch": 3.3576750018440658, "grad_norm": 2.428232431411743, "learning_rate": 6.634741485514446e-06, "loss": 0.2822, "step": 34140 }, { "epoch": 3.3586585036020753, "grad_norm": 0.47671598196029663, "learning_rate": 6.630767396574336e-06, "loss": 0.2101, "step": 34150 }, { "epoch": 3.3596420053600844, "grad_norm": 1.0764350891113281, "learning_rate": 6.6267933076342254e-06, "loss": 0.2439, "step": 34160 }, { "epoch": 3.360625507118094, "grad_norm": 0.19563129544258118, "learning_rate": 6.622819218694115e-06, "loss": 0.2158, "step": 34170 }, { "epoch": 3.3616090088761035, "grad_norm": 0.4823220372200012, "learning_rate": 6.618845129754005e-06, "loss": 0.1742, "step": 34180 }, { "epoch": 3.3625925106341126, "grad_norm": 1.3417798280715942, "learning_rate": 6.614871040813894e-06, "loss": 0.2537, "step": 34190 }, { "epoch": 3.363576012392122, "grad_norm": 1.0107239484786987, "learning_rate": 6.610896951873783e-06, "loss": 0.176, "step": 34200 }, { "epoch": 3.3645595141501317, "grad_norm": 0.7224860191345215, "learning_rate": 6.606922862933674e-06, "loss": 0.2095, "step": 34210 }, { "epoch": 3.3655430159081408, "grad_norm": 0.8758500218391418, "learning_rate": 6.602948773993563e-06, "loss": 0.1393, "step": 34220 }, { "epoch": 3.3665265176661503, "grad_norm": 0.7904019355773926, "learning_rate": 6.598974685053452e-06, "loss": 0.1872, "step": 34230 }, { "epoch": 3.36751001942416, "grad_norm": 1.1942181587219238, "learning_rate": 6.595000596113341e-06, "loss": 0.2817, "step": 34240 }, { "epoch": 3.368493521182169, "grad_norm": 0.9850848913192749, "learning_rate": 6.5910265071732316e-06, "loss": 0.177, "step": 34250 }, { "epoch": 3.3694770229401785, "grad_norm": 1.3652971982955933, "learning_rate": 6.58705241823312e-06, "loss": 0.232, "step": 34260 }, { "epoch": 3.370460524698188, "grad_norm": 0.37273120880126953, "learning_rate": 6.58307832929301e-06, "loss": 0.1928, "step": 34270 }, { "epoch": 3.371444026456197, "grad_norm": 1.5884888172149658, "learning_rate": 6.5791042403529e-06, "loss": 0.0956, "step": 34280 }, { "epoch": 3.3724275282142067, "grad_norm": 1.9908277988433838, "learning_rate": 6.575130151412789e-06, "loss": 0.2205, "step": 34290 }, { "epoch": 3.373411029972216, "grad_norm": 0.3747865855693817, "learning_rate": 6.571156062472679e-06, "loss": 0.1879, "step": 34300 }, { "epoch": 3.3743945317302253, "grad_norm": 0.24655213952064514, "learning_rate": 6.567181973532568e-06, "loss": 0.2002, "step": 34310 }, { "epoch": 3.375378033488235, "grad_norm": 0.8035260438919067, "learning_rate": 6.5632078845924575e-06, "loss": 0.2027, "step": 34320 }, { "epoch": 3.3763615352462444, "grad_norm": 0.25017067790031433, "learning_rate": 6.559233795652348e-06, "loss": 0.2793, "step": 34330 }, { "epoch": 3.3773450370042535, "grad_norm": 0.13351485133171082, "learning_rate": 6.555259706712237e-06, "loss": 0.1746, "step": 34340 }, { "epoch": 3.378328538762263, "grad_norm": 0.4170202314853668, "learning_rate": 6.5512856177721265e-06, "loss": 0.1552, "step": 34350 }, { "epoch": 3.3793120405202726, "grad_norm": 0.25968262553215027, "learning_rate": 6.547311528832015e-06, "loss": 0.1619, "step": 34360 }, { "epoch": 3.3802955422782817, "grad_norm": 2.428452253341675, "learning_rate": 6.543337439891906e-06, "loss": 0.1165, "step": 34370 }, { "epoch": 3.381279044036291, "grad_norm": 1.7165864706039429, "learning_rate": 6.539363350951795e-06, "loss": 0.1918, "step": 34380 }, { "epoch": 3.3822625457943007, "grad_norm": 0.634429931640625, "learning_rate": 6.535389262011684e-06, "loss": 0.1812, "step": 34390 }, { "epoch": 3.38324604755231, "grad_norm": 1.2559707164764404, "learning_rate": 6.531415173071573e-06, "loss": 0.3113, "step": 34400 }, { "epoch": 3.3842295493103194, "grad_norm": 0.8117685914039612, "learning_rate": 6.527441084131464e-06, "loss": 0.1822, "step": 34410 }, { "epoch": 3.385213051068329, "grad_norm": 1.5536867380142212, "learning_rate": 6.523466995191353e-06, "loss": 0.1113, "step": 34420 }, { "epoch": 3.386196552826338, "grad_norm": 1.0924869775772095, "learning_rate": 6.519492906251242e-06, "loss": 0.2487, "step": 34430 }, { "epoch": 3.3871800545843476, "grad_norm": 0.7709922790527344, "learning_rate": 6.515518817311132e-06, "loss": 0.2193, "step": 34440 }, { "epoch": 3.388163556342357, "grad_norm": 1.841170310974121, "learning_rate": 6.511544728371022e-06, "loss": 0.1149, "step": 34450 }, { "epoch": 3.389147058100366, "grad_norm": 0.4802115559577942, "learning_rate": 6.507570639430911e-06, "loss": 0.2525, "step": 34460 }, { "epoch": 3.3901305598583757, "grad_norm": 0.5731269717216492, "learning_rate": 6.503596550490801e-06, "loss": 0.1602, "step": 34470 }, { "epoch": 3.3911140616163853, "grad_norm": 0.488396018743515, "learning_rate": 6.49962246155069e-06, "loss": 0.1671, "step": 34480 }, { "epoch": 3.3920975633743944, "grad_norm": 0.9870609045028687, "learning_rate": 6.495648372610579e-06, "loss": 0.17, "step": 34490 }, { "epoch": 3.393081065132404, "grad_norm": 0.36211255192756653, "learning_rate": 6.491674283670469e-06, "loss": 0.2352, "step": 34500 }, { "epoch": 3.393081065132404, "eval_loss": 0.11643476784229279, "eval_runtime": 16.9274, "eval_samples_per_second": 2.954, "eval_steps_per_second": 1.477, "step": 34500 }, { "epoch": 3.3940645668904135, "grad_norm": 0.9025580286979675, "learning_rate": 6.487700194730359e-06, "loss": 0.2056, "step": 34510 }, { "epoch": 3.3950480686484226, "grad_norm": 1.125788688659668, "learning_rate": 6.4837261057902474e-06, "loss": 0.1919, "step": 34520 }, { "epoch": 3.396031570406432, "grad_norm": 0.9839863181114197, "learning_rate": 6.479752016850137e-06, "loss": 0.1315, "step": 34530 }, { "epoch": 3.3970150721644417, "grad_norm": 0.07589655369520187, "learning_rate": 6.475777927910028e-06, "loss": 0.1146, "step": 34540 }, { "epoch": 3.3979985739224507, "grad_norm": 2.114081859588623, "learning_rate": 6.4718038389699164e-06, "loss": 0.116, "step": 34550 }, { "epoch": 3.3989820756804603, "grad_norm": 0.7543653845787048, "learning_rate": 6.467829750029806e-06, "loss": 0.167, "step": 34560 }, { "epoch": 3.39996557743847, "grad_norm": 0.4026939272880554, "learning_rate": 6.463855661089695e-06, "loss": 0.181, "step": 34570 }, { "epoch": 3.400949079196479, "grad_norm": 2.032933473587036, "learning_rate": 6.4598815721495854e-06, "loss": 0.0675, "step": 34580 }, { "epoch": 3.4019325809544885, "grad_norm": 1.10984206199646, "learning_rate": 6.455907483209475e-06, "loss": 0.23, "step": 34590 }, { "epoch": 3.402916082712498, "grad_norm": 0.7900817394256592, "learning_rate": 6.451933394269364e-06, "loss": 0.188, "step": 34600 }, { "epoch": 3.403899584470507, "grad_norm": 0.9765810370445251, "learning_rate": 6.4479593053292536e-06, "loss": 0.1501, "step": 34610 }, { "epoch": 3.4048830862285167, "grad_norm": 0.9752830862998962, "learning_rate": 6.443985216389143e-06, "loss": 0.2133, "step": 34620 }, { "epoch": 3.405866587986526, "grad_norm": 2.466362237930298, "learning_rate": 6.440011127449033e-06, "loss": 0.2199, "step": 34630 }, { "epoch": 3.4068500897445353, "grad_norm": 1.1002997159957886, "learning_rate": 6.436037038508922e-06, "loss": 0.2043, "step": 34640 }, { "epoch": 3.407833591502545, "grad_norm": 2.216463327407837, "learning_rate": 6.432062949568811e-06, "loss": 0.2458, "step": 34650 }, { "epoch": 3.4088170932605544, "grad_norm": 0.41991540789604187, "learning_rate": 6.428088860628702e-06, "loss": 0.1945, "step": 34660 }, { "epoch": 3.4098005950185635, "grad_norm": 0.6682888865470886, "learning_rate": 6.424114771688591e-06, "loss": 0.218, "step": 34670 }, { "epoch": 3.410784096776573, "grad_norm": 1.0558764934539795, "learning_rate": 6.42014068274848e-06, "loss": 0.1781, "step": 34680 }, { "epoch": 3.4117675985345826, "grad_norm": 0.5618786811828613, "learning_rate": 6.416166593808369e-06, "loss": 0.2492, "step": 34690 }, { "epoch": 3.4127511002925917, "grad_norm": 0.3969118893146515, "learning_rate": 6.41219250486826e-06, "loss": 0.1723, "step": 34700 }, { "epoch": 3.413734602050601, "grad_norm": 0.7788593173027039, "learning_rate": 6.408218415928149e-06, "loss": 0.1819, "step": 34710 }, { "epoch": 3.4147181038086107, "grad_norm": 0.6339419484138489, "learning_rate": 6.404244326988038e-06, "loss": 0.1933, "step": 34720 }, { "epoch": 3.41570160556662, "grad_norm": 3.0357329845428467, "learning_rate": 6.400270238047928e-06, "loss": 0.2384, "step": 34730 }, { "epoch": 3.4166851073246294, "grad_norm": 0.7546097040176392, "learning_rate": 6.3962961491078175e-06, "loss": 0.1562, "step": 34740 }, { "epoch": 3.417668609082639, "grad_norm": 0.2636072635650635, "learning_rate": 6.392322060167707e-06, "loss": 0.1887, "step": 34750 }, { "epoch": 3.418652110840648, "grad_norm": 0.1599857211112976, "learning_rate": 6.388347971227596e-06, "loss": 0.1095, "step": 34760 }, { "epoch": 3.4196356125986576, "grad_norm": 0.12882690131664276, "learning_rate": 6.384373882287486e-06, "loss": 0.1667, "step": 34770 }, { "epoch": 3.420619114356667, "grad_norm": 1.473196268081665, "learning_rate": 6.380399793347376e-06, "loss": 0.2108, "step": 34780 }, { "epoch": 3.421602616114676, "grad_norm": 0.6438500881195068, "learning_rate": 6.376425704407265e-06, "loss": 0.2418, "step": 34790 }, { "epoch": 3.4225861178726857, "grad_norm": 1.5048677921295166, "learning_rate": 6.372451615467155e-06, "loss": 0.2191, "step": 34800 }, { "epoch": 3.4235696196306953, "grad_norm": 1.208437442779541, "learning_rate": 6.3684775265270435e-06, "loss": 0.1866, "step": 34810 }, { "epoch": 3.4245531213887044, "grad_norm": 0.35769960284233093, "learning_rate": 6.364503437586934e-06, "loss": 0.1976, "step": 34820 }, { "epoch": 3.425536623146714, "grad_norm": 0.31596699357032776, "learning_rate": 6.360529348646824e-06, "loss": 0.1637, "step": 34830 }, { "epoch": 3.426520124904723, "grad_norm": 0.6313391327857971, "learning_rate": 6.3565552597067125e-06, "loss": 0.1783, "step": 34840 }, { "epoch": 3.4275036266627326, "grad_norm": 0.5374031662940979, "learning_rate": 6.352581170766602e-06, "loss": 0.2105, "step": 34850 }, { "epoch": 3.428487128420742, "grad_norm": 1.1502714157104492, "learning_rate": 6.348607081826492e-06, "loss": 0.1998, "step": 34860 }, { "epoch": 3.429470630178751, "grad_norm": 0.2589196562767029, "learning_rate": 6.3446329928863815e-06, "loss": 0.2485, "step": 34870 }, { "epoch": 3.4304541319367607, "grad_norm": 0.5239776968955994, "learning_rate": 6.34065890394627e-06, "loss": 0.2309, "step": 34880 }, { "epoch": 3.4314376336947703, "grad_norm": 0.4215831458568573, "learning_rate": 6.33668481500616e-06, "loss": 0.2086, "step": 34890 }, { "epoch": 3.4324211354527794, "grad_norm": 0.2826988697052002, "learning_rate": 6.3327107260660505e-06, "loss": 0.1729, "step": 34900 }, { "epoch": 3.433404637210789, "grad_norm": 1.7079347372055054, "learning_rate": 6.328736637125939e-06, "loss": 0.224, "step": 34910 }, { "epoch": 3.4343881389687985, "grad_norm": 2.6849637031555176, "learning_rate": 6.324762548185829e-06, "loss": 0.2203, "step": 34920 }, { "epoch": 3.4353716407268076, "grad_norm": 0.39513906836509705, "learning_rate": 6.320788459245718e-06, "loss": 0.2422, "step": 34930 }, { "epoch": 3.436355142484817, "grad_norm": 1.4758193492889404, "learning_rate": 6.316814370305608e-06, "loss": 0.1015, "step": 34940 }, { "epoch": 3.4373386442428266, "grad_norm": 0.9000169038772583, "learning_rate": 6.312840281365498e-06, "loss": 0.1771, "step": 34950 }, { "epoch": 3.4383221460008357, "grad_norm": 1.5330618619918823, "learning_rate": 6.308866192425387e-06, "loss": 0.2195, "step": 34960 }, { "epoch": 3.4393056477588453, "grad_norm": 0.6581733226776123, "learning_rate": 6.304892103485276e-06, "loss": 0.2737, "step": 34970 }, { "epoch": 3.440289149516855, "grad_norm": 0.494530588388443, "learning_rate": 6.300918014545166e-06, "loss": 0.2535, "step": 34980 }, { "epoch": 3.441272651274864, "grad_norm": 2.8313052654266357, "learning_rate": 6.296943925605056e-06, "loss": 0.2243, "step": 34990 }, { "epoch": 3.4422561530328735, "grad_norm": 0.35256826877593994, "learning_rate": 6.2929698366649446e-06, "loss": 0.1598, "step": 35000 }, { "epoch": 3.4422561530328735, "eval_loss": 0.11682377755641937, "eval_runtime": 19.7716, "eval_samples_per_second": 2.529, "eval_steps_per_second": 1.264, "step": 35000 }, { "epoch": 3.443239654790883, "grad_norm": 0.9331240057945251, "learning_rate": 6.288995747724834e-06, "loss": 0.2248, "step": 35010 }, { "epoch": 3.444223156548892, "grad_norm": 0.9498600363731384, "learning_rate": 6.285021658784725e-06, "loss": 0.2281, "step": 35020 }, { "epoch": 3.4452066583069016, "grad_norm": 1.1861779689788818, "learning_rate": 6.2810475698446136e-06, "loss": 0.2029, "step": 35030 }, { "epoch": 3.446190160064911, "grad_norm": 0.6265971064567566, "learning_rate": 6.277073480904503e-06, "loss": 0.1566, "step": 35040 }, { "epoch": 3.4471736618229203, "grad_norm": 0.1985936313867569, "learning_rate": 6.273099391964392e-06, "loss": 0.1737, "step": 35050 }, { "epoch": 3.44815716358093, "grad_norm": 0.2556389272212982, "learning_rate": 6.2691253030242826e-06, "loss": 0.1608, "step": 35060 }, { "epoch": 3.4491406653389394, "grad_norm": 0.5306591987609863, "learning_rate": 6.265151214084172e-06, "loss": 0.2038, "step": 35070 }, { "epoch": 3.4501241670969485, "grad_norm": 0.7322729825973511, "learning_rate": 6.261177125144061e-06, "loss": 0.2146, "step": 35080 }, { "epoch": 3.451107668854958, "grad_norm": 1.0988006591796875, "learning_rate": 6.257203036203951e-06, "loss": 0.2701, "step": 35090 }, { "epoch": 3.4520911706129676, "grad_norm": 0.382568359375, "learning_rate": 6.25322894726384e-06, "loss": 0.1788, "step": 35100 }, { "epoch": 3.4530746723709766, "grad_norm": 1.1923779249191284, "learning_rate": 6.24925485832373e-06, "loss": 0.2115, "step": 35110 }, { "epoch": 3.454058174128986, "grad_norm": 0.3326428234577179, "learning_rate": 6.245280769383619e-06, "loss": 0.1822, "step": 35120 }, { "epoch": 3.4550416758869957, "grad_norm": 0.5573846101760864, "learning_rate": 6.2413066804435085e-06, "loss": 0.2395, "step": 35130 }, { "epoch": 3.456025177645005, "grad_norm": 0.848188042640686, "learning_rate": 6.237332591503399e-06, "loss": 0.1655, "step": 35140 }, { "epoch": 3.4570086794030144, "grad_norm": 0.7337232232093811, "learning_rate": 6.233358502563288e-06, "loss": 0.2413, "step": 35150 }, { "epoch": 3.457992181161024, "grad_norm": 0.5480116605758667, "learning_rate": 6.2293844136231775e-06, "loss": 0.1676, "step": 35160 }, { "epoch": 3.458975682919033, "grad_norm": 0.938613772392273, "learning_rate": 6.225410324683066e-06, "loss": 0.1292, "step": 35170 }, { "epoch": 3.4599591846770426, "grad_norm": 0.7171252369880676, "learning_rate": 6.221436235742957e-06, "loss": 0.2464, "step": 35180 }, { "epoch": 3.460942686435052, "grad_norm": 0.6132894158363342, "learning_rate": 6.2174621468028465e-06, "loss": 0.2727, "step": 35190 }, { "epoch": 3.461926188193061, "grad_norm": 4.07462215423584, "learning_rate": 6.213488057862735e-06, "loss": 0.1068, "step": 35200 }, { "epoch": 3.4629096899510707, "grad_norm": 0.724358856678009, "learning_rate": 6.209513968922625e-06, "loss": 0.1658, "step": 35210 }, { "epoch": 3.4638931917090803, "grad_norm": 0.09326642751693726, "learning_rate": 6.205539879982515e-06, "loss": 0.0746, "step": 35220 }, { "epoch": 3.4648766934670894, "grad_norm": 4.169434070587158, "learning_rate": 6.201565791042404e-06, "loss": 0.173, "step": 35230 }, { "epoch": 3.465860195225099, "grad_norm": 1.4873085021972656, "learning_rate": 6.197591702102293e-06, "loss": 0.2532, "step": 35240 }, { "epoch": 3.4668436969831085, "grad_norm": 0.22203786671161652, "learning_rate": 6.193617613162183e-06, "loss": 0.1647, "step": 35250 }, { "epoch": 3.4678271987411176, "grad_norm": 0.8890891075134277, "learning_rate": 6.1896435242220725e-06, "loss": 0.1263, "step": 35260 }, { "epoch": 3.468810700499127, "grad_norm": 1.1431549787521362, "learning_rate": 6.185669435281962e-06, "loss": 0.2693, "step": 35270 }, { "epoch": 3.4697942022571366, "grad_norm": 1.133975625038147, "learning_rate": 6.181695346341852e-06, "loss": 0.2801, "step": 35280 }, { "epoch": 3.4707777040151457, "grad_norm": 0.5976429581642151, "learning_rate": 6.177721257401741e-06, "loss": 0.1671, "step": 35290 }, { "epoch": 3.4717612057731553, "grad_norm": 1.3519940376281738, "learning_rate": 6.17374716846163e-06, "loss": 0.3204, "step": 35300 }, { "epoch": 3.472744707531165, "grad_norm": 0.4874255359172821, "learning_rate": 6.169773079521521e-06, "loss": 0.1915, "step": 35310 }, { "epoch": 3.473728209289174, "grad_norm": 2.0294809341430664, "learning_rate": 6.16579899058141e-06, "loss": 0.1796, "step": 35320 }, { "epoch": 3.4747117110471835, "grad_norm": 0.573607861995697, "learning_rate": 6.161824901641299e-06, "loss": 0.1704, "step": 35330 }, { "epoch": 3.475695212805193, "grad_norm": 1.4688594341278076, "learning_rate": 6.157850812701188e-06, "loss": 0.2388, "step": 35340 }, { "epoch": 3.476678714563202, "grad_norm": 1.3949116468429565, "learning_rate": 6.153876723761079e-06, "loss": 0.2454, "step": 35350 }, { "epoch": 3.4776622163212116, "grad_norm": 1.074675440788269, "learning_rate": 6.149902634820967e-06, "loss": 0.1916, "step": 35360 }, { "epoch": 3.478645718079221, "grad_norm": 1.8467960357666016, "learning_rate": 6.145928545880857e-06, "loss": 0.2278, "step": 35370 }, { "epoch": 3.4796292198372303, "grad_norm": 0.5333929061889648, "learning_rate": 6.141954456940746e-06, "loss": 0.2142, "step": 35380 }, { "epoch": 3.48061272159524, "grad_norm": 0.37479594349861145, "learning_rate": 6.137980368000636e-06, "loss": 0.188, "step": 35390 }, { "epoch": 3.4815962233532494, "grad_norm": 0.4170948266983032, "learning_rate": 6.134006279060526e-06, "loss": 0.104, "step": 35400 }, { "epoch": 3.4825797251112585, "grad_norm": 1.5275107622146606, "learning_rate": 6.130032190120415e-06, "loss": 0.2214, "step": 35410 }, { "epoch": 3.483563226869268, "grad_norm": 1.325067162513733, "learning_rate": 6.1260581011803046e-06, "loss": 0.2264, "step": 35420 }, { "epoch": 3.4845467286272775, "grad_norm": 1.20541250705719, "learning_rate": 6.122084012240195e-06, "loss": 0.2887, "step": 35430 }, { "epoch": 3.4855302303852866, "grad_norm": 0.8210470676422119, "learning_rate": 6.118109923300084e-06, "loss": 0.1476, "step": 35440 }, { "epoch": 3.486513732143296, "grad_norm": 1.8403013944625854, "learning_rate": 6.1141358343599736e-06, "loss": 0.157, "step": 35450 }, { "epoch": 3.4874972339013057, "grad_norm": 0.9750491976737976, "learning_rate": 6.110161745419862e-06, "loss": 0.2567, "step": 35460 }, { "epoch": 3.488480735659315, "grad_norm": 0.4504006505012512, "learning_rate": 6.106187656479753e-06, "loss": 0.2618, "step": 35470 }, { "epoch": 3.4894642374173244, "grad_norm": 0.8415562510490417, "learning_rate": 6.102213567539642e-06, "loss": 0.1635, "step": 35480 }, { "epoch": 3.490447739175334, "grad_norm": 2.1266133785247803, "learning_rate": 6.098239478599531e-06, "loss": 0.2345, "step": 35490 }, { "epoch": 3.491431240933343, "grad_norm": 1.2690298557281494, "learning_rate": 6.09426538965942e-06, "loss": 0.1781, "step": 35500 }, { "epoch": 3.491431240933343, "eval_loss": 0.11791013926267624, "eval_runtime": 17.481, "eval_samples_per_second": 2.86, "eval_steps_per_second": 1.43, "step": 35500 }, { "epoch": 3.4924147426913525, "grad_norm": 1.0220181941986084, "learning_rate": 6.090291300719311e-06, "loss": 0.0995, "step": 35510 }, { "epoch": 3.493398244449362, "grad_norm": 0.8852249979972839, "learning_rate": 6.0863172117792e-06, "loss": 0.2229, "step": 35520 }, { "epoch": 3.494381746207371, "grad_norm": 0.6984592080116272, "learning_rate": 6.082343122839089e-06, "loss": 0.2568, "step": 35530 }, { "epoch": 3.4953652479653807, "grad_norm": 0.2515038549900055, "learning_rate": 6.078369033898979e-06, "loss": 0.2064, "step": 35540 }, { "epoch": 3.4963487497233903, "grad_norm": 0.8225836753845215, "learning_rate": 6.074394944958869e-06, "loss": 0.2517, "step": 35550 }, { "epoch": 3.4973322514813994, "grad_norm": 0.27752599120140076, "learning_rate": 6.070420856018758e-06, "loss": 0.0464, "step": 35560 }, { "epoch": 3.498315753239409, "grad_norm": 0.038399215787649155, "learning_rate": 6.066446767078648e-06, "loss": 0.1506, "step": 35570 }, { "epoch": 3.4992992549974185, "grad_norm": 0.7299541234970093, "learning_rate": 6.062472678138537e-06, "loss": 0.2598, "step": 35580 }, { "epoch": 3.5002827567554275, "grad_norm": 1.7209151983261108, "learning_rate": 6.058498589198427e-06, "loss": 0.188, "step": 35590 }, { "epoch": 3.501266258513437, "grad_norm": 0.3954479694366455, "learning_rate": 6.054524500258316e-06, "loss": 0.1885, "step": 35600 }, { "epoch": 3.5022497602714466, "grad_norm": 1.3695343732833862, "learning_rate": 6.050550411318206e-06, "loss": 0.0878, "step": 35610 }, { "epoch": 3.5032332620294557, "grad_norm": 0.9690384864807129, "learning_rate": 6.0465763223780945e-06, "loss": 0.1552, "step": 35620 }, { "epoch": 3.5042167637874653, "grad_norm": 1.019343614578247, "learning_rate": 6.042602233437985e-06, "loss": 0.2125, "step": 35630 }, { "epoch": 3.505200265545475, "grad_norm": 0.5192074179649353, "learning_rate": 6.038628144497875e-06, "loss": 0.1729, "step": 35640 }, { "epoch": 3.506183767303484, "grad_norm": 1.5759379863739014, "learning_rate": 6.0346540555577635e-06, "loss": 0.1651, "step": 35650 }, { "epoch": 3.5071672690614935, "grad_norm": 1.7547839879989624, "learning_rate": 6.030679966617653e-06, "loss": 0.2143, "step": 35660 }, { "epoch": 3.508150770819503, "grad_norm": 0.5516292452812195, "learning_rate": 6.026705877677544e-06, "loss": 0.1183, "step": 35670 }, { "epoch": 3.509134272577512, "grad_norm": 0.2886405885219574, "learning_rate": 6.0227317887374325e-06, "loss": 0.1418, "step": 35680 }, { "epoch": 3.5101177743355216, "grad_norm": 1.5895148515701294, "learning_rate": 6.018757699797322e-06, "loss": 0.277, "step": 35690 }, { "epoch": 3.511101276093531, "grad_norm": 1.234676718711853, "learning_rate": 6.014783610857211e-06, "loss": 0.2684, "step": 35700 }, { "epoch": 3.5120847778515403, "grad_norm": 0.45959922671318054, "learning_rate": 6.0108095219171014e-06, "loss": 0.1182, "step": 35710 }, { "epoch": 3.51306827960955, "grad_norm": 0.6054401993751526, "learning_rate": 6.00683543297699e-06, "loss": 0.2596, "step": 35720 }, { "epoch": 3.5140517813675594, "grad_norm": 0.4694426357746124, "learning_rate": 6.00286134403688e-06, "loss": 0.1923, "step": 35730 }, { "epoch": 3.5150352831255685, "grad_norm": 0.8458042144775391, "learning_rate": 5.998887255096769e-06, "loss": 0.2215, "step": 35740 }, { "epoch": 3.516018784883578, "grad_norm": 0.3480573892593384, "learning_rate": 5.994913166156659e-06, "loss": 0.2045, "step": 35750 }, { "epoch": 3.5170022866415875, "grad_norm": 1.7498723268508911, "learning_rate": 5.990939077216549e-06, "loss": 0.1557, "step": 35760 }, { "epoch": 3.5179857883995966, "grad_norm": 0.4557783305644989, "learning_rate": 5.986964988276438e-06, "loss": 0.0803, "step": 35770 }, { "epoch": 3.518969290157606, "grad_norm": 0.8801820278167725, "learning_rate": 5.982990899336327e-06, "loss": 0.2201, "step": 35780 }, { "epoch": 3.5199527919156157, "grad_norm": 0.32409054040908813, "learning_rate": 5.979016810396218e-06, "loss": 0.146, "step": 35790 }, { "epoch": 3.520936293673625, "grad_norm": 2.258561372756958, "learning_rate": 5.975042721456107e-06, "loss": 0.2379, "step": 35800 }, { "epoch": 3.5219197954316344, "grad_norm": 0.1241031289100647, "learning_rate": 5.971068632515996e-06, "loss": 0.1933, "step": 35810 }, { "epoch": 3.522903297189644, "grad_norm": 0.32828909158706665, "learning_rate": 5.967094543575885e-06, "loss": 0.1732, "step": 35820 }, { "epoch": 3.523886798947653, "grad_norm": 1.036968469619751, "learning_rate": 5.963120454635776e-06, "loss": 0.1615, "step": 35830 }, { "epoch": 3.5248703007056625, "grad_norm": 0.6440073847770691, "learning_rate": 5.9591463656956646e-06, "loss": 0.1422, "step": 35840 }, { "epoch": 3.525853802463672, "grad_norm": 0.7314437031745911, "learning_rate": 5.955172276755554e-06, "loss": 0.1484, "step": 35850 }, { "epoch": 3.526837304221681, "grad_norm": 1.1431103944778442, "learning_rate": 5.951198187815443e-06, "loss": 0.2525, "step": 35860 }, { "epoch": 3.5278208059796907, "grad_norm": 0.30557236075401306, "learning_rate": 5.9472240988753335e-06, "loss": 0.1921, "step": 35870 }, { "epoch": 3.5288043077377003, "grad_norm": 1.6383107900619507, "learning_rate": 5.943250009935223e-06, "loss": 0.2466, "step": 35880 }, { "epoch": 3.5297878094957094, "grad_norm": 0.8749147057533264, "learning_rate": 5.939275920995112e-06, "loss": 0.2489, "step": 35890 }, { "epoch": 3.530771311253719, "grad_norm": 0.33233484625816345, "learning_rate": 5.935301832055002e-06, "loss": 0.1653, "step": 35900 }, { "epoch": 3.5317548130117284, "grad_norm": 1.1335556507110596, "learning_rate": 5.931327743114892e-06, "loss": 0.2507, "step": 35910 }, { "epoch": 3.5327383147697375, "grad_norm": 0.05900925025343895, "learning_rate": 5.927353654174781e-06, "loss": 0.2375, "step": 35920 }, { "epoch": 3.533721816527747, "grad_norm": 2.2414448261260986, "learning_rate": 5.923379565234671e-06, "loss": 0.243, "step": 35930 }, { "epoch": 3.5347053182857566, "grad_norm": 1.2612745761871338, "learning_rate": 5.9194054762945595e-06, "loss": 0.2604, "step": 35940 }, { "epoch": 3.5356888200437657, "grad_norm": 0.41445934772491455, "learning_rate": 5.91543138735445e-06, "loss": 0.2468, "step": 35950 }, { "epoch": 3.5366723218017753, "grad_norm": 0.3327304422855377, "learning_rate": 5.911457298414339e-06, "loss": 0.3412, "step": 35960 }, { "epoch": 3.537655823559785, "grad_norm": 1.1569575071334839, "learning_rate": 5.9074832094742285e-06, "loss": 0.2789, "step": 35970 }, { "epoch": 3.538639325317794, "grad_norm": 1.7389458417892456, "learning_rate": 5.903509120534117e-06, "loss": 0.1788, "step": 35980 }, { "epoch": 3.5396228270758034, "grad_norm": 1.139573574066162, "learning_rate": 5.899535031594008e-06, "loss": 0.208, "step": 35990 }, { "epoch": 3.540606328833813, "grad_norm": 0.546160101890564, "learning_rate": 5.8955609426538975e-06, "loss": 0.2703, "step": 36000 }, { "epoch": 3.540606328833813, "eval_loss": 0.11898215115070343, "eval_runtime": 18.2129, "eval_samples_per_second": 2.745, "eval_steps_per_second": 1.373, "step": 36000 } ], "logging_steps": 10, "max_steps": 50835, "num_input_tokens_seen": 0, "num_train_epochs": 5, "save_steps": 2000, "total_flos": 3.4553845225556214e+18, "train_batch_size": 2, "trial_name": null, "trial_params": null }