diff --git "a/trainer_state.json" "b/trainer_state.json" new file mode 100644--- /dev/null +++ "b/trainer_state.json" @@ -0,0 +1,37485 @@ +{ + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 0.9999953267752413, + "eval_steps": 500, + "global_step": 26748, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.0001869289903497909, + "grad_norm": 2.171574354171753, + "learning_rate": 1.9999998275643267e-05, + "loss": 0.8437, + "step": 5 + }, + { + "epoch": 0.0003738579806995818, + "grad_norm": 2.646630048751831, + "learning_rate": 1.999999310257367e-05, + "loss": 0.6249, + "step": 10 + }, + { + "epoch": 0.0005607869710493726, + "grad_norm": 1.4575234651565552, + "learning_rate": 1.9999984480792985e-05, + "loss": 0.4614, + "step": 15 + }, + { + "epoch": 0.0007477159613991635, + "grad_norm": 0.6195482015609741, + "learning_rate": 1.9999972410304184e-05, + "loss": 0.4029, + "step": 20 + }, + { + "epoch": 0.0009346449517489544, + "grad_norm": 1.0422546863555908, + "learning_rate": 1.999995689111144e-05, + "loss": 0.4514, + "step": 25 + }, + { + "epoch": 0.0011215739420987452, + "grad_norm": 1.1793930530548096, + "learning_rate": 1.9999937923220094e-05, + "loss": 0.3913, + "step": 30 + }, + { + "epoch": 0.0013085029324485362, + "grad_norm": 0.8712885975837708, + "learning_rate": 1.9999915506636697e-05, + "loss": 0.4137, + "step": 35 + }, + { + "epoch": 0.001495431922798327, + "grad_norm": 0.795931875705719, + "learning_rate": 1.9999889641368975e-05, + "loss": 0.4616, + "step": 40 + }, + { + "epoch": 0.0016823609131481178, + "grad_norm": 1.453735589981079, + "learning_rate": 1.9999860327425846e-05, + "loss": 0.4453, + "step": 45 + }, + { + "epoch": 0.0018692899034979088, + "grad_norm": 0.865328311920166, + "learning_rate": 1.9999827564817424e-05, + "loss": 0.358, + "step": 50 + }, + { + "epoch": 0.0020562188938476997, + "grad_norm": 0.9025278091430664, + "learning_rate": 1.9999791353555008e-05, + "loss": 0.4263, + "step": 55 + }, + { + "epoch": 0.0022431478841974904, + "grad_norm": 0.8391484618186951, + "learning_rate": 1.9999751693651083e-05, + "loss": 0.3838, + "step": 60 + }, + { + "epoch": 0.002430076874547281, + "grad_norm": 0.7536634206771851, + "learning_rate": 1.9999708585119328e-05, + "loss": 0.5194, + "step": 65 + }, + { + "epoch": 0.0026170058648970723, + "grad_norm": 1.1140562295913696, + "learning_rate": 1.999966202797461e-05, + "loss": 0.379, + "step": 70 + }, + { + "epoch": 0.002803934855246863, + "grad_norm": 0.9421278238296509, + "learning_rate": 1.9999612022232982e-05, + "loss": 0.3929, + "step": 75 + }, + { + "epoch": 0.002990863845596654, + "grad_norm": 0.5064008831977844, + "learning_rate": 1.9999558567911697e-05, + "loss": 0.4323, + "step": 80 + }, + { + "epoch": 0.003177792835946445, + "grad_norm": 0.7780022621154785, + "learning_rate": 1.9999501665029185e-05, + "loss": 0.3719, + "step": 85 + }, + { + "epoch": 0.0033647218262962356, + "grad_norm": 1.0911474227905273, + "learning_rate": 1.9999441313605068e-05, + "loss": 0.3523, + "step": 90 + }, + { + "epoch": 0.003551650816646027, + "grad_norm": 0.665628969669342, + "learning_rate": 1.9999377513660167e-05, + "loss": 0.3953, + "step": 95 + }, + { + "epoch": 0.0037385798069958175, + "grad_norm": 1.2322030067443848, + "learning_rate": 1.9999310265216472e-05, + "loss": 0.4117, + "step": 100 + }, + { + "epoch": 0.003925508797345608, + "grad_norm": 0.8758599758148193, + "learning_rate": 1.999923956829719e-05, + "loss": 0.3031, + "step": 105 + }, + { + "epoch": 0.004112437787695399, + "grad_norm": 0.8419223427772522, + "learning_rate": 1.9999165422926696e-05, + "loss": 0.3016, + "step": 110 + }, + { + "epoch": 0.00429936677804519, + "grad_norm": 1.1489617824554443, + "learning_rate": 1.9999087829130554e-05, + "loss": 0.4597, + "step": 115 + }, + { + "epoch": 0.004486295768394981, + "grad_norm": 0.46470773220062256, + "learning_rate": 1.9999006786935532e-05, + "loss": 0.337, + "step": 120 + }, + { + "epoch": 0.004673224758744772, + "grad_norm": 0.4822236895561218, + "learning_rate": 1.999892229636958e-05, + "loss": 0.3476, + "step": 125 + }, + { + "epoch": 0.004860153749094562, + "grad_norm": 0.691170334815979, + "learning_rate": 1.9998834357461834e-05, + "loss": 0.3542, + "step": 130 + }, + { + "epoch": 0.0050470827394443534, + "grad_norm": 0.6940286755561829, + "learning_rate": 1.9998742970242614e-05, + "loss": 0.3268, + "step": 135 + }, + { + "epoch": 0.005234011729794145, + "grad_norm": 0.943358302116394, + "learning_rate": 1.999864813474345e-05, + "loss": 0.3943, + "step": 140 + }, + { + "epoch": 0.005420940720143935, + "grad_norm": 1.0193995237350464, + "learning_rate": 1.9998549850997044e-05, + "loss": 0.3066, + "step": 145 + }, + { + "epoch": 0.005607869710493726, + "grad_norm": 0.7878953814506531, + "learning_rate": 1.999844811903728e-05, + "loss": 0.3635, + "step": 150 + }, + { + "epoch": 0.005794798700843517, + "grad_norm": 0.4438617527484894, + "learning_rate": 1.9998342938899257e-05, + "loss": 0.4297, + "step": 155 + }, + { + "epoch": 0.005981727691193308, + "grad_norm": 0.35109004378318787, + "learning_rate": 1.999823431061924e-05, + "loss": 0.4432, + "step": 160 + }, + { + "epoch": 0.006168656681543099, + "grad_norm": 0.7942967414855957, + "learning_rate": 1.99981222342347e-05, + "loss": 0.4196, + "step": 165 + }, + { + "epoch": 0.00635558567189289, + "grad_norm": 0.8898890018463135, + "learning_rate": 1.9998006709784283e-05, + "loss": 0.3688, + "step": 170 + }, + { + "epoch": 0.006542514662242681, + "grad_norm": 1.1046160459518433, + "learning_rate": 1.999788773730783e-05, + "loss": 0.3895, + "step": 175 + }, + { + "epoch": 0.006729443652592471, + "grad_norm": 0.5191996693611145, + "learning_rate": 1.999776531684637e-05, + "loss": 0.3963, + "step": 180 + }, + { + "epoch": 0.006916372642942262, + "grad_norm": 0.5099379420280457, + "learning_rate": 1.9997639448442125e-05, + "loss": 0.3328, + "step": 185 + }, + { + "epoch": 0.007103301633292054, + "grad_norm": 1.2763317823410034, + "learning_rate": 1.9997510132138505e-05, + "loss": 0.2926, + "step": 190 + }, + { + "epoch": 0.007290230623641844, + "grad_norm": 1.150545358657837, + "learning_rate": 1.9997377367980104e-05, + "loss": 0.4188, + "step": 195 + }, + { + "epoch": 0.007477159613991635, + "grad_norm": 0.8739756941795349, + "learning_rate": 1.999724115601271e-05, + "loss": 0.4185, + "step": 200 + }, + { + "epoch": 0.007664088604341426, + "grad_norm": 1.0264869928359985, + "learning_rate": 1.99971014962833e-05, + "loss": 0.3185, + "step": 205 + }, + { + "epoch": 0.007851017594691216, + "grad_norm": 0.5631968975067139, + "learning_rate": 1.9996958388840036e-05, + "loss": 0.3156, + "step": 210 + }, + { + "epoch": 0.008037946585041008, + "grad_norm": 0.6525606513023376, + "learning_rate": 1.999681183373227e-05, + "loss": 0.3579, + "step": 215 + }, + { + "epoch": 0.008224875575390799, + "grad_norm": 0.894953727722168, + "learning_rate": 1.999666183101055e-05, + "loss": 0.3544, + "step": 220 + }, + { + "epoch": 0.00841180456574059, + "grad_norm": 0.5953158140182495, + "learning_rate": 1.9996508380726608e-05, + "loss": 0.3366, + "step": 225 + }, + { + "epoch": 0.00859873355609038, + "grad_norm": 0.47904253005981445, + "learning_rate": 1.9996351482933355e-05, + "loss": 0.448, + "step": 230 + }, + { + "epoch": 0.00878566254644017, + "grad_norm": 0.9660627841949463, + "learning_rate": 1.9996191137684913e-05, + "loss": 0.4367, + "step": 235 + }, + { + "epoch": 0.008972591536789962, + "grad_norm": 1.3929287195205688, + "learning_rate": 1.9996027345036574e-05, + "loss": 0.4314, + "step": 240 + }, + { + "epoch": 0.009159520527139753, + "grad_norm": 0.5760064721107483, + "learning_rate": 1.999586010504482e-05, + "loss": 0.389, + "step": 245 + }, + { + "epoch": 0.009346449517489544, + "grad_norm": 1.9712883234024048, + "learning_rate": 1.999568941776734e-05, + "loss": 0.3424, + "step": 250 + }, + { + "epoch": 0.009533378507839335, + "grad_norm": 0.6370198726654053, + "learning_rate": 1.9995515283262993e-05, + "loss": 0.4801, + "step": 255 + }, + { + "epoch": 0.009720307498189125, + "grad_norm": 1.003278136253357, + "learning_rate": 1.999533770159183e-05, + "loss": 0.3515, + "step": 260 + }, + { + "epoch": 0.009907236488538916, + "grad_norm": 0.4942162036895752, + "learning_rate": 1.9995156672815096e-05, + "loss": 0.3689, + "step": 265 + }, + { + "epoch": 0.010094165478888707, + "grad_norm": 0.5202749371528625, + "learning_rate": 1.9994972196995223e-05, + "loss": 0.3163, + "step": 270 + }, + { + "epoch": 0.010281094469238498, + "grad_norm": 0.6780266165733337, + "learning_rate": 1.9994784274195834e-05, + "loss": 0.3699, + "step": 275 + }, + { + "epoch": 0.01046802345958829, + "grad_norm": 0.40867510437965393, + "learning_rate": 1.9994592904481732e-05, + "loss": 0.3337, + "step": 280 + }, + { + "epoch": 0.01065495244993808, + "grad_norm": 0.36129143834114075, + "learning_rate": 1.999439808791892e-05, + "loss": 0.3753, + "step": 285 + }, + { + "epoch": 0.01084188144028787, + "grad_norm": 0.4958774447441101, + "learning_rate": 1.9994199824574583e-05, + "loss": 0.3724, + "step": 290 + }, + { + "epoch": 0.011028810430637661, + "grad_norm": 0.9592061042785645, + "learning_rate": 1.9993998114517096e-05, + "loss": 0.4238, + "step": 295 + }, + { + "epoch": 0.011215739420987452, + "grad_norm": 0.6660071015357971, + "learning_rate": 1.9993792957816027e-05, + "loss": 0.3774, + "step": 300 + }, + { + "epoch": 0.011402668411337243, + "grad_norm": 1.37131929397583, + "learning_rate": 1.999358435454212e-05, + "loss": 0.4088, + "step": 305 + }, + { + "epoch": 0.011589597401687034, + "grad_norm": 0.8485438227653503, + "learning_rate": 1.9993372304767327e-05, + "loss": 0.3734, + "step": 310 + }, + { + "epoch": 0.011776526392036826, + "grad_norm": 0.41537079215049744, + "learning_rate": 1.999315680856477e-05, + "loss": 0.2756, + "step": 315 + }, + { + "epoch": 0.011963455382386617, + "grad_norm": 0.36821046471595764, + "learning_rate": 1.999293786600877e-05, + "loss": 0.3323, + "step": 320 + }, + { + "epoch": 0.012150384372736406, + "grad_norm": 0.6895926594734192, + "learning_rate": 1.9992715477174832e-05, + "loss": 0.2989, + "step": 325 + }, + { + "epoch": 0.012337313363086197, + "grad_norm": 0.7400907278060913, + "learning_rate": 1.9992489642139654e-05, + "loss": 0.4937, + "step": 330 + }, + { + "epoch": 0.012524242353435988, + "grad_norm": 0.6539915800094604, + "learning_rate": 1.999226036098112e-05, + "loss": 0.2889, + "step": 335 + }, + { + "epoch": 0.01271117134378578, + "grad_norm": 0.7676389217376709, + "learning_rate": 1.9992027633778303e-05, + "loss": 0.3172, + "step": 340 + }, + { + "epoch": 0.01289810033413557, + "grad_norm": 0.6981975436210632, + "learning_rate": 1.9991791460611464e-05, + "loss": 0.3242, + "step": 345 + }, + { + "epoch": 0.013085029324485362, + "grad_norm": 0.6231608986854553, + "learning_rate": 1.999155184156205e-05, + "loss": 0.4696, + "step": 350 + }, + { + "epoch": 0.013271958314835151, + "grad_norm": 0.5723335146903992, + "learning_rate": 1.9991308776712697e-05, + "loss": 0.3813, + "step": 355 + }, + { + "epoch": 0.013458887305184943, + "grad_norm": 0.5270363092422485, + "learning_rate": 1.9991062266147237e-05, + "loss": 0.3927, + "step": 360 + }, + { + "epoch": 0.013645816295534734, + "grad_norm": 1.0495415925979614, + "learning_rate": 1.999081230995068e-05, + "loss": 0.3547, + "step": 365 + }, + { + "epoch": 0.013832745285884525, + "grad_norm": 0.9163959622383118, + "learning_rate": 1.9990558908209234e-05, + "loss": 0.3432, + "step": 370 + }, + { + "epoch": 0.014019674276234316, + "grad_norm": 1.1537407636642456, + "learning_rate": 1.9990302061010282e-05, + "loss": 0.3514, + "step": 375 + }, + { + "epoch": 0.014206603266584107, + "grad_norm": 0.7320118546485901, + "learning_rate": 1.999004176844241e-05, + "loss": 0.4305, + "step": 380 + }, + { + "epoch": 0.014393532256933897, + "grad_norm": 0.47620531916618347, + "learning_rate": 1.9989778030595385e-05, + "loss": 0.3117, + "step": 385 + }, + { + "epoch": 0.014580461247283688, + "grad_norm": 0.5840729475021362, + "learning_rate": 1.9989510847560157e-05, + "loss": 0.3125, + "step": 390 + }, + { + "epoch": 0.014767390237633479, + "grad_norm": 0.5005705952644348, + "learning_rate": 1.9989240219428873e-05, + "loss": 0.2977, + "step": 395 + }, + { + "epoch": 0.01495431922798327, + "grad_norm": 0.8225285410881042, + "learning_rate": 1.9988966146294867e-05, + "loss": 0.3511, + "step": 400 + }, + { + "epoch": 0.015141248218333061, + "grad_norm": 0.5237758755683899, + "learning_rate": 1.9988688628252656e-05, + "loss": 0.3672, + "step": 405 + }, + { + "epoch": 0.015328177208682852, + "grad_norm": 0.6276610493659973, + "learning_rate": 1.9988407665397952e-05, + "loss": 0.3592, + "step": 410 + }, + { + "epoch": 0.015515106199032642, + "grad_norm": 0.6155140995979309, + "learning_rate": 1.9988123257827646e-05, + "loss": 0.3913, + "step": 415 + }, + { + "epoch": 0.015702035189382433, + "grad_norm": 1.1031259298324585, + "learning_rate": 1.9987835405639827e-05, + "loss": 0.3597, + "step": 420 + }, + { + "epoch": 0.015888964179732224, + "grad_norm": 0.9590380787849426, + "learning_rate": 1.9987544108933758e-05, + "loss": 0.3436, + "step": 425 + }, + { + "epoch": 0.016075893170082015, + "grad_norm": 0.6944395899772644, + "learning_rate": 1.998724936780991e-05, + "loss": 0.4223, + "step": 430 + }, + { + "epoch": 0.016262822160431806, + "grad_norm": 0.8819741606712341, + "learning_rate": 1.9986951182369923e-05, + "loss": 0.3349, + "step": 435 + }, + { + "epoch": 0.016449751150781598, + "grad_norm": 0.8658530712127686, + "learning_rate": 1.998664955271664e-05, + "loss": 0.3447, + "step": 440 + }, + { + "epoch": 0.01663668014113139, + "grad_norm": 0.7974675297737122, + "learning_rate": 1.9986344478954078e-05, + "loss": 0.4101, + "step": 445 + }, + { + "epoch": 0.01682360913148118, + "grad_norm": 0.46253588795661926, + "learning_rate": 1.998603596118745e-05, + "loss": 0.3763, + "step": 450 + }, + { + "epoch": 0.01701053812183097, + "grad_norm": 0.67314213514328, + "learning_rate": 1.9985723999523154e-05, + "loss": 0.3419, + "step": 455 + }, + { + "epoch": 0.01719746711218076, + "grad_norm": 0.5673795938491821, + "learning_rate": 1.998540859406878e-05, + "loss": 0.3533, + "step": 460 + }, + { + "epoch": 0.01738439610253055, + "grad_norm": 0.34820371866226196, + "learning_rate": 1.99850897449331e-05, + "loss": 0.3095, + "step": 465 + }, + { + "epoch": 0.01757132509288034, + "grad_norm": 1.0526245832443237, + "learning_rate": 1.998476745222607e-05, + "loss": 0.3775, + "step": 470 + }, + { + "epoch": 0.017758254083230132, + "grad_norm": 0.2607547342777252, + "learning_rate": 1.9984441716058855e-05, + "loss": 0.319, + "step": 475 + }, + { + "epoch": 0.017945183073579923, + "grad_norm": 0.5493923425674438, + "learning_rate": 1.9984112536543774e-05, + "loss": 0.2914, + "step": 480 + }, + { + "epoch": 0.018132112063929715, + "grad_norm": 0.6481868028640747, + "learning_rate": 1.9983779913794366e-05, + "loss": 0.39, + "step": 485 + }, + { + "epoch": 0.018319041054279506, + "grad_norm": 0.4423280954360962, + "learning_rate": 1.9983443847925334e-05, + "loss": 0.383, + "step": 490 + }, + { + "epoch": 0.018505970044629297, + "grad_norm": 0.24521741271018982, + "learning_rate": 1.998310433905258e-05, + "loss": 0.3797, + "step": 495 + }, + { + "epoch": 0.018692899034979088, + "grad_norm": 0.4319721460342407, + "learning_rate": 1.9982761387293196e-05, + "loss": 0.3635, + "step": 500 + }, + { + "epoch": 0.01887982802532888, + "grad_norm": 0.6990941166877747, + "learning_rate": 1.998241499276545e-05, + "loss": 0.2869, + "step": 505 + }, + { + "epoch": 0.01906675701567867, + "grad_norm": 0.6501472592353821, + "learning_rate": 1.9982065155588803e-05, + "loss": 0.4004, + "step": 510 + }, + { + "epoch": 0.01925368600602846, + "grad_norm": 0.5194607973098755, + "learning_rate": 1.9981711875883908e-05, + "loss": 0.3062, + "step": 515 + }, + { + "epoch": 0.01944061499637825, + "grad_norm": 0.5444199442863464, + "learning_rate": 1.9981355153772603e-05, + "loss": 0.3218, + "step": 520 + }, + { + "epoch": 0.01962754398672804, + "grad_norm": 0.8375357985496521, + "learning_rate": 1.9980994989377902e-05, + "loss": 0.2581, + "step": 525 + }, + { + "epoch": 0.01981447297707783, + "grad_norm": 0.47779718041419983, + "learning_rate": 1.998063138282402e-05, + "loss": 0.2845, + "step": 530 + }, + { + "epoch": 0.020001401967427623, + "grad_norm": 0.5198971629142761, + "learning_rate": 1.998026433423636e-05, + "loss": 0.3539, + "step": 535 + }, + { + "epoch": 0.020188330957777414, + "grad_norm": 0.6025378108024597, + "learning_rate": 1.9979893843741498e-05, + "loss": 0.3003, + "step": 540 + }, + { + "epoch": 0.020375259948127205, + "grad_norm": 0.454045832157135, + "learning_rate": 1.997951991146721e-05, + "loss": 0.3132, + "step": 545 + }, + { + "epoch": 0.020562188938476996, + "grad_norm": 0.9200149178504944, + "learning_rate": 1.9979142537542455e-05, + "loss": 0.3469, + "step": 550 + }, + { + "epoch": 0.020749117928826787, + "grad_norm": 0.5452691912651062, + "learning_rate": 1.997876172209738e-05, + "loss": 0.3445, + "step": 555 + }, + { + "epoch": 0.02093604691917658, + "grad_norm": 0.46097660064697266, + "learning_rate": 1.997837746526331e-05, + "loss": 0.3302, + "step": 560 + }, + { + "epoch": 0.02112297590952637, + "grad_norm": 0.7622095346450806, + "learning_rate": 1.997798976717277e-05, + "loss": 0.3539, + "step": 565 + }, + { + "epoch": 0.02130990489987616, + "grad_norm": 0.44673827290534973, + "learning_rate": 1.9977598627959467e-05, + "loss": 0.3418, + "step": 570 + }, + { + "epoch": 0.021496833890225952, + "grad_norm": 0.38425078988075256, + "learning_rate": 1.9977204047758293e-05, + "loss": 0.342, + "step": 575 + }, + { + "epoch": 0.02168376288057574, + "grad_norm": 1.0525974035263062, + "learning_rate": 1.997680602670532e-05, + "loss": 0.3249, + "step": 580 + }, + { + "epoch": 0.02187069187092553, + "grad_norm": 0.5911397933959961, + "learning_rate": 1.9976404564937825e-05, + "loss": 0.2806, + "step": 585 + }, + { + "epoch": 0.022057620861275322, + "grad_norm": 0.7274040579795837, + "learning_rate": 1.9975999662594254e-05, + "loss": 0.3742, + "step": 590 + }, + { + "epoch": 0.022244549851625113, + "grad_norm": 0.8570606708526611, + "learning_rate": 1.9975591319814248e-05, + "loss": 0.4207, + "step": 595 + }, + { + "epoch": 0.022431478841974904, + "grad_norm": 0.6169318556785583, + "learning_rate": 1.9975179536738633e-05, + "loss": 0.3025, + "step": 600 + }, + { + "epoch": 0.022618407832324695, + "grad_norm": 0.7750716805458069, + "learning_rate": 1.997476431350942e-05, + "loss": 0.2713, + "step": 605 + }, + { + "epoch": 0.022805336822674487, + "grad_norm": 0.6509002447128296, + "learning_rate": 1.9974345650269812e-05, + "loss": 0.2948, + "step": 610 + }, + { + "epoch": 0.022992265813024278, + "grad_norm": 0.875956654548645, + "learning_rate": 1.9973923547164183e-05, + "loss": 0.4136, + "step": 615 + }, + { + "epoch": 0.02317919480337407, + "grad_norm": 0.5139532089233398, + "learning_rate": 1.9973498004338115e-05, + "loss": 0.3333, + "step": 620 + }, + { + "epoch": 0.02336612379372386, + "grad_norm": 0.7778888940811157, + "learning_rate": 1.9973069021938366e-05, + "loss": 0.3016, + "step": 625 + }, + { + "epoch": 0.02355305278407365, + "grad_norm": 0.4906633794307709, + "learning_rate": 1.9972636600112873e-05, + "loss": 0.3478, + "step": 630 + }, + { + "epoch": 0.023739981774423442, + "grad_norm": 0.43176034092903137, + "learning_rate": 1.997220073901077e-05, + "loss": 0.2925, + "step": 635 + }, + { + "epoch": 0.023926910764773233, + "grad_norm": 0.6139638423919678, + "learning_rate": 1.997176143878237e-05, + "loss": 0.336, + "step": 640 + }, + { + "epoch": 0.02411383975512302, + "grad_norm": 0.5035068988800049, + "learning_rate": 1.9971318699579177e-05, + "loss": 0.322, + "step": 645 + }, + { + "epoch": 0.024300768745472812, + "grad_norm": 0.8413116931915283, + "learning_rate": 1.997087252155388e-05, + "loss": 0.437, + "step": 650 + }, + { + "epoch": 0.024487697735822603, + "grad_norm": 0.33662039041519165, + "learning_rate": 1.9970422904860352e-05, + "loss": 0.2931, + "step": 655 + }, + { + "epoch": 0.024674626726172395, + "grad_norm": 0.6480875611305237, + "learning_rate": 1.9969969849653652e-05, + "loss": 0.2838, + "step": 660 + }, + { + "epoch": 0.024861555716522186, + "grad_norm": 1.1229877471923828, + "learning_rate": 1.9969513356090027e-05, + "loss": 0.3157, + "step": 665 + }, + { + "epoch": 0.025048484706871977, + "grad_norm": 0.5545793771743774, + "learning_rate": 1.9969053424326908e-05, + "loss": 0.4423, + "step": 670 + }, + { + "epoch": 0.025235413697221768, + "grad_norm": 0.5844056010246277, + "learning_rate": 1.9968590054522914e-05, + "loss": 0.4168, + "step": 675 + }, + { + "epoch": 0.02542234268757156, + "grad_norm": 0.5306451916694641, + "learning_rate": 1.996812324683784e-05, + "loss": 0.3978, + "step": 680 + }, + { + "epoch": 0.02560927167792135, + "grad_norm": 0.4964573085308075, + "learning_rate": 1.996765300143269e-05, + "loss": 0.351, + "step": 685 + }, + { + "epoch": 0.02579620066827114, + "grad_norm": 0.4789879024028778, + "learning_rate": 1.9967179318469626e-05, + "loss": 0.3372, + "step": 690 + }, + { + "epoch": 0.025983129658620933, + "grad_norm": 0.48183321952819824, + "learning_rate": 1.996670219811201e-05, + "loss": 0.2788, + "step": 695 + }, + { + "epoch": 0.026170058648970724, + "grad_norm": 0.2901974320411682, + "learning_rate": 1.996622164052439e-05, + "loss": 0.3617, + "step": 700 + }, + { + "epoch": 0.02635698763932051, + "grad_norm": 0.8643401861190796, + "learning_rate": 1.996573764587249e-05, + "loss": 0.3683, + "step": 705 + }, + { + "epoch": 0.026543916629670303, + "grad_norm": 0.6790117621421814, + "learning_rate": 1.9965250214323232e-05, + "loss": 0.4397, + "step": 710 + }, + { + "epoch": 0.026730845620020094, + "grad_norm": 0.7502917051315308, + "learning_rate": 1.996475934604472e-05, + "loss": 0.3743, + "step": 715 + }, + { + "epoch": 0.026917774610369885, + "grad_norm": 0.6247079372406006, + "learning_rate": 1.996426504120623e-05, + "loss": 0.3414, + "step": 720 + }, + { + "epoch": 0.027104703600719676, + "grad_norm": 1.0865947008132935, + "learning_rate": 1.9963767299978243e-05, + "loss": 0.332, + "step": 725 + }, + { + "epoch": 0.027291632591069467, + "grad_norm": 0.8259230256080627, + "learning_rate": 1.9963266122532416e-05, + "loss": 0.3709, + "step": 730 + }, + { + "epoch": 0.02747856158141926, + "grad_norm": 0.7781583070755005, + "learning_rate": 1.9962761509041578e-05, + "loss": 0.357, + "step": 735 + }, + { + "epoch": 0.02766549057176905, + "grad_norm": 0.4106627404689789, + "learning_rate": 1.996225345967977e-05, + "loss": 0.4027, + "step": 740 + }, + { + "epoch": 0.02785241956211884, + "grad_norm": 0.538112223148346, + "learning_rate": 1.99617419746222e-05, + "loss": 0.3817, + "step": 745 + }, + { + "epoch": 0.028039348552468632, + "grad_norm": 0.4181678593158722, + "learning_rate": 1.996122705404526e-05, + "loss": 0.3379, + "step": 750 + }, + { + "epoch": 0.028226277542818423, + "grad_norm": 0.7423063516616821, + "learning_rate": 1.9960708698126536e-05, + "loss": 0.3501, + "step": 755 + }, + { + "epoch": 0.028413206533168214, + "grad_norm": 0.9700421690940857, + "learning_rate": 1.996018690704479e-05, + "loss": 0.3911, + "step": 760 + }, + { + "epoch": 0.028600135523518002, + "grad_norm": 0.6143549084663391, + "learning_rate": 1.995966168097998e-05, + "loss": 0.3379, + "step": 765 + }, + { + "epoch": 0.028787064513867793, + "grad_norm": 0.5459659695625305, + "learning_rate": 1.995913302011323e-05, + "loss": 0.3502, + "step": 770 + }, + { + "epoch": 0.028973993504217584, + "grad_norm": 0.8577987551689148, + "learning_rate": 1.9958600924626873e-05, + "loss": 0.3926, + "step": 775 + }, + { + "epoch": 0.029160922494567375, + "grad_norm": 0.8069790601730347, + "learning_rate": 1.9958065394704406e-05, + "loss": 0.3648, + "step": 780 + }, + { + "epoch": 0.029347851484917167, + "grad_norm": 1.1032791137695312, + "learning_rate": 1.9957526430530514e-05, + "loss": 0.3606, + "step": 785 + }, + { + "epoch": 0.029534780475266958, + "grad_norm": 0.5585815906524658, + "learning_rate": 1.995698403229108e-05, + "loss": 0.3777, + "step": 790 + }, + { + "epoch": 0.02972170946561675, + "grad_norm": 0.524401843547821, + "learning_rate": 1.9956438200173155e-05, + "loss": 0.2764, + "step": 795 + }, + { + "epoch": 0.02990863845596654, + "grad_norm": 0.4294147789478302, + "learning_rate": 1.9955888934364985e-05, + "loss": 0.3334, + "step": 800 + }, + { + "epoch": 0.03009556744631633, + "grad_norm": 0.6010425686836243, + "learning_rate": 1.995533623505599e-05, + "loss": 0.3192, + "step": 805 + }, + { + "epoch": 0.030282496436666122, + "grad_norm": 0.8194842338562012, + "learning_rate": 1.9954780102436786e-05, + "loss": 0.3757, + "step": 810 + }, + { + "epoch": 0.030469425427015914, + "grad_norm": 0.7027654647827148, + "learning_rate": 1.995422053669917e-05, + "loss": 0.3285, + "step": 815 + }, + { + "epoch": 0.030656354417365705, + "grad_norm": 0.3135683238506317, + "learning_rate": 1.9953657538036105e-05, + "loss": 0.3776, + "step": 820 + }, + { + "epoch": 0.030843283407715492, + "grad_norm": 0.45088908076286316, + "learning_rate": 1.9953091106641772e-05, + "loss": 0.3567, + "step": 825 + }, + { + "epoch": 0.031030212398065284, + "grad_norm": 0.6652802228927612, + "learning_rate": 1.9952521242711504e-05, + "loss": 0.3103, + "step": 830 + }, + { + "epoch": 0.031217141388415075, + "grad_norm": 0.396379292011261, + "learning_rate": 1.9951947946441835e-05, + "loss": 0.3106, + "step": 835 + }, + { + "epoch": 0.031404070378764866, + "grad_norm": 0.5185825228691101, + "learning_rate": 1.9951371218030483e-05, + "loss": 0.3243, + "step": 840 + }, + { + "epoch": 0.03159099936911466, + "grad_norm": 0.48012250661849976, + "learning_rate": 1.995079105767634e-05, + "loss": 0.352, + "step": 845 + }, + { + "epoch": 0.03177792835946445, + "grad_norm": 0.576318621635437, + "learning_rate": 1.9950207465579483e-05, + "loss": 0.4734, + "step": 850 + }, + { + "epoch": 0.03196485734981424, + "grad_norm": 0.41435566544532776, + "learning_rate": 1.9949620441941183e-05, + "loss": 0.3589, + "step": 855 + }, + { + "epoch": 0.03215178634016403, + "grad_norm": 0.5953770875930786, + "learning_rate": 1.994902998696389e-05, + "loss": 0.3151, + "step": 860 + }, + { + "epoch": 0.03233871533051382, + "grad_norm": 0.43497735261917114, + "learning_rate": 1.9948436100851224e-05, + "loss": 0.3793, + "step": 865 + }, + { + "epoch": 0.03252564432086361, + "grad_norm": 0.8335877656936646, + "learning_rate": 1.9947838783808005e-05, + "loss": 0.362, + "step": 870 + }, + { + "epoch": 0.032712573311213404, + "grad_norm": 0.9302036762237549, + "learning_rate": 1.9947238036040234e-05, + "loss": 0.2667, + "step": 875 + }, + { + "epoch": 0.032899502301563195, + "grad_norm": 0.733805239200592, + "learning_rate": 1.994663385775509e-05, + "loss": 0.3825, + "step": 880 + }, + { + "epoch": 0.033086431291912986, + "grad_norm": 0.46970003843307495, + "learning_rate": 1.9946026249160935e-05, + "loss": 0.2545, + "step": 885 + }, + { + "epoch": 0.03327336028226278, + "grad_norm": 0.9111189246177673, + "learning_rate": 1.9945415210467316e-05, + "loss": 0.3019, + "step": 890 + }, + { + "epoch": 0.03346028927261257, + "grad_norm": 0.35354894399642944, + "learning_rate": 1.9944800741884963e-05, + "loss": 0.296, + "step": 895 + }, + { + "epoch": 0.03364721826296236, + "grad_norm": 0.5118780732154846, + "learning_rate": 1.9944182843625786e-05, + "loss": 0.3321, + "step": 900 + }, + { + "epoch": 0.03383414725331215, + "grad_norm": 0.6458944082260132, + "learning_rate": 1.9943561515902886e-05, + "loss": 0.2565, + "step": 905 + }, + { + "epoch": 0.03402107624366194, + "grad_norm": 0.34854066371917725, + "learning_rate": 1.9942936758930537e-05, + "loss": 0.3219, + "step": 910 + }, + { + "epoch": 0.034208005234011726, + "grad_norm": 0.4601725935935974, + "learning_rate": 1.9942308572924204e-05, + "loss": 0.274, + "step": 915 + }, + { + "epoch": 0.03439493422436152, + "grad_norm": 0.998417854309082, + "learning_rate": 1.9941676958100526e-05, + "loss": 0.3507, + "step": 920 + }, + { + "epoch": 0.03458186321471131, + "grad_norm": 0.7837041616439819, + "learning_rate": 1.9941041914677327e-05, + "loss": 0.3192, + "step": 925 + }, + { + "epoch": 0.0347687922050611, + "grad_norm": 0.541322648525238, + "learning_rate": 1.994040344287362e-05, + "loss": 0.3525, + "step": 930 + }, + { + "epoch": 0.03495572119541089, + "grad_norm": 0.7367837429046631, + "learning_rate": 1.9939761542909594e-05, + "loss": 0.3349, + "step": 935 + }, + { + "epoch": 0.03514265018576068, + "grad_norm": 0.6418364644050598, + "learning_rate": 1.9939116215006626e-05, + "loss": 0.3164, + "step": 940 + }, + { + "epoch": 0.03532957917611047, + "grad_norm": 0.6306796669960022, + "learning_rate": 1.993846745938726e-05, + "loss": 0.3192, + "step": 945 + }, + { + "epoch": 0.035516508166460264, + "grad_norm": 0.2718103229999542, + "learning_rate": 1.9937815276275247e-05, + "loss": 0.3019, + "step": 950 + }, + { + "epoch": 0.035703437156810056, + "grad_norm": 0.8403748869895935, + "learning_rate": 1.9937159665895494e-05, + "loss": 0.2813, + "step": 955 + }, + { + "epoch": 0.03589036614715985, + "grad_norm": 0.4756128191947937, + "learning_rate": 1.9936500628474115e-05, + "loss": 0.4187, + "step": 960 + }, + { + "epoch": 0.03607729513750964, + "grad_norm": 0.8436702489852905, + "learning_rate": 1.993583816423838e-05, + "loss": 0.3813, + "step": 965 + }, + { + "epoch": 0.03626422412785943, + "grad_norm": 0.3379393517971039, + "learning_rate": 1.9935172273416762e-05, + "loss": 0.3279, + "step": 970 + }, + { + "epoch": 0.03645115311820922, + "grad_norm": 0.5459058880805969, + "learning_rate": 1.9934502956238905e-05, + "loss": 0.3037, + "step": 975 + }, + { + "epoch": 0.03663808210855901, + "grad_norm": 0.6367349028587341, + "learning_rate": 1.9933830212935637e-05, + "loss": 0.4326, + "step": 980 + }, + { + "epoch": 0.0368250110989088, + "grad_norm": 0.35184261202812195, + "learning_rate": 1.993315404373897e-05, + "loss": 0.309, + "step": 985 + }, + { + "epoch": 0.037011940089258594, + "grad_norm": 0.49862006306648254, + "learning_rate": 1.9932474448882097e-05, + "loss": 0.3977, + "step": 990 + }, + { + "epoch": 0.037198869079608385, + "grad_norm": 0.4049328863620758, + "learning_rate": 1.9931791428599386e-05, + "loss": 0.3397, + "step": 995 + }, + { + "epoch": 0.037385798069958176, + "grad_norm": 0.4314810633659363, + "learning_rate": 1.993110498312639e-05, + "loss": 0.3494, + "step": 1000 + }, + { + "epoch": 0.03757272706030797, + "grad_norm": 0.5296122431755066, + "learning_rate": 1.993041511269985e-05, + "loss": 0.2799, + "step": 1005 + }, + { + "epoch": 0.03775965605065776, + "grad_norm": 0.506058394908905, + "learning_rate": 1.9929721817557682e-05, + "loss": 0.2591, + "step": 1010 + }, + { + "epoch": 0.03794658504100755, + "grad_norm": 0.49406224489212036, + "learning_rate": 1.9929025097938978e-05, + "loss": 0.4447, + "step": 1015 + }, + { + "epoch": 0.03813351403135734, + "grad_norm": 0.4546676576137543, + "learning_rate": 1.9928324954084023e-05, + "loss": 0.445, + "step": 1020 + }, + { + "epoch": 0.03832044302170713, + "grad_norm": 0.3749759793281555, + "learning_rate": 1.9927621386234274e-05, + "loss": 0.2486, + "step": 1025 + }, + { + "epoch": 0.03850737201205692, + "grad_norm": 1.062105655670166, + "learning_rate": 1.9926914394632368e-05, + "loss": 0.4197, + "step": 1030 + }, + { + "epoch": 0.038694301002406714, + "grad_norm": 0.4789983034133911, + "learning_rate": 1.992620397952213e-05, + "loss": 0.3374, + "step": 1035 + }, + { + "epoch": 0.0388812299927565, + "grad_norm": 0.4073324501514435, + "learning_rate": 1.9925490141148564e-05, + "loss": 0.3918, + "step": 1040 + }, + { + "epoch": 0.03906815898310629, + "grad_norm": 0.6208945512771606, + "learning_rate": 1.9924772879757848e-05, + "loss": 0.3542, + "step": 1045 + }, + { + "epoch": 0.03925508797345608, + "grad_norm": 0.4267159104347229, + "learning_rate": 1.9924052195597346e-05, + "loss": 0.3543, + "step": 1050 + }, + { + "epoch": 0.03944201696380587, + "grad_norm": 0.7129971385002136, + "learning_rate": 1.9923328088915603e-05, + "loss": 0.3433, + "step": 1055 + }, + { + "epoch": 0.03962894595415566, + "grad_norm": 0.7188255786895752, + "learning_rate": 1.992260055996234e-05, + "loss": 0.311, + "step": 1060 + }, + { + "epoch": 0.039815874944505454, + "grad_norm": 0.760067343711853, + "learning_rate": 1.9921869608988464e-05, + "loss": 0.3193, + "step": 1065 + }, + { + "epoch": 0.040002803934855245, + "grad_norm": 0.5148400664329529, + "learning_rate": 1.9921135236246058e-05, + "loss": 0.316, + "step": 1070 + }, + { + "epoch": 0.040189732925205036, + "grad_norm": 0.6598585844039917, + "learning_rate": 1.9920397441988384e-05, + "loss": 0.3337, + "step": 1075 + }, + { + "epoch": 0.04037666191555483, + "grad_norm": 0.40405112504959106, + "learning_rate": 1.991965622646989e-05, + "loss": 0.3585, + "step": 1080 + }, + { + "epoch": 0.04056359090590462, + "grad_norm": 0.6789656281471252, + "learning_rate": 1.9918911589946193e-05, + "loss": 0.3226, + "step": 1085 + }, + { + "epoch": 0.04075051989625441, + "grad_norm": 0.6832015514373779, + "learning_rate": 1.99181635326741e-05, + "loss": 0.3428, + "step": 1090 + }, + { + "epoch": 0.0409374488866042, + "grad_norm": 0.4636186361312866, + "learning_rate": 1.99174120549116e-05, + "loss": 0.4286, + "step": 1095 + }, + { + "epoch": 0.04112437787695399, + "grad_norm": 0.673987865447998, + "learning_rate": 1.9916657156917852e-05, + "loss": 0.3035, + "step": 1100 + }, + { + "epoch": 0.04131130686730378, + "grad_norm": 0.7312000393867493, + "learning_rate": 1.99158988389532e-05, + "loss": 0.3005, + "step": 1105 + }, + { + "epoch": 0.041498235857653575, + "grad_norm": 0.45022085309028625, + "learning_rate": 1.9915137101279163e-05, + "loss": 0.2549, + "step": 1110 + }, + { + "epoch": 0.041685164848003366, + "grad_norm": 0.695288360118866, + "learning_rate": 1.9914371944158445e-05, + "loss": 0.367, + "step": 1115 + }, + { + "epoch": 0.04187209383835316, + "grad_norm": 0.4332289695739746, + "learning_rate": 1.9913603367854927e-05, + "loss": 0.2729, + "step": 1120 + }, + { + "epoch": 0.04205902282870295, + "grad_norm": 0.36939793825149536, + "learning_rate": 1.9912831372633665e-05, + "loss": 0.3085, + "step": 1125 + }, + { + "epoch": 0.04224595181905274, + "grad_norm": 0.5997341871261597, + "learning_rate": 1.99120559587609e-05, + "loss": 0.3238, + "step": 1130 + }, + { + "epoch": 0.04243288080940253, + "grad_norm": 0.5867207646369934, + "learning_rate": 1.9911277126504056e-05, + "loss": 0.3768, + "step": 1135 + }, + { + "epoch": 0.04261980979975232, + "grad_norm": 0.45231306552886963, + "learning_rate": 1.9910494876131726e-05, + "loss": 0.3349, + "step": 1140 + }, + { + "epoch": 0.04280673879010211, + "grad_norm": 0.9641021490097046, + "learning_rate": 1.990970920791368e-05, + "loss": 0.3352, + "step": 1145 + }, + { + "epoch": 0.042993667780451904, + "grad_norm": 0.24853017926216125, + "learning_rate": 1.990892012212088e-05, + "loss": 0.3269, + "step": 1150 + }, + { + "epoch": 0.043180596770801695, + "grad_norm": 0.357064813375473, + "learning_rate": 1.9908127619025458e-05, + "loss": 0.2854, + "step": 1155 + }, + { + "epoch": 0.04336752576115148, + "grad_norm": 0.6567901968955994, + "learning_rate": 1.990733169890072e-05, + "loss": 0.4157, + "step": 1160 + }, + { + "epoch": 0.04355445475150127, + "grad_norm": 0.7041186690330505, + "learning_rate": 1.9906532362021164e-05, + "loss": 0.3406, + "step": 1165 + }, + { + "epoch": 0.04374138374185106, + "grad_norm": 0.7448705434799194, + "learning_rate": 1.990572960866245e-05, + "loss": 0.3272, + "step": 1170 + }, + { + "epoch": 0.04392831273220085, + "grad_norm": 0.35884958505630493, + "learning_rate": 1.9904923439101432e-05, + "loss": 0.3539, + "step": 1175 + }, + { + "epoch": 0.044115241722550644, + "grad_norm": 0.8289523124694824, + "learning_rate": 1.9904113853616128e-05, + "loss": 0.296, + "step": 1180 + }, + { + "epoch": 0.044302170712900435, + "grad_norm": 0.5754719972610474, + "learning_rate": 1.990330085248575e-05, + "loss": 0.3425, + "step": 1185 + }, + { + "epoch": 0.044489099703250226, + "grad_norm": 0.6339451670646667, + "learning_rate": 1.990248443599067e-05, + "loss": 0.4721, + "step": 1190 + }, + { + "epoch": 0.04467602869360002, + "grad_norm": 0.35451385378837585, + "learning_rate": 1.9901664604412453e-05, + "loss": 0.3476, + "step": 1195 + }, + { + "epoch": 0.04486295768394981, + "grad_norm": 0.466686487197876, + "learning_rate": 1.990084135803383e-05, + "loss": 0.3168, + "step": 1200 + }, + { + "epoch": 0.0450498866742996, + "grad_norm": 0.3746946454048157, + "learning_rate": 1.9900014697138718e-05, + "loss": 0.2576, + "step": 1205 + }, + { + "epoch": 0.04523681566464939, + "grad_norm": 0.4259583353996277, + "learning_rate": 1.9899184622012208e-05, + "loss": 0.3146, + "step": 1210 + }, + { + "epoch": 0.04542374465499918, + "grad_norm": 0.6210399270057678, + "learning_rate": 1.989835113294057e-05, + "loss": 0.3105, + "step": 1215 + }, + { + "epoch": 0.04561067364534897, + "grad_norm": 0.45747750997543335, + "learning_rate": 1.989751423021125e-05, + "loss": 0.2911, + "step": 1220 + }, + { + "epoch": 0.045797602635698764, + "grad_norm": 0.4624209702014923, + "learning_rate": 1.989667391411287e-05, + "loss": 0.3643, + "step": 1225 + }, + { + "epoch": 0.045984531626048555, + "grad_norm": 0.7682859301567078, + "learning_rate": 1.9895830184935233e-05, + "loss": 0.3374, + "step": 1230 + }, + { + "epoch": 0.04617146061639835, + "grad_norm": 0.6099391579627991, + "learning_rate": 1.989498304296932e-05, + "loss": 0.299, + "step": 1235 + }, + { + "epoch": 0.04635838960674814, + "grad_norm": 0.5577886700630188, + "learning_rate": 1.989413248850728e-05, + "loss": 0.3821, + "step": 1240 + }, + { + "epoch": 0.04654531859709793, + "grad_norm": 0.4436083436012268, + "learning_rate": 1.9893278521842448e-05, + "loss": 0.358, + "step": 1245 + }, + { + "epoch": 0.04673224758744772, + "grad_norm": 0.6322022676467896, + "learning_rate": 1.989242114326933e-05, + "loss": 0.2856, + "step": 1250 + }, + { + "epoch": 0.04691917657779751, + "grad_norm": 0.4859846234321594, + "learning_rate": 1.9891560353083616e-05, + "loss": 0.2951, + "step": 1255 + }, + { + "epoch": 0.0471061055681473, + "grad_norm": 0.4296378791332245, + "learning_rate": 1.989069615158217e-05, + "loss": 0.3627, + "step": 1260 + }, + { + "epoch": 0.047293034558497093, + "grad_norm": 0.4696434438228607, + "learning_rate": 1.9889828539063017e-05, + "loss": 0.3404, + "step": 1265 + }, + { + "epoch": 0.047479963548846885, + "grad_norm": 0.5244821310043335, + "learning_rate": 1.9888957515825383e-05, + "loss": 0.3347, + "step": 1270 + }, + { + "epoch": 0.047666892539196676, + "grad_norm": 0.5117040872573853, + "learning_rate": 1.9888083082169657e-05, + "loss": 0.3017, + "step": 1275 + }, + { + "epoch": 0.04785382152954647, + "grad_norm": 0.6786810159683228, + "learning_rate": 1.9887205238397405e-05, + "loss": 0.3495, + "step": 1280 + }, + { + "epoch": 0.04804075051989625, + "grad_norm": 0.41801556944847107, + "learning_rate": 1.988632398481137e-05, + "loss": 0.2713, + "step": 1285 + }, + { + "epoch": 0.04822767951024604, + "grad_norm": 0.3679557740688324, + "learning_rate": 1.988543932171547e-05, + "loss": 0.2617, + "step": 1290 + }, + { + "epoch": 0.048414608500595833, + "grad_norm": 0.3665943145751953, + "learning_rate": 1.9884551249414806e-05, + "loss": 0.3392, + "step": 1295 + }, + { + "epoch": 0.048601537490945625, + "grad_norm": 0.6276513934135437, + "learning_rate": 1.9883659768215642e-05, + "loss": 0.3797, + "step": 1300 + }, + { + "epoch": 0.048788466481295416, + "grad_norm": 0.3099709451198578, + "learning_rate": 1.988276487842543e-05, + "loss": 0.384, + "step": 1305 + }, + { + "epoch": 0.04897539547164521, + "grad_norm": 0.3573191463947296, + "learning_rate": 1.9881866580352783e-05, + "loss": 0.3194, + "step": 1310 + }, + { + "epoch": 0.049162324461995, + "grad_norm": 0.4190731644630432, + "learning_rate": 1.9880964874307506e-05, + "loss": 0.284, + "step": 1315 + }, + { + "epoch": 0.04934925345234479, + "grad_norm": 0.4039645791053772, + "learning_rate": 1.988005976060057e-05, + "loss": 0.2548, + "step": 1320 + }, + { + "epoch": 0.04953618244269458, + "grad_norm": 0.4496673345565796, + "learning_rate": 1.987915123954412e-05, + "loss": 0.3529, + "step": 1325 + }, + { + "epoch": 0.04972311143304437, + "grad_norm": 0.8639070987701416, + "learning_rate": 1.9878239311451483e-05, + "loss": 0.3073, + "step": 1330 + }, + { + "epoch": 0.04991004042339416, + "grad_norm": 0.5019544959068298, + "learning_rate": 1.9877323976637153e-05, + "loss": 0.3829, + "step": 1335 + }, + { + "epoch": 0.050096969413743954, + "grad_norm": 0.5682312846183777, + "learning_rate": 1.9876405235416808e-05, + "loss": 0.4139, + "step": 1340 + }, + { + "epoch": 0.050283898404093745, + "grad_norm": 0.671947717666626, + "learning_rate": 1.987548308810729e-05, + "loss": 0.2905, + "step": 1345 + }, + { + "epoch": 0.050470827394443536, + "grad_norm": 0.2840399444103241, + "learning_rate": 1.9874557535026623e-05, + "loss": 0.3269, + "step": 1350 + }, + { + "epoch": 0.05065775638479333, + "grad_norm": 0.5511133074760437, + "learning_rate": 1.9873628576494004e-05, + "loss": 0.3221, + "step": 1355 + }, + { + "epoch": 0.05084468537514312, + "grad_norm": 0.4360320270061493, + "learning_rate": 1.9872696212829804e-05, + "loss": 0.3094, + "step": 1360 + }, + { + "epoch": 0.05103161436549291, + "grad_norm": 0.342729777097702, + "learning_rate": 1.987176044435557e-05, + "loss": 0.3401, + "step": 1365 + }, + { + "epoch": 0.0512185433558427, + "grad_norm": 0.43248656392097473, + "learning_rate": 1.987082127139402e-05, + "loss": 0.2907, + "step": 1370 + }, + { + "epoch": 0.05140547234619249, + "grad_norm": 0.6353283524513245, + "learning_rate": 1.9869878694269048e-05, + "loss": 0.3599, + "step": 1375 + }, + { + "epoch": 0.05159240133654228, + "grad_norm": 0.5560038685798645, + "learning_rate": 1.9868932713305723e-05, + "loss": 0.3177, + "step": 1380 + }, + { + "epoch": 0.051779330326892074, + "grad_norm": 0.49626341462135315, + "learning_rate": 1.9867983328830283e-05, + "loss": 0.3419, + "step": 1385 + }, + { + "epoch": 0.051966259317241865, + "grad_norm": 0.5227237939834595, + "learning_rate": 1.986703054117015e-05, + "loss": 0.2357, + "step": 1390 + }, + { + "epoch": 0.05215318830759166, + "grad_norm": 0.45654603838920593, + "learning_rate": 1.986607435065391e-05, + "loss": 0.2313, + "step": 1395 + }, + { + "epoch": 0.05234011729794145, + "grad_norm": 0.4107493460178375, + "learning_rate": 1.9865114757611322e-05, + "loss": 0.3056, + "step": 1400 + }, + { + "epoch": 0.05252704628829123, + "grad_norm": 0.9286690354347229, + "learning_rate": 1.9864151762373323e-05, + "loss": 0.3116, + "step": 1405 + }, + { + "epoch": 0.05271397527864102, + "grad_norm": 0.5546656847000122, + "learning_rate": 1.986318536527203e-05, + "loss": 0.3392, + "step": 1410 + }, + { + "epoch": 0.052900904268990814, + "grad_norm": 0.45263174176216125, + "learning_rate": 1.986221556664072e-05, + "loss": 0.3175, + "step": 1415 + }, + { + "epoch": 0.053087833259340605, + "grad_norm": 0.2992209792137146, + "learning_rate": 1.9861242366813846e-05, + "loss": 0.2616, + "step": 1420 + }, + { + "epoch": 0.0532747622496904, + "grad_norm": 0.39666202664375305, + "learning_rate": 1.9860265766127045e-05, + "loss": 0.3728, + "step": 1425 + }, + { + "epoch": 0.05346169124004019, + "grad_norm": 0.616473376750946, + "learning_rate": 1.9859285764917108e-05, + "loss": 0.3823, + "step": 1430 + }, + { + "epoch": 0.05364862023038998, + "grad_norm": 0.3824799954891205, + "learning_rate": 1.985830236352202e-05, + "loss": 0.2945, + "step": 1435 + }, + { + "epoch": 0.05383554922073977, + "grad_norm": 0.45680978894233704, + "learning_rate": 1.9857315562280923e-05, + "loss": 0.3025, + "step": 1440 + }, + { + "epoch": 0.05402247821108956, + "grad_norm": 0.6660309433937073, + "learning_rate": 1.9856325361534133e-05, + "loss": 0.2585, + "step": 1445 + }, + { + "epoch": 0.05420940720143935, + "grad_norm": 0.6838333010673523, + "learning_rate": 1.9855331761623143e-05, + "loss": 0.2684, + "step": 1450 + }, + { + "epoch": 0.054396336191789144, + "grad_norm": 0.36576396226882935, + "learning_rate": 1.9854334762890626e-05, + "loss": 0.3123, + "step": 1455 + }, + { + "epoch": 0.054583265182138935, + "grad_norm": 0.4945361614227295, + "learning_rate": 1.9853334365680408e-05, + "loss": 0.3113, + "step": 1460 + }, + { + "epoch": 0.054770194172488726, + "grad_norm": 0.4163508117198944, + "learning_rate": 1.98523305703375e-05, + "loss": 0.3394, + "step": 1465 + }, + { + "epoch": 0.05495712316283852, + "grad_norm": 0.42047226428985596, + "learning_rate": 1.985132337720809e-05, + "loss": 0.2486, + "step": 1470 + }, + { + "epoch": 0.05514405215318831, + "grad_norm": 0.4733279347419739, + "learning_rate": 1.9850312786639513e-05, + "loss": 0.3164, + "step": 1475 + }, + { + "epoch": 0.0553309811435381, + "grad_norm": 0.36004531383514404, + "learning_rate": 1.984929879898031e-05, + "loss": 0.2725, + "step": 1480 + }, + { + "epoch": 0.05551791013388789, + "grad_norm": 0.32165494561195374, + "learning_rate": 1.9848281414580167e-05, + "loss": 0.3624, + "step": 1485 + }, + { + "epoch": 0.05570483912423768, + "grad_norm": 0.7197104096412659, + "learning_rate": 1.9847260633789953e-05, + "loss": 0.3337, + "step": 1490 + }, + { + "epoch": 0.05589176811458747, + "grad_norm": 0.4705203175544739, + "learning_rate": 1.984623645696171e-05, + "loss": 0.3503, + "step": 1495 + }, + { + "epoch": 0.056078697104937264, + "grad_norm": 0.6553797125816345, + "learning_rate": 1.984520888444864e-05, + "loss": 0.4203, + "step": 1500 + }, + { + "epoch": 0.056265626095287055, + "grad_norm": 0.46082812547683716, + "learning_rate": 1.9844177916605126e-05, + "loss": 0.299, + "step": 1505 + }, + { + "epoch": 0.056452555085636846, + "grad_norm": 0.6781812906265259, + "learning_rate": 1.984314355378672e-05, + "loss": 0.3063, + "step": 1510 + }, + { + "epoch": 0.05663948407598664, + "grad_norm": 0.6019364595413208, + "learning_rate": 1.9842105796350143e-05, + "loss": 0.389, + "step": 1515 + }, + { + "epoch": 0.05682641306633643, + "grad_norm": 0.4727480113506317, + "learning_rate": 1.9841064644653293e-05, + "loss": 0.367, + "step": 1520 + }, + { + "epoch": 0.05701334205668621, + "grad_norm": 0.5794846415519714, + "learning_rate": 1.9840020099055226e-05, + "loss": 0.3148, + "step": 1525 + }, + { + "epoch": 0.057200271047036004, + "grad_norm": 0.48816463351249695, + "learning_rate": 1.983897215991618e-05, + "loss": 0.3408, + "step": 1530 + }, + { + "epoch": 0.057387200037385795, + "grad_norm": 0.2970341145992279, + "learning_rate": 1.983792082759756e-05, + "loss": 0.3218, + "step": 1535 + }, + { + "epoch": 0.057574129027735586, + "grad_norm": 0.2880699336528778, + "learning_rate": 1.9836866102461933e-05, + "loss": 0.3219, + "step": 1540 + }, + { + "epoch": 0.05776105801808538, + "grad_norm": 0.368488609790802, + "learning_rate": 1.9835807984873055e-05, + "loss": 0.3782, + "step": 1545 + }, + { + "epoch": 0.05794798700843517, + "grad_norm": 0.5185800194740295, + "learning_rate": 1.983474647519583e-05, + "loss": 0.2973, + "step": 1550 + }, + { + "epoch": 0.05813491599878496, + "grad_norm": 0.5686115026473999, + "learning_rate": 1.9833681573796352e-05, + "loss": 0.317, + "step": 1555 + }, + { + "epoch": 0.05832184498913475, + "grad_norm": 0.6750605702400208, + "learning_rate": 1.983261328104187e-05, + "loss": 0.3243, + "step": 1560 + }, + { + "epoch": 0.05850877397948454, + "grad_norm": 0.36497625708580017, + "learning_rate": 1.9831541597300804e-05, + "loss": 0.3339, + "step": 1565 + }, + { + "epoch": 0.05869570296983433, + "grad_norm": 0.5015937089920044, + "learning_rate": 1.983046652294275e-05, + "loss": 0.3349, + "step": 1570 + }, + { + "epoch": 0.058882631960184124, + "grad_norm": 0.36055922508239746, + "learning_rate": 1.982938805833847e-05, + "loss": 0.2826, + "step": 1575 + }, + { + "epoch": 0.059069560950533916, + "grad_norm": 0.36943939328193665, + "learning_rate": 1.9828306203859896e-05, + "loss": 0.3517, + "step": 1580 + }, + { + "epoch": 0.05925648994088371, + "grad_norm": 0.47426244616508484, + "learning_rate": 1.982722095988013e-05, + "loss": 0.3198, + "step": 1585 + }, + { + "epoch": 0.0594434189312335, + "grad_norm": 0.5041938424110413, + "learning_rate": 1.9826132326773443e-05, + "loss": 0.3636, + "step": 1590 + }, + { + "epoch": 0.05963034792158329, + "grad_norm": 0.5388785600662231, + "learning_rate": 1.982504030491527e-05, + "loss": 0.3527, + "step": 1595 + }, + { + "epoch": 0.05981727691193308, + "grad_norm": 0.399564653635025, + "learning_rate": 1.982394489468222e-05, + "loss": 0.3333, + "step": 1600 + }, + { + "epoch": 0.06000420590228287, + "grad_norm": 1.5852733850479126, + "learning_rate": 1.9822846096452064e-05, + "loss": 0.3851, + "step": 1605 + }, + { + "epoch": 0.06019113489263266, + "grad_norm": 0.3177071809768677, + "learning_rate": 1.982174391060375e-05, + "loss": 0.3981, + "step": 1610 + }, + { + "epoch": 0.060378063882982454, + "grad_norm": 0.9603758454322815, + "learning_rate": 1.982063833751739e-05, + "loss": 0.3691, + "step": 1615 + }, + { + "epoch": 0.060564992873332245, + "grad_norm": 0.3188650608062744, + "learning_rate": 1.9819529377574265e-05, + "loss": 0.3143, + "step": 1620 + }, + { + "epoch": 0.060751921863682036, + "grad_norm": 0.48925361037254333, + "learning_rate": 1.9818417031156826e-05, + "loss": 0.3063, + "step": 1625 + }, + { + "epoch": 0.06093885085403183, + "grad_norm": 0.4794778823852539, + "learning_rate": 1.9817301298648683e-05, + "loss": 0.3278, + "step": 1630 + }, + { + "epoch": 0.06112577984438162, + "grad_norm": 0.4850408434867859, + "learning_rate": 1.9816182180434622e-05, + "loss": 0.3245, + "step": 1635 + }, + { + "epoch": 0.06131270883473141, + "grad_norm": 0.5055322647094727, + "learning_rate": 1.9815059676900597e-05, + "loss": 0.2969, + "step": 1640 + }, + { + "epoch": 0.0614996378250812, + "grad_norm": 0.8014519214630127, + "learning_rate": 1.9813933788433724e-05, + "loss": 0.2473, + "step": 1645 + }, + { + "epoch": 0.061686566815430985, + "grad_norm": 0.3348430395126343, + "learning_rate": 1.9812804515422298e-05, + "loss": 0.3928, + "step": 1650 + }, + { + "epoch": 0.061873495805780776, + "grad_norm": 0.6837722659111023, + "learning_rate": 1.9811671858255764e-05, + "loss": 0.2898, + "step": 1655 + }, + { + "epoch": 0.06206042479613057, + "grad_norm": 0.422375351190567, + "learning_rate": 1.9810535817324746e-05, + "loss": 0.3522, + "step": 1660 + }, + { + "epoch": 0.06224735378648036, + "grad_norm": 0.8504202365875244, + "learning_rate": 1.980939639302103e-05, + "loss": 0.4159, + "step": 1665 + }, + { + "epoch": 0.06243428277683015, + "grad_norm": 0.31550297141075134, + "learning_rate": 1.9808253585737577e-05, + "loss": 0.3423, + "step": 1670 + }, + { + "epoch": 0.06262121176717994, + "grad_norm": 0.52349454164505, + "learning_rate": 1.9807107395868503e-05, + "loss": 0.337, + "step": 1675 + }, + { + "epoch": 0.06280814075752973, + "grad_norm": 0.9710644483566284, + "learning_rate": 1.9805957823809095e-05, + "loss": 0.3331, + "step": 1680 + }, + { + "epoch": 0.06299506974787952, + "grad_norm": 0.5250943899154663, + "learning_rate": 1.9804804869955815e-05, + "loss": 0.3059, + "step": 1685 + }, + { + "epoch": 0.06318199873822931, + "grad_norm": 0.6633884310722351, + "learning_rate": 1.980364853470627e-05, + "loss": 0.3314, + "step": 1690 + }, + { + "epoch": 0.0633689277285791, + "grad_norm": 0.35779082775115967, + "learning_rate": 1.9802488818459263e-05, + "loss": 0.2959, + "step": 1695 + }, + { + "epoch": 0.0635558567189289, + "grad_norm": 0.3499528169631958, + "learning_rate": 1.980132572161474e-05, + "loss": 0.3142, + "step": 1700 + }, + { + "epoch": 0.06374278570927869, + "grad_norm": 0.45091912150382996, + "learning_rate": 1.9800159244573813e-05, + "loss": 0.342, + "step": 1705 + }, + { + "epoch": 0.06392971469962848, + "grad_norm": 0.46202903985977173, + "learning_rate": 1.9798989387738776e-05, + "loss": 0.3052, + "step": 1710 + }, + { + "epoch": 0.06411664368997827, + "grad_norm": 0.4897025227546692, + "learning_rate": 1.9797816151513075e-05, + "loss": 0.3864, + "step": 1715 + }, + { + "epoch": 0.06430357268032806, + "grad_norm": 0.44095808267593384, + "learning_rate": 1.979663953630133e-05, + "loss": 0.3165, + "step": 1720 + }, + { + "epoch": 0.06449050167067785, + "grad_norm": 0.4061107337474823, + "learning_rate": 1.9795459542509314e-05, + "loss": 0.3123, + "step": 1725 + }, + { + "epoch": 0.06467743066102764, + "grad_norm": 0.37612634897232056, + "learning_rate": 1.9794276170543974e-05, + "loss": 0.3726, + "step": 1730 + }, + { + "epoch": 0.06486435965137743, + "grad_norm": 0.8302399516105652, + "learning_rate": 1.979308942081343e-05, + "loss": 0.3406, + "step": 1735 + }, + { + "epoch": 0.06505128864172723, + "grad_norm": 0.6120762228965759, + "learning_rate": 1.9791899293726947e-05, + "loss": 0.3493, + "step": 1740 + }, + { + "epoch": 0.06523821763207702, + "grad_norm": 1.227718710899353, + "learning_rate": 1.9790705789694977e-05, + "loss": 0.3243, + "step": 1745 + }, + { + "epoch": 0.06542514662242681, + "grad_norm": 0.2927764356136322, + "learning_rate": 1.978950890912912e-05, + "loss": 0.2685, + "step": 1750 + }, + { + "epoch": 0.0656120756127766, + "grad_norm": 0.18635421991348267, + "learning_rate": 1.9788308652442137e-05, + "loss": 0.326, + "step": 1755 + }, + { + "epoch": 0.06579900460312639, + "grad_norm": 0.5031612515449524, + "learning_rate": 1.978710502004798e-05, + "loss": 0.2777, + "step": 1760 + }, + { + "epoch": 0.06598593359347618, + "grad_norm": 0.5212199091911316, + "learning_rate": 1.9785898012361732e-05, + "loss": 0.3813, + "step": 1765 + }, + { + "epoch": 0.06617286258382597, + "grad_norm": 0.40999093651771545, + "learning_rate": 1.978468762979966e-05, + "loss": 0.3218, + "step": 1770 + }, + { + "epoch": 0.06635979157417576, + "grad_norm": 0.29708200693130493, + "learning_rate": 1.9783473872779192e-05, + "loss": 0.2983, + "step": 1775 + }, + { + "epoch": 0.06654672056452555, + "grad_norm": 0.48487138748168945, + "learning_rate": 1.978225674171892e-05, + "loss": 0.2956, + "step": 1780 + }, + { + "epoch": 0.06673364955487535, + "grad_norm": 0.7466865181922913, + "learning_rate": 1.9781036237038593e-05, + "loss": 0.4038, + "step": 1785 + }, + { + "epoch": 0.06692057854522514, + "grad_norm": 0.5608679056167603, + "learning_rate": 1.977981235915913e-05, + "loss": 0.31, + "step": 1790 + }, + { + "epoch": 0.06710750753557493, + "grad_norm": 0.3457610011100769, + "learning_rate": 1.9778585108502613e-05, + "loss": 0.3171, + "step": 1795 + }, + { + "epoch": 0.06729443652592472, + "grad_norm": 0.3912654221057892, + "learning_rate": 1.977735448549228e-05, + "loss": 0.2944, + "step": 1800 + }, + { + "epoch": 0.06748136551627451, + "grad_norm": 0.660007119178772, + "learning_rate": 1.9776120490552545e-05, + "loss": 0.3354, + "step": 1805 + }, + { + "epoch": 0.0676682945066243, + "grad_norm": 0.5889789462089539, + "learning_rate": 1.9774883124108975e-05, + "loss": 0.2601, + "step": 1810 + }, + { + "epoch": 0.06785522349697409, + "grad_norm": 0.43226462602615356, + "learning_rate": 1.97736423865883e-05, + "loss": 0.26, + "step": 1815 + }, + { + "epoch": 0.06804215248732388, + "grad_norm": 0.38729366660118103, + "learning_rate": 1.9772398278418414e-05, + "loss": 0.2675, + "step": 1820 + }, + { + "epoch": 0.06822908147767368, + "grad_norm": 0.472239226102829, + "learning_rate": 1.9771150800028376e-05, + "loss": 0.3722, + "step": 1825 + }, + { + "epoch": 0.06841601046802345, + "grad_norm": 0.5041793584823608, + "learning_rate": 1.976989995184841e-05, + "loss": 0.2839, + "step": 1830 + }, + { + "epoch": 0.06860293945837324, + "grad_norm": 0.5347047448158264, + "learning_rate": 1.9768645734309896e-05, + "loss": 0.365, + "step": 1835 + }, + { + "epoch": 0.06878986844872303, + "grad_norm": 0.3948882818222046, + "learning_rate": 1.976738814784537e-05, + "loss": 0.3337, + "step": 1840 + }, + { + "epoch": 0.06897679743907283, + "grad_norm": 0.8375716209411621, + "learning_rate": 1.9766127192888543e-05, + "loss": 0.2908, + "step": 1845 + }, + { + "epoch": 0.06916372642942262, + "grad_norm": 0.6502942442893982, + "learning_rate": 1.9764862869874282e-05, + "loss": 0.2642, + "step": 1850 + }, + { + "epoch": 0.06935065541977241, + "grad_norm": 0.36091551184654236, + "learning_rate": 1.9763595179238617e-05, + "loss": 0.3685, + "step": 1855 + }, + { + "epoch": 0.0695375844101222, + "grad_norm": 0.6670817732810974, + "learning_rate": 1.9762324121418735e-05, + "loss": 0.2686, + "step": 1860 + }, + { + "epoch": 0.06972451340047199, + "grad_norm": 0.50603187084198, + "learning_rate": 1.9761049696852996e-05, + "loss": 0.4413, + "step": 1865 + }, + { + "epoch": 0.06991144239082178, + "grad_norm": 0.4863389730453491, + "learning_rate": 1.97597719059809e-05, + "loss": 0.216, + "step": 1870 + }, + { + "epoch": 0.07009837138117157, + "grad_norm": 0.3261346220970154, + "learning_rate": 1.975849074924313e-05, + "loss": 0.3634, + "step": 1875 + }, + { + "epoch": 0.07028530037152136, + "grad_norm": 0.44674980640411377, + "learning_rate": 1.9757206227081514e-05, + "loss": 0.268, + "step": 1880 + }, + { + "epoch": 0.07047222936187116, + "grad_norm": 0.4099423587322235, + "learning_rate": 1.975591833993905e-05, + "loss": 0.3505, + "step": 1885 + }, + { + "epoch": 0.07065915835222095, + "grad_norm": 1.4500172138214111, + "learning_rate": 1.9754627088259894e-05, + "loss": 0.3023, + "step": 1890 + }, + { + "epoch": 0.07084608734257074, + "grad_norm": 0.49058449268341064, + "learning_rate": 1.975333247248936e-05, + "loss": 0.2925, + "step": 1895 + }, + { + "epoch": 0.07103301633292053, + "grad_norm": 0.4941057860851288, + "learning_rate": 1.9752034493073924e-05, + "loss": 0.2957, + "step": 1900 + }, + { + "epoch": 0.07121994532327032, + "grad_norm": 0.8845500349998474, + "learning_rate": 1.9750733150461225e-05, + "loss": 0.3244, + "step": 1905 + }, + { + "epoch": 0.07140687431362011, + "grad_norm": 1.2214986085891724, + "learning_rate": 1.9749428445100053e-05, + "loss": 0.3882, + "step": 1910 + }, + { + "epoch": 0.0715938033039699, + "grad_norm": 0.6646692752838135, + "learning_rate": 1.974812037744037e-05, + "loss": 0.4436, + "step": 1915 + }, + { + "epoch": 0.0717807322943197, + "grad_norm": 0.5141096115112305, + "learning_rate": 1.9746808947933285e-05, + "loss": 0.2812, + "step": 1920 + }, + { + "epoch": 0.07196766128466948, + "grad_norm": 0.39117810130119324, + "learning_rate": 1.9745494157031075e-05, + "loss": 0.2966, + "step": 1925 + }, + { + "epoch": 0.07215459027501928, + "grad_norm": 0.5144658088684082, + "learning_rate": 1.9744176005187173e-05, + "loss": 0.3422, + "step": 1930 + }, + { + "epoch": 0.07234151926536907, + "grad_norm": 0.6618564128875732, + "learning_rate": 1.9742854492856178e-05, + "loss": 0.3295, + "step": 1935 + }, + { + "epoch": 0.07252844825571886, + "grad_norm": 0.7013601660728455, + "learning_rate": 1.9741529620493833e-05, + "loss": 0.3461, + "step": 1940 + }, + { + "epoch": 0.07271537724606865, + "grad_norm": 0.5347566604614258, + "learning_rate": 1.9740201388557053e-05, + "loss": 0.278, + "step": 1945 + }, + { + "epoch": 0.07290230623641844, + "grad_norm": 0.4391633868217468, + "learning_rate": 1.9738869797503905e-05, + "loss": 0.2967, + "step": 1950 + }, + { + "epoch": 0.07308923522676823, + "grad_norm": 0.4526468813419342, + "learning_rate": 1.9737534847793622e-05, + "loss": 0.2962, + "step": 1955 + }, + { + "epoch": 0.07327616421711802, + "grad_norm": 0.652113676071167, + "learning_rate": 1.9736196539886578e-05, + "loss": 0.2854, + "step": 1960 + }, + { + "epoch": 0.07346309320746781, + "grad_norm": 0.39846867322921753, + "learning_rate": 1.9734854874244332e-05, + "loss": 0.2783, + "step": 1965 + }, + { + "epoch": 0.0736500221978176, + "grad_norm": 0.3684355318546295, + "learning_rate": 1.9733509851329574e-05, + "loss": 0.2901, + "step": 1970 + }, + { + "epoch": 0.0738369511881674, + "grad_norm": 0.6221095323562622, + "learning_rate": 1.973216147160617e-05, + "loss": 0.3012, + "step": 1975 + }, + { + "epoch": 0.07402388017851719, + "grad_norm": 0.41872674226760864, + "learning_rate": 1.9730809735539134e-05, + "loss": 0.3818, + "step": 1980 + }, + { + "epoch": 0.07421080916886698, + "grad_norm": 0.35373455286026, + "learning_rate": 1.9729454643594646e-05, + "loss": 0.354, + "step": 1985 + }, + { + "epoch": 0.07439773815921677, + "grad_norm": 0.795148491859436, + "learning_rate": 1.9728096196240035e-05, + "loss": 0.3374, + "step": 1990 + }, + { + "epoch": 0.07458466714956656, + "grad_norm": 0.32079294323921204, + "learning_rate": 1.972673439394379e-05, + "loss": 0.2758, + "step": 1995 + }, + { + "epoch": 0.07477159613991635, + "grad_norm": 0.3288242518901825, + "learning_rate": 1.9725369237175562e-05, + "loss": 0.2796, + "step": 2000 + }, + { + "epoch": 0.07495852513026614, + "grad_norm": 0.428270548582077, + "learning_rate": 1.9724000726406148e-05, + "loss": 0.2982, + "step": 2005 + }, + { + "epoch": 0.07514545412061593, + "grad_norm": 0.38480132818222046, + "learning_rate": 1.9722628862107514e-05, + "loss": 0.2905, + "step": 2010 + }, + { + "epoch": 0.07533238311096573, + "grad_norm": 0.5246176719665527, + "learning_rate": 1.9721253644752774e-05, + "loss": 0.3551, + "step": 2015 + }, + { + "epoch": 0.07551931210131552, + "grad_norm": 0.5422767400741577, + "learning_rate": 1.97198750748162e-05, + "loss": 0.3718, + "step": 2020 + }, + { + "epoch": 0.07570624109166531, + "grad_norm": 0.5160179138183594, + "learning_rate": 1.971849315277322e-05, + "loss": 0.3526, + "step": 2025 + }, + { + "epoch": 0.0758931700820151, + "grad_norm": 0.5089318752288818, + "learning_rate": 1.9717107879100426e-05, + "loss": 0.344, + "step": 2030 + }, + { + "epoch": 0.07608009907236489, + "grad_norm": 0.6274924874305725, + "learning_rate": 1.971571925427555e-05, + "loss": 0.2923, + "step": 2035 + }, + { + "epoch": 0.07626702806271468, + "grad_norm": 0.5537222027778625, + "learning_rate": 1.9714327278777495e-05, + "loss": 0.3163, + "step": 2040 + }, + { + "epoch": 0.07645395705306447, + "grad_norm": 0.7514776587486267, + "learning_rate": 1.9712931953086314e-05, + "loss": 0.4283, + "step": 2045 + }, + { + "epoch": 0.07664088604341426, + "grad_norm": 0.33425864577293396, + "learning_rate": 1.9711533277683214e-05, + "loss": 0.2521, + "step": 2050 + }, + { + "epoch": 0.07682781503376405, + "grad_norm": 0.4531971514225006, + "learning_rate": 1.9710131253050555e-05, + "loss": 0.3491, + "step": 2055 + }, + { + "epoch": 0.07701474402411385, + "grad_norm": 0.2874973714351654, + "learning_rate": 1.9708725879671856e-05, + "loss": 0.334, + "step": 2060 + }, + { + "epoch": 0.07720167301446364, + "grad_norm": 0.4508047103881836, + "learning_rate": 1.9707317158031794e-05, + "loss": 0.2811, + "step": 2065 + }, + { + "epoch": 0.07738860200481343, + "grad_norm": 0.233233243227005, + "learning_rate": 1.9705905088616195e-05, + "loss": 0.2521, + "step": 2070 + }, + { + "epoch": 0.0775755309951632, + "grad_norm": 0.31307104229927063, + "learning_rate": 1.970448967191204e-05, + "loss": 0.322, + "step": 2075 + }, + { + "epoch": 0.077762459985513, + "grad_norm": 0.6601381897926331, + "learning_rate": 1.9703070908407463e-05, + "loss": 0.3456, + "step": 2080 + }, + { + "epoch": 0.07794938897586279, + "grad_norm": 0.37102803587913513, + "learning_rate": 1.970164879859176e-05, + "loss": 0.374, + "step": 2085 + }, + { + "epoch": 0.07813631796621258, + "grad_norm": 0.37135210633277893, + "learning_rate": 1.9700223342955374e-05, + "loss": 0.3463, + "step": 2090 + }, + { + "epoch": 0.07832324695656237, + "grad_norm": 0.5516648292541504, + "learning_rate": 1.9698794541989903e-05, + "loss": 0.4849, + "step": 2095 + }, + { + "epoch": 0.07851017594691216, + "grad_norm": 0.5477395057678223, + "learning_rate": 1.96973623961881e-05, + "loss": 0.2745, + "step": 2100 + }, + { + "epoch": 0.07869710493726195, + "grad_norm": 0.5191754102706909, + "learning_rate": 1.969592690604387e-05, + "loss": 0.3044, + "step": 2105 + }, + { + "epoch": 0.07888403392761174, + "grad_norm": 1.8438035249710083, + "learning_rate": 1.9694488072052275e-05, + "loss": 0.3786, + "step": 2110 + }, + { + "epoch": 0.07907096291796153, + "grad_norm": 0.3772248327732086, + "learning_rate": 1.9693045894709524e-05, + "loss": 0.2567, + "step": 2115 + }, + { + "epoch": 0.07925789190831133, + "grad_norm": 0.6036641597747803, + "learning_rate": 1.9691600374512988e-05, + "loss": 0.3108, + "step": 2120 + }, + { + "epoch": 0.07944482089866112, + "grad_norm": 0.4737173020839691, + "learning_rate": 1.969015151196118e-05, + "loss": 0.2473, + "step": 2125 + }, + { + "epoch": 0.07963174988901091, + "grad_norm": 0.741744339466095, + "learning_rate": 1.9688699307553774e-05, + "loss": 0.3534, + "step": 2130 + }, + { + "epoch": 0.0798186788793607, + "grad_norm": 0.4742072522640228, + "learning_rate": 1.9687243761791595e-05, + "loss": 0.3127, + "step": 2135 + }, + { + "epoch": 0.08000560786971049, + "grad_norm": 0.4502277374267578, + "learning_rate": 1.9685784875176613e-05, + "loss": 0.3633, + "step": 2140 + }, + { + "epoch": 0.08019253686006028, + "grad_norm": 0.9466716647148132, + "learning_rate": 1.9684322648211964e-05, + "loss": 0.3007, + "step": 2145 + }, + { + "epoch": 0.08037946585041007, + "grad_norm": 0.4492679834365845, + "learning_rate": 1.9682857081401923e-05, + "loss": 0.2861, + "step": 2150 + }, + { + "epoch": 0.08056639484075986, + "grad_norm": 0.8264315724372864, + "learning_rate": 1.9681388175251925e-05, + "loss": 0.3445, + "step": 2155 + }, + { + "epoch": 0.08075332383110966, + "grad_norm": 0.7091488242149353, + "learning_rate": 1.9679915930268553e-05, + "loss": 0.3153, + "step": 2160 + }, + { + "epoch": 0.08094025282145945, + "grad_norm": 0.5118813514709473, + "learning_rate": 1.967844034695954e-05, + "loss": 0.3658, + "step": 2165 + }, + { + "epoch": 0.08112718181180924, + "grad_norm": 0.6051716208457947, + "learning_rate": 1.967696142583377e-05, + "loss": 0.3552, + "step": 2170 + }, + { + "epoch": 0.08131411080215903, + "grad_norm": 0.5814588665962219, + "learning_rate": 1.967547916740129e-05, + "loss": 0.3133, + "step": 2175 + }, + { + "epoch": 0.08150103979250882, + "grad_norm": 0.6085551977157593, + "learning_rate": 1.967399357217328e-05, + "loss": 0.3033, + "step": 2180 + }, + { + "epoch": 0.08168796878285861, + "grad_norm": 0.5651482939720154, + "learning_rate": 1.9672504640662083e-05, + "loss": 0.3819, + "step": 2185 + }, + { + "epoch": 0.0818748977732084, + "grad_norm": 0.3710572123527527, + "learning_rate": 1.9671012373381188e-05, + "loss": 0.3037, + "step": 2190 + }, + { + "epoch": 0.0820618267635582, + "grad_norm": 0.43990039825439453, + "learning_rate": 1.9669516770845233e-05, + "loss": 0.4436, + "step": 2195 + }, + { + "epoch": 0.08224875575390798, + "grad_norm": 0.6509809494018555, + "learning_rate": 1.966801783357001e-05, + "loss": 0.2811, + "step": 2200 + }, + { + "epoch": 0.08243568474425778, + "grad_norm": 0.5239195823669434, + "learning_rate": 1.9666515562072463e-05, + "loss": 0.3285, + "step": 2205 + }, + { + "epoch": 0.08262261373460757, + "grad_norm": 0.6424049139022827, + "learning_rate": 1.9665009956870678e-05, + "loss": 0.2595, + "step": 2210 + }, + { + "epoch": 0.08280954272495736, + "grad_norm": 0.3653183579444885, + "learning_rate": 1.9663501018483897e-05, + "loss": 0.3201, + "step": 2215 + }, + { + "epoch": 0.08299647171530715, + "grad_norm": 0.6920386552810669, + "learning_rate": 1.9661988747432508e-05, + "loss": 0.2718, + "step": 2220 + }, + { + "epoch": 0.08318340070565694, + "grad_norm": 0.4126833379268646, + "learning_rate": 1.966047314423805e-05, + "loss": 0.336, + "step": 2225 + }, + { + "epoch": 0.08337032969600673, + "grad_norm": 0.11903409659862518, + "learning_rate": 1.9658954209423214e-05, + "loss": 0.3856, + "step": 2230 + }, + { + "epoch": 0.08355725868635652, + "grad_norm": 0.44001504778862, + "learning_rate": 1.9657431943511837e-05, + "loss": 0.3288, + "step": 2235 + }, + { + "epoch": 0.08374418767670631, + "grad_norm": 0.2602495849132538, + "learning_rate": 1.96559063470289e-05, + "loss": 0.2417, + "step": 2240 + }, + { + "epoch": 0.0839311166670561, + "grad_norm": 0.48337632417678833, + "learning_rate": 1.965437742050054e-05, + "loss": 0.4299, + "step": 2245 + }, + { + "epoch": 0.0841180456574059, + "grad_norm": 0.40766072273254395, + "learning_rate": 1.9652845164454044e-05, + "loss": 0.3488, + "step": 2250 + }, + { + "epoch": 0.08430497464775569, + "grad_norm": 0.4196990132331848, + "learning_rate": 1.9651309579417835e-05, + "loss": 0.3123, + "step": 2255 + }, + { + "epoch": 0.08449190363810548, + "grad_norm": 0.4198533296585083, + "learning_rate": 1.96497706659215e-05, + "loss": 0.2318, + "step": 2260 + }, + { + "epoch": 0.08467883262845527, + "grad_norm": 0.48852840065956116, + "learning_rate": 1.9648228424495765e-05, + "loss": 0.2639, + "step": 2265 + }, + { + "epoch": 0.08486576161880506, + "grad_norm": 0.42771658301353455, + "learning_rate": 1.96466828556725e-05, + "loss": 0.2426, + "step": 2270 + }, + { + "epoch": 0.08505269060915485, + "grad_norm": 0.46135228872299194, + "learning_rate": 1.9645133959984733e-05, + "loss": 0.3736, + "step": 2275 + }, + { + "epoch": 0.08523961959950464, + "grad_norm": 0.673338770866394, + "learning_rate": 1.9643581737966628e-05, + "loss": 0.2744, + "step": 2280 + }, + { + "epoch": 0.08542654858985443, + "grad_norm": 0.5732733011245728, + "learning_rate": 1.964202619015351e-05, + "loss": 0.3472, + "step": 2285 + }, + { + "epoch": 0.08561347758020423, + "grad_norm": 0.6712455749511719, + "learning_rate": 1.9640467317081833e-05, + "loss": 0.2671, + "step": 2290 + }, + { + "epoch": 0.08580040657055402, + "grad_norm": 0.6656973958015442, + "learning_rate": 1.9638905119289215e-05, + "loss": 0.3475, + "step": 2295 + }, + { + "epoch": 0.08598733556090381, + "grad_norm": 0.3552950620651245, + "learning_rate": 1.963733959731441e-05, + "loss": 0.2821, + "step": 2300 + }, + { + "epoch": 0.0861742645512536, + "grad_norm": 0.7310354709625244, + "learning_rate": 1.9635770751697326e-05, + "loss": 0.3669, + "step": 2305 + }, + { + "epoch": 0.08636119354160339, + "grad_norm": 0.3826546370983124, + "learning_rate": 1.9634198582979005e-05, + "loss": 0.3274, + "step": 2310 + }, + { + "epoch": 0.08654812253195318, + "grad_norm": 0.8085774183273315, + "learning_rate": 1.963262309170165e-05, + "loss": 0.303, + "step": 2315 + }, + { + "epoch": 0.08673505152230296, + "grad_norm": 0.2076191008090973, + "learning_rate": 1.96310442784086e-05, + "loss": 0.2976, + "step": 2320 + }, + { + "epoch": 0.08692198051265275, + "grad_norm": 0.8541048169136047, + "learning_rate": 1.962946214364434e-05, + "loss": 0.3923, + "step": 2325 + }, + { + "epoch": 0.08710890950300254, + "grad_norm": 0.4931800663471222, + "learning_rate": 1.9627876687954508e-05, + "loss": 0.3641, + "step": 2330 + }, + { + "epoch": 0.08729583849335233, + "grad_norm": 0.33070865273475647, + "learning_rate": 1.9626287911885882e-05, + "loss": 0.3196, + "step": 2335 + }, + { + "epoch": 0.08748276748370212, + "grad_norm": 0.2637079060077667, + "learning_rate": 1.9624695815986383e-05, + "loss": 0.3229, + "step": 2340 + }, + { + "epoch": 0.08766969647405191, + "grad_norm": 0.34368112683296204, + "learning_rate": 1.9623100400805076e-05, + "loss": 0.2956, + "step": 2345 + }, + { + "epoch": 0.0878566254644017, + "grad_norm": 0.5603893995285034, + "learning_rate": 1.9621501666892178e-05, + "loss": 0.2941, + "step": 2350 + }, + { + "epoch": 0.0880435544547515, + "grad_norm": 0.5940320491790771, + "learning_rate": 1.9619899614799046e-05, + "loss": 0.315, + "step": 2355 + }, + { + "epoch": 0.08823048344510129, + "grad_norm": 0.36051857471466064, + "learning_rate": 1.9618294245078184e-05, + "loss": 0.391, + "step": 2360 + }, + { + "epoch": 0.08841741243545108, + "grad_norm": 0.5375169515609741, + "learning_rate": 1.9616685558283234e-05, + "loss": 0.3213, + "step": 2365 + }, + { + "epoch": 0.08860434142580087, + "grad_norm": 0.6388794779777527, + "learning_rate": 1.9615073554968988e-05, + "loss": 0.347, + "step": 2370 + }, + { + "epoch": 0.08879127041615066, + "grad_norm": 1.1879587173461914, + "learning_rate": 1.961345823569138e-05, + "loss": 0.2717, + "step": 2375 + }, + { + "epoch": 0.08897819940650045, + "grad_norm": 0.21704648435115814, + "learning_rate": 1.961183960100749e-05, + "loss": 0.3559, + "step": 2380 + }, + { + "epoch": 0.08916512839685024, + "grad_norm": 1.803924560546875, + "learning_rate": 1.961021765147553e-05, + "loss": 0.3177, + "step": 2385 + }, + { + "epoch": 0.08935205738720003, + "grad_norm": 0.6759556531906128, + "learning_rate": 1.9608592387654873e-05, + "loss": 0.2972, + "step": 2390 + }, + { + "epoch": 0.08953898637754983, + "grad_norm": 0.2949463725090027, + "learning_rate": 1.9606963810106023e-05, + "loss": 0.3087, + "step": 2395 + }, + { + "epoch": 0.08972591536789962, + "grad_norm": 0.34233054518699646, + "learning_rate": 1.9605331919390627e-05, + "loss": 0.4371, + "step": 2400 + }, + { + "epoch": 0.08991284435824941, + "grad_norm": 0.3700423836708069, + "learning_rate": 1.960369671607148e-05, + "loss": 0.2686, + "step": 2405 + }, + { + "epoch": 0.0900997733485992, + "grad_norm": 0.528294026851654, + "learning_rate": 1.9602058200712516e-05, + "loss": 0.3552, + "step": 2410 + }, + { + "epoch": 0.09028670233894899, + "grad_norm": 2.334455966949463, + "learning_rate": 1.9600416373878815e-05, + "loss": 0.3232, + "step": 2415 + }, + { + "epoch": 0.09047363132929878, + "grad_norm": 0.4088938236236572, + "learning_rate": 1.959877123613659e-05, + "loss": 0.2649, + "step": 2420 + }, + { + "epoch": 0.09066056031964857, + "grad_norm": 0.5448519587516785, + "learning_rate": 1.959712278805321e-05, + "loss": 0.3223, + "step": 2425 + }, + { + "epoch": 0.09084748930999836, + "grad_norm": 0.5493069291114807, + "learning_rate": 1.9595471030197165e-05, + "loss": 0.345, + "step": 2430 + }, + { + "epoch": 0.09103441830034815, + "grad_norm": 0.39371412992477417, + "learning_rate": 1.9593815963138115e-05, + "loss": 0.3379, + "step": 2435 + }, + { + "epoch": 0.09122134729069795, + "grad_norm": 0.5203854441642761, + "learning_rate": 1.9592157587446833e-05, + "loss": 0.2653, + "step": 2440 + }, + { + "epoch": 0.09140827628104774, + "grad_norm": 0.4320240616798401, + "learning_rate": 1.9590495903695248e-05, + "loss": 0.3827, + "step": 2445 + }, + { + "epoch": 0.09159520527139753, + "grad_norm": 0.35967013239860535, + "learning_rate": 1.958883091245643e-05, + "loss": 0.4025, + "step": 2450 + }, + { + "epoch": 0.09178213426174732, + "grad_norm": 0.47668302059173584, + "learning_rate": 1.9587162614304588e-05, + "loss": 0.2894, + "step": 2455 + }, + { + "epoch": 0.09196906325209711, + "grad_norm": 0.463037371635437, + "learning_rate": 1.958549100981506e-05, + "loss": 0.2681, + "step": 2460 + }, + { + "epoch": 0.0921559922424469, + "grad_norm": 0.5469281077384949, + "learning_rate": 1.9583816099564346e-05, + "loss": 0.346, + "step": 2465 + }, + { + "epoch": 0.0923429212327967, + "grad_norm": 0.40648287534713745, + "learning_rate": 1.958213788413007e-05, + "loss": 0.3613, + "step": 2470 + }, + { + "epoch": 0.09252985022314648, + "grad_norm": 0.35621514916419983, + "learning_rate": 1.9580456364091003e-05, + "loss": 0.3235, + "step": 2475 + }, + { + "epoch": 0.09271677921349628, + "grad_norm": 0.6592122316360474, + "learning_rate": 1.957877154002705e-05, + "loss": 0.3155, + "step": 2480 + }, + { + "epoch": 0.09290370820384607, + "grad_norm": 0.515332818031311, + "learning_rate": 1.9577083412519258e-05, + "loss": 0.2627, + "step": 2485 + }, + { + "epoch": 0.09309063719419586, + "grad_norm": 0.3281778395175934, + "learning_rate": 1.9575391982149814e-05, + "loss": 0.3066, + "step": 2490 + }, + { + "epoch": 0.09327756618454565, + "grad_norm": 0.4038369953632355, + "learning_rate": 1.9573697249502046e-05, + "loss": 0.2842, + "step": 2495 + }, + { + "epoch": 0.09346449517489544, + "grad_norm": 0.5899633169174194, + "learning_rate": 1.957199921516042e-05, + "loss": 0.2495, + "step": 2500 + }, + { + "epoch": 0.09365142416524523, + "grad_norm": 0.2653183043003082, + "learning_rate": 1.9570297879710533e-05, + "loss": 0.3687, + "step": 2505 + }, + { + "epoch": 0.09383835315559502, + "grad_norm": 0.5264580249786377, + "learning_rate": 1.9568593243739133e-05, + "loss": 0.3317, + "step": 2510 + }, + { + "epoch": 0.09402528214594481, + "grad_norm": 0.8166529536247253, + "learning_rate": 1.95668853078341e-05, + "loss": 0.3921, + "step": 2515 + }, + { + "epoch": 0.0942122111362946, + "grad_norm": 3.684605360031128, + "learning_rate": 1.9565174072584448e-05, + "loss": 0.383, + "step": 2520 + }, + { + "epoch": 0.0943991401266444, + "grad_norm": 0.3563925325870514, + "learning_rate": 1.9563459538580337e-05, + "loss": 0.2752, + "step": 2525 + }, + { + "epoch": 0.09458606911699419, + "grad_norm": 0.696751594543457, + "learning_rate": 1.9561741706413055e-05, + "loss": 0.2909, + "step": 2530 + }, + { + "epoch": 0.09477299810734398, + "grad_norm": 0.47342103719711304, + "learning_rate": 1.956002057667504e-05, + "loss": 0.3115, + "step": 2535 + }, + { + "epoch": 0.09495992709769377, + "grad_norm": 0.3338661193847656, + "learning_rate": 1.955829614995986e-05, + "loss": 0.3501, + "step": 2540 + }, + { + "epoch": 0.09514685608804356, + "grad_norm": 0.48812028765678406, + "learning_rate": 1.9556568426862214e-05, + "loss": 0.3285, + "step": 2545 + }, + { + "epoch": 0.09533378507839335, + "grad_norm": 0.5304372310638428, + "learning_rate": 1.9554837407977948e-05, + "loss": 0.3472, + "step": 2550 + }, + { + "epoch": 0.09552071406874314, + "grad_norm": 0.5442197322845459, + "learning_rate": 1.9553103093904043e-05, + "loss": 0.3138, + "step": 2555 + }, + { + "epoch": 0.09570764305909293, + "grad_norm": 0.34639909863471985, + "learning_rate": 1.955136548523861e-05, + "loss": 0.2917, + "step": 2560 + }, + { + "epoch": 0.09589457204944271, + "grad_norm": 0.43235430121421814, + "learning_rate": 1.9549624582580905e-05, + "loss": 0.382, + "step": 2565 + }, + { + "epoch": 0.0960815010397925, + "grad_norm": 0.37063688039779663, + "learning_rate": 1.9547880386531307e-05, + "loss": 0.3394, + "step": 2570 + }, + { + "epoch": 0.0962684300301423, + "grad_norm": 1.0126583576202393, + "learning_rate": 1.954613289769135e-05, + "loss": 0.3272, + "step": 2575 + }, + { + "epoch": 0.09645535902049208, + "grad_norm": 0.6808075308799744, + "learning_rate": 1.9544382116663687e-05, + "loss": 0.3496, + "step": 2580 + }, + { + "epoch": 0.09664228801084188, + "grad_norm": 0.4482937753200531, + "learning_rate": 1.9542628044052115e-05, + "loss": 0.3691, + "step": 2585 + }, + { + "epoch": 0.09682921700119167, + "grad_norm": 0.5843492150306702, + "learning_rate": 1.954087068046156e-05, + "loss": 0.3311, + "step": 2590 + }, + { + "epoch": 0.09701614599154146, + "grad_norm": 0.45115911960601807, + "learning_rate": 1.9539110026498085e-05, + "loss": 0.2578, + "step": 2595 + }, + { + "epoch": 0.09720307498189125, + "grad_norm": 0.4495108425617218, + "learning_rate": 1.9537346082768894e-05, + "loss": 0.3572, + "step": 2600 + }, + { + "epoch": 0.09739000397224104, + "grad_norm": 0.5485822558403015, + "learning_rate": 1.9535578849882318e-05, + "loss": 0.3271, + "step": 2605 + }, + { + "epoch": 0.09757693296259083, + "grad_norm": 0.3917326331138611, + "learning_rate": 1.9533808328447828e-05, + "loss": 0.2825, + "step": 2610 + }, + { + "epoch": 0.09776386195294062, + "grad_norm": 0.7909905314445496, + "learning_rate": 1.953203451907602e-05, + "loss": 0.299, + "step": 2615 + }, + { + "epoch": 0.09795079094329041, + "grad_norm": 0.6403763294219971, + "learning_rate": 1.9530257422378635e-05, + "loss": 0.4017, + "step": 2620 + }, + { + "epoch": 0.0981377199336402, + "grad_norm": 0.5497360229492188, + "learning_rate": 1.9528477038968542e-05, + "loss": 0.4055, + "step": 2625 + }, + { + "epoch": 0.09832464892399, + "grad_norm": 0.35415706038475037, + "learning_rate": 1.9526693369459747e-05, + "loss": 0.2735, + "step": 2630 + }, + { + "epoch": 0.09851157791433979, + "grad_norm": 0.6400545239448547, + "learning_rate": 1.9524906414467376e-05, + "loss": 0.3236, + "step": 2635 + }, + { + "epoch": 0.09869850690468958, + "grad_norm": 0.6510913968086243, + "learning_rate": 1.952311617460771e-05, + "loss": 0.2668, + "step": 2640 + }, + { + "epoch": 0.09888543589503937, + "grad_norm": 0.36324068903923035, + "learning_rate": 1.9521322650498148e-05, + "loss": 0.3129, + "step": 2645 + }, + { + "epoch": 0.09907236488538916, + "grad_norm": 0.4775453209877014, + "learning_rate": 1.9519525842757223e-05, + "loss": 0.265, + "step": 2650 + }, + { + "epoch": 0.09925929387573895, + "grad_norm": 0.6153637170791626, + "learning_rate": 1.9517725752004605e-05, + "loss": 0.2791, + "step": 2655 + }, + { + "epoch": 0.09944622286608874, + "grad_norm": 0.704619288444519, + "learning_rate": 1.951592237886109e-05, + "loss": 0.2513, + "step": 2660 + }, + { + "epoch": 0.09963315185643853, + "grad_norm": 0.49600765109062195, + "learning_rate": 1.9514115723948612e-05, + "loss": 0.416, + "step": 2665 + }, + { + "epoch": 0.09982008084678833, + "grad_norm": 0.5411927103996277, + "learning_rate": 1.9512305787890237e-05, + "loss": 0.3892, + "step": 2670 + }, + { + "epoch": 0.10000700983713812, + "grad_norm": 0.4148860275745392, + "learning_rate": 1.9510492571310157e-05, + "loss": 0.316, + "step": 2675 + }, + { + "epoch": 0.10019393882748791, + "grad_norm": 0.7685550451278687, + "learning_rate": 1.95086760748337e-05, + "loss": 0.3357, + "step": 2680 + }, + { + "epoch": 0.1003808678178377, + "grad_norm": 0.7916879057884216, + "learning_rate": 1.950685629908732e-05, + "loss": 0.4379, + "step": 2685 + }, + { + "epoch": 0.10056779680818749, + "grad_norm": 0.4039478003978729, + "learning_rate": 1.9505033244698614e-05, + "loss": 0.2756, + "step": 2690 + }, + { + "epoch": 0.10075472579853728, + "grad_norm": 0.4937886595726013, + "learning_rate": 1.950320691229629e-05, + "loss": 0.2877, + "step": 2695 + }, + { + "epoch": 0.10094165478888707, + "grad_norm": 0.7463287711143494, + "learning_rate": 1.9501377302510204e-05, + "loss": 0.277, + "step": 2700 + }, + { + "epoch": 0.10112858377923686, + "grad_norm": 0.591262936592102, + "learning_rate": 1.9499544415971337e-05, + "loss": 0.316, + "step": 2705 + }, + { + "epoch": 0.10131551276958665, + "grad_norm": 0.3209913372993469, + "learning_rate": 1.94977082533118e-05, + "loss": 0.2486, + "step": 2710 + }, + { + "epoch": 0.10150244175993645, + "grad_norm": 0.5596148371696472, + "learning_rate": 1.9495868815164827e-05, + "loss": 0.3891, + "step": 2715 + }, + { + "epoch": 0.10168937075028624, + "grad_norm": 0.3874911367893219, + "learning_rate": 1.949402610216479e-05, + "loss": 0.3855, + "step": 2720 + }, + { + "epoch": 0.10187629974063603, + "grad_norm": 0.5168285369873047, + "learning_rate": 1.9492180114947187e-05, + "loss": 0.2817, + "step": 2725 + }, + { + "epoch": 0.10206322873098582, + "grad_norm": 0.5476559996604919, + "learning_rate": 1.949033085414865e-05, + "loss": 0.3411, + "step": 2730 + }, + { + "epoch": 0.10225015772133561, + "grad_norm": 0.8609138131141663, + "learning_rate": 1.9488478320406937e-05, + "loss": 0.3765, + "step": 2735 + }, + { + "epoch": 0.1024370867116854, + "grad_norm": 0.5176308751106262, + "learning_rate": 1.9486622514360928e-05, + "loss": 0.2958, + "step": 2740 + }, + { + "epoch": 0.10262401570203519, + "grad_norm": 0.49085232615470886, + "learning_rate": 1.9484763436650637e-05, + "loss": 0.3155, + "step": 2745 + }, + { + "epoch": 0.10281094469238498, + "grad_norm": 0.5792192816734314, + "learning_rate": 1.948290108791721e-05, + "loss": 0.3046, + "step": 2750 + }, + { + "epoch": 0.10299787368273478, + "grad_norm": 0.5799537301063538, + "learning_rate": 1.9481035468802922e-05, + "loss": 0.3144, + "step": 2755 + }, + { + "epoch": 0.10318480267308457, + "grad_norm": 2.7372591495513916, + "learning_rate": 1.9479166579951162e-05, + "loss": 0.5132, + "step": 2760 + }, + { + "epoch": 0.10337173166343436, + "grad_norm": 0.6942354440689087, + "learning_rate": 1.9477294422006462e-05, + "loss": 0.312, + "step": 2765 + }, + { + "epoch": 0.10355866065378415, + "grad_norm": 0.42621228098869324, + "learning_rate": 1.947541899561448e-05, + "loss": 0.3342, + "step": 2770 + }, + { + "epoch": 0.10374558964413394, + "grad_norm": 0.2672087252140045, + "learning_rate": 1.9473540301421985e-05, + "loss": 0.3697, + "step": 2775 + }, + { + "epoch": 0.10393251863448373, + "grad_norm": 0.3490724563598633, + "learning_rate": 1.9471658340076895e-05, + "loss": 0.397, + "step": 2780 + }, + { + "epoch": 0.10411944762483352, + "grad_norm": 0.4320790767669678, + "learning_rate": 1.9469773112228237e-05, + "loss": 0.354, + "step": 2785 + }, + { + "epoch": 0.10430637661518331, + "grad_norm": 0.19510655105113983, + "learning_rate": 1.946788461852618e-05, + "loss": 0.3459, + "step": 2790 + }, + { + "epoch": 0.1044933056055331, + "grad_norm": 0.4755474627017975, + "learning_rate": 1.9465992859622006e-05, + "loss": 0.3761, + "step": 2795 + }, + { + "epoch": 0.1046802345958829, + "grad_norm": 0.3486561179161072, + "learning_rate": 1.946409783616813e-05, + "loss": 0.2303, + "step": 2800 + }, + { + "epoch": 0.10486716358623269, + "grad_norm": 0.7743209004402161, + "learning_rate": 1.946219954881809e-05, + "loss": 0.3435, + "step": 2805 + }, + { + "epoch": 0.10505409257658246, + "grad_norm": 1.9734418392181396, + "learning_rate": 1.9460297998226552e-05, + "loss": 0.3507, + "step": 2810 + }, + { + "epoch": 0.10524102156693226, + "grad_norm": 0.5270185470581055, + "learning_rate": 1.9458393185049303e-05, + "loss": 0.4341, + "step": 2815 + }, + { + "epoch": 0.10542795055728205, + "grad_norm": 0.41794025897979736, + "learning_rate": 1.945648510994327e-05, + "loss": 0.3172, + "step": 2820 + }, + { + "epoch": 0.10561487954763184, + "grad_norm": 0.22198650240898132, + "learning_rate": 1.9454573773566478e-05, + "loss": 0.2623, + "step": 2825 + }, + { + "epoch": 0.10580180853798163, + "grad_norm": 0.36959734559059143, + "learning_rate": 1.94526591765781e-05, + "loss": 0.2379, + "step": 2830 + }, + { + "epoch": 0.10598873752833142, + "grad_norm": 0.21317122876644135, + "learning_rate": 1.945074131963843e-05, + "loss": 0.3395, + "step": 2835 + }, + { + "epoch": 0.10617566651868121, + "grad_norm": 0.5071830153465271, + "learning_rate": 1.944882020340887e-05, + "loss": 0.3268, + "step": 2840 + }, + { + "epoch": 0.106362595509031, + "grad_norm": 0.4251273274421692, + "learning_rate": 1.944689582855197e-05, + "loss": 0.3505, + "step": 2845 + }, + { + "epoch": 0.1065495244993808, + "grad_norm": 0.6162049174308777, + "learning_rate": 1.9444968195731384e-05, + "loss": 0.308, + "step": 2850 + }, + { + "epoch": 0.10673645348973058, + "grad_norm": 0.522985577583313, + "learning_rate": 1.94430373056119e-05, + "loss": 0.2516, + "step": 2855 + }, + { + "epoch": 0.10692338248008038, + "grad_norm": 0.7239044308662415, + "learning_rate": 1.9441103158859427e-05, + "loss": 0.3029, + "step": 2860 + }, + { + "epoch": 0.10711031147043017, + "grad_norm": 0.5046908855438232, + "learning_rate": 1.9439165756141e-05, + "loss": 0.3179, + "step": 2865 + }, + { + "epoch": 0.10729724046077996, + "grad_norm": 0.4853164553642273, + "learning_rate": 1.9437225098124765e-05, + "loss": 0.2714, + "step": 2870 + }, + { + "epoch": 0.10748416945112975, + "grad_norm": 0.472434401512146, + "learning_rate": 1.9435281185480007e-05, + "loss": 0.2768, + "step": 2875 + }, + { + "epoch": 0.10767109844147954, + "grad_norm": 0.549471914768219, + "learning_rate": 1.943333401887712e-05, + "loss": 0.3117, + "step": 2880 + }, + { + "epoch": 0.10785802743182933, + "grad_norm": 0.3362756669521332, + "learning_rate": 1.943138359898763e-05, + "loss": 0.2906, + "step": 2885 + }, + { + "epoch": 0.10804495642217912, + "grad_norm": 0.6930930614471436, + "learning_rate": 1.9429429926484184e-05, + "loss": 0.2813, + "step": 2890 + }, + { + "epoch": 0.10823188541252891, + "grad_norm": 1.1984028816223145, + "learning_rate": 1.942747300204054e-05, + "loss": 0.3159, + "step": 2895 + }, + { + "epoch": 0.1084188144028787, + "grad_norm": 0.6312728524208069, + "learning_rate": 1.9425512826331593e-05, + "loss": 0.3663, + "step": 2900 + }, + { + "epoch": 0.1086057433932285, + "grad_norm": 0.4989972710609436, + "learning_rate": 1.9423549400033344e-05, + "loss": 0.3155, + "step": 2905 + }, + { + "epoch": 0.10879267238357829, + "grad_norm": 0.22321349382400513, + "learning_rate": 1.9421582723822926e-05, + "loss": 0.3021, + "step": 2910 + }, + { + "epoch": 0.10897960137392808, + "grad_norm": 0.4356032609939575, + "learning_rate": 1.9419612798378588e-05, + "loss": 0.341, + "step": 2915 + }, + { + "epoch": 0.10916653036427787, + "grad_norm": 0.6972177028656006, + "learning_rate": 1.9417639624379704e-05, + "loss": 0.31, + "step": 2920 + }, + { + "epoch": 0.10935345935462766, + "grad_norm": 0.6108505725860596, + "learning_rate": 1.9415663202506757e-05, + "loss": 0.2865, + "step": 2925 + }, + { + "epoch": 0.10954038834497745, + "grad_norm": 0.29747897386550903, + "learning_rate": 1.941368353344137e-05, + "loss": 0.2782, + "step": 2930 + }, + { + "epoch": 0.10972731733532724, + "grad_norm": 0.46353158354759216, + "learning_rate": 1.9411700617866268e-05, + "loss": 0.3503, + "step": 2935 + }, + { + "epoch": 0.10991424632567703, + "grad_norm": 0.3761847913265228, + "learning_rate": 1.9409714456465303e-05, + "loss": 0.3263, + "step": 2940 + }, + { + "epoch": 0.11010117531602683, + "grad_norm": 0.3461897671222687, + "learning_rate": 1.9407725049923443e-05, + "loss": 0.2596, + "step": 2945 + }, + { + "epoch": 0.11028810430637662, + "grad_norm": 0.3752954602241516, + "learning_rate": 1.940573239892678e-05, + "loss": 0.3043, + "step": 2950 + }, + { + "epoch": 0.11047503329672641, + "grad_norm": 0.41294610500335693, + "learning_rate": 1.940373650416252e-05, + "loss": 0.327, + "step": 2955 + }, + { + "epoch": 0.1106619622870762, + "grad_norm": 0.4009648859500885, + "learning_rate": 1.940173736631899e-05, + "loss": 0.2781, + "step": 2960 + }, + { + "epoch": 0.11084889127742599, + "grad_norm": 0.20430001616477966, + "learning_rate": 1.9399734986085636e-05, + "loss": 0.3943, + "step": 2965 + }, + { + "epoch": 0.11103582026777578, + "grad_norm": 0.7819133400917053, + "learning_rate": 1.9397729364153025e-05, + "loss": 0.295, + "step": 2970 + }, + { + "epoch": 0.11122274925812557, + "grad_norm": 0.7665151953697205, + "learning_rate": 1.9395720501212833e-05, + "loss": 0.2599, + "step": 2975 + }, + { + "epoch": 0.11140967824847536, + "grad_norm": 0.38904261589050293, + "learning_rate": 1.9393708397957863e-05, + "loss": 0.3369, + "step": 2980 + }, + { + "epoch": 0.11159660723882515, + "grad_norm": 0.4791386127471924, + "learning_rate": 1.9391693055082028e-05, + "loss": 0.3276, + "step": 2985 + }, + { + "epoch": 0.11178353622917495, + "grad_norm": 0.5694244503974915, + "learning_rate": 1.9389674473280365e-05, + "loss": 0.3659, + "step": 2990 + }, + { + "epoch": 0.11197046521952474, + "grad_norm": 0.7917852401733398, + "learning_rate": 1.9387652653249023e-05, + "loss": 0.333, + "step": 2995 + }, + { + "epoch": 0.11215739420987453, + "grad_norm": 0.41513529419898987, + "learning_rate": 1.9385627595685275e-05, + "loss": 0.2983, + "step": 3000 + }, + { + "epoch": 0.11234432320022432, + "grad_norm": 0.39530879259109497, + "learning_rate": 1.9383599301287498e-05, + "loss": 0.2979, + "step": 3005 + }, + { + "epoch": 0.11253125219057411, + "grad_norm": 0.40975919365882874, + "learning_rate": 1.93815677707552e-05, + "loss": 0.3145, + "step": 3010 + }, + { + "epoch": 0.1127181811809239, + "grad_norm": 0.7045682072639465, + "learning_rate": 1.9379533004788992e-05, + "loss": 0.3777, + "step": 3015 + }, + { + "epoch": 0.11290511017127369, + "grad_norm": 0.5002708435058594, + "learning_rate": 1.9377495004090605e-05, + "loss": 0.3937, + "step": 3020 + }, + { + "epoch": 0.11309203916162348, + "grad_norm": 0.9641180634498596, + "learning_rate": 1.937545376936289e-05, + "loss": 0.2889, + "step": 3025 + }, + { + "epoch": 0.11327896815197327, + "grad_norm": 0.41359230875968933, + "learning_rate": 1.9373409301309817e-05, + "loss": 0.2924, + "step": 3030 + }, + { + "epoch": 0.11346589714232307, + "grad_norm": 0.5380722284317017, + "learning_rate": 1.9371361600636452e-05, + "loss": 0.3907, + "step": 3035 + }, + { + "epoch": 0.11365282613267286, + "grad_norm": 0.8563070297241211, + "learning_rate": 1.9369310668049e-05, + "loss": 0.341, + "step": 3040 + }, + { + "epoch": 0.11383975512302265, + "grad_norm": 1.5900834798812866, + "learning_rate": 1.936725650425476e-05, + "loss": 0.3633, + "step": 3045 + }, + { + "epoch": 0.11402668411337243, + "grad_norm": 0.4389062821865082, + "learning_rate": 1.936519910996216e-05, + "loss": 0.341, + "step": 3050 + }, + { + "epoch": 0.11421361310372222, + "grad_norm": 0.4638637602329254, + "learning_rate": 1.936313848588073e-05, + "loss": 0.3595, + "step": 3055 + }, + { + "epoch": 0.11440054209407201, + "grad_norm": 0.541275680065155, + "learning_rate": 1.9361074632721125e-05, + "loss": 0.3566, + "step": 3060 + }, + { + "epoch": 0.1145874710844218, + "grad_norm": 0.38814595341682434, + "learning_rate": 1.935900755119511e-05, + "loss": 0.28, + "step": 3065 + }, + { + "epoch": 0.11477440007477159, + "grad_norm": 0.3759201169013977, + "learning_rate": 1.935693724201556e-05, + "loss": 0.3574, + "step": 3070 + }, + { + "epoch": 0.11496132906512138, + "grad_norm": 0.4212549030780792, + "learning_rate": 1.9354863705896464e-05, + "loss": 0.2823, + "step": 3075 + }, + { + "epoch": 0.11514825805547117, + "grad_norm": 0.24448025226593018, + "learning_rate": 1.9352786943552925e-05, + "loss": 0.3305, + "step": 3080 + }, + { + "epoch": 0.11533518704582096, + "grad_norm": 1.0636529922485352, + "learning_rate": 1.9350706955701163e-05, + "loss": 0.4083, + "step": 3085 + }, + { + "epoch": 0.11552211603617075, + "grad_norm": 0.554562509059906, + "learning_rate": 1.9348623743058504e-05, + "loss": 0.3184, + "step": 3090 + }, + { + "epoch": 0.11570904502652055, + "grad_norm": 0.5117464661598206, + "learning_rate": 1.9346537306343384e-05, + "loss": 0.2759, + "step": 3095 + }, + { + "epoch": 0.11589597401687034, + "grad_norm": 0.2832476794719696, + "learning_rate": 1.9344447646275367e-05, + "loss": 0.2997, + "step": 3100 + }, + { + "epoch": 0.11608290300722013, + "grad_norm": 0.38118836283683777, + "learning_rate": 1.9342354763575103e-05, + "loss": 0.3417, + "step": 3105 + }, + { + "epoch": 0.11626983199756992, + "grad_norm": 0.42836499214172363, + "learning_rate": 1.9340258658964376e-05, + "loss": 0.2619, + "step": 3110 + }, + { + "epoch": 0.11645676098791971, + "grad_norm": 0.6346667408943176, + "learning_rate": 1.9338159333166063e-05, + "loss": 0.2843, + "step": 3115 + }, + { + "epoch": 0.1166436899782695, + "grad_norm": 0.4850265681743622, + "learning_rate": 1.9336056786904175e-05, + "loss": 0.3905, + "step": 3120 + }, + { + "epoch": 0.1168306189686193, + "grad_norm": 0.6557062864303589, + "learning_rate": 1.9333951020903812e-05, + "loss": 0.3158, + "step": 3125 + }, + { + "epoch": 0.11701754795896908, + "grad_norm": 0.30885252356529236, + "learning_rate": 1.9331842035891193e-05, + "loss": 0.2957, + "step": 3130 + }, + { + "epoch": 0.11720447694931888, + "grad_norm": 0.3093513250350952, + "learning_rate": 1.9329729832593646e-05, + "loss": 0.3044, + "step": 3135 + }, + { + "epoch": 0.11739140593966867, + "grad_norm": 0.4149554967880249, + "learning_rate": 1.932761441173961e-05, + "loss": 0.3545, + "step": 3140 + }, + { + "epoch": 0.11757833493001846, + "grad_norm": 0.5750139951705933, + "learning_rate": 1.932549577405864e-05, + "loss": 0.2389, + "step": 3145 + }, + { + "epoch": 0.11776526392036825, + "grad_norm": 0.5002467632293701, + "learning_rate": 1.932337392028138e-05, + "loss": 0.3064, + "step": 3150 + }, + { + "epoch": 0.11795219291071804, + "grad_norm": 0.7145326733589172, + "learning_rate": 1.9321248851139605e-05, + "loss": 0.3383, + "step": 3155 + }, + { + "epoch": 0.11813912190106783, + "grad_norm": 0.3954809010028839, + "learning_rate": 1.9319120567366186e-05, + "loss": 0.3623, + "step": 3160 + }, + { + "epoch": 0.11832605089141762, + "grad_norm": 0.637115478515625, + "learning_rate": 1.9316989069695112e-05, + "loss": 0.2777, + "step": 3165 + }, + { + "epoch": 0.11851297988176741, + "grad_norm": 0.45427948236465454, + "learning_rate": 1.9314854358861478e-05, + "loss": 0.5832, + "step": 3170 + }, + { + "epoch": 0.1186999088721172, + "grad_norm": 0.41445136070251465, + "learning_rate": 1.931271643560147e-05, + "loss": 0.3425, + "step": 3175 + }, + { + "epoch": 0.118886837862467, + "grad_norm": 0.6264867186546326, + "learning_rate": 1.9310575300652416e-05, + "loss": 0.2942, + "step": 3180 + }, + { + "epoch": 0.11907376685281679, + "grad_norm": 0.2800760269165039, + "learning_rate": 1.9308430954752717e-05, + "loss": 0.2636, + "step": 3185 + }, + { + "epoch": 0.11926069584316658, + "grad_norm": 0.49179184436798096, + "learning_rate": 1.9306283398641906e-05, + "loss": 0.2834, + "step": 3190 + }, + { + "epoch": 0.11944762483351637, + "grad_norm": 0.17769566178321838, + "learning_rate": 1.9304132633060605e-05, + "loss": 0.3386, + "step": 3195 + }, + { + "epoch": 0.11963455382386616, + "grad_norm": 0.35259896516799927, + "learning_rate": 1.930197865875056e-05, + "loss": 0.3855, + "step": 3200 + }, + { + "epoch": 0.11982148281421595, + "grad_norm": 0.44811856746673584, + "learning_rate": 1.929982147645461e-05, + "loss": 0.3237, + "step": 3205 + }, + { + "epoch": 0.12000841180456574, + "grad_norm": 0.6957955360412598, + "learning_rate": 1.9297661086916704e-05, + "loss": 0.3235, + "step": 3210 + }, + { + "epoch": 0.12019534079491553, + "grad_norm": 0.35209953784942627, + "learning_rate": 1.9295497490881902e-05, + "loss": 0.3576, + "step": 3215 + }, + { + "epoch": 0.12038226978526533, + "grad_norm": 0.30803924798965454, + "learning_rate": 1.9293330689096366e-05, + "loss": 0.313, + "step": 3220 + }, + { + "epoch": 0.12056919877561512, + "grad_norm": 0.39552509784698486, + "learning_rate": 1.9291160682307363e-05, + "loss": 0.2947, + "step": 3225 + }, + { + "epoch": 0.12075612776596491, + "grad_norm": 0.5420857667922974, + "learning_rate": 1.9288987471263266e-05, + "loss": 0.3885, + "step": 3230 + }, + { + "epoch": 0.1209430567563147, + "grad_norm": 0.36529114842414856, + "learning_rate": 1.928681105671355e-05, + "loss": 0.3692, + "step": 3235 + }, + { + "epoch": 0.12112998574666449, + "grad_norm": 0.27940261363983154, + "learning_rate": 1.9284631439408804e-05, + "loss": 0.2968, + "step": 3240 + }, + { + "epoch": 0.12131691473701428, + "grad_norm": 0.38085952401161194, + "learning_rate": 1.9282448620100716e-05, + "loss": 0.3231, + "step": 3245 + }, + { + "epoch": 0.12150384372736407, + "grad_norm": 0.4754383862018585, + "learning_rate": 1.928026259954207e-05, + "loss": 0.396, + "step": 3250 + }, + { + "epoch": 0.12169077271771386, + "grad_norm": 0.5790641903877258, + "learning_rate": 1.927807337848677e-05, + "loss": 0.2934, + "step": 3255 + }, + { + "epoch": 0.12187770170806365, + "grad_norm": 0.4564800560474396, + "learning_rate": 1.927588095768981e-05, + "loss": 0.2913, + "step": 3260 + }, + { + "epoch": 0.12206463069841345, + "grad_norm": 0.5212480425834656, + "learning_rate": 1.9273685337907295e-05, + "loss": 0.2929, + "step": 3265 + }, + { + "epoch": 0.12225155968876324, + "grad_norm": 0.43916815519332886, + "learning_rate": 1.9271486519896434e-05, + "loss": 0.345, + "step": 3270 + }, + { + "epoch": 0.12243848867911303, + "grad_norm": 0.47202667593955994, + "learning_rate": 1.926928450441553e-05, + "loss": 0.3599, + "step": 3275 + }, + { + "epoch": 0.12262541766946282, + "grad_norm": 0.3427533507347107, + "learning_rate": 1.9267079292224005e-05, + "loss": 0.2895, + "step": 3280 + }, + { + "epoch": 0.12281234665981261, + "grad_norm": 0.23820289969444275, + "learning_rate": 1.9264870884082362e-05, + "loss": 0.2214, + "step": 3285 + }, + { + "epoch": 0.1229992756501624, + "grad_norm": 0.7368319630622864, + "learning_rate": 1.9262659280752224e-05, + "loss": 0.2429, + "step": 3290 + }, + { + "epoch": 0.12318620464051218, + "grad_norm": 0.36256974935531616, + "learning_rate": 1.9260444482996313e-05, + "loss": 0.3072, + "step": 3295 + }, + { + "epoch": 0.12337313363086197, + "grad_norm": 0.44561657309532166, + "learning_rate": 1.9258226491578443e-05, + "loss": 0.2567, + "step": 3300 + }, + { + "epoch": 0.12356006262121176, + "grad_norm": 0.708128035068512, + "learning_rate": 1.9256005307263536e-05, + "loss": 0.3169, + "step": 3305 + }, + { + "epoch": 0.12374699161156155, + "grad_norm": 0.37106049060821533, + "learning_rate": 1.925378093081762e-05, + "loss": 0.2737, + "step": 3310 + }, + { + "epoch": 0.12393392060191134, + "grad_norm": 3.3657779693603516, + "learning_rate": 1.925155336300781e-05, + "loss": 0.3468, + "step": 3315 + }, + { + "epoch": 0.12412084959226113, + "grad_norm": 0.33611029386520386, + "learning_rate": 1.9249322604602342e-05, + "loss": 0.3304, + "step": 3320 + }, + { + "epoch": 0.12430777858261093, + "grad_norm": 0.4634288251399994, + "learning_rate": 1.9247088656370528e-05, + "loss": 0.2953, + "step": 3325 + }, + { + "epoch": 0.12449470757296072, + "grad_norm": 1.3842734098434448, + "learning_rate": 1.9244851519082802e-05, + "loss": 0.4189, + "step": 3330 + }, + { + "epoch": 0.12468163656331051, + "grad_norm": 0.5978188514709473, + "learning_rate": 1.924261119351069e-05, + "loss": 0.3122, + "step": 3335 + }, + { + "epoch": 0.1248685655536603, + "grad_norm": 0.30435624718666077, + "learning_rate": 1.9240367680426804e-05, + "loss": 0.3596, + "step": 3340 + }, + { + "epoch": 0.1250554945440101, + "grad_norm": 0.30297866463661194, + "learning_rate": 1.923812098060488e-05, + "loss": 0.3459, + "step": 3345 + }, + { + "epoch": 0.12524242353435988, + "grad_norm": 0.3620128929615021, + "learning_rate": 1.923587109481973e-05, + "loss": 0.342, + "step": 3350 + }, + { + "epoch": 0.1254293525247097, + "grad_norm": 0.3281639516353607, + "learning_rate": 1.9233618023847285e-05, + "loss": 0.2632, + "step": 3355 + }, + { + "epoch": 0.12561628151505946, + "grad_norm": 0.3437563478946686, + "learning_rate": 1.923136176846456e-05, + "loss": 0.3144, + "step": 3360 + }, + { + "epoch": 0.12580321050540927, + "grad_norm": 0.4065919816493988, + "learning_rate": 1.9229102329449674e-05, + "loss": 0.328, + "step": 3365 + }, + { + "epoch": 0.12599013949575905, + "grad_norm": 0.714131236076355, + "learning_rate": 1.9226839707581838e-05, + "loss": 0.3692, + "step": 3370 + }, + { + "epoch": 0.12617706848610885, + "grad_norm": 0.6316368579864502, + "learning_rate": 1.9224573903641374e-05, + "loss": 0.3259, + "step": 3375 + }, + { + "epoch": 0.12636399747645863, + "grad_norm": 0.23467862606048584, + "learning_rate": 1.9222304918409684e-05, + "loss": 0.2745, + "step": 3380 + }, + { + "epoch": 0.12655092646680843, + "grad_norm": 0.5963695645332336, + "learning_rate": 1.9220032752669282e-05, + "loss": 0.297, + "step": 3385 + }, + { + "epoch": 0.1267378554571582, + "grad_norm": 0.38383743166923523, + "learning_rate": 1.921775740720377e-05, + "loss": 0.3192, + "step": 3390 + }, + { + "epoch": 0.12692478444750802, + "grad_norm": 0.5108599662780762, + "learning_rate": 1.9215478882797852e-05, + "loss": 0.321, + "step": 3395 + }, + { + "epoch": 0.1271117134378578, + "grad_norm": 0.6184415221214294, + "learning_rate": 1.9213197180237325e-05, + "loss": 0.3903, + "step": 3400 + }, + { + "epoch": 0.1272986424282076, + "grad_norm": 0.4752747416496277, + "learning_rate": 1.921091230030908e-05, + "loss": 0.2943, + "step": 3405 + }, + { + "epoch": 0.12748557141855738, + "grad_norm": 0.4967573583126068, + "learning_rate": 1.9208624243801107e-05, + "loss": 0.3427, + "step": 3410 + }, + { + "epoch": 0.12767250040890718, + "grad_norm": 0.6884995102882385, + "learning_rate": 1.9206333011502497e-05, + "loss": 0.2472, + "step": 3415 + }, + { + "epoch": 0.12785942939925696, + "grad_norm": 0.49374425411224365, + "learning_rate": 1.9204038604203423e-05, + "loss": 0.3751, + "step": 3420 + }, + { + "epoch": 0.12804635838960673, + "grad_norm": 0.38376346230506897, + "learning_rate": 1.9201741022695165e-05, + "loss": 0.3392, + "step": 3425 + }, + { + "epoch": 0.12823328737995654, + "grad_norm": 0.4707247018814087, + "learning_rate": 1.919944026777009e-05, + "loss": 0.2892, + "step": 3430 + }, + { + "epoch": 0.12842021637030632, + "grad_norm": 0.5075487494468689, + "learning_rate": 1.9197136340221667e-05, + "loss": 0.2932, + "step": 3435 + }, + { + "epoch": 0.12860714536065612, + "grad_norm": 0.3079968988895416, + "learning_rate": 1.9194829240844448e-05, + "loss": 0.3229, + "step": 3440 + }, + { + "epoch": 0.1287940743510059, + "grad_norm": 0.41244247555732727, + "learning_rate": 1.919251897043409e-05, + "loss": 0.3227, + "step": 3445 + }, + { + "epoch": 0.1289810033413557, + "grad_norm": 0.6245453953742981, + "learning_rate": 1.9190205529787336e-05, + "loss": 0.3171, + "step": 3450 + }, + { + "epoch": 0.12916793233170548, + "grad_norm": 0.5110964775085449, + "learning_rate": 1.9187888919702035e-05, + "loss": 0.3235, + "step": 3455 + }, + { + "epoch": 0.1293548613220553, + "grad_norm": 0.5428165197372437, + "learning_rate": 1.9185569140977104e-05, + "loss": 0.2773, + "step": 3460 + }, + { + "epoch": 0.12954179031240506, + "grad_norm": 0.24074698984622955, + "learning_rate": 1.9183246194412583e-05, + "loss": 0.2858, + "step": 3465 + }, + { + "epoch": 0.12972871930275487, + "grad_norm": 0.5543522834777832, + "learning_rate": 1.9180920080809575e-05, + "loss": 0.2618, + "step": 3470 + }, + { + "epoch": 0.12991564829310465, + "grad_norm": 0.15020422637462616, + "learning_rate": 1.9178590800970302e-05, + "loss": 0.3799, + "step": 3475 + }, + { + "epoch": 0.13010257728345445, + "grad_norm": 0.5467866659164429, + "learning_rate": 1.9176258355698062e-05, + "loss": 0.335, + "step": 3480 + }, + { + "epoch": 0.13028950627380423, + "grad_norm": 0.3544618487358093, + "learning_rate": 1.917392274579725e-05, + "loss": 0.322, + "step": 3485 + }, + { + "epoch": 0.13047643526415403, + "grad_norm": 0.32355251908302307, + "learning_rate": 1.9171583972073345e-05, + "loss": 0.2544, + "step": 3490 + }, + { + "epoch": 0.1306633642545038, + "grad_norm": 0.3455124795436859, + "learning_rate": 1.916924203533293e-05, + "loss": 0.2684, + "step": 3495 + }, + { + "epoch": 0.13085029324485362, + "grad_norm": 0.5765972137451172, + "learning_rate": 1.9166896936383668e-05, + "loss": 0.3054, + "step": 3500 + }, + { + "epoch": 0.1310372222352034, + "grad_norm": 0.5195266008377075, + "learning_rate": 1.9164548676034312e-05, + "loss": 0.3135, + "step": 3505 + }, + { + "epoch": 0.1312241512255532, + "grad_norm": 0.35077986121177673, + "learning_rate": 1.9162197255094722e-05, + "loss": 0.3495, + "step": 3510 + }, + { + "epoch": 0.13141108021590298, + "grad_norm": 0.34569740295410156, + "learning_rate": 1.915984267437583e-05, + "loss": 0.2855, + "step": 3515 + }, + { + "epoch": 0.13159800920625278, + "grad_norm": 0.36829873919487, + "learning_rate": 1.915748493468966e-05, + "loss": 0.3321, + "step": 3520 + }, + { + "epoch": 0.13178493819660256, + "grad_norm": 0.5086042881011963, + "learning_rate": 1.915512403684933e-05, + "loss": 0.3072, + "step": 3525 + }, + { + "epoch": 0.13197186718695236, + "grad_norm": 0.3874140679836273, + "learning_rate": 1.9152759981669046e-05, + "loss": 0.2989, + "step": 3530 + }, + { + "epoch": 0.13215879617730214, + "grad_norm": 0.4253312945365906, + "learning_rate": 1.9150392769964106e-05, + "loss": 0.3197, + "step": 3535 + }, + { + "epoch": 0.13234572516765195, + "grad_norm": 0.309948205947876, + "learning_rate": 1.914802240255089e-05, + "loss": 0.3147, + "step": 3540 + }, + { + "epoch": 0.13253265415800172, + "grad_norm": 0.38400158286094666, + "learning_rate": 1.9145648880246877e-05, + "loss": 0.3448, + "step": 3545 + }, + { + "epoch": 0.13271958314835153, + "grad_norm": 0.2646450996398926, + "learning_rate": 1.914327220387062e-05, + "loss": 0.4444, + "step": 3550 + }, + { + "epoch": 0.1329065121387013, + "grad_norm": 0.4903238117694855, + "learning_rate": 1.914089237424176e-05, + "loss": 0.3533, + "step": 3555 + }, + { + "epoch": 0.1330934411290511, + "grad_norm": 0.48070570826530457, + "learning_rate": 1.9138509392181047e-05, + "loss": 0.2815, + "step": 3560 + }, + { + "epoch": 0.1332803701194009, + "grad_norm": 0.3975090980529785, + "learning_rate": 1.9136123258510292e-05, + "loss": 0.3287, + "step": 3565 + }, + { + "epoch": 0.1334672991097507, + "grad_norm": 0.3869096040725708, + "learning_rate": 1.9133733974052412e-05, + "loss": 0.3055, + "step": 3570 + }, + { + "epoch": 0.13365422810010047, + "grad_norm": 0.26971435546875, + "learning_rate": 1.9131341539631395e-05, + "loss": 0.3044, + "step": 3575 + }, + { + "epoch": 0.13384115709045027, + "grad_norm": 0.5639915466308594, + "learning_rate": 1.912894595607233e-05, + "loss": 0.2953, + "step": 3580 + }, + { + "epoch": 0.13402808608080005, + "grad_norm": 0.463853120803833, + "learning_rate": 1.912654722420138e-05, + "loss": 0.2509, + "step": 3585 + }, + { + "epoch": 0.13421501507114986, + "grad_norm": 0.38717523217201233, + "learning_rate": 1.9124145344845804e-05, + "loss": 0.3899, + "step": 3590 + }, + { + "epoch": 0.13440194406149963, + "grad_norm": 0.2911614179611206, + "learning_rate": 1.9121740318833938e-05, + "loss": 0.2567, + "step": 3595 + }, + { + "epoch": 0.13458887305184944, + "grad_norm": 0.5385882258415222, + "learning_rate": 1.9119332146995205e-05, + "loss": 0.265, + "step": 3600 + }, + { + "epoch": 0.13477580204219922, + "grad_norm": 0.3904414176940918, + "learning_rate": 1.9116920830160117e-05, + "loss": 0.251, + "step": 3605 + }, + { + "epoch": 0.13496273103254902, + "grad_norm": 0.41085904836654663, + "learning_rate": 1.9114506369160267e-05, + "loss": 0.3327, + "step": 3610 + }, + { + "epoch": 0.1351496600228988, + "grad_norm": 0.6957544088363647, + "learning_rate": 1.9112088764828335e-05, + "loss": 0.3716, + "step": 3615 + }, + { + "epoch": 0.1353365890132486, + "grad_norm": 0.4141400456428528, + "learning_rate": 1.910966801799808e-05, + "loss": 0.2635, + "step": 3620 + }, + { + "epoch": 0.13552351800359838, + "grad_norm": 0.773308277130127, + "learning_rate": 1.910724412950435e-05, + "loss": 0.2762, + "step": 3625 + }, + { + "epoch": 0.13571044699394819, + "grad_norm": 0.4335046708583832, + "learning_rate": 1.910481710018308e-05, + "loss": 0.3463, + "step": 3630 + }, + { + "epoch": 0.13589737598429796, + "grad_norm": 0.6109732985496521, + "learning_rate": 1.9102386930871276e-05, + "loss": 0.3244, + "step": 3635 + }, + { + "epoch": 0.13608430497464777, + "grad_norm": 0.3992200791835785, + "learning_rate": 1.9099953622407038e-05, + "loss": 0.4066, + "step": 3640 + }, + { + "epoch": 0.13627123396499755, + "grad_norm": 0.2817442715167999, + "learning_rate": 1.9097517175629535e-05, + "loss": 0.382, + "step": 3645 + }, + { + "epoch": 0.13645816295534735, + "grad_norm": 0.6066967248916626, + "learning_rate": 1.9095077591379044e-05, + "loss": 0.3066, + "step": 3650 + }, + { + "epoch": 0.13664509194569713, + "grad_norm": 0.638088047504425, + "learning_rate": 1.9092634870496892e-05, + "loss": 0.341, + "step": 3655 + }, + { + "epoch": 0.1368320209360469, + "grad_norm": 0.5067809224128723, + "learning_rate": 1.9090189013825515e-05, + "loss": 0.3039, + "step": 3660 + }, + { + "epoch": 0.1370189499263967, + "grad_norm": 0.3270426392555237, + "learning_rate": 1.908774002220841e-05, + "loss": 0.3165, + "step": 3665 + }, + { + "epoch": 0.1372058789167465, + "grad_norm": 0.46038034558296204, + "learning_rate": 1.908528789649017e-05, + "loss": 0.3287, + "step": 3670 + }, + { + "epoch": 0.1373928079070963, + "grad_norm": 0.35730430483818054, + "learning_rate": 1.9082832637516458e-05, + "loss": 0.2788, + "step": 3675 + }, + { + "epoch": 0.13757973689744607, + "grad_norm": 0.4533166289329529, + "learning_rate": 1.908037424613403e-05, + "loss": 0.3384, + "step": 3680 + }, + { + "epoch": 0.13776666588779587, + "grad_norm": 0.6102305054664612, + "learning_rate": 1.907791272319071e-05, + "loss": 0.3651, + "step": 3685 + }, + { + "epoch": 0.13795359487814565, + "grad_norm": 0.6289255023002625, + "learning_rate": 1.9075448069535406e-05, + "loss": 0.311, + "step": 3690 + }, + { + "epoch": 0.13814052386849546, + "grad_norm": 0.44820600748062134, + "learning_rate": 1.9072980286018104e-05, + "loss": 0.3194, + "step": 3695 + }, + { + "epoch": 0.13832745285884523, + "grad_norm": 0.36444956064224243, + "learning_rate": 1.907050937348988e-05, + "loss": 0.293, + "step": 3700 + }, + { + "epoch": 0.13851438184919504, + "grad_norm": 0.3149167001247406, + "learning_rate": 1.9068035332802874e-05, + "loss": 0.3288, + "step": 3705 + }, + { + "epoch": 0.13870131083954482, + "grad_norm": 0.4111466705799103, + "learning_rate": 1.9065558164810312e-05, + "loss": 0.2532, + "step": 3710 + }, + { + "epoch": 0.13888823982989462, + "grad_norm": 0.37402135133743286, + "learning_rate": 1.9063077870366504e-05, + "loss": 0.3662, + "step": 3715 + }, + { + "epoch": 0.1390751688202444, + "grad_norm": 0.706342875957489, + "learning_rate": 1.9060594450326824e-05, + "loss": 0.2887, + "step": 3720 + }, + { + "epoch": 0.1392620978105942, + "grad_norm": 0.5995466709136963, + "learning_rate": 1.9058107905547737e-05, + "loss": 0.3387, + "step": 3725 + }, + { + "epoch": 0.13944902680094398, + "grad_norm": 0.16998915374279022, + "learning_rate": 1.9055618236886784e-05, + "loss": 0.3133, + "step": 3730 + }, + { + "epoch": 0.1396359557912938, + "grad_norm": 0.3938615024089813, + "learning_rate": 1.9053125445202574e-05, + "loss": 0.3253, + "step": 3735 + }, + { + "epoch": 0.13982288478164356, + "grad_norm": 0.7660725712776184, + "learning_rate": 1.9050629531354806e-05, + "loss": 0.3604, + "step": 3740 + }, + { + "epoch": 0.14000981377199337, + "grad_norm": 0.4029218256473541, + "learning_rate": 1.9048130496204247e-05, + "loss": 0.3705, + "step": 3745 + }, + { + "epoch": 0.14019674276234315, + "grad_norm": 0.5488607883453369, + "learning_rate": 1.9045628340612737e-05, + "loss": 0.2867, + "step": 3750 + }, + { + "epoch": 0.14038367175269295, + "grad_norm": 0.38521480560302734, + "learning_rate": 1.904312306544321e-05, + "loss": 0.2713, + "step": 3755 + }, + { + "epoch": 0.14057060074304273, + "grad_norm": 0.23381686210632324, + "learning_rate": 1.9040614671559647e-05, + "loss": 0.3445, + "step": 3760 + }, + { + "epoch": 0.14075752973339253, + "grad_norm": 0.6382274031639099, + "learning_rate": 1.9038103159827136e-05, + "loss": 0.3306, + "step": 3765 + }, + { + "epoch": 0.1409444587237423, + "grad_norm": 0.17501187324523926, + "learning_rate": 1.9035588531111818e-05, + "loss": 0.2653, + "step": 3770 + }, + { + "epoch": 0.14113138771409212, + "grad_norm": 0.4778057038784027, + "learning_rate": 1.903307078628092e-05, + "loss": 0.3657, + "step": 3775 + }, + { + "epoch": 0.1413183167044419, + "grad_norm": 0.22998036444187164, + "learning_rate": 1.9030549926202732e-05, + "loss": 0.2792, + "step": 3780 + }, + { + "epoch": 0.1415052456947917, + "grad_norm": 0.35750317573547363, + "learning_rate": 1.902802595174664e-05, + "loss": 0.2231, + "step": 3785 + }, + { + "epoch": 0.14169217468514148, + "grad_norm": 0.5821340680122375, + "learning_rate": 1.902549886378308e-05, + "loss": 0.4172, + "step": 3790 + }, + { + "epoch": 0.14187910367549128, + "grad_norm": 0.6900777816772461, + "learning_rate": 1.902296866318357e-05, + "loss": 0.3648, + "step": 3795 + }, + { + "epoch": 0.14206603266584106, + "grad_norm": 0.4445832669734955, + "learning_rate": 1.9020435350820715e-05, + "loss": 0.2523, + "step": 3800 + }, + { + "epoch": 0.14225296165619086, + "grad_norm": 0.43811532855033875, + "learning_rate": 1.9017898927568173e-05, + "loss": 0.3028, + "step": 3805 + }, + { + "epoch": 0.14243989064654064, + "grad_norm": 0.4545620381832123, + "learning_rate": 1.9015359394300686e-05, + "loss": 0.3506, + "step": 3810 + }, + { + "epoch": 0.14262681963689045, + "grad_norm": 0.401100754737854, + "learning_rate": 1.901281675189407e-05, + "loss": 0.3335, + "step": 3815 + }, + { + "epoch": 0.14281374862724022, + "grad_norm": 0.9869058132171631, + "learning_rate": 1.9010271001225203e-05, + "loss": 0.3321, + "step": 3820 + }, + { + "epoch": 0.14300067761759003, + "grad_norm": 0.5079956650733948, + "learning_rate": 1.9007722143172046e-05, + "loss": 0.2671, + "step": 3825 + }, + { + "epoch": 0.1431876066079398, + "grad_norm": 0.5241971015930176, + "learning_rate": 1.9005170178613624e-05, + "loss": 0.2777, + "step": 3830 + }, + { + "epoch": 0.1433745355982896, + "grad_norm": 0.39962106943130493, + "learning_rate": 1.900261510843004e-05, + "loss": 0.3102, + "step": 3835 + }, + { + "epoch": 0.1435614645886394, + "grad_norm": 0.4119807183742523, + "learning_rate": 1.9000056933502466e-05, + "loss": 0.3591, + "step": 3840 + }, + { + "epoch": 0.1437483935789892, + "grad_norm": 0.4325822591781616, + "learning_rate": 1.8997495654713133e-05, + "loss": 0.3438, + "step": 3845 + }, + { + "epoch": 0.14393532256933897, + "grad_norm": 0.2697000503540039, + "learning_rate": 1.8994931272945364e-05, + "loss": 0.3043, + "step": 3850 + }, + { + "epoch": 0.14412225155968877, + "grad_norm": 0.4875340461730957, + "learning_rate": 1.8992363789083534e-05, + "loss": 0.2718, + "step": 3855 + }, + { + "epoch": 0.14430918055003855, + "grad_norm": 0.2768021821975708, + "learning_rate": 1.89897932040131e-05, + "loss": 0.2831, + "step": 3860 + }, + { + "epoch": 0.14449610954038836, + "grad_norm": 0.3775290548801422, + "learning_rate": 1.8987219518620573e-05, + "loss": 0.3754, + "step": 3865 + }, + { + "epoch": 0.14468303853073813, + "grad_norm": 0.4714037775993347, + "learning_rate": 1.8984642733793556e-05, + "loss": 0.3069, + "step": 3870 + }, + { + "epoch": 0.14486996752108794, + "grad_norm": 0.3473433554172516, + "learning_rate": 1.8982062850420705e-05, + "loss": 0.2979, + "step": 3875 + }, + { + "epoch": 0.14505689651143772, + "grad_norm": 0.4559517204761505, + "learning_rate": 1.897947986939174e-05, + "loss": 0.3666, + "step": 3880 + }, + { + "epoch": 0.14524382550178752, + "grad_norm": 0.5417743921279907, + "learning_rate": 1.8976893791597465e-05, + "loss": 0.3124, + "step": 3885 + }, + { + "epoch": 0.1454307544921373, + "grad_norm": 0.2836984097957611, + "learning_rate": 1.8974304617929746e-05, + "loss": 0.2611, + "step": 3890 + }, + { + "epoch": 0.1456176834824871, + "grad_norm": 0.3992460370063782, + "learning_rate": 1.8971712349281506e-05, + "loss": 0.3606, + "step": 3895 + }, + { + "epoch": 0.14580461247283688, + "grad_norm": 0.40167370438575745, + "learning_rate": 1.896911698654675e-05, + "loss": 0.3131, + "step": 3900 + }, + { + "epoch": 0.14599154146318666, + "grad_norm": 0.45140784978866577, + "learning_rate": 1.8966518530620542e-05, + "loss": 0.3149, + "step": 3905 + }, + { + "epoch": 0.14617847045353646, + "grad_norm": 0.5974220633506775, + "learning_rate": 1.8963916982399014e-05, + "loss": 0.2877, + "step": 3910 + }, + { + "epoch": 0.14636539944388624, + "grad_norm": 0.2815289795398712, + "learning_rate": 1.8961312342779374e-05, + "loss": 0.3502, + "step": 3915 + }, + { + "epoch": 0.14655232843423605, + "grad_norm": 0.3491719365119934, + "learning_rate": 1.8958704612659876e-05, + "loss": 0.2682, + "step": 3920 + }, + { + "epoch": 0.14673925742458582, + "grad_norm": 0.28988122940063477, + "learning_rate": 1.8956093792939855e-05, + "loss": 0.369, + "step": 3925 + }, + { + "epoch": 0.14692618641493563, + "grad_norm": 0.6046833992004395, + "learning_rate": 1.895347988451971e-05, + "loss": 0.4345, + "step": 3930 + }, + { + "epoch": 0.1471131154052854, + "grad_norm": 0.5748773813247681, + "learning_rate": 1.895086288830091e-05, + "loss": 0.3157, + "step": 3935 + }, + { + "epoch": 0.1473000443956352, + "grad_norm": 2.4972338676452637, + "learning_rate": 1.8948242805185966e-05, + "loss": 0.4136, + "step": 3940 + }, + { + "epoch": 0.147486973385985, + "grad_norm": 0.41845616698265076, + "learning_rate": 1.8945619636078483e-05, + "loss": 0.2917, + "step": 3945 + }, + { + "epoch": 0.1476739023763348, + "grad_norm": 0.42868664860725403, + "learning_rate": 1.894299338188311e-05, + "loss": 0.2979, + "step": 3950 + }, + { + "epoch": 0.14786083136668457, + "grad_norm": 0.5431304574012756, + "learning_rate": 1.8940364043505568e-05, + "loss": 0.3204, + "step": 3955 + }, + { + "epoch": 0.14804776035703437, + "grad_norm": 0.40625184774398804, + "learning_rate": 1.893773162185264e-05, + "loss": 0.2749, + "step": 3960 + }, + { + "epoch": 0.14823468934738415, + "grad_norm": 0.4661557078361511, + "learning_rate": 1.893509611783218e-05, + "loss": 0.326, + "step": 3965 + }, + { + "epoch": 0.14842161833773396, + "grad_norm": 0.27836883068084717, + "learning_rate": 1.8932457532353087e-05, + "loss": 0.2812, + "step": 3970 + }, + { + "epoch": 0.14860854732808373, + "grad_norm": 0.3862547278404236, + "learning_rate": 1.892981586632534e-05, + "loss": 0.3615, + "step": 3975 + }, + { + "epoch": 0.14879547631843354, + "grad_norm": 0.46969127655029297, + "learning_rate": 1.892717112065997e-05, + "loss": 0.3065, + "step": 3980 + }, + { + "epoch": 0.14898240530878332, + "grad_norm": 0.4676092565059662, + "learning_rate": 1.8924523296269077e-05, + "loss": 0.3214, + "step": 3985 + }, + { + "epoch": 0.14916933429913312, + "grad_norm": 0.5357167720794678, + "learning_rate": 1.8921872394065822e-05, + "loss": 0.3075, + "step": 3990 + }, + { + "epoch": 0.1493562632894829, + "grad_norm": 0.5506564974784851, + "learning_rate": 1.891921841496442e-05, + "loss": 0.2875, + "step": 3995 + }, + { + "epoch": 0.1495431922798327, + "grad_norm": 0.2982819080352783, + "learning_rate": 1.8916561359880153e-05, + "loss": 0.3099, + "step": 4000 + }, + { + "epoch": 0.14973012127018248, + "grad_norm": 1.2449369430541992, + "learning_rate": 1.8913901229729367e-05, + "loss": 0.385, + "step": 4005 + }, + { + "epoch": 0.1499170502605323, + "grad_norm": 0.7360920310020447, + "learning_rate": 1.8911238025429464e-05, + "loss": 0.3481, + "step": 4010 + }, + { + "epoch": 0.15010397925088206, + "grad_norm": 0.37266823649406433, + "learning_rate": 1.8908571747898902e-05, + "loss": 0.262, + "step": 4015 + }, + { + "epoch": 0.15029090824123187, + "grad_norm": 0.5776626467704773, + "learning_rate": 1.8905902398057208e-05, + "loss": 0.3394, + "step": 4020 + }, + { + "epoch": 0.15047783723158165, + "grad_norm": 0.6980562210083008, + "learning_rate": 1.8903229976824963e-05, + "loss": 0.3497, + "step": 4025 + }, + { + "epoch": 0.15066476622193145, + "grad_norm": 0.46382442116737366, + "learning_rate": 1.890055448512381e-05, + "loss": 0.2935, + "step": 4030 + }, + { + "epoch": 0.15085169521228123, + "grad_norm": 0.4823243319988251, + "learning_rate": 1.889787592387645e-05, + "loss": 0.2739, + "step": 4035 + }, + { + "epoch": 0.15103862420263103, + "grad_norm": 0.4817698895931244, + "learning_rate": 1.8895194294006635e-05, + "loss": 0.2478, + "step": 4040 + }, + { + "epoch": 0.1512255531929808, + "grad_norm": 0.6213811039924622, + "learning_rate": 1.8892509596439192e-05, + "loss": 0.3307, + "step": 4045 + }, + { + "epoch": 0.15141248218333062, + "grad_norm": 0.4142370820045471, + "learning_rate": 1.8889821832099988e-05, + "loss": 0.3091, + "step": 4050 + }, + { + "epoch": 0.1515994111736804, + "grad_norm": 0.5155577659606934, + "learning_rate": 1.8887131001915964e-05, + "loss": 0.3226, + "step": 4055 + }, + { + "epoch": 0.1517863401640302, + "grad_norm": 0.6737167239189148, + "learning_rate": 1.8884437106815103e-05, + "loss": 0.3247, + "step": 4060 + }, + { + "epoch": 0.15197326915437998, + "grad_norm": 0.2817344665527344, + "learning_rate": 1.8881740147726458e-05, + "loss": 0.3737, + "step": 4065 + }, + { + "epoch": 0.15216019814472978, + "grad_norm": 0.3459798991680145, + "learning_rate": 1.887904012558013e-05, + "loss": 0.3037, + "step": 4070 + }, + { + "epoch": 0.15234712713507956, + "grad_norm": 0.34796640276908875, + "learning_rate": 1.8876337041307275e-05, + "loss": 0.2671, + "step": 4075 + }, + { + "epoch": 0.15253405612542936, + "grad_norm": 0.6775915622711182, + "learning_rate": 1.8873630895840114e-05, + "loss": 0.3625, + "step": 4080 + }, + { + "epoch": 0.15272098511577914, + "grad_norm": 0.35140863060951233, + "learning_rate": 1.887092169011192e-05, + "loss": 0.2942, + "step": 4085 + }, + { + "epoch": 0.15290791410612894, + "grad_norm": 0.8965099453926086, + "learning_rate": 1.8868209425057025e-05, + "loss": 0.3787, + "step": 4090 + }, + { + "epoch": 0.15309484309647872, + "grad_norm": 0.4382193684577942, + "learning_rate": 1.88654941016108e-05, + "loss": 0.3455, + "step": 4095 + }, + { + "epoch": 0.15328177208682853, + "grad_norm": 0.4126281142234802, + "learning_rate": 1.8862775720709686e-05, + "loss": 0.2807, + "step": 4100 + }, + { + "epoch": 0.1534687010771783, + "grad_norm": 0.31981319189071655, + "learning_rate": 1.886005428329118e-05, + "loss": 0.3167, + "step": 4105 + }, + { + "epoch": 0.1536556300675281, + "grad_norm": 0.41247910261154175, + "learning_rate": 1.8857329790293824e-05, + "loss": 0.2936, + "step": 4110 + }, + { + "epoch": 0.1538425590578779, + "grad_norm": 0.48248907923698425, + "learning_rate": 1.885460224265722e-05, + "loss": 0.3004, + "step": 4115 + }, + { + "epoch": 0.1540294880482277, + "grad_norm": 0.578687846660614, + "learning_rate": 1.8851871641322016e-05, + "loss": 0.28, + "step": 4120 + }, + { + "epoch": 0.15421641703857747, + "grad_norm": 0.20185329020023346, + "learning_rate": 1.884913798722992e-05, + "loss": 0.334, + "step": 4125 + }, + { + "epoch": 0.15440334602892727, + "grad_norm": 0.5341799259185791, + "learning_rate": 1.8846401281323693e-05, + "loss": 0.3087, + "step": 4130 + }, + { + "epoch": 0.15459027501927705, + "grad_norm": 0.2198217362165451, + "learning_rate": 1.884366152454715e-05, + "loss": 0.2779, + "step": 4135 + }, + { + "epoch": 0.15477720400962686, + "grad_norm": 0.4372238218784332, + "learning_rate": 1.8840918717845146e-05, + "loss": 0.3726, + "step": 4140 + }, + { + "epoch": 0.15496413299997663, + "grad_norm": 0.48354169726371765, + "learning_rate": 1.88381728621636e-05, + "loss": 0.3891, + "step": 4145 + }, + { + "epoch": 0.1551510619903264, + "grad_norm": 0.5194223523139954, + "learning_rate": 1.883542395844948e-05, + "loss": 0.3032, + "step": 4150 + }, + { + "epoch": 0.15533799098067622, + "grad_norm": 0.23291267454624176, + "learning_rate": 1.8832672007650805e-05, + "loss": 0.2777, + "step": 4155 + }, + { + "epoch": 0.155524919971026, + "grad_norm": 0.8497887253761292, + "learning_rate": 1.882991701071664e-05, + "loss": 0.3094, + "step": 4160 + }, + { + "epoch": 0.1557118489613758, + "grad_norm": 0.4399624466896057, + "learning_rate": 1.8827158968597113e-05, + "loss": 0.3451, + "step": 4165 + }, + { + "epoch": 0.15589877795172558, + "grad_norm": 0.3427572250366211, + "learning_rate": 1.8824397882243382e-05, + "loss": 0.2513, + "step": 4170 + }, + { + "epoch": 0.15608570694207538, + "grad_norm": 0.5564326643943787, + "learning_rate": 1.8821633752607672e-05, + "loss": 0.3496, + "step": 4175 + }, + { + "epoch": 0.15627263593242516, + "grad_norm": 0.4042254090309143, + "learning_rate": 1.8818866580643254e-05, + "loss": 0.3207, + "step": 4180 + }, + { + "epoch": 0.15645956492277496, + "grad_norm": 0.25726428627967834, + "learning_rate": 1.8816096367304447e-05, + "loss": 0.2839, + "step": 4185 + }, + { + "epoch": 0.15664649391312474, + "grad_norm": 0.6856520175933838, + "learning_rate": 1.8813323113546614e-05, + "loss": 0.3228, + "step": 4190 + }, + { + "epoch": 0.15683342290347455, + "grad_norm": 0.27812889218330383, + "learning_rate": 1.8810546820326173e-05, + "loss": 0.2724, + "step": 4195 + }, + { + "epoch": 0.15702035189382432, + "grad_norm": 0.48270055651664734, + "learning_rate": 1.880776748860059e-05, + "loss": 0.4022, + "step": 4200 + }, + { + "epoch": 0.15720728088417413, + "grad_norm": 0.8398357033729553, + "learning_rate": 1.8804985119328375e-05, + "loss": 0.3086, + "step": 4205 + }, + { + "epoch": 0.1573942098745239, + "grad_norm": 0.34941208362579346, + "learning_rate": 1.8802199713469084e-05, + "loss": 0.3078, + "step": 4210 + }, + { + "epoch": 0.1575811388648737, + "grad_norm": 0.3891669511795044, + "learning_rate": 1.8799411271983325e-05, + "loss": 0.2914, + "step": 4215 + }, + { + "epoch": 0.1577680678552235, + "grad_norm": 0.4491298794746399, + "learning_rate": 1.8796619795832758e-05, + "loss": 0.3676, + "step": 4220 + }, + { + "epoch": 0.1579549968455733, + "grad_norm": 0.3361729383468628, + "learning_rate": 1.8793825285980076e-05, + "loss": 0.3815, + "step": 4225 + }, + { + "epoch": 0.15814192583592307, + "grad_norm": 0.3507533371448517, + "learning_rate": 1.879102774338903e-05, + "loss": 0.3435, + "step": 4230 + }, + { + "epoch": 0.15832885482627287, + "grad_norm": 0.44350340962409973, + "learning_rate": 1.8788227169024406e-05, + "loss": 0.2433, + "step": 4235 + }, + { + "epoch": 0.15851578381662265, + "grad_norm": 0.4336090683937073, + "learning_rate": 1.878542356385205e-05, + "loss": 0.3624, + "step": 4240 + }, + { + "epoch": 0.15870271280697246, + "grad_norm": 0.5720579028129578, + "learning_rate": 1.878261692883884e-05, + "loss": 0.3331, + "step": 4245 + }, + { + "epoch": 0.15888964179732223, + "grad_norm": 0.5189465880393982, + "learning_rate": 1.8779807264952704e-05, + "loss": 0.3359, + "step": 4250 + }, + { + "epoch": 0.15907657078767204, + "grad_norm": 0.2319631725549698, + "learning_rate": 1.8776994573162615e-05, + "loss": 0.3025, + "step": 4255 + }, + { + "epoch": 0.15926349977802182, + "grad_norm": 0.25383996963500977, + "learning_rate": 1.877417885443859e-05, + "loss": 0.332, + "step": 4260 + }, + { + "epoch": 0.15945042876837162, + "grad_norm": 0.4644213020801544, + "learning_rate": 1.8771360109751694e-05, + "loss": 0.4095, + "step": 4265 + }, + { + "epoch": 0.1596373577587214, + "grad_norm": 0.37912002205848694, + "learning_rate": 1.8768538340074024e-05, + "loss": 0.3924, + "step": 4270 + }, + { + "epoch": 0.1598242867490712, + "grad_norm": 0.3221535086631775, + "learning_rate": 1.8765713546378733e-05, + "loss": 0.3375, + "step": 4275 + }, + { + "epoch": 0.16001121573942098, + "grad_norm": 0.3765072822570801, + "learning_rate": 1.8762885729640007e-05, + "loss": 0.3454, + "step": 4280 + }, + { + "epoch": 0.16019814472977079, + "grad_norm": 0.43610912561416626, + "learning_rate": 1.8760054890833083e-05, + "loss": 0.2819, + "step": 4285 + }, + { + "epoch": 0.16038507372012056, + "grad_norm": 0.5509352684020996, + "learning_rate": 1.8757221030934234e-05, + "loss": 0.3796, + "step": 4290 + }, + { + "epoch": 0.16057200271047037, + "grad_norm": 0.43372535705566406, + "learning_rate": 1.8754384150920777e-05, + "loss": 0.3896, + "step": 4295 + }, + { + "epoch": 0.16075893170082015, + "grad_norm": 0.35043320059776306, + "learning_rate": 1.8751544251771072e-05, + "loss": 0.3586, + "step": 4300 + }, + { + "epoch": 0.16094586069116995, + "grad_norm": 0.6886361837387085, + "learning_rate": 1.874870133446452e-05, + "loss": 0.3648, + "step": 4305 + }, + { + "epoch": 0.16113278968151973, + "grad_norm": 0.6359232068061829, + "learning_rate": 1.8745855399981555e-05, + "loss": 0.2569, + "step": 4310 + }, + { + "epoch": 0.16131971867186953, + "grad_norm": 0.4465806782245636, + "learning_rate": 1.8743006449303663e-05, + "loss": 0.3305, + "step": 4315 + }, + { + "epoch": 0.1615066476622193, + "grad_norm": 0.6533617973327637, + "learning_rate": 1.874015448341337e-05, + "loss": 0.3634, + "step": 4320 + }, + { + "epoch": 0.16169357665256912, + "grad_norm": 0.4724544286727905, + "learning_rate": 1.8737299503294233e-05, + "loss": 0.2838, + "step": 4325 + }, + { + "epoch": 0.1618805056429189, + "grad_norm": 0.4181117117404938, + "learning_rate": 1.873444150993085e-05, + "loss": 0.2875, + "step": 4330 + }, + { + "epoch": 0.1620674346332687, + "grad_norm": 0.2569214105606079, + "learning_rate": 1.8731580504308865e-05, + "loss": 0.4176, + "step": 4335 + }, + { + "epoch": 0.16225436362361847, + "grad_norm": 1.067435622215271, + "learning_rate": 1.872871648741495e-05, + "loss": 0.3486, + "step": 4340 + }, + { + "epoch": 0.16244129261396828, + "grad_norm": 0.20301935076713562, + "learning_rate": 1.8725849460236833e-05, + "loss": 0.2639, + "step": 4345 + }, + { + "epoch": 0.16262822160431806, + "grad_norm": 0.3194063901901245, + "learning_rate": 1.8722979423763264e-05, + "loss": 0.2772, + "step": 4350 + }, + { + "epoch": 0.16281515059466786, + "grad_norm": 0.5217241048812866, + "learning_rate": 1.872010637898404e-05, + "loss": 0.3767, + "step": 4355 + }, + { + "epoch": 0.16300207958501764, + "grad_norm": 0.658311128616333, + "learning_rate": 1.8717230326889984e-05, + "loss": 0.4492, + "step": 4360 + }, + { + "epoch": 0.16318900857536744, + "grad_norm": 0.26515570282936096, + "learning_rate": 1.871435126847297e-05, + "loss": 0.2921, + "step": 4365 + }, + { + "epoch": 0.16337593756571722, + "grad_norm": 0.3176003396511078, + "learning_rate": 1.87114692047259e-05, + "loss": 0.3403, + "step": 4370 + }, + { + "epoch": 0.16356286655606703, + "grad_norm": 0.5091459155082703, + "learning_rate": 1.8708584136642717e-05, + "loss": 0.3209, + "step": 4375 + }, + { + "epoch": 0.1637497955464168, + "grad_norm": 0.35556361079216003, + "learning_rate": 1.8705696065218398e-05, + "loss": 0.311, + "step": 4380 + }, + { + "epoch": 0.1639367245367666, + "grad_norm": 0.43555188179016113, + "learning_rate": 1.8702804991448955e-05, + "loss": 0.3309, + "step": 4385 + }, + { + "epoch": 0.1641236535271164, + "grad_norm": 0.2813642919063568, + "learning_rate": 1.8699910916331438e-05, + "loss": 0.2802, + "step": 4390 + }, + { + "epoch": 0.16431058251746616, + "grad_norm": 0.32983142137527466, + "learning_rate": 1.869701384086393e-05, + "loss": 0.3307, + "step": 4395 + }, + { + "epoch": 0.16449751150781597, + "grad_norm": 0.8605227470397949, + "learning_rate": 1.8694113766045552e-05, + "loss": 0.2888, + "step": 4400 + }, + { + "epoch": 0.16468444049816575, + "grad_norm": 0.4275653660297394, + "learning_rate": 1.869121069287645e-05, + "loss": 0.3053, + "step": 4405 + }, + { + "epoch": 0.16487136948851555, + "grad_norm": 0.32885563373565674, + "learning_rate": 1.8688304622357817e-05, + "loss": 0.3125, + "step": 4410 + }, + { + "epoch": 0.16505829847886533, + "grad_norm": 0.38488566875457764, + "learning_rate": 1.868539555549187e-05, + "loss": 0.2482, + "step": 4415 + }, + { + "epoch": 0.16524522746921513, + "grad_norm": 0.3290596902370453, + "learning_rate": 1.8682483493281864e-05, + "loss": 0.3019, + "step": 4420 + }, + { + "epoch": 0.1654321564595649, + "grad_norm": 0.9788287878036499, + "learning_rate": 1.8679568436732084e-05, + "loss": 0.3028, + "step": 4425 + }, + { + "epoch": 0.16561908544991472, + "grad_norm": 0.728779137134552, + "learning_rate": 1.8676650386847855e-05, + "loss": 0.3395, + "step": 4430 + }, + { + "epoch": 0.1658060144402645, + "grad_norm": 0.40499404072761536, + "learning_rate": 1.8673729344635524e-05, + "loss": 0.4077, + "step": 4435 + }, + { + "epoch": 0.1659929434306143, + "grad_norm": 1.272477626800537, + "learning_rate": 1.8670805311102477e-05, + "loss": 0.3258, + "step": 4440 + }, + { + "epoch": 0.16617987242096408, + "grad_norm": 0.32982587814331055, + "learning_rate": 1.8667878287257124e-05, + "loss": 0.283, + "step": 4445 + }, + { + "epoch": 0.16636680141131388, + "grad_norm": 0.4656912088394165, + "learning_rate": 1.8664948274108918e-05, + "loss": 0.2622, + "step": 4450 + }, + { + "epoch": 0.16655373040166366, + "grad_norm": 0.3891690671443939, + "learning_rate": 1.866201527266834e-05, + "loss": 0.3216, + "step": 4455 + }, + { + "epoch": 0.16674065939201346, + "grad_norm": 0.40880194306373596, + "learning_rate": 1.8659079283946882e-05, + "loss": 0.2894, + "step": 4460 + }, + { + "epoch": 0.16692758838236324, + "grad_norm": 1.5993868112564087, + "learning_rate": 1.86561403089571e-05, + "loss": 0.3405, + "step": 4465 + }, + { + "epoch": 0.16711451737271305, + "grad_norm": 0.5294637084007263, + "learning_rate": 1.8653198348712552e-05, + "loss": 0.2742, + "step": 4470 + }, + { + "epoch": 0.16730144636306282, + "grad_norm": 0.47895947098731995, + "learning_rate": 1.865025340422784e-05, + "loss": 0.271, + "step": 4475 + }, + { + "epoch": 0.16748837535341263, + "grad_norm": 0.17551173269748688, + "learning_rate": 1.864730547651859e-05, + "loss": 0.2728, + "step": 4480 + }, + { + "epoch": 0.1676753043437624, + "grad_norm": 0.5507614612579346, + "learning_rate": 1.8644354566601458e-05, + "loss": 0.3346, + "step": 4485 + }, + { + "epoch": 0.1678622333341122, + "grad_norm": 0.4217040240764618, + "learning_rate": 1.864140067549413e-05, + "loss": 0.3026, + "step": 4490 + }, + { + "epoch": 0.168049162324462, + "grad_norm": 0.31403791904449463, + "learning_rate": 1.8638443804215315e-05, + "loss": 0.2778, + "step": 4495 + }, + { + "epoch": 0.1682360913148118, + "grad_norm": 0.3219631016254425, + "learning_rate": 1.8635483953784755e-05, + "loss": 0.3311, + "step": 4500 + }, + { + "epoch": 0.16842302030516157, + "grad_norm": 0.708217442035675, + "learning_rate": 1.8632521125223215e-05, + "loss": 0.4032, + "step": 4505 + }, + { + "epoch": 0.16860994929551137, + "grad_norm": 1.9042284488677979, + "learning_rate": 1.8629555319552492e-05, + "loss": 0.3184, + "step": 4510 + }, + { + "epoch": 0.16879687828586115, + "grad_norm": 0.42551693320274353, + "learning_rate": 1.862658653779541e-05, + "loss": 0.2671, + "step": 4515 + }, + { + "epoch": 0.16898380727621096, + "grad_norm": 0.22539173066616058, + "learning_rate": 1.8623614780975813e-05, + "loss": 0.3398, + "step": 4520 + }, + { + "epoch": 0.16917073626656073, + "grad_norm": 0.353363037109375, + "learning_rate": 1.862064005011858e-05, + "loss": 0.2993, + "step": 4525 + }, + { + "epoch": 0.16935766525691054, + "grad_norm": 0.25013571977615356, + "learning_rate": 1.86176623462496e-05, + "loss": 0.2873, + "step": 4530 + }, + { + "epoch": 0.16954459424726032, + "grad_norm": 0.28396397829055786, + "learning_rate": 1.8614681670395808e-05, + "loss": 0.326, + "step": 4535 + }, + { + "epoch": 0.16973152323761012, + "grad_norm": 0.33238470554351807, + "learning_rate": 1.8611698023585146e-05, + "loss": 0.3196, + "step": 4540 + }, + { + "epoch": 0.1699184522279599, + "grad_norm": 0.4479984939098358, + "learning_rate": 1.8608711406846595e-05, + "loss": 0.3363, + "step": 4545 + }, + { + "epoch": 0.1701053812183097, + "grad_norm": 0.4897647500038147, + "learning_rate": 1.8605721821210146e-05, + "loss": 0.3515, + "step": 4550 + }, + { + "epoch": 0.17029231020865948, + "grad_norm": 0.33985239267349243, + "learning_rate": 1.8602729267706833e-05, + "loss": 0.3559, + "step": 4555 + }, + { + "epoch": 0.17047923919900929, + "grad_norm": 0.4013058543205261, + "learning_rate": 1.859973374736869e-05, + "loss": 0.2976, + "step": 4560 + }, + { + "epoch": 0.17066616818935906, + "grad_norm": 0.3209231197834015, + "learning_rate": 1.8596735261228793e-05, + "loss": 0.3168, + "step": 4565 + }, + { + "epoch": 0.17085309717970887, + "grad_norm": 0.5259041786193848, + "learning_rate": 1.859373381032123e-05, + "loss": 0.3052, + "step": 4570 + }, + { + "epoch": 0.17104002617005865, + "grad_norm": 0.34932300448417664, + "learning_rate": 1.8590729395681122e-05, + "loss": 0.3515, + "step": 4575 + }, + { + "epoch": 0.17122695516040845, + "grad_norm": 0.615368127822876, + "learning_rate": 1.8587722018344598e-05, + "loss": 0.2997, + "step": 4580 + }, + { + "epoch": 0.17141388415075823, + "grad_norm": 0.36148715019226074, + "learning_rate": 1.8584711679348818e-05, + "loss": 0.2574, + "step": 4585 + }, + { + "epoch": 0.17160081314110803, + "grad_norm": 0.23802785575389862, + "learning_rate": 1.8581698379731965e-05, + "loss": 0.2897, + "step": 4590 + }, + { + "epoch": 0.1717877421314578, + "grad_norm": 0.3403271436691284, + "learning_rate": 1.857868212053324e-05, + "loss": 0.3736, + "step": 4595 + }, + { + "epoch": 0.17197467112180762, + "grad_norm": 0.2740768492221832, + "learning_rate": 1.8575662902792854e-05, + "loss": 0.3326, + "step": 4600 + }, + { + "epoch": 0.1721616001121574, + "grad_norm": 0.32492953538894653, + "learning_rate": 1.8572640727552064e-05, + "loss": 0.3538, + "step": 4605 + }, + { + "epoch": 0.1723485291025072, + "grad_norm": 0.25937604904174805, + "learning_rate": 1.856961559585312e-05, + "loss": 0.268, + "step": 4610 + }, + { + "epoch": 0.17253545809285697, + "grad_norm": 0.28443995118141174, + "learning_rate": 1.8566587508739312e-05, + "loss": 0.278, + "step": 4615 + }, + { + "epoch": 0.17272238708320678, + "grad_norm": 0.31061285734176636, + "learning_rate": 1.856355646725493e-05, + "loss": 0.3424, + "step": 4620 + }, + { + "epoch": 0.17290931607355656, + "grad_norm": 0.29571932554244995, + "learning_rate": 1.8560522472445304e-05, + "loss": 0.3045, + "step": 4625 + }, + { + "epoch": 0.17309624506390636, + "grad_norm": 0.38453951478004456, + "learning_rate": 1.8557485525356765e-05, + "loss": 0.3163, + "step": 4630 + }, + { + "epoch": 0.17328317405425614, + "grad_norm": 0.21908405423164368, + "learning_rate": 1.855444562703667e-05, + "loss": 0.2375, + "step": 4635 + }, + { + "epoch": 0.17347010304460592, + "grad_norm": 0.3414257764816284, + "learning_rate": 1.85514027785334e-05, + "loss": 0.3415, + "step": 4640 + }, + { + "epoch": 0.17365703203495572, + "grad_norm": 0.6744435429573059, + "learning_rate": 1.8548356980896337e-05, + "loss": 0.3277, + "step": 4645 + }, + { + "epoch": 0.1738439610253055, + "grad_norm": 0.6530652046203613, + "learning_rate": 1.854530823517589e-05, + "loss": 0.3593, + "step": 4650 + }, + { + "epoch": 0.1740308900156553, + "grad_norm": 0.5472162961959839, + "learning_rate": 1.854225654242349e-05, + "loss": 0.295, + "step": 4655 + }, + { + "epoch": 0.17421781900600508, + "grad_norm": 0.5321584939956665, + "learning_rate": 1.8539201903691574e-05, + "loss": 0.3521, + "step": 4660 + }, + { + "epoch": 0.1744047479963549, + "grad_norm": 0.4973386824131012, + "learning_rate": 1.8536144320033602e-05, + "loss": 0.3237, + "step": 4665 + }, + { + "epoch": 0.17459167698670466, + "grad_norm": 0.37571483850479126, + "learning_rate": 1.8533083792504043e-05, + "loss": 0.3179, + "step": 4670 + }, + { + "epoch": 0.17477860597705447, + "grad_norm": 0.31560763716697693, + "learning_rate": 1.8530020322158392e-05, + "loss": 0.3937, + "step": 4675 + }, + { + "epoch": 0.17496553496740425, + "grad_norm": 0.3787771165370941, + "learning_rate": 1.8526953910053143e-05, + "loss": 0.2801, + "step": 4680 + }, + { + "epoch": 0.17515246395775405, + "grad_norm": 0.2740449905395508, + "learning_rate": 1.852388455724582e-05, + "loss": 0.3061, + "step": 4685 + }, + { + "epoch": 0.17533939294810383, + "grad_norm": 0.43204188346862793, + "learning_rate": 1.8520812264794954e-05, + "loss": 0.3166, + "step": 4690 + }, + { + "epoch": 0.17552632193845363, + "grad_norm": 0.2784161865711212, + "learning_rate": 1.851773703376009e-05, + "loss": 0.2652, + "step": 4695 + }, + { + "epoch": 0.1757132509288034, + "grad_norm": 0.45802968740463257, + "learning_rate": 1.8514658865201786e-05, + "loss": 0.3275, + "step": 4700 + }, + { + "epoch": 0.17590017991915322, + "grad_norm": 0.30260100960731506, + "learning_rate": 1.8511577760181615e-05, + "loss": 0.3034, + "step": 4705 + }, + { + "epoch": 0.176087108909503, + "grad_norm": 0.44082552194595337, + "learning_rate": 1.8508493719762162e-05, + "loss": 0.3504, + "step": 4710 + }, + { + "epoch": 0.1762740378998528, + "grad_norm": 0.3765546977519989, + "learning_rate": 1.850540674500703e-05, + "loss": 0.3524, + "step": 4715 + }, + { + "epoch": 0.17646096689020258, + "grad_norm": 0.30463677644729614, + "learning_rate": 1.8502316836980815e-05, + "loss": 0.3177, + "step": 4720 + }, + { + "epoch": 0.17664789588055238, + "grad_norm": 0.39802396297454834, + "learning_rate": 1.8499223996749148e-05, + "loss": 0.2554, + "step": 4725 + }, + { + "epoch": 0.17683482487090216, + "grad_norm": 0.44730278849601746, + "learning_rate": 1.849612822537866e-05, + "loss": 0.3275, + "step": 4730 + }, + { + "epoch": 0.17702175386125196, + "grad_norm": 0.20249329507350922, + "learning_rate": 1.849302952393699e-05, + "loss": 0.3258, + "step": 4735 + }, + { + "epoch": 0.17720868285160174, + "grad_norm": 0.4662615656852722, + "learning_rate": 1.8489927893492794e-05, + "loss": 0.3144, + "step": 4740 + }, + { + "epoch": 0.17739561184195154, + "grad_norm": 0.43017593026161194, + "learning_rate": 1.8486823335115735e-05, + "loss": 0.3936, + "step": 4745 + }, + { + "epoch": 0.17758254083230132, + "grad_norm": 0.3419100344181061, + "learning_rate": 1.8483715849876486e-05, + "loss": 0.4101, + "step": 4750 + }, + { + "epoch": 0.17776946982265113, + "grad_norm": 0.08327355980873108, + "learning_rate": 1.8480605438846724e-05, + "loss": 0.3454, + "step": 4755 + }, + { + "epoch": 0.1779563988130009, + "grad_norm": 0.386849045753479, + "learning_rate": 1.847749210309915e-05, + "loss": 0.2913, + "step": 4760 + }, + { + "epoch": 0.1781433278033507, + "grad_norm": 0.7153798937797546, + "learning_rate": 1.8474375843707464e-05, + "loss": 0.2839, + "step": 4765 + }, + { + "epoch": 0.1783302567937005, + "grad_norm": 0.3605651259422302, + "learning_rate": 1.8471256661746367e-05, + "loss": 0.3087, + "step": 4770 + }, + { + "epoch": 0.1785171857840503, + "grad_norm": 0.3076781630516052, + "learning_rate": 1.8468134558291582e-05, + "loss": 0.3429, + "step": 4775 + }, + { + "epoch": 0.17870411477440007, + "grad_norm": 0.29881712794303894, + "learning_rate": 1.846500953441983e-05, + "loss": 0.3679, + "step": 4780 + }, + { + "epoch": 0.17889104376474987, + "grad_norm": 0.39074286818504333, + "learning_rate": 1.8461881591208843e-05, + "loss": 0.4408, + "step": 4785 + }, + { + "epoch": 0.17907797275509965, + "grad_norm": 0.5738774538040161, + "learning_rate": 1.8458750729737356e-05, + "loss": 0.3933, + "step": 4790 + }, + { + "epoch": 0.17926490174544946, + "grad_norm": 0.6319369077682495, + "learning_rate": 1.8455616951085118e-05, + "loss": 0.3198, + "step": 4795 + }, + { + "epoch": 0.17945183073579923, + "grad_norm": 0.4841550886631012, + "learning_rate": 1.845248025633288e-05, + "loss": 0.389, + "step": 4800 + }, + { + "epoch": 0.17963875972614904, + "grad_norm": 0.31131798028945923, + "learning_rate": 1.8449340646562396e-05, + "loss": 0.2501, + "step": 4805 + }, + { + "epoch": 0.17982568871649882, + "grad_norm": 0.24542942643165588, + "learning_rate": 1.844619812285642e-05, + "loss": 0.3488, + "step": 4810 + }, + { + "epoch": 0.18001261770684862, + "grad_norm": 0.31681129336357117, + "learning_rate": 1.8443052686298733e-05, + "loss": 0.331, + "step": 4815 + }, + { + "epoch": 0.1801995466971984, + "grad_norm": 0.22183604538440704, + "learning_rate": 1.8439904337974095e-05, + "loss": 0.259, + "step": 4820 + }, + { + "epoch": 0.1803864756875482, + "grad_norm": 1.1499030590057373, + "learning_rate": 1.843675307896829e-05, + "loss": 0.2915, + "step": 4825 + }, + { + "epoch": 0.18057340467789798, + "grad_norm": 0.2945778965950012, + "learning_rate": 1.8433598910368085e-05, + "loss": 0.3305, + "step": 4830 + }, + { + "epoch": 0.18076033366824779, + "grad_norm": 0.3298734724521637, + "learning_rate": 1.8430441833261273e-05, + "loss": 0.2948, + "step": 4835 + }, + { + "epoch": 0.18094726265859756, + "grad_norm": 0.46152833104133606, + "learning_rate": 1.8427281848736632e-05, + "loss": 0.3241, + "step": 4840 + }, + { + "epoch": 0.18113419164894737, + "grad_norm": 0.37429046630859375, + "learning_rate": 1.8424118957883954e-05, + "loss": 0.4391, + "step": 4845 + }, + { + "epoch": 0.18132112063929715, + "grad_norm": 0.44875389337539673, + "learning_rate": 1.8420953161794033e-05, + "loss": 0.2826, + "step": 4850 + }, + { + "epoch": 0.18150804962964695, + "grad_norm": 0.287893146276474, + "learning_rate": 1.8417784461558656e-05, + "loss": 0.2415, + "step": 4855 + }, + { + "epoch": 0.18169497861999673, + "grad_norm": 0.36352619528770447, + "learning_rate": 1.8414612858270616e-05, + "loss": 0.2222, + "step": 4860 + }, + { + "epoch": 0.18188190761034653, + "grad_norm": 0.35056108236312866, + "learning_rate": 1.841143835302371e-05, + "loss": 0.2468, + "step": 4865 + }, + { + "epoch": 0.1820688366006963, + "grad_norm": 0.384459912776947, + "learning_rate": 1.8408260946912736e-05, + "loss": 0.2602, + "step": 4870 + }, + { + "epoch": 0.18225576559104611, + "grad_norm": 0.38520097732543945, + "learning_rate": 1.8405080641033487e-05, + "loss": 0.2951, + "step": 4875 + }, + { + "epoch": 0.1824426945813959, + "grad_norm": 0.2897312343120575, + "learning_rate": 1.8401897436482762e-05, + "loss": 0.3556, + "step": 4880 + }, + { + "epoch": 0.18262962357174567, + "grad_norm": 0.5431839823722839, + "learning_rate": 1.8398711334358355e-05, + "loss": 0.2967, + "step": 4885 + }, + { + "epoch": 0.18281655256209547, + "grad_norm": 0.6530733108520508, + "learning_rate": 1.8395522335759067e-05, + "loss": 0.296, + "step": 4890 + }, + { + "epoch": 0.18300348155244525, + "grad_norm": 0.5999047160148621, + "learning_rate": 1.8392330441784683e-05, + "loss": 0.3362, + "step": 4895 + }, + { + "epoch": 0.18319041054279506, + "grad_norm": 0.5073540806770325, + "learning_rate": 1.8389135653535998e-05, + "loss": 0.2777, + "step": 4900 + }, + { + "epoch": 0.18337733953314483, + "grad_norm": 0.48142045736312866, + "learning_rate": 1.838593797211481e-05, + "loss": 0.2659, + "step": 4905 + }, + { + "epoch": 0.18356426852349464, + "grad_norm": 0.2865387201309204, + "learning_rate": 1.8382737398623904e-05, + "loss": 0.2702, + "step": 4910 + }, + { + "epoch": 0.18375119751384442, + "grad_norm": 0.6015134453773499, + "learning_rate": 1.8379533934167063e-05, + "loss": 0.3637, + "step": 4915 + }, + { + "epoch": 0.18393812650419422, + "grad_norm": 0.33356979489326477, + "learning_rate": 1.8376327579849068e-05, + "loss": 0.3628, + "step": 4920 + }, + { + "epoch": 0.184125055494544, + "grad_norm": 0.7000239491462708, + "learning_rate": 1.8373118336775707e-05, + "loss": 0.3346, + "step": 4925 + }, + { + "epoch": 0.1843119844848938, + "grad_norm": 0.4333287477493286, + "learning_rate": 1.8369906206053753e-05, + "loss": 0.2867, + "step": 4930 + }, + { + "epoch": 0.18449891347524358, + "grad_norm": 0.4251829981803894, + "learning_rate": 1.8366691188790976e-05, + "loss": 0.2435, + "step": 4935 + }, + { + "epoch": 0.1846858424655934, + "grad_norm": 0.41755521297454834, + "learning_rate": 1.836347328609614e-05, + "loss": 0.3786, + "step": 4940 + }, + { + "epoch": 0.18487277145594316, + "grad_norm": 0.40600693225860596, + "learning_rate": 1.8360252499079015e-05, + "loss": 0.2621, + "step": 4945 + }, + { + "epoch": 0.18505970044629297, + "grad_norm": 0.5500521063804626, + "learning_rate": 1.8357028828850356e-05, + "loss": 0.2544, + "step": 4950 + }, + { + "epoch": 0.18524662943664275, + "grad_norm": 0.5678396224975586, + "learning_rate": 1.8353802276521908e-05, + "loss": 0.3178, + "step": 4955 + }, + { + "epoch": 0.18543355842699255, + "grad_norm": 0.6940974593162537, + "learning_rate": 1.8350572843206425e-05, + "loss": 0.3536, + "step": 4960 + }, + { + "epoch": 0.18562048741734233, + "grad_norm": 0.2244710922241211, + "learning_rate": 1.8347340530017642e-05, + "loss": 0.3592, + "step": 4965 + }, + { + "epoch": 0.18580741640769213, + "grad_norm": 0.3188341557979584, + "learning_rate": 1.8344105338070294e-05, + "loss": 0.35, + "step": 4970 + }, + { + "epoch": 0.1859943453980419, + "grad_norm": 0.33533531427383423, + "learning_rate": 1.8340867268480102e-05, + "loss": 0.3611, + "step": 4975 + }, + { + "epoch": 0.18618127438839172, + "grad_norm": 0.3602057099342346, + "learning_rate": 1.8337626322363782e-05, + "loss": 0.3204, + "step": 4980 + }, + { + "epoch": 0.1863682033787415, + "grad_norm": 0.4599422216415405, + "learning_rate": 1.833438250083905e-05, + "loss": 0.3248, + "step": 4985 + }, + { + "epoch": 0.1865551323690913, + "grad_norm": 0.24511076509952545, + "learning_rate": 1.8331135805024606e-05, + "loss": 0.3311, + "step": 4990 + }, + { + "epoch": 0.18674206135944107, + "grad_norm": 0.6302551031112671, + "learning_rate": 1.8327886236040137e-05, + "loss": 0.3417, + "step": 4995 + }, + { + "epoch": 0.18692899034979088, + "grad_norm": 0.3400326073169708, + "learning_rate": 1.832463379500633e-05, + "loss": 0.2946, + "step": 5000 + }, + { + "epoch": 0.18711591934014066, + "grad_norm": 0.2726198732852936, + "learning_rate": 1.8321378483044855e-05, + "loss": 0.317, + "step": 5005 + }, + { + "epoch": 0.18730284833049046, + "grad_norm": 0.28910648822784424, + "learning_rate": 1.8318120301278382e-05, + "loss": 0.4624, + "step": 5010 + }, + { + "epoch": 0.18748977732084024, + "grad_norm": 0.47752344608306885, + "learning_rate": 1.831485925083056e-05, + "loss": 0.3485, + "step": 5015 + }, + { + "epoch": 0.18767670631119004, + "grad_norm": 0.4108814597129822, + "learning_rate": 1.8311595332826034e-05, + "loss": 0.2964, + "step": 5020 + }, + { + "epoch": 0.18786363530153982, + "grad_norm": 0.38854309916496277, + "learning_rate": 1.8308328548390437e-05, + "loss": 0.3555, + "step": 5025 + }, + { + "epoch": 0.18805056429188963, + "grad_norm": 0.3640444576740265, + "learning_rate": 1.8305058898650387e-05, + "loss": 0.2959, + "step": 5030 + }, + { + "epoch": 0.1882374932822394, + "grad_norm": 0.5036413073539734, + "learning_rate": 1.830178638473349e-05, + "loss": 0.2471, + "step": 5035 + }, + { + "epoch": 0.1884244222725892, + "grad_norm": 0.8091880083084106, + "learning_rate": 1.8298511007768347e-05, + "loss": 0.3366, + "step": 5040 + }, + { + "epoch": 0.188611351262939, + "grad_norm": 0.43067753314971924, + "learning_rate": 1.829523276888454e-05, + "loss": 0.2554, + "step": 5045 + }, + { + "epoch": 0.1887982802532888, + "grad_norm": 0.1925475299358368, + "learning_rate": 1.8291951669212637e-05, + "loss": 0.2483, + "step": 5050 + }, + { + "epoch": 0.18898520924363857, + "grad_norm": 0.5737656354904175, + "learning_rate": 1.82886677098842e-05, + "loss": 0.3381, + "step": 5055 + }, + { + "epoch": 0.18917213823398837, + "grad_norm": 0.6750152707099915, + "learning_rate": 1.828538089203177e-05, + "loss": 0.3419, + "step": 5060 + }, + { + "epoch": 0.18935906722433815, + "grad_norm": 0.4922321140766144, + "learning_rate": 1.828209121678888e-05, + "loss": 0.268, + "step": 5065 + }, + { + "epoch": 0.18954599621468796, + "grad_norm": 0.557983934879303, + "learning_rate": 1.8278798685290037e-05, + "loss": 0.2987, + "step": 5070 + }, + { + "epoch": 0.18973292520503773, + "grad_norm": 0.4714908301830292, + "learning_rate": 1.8275503298670742e-05, + "loss": 0.3768, + "step": 5075 + }, + { + "epoch": 0.18991985419538754, + "grad_norm": 0.38415268063545227, + "learning_rate": 1.8272205058067488e-05, + "loss": 0.2899, + "step": 5080 + }, + { + "epoch": 0.19010678318573732, + "grad_norm": 0.640612006187439, + "learning_rate": 1.8268903964617738e-05, + "loss": 0.3488, + "step": 5085 + }, + { + "epoch": 0.19029371217608712, + "grad_norm": 0.3866754472255707, + "learning_rate": 1.826560001945994e-05, + "loss": 0.2575, + "step": 5090 + }, + { + "epoch": 0.1904806411664369, + "grad_norm": 0.6181028485298157, + "learning_rate": 1.826229322373354e-05, + "loss": 0.3078, + "step": 5095 + }, + { + "epoch": 0.1906675701567867, + "grad_norm": 0.5132717490196228, + "learning_rate": 1.825898357857895e-05, + "loss": 0.3201, + "step": 5100 + }, + { + "epoch": 0.19085449914713648, + "grad_norm": 0.3530460298061371, + "learning_rate": 1.825567108513757e-05, + "loss": 0.27, + "step": 5105 + }, + { + "epoch": 0.19104142813748629, + "grad_norm": 0.6483767628669739, + "learning_rate": 1.825235574455179e-05, + "loss": 0.3382, + "step": 5110 + }, + { + "epoch": 0.19122835712783606, + "grad_norm": 0.6425184607505798, + "learning_rate": 1.8249037557964975e-05, + "loss": 0.2799, + "step": 5115 + }, + { + "epoch": 0.19141528611818587, + "grad_norm": 0.5412061810493469, + "learning_rate": 1.8245716526521475e-05, + "loss": 0.3593, + "step": 5120 + }, + { + "epoch": 0.19160221510853565, + "grad_norm": 0.4309811592102051, + "learning_rate": 1.8242392651366607e-05, + "loss": 0.3289, + "step": 5125 + }, + { + "epoch": 0.19178914409888542, + "grad_norm": 0.48865464329719543, + "learning_rate": 1.823906593364669e-05, + "loss": 0.2389, + "step": 5130 + }, + { + "epoch": 0.19197607308923523, + "grad_norm": 0.4698772132396698, + "learning_rate": 1.8235736374509015e-05, + "loss": 0.3453, + "step": 5135 + }, + { + "epoch": 0.192163002079585, + "grad_norm": 0.39952799677848816, + "learning_rate": 1.8232403975101845e-05, + "loss": 0.2537, + "step": 5140 + }, + { + "epoch": 0.1923499310699348, + "grad_norm": 0.3500300347805023, + "learning_rate": 1.8229068736574434e-05, + "loss": 0.3181, + "step": 5145 + }, + { + "epoch": 0.1925368600602846, + "grad_norm": 0.3574506342411041, + "learning_rate": 1.8225730660077007e-05, + "loss": 0.2995, + "step": 5150 + }, + { + "epoch": 0.1927237890506344, + "grad_norm": 0.5655731558799744, + "learning_rate": 1.8222389746760774e-05, + "loss": 0.2967, + "step": 5155 + }, + { + "epoch": 0.19291071804098417, + "grad_norm": 0.5786092877388, + "learning_rate": 1.8219045997777916e-05, + "loss": 0.265, + "step": 5160 + }, + { + "epoch": 0.19309764703133397, + "grad_norm": 0.30824795365333557, + "learning_rate": 1.8215699414281602e-05, + "loss": 0.3385, + "step": 5165 + }, + { + "epoch": 0.19328457602168375, + "grad_norm": 0.379058301448822, + "learning_rate": 1.8212349997425967e-05, + "loss": 0.3196, + "step": 5170 + }, + { + "epoch": 0.19347150501203356, + "grad_norm": 0.2118494063615799, + "learning_rate": 1.820899774836613e-05, + "loss": 0.2843, + "step": 5175 + }, + { + "epoch": 0.19365843400238333, + "grad_norm": 0.49254727363586426, + "learning_rate": 1.820564266825819e-05, + "loss": 0.3449, + "step": 5180 + }, + { + "epoch": 0.19384536299273314, + "grad_norm": 0.45277997851371765, + "learning_rate": 1.8202284758259215e-05, + "loss": 0.3104, + "step": 5185 + }, + { + "epoch": 0.19403229198308292, + "grad_norm": 0.45387160778045654, + "learning_rate": 1.8198924019527252e-05, + "loss": 0.3288, + "step": 5190 + }, + { + "epoch": 0.19421922097343272, + "grad_norm": 0.30750924348831177, + "learning_rate": 1.8195560453221322e-05, + "loss": 0.2391, + "step": 5195 + }, + { + "epoch": 0.1944061499637825, + "grad_norm": 0.2546007037162781, + "learning_rate": 1.8192194060501428e-05, + "loss": 0.3273, + "step": 5200 + }, + { + "epoch": 0.1945930789541323, + "grad_norm": 0.37746095657348633, + "learning_rate": 1.8188824842528535e-05, + "loss": 0.401, + "step": 5205 + }, + { + "epoch": 0.19478000794448208, + "grad_norm": 0.3548900783061981, + "learning_rate": 1.8185452800464593e-05, + "loss": 0.2809, + "step": 5210 + }, + { + "epoch": 0.19496693693483189, + "grad_norm": 0.21679793298244476, + "learning_rate": 1.8182077935472525e-05, + "loss": 0.337, + "step": 5215 + }, + { + "epoch": 0.19515386592518166, + "grad_norm": 0.6175905466079712, + "learning_rate": 1.8178700248716225e-05, + "loss": 0.3285, + "step": 5220 + }, + { + "epoch": 0.19534079491553147, + "grad_norm": 0.42689886689186096, + "learning_rate": 1.8175319741360553e-05, + "loss": 0.3127, + "step": 5225 + }, + { + "epoch": 0.19552772390588125, + "grad_norm": 0.26765209436416626, + "learning_rate": 1.8171936414571358e-05, + "loss": 0.2753, + "step": 5230 + }, + { + "epoch": 0.19571465289623105, + "grad_norm": 0.2162155956029892, + "learning_rate": 1.816855026951545e-05, + "loss": 0.3051, + "step": 5235 + }, + { + "epoch": 0.19590158188658083, + "grad_norm": 0.4493172764778137, + "learning_rate": 1.8165161307360613e-05, + "loss": 0.2757, + "step": 5240 + }, + { + "epoch": 0.19608851087693063, + "grad_norm": 0.3421705961227417, + "learning_rate": 1.81617695292756e-05, + "loss": 0.3372, + "step": 5245 + }, + { + "epoch": 0.1962754398672804, + "grad_norm": 0.48634567856788635, + "learning_rate": 1.8158374936430144e-05, + "loss": 0.3392, + "step": 5250 + }, + { + "epoch": 0.19646236885763022, + "grad_norm": 0.3790494203567505, + "learning_rate": 1.8154977529994934e-05, + "loss": 0.2742, + "step": 5255 + }, + { + "epoch": 0.19664929784798, + "grad_norm": 0.48067718744277954, + "learning_rate": 1.8151577311141647e-05, + "loss": 0.304, + "step": 5260 + }, + { + "epoch": 0.1968362268383298, + "grad_norm": 0.374163419008255, + "learning_rate": 1.814817428104292e-05, + "loss": 0.2721, + "step": 5265 + }, + { + "epoch": 0.19702315582867957, + "grad_norm": 0.3786110281944275, + "learning_rate": 1.8144768440872353e-05, + "loss": 0.3316, + "step": 5270 + }, + { + "epoch": 0.19721008481902938, + "grad_norm": 0.4367314875125885, + "learning_rate": 1.8141359791804532e-05, + "loss": 0.3282, + "step": 5275 + }, + { + "epoch": 0.19739701380937916, + "grad_norm": 0.3415897786617279, + "learning_rate": 1.8137948335014998e-05, + "loss": 0.2399, + "step": 5280 + }, + { + "epoch": 0.19758394279972896, + "grad_norm": 0.46700483560562134, + "learning_rate": 1.813453407168026e-05, + "loss": 0.2813, + "step": 5285 + }, + { + "epoch": 0.19777087179007874, + "grad_norm": 0.30502578616142273, + "learning_rate": 1.813111700297781e-05, + "loss": 0.3091, + "step": 5290 + }, + { + "epoch": 0.19795780078042854, + "grad_norm": 0.37599360942840576, + "learning_rate": 1.812769713008609e-05, + "loss": 0.2241, + "step": 5295 + }, + { + "epoch": 0.19814472977077832, + "grad_norm": 0.4802122712135315, + "learning_rate": 1.812427445418452e-05, + "loss": 0.2691, + "step": 5300 + }, + { + "epoch": 0.19833165876112813, + "grad_norm": 0.41680780053138733, + "learning_rate": 1.8120848976453475e-05, + "loss": 0.4307, + "step": 5305 + }, + { + "epoch": 0.1985185877514779, + "grad_norm": 0.4099524915218353, + "learning_rate": 1.8117420698074318e-05, + "loss": 0.3089, + "step": 5310 + }, + { + "epoch": 0.1987055167418277, + "grad_norm": 0.6154845952987671, + "learning_rate": 1.811398962022935e-05, + "loss": 0.3269, + "step": 5315 + }, + { + "epoch": 0.1988924457321775, + "grad_norm": 0.29491856694221497, + "learning_rate": 1.811055574410186e-05, + "loss": 0.2856, + "step": 5320 + }, + { + "epoch": 0.1990793747225273, + "grad_norm": 0.3641104996204376, + "learning_rate": 1.810711907087609e-05, + "loss": 0.2881, + "step": 5325 + }, + { + "epoch": 0.19926630371287707, + "grad_norm": 0.33928000926971436, + "learning_rate": 1.8103679601737244e-05, + "loss": 0.2707, + "step": 5330 + }, + { + "epoch": 0.19945323270322687, + "grad_norm": 0.21978841722011566, + "learning_rate": 1.810023733787151e-05, + "loss": 0.2939, + "step": 5335 + }, + { + "epoch": 0.19964016169357665, + "grad_norm": 0.3131517469882965, + "learning_rate": 1.8096792280466016e-05, + "loss": 0.2912, + "step": 5340 + }, + { + "epoch": 0.19982709068392646, + "grad_norm": 0.42469385266304016, + "learning_rate": 1.8093344430708873e-05, + "loss": 0.2581, + "step": 5345 + }, + { + "epoch": 0.20001401967427623, + "grad_norm": 0.31522685289382935, + "learning_rate": 1.8089893789789134e-05, + "loss": 0.3042, + "step": 5350 + }, + { + "epoch": 0.20020094866462604, + "grad_norm": 0.5416140556335449, + "learning_rate": 1.8086440358896834e-05, + "loss": 0.2743, + "step": 5355 + }, + { + "epoch": 0.20038787765497582, + "grad_norm": 0.424735963344574, + "learning_rate": 1.808298413922296e-05, + "loss": 0.2687, + "step": 5360 + }, + { + "epoch": 0.20057480664532562, + "grad_norm": 0.24916088581085205, + "learning_rate": 1.807952513195946e-05, + "loss": 0.3091, + "step": 5365 + }, + { + "epoch": 0.2007617356356754, + "grad_norm": 0.5970064401626587, + "learning_rate": 1.8076063338299254e-05, + "loss": 0.3088, + "step": 5370 + }, + { + "epoch": 0.20094866462602518, + "grad_norm": 0.5019383430480957, + "learning_rate": 1.807259875943621e-05, + "loss": 0.2508, + "step": 5375 + }, + { + "epoch": 0.20113559361637498, + "grad_norm": 0.39974379539489746, + "learning_rate": 1.8069131396565164e-05, + "loss": 0.2661, + "step": 5380 + }, + { + "epoch": 0.20132252260672476, + "grad_norm": 0.42386549711227417, + "learning_rate": 1.8065661250881908e-05, + "loss": 0.2875, + "step": 5385 + }, + { + "epoch": 0.20150945159707456, + "grad_norm": 0.626395583152771, + "learning_rate": 1.8062188323583193e-05, + "loss": 0.3221, + "step": 5390 + }, + { + "epoch": 0.20169638058742434, + "grad_norm": 0.5648181438446045, + "learning_rate": 1.805871261586674e-05, + "loss": 0.308, + "step": 5395 + }, + { + "epoch": 0.20188330957777414, + "grad_norm": 0.39882054924964905, + "learning_rate": 1.8055234128931218e-05, + "loss": 0.2746, + "step": 5400 + }, + { + "epoch": 0.20207023856812392, + "grad_norm": 0.3538493812084198, + "learning_rate": 1.8051752863976257e-05, + "loss": 0.364, + "step": 5405 + }, + { + "epoch": 0.20225716755847373, + "grad_norm": 0.5525597333908081, + "learning_rate": 1.804826882220244e-05, + "loss": 0.315, + "step": 5410 + }, + { + "epoch": 0.2024440965488235, + "grad_norm": 0.42425400018692017, + "learning_rate": 1.8044782004811325e-05, + "loss": 0.3296, + "step": 5415 + }, + { + "epoch": 0.2026310255391733, + "grad_norm": 0.3802151083946228, + "learning_rate": 1.8041292413005406e-05, + "loss": 0.2939, + "step": 5420 + }, + { + "epoch": 0.2028179545295231, + "grad_norm": 0.6813692450523376, + "learning_rate": 1.8037800047988145e-05, + "loss": 0.329, + "step": 5425 + }, + { + "epoch": 0.2030048835198729, + "grad_norm": 0.4450630247592926, + "learning_rate": 1.8034304910963957e-05, + "loss": 0.2982, + "step": 5430 + }, + { + "epoch": 0.20319181251022267, + "grad_norm": 0.4038054347038269, + "learning_rate": 1.8030807003138223e-05, + "loss": 0.4678, + "step": 5435 + }, + { + "epoch": 0.20337874150057247, + "grad_norm": 0.6570813655853271, + "learning_rate": 1.8027306325717263e-05, + "loss": 0.2544, + "step": 5440 + }, + { + "epoch": 0.20356567049092225, + "grad_norm": 0.3153902292251587, + "learning_rate": 1.802380287990836e-05, + "loss": 0.2733, + "step": 5445 + }, + { + "epoch": 0.20375259948127206, + "grad_norm": 0.3435896635055542, + "learning_rate": 1.802029666691976e-05, + "loss": 0.2971, + "step": 5450 + }, + { + "epoch": 0.20393952847162183, + "grad_norm": 0.24299199879169464, + "learning_rate": 1.8016787687960645e-05, + "loss": 0.2603, + "step": 5455 + }, + { + "epoch": 0.20412645746197164, + "grad_norm": 0.7944544553756714, + "learning_rate": 1.801327594424117e-05, + "loss": 0.3944, + "step": 5460 + }, + { + "epoch": 0.20431338645232142, + "grad_norm": 0.3297552764415741, + "learning_rate": 1.8009761436972427e-05, + "loss": 0.282, + "step": 5465 + }, + { + "epoch": 0.20450031544267122, + "grad_norm": 0.48783180117607117, + "learning_rate": 1.8006244167366478e-05, + "loss": 0.2335, + "step": 5470 + }, + { + "epoch": 0.204687244433021, + "grad_norm": 0.6959665417671204, + "learning_rate": 1.800272413663632e-05, + "loss": 0.2497, + "step": 5475 + }, + { + "epoch": 0.2048741734233708, + "grad_norm": 0.3114238679409027, + "learning_rate": 1.7999201345995918e-05, + "loss": 0.248, + "step": 5480 + }, + { + "epoch": 0.20506110241372058, + "grad_norm": 0.4099029004573822, + "learning_rate": 1.7995675796660175e-05, + "loss": 0.3176, + "step": 5485 + }, + { + "epoch": 0.20524803140407039, + "grad_norm": 0.9323566555976868, + "learning_rate": 1.7992147489844956e-05, + "loss": 0.3653, + "step": 5490 + }, + { + "epoch": 0.20543496039442016, + "grad_norm": 0.5710521340370178, + "learning_rate": 1.798861642676707e-05, + "loss": 0.2791, + "step": 5495 + }, + { + "epoch": 0.20562188938476997, + "grad_norm": 0.39266684651374817, + "learning_rate": 1.7985082608644285e-05, + "loss": 0.2759, + "step": 5500 + }, + { + "epoch": 0.20580881837511975, + "grad_norm": 0.9457874298095703, + "learning_rate": 1.7981546036695307e-05, + "loss": 0.354, + "step": 5505 + }, + { + "epoch": 0.20599574736546955, + "grad_norm": 0.507675290107727, + "learning_rate": 1.7978006712139802e-05, + "loss": 0.3088, + "step": 5510 + }, + { + "epoch": 0.20618267635581933, + "grad_norm": 0.44382691383361816, + "learning_rate": 1.797446463619838e-05, + "loss": 0.3084, + "step": 5515 + }, + { + "epoch": 0.20636960534616913, + "grad_norm": 0.6009101271629333, + "learning_rate": 1.7970919810092603e-05, + "loss": 0.2258, + "step": 5520 + }, + { + "epoch": 0.2065565343365189, + "grad_norm": 0.46515893936157227, + "learning_rate": 1.7967372235044975e-05, + "loss": 0.3775, + "step": 5525 + }, + { + "epoch": 0.20674346332686871, + "grad_norm": 0.3712780475616455, + "learning_rate": 1.7963821912278963e-05, + "loss": 0.3656, + "step": 5530 + }, + { + "epoch": 0.2069303923172185, + "grad_norm": 0.6315010190010071, + "learning_rate": 1.7960268843018964e-05, + "loss": 0.3175, + "step": 5535 + }, + { + "epoch": 0.2071173213075683, + "grad_norm": 0.4107741117477417, + "learning_rate": 1.7956713028490332e-05, + "loss": 0.2603, + "step": 5540 + }, + { + "epoch": 0.20730425029791807, + "grad_norm": 0.2759909927845001, + "learning_rate": 1.7953154469919365e-05, + "loss": 0.2768, + "step": 5545 + }, + { + "epoch": 0.20749117928826788, + "grad_norm": 0.3919317126274109, + "learning_rate": 1.7949593168533304e-05, + "loss": 0.4107, + "step": 5550 + }, + { + "epoch": 0.20767810827861766, + "grad_norm": 0.23572702705860138, + "learning_rate": 1.7946029125560352e-05, + "loss": 0.3234, + "step": 5555 + }, + { + "epoch": 0.20786503726896746, + "grad_norm": 0.40106138586997986, + "learning_rate": 1.794246234222963e-05, + "loss": 0.3748, + "step": 5560 + }, + { + "epoch": 0.20805196625931724, + "grad_norm": 0.5880841016769409, + "learning_rate": 1.793889281977123e-05, + "loss": 0.2439, + "step": 5565 + }, + { + "epoch": 0.20823889524966704, + "grad_norm": 0.2483060657978058, + "learning_rate": 1.7935320559416173e-05, + "loss": 0.3053, + "step": 5570 + }, + { + "epoch": 0.20842582424001682, + "grad_norm": 1.2637639045715332, + "learning_rate": 1.7931745562396432e-05, + "loss": 0.3467, + "step": 5575 + }, + { + "epoch": 0.20861275323036663, + "grad_norm": 0.26382896304130554, + "learning_rate": 1.7928167829944917e-05, + "loss": 0.2091, + "step": 5580 + }, + { + "epoch": 0.2087996822207164, + "grad_norm": 0.3013812005519867, + "learning_rate": 1.7924587363295493e-05, + "loss": 0.2777, + "step": 5585 + }, + { + "epoch": 0.2089866112110662, + "grad_norm": 0.5539411306381226, + "learning_rate": 1.792100416368295e-05, + "loss": 0.3764, + "step": 5590 + }, + { + "epoch": 0.209173540201416, + "grad_norm": 0.40408313274383545, + "learning_rate": 1.791741823234304e-05, + "loss": 0.3294, + "step": 5595 + }, + { + "epoch": 0.2093604691917658, + "grad_norm": 0.7178158164024353, + "learning_rate": 1.7913829570512445e-05, + "loss": 0.3448, + "step": 5600 + }, + { + "epoch": 0.20954739818211557, + "grad_norm": 0.31547850370407104, + "learning_rate": 1.791023817942879e-05, + "loss": 0.3079, + "step": 5605 + }, + { + "epoch": 0.20973432717246537, + "grad_norm": 0.4582591950893402, + "learning_rate": 1.7906644060330646e-05, + "loss": 0.2815, + "step": 5610 + }, + { + "epoch": 0.20992125616281515, + "grad_norm": 0.7002764344215393, + "learning_rate": 1.7903047214457517e-05, + "loss": 0.2797, + "step": 5615 + }, + { + "epoch": 0.21010818515316493, + "grad_norm": 0.4524244964122772, + "learning_rate": 1.7899447643049855e-05, + "loss": 0.402, + "step": 5620 + }, + { + "epoch": 0.21029511414351473, + "grad_norm": 0.40630853176116943, + "learning_rate": 1.7895845347349047e-05, + "loss": 0.3465, + "step": 5625 + }, + { + "epoch": 0.2104820431338645, + "grad_norm": 0.21824871003627777, + "learning_rate": 1.7892240328597427e-05, + "loss": 0.2492, + "step": 5630 + }, + { + "epoch": 0.21066897212421432, + "grad_norm": 0.3022218644618988, + "learning_rate": 1.7888632588038256e-05, + "loss": 0.2882, + "step": 5635 + }, + { + "epoch": 0.2108559011145641, + "grad_norm": 0.761012077331543, + "learning_rate": 1.7885022126915743e-05, + "loss": 0.3003, + "step": 5640 + }, + { + "epoch": 0.2110428301049139, + "grad_norm": 0.39378008246421814, + "learning_rate": 1.7881408946475035e-05, + "loss": 0.296, + "step": 5645 + }, + { + "epoch": 0.21122975909526367, + "grad_norm": 0.6320983171463013, + "learning_rate": 1.787779304796221e-05, + "loss": 0.2578, + "step": 5650 + }, + { + "epoch": 0.21141668808561348, + "grad_norm": 0.3272438943386078, + "learning_rate": 1.787417443262429e-05, + "loss": 0.297, + "step": 5655 + }, + { + "epoch": 0.21160361707596326, + "grad_norm": 0.4508860409259796, + "learning_rate": 1.7870553101709232e-05, + "loss": 0.2263, + "step": 5660 + }, + { + "epoch": 0.21179054606631306, + "grad_norm": 0.3349319398403168, + "learning_rate": 1.7866929056465933e-05, + "loss": 0.3182, + "step": 5665 + }, + { + "epoch": 0.21197747505666284, + "grad_norm": 0.7591943740844727, + "learning_rate": 1.7863302298144218e-05, + "loss": 0.2884, + "step": 5670 + }, + { + "epoch": 0.21216440404701264, + "grad_norm": 0.35864681005477905, + "learning_rate": 1.785967282799485e-05, + "loss": 0.2924, + "step": 5675 + }, + { + "epoch": 0.21235133303736242, + "grad_norm": 0.42692896723747253, + "learning_rate": 1.785604064726953e-05, + "loss": 0.2845, + "step": 5680 + }, + { + "epoch": 0.21253826202771223, + "grad_norm": 0.5757957100868225, + "learning_rate": 1.7852405757220898e-05, + "loss": 0.3391, + "step": 5685 + }, + { + "epoch": 0.212725191018062, + "grad_norm": 0.48782822489738464, + "learning_rate": 1.7848768159102522e-05, + "loss": 0.3037, + "step": 5690 + }, + { + "epoch": 0.2129121200084118, + "grad_norm": 0.31845298409461975, + "learning_rate": 1.78451278541689e-05, + "loss": 0.2772, + "step": 5695 + }, + { + "epoch": 0.2130990489987616, + "grad_norm": 0.33685699105262756, + "learning_rate": 1.7841484843675473e-05, + "loss": 0.319, + "step": 5700 + }, + { + "epoch": 0.2132859779891114, + "grad_norm": 0.391272634267807, + "learning_rate": 1.7837839128878608e-05, + "loss": 0.324, + "step": 5705 + }, + { + "epoch": 0.21347290697946117, + "grad_norm": 0.42025449872016907, + "learning_rate": 1.7834190711035613e-05, + "loss": 0.2744, + "step": 5710 + }, + { + "epoch": 0.21365983596981097, + "grad_norm": 0.5056503415107727, + "learning_rate": 1.7830539591404717e-05, + "loss": 0.2977, + "step": 5715 + }, + { + "epoch": 0.21384676496016075, + "grad_norm": 0.5261359214782715, + "learning_rate": 1.7826885771245094e-05, + "loss": 0.3341, + "step": 5720 + }, + { + "epoch": 0.21403369395051056, + "grad_norm": 0.44913962483406067, + "learning_rate": 1.7823229251816836e-05, + "loss": 0.2768, + "step": 5725 + }, + { + "epoch": 0.21422062294086033, + "grad_norm": 0.2031090408563614, + "learning_rate": 1.7819570034380968e-05, + "loss": 0.2962, + "step": 5730 + }, + { + "epoch": 0.21440755193121014, + "grad_norm": 0.3239678144454956, + "learning_rate": 1.7815908120199462e-05, + "loss": 0.2717, + "step": 5735 + }, + { + "epoch": 0.21459448092155992, + "grad_norm": 0.25322166085243225, + "learning_rate": 1.7812243510535194e-05, + "loss": 0.3227, + "step": 5740 + }, + { + "epoch": 0.21478140991190972, + "grad_norm": 0.2611919045448303, + "learning_rate": 1.780857620665199e-05, + "loss": 0.3009, + "step": 5745 + }, + { + "epoch": 0.2149683389022595, + "grad_norm": 0.580700695514679, + "learning_rate": 1.7804906209814597e-05, + "loss": 0.26, + "step": 5750 + }, + { + "epoch": 0.2151552678926093, + "grad_norm": 0.7626398205757141, + "learning_rate": 1.780123352128869e-05, + "loss": 0.3374, + "step": 5755 + }, + { + "epoch": 0.21534219688295908, + "grad_norm": 0.34990110993385315, + "learning_rate": 1.779755814234087e-05, + "loss": 0.2281, + "step": 5760 + }, + { + "epoch": 0.21552912587330889, + "grad_norm": 0.5652785301208496, + "learning_rate": 1.779388007423868e-05, + "loss": 0.324, + "step": 5765 + }, + { + "epoch": 0.21571605486365866, + "grad_norm": 0.45265763998031616, + "learning_rate": 1.779019931825058e-05, + "loss": 0.3418, + "step": 5770 + }, + { + "epoch": 0.21590298385400847, + "grad_norm": 0.47476527094841003, + "learning_rate": 1.7786515875645945e-05, + "loss": 0.2861, + "step": 5775 + }, + { + "epoch": 0.21608991284435825, + "grad_norm": 0.4736066162586212, + "learning_rate": 1.77828297476951e-05, + "loss": 0.3211, + "step": 5780 + }, + { + "epoch": 0.21627684183470805, + "grad_norm": 0.4887428879737854, + "learning_rate": 1.777914093566928e-05, + "loss": 0.3205, + "step": 5785 + }, + { + "epoch": 0.21646377082505783, + "grad_norm": 0.7084537148475647, + "learning_rate": 1.7775449440840656e-05, + "loss": 0.2698, + "step": 5790 + }, + { + "epoch": 0.21665069981540763, + "grad_norm": 0.7960801124572754, + "learning_rate": 1.777175526448231e-05, + "loss": 0.3193, + "step": 5795 + }, + { + "epoch": 0.2168376288057574, + "grad_norm": 0.48259130120277405, + "learning_rate": 1.776805840786826e-05, + "loss": 0.3232, + "step": 5800 + }, + { + "epoch": 0.21702455779610721, + "grad_norm": 0.5537059307098389, + "learning_rate": 1.7764358872273452e-05, + "loss": 0.2775, + "step": 5805 + }, + { + "epoch": 0.217211486786457, + "grad_norm": 0.2787805497646332, + "learning_rate": 1.7760656658973748e-05, + "loss": 0.285, + "step": 5810 + }, + { + "epoch": 0.2173984157768068, + "grad_norm": 0.6352003216743469, + "learning_rate": 1.7756951769245933e-05, + "loss": 0.3008, + "step": 5815 + }, + { + "epoch": 0.21758534476715657, + "grad_norm": 0.45176759362220764, + "learning_rate": 1.7753244204367713e-05, + "loss": 0.3192, + "step": 5820 + }, + { + "epoch": 0.21777227375750638, + "grad_norm": 0.38481858372688293, + "learning_rate": 1.7749533965617726e-05, + "loss": 0.3189, + "step": 5825 + }, + { + "epoch": 0.21795920274785616, + "grad_norm": 0.2588234841823578, + "learning_rate": 1.7745821054275533e-05, + "loss": 0.3369, + "step": 5830 + }, + { + "epoch": 0.21814613173820596, + "grad_norm": 0.4388332664966583, + "learning_rate": 1.7742105471621597e-05, + "loss": 0.3092, + "step": 5835 + }, + { + "epoch": 0.21833306072855574, + "grad_norm": 0.5704757571220398, + "learning_rate": 1.7738387218937326e-05, + "loss": 0.3485, + "step": 5840 + }, + { + "epoch": 0.21851998971890554, + "grad_norm": 0.23019002377986908, + "learning_rate": 1.7734666297505034e-05, + "loss": 0.3094, + "step": 5845 + }, + { + "epoch": 0.21870691870925532, + "grad_norm": 0.582446277141571, + "learning_rate": 1.7730942708607965e-05, + "loss": 0.3464, + "step": 5850 + }, + { + "epoch": 0.21889384769960513, + "grad_norm": 0.621008574962616, + "learning_rate": 1.7727216453530275e-05, + "loss": 0.3706, + "step": 5855 + }, + { + "epoch": 0.2190807766899549, + "grad_norm": 0.4594647288322449, + "learning_rate": 1.7723487533557043e-05, + "loss": 0.3341, + "step": 5860 + }, + { + "epoch": 0.21926770568030468, + "grad_norm": 0.3757644593715668, + "learning_rate": 1.7719755949974264e-05, + "loss": 0.2758, + "step": 5865 + }, + { + "epoch": 0.21945463467065449, + "grad_norm": 0.38085031509399414, + "learning_rate": 1.7716021704068858e-05, + "loss": 0.248, + "step": 5870 + }, + { + "epoch": 0.21964156366100426, + "grad_norm": 0.22712992131710052, + "learning_rate": 1.771228479712866e-05, + "loss": 0.2829, + "step": 5875 + }, + { + "epoch": 0.21982849265135407, + "grad_norm": 0.7901753783226013, + "learning_rate": 1.770854523044242e-05, + "loss": 0.3236, + "step": 5880 + }, + { + "epoch": 0.22001542164170385, + "grad_norm": 0.3070071041584015, + "learning_rate": 1.7704803005299806e-05, + "loss": 0.2932, + "step": 5885 + }, + { + "epoch": 0.22020235063205365, + "grad_norm": 0.6579908132553101, + "learning_rate": 1.7701058122991407e-05, + "loss": 0.3087, + "step": 5890 + }, + { + "epoch": 0.22038927962240343, + "grad_norm": 0.29225432872772217, + "learning_rate": 1.7697310584808726e-05, + "loss": 0.2783, + "step": 5895 + }, + { + "epoch": 0.22057620861275323, + "grad_norm": 0.4828365743160248, + "learning_rate": 1.769356039204418e-05, + "loss": 0.3803, + "step": 5900 + }, + { + "epoch": 0.220763137603103, + "grad_norm": 0.47344258427619934, + "learning_rate": 1.76898075459911e-05, + "loss": 0.3296, + "step": 5905 + }, + { + "epoch": 0.22095006659345282, + "grad_norm": 0.335570752620697, + "learning_rate": 1.768605204794374e-05, + "loss": 0.4109, + "step": 5910 + }, + { + "epoch": 0.2211369955838026, + "grad_norm": 0.4336024820804596, + "learning_rate": 1.7682293899197264e-05, + "loss": 0.3487, + "step": 5915 + }, + { + "epoch": 0.2213239245741524, + "grad_norm": 0.43026483058929443, + "learning_rate": 1.7678533101047745e-05, + "loss": 0.4125, + "step": 5920 + }, + { + "epoch": 0.22151085356450217, + "grad_norm": 0.3426781892776489, + "learning_rate": 1.767476965479218e-05, + "loss": 0.274, + "step": 5925 + }, + { + "epoch": 0.22169778255485198, + "grad_norm": 0.26045331358909607, + "learning_rate": 1.7671003561728468e-05, + "loss": 0.2431, + "step": 5930 + }, + { + "epoch": 0.22188471154520176, + "grad_norm": 0.3758314251899719, + "learning_rate": 1.766723482315543e-05, + "loss": 0.3625, + "step": 5935 + }, + { + "epoch": 0.22207164053555156, + "grad_norm": 0.5306219458580017, + "learning_rate": 1.7663463440372795e-05, + "loss": 0.3293, + "step": 5940 + }, + { + "epoch": 0.22225856952590134, + "grad_norm": 0.4184575080871582, + "learning_rate": 1.7659689414681208e-05, + "loss": 0.2876, + "step": 5945 + }, + { + "epoch": 0.22244549851625114, + "grad_norm": 0.4982224702835083, + "learning_rate": 1.765591274738222e-05, + "loss": 0.4022, + "step": 5950 + }, + { + "epoch": 0.22263242750660092, + "grad_norm": 0.6839665770530701, + "learning_rate": 1.765213343977829e-05, + "loss": 0.3823, + "step": 5955 + }, + { + "epoch": 0.22281935649695073, + "grad_norm": 0.3338620066642761, + "learning_rate": 1.7648351493172805e-05, + "loss": 0.2864, + "step": 5960 + }, + { + "epoch": 0.2230062854873005, + "grad_norm": 0.436450332403183, + "learning_rate": 1.764456690887004e-05, + "loss": 0.3375, + "step": 5965 + }, + { + "epoch": 0.2231932144776503, + "grad_norm": 0.4416438341140747, + "learning_rate": 1.7640779688175192e-05, + "loss": 0.3351, + "step": 5970 + }, + { + "epoch": 0.2233801434680001, + "grad_norm": 2.5495193004608154, + "learning_rate": 1.7636989832394365e-05, + "loss": 0.3343, + "step": 5975 + }, + { + "epoch": 0.2235670724583499, + "grad_norm": 0.6355164647102356, + "learning_rate": 1.7633197342834574e-05, + "loss": 0.3348, + "step": 5980 + }, + { + "epoch": 0.22375400144869967, + "grad_norm": 0.6266055703163147, + "learning_rate": 1.762940222080374e-05, + "loss": 0.3222, + "step": 5985 + }, + { + "epoch": 0.22394093043904947, + "grad_norm": 0.5203749537467957, + "learning_rate": 1.7625604467610685e-05, + "loss": 0.2935, + "step": 5990 + }, + { + "epoch": 0.22412785942939925, + "grad_norm": 0.40464696288108826, + "learning_rate": 1.7621804084565153e-05, + "loss": 0.3872, + "step": 5995 + }, + { + "epoch": 0.22431478841974906, + "grad_norm": 0.43349775671958923, + "learning_rate": 1.7618001072977783e-05, + "loss": 0.2921, + "step": 6000 + }, + { + "epoch": 0.22450171741009883, + "grad_norm": 2.7697973251342773, + "learning_rate": 1.7614195434160128e-05, + "loss": 0.3545, + "step": 6005 + }, + { + "epoch": 0.22468864640044864, + "grad_norm": 1.0596578121185303, + "learning_rate": 1.7610387169424643e-05, + "loss": 0.3049, + "step": 6010 + }, + { + "epoch": 0.22487557539079842, + "grad_norm": 0.39064550399780273, + "learning_rate": 1.7606576280084688e-05, + "loss": 0.3061, + "step": 6015 + }, + { + "epoch": 0.22506250438114822, + "grad_norm": 0.18736739456653595, + "learning_rate": 1.760276276745453e-05, + "loss": 0.337, + "step": 6020 + }, + { + "epoch": 0.225249433371498, + "grad_norm": 0.4019501805305481, + "learning_rate": 1.7598946632849338e-05, + "loss": 0.2916, + "step": 6025 + }, + { + "epoch": 0.2254363623618478, + "grad_norm": 0.5677745938301086, + "learning_rate": 1.759512787758519e-05, + "loss": 0.3083, + "step": 6030 + }, + { + "epoch": 0.22562329135219758, + "grad_norm": 0.6863211393356323, + "learning_rate": 1.7591306502979066e-05, + "loss": 0.3761, + "step": 6035 + }, + { + "epoch": 0.22581022034254739, + "grad_norm": 0.25616636872291565, + "learning_rate": 1.7587482510348848e-05, + "loss": 0.2742, + "step": 6040 + }, + { + "epoch": 0.22599714933289716, + "grad_norm": 0.37293756008148193, + "learning_rate": 1.7583655901013323e-05, + "loss": 0.3009, + "step": 6045 + }, + { + "epoch": 0.22618407832324697, + "grad_norm": 0.33705809712409973, + "learning_rate": 1.757982667629217e-05, + "loss": 0.2773, + "step": 6050 + }, + { + "epoch": 0.22637100731359674, + "grad_norm": 0.43777912855148315, + "learning_rate": 1.757599483750599e-05, + "loss": 0.3039, + "step": 6055 + }, + { + "epoch": 0.22655793630394655, + "grad_norm": 0.4632292091846466, + "learning_rate": 1.7572160385976273e-05, + "loss": 0.3124, + "step": 6060 + }, + { + "epoch": 0.22674486529429633, + "grad_norm": 0.2688206434249878, + "learning_rate": 1.756832332302541e-05, + "loss": 0.2935, + "step": 6065 + }, + { + "epoch": 0.22693179428464613, + "grad_norm": 0.5086357593536377, + "learning_rate": 1.7564483649976686e-05, + "loss": 0.342, + "step": 6070 + }, + { + "epoch": 0.2271187232749959, + "grad_norm": 1.0471845865249634, + "learning_rate": 1.7560641368154307e-05, + "loss": 0.3716, + "step": 6075 + }, + { + "epoch": 0.22730565226534571, + "grad_norm": 0.4194478988647461, + "learning_rate": 1.7556796478883355e-05, + "loss": 0.2641, + "step": 6080 + }, + { + "epoch": 0.2274925812556955, + "grad_norm": 0.6006360650062561, + "learning_rate": 1.755294898348983e-05, + "loss": 0.3655, + "step": 6085 + }, + { + "epoch": 0.2276795102460453, + "grad_norm": 0.48307010531425476, + "learning_rate": 1.754909888330062e-05, + "loss": 0.2505, + "step": 6090 + }, + { + "epoch": 0.22786643923639507, + "grad_norm": 0.3270209729671478, + "learning_rate": 1.7545246179643513e-05, + "loss": 0.3142, + "step": 6095 + }, + { + "epoch": 0.22805336822674485, + "grad_norm": 0.5060667395591736, + "learning_rate": 1.7541390873847195e-05, + "loss": 0.4389, + "step": 6100 + }, + { + "epoch": 0.22824029721709466, + "grad_norm": 0.15000495314598083, + "learning_rate": 1.7537532967241255e-05, + "loss": 0.3561, + "step": 6105 + }, + { + "epoch": 0.22842722620744443, + "grad_norm": 0.6780317425727844, + "learning_rate": 1.7533672461156173e-05, + "loss": 0.2892, + "step": 6110 + }, + { + "epoch": 0.22861415519779424, + "grad_norm": 0.4353383481502533, + "learning_rate": 1.7529809356923327e-05, + "loss": 0.3026, + "step": 6115 + }, + { + "epoch": 0.22880108418814402, + "grad_norm": 0.34928640723228455, + "learning_rate": 1.7525943655874987e-05, + "loss": 0.3004, + "step": 6120 + }, + { + "epoch": 0.22898801317849382, + "grad_norm": 0.538777232170105, + "learning_rate": 1.752207535934433e-05, + "loss": 0.3053, + "step": 6125 + }, + { + "epoch": 0.2291749421688436, + "grad_norm": 0.6690159440040588, + "learning_rate": 1.7518204468665415e-05, + "loss": 0.3058, + "step": 6130 + }, + { + "epoch": 0.2293618711591934, + "grad_norm": 0.29182419180870056, + "learning_rate": 1.75143309851732e-05, + "loss": 0.2783, + "step": 6135 + }, + { + "epoch": 0.22954880014954318, + "grad_norm": 0.4138169586658478, + "learning_rate": 1.751045491020354e-05, + "loss": 0.363, + "step": 6140 + }, + { + "epoch": 0.22973572913989299, + "grad_norm": 0.433714359998703, + "learning_rate": 1.750657624509319e-05, + "loss": 0.2915, + "step": 6145 + }, + { + "epoch": 0.22992265813024276, + "grad_norm": 0.41636890172958374, + "learning_rate": 1.750269499117978e-05, + "loss": 0.275, + "step": 6150 + }, + { + "epoch": 0.23010958712059257, + "grad_norm": 0.38736167550086975, + "learning_rate": 1.7498811149801845e-05, + "loss": 0.4277, + "step": 6155 + }, + { + "epoch": 0.23029651611094235, + "grad_norm": 0.578729510307312, + "learning_rate": 1.749492472229881e-05, + "loss": 0.3489, + "step": 6160 + }, + { + "epoch": 0.23048344510129215, + "grad_norm": 0.6496894359588623, + "learning_rate": 1.7491035710011e-05, + "loss": 0.2933, + "step": 6165 + }, + { + "epoch": 0.23067037409164193, + "grad_norm": 0.2986353039741516, + "learning_rate": 1.748714411427962e-05, + "loss": 0.2677, + "step": 6170 + }, + { + "epoch": 0.23085730308199173, + "grad_norm": 0.38289961218833923, + "learning_rate": 1.7483249936446768e-05, + "loss": 0.2705, + "step": 6175 + }, + { + "epoch": 0.2310442320723415, + "grad_norm": 0.4401322603225708, + "learning_rate": 1.7479353177855434e-05, + "loss": 0.2505, + "step": 6180 + }, + { + "epoch": 0.23123116106269131, + "grad_norm": 0.48002535104751587, + "learning_rate": 1.74754538398495e-05, + "loss": 0.3417, + "step": 6185 + }, + { + "epoch": 0.2314180900530411, + "grad_norm": 0.32335153222084045, + "learning_rate": 1.7471551923773732e-05, + "loss": 0.3704, + "step": 6190 + }, + { + "epoch": 0.2316050190433909, + "grad_norm": 0.34287896752357483, + "learning_rate": 1.7467647430973796e-05, + "loss": 0.3006, + "step": 6195 + }, + { + "epoch": 0.23179194803374067, + "grad_norm": 0.6139170527458191, + "learning_rate": 1.7463740362796235e-05, + "loss": 0.4078, + "step": 6200 + }, + { + "epoch": 0.23197887702409048, + "grad_norm": 0.5172100067138672, + "learning_rate": 1.7459830720588486e-05, + "loss": 0.4518, + "step": 6205 + }, + { + "epoch": 0.23216580601444026, + "grad_norm": 0.5083472728729248, + "learning_rate": 1.7455918505698876e-05, + "loss": 0.2808, + "step": 6210 + }, + { + "epoch": 0.23235273500479006, + "grad_norm": 0.3061027526855469, + "learning_rate": 1.745200371947661e-05, + "loss": 0.3108, + "step": 6215 + }, + { + "epoch": 0.23253966399513984, + "grad_norm": 0.5908476710319519, + "learning_rate": 1.7448086363271785e-05, + "loss": 0.2467, + "step": 6220 + }, + { + "epoch": 0.23272659298548964, + "grad_norm": 0.5854301452636719, + "learning_rate": 1.7444166438435392e-05, + "loss": 0.3062, + "step": 6225 + }, + { + "epoch": 0.23291352197583942, + "grad_norm": 0.6378867030143738, + "learning_rate": 1.7440243946319294e-05, + "loss": 0.2687, + "step": 6230 + }, + { + "epoch": 0.23310045096618923, + "grad_norm": 0.195028617978096, + "learning_rate": 1.7436318888276252e-05, + "loss": 0.3161, + "step": 6235 + }, + { + "epoch": 0.233287379956539, + "grad_norm": 0.22017711400985718, + "learning_rate": 1.74323912656599e-05, + "loss": 0.381, + "step": 6240 + }, + { + "epoch": 0.2334743089468888, + "grad_norm": 0.43266749382019043, + "learning_rate": 1.742846107982477e-05, + "loss": 0.2814, + "step": 6245 + }, + { + "epoch": 0.2336612379372386, + "grad_norm": 0.30291399359703064, + "learning_rate": 1.742452833212626e-05, + "loss": 0.2848, + "step": 6250 + }, + { + "epoch": 0.2338481669275884, + "grad_norm": 0.38583922386169434, + "learning_rate": 1.7420593023920673e-05, + "loss": 0.4128, + "step": 6255 + }, + { + "epoch": 0.23403509591793817, + "grad_norm": 0.34890538454055786, + "learning_rate": 1.7416655156565173e-05, + "loss": 0.2512, + "step": 6260 + }, + { + "epoch": 0.23422202490828797, + "grad_norm": 0.35292932391166687, + "learning_rate": 1.741271473141783e-05, + "loss": 0.3371, + "step": 6265 + }, + { + "epoch": 0.23440895389863775, + "grad_norm": 0.45217952132225037, + "learning_rate": 1.7408771749837572e-05, + "loss": 0.2664, + "step": 6270 + }, + { + "epoch": 0.23459588288898756, + "grad_norm": 0.32959669828414917, + "learning_rate": 1.7404826213184228e-05, + "loss": 0.392, + "step": 6275 + }, + { + "epoch": 0.23478281187933733, + "grad_norm": 0.4836936891078949, + "learning_rate": 1.74008781228185e-05, + "loss": 0.2778, + "step": 6280 + }, + { + "epoch": 0.23496974086968714, + "grad_norm": 0.31659331917762756, + "learning_rate": 1.7396927480101968e-05, + "loss": 0.2593, + "step": 6285 + }, + { + "epoch": 0.23515666986003692, + "grad_norm": 0.3063521981239319, + "learning_rate": 1.7392974286397096e-05, + "loss": 0.2791, + "step": 6290 + }, + { + "epoch": 0.23534359885038672, + "grad_norm": 0.3990836441516876, + "learning_rate": 1.7389018543067227e-05, + "loss": 0.3199, + "step": 6295 + }, + { + "epoch": 0.2355305278407365, + "grad_norm": 0.38259872794151306, + "learning_rate": 1.738506025147659e-05, + "loss": 0.2996, + "step": 6300 + }, + { + "epoch": 0.2357174568310863, + "grad_norm": 0.4150097072124481, + "learning_rate": 1.7381099412990276e-05, + "loss": 0.338, + "step": 6305 + }, + { + "epoch": 0.23590438582143608, + "grad_norm": 0.18883201479911804, + "learning_rate": 1.737713602897427e-05, + "loss": 0.3198, + "step": 6310 + }, + { + "epoch": 0.23609131481178589, + "grad_norm": 0.2726154923439026, + "learning_rate": 1.7373170100795433e-05, + "loss": 0.2758, + "step": 6315 + }, + { + "epoch": 0.23627824380213566, + "grad_norm": 0.3209015727043152, + "learning_rate": 1.736920162982149e-05, + "loss": 0.3209, + "step": 6320 + }, + { + "epoch": 0.23646517279248547, + "grad_norm": 0.32282012701034546, + "learning_rate": 1.736523061742107e-05, + "loss": 0.3198, + "step": 6325 + }, + { + "epoch": 0.23665210178283524, + "grad_norm": 0.6019655466079712, + "learning_rate": 1.736125706496364e-05, + "loss": 0.3568, + "step": 6330 + }, + { + "epoch": 0.23683903077318505, + "grad_norm": 0.42038559913635254, + "learning_rate": 1.7357280973819576e-05, + "loss": 0.3844, + "step": 6335 + }, + { + "epoch": 0.23702595976353483, + "grad_norm": 0.6067512631416321, + "learning_rate": 1.7353302345360118e-05, + "loss": 0.2941, + "step": 6340 + }, + { + "epoch": 0.2372128887538846, + "grad_norm": 0.3025815486907959, + "learning_rate": 1.7349321180957382e-05, + "loss": 0.2795, + "step": 6345 + }, + { + "epoch": 0.2373998177442344, + "grad_norm": 0.30944767594337463, + "learning_rate": 1.7345337481984355e-05, + "loss": 0.2756, + "step": 6350 + }, + { + "epoch": 0.2375867467345842, + "grad_norm": 0.33044394850730896, + "learning_rate": 1.73413512498149e-05, + "loss": 0.2536, + "step": 6355 + }, + { + "epoch": 0.237773675724934, + "grad_norm": 0.45606037974357605, + "learning_rate": 1.7337362485823757e-05, + "loss": 0.3169, + "step": 6360 + }, + { + "epoch": 0.23796060471528377, + "grad_norm": 0.38473767042160034, + "learning_rate": 1.7333371191386535e-05, + "loss": 0.2228, + "step": 6365 + }, + { + "epoch": 0.23814753370563357, + "grad_norm": 0.3136763870716095, + "learning_rate": 1.7329377367879715e-05, + "loss": 0.2625, + "step": 6370 + }, + { + "epoch": 0.23833446269598335, + "grad_norm": 0.33016493916511536, + "learning_rate": 1.7325381016680657e-05, + "loss": 0.4031, + "step": 6375 + }, + { + "epoch": 0.23852139168633316, + "grad_norm": 0.6611918807029724, + "learning_rate": 1.7321382139167578e-05, + "loss": 0.346, + "step": 6380 + }, + { + "epoch": 0.23870832067668293, + "grad_norm": 0.3350640833377838, + "learning_rate": 1.7317380736719588e-05, + "loss": 0.2905, + "step": 6385 + }, + { + "epoch": 0.23889524966703274, + "grad_norm": 0.6378026008605957, + "learning_rate": 1.7313376810716654e-05, + "loss": 0.2809, + "step": 6390 + }, + { + "epoch": 0.23908217865738252, + "grad_norm": 0.5519046783447266, + "learning_rate": 1.7309370362539607e-05, + "loss": 0.3125, + "step": 6395 + }, + { + "epoch": 0.23926910764773232, + "grad_norm": 0.4152654707431793, + "learning_rate": 1.7305361393570165e-05, + "loss": 0.267, + "step": 6400 + }, + { + "epoch": 0.2394560366380821, + "grad_norm": 0.47034916281700134, + "learning_rate": 1.7301349905190904e-05, + "loss": 0.2861, + "step": 6405 + }, + { + "epoch": 0.2396429656284319, + "grad_norm": 1.0928969383239746, + "learning_rate": 1.729733589878527e-05, + "loss": 0.306, + "step": 6410 + }, + { + "epoch": 0.23982989461878168, + "grad_norm": 0.28271111845970154, + "learning_rate": 1.729331937573758e-05, + "loss": 0.2999, + "step": 6415 + }, + { + "epoch": 0.24001682360913149, + "grad_norm": 0.27167975902557373, + "learning_rate": 1.7289300337433017e-05, + "loss": 0.2157, + "step": 6420 + }, + { + "epoch": 0.24020375259948126, + "grad_norm": 0.3268529176712036, + "learning_rate": 1.7285278785257633e-05, + "loss": 0.2802, + "step": 6425 + }, + { + "epoch": 0.24039068158983107, + "grad_norm": 0.6253595352172852, + "learning_rate": 1.7281254720598342e-05, + "loss": 0.3881, + "step": 6430 + }, + { + "epoch": 0.24057761058018085, + "grad_norm": 0.3022212088108063, + "learning_rate": 1.727722814484294e-05, + "loss": 0.3592, + "step": 6435 + }, + { + "epoch": 0.24076453957053065, + "grad_norm": 1.0882967710494995, + "learning_rate": 1.7273199059380062e-05, + "loss": 0.2548, + "step": 6440 + }, + { + "epoch": 0.24095146856088043, + "grad_norm": 0.23813073337078094, + "learning_rate": 1.7269167465599236e-05, + "loss": 0.2696, + "step": 6445 + }, + { + "epoch": 0.24113839755123023, + "grad_norm": 0.4797143042087555, + "learning_rate": 1.7265133364890842e-05, + "loss": 0.2992, + "step": 6450 + }, + { + "epoch": 0.24132532654158, + "grad_norm": 0.3314330577850342, + "learning_rate": 1.7261096758646115e-05, + "loss": 0.4107, + "step": 6455 + }, + { + "epoch": 0.24151225553192981, + "grad_norm": 0.38850629329681396, + "learning_rate": 1.7257057648257174e-05, + "loss": 0.3134, + "step": 6460 + }, + { + "epoch": 0.2416991845222796, + "grad_norm": 0.39222410321235657, + "learning_rate": 1.7253016035117e-05, + "loss": 0.2126, + "step": 6465 + }, + { + "epoch": 0.2418861135126294, + "grad_norm": 0.18110430240631104, + "learning_rate": 1.7248971920619413e-05, + "loss": 0.2886, + "step": 6470 + }, + { + "epoch": 0.24207304250297917, + "grad_norm": 0.47243568301200867, + "learning_rate": 1.724492530615912e-05, + "loss": 0.3672, + "step": 6475 + }, + { + "epoch": 0.24225997149332898, + "grad_norm": 0.3299618661403656, + "learning_rate": 1.7240876193131685e-05, + "loss": 0.3709, + "step": 6480 + }, + { + "epoch": 0.24244690048367876, + "grad_norm": 0.34539106488227844, + "learning_rate": 1.7236824582933525e-05, + "loss": 0.2648, + "step": 6485 + }, + { + "epoch": 0.24263382947402856, + "grad_norm": 0.5265419483184814, + "learning_rate": 1.7232770476961932e-05, + "loss": 0.2256, + "step": 6490 + }, + { + "epoch": 0.24282075846437834, + "grad_norm": 0.5372416377067566, + "learning_rate": 1.7228713876615043e-05, + "loss": 0.2817, + "step": 6495 + }, + { + "epoch": 0.24300768745472814, + "grad_norm": 0.4615037739276886, + "learning_rate": 1.7224654783291867e-05, + "loss": 0.2583, + "step": 6500 + }, + { + "epoch": 0.24319461644507792, + "grad_norm": 0.5872400403022766, + "learning_rate": 1.722059319839227e-05, + "loss": 0.3394, + "step": 6505 + }, + { + "epoch": 0.24338154543542773, + "grad_norm": 0.5134822726249695, + "learning_rate": 1.7216529123316975e-05, + "loss": 0.2918, + "step": 6510 + }, + { + "epoch": 0.2435684744257775, + "grad_norm": 0.3390646278858185, + "learning_rate": 1.7212462559467567e-05, + "loss": 0.338, + "step": 6515 + }, + { + "epoch": 0.2437554034161273, + "grad_norm": 0.24070733785629272, + "learning_rate": 1.7208393508246484e-05, + "loss": 0.3063, + "step": 6520 + }, + { + "epoch": 0.24394233240647709, + "grad_norm": 0.4138699471950531, + "learning_rate": 1.7204321971057024e-05, + "loss": 0.2647, + "step": 6525 + }, + { + "epoch": 0.2441292613968269, + "grad_norm": 0.5086038708686829, + "learning_rate": 1.720024794930335e-05, + "loss": 0.2562, + "step": 6530 + }, + { + "epoch": 0.24431619038717667, + "grad_norm": 0.8470866680145264, + "learning_rate": 1.719617144439047e-05, + "loss": 0.276, + "step": 6535 + }, + { + "epoch": 0.24450311937752647, + "grad_norm": 0.5553832650184631, + "learning_rate": 1.7192092457724254e-05, + "loss": 0.2874, + "step": 6540 + }, + { + "epoch": 0.24469004836787625, + "grad_norm": 0.32335466146469116, + "learning_rate": 1.718801099071143e-05, + "loss": 0.2903, + "step": 6545 + }, + { + "epoch": 0.24487697735822606, + "grad_norm": 0.46024224162101746, + "learning_rate": 1.7183927044759576e-05, + "loss": 0.3618, + "step": 6550 + }, + { + "epoch": 0.24506390634857583, + "grad_norm": 0.29070380330085754, + "learning_rate": 1.7179840621277132e-05, + "loss": 0.4624, + "step": 6555 + }, + { + "epoch": 0.24525083533892564, + "grad_norm": 0.35333648324012756, + "learning_rate": 1.7175751721673384e-05, + "loss": 0.2641, + "step": 6560 + }, + { + "epoch": 0.24543776432927542, + "grad_norm": 0.5118494629859924, + "learning_rate": 1.7171660347358482e-05, + "loss": 0.289, + "step": 6565 + }, + { + "epoch": 0.24562469331962522, + "grad_norm": 0.4456328749656677, + "learning_rate": 1.7167566499743417e-05, + "loss": 0.2784, + "step": 6570 + }, + { + "epoch": 0.245811622309975, + "grad_norm": 0.42578622698783875, + "learning_rate": 1.7163470180240047e-05, + "loss": 0.242, + "step": 6575 + }, + { + "epoch": 0.2459985513003248, + "grad_norm": 0.38927942514419556, + "learning_rate": 1.7159371390261067e-05, + "loss": 0.3147, + "step": 6580 + }, + { + "epoch": 0.24618548029067458, + "grad_norm": 0.4326514005661011, + "learning_rate": 1.715527013122004e-05, + "loss": 0.3499, + "step": 6585 + }, + { + "epoch": 0.24637240928102436, + "grad_norm": 0.4785178601741791, + "learning_rate": 1.7151166404531365e-05, + "loss": 0.3341, + "step": 6590 + }, + { + "epoch": 0.24655933827137416, + "grad_norm": 0.09519144147634506, + "learning_rate": 1.7147060211610305e-05, + "loss": 0.2694, + "step": 6595 + }, + { + "epoch": 0.24674626726172394, + "grad_norm": 0.4483666718006134, + "learning_rate": 1.7142951553872968e-05, + "loss": 0.4049, + "step": 6600 + }, + { + "epoch": 0.24693319625207374, + "grad_norm": 0.2505245804786682, + "learning_rate": 1.713884043273631e-05, + "loss": 0.3139, + "step": 6605 + }, + { + "epoch": 0.24712012524242352, + "grad_norm": 0.5089213848114014, + "learning_rate": 1.7134726849618144e-05, + "loss": 0.3026, + "step": 6610 + }, + { + "epoch": 0.24730705423277333, + "grad_norm": 0.3417368531227112, + "learning_rate": 1.7130610805937123e-05, + "loss": 0.219, + "step": 6615 + }, + { + "epoch": 0.2474939832231231, + "grad_norm": 0.428158164024353, + "learning_rate": 1.712649230311275e-05, + "loss": 0.3102, + "step": 6620 + }, + { + "epoch": 0.2476809122134729, + "grad_norm": 0.4470468759536743, + "learning_rate": 1.7122371342565384e-05, + "loss": 0.2733, + "step": 6625 + }, + { + "epoch": 0.2478678412038227, + "grad_norm": 0.9151923060417175, + "learning_rate": 1.7118247925716223e-05, + "loss": 0.3076, + "step": 6630 + }, + { + "epoch": 0.2480547701941725, + "grad_norm": 0.7877910137176514, + "learning_rate": 1.7114122053987318e-05, + "loss": 0.3281, + "step": 6635 + }, + { + "epoch": 0.24824169918452227, + "grad_norm": 0.47747930884361267, + "learning_rate": 1.7109993728801556e-05, + "loss": 0.258, + "step": 6640 + }, + { + "epoch": 0.24842862817487207, + "grad_norm": 0.3599812388420105, + "learning_rate": 1.710586295158269e-05, + "loss": 0.2615, + "step": 6645 + }, + { + "epoch": 0.24861555716522185, + "grad_norm": 0.34985432028770447, + "learning_rate": 1.7101729723755296e-05, + "loss": 0.3707, + "step": 6650 + }, + { + "epoch": 0.24880248615557166, + "grad_norm": 0.48352763056755066, + "learning_rate": 1.7097594046744815e-05, + "loss": 0.3285, + "step": 6655 + }, + { + "epoch": 0.24898941514592143, + "grad_norm": 0.3733879625797272, + "learning_rate": 1.7093455921977516e-05, + "loss": 0.3347, + "step": 6660 + }, + { + "epoch": 0.24917634413627124, + "grad_norm": 1.183453917503357, + "learning_rate": 1.7089315350880525e-05, + "loss": 0.395, + "step": 6665 + }, + { + "epoch": 0.24936327312662102, + "grad_norm": 0.3995364010334015, + "learning_rate": 1.70851723348818e-05, + "loss": 0.3719, + "step": 6670 + }, + { + "epoch": 0.24955020211697082, + "grad_norm": 0.2912960350513458, + "learning_rate": 1.7081026875410156e-05, + "loss": 0.2581, + "step": 6675 + }, + { + "epoch": 0.2497371311073206, + "grad_norm": 0.32783398032188416, + "learning_rate": 1.7076878973895242e-05, + "loss": 0.3313, + "step": 6680 + }, + { + "epoch": 0.2499240600976704, + "grad_norm": 0.45547330379486084, + "learning_rate": 1.7072728631767543e-05, + "loss": 0.2585, + "step": 6685 + }, + { + "epoch": 0.2501109890880202, + "grad_norm": 0.4544295370578766, + "learning_rate": 1.7068575850458402e-05, + "loss": 0.2982, + "step": 6690 + }, + { + "epoch": 0.25029791807837, + "grad_norm": 0.5787556171417236, + "learning_rate": 1.7064420631399986e-05, + "loss": 0.2737, + "step": 6695 + }, + { + "epoch": 0.25048484706871976, + "grad_norm": 0.4456428587436676, + "learning_rate": 1.706026297602532e-05, + "loss": 0.2809, + "step": 6700 + }, + { + "epoch": 0.25067177605906954, + "grad_norm": 0.27072015404701233, + "learning_rate": 1.705610288576825e-05, + "loss": 0.2661, + "step": 6705 + }, + { + "epoch": 0.2508587050494194, + "grad_norm": 0.2884485721588135, + "learning_rate": 1.7051940362063486e-05, + "loss": 0.3243, + "step": 6710 + }, + { + "epoch": 0.25104563403976915, + "grad_norm": 0.3565753400325775, + "learning_rate": 1.7047775406346548e-05, + "loss": 0.4407, + "step": 6715 + }, + { + "epoch": 0.2512325630301189, + "grad_norm": 0.5307539105415344, + "learning_rate": 1.7043608020053823e-05, + "loss": 0.3288, + "step": 6720 + }, + { + "epoch": 0.2514194920204687, + "grad_norm": 0.2853598892688751, + "learning_rate": 1.7039438204622515e-05, + "loss": 0.3014, + "step": 6725 + }, + { + "epoch": 0.25160642101081854, + "grad_norm": 0.476520836353302, + "learning_rate": 1.7035265961490673e-05, + "loss": 0.337, + "step": 6730 + }, + { + "epoch": 0.2517933500011683, + "grad_norm": 0.34905505180358887, + "learning_rate": 1.7031091292097186e-05, + "loss": 0.2644, + "step": 6735 + }, + { + "epoch": 0.2519802789915181, + "grad_norm": 0.4414178729057312, + "learning_rate": 1.702691419788178e-05, + "loss": 0.3584, + "step": 6740 + }, + { + "epoch": 0.25216720798186787, + "grad_norm": 0.4812869131565094, + "learning_rate": 1.7022734680285013e-05, + "loss": 0.2968, + "step": 6745 + }, + { + "epoch": 0.2523541369722177, + "grad_norm": 0.5577378869056702, + "learning_rate": 1.701855274074828e-05, + "loss": 0.2407, + "step": 6750 + }, + { + "epoch": 0.2525410659625675, + "grad_norm": 0.3004097044467926, + "learning_rate": 1.701436838071382e-05, + "loss": 0.2893, + "step": 6755 + }, + { + "epoch": 0.25272799495291726, + "grad_norm": 0.5560919046401978, + "learning_rate": 1.7010181601624687e-05, + "loss": 0.305, + "step": 6760 + }, + { + "epoch": 0.25291492394326703, + "grad_norm": 0.5027373433113098, + "learning_rate": 1.700599240492479e-05, + "loss": 0.2415, + "step": 6765 + }, + { + "epoch": 0.25310185293361687, + "grad_norm": 0.2853597104549408, + "learning_rate": 1.7001800792058856e-05, + "loss": 0.236, + "step": 6770 + }, + { + "epoch": 0.25328878192396664, + "grad_norm": 0.36475807428359985, + "learning_rate": 1.6997606764472457e-05, + "loss": 0.2476, + "step": 6775 + }, + { + "epoch": 0.2534757109143164, + "grad_norm": 0.2956780791282654, + "learning_rate": 1.6993410323611993e-05, + "loss": 0.3781, + "step": 6780 + }, + { + "epoch": 0.2536626399046662, + "grad_norm": 0.3027787208557129, + "learning_rate": 1.6989211470924694e-05, + "loss": 0.2951, + "step": 6785 + }, + { + "epoch": 0.25384956889501603, + "grad_norm": 0.36557620763778687, + "learning_rate": 1.6985010207858624e-05, + "loss": 0.2903, + "step": 6790 + }, + { + "epoch": 0.2540364978853658, + "grad_norm": 0.2095050811767578, + "learning_rate": 1.6980806535862683e-05, + "loss": 0.2859, + "step": 6795 + }, + { + "epoch": 0.2542234268757156, + "grad_norm": 0.4164871573448181, + "learning_rate": 1.6976600456386593e-05, + "loss": 0.24, + "step": 6800 + }, + { + "epoch": 0.25441035586606536, + "grad_norm": 0.2171694040298462, + "learning_rate": 1.6972391970880906e-05, + "loss": 0.4062, + "step": 6805 + }, + { + "epoch": 0.2545972848564152, + "grad_norm": 0.6664031744003296, + "learning_rate": 1.6968181080797012e-05, + "loss": 0.2982, + "step": 6810 + }, + { + "epoch": 0.254784213846765, + "grad_norm": 0.3303912878036499, + "learning_rate": 1.6963967787587133e-05, + "loss": 0.2349, + "step": 6815 + }, + { + "epoch": 0.25497114283711475, + "grad_norm": 0.42287972569465637, + "learning_rate": 1.6959752092704302e-05, + "loss": 0.3224, + "step": 6820 + }, + { + "epoch": 0.2551580718274645, + "grad_norm": 0.5297410488128662, + "learning_rate": 1.69555339976024e-05, + "loss": 0.3097, + "step": 6825 + }, + { + "epoch": 0.25534500081781436, + "grad_norm": 0.44122767448425293, + "learning_rate": 1.695131350373612e-05, + "loss": 0.2884, + "step": 6830 + }, + { + "epoch": 0.25553192980816414, + "grad_norm": 0.4497906267642975, + "learning_rate": 1.6947090612560995e-05, + "loss": 0.2767, + "step": 6835 + }, + { + "epoch": 0.2557188587985139, + "grad_norm": 0.383256196975708, + "learning_rate": 1.6942865325533374e-05, + "loss": 0.3156, + "step": 6840 + }, + { + "epoch": 0.2559057877888637, + "grad_norm": 0.45405879616737366, + "learning_rate": 1.6938637644110442e-05, + "loss": 0.3549, + "step": 6845 + }, + { + "epoch": 0.25609271677921347, + "grad_norm": 0.26836884021759033, + "learning_rate": 1.6934407569750208e-05, + "loss": 0.265, + "step": 6850 + }, + { + "epoch": 0.2562796457695633, + "grad_norm": 0.42300936579704285, + "learning_rate": 1.6930175103911492e-05, + "loss": 0.3418, + "step": 6855 + }, + { + "epoch": 0.2564665747599131, + "grad_norm": 0.4314024746417999, + "learning_rate": 1.6925940248053964e-05, + "loss": 0.3864, + "step": 6860 + }, + { + "epoch": 0.25665350375026286, + "grad_norm": 0.4368232488632202, + "learning_rate": 1.6921703003638094e-05, + "loss": 0.3285, + "step": 6865 + }, + { + "epoch": 0.25684043274061263, + "grad_norm": 0.33699876070022583, + "learning_rate": 1.691746337212519e-05, + "loss": 0.3067, + "step": 6870 + }, + { + "epoch": 0.25702736173096247, + "grad_norm": 0.28076720237731934, + "learning_rate": 1.691322135497738e-05, + "loss": 0.2381, + "step": 6875 + }, + { + "epoch": 0.25721429072131224, + "grad_norm": 0.5777077674865723, + "learning_rate": 1.6908976953657612e-05, + "loss": 0.2946, + "step": 6880 + }, + { + "epoch": 0.257401219711662, + "grad_norm": 0.43163543939590454, + "learning_rate": 1.690473016962966e-05, + "loss": 0.307, + "step": 6885 + }, + { + "epoch": 0.2575881487020118, + "grad_norm": 0.4134599268436432, + "learning_rate": 1.6900481004358123e-05, + "loss": 0.2846, + "step": 6890 + }, + { + "epoch": 0.25777507769236163, + "grad_norm": 0.40419724583625793, + "learning_rate": 1.6896229459308405e-05, + "loss": 0.2843, + "step": 6895 + }, + { + "epoch": 0.2579620066827114, + "grad_norm": 0.4465784728527069, + "learning_rate": 1.6891975535946753e-05, + "loss": 0.277, + "step": 6900 + }, + { + "epoch": 0.2581489356730612, + "grad_norm": 0.39851880073547363, + "learning_rate": 1.6887719235740216e-05, + "loss": 0.2687, + "step": 6905 + }, + { + "epoch": 0.25833586466341096, + "grad_norm": 0.30198317766189575, + "learning_rate": 1.6883460560156673e-05, + "loss": 0.2999, + "step": 6910 + }, + { + "epoch": 0.2585227936537608, + "grad_norm": 0.43067798018455505, + "learning_rate": 1.687919951066482e-05, + "loss": 0.3473, + "step": 6915 + }, + { + "epoch": 0.2587097226441106, + "grad_norm": 0.2943825423717499, + "learning_rate": 1.687493608873417e-05, + "loss": 0.3303, + "step": 6920 + }, + { + "epoch": 0.25889665163446035, + "grad_norm": 0.3708643317222595, + "learning_rate": 1.6870670295835055e-05, + "loss": 0.3585, + "step": 6925 + }, + { + "epoch": 0.25908358062481013, + "grad_norm": 0.45866164565086365, + "learning_rate": 1.6866402133438623e-05, + "loss": 0.3606, + "step": 6930 + }, + { + "epoch": 0.25927050961515996, + "grad_norm": 0.4115564227104187, + "learning_rate": 1.6862131603016844e-05, + "loss": 0.2645, + "step": 6935 + }, + { + "epoch": 0.25945743860550974, + "grad_norm": 0.21493466198444366, + "learning_rate": 1.68578587060425e-05, + "loss": 0.312, + "step": 6940 + }, + { + "epoch": 0.2596443675958595, + "grad_norm": 0.8563496470451355, + "learning_rate": 1.6853583443989186e-05, + "loss": 0.362, + "step": 6945 + }, + { + "epoch": 0.2598312965862093, + "grad_norm": 0.36561504006385803, + "learning_rate": 1.684930581833133e-05, + "loss": 0.3779, + "step": 6950 + }, + { + "epoch": 0.2600182255765591, + "grad_norm": 0.37505069375038147, + "learning_rate": 1.6845025830544147e-05, + "loss": 0.301, + "step": 6955 + }, + { + "epoch": 0.2602051545669089, + "grad_norm": 0.45678603649139404, + "learning_rate": 1.68407434821037e-05, + "loss": 0.3282, + "step": 6960 + }, + { + "epoch": 0.2603920835572587, + "grad_norm": 0.5878984332084656, + "learning_rate": 1.6836458774486827e-05, + "loss": 0.3805, + "step": 6965 + }, + { + "epoch": 0.26057901254760846, + "grad_norm": 0.20040619373321533, + "learning_rate": 1.683217170917122e-05, + "loss": 0.2674, + "step": 6970 + }, + { + "epoch": 0.2607659415379583, + "grad_norm": 0.45022228360176086, + "learning_rate": 1.6827882287635353e-05, + "loss": 0.315, + "step": 6975 + }, + { + "epoch": 0.26095287052830807, + "grad_norm": 0.39582857489585876, + "learning_rate": 1.682359051135853e-05, + "loss": 0.3558, + "step": 6980 + }, + { + "epoch": 0.26113979951865784, + "grad_norm": 0.5623447895050049, + "learning_rate": 1.681929638182086e-05, + "loss": 0.2928, + "step": 6985 + }, + { + "epoch": 0.2613267285090076, + "grad_norm": 0.35395896434783936, + "learning_rate": 1.6814999900503265e-05, + "loss": 0.3276, + "step": 6990 + }, + { + "epoch": 0.26151365749935745, + "grad_norm": 0.2845790684223175, + "learning_rate": 1.681070106888748e-05, + "loss": 0.3025, + "step": 6995 + }, + { + "epoch": 0.26170058648970723, + "grad_norm": 0.3993929326534271, + "learning_rate": 1.6806399888456043e-05, + "loss": 0.327, + "step": 7000 + }, + { + "epoch": 0.261887515480057, + "grad_norm": 0.23860380053520203, + "learning_rate": 1.6802096360692316e-05, + "loss": 0.3101, + "step": 7005 + }, + { + "epoch": 0.2620744444704068, + "grad_norm": 0.20932121574878693, + "learning_rate": 1.679779048708046e-05, + "loss": 0.2384, + "step": 7010 + }, + { + "epoch": 0.2622613734607566, + "grad_norm": 0.4419725239276886, + "learning_rate": 1.6793482269105446e-05, + "loss": 0.2384, + "step": 7015 + }, + { + "epoch": 0.2624483024511064, + "grad_norm": 0.30131080746650696, + "learning_rate": 1.6789171708253052e-05, + "loss": 0.3222, + "step": 7020 + }, + { + "epoch": 0.2626352314414562, + "grad_norm": 0.2808610498905182, + "learning_rate": 1.6784858806009875e-05, + "loss": 0.249, + "step": 7025 + }, + { + "epoch": 0.26282216043180595, + "grad_norm": 0.4096542298793793, + "learning_rate": 1.67805435638633e-05, + "loss": 0.2912, + "step": 7030 + }, + { + "epoch": 0.2630090894221558, + "grad_norm": 0.32321956753730774, + "learning_rate": 1.6776225983301543e-05, + "loss": 0.2665, + "step": 7035 + }, + { + "epoch": 0.26319601841250556, + "grad_norm": 0.27172431349754333, + "learning_rate": 1.6771906065813607e-05, + "loss": 0.2614, + "step": 7040 + }, + { + "epoch": 0.26338294740285534, + "grad_norm": 0.2878829836845398, + "learning_rate": 1.676758381288931e-05, + "loss": 0.2974, + "step": 7045 + }, + { + "epoch": 0.2635698763932051, + "grad_norm": 0.3150484263896942, + "learning_rate": 1.6763259226019267e-05, + "loss": 0.2796, + "step": 7050 + }, + { + "epoch": 0.26375680538355495, + "grad_norm": 0.39884239435195923, + "learning_rate": 1.6758932306694913e-05, + "loss": 0.261, + "step": 7055 + }, + { + "epoch": 0.2639437343739047, + "grad_norm": 0.42523136734962463, + "learning_rate": 1.6754603056408473e-05, + "loss": 0.3706, + "step": 7060 + }, + { + "epoch": 0.2641306633642545, + "grad_norm": 0.5379177927970886, + "learning_rate": 1.675027147665298e-05, + "loss": 0.275, + "step": 7065 + }, + { + "epoch": 0.2643175923546043, + "grad_norm": 1.383110761642456, + "learning_rate": 1.674593756892228e-05, + "loss": 0.2515, + "step": 7070 + }, + { + "epoch": 0.2645045213449541, + "grad_norm": 0.3739633560180664, + "learning_rate": 1.6741601334711004e-05, + "loss": 0.3131, + "step": 7075 + }, + { + "epoch": 0.2646914503353039, + "grad_norm": 0.485005646944046, + "learning_rate": 1.67372627755146e-05, + "loss": 0.2868, + "step": 7080 + }, + { + "epoch": 0.26487837932565367, + "grad_norm": 0.4296754002571106, + "learning_rate": 1.6732921892829313e-05, + "loss": 0.3443, + "step": 7085 + }, + { + "epoch": 0.26506530831600345, + "grad_norm": 0.6618226170539856, + "learning_rate": 1.6728578688152186e-05, + "loss": 0.3642, + "step": 7090 + }, + { + "epoch": 0.2652522373063532, + "grad_norm": 0.3030281662940979, + "learning_rate": 1.6724233162981067e-05, + "loss": 0.2784, + "step": 7095 + }, + { + "epoch": 0.26543916629670306, + "grad_norm": 0.5461763143539429, + "learning_rate": 1.6719885318814604e-05, + "loss": 0.4215, + "step": 7100 + }, + { + "epoch": 0.26562609528705283, + "grad_norm": 0.38784629106521606, + "learning_rate": 1.6715535157152244e-05, + "loss": 0.3164, + "step": 7105 + }, + { + "epoch": 0.2658130242774026, + "grad_norm": 0.46060433983802795, + "learning_rate": 1.6711182679494232e-05, + "loss": 0.2347, + "step": 7110 + }, + { + "epoch": 0.2659999532677524, + "grad_norm": 0.7013404369354248, + "learning_rate": 1.6706827887341613e-05, + "loss": 0.2908, + "step": 7115 + }, + { + "epoch": 0.2661868822581022, + "grad_norm": 0.7355599999427795, + "learning_rate": 1.6702470782196237e-05, + "loss": 0.2816, + "step": 7120 + }, + { + "epoch": 0.266373811248452, + "grad_norm": 0.23429067432880402, + "learning_rate": 1.6698111365560733e-05, + "loss": 0.3136, + "step": 7125 + }, + { + "epoch": 0.2665607402388018, + "grad_norm": 0.7480411529541016, + "learning_rate": 1.6693749638938544e-05, + "loss": 0.2388, + "step": 7130 + }, + { + "epoch": 0.26674766922915155, + "grad_norm": 0.6238270998001099, + "learning_rate": 1.6689385603833907e-05, + "loss": 0.4418, + "step": 7135 + }, + { + "epoch": 0.2669345982195014, + "grad_norm": 0.15828560292720795, + "learning_rate": 1.6685019261751848e-05, + "loss": 0.251, + "step": 7140 + }, + { + "epoch": 0.26712152720985116, + "grad_norm": 0.44209611415863037, + "learning_rate": 1.6680650614198194e-05, + "loss": 0.2724, + "step": 7145 + }, + { + "epoch": 0.26730845620020094, + "grad_norm": 0.49470680952072144, + "learning_rate": 1.667627966267957e-05, + "loss": 0.2514, + "step": 7150 + }, + { + "epoch": 0.2674953851905507, + "grad_norm": 0.3461754620075226, + "learning_rate": 1.6671906408703394e-05, + "loss": 0.2894, + "step": 7155 + }, + { + "epoch": 0.26768231418090055, + "grad_norm": 0.31336525082588196, + "learning_rate": 1.6667530853777865e-05, + "loss": 0.2696, + "step": 7160 + }, + { + "epoch": 0.2678692431712503, + "grad_norm": 0.5694870352745056, + "learning_rate": 1.6663152999411998e-05, + "loss": 0.2751, + "step": 7165 + }, + { + "epoch": 0.2680561721616001, + "grad_norm": 0.8204647898674011, + "learning_rate": 1.6658772847115584e-05, + "loss": 0.4243, + "step": 7170 + }, + { + "epoch": 0.2682431011519499, + "grad_norm": 0.44578155875205994, + "learning_rate": 1.6654390398399213e-05, + "loss": 0.2863, + "step": 7175 + }, + { + "epoch": 0.2684300301422997, + "grad_norm": 0.42481744289398193, + "learning_rate": 1.6650005654774265e-05, + "loss": 0.2577, + "step": 7180 + }, + { + "epoch": 0.2686169591326495, + "grad_norm": 0.5261092782020569, + "learning_rate": 1.6645618617752914e-05, + "loss": 0.2702, + "step": 7185 + }, + { + "epoch": 0.26880388812299927, + "grad_norm": 0.468940794467926, + "learning_rate": 1.6641229288848123e-05, + "loss": 0.3976, + "step": 7190 + }, + { + "epoch": 0.26899081711334905, + "grad_norm": 0.5336934328079224, + "learning_rate": 1.6636837669573647e-05, + "loss": 0.3726, + "step": 7195 + }, + { + "epoch": 0.2691777461036989, + "grad_norm": 0.2591412365436554, + "learning_rate": 1.6632443761444027e-05, + "loss": 0.3307, + "step": 7200 + }, + { + "epoch": 0.26936467509404866, + "grad_norm": 0.5266516804695129, + "learning_rate": 1.6628047565974594e-05, + "loss": 0.307, + "step": 7205 + }, + { + "epoch": 0.26955160408439843, + "grad_norm": 0.38717594742774963, + "learning_rate": 1.6623649084681477e-05, + "loss": 0.3238, + "step": 7210 + }, + { + "epoch": 0.2697385330747482, + "grad_norm": 0.17658643424510956, + "learning_rate": 1.6619248319081583e-05, + "loss": 0.2985, + "step": 7215 + }, + { + "epoch": 0.26992546206509804, + "grad_norm": 0.30146273970603943, + "learning_rate": 1.6614845270692606e-05, + "loss": 0.294, + "step": 7220 + }, + { + "epoch": 0.2701123910554478, + "grad_norm": 0.6402953863143921, + "learning_rate": 1.6610439941033038e-05, + "loss": 0.2622, + "step": 7225 + }, + { + "epoch": 0.2702993200457976, + "grad_norm": 0.5695781111717224, + "learning_rate": 1.6606032331622148e-05, + "loss": 0.2279, + "step": 7230 + }, + { + "epoch": 0.2704862490361474, + "grad_norm": 0.23263224959373474, + "learning_rate": 1.6601622443979987e-05, + "loss": 0.2919, + "step": 7235 + }, + { + "epoch": 0.2706731780264972, + "grad_norm": 0.4788752496242523, + "learning_rate": 1.659721027962741e-05, + "loss": 0.2335, + "step": 7240 + }, + { + "epoch": 0.270860107016847, + "grad_norm": 0.4149765372276306, + "learning_rate": 1.659279584008604e-05, + "loss": 0.3214, + "step": 7245 + }, + { + "epoch": 0.27104703600719676, + "grad_norm": 0.5265377759933472, + "learning_rate": 1.6588379126878293e-05, + "loss": 0.3483, + "step": 7250 + }, + { + "epoch": 0.27123396499754654, + "grad_norm": 0.6144738793373108, + "learning_rate": 1.6583960141527367e-05, + "loss": 0.424, + "step": 7255 + }, + { + "epoch": 0.27142089398789637, + "grad_norm": 0.3015303909778595, + "learning_rate": 1.6579538885557242e-05, + "loss": 0.343, + "step": 7260 + }, + { + "epoch": 0.27160782297824615, + "grad_norm": 0.4408928155899048, + "learning_rate": 1.6575115360492683e-05, + "loss": 0.2704, + "step": 7265 + }, + { + "epoch": 0.2717947519685959, + "grad_norm": 0.3285323679447174, + "learning_rate": 1.6570689567859237e-05, + "loss": 0.2789, + "step": 7270 + }, + { + "epoch": 0.2719816809589457, + "grad_norm": 0.09705019742250443, + "learning_rate": 1.6566261509183232e-05, + "loss": 0.3737, + "step": 7275 + }, + { + "epoch": 0.27216860994929554, + "grad_norm": 0.24546971917152405, + "learning_rate": 1.6561831185991782e-05, + "loss": 0.2472, + "step": 7280 + }, + { + "epoch": 0.2723555389396453, + "grad_norm": 0.1780613362789154, + "learning_rate": 1.6557398599812774e-05, + "loss": 0.2864, + "step": 7285 + }, + { + "epoch": 0.2725424679299951, + "grad_norm": 0.5108126997947693, + "learning_rate": 1.655296375217488e-05, + "loss": 0.3593, + "step": 7290 + }, + { + "epoch": 0.27272939692034487, + "grad_norm": 0.14681376516819, + "learning_rate": 1.654852664460756e-05, + "loss": 0.318, + "step": 7295 + }, + { + "epoch": 0.2729163259106947, + "grad_norm": 0.42053887248039246, + "learning_rate": 1.6544087278641037e-05, + "loss": 0.2802, + "step": 7300 + }, + { + "epoch": 0.2731032549010445, + "grad_norm": 0.5258027911186218, + "learning_rate": 1.653964565580632e-05, + "loss": 0.3422, + "step": 7305 + }, + { + "epoch": 0.27329018389139426, + "grad_norm": 0.2713332772254944, + "learning_rate": 1.6535201777635206e-05, + "loss": 0.3497, + "step": 7310 + }, + { + "epoch": 0.27347711288174403, + "grad_norm": 0.5129302740097046, + "learning_rate": 1.6530755645660254e-05, + "loss": 0.2799, + "step": 7315 + }, + { + "epoch": 0.2736640418720938, + "grad_norm": 0.33113789558410645, + "learning_rate": 1.6526307261414812e-05, + "loss": 0.351, + "step": 7320 + }, + { + "epoch": 0.27385097086244364, + "grad_norm": 0.3590730130672455, + "learning_rate": 1.6521856626432992e-05, + "loss": 0.3425, + "step": 7325 + }, + { + "epoch": 0.2740378998527934, + "grad_norm": 0.2738291323184967, + "learning_rate": 1.65174037422497e-05, + "loss": 0.3775, + "step": 7330 + }, + { + "epoch": 0.2742248288431432, + "grad_norm": 0.3311813771724701, + "learning_rate": 1.6512948610400606e-05, + "loss": 0.3176, + "step": 7335 + }, + { + "epoch": 0.274411757833493, + "grad_norm": 0.2696897089481354, + "learning_rate": 1.6508491232422153e-05, + "loss": 0.314, + "step": 7340 + }, + { + "epoch": 0.2745986868238428, + "grad_norm": 0.2976570725440979, + "learning_rate": 1.6504031609851567e-05, + "loss": 0.4029, + "step": 7345 + }, + { + "epoch": 0.2747856158141926, + "grad_norm": 0.25655868649482727, + "learning_rate": 1.6499569744226843e-05, + "loss": 0.3326, + "step": 7350 + }, + { + "epoch": 0.27497254480454236, + "grad_norm": 0.21505354344844818, + "learning_rate": 1.649510563708675e-05, + "loss": 0.2943, + "step": 7355 + }, + { + "epoch": 0.27515947379489214, + "grad_norm": 0.4883136749267578, + "learning_rate": 1.6490639289970834e-05, + "loss": 0.2989, + "step": 7360 + }, + { + "epoch": 0.275346402785242, + "grad_norm": 0.26657021045684814, + "learning_rate": 1.6486170704419402e-05, + "loss": 0.2817, + "step": 7365 + }, + { + "epoch": 0.27553333177559175, + "grad_norm": 0.40542852878570557, + "learning_rate": 1.648169988197355e-05, + "loss": 0.2866, + "step": 7370 + }, + { + "epoch": 0.2757202607659415, + "grad_norm": 0.901739776134491, + "learning_rate": 1.647722682417513e-05, + "loss": 0.2885, + "step": 7375 + }, + { + "epoch": 0.2759071897562913, + "grad_norm": 0.42218196392059326, + "learning_rate": 1.6472751532566777e-05, + "loss": 0.225, + "step": 7380 + }, + { + "epoch": 0.27609411874664114, + "grad_norm": 0.5886632800102234, + "learning_rate": 1.6468274008691888e-05, + "loss": 0.3544, + "step": 7385 + }, + { + "epoch": 0.2762810477369909, + "grad_norm": 0.4941720962524414, + "learning_rate": 1.646379425409463e-05, + "loss": 0.3206, + "step": 7390 + }, + { + "epoch": 0.2764679767273407, + "grad_norm": 0.3011486530303955, + "learning_rate": 1.6459312270319946e-05, + "loss": 0.2848, + "step": 7395 + }, + { + "epoch": 0.27665490571769047, + "grad_norm": 0.5940350890159607, + "learning_rate": 1.6454828058913544e-05, + "loss": 0.3851, + "step": 7400 + }, + { + "epoch": 0.2768418347080403, + "grad_norm": 0.41424158215522766, + "learning_rate": 1.64503416214219e-05, + "loss": 0.3082, + "step": 7405 + }, + { + "epoch": 0.2770287636983901, + "grad_norm": 0.43419891595840454, + "learning_rate": 1.6445852959392257e-05, + "loss": 0.2973, + "step": 7410 + }, + { + "epoch": 0.27721569268873986, + "grad_norm": 0.4673122465610504, + "learning_rate": 1.644136207437262e-05, + "loss": 0.2796, + "step": 7415 + }, + { + "epoch": 0.27740262167908963, + "grad_norm": 0.5394338965415955, + "learning_rate": 1.6436868967911777e-05, + "loss": 0.4372, + "step": 7420 + }, + { + "epoch": 0.27758955066943947, + "grad_norm": 0.39527252316474915, + "learning_rate": 1.6432373641559266e-05, + "loss": 0.2922, + "step": 7425 + }, + { + "epoch": 0.27777647965978924, + "grad_norm": 0.35765504837036133, + "learning_rate": 1.6427876096865394e-05, + "loss": 0.3001, + "step": 7430 + }, + { + "epoch": 0.277963408650139, + "grad_norm": 0.6142287254333496, + "learning_rate": 1.642337633538124e-05, + "loss": 0.3109, + "step": 7435 + }, + { + "epoch": 0.2781503376404888, + "grad_norm": 0.45784667134284973, + "learning_rate": 1.641887435865864e-05, + "loss": 0.3899, + "step": 7440 + }, + { + "epoch": 0.27833726663083863, + "grad_norm": 0.3831481337547302, + "learning_rate": 1.64143701682502e-05, + "loss": 0.2393, + "step": 7445 + }, + { + "epoch": 0.2785241956211884, + "grad_norm": 0.628398060798645, + "learning_rate": 1.6409863765709282e-05, + "loss": 0.3072, + "step": 7450 + }, + { + "epoch": 0.2787111246115382, + "grad_norm": 0.3460518717765808, + "learning_rate": 1.6405355152590018e-05, + "loss": 0.3902, + "step": 7455 + }, + { + "epoch": 0.27889805360188796, + "grad_norm": 0.48453855514526367, + "learning_rate": 1.64008443304473e-05, + "loss": 0.339, + "step": 7460 + }, + { + "epoch": 0.2790849825922378, + "grad_norm": 0.4171224534511566, + "learning_rate": 1.6396331300836778e-05, + "loss": 0.358, + "step": 7465 + }, + { + "epoch": 0.2792719115825876, + "grad_norm": 0.3539869487285614, + "learning_rate": 1.6391816065314865e-05, + "loss": 0.3195, + "step": 7470 + }, + { + "epoch": 0.27945884057293735, + "grad_norm": 0.39307695627212524, + "learning_rate": 1.6387298625438743e-05, + "loss": 0.378, + "step": 7475 + }, + { + "epoch": 0.2796457695632871, + "grad_norm": 0.28309181332588196, + "learning_rate": 1.6382778982766347e-05, + "loss": 0.2541, + "step": 7480 + }, + { + "epoch": 0.27983269855363696, + "grad_norm": 0.4474238455295563, + "learning_rate": 1.6378257138856365e-05, + "loss": 0.2949, + "step": 7485 + }, + { + "epoch": 0.28001962754398674, + "grad_norm": 0.5505223274230957, + "learning_rate": 1.6373733095268258e-05, + "loss": 0.3328, + "step": 7490 + }, + { + "epoch": 0.2802065565343365, + "grad_norm": 0.3556166887283325, + "learning_rate": 1.636920685356224e-05, + "loss": 0.3287, + "step": 7495 + }, + { + "epoch": 0.2803934855246863, + "grad_norm": 0.3573717772960663, + "learning_rate": 1.6364678415299274e-05, + "loss": 0.363, + "step": 7500 + }, + { + "epoch": 0.2805804145150361, + "grad_norm": 0.3663536608219147, + "learning_rate": 1.6360147782041097e-05, + "loss": 0.3575, + "step": 7505 + }, + { + "epoch": 0.2807673435053859, + "grad_norm": 0.26305726170539856, + "learning_rate": 1.6355614955350187e-05, + "loss": 0.3127, + "step": 7510 + }, + { + "epoch": 0.2809542724957357, + "grad_norm": 0.5720646977424622, + "learning_rate": 1.6351079936789792e-05, + "loss": 0.377, + "step": 7515 + }, + { + "epoch": 0.28114120148608546, + "grad_norm": 0.3031104505062103, + "learning_rate": 1.634654272792391e-05, + "loss": 0.3199, + "step": 7520 + }, + { + "epoch": 0.2813281304764353, + "grad_norm": 0.4231772720813751, + "learning_rate": 1.6342003330317295e-05, + "loss": 0.3691, + "step": 7525 + }, + { + "epoch": 0.28151505946678507, + "grad_norm": 0.4373528063297272, + "learning_rate": 1.6337461745535446e-05, + "loss": 0.3065, + "step": 7530 + }, + { + "epoch": 0.28170198845713484, + "grad_norm": 0.3790377676486969, + "learning_rate": 1.6332917975144638e-05, + "loss": 0.3129, + "step": 7535 + }, + { + "epoch": 0.2818889174474846, + "grad_norm": 0.5475689172744751, + "learning_rate": 1.632837202071188e-05, + "loss": 0.3161, + "step": 7540 + }, + { + "epoch": 0.28207584643783445, + "grad_norm": 0.3444899320602417, + "learning_rate": 1.6323823883804942e-05, + "loss": 0.3612, + "step": 7545 + }, + { + "epoch": 0.28226277542818423, + "grad_norm": 0.38195663690567017, + "learning_rate": 1.631927356599235e-05, + "loss": 0.3729, + "step": 7550 + }, + { + "epoch": 0.282449704418534, + "grad_norm": 0.3962342143058777, + "learning_rate": 1.6314721068843367e-05, + "loss": 0.3536, + "step": 7555 + }, + { + "epoch": 0.2826366334088838, + "grad_norm": 0.41847673058509827, + "learning_rate": 1.6310166393928036e-05, + "loss": 0.3601, + "step": 7560 + }, + { + "epoch": 0.28282356239923356, + "grad_norm": 0.4625031650066376, + "learning_rate": 1.630560954281712e-05, + "loss": 0.3712, + "step": 7565 + }, + { + "epoch": 0.2830104913895834, + "grad_norm": 0.24907182157039642, + "learning_rate": 1.6301050517082154e-05, + "loss": 0.2722, + "step": 7570 + }, + { + "epoch": 0.2831974203799332, + "grad_norm": 0.4163358211517334, + "learning_rate": 1.629648931829541e-05, + "loss": 0.3019, + "step": 7575 + }, + { + "epoch": 0.28338434937028295, + "grad_norm": 0.24772894382476807, + "learning_rate": 1.6291925948029918e-05, + "loss": 0.2406, + "step": 7580 + }, + { + "epoch": 0.28357127836063273, + "grad_norm": 0.42197006940841675, + "learning_rate": 1.6287360407859452e-05, + "loss": 0.2812, + "step": 7585 + }, + { + "epoch": 0.28375820735098256, + "grad_norm": 0.38658830523490906, + "learning_rate": 1.628279269935854e-05, + "loss": 0.3223, + "step": 7590 + }, + { + "epoch": 0.28394513634133234, + "grad_norm": 0.6128913164138794, + "learning_rate": 1.627822282410245e-05, + "loss": 0.3035, + "step": 7595 + }, + { + "epoch": 0.2841320653316821, + "grad_norm": 0.44375917315483093, + "learning_rate": 1.62736507836672e-05, + "loss": 0.3087, + "step": 7600 + }, + { + "epoch": 0.2843189943220319, + "grad_norm": 0.35817980766296387, + "learning_rate": 1.626907657962956e-05, + "loss": 0.2697, + "step": 7605 + }, + { + "epoch": 0.2845059233123817, + "grad_norm": 0.3279922902584076, + "learning_rate": 1.6264500213567038e-05, + "loss": 0.2629, + "step": 7610 + }, + { + "epoch": 0.2846928523027315, + "grad_norm": 0.4176391363143921, + "learning_rate": 1.625992168705789e-05, + "loss": 0.2812, + "step": 7615 + }, + { + "epoch": 0.2848797812930813, + "grad_norm": 0.5126670002937317, + "learning_rate": 1.6255341001681125e-05, + "loss": 0.2976, + "step": 7620 + }, + { + "epoch": 0.28506671028343106, + "grad_norm": 0.40457814931869507, + "learning_rate": 1.625075815901649e-05, + "loss": 0.258, + "step": 7625 + }, + { + "epoch": 0.2852536392737809, + "grad_norm": 0.6561230421066284, + "learning_rate": 1.624617316064447e-05, + "loss": 0.3499, + "step": 7630 + }, + { + "epoch": 0.28544056826413067, + "grad_norm": 0.17372195422649384, + "learning_rate": 1.62415860081463e-05, + "loss": 0.2878, + "step": 7635 + }, + { + "epoch": 0.28562749725448044, + "grad_norm": 0.3161996603012085, + "learning_rate": 1.6236996703103963e-05, + "loss": 0.2953, + "step": 7640 + }, + { + "epoch": 0.2858144262448302, + "grad_norm": 0.41537266969680786, + "learning_rate": 1.6232405247100173e-05, + "loss": 0.4672, + "step": 7645 + }, + { + "epoch": 0.28600135523518005, + "grad_norm": 0.5545313358306885, + "learning_rate": 1.6227811641718392e-05, + "loss": 0.251, + "step": 7650 + }, + { + "epoch": 0.28618828422552983, + "grad_norm": 0.4810824692249298, + "learning_rate": 1.6223215888542832e-05, + "loss": 0.2838, + "step": 7655 + }, + { + "epoch": 0.2863752132158796, + "grad_norm": 0.3870065212249756, + "learning_rate": 1.6218617989158426e-05, + "loss": 0.2886, + "step": 7660 + }, + { + "epoch": 0.2865621422062294, + "grad_norm": 0.5083006024360657, + "learning_rate": 1.6214017945150863e-05, + "loss": 0.2963, + "step": 7665 + }, + { + "epoch": 0.2867490711965792, + "grad_norm": 0.4420605003833771, + "learning_rate": 1.6209415758106565e-05, + "loss": 0.3387, + "step": 7670 + }, + { + "epoch": 0.286936000186929, + "grad_norm": 0.29529669880867004, + "learning_rate": 1.620481142961269e-05, + "loss": 0.2956, + "step": 7675 + }, + { + "epoch": 0.2871229291772788, + "grad_norm": 0.25295814871788025, + "learning_rate": 1.6200204961257148e-05, + "loss": 0.2808, + "step": 7680 + }, + { + "epoch": 0.28730985816762855, + "grad_norm": 0.3369937241077423, + "learning_rate": 1.619559635462857e-05, + "loss": 0.3474, + "step": 7685 + }, + { + "epoch": 0.2874967871579784, + "grad_norm": 0.5575014352798462, + "learning_rate": 1.6190985611316336e-05, + "loss": 0.3019, + "step": 7690 + }, + { + "epoch": 0.28768371614832816, + "grad_norm": 0.5619780421257019, + "learning_rate": 1.618637273291056e-05, + "loss": 0.2889, + "step": 7695 + }, + { + "epoch": 0.28787064513867794, + "grad_norm": 0.6334633231163025, + "learning_rate": 1.6181757721002092e-05, + "loss": 0.406, + "step": 7700 + }, + { + "epoch": 0.2880575741290277, + "grad_norm": 0.27999556064605713, + "learning_rate": 1.617714057718251e-05, + "loss": 0.3111, + "step": 7705 + }, + { + "epoch": 0.28824450311937755, + "grad_norm": 0.2326452136039734, + "learning_rate": 1.6172521303044145e-05, + "loss": 0.3168, + "step": 7710 + }, + { + "epoch": 0.2884314321097273, + "grad_norm": 0.3384558856487274, + "learning_rate": 1.616789990018005e-05, + "loss": 0.3016, + "step": 7715 + }, + { + "epoch": 0.2886183611000771, + "grad_norm": 0.29935914278030396, + "learning_rate": 1.616327637018401e-05, + "loss": 0.2635, + "step": 7720 + }, + { + "epoch": 0.2888052900904269, + "grad_norm": 0.3248922526836395, + "learning_rate": 1.615865071465055e-05, + "loss": 0.3969, + "step": 7725 + }, + { + "epoch": 0.2889922190807767, + "grad_norm": 0.2685738205909729, + "learning_rate": 1.6154022935174923e-05, + "loss": 0.3624, + "step": 7730 + }, + { + "epoch": 0.2891791480711265, + "grad_norm": 0.34825339913368225, + "learning_rate": 1.614939303335312e-05, + "loss": 0.2855, + "step": 7735 + }, + { + "epoch": 0.28936607706147627, + "grad_norm": 0.41288071870803833, + "learning_rate": 1.6144761010781867e-05, + "loss": 0.2983, + "step": 7740 + }, + { + "epoch": 0.28955300605182605, + "grad_norm": 0.34375739097595215, + "learning_rate": 1.614012686905861e-05, + "loss": 0.2286, + "step": 7745 + }, + { + "epoch": 0.2897399350421759, + "grad_norm": 0.16923807561397552, + "learning_rate": 1.6135490609781534e-05, + "loss": 0.2648, + "step": 7750 + }, + { + "epoch": 0.28992686403252566, + "grad_norm": 0.391319215297699, + "learning_rate": 1.613085223454955e-05, + "loss": 0.3083, + "step": 7755 + }, + { + "epoch": 0.29011379302287543, + "grad_norm": 0.35019999742507935, + "learning_rate": 1.61262117449623e-05, + "loss": 0.2544, + "step": 7760 + }, + { + "epoch": 0.2903007220132252, + "grad_norm": 0.35723742842674255, + "learning_rate": 1.612156914262016e-05, + "loss": 0.2807, + "step": 7765 + }, + { + "epoch": 0.29048765100357504, + "grad_norm": 0.5157048106193542, + "learning_rate": 1.6116924429124222e-05, + "loss": 0.3461, + "step": 7770 + }, + { + "epoch": 0.2906745799939248, + "grad_norm": 0.504540741443634, + "learning_rate": 1.6112277606076325e-05, + "loss": 0.3168, + "step": 7775 + }, + { + "epoch": 0.2908615089842746, + "grad_norm": 0.4043572247028351, + "learning_rate": 1.6107628675079023e-05, + "loss": 0.2517, + "step": 7780 + }, + { + "epoch": 0.2910484379746244, + "grad_norm": 0.29459133744239807, + "learning_rate": 1.6102977637735587e-05, + "loss": 0.2881, + "step": 7785 + }, + { + "epoch": 0.2912353669649742, + "grad_norm": 0.20902110636234283, + "learning_rate": 1.6098324495650044e-05, + "loss": 0.3696, + "step": 7790 + }, + { + "epoch": 0.291422295955324, + "grad_norm": 0.3627723157405853, + "learning_rate": 1.609366925042712e-05, + "loss": 0.1937, + "step": 7795 + }, + { + "epoch": 0.29160922494567376, + "grad_norm": 0.4867866337299347, + "learning_rate": 1.6089011903672277e-05, + "loss": 0.362, + "step": 7800 + }, + { + "epoch": 0.29179615393602354, + "grad_norm": 0.2678326964378357, + "learning_rate": 1.6084352456991704e-05, + "loss": 0.36, + "step": 7805 + }, + { + "epoch": 0.2919830829263733, + "grad_norm": 0.7733646631240845, + "learning_rate": 1.6079690911992304e-05, + "loss": 0.2725, + "step": 7810 + }, + { + "epoch": 0.29217001191672315, + "grad_norm": 1.191166639328003, + "learning_rate": 1.6075027270281713e-05, + "loss": 0.2314, + "step": 7815 + }, + { + "epoch": 0.2923569409070729, + "grad_norm": 0.3552202880382538, + "learning_rate": 1.607036153346829e-05, + "loss": 0.3046, + "step": 7820 + }, + { + "epoch": 0.2925438698974227, + "grad_norm": 0.42656761407852173, + "learning_rate": 1.6065693703161113e-05, + "loss": 0.315, + "step": 7825 + }, + { + "epoch": 0.2927307988877725, + "grad_norm": 0.3393990099430084, + "learning_rate": 1.606102378096998e-05, + "loss": 0.2877, + "step": 7830 + }, + { + "epoch": 0.2929177278781223, + "grad_norm": 0.3915458619594574, + "learning_rate": 1.605635176850541e-05, + "loss": 0.3389, + "step": 7835 + }, + { + "epoch": 0.2931046568684721, + "grad_norm": 0.4257470369338989, + "learning_rate": 1.605167766737866e-05, + "loss": 0.317, + "step": 7840 + }, + { + "epoch": 0.29329158585882187, + "grad_norm": 0.36982858180999756, + "learning_rate": 1.604700147920168e-05, + "loss": 0.3218, + "step": 7845 + }, + { + "epoch": 0.29347851484917165, + "grad_norm": 0.4049963057041168, + "learning_rate": 1.604232320558716e-05, + "loss": 0.2894, + "step": 7850 + }, + { + "epoch": 0.2936654438395215, + "grad_norm": 0.7059416174888611, + "learning_rate": 1.6037642848148502e-05, + "loss": 0.2444, + "step": 7855 + }, + { + "epoch": 0.29385237282987126, + "grad_norm": 0.635959267616272, + "learning_rate": 1.6032960408499824e-05, + "loss": 0.2997, + "step": 7860 + }, + { + "epoch": 0.29403930182022103, + "grad_norm": 0.24313776195049286, + "learning_rate": 1.602827588825597e-05, + "loss": 0.2254, + "step": 7865 + }, + { + "epoch": 0.2942262308105708, + "grad_norm": 0.3153620660305023, + "learning_rate": 1.6023589289032494e-05, + "loss": 0.2558, + "step": 7870 + }, + { + "epoch": 0.29441315980092064, + "grad_norm": 0.45469948649406433, + "learning_rate": 1.6018900612445665e-05, + "loss": 0.2513, + "step": 7875 + }, + { + "epoch": 0.2946000887912704, + "grad_norm": 0.47144100069999695, + "learning_rate": 1.601420986011248e-05, + "loss": 0.2372, + "step": 7880 + }, + { + "epoch": 0.2947870177816202, + "grad_norm": 0.22326113283634186, + "learning_rate": 1.6009517033650643e-05, + "loss": 0.2896, + "step": 7885 + }, + { + "epoch": 0.29497394677197, + "grad_norm": 0.5031372308731079, + "learning_rate": 1.6004822134678577e-05, + "loss": 0.2962, + "step": 7890 + }, + { + "epoch": 0.2951608757623198, + "grad_norm": 0.6720777750015259, + "learning_rate": 1.6000125164815418e-05, + "loss": 0.3161, + "step": 7895 + }, + { + "epoch": 0.2953478047526696, + "grad_norm": 0.4197002351284027, + "learning_rate": 1.5995426125681014e-05, + "loss": 0.3376, + "step": 7900 + }, + { + "epoch": 0.29553473374301936, + "grad_norm": 0.13400337100028992, + "learning_rate": 1.599072501889593e-05, + "loss": 0.3168, + "step": 7905 + }, + { + "epoch": 0.29572166273336914, + "grad_norm": 0.3738100230693817, + "learning_rate": 1.598602184608144e-05, + "loss": 0.2846, + "step": 7910 + }, + { + "epoch": 0.29590859172371897, + "grad_norm": 0.5275940895080566, + "learning_rate": 1.598131660885954e-05, + "loss": 0.2793, + "step": 7915 + }, + { + "epoch": 0.29609552071406875, + "grad_norm": 0.41306841373443604, + "learning_rate": 1.597660930885293e-05, + "loss": 0.3141, + "step": 7920 + }, + { + "epoch": 0.2962824497044185, + "grad_norm": 0.2588225305080414, + "learning_rate": 1.5971899947685018e-05, + "loss": 0.2731, + "step": 7925 + }, + { + "epoch": 0.2964693786947683, + "grad_norm": 0.1754133701324463, + "learning_rate": 1.5967188526979928e-05, + "loss": 0.2432, + "step": 7930 + }, + { + "epoch": 0.29665630768511814, + "grad_norm": 0.18629756569862366, + "learning_rate": 1.5962475048362498e-05, + "loss": 0.3458, + "step": 7935 + }, + { + "epoch": 0.2968432366754679, + "grad_norm": 0.6092668175697327, + "learning_rate": 1.5957759513458274e-05, + "loss": 0.3054, + "step": 7940 + }, + { + "epoch": 0.2970301656658177, + "grad_norm": 0.3897908926010132, + "learning_rate": 1.59530419238935e-05, + "loss": 0.2734, + "step": 7945 + }, + { + "epoch": 0.29721709465616747, + "grad_norm": 0.4030907154083252, + "learning_rate": 1.5948322281295147e-05, + "loss": 0.3481, + "step": 7950 + }, + { + "epoch": 0.2974040236465173, + "grad_norm": 0.3852006793022156, + "learning_rate": 1.594360058729088e-05, + "loss": 0.2923, + "step": 7955 + }, + { + "epoch": 0.2975909526368671, + "grad_norm": 0.3748472034931183, + "learning_rate": 1.5938876843509072e-05, + "loss": 0.3313, + "step": 7960 + }, + { + "epoch": 0.29777788162721686, + "grad_norm": 0.5024619698524475, + "learning_rate": 1.5934151051578814e-05, + "loss": 0.2947, + "step": 7965 + }, + { + "epoch": 0.29796481061756663, + "grad_norm": 0.4308888018131256, + "learning_rate": 1.592942321312989e-05, + "loss": 0.2932, + "step": 7970 + }, + { + "epoch": 0.29815173960791647, + "grad_norm": 0.4150945842266083, + "learning_rate": 1.5924693329792808e-05, + "loss": 0.351, + "step": 7975 + }, + { + "epoch": 0.29833866859826624, + "grad_norm": 0.2940865755081177, + "learning_rate": 1.5919961403198752e-05, + "loss": 0.2743, + "step": 7980 + }, + { + "epoch": 0.298525597588616, + "grad_norm": 0.2742552161216736, + "learning_rate": 1.591522743497964e-05, + "loss": 0.266, + "step": 7985 + }, + { + "epoch": 0.2987125265789658, + "grad_norm": 0.5203600525856018, + "learning_rate": 1.591049142676808e-05, + "loss": 0.3469, + "step": 7990 + }, + { + "epoch": 0.29889945556931563, + "grad_norm": 0.29020723700523376, + "learning_rate": 1.590575338019738e-05, + "loss": 0.2792, + "step": 7995 + }, + { + "epoch": 0.2990863845596654, + "grad_norm": 0.5410657525062561, + "learning_rate": 1.590101329690156e-05, + "loss": 0.2447, + "step": 8000 + }, + { + "epoch": 0.2992733135500152, + "grad_norm": 0.36561673879623413, + "learning_rate": 1.589627117851534e-05, + "loss": 0.2502, + "step": 8005 + }, + { + "epoch": 0.29946024254036496, + "grad_norm": 0.2769913673400879, + "learning_rate": 1.589152702667414e-05, + "loss": 0.3428, + "step": 8010 + }, + { + "epoch": 0.2996471715307148, + "grad_norm": 0.34356051683425903, + "learning_rate": 1.5886780843014085e-05, + "loss": 0.3429, + "step": 8015 + }, + { + "epoch": 0.2998341005210646, + "grad_norm": 0.2786838710308075, + "learning_rate": 1.5882032629171993e-05, + "loss": 0.3215, + "step": 8020 + }, + { + "epoch": 0.30002102951141435, + "grad_norm": 0.46318674087524414, + "learning_rate": 1.587728238678539e-05, + "loss": 0.3538, + "step": 8025 + }, + { + "epoch": 0.3002079585017641, + "grad_norm": 0.3690614402294159, + "learning_rate": 1.5872530117492495e-05, + "loss": 0.3055, + "step": 8030 + }, + { + "epoch": 0.30039488749211396, + "grad_norm": 0.8592503070831299, + "learning_rate": 1.5867775822932233e-05, + "loss": 0.3292, + "step": 8035 + }, + { + "epoch": 0.30058181648246374, + "grad_norm": 0.4920271635055542, + "learning_rate": 1.5863019504744222e-05, + "loss": 0.2921, + "step": 8040 + }, + { + "epoch": 0.3007687454728135, + "grad_norm": 0.3858177363872528, + "learning_rate": 1.585826116456878e-05, + "loss": 0.3832, + "step": 8045 + }, + { + "epoch": 0.3009556744631633, + "grad_norm": 0.37820449471473694, + "learning_rate": 1.5853500804046926e-05, + "loss": 0.3231, + "step": 8050 + }, + { + "epoch": 0.30114260345351307, + "grad_norm": 0.24952290952205658, + "learning_rate": 1.5848738424820366e-05, + "loss": 0.3362, + "step": 8055 + }, + { + "epoch": 0.3013295324438629, + "grad_norm": 0.39352136850357056, + "learning_rate": 1.584397402853151e-05, + "loss": 0.2533, + "step": 8060 + }, + { + "epoch": 0.3015164614342127, + "grad_norm": 0.34481966495513916, + "learning_rate": 1.5839207616823468e-05, + "loss": 0.3445, + "step": 8065 + }, + { + "epoch": 0.30170339042456246, + "grad_norm": 0.6301359534263611, + "learning_rate": 1.583443919134003e-05, + "loss": 0.332, + "step": 8070 + }, + { + "epoch": 0.30189031941491223, + "grad_norm": 0.6552008986473083, + "learning_rate": 1.582966875372569e-05, + "loss": 0.3042, + "step": 8075 + }, + { + "epoch": 0.30207724840526207, + "grad_norm": 0.3802003860473633, + "learning_rate": 1.582489630562564e-05, + "loss": 0.2635, + "step": 8080 + }, + { + "epoch": 0.30226417739561184, + "grad_norm": 0.4597533941268921, + "learning_rate": 1.5820121848685758e-05, + "loss": 0.2614, + "step": 8085 + }, + { + "epoch": 0.3024511063859616, + "grad_norm": 0.4375554025173187, + "learning_rate": 1.581534538455262e-05, + "loss": 0.342, + "step": 8090 + }, + { + "epoch": 0.3026380353763114, + "grad_norm": 0.36787232756614685, + "learning_rate": 1.5810566914873487e-05, + "loss": 0.3305, + "step": 8095 + }, + { + "epoch": 0.30282496436666123, + "grad_norm": 0.342817485332489, + "learning_rate": 1.580578644129632e-05, + "loss": 0.3672, + "step": 8100 + }, + { + "epoch": 0.303011893357011, + "grad_norm": 0.8170397281646729, + "learning_rate": 1.5801003965469764e-05, + "loss": 0.2564, + "step": 8105 + }, + { + "epoch": 0.3031988223473608, + "grad_norm": 0.1437387764453888, + "learning_rate": 1.5796219489043164e-05, + "loss": 0.3617, + "step": 8110 + }, + { + "epoch": 0.30338575133771056, + "grad_norm": 0.4565275013446808, + "learning_rate": 1.5791433013666544e-05, + "loss": 0.2808, + "step": 8115 + }, + { + "epoch": 0.3035726803280604, + "grad_norm": 0.5784963369369507, + "learning_rate": 1.5786644540990622e-05, + "loss": 0.234, + "step": 8120 + }, + { + "epoch": 0.3037596093184102, + "grad_norm": 0.44726547598838806, + "learning_rate": 1.578185407266681e-05, + "loss": 0.3222, + "step": 8125 + }, + { + "epoch": 0.30394653830875995, + "grad_norm": 0.260372132062912, + "learning_rate": 1.5777061610347197e-05, + "loss": 0.2867, + "step": 8130 + }, + { + "epoch": 0.3041334672991097, + "grad_norm": 0.5115578174591064, + "learning_rate": 1.5772267155684565e-05, + "loss": 0.3603, + "step": 8135 + }, + { + "epoch": 0.30432039628945956, + "grad_norm": 0.4961742162704468, + "learning_rate": 1.576747071033239e-05, + "loss": 0.2761, + "step": 8140 + }, + { + "epoch": 0.30450732527980934, + "grad_norm": 0.07623039931058884, + "learning_rate": 1.5762672275944826e-05, + "loss": 0.3036, + "step": 8145 + }, + { + "epoch": 0.3046942542701591, + "grad_norm": 0.7987903356552124, + "learning_rate": 1.5757871854176716e-05, + "loss": 0.2701, + "step": 8150 + }, + { + "epoch": 0.3048811832605089, + "grad_norm": 0.34833961725234985, + "learning_rate": 1.5753069446683586e-05, + "loss": 0.2672, + "step": 8155 + }, + { + "epoch": 0.3050681122508587, + "grad_norm": 0.5639849305152893, + "learning_rate": 1.574826505512165e-05, + "loss": 0.3335, + "step": 8160 + }, + { + "epoch": 0.3052550412412085, + "grad_norm": 0.2684768736362457, + "learning_rate": 1.5743458681147807e-05, + "loss": 0.2879, + "step": 8165 + }, + { + "epoch": 0.3054419702315583, + "grad_norm": 0.32152795791625977, + "learning_rate": 1.5738650326419636e-05, + "loss": 0.3091, + "step": 8170 + }, + { + "epoch": 0.30562889922190806, + "grad_norm": 0.4024050533771515, + "learning_rate": 1.57338399925954e-05, + "loss": 0.3315, + "step": 8175 + }, + { + "epoch": 0.3058158282122579, + "grad_norm": 0.32922789454460144, + "learning_rate": 1.5729027681334043e-05, + "loss": 0.3058, + "step": 8180 + }, + { + "epoch": 0.30600275720260767, + "grad_norm": 0.37226831912994385, + "learning_rate": 1.57242133942952e-05, + "loss": 0.2949, + "step": 8185 + }, + { + "epoch": 0.30618968619295744, + "grad_norm": 0.5778080821037292, + "learning_rate": 1.5719397133139172e-05, + "loss": 0.3008, + "step": 8190 + }, + { + "epoch": 0.3063766151833072, + "grad_norm": 0.30780428647994995, + "learning_rate": 1.5714578899526957e-05, + "loss": 0.2412, + "step": 8195 + }, + { + "epoch": 0.30656354417365705, + "grad_norm": 0.13755351305007935, + "learning_rate": 1.5709758695120222e-05, + "loss": 0.2439, + "step": 8200 + }, + { + "epoch": 0.30675047316400683, + "grad_norm": 0.5074127316474915, + "learning_rate": 1.570493652158132e-05, + "loss": 0.3904, + "step": 8205 + }, + { + "epoch": 0.3069374021543566, + "grad_norm": 0.27860695123672485, + "learning_rate": 1.5700112380573277e-05, + "loss": 0.313, + "step": 8210 + }, + { + "epoch": 0.3071243311447064, + "grad_norm": 0.4412946403026581, + "learning_rate": 1.5695286273759805e-05, + "loss": 0.3682, + "step": 8215 + }, + { + "epoch": 0.3073112601350562, + "grad_norm": 0.402165025472641, + "learning_rate": 1.569045820280529e-05, + "loss": 0.2835, + "step": 8220 + }, + { + "epoch": 0.307498189125406, + "grad_norm": 0.49247145652770996, + "learning_rate": 1.5685628169374793e-05, + "loss": 0.2833, + "step": 8225 + }, + { + "epoch": 0.3076851181157558, + "grad_norm": 0.41755223274230957, + "learning_rate": 1.568079617513405e-05, + "loss": 0.2886, + "step": 8230 + }, + { + "epoch": 0.30787204710610555, + "grad_norm": 0.34908807277679443, + "learning_rate": 1.5675962221749478e-05, + "loss": 0.275, + "step": 8235 + }, + { + "epoch": 0.3080589760964554, + "grad_norm": 0.3702501356601715, + "learning_rate": 1.567112631088818e-05, + "loss": 0.3203, + "step": 8240 + }, + { + "epoch": 0.30824590508680516, + "grad_norm": 0.42112669348716736, + "learning_rate": 1.5666288444217915e-05, + "loss": 0.2694, + "step": 8245 + }, + { + "epoch": 0.30843283407715494, + "grad_norm": 0.29839271306991577, + "learning_rate": 1.5661448623407122e-05, + "loss": 0.288, + "step": 8250 + }, + { + "epoch": 0.3086197630675047, + "grad_norm": 0.4897206425666809, + "learning_rate": 1.565660685012492e-05, + "loss": 0.3968, + "step": 8255 + }, + { + "epoch": 0.30880669205785455, + "grad_norm": 0.6160528659820557, + "learning_rate": 1.5651763126041098e-05, + "loss": 0.2819, + "step": 8260 + }, + { + "epoch": 0.3089936210482043, + "grad_norm": 0.2819858193397522, + "learning_rate": 1.564691745282612e-05, + "loss": 0.2829, + "step": 8265 + }, + { + "epoch": 0.3091805500385541, + "grad_norm": 0.33518272638320923, + "learning_rate": 1.5642069832151116e-05, + "loss": 0.2391, + "step": 8270 + }, + { + "epoch": 0.3093674790289039, + "grad_norm": 0.3020849823951721, + "learning_rate": 1.5637220265687894e-05, + "loss": 0.2829, + "step": 8275 + }, + { + "epoch": 0.3095544080192537, + "grad_norm": 0.42169663310050964, + "learning_rate": 1.5632368755108926e-05, + "loss": 0.312, + "step": 8280 + }, + { + "epoch": 0.3097413370096035, + "grad_norm": 0.34373271465301514, + "learning_rate": 1.5627515302087362e-05, + "loss": 0.2879, + "step": 8285 + }, + { + "epoch": 0.30992826599995327, + "grad_norm": 0.4371539056301117, + "learning_rate": 1.562265990829702e-05, + "loss": 0.3221, + "step": 8290 + }, + { + "epoch": 0.31011519499030304, + "grad_norm": 0.3401534855365753, + "learning_rate": 1.5617802575412385e-05, + "loss": 0.2821, + "step": 8295 + }, + { + "epoch": 0.3103021239806528, + "grad_norm": 0.605153501033783, + "learning_rate": 1.5612943305108615e-05, + "loss": 0.3055, + "step": 8300 + }, + { + "epoch": 0.31048905297100265, + "grad_norm": 0.4191633462905884, + "learning_rate": 1.5608082099061528e-05, + "loss": 0.2807, + "step": 8305 + }, + { + "epoch": 0.31067598196135243, + "grad_norm": 0.2965903580188751, + "learning_rate": 1.560321895894762e-05, + "loss": 0.2722, + "step": 8310 + }, + { + "epoch": 0.3108629109517022, + "grad_norm": 0.39185696840286255, + "learning_rate": 1.559835388644404e-05, + "loss": 0.3032, + "step": 8315 + }, + { + "epoch": 0.311049839942052, + "grad_norm": 0.2846728265285492, + "learning_rate": 1.5593486883228617e-05, + "loss": 0.2977, + "step": 8320 + }, + { + "epoch": 0.3112367689324018, + "grad_norm": 0.35575684905052185, + "learning_rate": 1.5588617950979846e-05, + "loss": 0.3072, + "step": 8325 + }, + { + "epoch": 0.3114236979227516, + "grad_norm": 0.12662628293037415, + "learning_rate": 1.5583747091376877e-05, + "loss": 0.2764, + "step": 8330 + }, + { + "epoch": 0.3116106269131014, + "grad_norm": 0.439426451921463, + "learning_rate": 1.5578874306099533e-05, + "loss": 0.2643, + "step": 8335 + }, + { + "epoch": 0.31179755590345115, + "grad_norm": 0.13574817776679993, + "learning_rate": 1.5573999596828292e-05, + "loss": 0.2879, + "step": 8340 + }, + { + "epoch": 0.311984484893801, + "grad_norm": 0.6407355666160583, + "learning_rate": 1.5569122965244306e-05, + "loss": 0.2412, + "step": 8345 + }, + { + "epoch": 0.31217141388415076, + "grad_norm": 0.5395145416259766, + "learning_rate": 1.5564244413029385e-05, + "loss": 0.3068, + "step": 8350 + }, + { + "epoch": 0.31235834287450054, + "grad_norm": 0.17821340262889862, + "learning_rate": 1.5559363941866005e-05, + "loss": 0.4122, + "step": 8355 + }, + { + "epoch": 0.3125452718648503, + "grad_norm": 0.3062966465950012, + "learning_rate": 1.5554481553437294e-05, + "loss": 0.2104, + "step": 8360 + }, + { + "epoch": 0.31273220085520015, + "grad_norm": 0.5049282908439636, + "learning_rate": 1.5549597249427052e-05, + "loss": 0.253, + "step": 8365 + }, + { + "epoch": 0.3129191298455499, + "grad_norm": 0.49796777963638306, + "learning_rate": 1.5544711031519736e-05, + "loss": 0.2669, + "step": 8370 + }, + { + "epoch": 0.3131060588358997, + "grad_norm": 0.34029170870780945, + "learning_rate": 1.553982290140046e-05, + "loss": 0.3638, + "step": 8375 + }, + { + "epoch": 0.3132929878262495, + "grad_norm": 0.5282084941864014, + "learning_rate": 1.5534932860755e-05, + "loss": 0.2841, + "step": 8380 + }, + { + "epoch": 0.3134799168165993, + "grad_norm": 0.4704183340072632, + "learning_rate": 1.553004091126979e-05, + "loss": 0.3326, + "step": 8385 + }, + { + "epoch": 0.3136668458069491, + "grad_norm": 0.442396879196167, + "learning_rate": 1.552514705463193e-05, + "loss": 0.2584, + "step": 8390 + }, + { + "epoch": 0.31385377479729887, + "grad_norm": 0.40408533811569214, + "learning_rate": 1.5520251292529165e-05, + "loss": 0.3079, + "step": 8395 + }, + { + "epoch": 0.31404070378764865, + "grad_norm": 0.34462475776672363, + "learning_rate": 1.5515353626649905e-05, + "loss": 0.2727, + "step": 8400 + }, + { + "epoch": 0.3142276327779985, + "grad_norm": 0.043053608387708664, + "learning_rate": 1.5510454058683216e-05, + "loss": 0.2879, + "step": 8405 + }, + { + "epoch": 0.31441456176834826, + "grad_norm": 0.3124004006385803, + "learning_rate": 1.5505552590318814e-05, + "loss": 0.3576, + "step": 8410 + }, + { + "epoch": 0.31460149075869803, + "grad_norm": 0.4973233938217163, + "learning_rate": 1.5500649223247076e-05, + "loss": 0.2801, + "step": 8415 + }, + { + "epoch": 0.3147884197490478, + "grad_norm": 0.4319716989994049, + "learning_rate": 1.5495743959159034e-05, + "loss": 0.2454, + "step": 8420 + }, + { + "epoch": 0.31497534873939764, + "grad_norm": 0.4788093864917755, + "learning_rate": 1.549083679974638e-05, + "loss": 0.3537, + "step": 8425 + }, + { + "epoch": 0.3151622777297474, + "grad_norm": 0.6235319972038269, + "learning_rate": 1.548592774670144e-05, + "loss": 0.2974, + "step": 8430 + }, + { + "epoch": 0.3153492067200972, + "grad_norm": 0.3328862190246582, + "learning_rate": 1.5481016801717213e-05, + "loss": 0.37, + "step": 8435 + }, + { + "epoch": 0.315536135710447, + "grad_norm": 0.6881264448165894, + "learning_rate": 1.5476103966487345e-05, + "loss": 0.3738, + "step": 8440 + }, + { + "epoch": 0.3157230647007968, + "grad_norm": 0.4371614456176758, + "learning_rate": 1.547118924270613e-05, + "loss": 0.2477, + "step": 8445 + }, + { + "epoch": 0.3159099936911466, + "grad_norm": 0.6535786986351013, + "learning_rate": 1.546627263206851e-05, + "loss": 0.3318, + "step": 8450 + }, + { + "epoch": 0.31609692268149636, + "grad_norm": 0.5133426189422607, + "learning_rate": 1.546135413627009e-05, + "loss": 0.2697, + "step": 8455 + }, + { + "epoch": 0.31628385167184614, + "grad_norm": 0.5738974809646606, + "learning_rate": 1.5456433757007115e-05, + "loss": 0.247, + "step": 8460 + }, + { + "epoch": 0.31647078066219597, + "grad_norm": 0.12999732792377472, + "learning_rate": 1.5451511495976483e-05, + "loss": 0.2726, + "step": 8465 + }, + { + "epoch": 0.31665770965254575, + "grad_norm": 0.30150121450424194, + "learning_rate": 1.5446587354875742e-05, + "loss": 0.2792, + "step": 8470 + }, + { + "epoch": 0.3168446386428955, + "grad_norm": 0.36538806557655334, + "learning_rate": 1.544166133540309e-05, + "loss": 0.3656, + "step": 8475 + }, + { + "epoch": 0.3170315676332453, + "grad_norm": 0.5512821078300476, + "learning_rate": 1.5436733439257362e-05, + "loss": 0.2357, + "step": 8480 + }, + { + "epoch": 0.31721849662359514, + "grad_norm": 0.4331134259700775, + "learning_rate": 1.5431803668138056e-05, + "loss": 0.2833, + "step": 8485 + }, + { + "epoch": 0.3174054256139449, + "grad_norm": 0.2866262197494507, + "learning_rate": 1.5426872023745305e-05, + "loss": 0.1892, + "step": 8490 + }, + { + "epoch": 0.3175923546042947, + "grad_norm": 0.36052101850509644, + "learning_rate": 1.5421938507779893e-05, + "loss": 0.2636, + "step": 8495 + }, + { + "epoch": 0.31777928359464447, + "grad_norm": 0.4259335398674011, + "learning_rate": 1.5417003121943247e-05, + "loss": 0.2615, + "step": 8500 + }, + { + "epoch": 0.3179662125849943, + "grad_norm": 0.270948201417923, + "learning_rate": 1.541206586793744e-05, + "loss": 0.3241, + "step": 8505 + }, + { + "epoch": 0.3181531415753441, + "grad_norm": 0.2681352496147156, + "learning_rate": 1.5407126747465195e-05, + "loss": 0.2701, + "step": 8510 + }, + { + "epoch": 0.31834007056569386, + "grad_norm": 0.4917761981487274, + "learning_rate": 1.5402185762229864e-05, + "loss": 0.2454, + "step": 8515 + }, + { + "epoch": 0.31852699955604363, + "grad_norm": 0.4351048767566681, + "learning_rate": 1.539724291393546e-05, + "loss": 0.2836, + "step": 8520 + }, + { + "epoch": 0.31871392854639347, + "grad_norm": 0.7462584376335144, + "learning_rate": 1.5392298204286623e-05, + "loss": 0.2905, + "step": 8525 + }, + { + "epoch": 0.31890085753674324, + "grad_norm": 0.36334559321403503, + "learning_rate": 1.5387351634988644e-05, + "loss": 0.2733, + "step": 8530 + }, + { + "epoch": 0.319087786527093, + "grad_norm": 0.22820745408535004, + "learning_rate": 1.5382403207747453e-05, + "loss": 0.253, + "step": 8535 + }, + { + "epoch": 0.3192747155174428, + "grad_norm": 0.45713189244270325, + "learning_rate": 1.537745292426962e-05, + "loss": 0.2474, + "step": 8540 + }, + { + "epoch": 0.3194616445077926, + "grad_norm": 0.1870334893465042, + "learning_rate": 1.5372500786262357e-05, + "loss": 0.3835, + "step": 8545 + }, + { + "epoch": 0.3196485734981424, + "grad_norm": 0.6428795456886292, + "learning_rate": 1.5367546795433517e-05, + "loss": 0.3194, + "step": 8550 + }, + { + "epoch": 0.3198355024884922, + "grad_norm": 0.47562816739082336, + "learning_rate": 1.5362590953491586e-05, + "loss": 0.2485, + "step": 8555 + }, + { + "epoch": 0.32002243147884196, + "grad_norm": 0.418270468711853, + "learning_rate": 1.535763326214569e-05, + "loss": 0.318, + "step": 8560 + }, + { + "epoch": 0.32020936046919174, + "grad_norm": 0.7207987904548645, + "learning_rate": 1.53526737231056e-05, + "loss": 0.347, + "step": 8565 + }, + { + "epoch": 0.32039628945954157, + "grad_norm": 0.4449211061000824, + "learning_rate": 1.5347712338081717e-05, + "loss": 0.2959, + "step": 8570 + }, + { + "epoch": 0.32058321844989135, + "grad_norm": 0.29263216257095337, + "learning_rate": 1.5342749108785084e-05, + "loss": 0.2846, + "step": 8575 + }, + { + "epoch": 0.3207701474402411, + "grad_norm": 0.5871028900146484, + "learning_rate": 1.5337784036927367e-05, + "loss": 0.3511, + "step": 8580 + }, + { + "epoch": 0.3209570764305909, + "grad_norm": 0.3245595395565033, + "learning_rate": 1.533281712422088e-05, + "loss": 0.3302, + "step": 8585 + }, + { + "epoch": 0.32114400542094074, + "grad_norm": 0.6525892019271851, + "learning_rate": 1.5327848372378574e-05, + "loss": 0.3053, + "step": 8590 + }, + { + "epoch": 0.3213309344112905, + "grad_norm": 0.3770609498023987, + "learning_rate": 1.5322877783114027e-05, + "loss": 0.3547, + "step": 8595 + }, + { + "epoch": 0.3215178634016403, + "grad_norm": 0.4422697126865387, + "learning_rate": 1.5317905358141456e-05, + "loss": 0.2285, + "step": 8600 + }, + { + "epoch": 0.32170479239199007, + "grad_norm": 0.3939485549926758, + "learning_rate": 1.53129310991757e-05, + "loss": 0.3421, + "step": 8605 + }, + { + "epoch": 0.3218917213823399, + "grad_norm": 0.3674333095550537, + "learning_rate": 1.5307955007932243e-05, + "loss": 0.2389, + "step": 8610 + }, + { + "epoch": 0.3220786503726897, + "grad_norm": 0.27263256907463074, + "learning_rate": 1.5302977086127194e-05, + "loss": 0.2633, + "step": 8615 + }, + { + "epoch": 0.32226557936303946, + "grad_norm": 0.3871941864490509, + "learning_rate": 1.5297997335477302e-05, + "loss": 0.314, + "step": 8620 + }, + { + "epoch": 0.32245250835338923, + "grad_norm": 0.4470764398574829, + "learning_rate": 1.5293015757699935e-05, + "loss": 0.2881, + "step": 8625 + }, + { + "epoch": 0.32263943734373907, + "grad_norm": 0.6860572695732117, + "learning_rate": 1.5288032354513095e-05, + "loss": 0.3377, + "step": 8630 + }, + { + "epoch": 0.32282636633408884, + "grad_norm": 0.4774250090122223, + "learning_rate": 1.5283047127635418e-05, + "loss": 0.2989, + "step": 8635 + }, + { + "epoch": 0.3230132953244386, + "grad_norm": 0.2891339957714081, + "learning_rate": 1.5278060078786166e-05, + "loss": 0.2577, + "step": 8640 + }, + { + "epoch": 0.3232002243147884, + "grad_norm": 0.7874906659126282, + "learning_rate": 1.5273071209685227e-05, + "loss": 0.3161, + "step": 8645 + }, + { + "epoch": 0.32338715330513823, + "grad_norm": 0.3914336562156677, + "learning_rate": 1.526808052205312e-05, + "loss": 0.2692, + "step": 8650 + }, + { + "epoch": 0.323574082295488, + "grad_norm": 0.9589570164680481, + "learning_rate": 1.526308801761099e-05, + "loss": 0.2913, + "step": 8655 + }, + { + "epoch": 0.3237610112858378, + "grad_norm": 0.19636771082878113, + "learning_rate": 1.5258093698080614e-05, + "loss": 0.3008, + "step": 8660 + }, + { + "epoch": 0.32394794027618756, + "grad_norm": 0.27566975355148315, + "learning_rate": 1.5253097565184382e-05, + "loss": 0.2644, + "step": 8665 + }, + { + "epoch": 0.3241348692665374, + "grad_norm": 0.3337952494621277, + "learning_rate": 1.5248099620645321e-05, + "loss": 0.3182, + "step": 8670 + }, + { + "epoch": 0.3243217982568872, + "grad_norm": 0.30488675832748413, + "learning_rate": 1.5243099866187076e-05, + "loss": 0.3166, + "step": 8675 + }, + { + "epoch": 0.32450872724723695, + "grad_norm": 0.2742447853088379, + "learning_rate": 1.5238098303533923e-05, + "loss": 0.2807, + "step": 8680 + }, + { + "epoch": 0.3246956562375867, + "grad_norm": 0.35254549980163574, + "learning_rate": 1.5233094934410755e-05, + "loss": 0.2999, + "step": 8685 + }, + { + "epoch": 0.32488258522793656, + "grad_norm": 0.1497417390346527, + "learning_rate": 1.522808976054309e-05, + "loss": 0.3525, + "step": 8690 + }, + { + "epoch": 0.32506951421828634, + "grad_norm": 0.22152622044086456, + "learning_rate": 1.522308278365707e-05, + "loss": 0.3084, + "step": 8695 + }, + { + "epoch": 0.3252564432086361, + "grad_norm": 0.4548346996307373, + "learning_rate": 1.521807400547946e-05, + "loss": 0.3306, + "step": 8700 + }, + { + "epoch": 0.3254433721989859, + "grad_norm": 0.7274771928787231, + "learning_rate": 1.5213063427737639e-05, + "loss": 0.2431, + "step": 8705 + }, + { + "epoch": 0.3256303011893357, + "grad_norm": 0.4136773347854614, + "learning_rate": 1.5208051052159618e-05, + "loss": 0.2497, + "step": 8710 + }, + { + "epoch": 0.3258172301796855, + "grad_norm": 0.4158473014831543, + "learning_rate": 1.520303688047402e-05, + "loss": 0.3343, + "step": 8715 + }, + { + "epoch": 0.3260041591700353, + "grad_norm": 0.26390954852104187, + "learning_rate": 1.5198020914410085e-05, + "loss": 0.2489, + "step": 8720 + }, + { + "epoch": 0.32619108816038506, + "grad_norm": 0.3240726888179779, + "learning_rate": 1.5193003155697681e-05, + "loss": 0.3173, + "step": 8725 + }, + { + "epoch": 0.3263780171507349, + "grad_norm": 0.4203800857067108, + "learning_rate": 1.5187983606067284e-05, + "loss": 0.2545, + "step": 8730 + }, + { + "epoch": 0.32656494614108467, + "grad_norm": 0.40211477875709534, + "learning_rate": 1.5182962267249997e-05, + "loss": 0.2805, + "step": 8735 + }, + { + "epoch": 0.32675187513143444, + "grad_norm": 0.35995474457740784, + "learning_rate": 1.5177939140977535e-05, + "loss": 0.3068, + "step": 8740 + }, + { + "epoch": 0.3269388041217842, + "grad_norm": 0.36302804946899414, + "learning_rate": 1.517291422898223e-05, + "loss": 0.2778, + "step": 8745 + }, + { + "epoch": 0.32712573311213405, + "grad_norm": 0.4798825681209564, + "learning_rate": 1.5167887532997032e-05, + "loss": 0.2644, + "step": 8750 + }, + { + "epoch": 0.32731266210248383, + "grad_norm": 0.3247123658657074, + "learning_rate": 1.51628590547555e-05, + "loss": 0.259, + "step": 8755 + }, + { + "epoch": 0.3274995910928336, + "grad_norm": 0.23680318892002106, + "learning_rate": 1.5157828795991813e-05, + "loss": 0.3716, + "step": 8760 + }, + { + "epoch": 0.3276865200831834, + "grad_norm": 0.4908321797847748, + "learning_rate": 1.5152796758440769e-05, + "loss": 0.2849, + "step": 8765 + }, + { + "epoch": 0.3278734490735332, + "grad_norm": 0.8543740510940552, + "learning_rate": 1.514776294383777e-05, + "loss": 0.3428, + "step": 8770 + }, + { + "epoch": 0.328060378063883, + "grad_norm": 0.6384404301643372, + "learning_rate": 1.514272735391883e-05, + "loss": 0.291, + "step": 8775 + }, + { + "epoch": 0.3282473070542328, + "grad_norm": 0.33483198285102844, + "learning_rate": 1.5137689990420583e-05, + "loss": 0.3363, + "step": 8780 + }, + { + "epoch": 0.32843423604458255, + "grad_norm": 0.26816031336784363, + "learning_rate": 1.5132650855080275e-05, + "loss": 0.298, + "step": 8785 + }, + { + "epoch": 0.3286211650349323, + "grad_norm": 0.3883032500743866, + "learning_rate": 1.5127609949635753e-05, + "loss": 0.2433, + "step": 8790 + }, + { + "epoch": 0.32880809402528216, + "grad_norm": 0.31391748785972595, + "learning_rate": 1.5122567275825486e-05, + "loss": 0.3271, + "step": 8795 + }, + { + "epoch": 0.32899502301563194, + "grad_norm": 0.5481862425804138, + "learning_rate": 1.5117522835388545e-05, + "loss": 0.2805, + "step": 8800 + }, + { + "epoch": 0.3291819520059817, + "grad_norm": 0.38565173745155334, + "learning_rate": 1.5112476630064615e-05, + "loss": 0.2974, + "step": 8805 + }, + { + "epoch": 0.3293688809963315, + "grad_norm": 0.3033200800418854, + "learning_rate": 1.5107428661593983e-05, + "loss": 0.2851, + "step": 8810 + }, + { + "epoch": 0.3295558099866813, + "grad_norm": 0.3411104381084442, + "learning_rate": 1.5102378931717556e-05, + "loss": 0.3146, + "step": 8815 + }, + { + "epoch": 0.3297427389770311, + "grad_norm": 0.32924318313598633, + "learning_rate": 1.5097327442176837e-05, + "loss": 0.2499, + "step": 8820 + }, + { + "epoch": 0.3299296679673809, + "grad_norm": 0.6186853647232056, + "learning_rate": 1.5092274194713933e-05, + "loss": 0.3388, + "step": 8825 + }, + { + "epoch": 0.33011659695773066, + "grad_norm": 0.2822064757347107, + "learning_rate": 1.5087219191071579e-05, + "loss": 0.307, + "step": 8830 + }, + { + "epoch": 0.3303035259480805, + "grad_norm": 0.4423992931842804, + "learning_rate": 1.5082162432993092e-05, + "loss": 0.2772, + "step": 8835 + }, + { + "epoch": 0.33049045493843027, + "grad_norm": 0.6045133471488953, + "learning_rate": 1.5077103922222402e-05, + "loss": 0.3682, + "step": 8840 + }, + { + "epoch": 0.33067738392878004, + "grad_norm": 0.26116663217544556, + "learning_rate": 1.507204366050405e-05, + "loss": 0.4336, + "step": 8845 + }, + { + "epoch": 0.3308643129191298, + "grad_norm": 0.621884286403656, + "learning_rate": 1.5066981649583168e-05, + "loss": 0.2781, + "step": 8850 + }, + { + "epoch": 0.33105124190947965, + "grad_norm": 0.43372565507888794, + "learning_rate": 1.5061917891205504e-05, + "loss": 0.3472, + "step": 8855 + }, + { + "epoch": 0.33123817089982943, + "grad_norm": 1.028779149055481, + "learning_rate": 1.5056852387117405e-05, + "loss": 0.4705, + "step": 8860 + }, + { + "epoch": 0.3314250998901792, + "grad_norm": 0.4312193691730499, + "learning_rate": 1.505178513906581e-05, + "loss": 0.3649, + "step": 8865 + }, + { + "epoch": 0.331612028880529, + "grad_norm": 0.3952298164367676, + "learning_rate": 1.5046716148798273e-05, + "loss": 0.3109, + "step": 8870 + }, + { + "epoch": 0.3317989578708788, + "grad_norm": 0.19110475480556488, + "learning_rate": 1.5041645418062942e-05, + "loss": 0.3217, + "step": 8875 + }, + { + "epoch": 0.3319858868612286, + "grad_norm": 0.7014629244804382, + "learning_rate": 1.5036572948608572e-05, + "loss": 0.3183, + "step": 8880 + }, + { + "epoch": 0.3321728158515784, + "grad_norm": 0.27941253781318665, + "learning_rate": 1.5031498742184507e-05, + "loss": 0.2359, + "step": 8885 + }, + { + "epoch": 0.33235974484192815, + "grad_norm": 0.29573166370391846, + "learning_rate": 1.5026422800540694e-05, + "loss": 0.2383, + "step": 8890 + }, + { + "epoch": 0.332546673832278, + "grad_norm": 0.36960336565971375, + "learning_rate": 1.5021345125427684e-05, + "loss": 0.3709, + "step": 8895 + }, + { + "epoch": 0.33273360282262776, + "grad_norm": 0.3210650086402893, + "learning_rate": 1.501626571859662e-05, + "loss": 0.3651, + "step": 8900 + }, + { + "epoch": 0.33292053181297754, + "grad_norm": 0.6550555229187012, + "learning_rate": 1.5011184581799243e-05, + "loss": 0.3616, + "step": 8905 + }, + { + "epoch": 0.3331074608033273, + "grad_norm": 0.3219583034515381, + "learning_rate": 1.5006101716787896e-05, + "loss": 0.3333, + "step": 8910 + }, + { + "epoch": 0.33329438979367715, + "grad_norm": 0.40844401717185974, + "learning_rate": 1.5001017125315503e-05, + "loss": 0.3207, + "step": 8915 + }, + { + "epoch": 0.3334813187840269, + "grad_norm": 0.5592089295387268, + "learning_rate": 1.4995930809135604e-05, + "loss": 0.2584, + "step": 8920 + }, + { + "epoch": 0.3336682477743767, + "grad_norm": 0.3412552773952484, + "learning_rate": 1.4990842770002321e-05, + "loss": 0.3286, + "step": 8925 + }, + { + "epoch": 0.3338551767647265, + "grad_norm": 0.2517843246459961, + "learning_rate": 1.4985753009670375e-05, + "loss": 0.3027, + "step": 8930 + }, + { + "epoch": 0.3340421057550763, + "grad_norm": 0.274789422750473, + "learning_rate": 1.4980661529895073e-05, + "loss": 0.3058, + "step": 8935 + }, + { + "epoch": 0.3342290347454261, + "grad_norm": 0.3245002031326294, + "learning_rate": 1.4975568332432322e-05, + "loss": 0.356, + "step": 8940 + }, + { + "epoch": 0.33441596373577587, + "grad_norm": 0.2871999442577362, + "learning_rate": 1.4970473419038623e-05, + "loss": 0.2805, + "step": 8945 + }, + { + "epoch": 0.33460289272612564, + "grad_norm": 0.3988129496574402, + "learning_rate": 1.4965376791471062e-05, + "loss": 0.2596, + "step": 8950 + }, + { + "epoch": 0.3347898217164755, + "grad_norm": 0.3798826038837433, + "learning_rate": 1.4960278451487327e-05, + "loss": 0.3168, + "step": 8955 + }, + { + "epoch": 0.33497675070682525, + "grad_norm": 0.336479127407074, + "learning_rate": 1.4955178400845678e-05, + "loss": 0.294, + "step": 8960 + }, + { + "epoch": 0.33516367969717503, + "grad_norm": 0.31239262223243713, + "learning_rate": 1.4950076641304984e-05, + "loss": 0.2486, + "step": 8965 + }, + { + "epoch": 0.3353506086875248, + "grad_norm": 0.4820721745491028, + "learning_rate": 1.4944973174624695e-05, + "loss": 0.3584, + "step": 8970 + }, + { + "epoch": 0.33553753767787464, + "grad_norm": 0.7084938287734985, + "learning_rate": 1.493986800256485e-05, + "loss": 0.2835, + "step": 8975 + }, + { + "epoch": 0.3357244666682244, + "grad_norm": 0.13603666424751282, + "learning_rate": 1.4934761126886077e-05, + "loss": 0.2406, + "step": 8980 + }, + { + "epoch": 0.3359113956585742, + "grad_norm": 0.28574085235595703, + "learning_rate": 1.4929652549349587e-05, + "loss": 0.331, + "step": 8985 + }, + { + "epoch": 0.336098324648924, + "grad_norm": 0.3157019317150116, + "learning_rate": 1.4924542271717186e-05, + "loss": 0.2754, + "step": 8990 + }, + { + "epoch": 0.3362852536392738, + "grad_norm": 0.4523063004016876, + "learning_rate": 1.4919430295751262e-05, + "loss": 0.2865, + "step": 8995 + }, + { + "epoch": 0.3364721826296236, + "grad_norm": 0.32611748576164246, + "learning_rate": 1.4914316623214788e-05, + "loss": 0.3052, + "step": 9000 + }, + { + "epoch": 0.33665911161997336, + "grad_norm": 0.34001392126083374, + "learning_rate": 1.4909201255871325e-05, + "loss": 0.253, + "step": 9005 + }, + { + "epoch": 0.33684604061032314, + "grad_norm": 0.3103577792644501, + "learning_rate": 1.4904084195485014e-05, + "loss": 0.2962, + "step": 9010 + }, + { + "epoch": 0.33703296960067297, + "grad_norm": 0.3384459614753723, + "learning_rate": 1.4898965443820584e-05, + "loss": 0.2931, + "step": 9015 + }, + { + "epoch": 0.33721989859102275, + "grad_norm": 0.47207170724868774, + "learning_rate": 1.4893845002643345e-05, + "loss": 0.3106, + "step": 9020 + }, + { + "epoch": 0.3374068275813725, + "grad_norm": 0.25210464000701904, + "learning_rate": 1.4888722873719195e-05, + "loss": 0.3618, + "step": 9025 + }, + { + "epoch": 0.3375937565717223, + "grad_norm": 0.4364457130432129, + "learning_rate": 1.4883599058814602e-05, + "loss": 0.3376, + "step": 9030 + }, + { + "epoch": 0.3377806855620721, + "grad_norm": 0.4352976381778717, + "learning_rate": 1.4878473559696625e-05, + "loss": 0.3076, + "step": 9035 + }, + { + "epoch": 0.3379676145524219, + "grad_norm": 0.4709399342536926, + "learning_rate": 1.4873346378132905e-05, + "loss": 0.301, + "step": 9040 + }, + { + "epoch": 0.3381545435427717, + "grad_norm": 0.4114588499069214, + "learning_rate": 1.4868217515891657e-05, + "loss": 0.2685, + "step": 9045 + }, + { + "epoch": 0.33834147253312147, + "grad_norm": 0.7446982860565186, + "learning_rate": 1.4863086974741684e-05, + "loss": 0.294, + "step": 9050 + }, + { + "epoch": 0.33852840152347125, + "grad_norm": 0.29010334610939026, + "learning_rate": 1.4857954756452353e-05, + "loss": 0.356, + "step": 9055 + }, + { + "epoch": 0.3387153305138211, + "grad_norm": 0.4005201756954193, + "learning_rate": 1.4852820862793626e-05, + "loss": 0.2966, + "step": 9060 + }, + { + "epoch": 0.33890225950417086, + "grad_norm": 0.7483821511268616, + "learning_rate": 1.4847685295536037e-05, + "loss": 0.3161, + "step": 9065 + }, + { + "epoch": 0.33908918849452063, + "grad_norm": 0.4686259925365448, + "learning_rate": 1.4842548056450692e-05, + "loss": 0.3532, + "step": 9070 + }, + { + "epoch": 0.3392761174848704, + "grad_norm": 0.44194498658180237, + "learning_rate": 1.4837409147309276e-05, + "loss": 0.2582, + "step": 9075 + }, + { + "epoch": 0.33946304647522024, + "grad_norm": 0.47664833068847656, + "learning_rate": 1.4832268569884058e-05, + "loss": 0.3903, + "step": 9080 + }, + { + "epoch": 0.33964997546557, + "grad_norm": 0.488346129655838, + "learning_rate": 1.4827126325947872e-05, + "loss": 0.2816, + "step": 9085 + }, + { + "epoch": 0.3398369044559198, + "grad_norm": 0.3450961709022522, + "learning_rate": 1.4821982417274128e-05, + "loss": 0.3073, + "step": 9090 + }, + { + "epoch": 0.3400238334462696, + "grad_norm": 0.32565736770629883, + "learning_rate": 1.4816836845636817e-05, + "loss": 0.3467, + "step": 9095 + }, + { + "epoch": 0.3402107624366194, + "grad_norm": 0.3779214322566986, + "learning_rate": 1.4811689612810498e-05, + "loss": 0.2372, + "step": 9100 + }, + { + "epoch": 0.3403976914269692, + "grad_norm": 11.968618392944336, + "learning_rate": 1.4806540720570306e-05, + "loss": 0.4737, + "step": 9105 + }, + { + "epoch": 0.34058462041731896, + "grad_norm": 0.4189259111881256, + "learning_rate": 1.4801390170691941e-05, + "loss": 0.2739, + "step": 9110 + }, + { + "epoch": 0.34077154940766874, + "grad_norm": 0.33259811997413635, + "learning_rate": 1.4796237964951686e-05, + "loss": 0.2829, + "step": 9115 + }, + { + "epoch": 0.34095847839801857, + "grad_norm": 0.2681080400943756, + "learning_rate": 1.4791084105126385e-05, + "loss": 0.2564, + "step": 9120 + }, + { + "epoch": 0.34114540738836835, + "grad_norm": 0.35501420497894287, + "learning_rate": 1.478592859299346e-05, + "loss": 0.2472, + "step": 9125 + }, + { + "epoch": 0.3413323363787181, + "grad_norm": 0.4222002625465393, + "learning_rate": 1.4780771430330894e-05, + "loss": 0.3695, + "step": 9130 + }, + { + "epoch": 0.3415192653690679, + "grad_norm": 0.6013131737709045, + "learning_rate": 1.477561261891725e-05, + "loss": 0.2431, + "step": 9135 + }, + { + "epoch": 0.34170619435941774, + "grad_norm": 0.4116293787956238, + "learning_rate": 1.4770452160531652e-05, + "loss": 0.2915, + "step": 9140 + }, + { + "epoch": 0.3418931233497675, + "grad_norm": 0.4117601811885834, + "learning_rate": 1.4765290056953796e-05, + "loss": 0.2381, + "step": 9145 + }, + { + "epoch": 0.3420800523401173, + "grad_norm": 0.5726933479309082, + "learning_rate": 1.476012630996394e-05, + "loss": 0.3691, + "step": 9150 + }, + { + "epoch": 0.34226698133046707, + "grad_norm": 0.43034738302230835, + "learning_rate": 1.4754960921342916e-05, + "loss": 0.2324, + "step": 9155 + }, + { + "epoch": 0.3424539103208169, + "grad_norm": 0.23042064905166626, + "learning_rate": 1.4749793892872115e-05, + "loss": 0.2359, + "step": 9160 + }, + { + "epoch": 0.3426408393111667, + "grad_norm": 0.4107089340686798, + "learning_rate": 1.4744625226333502e-05, + "loss": 0.2775, + "step": 9165 + }, + { + "epoch": 0.34282776830151646, + "grad_norm": 0.1352885216474533, + "learning_rate": 1.47394549235096e-05, + "loss": 0.2663, + "step": 9170 + }, + { + "epoch": 0.34301469729186623, + "grad_norm": 0.34713098406791687, + "learning_rate": 1.4734282986183494e-05, + "loss": 0.3152, + "step": 9175 + }, + { + "epoch": 0.34320162628221607, + "grad_norm": 0.3076459765434265, + "learning_rate": 1.4729109416138843e-05, + "loss": 0.3371, + "step": 9180 + }, + { + "epoch": 0.34338855527256584, + "grad_norm": 0.2737236022949219, + "learning_rate": 1.4723934215159858e-05, + "loss": 0.3022, + "step": 9185 + }, + { + "epoch": 0.3435754842629156, + "grad_norm": 0.3773277997970581, + "learning_rate": 1.4718757385031321e-05, + "loss": 0.2924, + "step": 9190 + }, + { + "epoch": 0.3437624132532654, + "grad_norm": 0.39832785725593567, + "learning_rate": 1.4713578927538573e-05, + "loss": 0.2438, + "step": 9195 + }, + { + "epoch": 0.34394934224361523, + "grad_norm": 0.19407474994659424, + "learning_rate": 1.4708398844467512e-05, + "loss": 0.3117, + "step": 9200 + }, + { + "epoch": 0.344136271233965, + "grad_norm": 0.3654159605503082, + "learning_rate": 1.4703217137604604e-05, + "loss": 0.3485, + "step": 9205 + }, + { + "epoch": 0.3443232002243148, + "grad_norm": 0.37599438428878784, + "learning_rate": 1.4698033808736867e-05, + "loss": 0.2486, + "step": 9210 + }, + { + "epoch": 0.34451012921466456, + "grad_norm": 0.42682337760925293, + "learning_rate": 1.4692848859651889e-05, + "loss": 0.2533, + "step": 9215 + }, + { + "epoch": 0.3446970582050144, + "grad_norm": 0.3610582947731018, + "learning_rate": 1.4687662292137804e-05, + "loss": 0.2781, + "step": 9220 + }, + { + "epoch": 0.3448839871953642, + "grad_norm": 0.4574204385280609, + "learning_rate": 1.4682474107983314e-05, + "loss": 0.2965, + "step": 9225 + }, + { + "epoch": 0.34507091618571395, + "grad_norm": 0.513286828994751, + "learning_rate": 1.467728430897767e-05, + "loss": 0.2859, + "step": 9230 + }, + { + "epoch": 0.3452578451760637, + "grad_norm": 0.23457178473472595, + "learning_rate": 1.4672092896910692e-05, + "loss": 0.2623, + "step": 9235 + }, + { + "epoch": 0.34544477416641356, + "grad_norm": 0.34765762090682983, + "learning_rate": 1.4666899873572747e-05, + "loss": 0.291, + "step": 9240 + }, + { + "epoch": 0.34563170315676334, + "grad_norm": 0.3738343417644501, + "learning_rate": 1.4661705240754757e-05, + "loss": 0.2701, + "step": 9245 + }, + { + "epoch": 0.3458186321471131, + "grad_norm": 0.3144400119781494, + "learning_rate": 1.4656509000248207e-05, + "loss": 0.2842, + "step": 9250 + }, + { + "epoch": 0.3460055611374629, + "grad_norm": 0.5743440985679626, + "learning_rate": 1.4651311153845127e-05, + "loss": 0.2436, + "step": 9255 + }, + { + "epoch": 0.3461924901278127, + "grad_norm": 0.2858171761035919, + "learning_rate": 1.4646111703338108e-05, + "loss": 0.318, + "step": 9260 + }, + { + "epoch": 0.3463794191181625, + "grad_norm": 0.4376906156539917, + "learning_rate": 1.464091065052029e-05, + "loss": 0.2822, + "step": 9265 + }, + { + "epoch": 0.3465663481085123, + "grad_norm": 0.2567811906337738, + "learning_rate": 1.4635707997185367e-05, + "loss": 0.2571, + "step": 9270 + }, + { + "epoch": 0.34675327709886206, + "grad_norm": 0.3876672089099884, + "learning_rate": 1.4630503745127587e-05, + "loss": 0.3234, + "step": 9275 + }, + { + "epoch": 0.34694020608921183, + "grad_norm": 0.48617517948150635, + "learning_rate": 1.4625297896141741e-05, + "loss": 0.2896, + "step": 9280 + }, + { + "epoch": 0.34712713507956167, + "grad_norm": 0.4633631110191345, + "learning_rate": 1.462009045202319e-05, + "loss": 0.3156, + "step": 9285 + }, + { + "epoch": 0.34731406406991144, + "grad_norm": 0.18796226382255554, + "learning_rate": 1.461488141456782e-05, + "loss": 0.2849, + "step": 9290 + }, + { + "epoch": 0.3475009930602612, + "grad_norm": 0.8571456074714661, + "learning_rate": 1.4609670785572084e-05, + "loss": 0.3728, + "step": 9295 + }, + { + "epoch": 0.347687922050611, + "grad_norm": 0.22898966073989868, + "learning_rate": 1.4604458566832977e-05, + "loss": 0.2989, + "step": 9300 + }, + { + "epoch": 0.34787485104096083, + "grad_norm": 0.32521340250968933, + "learning_rate": 1.4599244760148046e-05, + "loss": 0.2966, + "step": 9305 + }, + { + "epoch": 0.3480617800313106, + "grad_norm": 0.6073753833770752, + "learning_rate": 1.459402936731538e-05, + "loss": 0.3742, + "step": 9310 + }, + { + "epoch": 0.3482487090216604, + "grad_norm": 0.3876205384731293, + "learning_rate": 1.4588812390133624e-05, + "loss": 0.249, + "step": 9315 + }, + { + "epoch": 0.34843563801201016, + "grad_norm": 0.46591299772262573, + "learning_rate": 1.4583593830401958e-05, + "loss": 0.2723, + "step": 9320 + }, + { + "epoch": 0.34862256700236, + "grad_norm": 0.22240598499774933, + "learning_rate": 1.4578373689920116e-05, + "loss": 0.3037, + "step": 9325 + }, + { + "epoch": 0.3488094959927098, + "grad_norm": 0.5356066823005676, + "learning_rate": 1.457315197048838e-05, + "loss": 0.3212, + "step": 9330 + }, + { + "epoch": 0.34899642498305955, + "grad_norm": 0.5799317955970764, + "learning_rate": 1.4567928673907563e-05, + "loss": 0.3065, + "step": 9335 + }, + { + "epoch": 0.3491833539734093, + "grad_norm": 0.2084297090768814, + "learning_rate": 1.4562703801979035e-05, + "loss": 0.3081, + "step": 9340 + }, + { + "epoch": 0.34937028296375916, + "grad_norm": 0.520650327205658, + "learning_rate": 1.4557477356504702e-05, + "loss": 0.3058, + "step": 9345 + }, + { + "epoch": 0.34955721195410894, + "grad_norm": 0.34877529740333557, + "learning_rate": 1.455224933928702e-05, + "loss": 0.2876, + "step": 9350 + }, + { + "epoch": 0.3497441409444587, + "grad_norm": 0.30944302678108215, + "learning_rate": 1.4547019752128977e-05, + "loss": 0.3491, + "step": 9355 + }, + { + "epoch": 0.3499310699348085, + "grad_norm": 0.5403395295143127, + "learning_rate": 1.4541788596834111e-05, + "loss": 0.3018, + "step": 9360 + }, + { + "epoch": 0.3501179989251583, + "grad_norm": 0.43393194675445557, + "learning_rate": 1.4536555875206497e-05, + "loss": 0.3163, + "step": 9365 + }, + { + "epoch": 0.3503049279155081, + "grad_norm": 0.3072391450405121, + "learning_rate": 1.4531321589050749e-05, + "loss": 0.2298, + "step": 9370 + }, + { + "epoch": 0.3504918569058579, + "grad_norm": 0.18442721664905548, + "learning_rate": 1.4526085740172025e-05, + "loss": 0.4021, + "step": 9375 + }, + { + "epoch": 0.35067878589620766, + "grad_norm": 0.4270346462726593, + "learning_rate": 1.4520848330376019e-05, + "loss": 0.2749, + "step": 9380 + }, + { + "epoch": 0.3508657148865575, + "grad_norm": 0.4557771384716034, + "learning_rate": 1.4515609361468959e-05, + "loss": 0.2412, + "step": 9385 + }, + { + "epoch": 0.35105264387690727, + "grad_norm": 0.7857339978218079, + "learning_rate": 1.4510368835257622e-05, + "loss": 0.2772, + "step": 9390 + }, + { + "epoch": 0.35123957286725704, + "grad_norm": 0.30803146958351135, + "learning_rate": 1.450512675354931e-05, + "loss": 0.3163, + "step": 9395 + }, + { + "epoch": 0.3514265018576068, + "grad_norm": 0.4320001006126404, + "learning_rate": 1.449988311815187e-05, + "loss": 0.2661, + "step": 9400 + }, + { + "epoch": 0.35161343084795665, + "grad_norm": 0.13867487013339996, + "learning_rate": 1.4494637930873683e-05, + "loss": 0.2416, + "step": 9405 + }, + { + "epoch": 0.35180035983830643, + "grad_norm": 0.27354222536087036, + "learning_rate": 1.4489391193523658e-05, + "loss": 0.2661, + "step": 9410 + }, + { + "epoch": 0.3519872888286562, + "grad_norm": 0.3791649043560028, + "learning_rate": 1.4484142907911247e-05, + "loss": 0.2581, + "step": 9415 + }, + { + "epoch": 0.352174217819006, + "grad_norm": 0.38127240538597107, + "learning_rate": 1.4478893075846436e-05, + "loss": 0.2605, + "step": 9420 + }, + { + "epoch": 0.3523611468093558, + "grad_norm": 0.23698148131370544, + "learning_rate": 1.447364169913974e-05, + "loss": 0.3725, + "step": 9425 + }, + { + "epoch": 0.3525480757997056, + "grad_norm": 0.3475520610809326, + "learning_rate": 1.4468388779602207e-05, + "loss": 0.2851, + "step": 9430 + }, + { + "epoch": 0.3527350047900554, + "grad_norm": 0.45811226963996887, + "learning_rate": 1.446313431904542e-05, + "loss": 0.3581, + "step": 9435 + }, + { + "epoch": 0.35292193378040515, + "grad_norm": 0.6400048136711121, + "learning_rate": 1.4457878319281491e-05, + "loss": 0.2837, + "step": 9440 + }, + { + "epoch": 0.353108862770755, + "grad_norm": 0.4961070716381073, + "learning_rate": 1.4452620782123063e-05, + "loss": 0.3202, + "step": 9445 + }, + { + "epoch": 0.35329579176110476, + "grad_norm": 0.33612340688705444, + "learning_rate": 1.4447361709383312e-05, + "loss": 0.2923, + "step": 9450 + }, + { + "epoch": 0.35348272075145454, + "grad_norm": 0.4333072006702423, + "learning_rate": 1.4442101102875942e-05, + "loss": 0.2749, + "step": 9455 + }, + { + "epoch": 0.3536696497418043, + "grad_norm": 0.3720749616622925, + "learning_rate": 1.443683896441518e-05, + "loss": 0.333, + "step": 9460 + }, + { + "epoch": 0.35385657873215415, + "grad_norm": 0.638292670249939, + "learning_rate": 1.4431575295815793e-05, + "loss": 0.2825, + "step": 9465 + }, + { + "epoch": 0.3540435077225039, + "grad_norm": 0.27355220913887024, + "learning_rate": 1.4426310098893069e-05, + "loss": 0.334, + "step": 9470 + }, + { + "epoch": 0.3542304367128537, + "grad_norm": 0.3119034171104431, + "learning_rate": 1.4421043375462821e-05, + "loss": 0.3194, + "step": 9475 + }, + { + "epoch": 0.3544173657032035, + "grad_norm": 0.3145153820514679, + "learning_rate": 1.441577512734139e-05, + "loss": 0.2671, + "step": 9480 + }, + { + "epoch": 0.3546042946935533, + "grad_norm": 0.25988125801086426, + "learning_rate": 1.4410505356345645e-05, + "loss": 0.2174, + "step": 9485 + }, + { + "epoch": 0.3547912236839031, + "grad_norm": 0.4911235570907593, + "learning_rate": 1.440523406429298e-05, + "loss": 0.4387, + "step": 9490 + }, + { + "epoch": 0.35497815267425287, + "grad_norm": 0.21677200496196747, + "learning_rate": 1.4399961253001316e-05, + "loss": 0.2656, + "step": 9495 + }, + { + "epoch": 0.35516508166460264, + "grad_norm": 0.3045201897621155, + "learning_rate": 1.4394686924289087e-05, + "loss": 0.2927, + "step": 9500 + }, + { + "epoch": 0.3553520106549525, + "grad_norm": 0.38031861186027527, + "learning_rate": 1.4389411079975262e-05, + "loss": 0.333, + "step": 9505 + }, + { + "epoch": 0.35553893964530225, + "grad_norm": 0.31916332244873047, + "learning_rate": 1.4384133721879327e-05, + "loss": 0.2274, + "step": 9510 + }, + { + "epoch": 0.35572586863565203, + "grad_norm": 0.4681374430656433, + "learning_rate": 1.4378854851821294e-05, + "loss": 0.2061, + "step": 9515 + }, + { + "epoch": 0.3559127976260018, + "grad_norm": 0.43480637669563293, + "learning_rate": 1.4373574471621694e-05, + "loss": 0.3167, + "step": 9520 + }, + { + "epoch": 0.3560997266163516, + "grad_norm": 0.2742566764354706, + "learning_rate": 1.4368292583101578e-05, + "loss": 0.3607, + "step": 9525 + }, + { + "epoch": 0.3562866556067014, + "grad_norm": 0.2994193732738495, + "learning_rate": 1.4363009188082513e-05, + "loss": 0.34, + "step": 9530 + }, + { + "epoch": 0.3564735845970512, + "grad_norm": 0.38814255595207214, + "learning_rate": 1.43577242883866e-05, + "loss": 0.2461, + "step": 9535 + }, + { + "epoch": 0.356660513587401, + "grad_norm": 0.29775434732437134, + "learning_rate": 1.4352437885836441e-05, + "loss": 0.2581, + "step": 9540 + }, + { + "epoch": 0.35684744257775075, + "grad_norm": 0.36987629532814026, + "learning_rate": 1.4347149982255168e-05, + "loss": 0.3376, + "step": 9545 + }, + { + "epoch": 0.3570343715681006, + "grad_norm": 0.2949317693710327, + "learning_rate": 1.4341860579466428e-05, + "loss": 0.3628, + "step": 9550 + }, + { + "epoch": 0.35722130055845036, + "grad_norm": 0.3889232873916626, + "learning_rate": 1.4336569679294385e-05, + "loss": 0.2386, + "step": 9555 + }, + { + "epoch": 0.35740822954880014, + "grad_norm": 0.498597115278244, + "learning_rate": 1.433127728356372e-05, + "loss": 0.2282, + "step": 9560 + }, + { + "epoch": 0.3575951585391499, + "grad_norm": 0.332461416721344, + "learning_rate": 1.4325983394099626e-05, + "loss": 0.3273, + "step": 9565 + }, + { + "epoch": 0.35778208752949975, + "grad_norm": 0.5877690315246582, + "learning_rate": 1.4320688012727814e-05, + "loss": 0.2397, + "step": 9570 + }, + { + "epoch": 0.3579690165198495, + "grad_norm": 0.1286364197731018, + "learning_rate": 1.4315391141274508e-05, + "loss": 0.2346, + "step": 9575 + }, + { + "epoch": 0.3581559455101993, + "grad_norm": 0.3990229368209839, + "learning_rate": 1.4310092781566452e-05, + "loss": 0.3775, + "step": 9580 + }, + { + "epoch": 0.3583428745005491, + "grad_norm": 0.4049658179283142, + "learning_rate": 1.4304792935430893e-05, + "loss": 0.2323, + "step": 9585 + }, + { + "epoch": 0.3585298034908989, + "grad_norm": 0.27659088373184204, + "learning_rate": 1.42994916046956e-05, + "loss": 0.2626, + "step": 9590 + }, + { + "epoch": 0.3587167324812487, + "grad_norm": 0.38999220728874207, + "learning_rate": 1.4294188791188846e-05, + "loss": 0.3191, + "step": 9595 + }, + { + "epoch": 0.35890366147159847, + "grad_norm": 0.5159905552864075, + "learning_rate": 1.4288884496739424e-05, + "loss": 0.2849, + "step": 9600 + }, + { + "epoch": 0.35909059046194824, + "grad_norm": 0.3492240905761719, + "learning_rate": 1.4283578723176632e-05, + "loss": 0.3292, + "step": 9605 + }, + { + "epoch": 0.3592775194522981, + "grad_norm": 0.3294449746608734, + "learning_rate": 1.4278271472330277e-05, + "loss": 0.3421, + "step": 9610 + }, + { + "epoch": 0.35946444844264785, + "grad_norm": 0.543605625629425, + "learning_rate": 1.4272962746030678e-05, + "loss": 0.3525, + "step": 9615 + }, + { + "epoch": 0.35965137743299763, + "grad_norm": 0.5134566426277161, + "learning_rate": 1.4267652546108668e-05, + "loss": 0.2269, + "step": 9620 + }, + { + "epoch": 0.3598383064233474, + "grad_norm": 0.30253830552101135, + "learning_rate": 1.4262340874395574e-05, + "loss": 0.2389, + "step": 9625 + }, + { + "epoch": 0.36002523541369724, + "grad_norm": 0.30939239263534546, + "learning_rate": 1.4257027732723247e-05, + "loss": 0.3656, + "step": 9630 + }, + { + "epoch": 0.360212164404047, + "grad_norm": 0.3319133222103119, + "learning_rate": 1.4251713122924034e-05, + "loss": 0.2553, + "step": 9635 + }, + { + "epoch": 0.3603990933943968, + "grad_norm": 0.37184301018714905, + "learning_rate": 1.424639704683079e-05, + "loss": 0.305, + "step": 9640 + }, + { + "epoch": 0.3605860223847466, + "grad_norm": 0.4830484688282013, + "learning_rate": 1.424107950627688e-05, + "loss": 0.2698, + "step": 9645 + }, + { + "epoch": 0.3607729513750964, + "grad_norm": 0.5102275013923645, + "learning_rate": 1.423576050309617e-05, + "loss": 0.2744, + "step": 9650 + }, + { + "epoch": 0.3609598803654462, + "grad_norm": 0.2823433578014374, + "learning_rate": 1.4230440039123032e-05, + "loss": 0.2687, + "step": 9655 + }, + { + "epoch": 0.36114680935579596, + "grad_norm": 0.36009663343429565, + "learning_rate": 1.4225118116192344e-05, + "loss": 0.3377, + "step": 9660 + }, + { + "epoch": 0.36133373834614574, + "grad_norm": 0.5339310765266418, + "learning_rate": 1.4219794736139484e-05, + "loss": 0.2668, + "step": 9665 + }, + { + "epoch": 0.36152066733649557, + "grad_norm": 0.565954864025116, + "learning_rate": 1.421446990080033e-05, + "loss": 0.3104, + "step": 9670 + }, + { + "epoch": 0.36170759632684535, + "grad_norm": 0.5683127641677856, + "learning_rate": 1.4209143612011268e-05, + "loss": 0.2727, + "step": 9675 + }, + { + "epoch": 0.3618945253171951, + "grad_norm": 0.4476057291030884, + "learning_rate": 1.4203815871609177e-05, + "loss": 0.2485, + "step": 9680 + }, + { + "epoch": 0.3620814543075449, + "grad_norm": 0.3725626468658447, + "learning_rate": 1.419848668143145e-05, + "loss": 0.288, + "step": 9685 + }, + { + "epoch": 0.36226838329789474, + "grad_norm": 0.5492215752601624, + "learning_rate": 1.4193156043315967e-05, + "loss": 0.3209, + "step": 9690 + }, + { + "epoch": 0.3624553122882445, + "grad_norm": 0.3779137134552002, + "learning_rate": 1.4187823959101116e-05, + "loss": 0.2553, + "step": 9695 + }, + { + "epoch": 0.3626422412785943, + "grad_norm": 0.296917587518692, + "learning_rate": 1.4182490430625775e-05, + "loss": 0.3266, + "step": 9700 + }, + { + "epoch": 0.36282917026894407, + "grad_norm": 0.2786087691783905, + "learning_rate": 1.417715545972933e-05, + "loss": 0.2164, + "step": 9705 + }, + { + "epoch": 0.3630160992592939, + "grad_norm": 0.44238272309303284, + "learning_rate": 1.4171819048251658e-05, + "loss": 0.2699, + "step": 9710 + }, + { + "epoch": 0.3632030282496437, + "grad_norm": 0.5811543464660645, + "learning_rate": 1.4166481198033136e-05, + "loss": 0.2559, + "step": 9715 + }, + { + "epoch": 0.36338995723999346, + "grad_norm": 0.26523107290267944, + "learning_rate": 1.4161141910914632e-05, + "loss": 0.2773, + "step": 9720 + }, + { + "epoch": 0.36357688623034323, + "grad_norm": 0.5079830288887024, + "learning_rate": 1.415580118873751e-05, + "loss": 0.2821, + "step": 9725 + }, + { + "epoch": 0.36376381522069307, + "grad_norm": 0.44632482528686523, + "learning_rate": 1.4150459033343639e-05, + "loss": 0.3243, + "step": 9730 + }, + { + "epoch": 0.36395074421104284, + "grad_norm": 0.3747384250164032, + "learning_rate": 1.4145115446575377e-05, + "loss": 0.3253, + "step": 9735 + }, + { + "epoch": 0.3641376732013926, + "grad_norm": 0.423495888710022, + "learning_rate": 1.4139770430275567e-05, + "loss": 0.259, + "step": 9740 + }, + { + "epoch": 0.3643246021917424, + "grad_norm": 0.365408718585968, + "learning_rate": 1.4134423986287555e-05, + "loss": 0.3372, + "step": 9745 + }, + { + "epoch": 0.36451153118209223, + "grad_norm": 0.31306222081184387, + "learning_rate": 1.4129076116455176e-05, + "loss": 0.2578, + "step": 9750 + }, + { + "epoch": 0.364698460172442, + "grad_norm": 0.4542996287345886, + "learning_rate": 1.4123726822622758e-05, + "loss": 0.3076, + "step": 9755 + }, + { + "epoch": 0.3648853891627918, + "grad_norm": 0.2852497398853302, + "learning_rate": 1.411837610663512e-05, + "loss": 0.2407, + "step": 9760 + }, + { + "epoch": 0.36507231815314156, + "grad_norm": 0.4517068564891815, + "learning_rate": 1.411302397033757e-05, + "loss": 0.2958, + "step": 9765 + }, + { + "epoch": 0.36525924714349134, + "grad_norm": 0.24500373005867004, + "learning_rate": 1.4107670415575902e-05, + "loss": 0.3155, + "step": 9770 + }, + { + "epoch": 0.36544617613384117, + "grad_norm": 0.35521599650382996, + "learning_rate": 1.410231544419641e-05, + "loss": 0.2573, + "step": 9775 + }, + { + "epoch": 0.36563310512419095, + "grad_norm": 0.5142762660980225, + "learning_rate": 1.4096959058045867e-05, + "loss": 0.3081, + "step": 9780 + }, + { + "epoch": 0.3658200341145407, + "grad_norm": 0.46503591537475586, + "learning_rate": 1.4091601258971537e-05, + "loss": 0.2943, + "step": 9785 + }, + { + "epoch": 0.3660069631048905, + "grad_norm": 0.35828927159309387, + "learning_rate": 1.4086242048821172e-05, + "loss": 0.2945, + "step": 9790 + }, + { + "epoch": 0.36619389209524034, + "grad_norm": 0.43609440326690674, + "learning_rate": 1.408088142944301e-05, + "loss": 0.3362, + "step": 9795 + }, + { + "epoch": 0.3663808210855901, + "grad_norm": 0.7864788770675659, + "learning_rate": 1.4075519402685775e-05, + "loss": 0.2748, + "step": 9800 + }, + { + "epoch": 0.3665677500759399, + "grad_norm": 0.2505030930042267, + "learning_rate": 1.4070155970398676e-05, + "loss": 0.3193, + "step": 9805 + }, + { + "epoch": 0.36675467906628967, + "grad_norm": 0.2732783257961273, + "learning_rate": 1.406479113443141e-05, + "loss": 0.4463, + "step": 9810 + }, + { + "epoch": 0.3669416080566395, + "grad_norm": 0.4646552801132202, + "learning_rate": 1.405942489663415e-05, + "loss": 0.2432, + "step": 9815 + }, + { + "epoch": 0.3671285370469893, + "grad_norm": 0.23349085450172424, + "learning_rate": 1.405405725885756e-05, + "loss": 0.27, + "step": 9820 + }, + { + "epoch": 0.36731546603733906, + "grad_norm": 0.318560391664505, + "learning_rate": 1.4048688222952787e-05, + "loss": 0.2645, + "step": 9825 + }, + { + "epoch": 0.36750239502768883, + "grad_norm": 0.3994881510734558, + "learning_rate": 1.4043317790771455e-05, + "loss": 0.2549, + "step": 9830 + }, + { + "epoch": 0.36768932401803867, + "grad_norm": 0.4488486647605896, + "learning_rate": 1.4037945964165674e-05, + "loss": 0.2432, + "step": 9835 + }, + { + "epoch": 0.36787625300838844, + "grad_norm": 0.47077760100364685, + "learning_rate": 1.4032572744988029e-05, + "loss": 0.2743, + "step": 9840 + }, + { + "epoch": 0.3680631819987382, + "grad_norm": 0.3115878403186798, + "learning_rate": 1.4027198135091594e-05, + "loss": 0.3284, + "step": 9845 + }, + { + "epoch": 0.368250110989088, + "grad_norm": 0.802329421043396, + "learning_rate": 1.4021822136329914e-05, + "loss": 0.3841, + "step": 9850 + }, + { + "epoch": 0.36843703997943783, + "grad_norm": 0.22909800708293915, + "learning_rate": 1.4016444750557022e-05, + "loss": 0.3512, + "step": 9855 + }, + { + "epoch": 0.3686239689697876, + "grad_norm": 0.4044862687587738, + "learning_rate": 1.4011065979627418e-05, + "loss": 0.2226, + "step": 9860 + }, + { + "epoch": 0.3688108979601374, + "grad_norm": 0.7386347651481628, + "learning_rate": 1.4005685825396091e-05, + "loss": 0.2814, + "step": 9865 + }, + { + "epoch": 0.36899782695048716, + "grad_norm": 0.49540725350379944, + "learning_rate": 1.4000304289718498e-05, + "loss": 0.3408, + "step": 9870 + }, + { + "epoch": 0.369184755940837, + "grad_norm": 0.33938613533973694, + "learning_rate": 1.3994921374450584e-05, + "loss": 0.3386, + "step": 9875 + }, + { + "epoch": 0.3693716849311868, + "grad_norm": 0.6717550158500671, + "learning_rate": 1.3989537081448752e-05, + "loss": 0.351, + "step": 9880 + }, + { + "epoch": 0.36955861392153655, + "grad_norm": 0.3765832483768463, + "learning_rate": 1.3984151412569898e-05, + "loss": 0.2666, + "step": 9885 + }, + { + "epoch": 0.3697455429118863, + "grad_norm": 0.38686302304267883, + "learning_rate": 1.3978764369671378e-05, + "loss": 0.2667, + "step": 9890 + }, + { + "epoch": 0.36993247190223616, + "grad_norm": 0.4677604138851166, + "learning_rate": 1.3973375954611037e-05, + "loss": 0.263, + "step": 9895 + }, + { + "epoch": 0.37011940089258594, + "grad_norm": 0.19471901655197144, + "learning_rate": 1.396798616924718e-05, + "loss": 0.4968, + "step": 9900 + }, + { + "epoch": 0.3703063298829357, + "grad_norm": 0.37783077359199524, + "learning_rate": 1.3962595015438592e-05, + "loss": 0.3632, + "step": 9905 + }, + { + "epoch": 0.3704932588732855, + "grad_norm": 0.26753556728363037, + "learning_rate": 1.3957202495044525e-05, + "loss": 0.2798, + "step": 9910 + }, + { + "epoch": 0.3706801878636353, + "grad_norm": 0.3343743085861206, + "learning_rate": 1.3951808609924703e-05, + "loss": 0.2431, + "step": 9915 + }, + { + "epoch": 0.3708671168539851, + "grad_norm": 0.2634985148906708, + "learning_rate": 1.3946413361939324e-05, + "loss": 0.2888, + "step": 9920 + }, + { + "epoch": 0.3710540458443349, + "grad_norm": 0.8201472163200378, + "learning_rate": 1.3941016752949059e-05, + "loss": 0.3404, + "step": 9925 + }, + { + "epoch": 0.37124097483468466, + "grad_norm": 0.5062202215194702, + "learning_rate": 1.3935618784815042e-05, + "loss": 0.2689, + "step": 9930 + }, + { + "epoch": 0.3714279038250345, + "grad_norm": 0.5348239541053772, + "learning_rate": 1.3930219459398872e-05, + "loss": 0.2577, + "step": 9935 + }, + { + "epoch": 0.37161483281538427, + "grad_norm": 0.3352089822292328, + "learning_rate": 1.3924818778562627e-05, + "loss": 0.3134, + "step": 9940 + }, + { + "epoch": 0.37180176180573404, + "grad_norm": 0.4589434266090393, + "learning_rate": 1.3919416744168846e-05, + "loss": 0.3105, + "step": 9945 + }, + { + "epoch": 0.3719886907960838, + "grad_norm": 0.5832353234291077, + "learning_rate": 1.3914013358080536e-05, + "loss": 0.2674, + "step": 9950 + }, + { + "epoch": 0.37217561978643365, + "grad_norm": 0.46412187814712524, + "learning_rate": 1.3908608622161168e-05, + "loss": 0.2989, + "step": 9955 + }, + { + "epoch": 0.37236254877678343, + "grad_norm": 0.22348807752132416, + "learning_rate": 1.390320253827468e-05, + "loss": 0.306, + "step": 9960 + }, + { + "epoch": 0.3725494777671332, + "grad_norm": 0.3156212270259857, + "learning_rate": 1.389779510828548e-05, + "loss": 0.2838, + "step": 9965 + }, + { + "epoch": 0.372736406757483, + "grad_norm": 0.32380181550979614, + "learning_rate": 1.3892386334058433e-05, + "loss": 0.2335, + "step": 9970 + }, + { + "epoch": 0.3729233357478328, + "grad_norm": 0.39215198159217834, + "learning_rate": 1.3886976217458872e-05, + "loss": 0.2667, + "step": 9975 + }, + { + "epoch": 0.3731102647381826, + "grad_norm": 0.6366074085235596, + "learning_rate": 1.3881564760352589e-05, + "loss": 0.2668, + "step": 9980 + }, + { + "epoch": 0.3732971937285324, + "grad_norm": 0.4511488676071167, + "learning_rate": 1.387615196460584e-05, + "loss": 0.2636, + "step": 9985 + }, + { + "epoch": 0.37348412271888215, + "grad_norm": 0.2915279269218445, + "learning_rate": 1.3870737832085344e-05, + "loss": 0.2942, + "step": 9990 + }, + { + "epoch": 0.373671051709232, + "grad_norm": 0.2640349864959717, + "learning_rate": 1.386532236465828e-05, + "loss": 0.2608, + "step": 9995 + }, + { + "epoch": 0.37385798069958176, + "grad_norm": 0.5220035910606384, + "learning_rate": 1.3859905564192291e-05, + "loss": 0.3061, + "step": 10000 + }, + { + "epoch": 0.37404490968993154, + "grad_norm": 0.26642879843711853, + "learning_rate": 1.3854487432555467e-05, + "loss": 0.2558, + "step": 10005 + }, + { + "epoch": 0.3742318386802813, + "grad_norm": 0.5303171277046204, + "learning_rate": 1.3849067971616377e-05, + "loss": 0.264, + "step": 10010 + }, + { + "epoch": 0.3744187676706311, + "grad_norm": 0.412943035364151, + "learning_rate": 1.384364718324403e-05, + "loss": 0.363, + "step": 10015 + }, + { + "epoch": 0.3746056966609809, + "grad_norm": 0.7517222166061401, + "learning_rate": 1.3838225069307911e-05, + "loss": 0.3029, + "step": 10020 + }, + { + "epoch": 0.3747926256513307, + "grad_norm": 0.34779641032218933, + "learning_rate": 1.383280163167794e-05, + "loss": 0.2702, + "step": 10025 + }, + { + "epoch": 0.3749795546416805, + "grad_norm": 0.5778641104698181, + "learning_rate": 1.382737687222451e-05, + "loss": 0.2748, + "step": 10030 + }, + { + "epoch": 0.37516648363203026, + "grad_norm": 0.43767881393432617, + "learning_rate": 1.3821950792818464e-05, + "loss": 0.3038, + "step": 10035 + }, + { + "epoch": 0.3753534126223801, + "grad_norm": 0.43027085065841675, + "learning_rate": 1.3816523395331104e-05, + "loss": 0.2593, + "step": 10040 + }, + { + "epoch": 0.37554034161272987, + "grad_norm": 0.40595659613609314, + "learning_rate": 1.3811094681634183e-05, + "loss": 0.3294, + "step": 10045 + }, + { + "epoch": 0.37572727060307964, + "grad_norm": 0.21250957250595093, + "learning_rate": 1.3805664653599905e-05, + "loss": 0.3234, + "step": 10050 + }, + { + "epoch": 0.3759141995934294, + "grad_norm": 0.3901183605194092, + "learning_rate": 1.3800233313100935e-05, + "loss": 0.3291, + "step": 10055 + }, + { + "epoch": 0.37610112858377925, + "grad_norm": 0.40727564692497253, + "learning_rate": 1.3794800662010386e-05, + "loss": 0.2554, + "step": 10060 + }, + { + "epoch": 0.37628805757412903, + "grad_norm": 0.6711639165878296, + "learning_rate": 1.3789366702201824e-05, + "loss": 0.2909, + "step": 10065 + }, + { + "epoch": 0.3764749865644788, + "grad_norm": 0.3564465045928955, + "learning_rate": 1.3783931435549266e-05, + "loss": 0.2833, + "step": 10070 + }, + { + "epoch": 0.3766619155548286, + "grad_norm": 0.3789138197898865, + "learning_rate": 1.377849486392718e-05, + "loss": 0.2947, + "step": 10075 + }, + { + "epoch": 0.3768488445451784, + "grad_norm": 0.19771960377693176, + "learning_rate": 1.377305698921048e-05, + "loss": 0.2381, + "step": 10080 + }, + { + "epoch": 0.3770357735355282, + "grad_norm": 0.12256388366222382, + "learning_rate": 1.3767617813274537e-05, + "loss": 0.2924, + "step": 10085 + }, + { + "epoch": 0.377222702525878, + "grad_norm": 0.656947672367096, + "learning_rate": 1.376217733799517e-05, + "loss": 0.326, + "step": 10090 + }, + { + "epoch": 0.37740963151622775, + "grad_norm": 0.2615041136741638, + "learning_rate": 1.3756735565248634e-05, + "loss": 0.2551, + "step": 10095 + }, + { + "epoch": 0.3775965605065776, + "grad_norm": 0.4703410267829895, + "learning_rate": 1.3751292496911646e-05, + "loss": 0.273, + "step": 10100 + }, + { + "epoch": 0.37778348949692736, + "grad_norm": 0.3060661256313324, + "learning_rate": 1.3745848134861367e-05, + "loss": 0.381, + "step": 10105 + }, + { + "epoch": 0.37797041848727714, + "grad_norm": 0.30581894516944885, + "learning_rate": 1.3740402480975394e-05, + "loss": 0.2714, + "step": 10110 + }, + { + "epoch": 0.3781573474776269, + "grad_norm": 0.2676241397857666, + "learning_rate": 1.3734955537131786e-05, + "loss": 0.3834, + "step": 10115 + }, + { + "epoch": 0.37834427646797675, + "grad_norm": 0.28147178888320923, + "learning_rate": 1.3729507305209033e-05, + "loss": 0.2662, + "step": 10120 + }, + { + "epoch": 0.3785312054583265, + "grad_norm": 0.34182241559028625, + "learning_rate": 1.3724057787086073e-05, + "loss": 0.2477, + "step": 10125 + }, + { + "epoch": 0.3787181344486763, + "grad_norm": 0.30519115924835205, + "learning_rate": 1.3718606984642292e-05, + "loss": 0.2354, + "step": 10130 + }, + { + "epoch": 0.3789050634390261, + "grad_norm": 0.2952529788017273, + "learning_rate": 1.3713154899757508e-05, + "loss": 0.3169, + "step": 10135 + }, + { + "epoch": 0.3790919924293759, + "grad_norm": 0.4137369692325592, + "learning_rate": 1.3707701534311999e-05, + "loss": 0.3089, + "step": 10140 + }, + { + "epoch": 0.3792789214197257, + "grad_norm": 0.2354464828968048, + "learning_rate": 1.3702246890186469e-05, + "loss": 0.318, + "step": 10145 + }, + { + "epoch": 0.37946585041007547, + "grad_norm": 0.15742668509483337, + "learning_rate": 1.3696790969262068e-05, + "loss": 0.2672, + "step": 10150 + }, + { + "epoch": 0.37965277940042524, + "grad_norm": 0.21879169344902039, + "learning_rate": 1.369133377342039e-05, + "loss": 0.2698, + "step": 10155 + }, + { + "epoch": 0.3798397083907751, + "grad_norm": 0.8999713063240051, + "learning_rate": 1.3685875304543463e-05, + "loss": 0.4024, + "step": 10160 + }, + { + "epoch": 0.38002663738112485, + "grad_norm": 0.2633039355278015, + "learning_rate": 1.3680415564513759e-05, + "loss": 0.347, + "step": 10165 + }, + { + "epoch": 0.38021356637147463, + "grad_norm": 0.343563437461853, + "learning_rate": 1.3674954555214182e-05, + "loss": 0.3596, + "step": 10170 + }, + { + "epoch": 0.3804004953618244, + "grad_norm": 0.5179651975631714, + "learning_rate": 1.3669492278528079e-05, + "loss": 0.2785, + "step": 10175 + }, + { + "epoch": 0.38058742435217424, + "grad_norm": 0.24854378402233124, + "learning_rate": 1.3664028736339234e-05, + "loss": 0.2958, + "step": 10180 + }, + { + "epoch": 0.380774353342524, + "grad_norm": 0.39850738644599915, + "learning_rate": 1.3658563930531865e-05, + "loss": 0.313, + "step": 10185 + }, + { + "epoch": 0.3809612823328738, + "grad_norm": 0.3405385911464691, + "learning_rate": 1.365309786299063e-05, + "loss": 0.2671, + "step": 10190 + }, + { + "epoch": 0.3811482113232236, + "grad_norm": 0.3381601572036743, + "learning_rate": 1.3647630535600613e-05, + "loss": 0.2986, + "step": 10195 + }, + { + "epoch": 0.3813351403135734, + "grad_norm": 0.3975812792778015, + "learning_rate": 1.3642161950247345e-05, + "loss": 0.3056, + "step": 10200 + }, + { + "epoch": 0.3815220693039232, + "grad_norm": 0.5125154852867126, + "learning_rate": 1.363669210881678e-05, + "loss": 0.2916, + "step": 10205 + }, + { + "epoch": 0.38170899829427296, + "grad_norm": 0.29686829447746277, + "learning_rate": 1.3631221013195313e-05, + "loss": 0.3361, + "step": 10210 + }, + { + "epoch": 0.38189592728462274, + "grad_norm": 0.596336841583252, + "learning_rate": 1.3625748665269765e-05, + "loss": 0.2738, + "step": 10215 + }, + { + "epoch": 0.38208285627497257, + "grad_norm": 0.24768860638141632, + "learning_rate": 1.3620275066927392e-05, + "loss": 0.3018, + "step": 10220 + }, + { + "epoch": 0.38226978526532235, + "grad_norm": 0.30476251244544983, + "learning_rate": 1.3614800220055884e-05, + "loss": 0.2602, + "step": 10225 + }, + { + "epoch": 0.3824567142556721, + "grad_norm": 0.46634262800216675, + "learning_rate": 1.3609324126543357e-05, + "loss": 0.2068, + "step": 10230 + }, + { + "epoch": 0.3826436432460219, + "grad_norm": 0.264059454202652, + "learning_rate": 1.3603846788278357e-05, + "loss": 0.3386, + "step": 10235 + }, + { + "epoch": 0.38283057223637174, + "grad_norm": 0.37096327543258667, + "learning_rate": 1.3598368207149865e-05, + "loss": 0.3036, + "step": 10240 + }, + { + "epoch": 0.3830175012267215, + "grad_norm": 0.35662344098091125, + "learning_rate": 1.3592888385047285e-05, + "loss": 0.3593, + "step": 10245 + }, + { + "epoch": 0.3832044302170713, + "grad_norm": 0.4874442517757416, + "learning_rate": 1.358740732386045e-05, + "loss": 0.2687, + "step": 10250 + }, + { + "epoch": 0.38339135920742107, + "grad_norm": 0.3303247392177582, + "learning_rate": 1.3581925025479618e-05, + "loss": 0.3349, + "step": 10255 + }, + { + "epoch": 0.38357828819777084, + "grad_norm": 0.31140390038490295, + "learning_rate": 1.3576441491795484e-05, + "loss": 0.2713, + "step": 10260 + }, + { + "epoch": 0.3837652171881207, + "grad_norm": 0.2794208824634552, + "learning_rate": 1.3570956724699156e-05, + "loss": 0.2632, + "step": 10265 + }, + { + "epoch": 0.38395214617847045, + "grad_norm": 0.5220376253128052, + "learning_rate": 1.3565470726082176e-05, + "loss": 0.3356, + "step": 10270 + }, + { + "epoch": 0.38413907516882023, + "grad_norm": 0.31608107686042786, + "learning_rate": 1.3559983497836504e-05, + "loss": 0.3079, + "step": 10275 + }, + { + "epoch": 0.38432600415917, + "grad_norm": 0.44200223684310913, + "learning_rate": 1.3554495041854532e-05, + "loss": 0.2978, + "step": 10280 + }, + { + "epoch": 0.38451293314951984, + "grad_norm": 0.2726639211177826, + "learning_rate": 1.3549005360029066e-05, + "loss": 0.2671, + "step": 10285 + }, + { + "epoch": 0.3846998621398696, + "grad_norm": 0.410034716129303, + "learning_rate": 1.3543514454253346e-05, + "loss": 0.2938, + "step": 10290 + }, + { + "epoch": 0.3848867911302194, + "grad_norm": 0.33769235014915466, + "learning_rate": 1.3538022326421023e-05, + "loss": 0.3121, + "step": 10295 + }, + { + "epoch": 0.3850737201205692, + "grad_norm": 0.4343665540218353, + "learning_rate": 1.3532528978426178e-05, + "loss": 0.2907, + "step": 10300 + }, + { + "epoch": 0.385260649110919, + "grad_norm": 0.45513853430747986, + "learning_rate": 1.352703441216331e-05, + "loss": 0.2766, + "step": 10305 + }, + { + "epoch": 0.3854475781012688, + "grad_norm": 0.49546775221824646, + "learning_rate": 1.3521538629527336e-05, + "loss": 0.3089, + "step": 10310 + }, + { + "epoch": 0.38563450709161856, + "grad_norm": 0.6482167840003967, + "learning_rate": 1.351604163241359e-05, + "loss": 0.2859, + "step": 10315 + }, + { + "epoch": 0.38582143608196834, + "grad_norm": 0.6202002763748169, + "learning_rate": 1.3510543422717835e-05, + "loss": 0.3149, + "step": 10320 + }, + { + "epoch": 0.38600836507231817, + "grad_norm": 0.7666960954666138, + "learning_rate": 1.3505044002336241e-05, + "loss": 0.2812, + "step": 10325 + }, + { + "epoch": 0.38619529406266795, + "grad_norm": 0.42803874611854553, + "learning_rate": 1.3499543373165402e-05, + "loss": 0.3083, + "step": 10330 + }, + { + "epoch": 0.3863822230530177, + "grad_norm": 0.5874767899513245, + "learning_rate": 1.3494041537102332e-05, + "loss": 0.3362, + "step": 10335 + }, + { + "epoch": 0.3865691520433675, + "grad_norm": 0.5874844789505005, + "learning_rate": 1.348853849604445e-05, + "loss": 0.3628, + "step": 10340 + }, + { + "epoch": 0.38675608103371734, + "grad_norm": 0.454783171415329, + "learning_rate": 1.34830342518896e-05, + "loss": 0.2621, + "step": 10345 + }, + { + "epoch": 0.3869430100240671, + "grad_norm": 0.3771190345287323, + "learning_rate": 1.347752880653604e-05, + "loss": 0.2833, + "step": 10350 + }, + { + "epoch": 0.3871299390144169, + "grad_norm": 0.30891087651252747, + "learning_rate": 1.3472022161882439e-05, + "loss": 0.2938, + "step": 10355 + }, + { + "epoch": 0.38731686800476667, + "grad_norm": 0.5641288757324219, + "learning_rate": 1.346651431982788e-05, + "loss": 0.3008, + "step": 10360 + }, + { + "epoch": 0.3875037969951165, + "grad_norm": 0.19588641822338104, + "learning_rate": 1.3461005282271857e-05, + "loss": 0.2773, + "step": 10365 + }, + { + "epoch": 0.3876907259854663, + "grad_norm": 0.6388927102088928, + "learning_rate": 1.3455495051114283e-05, + "loss": 0.2966, + "step": 10370 + }, + { + "epoch": 0.38787765497581606, + "grad_norm": 0.4509037733078003, + "learning_rate": 1.344998362825548e-05, + "loss": 0.3202, + "step": 10375 + }, + { + "epoch": 0.38806458396616583, + "grad_norm": 0.3036980628967285, + "learning_rate": 1.344447101559618e-05, + "loss": 0.2795, + "step": 10380 + }, + { + "epoch": 0.38825151295651567, + "grad_norm": 0.34260863065719604, + "learning_rate": 1.343895721503752e-05, + "loss": 0.2886, + "step": 10385 + }, + { + "epoch": 0.38843844194686544, + "grad_norm": 0.2766103744506836, + "learning_rate": 1.3433442228481055e-05, + "loss": 0.3462, + "step": 10390 + }, + { + "epoch": 0.3886253709372152, + "grad_norm": 0.39682698249816895, + "learning_rate": 1.3427926057828749e-05, + "loss": 0.294, + "step": 10395 + }, + { + "epoch": 0.388812299927565, + "grad_norm": 0.3901638388633728, + "learning_rate": 1.3422408704982968e-05, + "loss": 0.2803, + "step": 10400 + }, + { + "epoch": 0.38899922891791483, + "grad_norm": 0.39301908016204834, + "learning_rate": 1.3416890171846486e-05, + "loss": 0.3174, + "step": 10405 + }, + { + "epoch": 0.3891861579082646, + "grad_norm": 0.39801543951034546, + "learning_rate": 1.3411370460322493e-05, + "loss": 0.3569, + "step": 10410 + }, + { + "epoch": 0.3893730868986144, + "grad_norm": 0.3659820854663849, + "learning_rate": 1.3405849572314574e-05, + "loss": 0.3338, + "step": 10415 + }, + { + "epoch": 0.38956001588896416, + "grad_norm": 0.28667154908180237, + "learning_rate": 1.3400327509726726e-05, + "loss": 0.2569, + "step": 10420 + }, + { + "epoch": 0.389746944879314, + "grad_norm": 0.4689178466796875, + "learning_rate": 1.3394804274463358e-05, + "loss": 0.279, + "step": 10425 + }, + { + "epoch": 0.38993387386966377, + "grad_norm": 0.4040241539478302, + "learning_rate": 1.3389279868429264e-05, + "loss": 0.2931, + "step": 10430 + }, + { + "epoch": 0.39012080286001355, + "grad_norm": 0.33305883407592773, + "learning_rate": 1.3383754293529659e-05, + "loss": 0.3057, + "step": 10435 + }, + { + "epoch": 0.3903077318503633, + "grad_norm": 0.37228256464004517, + "learning_rate": 1.3378227551670155e-05, + "loss": 0.315, + "step": 10440 + }, + { + "epoch": 0.39049466084071316, + "grad_norm": 0.4526333510875702, + "learning_rate": 1.3372699644756769e-05, + "loss": 0.294, + "step": 10445 + }, + { + "epoch": 0.39068158983106294, + "grad_norm": 0.4754495620727539, + "learning_rate": 1.3367170574695916e-05, + "loss": 0.2909, + "step": 10450 + }, + { + "epoch": 0.3908685188214127, + "grad_norm": 0.4171309769153595, + "learning_rate": 1.3361640343394411e-05, + "loss": 0.2908, + "step": 10455 + }, + { + "epoch": 0.3910554478117625, + "grad_norm": 0.3807438313961029, + "learning_rate": 1.3356108952759472e-05, + "loss": 0.3104, + "step": 10460 + }, + { + "epoch": 0.3912423768021123, + "grad_norm": 0.20601804554462433, + "learning_rate": 1.3350576404698725e-05, + "loss": 0.299, + "step": 10465 + }, + { + "epoch": 0.3914293057924621, + "grad_norm": 0.4232652187347412, + "learning_rate": 1.334504270112018e-05, + "loss": 0.3072, + "step": 10470 + }, + { + "epoch": 0.3916162347828119, + "grad_norm": 0.40767017006874084, + "learning_rate": 1.3339507843932259e-05, + "loss": 0.273, + "step": 10475 + }, + { + "epoch": 0.39180316377316166, + "grad_norm": 0.49084317684173584, + "learning_rate": 1.3333971835043767e-05, + "loss": 0.2653, + "step": 10480 + }, + { + "epoch": 0.3919900927635115, + "grad_norm": 0.5394593477249146, + "learning_rate": 1.332843467636392e-05, + "loss": 0.3149, + "step": 10485 + }, + { + "epoch": 0.39217702175386127, + "grad_norm": 0.2563055157661438, + "learning_rate": 1.3322896369802325e-05, + "loss": 0.3443, + "step": 10490 + }, + { + "epoch": 0.39236395074421104, + "grad_norm": 0.2230866700410843, + "learning_rate": 1.3317356917268987e-05, + "loss": 0.2512, + "step": 10495 + }, + { + "epoch": 0.3925508797345608, + "grad_norm": 0.5398097038269043, + "learning_rate": 1.33118163206743e-05, + "loss": 0.2493, + "step": 10500 + }, + { + "epoch": 0.3927378087249106, + "grad_norm": 0.36351341009140015, + "learning_rate": 1.3306274581929059e-05, + "loss": 0.3472, + "step": 10505 + }, + { + "epoch": 0.39292473771526043, + "grad_norm": 0.34674593806266785, + "learning_rate": 1.3300731702944453e-05, + "loss": 0.3117, + "step": 10510 + }, + { + "epoch": 0.3931116667056102, + "grad_norm": 0.2768007516860962, + "learning_rate": 1.329518768563206e-05, + "loss": 0.2991, + "step": 10515 + }, + { + "epoch": 0.39329859569596, + "grad_norm": 0.7572014331817627, + "learning_rate": 1.3289642531903857e-05, + "loss": 0.3921, + "step": 10520 + }, + { + "epoch": 0.39348552468630976, + "grad_norm": 0.34211087226867676, + "learning_rate": 1.32840962436722e-05, + "loss": 0.3197, + "step": 10525 + }, + { + "epoch": 0.3936724536766596, + "grad_norm": 0.5510493516921997, + "learning_rate": 1.3278548822849853e-05, + "loss": 0.2646, + "step": 10530 + }, + { + "epoch": 0.3938593826670094, + "grad_norm": 0.39801833033561707, + "learning_rate": 1.3273000271349959e-05, + "loss": 0.2889, + "step": 10535 + }, + { + "epoch": 0.39404631165735915, + "grad_norm": 0.3157884180545807, + "learning_rate": 1.3267450591086051e-05, + "loss": 0.3597, + "step": 10540 + }, + { + "epoch": 0.3942332406477089, + "grad_norm": 0.5300944447517395, + "learning_rate": 1.3261899783972062e-05, + "loss": 0.3158, + "step": 10545 + }, + { + "epoch": 0.39442016963805876, + "grad_norm": 0.48455068469047546, + "learning_rate": 1.32563478519223e-05, + "loss": 0.2964, + "step": 10550 + }, + { + "epoch": 0.39460709862840854, + "grad_norm": 0.33385178446769714, + "learning_rate": 1.3250794796851474e-05, + "loss": 0.3481, + "step": 10555 + }, + { + "epoch": 0.3947940276187583, + "grad_norm": 0.35784122347831726, + "learning_rate": 1.3245240620674667e-05, + "loss": 0.3654, + "step": 10560 + }, + { + "epoch": 0.3949809566091081, + "grad_norm": 0.19756436347961426, + "learning_rate": 1.3239685325307359e-05, + "loss": 0.3123, + "step": 10565 + }, + { + "epoch": 0.3951678855994579, + "grad_norm": 0.33110281825065613, + "learning_rate": 1.323412891266541e-05, + "loss": 0.291, + "step": 10570 + }, + { + "epoch": 0.3953548145898077, + "grad_norm": 0.2022867351770401, + "learning_rate": 1.322857138466507e-05, + "loss": 0.3221, + "step": 10575 + }, + { + "epoch": 0.3955417435801575, + "grad_norm": 0.49643459916114807, + "learning_rate": 1.3223012743222967e-05, + "loss": 0.2867, + "step": 10580 + }, + { + "epoch": 0.39572867257050726, + "grad_norm": 0.43748632073402405, + "learning_rate": 1.3217452990256122e-05, + "loss": 0.334, + "step": 10585 + }, + { + "epoch": 0.3959156015608571, + "grad_norm": 0.6567883491516113, + "learning_rate": 1.3211892127681934e-05, + "loss": 0.3096, + "step": 10590 + }, + { + "epoch": 0.39610253055120687, + "grad_norm": 0.5701555609703064, + "learning_rate": 1.320633015741818e-05, + "loss": 0.249, + "step": 10595 + }, + { + "epoch": 0.39628945954155664, + "grad_norm": 0.33283892273902893, + "learning_rate": 1.3200767081383028e-05, + "loss": 0.3294, + "step": 10600 + }, + { + "epoch": 0.3964763885319064, + "grad_norm": 0.3843041956424713, + "learning_rate": 1.3195202901495024e-05, + "loss": 0.2428, + "step": 10605 + }, + { + "epoch": 0.39666331752225625, + "grad_norm": 0.2713102698326111, + "learning_rate": 1.3189637619673095e-05, + "loss": 0.2753, + "step": 10610 + }, + { + "epoch": 0.39685024651260603, + "grad_norm": 0.28656595945358276, + "learning_rate": 1.3184071237836544e-05, + "loss": 0.2965, + "step": 10615 + }, + { + "epoch": 0.3970371755029558, + "grad_norm": 0.30529990792274475, + "learning_rate": 1.3178503757905058e-05, + "loss": 0.3222, + "step": 10620 + }, + { + "epoch": 0.3972241044933056, + "grad_norm": 0.3871190845966339, + "learning_rate": 1.3172935181798703e-05, + "loss": 0.362, + "step": 10625 + }, + { + "epoch": 0.3974110334836554, + "grad_norm": 0.5080790519714355, + "learning_rate": 1.3167365511437919e-05, + "loss": 0.238, + "step": 10630 + }, + { + "epoch": 0.3975979624740052, + "grad_norm": 0.3145684003829956, + "learning_rate": 1.3161794748743525e-05, + "loss": 0.2148, + "step": 10635 + }, + { + "epoch": 0.397784891464355, + "grad_norm": 0.7068213820457458, + "learning_rate": 1.315622289563672e-05, + "loss": 0.2557, + "step": 10640 + }, + { + "epoch": 0.39797182045470475, + "grad_norm": 0.3006476163864136, + "learning_rate": 1.3150649954039078e-05, + "loss": 0.3824, + "step": 10645 + }, + { + "epoch": 0.3981587494450546, + "grad_norm": 0.269481360912323, + "learning_rate": 1.3145075925872543e-05, + "loss": 0.3222, + "step": 10650 + }, + { + "epoch": 0.39834567843540436, + "grad_norm": 0.42372992634773254, + "learning_rate": 1.3139500813059438e-05, + "loss": 0.2562, + "step": 10655 + }, + { + "epoch": 0.39853260742575414, + "grad_norm": 0.4050086736679077, + "learning_rate": 1.3133924617522462e-05, + "loss": 0.2818, + "step": 10660 + }, + { + "epoch": 0.3987195364161039, + "grad_norm": 0.34156128764152527, + "learning_rate": 1.3128347341184684e-05, + "loss": 0.3041, + "step": 10665 + }, + { + "epoch": 0.39890646540645375, + "grad_norm": 0.25688794255256653, + "learning_rate": 1.3122768985969546e-05, + "loss": 0.3239, + "step": 10670 + }, + { + "epoch": 0.3990933943968035, + "grad_norm": 0.4633738398551941, + "learning_rate": 1.3117189553800861e-05, + "loss": 0.267, + "step": 10675 + }, + { + "epoch": 0.3992803233871533, + "grad_norm": 0.41393613815307617, + "learning_rate": 1.311160904660282e-05, + "loss": 0.2762, + "step": 10680 + }, + { + "epoch": 0.3994672523775031, + "grad_norm": 0.41371089220046997, + "learning_rate": 1.3106027466299977e-05, + "loss": 0.2862, + "step": 10685 + }, + { + "epoch": 0.3996541813678529, + "grad_norm": 0.4804210662841797, + "learning_rate": 1.310044481481726e-05, + "loss": 0.2986, + "step": 10690 + }, + { + "epoch": 0.3998411103582027, + "grad_norm": 0.5706376433372498, + "learning_rate": 1.3094861094079965e-05, + "loss": 0.3143, + "step": 10695 + }, + { + "epoch": 0.40002803934855247, + "grad_norm": 0.6555120944976807, + "learning_rate": 1.3089276306013759e-05, + "loss": 0.2877, + "step": 10700 + }, + { + "epoch": 0.40021496833890224, + "grad_norm": 0.40146076679229736, + "learning_rate": 1.3083690452544673e-05, + "loss": 0.2458, + "step": 10705 + }, + { + "epoch": 0.4004018973292521, + "grad_norm": 0.24645639955997467, + "learning_rate": 1.307810353559911e-05, + "loss": 0.3734, + "step": 10710 + }, + { + "epoch": 0.40058882631960185, + "grad_norm": 0.34604454040527344, + "learning_rate": 1.3072515557103835e-05, + "loss": 0.3489, + "step": 10715 + }, + { + "epoch": 0.40077575530995163, + "grad_norm": 0.3194914758205414, + "learning_rate": 1.3066926518985984e-05, + "loss": 0.2924, + "step": 10720 + }, + { + "epoch": 0.4009626843003014, + "grad_norm": 0.38813045620918274, + "learning_rate": 1.3061336423173053e-05, + "loss": 0.3895, + "step": 10725 + }, + { + "epoch": 0.40114961329065124, + "grad_norm": 0.4124408960342407, + "learning_rate": 1.305574527159291e-05, + "loss": 0.2377, + "step": 10730 + }, + { + "epoch": 0.401336542281001, + "grad_norm": 0.37710294127464294, + "learning_rate": 1.305015306617378e-05, + "loss": 0.2393, + "step": 10735 + }, + { + "epoch": 0.4015234712713508, + "grad_norm": 0.34080588817596436, + "learning_rate": 1.3044559808844257e-05, + "loss": 0.3022, + "step": 10740 + }, + { + "epoch": 0.4017104002617006, + "grad_norm": 0.4511391520500183, + "learning_rate": 1.3038965501533291e-05, + "loss": 0.2905, + "step": 10745 + }, + { + "epoch": 0.40189732925205035, + "grad_norm": 0.5546760559082031, + "learning_rate": 1.3033370146170201e-05, + "loss": 0.3291, + "step": 10750 + }, + { + "epoch": 0.4020842582424002, + "grad_norm": 0.3765566647052765, + "learning_rate": 1.3027773744684669e-05, + "loss": 0.2396, + "step": 10755 + }, + { + "epoch": 0.40227118723274996, + "grad_norm": 0.32864052057266235, + "learning_rate": 1.3022176299006726e-05, + "loss": 0.3394, + "step": 10760 + }, + { + "epoch": 0.40245811622309974, + "grad_norm": 0.3548523187637329, + "learning_rate": 1.3016577811066775e-05, + "loss": 0.3865, + "step": 10765 + }, + { + "epoch": 0.4026450452134495, + "grad_norm": 0.8239120841026306, + "learning_rate": 1.3010978282795569e-05, + "loss": 0.2682, + "step": 10770 + }, + { + "epoch": 0.40283197420379935, + "grad_norm": 0.3894842565059662, + "learning_rate": 1.3005377716124232e-05, + "loss": 0.335, + "step": 10775 + }, + { + "epoch": 0.4030189031941491, + "grad_norm": 0.5358176231384277, + "learning_rate": 1.2999776112984237e-05, + "loss": 0.2949, + "step": 10780 + }, + { + "epoch": 0.4032058321844989, + "grad_norm": 0.5368191003799438, + "learning_rate": 1.2994173475307412e-05, + "loss": 0.2752, + "step": 10785 + }, + { + "epoch": 0.4033927611748487, + "grad_norm": 0.30042341351509094, + "learning_rate": 1.298856980502595e-05, + "loss": 0.2578, + "step": 10790 + }, + { + "epoch": 0.4035796901651985, + "grad_norm": 0.3132897913455963, + "learning_rate": 1.2982965104072397e-05, + "loss": 0.3735, + "step": 10795 + }, + { + "epoch": 0.4037666191555483, + "grad_norm": 2.302870988845825, + "learning_rate": 1.2977359374379652e-05, + "loss": 0.3051, + "step": 10800 + }, + { + "epoch": 0.40395354814589807, + "grad_norm": 0.31478455662727356, + "learning_rate": 1.2971752617880972e-05, + "loss": 0.2558, + "step": 10805 + }, + { + "epoch": 0.40414047713624784, + "grad_norm": 0.5957106947898865, + "learning_rate": 1.2966144836509964e-05, + "loss": 0.3231, + "step": 10810 + }, + { + "epoch": 0.4043274061265977, + "grad_norm": 0.49217236042022705, + "learning_rate": 1.2960536032200592e-05, + "loss": 0.2385, + "step": 10815 + }, + { + "epoch": 0.40451433511694745, + "grad_norm": 0.39011693000793457, + "learning_rate": 1.2954926206887174e-05, + "loss": 0.3561, + "step": 10820 + }, + { + "epoch": 0.40470126410729723, + "grad_norm": 0.4122646152973175, + "learning_rate": 1.2949315362504376e-05, + "loss": 0.3262, + "step": 10825 + }, + { + "epoch": 0.404888193097647, + "grad_norm": 0.3655274510383606, + "learning_rate": 1.2943703500987218e-05, + "loss": 0.2638, + "step": 10830 + }, + { + "epoch": 0.40507512208799684, + "grad_norm": 0.39280015230178833, + "learning_rate": 1.293809062427107e-05, + "loss": 0.3355, + "step": 10835 + }, + { + "epoch": 0.4052620510783466, + "grad_norm": 0.5871797800064087, + "learning_rate": 1.2932476734291652e-05, + "loss": 0.3217, + "step": 10840 + }, + { + "epoch": 0.4054489800686964, + "grad_norm": 0.22152677178382874, + "learning_rate": 1.2926861832985036e-05, + "loss": 0.2709, + "step": 10845 + }, + { + "epoch": 0.4056359090590462, + "grad_norm": 0.4744000732898712, + "learning_rate": 1.292124592228764e-05, + "loss": 0.3501, + "step": 10850 + }, + { + "epoch": 0.405822838049396, + "grad_norm": 0.34133681654930115, + "learning_rate": 1.2915629004136228e-05, + "loss": 0.2422, + "step": 10855 + }, + { + "epoch": 0.4060097670397458, + "grad_norm": 0.5644049048423767, + "learning_rate": 1.2910011080467917e-05, + "loss": 0.2745, + "step": 10860 + }, + { + "epoch": 0.40619669603009556, + "grad_norm": 0.48984065651893616, + "learning_rate": 1.2904392153220164e-05, + "loss": 0.3229, + "step": 10865 + }, + { + "epoch": 0.40638362502044534, + "grad_norm": 0.30956414341926575, + "learning_rate": 1.2898772224330778e-05, + "loss": 0.3053, + "step": 10870 + }, + { + "epoch": 0.40657055401079517, + "grad_norm": 0.24445974826812744, + "learning_rate": 1.2893151295737916e-05, + "loss": 0.2011, + "step": 10875 + }, + { + "epoch": 0.40675748300114495, + "grad_norm": 1.567341923713684, + "learning_rate": 1.288752936938007e-05, + "loss": 0.3163, + "step": 10880 + }, + { + "epoch": 0.4069444119914947, + "grad_norm": 0.28451815247535706, + "learning_rate": 1.2881906447196082e-05, + "loss": 0.2405, + "step": 10885 + }, + { + "epoch": 0.4071313409818445, + "grad_norm": 0.252210408449173, + "learning_rate": 1.287628253112514e-05, + "loss": 0.245, + "step": 10890 + }, + { + "epoch": 0.40731826997219434, + "grad_norm": 0.4983895421028137, + "learning_rate": 1.2870657623106766e-05, + "loss": 0.3025, + "step": 10895 + }, + { + "epoch": 0.4075051989625441, + "grad_norm": 0.6142060160636902, + "learning_rate": 1.2865031725080834e-05, + "loss": 0.3184, + "step": 10900 + }, + { + "epoch": 0.4076921279528939, + "grad_norm": 0.34046661853790283, + "learning_rate": 1.2859404838987552e-05, + "loss": 0.3181, + "step": 10905 + }, + { + "epoch": 0.40787905694324367, + "grad_norm": 0.7890579700469971, + "learning_rate": 1.2853776966767475e-05, + "loss": 0.3095, + "step": 10910 + }, + { + "epoch": 0.4080659859335935, + "grad_norm": 0.4729807376861572, + "learning_rate": 1.284814811036149e-05, + "loss": 0.3228, + "step": 10915 + }, + { + "epoch": 0.4082529149239433, + "grad_norm": 0.5021012425422668, + "learning_rate": 1.2842518271710836e-05, + "loss": 0.2234, + "step": 10920 + }, + { + "epoch": 0.40843984391429305, + "grad_norm": 0.36319997906684875, + "learning_rate": 1.2836887452757076e-05, + "loss": 0.2692, + "step": 10925 + }, + { + "epoch": 0.40862677290464283, + "grad_norm": 0.44691482186317444, + "learning_rate": 1.2831255655442122e-05, + "loss": 0.3243, + "step": 10930 + }, + { + "epoch": 0.40881370189499266, + "grad_norm": 0.2757837772369385, + "learning_rate": 1.2825622881708218e-05, + "loss": 0.3383, + "step": 10935 + }, + { + "epoch": 0.40900063088534244, + "grad_norm": 0.3655282258987427, + "learning_rate": 1.2819989133497945e-05, + "loss": 0.216, + "step": 10940 + }, + { + "epoch": 0.4091875598756922, + "grad_norm": 0.2834169864654541, + "learning_rate": 1.2814354412754227e-05, + "loss": 0.2807, + "step": 10945 + }, + { + "epoch": 0.409374488866042, + "grad_norm": 0.49131330847740173, + "learning_rate": 1.2808718721420308e-05, + "loss": 0.2887, + "step": 10950 + }, + { + "epoch": 0.40956141785639183, + "grad_norm": 0.2866436541080475, + "learning_rate": 1.2803082061439784e-05, + "loss": 0.3064, + "step": 10955 + }, + { + "epoch": 0.4097483468467416, + "grad_norm": 0.5651163458824158, + "learning_rate": 1.2797444434756571e-05, + "loss": 0.2649, + "step": 10960 + }, + { + "epoch": 0.4099352758370914, + "grad_norm": 0.3305072486400604, + "learning_rate": 1.2791805843314937e-05, + "loss": 0.3414, + "step": 10965 + }, + { + "epoch": 0.41012220482744116, + "grad_norm": 0.4713725745677948, + "learning_rate": 1.2786166289059461e-05, + "loss": 0.2735, + "step": 10970 + }, + { + "epoch": 0.410309133817791, + "grad_norm": 0.45371806621551514, + "learning_rate": 1.2780525773935063e-05, + "loss": 0.231, + "step": 10975 + }, + { + "epoch": 0.41049606280814077, + "grad_norm": 0.3463742136955261, + "learning_rate": 1.2774884299887e-05, + "loss": 0.338, + "step": 10980 + }, + { + "epoch": 0.41068299179849055, + "grad_norm": 0.30682888627052307, + "learning_rate": 1.2769241868860851e-05, + "loss": 0.2203, + "step": 10985 + }, + { + "epoch": 0.4108699207888403, + "grad_norm": 0.5777801275253296, + "learning_rate": 1.2763598482802531e-05, + "loss": 0.2488, + "step": 10990 + }, + { + "epoch": 0.4110568497791901, + "grad_norm": 0.3914337754249573, + "learning_rate": 1.2757954143658285e-05, + "loss": 0.3193, + "step": 10995 + }, + { + "epoch": 0.41124377876953994, + "grad_norm": 0.37003767490386963, + "learning_rate": 1.2752308853374675e-05, + "loss": 0.3423, + "step": 11000 + }, + { + "epoch": 0.4114307077598897, + "grad_norm": 0.6970301866531372, + "learning_rate": 1.2746662613898605e-05, + "loss": 0.3753, + "step": 11005 + }, + { + "epoch": 0.4116176367502395, + "grad_norm": 0.5969115495681763, + "learning_rate": 1.2741015427177304e-05, + "loss": 0.3479, + "step": 11010 + }, + { + "epoch": 0.41180456574058927, + "grad_norm": 0.4280428886413574, + "learning_rate": 1.2735367295158324e-05, + "loss": 0.2633, + "step": 11015 + }, + { + "epoch": 0.4119914947309391, + "grad_norm": 0.47678300738334656, + "learning_rate": 1.2729718219789538e-05, + "loss": 0.2922, + "step": 11020 + }, + { + "epoch": 0.4121784237212889, + "grad_norm": 0.4477633833885193, + "learning_rate": 1.2724068203019155e-05, + "loss": 0.3189, + "step": 11025 + }, + { + "epoch": 0.41236535271163866, + "grad_norm": 0.330090194940567, + "learning_rate": 1.2718417246795702e-05, + "loss": 0.2159, + "step": 11030 + }, + { + "epoch": 0.41255228170198843, + "grad_norm": 0.33988553285598755, + "learning_rate": 1.2712765353068036e-05, + "loss": 0.2479, + "step": 11035 + }, + { + "epoch": 0.41273921069233827, + "grad_norm": 0.22661490738391876, + "learning_rate": 1.2707112523785327e-05, + "loss": 0.3321, + "step": 11040 + }, + { + "epoch": 0.41292613968268804, + "grad_norm": 0.29933539032936096, + "learning_rate": 1.2701458760897083e-05, + "loss": 0.2927, + "step": 11045 + }, + { + "epoch": 0.4131130686730378, + "grad_norm": 0.39779335260391235, + "learning_rate": 1.2695804066353113e-05, + "loss": 0.3024, + "step": 11050 + }, + { + "epoch": 0.4132999976633876, + "grad_norm": 0.3741395175457001, + "learning_rate": 1.2690148442103567e-05, + "loss": 0.3496, + "step": 11055 + }, + { + "epoch": 0.41348692665373743, + "grad_norm": 0.33514922857284546, + "learning_rate": 1.2684491890098907e-05, + "loss": 0.2939, + "step": 11060 + }, + { + "epoch": 0.4136738556440872, + "grad_norm": 0.2723214030265808, + "learning_rate": 1.2678834412289915e-05, + "loss": 0.2748, + "step": 11065 + }, + { + "epoch": 0.413860784634437, + "grad_norm": 0.2613365054130554, + "learning_rate": 1.267317601062769e-05, + "loss": 0.268, + "step": 11070 + }, + { + "epoch": 0.41404771362478676, + "grad_norm": 0.3753436207771301, + "learning_rate": 1.2667516687063657e-05, + "loss": 0.2862, + "step": 11075 + }, + { + "epoch": 0.4142346426151366, + "grad_norm": 0.38865718245506287, + "learning_rate": 1.2661856443549551e-05, + "loss": 0.3599, + "step": 11080 + }, + { + "epoch": 0.41442157160548637, + "grad_norm": 0.3524521291255951, + "learning_rate": 1.2656195282037432e-05, + "loss": 0.2869, + "step": 11085 + }, + { + "epoch": 0.41460850059583615, + "grad_norm": 0.28500306606292725, + "learning_rate": 1.265053320447967e-05, + "loss": 0.2769, + "step": 11090 + }, + { + "epoch": 0.4147954295861859, + "grad_norm": 0.514741837978363, + "learning_rate": 1.2644870212828949e-05, + "loss": 0.234, + "step": 11095 + }, + { + "epoch": 0.41498235857653576, + "grad_norm": 0.33767464756965637, + "learning_rate": 1.2639206309038279e-05, + "loss": 0.3528, + "step": 11100 + }, + { + "epoch": 0.41516928756688554, + "grad_norm": 0.4422234296798706, + "learning_rate": 1.2633541495060975e-05, + "loss": 0.2436, + "step": 11105 + }, + { + "epoch": 0.4153562165572353, + "grad_norm": 0.41787955164909363, + "learning_rate": 1.2627875772850672e-05, + "loss": 0.3395, + "step": 11110 + }, + { + "epoch": 0.4155431455475851, + "grad_norm": 1.9292072057724, + "learning_rate": 1.2622209144361313e-05, + "loss": 0.2723, + "step": 11115 + }, + { + "epoch": 0.4157300745379349, + "grad_norm": 0.3203314244747162, + "learning_rate": 1.2616541611547155e-05, + "loss": 0.2928, + "step": 11120 + }, + { + "epoch": 0.4159170035282847, + "grad_norm": 0.49962037801742554, + "learning_rate": 1.2610873176362767e-05, + "loss": 0.3809, + "step": 11125 + }, + { + "epoch": 0.4161039325186345, + "grad_norm": 0.4445996880531311, + "learning_rate": 1.2605203840763034e-05, + "loss": 0.3369, + "step": 11130 + }, + { + "epoch": 0.41629086150898426, + "grad_norm": 0.43643853068351746, + "learning_rate": 1.2599533606703144e-05, + "loss": 0.4172, + "step": 11135 + }, + { + "epoch": 0.4164777904993341, + "grad_norm": 0.7465499639511108, + "learning_rate": 1.2593862476138598e-05, + "loss": 0.4366, + "step": 11140 + }, + { + "epoch": 0.41666471948968387, + "grad_norm": 0.34575432538986206, + "learning_rate": 1.2588190451025209e-05, + "loss": 0.2783, + "step": 11145 + }, + { + "epoch": 0.41685164848003364, + "grad_norm": 0.33487460017204285, + "learning_rate": 1.2582517533319094e-05, + "loss": 0.3308, + "step": 11150 + }, + { + "epoch": 0.4170385774703834, + "grad_norm": 0.33010798692703247, + "learning_rate": 1.257684372497668e-05, + "loss": 0.3056, + "step": 11155 + }, + { + "epoch": 0.41722550646073325, + "grad_norm": 0.49482452869415283, + "learning_rate": 1.2571169027954702e-05, + "loss": 0.2821, + "step": 11160 + }, + { + "epoch": 0.41741243545108303, + "grad_norm": 0.23514960706233978, + "learning_rate": 1.25654934442102e-05, + "loss": 0.3404, + "step": 11165 + }, + { + "epoch": 0.4175993644414328, + "grad_norm": 0.7681819200515747, + "learning_rate": 1.2559816975700518e-05, + "loss": 0.3222, + "step": 11170 + }, + { + "epoch": 0.4177862934317826, + "grad_norm": 0.32423636317253113, + "learning_rate": 1.2554139624383307e-05, + "loss": 0.263, + "step": 11175 + }, + { + "epoch": 0.4179732224221324, + "grad_norm": 0.26687395572662354, + "learning_rate": 1.2548461392216531e-05, + "loss": 0.2467, + "step": 11180 + }, + { + "epoch": 0.4181601514124822, + "grad_norm": 0.3909884989261627, + "learning_rate": 1.2542782281158438e-05, + "loss": 0.2949, + "step": 11185 + }, + { + "epoch": 0.418347080402832, + "grad_norm": 0.27226680517196655, + "learning_rate": 1.2537102293167598e-05, + "loss": 0.2839, + "step": 11190 + }, + { + "epoch": 0.41853400939318175, + "grad_norm": 0.3636428415775299, + "learning_rate": 1.2531421430202875e-05, + "loss": 0.2814, + "step": 11195 + }, + { + "epoch": 0.4187209383835316, + "grad_norm": 0.43354684114456177, + "learning_rate": 1.2525739694223436e-05, + "loss": 0.3003, + "step": 11200 + }, + { + "epoch": 0.41890786737388136, + "grad_norm": 0.3936502933502197, + "learning_rate": 1.2520057087188748e-05, + "loss": 0.2926, + "step": 11205 + }, + { + "epoch": 0.41909479636423114, + "grad_norm": 0.37949803471565247, + "learning_rate": 1.2514373611058578e-05, + "loss": 0.3242, + "step": 11210 + }, + { + "epoch": 0.4192817253545809, + "grad_norm": 0.2999209463596344, + "learning_rate": 1.2508689267792994e-05, + "loss": 0.2679, + "step": 11215 + }, + { + "epoch": 0.41946865434493075, + "grad_norm": 0.37106993794441223, + "learning_rate": 1.2503004059352369e-05, + "loss": 0.2887, + "step": 11220 + }, + { + "epoch": 0.4196555833352805, + "grad_norm": 0.286905437707901, + "learning_rate": 1.2497317987697359e-05, + "loss": 0.2298, + "step": 11225 + }, + { + "epoch": 0.4198425123256303, + "grad_norm": 0.5081347823143005, + "learning_rate": 1.2491631054788936e-05, + "loss": 0.305, + "step": 11230 + }, + { + "epoch": 0.4200294413159801, + "grad_norm": 0.43394070863723755, + "learning_rate": 1.2485943262588353e-05, + "loss": 0.2757, + "step": 11235 + }, + { + "epoch": 0.42021637030632986, + "grad_norm": 0.6610644459724426, + "learning_rate": 1.2480254613057172e-05, + "loss": 0.3212, + "step": 11240 + }, + { + "epoch": 0.4204032992966797, + "grad_norm": 0.6914903521537781, + "learning_rate": 1.2474565108157244e-05, + "loss": 0.3204, + "step": 11245 + }, + { + "epoch": 0.42059022828702947, + "grad_norm": 0.31729546189308167, + "learning_rate": 1.2468874749850715e-05, + "loss": 0.2934, + "step": 11250 + }, + { + "epoch": 0.42077715727737924, + "grad_norm": 0.40817737579345703, + "learning_rate": 1.2463183540100028e-05, + "loss": 0.2576, + "step": 11255 + }, + { + "epoch": 0.420964086267729, + "grad_norm": 0.48512113094329834, + "learning_rate": 1.2457491480867917e-05, + "loss": 0.3062, + "step": 11260 + }, + { + "epoch": 0.42115101525807885, + "grad_norm": 0.32723644375801086, + "learning_rate": 1.2451798574117406e-05, + "loss": 0.2556, + "step": 11265 + }, + { + "epoch": 0.42133794424842863, + "grad_norm": 0.3997679054737091, + "learning_rate": 1.2446104821811825e-05, + "loss": 0.2993, + "step": 11270 + }, + { + "epoch": 0.4215248732387784, + "grad_norm": 0.24848319590091705, + "learning_rate": 1.2440410225914779e-05, + "loss": 0.32, + "step": 11275 + }, + { + "epoch": 0.4217118022291282, + "grad_norm": 0.33824488520622253, + "learning_rate": 1.243471478839017e-05, + "loss": 0.3054, + "step": 11280 + }, + { + "epoch": 0.421898731219478, + "grad_norm": 0.3888397216796875, + "learning_rate": 1.2429018511202195e-05, + "loss": 0.2567, + "step": 11285 + }, + { + "epoch": 0.4220856602098278, + "grad_norm": 0.25221407413482666, + "learning_rate": 1.2423321396315338e-05, + "loss": 0.2519, + "step": 11290 + }, + { + "epoch": 0.4222725892001776, + "grad_norm": 0.4825991094112396, + "learning_rate": 1.2417623445694367e-05, + "loss": 0.2718, + "step": 11295 + }, + { + "epoch": 0.42245951819052735, + "grad_norm": 0.4877346456050873, + "learning_rate": 1.2411924661304346e-05, + "loss": 0.2421, + "step": 11300 + }, + { + "epoch": 0.4226464471808772, + "grad_norm": 0.734813392162323, + "learning_rate": 1.2406225045110617e-05, + "loss": 0.2855, + "step": 11305 + }, + { + "epoch": 0.42283337617122696, + "grad_norm": 0.4634631276130676, + "learning_rate": 1.2400524599078816e-05, + "loss": 0.3167, + "step": 11310 + }, + { + "epoch": 0.42302030516157674, + "grad_norm": 0.2482909858226776, + "learning_rate": 1.2394823325174866e-05, + "loss": 0.333, + "step": 11315 + }, + { + "epoch": 0.4232072341519265, + "grad_norm": 0.31496381759643555, + "learning_rate": 1.2389121225364968e-05, + "loss": 0.3233, + "step": 11320 + }, + { + "epoch": 0.42339416314227635, + "grad_norm": 0.46915552020072937, + "learning_rate": 1.2383418301615622e-05, + "loss": 0.33, + "step": 11325 + }, + { + "epoch": 0.4235810921326261, + "grad_norm": 0.3510735332965851, + "learning_rate": 1.2377714555893595e-05, + "loss": 0.2903, + "step": 11330 + }, + { + "epoch": 0.4237680211229759, + "grad_norm": 0.7571256756782532, + "learning_rate": 1.2372009990165948e-05, + "loss": 0.3083, + "step": 11335 + }, + { + "epoch": 0.4239549501133257, + "grad_norm": 0.37375253438949585, + "learning_rate": 1.2366304606400021e-05, + "loss": 0.2759, + "step": 11340 + }, + { + "epoch": 0.4241418791036755, + "grad_norm": 0.4373493790626526, + "learning_rate": 1.236059840656344e-05, + "loss": 0.2503, + "step": 11345 + }, + { + "epoch": 0.4243288080940253, + "grad_norm": 0.284297376871109, + "learning_rate": 1.235489139262411e-05, + "loss": 0.2491, + "step": 11350 + }, + { + "epoch": 0.42451573708437507, + "grad_norm": 0.24776756763458252, + "learning_rate": 1.2349183566550212e-05, + "loss": 0.2494, + "step": 11355 + }, + { + "epoch": 0.42470266607472484, + "grad_norm": 1.0645568370819092, + "learning_rate": 1.2343474930310213e-05, + "loss": 0.2053, + "step": 11360 + }, + { + "epoch": 0.4248895950650747, + "grad_norm": 0.325014591217041, + "learning_rate": 1.2337765485872859e-05, + "loss": 0.2916, + "step": 11365 + }, + { + "epoch": 0.42507652405542445, + "grad_norm": 0.9632283449172974, + "learning_rate": 1.2332055235207179e-05, + "loss": 0.3179, + "step": 11370 + }, + { + "epoch": 0.42526345304577423, + "grad_norm": 1.18338143825531, + "learning_rate": 1.2326344180282468e-05, + "loss": 0.3688, + "step": 11375 + }, + { + "epoch": 0.425450382036124, + "grad_norm": 0.4803732633590698, + "learning_rate": 1.2320632323068306e-05, + "loss": 0.3574, + "step": 11380 + }, + { + "epoch": 0.42563731102647384, + "grad_norm": 0.38758915662765503, + "learning_rate": 1.2314919665534552e-05, + "loss": 0.3133, + "step": 11385 + }, + { + "epoch": 0.4258242400168236, + "grad_norm": 0.4360613524913788, + "learning_rate": 1.2309206209651336e-05, + "loss": 0.2571, + "step": 11390 + }, + { + "epoch": 0.4260111690071734, + "grad_norm": 0.39218610525131226, + "learning_rate": 1.2303491957389069e-05, + "loss": 0.2504, + "step": 11395 + }, + { + "epoch": 0.4261980979975232, + "grad_norm": 0.4993368685245514, + "learning_rate": 1.2297776910718424e-05, + "loss": 0.3063, + "step": 11400 + }, + { + "epoch": 0.426385026987873, + "grad_norm": 0.5808417201042175, + "learning_rate": 1.2292061071610365e-05, + "loss": 0.2726, + "step": 11405 + }, + { + "epoch": 0.4265719559782228, + "grad_norm": 0.31993383169174194, + "learning_rate": 1.2286344442036114e-05, + "loss": 0.386, + "step": 11410 + }, + { + "epoch": 0.42675888496857256, + "grad_norm": 0.17474965751171112, + "learning_rate": 1.228062702396718e-05, + "loss": 0.2783, + "step": 11415 + }, + { + "epoch": 0.42694581395892234, + "grad_norm": 0.5042828917503357, + "learning_rate": 1.2274908819375335e-05, + "loss": 0.2297, + "step": 11420 + }, + { + "epoch": 0.42713274294927217, + "grad_norm": 0.4278635084629059, + "learning_rate": 1.2269189830232622e-05, + "loss": 0.2978, + "step": 11425 + }, + { + "epoch": 0.42731967193962195, + "grad_norm": 0.5413123369216919, + "learning_rate": 1.2263470058511355e-05, + "loss": 0.3018, + "step": 11430 + }, + { + "epoch": 0.4275066009299717, + "grad_norm": 0.3124532401561737, + "learning_rate": 1.2257749506184125e-05, + "loss": 0.3438, + "step": 11435 + }, + { + "epoch": 0.4276935299203215, + "grad_norm": 0.3359132707118988, + "learning_rate": 1.2252028175223778e-05, + "loss": 0.2612, + "step": 11440 + }, + { + "epoch": 0.42788045891067134, + "grad_norm": 0.5123247504234314, + "learning_rate": 1.224630606760345e-05, + "loss": 0.288, + "step": 11445 + }, + { + "epoch": 0.4280673879010211, + "grad_norm": 0.48125380277633667, + "learning_rate": 1.2240583185296517e-05, + "loss": 0.3077, + "step": 11450 + }, + { + "epoch": 0.4282543168913709, + "grad_norm": 0.24318033456802368, + "learning_rate": 1.2234859530276647e-05, + "loss": 0.2642, + "step": 11455 + }, + { + "epoch": 0.42844124588172067, + "grad_norm": 0.3051791489124298, + "learning_rate": 1.2229135104517757e-05, + "loss": 0.3298, + "step": 11460 + }, + { + "epoch": 0.4286281748720705, + "grad_norm": 0.5335453152656555, + "learning_rate": 1.2223409909994048e-05, + "loss": 0.3253, + "step": 11465 + }, + { + "epoch": 0.4288151038624203, + "grad_norm": 0.7249236702919006, + "learning_rate": 1.221768394867997e-05, + "loss": 0.3483, + "step": 11470 + }, + { + "epoch": 0.42900203285277005, + "grad_norm": 0.16120702028274536, + "learning_rate": 1.221195722255024e-05, + "loss": 0.2528, + "step": 11475 + }, + { + "epoch": 0.42918896184311983, + "grad_norm": 0.6941366791725159, + "learning_rate": 1.2206229733579846e-05, + "loss": 0.2684, + "step": 11480 + }, + { + "epoch": 0.4293758908334696, + "grad_norm": 0.4987410306930542, + "learning_rate": 1.2200501483744032e-05, + "loss": 0.3327, + "step": 11485 + }, + { + "epoch": 0.42956281982381944, + "grad_norm": 0.20793752372264862, + "learning_rate": 1.2194772475018309e-05, + "loss": 0.2491, + "step": 11490 + }, + { + "epoch": 0.4297497488141692, + "grad_norm": 0.4314498007297516, + "learning_rate": 1.218904270937845e-05, + "loss": 0.2784, + "step": 11495 + }, + { + "epoch": 0.429936677804519, + "grad_norm": 0.5785399079322815, + "learning_rate": 1.2183312188800483e-05, + "loss": 0.2836, + "step": 11500 + }, + { + "epoch": 0.4301236067948688, + "grad_norm": 0.4460437297821045, + "learning_rate": 1.2177580915260698e-05, + "loss": 0.3361, + "step": 11505 + }, + { + "epoch": 0.4303105357852186, + "grad_norm": 0.34010326862335205, + "learning_rate": 1.2171848890735655e-05, + "loss": 0.2738, + "step": 11510 + }, + { + "epoch": 0.4304974647755684, + "grad_norm": 0.2805013060569763, + "learning_rate": 1.2166116117202162e-05, + "loss": 0.2981, + "step": 11515 + }, + { + "epoch": 0.43068439376591816, + "grad_norm": 0.3375371992588043, + "learning_rate": 1.2160382596637286e-05, + "loss": 0.2759, + "step": 11520 + }, + { + "epoch": 0.43087132275626794, + "grad_norm": 0.5983409881591797, + "learning_rate": 1.2154648331018355e-05, + "loss": 0.2989, + "step": 11525 + }, + { + "epoch": 0.43105825174661777, + "grad_norm": 0.4123179614543915, + "learning_rate": 1.2148913322322952e-05, + "loss": 0.3054, + "step": 11530 + }, + { + "epoch": 0.43124518073696755, + "grad_norm": 0.6426059603691101, + "learning_rate": 1.214317757252892e-05, + "loss": 0.3205, + "step": 11535 + }, + { + "epoch": 0.4314321097273173, + "grad_norm": 0.35368436574935913, + "learning_rate": 1.2137441083614351e-05, + "loss": 0.297, + "step": 11540 + }, + { + "epoch": 0.4316190387176671, + "grad_norm": 0.3455871343612671, + "learning_rate": 1.21317038575576e-05, + "loss": 0.3098, + "step": 11545 + }, + { + "epoch": 0.43180596770801694, + "grad_norm": 0.34755730628967285, + "learning_rate": 1.2125965896337266e-05, + "loss": 0.3382, + "step": 11550 + }, + { + "epoch": 0.4319928966983667, + "grad_norm": 0.5450564622879028, + "learning_rate": 1.2120227201932213e-05, + "loss": 0.2472, + "step": 11555 + }, + { + "epoch": 0.4321798256887165, + "grad_norm": 0.28686073422431946, + "learning_rate": 1.2114487776321553e-05, + "loss": 0.2876, + "step": 11560 + }, + { + "epoch": 0.43236675467906627, + "grad_norm": 0.33250150084495544, + "learning_rate": 1.2108747621484645e-05, + "loss": 0.3267, + "step": 11565 + }, + { + "epoch": 0.4325536836694161, + "grad_norm": 0.5938117504119873, + "learning_rate": 1.2103006739401105e-05, + "loss": 0.319, + "step": 11570 + }, + { + "epoch": 0.4327406126597659, + "grad_norm": 0.32513856887817383, + "learning_rate": 1.20972651320508e-05, + "loss": 0.2741, + "step": 11575 + }, + { + "epoch": 0.43292754165011565, + "grad_norm": 0.39806506037712097, + "learning_rate": 1.2091522801413844e-05, + "loss": 0.3497, + "step": 11580 + }, + { + "epoch": 0.43311447064046543, + "grad_norm": 0.41519683599472046, + "learning_rate": 1.2085779749470603e-05, + "loss": 0.3165, + "step": 11585 + }, + { + "epoch": 0.43330139963081526, + "grad_norm": 0.5625133514404297, + "learning_rate": 1.2080035978201695e-05, + "loss": 0.2961, + "step": 11590 + }, + { + "epoch": 0.43348832862116504, + "grad_norm": 0.3750125765800476, + "learning_rate": 1.2074291489587972e-05, + "loss": 0.3051, + "step": 11595 + }, + { + "epoch": 0.4336752576115148, + "grad_norm": 0.43589699268341064, + "learning_rate": 1.2068546285610556e-05, + "loss": 0.3055, + "step": 11600 + }, + { + "epoch": 0.4338621866018646, + "grad_norm": 0.29387718439102173, + "learning_rate": 1.2062800368250795e-05, + "loss": 0.2602, + "step": 11605 + }, + { + "epoch": 0.43404911559221443, + "grad_norm": 0.4198211133480072, + "learning_rate": 1.2057053739490297e-05, + "loss": 0.2962, + "step": 11610 + }, + { + "epoch": 0.4342360445825642, + "grad_norm": 0.38284847140312195, + "learning_rate": 1.2051306401310904e-05, + "loss": 0.3328, + "step": 11615 + }, + { + "epoch": 0.434422973572914, + "grad_norm": 0.49905702471733093, + "learning_rate": 1.204555835569471e-05, + "loss": 0.2479, + "step": 11620 + }, + { + "epoch": 0.43460990256326376, + "grad_norm": 0.2864576578140259, + "learning_rate": 1.2039809604624053e-05, + "loss": 0.2545, + "step": 11625 + }, + { + "epoch": 0.4347968315536136, + "grad_norm": 0.3651660084724426, + "learning_rate": 1.203406015008151e-05, + "loss": 0.2738, + "step": 11630 + }, + { + "epoch": 0.43498376054396337, + "grad_norm": 0.5334829688072205, + "learning_rate": 1.2028309994049907e-05, + "loss": 0.2902, + "step": 11635 + }, + { + "epoch": 0.43517068953431315, + "grad_norm": 0.4753320813179016, + "learning_rate": 1.20225591385123e-05, + "loss": 0.3009, + "step": 11640 + }, + { + "epoch": 0.4353576185246629, + "grad_norm": 0.3971807360649109, + "learning_rate": 1.2016807585452004e-05, + "loss": 0.4801, + "step": 11645 + }, + { + "epoch": 0.43554454751501276, + "grad_norm": 0.3786313831806183, + "learning_rate": 1.201105533685256e-05, + "loss": 0.2662, + "step": 11650 + }, + { + "epoch": 0.43573147650536254, + "grad_norm": 0.5463781356811523, + "learning_rate": 1.2005302394697755e-05, + "loss": 0.2264, + "step": 11655 + }, + { + "epoch": 0.4359184054957123, + "grad_norm": 0.29028788208961487, + "learning_rate": 1.1999548760971614e-05, + "loss": 0.2305, + "step": 11660 + }, + { + "epoch": 0.4361053344860621, + "grad_norm": 0.30021342635154724, + "learning_rate": 1.1993794437658397e-05, + "loss": 0.3219, + "step": 11665 + }, + { + "epoch": 0.4362922634764119, + "grad_norm": 0.3362751007080078, + "learning_rate": 1.1988039426742608e-05, + "loss": 0.3006, + "step": 11670 + }, + { + "epoch": 0.4364791924667617, + "grad_norm": 0.25194594264030457, + "learning_rate": 1.1982283730208988e-05, + "loss": 0.2612, + "step": 11675 + }, + { + "epoch": 0.4366661214571115, + "grad_norm": 0.5895513296127319, + "learning_rate": 1.1976527350042507e-05, + "loss": 0.2559, + "step": 11680 + }, + { + "epoch": 0.43685305044746126, + "grad_norm": 0.6330528855323792, + "learning_rate": 1.1970770288228377e-05, + "loss": 0.2361, + "step": 11685 + }, + { + "epoch": 0.4370399794378111, + "grad_norm": 0.37626221776008606, + "learning_rate": 1.1965012546752047e-05, + "loss": 0.2266, + "step": 11690 + }, + { + "epoch": 0.43722690842816087, + "grad_norm": 0.3309417963027954, + "learning_rate": 1.1959254127599191e-05, + "loss": 0.265, + "step": 11695 + }, + { + "epoch": 0.43741383741851064, + "grad_norm": 0.5689538717269897, + "learning_rate": 1.1953495032755726e-05, + "loss": 0.2866, + "step": 11700 + }, + { + "epoch": 0.4376007664088604, + "grad_norm": 0.4492878019809723, + "learning_rate": 1.1947735264207804e-05, + "loss": 0.2061, + "step": 11705 + }, + { + "epoch": 0.43778769539921025, + "grad_norm": 0.40549832582473755, + "learning_rate": 1.1941974823941795e-05, + "loss": 0.3215, + "step": 11710 + }, + { + "epoch": 0.43797462438956003, + "grad_norm": 0.667451798915863, + "learning_rate": 1.1936213713944315e-05, + "loss": 0.292, + "step": 11715 + }, + { + "epoch": 0.4381615533799098, + "grad_norm": 0.349060982465744, + "learning_rate": 1.1930451936202203e-05, + "loss": 0.3204, + "step": 11720 + }, + { + "epoch": 0.4383484823702596, + "grad_norm": 0.3580839931964874, + "learning_rate": 1.1924689492702534e-05, + "loss": 0.2576, + "step": 11725 + }, + { + "epoch": 0.43853541136060936, + "grad_norm": 0.49423980712890625, + "learning_rate": 1.1918926385432608e-05, + "loss": 0.2562, + "step": 11730 + }, + { + "epoch": 0.4387223403509592, + "grad_norm": 0.2973886728286743, + "learning_rate": 1.1913162616379956e-05, + "loss": 0.3051, + "step": 11735 + }, + { + "epoch": 0.43890926934130897, + "grad_norm": 0.50880366563797, + "learning_rate": 1.1907398187532337e-05, + "loss": 0.2851, + "step": 11740 + }, + { + "epoch": 0.43909619833165875, + "grad_norm": 0.4905702471733093, + "learning_rate": 1.1901633100877736e-05, + "loss": 0.2635, + "step": 11745 + }, + { + "epoch": 0.4392831273220085, + "grad_norm": 0.21061702072620392, + "learning_rate": 1.1895867358404369e-05, + "loss": 0.2665, + "step": 11750 + }, + { + "epoch": 0.43947005631235836, + "grad_norm": 0.32583504915237427, + "learning_rate": 1.1890100962100672e-05, + "loss": 0.2656, + "step": 11755 + }, + { + "epoch": 0.43965698530270814, + "grad_norm": 0.2895315885543823, + "learning_rate": 1.1884333913955312e-05, + "loss": 0.2501, + "step": 11760 + }, + { + "epoch": 0.4398439142930579, + "grad_norm": 0.40925106406211853, + "learning_rate": 1.187856621595718e-05, + "loss": 0.2625, + "step": 11765 + }, + { + "epoch": 0.4400308432834077, + "grad_norm": 0.1345365196466446, + "learning_rate": 1.1872797870095385e-05, + "loss": 0.3494, + "step": 11770 + }, + { + "epoch": 0.4402177722737575, + "grad_norm": 0.5240778923034668, + "learning_rate": 1.1867028878359266e-05, + "loss": 0.2615, + "step": 11775 + }, + { + "epoch": 0.4404047012641073, + "grad_norm": 0.3817775547504425, + "learning_rate": 1.1861259242738386e-05, + "loss": 0.272, + "step": 11780 + }, + { + "epoch": 0.4405916302544571, + "grad_norm": 0.3994652032852173, + "learning_rate": 1.1855488965222526e-05, + "loss": 0.2841, + "step": 11785 + }, + { + "epoch": 0.44077855924480686, + "grad_norm": 0.461319237947464, + "learning_rate": 1.1849718047801686e-05, + "loss": 0.3008, + "step": 11790 + }, + { + "epoch": 0.4409654882351567, + "grad_norm": 0.5307036638259888, + "learning_rate": 1.1843946492466093e-05, + "loss": 0.2497, + "step": 11795 + }, + { + "epoch": 0.44115241722550647, + "grad_norm": 0.37287020683288574, + "learning_rate": 1.1838174301206194e-05, + "loss": 0.2826, + "step": 11800 + }, + { + "epoch": 0.44133934621585624, + "grad_norm": 0.538077712059021, + "learning_rate": 1.1832401476012645e-05, + "loss": 0.2475, + "step": 11805 + }, + { + "epoch": 0.441526275206206, + "grad_norm": 0.3736201524734497, + "learning_rate": 1.1826628018876334e-05, + "loss": 0.265, + "step": 11810 + }, + { + "epoch": 0.44171320419655585, + "grad_norm": 0.272097647190094, + "learning_rate": 1.1820853931788357e-05, + "loss": 0.3053, + "step": 11815 + }, + { + "epoch": 0.44190013318690563, + "grad_norm": 0.5018362998962402, + "learning_rate": 1.1815079216740033e-05, + "loss": 0.3002, + "step": 11820 + }, + { + "epoch": 0.4420870621772554, + "grad_norm": 0.5243871808052063, + "learning_rate": 1.1809303875722896e-05, + "loss": 0.3192, + "step": 11825 + }, + { + "epoch": 0.4422739911676052, + "grad_norm": 0.20126721262931824, + "learning_rate": 1.1803527910728695e-05, + "loss": 0.3049, + "step": 11830 + }, + { + "epoch": 0.442460920157955, + "grad_norm": 0.824718177318573, + "learning_rate": 1.1797751323749396e-05, + "loss": 0.3462, + "step": 11835 + }, + { + "epoch": 0.4426478491483048, + "grad_norm": 0.6242169737815857, + "learning_rate": 1.1791974116777179e-05, + "loss": 0.2373, + "step": 11840 + }, + { + "epoch": 0.4428347781386546, + "grad_norm": 0.23284803330898285, + "learning_rate": 1.1786196291804432e-05, + "loss": 0.2514, + "step": 11845 + }, + { + "epoch": 0.44302170712900435, + "grad_norm": 0.5042092800140381, + "learning_rate": 1.1780417850823768e-05, + "loss": 0.2548, + "step": 11850 + }, + { + "epoch": 0.4432086361193542, + "grad_norm": 0.28074219822883606, + "learning_rate": 1.1774638795828004e-05, + "loss": 0.2807, + "step": 11855 + }, + { + "epoch": 0.44339556510970396, + "grad_norm": 0.36168691515922546, + "learning_rate": 1.1768859128810167e-05, + "loss": 0.2616, + "step": 11860 + }, + { + "epoch": 0.44358249410005374, + "grad_norm": 0.6940954923629761, + "learning_rate": 1.1763078851763497e-05, + "loss": 0.3262, + "step": 11865 + }, + { + "epoch": 0.4437694230904035, + "grad_norm": 0.29294854402542114, + "learning_rate": 1.1757297966681455e-05, + "loss": 0.2751, + "step": 11870 + }, + { + "epoch": 0.44395635208075335, + "grad_norm": 0.2644307017326355, + "learning_rate": 1.1751516475557696e-05, + "loss": 0.2402, + "step": 11875 + }, + { + "epoch": 0.4441432810711031, + "grad_norm": 1.0284477472305298, + "learning_rate": 1.1745734380386091e-05, + "loss": 0.3128, + "step": 11880 + }, + { + "epoch": 0.4443302100614529, + "grad_norm": 0.3318440616130829, + "learning_rate": 1.1739951683160719e-05, + "loss": 0.2859, + "step": 11885 + }, + { + "epoch": 0.4445171390518027, + "grad_norm": 0.49884122610092163, + "learning_rate": 1.173416838587587e-05, + "loss": 0.3245, + "step": 11890 + }, + { + "epoch": 0.4447040680421525, + "grad_norm": 0.12781549990177155, + "learning_rate": 1.1728384490526035e-05, + "loss": 0.2808, + "step": 11895 + }, + { + "epoch": 0.4448909970325023, + "grad_norm": 0.4701847434043884, + "learning_rate": 1.1722599999105913e-05, + "loss": 0.3074, + "step": 11900 + }, + { + "epoch": 0.44507792602285207, + "grad_norm": 0.28572845458984375, + "learning_rate": 1.1716814913610409e-05, + "loss": 0.3912, + "step": 11905 + }, + { + "epoch": 0.44526485501320184, + "grad_norm": 0.36786168813705444, + "learning_rate": 1.1711029236034633e-05, + "loss": 0.2701, + "step": 11910 + }, + { + "epoch": 0.4454517840035517, + "grad_norm": 0.49527883529663086, + "learning_rate": 1.1705242968373903e-05, + "loss": 0.2451, + "step": 11915 + }, + { + "epoch": 0.44563871299390145, + "grad_norm": 0.46449634432792664, + "learning_rate": 1.1699456112623733e-05, + "loss": 0.2992, + "step": 11920 + }, + { + "epoch": 0.44582564198425123, + "grad_norm": 0.4579066038131714, + "learning_rate": 1.1693668670779847e-05, + "loss": 0.2084, + "step": 11925 + }, + { + "epoch": 0.446012570974601, + "grad_norm": 0.1956116259098053, + "learning_rate": 1.1687880644838164e-05, + "loss": 0.3574, + "step": 11930 + }, + { + "epoch": 0.44619949996495084, + "grad_norm": 0.37438297271728516, + "learning_rate": 1.1682092036794812e-05, + "loss": 0.3066, + "step": 11935 + }, + { + "epoch": 0.4463864289553006, + "grad_norm": 0.37410229444503784, + "learning_rate": 1.1676302848646116e-05, + "loss": 0.3174, + "step": 11940 + }, + { + "epoch": 0.4465733579456504, + "grad_norm": 0.23035752773284912, + "learning_rate": 1.16705130823886e-05, + "loss": 0.2645, + "step": 11945 + }, + { + "epoch": 0.4467602869360002, + "grad_norm": 0.21039049327373505, + "learning_rate": 1.1664722740018983e-05, + "loss": 0.3209, + "step": 11950 + }, + { + "epoch": 0.44694721592635, + "grad_norm": 0.4789251387119293, + "learning_rate": 1.1658931823534196e-05, + "loss": 0.3161, + "step": 11955 + }, + { + "epoch": 0.4471341449166998, + "grad_norm": 0.33394578099250793, + "learning_rate": 1.1653140334931357e-05, + "loss": 0.2549, + "step": 11960 + }, + { + "epoch": 0.44732107390704956, + "grad_norm": 0.5714621543884277, + "learning_rate": 1.1647348276207789e-05, + "loss": 0.3469, + "step": 11965 + }, + { + "epoch": 0.44750800289739934, + "grad_norm": 0.8689758777618408, + "learning_rate": 1.1641555649360998e-05, + "loss": 0.4199, + "step": 11970 + }, + { + "epoch": 0.4476949318877491, + "grad_norm": 0.1597885936498642, + "learning_rate": 1.1635762456388702e-05, + "loss": 0.3961, + "step": 11975 + }, + { + "epoch": 0.44788186087809895, + "grad_norm": 0.46579235792160034, + "learning_rate": 1.1629968699288805e-05, + "loss": 0.2853, + "step": 11980 + }, + { + "epoch": 0.4480687898684487, + "grad_norm": 0.34550490975379944, + "learning_rate": 1.162417438005941e-05, + "loss": 0.2943, + "step": 11985 + }, + { + "epoch": 0.4482557188587985, + "grad_norm": 0.4065869450569153, + "learning_rate": 1.1618379500698808e-05, + "loss": 0.3224, + "step": 11990 + }, + { + "epoch": 0.4484426478491483, + "grad_norm": 0.19699564576148987, + "learning_rate": 1.161258406320549e-05, + "loss": 0.3538, + "step": 11995 + }, + { + "epoch": 0.4486295768394981, + "grad_norm": 0.851565420627594, + "learning_rate": 1.1606788069578132e-05, + "loss": 0.3719, + "step": 12000 + }, + { + "epoch": 0.4488165058298479, + "grad_norm": 0.36260199546813965, + "learning_rate": 1.160099152181561e-05, + "loss": 0.3062, + "step": 12005 + }, + { + "epoch": 0.44900343482019767, + "grad_norm": 0.36545851826667786, + "learning_rate": 1.1595194421916988e-05, + "loss": 0.329, + "step": 12010 + }, + { + "epoch": 0.44919036381054744, + "grad_norm": 0.339874804019928, + "learning_rate": 1.1589396771881518e-05, + "loss": 0.2977, + "step": 12015 + }, + { + "epoch": 0.4493772928008973, + "grad_norm": 0.4972774386405945, + "learning_rate": 1.1583598573708642e-05, + "loss": 0.385, + "step": 12020 + }, + { + "epoch": 0.44956422179124705, + "grad_norm": 0.41324669122695923, + "learning_rate": 1.1577799829397996e-05, + "loss": 0.2967, + "step": 12025 + }, + { + "epoch": 0.44975115078159683, + "grad_norm": 0.44996240735054016, + "learning_rate": 1.1572000540949398e-05, + "loss": 0.2517, + "step": 12030 + }, + { + "epoch": 0.4499380797719466, + "grad_norm": 0.4872210621833801, + "learning_rate": 1.1566200710362854e-05, + "loss": 0.2805, + "step": 12035 + }, + { + "epoch": 0.45012500876229644, + "grad_norm": 0.3203275799751282, + "learning_rate": 1.1560400339638567e-05, + "loss": 0.2458, + "step": 12040 + }, + { + "epoch": 0.4503119377526462, + "grad_norm": 0.733971118927002, + "learning_rate": 1.1554599430776906e-05, + "loss": 0.2399, + "step": 12045 + }, + { + "epoch": 0.450498866742996, + "grad_norm": 0.38670045137405396, + "learning_rate": 1.1548797985778452e-05, + "loss": 0.2588, + "step": 12050 + }, + { + "epoch": 0.4506857957333458, + "grad_norm": 0.33743932843208313, + "learning_rate": 1.1542996006643952e-05, + "loss": 0.2816, + "step": 12055 + }, + { + "epoch": 0.4508727247236956, + "grad_norm": 0.23398756980895996, + "learning_rate": 1.1537193495374342e-05, + "loss": 0.2784, + "step": 12060 + }, + { + "epoch": 0.4510596537140454, + "grad_norm": 0.5807647705078125, + "learning_rate": 1.153139045397074e-05, + "loss": 0.3134, + "step": 12065 + }, + { + "epoch": 0.45124658270439516, + "grad_norm": 0.4573014974594116, + "learning_rate": 1.152558688443445e-05, + "loss": 0.2615, + "step": 12070 + }, + { + "epoch": 0.45143351169474494, + "grad_norm": 0.31440410017967224, + "learning_rate": 1.1519782788766957e-05, + "loss": 0.3578, + "step": 12075 + }, + { + "epoch": 0.45162044068509477, + "grad_norm": 0.19732816517353058, + "learning_rate": 1.1513978168969929e-05, + "loss": 0.2974, + "step": 12080 + }, + { + "epoch": 0.45180736967544455, + "grad_norm": 0.17791102826595306, + "learning_rate": 1.1508173027045214e-05, + "loss": 0.2286, + "step": 12085 + }, + { + "epoch": 0.4519942986657943, + "grad_norm": 0.35120418667793274, + "learning_rate": 1.1502367364994832e-05, + "loss": 0.3173, + "step": 12090 + }, + { + "epoch": 0.4521812276561441, + "grad_norm": 0.5374488234519958, + "learning_rate": 1.1496561184820991e-05, + "loss": 0.2903, + "step": 12095 + }, + { + "epoch": 0.45236815664649394, + "grad_norm": 0.37684205174446106, + "learning_rate": 1.1490754488526084e-05, + "loss": 0.3131, + "step": 12100 + }, + { + "epoch": 0.4525550856368437, + "grad_norm": 0.3075244426727295, + "learning_rate": 1.1484947278112673e-05, + "loss": 0.2625, + "step": 12105 + }, + { + "epoch": 0.4527420146271935, + "grad_norm": 0.5102051496505737, + "learning_rate": 1.147913955558349e-05, + "loss": 0.3038, + "step": 12110 + }, + { + "epoch": 0.45292894361754327, + "grad_norm": 0.2178102284669876, + "learning_rate": 1.1473331322941457e-05, + "loss": 0.2539, + "step": 12115 + }, + { + "epoch": 0.4531158726078931, + "grad_norm": 0.20256425440311432, + "learning_rate": 1.1467522582189667e-05, + "loss": 0.2735, + "step": 12120 + }, + { + "epoch": 0.4533028015982429, + "grad_norm": 0.5971643924713135, + "learning_rate": 1.1461713335331389e-05, + "loss": 0.4114, + "step": 12125 + }, + { + "epoch": 0.45348973058859265, + "grad_norm": 0.5443209409713745, + "learning_rate": 1.1455903584370065e-05, + "loss": 0.3009, + "step": 12130 + }, + { + "epoch": 0.45367665957894243, + "grad_norm": 0.6024529933929443, + "learning_rate": 1.1450093331309314e-05, + "loss": 0.2964, + "step": 12135 + }, + { + "epoch": 0.45386358856929226, + "grad_norm": 0.4719208776950836, + "learning_rate": 1.1444282578152918e-05, + "loss": 0.3716, + "step": 12140 + }, + { + "epoch": 0.45405051755964204, + "grad_norm": 0.3875749707221985, + "learning_rate": 1.1438471326904847e-05, + "loss": 0.2173, + "step": 12145 + }, + { + "epoch": 0.4542374465499918, + "grad_norm": 0.40976300835609436, + "learning_rate": 1.1432659579569234e-05, + "loss": 0.2978, + "step": 12150 + }, + { + "epoch": 0.4544243755403416, + "grad_norm": 0.32276174426078796, + "learning_rate": 1.1426847338150386e-05, + "loss": 0.3428, + "step": 12155 + }, + { + "epoch": 0.45461130453069143, + "grad_norm": 0.2936963140964508, + "learning_rate": 1.1421034604652771e-05, + "loss": 0.3055, + "step": 12160 + }, + { + "epoch": 0.4547982335210412, + "grad_norm": 0.24648843705654144, + "learning_rate": 1.1415221381081039e-05, + "loss": 0.3737, + "step": 12165 + }, + { + "epoch": 0.454985162511391, + "grad_norm": 0.3906302750110626, + "learning_rate": 1.1409407669440005e-05, + "loss": 0.2709, + "step": 12170 + }, + { + "epoch": 0.45517209150174076, + "grad_norm": 0.23113980889320374, + "learning_rate": 1.140359347173465e-05, + "loss": 0.2874, + "step": 12175 + }, + { + "epoch": 0.4553590204920906, + "grad_norm": 0.4913095533847809, + "learning_rate": 1.1397778789970126e-05, + "loss": 0.253, + "step": 12180 + }, + { + "epoch": 0.45554594948244037, + "grad_norm": 0.34004727005958557, + "learning_rate": 1.1391963626151745e-05, + "loss": 0.2647, + "step": 12185 + }, + { + "epoch": 0.45573287847279015, + "grad_norm": 0.41701698303222656, + "learning_rate": 1.1386147982284996e-05, + "loss": 0.2685, + "step": 12190 + }, + { + "epoch": 0.4559198074631399, + "grad_norm": 0.12583187222480774, + "learning_rate": 1.1380331860375527e-05, + "loss": 0.2764, + "step": 12195 + }, + { + "epoch": 0.4561067364534897, + "grad_norm": 0.29735517501831055, + "learning_rate": 1.1374515262429154e-05, + "loss": 0.2472, + "step": 12200 + }, + { + "epoch": 0.45629366544383954, + "grad_norm": 0.5592828989028931, + "learning_rate": 1.1368698190451848e-05, + "loss": 0.3773, + "step": 12205 + }, + { + "epoch": 0.4564805944341893, + "grad_norm": 0.5007039904594421, + "learning_rate": 1.1362880646449755e-05, + "loss": 0.2827, + "step": 12210 + }, + { + "epoch": 0.4566675234245391, + "grad_norm": 0.5219624042510986, + "learning_rate": 1.1357062632429177e-05, + "loss": 0.3227, + "step": 12215 + }, + { + "epoch": 0.45685445241488887, + "grad_norm": 0.45227476954460144, + "learning_rate": 1.1351244150396581e-05, + "loss": 0.329, + "step": 12220 + }, + { + "epoch": 0.4570413814052387, + "grad_norm": 0.5245965123176575, + "learning_rate": 1.1345425202358597e-05, + "loss": 0.3021, + "step": 12225 + }, + { + "epoch": 0.4572283103955885, + "grad_norm": 0.43462806940078735, + "learning_rate": 1.1339605790322016e-05, + "loss": 0.3904, + "step": 12230 + }, + { + "epoch": 0.45741523938593825, + "grad_norm": 0.44027209281921387, + "learning_rate": 1.1333785916293776e-05, + "loss": 0.2519, + "step": 12235 + }, + { + "epoch": 0.45760216837628803, + "grad_norm": 0.7230022549629211, + "learning_rate": 1.1327965582280995e-05, + "loss": 0.3982, + "step": 12240 + }, + { + "epoch": 0.45778909736663786, + "grad_norm": 0.5000762939453125, + "learning_rate": 1.1322144790290935e-05, + "loss": 0.3101, + "step": 12245 + }, + { + "epoch": 0.45797602635698764, + "grad_norm": 0.3863626718521118, + "learning_rate": 1.1316323542331022e-05, + "loss": 0.3012, + "step": 12250 + }, + { + "epoch": 0.4581629553473374, + "grad_norm": 0.34956860542297363, + "learning_rate": 1.1310501840408837e-05, + "loss": 0.3426, + "step": 12255 + }, + { + "epoch": 0.4583498843376872, + "grad_norm": 0.3766677975654602, + "learning_rate": 1.1304679686532116e-05, + "loss": 0.2716, + "step": 12260 + }, + { + "epoch": 0.45853681332803703, + "grad_norm": 0.5153632164001465, + "learning_rate": 1.1298857082708754e-05, + "loss": 0.2259, + "step": 12265 + }, + { + "epoch": 0.4587237423183868, + "grad_norm": 0.4910812973976135, + "learning_rate": 1.1293034030946804e-05, + "loss": 0.3003, + "step": 12270 + }, + { + "epoch": 0.4589106713087366, + "grad_norm": 0.43328529596328735, + "learning_rate": 1.1287210533254464e-05, + "loss": 0.3041, + "step": 12275 + }, + { + "epoch": 0.45909760029908636, + "grad_norm": 0.24060022830963135, + "learning_rate": 1.1281386591640096e-05, + "loss": 0.3299, + "step": 12280 + }, + { + "epoch": 0.4592845292894362, + "grad_norm": 0.2626323997974396, + "learning_rate": 1.127556220811221e-05, + "loss": 0.3169, + "step": 12285 + }, + { + "epoch": 0.45947145827978597, + "grad_norm": 0.27137619256973267, + "learning_rate": 1.1269737384679465e-05, + "loss": 0.2479, + "step": 12290 + }, + { + "epoch": 0.45965838727013575, + "grad_norm": 0.3326461911201477, + "learning_rate": 1.1263912123350679e-05, + "loss": 0.2324, + "step": 12295 + }, + { + "epoch": 0.4598453162604855, + "grad_norm": 0.20982854068279266, + "learning_rate": 1.1258086426134822e-05, + "loss": 0.2547, + "step": 12300 + }, + { + "epoch": 0.46003224525083536, + "grad_norm": 0.3248259127140045, + "learning_rate": 1.1252260295041003e-05, + "loss": 0.2574, + "step": 12305 + }, + { + "epoch": 0.46021917424118514, + "grad_norm": 0.35238370299339294, + "learning_rate": 1.1246433732078487e-05, + "loss": 0.2612, + "step": 12310 + }, + { + "epoch": 0.4604061032315349, + "grad_norm": 0.5635769367218018, + "learning_rate": 1.1240606739256694e-05, + "loss": 0.3106, + "step": 12315 + }, + { + "epoch": 0.4605930322218847, + "grad_norm": 0.33287879824638367, + "learning_rate": 1.1234779318585182e-05, + "loss": 0.24, + "step": 12320 + }, + { + "epoch": 0.4607799612122345, + "grad_norm": 0.5921958684921265, + "learning_rate": 1.1228951472073669e-05, + "loss": 0.2757, + "step": 12325 + }, + { + "epoch": 0.4609668902025843, + "grad_norm": 0.32586291432380676, + "learning_rate": 1.1223123201732002e-05, + "loss": 0.3202, + "step": 12330 + }, + { + "epoch": 0.4611538191929341, + "grad_norm": 0.4058232605457306, + "learning_rate": 1.1217294509570193e-05, + "loss": 0.2255, + "step": 12335 + }, + { + "epoch": 0.46134074818328386, + "grad_norm": 0.5720903873443604, + "learning_rate": 1.1211465397598387e-05, + "loss": 0.3099, + "step": 12340 + }, + { + "epoch": 0.4615276771736337, + "grad_norm": 0.43414196372032166, + "learning_rate": 1.1205635867826878e-05, + "loss": 0.2535, + "step": 12345 + }, + { + "epoch": 0.46171460616398347, + "grad_norm": 0.5103111863136292, + "learning_rate": 1.1199805922266105e-05, + "loss": 0.2551, + "step": 12350 + }, + { + "epoch": 0.46190153515433324, + "grad_norm": 0.4893711805343628, + "learning_rate": 1.1193975562926646e-05, + "loss": 0.2794, + "step": 12355 + }, + { + "epoch": 0.462088464144683, + "grad_norm": 0.48441797494888306, + "learning_rate": 1.1188144791819226e-05, + "loss": 0.4306, + "step": 12360 + }, + { + "epoch": 0.46227539313503285, + "grad_norm": 0.2794596254825592, + "learning_rate": 1.1182313610954716e-05, + "loss": 0.2915, + "step": 12365 + }, + { + "epoch": 0.46246232212538263, + "grad_norm": 0.5436064004898071, + "learning_rate": 1.1176482022344115e-05, + "loss": 0.297, + "step": 12370 + }, + { + "epoch": 0.4626492511157324, + "grad_norm": 0.40309974551200867, + "learning_rate": 1.1170650027998577e-05, + "loss": 0.4027, + "step": 12375 + }, + { + "epoch": 0.4628361801060822, + "grad_norm": 0.1574869155883789, + "learning_rate": 1.116481762992939e-05, + "loss": 0.2339, + "step": 12380 + }, + { + "epoch": 0.463023109096432, + "grad_norm": 0.43917763233184814, + "learning_rate": 1.1158984830147975e-05, + "loss": 0.3213, + "step": 12385 + }, + { + "epoch": 0.4632100380867818, + "grad_norm": 0.34497371315956116, + "learning_rate": 1.1153151630665902e-05, + "loss": 0.2609, + "step": 12390 + }, + { + "epoch": 0.46339696707713157, + "grad_norm": 0.3034207224845886, + "learning_rate": 1.1147318033494876e-05, + "loss": 0.2787, + "step": 12395 + }, + { + "epoch": 0.46358389606748135, + "grad_norm": 0.28706076741218567, + "learning_rate": 1.1141484040646732e-05, + "loss": 0.2624, + "step": 12400 + }, + { + "epoch": 0.4637708250578312, + "grad_norm": 0.3962991237640381, + "learning_rate": 1.1135649654133453e-05, + "loss": 0.2194, + "step": 12405 + }, + { + "epoch": 0.46395775404818096, + "grad_norm": 0.6293958425521851, + "learning_rate": 1.1129814875967143e-05, + "loss": 0.3141, + "step": 12410 + }, + { + "epoch": 0.46414468303853074, + "grad_norm": 0.4316769242286682, + "learning_rate": 1.1123979708160064e-05, + "loss": 0.2896, + "step": 12415 + }, + { + "epoch": 0.4643316120288805, + "grad_norm": 0.4425215423107147, + "learning_rate": 1.1118144152724584e-05, + "loss": 0.3304, + "step": 12420 + }, + { + "epoch": 0.46451854101923035, + "grad_norm": 0.43086567521095276, + "learning_rate": 1.1112308211673226e-05, + "loss": 0.3048, + "step": 12425 + }, + { + "epoch": 0.4647054700095801, + "grad_norm": 0.5925473570823669, + "learning_rate": 1.1106471887018637e-05, + "loss": 0.3241, + "step": 12430 + }, + { + "epoch": 0.4648923989999299, + "grad_norm": 0.4298711121082306, + "learning_rate": 1.11006351807736e-05, + "loss": 0.3052, + "step": 12435 + }, + { + "epoch": 0.4650793279902797, + "grad_norm": 0.40438607335090637, + "learning_rate": 1.1094798094951027e-05, + "loss": 0.3083, + "step": 12440 + }, + { + "epoch": 0.46526625698062946, + "grad_norm": 0.1780761182308197, + "learning_rate": 1.1088960631563958e-05, + "loss": 0.3185, + "step": 12445 + }, + { + "epoch": 0.4654531859709793, + "grad_norm": 0.5064865946769714, + "learning_rate": 1.108312279262557e-05, + "loss": 0.2251, + "step": 12450 + }, + { + "epoch": 0.46564011496132907, + "grad_norm": 0.49286672472953796, + "learning_rate": 1.1077284580149169e-05, + "loss": 0.2824, + "step": 12455 + }, + { + "epoch": 0.46582704395167884, + "grad_norm": 0.35339727997779846, + "learning_rate": 1.1071445996148182e-05, + "loss": 0.3037, + "step": 12460 + }, + { + "epoch": 0.4660139729420286, + "grad_norm": 0.5857782959938049, + "learning_rate": 1.1065607042636173e-05, + "loss": 0.3552, + "step": 12465 + }, + { + "epoch": 0.46620090193237845, + "grad_norm": 0.4888255000114441, + "learning_rate": 1.1059767721626828e-05, + "loss": 0.3064, + "step": 12470 + }, + { + "epoch": 0.46638783092272823, + "grad_norm": 0.2896244525909424, + "learning_rate": 1.1053928035133964e-05, + "loss": 0.2304, + "step": 12475 + }, + { + "epoch": 0.466574759913078, + "grad_norm": 0.26369529962539673, + "learning_rate": 1.1048087985171517e-05, + "loss": 0.2565, + "step": 12480 + }, + { + "epoch": 0.4667616889034278, + "grad_norm": 0.4036164879798889, + "learning_rate": 1.1042247573753555e-05, + "loss": 0.2744, + "step": 12485 + }, + { + "epoch": 0.4669486178937776, + "grad_norm": 0.03450818732380867, + "learning_rate": 1.103640680289427e-05, + "loss": 0.2852, + "step": 12490 + }, + { + "epoch": 0.4671355468841274, + "grad_norm": 0.39615771174430847, + "learning_rate": 1.1030565674607976e-05, + "loss": 0.2949, + "step": 12495 + }, + { + "epoch": 0.4673224758744772, + "grad_norm": 0.26349276304244995, + "learning_rate": 1.1024724190909109e-05, + "loss": 0.2381, + "step": 12500 + }, + { + "epoch": 0.46750940486482695, + "grad_norm": 0.6820989847183228, + "learning_rate": 1.101888235381223e-05, + "loss": 0.2836, + "step": 12505 + }, + { + "epoch": 0.4676963338551768, + "grad_norm": 0.30709752440452576, + "learning_rate": 1.1013040165332024e-05, + "loss": 0.3014, + "step": 12510 + }, + { + "epoch": 0.46788326284552656, + "grad_norm": 0.31798475980758667, + "learning_rate": 1.1007197627483292e-05, + "loss": 0.2755, + "step": 12515 + }, + { + "epoch": 0.46807019183587634, + "grad_norm": 0.4482247233390808, + "learning_rate": 1.1001354742280959e-05, + "loss": 0.2384, + "step": 12520 + }, + { + "epoch": 0.4682571208262261, + "grad_norm": 0.503452479839325, + "learning_rate": 1.0995511511740066e-05, + "loss": 0.3171, + "step": 12525 + }, + { + "epoch": 0.46844404981657595, + "grad_norm": 0.3593085706233978, + "learning_rate": 1.0989667937875778e-05, + "loss": 0.372, + "step": 12530 + }, + { + "epoch": 0.4686309788069257, + "grad_norm": 0.4590424597263336, + "learning_rate": 1.0983824022703377e-05, + "loss": 0.297, + "step": 12535 + }, + { + "epoch": 0.4688179077972755, + "grad_norm": 0.49093323945999146, + "learning_rate": 1.0977979768238261e-05, + "loss": 0.2479, + "step": 12540 + }, + { + "epoch": 0.4690048367876253, + "grad_norm": 0.29465728998184204, + "learning_rate": 1.0972135176495942e-05, + "loss": 0.2881, + "step": 12545 + }, + { + "epoch": 0.4691917657779751, + "grad_norm": 0.16378015279769897, + "learning_rate": 1.0966290249492057e-05, + "loss": 0.2464, + "step": 12550 + }, + { + "epoch": 0.4693786947683249, + "grad_norm": 0.3140955865383148, + "learning_rate": 1.0960444989242355e-05, + "loss": 0.1831, + "step": 12555 + }, + { + "epoch": 0.46956562375867467, + "grad_norm": 0.3941444158554077, + "learning_rate": 1.0954599397762695e-05, + "loss": 0.2958, + "step": 12560 + }, + { + "epoch": 0.46975255274902444, + "grad_norm": 0.3791537582874298, + "learning_rate": 1.0948753477069057e-05, + "loss": 0.3068, + "step": 12565 + }, + { + "epoch": 0.4699394817393743, + "grad_norm": 0.12725694477558136, + "learning_rate": 1.0942907229177526e-05, + "loss": 0.2489, + "step": 12570 + }, + { + "epoch": 0.47012641072972405, + "grad_norm": 0.6161131858825684, + "learning_rate": 1.0937060656104312e-05, + "loss": 0.3096, + "step": 12575 + }, + { + "epoch": 0.47031333972007383, + "grad_norm": 0.40583592653274536, + "learning_rate": 1.0931213759865729e-05, + "loss": 0.3163, + "step": 12580 + }, + { + "epoch": 0.4705002687104236, + "grad_norm": 3.2883963584899902, + "learning_rate": 1.0925366542478205e-05, + "loss": 0.2901, + "step": 12585 + }, + { + "epoch": 0.47068719770077344, + "grad_norm": 0.5457201600074768, + "learning_rate": 1.0919519005958268e-05, + "loss": 0.332, + "step": 12590 + }, + { + "epoch": 0.4708741266911232, + "grad_norm": 0.42853930592536926, + "learning_rate": 1.0913671152322578e-05, + "loss": 0.2741, + "step": 12595 + }, + { + "epoch": 0.471061055681473, + "grad_norm": 0.43397367000579834, + "learning_rate": 1.0907822983587888e-05, + "loss": 0.366, + "step": 12600 + }, + { + "epoch": 0.4712479846718228, + "grad_norm": 0.6656988263130188, + "learning_rate": 1.0901974501771065e-05, + "loss": 0.2831, + "step": 12605 + }, + { + "epoch": 0.4714349136621726, + "grad_norm": 0.3851011097431183, + "learning_rate": 1.0896125708889077e-05, + "loss": 0.2331, + "step": 12610 + }, + { + "epoch": 0.4716218426525224, + "grad_norm": 0.25610148906707764, + "learning_rate": 1.0890276606959011e-05, + "loss": 0.2575, + "step": 12615 + }, + { + "epoch": 0.47180877164287216, + "grad_norm": 0.4998786449432373, + "learning_rate": 1.0884427197998054e-05, + "loss": 0.2438, + "step": 12620 + }, + { + "epoch": 0.47199570063322194, + "grad_norm": 0.21021483838558197, + "learning_rate": 1.0878577484023496e-05, + "loss": 0.2544, + "step": 12625 + }, + { + "epoch": 0.47218262962357177, + "grad_norm": 0.3968941271305084, + "learning_rate": 1.0872727467052741e-05, + "loss": 0.2302, + "step": 12630 + }, + { + "epoch": 0.47236955861392155, + "grad_norm": 0.4154810905456543, + "learning_rate": 1.0866877149103286e-05, + "loss": 0.2776, + "step": 12635 + }, + { + "epoch": 0.4725564876042713, + "grad_norm": 0.4162626564502716, + "learning_rate": 1.0861026532192745e-05, + "loss": 0.2388, + "step": 12640 + }, + { + "epoch": 0.4727434165946211, + "grad_norm": 0.4286978244781494, + "learning_rate": 1.0855175618338823e-05, + "loss": 0.2645, + "step": 12645 + }, + { + "epoch": 0.47293034558497093, + "grad_norm": 0.3265318274497986, + "learning_rate": 1.0849324409559334e-05, + "loss": 0.2832, + "step": 12650 + }, + { + "epoch": 0.4731172745753207, + "grad_norm": 0.3723618686199188, + "learning_rate": 1.0843472907872192e-05, + "loss": 0.2681, + "step": 12655 + }, + { + "epoch": 0.4733042035656705, + "grad_norm": 0.47997230291366577, + "learning_rate": 1.0837621115295414e-05, + "loss": 0.4914, + "step": 12660 + }, + { + "epoch": 0.47349113255602027, + "grad_norm": 0.3690032362937927, + "learning_rate": 1.0831769033847113e-05, + "loss": 0.2536, + "step": 12665 + }, + { + "epoch": 0.4736780615463701, + "grad_norm": 0.44639065861701965, + "learning_rate": 1.0825916665545506e-05, + "loss": 0.3222, + "step": 12670 + }, + { + "epoch": 0.4738649905367199, + "grad_norm": 0.3845580816268921, + "learning_rate": 1.0820064012408905e-05, + "loss": 0.3047, + "step": 12675 + }, + { + "epoch": 0.47405191952706965, + "grad_norm": 0.5291163921356201, + "learning_rate": 1.0814211076455727e-05, + "loss": 0.2676, + "step": 12680 + }, + { + "epoch": 0.47423884851741943, + "grad_norm": 0.20399098098278046, + "learning_rate": 1.0808357859704478e-05, + "loss": 0.2618, + "step": 12685 + }, + { + "epoch": 0.4744257775077692, + "grad_norm": 0.5159106254577637, + "learning_rate": 1.0802504364173763e-05, + "loss": 0.3227, + "step": 12690 + }, + { + "epoch": 0.47461270649811904, + "grad_norm": 0.24743987619876862, + "learning_rate": 1.079665059188229e-05, + "loss": 0.2674, + "step": 12695 + }, + { + "epoch": 0.4747996354884688, + "grad_norm": 0.41947150230407715, + "learning_rate": 1.0790796544848853e-05, + "loss": 0.297, + "step": 12700 + }, + { + "epoch": 0.4749865644788186, + "grad_norm": 0.5480782389640808, + "learning_rate": 1.078494222509235e-05, + "loss": 0.2761, + "step": 12705 + }, + { + "epoch": 0.4751734934691684, + "grad_norm": 0.2428130805492401, + "learning_rate": 1.0779087634631763e-05, + "loss": 0.2324, + "step": 12710 + }, + { + "epoch": 0.4753604224595182, + "grad_norm": 0.2542586624622345, + "learning_rate": 1.0773232775486173e-05, + "loss": 0.3582, + "step": 12715 + }, + { + "epoch": 0.475547351449868, + "grad_norm": 0.341899573802948, + "learning_rate": 1.0767377649674755e-05, + "loss": 0.2975, + "step": 12720 + }, + { + "epoch": 0.47573428044021776, + "grad_norm": 0.6047893762588501, + "learning_rate": 1.0761522259216777e-05, + "loss": 0.2313, + "step": 12725 + }, + { + "epoch": 0.47592120943056754, + "grad_norm": 0.3485582768917084, + "learning_rate": 1.0755666606131588e-05, + "loss": 0.2606, + "step": 12730 + }, + { + "epoch": 0.47610813842091737, + "grad_norm": 0.38747602701187134, + "learning_rate": 1.074981069243864e-05, + "loss": 0.3315, + "step": 12735 + }, + { + "epoch": 0.47629506741126715, + "grad_norm": 0.5207949876785278, + "learning_rate": 1.0743954520157471e-05, + "loss": 0.34, + "step": 12740 + }, + { + "epoch": 0.4764819964016169, + "grad_norm": 0.6631015539169312, + "learning_rate": 1.0738098091307703e-05, + "loss": 0.2457, + "step": 12745 + }, + { + "epoch": 0.4766689253919667, + "grad_norm": 0.2515263855457306, + "learning_rate": 1.0732241407909057e-05, + "loss": 0.2445, + "step": 12750 + }, + { + "epoch": 0.47685585438231654, + "grad_norm": 0.27167659997940063, + "learning_rate": 1.0726384471981326e-05, + "loss": 0.2514, + "step": 12755 + }, + { + "epoch": 0.4770427833726663, + "grad_norm": 0.44566571712493896, + "learning_rate": 1.0720527285544406e-05, + "loss": 0.2548, + "step": 12760 + }, + { + "epoch": 0.4772297123630161, + "grad_norm": 0.5117403268814087, + "learning_rate": 1.071466985061827e-05, + "loss": 0.2884, + "step": 12765 + }, + { + "epoch": 0.47741664135336587, + "grad_norm": 0.36228641867637634, + "learning_rate": 1.070881216922298e-05, + "loss": 0.3223, + "step": 12770 + }, + { + "epoch": 0.4776035703437157, + "grad_norm": 0.35445940494537354, + "learning_rate": 1.0702954243378685e-05, + "loss": 0.2761, + "step": 12775 + }, + { + "epoch": 0.4777904993340655, + "grad_norm": 0.2719997763633728, + "learning_rate": 1.069709607510561e-05, + "loss": 0.3556, + "step": 12780 + }, + { + "epoch": 0.47797742832441525, + "grad_norm": 0.33862072229385376, + "learning_rate": 1.0691237666424077e-05, + "loss": 0.2534, + "step": 12785 + }, + { + "epoch": 0.47816435731476503, + "grad_norm": 0.2040008008480072, + "learning_rate": 1.0685379019354476e-05, + "loss": 0.2666, + "step": 12790 + }, + { + "epoch": 0.47835128630511486, + "grad_norm": 0.3832045793533325, + "learning_rate": 1.0679520135917293e-05, + "loss": 0.3088, + "step": 12795 + }, + { + "epoch": 0.47853821529546464, + "grad_norm": 0.4770766496658325, + "learning_rate": 1.0673661018133086e-05, + "loss": 0.2701, + "step": 12800 + }, + { + "epoch": 0.4787251442858144, + "grad_norm": 1.6293139457702637, + "learning_rate": 1.0667801668022496e-05, + "loss": 0.2655, + "step": 12805 + }, + { + "epoch": 0.4789120732761642, + "grad_norm": 0.44511693716049194, + "learning_rate": 1.0661942087606243e-05, + "loss": 0.354, + "step": 12810 + }, + { + "epoch": 0.47909900226651403, + "grad_norm": 0.22285427153110504, + "learning_rate": 1.065608227890513e-05, + "loss": 0.2925, + "step": 12815 + }, + { + "epoch": 0.4792859312568638, + "grad_norm": 0.34608471393585205, + "learning_rate": 1.0650222243940043e-05, + "loss": 0.245, + "step": 12820 + }, + { + "epoch": 0.4794728602472136, + "grad_norm": 0.42804983258247375, + "learning_rate": 1.0644361984731932e-05, + "loss": 0.288, + "step": 12825 + }, + { + "epoch": 0.47965978923756336, + "grad_norm": 0.4652557373046875, + "learning_rate": 1.0638501503301837e-05, + "loss": 0.3612, + "step": 12830 + }, + { + "epoch": 0.4798467182279132, + "grad_norm": 0.24128127098083496, + "learning_rate": 1.0632640801670868e-05, + "loss": 0.2583, + "step": 12835 + }, + { + "epoch": 0.48003364721826297, + "grad_norm": 0.45303890109062195, + "learning_rate": 1.0626779881860213e-05, + "loss": 0.2702, + "step": 12840 + }, + { + "epoch": 0.48022057620861275, + "grad_norm": 0.33776313066482544, + "learning_rate": 1.0620918745891143e-05, + "loss": 0.3176, + "step": 12845 + }, + { + "epoch": 0.4804075051989625, + "grad_norm": 0.304514616727829, + "learning_rate": 1.0615057395784983e-05, + "loss": 0.2317, + "step": 12850 + }, + { + "epoch": 0.48059443418931236, + "grad_norm": 0.3652326762676239, + "learning_rate": 1.0609195833563153e-05, + "loss": 0.271, + "step": 12855 + }, + { + "epoch": 0.48078136317966214, + "grad_norm": 0.397659033536911, + "learning_rate": 1.0603334061247133e-05, + "loss": 0.3553, + "step": 12860 + }, + { + "epoch": 0.4809682921700119, + "grad_norm": 0.33267614245414734, + "learning_rate": 1.0597472080858485e-05, + "loss": 0.3367, + "step": 12865 + }, + { + "epoch": 0.4811552211603617, + "grad_norm": 0.35334113240242004, + "learning_rate": 1.0591609894418835e-05, + "loss": 0.3743, + "step": 12870 + }, + { + "epoch": 0.4813421501507115, + "grad_norm": 0.7049282789230347, + "learning_rate": 1.0585747503949883e-05, + "loss": 0.3031, + "step": 12875 + }, + { + "epoch": 0.4815290791410613, + "grad_norm": 0.504492998123169, + "learning_rate": 1.0579884911473404e-05, + "loss": 0.2968, + "step": 12880 + }, + { + "epoch": 0.4817160081314111, + "grad_norm": 0.21734552085399628, + "learning_rate": 1.0574022119011234e-05, + "loss": 0.304, + "step": 12885 + }, + { + "epoch": 0.48190293712176085, + "grad_norm": 0.5968211889266968, + "learning_rate": 1.0568159128585283e-05, + "loss": 0.2402, + "step": 12890 + }, + { + "epoch": 0.4820898661121107, + "grad_norm": 0.5871456265449524, + "learning_rate": 1.056229594221753e-05, + "loss": 0.2916, + "step": 12895 + }, + { + "epoch": 0.48227679510246046, + "grad_norm": 0.4001029431819916, + "learning_rate": 1.0556432561930014e-05, + "loss": 0.2675, + "step": 12900 + }, + { + "epoch": 0.48246372409281024, + "grad_norm": 0.40792810916900635, + "learning_rate": 1.0550568989744852e-05, + "loss": 0.2857, + "step": 12905 + }, + { + "epoch": 0.48265065308316, + "grad_norm": 0.23005032539367676, + "learning_rate": 1.0544705227684223e-05, + "loss": 0.3099, + "step": 12910 + }, + { + "epoch": 0.48283758207350985, + "grad_norm": 0.4021388292312622, + "learning_rate": 1.053884127777037e-05, + "loss": 0.3509, + "step": 12915 + }, + { + "epoch": 0.48302451106385963, + "grad_norm": 0.40135905146598816, + "learning_rate": 1.0532977142025595e-05, + "loss": 0.3563, + "step": 12920 + }, + { + "epoch": 0.4832114400542094, + "grad_norm": 1.3269644975662231, + "learning_rate": 1.0527112822472279e-05, + "loss": 0.34, + "step": 12925 + }, + { + "epoch": 0.4833983690445592, + "grad_norm": 0.28477320075035095, + "learning_rate": 1.0521248321132853e-05, + "loss": 0.3246, + "step": 12930 + }, + { + "epoch": 0.48358529803490896, + "grad_norm": 0.38668492436408997, + "learning_rate": 1.0515383640029819e-05, + "loss": 0.348, + "step": 12935 + }, + { + "epoch": 0.4837722270252588, + "grad_norm": 0.5541387796401978, + "learning_rate": 1.0509518781185735e-05, + "loss": 0.2915, + "step": 12940 + }, + { + "epoch": 0.48395915601560857, + "grad_norm": 0.43491193652153015, + "learning_rate": 1.0503653746623221e-05, + "loss": 0.2804, + "step": 12945 + }, + { + "epoch": 0.48414608500595835, + "grad_norm": 0.3402020037174225, + "learning_rate": 1.0497788538364961e-05, + "loss": 0.3074, + "step": 12950 + }, + { + "epoch": 0.4843330139963081, + "grad_norm": 0.4688038229942322, + "learning_rate": 1.0491923158433696e-05, + "loss": 0.328, + "step": 12955 + }, + { + "epoch": 0.48451994298665796, + "grad_norm": 0.2162652462720871, + "learning_rate": 1.0486057608852236e-05, + "loss": 0.2978, + "step": 12960 + }, + { + "epoch": 0.48470687197700774, + "grad_norm": 0.33628934621810913, + "learning_rate": 1.0480191891643427e-05, + "loss": 0.2905, + "step": 12965 + }, + { + "epoch": 0.4848938009673575, + "grad_norm": 0.30055081844329834, + "learning_rate": 1.0474326008830198e-05, + "loss": 0.288, + "step": 12970 + }, + { + "epoch": 0.4850807299577073, + "grad_norm": 0.30387982726097107, + "learning_rate": 1.0468459962435517e-05, + "loss": 0.2517, + "step": 12975 + }, + { + "epoch": 0.4852676589480571, + "grad_norm": 0.3027805685997009, + "learning_rate": 1.046259375448242e-05, + "loss": 0.2283, + "step": 12980 + }, + { + "epoch": 0.4854545879384069, + "grad_norm": 0.6029101014137268, + "learning_rate": 1.0456727386993992e-05, + "loss": 0.313, + "step": 12985 + }, + { + "epoch": 0.4856415169287567, + "grad_norm": 0.23959915339946747, + "learning_rate": 1.0450860861993374e-05, + "loss": 0.2493, + "step": 12990 + }, + { + "epoch": 0.48582844591910646, + "grad_norm": 0.6326263546943665, + "learning_rate": 1.0444994181503764e-05, + "loss": 0.31, + "step": 12995 + }, + { + "epoch": 0.4860153749094563, + "grad_norm": 0.3467031419277191, + "learning_rate": 1.043912734754841e-05, + "loss": 0.2751, + "step": 13000 + }, + { + "epoch": 0.48620230389980607, + "grad_norm": 0.24758736789226532, + "learning_rate": 1.0433260362150618e-05, + "loss": 0.3156, + "step": 13005 + }, + { + "epoch": 0.48638923289015584, + "grad_norm": 0.4706723093986511, + "learning_rate": 1.0427393227333742e-05, + "loss": 0.2606, + "step": 13010 + }, + { + "epoch": 0.4865761618805056, + "grad_norm": 0.4599349796772003, + "learning_rate": 1.0421525945121187e-05, + "loss": 0.257, + "step": 13015 + }, + { + "epoch": 0.48676309087085545, + "grad_norm": 0.3387102484703064, + "learning_rate": 1.0415658517536414e-05, + "loss": 0.2991, + "step": 13020 + }, + { + "epoch": 0.48695001986120523, + "grad_norm": 0.4700387120246887, + "learning_rate": 1.0409790946602926e-05, + "loss": 0.2578, + "step": 13025 + }, + { + "epoch": 0.487136948851555, + "grad_norm": 0.5611972808837891, + "learning_rate": 1.0403923234344282e-05, + "loss": 0.3931, + "step": 13030 + }, + { + "epoch": 0.4873238778419048, + "grad_norm": 0.3460509479045868, + "learning_rate": 1.0398055382784094e-05, + "loss": 0.2514, + "step": 13035 + }, + { + "epoch": 0.4875108068322546, + "grad_norm": 0.45094433426856995, + "learning_rate": 1.0392187393946004e-05, + "loss": 0.2802, + "step": 13040 + }, + { + "epoch": 0.4876977358226044, + "grad_norm": 0.44121870398521423, + "learning_rate": 1.0386319269853719e-05, + "loss": 0.3061, + "step": 13045 + }, + { + "epoch": 0.48788466481295417, + "grad_norm": 0.35361143946647644, + "learning_rate": 1.038045101253099e-05, + "loss": 0.2768, + "step": 13050 + }, + { + "epoch": 0.48807159380330395, + "grad_norm": 1.3324558734893799, + "learning_rate": 1.0374582624001608e-05, + "loss": 0.3261, + "step": 13055 + }, + { + "epoch": 0.4882585227936538, + "grad_norm": 0.31909024715423584, + "learning_rate": 1.0368714106289412e-05, + "loss": 0.2433, + "step": 13060 + }, + { + "epoch": 0.48844545178400356, + "grad_norm": 0.8846080899238586, + "learning_rate": 1.0362845461418286e-05, + "loss": 0.3087, + "step": 13065 + }, + { + "epoch": 0.48863238077435334, + "grad_norm": 0.2497347593307495, + "learning_rate": 1.0356976691412156e-05, + "loss": 0.2936, + "step": 13070 + }, + { + "epoch": 0.4888193097647031, + "grad_norm": 0.33820840716362, + "learning_rate": 1.0351107798294994e-05, + "loss": 0.3137, + "step": 13075 + }, + { + "epoch": 0.48900623875505295, + "grad_norm": 0.23961906135082245, + "learning_rate": 1.0345238784090816e-05, + "loss": 0.2316, + "step": 13080 + }, + { + "epoch": 0.4891931677454027, + "grad_norm": 0.172140970826149, + "learning_rate": 1.0339369650823672e-05, + "loss": 0.3032, + "step": 13085 + }, + { + "epoch": 0.4893800967357525, + "grad_norm": 0.21862946450710297, + "learning_rate": 1.0333500400517656e-05, + "loss": 0.2501, + "step": 13090 + }, + { + "epoch": 0.4895670257261023, + "grad_norm": 0.35493218898773193, + "learning_rate": 1.032763103519691e-05, + "loss": 0.2768, + "step": 13095 + }, + { + "epoch": 0.4897539547164521, + "grad_norm": 0.2946685254573822, + "learning_rate": 1.0321761556885608e-05, + "loss": 0.3169, + "step": 13100 + }, + { + "epoch": 0.4899408837068019, + "grad_norm": 0.39835453033447266, + "learning_rate": 1.0315891967607968e-05, + "loss": 0.3315, + "step": 13105 + }, + { + "epoch": 0.49012781269715167, + "grad_norm": 0.4330670237541199, + "learning_rate": 1.0310022269388236e-05, + "loss": 0.3063, + "step": 13110 + }, + { + "epoch": 0.49031474168750144, + "grad_norm": 0.3474862277507782, + "learning_rate": 1.0304152464250707e-05, + "loss": 0.3044, + "step": 13115 + }, + { + "epoch": 0.4905016706778513, + "grad_norm": 0.3377613425254822, + "learning_rate": 1.0298282554219707e-05, + "loss": 0.2859, + "step": 13120 + }, + { + "epoch": 0.49068859966820105, + "grad_norm": 0.2067660242319107, + "learning_rate": 1.0292412541319603e-05, + "loss": 0.2821, + "step": 13125 + }, + { + "epoch": 0.49087552865855083, + "grad_norm": 0.5813116431236267, + "learning_rate": 1.0286542427574794e-05, + "loss": 0.3219, + "step": 13130 + }, + { + "epoch": 0.4910624576489006, + "grad_norm": 0.5601276159286499, + "learning_rate": 1.0280672215009706e-05, + "loss": 0.2605, + "step": 13135 + }, + { + "epoch": 0.49124938663925044, + "grad_norm": 0.35725921392440796, + "learning_rate": 1.0274801905648816e-05, + "loss": 0.2353, + "step": 13140 + }, + { + "epoch": 0.4914363156296002, + "grad_norm": 0.3625008165836334, + "learning_rate": 1.0268931501516626e-05, + "loss": 0.3755, + "step": 13145 + }, + { + "epoch": 0.49162324461995, + "grad_norm": 0.2937968671321869, + "learning_rate": 1.0263061004637666e-05, + "loss": 0.2857, + "step": 13150 + }, + { + "epoch": 0.4918101736102998, + "grad_norm": 0.48958075046539307, + "learning_rate": 1.0257190417036502e-05, + "loss": 0.289, + "step": 13155 + }, + { + "epoch": 0.4919971026006496, + "grad_norm": 0.4188752770423889, + "learning_rate": 1.0251319740737732e-05, + "loss": 0.2968, + "step": 13160 + }, + { + "epoch": 0.4921840315909994, + "grad_norm": 0.32891568541526794, + "learning_rate": 1.0245448977765986e-05, + "loss": 0.299, + "step": 13165 + }, + { + "epoch": 0.49237096058134916, + "grad_norm": 0.4842504858970642, + "learning_rate": 1.023957813014592e-05, + "loss": 0.212, + "step": 13170 + }, + { + "epoch": 0.49255788957169894, + "grad_norm": 0.5407660007476807, + "learning_rate": 1.0233707199902223e-05, + "loss": 0.24, + "step": 13175 + }, + { + "epoch": 0.4927448185620487, + "grad_norm": 0.268076092004776, + "learning_rate": 1.0227836189059606e-05, + "loss": 0.229, + "step": 13180 + }, + { + "epoch": 0.49293174755239855, + "grad_norm": 0.6369917392730713, + "learning_rate": 1.0221965099642817e-05, + "loss": 0.2855, + "step": 13185 + }, + { + "epoch": 0.4931186765427483, + "grad_norm": 0.39120930433273315, + "learning_rate": 1.0216093933676625e-05, + "loss": 0.2355, + "step": 13190 + }, + { + "epoch": 0.4933056055330981, + "grad_norm": 0.31261909008026123, + "learning_rate": 1.0210222693185829e-05, + "loss": 0.2723, + "step": 13195 + }, + { + "epoch": 0.4934925345234479, + "grad_norm": 0.5893165469169617, + "learning_rate": 1.0204351380195249e-05, + "loss": 0.2922, + "step": 13200 + }, + { + "epoch": 0.4936794635137977, + "grad_norm": 0.4819238781929016, + "learning_rate": 1.0198479996729736e-05, + "loss": 0.3932, + "step": 13205 + }, + { + "epoch": 0.4938663925041475, + "grad_norm": 0.35270988941192627, + "learning_rate": 1.0192608544814155e-05, + "loss": 0.35, + "step": 13210 + }, + { + "epoch": 0.49405332149449727, + "grad_norm": 0.6773056387901306, + "learning_rate": 1.0186737026473408e-05, + "loss": 0.3297, + "step": 13215 + }, + { + "epoch": 0.49424025048484704, + "grad_norm": 0.28301501274108887, + "learning_rate": 1.0180865443732408e-05, + "loss": 0.3117, + "step": 13220 + }, + { + "epoch": 0.4944271794751969, + "grad_norm": 0.3076423406600952, + "learning_rate": 1.0174993798616101e-05, + "loss": 0.2898, + "step": 13225 + }, + { + "epoch": 0.49461410846554665, + "grad_norm": 0.6145200133323669, + "learning_rate": 1.0169122093149449e-05, + "loss": 0.271, + "step": 13230 + }, + { + "epoch": 0.49480103745589643, + "grad_norm": 0.6317368745803833, + "learning_rate": 1.016325032935743e-05, + "loss": 0.3003, + "step": 13235 + }, + { + "epoch": 0.4949879664462462, + "grad_norm": 0.5369362235069275, + "learning_rate": 1.0157378509265053e-05, + "loss": 0.2718, + "step": 13240 + }, + { + "epoch": 0.49517489543659604, + "grad_norm": 0.3894631564617157, + "learning_rate": 1.015150663489734e-05, + "loss": 0.3061, + "step": 13245 + }, + { + "epoch": 0.4953618244269458, + "grad_norm": 0.35104823112487793, + "learning_rate": 1.0145634708279324e-05, + "loss": 0.3223, + "step": 13250 + }, + { + "epoch": 0.4955487534172956, + "grad_norm": 0.5245153903961182, + "learning_rate": 1.013976273143607e-05, + "loss": 0.3233, + "step": 13255 + }, + { + "epoch": 0.4957356824076454, + "grad_norm": 0.4358454942703247, + "learning_rate": 1.013389070639266e-05, + "loss": 0.344, + "step": 13260 + }, + { + "epoch": 0.4959226113979952, + "grad_norm": 0.3596687912940979, + "learning_rate": 1.0128018635174177e-05, + "loss": 0.2782, + "step": 13265 + }, + { + "epoch": 0.496109540388345, + "grad_norm": 0.314059317111969, + "learning_rate": 1.0122146519805736e-05, + "loss": 0.27, + "step": 13270 + }, + { + "epoch": 0.49629646937869476, + "grad_norm": 0.49434518814086914, + "learning_rate": 1.0116274362312462e-05, + "loss": 0.2295, + "step": 13275 + }, + { + "epoch": 0.49648339836904454, + "grad_norm": 0.42047810554504395, + "learning_rate": 1.011040216471949e-05, + "loss": 0.2461, + "step": 13280 + }, + { + "epoch": 0.49667032735939437, + "grad_norm": 0.44239863753318787, + "learning_rate": 1.0104529929051977e-05, + "loss": 0.3447, + "step": 13285 + }, + { + "epoch": 0.49685725634974415, + "grad_norm": 0.4856170117855072, + "learning_rate": 1.0098657657335083e-05, + "loss": 0.3261, + "step": 13290 + }, + { + "epoch": 0.4970441853400939, + "grad_norm": 0.7703909277915955, + "learning_rate": 1.0092785351593995e-05, + "loss": 0.2374, + "step": 13295 + }, + { + "epoch": 0.4972311143304437, + "grad_norm": 0.44741690158843994, + "learning_rate": 1.0086913013853894e-05, + "loss": 0.2753, + "step": 13300 + }, + { + "epoch": 0.49741804332079353, + "grad_norm": 0.38895756006240845, + "learning_rate": 1.0081040646139985e-05, + "loss": 0.2716, + "step": 13305 + }, + { + "epoch": 0.4976049723111433, + "grad_norm": 0.7058769464492798, + "learning_rate": 1.0075168250477482e-05, + "loss": 0.3592, + "step": 13310 + }, + { + "epoch": 0.4977919013014931, + "grad_norm": 0.34860917925834656, + "learning_rate": 1.00692958288916e-05, + "loss": 0.3461, + "step": 13315 + }, + { + "epoch": 0.49797883029184287, + "grad_norm": 0.3462105691432953, + "learning_rate": 1.0063423383407575e-05, + "loss": 0.2995, + "step": 13320 + }, + { + "epoch": 0.4981657592821927, + "grad_norm": 0.351172536611557, + "learning_rate": 1.005755091605064e-05, + "loss": 0.4439, + "step": 13325 + }, + { + "epoch": 0.4983526882725425, + "grad_norm": 0.16182276606559753, + "learning_rate": 1.0051678428846046e-05, + "loss": 0.2951, + "step": 13330 + }, + { + "epoch": 0.49853961726289225, + "grad_norm": 0.41749125719070435, + "learning_rate": 1.0045805923819039e-05, + "loss": 0.2411, + "step": 13335 + }, + { + "epoch": 0.49872654625324203, + "grad_norm": 0.3556321859359741, + "learning_rate": 1.0039933402994885e-05, + "loss": 0.2667, + "step": 13340 + }, + { + "epoch": 0.49891347524359186, + "grad_norm": 0.5114247798919678, + "learning_rate": 1.0034060868398843e-05, + "loss": 0.248, + "step": 13345 + }, + { + "epoch": 0.49910040423394164, + "grad_norm": 0.29534754157066345, + "learning_rate": 1.0028188322056183e-05, + "loss": 0.2799, + "step": 13350 + }, + { + "epoch": 0.4992873332242914, + "grad_norm": 0.5048911571502686, + "learning_rate": 1.0022315765992179e-05, + "loss": 0.2801, + "step": 13355 + }, + { + "epoch": 0.4994742622146412, + "grad_norm": 0.3704678416252136, + "learning_rate": 1.0016443202232107e-05, + "loss": 0.2554, + "step": 13360 + }, + { + "epoch": 0.49966119120499103, + "grad_norm": 0.4027446210384369, + "learning_rate": 1.0010570632801244e-05, + "loss": 0.2369, + "step": 13365 + }, + { + "epoch": 0.4998481201953408, + "grad_norm": 0.4065941572189331, + "learning_rate": 1.0004698059724873e-05, + "loss": 0.3427, + "step": 13370 + }, + { + "epoch": 0.5000350491856906, + "grad_norm": 0.1492055058479309, + "learning_rate": 9.998825485028277e-06, + "loss": 0.3667, + "step": 13375 + }, + { + "epoch": 0.5002219781760404, + "grad_norm": 0.7147864103317261, + "learning_rate": 9.99295291073674e-06, + "loss": 0.3257, + "step": 13380 + }, + { + "epoch": 0.5004089071663902, + "grad_norm": 0.355305552482605, + "learning_rate": 9.987080338875537e-06, + "loss": 0.384, + "step": 13385 + }, + { + "epoch": 0.50059583615674, + "grad_norm": 0.24693630635738373, + "learning_rate": 9.981207771469956e-06, + "loss": 0.3332, + "step": 13390 + }, + { + "epoch": 0.5007827651470897, + "grad_norm": 0.4253278076648712, + "learning_rate": 9.975335210545279e-06, + "loss": 0.2696, + "step": 13395 + }, + { + "epoch": 0.5009696941374395, + "grad_norm": 0.22588220238685608, + "learning_rate": 9.969462658126778e-06, + "loss": 0.3232, + "step": 13400 + }, + { + "epoch": 0.5011566231277893, + "grad_norm": 0.6063287854194641, + "learning_rate": 9.963590116239734e-06, + "loss": 0.2805, + "step": 13405 + }, + { + "epoch": 0.5013435521181391, + "grad_norm": 0.5377234816551208, + "learning_rate": 9.957717586909415e-06, + "loss": 0.3082, + "step": 13410 + }, + { + "epoch": 0.501530481108489, + "grad_norm": 0.6075440645217896, + "learning_rate": 9.95184507216109e-06, + "loss": 0.2732, + "step": 13415 + }, + { + "epoch": 0.5017174100988387, + "grad_norm": 0.6097599864006042, + "learning_rate": 9.945972574020015e-06, + "loss": 0.3988, + "step": 13420 + }, + { + "epoch": 0.5019043390891885, + "grad_norm": 0.3699340224266052, + "learning_rate": 9.940100094511457e-06, + "loss": 0.3371, + "step": 13425 + }, + { + "epoch": 0.5020912680795383, + "grad_norm": 0.30382204055786133, + "learning_rate": 9.934227635660654e-06, + "loss": 0.2644, + "step": 13430 + }, + { + "epoch": 0.5022781970698881, + "grad_norm": 0.38782837986946106, + "learning_rate": 9.928355199492859e-06, + "loss": 0.2467, + "step": 13435 + }, + { + "epoch": 0.5024651260602379, + "grad_norm": 0.22586065530776978, + "learning_rate": 9.922482788033304e-06, + "loss": 0.2594, + "step": 13440 + }, + { + "epoch": 0.5026520550505876, + "grad_norm": 0.38370853662490845, + "learning_rate": 9.91661040330721e-06, + "loss": 0.2202, + "step": 13445 + }, + { + "epoch": 0.5028389840409374, + "grad_norm": 0.3413105010986328, + "learning_rate": 9.910738047339801e-06, + "loss": 0.2385, + "step": 13450 + }, + { + "epoch": 0.5030259130312872, + "grad_norm": 0.5473100543022156, + "learning_rate": 9.90486572215628e-06, + "loss": 0.2837, + "step": 13455 + }, + { + "epoch": 0.5032128420216371, + "grad_norm": 0.5079526305198669, + "learning_rate": 9.898993429781848e-06, + "loss": 0.328, + "step": 13460 + }, + { + "epoch": 0.5033997710119869, + "grad_norm": 0.23653025925159454, + "learning_rate": 9.893121172241686e-06, + "loss": 0.2817, + "step": 13465 + }, + { + "epoch": 0.5035867000023366, + "grad_norm": 0.3025575280189514, + "learning_rate": 9.887248951560972e-06, + "loss": 0.3381, + "step": 13470 + }, + { + "epoch": 0.5037736289926864, + "grad_norm": 0.3230903148651123, + "learning_rate": 9.88137676976486e-06, + "loss": 0.2964, + "step": 13475 + }, + { + "epoch": 0.5039605579830362, + "grad_norm": 0.8575633764266968, + "learning_rate": 9.875504628878502e-06, + "loss": 0.3539, + "step": 13480 + }, + { + "epoch": 0.504147486973386, + "grad_norm": 0.5326083302497864, + "learning_rate": 9.869632530927033e-06, + "loss": 0.3046, + "step": 13485 + }, + { + "epoch": 0.5043344159637357, + "grad_norm": 0.5689330697059631, + "learning_rate": 9.863760477935565e-06, + "loss": 0.2426, + "step": 13490 + }, + { + "epoch": 0.5045213449540855, + "grad_norm": 0.4615536034107208, + "learning_rate": 9.857888471929207e-06, + "loss": 0.3155, + "step": 13495 + }, + { + "epoch": 0.5047082739444354, + "grad_norm": 0.284390926361084, + "learning_rate": 9.85201651493304e-06, + "loss": 0.2493, + "step": 13500 + }, + { + "epoch": 0.5048952029347852, + "grad_norm": 0.382242351770401, + "learning_rate": 9.846144608972141e-06, + "loss": 0.3991, + "step": 13505 + }, + { + "epoch": 0.505082131925135, + "grad_norm": 0.3869224190711975, + "learning_rate": 9.840272756071556e-06, + "loss": 0.2395, + "step": 13510 + }, + { + "epoch": 0.5052690609154847, + "grad_norm": 0.2770150601863861, + "learning_rate": 9.834400958256322e-06, + "loss": 0.2365, + "step": 13515 + }, + { + "epoch": 0.5054559899058345, + "grad_norm": 0.1850321739912033, + "learning_rate": 9.828529217551448e-06, + "loss": 0.4006, + "step": 13520 + }, + { + "epoch": 0.5056429188961843, + "grad_norm": 0.5925412178039551, + "learning_rate": 9.822657535981936e-06, + "loss": 0.3216, + "step": 13525 + }, + { + "epoch": 0.5058298478865341, + "grad_norm": 0.4952634572982788, + "learning_rate": 9.816785915572762e-06, + "loss": 0.3249, + "step": 13530 + }, + { + "epoch": 0.5060167768768838, + "grad_norm": 0.4701783061027527, + "learning_rate": 9.81091435834887e-06, + "loss": 0.2652, + "step": 13535 + }, + { + "epoch": 0.5062037058672337, + "grad_norm": 0.8160406351089478, + "learning_rate": 9.805042866335202e-06, + "loss": 0.2646, + "step": 13540 + }, + { + "epoch": 0.5063906348575835, + "grad_norm": 0.32033684849739075, + "learning_rate": 9.79917144155666e-06, + "loss": 0.3069, + "step": 13545 + }, + { + "epoch": 0.5065775638479333, + "grad_norm": 0.39233770966529846, + "learning_rate": 9.793300086038137e-06, + "loss": 0.2425, + "step": 13550 + }, + { + "epoch": 0.5067644928382831, + "grad_norm": 0.4731293022632599, + "learning_rate": 9.78742880180449e-06, + "loss": 0.2729, + "step": 13555 + }, + { + "epoch": 0.5069514218286328, + "grad_norm": 0.45634377002716064, + "learning_rate": 9.781557590880559e-06, + "loss": 0.2657, + "step": 13560 + }, + { + "epoch": 0.5071383508189826, + "grad_norm": 0.22461484372615814, + "learning_rate": 9.775686455291153e-06, + "loss": 0.3041, + "step": 13565 + }, + { + "epoch": 0.5073252798093324, + "grad_norm": 0.19289721548557281, + "learning_rate": 9.769815397061062e-06, + "loss": 0.2881, + "step": 13570 + }, + { + "epoch": 0.5075122087996822, + "grad_norm": 0.5333814024925232, + "learning_rate": 9.763944418215047e-06, + "loss": 0.3384, + "step": 13575 + }, + { + "epoch": 0.5076991377900321, + "grad_norm": 0.3757210373878479, + "learning_rate": 9.758073520777837e-06, + "loss": 0.2283, + "step": 13580 + }, + { + "epoch": 0.5078860667803818, + "grad_norm": 0.2797081470489502, + "learning_rate": 9.75220270677414e-06, + "loss": 0.2902, + "step": 13585 + }, + { + "epoch": 0.5080729957707316, + "grad_norm": 0.22128711640834808, + "learning_rate": 9.746331978228623e-06, + "loss": 0.2467, + "step": 13590 + }, + { + "epoch": 0.5082599247610814, + "grad_norm": 0.6138377785682678, + "learning_rate": 9.740461337165945e-06, + "loss": 0.2697, + "step": 13595 + }, + { + "epoch": 0.5084468537514312, + "grad_norm": 0.43696799874305725, + "learning_rate": 9.734590785610713e-06, + "loss": 0.2734, + "step": 13600 + }, + { + "epoch": 0.508633782741781, + "grad_norm": 0.38263434171676636, + "learning_rate": 9.728720325587515e-06, + "loss": 0.2258, + "step": 13605 + }, + { + "epoch": 0.5088207117321307, + "grad_norm": 0.634128749370575, + "learning_rate": 9.722849959120899e-06, + "loss": 0.3134, + "step": 13610 + }, + { + "epoch": 0.5090076407224805, + "grad_norm": 0.3166172504425049, + "learning_rate": 9.716979688235392e-06, + "loss": 0.309, + "step": 13615 + }, + { + "epoch": 0.5091945697128304, + "grad_norm": 0.29206401109695435, + "learning_rate": 9.711109514955485e-06, + "loss": 0.2797, + "step": 13620 + }, + { + "epoch": 0.5093814987031802, + "grad_norm": 0.648830235004425, + "learning_rate": 9.705239441305626e-06, + "loss": 0.2557, + "step": 13625 + }, + { + "epoch": 0.50956842769353, + "grad_norm": 0.601845383644104, + "learning_rate": 9.699369469310238e-06, + "loss": 0.3296, + "step": 13630 + }, + { + "epoch": 0.5097553566838797, + "grad_norm": 0.20830629765987396, + "learning_rate": 9.693499600993705e-06, + "loss": 0.2889, + "step": 13635 + }, + { + "epoch": 0.5099422856742295, + "grad_norm": 0.1208687499165535, + "learning_rate": 9.68762983838038e-06, + "loss": 0.2389, + "step": 13640 + }, + { + "epoch": 0.5101292146645793, + "grad_norm": 0.38630300760269165, + "learning_rate": 9.681760183494568e-06, + "loss": 0.3048, + "step": 13645 + }, + { + "epoch": 0.510316143654929, + "grad_norm": 0.2682333290576935, + "learning_rate": 9.675890638360556e-06, + "loss": 0.3013, + "step": 13650 + }, + { + "epoch": 0.5105030726452788, + "grad_norm": 0.46546033024787903, + "learning_rate": 9.670021205002573e-06, + "loss": 0.3102, + "step": 13655 + }, + { + "epoch": 0.5106900016356287, + "grad_norm": 0.2961997985839844, + "learning_rate": 9.66415188544482e-06, + "loss": 0.2771, + "step": 13660 + }, + { + "epoch": 0.5108769306259785, + "grad_norm": 0.4869229197502136, + "learning_rate": 9.65828268171146e-06, + "loss": 0.292, + "step": 13665 + }, + { + "epoch": 0.5110638596163283, + "grad_norm": 0.42666685581207275, + "learning_rate": 9.652413595826612e-06, + "loss": 0.3738, + "step": 13670 + }, + { + "epoch": 0.511250788606678, + "grad_norm": 0.5058145523071289, + "learning_rate": 9.646544629814357e-06, + "loss": 0.3383, + "step": 13675 + }, + { + "epoch": 0.5114377175970278, + "grad_norm": 0.4218425154685974, + "learning_rate": 9.640675785698726e-06, + "loss": 0.2804, + "step": 13680 + }, + { + "epoch": 0.5116246465873776, + "grad_norm": 0.5010436177253723, + "learning_rate": 9.634807065503726e-06, + "loss": 0.2663, + "step": 13685 + }, + { + "epoch": 0.5118115755777274, + "grad_norm": 0.44729965925216675, + "learning_rate": 9.628938471253302e-06, + "loss": 0.347, + "step": 13690 + }, + { + "epoch": 0.5119985045680772, + "grad_norm": 0.4109884202480316, + "learning_rate": 9.62307000497137e-06, + "loss": 0.2371, + "step": 13695 + }, + { + "epoch": 0.5121854335584269, + "grad_norm": 0.35468733310699463, + "learning_rate": 9.61720166868179e-06, + "loss": 0.2445, + "step": 13700 + }, + { + "epoch": 0.5123723625487768, + "grad_norm": 0.31252321600914, + "learning_rate": 9.611333464408383e-06, + "loss": 0.3187, + "step": 13705 + }, + { + "epoch": 0.5125592915391266, + "grad_norm": 0.31071966886520386, + "learning_rate": 9.605465394174933e-06, + "loss": 0.235, + "step": 13710 + }, + { + "epoch": 0.5127462205294764, + "grad_norm": 0.4738657474517822, + "learning_rate": 9.599597460005161e-06, + "loss": 0.2461, + "step": 13715 + }, + { + "epoch": 0.5129331495198262, + "grad_norm": 0.35220175981521606, + "learning_rate": 9.593729663922752e-06, + "loss": 0.3399, + "step": 13720 + }, + { + "epoch": 0.5131200785101759, + "grad_norm": 0.3583666682243347, + "learning_rate": 9.587862007951343e-06, + "loss": 0.307, + "step": 13725 + }, + { + "epoch": 0.5133070075005257, + "grad_norm": 0.5065263509750366, + "learning_rate": 9.581994494114518e-06, + "loss": 0.3031, + "step": 13730 + }, + { + "epoch": 0.5134939364908755, + "grad_norm": 0.5432529449462891, + "learning_rate": 9.576127124435811e-06, + "loss": 0.2919, + "step": 13735 + }, + { + "epoch": 0.5136808654812253, + "grad_norm": 0.36350008845329285, + "learning_rate": 9.570259900938717e-06, + "loss": 0.284, + "step": 13740 + }, + { + "epoch": 0.5138677944715752, + "grad_norm": 0.3522621691226959, + "learning_rate": 9.564392825646669e-06, + "loss": 0.2702, + "step": 13745 + }, + { + "epoch": 0.5140547234619249, + "grad_norm": 0.24228136241436005, + "learning_rate": 9.55852590058305e-06, + "loss": 0.2595, + "step": 13750 + }, + { + "epoch": 0.5142416524522747, + "grad_norm": 0.40059664845466614, + "learning_rate": 9.552659127771204e-06, + "loss": 0.3595, + "step": 13755 + }, + { + "epoch": 0.5144285814426245, + "grad_norm": 0.3901945948600769, + "learning_rate": 9.5467925092344e-06, + "loss": 0.2465, + "step": 13760 + }, + { + "epoch": 0.5146155104329743, + "grad_norm": 0.4316410422325134, + "learning_rate": 9.54092604699588e-06, + "loss": 0.3, + "step": 13765 + }, + { + "epoch": 0.514802439423324, + "grad_norm": 0.2531914710998535, + "learning_rate": 9.53505974307881e-06, + "loss": 0.275, + "step": 13770 + }, + { + "epoch": 0.5149893684136738, + "grad_norm": 1.1252632141113281, + "learning_rate": 9.529193599506313e-06, + "loss": 0.3713, + "step": 13775 + }, + { + "epoch": 0.5151762974040236, + "grad_norm": 0.3661787807941437, + "learning_rate": 9.52332761830145e-06, + "loss": 0.2687, + "step": 13780 + }, + { + "epoch": 0.5153632263943735, + "grad_norm": 0.234545037150383, + "learning_rate": 9.517461801487239e-06, + "loss": 0.3, + "step": 13785 + }, + { + "epoch": 0.5155501553847233, + "grad_norm": 0.29935866594314575, + "learning_rate": 9.51159615108662e-06, + "loss": 0.3166, + "step": 13790 + }, + { + "epoch": 0.515737084375073, + "grad_norm": 0.5030335783958435, + "learning_rate": 9.505730669122494e-06, + "loss": 0.3111, + "step": 13795 + }, + { + "epoch": 0.5159240133654228, + "grad_norm": 0.33726051449775696, + "learning_rate": 9.499865357617703e-06, + "loss": 0.2607, + "step": 13800 + }, + { + "epoch": 0.5161109423557726, + "grad_norm": 0.4101554751396179, + "learning_rate": 9.494000218595015e-06, + "loss": 0.297, + "step": 13805 + }, + { + "epoch": 0.5162978713461224, + "grad_norm": 0.3463868796825409, + "learning_rate": 9.488135254077155e-06, + "loss": 0.2925, + "step": 13810 + }, + { + "epoch": 0.5164848003364721, + "grad_norm": 0.15302151441574097, + "learning_rate": 9.482270466086778e-06, + "loss": 0.3581, + "step": 13815 + }, + { + "epoch": 0.5166717293268219, + "grad_norm": 0.2611302435398102, + "learning_rate": 9.476405856646485e-06, + "loss": 0.2782, + "step": 13820 + }, + { + "epoch": 0.5168586583171718, + "grad_norm": 0.6067391633987427, + "learning_rate": 9.470541427778805e-06, + "loss": 0.2788, + "step": 13825 + }, + { + "epoch": 0.5170455873075216, + "grad_norm": 0.2857629954814911, + "learning_rate": 9.46467718150622e-06, + "loss": 0.3126, + "step": 13830 + }, + { + "epoch": 0.5172325162978714, + "grad_norm": 0.45087161660194397, + "learning_rate": 9.45881311985113e-06, + "loss": 0.3074, + "step": 13835 + }, + { + "epoch": 0.5174194452882211, + "grad_norm": 0.29453983902931213, + "learning_rate": 9.452949244835893e-06, + "loss": 0.2423, + "step": 13840 + }, + { + "epoch": 0.5176063742785709, + "grad_norm": 0.4951644241809845, + "learning_rate": 9.447085558482787e-06, + "loss": 0.2675, + "step": 13845 + }, + { + "epoch": 0.5177933032689207, + "grad_norm": 0.43756482005119324, + "learning_rate": 9.441222062814024e-06, + "loss": 0.2897, + "step": 13850 + }, + { + "epoch": 0.5179802322592705, + "grad_norm": 0.6385819911956787, + "learning_rate": 9.435358759851767e-06, + "loss": 0.2524, + "step": 13855 + }, + { + "epoch": 0.5181671612496203, + "grad_norm": 0.3710041046142578, + "learning_rate": 9.42949565161809e-06, + "loss": 0.2996, + "step": 13860 + }, + { + "epoch": 0.5183540902399701, + "grad_norm": 0.47704362869262695, + "learning_rate": 9.423632740135021e-06, + "loss": 0.2501, + "step": 13865 + }, + { + "epoch": 0.5185410192303199, + "grad_norm": 0.30794888734817505, + "learning_rate": 9.417770027424499e-06, + "loss": 0.2924, + "step": 13870 + }, + { + "epoch": 0.5187279482206697, + "grad_norm": 0.567466139793396, + "learning_rate": 9.411907515508415e-06, + "loss": 0.3012, + "step": 13875 + }, + { + "epoch": 0.5189148772110195, + "grad_norm": 0.5275660157203674, + "learning_rate": 9.406045206408574e-06, + "loss": 0.3539, + "step": 13880 + }, + { + "epoch": 0.5191018062013693, + "grad_norm": 0.17727385461330414, + "learning_rate": 9.400183102146726e-06, + "loss": 0.2714, + "step": 13885 + }, + { + "epoch": 0.519288735191719, + "grad_norm": 0.5720701813697815, + "learning_rate": 9.394321204744538e-06, + "loss": 0.2234, + "step": 13890 + }, + { + "epoch": 0.5194756641820688, + "grad_norm": 0.19189219176769257, + "learning_rate": 9.388459516223611e-06, + "loss": 0.3371, + "step": 13895 + }, + { + "epoch": 0.5196625931724186, + "grad_norm": 0.7133250832557678, + "learning_rate": 9.382598038605477e-06, + "loss": 0.3986, + "step": 13900 + }, + { + "epoch": 0.5198495221627685, + "grad_norm": 0.3994240164756775, + "learning_rate": 9.376736773911583e-06, + "loss": 0.3639, + "step": 13905 + }, + { + "epoch": 0.5200364511531183, + "grad_norm": 0.27001604437828064, + "learning_rate": 9.370875724163322e-06, + "loss": 0.2863, + "step": 13910 + }, + { + "epoch": 0.520223380143468, + "grad_norm": 0.5072554349899292, + "learning_rate": 9.365014891381996e-06, + "loss": 0.2816, + "step": 13915 + }, + { + "epoch": 0.5204103091338178, + "grad_norm": 0.4427657723426819, + "learning_rate": 9.35915427758884e-06, + "loss": 0.257, + "step": 13920 + }, + { + "epoch": 0.5205972381241676, + "grad_norm": 0.3535575270652771, + "learning_rate": 9.353293884805008e-06, + "loss": 0.2465, + "step": 13925 + }, + { + "epoch": 0.5207841671145174, + "grad_norm": 0.2692049443721771, + "learning_rate": 9.347433715051585e-06, + "loss": 0.2309, + "step": 13930 + }, + { + "epoch": 0.5209710961048671, + "grad_norm": 0.316593199968338, + "learning_rate": 9.341573770349579e-06, + "loss": 0.2596, + "step": 13935 + }, + { + "epoch": 0.5211580250952169, + "grad_norm": 0.5187430381774902, + "learning_rate": 9.33571405271991e-06, + "loss": 0.3025, + "step": 13940 + }, + { + "epoch": 0.5213449540855667, + "grad_norm": 0.4760133922100067, + "learning_rate": 9.329854564183433e-06, + "loss": 0.2937, + "step": 13945 + }, + { + "epoch": 0.5215318830759166, + "grad_norm": 0.36547884345054626, + "learning_rate": 9.323995306760909e-06, + "loss": 0.2883, + "step": 13950 + }, + { + "epoch": 0.5217188120662664, + "grad_norm": 0.3512476980686188, + "learning_rate": 9.31813628247304e-06, + "loss": 0.283, + "step": 13955 + }, + { + "epoch": 0.5219057410566161, + "grad_norm": 0.40813568234443665, + "learning_rate": 9.312277493340428e-06, + "loss": 0.278, + "step": 13960 + }, + { + "epoch": 0.5220926700469659, + "grad_norm": 0.35714733600616455, + "learning_rate": 9.306418941383602e-06, + "loss": 0.2797, + "step": 13965 + }, + { + "epoch": 0.5222795990373157, + "grad_norm": 0.4371785521507263, + "learning_rate": 9.300560628623007e-06, + "loss": 0.2286, + "step": 13970 + }, + { + "epoch": 0.5224665280276655, + "grad_norm": 0.8280014991760254, + "learning_rate": 9.294702557079012e-06, + "loss": 0.3348, + "step": 13975 + }, + { + "epoch": 0.5226534570180152, + "grad_norm": 0.6949049234390259, + "learning_rate": 9.288844728771898e-06, + "loss": 0.327, + "step": 13980 + }, + { + "epoch": 0.522840386008365, + "grad_norm": 0.573320209980011, + "learning_rate": 9.282987145721853e-06, + "loss": 0.2654, + "step": 13985 + }, + { + "epoch": 0.5230273149987149, + "grad_norm": 0.28571873903274536, + "learning_rate": 9.277129809949004e-06, + "loss": 0.3184, + "step": 13990 + }, + { + "epoch": 0.5232142439890647, + "grad_norm": 0.49826252460479736, + "learning_rate": 9.271272723473365e-06, + "loss": 0.2586, + "step": 13995 + }, + { + "epoch": 0.5234011729794145, + "grad_norm": 0.33316782116889954, + "learning_rate": 9.265415888314887e-06, + "loss": 0.3419, + "step": 14000 + }, + { + "epoch": 0.5235881019697642, + "grad_norm": 0.589277446269989, + "learning_rate": 9.25955930649342e-06, + "loss": 0.2259, + "step": 14005 + }, + { + "epoch": 0.523775030960114, + "grad_norm": 0.48725539445877075, + "learning_rate": 9.253702980028732e-06, + "loss": 0.3722, + "step": 14010 + }, + { + "epoch": 0.5239619599504638, + "grad_norm": 0.7195634245872498, + "learning_rate": 9.2478469109405e-06, + "loss": 0.2598, + "step": 14015 + }, + { + "epoch": 0.5241488889408136, + "grad_norm": 0.23791752755641937, + "learning_rate": 9.241991101248314e-06, + "loss": 0.2484, + "step": 14020 + }, + { + "epoch": 0.5243358179311634, + "grad_norm": 0.32968446612358093, + "learning_rate": 9.236135552971684e-06, + "loss": 0.2552, + "step": 14025 + }, + { + "epoch": 0.5245227469215132, + "grad_norm": 0.32867518067359924, + "learning_rate": 9.230280268130011e-06, + "loss": 0.2843, + "step": 14030 + }, + { + "epoch": 0.524709675911863, + "grad_norm": 0.4443724453449249, + "learning_rate": 9.22442524874262e-06, + "loss": 0.313, + "step": 14035 + }, + { + "epoch": 0.5248966049022128, + "grad_norm": 0.38702741265296936, + "learning_rate": 9.218570496828733e-06, + "loss": 0.2381, + "step": 14040 + }, + { + "epoch": 0.5250835338925626, + "grad_norm": 0.27657172083854675, + "learning_rate": 9.212716014407498e-06, + "loss": 0.2563, + "step": 14045 + }, + { + "epoch": 0.5252704628829123, + "grad_norm": 0.247044637799263, + "learning_rate": 9.206861803497946e-06, + "loss": 0.2164, + "step": 14050 + }, + { + "epoch": 0.5254573918732621, + "grad_norm": 0.3950256407260895, + "learning_rate": 9.201007866119035e-06, + "loss": 0.3164, + "step": 14055 + }, + { + "epoch": 0.5256443208636119, + "grad_norm": 0.4747505784034729, + "learning_rate": 9.195154204289614e-06, + "loss": 0.2861, + "step": 14060 + }, + { + "epoch": 0.5258312498539617, + "grad_norm": 0.27522939443588257, + "learning_rate": 9.189300820028444e-06, + "loss": 0.3118, + "step": 14065 + }, + { + "epoch": 0.5260181788443116, + "grad_norm": 0.4004516303539276, + "learning_rate": 9.183447715354197e-06, + "loss": 0.2722, + "step": 14070 + }, + { + "epoch": 0.5262051078346613, + "grad_norm": 1.1275408267974854, + "learning_rate": 9.177594892285434e-06, + "loss": 0.2359, + "step": 14075 + }, + { + "epoch": 0.5263920368250111, + "grad_norm": 0.221548393368721, + "learning_rate": 9.171742352840628e-06, + "loss": 0.2919, + "step": 14080 + }, + { + "epoch": 0.5265789658153609, + "grad_norm": 0.3027479946613312, + "learning_rate": 9.165890099038149e-06, + "loss": 0.2954, + "step": 14085 + }, + { + "epoch": 0.5267658948057107, + "grad_norm": 0.39747804403305054, + "learning_rate": 9.160038132896279e-06, + "loss": 0.2939, + "step": 14090 + }, + { + "epoch": 0.5269528237960605, + "grad_norm": 0.3907001316547394, + "learning_rate": 9.154186456433185e-06, + "loss": 0.3942, + "step": 14095 + }, + { + "epoch": 0.5271397527864102, + "grad_norm": 0.4810253083705902, + "learning_rate": 9.148335071666949e-06, + "loss": 0.2717, + "step": 14100 + }, + { + "epoch": 0.52732668177676, + "grad_norm": 0.21729324758052826, + "learning_rate": 9.142483980615545e-06, + "loss": 0.2178, + "step": 14105 + }, + { + "epoch": 0.5275136107671099, + "grad_norm": 0.331342875957489, + "learning_rate": 9.13663318529684e-06, + "loss": 0.3857, + "step": 14110 + }, + { + "epoch": 0.5277005397574597, + "grad_norm": 0.44765812158584595, + "learning_rate": 9.130782687728615e-06, + "loss": 0.2857, + "step": 14115 + }, + { + "epoch": 0.5278874687478095, + "grad_norm": 0.4248161315917969, + "learning_rate": 9.124932489928535e-06, + "loss": 0.3475, + "step": 14120 + }, + { + "epoch": 0.5280743977381592, + "grad_norm": 0.2518801689147949, + "learning_rate": 9.119082593914164e-06, + "loss": 0.3148, + "step": 14125 + }, + { + "epoch": 0.528261326728509, + "grad_norm": 0.24737538397312164, + "learning_rate": 9.113233001702963e-06, + "loss": 0.2533, + "step": 14130 + }, + { + "epoch": 0.5284482557188588, + "grad_norm": 0.36829543113708496, + "learning_rate": 9.107383715312294e-06, + "loss": 0.3111, + "step": 14135 + }, + { + "epoch": 0.5286351847092086, + "grad_norm": 0.42858853936195374, + "learning_rate": 9.101534736759402e-06, + "loss": 0.2648, + "step": 14140 + }, + { + "epoch": 0.5288221136995583, + "grad_norm": 0.436758428812027, + "learning_rate": 9.095686068061439e-06, + "loss": 0.296, + "step": 14145 + }, + { + "epoch": 0.5290090426899082, + "grad_norm": 0.4839959442615509, + "learning_rate": 9.089837711235436e-06, + "loss": 0.3344, + "step": 14150 + }, + { + "epoch": 0.529195971680258, + "grad_norm": 0.3259483277797699, + "learning_rate": 9.083989668298326e-06, + "loss": 0.2556, + "step": 14155 + }, + { + "epoch": 0.5293829006706078, + "grad_norm": 0.3588860332965851, + "learning_rate": 9.078141941266934e-06, + "loss": 0.2971, + "step": 14160 + }, + { + "epoch": 0.5295698296609576, + "grad_norm": 0.33428633213043213, + "learning_rate": 9.072294532157973e-06, + "loss": 0.2645, + "step": 14165 + }, + { + "epoch": 0.5297567586513073, + "grad_norm": 0.7300418615341187, + "learning_rate": 9.066447442988044e-06, + "loss": 0.3597, + "step": 14170 + }, + { + "epoch": 0.5299436876416571, + "grad_norm": 0.3091430366039276, + "learning_rate": 9.060600675773644e-06, + "loss": 0.2246, + "step": 14175 + }, + { + "epoch": 0.5301306166320069, + "grad_norm": 0.47581833600997925, + "learning_rate": 9.054754232531153e-06, + "loss": 0.2581, + "step": 14180 + }, + { + "epoch": 0.5303175456223567, + "grad_norm": 0.4748400151729584, + "learning_rate": 9.04890811527684e-06, + "loss": 0.2788, + "step": 14185 + }, + { + "epoch": 0.5305044746127064, + "grad_norm": 0.21533802151679993, + "learning_rate": 9.04306232602687e-06, + "loss": 0.2591, + "step": 14190 + }, + { + "epoch": 0.5306914036030563, + "grad_norm": 0.3625734746456146, + "learning_rate": 9.037216866797281e-06, + "loss": 0.2998, + "step": 14195 + }, + { + "epoch": 0.5308783325934061, + "grad_norm": 0.6287603378295898, + "learning_rate": 9.031371739604006e-06, + "loss": 0.3302, + "step": 14200 + }, + { + "epoch": 0.5310652615837559, + "grad_norm": 0.28976184129714966, + "learning_rate": 9.025526946462868e-06, + "loss": 0.2641, + "step": 14205 + }, + { + "epoch": 0.5312521905741057, + "grad_norm": 0.4596756398677826, + "learning_rate": 9.01968248938956e-06, + "loss": 0.2452, + "step": 14210 + }, + { + "epoch": 0.5314391195644554, + "grad_norm": 0.3350590765476227, + "learning_rate": 9.013838370399675e-06, + "loss": 0.3145, + "step": 14215 + }, + { + "epoch": 0.5316260485548052, + "grad_norm": 0.31188416481018066, + "learning_rate": 9.007994591508677e-06, + "loss": 0.2753, + "step": 14220 + }, + { + "epoch": 0.531812977545155, + "grad_norm": 0.22735527157783508, + "learning_rate": 9.002151154731922e-06, + "loss": 0.3517, + "step": 14225 + }, + { + "epoch": 0.5319999065355048, + "grad_norm": 0.32361796498298645, + "learning_rate": 8.996308062084638e-06, + "loss": 0.3578, + "step": 14230 + }, + { + "epoch": 0.5321868355258547, + "grad_norm": 0.44661062955856323, + "learning_rate": 8.990465315581947e-06, + "loss": 0.3692, + "step": 14235 + }, + { + "epoch": 0.5323737645162044, + "grad_norm": 0.6878971457481384, + "learning_rate": 8.984622917238842e-06, + "loss": 0.3867, + "step": 14240 + }, + { + "epoch": 0.5325606935065542, + "grad_norm": 0.33011388778686523, + "learning_rate": 8.978780869070198e-06, + "loss": 0.2912, + "step": 14245 + }, + { + "epoch": 0.532747622496904, + "grad_norm": 0.38461020588874817, + "learning_rate": 8.972939173090768e-06, + "loss": 0.2551, + "step": 14250 + }, + { + "epoch": 0.5329345514872538, + "grad_norm": 0.6979146599769592, + "learning_rate": 8.967097831315188e-06, + "loss": 0.3248, + "step": 14255 + }, + { + "epoch": 0.5331214804776035, + "grad_norm": 0.2943269610404968, + "learning_rate": 8.961256845757973e-06, + "loss": 0.2884, + "step": 14260 + }, + { + "epoch": 0.5333084094679533, + "grad_norm": 0.2779577076435089, + "learning_rate": 8.955416218433506e-06, + "loss": 0.2509, + "step": 14265 + }, + { + "epoch": 0.5334953384583031, + "grad_norm": 0.16579632461071014, + "learning_rate": 8.949575951356057e-06, + "loss": 0.2584, + "step": 14270 + }, + { + "epoch": 0.533682267448653, + "grad_norm": 0.30751311779022217, + "learning_rate": 8.94373604653976e-06, + "loss": 0.2895, + "step": 14275 + }, + { + "epoch": 0.5338691964390028, + "grad_norm": 4.094731330871582, + "learning_rate": 8.937896505998638e-06, + "loss": 0.3417, + "step": 14280 + }, + { + "epoch": 0.5340561254293525, + "grad_norm": 0.3279981017112732, + "learning_rate": 8.932057331746576e-06, + "loss": 0.3156, + "step": 14285 + }, + { + "epoch": 0.5342430544197023, + "grad_norm": 0.6543190479278564, + "learning_rate": 8.926218525797342e-06, + "loss": 0.2528, + "step": 14290 + }, + { + "epoch": 0.5344299834100521, + "grad_norm": 0.28789252042770386, + "learning_rate": 8.920380090164569e-06, + "loss": 0.3179, + "step": 14295 + }, + { + "epoch": 0.5346169124004019, + "grad_norm": 0.4924599230289459, + "learning_rate": 8.914542026861765e-06, + "loss": 0.3752, + "step": 14300 + }, + { + "epoch": 0.5348038413907517, + "grad_norm": 0.33676034212112427, + "learning_rate": 8.908704337902318e-06, + "loss": 0.2342, + "step": 14305 + }, + { + "epoch": 0.5349907703811014, + "grad_norm": 0.3200925290584564, + "learning_rate": 8.902867025299475e-06, + "loss": 0.3312, + "step": 14310 + }, + { + "epoch": 0.5351776993714513, + "grad_norm": 0.39076340198516846, + "learning_rate": 8.897030091066359e-06, + "loss": 0.2409, + "step": 14315 + }, + { + "epoch": 0.5353646283618011, + "grad_norm": 0.1652345061302185, + "learning_rate": 8.891193537215956e-06, + "loss": 0.2935, + "step": 14320 + }, + { + "epoch": 0.5355515573521509, + "grad_norm": 0.33995094895362854, + "learning_rate": 8.885357365761136e-06, + "loss": 0.2829, + "step": 14325 + }, + { + "epoch": 0.5357384863425007, + "grad_norm": 0.48137718439102173, + "learning_rate": 8.879521578714617e-06, + "loss": 0.3732, + "step": 14330 + }, + { + "epoch": 0.5359254153328504, + "grad_norm": 0.3473806381225586, + "learning_rate": 8.873686178089004e-06, + "loss": 0.2582, + "step": 14335 + }, + { + "epoch": 0.5361123443232002, + "grad_norm": 0.9458356499671936, + "learning_rate": 8.867851165896752e-06, + "loss": 0.2783, + "step": 14340 + }, + { + "epoch": 0.53629927331355, + "grad_norm": 0.2733253538608551, + "learning_rate": 8.862016544150192e-06, + "loss": 0.3056, + "step": 14345 + }, + { + "epoch": 0.5364862023038998, + "grad_norm": 0.33390501141548157, + "learning_rate": 8.856182314861524e-06, + "loss": 0.2556, + "step": 14350 + }, + { + "epoch": 0.5366731312942497, + "grad_norm": 0.5112034678459167, + "learning_rate": 8.850348480042794e-06, + "loss": 0.3071, + "step": 14355 + }, + { + "epoch": 0.5368600602845994, + "grad_norm": 0.3586961328983307, + "learning_rate": 8.844515041705938e-06, + "loss": 0.242, + "step": 14360 + }, + { + "epoch": 0.5370469892749492, + "grad_norm": 0.4830436408519745, + "learning_rate": 8.838682001862732e-06, + "loss": 0.227, + "step": 14365 + }, + { + "epoch": 0.537233918265299, + "grad_norm": 0.34037473797798157, + "learning_rate": 8.83284936252483e-06, + "loss": 0.3112, + "step": 14370 + }, + { + "epoch": 0.5374208472556488, + "grad_norm": 0.21011002361774445, + "learning_rate": 8.827017125703735e-06, + "loss": 0.2921, + "step": 14375 + }, + { + "epoch": 0.5376077762459985, + "grad_norm": 0.42194753885269165, + "learning_rate": 8.821185293410827e-06, + "loss": 0.3428, + "step": 14380 + }, + { + "epoch": 0.5377947052363483, + "grad_norm": 0.48263490200042725, + "learning_rate": 8.815353867657334e-06, + "loss": 0.2702, + "step": 14385 + }, + { + "epoch": 0.5379816342266981, + "grad_norm": 0.46013012528419495, + "learning_rate": 8.809522850454343e-06, + "loss": 0.3728, + "step": 14390 + }, + { + "epoch": 0.538168563217048, + "grad_norm": 0.4305642247200012, + "learning_rate": 8.803692243812816e-06, + "loss": 0.2556, + "step": 14395 + }, + { + "epoch": 0.5383554922073978, + "grad_norm": 0.3640795052051544, + "learning_rate": 8.79786204974355e-06, + "loss": 0.3871, + "step": 14400 + }, + { + "epoch": 0.5385424211977475, + "grad_norm": 0.413748562335968, + "learning_rate": 8.792032270257223e-06, + "loss": 0.2976, + "step": 14405 + }, + { + "epoch": 0.5387293501880973, + "grad_norm": 0.28498637676239014, + "learning_rate": 8.786202907364349e-06, + "loss": 0.3093, + "step": 14410 + }, + { + "epoch": 0.5389162791784471, + "grad_norm": 0.3062063455581665, + "learning_rate": 8.780373963075315e-06, + "loss": 0.3509, + "step": 14415 + }, + { + "epoch": 0.5391032081687969, + "grad_norm": 0.34864166378974915, + "learning_rate": 8.774545439400352e-06, + "loss": 0.3229, + "step": 14420 + }, + { + "epoch": 0.5392901371591466, + "grad_norm": 0.27013546228408813, + "learning_rate": 8.768717338349557e-06, + "loss": 0.2775, + "step": 14425 + }, + { + "epoch": 0.5394770661494964, + "grad_norm": 0.23966774344444275, + "learning_rate": 8.762889661932869e-06, + "loss": 0.3353, + "step": 14430 + }, + { + "epoch": 0.5396639951398462, + "grad_norm": 0.3070327639579773, + "learning_rate": 8.757062412160085e-06, + "loss": 0.2272, + "step": 14435 + }, + { + "epoch": 0.5398509241301961, + "grad_norm": 0.9705345630645752, + "learning_rate": 8.751235591040867e-06, + "loss": 0.2292, + "step": 14440 + }, + { + "epoch": 0.5400378531205459, + "grad_norm": 0.2887144386768341, + "learning_rate": 8.745409200584707e-06, + "loss": 0.2384, + "step": 14445 + }, + { + "epoch": 0.5402247821108956, + "grad_norm": 0.22318506240844727, + "learning_rate": 8.739583242800968e-06, + "loss": 0.2734, + "step": 14450 + }, + { + "epoch": 0.5404117111012454, + "grad_norm": 0.4824393093585968, + "learning_rate": 8.733757719698854e-06, + "loss": 0.2736, + "step": 14455 + }, + { + "epoch": 0.5405986400915952, + "grad_norm": 0.677008867263794, + "learning_rate": 8.72793263328742e-06, + "loss": 0.2748, + "step": 14460 + }, + { + "epoch": 0.540785569081945, + "grad_norm": 0.32696858048439026, + "learning_rate": 8.722107985575567e-06, + "loss": 0.2841, + "step": 14465 + }, + { + "epoch": 0.5409724980722947, + "grad_norm": 0.35395094752311707, + "learning_rate": 8.716283778572058e-06, + "loss": 0.3439, + "step": 14470 + }, + { + "epoch": 0.5411594270626445, + "grad_norm": 0.27938494086265564, + "learning_rate": 8.710460014285486e-06, + "loss": 0.2786, + "step": 14475 + }, + { + "epoch": 0.5413463560529944, + "grad_norm": 0.4673710763454437, + "learning_rate": 8.704636694724309e-06, + "loss": 0.2452, + "step": 14480 + }, + { + "epoch": 0.5415332850433442, + "grad_norm": 0.4600588083267212, + "learning_rate": 8.69881382189682e-06, + "loss": 0.327, + "step": 14485 + }, + { + "epoch": 0.541720214033694, + "grad_norm": 0.33369168639183044, + "learning_rate": 8.692991397811157e-06, + "loss": 0.3612, + "step": 14490 + }, + { + "epoch": 0.5419071430240437, + "grad_norm": 0.4194375276565552, + "learning_rate": 8.687169424475312e-06, + "loss": 0.2282, + "step": 14495 + }, + { + "epoch": 0.5420940720143935, + "grad_norm": 0.3811163902282715, + "learning_rate": 8.681347903897115e-06, + "loss": 0.324, + "step": 14500 + }, + { + "epoch": 0.5422810010047433, + "grad_norm": 0.6883916854858398, + "learning_rate": 8.675526838084244e-06, + "loss": 0.3881, + "step": 14505 + }, + { + "epoch": 0.5424679299950931, + "grad_norm": 0.36151525378227234, + "learning_rate": 8.66970622904421e-06, + "loss": 0.2661, + "step": 14510 + }, + { + "epoch": 0.5426548589854429, + "grad_norm": 0.4361328184604645, + "learning_rate": 8.663886078784386e-06, + "loss": 0.3693, + "step": 14515 + }, + { + "epoch": 0.5428417879757927, + "grad_norm": 0.3427176773548126, + "learning_rate": 8.658066389311963e-06, + "loss": 0.3023, + "step": 14520 + }, + { + "epoch": 0.5430287169661425, + "grad_norm": 0.4090884029865265, + "learning_rate": 8.652247162633994e-06, + "loss": 0.2677, + "step": 14525 + }, + { + "epoch": 0.5432156459564923, + "grad_norm": 0.4226764440536499, + "learning_rate": 8.646428400757363e-06, + "loss": 0.294, + "step": 14530 + }, + { + "epoch": 0.5434025749468421, + "grad_norm": 0.5746899843215942, + "learning_rate": 8.640610105688787e-06, + "loss": 0.325, + "step": 14535 + }, + { + "epoch": 0.5435895039371919, + "grad_norm": 0.37049710750579834, + "learning_rate": 8.634792279434838e-06, + "loss": 0.2668, + "step": 14540 + }, + { + "epoch": 0.5437764329275416, + "grad_norm": 0.40856534242630005, + "learning_rate": 8.62897492400191e-06, + "loss": 0.3502, + "step": 14545 + }, + { + "epoch": 0.5439633619178914, + "grad_norm": 0.3843344449996948, + "learning_rate": 8.623158041396251e-06, + "loss": 0.317, + "step": 14550 + }, + { + "epoch": 0.5441502909082412, + "grad_norm": 0.4334682822227478, + "learning_rate": 8.617341633623928e-06, + "loss": 0.3195, + "step": 14555 + }, + { + "epoch": 0.5443372198985911, + "grad_norm": 0.55138099193573, + "learning_rate": 8.611525702690861e-06, + "loss": 0.2328, + "step": 14560 + }, + { + "epoch": 0.5445241488889409, + "grad_norm": 0.599747359752655, + "learning_rate": 8.60571025060279e-06, + "loss": 0.2468, + "step": 14565 + }, + { + "epoch": 0.5447110778792906, + "grad_norm": 0.6509507298469543, + "learning_rate": 8.599895279365303e-06, + "loss": 0.3409, + "step": 14570 + }, + { + "epoch": 0.5448980068696404, + "grad_norm": 0.2125466763973236, + "learning_rate": 8.59408079098382e-06, + "loss": 0.2632, + "step": 14575 + }, + { + "epoch": 0.5450849358599902, + "grad_norm": 0.3049337565898895, + "learning_rate": 8.588266787463582e-06, + "loss": 0.2482, + "step": 14580 + }, + { + "epoch": 0.54527186485034, + "grad_norm": 0.372995525598526, + "learning_rate": 8.582453270809682e-06, + "loss": 0.3199, + "step": 14585 + }, + { + "epoch": 0.5454587938406897, + "grad_norm": 0.3478299379348755, + "learning_rate": 8.576640243027027e-06, + "loss": 0.2878, + "step": 14590 + }, + { + "epoch": 0.5456457228310395, + "grad_norm": 0.32996904850006104, + "learning_rate": 8.570827706120373e-06, + "loss": 0.2564, + "step": 14595 + }, + { + "epoch": 0.5458326518213894, + "grad_norm": 0.33126890659332275, + "learning_rate": 8.565015662094289e-06, + "loss": 0.2473, + "step": 14600 + }, + { + "epoch": 0.5460195808117392, + "grad_norm": 0.6810954809188843, + "learning_rate": 8.559204112953187e-06, + "loss": 0.3061, + "step": 14605 + }, + { + "epoch": 0.546206509802089, + "grad_norm": 0.286606103181839, + "learning_rate": 8.5533930607013e-06, + "loss": 0.4564, + "step": 14610 + }, + { + "epoch": 0.5463934387924387, + "grad_norm": 0.23572850227355957, + "learning_rate": 8.547582507342696e-06, + "loss": 0.2598, + "step": 14615 + }, + { + "epoch": 0.5465803677827885, + "grad_norm": 0.35271137952804565, + "learning_rate": 8.54177245488127e-06, + "loss": 0.2811, + "step": 14620 + }, + { + "epoch": 0.5467672967731383, + "grad_norm": 0.3139854967594147, + "learning_rate": 8.535962905320739e-06, + "loss": 0.2576, + "step": 14625 + }, + { + "epoch": 0.5469542257634881, + "grad_norm": 0.5428050756454468, + "learning_rate": 8.530153860664657e-06, + "loss": 0.2428, + "step": 14630 + }, + { + "epoch": 0.5471411547538378, + "grad_norm": 0.5763229131698608, + "learning_rate": 8.524345322916383e-06, + "loss": 0.2484, + "step": 14635 + }, + { + "epoch": 0.5473280837441876, + "grad_norm": 0.3398786783218384, + "learning_rate": 8.518537294079132e-06, + "loss": 0.3553, + "step": 14640 + }, + { + "epoch": 0.5475150127345375, + "grad_norm": 0.40959233045578003, + "learning_rate": 8.512729776155917e-06, + "loss": 0.255, + "step": 14645 + }, + { + "epoch": 0.5477019417248873, + "grad_norm": 0.34932756423950195, + "learning_rate": 8.506922771149588e-06, + "loss": 0.3235, + "step": 14650 + }, + { + "epoch": 0.5478888707152371, + "grad_norm": 0.2500455379486084, + "learning_rate": 8.501116281062809e-06, + "loss": 0.2978, + "step": 14655 + }, + { + "epoch": 0.5480757997055868, + "grad_norm": 0.37845414876937866, + "learning_rate": 8.495310307898076e-06, + "loss": 0.3534, + "step": 14660 + }, + { + "epoch": 0.5482627286959366, + "grad_norm": 0.1609395295381546, + "learning_rate": 8.489504853657707e-06, + "loss": 0.2224, + "step": 14665 + }, + { + "epoch": 0.5484496576862864, + "grad_norm": 0.37711602449417114, + "learning_rate": 8.48369992034383e-06, + "loss": 0.2195, + "step": 14670 + }, + { + "epoch": 0.5486365866766362, + "grad_norm": 0.373844712972641, + "learning_rate": 8.477895509958407e-06, + "loss": 0.2513, + "step": 14675 + }, + { + "epoch": 0.548823515666986, + "grad_norm": 0.24388177692890167, + "learning_rate": 8.472091624503204e-06, + "loss": 0.3141, + "step": 14680 + }, + { + "epoch": 0.5490104446573358, + "grad_norm": 0.5337631106376648, + "learning_rate": 8.466288265979822e-06, + "loss": 0.3304, + "step": 14685 + }, + { + "epoch": 0.5491973736476856, + "grad_norm": 0.3623740077018738, + "learning_rate": 8.460485436389672e-06, + "loss": 0.2617, + "step": 14690 + }, + { + "epoch": 0.5493843026380354, + "grad_norm": 0.47312211990356445, + "learning_rate": 8.454683137733982e-06, + "loss": 0.2966, + "step": 14695 + }, + { + "epoch": 0.5495712316283852, + "grad_norm": 0.4503675401210785, + "learning_rate": 8.448881372013795e-06, + "loss": 0.299, + "step": 14700 + }, + { + "epoch": 0.549758160618735, + "grad_norm": 0.3221707344055176, + "learning_rate": 8.443080141229978e-06, + "loss": 0.2533, + "step": 14705 + }, + { + "epoch": 0.5499450896090847, + "grad_norm": 0.25598636269569397, + "learning_rate": 8.437279447383213e-06, + "loss": 0.2354, + "step": 14710 + }, + { + "epoch": 0.5501320185994345, + "grad_norm": 0.3973354399204254, + "learning_rate": 8.431479292473986e-06, + "loss": 0.3455, + "step": 14715 + }, + { + "epoch": 0.5503189475897843, + "grad_norm": 0.30457204580307007, + "learning_rate": 8.42567967850261e-06, + "loss": 0.2669, + "step": 14720 + }, + { + "epoch": 0.5505058765801342, + "grad_norm": 0.5336440205574036, + "learning_rate": 8.4198806074692e-06, + "loss": 0.3144, + "step": 14725 + }, + { + "epoch": 0.550692805570484, + "grad_norm": 0.39468759298324585, + "learning_rate": 8.414082081373695e-06, + "loss": 0.3424, + "step": 14730 + }, + { + "epoch": 0.5508797345608337, + "grad_norm": 0.5090770125389099, + "learning_rate": 8.408284102215833e-06, + "loss": 0.2587, + "step": 14735 + }, + { + "epoch": 0.5510666635511835, + "grad_norm": 0.2587001621723175, + "learning_rate": 8.40248667199518e-06, + "loss": 0.2465, + "step": 14740 + }, + { + "epoch": 0.5512535925415333, + "grad_norm": 0.4725680351257324, + "learning_rate": 8.396689792711098e-06, + "loss": 0.2382, + "step": 14745 + }, + { + "epoch": 0.551440521531883, + "grad_norm": 0.4414125084877014, + "learning_rate": 8.390893466362765e-06, + "loss": 0.2707, + "step": 14750 + }, + { + "epoch": 0.5516274505222328, + "grad_norm": 0.43323227763175964, + "learning_rate": 8.385097694949171e-06, + "loss": 0.2367, + "step": 14755 + }, + { + "epoch": 0.5518143795125826, + "grad_norm": 0.4801645874977112, + "learning_rate": 8.379302480469109e-06, + "loss": 0.2377, + "step": 14760 + }, + { + "epoch": 0.5520013085029325, + "grad_norm": 0.42729660868644714, + "learning_rate": 8.373507824921184e-06, + "loss": 0.311, + "step": 14765 + }, + { + "epoch": 0.5521882374932823, + "grad_norm": 0.3292931318283081, + "learning_rate": 8.3677137303038e-06, + "loss": 0.2404, + "step": 14770 + }, + { + "epoch": 0.552375166483632, + "grad_norm": 0.2796480655670166, + "learning_rate": 8.361920198615182e-06, + "loss": 0.2542, + "step": 14775 + }, + { + "epoch": 0.5525620954739818, + "grad_norm": 0.1017402783036232, + "learning_rate": 8.35612723185335e-06, + "loss": 0.3207, + "step": 14780 + }, + { + "epoch": 0.5527490244643316, + "grad_norm": 0.296451598405838, + "learning_rate": 8.350334832016136e-06, + "loss": 0.269, + "step": 14785 + }, + { + "epoch": 0.5529359534546814, + "grad_norm": 0.4688802659511566, + "learning_rate": 8.344543001101167e-06, + "loss": 0.2914, + "step": 14790 + }, + { + "epoch": 0.5531228824450312, + "grad_norm": 0.2229701429605484, + "learning_rate": 8.33875174110588e-06, + "loss": 0.3238, + "step": 14795 + }, + { + "epoch": 0.5533098114353809, + "grad_norm": 0.6549943685531616, + "learning_rate": 8.332961054027522e-06, + "loss": 0.2368, + "step": 14800 + }, + { + "epoch": 0.5534967404257308, + "grad_norm": 0.4109474718570709, + "learning_rate": 8.327170941863124e-06, + "loss": 0.2954, + "step": 14805 + }, + { + "epoch": 0.5536836694160806, + "grad_norm": 0.4711514711380005, + "learning_rate": 8.32138140660954e-06, + "loss": 0.2371, + "step": 14810 + }, + { + "epoch": 0.5538705984064304, + "grad_norm": 0.32290342450141907, + "learning_rate": 8.31559245026341e-06, + "loss": 0.2736, + "step": 14815 + }, + { + "epoch": 0.5540575273967802, + "grad_norm": 0.34553197026252747, + "learning_rate": 8.309804074821179e-06, + "loss": 0.3564, + "step": 14820 + }, + { + "epoch": 0.5542444563871299, + "grad_norm": 0.3491702377796173, + "learning_rate": 8.304016282279089e-06, + "loss": 0.281, + "step": 14825 + }, + { + "epoch": 0.5544313853774797, + "grad_norm": 0.2792864739894867, + "learning_rate": 8.298229074633192e-06, + "loss": 0.2732, + "step": 14830 + }, + { + "epoch": 0.5546183143678295, + "grad_norm": 0.48167872428894043, + "learning_rate": 8.292442453879324e-06, + "loss": 0.277, + "step": 14835 + }, + { + "epoch": 0.5548052433581793, + "grad_norm": 0.563044548034668, + "learning_rate": 8.286656422013122e-06, + "loss": 0.3111, + "step": 14840 + }, + { + "epoch": 0.5549921723485292, + "grad_norm": 0.3365761637687683, + "learning_rate": 8.280870981030031e-06, + "loss": 0.286, + "step": 14845 + }, + { + "epoch": 0.5551791013388789, + "grad_norm": 0.3140876591205597, + "learning_rate": 8.275086132925277e-06, + "loss": 0.3126, + "step": 14850 + }, + { + "epoch": 0.5553660303292287, + "grad_norm": 0.4990136921405792, + "learning_rate": 8.269301879693892e-06, + "loss": 0.2471, + "step": 14855 + }, + { + "epoch": 0.5555529593195785, + "grad_norm": 0.3173409402370453, + "learning_rate": 8.263518223330698e-06, + "loss": 0.3087, + "step": 14860 + }, + { + "epoch": 0.5557398883099283, + "grad_norm": 0.32432377338409424, + "learning_rate": 8.257735165830314e-06, + "loss": 0.1955, + "step": 14865 + }, + { + "epoch": 0.555926817300278, + "grad_norm": 0.41975530982017517, + "learning_rate": 8.251952709187145e-06, + "loss": 0.2308, + "step": 14870 + }, + { + "epoch": 0.5561137462906278, + "grad_norm": 0.22599612176418304, + "learning_rate": 8.2461708553954e-06, + "loss": 0.2998, + "step": 14875 + }, + { + "epoch": 0.5563006752809776, + "grad_norm": 0.2920520603656769, + "learning_rate": 8.240389606449075e-06, + "loss": 0.2895, + "step": 14880 + }, + { + "epoch": 0.5564876042713274, + "grad_norm": 0.2696673274040222, + "learning_rate": 8.234608964341953e-06, + "loss": 0.3093, + "step": 14885 + }, + { + "epoch": 0.5566745332616773, + "grad_norm": 0.37283971905708313, + "learning_rate": 8.228828931067618e-06, + "loss": 0.3073, + "step": 14890 + }, + { + "epoch": 0.556861462252027, + "grad_norm": 0.22317394614219666, + "learning_rate": 8.223049508619429e-06, + "loss": 0.3098, + "step": 14895 + }, + { + "epoch": 0.5570483912423768, + "grad_norm": 0.5522322058677673, + "learning_rate": 8.217270698990555e-06, + "loss": 0.3913, + "step": 14900 + }, + { + "epoch": 0.5572353202327266, + "grad_norm": 0.3208252787590027, + "learning_rate": 8.21149250417393e-06, + "loss": 0.2539, + "step": 14905 + }, + { + "epoch": 0.5574222492230764, + "grad_norm": 0.3736477494239807, + "learning_rate": 8.205714926162298e-06, + "loss": 0.4014, + "step": 14910 + }, + { + "epoch": 0.5576091782134261, + "grad_norm": 0.3176252543926239, + "learning_rate": 8.199937966948168e-06, + "loss": 0.3256, + "step": 14915 + }, + { + "epoch": 0.5577961072037759, + "grad_norm": 0.3628195822238922, + "learning_rate": 8.194161628523863e-06, + "loss": 0.3898, + "step": 14920 + }, + { + "epoch": 0.5579830361941257, + "grad_norm": 0.3258436620235443, + "learning_rate": 8.18838591288146e-06, + "loss": 0.2693, + "step": 14925 + }, + { + "epoch": 0.5581699651844756, + "grad_norm": 0.4312792420387268, + "learning_rate": 8.18261082201285e-06, + "loss": 0.2869, + "step": 14930 + }, + { + "epoch": 0.5583568941748254, + "grad_norm": 0.43801358342170715, + "learning_rate": 8.176836357909697e-06, + "loss": 0.3012, + "step": 14935 + }, + { + "epoch": 0.5585438231651751, + "grad_norm": 0.4672282934188843, + "learning_rate": 8.171062522563438e-06, + "loss": 0.2744, + "step": 14940 + }, + { + "epoch": 0.5587307521555249, + "grad_norm": 0.36364656686782837, + "learning_rate": 8.165289317965314e-06, + "loss": 0.3149, + "step": 14945 + }, + { + "epoch": 0.5589176811458747, + "grad_norm": 0.6358222961425781, + "learning_rate": 8.159516746106331e-06, + "loss": 0.3848, + "step": 14950 + }, + { + "epoch": 0.5591046101362245, + "grad_norm": 0.34080687165260315, + "learning_rate": 8.153744808977287e-06, + "loss": 0.287, + "step": 14955 + }, + { + "epoch": 0.5592915391265743, + "grad_norm": 0.4259713888168335, + "learning_rate": 8.147973508568753e-06, + "loss": 0.2668, + "step": 14960 + }, + { + "epoch": 0.559478468116924, + "grad_norm": 0.25825029611587524, + "learning_rate": 8.142202846871093e-06, + "loss": 0.2385, + "step": 14965 + }, + { + "epoch": 0.5596653971072739, + "grad_norm": 0.5594242215156555, + "learning_rate": 8.136432825874433e-06, + "loss": 0.3407, + "step": 14970 + }, + { + "epoch": 0.5598523260976237, + "grad_norm": 0.39524081349372864, + "learning_rate": 8.130663447568696e-06, + "loss": 0.2397, + "step": 14975 + }, + { + "epoch": 0.5600392550879735, + "grad_norm": 0.391757071018219, + "learning_rate": 8.124894713943576e-06, + "loss": 0.2447, + "step": 14980 + }, + { + "epoch": 0.5602261840783233, + "grad_norm": 0.4008239209651947, + "learning_rate": 8.119126626988535e-06, + "loss": 0.2966, + "step": 14985 + }, + { + "epoch": 0.560413113068673, + "grad_norm": 0.3706965148448944, + "learning_rate": 8.11335918869283e-06, + "loss": 0.3095, + "step": 14990 + }, + { + "epoch": 0.5606000420590228, + "grad_norm": 1.8738888502120972, + "learning_rate": 8.10759240104548e-06, + "loss": 0.3566, + "step": 14995 + }, + { + "epoch": 0.5607869710493726, + "grad_norm": 0.28992828726768494, + "learning_rate": 8.10182626603529e-06, + "loss": 0.2649, + "step": 15000 + }, + { + "epoch": 0.5609739000397224, + "grad_norm": 0.2337641566991806, + "learning_rate": 8.096060785650829e-06, + "loss": 0.2402, + "step": 15005 + }, + { + "epoch": 0.5611608290300723, + "grad_norm": 0.28632795810699463, + "learning_rate": 8.09029596188045e-06, + "loss": 0.25, + "step": 15010 + }, + { + "epoch": 0.561347758020422, + "grad_norm": 0.4603891670703888, + "learning_rate": 8.08453179671227e-06, + "loss": 0.3261, + "step": 15015 + }, + { + "epoch": 0.5615346870107718, + "grad_norm": 0.4253883361816406, + "learning_rate": 8.07876829213419e-06, + "loss": 0.3169, + "step": 15020 + }, + { + "epoch": 0.5617216160011216, + "grad_norm": 0.3845186233520508, + "learning_rate": 8.073005450133877e-06, + "loss": 0.2886, + "step": 15025 + }, + { + "epoch": 0.5619085449914714, + "grad_norm": 0.31487545371055603, + "learning_rate": 8.067243272698766e-06, + "loss": 0.2967, + "step": 15030 + }, + { + "epoch": 0.5620954739818211, + "grad_norm": 0.29043373465538025, + "learning_rate": 8.061481761816073e-06, + "loss": 0.2707, + "step": 15035 + }, + { + "epoch": 0.5622824029721709, + "grad_norm": 0.4541192054748535, + "learning_rate": 8.055720919472771e-06, + "loss": 0.3279, + "step": 15040 + }, + { + "epoch": 0.5624693319625207, + "grad_norm": 0.3087974190711975, + "learning_rate": 8.049960747655618e-06, + "loss": 0.231, + "step": 15045 + }, + { + "epoch": 0.5626562609528706, + "grad_norm": 1.4350837469100952, + "learning_rate": 8.044201248351125e-06, + "loss": 0.3562, + "step": 15050 + }, + { + "epoch": 0.5628431899432204, + "grad_norm": 0.5882908701896667, + "learning_rate": 8.038442423545583e-06, + "loss": 0.337, + "step": 15055 + }, + { + "epoch": 0.5630301189335701, + "grad_norm": 0.3661154806613922, + "learning_rate": 8.032684275225038e-06, + "loss": 0.4017, + "step": 15060 + }, + { + "epoch": 0.5632170479239199, + "grad_norm": 0.552562952041626, + "learning_rate": 8.026926805375319e-06, + "loss": 0.2177, + "step": 15065 + }, + { + "epoch": 0.5634039769142697, + "grad_norm": 0.5023968815803528, + "learning_rate": 8.021170015982009e-06, + "loss": 0.2301, + "step": 15070 + }, + { + "epoch": 0.5635909059046195, + "grad_norm": 0.22658932209014893, + "learning_rate": 8.01541390903046e-06, + "loss": 0.3225, + "step": 15075 + }, + { + "epoch": 0.5637778348949692, + "grad_norm": 0.32941967248916626, + "learning_rate": 8.00965848650579e-06, + "loss": 0.264, + "step": 15080 + }, + { + "epoch": 0.563964763885319, + "grad_norm": 0.5150809288024902, + "learning_rate": 8.003903750392872e-06, + "loss": 0.2672, + "step": 15085 + }, + { + "epoch": 0.5641516928756689, + "grad_norm": 0.2671123743057251, + "learning_rate": 7.99814970267636e-06, + "loss": 0.2327, + "step": 15090 + }, + { + "epoch": 0.5643386218660187, + "grad_norm": 0.4776862561702728, + "learning_rate": 7.992396345340654e-06, + "loss": 0.2703, + "step": 15095 + }, + { + "epoch": 0.5645255508563685, + "grad_norm": 0.34787583351135254, + "learning_rate": 7.986643680369925e-06, + "loss": 0.2732, + "step": 15100 + }, + { + "epoch": 0.5647124798467182, + "grad_norm": 0.34038421511650085, + "learning_rate": 7.980891709748097e-06, + "loss": 0.2566, + "step": 15105 + }, + { + "epoch": 0.564899408837068, + "grad_norm": 0.5559023022651672, + "learning_rate": 7.975140435458864e-06, + "loss": 0.3003, + "step": 15110 + }, + { + "epoch": 0.5650863378274178, + "grad_norm": 0.6152315139770508, + "learning_rate": 7.969389859485679e-06, + "loss": 0.3102, + "step": 15115 + }, + { + "epoch": 0.5652732668177676, + "grad_norm": 0.28499045968055725, + "learning_rate": 7.963639983811744e-06, + "loss": 0.3374, + "step": 15120 + }, + { + "epoch": 0.5654601958081173, + "grad_norm": 0.4572030007839203, + "learning_rate": 7.957890810420033e-06, + "loss": 0.2414, + "step": 15125 + }, + { + "epoch": 0.5656471247984671, + "grad_norm": 0.44276192784309387, + "learning_rate": 7.952142341293264e-06, + "loss": 0.3226, + "step": 15130 + }, + { + "epoch": 0.565834053788817, + "grad_norm": 0.44015127420425415, + "learning_rate": 7.946394578413923e-06, + "loss": 0.2718, + "step": 15135 + }, + { + "epoch": 0.5660209827791668, + "grad_norm": 0.4518749415874481, + "learning_rate": 7.940647523764251e-06, + "loss": 0.2729, + "step": 15140 + }, + { + "epoch": 0.5662079117695166, + "grad_norm": 0.13977321982383728, + "learning_rate": 7.93490117932624e-06, + "loss": 0.3054, + "step": 15145 + }, + { + "epoch": 0.5663948407598663, + "grad_norm": 0.3425142765045166, + "learning_rate": 7.929155547081637e-06, + "loss": 0.2925, + "step": 15150 + }, + { + "epoch": 0.5665817697502161, + "grad_norm": 1.439296841621399, + "learning_rate": 7.923410629011947e-06, + "loss": 0.2461, + "step": 15155 + }, + { + "epoch": 0.5667686987405659, + "grad_norm": 0.3877362310886383, + "learning_rate": 7.917666427098434e-06, + "loss": 0.2503, + "step": 15160 + }, + { + "epoch": 0.5669556277309157, + "grad_norm": 0.4661048352718353, + "learning_rate": 7.911922943322102e-06, + "loss": 0.3003, + "step": 15165 + }, + { + "epoch": 0.5671425567212655, + "grad_norm": 0.5539686679840088, + "learning_rate": 7.906180179663719e-06, + "loss": 0.3221, + "step": 15170 + }, + { + "epoch": 0.5673294857116153, + "grad_norm": 0.45953917503356934, + "learning_rate": 7.900438138103791e-06, + "loss": 0.2582, + "step": 15175 + }, + { + "epoch": 0.5675164147019651, + "grad_norm": 0.5768008828163147, + "learning_rate": 7.894696820622594e-06, + "loss": 0.239, + "step": 15180 + }, + { + "epoch": 0.5677033436923149, + "grad_norm": 0.3129447400569916, + "learning_rate": 7.888956229200134e-06, + "loss": 0.3238, + "step": 15185 + }, + { + "epoch": 0.5678902726826647, + "grad_norm": 0.3107287287712097, + "learning_rate": 7.883216365816186e-06, + "loss": 0.2606, + "step": 15190 + }, + { + "epoch": 0.5680772016730145, + "grad_norm": 0.35008296370506287, + "learning_rate": 7.877477232450258e-06, + "loss": 0.252, + "step": 15195 + }, + { + "epoch": 0.5682641306633642, + "grad_norm": 0.33052533864974976, + "learning_rate": 7.871738831081613e-06, + "loss": 0.2242, + "step": 15200 + }, + { + "epoch": 0.568451059653714, + "grad_norm": 0.36529335379600525, + "learning_rate": 7.866001163689264e-06, + "loss": 0.4166, + "step": 15205 + }, + { + "epoch": 0.5686379886440638, + "grad_norm": 0.5268134474754333, + "learning_rate": 7.860264232251968e-06, + "loss": 0.3337, + "step": 15210 + }, + { + "epoch": 0.5688249176344137, + "grad_norm": 0.5382173657417297, + "learning_rate": 7.85452803874823e-06, + "loss": 0.3197, + "step": 15215 + }, + { + "epoch": 0.5690118466247635, + "grad_norm": 0.29692742228507996, + "learning_rate": 7.84879258515629e-06, + "loss": 0.3722, + "step": 15220 + }, + { + "epoch": 0.5691987756151132, + "grad_norm": 0.31395187973976135, + "learning_rate": 7.843057873454151e-06, + "loss": 0.2898, + "step": 15225 + }, + { + "epoch": 0.569385704605463, + "grad_norm": 0.16513025760650635, + "learning_rate": 7.837323905619543e-06, + "loss": 0.2928, + "step": 15230 + }, + { + "epoch": 0.5695726335958128, + "grad_norm": 0.396751344203949, + "learning_rate": 7.831590683629957e-06, + "loss": 0.3757, + "step": 15235 + }, + { + "epoch": 0.5697595625861626, + "grad_norm": 0.6405811905860901, + "learning_rate": 7.825858209462609e-06, + "loss": 0.3309, + "step": 15240 + }, + { + "epoch": 0.5699464915765123, + "grad_norm": 0.3609001934528351, + "learning_rate": 7.820126485094465e-06, + "loss": 0.1863, + "step": 15245 + }, + { + "epoch": 0.5701334205668621, + "grad_norm": 0.3763851821422577, + "learning_rate": 7.814395512502239e-06, + "loss": 0.254, + "step": 15250 + }, + { + "epoch": 0.570320349557212, + "grad_norm": 0.37580713629722595, + "learning_rate": 7.80866529366237e-06, + "loss": 0.2708, + "step": 15255 + }, + { + "epoch": 0.5705072785475618, + "grad_norm": 0.4320635497570038, + "learning_rate": 7.802935830551058e-06, + "loss": 0.2882, + "step": 15260 + }, + { + "epoch": 0.5706942075379116, + "grad_norm": 0.4472927153110504, + "learning_rate": 7.797207125144222e-06, + "loss": 0.2909, + "step": 15265 + }, + { + "epoch": 0.5708811365282613, + "grad_norm": 0.32192474603652954, + "learning_rate": 7.791479179417532e-06, + "loss": 0.2998, + "step": 15270 + }, + { + "epoch": 0.5710680655186111, + "grad_norm": 0.39683884382247925, + "learning_rate": 7.785751995346385e-06, + "loss": 0.2981, + "step": 15275 + }, + { + "epoch": 0.5712549945089609, + "grad_norm": 0.5190780758857727, + "learning_rate": 7.780025574905935e-06, + "loss": 0.2564, + "step": 15280 + }, + { + "epoch": 0.5714419234993107, + "grad_norm": 0.41170167922973633, + "learning_rate": 7.774299920071052e-06, + "loss": 0.3135, + "step": 15285 + }, + { + "epoch": 0.5716288524896604, + "grad_norm": 0.5996455550193787, + "learning_rate": 7.768575032816347e-06, + "loss": 0.3116, + "step": 15290 + }, + { + "epoch": 0.5718157814800103, + "grad_norm": 0.40233224630355835, + "learning_rate": 7.762850915116183e-06, + "loss": 0.261, + "step": 15295 + }, + { + "epoch": 0.5720027104703601, + "grad_norm": 1.258935809135437, + "learning_rate": 7.757127568944629e-06, + "loss": 0.2613, + "step": 15300 + }, + { + "epoch": 0.5721896394607099, + "grad_norm": 0.4860970377922058, + "learning_rate": 7.751404996275515e-06, + "loss": 0.3349, + "step": 15305 + }, + { + "epoch": 0.5723765684510597, + "grad_norm": 0.6576589941978455, + "learning_rate": 7.745683199082385e-06, + "loss": 0.3026, + "step": 15310 + }, + { + "epoch": 0.5725634974414094, + "grad_norm": 0.39305242896080017, + "learning_rate": 7.739962179338528e-06, + "loss": 0.3438, + "step": 15315 + }, + { + "epoch": 0.5727504264317592, + "grad_norm": 0.23568610846996307, + "learning_rate": 7.734241939016953e-06, + "loss": 0.2488, + "step": 15320 + }, + { + "epoch": 0.572937355422109, + "grad_norm": 0.39915916323661804, + "learning_rate": 7.728522480090415e-06, + "loss": 0.3723, + "step": 15325 + }, + { + "epoch": 0.5731242844124588, + "grad_norm": 0.4517640173435211, + "learning_rate": 7.722803804531385e-06, + "loss": 0.2179, + "step": 15330 + }, + { + "epoch": 0.5733112134028087, + "grad_norm": 0.5729992985725403, + "learning_rate": 7.717085914312071e-06, + "loss": 0.2954, + "step": 15335 + }, + { + "epoch": 0.5734981423931584, + "grad_norm": 0.26858532428741455, + "learning_rate": 7.711368811404417e-06, + "loss": 0.3368, + "step": 15340 + }, + { + "epoch": 0.5736850713835082, + "grad_norm": 0.3362521231174469, + "learning_rate": 7.705652497780076e-06, + "loss": 0.2627, + "step": 15345 + }, + { + "epoch": 0.573872000373858, + "grad_norm": 0.5297291874885559, + "learning_rate": 7.699936975410452e-06, + "loss": 0.3659, + "step": 15350 + }, + { + "epoch": 0.5740589293642078, + "grad_norm": 0.8421899080276489, + "learning_rate": 7.694222246266659e-06, + "loss": 0.2792, + "step": 15355 + }, + { + "epoch": 0.5742458583545575, + "grad_norm": 0.3951667845249176, + "learning_rate": 7.688508312319545e-06, + "loss": 0.3122, + "step": 15360 + }, + { + "epoch": 0.5744327873449073, + "grad_norm": 0.23748940229415894, + "learning_rate": 7.682795175539677e-06, + "loss": 0.3775, + "step": 15365 + }, + { + "epoch": 0.5746197163352571, + "grad_norm": 0.42018768191337585, + "learning_rate": 7.677082837897362e-06, + "loss": 0.2916, + "step": 15370 + }, + { + "epoch": 0.5748066453256069, + "grad_norm": 0.21799716353416443, + "learning_rate": 7.671371301362613e-06, + "loss": 0.2834, + "step": 15375 + }, + { + "epoch": 0.5749935743159568, + "grad_norm": 0.3239682912826538, + "learning_rate": 7.66566056790518e-06, + "loss": 0.3178, + "step": 15380 + }, + { + "epoch": 0.5751805033063065, + "grad_norm": 0.2807358205318451, + "learning_rate": 7.659950639494531e-06, + "loss": 0.2459, + "step": 15385 + }, + { + "epoch": 0.5753674322966563, + "grad_norm": 0.39530667662620544, + "learning_rate": 7.654241518099851e-06, + "loss": 0.306, + "step": 15390 + }, + { + "epoch": 0.5755543612870061, + "grad_norm": 0.2550390362739563, + "learning_rate": 7.648533205690062e-06, + "loss": 0.3522, + "step": 15395 + }, + { + "epoch": 0.5757412902773559, + "grad_norm": 0.4049346148967743, + "learning_rate": 7.64282570423379e-06, + "loss": 0.3467, + "step": 15400 + }, + { + "epoch": 0.5759282192677057, + "grad_norm": 0.7592940926551819, + "learning_rate": 7.637119015699394e-06, + "loss": 0.2773, + "step": 15405 + }, + { + "epoch": 0.5761151482580554, + "grad_norm": 0.43101072311401367, + "learning_rate": 7.631413142054938e-06, + "loss": 0.2392, + "step": 15410 + }, + { + "epoch": 0.5763020772484052, + "grad_norm": 0.2688102126121521, + "learning_rate": 7.625708085268227e-06, + "loss": 0.2488, + "step": 15415 + }, + { + "epoch": 0.5764890062387551, + "grad_norm": 0.4400135576725006, + "learning_rate": 7.620003847306761e-06, + "loss": 0.3319, + "step": 15420 + }, + { + "epoch": 0.5766759352291049, + "grad_norm": 0.35837605595588684, + "learning_rate": 7.6143004301377735e-06, + "loss": 0.3054, + "step": 15425 + }, + { + "epoch": 0.5768628642194547, + "grad_norm": 0.42280635237693787, + "learning_rate": 7.6085978357282105e-06, + "loss": 0.31, + "step": 15430 + }, + { + "epoch": 0.5770497932098044, + "grad_norm": 0.36921992897987366, + "learning_rate": 7.60289606604473e-06, + "loss": 0.2359, + "step": 15435 + }, + { + "epoch": 0.5772367222001542, + "grad_norm": 0.676522433757782, + "learning_rate": 7.597195123053711e-06, + "loss": 0.2833, + "step": 15440 + }, + { + "epoch": 0.577423651190504, + "grad_norm": 0.4574277102947235, + "learning_rate": 7.591495008721243e-06, + "loss": 0.2164, + "step": 15445 + }, + { + "epoch": 0.5776105801808538, + "grad_norm": 0.3864579200744629, + "learning_rate": 7.585795725013138e-06, + "loss": 0.3748, + "step": 15450 + }, + { + "epoch": 0.5777975091712035, + "grad_norm": 0.5847076177597046, + "learning_rate": 7.580097273894911e-06, + "loss": 0.3585, + "step": 15455 + }, + { + "epoch": 0.5779844381615534, + "grad_norm": 0.2284291684627533, + "learning_rate": 7.574399657331796e-06, + "loss": 0.3382, + "step": 15460 + }, + { + "epoch": 0.5781713671519032, + "grad_norm": 0.5911282300949097, + "learning_rate": 7.568702877288732e-06, + "loss": 0.2877, + "step": 15465 + }, + { + "epoch": 0.578358296142253, + "grad_norm": 0.2202225774526596, + "learning_rate": 7.5630069357303835e-06, + "loss": 0.2816, + "step": 15470 + }, + { + "epoch": 0.5785452251326028, + "grad_norm": 0.4826430082321167, + "learning_rate": 7.557311834621116e-06, + "loss": 0.2662, + "step": 15475 + }, + { + "epoch": 0.5787321541229525, + "grad_norm": 0.23101915419101715, + "learning_rate": 7.551617575925001e-06, + "loss": 0.3696, + "step": 15480 + }, + { + "epoch": 0.5789190831133023, + "grad_norm": 0.4637044668197632, + "learning_rate": 7.545924161605832e-06, + "loss": 0.3342, + "step": 15485 + }, + { + "epoch": 0.5791060121036521, + "grad_norm": 0.3432970345020294, + "learning_rate": 7.540231593627098e-06, + "loss": 0.2257, + "step": 15490 + }, + { + "epoch": 0.5792929410940019, + "grad_norm": 0.18397098779678345, + "learning_rate": 7.5345398739520105e-06, + "loss": 0.2383, + "step": 15495 + }, + { + "epoch": 0.5794798700843518, + "grad_norm": 0.47786015272140503, + "learning_rate": 7.528849004543473e-06, + "loss": 0.3306, + "step": 15500 + }, + { + "epoch": 0.5796667990747015, + "grad_norm": 0.2156762033700943, + "learning_rate": 7.52315898736411e-06, + "loss": 0.3126, + "step": 15505 + }, + { + "epoch": 0.5798537280650513, + "grad_norm": 0.22711989283561707, + "learning_rate": 7.517469824376238e-06, + "loss": 0.2556, + "step": 15510 + }, + { + "epoch": 0.5800406570554011, + "grad_norm": 0.4607297480106354, + "learning_rate": 7.5117815175418914e-06, + "loss": 0.3023, + "step": 15515 + }, + { + "epoch": 0.5802275860457509, + "grad_norm": 0.43681520223617554, + "learning_rate": 7.506094068822801e-06, + "loss": 0.2141, + "step": 15520 + }, + { + "epoch": 0.5804145150361006, + "grad_norm": 0.33917364478111267, + "learning_rate": 7.50040748018041e-06, + "loss": 0.3088, + "step": 15525 + }, + { + "epoch": 0.5806014440264504, + "grad_norm": 0.2788265347480774, + "learning_rate": 7.494721753575856e-06, + "loss": 0.2605, + "step": 15530 + }, + { + "epoch": 0.5807883730168002, + "grad_norm": 0.34832385182380676, + "learning_rate": 7.489036890969981e-06, + "loss": 0.2741, + "step": 15535 + }, + { + "epoch": 0.5809753020071501, + "grad_norm": 0.3322335481643677, + "learning_rate": 7.483352894323339e-06, + "loss": 0.2696, + "step": 15540 + }, + { + "epoch": 0.5811622309974999, + "grad_norm": 0.27210044860839844, + "learning_rate": 7.47766976559617e-06, + "loss": 0.272, + "step": 15545 + }, + { + "epoch": 0.5813491599878496, + "grad_norm": 0.4030235707759857, + "learning_rate": 7.471987506748426e-06, + "loss": 0.3164, + "step": 15550 + }, + { + "epoch": 0.5815360889781994, + "grad_norm": 0.31630632281303406, + "learning_rate": 7.466306119739751e-06, + "loss": 0.2757, + "step": 15555 + }, + { + "epoch": 0.5817230179685492, + "grad_norm": 0.34428203105926514, + "learning_rate": 7.4606256065295e-06, + "loss": 0.3276, + "step": 15560 + }, + { + "epoch": 0.581909946958899, + "grad_norm": 0.41596320271492004, + "learning_rate": 7.4549459690767105e-06, + "loss": 0.3167, + "step": 15565 + }, + { + "epoch": 0.5820968759492487, + "grad_norm": 0.275072306394577, + "learning_rate": 7.4492672093401345e-06, + "loss": 0.2606, + "step": 15570 + }, + { + "epoch": 0.5822838049395985, + "grad_norm": 0.29760444164276123, + "learning_rate": 7.443589329278211e-06, + "loss": 0.3137, + "step": 15575 + }, + { + "epoch": 0.5824707339299484, + "grad_norm": 0.40786728262901306, + "learning_rate": 7.4379123308490735e-06, + "loss": 0.2841, + "step": 15580 + }, + { + "epoch": 0.5826576629202982, + "grad_norm": 0.40761783719062805, + "learning_rate": 7.432236216010564e-06, + "loss": 0.2717, + "step": 15585 + }, + { + "epoch": 0.582844591910648, + "grad_norm": 0.36162450909614563, + "learning_rate": 7.426560986720206e-06, + "loss": 0.314, + "step": 15590 + }, + { + "epoch": 0.5830315209009977, + "grad_norm": 0.36004146933555603, + "learning_rate": 7.4208866449352275e-06, + "loss": 0.3408, + "step": 15595 + }, + { + "epoch": 0.5832184498913475, + "grad_norm": 0.42730987071990967, + "learning_rate": 7.4152131926125405e-06, + "loss": 0.2853, + "step": 15600 + }, + { + "epoch": 0.5834053788816973, + "grad_norm": 0.4817928671836853, + "learning_rate": 7.409540631708763e-06, + "loss": 0.3038, + "step": 15605 + }, + { + "epoch": 0.5835923078720471, + "grad_norm": 0.20412497222423553, + "learning_rate": 7.403868964180192e-06, + "loss": 0.262, + "step": 15610 + }, + { + "epoch": 0.5837792368623969, + "grad_norm": 0.18026775121688843, + "learning_rate": 7.398198191982828e-06, + "loss": 0.2356, + "step": 15615 + }, + { + "epoch": 0.5839661658527466, + "grad_norm": 0.22581899166107178, + "learning_rate": 7.39252831707236e-06, + "loss": 0.3154, + "step": 15620 + }, + { + "epoch": 0.5841530948430965, + "grad_norm": 0.26380541920661926, + "learning_rate": 7.386859341404158e-06, + "loss": 0.3565, + "step": 15625 + }, + { + "epoch": 0.5843400238334463, + "grad_norm": 0.4216972291469574, + "learning_rate": 7.3811912669332965e-06, + "loss": 0.2868, + "step": 15630 + }, + { + "epoch": 0.5845269528237961, + "grad_norm": 0.4482162892818451, + "learning_rate": 7.375524095614524e-06, + "loss": 0.2696, + "step": 15635 + }, + { + "epoch": 0.5847138818141459, + "grad_norm": 0.34576284885406494, + "learning_rate": 7.369857829402294e-06, + "loss": 0.2722, + "step": 15640 + }, + { + "epoch": 0.5849008108044956, + "grad_norm": 0.35120487213134766, + "learning_rate": 7.364192470250735e-06, + "loss": 0.2365, + "step": 15645 + }, + { + "epoch": 0.5850877397948454, + "grad_norm": 0.5268846154212952, + "learning_rate": 7.358528020113669e-06, + "loss": 0.3953, + "step": 15650 + }, + { + "epoch": 0.5852746687851952, + "grad_norm": 0.5416049957275391, + "learning_rate": 7.352864480944597e-06, + "loss": 0.3447, + "step": 15655 + }, + { + "epoch": 0.585461597775545, + "grad_norm": 0.3339357376098633, + "learning_rate": 7.3472018546967175e-06, + "loss": 0.3417, + "step": 15660 + }, + { + "epoch": 0.5856485267658949, + "grad_norm": 0.4160725474357605, + "learning_rate": 7.341540143322907e-06, + "loss": 0.331, + "step": 15665 + }, + { + "epoch": 0.5858354557562446, + "grad_norm": 0.4474146366119385, + "learning_rate": 7.335879348775724e-06, + "loss": 0.2705, + "step": 15670 + }, + { + "epoch": 0.5860223847465944, + "grad_norm": 0.39802151918411255, + "learning_rate": 7.33021947300742e-06, + "loss": 0.2973, + "step": 15675 + }, + { + "epoch": 0.5862093137369442, + "grad_norm": 0.3424316346645355, + "learning_rate": 7.324560517969918e-06, + "loss": 0.2956, + "step": 15680 + }, + { + "epoch": 0.586396242727294, + "grad_norm": 0.31695395708084106, + "learning_rate": 7.318902485614836e-06, + "loss": 0.2828, + "step": 15685 + }, + { + "epoch": 0.5865831717176437, + "grad_norm": 0.30436766147613525, + "learning_rate": 7.313245377893461e-06, + "loss": 0.2704, + "step": 15690 + }, + { + "epoch": 0.5867701007079935, + "grad_norm": 0.349795401096344, + "learning_rate": 7.307589196756772e-06, + "loss": 0.3184, + "step": 15695 + }, + { + "epoch": 0.5869570296983433, + "grad_norm": 0.3413912057876587, + "learning_rate": 7.301933944155417e-06, + "loss": 0.2635, + "step": 15700 + }, + { + "epoch": 0.5871439586886932, + "grad_norm": 0.3070078194141388, + "learning_rate": 7.296279622039737e-06, + "loss": 0.36, + "step": 15705 + }, + { + "epoch": 0.587330887679043, + "grad_norm": 0.29595261812210083, + "learning_rate": 7.290626232359746e-06, + "loss": 0.2837, + "step": 15710 + }, + { + "epoch": 0.5875178166693927, + "grad_norm": 0.47354787588119507, + "learning_rate": 7.284973777065134e-06, + "loss": 0.2849, + "step": 15715 + }, + { + "epoch": 0.5877047456597425, + "grad_norm": 0.5047826766967773, + "learning_rate": 7.279322258105272e-06, + "loss": 0.2876, + "step": 15720 + }, + { + "epoch": 0.5878916746500923, + "grad_norm": 0.5029893517494202, + "learning_rate": 7.273671677429202e-06, + "loss": 0.3886, + "step": 15725 + }, + { + "epoch": 0.5880786036404421, + "grad_norm": 0.27061954140663147, + "learning_rate": 7.2680220369856546e-06, + "loss": 0.3005, + "step": 15730 + }, + { + "epoch": 0.5882655326307918, + "grad_norm": 0.5909748673439026, + "learning_rate": 7.2623733387230245e-06, + "loss": 0.2534, + "step": 15735 + }, + { + "epoch": 0.5884524616211416, + "grad_norm": 0.3261001408100128, + "learning_rate": 7.256725584589388e-06, + "loss": 0.2761, + "step": 15740 + }, + { + "epoch": 0.5886393906114915, + "grad_norm": 0.47955918312072754, + "learning_rate": 7.25107877653249e-06, + "loss": 0.2752, + "step": 15745 + }, + { + "epoch": 0.5888263196018413, + "grad_norm": 0.49793973565101624, + "learning_rate": 7.245432916499755e-06, + "loss": 0.3089, + "step": 15750 + }, + { + "epoch": 0.5890132485921911, + "grad_norm": 0.5249269604682922, + "learning_rate": 7.2397880064382816e-06, + "loss": 0.3462, + "step": 15755 + }, + { + "epoch": 0.5892001775825408, + "grad_norm": 0.7072009444236755, + "learning_rate": 7.234144048294833e-06, + "loss": 0.2022, + "step": 15760 + }, + { + "epoch": 0.5893871065728906, + "grad_norm": 0.22332626581192017, + "learning_rate": 7.228501044015854e-06, + "loss": 0.3619, + "step": 15765 + }, + { + "epoch": 0.5895740355632404, + "grad_norm": 0.6335933208465576, + "learning_rate": 7.222858995547446e-06, + "loss": 0.2363, + "step": 15770 + }, + { + "epoch": 0.5897609645535902, + "grad_norm": 0.40368562936782837, + "learning_rate": 7.2172179048354e-06, + "loss": 0.3412, + "step": 15775 + }, + { + "epoch": 0.58994789354394, + "grad_norm": 0.41344451904296875, + "learning_rate": 7.211577773825157e-06, + "loss": 0.3029, + "step": 15780 + }, + { + "epoch": 0.5901348225342898, + "grad_norm": 0.15166114270687103, + "learning_rate": 7.205938604461846e-06, + "loss": 0.2663, + "step": 15785 + }, + { + "epoch": 0.5903217515246396, + "grad_norm": 0.18254537880420685, + "learning_rate": 7.2003003986902474e-06, + "loss": 0.2779, + "step": 15790 + }, + { + "epoch": 0.5905086805149894, + "grad_norm": 0.6421458125114441, + "learning_rate": 7.19466315845482e-06, + "loss": 0.2935, + "step": 15795 + }, + { + "epoch": 0.5906956095053392, + "grad_norm": 0.296409010887146, + "learning_rate": 7.189026885699688e-06, + "loss": 0.2955, + "step": 15800 + }, + { + "epoch": 0.590882538495689, + "grad_norm": 0.444355845451355, + "learning_rate": 7.183391582368637e-06, + "loss": 0.2687, + "step": 15805 + }, + { + "epoch": 0.5910694674860387, + "grad_norm": 0.2945992052555084, + "learning_rate": 7.177757250405126e-06, + "loss": 0.2612, + "step": 15810 + }, + { + "epoch": 0.5912563964763885, + "grad_norm": 0.2884933650493622, + "learning_rate": 7.172123891752268e-06, + "loss": 0.2812, + "step": 15815 + }, + { + "epoch": 0.5914433254667383, + "grad_norm": 0.5446357727050781, + "learning_rate": 7.166491508352853e-06, + "loss": 0.2699, + "step": 15820 + }, + { + "epoch": 0.5916302544570882, + "grad_norm": 0.3308735489845276, + "learning_rate": 7.160860102149323e-06, + "loss": 0.2325, + "step": 15825 + }, + { + "epoch": 0.5918171834474379, + "grad_norm": 0.5215713977813721, + "learning_rate": 7.155229675083797e-06, + "loss": 0.269, + "step": 15830 + }, + { + "epoch": 0.5920041124377877, + "grad_norm": 0.39702996611595154, + "learning_rate": 7.1496002290980415e-06, + "loss": 0.3225, + "step": 15835 + }, + { + "epoch": 0.5921910414281375, + "grad_norm": 0.3645925521850586, + "learning_rate": 7.14397176613349e-06, + "loss": 0.2705, + "step": 15840 + }, + { + "epoch": 0.5923779704184873, + "grad_norm": 0.32077428698539734, + "learning_rate": 7.138344288131245e-06, + "loss": 0.3051, + "step": 15845 + }, + { + "epoch": 0.592564899408837, + "grad_norm": 0.5416418313980103, + "learning_rate": 7.132717797032056e-06, + "loss": 0.3141, + "step": 15850 + }, + { + "epoch": 0.5927518283991868, + "grad_norm": 0.5128157734870911, + "learning_rate": 7.127092294776343e-06, + "loss": 0.3008, + "step": 15855 + }, + { + "epoch": 0.5929387573895366, + "grad_norm": 0.820091962814331, + "learning_rate": 7.121467783304174e-06, + "loss": 0.3023, + "step": 15860 + }, + { + "epoch": 0.5931256863798864, + "grad_norm": 0.2677062153816223, + "learning_rate": 7.1158442645552896e-06, + "loss": 0.2956, + "step": 15865 + }, + { + "epoch": 0.5933126153702363, + "grad_norm": 0.3159048855304718, + "learning_rate": 7.110221740469074e-06, + "loss": 0.3155, + "step": 15870 + }, + { + "epoch": 0.593499544360586, + "grad_norm": 0.2539639472961426, + "learning_rate": 7.10460021298458e-06, + "loss": 0.3094, + "step": 15875 + }, + { + "epoch": 0.5936864733509358, + "grad_norm": 0.36676567792892456, + "learning_rate": 7.098979684040508e-06, + "loss": 0.2947, + "step": 15880 + }, + { + "epoch": 0.5938734023412856, + "grad_norm": 0.3667091131210327, + "learning_rate": 7.093360155575218e-06, + "loss": 0.2889, + "step": 15885 + }, + { + "epoch": 0.5940603313316354, + "grad_norm": 0.6631130576133728, + "learning_rate": 7.087741629526726e-06, + "loss": 0.2206, + "step": 15890 + }, + { + "epoch": 0.5942472603219852, + "grad_norm": 0.7274545431137085, + "learning_rate": 7.082124107832695e-06, + "loss": 0.3667, + "step": 15895 + }, + { + "epoch": 0.5944341893123349, + "grad_norm": 0.5482187867164612, + "learning_rate": 7.076507592430457e-06, + "loss": 0.3779, + "step": 15900 + }, + { + "epoch": 0.5946211183026847, + "grad_norm": 0.4760439991950989, + "learning_rate": 7.070892085256978e-06, + "loss": 0.2652, + "step": 15905 + }, + { + "epoch": 0.5948080472930346, + "grad_norm": 0.27402257919311523, + "learning_rate": 7.065277588248893e-06, + "loss": 0.2663, + "step": 15910 + }, + { + "epoch": 0.5949949762833844, + "grad_norm": 0.5011516213417053, + "learning_rate": 7.059664103342473e-06, + "loss": 0.364, + "step": 15915 + }, + { + "epoch": 0.5951819052737342, + "grad_norm": 0.3318321108818054, + "learning_rate": 7.0540516324736556e-06, + "loss": 0.3262, + "step": 15920 + }, + { + "epoch": 0.5953688342640839, + "grad_norm": 0.34024515748023987, + "learning_rate": 7.0484401775780175e-06, + "loss": 0.3343, + "step": 15925 + }, + { + "epoch": 0.5955557632544337, + "grad_norm": 0.2316112518310547, + "learning_rate": 7.0428297405907865e-06, + "loss": 0.3235, + "step": 15930 + }, + { + "epoch": 0.5957426922447835, + "grad_norm": 0.3854818046092987, + "learning_rate": 7.0372203234468474e-06, + "loss": 0.3301, + "step": 15935 + }, + { + "epoch": 0.5959296212351333, + "grad_norm": 0.34826406836509705, + "learning_rate": 7.031611928080721e-06, + "loss": 0.2321, + "step": 15940 + }, + { + "epoch": 0.596116550225483, + "grad_norm": 0.5255212783813477, + "learning_rate": 7.02600455642659e-06, + "loss": 0.2714, + "step": 15945 + }, + { + "epoch": 0.5963034792158329, + "grad_norm": 0.4153469204902649, + "learning_rate": 7.020398210418269e-06, + "loss": 0.3229, + "step": 15950 + }, + { + "epoch": 0.5964904082061827, + "grad_norm": 0.532195508480072, + "learning_rate": 7.014792891989232e-06, + "loss": 0.2495, + "step": 15955 + }, + { + "epoch": 0.5966773371965325, + "grad_norm": 0.12149854749441147, + "learning_rate": 7.009188603072586e-06, + "loss": 0.2962, + "step": 15960 + }, + { + "epoch": 0.5968642661868823, + "grad_norm": 0.5596807599067688, + "learning_rate": 7.003585345601095e-06, + "loss": 0.342, + "step": 15965 + }, + { + "epoch": 0.597051195177232, + "grad_norm": 0.4083070456981659, + "learning_rate": 6.9979831215071566e-06, + "loss": 0.2877, + "step": 15970 + }, + { + "epoch": 0.5972381241675818, + "grad_norm": 0.5761303901672363, + "learning_rate": 6.9923819327228235e-06, + "loss": 0.2234, + "step": 15975 + }, + { + "epoch": 0.5974250531579316, + "grad_norm": 0.42254355549812317, + "learning_rate": 6.986781781179786e-06, + "loss": 0.3016, + "step": 15980 + }, + { + "epoch": 0.5976119821482814, + "grad_norm": 0.43202266097068787, + "learning_rate": 6.981182668809365e-06, + "loss": 0.3697, + "step": 15985 + }, + { + "epoch": 0.5977989111386313, + "grad_norm": 0.30627140402793884, + "learning_rate": 6.975584597542549e-06, + "loss": 0.3126, + "step": 15990 + }, + { + "epoch": 0.597985840128981, + "grad_norm": 0.5201455950737, + "learning_rate": 6.9699875693099415e-06, + "loss": 0.2483, + "step": 15995 + }, + { + "epoch": 0.5981727691193308, + "grad_norm": 0.49727630615234375, + "learning_rate": 6.964391586041803e-06, + "loss": 0.3368, + "step": 16000 + }, + { + "epoch": 0.5983596981096806, + "grad_norm": 0.3812571167945862, + "learning_rate": 6.95879664966802e-06, + "loss": 0.3413, + "step": 16005 + }, + { + "epoch": 0.5985466271000304, + "grad_norm": 0.35332605242729187, + "learning_rate": 6.953202762118137e-06, + "loss": 0.2443, + "step": 16010 + }, + { + "epoch": 0.5987335560903801, + "grad_norm": 0.45708781480789185, + "learning_rate": 6.947609925321314e-06, + "loss": 0.2457, + "step": 16015 + }, + { + "epoch": 0.5989204850807299, + "grad_norm": 0.43644988536834717, + "learning_rate": 6.942018141206368e-06, + "loss": 0.2944, + "step": 16020 + }, + { + "epoch": 0.5991074140710797, + "grad_norm": 0.4386439323425293, + "learning_rate": 6.9364274117017446e-06, + "loss": 0.3314, + "step": 16025 + }, + { + "epoch": 0.5992943430614296, + "grad_norm": 0.48612180352211, + "learning_rate": 6.930837738735521e-06, + "loss": 0.2833, + "step": 16030 + }, + { + "epoch": 0.5994812720517794, + "grad_norm": 0.5489234924316406, + "learning_rate": 6.925249124235423e-06, + "loss": 0.3497, + "step": 16035 + }, + { + "epoch": 0.5996682010421291, + "grad_norm": 0.47888511419296265, + "learning_rate": 6.919661570128796e-06, + "loss": 0.3317, + "step": 16040 + }, + { + "epoch": 0.5998551300324789, + "grad_norm": 0.12318491190671921, + "learning_rate": 6.914075078342632e-06, + "loss": 0.2215, + "step": 16045 + }, + { + "epoch": 0.6000420590228287, + "grad_norm": 0.5285260081291199, + "learning_rate": 6.908489650803549e-06, + "loss": 0.2446, + "step": 16050 + }, + { + "epoch": 0.6002289880131785, + "grad_norm": 0.34625720977783203, + "learning_rate": 6.902905289437807e-06, + "loss": 0.2358, + "step": 16055 + }, + { + "epoch": 0.6004159170035283, + "grad_norm": 0.5598260760307312, + "learning_rate": 6.897321996171281e-06, + "loss": 0.2095, + "step": 16060 + }, + { + "epoch": 0.600602845993878, + "grad_norm": 0.35202714800834656, + "learning_rate": 6.891739772929499e-06, + "loss": 0.2681, + "step": 16065 + }, + { + "epoch": 0.6007897749842279, + "grad_norm": 0.39519229531288147, + "learning_rate": 6.886158621637608e-06, + "loss": 0.2615, + "step": 16070 + }, + { + "epoch": 0.6009767039745777, + "grad_norm": 0.6391472220420837, + "learning_rate": 6.880578544220382e-06, + "loss": 0.2642, + "step": 16075 + }, + { + "epoch": 0.6011636329649275, + "grad_norm": 0.3933602571487427, + "learning_rate": 6.874999542602237e-06, + "loss": 0.2837, + "step": 16080 + }, + { + "epoch": 0.6013505619552773, + "grad_norm": 0.6235082745552063, + "learning_rate": 6.8694216187072015e-06, + "loss": 0.3124, + "step": 16085 + }, + { + "epoch": 0.601537490945627, + "grad_norm": 0.46364444494247437, + "learning_rate": 6.863844774458954e-06, + "loss": 0.2186, + "step": 16090 + }, + { + "epoch": 0.6017244199359768, + "grad_norm": 0.37542879581451416, + "learning_rate": 6.8582690117807784e-06, + "loss": 0.28, + "step": 16095 + }, + { + "epoch": 0.6019113489263266, + "grad_norm": 0.4766859710216522, + "learning_rate": 6.852694332595601e-06, + "loss": 0.2415, + "step": 16100 + }, + { + "epoch": 0.6020982779166764, + "grad_norm": 0.384067177772522, + "learning_rate": 6.847120738825962e-06, + "loss": 0.2473, + "step": 16105 + }, + { + "epoch": 0.6022852069070261, + "grad_norm": 0.3013162314891815, + "learning_rate": 6.841548232394041e-06, + "loss": 0.2942, + "step": 16110 + }, + { + "epoch": 0.602472135897376, + "grad_norm": 0.4402746558189392, + "learning_rate": 6.835976815221637e-06, + "loss": 0.2965, + "step": 16115 + }, + { + "epoch": 0.6026590648877258, + "grad_norm": 0.42539361119270325, + "learning_rate": 6.830406489230162e-06, + "loss": 0.2081, + "step": 16120 + }, + { + "epoch": 0.6028459938780756, + "grad_norm": 0.47078409790992737, + "learning_rate": 6.824837256340674e-06, + "loss": 0.2934, + "step": 16125 + }, + { + "epoch": 0.6030329228684254, + "grad_norm": 0.4631364345550537, + "learning_rate": 6.819269118473833e-06, + "loss": 0.2897, + "step": 16130 + }, + { + "epoch": 0.6032198518587751, + "grad_norm": 0.48256585001945496, + "learning_rate": 6.813702077549935e-06, + "loss": 0.2225, + "step": 16135 + }, + { + "epoch": 0.6034067808491249, + "grad_norm": 0.5263893604278564, + "learning_rate": 6.808136135488892e-06, + "loss": 0.2288, + "step": 16140 + }, + { + "epoch": 0.6035937098394747, + "grad_norm": 0.3626660108566284, + "learning_rate": 6.802571294210239e-06, + "loss": 0.2377, + "step": 16145 + }, + { + "epoch": 0.6037806388298245, + "grad_norm": 1.2106562852859497, + "learning_rate": 6.797007555633124e-06, + "loss": 0.2988, + "step": 16150 + }, + { + "epoch": 0.6039675678201744, + "grad_norm": 0.2370201051235199, + "learning_rate": 6.791444921676327e-06, + "loss": 0.2449, + "step": 16155 + }, + { + "epoch": 0.6041544968105241, + "grad_norm": 0.5193468928337097, + "learning_rate": 6.785883394258241e-06, + "loss": 0.2805, + "step": 16160 + }, + { + "epoch": 0.6043414258008739, + "grad_norm": 0.5827836990356445, + "learning_rate": 6.780322975296877e-06, + "loss": 0.2585, + "step": 16165 + }, + { + "epoch": 0.6045283547912237, + "grad_norm": 0.22835935652256012, + "learning_rate": 6.7747636667098645e-06, + "loss": 0.2602, + "step": 16170 + }, + { + "epoch": 0.6047152837815735, + "grad_norm": 0.5145224928855896, + "learning_rate": 6.769205470414445e-06, + "loss": 0.278, + "step": 16175 + }, + { + "epoch": 0.6049022127719232, + "grad_norm": 0.26391491293907166, + "learning_rate": 6.763648388327488e-06, + "loss": 0.2507, + "step": 16180 + }, + { + "epoch": 0.605089141762273, + "grad_norm": 0.23646554350852966, + "learning_rate": 6.758092422365468e-06, + "loss": 0.2388, + "step": 16185 + }, + { + "epoch": 0.6052760707526228, + "grad_norm": 0.4633139669895172, + "learning_rate": 6.75253757444448e-06, + "loss": 0.3711, + "step": 16190 + }, + { + "epoch": 0.6054629997429727, + "grad_norm": 0.614550769329071, + "learning_rate": 6.746983846480226e-06, + "loss": 0.3509, + "step": 16195 + }, + { + "epoch": 0.6056499287333225, + "grad_norm": 1.0739442110061646, + "learning_rate": 6.7414312403880345e-06, + "loss": 0.351, + "step": 16200 + }, + { + "epoch": 0.6058368577236722, + "grad_norm": 0.317594051361084, + "learning_rate": 6.735879758082841e-06, + "loss": 0.341, + "step": 16205 + }, + { + "epoch": 0.606023786714022, + "grad_norm": 0.32532641291618347, + "learning_rate": 6.730329401479189e-06, + "loss": 0.2996, + "step": 16210 + }, + { + "epoch": 0.6062107157043718, + "grad_norm": 0.4760509431362152, + "learning_rate": 6.724780172491241e-06, + "loss": 0.2035, + "step": 16215 + }, + { + "epoch": 0.6063976446947216, + "grad_norm": 0.4633803963661194, + "learning_rate": 6.71923207303276e-06, + "loss": 0.3453, + "step": 16220 + }, + { + "epoch": 0.6065845736850713, + "grad_norm": 0.27814042568206787, + "learning_rate": 6.713685105017135e-06, + "loss": 0.3061, + "step": 16225 + }, + { + "epoch": 0.6067715026754211, + "grad_norm": 0.331685334444046, + "learning_rate": 6.708139270357348e-06, + "loss": 0.2412, + "step": 16230 + }, + { + "epoch": 0.606958431665771, + "grad_norm": 0.2586578130722046, + "learning_rate": 6.702594570966008e-06, + "loss": 0.2551, + "step": 16235 + }, + { + "epoch": 0.6071453606561208, + "grad_norm": 0.35025131702423096, + "learning_rate": 6.697051008755315e-06, + "loss": 0.2946, + "step": 16240 + }, + { + "epoch": 0.6073322896464706, + "grad_norm": 0.403495192527771, + "learning_rate": 6.691508585637085e-06, + "loss": 0.2499, + "step": 16245 + }, + { + "epoch": 0.6075192186368203, + "grad_norm": 0.6013374328613281, + "learning_rate": 6.6859673035227495e-06, + "loss": 0.2661, + "step": 16250 + }, + { + "epoch": 0.6077061476271701, + "grad_norm": 0.09546211361885071, + "learning_rate": 6.680427164323329e-06, + "loss": 0.2791, + "step": 16255 + }, + { + "epoch": 0.6078930766175199, + "grad_norm": 0.38992223143577576, + "learning_rate": 6.674888169949463e-06, + "loss": 0.3381, + "step": 16260 + }, + { + "epoch": 0.6080800056078697, + "grad_norm": 0.33082830905914307, + "learning_rate": 6.669350322311388e-06, + "loss": 0.3829, + "step": 16265 + }, + { + "epoch": 0.6082669345982195, + "grad_norm": 0.30394282937049866, + "learning_rate": 6.663813623318954e-06, + "loss": 0.287, + "step": 16270 + }, + { + "epoch": 0.6084538635885693, + "grad_norm": 0.27760547399520874, + "learning_rate": 6.658278074881605e-06, + "loss": 0.2794, + "step": 16275 + }, + { + "epoch": 0.6086407925789191, + "grad_norm": 0.7337043881416321, + "learning_rate": 6.652743678908399e-06, + "loss": 0.2566, + "step": 16280 + }, + { + "epoch": 0.6088277215692689, + "grad_norm": 0.36123204231262207, + "learning_rate": 6.647210437307985e-06, + "loss": 0.2533, + "step": 16285 + }, + { + "epoch": 0.6090146505596187, + "grad_norm": 0.4499099552631378, + "learning_rate": 6.641678351988619e-06, + "loss": 0.314, + "step": 16290 + }, + { + "epoch": 0.6092015795499685, + "grad_norm": 0.3133924901485443, + "learning_rate": 6.6361474248581655e-06, + "loss": 0.2611, + "step": 16295 + }, + { + "epoch": 0.6093885085403182, + "grad_norm": 0.39866819977760315, + "learning_rate": 6.630617657824078e-06, + "loss": 0.2835, + "step": 16300 + }, + { + "epoch": 0.609575437530668, + "grad_norm": 0.45732995867729187, + "learning_rate": 6.625089052793417e-06, + "loss": 0.3571, + "step": 16305 + }, + { + "epoch": 0.6097623665210178, + "grad_norm": 0.41225141286849976, + "learning_rate": 6.619561611672834e-06, + "loss": 0.3494, + "step": 16310 + }, + { + "epoch": 0.6099492955113677, + "grad_norm": 0.5021333694458008, + "learning_rate": 6.6140353363685914e-06, + "loss": 0.2647, + "step": 16315 + }, + { + "epoch": 0.6101362245017175, + "grad_norm": 0.2890709936618805, + "learning_rate": 6.6085102287865385e-06, + "loss": 0.278, + "step": 16320 + }, + { + "epoch": 0.6103231534920672, + "grad_norm": 0.5777745842933655, + "learning_rate": 6.602986290832134e-06, + "loss": 0.2815, + "step": 16325 + }, + { + "epoch": 0.610510082482417, + "grad_norm": 0.4748448133468628, + "learning_rate": 6.597463524410418e-06, + "loss": 0.2811, + "step": 16330 + }, + { + "epoch": 0.6106970114727668, + "grad_norm": 0.5496982336044312, + "learning_rate": 6.591941931426036e-06, + "loss": 0.2409, + "step": 16335 + }, + { + "epoch": 0.6108839404631166, + "grad_norm": 0.2635287046432495, + "learning_rate": 6.5864215137832325e-06, + "loss": 0.311, + "step": 16340 + }, + { + "epoch": 0.6110708694534663, + "grad_norm": 0.2311626374721527, + "learning_rate": 6.580902273385834e-06, + "loss": 0.2865, + "step": 16345 + }, + { + "epoch": 0.6112577984438161, + "grad_norm": 0.39265650510787964, + "learning_rate": 6.575384212137275e-06, + "loss": 0.2652, + "step": 16350 + }, + { + "epoch": 0.6114447274341659, + "grad_norm": 0.35634520649909973, + "learning_rate": 6.569867331940571e-06, + "loss": 0.2976, + "step": 16355 + }, + { + "epoch": 0.6116316564245158, + "grad_norm": 0.7832450866699219, + "learning_rate": 6.564351634698343e-06, + "loss": 0.2907, + "step": 16360 + }, + { + "epoch": 0.6118185854148656, + "grad_norm": 0.28447359800338745, + "learning_rate": 6.558837122312787e-06, + "loss": 0.2886, + "step": 16365 + }, + { + "epoch": 0.6120055144052153, + "grad_norm": 0.9879940748214722, + "learning_rate": 6.553323796685709e-06, + "loss": 0.3565, + "step": 16370 + }, + { + "epoch": 0.6121924433955651, + "grad_norm": 0.35768356919288635, + "learning_rate": 6.547811659718492e-06, + "loss": 0.3012, + "step": 16375 + }, + { + "epoch": 0.6123793723859149, + "grad_norm": 0.3880248963832855, + "learning_rate": 6.542300713312113e-06, + "loss": 0.372, + "step": 16380 + }, + { + "epoch": 0.6125663013762647, + "grad_norm": 0.1782693862915039, + "learning_rate": 6.536790959367149e-06, + "loss": 0.2869, + "step": 16385 + }, + { + "epoch": 0.6127532303666144, + "grad_norm": 0.4560245871543884, + "learning_rate": 6.5312823997837425e-06, + "loss": 0.2764, + "step": 16390 + }, + { + "epoch": 0.6129401593569642, + "grad_norm": 0.3002925217151642, + "learning_rate": 6.525775036461652e-06, + "loss": 0.2596, + "step": 16395 + }, + { + "epoch": 0.6131270883473141, + "grad_norm": 0.45390433073043823, + "learning_rate": 6.520268871300198e-06, + "loss": 0.2895, + "step": 16400 + }, + { + "epoch": 0.6133140173376639, + "grad_norm": 0.19361938536167145, + "learning_rate": 6.514763906198307e-06, + "loss": 0.3229, + "step": 16405 + }, + { + "epoch": 0.6135009463280137, + "grad_norm": 0.6821458339691162, + "learning_rate": 6.509260143054474e-06, + "loss": 0.2455, + "step": 16410 + }, + { + "epoch": 0.6136878753183634, + "grad_norm": 0.405543714761734, + "learning_rate": 6.503757583766802e-06, + "loss": 0.3357, + "step": 16415 + }, + { + "epoch": 0.6138748043087132, + "grad_norm": 0.6047316789627075, + "learning_rate": 6.4982562302329535e-06, + "loss": 0.3021, + "step": 16420 + }, + { + "epoch": 0.614061733299063, + "grad_norm": 0.49325546622276306, + "learning_rate": 6.492756084350196e-06, + "loss": 0.3521, + "step": 16425 + }, + { + "epoch": 0.6142486622894128, + "grad_norm": 0.8090613484382629, + "learning_rate": 6.4872571480153725e-06, + "loss": 0.3772, + "step": 16430 + }, + { + "epoch": 0.6144355912797625, + "grad_norm": 0.4884081780910492, + "learning_rate": 6.4817594231249e-06, + "loss": 0.2789, + "step": 16435 + }, + { + "epoch": 0.6146225202701124, + "grad_norm": 0.4141273498535156, + "learning_rate": 6.476262911574797e-06, + "loss": 0.3313, + "step": 16440 + }, + { + "epoch": 0.6148094492604622, + "grad_norm": 0.6283908486366272, + "learning_rate": 6.470767615260647e-06, + "loss": 0.2758, + "step": 16445 + }, + { + "epoch": 0.614996378250812, + "grad_norm": 0.3659379184246063, + "learning_rate": 6.465273536077623e-06, + "loss": 0.2827, + "step": 16450 + }, + { + "epoch": 0.6151833072411618, + "grad_norm": 0.4448320269584656, + "learning_rate": 6.459780675920468e-06, + "loss": 0.2774, + "step": 16455 + }, + { + "epoch": 0.6153702362315115, + "grad_norm": 0.43518292903900146, + "learning_rate": 6.454289036683523e-06, + "loss": 0.2542, + "step": 16460 + }, + { + "epoch": 0.6155571652218613, + "grad_norm": 0.26366111636161804, + "learning_rate": 6.448798620260688e-06, + "loss": 0.2958, + "step": 16465 + }, + { + "epoch": 0.6157440942122111, + "grad_norm": 0.5211677551269531, + "learning_rate": 6.443309428545457e-06, + "loss": 0.2952, + "step": 16470 + }, + { + "epoch": 0.6159310232025609, + "grad_norm": 0.2922613322734833, + "learning_rate": 6.4378214634308925e-06, + "loss": 0.194, + "step": 16475 + }, + { + "epoch": 0.6161179521929108, + "grad_norm": 0.3419044613838196, + "learning_rate": 6.4323347268096316e-06, + "loss": 0.2536, + "step": 16480 + }, + { + "epoch": 0.6163048811832605, + "grad_norm": 0.37760069966316223, + "learning_rate": 6.426849220573901e-06, + "loss": 0.3227, + "step": 16485 + }, + { + "epoch": 0.6164918101736103, + "grad_norm": 0.8805325627326965, + "learning_rate": 6.4213649466154894e-06, + "loss": 0.3393, + "step": 16490 + }, + { + "epoch": 0.6166787391639601, + "grad_norm": 0.47188910841941833, + "learning_rate": 6.415881906825767e-06, + "loss": 0.2425, + "step": 16495 + }, + { + "epoch": 0.6168656681543099, + "grad_norm": 0.47129133343696594, + "learning_rate": 6.4104001030956755e-06, + "loss": 0.3161, + "step": 16500 + }, + { + "epoch": 0.6170525971446597, + "grad_norm": 0.43512147665023804, + "learning_rate": 6.404919537315737e-06, + "loss": 0.2694, + "step": 16505 + }, + { + "epoch": 0.6172395261350094, + "grad_norm": 0.1717224419116974, + "learning_rate": 6.399440211376033e-06, + "loss": 0.3666, + "step": 16510 + }, + { + "epoch": 0.6174264551253592, + "grad_norm": 0.6517991423606873, + "learning_rate": 6.393962127166233e-06, + "loss": 0.3249, + "step": 16515 + }, + { + "epoch": 0.6176133841157091, + "grad_norm": 0.30633100867271423, + "learning_rate": 6.388485286575572e-06, + "loss": 0.2839, + "step": 16520 + }, + { + "epoch": 0.6178003131060589, + "grad_norm": 0.5757652521133423, + "learning_rate": 6.383009691492847e-06, + "loss": 0.2825, + "step": 16525 + }, + { + "epoch": 0.6179872420964087, + "grad_norm": 0.4330870509147644, + "learning_rate": 6.377535343806446e-06, + "loss": 0.2922, + "step": 16530 + }, + { + "epoch": 0.6181741710867584, + "grad_norm": 0.279230535030365, + "learning_rate": 6.372062245404302e-06, + "loss": 0.2907, + "step": 16535 + }, + { + "epoch": 0.6183611000771082, + "grad_norm": 0.6289684176445007, + "learning_rate": 6.366590398173942e-06, + "loss": 0.4383, + "step": 16540 + }, + { + "epoch": 0.618548029067458, + "grad_norm": 0.6121999621391296, + "learning_rate": 6.3611198040024405e-06, + "loss": 0.2341, + "step": 16545 + }, + { + "epoch": 0.6187349580578078, + "grad_norm": 0.421684592962265, + "learning_rate": 6.355650464776453e-06, + "loss": 0.2145, + "step": 16550 + }, + { + "epoch": 0.6189218870481575, + "grad_norm": 0.45362263917922974, + "learning_rate": 6.350182382382193e-06, + "loss": 0.3438, + "step": 16555 + }, + { + "epoch": 0.6191088160385074, + "grad_norm": 0.35668495297431946, + "learning_rate": 6.34471555870545e-06, + "loss": 0.2971, + "step": 16560 + }, + { + "epoch": 0.6192957450288572, + "grad_norm": 0.3048703372478485, + "learning_rate": 6.339249995631575e-06, + "loss": 0.2846, + "step": 16565 + }, + { + "epoch": 0.619482674019207, + "grad_norm": 0.42319127917289734, + "learning_rate": 6.33378569504548e-06, + "loss": 0.2441, + "step": 16570 + }, + { + "epoch": 0.6196696030095568, + "grad_norm": 0.40297672152519226, + "learning_rate": 6.328322658831652e-06, + "loss": 0.2304, + "step": 16575 + }, + { + "epoch": 0.6198565319999065, + "grad_norm": 0.32730260491371155, + "learning_rate": 6.322860888874129e-06, + "loss": 0.2582, + "step": 16580 + }, + { + "epoch": 0.6200434609902563, + "grad_norm": 0.31443408131599426, + "learning_rate": 6.3174003870565256e-06, + "loss": 0.3275, + "step": 16585 + }, + { + "epoch": 0.6202303899806061, + "grad_norm": 0.440044641494751, + "learning_rate": 6.311941155262007e-06, + "loss": 0.295, + "step": 16590 + }, + { + "epoch": 0.6204173189709559, + "grad_norm": 0.396515816450119, + "learning_rate": 6.306483195373309e-06, + "loss": 0.3439, + "step": 16595 + }, + { + "epoch": 0.6206042479613056, + "grad_norm": 0.43022897839546204, + "learning_rate": 6.301026509272721e-06, + "loss": 0.334, + "step": 16600 + }, + { + "epoch": 0.6207911769516555, + "grad_norm": 0.20897158980369568, + "learning_rate": 6.2955710988421e-06, + "loss": 0.2489, + "step": 16605 + }, + { + "epoch": 0.6209781059420053, + "grad_norm": 0.43990305066108704, + "learning_rate": 6.290116965962867e-06, + "loss": 0.2715, + "step": 16610 + }, + { + "epoch": 0.6211650349323551, + "grad_norm": 0.21257735788822174, + "learning_rate": 6.284664112515988e-06, + "loss": 0.2052, + "step": 16615 + }, + { + "epoch": 0.6213519639227049, + "grad_norm": 0.4820898175239563, + "learning_rate": 6.279212540382e-06, + "loss": 0.3537, + "step": 16620 + }, + { + "epoch": 0.6215388929130546, + "grad_norm": 0.2106875628232956, + "learning_rate": 6.273762251440991e-06, + "loss": 0.2318, + "step": 16625 + }, + { + "epoch": 0.6217258219034044, + "grad_norm": 0.3450741767883301, + "learning_rate": 6.268313247572614e-06, + "loss": 0.3046, + "step": 16630 + }, + { + "epoch": 0.6219127508937542, + "grad_norm": 0.2806203365325928, + "learning_rate": 6.262865530656069e-06, + "loss": 0.2983, + "step": 16635 + }, + { + "epoch": 0.622099679884104, + "grad_norm": 0.5870185494422913, + "learning_rate": 6.257419102570122e-06, + "loss": 0.2252, + "step": 16640 + }, + { + "epoch": 0.6222866088744539, + "grad_norm": 0.33746689558029175, + "learning_rate": 6.251973965193085e-06, + "loss": 0.2831, + "step": 16645 + }, + { + "epoch": 0.6224735378648036, + "grad_norm": 0.4964042901992798, + "learning_rate": 6.246530120402833e-06, + "loss": 0.3002, + "step": 16650 + }, + { + "epoch": 0.6226604668551534, + "grad_norm": 0.31772905588150024, + "learning_rate": 6.241087570076796e-06, + "loss": 0.3224, + "step": 16655 + }, + { + "epoch": 0.6228473958455032, + "grad_norm": 0.547088623046875, + "learning_rate": 6.235646316091945e-06, + "loss": 0.2861, + "step": 16660 + }, + { + "epoch": 0.623034324835853, + "grad_norm": 0.5915294885635376, + "learning_rate": 6.230206360324822e-06, + "loss": 0.3202, + "step": 16665 + }, + { + "epoch": 0.6232212538262027, + "grad_norm": 0.3158590793609619, + "learning_rate": 6.224767704651502e-06, + "loss": 0.3383, + "step": 16670 + }, + { + "epoch": 0.6234081828165525, + "grad_norm": 0.4911237061023712, + "learning_rate": 6.219330350947632e-06, + "loss": 0.247, + "step": 16675 + }, + { + "epoch": 0.6235951118069023, + "grad_norm": 0.253912091255188, + "learning_rate": 6.213894301088388e-06, + "loss": 0.2208, + "step": 16680 + }, + { + "epoch": 0.6237820407972522, + "grad_norm": 0.3339253067970276, + "learning_rate": 6.208459556948519e-06, + "loss": 0.2759, + "step": 16685 + }, + { + "epoch": 0.623968969787602, + "grad_norm": 0.6338766813278198, + "learning_rate": 6.2030261204023055e-06, + "loss": 0.2522, + "step": 16690 + }, + { + "epoch": 0.6241558987779517, + "grad_norm": 0.3243858814239502, + "learning_rate": 6.197593993323583e-06, + "loss": 0.3472, + "step": 16695 + }, + { + "epoch": 0.6243428277683015, + "grad_norm": 0.5515966415405273, + "learning_rate": 6.192163177585745e-06, + "loss": 0.2825, + "step": 16700 + }, + { + "epoch": 0.6245297567586513, + "grad_norm": 0.3765512704849243, + "learning_rate": 6.1867336750617155e-06, + "loss": 0.2992, + "step": 16705 + }, + { + "epoch": 0.6247166857490011, + "grad_norm": 0.3543432950973511, + "learning_rate": 6.181305487623981e-06, + "loss": 0.3033, + "step": 16710 + }, + { + "epoch": 0.6249036147393509, + "grad_norm": 0.29796430468559265, + "learning_rate": 6.175878617144559e-06, + "loss": 0.2713, + "step": 16715 + }, + { + "epoch": 0.6250905437297006, + "grad_norm": 0.38573959469795227, + "learning_rate": 6.170453065495032e-06, + "loss": 0.2466, + "step": 16720 + }, + { + "epoch": 0.6252774727200505, + "grad_norm": 0.44721537828445435, + "learning_rate": 6.165028834546507e-06, + "loss": 0.2684, + "step": 16725 + }, + { + "epoch": 0.6254644017104003, + "grad_norm": 0.290237694978714, + "learning_rate": 6.1596059261696564e-06, + "loss": 0.266, + "step": 16730 + }, + { + "epoch": 0.6256513307007501, + "grad_norm": 0.36702224612236023, + "learning_rate": 6.154184342234678e-06, + "loss": 0.2362, + "step": 16735 + }, + { + "epoch": 0.6258382596910999, + "grad_norm": 0.4877489507198334, + "learning_rate": 6.148764084611325e-06, + "loss": 0.2409, + "step": 16740 + }, + { + "epoch": 0.6260251886814496, + "grad_norm": 0.7535521388053894, + "learning_rate": 6.143345155168885e-06, + "loss": 0.2288, + "step": 16745 + }, + { + "epoch": 0.6262121176717994, + "grad_norm": 0.2947673201560974, + "learning_rate": 6.137927555776194e-06, + "loss": 0.2387, + "step": 16750 + }, + { + "epoch": 0.6263990466621492, + "grad_norm": 0.4088467061519623, + "learning_rate": 6.1325112883016306e-06, + "loss": 0.2728, + "step": 16755 + }, + { + "epoch": 0.626585975652499, + "grad_norm": 0.5079031586647034, + "learning_rate": 6.1270963546131005e-06, + "loss": 0.2597, + "step": 16760 + }, + { + "epoch": 0.6267729046428488, + "grad_norm": 0.31118345260620117, + "learning_rate": 6.121682756578069e-06, + "loss": 0.3091, + "step": 16765 + }, + { + "epoch": 0.6269598336331986, + "grad_norm": 0.7571595311164856, + "learning_rate": 6.116270496063523e-06, + "loss": 0.2882, + "step": 16770 + }, + { + "epoch": 0.6271467626235484, + "grad_norm": 0.3171136975288391, + "learning_rate": 6.110859574936006e-06, + "loss": 0.2565, + "step": 16775 + }, + { + "epoch": 0.6273336916138982, + "grad_norm": 0.448777973651886, + "learning_rate": 6.105449995061579e-06, + "loss": 0.2222, + "step": 16780 + }, + { + "epoch": 0.627520620604248, + "grad_norm": 0.44783681631088257, + "learning_rate": 6.1000417583058595e-06, + "loss": 0.3105, + "step": 16785 + }, + { + "epoch": 0.6277075495945977, + "grad_norm": 0.24276278913021088, + "learning_rate": 6.094634866533984e-06, + "loss": 0.2999, + "step": 16790 + }, + { + "epoch": 0.6278944785849475, + "grad_norm": 0.25277918577194214, + "learning_rate": 6.089229321610641e-06, + "loss": 0.2993, + "step": 16795 + }, + { + "epoch": 0.6280814075752973, + "grad_norm": 0.32562288641929626, + "learning_rate": 6.083825125400052e-06, + "loss": 0.2675, + "step": 16800 + }, + { + "epoch": 0.6282683365656472, + "grad_norm": 0.5652545094490051, + "learning_rate": 6.078422279765961e-06, + "loss": 0.2676, + "step": 16805 + }, + { + "epoch": 0.628455265555997, + "grad_norm": 0.5595022439956665, + "learning_rate": 6.073020786571662e-06, + "loss": 0.2698, + "step": 16810 + }, + { + "epoch": 0.6286421945463467, + "grad_norm": 0.18715853989124298, + "learning_rate": 6.067620647679966e-06, + "loss": 0.3081, + "step": 16815 + }, + { + "epoch": 0.6288291235366965, + "grad_norm": 0.4392125904560089, + "learning_rate": 6.062221864953237e-06, + "loss": 0.2487, + "step": 16820 + }, + { + "epoch": 0.6290160525270463, + "grad_norm": 0.542143702507019, + "learning_rate": 6.0568244402533525e-06, + "loss": 0.3002, + "step": 16825 + }, + { + "epoch": 0.6292029815173961, + "grad_norm": 0.3177799582481384, + "learning_rate": 6.051428375441735e-06, + "loss": 0.2498, + "step": 16830 + }, + { + "epoch": 0.6293899105077458, + "grad_norm": 0.731010377407074, + "learning_rate": 6.046033672379325e-06, + "loss": 0.2696, + "step": 16835 + }, + { + "epoch": 0.6295768394980956, + "grad_norm": 0.3050804138183594, + "learning_rate": 6.040640332926606e-06, + "loss": 0.2236, + "step": 16840 + }, + { + "epoch": 0.6297637684884454, + "grad_norm": 0.48626509308815, + "learning_rate": 6.035248358943591e-06, + "loss": 0.3319, + "step": 16845 + }, + { + "epoch": 0.6299506974787953, + "grad_norm": 0.4176667034626007, + "learning_rate": 6.0298577522898095e-06, + "loss": 0.2927, + "step": 16850 + }, + { + "epoch": 0.6301376264691451, + "grad_norm": 0.3971768319606781, + "learning_rate": 6.024468514824333e-06, + "loss": 0.2824, + "step": 16855 + }, + { + "epoch": 0.6303245554594948, + "grad_norm": 0.4421550929546356, + "learning_rate": 6.019080648405747e-06, + "loss": 0.2794, + "step": 16860 + }, + { + "epoch": 0.6305114844498446, + "grad_norm": 0.30311697721481323, + "learning_rate": 6.013694154892183e-06, + "loss": 0.2382, + "step": 16865 + }, + { + "epoch": 0.6306984134401944, + "grad_norm": 0.14912711083889008, + "learning_rate": 6.008309036141279e-06, + "loss": 0.2516, + "step": 16870 + }, + { + "epoch": 0.6308853424305442, + "grad_norm": 0.4212360978126526, + "learning_rate": 6.0029252940102154e-06, + "loss": 0.3549, + "step": 16875 + }, + { + "epoch": 0.631072271420894, + "grad_norm": 1.1165913343429565, + "learning_rate": 5.997542930355685e-06, + "loss": 0.3441, + "step": 16880 + }, + { + "epoch": 0.6312592004112437, + "grad_norm": 0.5724372267723083, + "learning_rate": 5.992161947033912e-06, + "loss": 0.273, + "step": 16885 + }, + { + "epoch": 0.6314461294015936, + "grad_norm": 0.39647606015205383, + "learning_rate": 5.9867823459006466e-06, + "loss": 0.2822, + "step": 16890 + }, + { + "epoch": 0.6316330583919434, + "grad_norm": 0.6420783996582031, + "learning_rate": 5.981404128811157e-06, + "loss": 0.353, + "step": 16895 + }, + { + "epoch": 0.6318199873822932, + "grad_norm": 0.21946968138217926, + "learning_rate": 5.976027297620237e-06, + "loss": 0.2933, + "step": 16900 + }, + { + "epoch": 0.632006916372643, + "grad_norm": 0.48058822751045227, + "learning_rate": 5.970651854182197e-06, + "loss": 0.2776, + "step": 16905 + }, + { + "epoch": 0.6321938453629927, + "grad_norm": 0.6671790480613708, + "learning_rate": 5.965277800350879e-06, + "loss": 0.2776, + "step": 16910 + }, + { + "epoch": 0.6323807743533425, + "grad_norm": 0.3052579164505005, + "learning_rate": 5.959905137979637e-06, + "loss": 0.3139, + "step": 16915 + }, + { + "epoch": 0.6325677033436923, + "grad_norm": 0.37527140974998474, + "learning_rate": 5.954533868921352e-06, + "loss": 0.2721, + "step": 16920 + }, + { + "epoch": 0.632754632334042, + "grad_norm": 0.3904482126235962, + "learning_rate": 5.949163995028418e-06, + "loss": 0.2949, + "step": 16925 + }, + { + "epoch": 0.6329415613243919, + "grad_norm": 0.3578369617462158, + "learning_rate": 5.943795518152747e-06, + "loss": 0.315, + "step": 16930 + }, + { + "epoch": 0.6331284903147417, + "grad_norm": 0.2731132507324219, + "learning_rate": 5.9384284401457824e-06, + "loss": 0.4051, + "step": 16935 + }, + { + "epoch": 0.6333154193050915, + "grad_norm": 0.3908912241458893, + "learning_rate": 5.933062762858467e-06, + "loss": 0.2259, + "step": 16940 + }, + { + "epoch": 0.6335023482954413, + "grad_norm": 0.6744188070297241, + "learning_rate": 5.927698488141273e-06, + "loss": 0.2906, + "step": 16945 + }, + { + "epoch": 0.633689277285791, + "grad_norm": 0.3423663377761841, + "learning_rate": 5.9223356178441835e-06, + "loss": 0.2811, + "step": 16950 + }, + { + "epoch": 0.6338762062761408, + "grad_norm": 0.4647408723831177, + "learning_rate": 5.9169741538167015e-06, + "loss": 0.2524, + "step": 16955 + }, + { + "epoch": 0.6340631352664906, + "grad_norm": 0.33333298563957214, + "learning_rate": 5.9116140979078364e-06, + "loss": 0.2098, + "step": 16960 + }, + { + "epoch": 0.6342500642568404, + "grad_norm": 0.41983723640441895, + "learning_rate": 5.906255451966127e-06, + "loss": 0.257, + "step": 16965 + }, + { + "epoch": 0.6344369932471903, + "grad_norm": 0.5163545608520508, + "learning_rate": 5.900898217839608e-06, + "loss": 0.3418, + "step": 16970 + }, + { + "epoch": 0.63462392223754, + "grad_norm": 0.5699335932731628, + "learning_rate": 5.895542397375837e-06, + "loss": 0.2937, + "step": 16975 + }, + { + "epoch": 0.6348108512278898, + "grad_norm": 0.3931114077568054, + "learning_rate": 5.8901879924218915e-06, + "loss": 0.3099, + "step": 16980 + }, + { + "epoch": 0.6349977802182396, + "grad_norm": 0.19795525074005127, + "learning_rate": 5.884835004824343e-06, + "loss": 0.3461, + "step": 16985 + }, + { + "epoch": 0.6351847092085894, + "grad_norm": 0.34459546208381653, + "learning_rate": 5.87948343642929e-06, + "loss": 0.2567, + "step": 16990 + }, + { + "epoch": 0.6353716381989392, + "grad_norm": 0.37620869278907776, + "learning_rate": 5.874133289082329e-06, + "loss": 0.2748, + "step": 16995 + }, + { + "epoch": 0.6355585671892889, + "grad_norm": 0.30327436327934265, + "learning_rate": 5.868784564628578e-06, + "loss": 0.2824, + "step": 17000 + }, + { + "epoch": 0.6357454961796387, + "grad_norm": 0.3879458010196686, + "learning_rate": 5.863437264912653e-06, + "loss": 0.3167, + "step": 17005 + }, + { + "epoch": 0.6359324251699886, + "grad_norm": 0.3175660967826843, + "learning_rate": 5.858091391778691e-06, + "loss": 0.3064, + "step": 17010 + }, + { + "epoch": 0.6361193541603384, + "grad_norm": 0.7663373351097107, + "learning_rate": 5.852746947070326e-06, + "loss": 0.3244, + "step": 17015 + }, + { + "epoch": 0.6363062831506882, + "grad_norm": 0.3658159375190735, + "learning_rate": 5.847403932630702e-06, + "loss": 0.2626, + "step": 17020 + }, + { + "epoch": 0.6364932121410379, + "grad_norm": 0.3354896008968353, + "learning_rate": 5.84206235030248e-06, + "loss": 0.2767, + "step": 17025 + }, + { + "epoch": 0.6366801411313877, + "grad_norm": 0.4783341586589813, + "learning_rate": 5.836722201927809e-06, + "loss": 0.383, + "step": 17030 + }, + { + "epoch": 0.6368670701217375, + "grad_norm": 0.5083820819854736, + "learning_rate": 5.831383489348361e-06, + "loss": 0.3022, + "step": 17035 + }, + { + "epoch": 0.6370539991120873, + "grad_norm": 0.34456634521484375, + "learning_rate": 5.826046214405298e-06, + "loss": 0.3142, + "step": 17040 + }, + { + "epoch": 0.637240928102437, + "grad_norm": 0.5159575343132019, + "learning_rate": 5.820710378939301e-06, + "loss": 0.2298, + "step": 17045 + }, + { + "epoch": 0.6374278570927869, + "grad_norm": 0.5502740740776062, + "learning_rate": 5.815375984790543e-06, + "loss": 0.4202, + "step": 17050 + }, + { + "epoch": 0.6376147860831367, + "grad_norm": 0.6210566759109497, + "learning_rate": 5.810043033798702e-06, + "loss": 0.3434, + "step": 17055 + }, + { + "epoch": 0.6378017150734865, + "grad_norm": 0.47056567668914795, + "learning_rate": 5.804711527802957e-06, + "loss": 0.2732, + "step": 17060 + }, + { + "epoch": 0.6379886440638363, + "grad_norm": 0.5329228043556213, + "learning_rate": 5.799381468641998e-06, + "loss": 0.2628, + "step": 17065 + }, + { + "epoch": 0.638175573054186, + "grad_norm": 0.5529435276985168, + "learning_rate": 5.794052858154009e-06, + "loss": 0.3167, + "step": 17070 + }, + { + "epoch": 0.6383625020445358, + "grad_norm": 0.27884042263031006, + "learning_rate": 5.788725698176672e-06, + "loss": 0.246, + "step": 17075 + }, + { + "epoch": 0.6385494310348856, + "grad_norm": 0.4293839931488037, + "learning_rate": 5.783399990547176e-06, + "loss": 0.2365, + "step": 17080 + }, + { + "epoch": 0.6387363600252354, + "grad_norm": 0.47030767798423767, + "learning_rate": 5.778075737102198e-06, + "loss": 0.3154, + "step": 17085 + }, + { + "epoch": 0.6389232890155851, + "grad_norm": 0.2606756091117859, + "learning_rate": 5.772752939677929e-06, + "loss": 0.2479, + "step": 17090 + }, + { + "epoch": 0.639110218005935, + "grad_norm": 0.527047872543335, + "learning_rate": 5.767431600110042e-06, + "loss": 0.3171, + "step": 17095 + }, + { + "epoch": 0.6392971469962848, + "grad_norm": 0.49253425002098083, + "learning_rate": 5.7621117202337205e-06, + "loss": 0.3053, + "step": 17100 + }, + { + "epoch": 0.6394840759866346, + "grad_norm": 0.5128486752510071, + "learning_rate": 5.7567933018836365e-06, + "loss": 0.3014, + "step": 17105 + }, + { + "epoch": 0.6396710049769844, + "grad_norm": 0.46093523502349854, + "learning_rate": 5.751476346893956e-06, + "loss": 0.3348, + "step": 17110 + }, + { + "epoch": 0.6398579339673341, + "grad_norm": 0.3886370360851288, + "learning_rate": 5.746160857098351e-06, + "loss": 0.2569, + "step": 17115 + }, + { + "epoch": 0.6400448629576839, + "grad_norm": 0.6568734049797058, + "learning_rate": 5.740846834329974e-06, + "loss": 0.2808, + "step": 17120 + }, + { + "epoch": 0.6402317919480337, + "grad_norm": 0.39880597591400146, + "learning_rate": 5.735534280421489e-06, + "loss": 0.3035, + "step": 17125 + }, + { + "epoch": 0.6404187209383835, + "grad_norm": 0.47509804368019104, + "learning_rate": 5.730223197205034e-06, + "loss": 0.2522, + "step": 17130 + }, + { + "epoch": 0.6406056499287334, + "grad_norm": 0.22290876507759094, + "learning_rate": 5.7249135865122575e-06, + "loss": 0.2568, + "step": 17135 + }, + { + "epoch": 0.6407925789190831, + "grad_norm": 0.36923328042030334, + "learning_rate": 5.719605450174283e-06, + "loss": 0.2765, + "step": 17140 + }, + { + "epoch": 0.6409795079094329, + "grad_norm": 0.43416711688041687, + "learning_rate": 5.7142987900217464e-06, + "loss": 0.3103, + "step": 17145 + }, + { + "epoch": 0.6411664368997827, + "grad_norm": 0.32463186979293823, + "learning_rate": 5.708993607884754e-06, + "loss": 0.2868, + "step": 17150 + }, + { + "epoch": 0.6413533658901325, + "grad_norm": 0.21319809556007385, + "learning_rate": 5.703689905592911e-06, + "loss": 0.244, + "step": 17155 + }, + { + "epoch": 0.6415402948804823, + "grad_norm": 0.39962127804756165, + "learning_rate": 5.698387684975317e-06, + "loss": 0.3021, + "step": 17160 + }, + { + "epoch": 0.641727223870832, + "grad_norm": 0.48100176453590393, + "learning_rate": 5.693086947860551e-06, + "loss": 0.3278, + "step": 17165 + }, + { + "epoch": 0.6419141528611818, + "grad_norm": 0.5302535891532898, + "learning_rate": 5.687787696076692e-06, + "loss": 0.2589, + "step": 17170 + }, + { + "epoch": 0.6421010818515317, + "grad_norm": 0.5527657866477966, + "learning_rate": 5.682489931451292e-06, + "loss": 0.3159, + "step": 17175 + }, + { + "epoch": 0.6422880108418815, + "grad_norm": 0.3536984920501709, + "learning_rate": 5.677193655811406e-06, + "loss": 0.2313, + "step": 17180 + }, + { + "epoch": 0.6424749398322313, + "grad_norm": 0.49602222442626953, + "learning_rate": 5.67189887098356e-06, + "loss": 0.3013, + "step": 17185 + }, + { + "epoch": 0.642661868822581, + "grad_norm": 0.5658937096595764, + "learning_rate": 5.666605578793782e-06, + "loss": 0.2457, + "step": 17190 + }, + { + "epoch": 0.6428487978129308, + "grad_norm": 0.10081786662340164, + "learning_rate": 5.661313781067572e-06, + "loss": 0.2528, + "step": 17195 + }, + { + "epoch": 0.6430357268032806, + "grad_norm": 0.2929933965206146, + "learning_rate": 5.656023479629915e-06, + "loss": 0.282, + "step": 17200 + }, + { + "epoch": 0.6432226557936304, + "grad_norm": 0.27145472168922424, + "learning_rate": 5.650734676305295e-06, + "loss": 0.2697, + "step": 17205 + }, + { + "epoch": 0.6434095847839801, + "grad_norm": 0.45547887682914734, + "learning_rate": 5.645447372917658e-06, + "loss": 0.2184, + "step": 17210 + }, + { + "epoch": 0.64359651377433, + "grad_norm": 0.3759501576423645, + "learning_rate": 5.640161571290452e-06, + "loss": 0.259, + "step": 17215 + }, + { + "epoch": 0.6437834427646798, + "grad_norm": 0.5527650117874146, + "learning_rate": 5.6348772732465925e-06, + "loss": 0.271, + "step": 17220 + }, + { + "epoch": 0.6439703717550296, + "grad_norm": 0.6469981670379639, + "learning_rate": 5.629594480608487e-06, + "loss": 0.2916, + "step": 17225 + }, + { + "epoch": 0.6441573007453794, + "grad_norm": 0.3704184591770172, + "learning_rate": 5.6243131951980145e-06, + "loss": 0.2614, + "step": 17230 + }, + { + "epoch": 0.6443442297357291, + "grad_norm": 0.3571639358997345, + "learning_rate": 5.619033418836545e-06, + "loss": 0.3213, + "step": 17235 + }, + { + "epoch": 0.6445311587260789, + "grad_norm": 0.40060603618621826, + "learning_rate": 5.613755153344918e-06, + "loss": 0.2876, + "step": 17240 + }, + { + "epoch": 0.6447180877164287, + "grad_norm": 0.38819968700408936, + "learning_rate": 5.608478400543455e-06, + "loss": 0.2737, + "step": 17245 + }, + { + "epoch": 0.6449050167067785, + "grad_norm": 0.38324692845344543, + "learning_rate": 5.60320316225196e-06, + "loss": 0.276, + "step": 17250 + }, + { + "epoch": 0.6450919456971284, + "grad_norm": 0.2735843360424042, + "learning_rate": 5.597929440289709e-06, + "loss": 0.3314, + "step": 17255 + }, + { + "epoch": 0.6452788746874781, + "grad_norm": 0.5796816349029541, + "learning_rate": 5.592657236475461e-06, + "loss": 0.3573, + "step": 17260 + }, + { + "epoch": 0.6454658036778279, + "grad_norm": 0.3068085312843323, + "learning_rate": 5.587386552627442e-06, + "loss": 0.3163, + "step": 17265 + }, + { + "epoch": 0.6456527326681777, + "grad_norm": 0.5472904443740845, + "learning_rate": 5.582117390563368e-06, + "loss": 0.3123, + "step": 17270 + }, + { + "epoch": 0.6458396616585275, + "grad_norm": 0.3581070005893707, + "learning_rate": 5.576849752100413e-06, + "loss": 0.2803, + "step": 17275 + }, + { + "epoch": 0.6460265906488772, + "grad_norm": 0.40146610140800476, + "learning_rate": 5.571583639055243e-06, + "loss": 0.3476, + "step": 17280 + }, + { + "epoch": 0.646213519639227, + "grad_norm": 0.43631863594055176, + "learning_rate": 5.566319053243983e-06, + "loss": 0.2792, + "step": 17285 + }, + { + "epoch": 0.6464004486295768, + "grad_norm": 0.3435373604297638, + "learning_rate": 5.561055996482243e-06, + "loss": 0.291, + "step": 17290 + }, + { + "epoch": 0.6465873776199267, + "grad_norm": 0.5556064248085022, + "learning_rate": 5.555794470585099e-06, + "loss": 0.2866, + "step": 17295 + }, + { + "epoch": 0.6467743066102765, + "grad_norm": 0.6091718077659607, + "learning_rate": 5.550534477367096e-06, + "loss": 0.3378, + "step": 17300 + }, + { + "epoch": 0.6469612356006262, + "grad_norm": 0.3173150420188904, + "learning_rate": 5.54527601864226e-06, + "loss": 0.2673, + "step": 17305 + }, + { + "epoch": 0.647148164590976, + "grad_norm": 0.629828929901123, + "learning_rate": 5.540019096224079e-06, + "loss": 0.2741, + "step": 17310 + }, + { + "epoch": 0.6473350935813258, + "grad_norm": 0.6558170914649963, + "learning_rate": 5.534763711925522e-06, + "loss": 0.3303, + "step": 17315 + }, + { + "epoch": 0.6475220225716756, + "grad_norm": 0.3189551532268524, + "learning_rate": 5.529509867559011e-06, + "loss": 0.2273, + "step": 17320 + }, + { + "epoch": 0.6477089515620253, + "grad_norm": 0.4152987003326416, + "learning_rate": 5.524257564936454e-06, + "loss": 0.2371, + "step": 17325 + }, + { + "epoch": 0.6478958805523751, + "grad_norm": 0.46988746523857117, + "learning_rate": 5.519006805869213e-06, + "loss": 0.2364, + "step": 17330 + }, + { + "epoch": 0.6480828095427249, + "grad_norm": 0.25283321738243103, + "learning_rate": 5.513757592168132e-06, + "loss": 0.2328, + "step": 17335 + }, + { + "epoch": 0.6482697385330748, + "grad_norm": 0.792521059513092, + "learning_rate": 5.508509925643511e-06, + "loss": 0.2674, + "step": 17340 + }, + { + "epoch": 0.6484566675234246, + "grad_norm": 0.7444917559623718, + "learning_rate": 5.503263808105114e-06, + "loss": 0.2861, + "step": 17345 + }, + { + "epoch": 0.6486435965137743, + "grad_norm": 0.4276942312717438, + "learning_rate": 5.4980192413621866e-06, + "loss": 0.2641, + "step": 17350 + }, + { + "epoch": 0.6488305255041241, + "grad_norm": 0.4908183217048645, + "learning_rate": 5.49277622722342e-06, + "loss": 0.2743, + "step": 17355 + }, + { + "epoch": 0.6490174544944739, + "grad_norm": 0.39298996329307556, + "learning_rate": 5.487534767496989e-06, + "loss": 0.2977, + "step": 17360 + }, + { + "epoch": 0.6492043834848237, + "grad_norm": 0.1759922206401825, + "learning_rate": 5.482294863990514e-06, + "loss": 0.3114, + "step": 17365 + }, + { + "epoch": 0.6493913124751735, + "grad_norm": 0.43271568417549133, + "learning_rate": 5.477056518511096e-06, + "loss": 0.2849, + "step": 17370 + }, + { + "epoch": 0.6495782414655232, + "grad_norm": 0.4080221951007843, + "learning_rate": 5.471819732865282e-06, + "loss": 0.2555, + "step": 17375 + }, + { + "epoch": 0.6497651704558731, + "grad_norm": 0.3136856257915497, + "learning_rate": 5.466584508859096e-06, + "loss": 0.1905, + "step": 17380 + }, + { + "epoch": 0.6499520994462229, + "grad_norm": 0.3478422164916992, + "learning_rate": 5.461350848298016e-06, + "loss": 0.356, + "step": 17385 + }, + { + "epoch": 0.6501390284365727, + "grad_norm": 0.3305181562900543, + "learning_rate": 5.456118752986975e-06, + "loss": 0.3352, + "step": 17390 + }, + { + "epoch": 0.6503259574269225, + "grad_norm": 0.35783740878105164, + "learning_rate": 5.4508882247303815e-06, + "loss": 0.2462, + "step": 17395 + }, + { + "epoch": 0.6505128864172722, + "grad_norm": 0.14547370374202728, + "learning_rate": 5.445659265332087e-06, + "loss": 0.2325, + "step": 17400 + }, + { + "epoch": 0.650699815407622, + "grad_norm": 0.26868870854377747, + "learning_rate": 5.440431876595418e-06, + "loss": 0.2746, + "step": 17405 + }, + { + "epoch": 0.6508867443979718, + "grad_norm": 0.4959259629249573, + "learning_rate": 5.435206060323142e-06, + "loss": 0.2315, + "step": 17410 + }, + { + "epoch": 0.6510736733883216, + "grad_norm": 0.5970064997673035, + "learning_rate": 5.4299818183175006e-06, + "loss": 0.3014, + "step": 17415 + }, + { + "epoch": 0.6512606023786714, + "grad_norm": 0.4646480679512024, + "learning_rate": 5.424759152380179e-06, + "loss": 0.3339, + "step": 17420 + }, + { + "epoch": 0.6514475313690212, + "grad_norm": 0.31364089250564575, + "learning_rate": 5.419538064312333e-06, + "loss": 0.3644, + "step": 17425 + }, + { + "epoch": 0.651634460359371, + "grad_norm": 0.4947362244129181, + "learning_rate": 5.414318555914563e-06, + "loss": 0.2448, + "step": 17430 + }, + { + "epoch": 0.6518213893497208, + "grad_norm": 0.3628036677837372, + "learning_rate": 5.409100628986921e-06, + "loss": 0.2198, + "step": 17435 + }, + { + "epoch": 0.6520083183400706, + "grad_norm": 0.38185828924179077, + "learning_rate": 5.4038842853289305e-06, + "loss": 0.3506, + "step": 17440 + }, + { + "epoch": 0.6521952473304203, + "grad_norm": 0.37392547726631165, + "learning_rate": 5.398669526739551e-06, + "loss": 0.2647, + "step": 17445 + }, + { + "epoch": 0.6523821763207701, + "grad_norm": 0.3992078900337219, + "learning_rate": 5.39345635501721e-06, + "loss": 0.2584, + "step": 17450 + }, + { + "epoch": 0.6525691053111199, + "grad_norm": 0.27611681818962097, + "learning_rate": 5.388244771959777e-06, + "loss": 0.2419, + "step": 17455 + }, + { + "epoch": 0.6527560343014698, + "grad_norm": 0.42422646284103394, + "learning_rate": 5.38303477936458e-06, + "loss": 0.2328, + "step": 17460 + }, + { + "epoch": 0.6529429632918196, + "grad_norm": 0.25942620635032654, + "learning_rate": 5.3778263790283905e-06, + "loss": 0.2184, + "step": 17465 + }, + { + "epoch": 0.6531298922821693, + "grad_norm": 0.4233878552913666, + "learning_rate": 5.372619572747442e-06, + "loss": 0.3018, + "step": 17470 + }, + { + "epoch": 0.6533168212725191, + "grad_norm": 0.6455023884773254, + "learning_rate": 5.3674143623174144e-06, + "loss": 0.269, + "step": 17475 + }, + { + "epoch": 0.6535037502628689, + "grad_norm": 0.635086178779602, + "learning_rate": 5.362210749533434e-06, + "loss": 0.2835, + "step": 17480 + }, + { + "epoch": 0.6536906792532187, + "grad_norm": 0.21044528484344482, + "learning_rate": 5.357008736190077e-06, + "loss": 0.3771, + "step": 17485 + }, + { + "epoch": 0.6538776082435684, + "grad_norm": 0.5174884796142578, + "learning_rate": 5.351808324081362e-06, + "loss": 0.2438, + "step": 17490 + }, + { + "epoch": 0.6540645372339182, + "grad_norm": 0.4793969988822937, + "learning_rate": 5.346609515000775e-06, + "loss": 0.2707, + "step": 17495 + }, + { + "epoch": 0.6542514662242681, + "grad_norm": 0.39202970266342163, + "learning_rate": 5.341412310741225e-06, + "loss": 0.2867, + "step": 17500 + }, + { + "epoch": 0.6544383952146179, + "grad_norm": 0.3729572296142578, + "learning_rate": 5.336216713095087e-06, + "loss": 0.3556, + "step": 17505 + }, + { + "epoch": 0.6546253242049677, + "grad_norm": 0.2661343514919281, + "learning_rate": 5.3310227238541665e-06, + "loss": 0.2963, + "step": 17510 + }, + { + "epoch": 0.6548122531953174, + "grad_norm": 0.6629586815834045, + "learning_rate": 5.325830344809726e-06, + "loss": 0.3187, + "step": 17515 + }, + { + "epoch": 0.6549991821856672, + "grad_norm": 0.5217400789260864, + "learning_rate": 5.320639577752471e-06, + "loss": 0.2794, + "step": 17520 + }, + { + "epoch": 0.655186111176017, + "grad_norm": 0.40809670090675354, + "learning_rate": 5.315450424472546e-06, + "loss": 0.3342, + "step": 17525 + }, + { + "epoch": 0.6553730401663668, + "grad_norm": 0.3134659230709076, + "learning_rate": 5.31026288675954e-06, + "loss": 0.2624, + "step": 17530 + }, + { + "epoch": 0.6555599691567165, + "grad_norm": 0.5745450258255005, + "learning_rate": 5.305076966402483e-06, + "loss": 0.3727, + "step": 17535 + }, + { + "epoch": 0.6557468981470664, + "grad_norm": 0.6128538846969604, + "learning_rate": 5.299892665189856e-06, + "loss": 0.2772, + "step": 17540 + }, + { + "epoch": 0.6559338271374162, + "grad_norm": 0.2917860746383667, + "learning_rate": 5.2947099849095695e-06, + "loss": 0.3028, + "step": 17545 + }, + { + "epoch": 0.656120756127766, + "grad_norm": 0.6575778722763062, + "learning_rate": 5.2895289273489915e-06, + "loss": 0.2669, + "step": 17550 + }, + { + "epoch": 0.6563076851181158, + "grad_norm": 0.4796570837497711, + "learning_rate": 5.2843494942949095e-06, + "loss": 0.3287, + "step": 17555 + }, + { + "epoch": 0.6564946141084655, + "grad_norm": 0.5242589712142944, + "learning_rate": 5.279171687533566e-06, + "loss": 0.2346, + "step": 17560 + }, + { + "epoch": 0.6566815430988153, + "grad_norm": 0.4839895963668823, + "learning_rate": 5.273995508850643e-06, + "loss": 0.2419, + "step": 17565 + }, + { + "epoch": 0.6568684720891651, + "grad_norm": 0.4953691065311432, + "learning_rate": 5.268820960031252e-06, + "loss": 0.3018, + "step": 17570 + }, + { + "epoch": 0.6570554010795149, + "grad_norm": 0.34803852438926697, + "learning_rate": 5.263648042859945e-06, + "loss": 0.2719, + "step": 17575 + }, + { + "epoch": 0.6572423300698647, + "grad_norm": 0.4037022888660431, + "learning_rate": 5.258476759120713e-06, + "loss": 0.251, + "step": 17580 + }, + { + "epoch": 0.6574292590602145, + "grad_norm": 0.38551491498947144, + "learning_rate": 5.253307110596988e-06, + "loss": 0.1929, + "step": 17585 + }, + { + "epoch": 0.6576161880505643, + "grad_norm": 0.33148205280303955, + "learning_rate": 5.248139099071625e-06, + "loss": 0.2561, + "step": 17590 + }, + { + "epoch": 0.6578031170409141, + "grad_norm": 0.42896026372909546, + "learning_rate": 5.242972726326934e-06, + "loss": 0.2475, + "step": 17595 + }, + { + "epoch": 0.6579900460312639, + "grad_norm": 0.5869537591934204, + "learning_rate": 5.23780799414464e-06, + "loss": 0.3102, + "step": 17600 + }, + { + "epoch": 0.6581769750216137, + "grad_norm": 0.42332693934440613, + "learning_rate": 5.232644904305914e-06, + "loss": 0.2316, + "step": 17605 + }, + { + "epoch": 0.6583639040119634, + "grad_norm": 0.20432531833648682, + "learning_rate": 5.227483458591364e-06, + "loss": 0.3037, + "step": 17610 + }, + { + "epoch": 0.6585508330023132, + "grad_norm": 0.5013585686683655, + "learning_rate": 5.222323658781018e-06, + "loss": 0.2422, + "step": 17615 + }, + { + "epoch": 0.658737761992663, + "grad_norm": 0.5252900719642639, + "learning_rate": 5.21716550665434e-06, + "loss": 0.2841, + "step": 17620 + }, + { + "epoch": 0.6589246909830129, + "grad_norm": 0.3324468433856964, + "learning_rate": 5.212009003990237e-06, + "loss": 0.2643, + "step": 17625 + }, + { + "epoch": 0.6591116199733627, + "grad_norm": 0.5473644137382507, + "learning_rate": 5.206854152567036e-06, + "loss": 0.2717, + "step": 17630 + }, + { + "epoch": 0.6592985489637124, + "grad_norm": 1.3611345291137695, + "learning_rate": 5.201700954162493e-06, + "loss": 0.2884, + "step": 17635 + }, + { + "epoch": 0.6594854779540622, + "grad_norm": 0.6788679957389832, + "learning_rate": 5.196549410553806e-06, + "loss": 0.2607, + "step": 17640 + }, + { + "epoch": 0.659672406944412, + "grad_norm": 0.5106036067008972, + "learning_rate": 5.191399523517586e-06, + "loss": 0.3072, + "step": 17645 + }, + { + "epoch": 0.6598593359347618, + "grad_norm": 0.8317034840583801, + "learning_rate": 5.1862512948298885e-06, + "loss": 0.3571, + "step": 17650 + }, + { + "epoch": 0.6600462649251115, + "grad_norm": 0.3770235776901245, + "learning_rate": 5.181104726266191e-06, + "loss": 0.3051, + "step": 17655 + }, + { + "epoch": 0.6602331939154613, + "grad_norm": 0.33272841572761536, + "learning_rate": 5.17595981960139e-06, + "loss": 0.3451, + "step": 17660 + }, + { + "epoch": 0.6604201229058112, + "grad_norm": 0.36768677830696106, + "learning_rate": 5.170816576609825e-06, + "loss": 0.259, + "step": 17665 + }, + { + "epoch": 0.660607051896161, + "grad_norm": 0.4106055498123169, + "learning_rate": 5.16567499906525e-06, + "loss": 0.342, + "step": 17670 + }, + { + "epoch": 0.6607939808865108, + "grad_norm": 0.44016748666763306, + "learning_rate": 5.1605350887408466e-06, + "loss": 0.3125, + "step": 17675 + }, + { + "epoch": 0.6609809098768605, + "grad_norm": 0.40315133333206177, + "learning_rate": 5.1553968474092185e-06, + "loss": 0.2638, + "step": 17680 + }, + { + "epoch": 0.6611678388672103, + "grad_norm": 0.22286388278007507, + "learning_rate": 5.150260276842407e-06, + "loss": 0.2822, + "step": 17685 + }, + { + "epoch": 0.6613547678575601, + "grad_norm": 0.3545617163181305, + "learning_rate": 5.1451253788118595e-06, + "loss": 0.3389, + "step": 17690 + }, + { + "epoch": 0.6615416968479099, + "grad_norm": 0.4508470594882965, + "learning_rate": 5.139992155088458e-06, + "loss": 0.28, + "step": 17695 + }, + { + "epoch": 0.6617286258382596, + "grad_norm": 0.3894427418708801, + "learning_rate": 5.13486060744251e-06, + "loss": 0.2735, + "step": 17700 + }, + { + "epoch": 0.6619155548286095, + "grad_norm": 0.5788761377334595, + "learning_rate": 5.12973073764373e-06, + "loss": 0.3752, + "step": 17705 + }, + { + "epoch": 0.6621024838189593, + "grad_norm": 0.2697499394416809, + "learning_rate": 5.124602547461273e-06, + "loss": 0.2663, + "step": 17710 + }, + { + "epoch": 0.6622894128093091, + "grad_norm": 0.41945934295654297, + "learning_rate": 5.119476038663699e-06, + "loss": 0.3097, + "step": 17715 + }, + { + "epoch": 0.6624763417996589, + "grad_norm": 0.20760861039161682, + "learning_rate": 5.1143512130189935e-06, + "loss": 0.2761, + "step": 17720 + }, + { + "epoch": 0.6626632707900086, + "grad_norm": 0.47304192185401917, + "learning_rate": 5.109228072294561e-06, + "loss": 0.277, + "step": 17725 + }, + { + "epoch": 0.6628501997803584, + "grad_norm": 0.3576745390892029, + "learning_rate": 5.1041066182572296e-06, + "loss": 0.2783, + "step": 17730 + }, + { + "epoch": 0.6630371287707082, + "grad_norm": 0.41994425654411316, + "learning_rate": 5.098986852673239e-06, + "loss": 0.2465, + "step": 17735 + }, + { + "epoch": 0.663224057761058, + "grad_norm": 0.3337131440639496, + "learning_rate": 5.093868777308251e-06, + "loss": 0.3563, + "step": 17740 + }, + { + "epoch": 0.6634109867514079, + "grad_norm": 0.6296584010124207, + "learning_rate": 5.088752393927345e-06, + "loss": 0.2916, + "step": 17745 + }, + { + "epoch": 0.6635979157417576, + "grad_norm": 0.1845739334821701, + "learning_rate": 5.08363770429501e-06, + "loss": 0.2693, + "step": 17750 + }, + { + "epoch": 0.6637848447321074, + "grad_norm": 0.3726886510848999, + "learning_rate": 5.0785247101751645e-06, + "loss": 0.2992, + "step": 17755 + }, + { + "epoch": 0.6639717737224572, + "grad_norm": 0.26406246423721313, + "learning_rate": 5.073413413331128e-06, + "loss": 0.2997, + "step": 17760 + }, + { + "epoch": 0.664158702712807, + "grad_norm": 0.421535849571228, + "learning_rate": 5.068303815525639e-06, + "loss": 0.2156, + "step": 17765 + }, + { + "epoch": 0.6643456317031567, + "grad_norm": 0.40606680512428284, + "learning_rate": 5.0631959185208514e-06, + "loss": 0.2638, + "step": 17770 + }, + { + "epoch": 0.6645325606935065, + "grad_norm": 0.31488245725631714, + "learning_rate": 5.0580897240783365e-06, + "loss": 0.2754, + "step": 17775 + }, + { + "epoch": 0.6647194896838563, + "grad_norm": 0.4173290729522705, + "learning_rate": 5.052985233959069e-06, + "loss": 0.2472, + "step": 17780 + }, + { + "epoch": 0.6649064186742062, + "grad_norm": 0.4380118250846863, + "learning_rate": 5.047882449923444e-06, + "loss": 0.2227, + "step": 17785 + }, + { + "epoch": 0.665093347664556, + "grad_norm": 0.22158585488796234, + "learning_rate": 5.04278137373127e-06, + "loss": 0.2522, + "step": 17790 + }, + { + "epoch": 0.6652802766549057, + "grad_norm": 0.5004199147224426, + "learning_rate": 5.037682007141754e-06, + "loss": 0.3162, + "step": 17795 + }, + { + "epoch": 0.6654672056452555, + "grad_norm": 0.5138402581214905, + "learning_rate": 5.03258435191353e-06, + "loss": 0.2277, + "step": 17800 + }, + { + "epoch": 0.6656541346356053, + "grad_norm": 0.569689929485321, + "learning_rate": 5.027488409804624e-06, + "loss": 0.2435, + "step": 17805 + }, + { + "epoch": 0.6658410636259551, + "grad_norm": 0.50981605052948, + "learning_rate": 5.022394182572487e-06, + "loss": 0.2654, + "step": 17810 + }, + { + "epoch": 0.6660279926163049, + "grad_norm": 0.39164555072784424, + "learning_rate": 5.017301671973973e-06, + "loss": 0.2715, + "step": 17815 + }, + { + "epoch": 0.6662149216066546, + "grad_norm": 0.31828561425209045, + "learning_rate": 5.012210879765339e-06, + "loss": 0.2559, + "step": 17820 + }, + { + "epoch": 0.6664018505970044, + "grad_norm": 0.6788938641548157, + "learning_rate": 5.0071218077022495e-06, + "loss": 0.3659, + "step": 17825 + }, + { + "epoch": 0.6665887795873543, + "grad_norm": 0.2640472948551178, + "learning_rate": 5.002034457539786e-06, + "loss": 0.2719, + "step": 17830 + }, + { + "epoch": 0.6667757085777041, + "grad_norm": 0.3954325318336487, + "learning_rate": 4.996948831032431e-06, + "loss": 0.2705, + "step": 17835 + }, + { + "epoch": 0.6669626375680539, + "grad_norm": 0.25173521041870117, + "learning_rate": 4.991864929934065e-06, + "loss": 0.2495, + "step": 17840 + }, + { + "epoch": 0.6671495665584036, + "grad_norm": 0.3650529384613037, + "learning_rate": 4.986782755997987e-06, + "loss": 0.3812, + "step": 17845 + }, + { + "epoch": 0.6673364955487534, + "grad_norm": 0.5799317359924316, + "learning_rate": 4.981702310976887e-06, + "loss": 0.2965, + "step": 17850 + }, + { + "epoch": 0.6675234245391032, + "grad_norm": 0.3628406822681427, + "learning_rate": 4.97662359662287e-06, + "loss": 0.3047, + "step": 17855 + }, + { + "epoch": 0.667710353529453, + "grad_norm": 0.8548519015312195, + "learning_rate": 4.971546614687437e-06, + "loss": 0.3368, + "step": 17860 + }, + { + "epoch": 0.6678972825198027, + "grad_norm": 0.4021095037460327, + "learning_rate": 4.966471366921493e-06, + "loss": 0.3546, + "step": 17865 + }, + { + "epoch": 0.6680842115101526, + "grad_norm": 0.23902134597301483, + "learning_rate": 4.961397855075343e-06, + "loss": 0.3073, + "step": 17870 + }, + { + "epoch": 0.6682711405005024, + "grad_norm": 0.24928809702396393, + "learning_rate": 4.956326080898697e-06, + "loss": 0.2987, + "step": 17875 + }, + { + "epoch": 0.6684580694908522, + "grad_norm": 0.4533480703830719, + "learning_rate": 4.951256046140671e-06, + "loss": 0.3074, + "step": 17880 + }, + { + "epoch": 0.668644998481202, + "grad_norm": 0.3241102695465088, + "learning_rate": 4.946187752549766e-06, + "loss": 0.2708, + "step": 17885 + }, + { + "epoch": 0.6688319274715517, + "grad_norm": 0.5674617290496826, + "learning_rate": 4.9411212018738984e-06, + "loss": 0.3197, + "step": 17890 + }, + { + "epoch": 0.6690188564619015, + "grad_norm": 0.41915619373321533, + "learning_rate": 4.936056395860369e-06, + "loss": 0.3046, + "step": 17895 + }, + { + "epoch": 0.6692057854522513, + "grad_norm": 0.2280120998620987, + "learning_rate": 4.930993336255892e-06, + "loss": 0.2369, + "step": 17900 + }, + { + "epoch": 0.6693927144426011, + "grad_norm": 0.4832659661769867, + "learning_rate": 4.925932024806569e-06, + "loss": 0.3249, + "step": 17905 + }, + { + "epoch": 0.669579643432951, + "grad_norm": 0.5970139503479004, + "learning_rate": 4.9208724632579e-06, + "loss": 0.3852, + "step": 17910 + }, + { + "epoch": 0.6697665724233007, + "grad_norm": 0.3375006914138794, + "learning_rate": 4.915814653354779e-06, + "loss": 0.2146, + "step": 17915 + }, + { + "epoch": 0.6699535014136505, + "grad_norm": 0.3070432245731354, + "learning_rate": 4.910758596841504e-06, + "loss": 0.2901, + "step": 17920 + }, + { + "epoch": 0.6701404304040003, + "grad_norm": 0.4842531681060791, + "learning_rate": 4.905704295461767e-06, + "loss": 0.1967, + "step": 17925 + }, + { + "epoch": 0.6703273593943501, + "grad_norm": 0.24802975356578827, + "learning_rate": 4.900651750958645e-06, + "loss": 0.3403, + "step": 17930 + }, + { + "epoch": 0.6705142883846998, + "grad_norm": 0.32612282037734985, + "learning_rate": 4.895600965074623e-06, + "loss": 0.2696, + "step": 17935 + }, + { + "epoch": 0.6707012173750496, + "grad_norm": 0.3664465844631195, + "learning_rate": 4.890551939551565e-06, + "loss": 0.2614, + "step": 17940 + }, + { + "epoch": 0.6708881463653994, + "grad_norm": 0.4860963523387909, + "learning_rate": 4.885504676130743e-06, + "loss": 0.2449, + "step": 17945 + }, + { + "epoch": 0.6710750753557493, + "grad_norm": 0.38064733147621155, + "learning_rate": 4.880459176552808e-06, + "loss": 0.285, + "step": 17950 + }, + { + "epoch": 0.6712620043460991, + "grad_norm": 0.38366106152534485, + "learning_rate": 4.87541544255781e-06, + "loss": 0.3247, + "step": 17955 + }, + { + "epoch": 0.6714489333364488, + "grad_norm": 0.7270559668540955, + "learning_rate": 4.8703734758851854e-06, + "loss": 0.3462, + "step": 17960 + }, + { + "epoch": 0.6716358623267986, + "grad_norm": 0.23612673580646515, + "learning_rate": 4.865333278273768e-06, + "loss": 0.2441, + "step": 17965 + }, + { + "epoch": 0.6718227913171484, + "grad_norm": 0.24252595007419586, + "learning_rate": 4.860294851461774e-06, + "loss": 0.2234, + "step": 17970 + }, + { + "epoch": 0.6720097203074982, + "grad_norm": 0.5922762751579285, + "learning_rate": 4.8552581971868154e-06, + "loss": 0.2714, + "step": 17975 + }, + { + "epoch": 0.672196649297848, + "grad_norm": 0.477463960647583, + "learning_rate": 4.850223317185891e-06, + "loss": 0.2555, + "step": 17980 + }, + { + "epoch": 0.6723835782881977, + "grad_norm": 0.4230765700340271, + "learning_rate": 4.845190213195382e-06, + "loss": 0.2681, + "step": 17985 + }, + { + "epoch": 0.6725705072785476, + "grad_norm": 0.7639047503471375, + "learning_rate": 4.840158886951069e-06, + "loss": 0.4085, + "step": 17990 + }, + { + "epoch": 0.6727574362688974, + "grad_norm": 0.27008453011512756, + "learning_rate": 4.835129340188101e-06, + "loss": 0.2701, + "step": 17995 + }, + { + "epoch": 0.6729443652592472, + "grad_norm": 0.48787766695022583, + "learning_rate": 4.8301015746410385e-06, + "loss": 0.3341, + "step": 18000 + }, + { + "epoch": 0.673131294249597, + "grad_norm": 0.4696618914604187, + "learning_rate": 4.825075592043805e-06, + "loss": 0.245, + "step": 18005 + }, + { + "epoch": 0.6733182232399467, + "grad_norm": 0.2987324893474579, + "learning_rate": 4.82005139412972e-06, + "loss": 0.3017, + "step": 18010 + }, + { + "epoch": 0.6735051522302965, + "grad_norm": 0.35396862030029297, + "learning_rate": 4.81502898263148e-06, + "loss": 0.2776, + "step": 18015 + }, + { + "epoch": 0.6736920812206463, + "grad_norm": 0.1255078911781311, + "learning_rate": 4.810008359281176e-06, + "loss": 0.2967, + "step": 18020 + }, + { + "epoch": 0.673879010210996, + "grad_norm": 0.4665721356868744, + "learning_rate": 4.804989525810282e-06, + "loss": 0.3912, + "step": 18025 + }, + { + "epoch": 0.6740659392013459, + "grad_norm": 0.4541762173175812, + "learning_rate": 4.79997248394964e-06, + "loss": 0.3989, + "step": 18030 + }, + { + "epoch": 0.6742528681916957, + "grad_norm": 0.17799994349479675, + "learning_rate": 4.794957235429491e-06, + "loss": 0.3017, + "step": 18035 + }, + { + "epoch": 0.6744397971820455, + "grad_norm": 0.5030390024185181, + "learning_rate": 4.789943781979447e-06, + "loss": 0.2906, + "step": 18040 + }, + { + "epoch": 0.6746267261723953, + "grad_norm": 0.3190663158893585, + "learning_rate": 4.784932125328507e-06, + "loss": 0.2697, + "step": 18045 + }, + { + "epoch": 0.674813655162745, + "grad_norm": 0.5195890069007874, + "learning_rate": 4.779922267205048e-06, + "loss": 0.2506, + "step": 18050 + }, + { + "epoch": 0.6750005841530948, + "grad_norm": 0.48416566848754883, + "learning_rate": 4.774914209336824e-06, + "loss": 0.367, + "step": 18055 + }, + { + "epoch": 0.6751875131434446, + "grad_norm": 0.38188669085502625, + "learning_rate": 4.769907953450968e-06, + "loss": 0.277, + "step": 18060 + }, + { + "epoch": 0.6753744421337944, + "grad_norm": 0.4269798696041107, + "learning_rate": 4.764903501273999e-06, + "loss": 0.3099, + "step": 18065 + }, + { + "epoch": 0.6755613711241442, + "grad_norm": 0.5362254977226257, + "learning_rate": 4.75990085453181e-06, + "loss": 0.3038, + "step": 18070 + }, + { + "epoch": 0.675748300114494, + "grad_norm": 0.3357028067111969, + "learning_rate": 4.754900014949665e-06, + "loss": 0.2606, + "step": 18075 + }, + { + "epoch": 0.6759352291048438, + "grad_norm": 0.43899163603782654, + "learning_rate": 4.749900984252218e-06, + "loss": 0.308, + "step": 18080 + }, + { + "epoch": 0.6761221580951936, + "grad_norm": 0.2953328788280487, + "learning_rate": 4.744903764163483e-06, + "loss": 0.2463, + "step": 18085 + }, + { + "epoch": 0.6763090870855434, + "grad_norm": 0.2157057821750641, + "learning_rate": 4.739908356406866e-06, + "loss": 0.3184, + "step": 18090 + }, + { + "epoch": 0.6764960160758932, + "grad_norm": 0.5682590007781982, + "learning_rate": 4.7349147627051365e-06, + "loss": 0.2243, + "step": 18095 + }, + { + "epoch": 0.6766829450662429, + "grad_norm": 0.3066447675228119, + "learning_rate": 4.729922984780441e-06, + "loss": 0.2636, + "step": 18100 + }, + { + "epoch": 0.6768698740565927, + "grad_norm": 0.35900914669036865, + "learning_rate": 4.724933024354298e-06, + "loss": 0.2162, + "step": 18105 + }, + { + "epoch": 0.6770568030469425, + "grad_norm": 0.40200889110565186, + "learning_rate": 4.719944883147605e-06, + "loss": 0.292, + "step": 18110 + }, + { + "epoch": 0.6772437320372924, + "grad_norm": 0.23843464255332947, + "learning_rate": 4.714958562880633e-06, + "loss": 0.2521, + "step": 18115 + }, + { + "epoch": 0.6774306610276422, + "grad_norm": 0.6935325860977173, + "learning_rate": 4.709974065273013e-06, + "loss": 0.2634, + "step": 18120 + }, + { + "epoch": 0.6776175900179919, + "grad_norm": 0.1865166425704956, + "learning_rate": 4.704991392043763e-06, + "loss": 0.3864, + "step": 18125 + }, + { + "epoch": 0.6778045190083417, + "grad_norm": 0.6192229986190796, + "learning_rate": 4.700010544911257e-06, + "loss": 0.2523, + "step": 18130 + }, + { + "epoch": 0.6779914479986915, + "grad_norm": 0.22409962117671967, + "learning_rate": 4.695031525593254e-06, + "loss": 0.2784, + "step": 18135 + }, + { + "epoch": 0.6781783769890413, + "grad_norm": 0.118705615401268, + "learning_rate": 4.690054335806872e-06, + "loss": 0.2382, + "step": 18140 + }, + { + "epoch": 0.678365305979391, + "grad_norm": 0.39932093024253845, + "learning_rate": 4.6850789772685955e-06, + "loss": 0.223, + "step": 18145 + }, + { + "epoch": 0.6785522349697408, + "grad_norm": 0.8999297022819519, + "learning_rate": 4.6801054516942924e-06, + "loss": 0.2789, + "step": 18150 + }, + { + "epoch": 0.6787391639600907, + "grad_norm": 0.44918587803840637, + "learning_rate": 4.675133760799181e-06, + "loss": 0.3355, + "step": 18155 + }, + { + "epoch": 0.6789260929504405, + "grad_norm": 0.4980359673500061, + "learning_rate": 4.6701639062978624e-06, + "loss": 0.2608, + "step": 18160 + }, + { + "epoch": 0.6791130219407903, + "grad_norm": 0.30972304940223694, + "learning_rate": 4.6651958899042895e-06, + "loss": 0.207, + "step": 18165 + }, + { + "epoch": 0.67929995093114, + "grad_norm": 0.34727078676223755, + "learning_rate": 4.660229713331797e-06, + "loss": 0.2291, + "step": 18170 + }, + { + "epoch": 0.6794868799214898, + "grad_norm": 0.2929920256137848, + "learning_rate": 4.655265378293068e-06, + "loss": 0.2834, + "step": 18175 + }, + { + "epoch": 0.6796738089118396, + "grad_norm": 0.37373247742652893, + "learning_rate": 4.650302886500168e-06, + "loss": 0.3505, + "step": 18180 + }, + { + "epoch": 0.6798607379021894, + "grad_norm": 0.3892793655395508, + "learning_rate": 4.645342239664511e-06, + "loss": 0.287, + "step": 18185 + }, + { + "epoch": 0.6800476668925391, + "grad_norm": 0.21411560475826263, + "learning_rate": 4.640383439496888e-06, + "loss": 0.1991, + "step": 18190 + }, + { + "epoch": 0.680234595882889, + "grad_norm": 0.30452221632003784, + "learning_rate": 4.635426487707445e-06, + "loss": 0.2585, + "step": 18195 + }, + { + "epoch": 0.6804215248732388, + "grad_norm": 0.3732444941997528, + "learning_rate": 4.630471386005688e-06, + "loss": 0.2668, + "step": 18200 + }, + { + "epoch": 0.6806084538635886, + "grad_norm": 0.4895915687084198, + "learning_rate": 4.625518136100498e-06, + "loss": 0.3098, + "step": 18205 + }, + { + "epoch": 0.6807953828539384, + "grad_norm": 0.25300148129463196, + "learning_rate": 4.6205667397001e-06, + "loss": 0.2581, + "step": 18210 + }, + { + "epoch": 0.6809823118442881, + "grad_norm": 0.38394615054130554, + "learning_rate": 4.615617198512097e-06, + "loss": 0.2979, + "step": 18215 + }, + { + "epoch": 0.6811692408346379, + "grad_norm": 0.3098186254501343, + "learning_rate": 4.6106695142434355e-06, + "loss": 0.2881, + "step": 18220 + }, + { + "epoch": 0.6813561698249877, + "grad_norm": 0.5445083379745483, + "learning_rate": 4.60572368860044e-06, + "loss": 0.2482, + "step": 18225 + }, + { + "epoch": 0.6815430988153375, + "grad_norm": 0.3289521634578705, + "learning_rate": 4.600779723288774e-06, + "loss": 0.3112, + "step": 18230 + }, + { + "epoch": 0.6817300278056874, + "grad_norm": 0.6174635291099548, + "learning_rate": 4.595837620013478e-06, + "loss": 0.2644, + "step": 18235 + }, + { + "epoch": 0.6819169567960371, + "grad_norm": 0.3422519266605377, + "learning_rate": 4.5908973804789385e-06, + "loss": 0.2874, + "step": 18240 + }, + { + "epoch": 0.6821038857863869, + "grad_norm": 0.42430946230888367, + "learning_rate": 4.585959006388898e-06, + "loss": 0.288, + "step": 18245 + }, + { + "epoch": 0.6822908147767367, + "grad_norm": 0.4555477499961853, + "learning_rate": 4.581022499446468e-06, + "loss": 0.3142, + "step": 18250 + }, + { + "epoch": 0.6824777437670865, + "grad_norm": 1.2911605834960938, + "learning_rate": 4.576087861354101e-06, + "loss": 0.54, + "step": 18255 + }, + { + "epoch": 0.6826646727574363, + "grad_norm": 0.3238535523414612, + "learning_rate": 4.571155093813619e-06, + "loss": 0.2987, + "step": 18260 + }, + { + "epoch": 0.682851601747786, + "grad_norm": 0.2774864137172699, + "learning_rate": 4.5662241985261865e-06, + "loss": 0.2595, + "step": 18265 + }, + { + "epoch": 0.6830385307381358, + "grad_norm": 0.3135109543800354, + "learning_rate": 4.5612951771923345e-06, + "loss": 0.2846, + "step": 18270 + }, + { + "epoch": 0.6832254597284857, + "grad_norm": 0.6082348823547363, + "learning_rate": 4.556368031511932e-06, + "loss": 0.2893, + "step": 18275 + }, + { + "epoch": 0.6834123887188355, + "grad_norm": 0.5355720520019531, + "learning_rate": 4.551442763184221e-06, + "loss": 0.3026, + "step": 18280 + }, + { + "epoch": 0.6835993177091853, + "grad_norm": 0.6126582026481628, + "learning_rate": 4.546519373907778e-06, + "loss": 0.2891, + "step": 18285 + }, + { + "epoch": 0.683786246699535, + "grad_norm": 0.47081178426742554, + "learning_rate": 4.541597865380539e-06, + "loss": 0.245, + "step": 18290 + }, + { + "epoch": 0.6839731756898848, + "grad_norm": 0.6696823835372925, + "learning_rate": 4.536678239299797e-06, + "loss": 0.3148, + "step": 18295 + }, + { + "epoch": 0.6841601046802346, + "grad_norm": 0.41798293590545654, + "learning_rate": 4.531760497362181e-06, + "loss": 0.2587, + "step": 18300 + }, + { + "epoch": 0.6843470336705844, + "grad_norm": 0.4290241003036499, + "learning_rate": 4.526844641263689e-06, + "loss": 0.2839, + "step": 18305 + }, + { + "epoch": 0.6845339626609341, + "grad_norm": 0.4475858509540558, + "learning_rate": 4.521930672699651e-06, + "loss": 0.3038, + "step": 18310 + }, + { + "epoch": 0.6847208916512839, + "grad_norm": 0.3587439954280853, + "learning_rate": 4.517018593364761e-06, + "loss": 0.2565, + "step": 18315 + }, + { + "epoch": 0.6849078206416338, + "grad_norm": 0.6752985715866089, + "learning_rate": 4.512108404953048e-06, + "loss": 0.2591, + "step": 18320 + }, + { + "epoch": 0.6850947496319836, + "grad_norm": 0.3796422779560089, + "learning_rate": 4.507200109157901e-06, + "loss": 0.2285, + "step": 18325 + }, + { + "epoch": 0.6852816786223334, + "grad_norm": 0.23058107495307922, + "learning_rate": 4.502293707672044e-06, + "loss": 0.2528, + "step": 18330 + }, + { + "epoch": 0.6854686076126831, + "grad_norm": 0.29818716645240784, + "learning_rate": 4.497389202187562e-06, + "loss": 0.265, + "step": 18335 + }, + { + "epoch": 0.6856555366030329, + "grad_norm": 0.48177286982536316, + "learning_rate": 4.492486594395875e-06, + "loss": 0.2451, + "step": 18340 + }, + { + "epoch": 0.6858424655933827, + "grad_norm": 0.4235864281654358, + "learning_rate": 4.487585885987747e-06, + "loss": 0.3306, + "step": 18345 + }, + { + "epoch": 0.6860293945837325, + "grad_norm": 0.5081339478492737, + "learning_rate": 4.4826870786533e-06, + "loss": 0.3331, + "step": 18350 + }, + { + "epoch": 0.6862163235740822, + "grad_norm": 0.5094728469848633, + "learning_rate": 4.477790174081984e-06, + "loss": 0.2585, + "step": 18355 + }, + { + "epoch": 0.6864032525644321, + "grad_norm": 0.29224979877471924, + "learning_rate": 4.47289517396261e-06, + "loss": 0.2684, + "step": 18360 + }, + { + "epoch": 0.6865901815547819, + "grad_norm": 0.2801983058452606, + "learning_rate": 4.468002079983315e-06, + "loss": 0.3254, + "step": 18365 + }, + { + "epoch": 0.6867771105451317, + "grad_norm": 0.688702404499054, + "learning_rate": 4.463110893831596e-06, + "loss": 0.2631, + "step": 18370 + }, + { + "epoch": 0.6869640395354815, + "grad_norm": 0.43564876914024353, + "learning_rate": 4.458221617194273e-06, + "loss": 0.2727, + "step": 18375 + }, + { + "epoch": 0.6871509685258312, + "grad_norm": 0.28386905789375305, + "learning_rate": 4.453334251757526e-06, + "loss": 0.3505, + "step": 18380 + }, + { + "epoch": 0.687337897516181, + "grad_norm": 0.3805743157863617, + "learning_rate": 4.448448799206863e-06, + "loss": 0.2635, + "step": 18385 + }, + { + "epoch": 0.6875248265065308, + "grad_norm": 0.2629093825817108, + "learning_rate": 4.443565261227134e-06, + "loss": 0.2978, + "step": 18390 + }, + { + "epoch": 0.6877117554968806, + "grad_norm": 0.26355159282684326, + "learning_rate": 4.438683639502538e-06, + "loss": 0.3312, + "step": 18395 + }, + { + "epoch": 0.6878986844872305, + "grad_norm": 0.42087700963020325, + "learning_rate": 4.4338039357165985e-06, + "loss": 0.2332, + "step": 18400 + }, + { + "epoch": 0.6880856134775802, + "grad_norm": 0.42999762296676636, + "learning_rate": 4.428926151552194e-06, + "loss": 0.3205, + "step": 18405 + }, + { + "epoch": 0.68827254246793, + "grad_norm": 0.38532260060310364, + "learning_rate": 4.424050288691525e-06, + "loss": 0.2726, + "step": 18410 + }, + { + "epoch": 0.6884594714582798, + "grad_norm": 0.40023544430732727, + "learning_rate": 4.419176348816144e-06, + "loss": 0.2823, + "step": 18415 + }, + { + "epoch": 0.6886464004486296, + "grad_norm": 0.35504770278930664, + "learning_rate": 4.414304333606926e-06, + "loss": 0.2744, + "step": 18420 + }, + { + "epoch": 0.6888333294389793, + "grad_norm": 0.5017563700675964, + "learning_rate": 4.409434244744095e-06, + "loss": 0.3578, + "step": 18425 + }, + { + "epoch": 0.6890202584293291, + "grad_norm": 0.4949704706668854, + "learning_rate": 4.4045660839072045e-06, + "loss": 0.2341, + "step": 18430 + }, + { + "epoch": 0.6892071874196789, + "grad_norm": 0.40918684005737305, + "learning_rate": 4.399699852775138e-06, + "loss": 0.2464, + "step": 18435 + }, + { + "epoch": 0.6893941164100288, + "grad_norm": 0.7892428040504456, + "learning_rate": 4.394835553026128e-06, + "loss": 0.4378, + "step": 18440 + }, + { + "epoch": 0.6895810454003786, + "grad_norm": 0.2694770395755768, + "learning_rate": 4.3899731863377225e-06, + "loss": 0.2667, + "step": 18445 + }, + { + "epoch": 0.6897679743907283, + "grad_norm": 0.35795870423316956, + "learning_rate": 4.385112754386821e-06, + "loss": 0.2793, + "step": 18450 + }, + { + "epoch": 0.6899549033810781, + "grad_norm": 0.3480895161628723, + "learning_rate": 4.380254258849641e-06, + "loss": 0.2119, + "step": 18455 + }, + { + "epoch": 0.6901418323714279, + "grad_norm": 0.5051251649856567, + "learning_rate": 4.375397701401745e-06, + "loss": 0.2969, + "step": 18460 + }, + { + "epoch": 0.6903287613617777, + "grad_norm": 0.3847442865371704, + "learning_rate": 4.370543083718012e-06, + "loss": 0.2343, + "step": 18465 + }, + { + "epoch": 0.6905156903521275, + "grad_norm": 0.3477119207382202, + "learning_rate": 4.365690407472668e-06, + "loss": 0.2356, + "step": 18470 + }, + { + "epoch": 0.6907026193424772, + "grad_norm": 0.6154903769493103, + "learning_rate": 4.36083967433926e-06, + "loss": 0.2936, + "step": 18475 + }, + { + "epoch": 0.6908895483328271, + "grad_norm": 0.5352329611778259, + "learning_rate": 4.355990885990663e-06, + "loss": 0.2828, + "step": 18480 + }, + { + "epoch": 0.6910764773231769, + "grad_norm": 0.39464572072029114, + "learning_rate": 4.351144044099091e-06, + "loss": 0.2015, + "step": 18485 + }, + { + "epoch": 0.6912634063135267, + "grad_norm": 0.483270525932312, + "learning_rate": 4.346299150336074e-06, + "loss": 0.2555, + "step": 18490 + }, + { + "epoch": 0.6914503353038765, + "grad_norm": 0.39474979043006897, + "learning_rate": 4.341456206372485e-06, + "loss": 0.3986, + "step": 18495 + }, + { + "epoch": 0.6916372642942262, + "grad_norm": 0.5631116628646851, + "learning_rate": 4.336615213878509e-06, + "loss": 0.2867, + "step": 18500 + }, + { + "epoch": 0.691824193284576, + "grad_norm": 0.623786985874176, + "learning_rate": 4.331776174523673e-06, + "loss": 0.3206, + "step": 18505 + }, + { + "epoch": 0.6920111222749258, + "grad_norm": 0.2712309658527374, + "learning_rate": 4.326939089976815e-06, + "loss": 0.3039, + "step": 18510 + }, + { + "epoch": 0.6921980512652756, + "grad_norm": 0.34029409289360046, + "learning_rate": 4.322103961906113e-06, + "loss": 0.2136, + "step": 18515 + }, + { + "epoch": 0.6923849802556254, + "grad_norm": 0.52415931224823, + "learning_rate": 4.317270791979063e-06, + "loss": 0.1956, + "step": 18520 + }, + { + "epoch": 0.6925719092459752, + "grad_norm": 0.38135063648223877, + "learning_rate": 4.312439581862488e-06, + "loss": 0.2654, + "step": 18525 + }, + { + "epoch": 0.692758838236325, + "grad_norm": 0.8677012920379639, + "learning_rate": 4.307610333222532e-06, + "loss": 0.28, + "step": 18530 + }, + { + "epoch": 0.6929457672266748, + "grad_norm": 0.19962137937545776, + "learning_rate": 4.30278304772466e-06, + "loss": 0.2984, + "step": 18535 + }, + { + "epoch": 0.6931326962170246, + "grad_norm": 0.4274756610393524, + "learning_rate": 4.297957727033673e-06, + "loss": 0.2288, + "step": 18540 + }, + { + "epoch": 0.6933196252073743, + "grad_norm": 0.3446785509586334, + "learning_rate": 4.293134372813678e-06, + "loss": 0.2482, + "step": 18545 + }, + { + "epoch": 0.6935065541977241, + "grad_norm": 0.30156296491622925, + "learning_rate": 4.288312986728119e-06, + "loss": 0.254, + "step": 18550 + }, + { + "epoch": 0.6936934831880739, + "grad_norm": 0.20253904163837433, + "learning_rate": 4.283493570439746e-06, + "loss": 0.2544, + "step": 18555 + }, + { + "epoch": 0.6938804121784237, + "grad_norm": 0.2638651728630066, + "learning_rate": 4.278676125610644e-06, + "loss": 0.2614, + "step": 18560 + }, + { + "epoch": 0.6940673411687736, + "grad_norm": 0.2870006859302521, + "learning_rate": 4.2738606539022105e-06, + "loss": 0.3594, + "step": 18565 + }, + { + "epoch": 0.6942542701591233, + "grad_norm": 0.4544857442378998, + "learning_rate": 4.269047156975166e-06, + "loss": 0.3418, + "step": 18570 + }, + { + "epoch": 0.6944411991494731, + "grad_norm": 0.21792225539684296, + "learning_rate": 4.264235636489542e-06, + "loss": 0.3146, + "step": 18575 + }, + { + "epoch": 0.6946281281398229, + "grad_norm": 0.5876854658126831, + "learning_rate": 4.2594260941046935e-06, + "loss": 0.2753, + "step": 18580 + }, + { + "epoch": 0.6948150571301727, + "grad_norm": 0.4777786135673523, + "learning_rate": 4.254618531479301e-06, + "loss": 0.3524, + "step": 18585 + }, + { + "epoch": 0.6950019861205224, + "grad_norm": 0.3921513259410858, + "learning_rate": 4.249812950271347e-06, + "loss": 0.3673, + "step": 18590 + }, + { + "epoch": 0.6951889151108722, + "grad_norm": 0.284984290599823, + "learning_rate": 4.245009352138146e-06, + "loss": 0.3071, + "step": 18595 + }, + { + "epoch": 0.695375844101222, + "grad_norm": 0.4456053078174591, + "learning_rate": 4.240207738736315e-06, + "loss": 0.3151, + "step": 18600 + }, + { + "epoch": 0.6955627730915719, + "grad_norm": 0.5414977669715881, + "learning_rate": 4.235408111721796e-06, + "loss": 0.249, + "step": 18605 + }, + { + "epoch": 0.6957497020819217, + "grad_norm": 0.7804057598114014, + "learning_rate": 4.230610472749847e-06, + "loss": 0.3312, + "step": 18610 + }, + { + "epoch": 0.6959366310722714, + "grad_norm": 0.4957026541233063, + "learning_rate": 4.225814823475031e-06, + "loss": 0.222, + "step": 18615 + }, + { + "epoch": 0.6961235600626212, + "grad_norm": 0.2853618860244751, + "learning_rate": 4.221021165551232e-06, + "loss": 0.2572, + "step": 18620 + }, + { + "epoch": 0.696310489052971, + "grad_norm": 0.4198627769947052, + "learning_rate": 4.21622950063164e-06, + "loss": 0.2863, + "step": 18625 + }, + { + "epoch": 0.6964974180433208, + "grad_norm": 0.25387126207351685, + "learning_rate": 4.211439830368771e-06, + "loss": 0.2627, + "step": 18630 + }, + { + "epoch": 0.6966843470336705, + "grad_norm": 0.3733402192592621, + "learning_rate": 4.206652156414437e-06, + "loss": 0.3436, + "step": 18635 + }, + { + "epoch": 0.6968712760240203, + "grad_norm": 0.24343183636665344, + "learning_rate": 4.2018664804197784e-06, + "loss": 0.2873, + "step": 18640 + }, + { + "epoch": 0.6970582050143702, + "grad_norm": 0.2914699614048004, + "learning_rate": 4.19708280403523e-06, + "loss": 0.2815, + "step": 18645 + }, + { + "epoch": 0.69724513400472, + "grad_norm": 0.5060830116271973, + "learning_rate": 4.192301128910546e-06, + "loss": 0.2341, + "step": 18650 + }, + { + "epoch": 0.6974320629950698, + "grad_norm": 0.6719004511833191, + "learning_rate": 4.187521456694797e-06, + "loss": 0.2972, + "step": 18655 + }, + { + "epoch": 0.6976189919854195, + "grad_norm": 0.49988633394241333, + "learning_rate": 4.182743789036346e-06, + "loss": 0.3975, + "step": 18660 + }, + { + "epoch": 0.6978059209757693, + "grad_norm": 0.3273427188396454, + "learning_rate": 4.1779681275828795e-06, + "loss": 0.278, + "step": 18665 + }, + { + "epoch": 0.6979928499661191, + "grad_norm": 1.3453400135040283, + "learning_rate": 4.173194473981379e-06, + "loss": 0.3058, + "step": 18670 + }, + { + "epoch": 0.6981797789564689, + "grad_norm": 0.30567625164985657, + "learning_rate": 4.168422829878148e-06, + "loss": 0.3541, + "step": 18675 + }, + { + "epoch": 0.6983667079468187, + "grad_norm": 0.45551925897598267, + "learning_rate": 4.163653196918784e-06, + "loss": 0.334, + "step": 18680 + }, + { + "epoch": 0.6985536369371685, + "grad_norm": 0.33953145146369934, + "learning_rate": 4.158885576748205e-06, + "loss": 0.3156, + "step": 18685 + }, + { + "epoch": 0.6987405659275183, + "grad_norm": 0.5441725254058838, + "learning_rate": 4.154119971010616e-06, + "loss": 0.2464, + "step": 18690 + }, + { + "epoch": 0.6989274949178681, + "grad_norm": 0.25098875164985657, + "learning_rate": 4.149356381349544e-06, + "loss": 0.2644, + "step": 18695 + }, + { + "epoch": 0.6991144239082179, + "grad_norm": 0.35483863949775696, + "learning_rate": 4.144594809407818e-06, + "loss": 0.2527, + "step": 18700 + }, + { + "epoch": 0.6993013528985677, + "grad_norm": 0.6076955795288086, + "learning_rate": 4.139835256827559e-06, + "loss": 0.2728, + "step": 18705 + }, + { + "epoch": 0.6994882818889174, + "grad_norm": 0.26839420199394226, + "learning_rate": 4.135077725250209e-06, + "loss": 0.2568, + "step": 18710 + }, + { + "epoch": 0.6996752108792672, + "grad_norm": 0.3890112042427063, + "learning_rate": 4.130322216316502e-06, + "loss": 0.2509, + "step": 18715 + }, + { + "epoch": 0.699862139869617, + "grad_norm": 0.4457304775714874, + "learning_rate": 4.125568731666473e-06, + "loss": 0.2774, + "step": 18720 + }, + { + "epoch": 0.7000490688599669, + "grad_norm": 0.3597055971622467, + "learning_rate": 4.120817272939462e-06, + "loss": 0.2028, + "step": 18725 + }, + { + "epoch": 0.7002359978503166, + "grad_norm": 0.2370787411928177, + "learning_rate": 4.116067841774116e-06, + "loss": 0.3078, + "step": 18730 + }, + { + "epoch": 0.7004229268406664, + "grad_norm": 0.7779338359832764, + "learning_rate": 4.111320439808373e-06, + "loss": 0.3016, + "step": 18735 + }, + { + "epoch": 0.7006098558310162, + "grad_norm": 0.3370744585990906, + "learning_rate": 4.106575068679477e-06, + "loss": 0.274, + "step": 18740 + }, + { + "epoch": 0.700796784821366, + "grad_norm": 0.46344175934791565, + "learning_rate": 4.101831730023978e-06, + "loss": 0.3209, + "step": 18745 + }, + { + "epoch": 0.7009837138117158, + "grad_norm": 0.3550253212451935, + "learning_rate": 4.097090425477706e-06, + "loss": 0.2878, + "step": 18750 + }, + { + "epoch": 0.7011706428020655, + "grad_norm": 0.5622664093971252, + "learning_rate": 4.092351156675809e-06, + "loss": 0.2869, + "step": 18755 + }, + { + "epoch": 0.7013575717924153, + "grad_norm": 0.27434638142585754, + "learning_rate": 4.087613925252723e-06, + "loss": 0.3175, + "step": 18760 + }, + { + "epoch": 0.7015445007827652, + "grad_norm": 0.5039125084877014, + "learning_rate": 4.082878732842185e-06, + "loss": 0.279, + "step": 18765 + }, + { + "epoch": 0.701731429773115, + "grad_norm": 0.37588292360305786, + "learning_rate": 4.07814558107722e-06, + "loss": 0.2938, + "step": 18770 + }, + { + "epoch": 0.7019183587634648, + "grad_norm": 0.36863166093826294, + "learning_rate": 4.073414471590165e-06, + "loss": 0.3311, + "step": 18775 + }, + { + "epoch": 0.7021052877538145, + "grad_norm": 0.3147136867046356, + "learning_rate": 4.068685406012637e-06, + "loss": 0.3818, + "step": 18780 + }, + { + "epoch": 0.7022922167441643, + "grad_norm": 0.3484380841255188, + "learning_rate": 4.06395838597556e-06, + "loss": 0.3061, + "step": 18785 + }, + { + "epoch": 0.7024791457345141, + "grad_norm": 0.34823089838027954, + "learning_rate": 4.059233413109148e-06, + "loss": 0.2871, + "step": 18790 + }, + { + "epoch": 0.7026660747248639, + "grad_norm": 0.21362809836864471, + "learning_rate": 4.054510489042906e-06, + "loss": 0.2635, + "step": 18795 + }, + { + "epoch": 0.7028530037152136, + "grad_norm": 0.4324122369289398, + "learning_rate": 4.049789615405638e-06, + "loss": 0.2149, + "step": 18800 + }, + { + "epoch": 0.7030399327055634, + "grad_norm": 0.6976597905158997, + "learning_rate": 4.0450707938254385e-06, + "loss": 0.2078, + "step": 18805 + }, + { + "epoch": 0.7032268616959133, + "grad_norm": 0.7980237603187561, + "learning_rate": 4.0403540259296905e-06, + "loss": 0.3072, + "step": 18810 + }, + { + "epoch": 0.7034137906862631, + "grad_norm": 0.3287063241004944, + "learning_rate": 4.03563931334507e-06, + "loss": 0.2836, + "step": 18815 + }, + { + "epoch": 0.7036007196766129, + "grad_norm": 0.6408100128173828, + "learning_rate": 4.030926657697554e-06, + "loss": 0.2664, + "step": 18820 + }, + { + "epoch": 0.7037876486669626, + "grad_norm": 0.44458410143852234, + "learning_rate": 4.0262160606123946e-06, + "loss": 0.3106, + "step": 18825 + }, + { + "epoch": 0.7039745776573124, + "grad_norm": 0.19885852932929993, + "learning_rate": 4.021507523714145e-06, + "loss": 0.2218, + "step": 18830 + }, + { + "epoch": 0.7041615066476622, + "grad_norm": 0.46404361724853516, + "learning_rate": 4.016801048626648e-06, + "loss": 0.251, + "step": 18835 + }, + { + "epoch": 0.704348435638012, + "grad_norm": 0.6827042102813721, + "learning_rate": 4.012096636973027e-06, + "loss": 0.2417, + "step": 18840 + }, + { + "epoch": 0.7045353646283617, + "grad_norm": 0.3932143747806549, + "learning_rate": 4.007394290375703e-06, + "loss": 0.3334, + "step": 18845 + }, + { + "epoch": 0.7047222936187116, + "grad_norm": 0.5734818577766418, + "learning_rate": 4.002694010456379e-06, + "loss": 0.3273, + "step": 18850 + }, + { + "epoch": 0.7049092226090614, + "grad_norm": 0.6905113458633423, + "learning_rate": 3.997995798836046e-06, + "loss": 0.2476, + "step": 18855 + }, + { + "epoch": 0.7050961515994112, + "grad_norm": 0.5125166177749634, + "learning_rate": 3.993299657134979e-06, + "loss": 0.2125, + "step": 18860 + }, + { + "epoch": 0.705283080589761, + "grad_norm": 0.22294095158576965, + "learning_rate": 3.98860558697275e-06, + "loss": 0.2655, + "step": 18865 + }, + { + "epoch": 0.7054700095801107, + "grad_norm": 0.5617648959159851, + "learning_rate": 3.983913589968202e-06, + "loss": 0.3351, + "step": 18870 + }, + { + "epoch": 0.7056569385704605, + "grad_norm": 0.2911111116409302, + "learning_rate": 3.979223667739475e-06, + "loss": 0.2118, + "step": 18875 + }, + { + "epoch": 0.7058438675608103, + "grad_norm": 0.49967023730278015, + "learning_rate": 3.974535821903992e-06, + "loss": 0.2808, + "step": 18880 + }, + { + "epoch": 0.7060307965511601, + "grad_norm": 0.20392078161239624, + "learning_rate": 3.969850054078448e-06, + "loss": 0.2092, + "step": 18885 + }, + { + "epoch": 0.70621772554151, + "grad_norm": 0.31886929273605347, + "learning_rate": 3.965166365878839e-06, + "loss": 0.2398, + "step": 18890 + }, + { + "epoch": 0.7064046545318597, + "grad_norm": 0.4940277934074402, + "learning_rate": 3.960484758920426e-06, + "loss": 0.2671, + "step": 18895 + }, + { + "epoch": 0.7065915835222095, + "grad_norm": 0.4553227424621582, + "learning_rate": 3.95580523481777e-06, + "loss": 0.3112, + "step": 18900 + }, + { + "epoch": 0.7067785125125593, + "grad_norm": 0.4576134979724884, + "learning_rate": 3.9511277951847e-06, + "loss": 0.302, + "step": 18905 + }, + { + "epoch": 0.7069654415029091, + "grad_norm": 0.4832530617713928, + "learning_rate": 3.946452441634332e-06, + "loss": 0.2758, + "step": 18910 + }, + { + "epoch": 0.7071523704932589, + "grad_norm": 0.4215802550315857, + "learning_rate": 3.9417791757790565e-06, + "loss": 0.2889, + "step": 18915 + }, + { + "epoch": 0.7073392994836086, + "grad_norm": 0.6536740660667419, + "learning_rate": 3.937107999230554e-06, + "loss": 0.3214, + "step": 18920 + }, + { + "epoch": 0.7075262284739584, + "grad_norm": 0.4992019832134247, + "learning_rate": 3.932438913599781e-06, + "loss": 0.2528, + "step": 18925 + }, + { + "epoch": 0.7077131574643083, + "grad_norm": 0.21796809136867523, + "learning_rate": 3.927771920496967e-06, + "loss": 0.3074, + "step": 18930 + }, + { + "epoch": 0.7079000864546581, + "grad_norm": 0.25290024280548096, + "learning_rate": 3.923107021531629e-06, + "loss": 0.3147, + "step": 18935 + }, + { + "epoch": 0.7080870154450079, + "grad_norm": 0.2794661521911621, + "learning_rate": 3.918444218312551e-06, + "loss": 0.304, + "step": 18940 + }, + { + "epoch": 0.7082739444353576, + "grad_norm": 0.9677742719650269, + "learning_rate": 3.913783512447806e-06, + "loss": 0.3212, + "step": 18945 + }, + { + "epoch": 0.7084608734257074, + "grad_norm": 0.3141147792339325, + "learning_rate": 3.909124905544737e-06, + "loss": 0.2581, + "step": 18950 + }, + { + "epoch": 0.7086478024160572, + "grad_norm": 0.8014910817146301, + "learning_rate": 3.9044683992099616e-06, + "loss": 0.3311, + "step": 18955 + }, + { + "epoch": 0.708834731406407, + "grad_norm": 0.2427234798669815, + "learning_rate": 3.899813995049373e-06, + "loss": 0.285, + "step": 18960 + }, + { + "epoch": 0.7090216603967567, + "grad_norm": 0.49129942059516907, + "learning_rate": 3.895161694668144e-06, + "loss": 0.2669, + "step": 18965 + }, + { + "epoch": 0.7092085893871066, + "grad_norm": 0.351224422454834, + "learning_rate": 3.890511499670726e-06, + "loss": 0.2503, + "step": 18970 + }, + { + "epoch": 0.7093955183774564, + "grad_norm": 0.4046824872493744, + "learning_rate": 3.885863411660829e-06, + "loss": 0.2601, + "step": 18975 + }, + { + "epoch": 0.7095824473678062, + "grad_norm": 0.49013751745224, + "learning_rate": 3.881217432241451e-06, + "loss": 0.2791, + "step": 18980 + }, + { + "epoch": 0.709769376358156, + "grad_norm": 0.3391354978084564, + "learning_rate": 3.876573563014854e-06, + "loss": 0.2573, + "step": 18985 + }, + { + "epoch": 0.7099563053485057, + "grad_norm": 0.28667691349983215, + "learning_rate": 3.8719318055825785e-06, + "loss": 0.2763, + "step": 18990 + }, + { + "epoch": 0.7101432343388555, + "grad_norm": 0.2968943417072296, + "learning_rate": 3.8672921615454325e-06, + "loss": 0.2952, + "step": 18995 + }, + { + "epoch": 0.7103301633292053, + "grad_norm": 0.2967950403690338, + "learning_rate": 3.862654632503495e-06, + "loss": 0.3106, + "step": 19000 + }, + { + "epoch": 0.7105170923195551, + "grad_norm": 0.21600590646266937, + "learning_rate": 3.858019220056115e-06, + "loss": 0.1917, + "step": 19005 + }, + { + "epoch": 0.710704021309905, + "grad_norm": 0.5353944897651672, + "learning_rate": 3.853385925801916e-06, + "loss": 0.2285, + "step": 19010 + }, + { + "epoch": 0.7108909503002547, + "grad_norm": 0.44608166813850403, + "learning_rate": 3.848754751338792e-06, + "loss": 0.2501, + "step": 19015 + }, + { + "epoch": 0.7110778792906045, + "grad_norm": 0.38213208317756653, + "learning_rate": 3.844125698263896e-06, + "loss": 0.2357, + "step": 19020 + }, + { + "epoch": 0.7112648082809543, + "grad_norm": 0.2469312697649002, + "learning_rate": 3.8394987681736596e-06, + "loss": 0.2981, + "step": 19025 + }, + { + "epoch": 0.7114517372713041, + "grad_norm": 0.30986660718917847, + "learning_rate": 3.834873962663775e-06, + "loss": 0.3125, + "step": 19030 + }, + { + "epoch": 0.7116386662616538, + "grad_norm": 0.5508708357810974, + "learning_rate": 3.830251283329211e-06, + "loss": 0.2756, + "step": 19035 + }, + { + "epoch": 0.7118255952520036, + "grad_norm": 0.44314640760421753, + "learning_rate": 3.825630731764195e-06, + "loss": 0.3161, + "step": 19040 + }, + { + "epoch": 0.7120125242423534, + "grad_norm": 0.3538496494293213, + "learning_rate": 3.8210123095622164e-06, + "loss": 0.2465, + "step": 19045 + }, + { + "epoch": 0.7121994532327032, + "grad_norm": 0.4966811537742615, + "learning_rate": 3.816396018316047e-06, + "loss": 0.2047, + "step": 19050 + }, + { + "epoch": 0.7123863822230531, + "grad_norm": 0.4113473892211914, + "learning_rate": 3.8117818596177035e-06, + "loss": 0.232, + "step": 19055 + }, + { + "epoch": 0.7125733112134028, + "grad_norm": 0.42282649874687195, + "learning_rate": 3.807169835058485e-06, + "loss": 0.3454, + "step": 19060 + }, + { + "epoch": 0.7127602402037526, + "grad_norm": 0.495076060295105, + "learning_rate": 3.8025599462289407e-06, + "loss": 0.2909, + "step": 19065 + }, + { + "epoch": 0.7129471691941024, + "grad_norm": 0.4929969608783722, + "learning_rate": 3.7979521947188946e-06, + "loss": 0.3251, + "step": 19070 + }, + { + "epoch": 0.7131340981844522, + "grad_norm": 0.2015666663646698, + "learning_rate": 3.793346582117422e-06, + "loss": 0.2641, + "step": 19075 + }, + { + "epoch": 0.713321027174802, + "grad_norm": 0.42519763112068176, + "learning_rate": 3.7887431100128746e-06, + "loss": 0.3006, + "step": 19080 + }, + { + "epoch": 0.7135079561651517, + "grad_norm": 0.543765664100647, + "learning_rate": 3.784141779992849e-06, + "loss": 0.319, + "step": 19085 + }, + { + "epoch": 0.7136948851555015, + "grad_norm": 0.39010390639305115, + "learning_rate": 3.77954259364422e-06, + "loss": 0.2191, + "step": 19090 + }, + { + "epoch": 0.7138818141458514, + "grad_norm": 0.6654005646705627, + "learning_rate": 3.7749455525531122e-06, + "loss": 0.2967, + "step": 19095 + }, + { + "epoch": 0.7140687431362012, + "grad_norm": 0.3600732982158661, + "learning_rate": 3.7703506583049097e-06, + "loss": 0.333, + "step": 19100 + }, + { + "epoch": 0.714255672126551, + "grad_norm": 0.42062684893608093, + "learning_rate": 3.765757912484266e-06, + "loss": 0.3078, + "step": 19105 + }, + { + "epoch": 0.7144426011169007, + "grad_norm": 0.324552983045578, + "learning_rate": 3.7611673166750816e-06, + "loss": 0.2595, + "step": 19110 + }, + { + "epoch": 0.7146295301072505, + "grad_norm": 0.522148072719574, + "learning_rate": 3.7565788724605278e-06, + "loss": 0.2575, + "step": 19115 + }, + { + "epoch": 0.7148164590976003, + "grad_norm": 0.2913423776626587, + "learning_rate": 3.751992581423021e-06, + "loss": 0.2701, + "step": 19120 + }, + { + "epoch": 0.71500338808795, + "grad_norm": 0.5544187426567078, + "learning_rate": 3.7474084451442484e-06, + "loss": 0.2332, + "step": 19125 + }, + { + "epoch": 0.7151903170782998, + "grad_norm": 0.7291717529296875, + "learning_rate": 3.7428264652051393e-06, + "loss": 0.3321, + "step": 19130 + }, + { + "epoch": 0.7153772460686497, + "grad_norm": 0.5494731664657593, + "learning_rate": 3.7382466431858966e-06, + "loss": 0.318, + "step": 19135 + }, + { + "epoch": 0.7155641750589995, + "grad_norm": 0.3049234449863434, + "learning_rate": 3.733668980665963e-06, + "loss": 0.2599, + "step": 19140 + }, + { + "epoch": 0.7157511040493493, + "grad_norm": 0.3012910783290863, + "learning_rate": 3.729093479224043e-06, + "loss": 0.2457, + "step": 19145 + }, + { + "epoch": 0.715938033039699, + "grad_norm": 0.23188547790050507, + "learning_rate": 3.7245201404381006e-06, + "loss": 0.2909, + "step": 19150 + }, + { + "epoch": 0.7161249620300488, + "grad_norm": 0.1563846319913864, + "learning_rate": 3.7199489658853428e-06, + "loss": 0.3587, + "step": 19155 + }, + { + "epoch": 0.7163118910203986, + "grad_norm": 0.34965288639068604, + "learning_rate": 3.7153799571422444e-06, + "loss": 0.2779, + "step": 19160 + }, + { + "epoch": 0.7164988200107484, + "grad_norm": 0.3783499300479889, + "learning_rate": 3.710813115784517e-06, + "loss": 0.3366, + "step": 19165 + }, + { + "epoch": 0.7166857490010982, + "grad_norm": 0.5413819551467896, + "learning_rate": 3.7062484433871426e-06, + "loss": 0.3233, + "step": 19170 + }, + { + "epoch": 0.716872677991448, + "grad_norm": 0.43029576539993286, + "learning_rate": 3.7016859415243377e-06, + "loss": 0.2896, + "step": 19175 + }, + { + "epoch": 0.7170596069817978, + "grad_norm": 0.23014762997627258, + "learning_rate": 3.6971256117695853e-06, + "loss": 0.34, + "step": 19180 + }, + { + "epoch": 0.7172465359721476, + "grad_norm": 0.5080450773239136, + "learning_rate": 3.692567455695609e-06, + "loss": 0.2345, + "step": 19185 + }, + { + "epoch": 0.7174334649624974, + "grad_norm": 0.36543214321136475, + "learning_rate": 3.6880114748743832e-06, + "loss": 0.3117, + "step": 19190 + }, + { + "epoch": 0.7176203939528472, + "grad_norm": 0.38558727502822876, + "learning_rate": 3.683457670877142e-06, + "loss": 0.2428, + "step": 19195 + }, + { + "epoch": 0.7178073229431969, + "grad_norm": 0.16458238661289215, + "learning_rate": 3.678906045274355e-06, + "loss": 0.2913, + "step": 19200 + }, + { + "epoch": 0.7179942519335467, + "grad_norm": 0.4184258282184601, + "learning_rate": 3.6743565996357534e-06, + "loss": 0.3226, + "step": 19205 + }, + { + "epoch": 0.7181811809238965, + "grad_norm": 0.33344846963882446, + "learning_rate": 3.6698093355303054e-06, + "loss": 0.3476, + "step": 19210 + }, + { + "epoch": 0.7183681099142464, + "grad_norm": 0.9731647372245789, + "learning_rate": 3.6652642545262374e-06, + "loss": 0.4277, + "step": 19215 + }, + { + "epoch": 0.7185550389045962, + "grad_norm": 0.38592931628227234, + "learning_rate": 3.6607213581910116e-06, + "loss": 0.3507, + "step": 19220 + }, + { + "epoch": 0.7187419678949459, + "grad_norm": 0.5030714273452759, + "learning_rate": 3.65618064809135e-06, + "loss": 0.2249, + "step": 19225 + }, + { + "epoch": 0.7189288968852957, + "grad_norm": 0.3900645673274994, + "learning_rate": 3.6516421257932054e-06, + "loss": 0.304, + "step": 19230 + }, + { + "epoch": 0.7191158258756455, + "grad_norm": 0.4885767698287964, + "learning_rate": 3.6471057928617913e-06, + "loss": 0.2138, + "step": 19235 + }, + { + "epoch": 0.7193027548659953, + "grad_norm": 0.33923253417015076, + "learning_rate": 3.6425716508615574e-06, + "loss": 0.2623, + "step": 19240 + }, + { + "epoch": 0.719489683856345, + "grad_norm": 0.3621087670326233, + "learning_rate": 3.638039701356193e-06, + "loss": 0.2484, + "step": 19245 + }, + { + "epoch": 0.7196766128466948, + "grad_norm": 0.18470481038093567, + "learning_rate": 3.6335099459086453e-06, + "loss": 0.3044, + "step": 19250 + }, + { + "epoch": 0.7198635418370447, + "grad_norm": 0.3943520188331604, + "learning_rate": 3.6289823860810926e-06, + "loss": 0.2953, + "step": 19255 + }, + { + "epoch": 0.7200504708273945, + "grad_norm": 0.47943511605262756, + "learning_rate": 3.624457023434964e-06, + "loss": 0.3055, + "step": 19260 + }, + { + "epoch": 0.7202373998177443, + "grad_norm": 0.5454005002975464, + "learning_rate": 3.619933859530923e-06, + "loss": 0.2572, + "step": 19265 + }, + { + "epoch": 0.720424328808094, + "grad_norm": 0.5797293186187744, + "learning_rate": 3.6154128959288847e-06, + "loss": 0.311, + "step": 19270 + }, + { + "epoch": 0.7206112577984438, + "grad_norm": 0.2344737946987152, + "learning_rate": 3.610894134187993e-06, + "loss": 0.2628, + "step": 19275 + }, + { + "epoch": 0.7207981867887936, + "grad_norm": 0.279786616563797, + "learning_rate": 3.6063775758666485e-06, + "loss": 0.2749, + "step": 19280 + }, + { + "epoch": 0.7209851157791434, + "grad_norm": 0.28806987404823303, + "learning_rate": 3.601863222522477e-06, + "loss": 0.4239, + "step": 19285 + }, + { + "epoch": 0.7211720447694931, + "grad_norm": 0.3944340646266937, + "learning_rate": 3.5973510757123464e-06, + "loss": 0.3092, + "step": 19290 + }, + { + "epoch": 0.7213589737598429, + "grad_norm": 0.7157394886016846, + "learning_rate": 3.5928411369923743e-06, + "loss": 0.3422, + "step": 19295 + }, + { + "epoch": 0.7215459027501928, + "grad_norm": 0.6664456129074097, + "learning_rate": 3.5883334079179023e-06, + "loss": 0.2451, + "step": 19300 + }, + { + "epoch": 0.7217328317405426, + "grad_norm": 0.32842952013015747, + "learning_rate": 3.583827890043524e-06, + "loss": 0.2177, + "step": 19305 + }, + { + "epoch": 0.7219197607308924, + "grad_norm": 0.3629484474658966, + "learning_rate": 3.5793245849230563e-06, + "loss": 0.2519, + "step": 19310 + }, + { + "epoch": 0.7221066897212421, + "grad_norm": 0.506600022315979, + "learning_rate": 3.574823494109567e-06, + "loss": 0.3817, + "step": 19315 + }, + { + "epoch": 0.7222936187115919, + "grad_norm": 0.38654983043670654, + "learning_rate": 3.5703246191553463e-06, + "loss": 0.2811, + "step": 19320 + }, + { + "epoch": 0.7224805477019417, + "grad_norm": 0.488619863986969, + "learning_rate": 3.565827961611935e-06, + "loss": 0.2265, + "step": 19325 + }, + { + "epoch": 0.7226674766922915, + "grad_norm": 0.38500773906707764, + "learning_rate": 3.561333523030097e-06, + "loss": 0.2155, + "step": 19330 + }, + { + "epoch": 0.7228544056826413, + "grad_norm": 0.3992687463760376, + "learning_rate": 3.5568413049598326e-06, + "loss": 0.2698, + "step": 19335 + }, + { + "epoch": 0.7230413346729911, + "grad_norm": 0.5223041772842407, + "learning_rate": 3.552351308950386e-06, + "loss": 0.3776, + "step": 19340 + }, + { + "epoch": 0.7232282636633409, + "grad_norm": 0.32374194264411926, + "learning_rate": 3.54786353655022e-06, + "loss": 0.2399, + "step": 19345 + }, + { + "epoch": 0.7234151926536907, + "grad_norm": 0.3728601038455963, + "learning_rate": 3.5433779893070477e-06, + "loss": 0.2624, + "step": 19350 + }, + { + "epoch": 0.7236021216440405, + "grad_norm": 0.29935961961746216, + "learning_rate": 3.538894668767797e-06, + "loss": 0.2591, + "step": 19355 + }, + { + "epoch": 0.7237890506343903, + "grad_norm": 0.3965166211128235, + "learning_rate": 3.534413576478645e-06, + "loss": 0.2904, + "step": 19360 + }, + { + "epoch": 0.72397597962474, + "grad_norm": 0.32862594723701477, + "learning_rate": 3.5299347139849836e-06, + "loss": 0.3974, + "step": 19365 + }, + { + "epoch": 0.7241629086150898, + "grad_norm": 0.3905617594718933, + "learning_rate": 3.5254580828314524e-06, + "loss": 0.2661, + "step": 19370 + }, + { + "epoch": 0.7243498376054396, + "grad_norm": 0.7117414474487305, + "learning_rate": 3.5209836845619093e-06, + "loss": 0.2346, + "step": 19375 + }, + { + "epoch": 0.7245367665957895, + "grad_norm": 0.2594413757324219, + "learning_rate": 3.5165115207194435e-06, + "loss": 0.2891, + "step": 19380 + }, + { + "epoch": 0.7247236955861392, + "grad_norm": 0.6064930558204651, + "learning_rate": 3.5120415928463813e-06, + "loss": 0.2849, + "step": 19385 + }, + { + "epoch": 0.724910624576489, + "grad_norm": 0.34241193532943726, + "learning_rate": 3.507573902484267e-06, + "loss": 0.2779, + "step": 19390 + }, + { + "epoch": 0.7250975535668388, + "grad_norm": 0.4620073735713959, + "learning_rate": 3.5031084511738855e-06, + "loss": 0.3194, + "step": 19395 + }, + { + "epoch": 0.7252844825571886, + "grad_norm": 0.36498600244522095, + "learning_rate": 3.4986452404552362e-06, + "loss": 0.3193, + "step": 19400 + }, + { + "epoch": 0.7254714115475384, + "grad_norm": 0.24594563245773315, + "learning_rate": 3.49418427186756e-06, + "loss": 0.3006, + "step": 19405 + }, + { + "epoch": 0.7256583405378881, + "grad_norm": 0.6980525255203247, + "learning_rate": 3.4897255469493096e-06, + "loss": 0.2412, + "step": 19410 + }, + { + "epoch": 0.7258452695282379, + "grad_norm": 0.5322422385215759, + "learning_rate": 3.4852690672381785e-06, + "loss": 0.2604, + "step": 19415 + }, + { + "epoch": 0.7260321985185878, + "grad_norm": 0.27667078375816345, + "learning_rate": 3.480814834271072e-06, + "loss": 0.306, + "step": 19420 + }, + { + "epoch": 0.7262191275089376, + "grad_norm": 0.38868507742881775, + "learning_rate": 3.476362849584134e-06, + "loss": 0.2896, + "step": 19425 + }, + { + "epoch": 0.7264060564992874, + "grad_norm": 0.2022697925567627, + "learning_rate": 3.4719131147127237e-06, + "loss": 0.3543, + "step": 19430 + }, + { + "epoch": 0.7265929854896371, + "grad_norm": 0.41005682945251465, + "learning_rate": 3.4674656311914233e-06, + "loss": 0.3022, + "step": 19435 + }, + { + "epoch": 0.7267799144799869, + "grad_norm": 0.33465057611465454, + "learning_rate": 3.463020400554049e-06, + "loss": 0.3551, + "step": 19440 + }, + { + "epoch": 0.7269668434703367, + "grad_norm": 0.3077739477157593, + "learning_rate": 3.4585774243336277e-06, + "loss": 0.2626, + "step": 19445 + }, + { + "epoch": 0.7271537724606865, + "grad_norm": 1.222378134727478, + "learning_rate": 3.45413670406242e-06, + "loss": 0.3006, + "step": 19450 + }, + { + "epoch": 0.7273407014510362, + "grad_norm": 0.28431418538093567, + "learning_rate": 3.449698241271897e-06, + "loss": 0.2822, + "step": 19455 + }, + { + "epoch": 0.7275276304413861, + "grad_norm": 0.5643324851989746, + "learning_rate": 3.445262037492765e-06, + "loss": 0.306, + "step": 19460 + }, + { + "epoch": 0.7277145594317359, + "grad_norm": 0.39042869210243225, + "learning_rate": 3.4408280942549343e-06, + "loss": 0.2474, + "step": 19465 + }, + { + "epoch": 0.7279014884220857, + "grad_norm": 0.5540686249732971, + "learning_rate": 3.436396413087555e-06, + "loss": 0.2843, + "step": 19470 + }, + { + "epoch": 0.7280884174124355, + "grad_norm": 0.3987734317779541, + "learning_rate": 3.4319669955189806e-06, + "loss": 0.34, + "step": 19475 + }, + { + "epoch": 0.7282753464027852, + "grad_norm": 0.4319436252117157, + "learning_rate": 3.427539843076788e-06, + "loss": 0.3197, + "step": 19480 + }, + { + "epoch": 0.728462275393135, + "grad_norm": 0.3150979280471802, + "learning_rate": 3.423114957287783e-06, + "loss": 0.3086, + "step": 19485 + }, + { + "epoch": 0.7286492043834848, + "grad_norm": 0.34863778948783875, + "learning_rate": 3.4186923396779735e-06, + "loss": 0.2332, + "step": 19490 + }, + { + "epoch": 0.7288361333738346, + "grad_norm": 0.41178667545318604, + "learning_rate": 3.414271991772602e-06, + "loss": 0.2602, + "step": 19495 + }, + { + "epoch": 0.7290230623641845, + "grad_norm": 0.5623154044151306, + "learning_rate": 3.4098539150961107e-06, + "loss": 0.2037, + "step": 19500 + }, + { + "epoch": 0.7292099913545342, + "grad_norm": 0.26398512721061707, + "learning_rate": 3.4054381111721767e-06, + "loss": 0.2504, + "step": 19505 + }, + { + "epoch": 0.729396920344884, + "grad_norm": 0.4237319231033325, + "learning_rate": 3.4010245815236775e-06, + "loss": 0.2686, + "step": 19510 + }, + { + "epoch": 0.7295838493352338, + "grad_norm": 0.3419848084449768, + "learning_rate": 3.3966133276727178e-06, + "loss": 0.238, + "step": 19515 + }, + { + "epoch": 0.7297707783255836, + "grad_norm": 0.34581613540649414, + "learning_rate": 3.392204351140611e-06, + "loss": 0.2861, + "step": 19520 + }, + { + "epoch": 0.7299577073159333, + "grad_norm": 0.27548453211784363, + "learning_rate": 3.3877976534478816e-06, + "loss": 0.3436, + "step": 19525 + }, + { + "epoch": 0.7301446363062831, + "grad_norm": 0.5232365727424622, + "learning_rate": 3.383393236114283e-06, + "loss": 0.2877, + "step": 19530 + }, + { + "epoch": 0.7303315652966329, + "grad_norm": 0.36319199204444885, + "learning_rate": 3.378991100658764e-06, + "loss": 0.4038, + "step": 19535 + }, + { + "epoch": 0.7305184942869827, + "grad_norm": 0.3827814757823944, + "learning_rate": 3.3745912485995e-06, + "loss": 0.2816, + "step": 19540 + }, + { + "epoch": 0.7307054232773326, + "grad_norm": 0.7302720546722412, + "learning_rate": 3.370193681453872e-06, + "loss": 0.3484, + "step": 19545 + }, + { + "epoch": 0.7308923522676823, + "grad_norm": 0.3170529901981354, + "learning_rate": 3.3657984007384757e-06, + "loss": 0.2089, + "step": 19550 + }, + { + "epoch": 0.7310792812580321, + "grad_norm": 0.6026202440261841, + "learning_rate": 3.361405407969115e-06, + "loss": 0.3252, + "step": 19555 + }, + { + "epoch": 0.7312662102483819, + "grad_norm": 0.6313705444335938, + "learning_rate": 3.3570147046608125e-06, + "loss": 0.2608, + "step": 19560 + }, + { + "epoch": 0.7314531392387317, + "grad_norm": 0.6118645071983337, + "learning_rate": 3.352626292327793e-06, + "loss": 0.303, + "step": 19565 + }, + { + "epoch": 0.7316400682290815, + "grad_norm": 0.9984606504440308, + "learning_rate": 3.348240172483491e-06, + "loss": 0.269, + "step": 19570 + }, + { + "epoch": 0.7318269972194312, + "grad_norm": 0.4851844906806946, + "learning_rate": 3.3438563466405595e-06, + "loss": 0.2853, + "step": 19575 + }, + { + "epoch": 0.732013926209781, + "grad_norm": 0.5005273222923279, + "learning_rate": 3.339474816310847e-06, + "loss": 0.3988, + "step": 19580 + }, + { + "epoch": 0.7322008552001309, + "grad_norm": 0.31317126750946045, + "learning_rate": 3.3350955830054267e-06, + "loss": 0.3109, + "step": 19585 + }, + { + "epoch": 0.7323877841904807, + "grad_norm": 0.33924540877342224, + "learning_rate": 3.330718648234562e-06, + "loss": 0.3024, + "step": 19590 + }, + { + "epoch": 0.7325747131808305, + "grad_norm": 0.5410972833633423, + "learning_rate": 3.326344013507741e-06, + "loss": 0.3328, + "step": 19595 + }, + { + "epoch": 0.7327616421711802, + "grad_norm": 0.4295196235179901, + "learning_rate": 3.321971680333641e-06, + "loss": 0.2226, + "step": 19600 + }, + { + "epoch": 0.73294857116153, + "grad_norm": 0.3176816999912262, + "learning_rate": 3.317601650220159e-06, + "loss": 0.2551, + "step": 19605 + }, + { + "epoch": 0.7331355001518798, + "grad_norm": 0.5756362676620483, + "learning_rate": 3.313233924674396e-06, + "loss": 0.2447, + "step": 19610 + }, + { + "epoch": 0.7333224291422296, + "grad_norm": 0.6090577840805054, + "learning_rate": 3.3088685052026524e-06, + "loss": 0.3359, + "step": 19615 + }, + { + "epoch": 0.7335093581325793, + "grad_norm": 0.27702534198760986, + "learning_rate": 3.3045053933104366e-06, + "loss": 0.2796, + "step": 19620 + }, + { + "epoch": 0.7336962871229292, + "grad_norm": 0.3996005654335022, + "learning_rate": 3.3001445905024567e-06, + "loss": 0.2924, + "step": 19625 + }, + { + "epoch": 0.733883216113279, + "grad_norm": 0.4653981029987335, + "learning_rate": 3.2957860982826363e-06, + "loss": 0.242, + "step": 19630 + }, + { + "epoch": 0.7340701451036288, + "grad_norm": 0.43299591541290283, + "learning_rate": 3.2914299181540866e-06, + "loss": 0.3355, + "step": 19635 + }, + { + "epoch": 0.7342570740939786, + "grad_norm": 0.35684916377067566, + "learning_rate": 3.287076051619137e-06, + "loss": 0.3164, + "step": 19640 + }, + { + "epoch": 0.7344440030843283, + "grad_norm": 0.5310947895050049, + "learning_rate": 3.282724500179304e-06, + "loss": 0.3019, + "step": 19645 + }, + { + "epoch": 0.7346309320746781, + "grad_norm": 0.38951942324638367, + "learning_rate": 3.2783752653353164e-06, + "loss": 0.2736, + "step": 19650 + }, + { + "epoch": 0.7348178610650279, + "grad_norm": 0.19010038673877716, + "learning_rate": 3.2740283485871038e-06, + "loss": 0.2463, + "step": 19655 + }, + { + "epoch": 0.7350047900553777, + "grad_norm": 0.3501550853252411, + "learning_rate": 3.269683751433791e-06, + "loss": 0.2534, + "step": 19660 + }, + { + "epoch": 0.7351917190457276, + "grad_norm": 0.40544456243515015, + "learning_rate": 3.2653414753737047e-06, + "loss": 0.2525, + "step": 19665 + }, + { + "epoch": 0.7353786480360773, + "grad_norm": 1.1853599548339844, + "learning_rate": 3.261001521904368e-06, + "loss": 0.2623, + "step": 19670 + }, + { + "epoch": 0.7355655770264271, + "grad_norm": 0.24595674872398376, + "learning_rate": 3.2566638925225113e-06, + "loss": 0.2254, + "step": 19675 + }, + { + "epoch": 0.7357525060167769, + "grad_norm": 0.2667876183986664, + "learning_rate": 3.2523285887240553e-06, + "loss": 0.3439, + "step": 19680 + }, + { + "epoch": 0.7359394350071267, + "grad_norm": 0.44477343559265137, + "learning_rate": 3.2479956120041266e-06, + "loss": 0.3082, + "step": 19685 + }, + { + "epoch": 0.7361263639974764, + "grad_norm": 0.455649197101593, + "learning_rate": 3.243664963857038e-06, + "loss": 0.2301, + "step": 19690 + }, + { + "epoch": 0.7363132929878262, + "grad_norm": 0.4365465044975281, + "learning_rate": 3.239336645776311e-06, + "loss": 0.2563, + "step": 19695 + }, + { + "epoch": 0.736500221978176, + "grad_norm": 0.3654913902282715, + "learning_rate": 3.23501065925466e-06, + "loss": 0.2998, + "step": 19700 + }, + { + "epoch": 0.7366871509685259, + "grad_norm": 0.46953657269477844, + "learning_rate": 3.230687005783992e-06, + "loss": 0.2587, + "step": 19705 + }, + { + "epoch": 0.7368740799588757, + "grad_norm": 0.5593171119689941, + "learning_rate": 3.2263656868554092e-06, + "loss": 0.2911, + "step": 19710 + }, + { + "epoch": 0.7370610089492254, + "grad_norm": 0.4246465861797333, + "learning_rate": 3.2220467039592097e-06, + "loss": 0.2599, + "step": 19715 + }, + { + "epoch": 0.7372479379395752, + "grad_norm": 0.38245370984077454, + "learning_rate": 3.2177300585848916e-06, + "loss": 0.1948, + "step": 19720 + }, + { + "epoch": 0.737434866929925, + "grad_norm": 0.525850772857666, + "learning_rate": 3.213415752221136e-06, + "loss": 0.261, + "step": 19725 + }, + { + "epoch": 0.7376217959202748, + "grad_norm": 0.3949277997016907, + "learning_rate": 3.2091037863558316e-06, + "loss": 0.2761, + "step": 19730 + }, + { + "epoch": 0.7378087249106245, + "grad_norm": 0.47665491700172424, + "learning_rate": 3.2047941624760435e-06, + "loss": 0.2357, + "step": 19735 + }, + { + "epoch": 0.7379956539009743, + "grad_norm": 0.3770470917224884, + "learning_rate": 3.2004868820680423e-06, + "loss": 0.2551, + "step": 19740 + }, + { + "epoch": 0.7381825828913242, + "grad_norm": 0.24684104323387146, + "learning_rate": 3.196181946617287e-06, + "loss": 0.2372, + "step": 19745 + }, + { + "epoch": 0.738369511881674, + "grad_norm": 0.5357893705368042, + "learning_rate": 3.191879357608425e-06, + "loss": 0.3861, + "step": 19750 + }, + { + "epoch": 0.7385564408720238, + "grad_norm": 0.4339257776737213, + "learning_rate": 3.187579116525291e-06, + "loss": 0.2941, + "step": 19755 + }, + { + "epoch": 0.7387433698623735, + "grad_norm": 0.4764021933078766, + "learning_rate": 3.1832812248509235e-06, + "loss": 0.2143, + "step": 19760 + }, + { + "epoch": 0.7389302988527233, + "grad_norm": 0.433449923992157, + "learning_rate": 3.178985684067537e-06, + "loss": 0.2832, + "step": 19765 + }, + { + "epoch": 0.7391172278430731, + "grad_norm": 0.5205506086349487, + "learning_rate": 3.1746924956565385e-06, + "loss": 0.2622, + "step": 19770 + }, + { + "epoch": 0.7393041568334229, + "grad_norm": 0.37828561663627625, + "learning_rate": 3.1704016610985313e-06, + "loss": 0.2705, + "step": 19775 + }, + { + "epoch": 0.7394910858237727, + "grad_norm": 0.24176892638206482, + "learning_rate": 3.166113181873296e-06, + "loss": 0.3125, + "step": 19780 + }, + { + "epoch": 0.7396780148141224, + "grad_norm": 0.40736111998558044, + "learning_rate": 3.1618270594598076e-06, + "loss": 0.3194, + "step": 19785 + }, + { + "epoch": 0.7398649438044723, + "grad_norm": 0.47427597641944885, + "learning_rate": 3.1575432953362317e-06, + "loss": 0.3029, + "step": 19790 + }, + { + "epoch": 0.7400518727948221, + "grad_norm": 0.32264068722724915, + "learning_rate": 3.1532618909799095e-06, + "loss": 0.3093, + "step": 19795 + }, + { + "epoch": 0.7402388017851719, + "grad_norm": 0.5120447874069214, + "learning_rate": 3.14898284786738e-06, + "loss": 0.2977, + "step": 19800 + }, + { + "epoch": 0.7404257307755217, + "grad_norm": 0.3379383981227875, + "learning_rate": 3.1447061674743594e-06, + "loss": 0.2976, + "step": 19805 + }, + { + "epoch": 0.7406126597658714, + "grad_norm": 0.6211456060409546, + "learning_rate": 3.1404318512757525e-06, + "loss": 0.2624, + "step": 19810 + }, + { + "epoch": 0.7407995887562212, + "grad_norm": 0.306180864572525, + "learning_rate": 3.1361599007456456e-06, + "loss": 0.2349, + "step": 19815 + }, + { + "epoch": 0.740986517746571, + "grad_norm": 0.5241755247116089, + "learning_rate": 3.131890317357319e-06, + "loss": 0.3073, + "step": 19820 + }, + { + "epoch": 0.7411734467369208, + "grad_norm": 0.4787485599517822, + "learning_rate": 3.1276231025832217e-06, + "loss": 0.2572, + "step": 19825 + }, + { + "epoch": 0.7413603757272706, + "grad_norm": 0.27676764130592346, + "learning_rate": 3.123358257894997e-06, + "loss": 0.3357, + "step": 19830 + }, + { + "epoch": 0.7415473047176204, + "grad_norm": 0.4201495945453644, + "learning_rate": 3.119095784763473e-06, + "loss": 0.2738, + "step": 19835 + }, + { + "epoch": 0.7417342337079702, + "grad_norm": 0.27705347537994385, + "learning_rate": 3.114835684658647e-06, + "loss": 0.2443, + "step": 19840 + }, + { + "epoch": 0.74192116269832, + "grad_norm": 0.4122698903083801, + "learning_rate": 3.1105779590497108e-06, + "loss": 0.3632, + "step": 19845 + }, + { + "epoch": 0.7421080916886698, + "grad_norm": 0.39222484827041626, + "learning_rate": 3.1063226094050304e-06, + "loss": 0.2928, + "step": 19850 + }, + { + "epoch": 0.7422950206790195, + "grad_norm": 0.5306823253631592, + "learning_rate": 3.102069637192152e-06, + "loss": 0.2495, + "step": 19855 + }, + { + "epoch": 0.7424819496693693, + "grad_norm": 0.3978886306285858, + "learning_rate": 3.0978190438778022e-06, + "loss": 0.2175, + "step": 19860 + }, + { + "epoch": 0.7426688786597191, + "grad_norm": 0.34500226378440857, + "learning_rate": 3.0935708309278956e-06, + "loss": 0.2046, + "step": 19865 + }, + { + "epoch": 0.742855807650069, + "grad_norm": 0.3210207521915436, + "learning_rate": 3.0893249998075116e-06, + "loss": 0.2618, + "step": 19870 + }, + { + "epoch": 0.7430427366404188, + "grad_norm": 0.7616083025932312, + "learning_rate": 3.0850815519809184e-06, + "loss": 0.3427, + "step": 19875 + }, + { + "epoch": 0.7432296656307685, + "grad_norm": 0.44957032799720764, + "learning_rate": 3.080840488911565e-06, + "loss": 0.2465, + "step": 19880 + }, + { + "epoch": 0.7434165946211183, + "grad_norm": 0.3584950566291809, + "learning_rate": 3.0766018120620643e-06, + "loss": 0.2426, + "step": 19885 + }, + { + "epoch": 0.7436035236114681, + "grad_norm": 0.6310529112815857, + "learning_rate": 3.072365522894221e-06, + "loss": 0.2662, + "step": 19890 + }, + { + "epoch": 0.7437904526018179, + "grad_norm": 0.575434684753418, + "learning_rate": 3.068131622869007e-06, + "loss": 0.2728, + "step": 19895 + }, + { + "epoch": 0.7439773815921676, + "grad_norm": 0.051615819334983826, + "learning_rate": 3.063900113446574e-06, + "loss": 0.204, + "step": 19900 + }, + { + "epoch": 0.7441643105825174, + "grad_norm": 0.575394332408905, + "learning_rate": 3.0596709960862436e-06, + "loss": 0.2808, + "step": 19905 + }, + { + "epoch": 0.7443512395728673, + "grad_norm": 0.6706823110580444, + "learning_rate": 3.055444272246524e-06, + "loss": 0.3299, + "step": 19910 + }, + { + "epoch": 0.7445381685632171, + "grad_norm": 0.2847936451435089, + "learning_rate": 3.0512199433850855e-06, + "loss": 0.2401, + "step": 19915 + }, + { + "epoch": 0.7447250975535669, + "grad_norm": 0.4267846941947937, + "learning_rate": 3.0469980109587803e-06, + "loss": 0.3545, + "step": 19920 + }, + { + "epoch": 0.7449120265439166, + "grad_norm": 0.3833447992801666, + "learning_rate": 3.042778476423637e-06, + "loss": 0.3209, + "step": 19925 + }, + { + "epoch": 0.7450989555342664, + "grad_norm": 0.41514524817466736, + "learning_rate": 3.0385613412348423e-06, + "loss": 0.222, + "step": 19930 + }, + { + "epoch": 0.7452858845246162, + "grad_norm": 0.22775611281394958, + "learning_rate": 3.0343466068467752e-06, + "loss": 0.2635, + "step": 19935 + }, + { + "epoch": 0.745472813514966, + "grad_norm": 0.4837653636932373, + "learning_rate": 3.030134274712968e-06, + "loss": 0.294, + "step": 19940 + }, + { + "epoch": 0.7456597425053157, + "grad_norm": 0.41959109902381897, + "learning_rate": 3.0259243462861423e-06, + "loss": 0.3014, + "step": 19945 + }, + { + "epoch": 0.7458466714956656, + "grad_norm": 0.3355468511581421, + "learning_rate": 3.021716823018176e-06, + "loss": 0.2535, + "step": 19950 + }, + { + "epoch": 0.7460336004860154, + "grad_norm": 0.31884121894836426, + "learning_rate": 3.0175117063601235e-06, + "loss": 0.2149, + "step": 19955 + }, + { + "epoch": 0.7462205294763652, + "grad_norm": 1.0035449266433716, + "learning_rate": 3.0133089977622076e-06, + "loss": 0.3098, + "step": 19960 + }, + { + "epoch": 0.746407458466715, + "grad_norm": 0.34479397535324097, + "learning_rate": 3.009108698673825e-06, + "loss": 0.2726, + "step": 19965 + }, + { + "epoch": 0.7465943874570647, + "grad_norm": 0.619194746017456, + "learning_rate": 3.00491081054354e-06, + "loss": 0.3021, + "step": 19970 + }, + { + "epoch": 0.7467813164474145, + "grad_norm": 0.4144986867904663, + "learning_rate": 3.0007153348190786e-06, + "loss": 0.2219, + "step": 19975 + }, + { + "epoch": 0.7469682454377643, + "grad_norm": 0.580058753490448, + "learning_rate": 2.9965222729473474e-06, + "loss": 0.3113, + "step": 19980 + }, + { + "epoch": 0.7471551744281141, + "grad_norm": 0.1870926320552826, + "learning_rate": 2.992331626374405e-06, + "loss": 0.2202, + "step": 19985 + }, + { + "epoch": 0.747342103418464, + "grad_norm": 0.48588237166404724, + "learning_rate": 2.988143396545493e-06, + "loss": 0.2335, + "step": 19990 + }, + { + "epoch": 0.7475290324088137, + "grad_norm": 0.36052894592285156, + "learning_rate": 2.9839575849050094e-06, + "loss": 0.2429, + "step": 19995 + }, + { + "epoch": 0.7477159613991635, + "grad_norm": 0.42960643768310547, + "learning_rate": 2.9797741928965185e-06, + "loss": 0.2448, + "step": 20000 + }, + { + "epoch": 0.7479028903895133, + "grad_norm": 0.455585241317749, + "learning_rate": 2.9755932219627514e-06, + "loss": 0.262, + "step": 20005 + }, + { + "epoch": 0.7480898193798631, + "grad_norm": 0.22561267018318176, + "learning_rate": 2.9714146735456063e-06, + "loss": 0.289, + "step": 20010 + }, + { + "epoch": 0.7482767483702129, + "grad_norm": 0.3798793852329254, + "learning_rate": 2.96723854908615e-06, + "loss": 0.3006, + "step": 20015 + }, + { + "epoch": 0.7484636773605626, + "grad_norm": 0.45209431648254395, + "learning_rate": 2.9630648500245993e-06, + "loss": 0.3088, + "step": 20020 + }, + { + "epoch": 0.7486506063509124, + "grad_norm": 0.5453251004219055, + "learning_rate": 2.9588935778003526e-06, + "loss": 0.4175, + "step": 20025 + }, + { + "epoch": 0.7488375353412622, + "grad_norm": 0.5267879366874695, + "learning_rate": 2.9547247338519547e-06, + "loss": 0.2464, + "step": 20030 + }, + { + "epoch": 0.7490244643316121, + "grad_norm": 0.3326359987258911, + "learning_rate": 2.950558319617126e-06, + "loss": 0.2875, + "step": 20035 + }, + { + "epoch": 0.7492113933219618, + "grad_norm": 0.3526333272457123, + "learning_rate": 2.9463943365327406e-06, + "loss": 0.2775, + "step": 20040 + }, + { + "epoch": 0.7493983223123116, + "grad_norm": 0.2863275110721588, + "learning_rate": 2.9422327860348377e-06, + "loss": 0.2816, + "step": 20045 + }, + { + "epoch": 0.7495852513026614, + "grad_norm": 0.5883467793464661, + "learning_rate": 2.938073669558613e-06, + "loss": 0.2752, + "step": 20050 + }, + { + "epoch": 0.7497721802930112, + "grad_norm": 0.5096171498298645, + "learning_rate": 2.93391698853843e-06, + "loss": 0.348, + "step": 20055 + }, + { + "epoch": 0.749959109283361, + "grad_norm": 0.2166975438594818, + "learning_rate": 2.9297627444078115e-06, + "loss": 0.2923, + "step": 20060 + }, + { + "epoch": 0.7501460382737107, + "grad_norm": 0.3335493803024292, + "learning_rate": 2.9256109385994326e-06, + "loss": 0.2554, + "step": 20065 + }, + { + "epoch": 0.7503329672640605, + "grad_norm": 1.0838913917541504, + "learning_rate": 2.9214615725451354e-06, + "loss": 0.3143, + "step": 20070 + }, + { + "epoch": 0.7505198962544104, + "grad_norm": 0.5258083939552307, + "learning_rate": 2.917314647675914e-06, + "loss": 0.2903, + "step": 20075 + }, + { + "epoch": 0.7507068252447602, + "grad_norm": 0.4601660370826721, + "learning_rate": 2.913170165421929e-06, + "loss": 0.2453, + "step": 20080 + }, + { + "epoch": 0.75089375423511, + "grad_norm": 0.19206954538822174, + "learning_rate": 2.909028127212491e-06, + "loss": 0.2646, + "step": 20085 + }, + { + "epoch": 0.7510806832254597, + "grad_norm": 0.3423701524734497, + "learning_rate": 2.904888534476069e-06, + "loss": 0.2862, + "step": 20090 + }, + { + "epoch": 0.7512676122158095, + "grad_norm": 0.5520522594451904, + "learning_rate": 2.9007513886402884e-06, + "loss": 0.3379, + "step": 20095 + }, + { + "epoch": 0.7514545412061593, + "grad_norm": 0.2825580835342407, + "learning_rate": 2.896616691131934e-06, + "loss": 0.3071, + "step": 20100 + }, + { + "epoch": 0.7516414701965091, + "grad_norm": 0.4348238706588745, + "learning_rate": 2.892484443376948e-06, + "loss": 0.2731, + "step": 20105 + }, + { + "epoch": 0.7518283991868588, + "grad_norm": 0.4713153541088104, + "learning_rate": 2.8883546468004196e-06, + "loss": 0.2838, + "step": 20110 + }, + { + "epoch": 0.7520153281772087, + "grad_norm": 1.168811559677124, + "learning_rate": 2.884227302826601e-06, + "loss": 0.2472, + "step": 20115 + }, + { + "epoch": 0.7522022571675585, + "grad_norm": 0.5519205331802368, + "learning_rate": 2.8801024128788903e-06, + "loss": 0.3046, + "step": 20120 + }, + { + "epoch": 0.7523891861579083, + "grad_norm": 0.5526771545410156, + "learning_rate": 2.8759799783798503e-06, + "loss": 0.2885, + "step": 20125 + }, + { + "epoch": 0.7525761151482581, + "grad_norm": 0.4945368468761444, + "learning_rate": 2.871860000751182e-06, + "loss": 0.3089, + "step": 20130 + }, + { + "epoch": 0.7527630441386078, + "grad_norm": 0.5120470523834229, + "learning_rate": 2.8677424814137565e-06, + "loss": 0.3819, + "step": 20135 + }, + { + "epoch": 0.7529499731289576, + "grad_norm": 0.3834647536277771, + "learning_rate": 2.8636274217875846e-06, + "loss": 0.3076, + "step": 20140 + }, + { + "epoch": 0.7531369021193074, + "grad_norm": 0.7137815356254578, + "learning_rate": 2.859514823291829e-06, + "loss": 0.2814, + "step": 20145 + }, + { + "epoch": 0.7533238311096572, + "grad_norm": 0.5744923949241638, + "learning_rate": 2.8554046873448127e-06, + "loss": 0.3303, + "step": 20150 + }, + { + "epoch": 0.7535107601000071, + "grad_norm": 0.7045077085494995, + "learning_rate": 2.8512970153639976e-06, + "loss": 0.227, + "step": 20155 + }, + { + "epoch": 0.7536976890903568, + "grad_norm": 0.6118825078010559, + "learning_rate": 2.8471918087660087e-06, + "loss": 0.3046, + "step": 20160 + }, + { + "epoch": 0.7538846180807066, + "grad_norm": 0.6602465510368347, + "learning_rate": 2.843089068966609e-06, + "loss": 0.2856, + "step": 20165 + }, + { + "epoch": 0.7540715470710564, + "grad_norm": 0.5076795220375061, + "learning_rate": 2.8389887973807207e-06, + "loss": 0.2302, + "step": 20170 + }, + { + "epoch": 0.7542584760614062, + "grad_norm": 0.5142863988876343, + "learning_rate": 2.8348909954224037e-06, + "loss": 0.2519, + "step": 20175 + }, + { + "epoch": 0.754445405051756, + "grad_norm": 0.3825984001159668, + "learning_rate": 2.8307956645048795e-06, + "loss": 0.2828, + "step": 20180 + }, + { + "epoch": 0.7546323340421057, + "grad_norm": 0.5186363458633423, + "learning_rate": 2.8267028060405066e-06, + "loss": 0.3063, + "step": 20185 + }, + { + "epoch": 0.7548192630324555, + "grad_norm": 0.43224039673805237, + "learning_rate": 2.8226124214407912e-06, + "loss": 0.2481, + "step": 20190 + }, + { + "epoch": 0.7550061920228054, + "grad_norm": 0.3157874047756195, + "learning_rate": 2.8185245121163986e-06, + "loss": 0.3186, + "step": 20195 + }, + { + "epoch": 0.7551931210131552, + "grad_norm": 0.2855377197265625, + "learning_rate": 2.8144390794771215e-06, + "loss": 0.2652, + "step": 20200 + }, + { + "epoch": 0.7553800500035049, + "grad_norm": 0.29042690992355347, + "learning_rate": 2.810356124931918e-06, + "loss": 0.3023, + "step": 20205 + }, + { + "epoch": 0.7555669789938547, + "grad_norm": 0.38155996799468994, + "learning_rate": 2.806275649888873e-06, + "loss": 0.2603, + "step": 20210 + }, + { + "epoch": 0.7557539079842045, + "grad_norm": 0.3918231427669525, + "learning_rate": 2.8021976557552346e-06, + "loss": 0.2806, + "step": 20215 + }, + { + "epoch": 0.7559408369745543, + "grad_norm": 0.3024686574935913, + "learning_rate": 2.7981221439373774e-06, + "loss": 0.3187, + "step": 20220 + }, + { + "epoch": 0.756127765964904, + "grad_norm": 0.5795729160308838, + "learning_rate": 2.7940491158408367e-06, + "loss": 0.2417, + "step": 20225 + }, + { + "epoch": 0.7563146949552538, + "grad_norm": 1.9002882242202759, + "learning_rate": 2.7899785728702787e-06, + "loss": 0.2685, + "step": 20230 + }, + { + "epoch": 0.7565016239456037, + "grad_norm": 0.6263231039047241, + "learning_rate": 2.785910516429515e-06, + "loss": 0.3513, + "step": 20235 + }, + { + "epoch": 0.7566885529359535, + "grad_norm": 0.36941060423851013, + "learning_rate": 2.781844947921508e-06, + "loss": 0.4064, + "step": 20240 + }, + { + "epoch": 0.7568754819263033, + "grad_norm": 0.30503350496292114, + "learning_rate": 2.7777818687483483e-06, + "loss": 0.2724, + "step": 20245 + }, + { + "epoch": 0.757062410916653, + "grad_norm": 0.27014967799186707, + "learning_rate": 2.7737212803112824e-06, + "loss": 0.2964, + "step": 20250 + }, + { + "epoch": 0.7572493399070028, + "grad_norm": 0.432959645986557, + "learning_rate": 2.7696631840106847e-06, + "loss": 0.271, + "step": 20255 + }, + { + "epoch": 0.7574362688973526, + "grad_norm": 0.19095739722251892, + "learning_rate": 2.7656075812460835e-06, + "loss": 0.3006, + "step": 20260 + }, + { + "epoch": 0.7576231978877024, + "grad_norm": 0.3333872854709625, + "learning_rate": 2.7615544734161315e-06, + "loss": 0.2004, + "step": 20265 + }, + { + "epoch": 0.7578101268780522, + "grad_norm": 0.37564340233802795, + "learning_rate": 2.757503861918638e-06, + "loss": 0.3621, + "step": 20270 + }, + { + "epoch": 0.7579970558684019, + "grad_norm": 0.3397802412509918, + "learning_rate": 2.7534557481505385e-06, + "loss": 0.249, + "step": 20275 + }, + { + "epoch": 0.7581839848587518, + "grad_norm": 0.5983813405036926, + "learning_rate": 2.7494101335079094e-06, + "loss": 0.2332, + "step": 20280 + }, + { + "epoch": 0.7583709138491016, + "grad_norm": 0.38677194714546204, + "learning_rate": 2.7453670193859716e-06, + "loss": 0.3318, + "step": 20285 + }, + { + "epoch": 0.7585578428394514, + "grad_norm": 0.45902055501937866, + "learning_rate": 2.7413264071790747e-06, + "loss": 0.2212, + "step": 20290 + }, + { + "epoch": 0.7587447718298012, + "grad_norm": 0.6074469685554504, + "learning_rate": 2.737288298280715e-06, + "loss": 0.2984, + "step": 20295 + }, + { + "epoch": 0.7589317008201509, + "grad_norm": 0.4030331075191498, + "learning_rate": 2.7332526940835156e-06, + "loss": 0.3323, + "step": 20300 + }, + { + "epoch": 0.7591186298105007, + "grad_norm": 0.3118947744369507, + "learning_rate": 2.729219595979247e-06, + "loss": 0.263, + "step": 20305 + }, + { + "epoch": 0.7593055588008505, + "grad_norm": 0.2766875624656677, + "learning_rate": 2.7251890053588015e-06, + "loss": 0.2779, + "step": 20310 + }, + { + "epoch": 0.7594924877912003, + "grad_norm": 0.4155539572238922, + "learning_rate": 2.7211609236122216e-06, + "loss": 0.2856, + "step": 20315 + }, + { + "epoch": 0.7596794167815502, + "grad_norm": 0.37500113248825073, + "learning_rate": 2.717135352128671e-06, + "loss": 0.2353, + "step": 20320 + }, + { + "epoch": 0.7598663457718999, + "grad_norm": 0.60191810131073, + "learning_rate": 2.7131122922964603e-06, + "loss": 0.3352, + "step": 20325 + }, + { + "epoch": 0.7600532747622497, + "grad_norm": 0.2800582945346832, + "learning_rate": 2.709091745503024e-06, + "loss": 0.3133, + "step": 20330 + }, + { + "epoch": 0.7602402037525995, + "grad_norm": 0.4130517840385437, + "learning_rate": 2.7050737131349315e-06, + "loss": 0.2603, + "step": 20335 + }, + { + "epoch": 0.7604271327429493, + "grad_norm": 0.5701847076416016, + "learning_rate": 2.7010581965778914e-06, + "loss": 0.2705, + "step": 20340 + }, + { + "epoch": 0.760614061733299, + "grad_norm": 0.47026559710502625, + "learning_rate": 2.6970451972167355e-06, + "loss": 0.2877, + "step": 20345 + }, + { + "epoch": 0.7608009907236488, + "grad_norm": 0.5374239683151245, + "learning_rate": 2.6930347164354376e-06, + "loss": 0.2791, + "step": 20350 + }, + { + "epoch": 0.7609879197139986, + "grad_norm": 0.29200440645217896, + "learning_rate": 2.6890267556170925e-06, + "loss": 0.2677, + "step": 20355 + }, + { + "epoch": 0.7611748487043485, + "grad_norm": 0.3506094515323639, + "learning_rate": 2.6850213161439363e-06, + "loss": 0.2583, + "step": 20360 + }, + { + "epoch": 0.7613617776946983, + "grad_norm": 0.652692973613739, + "learning_rate": 2.6810183993973247e-06, + "loss": 0.259, + "step": 20365 + }, + { + "epoch": 0.761548706685048, + "grad_norm": 0.6381142139434814, + "learning_rate": 2.6770180067577547e-06, + "loss": 0.3052, + "step": 20370 + }, + { + "epoch": 0.7617356356753978, + "grad_norm": 0.3937336206436157, + "learning_rate": 2.6730201396048437e-06, + "loss": 0.2877, + "step": 20375 + }, + { + "epoch": 0.7619225646657476, + "grad_norm": 0.44311031699180603, + "learning_rate": 2.6690247993173393e-06, + "loss": 0.374, + "step": 20380 + }, + { + "epoch": 0.7621094936560974, + "grad_norm": 2.144091844558716, + "learning_rate": 2.6650319872731258e-06, + "loss": 0.4321, + "step": 20385 + }, + { + "epoch": 0.7622964226464471, + "grad_norm": 0.479422390460968, + "learning_rate": 2.661041704849203e-06, + "loss": 0.274, + "step": 20390 + }, + { + "epoch": 0.7624833516367969, + "grad_norm": 0.392315536737442, + "learning_rate": 2.657053953421712e-06, + "loss": 0.2914, + "step": 20395 + }, + { + "epoch": 0.7626702806271468, + "grad_norm": 0.34140515327453613, + "learning_rate": 2.6530687343659067e-06, + "loss": 0.2824, + "step": 20400 + }, + { + "epoch": 0.7628572096174966, + "grad_norm": 0.2744537889957428, + "learning_rate": 2.649086049056182e-06, + "loss": 0.2552, + "step": 20405 + }, + { + "epoch": 0.7630441386078464, + "grad_norm": 0.23247292637825012, + "learning_rate": 2.645105898866046e-06, + "loss": 0.2724, + "step": 20410 + }, + { + "epoch": 0.7632310675981961, + "grad_norm": 0.2504008412361145, + "learning_rate": 2.641128285168144e-06, + "loss": 0.2436, + "step": 20415 + }, + { + "epoch": 0.7634179965885459, + "grad_norm": 0.3857523500919342, + "learning_rate": 2.637153209334239e-06, + "loss": 0.294, + "step": 20420 + }, + { + "epoch": 0.7636049255788957, + "grad_norm": 0.42099565267562866, + "learning_rate": 2.633180672735215e-06, + "loss": 0.3373, + "step": 20425 + }, + { + "epoch": 0.7637918545692455, + "grad_norm": 0.2764213979244232, + "learning_rate": 2.6292106767410953e-06, + "loss": 0.2514, + "step": 20430 + }, + { + "epoch": 0.7639787835595953, + "grad_norm": 0.8510448932647705, + "learning_rate": 2.62524322272101e-06, + "loss": 0.2924, + "step": 20435 + }, + { + "epoch": 0.7641657125499451, + "grad_norm": 0.271930456161499, + "learning_rate": 2.621278312043226e-06, + "loss": 0.2673, + "step": 20440 + }, + { + "epoch": 0.7643526415402949, + "grad_norm": 0.4646458327770233, + "learning_rate": 2.617315946075123e-06, + "loss": 0.2376, + "step": 20445 + }, + { + "epoch": 0.7645395705306447, + "grad_norm": 0.5632447600364685, + "learning_rate": 2.613356126183212e-06, + "loss": 0.2435, + "step": 20450 + }, + { + "epoch": 0.7647264995209945, + "grad_norm": 0.29168501496315, + "learning_rate": 2.6093988537331163e-06, + "loss": 0.3008, + "step": 20455 + }, + { + "epoch": 0.7649134285113443, + "grad_norm": 0.24056875705718994, + "learning_rate": 2.6054441300895905e-06, + "loss": 0.2312, + "step": 20460 + }, + { + "epoch": 0.765100357501694, + "grad_norm": 0.3789188265800476, + "learning_rate": 2.601491956616504e-06, + "loss": 0.2843, + "step": 20465 + }, + { + "epoch": 0.7652872864920438, + "grad_norm": 0.35783255100250244, + "learning_rate": 2.597542334676846e-06, + "loss": 0.2711, + "step": 20470 + }, + { + "epoch": 0.7654742154823936, + "grad_norm": 0.6306200623512268, + "learning_rate": 2.59359526563273e-06, + "loss": 0.3031, + "step": 20475 + }, + { + "epoch": 0.7656611444727435, + "grad_norm": 0.30697113275527954, + "learning_rate": 2.589650750845385e-06, + "loss": 0.2589, + "step": 20480 + }, + { + "epoch": 0.7658480734630932, + "grad_norm": 0.41806843876838684, + "learning_rate": 2.5857087916751656e-06, + "loss": 0.2346, + "step": 20485 + }, + { + "epoch": 0.766035002453443, + "grad_norm": 0.4137856960296631, + "learning_rate": 2.5817693894815342e-06, + "loss": 0.2488, + "step": 20490 + }, + { + "epoch": 0.7662219314437928, + "grad_norm": 1.0388176441192627, + "learning_rate": 2.5778325456230845e-06, + "loss": 0.2959, + "step": 20495 + }, + { + "epoch": 0.7664088604341426, + "grad_norm": 0.3841733932495117, + "learning_rate": 2.5738982614575147e-06, + "loss": 0.3137, + "step": 20500 + }, + { + "epoch": 0.7665957894244924, + "grad_norm": 0.3577060103416443, + "learning_rate": 2.569966538341654e-06, + "loss": 0.3331, + "step": 20505 + }, + { + "epoch": 0.7667827184148421, + "grad_norm": 0.23667654395103455, + "learning_rate": 2.5660373776314318e-06, + "loss": 0.3054, + "step": 20510 + }, + { + "epoch": 0.7669696474051919, + "grad_norm": 0.5192760825157166, + "learning_rate": 2.5621107806819125e-06, + "loss": 0.2532, + "step": 20515 + }, + { + "epoch": 0.7671565763955417, + "grad_norm": 0.2387147843837738, + "learning_rate": 2.558186748847262e-06, + "loss": 0.2256, + "step": 20520 + }, + { + "epoch": 0.7673435053858916, + "grad_norm": 0.5738237500190735, + "learning_rate": 2.5542652834807634e-06, + "loss": 0.2913, + "step": 20525 + }, + { + "epoch": 0.7675304343762414, + "grad_norm": 0.5508410334587097, + "learning_rate": 2.5503463859348245e-06, + "loss": 0.2927, + "step": 20530 + }, + { + "epoch": 0.7677173633665911, + "grad_norm": 0.5864548087120056, + "learning_rate": 2.5464300575609547e-06, + "loss": 0.355, + "step": 20535 + }, + { + "epoch": 0.7679042923569409, + "grad_norm": 0.5955525636672974, + "learning_rate": 2.5425162997097896e-06, + "loss": 0.263, + "step": 20540 + }, + { + "epoch": 0.7680912213472907, + "grad_norm": 0.3412562906742096, + "learning_rate": 2.538605113731065e-06, + "loss": 0.3519, + "step": 20545 + }, + { + "epoch": 0.7682781503376405, + "grad_norm": 0.44794994592666626, + "learning_rate": 2.5346965009736445e-06, + "loss": 0.3314, + "step": 20550 + }, + { + "epoch": 0.7684650793279902, + "grad_norm": 0.5643365979194641, + "learning_rate": 2.5307904627854895e-06, + "loss": 0.2622, + "step": 20555 + }, + { + "epoch": 0.76865200831834, + "grad_norm": 0.3417246639728546, + "learning_rate": 2.526887000513687e-06, + "loss": 0.3421, + "step": 20560 + }, + { + "epoch": 0.7688389373086899, + "grad_norm": 0.1929822713136673, + "learning_rate": 2.5229861155044254e-06, + "loss": 0.267, + "step": 20565 + }, + { + "epoch": 0.7690258662990397, + "grad_norm": 0.2237076312303543, + "learning_rate": 2.5190878091030067e-06, + "loss": 0.2641, + "step": 20570 + }, + { + "epoch": 0.7692127952893895, + "grad_norm": 0.43035122752189636, + "learning_rate": 2.5151920826538514e-06, + "loss": 0.2557, + "step": 20575 + }, + { + "epoch": 0.7693997242797392, + "grad_norm": 0.3139476180076599, + "learning_rate": 2.511298937500476e-06, + "loss": 0.2566, + "step": 20580 + }, + { + "epoch": 0.769586653270089, + "grad_norm": 0.2134442925453186, + "learning_rate": 2.5074083749855216e-06, + "loss": 0.2199, + "step": 20585 + }, + { + "epoch": 0.7697735822604388, + "grad_norm": 0.6720663905143738, + "learning_rate": 2.503520396450725e-06, + "loss": 0.2664, + "step": 20590 + }, + { + "epoch": 0.7699605112507886, + "grad_norm": 0.7962212562561035, + "learning_rate": 2.4996350032369467e-06, + "loss": 0.3161, + "step": 20595 + }, + { + "epoch": 0.7701474402411383, + "grad_norm": 0.24116773903369904, + "learning_rate": 2.4957521966841393e-06, + "loss": 0.3047, + "step": 20600 + }, + { + "epoch": 0.7703343692314882, + "grad_norm": 0.21904657781124115, + "learning_rate": 2.4918719781313782e-06, + "loss": 0.3049, + "step": 20605 + }, + { + "epoch": 0.770521298221838, + "grad_norm": 0.40606293082237244, + "learning_rate": 2.487994348916837e-06, + "loss": 0.3494, + "step": 20610 + }, + { + "epoch": 0.7707082272121878, + "grad_norm": 0.586861789226532, + "learning_rate": 2.484119310377796e-06, + "loss": 0.2998, + "step": 20615 + }, + { + "epoch": 0.7708951562025376, + "grad_norm": 0.5949821472167969, + "learning_rate": 2.4802468638506505e-06, + "loss": 0.3086, + "step": 20620 + }, + { + "epoch": 0.7710820851928873, + "grad_norm": 0.3286888599395752, + "learning_rate": 2.4763770106708907e-06, + "loss": 0.2415, + "step": 20625 + }, + { + "epoch": 0.7712690141832371, + "grad_norm": 0.1509125977754593, + "learning_rate": 2.4725097521731232e-06, + "loss": 0.2471, + "step": 20630 + }, + { + "epoch": 0.7714559431735869, + "grad_norm": 0.5915478467941284, + "learning_rate": 2.4686450896910497e-06, + "loss": 0.2614, + "step": 20635 + }, + { + "epoch": 0.7716428721639367, + "grad_norm": 0.725960910320282, + "learning_rate": 2.4647830245574865e-06, + "loss": 0.3098, + "step": 20640 + }, + { + "epoch": 0.7718298011542866, + "grad_norm": 0.4178033769130707, + "learning_rate": 2.4609235581043457e-06, + "loss": 0.2995, + "step": 20645 + }, + { + "epoch": 0.7720167301446363, + "grad_norm": 0.7236891984939575, + "learning_rate": 2.4570666916626484e-06, + "loss": 0.2878, + "step": 20650 + }, + { + "epoch": 0.7722036591349861, + "grad_norm": 0.33673229813575745, + "learning_rate": 2.4532124265625155e-06, + "loss": 0.3246, + "step": 20655 + }, + { + "epoch": 0.7723905881253359, + "grad_norm": 0.4866393804550171, + "learning_rate": 2.4493607641331762e-06, + "loss": 0.2776, + "step": 20660 + }, + { + "epoch": 0.7725775171156857, + "grad_norm": 0.7614867091178894, + "learning_rate": 2.4455117057029566e-06, + "loss": 0.3473, + "step": 20665 + }, + { + "epoch": 0.7727644461060355, + "grad_norm": 0.46578964591026306, + "learning_rate": 2.441665252599282e-06, + "loss": 0.2915, + "step": 20670 + }, + { + "epoch": 0.7729513750963852, + "grad_norm": 0.5103464126586914, + "learning_rate": 2.4378214061486925e-06, + "loss": 0.246, + "step": 20675 + }, + { + "epoch": 0.773138304086735, + "grad_norm": 0.5652568936347961, + "learning_rate": 2.433980167676813e-06, + "loss": 0.3631, + "step": 20680 + }, + { + "epoch": 0.7733252330770849, + "grad_norm": 0.46092647314071655, + "learning_rate": 2.4301415385083828e-06, + "loss": 0.3188, + "step": 20685 + }, + { + "epoch": 0.7735121620674347, + "grad_norm": 0.5116592645645142, + "learning_rate": 2.426305519967228e-06, + "loss": 0.2404, + "step": 20690 + }, + { + "epoch": 0.7736990910577844, + "grad_norm": 0.6798794269561768, + "learning_rate": 2.4224721133762864e-06, + "loss": 0.2467, + "step": 20695 + }, + { + "epoch": 0.7738860200481342, + "grad_norm": 0.4135216772556305, + "learning_rate": 2.418641320057592e-06, + "loss": 0.2992, + "step": 20700 + }, + { + "epoch": 0.774072949038484, + "grad_norm": 0.5038582682609558, + "learning_rate": 2.414813141332274e-06, + "loss": 0.2296, + "step": 20705 + }, + { + "epoch": 0.7742598780288338, + "grad_norm": 0.35031184554100037, + "learning_rate": 2.4109875785205593e-06, + "loss": 0.25, + "step": 20710 + }, + { + "epoch": 0.7744468070191836, + "grad_norm": 0.21241429448127747, + "learning_rate": 2.407164632941773e-06, + "loss": 0.2752, + "step": 20715 + }, + { + "epoch": 0.7746337360095333, + "grad_norm": 0.2852620482444763, + "learning_rate": 2.403344305914346e-06, + "loss": 0.2454, + "step": 20720 + }, + { + "epoch": 0.7748206649998832, + "grad_norm": 0.20307046175003052, + "learning_rate": 2.3995265987557925e-06, + "loss": 0.3172, + "step": 20725 + }, + { + "epoch": 0.775007593990233, + "grad_norm": 0.38747474551200867, + "learning_rate": 2.395711512782738e-06, + "loss": 0.3205, + "step": 20730 + }, + { + "epoch": 0.7751945229805828, + "grad_norm": 0.2965136468410492, + "learning_rate": 2.3918990493108884e-06, + "loss": 0.2596, + "step": 20735 + }, + { + "epoch": 0.7753814519709326, + "grad_norm": 0.3963155150413513, + "learning_rate": 2.3880892096550578e-06, + "loss": 0.2905, + "step": 20740 + }, + { + "epoch": 0.7755683809612823, + "grad_norm": 0.31911271810531616, + "learning_rate": 2.384281995129153e-06, + "loss": 0.3044, + "step": 20745 + }, + { + "epoch": 0.7757553099516321, + "grad_norm": 0.8403980731964111, + "learning_rate": 2.380477407046169e-06, + "loss": 0.3053, + "step": 20750 + }, + { + "epoch": 0.7759422389419819, + "grad_norm": 0.35401323437690735, + "learning_rate": 2.3766754467182006e-06, + "loss": 0.2962, + "step": 20755 + }, + { + "epoch": 0.7761291679323317, + "grad_norm": 0.5200223922729492, + "learning_rate": 2.3728761154564326e-06, + "loss": 0.3908, + "step": 20760 + }, + { + "epoch": 0.7763160969226814, + "grad_norm": 0.4293026626110077, + "learning_rate": 2.3690794145711505e-06, + "loss": 0.3042, + "step": 20765 + }, + { + "epoch": 0.7765030259130313, + "grad_norm": 0.47955235838890076, + "learning_rate": 2.365285345371722e-06, + "loss": 0.2323, + "step": 20770 + }, + { + "epoch": 0.7766899549033811, + "grad_norm": 0.29286670684814453, + "learning_rate": 2.3614939091666177e-06, + "loss": 0.3391, + "step": 20775 + }, + { + "epoch": 0.7768768838937309, + "grad_norm": 0.21747294068336487, + "learning_rate": 2.3577051072633907e-06, + "loss": 0.3265, + "step": 20780 + }, + { + "epoch": 0.7770638128840807, + "grad_norm": 0.9644585251808167, + "learning_rate": 2.3539189409686937e-06, + "loss": 0.3585, + "step": 20785 + }, + { + "epoch": 0.7772507418744304, + "grad_norm": 0.21008965373039246, + "learning_rate": 2.350135411588267e-06, + "loss": 0.2923, + "step": 20790 + }, + { + "epoch": 0.7774376708647802, + "grad_norm": 0.4879824221134186, + "learning_rate": 2.346354520426942e-06, + "loss": 0.329, + "step": 20795 + }, + { + "epoch": 0.77762459985513, + "grad_norm": 0.26359811425209045, + "learning_rate": 2.3425762687886378e-06, + "loss": 0.2053, + "step": 20800 + }, + { + "epoch": 0.7778115288454798, + "grad_norm": 0.15240828692913055, + "learning_rate": 2.3388006579763623e-06, + "loss": 0.2589, + "step": 20805 + }, + { + "epoch": 0.7779984578358297, + "grad_norm": 0.27661359310150146, + "learning_rate": 2.3350276892922218e-06, + "loss": 0.2356, + "step": 20810 + }, + { + "epoch": 0.7781853868261794, + "grad_norm": 0.5237915515899658, + "learning_rate": 2.3312573640373994e-06, + "loss": 0.3032, + "step": 20815 + }, + { + "epoch": 0.7783723158165292, + "grad_norm": 0.33490797877311707, + "learning_rate": 2.3274896835121772e-06, + "loss": 0.2319, + "step": 20820 + }, + { + "epoch": 0.778559244806879, + "grad_norm": 0.5029880404472351, + "learning_rate": 2.323724649015916e-06, + "loss": 0.2439, + "step": 20825 + }, + { + "epoch": 0.7787461737972288, + "grad_norm": 0.47843530774116516, + "learning_rate": 2.31996226184707e-06, + "loss": 0.3145, + "step": 20830 + }, + { + "epoch": 0.7789331027875785, + "grad_norm": 0.5605517625808716, + "learning_rate": 2.3162025233031814e-06, + "loss": 0.3438, + "step": 20835 + }, + { + "epoch": 0.7791200317779283, + "grad_norm": 0.47056886553764343, + "learning_rate": 2.3124454346808713e-06, + "loss": 0.2864, + "step": 20840 + }, + { + "epoch": 0.7793069607682781, + "grad_norm": 0.5489882230758667, + "learning_rate": 2.3086909972758577e-06, + "loss": 0.272, + "step": 20845 + }, + { + "epoch": 0.779493889758628, + "grad_norm": 0.2519620656967163, + "learning_rate": 2.304939212382934e-06, + "loss": 0.2637, + "step": 20850 + }, + { + "epoch": 0.7796808187489778, + "grad_norm": 0.16588373482227325, + "learning_rate": 2.3011900812959855e-06, + "loss": 0.248, + "step": 20855 + }, + { + "epoch": 0.7798677477393275, + "grad_norm": 0.48377254605293274, + "learning_rate": 2.2974436053079764e-06, + "loss": 0.2892, + "step": 20860 + }, + { + "epoch": 0.7800546767296773, + "grad_norm": 0.32265156507492065, + "learning_rate": 2.2936997857109644e-06, + "loss": 0.2799, + "step": 20865 + }, + { + "epoch": 0.7802416057200271, + "grad_norm": 0.25994792580604553, + "learning_rate": 2.2899586237960793e-06, + "loss": 0.2736, + "step": 20870 + }, + { + "epoch": 0.7804285347103769, + "grad_norm": 0.4288146495819092, + "learning_rate": 2.286220120853545e-06, + "loss": 0.2761, + "step": 20875 + }, + { + "epoch": 0.7806154637007267, + "grad_norm": 0.2761852741241455, + "learning_rate": 2.2824842781726665e-06, + "loss": 0.2326, + "step": 20880 + }, + { + "epoch": 0.7808023926910764, + "grad_norm": 0.561490535736084, + "learning_rate": 2.2787510970418215e-06, + "loss": 0.2932, + "step": 20885 + }, + { + "epoch": 0.7809893216814263, + "grad_norm": 0.28700560331344604, + "learning_rate": 2.2750205787484846e-06, + "loss": 0.1987, + "step": 20890 + }, + { + "epoch": 0.7811762506717761, + "grad_norm": 0.2016209363937378, + "learning_rate": 2.271292724579203e-06, + "loss": 0.2507, + "step": 20895 + }, + { + "epoch": 0.7813631796621259, + "grad_norm": 0.3742198944091797, + "learning_rate": 2.2675675358196037e-06, + "loss": 0.2465, + "step": 20900 + }, + { + "epoch": 0.7815501086524757, + "grad_norm": 0.301685094833374, + "learning_rate": 2.2638450137543967e-06, + "loss": 0.281, + "step": 20905 + }, + { + "epoch": 0.7817370376428254, + "grad_norm": 0.6065481305122375, + "learning_rate": 2.2601251596673778e-06, + "loss": 0.249, + "step": 20910 + }, + { + "epoch": 0.7819239666331752, + "grad_norm": 0.41621100902557373, + "learning_rate": 2.2564079748414138e-06, + "loss": 0.254, + "step": 20915 + }, + { + "epoch": 0.782110895623525, + "grad_norm": 0.33886462450027466, + "learning_rate": 2.252693460558456e-06, + "loss": 0.2795, + "step": 20920 + }, + { + "epoch": 0.7822978246138748, + "grad_norm": 0.6175533533096313, + "learning_rate": 2.2489816180995395e-06, + "loss": 0.3793, + "step": 20925 + }, + { + "epoch": 0.7824847536042246, + "grad_norm": 0.7338931560516357, + "learning_rate": 2.245272448744765e-06, + "loss": 0.289, + "step": 20930 + }, + { + "epoch": 0.7826716825945744, + "grad_norm": 0.292777419090271, + "learning_rate": 2.241565953773325e-06, + "loss": 0.2248, + "step": 20935 + }, + { + "epoch": 0.7828586115849242, + "grad_norm": 0.5425524711608887, + "learning_rate": 2.237862134463479e-06, + "loss": 0.2553, + "step": 20940 + }, + { + "epoch": 0.783045540575274, + "grad_norm": 0.29631131887435913, + "learning_rate": 2.2341609920925698e-06, + "loss": 0.2826, + "step": 20945 + }, + { + "epoch": 0.7832324695656238, + "grad_norm": 0.313694566488266, + "learning_rate": 2.230462527937013e-06, + "loss": 0.3969, + "step": 20950 + }, + { + "epoch": 0.7834193985559735, + "grad_norm": 0.7053239941596985, + "learning_rate": 2.2267667432723073e-06, + "loss": 0.3284, + "step": 20955 + }, + { + "epoch": 0.7836063275463233, + "grad_norm": 0.22134394943714142, + "learning_rate": 2.2230736393730178e-06, + "loss": 0.276, + "step": 20960 + }, + { + "epoch": 0.7837932565366731, + "grad_norm": 0.2921614348888397, + "learning_rate": 2.2193832175127928e-06, + "loss": 0.247, + "step": 20965 + }, + { + "epoch": 0.783980185527023, + "grad_norm": 0.4508916735649109, + "learning_rate": 2.215695478964357e-06, + "loss": 0.2421, + "step": 20970 + }, + { + "epoch": 0.7841671145173728, + "grad_norm": 0.4369545578956604, + "learning_rate": 2.212010424999498e-06, + "loss": 0.2226, + "step": 20975 + }, + { + "epoch": 0.7843540435077225, + "grad_norm": 0.21277235448360443, + "learning_rate": 2.2083280568890918e-06, + "loss": 0.292, + "step": 20980 + }, + { + "epoch": 0.7845409724980723, + "grad_norm": 0.4735264778137207, + "learning_rate": 2.20464837590308e-06, + "loss": 0.2305, + "step": 20985 + }, + { + "epoch": 0.7847279014884221, + "grad_norm": 0.5336333513259888, + "learning_rate": 2.2009713833104785e-06, + "loss": 0.2775, + "step": 20990 + }, + { + "epoch": 0.7849148304787719, + "grad_norm": 0.4231540262699127, + "learning_rate": 2.1972970803793726e-06, + "loss": 0.2616, + "step": 20995 + }, + { + "epoch": 0.7851017594691216, + "grad_norm": 0.5644051432609558, + "learning_rate": 2.193625468376931e-06, + "loss": 0.2852, + "step": 21000 + }, + { + "epoch": 0.7852886884594714, + "grad_norm": 0.5196647047996521, + "learning_rate": 2.189956548569382e-06, + "loss": 0.3341, + "step": 21005 + }, + { + "epoch": 0.7854756174498212, + "grad_norm": 0.4049712121486664, + "learning_rate": 2.186290322222033e-06, + "loss": 0.4098, + "step": 21010 + }, + { + "epoch": 0.7856625464401711, + "grad_norm": 0.4484502375125885, + "learning_rate": 2.182626790599265e-06, + "loss": 0.2502, + "step": 21015 + }, + { + "epoch": 0.7858494754305209, + "grad_norm": 0.1917799860239029, + "learning_rate": 2.1789659549645158e-06, + "loss": 0.2588, + "step": 21020 + }, + { + "epoch": 0.7860364044208706, + "grad_norm": 0.3756994307041168, + "learning_rate": 2.175307816580312e-06, + "loss": 0.2758, + "step": 21025 + }, + { + "epoch": 0.7862233334112204, + "grad_norm": 0.4414009153842926, + "learning_rate": 2.171652376708233e-06, + "loss": 0.2278, + "step": 21030 + }, + { + "epoch": 0.7864102624015702, + "grad_norm": 0.34547463059425354, + "learning_rate": 2.1679996366089428e-06, + "loss": 0.3164, + "step": 21035 + }, + { + "epoch": 0.78659719139192, + "grad_norm": 0.3485727608203888, + "learning_rate": 2.1643495975421612e-06, + "loss": 0.3466, + "step": 21040 + }, + { + "epoch": 0.7867841203822697, + "grad_norm": 0.772977888584137, + "learning_rate": 2.160702260766684e-06, + "loss": 0.2409, + "step": 21045 + }, + { + "epoch": 0.7869710493726195, + "grad_norm": 0.5560535192489624, + "learning_rate": 2.157057627540371e-06, + "loss": 0.3805, + "step": 21050 + }, + { + "epoch": 0.7871579783629694, + "grad_norm": 0.5626355409622192, + "learning_rate": 2.1534156991201528e-06, + "loss": 0.2906, + "step": 21055 + }, + { + "epoch": 0.7873449073533192, + "grad_norm": 0.27478352189064026, + "learning_rate": 2.149776476762029e-06, + "loss": 0.2498, + "step": 21060 + }, + { + "epoch": 0.787531836343669, + "grad_norm": 0.8276031017303467, + "learning_rate": 2.146139961721059e-06, + "loss": 0.2522, + "step": 21065 + }, + { + "epoch": 0.7877187653340187, + "grad_norm": 0.459084153175354, + "learning_rate": 2.142506155251377e-06, + "loss": 0.3553, + "step": 21070 + }, + { + "epoch": 0.7879056943243685, + "grad_norm": 0.6312773823738098, + "learning_rate": 2.1388750586061735e-06, + "loss": 0.2408, + "step": 21075 + }, + { + "epoch": 0.7880926233147183, + "grad_norm": 0.3618089556694031, + "learning_rate": 2.1352466730377164e-06, + "loss": 0.3, + "step": 21080 + }, + { + "epoch": 0.7882795523050681, + "grad_norm": 0.5155732035636902, + "learning_rate": 2.131620999797327e-06, + "loss": 0.2281, + "step": 21085 + }, + { + "epoch": 0.7884664812954179, + "grad_norm": 0.5344289541244507, + "learning_rate": 2.1279980401353972e-06, + "loss": 0.2373, + "step": 21090 + }, + { + "epoch": 0.7886534102857677, + "grad_norm": 0.4500960111618042, + "learning_rate": 2.12437779530138e-06, + "loss": 0.3069, + "step": 21095 + }, + { + "epoch": 0.7888403392761175, + "grad_norm": 0.6218135952949524, + "learning_rate": 2.1207602665437953e-06, + "loss": 0.288, + "step": 21100 + }, + { + "epoch": 0.7890272682664673, + "grad_norm": 0.7415568828582764, + "learning_rate": 2.117145455110229e-06, + "loss": 0.2563, + "step": 21105 + }, + { + "epoch": 0.7892141972568171, + "grad_norm": 0.47236132621765137, + "learning_rate": 2.1135333622473208e-06, + "loss": 0.3075, + "step": 21110 + }, + { + "epoch": 0.7894011262471669, + "grad_norm": 0.3604547083377838, + "learning_rate": 2.1099239892007815e-06, + "loss": 0.3614, + "step": 21115 + }, + { + "epoch": 0.7895880552375166, + "grad_norm": 0.32508572936058044, + "learning_rate": 2.1063173372153778e-06, + "loss": 0.2853, + "step": 21120 + }, + { + "epoch": 0.7897749842278664, + "grad_norm": 0.40659722685813904, + "learning_rate": 2.102713407534943e-06, + "loss": 0.3314, + "step": 21125 + }, + { + "epoch": 0.7899619132182162, + "grad_norm": 0.5361529588699341, + "learning_rate": 2.099112201402369e-06, + "loss": 0.2541, + "step": 21130 + }, + { + "epoch": 0.7901488422085661, + "grad_norm": 0.32573097944259644, + "learning_rate": 2.0955137200596077e-06, + "loss": 0.2379, + "step": 21135 + }, + { + "epoch": 0.7903357711989158, + "grad_norm": 0.5124326348304749, + "learning_rate": 2.0919179647476694e-06, + "loss": 0.3336, + "step": 21140 + }, + { + "epoch": 0.7905227001892656, + "grad_norm": 0.6217731833457947, + "learning_rate": 2.0883249367066294e-06, + "loss": 0.341, + "step": 21145 + }, + { + "epoch": 0.7907096291796154, + "grad_norm": 0.44866645336151123, + "learning_rate": 2.0847346371756237e-06, + "loss": 0.2955, + "step": 21150 + }, + { + "epoch": 0.7908965581699652, + "grad_norm": 0.4434383809566498, + "learning_rate": 2.081147067392838e-06, + "loss": 0.3195, + "step": 21155 + }, + { + "epoch": 0.791083487160315, + "grad_norm": 0.5388154983520508, + "learning_rate": 2.0775622285955264e-06, + "loss": 0.2346, + "step": 21160 + }, + { + "epoch": 0.7912704161506647, + "grad_norm": 0.33477190136909485, + "learning_rate": 2.073980122019994e-06, + "loss": 0.3177, + "step": 21165 + }, + { + "epoch": 0.7914573451410145, + "grad_norm": 0.5977397561073303, + "learning_rate": 2.070400748901611e-06, + "loss": 0.3434, + "step": 21170 + }, + { + "epoch": 0.7916442741313644, + "grad_norm": 0.3142024874687195, + "learning_rate": 2.066824110474798e-06, + "loss": 0.2778, + "step": 21175 + }, + { + "epoch": 0.7918312031217142, + "grad_norm": 0.5590510964393616, + "learning_rate": 2.0632502079730356e-06, + "loss": 0.2392, + "step": 21180 + }, + { + "epoch": 0.792018132112064, + "grad_norm": 0.38960909843444824, + "learning_rate": 2.059679042628856e-06, + "loss": 0.2559, + "step": 21185 + }, + { + "epoch": 0.7922050611024137, + "grad_norm": 0.7024688720703125, + "learning_rate": 2.056110615673855e-06, + "loss": 0.2569, + "step": 21190 + }, + { + "epoch": 0.7923919900927635, + "grad_norm": 0.5414541959762573, + "learning_rate": 2.0525449283386855e-06, + "loss": 0.2865, + "step": 21195 + }, + { + "epoch": 0.7925789190831133, + "grad_norm": 0.4854254126548767, + "learning_rate": 2.0489819818530443e-06, + "loss": 0.267, + "step": 21200 + }, + { + "epoch": 0.7927658480734631, + "grad_norm": 0.34291431307792664, + "learning_rate": 2.045421777445694e-06, + "loss": 0.3289, + "step": 21205 + }, + { + "epoch": 0.7929527770638128, + "grad_norm": 0.366192489862442, + "learning_rate": 2.041864316344443e-06, + "loss": 0.3097, + "step": 21210 + }, + { + "epoch": 0.7931397060541627, + "grad_norm": 0.35338735580444336, + "learning_rate": 2.0383095997761628e-06, + "loss": 0.2791, + "step": 21215 + }, + { + "epoch": 0.7933266350445125, + "grad_norm": 0.5580964684486389, + "learning_rate": 2.0347576289667657e-06, + "loss": 0.2732, + "step": 21220 + }, + { + "epoch": 0.7935135640348623, + "grad_norm": 0.47220098972320557, + "learning_rate": 2.031208405141234e-06, + "loss": 0.243, + "step": 21225 + }, + { + "epoch": 0.7937004930252121, + "grad_norm": 0.6070525646209717, + "learning_rate": 2.027661929523588e-06, + "loss": 0.2208, + "step": 21230 + }, + { + "epoch": 0.7938874220155618, + "grad_norm": 0.4596133828163147, + "learning_rate": 2.0241182033369034e-06, + "loss": 0.2809, + "step": 21235 + }, + { + "epoch": 0.7940743510059116, + "grad_norm": 0.17286808788776398, + "learning_rate": 2.0205772278033153e-06, + "loss": 0.245, + "step": 21240 + }, + { + "epoch": 0.7942612799962614, + "grad_norm": 0.42331114411354065, + "learning_rate": 2.017039004143999e-06, + "loss": 0.2423, + "step": 21245 + }, + { + "epoch": 0.7944482089866112, + "grad_norm": 0.6041852235794067, + "learning_rate": 2.013503533579193e-06, + "loss": 0.2279, + "step": 21250 + }, + { + "epoch": 0.794635137976961, + "grad_norm": 0.4134674668312073, + "learning_rate": 2.009970817328173e-06, + "loss": 0.2674, + "step": 21255 + }, + { + "epoch": 0.7948220669673108, + "grad_norm": 0.44482535123825073, + "learning_rate": 2.0064408566092762e-06, + "loss": 0.2763, + "step": 21260 + }, + { + "epoch": 0.7950089959576606, + "grad_norm": 0.6239652037620544, + "learning_rate": 2.002913652639883e-06, + "loss": 0.2297, + "step": 21265 + }, + { + "epoch": 0.7951959249480104, + "grad_norm": 0.4844609498977661, + "learning_rate": 1.999389206636426e-06, + "loss": 0.2071, + "step": 21270 + }, + { + "epoch": 0.7953828539383602, + "grad_norm": 0.37075021862983704, + "learning_rate": 1.9958675198143873e-06, + "loss": 0.2251, + "step": 21275 + }, + { + "epoch": 0.79556978292871, + "grad_norm": 0.16581667959690094, + "learning_rate": 1.992348593388289e-06, + "loss": 0.1926, + "step": 21280 + }, + { + "epoch": 0.7957567119190597, + "grad_norm": 0.4773890972137451, + "learning_rate": 1.9888324285717166e-06, + "loss": 0.3475, + "step": 21285 + }, + { + "epoch": 0.7959436409094095, + "grad_norm": 0.57837975025177, + "learning_rate": 1.985319026577287e-06, + "loss": 0.3069, + "step": 21290 + }, + { + "epoch": 0.7961305698997593, + "grad_norm": 0.29405614733695984, + "learning_rate": 1.9818083886166795e-06, + "loss": 0.2997, + "step": 21295 + }, + { + "epoch": 0.7963174988901092, + "grad_norm": 0.2895260155200958, + "learning_rate": 1.978300515900604e-06, + "loss": 0.2789, + "step": 21300 + }, + { + "epoch": 0.7965044278804589, + "grad_norm": 0.36291879415512085, + "learning_rate": 1.9747954096388343e-06, + "loss": 0.2466, + "step": 21305 + }, + { + "epoch": 0.7966913568708087, + "grad_norm": 0.477522611618042, + "learning_rate": 1.9712930710401735e-06, + "loss": 0.2304, + "step": 21310 + }, + { + "epoch": 0.7968782858611585, + "grad_norm": 0.7150763273239136, + "learning_rate": 1.967793501312483e-06, + "loss": 0.2485, + "step": 21315 + }, + { + "epoch": 0.7970652148515083, + "grad_norm": 0.4385119378566742, + "learning_rate": 1.9642967016626624e-06, + "loss": 0.2579, + "step": 21320 + }, + { + "epoch": 0.797252143841858, + "grad_norm": 0.4212881624698639, + "learning_rate": 1.9608026732966544e-06, + "loss": 0.2947, + "step": 21325 + }, + { + "epoch": 0.7974390728322078, + "grad_norm": 0.54429692029953, + "learning_rate": 1.957311417419455e-06, + "loss": 0.2657, + "step": 21330 + }, + { + "epoch": 0.7976260018225576, + "grad_norm": 0.6954907178878784, + "learning_rate": 1.9538229352350924e-06, + "loss": 0.2948, + "step": 21335 + }, + { + "epoch": 0.7978129308129075, + "grad_norm": 0.56966632604599, + "learning_rate": 1.95033722794665e-06, + "loss": 0.2921, + "step": 21340 + }, + { + "epoch": 0.7979998598032573, + "grad_norm": 0.6537004113197327, + "learning_rate": 1.9468542967562443e-06, + "loss": 0.4463, + "step": 21345 + }, + { + "epoch": 0.798186788793607, + "grad_norm": 1.18008291721344, + "learning_rate": 1.943374142865042e-06, + "loss": 0.3565, + "step": 21350 + }, + { + "epoch": 0.7983737177839568, + "grad_norm": 0.28714442253112793, + "learning_rate": 1.939896767473243e-06, + "loss": 0.2965, + "step": 21355 + }, + { + "epoch": 0.7985606467743066, + "grad_norm": 1.268236756324768, + "learning_rate": 1.936422171780101e-06, + "loss": 0.3542, + "step": 21360 + }, + { + "epoch": 0.7987475757646564, + "grad_norm": 0.5523670315742493, + "learning_rate": 1.9329503569839002e-06, + "loss": 0.2992, + "step": 21365 + }, + { + "epoch": 0.7989345047550062, + "grad_norm": 0.391519695520401, + "learning_rate": 1.92948132428197e-06, + "loss": 0.3191, + "step": 21370 + }, + { + "epoch": 0.7991214337453559, + "grad_norm": 0.6156255602836609, + "learning_rate": 1.926015074870683e-06, + "loss": 0.2582, + "step": 21375 + }, + { + "epoch": 0.7993083627357058, + "grad_norm": 0.29384270310401917, + "learning_rate": 1.9225516099454456e-06, + "loss": 0.29, + "step": 21380 + }, + { + "epoch": 0.7994952917260556, + "grad_norm": 0.46407338976860046, + "learning_rate": 1.919090930700712e-06, + "loss": 0.2988, + "step": 21385 + }, + { + "epoch": 0.7996822207164054, + "grad_norm": 0.16256499290466309, + "learning_rate": 1.915633038329967e-06, + "loss": 0.3916, + "step": 21390 + }, + { + "epoch": 0.7998691497067552, + "grad_norm": 0.6829771399497986, + "learning_rate": 1.912177934025743e-06, + "loss": 0.2818, + "step": 21395 + }, + { + "epoch": 0.8000560786971049, + "grad_norm": 0.6164911389350891, + "learning_rate": 1.9087256189796012e-06, + "loss": 0.3556, + "step": 21400 + }, + { + "epoch": 0.8002430076874547, + "grad_norm": 0.5262479782104492, + "learning_rate": 1.9052760943821513e-06, + "loss": 0.2089, + "step": 21405 + }, + { + "epoch": 0.8004299366778045, + "grad_norm": 0.3013521730899811, + "learning_rate": 1.901829361423031e-06, + "loss": 0.2159, + "step": 21410 + }, + { + "epoch": 0.8006168656681543, + "grad_norm": 0.37884315848350525, + "learning_rate": 1.8983854212909247e-06, + "loss": 0.2776, + "step": 21415 + }, + { + "epoch": 0.8008037946585042, + "grad_norm": 0.27459797263145447, + "learning_rate": 1.894944275173547e-06, + "loss": 0.3337, + "step": 21420 + }, + { + "epoch": 0.8009907236488539, + "grad_norm": 0.6611989736557007, + "learning_rate": 1.8915059242576462e-06, + "loss": 0.2814, + "step": 21425 + }, + { + "epoch": 0.8011776526392037, + "grad_norm": 0.2187768518924713, + "learning_rate": 1.888070369729016e-06, + "loss": 0.2271, + "step": 21430 + }, + { + "epoch": 0.8013645816295535, + "grad_norm": 0.37497401237487793, + "learning_rate": 1.8846376127724775e-06, + "loss": 0.2489, + "step": 21435 + }, + { + "epoch": 0.8015515106199033, + "grad_norm": 0.44571053981781006, + "learning_rate": 1.881207654571895e-06, + "loss": 0.2394, + "step": 21440 + }, + { + "epoch": 0.801738439610253, + "grad_norm": 0.34269171953201294, + "learning_rate": 1.8777804963101553e-06, + "loss": 0.2689, + "step": 21445 + }, + { + "epoch": 0.8019253686006028, + "grad_norm": 0.3532237410545349, + "learning_rate": 1.8743561391691955e-06, + "loss": 0.2671, + "step": 21450 + }, + { + "epoch": 0.8021122975909526, + "grad_norm": 0.27271535992622375, + "learning_rate": 1.8709345843299708e-06, + "loss": 0.2663, + "step": 21455 + }, + { + "epoch": 0.8022992265813025, + "grad_norm": 0.3709903061389923, + "learning_rate": 1.867515832972484e-06, + "loss": 0.3473, + "step": 21460 + }, + { + "epoch": 0.8024861555716523, + "grad_norm": 0.5782517790794373, + "learning_rate": 1.864099886275761e-06, + "loss": 0.3205, + "step": 21465 + }, + { + "epoch": 0.802673084562002, + "grad_norm": 0.40941113233566284, + "learning_rate": 1.8606867454178612e-06, + "loss": 0.3808, + "step": 21470 + }, + { + "epoch": 0.8028600135523518, + "grad_norm": 0.29615747928619385, + "learning_rate": 1.8572764115758846e-06, + "loss": 0.3111, + "step": 21475 + }, + { + "epoch": 0.8030469425427016, + "grad_norm": 0.8128417134284973, + "learning_rate": 1.8538688859259534e-06, + "loss": 0.3365, + "step": 21480 + }, + { + "epoch": 0.8032338715330514, + "grad_norm": 0.5973396897315979, + "learning_rate": 1.850464169643229e-06, + "loss": 0.2514, + "step": 21485 + }, + { + "epoch": 0.8034208005234011, + "grad_norm": 0.46153488755226135, + "learning_rate": 1.8470622639018964e-06, + "loss": 0.3385, + "step": 21490 + }, + { + "epoch": 0.8036077295137509, + "grad_norm": 0.45193207263946533, + "learning_rate": 1.8436631698751806e-06, + "loss": 0.2695, + "step": 21495 + }, + { + "epoch": 0.8037946585041007, + "grad_norm": 0.5333691835403442, + "learning_rate": 1.840266888735326e-06, + "loss": 0.2737, + "step": 21500 + }, + { + "epoch": 0.8039815874944506, + "grad_norm": 0.46720612049102783, + "learning_rate": 1.8368734216536176e-06, + "loss": 0.2952, + "step": 21505 + }, + { + "epoch": 0.8041685164848004, + "grad_norm": 0.24607418477535248, + "learning_rate": 1.8334827698003644e-06, + "loss": 0.3072, + "step": 21510 + }, + { + "epoch": 0.8043554454751501, + "grad_norm": 0.3917045295238495, + "learning_rate": 1.8300949343449003e-06, + "loss": 0.2582, + "step": 21515 + }, + { + "epoch": 0.8045423744654999, + "grad_norm": 0.27164673805236816, + "learning_rate": 1.8267099164555978e-06, + "loss": 0.2381, + "step": 21520 + }, + { + "epoch": 0.8047293034558497, + "grad_norm": 0.7924597859382629, + "learning_rate": 1.8233277172998486e-06, + "loss": 0.2594, + "step": 21525 + }, + { + "epoch": 0.8049162324461995, + "grad_norm": 0.17336755990982056, + "learning_rate": 1.8199483380440808e-06, + "loss": 0.2524, + "step": 21530 + }, + { + "epoch": 0.8051031614365493, + "grad_norm": 0.5164810419082642, + "learning_rate": 1.8165717798537407e-06, + "loss": 0.31, + "step": 21535 + }, + { + "epoch": 0.805290090426899, + "grad_norm": 0.6605789065361023, + "learning_rate": 1.8131980438933117e-06, + "loss": 0.3017, + "step": 21540 + }, + { + "epoch": 0.8054770194172489, + "grad_norm": 0.31877684593200684, + "learning_rate": 1.809827131326294e-06, + "loss": 0.2414, + "step": 21545 + }, + { + "epoch": 0.8056639484075987, + "grad_norm": 0.4892379641532898, + "learning_rate": 1.8064590433152218e-06, + "loss": 0.2236, + "step": 21550 + }, + { + "epoch": 0.8058508773979485, + "grad_norm": 0.4304451644420624, + "learning_rate": 1.8030937810216486e-06, + "loss": 0.2741, + "step": 21555 + }, + { + "epoch": 0.8060378063882983, + "grad_norm": 0.5623677968978882, + "learning_rate": 1.7997313456061615e-06, + "loss": 0.2997, + "step": 21560 + }, + { + "epoch": 0.806224735378648, + "grad_norm": 0.24887755513191223, + "learning_rate": 1.7963717382283663e-06, + "loss": 0.2429, + "step": 21565 + }, + { + "epoch": 0.8064116643689978, + "grad_norm": 0.4200691282749176, + "learning_rate": 1.7930149600468927e-06, + "loss": 0.2047, + "step": 21570 + }, + { + "epoch": 0.8065985933593476, + "grad_norm": 0.5015040040016174, + "learning_rate": 1.7896610122194015e-06, + "loss": 0.3563, + "step": 21575 + }, + { + "epoch": 0.8067855223496974, + "grad_norm": 0.7050318121910095, + "learning_rate": 1.7863098959025692e-06, + "loss": 0.2453, + "step": 21580 + }, + { + "epoch": 0.8069724513400472, + "grad_norm": 0.39785218238830566, + "learning_rate": 1.7829616122521043e-06, + "loss": 0.2441, + "step": 21585 + }, + { + "epoch": 0.807159380330397, + "grad_norm": 0.5187550187110901, + "learning_rate": 1.7796161624227281e-06, + "loss": 0.3743, + "step": 21590 + }, + { + "epoch": 0.8073463093207468, + "grad_norm": 0.20182538032531738, + "learning_rate": 1.7762735475681947e-06, + "loss": 0.2975, + "step": 21595 + }, + { + "epoch": 0.8075332383110966, + "grad_norm": 0.5243845582008362, + "learning_rate": 1.7729337688412772e-06, + "loss": 0.2309, + "step": 21600 + }, + { + "epoch": 0.8077201673014464, + "grad_norm": 0.39806708693504333, + "learning_rate": 1.769596827393768e-06, + "loss": 0.3061, + "step": 21605 + }, + { + "epoch": 0.8079070962917961, + "grad_norm": 0.37377694249153137, + "learning_rate": 1.7662627243764808e-06, + "loss": 0.3372, + "step": 21610 + }, + { + "epoch": 0.8080940252821459, + "grad_norm": 0.4244140088558197, + "learning_rate": 1.7629314609392523e-06, + "loss": 0.2972, + "step": 21615 + }, + { + "epoch": 0.8082809542724957, + "grad_norm": 0.44188788533210754, + "learning_rate": 1.7596030382309436e-06, + "loss": 0.3291, + "step": 21620 + }, + { + "epoch": 0.8084678832628456, + "grad_norm": 0.37357720732688904, + "learning_rate": 1.7562774573994267e-06, + "loss": 0.2561, + "step": 21625 + }, + { + "epoch": 0.8086548122531954, + "grad_norm": 0.28990504145622253, + "learning_rate": 1.7529547195916052e-06, + "loss": 0.2949, + "step": 21630 + }, + { + "epoch": 0.8088417412435451, + "grad_norm": 0.49605655670166016, + "learning_rate": 1.7496348259533902e-06, + "loss": 0.3587, + "step": 21635 + }, + { + "epoch": 0.8090286702338949, + "grad_norm": 0.3280414044857025, + "learning_rate": 1.7463177776297202e-06, + "loss": 0.2554, + "step": 21640 + }, + { + "epoch": 0.8092155992242447, + "grad_norm": 0.4728814959526062, + "learning_rate": 1.7430035757645546e-06, + "loss": 0.3598, + "step": 21645 + }, + { + "epoch": 0.8094025282145945, + "grad_norm": 0.45704665780067444, + "learning_rate": 1.7396922215008628e-06, + "loss": 0.3224, + "step": 21650 + }, + { + "epoch": 0.8095894572049442, + "grad_norm": 0.47432267665863037, + "learning_rate": 1.7363837159806352e-06, + "loss": 0.276, + "step": 21655 + }, + { + "epoch": 0.809776386195294, + "grad_norm": 0.4984903037548065, + "learning_rate": 1.7330780603448794e-06, + "loss": 0.3783, + "step": 21660 + }, + { + "epoch": 0.8099633151856439, + "grad_norm": 0.4554220736026764, + "learning_rate": 1.7297752557336257e-06, + "loss": 0.2547, + "step": 21665 + }, + { + "epoch": 0.8101502441759937, + "grad_norm": 0.3859618008136749, + "learning_rate": 1.7264753032859115e-06, + "loss": 0.3227, + "step": 21670 + }, + { + "epoch": 0.8103371731663435, + "grad_norm": 0.2784779369831085, + "learning_rate": 1.7231782041398015e-06, + "loss": 0.3581, + "step": 21675 + }, + { + "epoch": 0.8105241021566932, + "grad_norm": 0.4222283363342285, + "learning_rate": 1.7198839594323658e-06, + "loss": 0.3204, + "step": 21680 + }, + { + "epoch": 0.810711031147043, + "grad_norm": 0.8711488842964172, + "learning_rate": 1.7165925702997e-06, + "loss": 0.3242, + "step": 21685 + }, + { + "epoch": 0.8108979601373928, + "grad_norm": 0.39160558581352234, + "learning_rate": 1.7133040378769039e-06, + "loss": 0.2511, + "step": 21690 + }, + { + "epoch": 0.8110848891277426, + "grad_norm": 1.2719933986663818, + "learning_rate": 1.7100183632981039e-06, + "loss": 0.3113, + "step": 21695 + }, + { + "epoch": 0.8112718181180923, + "grad_norm": 0.23821887373924255, + "learning_rate": 1.706735547696432e-06, + "loss": 0.3062, + "step": 21700 + }, + { + "epoch": 0.8114587471084422, + "grad_norm": 0.4011630117893219, + "learning_rate": 1.7034555922040351e-06, + "loss": 0.2491, + "step": 21705 + }, + { + "epoch": 0.811645676098792, + "grad_norm": 0.28980720043182373, + "learning_rate": 1.7001784979520808e-06, + "loss": 0.2797, + "step": 21710 + }, + { + "epoch": 0.8118326050891418, + "grad_norm": 0.41728466749191284, + "learning_rate": 1.6969042660707413e-06, + "loss": 0.3272, + "step": 21715 + }, + { + "epoch": 0.8120195340794916, + "grad_norm": 0.22036214172840118, + "learning_rate": 1.693632897689208e-06, + "loss": 0.2638, + "step": 21720 + }, + { + "epoch": 0.8122064630698413, + "grad_norm": 0.5138218402862549, + "learning_rate": 1.6903643939356784e-06, + "loss": 0.2993, + "step": 21725 + }, + { + "epoch": 0.8123933920601911, + "grad_norm": 0.6489676237106323, + "learning_rate": 1.6870987559373709e-06, + "loss": 0.2849, + "step": 21730 + }, + { + "epoch": 0.8125803210505409, + "grad_norm": 0.2285657376050949, + "learning_rate": 1.6838359848205055e-06, + "loss": 0.2543, + "step": 21735 + }, + { + "epoch": 0.8127672500408907, + "grad_norm": 0.29291707277297974, + "learning_rate": 1.6805760817103201e-06, + "loss": 0.2461, + "step": 21740 + }, + { + "epoch": 0.8129541790312405, + "grad_norm": 0.4174777865409851, + "learning_rate": 1.6773190477310652e-06, + "loss": 0.2149, + "step": 21745 + }, + { + "epoch": 0.8131411080215903, + "grad_norm": 0.2554204761981964, + "learning_rate": 1.6740648840059958e-06, + "loss": 0.2763, + "step": 21750 + }, + { + "epoch": 0.8133280370119401, + "grad_norm": 0.8167181611061096, + "learning_rate": 1.6708135916573797e-06, + "loss": 0.2746, + "step": 21755 + }, + { + "epoch": 0.8135149660022899, + "grad_norm": 0.304171085357666, + "learning_rate": 1.6675651718064922e-06, + "loss": 0.2495, + "step": 21760 + }, + { + "epoch": 0.8137018949926397, + "grad_norm": 0.39825260639190674, + "learning_rate": 1.6643196255736239e-06, + "loss": 0.2251, + "step": 21765 + }, + { + "epoch": 0.8138888239829895, + "grad_norm": 0.36910250782966614, + "learning_rate": 1.661076954078068e-06, + "loss": 0.3272, + "step": 21770 + }, + { + "epoch": 0.8140757529733392, + "grad_norm": 0.31329768896102905, + "learning_rate": 1.6578371584381326e-06, + "loss": 0.2188, + "step": 21775 + }, + { + "epoch": 0.814262681963689, + "grad_norm": 0.2181299477815628, + "learning_rate": 1.6546002397711247e-06, + "loss": 0.3479, + "step": 21780 + }, + { + "epoch": 0.8144496109540388, + "grad_norm": 0.8821444511413574, + "learning_rate": 1.6513661991933694e-06, + "loss": 0.2365, + "step": 21785 + }, + { + "epoch": 0.8146365399443887, + "grad_norm": 0.3816254436969757, + "learning_rate": 1.6481350378201954e-06, + "loss": 0.348, + "step": 21790 + }, + { + "epoch": 0.8148234689347384, + "grad_norm": 0.7494456171989441, + "learning_rate": 1.644906756765935e-06, + "loss": 0.2938, + "step": 21795 + }, + { + "epoch": 0.8150103979250882, + "grad_norm": 0.5195353031158447, + "learning_rate": 1.6416813571439305e-06, + "loss": 0.3243, + "step": 21800 + }, + { + "epoch": 0.815197326915438, + "grad_norm": 0.3925173282623291, + "learning_rate": 1.638458840066528e-06, + "loss": 0.2577, + "step": 21805 + }, + { + "epoch": 0.8153842559057878, + "grad_norm": 0.5360116958618164, + "learning_rate": 1.635239206645085e-06, + "loss": 0.3019, + "step": 21810 + }, + { + "epoch": 0.8155711848961376, + "grad_norm": 0.384461909532547, + "learning_rate": 1.632022457989958e-06, + "loss": 0.2346, + "step": 21815 + }, + { + "epoch": 0.8157581138864873, + "grad_norm": 0.3784307539463043, + "learning_rate": 1.6288085952105126e-06, + "loss": 0.2576, + "step": 21820 + }, + { + "epoch": 0.8159450428768371, + "grad_norm": 0.3324683606624603, + "learning_rate": 1.6255976194151168e-06, + "loss": 0.3859, + "step": 21825 + }, + { + "epoch": 0.816131971867187, + "grad_norm": 0.6262183785438538, + "learning_rate": 1.6223895317111449e-06, + "loss": 0.2211, + "step": 21830 + }, + { + "epoch": 0.8163189008575368, + "grad_norm": 0.3607613444328308, + "learning_rate": 1.6191843332049762e-06, + "loss": 0.2672, + "step": 21835 + }, + { + "epoch": 0.8165058298478866, + "grad_norm": 0.33100825548171997, + "learning_rate": 1.615982025001991e-06, + "loss": 0.2289, + "step": 21840 + }, + { + "epoch": 0.8166927588382363, + "grad_norm": 0.7562757134437561, + "learning_rate": 1.6127826082065723e-06, + "loss": 0.2542, + "step": 21845 + }, + { + "epoch": 0.8168796878285861, + "grad_norm": 0.359550803899765, + "learning_rate": 1.6095860839221055e-06, + "loss": 0.3022, + "step": 21850 + }, + { + "epoch": 0.8170666168189359, + "grad_norm": 0.8539102077484131, + "learning_rate": 1.6063924532509856e-06, + "loss": 0.3232, + "step": 21855 + }, + { + "epoch": 0.8172535458092857, + "grad_norm": 0.23160411417484283, + "learning_rate": 1.603201717294599e-06, + "loss": 0.2307, + "step": 21860 + }, + { + "epoch": 0.8174404747996354, + "grad_norm": 0.5298076272010803, + "learning_rate": 1.6000138771533424e-06, + "loss": 0.223, + "step": 21865 + }, + { + "epoch": 0.8176274037899853, + "grad_norm": 0.5675511360168457, + "learning_rate": 1.5968289339266084e-06, + "loss": 0.3385, + "step": 21870 + }, + { + "epoch": 0.8178143327803351, + "grad_norm": 0.8937890529632568, + "learning_rate": 1.5936468887127932e-06, + "loss": 0.3094, + "step": 21875 + }, + { + "epoch": 0.8180012617706849, + "grad_norm": 0.418905645608902, + "learning_rate": 1.5904677426092964e-06, + "loss": 0.3251, + "step": 21880 + }, + { + "epoch": 0.8181881907610347, + "grad_norm": 0.26117852330207825, + "learning_rate": 1.587291496712512e-06, + "loss": 0.3468, + "step": 21885 + }, + { + "epoch": 0.8183751197513844, + "grad_norm": 0.9076757431030273, + "learning_rate": 1.584118152117835e-06, + "loss": 0.26, + "step": 21890 + }, + { + "epoch": 0.8185620487417342, + "grad_norm": 0.46946465969085693, + "learning_rate": 1.5809477099196592e-06, + "loss": 0.3238, + "step": 21895 + }, + { + "epoch": 0.818748977732084, + "grad_norm": 0.28860294818878174, + "learning_rate": 1.577780171211385e-06, + "loss": 0.2435, + "step": 21900 + }, + { + "epoch": 0.8189359067224338, + "grad_norm": 0.24636733531951904, + "learning_rate": 1.5746155370853998e-06, + "loss": 0.2566, + "step": 21905 + }, + { + "epoch": 0.8191228357127837, + "grad_norm": 0.12167595326900482, + "learning_rate": 1.571453808633101e-06, + "loss": 0.3848, + "step": 21910 + }, + { + "epoch": 0.8193097647031334, + "grad_norm": 0.6894955039024353, + "learning_rate": 1.5682949869448715e-06, + "loss": 0.3028, + "step": 21915 + }, + { + "epoch": 0.8194966936934832, + "grad_norm": 0.3803219199180603, + "learning_rate": 1.5651390731101035e-06, + "loss": 0.2547, + "step": 21920 + }, + { + "epoch": 0.819683622683833, + "grad_norm": 0.28584155440330505, + "learning_rate": 1.5619860682171817e-06, + "loss": 0.2718, + "step": 21925 + }, + { + "epoch": 0.8198705516741828, + "grad_norm": 0.5630354285240173, + "learning_rate": 1.558835973353483e-06, + "loss": 0.2787, + "step": 21930 + }, + { + "epoch": 0.8200574806645325, + "grad_norm": 0.2100723534822464, + "learning_rate": 1.5556887896053896e-06, + "loss": 0.3048, + "step": 21935 + }, + { + "epoch": 0.8202444096548823, + "grad_norm": 0.38809457421302795, + "learning_rate": 1.5525445180582721e-06, + "loss": 0.2565, + "step": 21940 + }, + { + "epoch": 0.8204313386452321, + "grad_norm": 0.4679727554321289, + "learning_rate": 1.549403159796501e-06, + "loss": 0.2861, + "step": 21945 + }, + { + "epoch": 0.820618267635582, + "grad_norm": 0.39624401926994324, + "learning_rate": 1.5462647159034362e-06, + "loss": 0.2531, + "step": 21950 + }, + { + "epoch": 0.8208051966259318, + "grad_norm": 0.32841363549232483, + "learning_rate": 1.543129187461444e-06, + "loss": 0.2593, + "step": 21955 + }, + { + "epoch": 0.8209921256162815, + "grad_norm": 0.3810194134712219, + "learning_rate": 1.539996575551872e-06, + "loss": 0.2991, + "step": 21960 + }, + { + "epoch": 0.8211790546066313, + "grad_norm": 0.37340226769447327, + "learning_rate": 1.5368668812550724e-06, + "loss": 0.2569, + "step": 21965 + }, + { + "epoch": 0.8213659835969811, + "grad_norm": 0.511825680732727, + "learning_rate": 1.5337401056503876e-06, + "loss": 0.2649, + "step": 21970 + }, + { + "epoch": 0.8215529125873309, + "grad_norm": 0.5602962374687195, + "learning_rate": 1.5306162498161493e-06, + "loss": 0.2674, + "step": 21975 + }, + { + "epoch": 0.8217398415776807, + "grad_norm": 0.27479755878448486, + "learning_rate": 1.52749531482969e-06, + "loss": 0.304, + "step": 21980 + }, + { + "epoch": 0.8219267705680304, + "grad_norm": 0.3318054676055908, + "learning_rate": 1.5243773017673292e-06, + "loss": 0.2914, + "step": 21985 + }, + { + "epoch": 0.8221136995583802, + "grad_norm": 0.44144922494888306, + "learning_rate": 1.5212622117043784e-06, + "loss": 0.262, + "step": 21990 + }, + { + "epoch": 0.8223006285487301, + "grad_norm": 0.8043171763420105, + "learning_rate": 1.5181500457151432e-06, + "loss": 0.3467, + "step": 21995 + }, + { + "epoch": 0.8224875575390799, + "grad_norm": 0.4182523787021637, + "learning_rate": 1.5150408048729226e-06, + "loss": 0.3729, + "step": 22000 + }, + { + "epoch": 0.8226744865294296, + "grad_norm": 0.22673340141773224, + "learning_rate": 1.5119344902500022e-06, + "loss": 0.3049, + "step": 22005 + }, + { + "epoch": 0.8228614155197794, + "grad_norm": 0.3719673454761505, + "learning_rate": 1.5088311029176628e-06, + "loss": 0.2519, + "step": 22010 + }, + { + "epoch": 0.8230483445101292, + "grad_norm": 0.3984321355819702, + "learning_rate": 1.5057306439461738e-06, + "loss": 0.2315, + "step": 22015 + }, + { + "epoch": 0.823235273500479, + "grad_norm": 0.24741186201572418, + "learning_rate": 1.5026331144047935e-06, + "loss": 0.3394, + "step": 22020 + }, + { + "epoch": 0.8234222024908288, + "grad_norm": 0.43265968561172485, + "learning_rate": 1.4995385153617725e-06, + "loss": 0.3895, + "step": 22025 + }, + { + "epoch": 0.8236091314811785, + "grad_norm": 0.34110236167907715, + "learning_rate": 1.4964468478843496e-06, + "loss": 0.2754, + "step": 22030 + }, + { + "epoch": 0.8237960604715284, + "grad_norm": 0.21157729625701904, + "learning_rate": 1.4933581130387509e-06, + "loss": 0.253, + "step": 22035 + }, + { + "epoch": 0.8239829894618782, + "grad_norm": 0.5705170035362244, + "learning_rate": 1.4902723118901907e-06, + "loss": 0.2533, + "step": 22040 + }, + { + "epoch": 0.824169918452228, + "grad_norm": 0.41971489787101746, + "learning_rate": 1.4871894455028778e-06, + "loss": 0.2568, + "step": 22045 + }, + { + "epoch": 0.8243568474425778, + "grad_norm": 0.5276783108711243, + "learning_rate": 1.4841095149399998e-06, + "loss": 0.3132, + "step": 22050 + }, + { + "epoch": 0.8245437764329275, + "grad_norm": 0.2946646213531494, + "learning_rate": 1.481032521263739e-06, + "loss": 0.304, + "step": 22055 + }, + { + "epoch": 0.8247307054232773, + "grad_norm": 0.6871627569198608, + "learning_rate": 1.4779584655352652e-06, + "loss": 0.322, + "step": 22060 + }, + { + "epoch": 0.8249176344136271, + "grad_norm": 0.37137484550476074, + "learning_rate": 1.4748873488147264e-06, + "loss": 0.3303, + "step": 22065 + }, + { + "epoch": 0.8251045634039769, + "grad_norm": 0.34239670634269714, + "learning_rate": 1.4718191721612684e-06, + "loss": 0.297, + "step": 22070 + }, + { + "epoch": 0.8252914923943268, + "grad_norm": 0.6226434707641602, + "learning_rate": 1.468753936633014e-06, + "loss": 0.3276, + "step": 22075 + }, + { + "epoch": 0.8254784213846765, + "grad_norm": 0.5200907588005066, + "learning_rate": 1.4656916432870737e-06, + "loss": 0.2458, + "step": 22080 + }, + { + "epoch": 0.8256653503750263, + "grad_norm": 0.513783872127533, + "learning_rate": 1.4626322931795489e-06, + "loss": 0.2929, + "step": 22085 + }, + { + "epoch": 0.8258522793653761, + "grad_norm": 3.522709846496582, + "learning_rate": 1.4595758873655198e-06, + "loss": 0.2378, + "step": 22090 + }, + { + "epoch": 0.8260392083557259, + "grad_norm": 0.12881094217300415, + "learning_rate": 1.4565224268990507e-06, + "loss": 0.2293, + "step": 22095 + }, + { + "epoch": 0.8262261373460756, + "grad_norm": 0.5438992381095886, + "learning_rate": 1.4534719128331953e-06, + "loss": 0.2544, + "step": 22100 + }, + { + "epoch": 0.8264130663364254, + "grad_norm": 0.28744834661483765, + "learning_rate": 1.4504243462199896e-06, + "loss": 0.2217, + "step": 22105 + }, + { + "epoch": 0.8265999953267752, + "grad_norm": 0.39146536588668823, + "learning_rate": 1.4473797281104485e-06, + "loss": 0.2257, + "step": 22110 + }, + { + "epoch": 0.8267869243171251, + "grad_norm": 0.6850770115852356, + "learning_rate": 1.4443380595545787e-06, + "loss": 0.3273, + "step": 22115 + }, + { + "epoch": 0.8269738533074749, + "grad_norm": 0.3585718870162964, + "learning_rate": 1.4412993416013588e-06, + "loss": 0.2597, + "step": 22120 + }, + { + "epoch": 0.8271607822978246, + "grad_norm": 0.6018190979957581, + "learning_rate": 1.4382635752987606e-06, + "loss": 0.2203, + "step": 22125 + }, + { + "epoch": 0.8273477112881744, + "grad_norm": 0.48121556639671326, + "learning_rate": 1.43523076169373e-06, + "loss": 0.2179, + "step": 22130 + }, + { + "epoch": 0.8275346402785242, + "grad_norm": 0.3579247295856476, + "learning_rate": 1.432200901832198e-06, + "loss": 0.2088, + "step": 22135 + }, + { + "epoch": 0.827721569268874, + "grad_norm": 0.58817058801651, + "learning_rate": 1.4291739967590746e-06, + "loss": 0.2333, + "step": 22140 + }, + { + "epoch": 0.8279084982592237, + "grad_norm": 0.44945281744003296, + "learning_rate": 1.4261500475182543e-06, + "loss": 0.2591, + "step": 22145 + }, + { + "epoch": 0.8280954272495735, + "grad_norm": 0.6072959899902344, + "learning_rate": 1.4231290551526133e-06, + "loss": 0.2795, + "step": 22150 + }, + { + "epoch": 0.8282823562399234, + "grad_norm": 0.5549390912055969, + "learning_rate": 1.4201110207039993e-06, + "loss": 0.2617, + "step": 22155 + }, + { + "epoch": 0.8284692852302732, + "grad_norm": 0.419281929731369, + "learning_rate": 1.4170959452132526e-06, + "loss": 0.2794, + "step": 22160 + }, + { + "epoch": 0.828656214220623, + "grad_norm": 0.5446984171867371, + "learning_rate": 1.414083829720181e-06, + "loss": 0.2595, + "step": 22165 + }, + { + "epoch": 0.8288431432109727, + "grad_norm": 0.562941312789917, + "learning_rate": 1.4110746752635806e-06, + "loss": 0.3615, + "step": 22170 + }, + { + "epoch": 0.8290300722013225, + "grad_norm": 0.42342409491539, + "learning_rate": 1.4080684828812219e-06, + "loss": 0.2564, + "step": 22175 + }, + { + "epoch": 0.8292170011916723, + "grad_norm": 0.3505541682243347, + "learning_rate": 1.4050652536098518e-06, + "loss": 0.2427, + "step": 22180 + }, + { + "epoch": 0.8294039301820221, + "grad_norm": 0.5046654939651489, + "learning_rate": 1.4020649884851988e-06, + "loss": 0.2361, + "step": 22185 + }, + { + "epoch": 0.8295908591723719, + "grad_norm": 0.46937140822410583, + "learning_rate": 1.3990676885419685e-06, + "loss": 0.2695, + "step": 22190 + }, + { + "epoch": 0.8297777881627217, + "grad_norm": 1.1296278238296509, + "learning_rate": 1.3960733548138472e-06, + "loss": 0.3084, + "step": 22195 + }, + { + "epoch": 0.8299647171530715, + "grad_norm": 0.5058994889259338, + "learning_rate": 1.3930819883334901e-06, + "loss": 0.2994, + "step": 22200 + }, + { + "epoch": 0.8301516461434213, + "grad_norm": 0.26385819911956787, + "learning_rate": 1.3900935901325374e-06, + "loss": 0.2441, + "step": 22205 + }, + { + "epoch": 0.8303385751337711, + "grad_norm": 0.5458356738090515, + "learning_rate": 1.3871081612415982e-06, + "loss": 0.391, + "step": 22210 + }, + { + "epoch": 0.8305255041241209, + "grad_norm": 0.28343480825424194, + "learning_rate": 1.3841257026902665e-06, + "loss": 0.2884, + "step": 22215 + }, + { + "epoch": 0.8307124331144706, + "grad_norm": 0.3122907876968384, + "learning_rate": 1.3811462155071043e-06, + "loss": 0.2776, + "step": 22220 + }, + { + "epoch": 0.8308993621048204, + "grad_norm": 0.7874910831451416, + "learning_rate": 1.3781697007196493e-06, + "loss": 0.2769, + "step": 22225 + }, + { + "epoch": 0.8310862910951702, + "grad_norm": 0.5420087575912476, + "learning_rate": 1.3751961593544171e-06, + "loss": 0.2888, + "step": 22230 + }, + { + "epoch": 0.83127322008552, + "grad_norm": 0.2580297887325287, + "learning_rate": 1.3722255924368965e-06, + "loss": 0.3204, + "step": 22235 + }, + { + "epoch": 0.8314601490758698, + "grad_norm": 0.40570923686027527, + "learning_rate": 1.3692580009915557e-06, + "loss": 0.2792, + "step": 22240 + }, + { + "epoch": 0.8316470780662196, + "grad_norm": 0.3946312963962555, + "learning_rate": 1.3662933860418249e-06, + "loss": 0.2946, + "step": 22245 + }, + { + "epoch": 0.8318340070565694, + "grad_norm": 0.26744893193244934, + "learning_rate": 1.3633317486101205e-06, + "loss": 0.2792, + "step": 22250 + }, + { + "epoch": 0.8320209360469192, + "grad_norm": 0.41380831599235535, + "learning_rate": 1.3603730897178226e-06, + "loss": 0.2922, + "step": 22255 + }, + { + "epoch": 0.832207865037269, + "grad_norm": 0.27136343717575073, + "learning_rate": 1.3574174103852922e-06, + "loss": 0.315, + "step": 22260 + }, + { + "epoch": 0.8323947940276187, + "grad_norm": 0.6136317849159241, + "learning_rate": 1.3544647116318522e-06, + "loss": 0.2856, + "step": 22265 + }, + { + "epoch": 0.8325817230179685, + "grad_norm": 0.42765137553215027, + "learning_rate": 1.3515149944758098e-06, + "loss": 0.208, + "step": 22270 + }, + { + "epoch": 0.8327686520083183, + "grad_norm": 0.41445282101631165, + "learning_rate": 1.3485682599344351e-06, + "loss": 0.3367, + "step": 22275 + }, + { + "epoch": 0.8329555809986682, + "grad_norm": 0.2044340819120407, + "learning_rate": 1.3456245090239706e-06, + "loss": 0.2173, + "step": 22280 + }, + { + "epoch": 0.833142509989018, + "grad_norm": 0.3924507796764374, + "learning_rate": 1.3426837427596363e-06, + "loss": 0.3163, + "step": 22285 + }, + { + "epoch": 0.8333294389793677, + "grad_norm": 0.6259861588478088, + "learning_rate": 1.339745962155613e-06, + "loss": 0.3531, + "step": 22290 + }, + { + "epoch": 0.8335163679697175, + "grad_norm": 0.6004974246025085, + "learning_rate": 1.336811168225063e-06, + "loss": 0.2664, + "step": 22295 + }, + { + "epoch": 0.8337032969600673, + "grad_norm": 0.3294597268104553, + "learning_rate": 1.333879361980106e-06, + "loss": 0.2862, + "step": 22300 + }, + { + "epoch": 0.8338902259504171, + "grad_norm": 0.8053619861602783, + "learning_rate": 1.3309505444318439e-06, + "loss": 0.2701, + "step": 22305 + }, + { + "epoch": 0.8340771549407668, + "grad_norm": 0.47687965631484985, + "learning_rate": 1.328024716590336e-06, + "loss": 0.2936, + "step": 22310 + }, + { + "epoch": 0.8342640839311166, + "grad_norm": 0.2641568183898926, + "learning_rate": 1.3251018794646232e-06, + "loss": 0.2435, + "step": 22315 + }, + { + "epoch": 0.8344510129214665, + "grad_norm": 0.3692987859249115, + "learning_rate": 1.3221820340627044e-06, + "loss": 0.2775, + "step": 22320 + }, + { + "epoch": 0.8346379419118163, + "grad_norm": 0.6120216846466064, + "learning_rate": 1.319265181391549e-06, + "loss": 0.3437, + "step": 22325 + }, + { + "epoch": 0.8348248709021661, + "grad_norm": 0.5429954528808594, + "learning_rate": 1.3163513224571012e-06, + "loss": 0.3351, + "step": 22330 + }, + { + "epoch": 0.8350117998925158, + "grad_norm": 0.5047636032104492, + "learning_rate": 1.3134404582642612e-06, + "loss": 0.3325, + "step": 22335 + }, + { + "epoch": 0.8351987288828656, + "grad_norm": 0.5833032131195068, + "learning_rate": 1.3105325898169075e-06, + "loss": 0.2929, + "step": 22340 + }, + { + "epoch": 0.8353856578732154, + "grad_norm": 0.35585522651672363, + "learning_rate": 1.3076277181178775e-06, + "loss": 0.246, + "step": 22345 + }, + { + "epoch": 0.8355725868635652, + "grad_norm": 0.4089027941226959, + "learning_rate": 1.3047258441689815e-06, + "loss": 0.3233, + "step": 22350 + }, + { + "epoch": 0.835759515853915, + "grad_norm": 0.4178864657878876, + "learning_rate": 1.3018269689709883e-06, + "loss": 0.3107, + "step": 22355 + }, + { + "epoch": 0.8359464448442648, + "grad_norm": 0.6142130494117737, + "learning_rate": 1.2989310935236421e-06, + "loss": 0.2829, + "step": 22360 + }, + { + "epoch": 0.8361333738346146, + "grad_norm": 0.3945229947566986, + "learning_rate": 1.2960382188256438e-06, + "loss": 0.2744, + "step": 22365 + }, + { + "epoch": 0.8363203028249644, + "grad_norm": 0.3515075147151947, + "learning_rate": 1.2931483458746618e-06, + "loss": 0.2536, + "step": 22370 + }, + { + "epoch": 0.8365072318153142, + "grad_norm": 0.31164297461509705, + "learning_rate": 1.2902614756673348e-06, + "loss": 0.2761, + "step": 22375 + }, + { + "epoch": 0.836694160805664, + "grad_norm": 0.24446457624435425, + "learning_rate": 1.2873776091992574e-06, + "loss": 0.3083, + "step": 22380 + }, + { + "epoch": 0.8368810897960137, + "grad_norm": 0.8027161955833435, + "learning_rate": 1.284496747464996e-06, + "loss": 0.314, + "step": 22385 + }, + { + "epoch": 0.8370680187863635, + "grad_norm": 0.2881372570991516, + "learning_rate": 1.281618891458073e-06, + "loss": 0.2548, + "step": 22390 + }, + { + "epoch": 0.8372549477767133, + "grad_norm": 0.6596559286117554, + "learning_rate": 1.278744042170984e-06, + "loss": 0.2581, + "step": 22395 + }, + { + "epoch": 0.8374418767670632, + "grad_norm": 0.5899741649627686, + "learning_rate": 1.2758722005951773e-06, + "loss": 0.3011, + "step": 22400 + }, + { + "epoch": 0.8376288057574129, + "grad_norm": 0.2703937888145447, + "learning_rate": 1.273003367721073e-06, + "loss": 0.252, + "step": 22405 + }, + { + "epoch": 0.8378157347477627, + "grad_norm": 0.36412695050239563, + "learning_rate": 1.2701375445380459e-06, + "loss": 0.2738, + "step": 22410 + }, + { + "epoch": 0.8380026637381125, + "grad_norm": 0.35132333636283875, + "learning_rate": 1.2672747320344359e-06, + "loss": 0.2776, + "step": 22415 + }, + { + "epoch": 0.8381895927284623, + "grad_norm": 0.4513058662414551, + "learning_rate": 1.2644149311975494e-06, + "loss": 0.2587, + "step": 22420 + }, + { + "epoch": 0.838376521718812, + "grad_norm": 0.3663261830806732, + "learning_rate": 1.2615581430136449e-06, + "loss": 0.2893, + "step": 22425 + }, + { + "epoch": 0.8385634507091618, + "grad_norm": 0.7394602298736572, + "learning_rate": 1.25870436846795e-06, + "loss": 0.2527, + "step": 22430 + }, + { + "epoch": 0.8387503796995116, + "grad_norm": 0.3970726728439331, + "learning_rate": 1.255853608544647e-06, + "loss": 0.2582, + "step": 22435 + }, + { + "epoch": 0.8389373086898615, + "grad_norm": 0.6118816137313843, + "learning_rate": 1.253005864226885e-06, + "loss": 0.3117, + "step": 22440 + }, + { + "epoch": 0.8391242376802113, + "grad_norm": 0.561377763748169, + "learning_rate": 1.2501611364967647e-06, + "loss": 0.2964, + "step": 22445 + }, + { + "epoch": 0.839311166670561, + "grad_norm": 0.41377291083335876, + "learning_rate": 1.247319426335356e-06, + "loss": 0.2623, + "step": 22450 + }, + { + "epoch": 0.8394980956609108, + "grad_norm": 0.6088568568229675, + "learning_rate": 1.2444807347226795e-06, + "loss": 0.2458, + "step": 22455 + }, + { + "epoch": 0.8396850246512606, + "grad_norm": 0.2927430272102356, + "learning_rate": 1.241645062637723e-06, + "loss": 0.2587, + "step": 22460 + }, + { + "epoch": 0.8398719536416104, + "grad_norm": 0.4997739791870117, + "learning_rate": 1.2388124110584255e-06, + "loss": 0.2811, + "step": 22465 + }, + { + "epoch": 0.8400588826319602, + "grad_norm": 0.5061297416687012, + "learning_rate": 1.2359827809616843e-06, + "loss": 0.2929, + "step": 22470 + }, + { + "epoch": 0.8402458116223099, + "grad_norm": 0.5074501037597656, + "learning_rate": 1.2331561733233644e-06, + "loss": 0.3264, + "step": 22475 + }, + { + "epoch": 0.8404327406126597, + "grad_norm": 0.7985924482345581, + "learning_rate": 1.230332589118276e-06, + "loss": 0.3159, + "step": 22480 + }, + { + "epoch": 0.8406196696030096, + "grad_norm": 0.39413031935691833, + "learning_rate": 1.2275120293201969e-06, + "loss": 0.2703, + "step": 22485 + }, + { + "epoch": 0.8408065985933594, + "grad_norm": 0.5623971819877625, + "learning_rate": 1.2246944949018525e-06, + "loss": 0.3326, + "step": 22490 + }, + { + "epoch": 0.8409935275837092, + "grad_norm": 0.3700483739376068, + "learning_rate": 1.2218799868349362e-06, + "loss": 0.3001, + "step": 22495 + }, + { + "epoch": 0.8411804565740589, + "grad_norm": 0.42255735397338867, + "learning_rate": 1.2190685060900843e-06, + "loss": 0.3042, + "step": 22500 + }, + { + "epoch": 0.8413673855644087, + "grad_norm": 0.38053280115127563, + "learning_rate": 1.2162600536369018e-06, + "loss": 0.2707, + "step": 22505 + }, + { + "epoch": 0.8415543145547585, + "grad_norm": 0.42679354548454285, + "learning_rate": 1.2134546304439398e-06, + "loss": 0.2857, + "step": 22510 + }, + { + "epoch": 0.8417412435451083, + "grad_norm": 0.20769809186458588, + "learning_rate": 1.2106522374787078e-06, + "loss": 0.3186, + "step": 22515 + }, + { + "epoch": 0.841928172535458, + "grad_norm": 0.32969167828559875, + "learning_rate": 1.2078528757076746e-06, + "loss": 0.3293, + "step": 22520 + }, + { + "epoch": 0.8421151015258079, + "grad_norm": 0.40005597472190857, + "learning_rate": 1.205056546096256e-06, + "loss": 0.2552, + "step": 22525 + }, + { + "epoch": 0.8423020305161577, + "grad_norm": 0.500026524066925, + "learning_rate": 1.2022632496088294e-06, + "loss": 0.3076, + "step": 22530 + }, + { + "epoch": 0.8424889595065075, + "grad_norm": 0.34214553236961365, + "learning_rate": 1.1994729872087185e-06, + "loss": 0.2829, + "step": 22535 + }, + { + "epoch": 0.8426758884968573, + "grad_norm": 0.35944414138793945, + "learning_rate": 1.1966857598582104e-06, + "loss": 0.2663, + "step": 22540 + }, + { + "epoch": 0.842862817487207, + "grad_norm": 0.8880605101585388, + "learning_rate": 1.193901568518534e-06, + "loss": 0.3047, + "step": 22545 + }, + { + "epoch": 0.8430497464775568, + "grad_norm": 0.316691517829895, + "learning_rate": 1.1911204141498821e-06, + "loss": 0.2996, + "step": 22550 + }, + { + "epoch": 0.8432366754679066, + "grad_norm": 0.49218663573265076, + "learning_rate": 1.1883422977113935e-06, + "loss": 0.2315, + "step": 22555 + }, + { + "epoch": 0.8434236044582564, + "grad_norm": 0.27020570635795593, + "learning_rate": 1.1855672201611578e-06, + "loss": 0.2329, + "step": 22560 + }, + { + "epoch": 0.8436105334486063, + "grad_norm": 0.321402907371521, + "learning_rate": 1.1827951824562245e-06, + "loss": 0.3207, + "step": 22565 + }, + { + "epoch": 0.843797462438956, + "grad_norm": 0.8666585087776184, + "learning_rate": 1.1800261855525862e-06, + "loss": 0.2414, + "step": 22570 + }, + { + "epoch": 0.8439843914293058, + "grad_norm": 0.41892504692077637, + "learning_rate": 1.177260230405194e-06, + "loss": 0.2862, + "step": 22575 + }, + { + "epoch": 0.8441713204196556, + "grad_norm": 0.4117008149623871, + "learning_rate": 1.1744973179679431e-06, + "loss": 0.311, + "step": 22580 + }, + { + "epoch": 0.8443582494100054, + "grad_norm": 0.2825338840484619, + "learning_rate": 1.171737449193686e-06, + "loss": 0.3535, + "step": 22585 + }, + { + "epoch": 0.8445451784003551, + "grad_norm": 0.5039531588554382, + "learning_rate": 1.1689806250342196e-06, + "loss": 0.266, + "step": 22590 + }, + { + "epoch": 0.8447321073907049, + "grad_norm": 0.8699703216552734, + "learning_rate": 1.166226846440297e-06, + "loss": 0.2462, + "step": 22595 + }, + { + "epoch": 0.8449190363810547, + "grad_norm": 0.4708864986896515, + "learning_rate": 1.1634761143616159e-06, + "loss": 0.2971, + "step": 22600 + }, + { + "epoch": 0.8451059653714046, + "grad_norm": 0.34658095240592957, + "learning_rate": 1.1607284297468214e-06, + "loss": 0.2854, + "step": 22605 + }, + { + "epoch": 0.8452928943617544, + "grad_norm": 0.2925431430339813, + "learning_rate": 1.1579837935435168e-06, + "loss": 0.2688, + "step": 22610 + }, + { + "epoch": 0.8454798233521041, + "grad_norm": 0.48946163058280945, + "learning_rate": 1.1552422066982437e-06, + "loss": 0.2699, + "step": 22615 + }, + { + "epoch": 0.8456667523424539, + "grad_norm": 0.2582431733608246, + "learning_rate": 1.1525036701565018e-06, + "loss": 0.2689, + "step": 22620 + }, + { + "epoch": 0.8458536813328037, + "grad_norm": 0.26818692684173584, + "learning_rate": 1.1497681848627284e-06, + "loss": 0.2724, + "step": 22625 + }, + { + "epoch": 0.8460406103231535, + "grad_norm": 0.26108282804489136, + "learning_rate": 1.1470357517603192e-06, + "loss": 0.2324, + "step": 22630 + }, + { + "epoch": 0.8462275393135033, + "grad_norm": 0.3785153925418854, + "learning_rate": 1.1443063717916081e-06, + "loss": 0.2595, + "step": 22635 + }, + { + "epoch": 0.846414468303853, + "grad_norm": 0.4359034597873688, + "learning_rate": 1.141580045897881e-06, + "loss": 0.2702, + "step": 22640 + }, + { + "epoch": 0.8466013972942029, + "grad_norm": 0.41941553354263306, + "learning_rate": 1.1388567750193725e-06, + "loss": 0.2944, + "step": 22645 + }, + { + "epoch": 0.8467883262845527, + "grad_norm": 0.2629951536655426, + "learning_rate": 1.1361365600952589e-06, + "loss": 0.2684, + "step": 22650 + }, + { + "epoch": 0.8469752552749025, + "grad_norm": 0.32962939143180847, + "learning_rate": 1.1334194020636635e-06, + "loss": 0.3649, + "step": 22655 + }, + { + "epoch": 0.8471621842652522, + "grad_norm": 0.6077668070793152, + "learning_rate": 1.1307053018616543e-06, + "loss": 0.2784, + "step": 22660 + }, + { + "epoch": 0.847349113255602, + "grad_norm": 0.2332148402929306, + "learning_rate": 1.1279942604252514e-06, + "loss": 0.2693, + "step": 22665 + }, + { + "epoch": 0.8475360422459518, + "grad_norm": 0.3998556435108185, + "learning_rate": 1.1252862786894103e-06, + "loss": 0.241, + "step": 22670 + }, + { + "epoch": 0.8477229712363016, + "grad_norm": 0.6394714117050171, + "learning_rate": 1.1225813575880417e-06, + "loss": 0.3319, + "step": 22675 + }, + { + "epoch": 0.8479099002266514, + "grad_norm": 0.271589457988739, + "learning_rate": 1.1198794980539908e-06, + "loss": 0.2994, + "step": 22680 + }, + { + "epoch": 0.8480968292170012, + "grad_norm": 0.35430166125297546, + "learning_rate": 1.1171807010190528e-06, + "loss": 0.2702, + "step": 22685 + }, + { + "epoch": 0.848283758207351, + "grad_norm": 0.5060386061668396, + "learning_rate": 1.114484967413969e-06, + "loss": 0.3112, + "step": 22690 + }, + { + "epoch": 0.8484706871977008, + "grad_norm": 0.3983868956565857, + "learning_rate": 1.1117922981684172e-06, + "loss": 0.2696, + "step": 22695 + }, + { + "epoch": 0.8486576161880506, + "grad_norm": 0.569534957408905, + "learning_rate": 1.1091026942110217e-06, + "loss": 0.2857, + "step": 22700 + }, + { + "epoch": 0.8488445451784004, + "grad_norm": 0.9335552453994751, + "learning_rate": 1.1064161564693486e-06, + "loss": 0.2731, + "step": 22705 + }, + { + "epoch": 0.8490314741687501, + "grad_norm": 0.36156487464904785, + "learning_rate": 1.1037326858699126e-06, + "loss": 0.2699, + "step": 22710 + }, + { + "epoch": 0.8492184031590999, + "grad_norm": 0.31813082098960876, + "learning_rate": 1.101052283338161e-06, + "loss": 0.199, + "step": 22715 + }, + { + "epoch": 0.8494053321494497, + "grad_norm": 0.42129233479499817, + "learning_rate": 1.0983749497984908e-06, + "loss": 0.2301, + "step": 22720 + }, + { + "epoch": 0.8495922611397995, + "grad_norm": 0.39161425828933716, + "learning_rate": 1.0957006861742348e-06, + "loss": 0.2858, + "step": 22725 + }, + { + "epoch": 0.8497791901301494, + "grad_norm": 0.3193294107913971, + "learning_rate": 1.093029493387673e-06, + "loss": 0.2037, + "step": 22730 + }, + { + "epoch": 0.8499661191204991, + "grad_norm": 0.3217445909976959, + "learning_rate": 1.0903613723600225e-06, + "loss": 0.288, + "step": 22735 + }, + { + "epoch": 0.8501530481108489, + "grad_norm": 0.3758181035518646, + "learning_rate": 1.0876963240114413e-06, + "loss": 0.1978, + "step": 22740 + }, + { + "epoch": 0.8503399771011987, + "grad_norm": 0.4037764370441437, + "learning_rate": 1.0850343492610282e-06, + "loss": 0.2588, + "step": 22745 + }, + { + "epoch": 0.8505269060915485, + "grad_norm": 0.655259907245636, + "learning_rate": 1.0823754490268213e-06, + "loss": 0.2726, + "step": 22750 + }, + { + "epoch": 0.8507138350818982, + "grad_norm": 0.20327606797218323, + "learning_rate": 1.0797196242258002e-06, + "loss": 0.2396, + "step": 22755 + }, + { + "epoch": 0.850900764072248, + "grad_norm": 0.3590863049030304, + "learning_rate": 1.0770668757738811e-06, + "loss": 0.2711, + "step": 22760 + }, + { + "epoch": 0.8510876930625978, + "grad_norm": 0.5582608580589294, + "learning_rate": 1.0744172045859236e-06, + "loss": 0.2555, + "step": 22765 + }, + { + "epoch": 0.8512746220529477, + "grad_norm": 0.9252792596817017, + "learning_rate": 1.071770611575721e-06, + "loss": 0.3225, + "step": 22770 + }, + { + "epoch": 0.8514615510432975, + "grad_norm": 0.48875078558921814, + "learning_rate": 1.0691270976560075e-06, + "loss": 0.2456, + "step": 22775 + }, + { + "epoch": 0.8516484800336472, + "grad_norm": 0.691635251045227, + "learning_rate": 1.0664866637384574e-06, + "loss": 0.3181, + "step": 22780 + }, + { + "epoch": 0.851835409023997, + "grad_norm": 0.7403832077980042, + "learning_rate": 1.0638493107336811e-06, + "loss": 0.2726, + "step": 22785 + }, + { + "epoch": 0.8520223380143468, + "grad_norm": 0.37632712721824646, + "learning_rate": 1.0612150395512233e-06, + "loss": 0.315, + "step": 22790 + }, + { + "epoch": 0.8522092670046966, + "grad_norm": 0.3026890158653259, + "learning_rate": 1.0585838510995684e-06, + "loss": 0.2889, + "step": 22795 + }, + { + "epoch": 0.8523961959950463, + "grad_norm": 0.516616702079773, + "learning_rate": 1.0559557462861403e-06, + "loss": 0.2493, + "step": 22800 + }, + { + "epoch": 0.8525831249853961, + "grad_norm": 0.3008388876914978, + "learning_rate": 1.0533307260172954e-06, + "loss": 0.2922, + "step": 22805 + }, + { + "epoch": 0.852770053975746, + "grad_norm": 0.4109887182712555, + "learning_rate": 1.0507087911983293e-06, + "loss": 0.2831, + "step": 22810 + }, + { + "epoch": 0.8529569829660958, + "grad_norm": 0.3522479832172394, + "learning_rate": 1.04808994273347e-06, + "loss": 0.3247, + "step": 22815 + }, + { + "epoch": 0.8531439119564456, + "grad_norm": 0.5156667232513428, + "learning_rate": 1.045474181525885e-06, + "loss": 0.2889, + "step": 22820 + }, + { + "epoch": 0.8533308409467953, + "grad_norm": 0.4093747138977051, + "learning_rate": 1.0428615084776772e-06, + "loss": 0.243, + "step": 22825 + }, + { + "epoch": 0.8535177699371451, + "grad_norm": 0.3295019567012787, + "learning_rate": 1.0402519244898778e-06, + "loss": 0.2638, + "step": 22830 + }, + { + "epoch": 0.8537046989274949, + "grad_norm": 0.7684919238090515, + "learning_rate": 1.037645430462464e-06, + "loss": 0.2124, + "step": 22835 + }, + { + "epoch": 0.8538916279178447, + "grad_norm": 0.43423688411712646, + "learning_rate": 1.0350420272943362e-06, + "loss": 0.2397, + "step": 22840 + }, + { + "epoch": 0.8540785569081945, + "grad_norm": 0.6224250197410583, + "learning_rate": 1.0324417158833343e-06, + "loss": 0.2438, + "step": 22845 + }, + { + "epoch": 0.8542654858985443, + "grad_norm": 0.35096117854118347, + "learning_rate": 1.02984449712623e-06, + "loss": 0.2792, + "step": 22850 + }, + { + "epoch": 0.8544524148888941, + "grad_norm": 0.3952169716358185, + "learning_rate": 1.0272503719187332e-06, + "loss": 0.3134, + "step": 22855 + }, + { + "epoch": 0.8546393438792439, + "grad_norm": 0.18396589159965515, + "learning_rate": 1.0246593411554796e-06, + "loss": 0.2938, + "step": 22860 + }, + { + "epoch": 0.8548262728695937, + "grad_norm": 0.35608750581741333, + "learning_rate": 1.022071405730043e-06, + "loss": 0.2834, + "step": 22865 + }, + { + "epoch": 0.8550132018599435, + "grad_norm": 0.45381414890289307, + "learning_rate": 1.0194865665349296e-06, + "loss": 0.3398, + "step": 22870 + }, + { + "epoch": 0.8552001308502932, + "grad_norm": 0.5310588479042053, + "learning_rate": 1.0169048244615742e-06, + "loss": 0.4079, + "step": 22875 + }, + { + "epoch": 0.855387059840643, + "grad_norm": 0.2279624491930008, + "learning_rate": 1.0143261804003479e-06, + "loss": 0.2207, + "step": 22880 + }, + { + "epoch": 0.8555739888309928, + "grad_norm": 0.408582478761673, + "learning_rate": 1.0117506352405503e-06, + "loss": 0.2455, + "step": 22885 + }, + { + "epoch": 0.8557609178213427, + "grad_norm": 0.4538606107234955, + "learning_rate": 1.0091781898704123e-06, + "loss": 0.2467, + "step": 22890 + }, + { + "epoch": 0.8559478468116924, + "grad_norm": 0.36869016289711, + "learning_rate": 1.006608845177095e-06, + "loss": 0.298, + "step": 22895 + }, + { + "epoch": 0.8561347758020422, + "grad_norm": 0.3675679564476013, + "learning_rate": 1.0040426020466965e-06, + "loss": 0.2797, + "step": 22900 + }, + { + "epoch": 0.856321704792392, + "grad_norm": 0.5693662762641907, + "learning_rate": 1.0014794613642354e-06, + "loss": 0.2798, + "step": 22905 + }, + { + "epoch": 0.8565086337827418, + "grad_norm": 0.35014966130256653, + "learning_rate": 9.98919424013669e-07, + "loss": 0.2168, + "step": 22910 + }, + { + "epoch": 0.8566955627730916, + "grad_norm": 0.30344653129577637, + "learning_rate": 9.963624908778791e-07, + "loss": 0.2438, + "step": 22915 + }, + { + "epoch": 0.8568824917634413, + "grad_norm": 0.22757522761821747, + "learning_rate": 9.938086628386778e-07, + "loss": 0.2447, + "step": 22920 + }, + { + "epoch": 0.8570694207537911, + "grad_norm": 0.5425657629966736, + "learning_rate": 9.912579407768118e-07, + "loss": 0.2094, + "step": 22925 + }, + { + "epoch": 0.857256349744141, + "grad_norm": 0.6510607600212097, + "learning_rate": 9.887103255719489e-07, + "loss": 0.3355, + "step": 22930 + }, + { + "epoch": 0.8574432787344908, + "grad_norm": 0.4659566283226013, + "learning_rate": 9.861658181026878e-07, + "loss": 0.2521, + "step": 22935 + }, + { + "epoch": 0.8576302077248406, + "grad_norm": 0.40870416164398193, + "learning_rate": 9.83624419246555e-07, + "loss": 0.3023, + "step": 22940 + }, + { + "epoch": 0.8578171367151903, + "grad_norm": 0.24827082455158234, + "learning_rate": 9.810861298800111e-07, + "loss": 0.2788, + "step": 22945 + }, + { + "epoch": 0.8580040657055401, + "grad_norm": 0.6757010221481323, + "learning_rate": 9.78550950878433e-07, + "loss": 0.3783, + "step": 22950 + }, + { + "epoch": 0.8581909946958899, + "grad_norm": 0.3025006651878357, + "learning_rate": 9.760188831161376e-07, + "loss": 0.3317, + "step": 22955 + }, + { + "epoch": 0.8583779236862397, + "grad_norm": 0.45728322863578796, + "learning_rate": 9.734899274663578e-07, + "loss": 0.3688, + "step": 22960 + }, + { + "epoch": 0.8585648526765894, + "grad_norm": 0.5989465713500977, + "learning_rate": 9.709640848012602e-07, + "loss": 0.2258, + "step": 22965 + }, + { + "epoch": 0.8587517816669392, + "grad_norm": 0.2060554027557373, + "learning_rate": 9.684413559919358e-07, + "loss": 0.3065, + "step": 22970 + }, + { + "epoch": 0.8589387106572891, + "grad_norm": 0.3968755006790161, + "learning_rate": 9.65921741908402e-07, + "loss": 0.3119, + "step": 22975 + }, + { + "epoch": 0.8591256396476389, + "grad_norm": 0.4152187407016754, + "learning_rate": 9.634052434195983e-07, + "loss": 0.2567, + "step": 22980 + }, + { + "epoch": 0.8593125686379887, + "grad_norm": 0.34883493185043335, + "learning_rate": 9.60891861393396e-07, + "loss": 0.2796, + "step": 22985 + }, + { + "epoch": 0.8594994976283384, + "grad_norm": 0.13147194683551788, + "learning_rate": 9.583815966965882e-07, + "loss": 0.2369, + "step": 22990 + }, + { + "epoch": 0.8596864266186882, + "grad_norm": 0.49323028326034546, + "learning_rate": 9.558744501948903e-07, + "loss": 0.3564, + "step": 22995 + }, + { + "epoch": 0.859873355609038, + "grad_norm": 0.23689912259578705, + "learning_rate": 9.533704227529494e-07, + "loss": 0.2567, + "step": 23000 + }, + { + "epoch": 0.8600602845993878, + "grad_norm": 0.8092080950737, + "learning_rate": 9.508695152343295e-07, + "loss": 0.3302, + "step": 23005 + }, + { + "epoch": 0.8602472135897375, + "grad_norm": 0.3174912929534912, + "learning_rate": 9.483717285015237e-07, + "loss": 0.245, + "step": 23010 + }, + { + "epoch": 0.8604341425800874, + "grad_norm": 0.7012712955474854, + "learning_rate": 9.458770634159475e-07, + "loss": 0.3679, + "step": 23015 + }, + { + "epoch": 0.8606210715704372, + "grad_norm": 0.21312133967876434, + "learning_rate": 9.433855208379383e-07, + "loss": 0.2804, + "step": 23020 + }, + { + "epoch": 0.860808000560787, + "grad_norm": 0.3727993667125702, + "learning_rate": 9.408971016267588e-07, + "loss": 0.4381, + "step": 23025 + }, + { + "epoch": 0.8609949295511368, + "grad_norm": 0.3259866237640381, + "learning_rate": 9.384118066405934e-07, + "loss": 0.315, + "step": 23030 + }, + { + "epoch": 0.8611818585414865, + "grad_norm": 0.48218241333961487, + "learning_rate": 9.359296367365478e-07, + "loss": 0.3288, + "step": 23035 + }, + { + "epoch": 0.8613687875318363, + "grad_norm": 0.2950020134449005, + "learning_rate": 9.334505927706516e-07, + "loss": 0.3175, + "step": 23040 + }, + { + "epoch": 0.8615557165221861, + "grad_norm": 0.36184942722320557, + "learning_rate": 9.309746755978566e-07, + "loss": 0.2991, + "step": 23045 + }, + { + "epoch": 0.8617426455125359, + "grad_norm": 0.7623559236526489, + "learning_rate": 9.285018860720352e-07, + "loss": 0.2409, + "step": 23050 + }, + { + "epoch": 0.8619295745028858, + "grad_norm": 0.10221162438392639, + "learning_rate": 9.260322250459808e-07, + "loss": 0.2914, + "step": 23055 + }, + { + "epoch": 0.8621165034932355, + "grad_norm": 0.4181196093559265, + "learning_rate": 9.235656933714121e-07, + "loss": 0.3329, + "step": 23060 + }, + { + "epoch": 0.8623034324835853, + "grad_norm": 0.5724478960037231, + "learning_rate": 9.21102291898962e-07, + "loss": 0.3349, + "step": 23065 + }, + { + "epoch": 0.8624903614739351, + "grad_norm": 0.3799975514411926, + "learning_rate": 9.186420214781888e-07, + "loss": 0.3123, + "step": 23070 + }, + { + "epoch": 0.8626772904642849, + "grad_norm": 0.48635974526405334, + "learning_rate": 9.161848829575693e-07, + "loss": 0.2522, + "step": 23075 + }, + { + "epoch": 0.8628642194546347, + "grad_norm": 0.3440147340297699, + "learning_rate": 9.137308771844988e-07, + "loss": 0.2328, + "step": 23080 + }, + { + "epoch": 0.8630511484449844, + "grad_norm": 0.6380581259727478, + "learning_rate": 9.112800050052927e-07, + "loss": 0.3319, + "step": 23085 + }, + { + "epoch": 0.8632380774353342, + "grad_norm": 0.37005263566970825, + "learning_rate": 9.088322672651906e-07, + "loss": 0.3111, + "step": 23090 + }, + { + "epoch": 0.8634250064256841, + "grad_norm": 0.37442389130592346, + "learning_rate": 9.063876648083414e-07, + "loss": 0.2227, + "step": 23095 + }, + { + "epoch": 0.8636119354160339, + "grad_norm": 0.7940928339958191, + "learning_rate": 9.039461984778231e-07, + "loss": 0.3284, + "step": 23100 + }, + { + "epoch": 0.8637988644063836, + "grad_norm": 0.4885554015636444, + "learning_rate": 9.015078691156265e-07, + "loss": 0.2938, + "step": 23105 + }, + { + "epoch": 0.8639857933967334, + "grad_norm": 0.35439759492874146, + "learning_rate": 8.990726775626602e-07, + "loss": 0.2331, + "step": 23110 + }, + { + "epoch": 0.8641727223870832, + "grad_norm": 0.4514514207839966, + "learning_rate": 8.966406246587545e-07, + "loss": 0.2141, + "step": 23115 + }, + { + "epoch": 0.864359651377433, + "grad_norm": 0.5355989336967468, + "learning_rate": 8.942117112426529e-07, + "loss": 0.2842, + "step": 23120 + }, + { + "epoch": 0.8645465803677828, + "grad_norm": 2.2368109226226807, + "learning_rate": 8.917859381520189e-07, + "loss": 0.2435, + "step": 23125 + }, + { + "epoch": 0.8647335093581325, + "grad_norm": 0.5107064843177795, + "learning_rate": 8.893633062234285e-07, + "loss": 0.2857, + "step": 23130 + }, + { + "epoch": 0.8649204383484824, + "grad_norm": 0.4561639726161957, + "learning_rate": 8.869438162923838e-07, + "loss": 0.247, + "step": 23135 + }, + { + "epoch": 0.8651073673388322, + "grad_norm": 0.40335312485694885, + "learning_rate": 8.845274691932926e-07, + "loss": 0.3122, + "step": 23140 + }, + { + "epoch": 0.865294296329182, + "grad_norm": 0.25808554887771606, + "learning_rate": 8.821142657594861e-07, + "loss": 0.2516, + "step": 23145 + }, + { + "epoch": 0.8654812253195318, + "grad_norm": 0.4519065320491791, + "learning_rate": 8.797042068232098e-07, + "loss": 0.2665, + "step": 23150 + }, + { + "epoch": 0.8656681543098815, + "grad_norm": 0.7045873403549194, + "learning_rate": 8.772972932156221e-07, + "loss": 0.4179, + "step": 23155 + }, + { + "epoch": 0.8658550833002313, + "grad_norm": 0.4974367618560791, + "learning_rate": 8.748935257668012e-07, + "loss": 0.3216, + "step": 23160 + }, + { + "epoch": 0.8660420122905811, + "grad_norm": 0.4620605707168579, + "learning_rate": 8.72492905305734e-07, + "loss": 0.2646, + "step": 23165 + }, + { + "epoch": 0.8662289412809309, + "grad_norm": 0.45460769534111023, + "learning_rate": 8.700954326603295e-07, + "loss": 0.3191, + "step": 23170 + }, + { + "epoch": 0.8664158702712808, + "grad_norm": 0.484361857175827, + "learning_rate": 8.67701108657405e-07, + "loss": 0.2041, + "step": 23175 + }, + { + "epoch": 0.8666027992616305, + "grad_norm": 0.8345504999160767, + "learning_rate": 8.653099341226956e-07, + "loss": 0.382, + "step": 23180 + }, + { + "epoch": 0.8667897282519803, + "grad_norm": 0.5114479660987854, + "learning_rate": 8.62921909880845e-07, + "loss": 0.2313, + "step": 23185 + }, + { + "epoch": 0.8669766572423301, + "grad_norm": 0.35048356652259827, + "learning_rate": 8.605370367554178e-07, + "loss": 0.2199, + "step": 23190 + }, + { + "epoch": 0.8671635862326799, + "grad_norm": 0.5474756360054016, + "learning_rate": 8.581553155688894e-07, + "loss": 0.3222, + "step": 23195 + }, + { + "epoch": 0.8673505152230296, + "grad_norm": 0.4555470645427704, + "learning_rate": 8.557767471426448e-07, + "loss": 0.2262, + "step": 23200 + }, + { + "epoch": 0.8675374442133794, + "grad_norm": 0.5740458965301514, + "learning_rate": 8.534013322969859e-07, + "loss": 0.3478, + "step": 23205 + }, + { + "epoch": 0.8677243732037292, + "grad_norm": 0.4381164014339447, + "learning_rate": 8.510290718511227e-07, + "loss": 0.3054, + "step": 23210 + }, + { + "epoch": 0.867911302194079, + "grad_norm": 0.27854111790657043, + "learning_rate": 8.486599666231832e-07, + "loss": 0.2728, + "step": 23215 + }, + { + "epoch": 0.8680982311844289, + "grad_norm": 0.5790026187896729, + "learning_rate": 8.462940174302026e-07, + "loss": 0.3049, + "step": 23220 + }, + { + "epoch": 0.8682851601747786, + "grad_norm": 0.18455126881599426, + "learning_rate": 8.439312250881282e-07, + "loss": 0.266, + "step": 23225 + }, + { + "epoch": 0.8684720891651284, + "grad_norm": 0.4800267517566681, + "learning_rate": 8.415715904118171e-07, + "loss": 0.3171, + "step": 23230 + }, + { + "epoch": 0.8686590181554782, + "grad_norm": 0.36994990706443787, + "learning_rate": 8.392151142150428e-07, + "loss": 0.3076, + "step": 23235 + }, + { + "epoch": 0.868845947145828, + "grad_norm": 0.7822495698928833, + "learning_rate": 8.368617973104887e-07, + "loss": 0.2798, + "step": 23240 + }, + { + "epoch": 0.8690328761361777, + "grad_norm": 0.38529595732688904, + "learning_rate": 8.345116405097408e-07, + "loss": 0.321, + "step": 23245 + }, + { + "epoch": 0.8692198051265275, + "grad_norm": 0.5859056115150452, + "learning_rate": 8.321646446233056e-07, + "loss": 0.2563, + "step": 23250 + }, + { + "epoch": 0.8694067341168773, + "grad_norm": 0.5239048004150391, + "learning_rate": 8.29820810460591e-07, + "loss": 0.2624, + "step": 23255 + }, + { + "epoch": 0.8695936631072272, + "grad_norm": 0.5495345592498779, + "learning_rate": 8.274801388299225e-07, + "loss": 0.2581, + "step": 23260 + }, + { + "epoch": 0.869780592097577, + "grad_norm": 0.4601067900657654, + "learning_rate": 8.251426305385268e-07, + "loss": 0.2837, + "step": 23265 + }, + { + "epoch": 0.8699675210879267, + "grad_norm": 0.37067458033561707, + "learning_rate": 8.228082863925457e-07, + "loss": 0.2406, + "step": 23270 + }, + { + "epoch": 0.8701544500782765, + "grad_norm": 0.8990246653556824, + "learning_rate": 8.204771071970253e-07, + "loss": 0.303, + "step": 23275 + }, + { + "epoch": 0.8703413790686263, + "grad_norm": 0.23729777336120605, + "learning_rate": 8.181490937559234e-07, + "loss": 0.323, + "step": 23280 + }, + { + "epoch": 0.8705283080589761, + "grad_norm": 0.6693614721298218, + "learning_rate": 8.158242468721078e-07, + "loss": 0.277, + "step": 23285 + }, + { + "epoch": 0.8707152370493259, + "grad_norm": 0.7663869857788086, + "learning_rate": 8.135025673473474e-07, + "loss": 0.303, + "step": 23290 + }, + { + "epoch": 0.8709021660396756, + "grad_norm": 0.389027863740921, + "learning_rate": 8.11184055982327e-07, + "loss": 0.2435, + "step": 23295 + }, + { + "epoch": 0.8710890950300255, + "grad_norm": 0.38779979944229126, + "learning_rate": 8.088687135766316e-07, + "loss": 0.3011, + "step": 23300 + }, + { + "epoch": 0.8712760240203753, + "grad_norm": 0.33862584829330444, + "learning_rate": 8.06556540928759e-07, + "loss": 0.304, + "step": 23305 + }, + { + "epoch": 0.8714629530107251, + "grad_norm": 0.31352442502975464, + "learning_rate": 8.042475388361104e-07, + "loss": 0.2957, + "step": 23310 + }, + { + "epoch": 0.8716498820010748, + "grad_norm": 0.4300394058227539, + "learning_rate": 8.019417080949932e-07, + "loss": 0.2494, + "step": 23315 + }, + { + "epoch": 0.8718368109914246, + "grad_norm": 0.4149588942527771, + "learning_rate": 7.996390495006223e-07, + "loss": 0.2895, + "step": 23320 + }, + { + "epoch": 0.8720237399817744, + "grad_norm": 0.3871593177318573, + "learning_rate": 7.973395638471182e-07, + "loss": 0.2378, + "step": 23325 + }, + { + "epoch": 0.8722106689721242, + "grad_norm": 0.4816226363182068, + "learning_rate": 7.95043251927512e-07, + "loss": 0.2898, + "step": 23330 + }, + { + "epoch": 0.872397597962474, + "grad_norm": 0.6048807501792908, + "learning_rate": 7.927501145337302e-07, + "loss": 0.2711, + "step": 23335 + }, + { + "epoch": 0.8725845269528238, + "grad_norm": 0.3330458700656891, + "learning_rate": 7.904601524566157e-07, + "loss": 0.278, + "step": 23340 + }, + { + "epoch": 0.8727714559431736, + "grad_norm": 0.5402839183807373, + "learning_rate": 7.881733664859048e-07, + "loss": 0.2908, + "step": 23345 + }, + { + "epoch": 0.8729583849335234, + "grad_norm": 0.3132112920284271, + "learning_rate": 7.858897574102508e-07, + "loss": 0.2525, + "step": 23350 + }, + { + "epoch": 0.8731453139238732, + "grad_norm": 0.5713052153587341, + "learning_rate": 7.836093260171995e-07, + "loss": 0.2595, + "step": 23355 + }, + { + "epoch": 0.873332242914223, + "grad_norm": 0.39907294511795044, + "learning_rate": 7.813320730932094e-07, + "loss": 0.2347, + "step": 23360 + }, + { + "epoch": 0.8735191719045727, + "grad_norm": 0.6258445978164673, + "learning_rate": 7.790579994236402e-07, + "loss": 0.3143, + "step": 23365 + }, + { + "epoch": 0.8737061008949225, + "grad_norm": 0.5387104153633118, + "learning_rate": 7.767871057927512e-07, + "loss": 0.2598, + "step": 23370 + }, + { + "epoch": 0.8738930298852723, + "grad_norm": 0.32804661989212036, + "learning_rate": 7.745193929837136e-07, + "loss": 0.2496, + "step": 23375 + }, + { + "epoch": 0.8740799588756222, + "grad_norm": 0.17273744940757751, + "learning_rate": 7.722548617785907e-07, + "loss": 0.3359, + "step": 23380 + }, + { + "epoch": 0.874266887865972, + "grad_norm": 0.2972503900527954, + "learning_rate": 7.699935129583602e-07, + "loss": 0.3159, + "step": 23385 + }, + { + "epoch": 0.8744538168563217, + "grad_norm": 0.49442118406295776, + "learning_rate": 7.677353473028926e-07, + "loss": 0.3058, + "step": 23390 + }, + { + "epoch": 0.8746407458466715, + "grad_norm": 0.5777555108070374, + "learning_rate": 7.654803655909671e-07, + "loss": 0.2823, + "step": 23395 + }, + { + "epoch": 0.8748276748370213, + "grad_norm": 0.549485445022583, + "learning_rate": 7.632285686002594e-07, + "loss": 0.2715, + "step": 23400 + }, + { + "epoch": 0.8750146038273711, + "grad_norm": 0.44350236654281616, + "learning_rate": 7.609799571073529e-07, + "loss": 0.2928, + "step": 23405 + }, + { + "epoch": 0.8752015328177208, + "grad_norm": 0.3184381425380707, + "learning_rate": 7.587345318877282e-07, + "loss": 0.3997, + "step": 23410 + }, + { + "epoch": 0.8753884618080706, + "grad_norm": 0.5490055084228516, + "learning_rate": 7.564922937157659e-07, + "loss": 0.2365, + "step": 23415 + }, + { + "epoch": 0.8755753907984205, + "grad_norm": 0.2554487884044647, + "learning_rate": 7.542532433647532e-07, + "loss": 0.2517, + "step": 23420 + }, + { + "epoch": 0.8757623197887703, + "grad_norm": 0.6824100613594055, + "learning_rate": 7.52017381606871e-07, + "loss": 0.3281, + "step": 23425 + }, + { + "epoch": 0.8759492487791201, + "grad_norm": 0.33024176955223083, + "learning_rate": 7.497847092132071e-07, + "loss": 0.2404, + "step": 23430 + }, + { + "epoch": 0.8761361777694698, + "grad_norm": 0.5682893395423889, + "learning_rate": 7.475552269537434e-07, + "loss": 0.3247, + "step": 23435 + }, + { + "epoch": 0.8763231067598196, + "grad_norm": 0.5878921747207642, + "learning_rate": 7.453289355973669e-07, + "loss": 0.2347, + "step": 23440 + }, + { + "epoch": 0.8765100357501694, + "grad_norm": 0.5948970913887024, + "learning_rate": 7.431058359118593e-07, + "loss": 0.223, + "step": 23445 + }, + { + "epoch": 0.8766969647405192, + "grad_norm": 0.4269593358039856, + "learning_rate": 7.408859286639069e-07, + "loss": 0.2496, + "step": 23450 + }, + { + "epoch": 0.876883893730869, + "grad_norm": 0.35104551911354065, + "learning_rate": 7.38669214619091e-07, + "loss": 0.2273, + "step": 23455 + }, + { + "epoch": 0.8770708227212187, + "grad_norm": 0.5351817607879639, + "learning_rate": 7.36455694541891e-07, + "loss": 0.2464, + "step": 23460 + }, + { + "epoch": 0.8772577517115686, + "grad_norm": 0.3460119962692261, + "learning_rate": 7.342453691956886e-07, + "loss": 0.286, + "step": 23465 + }, + { + "epoch": 0.8774446807019184, + "grad_norm": 0.25786739587783813, + "learning_rate": 7.320382393427595e-07, + "loss": 0.3703, + "step": 23470 + }, + { + "epoch": 0.8776316096922682, + "grad_norm": 0.47995084524154663, + "learning_rate": 7.298343057442825e-07, + "loss": 0.2853, + "step": 23475 + }, + { + "epoch": 0.8778185386826179, + "grad_norm": 0.31466034054756165, + "learning_rate": 7.276335691603276e-07, + "loss": 0.2484, + "step": 23480 + }, + { + "epoch": 0.8780054676729677, + "grad_norm": 0.42213770747184753, + "learning_rate": 7.254360303498697e-07, + "loss": 0.2988, + "step": 23485 + }, + { + "epoch": 0.8781923966633175, + "grad_norm": 0.5641787052154541, + "learning_rate": 7.232416900707739e-07, + "loss": 0.3115, + "step": 23490 + }, + { + "epoch": 0.8783793256536673, + "grad_norm": 0.3340662717819214, + "learning_rate": 7.210505490798081e-07, + "loss": 0.2782, + "step": 23495 + }, + { + "epoch": 0.878566254644017, + "grad_norm": 0.4800506830215454, + "learning_rate": 7.188626081326322e-07, + "loss": 0.2921, + "step": 23500 + }, + { + "epoch": 0.8787531836343669, + "grad_norm": 1.0108988285064697, + "learning_rate": 7.166778679838026e-07, + "loss": 0.2956, + "step": 23505 + }, + { + "epoch": 0.8789401126247167, + "grad_norm": 0.4938255846500397, + "learning_rate": 7.144963293867779e-07, + "loss": 0.3058, + "step": 23510 + }, + { + "epoch": 0.8791270416150665, + "grad_norm": 0.2628172039985657, + "learning_rate": 7.123179930939028e-07, + "loss": 0.3074, + "step": 23515 + }, + { + "epoch": 0.8793139706054163, + "grad_norm": 0.3008750081062317, + "learning_rate": 7.101428598564286e-07, + "loss": 0.2431, + "step": 23520 + }, + { + "epoch": 0.879500899595766, + "grad_norm": 0.5366243124008179, + "learning_rate": 7.07970930424493e-07, + "loss": 0.3393, + "step": 23525 + }, + { + "epoch": 0.8796878285861158, + "grad_norm": 0.3495098054409027, + "learning_rate": 7.058022055471337e-07, + "loss": 0.1941, + "step": 23530 + }, + { + "epoch": 0.8798747575764656, + "grad_norm": 0.5412205457687378, + "learning_rate": 7.036366859722798e-07, + "loss": 0.2646, + "step": 23535 + }, + { + "epoch": 0.8800616865668154, + "grad_norm": 0.4367334842681885, + "learning_rate": 7.014743724467609e-07, + "loss": 0.2841, + "step": 23540 + }, + { + "epoch": 0.8802486155571653, + "grad_norm": 0.2953815460205078, + "learning_rate": 6.993152657162916e-07, + "loss": 0.2756, + "step": 23545 + }, + { + "epoch": 0.880435544547515, + "grad_norm": 0.5892349481582642, + "learning_rate": 6.971593665254917e-07, + "loss": 0.2218, + "step": 23550 + }, + { + "epoch": 0.8806224735378648, + "grad_norm": 0.4493967890739441, + "learning_rate": 6.950066756178653e-07, + "loss": 0.2927, + "step": 23555 + }, + { + "epoch": 0.8808094025282146, + "grad_norm": 0.4241293668746948, + "learning_rate": 6.928571937358131e-07, + "loss": 0.2596, + "step": 23560 + }, + { + "epoch": 0.8809963315185644, + "grad_norm": 0.36346501111984253, + "learning_rate": 6.907109216206342e-07, + "loss": 0.2376, + "step": 23565 + }, + { + "epoch": 0.8811832605089142, + "grad_norm": 0.5046384334564209, + "learning_rate": 6.885678600125101e-07, + "loss": 0.3006, + "step": 23570 + }, + { + "epoch": 0.8813701894992639, + "grad_norm": 0.5586752891540527, + "learning_rate": 6.864280096505283e-07, + "loss": 0.2569, + "step": 23575 + }, + { + "epoch": 0.8815571184896137, + "grad_norm": 0.2976597547531128, + "learning_rate": 6.842913712726551e-07, + "loss": 0.2507, + "step": 23580 + }, + { + "epoch": 0.8817440474799636, + "grad_norm": 0.5597246289253235, + "learning_rate": 6.821579456157612e-07, + "loss": 0.3023, + "step": 23585 + }, + { + "epoch": 0.8819309764703134, + "grad_norm": 0.6922735571861267, + "learning_rate": 6.800277334156013e-07, + "loss": 0.2563, + "step": 23590 + }, + { + "epoch": 0.8821179054606632, + "grad_norm": 0.32556474208831787, + "learning_rate": 6.779007354068257e-07, + "loss": 0.2421, + "step": 23595 + }, + { + "epoch": 0.8823048344510129, + "grad_norm": 0.2897225618362427, + "learning_rate": 6.757769523229751e-07, + "loss": 0.255, + "step": 23600 + }, + { + "epoch": 0.8824917634413627, + "grad_norm": 0.45104631781578064, + "learning_rate": 6.736563848964784e-07, + "loss": 0.2586, + "step": 23605 + }, + { + "epoch": 0.8826786924317125, + "grad_norm": 0.5284656286239624, + "learning_rate": 6.715390338586636e-07, + "loss": 0.2726, + "step": 23610 + }, + { + "epoch": 0.8828656214220623, + "grad_norm": 0.42080292105674744, + "learning_rate": 6.694248999397402e-07, + "loss": 0.1887, + "step": 23615 + }, + { + "epoch": 0.883052550412412, + "grad_norm": 0.7159053087234497, + "learning_rate": 6.673139838688148e-07, + "loss": 0.3204, + "step": 23620 + }, + { + "epoch": 0.8832394794027619, + "grad_norm": 0.635303795337677, + "learning_rate": 6.652062863738795e-07, + "loss": 0.3029, + "step": 23625 + }, + { + "epoch": 0.8834264083931117, + "grad_norm": 0.4003882110118866, + "learning_rate": 6.63101808181823e-07, + "loss": 0.3126, + "step": 23630 + }, + { + "epoch": 0.8836133373834615, + "grad_norm": 0.35992348194122314, + "learning_rate": 6.610005500184147e-07, + "loss": 0.2064, + "step": 23635 + }, + { + "epoch": 0.8838002663738113, + "grad_norm": 0.29172858595848083, + "learning_rate": 6.589025126083216e-07, + "loss": 0.3099, + "step": 23640 + }, + { + "epoch": 0.883987195364161, + "grad_norm": 0.43935173749923706, + "learning_rate": 6.568076966750958e-07, + "loss": 0.3544, + "step": 23645 + }, + { + "epoch": 0.8841741243545108, + "grad_norm": 0.6253902316093445, + "learning_rate": 6.547161029411775e-07, + "loss": 0.3469, + "step": 23650 + }, + { + "epoch": 0.8843610533448606, + "grad_norm": 0.570335865020752, + "learning_rate": 6.526277321279006e-07, + "loss": 0.2235, + "step": 23655 + }, + { + "epoch": 0.8845479823352104, + "grad_norm": 1.0959036350250244, + "learning_rate": 6.505425849554825e-07, + "loss": 0.2871, + "step": 23660 + }, + { + "epoch": 0.8847349113255603, + "grad_norm": 0.6984147429466248, + "learning_rate": 6.484606621430312e-07, + "loss": 0.2487, + "step": 23665 + }, + { + "epoch": 0.88492184031591, + "grad_norm": 0.6359334588050842, + "learning_rate": 6.463819644085412e-07, + "loss": 0.284, + "step": 23670 + }, + { + "epoch": 0.8851087693062598, + "grad_norm": 0.4095711410045624, + "learning_rate": 6.443064924688969e-07, + "loss": 0.2513, + "step": 23675 + }, + { + "epoch": 0.8852956982966096, + "grad_norm": 0.28603771328926086, + "learning_rate": 6.422342470398679e-07, + "loss": 0.2653, + "step": 23680 + }, + { + "epoch": 0.8854826272869594, + "grad_norm": 0.36115431785583496, + "learning_rate": 6.40165228836116e-07, + "loss": 0.2031, + "step": 23685 + }, + { + "epoch": 0.8856695562773091, + "grad_norm": 0.3040491044521332, + "learning_rate": 6.380994385711803e-07, + "loss": 0.3044, + "step": 23690 + }, + { + "epoch": 0.8858564852676589, + "grad_norm": 0.4708629548549652, + "learning_rate": 6.360368769574977e-07, + "loss": 0.3036, + "step": 23695 + }, + { + "epoch": 0.8860434142580087, + "grad_norm": 0.19724823534488678, + "learning_rate": 6.339775447063856e-07, + "loss": 0.2344, + "step": 23700 + }, + { + "epoch": 0.8862303432483585, + "grad_norm": 0.6166355609893799, + "learning_rate": 6.319214425280451e-07, + "loss": 0.2477, + "step": 23705 + }, + { + "epoch": 0.8864172722387084, + "grad_norm": 0.4069211184978485, + "learning_rate": 6.298685711315722e-07, + "loss": 0.2525, + "step": 23710 + }, + { + "epoch": 0.8866042012290581, + "grad_norm": 0.6085258722305298, + "learning_rate": 6.278189312249395e-07, + "loss": 0.3372, + "step": 23715 + }, + { + "epoch": 0.8867911302194079, + "grad_norm": 0.3506983518600464, + "learning_rate": 6.257725235150113e-07, + "loss": 0.2526, + "step": 23720 + }, + { + "epoch": 0.8869780592097577, + "grad_norm": 0.6167741417884827, + "learning_rate": 6.237293487075324e-07, + "loss": 0.2678, + "step": 23725 + }, + { + "epoch": 0.8871649882001075, + "grad_norm": 0.3610391318798065, + "learning_rate": 6.216894075071378e-07, + "loss": 0.1962, + "step": 23730 + }, + { + "epoch": 0.8873519171904573, + "grad_norm": 0.5724371075630188, + "learning_rate": 6.196527006173447e-07, + "loss": 0.2723, + "step": 23735 + }, + { + "epoch": 0.887538846180807, + "grad_norm": 0.4358518719673157, + "learning_rate": 6.176192287405547e-07, + "loss": 0.2568, + "step": 23740 + }, + { + "epoch": 0.8877257751711568, + "grad_norm": 0.34022319316864014, + "learning_rate": 6.155889925780534e-07, + "loss": 0.25, + "step": 23745 + }, + { + "epoch": 0.8879127041615067, + "grad_norm": 0.5654661059379578, + "learning_rate": 6.135619928300096e-07, + "loss": 0.3062, + "step": 23750 + }, + { + "epoch": 0.8880996331518565, + "grad_norm": 0.27159449458122253, + "learning_rate": 6.115382301954809e-07, + "loss": 0.2635, + "step": 23755 + }, + { + "epoch": 0.8882865621422062, + "grad_norm": 0.33312445878982544, + "learning_rate": 6.095177053724011e-07, + "loss": 0.2983, + "step": 23760 + }, + { + "epoch": 0.888473491132556, + "grad_norm": 0.7186973094940186, + "learning_rate": 6.07500419057595e-07, + "loss": 0.2881, + "step": 23765 + }, + { + "epoch": 0.8886604201229058, + "grad_norm": 0.4082525074481964, + "learning_rate": 6.054863719467641e-07, + "loss": 0.2782, + "step": 23770 + }, + { + "epoch": 0.8888473491132556, + "grad_norm": 0.22623296082019806, + "learning_rate": 6.034755647344958e-07, + "loss": 0.198, + "step": 23775 + }, + { + "epoch": 0.8890342781036054, + "grad_norm": 0.26331770420074463, + "learning_rate": 6.014679981142635e-07, + "loss": 0.2145, + "step": 23780 + }, + { + "epoch": 0.8892212070939551, + "grad_norm": 0.613768458366394, + "learning_rate": 5.994636727784153e-07, + "loss": 0.282, + "step": 23785 + }, + { + "epoch": 0.889408136084305, + "grad_norm": 0.27621743083000183, + "learning_rate": 5.974625894181874e-07, + "loss": 0.2704, + "step": 23790 + }, + { + "epoch": 0.8895950650746548, + "grad_norm": 1.3323360681533813, + "learning_rate": 5.954647487236942e-07, + "loss": 0.2972, + "step": 23795 + }, + { + "epoch": 0.8897819940650046, + "grad_norm": 0.46217793226242065, + "learning_rate": 5.934701513839369e-07, + "loss": 0.3143, + "step": 23800 + }, + { + "epoch": 0.8899689230553544, + "grad_norm": 0.6428271532058716, + "learning_rate": 5.91478798086792e-07, + "loss": 0.2476, + "step": 23805 + }, + { + "epoch": 0.8901558520457041, + "grad_norm": 0.5378536581993103, + "learning_rate": 5.894906895190222e-07, + "loss": 0.2954, + "step": 23810 + }, + { + "epoch": 0.8903427810360539, + "grad_norm": 0.34355923533439636, + "learning_rate": 5.875058263662669e-07, + "loss": 0.2932, + "step": 23815 + }, + { + "epoch": 0.8905297100264037, + "grad_norm": 0.42726051807403564, + "learning_rate": 5.855242093130498e-07, + "loss": 0.2709, + "step": 23820 + }, + { + "epoch": 0.8907166390167535, + "grad_norm": 0.4561409652233124, + "learning_rate": 5.835458390427762e-07, + "loss": 0.2824, + "step": 23825 + }, + { + "epoch": 0.8909035680071034, + "grad_norm": 0.523114800453186, + "learning_rate": 5.815707162377271e-07, + "loss": 0.2704, + "step": 23830 + }, + { + "epoch": 0.8910904969974531, + "grad_norm": 3.815091848373413, + "learning_rate": 5.795988415790655e-07, + "loss": 0.3295, + "step": 23835 + }, + { + "epoch": 0.8912774259878029, + "grad_norm": 0.3353985548019409, + "learning_rate": 5.776302157468338e-07, + "loss": 0.3024, + "step": 23840 + }, + { + "epoch": 0.8914643549781527, + "grad_norm": 0.4886054992675781, + "learning_rate": 5.756648394199571e-07, + "loss": 0.3643, + "step": 23845 + }, + { + "epoch": 0.8916512839685025, + "grad_norm": 0.28375887870788574, + "learning_rate": 5.737027132762341e-07, + "loss": 0.2608, + "step": 23850 + }, + { + "epoch": 0.8918382129588522, + "grad_norm": 0.39414143562316895, + "learning_rate": 5.71743837992349e-07, + "loss": 0.3279, + "step": 23855 + }, + { + "epoch": 0.892025141949202, + "grad_norm": 0.29853355884552, + "learning_rate": 5.697882142438594e-07, + "loss": 0.2457, + "step": 23860 + }, + { + "epoch": 0.8922120709395518, + "grad_norm": 0.49426108598709106, + "learning_rate": 5.678358427052045e-07, + "loss": 0.2018, + "step": 23865 + }, + { + "epoch": 0.8923989999299017, + "grad_norm": 0.48164868354797363, + "learning_rate": 5.658867240497034e-07, + "loss": 0.2593, + "step": 23870 + }, + { + "epoch": 0.8925859289202515, + "grad_norm": 0.3493533432483673, + "learning_rate": 5.639408589495476e-07, + "loss": 0.3238, + "step": 23875 + }, + { + "epoch": 0.8927728579106012, + "grad_norm": 0.40912967920303345, + "learning_rate": 5.619982480758146e-07, + "loss": 0.2422, + "step": 23880 + }, + { + "epoch": 0.892959786900951, + "grad_norm": 0.50095134973526, + "learning_rate": 5.600588920984529e-07, + "loss": 0.2204, + "step": 23885 + }, + { + "epoch": 0.8931467158913008, + "grad_norm": 0.3179953992366791, + "learning_rate": 5.581227916862907e-07, + "loss": 0.2518, + "step": 23890 + }, + { + "epoch": 0.8933336448816506, + "grad_norm": 0.39287543296813965, + "learning_rate": 5.56189947507032e-07, + "loss": 0.291, + "step": 23895 + }, + { + "epoch": 0.8935205738720003, + "grad_norm": 0.44171902537345886, + "learning_rate": 5.542603602272622e-07, + "loss": 0.2908, + "step": 23900 + }, + { + "epoch": 0.8937075028623501, + "grad_norm": 0.8764915466308594, + "learning_rate": 5.523340305124381e-07, + "loss": 0.3125, + "step": 23905 + }, + { + "epoch": 0.8938944318527, + "grad_norm": 0.3236830234527588, + "learning_rate": 5.50410959026898e-07, + "loss": 0.2637, + "step": 23910 + }, + { + "epoch": 0.8940813608430498, + "grad_norm": 0.5709930062294006, + "learning_rate": 5.484911464338539e-07, + "loss": 0.2993, + "step": 23915 + }, + { + "epoch": 0.8942682898333996, + "grad_norm": 0.22036674618721008, + "learning_rate": 5.465745933953914e-07, + "loss": 0.3086, + "step": 23920 + }, + { + "epoch": 0.8944552188237493, + "grad_norm": 0.36622971296310425, + "learning_rate": 5.446613005724788e-07, + "loss": 0.287, + "step": 23925 + }, + { + "epoch": 0.8946421478140991, + "grad_norm": 0.37628886103630066, + "learning_rate": 5.427512686249537e-07, + "loss": 0.2985, + "step": 23930 + }, + { + "epoch": 0.8948290768044489, + "grad_norm": 0.3855689764022827, + "learning_rate": 5.408444982115313e-07, + "loss": 0.2803, + "step": 23935 + }, + { + "epoch": 0.8950160057947987, + "grad_norm": 0.0829666256904602, + "learning_rate": 5.389409899898013e-07, + "loss": 0.2945, + "step": 23940 + }, + { + "epoch": 0.8952029347851485, + "grad_norm": 0.5317602753639221, + "learning_rate": 5.370407446162318e-07, + "loss": 0.2388, + "step": 23945 + }, + { + "epoch": 0.8953898637754982, + "grad_norm": 0.635329008102417, + "learning_rate": 5.351437627461598e-07, + "loss": 0.351, + "step": 23950 + }, + { + "epoch": 0.8955767927658481, + "grad_norm": 0.48536187410354614, + "learning_rate": 5.332500450338018e-07, + "loss": 0.3298, + "step": 23955 + }, + { + "epoch": 0.8957637217561979, + "grad_norm": 0.3373602330684662, + "learning_rate": 5.313595921322479e-07, + "loss": 0.2615, + "step": 23960 + }, + { + "epoch": 0.8959506507465477, + "grad_norm": 0.49494582414627075, + "learning_rate": 5.294724046934585e-07, + "loss": 0.2406, + "step": 23965 + }, + { + "epoch": 0.8961375797368974, + "grad_norm": 0.36144334077835083, + "learning_rate": 5.275884833682721e-07, + "loss": 0.2532, + "step": 23970 + }, + { + "epoch": 0.8963245087272472, + "grad_norm": 0.6861150860786438, + "learning_rate": 5.257078288064e-07, + "loss": 0.2478, + "step": 23975 + }, + { + "epoch": 0.896511437717597, + "grad_norm": 0.22255679965019226, + "learning_rate": 5.238304416564243e-07, + "loss": 0.3317, + "step": 23980 + }, + { + "epoch": 0.8966983667079468, + "grad_norm": 0.2977222800254822, + "learning_rate": 5.21956322565802e-07, + "loss": 0.2917, + "step": 23985 + }, + { + "epoch": 0.8968852956982966, + "grad_norm": 0.7013628482818604, + "learning_rate": 5.200854721808645e-07, + "loss": 0.3236, + "step": 23990 + }, + { + "epoch": 0.8970722246886464, + "grad_norm": 0.46044737100601196, + "learning_rate": 5.182178911468128e-07, + "loss": 0.2712, + "step": 23995 + }, + { + "epoch": 0.8972591536789962, + "grad_norm": 0.21883058547973633, + "learning_rate": 5.163535801077235e-07, + "loss": 0.2518, + "step": 24000 + }, + { + "epoch": 0.897446082669346, + "grad_norm": 0.3243284523487091, + "learning_rate": 5.144925397065437e-07, + "loss": 0.3362, + "step": 24005 + }, + { + "epoch": 0.8976330116596958, + "grad_norm": 0.5240607261657715, + "learning_rate": 5.12634770585092e-07, + "loss": 0.2729, + "step": 24010 + }, + { + "epoch": 0.8978199406500456, + "grad_norm": 0.318803608417511, + "learning_rate": 5.107802733840616e-07, + "loss": 0.2773, + "step": 24015 + }, + { + "epoch": 0.8980068696403953, + "grad_norm": 0.41666528582572937, + "learning_rate": 5.089290487430154e-07, + "loss": 0.233, + "step": 24020 + }, + { + "epoch": 0.8981937986307451, + "grad_norm": 0.5397652387619019, + "learning_rate": 5.070810973003859e-07, + "loss": 0.2415, + "step": 24025 + }, + { + "epoch": 0.8983807276210949, + "grad_norm": 0.4510786533355713, + "learning_rate": 5.052364196934779e-07, + "loss": 0.2595, + "step": 24030 + }, + { + "epoch": 0.8985676566114448, + "grad_norm": 0.4586493670940399, + "learning_rate": 5.033950165584711e-07, + "loss": 0.4253, + "step": 24035 + }, + { + "epoch": 0.8987545856017946, + "grad_norm": 0.6832088828086853, + "learning_rate": 5.01556888530409e-07, + "loss": 0.3542, + "step": 24040 + }, + { + "epoch": 0.8989415145921443, + "grad_norm": 0.2557532489299774, + "learning_rate": 4.99722036243212e-07, + "loss": 0.2235, + "step": 24045 + }, + { + "epoch": 0.8991284435824941, + "grad_norm": 0.2579525113105774, + "learning_rate": 4.978904603296686e-07, + "loss": 0.2518, + "step": 24050 + }, + { + "epoch": 0.8993153725728439, + "grad_norm": 0.7935723066329956, + "learning_rate": 4.960621614214334e-07, + "loss": 0.2589, + "step": 24055 + }, + { + "epoch": 0.8995023015631937, + "grad_norm": 0.6913199424743652, + "learning_rate": 4.942371401490386e-07, + "loss": 0.2652, + "step": 24060 + }, + { + "epoch": 0.8996892305535434, + "grad_norm": 0.604274570941925, + "learning_rate": 4.924153971418777e-07, + "loss": 0.2887, + "step": 24065 + }, + { + "epoch": 0.8998761595438932, + "grad_norm": 0.35394561290740967, + "learning_rate": 4.905969330282212e-07, + "loss": 0.2466, + "step": 24070 + }, + { + "epoch": 0.9000630885342431, + "grad_norm": 0.39302387833595276, + "learning_rate": 4.887817484352031e-07, + "loss": 0.3516, + "step": 24075 + }, + { + "epoch": 0.9002500175245929, + "grad_norm": 0.5640286207199097, + "learning_rate": 4.869698439888304e-07, + "loss": 0.3531, + "step": 24080 + }, + { + "epoch": 0.9004369465149427, + "grad_norm": 0.4714716076850891, + "learning_rate": 4.851612203139733e-07, + "loss": 0.2918, + "step": 24085 + }, + { + "epoch": 0.9006238755052924, + "grad_norm": 0.41106295585632324, + "learning_rate": 4.833558780343772e-07, + "loss": 0.3051, + "step": 24090 + }, + { + "epoch": 0.9008108044956422, + "grad_norm": 0.48679038882255554, + "learning_rate": 4.815538177726531e-07, + "loss": 0.2615, + "step": 24095 + }, + { + "epoch": 0.900997733485992, + "grad_norm": 0.5820590257644653, + "learning_rate": 4.797550401502782e-07, + "loss": 0.2669, + "step": 24100 + }, + { + "epoch": 0.9011846624763418, + "grad_norm": 0.5170806050300598, + "learning_rate": 4.779595457876019e-07, + "loss": 0.2829, + "step": 24105 + }, + { + "epoch": 0.9013715914666915, + "grad_norm": 0.4748556613922119, + "learning_rate": 4.761673353038354e-07, + "loss": 0.2686, + "step": 24110 + }, + { + "epoch": 0.9015585204570414, + "grad_norm": 0.3551413118839264, + "learning_rate": 4.743784093170645e-07, + "loss": 0.3079, + "step": 24115 + }, + { + "epoch": 0.9017454494473912, + "grad_norm": 0.47169989347457886, + "learning_rate": 4.725927684442366e-07, + "loss": 0.2218, + "step": 24120 + }, + { + "epoch": 0.901932378437741, + "grad_norm": 0.39478862285614014, + "learning_rate": 4.7081041330116816e-07, + "loss": 0.2159, + "step": 24125 + }, + { + "epoch": 0.9021193074280908, + "grad_norm": 0.33222270011901855, + "learning_rate": 4.6903134450254186e-07, + "loss": 0.3028, + "step": 24130 + }, + { + "epoch": 0.9023062364184405, + "grad_norm": 0.39596304297447205, + "learning_rate": 4.6725556266190687e-07, + "loss": 0.1949, + "step": 24135 + }, + { + "epoch": 0.9024931654087903, + "grad_norm": 0.31857696175575256, + "learning_rate": 4.6548306839168224e-07, + "loss": 0.2729, + "step": 24140 + }, + { + "epoch": 0.9026800943991401, + "grad_norm": 0.4975537359714508, + "learning_rate": 4.6371386230314785e-07, + "loss": 0.3893, + "step": 24145 + }, + { + "epoch": 0.9028670233894899, + "grad_norm": 0.4041035771369934, + "learning_rate": 4.619479450064535e-07, + "loss": 0.3135, + "step": 24150 + }, + { + "epoch": 0.9030539523798398, + "grad_norm": 0.5101954936981201, + "learning_rate": 4.6018531711061297e-07, + "loss": 0.2647, + "step": 24155 + }, + { + "epoch": 0.9032408813701895, + "grad_norm": 0.4400761127471924, + "learning_rate": 4.5842597922350683e-07, + "loss": 0.2631, + "step": 24160 + }, + { + "epoch": 0.9034278103605393, + "grad_norm": 0.4869976341724396, + "learning_rate": 4.566699319518808e-07, + "loss": 0.2841, + "step": 24165 + }, + { + "epoch": 0.9036147393508891, + "grad_norm": 0.6493037343025208, + "learning_rate": 4.5491717590134377e-07, + "loss": 0.3233, + "step": 24170 + }, + { + "epoch": 0.9038016683412389, + "grad_norm": 0.2384498566389084, + "learning_rate": 4.5316771167637e-07, + "loss": 0.2755, + "step": 24175 + }, + { + "epoch": 0.9039885973315887, + "grad_norm": 0.5434466004371643, + "learning_rate": 4.5142153988030236e-07, + "loss": 0.2591, + "step": 24180 + }, + { + "epoch": 0.9041755263219384, + "grad_norm": 0.2663435935974121, + "learning_rate": 4.4967866111534254e-07, + "loss": 0.2451, + "step": 24185 + }, + { + "epoch": 0.9043624553122882, + "grad_norm": 0.48140108585357666, + "learning_rate": 4.4793907598256193e-07, + "loss": 0.3392, + "step": 24190 + }, + { + "epoch": 0.904549384302638, + "grad_norm": 0.29158666729927063, + "learning_rate": 4.4620278508189395e-07, + "loss": 0.2499, + "step": 24195 + }, + { + "epoch": 0.9047363132929879, + "grad_norm": 0.8527382612228394, + "learning_rate": 4.44469789012133e-07, + "loss": 0.2645, + "step": 24200 + }, + { + "epoch": 0.9049232422833376, + "grad_norm": 0.469309538602829, + "learning_rate": 4.4274008837094316e-07, + "loss": 0.3296, + "step": 24205 + }, + { + "epoch": 0.9051101712736874, + "grad_norm": 0.7557592988014221, + "learning_rate": 4.410136837548462e-07, + "loss": 0.2996, + "step": 24210 + }, + { + "epoch": 0.9052971002640372, + "grad_norm": 0.36758342385292053, + "learning_rate": 4.392905757592303e-07, + "loss": 0.2624, + "step": 24215 + }, + { + "epoch": 0.905484029254387, + "grad_norm": 0.6781210899353027, + "learning_rate": 4.3757076497834337e-07, + "loss": 0.3425, + "step": 24220 + }, + { + "epoch": 0.9056709582447368, + "grad_norm": 0.4783806800842285, + "learning_rate": 4.358542520053044e-07, + "loss": 0.3332, + "step": 24225 + }, + { + "epoch": 0.9058578872350865, + "grad_norm": 0.33384254574775696, + "learning_rate": 4.3414103743208426e-07, + "loss": 0.2122, + "step": 24230 + }, + { + "epoch": 0.9060448162254363, + "grad_norm": 0.31750744581222534, + "learning_rate": 4.3243112184952365e-07, + "loss": 0.31, + "step": 24235 + }, + { + "epoch": 0.9062317452157862, + "grad_norm": 0.32413434982299805, + "learning_rate": 4.307245058473253e-07, + "loss": 0.2356, + "step": 24240 + }, + { + "epoch": 0.906418674206136, + "grad_norm": 0.5775876641273499, + "learning_rate": 4.290211900140495e-07, + "loss": 0.3082, + "step": 24245 + }, + { + "epoch": 0.9066056031964858, + "grad_norm": 0.3418591618537903, + "learning_rate": 4.27321174937122e-07, + "loss": 0.3737, + "step": 24250 + }, + { + "epoch": 0.9067925321868355, + "grad_norm": 0.48873278498649597, + "learning_rate": 4.256244612028293e-07, + "loss": 0.3144, + "step": 24255 + }, + { + "epoch": 0.9069794611771853, + "grad_norm": 0.43501177430152893, + "learning_rate": 4.2393104939632e-07, + "loss": 0.3015, + "step": 24260 + }, + { + "epoch": 0.9071663901675351, + "grad_norm": 0.46144506335258484, + "learning_rate": 4.222409401016025e-07, + "loss": 0.3018, + "step": 24265 + }, + { + "epoch": 0.9073533191578849, + "grad_norm": 0.4738104045391083, + "learning_rate": 4.205541339015484e-07, + "loss": 0.2621, + "step": 24270 + }, + { + "epoch": 0.9075402481482346, + "grad_norm": 0.5475369691848755, + "learning_rate": 4.1887063137788565e-07, + "loss": 0.3754, + "step": 24275 + }, + { + "epoch": 0.9077271771385845, + "grad_norm": 0.26058679819107056, + "learning_rate": 4.1719043311120757e-07, + "loss": 0.2475, + "step": 24280 + }, + { + "epoch": 0.9079141061289343, + "grad_norm": 0.6181069016456604, + "learning_rate": 4.155135396809684e-07, + "loss": 0.3012, + "step": 24285 + }, + { + "epoch": 0.9081010351192841, + "grad_norm": 0.4764721095561981, + "learning_rate": 4.13839951665479e-07, + "loss": 0.28, + "step": 24290 + }, + { + "epoch": 0.9082879641096339, + "grad_norm": 0.20308902859687805, + "learning_rate": 4.1216966964191194e-07, + "loss": 0.341, + "step": 24295 + }, + { + "epoch": 0.9084748930999836, + "grad_norm": 0.3182278573513031, + "learning_rate": 4.1050269418629887e-07, + "loss": 0.3403, + "step": 24300 + }, + { + "epoch": 0.9086618220903334, + "grad_norm": 0.4016958475112915, + "learning_rate": 4.088390258735342e-07, + "loss": 0.2859, + "step": 24305 + }, + { + "epoch": 0.9088487510806832, + "grad_norm": 0.5920534729957581, + "learning_rate": 4.071786652773679e-07, + "loss": 0.3373, + "step": 24310 + }, + { + "epoch": 0.909035680071033, + "grad_norm": 0.5127608180046082, + "learning_rate": 4.0552161297041094e-07, + "loss": 0.3156, + "step": 24315 + }, + { + "epoch": 0.9092226090613829, + "grad_norm": 0.40801453590393066, + "learning_rate": 4.038678695241316e-07, + "loss": 0.3016, + "step": 24320 + }, + { + "epoch": 0.9094095380517326, + "grad_norm": 0.5536433458328247, + "learning_rate": 4.022174355088593e-07, + "loss": 0.3749, + "step": 24325 + }, + { + "epoch": 0.9095964670420824, + "grad_norm": 0.3775192201137543, + "learning_rate": 4.005703114937842e-07, + "loss": 0.3362, + "step": 24330 + }, + { + "epoch": 0.9097833960324322, + "grad_norm": 0.3780711591243744, + "learning_rate": 3.989264980469498e-07, + "loss": 0.2515, + "step": 24335 + }, + { + "epoch": 0.909970325022782, + "grad_norm": 0.27277469635009766, + "learning_rate": 3.972859957352604e-07, + "loss": 0.2751, + "step": 24340 + }, + { + "epoch": 0.9101572540131317, + "grad_norm": 0.28428831696510315, + "learning_rate": 3.956488051244789e-07, + "loss": 0.3447, + "step": 24345 + }, + { + "epoch": 0.9103441830034815, + "grad_norm": 0.3745942711830139, + "learning_rate": 3.9401492677922483e-07, + "loss": 0.2715, + "step": 24350 + }, + { + "epoch": 0.9105311119938313, + "grad_norm": 0.33236637711524963, + "learning_rate": 3.9238436126297743e-07, + "loss": 0.3158, + "step": 24355 + }, + { + "epoch": 0.9107180409841812, + "grad_norm": 0.4709293246269226, + "learning_rate": 3.9075710913807016e-07, + "loss": 0.2759, + "step": 24360 + }, + { + "epoch": 0.910904969974531, + "grad_norm": 0.43469110131263733, + "learning_rate": 3.8913317096569427e-07, + "loss": 0.2859, + "step": 24365 + }, + { + "epoch": 0.9110918989648807, + "grad_norm": 0.3980669379234314, + "learning_rate": 3.875125473059027e-07, + "loss": 0.3309, + "step": 24370 + }, + { + "epoch": 0.9112788279552305, + "grad_norm": 0.5004674196243286, + "learning_rate": 3.8589523871760183e-07, + "loss": 0.1799, + "step": 24375 + }, + { + "epoch": 0.9114657569455803, + "grad_norm": 0.28025054931640625, + "learning_rate": 3.8428124575855317e-07, + "loss": 0.2318, + "step": 24380 + }, + { + "epoch": 0.9116526859359301, + "grad_norm": 0.6791032552719116, + "learning_rate": 3.826705689853782e-07, + "loss": 0.2544, + "step": 24385 + }, + { + "epoch": 0.9118396149262799, + "grad_norm": 0.44797998666763306, + "learning_rate": 3.810632089535526e-07, + "loss": 0.2337, + "step": 24390 + }, + { + "epoch": 0.9120265439166296, + "grad_norm": 0.6669244766235352, + "learning_rate": 3.794591662174096e-07, + "loss": 0.1961, + "step": 24395 + }, + { + "epoch": 0.9122134729069794, + "grad_norm": 0.9826619625091553, + "learning_rate": 3.778584413301356e-07, + "loss": 0.24, + "step": 24400 + }, + { + "epoch": 0.9124004018973293, + "grad_norm": 0.5112035870552063, + "learning_rate": 3.7626103484377674e-07, + "loss": 0.2262, + "step": 24405 + }, + { + "epoch": 0.9125873308876791, + "grad_norm": 0.3886997699737549, + "learning_rate": 3.7466694730923124e-07, + "loss": 0.2582, + "step": 24410 + }, + { + "epoch": 0.9127742598780288, + "grad_norm": 0.26052147150039673, + "learning_rate": 3.7307617927625494e-07, + "loss": 0.3355, + "step": 24415 + }, + { + "epoch": 0.9129611888683786, + "grad_norm": 0.18169300258159637, + "learning_rate": 3.7148873129345896e-07, + "loss": 0.2485, + "step": 24420 + }, + { + "epoch": 0.9131481178587284, + "grad_norm": 0.6013249158859253, + "learning_rate": 3.6990460390830653e-07, + "loss": 0.2622, + "step": 24425 + }, + { + "epoch": 0.9133350468490782, + "grad_norm": 0.12792934477329254, + "learning_rate": 3.6832379766712057e-07, + "loss": 0.2445, + "step": 24430 + }, + { + "epoch": 0.913521975839428, + "grad_norm": 0.6494380235671997, + "learning_rate": 3.667463131150728e-07, + "loss": 0.2618, + "step": 24435 + }, + { + "epoch": 0.9137089048297777, + "grad_norm": 1.2967865467071533, + "learning_rate": 3.6517215079619583e-07, + "loss": 0.3326, + "step": 24440 + }, + { + "epoch": 0.9138958338201276, + "grad_norm": 0.43164533376693726, + "learning_rate": 3.6360131125336983e-07, + "loss": 0.2116, + "step": 24445 + }, + { + "epoch": 0.9140827628104774, + "grad_norm": 0.2929776608943939, + "learning_rate": 3.620337950283348e-07, + "loss": 0.3048, + "step": 24450 + }, + { + "epoch": 0.9142696918008272, + "grad_norm": 0.4373328685760498, + "learning_rate": 3.6046960266168163e-07, + "loss": 0.2887, + "step": 24455 + }, + { + "epoch": 0.914456620791177, + "grad_norm": 0.29323285818099976, + "learning_rate": 3.5890873469285325e-07, + "loss": 0.3293, + "step": 24460 + }, + { + "epoch": 0.9146435497815267, + "grad_norm": 0.3576362729072571, + "learning_rate": 3.573511916601513e-07, + "loss": 0.2113, + "step": 24465 + }, + { + "epoch": 0.9148304787718765, + "grad_norm": 0.5483099222183228, + "learning_rate": 3.55796974100725e-07, + "loss": 0.2588, + "step": 24470 + }, + { + "epoch": 0.9150174077622263, + "grad_norm": 0.30468040704727173, + "learning_rate": 3.5424608255058334e-07, + "loss": 0.2478, + "step": 24475 + }, + { + "epoch": 0.9152043367525761, + "grad_norm": 0.3078984320163727, + "learning_rate": 3.526985175445796e-07, + "loss": 0.2239, + "step": 24480 + }, + { + "epoch": 0.915391265742926, + "grad_norm": 0.2722003757953644, + "learning_rate": 3.511542796164291e-07, + "loss": 0.2702, + "step": 24485 + }, + { + "epoch": 0.9155781947332757, + "grad_norm": 0.317244291305542, + "learning_rate": 3.496133692986914e-07, + "loss": 0.2063, + "step": 24490 + }, + { + "epoch": 0.9157651237236255, + "grad_norm": 0.23974300920963287, + "learning_rate": 3.480757871227858e-07, + "loss": 0.2249, + "step": 24495 + }, + { + "epoch": 0.9159520527139753, + "grad_norm": 0.8161414265632629, + "learning_rate": 3.4654153361897815e-07, + "loss": 0.3797, + "step": 24500 + }, + { + "epoch": 0.9161389817043251, + "grad_norm": 0.28393059968948364, + "learning_rate": 3.4501060931638743e-07, + "loss": 0.2648, + "step": 24505 + }, + { + "epoch": 0.9163259106946748, + "grad_norm": 0.44209572672843933, + "learning_rate": 3.4348301474298906e-07, + "loss": 0.3289, + "step": 24510 + }, + { + "epoch": 0.9165128396850246, + "grad_norm": 0.35628998279571533, + "learning_rate": 3.4195875042560276e-07, + "loss": 0.2093, + "step": 24515 + }, + { + "epoch": 0.9166997686753744, + "grad_norm": 0.3359036445617676, + "learning_rate": 3.4043781688990696e-07, + "loss": 0.2941, + "step": 24520 + }, + { + "epoch": 0.9168866976657243, + "grad_norm": 0.6143574118614197, + "learning_rate": 3.3892021466042646e-07, + "loss": 0.303, + "step": 24525 + }, + { + "epoch": 0.9170736266560741, + "grad_norm": 0.24244384467601776, + "learning_rate": 3.374059442605393e-07, + "loss": 0.2823, + "step": 24530 + }, + { + "epoch": 0.9172605556464238, + "grad_norm": 0.8676662445068359, + "learning_rate": 3.3589500621247215e-07, + "loss": 0.3931, + "step": 24535 + }, + { + "epoch": 0.9174474846367736, + "grad_norm": 0.5118948817253113, + "learning_rate": 3.3438740103730716e-07, + "loss": 0.2997, + "step": 24540 + }, + { + "epoch": 0.9176344136271234, + "grad_norm": 1.1253736019134521, + "learning_rate": 3.3288312925497277e-07, + "loss": 0.274, + "step": 24545 + }, + { + "epoch": 0.9178213426174732, + "grad_norm": 0.32212045788764954, + "learning_rate": 3.3138219138424856e-07, + "loss": 0.2363, + "step": 24550 + }, + { + "epoch": 0.918008271607823, + "grad_norm": 0.5436800122261047, + "learning_rate": 3.298845879427659e-07, + "loss": 0.2476, + "step": 24555 + }, + { + "epoch": 0.9181952005981727, + "grad_norm": 0.6046520471572876, + "learning_rate": 3.2839031944700394e-07, + "loss": 0.2633, + "step": 24560 + }, + { + "epoch": 0.9183821295885226, + "grad_norm": 0.5511953234672546, + "learning_rate": 3.2689938641229603e-07, + "loss": 0.3388, + "step": 24565 + }, + { + "epoch": 0.9185690585788724, + "grad_norm": 0.30859842896461487, + "learning_rate": 3.254117893528186e-07, + "loss": 0.2729, + "step": 24570 + }, + { + "epoch": 0.9187559875692222, + "grad_norm": 0.39766547083854675, + "learning_rate": 3.239275287816035e-07, + "loss": 0.3074, + "step": 24575 + }, + { + "epoch": 0.9189429165595719, + "grad_norm": 0.3398101031780243, + "learning_rate": 3.224466052105291e-07, + "loss": 0.3279, + "step": 24580 + }, + { + "epoch": 0.9191298455499217, + "grad_norm": 0.7181034684181213, + "learning_rate": 3.209690191503245e-07, + "loss": 0.3621, + "step": 24585 + }, + { + "epoch": 0.9193167745402715, + "grad_norm": 0.541123628616333, + "learning_rate": 3.194947711105645e-07, + "loss": 0.2336, + "step": 24590 + }, + { + "epoch": 0.9195037035306213, + "grad_norm": 0.24448862671852112, + "learning_rate": 3.180238615996778e-07, + "loss": 0.3324, + "step": 24595 + }, + { + "epoch": 0.919690632520971, + "grad_norm": 0.8702734112739563, + "learning_rate": 3.165562911249376e-07, + "loss": 0.2881, + "step": 24600 + }, + { + "epoch": 0.9198775615113209, + "grad_norm": 0.38017889857292175, + "learning_rate": 3.1509206019246564e-07, + "loss": 0.2344, + "step": 24605 + }, + { + "epoch": 0.9200644905016707, + "grad_norm": 0.48280656337738037, + "learning_rate": 3.136311693072347e-07, + "loss": 0.2581, + "step": 24610 + }, + { + "epoch": 0.9202514194920205, + "grad_norm": 0.37911441922187805, + "learning_rate": 3.1217361897306395e-07, + "loss": 0.2333, + "step": 24615 + }, + { + "epoch": 0.9204383484823703, + "grad_norm": 0.4707018733024597, + "learning_rate": 3.107194096926214e-07, + "loss": 0.2859, + "step": 24620 + }, + { + "epoch": 0.92062527747272, + "grad_norm": 0.308724969625473, + "learning_rate": 3.0926854196742017e-07, + "loss": 0.28, + "step": 24625 + }, + { + "epoch": 0.9208122064630698, + "grad_norm": 0.3189759850502014, + "learning_rate": 3.0782101629782456e-07, + "loss": 0.2821, + "step": 24630 + }, + { + "epoch": 0.9209991354534196, + "grad_norm": 0.24063967168331146, + "learning_rate": 3.063768331830441e-07, + "loss": 0.2809, + "step": 24635 + }, + { + "epoch": 0.9211860644437694, + "grad_norm": 0.5102589726448059, + "learning_rate": 3.049359931211382e-07, + "loss": 0.3776, + "step": 24640 + }, + { + "epoch": 0.9213729934341192, + "grad_norm": 0.5993364453315735, + "learning_rate": 3.034984966090082e-07, + "loss": 0.3378, + "step": 24645 + }, + { + "epoch": 0.921559922424469, + "grad_norm": 0.5947995185852051, + "learning_rate": 3.020643441424065e-07, + "loss": 0.2313, + "step": 24650 + }, + { + "epoch": 0.9217468514148188, + "grad_norm": 0.35502389073371887, + "learning_rate": 3.006335362159329e-07, + "loss": 0.2421, + "step": 24655 + }, + { + "epoch": 0.9219337804051686, + "grad_norm": 0.5855832099914551, + "learning_rate": 2.9920607332302844e-07, + "loss": 0.2156, + "step": 24660 + }, + { + "epoch": 0.9221207093955184, + "grad_norm": 0.2624458074569702, + "learning_rate": 2.97781955955988e-07, + "loss": 0.2104, + "step": 24665 + }, + { + "epoch": 0.9223076383858682, + "grad_norm": 0.41134822368621826, + "learning_rate": 2.9636118460594667e-07, + "loss": 0.3424, + "step": 24670 + }, + { + "epoch": 0.9224945673762179, + "grad_norm": 0.28406789898872375, + "learning_rate": 2.94943759762889e-07, + "loss": 0.2791, + "step": 24675 + }, + { + "epoch": 0.9226814963665677, + "grad_norm": 0.6110890507698059, + "learning_rate": 2.93529681915643e-07, + "loss": 0.3006, + "step": 24680 + }, + { + "epoch": 0.9228684253569175, + "grad_norm": 0.4325246810913086, + "learning_rate": 2.9211895155188406e-07, + "loss": 0.2617, + "step": 24685 + }, + { + "epoch": 0.9230553543472674, + "grad_norm": 0.3159783184528351, + "learning_rate": 2.9071156915813413e-07, + "loss": 0.2363, + "step": 24690 + }, + { + "epoch": 0.9232422833376172, + "grad_norm": 0.3180788457393646, + "learning_rate": 2.8930753521975496e-07, + "loss": 0.2499, + "step": 24695 + }, + { + "epoch": 0.9234292123279669, + "grad_norm": 0.4603706896305084, + "learning_rate": 2.879068502209625e-07, + "loss": 0.2631, + "step": 24700 + }, + { + "epoch": 0.9236161413183167, + "grad_norm": 0.5309453010559082, + "learning_rate": 2.865095146448105e-07, + "loss": 0.2576, + "step": 24705 + }, + { + "epoch": 0.9238030703086665, + "grad_norm": 0.5529757738113403, + "learning_rate": 2.8511552897319997e-07, + "loss": 0.229, + "step": 24710 + }, + { + "epoch": 0.9239899992990163, + "grad_norm": 0.38355591893196106, + "learning_rate": 2.837248936868764e-07, + "loss": 0.2191, + "step": 24715 + }, + { + "epoch": 0.924176928289366, + "grad_norm": 0.40068626403808594, + "learning_rate": 2.823376092654306e-07, + "loss": 0.3506, + "step": 24720 + }, + { + "epoch": 0.9243638572797158, + "grad_norm": 0.46175092458724976, + "learning_rate": 2.8095367618729664e-07, + "loss": 0.2266, + "step": 24725 + }, + { + "epoch": 0.9245507862700657, + "grad_norm": 0.4619698226451874, + "learning_rate": 2.7957309492975483e-07, + "loss": 0.2883, + "step": 24730 + }, + { + "epoch": 0.9247377152604155, + "grad_norm": 0.3867049217224121, + "learning_rate": 2.781958659689277e-07, + "loss": 0.2381, + "step": 24735 + }, + { + "epoch": 0.9249246442507653, + "grad_norm": 0.34441909193992615, + "learning_rate": 2.768219897797797e-07, + "loss": 0.2983, + "step": 24740 + }, + { + "epoch": 0.925111573241115, + "grad_norm": 0.1862328201532364, + "learning_rate": 2.7545146683612413e-07, + "loss": 0.2152, + "step": 24745 + }, + { + "epoch": 0.9252985022314648, + "grad_norm": 0.4825479984283447, + "learning_rate": 2.7408429761061393e-07, + "loss": 0.2617, + "step": 24750 + }, + { + "epoch": 0.9254854312218146, + "grad_norm": 0.40720510482788086, + "learning_rate": 2.727204825747476e-07, + "loss": 0.3691, + "step": 24755 + }, + { + "epoch": 0.9256723602121644, + "grad_norm": 0.48773303627967834, + "learning_rate": 2.7136002219886326e-07, + "loss": 0.3045, + "step": 24760 + }, + { + "epoch": 0.9258592892025141, + "grad_norm": 0.5098922848701477, + "learning_rate": 2.700029169521479e-07, + "loss": 0.2355, + "step": 24765 + }, + { + "epoch": 0.926046218192864, + "grad_norm": 0.3420339524745941, + "learning_rate": 2.6864916730262593e-07, + "loss": 0.2178, + "step": 24770 + }, + { + "epoch": 0.9262331471832138, + "grad_norm": 0.8477463722229004, + "learning_rate": 2.672987737171673e-07, + "loss": 0.2843, + "step": 24775 + }, + { + "epoch": 0.9264200761735636, + "grad_norm": 0.481887549161911, + "learning_rate": 2.65951736661485e-07, + "loss": 0.2758, + "step": 24780 + }, + { + "epoch": 0.9266070051639134, + "grad_norm": 0.2788504660129547, + "learning_rate": 2.646080566001341e-07, + "loss": 0.289, + "step": 24785 + }, + { + "epoch": 0.9267939341542631, + "grad_norm": 0.46086716651916504, + "learning_rate": 2.632677339965095e-07, + "loss": 0.285, + "step": 24790 + }, + { + "epoch": 0.9269808631446129, + "grad_norm": 0.7587642669677734, + "learning_rate": 2.6193076931285035e-07, + "loss": 0.3542, + "step": 24795 + }, + { + "epoch": 0.9271677921349627, + "grad_norm": 0.32218098640441895, + "learning_rate": 2.6059716301023885e-07, + "loss": 0.2192, + "step": 24800 + }, + { + "epoch": 0.9273547211253125, + "grad_norm": 0.6089930534362793, + "learning_rate": 2.5926691554859497e-07, + "loss": 0.2849, + "step": 24805 + }, + { + "epoch": 0.9275416501156624, + "grad_norm": 0.8761005401611328, + "learning_rate": 2.579400273866861e-07, + "loss": 0.2688, + "step": 24810 + }, + { + "epoch": 0.9277285791060121, + "grad_norm": 0.4644549787044525, + "learning_rate": 2.5661649898211515e-07, + "loss": 0.2952, + "step": 24815 + }, + { + "epoch": 0.9279155080963619, + "grad_norm": 0.19469016790390015, + "learning_rate": 2.552963307913303e-07, + "loss": 0.2141, + "step": 24820 + }, + { + "epoch": 0.9281024370867117, + "grad_norm": 0.5360405445098877, + "learning_rate": 2.5397952326962183e-07, + "loss": 0.2797, + "step": 24825 + }, + { + "epoch": 0.9282893660770615, + "grad_norm": 0.5923690795898438, + "learning_rate": 2.526660768711153e-07, + "loss": 0.2603, + "step": 24830 + }, + { + "epoch": 0.9284762950674113, + "grad_norm": 0.47409185767173767, + "learning_rate": 2.5135599204878403e-07, + "loss": 0.2873, + "step": 24835 + }, + { + "epoch": 0.928663224057761, + "grad_norm": 0.7585999965667725, + "learning_rate": 2.500492692544354e-07, + "loss": 0.2297, + "step": 24840 + }, + { + "epoch": 0.9288501530481108, + "grad_norm": 0.5913994908332825, + "learning_rate": 2.487459089387234e-07, + "loss": 0.2202, + "step": 24845 + }, + { + "epoch": 0.9290370820384607, + "grad_norm": 0.5756853818893433, + "learning_rate": 2.474459115511374e-07, + "loss": 0.2813, + "step": 24850 + }, + { + "epoch": 0.9292240110288105, + "grad_norm": 0.8577892780303955, + "learning_rate": 2.461492775400121e-07, + "loss": 0.2344, + "step": 24855 + }, + { + "epoch": 0.9294109400191602, + "grad_norm": 0.26759210228919983, + "learning_rate": 2.448560073525164e-07, + "loss": 0.3403, + "step": 24860 + }, + { + "epoch": 0.92959786900951, + "grad_norm": 1.6753215789794922, + "learning_rate": 2.4356610143466353e-07, + "loss": 0.2956, + "step": 24865 + }, + { + "epoch": 0.9297847979998598, + "grad_norm": 0.6404578685760498, + "learning_rate": 2.422795602313066e-07, + "loss": 0.2465, + "step": 24870 + }, + { + "epoch": 0.9299717269902096, + "grad_norm": 0.33225345611572266, + "learning_rate": 2.40996384186134e-07, + "loss": 0.2499, + "step": 24875 + }, + { + "epoch": 0.9301586559805594, + "grad_norm": 0.5298144221305847, + "learning_rate": 2.3971657374167956e-07, + "loss": 0.2753, + "step": 24880 + }, + { + "epoch": 0.9303455849709091, + "grad_norm": 0.3931209444999695, + "learning_rate": 2.3844012933930906e-07, + "loss": 0.2161, + "step": 24885 + }, + { + "epoch": 0.9305325139612589, + "grad_norm": 0.8054724931716919, + "learning_rate": 2.371670514192348e-07, + "loss": 0.2659, + "step": 24890 + }, + { + "epoch": 0.9307194429516088, + "grad_norm": 0.33258944749832153, + "learning_rate": 2.358973404205034e-07, + "loss": 0.2871, + "step": 24895 + }, + { + "epoch": 0.9309063719419586, + "grad_norm": 0.475870281457901, + "learning_rate": 2.3463099678100344e-07, + "loss": 0.2974, + "step": 24900 + }, + { + "epoch": 0.9310933009323084, + "grad_norm": 0.43682655692100525, + "learning_rate": 2.333680209374578e-07, + "loss": 0.2509, + "step": 24905 + }, + { + "epoch": 0.9312802299226581, + "grad_norm": 0.3373750150203705, + "learning_rate": 2.3210841332543254e-07, + "loss": 0.2143, + "step": 24910 + }, + { + "epoch": 0.9314671589130079, + "grad_norm": 0.33678069710731506, + "learning_rate": 2.3085217437933127e-07, + "loss": 0.2806, + "step": 24915 + }, + { + "epoch": 0.9316540879033577, + "grad_norm": 0.47912338376045227, + "learning_rate": 2.295993045323941e-07, + "loss": 0.2977, + "step": 24920 + }, + { + "epoch": 0.9318410168937075, + "grad_norm": 0.3937797248363495, + "learning_rate": 2.2834980421669872e-07, + "loss": 0.2392, + "step": 24925 + }, + { + "epoch": 0.9320279458840572, + "grad_norm": 0.6482548117637634, + "learning_rate": 2.2710367386316156e-07, + "loss": 0.2579, + "step": 24930 + }, + { + "epoch": 0.9322148748744071, + "grad_norm": 1.0093536376953125, + "learning_rate": 2.2586091390153996e-07, + "loss": 0.3091, + "step": 24935 + }, + { + "epoch": 0.9324018038647569, + "grad_norm": 0.29904359579086304, + "learning_rate": 2.246215247604233e-07, + "loss": 0.3163, + "step": 24940 + }, + { + "epoch": 0.9325887328551067, + "grad_norm": 0.6364412903785706, + "learning_rate": 2.2338550686724413e-07, + "loss": 0.2685, + "step": 24945 + }, + { + "epoch": 0.9327756618454565, + "grad_norm": 0.7019013166427612, + "learning_rate": 2.221528606482659e-07, + "loss": 0.3034, + "step": 24950 + }, + { + "epoch": 0.9329625908358062, + "grad_norm": 0.5660467743873596, + "learning_rate": 2.2092358652859634e-07, + "loss": 0.25, + "step": 24955 + }, + { + "epoch": 0.933149519826156, + "grad_norm": 0.5666435956954956, + "learning_rate": 2.1969768493217747e-07, + "loss": 0.2411, + "step": 24960 + }, + { + "epoch": 0.9333364488165058, + "grad_norm": 0.3779709041118622, + "learning_rate": 2.1847515628178328e-07, + "loss": 0.3184, + "step": 24965 + }, + { + "epoch": 0.9335233778068556, + "grad_norm": 0.6057195067405701, + "learning_rate": 2.1725600099903433e-07, + "loss": 0.23, + "step": 24970 + }, + { + "epoch": 0.9337103067972055, + "grad_norm": 0.5047149062156677, + "learning_rate": 2.160402195043776e-07, + "loss": 0.3242, + "step": 24975 + }, + { + "epoch": 0.9338972357875552, + "grad_norm": 1.0450160503387451, + "learning_rate": 2.1482781221710437e-07, + "loss": 0.2839, + "step": 24980 + }, + { + "epoch": 0.934084164777905, + "grad_norm": 0.44621530175209045, + "learning_rate": 2.1361877955533682e-07, + "loss": 0.2289, + "step": 24985 + }, + { + "epoch": 0.9342710937682548, + "grad_norm": 0.547173023223877, + "learning_rate": 2.1241312193603814e-07, + "loss": 0.3041, + "step": 24990 + }, + { + "epoch": 0.9344580227586046, + "grad_norm": 0.4551314413547516, + "learning_rate": 2.1121083977500346e-07, + "loss": 0.2025, + "step": 24995 + }, + { + "epoch": 0.9346449517489543, + "grad_norm": 0.19889132678508759, + "learning_rate": 2.1001193348686444e-07, + "loss": 0.2547, + "step": 25000 + }, + { + "epoch": 0.9348318807393041, + "grad_norm": 0.4394857585430145, + "learning_rate": 2.0881640348509258e-07, + "loss": 0.3064, + "step": 25005 + }, + { + "epoch": 0.9350188097296539, + "grad_norm": 0.29480358958244324, + "learning_rate": 2.0762425018199028e-07, + "loss": 0.2712, + "step": 25010 + }, + { + "epoch": 0.9352057387200038, + "grad_norm": 0.306598037481308, + "learning_rate": 2.0643547398869646e-07, + "loss": 0.2319, + "step": 25015 + }, + { + "epoch": 0.9353926677103536, + "grad_norm": 0.18189042806625366, + "learning_rate": 2.052500753151876e-07, + "loss": 0.2516, + "step": 25020 + }, + { + "epoch": 0.9355795967007033, + "grad_norm": 0.3686217963695526, + "learning_rate": 2.0406805457027225e-07, + "loss": 0.2465, + "step": 25025 + }, + { + "epoch": 0.9357665256910531, + "grad_norm": 0.3795625567436218, + "learning_rate": 2.028894121615943e-07, + "loss": 0.3719, + "step": 25030 + }, + { + "epoch": 0.9359534546814029, + "grad_norm": 0.3678499758243561, + "learning_rate": 2.0171414849563753e-07, + "loss": 0.2745, + "step": 25035 + }, + { + "epoch": 0.9361403836717527, + "grad_norm": 0.20017379522323608, + "learning_rate": 2.0054226397771216e-07, + "loss": 0.2769, + "step": 25040 + }, + { + "epoch": 0.9363273126621025, + "grad_norm": 0.42065608501434326, + "learning_rate": 1.9937375901197154e-07, + "loss": 0.3136, + "step": 25045 + }, + { + "epoch": 0.9365142416524522, + "grad_norm": 0.4758951663970947, + "learning_rate": 1.9820863400139778e-07, + "loss": 0.2509, + "step": 25050 + }, + { + "epoch": 0.9367011706428021, + "grad_norm": 0.30327993631362915, + "learning_rate": 1.9704688934780946e-07, + "loss": 0.2372, + "step": 25055 + }, + { + "epoch": 0.9368880996331519, + "grad_norm": 0.4054654836654663, + "learning_rate": 1.9588852545185831e-07, + "loss": 0.3371, + "step": 25060 + }, + { + "epoch": 0.9370750286235017, + "grad_norm": 0.40502023696899414, + "learning_rate": 1.9473354271303258e-07, + "loss": 0.2511, + "step": 25065 + }, + { + "epoch": 0.9372619576138514, + "grad_norm": 0.4846619665622711, + "learning_rate": 1.9358194152965139e-07, + "loss": 0.2792, + "step": 25070 + }, + { + "epoch": 0.9374488866042012, + "grad_norm": 0.9523441195487976, + "learning_rate": 1.9243372229886704e-07, + "loss": 0.3035, + "step": 25075 + }, + { + "epoch": 0.937635815594551, + "grad_norm": 0.7273759245872498, + "learning_rate": 1.9128888541667167e-07, + "loss": 0.2516, + "step": 25080 + }, + { + "epoch": 0.9378227445849008, + "grad_norm": 0.32927486300468445, + "learning_rate": 1.9014743127788392e-07, + "loss": 0.2888, + "step": 25085 + }, + { + "epoch": 0.9380096735752506, + "grad_norm": 0.31658056378364563, + "learning_rate": 1.890093602761589e-07, + "loss": 0.2707, + "step": 25090 + }, + { + "epoch": 0.9381966025656004, + "grad_norm": 0.41149401664733887, + "learning_rate": 1.8787467280398597e-07, + "loss": 0.2212, + "step": 25095 + }, + { + "epoch": 0.9383835315559502, + "grad_norm": 0.3825901746749878, + "learning_rate": 1.8674336925268434e-07, + "loss": 0.3109, + "step": 25100 + }, + { + "epoch": 0.9385704605463, + "grad_norm": 0.28844115138053894, + "learning_rate": 1.8561545001240967e-07, + "loss": 0.315, + "step": 25105 + }, + { + "epoch": 0.9387573895366498, + "grad_norm": 0.6432269811630249, + "learning_rate": 1.844909154721497e-07, + "loss": 0.3375, + "step": 25110 + }, + { + "epoch": 0.9389443185269996, + "grad_norm": 0.3793524205684662, + "learning_rate": 1.83369766019722e-07, + "loss": 0.3181, + "step": 25115 + }, + { + "epoch": 0.9391312475173493, + "grad_norm": 0.42755433917045593, + "learning_rate": 1.8225200204177952e-07, + "loss": 0.3514, + "step": 25120 + }, + { + "epoch": 0.9393181765076991, + "grad_norm": 0.2594048082828522, + "learning_rate": 1.811376239238083e-07, + "loss": 0.256, + "step": 25125 + }, + { + "epoch": 0.9395051054980489, + "grad_norm": 0.6199470162391663, + "learning_rate": 1.8002663205012428e-07, + "loss": 0.2888, + "step": 25130 + }, + { + "epoch": 0.9396920344883987, + "grad_norm": 0.5659986734390259, + "learning_rate": 1.7891902680387652e-07, + "loss": 0.2381, + "step": 25135 + }, + { + "epoch": 0.9398789634787486, + "grad_norm": 0.31770560145378113, + "learning_rate": 1.7781480856704835e-07, + "loss": 0.3309, + "step": 25140 + }, + { + "epoch": 0.9400658924690983, + "grad_norm": 0.44480013847351074, + "learning_rate": 1.7671397772044962e-07, + "loss": 0.3511, + "step": 25145 + }, + { + "epoch": 0.9402528214594481, + "grad_norm": 0.48626697063446045, + "learning_rate": 1.7561653464372885e-07, + "loss": 0.2877, + "step": 25150 + }, + { + "epoch": 0.9404397504497979, + "grad_norm": 0.4127686023712158, + "learning_rate": 1.7452247971535995e-07, + "loss": 0.2773, + "step": 25155 + }, + { + "epoch": 0.9406266794401477, + "grad_norm": 0.9780619144439697, + "learning_rate": 1.7343181331265336e-07, + "loss": 0.2247, + "step": 25160 + }, + { + "epoch": 0.9408136084304974, + "grad_norm": 0.4334399104118347, + "learning_rate": 1.7234453581174704e-07, + "loss": 0.2965, + "step": 25165 + }, + { + "epoch": 0.9410005374208472, + "grad_norm": 0.37765777111053467, + "learning_rate": 1.7126064758761217e-07, + "loss": 0.3012, + "step": 25170 + }, + { + "epoch": 0.941187466411197, + "grad_norm": 0.258969247341156, + "learning_rate": 1.701801490140509e-07, + "loss": 0.215, + "step": 25175 + }, + { + "epoch": 0.9413743954015469, + "grad_norm": 0.762693464756012, + "learning_rate": 1.6910304046369618e-07, + "loss": 0.3182, + "step": 25180 + }, + { + "epoch": 0.9415613243918967, + "grad_norm": 0.2748546600341797, + "learning_rate": 1.6802932230801205e-07, + "loss": 0.2761, + "step": 25185 + }, + { + "epoch": 0.9417482533822464, + "grad_norm": 0.556158721446991, + "learning_rate": 1.669589949172934e-07, + "loss": 0.2127, + "step": 25190 + }, + { + "epoch": 0.9419351823725962, + "grad_norm": 0.3723142445087433, + "learning_rate": 1.6589205866066493e-07, + "loss": 0.277, + "step": 25195 + }, + { + "epoch": 0.942122111362946, + "grad_norm": 0.42889419198036194, + "learning_rate": 1.6482851390608235e-07, + "loss": 0.3108, + "step": 25200 + }, + { + "epoch": 0.9423090403532958, + "grad_norm": 1.3041778802871704, + "learning_rate": 1.6376836102033223e-07, + "loss": 0.3281, + "step": 25205 + }, + { + "epoch": 0.9424959693436455, + "grad_norm": 0.5724756121635437, + "learning_rate": 1.6271160036903099e-07, + "loss": 0.3212, + "step": 25210 + }, + { + "epoch": 0.9426828983339953, + "grad_norm": 0.2716809809207916, + "learning_rate": 1.616582323166249e-07, + "loss": 0.2562, + "step": 25215 + }, + { + "epoch": 0.9428698273243452, + "grad_norm": 0.5220561027526855, + "learning_rate": 1.6060825722639005e-07, + "loss": 0.3082, + "step": 25220 + }, + { + "epoch": 0.943056756314695, + "grad_norm": 0.3205987811088562, + "learning_rate": 1.5956167546043234e-07, + "loss": 0.2507, + "step": 25225 + }, + { + "epoch": 0.9432436853050448, + "grad_norm": 0.36693549156188965, + "learning_rate": 1.5851848737968968e-07, + "loss": 0.2144, + "step": 25230 + }, + { + "epoch": 0.9434306142953945, + "grad_norm": 0.26295924186706543, + "learning_rate": 1.574786933439254e-07, + "loss": 0.3514, + "step": 25235 + }, + { + "epoch": 0.9436175432857443, + "grad_norm": 0.5648385286331177, + "learning_rate": 1.5644229371173714e-07, + "loss": 0.3295, + "step": 25240 + }, + { + "epoch": 0.9438044722760941, + "grad_norm": 0.43656599521636963, + "learning_rate": 1.5540928884054674e-07, + "loss": 0.2869, + "step": 25245 + }, + { + "epoch": 0.9439914012664439, + "grad_norm": 0.5460593104362488, + "learning_rate": 1.5437967908661143e-07, + "loss": 0.2633, + "step": 25250 + }, + { + "epoch": 0.9441783302567937, + "grad_norm": 0.4732210040092468, + "learning_rate": 1.5335346480501056e-07, + "loss": 0.2891, + "step": 25255 + }, + { + "epoch": 0.9443652592471435, + "grad_norm": 0.2938053011894226, + "learning_rate": 1.5233064634965878e-07, + "loss": 0.3107, + "step": 25260 + }, + { + "epoch": 0.9445521882374933, + "grad_norm": 0.28973618149757385, + "learning_rate": 1.513112240732939e-07, + "loss": 0.3202, + "step": 25265 + }, + { + "epoch": 0.9447391172278431, + "grad_norm": 0.34332942962646484, + "learning_rate": 1.5029519832748807e-07, + "loss": 0.3028, + "step": 25270 + }, + { + "epoch": 0.9449260462181929, + "grad_norm": 0.37853550910949707, + "learning_rate": 1.4928256946263875e-07, + "loss": 0.3199, + "step": 25275 + }, + { + "epoch": 0.9451129752085426, + "grad_norm": 0.8875368237495422, + "learning_rate": 1.4827333782797216e-07, + "loss": 0.2674, + "step": 25280 + }, + { + "epoch": 0.9452999041988924, + "grad_norm": 0.5367876291275024, + "learning_rate": 1.472675037715443e-07, + "loss": 0.2802, + "step": 25285 + }, + { + "epoch": 0.9454868331892422, + "grad_norm": 0.2964636981487274, + "learning_rate": 1.4626506764023663e-07, + "loss": 0.2168, + "step": 25290 + }, + { + "epoch": 0.945673762179592, + "grad_norm": 0.25260117650032043, + "learning_rate": 1.4526602977976368e-07, + "loss": 0.3053, + "step": 25295 + }, + { + "epoch": 0.9458606911699419, + "grad_norm": 0.49495014548301697, + "learning_rate": 1.4427039053466207e-07, + "loss": 0.2923, + "step": 25300 + }, + { + "epoch": 0.9460476201602916, + "grad_norm": 0.4212638735771179, + "learning_rate": 1.432781502483005e-07, + "loss": 0.2597, + "step": 25305 + }, + { + "epoch": 0.9462345491506414, + "grad_norm": 0.5896459817886353, + "learning_rate": 1.422893092628741e-07, + "loss": 0.2654, + "step": 25310 + }, + { + "epoch": 0.9464214781409912, + "grad_norm": 0.9404630661010742, + "learning_rate": 1.4130386791940564e-07, + "loss": 0.2247, + "step": 25315 + }, + { + "epoch": 0.946608407131341, + "grad_norm": 0.4611830413341522, + "learning_rate": 1.4032182655774661e-07, + "loss": 0.2525, + "step": 25320 + }, + { + "epoch": 0.9467953361216908, + "grad_norm": 0.5101840496063232, + "learning_rate": 1.3934318551657277e-07, + "loss": 0.2983, + "step": 25325 + }, + { + "epoch": 0.9469822651120405, + "grad_norm": 0.6482624411582947, + "learning_rate": 1.383679451333919e-07, + "loss": 0.3196, + "step": 25330 + }, + { + "epoch": 0.9471691941023903, + "grad_norm": 0.2823061943054199, + "learning_rate": 1.373961057445339e-07, + "loss": 0.2113, + "step": 25335 + }, + { + "epoch": 0.9473561230927402, + "grad_norm": 0.6522537469863892, + "learning_rate": 1.364276676851617e-07, + "loss": 0.2637, + "step": 25340 + }, + { + "epoch": 0.94754305208309, + "grad_norm": 0.338235080242157, + "learning_rate": 1.354626312892582e-07, + "loss": 0.2926, + "step": 25345 + }, + { + "epoch": 0.9477299810734398, + "grad_norm": 0.3630608022212982, + "learning_rate": 1.3450099688963823e-07, + "loss": 0.232, + "step": 25350 + }, + { + "epoch": 0.9479169100637895, + "grad_norm": 0.32054030895233154, + "learning_rate": 1.3354276481794325e-07, + "loss": 0.2809, + "step": 25355 + }, + { + "epoch": 0.9481038390541393, + "grad_norm": 0.40766042470932007, + "learning_rate": 1.3258793540463778e-07, + "loss": 0.2871, + "step": 25360 + }, + { + "epoch": 0.9482907680444891, + "grad_norm": 0.6181418299674988, + "learning_rate": 1.3163650897901724e-07, + "loss": 0.3037, + "step": 25365 + }, + { + "epoch": 0.9484776970348389, + "grad_norm": 0.39881226420402527, + "learning_rate": 1.3068848586920035e-07, + "loss": 0.27, + "step": 25370 + }, + { + "epoch": 0.9486646260251886, + "grad_norm": 0.548456609249115, + "learning_rate": 1.2974386640213333e-07, + "loss": 0.2401, + "step": 25375 + }, + { + "epoch": 0.9488515550155384, + "grad_norm": 0.3512159585952759, + "learning_rate": 1.2880265090358668e-07, + "loss": 0.3958, + "step": 25380 + }, + { + "epoch": 0.9490384840058883, + "grad_norm": 0.33197692036628723, + "learning_rate": 1.278648396981619e-07, + "loss": 0.2512, + "step": 25385 + }, + { + "epoch": 0.9492254129962381, + "grad_norm": 0.4836597144603729, + "learning_rate": 1.2693043310928022e-07, + "loss": 0.3093, + "step": 25390 + }, + { + "epoch": 0.9494123419865879, + "grad_norm": 0.3625621795654297, + "learning_rate": 1.2599943145919392e-07, + "loss": 0.3484, + "step": 25395 + }, + { + "epoch": 0.9495992709769376, + "grad_norm": 0.41878730058670044, + "learning_rate": 1.2507183506897723e-07, + "loss": 0.267, + "step": 25400 + }, + { + "epoch": 0.9497861999672874, + "grad_norm": 0.2932325303554535, + "learning_rate": 1.241476442585321e-07, + "loss": 0.2598, + "step": 25405 + }, + { + "epoch": 0.9499731289576372, + "grad_norm": 0.3391314744949341, + "learning_rate": 1.2322685934658573e-07, + "loss": 0.2433, + "step": 25410 + }, + { + "epoch": 0.950160057947987, + "grad_norm": 0.4093460142612457, + "learning_rate": 1.223094806506897e-07, + "loss": 0.2741, + "step": 25415 + }, + { + "epoch": 0.9503469869383367, + "grad_norm": 0.4316021800041199, + "learning_rate": 1.2139550848722203e-07, + "loss": 0.2655, + "step": 25420 + }, + { + "epoch": 0.9505339159286866, + "grad_norm": 0.7140107750892639, + "learning_rate": 1.2048494317138615e-07, + "loss": 0.3403, + "step": 25425 + }, + { + "epoch": 0.9507208449190364, + "grad_norm": 0.5122732520103455, + "learning_rate": 1.1957778501720973e-07, + "loss": 0.2101, + "step": 25430 + }, + { + "epoch": 0.9509077739093862, + "grad_norm": 1.210839867591858, + "learning_rate": 1.1867403433754476e-07, + "loss": 0.2857, + "step": 25435 + }, + { + "epoch": 0.951094702899736, + "grad_norm": 0.507588267326355, + "learning_rate": 1.177736914440697e-07, + "loss": 0.2956, + "step": 25440 + }, + { + "epoch": 0.9512816318900857, + "grad_norm": 0.4931941330432892, + "learning_rate": 1.1687675664728837e-07, + "loss": 0.2871, + "step": 25445 + }, + { + "epoch": 0.9514685608804355, + "grad_norm": 0.5191053748130798, + "learning_rate": 1.1598323025652447e-07, + "loss": 0.2739, + "step": 25450 + }, + { + "epoch": 0.9516554898707853, + "grad_norm": 0.37645572423934937, + "learning_rate": 1.150931125799315e-07, + "loss": 0.2422, + "step": 25455 + }, + { + "epoch": 0.9518424188611351, + "grad_norm": 0.46614864468574524, + "learning_rate": 1.1420640392448612e-07, + "loss": 0.2909, + "step": 25460 + }, + { + "epoch": 0.952029347851485, + "grad_norm": 0.3071075677871704, + "learning_rate": 1.1332310459598928e-07, + "loss": 0.1938, + "step": 25465 + }, + { + "epoch": 0.9522162768418347, + "grad_norm": 0.4759249985218048, + "learning_rate": 1.1244321489906285e-07, + "loss": 0.2831, + "step": 25470 + }, + { + "epoch": 0.9524032058321845, + "grad_norm": 0.4241246283054352, + "learning_rate": 1.1156673513715744e-07, + "loss": 0.2166, + "step": 25475 + }, + { + "epoch": 0.9525901348225343, + "grad_norm": 0.26274579763412476, + "learning_rate": 1.1069366561254679e-07, + "loss": 0.3136, + "step": 25480 + }, + { + "epoch": 0.9527770638128841, + "grad_norm": 0.3445034325122833, + "learning_rate": 1.0982400662632564e-07, + "loss": 0.2326, + "step": 25485 + }, + { + "epoch": 0.9529639928032339, + "grad_norm": 0.46550294756889343, + "learning_rate": 1.0895775847841516e-07, + "loss": 0.2672, + "step": 25490 + }, + { + "epoch": 0.9531509217935836, + "grad_norm": 0.3357786238193512, + "learning_rate": 1.0809492146755973e-07, + "loss": 0.2343, + "step": 25495 + }, + { + "epoch": 0.9533378507839334, + "grad_norm": 0.1831052303314209, + "learning_rate": 1.0723549589132687e-07, + "loss": 0.3252, + "step": 25500 + }, + { + "epoch": 0.9535247797742833, + "grad_norm": 0.726946234703064, + "learning_rate": 1.0637948204610837e-07, + "loss": 0.3023, + "step": 25505 + }, + { + "epoch": 0.9537117087646331, + "grad_norm": 0.5573680996894836, + "learning_rate": 1.055268802271181e-07, + "loss": 0.2786, + "step": 25510 + }, + { + "epoch": 0.9538986377549828, + "grad_norm": 0.5966600179672241, + "learning_rate": 1.0467769072839307e-07, + "loss": 0.3758, + "step": 25515 + }, + { + "epoch": 0.9540855667453326, + "grad_norm": 0.31094256043434143, + "learning_rate": 1.0383191384279789e-07, + "loss": 0.2691, + "step": 25520 + }, + { + "epoch": 0.9542724957356824, + "grad_norm": 0.6314263343811035, + "learning_rate": 1.029895498620126e-07, + "loss": 0.3314, + "step": 25525 + }, + { + "epoch": 0.9544594247260322, + "grad_norm": 0.6387194991111755, + "learning_rate": 1.0215059907654811e-07, + "loss": 0.2326, + "step": 25530 + }, + { + "epoch": 0.954646353716382, + "grad_norm": 0.27241405844688416, + "learning_rate": 1.013150617757308e-07, + "loss": 0.2931, + "step": 25535 + }, + { + "epoch": 0.9548332827067317, + "grad_norm": 0.6504563093185425, + "learning_rate": 1.0048293824771682e-07, + "loss": 0.3224, + "step": 25540 + }, + { + "epoch": 0.9550202116970816, + "grad_norm": 0.5727891325950623, + "learning_rate": 9.965422877948106e-08, + "loss": 0.2353, + "step": 25545 + }, + { + "epoch": 0.9552071406874314, + "grad_norm": 0.5274751782417297, + "learning_rate": 9.882893365681934e-08, + "loss": 0.3013, + "step": 25550 + }, + { + "epoch": 0.9553940696777812, + "grad_norm": 0.41574156284332275, + "learning_rate": 9.80070531643551e-08, + "loss": 0.2363, + "step": 25555 + }, + { + "epoch": 0.955580998668131, + "grad_norm": 0.5117254257202148, + "learning_rate": 9.71885875855294e-08, + "loss": 0.3033, + "step": 25560 + }, + { + "epoch": 0.9557679276584807, + "grad_norm": 0.5276907086372375, + "learning_rate": 9.637353720260867e-08, + "loss": 0.3117, + "step": 25565 + }, + { + "epoch": 0.9559548566488305, + "grad_norm": 0.6600742340087891, + "learning_rate": 9.556190229668027e-08, + "loss": 0.3827, + "step": 25570 + }, + { + "epoch": 0.9561417856391803, + "grad_norm": 0.43783825635910034, + "learning_rate": 9.475368314765365e-08, + "loss": 0.3325, + "step": 25575 + }, + { + "epoch": 0.9563287146295301, + "grad_norm": 0.4194599390029907, + "learning_rate": 9.394888003426028e-08, + "loss": 0.2532, + "step": 25580 + }, + { + "epoch": 0.95651564361988, + "grad_norm": 0.3996281921863556, + "learning_rate": 9.314749323405481e-08, + "loss": 0.2825, + "step": 25585 + }, + { + "epoch": 0.9567025726102297, + "grad_norm": 0.4395323395729065, + "learning_rate": 9.234952302341172e-08, + "loss": 0.2375, + "step": 25590 + }, + { + "epoch": 0.9568895016005795, + "grad_norm": 0.5239723920822144, + "learning_rate": 9.155496967752642e-08, + "loss": 0.3791, + "step": 25595 + }, + { + "epoch": 0.9570764305909293, + "grad_norm": 0.6656010746955872, + "learning_rate": 9.076383347042084e-08, + "loss": 0.3139, + "step": 25600 + }, + { + "epoch": 0.9572633595812791, + "grad_norm": 0.6281600594520569, + "learning_rate": 8.997611467493228e-08, + "loss": 0.3157, + "step": 25605 + }, + { + "epoch": 0.9574502885716288, + "grad_norm": 0.39084675908088684, + "learning_rate": 8.919181356272454e-08, + "loss": 0.2807, + "step": 25610 + }, + { + "epoch": 0.9576372175619786, + "grad_norm": 0.4941912293434143, + "learning_rate": 8.841093040427907e-08, + "loss": 0.2826, + "step": 25615 + }, + { + "epoch": 0.9578241465523284, + "grad_norm": 0.500826358795166, + "learning_rate": 8.76334654689015e-08, + "loss": 0.3496, + "step": 25620 + }, + { + "epoch": 0.9580110755426782, + "grad_norm": 0.6868979930877686, + "learning_rate": 8.685941902471628e-08, + "loss": 0.3204, + "step": 25625 + }, + { + "epoch": 0.9581980045330281, + "grad_norm": 0.30598723888397217, + "learning_rate": 8.608879133866988e-08, + "loss": 0.3358, + "step": 25630 + }, + { + "epoch": 0.9583849335233778, + "grad_norm": 0.3103514611721039, + "learning_rate": 8.53215826765308e-08, + "loss": 0.285, + "step": 25635 + }, + { + "epoch": 0.9585718625137276, + "grad_norm": 0.5536108016967773, + "learning_rate": 8.455779330288516e-08, + "loss": 0.3143, + "step": 25640 + }, + { + "epoch": 0.9587587915040774, + "grad_norm": 0.589036226272583, + "learning_rate": 8.379742348114339e-08, + "loss": 0.286, + "step": 25645 + }, + { + "epoch": 0.9589457204944272, + "grad_norm": 0.2255747765302658, + "learning_rate": 8.30404734735346e-08, + "loss": 0.26, + "step": 25650 + }, + { + "epoch": 0.959132649484777, + "grad_norm": 0.5313140749931335, + "learning_rate": 8.228694354111111e-08, + "loss": 0.2988, + "step": 25655 + }, + { + "epoch": 0.9593195784751267, + "grad_norm": 0.30977725982666016, + "learning_rate": 8.153683394374057e-08, + "loss": 0.2458, + "step": 25660 + }, + { + "epoch": 0.9595065074654765, + "grad_norm": 0.8567424416542053, + "learning_rate": 8.079014494011827e-08, + "loss": 0.2961, + "step": 25665 + }, + { + "epoch": 0.9596934364558264, + "grad_norm": 0.569592297077179, + "learning_rate": 8.004687678775158e-08, + "loss": 0.2552, + "step": 25670 + }, + { + "epoch": 0.9598803654461762, + "grad_norm": 0.2618090808391571, + "learning_rate": 7.930702974297544e-08, + "loss": 0.2402, + "step": 25675 + }, + { + "epoch": 0.9600672944365259, + "grad_norm": 0.5399622917175293, + "learning_rate": 7.85706040609413e-08, + "loss": 0.2917, + "step": 25680 + }, + { + "epoch": 0.9602542234268757, + "grad_norm": 0.4389611780643463, + "learning_rate": 7.783759999562046e-08, + "loss": 0.2601, + "step": 25685 + }, + { + "epoch": 0.9604411524172255, + "grad_norm": 0.3556120693683624, + "learning_rate": 7.710801779980514e-08, + "loss": 0.3564, + "step": 25690 + }, + { + "epoch": 0.9606280814075753, + "grad_norm": 0.3294890522956848, + "learning_rate": 7.638185772510854e-08, + "loss": 0.2801, + "step": 25695 + }, + { + "epoch": 0.960815010397925, + "grad_norm": 0.37796953320503235, + "learning_rate": 7.565912002196141e-08, + "loss": 0.2577, + "step": 25700 + }, + { + "epoch": 0.9610019393882748, + "grad_norm": 0.4705111086368561, + "learning_rate": 7.493980493961439e-08, + "loss": 0.2638, + "step": 25705 + }, + { + "epoch": 0.9611888683786247, + "grad_norm": 0.2994195520877838, + "learning_rate": 7.422391272614016e-08, + "loss": 0.2883, + "step": 25710 + }, + { + "epoch": 0.9613757973689745, + "grad_norm": 0.2776239812374115, + "learning_rate": 7.351144362842898e-08, + "loss": 0.2938, + "step": 25715 + }, + { + "epoch": 0.9615627263593243, + "grad_norm": 0.4236242175102234, + "learning_rate": 7.280239789219213e-08, + "loss": 0.2281, + "step": 25720 + }, + { + "epoch": 0.961749655349674, + "grad_norm": 0.7493228912353516, + "learning_rate": 7.209677576195617e-08, + "loss": 0.3591, + "step": 25725 + }, + { + "epoch": 0.9619365843400238, + "grad_norm": 0.598604679107666, + "learning_rate": 7.139457748107314e-08, + "loss": 0.2865, + "step": 25730 + }, + { + "epoch": 0.9621235133303736, + "grad_norm": 0.147347092628479, + "learning_rate": 7.069580329170933e-08, + "loss": 0.302, + "step": 25735 + }, + { + "epoch": 0.9623104423207234, + "grad_norm": 0.5269376635551453, + "learning_rate": 7.000045343485306e-08, + "loss": 0.3699, + "step": 25740 + }, + { + "epoch": 0.9624973713110732, + "grad_norm": 0.30106881260871887, + "learning_rate": 6.93085281503092e-08, + "loss": 0.2679, + "step": 25745 + }, + { + "epoch": 0.962684300301423, + "grad_norm": 0.6084346771240234, + "learning_rate": 6.862002767670351e-08, + "loss": 0.288, + "step": 25750 + }, + { + "epoch": 0.9628712292917728, + "grad_norm": 0.31380781531333923, + "learning_rate": 6.793495225148161e-08, + "loss": 0.2513, + "step": 25755 + }, + { + "epoch": 0.9630581582821226, + "grad_norm": 0.5058198571205139, + "learning_rate": 6.725330211090342e-08, + "loss": 0.2661, + "step": 25760 + }, + { + "epoch": 0.9632450872724724, + "grad_norm": 0.3875492811203003, + "learning_rate": 6.65750774900531e-08, + "loss": 0.2098, + "step": 25765 + }, + { + "epoch": 0.9634320162628222, + "grad_norm": 0.34512069821357727, + "learning_rate": 6.59002786228291e-08, + "loss": 0.2959, + "step": 25770 + }, + { + "epoch": 0.9636189452531719, + "grad_norm": 0.5447210669517517, + "learning_rate": 6.522890574195195e-08, + "loss": 0.2606, + "step": 25775 + }, + { + "epoch": 0.9638058742435217, + "grad_norm": 0.34275734424591064, + "learning_rate": 6.456095907895754e-08, + "loss": 0.3135, + "step": 25780 + }, + { + "epoch": 0.9639928032338715, + "grad_norm": 0.5490571856498718, + "learning_rate": 6.389643886420161e-08, + "loss": 0.2562, + "step": 25785 + }, + { + "epoch": 0.9641797322242214, + "grad_norm": 0.921448290348053, + "learning_rate": 6.323534532685971e-08, + "loss": 0.3327, + "step": 25790 + }, + { + "epoch": 0.9643666612145712, + "grad_norm": 0.8506829738616943, + "learning_rate": 6.25776786949217e-08, + "loss": 0.238, + "step": 25795 + }, + { + "epoch": 0.9645535902049209, + "grad_norm": 0.24562324583530426, + "learning_rate": 6.192343919519949e-08, + "loss": 0.2228, + "step": 25800 + }, + { + "epoch": 0.9647405191952707, + "grad_norm": 0.4626258909702301, + "learning_rate": 6.127262705332148e-08, + "loss": 0.2939, + "step": 25805 + }, + { + "epoch": 0.9649274481856205, + "grad_norm": 0.46563923358917236, + "learning_rate": 6.06252424937337e-08, + "loss": 0.2838, + "step": 25810 + }, + { + "epoch": 0.9651143771759703, + "grad_norm": 0.3600118160247803, + "learning_rate": 5.998128573969975e-08, + "loss": 0.2402, + "step": 25815 + }, + { + "epoch": 0.96530130616632, + "grad_norm": 0.25077444314956665, + "learning_rate": 5.9340757013304215e-08, + "loss": 0.261, + "step": 25820 + }, + { + "epoch": 0.9654882351566698, + "grad_norm": 0.4118346869945526, + "learning_rate": 5.8703656535444853e-08, + "loss": 0.3058, + "step": 25825 + }, + { + "epoch": 0.9656751641470197, + "grad_norm": 0.5594443678855896, + "learning_rate": 5.806998452584034e-08, + "loss": 0.2788, + "step": 25830 + }, + { + "epoch": 0.9658620931373695, + "grad_norm": 0.28521931171417236, + "learning_rate": 5.743974120302587e-08, + "loss": 0.2705, + "step": 25835 + }, + { + "epoch": 0.9660490221277193, + "grad_norm": 0.44842952489852905, + "learning_rate": 5.681292678435424e-08, + "loss": 0.2366, + "step": 25840 + }, + { + "epoch": 0.966235951118069, + "grad_norm": 0.6687315702438354, + "learning_rate": 5.618954148599587e-08, + "loss": 0.2382, + "step": 25845 + }, + { + "epoch": 0.9664228801084188, + "grad_norm": 0.3889986574649811, + "learning_rate": 5.556958552293878e-08, + "loss": 0.2423, + "step": 25850 + }, + { + "epoch": 0.9666098090987686, + "grad_norm": 0.41925016045570374, + "learning_rate": 5.4953059108987516e-08, + "loss": 0.2204, + "step": 25855 + }, + { + "epoch": 0.9667967380891184, + "grad_norm": 0.502301037311554, + "learning_rate": 5.4339962456763096e-08, + "loss": 0.2187, + "step": 25860 + }, + { + "epoch": 0.9669836670794681, + "grad_norm": 0.5296496748924255, + "learning_rate": 5.37302957777075e-08, + "loss": 0.2298, + "step": 25865 + }, + { + "epoch": 0.9671705960698179, + "grad_norm": 0.34486472606658936, + "learning_rate": 5.3124059282076975e-08, + "loss": 0.2691, + "step": 25870 + }, + { + "epoch": 0.9673575250601678, + "grad_norm": 0.41491997241973877, + "learning_rate": 5.2521253178944295e-08, + "loss": 0.2574, + "step": 25875 + }, + { + "epoch": 0.9675444540505176, + "grad_norm": 0.4363487660884857, + "learning_rate": 5.192187767619872e-08, + "loss": 0.2644, + "step": 25880 + }, + { + "epoch": 0.9677313830408674, + "grad_norm": 0.45662906765937805, + "learning_rate": 5.1325932980550444e-08, + "loss": 0.2408, + "step": 25885 + }, + { + "epoch": 0.9679183120312171, + "grad_norm": 0.5905013680458069, + "learning_rate": 5.073341929752174e-08, + "loss": 0.2843, + "step": 25890 + }, + { + "epoch": 0.9681052410215669, + "grad_norm": 0.43980005383491516, + "learning_rate": 5.0144336831453586e-08, + "loss": 0.2896, + "step": 25895 + }, + { + "epoch": 0.9682921700119167, + "grad_norm": 0.6901749968528748, + "learning_rate": 4.95586857855046e-08, + "loss": 0.3044, + "step": 25900 + }, + { + "epoch": 0.9684790990022665, + "grad_norm": 0.9550485014915466, + "learning_rate": 4.897646636164877e-08, + "loss": 0.2959, + "step": 25905 + }, + { + "epoch": 0.9686660279926163, + "grad_norm": 0.13232779502868652, + "learning_rate": 4.839767876067658e-08, + "loss": 0.271, + "step": 25910 + }, + { + "epoch": 0.9688529569829661, + "grad_norm": 0.3293937146663666, + "learning_rate": 4.782232318219615e-08, + "loss": 0.2995, + "step": 25915 + }, + { + "epoch": 0.9690398859733159, + "grad_norm": 0.4516461491584778, + "learning_rate": 4.7250399824629867e-08, + "loss": 0.3644, + "step": 25920 + }, + { + "epoch": 0.9692268149636657, + "grad_norm": 0.3880869746208191, + "learning_rate": 4.668190888521884e-08, + "loss": 0.3003, + "step": 25925 + }, + { + "epoch": 0.9694137439540155, + "grad_norm": 0.3466978967189789, + "learning_rate": 4.611685056001847e-08, + "loss": 0.2949, + "step": 25930 + }, + { + "epoch": 0.9696006729443652, + "grad_norm": 0.4129624664783478, + "learning_rate": 4.555522504390175e-08, + "loss": 0.2601, + "step": 25935 + }, + { + "epoch": 0.969787601934715, + "grad_norm": 0.30791693925857544, + "learning_rate": 4.499703253055709e-08, + "loss": 0.2434, + "step": 25940 + }, + { + "epoch": 0.9699745309250648, + "grad_norm": 0.7145932912826538, + "learning_rate": 4.4442273212488286e-08, + "loss": 0.2478, + "step": 25945 + }, + { + "epoch": 0.9701614599154146, + "grad_norm": 0.359036386013031, + "learning_rate": 4.3890947281016725e-08, + "loss": 0.2566, + "step": 25950 + }, + { + "epoch": 0.9703483889057645, + "grad_norm": 0.4460180401802063, + "learning_rate": 4.3343054926279216e-08, + "loss": 0.2508, + "step": 25955 + }, + { + "epoch": 0.9705353178961142, + "grad_norm": 0.38728684186935425, + "learning_rate": 4.279859633722794e-08, + "loss": 0.2721, + "step": 25960 + }, + { + "epoch": 0.970722246886464, + "grad_norm": 0.4190647602081299, + "learning_rate": 4.225757170163047e-08, + "loss": 0.3867, + "step": 25965 + }, + { + "epoch": 0.9709091758768138, + "grad_norm": 0.37819820642471313, + "learning_rate": 4.1719981206072015e-08, + "loss": 0.2931, + "step": 25970 + }, + { + "epoch": 0.9710961048671636, + "grad_norm": 0.620923638343811, + "learning_rate": 4.118582503594981e-08, + "loss": 0.2239, + "step": 25975 + }, + { + "epoch": 0.9712830338575134, + "grad_norm": 0.4498381018638611, + "learning_rate": 4.065510337548206e-08, + "loss": 0.338, + "step": 25980 + }, + { + "epoch": 0.9714699628478631, + "grad_norm": 0.5090503096580505, + "learning_rate": 4.0127816407696805e-08, + "loss": 0.2363, + "step": 25985 + }, + { + "epoch": 0.9716568918382129, + "grad_norm": 0.3429947793483734, + "learning_rate": 3.96039643144408e-08, + "loss": 0.263, + "step": 25990 + }, + { + "epoch": 0.9718438208285628, + "grad_norm": 0.392718642950058, + "learning_rate": 3.90835472763762e-08, + "loss": 0.2374, + "step": 25995 + }, + { + "epoch": 0.9720307498189126, + "grad_norm": 0.3899400532245636, + "learning_rate": 3.8566565472980545e-08, + "loss": 0.29, + "step": 26000 + }, + { + "epoch": 0.9722176788092624, + "grad_norm": 0.561836302280426, + "learning_rate": 3.805301908254455e-08, + "loss": 0.2512, + "step": 26005 + }, + { + "epoch": 0.9724046077996121, + "grad_norm": 0.6161816120147705, + "learning_rate": 3.754290828217655e-08, + "loss": 0.1973, + "step": 26010 + }, + { + "epoch": 0.9725915367899619, + "grad_norm": 1.0645028352737427, + "learning_rate": 3.7036233247799144e-08, + "loss": 0.4018, + "step": 26015 + }, + { + "epoch": 0.9727784657803117, + "grad_norm": 0.5088971257209778, + "learning_rate": 3.6532994154150347e-08, + "loss": 0.2651, + "step": 26020 + }, + { + "epoch": 0.9729653947706615, + "grad_norm": 0.3199444115161896, + "learning_rate": 3.603319117478244e-08, + "loss": 0.2447, + "step": 26025 + }, + { + "epoch": 0.9731523237610112, + "grad_norm": 0.3997444808483124, + "learning_rate": 3.5536824482061974e-08, + "loss": 0.2706, + "step": 26030 + }, + { + "epoch": 0.9733392527513611, + "grad_norm": 0.22671836614608765, + "learning_rate": 3.504389424717314e-08, + "loss": 0.3388, + "step": 26035 + }, + { + "epoch": 0.9735261817417109, + "grad_norm": 0.36706823110580444, + "learning_rate": 3.455440064011328e-08, + "loss": 0.2259, + "step": 26040 + }, + { + "epoch": 0.9737131107320607, + "grad_norm": 0.4109925329685211, + "learning_rate": 3.406834382969515e-08, + "loss": 0.1845, + "step": 26045 + }, + { + "epoch": 0.9739000397224105, + "grad_norm": 0.47747719287872314, + "learning_rate": 3.358572398354465e-08, + "loss": 0.2509, + "step": 26050 + }, + { + "epoch": 0.9740869687127602, + "grad_norm": 0.1783730685710907, + "learning_rate": 3.310654126810309e-08, + "loss": 0.273, + "step": 26055 + }, + { + "epoch": 0.97427389770311, + "grad_norm": 0.3882327079772949, + "learning_rate": 3.263079584862938e-08, + "loss": 0.2216, + "step": 26060 + }, + { + "epoch": 0.9744608266934598, + "grad_norm": 0.3611951768398285, + "learning_rate": 3.2158487889192294e-08, + "loss": 0.2725, + "step": 26065 + }, + { + "epoch": 0.9746477556838096, + "grad_norm": 0.5424373149871826, + "learning_rate": 3.168961755267819e-08, + "loss": 0.242, + "step": 26070 + }, + { + "epoch": 0.9748346846741595, + "grad_norm": 0.6144366264343262, + "learning_rate": 3.12241850007855e-08, + "loss": 0.2658, + "step": 26075 + }, + { + "epoch": 0.9750216136645092, + "grad_norm": 0.17419812083244324, + "learning_rate": 3.076219039403139e-08, + "loss": 0.2147, + "step": 26080 + }, + { + "epoch": 0.975208542654859, + "grad_norm": 0.23372456431388855, + "learning_rate": 3.0303633891742844e-08, + "loss": 0.2811, + "step": 26085 + }, + { + "epoch": 0.9753954716452088, + "grad_norm": 0.1994643360376358, + "learning_rate": 2.984851565206226e-08, + "loss": 0.3354, + "step": 26090 + }, + { + "epoch": 0.9755824006355586, + "grad_norm": 0.45933806896209717, + "learning_rate": 2.9396835831947412e-08, + "loss": 0.3562, + "step": 26095 + }, + { + "epoch": 0.9757693296259083, + "grad_norm": 0.4124797284603119, + "learning_rate": 2.8948594587170366e-08, + "loss": 0.2709, + "step": 26100 + }, + { + "epoch": 0.9759562586162581, + "grad_norm": 0.9764533638954163, + "learning_rate": 2.850379207231746e-08, + "loss": 0.4274, + "step": 26105 + }, + { + "epoch": 0.9761431876066079, + "grad_norm": 0.3561939001083374, + "learning_rate": 2.8062428440785994e-08, + "loss": 0.2174, + "step": 26110 + }, + { + "epoch": 0.9763301165969577, + "grad_norm": 0.35872069001197815, + "learning_rate": 2.7624503844790872e-08, + "loss": 0.2391, + "step": 26115 + }, + { + "epoch": 0.9765170455873076, + "grad_norm": 0.20135623216629028, + "learning_rate": 2.7190018435360178e-08, + "loss": 0.2471, + "step": 26120 + }, + { + "epoch": 0.9767039745776573, + "grad_norm": 0.866783082485199, + "learning_rate": 2.6758972362334046e-08, + "loss": 0.3307, + "step": 26125 + }, + { + "epoch": 0.9768909035680071, + "grad_norm": 0.34653496742248535, + "learning_rate": 2.633136577436912e-08, + "loss": 0.2525, + "step": 26130 + }, + { + "epoch": 0.9770778325583569, + "grad_norm": 0.3317651152610779, + "learning_rate": 2.590719881893522e-08, + "loss": 0.2919, + "step": 26135 + }, + { + "epoch": 0.9772647615487067, + "grad_norm": 0.22476986050605774, + "learning_rate": 2.5486471642314215e-08, + "loss": 0.3969, + "step": 26140 + }, + { + "epoch": 0.9774516905390565, + "grad_norm": 0.3792192339897156, + "learning_rate": 2.5069184389602264e-08, + "loss": 0.2839, + "step": 26145 + }, + { + "epoch": 0.9776386195294062, + "grad_norm": 0.5724767446517944, + "learning_rate": 2.4655337204712027e-08, + "loss": 0.2301, + "step": 26150 + }, + { + "epoch": 0.977825548519756, + "grad_norm": 0.30545639991760254, + "learning_rate": 2.4244930230364894e-08, + "loss": 0.2735, + "step": 26155 + }, + { + "epoch": 0.9780124775101059, + "grad_norm": 0.5367255210876465, + "learning_rate": 2.3837963608100977e-08, + "loss": 0.2457, + "step": 26160 + }, + { + "epoch": 0.9781994065004557, + "grad_norm": 0.431430459022522, + "learning_rate": 2.3434437478269124e-08, + "loss": 0.2527, + "step": 26165 + }, + { + "epoch": 0.9783863354908054, + "grad_norm": 0.5572644472122192, + "learning_rate": 2.3034351980035784e-08, + "loss": 0.2407, + "step": 26170 + }, + { + "epoch": 0.9785732644811552, + "grad_norm": 0.43069931864738464, + "learning_rate": 2.263770725137837e-08, + "loss": 0.247, + "step": 26175 + }, + { + "epoch": 0.978760193471505, + "grad_norm": 0.4567778706550598, + "learning_rate": 2.224450342908746e-08, + "loss": 0.2365, + "step": 26180 + }, + { + "epoch": 0.9789471224618548, + "grad_norm": 0.3975851535797119, + "learning_rate": 2.1854740648769023e-08, + "loss": 0.3065, + "step": 26185 + }, + { + "epoch": 0.9791340514522046, + "grad_norm": 0.3134264647960663, + "learning_rate": 2.1468419044839984e-08, + "loss": 0.296, + "step": 26190 + }, + { + "epoch": 0.9793209804425543, + "grad_norm": 0.27878788113594055, + "learning_rate": 2.1085538750531542e-08, + "loss": 0.3218, + "step": 26195 + }, + { + "epoch": 0.9795079094329042, + "grad_norm": 0.6269852519035339, + "learning_rate": 2.0706099897890296e-08, + "loss": 0.3444, + "step": 26200 + }, + { + "epoch": 0.979694838423254, + "grad_norm": 0.38990554213523865, + "learning_rate": 2.0330102617771575e-08, + "loss": 0.2459, + "step": 26205 + }, + { + "epoch": 0.9798817674136038, + "grad_norm": 0.3752122223377228, + "learning_rate": 1.995754703984609e-08, + "loss": 0.2583, + "step": 26210 + }, + { + "epoch": 0.9800686964039536, + "grad_norm": 0.6185774207115173, + "learning_rate": 1.9588433292598852e-08, + "loss": 0.2597, + "step": 26215 + }, + { + "epoch": 0.9802556253943033, + "grad_norm": 0.8538332581520081, + "learning_rate": 1.9222761503325803e-08, + "loss": 0.3271, + "step": 26220 + }, + { + "epoch": 0.9804425543846531, + "grad_norm": 0.39858320355415344, + "learning_rate": 1.886053179813718e-08, + "loss": 0.2141, + "step": 26225 + }, + { + "epoch": 0.9806294833750029, + "grad_norm": 0.4068199396133423, + "learning_rate": 1.850174430195528e-08, + "loss": 0.237, + "step": 26230 + }, + { + "epoch": 0.9808164123653527, + "grad_norm": 0.457328200340271, + "learning_rate": 1.814639913851557e-08, + "loss": 0.3351, + "step": 26235 + }, + { + "epoch": 0.9810033413557026, + "grad_norm": 0.2778027653694153, + "learning_rate": 1.779449643036668e-08, + "loss": 0.3103, + "step": 26240 + }, + { + "epoch": 0.9811902703460523, + "grad_norm": 0.6216102242469788, + "learning_rate": 1.7446036298869316e-08, + "loss": 0.2385, + "step": 26245 + }, + { + "epoch": 0.9813771993364021, + "grad_norm": 0.33470118045806885, + "learning_rate": 1.710101886419735e-08, + "loss": 0.3129, + "step": 26250 + }, + { + "epoch": 0.9815641283267519, + "grad_norm": 0.37651658058166504, + "learning_rate": 1.6759444245338928e-08, + "loss": 0.2858, + "step": 26255 + }, + { + "epoch": 0.9817510573171017, + "grad_norm": 0.3364080488681793, + "learning_rate": 1.642131256009094e-08, + "loss": 0.3334, + "step": 26260 + }, + { + "epoch": 0.9819379863074514, + "grad_norm": 0.34844186902046204, + "learning_rate": 1.608662392506677e-08, + "loss": 0.3211, + "step": 26265 + }, + { + "epoch": 0.9821249152978012, + "grad_norm": 0.31410226225852966, + "learning_rate": 1.575537845569075e-08, + "loss": 0.2541, + "step": 26270 + }, + { + "epoch": 0.982311844288151, + "grad_norm": 0.6750185489654541, + "learning_rate": 1.5427576266200394e-08, + "loss": 0.2703, + "step": 26275 + }, + { + "epoch": 0.9824987732785009, + "grad_norm": 0.7728235721588135, + "learning_rate": 1.510321746964416e-08, + "loss": 0.3094, + "step": 26280 + }, + { + "epoch": 0.9826857022688507, + "grad_norm": 0.7537628412246704, + "learning_rate": 1.4782302177884789e-08, + "loss": 0.2852, + "step": 26285 + }, + { + "epoch": 0.9828726312592004, + "grad_norm": 0.5353776812553406, + "learning_rate": 1.4464830501597082e-08, + "loss": 0.3261, + "step": 26290 + }, + { + "epoch": 0.9830595602495502, + "grad_norm": 0.6199917793273926, + "learning_rate": 1.4150802550267905e-08, + "loss": 0.2453, + "step": 26295 + }, + { + "epoch": 0.9832464892399, + "grad_norm": 0.15641337633132935, + "learning_rate": 1.3840218432195074e-08, + "loss": 0.2621, + "step": 26300 + }, + { + "epoch": 0.9834334182302498, + "grad_norm": 0.38754504919052124, + "learning_rate": 1.3533078254492905e-08, + "loss": 0.2353, + "step": 26305 + }, + { + "epoch": 0.9836203472205995, + "grad_norm": 0.6075884699821472, + "learning_rate": 1.3229382123082223e-08, + "loss": 0.3138, + "step": 26310 + }, + { + "epoch": 0.9838072762109493, + "grad_norm": 0.46125859022140503, + "learning_rate": 1.2929130142700363e-08, + "loss": 0.2389, + "step": 26315 + }, + { + "epoch": 0.9839942052012992, + "grad_norm": 0.44463780522346497, + "learning_rate": 1.2632322416896715e-08, + "loss": 0.2877, + "step": 26320 + }, + { + "epoch": 0.984181134191649, + "grad_norm": 0.4823720455169678, + "learning_rate": 1.2338959048029398e-08, + "loss": 0.2585, + "step": 26325 + }, + { + "epoch": 0.9843680631819988, + "grad_norm": 0.3346082866191864, + "learning_rate": 1.2049040137273038e-08, + "loss": 0.2628, + "step": 26330 + }, + { + "epoch": 0.9845549921723485, + "grad_norm": 0.7445553541183472, + "learning_rate": 1.1762565784612101e-08, + "loss": 0.2554, + "step": 26335 + }, + { + "epoch": 0.9847419211626983, + "grad_norm": 0.5069872736930847, + "learning_rate": 1.1479536088843112e-08, + "loss": 0.3881, + "step": 26340 + }, + { + "epoch": 0.9849288501530481, + "grad_norm": 0.4112212061882019, + "learning_rate": 1.1199951147574661e-08, + "loss": 0.277, + "step": 26345 + }, + { + "epoch": 0.9851157791433979, + "grad_norm": 0.4882351756095886, + "learning_rate": 1.0923811057227396e-08, + "loss": 0.2854, + "step": 26350 + }, + { + "epoch": 0.9853027081337477, + "grad_norm": 0.37586453557014465, + "learning_rate": 1.0651115913035137e-08, + "loss": 0.2478, + "step": 26355 + }, + { + "epoch": 0.9854896371240974, + "grad_norm": 1.524063229560852, + "learning_rate": 1.0381865809040437e-08, + "loss": 0.3372, + "step": 26360 + }, + { + "epoch": 0.9856765661144473, + "grad_norm": 0.4818149209022522, + "learning_rate": 1.0116060838103458e-08, + "loss": 0.2763, + "step": 26365 + }, + { + "epoch": 0.9858634951047971, + "grad_norm": 0.2843870222568512, + "learning_rate": 9.853701091888656e-09, + "loss": 0.2549, + "step": 26370 + }, + { + "epoch": 0.9860504240951469, + "grad_norm": 0.44535329937934875, + "learning_rate": 9.594786660880317e-09, + "loss": 0.3165, + "step": 26375 + }, + { + "epoch": 0.9862373530854966, + "grad_norm": 0.2467801719903946, + "learning_rate": 9.339317634367017e-09, + "loss": 0.225, + "step": 26380 + }, + { + "epoch": 0.9864242820758464, + "grad_norm": 0.4127419590950012, + "learning_rate": 9.087294100456056e-09, + "loss": 0.2825, + "step": 26385 + }, + { + "epoch": 0.9866112110661962, + "grad_norm": 0.4206787645816803, + "learning_rate": 8.838716146060134e-09, + "loss": 0.2727, + "step": 26390 + }, + { + "epoch": 0.986798140056546, + "grad_norm": 0.5794749855995178, + "learning_rate": 8.59358385690956e-09, + "loss": 0.3581, + "step": 26395 + }, + { + "epoch": 0.9869850690468958, + "grad_norm": 0.37697890400886536, + "learning_rate": 8.351897317541157e-09, + "loss": 0.271, + "step": 26400 + }, + { + "epoch": 0.9871719980372456, + "grad_norm": 0.5611318945884705, + "learning_rate": 8.11365661130603e-09, + "loss": 0.3294, + "step": 26405 + }, + { + "epoch": 0.9873589270275954, + "grad_norm": 0.2812879979610443, + "learning_rate": 7.878861820367345e-09, + "loss": 0.3604, + "step": 26410 + }, + { + "epoch": 0.9875458560179452, + "grad_norm": 0.3695289194583893, + "learning_rate": 7.647513025698105e-09, + "loss": 0.2133, + "step": 26415 + }, + { + "epoch": 0.987732785008295, + "grad_norm": 0.46063148975372314, + "learning_rate": 7.4196103070856005e-09, + "loss": 0.2794, + "step": 26420 + }, + { + "epoch": 0.9879197139986448, + "grad_norm": 0.3603619337081909, + "learning_rate": 7.195153743124739e-09, + "loss": 0.2773, + "step": 26425 + }, + { + "epoch": 0.9881066429889945, + "grad_norm": 0.515447735786438, + "learning_rate": 6.97414341122582e-09, + "loss": 0.3515, + "step": 26430 + }, + { + "epoch": 0.9882935719793443, + "grad_norm": 0.4539223611354828, + "learning_rate": 6.756579387607875e-09, + "loss": 0.2721, + "step": 26435 + }, + { + "epoch": 0.9884805009696941, + "grad_norm": 0.927209734916687, + "learning_rate": 6.542461747304218e-09, + "loss": 0.2898, + "step": 26440 + }, + { + "epoch": 0.988667429960044, + "grad_norm": 0.33204635977745056, + "learning_rate": 6.331790564155782e-09, + "loss": 0.3234, + "step": 26445 + }, + { + "epoch": 0.9888543589503938, + "grad_norm": 0.4017919898033142, + "learning_rate": 6.124565910818891e-09, + "loss": 0.2895, + "step": 26450 + }, + { + "epoch": 0.9890412879407435, + "grad_norm": 0.3101350665092468, + "learning_rate": 5.9207878587574926e-09, + "loss": 0.2818, + "step": 26455 + }, + { + "epoch": 0.9892282169310933, + "grad_norm": 0.38816362619400024, + "learning_rate": 5.7204564782498136e-09, + "loss": 0.2629, + "step": 26460 + }, + { + "epoch": 0.9894151459214431, + "grad_norm": 0.49870437383651733, + "learning_rate": 5.523571838385034e-09, + "loss": 0.3173, + "step": 26465 + }, + { + "epoch": 0.9896020749117929, + "grad_norm": 0.5789703726768494, + "learning_rate": 5.330134007062171e-09, + "loss": 0.2963, + "step": 26470 + }, + { + "epoch": 0.9897890039021426, + "grad_norm": 0.3744274079799652, + "learning_rate": 5.1401430509923075e-09, + "loss": 0.3257, + "step": 26475 + }, + { + "epoch": 0.9899759328924924, + "grad_norm": 0.4980723559856415, + "learning_rate": 4.953599035697476e-09, + "loss": 0.2974, + "step": 26480 + }, + { + "epoch": 0.9901628618828423, + "grad_norm": 0.71192467212677, + "learning_rate": 4.77050202551288e-09, + "loss": 0.265, + "step": 26485 + }, + { + "epoch": 0.9903497908731921, + "grad_norm": 0.3702625036239624, + "learning_rate": 4.590852083582453e-09, + "loss": 0.2456, + "step": 26490 + }, + { + "epoch": 0.9905367198635419, + "grad_norm": 0.459648996591568, + "learning_rate": 4.414649271863303e-09, + "loss": 0.2939, + "step": 26495 + }, + { + "epoch": 0.9907236488538916, + "grad_norm": 0.2367364466190338, + "learning_rate": 4.241893651120155e-09, + "loss": 0.2624, + "step": 26500 + }, + { + "epoch": 0.9909105778442414, + "grad_norm": 0.43819648027420044, + "learning_rate": 4.0725852809342384e-09, + "loss": 0.345, + "step": 26505 + }, + { + "epoch": 0.9910975068345912, + "grad_norm": 0.3360496461391449, + "learning_rate": 3.906724219694402e-09, + "loss": 0.2522, + "step": 26510 + }, + { + "epoch": 0.991284435824941, + "grad_norm": 0.2503529191017151, + "learning_rate": 3.744310524600447e-09, + "loss": 0.2847, + "step": 26515 + }, + { + "epoch": 0.9914713648152907, + "grad_norm": 0.49219122529029846, + "learning_rate": 3.585344251665346e-09, + "loss": 0.2354, + "step": 26520 + }, + { + "epoch": 0.9916582938056406, + "grad_norm": 0.42718371748924255, + "learning_rate": 3.4298254557108e-09, + "loss": 0.3146, + "step": 26525 + }, + { + "epoch": 0.9918452227959904, + "grad_norm": 0.6982577443122864, + "learning_rate": 3.2777541903716845e-09, + "loss": 0.2527, + "step": 26530 + }, + { + "epoch": 0.9920321517863402, + "grad_norm": 0.5282479524612427, + "learning_rate": 3.129130508092715e-09, + "loss": 0.2256, + "step": 26535 + }, + { + "epoch": 0.99221908077669, + "grad_norm": 0.24284540116786957, + "learning_rate": 2.983954460130667e-09, + "loss": 0.3069, + "step": 26540 + }, + { + "epoch": 0.9924060097670397, + "grad_norm": 0.4188280403614044, + "learning_rate": 2.8422260965510485e-09, + "loss": 0.303, + "step": 26545 + }, + { + "epoch": 0.9925929387573895, + "grad_norm": 0.8391254544258118, + "learning_rate": 2.7039454662336484e-09, + "loss": 0.2957, + "step": 26550 + }, + { + "epoch": 0.9927798677477393, + "grad_norm": 0.3748670518398285, + "learning_rate": 2.5691126168669866e-09, + "loss": 0.2781, + "step": 26555 + }, + { + "epoch": 0.9929667967380891, + "grad_norm": 0.44531556963920593, + "learning_rate": 2.437727594949424e-09, + "loss": 0.3105, + "step": 26560 + }, + { + "epoch": 0.993153725728439, + "grad_norm": 0.43134966492652893, + "learning_rate": 2.309790445794713e-09, + "loss": 0.2255, + "step": 26565 + }, + { + "epoch": 0.9933406547187887, + "grad_norm": 0.471179723739624, + "learning_rate": 2.1853012135220065e-09, + "loss": 0.302, + "step": 26570 + }, + { + "epoch": 0.9935275837091385, + "grad_norm": 0.3568282425403595, + "learning_rate": 2.0642599410658493e-09, + "loss": 0.2448, + "step": 26575 + }, + { + "epoch": 0.9937145126994883, + "grad_norm": 0.5308835506439209, + "learning_rate": 1.946666670169517e-09, + "loss": 0.2497, + "step": 26580 + }, + { + "epoch": 0.9939014416898381, + "grad_norm": 0.5474340319633484, + "learning_rate": 1.832521441387236e-09, + "loss": 0.3189, + "step": 26585 + }, + { + "epoch": 0.9940883706801878, + "grad_norm": 0.31903812289237976, + "learning_rate": 1.7218242940841845e-09, + "loss": 0.2821, + "step": 26590 + }, + { + "epoch": 0.9942752996705376, + "grad_norm": 0.33196452260017395, + "learning_rate": 1.6145752664376014e-09, + "loss": 0.3051, + "step": 26595 + }, + { + "epoch": 0.9944622286608874, + "grad_norm": 0.7084635496139526, + "learning_rate": 1.5107743954334565e-09, + "loss": 0.1976, + "step": 26600 + }, + { + "epoch": 0.9946491576512372, + "grad_norm": 0.3358515799045563, + "learning_rate": 1.4104217168708911e-09, + "loss": 0.2774, + "step": 26605 + }, + { + "epoch": 0.9948360866415871, + "grad_norm": 0.46138903498649597, + "learning_rate": 1.3135172653577777e-09, + "loss": 0.2587, + "step": 26610 + }, + { + "epoch": 0.9950230156319368, + "grad_norm": 0.47722405195236206, + "learning_rate": 1.2200610743129394e-09, + "loss": 0.2649, + "step": 26615 + }, + { + "epoch": 0.9952099446222866, + "grad_norm": 0.30284929275512695, + "learning_rate": 1.1300531759694811e-09, + "loss": 0.302, + "step": 26620 + }, + { + "epoch": 0.9953968736126364, + "grad_norm": 0.768482506275177, + "learning_rate": 1.043493601365908e-09, + "loss": 0.2393, + "step": 26625 + }, + { + "epoch": 0.9955838026029862, + "grad_norm": 0.6742646098136902, + "learning_rate": 9.60382380355007e-10, + "loss": 0.2402, + "step": 26630 + }, + { + "epoch": 0.995770731593336, + "grad_norm": 0.5979500412940979, + "learning_rate": 8.807195415994063e-10, + "loss": 0.3245, + "step": 26635 + }, + { + "epoch": 0.9959576605836857, + "grad_norm": 1.0489237308502197, + "learning_rate": 8.045051125726844e-10, + "loss": 0.313, + "step": 26640 + }, + { + "epoch": 0.9961445895740355, + "grad_norm": 0.5769110918045044, + "learning_rate": 7.317391195593714e-10, + "loss": 0.3009, + "step": 26645 + }, + { + "epoch": 0.9963315185643854, + "grad_norm": 0.7712821364402771, + "learning_rate": 6.624215876538387e-10, + "loss": 0.262, + "step": 26650 + }, + { + "epoch": 0.9965184475547352, + "grad_norm": 0.4885789752006531, + "learning_rate": 5.965525407614081e-10, + "loss": 0.2446, + "step": 26655 + }, + { + "epoch": 0.996705376545085, + "grad_norm": 0.2925719916820526, + "learning_rate": 5.341320015994633e-10, + "loss": 0.2598, + "step": 26660 + }, + { + "epoch": 0.9968923055354347, + "grad_norm": 0.22376123070716858, + "learning_rate": 4.751599916941185e-10, + "loss": 0.2542, + "step": 26665 + }, + { + "epoch": 0.9970792345257845, + "grad_norm": 0.45399028062820435, + "learning_rate": 4.1963653138354933e-10, + "loss": 0.257, + "step": 26670 + }, + { + "epoch": 0.9972661635161343, + "grad_norm": 0.3218282461166382, + "learning_rate": 3.675616398157722e-10, + "loss": 0.308, + "step": 26675 + }, + { + "epoch": 0.9974530925064841, + "grad_norm": 0.3195207118988037, + "learning_rate": 3.1893533495086505e-10, + "loss": 0.2604, + "step": 26680 + }, + { + "epoch": 0.9976400214968338, + "grad_norm": 0.31073522567749023, + "learning_rate": 2.737576335576364e-10, + "loss": 0.3143, + "step": 26685 + }, + { + "epoch": 0.9978269504871837, + "grad_norm": 0.3902645409107208, + "learning_rate": 2.320285512169562e-10, + "loss": 0.2869, + "step": 26690 + }, + { + "epoch": 0.9980138794775335, + "grad_norm": 0.3908766806125641, + "learning_rate": 1.9374810231953533e-10, + "loss": 0.2434, + "step": 26695 + }, + { + "epoch": 0.9982008084678833, + "grad_norm": 0.5233207941055298, + "learning_rate": 1.5891630006814595e-10, + "loss": 0.29, + "step": 26700 + }, + { + "epoch": 0.9983877374582331, + "grad_norm": 0.7636908292770386, + "learning_rate": 1.275331564754012e-10, + "loss": 0.3399, + "step": 26705 + }, + { + "epoch": 0.9985746664485828, + "grad_norm": 0.21469980478286743, + "learning_rate": 9.959868236375514e-11, + "loss": 0.2314, + "step": 26710 + }, + { + "epoch": 0.9987615954389326, + "grad_norm": 0.3713749349117279, + "learning_rate": 7.511288736661293e-11, + "loss": 0.3933, + "step": 26715 + }, + { + "epoch": 0.9989485244292824, + "grad_norm": 0.4366331994533539, + "learning_rate": 5.407577992944113e-11, + "loss": 0.2541, + "step": 26720 + }, + { + "epoch": 0.9991354534196322, + "grad_norm": 0.4484507739543915, + "learning_rate": 3.648736730643698e-11, + "loss": 0.2271, + "step": 26725 + }, + { + "epoch": 0.9993223824099821, + "grad_norm": 0.3740657567977905, + "learning_rate": 2.2347655563859096e-11, + "loss": 0.3167, + "step": 26730 + }, + { + "epoch": 0.9995093114003318, + "grad_norm": 0.5398233532905579, + "learning_rate": 1.165664957780699e-11, + "loss": 0.2958, + "step": 26735 + }, + { + "epoch": 0.9996962403906816, + "grad_norm": 0.46718209981918335, + "learning_rate": 4.414353035331331e-12, + "loss": 0.2451, + "step": 26740 + }, + { + "epoch": 0.9998831693810314, + "grad_norm": 0.2932209074497223, + "learning_rate": 6.207684344339271e-13, + "loss": 0.2126, + "step": 26745 + }, + { + "epoch": 0.9999953267752413, + "step": 26748, + "total_flos": 5.335451483137671e+18, + "train_loss": 0.2982786745278668, + "train_runtime": 62383.2691, + "train_samples_per_second": 3.43, + "train_steps_per_second": 0.429 + } + ], + "logging_steps": 5, + "max_steps": 26748, + "num_input_tokens_seen": 0, + "num_train_epochs": 1, + "save_steps": 50000, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": false, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 5.335451483137671e+18, + "train_batch_size": 1, + "trial_name": null, + "trial_params": null +}