{ "best_metric": null, "best_model_checkpoint": null, "epoch": 1.0, "eval_steps": 500, "global_step": 5687, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0008791981712678037, "grad_norm": 10.053899765014648, "learning_rate": 5.847953216374269e-07, "loss": 0.1487, "step": 5 }, { "epoch": 0.0017583963425356075, "grad_norm": 1371.18408203125, "learning_rate": 1.1695906432748538e-06, "loss": 0.154, "step": 10 }, { "epoch": 0.0026375945138034113, "grad_norm": 4331.09814453125, "learning_rate": 1.7543859649122807e-06, "loss": 0.1485, "step": 15 }, { "epoch": 0.003516792685071215, "grad_norm": 50.946231842041016, "learning_rate": 2.3391812865497075e-06, "loss": 0.082, "step": 20 }, { "epoch": 0.004395990856339019, "grad_norm": 130.27598571777344, "learning_rate": 2.9239766081871347e-06, "loss": 0.1029, "step": 25 }, { "epoch": 0.005275189027606823, "grad_norm": 312.9079895019531, "learning_rate": 3.5087719298245615e-06, "loss": 0.0983, "step": 30 }, { "epoch": 0.006154387198874627, "grad_norm": 828.6588134765625, "learning_rate": 4.093567251461989e-06, "loss": 0.116, "step": 35 }, { "epoch": 0.00703358537014243, "grad_norm": 95.5357666015625, "learning_rate": 4.678362573099415e-06, "loss": 0.1025, "step": 40 }, { "epoch": 0.007912783541410234, "grad_norm": 46.62782669067383, "learning_rate": 5.263157894736842e-06, "loss": 0.0929, "step": 45 }, { "epoch": 0.008791981712678037, "grad_norm": 24.908008575439453, "learning_rate": 5.847953216374269e-06, "loss": 0.0911, "step": 50 }, { "epoch": 0.009671179883945842, "grad_norm": 42.72391891479492, "learning_rate": 6.432748538011696e-06, "loss": 0.1051, "step": 55 }, { "epoch": 0.010550378055213645, "grad_norm": 1.5553064346313477, "learning_rate": 7.017543859649123e-06, "loss": 0.0939, "step": 60 }, { "epoch": 0.011429576226481448, "grad_norm": 840.0651245117188, "learning_rate": 7.60233918128655e-06, "loss": 0.0739, "step": 65 }, { "epoch": 0.012308774397749253, "grad_norm": 4780.78466796875, "learning_rate": 8.187134502923977e-06, "loss": 0.1114, "step": 70 }, { "epoch": 0.013187972569017057, "grad_norm": 36.51594161987305, "learning_rate": 8.771929824561405e-06, "loss": 0.1356, "step": 75 }, { "epoch": 0.01406717074028486, "grad_norm": 6.5139970779418945, "learning_rate": 9.35672514619883e-06, "loss": 0.0977, "step": 80 }, { "epoch": 0.014946368911552665, "grad_norm": 63.903018951416016, "learning_rate": 9.941520467836257e-06, "loss": 0.1117, "step": 85 }, { "epoch": 0.015825567082820468, "grad_norm": 18.261695861816406, "learning_rate": 1.0526315789473684e-05, "loss": 0.0912, "step": 90 }, { "epoch": 0.01670476525408827, "grad_norm": 246.80682373046875, "learning_rate": 1.1111111111111113e-05, "loss": 0.1014, "step": 95 }, { "epoch": 0.017583963425356074, "grad_norm": 432.3685607910156, "learning_rate": 1.1695906432748539e-05, "loss": 0.0915, "step": 100 }, { "epoch": 0.018463161596623878, "grad_norm": 1106.1822509765625, "learning_rate": 1.2280701754385966e-05, "loss": 0.0993, "step": 105 }, { "epoch": 0.019342359767891684, "grad_norm": 39.593421936035156, "learning_rate": 1.2865497076023392e-05, "loss": 0.157, "step": 110 }, { "epoch": 0.020221557939159487, "grad_norm": 3.5536696910858154, "learning_rate": 1.345029239766082e-05, "loss": 0.1274, "step": 115 }, { "epoch": 0.02110075611042729, "grad_norm": 1.2601370811462402, "learning_rate": 1.4035087719298246e-05, "loss": 0.0806, "step": 120 }, { "epoch": 0.021979954281695094, "grad_norm": 1.1521034240722656, "learning_rate": 1.4619883040935675e-05, "loss": 0.0785, "step": 125 }, { "epoch": 0.022859152452962897, "grad_norm": 0.7003596425056458, "learning_rate": 1.52046783625731e-05, "loss": 0.0819, "step": 130 }, { "epoch": 0.0237383506242307, "grad_norm": 0.29950231313705444, "learning_rate": 1.578947368421053e-05, "loss": 0.0825, "step": 135 }, { "epoch": 0.024617548795498507, "grad_norm": 1.1281251907348633, "learning_rate": 1.6374269005847955e-05, "loss": 0.105, "step": 140 }, { "epoch": 0.02549674696676631, "grad_norm": 0.8757471442222595, "learning_rate": 1.695906432748538e-05, "loss": 0.1029, "step": 145 }, { "epoch": 0.026375945138034113, "grad_norm": 1.9525190591812134, "learning_rate": 1.754385964912281e-05, "loss": 0.0708, "step": 150 }, { "epoch": 0.027255143309301916, "grad_norm": 1.9676127433776855, "learning_rate": 1.8128654970760235e-05, "loss": 0.0863, "step": 155 }, { "epoch": 0.02813434148056972, "grad_norm": 0.7372169494628906, "learning_rate": 1.871345029239766e-05, "loss": 0.0668, "step": 160 }, { "epoch": 0.029013539651837523, "grad_norm": 0.6957089900970459, "learning_rate": 1.929824561403509e-05, "loss": 0.0688, "step": 165 }, { "epoch": 0.02989273782310533, "grad_norm": 0.7962855696678162, "learning_rate": 1.9883040935672515e-05, "loss": 0.0619, "step": 170 }, { "epoch": 0.030771935994373133, "grad_norm": 0.6682185530662537, "learning_rate": 1.999997404978087e-05, "loss": 0.0698, "step": 175 }, { "epoch": 0.031651134165640936, "grad_norm": 0.7355603575706482, "learning_rate": 1.999986862724647e-05, "loss": 0.078, "step": 180 }, { "epoch": 0.03253033233690874, "grad_norm": 0.4755054712295532, "learning_rate": 1.9999682111362368e-05, "loss": 0.0732, "step": 185 }, { "epoch": 0.03340953050817654, "grad_norm": 0.4909241497516632, "learning_rate": 1.9999414503641103e-05, "loss": 0.0631, "step": 190 }, { "epoch": 0.034288728679444345, "grad_norm": 1.0899882316589355, "learning_rate": 1.9999065806252828e-05, "loss": 0.0692, "step": 195 }, { "epoch": 0.03516792685071215, "grad_norm": 0.263811320066452, "learning_rate": 1.999863602202528e-05, "loss": 0.0696, "step": 200 }, { "epoch": 0.03604712502197995, "grad_norm": 1.9172215461730957, "learning_rate": 1.999812515444377e-05, "loss": 0.079, "step": 205 }, { "epoch": 0.036926323193247755, "grad_norm": 1.79094660282135, "learning_rate": 1.9997533207651147e-05, "loss": 0.0627, "step": 210 }, { "epoch": 0.037805521364515565, "grad_norm": 0.3693501651287079, "learning_rate": 1.999686018644777e-05, "loss": 0.0778, "step": 215 }, { "epoch": 0.03868471953578337, "grad_norm": 0.18116237223148346, "learning_rate": 1.999610609629147e-05, "loss": 0.0643, "step": 220 }, { "epoch": 0.03956391770705117, "grad_norm": 0.5909445881843567, "learning_rate": 1.999527094329749e-05, "loss": 0.0689, "step": 225 }, { "epoch": 0.040443115878318975, "grad_norm": 0.4016267955303192, "learning_rate": 1.9994354734238456e-05, "loss": 0.0589, "step": 230 }, { "epoch": 0.04132231404958678, "grad_norm": 0.8470014929771423, "learning_rate": 1.9993357476544314e-05, "loss": 0.0714, "step": 235 }, { "epoch": 0.04220151222085458, "grad_norm": 1.2889784574508667, "learning_rate": 1.9992279178302266e-05, "loss": 0.0759, "step": 240 }, { "epoch": 0.043080710392122384, "grad_norm": 1.695059061050415, "learning_rate": 1.9991119848256708e-05, "loss": 0.0582, "step": 245 }, { "epoch": 0.04395990856339019, "grad_norm": 0.7226565480232239, "learning_rate": 1.998987949580916e-05, "loss": 0.0802, "step": 250 }, { "epoch": 0.04483910673465799, "grad_norm": 0.513992965221405, "learning_rate": 1.9988558131018188e-05, "loss": 0.0747, "step": 255 }, { "epoch": 0.045718304905925794, "grad_norm": 0.8010172247886658, "learning_rate": 1.998715576459932e-05, "loss": 0.0779, "step": 260 }, { "epoch": 0.0465975030771936, "grad_norm": 0.6723889112472534, "learning_rate": 1.9985672407924966e-05, "loss": 0.0778, "step": 265 }, { "epoch": 0.0474767012484614, "grad_norm": 0.5232120752334595, "learning_rate": 1.998410807302432e-05, "loss": 0.0606, "step": 270 }, { "epoch": 0.048355899419729204, "grad_norm": 1.1310707330703735, "learning_rate": 1.9982462772583267e-05, "loss": 0.0786, "step": 275 }, { "epoch": 0.049235097590997014, "grad_norm": 0.42932379245758057, "learning_rate": 1.998073651994427e-05, "loss": 0.0674, "step": 280 }, { "epoch": 0.05011429576226482, "grad_norm": 0.30086904764175415, "learning_rate": 1.997892932910628e-05, "loss": 0.0662, "step": 285 }, { "epoch": 0.05099349393353262, "grad_norm": 0.3778522312641144, "learning_rate": 1.9977041214724594e-05, "loss": 0.077, "step": 290 }, { "epoch": 0.05187269210480042, "grad_norm": 1.5126148462295532, "learning_rate": 1.997507219211078e-05, "loss": 0.073, "step": 295 }, { "epoch": 0.052751890276068227, "grad_norm": 0.4894915223121643, "learning_rate": 1.99730222772325e-05, "loss": 0.0705, "step": 300 }, { "epoch": 0.05363108844733603, "grad_norm": 0.6623161435127258, "learning_rate": 1.9970891486713423e-05, "loss": 0.0583, "step": 305 }, { "epoch": 0.05451028661860383, "grad_norm": 0.6160693764686584, "learning_rate": 1.9968679837833075e-05, "loss": 0.061, "step": 310 }, { "epoch": 0.055389484789871636, "grad_norm": 1.3512332439422607, "learning_rate": 1.9966387348526682e-05, "loss": 0.0609, "step": 315 }, { "epoch": 0.05626868296113944, "grad_norm": 0.7443258166313171, "learning_rate": 1.9964014037385065e-05, "loss": 0.0605, "step": 320 }, { "epoch": 0.05714788113240724, "grad_norm": 0.47612714767456055, "learning_rate": 1.996155992365444e-05, "loss": 0.0631, "step": 325 }, { "epoch": 0.058027079303675046, "grad_norm": 1.439274787902832, "learning_rate": 1.9959025027236305e-05, "loss": 0.0687, "step": 330 }, { "epoch": 0.05890627747494285, "grad_norm": 0.7117618322372437, "learning_rate": 1.9956409368687257e-05, "loss": 0.0714, "step": 335 }, { "epoch": 0.05978547564621066, "grad_norm": 0.6142310500144958, "learning_rate": 1.995371296921882e-05, "loss": 0.0672, "step": 340 }, { "epoch": 0.06066467381747846, "grad_norm": 1.082131028175354, "learning_rate": 1.9950935850697288e-05, "loss": 0.0879, "step": 345 }, { "epoch": 0.061543871988746265, "grad_norm": 0.35354727506637573, "learning_rate": 1.9948078035643546e-05, "loss": 0.0799, "step": 350 }, { "epoch": 0.06242307016001407, "grad_norm": 0.2982726991176605, "learning_rate": 1.9945139547232872e-05, "loss": 0.0764, "step": 355 }, { "epoch": 0.06330226833128187, "grad_norm": 1.1916660070419312, "learning_rate": 1.9942120409294768e-05, "loss": 0.0742, "step": 360 }, { "epoch": 0.06418146650254968, "grad_norm": 1.0965343713760376, "learning_rate": 1.9939020646312764e-05, "loss": 0.0634, "step": 365 }, { "epoch": 0.06506066467381748, "grad_norm": 0.46244287490844727, "learning_rate": 1.9935840283424196e-05, "loss": 0.0711, "step": 370 }, { "epoch": 0.06593986284508528, "grad_norm": 0.1318541318178177, "learning_rate": 1.993257934642004e-05, "loss": 0.0591, "step": 375 }, { "epoch": 0.06681906101635308, "grad_norm": 0.5300299525260925, "learning_rate": 1.9929237861744663e-05, "loss": 0.0712, "step": 380 }, { "epoch": 0.06769825918762089, "grad_norm": 1.014757752418518, "learning_rate": 1.9925815856495646e-05, "loss": 0.0612, "step": 385 }, { "epoch": 0.06857745735888869, "grad_norm": 0.24749091267585754, "learning_rate": 1.992231335842354e-05, "loss": 0.077, "step": 390 }, { "epoch": 0.0694566555301565, "grad_norm": 0.5739014148712158, "learning_rate": 1.9918730395931648e-05, "loss": 0.0618, "step": 395 }, { "epoch": 0.0703358537014243, "grad_norm": 0.23715724050998688, "learning_rate": 1.9915066998075797e-05, "loss": 0.0563, "step": 400 }, { "epoch": 0.0712150518726921, "grad_norm": 0.5633426904678345, "learning_rate": 1.9911323194564095e-05, "loss": 0.054, "step": 405 }, { "epoch": 0.0720942500439599, "grad_norm": 0.4382643401622772, "learning_rate": 1.9907499015756696e-05, "loss": 0.0561, "step": 410 }, { "epoch": 0.07297344821522771, "grad_norm": 0.4218790829181671, "learning_rate": 1.9903594492665557e-05, "loss": 0.0466, "step": 415 }, { "epoch": 0.07385264638649551, "grad_norm": 0.8700308203697205, "learning_rate": 1.9899609656954183e-05, "loss": 0.0652, "step": 420 }, { "epoch": 0.07473184455776331, "grad_norm": 0.1704479455947876, "learning_rate": 1.9895544540937358e-05, "loss": 0.0494, "step": 425 }, { "epoch": 0.07561104272903113, "grad_norm": 0.6057877540588379, "learning_rate": 1.989139917758091e-05, "loss": 0.0494, "step": 430 }, { "epoch": 0.07649024090029893, "grad_norm": 1.0760382413864136, "learning_rate": 1.9887173600501414e-05, "loss": 0.0767, "step": 435 }, { "epoch": 0.07736943907156674, "grad_norm": 0.42263808846473694, "learning_rate": 1.988286784396594e-05, "loss": 0.0666, "step": 440 }, { "epoch": 0.07824863724283454, "grad_norm": 0.13608968257904053, "learning_rate": 1.987848194289178e-05, "loss": 0.0663, "step": 445 }, { "epoch": 0.07912783541410234, "grad_norm": 0.20840178430080414, "learning_rate": 1.987401593284613e-05, "loss": 0.0814, "step": 450 }, { "epoch": 0.08000703358537015, "grad_norm": 1.5564632415771484, "learning_rate": 1.9869469850045845e-05, "loss": 0.0733, "step": 455 }, { "epoch": 0.08088623175663795, "grad_norm": 0.3628084063529968, "learning_rate": 1.9864843731357108e-05, "loss": 0.0668, "step": 460 }, { "epoch": 0.08176542992790575, "grad_norm": 6.541281223297119, "learning_rate": 1.986013761429517e-05, "loss": 0.0575, "step": 465 }, { "epoch": 0.08264462809917356, "grad_norm": 0.3698543310165405, "learning_rate": 1.9855351537024004e-05, "loss": 0.0686, "step": 470 }, { "epoch": 0.08352382627044136, "grad_norm": 0.21145962178707123, "learning_rate": 1.9850485538356026e-05, "loss": 0.0693, "step": 475 }, { "epoch": 0.08440302444170916, "grad_norm": 0.718197226524353, "learning_rate": 1.9845539657751768e-05, "loss": 0.0577, "step": 480 }, { "epoch": 0.08528222261297697, "grad_norm": 1.4340827465057373, "learning_rate": 1.9840513935319557e-05, "loss": 0.056, "step": 485 }, { "epoch": 0.08616142078424477, "grad_norm": 2.368858814239502, "learning_rate": 1.98354084118152e-05, "loss": 0.0674, "step": 490 }, { "epoch": 0.08704061895551257, "grad_norm": 0.6914955973625183, "learning_rate": 1.9830223128641636e-05, "loss": 0.0646, "step": 495 }, { "epoch": 0.08791981712678038, "grad_norm": 0.5653345584869385, "learning_rate": 1.9824958127848618e-05, "loss": 0.0868, "step": 500 }, { "epoch": 0.08879901529804818, "grad_norm": 0.6143190860748291, "learning_rate": 1.9819613452132365e-05, "loss": 0.0524, "step": 505 }, { "epoch": 0.08967821346931598, "grad_norm": 0.9025689363479614, "learning_rate": 1.9814189144835205e-05, "loss": 0.0646, "step": 510 }, { "epoch": 0.09055741164058378, "grad_norm": 1.0996524095535278, "learning_rate": 1.9808685249945245e-05, "loss": 0.0686, "step": 515 }, { "epoch": 0.09143660981185159, "grad_norm": 1.0614774227142334, "learning_rate": 1.9803101812096e-05, "loss": 0.0636, "step": 520 }, { "epoch": 0.09231580798311939, "grad_norm": 0.67917799949646, "learning_rate": 1.9797438876566027e-05, "loss": 0.0623, "step": 525 }, { "epoch": 0.0931950061543872, "grad_norm": 0.29005610942840576, "learning_rate": 1.9791696489278578e-05, "loss": 0.059, "step": 530 }, { "epoch": 0.094074204325655, "grad_norm": 0.6829861402511597, "learning_rate": 1.97858746968012e-05, "loss": 0.0797, "step": 535 }, { "epoch": 0.0949534024969228, "grad_norm": 0.22500386834144592, "learning_rate": 1.9779973546345385e-05, "loss": 0.0673, "step": 540 }, { "epoch": 0.0958326006681906, "grad_norm": 1.5297006368637085, "learning_rate": 1.9773993085766163e-05, "loss": 0.062, "step": 545 }, { "epoch": 0.09671179883945841, "grad_norm": 0.35818400979042053, "learning_rate": 1.976793336356173e-05, "loss": 0.0627, "step": 550 }, { "epoch": 0.09759099701072622, "grad_norm": 1.0418643951416016, "learning_rate": 1.976179442887305e-05, "loss": 0.0756, "step": 555 }, { "epoch": 0.09847019518199403, "grad_norm": 1.5865000486373901, "learning_rate": 1.9755576331483453e-05, "loss": 0.0577, "step": 560 }, { "epoch": 0.09934939335326183, "grad_norm": 0.43606239557266235, "learning_rate": 1.9749279121818235e-05, "loss": 0.0642, "step": 565 }, { "epoch": 0.10022859152452963, "grad_norm": 0.1087045669555664, "learning_rate": 1.9742902850944257e-05, "loss": 0.0667, "step": 570 }, { "epoch": 0.10110778969579744, "grad_norm": 0.4932880103588104, "learning_rate": 1.9736447570569503e-05, "loss": 0.0643, "step": 575 }, { "epoch": 0.10198698786706524, "grad_norm": 0.28585073351860046, "learning_rate": 1.97299133330427e-05, "loss": 0.0619, "step": 580 }, { "epoch": 0.10286618603833304, "grad_norm": 0.1778407096862793, "learning_rate": 1.9723300191352866e-05, "loss": 0.0482, "step": 585 }, { "epoch": 0.10374538420960085, "grad_norm": 0.35073766112327576, "learning_rate": 1.971660819912888e-05, "loss": 0.075, "step": 590 }, { "epoch": 0.10462458238086865, "grad_norm": 0.19325245916843414, "learning_rate": 1.9709837410639062e-05, "loss": 0.0629, "step": 595 }, { "epoch": 0.10550378055213645, "grad_norm": 0.5602083802223206, "learning_rate": 1.9702987880790733e-05, "loss": 0.0537, "step": 600 }, { "epoch": 0.10638297872340426, "grad_norm": 0.6220573782920837, "learning_rate": 1.969605966512975e-05, "loss": 0.0735, "step": 605 }, { "epoch": 0.10726217689467206, "grad_norm": 0.7392856478691101, "learning_rate": 1.968905281984007e-05, "loss": 0.0567, "step": 610 }, { "epoch": 0.10814137506593986, "grad_norm": 0.4744727909564972, "learning_rate": 1.9681967401743297e-05, "loss": 0.0668, "step": 615 }, { "epoch": 0.10902057323720767, "grad_norm": 1.1823076009750366, "learning_rate": 1.9674803468298216e-05, "loss": 0.0613, "step": 620 }, { "epoch": 0.10989977140847547, "grad_norm": 0.7359253764152527, "learning_rate": 1.9667561077600325e-05, "loss": 0.0633, "step": 625 }, { "epoch": 0.11077896957974327, "grad_norm": 0.49054092168807983, "learning_rate": 1.966024028838137e-05, "loss": 0.0571, "step": 630 }, { "epoch": 0.11165816775101108, "grad_norm": 0.3266174793243408, "learning_rate": 1.965284116000886e-05, "loss": 0.0717, "step": 635 }, { "epoch": 0.11253736592227888, "grad_norm": 0.2428748905658722, "learning_rate": 1.9645363752485594e-05, "loss": 0.0805, "step": 640 }, { "epoch": 0.11341656409354668, "grad_norm": 0.27535462379455566, "learning_rate": 1.963780812644917e-05, "loss": 0.0632, "step": 645 }, { "epoch": 0.11429576226481448, "grad_norm": 1.5982582569122314, "learning_rate": 1.9630174343171498e-05, "loss": 0.0574, "step": 650 }, { "epoch": 0.11517496043608229, "grad_norm": 0.09966005384922028, "learning_rate": 1.9622462464558296e-05, "loss": 0.0505, "step": 655 }, { "epoch": 0.11605415860735009, "grad_norm": 0.3889922797679901, "learning_rate": 1.9614672553148592e-05, "loss": 0.0598, "step": 660 }, { "epoch": 0.1169333567786179, "grad_norm": 1.1012970209121704, "learning_rate": 1.9606804672114217e-05, "loss": 0.0618, "step": 665 }, { "epoch": 0.1178125549498857, "grad_norm": 0.4842506945133209, "learning_rate": 1.959885888525929e-05, "loss": 0.065, "step": 670 }, { "epoch": 0.11869175312115351, "grad_norm": 0.5499223470687866, "learning_rate": 1.9590835257019715e-05, "loss": 0.0575, "step": 675 }, { "epoch": 0.11957095129242132, "grad_norm": 0.9636365175247192, "learning_rate": 1.9582733852462623e-05, "loss": 0.0565, "step": 680 }, { "epoch": 0.12045014946368912, "grad_norm": 0.5249933004379272, "learning_rate": 1.9574554737285885e-05, "loss": 0.0594, "step": 685 }, { "epoch": 0.12132934763495692, "grad_norm": 0.8125589489936829, "learning_rate": 1.956629797781756e-05, "loss": 0.0652, "step": 690 }, { "epoch": 0.12220854580622473, "grad_norm": 0.3194701373577118, "learning_rate": 1.955796364101535e-05, "loss": 0.0634, "step": 695 }, { "epoch": 0.12308774397749253, "grad_norm": 0.3122730851173401, "learning_rate": 1.954955179446608e-05, "loss": 0.0577, "step": 700 }, { "epoch": 0.12396694214876033, "grad_norm": 0.546394407749176, "learning_rate": 1.9541062506385116e-05, "loss": 0.0635, "step": 705 }, { "epoch": 0.12484614032002814, "grad_norm": 0.6376326680183411, "learning_rate": 1.9532495845615854e-05, "loss": 0.0702, "step": 710 }, { "epoch": 0.12572533849129594, "grad_norm": 0.5695558786392212, "learning_rate": 1.9523851881629124e-05, "loss": 0.0613, "step": 715 }, { "epoch": 0.12660453666256374, "grad_norm": 0.5965823531150818, "learning_rate": 1.9515130684522647e-05, "loss": 0.0652, "step": 720 }, { "epoch": 0.12748373483383155, "grad_norm": 0.4935171604156494, "learning_rate": 1.950633232502046e-05, "loss": 0.063, "step": 725 }, { "epoch": 0.12836293300509935, "grad_norm": 0.5116500854492188, "learning_rate": 1.9497456874472346e-05, "loss": 0.0552, "step": 730 }, { "epoch": 0.12924213117636715, "grad_norm": 0.5821178555488586, "learning_rate": 1.9488504404853247e-05, "loss": 0.0591, "step": 735 }, { "epoch": 0.13012132934763496, "grad_norm": 0.31190237402915955, "learning_rate": 1.94794749887627e-05, "loss": 0.0755, "step": 740 }, { "epoch": 0.13100052751890276, "grad_norm": 0.6998320817947388, "learning_rate": 1.947036869942422e-05, "loss": 0.086, "step": 745 }, { "epoch": 0.13187972569017056, "grad_norm": 0.5217974185943604, "learning_rate": 1.9461185610684736e-05, "loss": 0.0602, "step": 750 }, { "epoch": 0.13275892386143837, "grad_norm": 1.050721287727356, "learning_rate": 1.9451925797013955e-05, "loss": 0.0698, "step": 755 }, { "epoch": 0.13363812203270617, "grad_norm": 0.2586376965045929, "learning_rate": 1.9442589333503806e-05, "loss": 0.0622, "step": 760 }, { "epoch": 0.13451732020397397, "grad_norm": 0.5251173377037048, "learning_rate": 1.9433176295867792e-05, "loss": 0.0567, "step": 765 }, { "epoch": 0.13539651837524178, "grad_norm": 0.4066588878631592, "learning_rate": 1.9423686760440386e-05, "loss": 0.0548, "step": 770 }, { "epoch": 0.13627571654650958, "grad_norm": 0.5858006477355957, "learning_rate": 1.9414120804176427e-05, "loss": 0.0607, "step": 775 }, { "epoch": 0.13715491471777738, "grad_norm": 0.9163162112236023, "learning_rate": 1.9404478504650473e-05, "loss": 0.0575, "step": 780 }, { "epoch": 0.13803411288904519, "grad_norm": 0.274795264005661, "learning_rate": 1.939475994005619e-05, "loss": 0.07, "step": 785 }, { "epoch": 0.138913311060313, "grad_norm": 0.1485268473625183, "learning_rate": 1.938496518920571e-05, "loss": 0.067, "step": 790 }, { "epoch": 0.1397925092315808, "grad_norm": 0.57244473695755, "learning_rate": 1.937509433152899e-05, "loss": 0.0742, "step": 795 }, { "epoch": 0.1406717074028486, "grad_norm": 0.9547910690307617, "learning_rate": 1.9365147447073172e-05, "loss": 0.0655, "step": 800 }, { "epoch": 0.1415509055741164, "grad_norm": 0.8948219418525696, "learning_rate": 1.9355124616501936e-05, "loss": 0.0714, "step": 805 }, { "epoch": 0.1424301037453842, "grad_norm": 0.7073503136634827, "learning_rate": 1.934502592109484e-05, "loss": 0.0646, "step": 810 }, { "epoch": 0.143309301916652, "grad_norm": 0.38196781277656555, "learning_rate": 1.9334851442746665e-05, "loss": 0.0606, "step": 815 }, { "epoch": 0.1441885000879198, "grad_norm": 0.22767876088619232, "learning_rate": 1.9324601263966746e-05, "loss": 0.0586, "step": 820 }, { "epoch": 0.1450676982591876, "grad_norm": 0.44821423292160034, "learning_rate": 1.9314275467878304e-05, "loss": 0.0671, "step": 825 }, { "epoch": 0.14594689643045541, "grad_norm": 0.32358282804489136, "learning_rate": 1.9303874138217788e-05, "loss": 0.0535, "step": 830 }, { "epoch": 0.14682609460172322, "grad_norm": 0.39932888746261597, "learning_rate": 1.9293397359334167e-05, "loss": 0.0553, "step": 835 }, { "epoch": 0.14770529277299102, "grad_norm": 0.160264790058136, "learning_rate": 1.9282845216188267e-05, "loss": 0.0583, "step": 840 }, { "epoch": 0.14858449094425882, "grad_norm": 0.5190912485122681, "learning_rate": 1.9272217794352073e-05, "loss": 0.0716, "step": 845 }, { "epoch": 0.14946368911552663, "grad_norm": 0.5016174912452698, "learning_rate": 1.9261515180008047e-05, "loss": 0.0668, "step": 850 }, { "epoch": 0.15034288728679443, "grad_norm": 0.12489809095859528, "learning_rate": 1.9250737459948404e-05, "loss": 0.0619, "step": 855 }, { "epoch": 0.15122208545806226, "grad_norm": 1.153669834136963, "learning_rate": 1.923988472157445e-05, "loss": 0.0779, "step": 860 }, { "epoch": 0.15210128362933006, "grad_norm": 0.2374788522720337, "learning_rate": 1.9228957052895816e-05, "loss": 0.0677, "step": 865 }, { "epoch": 0.15298048180059787, "grad_norm": 1.060134768486023, "learning_rate": 1.92179545425298e-05, "loss": 0.0632, "step": 870 }, { "epoch": 0.15385967997186567, "grad_norm": 0.3676360845565796, "learning_rate": 1.9206877279700614e-05, "loss": 0.0614, "step": 875 }, { "epoch": 0.15473887814313347, "grad_norm": 0.3198089003562927, "learning_rate": 1.9195725354238677e-05, "loss": 0.0718, "step": 880 }, { "epoch": 0.15561807631440128, "grad_norm": 0.2891201674938202, "learning_rate": 1.918449885657987e-05, "loss": 0.0586, "step": 885 }, { "epoch": 0.15649727448566908, "grad_norm": 0.4054102897644043, "learning_rate": 1.9173197877764824e-05, "loss": 0.0523, "step": 890 }, { "epoch": 0.15737647265693688, "grad_norm": 0.1266939640045166, "learning_rate": 1.916182250943816e-05, "loss": 0.0546, "step": 895 }, { "epoch": 0.1582556708282047, "grad_norm": 0.7244488000869751, "learning_rate": 1.915037284384777e-05, "loss": 0.0634, "step": 900 }, { "epoch": 0.1591348689994725, "grad_norm": 0.8754041790962219, "learning_rate": 1.913884897384404e-05, "loss": 0.0712, "step": 905 }, { "epoch": 0.1600140671707403, "grad_norm": 0.7527337670326233, "learning_rate": 1.9127250992879128e-05, "loss": 0.0685, "step": 910 }, { "epoch": 0.1608932653420081, "grad_norm": 0.8655832409858704, "learning_rate": 1.9115578995006175e-05, "loss": 0.0709, "step": 915 }, { "epoch": 0.1617724635132759, "grad_norm": 0.5657609105110168, "learning_rate": 1.9103833074878565e-05, "loss": 0.0606, "step": 920 }, { "epoch": 0.1626516616845437, "grad_norm": 0.5217536091804504, "learning_rate": 1.909201332774916e-05, "loss": 0.0577, "step": 925 }, { "epoch": 0.1635308598558115, "grad_norm": 0.5291991233825684, "learning_rate": 1.908011984946949e-05, "loss": 0.0574, "step": 930 }, { "epoch": 0.1644100580270793, "grad_norm": 0.16585160791873932, "learning_rate": 1.9068152736489036e-05, "loss": 0.0588, "step": 935 }, { "epoch": 0.1652892561983471, "grad_norm": 0.5434625744819641, "learning_rate": 1.9056112085854397e-05, "loss": 0.0645, "step": 940 }, { "epoch": 0.16616845436961492, "grad_norm": 0.685371458530426, "learning_rate": 1.9043997995208525e-05, "loss": 0.0452, "step": 945 }, { "epoch": 0.16704765254088272, "grad_norm": 0.3393997251987457, "learning_rate": 1.9031810562789927e-05, "loss": 0.0569, "step": 950 }, { "epoch": 0.16792685071215052, "grad_norm": 0.281892329454422, "learning_rate": 1.901954988743188e-05, "loss": 0.0686, "step": 955 }, { "epoch": 0.16880604888341832, "grad_norm": 0.5703971982002258, "learning_rate": 1.9007216068561605e-05, "loss": 0.0667, "step": 960 }, { "epoch": 0.16968524705468613, "grad_norm": 0.6741696000099182, "learning_rate": 1.899480920619949e-05, "loss": 0.0551, "step": 965 }, { "epoch": 0.17056444522595393, "grad_norm": 1.3032324314117432, "learning_rate": 1.8982329400958254e-05, "loss": 0.066, "step": 970 }, { "epoch": 0.17144364339722173, "grad_norm": 0.8134323954582214, "learning_rate": 1.8969776754042157e-05, "loss": 0.0704, "step": 975 }, { "epoch": 0.17232284156848954, "grad_norm": 0.6495192646980286, "learning_rate": 1.895715136724615e-05, "loss": 0.0687, "step": 980 }, { "epoch": 0.17320203973975734, "grad_norm": 0.49367162585258484, "learning_rate": 1.8944453342955064e-05, "loss": 0.0555, "step": 985 }, { "epoch": 0.17408123791102514, "grad_norm": 0.2508549392223358, "learning_rate": 1.8931682784142792e-05, "loss": 0.0694, "step": 990 }, { "epoch": 0.17496043608229295, "grad_norm": 0.3268815875053406, "learning_rate": 1.891883979437143e-05, "loss": 0.058, "step": 995 }, { "epoch": 0.17583963425356075, "grad_norm": 0.6226515173912048, "learning_rate": 1.8905924477790452e-05, "loss": 0.0661, "step": 1000 }, { "epoch": 0.17671883242482855, "grad_norm": 0.3348465859889984, "learning_rate": 1.8892936939135863e-05, "loss": 0.0651, "step": 1005 }, { "epoch": 0.17759803059609636, "grad_norm": 0.7326810956001282, "learning_rate": 1.887987728372935e-05, "loss": 0.0695, "step": 1010 }, { "epoch": 0.17847722876736416, "grad_norm": 0.6014009714126587, "learning_rate": 1.8866745617477423e-05, "loss": 0.063, "step": 1015 }, { "epoch": 0.17935642693863196, "grad_norm": 1.2527378797531128, "learning_rate": 1.8853542046870558e-05, "loss": 0.0631, "step": 1020 }, { "epoch": 0.18023562510989977, "grad_norm": 0.3176214396953583, "learning_rate": 1.8840266678982343e-05, "loss": 0.0562, "step": 1025 }, { "epoch": 0.18111482328116757, "grad_norm": 0.26997071504592896, "learning_rate": 1.8826919621468595e-05, "loss": 0.0618, "step": 1030 }, { "epoch": 0.18199402145243537, "grad_norm": 0.2553798258304596, "learning_rate": 1.8813500982566498e-05, "loss": 0.0622, "step": 1035 }, { "epoch": 0.18287321962370318, "grad_norm": 0.9949320554733276, "learning_rate": 1.8800010871093718e-05, "loss": 0.0748, "step": 1040 }, { "epoch": 0.18375241779497098, "grad_norm": 0.5384786128997803, "learning_rate": 1.8786449396447528e-05, "loss": 0.0757, "step": 1045 }, { "epoch": 0.18463161596623878, "grad_norm": 0.14809344708919525, "learning_rate": 1.8772816668603907e-05, "loss": 0.0675, "step": 1050 }, { "epoch": 0.18551081413750659, "grad_norm": 0.764203667640686, "learning_rate": 1.8759112798116673e-05, "loss": 0.0615, "step": 1055 }, { "epoch": 0.1863900123087744, "grad_norm": 0.18247248232364655, "learning_rate": 1.874533789611655e-05, "loss": 0.061, "step": 1060 }, { "epoch": 0.1872692104800422, "grad_norm": 0.9988198280334473, "learning_rate": 1.873149207431031e-05, "loss": 0.0591, "step": 1065 }, { "epoch": 0.18814840865131, "grad_norm": 0.07756359130144119, "learning_rate": 1.871757544497983e-05, "loss": 0.0641, "step": 1070 }, { "epoch": 0.1890276068225778, "grad_norm": 0.7131006121635437, "learning_rate": 1.870358812098121e-05, "loss": 0.0581, "step": 1075 }, { "epoch": 0.1899068049938456, "grad_norm": 0.3485928177833557, "learning_rate": 1.868953021574382e-05, "loss": 0.0645, "step": 1080 }, { "epoch": 0.1907860031651134, "grad_norm": 0.16775915026664734, "learning_rate": 1.8675401843269438e-05, "loss": 0.0644, "step": 1085 }, { "epoch": 0.1916652013363812, "grad_norm": 0.3290361762046814, "learning_rate": 1.866120311813126e-05, "loss": 0.0619, "step": 1090 }, { "epoch": 0.192544399507649, "grad_norm": 1.0206267833709717, "learning_rate": 1.8646934155473025e-05, "loss": 0.0854, "step": 1095 }, { "epoch": 0.19342359767891681, "grad_norm": 0.6392635703086853, "learning_rate": 1.8632595071008044e-05, "loss": 0.0647, "step": 1100 }, { "epoch": 0.19430279585018465, "grad_norm": 0.4575440287590027, "learning_rate": 1.8618185981018292e-05, "loss": 0.065, "step": 1105 }, { "epoch": 0.19518199402145245, "grad_norm": 0.5402662754058838, "learning_rate": 1.8603707002353436e-05, "loss": 0.053, "step": 1110 }, { "epoch": 0.19606119219272025, "grad_norm": 0.16452832520008087, "learning_rate": 1.858915825242991e-05, "loss": 0.0577, "step": 1115 }, { "epoch": 0.19694039036398805, "grad_norm": 0.7707622647285461, "learning_rate": 1.857453984922995e-05, "loss": 0.0572, "step": 1120 }, { "epoch": 0.19781958853525586, "grad_norm": 0.2900203466415405, "learning_rate": 1.8559851911300638e-05, "loss": 0.0534, "step": 1125 }, { "epoch": 0.19869878670652366, "grad_norm": 0.2933928370475769, "learning_rate": 1.854509455775295e-05, "loss": 0.0534, "step": 1130 }, { "epoch": 0.19957798487779146, "grad_norm": 1.2258199453353882, "learning_rate": 1.8530267908260782e-05, "loss": 0.0645, "step": 1135 }, { "epoch": 0.20045718304905927, "grad_norm": 0.3073072135448456, "learning_rate": 1.8515372083059982e-05, "loss": 0.0672, "step": 1140 }, { "epoch": 0.20133638122032707, "grad_norm": 0.23768655955791473, "learning_rate": 1.850040720294737e-05, "loss": 0.0573, "step": 1145 }, { "epoch": 0.20221557939159487, "grad_norm": 0.6997068524360657, "learning_rate": 1.8485373389279768e-05, "loss": 0.0564, "step": 1150 }, { "epoch": 0.20309477756286268, "grad_norm": 0.4729757308959961, "learning_rate": 1.8470270763973004e-05, "loss": 0.0588, "step": 1155 }, { "epoch": 0.20397397573413048, "grad_norm": 0.19242537021636963, "learning_rate": 1.845509944950094e-05, "loss": 0.0532, "step": 1160 }, { "epoch": 0.20485317390539828, "grad_norm": 0.1492680460214615, "learning_rate": 1.8439859568894464e-05, "loss": 0.0658, "step": 1165 }, { "epoch": 0.2057323720766661, "grad_norm": 0.6383575201034546, "learning_rate": 1.8424551245740493e-05, "loss": 0.0563, "step": 1170 }, { "epoch": 0.2066115702479339, "grad_norm": 0.9722626805305481, "learning_rate": 1.8409174604180977e-05, "loss": 0.0603, "step": 1175 }, { "epoch": 0.2074907684192017, "grad_norm": 0.5511413812637329, "learning_rate": 1.8393729768911894e-05, "loss": 0.0534, "step": 1180 }, { "epoch": 0.2083699665904695, "grad_norm": 0.3645865321159363, "learning_rate": 1.837821686518223e-05, "loss": 0.0601, "step": 1185 }, { "epoch": 0.2092491647617373, "grad_norm": 0.43972018361091614, "learning_rate": 1.8362636018792975e-05, "loss": 0.049, "step": 1190 }, { "epoch": 0.2101283629330051, "grad_norm": 0.34283024072647095, "learning_rate": 1.8346987356096087e-05, "loss": 0.0596, "step": 1195 }, { "epoch": 0.2110075611042729, "grad_norm": 0.3272128701210022, "learning_rate": 1.833127100399348e-05, "loss": 0.0604, "step": 1200 }, { "epoch": 0.2118867592755407, "grad_norm": 0.48746854066848755, "learning_rate": 1.8315487089935995e-05, "loss": 0.0505, "step": 1205 }, { "epoch": 0.2127659574468085, "grad_norm": 0.21915239095687866, "learning_rate": 1.8299635741922365e-05, "loss": 0.0574, "step": 1210 }, { "epoch": 0.21364515561807632, "grad_norm": 0.3507218360900879, "learning_rate": 1.8283717088498157e-05, "loss": 0.0651, "step": 1215 }, { "epoch": 0.21452435378934412, "grad_norm": 0.712352454662323, "learning_rate": 1.8267731258754765e-05, "loss": 0.0564, "step": 1220 }, { "epoch": 0.21540355196061192, "grad_norm": 0.3927139937877655, "learning_rate": 1.8251678382328345e-05, "loss": 0.0474, "step": 1225 }, { "epoch": 0.21628275013187973, "grad_norm": 1.3225253820419312, "learning_rate": 1.8235558589398756e-05, "loss": 0.0826, "step": 1230 }, { "epoch": 0.21716194830314753, "grad_norm": 1.0917742252349854, "learning_rate": 1.8219372010688516e-05, "loss": 0.0614, "step": 1235 }, { "epoch": 0.21804114647441533, "grad_norm": 1.2095632553100586, "learning_rate": 1.8203118777461735e-05, "loss": 0.0569, "step": 1240 }, { "epoch": 0.21892034464568313, "grad_norm": 0.1377822309732437, "learning_rate": 1.8186799021523064e-05, "loss": 0.063, "step": 1245 }, { "epoch": 0.21979954281695094, "grad_norm": 1.2656950950622559, "learning_rate": 1.81704128752166e-05, "loss": 0.0664, "step": 1250 }, { "epoch": 0.22067874098821874, "grad_norm": 0.21433009207248688, "learning_rate": 1.815396047142485e-05, "loss": 0.0636, "step": 1255 }, { "epoch": 0.22155793915948654, "grad_norm": 0.34033942222595215, "learning_rate": 1.8137441943567607e-05, "loss": 0.0535, "step": 1260 }, { "epoch": 0.22243713733075435, "grad_norm": 0.3627121150493622, "learning_rate": 1.8120857425600914e-05, "loss": 0.0596, "step": 1265 }, { "epoch": 0.22331633550202215, "grad_norm": 0.6685848832130432, "learning_rate": 1.8104207052015952e-05, "loss": 0.0696, "step": 1270 }, { "epoch": 0.22419553367328995, "grad_norm": 0.2353779375553131, "learning_rate": 1.8087490957837947e-05, "loss": 0.0536, "step": 1275 }, { "epoch": 0.22507473184455776, "grad_norm": 0.403475821018219, "learning_rate": 1.807070927862509e-05, "loss": 0.0636, "step": 1280 }, { "epoch": 0.22595393001582556, "grad_norm": 0.6222421526908875, "learning_rate": 1.8053862150467417e-05, "loss": 0.0558, "step": 1285 }, { "epoch": 0.22683312818709336, "grad_norm": 0.3176823854446411, "learning_rate": 1.803694970998574e-05, "loss": 0.0566, "step": 1290 }, { "epoch": 0.22771232635836117, "grad_norm": 0.7340067625045776, "learning_rate": 1.8019972094330502e-05, "loss": 0.0487, "step": 1295 }, { "epoch": 0.22859152452962897, "grad_norm": 1.254428505897522, "learning_rate": 1.8002929441180684e-05, "loss": 0.0511, "step": 1300 }, { "epoch": 0.22947072270089677, "grad_norm": 0.20499931275844574, "learning_rate": 1.7985821888742687e-05, "loss": 0.0648, "step": 1305 }, { "epoch": 0.23034992087216458, "grad_norm": 0.4078265428543091, "learning_rate": 1.7968649575749202e-05, "loss": 0.047, "step": 1310 }, { "epoch": 0.23122911904343238, "grad_norm": 0.15697845816612244, "learning_rate": 1.79514126414581e-05, "loss": 0.0488, "step": 1315 }, { "epoch": 0.23210831721470018, "grad_norm": 0.6871252059936523, "learning_rate": 1.7934111225651293e-05, "loss": 0.0585, "step": 1320 }, { "epoch": 0.23298751538596799, "grad_norm": 0.13073213398456573, "learning_rate": 1.7916745468633593e-05, "loss": 0.0576, "step": 1325 }, { "epoch": 0.2338667135572358, "grad_norm": 0.150588259100914, "learning_rate": 1.7899315511231598e-05, "loss": 0.0572, "step": 1330 }, { "epoch": 0.2347459117285036, "grad_norm": 0.40196287631988525, "learning_rate": 1.7881821494792527e-05, "loss": 0.0573, "step": 1335 }, { "epoch": 0.2356251098997714, "grad_norm": 0.7040359973907471, "learning_rate": 1.7864263561183085e-05, "loss": 0.0653, "step": 1340 }, { "epoch": 0.2365043080710392, "grad_norm": 1.1014829874038696, "learning_rate": 1.78466418527883e-05, "loss": 0.0509, "step": 1345 }, { "epoch": 0.23738350624230703, "grad_norm": 0.2666812539100647, "learning_rate": 1.782895651251039e-05, "loss": 0.0585, "step": 1350 }, { "epoch": 0.23826270441357483, "grad_norm": 0.9666887521743774, "learning_rate": 1.781120768376759e-05, "loss": 0.0588, "step": 1355 }, { "epoch": 0.23914190258484264, "grad_norm": 0.7215674519538879, "learning_rate": 1.7793395510492986e-05, "loss": 0.0597, "step": 1360 }, { "epoch": 0.24002110075611044, "grad_norm": 0.5669434070587158, "learning_rate": 1.7775520137133354e-05, "loss": 0.0516, "step": 1365 }, { "epoch": 0.24090029892737824, "grad_norm": 0.38593825697898865, "learning_rate": 1.775758170864799e-05, "loss": 0.0485, "step": 1370 }, { "epoch": 0.24177949709864605, "grad_norm": 0.5211871266365051, "learning_rate": 1.7739580370507533e-05, "loss": 0.0619, "step": 1375 }, { "epoch": 0.24265869526991385, "grad_norm": 0.3265356719493866, "learning_rate": 1.7721516268692776e-05, "loss": 0.0593, "step": 1380 }, { "epoch": 0.24353789344118165, "grad_norm": 0.5374659895896912, "learning_rate": 1.77033895496935e-05, "loss": 0.0666, "step": 1385 }, { "epoch": 0.24441709161244946, "grad_norm": 0.26406246423721313, "learning_rate": 1.768520036050727e-05, "loss": 0.0493, "step": 1390 }, { "epoch": 0.24529628978371726, "grad_norm": 0.38316601514816284, "learning_rate": 1.7666948848638257e-05, "loss": 0.0503, "step": 1395 }, { "epoch": 0.24617548795498506, "grad_norm": 0.3633623719215393, "learning_rate": 1.7648635162096022e-05, "loss": 0.0569, "step": 1400 }, { "epoch": 0.24705468612625286, "grad_norm": 0.941490650177002, "learning_rate": 1.763025944939434e-05, "loss": 0.058, "step": 1405 }, { "epoch": 0.24793388429752067, "grad_norm": 0.21693024039268494, "learning_rate": 1.7611821859549977e-05, "loss": 0.0539, "step": 1410 }, { "epoch": 0.24881308246878847, "grad_norm": 0.7575194835662842, "learning_rate": 1.7593322542081486e-05, "loss": 0.0705, "step": 1415 }, { "epoch": 0.24969228064005627, "grad_norm": 0.3184313178062439, "learning_rate": 1.7574761647008004e-05, "loss": 0.0655, "step": 1420 }, { "epoch": 0.2505714788113241, "grad_norm": 0.9622363448143005, "learning_rate": 1.7556139324848024e-05, "loss": 0.0653, "step": 1425 }, { "epoch": 0.2514506769825919, "grad_norm": 0.3079875111579895, "learning_rate": 1.753745572661817e-05, "loss": 0.0497, "step": 1430 }, { "epoch": 0.2523298751538597, "grad_norm": 0.1410188525915146, "learning_rate": 1.7518711003832003e-05, "loss": 0.0715, "step": 1435 }, { "epoch": 0.2532090733251275, "grad_norm": 0.5498221516609192, "learning_rate": 1.749990530849875e-05, "loss": 0.0705, "step": 1440 }, { "epoch": 0.2540882714963953, "grad_norm": 0.27683818340301514, "learning_rate": 1.748103879312209e-05, "loss": 0.06, "step": 1445 }, { "epoch": 0.2549674696676631, "grad_norm": 1.3148400783538818, "learning_rate": 1.7462111610698934e-05, "loss": 0.0629, "step": 1450 }, { "epoch": 0.2558466678389309, "grad_norm": 0.4923277199268341, "learning_rate": 1.744312391471816e-05, "loss": 0.0573, "step": 1455 }, { "epoch": 0.2567258660101987, "grad_norm": 0.8244169354438782, "learning_rate": 1.7424075859159376e-05, "loss": 0.0561, "step": 1460 }, { "epoch": 0.2576050641814665, "grad_norm": 0.2395920604467392, "learning_rate": 1.7404967598491674e-05, "loss": 0.0643, "step": 1465 }, { "epoch": 0.2584842623527343, "grad_norm": 0.3752864897251129, "learning_rate": 1.7385799287672375e-05, "loss": 0.0634, "step": 1470 }, { "epoch": 0.2593634605240021, "grad_norm": 1.0273178815841675, "learning_rate": 1.736657108214578e-05, "loss": 0.0613, "step": 1475 }, { "epoch": 0.2602426586952699, "grad_norm": 0.9190396666526794, "learning_rate": 1.734728313784189e-05, "loss": 0.0623, "step": 1480 }, { "epoch": 0.2611218568665377, "grad_norm": 0.9993478655815125, "learning_rate": 1.732793561117517e-05, "loss": 0.0421, "step": 1485 }, { "epoch": 0.2620010550378055, "grad_norm": 0.4666178226470947, "learning_rate": 1.7308528659043243e-05, "loss": 0.0531, "step": 1490 }, { "epoch": 0.2628802532090733, "grad_norm": 0.24554145336151123, "learning_rate": 1.7289062438825665e-05, "loss": 0.0514, "step": 1495 }, { "epoch": 0.2637594513803411, "grad_norm": 0.29805853962898254, "learning_rate": 1.7269537108382605e-05, "loss": 0.0526, "step": 1500 }, { "epoch": 0.26463864955160893, "grad_norm": 0.9100229144096375, "learning_rate": 1.7249952826053582e-05, "loss": 0.0653, "step": 1505 }, { "epoch": 0.26551784772287673, "grad_norm": 0.2765738368034363, "learning_rate": 1.72303097506562e-05, "loss": 0.0694, "step": 1510 }, { "epoch": 0.26639704589414454, "grad_norm": 0.12102984637022018, "learning_rate": 1.721060804148482e-05, "loss": 0.0619, "step": 1515 }, { "epoch": 0.26727624406541234, "grad_norm": 0.2673247456550598, "learning_rate": 1.7190847858309304e-05, "loss": 0.0536, "step": 1520 }, { "epoch": 0.26815544223668014, "grad_norm": 0.6815070509910583, "learning_rate": 1.71710293613737e-05, "loss": 0.0521, "step": 1525 }, { "epoch": 0.26903464040794794, "grad_norm": 0.8095347881317139, "learning_rate": 1.7151152711394954e-05, "loss": 0.0628, "step": 1530 }, { "epoch": 0.26991383857921575, "grad_norm": 0.7218222618103027, "learning_rate": 1.7131218069561594e-05, "loss": 0.0405, "step": 1535 }, { "epoch": 0.27079303675048355, "grad_norm": 0.6927086710929871, "learning_rate": 1.7111225597532428e-05, "loss": 0.0647, "step": 1540 }, { "epoch": 0.27167223492175135, "grad_norm": 0.8299700617790222, "learning_rate": 1.7091175457435242e-05, "loss": 0.0648, "step": 1545 }, { "epoch": 0.27255143309301916, "grad_norm": 0.16689668595790863, "learning_rate": 1.7071067811865477e-05, "loss": 0.0481, "step": 1550 }, { "epoch": 0.27343063126428696, "grad_norm": 0.9474055767059326, "learning_rate": 1.7050902823884904e-05, "loss": 0.056, "step": 1555 }, { "epoch": 0.27430982943555476, "grad_norm": 0.2540503740310669, "learning_rate": 1.7030680657020314e-05, "loss": 0.0642, "step": 1560 }, { "epoch": 0.27518902760682257, "grad_norm": 0.24232099950313568, "learning_rate": 1.701040147526219e-05, "loss": 0.0531, "step": 1565 }, { "epoch": 0.27606822577809037, "grad_norm": 1.1714109182357788, "learning_rate": 1.6990065443063364e-05, "loss": 0.0471, "step": 1570 }, { "epoch": 0.2769474239493582, "grad_norm": 0.2802835702896118, "learning_rate": 1.6969672725337706e-05, "loss": 0.0678, "step": 1575 }, { "epoch": 0.277826622120626, "grad_norm": 0.6939117312431335, "learning_rate": 1.6949223487458764e-05, "loss": 0.0576, "step": 1580 }, { "epoch": 0.2787058202918938, "grad_norm": 0.558993935585022, "learning_rate": 1.692871789525844e-05, "loss": 0.0567, "step": 1585 }, { "epoch": 0.2795850184631616, "grad_norm": 0.6370697617530823, "learning_rate": 1.6908156115025626e-05, "loss": 0.0578, "step": 1590 }, { "epoch": 0.2804642166344294, "grad_norm": 0.22744275629520416, "learning_rate": 1.6887538313504883e-05, "loss": 0.0594, "step": 1595 }, { "epoch": 0.2813434148056972, "grad_norm": 0.9439639449119568, "learning_rate": 1.686686465789507e-05, "loss": 0.0473, "step": 1600 }, { "epoch": 0.282222612976965, "grad_norm": 0.5774978995323181, "learning_rate": 1.6846135315847978e-05, "loss": 0.0457, "step": 1605 }, { "epoch": 0.2831018111482328, "grad_norm": 1.2410329580307007, "learning_rate": 1.6825350455467e-05, "loss": 0.0602, "step": 1610 }, { "epoch": 0.2839810093195006, "grad_norm": 0.5602573752403259, "learning_rate": 1.6804510245305745e-05, "loss": 0.0533, "step": 1615 }, { "epoch": 0.2848602074907684, "grad_norm": 0.6151893734931946, "learning_rate": 1.678361485436668e-05, "loss": 0.0549, "step": 1620 }, { "epoch": 0.2857394056620362, "grad_norm": 0.17069809138774872, "learning_rate": 1.676266445209975e-05, "loss": 0.0548, "step": 1625 }, { "epoch": 0.286618603833304, "grad_norm": 0.2259790599346161, "learning_rate": 1.674165920840102e-05, "loss": 0.056, "step": 1630 }, { "epoch": 0.2874978020045718, "grad_norm": 0.17054542899131775, "learning_rate": 1.6720599293611287e-05, "loss": 0.0637, "step": 1635 }, { "epoch": 0.2883770001758396, "grad_norm": 0.38651248812675476, "learning_rate": 1.6699484878514693e-05, "loss": 0.0517, "step": 1640 }, { "epoch": 0.2892561983471074, "grad_norm": 0.3397147059440613, "learning_rate": 1.6678316134337362e-05, "loss": 0.0545, "step": 1645 }, { "epoch": 0.2901353965183752, "grad_norm": 0.28409913182258606, "learning_rate": 1.6657093232745973e-05, "loss": 0.0562, "step": 1650 }, { "epoch": 0.291014594689643, "grad_norm": 0.8544853925704956, "learning_rate": 1.6635816345846413e-05, "loss": 0.0641, "step": 1655 }, { "epoch": 0.29189379286091083, "grad_norm": 0.35938528180122375, "learning_rate": 1.661448564618235e-05, "loss": 0.0589, "step": 1660 }, { "epoch": 0.29277299103217863, "grad_norm": 0.3898780345916748, "learning_rate": 1.6593101306733847e-05, "loss": 0.059, "step": 1665 }, { "epoch": 0.29365218920344643, "grad_norm": 0.34503045678138733, "learning_rate": 1.6571663500915957e-05, "loss": 0.0702, "step": 1670 }, { "epoch": 0.29453138737471424, "grad_norm": 0.06647461652755737, "learning_rate": 1.6550172402577304e-05, "loss": 0.0618, "step": 1675 }, { "epoch": 0.29541058554598204, "grad_norm": 0.3430112600326538, "learning_rate": 1.6528628185998697e-05, "loss": 0.0587, "step": 1680 }, { "epoch": 0.29628978371724984, "grad_norm": 0.773381769657135, "learning_rate": 1.65070310258917e-05, "loss": 0.0608, "step": 1685 }, { "epoch": 0.29716898188851765, "grad_norm": 0.6421689391136169, "learning_rate": 1.6485381097397223e-05, "loss": 0.0558, "step": 1690 }, { "epoch": 0.29804818005978545, "grad_norm": 0.6023097634315491, "learning_rate": 1.646367857608409e-05, "loss": 0.0597, "step": 1695 }, { "epoch": 0.29892737823105325, "grad_norm": 0.47872084379196167, "learning_rate": 1.6441923637947627e-05, "loss": 0.0647, "step": 1700 }, { "epoch": 0.29980657640232106, "grad_norm": 1.3520686626434326, "learning_rate": 1.6420116459408237e-05, "loss": 0.0621, "step": 1705 }, { "epoch": 0.30068577457358886, "grad_norm": 0.354427307844162, "learning_rate": 1.6398257217309956e-05, "loss": 0.0591, "step": 1710 }, { "epoch": 0.30156497274485666, "grad_norm": 0.4515109658241272, "learning_rate": 1.6376346088919032e-05, "loss": 0.0444, "step": 1715 }, { "epoch": 0.3024441709161245, "grad_norm": 0.8840310573577881, "learning_rate": 1.6354383251922473e-05, "loss": 0.069, "step": 1720 }, { "epoch": 0.3033233690873923, "grad_norm": 0.13943177461624146, "learning_rate": 1.633236888442663e-05, "loss": 0.0621, "step": 1725 }, { "epoch": 0.30420256725866013, "grad_norm": 0.2890174984931946, "learning_rate": 1.631030316495572e-05, "loss": 0.0676, "step": 1730 }, { "epoch": 0.30508176542992793, "grad_norm": 0.18880678713321686, "learning_rate": 1.6288186272450407e-05, "loss": 0.0636, "step": 1735 }, { "epoch": 0.30596096360119573, "grad_norm": 0.5174923539161682, "learning_rate": 1.626601838626634e-05, "loss": 0.0638, "step": 1740 }, { "epoch": 0.30684016177246354, "grad_norm": 0.47316744923591614, "learning_rate": 1.624379968617269e-05, "loss": 0.0571, "step": 1745 }, { "epoch": 0.30771935994373134, "grad_norm": 0.4440658986568451, "learning_rate": 1.6221530352350713e-05, "loss": 0.0551, "step": 1750 }, { "epoch": 0.30859855811499914, "grad_norm": 0.5249147415161133, "learning_rate": 1.619921056539226e-05, "loss": 0.0559, "step": 1755 }, { "epoch": 0.30947775628626695, "grad_norm": 0.6782397627830505, "learning_rate": 1.6176840506298345e-05, "loss": 0.0695, "step": 1760 }, { "epoch": 0.31035695445753475, "grad_norm": 0.6498162150382996, "learning_rate": 1.615442035647765e-05, "loss": 0.0547, "step": 1765 }, { "epoch": 0.31123615262880255, "grad_norm": 0.458238422870636, "learning_rate": 1.6131950297745075e-05, "loss": 0.0608, "step": 1770 }, { "epoch": 0.31211535080007036, "grad_norm": 0.15339304506778717, "learning_rate": 1.6109430512320235e-05, "loss": 0.0583, "step": 1775 }, { "epoch": 0.31299454897133816, "grad_norm": 0.3111644387245178, "learning_rate": 1.6086861182826024e-05, "loss": 0.051, "step": 1780 }, { "epoch": 0.31387374714260596, "grad_norm": 1.0354893207550049, "learning_rate": 1.6064242492287095e-05, "loss": 0.065, "step": 1785 }, { "epoch": 0.31475294531387377, "grad_norm": 0.2242657095193863, "learning_rate": 1.6041574624128392e-05, "loss": 0.0473, "step": 1790 }, { "epoch": 0.31563214348514157, "grad_norm": 0.2808036208152771, "learning_rate": 1.6018857762173672e-05, "loss": 0.0537, "step": 1795 }, { "epoch": 0.3165113416564094, "grad_norm": 0.3030805289745331, "learning_rate": 1.5996092090643993e-05, "loss": 0.0529, "step": 1800 }, { "epoch": 0.3173905398276772, "grad_norm": 1.5149699449539185, "learning_rate": 1.597327779415624e-05, "loss": 0.0541, "step": 1805 }, { "epoch": 0.318269737998945, "grad_norm": 1.2089002132415771, "learning_rate": 1.595041505772162e-05, "loss": 0.0748, "step": 1810 }, { "epoch": 0.3191489361702128, "grad_norm": 0.28580719232559204, "learning_rate": 1.5927504066744147e-05, "loss": 0.0569, "step": 1815 }, { "epoch": 0.3200281343414806, "grad_norm": 0.7665076851844788, "learning_rate": 1.590454500701917e-05, "loss": 0.0648, "step": 1820 }, { "epoch": 0.3209073325127484, "grad_norm": 0.771531343460083, "learning_rate": 1.5881538064731838e-05, "loss": 0.0624, "step": 1825 }, { "epoch": 0.3217865306840162, "grad_norm": 0.11554717272520065, "learning_rate": 1.58584834264556e-05, "loss": 0.0474, "step": 1830 }, { "epoch": 0.322665728855284, "grad_norm": 0.13242988288402557, "learning_rate": 1.5835381279150705e-05, "loss": 0.055, "step": 1835 }, { "epoch": 0.3235449270265518, "grad_norm": 1.2687695026397705, "learning_rate": 1.5812231810162656e-05, "loss": 0.0595, "step": 1840 }, { "epoch": 0.3244241251978196, "grad_norm": 0.23642951250076294, "learning_rate": 1.5789035207220725e-05, "loss": 0.0433, "step": 1845 }, { "epoch": 0.3253033233690874, "grad_norm": 0.6303196549415588, "learning_rate": 1.5765791658436406e-05, "loss": 0.0495, "step": 1850 }, { "epoch": 0.3261825215403552, "grad_norm": 0.23932726681232452, "learning_rate": 1.5742501352301894e-05, "loss": 0.0558, "step": 1855 }, { "epoch": 0.327061719711623, "grad_norm": 0.4368959069252014, "learning_rate": 1.5719164477688566e-05, "loss": 0.0666, "step": 1860 }, { "epoch": 0.3279409178828908, "grad_norm": 0.2366788238286972, "learning_rate": 1.5695781223845442e-05, "loss": 0.0716, "step": 1865 }, { "epoch": 0.3288201160541586, "grad_norm": 0.643233060836792, "learning_rate": 1.5672351780397653e-05, "loss": 0.0524, "step": 1870 }, { "epoch": 0.3296993142254264, "grad_norm": 0.523089587688446, "learning_rate": 1.5648876337344898e-05, "loss": 0.0615, "step": 1875 }, { "epoch": 0.3305785123966942, "grad_norm": 0.18103045225143433, "learning_rate": 1.5625355085059907e-05, "loss": 0.0622, "step": 1880 }, { "epoch": 0.331457710567962, "grad_norm": 0.35785582661628723, "learning_rate": 1.5601788214286905e-05, "loss": 0.0578, "step": 1885 }, { "epoch": 0.33233690873922983, "grad_norm": 0.586683988571167, "learning_rate": 1.557817591614005e-05, "loss": 0.059, "step": 1890 }, { "epoch": 0.33321610691049763, "grad_norm": 0.43284872174263, "learning_rate": 1.555451838210189e-05, "loss": 0.0553, "step": 1895 }, { "epoch": 0.33409530508176544, "grad_norm": 0.44119471311569214, "learning_rate": 1.553081580402182e-05, "loss": 0.0563, "step": 1900 }, { "epoch": 0.33497450325303324, "grad_norm": 0.4126788377761841, "learning_rate": 1.55070683741145e-05, "loss": 0.0564, "step": 1905 }, { "epoch": 0.33585370142430104, "grad_norm": 0.581628680229187, "learning_rate": 1.548327628495833e-05, "loss": 0.0528, "step": 1910 }, { "epoch": 0.33673289959556885, "grad_norm": 0.49338245391845703, "learning_rate": 1.5459439729493864e-05, "loss": 0.046, "step": 1915 }, { "epoch": 0.33761209776683665, "grad_norm": 0.43671730160713196, "learning_rate": 1.543555890102226e-05, "loss": 0.062, "step": 1920 }, { "epoch": 0.33849129593810445, "grad_norm": 0.6600947976112366, "learning_rate": 1.5411633993203695e-05, "loss": 0.0616, "step": 1925 }, { "epoch": 0.33937049410937226, "grad_norm": 0.6367527842521667, "learning_rate": 1.538766520005581e-05, "loss": 0.0621, "step": 1930 }, { "epoch": 0.34024969228064006, "grad_norm": 0.39215588569641113, "learning_rate": 1.536365271595212e-05, "loss": 0.0659, "step": 1935 }, { "epoch": 0.34112889045190786, "grad_norm": 1.8016176223754883, "learning_rate": 1.5339596735620485e-05, "loss": 0.0596, "step": 1940 }, { "epoch": 0.34200808862317567, "grad_norm": 0.7933741807937622, "learning_rate": 1.5315497454141446e-05, "loss": 0.0602, "step": 1945 }, { "epoch": 0.34288728679444347, "grad_norm": 0.2943550944328308, "learning_rate": 1.529135506694673e-05, "loss": 0.0514, "step": 1950 }, { "epoch": 0.34376648496571127, "grad_norm": 0.20394988358020782, "learning_rate": 1.526716976981761e-05, "loss": 0.0613, "step": 1955 }, { "epoch": 0.3446456831369791, "grad_norm": 0.3414583206176758, "learning_rate": 1.5242941758883341e-05, "loss": 0.0446, "step": 1960 }, { "epoch": 0.3455248813082469, "grad_norm": 0.6178276538848877, "learning_rate": 1.5218671230619558e-05, "loss": 0.0586, "step": 1965 }, { "epoch": 0.3464040794795147, "grad_norm": 0.8685587644577026, "learning_rate": 1.5194358381846686e-05, "loss": 0.0577, "step": 1970 }, { "epoch": 0.3472832776507825, "grad_norm": 0.6029766201972961, "learning_rate": 1.5170003409728358e-05, "loss": 0.053, "step": 1975 }, { "epoch": 0.3481624758220503, "grad_norm": 0.33781400322914124, "learning_rate": 1.5145606511769788e-05, "loss": 0.0625, "step": 1980 }, { "epoch": 0.3490416739933181, "grad_norm": 0.541808009147644, "learning_rate": 1.5121167885816202e-05, "loss": 0.0505, "step": 1985 }, { "epoch": 0.3499208721645859, "grad_norm": 1.0590039491653442, "learning_rate": 1.50966877300512e-05, "loss": 0.0698, "step": 1990 }, { "epoch": 0.3508000703358537, "grad_norm": 0.4376682937145233, "learning_rate": 1.5072166242995177e-05, "loss": 0.066, "step": 1995 }, { "epoch": 0.3516792685071215, "grad_norm": 0.44992053508758545, "learning_rate": 1.5047603623503695e-05, "loss": 0.074, "step": 2000 }, { "epoch": 0.3525584666783893, "grad_norm": 0.5252031683921814, "learning_rate": 1.5023000070765886e-05, "loss": 0.0681, "step": 2005 }, { "epoch": 0.3534376648496571, "grad_norm": 0.6418195366859436, "learning_rate": 1.4998355784302816e-05, "loss": 0.0655, "step": 2010 }, { "epoch": 0.3543168630209249, "grad_norm": 1.0612142086029053, "learning_rate": 1.4973670963965883e-05, "loss": 0.0681, "step": 2015 }, { "epoch": 0.3551960611921927, "grad_norm": 0.288524866104126, "learning_rate": 1.49489458099352e-05, "loss": 0.0472, "step": 2020 }, { "epoch": 0.3560752593634605, "grad_norm": 0.44207853078842163, "learning_rate": 1.4924180522717952e-05, "loss": 0.0629, "step": 2025 }, { "epoch": 0.3569544575347283, "grad_norm": 0.501887857913971, "learning_rate": 1.4899375303146793e-05, "loss": 0.0467, "step": 2030 }, { "epoch": 0.3578336557059961, "grad_norm": 0.3725419044494629, "learning_rate": 1.4874530352378193e-05, "loss": 0.0592, "step": 2035 }, { "epoch": 0.3587128538772639, "grad_norm": 0.3303128480911255, "learning_rate": 1.4849645871890832e-05, "loss": 0.0453, "step": 2040 }, { "epoch": 0.35959205204853173, "grad_norm": 0.12348782271146774, "learning_rate": 1.4824722063483944e-05, "loss": 0.0434, "step": 2045 }, { "epoch": 0.36047125021979953, "grad_norm": 0.381765753030777, "learning_rate": 1.4799759129275703e-05, "loss": 0.0497, "step": 2050 }, { "epoch": 0.36135044839106734, "grad_norm": 0.24817384779453278, "learning_rate": 1.477475727170156e-05, "loss": 0.0495, "step": 2055 }, { "epoch": 0.36222964656233514, "grad_norm": 0.3029944896697998, "learning_rate": 1.4749716693512612e-05, "loss": 0.0463, "step": 2060 }, { "epoch": 0.36310884473360294, "grad_norm": 0.9858243465423584, "learning_rate": 1.4724637597773969e-05, "loss": 0.0769, "step": 2065 }, { "epoch": 0.36398804290487075, "grad_norm": 0.38801249861717224, "learning_rate": 1.469952018786309e-05, "loss": 0.0472, "step": 2070 }, { "epoch": 0.36486724107613855, "grad_norm": 0.15492072701454163, "learning_rate": 1.467436466746814e-05, "loss": 0.0574, "step": 2075 }, { "epoch": 0.36574643924740635, "grad_norm": 0.8615152835845947, "learning_rate": 1.464917124058634e-05, "loss": 0.0651, "step": 2080 }, { "epoch": 0.36662563741867416, "grad_norm": 1.6309150457382202, "learning_rate": 1.4623940111522315e-05, "loss": 0.0559, "step": 2085 }, { "epoch": 0.36750483558994196, "grad_norm": 0.1200883612036705, "learning_rate": 1.4598671484886423e-05, "loss": 0.049, "step": 2090 }, { "epoch": 0.36838403376120976, "grad_norm": 0.3056308627128601, "learning_rate": 1.4573365565593121e-05, "loss": 0.0514, "step": 2095 }, { "epoch": 0.36926323193247756, "grad_norm": 0.3354267179965973, "learning_rate": 1.4548022558859281e-05, "loss": 0.0528, "step": 2100 }, { "epoch": 0.37014243010374537, "grad_norm": 0.7310557961463928, "learning_rate": 1.4522642670202528e-05, "loss": 0.0676, "step": 2105 }, { "epoch": 0.37102162827501317, "grad_norm": 0.6339288353919983, "learning_rate": 1.4497226105439586e-05, "loss": 0.0714, "step": 2110 }, { "epoch": 0.371900826446281, "grad_norm": 0.2613738179206848, "learning_rate": 1.44717730706846e-05, "loss": 0.0586, "step": 2115 }, { "epoch": 0.3727800246175488, "grad_norm": 0.4906211793422699, "learning_rate": 1.4446283772347475e-05, "loss": 0.0599, "step": 2120 }, { "epoch": 0.3736592227888166, "grad_norm": 0.46116968989372253, "learning_rate": 1.4420758417132177e-05, "loss": 0.0537, "step": 2125 }, { "epoch": 0.3745384209600844, "grad_norm": 0.5634986758232117, "learning_rate": 1.4395197212035078e-05, "loss": 0.0552, "step": 2130 }, { "epoch": 0.3754176191313522, "grad_norm": 0.14335110783576965, "learning_rate": 1.4369600364343286e-05, "loss": 0.0587, "step": 2135 }, { "epoch": 0.37629681730262, "grad_norm": 0.17126217484474182, "learning_rate": 1.434396808163293e-05, "loss": 0.066, "step": 2140 }, { "epoch": 0.3771760154738878, "grad_norm": 0.5141481161117554, "learning_rate": 1.4318300571767514e-05, "loss": 0.058, "step": 2145 }, { "epoch": 0.3780552136451556, "grad_norm": 0.19028577208518982, "learning_rate": 1.4292598042896204e-05, "loss": 0.0667, "step": 2150 }, { "epoch": 0.3789344118164234, "grad_norm": 0.20005124807357788, "learning_rate": 1.4266860703452156e-05, "loss": 0.0519, "step": 2155 }, { "epoch": 0.3798136099876912, "grad_norm": 0.36462101340293884, "learning_rate": 1.4241088762150817e-05, "loss": 0.0583, "step": 2160 }, { "epoch": 0.380692808158959, "grad_norm": 0.26748377084732056, "learning_rate": 1.4215282427988242e-05, "loss": 0.0609, "step": 2165 }, { "epoch": 0.3815720063302268, "grad_norm": 0.28044751286506653, "learning_rate": 1.4189441910239383e-05, "loss": 0.053, "step": 2170 }, { "epoch": 0.3824512045014946, "grad_norm": 0.5757772326469421, "learning_rate": 1.4163567418456408e-05, "loss": 0.0651, "step": 2175 }, { "epoch": 0.3833304026727624, "grad_norm": 0.6958622336387634, "learning_rate": 1.4137659162466999e-05, "loss": 0.0529, "step": 2180 }, { "epoch": 0.3842096008440302, "grad_norm": 0.7717348337173462, "learning_rate": 1.4111717352372635e-05, "loss": 0.0498, "step": 2185 }, { "epoch": 0.385088799015298, "grad_norm": 0.4615864157676697, "learning_rate": 1.408574219854692e-05, "loss": 0.0619, "step": 2190 }, { "epoch": 0.3859679971865658, "grad_norm": 0.20736804604530334, "learning_rate": 1.405973391163383e-05, "loss": 0.0516, "step": 2195 }, { "epoch": 0.38684719535783363, "grad_norm": 0.9234808087348938, "learning_rate": 1.4033692702546056e-05, "loss": 0.0553, "step": 2200 }, { "epoch": 0.38772639352910143, "grad_norm": 0.9873343110084534, "learning_rate": 1.4007618782463252e-05, "loss": 0.0683, "step": 2205 }, { "epoch": 0.3886055917003693, "grad_norm": 0.8674870133399963, "learning_rate": 1.3981512362830359e-05, "loss": 0.0553, "step": 2210 }, { "epoch": 0.3894847898716371, "grad_norm": 0.3268803656101227, "learning_rate": 1.3955373655355852e-05, "loss": 0.0461, "step": 2215 }, { "epoch": 0.3903639880429049, "grad_norm": 0.3193112909793854, "learning_rate": 1.392920287201005e-05, "loss": 0.0674, "step": 2220 }, { "epoch": 0.3912431862141727, "grad_norm": 0.3444458842277527, "learning_rate": 1.3903000225023393e-05, "loss": 0.0471, "step": 2225 }, { "epoch": 0.3921223843854405, "grad_norm": 0.6412308812141418, "learning_rate": 1.3876765926884712e-05, "loss": 0.0537, "step": 2230 }, { "epoch": 0.3930015825567083, "grad_norm": 0.34802794456481934, "learning_rate": 1.3850500190339515e-05, "loss": 0.0627, "step": 2235 }, { "epoch": 0.3938807807279761, "grad_norm": 0.3646249771118164, "learning_rate": 1.3824203228388254e-05, "loss": 0.0513, "step": 2240 }, { "epoch": 0.3947599788992439, "grad_norm": 1.2918972969055176, "learning_rate": 1.3797875254284605e-05, "loss": 0.0782, "step": 2245 }, { "epoch": 0.3956391770705117, "grad_norm": 0.2758769094944, "learning_rate": 1.3771516481533733e-05, "loss": 0.0479, "step": 2250 }, { "epoch": 0.3965183752417795, "grad_norm": 0.1739499419927597, "learning_rate": 1.3745127123890565e-05, "loss": 0.0523, "step": 2255 }, { "epoch": 0.3973975734130473, "grad_norm": 0.30399224162101746, "learning_rate": 1.3718707395358053e-05, "loss": 0.0604, "step": 2260 }, { "epoch": 0.3982767715843151, "grad_norm": 0.3504631519317627, "learning_rate": 1.3692257510185439e-05, "loss": 0.0738, "step": 2265 }, { "epoch": 0.39915596975558293, "grad_norm": 0.3818477988243103, "learning_rate": 1.3665777682866521e-05, "loss": 0.0572, "step": 2270 }, { "epoch": 0.40003516792685073, "grad_norm": 0.8020049333572388, "learning_rate": 1.3639268128137908e-05, "loss": 0.0616, "step": 2275 }, { "epoch": 0.40091436609811854, "grad_norm": 0.5802652835845947, "learning_rate": 1.3612729060977287e-05, "loss": 0.0647, "step": 2280 }, { "epoch": 0.40179356426938634, "grad_norm": 0.1555125117301941, "learning_rate": 1.3586160696601667e-05, "loss": 0.0656, "step": 2285 }, { "epoch": 0.40267276244065414, "grad_norm": 0.3963813781738281, "learning_rate": 1.3559563250465645e-05, "loss": 0.0555, "step": 2290 }, { "epoch": 0.40355196061192194, "grad_norm": 0.31865325570106506, "learning_rate": 1.3532936938259658e-05, "loss": 0.0571, "step": 2295 }, { "epoch": 0.40443115878318975, "grad_norm": 0.32378071546554565, "learning_rate": 1.3506281975908224e-05, "loss": 0.065, "step": 2300 }, { "epoch": 0.40531035695445755, "grad_norm": 0.12490954995155334, "learning_rate": 1.3479598579568205e-05, "loss": 0.0529, "step": 2305 }, { "epoch": 0.40618955512572535, "grad_norm": 0.3125900626182556, "learning_rate": 1.3452886965627036e-05, "loss": 0.0408, "step": 2310 }, { "epoch": 0.40706875329699316, "grad_norm": 0.8287737369537354, "learning_rate": 1.3426147350700995e-05, "loss": 0.062, "step": 2315 }, { "epoch": 0.40794795146826096, "grad_norm": 0.39330750703811646, "learning_rate": 1.339937995163342e-05, "loss": 0.0493, "step": 2320 }, { "epoch": 0.40882714963952876, "grad_norm": 0.2195868045091629, "learning_rate": 1.3372584985492972e-05, "loss": 0.0545, "step": 2325 }, { "epoch": 0.40970634781079657, "grad_norm": 0.15690335631370544, "learning_rate": 1.3345762669571855e-05, "loss": 0.0564, "step": 2330 }, { "epoch": 0.41058554598206437, "grad_norm": 0.15210093557834625, "learning_rate": 1.3318913221384078e-05, "loss": 0.0501, "step": 2335 }, { "epoch": 0.4114647441533322, "grad_norm": 0.08087150007486343, "learning_rate": 1.3292036858663671e-05, "loss": 0.0494, "step": 2340 }, { "epoch": 0.4123439423246, "grad_norm": 0.17343612015247345, "learning_rate": 1.3265133799362919e-05, "loss": 0.0568, "step": 2345 }, { "epoch": 0.4132231404958678, "grad_norm": 0.6120114922523499, "learning_rate": 1.3238204261650613e-05, "loss": 0.0819, "step": 2350 }, { "epoch": 0.4141023386671356, "grad_norm": 0.5394456386566162, "learning_rate": 1.3211248463910263e-05, "loss": 0.0574, "step": 2355 }, { "epoch": 0.4149815368384034, "grad_norm": 0.14764389395713806, "learning_rate": 1.3184266624738333e-05, "loss": 0.0588, "step": 2360 }, { "epoch": 0.4158607350096712, "grad_norm": 0.664940595626831, "learning_rate": 1.3157258962942468e-05, "loss": 0.0499, "step": 2365 }, { "epoch": 0.416739933180939, "grad_norm": 0.43065398931503296, "learning_rate": 1.3130225697539725e-05, "loss": 0.056, "step": 2370 }, { "epoch": 0.4176191313522068, "grad_norm": 0.23071400821208954, "learning_rate": 1.3103167047754786e-05, "loss": 0.0505, "step": 2375 }, { "epoch": 0.4184983295234746, "grad_norm": 0.2247258871793747, "learning_rate": 1.3076083233018188e-05, "loss": 0.0572, "step": 2380 }, { "epoch": 0.4193775276947424, "grad_norm": 0.24306868016719818, "learning_rate": 1.3048974472964547e-05, "loss": 0.0751, "step": 2385 }, { "epoch": 0.4202567258660102, "grad_norm": 0.6511367559432983, "learning_rate": 1.3021840987430761e-05, "loss": 0.0612, "step": 2390 }, { "epoch": 0.421135924037278, "grad_norm": 0.5404173731803894, "learning_rate": 1.2994682996454247e-05, "loss": 0.0593, "step": 2395 }, { "epoch": 0.4220151222085458, "grad_norm": 0.1526852548122406, "learning_rate": 1.2967500720271142e-05, "loss": 0.0557, "step": 2400 }, { "epoch": 0.4228943203798136, "grad_norm": 0.2758397161960602, "learning_rate": 1.2940294379314531e-05, "loss": 0.0599, "step": 2405 }, { "epoch": 0.4237735185510814, "grad_norm": 0.6543939709663391, "learning_rate": 1.2913064194212634e-05, "loss": 0.0579, "step": 2410 }, { "epoch": 0.4246527167223492, "grad_norm": 0.5492807030677795, "learning_rate": 1.2885810385787056e-05, "loss": 0.0571, "step": 2415 }, { "epoch": 0.425531914893617, "grad_norm": 0.200227290391922, "learning_rate": 1.2858533175050955e-05, "loss": 0.0514, "step": 2420 }, { "epoch": 0.42641111306488483, "grad_norm": 0.5350441932678223, "learning_rate": 1.2831232783207278e-05, "loss": 0.0492, "step": 2425 }, { "epoch": 0.42729031123615263, "grad_norm": 0.2634308636188507, "learning_rate": 1.2803909431646952e-05, "loss": 0.0511, "step": 2430 }, { "epoch": 0.42816950940742043, "grad_norm": 0.1868433952331543, "learning_rate": 1.2776563341947104e-05, "loss": 0.0483, "step": 2435 }, { "epoch": 0.42904870757868824, "grad_norm": 0.29465433955192566, "learning_rate": 1.2749194735869246e-05, "loss": 0.0543, "step": 2440 }, { "epoch": 0.42992790574995604, "grad_norm": 0.13924354314804077, "learning_rate": 1.2721803835357486e-05, "loss": 0.0564, "step": 2445 }, { "epoch": 0.43080710392122384, "grad_norm": 0.5867925882339478, "learning_rate": 1.2694390862536736e-05, "loss": 0.0651, "step": 2450 }, { "epoch": 0.43168630209249165, "grad_norm": 0.0857938826084137, "learning_rate": 1.2666956039710889e-05, "loss": 0.049, "step": 2455 }, { "epoch": 0.43256550026375945, "grad_norm": 0.5406795740127563, "learning_rate": 1.2639499589361041e-05, "loss": 0.0662, "step": 2460 }, { "epoch": 0.43344469843502725, "grad_norm": 0.6612178087234497, "learning_rate": 1.2612021734143667e-05, "loss": 0.0634, "step": 2465 }, { "epoch": 0.43432389660629506, "grad_norm": 0.9012327194213867, "learning_rate": 1.2584522696888825e-05, "loss": 0.0652, "step": 2470 }, { "epoch": 0.43520309477756286, "grad_norm": 0.2964789569377899, "learning_rate": 1.2557002700598353e-05, "loss": 0.0511, "step": 2475 }, { "epoch": 0.43608229294883066, "grad_norm": 0.22119061648845673, "learning_rate": 1.2529461968444047e-05, "loss": 0.0556, "step": 2480 }, { "epoch": 0.43696149112009847, "grad_norm": 0.35610833764076233, "learning_rate": 1.250190072376587e-05, "loss": 0.0559, "step": 2485 }, { "epoch": 0.43784068929136627, "grad_norm": 0.596125066280365, "learning_rate": 1.2474319190070115e-05, "loss": 0.0562, "step": 2490 }, { "epoch": 0.4387198874626341, "grad_norm": 0.31192147731781006, "learning_rate": 1.2446717591027624e-05, "loss": 0.0581, "step": 2495 }, { "epoch": 0.4395990856339019, "grad_norm": 0.6106126308441162, "learning_rate": 1.2419096150471944e-05, "loss": 0.0599, "step": 2500 }, { "epoch": 0.4404782838051697, "grad_norm": 0.33565065264701843, "learning_rate": 1.2391455092397535e-05, "loss": 0.0627, "step": 2505 }, { "epoch": 0.4413574819764375, "grad_norm": 0.3373861610889435, "learning_rate": 1.236379464095794e-05, "loss": 0.0605, "step": 2510 }, { "epoch": 0.4422366801477053, "grad_norm": 0.2420874387025833, "learning_rate": 1.233611502046397e-05, "loss": 0.0584, "step": 2515 }, { "epoch": 0.4431158783189731, "grad_norm": 0.17154277861118317, "learning_rate": 1.2308416455381891e-05, "loss": 0.0428, "step": 2520 }, { "epoch": 0.4439950764902409, "grad_norm": 0.4789488613605499, "learning_rate": 1.2280699170331593e-05, "loss": 0.0512, "step": 2525 }, { "epoch": 0.4448742746615087, "grad_norm": 0.2340506613254547, "learning_rate": 1.2252963390084784e-05, "loss": 0.0586, "step": 2530 }, { "epoch": 0.4457534728327765, "grad_norm": 0.3497225046157837, "learning_rate": 1.2225209339563144e-05, "loss": 0.0597, "step": 2535 }, { "epoch": 0.4466326710040443, "grad_norm": 0.22254657745361328, "learning_rate": 1.2197437243836529e-05, "loss": 0.0784, "step": 2540 }, { "epoch": 0.4475118691753121, "grad_norm": 0.3253319561481476, "learning_rate": 1.2169647328121119e-05, "loss": 0.0575, "step": 2545 }, { "epoch": 0.4483910673465799, "grad_norm": 0.455172061920166, "learning_rate": 1.2141839817777616e-05, "loss": 0.06, "step": 2550 }, { "epoch": 0.4492702655178477, "grad_norm": 0.30848458409309387, "learning_rate": 1.2114014938309393e-05, "loss": 0.0583, "step": 2555 }, { "epoch": 0.4501494636891155, "grad_norm": 0.38188642263412476, "learning_rate": 1.2086172915360684e-05, "loss": 0.0575, "step": 2560 }, { "epoch": 0.4510286618603833, "grad_norm": 0.2141093611717224, "learning_rate": 1.2058313974714746e-05, "loss": 0.0678, "step": 2565 }, { "epoch": 0.4519078600316511, "grad_norm": 0.37475940585136414, "learning_rate": 1.2030438342292028e-05, "loss": 0.0621, "step": 2570 }, { "epoch": 0.4527870582029189, "grad_norm": 0.5214441418647766, "learning_rate": 1.2002546244148345e-05, "loss": 0.0559, "step": 2575 }, { "epoch": 0.4536662563741867, "grad_norm": 0.5643457174301147, "learning_rate": 1.197463790647303e-05, "loss": 0.0551, "step": 2580 }, { "epoch": 0.45454545454545453, "grad_norm": 0.2806403338909149, "learning_rate": 1.1946713555587115e-05, "loss": 0.059, "step": 2585 }, { "epoch": 0.45542465271672233, "grad_norm": 0.7372429370880127, "learning_rate": 1.1918773417941494e-05, "loss": 0.0649, "step": 2590 }, { "epoch": 0.45630385088799014, "grad_norm": 0.9332758188247681, "learning_rate": 1.1890817720115075e-05, "loss": 0.0519, "step": 2595 }, { "epoch": 0.45718304905925794, "grad_norm": 0.36277708411216736, "learning_rate": 1.1862846688812956e-05, "loss": 0.0605, "step": 2600 }, { "epoch": 0.45806224723052574, "grad_norm": 0.19159919023513794, "learning_rate": 1.183486055086458e-05, "loss": 0.0528, "step": 2605 }, { "epoch": 0.45894144540179355, "grad_norm": 0.21766141057014465, "learning_rate": 1.1806859533221896e-05, "loss": 0.0587, "step": 2610 }, { "epoch": 0.45982064357306135, "grad_norm": 0.8714537024497986, "learning_rate": 1.1778843862957515e-05, "loss": 0.0695, "step": 2615 }, { "epoch": 0.46069984174432915, "grad_norm": 0.25225165486335754, "learning_rate": 1.1750813767262879e-05, "loss": 0.0551, "step": 2620 }, { "epoch": 0.46157903991559696, "grad_norm": 0.2513810992240906, "learning_rate": 1.1722769473446412e-05, "loss": 0.0604, "step": 2625 }, { "epoch": 0.46245823808686476, "grad_norm": 0.1506785750389099, "learning_rate": 1.1694711208931668e-05, "loss": 0.0562, "step": 2630 }, { "epoch": 0.46333743625813256, "grad_norm": 0.20237652957439423, "learning_rate": 1.1666639201255507e-05, "loss": 0.0526, "step": 2635 }, { "epoch": 0.46421663442940037, "grad_norm": 0.5076216459274292, "learning_rate": 1.163855367806623e-05, "loss": 0.0594, "step": 2640 }, { "epoch": 0.46509583260066817, "grad_norm": 0.3569527268409729, "learning_rate": 1.1610454867121747e-05, "loss": 0.0594, "step": 2645 }, { "epoch": 0.46597503077193597, "grad_norm": 0.19620120525360107, "learning_rate": 1.158234299628772e-05, "loss": 0.0658, "step": 2650 }, { "epoch": 0.4668542289432038, "grad_norm": 0.46498578786849976, "learning_rate": 1.1554218293535727e-05, "loss": 0.0606, "step": 2655 }, { "epoch": 0.4677334271144716, "grad_norm": 0.2302287071943283, "learning_rate": 1.1526080986941389e-05, "loss": 0.0589, "step": 2660 }, { "epoch": 0.4686126252857394, "grad_norm": 0.39261671900749207, "learning_rate": 1.1497931304682554e-05, "loss": 0.0486, "step": 2665 }, { "epoch": 0.4694918234570072, "grad_norm": 1.2169684171676636, "learning_rate": 1.1469769475037427e-05, "loss": 0.0666, "step": 2670 }, { "epoch": 0.470371021628275, "grad_norm": 0.7795551419258118, "learning_rate": 1.144159572638271e-05, "loss": 0.0614, "step": 2675 }, { "epoch": 0.4712502197995428, "grad_norm": 0.5861936211585999, "learning_rate": 1.141341028719178e-05, "loss": 0.0522, "step": 2680 }, { "epoch": 0.4721294179708106, "grad_norm": 0.3658725917339325, "learning_rate": 1.1385213386032797e-05, "loss": 0.0506, "step": 2685 }, { "epoch": 0.4730086161420784, "grad_norm": 0.5009949803352356, "learning_rate": 1.1357005251566888e-05, "loss": 0.0716, "step": 2690 }, { "epoch": 0.4738878143133462, "grad_norm": 0.34323883056640625, "learning_rate": 1.1328786112546268e-05, "loss": 0.0772, "step": 2695 }, { "epoch": 0.47476701248461406, "grad_norm": 0.5411744713783264, "learning_rate": 1.1300556197812393e-05, "loss": 0.0604, "step": 2700 }, { "epoch": 0.47564621065588186, "grad_norm": 0.5839160680770874, "learning_rate": 1.1272315736294108e-05, "loss": 0.0575, "step": 2705 }, { "epoch": 0.47652540882714967, "grad_norm": 0.2879360020160675, "learning_rate": 1.1244064957005782e-05, "loss": 0.0572, "step": 2710 }, { "epoch": 0.47740460699841747, "grad_norm": 0.309555321931839, "learning_rate": 1.121580408904546e-05, "loss": 0.0679, "step": 2715 }, { "epoch": 0.47828380516968527, "grad_norm": 0.31096187233924866, "learning_rate": 1.1187533361592988e-05, "loss": 0.0672, "step": 2720 }, { "epoch": 0.4791630033409531, "grad_norm": 0.24858668446540833, "learning_rate": 1.1159253003908188e-05, "loss": 0.0604, "step": 2725 }, { "epoch": 0.4800422015122209, "grad_norm": 0.38457539677619934, "learning_rate": 1.113096324532896e-05, "loss": 0.0656, "step": 2730 }, { "epoch": 0.4809213996834887, "grad_norm": 0.20690588653087616, "learning_rate": 1.1102664315269452e-05, "loss": 0.0612, "step": 2735 }, { "epoch": 0.4818005978547565, "grad_norm": 0.8975262641906738, "learning_rate": 1.1074356443218175e-05, "loss": 0.0552, "step": 2740 }, { "epoch": 0.4826797960260243, "grad_norm": 0.31369662284851074, "learning_rate": 1.1046039858736167e-05, "loss": 0.0685, "step": 2745 }, { "epoch": 0.4835589941972921, "grad_norm": 0.1889607012271881, "learning_rate": 1.101771479145511e-05, "loss": 0.0642, "step": 2750 }, { "epoch": 0.4844381923685599, "grad_norm": 0.4452875256538391, "learning_rate": 1.0989381471075481e-05, "loss": 0.0643, "step": 2755 }, { "epoch": 0.4853173905398277, "grad_norm": 0.9008978009223938, "learning_rate": 1.0961040127364688e-05, "loss": 0.0634, "step": 2760 }, { "epoch": 0.4861965887110955, "grad_norm": 1.0086400508880615, "learning_rate": 1.0932690990155195e-05, "loss": 0.053, "step": 2765 }, { "epoch": 0.4870757868823633, "grad_norm": 0.3842066526412964, "learning_rate": 1.0904334289342675e-05, "loss": 0.0548, "step": 2770 }, { "epoch": 0.4879549850536311, "grad_norm": 0.23029272258281708, "learning_rate": 1.087597025488413e-05, "loss": 0.0417, "step": 2775 }, { "epoch": 0.4888341832248989, "grad_norm": 0.9451369643211365, "learning_rate": 1.0847599116796047e-05, "loss": 0.0535, "step": 2780 }, { "epoch": 0.4897133813961667, "grad_norm": 1.2543659210205078, "learning_rate": 1.0819221105152504e-05, "loss": 0.0644, "step": 2785 }, { "epoch": 0.4905925795674345, "grad_norm": 0.2994709014892578, "learning_rate": 1.0790836450083327e-05, "loss": 0.053, "step": 2790 }, { "epoch": 0.4914717777387023, "grad_norm": 0.43622636795043945, "learning_rate": 1.0762445381772217e-05, "loss": 0.0609, "step": 2795 }, { "epoch": 0.4923509759099701, "grad_norm": 0.27832546830177307, "learning_rate": 1.0734048130454882e-05, "loss": 0.0642, "step": 2800 }, { "epoch": 0.4932301740812379, "grad_norm": 1.0674911737442017, "learning_rate": 1.0705644926417172e-05, "loss": 0.0445, "step": 2805 }, { "epoch": 0.49410937225250573, "grad_norm": 0.5751109719276428, "learning_rate": 1.0677235999993205e-05, "loss": 0.0482, "step": 2810 }, { "epoch": 0.49498857042377353, "grad_norm": 0.15411067008972168, "learning_rate": 1.0648821581563514e-05, "loss": 0.0561, "step": 2815 }, { "epoch": 0.49586776859504134, "grad_norm": 0.29986828565597534, "learning_rate": 1.0620401901553155e-05, "loss": 0.0655, "step": 2820 }, { "epoch": 0.49674696676630914, "grad_norm": 0.6311131119728088, "learning_rate": 1.0591977190429868e-05, "loss": 0.0519, "step": 2825 }, { "epoch": 0.49762616493757694, "grad_norm": 0.30874118208885193, "learning_rate": 1.056354767870218e-05, "loss": 0.0581, "step": 2830 }, { "epoch": 0.49850536310884475, "grad_norm": 0.38604286313056946, "learning_rate": 1.0535113596917556e-05, "loss": 0.0627, "step": 2835 }, { "epoch": 0.49938456128011255, "grad_norm": 0.16394232213497162, "learning_rate": 1.0506675175660519e-05, "loss": 0.0591, "step": 2840 }, { "epoch": 0.5002637594513804, "grad_norm": 0.5202212929725647, "learning_rate": 1.0478232645550784e-05, "loss": 0.0585, "step": 2845 }, { "epoch": 0.5011429576226482, "grad_norm": 0.18142499029636383, "learning_rate": 1.0449786237241382e-05, "loss": 0.0603, "step": 2850 }, { "epoch": 0.502022155793916, "grad_norm": 0.38024476170539856, "learning_rate": 1.0421336181416796e-05, "loss": 0.0712, "step": 2855 }, { "epoch": 0.5029013539651838, "grad_norm": 0.28926122188568115, "learning_rate": 1.03928827087911e-05, "loss": 0.0669, "step": 2860 }, { "epoch": 0.5037805521364516, "grad_norm": 0.4077168405056, "learning_rate": 1.036442605010605e-05, "loss": 0.0616, "step": 2865 }, { "epoch": 0.5046597503077194, "grad_norm": 0.4079400300979614, "learning_rate": 1.0335966436129268e-05, "loss": 0.058, "step": 2870 }, { "epoch": 0.5055389484789872, "grad_norm": 0.5996482968330383, "learning_rate": 1.0307504097652323e-05, "loss": 0.0512, "step": 2875 }, { "epoch": 0.506418146650255, "grad_norm": 0.09414978325366974, "learning_rate": 1.0279039265488885e-05, "loss": 0.0519, "step": 2880 }, { "epoch": 0.5072973448215228, "grad_norm": 0.3783423602581024, "learning_rate": 1.0250572170472848e-05, "loss": 0.0599, "step": 2885 }, { "epoch": 0.5081765429927906, "grad_norm": 0.7148971557617188, "learning_rate": 1.0222103043456447e-05, "loss": 0.0681, "step": 2890 }, { "epoch": 0.5090557411640584, "grad_norm": 0.29532909393310547, "learning_rate": 1.0193632115308412e-05, "loss": 0.0628, "step": 2895 }, { "epoch": 0.5099349393353262, "grad_norm": 0.231553316116333, "learning_rate": 1.016515961691206e-05, "loss": 0.0489, "step": 2900 }, { "epoch": 0.510814137506594, "grad_norm": 0.6670016646385193, "learning_rate": 1.0136685779163458e-05, "loss": 0.0596, "step": 2905 }, { "epoch": 0.5116933356778618, "grad_norm": 0.8102641105651855, "learning_rate": 1.010821083296952e-05, "loss": 0.0563, "step": 2910 }, { "epoch": 0.5125725338491296, "grad_norm": 0.21613669395446777, "learning_rate": 1.0079735009246168e-05, "loss": 0.0571, "step": 2915 }, { "epoch": 0.5134517320203974, "grad_norm": 0.34142959117889404, "learning_rate": 1.0051258538916422e-05, "loss": 0.0611, "step": 2920 }, { "epoch": 0.5143309301916652, "grad_norm": 0.5886263847351074, "learning_rate": 1.0022781652908549e-05, "loss": 0.0596, "step": 2925 }, { "epoch": 0.515210128362933, "grad_norm": 0.37875401973724365, "learning_rate": 9.994304582154197e-06, "loss": 0.045, "step": 2930 }, { "epoch": 0.5160893265342008, "grad_norm": 0.6814181804656982, "learning_rate": 9.9658275575865e-06, "loss": 0.0399, "step": 2935 }, { "epoch": 0.5169685247054686, "grad_norm": 0.6115921139717102, "learning_rate": 9.93735081013823e-06, "loss": 0.0586, "step": 2940 }, { "epoch": 0.5178477228767364, "grad_norm": 0.7454824447631836, "learning_rate": 9.908874570739899e-06, "loss": 0.0623, "step": 2945 }, { "epoch": 0.5187269210480042, "grad_norm": 0.16522662341594696, "learning_rate": 9.880399070317907e-06, "loss": 0.0578, "step": 2950 }, { "epoch": 0.519606119219272, "grad_norm": 0.5657525658607483, "learning_rate": 9.851924539792656e-06, "loss": 0.0468, "step": 2955 }, { "epoch": 0.5204853173905398, "grad_norm": 0.1268445998430252, "learning_rate": 9.823451210076691e-06, "loss": 0.0525, "step": 2960 }, { "epoch": 0.5213645155618076, "grad_norm": 0.3536778390407562, "learning_rate": 9.794979312072807e-06, "loss": 0.0557, "step": 2965 }, { "epoch": 0.5222437137330754, "grad_norm": 0.9080764651298523, "learning_rate": 9.766509076672204e-06, "loss": 0.0611, "step": 2970 }, { "epoch": 0.5231229119043432, "grad_norm": 0.3755652904510498, "learning_rate": 9.738040734752582e-06, "loss": 0.0683, "step": 2975 }, { "epoch": 0.524002110075611, "grad_norm": 0.6789637207984924, "learning_rate": 9.709574517176301e-06, "loss": 0.0475, "step": 2980 }, { "epoch": 0.5248813082468788, "grad_norm": 0.7782059907913208, "learning_rate": 9.681110654788483e-06, "loss": 0.0521, "step": 2985 }, { "epoch": 0.5257605064181466, "grad_norm": 0.20607317984104156, "learning_rate": 9.65264937841516e-06, "loss": 0.0517, "step": 2990 }, { "epoch": 0.5266397045894144, "grad_norm": 0.18635804951190948, "learning_rate": 9.62419091886138e-06, "loss": 0.0545, "step": 2995 }, { "epoch": 0.5275189027606823, "grad_norm": 0.26298439502716064, "learning_rate": 9.595735506909365e-06, "loss": 0.0529, "step": 3000 }, { "epoch": 0.52839810093195, "grad_norm": 0.4759673774242401, "learning_rate": 9.567283373316608e-06, "loss": 0.0544, "step": 3005 }, { "epoch": 0.5292772991032179, "grad_norm": 0.2273968905210495, "learning_rate": 9.538834748814028e-06, "loss": 0.0643, "step": 3010 }, { "epoch": 0.5301564972744857, "grad_norm": 0.27050191164016724, "learning_rate": 9.510389864104069e-06, "loss": 0.057, "step": 3015 }, { "epoch": 0.5310356954457535, "grad_norm": 0.23292891681194305, "learning_rate": 9.481948949858876e-06, "loss": 0.0656, "step": 3020 }, { "epoch": 0.5319148936170213, "grad_norm": 0.7964295744895935, "learning_rate": 9.453512236718365e-06, "loss": 0.0506, "step": 3025 }, { "epoch": 0.5327940917882891, "grad_norm": 0.46331024169921875, "learning_rate": 9.42507995528841e-06, "loss": 0.0612, "step": 3030 }, { "epoch": 0.5336732899595569, "grad_norm": 0.5655067563056946, "learning_rate": 9.396652336138923e-06, "loss": 0.0585, "step": 3035 }, { "epoch": 0.5345524881308247, "grad_norm": 0.18470239639282227, "learning_rate": 9.368229609802028e-06, "loss": 0.048, "step": 3040 }, { "epoch": 0.5354316863020925, "grad_norm": 0.7371414303779602, "learning_rate": 9.339812006770154e-06, "loss": 0.0526, "step": 3045 }, { "epoch": 0.5363108844733603, "grad_norm": 0.8986045122146606, "learning_rate": 9.311399757494196e-06, "loss": 0.0584, "step": 3050 }, { "epoch": 0.5371900826446281, "grad_norm": 0.4300435185432434, "learning_rate": 9.282993092381626e-06, "loss": 0.0519, "step": 3055 }, { "epoch": 0.5380692808158959, "grad_norm": 0.6866196393966675, "learning_rate": 9.254592241794633e-06, "loss": 0.0567, "step": 3060 }, { "epoch": 0.5389484789871637, "grad_norm": 0.17242960631847382, "learning_rate": 9.226197436048252e-06, "loss": 0.0611, "step": 3065 }, { "epoch": 0.5398276771584315, "grad_norm": 0.6466073989868164, "learning_rate": 9.197808905408504e-06, "loss": 0.0623, "step": 3070 }, { "epoch": 0.5407068753296993, "grad_norm": 0.20931695401668549, "learning_rate": 9.169426880090509e-06, "loss": 0.0606, "step": 3075 }, { "epoch": 0.5415860735009671, "grad_norm": 0.21108533442020416, "learning_rate": 9.141051590256651e-06, "loss": 0.0456, "step": 3080 }, { "epoch": 0.5424652716722349, "grad_norm": 0.1871514916419983, "learning_rate": 9.112683266014677e-06, "loss": 0.0468, "step": 3085 }, { "epoch": 0.5433444698435027, "grad_norm": 0.25235074758529663, "learning_rate": 9.084322137415855e-06, "loss": 0.0524, "step": 3090 }, { "epoch": 0.5442236680147705, "grad_norm": 0.6398855447769165, "learning_rate": 9.055968434453096e-06, "loss": 0.0523, "step": 3095 }, { "epoch": 0.5451028661860383, "grad_norm": 0.6872847676277161, "learning_rate": 9.027622387059103e-06, "loss": 0.0456, "step": 3100 }, { "epoch": 0.5459820643573061, "grad_norm": 0.7495630383491516, "learning_rate": 8.999284225104476e-06, "loss": 0.0673, "step": 3105 }, { "epoch": 0.5468612625285739, "grad_norm": 0.18154507875442505, "learning_rate": 8.970954178395894e-06, "loss": 0.0511, "step": 3110 }, { "epoch": 0.5477404606998417, "grad_norm": 0.16267339885234833, "learning_rate": 8.94263247667421e-06, "loss": 0.0475, "step": 3115 }, { "epoch": 0.5486196588711095, "grad_norm": 0.0940733551979065, "learning_rate": 8.914319349612607e-06, "loss": 0.0659, "step": 3120 }, { "epoch": 0.5494988570423773, "grad_norm": 0.5106806755065918, "learning_rate": 8.886015026814736e-06, "loss": 0.0532, "step": 3125 }, { "epoch": 0.5503780552136451, "grad_norm": 0.2427133470773697, "learning_rate": 8.857719737812836e-06, "loss": 0.0523, "step": 3130 }, { "epoch": 0.5512572533849129, "grad_norm": 0.36954110860824585, "learning_rate": 8.829433712065915e-06, "loss": 0.061, "step": 3135 }, { "epoch": 0.5521364515561807, "grad_norm": 0.892874538898468, "learning_rate": 8.801157178957827e-06, "loss": 0.0491, "step": 3140 }, { "epoch": 0.5530156497274485, "grad_norm": 0.3820838928222656, "learning_rate": 8.772890367795476e-06, "loss": 0.048, "step": 3145 }, { "epoch": 0.5538948478987163, "grad_norm": 1.087281346321106, "learning_rate": 8.744633507806907e-06, "loss": 0.0608, "step": 3150 }, { "epoch": 0.5547740460699842, "grad_norm": 0.2828584909439087, "learning_rate": 8.716386828139478e-06, "loss": 0.0616, "step": 3155 }, { "epoch": 0.555653244241252, "grad_norm": 0.6577861309051514, "learning_rate": 8.688150557857979e-06, "loss": 0.0586, "step": 3160 }, { "epoch": 0.5565324424125198, "grad_norm": 0.45899686217308044, "learning_rate": 8.659924925942798e-06, "loss": 0.058, "step": 3165 }, { "epoch": 0.5574116405837876, "grad_norm": 0.12640362977981567, "learning_rate": 8.631710161288043e-06, "loss": 0.0628, "step": 3170 }, { "epoch": 0.5582908387550554, "grad_norm": 0.478767454624176, "learning_rate": 8.603506492699698e-06, "loss": 0.0636, "step": 3175 }, { "epoch": 0.5591700369263232, "grad_norm": 0.22441603243350983, "learning_rate": 8.575314148893765e-06, "loss": 0.0461, "step": 3180 }, { "epoch": 0.560049235097591, "grad_norm": 0.9642320275306702, "learning_rate": 8.547133358494408e-06, "loss": 0.0541, "step": 3185 }, { "epoch": 0.5609284332688588, "grad_norm": 0.4824267029762268, "learning_rate": 8.518964350032092e-06, "loss": 0.0516, "step": 3190 }, { "epoch": 0.5618076314401266, "grad_norm": 0.26887422800064087, "learning_rate": 8.490807351941753e-06, "loss": 0.0497, "step": 3195 }, { "epoch": 0.5626868296113944, "grad_norm": 0.4273674488067627, "learning_rate": 8.462662592560911e-06, "loss": 0.0573, "step": 3200 }, { "epoch": 0.5635660277826622, "grad_norm": 0.385642409324646, "learning_rate": 8.434530300127853e-06, "loss": 0.0592, "step": 3205 }, { "epoch": 0.56444522595393, "grad_norm": 0.2365254908800125, "learning_rate": 8.406410702779754e-06, "loss": 0.0597, "step": 3210 }, { "epoch": 0.5653244241251978, "grad_norm": 0.20545728504657745, "learning_rate": 8.378304028550848e-06, "loss": 0.0524, "step": 3215 }, { "epoch": 0.5662036222964656, "grad_norm": 0.45571327209472656, "learning_rate": 8.35021050537056e-06, "loss": 0.0651, "step": 3220 }, { "epoch": 0.5670828204677334, "grad_norm": 0.3450813591480255, "learning_rate": 8.32213036106168e-06, "loss": 0.0534, "step": 3225 }, { "epoch": 0.5679620186390012, "grad_norm": 0.8915445804595947, "learning_rate": 8.294063823338486e-06, "loss": 0.0607, "step": 3230 }, { "epoch": 0.568841216810269, "grad_norm": 0.3208938539028168, "learning_rate": 8.266011119804937e-06, "loss": 0.0487, "step": 3235 }, { "epoch": 0.5697204149815368, "grad_norm": 0.3896248936653137, "learning_rate": 8.237972477952779e-06, "loss": 0.0534, "step": 3240 }, { "epoch": 0.5705996131528046, "grad_norm": 0.28958839178085327, "learning_rate": 8.209948125159745e-06, "loss": 0.0564, "step": 3245 }, { "epoch": 0.5714788113240724, "grad_norm": 0.8576890230178833, "learning_rate": 8.181938288687683e-06, "loss": 0.0602, "step": 3250 }, { "epoch": 0.5723580094953402, "grad_norm": 1.088234305381775, "learning_rate": 8.153943195680724e-06, "loss": 0.0578, "step": 3255 }, { "epoch": 0.573237207666608, "grad_norm": 0.2483123540878296, "learning_rate": 8.125963073163435e-06, "loss": 0.0588, "step": 3260 }, { "epoch": 0.5741164058378758, "grad_norm": 0.21865952014923096, "learning_rate": 8.097998148038986e-06, "loss": 0.0587, "step": 3265 }, { "epoch": 0.5749956040091436, "grad_norm": 0.44943463802337646, "learning_rate": 8.070048647087298e-06, "loss": 0.0542, "step": 3270 }, { "epoch": 0.5758748021804114, "grad_norm": 0.1797959953546524, "learning_rate": 8.042114796963219e-06, "loss": 0.0541, "step": 3275 }, { "epoch": 0.5767540003516792, "grad_norm": 0.7815648317337036, "learning_rate": 8.014196824194668e-06, "loss": 0.0466, "step": 3280 }, { "epoch": 0.577633198522947, "grad_norm": 0.5617738366127014, "learning_rate": 7.986294955180815e-06, "loss": 0.0521, "step": 3285 }, { "epoch": 0.5785123966942148, "grad_norm": 0.35597896575927734, "learning_rate": 7.958409416190233e-06, "loss": 0.0611, "step": 3290 }, { "epoch": 0.5793915948654826, "grad_norm": 0.4443993866443634, "learning_rate": 7.93054043335907e-06, "loss": 0.0641, "step": 3295 }, { "epoch": 0.5802707930367504, "grad_norm": 0.5022766590118408, "learning_rate": 7.902688232689212e-06, "loss": 0.0489, "step": 3300 }, { "epoch": 0.5811499912080182, "grad_norm": 0.6365528702735901, "learning_rate": 7.874853040046455e-06, "loss": 0.0686, "step": 3305 }, { "epoch": 0.582029189379286, "grad_norm": 0.3193661570549011, "learning_rate": 7.847035081158654e-06, "loss": 0.0552, "step": 3310 }, { "epoch": 0.5829083875505539, "grad_norm": 0.3971255123615265, "learning_rate": 7.819234581613934e-06, "loss": 0.068, "step": 3315 }, { "epoch": 0.5837875857218217, "grad_norm": 0.4976908564567566, "learning_rate": 7.791451766858808e-06, "loss": 0.0508, "step": 3320 }, { "epoch": 0.5846667838930895, "grad_norm": 0.25572288036346436, "learning_rate": 7.763686862196397e-06, "loss": 0.0594, "step": 3325 }, { "epoch": 0.5855459820643573, "grad_norm": 0.7076115012168884, "learning_rate": 7.735940092784564e-06, "loss": 0.0649, "step": 3330 }, { "epoch": 0.5864251802356251, "grad_norm": 0.33293214440345764, "learning_rate": 7.708211683634112e-06, "loss": 0.0528, "step": 3335 }, { "epoch": 0.5873043784068929, "grad_norm": 0.33272784948349, "learning_rate": 7.680501859606961e-06, "loss": 0.0505, "step": 3340 }, { "epoch": 0.5881835765781607, "grad_norm": 0.4092160165309906, "learning_rate": 7.652810845414297e-06, "loss": 0.0437, "step": 3345 }, { "epoch": 0.5890627747494285, "grad_norm": 0.24491731822490692, "learning_rate": 7.625138865614795e-06, "loss": 0.0635, "step": 3350 }, { "epoch": 0.5899419729206963, "grad_norm": 0.6766128540039062, "learning_rate": 7.597486144612741e-06, "loss": 0.0473, "step": 3355 }, { "epoch": 0.5908211710919641, "grad_norm": 0.27820366621017456, "learning_rate": 7.569852906656269e-06, "loss": 0.0521, "step": 3360 }, { "epoch": 0.5917003692632319, "grad_norm": 0.8893203735351562, "learning_rate": 7.542239375835499e-06, "loss": 0.0644, "step": 3365 }, { "epoch": 0.5925795674344997, "grad_norm": 0.4650630056858063, "learning_rate": 7.514645776080747e-06, "loss": 0.0694, "step": 3370 }, { "epoch": 0.5934587656057675, "grad_norm": 0.42498430609703064, "learning_rate": 7.487072331160696e-06, "loss": 0.0588, "step": 3375 }, { "epoch": 0.5943379637770353, "grad_norm": 0.5707778334617615, "learning_rate": 7.459519264680586e-06, "loss": 0.0655, "step": 3380 }, { "epoch": 0.5952171619483031, "grad_norm": 0.4820030629634857, "learning_rate": 7.431986800080394e-06, "loss": 0.0765, "step": 3385 }, { "epoch": 0.5960963601195709, "grad_norm": 1.1453475952148438, "learning_rate": 7.4044751606330365e-06, "loss": 0.062, "step": 3390 }, { "epoch": 0.5969755582908387, "grad_norm": 0.18737494945526123, "learning_rate": 7.37698456944254e-06, "loss": 0.0495, "step": 3395 }, { "epoch": 0.5978547564621065, "grad_norm": 0.49220606684684753, "learning_rate": 7.349515249442248e-06, "loss": 0.0575, "step": 3400 }, { "epoch": 0.5987339546333743, "grad_norm": 0.1627589464187622, "learning_rate": 7.322067423393002e-06, "loss": 0.0556, "step": 3405 }, { "epoch": 0.5996131528046421, "grad_norm": 0.5828942060470581, "learning_rate": 7.294641313881348e-06, "loss": 0.0597, "step": 3410 }, { "epoch": 0.6004923509759099, "grad_norm": 0.5319778323173523, "learning_rate": 7.267237143317707e-06, "loss": 0.0579, "step": 3415 }, { "epoch": 0.6013715491471777, "grad_norm": 0.14266423881053925, "learning_rate": 7.239855133934608e-06, "loss": 0.0591, "step": 3420 }, { "epoch": 0.6022507473184455, "grad_norm": 0.13865630328655243, "learning_rate": 7.212495507784843e-06, "loss": 0.0589, "step": 3425 }, { "epoch": 0.6031299454897133, "grad_norm": 0.7919948697090149, "learning_rate": 7.185158486739712e-06, "loss": 0.052, "step": 3430 }, { "epoch": 0.6040091436609812, "grad_norm": 0.4793383777141571, "learning_rate": 7.157844292487174e-06, "loss": 0.0637, "step": 3435 }, { "epoch": 0.604888341832249, "grad_norm": 0.2558303773403168, "learning_rate": 7.130553146530105e-06, "loss": 0.0724, "step": 3440 }, { "epoch": 0.6057675400035168, "grad_norm": 0.33608704805374146, "learning_rate": 7.103285270184446e-06, "loss": 0.0502, "step": 3445 }, { "epoch": 0.6066467381747846, "grad_norm": 0.31324923038482666, "learning_rate": 7.076040884577449e-06, "loss": 0.0559, "step": 3450 }, { "epoch": 0.6075259363460525, "grad_norm": 0.13528025150299072, "learning_rate": 7.048820210645862e-06, "loss": 0.0579, "step": 3455 }, { "epoch": 0.6084051345173203, "grad_norm": 0.12272872775793076, "learning_rate": 7.021623469134156e-06, "loss": 0.0573, "step": 3460 }, { "epoch": 0.6092843326885881, "grad_norm": 0.2325238287448883, "learning_rate": 6.994450880592706e-06, "loss": 0.0698, "step": 3465 }, { "epoch": 0.6101635308598559, "grad_norm": 0.27444854378700256, "learning_rate": 6.967302665376037e-06, "loss": 0.0605, "step": 3470 }, { "epoch": 0.6110427290311237, "grad_norm": 0.7133885622024536, "learning_rate": 6.940179043641005e-06, "loss": 0.055, "step": 3475 }, { "epoch": 0.6119219272023915, "grad_norm": 0.22960059344768524, "learning_rate": 6.913080235345042e-06, "loss": 0.0635, "step": 3480 }, { "epoch": 0.6128011253736593, "grad_norm": 0.4592248201370239, "learning_rate": 6.886006460244342e-06, "loss": 0.0575, "step": 3485 }, { "epoch": 0.6136803235449271, "grad_norm": 0.21212299168109894, "learning_rate": 6.858957937892105e-06, "loss": 0.0607, "step": 3490 }, { "epoch": 0.6145595217161949, "grad_norm": 0.23819631338119507, "learning_rate": 6.831934887636737e-06, "loss": 0.0512, "step": 3495 }, { "epoch": 0.6154387198874627, "grad_norm": 0.3616998493671417, "learning_rate": 6.804937528620088e-06, "loss": 0.0613, "step": 3500 }, { "epoch": 0.6163179180587305, "grad_norm": 0.21602647006511688, "learning_rate": 6.777966079775657e-06, "loss": 0.0648, "step": 3505 }, { "epoch": 0.6171971162299983, "grad_norm": 0.3276112973690033, "learning_rate": 6.751020759826836e-06, "loss": 0.0496, "step": 3510 }, { "epoch": 0.6180763144012661, "grad_norm": 0.5276904106140137, "learning_rate": 6.724101787285113e-06, "loss": 0.057, "step": 3515 }, { "epoch": 0.6189555125725339, "grad_norm": 0.3810863196849823, "learning_rate": 6.697209380448333e-06, "loss": 0.0584, "step": 3520 }, { "epoch": 0.6198347107438017, "grad_norm": 0.2644079029560089, "learning_rate": 6.670343757398882e-06, "loss": 0.0657, "step": 3525 }, { "epoch": 0.6207139089150695, "grad_norm": 0.24872681498527527, "learning_rate": 6.643505136001972e-06, "loss": 0.0435, "step": 3530 }, { "epoch": 0.6215931070863373, "grad_norm": 0.15694132447242737, "learning_rate": 6.616693733903823e-06, "loss": 0.053, "step": 3535 }, { "epoch": 0.6224723052576051, "grad_norm": 0.58054518699646, "learning_rate": 6.5899097685299395e-06, "loss": 0.0735, "step": 3540 }, { "epoch": 0.6233515034288729, "grad_norm": 0.479245662689209, "learning_rate": 6.563153457083315e-06, "loss": 0.0588, "step": 3545 }, { "epoch": 0.6242307016001407, "grad_norm": 0.28133952617645264, "learning_rate": 6.5364250165427e-06, "loss": 0.0573, "step": 3550 }, { "epoch": 0.6251098997714085, "grad_norm": 0.5088506937026978, "learning_rate": 6.509724663660813e-06, "loss": 0.055, "step": 3555 }, { "epoch": 0.6259890979426763, "grad_norm": 0.7142006158828735, "learning_rate": 6.4830526149626064e-06, "loss": 0.04, "step": 3560 }, { "epoch": 0.6268682961139441, "grad_norm": 0.31554004549980164, "learning_rate": 6.4564090867435e-06, "loss": 0.0593, "step": 3565 }, { "epoch": 0.6277474942852119, "grad_norm": 0.2784283757209778, "learning_rate": 6.429794295067625e-06, "loss": 0.046, "step": 3570 }, { "epoch": 0.6286266924564797, "grad_norm": 0.44909903407096863, "learning_rate": 6.403208455766081e-06, "loss": 0.0563, "step": 3575 }, { "epoch": 0.6295058906277475, "grad_norm": 0.21697324514389038, "learning_rate": 6.376651784435174e-06, "loss": 0.0527, "step": 3580 }, { "epoch": 0.6303850887990153, "grad_norm": 0.5999769568443298, "learning_rate": 6.350124496434677e-06, "loss": 0.066, "step": 3585 }, { "epoch": 0.6312642869702831, "grad_norm": 0.4327978193759918, "learning_rate": 6.323626806886082e-06, "loss": 0.0493, "step": 3590 }, { "epoch": 0.6321434851415509, "grad_norm": 0.47847869992256165, "learning_rate": 6.297158930670852e-06, "loss": 0.0593, "step": 3595 }, { "epoch": 0.6330226833128187, "grad_norm": 1.0836427211761475, "learning_rate": 6.270721082428678e-06, "loss": 0.0557, "step": 3600 }, { "epoch": 0.6339018814840865, "grad_norm": 0.2735603153705597, "learning_rate": 6.2443134765557475e-06, "loss": 0.0662, "step": 3605 }, { "epoch": 0.6347810796553544, "grad_norm": 0.3386712372303009, "learning_rate": 6.2179363272029935e-06, "loss": 0.0497, "step": 3610 }, { "epoch": 0.6356602778266222, "grad_norm": 0.11221656948328018, "learning_rate": 6.191589848274369e-06, "loss": 0.0498, "step": 3615 }, { "epoch": 0.63653947599789, "grad_norm": 0.5436195135116577, "learning_rate": 6.1652742534251e-06, "loss": 0.054, "step": 3620 }, { "epoch": 0.6374186741691578, "grad_norm": 0.5452234148979187, "learning_rate": 6.138989756059968e-06, "loss": 0.0448, "step": 3625 }, { "epoch": 0.6382978723404256, "grad_norm": 0.4096541404724121, "learning_rate": 6.1127365693315566e-06, "loss": 0.0556, "step": 3630 }, { "epoch": 0.6391770705116934, "grad_norm": 0.4600813388824463, "learning_rate": 6.086514906138563e-06, "loss": 0.0562, "step": 3635 }, { "epoch": 0.6400562686829612, "grad_norm": 0.8097591996192932, "learning_rate": 6.060324979124016e-06, "loss": 0.0534, "step": 3640 }, { "epoch": 0.640935466854229, "grad_norm": 0.8001208305358887, "learning_rate": 6.034167000673611e-06, "loss": 0.0589, "step": 3645 }, { "epoch": 0.6418146650254968, "grad_norm": 0.3946583867073059, "learning_rate": 6.008041182913933e-06, "loss": 0.0594, "step": 3650 }, { "epoch": 0.6426938631967646, "grad_norm": 0.3865828216075897, "learning_rate": 5.981947737710779e-06, "loss": 0.0655, "step": 3655 }, { "epoch": 0.6435730613680324, "grad_norm": 1.0482414960861206, "learning_rate": 5.955886876667414e-06, "loss": 0.0652, "step": 3660 }, { "epoch": 0.6444522595393002, "grad_norm": 1.3280454874038696, "learning_rate": 5.929858811122868e-06, "loss": 0.0678, "step": 3665 }, { "epoch": 0.645331457710568, "grad_norm": 0.1955081820487976, "learning_rate": 5.903863752150212e-06, "loss": 0.0565, "step": 3670 }, { "epoch": 0.6462106558818358, "grad_norm": 0.37470242381095886, "learning_rate": 5.877901910554862e-06, "loss": 0.0558, "step": 3675 }, { "epoch": 0.6470898540531036, "grad_norm": 0.8022100329399109, "learning_rate": 5.851973496872849e-06, "loss": 0.0498, "step": 3680 }, { "epoch": 0.6479690522243714, "grad_norm": 0.23341333866119385, "learning_rate": 5.82607872136913e-06, "loss": 0.0637, "step": 3685 }, { "epoch": 0.6488482503956392, "grad_norm": 0.40961551666259766, "learning_rate": 5.800217794035872e-06, "loss": 0.0463, "step": 3690 }, { "epoch": 0.649727448566907, "grad_norm": 0.21275675296783447, "learning_rate": 5.774390924590754e-06, "loss": 0.0552, "step": 3695 }, { "epoch": 0.6506066467381748, "grad_norm": 0.17066530883312225, "learning_rate": 5.748598322475258e-06, "loss": 0.0585, "step": 3700 }, { "epoch": 0.6514858449094426, "grad_norm": 0.3308875262737274, "learning_rate": 5.7228401968529836e-06, "loss": 0.058, "step": 3705 }, { "epoch": 0.6523650430807104, "grad_norm": 0.7064841389656067, "learning_rate": 5.697116756607946e-06, "loss": 0.0608, "step": 3710 }, { "epoch": 0.6532442412519782, "grad_norm": 0.13864412903785706, "learning_rate": 5.671428210342884e-06, "loss": 0.0409, "step": 3715 }, { "epoch": 0.654123439423246, "grad_norm": 0.49417930841445923, "learning_rate": 5.64577476637755e-06, "loss": 0.0633, "step": 3720 }, { "epoch": 0.6550026375945138, "grad_norm": 0.5518152117729187, "learning_rate": 5.620156632747053e-06, "loss": 0.0522, "step": 3725 }, { "epoch": 0.6558818357657816, "grad_norm": 0.157115638256073, "learning_rate": 5.594574017200149e-06, "loss": 0.0474, "step": 3730 }, { "epoch": 0.6567610339370494, "grad_norm": 0.16834110021591187, "learning_rate": 5.569027127197565e-06, "loss": 0.0573, "step": 3735 }, { "epoch": 0.6576402321083172, "grad_norm": 1.1739530563354492, "learning_rate": 5.5435161699103055e-06, "loss": 0.0531, "step": 3740 }, { "epoch": 0.658519430279585, "grad_norm": 0.4163840115070343, "learning_rate": 5.518041352217989e-06, "loss": 0.0731, "step": 3745 }, { "epoch": 0.6593986284508528, "grad_norm": 0.2617214322090149, "learning_rate": 5.492602880707161e-06, "loss": 0.0614, "step": 3750 }, { "epoch": 0.6602778266221206, "grad_norm": 0.3267338275909424, "learning_rate": 5.467200961669619e-06, "loss": 0.0511, "step": 3755 }, { "epoch": 0.6611570247933884, "grad_norm": 0.7266274094581604, "learning_rate": 5.441835801100734e-06, "loss": 0.0526, "step": 3760 }, { "epoch": 0.6620362229646563, "grad_norm": 0.7042490243911743, "learning_rate": 5.416507604697801e-06, "loss": 0.0383, "step": 3765 }, { "epoch": 0.662915421135924, "grad_norm": 0.5207750797271729, "learning_rate": 5.391216577858331e-06, "loss": 0.0561, "step": 3770 }, { "epoch": 0.6637946193071919, "grad_norm": 0.7317136526107788, "learning_rate": 5.365962925678443e-06, "loss": 0.0609, "step": 3775 }, { "epoch": 0.6646738174784597, "grad_norm": 0.47223329544067383, "learning_rate": 5.340746852951151e-06, "loss": 0.0661, "step": 3780 }, { "epoch": 0.6655530156497275, "grad_norm": 0.5919240713119507, "learning_rate": 5.315568564164713e-06, "loss": 0.0591, "step": 3785 }, { "epoch": 0.6664322138209953, "grad_norm": 0.16643045842647552, "learning_rate": 5.290428263500996e-06, "loss": 0.0512, "step": 3790 }, { "epoch": 0.6673114119922631, "grad_norm": 0.6606490612030029, "learning_rate": 5.26532615483379e-06, "loss": 0.06, "step": 3795 }, { "epoch": 0.6681906101635309, "grad_norm": 0.5036027431488037, "learning_rate": 5.240262441727187e-06, "loss": 0.0546, "step": 3800 }, { "epoch": 0.6690698083347987, "grad_norm": 0.2331734448671341, "learning_rate": 5.215237327433895e-06, "loss": 0.0512, "step": 3805 }, { "epoch": 0.6699490065060665, "grad_norm": 0.19755728542804718, "learning_rate": 5.190251014893621e-06, "loss": 0.047, "step": 3810 }, { "epoch": 0.6708282046773343, "grad_norm": 0.788175642490387, "learning_rate": 5.165303706731397e-06, "loss": 0.0681, "step": 3815 }, { "epoch": 0.6717074028486021, "grad_norm": 0.19185423851013184, "learning_rate": 5.140395605255965e-06, "loss": 0.0535, "step": 3820 }, { "epoch": 0.6725866010198699, "grad_norm": 0.22378072142601013, "learning_rate": 5.115526912458113e-06, "loss": 0.0584, "step": 3825 }, { "epoch": 0.6734657991911377, "grad_norm": 0.31949400901794434, "learning_rate": 5.090697830009057e-06, "loss": 0.059, "step": 3830 }, { "epoch": 0.6743449973624055, "grad_norm": 0.31779804825782776, "learning_rate": 5.065908559258782e-06, "loss": 0.0541, "step": 3835 }, { "epoch": 0.6752241955336733, "grad_norm": 0.9284188747406006, "learning_rate": 5.0411593012344305e-06, "loss": 0.0461, "step": 3840 }, { "epoch": 0.6761033937049411, "grad_norm": 1.0573227405548096, "learning_rate": 5.0164502566386655e-06, "loss": 0.0529, "step": 3845 }, { "epoch": 0.6769825918762089, "grad_norm": 0.8638303279876709, "learning_rate": 4.991781625848039e-06, "loss": 0.0652, "step": 3850 }, { "epoch": 0.6778617900474767, "grad_norm": 0.3580770194530487, "learning_rate": 4.967153608911366e-06, "loss": 0.0456, "step": 3855 }, { "epoch": 0.6787409882187445, "grad_norm": 0.1639026552438736, "learning_rate": 4.942566405548109e-06, "loss": 0.0624, "step": 3860 }, { "epoch": 0.6796201863900123, "grad_norm": 0.22067619860172272, "learning_rate": 4.918020215146759e-06, "loss": 0.0586, "step": 3865 }, { "epoch": 0.6804993845612801, "grad_norm": 0.24657614529132843, "learning_rate": 4.8935152367632136e-06, "loss": 0.0542, "step": 3870 }, { "epoch": 0.6813785827325479, "grad_norm": 0.9170181751251221, "learning_rate": 4.869051669119153e-06, "loss": 0.0517, "step": 3875 }, { "epoch": 0.6822577809038157, "grad_norm": 1.0674784183502197, "learning_rate": 4.844629710600457e-06, "loss": 0.0725, "step": 3880 }, { "epoch": 0.6831369790750835, "grad_norm": 0.3955898880958557, "learning_rate": 4.820249559255559e-06, "loss": 0.0557, "step": 3885 }, { "epoch": 0.6840161772463513, "grad_norm": 0.3524361252784729, "learning_rate": 4.795911412793883e-06, "loss": 0.0589, "step": 3890 }, { "epoch": 0.6848953754176191, "grad_norm": 0.3493446111679077, "learning_rate": 4.771615468584194e-06, "loss": 0.0516, "step": 3895 }, { "epoch": 0.6857745735888869, "grad_norm": 0.3028711676597595, "learning_rate": 4.747361923653039e-06, "loss": 0.0513, "step": 3900 }, { "epoch": 0.6866537717601547, "grad_norm": 0.09052418917417526, "learning_rate": 4.723150974683112e-06, "loss": 0.0559, "step": 3905 }, { "epoch": 0.6875329699314225, "grad_norm": 0.7868338823318481, "learning_rate": 4.698982818011694e-06, "loss": 0.0666, "step": 3910 }, { "epoch": 0.6884121681026903, "grad_norm": 0.16918179392814636, "learning_rate": 4.674857649629035e-06, "loss": 0.0527, "step": 3915 }, { "epoch": 0.6892913662739582, "grad_norm": 0.3652224540710449, "learning_rate": 4.650775665176783e-06, "loss": 0.0567, "step": 3920 }, { "epoch": 0.690170564445226, "grad_norm": 0.5725377798080444, "learning_rate": 4.626737059946375e-06, "loss": 0.0632, "step": 3925 }, { "epoch": 0.6910497626164938, "grad_norm": 0.4080544114112854, "learning_rate": 4.602742028877475e-06, "loss": 0.0485, "step": 3930 }, { "epoch": 0.6919289607877616, "grad_norm": 0.13161161541938782, "learning_rate": 4.578790766556386e-06, "loss": 0.0661, "step": 3935 }, { "epoch": 0.6928081589590294, "grad_norm": 0.28293490409851074, "learning_rate": 4.554883467214472e-06, "loss": 0.0572, "step": 3940 }, { "epoch": 0.6936873571302972, "grad_norm": 0.4282214343547821, "learning_rate": 4.53102032472657e-06, "loss": 0.0519, "step": 3945 }, { "epoch": 0.694566555301565, "grad_norm": 0.44754886627197266, "learning_rate": 4.507201532609444e-06, "loss": 0.056, "step": 3950 }, { "epoch": 0.6954457534728328, "grad_norm": 0.1610032469034195, "learning_rate": 4.4834272840201945e-06, "loss": 0.0592, "step": 3955 }, { "epoch": 0.6963249516441006, "grad_norm": 0.18849897384643555, "learning_rate": 4.459697771754704e-06, "loss": 0.0546, "step": 3960 }, { "epoch": 0.6972041498153684, "grad_norm": 0.41627243161201477, "learning_rate": 4.436013188246056e-06, "loss": 0.0654, "step": 3965 }, { "epoch": 0.6980833479866362, "grad_norm": 0.3271617889404297, "learning_rate": 4.412373725563001e-06, "loss": 0.0524, "step": 3970 }, { "epoch": 0.698962546157904, "grad_norm": 0.231711283326149, "learning_rate": 4.388779575408371e-06, "loss": 0.0543, "step": 3975 }, { "epoch": 0.6998417443291718, "grad_norm": 0.38711780309677124, "learning_rate": 4.36523092911756e-06, "loss": 0.0592, "step": 3980 }, { "epoch": 0.7007209425004396, "grad_norm": 0.3789129853248596, "learning_rate": 4.341727977656925e-06, "loss": 0.0556, "step": 3985 }, { "epoch": 0.7016001406717074, "grad_norm": 1.4185389280319214, "learning_rate": 4.318270911622285e-06, "loss": 0.0618, "step": 3990 }, { "epoch": 0.7024793388429752, "grad_norm": 0.749077558517456, "learning_rate": 4.2948599212373386e-06, "loss": 0.0558, "step": 3995 }, { "epoch": 0.703358537014243, "grad_norm": 0.23492804169654846, "learning_rate": 4.271495196352141e-06, "loss": 0.0614, "step": 4000 }, { "epoch": 0.7042377351855108, "grad_norm": 0.2221526801586151, "learning_rate": 4.248176926441574e-06, "loss": 0.0592, "step": 4005 }, { "epoch": 0.7051169333567786, "grad_norm": 0.2977111339569092, "learning_rate": 4.224905300603772e-06, "loss": 0.0449, "step": 4010 }, { "epoch": 0.7059961315280464, "grad_norm": 0.3051705062389374, "learning_rate": 4.2016805075586306e-06, "loss": 0.0507, "step": 4015 }, { "epoch": 0.7068753296993142, "grad_norm": 0.21050839126110077, "learning_rate": 4.178502735646244e-06, "loss": 0.0666, "step": 4020 }, { "epoch": 0.707754527870582, "grad_norm": 0.37007981538772583, "learning_rate": 4.1553721728254e-06, "loss": 0.0565, "step": 4025 }, { "epoch": 0.7086337260418498, "grad_norm": 0.8668897747993469, "learning_rate": 4.1322890066720465e-06, "loss": 0.0503, "step": 4030 }, { "epoch": 0.7095129242131176, "grad_norm": 0.13824816048145294, "learning_rate": 4.109253424377773e-06, "loss": 0.0656, "step": 4035 }, { "epoch": 0.7103921223843854, "grad_norm": 0.5541465282440186, "learning_rate": 4.086265612748277e-06, "loss": 0.0518, "step": 4040 }, { "epoch": 0.7112713205556532, "grad_norm": 0.5307297706604004, "learning_rate": 4.063325758201878e-06, "loss": 0.0554, "step": 4045 }, { "epoch": 0.712150518726921, "grad_norm": 0.6194763779640198, "learning_rate": 4.040434046767984e-06, "loss": 0.0648, "step": 4050 }, { "epoch": 0.7130297168981888, "grad_norm": 0.08174088597297668, "learning_rate": 4.017590664085593e-06, "loss": 0.0512, "step": 4055 }, { "epoch": 0.7139089150694566, "grad_norm": 0.3599446713924408, "learning_rate": 3.994795795401774e-06, "loss": 0.0466, "step": 4060 }, { "epoch": 0.7147881132407244, "grad_norm": 0.7686516046524048, "learning_rate": 3.9720496255701855e-06, "loss": 0.0576, "step": 4065 }, { "epoch": 0.7156673114119922, "grad_norm": 0.4762028157711029, "learning_rate": 3.949352339049561e-06, "loss": 0.0507, "step": 4070 }, { "epoch": 0.71654650958326, "grad_norm": 0.37046587467193604, "learning_rate": 3.926704119902219e-06, "loss": 0.063, "step": 4075 }, { "epoch": 0.7174257077545279, "grad_norm": 0.0833682268857956, "learning_rate": 3.904105151792563e-06, "loss": 0.0501, "step": 4080 }, { "epoch": 0.7183049059257957, "grad_norm": 0.24260376393795013, "learning_rate": 3.8815556179856106e-06, "loss": 0.0531, "step": 4085 }, { "epoch": 0.7191841040970635, "grad_norm": 0.28895097970962524, "learning_rate": 3.859055701345477e-06, "loss": 0.0558, "step": 4090 }, { "epoch": 0.7200633022683313, "grad_norm": 0.3396805226802826, "learning_rate": 3.8366055843339315e-06, "loss": 0.0718, "step": 4095 }, { "epoch": 0.7209425004395991, "grad_norm": 0.32812032103538513, "learning_rate": 3.8142054490088752e-06, "loss": 0.06, "step": 4100 }, { "epoch": 0.7218216986108669, "grad_norm": 0.2770575284957886, "learning_rate": 3.791855477022903e-06, "loss": 0.0596, "step": 4105 }, { "epoch": 0.7227008967821347, "grad_norm": 0.38149547576904297, "learning_rate": 3.769555849621799e-06, "loss": 0.0639, "step": 4110 }, { "epoch": 0.7235800949534025, "grad_norm": 0.19870160520076752, "learning_rate": 3.747306747643089e-06, "loss": 0.0503, "step": 4115 }, { "epoch": 0.7244592931246703, "grad_norm": 0.31559768319129944, "learning_rate": 3.7251083515145658e-06, "loss": 0.0546, "step": 4120 }, { "epoch": 0.7253384912959381, "grad_norm": 0.4508054852485657, "learning_rate": 3.7029608412528263e-06, "loss": 0.0658, "step": 4125 }, { "epoch": 0.7262176894672059, "grad_norm": 0.34780052304267883, "learning_rate": 3.680864396461803e-06, "loss": 0.0562, "step": 4130 }, { "epoch": 0.7270968876384737, "grad_norm": 0.5677065849304199, "learning_rate": 3.658819196331327e-06, "loss": 0.0643, "step": 4135 }, { "epoch": 0.7279760858097415, "grad_norm": 0.1682279258966446, "learning_rate": 3.6368254196356576e-06, "loss": 0.053, "step": 4140 }, { "epoch": 0.7288552839810093, "grad_norm": 0.32326585054397583, "learning_rate": 3.614883244732045e-06, "loss": 0.0408, "step": 4145 }, { "epoch": 0.7297344821522771, "grad_norm": 0.2262571007013321, "learning_rate": 3.5929928495592657e-06, "loss": 0.0552, "step": 4150 }, { "epoch": 0.7306136803235449, "grad_norm": 0.26115310192108154, "learning_rate": 3.5711544116362028e-06, "loss": 0.0611, "step": 4155 }, { "epoch": 0.7314928784948127, "grad_norm": 0.464222252368927, "learning_rate": 3.5493681080603903e-06, "loss": 0.055, "step": 4160 }, { "epoch": 0.7323720766660805, "grad_norm": 0.35738715529441833, "learning_rate": 3.5276341155065864e-06, "loss": 0.0632, "step": 4165 }, { "epoch": 0.7332512748373483, "grad_norm": 0.393279105424881, "learning_rate": 3.505952610225327e-06, "loss": 0.0529, "step": 4170 }, { "epoch": 0.7341304730086161, "grad_norm": 0.5563225746154785, "learning_rate": 3.4843237680415153e-06, "loss": 0.0628, "step": 4175 }, { "epoch": 0.7350096711798839, "grad_norm": 0.2666977047920227, "learning_rate": 3.462747764352974e-06, "loss": 0.0547, "step": 4180 }, { "epoch": 0.7358888693511517, "grad_norm": 0.4078899919986725, "learning_rate": 3.441224774129055e-06, "loss": 0.0639, "step": 4185 }, { "epoch": 0.7367680675224195, "grad_norm": 0.31975606083869934, "learning_rate": 3.4197549719091794e-06, "loss": 0.0628, "step": 4190 }, { "epoch": 0.7376472656936873, "grad_norm": 0.295015424489975, "learning_rate": 3.3983385318014573e-06, "loss": 0.049, "step": 4195 }, { "epoch": 0.7385264638649551, "grad_norm": 0.608062207698822, "learning_rate": 3.3769756274812526e-06, "loss": 0.047, "step": 4200 }, { "epoch": 0.7394056620362229, "grad_norm": 0.20837105810642242, "learning_rate": 3.3556664321897914e-06, "loss": 0.0623, "step": 4205 }, { "epoch": 0.7402848602074907, "grad_norm": 0.20374645292758942, "learning_rate": 3.334411118732744e-06, "loss": 0.0576, "step": 4210 }, { "epoch": 0.7411640583787585, "grad_norm": 0.49013030529022217, "learning_rate": 3.3132098594788385e-06, "loss": 0.0632, "step": 4215 }, { "epoch": 0.7420432565500263, "grad_norm": 0.3315906822681427, "learning_rate": 3.2920628263584375e-06, "loss": 0.0536, "step": 4220 }, { "epoch": 0.7429224547212941, "grad_norm": 0.3624337613582611, "learning_rate": 3.2709701908621726e-06, "loss": 0.0542, "step": 4225 }, { "epoch": 0.743801652892562, "grad_norm": 0.2681962549686432, "learning_rate": 3.2499321240395387e-06, "loss": 0.0581, "step": 4230 }, { "epoch": 0.7446808510638298, "grad_norm": 0.11040078103542328, "learning_rate": 3.2289487964975074e-06, "loss": 0.0497, "step": 4235 }, { "epoch": 0.7455600492350976, "grad_norm": 0.5186774730682373, "learning_rate": 3.2080203783991504e-06, "loss": 0.0594, "step": 4240 }, { "epoch": 0.7464392474063654, "grad_norm": 0.4543941617012024, "learning_rate": 3.1871470394622407e-06, "loss": 0.0602, "step": 4245 }, { "epoch": 0.7473184455776332, "grad_norm": 0.8331423997879028, "learning_rate": 3.1663289489579054e-06, "loss": 0.0453, "step": 4250 }, { "epoch": 0.748197643748901, "grad_norm": 0.21638324856758118, "learning_rate": 3.145566275709231e-06, "loss": 0.0534, "step": 4255 }, { "epoch": 0.7490768419201688, "grad_norm": 0.19550643861293793, "learning_rate": 3.124859188089905e-06, "loss": 0.0502, "step": 4260 }, { "epoch": 0.7499560400914366, "grad_norm": 0.18126316368579865, "learning_rate": 3.1042078540228358e-06, "loss": 0.0542, "step": 4265 }, { "epoch": 0.7508352382627044, "grad_norm": 0.4952530264854431, "learning_rate": 3.0836124409788137e-06, "loss": 0.0518, "step": 4270 }, { "epoch": 0.7517144364339722, "grad_norm": 0.7046754956245422, "learning_rate": 3.063073115975136e-06, "loss": 0.0575, "step": 4275 }, { "epoch": 0.75259363460524, "grad_norm": 0.13164618611335754, "learning_rate": 3.0425900455742584e-06, "loss": 0.0475, "step": 4280 }, { "epoch": 0.7534728327765078, "grad_norm": 0.21266759932041168, "learning_rate": 3.022163395882438e-06, "loss": 0.0532, "step": 4285 }, { "epoch": 0.7543520309477756, "grad_norm": 0.6557819247245789, "learning_rate": 3.0017933325484028e-06, "loss": 0.0501, "step": 4290 }, { "epoch": 0.7552312291190434, "grad_norm": 0.253121554851532, "learning_rate": 2.981480020761978e-06, "loss": 0.0568, "step": 4295 }, { "epoch": 0.7561104272903112, "grad_norm": 0.26722845435142517, "learning_rate": 2.9612236252527904e-06, "loss": 0.0564, "step": 4300 }, { "epoch": 0.756989625461579, "grad_norm": 0.1494354009628296, "learning_rate": 2.941024310288886e-06, "loss": 0.0577, "step": 4305 }, { "epoch": 0.7578688236328468, "grad_norm": 0.16053201258182526, "learning_rate": 2.9208822396754333e-06, "loss": 0.0604, "step": 4310 }, { "epoch": 0.7587480218041146, "grad_norm": 0.5304045677185059, "learning_rate": 2.9007975767533714e-06, "loss": 0.0598, "step": 4315 }, { "epoch": 0.7596272199753824, "grad_norm": 0.587253987789154, "learning_rate": 2.8807704843981e-06, "loss": 0.0596, "step": 4320 }, { "epoch": 0.7605064181466502, "grad_norm": 0.6153156161308289, "learning_rate": 2.8608011250181544e-06, "loss": 0.052, "step": 4325 }, { "epoch": 0.761385616317918, "grad_norm": 0.7715466618537903, "learning_rate": 2.8408896605538905e-06, "loss": 0.0501, "step": 4330 }, { "epoch": 0.7622648144891858, "grad_norm": 0.24888671934604645, "learning_rate": 2.8210362524761557e-06, "loss": 0.0594, "step": 4335 }, { "epoch": 0.7631440126604536, "grad_norm": 0.6384350061416626, "learning_rate": 2.8012410617850083e-06, "loss": 0.0491, "step": 4340 }, { "epoch": 0.7640232108317214, "grad_norm": 0.5361153483390808, "learning_rate": 2.7815042490083857e-06, "loss": 0.053, "step": 4345 }, { "epoch": 0.7649024090029892, "grad_norm": 0.3747937083244324, "learning_rate": 2.7618259742008226e-06, "loss": 0.0555, "step": 4350 }, { "epoch": 0.765781607174257, "grad_norm": 0.2643256187438965, "learning_rate": 2.7422063969421286e-06, "loss": 0.0533, "step": 4355 }, { "epoch": 0.7666608053455248, "grad_norm": 0.6822528839111328, "learning_rate": 2.722645676336123e-06, "loss": 0.057, "step": 4360 }, { "epoch": 0.7675400035167926, "grad_norm": 0.2672845423221588, "learning_rate": 2.7031439710093254e-06, "loss": 0.058, "step": 4365 }, { "epoch": 0.7684192016880604, "grad_norm": 0.1608499139547348, "learning_rate": 2.683701439109676e-06, "loss": 0.0573, "step": 4370 }, { "epoch": 0.7692983998593282, "grad_norm": 0.2795965373516083, "learning_rate": 2.6643182383052448e-06, "loss": 0.0667, "step": 4375 }, { "epoch": 0.770177598030596, "grad_norm": 0.11451299488544464, "learning_rate": 2.644994525782971e-06, "loss": 0.0527, "step": 4380 }, { "epoch": 0.7710567962018638, "grad_norm": 0.48407652974128723, "learning_rate": 2.625730458247362e-06, "loss": 0.0572, "step": 4385 }, { "epoch": 0.7719359943731317, "grad_norm": 0.5724853277206421, "learning_rate": 2.606526191919259e-06, "loss": 0.0432, "step": 4390 }, { "epoch": 0.7728151925443995, "grad_norm": 0.6579432487487793, "learning_rate": 2.5873818825345254e-06, "loss": 0.0521, "step": 4395 }, { "epoch": 0.7736943907156673, "grad_norm": 0.7850777506828308, "learning_rate": 2.5682976853428264e-06, "loss": 0.0563, "step": 4400 }, { "epoch": 0.7745735888869351, "grad_norm": 0.20982079207897186, "learning_rate": 2.5492737551063374e-06, "loss": 0.0587, "step": 4405 }, { "epoch": 0.7754527870582029, "grad_norm": 0.20126987993717194, "learning_rate": 2.5303102460985098e-06, "loss": 0.0585, "step": 4410 }, { "epoch": 0.7763319852294708, "grad_norm": 0.16406214237213135, "learning_rate": 2.511407312102809e-06, "loss": 0.0641, "step": 4415 }, { "epoch": 0.7772111834007386, "grad_norm": 0.8947567939758301, "learning_rate": 2.4925651064114788e-06, "loss": 0.0563, "step": 4420 }, { "epoch": 0.7780903815720064, "grad_norm": 0.9534220099449158, "learning_rate": 2.4737837818242747e-06, "loss": 0.0472, "step": 4425 }, { "epoch": 0.7789695797432742, "grad_norm": 0.24452577531337738, "learning_rate": 2.455063490647257e-06, "loss": 0.0545, "step": 4430 }, { "epoch": 0.779848777914542, "grad_norm": 0.5126246213912964, "learning_rate": 2.4364043846915273e-06, "loss": 0.0502, "step": 4435 }, { "epoch": 0.7807279760858098, "grad_norm": 0.3192353844642639, "learning_rate": 2.4178066152720203e-06, "loss": 0.0672, "step": 4440 }, { "epoch": 0.7816071742570776, "grad_norm": 0.8498159050941467, "learning_rate": 2.399270333206253e-06, "loss": 0.0575, "step": 4445 }, { "epoch": 0.7824863724283454, "grad_norm": 0.7275409698486328, "learning_rate": 2.3807956888131213e-06, "loss": 0.0623, "step": 4450 }, { "epoch": 0.7833655705996132, "grad_norm": 0.6221399307250977, "learning_rate": 2.362382831911675e-06, "loss": 0.0554, "step": 4455 }, { "epoch": 0.784244768770881, "grad_norm": 0.21380391716957092, "learning_rate": 2.3440319118198997e-06, "loss": 0.0551, "step": 4460 }, { "epoch": 0.7851239669421488, "grad_norm": 0.8011379837989807, "learning_rate": 2.3257430773535116e-06, "loss": 0.051, "step": 4465 }, { "epoch": 0.7860031651134166, "grad_norm": 0.5205139517784119, "learning_rate": 2.307516476824738e-06, "loss": 0.0615, "step": 4470 }, { "epoch": 0.7868823632846844, "grad_norm": 0.40472784638404846, "learning_rate": 2.289352258041133e-06, "loss": 0.0515, "step": 4475 }, { "epoch": 0.7877615614559522, "grad_norm": 0.11419567465782166, "learning_rate": 2.271250568304366e-06, "loss": 0.0511, "step": 4480 }, { "epoch": 0.78864075962722, "grad_norm": 0.11799798905849457, "learning_rate": 2.253211554409034e-06, "loss": 0.0584, "step": 4485 }, { "epoch": 0.7895199577984878, "grad_norm": 0.5825390219688416, "learning_rate": 2.235235362641458e-06, "loss": 0.052, "step": 4490 }, { "epoch": 0.7903991559697556, "grad_norm": 0.14880183339118958, "learning_rate": 2.2173221387785215e-06, "loss": 0.0567, "step": 4495 }, { "epoch": 0.7912783541410234, "grad_norm": 0.16404080390930176, "learning_rate": 2.1994720280864567e-06, "loss": 0.0555, "step": 4500 }, { "epoch": 0.7921575523122912, "grad_norm": 0.13647328317165375, "learning_rate": 2.1816851753197023e-06, "loss": 0.0443, "step": 4505 }, { "epoch": 0.793036750483559, "grad_norm": 0.3101443648338318, "learning_rate": 2.163961724719693e-06, "loss": 0.0568, "step": 4510 }, { "epoch": 0.7939159486548268, "grad_norm": 0.5262371897697449, "learning_rate": 2.1463018200137197e-06, "loss": 0.0541, "step": 4515 }, { "epoch": 0.7947951468260946, "grad_norm": 0.15383735299110413, "learning_rate": 2.128705604413741e-06, "loss": 0.057, "step": 4520 }, { "epoch": 0.7956743449973624, "grad_norm": 0.33696064352989197, "learning_rate": 2.1111732206152424e-06, "loss": 0.0541, "step": 4525 }, { "epoch": 0.7965535431686303, "grad_norm": 0.37046894431114197, "learning_rate": 2.093704810796062e-06, "loss": 0.0677, "step": 4530 }, { "epoch": 0.797432741339898, "grad_norm": 0.2881315350532532, "learning_rate": 2.076300516615252e-06, "loss": 0.0516, "step": 4535 }, { "epoch": 0.7983119395111659, "grad_norm": 0.21759085357189178, "learning_rate": 2.0589604792119124e-06, "loss": 0.0604, "step": 4540 }, { "epoch": 0.7991911376824337, "grad_norm": 0.6279814839363098, "learning_rate": 2.0416848392040647e-06, "loss": 0.0618, "step": 4545 }, { "epoch": 0.8000703358537015, "grad_norm": 0.06609740853309631, "learning_rate": 2.024473736687501e-06, "loss": 0.0478, "step": 4550 }, { "epoch": 0.8009495340249693, "grad_norm": 0.3990488052368164, "learning_rate": 2.0073273112346526e-06, "loss": 0.0563, "step": 4555 }, { "epoch": 0.8018287321962371, "grad_norm": 0.54508376121521, "learning_rate": 1.9902457018934496e-06, "loss": 0.0665, "step": 4560 }, { "epoch": 0.8027079303675049, "grad_norm": 0.28929072618484497, "learning_rate": 1.973229047186206e-06, "loss": 0.0583, "step": 4565 }, { "epoch": 0.8035871285387727, "grad_norm": 0.4705251157283783, "learning_rate": 1.9562774851084865e-06, "loss": 0.0639, "step": 4570 }, { "epoch": 0.8044663267100405, "grad_norm": 0.10792229324579239, "learning_rate": 1.9393911531279973e-06, "loss": 0.0687, "step": 4575 }, { "epoch": 0.8053455248813083, "grad_norm": 0.8370270729064941, "learning_rate": 1.9225701881834524e-06, "loss": 0.0616, "step": 4580 }, { "epoch": 0.8062247230525761, "grad_norm": 0.20940972864627838, "learning_rate": 1.9058147266834892e-06, "loss": 0.0588, "step": 4585 }, { "epoch": 0.8071039212238439, "grad_norm": 0.43674102425575256, "learning_rate": 1.8891249045055349e-06, "loss": 0.0424, "step": 4590 }, { "epoch": 0.8079831193951117, "grad_norm": 0.4499530494213104, "learning_rate": 1.8725008569947366e-06, "loss": 0.0583, "step": 4595 }, { "epoch": 0.8088623175663795, "grad_norm": 0.16668100655078888, "learning_rate": 1.8559427189628277e-06, "loss": 0.0604, "step": 4600 }, { "epoch": 0.8097415157376473, "grad_norm": 0.20123635232448578, "learning_rate": 1.8394506246870635e-06, "loss": 0.0561, "step": 4605 }, { "epoch": 0.8106207139089151, "grad_norm": 0.2664503753185272, "learning_rate": 1.8230247079091146e-06, "loss": 0.053, "step": 4610 }, { "epoch": 0.8114999120801829, "grad_norm": 0.7137978076934814, "learning_rate": 1.8066651018339943e-06, "loss": 0.0572, "step": 4615 }, { "epoch": 0.8123791102514507, "grad_norm": 0.5590645670890808, "learning_rate": 1.790371939128972e-06, "loss": 0.0616, "step": 4620 }, { "epoch": 0.8132583084227185, "grad_norm": 0.08397898077964783, "learning_rate": 1.7741453519224982e-06, "loss": 0.058, "step": 4625 }, { "epoch": 0.8141375065939863, "grad_norm": 0.33352193236351013, "learning_rate": 1.7579854718031285e-06, "loss": 0.0517, "step": 4630 }, { "epoch": 0.8150167047652541, "grad_norm": 0.20153024792671204, "learning_rate": 1.741892429818468e-06, "loss": 0.0547, "step": 4635 }, { "epoch": 0.8158959029365219, "grad_norm": 0.8683350086212158, "learning_rate": 1.7258663564740996e-06, "loss": 0.0618, "step": 4640 }, { "epoch": 0.8167751011077897, "grad_norm": 0.19968271255493164, "learning_rate": 1.7099073817325307e-06, "loss": 0.0568, "step": 4645 }, { "epoch": 0.8176542992790575, "grad_norm": 0.3933415412902832, "learning_rate": 1.6940156350121273e-06, "loss": 0.0622, "step": 4650 }, { "epoch": 0.8185334974503253, "grad_norm": 0.8188037872314453, "learning_rate": 1.6781912451860827e-06, "loss": 0.0645, "step": 4655 }, { "epoch": 0.8194126956215931, "grad_norm": 0.6926817893981934, "learning_rate": 1.6624343405813615e-06, "loss": 0.0561, "step": 4660 }, { "epoch": 0.8202918937928609, "grad_norm": 0.5045862197875977, "learning_rate": 1.6467450489776581e-06, "loss": 0.0668, "step": 4665 }, { "epoch": 0.8211710919641287, "grad_norm": 0.5935778617858887, "learning_rate": 1.6311234976063694e-06, "loss": 0.0575, "step": 4670 }, { "epoch": 0.8220502901353965, "grad_norm": 0.3604169189929962, "learning_rate": 1.6155698131495457e-06, "loss": 0.0543, "step": 4675 }, { "epoch": 0.8229294883066643, "grad_norm": 0.24668653309345245, "learning_rate": 1.6000841217388864e-06, "loss": 0.057, "step": 4680 }, { "epoch": 0.8238086864779322, "grad_norm": 0.33264681696891785, "learning_rate": 1.5846665489546964e-06, "loss": 0.0572, "step": 4685 }, { "epoch": 0.8246878846492, "grad_norm": 0.6432152986526489, "learning_rate": 1.5693172198248863e-06, "loss": 0.0604, "step": 4690 }, { "epoch": 0.8255670828204678, "grad_norm": 0.4105263948440552, "learning_rate": 1.5540362588239366e-06, "loss": 0.0701, "step": 4695 }, { "epoch": 0.8264462809917356, "grad_norm": 0.25226283073425293, "learning_rate": 1.5388237898719105e-06, "loss": 0.0534, "step": 4700 }, { "epoch": 0.8273254791630034, "grad_norm": 0.2534274160861969, "learning_rate": 1.5236799363334298e-06, "loss": 0.0535, "step": 4705 }, { "epoch": 0.8282046773342712, "grad_norm": 0.29621097445487976, "learning_rate": 1.508604821016698e-06, "loss": 0.0499, "step": 4710 }, { "epoch": 0.829083875505539, "grad_norm": 0.38203129172325134, "learning_rate": 1.4935985661724727e-06, "loss": 0.0539, "step": 4715 }, { "epoch": 0.8299630736768068, "grad_norm": 0.9345189929008484, "learning_rate": 1.4786612934931055e-06, "loss": 0.0578, "step": 4720 }, { "epoch": 0.8308422718480746, "grad_norm": 0.21688897907733917, "learning_rate": 1.463793124111531e-06, "loss": 0.0427, "step": 4725 }, { "epoch": 0.8317214700193424, "grad_norm": 0.22990085184574127, "learning_rate": 1.4489941786003004e-06, "loss": 0.0441, "step": 4730 }, { "epoch": 0.8326006681906102, "grad_norm": 0.4832035005092621, "learning_rate": 1.4342645769705977e-06, "loss": 0.0588, "step": 4735 }, { "epoch": 0.833479866361878, "grad_norm": 0.40238794684410095, "learning_rate": 1.419604438671267e-06, "loss": 0.0519, "step": 4740 }, { "epoch": 0.8343590645331458, "grad_norm": 0.5739127397537231, "learning_rate": 1.405013882587839e-06, "loss": 0.0637, "step": 4745 }, { "epoch": 0.8352382627044136, "grad_norm": 0.3309503495693207, "learning_rate": 1.3904930270415763e-06, "loss": 0.0506, "step": 4750 }, { "epoch": 0.8361174608756814, "grad_norm": 0.09466574341058731, "learning_rate": 1.376041989788508e-06, "loss": 0.0524, "step": 4755 }, { "epoch": 0.8369966590469492, "grad_norm": 0.27358055114746094, "learning_rate": 1.3616608880184768e-06, "loss": 0.0545, "step": 4760 }, { "epoch": 0.837875857218217, "grad_norm": 0.26218411326408386, "learning_rate": 1.3473498383541817e-06, "loss": 0.0467, "step": 4765 }, { "epoch": 0.8387550553894848, "grad_norm": 0.3839583992958069, "learning_rate": 1.3331089568502465e-06, "loss": 0.043, "step": 4770 }, { "epoch": 0.8396342535607526, "grad_norm": 0.37673670053482056, "learning_rate": 1.3189383589922667e-06, "loss": 0.0634, "step": 4775 }, { "epoch": 0.8405134517320204, "grad_norm": 0.3072827160358429, "learning_rate": 1.304838159695877e-06, "loss": 0.0684, "step": 4780 }, { "epoch": 0.8413926499032882, "grad_norm": 0.7255908250808716, "learning_rate": 1.290808473305817e-06, "loss": 0.0545, "step": 4785 }, { "epoch": 0.842271848074556, "grad_norm": 0.4259006381034851, "learning_rate": 1.2768494135950093e-06, "loss": 0.0516, "step": 4790 }, { "epoch": 0.8431510462458238, "grad_norm": 0.6573328971862793, "learning_rate": 1.2629610937636284e-06, "loss": 0.0494, "step": 4795 }, { "epoch": 0.8440302444170916, "grad_norm": 0.09125727415084839, "learning_rate": 1.2491436264381984e-06, "loss": 0.0621, "step": 4800 }, { "epoch": 0.8449094425883594, "grad_norm": 0.4536990821361542, "learning_rate": 1.2353971236706564e-06, "loss": 0.0506, "step": 4805 }, { "epoch": 0.8457886407596272, "grad_norm": 0.47011682391166687, "learning_rate": 1.2217216969374669e-06, "loss": 0.0582, "step": 4810 }, { "epoch": 0.846667838930895, "grad_norm": 0.15771393477916718, "learning_rate": 1.208117457138699e-06, "loss": 0.0612, "step": 4815 }, { "epoch": 0.8475470371021628, "grad_norm": 1.0707141160964966, "learning_rate": 1.1945845145971414e-06, "loss": 0.0597, "step": 4820 }, { "epoch": 0.8484262352734306, "grad_norm": 0.381056547164917, "learning_rate": 1.1811229790573996e-06, "loss": 0.0678, "step": 4825 }, { "epoch": 0.8493054334446984, "grad_norm": 0.33800312876701355, "learning_rate": 1.1677329596850117e-06, "loss": 0.0516, "step": 4830 }, { "epoch": 0.8501846316159662, "grad_norm": 0.19112703204154968, "learning_rate": 1.1544145650655514e-06, "loss": 0.062, "step": 4835 }, { "epoch": 0.851063829787234, "grad_norm": 0.4575953483581543, "learning_rate": 1.1411679032037636e-06, "loss": 0.0542, "step": 4840 }, { "epoch": 0.8519430279585019, "grad_norm": 0.6570442914962769, "learning_rate": 1.127993081522678e-06, "loss": 0.0567, "step": 4845 }, { "epoch": 0.8528222261297697, "grad_norm": 0.4207151234149933, "learning_rate": 1.114890206862742e-06, "loss": 0.0595, "step": 4850 }, { "epoch": 0.8537014243010375, "grad_norm": 0.183942511677742, "learning_rate": 1.1018593854809478e-06, "loss": 0.0537, "step": 4855 }, { "epoch": 0.8545806224723053, "grad_norm": 0.5328112840652466, "learning_rate": 1.0889007230499805e-06, "loss": 0.0598, "step": 4860 }, { "epoch": 0.8554598206435731, "grad_norm": 0.1587759107351303, "learning_rate": 1.0760143246573552e-06, "loss": 0.0607, "step": 4865 }, { "epoch": 0.8563390188148409, "grad_norm": 0.3230392336845398, "learning_rate": 1.0632002948045672e-06, "loss": 0.0434, "step": 4870 }, { "epoch": 0.8572182169861087, "grad_norm": 0.6485795378684998, "learning_rate": 1.0504587374062392e-06, "loss": 0.0518, "step": 4875 }, { "epoch": 0.8580974151573765, "grad_norm": 0.20032288134098053, "learning_rate": 1.037789755789289e-06, "loss": 0.047, "step": 4880 }, { "epoch": 0.8589766133286443, "grad_norm": 0.2695741355419159, "learning_rate": 1.025193452692076e-06, "loss": 0.0544, "step": 4885 }, { "epoch": 0.8598558114999121, "grad_norm": 0.4286085069179535, "learning_rate": 1.0126699302635901e-06, "loss": 0.0749, "step": 4890 }, { "epoch": 0.8607350096711799, "grad_norm": 0.21349282562732697, "learning_rate": 1.0002192900626028e-06, "loss": 0.0598, "step": 4895 }, { "epoch": 0.8616142078424477, "grad_norm": 0.16585257649421692, "learning_rate": 9.878416330568486e-07, "loss": 0.056, "step": 4900 }, { "epoch": 0.8624934060137155, "grad_norm": 0.723551332950592, "learning_rate": 9.75537059622218e-07, "loss": 0.0531, "step": 4905 }, { "epoch": 0.8633726041849833, "grad_norm": 0.7833985090255737, "learning_rate": 9.633056695419229e-07, "loss": 0.0499, "step": 4910 }, { "epoch": 0.8642518023562511, "grad_norm": 0.18503841757774353, "learning_rate": 9.511475620057132e-07, "loss": 0.0432, "step": 4915 }, { "epoch": 0.8651310005275189, "grad_norm": 0.25365808606147766, "learning_rate": 9.390628356090459e-07, "loss": 0.0677, "step": 4920 }, { "epoch": 0.8660101986987867, "grad_norm": 0.6311047673225403, "learning_rate": 9.270515883523057e-07, "loss": 0.0642, "step": 4925 }, { "epoch": 0.8668893968700545, "grad_norm": 0.13977408409118652, "learning_rate": 9.15113917639997e-07, "loss": 0.0538, "step": 4930 }, { "epoch": 0.8677685950413223, "grad_norm": 0.3203240931034088, "learning_rate": 9.032499202799627e-07, "loss": 0.0535, "step": 4935 }, { "epoch": 0.8686477932125901, "grad_norm": 0.5966960191726685, "learning_rate": 8.914596924825958e-07, "loss": 0.0485, "step": 4940 }, { "epoch": 0.8695269913838579, "grad_norm": 0.4596826732158661, "learning_rate": 8.797433298600622e-07, "loss": 0.0659, "step": 4945 }, { "epoch": 0.8704061895551257, "grad_norm": 0.757762610912323, "learning_rate": 8.681009274255136e-07, "loss": 0.0639, "step": 4950 }, { "epoch": 0.8712853877263935, "grad_norm": 0.2202068269252777, "learning_rate": 8.56532579592334e-07, "loss": 0.0493, "step": 4955 }, { "epoch": 0.8721645858976613, "grad_norm": 0.655022144317627, "learning_rate": 8.450383801733642e-07, "loss": 0.0631, "step": 4960 }, { "epoch": 0.8730437840689291, "grad_norm": 0.5713546872138977, "learning_rate": 8.336184223801424e-07, "loss": 0.0592, "step": 4965 }, { "epoch": 0.8739229822401969, "grad_norm": 0.4096655249595642, "learning_rate": 8.222727988221469e-07, "loss": 0.0644, "step": 4970 }, { "epoch": 0.8748021804114647, "grad_norm": 0.3128865957260132, "learning_rate": 8.110016015060484e-07, "loss": 0.059, "step": 4975 }, { "epoch": 0.8756813785827325, "grad_norm": 0.1819022297859192, "learning_rate": 7.998049218349624e-07, "loss": 0.0547, "step": 4980 }, { "epoch": 0.8765605767540003, "grad_norm": 0.2281774878501892, "learning_rate": 7.886828506077105e-07, "loss": 0.0584, "step": 4985 }, { "epoch": 0.8774397749252681, "grad_norm": 0.29507550597190857, "learning_rate": 7.776354780180739e-07, "loss": 0.0523, "step": 4990 }, { "epoch": 0.878318973096536, "grad_norm": 0.1599227786064148, "learning_rate": 7.666628936540776e-07, "loss": 0.0597, "step": 4995 }, { "epoch": 0.8791981712678038, "grad_norm": 0.33402958512306213, "learning_rate": 7.557651864972504e-07, "loss": 0.048, "step": 5000 }, { "epoch": 0.8800773694390716, "grad_norm": 0.8994088172912598, "learning_rate": 7.449424449219144e-07, "loss": 0.0602, "step": 5005 }, { "epoch": 0.8809565676103394, "grad_norm": 0.392220139503479, "learning_rate": 7.341947566944563e-07, "loss": 0.0438, "step": 5010 }, { "epoch": 0.8818357657816072, "grad_norm": 0.3253031373023987, "learning_rate": 7.23522208972628e-07, "loss": 0.0568, "step": 5015 }, { "epoch": 0.882714963952875, "grad_norm": 0.41497498750686646, "learning_rate": 7.129248883048278e-07, "loss": 0.0453, "step": 5020 }, { "epoch": 0.8835941621241428, "grad_norm": 0.4564589560031891, "learning_rate": 7.024028806294092e-07, "loss": 0.0559, "step": 5025 }, { "epoch": 0.8844733602954106, "grad_norm": 0.6449925303459167, "learning_rate": 6.91956271273978e-07, "loss": 0.047, "step": 5030 }, { "epoch": 0.8853525584666784, "grad_norm": 0.3050267994403839, "learning_rate": 6.815851449547029e-07, "loss": 0.0583, "step": 5035 }, { "epoch": 0.8862317566379462, "grad_norm": 0.13408750295639038, "learning_rate": 6.712895857756229e-07, "loss": 0.0434, "step": 5040 }, { "epoch": 0.887110954809214, "grad_norm": 0.2986939549446106, "learning_rate": 6.610696772279757e-07, "loss": 0.0594, "step": 5045 }, { "epoch": 0.8879901529804818, "grad_norm": 0.6257616877555847, "learning_rate": 6.509255021895111e-07, "loss": 0.0621, "step": 5050 }, { "epoch": 0.8888693511517496, "grad_norm": 0.2986117899417877, "learning_rate": 6.408571429238253e-07, "loss": 0.0505, "step": 5055 }, { "epoch": 0.8897485493230174, "grad_norm": 0.15408103168010712, "learning_rate": 6.308646810796836e-07, "loss": 0.0534, "step": 5060 }, { "epoch": 0.8906277474942852, "grad_norm": 0.4512818157672882, "learning_rate": 6.209481976903752e-07, "loss": 0.0433, "step": 5065 }, { "epoch": 0.891506945665553, "grad_norm": 0.41841113567352295, "learning_rate": 6.111077731730408e-07, "loss": 0.0697, "step": 5070 }, { "epoch": 0.8923861438368208, "grad_norm": 0.472064346075058, "learning_rate": 6.013434873280288e-07, "loss": 0.058, "step": 5075 }, { "epoch": 0.8932653420080886, "grad_norm": 0.20314988493919373, "learning_rate": 5.916554193382418e-07, "loss": 0.0456, "step": 5080 }, { "epoch": 0.8941445401793564, "grad_norm": 0.1879410743713379, "learning_rate": 5.820436477685021e-07, "loss": 0.0506, "step": 5085 }, { "epoch": 0.8950237383506242, "grad_norm": 0.4098430573940277, "learning_rate": 5.72508250564906e-07, "loss": 0.0528, "step": 5090 }, { "epoch": 0.895902936521892, "grad_norm": 0.5645979642868042, "learning_rate": 5.63049305054204e-07, "loss": 0.0689, "step": 5095 }, { "epoch": 0.8967821346931598, "grad_norm": 0.38696274161338806, "learning_rate": 5.536668879431584e-07, "loss": 0.0621, "step": 5100 }, { "epoch": 0.8976613328644276, "grad_norm": 0.18984673917293549, "learning_rate": 5.44361075317934e-07, "loss": 0.0574, "step": 5105 }, { "epoch": 0.8985405310356954, "grad_norm": 0.1238960400223732, "learning_rate": 5.35131942643472e-07, "loss": 0.0522, "step": 5110 }, { "epoch": 0.8994197292069632, "grad_norm": 0.47579723596572876, "learning_rate": 5.259795647628818e-07, "loss": 0.0437, "step": 5115 }, { "epoch": 0.900298927378231, "grad_norm": 0.0708310604095459, "learning_rate": 5.169040158968431e-07, "loss": 0.057, "step": 5120 }, { "epoch": 0.9011781255494988, "grad_norm": 0.24418748915195465, "learning_rate": 5.079053696429837e-07, "loss": 0.054, "step": 5125 }, { "epoch": 0.9020573237207666, "grad_norm": 0.4015823304653168, "learning_rate": 4.989836989753005e-07, "loss": 0.0472, "step": 5130 }, { "epoch": 0.9029365218920344, "grad_norm": 0.09180589020252228, "learning_rate": 4.901390762435588e-07, "loss": 0.0565, "step": 5135 }, { "epoch": 0.9038157200633022, "grad_norm": 0.23137515783309937, "learning_rate": 4.813715731727098e-07, "loss": 0.0594, "step": 5140 }, { "epoch": 0.90469491823457, "grad_norm": 0.42869314551353455, "learning_rate": 4.726812608623077e-07, "loss": 0.0578, "step": 5145 }, { "epoch": 0.9055741164058378, "grad_norm": 0.28821223974227905, "learning_rate": 4.640682097859317e-07, "loss": 0.0608, "step": 5150 }, { "epoch": 0.9064533145771057, "grad_norm": 0.8055384755134583, "learning_rate": 4.555324897906133e-07, "loss": 0.0635, "step": 5155 }, { "epoch": 0.9073325127483735, "grad_norm": 0.12101097404956818, "learning_rate": 4.470741700962777e-07, "loss": 0.0559, "step": 5160 }, { "epoch": 0.9082117109196413, "grad_norm": 0.4471381902694702, "learning_rate": 4.3869331929517144e-07, "loss": 0.055, "step": 5165 }, { "epoch": 0.9090909090909091, "grad_norm": 0.554818332195282, "learning_rate": 4.303900053513166e-07, "loss": 0.0565, "step": 5170 }, { "epoch": 0.9099701072621769, "grad_norm": 0.6611088514328003, "learning_rate": 4.2216429559994945e-07, "loss": 0.0556, "step": 5175 }, { "epoch": 0.9108493054334447, "grad_norm": 0.1738196760416031, "learning_rate": 4.1401625674698186e-07, "loss": 0.0574, "step": 5180 }, { "epoch": 0.9117285036047125, "grad_norm": 0.19940215349197388, "learning_rate": 4.0594595486845964e-07, "loss": 0.0598, "step": 5185 }, { "epoch": 0.9126077017759803, "grad_norm": 0.6054111123085022, "learning_rate": 3.9795345541002395e-07, "loss": 0.0467, "step": 5190 }, { "epoch": 0.9134868999472481, "grad_norm": 0.5578285455703735, "learning_rate": 3.9003882318638053e-07, "loss": 0.0577, "step": 5195 }, { "epoch": 0.9143660981185159, "grad_norm": 0.12331661581993103, "learning_rate": 3.8220212238077703e-07, "loss": 0.0632, "step": 5200 }, { "epoch": 0.9152452962897837, "grad_norm": 0.4738437831401825, "learning_rate": 3.744434165444788e-07, "loss": 0.0619, "step": 5205 }, { "epoch": 0.9161244944610515, "grad_norm": 0.35755178332328796, "learning_rate": 3.667627685962605e-07, "loss": 0.059, "step": 5210 }, { "epoch": 0.9170036926323193, "grad_norm": 0.45652657747268677, "learning_rate": 3.591602408218842e-07, "loss": 0.0543, "step": 5215 }, { "epoch": 0.9178828908035871, "grad_norm": 0.2660428583621979, "learning_rate": 3.516358948736065e-07, "loss": 0.0526, "step": 5220 }, { "epoch": 0.9187620889748549, "grad_norm": 0.21268558502197266, "learning_rate": 3.441897917696679e-07, "loss": 0.0644, "step": 5225 }, { "epoch": 0.9196412871461227, "grad_norm": 0.6168246865272522, "learning_rate": 3.368219918938076e-07, "loss": 0.0512, "step": 5230 }, { "epoch": 0.9205204853173905, "grad_norm": 0.5585867762565613, "learning_rate": 3.29532554994767e-07, "loss": 0.0612, "step": 5235 }, { "epoch": 0.9213996834886583, "grad_norm": 0.6709286570549011, "learning_rate": 3.223215401858115e-07, "loss": 0.047, "step": 5240 }, { "epoch": 0.9222788816599261, "grad_norm": 0.2702469527721405, "learning_rate": 3.151890059442386e-07, "loss": 0.0445, "step": 5245 }, { "epoch": 0.9231580798311939, "grad_norm": 0.6882309913635254, "learning_rate": 3.081350101109215e-07, "loss": 0.0513, "step": 5250 }, { "epoch": 0.9240372780024617, "grad_norm": 0.3388536274433136, "learning_rate": 3.0115960988982504e-07, "loss": 0.0525, "step": 5255 }, { "epoch": 0.9249164761737295, "grad_norm": 0.3622197210788727, "learning_rate": 2.942628618475507e-07, "loss": 0.0565, "step": 5260 }, { "epoch": 0.9257956743449973, "grad_norm": 0.519477903842926, "learning_rate": 2.8744482191287113e-07, "loss": 0.0605, "step": 5265 }, { "epoch": 0.9266748725162651, "grad_norm": 0.3441823422908783, "learning_rate": 2.8070554537628413e-07, "loss": 0.0545, "step": 5270 }, { "epoch": 0.9275540706875329, "grad_norm": 0.2639375329017639, "learning_rate": 2.7404508688955835e-07, "loss": 0.0402, "step": 5275 }, { "epoch": 0.9284332688588007, "grad_norm": 0.35200339555740356, "learning_rate": 2.674635004652926e-07, "loss": 0.057, "step": 5280 }, { "epoch": 0.9293124670300685, "grad_norm": 0.30402621626853943, "learning_rate": 2.609608394764751e-07, "loss": 0.0508, "step": 5285 }, { "epoch": 0.9301916652013363, "grad_norm": 0.6213688254356384, "learning_rate": 2.5453715665605725e-07, "loss": 0.0652, "step": 5290 }, { "epoch": 0.9310708633726041, "grad_norm": 0.24661576747894287, "learning_rate": 2.4819250409651605e-07, "loss": 0.0398, "step": 5295 }, { "epoch": 0.9319500615438719, "grad_norm": 0.13059848546981812, "learning_rate": 2.419269332494434e-07, "loss": 0.0586, "step": 5300 }, { "epoch": 0.9328292597151397, "grad_norm": 0.4985171854496002, "learning_rate": 2.3574049492511852e-07, "loss": 0.0649, "step": 5305 }, { "epoch": 0.9337084578864076, "grad_norm": 0.5534043312072754, "learning_rate": 2.296332392921019e-07, "loss": 0.0564, "step": 5310 }, { "epoch": 0.9345876560576754, "grad_norm": 0.386444091796875, "learning_rate": 2.2360521587682316e-07, "loss": 0.0646, "step": 5315 }, { "epoch": 0.9354668542289432, "grad_norm": 0.22402134537696838, "learning_rate": 2.176564735631881e-07, "loss": 0.0614, "step": 5320 }, { "epoch": 0.936346052400211, "grad_norm": 0.6845077872276306, "learning_rate": 2.1178706059217346e-07, "loss": 0.0639, "step": 5325 }, { "epoch": 0.9372252505714788, "grad_norm": 0.5488569736480713, "learning_rate": 2.0599702456144178e-07, "loss": 0.0518, "step": 5330 }, { "epoch": 0.9381044487427466, "grad_norm": 0.6429914832115173, "learning_rate": 2.002864124249504e-07, "loss": 0.0618, "step": 5335 }, { "epoch": 0.9389836469140144, "grad_norm": 0.8213487267494202, "learning_rate": 1.9465527049257416e-07, "loss": 0.0623, "step": 5340 }, { "epoch": 0.9398628450852822, "grad_norm": 0.6064088940620422, "learning_rate": 1.8910364442972896e-07, "loss": 0.068, "step": 5345 }, { "epoch": 0.94074204325655, "grad_norm": 0.9644356966018677, "learning_rate": 1.8363157925700316e-07, "loss": 0.0581, "step": 5350 }, { "epoch": 0.9416212414278178, "grad_norm": 0.7088313102722168, "learning_rate": 1.78239119349789e-07, "loss": 0.0469, "step": 5355 }, { "epoch": 0.9425004395990856, "grad_norm": 0.14932291209697723, "learning_rate": 1.7292630843792292e-07, "loss": 0.0584, "step": 5360 }, { "epoch": 0.9433796377703534, "grad_norm": 0.18209953606128693, "learning_rate": 1.6769318960533465e-07, "loss": 0.0526, "step": 5365 }, { "epoch": 0.9442588359416212, "grad_norm": 0.5487361550331116, "learning_rate": 1.625398052896965e-07, "loss": 0.0551, "step": 5370 }, { "epoch": 0.945138034112889, "grad_norm": 0.1913604438304901, "learning_rate": 1.574661972820779e-07, "loss": 0.0556, "step": 5375 }, { "epoch": 0.9460172322841568, "grad_norm": 0.44386959075927734, "learning_rate": 1.5247240672660258e-07, "loss": 0.0633, "step": 5380 }, { "epoch": 0.9468964304554246, "grad_norm": 0.3759801685810089, "learning_rate": 1.4755847412012635e-07, "loss": 0.0557, "step": 5385 }, { "epoch": 0.9477756286266924, "grad_norm": 0.19469892978668213, "learning_rate": 1.427244393118965e-07, "loss": 0.0591, "step": 5390 }, { "epoch": 0.9486548267979603, "grad_norm": 0.5927003026008606, "learning_rate": 1.379703415032374e-07, "loss": 0.045, "step": 5395 }, { "epoch": 0.9495340249692281, "grad_norm": 0.13490422070026398, "learning_rate": 1.3329621924722536e-07, "loss": 0.0437, "step": 5400 }, { "epoch": 0.9504132231404959, "grad_norm": 0.3926790654659271, "learning_rate": 1.287021104483821e-07, "loss": 0.053, "step": 5405 }, { "epoch": 0.9512924213117637, "grad_norm": 0.5249255299568176, "learning_rate": 1.2418805236236287e-07, "loss": 0.06, "step": 5410 }, { "epoch": 0.9521716194830315, "grad_norm": 1.2005724906921387, "learning_rate": 1.1975408159566105e-07, "loss": 0.0479, "step": 5415 }, { "epoch": 0.9530508176542993, "grad_norm": 0.2839159667491913, "learning_rate": 1.1540023410529844e-07, "loss": 0.0623, "step": 5420 }, { "epoch": 0.9539300158255671, "grad_norm": 0.5383673906326294, "learning_rate": 1.1112654519855104e-07, "loss": 0.0596, "step": 5425 }, { "epoch": 0.9548092139968349, "grad_norm": 0.595245361328125, "learning_rate": 1.0693304953264705e-07, "loss": 0.0661, "step": 5430 }, { "epoch": 0.9556884121681027, "grad_norm": 0.5355046391487122, "learning_rate": 1.0281978111449375e-07, "loss": 0.0715, "step": 5435 }, { "epoch": 0.9565676103393705, "grad_norm": 0.31870976090431213, "learning_rate": 9.87867733004011e-08, "loss": 0.053, "step": 5440 }, { "epoch": 0.9574468085106383, "grad_norm": 0.6525245308876038, "learning_rate": 9.483405879581187e-08, "loss": 0.0519, "step": 5445 }, { "epoch": 0.9583260066819062, "grad_norm": 0.6109974384307861, "learning_rate": 9.096166965502972e-08, "loss": 0.0583, "step": 5450 }, { "epoch": 0.959205204853174, "grad_norm": 0.22431720793247223, "learning_rate": 8.71696372809705e-08, "loss": 0.0486, "step": 5455 }, { "epoch": 0.9600844030244418, "grad_norm": 0.5329923033714294, "learning_rate": 8.345799242489905e-08, "loss": 0.0525, "step": 5460 }, { "epoch": 0.9609636011957096, "grad_norm": 0.4853318929672241, "learning_rate": 7.982676518618059e-08, "loss": 0.0597, "step": 5465 }, { "epoch": 0.9618427993669774, "grad_norm": 0.3214046061038971, "learning_rate": 7.627598501204092e-08, "loss": 0.05, "step": 5470 }, { "epoch": 0.9627219975382452, "grad_norm": 0.18154819309711456, "learning_rate": 7.28056806973243e-08, "loss": 0.0484, "step": 5475 }, { "epoch": 0.963601195709513, "grad_norm": 0.14486095309257507, "learning_rate": 6.941588038426039e-08, "loss": 0.0586, "step": 5480 }, { "epoch": 0.9644803938807808, "grad_norm": 0.6655594706535339, "learning_rate": 6.610661156223664e-08, "loss": 0.0641, "step": 5485 }, { "epoch": 0.9653595920520486, "grad_norm": 0.22004252672195435, "learning_rate": 6.287790106757396e-08, "loss": 0.0483, "step": 5490 }, { "epoch": 0.9662387902233164, "grad_norm": 0.20186270773410797, "learning_rate": 5.972977508331368e-08, "loss": 0.0528, "step": 5495 }, { "epoch": 0.9671179883945842, "grad_norm": 0.23505160212516785, "learning_rate": 5.666225913899648e-08, "loss": 0.0663, "step": 5500 }, { "epoch": 0.967997186565852, "grad_norm": 0.423562616109848, "learning_rate": 5.367537811046486e-08, "loss": 0.0516, "step": 5505 }, { "epoch": 0.9688763847371198, "grad_norm": 0.23930394649505615, "learning_rate": 5.0769156219656614e-08, "loss": 0.0572, "step": 5510 }, { "epoch": 0.9697555829083876, "grad_norm": 0.1267559826374054, "learning_rate": 4.7943617034407196e-08, "loss": 0.0411, "step": 5515 }, { "epoch": 0.9706347810796554, "grad_norm": 0.5073260068893433, "learning_rate": 4.51987834682599e-08, "loss": 0.052, "step": 5520 }, { "epoch": 0.9715139792509232, "grad_norm": 0.3135651648044586, "learning_rate": 4.253467778028486e-08, "loss": 0.0547, "step": 5525 }, { "epoch": 0.972393177422191, "grad_norm": 0.5157844424247742, "learning_rate": 3.9951321574890345e-08, "loss": 0.0463, "step": 5530 }, { "epoch": 0.9732723755934588, "grad_norm": 0.16779236495494843, "learning_rate": 3.744873580165176e-08, "loss": 0.059, "step": 5535 }, { "epoch": 0.9741515737647266, "grad_norm": 0.26359498500823975, "learning_rate": 3.502694075514179e-08, "loss": 0.0567, "step": 5540 }, { "epoch": 0.9750307719359944, "grad_norm": 0.6399820446968079, "learning_rate": 3.26859560747661e-08, "loss": 0.0544, "step": 5545 }, { "epoch": 0.9759099701072622, "grad_norm": 0.49394690990448, "learning_rate": 3.042580074460344e-08, "loss": 0.0649, "step": 5550 }, { "epoch": 0.97678916827853, "grad_norm": 0.11140932142734528, "learning_rate": 2.8246493093250226e-08, "loss": 0.0572, "step": 5555 }, { "epoch": 0.9776683664497978, "grad_norm": 0.803503155708313, "learning_rate": 2.6148050793676217e-08, "loss": 0.0583, "step": 5560 }, { "epoch": 0.9785475646210656, "grad_norm": 0.20883022248744965, "learning_rate": 2.4130490863075727e-08, "loss": 0.0631, "step": 5565 }, { "epoch": 0.9794267627923334, "grad_norm": 0.5355504155158997, "learning_rate": 2.2193829662731093e-08, "loss": 0.0548, "step": 5570 }, { "epoch": 0.9803059609636012, "grad_norm": 0.5118641257286072, "learning_rate": 2.033808289788608e-08, "loss": 0.0551, "step": 5575 }, { "epoch": 0.981185159134869, "grad_norm": 0.4112605154514313, "learning_rate": 1.856326561760824e-08, "loss": 0.0567, "step": 5580 }, { "epoch": 0.9820643573061368, "grad_norm": 0.36727771162986755, "learning_rate": 1.686939221467565e-08, "loss": 0.0524, "step": 5585 }, { "epoch": 0.9829435554774046, "grad_norm": 0.32696032524108887, "learning_rate": 1.5256476425455912e-08, "loss": 0.0546, "step": 5590 }, { "epoch": 0.9838227536486724, "grad_norm": 0.14299306273460388, "learning_rate": 1.37245313297929e-08, "loss": 0.0603, "step": 5595 }, { "epoch": 0.9847019518199402, "grad_norm": 0.1304006576538086, "learning_rate": 1.2273569350909065e-08, "loss": 0.048, "step": 5600 }, { "epoch": 0.985581149991208, "grad_norm": 0.3255922496318817, "learning_rate": 1.09036022552933e-08, "loss": 0.0624, "step": 5605 }, { "epoch": 0.9864603481624759, "grad_norm": 0.14127831161022186, "learning_rate": 9.614641152615457e-09, "loss": 0.0537, "step": 5610 }, { "epoch": 0.9873395463337437, "grad_norm": 0.49166056513786316, "learning_rate": 8.406696495627531e-09, "loss": 0.0555, "step": 5615 }, { "epoch": 0.9882187445050115, "grad_norm": 0.4225272238254547, "learning_rate": 7.279778080089284e-09, "loss": 0.0508, "step": 5620 }, { "epoch": 0.9890979426762793, "grad_norm": 0.43666836619377136, "learning_rate": 6.233895044677196e-09, "loss": 0.0495, "step": 5625 }, { "epoch": 0.9899771408475471, "grad_norm": 0.23192152380943298, "learning_rate": 5.269055870920081e-09, "loss": 0.0601, "step": 5630 }, { "epoch": 0.9908563390188149, "grad_norm": 0.09601382911205292, "learning_rate": 4.385268383123586e-09, "loss": 0.0571, "step": 5635 }, { "epoch": 0.9917355371900827, "grad_norm": 0.22825530171394348, "learning_rate": 3.5825397483113532e-09, "loss": 0.0647, "step": 5640 }, { "epoch": 0.9926147353613505, "grad_norm": 0.18555951118469238, "learning_rate": 2.8608764761639542e-09, "loss": 0.0468, "step": 5645 }, { "epoch": 0.9934939335326183, "grad_norm": 0.15407155454158783, "learning_rate": 2.220284418968932e-09, "loss": 0.0543, "step": 5650 }, { "epoch": 0.9943731317038861, "grad_norm": 0.30971744656562805, "learning_rate": 1.6607687715675113e-09, "loss": 0.0788, "step": 5655 }, { "epoch": 0.9952523298751539, "grad_norm": 0.4101414680480957, "learning_rate": 1.1823340713212894e-09, "loss": 0.0527, "step": 5660 }, { "epoch": 0.9961315280464217, "grad_norm": 0.29319489002227783, "learning_rate": 7.849841980667183e-10, "loss": 0.0472, "step": 5665 }, { "epoch": 0.9970107262176895, "grad_norm": 0.3551650941371918, "learning_rate": 4.687223740917901e-10, "loss": 0.0683, "step": 5670 }, { "epoch": 0.9978899243889573, "grad_norm": 0.2584504783153534, "learning_rate": 2.335511641005095e-10, "loss": 0.0501, "step": 5675 }, { "epoch": 0.9987691225602251, "grad_norm": 0.5419056415557861, "learning_rate": 7.947247520179169e-11, "loss": 0.0518, "step": 5680 }, { "epoch": 0.9996483207314929, "grad_norm": 0.7458012700080872, "learning_rate": 6.487556887257995e-12, "loss": 0.0659, "step": 5685 }, { "epoch": 1.0, "step": 5687, "total_flos": 0.0, "train_loss": 0.059674585497827885, "train_runtime": 13149.4885, "train_samples_per_second": 13.837, "train_steps_per_second": 0.432 } ], "logging_steps": 5, "max_steps": 5687, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": false, "should_training_stop": false }, "attributes": {} } }, "total_flos": 0.0, "train_batch_size": 4, "trial_name": null, "trial_params": null }