diff --git "a/checkpoint-20000/trainer_state.json" "b/checkpoint-20000/trainer_state.json" new file mode 100644--- /dev/null +++ "b/checkpoint-20000/trainer_state.json" @@ -0,0 +1,28673 @@ +{ + "best_metric": 0.038246750831604004, + "best_model_checkpoint": "saves/qwen-8b/lora/sft/checkpoint-20000", + "epoch": 2.4024024024024024, + "eval_steps": 250, + "global_step": 20000, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.0006006006006006006, + "grad_norm": 0.1276276707649231, + "learning_rate": 9.999999015013712e-05, + "loss": 0.7206, + "step": 5 + }, + { + "epoch": 0.0012012012012012011, + "grad_norm": 0.11029542982578278, + "learning_rate": 9.999996052155599e-05, + "loss": 0.7604, + "step": 10 + }, + { + "epoch": 0.0018018018018018018, + "grad_norm": 0.11351778358221054, + "learning_rate": 9.999991111422876e-05, + "loss": 0.6428, + "step": 15 + }, + { + "epoch": 0.0024024024024024023, + "grad_norm": 0.08558386564254761, + "learning_rate": 9.9999841928175e-05, + "loss": 0.6202, + "step": 20 + }, + { + "epoch": 0.003003003003003003, + "grad_norm": 0.10890918225049973, + "learning_rate": 9.999975296342206e-05, + "loss": 0.5867, + "step": 25 + }, + { + "epoch": 0.0036036036036036037, + "grad_norm": 0.1286676824092865, + "learning_rate": 9.999964422000514e-05, + "loss": 0.7121, + "step": 30 + }, + { + "epoch": 0.004204204204204204, + "grad_norm": 0.12503540515899658, + "learning_rate": 9.999951569796724e-05, + "loss": 0.7087, + "step": 35 + }, + { + "epoch": 0.004804804804804805, + "grad_norm": 0.12578998506069183, + "learning_rate": 9.999936739735923e-05, + "loss": 0.5874, + "step": 40 + }, + { + "epoch": 0.005405405405405406, + "grad_norm": 0.1145617812871933, + "learning_rate": 9.999919931823976e-05, + "loss": 0.5829, + "step": 45 + }, + { + "epoch": 0.006006006006006006, + "grad_norm": 0.09650918841362, + "learning_rate": 9.999901146067532e-05, + "loss": 0.5985, + "step": 50 + }, + { + "epoch": 0.006606606606606606, + "grad_norm": 0.10375821590423584, + "learning_rate": 9.999880382474021e-05, + "loss": 0.5882, + "step": 55 + }, + { + "epoch": 0.007207207207207207, + "grad_norm": 0.10667344927787781, + "learning_rate": 9.999857641051658e-05, + "loss": 0.5403, + "step": 60 + }, + { + "epoch": 0.007807807807807808, + "grad_norm": 0.1100311204791069, + "learning_rate": 9.999832921809437e-05, + "loss": 0.6201, + "step": 65 + }, + { + "epoch": 0.008408408408408409, + "grad_norm": 0.10455172508955002, + "learning_rate": 9.999806224757138e-05, + "loss": 0.6557, + "step": 70 + }, + { + "epoch": 0.009009009009009009, + "grad_norm": 0.11311007291078568, + "learning_rate": 9.999777549905322e-05, + "loss": 0.6266, + "step": 75 + }, + { + "epoch": 0.00960960960960961, + "grad_norm": 0.1126667782664299, + "learning_rate": 9.999746897265331e-05, + "loss": 0.5493, + "step": 80 + }, + { + "epoch": 0.01021021021021021, + "grad_norm": 0.1300688236951828, + "learning_rate": 9.999714266849292e-05, + "loss": 0.5692, + "step": 85 + }, + { + "epoch": 0.010810810810810811, + "grad_norm": 0.08558473736047745, + "learning_rate": 9.999679658670111e-05, + "loss": 0.5598, + "step": 90 + }, + { + "epoch": 0.011411411411411412, + "grad_norm": 0.10892773419618607, + "learning_rate": 9.999643072741478e-05, + "loss": 0.6837, + "step": 95 + }, + { + "epoch": 0.012012012012012012, + "grad_norm": 0.09890025109052658, + "learning_rate": 9.999604509077867e-05, + "loss": 0.6372, + "step": 100 + }, + { + "epoch": 0.012612612612612612, + 
"grad_norm": 0.11107968538999557, + "learning_rate": 9.999563967694532e-05, + "loss": 0.6601, + "step": 105 + }, + { + "epoch": 0.013213213213213212, + "grad_norm": 0.10226837545633316, + "learning_rate": 9.99952144860751e-05, + "loss": 0.5446, + "step": 110 + }, + { + "epoch": 0.013813813813813814, + "grad_norm": 0.0968766137957573, + "learning_rate": 9.999476951833621e-05, + "loss": 0.5698, + "step": 115 + }, + { + "epoch": 0.014414414414414415, + "grad_norm": 0.10840737074613571, + "learning_rate": 9.999430477390466e-05, + "loss": 0.6412, + "step": 120 + }, + { + "epoch": 0.015015015015015015, + "grad_norm": 0.10740071535110474, + "learning_rate": 9.999382025296431e-05, + "loss": 0.6262, + "step": 125 + }, + { + "epoch": 0.015615615615615615, + "grad_norm": 0.10787483304738998, + "learning_rate": 9.99933159557068e-05, + "loss": 0.6068, + "step": 130 + }, + { + "epoch": 0.016216216216216217, + "grad_norm": 0.10583048313856125, + "learning_rate": 9.999279188233164e-05, + "loss": 0.6107, + "step": 135 + }, + { + "epoch": 0.016816816816816817, + "grad_norm": 0.11268199980258942, + "learning_rate": 9.999224803304612e-05, + "loss": 0.5476, + "step": 140 + }, + { + "epoch": 0.017417417417417418, + "grad_norm": 0.11169681698083878, + "learning_rate": 9.999168440806538e-05, + "loss": 0.6548, + "step": 145 + }, + { + "epoch": 0.018018018018018018, + "grad_norm": 0.1366148591041565, + "learning_rate": 9.999110100761237e-05, + "loss": 0.5648, + "step": 150 + }, + { + "epoch": 0.018618618618618618, + "grad_norm": 0.11121531575918198, + "learning_rate": 9.99904978319179e-05, + "loss": 0.5888, + "step": 155 + }, + { + "epoch": 0.01921921921921922, + "grad_norm": 0.13757368922233582, + "learning_rate": 9.998987488122054e-05, + "loss": 0.6201, + "step": 160 + }, + { + "epoch": 0.01981981981981982, + "grad_norm": 0.12323565781116486, + "learning_rate": 9.998923215576672e-05, + "loss": 0.7153, + "step": 165 + }, + { + "epoch": 0.02042042042042042, + "grad_norm": 0.10789189487695694, + "learning_rate": 9.998856965581069e-05, + "loss": 0.61, + "step": 170 + }, + { + "epoch": 0.021021021021021023, + "grad_norm": 0.10350632667541504, + "learning_rate": 9.998788738161454e-05, + "loss": 0.5933, + "step": 175 + }, + { + "epoch": 0.021621621621621623, + "grad_norm": 0.11346318572759628, + "learning_rate": 9.998718533344811e-05, + "loss": 0.5772, + "step": 180 + }, + { + "epoch": 0.022222222222222223, + "grad_norm": 0.09915696084499359, + "learning_rate": 9.998646351158915e-05, + "loss": 0.5585, + "step": 185 + }, + { + "epoch": 0.022822822822822823, + "grad_norm": 0.09726600348949432, + "learning_rate": 9.998572191632319e-05, + "loss": 0.6143, + "step": 190 + }, + { + "epoch": 0.023423423423423424, + "grad_norm": 0.10023459047079086, + "learning_rate": 9.998496054794358e-05, + "loss": 0.5735, + "step": 195 + }, + { + "epoch": 0.024024024024024024, + "grad_norm": 0.09451427310705185, + "learning_rate": 9.998417940675148e-05, + "loss": 0.538, + "step": 200 + }, + { + "epoch": 0.024624624624624624, + "grad_norm": 0.10174587368965149, + "learning_rate": 9.998337849305594e-05, + "loss": 0.6243, + "step": 205 + }, + { + "epoch": 0.025225225225225224, + "grad_norm": 0.08829119056463242, + "learning_rate": 9.998255780717375e-05, + "loss": 0.553, + "step": 210 + }, + { + "epoch": 0.025825825825825825, + "grad_norm": 0.10895121842622757, + "learning_rate": 9.998171734942956e-05, + "loss": 0.5862, + "step": 215 + }, + { + "epoch": 0.026426426426426425, + "grad_norm": 0.11514592170715332, + "learning_rate": 
9.998085712015582e-05, + "loss": 0.6394, + "step": 220 + }, + { + "epoch": 0.02702702702702703, + "grad_norm": 0.10050612688064575, + "learning_rate": 9.997997711969283e-05, + "loss": 0.5616, + "step": 225 + }, + { + "epoch": 0.02762762762762763, + "grad_norm": 0.11400890350341797, + "learning_rate": 9.997907734838868e-05, + "loss": 0.5646, + "step": 230 + }, + { + "epoch": 0.02822822822822823, + "grad_norm": 0.10154697299003601, + "learning_rate": 9.997815780659931e-05, + "loss": 0.6241, + "step": 235 + }, + { + "epoch": 0.02882882882882883, + "grad_norm": 0.09731791168451309, + "learning_rate": 9.997721849468848e-05, + "loss": 0.5987, + "step": 240 + }, + { + "epoch": 0.02942942942942943, + "grad_norm": 0.09646134078502655, + "learning_rate": 9.997625941302775e-05, + "loss": 0.6354, + "step": 245 + }, + { + "epoch": 0.03003003003003003, + "grad_norm": 0.11228446662425995, + "learning_rate": 9.99752805619965e-05, + "loss": 0.6073, + "step": 250 + }, + { + "epoch": 0.03003003003003003, + "eval_loss": 0.5788276791572571, + "eval_runtime": 35.6288, + "eval_samples_per_second": 22.454, + "eval_steps_per_second": 5.613, + "step": 250 + }, + { + "epoch": 0.03063063063063063, + "grad_norm": 0.10133231431245804, + "learning_rate": 9.997428194198196e-05, + "loss": 0.548, + "step": 255 + }, + { + "epoch": 0.03123123123123123, + "grad_norm": 0.10743261128664017, + "learning_rate": 9.99732635533791e-05, + "loss": 0.6082, + "step": 260 + }, + { + "epoch": 0.03183183183183183, + "grad_norm": 0.10012677311897278, + "learning_rate": 9.997222539659085e-05, + "loss": 0.6075, + "step": 265 + }, + { + "epoch": 0.032432432432432434, + "grad_norm": 0.10682181268930435, + "learning_rate": 9.997116747202783e-05, + "loss": 0.568, + "step": 270 + }, + { + "epoch": 0.03303303303303303, + "grad_norm": 0.12774191796779633, + "learning_rate": 9.997008978010854e-05, + "loss": 0.5942, + "step": 275 + }, + { + "epoch": 0.033633633633633635, + "grad_norm": 0.1001734733581543, + "learning_rate": 9.996899232125929e-05, + "loss": 0.5681, + "step": 280 + }, + { + "epoch": 0.03423423423423423, + "grad_norm": 0.09887179732322693, + "learning_rate": 9.99678750959142e-05, + "loss": 0.6101, + "step": 285 + }, + { + "epoch": 0.034834834834834835, + "grad_norm": 0.1097242683172226, + "learning_rate": 9.996673810451525e-05, + "loss": 0.5765, + "step": 290 + }, + { + "epoch": 0.03543543543543543, + "grad_norm": 0.119800865650177, + "learning_rate": 9.996558134751214e-05, + "loss": 0.5942, + "step": 295 + }, + { + "epoch": 0.036036036036036036, + "grad_norm": 0.11146628856658936, + "learning_rate": 9.996440482536252e-05, + "loss": 0.581, + "step": 300 + }, + { + "epoch": 0.03663663663663664, + "grad_norm": 0.10826515406370163, + "learning_rate": 9.996320853853176e-05, + "loss": 0.5716, + "step": 305 + }, + { + "epoch": 0.037237237237237236, + "grad_norm": 0.10633689910173416, + "learning_rate": 9.996199248749308e-05, + "loss": 0.5926, + "step": 310 + }, + { + "epoch": 0.03783783783783784, + "grad_norm": 0.13376714289188385, + "learning_rate": 9.996075667272753e-05, + "loss": 0.6925, + "step": 315 + }, + { + "epoch": 0.03843843843843844, + "grad_norm": 0.1069822907447815, + "learning_rate": 9.995950109472398e-05, + "loss": 0.6107, + "step": 320 + }, + { + "epoch": 0.03903903903903904, + "grad_norm": 0.09809288382530212, + "learning_rate": 9.995822575397908e-05, + "loss": 0.5597, + "step": 325 + }, + { + "epoch": 0.03963963963963964, + "grad_norm": 0.11523476243019104, + "learning_rate": 9.995693065099732e-05, + "loss": 0.581, + "step": 
330 + }, + { + "epoch": 0.04024024024024024, + "grad_norm": 0.10354334861040115, + "learning_rate": 9.995561578629105e-05, + "loss": 0.5803, + "step": 335 + }, + { + "epoch": 0.04084084084084084, + "grad_norm": 0.10784221440553665, + "learning_rate": 9.995428116038035e-05, + "loss": 0.5614, + "step": 340 + }, + { + "epoch": 0.04144144144144144, + "grad_norm": 0.10870096832513809, + "learning_rate": 9.99529267737932e-05, + "loss": 0.636, + "step": 345 + }, + { + "epoch": 0.042042042042042045, + "grad_norm": 0.10280580818653107, + "learning_rate": 9.995155262706534e-05, + "loss": 0.6333, + "step": 350 + }, + { + "epoch": 0.04264264264264264, + "grad_norm": 0.12184490263462067, + "learning_rate": 9.995015872074036e-05, + "loss": 0.5996, + "step": 355 + }, + { + "epoch": 0.043243243243243246, + "grad_norm": 0.10824449360370636, + "learning_rate": 9.994874505536967e-05, + "loss": 0.5483, + "step": 360 + }, + { + "epoch": 0.04384384384384384, + "grad_norm": 0.1244700476527214, + "learning_rate": 9.994731163151244e-05, + "loss": 0.524, + "step": 365 + }, + { + "epoch": 0.044444444444444446, + "grad_norm": 0.10343901067972183, + "learning_rate": 9.994585844973574e-05, + "loss": 0.5635, + "step": 370 + }, + { + "epoch": 0.04504504504504504, + "grad_norm": 0.13498111069202423, + "learning_rate": 9.994438551061437e-05, + "loss": 0.6507, + "step": 375 + }, + { + "epoch": 0.04564564564564565, + "grad_norm": 0.10724303871393204, + "learning_rate": 9.994289281473104e-05, + "loss": 0.6062, + "step": 380 + }, + { + "epoch": 0.04624624624624624, + "grad_norm": 0.1190914437174797, + "learning_rate": 9.994138036267617e-05, + "loss": 0.606, + "step": 385 + }, + { + "epoch": 0.04684684684684685, + "grad_norm": 0.10737879574298859, + "learning_rate": 9.993984815504809e-05, + "loss": 0.5966, + "step": 390 + }, + { + "epoch": 0.04744744744744745, + "grad_norm": 0.10683471709489822, + "learning_rate": 9.993829619245288e-05, + "loss": 0.6322, + "step": 395 + }, + { + "epoch": 0.04804804804804805, + "grad_norm": 0.12433134764432907, + "learning_rate": 9.993672447550446e-05, + "loss": 0.6776, + "step": 400 + }, + { + "epoch": 0.04864864864864865, + "grad_norm": 0.10516783595085144, + "learning_rate": 9.993513300482457e-05, + "loss": 0.5399, + "step": 405 + }, + { + "epoch": 0.04924924924924925, + "grad_norm": 0.10631490498781204, + "learning_rate": 9.993352178104275e-05, + "loss": 0.5712, + "step": 410 + }, + { + "epoch": 0.04984984984984985, + "grad_norm": 0.10908538848161697, + "learning_rate": 9.993189080479637e-05, + "loss": 0.6396, + "step": 415 + }, + { + "epoch": 0.05045045045045045, + "grad_norm": 0.13340666890144348, + "learning_rate": 9.993024007673059e-05, + "loss": 0.6671, + "step": 420 + }, + { + "epoch": 0.05105105105105105, + "grad_norm": 0.12529441714286804, + "learning_rate": 9.992856959749841e-05, + "loss": 0.5756, + "step": 425 + }, + { + "epoch": 0.05165165165165165, + "grad_norm": 0.10432100296020508, + "learning_rate": 9.992687936776063e-05, + "loss": 0.6271, + "step": 430 + }, + { + "epoch": 0.05225225225225225, + "grad_norm": 0.11412281543016434, + "learning_rate": 9.992516938818585e-05, + "loss": 0.5768, + "step": 435 + }, + { + "epoch": 0.05285285285285285, + "grad_norm": 0.1088799387216568, + "learning_rate": 9.992343965945051e-05, + "loss": 0.631, + "step": 440 + }, + { + "epoch": 0.05345345345345345, + "grad_norm": 0.11275801062583923, + "learning_rate": 9.992169018223884e-05, + "loss": 0.6116, + "step": 445 + }, + { + "epoch": 0.05405405405405406, + "grad_norm": 0.10910513252019882, + 
"learning_rate": 9.99199209572429e-05, + "loss": 0.5581, + "step": 450 + }, + { + "epoch": 0.054654654654654654, + "grad_norm": 0.09757370501756668, + "learning_rate": 9.991813198516252e-05, + "loss": 0.5765, + "step": 455 + }, + { + "epoch": 0.05525525525525526, + "grad_norm": 0.1048395037651062, + "learning_rate": 9.991632326670542e-05, + "loss": 0.5965, + "step": 460 + }, + { + "epoch": 0.055855855855855854, + "grad_norm": 0.11976464092731476, + "learning_rate": 9.991449480258704e-05, + "loss": 0.5559, + "step": 465 + }, + { + "epoch": 0.05645645645645646, + "grad_norm": 0.10895799845457077, + "learning_rate": 9.99126465935307e-05, + "loss": 0.6067, + "step": 470 + }, + { + "epoch": 0.057057057057057055, + "grad_norm": 0.12459875643253326, + "learning_rate": 9.99107786402675e-05, + "loss": 0.5871, + "step": 475 + }, + { + "epoch": 0.05765765765765766, + "grad_norm": 0.09745662659406662, + "learning_rate": 9.990889094353637e-05, + "loss": 0.5714, + "step": 480 + }, + { + "epoch": 0.058258258258258255, + "grad_norm": 0.09854046255350113, + "learning_rate": 9.9906983504084e-05, + "loss": 0.5477, + "step": 485 + }, + { + "epoch": 0.05885885885885886, + "grad_norm": 0.10370152443647385, + "learning_rate": 9.990505632266498e-05, + "loss": 0.5688, + "step": 490 + }, + { + "epoch": 0.05945945945945946, + "grad_norm": 0.1205800324678421, + "learning_rate": 9.990310940004159e-05, + "loss": 0.6448, + "step": 495 + }, + { + "epoch": 0.06006006006006006, + "grad_norm": 0.12087002396583557, + "learning_rate": 9.990114273698406e-05, + "loss": 0.5661, + "step": 500 + }, + { + "epoch": 0.06006006006006006, + "eval_loss": 0.5705558657646179, + "eval_runtime": 35.6595, + "eval_samples_per_second": 22.434, + "eval_steps_per_second": 5.609, + "step": 500 + }, + { + "epoch": 0.06066066066066066, + "grad_norm": 0.10749085247516632, + "learning_rate": 9.989915633427028e-05, + "loss": 0.5849, + "step": 505 + }, + { + "epoch": 0.06126126126126126, + "grad_norm": 0.10628439486026764, + "learning_rate": 9.989715019268606e-05, + "loss": 0.5916, + "step": 510 + }, + { + "epoch": 0.061861861861861864, + "grad_norm": 0.10315616428852081, + "learning_rate": 9.989512431302497e-05, + "loss": 0.5358, + "step": 515 + }, + { + "epoch": 0.06246246246246246, + "grad_norm": 0.14266809821128845, + "learning_rate": 9.989307869608841e-05, + "loss": 0.6284, + "step": 520 + }, + { + "epoch": 0.06306306306306306, + "grad_norm": 0.14430400729179382, + "learning_rate": 9.989101334268555e-05, + "loss": 0.5701, + "step": 525 + }, + { + "epoch": 0.06366366366366366, + "grad_norm": 0.12293283641338348, + "learning_rate": 9.988892825363343e-05, + "loss": 0.5759, + "step": 530 + }, + { + "epoch": 0.06426426426426426, + "grad_norm": 0.1240479052066803, + "learning_rate": 9.988682342975682e-05, + "loss": 0.6167, + "step": 535 + }, + { + "epoch": 0.06486486486486487, + "grad_norm": 0.12589618563652039, + "learning_rate": 9.988469887188837e-05, + "loss": 0.596, + "step": 540 + }, + { + "epoch": 0.06546546546546547, + "grad_norm": 0.11583826690912247, + "learning_rate": 9.988255458086848e-05, + "loss": 0.5746, + "step": 545 + }, + { + "epoch": 0.06606606606606606, + "grad_norm": 0.12019343674182892, + "learning_rate": 9.988039055754538e-05, + "loss": 0.5773, + "step": 550 + }, + { + "epoch": 0.06666666666666667, + "grad_norm": 0.1159575954079628, + "learning_rate": 9.987820680277514e-05, + "loss": 0.6047, + "step": 555 + }, + { + "epoch": 0.06726726726726727, + "grad_norm": 0.1157040074467659, + "learning_rate": 9.987600331742152e-05, + "loss": 
0.6517, + "step": 560 + }, + { + "epoch": 0.06786786786786787, + "grad_norm": 0.11724773794412613, + "learning_rate": 9.987378010235625e-05, + "loss": 0.6074, + "step": 565 + }, + { + "epoch": 0.06846846846846846, + "grad_norm": 0.11794179677963257, + "learning_rate": 9.987153715845874e-05, + "loss": 0.5875, + "step": 570 + }, + { + "epoch": 0.06906906906906907, + "grad_norm": 0.12677189707756042, + "learning_rate": 9.986927448661623e-05, + "loss": 0.6014, + "step": 575 + }, + { + "epoch": 0.06966966966966967, + "grad_norm": 0.1056021898984909, + "learning_rate": 9.98669920877238e-05, + "loss": 0.5983, + "step": 580 + }, + { + "epoch": 0.07027027027027027, + "grad_norm": 0.10002574324607849, + "learning_rate": 9.98646899626843e-05, + "loss": 0.5524, + "step": 585 + }, + { + "epoch": 0.07087087087087086, + "grad_norm": 0.12917128205299377, + "learning_rate": 9.986236811240841e-05, + "loss": 0.6182, + "step": 590 + }, + { + "epoch": 0.07147147147147147, + "grad_norm": 0.11160294711589813, + "learning_rate": 9.986002653781457e-05, + "loss": 0.5808, + "step": 595 + }, + { + "epoch": 0.07207207207207207, + "grad_norm": 0.12028118222951889, + "learning_rate": 9.985766523982906e-05, + "loss": 0.5942, + "step": 600 + }, + { + "epoch": 0.07267267267267268, + "grad_norm": 0.1268133968114853, + "learning_rate": 9.985528421938595e-05, + "loss": 0.6005, + "step": 605 + }, + { + "epoch": 0.07327327327327328, + "grad_norm": 0.12729580700397491, + "learning_rate": 9.985288347742713e-05, + "loss": 0.5953, + "step": 610 + }, + { + "epoch": 0.07387387387387387, + "grad_norm": 0.11391682922840118, + "learning_rate": 9.985046301490224e-05, + "loss": 0.5797, + "step": 615 + }, + { + "epoch": 0.07447447447447447, + "grad_norm": 0.11333835124969482, + "learning_rate": 9.98480228327688e-05, + "loss": 0.5463, + "step": 620 + }, + { + "epoch": 0.07507507507507508, + "grad_norm": 0.11474984139204025, + "learning_rate": 9.984556293199204e-05, + "loss": 0.661, + "step": 625 + }, + { + "epoch": 0.07567567567567568, + "grad_norm": 0.11972816288471222, + "learning_rate": 9.984308331354505e-05, + "loss": 0.5481, + "step": 630 + }, + { + "epoch": 0.07627627627627627, + "grad_norm": 0.12083037197589874, + "learning_rate": 9.984058397840874e-05, + "loss": 0.6251, + "step": 635 + }, + { + "epoch": 0.07687687687687687, + "grad_norm": 0.10288316756486893, + "learning_rate": 9.983806492757173e-05, + "loss": 0.5038, + "step": 640 + }, + { + "epoch": 0.07747747747747748, + "grad_norm": 0.11947996914386749, + "learning_rate": 9.983552616203054e-05, + "loss": 0.5844, + "step": 645 + }, + { + "epoch": 0.07807807807807808, + "grad_norm": 0.11687720566987991, + "learning_rate": 9.983296768278941e-05, + "loss": 0.6452, + "step": 650 + }, + { + "epoch": 0.07867867867867868, + "grad_norm": 0.12168864905834198, + "learning_rate": 9.983038949086043e-05, + "loss": 0.55, + "step": 655 + }, + { + "epoch": 0.07927927927927927, + "grad_norm": 0.11078694462776184, + "learning_rate": 9.982779158726346e-05, + "loss": 0.5744, + "step": 660 + }, + { + "epoch": 0.07987987987987988, + "grad_norm": 0.12191225588321686, + "learning_rate": 9.982517397302617e-05, + "loss": 0.6319, + "step": 665 + }, + { + "epoch": 0.08048048048048048, + "grad_norm": 0.11814052611589432, + "learning_rate": 9.982253664918404e-05, + "loss": 0.5397, + "step": 670 + }, + { + "epoch": 0.08108108108108109, + "grad_norm": 0.11518977582454681, + "learning_rate": 9.98198796167803e-05, + "loss": 0.6731, + "step": 675 + }, + { + "epoch": 0.08168168168168168, + "grad_norm": 
0.10897009819746017, + "learning_rate": 9.9817202876866e-05, + "loss": 0.5274, + "step": 680 + }, + { + "epoch": 0.08228228228228228, + "grad_norm": 0.12917213141918182, + "learning_rate": 9.981450643050004e-05, + "loss": 0.6496, + "step": 685 + }, + { + "epoch": 0.08288288288288288, + "grad_norm": 0.11243332922458649, + "learning_rate": 9.981179027874903e-05, + "loss": 0.5684, + "step": 690 + }, + { + "epoch": 0.08348348348348349, + "grad_norm": 0.1249338760972023, + "learning_rate": 9.980905442268742e-05, + "loss": 0.589, + "step": 695 + }, + { + "epoch": 0.08408408408408409, + "grad_norm": 0.13728263974189758, + "learning_rate": 9.980629886339745e-05, + "loss": 0.5862, + "step": 700 + }, + { + "epoch": 0.08468468468468468, + "grad_norm": 0.1067405566573143, + "learning_rate": 9.980352360196915e-05, + "loss": 0.5325, + "step": 705 + }, + { + "epoch": 0.08528528528528528, + "grad_norm": 0.12034221738576889, + "learning_rate": 9.980072863950034e-05, + "loss": 0.6633, + "step": 710 + }, + { + "epoch": 0.08588588588588589, + "grad_norm": 0.12357219308614731, + "learning_rate": 9.979791397709665e-05, + "loss": 0.5595, + "step": 715 + }, + { + "epoch": 0.08648648648648649, + "grad_norm": 0.12757450342178345, + "learning_rate": 9.979507961587146e-05, + "loss": 0.566, + "step": 720 + }, + { + "epoch": 0.08708708708708708, + "grad_norm": 0.12144903093576431, + "learning_rate": 9.979222555694603e-05, + "loss": 0.5574, + "step": 725 + }, + { + "epoch": 0.08768768768768768, + "grad_norm": 0.1184130534529686, + "learning_rate": 9.978935180144929e-05, + "loss": 0.6473, + "step": 730 + }, + { + "epoch": 0.08828828828828829, + "grad_norm": 0.1136375144124031, + "learning_rate": 9.978645835051807e-05, + "loss": 0.56, + "step": 735 + }, + { + "epoch": 0.08888888888888889, + "grad_norm": 0.11569700390100479, + "learning_rate": 9.978354520529694e-05, + "loss": 0.6062, + "step": 740 + }, + { + "epoch": 0.0894894894894895, + "grad_norm": 0.1249203309416771, + "learning_rate": 9.978061236693825e-05, + "loss": 0.4966, + "step": 745 + }, + { + "epoch": 0.09009009009009009, + "grad_norm": 0.1043449193239212, + "learning_rate": 9.977765983660219e-05, + "loss": 0.581, + "step": 750 + }, + { + "epoch": 0.09009009009009009, + "eval_loss": 0.556373655796051, + "eval_runtime": 35.7127, + "eval_samples_per_second": 22.401, + "eval_steps_per_second": 5.6, + "step": 750 + }, + { + "epoch": 0.09069069069069069, + "grad_norm": 0.12919944524765015, + "learning_rate": 9.977468761545668e-05, + "loss": 0.5873, + "step": 755 + }, + { + "epoch": 0.0912912912912913, + "grad_norm": 0.13691309094429016, + "learning_rate": 9.977169570467746e-05, + "loss": 0.5736, + "step": 760 + }, + { + "epoch": 0.0918918918918919, + "grad_norm": 0.11516213417053223, + "learning_rate": 9.976868410544807e-05, + "loss": 0.6117, + "step": 765 + }, + { + "epoch": 0.09249249249249249, + "grad_norm": 0.1293349415063858, + "learning_rate": 9.97656528189598e-05, + "loss": 0.6001, + "step": 770 + }, + { + "epoch": 0.09309309309309309, + "grad_norm": 0.13094763457775116, + "learning_rate": 9.976260184641178e-05, + "loss": 0.5834, + "step": 775 + }, + { + "epoch": 0.0936936936936937, + "grad_norm": 0.1106487512588501, + "learning_rate": 9.975953118901087e-05, + "loss": 0.5727, + "step": 780 + }, + { + "epoch": 0.0942942942942943, + "grad_norm": 0.11776825040578842, + "learning_rate": 9.975644084797177e-05, + "loss": 0.57, + "step": 785 + }, + { + "epoch": 0.0948948948948949, + "grad_norm": 0.13150927424430847, + "learning_rate": 9.975333082451697e-05, + 
"loss": 0.5947, + "step": 790 + }, + { + "epoch": 0.09549549549549549, + "grad_norm": 0.10551803559064865, + "learning_rate": 9.975020111987665e-05, + "loss": 0.5666, + "step": 795 + }, + { + "epoch": 0.0960960960960961, + "grad_norm": 0.12537789344787598, + "learning_rate": 9.974705173528888e-05, + "loss": 0.59, + "step": 800 + }, + { + "epoch": 0.0966966966966967, + "grad_norm": 0.10980799794197083, + "learning_rate": 9.974388267199948e-05, + "loss": 0.6458, + "step": 805 + }, + { + "epoch": 0.0972972972972973, + "grad_norm": 0.13737636804580688, + "learning_rate": 9.974069393126204e-05, + "loss": 0.6085, + "step": 810 + }, + { + "epoch": 0.09789789789789789, + "grad_norm": 0.11622725427150726, + "learning_rate": 9.973748551433797e-05, + "loss": 0.5489, + "step": 815 + }, + { + "epoch": 0.0984984984984985, + "grad_norm": 0.12641353905200958, + "learning_rate": 9.973425742249642e-05, + "loss": 0.5548, + "step": 820 + }, + { + "epoch": 0.0990990990990991, + "grad_norm": 0.13305918872356415, + "learning_rate": 9.973100965701434e-05, + "loss": 0.5867, + "step": 825 + }, + { + "epoch": 0.0996996996996997, + "grad_norm": 0.11327166110277176, + "learning_rate": 9.972774221917649e-05, + "loss": 0.5836, + "step": 830 + }, + { + "epoch": 0.1003003003003003, + "grad_norm": 0.11508522182703018, + "learning_rate": 9.972445511027536e-05, + "loss": 0.5103, + "step": 835 + }, + { + "epoch": 0.1009009009009009, + "grad_norm": 0.13816924393177032, + "learning_rate": 9.972114833161127e-05, + "loss": 0.5635, + "step": 840 + }, + { + "epoch": 0.1015015015015015, + "grad_norm": 0.11702617257833481, + "learning_rate": 9.971782188449227e-05, + "loss": 0.581, + "step": 845 + }, + { + "epoch": 0.1021021021021021, + "grad_norm": 0.11020959168672562, + "learning_rate": 9.971447577023427e-05, + "loss": 0.5462, + "step": 850 + }, + { + "epoch": 0.10270270270270271, + "grad_norm": 0.10916626453399658, + "learning_rate": 9.971110999016087e-05, + "loss": 0.5908, + "step": 855 + }, + { + "epoch": 0.1033033033033033, + "grad_norm": 0.1086755096912384, + "learning_rate": 9.97077245456035e-05, + "loss": 0.5541, + "step": 860 + }, + { + "epoch": 0.1039039039039039, + "grad_norm": 0.11078358441591263, + "learning_rate": 9.970431943790135e-05, + "loss": 0.5465, + "step": 865 + }, + { + "epoch": 0.1045045045045045, + "grad_norm": 0.1155092641711235, + "learning_rate": 9.970089466840141e-05, + "loss": 0.5377, + "step": 870 + }, + { + "epoch": 0.10510510510510511, + "grad_norm": 0.15020284056663513, + "learning_rate": 9.969745023845842e-05, + "loss": 0.6166, + "step": 875 + }, + { + "epoch": 0.1057057057057057, + "grad_norm": 0.13164208829402924, + "learning_rate": 9.969398614943493e-05, + "loss": 0.6068, + "step": 880 + }, + { + "epoch": 0.1063063063063063, + "grad_norm": 0.1536552608013153, + "learning_rate": 9.969050240270123e-05, + "loss": 0.6202, + "step": 885 + }, + { + "epoch": 0.1069069069069069, + "grad_norm": 0.12868870794773102, + "learning_rate": 9.968699899963542e-05, + "loss": 0.5557, + "step": 890 + }, + { + "epoch": 0.10750750750750751, + "grad_norm": 0.14724482595920563, + "learning_rate": 9.968347594162335e-05, + "loss": 0.5372, + "step": 895 + }, + { + "epoch": 0.10810810810810811, + "grad_norm": 0.12083470076322556, + "learning_rate": 9.967993323005867e-05, + "loss": 0.5613, + "step": 900 + }, + { + "epoch": 0.1087087087087087, + "grad_norm": 0.14249064028263092, + "learning_rate": 9.967637086634273e-05, + "loss": 0.579, + "step": 905 + }, + { + "epoch": 0.10930930930930931, + "grad_norm": 0.12009460479021072, 
+ "learning_rate": 9.96727888518848e-05, + "loss": 0.61, + "step": 910 + }, + { + "epoch": 0.10990990990990991, + "grad_norm": 0.13391336798667908, + "learning_rate": 9.966918718810178e-05, + "loss": 0.5847, + "step": 915 + }, + { + "epoch": 0.11051051051051052, + "grad_norm": 0.1236066222190857, + "learning_rate": 9.96655658764184e-05, + "loss": 0.5946, + "step": 920 + }, + { + "epoch": 0.1111111111111111, + "grad_norm": 0.11532710492610931, + "learning_rate": 9.966192491826719e-05, + "loss": 0.5679, + "step": 925 + }, + { + "epoch": 0.11171171171171171, + "grad_norm": 0.13501344621181488, + "learning_rate": 9.965826431508838e-05, + "loss": 0.572, + "step": 930 + }, + { + "epoch": 0.11231231231231231, + "grad_norm": 0.13786637783050537, + "learning_rate": 9.965458406833007e-05, + "loss": 0.5867, + "step": 935 + }, + { + "epoch": 0.11291291291291292, + "grad_norm": 0.13774774968624115, + "learning_rate": 9.965088417944804e-05, + "loss": 0.6295, + "step": 940 + }, + { + "epoch": 0.11351351351351352, + "grad_norm": 0.11815715581178665, + "learning_rate": 9.964716464990587e-05, + "loss": 0.5813, + "step": 945 + }, + { + "epoch": 0.11411411411411411, + "grad_norm": 0.14110974967479706, + "learning_rate": 9.964342548117492e-05, + "loss": 0.5655, + "step": 950 + }, + { + "epoch": 0.11471471471471471, + "grad_norm": 0.13697779178619385, + "learning_rate": 9.963966667473432e-05, + "loss": 0.5126, + "step": 955 + }, + { + "epoch": 0.11531531531531532, + "grad_norm": 0.1307583451271057, + "learning_rate": 9.963588823207095e-05, + "loss": 0.5642, + "step": 960 + }, + { + "epoch": 0.11591591591591592, + "grad_norm": 0.13687662780284882, + "learning_rate": 9.96320901546795e-05, + "loss": 0.5406, + "step": 965 + }, + { + "epoch": 0.11651651651651651, + "grad_norm": 0.1363164782524109, + "learning_rate": 9.962827244406235e-05, + "loss": 0.5358, + "step": 970 + }, + { + "epoch": 0.11711711711711711, + "grad_norm": 0.11956089735031128, + "learning_rate": 9.962443510172969e-05, + "loss": 0.5797, + "step": 975 + }, + { + "epoch": 0.11771771771771772, + "grad_norm": 0.13000960648059845, + "learning_rate": 9.962057812919954e-05, + "loss": 0.5879, + "step": 980 + }, + { + "epoch": 0.11831831831831832, + "grad_norm": 0.12119048088788986, + "learning_rate": 9.961670152799756e-05, + "loss": 0.4789, + "step": 985 + }, + { + "epoch": 0.11891891891891893, + "grad_norm": 0.11234846711158752, + "learning_rate": 9.961280529965726e-05, + "loss": 0.5643, + "step": 990 + }, + { + "epoch": 0.11951951951951952, + "grad_norm": 0.12192381918430328, + "learning_rate": 9.960888944571989e-05, + "loss": 0.562, + "step": 995 + }, + { + "epoch": 0.12012012012012012, + "grad_norm": 0.11708851158618927, + "learning_rate": 9.960495396773448e-05, + "loss": 0.6252, + "step": 1000 + }, + { + "epoch": 0.12012012012012012, + "eval_loss": 0.5504798889160156, + "eval_runtime": 35.636, + "eval_samples_per_second": 22.449, + "eval_steps_per_second": 5.612, + "step": 1000 + }, + { + "epoch": 0.12072072072072072, + "grad_norm": 0.11086433380842209, + "learning_rate": 9.960099886725778e-05, + "loss": 0.5448, + "step": 1005 + }, + { + "epoch": 0.12132132132132133, + "grad_norm": 0.13834629952907562, + "learning_rate": 9.959702414585434e-05, + "loss": 0.5544, + "step": 1010 + }, + { + "epoch": 0.12192192192192192, + "grad_norm": 0.13374705612659454, + "learning_rate": 9.959302980509648e-05, + "loss": 0.6367, + "step": 1015 + }, + { + "epoch": 0.12252252252252252, + "grad_norm": 0.12303639948368073, + "learning_rate": 9.958901584656424e-05, + "loss": 
0.6276, + "step": 1020 + }, + { + "epoch": 0.12312312312312312, + "grad_norm": 0.11851788312196732, + "learning_rate": 9.958498227184545e-05, + "loss": 0.5434, + "step": 1025 + }, + { + "epoch": 0.12372372372372373, + "grad_norm": 0.118187315762043, + "learning_rate": 9.95809290825357e-05, + "loss": 0.4774, + "step": 1030 + }, + { + "epoch": 0.12432432432432433, + "grad_norm": 0.11095068603754044, + "learning_rate": 9.95768562802383e-05, + "loss": 0.5365, + "step": 1035 + }, + { + "epoch": 0.12492492492492492, + "grad_norm": 0.15384650230407715, + "learning_rate": 9.957276386656438e-05, + "loss": 0.5857, + "step": 1040 + }, + { + "epoch": 0.12552552552552554, + "grad_norm": 0.14868667721748352, + "learning_rate": 9.95686518431328e-05, + "loss": 0.6165, + "step": 1045 + }, + { + "epoch": 0.12612612612612611, + "grad_norm": 0.13861216604709625, + "learning_rate": 9.956452021157015e-05, + "loss": 0.6233, + "step": 1050 + }, + { + "epoch": 0.12672672672672672, + "grad_norm": 0.11492381244897842, + "learning_rate": 9.956036897351082e-05, + "loss": 0.518, + "step": 1055 + }, + { + "epoch": 0.12732732732732732, + "grad_norm": 0.14138482511043549, + "learning_rate": 9.955619813059695e-05, + "loss": 0.5503, + "step": 1060 + }, + { + "epoch": 0.12792792792792793, + "grad_norm": 0.13464735448360443, + "learning_rate": 9.955200768447839e-05, + "loss": 0.5786, + "step": 1065 + }, + { + "epoch": 0.12852852852852853, + "grad_norm": 0.15015809237957, + "learning_rate": 9.954779763681279e-05, + "loss": 0.6308, + "step": 1070 + }, + { + "epoch": 0.12912912912912913, + "grad_norm": 0.1323210448026657, + "learning_rate": 9.954356798926556e-05, + "loss": 0.5932, + "step": 1075 + }, + { + "epoch": 0.12972972972972974, + "grad_norm": 0.13801470398902893, + "learning_rate": 9.953931874350981e-05, + "loss": 0.6251, + "step": 1080 + }, + { + "epoch": 0.13033033033033034, + "grad_norm": 0.14785467088222504, + "learning_rate": 9.953504990122645e-05, + "loss": 0.5704, + "step": 1085 + }, + { + "epoch": 0.13093093093093094, + "grad_norm": 0.1381571739912033, + "learning_rate": 9.953076146410414e-05, + "loss": 0.5665, + "step": 1090 + }, + { + "epoch": 0.13153153153153152, + "grad_norm": 0.12187087535858154, + "learning_rate": 9.952645343383926e-05, + "loss": 0.581, + "step": 1095 + }, + { + "epoch": 0.13213213213213212, + "grad_norm": 0.12892118096351624, + "learning_rate": 9.952212581213598e-05, + "loss": 0.5671, + "step": 1100 + }, + { + "epoch": 0.13273273273273273, + "grad_norm": 0.14864008128643036, + "learning_rate": 9.95177786007062e-05, + "loss": 0.6301, + "step": 1105 + }, + { + "epoch": 0.13333333333333333, + "grad_norm": 0.1285526156425476, + "learning_rate": 9.951341180126954e-05, + "loss": 0.5563, + "step": 1110 + }, + { + "epoch": 0.13393393393393394, + "grad_norm": 0.12077897042036057, + "learning_rate": 9.950902541555342e-05, + "loss": 0.5541, + "step": 1115 + }, + { + "epoch": 0.13453453453453454, + "grad_norm": 0.14453986287117004, + "learning_rate": 9.9504619445293e-05, + "loss": 0.5592, + "step": 1120 + }, + { + "epoch": 0.13513513513513514, + "grad_norm": 0.11925803869962692, + "learning_rate": 9.950019389223113e-05, + "loss": 0.5096, + "step": 1125 + }, + { + "epoch": 0.13573573573573575, + "grad_norm": 0.1241493970155716, + "learning_rate": 9.949574875811849e-05, + "loss": 0.5401, + "step": 1130 + }, + { + "epoch": 0.13633633633633635, + "grad_norm": 0.12902575731277466, + "learning_rate": 9.949128404471346e-05, + "loss": 0.5652, + "step": 1135 + }, + { + "epoch": 0.13693693693693693, + 
"grad_norm": 0.1486339122056961, + "learning_rate": 9.948679975378215e-05, + "loss": 0.6048, + "step": 1140 + }, + { + "epoch": 0.13753753753753753, + "grad_norm": 0.11694955080747604, + "learning_rate": 9.948229588709843e-05, + "loss": 0.5372, + "step": 1145 + }, + { + "epoch": 0.13813813813813813, + "grad_norm": 0.14657153189182281, + "learning_rate": 9.947777244644395e-05, + "loss": 0.6111, + "step": 1150 + }, + { + "epoch": 0.13873873873873874, + "grad_norm": 0.1201421245932579, + "learning_rate": 9.947322943360805e-05, + "loss": 0.4755, + "step": 1155 + }, + { + "epoch": 0.13933933933933934, + "grad_norm": 0.14225557446479797, + "learning_rate": 9.946866685038782e-05, + "loss": 0.6418, + "step": 1160 + }, + { + "epoch": 0.13993993993993994, + "grad_norm": 0.11903166025876999, + "learning_rate": 9.946408469858814e-05, + "loss": 0.5509, + "step": 1165 + }, + { + "epoch": 0.14054054054054055, + "grad_norm": 0.14576730132102966, + "learning_rate": 9.945948298002159e-05, + "loss": 0.5591, + "step": 1170 + }, + { + "epoch": 0.14114114114114115, + "grad_norm": 0.1285872757434845, + "learning_rate": 9.945486169650846e-05, + "loss": 0.5596, + "step": 1175 + }, + { + "epoch": 0.14174174174174173, + "grad_norm": 0.12496671825647354, + "learning_rate": 9.945022084987686e-05, + "loss": 0.4635, + "step": 1180 + }, + { + "epoch": 0.14234234234234233, + "grad_norm": 0.1366618573665619, + "learning_rate": 9.944556044196254e-05, + "loss": 0.621, + "step": 1185 + }, + { + "epoch": 0.14294294294294294, + "grad_norm": 0.14241304993629456, + "learning_rate": 9.944088047460908e-05, + "loss": 0.5102, + "step": 1190 + }, + { + "epoch": 0.14354354354354354, + "grad_norm": 0.13404640555381775, + "learning_rate": 9.943618094966778e-05, + "loss": 0.4964, + "step": 1195 + }, + { + "epoch": 0.14414414414414414, + "grad_norm": 0.1323569267988205, + "learning_rate": 9.943146186899763e-05, + "loss": 0.6184, + "step": 1200 + }, + { + "epoch": 0.14474474474474475, + "grad_norm": 0.15703721344470978, + "learning_rate": 9.942672323446535e-05, + "loss": 0.5274, + "step": 1205 + }, + { + "epoch": 0.14534534534534535, + "grad_norm": 0.13442037999629974, + "learning_rate": 9.942196504794548e-05, + "loss": 0.5314, + "step": 1210 + }, + { + "epoch": 0.14594594594594595, + "grad_norm": 0.14303931593894958, + "learning_rate": 9.941718731132021e-05, + "loss": 0.5491, + "step": 1215 + }, + { + "epoch": 0.14654654654654656, + "grad_norm": 0.14930444955825806, + "learning_rate": 9.941239002647951e-05, + "loss": 0.5963, + "step": 1220 + }, + { + "epoch": 0.14714714714714713, + "grad_norm": 0.1592545211315155, + "learning_rate": 9.940757319532106e-05, + "loss": 0.6314, + "step": 1225 + }, + { + "epoch": 0.14774774774774774, + "grad_norm": 0.12300348281860352, + "learning_rate": 9.940273681975028e-05, + "loss": 0.5336, + "step": 1230 + }, + { + "epoch": 0.14834834834834834, + "grad_norm": 0.14255018532276154, + "learning_rate": 9.939788090168029e-05, + "loss": 0.5748, + "step": 1235 + }, + { + "epoch": 0.14894894894894894, + "grad_norm": 0.12294194102287292, + "learning_rate": 9.939300544303203e-05, + "loss": 0.4948, + "step": 1240 + }, + { + "epoch": 0.14954954954954955, + "grad_norm": 0.13256989419460297, + "learning_rate": 9.938811044573408e-05, + "loss": 0.6179, + "step": 1245 + }, + { + "epoch": 0.15015015015015015, + "grad_norm": 0.13299089670181274, + "learning_rate": 9.938319591172276e-05, + "loss": 0.5523, + "step": 1250 + }, + { + "epoch": 0.15015015015015015, + "eval_loss": 0.5406990051269531, + "eval_runtime": 35.8435, + 
"eval_samples_per_second": 22.319, + "eval_steps_per_second": 5.58, + "step": 1250 + }, + { + "epoch": 0.15075075075075076, + "grad_norm": 0.14382098615169525, + "learning_rate": 9.93782618429422e-05, + "loss": 0.5333, + "step": 1255 + }, + { + "epoch": 0.15135135135135136, + "grad_norm": 0.15322475135326385, + "learning_rate": 9.937330824134411e-05, + "loss": 0.5857, + "step": 1260 + }, + { + "epoch": 0.15195195195195196, + "grad_norm": 0.1334904432296753, + "learning_rate": 9.936833510888808e-05, + "loss": 0.5689, + "step": 1265 + }, + { + "epoch": 0.15255255255255254, + "grad_norm": 0.1450200080871582, + "learning_rate": 9.93633424475413e-05, + "loss": 0.5308, + "step": 1270 + }, + { + "epoch": 0.15315315315315314, + "grad_norm": 0.16856804490089417, + "learning_rate": 9.935833025927881e-05, + "loss": 0.6035, + "step": 1275 + }, + { + "epoch": 0.15375375375375375, + "grad_norm": 0.11449337005615234, + "learning_rate": 9.935329854608328e-05, + "loss": 0.5383, + "step": 1280 + }, + { + "epoch": 0.15435435435435435, + "grad_norm": 0.14133980870246887, + "learning_rate": 9.93482473099451e-05, + "loss": 0.6096, + "step": 1285 + }, + { + "epoch": 0.15495495495495495, + "grad_norm": 0.14201879501342773, + "learning_rate": 9.934317655286246e-05, + "loss": 0.5111, + "step": 1290 + }, + { + "epoch": 0.15555555555555556, + "grad_norm": 0.13272619247436523, + "learning_rate": 9.93380862768412e-05, + "loss": 0.553, + "step": 1295 + }, + { + "epoch": 0.15615615615615616, + "grad_norm": 0.16790813207626343, + "learning_rate": 9.93329764838949e-05, + "loss": 0.5438, + "step": 1300 + }, + { + "epoch": 0.15675675675675677, + "grad_norm": 0.1232328861951828, + "learning_rate": 9.93278471760449e-05, + "loss": 0.5568, + "step": 1305 + }, + { + "epoch": 0.15735735735735737, + "grad_norm": 0.1317184567451477, + "learning_rate": 9.93226983553202e-05, + "loss": 0.5557, + "step": 1310 + }, + { + "epoch": 0.15795795795795795, + "grad_norm": 0.13694261014461517, + "learning_rate": 9.931753002375755e-05, + "loss": 0.5952, + "step": 1315 + }, + { + "epoch": 0.15855855855855855, + "grad_norm": 0.14206798374652863, + "learning_rate": 9.931234218340142e-05, + "loss": 0.6113, + "step": 1320 + }, + { + "epoch": 0.15915915915915915, + "grad_norm": 0.14540702104568481, + "learning_rate": 9.930713483630398e-05, + "loss": 0.5499, + "step": 1325 + }, + { + "epoch": 0.15975975975975976, + "grad_norm": 0.135905459523201, + "learning_rate": 9.930190798452515e-05, + "loss": 0.5708, + "step": 1330 + }, + { + "epoch": 0.16036036036036036, + "grad_norm": 0.1379026472568512, + "learning_rate": 9.929666163013251e-05, + "loss": 0.5838, + "step": 1335 + }, + { + "epoch": 0.16096096096096096, + "grad_norm": 0.14682543277740479, + "learning_rate": 9.929139577520143e-05, + "loss": 0.5267, + "step": 1340 + }, + { + "epoch": 0.16156156156156157, + "grad_norm": 0.13486546277999878, + "learning_rate": 9.92861104218149e-05, + "loss": 0.5689, + "step": 1345 + }, + { + "epoch": 0.16216216216216217, + "grad_norm": 0.14320464432239532, + "learning_rate": 9.928080557206374e-05, + "loss": 0.5452, + "step": 1350 + }, + { + "epoch": 0.16276276276276277, + "grad_norm": 0.1397445648908615, + "learning_rate": 9.927548122804636e-05, + "loss": 0.475, + "step": 1355 + }, + { + "epoch": 0.16336336336336335, + "grad_norm": 0.13572080433368683, + "learning_rate": 9.927013739186896e-05, + "loss": 0.5778, + "step": 1360 + }, + { + "epoch": 0.16396396396396395, + "grad_norm": 0.1294446587562561, + "learning_rate": 9.926477406564543e-05, + "loss": 0.5633, + "step": 
1365 + }, + { + "epoch": 0.16456456456456456, + "grad_norm": 0.13914752006530762, + "learning_rate": 9.925939125149737e-05, + "loss": 0.493, + "step": 1370 + }, + { + "epoch": 0.16516516516516516, + "grad_norm": 0.14273406565189362, + "learning_rate": 9.925398895155408e-05, + "loss": 0.5125, + "step": 1375 + }, + { + "epoch": 0.16576576576576577, + "grad_norm": 0.1549476832151413, + "learning_rate": 9.924856716795259e-05, + "loss": 0.5856, + "step": 1380 + }, + { + "epoch": 0.16636636636636637, + "grad_norm": 0.1278187483549118, + "learning_rate": 9.924312590283759e-05, + "loss": 0.5287, + "step": 1385 + }, + { + "epoch": 0.16696696696696697, + "grad_norm": 0.11236418783664703, + "learning_rate": 9.923766515836158e-05, + "loss": 0.5092, + "step": 1390 + }, + { + "epoch": 0.16756756756756758, + "grad_norm": 0.13451069593429565, + "learning_rate": 9.923218493668462e-05, + "loss": 0.5557, + "step": 1395 + }, + { + "epoch": 0.16816816816816818, + "grad_norm": 0.1335114985704422, + "learning_rate": 9.922668523997459e-05, + "loss": 0.5246, + "step": 1400 + }, + { + "epoch": 0.16876876876876876, + "grad_norm": 0.13806520402431488, + "learning_rate": 9.922116607040701e-05, + "loss": 0.547, + "step": 1405 + }, + { + "epoch": 0.16936936936936936, + "grad_norm": 0.13814181089401245, + "learning_rate": 9.921562743016515e-05, + "loss": 0.6071, + "step": 1410 + }, + { + "epoch": 0.16996996996996996, + "grad_norm": 0.17738781869411469, + "learning_rate": 9.921006932143995e-05, + "loss": 0.6035, + "step": 1415 + }, + { + "epoch": 0.17057057057057057, + "grad_norm": 0.13893504440784454, + "learning_rate": 9.920449174643006e-05, + "loss": 0.5531, + "step": 1420 + }, + { + "epoch": 0.17117117117117117, + "grad_norm": 0.1506422758102417, + "learning_rate": 9.919889470734183e-05, + "loss": 0.5554, + "step": 1425 + }, + { + "epoch": 0.17177177177177178, + "grad_norm": 0.1395304948091507, + "learning_rate": 9.91932782063893e-05, + "loss": 0.5718, + "step": 1430 + }, + { + "epoch": 0.17237237237237238, + "grad_norm": 0.13701309263706207, + "learning_rate": 9.918764224579425e-05, + "loss": 0.5424, + "step": 1435 + }, + { + "epoch": 0.17297297297297298, + "grad_norm": 0.1376919001340866, + "learning_rate": 9.91819868277861e-05, + "loss": 0.5977, + "step": 1440 + }, + { + "epoch": 0.1735735735735736, + "grad_norm": 0.15341120958328247, + "learning_rate": 9.9176311954602e-05, + "loss": 0.5686, + "step": 1445 + }, + { + "epoch": 0.17417417417417416, + "grad_norm": 0.15332254767417908, + "learning_rate": 9.917061762848677e-05, + "loss": 0.5695, + "step": 1450 + }, + { + "epoch": 0.17477477477477477, + "grad_norm": 0.13192585110664368, + "learning_rate": 9.916490385169297e-05, + "loss": 0.5524, + "step": 1455 + }, + { + "epoch": 0.17537537537537537, + "grad_norm": 0.16271525621414185, + "learning_rate": 9.915917062648083e-05, + "loss": 0.625, + "step": 1460 + }, + { + "epoch": 0.17597597597597597, + "grad_norm": 0.15786905586719513, + "learning_rate": 9.915341795511826e-05, + "loss": 0.4909, + "step": 1465 + }, + { + "epoch": 0.17657657657657658, + "grad_norm": 0.17195844650268555, + "learning_rate": 9.914764583988087e-05, + "loss": 0.5432, + "step": 1470 + }, + { + "epoch": 0.17717717717717718, + "grad_norm": 0.14075967669487, + "learning_rate": 9.914185428305198e-05, + "loss": 0.4958, + "step": 1475 + }, + { + "epoch": 0.17777777777777778, + "grad_norm": 0.14056497812271118, + "learning_rate": 9.913604328692258e-05, + "loss": 0.585, + "step": 1480 + }, + { + "epoch": 0.1783783783783784, + "grad_norm": 
0.16021881997585297, + "learning_rate": 9.913021285379136e-05, + "loss": 0.5645, + "step": 1485 + }, + { + "epoch": 0.178978978978979, + "grad_norm": 0.1612151712179184, + "learning_rate": 9.912436298596469e-05, + "loss": 0.5109, + "step": 1490 + }, + { + "epoch": 0.17957957957957957, + "grad_norm": 0.15164624154567719, + "learning_rate": 9.91184936857566e-05, + "loss": 0.5532, + "step": 1495 + }, + { + "epoch": 0.18018018018018017, + "grad_norm": 0.15083666145801544, + "learning_rate": 9.911260495548892e-05, + "loss": 0.5337, + "step": 1500 + }, + { + "epoch": 0.18018018018018017, + "eval_loss": 0.532286524772644, + "eval_runtime": 37.2562, + "eval_samples_per_second": 21.473, + "eval_steps_per_second": 5.368, + "step": 1500 + }, + { + "epoch": 0.18078078078078078, + "grad_norm": 0.1541498303413391, + "learning_rate": 9.910669679749101e-05, + "loss": 0.5712, + "step": 1505 + }, + { + "epoch": 0.18138138138138138, + "grad_norm": 0.16341352462768555, + "learning_rate": 9.910076921410003e-05, + "loss": 0.5631, + "step": 1510 + }, + { + "epoch": 0.18198198198198198, + "grad_norm": 0.15230105817317963, + "learning_rate": 9.909482220766077e-05, + "loss": 0.5518, + "step": 1515 + }, + { + "epoch": 0.1825825825825826, + "grad_norm": 0.12905479967594147, + "learning_rate": 9.908885578052573e-05, + "loss": 0.5093, + "step": 1520 + }, + { + "epoch": 0.1831831831831832, + "grad_norm": 0.15421278774738312, + "learning_rate": 9.908286993505509e-05, + "loss": 0.5395, + "step": 1525 + }, + { + "epoch": 0.1837837837837838, + "grad_norm": 0.13414278626441956, + "learning_rate": 9.907686467361667e-05, + "loss": 0.5682, + "step": 1530 + }, + { + "epoch": 0.1843843843843844, + "grad_norm": 0.14239756762981415, + "learning_rate": 9.907083999858601e-05, + "loss": 0.5906, + "step": 1535 + }, + { + "epoch": 0.18498498498498497, + "grad_norm": 0.1477949470281601, + "learning_rate": 9.906479591234634e-05, + "loss": 0.5854, + "step": 1540 + }, + { + "epoch": 0.18558558558558558, + "grad_norm": 0.1359156221151352, + "learning_rate": 9.905873241728856e-05, + "loss": 0.5209, + "step": 1545 + }, + { + "epoch": 0.18618618618618618, + "grad_norm": 0.1480684131383896, + "learning_rate": 9.90526495158112e-05, + "loss": 0.6048, + "step": 1550 + }, + { + "epoch": 0.18678678678678678, + "grad_norm": 0.16773481667041779, + "learning_rate": 9.904654721032053e-05, + "loss": 0.5303, + "step": 1555 + }, + { + "epoch": 0.1873873873873874, + "grad_norm": 0.1348617523908615, + "learning_rate": 9.904042550323047e-05, + "loss": 0.5531, + "step": 1560 + }, + { + "epoch": 0.187987987987988, + "grad_norm": 0.15043242275714874, + "learning_rate": 9.90342843969626e-05, + "loss": 0.5665, + "step": 1565 + }, + { + "epoch": 0.1885885885885886, + "grad_norm": 0.15387991070747375, + "learning_rate": 9.902812389394622e-05, + "loss": 0.5394, + "step": 1570 + }, + { + "epoch": 0.1891891891891892, + "grad_norm": 0.15662983059883118, + "learning_rate": 9.902194399661826e-05, + "loss": 0.6264, + "step": 1575 + }, + { + "epoch": 0.1897897897897898, + "grad_norm": 0.14307186007499695, + "learning_rate": 9.901574470742332e-05, + "loss": 0.5138, + "step": 1580 + }, + { + "epoch": 0.19039039039039038, + "grad_norm": 0.17086243629455566, + "learning_rate": 9.900952602881369e-05, + "loss": 0.5255, + "step": 1585 + }, + { + "epoch": 0.19099099099099098, + "grad_norm": 0.16943460702896118, + "learning_rate": 9.900328796324933e-05, + "loss": 0.5263, + "step": 1590 + }, + { + "epoch": 0.1915915915915916, + "grad_norm": 0.13184261322021484, + "learning_rate": 
9.899703051319786e-05, + "loss": 0.5284, + "step": 1595 + }, + { + "epoch": 0.1921921921921922, + "grad_norm": 0.16626524925231934, + "learning_rate": 9.899075368113459e-05, + "loss": 0.5661, + "step": 1600 + }, + { + "epoch": 0.1927927927927928, + "grad_norm": 0.15596666932106018, + "learning_rate": 9.898445746954246e-05, + "loss": 0.5182, + "step": 1605 + }, + { + "epoch": 0.1933933933933934, + "grad_norm": 0.13959796726703644, + "learning_rate": 9.897814188091209e-05, + "loss": 0.5516, + "step": 1610 + }, + { + "epoch": 0.193993993993994, + "grad_norm": 0.1410140097141266, + "learning_rate": 9.89718069177418e-05, + "loss": 0.5306, + "step": 1615 + }, + { + "epoch": 0.1945945945945946, + "grad_norm": 0.16129790246486664, + "learning_rate": 9.896545258253751e-05, + "loss": 0.6369, + "step": 1620 + }, + { + "epoch": 0.19519519519519518, + "grad_norm": 0.13550494611263275, + "learning_rate": 9.895907887781286e-05, + "loss": 0.4805, + "step": 1625 + }, + { + "epoch": 0.19579579579579579, + "grad_norm": 0.15120775997638702, + "learning_rate": 9.895268580608912e-05, + "loss": 0.5868, + "step": 1630 + }, + { + "epoch": 0.1963963963963964, + "grad_norm": 0.19538648426532745, + "learning_rate": 9.894627336989524e-05, + "loss": 0.5838, + "step": 1635 + }, + { + "epoch": 0.196996996996997, + "grad_norm": 0.15177778899669647, + "learning_rate": 9.893984157176781e-05, + "loss": 0.5651, + "step": 1640 + }, + { + "epoch": 0.1975975975975976, + "grad_norm": 0.17927424609661102, + "learning_rate": 9.89333904142511e-05, + "loss": 0.5452, + "step": 1645 + }, + { + "epoch": 0.1981981981981982, + "grad_norm": 0.14731010794639587, + "learning_rate": 9.892691989989701e-05, + "loss": 0.5117, + "step": 1650 + }, + { + "epoch": 0.1987987987987988, + "grad_norm": 0.15999050438404083, + "learning_rate": 9.892043003126515e-05, + "loss": 0.5252, + "step": 1655 + }, + { + "epoch": 0.1993993993993994, + "grad_norm": 0.14248783886432648, + "learning_rate": 9.891392081092272e-05, + "loss": 0.578, + "step": 1660 + }, + { + "epoch": 0.2, + "grad_norm": 0.12995529174804688, + "learning_rate": 9.890739224144461e-05, + "loss": 0.5536, + "step": 1665 + }, + { + "epoch": 0.2006006006006006, + "grad_norm": 0.1568031758069992, + "learning_rate": 9.890084432541337e-05, + "loss": 0.5022, + "step": 1670 + }, + { + "epoch": 0.2012012012012012, + "grad_norm": 0.17728778719902039, + "learning_rate": 9.889427706541918e-05, + "loss": 0.5913, + "step": 1675 + }, + { + "epoch": 0.2018018018018018, + "grad_norm": 0.18074369430541992, + "learning_rate": 9.888769046405991e-05, + "loss": 0.5252, + "step": 1680 + }, + { + "epoch": 0.2024024024024024, + "grad_norm": 0.15932361781597137, + "learning_rate": 9.888108452394105e-05, + "loss": 0.5609, + "step": 1685 + }, + { + "epoch": 0.203003003003003, + "grad_norm": 0.15575721859931946, + "learning_rate": 9.887445924767571e-05, + "loss": 0.6251, + "step": 1690 + }, + { + "epoch": 0.2036036036036036, + "grad_norm": 0.14726823568344116, + "learning_rate": 9.886781463788474e-05, + "loss": 0.5817, + "step": 1695 + }, + { + "epoch": 0.2042042042042042, + "grad_norm": 0.1656755954027176, + "learning_rate": 9.886115069719654e-05, + "loss": 0.4974, + "step": 1700 + }, + { + "epoch": 0.2048048048048048, + "grad_norm": 0.169316828250885, + "learning_rate": 9.885446742824722e-05, + "loss": 0.5278, + "step": 1705 + }, + { + "epoch": 0.20540540540540542, + "grad_norm": 0.14765839278697968, + "learning_rate": 9.884776483368052e-05, + "loss": 0.5993, + "step": 1710 + }, + { + "epoch": 0.206006006006006, + 
"grad_norm": 0.16174598038196564, + "learning_rate": 9.884104291614779e-05, + "loss": 0.6095, + "step": 1715 + }, + { + "epoch": 0.2066066066066066, + "grad_norm": 0.157635897397995, + "learning_rate": 9.88343016783081e-05, + "loss": 0.548, + "step": 1720 + }, + { + "epoch": 0.2072072072072072, + "grad_norm": 0.15818659961223602, + "learning_rate": 9.88275411228281e-05, + "loss": 0.5238, + "step": 1725 + }, + { + "epoch": 0.2078078078078078, + "grad_norm": 0.16762599349021912, + "learning_rate": 9.882076125238206e-05, + "loss": 0.5897, + "step": 1730 + }, + { + "epoch": 0.2084084084084084, + "grad_norm": 0.16823016107082367, + "learning_rate": 9.881396206965199e-05, + "loss": 0.5364, + "step": 1735 + }, + { + "epoch": 0.209009009009009, + "grad_norm": 0.18279995024204254, + "learning_rate": 9.880714357732743e-05, + "loss": 0.5928, + "step": 1740 + }, + { + "epoch": 0.20960960960960962, + "grad_norm": 0.13614603877067566, + "learning_rate": 9.880030577810564e-05, + "loss": 0.5266, + "step": 1745 + }, + { + "epoch": 0.21021021021021022, + "grad_norm": 0.16419890522956848, + "learning_rate": 9.879344867469145e-05, + "loss": 0.5925, + "step": 1750 + }, + { + "epoch": 0.21021021021021022, + "eval_loss": 0.5199762582778931, + "eval_runtime": 35.5614, + "eval_samples_per_second": 22.496, + "eval_steps_per_second": 5.624, + "step": 1750 + }, + { + "epoch": 0.21081081081081082, + "grad_norm": 0.17191119492053986, + "learning_rate": 9.87865722697974e-05, + "loss": 0.5782, + "step": 1755 + }, + { + "epoch": 0.2114114114114114, + "grad_norm": 0.18159760534763336, + "learning_rate": 9.877967656614359e-05, + "loss": 0.5149, + "step": 1760 + }, + { + "epoch": 0.212012012012012, + "grad_norm": 0.16005107760429382, + "learning_rate": 9.87727615664578e-05, + "loss": 0.5673, + "step": 1765 + }, + { + "epoch": 0.2126126126126126, + "grad_norm": 0.16644255816936493, + "learning_rate": 9.876582727347545e-05, + "loss": 0.5478, + "step": 1770 + }, + { + "epoch": 0.2132132132132132, + "grad_norm": 0.1638658046722412, + "learning_rate": 9.875887368993957e-05, + "loss": 0.5637, + "step": 1775 + }, + { + "epoch": 0.2138138138138138, + "grad_norm": 0.14520032703876495, + "learning_rate": 9.87519008186008e-05, + "loss": 0.5773, + "step": 1780 + }, + { + "epoch": 0.21441441441441442, + "grad_norm": 0.15663036704063416, + "learning_rate": 9.874490866221747e-05, + "loss": 0.5455, + "step": 1785 + }, + { + "epoch": 0.21501501501501502, + "grad_norm": 0.16514965891838074, + "learning_rate": 9.873789722355546e-05, + "loss": 0.542, + "step": 1790 + }, + { + "epoch": 0.21561561561561562, + "grad_norm": 0.1629652976989746, + "learning_rate": 9.873086650538837e-05, + "loss": 0.6072, + "step": 1795 + }, + { + "epoch": 0.21621621621621623, + "grad_norm": 0.1657508760690689, + "learning_rate": 9.872381651049734e-05, + "loss": 0.515, + "step": 1800 + }, + { + "epoch": 0.2168168168168168, + "grad_norm": 0.15766561031341553, + "learning_rate": 9.87167472416712e-05, + "loss": 0.5516, + "step": 1805 + }, + { + "epoch": 0.2174174174174174, + "grad_norm": 0.14582960307598114, + "learning_rate": 9.870965870170636e-05, + "loss": 0.5225, + "step": 1810 + }, + { + "epoch": 0.218018018018018, + "grad_norm": 0.15485908091068268, + "learning_rate": 9.870255089340689e-05, + "loss": 0.5809, + "step": 1815 + }, + { + "epoch": 0.21861861861861862, + "grad_norm": 0.17167295515537262, + "learning_rate": 9.869542381958445e-05, + "loss": 0.5067, + "step": 1820 + }, + { + "epoch": 0.21921921921921922, + "grad_norm": 0.18673966825008392, + 
"learning_rate": 9.868827748305833e-05, + "loss": 0.601, + "step": 1825 + }, + { + "epoch": 0.21981981981981982, + "grad_norm": 0.1618594527244568, + "learning_rate": 9.868111188665544e-05, + "loss": 0.5431, + "step": 1830 + }, + { + "epoch": 0.22042042042042043, + "grad_norm": 0.17336180806159973, + "learning_rate": 9.867392703321032e-05, + "loss": 0.544, + "step": 1835 + }, + { + "epoch": 0.22102102102102103, + "grad_norm": 0.15530580282211304, + "learning_rate": 9.866672292556513e-05, + "loss": 0.5428, + "step": 1840 + }, + { + "epoch": 0.22162162162162163, + "grad_norm": 0.1728203147649765, + "learning_rate": 9.865949956656964e-05, + "loss": 0.4687, + "step": 1845 + }, + { + "epoch": 0.2222222222222222, + "grad_norm": 0.17873457074165344, + "learning_rate": 9.86522569590812e-05, + "loss": 0.5932, + "step": 1850 + }, + { + "epoch": 0.2228228228228228, + "grad_norm": 0.16019093990325928, + "learning_rate": 9.864499510596483e-05, + "loss": 0.5252, + "step": 1855 + }, + { + "epoch": 0.22342342342342342, + "grad_norm": 0.1702725887298584, + "learning_rate": 9.863771401009314e-05, + "loss": 0.5088, + "step": 1860 + }, + { + "epoch": 0.22402402402402402, + "grad_norm": 0.1690736711025238, + "learning_rate": 9.863041367434633e-05, + "loss": 0.6143, + "step": 1865 + }, + { + "epoch": 0.22462462462462462, + "grad_norm": 0.15510286390781403, + "learning_rate": 9.862309410161227e-05, + "loss": 0.5828, + "step": 1870 + }, + { + "epoch": 0.22522522522522523, + "grad_norm": 0.1484537273645401, + "learning_rate": 9.861575529478637e-05, + "loss": 0.4891, + "step": 1875 + }, + { + "epoch": 0.22582582582582583, + "grad_norm": 0.16495390236377716, + "learning_rate": 9.860839725677168e-05, + "loss": 0.5181, + "step": 1880 + }, + { + "epoch": 0.22642642642642644, + "grad_norm": 0.15596404671669006, + "learning_rate": 9.860101999047888e-05, + "loss": 0.5655, + "step": 1885 + }, + { + "epoch": 0.22702702702702704, + "grad_norm": 0.14629977941513062, + "learning_rate": 9.859362349882621e-05, + "loss": 0.5182, + "step": 1890 + }, + { + "epoch": 0.22762762762762762, + "grad_norm": 0.16224302351474762, + "learning_rate": 9.858620778473958e-05, + "loss": 0.5392, + "step": 1895 + }, + { + "epoch": 0.22822822822822822, + "grad_norm": 0.16578389704227448, + "learning_rate": 9.85787728511524e-05, + "loss": 0.5447, + "step": 1900 + }, + { + "epoch": 0.22882882882882882, + "grad_norm": 0.1378798633813858, + "learning_rate": 9.857131870100579e-05, + "loss": 0.4545, + "step": 1905 + }, + { + "epoch": 0.22942942942942943, + "grad_norm": 0.16701015830039978, + "learning_rate": 9.856384533724841e-05, + "loss": 0.5554, + "step": 1910 + }, + { + "epoch": 0.23003003003003003, + "grad_norm": 0.17047229409217834, + "learning_rate": 9.855635276283656e-05, + "loss": 0.5063, + "step": 1915 + }, + { + "epoch": 0.23063063063063063, + "grad_norm": 0.19112730026245117, + "learning_rate": 9.854884098073409e-05, + "loss": 0.5325, + "step": 1920 + }, + { + "epoch": 0.23123123123123124, + "grad_norm": 0.16093651950359344, + "learning_rate": 9.854130999391249e-05, + "loss": 0.4856, + "step": 1925 + }, + { + "epoch": 0.23183183183183184, + "grad_norm": 0.15758563578128815, + "learning_rate": 9.853375980535082e-05, + "loss": 0.5477, + "step": 1930 + }, + { + "epoch": 0.23243243243243245, + "grad_norm": 0.15003785490989685, + "learning_rate": 9.852619041803576e-05, + "loss": 0.5487, + "step": 1935 + }, + { + "epoch": 0.23303303303303302, + "grad_norm": 0.1630222648382187, + "learning_rate": 9.851860183496155e-05, + "loss": 0.4892, + "step": 
1940 + }, + { + "epoch": 0.23363363363363363, + "grad_norm": 0.14337176084518433, + "learning_rate": 9.851099405913009e-05, + "loss": 0.4906, + "step": 1945 + }, + { + "epoch": 0.23423423423423423, + "grad_norm": 0.14522404968738556, + "learning_rate": 9.850336709355079e-05, + "loss": 0.5897, + "step": 1950 + }, + { + "epoch": 0.23483483483483483, + "grad_norm": 0.18825727701187134, + "learning_rate": 9.849572094124069e-05, + "loss": 0.5179, + "step": 1955 + }, + { + "epoch": 0.23543543543543544, + "grad_norm": 0.15544892847537994, + "learning_rate": 9.848805560522444e-05, + "loss": 0.5082, + "step": 1960 + }, + { + "epoch": 0.23603603603603604, + "grad_norm": 0.1687832772731781, + "learning_rate": 9.848037108853423e-05, + "loss": 0.5303, + "step": 1965 + }, + { + "epoch": 0.23663663663663664, + "grad_norm": 0.14829890429973602, + "learning_rate": 9.84726673942099e-05, + "loss": 0.4956, + "step": 1970 + }, + { + "epoch": 0.23723723723723725, + "grad_norm": 0.17282085120677948, + "learning_rate": 9.846494452529879e-05, + "loss": 0.5406, + "step": 1975 + }, + { + "epoch": 0.23783783783783785, + "grad_norm": 0.14352402091026306, + "learning_rate": 9.845720248485593e-05, + "loss": 0.5014, + "step": 1980 + }, + { + "epoch": 0.23843843843843843, + "grad_norm": 0.13369005918502808, + "learning_rate": 9.844944127594385e-05, + "loss": 0.4679, + "step": 1985 + }, + { + "epoch": 0.23903903903903903, + "grad_norm": 0.16212862730026245, + "learning_rate": 9.84416609016327e-05, + "loss": 0.5333, + "step": 1990 + }, + { + "epoch": 0.23963963963963963, + "grad_norm": 0.1569916158914566, + "learning_rate": 9.843386136500018e-05, + "loss": 0.57, + "step": 1995 + }, + { + "epoch": 0.24024024024024024, + "grad_norm": 0.16127009689807892, + "learning_rate": 9.842604266913165e-05, + "loss": 0.5064, + "step": 2000 + }, + { + "epoch": 0.24024024024024024, + "eval_loss": 0.5154778361320496, + "eval_runtime": 35.5905, + "eval_samples_per_second": 22.478, + "eval_steps_per_second": 5.619, + "step": 2000 + }, + { + "epoch": 0.24084084084084084, + "grad_norm": 0.16832779347896576, + "learning_rate": 9.841820481711992e-05, + "loss": 0.5872, + "step": 2005 + }, + { + "epoch": 0.24144144144144145, + "grad_norm": 0.1714124083518982, + "learning_rate": 9.84103478120655e-05, + "loss": 0.5264, + "step": 2010 + }, + { + "epoch": 0.24204204204204205, + "grad_norm": 0.17094390094280243, + "learning_rate": 9.840247165707642e-05, + "loss": 0.5507, + "step": 2015 + }, + { + "epoch": 0.24264264264264265, + "grad_norm": 0.1704624891281128, + "learning_rate": 9.839457635526827e-05, + "loss": 0.5719, + "step": 2020 + }, + { + "epoch": 0.24324324324324326, + "grad_norm": 0.17609746754169464, + "learning_rate": 9.838666190976427e-05, + "loss": 0.5424, + "step": 2025 + }, + { + "epoch": 0.24384384384384383, + "grad_norm": 0.15378032624721527, + "learning_rate": 9.837872832369515e-05, + "loss": 0.5369, + "step": 2030 + }, + { + "epoch": 0.24444444444444444, + "grad_norm": 0.2722972333431244, + "learning_rate": 9.837077560019925e-05, + "loss": 0.4998, + "step": 2035 + }, + { + "epoch": 0.24504504504504504, + "grad_norm": 0.1851070672273636, + "learning_rate": 9.836280374242248e-05, + "loss": 0.5136, + "step": 2040 + }, + { + "epoch": 0.24564564564564564, + "grad_norm": 0.17583970725536346, + "learning_rate": 9.835481275351828e-05, + "loss": 0.5538, + "step": 2045 + }, + { + "epoch": 0.24624624624624625, + "grad_norm": 0.18735957145690918, + "learning_rate": 9.834680263664771e-05, + "loss": 0.5468, + "step": 2050 + }, + { + "epoch": 
0.24684684684684685, + "grad_norm": 0.17340996861457825, + "learning_rate": 9.833877339497939e-05, + "loss": 0.6059, + "step": 2055 + }, + { + "epoch": 0.24744744744744746, + "grad_norm": 0.1732621043920517, + "learning_rate": 9.833072503168945e-05, + "loss": 0.5425, + "step": 2060 + }, + { + "epoch": 0.24804804804804806, + "grad_norm": 0.1660718470811844, + "learning_rate": 9.832265754996164e-05, + "loss": 0.4991, + "step": 2065 + }, + { + "epoch": 0.24864864864864866, + "grad_norm": 0.18556509912014008, + "learning_rate": 9.831457095298728e-05, + "loss": 0.5401, + "step": 2070 + }, + { + "epoch": 0.24924924924924924, + "grad_norm": 0.1343226432800293, + "learning_rate": 9.830646524396518e-05, + "loss": 0.5304, + "step": 2075 + }, + { + "epoch": 0.24984984984984984, + "grad_norm": 0.17407070100307465, + "learning_rate": 9.82983404261018e-05, + "loss": 0.5154, + "step": 2080 + }, + { + "epoch": 0.25045045045045045, + "grad_norm": 0.17739513516426086, + "learning_rate": 9.829019650261111e-05, + "loss": 0.5223, + "step": 2085 + }, + { + "epoch": 0.2510510510510511, + "grad_norm": 0.15243694186210632, + "learning_rate": 9.828203347671462e-05, + "loss": 0.485, + "step": 2090 + }, + { + "epoch": 0.25165165165165165, + "grad_norm": 0.17251120507717133, + "learning_rate": 9.827385135164145e-05, + "loss": 0.5174, + "step": 2095 + }, + { + "epoch": 0.25225225225225223, + "grad_norm": 0.14546386897563934, + "learning_rate": 9.82656501306282e-05, + "loss": 0.5192, + "step": 2100 + }, + { + "epoch": 0.25285285285285286, + "grad_norm": 0.1839355081319809, + "learning_rate": 9.825742981691915e-05, + "loss": 0.5414, + "step": 2105 + }, + { + "epoch": 0.25345345345345344, + "grad_norm": 0.17075906693935394, + "learning_rate": 9.824919041376597e-05, + "loss": 0.4985, + "step": 2110 + }, + { + "epoch": 0.25405405405405407, + "grad_norm": 0.20638392865657806, + "learning_rate": 9.8240931924428e-05, + "loss": 0.5412, + "step": 2115 + }, + { + "epoch": 0.25465465465465464, + "grad_norm": 0.2239358276128769, + "learning_rate": 9.82326543521721e-05, + "loss": 0.55, + "step": 2120 + }, + { + "epoch": 0.2552552552552553, + "grad_norm": 0.16201543807983398, + "learning_rate": 9.822435770027267e-05, + "loss": 0.5412, + "step": 2125 + }, + { + "epoch": 0.25585585585585585, + "grad_norm": 0.17090320587158203, + "learning_rate": 9.821604197201166e-05, + "loss": 0.5311, + "step": 2130 + }, + { + "epoch": 0.2564564564564565, + "grad_norm": 0.19192443788051605, + "learning_rate": 9.820770717067856e-05, + "loss": 0.5618, + "step": 2135 + }, + { + "epoch": 0.25705705705705706, + "grad_norm": 0.19195041060447693, + "learning_rate": 9.81993532995704e-05, + "loss": 0.5095, + "step": 2140 + }, + { + "epoch": 0.25765765765765763, + "grad_norm": 0.16927067935466766, + "learning_rate": 9.819098036199178e-05, + "loss": 0.5586, + "step": 2145 + }, + { + "epoch": 0.25825825825825827, + "grad_norm": 0.18273282051086426, + "learning_rate": 9.818258836125482e-05, + "loss": 0.5314, + "step": 2150 + }, + { + "epoch": 0.25885885885885884, + "grad_norm": 0.17846918106079102, + "learning_rate": 9.81741773006792e-05, + "loss": 0.5399, + "step": 2155 + }, + { + "epoch": 0.2594594594594595, + "grad_norm": 0.15504570305347443, + "learning_rate": 9.81657471835921e-05, + "loss": 0.4769, + "step": 2160 + }, + { + "epoch": 0.26006006006006005, + "grad_norm": 0.17235280573368073, + "learning_rate": 9.815729801332832e-05, + "loss": 0.633, + "step": 2165 + }, + { + "epoch": 0.2606606606606607, + "grad_norm": 0.20063692331314087, + "learning_rate": 
9.814882979323008e-05, + "loss": 0.5649, + "step": 2170 + }, + { + "epoch": 0.26126126126126126, + "grad_norm": 0.17492002248764038, + "learning_rate": 9.814034252664723e-05, + "loss": 0.5565, + "step": 2175 + }, + { + "epoch": 0.2618618618618619, + "grad_norm": 0.21677549183368683, + "learning_rate": 9.813183621693711e-05, + "loss": 0.5253, + "step": 2180 + }, + { + "epoch": 0.26246246246246246, + "grad_norm": 0.1620025932788849, + "learning_rate": 9.812331086746462e-05, + "loss": 0.5449, + "step": 2185 + }, + { + "epoch": 0.26306306306306304, + "grad_norm": 0.2064737230539322, + "learning_rate": 9.811476648160216e-05, + "loss": 0.5254, + "step": 2190 + }, + { + "epoch": 0.26366366366366367, + "grad_norm": 0.1585373878479004, + "learning_rate": 9.81062030627297e-05, + "loss": 0.4856, + "step": 2195 + }, + { + "epoch": 0.26426426426426425, + "grad_norm": 0.1649765521287918, + "learning_rate": 9.809762061423469e-05, + "loss": 0.5199, + "step": 2200 + }, + { + "epoch": 0.2648648648648649, + "grad_norm": 0.17186199128627777, + "learning_rate": 9.808901913951216e-05, + "loss": 0.496, + "step": 2205 + }, + { + "epoch": 0.26546546546546546, + "grad_norm": 0.16912420094013214, + "learning_rate": 9.808039864196464e-05, + "loss": 0.5118, + "step": 2210 + }, + { + "epoch": 0.2660660660660661, + "grad_norm": 0.17357052862644196, + "learning_rate": 9.807175912500215e-05, + "loss": 0.5153, + "step": 2215 + }, + { + "epoch": 0.26666666666666666, + "grad_norm": 0.1911926567554474, + "learning_rate": 9.806310059204229e-05, + "loss": 0.5019, + "step": 2220 + }, + { + "epoch": 0.2672672672672673, + "grad_norm": 0.20069928467273712, + "learning_rate": 9.805442304651018e-05, + "loss": 0.4793, + "step": 2225 + }, + { + "epoch": 0.26786786786786787, + "grad_norm": 0.18133674561977386, + "learning_rate": 9.804572649183841e-05, + "loss": 0.5091, + "step": 2230 + }, + { + "epoch": 0.26846846846846845, + "grad_norm": 0.215446338057518, + "learning_rate": 9.803701093146715e-05, + "loss": 0.5845, + "step": 2235 + }, + { + "epoch": 0.2690690690690691, + "grad_norm": 0.1596394032239914, + "learning_rate": 9.802827636884405e-05, + "loss": 0.5082, + "step": 2240 + }, + { + "epoch": 0.26966966966966965, + "grad_norm": 0.2033008188009262, + "learning_rate": 9.801952280742426e-05, + "loss": 0.5612, + "step": 2245 + }, + { + "epoch": 0.2702702702702703, + "grad_norm": 0.20107373595237732, + "learning_rate": 9.801075025067053e-05, + "loss": 0.5786, + "step": 2250 + }, + { + "epoch": 0.2702702702702703, + "eval_loss": 0.5005493760108948, + "eval_runtime": 35.6032, + "eval_samples_per_second": 22.47, + "eval_steps_per_second": 5.617, + "step": 2250 + }, + { + "epoch": 0.27087087087087086, + "grad_norm": 0.17059586942195892, + "learning_rate": 9.800195870205299e-05, + "loss": 0.5403, + "step": 2255 + }, + { + "epoch": 0.2714714714714715, + "grad_norm": 0.200556218624115, + "learning_rate": 9.799314816504942e-05, + "loss": 0.5233, + "step": 2260 + }, + { + "epoch": 0.27207207207207207, + "grad_norm": 0.17981156706809998, + "learning_rate": 9.798431864314506e-05, + "loss": 0.5067, + "step": 2265 + }, + { + "epoch": 0.2726726726726727, + "grad_norm": 0.18970321118831635, + "learning_rate": 9.797547013983259e-05, + "loss": 0.5582, + "step": 2270 + }, + { + "epoch": 0.2732732732732733, + "grad_norm": 0.20475243031978607, + "learning_rate": 9.796660265861228e-05, + "loss": 0.5416, + "step": 2275 + }, + { + "epoch": 0.27387387387387385, + "grad_norm": 0.15468288958072662, + "learning_rate": 9.795771620299192e-05, + "loss": 0.4692, + 
"step": 2280 + }, + { + "epoch": 0.2744744744744745, + "grad_norm": 0.20453935861587524, + "learning_rate": 9.794881077648674e-05, + "loss": 0.554, + "step": 2285 + }, + { + "epoch": 0.27507507507507506, + "grad_norm": 0.18685589730739594, + "learning_rate": 9.793988638261952e-05, + "loss": 0.5307, + "step": 2290 + }, + { + "epoch": 0.2756756756756757, + "grad_norm": 0.18669554591178894, + "learning_rate": 9.793094302492051e-05, + "loss": 0.549, + "step": 2295 + }, + { + "epoch": 0.27627627627627627, + "grad_norm": 0.189845472574234, + "learning_rate": 9.79219807069275e-05, + "loss": 0.4745, + "step": 2300 + }, + { + "epoch": 0.2768768768768769, + "grad_norm": 0.15178772807121277, + "learning_rate": 9.791299943218575e-05, + "loss": 0.4152, + "step": 2305 + }, + { + "epoch": 0.2774774774774775, + "grad_norm": 0.1891813725233078, + "learning_rate": 9.790399920424806e-05, + "loss": 0.4991, + "step": 2310 + }, + { + "epoch": 0.27807807807807805, + "grad_norm": 0.19077537953853607, + "learning_rate": 9.789498002667465e-05, + "loss": 0.5572, + "step": 2315 + }, + { + "epoch": 0.2786786786786787, + "grad_norm": 0.19744697213172913, + "learning_rate": 9.78859419030333e-05, + "loss": 0.5025, + "step": 2320 + }, + { + "epoch": 0.27927927927927926, + "grad_norm": 0.21802768111228943, + "learning_rate": 9.787688483689928e-05, + "loss": 0.5121, + "step": 2325 + }, + { + "epoch": 0.2798798798798799, + "grad_norm": 0.1979355812072754, + "learning_rate": 9.786780883185534e-05, + "loss": 0.5216, + "step": 2330 + }, + { + "epoch": 0.28048048048048047, + "grad_norm": 0.18332335352897644, + "learning_rate": 9.785871389149171e-05, + "loss": 0.5519, + "step": 2335 + }, + { + "epoch": 0.2810810810810811, + "grad_norm": 0.21532176434993744, + "learning_rate": 9.784960001940613e-05, + "loss": 0.5385, + "step": 2340 + }, + { + "epoch": 0.2816816816816817, + "grad_norm": 0.2113877236843109, + "learning_rate": 9.784046721920384e-05, + "loss": 0.5273, + "step": 2345 + }, + { + "epoch": 0.2822822822822823, + "grad_norm": 0.202877476811409, + "learning_rate": 9.783131549449752e-05, + "loss": 0.5381, + "step": 2350 + }, + { + "epoch": 0.2828828828828829, + "grad_norm": 0.2112427055835724, + "learning_rate": 9.782214484890736e-05, + "loss": 0.5083, + "step": 2355 + }, + { + "epoch": 0.28348348348348346, + "grad_norm": 0.193809375166893, + "learning_rate": 9.781295528606108e-05, + "loss": 0.5206, + "step": 2360 + }, + { + "epoch": 0.2840840840840841, + "grad_norm": 0.16697217524051666, + "learning_rate": 9.78037468095938e-05, + "loss": 0.5498, + "step": 2365 + }, + { + "epoch": 0.28468468468468466, + "grad_norm": 0.1831083744764328, + "learning_rate": 9.779451942314822e-05, + "loss": 0.5008, + "step": 2370 + }, + { + "epoch": 0.2852852852852853, + "grad_norm": 0.1841299831867218, + "learning_rate": 9.77852731303744e-05, + "loss": 0.5426, + "step": 2375 + }, + { + "epoch": 0.28588588588588587, + "grad_norm": 0.1795235127210617, + "learning_rate": 9.777600793492998e-05, + "loss": 0.5489, + "step": 2380 + }, + { + "epoch": 0.2864864864864865, + "grad_norm": 0.19949278235435486, + "learning_rate": 9.776672384048005e-05, + "loss": 0.5087, + "step": 2385 + }, + { + "epoch": 0.2870870870870871, + "grad_norm": 0.18009242415428162, + "learning_rate": 9.775742085069715e-05, + "loss": 0.5096, + "step": 2390 + }, + { + "epoch": 0.2876876876876877, + "grad_norm": 0.17108169198036194, + "learning_rate": 9.774809896926133e-05, + "loss": 0.5274, + "step": 2395 + }, + { + "epoch": 0.2882882882882883, + "grad_norm": 0.1928662210702896, + 
"learning_rate": 9.773875819986007e-05, + "loss": 0.5499, + "step": 2400 + }, + { + "epoch": 0.28888888888888886, + "grad_norm": 0.23090901970863342, + "learning_rate": 9.772939854618836e-05, + "loss": 0.553, + "step": 2405 + }, + { + "epoch": 0.2894894894894895, + "grad_norm": 0.18499891459941864, + "learning_rate": 9.772002001194866e-05, + "loss": 0.4899, + "step": 2410 + }, + { + "epoch": 0.29009009009009007, + "grad_norm": 0.16356371343135834, + "learning_rate": 9.771062260085089e-05, + "loss": 0.4746, + "step": 2415 + }, + { + "epoch": 0.2906906906906907, + "grad_norm": 0.16599981486797333, + "learning_rate": 9.770120631661239e-05, + "loss": 0.5046, + "step": 2420 + }, + { + "epoch": 0.2912912912912913, + "grad_norm": 0.17277301847934723, + "learning_rate": 9.769177116295805e-05, + "loss": 0.5564, + "step": 2425 + }, + { + "epoch": 0.2918918918918919, + "grad_norm": 0.16423299908638, + "learning_rate": 9.768231714362015e-05, + "loss": 0.494, + "step": 2430 + }, + { + "epoch": 0.2924924924924925, + "grad_norm": 0.21444030106067657, + "learning_rate": 9.767284426233849e-05, + "loss": 0.4991, + "step": 2435 + }, + { + "epoch": 0.2930930930930931, + "grad_norm": 0.22856414318084717, + "learning_rate": 9.766335252286031e-05, + "loss": 0.5258, + "step": 2440 + }, + { + "epoch": 0.2936936936936937, + "grad_norm": 0.18524369597434998, + "learning_rate": 9.765384192894031e-05, + "loss": 0.4881, + "step": 2445 + }, + { + "epoch": 0.29429429429429427, + "grad_norm": 0.19163531064987183, + "learning_rate": 9.764431248434062e-05, + "loss": 0.5674, + "step": 2450 + }, + { + "epoch": 0.2948948948948949, + "grad_norm": 0.19218379259109497, + "learning_rate": 9.763476419283086e-05, + "loss": 0.6041, + "step": 2455 + }, + { + "epoch": 0.2954954954954955, + "grad_norm": 0.17554765939712524, + "learning_rate": 9.762519705818813e-05, + "loss": 0.5599, + "step": 2460 + }, + { + "epoch": 0.2960960960960961, + "grad_norm": 0.19090323150157928, + "learning_rate": 9.761561108419691e-05, + "loss": 0.5389, + "step": 2465 + }, + { + "epoch": 0.2966966966966967, + "grad_norm": 0.20730207860469818, + "learning_rate": 9.76060062746492e-05, + "loss": 0.4687, + "step": 2470 + }, + { + "epoch": 0.2972972972972973, + "grad_norm": 0.19577758014202118, + "learning_rate": 9.75963826333444e-05, + "loss": 0.5313, + "step": 2475 + }, + { + "epoch": 0.2978978978978979, + "grad_norm": 0.22707220911979675, + "learning_rate": 9.75867401640894e-05, + "loss": 0.488, + "step": 2480 + }, + { + "epoch": 0.2984984984984985, + "grad_norm": 0.19406184554100037, + "learning_rate": 9.757707887069854e-05, + "loss": 0.5086, + "step": 2485 + }, + { + "epoch": 0.2990990990990991, + "grad_norm": 0.16421322524547577, + "learning_rate": 9.756739875699354e-05, + "loss": 0.4937, + "step": 2490 + }, + { + "epoch": 0.2996996996996997, + "grad_norm": 0.20811273157596588, + "learning_rate": 9.755769982680367e-05, + "loss": 0.5175, + "step": 2495 + }, + { + "epoch": 0.3003003003003003, + "grad_norm": 0.1713363230228424, + "learning_rate": 9.754798208396554e-05, + "loss": 0.444, + "step": 2500 + }, + { + "epoch": 0.3003003003003003, + "eval_loss": 0.48717838525772095, + "eval_runtime": 35.6296, + "eval_samples_per_second": 22.453, + "eval_steps_per_second": 5.613, + "step": 2500 + }, + { + "epoch": 0.3009009009009009, + "grad_norm": 0.15711075067520142, + "learning_rate": 9.753824553232327e-05, + "loss": 0.4657, + "step": 2505 + }, + { + "epoch": 0.3015015015015015, + "grad_norm": 0.18782839179039001, + "learning_rate": 9.752849017572841e-05, + "loss": 
0.5125, + "step": 2510 + }, + { + "epoch": 0.3021021021021021, + "grad_norm": 0.2134360820055008, + "learning_rate": 9.751871601803993e-05, + "loss": 0.4831, + "step": 2515 + }, + { + "epoch": 0.3027027027027027, + "grad_norm": 0.21311652660369873, + "learning_rate": 9.750892306312423e-05, + "loss": 0.5408, + "step": 2520 + }, + { + "epoch": 0.3033033033033033, + "grad_norm": 0.17872583866119385, + "learning_rate": 9.749911131485516e-05, + "loss": 0.5442, + "step": 2525 + }, + { + "epoch": 0.3039039039039039, + "grad_norm": 0.22014202177524567, + "learning_rate": 9.748928077711402e-05, + "loss": 0.5266, + "step": 2530 + }, + { + "epoch": 0.3045045045045045, + "grad_norm": 0.21009013056755066, + "learning_rate": 9.74794314537895e-05, + "loss": 0.501, + "step": 2535 + }, + { + "epoch": 0.3051051051051051, + "grad_norm": 0.20648615062236786, + "learning_rate": 9.74695633487778e-05, + "loss": 0.4969, + "step": 2540 + }, + { + "epoch": 0.3057057057057057, + "grad_norm": 0.18218368291854858, + "learning_rate": 9.745967646598245e-05, + "loss": 0.5197, + "step": 2545 + }, + { + "epoch": 0.3063063063063063, + "grad_norm": 0.17514494061470032, + "learning_rate": 9.744977080931448e-05, + "loss": 0.4872, + "step": 2550 + }, + { + "epoch": 0.3069069069069069, + "grad_norm": 0.19781029224395752, + "learning_rate": 9.743984638269233e-05, + "loss": 0.5385, + "step": 2555 + }, + { + "epoch": 0.3075075075075075, + "grad_norm": 0.21139483153820038, + "learning_rate": 9.742990319004182e-05, + "loss": 0.5162, + "step": 2560 + }, + { + "epoch": 0.3081081081081081, + "grad_norm": 0.2510957419872284, + "learning_rate": 9.741994123529626e-05, + "loss": 0.5231, + "step": 2565 + }, + { + "epoch": 0.3087087087087087, + "grad_norm": 0.20459957420825958, + "learning_rate": 9.740996052239635e-05, + "loss": 0.4739, + "step": 2570 + }, + { + "epoch": 0.30930930930930933, + "grad_norm": 0.18105819821357727, + "learning_rate": 9.739996105529021e-05, + "loss": 0.513, + "step": 2575 + }, + { + "epoch": 0.3099099099099099, + "grad_norm": 0.20429784059524536, + "learning_rate": 9.738994283793336e-05, + "loss": 0.4921, + "step": 2580 + }, + { + "epoch": 0.3105105105105105, + "grad_norm": 0.2193429321050644, + "learning_rate": 9.737990587428881e-05, + "loss": 0.5081, + "step": 2585 + }, + { + "epoch": 0.3111111111111111, + "grad_norm": 0.22544243931770325, + "learning_rate": 9.736985016832689e-05, + "loss": 0.4762, + "step": 2590 + }, + { + "epoch": 0.3117117117117117, + "grad_norm": 0.22182606160640717, + "learning_rate": 9.735977572402541e-05, + "loss": 0.5515, + "step": 2595 + }, + { + "epoch": 0.3123123123123123, + "grad_norm": 0.2171258181333542, + "learning_rate": 9.734968254536955e-05, + "loss": 0.5324, + "step": 2600 + }, + { + "epoch": 0.3129129129129129, + "grad_norm": 0.1789737343788147, + "learning_rate": 9.733957063635196e-05, + "loss": 0.4967, + "step": 2605 + }, + { + "epoch": 0.31351351351351353, + "grad_norm": 0.18293219804763794, + "learning_rate": 9.732944000097259e-05, + "loss": 0.5441, + "step": 2610 + }, + { + "epoch": 0.3141141141141141, + "grad_norm": 0.20862580835819244, + "learning_rate": 9.731929064323895e-05, + "loss": 0.4834, + "step": 2615 + }, + { + "epoch": 0.31471471471471474, + "grad_norm": 0.1822856217622757, + "learning_rate": 9.730912256716582e-05, + "loss": 0.5704, + "step": 2620 + }, + { + "epoch": 0.3153153153153153, + "grad_norm": 0.2051629275083542, + "learning_rate": 9.729893577677547e-05, + "loss": 0.5149, + "step": 2625 + }, + { + "epoch": 0.3159159159159159, + "grad_norm": 
0.20480480790138245, + "learning_rate": 9.728873027609752e-05, + "loss": 0.5412, + "step": 2630 + }, + { + "epoch": 0.3165165165165165, + "grad_norm": 0.17666372656822205, + "learning_rate": 9.727850606916902e-05, + "loss": 0.5019, + "step": 2635 + }, + { + "epoch": 0.3171171171171171, + "grad_norm": 0.19383026659488678, + "learning_rate": 9.726826316003442e-05, + "loss": 0.5269, + "step": 2640 + }, + { + "epoch": 0.31771771771771773, + "grad_norm": 0.1739753782749176, + "learning_rate": 9.725800155274556e-05, + "loss": 0.4867, + "step": 2645 + }, + { + "epoch": 0.3183183183183183, + "grad_norm": 0.18919163942337036, + "learning_rate": 9.724772125136168e-05, + "loss": 0.525, + "step": 2650 + }, + { + "epoch": 0.31891891891891894, + "grad_norm": 0.2092183381319046, + "learning_rate": 9.723742225994938e-05, + "loss": 0.5567, + "step": 2655 + }, + { + "epoch": 0.3195195195195195, + "grad_norm": 0.1984640508890152, + "learning_rate": 9.722710458258276e-05, + "loss": 0.4905, + "step": 2660 + }, + { + "epoch": 0.32012012012012014, + "grad_norm": 0.21197611093521118, + "learning_rate": 9.721676822334315e-05, + "loss": 0.4638, + "step": 2665 + }, + { + "epoch": 0.3207207207207207, + "grad_norm": 0.21106930077075958, + "learning_rate": 9.72064131863194e-05, + "loss": 0.5742, + "step": 2670 + }, + { + "epoch": 0.3213213213213213, + "grad_norm": 0.1765323430299759, + "learning_rate": 9.719603947560771e-05, + "loss": 0.4783, + "step": 2675 + }, + { + "epoch": 0.3219219219219219, + "grad_norm": 0.18311339616775513, + "learning_rate": 9.718564709531167e-05, + "loss": 0.4522, + "step": 2680 + }, + { + "epoch": 0.3225225225225225, + "grad_norm": 0.19145925343036652, + "learning_rate": 9.717523604954223e-05, + "loss": 0.518, + "step": 2685 + }, + { + "epoch": 0.32312312312312313, + "grad_norm": 0.19816061854362488, + "learning_rate": 9.716480634241773e-05, + "loss": 0.5503, + "step": 2690 + }, + { + "epoch": 0.3237237237237237, + "grad_norm": 0.19963683187961578, + "learning_rate": 9.715435797806395e-05, + "loss": 0.4971, + "step": 2695 + }, + { + "epoch": 0.32432432432432434, + "grad_norm": 0.18355773389339447, + "learning_rate": 9.714389096061396e-05, + "loss": 0.5028, + "step": 2700 + }, + { + "epoch": 0.3249249249249249, + "grad_norm": 0.2459624856710434, + "learning_rate": 9.713340529420826e-05, + "loss": 0.5526, + "step": 2705 + }, + { + "epoch": 0.32552552552552555, + "grad_norm": 0.2084140181541443, + "learning_rate": 9.712290098299475e-05, + "loss": 0.5147, + "step": 2710 + }, + { + "epoch": 0.3261261261261261, + "grad_norm": 0.18029722571372986, + "learning_rate": 9.711237803112865e-05, + "loss": 0.5202, + "step": 2715 + }, + { + "epoch": 0.3267267267267267, + "grad_norm": 0.16493675112724304, + "learning_rate": 9.710183644277257e-05, + "loss": 0.4528, + "step": 2720 + }, + { + "epoch": 0.32732732732732733, + "grad_norm": 0.23521707952022552, + "learning_rate": 9.709127622209652e-05, + "loss": 0.5859, + "step": 2725 + }, + { + "epoch": 0.3279279279279279, + "grad_norm": 0.1886672079563141, + "learning_rate": 9.708069737327786e-05, + "loss": 0.5038, + "step": 2730 + }, + { + "epoch": 0.32852852852852854, + "grad_norm": 0.19265535473823547, + "learning_rate": 9.707009990050131e-05, + "loss": 0.4781, + "step": 2735 + }, + { + "epoch": 0.3291291291291291, + "grad_norm": 0.1997983306646347, + "learning_rate": 9.705948380795897e-05, + "loss": 0.5192, + "step": 2740 + }, + { + "epoch": 0.32972972972972975, + "grad_norm": 0.2557390034198761, + "learning_rate": 9.704884909985031e-05, + "loss": 0.495, + 
"step": 2745 + }, + { + "epoch": 0.3303303303303303, + "grad_norm": 0.1663554459810257, + "learning_rate": 9.703819578038216e-05, + "loss": 0.5106, + "step": 2750 + }, + { + "epoch": 0.3303303303303303, + "eval_loss": 0.4807363748550415, + "eval_runtime": 35.697, + "eval_samples_per_second": 22.411, + "eval_steps_per_second": 5.603, + "step": 2750 + }, + { + "epoch": 0.33093093093093096, + "grad_norm": 0.21156282722949982, + "learning_rate": 9.70275238537687e-05, + "loss": 0.5359, + "step": 2755 + }, + { + "epoch": 0.33153153153153153, + "grad_norm": 0.19707150757312775, + "learning_rate": 9.70168333242315e-05, + "loss": 0.5322, + "step": 2760 + }, + { + "epoch": 0.3321321321321321, + "grad_norm": 0.1987551748752594, + "learning_rate": 9.700612419599943e-05, + "loss": 0.5211, + "step": 2765 + }, + { + "epoch": 0.33273273273273274, + "grad_norm": 0.1993408352136612, + "learning_rate": 9.69953964733088e-05, + "loss": 0.4963, + "step": 2770 + }, + { + "epoch": 0.3333333333333333, + "grad_norm": 0.17729812860488892, + "learning_rate": 9.69846501604032e-05, + "loss": 0.5273, + "step": 2775 + }, + { + "epoch": 0.33393393393393395, + "grad_norm": 0.2088416963815689, + "learning_rate": 9.69738852615336e-05, + "loss": 0.5219, + "step": 2780 + }, + { + "epoch": 0.3345345345345345, + "grad_norm": 0.214419886469841, + "learning_rate": 9.696310178095835e-05, + "loss": 0.5371, + "step": 2785 + }, + { + "epoch": 0.33513513513513515, + "grad_norm": 0.20148953795433044, + "learning_rate": 9.695229972294314e-05, + "loss": 0.5254, + "step": 2790 + }, + { + "epoch": 0.33573573573573573, + "grad_norm": 0.21607989072799683, + "learning_rate": 9.694147909176097e-05, + "loss": 0.502, + "step": 2795 + }, + { + "epoch": 0.33633633633633636, + "grad_norm": 0.22730080783367157, + "learning_rate": 9.69306398916922e-05, + "loss": 0.5142, + "step": 2800 + }, + { + "epoch": 0.33693693693693694, + "grad_norm": 0.2318321019411087, + "learning_rate": 9.691978212702459e-05, + "loss": 0.558, + "step": 2805 + }, + { + "epoch": 0.3375375375375375, + "grad_norm": 0.19326099753379822, + "learning_rate": 9.690890580205318e-05, + "loss": 0.5068, + "step": 2810 + }, + { + "epoch": 0.33813813813813814, + "grad_norm": 0.22810117900371552, + "learning_rate": 9.689801092108037e-05, + "loss": 0.5672, + "step": 2815 + }, + { + "epoch": 0.3387387387387387, + "grad_norm": 0.22707174718379974, + "learning_rate": 9.688709748841591e-05, + "loss": 0.4705, + "step": 2820 + }, + { + "epoch": 0.33933933933933935, + "grad_norm": 0.24668732285499573, + "learning_rate": 9.68761655083769e-05, + "loss": 0.5207, + "step": 2825 + }, + { + "epoch": 0.33993993993993993, + "grad_norm": 0.21590447425842285, + "learning_rate": 9.686521498528774e-05, + "loss": 0.4787, + "step": 2830 + }, + { + "epoch": 0.34054054054054056, + "grad_norm": 0.20210616290569305, + "learning_rate": 9.68542459234802e-05, + "loss": 0.5231, + "step": 2835 + }, + { + "epoch": 0.34114114114114114, + "grad_norm": 0.17560534179210663, + "learning_rate": 9.684325832729335e-05, + "loss": 0.4835, + "step": 2840 + }, + { + "epoch": 0.34174174174174177, + "grad_norm": 0.17611800134181976, + "learning_rate": 9.683225220107363e-05, + "loss": 0.5273, + "step": 2845 + }, + { + "epoch": 0.34234234234234234, + "grad_norm": 0.2211238294839859, + "learning_rate": 9.682122754917479e-05, + "loss": 0.493, + "step": 2850 + }, + { + "epoch": 0.3429429429429429, + "grad_norm": 0.20786675810813904, + "learning_rate": 9.681018437595789e-05, + "loss": 0.5369, + "step": 2855 + }, + { + "epoch": 
0.34354354354354355, + "grad_norm": 0.2106047421693802, + "learning_rate": 9.679912268579136e-05, + "loss": 0.4923, + "step": 2860 + }, + { + "epoch": 0.3441441441441441, + "grad_norm": 0.21731366217136383, + "learning_rate": 9.678804248305091e-05, + "loss": 0.559, + "step": 2865 + }, + { + "epoch": 0.34474474474474476, + "grad_norm": 0.22532141208648682, + "learning_rate": 9.67769437721196e-05, + "loss": 0.5048, + "step": 2870 + }, + { + "epoch": 0.34534534534534533, + "grad_norm": 0.23368632793426514, + "learning_rate": 9.676582655738781e-05, + "loss": 0.4887, + "step": 2875 + }, + { + "epoch": 0.34594594594594597, + "grad_norm": 0.2270139902830124, + "learning_rate": 9.675469084325324e-05, + "loss": 0.5017, + "step": 2880 + }, + { + "epoch": 0.34654654654654654, + "grad_norm": 0.22090162336826324, + "learning_rate": 9.674353663412091e-05, + "loss": 0.5136, + "step": 2885 + }, + { + "epoch": 0.3471471471471472, + "grad_norm": 0.18990397453308105, + "learning_rate": 9.67323639344031e-05, + "loss": 0.498, + "step": 2890 + }, + { + "epoch": 0.34774774774774775, + "grad_norm": 0.24817807972431183, + "learning_rate": 9.672117274851952e-05, + "loss": 0.5257, + "step": 2895 + }, + { + "epoch": 0.3483483483483483, + "grad_norm": 0.23883749544620514, + "learning_rate": 9.670996308089708e-05, + "loss": 0.455, + "step": 2900 + }, + { + "epoch": 0.34894894894894896, + "grad_norm": 0.21196013689041138, + "learning_rate": 9.669873493597006e-05, + "loss": 0.4734, + "step": 2905 + }, + { + "epoch": 0.34954954954954953, + "grad_norm": 0.2275170087814331, + "learning_rate": 9.668748831818005e-05, + "loss": 0.4953, + "step": 2910 + }, + { + "epoch": 0.35015015015015016, + "grad_norm": 0.20784740149974823, + "learning_rate": 9.66762232319759e-05, + "loss": 0.5077, + "step": 2915 + }, + { + "epoch": 0.35075075075075074, + "grad_norm": 0.22956329584121704, + "learning_rate": 9.666493968181383e-05, + "loss": 0.4433, + "step": 2920 + }, + { + "epoch": 0.35135135135135137, + "grad_norm": 0.24458478391170502, + "learning_rate": 9.665363767215732e-05, + "loss": 0.528, + "step": 2925 + }, + { + "epoch": 0.35195195195195195, + "grad_norm": 0.21098071336746216, + "learning_rate": 9.664231720747718e-05, + "loss": 0.4942, + "step": 2930 + }, + { + "epoch": 0.3525525525525526, + "grad_norm": 0.19882138073444366, + "learning_rate": 9.663097829225148e-05, + "loss": 0.4704, + "step": 2935 + }, + { + "epoch": 0.35315315315315315, + "grad_norm": 0.21656285226345062, + "learning_rate": 9.661962093096563e-05, + "loss": 0.4909, + "step": 2940 + }, + { + "epoch": 0.35375375375375373, + "grad_norm": 0.216679185628891, + "learning_rate": 9.66082451281123e-05, + "loss": 0.4855, + "step": 2945 + }, + { + "epoch": 0.35435435435435436, + "grad_norm": 0.20689897239208221, + "learning_rate": 9.659685088819152e-05, + "loss": 0.4355, + "step": 2950 + }, + { + "epoch": 0.35495495495495494, + "grad_norm": 0.22368694841861725, + "learning_rate": 9.658543821571054e-05, + "loss": 0.5387, + "step": 2955 + }, + { + "epoch": 0.35555555555555557, + "grad_norm": 0.22052088379859924, + "learning_rate": 9.657400711518394e-05, + "loss": 0.5567, + "step": 2960 + }, + { + "epoch": 0.35615615615615615, + "grad_norm": 0.2549915313720703, + "learning_rate": 9.656255759113355e-05, + "loss": 0.5356, + "step": 2965 + }, + { + "epoch": 0.3567567567567568, + "grad_norm": 0.24571342766284943, + "learning_rate": 9.655108964808857e-05, + "loss": 0.5178, + "step": 2970 + }, + { + "epoch": 0.35735735735735735, + "grad_norm": 0.2269880622625351, + "learning_rate": 
9.653960329058538e-05, + "loss": 0.4883, + "step": 2975 + }, + { + "epoch": 0.357957957957958, + "grad_norm": 0.23701144754886627, + "learning_rate": 9.652809852316774e-05, + "loss": 0.5237, + "step": 2980 + }, + { + "epoch": 0.35855855855855856, + "grad_norm": 0.23912853002548218, + "learning_rate": 9.651657535038663e-05, + "loss": 0.5064, + "step": 2985 + }, + { + "epoch": 0.35915915915915914, + "grad_norm": 0.22557851672172546, + "learning_rate": 9.650503377680035e-05, + "loss": 0.4864, + "step": 2990 + }, + { + "epoch": 0.35975975975975977, + "grad_norm": 0.2128036916255951, + "learning_rate": 9.649347380697445e-05, + "loss": 0.5378, + "step": 2995 + }, + { + "epoch": 0.36036036036036034, + "grad_norm": 0.23978100717067719, + "learning_rate": 9.648189544548173e-05, + "loss": 0.6153, + "step": 3000 + }, + { + "epoch": 0.36036036036036034, + "eval_loss": 0.47244203090667725, + "eval_runtime": 35.6635, + "eval_samples_per_second": 22.432, + "eval_steps_per_second": 5.608, + "step": 3000 + }, + { + "epoch": 0.360960960960961, + "grad_norm": 0.22764725983142853, + "learning_rate": 9.647029869690238e-05, + "loss": 0.4915, + "step": 3005 + }, + { + "epoch": 0.36156156156156155, + "grad_norm": 0.23097966611385345, + "learning_rate": 9.645868356582373e-05, + "loss": 0.4695, + "step": 3010 + }, + { + "epoch": 0.3621621621621622, + "grad_norm": 0.20730188488960266, + "learning_rate": 9.644705005684045e-05, + "loss": 0.4611, + "step": 3015 + }, + { + "epoch": 0.36276276276276276, + "grad_norm": 0.22593528032302856, + "learning_rate": 9.643539817455448e-05, + "loss": 0.4925, + "step": 3020 + }, + { + "epoch": 0.3633633633633634, + "grad_norm": 0.18994325399398804, + "learning_rate": 9.642372792357501e-05, + "loss": 0.5304, + "step": 3025 + }, + { + "epoch": 0.36396396396396397, + "grad_norm": 0.20703668892383575, + "learning_rate": 9.64120393085185e-05, + "loss": 0.4805, + "step": 3030 + }, + { + "epoch": 0.36456456456456454, + "grad_norm": 0.20578399300575256, + "learning_rate": 9.640033233400867e-05, + "loss": 0.5288, + "step": 3035 + }, + { + "epoch": 0.3651651651651652, + "grad_norm": 0.24444563686847687, + "learning_rate": 9.638860700467652e-05, + "loss": 0.494, + "step": 3040 + }, + { + "epoch": 0.36576576576576575, + "grad_norm": 0.20720888674259186, + "learning_rate": 9.637686332516029e-05, + "loss": 0.5434, + "step": 3045 + }, + { + "epoch": 0.3663663663663664, + "grad_norm": 0.2322540134191513, + "learning_rate": 9.63651013001055e-05, + "loss": 0.4814, + "step": 3050 + }, + { + "epoch": 0.36696696696696696, + "grad_norm": 0.2209395170211792, + "learning_rate": 9.635332093416491e-05, + "loss": 0.4871, + "step": 3055 + }, + { + "epoch": 0.3675675675675676, + "grad_norm": 0.242445707321167, + "learning_rate": 9.634152223199855e-05, + "loss": 0.4968, + "step": 3060 + }, + { + "epoch": 0.36816816816816816, + "grad_norm": 0.2292807400226593, + "learning_rate": 9.632970519827367e-05, + "loss": 0.5409, + "step": 3065 + }, + { + "epoch": 0.3687687687687688, + "grad_norm": 0.22164930403232574, + "learning_rate": 9.631786983766482e-05, + "loss": 0.4993, + "step": 3070 + }, + { + "epoch": 0.36936936936936937, + "grad_norm": 0.21160635352134705, + "learning_rate": 9.630601615485378e-05, + "loss": 0.4895, + "step": 3075 + }, + { + "epoch": 0.36996996996996995, + "grad_norm": 0.21484865248203278, + "learning_rate": 9.629414415452954e-05, + "loss": 0.4554, + "step": 3080 + }, + { + "epoch": 0.3705705705705706, + "grad_norm": 0.19489717483520508, + "learning_rate": 9.62822538413884e-05, + "loss": 0.4545, 
+ "step": 3085 + }, + { + "epoch": 0.37117117117117115, + "grad_norm": 0.22546513378620148, + "learning_rate": 9.627034522013386e-05, + "loss": 0.4997, + "step": 3090 + }, + { + "epoch": 0.3717717717717718, + "grad_norm": 0.22963769733905792, + "learning_rate": 9.625841829547668e-05, + "loss": 0.5092, + "step": 3095 + }, + { + "epoch": 0.37237237237237236, + "grad_norm": 0.2154102623462677, + "learning_rate": 9.624647307213485e-05, + "loss": 0.4711, + "step": 3100 + }, + { + "epoch": 0.372972972972973, + "grad_norm": 0.2301490157842636, + "learning_rate": 9.623450955483363e-05, + "loss": 0.4809, + "step": 3105 + }, + { + "epoch": 0.37357357357357357, + "grad_norm": 0.2505705952644348, + "learning_rate": 9.622252774830545e-05, + "loss": 0.4854, + "step": 3110 + }, + { + "epoch": 0.3741741741741742, + "grad_norm": 0.23982185125350952, + "learning_rate": 9.621052765729006e-05, + "loss": 0.5379, + "step": 3115 + }, + { + "epoch": 0.3747747747747748, + "grad_norm": 0.24677838385105133, + "learning_rate": 9.619850928653436e-05, + "loss": 0.4954, + "step": 3120 + }, + { + "epoch": 0.37537537537537535, + "grad_norm": 0.20734712481498718, + "learning_rate": 9.618647264079253e-05, + "loss": 0.4826, + "step": 3125 + }, + { + "epoch": 0.375975975975976, + "grad_norm": 0.2506435513496399, + "learning_rate": 9.617441772482598e-05, + "loss": 0.4749, + "step": 3130 + }, + { + "epoch": 0.37657657657657656, + "grad_norm": 0.2044931799173355, + "learning_rate": 9.616234454340332e-05, + "loss": 0.5429, + "step": 3135 + }, + { + "epoch": 0.3771771771771772, + "grad_norm": 0.23018285632133484, + "learning_rate": 9.615025310130044e-05, + "loss": 0.5125, + "step": 3140 + }, + { + "epoch": 0.37777777777777777, + "grad_norm": 0.2430395931005478, + "learning_rate": 9.613814340330036e-05, + "loss": 0.4755, + "step": 3145 + }, + { + "epoch": 0.3783783783783784, + "grad_norm": 0.2155851274728775, + "learning_rate": 9.612601545419342e-05, + "loss": 0.4792, + "step": 3150 + }, + { + "epoch": 0.378978978978979, + "grad_norm": 0.20493851602077484, + "learning_rate": 9.611386925877711e-05, + "loss": 0.5253, + "step": 3155 + }, + { + "epoch": 0.3795795795795796, + "grad_norm": 0.2408454418182373, + "learning_rate": 9.610170482185619e-05, + "loss": 0.4542, + "step": 3160 + }, + { + "epoch": 0.3801801801801802, + "grad_norm": 0.22176580131053925, + "learning_rate": 9.608952214824257e-05, + "loss": 0.4956, + "step": 3165 + }, + { + "epoch": 0.38078078078078076, + "grad_norm": 0.22562333941459656, + "learning_rate": 9.607732124275545e-05, + "loss": 0.4627, + "step": 3170 + }, + { + "epoch": 0.3813813813813814, + "grad_norm": 0.2003718614578247, + "learning_rate": 9.60651021102212e-05, + "loss": 0.3945, + "step": 3175 + }, + { + "epoch": 0.38198198198198197, + "grad_norm": 0.24408352375030518, + "learning_rate": 9.605286475547339e-05, + "loss": 0.4303, + "step": 3180 + }, + { + "epoch": 0.3825825825825826, + "grad_norm": 0.24553069472312927, + "learning_rate": 9.604060918335283e-05, + "loss": 0.4767, + "step": 3185 + }, + { + "epoch": 0.3831831831831832, + "grad_norm": 0.23498015105724335, + "learning_rate": 9.602833539870753e-05, + "loss": 0.4424, + "step": 3190 + }, + { + "epoch": 0.3837837837837838, + "grad_norm": 0.22736471891403198, + "learning_rate": 9.601604340639265e-05, + "loss": 0.4697, + "step": 3195 + }, + { + "epoch": 0.3843843843843844, + "grad_norm": 0.23163831233978271, + "learning_rate": 9.600373321127065e-05, + "loss": 0.4637, + "step": 3200 + }, + { + "epoch": 0.384984984984985, + "grad_norm": 
0.27113768458366394, + "learning_rate": 9.599140481821112e-05, + "loss": 0.4072, + "step": 3205 + }, + { + "epoch": 0.3855855855855856, + "grad_norm": 0.26473140716552734, + "learning_rate": 9.597905823209086e-05, + "loss": 0.5052, + "step": 3210 + }, + { + "epoch": 0.38618618618618616, + "grad_norm": 0.23976100981235504, + "learning_rate": 9.596669345779388e-05, + "loss": 0.5035, + "step": 3215 + }, + { + "epoch": 0.3867867867867868, + "grad_norm": 0.19085469841957092, + "learning_rate": 9.595431050021135e-05, + "loss": 0.4899, + "step": 3220 + }, + { + "epoch": 0.38738738738738737, + "grad_norm": 0.23537762463092804, + "learning_rate": 9.594190936424173e-05, + "loss": 0.4674, + "step": 3225 + }, + { + "epoch": 0.387987987987988, + "grad_norm": 0.2305203676223755, + "learning_rate": 9.592949005479053e-05, + "loss": 0.4776, + "step": 3230 + }, + { + "epoch": 0.3885885885885886, + "grad_norm": 0.23646143078804016, + "learning_rate": 9.591705257677054e-05, + "loss": 0.4402, + "step": 3235 + }, + { + "epoch": 0.3891891891891892, + "grad_norm": 0.2590693533420563, + "learning_rate": 9.590459693510177e-05, + "loss": 0.5002, + "step": 3240 + }, + { + "epoch": 0.3897897897897898, + "grad_norm": 0.2276909500360489, + "learning_rate": 9.58921231347113e-05, + "loss": 0.4744, + "step": 3245 + }, + { + "epoch": 0.39039039039039036, + "grad_norm": 0.24374452233314514, + "learning_rate": 9.587963118053347e-05, + "loss": 0.5043, + "step": 3250 + }, + { + "epoch": 0.39039039039039036, + "eval_loss": 0.4580923318862915, + "eval_runtime": 35.5771, + "eval_samples_per_second": 22.486, + "eval_steps_per_second": 5.622, + "step": 3250 + }, + { + "epoch": 0.390990990990991, + "grad_norm": 0.21707071363925934, + "learning_rate": 9.586712107750982e-05, + "loss": 0.5061, + "step": 3255 + }, + { + "epoch": 0.39159159159159157, + "grad_norm": 0.28769123554229736, + "learning_rate": 9.5854592830589e-05, + "loss": 0.4688, + "step": 3260 + }, + { + "epoch": 0.3921921921921922, + "grad_norm": 0.2628069221973419, + "learning_rate": 9.584204644472688e-05, + "loss": 0.4614, + "step": 3265 + }, + { + "epoch": 0.3927927927927928, + "grad_norm": 0.2501368820667267, + "learning_rate": 9.582948192488652e-05, + "loss": 0.4461, + "step": 3270 + }, + { + "epoch": 0.3933933933933934, + "grad_norm": 0.2464858740568161, + "learning_rate": 9.581689927603812e-05, + "loss": 0.4968, + "step": 3275 + }, + { + "epoch": 0.393993993993994, + "grad_norm": 0.21707935631275177, + "learning_rate": 9.580429850315906e-05, + "loss": 0.4345, + "step": 3280 + }, + { + "epoch": 0.3945945945945946, + "grad_norm": 0.1674998700618744, + "learning_rate": 9.57916796112339e-05, + "loss": 0.4617, + "step": 3285 + }, + { + "epoch": 0.3951951951951952, + "grad_norm": 0.2718343436717987, + "learning_rate": 9.577904260525436e-05, + "loss": 0.4988, + "step": 3290 + }, + { + "epoch": 0.39579579579579577, + "grad_norm": 0.23669962584972382, + "learning_rate": 9.576638749021933e-05, + "loss": 0.4525, + "step": 3295 + }, + { + "epoch": 0.3963963963963964, + "grad_norm": 0.278176486492157, + "learning_rate": 9.575371427113484e-05, + "loss": 0.5211, + "step": 3300 + }, + { + "epoch": 0.396996996996997, + "grad_norm": 0.19819344580173492, + "learning_rate": 9.574102295301414e-05, + "loss": 0.4779, + "step": 3305 + }, + { + "epoch": 0.3975975975975976, + "grad_norm": 0.22306062281131744, + "learning_rate": 9.572831354087756e-05, + "loss": 0.4847, + "step": 3310 + }, + { + "epoch": 0.3981981981981982, + "grad_norm": 0.23362375795841217, + "learning_rate": 
9.571558603975266e-05, + "loss": 0.4081, + "step": 3315 + }, + { + "epoch": 0.3987987987987988, + "grad_norm": 0.21090473234653473, + "learning_rate": 9.570284045467412e-05, + "loss": 0.4819, + "step": 3320 + }, + { + "epoch": 0.3993993993993994, + "grad_norm": 0.23887136578559875, + "learning_rate": 9.569007679068376e-05, + "loss": 0.4929, + "step": 3325 + }, + { + "epoch": 0.4, + "grad_norm": 0.23648279905319214, + "learning_rate": 9.567729505283057e-05, + "loss": 0.4871, + "step": 3330 + }, + { + "epoch": 0.4006006006006006, + "grad_norm": 0.2086629420518875, + "learning_rate": 9.566449524617069e-05, + "loss": 0.4621, + "step": 3335 + }, + { + "epoch": 0.4012012012012012, + "grad_norm": 0.20192669332027435, + "learning_rate": 9.565167737576744e-05, + "loss": 0.4218, + "step": 3340 + }, + { + "epoch": 0.4018018018018018, + "grad_norm": 0.2431265413761139, + "learning_rate": 9.563884144669122e-05, + "loss": 0.5005, + "step": 3345 + }, + { + "epoch": 0.4024024024024024, + "grad_norm": 0.30352410674095154, + "learning_rate": 9.56259874640196e-05, + "loss": 0.5186, + "step": 3350 + }, + { + "epoch": 0.403003003003003, + "grad_norm": 0.27248936891555786, + "learning_rate": 9.561311543283733e-05, + "loss": 0.5055, + "step": 3355 + }, + { + "epoch": 0.4036036036036036, + "grad_norm": 0.3054885268211365, + "learning_rate": 9.560022535823623e-05, + "loss": 0.4541, + "step": 3360 + }, + { + "epoch": 0.4042042042042042, + "grad_norm": 0.24470002949237823, + "learning_rate": 9.558731724531531e-05, + "loss": 0.4806, + "step": 3365 + }, + { + "epoch": 0.4048048048048048, + "grad_norm": 0.22992166876792908, + "learning_rate": 9.55743910991807e-05, + "loss": 0.4664, + "step": 3370 + }, + { + "epoch": 0.40540540540540543, + "grad_norm": 0.2561578154563904, + "learning_rate": 9.556144692494568e-05, + "loss": 0.5623, + "step": 3375 + }, + { + "epoch": 0.406006006006006, + "grad_norm": 0.25261953473091125, + "learning_rate": 9.554848472773061e-05, + "loss": 0.4414, + "step": 3380 + }, + { + "epoch": 0.4066066066066066, + "grad_norm": 0.21225038170814514, + "learning_rate": 9.553550451266304e-05, + "loss": 0.4548, + "step": 3385 + }, + { + "epoch": 0.4072072072072072, + "grad_norm": 0.20666995644569397, + "learning_rate": 9.552250628487761e-05, + "loss": 0.4819, + "step": 3390 + }, + { + "epoch": 0.4078078078078078, + "grad_norm": 0.2576189339160919, + "learning_rate": 9.55094900495161e-05, + "loss": 0.4577, + "step": 3395 + }, + { + "epoch": 0.4084084084084084, + "grad_norm": 0.2355472445487976, + "learning_rate": 9.549645581172741e-05, + "loss": 0.4501, + "step": 3400 + }, + { + "epoch": 0.409009009009009, + "grad_norm": 0.2224271148443222, + "learning_rate": 9.548340357666759e-05, + "loss": 0.4815, + "step": 3405 + }, + { + "epoch": 0.4096096096096096, + "grad_norm": 0.2793062627315521, + "learning_rate": 9.547033334949972e-05, + "loss": 0.4471, + "step": 3410 + }, + { + "epoch": 0.4102102102102102, + "grad_norm": 0.2566879689693451, + "learning_rate": 9.545724513539411e-05, + "loss": 0.5139, + "step": 3415 + }, + { + "epoch": 0.41081081081081083, + "grad_norm": 0.22943030297756195, + "learning_rate": 9.54441389395281e-05, + "loss": 0.482, + "step": 3420 + }, + { + "epoch": 0.4114114114114114, + "grad_norm": 0.22264313697814941, + "learning_rate": 9.54310147670862e-05, + "loss": 0.4776, + "step": 3425 + }, + { + "epoch": 0.412012012012012, + "grad_norm": 0.2629396915435791, + "learning_rate": 9.541787262326001e-05, + "loss": 0.4684, + "step": 3430 + }, + { + "epoch": 0.4126126126126126, + "grad_norm": 
0.22405704855918884, + "learning_rate": 9.540471251324821e-05, + "loss": 0.4368, + "step": 3435 + }, + { + "epoch": 0.4132132132132132, + "grad_norm": 0.23231935501098633, + "learning_rate": 9.539153444225665e-05, + "loss": 0.5391, + "step": 3440 + }, + { + "epoch": 0.4138138138138138, + "grad_norm": 0.25214463472366333, + "learning_rate": 9.537833841549821e-05, + "loss": 0.4723, + "step": 3445 + }, + { + "epoch": 0.4144144144144144, + "grad_norm": 0.250463604927063, + "learning_rate": 9.536512443819294e-05, + "loss": 0.451, + "step": 3450 + }, + { + "epoch": 0.41501501501501503, + "grad_norm": 0.24605700373649597, + "learning_rate": 9.535189251556795e-05, + "loss": 0.5072, + "step": 3455 + }, + { + "epoch": 0.4156156156156156, + "grad_norm": 0.2573533356189728, + "learning_rate": 9.533864265285746e-05, + "loss": 0.4928, + "step": 3460 + }, + { + "epoch": 0.41621621621621624, + "grad_norm": 0.22676625847816467, + "learning_rate": 9.532537485530279e-05, + "loss": 0.4481, + "step": 3465 + }, + { + "epoch": 0.4168168168168168, + "grad_norm": 0.2723495066165924, + "learning_rate": 9.531208912815235e-05, + "loss": 0.5058, + "step": 3470 + }, + { + "epoch": 0.4174174174174174, + "grad_norm": 0.2828966975212097, + "learning_rate": 9.529878547666164e-05, + "loss": 0.4567, + "step": 3475 + }, + { + "epoch": 0.418018018018018, + "grad_norm": 0.23241691291332245, + "learning_rate": 9.528546390609329e-05, + "loss": 0.5008, + "step": 3480 + }, + { + "epoch": 0.4186186186186186, + "grad_norm": 0.21605411171913147, + "learning_rate": 9.527212442171694e-05, + "loss": 0.4309, + "step": 3485 + }, + { + "epoch": 0.41921921921921923, + "grad_norm": 0.2571276128292084, + "learning_rate": 9.525876702880937e-05, + "loss": 0.517, + "step": 3490 + }, + { + "epoch": 0.4198198198198198, + "grad_norm": 0.317888468503952, + "learning_rate": 9.524539173265444e-05, + "loss": 0.4841, + "step": 3495 + }, + { + "epoch": 0.42042042042042044, + "grad_norm": 0.2505663335323334, + "learning_rate": 9.52319985385431e-05, + "loss": 0.4586, + "step": 3500 + }, + { + "epoch": 0.42042042042042044, + "eval_loss": 0.4424484968185425, + "eval_runtime": 35.6127, + "eval_samples_per_second": 22.464, + "eval_steps_per_second": 5.616, + "step": 3500 + }, + { + "epoch": 0.421021021021021, + "grad_norm": 0.25227388739585876, + "learning_rate": 9.521858745177332e-05, + "loss": 0.4151, + "step": 3505 + }, + { + "epoch": 0.42162162162162165, + "grad_norm": 0.24687834084033966, + "learning_rate": 9.520515847765025e-05, + "loss": 0.4532, + "step": 3510 + }, + { + "epoch": 0.4222222222222222, + "grad_norm": 0.24203993380069733, + "learning_rate": 9.519171162148604e-05, + "loss": 0.4706, + "step": 3515 + }, + { + "epoch": 0.4228228228228228, + "grad_norm": 0.2538060247898102, + "learning_rate": 9.517824688859991e-05, + "loss": 0.4953, + "step": 3520 + }, + { + "epoch": 0.42342342342342343, + "grad_norm": 0.22765670716762543, + "learning_rate": 9.516476428431819e-05, + "loss": 0.474, + "step": 3525 + }, + { + "epoch": 0.424024024024024, + "grad_norm": 0.266476571559906, + "learning_rate": 9.515126381397429e-05, + "loss": 0.4863, + "step": 3530 + }, + { + "epoch": 0.42462462462462464, + "grad_norm": 0.2860608696937561, + "learning_rate": 9.513774548290862e-05, + "loss": 0.497, + "step": 3535 + }, + { + "epoch": 0.4252252252252252, + "grad_norm": 0.30343401432037354, + "learning_rate": 9.51242092964687e-05, + "loss": 0.4592, + "step": 3540 + }, + { + "epoch": 0.42582582582582584, + "grad_norm": 0.30189409852027893, + "learning_rate": 
9.511065526000915e-05, + "loss": 0.4557, + "step": 3545 + }, + { + "epoch": 0.4264264264264264, + "grad_norm": 0.2727750241756439, + "learning_rate": 9.509708337889159e-05, + "loss": 0.4625, + "step": 3550 + }, + { + "epoch": 0.42702702702702705, + "grad_norm": 0.27454304695129395, + "learning_rate": 9.50834936584847e-05, + "loss": 0.5036, + "step": 3555 + }, + { + "epoch": 0.4276276276276276, + "grad_norm": 0.2149980366230011, + "learning_rate": 9.506988610416425e-05, + "loss": 0.4307, + "step": 3560 + }, + { + "epoch": 0.4282282282282282, + "grad_norm": 0.30365270376205444, + "learning_rate": 9.505626072131306e-05, + "loss": 0.4864, + "step": 3565 + }, + { + "epoch": 0.42882882882882883, + "grad_norm": 0.24695612490177155, + "learning_rate": 9.5042617515321e-05, + "loss": 0.4211, + "step": 3570 + }, + { + "epoch": 0.4294294294294294, + "grad_norm": 0.2488124817609787, + "learning_rate": 9.502895649158496e-05, + "loss": 0.4793, + "step": 3575 + }, + { + "epoch": 0.43003003003003004, + "grad_norm": 0.21579088270664215, + "learning_rate": 9.501527765550893e-05, + "loss": 0.4628, + "step": 3580 + }, + { + "epoch": 0.4306306306306306, + "grad_norm": 0.29319724440574646, + "learning_rate": 9.500158101250389e-05, + "loss": 0.5088, + "step": 3585 + }, + { + "epoch": 0.43123123123123125, + "grad_norm": 0.2599899172782898, + "learning_rate": 9.498786656798793e-05, + "loss": 0.4622, + "step": 3590 + }, + { + "epoch": 0.4318318318318318, + "grad_norm": 0.229834645986557, + "learning_rate": 9.497413432738612e-05, + "loss": 0.4755, + "step": 3595 + }, + { + "epoch": 0.43243243243243246, + "grad_norm": 0.2928219735622406, + "learning_rate": 9.496038429613056e-05, + "loss": 0.4545, + "step": 3600 + }, + { + "epoch": 0.43303303303303303, + "grad_norm": 0.25314855575561523, + "learning_rate": 9.49466164796605e-05, + "loss": 0.4667, + "step": 3605 + }, + { + "epoch": 0.4336336336336336, + "grad_norm": 0.2712530791759491, + "learning_rate": 9.493283088342209e-05, + "loss": 0.4864, + "step": 3610 + }, + { + "epoch": 0.43423423423423424, + "grad_norm": 0.24743470549583435, + "learning_rate": 9.491902751286857e-05, + "loss": 0.4741, + "step": 3615 + }, + { + "epoch": 0.4348348348348348, + "grad_norm": 0.2904561460018158, + "learning_rate": 9.490520637346026e-05, + "loss": 0.5047, + "step": 3620 + }, + { + "epoch": 0.43543543543543545, + "grad_norm": 0.23873092234134674, + "learning_rate": 9.489136747066441e-05, + "loss": 0.4633, + "step": 3625 + }, + { + "epoch": 0.436036036036036, + "grad_norm": 0.27880802750587463, + "learning_rate": 9.487751080995535e-05, + "loss": 0.4693, + "step": 3630 + }, + { + "epoch": 0.43663663663663665, + "grad_norm": 0.28825628757476807, + "learning_rate": 9.486363639681447e-05, + "loss": 0.4463, + "step": 3635 + }, + { + "epoch": 0.43723723723723723, + "grad_norm": 0.2847222685813904, + "learning_rate": 9.48497442367301e-05, + "loss": 0.4602, + "step": 3640 + }, + { + "epoch": 0.43783783783783786, + "grad_norm": 0.25332358479499817, + "learning_rate": 9.483583433519769e-05, + "loss": 0.4772, + "step": 3645 + }, + { + "epoch": 0.43843843843843844, + "grad_norm": 0.25083234906196594, + "learning_rate": 9.482190669771958e-05, + "loss": 0.3913, + "step": 3650 + }, + { + "epoch": 0.439039039039039, + "grad_norm": 0.2363145500421524, + "learning_rate": 9.480796132980526e-05, + "loss": 0.4521, + "step": 3655 + }, + { + "epoch": 0.43963963963963965, + "grad_norm": 0.21178728342056274, + "learning_rate": 9.479399823697115e-05, + "loss": 0.4516, + "step": 3660 + }, + { + "epoch": 
0.4402402402402402, + "grad_norm": 0.3048122823238373, + "learning_rate": 9.47800174247407e-05, + "loss": 0.4607, + "step": 3665 + }, + { + "epoch": 0.44084084084084085, + "grad_norm": 0.30136120319366455, + "learning_rate": 9.476601889864436e-05, + "loss": 0.4843, + "step": 3670 + }, + { + "epoch": 0.44144144144144143, + "grad_norm": 0.2415563464164734, + "learning_rate": 9.475200266421962e-05, + "loss": 0.4694, + "step": 3675 + }, + { + "epoch": 0.44204204204204206, + "grad_norm": 0.25498002767562866, + "learning_rate": 9.473796872701097e-05, + "loss": 0.4571, + "step": 3680 + }, + { + "epoch": 0.44264264264264264, + "grad_norm": 0.2813263237476349, + "learning_rate": 9.472391709256986e-05, + "loss": 0.4851, + "step": 3685 + }, + { + "epoch": 0.44324324324324327, + "grad_norm": 0.23250964283943176, + "learning_rate": 9.470984776645478e-05, + "loss": 0.4642, + "step": 3690 + }, + { + "epoch": 0.44384384384384384, + "grad_norm": 0.2851886451244354, + "learning_rate": 9.469576075423119e-05, + "loss": 0.5034, + "step": 3695 + }, + { + "epoch": 0.4444444444444444, + "grad_norm": 0.22036044299602509, + "learning_rate": 9.468165606147158e-05, + "loss": 0.4534, + "step": 3700 + }, + { + "epoch": 0.44504504504504505, + "grad_norm": 0.22608880698680878, + "learning_rate": 9.466753369375544e-05, + "loss": 0.4113, + "step": 3705 + }, + { + "epoch": 0.4456456456456456, + "grad_norm": 0.28404590487480164, + "learning_rate": 9.465339365666918e-05, + "loss": 0.4374, + "step": 3710 + }, + { + "epoch": 0.44624624624624626, + "grad_norm": 0.29327112436294556, + "learning_rate": 9.463923595580628e-05, + "loss": 0.4706, + "step": 3715 + }, + { + "epoch": 0.44684684684684683, + "grad_norm": 0.2675343453884125, + "learning_rate": 9.462506059676717e-05, + "loss": 0.4486, + "step": 3720 + }, + { + "epoch": 0.44744744744744747, + "grad_norm": 0.2927496135234833, + "learning_rate": 9.461086758515926e-05, + "loss": 0.4567, + "step": 3725 + }, + { + "epoch": 0.44804804804804804, + "grad_norm": 0.2421569973230362, + "learning_rate": 9.459665692659698e-05, + "loss": 0.4276, + "step": 3730 + }, + { + "epoch": 0.4486486486486487, + "grad_norm": 0.2948514521121979, + "learning_rate": 9.458242862670169e-05, + "loss": 0.4773, + "step": 3735 + }, + { + "epoch": 0.44924924924924925, + "grad_norm": 0.2598010003566742, + "learning_rate": 9.456818269110176e-05, + "loss": 0.4914, + "step": 3740 + }, + { + "epoch": 0.4498498498498498, + "grad_norm": 0.23903946578502655, + "learning_rate": 9.455391912543252e-05, + "loss": 0.4357, + "step": 3745 + }, + { + "epoch": 0.45045045045045046, + "grad_norm": 0.251955509185791, + "learning_rate": 9.453963793533631e-05, + "loss": 0.4996, + "step": 3750 + }, + { + "epoch": 0.45045045045045046, + "eval_loss": 0.4331132173538208, + "eval_runtime": 35.6334, + "eval_samples_per_second": 22.451, + "eval_steps_per_second": 5.613, + "step": 3750 + }, + { + "epoch": 0.45105105105105103, + "grad_norm": 0.31112930178642273, + "learning_rate": 9.452533912646239e-05, + "loss": 0.4731, + "step": 3755 + }, + { + "epoch": 0.45165165165165166, + "grad_norm": 0.2766417860984802, + "learning_rate": 9.451102270446703e-05, + "loss": 0.4503, + "step": 3760 + }, + { + "epoch": 0.45225225225225224, + "grad_norm": 0.25546813011169434, + "learning_rate": 9.449668867501343e-05, + "loss": 0.4644, + "step": 3765 + }, + { + "epoch": 0.45285285285285287, + "grad_norm": 0.24376080930233002, + "learning_rate": 9.44823370437718e-05, + "loss": 0.453, + "step": 3770 + }, + { + "epoch": 0.45345345345345345, + "grad_norm": 
0.29174941778182983, + "learning_rate": 9.44679678164193e-05, + "loss": 0.4494, + "step": 3775 + }, + { + "epoch": 0.4540540540540541, + "grad_norm": 0.27715715765953064, + "learning_rate": 9.445358099863998e-05, + "loss": 0.386, + "step": 3780 + }, + { + "epoch": 0.45465465465465466, + "grad_norm": 0.2590334415435791, + "learning_rate": 9.443917659612499e-05, + "loss": 0.4672, + "step": 3785 + }, + { + "epoch": 0.45525525525525523, + "grad_norm": 0.30765074491500854, + "learning_rate": 9.44247546145723e-05, + "loss": 0.4694, + "step": 3790 + }, + { + "epoch": 0.45585585585585586, + "grad_norm": 0.30974051356315613, + "learning_rate": 9.441031505968692e-05, + "loss": 0.4396, + "step": 3795 + }, + { + "epoch": 0.45645645645645644, + "grad_norm": 0.2814948260784149, + "learning_rate": 9.439585793718075e-05, + "loss": 0.4841, + "step": 3800 + }, + { + "epoch": 0.45705705705705707, + "grad_norm": 0.2723504304885864, + "learning_rate": 9.438138325277269e-05, + "loss": 0.4723, + "step": 3805 + }, + { + "epoch": 0.45765765765765765, + "grad_norm": 0.2845655679702759, + "learning_rate": 9.436689101218856e-05, + "loss": 0.4633, + "step": 3810 + }, + { + "epoch": 0.4582582582582583, + "grad_norm": 0.2787138521671295, + "learning_rate": 9.435238122116112e-05, + "loss": 0.4433, + "step": 3815 + }, + { + "epoch": 0.45885885885885885, + "grad_norm": 0.28986725211143494, + "learning_rate": 9.433785388543012e-05, + "loss": 0.4294, + "step": 3820 + }, + { + "epoch": 0.4594594594594595, + "grad_norm": 0.26881736516952515, + "learning_rate": 9.432330901074218e-05, + "loss": 0.4476, + "step": 3825 + }, + { + "epoch": 0.46006006006006006, + "grad_norm": 0.23753543198108673, + "learning_rate": 9.430874660285092e-05, + "loss": 0.4544, + "step": 3830 + }, + { + "epoch": 0.46066066066066064, + "grad_norm": 0.24957028031349182, + "learning_rate": 9.429416666751683e-05, + "loss": 0.4322, + "step": 3835 + }, + { + "epoch": 0.46126126126126127, + "grad_norm": 0.28791627287864685, + "learning_rate": 9.42795692105074e-05, + "loss": 0.4256, + "step": 3840 + }, + { + "epoch": 0.46186186186186184, + "grad_norm": 0.2748839855194092, + "learning_rate": 9.4264954237597e-05, + "loss": 0.4783, + "step": 3845 + }, + { + "epoch": 0.4624624624624625, + "grad_norm": 0.2614935636520386, + "learning_rate": 9.425032175456699e-05, + "loss": 0.447, + "step": 3850 + }, + { + "epoch": 0.46306306306306305, + "grad_norm": 0.27256256341934204, + "learning_rate": 9.423567176720558e-05, + "loss": 0.4418, + "step": 3855 + }, + { + "epoch": 0.4636636636636637, + "grad_norm": 0.2694026231765747, + "learning_rate": 9.422100428130797e-05, + "loss": 0.4641, + "step": 3860 + }, + { + "epoch": 0.46426426426426426, + "grad_norm": 0.3331802785396576, + "learning_rate": 9.420631930267623e-05, + "loss": 0.4657, + "step": 3865 + }, + { + "epoch": 0.4648648648648649, + "grad_norm": 0.24749064445495605, + "learning_rate": 9.41916168371194e-05, + "loss": 0.4974, + "step": 3870 + }, + { + "epoch": 0.46546546546546547, + "grad_norm": 0.25842297077178955, + "learning_rate": 9.417689689045337e-05, + "loss": 0.4185, + "step": 3875 + }, + { + "epoch": 0.46606606606606604, + "grad_norm": 0.25025177001953125, + "learning_rate": 9.416215946850104e-05, + "loss": 0.4707, + "step": 3880 + }, + { + "epoch": 0.4666666666666667, + "grad_norm": 0.24062250554561615, + "learning_rate": 9.414740457709213e-05, + "loss": 0.4227, + "step": 3885 + }, + { + "epoch": 0.46726726726726725, + "grad_norm": 0.29786667227745056, + "learning_rate": 9.41326322220633e-05, + "loss": 0.4499, + 
"step": 3890 + }, + { + "epoch": 0.4678678678678679, + "grad_norm": 0.2728450298309326, + "learning_rate": 9.411784240925818e-05, + "loss": 0.404, + "step": 3895 + }, + { + "epoch": 0.46846846846846846, + "grad_norm": 0.25382304191589355, + "learning_rate": 9.410303514452721e-05, + "loss": 0.4989, + "step": 3900 + }, + { + "epoch": 0.4690690690690691, + "grad_norm": 0.2877455949783325, + "learning_rate": 9.408821043372777e-05, + "loss": 0.4942, + "step": 3905 + }, + { + "epoch": 0.46966966966966966, + "grad_norm": 0.27008742094039917, + "learning_rate": 9.40733682827242e-05, + "loss": 0.4847, + "step": 3910 + }, + { + "epoch": 0.4702702702702703, + "grad_norm": 0.2753797471523285, + "learning_rate": 9.405850869738764e-05, + "loss": 0.4571, + "step": 3915 + }, + { + "epoch": 0.4708708708708709, + "grad_norm": 0.30681371688842773, + "learning_rate": 9.40436316835962e-05, + "loss": 0.4697, + "step": 3920 + }, + { + "epoch": 0.47147147147147145, + "grad_norm": 0.2850700616836548, + "learning_rate": 9.402873724723483e-05, + "loss": 0.4297, + "step": 3925 + }, + { + "epoch": 0.4720720720720721, + "grad_norm": 0.2790954113006592, + "learning_rate": 9.401382539419544e-05, + "loss": 0.4217, + "step": 3930 + }, + { + "epoch": 0.47267267267267266, + "grad_norm": 0.2670901119709015, + "learning_rate": 9.399889613037675e-05, + "loss": 0.4696, + "step": 3935 + }, + { + "epoch": 0.4732732732732733, + "grad_norm": 0.3313315808773041, + "learning_rate": 9.398394946168443e-05, + "loss": 0.5239, + "step": 3940 + }, + { + "epoch": 0.47387387387387386, + "grad_norm": 0.2873570919036865, + "learning_rate": 9.396898539403101e-05, + "loss": 0.4921, + "step": 3945 + }, + { + "epoch": 0.4744744744744745, + "grad_norm": 0.28026047348976135, + "learning_rate": 9.395400393333589e-05, + "loss": 0.4495, + "step": 3950 + }, + { + "epoch": 0.47507507507507507, + "grad_norm": 0.29478156566619873, + "learning_rate": 9.393900508552538e-05, + "loss": 0.4111, + "step": 3955 + }, + { + "epoch": 0.4756756756756757, + "grad_norm": 0.2536068558692932, + "learning_rate": 9.392398885653266e-05, + "loss": 0.4605, + "step": 3960 + }, + { + "epoch": 0.4762762762762763, + "grad_norm": 0.27553731203079224, + "learning_rate": 9.390895525229775e-05, + "loss": 0.4648, + "step": 3965 + }, + { + "epoch": 0.47687687687687685, + "grad_norm": 0.2897990643978119, + "learning_rate": 9.38939042787676e-05, + "loss": 0.4725, + "step": 3970 + }, + { + "epoch": 0.4774774774774775, + "grad_norm": 0.2889381945133209, + "learning_rate": 9.3878835941896e-05, + "loss": 0.4532, + "step": 3975 + }, + { + "epoch": 0.47807807807807806, + "grad_norm": 0.33516576886177063, + "learning_rate": 9.386375024764358e-05, + "loss": 0.4689, + "step": 3980 + }, + { + "epoch": 0.4786786786786787, + "grad_norm": 0.2570089101791382, + "learning_rate": 9.38486472019779e-05, + "loss": 0.4283, + "step": 3985 + }, + { + "epoch": 0.47927927927927927, + "grad_norm": 0.2153373807668686, + "learning_rate": 9.383352681087333e-05, + "loss": 0.4194, + "step": 3990 + }, + { + "epoch": 0.4798798798798799, + "grad_norm": 0.26292797923088074, + "learning_rate": 9.381838908031116e-05, + "loss": 0.4445, + "step": 3995 + }, + { + "epoch": 0.4804804804804805, + "grad_norm": 0.26263949275016785, + "learning_rate": 9.380323401627944e-05, + "loss": 0.4583, + "step": 4000 + }, + { + "epoch": 0.4804804804804805, + "eval_loss": 0.4252912998199463, + "eval_runtime": 35.577, + "eval_samples_per_second": 22.486, + "eval_steps_per_second": 5.622, + "step": 4000 + }, + { + "epoch": 0.4810810810810811, + 
"grad_norm": 0.294264018535614, + "learning_rate": 9.378806162477319e-05, + "loss": 0.5011, + "step": 4005 + }, + { + "epoch": 0.4816816816816817, + "grad_norm": 0.25689366459846497, + "learning_rate": 9.37728719117942e-05, + "loss": 0.4251, + "step": 4010 + }, + { + "epoch": 0.48228228228228226, + "grad_norm": 0.26754069328308105, + "learning_rate": 9.375766488335117e-05, + "loss": 0.4507, + "step": 4015 + }, + { + "epoch": 0.4828828828828829, + "grad_norm": 0.23651185631752014, + "learning_rate": 9.37424405454596e-05, + "loss": 0.3836, + "step": 4020 + }, + { + "epoch": 0.48348348348348347, + "grad_norm": 0.2558950185775757, + "learning_rate": 9.372719890414187e-05, + "loss": 0.4634, + "step": 4025 + }, + { + "epoch": 0.4840840840840841, + "grad_norm": 0.24600175023078918, + "learning_rate": 9.371193996542721e-05, + "loss": 0.4459, + "step": 4030 + }, + { + "epoch": 0.4846846846846847, + "grad_norm": 0.2958679795265198, + "learning_rate": 9.369666373535169e-05, + "loss": 0.4254, + "step": 4035 + }, + { + "epoch": 0.4852852852852853, + "grad_norm": 0.23160912096500397, + "learning_rate": 9.368137021995815e-05, + "loss": 0.4714, + "step": 4040 + }, + { + "epoch": 0.4858858858858859, + "grad_norm": 0.2944106459617615, + "learning_rate": 9.366605942529637e-05, + "loss": 0.4583, + "step": 4045 + }, + { + "epoch": 0.4864864864864865, + "grad_norm": 0.32667234539985657, + "learning_rate": 9.36507313574229e-05, + "loss": 0.4697, + "step": 4050 + }, + { + "epoch": 0.4870870870870871, + "grad_norm": 0.2736036479473114, + "learning_rate": 9.363538602240119e-05, + "loss": 0.4414, + "step": 4055 + }, + { + "epoch": 0.48768768768768767, + "grad_norm": 0.27527615427970886, + "learning_rate": 9.36200234263014e-05, + "loss": 0.4284, + "step": 4060 + }, + { + "epoch": 0.4882882882882883, + "grad_norm": 0.3154074251651764, + "learning_rate": 9.360464357520067e-05, + "loss": 0.5026, + "step": 4065 + }, + { + "epoch": 0.4888888888888889, + "grad_norm": 0.24428270757198334, + "learning_rate": 9.358924647518282e-05, + "loss": 0.404, + "step": 4070 + }, + { + "epoch": 0.4894894894894895, + "grad_norm": 0.31537002325057983, + "learning_rate": 9.357383213233861e-05, + "loss": 0.4414, + "step": 4075 + }, + { + "epoch": 0.4900900900900901, + "grad_norm": 0.27242958545684814, + "learning_rate": 9.355840055276556e-05, + "loss": 0.459, + "step": 4080 + }, + { + "epoch": 0.4906906906906907, + "grad_norm": 0.26572486758232117, + "learning_rate": 9.354295174256801e-05, + "loss": 0.4567, + "step": 4085 + }, + { + "epoch": 0.4912912912912913, + "grad_norm": 0.27613312005996704, + "learning_rate": 9.352748570785713e-05, + "loss": 0.4078, + "step": 4090 + }, + { + "epoch": 0.4918918918918919, + "grad_norm": 0.28778305649757385, + "learning_rate": 9.351200245475089e-05, + "loss": 0.4763, + "step": 4095 + }, + { + "epoch": 0.4924924924924925, + "grad_norm": 0.2761375606060028, + "learning_rate": 9.349650198937411e-05, + "loss": 0.3981, + "step": 4100 + }, + { + "epoch": 0.49309309309309307, + "grad_norm": 0.2764434218406677, + "learning_rate": 9.348098431785837e-05, + "loss": 0.386, + "step": 4105 + }, + { + "epoch": 0.4936936936936937, + "grad_norm": 0.30036839842796326, + "learning_rate": 9.34654494463421e-05, + "loss": 0.4823, + "step": 4110 + }, + { + "epoch": 0.4942942942942943, + "grad_norm": 0.30533745884895325, + "learning_rate": 9.344989738097047e-05, + "loss": 0.4344, + "step": 4115 + }, + { + "epoch": 0.4948948948948949, + "grad_norm": 0.36635449528694153, + "learning_rate": 9.343432812789551e-05, + "loss": 0.4805, 
+ "step": 4120 + }, + { + "epoch": 0.4954954954954955, + "grad_norm": 0.33037155866622925, + "learning_rate": 9.341874169327604e-05, + "loss": 0.4222, + "step": 4125 + }, + { + "epoch": 0.4960960960960961, + "grad_norm": 0.31396031379699707, + "learning_rate": 9.340313808327768e-05, + "loss": 0.4369, + "step": 4130 + }, + { + "epoch": 0.4966966966966967, + "grad_norm": 0.32045847177505493, + "learning_rate": 9.338751730407278e-05, + "loss": 0.4751, + "step": 4135 + }, + { + "epoch": 0.4972972972972973, + "grad_norm": 0.280370831489563, + "learning_rate": 9.33718793618406e-05, + "loss": 0.4844, + "step": 4140 + }, + { + "epoch": 0.4978978978978979, + "grad_norm": 0.26197347044944763, + "learning_rate": 9.335622426276707e-05, + "loss": 0.4052, + "step": 4145 + }, + { + "epoch": 0.4984984984984985, + "grad_norm": 0.24671530723571777, + "learning_rate": 9.334055201304499e-05, + "loss": 0.4077, + "step": 4150 + }, + { + "epoch": 0.4990990990990991, + "grad_norm": 0.3186667263507843, + "learning_rate": 9.332486261887388e-05, + "loss": 0.4672, + "step": 4155 + }, + { + "epoch": 0.4996996996996997, + "grad_norm": 0.3468276262283325, + "learning_rate": 9.330915608646012e-05, + "loss": 0.4303, + "step": 4160 + }, + { + "epoch": 0.5003003003003003, + "grad_norm": 0.3199736773967743, + "learning_rate": 9.32934324220168e-05, + "loss": 0.4752, + "step": 4165 + }, + { + "epoch": 0.5009009009009009, + "grad_norm": 0.26355046033859253, + "learning_rate": 9.32776916317638e-05, + "loss": 0.4574, + "step": 4170 + }, + { + "epoch": 0.5015015015015015, + "grad_norm": 0.2576383054256439, + "learning_rate": 9.326193372192783e-05, + "loss": 0.4284, + "step": 4175 + }, + { + "epoch": 0.5021021021021022, + "grad_norm": 0.29025712609291077, + "learning_rate": 9.324615869874229e-05, + "loss": 0.3747, + "step": 4180 + }, + { + "epoch": 0.5027027027027027, + "grad_norm": 0.2769593894481659, + "learning_rate": 9.323036656844739e-05, + "loss": 0.4554, + "step": 4185 + }, + { + "epoch": 0.5033033033033033, + "grad_norm": 0.38268837332725525, + "learning_rate": 9.321455733729014e-05, + "loss": 0.4523, + "step": 4190 + }, + { + "epoch": 0.5039039039039039, + "grad_norm": 0.31293413043022156, + "learning_rate": 9.319873101152423e-05, + "loss": 0.44, + "step": 4195 + }, + { + "epoch": 0.5045045045045045, + "grad_norm": 0.34711217880249023, + "learning_rate": 9.31828875974102e-05, + "loss": 0.396, + "step": 4200 + }, + { + "epoch": 0.5051051051051051, + "grad_norm": 0.28682348132133484, + "learning_rate": 9.31670271012153e-05, + "loss": 0.4043, + "step": 4205 + }, + { + "epoch": 0.5057057057057057, + "grad_norm": 0.30304649472236633, + "learning_rate": 9.315114952921356e-05, + "loss": 0.4867, + "step": 4210 + }, + { + "epoch": 0.5063063063063064, + "grad_norm": 0.3410811126232147, + "learning_rate": 9.313525488768573e-05, + "loss": 0.4326, + "step": 4215 + }, + { + "epoch": 0.5069069069069069, + "grad_norm": 0.26315030455589294, + "learning_rate": 9.311934318291937e-05, + "loss": 0.4231, + "step": 4220 + }, + { + "epoch": 0.5075075075075075, + "grad_norm": 0.3111753761768341, + "learning_rate": 9.310341442120871e-05, + "loss": 0.4761, + "step": 4225 + }, + { + "epoch": 0.5081081081081081, + "grad_norm": 0.273722380399704, + "learning_rate": 9.308746860885482e-05, + "loss": 0.4434, + "step": 4230 + }, + { + "epoch": 0.5087087087087087, + "grad_norm": 0.3247709274291992, + "learning_rate": 9.307150575216545e-05, + "loss": 0.414, + "step": 4235 + }, + { + "epoch": 0.5093093093093093, + "grad_norm": 0.33878350257873535, + 
"learning_rate": 9.305552585745511e-05, + "loss": 0.4901, + "step": 4240 + }, + { + "epoch": 0.5099099099099099, + "grad_norm": 0.3130566477775574, + "learning_rate": 9.303952893104504e-05, + "loss": 0.4206, + "step": 4245 + }, + { + "epoch": 0.5105105105105106, + "grad_norm": 0.26403722167015076, + "learning_rate": 9.302351497926325e-05, + "loss": 0.3785, + "step": 4250 + }, + { + "epoch": 0.5105105105105106, + "eval_loss": 0.4109116792678833, + "eval_runtime": 35.6573, + "eval_samples_per_second": 22.436, + "eval_steps_per_second": 5.609, + "step": 4250 + }, + { + "epoch": 0.5111111111111111, + "grad_norm": 0.27747008204460144, + "learning_rate": 9.300748400844446e-05, + "loss": 0.4567, + "step": 4255 + }, + { + "epoch": 0.5117117117117117, + "grad_norm": 0.35479068756103516, + "learning_rate": 9.29914360249301e-05, + "loss": 0.4504, + "step": 4260 + }, + { + "epoch": 0.5123123123123123, + "grad_norm": 0.2651272118091583, + "learning_rate": 9.297537103506838e-05, + "loss": 0.4214, + "step": 4265 + }, + { + "epoch": 0.512912912912913, + "grad_norm": 0.265099436044693, + "learning_rate": 9.29592890452142e-05, + "loss": 0.4077, + "step": 4270 + }, + { + "epoch": 0.5135135135135135, + "grad_norm": 0.31767547130584717, + "learning_rate": 9.294319006172921e-05, + "loss": 0.4426, + "step": 4275 + }, + { + "epoch": 0.5141141141141141, + "grad_norm": 0.31464385986328125, + "learning_rate": 9.292707409098174e-05, + "loss": 0.4308, + "step": 4280 + }, + { + "epoch": 0.5147147147147147, + "grad_norm": 0.2928173840045929, + "learning_rate": 9.291094113934689e-05, + "loss": 0.4406, + "step": 4285 + }, + { + "epoch": 0.5153153153153153, + "grad_norm": 0.2902061939239502, + "learning_rate": 9.289479121320648e-05, + "loss": 0.4676, + "step": 4290 + }, + { + "epoch": 0.5159159159159159, + "grad_norm": 0.26935285329818726, + "learning_rate": 9.287862431894897e-05, + "loss": 0.4455, + "step": 4295 + }, + { + "epoch": 0.5165165165165165, + "grad_norm": 0.24586215615272522, + "learning_rate": 9.286244046296961e-05, + "loss": 0.4804, + "step": 4300 + }, + { + "epoch": 0.5171171171171172, + "grad_norm": 0.2868068814277649, + "learning_rate": 9.284623965167035e-05, + "loss": 0.4312, + "step": 4305 + }, + { + "epoch": 0.5177177177177177, + "grad_norm": 0.3604901134967804, + "learning_rate": 9.28300218914598e-05, + "loss": 0.4556, + "step": 4310 + }, + { + "epoch": 0.5183183183183183, + "grad_norm": 0.3339654207229614, + "learning_rate": 9.281378718875332e-05, + "loss": 0.441, + "step": 4315 + }, + { + "epoch": 0.518918918918919, + "grad_norm": 0.2429099678993225, + "learning_rate": 9.279753554997295e-05, + "loss": 0.4353, + "step": 4320 + }, + { + "epoch": 0.5195195195195195, + "grad_norm": 0.3159288167953491, + "learning_rate": 9.278126698154743e-05, + "loss": 0.4532, + "step": 4325 + }, + { + "epoch": 0.5201201201201201, + "grad_norm": 0.30051764845848083, + "learning_rate": 9.276498148991222e-05, + "loss": 0.4082, + "step": 4330 + }, + { + "epoch": 0.5207207207207207, + "grad_norm": 0.34233060479164124, + "learning_rate": 9.274867908150944e-05, + "loss": 0.4422, + "step": 4335 + }, + { + "epoch": 0.5213213213213214, + "grad_norm": 0.33868885040283203, + "learning_rate": 9.273235976278794e-05, + "loss": 0.4399, + "step": 4340 + }, + { + "epoch": 0.5219219219219219, + "grad_norm": 0.26910141110420227, + "learning_rate": 9.27160235402032e-05, + "loss": 0.4676, + "step": 4345 + }, + { + "epoch": 0.5225225225225225, + "grad_norm": 0.2950465977191925, + "learning_rate": 9.269967042021747e-05, + "loss": 0.437, + 
"step": 4350 + }, + { + "epoch": 0.5231231231231231, + "grad_norm": 0.289869099855423, + "learning_rate": 9.268330040929962e-05, + "loss": 0.4739, + "step": 4355 + }, + { + "epoch": 0.5237237237237238, + "grad_norm": 0.3041685223579407, + "learning_rate": 9.26669135139252e-05, + "loss": 0.4482, + "step": 4360 + }, + { + "epoch": 0.5243243243243243, + "grad_norm": 0.3370480239391327, + "learning_rate": 9.265050974057649e-05, + "loss": 0.4397, + "step": 4365 + }, + { + "epoch": 0.5249249249249249, + "grad_norm": 0.2817637324333191, + "learning_rate": 9.26340890957424e-05, + "loss": 0.3884, + "step": 4370 + }, + { + "epoch": 0.5255255255255256, + "grad_norm": 0.29471760988235474, + "learning_rate": 9.261765158591855e-05, + "loss": 0.41, + "step": 4375 + }, + { + "epoch": 0.5261261261261261, + "grad_norm": 0.3439834415912628, + "learning_rate": 9.260119721760721e-05, + "loss": 0.4138, + "step": 4380 + }, + { + "epoch": 0.5267267267267267, + "grad_norm": 0.2582058906555176, + "learning_rate": 9.258472599731728e-05, + "loss": 0.3942, + "step": 4385 + }, + { + "epoch": 0.5273273273273273, + "grad_norm": 0.28308677673339844, + "learning_rate": 9.256823793156441e-05, + "loss": 0.4389, + "step": 4390 + }, + { + "epoch": 0.527927927927928, + "grad_norm": 0.3756570518016815, + "learning_rate": 9.255173302687085e-05, + "loss": 0.4692, + "step": 4395 + }, + { + "epoch": 0.5285285285285285, + "grad_norm": 0.25984251499176025, + "learning_rate": 9.253521128976554e-05, + "loss": 0.4154, + "step": 4400 + }, + { + "epoch": 0.5291291291291291, + "grad_norm": 0.28260019421577454, + "learning_rate": 9.251867272678408e-05, + "loss": 0.4026, + "step": 4405 + }, + { + "epoch": 0.5297297297297298, + "grad_norm": 0.2700667381286621, + "learning_rate": 9.25021173444687e-05, + "loss": 0.4194, + "step": 4410 + }, + { + "epoch": 0.5303303303303303, + "grad_norm": 0.27166318893432617, + "learning_rate": 9.24855451493683e-05, + "loss": 0.4413, + "step": 4415 + }, + { + "epoch": 0.5309309309309309, + "grad_norm": 0.34264490008354187, + "learning_rate": 9.246895614803843e-05, + "loss": 0.4665, + "step": 4420 + }, + { + "epoch": 0.5315315315315315, + "grad_norm": 0.296682208776474, + "learning_rate": 9.24523503470413e-05, + "loss": 0.4124, + "step": 4425 + }, + { + "epoch": 0.5321321321321322, + "grad_norm": 0.29106104373931885, + "learning_rate": 9.243572775294573e-05, + "loss": 0.4239, + "step": 4430 + }, + { + "epoch": 0.5327327327327327, + "grad_norm": 0.3212127983570099, + "learning_rate": 9.241908837232722e-05, + "loss": 0.446, + "step": 4435 + }, + { + "epoch": 0.5333333333333333, + "grad_norm": 0.34617817401885986, + "learning_rate": 9.240243221176791e-05, + "loss": 0.4625, + "step": 4440 + }, + { + "epoch": 0.533933933933934, + "grad_norm": 0.2617619037628174, + "learning_rate": 9.238575927785655e-05, + "loss": 0.4127, + "step": 4445 + }, + { + "epoch": 0.5345345345345346, + "grad_norm": 0.3500515818595886, + "learning_rate": 9.236906957718854e-05, + "loss": 0.4488, + "step": 4450 + }, + { + "epoch": 0.5351351351351351, + "grad_norm": 0.2569849491119385, + "learning_rate": 9.235236311636593e-05, + "loss": 0.4422, + "step": 4455 + }, + { + "epoch": 0.5357357357357357, + "grad_norm": 0.3044881522655487, + "learning_rate": 9.233563990199735e-05, + "loss": 0.4279, + "step": 4460 + }, + { + "epoch": 0.5363363363363364, + "grad_norm": 0.35698455572128296, + "learning_rate": 9.231889994069811e-05, + "loss": 0.4296, + "step": 4465 + }, + { + "epoch": 0.5369369369369369, + "grad_norm": 0.3179773986339569, + "learning_rate": 
9.230214323909012e-05, + "loss": 0.3862, + "step": 4470 + }, + { + "epoch": 0.5375375375375375, + "grad_norm": 0.3401271104812622, + "learning_rate": 9.228536980380191e-05, + "loss": 0.4274, + "step": 4475 + }, + { + "epoch": 0.5381381381381382, + "grad_norm": 0.36162683367729187, + "learning_rate": 9.226857964146866e-05, + "loss": 0.3976, + "step": 4480 + }, + { + "epoch": 0.5387387387387388, + "grad_norm": 0.322964608669281, + "learning_rate": 9.225177275873211e-05, + "loss": 0.4376, + "step": 4485 + }, + { + "epoch": 0.5393393393393393, + "grad_norm": 0.33346307277679443, + "learning_rate": 9.223494916224066e-05, + "loss": 0.4482, + "step": 4490 + }, + { + "epoch": 0.5399399399399399, + "grad_norm": 0.2842477262020111, + "learning_rate": 9.221810885864933e-05, + "loss": 0.3776, + "step": 4495 + }, + { + "epoch": 0.5405405405405406, + "grad_norm": 0.3431280851364136, + "learning_rate": 9.220125185461967e-05, + "loss": 0.4224, + "step": 4500 + }, + { + "epoch": 0.5405405405405406, + "eval_loss": 0.39646750688552856, + "eval_runtime": 35.5976, + "eval_samples_per_second": 22.473, + "eval_steps_per_second": 5.618, + "step": 4500 + }, + { + "epoch": 0.5411411411411411, + "grad_norm": 0.3617427945137024, + "learning_rate": 9.218437815681996e-05, + "loss": 0.4425, + "step": 4505 + }, + { + "epoch": 0.5417417417417417, + "grad_norm": 0.31604188680648804, + "learning_rate": 9.216748777192498e-05, + "loss": 0.3861, + "step": 4510 + }, + { + "epoch": 0.5423423423423424, + "grad_norm": 0.270926296710968, + "learning_rate": 9.215058070661615e-05, + "loss": 0.4568, + "step": 4515 + }, + { + "epoch": 0.542942942942943, + "grad_norm": 0.33962738513946533, + "learning_rate": 9.21336569675815e-05, + "loss": 0.3911, + "step": 4520 + }, + { + "epoch": 0.5435435435435435, + "grad_norm": 0.3347056806087494, + "learning_rate": 9.211671656151563e-05, + "loss": 0.4243, + "step": 4525 + }, + { + "epoch": 0.5441441441441441, + "grad_norm": 0.31746459007263184, + "learning_rate": 9.209975949511974e-05, + "loss": 0.4082, + "step": 4530 + }, + { + "epoch": 0.5447447447447448, + "grad_norm": 0.36672934889793396, + "learning_rate": 9.208278577510163e-05, + "loss": 0.409, + "step": 4535 + }, + { + "epoch": 0.5453453453453454, + "grad_norm": 0.3124452531337738, + "learning_rate": 9.20657954081757e-05, + "loss": 0.3757, + "step": 4540 + }, + { + "epoch": 0.5459459459459459, + "grad_norm": 0.3697468340396881, + "learning_rate": 9.20487884010629e-05, + "loss": 0.3845, + "step": 4545 + }, + { + "epoch": 0.5465465465465466, + "grad_norm": 0.3563442826271057, + "learning_rate": 9.203176476049079e-05, + "loss": 0.4451, + "step": 4550 + }, + { + "epoch": 0.5471471471471472, + "grad_norm": 0.22961801290512085, + "learning_rate": 9.20147244931935e-05, + "loss": 0.4466, + "step": 4555 + }, + { + "epoch": 0.5477477477477477, + "grad_norm": 0.30496376752853394, + "learning_rate": 9.199766760591174e-05, + "loss": 0.3924, + "step": 4560 + }, + { + "epoch": 0.5483483483483483, + "grad_norm": 0.3423655331134796, + "learning_rate": 9.198059410539275e-05, + "loss": 0.4239, + "step": 4565 + }, + { + "epoch": 0.548948948948949, + "grad_norm": 0.3447710871696472, + "learning_rate": 9.196350399839044e-05, + "loss": 0.4254, + "step": 4570 + }, + { + "epoch": 0.5495495495495496, + "grad_norm": 0.3405103385448456, + "learning_rate": 9.194639729166523e-05, + "loss": 0.4787, + "step": 4575 + }, + { + "epoch": 0.5501501501501501, + "grad_norm": 0.2896510362625122, + "learning_rate": 9.192927399198408e-05, + "loss": 0.3943, + "step": 4580 + }, + { + 
"epoch": 0.5507507507507508, + "grad_norm": 0.32498177886009216, + "learning_rate": 9.191213410612056e-05, + "loss": 0.4402, + "step": 4585 + }, + { + "epoch": 0.5513513513513514, + "grad_norm": 0.2963517904281616, + "learning_rate": 9.189497764085477e-05, + "loss": 0.4015, + "step": 4590 + }, + { + "epoch": 0.5519519519519519, + "grad_norm": 0.3304550349712372, + "learning_rate": 9.187780460297341e-05, + "loss": 0.3991, + "step": 4595 + }, + { + "epoch": 0.5525525525525525, + "grad_norm": 0.25571829080581665, + "learning_rate": 9.186061499926968e-05, + "loss": 0.3949, + "step": 4600 + }, + { + "epoch": 0.5531531531531532, + "grad_norm": 0.30609068274497986, + "learning_rate": 9.184340883654339e-05, + "loss": 0.3866, + "step": 4605 + }, + { + "epoch": 0.5537537537537538, + "grad_norm": 0.2816171646118164, + "learning_rate": 9.182618612160084e-05, + "loss": 0.4492, + "step": 4610 + }, + { + "epoch": 0.5543543543543543, + "grad_norm": 0.29632723331451416, + "learning_rate": 9.180894686125492e-05, + "loss": 0.4284, + "step": 4615 + }, + { + "epoch": 0.554954954954955, + "grad_norm": 0.3231966495513916, + "learning_rate": 9.179169106232507e-05, + "loss": 0.3891, + "step": 4620 + }, + { + "epoch": 0.5555555555555556, + "grad_norm": 0.2741132974624634, + "learning_rate": 9.177441873163723e-05, + "loss": 0.3458, + "step": 4625 + }, + { + "epoch": 0.5561561561561561, + "grad_norm": 0.25370296835899353, + "learning_rate": 9.175712987602395e-05, + "loss": 0.4313, + "step": 4630 + }, + { + "epoch": 0.5567567567567567, + "grad_norm": 0.36991292238235474, + "learning_rate": 9.173982450232424e-05, + "loss": 0.381, + "step": 4635 + }, + { + "epoch": 0.5573573573573574, + "grad_norm": 0.33524763584136963, + "learning_rate": 9.172250261738367e-05, + "loss": 0.411, + "step": 4640 + }, + { + "epoch": 0.557957957957958, + "grad_norm": 0.29027846455574036, + "learning_rate": 9.170516422805435e-05, + "loss": 0.4039, + "step": 4645 + }, + { + "epoch": 0.5585585585585585, + "grad_norm": 0.284198522567749, + "learning_rate": 9.168780934119494e-05, + "loss": 0.4276, + "step": 4650 + }, + { + "epoch": 0.5591591591591591, + "grad_norm": 0.3591119349002838, + "learning_rate": 9.167043796367061e-05, + "loss": 0.4115, + "step": 4655 + }, + { + "epoch": 0.5597597597597598, + "grad_norm": 0.35682356357574463, + "learning_rate": 9.165305010235301e-05, + "loss": 0.4331, + "step": 4660 + }, + { + "epoch": 0.5603603603603604, + "grad_norm": 0.4038154184818268, + "learning_rate": 9.163564576412037e-05, + "loss": 0.4335, + "step": 4665 + }, + { + "epoch": 0.5609609609609609, + "grad_norm": 0.2744685709476471, + "learning_rate": 9.161822495585741e-05, + "loss": 0.3959, + "step": 4670 + }, + { + "epoch": 0.5615615615615616, + "grad_norm": 0.36517074704170227, + "learning_rate": 9.160078768445537e-05, + "loss": 0.4722, + "step": 4675 + }, + { + "epoch": 0.5621621621621622, + "grad_norm": 0.2964160740375519, + "learning_rate": 9.158333395681203e-05, + "loss": 0.4154, + "step": 4680 + }, + { + "epoch": 0.5627627627627627, + "grad_norm": 0.3045138120651245, + "learning_rate": 9.156586377983158e-05, + "loss": 0.4513, + "step": 4685 + }, + { + "epoch": 0.5633633633633633, + "grad_norm": 0.2858031690120697, + "learning_rate": 9.154837716042487e-05, + "loss": 0.399, + "step": 4690 + }, + { + "epoch": 0.563963963963964, + "grad_norm": 0.3186417818069458, + "learning_rate": 9.153087410550914e-05, + "loss": 0.3787, + "step": 4695 + }, + { + "epoch": 0.5645645645645646, + "grad_norm": 0.401591956615448, + "learning_rate": 
9.151335462200814e-05, + "loss": 0.4507, + "step": 4700 + }, + { + "epoch": 0.5651651651651651, + "grad_norm": 0.27890413999557495, + "learning_rate": 9.149581871685218e-05, + "loss": 0.374, + "step": 4705 + }, + { + "epoch": 0.5657657657657658, + "grad_norm": 0.34605100750923157, + "learning_rate": 9.147826639697803e-05, + "loss": 0.4303, + "step": 4710 + }, + { + "epoch": 0.5663663663663664, + "grad_norm": 0.2923938035964966, + "learning_rate": 9.146069766932893e-05, + "loss": 0.3849, + "step": 4715 + }, + { + "epoch": 0.5669669669669669, + "grad_norm": 0.32079771161079407, + "learning_rate": 9.144311254085464e-05, + "loss": 0.4424, + "step": 4720 + }, + { + "epoch": 0.5675675675675675, + "grad_norm": 0.3526853024959564, + "learning_rate": 9.142551101851143e-05, + "loss": 0.3951, + "step": 4725 + }, + { + "epoch": 0.5681681681681682, + "grad_norm": 0.3102889955043793, + "learning_rate": 9.140789310926199e-05, + "loss": 0.3816, + "step": 4730 + }, + { + "epoch": 0.5687687687687688, + "grad_norm": 0.348457932472229, + "learning_rate": 9.139025882007554e-05, + "loss": 0.3873, + "step": 4735 + }, + { + "epoch": 0.5693693693693693, + "grad_norm": 0.34294039011001587, + "learning_rate": 9.137260815792776e-05, + "loss": 0.4115, + "step": 4740 + }, + { + "epoch": 0.56996996996997, + "grad_norm": 0.33280232548713684, + "learning_rate": 9.135494112980083e-05, + "loss": 0.3652, + "step": 4745 + }, + { + "epoch": 0.5705705705705706, + "grad_norm": 0.39281317591667175, + "learning_rate": 9.133725774268338e-05, + "loss": 0.3946, + "step": 4750 + }, + { + "epoch": 0.5705705705705706, + "eval_loss": 0.38407135009765625, + "eval_runtime": 35.5507, + "eval_samples_per_second": 22.503, + "eval_steps_per_second": 5.626, + "step": 4750 + }, + { + "epoch": 0.5711711711711712, + "grad_norm": 0.29195722937583923, + "learning_rate": 9.131955800357053e-05, + "loss": 0.442, + "step": 4755 + }, + { + "epoch": 0.5717717717717717, + "grad_norm": 0.3438752293586731, + "learning_rate": 9.130184191946385e-05, + "loss": 0.3713, + "step": 4760 + }, + { + "epoch": 0.5723723723723724, + "grad_norm": 0.33953338861465454, + "learning_rate": 9.128410949737138e-05, + "loss": 0.4078, + "step": 4765 + }, + { + "epoch": 0.572972972972973, + "grad_norm": 0.36786186695098877, + "learning_rate": 9.126636074430764e-05, + "loss": 0.4506, + "step": 4770 + }, + { + "epoch": 0.5735735735735735, + "grad_norm": 0.33084627985954285, + "learning_rate": 9.124859566729358e-05, + "loss": 0.457, + "step": 4775 + }, + { + "epoch": 0.5741741741741742, + "grad_norm": 0.34048357605934143, + "learning_rate": 9.123081427335665e-05, + "loss": 0.4624, + "step": 4780 + }, + { + "epoch": 0.5747747747747748, + "grad_norm": 0.3166431486606598, + "learning_rate": 9.12130165695307e-05, + "loss": 0.4316, + "step": 4785 + }, + { + "epoch": 0.5753753753753754, + "grad_norm": 0.35750752687454224, + "learning_rate": 9.119520256285608e-05, + "loss": 0.4237, + "step": 4790 + }, + { + "epoch": 0.5759759759759759, + "grad_norm": 0.3233676254749298, + "learning_rate": 9.117737226037956e-05, + "loss": 0.4279, + "step": 4795 + }, + { + "epoch": 0.5765765765765766, + "grad_norm": 0.30795225501060486, + "learning_rate": 9.115952566915436e-05, + "loss": 0.3651, + "step": 4800 + }, + { + "epoch": 0.5771771771771772, + "grad_norm": 0.3767455816268921, + "learning_rate": 9.114166279624017e-05, + "loss": 0.4354, + "step": 4805 + }, + { + "epoch": 0.5777777777777777, + "grad_norm": 0.2709740996360779, + "learning_rate": 9.112378364870309e-05, + "loss": 0.396, + "step": 4810 + }, 
+ { + "epoch": 0.5783783783783784, + "grad_norm": 0.3398534953594208, + "learning_rate": 9.110588823361566e-05, + "loss": 0.4046, + "step": 4815 + }, + { + "epoch": 0.578978978978979, + "grad_norm": 0.3292076289653778, + "learning_rate": 9.108797655805689e-05, + "loss": 0.3966, + "step": 4820 + }, + { + "epoch": 0.5795795795795796, + "grad_norm": 0.3727780878543854, + "learning_rate": 9.107004862911216e-05, + "loss": 0.4112, + "step": 4825 + }, + { + "epoch": 0.5801801801801801, + "grad_norm": 0.3289770781993866, + "learning_rate": 9.105210445387333e-05, + "loss": 0.4083, + "step": 4830 + }, + { + "epoch": 0.5807807807807808, + "grad_norm": 0.39395871758461, + "learning_rate": 9.103414403943868e-05, + "loss": 0.3572, + "step": 4835 + }, + { + "epoch": 0.5813813813813814, + "grad_norm": 0.3781627416610718, + "learning_rate": 9.101616739291288e-05, + "loss": 0.4168, + "step": 4840 + }, + { + "epoch": 0.581981981981982, + "grad_norm": 0.3350948393344879, + "learning_rate": 9.099817452140709e-05, + "loss": 0.4191, + "step": 4845 + }, + { + "epoch": 0.5825825825825826, + "grad_norm": 0.31275612115859985, + "learning_rate": 9.09801654320388e-05, + "loss": 0.4307, + "step": 4850 + }, + { + "epoch": 0.5831831831831832, + "grad_norm": 0.3432002067565918, + "learning_rate": 9.096214013193198e-05, + "loss": 0.4171, + "step": 4855 + }, + { + "epoch": 0.5837837837837838, + "grad_norm": 0.30382242798805237, + "learning_rate": 9.094409862821698e-05, + "loss": 0.4029, + "step": 4860 + }, + { + "epoch": 0.5843843843843843, + "grad_norm": 0.35811981558799744, + "learning_rate": 9.092604092803058e-05, + "loss": 0.4422, + "step": 4865 + }, + { + "epoch": 0.584984984984985, + "grad_norm": 0.2972864806652069, + "learning_rate": 9.090796703851598e-05, + "loss": 0.3555, + "step": 4870 + }, + { + "epoch": 0.5855855855855856, + "grad_norm": 0.3713989555835724, + "learning_rate": 9.088987696682275e-05, + "loss": 0.3629, + "step": 4875 + }, + { + "epoch": 0.5861861861861862, + "grad_norm": 0.3291909992694855, + "learning_rate": 9.087177072010684e-05, + "loss": 0.395, + "step": 4880 + }, + { + "epoch": 0.5867867867867868, + "grad_norm": 0.3542977571487427, + "learning_rate": 9.085364830553067e-05, + "loss": 0.4634, + "step": 4885 + }, + { + "epoch": 0.5873873873873874, + "grad_norm": 0.38173505663871765, + "learning_rate": 9.083550973026302e-05, + "loss": 0.3931, + "step": 4890 + }, + { + "epoch": 0.587987987987988, + "grad_norm": 0.29966017603874207, + "learning_rate": 9.081735500147904e-05, + "loss": 0.3576, + "step": 4895 + }, + { + "epoch": 0.5885885885885885, + "grad_norm": 0.3684738576412201, + "learning_rate": 9.07991841263603e-05, + "loss": 0.4445, + "step": 4900 + }, + { + "epoch": 0.5891891891891892, + "grad_norm": 0.2916768193244934, + "learning_rate": 9.078099711209475e-05, + "loss": 0.3801, + "step": 4905 + }, + { + "epoch": 0.5897897897897898, + "grad_norm": 0.36422377824783325, + "learning_rate": 9.076279396587672e-05, + "loss": 0.4423, + "step": 4910 + }, + { + "epoch": 0.5903903903903904, + "grad_norm": 0.33675840497016907, + "learning_rate": 9.074457469490694e-05, + "loss": 0.3983, + "step": 4915 + }, + { + "epoch": 0.590990990990991, + "grad_norm": 0.396847665309906, + "learning_rate": 9.072633930639248e-05, + "loss": 0.4162, + "step": 4920 + }, + { + "epoch": 0.5915915915915916, + "grad_norm": 0.2722485661506653, + "learning_rate": 9.070808780754681e-05, + "loss": 0.3783, + "step": 4925 + }, + { + "epoch": 0.5921921921921922, + "grad_norm": 0.35289111733436584, + "learning_rate": 
9.068982020558978e-05, + "loss": 0.4248, + "step": 4930 + }, + { + "epoch": 0.5927927927927928, + "grad_norm": 0.29830169677734375, + "learning_rate": 9.06715365077476e-05, + "loss": 0.4108, + "step": 4935 + }, + { + "epoch": 0.5933933933933934, + "grad_norm": 0.2733657658100128, + "learning_rate": 9.065323672125286e-05, + "loss": 0.3841, + "step": 4940 + }, + { + "epoch": 0.593993993993994, + "grad_norm": 0.33087727427482605, + "learning_rate": 9.063492085334446e-05, + "loss": 0.3879, + "step": 4945 + }, + { + "epoch": 0.5945945945945946, + "grad_norm": 0.341102659702301, + "learning_rate": 9.061658891126776e-05, + "loss": 0.4004, + "step": 4950 + }, + { + "epoch": 0.5951951951951951, + "grad_norm": 0.3307470381259918, + "learning_rate": 9.059824090227438e-05, + "loss": 0.4093, + "step": 4955 + }, + { + "epoch": 0.5957957957957958, + "grad_norm": 0.2968612313270569, + "learning_rate": 9.057987683362234e-05, + "loss": 0.3832, + "step": 4960 + }, + { + "epoch": 0.5963963963963964, + "grad_norm": 0.34879839420318604, + "learning_rate": 9.056149671257606e-05, + "loss": 0.4216, + "step": 4965 + }, + { + "epoch": 0.596996996996997, + "grad_norm": 0.33287060260772705, + "learning_rate": 9.05431005464062e-05, + "loss": 0.4384, + "step": 4970 + }, + { + "epoch": 0.5975975975975976, + "grad_norm": 0.3686003088951111, + "learning_rate": 9.052468834238986e-05, + "loss": 0.36, + "step": 4975 + }, + { + "epoch": 0.5981981981981982, + "grad_norm": 0.3743337094783783, + "learning_rate": 9.050626010781043e-05, + "loss": 0.402, + "step": 4980 + }, + { + "epoch": 0.5987987987987988, + "grad_norm": 0.3147016167640686, + "learning_rate": 9.048781584995766e-05, + "loss": 0.4415, + "step": 4985 + }, + { + "epoch": 0.5993993993993993, + "grad_norm": 0.31453582644462585, + "learning_rate": 9.04693555761277e-05, + "loss": 0.4095, + "step": 4990 + }, + { + "epoch": 0.6, + "grad_norm": 0.3484346866607666, + "learning_rate": 9.04508792936229e-05, + "loss": 0.3784, + "step": 4995 + }, + { + "epoch": 0.6006006006006006, + "grad_norm": 0.3905284106731415, + "learning_rate": 9.043238700975209e-05, + "loss": 0.3904, + "step": 5000 + }, + { + "epoch": 0.6006006006006006, + "eval_loss": 0.37757378816604614, + "eval_runtime": 35.6434, + "eval_samples_per_second": 22.445, + "eval_steps_per_second": 5.611, + "step": 5000 + }, + { + "epoch": 0.6012012012012012, + "grad_norm": 0.3124229311943054, + "learning_rate": 9.041387873183029e-05, + "loss": 0.4109, + "step": 5005 + }, + { + "epoch": 0.6018018018018018, + "grad_norm": 0.28576046228408813, + "learning_rate": 9.039535446717898e-05, + "loss": 0.3467, + "step": 5010 + }, + { + "epoch": 0.6024024024024024, + "grad_norm": 0.3818845748901367, + "learning_rate": 9.037681422312586e-05, + "loss": 0.3995, + "step": 5015 + }, + { + "epoch": 0.603003003003003, + "grad_norm": 0.28922221064567566, + "learning_rate": 9.0358258007005e-05, + "loss": 0.3503, + "step": 5020 + }, + { + "epoch": 0.6036036036036037, + "grad_norm": 0.37059420347213745, + "learning_rate": 9.033968582615679e-05, + "loss": 0.4039, + "step": 5025 + }, + { + "epoch": 0.6042042042042042, + "grad_norm": 0.4044734239578247, + "learning_rate": 9.03210976879279e-05, + "loss": 0.4621, + "step": 5030 + }, + { + "epoch": 0.6048048048048048, + "grad_norm": 0.3339507579803467, + "learning_rate": 9.030249359967138e-05, + "loss": 0.4037, + "step": 5035 + }, + { + "epoch": 0.6054054054054054, + "grad_norm": 0.4029451608657837, + "learning_rate": 9.02838735687465e-05, + "loss": 0.4259, + "step": 5040 + }, + { + "epoch": 
0.606006006006006, + "grad_norm": 0.35424843430519104, + "learning_rate": 9.026523760251891e-05, + "loss": 0.3733, + "step": 5045 + }, + { + "epoch": 0.6066066066066066, + "grad_norm": 0.37154632806777954, + "learning_rate": 9.024658570836053e-05, + "loss": 0.4732, + "step": 5050 + }, + { + "epoch": 0.6072072072072072, + "grad_norm": 0.3752360939979553, + "learning_rate": 9.02279178936496e-05, + "loss": 0.4081, + "step": 5055 + }, + { + "epoch": 0.6078078078078079, + "grad_norm": 0.34165915846824646, + "learning_rate": 9.020923416577061e-05, + "loss": 0.413, + "step": 5060 + }, + { + "epoch": 0.6084084084084084, + "grad_norm": 0.3951336741447449, + "learning_rate": 9.019053453211441e-05, + "loss": 0.4257, + "step": 5065 + }, + { + "epoch": 0.609009009009009, + "grad_norm": 0.4182293117046356, + "learning_rate": 9.017181900007811e-05, + "loss": 0.4486, + "step": 5070 + }, + { + "epoch": 0.6096096096096096, + "grad_norm": 0.3688332736492157, + "learning_rate": 9.015308757706511e-05, + "loss": 0.3916, + "step": 5075 + }, + { + "epoch": 0.6102102102102102, + "grad_norm": 0.36908355355262756, + "learning_rate": 9.01343402704851e-05, + "loss": 0.394, + "step": 5080 + }, + { + "epoch": 0.6108108108108108, + "grad_norm": 0.39296767115592957, + "learning_rate": 9.011557708775402e-05, + "loss": 0.4116, + "step": 5085 + }, + { + "epoch": 0.6114114114114114, + "grad_norm": 0.314471036195755, + "learning_rate": 9.009679803629416e-05, + "loss": 0.4018, + "step": 5090 + }, + { + "epoch": 0.612012012012012, + "grad_norm": 0.3594004511833191, + "learning_rate": 9.007800312353402e-05, + "loss": 0.3745, + "step": 5095 + }, + { + "epoch": 0.6126126126126126, + "grad_norm": 0.3893055021762848, + "learning_rate": 9.005919235690842e-05, + "loss": 0.4367, + "step": 5100 + }, + { + "epoch": 0.6132132132132132, + "grad_norm": 0.41374874114990234, + "learning_rate": 9.004036574385844e-05, + "loss": 0.4236, + "step": 5105 + }, + { + "epoch": 0.6138138138138138, + "grad_norm": 0.33323878049850464, + "learning_rate": 9.00215232918314e-05, + "loss": 0.3806, + "step": 5110 + }, + { + "epoch": 0.6144144144144145, + "grad_norm": 0.3618185520172119, + "learning_rate": 9.000266500828091e-05, + "loss": 0.4042, + "step": 5115 + }, + { + "epoch": 0.615015015015015, + "grad_norm": 0.283945232629776, + "learning_rate": 8.998379090066687e-05, + "loss": 0.4158, + "step": 5120 + }, + { + "epoch": 0.6156156156156156, + "grad_norm": 0.34311872720718384, + "learning_rate": 8.996490097645536e-05, + "loss": 0.4343, + "step": 5125 + }, + { + "epoch": 0.6162162162162163, + "grad_norm": 0.31982770562171936, + "learning_rate": 8.99459952431188e-05, + "loss": 0.3358, + "step": 5130 + }, + { + "epoch": 0.6168168168168168, + "grad_norm": 0.3433046042919159, + "learning_rate": 8.992707370813581e-05, + "loss": 0.364, + "step": 5135 + }, + { + "epoch": 0.6174174174174174, + "grad_norm": 0.33817145228385925, + "learning_rate": 8.99081363789913e-05, + "loss": 0.4177, + "step": 5140 + }, + { + "epoch": 0.618018018018018, + "grad_norm": 0.31253111362457275, + "learning_rate": 8.988918326317641e-05, + "loss": 0.4195, + "step": 5145 + }, + { + "epoch": 0.6186186186186187, + "grad_norm": 0.3355768322944641, + "learning_rate": 8.98702143681885e-05, + "loss": 0.3551, + "step": 5150 + }, + { + "epoch": 0.6192192192192192, + "grad_norm": 0.4061967730522156, + "learning_rate": 8.985122970153121e-05, + "loss": 0.402, + "step": 5155 + }, + { + "epoch": 0.6198198198198198, + "grad_norm": 0.303901731967926, + "learning_rate": 8.983222927071442e-05, + "loss": 
0.3525, + "step": 5160 + }, + { + "epoch": 0.6204204204204204, + "grad_norm": 0.385477215051651, + "learning_rate": 8.98132130832542e-05, + "loss": 0.3721, + "step": 5165 + }, + { + "epoch": 0.621021021021021, + "grad_norm": 0.3199411630630493, + "learning_rate": 8.97941811466729e-05, + "loss": 0.4093, + "step": 5170 + }, + { + "epoch": 0.6216216216216216, + "grad_norm": 0.33952972292900085, + "learning_rate": 8.977513346849907e-05, + "loss": 0.3733, + "step": 5175 + }, + { + "epoch": 0.6222222222222222, + "grad_norm": 0.37760990858078003, + "learning_rate": 8.975607005626752e-05, + "loss": 0.4278, + "step": 5180 + }, + { + "epoch": 0.6228228228228229, + "grad_norm": 0.3244675397872925, + "learning_rate": 8.973699091751923e-05, + "loss": 0.3954, + "step": 5185 + }, + { + "epoch": 0.6234234234234234, + "grad_norm": 0.40450751781463623, + "learning_rate": 8.971789605980148e-05, + "loss": 0.4145, + "step": 5190 + }, + { + "epoch": 0.624024024024024, + "grad_norm": 0.35623157024383545, + "learning_rate": 8.96987854906677e-05, + "loss": 0.3455, + "step": 5195 + }, + { + "epoch": 0.6246246246246246, + "grad_norm": 0.3210749924182892, + "learning_rate": 8.967965921767755e-05, + "loss": 0.3772, + "step": 5200 + }, + { + "epoch": 0.6252252252252253, + "grad_norm": 0.2864453196525574, + "learning_rate": 8.966051724839691e-05, + "loss": 0.3695, + "step": 5205 + }, + { + "epoch": 0.6258258258258258, + "grad_norm": 0.38941943645477295, + "learning_rate": 8.96413595903979e-05, + "loss": 0.3886, + "step": 5210 + }, + { + "epoch": 0.6264264264264264, + "grad_norm": 0.34969213604927063, + "learning_rate": 8.962218625125875e-05, + "loss": 0.4243, + "step": 5215 + }, + { + "epoch": 0.6270270270270271, + "grad_norm": 0.3812369406223297, + "learning_rate": 8.960299723856404e-05, + "loss": 0.3804, + "step": 5220 + }, + { + "epoch": 0.6276276276276276, + "grad_norm": 0.32315537333488464, + "learning_rate": 8.958379255990441e-05, + "loss": 0.3731, + "step": 5225 + }, + { + "epoch": 0.6282282282282282, + "grad_norm": 0.42129507660865784, + "learning_rate": 8.956457222287677e-05, + "loss": 0.3576, + "step": 5230 + }, + { + "epoch": 0.6288288288288288, + "grad_norm": 0.38810428977012634, + "learning_rate": 8.95453362350842e-05, + "loss": 0.3623, + "step": 5235 + }, + { + "epoch": 0.6294294294294295, + "grad_norm": 0.38933542370796204, + "learning_rate": 8.952608460413603e-05, + "loss": 0.3985, + "step": 5240 + }, + { + "epoch": 0.63003003003003, + "grad_norm": 0.3767377436161041, + "learning_rate": 8.950681733764767e-05, + "loss": 0.4433, + "step": 5245 + }, + { + "epoch": 0.6306306306306306, + "grad_norm": 0.3073877692222595, + "learning_rate": 8.948753444324078e-05, + "loss": 0.3119, + "step": 5250 + }, + { + "epoch": 0.6306306306306306, + "eval_loss": 0.3674429953098297, + "eval_runtime": 35.5913, + "eval_samples_per_second": 22.477, + "eval_steps_per_second": 5.619, + "step": 5250 + }, + { + "epoch": 0.6312312312312313, + "grad_norm": 0.430916965007782, + "learning_rate": 8.946823592854323e-05, + "loss": 0.4167, + "step": 5255 + }, + { + "epoch": 0.6318318318318318, + "grad_norm": 0.3854207396507263, + "learning_rate": 8.944892180118901e-05, + "loss": 0.4014, + "step": 5260 + }, + { + "epoch": 0.6324324324324324, + "grad_norm": 0.40012556314468384, + "learning_rate": 8.942959206881832e-05, + "loss": 0.3998, + "step": 5265 + }, + { + "epoch": 0.633033033033033, + "grad_norm": 0.37446069717407227, + "learning_rate": 8.94102467390775e-05, + "loss": 0.4084, + "step": 5270 + }, + { + "epoch": 0.6336336336336337, + 
"grad_norm": 0.32861629128456116, + "learning_rate": 8.939088581961912e-05, + "loss": 0.4207, + "step": 5275 + }, + { + "epoch": 0.6342342342342342, + "grad_norm": 0.3228270709514618, + "learning_rate": 8.937150931810185e-05, + "loss": 0.3641, + "step": 5280 + }, + { + "epoch": 0.6348348348348348, + "grad_norm": 0.3372856676578522, + "learning_rate": 8.935211724219057e-05, + "loss": 0.4199, + "step": 5285 + }, + { + "epoch": 0.6354354354354355, + "grad_norm": 0.4280979335308075, + "learning_rate": 8.933270959955631e-05, + "loss": 0.4213, + "step": 5290 + }, + { + "epoch": 0.6360360360360361, + "grad_norm": 0.29096171259880066, + "learning_rate": 8.931328639787624e-05, + "loss": 0.381, + "step": 5295 + }, + { + "epoch": 0.6366366366366366, + "grad_norm": 0.3825353980064392, + "learning_rate": 8.929384764483369e-05, + "loss": 0.403, + "step": 5300 + }, + { + "epoch": 0.6372372372372372, + "grad_norm": 0.47639000415802, + "learning_rate": 8.927439334811817e-05, + "loss": 0.3837, + "step": 5305 + }, + { + "epoch": 0.6378378378378379, + "grad_norm": 0.4135724604129791, + "learning_rate": 8.92549235154253e-05, + "loss": 0.4371, + "step": 5310 + }, + { + "epoch": 0.6384384384384384, + "grad_norm": 0.3919542133808136, + "learning_rate": 8.923543815445688e-05, + "loss": 0.3985, + "step": 5315 + }, + { + "epoch": 0.639039039039039, + "grad_norm": 0.35299044847488403, + "learning_rate": 8.921593727292083e-05, + "loss": 0.4093, + "step": 5320 + }, + { + "epoch": 0.6396396396396397, + "grad_norm": 0.41586896777153015, + "learning_rate": 8.919642087853122e-05, + "loss": 0.417, + "step": 5325 + }, + { + "epoch": 0.6402402402402403, + "grad_norm": 0.35224875807762146, + "learning_rate": 8.917688897900822e-05, + "loss": 0.3912, + "step": 5330 + }, + { + "epoch": 0.6408408408408408, + "grad_norm": 0.3455101251602173, + "learning_rate": 8.915734158207822e-05, + "loss": 0.3583, + "step": 5335 + }, + { + "epoch": 0.6414414414414414, + "grad_norm": 0.34384745359420776, + "learning_rate": 8.913777869547365e-05, + "loss": 0.389, + "step": 5340 + }, + { + "epoch": 0.6420420420420421, + "grad_norm": 0.3941856026649475, + "learning_rate": 8.91182003269331e-05, + "loss": 0.4044, + "step": 5345 + }, + { + "epoch": 0.6426426426426426, + "grad_norm": 0.32409560680389404, + "learning_rate": 8.909860648420131e-05, + "loss": 0.3529, + "step": 5350 + }, + { + "epoch": 0.6432432432432432, + "grad_norm": 0.3262473940849304, + "learning_rate": 8.90789971750291e-05, + "loss": 0.3897, + "step": 5355 + }, + { + "epoch": 0.6438438438438439, + "grad_norm": 0.3769555687904358, + "learning_rate": 8.905937240717346e-05, + "loss": 0.3962, + "step": 5360 + }, + { + "epoch": 0.6444444444444445, + "grad_norm": 0.3664250373840332, + "learning_rate": 8.903973218839741e-05, + "loss": 0.3887, + "step": 5365 + }, + { + "epoch": 0.645045045045045, + "grad_norm": 0.3253674805164337, + "learning_rate": 8.902007652647018e-05, + "loss": 0.3534, + "step": 5370 + }, + { + "epoch": 0.6456456456456456, + "grad_norm": 0.47549325227737427, + "learning_rate": 8.900040542916703e-05, + "loss": 0.4098, + "step": 5375 + }, + { + "epoch": 0.6462462462462463, + "grad_norm": 0.3671705424785614, + "learning_rate": 8.898071890426937e-05, + "loss": 0.3674, + "step": 5380 + }, + { + "epoch": 0.6468468468468469, + "grad_norm": 0.42792201042175293, + "learning_rate": 8.896101695956472e-05, + "loss": 0.361, + "step": 5385 + }, + { + "epoch": 0.6474474474474474, + "grad_norm": 0.34030088782310486, + "learning_rate": 8.894129960284667e-05, + "loss": 0.3389, + "step": 
5390 + }, + { + "epoch": 0.648048048048048, + "grad_norm": 0.37272098660469055, + "learning_rate": 8.89215668419149e-05, + "loss": 0.3755, + "step": 5395 + }, + { + "epoch": 0.6486486486486487, + "grad_norm": 0.4172622263431549, + "learning_rate": 8.890181868457523e-05, + "loss": 0.3551, + "step": 5400 + }, + { + "epoch": 0.6492492492492492, + "grad_norm": 0.3918893337249756, + "learning_rate": 8.888205513863952e-05, + "loss": 0.3692, + "step": 5405 + }, + { + "epoch": 0.6498498498498498, + "grad_norm": 0.36653268337249756, + "learning_rate": 8.886227621192576e-05, + "loss": 0.3525, + "step": 5410 + }, + { + "epoch": 0.6504504504504505, + "grad_norm": 0.37922611832618713, + "learning_rate": 8.884248191225797e-05, + "loss": 0.4231, + "step": 5415 + }, + { + "epoch": 0.6510510510510511, + "grad_norm": 0.384181946516037, + "learning_rate": 8.882267224746632e-05, + "loss": 0.3599, + "step": 5420 + }, + { + "epoch": 0.6516516516516516, + "grad_norm": 0.33977892994880676, + "learning_rate": 8.8802847225387e-05, + "loss": 0.3971, + "step": 5425 + }, + { + "epoch": 0.6522522522522523, + "grad_norm": 0.33268967270851135, + "learning_rate": 8.878300685386232e-05, + "loss": 0.3826, + "step": 5430 + }, + { + "epoch": 0.6528528528528529, + "grad_norm": 0.36312827467918396, + "learning_rate": 8.87631511407406e-05, + "loss": 0.3465, + "step": 5435 + }, + { + "epoch": 0.6534534534534534, + "grad_norm": 0.41879400610923767, + "learning_rate": 8.874328009387632e-05, + "loss": 0.3895, + "step": 5440 + }, + { + "epoch": 0.654054054054054, + "grad_norm": 0.3923923969268799, + "learning_rate": 8.872339372112994e-05, + "loss": 0.375, + "step": 5445 + }, + { + "epoch": 0.6546546546546547, + "grad_norm": 0.4057493209838867, + "learning_rate": 8.870349203036804e-05, + "loss": 0.3528, + "step": 5450 + }, + { + "epoch": 0.6552552552552553, + "grad_norm": 0.326043963432312, + "learning_rate": 8.868357502946318e-05, + "loss": 0.3786, + "step": 5455 + }, + { + "epoch": 0.6558558558558558, + "grad_norm": 0.3921876847743988, + "learning_rate": 8.86636427262941e-05, + "loss": 0.3363, + "step": 5460 + }, + { + "epoch": 0.6564564564564564, + "grad_norm": 0.37249556183815, + "learning_rate": 8.864369512874551e-05, + "loss": 0.3595, + "step": 5465 + }, + { + "epoch": 0.6570570570570571, + "grad_norm": 0.4267086386680603, + "learning_rate": 8.862373224470815e-05, + "loss": 0.4264, + "step": 5470 + }, + { + "epoch": 0.6576576576576577, + "grad_norm": 0.34260231256484985, + "learning_rate": 8.860375408207888e-05, + "loss": 0.3976, + "step": 5475 + }, + { + "epoch": 0.6582582582582582, + "grad_norm": 0.3625841438770294, + "learning_rate": 8.858376064876056e-05, + "loss": 0.3296, + "step": 5480 + }, + { + "epoch": 0.6588588588588589, + "grad_norm": 0.41739052534103394, + "learning_rate": 8.856375195266208e-05, + "loss": 0.3798, + "step": 5485 + }, + { + "epoch": 0.6594594594594595, + "grad_norm": 0.4534454941749573, + "learning_rate": 8.85437280016984e-05, + "loss": 0.385, + "step": 5490 + }, + { + "epoch": 0.66006006006006, + "grad_norm": 0.3796202540397644, + "learning_rate": 8.852368880379049e-05, + "loss": 0.4401, + "step": 5495 + }, + { + "epoch": 0.6606606606606606, + "grad_norm": 0.35005345940589905, + "learning_rate": 8.850363436686537e-05, + "loss": 0.3768, + "step": 5500 + }, + { + "epoch": 0.6606606606606606, + "eval_loss": 0.3557297885417938, + "eval_runtime": 35.5127, + "eval_samples_per_second": 22.527, + "eval_steps_per_second": 5.632, + "step": 5500 + }, + { + "epoch": 0.6612612612612613, + "grad_norm": 
0.3682941794395447, + "learning_rate": 8.848356469885606e-05, + "loss": 0.352, + "step": 5505 + }, + { + "epoch": 0.6618618618618619, + "grad_norm": 0.4099908471107483, + "learning_rate": 8.846347980770165e-05, + "loss": 0.4246, + "step": 5510 + }, + { + "epoch": 0.6624624624624624, + "grad_norm": 0.3958446681499481, + "learning_rate": 8.84433797013472e-05, + "loss": 0.3555, + "step": 5515 + }, + { + "epoch": 0.6630630630630631, + "grad_norm": 0.28630247712135315, + "learning_rate": 8.842326438774383e-05, + "loss": 0.3137, + "step": 5520 + }, + { + "epoch": 0.6636636636636637, + "grad_norm": 0.39440688490867615, + "learning_rate": 8.840313387484867e-05, + "loss": 0.3905, + "step": 5525 + }, + { + "epoch": 0.6642642642642642, + "grad_norm": 0.42743274569511414, + "learning_rate": 8.838298817062483e-05, + "loss": 0.4187, + "step": 5530 + }, + { + "epoch": 0.6648648648648648, + "grad_norm": 0.3959651589393616, + "learning_rate": 8.836282728304145e-05, + "loss": 0.3735, + "step": 5535 + }, + { + "epoch": 0.6654654654654655, + "grad_norm": 0.34156426787376404, + "learning_rate": 8.83426512200737e-05, + "loss": 0.3353, + "step": 5540 + }, + { + "epoch": 0.6660660660660661, + "grad_norm": 0.32064732909202576, + "learning_rate": 8.832245998970271e-05, + "loss": 0.3512, + "step": 5545 + }, + { + "epoch": 0.6666666666666666, + "grad_norm": 0.4145539402961731, + "learning_rate": 8.830225359991564e-05, + "loss": 0.3382, + "step": 5550 + }, + { + "epoch": 0.6672672672672673, + "grad_norm": 0.43516016006469727, + "learning_rate": 8.828203205870564e-05, + "loss": 0.4395, + "step": 5555 + }, + { + "epoch": 0.6678678678678679, + "grad_norm": 0.37278711795806885, + "learning_rate": 8.826179537407185e-05, + "loss": 0.3943, + "step": 5560 + }, + { + "epoch": 0.6684684684684684, + "grad_norm": 0.369424968957901, + "learning_rate": 8.82415435540194e-05, + "loss": 0.3275, + "step": 5565 + }, + { + "epoch": 0.669069069069069, + "grad_norm": 0.3561498522758484, + "learning_rate": 8.822127660655942e-05, + "loss": 0.3346, + "step": 5570 + }, + { + "epoch": 0.6696696696696697, + "grad_norm": 0.38225051760673523, + "learning_rate": 8.820099453970899e-05, + "loss": 0.3948, + "step": 5575 + }, + { + "epoch": 0.6702702702702703, + "grad_norm": 0.399332731962204, + "learning_rate": 8.81806973614912e-05, + "loss": 0.375, + "step": 5580 + }, + { + "epoch": 0.6708708708708708, + "grad_norm": 0.4058336019515991, + "learning_rate": 8.81603850799351e-05, + "loss": 0.4305, + "step": 5585 + }, + { + "epoch": 0.6714714714714715, + "grad_norm": 0.4609101414680481, + "learning_rate": 8.814005770307575e-05, + "loss": 0.4438, + "step": 5590 + }, + { + "epoch": 0.6720720720720721, + "grad_norm": 0.46701136231422424, + "learning_rate": 8.811971523895415e-05, + "loss": 0.4087, + "step": 5595 + }, + { + "epoch": 0.6726726726726727, + "grad_norm": 0.32095766067504883, + "learning_rate": 8.809935769561728e-05, + "loss": 0.3499, + "step": 5600 + }, + { + "epoch": 0.6732732732732732, + "grad_norm": 0.41197481751441956, + "learning_rate": 8.807898508111806e-05, + "loss": 0.4206, + "step": 5605 + }, + { + "epoch": 0.6738738738738739, + "grad_norm": 0.42365097999572754, + "learning_rate": 8.805859740351541e-05, + "loss": 0.4077, + "step": 5610 + }, + { + "epoch": 0.6744744744744745, + "grad_norm": 0.3006840944290161, + "learning_rate": 8.803819467087417e-05, + "loss": 0.3083, + "step": 5615 + }, + { + "epoch": 0.675075075075075, + "grad_norm": 0.43815919756889343, + "learning_rate": 8.80177768912652e-05, + "loss": 0.35, + "step": 5620 + }, + { + 
"epoch": 0.6756756756756757, + "grad_norm": 0.4547651410102844, + "learning_rate": 8.799734407276521e-05, + "loss": 0.4137, + "step": 5625 + }, + { + "epoch": 0.6762762762762763, + "grad_norm": 0.3741016685962677, + "learning_rate": 8.797689622345695e-05, + "loss": 0.3489, + "step": 5630 + }, + { + "epoch": 0.6768768768768769, + "grad_norm": 0.373638778924942, + "learning_rate": 8.795643335142908e-05, + "loss": 0.3916, + "step": 5635 + }, + { + "epoch": 0.6774774774774774, + "grad_norm": 0.3880307972431183, + "learning_rate": 8.79359554647762e-05, + "loss": 0.4292, + "step": 5640 + }, + { + "epoch": 0.6780780780780781, + "grad_norm": 0.4027876853942871, + "learning_rate": 8.791546257159886e-05, + "loss": 0.3855, + "step": 5645 + }, + { + "epoch": 0.6786786786786787, + "grad_norm": 0.3801300823688507, + "learning_rate": 8.789495468000354e-05, + "loss": 0.3638, + "step": 5650 + }, + { + "epoch": 0.6792792792792792, + "grad_norm": 0.34081804752349854, + "learning_rate": 8.787443179810266e-05, + "loss": 0.3597, + "step": 5655 + }, + { + "epoch": 0.6798798798798799, + "grad_norm": 0.4241330027580261, + "learning_rate": 8.785389393401455e-05, + "loss": 0.3532, + "step": 5660 + }, + { + "epoch": 0.6804804804804805, + "grad_norm": 0.35839948058128357, + "learning_rate": 8.783334109586348e-05, + "loss": 0.3605, + "step": 5665 + }, + { + "epoch": 0.6810810810810811, + "grad_norm": 0.4927769899368286, + "learning_rate": 8.781277329177966e-05, + "loss": 0.4043, + "step": 5670 + }, + { + "epoch": 0.6816816816816816, + "grad_norm": 0.3698582947254181, + "learning_rate": 8.779219052989919e-05, + "loss": 0.3446, + "step": 5675 + }, + { + "epoch": 0.6822822822822823, + "grad_norm": 0.515845775604248, + "learning_rate": 8.77715928183641e-05, + "loss": 0.4091, + "step": 5680 + }, + { + "epoch": 0.6828828828828829, + "grad_norm": 0.39872249960899353, + "learning_rate": 8.775098016532235e-05, + "loss": 0.405, + "step": 5685 + }, + { + "epoch": 0.6834834834834835, + "grad_norm": 0.33597859740257263, + "learning_rate": 8.773035257892778e-05, + "loss": 0.3354, + "step": 5690 + }, + { + "epoch": 0.684084084084084, + "grad_norm": 0.34901607036590576, + "learning_rate": 8.770971006734015e-05, + "loss": 0.3642, + "step": 5695 + }, + { + "epoch": 0.6846846846846847, + "grad_norm": 0.4633309841156006, + "learning_rate": 8.768905263872515e-05, + "loss": 0.3454, + "step": 5700 + }, + { + "epoch": 0.6852852852852853, + "grad_norm": 0.44013097882270813, + "learning_rate": 8.766838030125432e-05, + "loss": 0.3439, + "step": 5705 + }, + { + "epoch": 0.6858858858858858, + "grad_norm": 0.36319929361343384, + "learning_rate": 8.764769306310513e-05, + "loss": 0.3189, + "step": 5710 + }, + { + "epoch": 0.6864864864864865, + "grad_norm": 0.44178229570388794, + "learning_rate": 8.762699093246096e-05, + "loss": 0.3161, + "step": 5715 + }, + { + "epoch": 0.6870870870870871, + "grad_norm": 0.4986707270145416, + "learning_rate": 8.760627391751103e-05, + "loss": 0.3449, + "step": 5720 + }, + { + "epoch": 0.6876876876876877, + "grad_norm": 0.4556070566177368, + "learning_rate": 8.75855420264505e-05, + "loss": 0.3516, + "step": 5725 + }, + { + "epoch": 0.6882882882882883, + "grad_norm": 0.45364683866500854, + "learning_rate": 8.756479526748039e-05, + "loss": 0.3842, + "step": 5730 + }, + { + "epoch": 0.6888888888888889, + "grad_norm": 0.34849977493286133, + "learning_rate": 8.754403364880759e-05, + "loss": 0.3194, + "step": 5735 + }, + { + "epoch": 0.6894894894894895, + "grad_norm": 0.39173176884651184, + "learning_rate": 
8.752325717864488e-05, + "loss": 0.3344, + "step": 5740 + }, + { + "epoch": 0.69009009009009, + "grad_norm": 0.34377187490463257, + "learning_rate": 8.750246586521095e-05, + "loss": 0.3627, + "step": 5745 + }, + { + "epoch": 0.6906906906906907, + "grad_norm": 0.44872725009918213, + "learning_rate": 8.74816597167303e-05, + "loss": 0.3905, + "step": 5750 + }, + { + "epoch": 0.6906906906906907, + "eval_loss": 0.33489829301834106, + "eval_runtime": 35.5357, + "eval_samples_per_second": 22.513, + "eval_steps_per_second": 5.628, + "step": 5750 + }, + { + "epoch": 0.6912912912912913, + "grad_norm": 0.38582900166511536, + "learning_rate": 8.746083874143334e-05, + "loss": 0.3528, + "step": 5755 + }, + { + "epoch": 0.6918918918918919, + "grad_norm": 0.4613777697086334, + "learning_rate": 8.744000294755632e-05, + "loss": 0.3335, + "step": 5760 + }, + { + "epoch": 0.6924924924924925, + "grad_norm": 0.36886975169181824, + "learning_rate": 8.741915234334138e-05, + "loss": 0.3598, + "step": 5765 + }, + { + "epoch": 0.6930930930930931, + "grad_norm": 0.2886711061000824, + "learning_rate": 8.739828693703647e-05, + "loss": 0.3482, + "step": 5770 + }, + { + "epoch": 0.6936936936936937, + "grad_norm": 0.35636454820632935, + "learning_rate": 8.737740673689547e-05, + "loss": 0.3472, + "step": 5775 + }, + { + "epoch": 0.6942942942942943, + "grad_norm": 0.3754615783691406, + "learning_rate": 8.735651175117805e-05, + "loss": 0.3748, + "step": 5780 + }, + { + "epoch": 0.6948948948948949, + "grad_norm": 0.4312724769115448, + "learning_rate": 8.733560198814975e-05, + "loss": 0.3662, + "step": 5785 + }, + { + "epoch": 0.6954954954954955, + "grad_norm": 0.4254775643348694, + "learning_rate": 8.731467745608195e-05, + "loss": 0.3273, + "step": 5790 + }, + { + "epoch": 0.6960960960960961, + "grad_norm": 0.3985266387462616, + "learning_rate": 8.72937381632519e-05, + "loss": 0.3426, + "step": 5795 + }, + { + "epoch": 0.6966966966966966, + "grad_norm": 0.3750954866409302, + "learning_rate": 8.727278411794261e-05, + "loss": 0.3256, + "step": 5800 + }, + { + "epoch": 0.6972972972972973, + "grad_norm": 0.5321087837219238, + "learning_rate": 8.725181532844305e-05, + "loss": 0.3643, + "step": 5805 + }, + { + "epoch": 0.6978978978978979, + "grad_norm": 0.38525283336639404, + "learning_rate": 8.723083180304791e-05, + "loss": 0.3331, + "step": 5810 + }, + { + "epoch": 0.6984984984984985, + "grad_norm": 0.4347771108150482, + "learning_rate": 8.720983355005776e-05, + "loss": 0.4098, + "step": 5815 + }, + { + "epoch": 0.6990990990990991, + "grad_norm": 0.3714815676212311, + "learning_rate": 8.7188820577779e-05, + "loss": 0.3134, + "step": 5820 + }, + { + "epoch": 0.6996996996996997, + "grad_norm": 0.3754033148288727, + "learning_rate": 8.716779289452384e-05, + "loss": 0.3197, + "step": 5825 + }, + { + "epoch": 0.7003003003003003, + "grad_norm": 0.34821298718452454, + "learning_rate": 8.714675050861029e-05, + "loss": 0.3545, + "step": 5830 + }, + { + "epoch": 0.7009009009009008, + "grad_norm": 0.3712867200374603, + "learning_rate": 8.712569342836223e-05, + "loss": 0.3837, + "step": 5835 + }, + { + "epoch": 0.7015015015015015, + "grad_norm": 0.3959085941314697, + "learning_rate": 8.710462166210931e-05, + "loss": 0.368, + "step": 5840 + }, + { + "epoch": 0.7021021021021021, + "grad_norm": 0.4257919192314148, + "learning_rate": 8.708353521818697e-05, + "loss": 0.3487, + "step": 5845 + }, + { + "epoch": 0.7027027027027027, + "grad_norm": 0.35467055439949036, + "learning_rate": 8.706243410493651e-05, + "loss": 0.3392, + "step": 5850 + }, + { 
+ "epoch": 0.7033033033033033, + "grad_norm": 0.3705252408981323, + "learning_rate": 8.704131833070501e-05, + "loss": 0.4105, + "step": 5855 + }, + { + "epoch": 0.7039039039039039, + "grad_norm": 0.3051586151123047, + "learning_rate": 8.702018790384535e-05, + "loss": 0.3708, + "step": 5860 + }, + { + "epoch": 0.7045045045045045, + "grad_norm": 0.37856411933898926, + "learning_rate": 8.699904283271617e-05, + "loss": 0.3514, + "step": 5865 + }, + { + "epoch": 0.7051051051051052, + "grad_norm": 0.4097536504268646, + "learning_rate": 8.697788312568198e-05, + "loss": 0.3417, + "step": 5870 + }, + { + "epoch": 0.7057057057057057, + "grad_norm": 0.43204861879348755, + "learning_rate": 8.695670879111301e-05, + "loss": 0.3584, + "step": 5875 + }, + { + "epoch": 0.7063063063063063, + "grad_norm": 0.3879355788230896, + "learning_rate": 8.69355198373853e-05, + "loss": 0.335, + "step": 5880 + }, + { + "epoch": 0.7069069069069069, + "grad_norm": 0.5099337100982666, + "learning_rate": 8.691431627288072e-05, + "loss": 0.3692, + "step": 5885 + }, + { + "epoch": 0.7075075075075075, + "grad_norm": 0.4005381166934967, + "learning_rate": 8.68930981059868e-05, + "loss": 0.3466, + "step": 5890 + }, + { + "epoch": 0.7081081081081081, + "grad_norm": 0.4230159819126129, + "learning_rate": 8.687186534509699e-05, + "loss": 0.3351, + "step": 5895 + }, + { + "epoch": 0.7087087087087087, + "grad_norm": 0.4088141620159149, + "learning_rate": 8.68506179986104e-05, + "loss": 0.3813, + "step": 5900 + }, + { + "epoch": 0.7093093093093094, + "grad_norm": 0.3280819356441498, + "learning_rate": 8.682935607493197e-05, + "loss": 0.3401, + "step": 5905 + }, + { + "epoch": 0.7099099099099099, + "grad_norm": 0.3707108497619629, + "learning_rate": 8.68080795824724e-05, + "loss": 0.32, + "step": 5910 + }, + { + "epoch": 0.7105105105105105, + "grad_norm": 0.5042741298675537, + "learning_rate": 8.678678852964812e-05, + "loss": 0.3853, + "step": 5915 + }, + { + "epoch": 0.7111111111111111, + "grad_norm": 0.4071110188961029, + "learning_rate": 8.676548292488136e-05, + "loss": 0.3667, + "step": 5920 + }, + { + "epoch": 0.7117117117117117, + "grad_norm": 0.3619343936443329, + "learning_rate": 8.67441627766001e-05, + "loss": 0.3353, + "step": 5925 + }, + { + "epoch": 0.7123123123123123, + "grad_norm": 0.42885512113571167, + "learning_rate": 8.672282809323802e-05, + "loss": 0.3706, + "step": 5930 + }, + { + "epoch": 0.7129129129129129, + "grad_norm": 0.4062737822532654, + "learning_rate": 8.670147888323466e-05, + "loss": 0.3547, + "step": 5935 + }, + { + "epoch": 0.7135135135135136, + "grad_norm": 0.3969167470932007, + "learning_rate": 8.668011515503517e-05, + "loss": 0.326, + "step": 5940 + }, + { + "epoch": 0.7141141141141141, + "grad_norm": 0.4645400047302246, + "learning_rate": 8.665873691709055e-05, + "loss": 0.336, + "step": 5945 + }, + { + "epoch": 0.7147147147147147, + "grad_norm": 0.3994089961051941, + "learning_rate": 8.66373441778575e-05, + "loss": 0.3598, + "step": 5950 + }, + { + "epoch": 0.7153153153153153, + "grad_norm": 0.37047797441482544, + "learning_rate": 8.661593694579845e-05, + "loss": 0.3982, + "step": 5955 + }, + { + "epoch": 0.715915915915916, + "grad_norm": 0.399649977684021, + "learning_rate": 8.659451522938157e-05, + "loss": 0.3593, + "step": 5960 + }, + { + "epoch": 0.7165165165165165, + "grad_norm": 0.36607518792152405, + "learning_rate": 8.657307903708077e-05, + "loss": 0.3224, + "step": 5965 + }, + { + "epoch": 0.7171171171171171, + "grad_norm": 0.4388699531555176, + "learning_rate": 8.655162837737565e-05, + 
"loss": 0.3652, + "step": 5970 + }, + { + "epoch": 0.7177177177177178, + "grad_norm": 0.4220297634601593, + "learning_rate": 8.653016325875158e-05, + "loss": 0.3607, + "step": 5975 + }, + { + "epoch": 0.7183183183183183, + "grad_norm": 0.41517767310142517, + "learning_rate": 8.650868368969964e-05, + "loss": 0.3531, + "step": 5980 + }, + { + "epoch": 0.7189189189189189, + "grad_norm": 0.3969719409942627, + "learning_rate": 8.648718967871661e-05, + "loss": 0.3121, + "step": 5985 + }, + { + "epoch": 0.7195195195195195, + "grad_norm": 0.49861007928848267, + "learning_rate": 8.646568123430499e-05, + "loss": 0.3661, + "step": 5990 + }, + { + "epoch": 0.7201201201201202, + "grad_norm": 0.4132179915904999, + "learning_rate": 8.644415836497295e-05, + "loss": 0.3354, + "step": 5995 + }, + { + "epoch": 0.7207207207207207, + "grad_norm": 0.4681933522224426, + "learning_rate": 8.642262107923445e-05, + "loss": 0.3979, + "step": 6000 + }, + { + "epoch": 0.7207207207207207, + "eval_loss": 0.3170243203639984, + "eval_runtime": 35.5466, + "eval_samples_per_second": 22.506, + "eval_steps_per_second": 5.626, + "step": 6000 + }, + { + "epoch": 0.7213213213213213, + "grad_norm": 0.3149552643299103, + "learning_rate": 8.64010693856091e-05, + "loss": 0.3217, + "step": 6005 + }, + { + "epoch": 0.721921921921922, + "grad_norm": 0.42974141240119934, + "learning_rate": 8.637950329262219e-05, + "loss": 0.3539, + "step": 6010 + }, + { + "epoch": 0.7225225225225225, + "grad_norm": 0.3971770405769348, + "learning_rate": 8.635792280880475e-05, + "loss": 0.318, + "step": 6015 + }, + { + "epoch": 0.7231231231231231, + "grad_norm": 0.4378364682197571, + "learning_rate": 8.63363279426935e-05, + "loss": 0.3837, + "step": 6020 + }, + { + "epoch": 0.7237237237237237, + "grad_norm": 0.46137237548828125, + "learning_rate": 8.631471870283082e-05, + "loss": 0.3426, + "step": 6025 + }, + { + "epoch": 0.7243243243243244, + "grad_norm": 0.4154767692089081, + "learning_rate": 8.629309509776478e-05, + "loss": 0.3469, + "step": 6030 + }, + { + "epoch": 0.7249249249249249, + "grad_norm": 0.35032203793525696, + "learning_rate": 8.627145713604916e-05, + "loss": 0.3323, + "step": 6035 + }, + { + "epoch": 0.7255255255255255, + "grad_norm": 0.434451162815094, + "learning_rate": 8.624980482624339e-05, + "loss": 0.3409, + "step": 6040 + }, + { + "epoch": 0.7261261261261261, + "grad_norm": 0.4690226912498474, + "learning_rate": 8.622813817691258e-05, + "loss": 0.3693, + "step": 6045 + }, + { + "epoch": 0.7267267267267268, + "grad_norm": 0.5112568140029907, + "learning_rate": 8.620645719662754e-05, + "loss": 0.3628, + "step": 6050 + }, + { + "epoch": 0.7273273273273273, + "grad_norm": 0.3646400570869446, + "learning_rate": 8.618476189396472e-05, + "loss": 0.3293, + "step": 6055 + }, + { + "epoch": 0.7279279279279279, + "grad_norm": 0.37316304445266724, + "learning_rate": 8.616305227750624e-05, + "loss": 0.3099, + "step": 6060 + }, + { + "epoch": 0.7285285285285286, + "grad_norm": 0.4041762351989746, + "learning_rate": 8.61413283558399e-05, + "loss": 0.2997, + "step": 6065 + }, + { + "epoch": 0.7291291291291291, + "grad_norm": 0.4494181275367737, + "learning_rate": 8.611959013755912e-05, + "loss": 0.3401, + "step": 6070 + }, + { + "epoch": 0.7297297297297297, + "grad_norm": 0.43668776750564575, + "learning_rate": 8.609783763126303e-05, + "loss": 0.3148, + "step": 6075 + }, + { + "epoch": 0.7303303303303303, + "grad_norm": 0.42859476804733276, + "learning_rate": 8.607607084555636e-05, + "loss": 0.3863, + "step": 6080 + }, + { + "epoch": 
0.730930930930931, + "grad_norm": 0.4075924754142761, + "learning_rate": 8.605428978904953e-05, + "loss": 0.3486, + "step": 6085 + }, + { + "epoch": 0.7315315315315315, + "grad_norm": 0.39961525797843933, + "learning_rate": 8.603249447035859e-05, + "loss": 0.292, + "step": 6090 + }, + { + "epoch": 0.7321321321321321, + "grad_norm": 0.4883679449558258, + "learning_rate": 8.60106848981052e-05, + "loss": 0.3576, + "step": 6095 + }, + { + "epoch": 0.7327327327327328, + "grad_norm": 0.39501965045928955, + "learning_rate": 8.59888610809167e-05, + "loss": 0.3473, + "step": 6100 + }, + { + "epoch": 0.7333333333333333, + "grad_norm": 0.4299990236759186, + "learning_rate": 8.596702302742605e-05, + "loss": 0.3048, + "step": 6105 + }, + { + "epoch": 0.7339339339339339, + "grad_norm": 0.3688097298145294, + "learning_rate": 8.594517074627186e-05, + "loss": 0.3456, + "step": 6110 + }, + { + "epoch": 0.7345345345345345, + "grad_norm": 0.44492146372795105, + "learning_rate": 8.592330424609834e-05, + "loss": 0.3273, + "step": 6115 + }, + { + "epoch": 0.7351351351351352, + "grad_norm": 0.475625216960907, + "learning_rate": 8.590142353555532e-05, + "loss": 0.3544, + "step": 6120 + }, + { + "epoch": 0.7357357357357357, + "grad_norm": 0.4551377594470978, + "learning_rate": 8.587952862329829e-05, + "loss": 0.362, + "step": 6125 + }, + { + "epoch": 0.7363363363363363, + "grad_norm": 0.47384923696517944, + "learning_rate": 8.585761951798832e-05, + "loss": 0.3778, + "step": 6130 + }, + { + "epoch": 0.736936936936937, + "grad_norm": 0.3864975869655609, + "learning_rate": 8.583569622829213e-05, + "loss": 0.2848, + "step": 6135 + }, + { + "epoch": 0.7375375375375376, + "grad_norm": 0.4830896258354187, + "learning_rate": 8.5813758762882e-05, + "loss": 0.3299, + "step": 6140 + }, + { + "epoch": 0.7381381381381381, + "grad_norm": 0.4643004238605499, + "learning_rate": 8.579180713043587e-05, + "loss": 0.3324, + "step": 6145 + }, + { + "epoch": 0.7387387387387387, + "grad_norm": 0.46419695019721985, + "learning_rate": 8.576984133963725e-05, + "loss": 0.3263, + "step": 6150 + }, + { + "epoch": 0.7393393393393394, + "grad_norm": 0.40200313925743103, + "learning_rate": 8.57478613991753e-05, + "loss": 0.3368, + "step": 6155 + }, + { + "epoch": 0.7399399399399399, + "grad_norm": 0.441680371761322, + "learning_rate": 8.572586731774468e-05, + "loss": 0.3421, + "step": 6160 + }, + { + "epoch": 0.7405405405405405, + "grad_norm": 0.43650439381599426, + "learning_rate": 8.570385910404575e-05, + "loss": 0.3844, + "step": 6165 + }, + { + "epoch": 0.7411411411411412, + "grad_norm": 0.3936440050601959, + "learning_rate": 8.568183676678438e-05, + "loss": 0.3255, + "step": 6170 + }, + { + "epoch": 0.7417417417417418, + "grad_norm": 0.5458399057388306, + "learning_rate": 8.56598003146721e-05, + "loss": 0.3524, + "step": 6175 + }, + { + "epoch": 0.7423423423423423, + "grad_norm": 0.3615384101867676, + "learning_rate": 8.563774975642595e-05, + "loss": 0.3202, + "step": 6180 + }, + { + "epoch": 0.7429429429429429, + "grad_norm": 0.40769365429878235, + "learning_rate": 8.561568510076861e-05, + "loss": 0.3557, + "step": 6185 + }, + { + "epoch": 0.7435435435435436, + "grad_norm": 0.41414713859558105, + "learning_rate": 8.559360635642828e-05, + "loss": 0.2905, + "step": 6190 + }, + { + "epoch": 0.7441441441441441, + "grad_norm": 0.46470022201538086, + "learning_rate": 8.557151353213881e-05, + "loss": 0.3417, + "step": 6195 + }, + { + "epoch": 0.7447447447447447, + "grad_norm": 0.4674842953681946, + "learning_rate": 8.554940663663953e-05, + "loss": 
0.3212, + "step": 6200 + }, + { + "epoch": 0.7453453453453454, + "grad_norm": 0.43788832426071167, + "learning_rate": 8.55272856786754e-05, + "loss": 0.3357, + "step": 6205 + }, + { + "epoch": 0.745945945945946, + "grad_norm": 0.42673784494400024, + "learning_rate": 8.550515066699692e-05, + "loss": 0.3096, + "step": 6210 + }, + { + "epoch": 0.7465465465465465, + "grad_norm": 0.39619484543800354, + "learning_rate": 8.548300161036016e-05, + "loss": 0.3188, + "step": 6215 + }, + { + "epoch": 0.7471471471471471, + "grad_norm": 0.42830052971839905, + "learning_rate": 8.546083851752673e-05, + "loss": 0.3581, + "step": 6220 + }, + { + "epoch": 0.7477477477477478, + "grad_norm": 0.3399454355239868, + "learning_rate": 8.543866139726381e-05, + "loss": 0.3254, + "step": 6225 + }, + { + "epoch": 0.7483483483483484, + "grad_norm": 0.38868388533592224, + "learning_rate": 8.541647025834412e-05, + "loss": 0.3096, + "step": 6230 + }, + { + "epoch": 0.7489489489489489, + "grad_norm": 0.442039430141449, + "learning_rate": 8.539426510954589e-05, + "loss": 0.3312, + "step": 6235 + }, + { + "epoch": 0.7495495495495496, + "grad_norm": 0.4817804992198944, + "learning_rate": 8.537204595965298e-05, + "loss": 0.3422, + "step": 6240 + }, + { + "epoch": 0.7501501501501502, + "grad_norm": 0.42679134011268616, + "learning_rate": 8.53498128174547e-05, + "loss": 0.3781, + "step": 6245 + }, + { + "epoch": 0.7507507507507507, + "grad_norm": 0.4844966530799866, + "learning_rate": 8.532756569174593e-05, + "loss": 0.3843, + "step": 6250 + }, + { + "epoch": 0.7507507507507507, + "eval_loss": 0.30824774503707886, + "eval_runtime": 35.7014, + "eval_samples_per_second": 22.408, + "eval_steps_per_second": 5.602, + "step": 6250 + }, + { + "epoch": 0.7513513513513513, + "grad_norm": 0.3791470527648926, + "learning_rate": 8.53053045913271e-05, + "loss": 0.3248, + "step": 6255 + }, + { + "epoch": 0.751951951951952, + "grad_norm": 0.4487055540084839, + "learning_rate": 8.528302952500415e-05, + "loss": 0.3045, + "step": 6260 + }, + { + "epoch": 0.7525525525525526, + "grad_norm": 0.3538472354412079, + "learning_rate": 8.526074050158855e-05, + "loss": 0.2978, + "step": 6265 + }, + { + "epoch": 0.7531531531531531, + "grad_norm": 0.475076824426651, + "learning_rate": 8.523843752989724e-05, + "loss": 0.3561, + "step": 6270 + }, + { + "epoch": 0.7537537537537538, + "grad_norm": 0.4490582346916199, + "learning_rate": 8.521612061875278e-05, + "loss": 0.3931, + "step": 6275 + }, + { + "epoch": 0.7543543543543544, + "grad_norm": 0.3989025354385376, + "learning_rate": 8.519378977698316e-05, + "loss": 0.3208, + "step": 6280 + }, + { + "epoch": 0.7549549549549549, + "grad_norm": 0.41477465629577637, + "learning_rate": 8.51714450134219e-05, + "loss": 0.3535, + "step": 6285 + }, + { + "epoch": 0.7555555555555555, + "grad_norm": 0.3471718430519104, + "learning_rate": 8.514908633690806e-05, + "loss": 0.3103, + "step": 6290 + }, + { + "epoch": 0.7561561561561562, + "grad_norm": 0.3647453486919403, + "learning_rate": 8.512671375628616e-05, + "loss": 0.3069, + "step": 6295 + }, + { + "epoch": 0.7567567567567568, + "grad_norm": 0.4987131357192993, + "learning_rate": 8.510432728040623e-05, + "loss": 0.3442, + "step": 6300 + }, + { + "epoch": 0.7573573573573573, + "grad_norm": 0.4753042757511139, + "learning_rate": 8.508192691812385e-05, + "loss": 0.3547, + "step": 6305 + }, + { + "epoch": 0.757957957957958, + "grad_norm": 0.39992815256118774, + "learning_rate": 8.505951267829999e-05, + "loss": 0.3786, + "step": 6310 + }, + { + "epoch": 0.7585585585585586, + 
"grad_norm": 0.4165491759777069, + "learning_rate": 8.50370845698012e-05, + "loss": 0.313, + "step": 6315 + }, + { + "epoch": 0.7591591591591592, + "grad_norm": 0.4802296459674835, + "learning_rate": 8.50146426014995e-05, + "loss": 0.3144, + "step": 6320 + }, + { + "epoch": 0.7597597597597597, + "grad_norm": 0.34493401646614075, + "learning_rate": 8.499218678227234e-05, + "loss": 0.3359, + "step": 6325 + }, + { + "epoch": 0.7603603603603604, + "grad_norm": 0.40848666429519653, + "learning_rate": 8.49697171210027e-05, + "loss": 0.3263, + "step": 6330 + }, + { + "epoch": 0.760960960960961, + "grad_norm": 0.4861222803592682, + "learning_rate": 8.494723362657902e-05, + "loss": 0.3275, + "step": 6335 + }, + { + "epoch": 0.7615615615615615, + "grad_norm": 0.4158020615577698, + "learning_rate": 8.492473630789523e-05, + "loss": 0.313, + "step": 6340 + }, + { + "epoch": 0.7621621621621621, + "grad_norm": 0.3994755148887634, + "learning_rate": 8.49022251738507e-05, + "loss": 0.3762, + "step": 6345 + }, + { + "epoch": 0.7627627627627628, + "grad_norm": 0.47359034419059753, + "learning_rate": 8.487970023335028e-05, + "loss": 0.312, + "step": 6350 + }, + { + "epoch": 0.7633633633633634, + "grad_norm": 0.4214676320552826, + "learning_rate": 8.485716149530428e-05, + "loss": 0.3457, + "step": 6355 + }, + { + "epoch": 0.7639639639639639, + "grad_norm": 0.4861663281917572, + "learning_rate": 8.483460896862845e-05, + "loss": 0.3491, + "step": 6360 + }, + { + "epoch": 0.7645645645645646, + "grad_norm": 0.40490207076072693, + "learning_rate": 8.481204266224403e-05, + "loss": 0.293, + "step": 6365 + }, + { + "epoch": 0.7651651651651652, + "grad_norm": 0.4203982949256897, + "learning_rate": 8.478946258507767e-05, + "loss": 0.3458, + "step": 6370 + }, + { + "epoch": 0.7657657657657657, + "grad_norm": 0.4834545850753784, + "learning_rate": 8.476686874606153e-05, + "loss": 0.3688, + "step": 6375 + }, + { + "epoch": 0.7663663663663663, + "grad_norm": 0.4755549430847168, + "learning_rate": 8.474426115413314e-05, + "loss": 0.3221, + "step": 6380 + }, + { + "epoch": 0.766966966966967, + "grad_norm": 0.39933833479881287, + "learning_rate": 8.47216398182355e-05, + "loss": 0.3388, + "step": 6385 + }, + { + "epoch": 0.7675675675675676, + "grad_norm": 0.474345326423645, + "learning_rate": 8.469900474731707e-05, + "loss": 0.3069, + "step": 6390 + }, + { + "epoch": 0.7681681681681681, + "grad_norm": 0.46039968729019165, + "learning_rate": 8.467635595033172e-05, + "loss": 0.3501, + "step": 6395 + }, + { + "epoch": 0.7687687687687688, + "grad_norm": 0.4588356614112854, + "learning_rate": 8.465369343623875e-05, + "loss": 0.3405, + "step": 6400 + }, + { + "epoch": 0.7693693693693694, + "grad_norm": 0.3978899419307709, + "learning_rate": 8.463101721400287e-05, + "loss": 0.3406, + "step": 6405 + }, + { + "epoch": 0.76996996996997, + "grad_norm": 0.3917028605937958, + "learning_rate": 8.460832729259427e-05, + "loss": 0.3561, + "step": 6410 + }, + { + "epoch": 0.7705705705705705, + "grad_norm": 0.36762386560440063, + "learning_rate": 8.458562368098849e-05, + "loss": 0.357, + "step": 6415 + }, + { + "epoch": 0.7711711711711712, + "grad_norm": 0.426001638174057, + "learning_rate": 8.456290638816653e-05, + "loss": 0.3219, + "step": 6420 + }, + { + "epoch": 0.7717717717717718, + "grad_norm": 0.43328964710235596, + "learning_rate": 8.454017542311478e-05, + "loss": 0.3638, + "step": 6425 + }, + { + "epoch": 0.7723723723723723, + "grad_norm": 0.39316022396087646, + "learning_rate": 8.451743079482506e-05, + "loss": 0.3212, + "step": 6430 + 
}, + { + "epoch": 0.772972972972973, + "grad_norm": 0.37973538041114807, + "learning_rate": 8.449467251229457e-05, + "loss": 0.312, + "step": 6435 + }, + { + "epoch": 0.7735735735735736, + "grad_norm": 0.5051682591438293, + "learning_rate": 8.447190058452592e-05, + "loss": 0.3516, + "step": 6440 + }, + { + "epoch": 0.7741741741741742, + "grad_norm": 0.3958034813404083, + "learning_rate": 8.444911502052715e-05, + "loss": 0.3364, + "step": 6445 + }, + { + "epoch": 0.7747747747747747, + "grad_norm": 0.36705729365348816, + "learning_rate": 8.442631582931162e-05, + "loss": 0.3391, + "step": 6450 + }, + { + "epoch": 0.7753753753753754, + "grad_norm": 0.48368221521377563, + "learning_rate": 8.440350301989817e-05, + "loss": 0.3281, + "step": 6455 + }, + { + "epoch": 0.775975975975976, + "grad_norm": 0.4401091933250427, + "learning_rate": 8.438067660131092e-05, + "loss": 0.2843, + "step": 6460 + }, + { + "epoch": 0.7765765765765765, + "grad_norm": 0.4518408179283142, + "learning_rate": 8.435783658257952e-05, + "loss": 0.3332, + "step": 6465 + }, + { + "epoch": 0.7771771771771772, + "grad_norm": 0.4035428464412689, + "learning_rate": 8.433498297273885e-05, + "loss": 0.3057, + "step": 6470 + }, + { + "epoch": 0.7777777777777778, + "grad_norm": 0.3968808650970459, + "learning_rate": 8.431211578082925e-05, + "loss": 0.3064, + "step": 6475 + }, + { + "epoch": 0.7783783783783784, + "grad_norm": 0.45929592847824097, + "learning_rate": 8.428923501589642e-05, + "loss": 0.361, + "step": 6480 + }, + { + "epoch": 0.7789789789789789, + "grad_norm": 0.4726107716560364, + "learning_rate": 8.426634068699144e-05, + "loss": 0.3555, + "step": 6485 + }, + { + "epoch": 0.7795795795795796, + "grad_norm": 0.4348137676715851, + "learning_rate": 8.42434328031707e-05, + "loss": 0.3134, + "step": 6490 + }, + { + "epoch": 0.7801801801801802, + "grad_norm": 0.42450523376464844, + "learning_rate": 8.422051137349604e-05, + "loss": 0.2929, + "step": 6495 + }, + { + "epoch": 0.7807807807807807, + "grad_norm": 0.3884715437889099, + "learning_rate": 8.419757640703456e-05, + "loss": 0.3505, + "step": 6500 + }, + { + "epoch": 0.7807807807807807, + "eval_loss": 0.2983928620815277, + "eval_runtime": 35.5006, + "eval_samples_per_second": 22.535, + "eval_steps_per_second": 5.634, + "step": 6500 + }, + { + "epoch": 0.7813813813813814, + "grad_norm": 0.38903436064720154, + "learning_rate": 8.417462791285879e-05, + "loss": 0.319, + "step": 6505 + }, + { + "epoch": 0.781981981981982, + "grad_norm": 0.5192430019378662, + "learning_rate": 8.41516659000466e-05, + "loss": 0.3627, + "step": 6510 + }, + { + "epoch": 0.7825825825825826, + "grad_norm": 0.5047812461853027, + "learning_rate": 8.412869037768118e-05, + "loss": 0.3569, + "step": 6515 + }, + { + "epoch": 0.7831831831831831, + "grad_norm": 0.4486142694950104, + "learning_rate": 8.410570135485105e-05, + "loss": 0.3639, + "step": 6520 + }, + { + "epoch": 0.7837837837837838, + "grad_norm": 0.3849766254425049, + "learning_rate": 8.408269884065014e-05, + "loss": 0.366, + "step": 6525 + }, + { + "epoch": 0.7843843843843844, + "grad_norm": 0.5203561186790466, + "learning_rate": 8.405968284417766e-05, + "loss": 0.3197, + "step": 6530 + }, + { + "epoch": 0.784984984984985, + "grad_norm": 0.3983413279056549, + "learning_rate": 8.403665337453817e-05, + "loss": 0.297, + "step": 6535 + }, + { + "epoch": 0.7855855855855856, + "grad_norm": 0.5210694670677185, + "learning_rate": 8.401361044084154e-05, + "loss": 0.3278, + "step": 6540 + }, + { + "epoch": 0.7861861861861862, + "grad_norm": 
0.3612450063228607, + "learning_rate": 8.3990554052203e-05, + "loss": 0.3049, + "step": 6545 + }, + { + "epoch": 0.7867867867867868, + "grad_norm": 0.5003215670585632, + "learning_rate": 8.39674842177431e-05, + "loss": 0.3062, + "step": 6550 + }, + { + "epoch": 0.7873873873873873, + "grad_norm": 0.4537302255630493, + "learning_rate": 8.394440094658767e-05, + "loss": 0.3214, + "step": 6555 + }, + { + "epoch": 0.787987987987988, + "grad_norm": 0.4880407452583313, + "learning_rate": 8.392130424786788e-05, + "loss": 0.3067, + "step": 6560 + }, + { + "epoch": 0.7885885885885886, + "grad_norm": 0.4855288863182068, + "learning_rate": 8.389819413072024e-05, + "loss": 0.3358, + "step": 6565 + }, + { + "epoch": 0.7891891891891892, + "grad_norm": 0.45421022176742554, + "learning_rate": 8.387507060428652e-05, + "loss": 0.3018, + "step": 6570 + }, + { + "epoch": 0.7897897897897898, + "grad_norm": 0.5397417545318604, + "learning_rate": 8.38519336777138e-05, + "loss": 0.345, + "step": 6575 + }, + { + "epoch": 0.7903903903903904, + "grad_norm": 0.4351378083229065, + "learning_rate": 8.38287833601545e-05, + "loss": 0.3067, + "step": 6580 + }, + { + "epoch": 0.790990990990991, + "grad_norm": 0.39667877554893494, + "learning_rate": 8.380561966076632e-05, + "loss": 0.294, + "step": 6585 + }, + { + "epoch": 0.7915915915915915, + "grad_norm": 0.45850926637649536, + "learning_rate": 8.378244258871221e-05, + "loss": 0.3033, + "step": 6590 + }, + { + "epoch": 0.7921921921921922, + "grad_norm": 0.4755285978317261, + "learning_rate": 8.375925215316048e-05, + "loss": 0.3158, + "step": 6595 + }, + { + "epoch": 0.7927927927927928, + "grad_norm": 0.45534393191337585, + "learning_rate": 8.373604836328466e-05, + "loss": 0.3269, + "step": 6600 + }, + { + "epoch": 0.7933933933933934, + "grad_norm": 0.4943680763244629, + "learning_rate": 8.371283122826363e-05, + "loss": 0.3118, + "step": 6605 + }, + { + "epoch": 0.793993993993994, + "grad_norm": 0.3808231055736542, + "learning_rate": 8.368960075728149e-05, + "loss": 0.3331, + "step": 6610 + }, + { + "epoch": 0.7945945945945946, + "grad_norm": 0.4319944381713867, + "learning_rate": 8.366635695952765e-05, + "loss": 0.3259, + "step": 6615 + }, + { + "epoch": 0.7951951951951952, + "grad_norm": 0.4353392720222473, + "learning_rate": 8.364309984419677e-05, + "loss": 0.3113, + "step": 6620 + }, + { + "epoch": 0.7957957957957958, + "grad_norm": 0.4579818546772003, + "learning_rate": 8.361982942048878e-05, + "loss": 0.2505, + "step": 6625 + }, + { + "epoch": 0.7963963963963964, + "grad_norm": 0.44811052083969116, + "learning_rate": 8.359654569760893e-05, + "loss": 0.3219, + "step": 6630 + }, + { + "epoch": 0.796996996996997, + "grad_norm": 0.476921409368515, + "learning_rate": 8.357324868476762e-05, + "loss": 0.3276, + "step": 6635 + }, + { + "epoch": 0.7975975975975976, + "grad_norm": 0.5040744543075562, + "learning_rate": 8.354993839118062e-05, + "loss": 0.3516, + "step": 6640 + }, + { + "epoch": 0.7981981981981981, + "grad_norm": 0.5249648094177246, + "learning_rate": 8.352661482606888e-05, + "loss": 0.3068, + "step": 6645 + }, + { + "epoch": 0.7987987987987988, + "grad_norm": 0.45975515246391296, + "learning_rate": 8.350327799865864e-05, + "loss": 0.2917, + "step": 6650 + }, + { + "epoch": 0.7993993993993994, + "grad_norm": 0.6015775799751282, + "learning_rate": 8.347992791818137e-05, + "loss": 0.3764, + "step": 6655 + }, + { + "epoch": 0.8, + "grad_norm": 0.49470993876457214, + "learning_rate": 8.345656459387376e-05, + "loss": 0.3179, + "step": 6660 + }, + { + "epoch": 
0.8006006006006006, + "grad_norm": 0.434529572725296, + "learning_rate": 8.343318803497779e-05, + "loss": 0.3311, + "step": 6665 + }, + { + "epoch": 0.8012012012012012, + "grad_norm": 0.3967740535736084, + "learning_rate": 8.340979825074063e-05, + "loss": 0.3022, + "step": 6670 + }, + { + "epoch": 0.8018018018018018, + "grad_norm": 0.40132951736450195, + "learning_rate": 8.338639525041472e-05, + "loss": 0.295, + "step": 6675 + }, + { + "epoch": 0.8024024024024023, + "grad_norm": 0.38449764251708984, + "learning_rate": 8.33629790432577e-05, + "loss": 0.3307, + "step": 6680 + }, + { + "epoch": 0.803003003003003, + "grad_norm": 0.500756561756134, + "learning_rate": 8.333954963853241e-05, + "loss": 0.3057, + "step": 6685 + }, + { + "epoch": 0.8036036036036036, + "grad_norm": 0.5168707370758057, + "learning_rate": 8.331610704550698e-05, + "loss": 0.2988, + "step": 6690 + }, + { + "epoch": 0.8042042042042042, + "grad_norm": 0.33584412932395935, + "learning_rate": 8.329265127345471e-05, + "loss": 0.3074, + "step": 6695 + }, + { + "epoch": 0.8048048048048048, + "grad_norm": 0.3804640769958496, + "learning_rate": 8.326918233165412e-05, + "loss": 0.3003, + "step": 6700 + }, + { + "epoch": 0.8054054054054054, + "grad_norm": 0.5202556848526001, + "learning_rate": 8.324570022938894e-05, + "loss": 0.3428, + "step": 6705 + }, + { + "epoch": 0.806006006006006, + "grad_norm": 0.4677339792251587, + "learning_rate": 8.32222049759481e-05, + "loss": 0.3413, + "step": 6710 + }, + { + "epoch": 0.8066066066066067, + "grad_norm": 0.42975470423698425, + "learning_rate": 8.319869658062575e-05, + "loss": 0.3425, + "step": 6715 + }, + { + "epoch": 0.8072072072072072, + "grad_norm": 0.396402508020401, + "learning_rate": 8.317517505272125e-05, + "loss": 0.308, + "step": 6720 + }, + { + "epoch": 0.8078078078078078, + "grad_norm": 0.452930748462677, + "learning_rate": 8.315164040153911e-05, + "loss": 0.292, + "step": 6725 + }, + { + "epoch": 0.8084084084084084, + "grad_norm": 0.4625903069972992, + "learning_rate": 8.312809263638906e-05, + "loss": 0.264, + "step": 6730 + }, + { + "epoch": 0.809009009009009, + "grad_norm": 0.38552045822143555, + "learning_rate": 8.310453176658599e-05, + "loss": 0.28, + "step": 6735 + }, + { + "epoch": 0.8096096096096096, + "grad_norm": 0.4750034511089325, + "learning_rate": 8.308095780145002e-05, + "loss": 0.3182, + "step": 6740 + }, + { + "epoch": 0.8102102102102102, + "grad_norm": 0.49137282371520996, + "learning_rate": 8.305737075030645e-05, + "loss": 0.3417, + "step": 6745 + }, + { + "epoch": 0.8108108108108109, + "grad_norm": 0.4060055911540985, + "learning_rate": 8.30337706224857e-05, + "loss": 0.2868, + "step": 6750 + }, + { + "epoch": 0.8108108108108109, + "eval_loss": 0.2909846603870392, + "eval_runtime": 35.4853, + "eval_samples_per_second": 22.545, + "eval_steps_per_second": 5.636, + "step": 6750 + }, + { + "epoch": 0.8114114114114114, + "grad_norm": 0.4271671175956726, + "learning_rate": 8.301015742732338e-05, + "loss": 0.3264, + "step": 6755 + }, + { + "epoch": 0.812012012012012, + "grad_norm": 0.4157504737377167, + "learning_rate": 8.298653117416033e-05, + "loss": 0.3168, + "step": 6760 + }, + { + "epoch": 0.8126126126126126, + "grad_norm": 0.47310057282447815, + "learning_rate": 8.296289187234248e-05, + "loss": 0.3072, + "step": 6765 + }, + { + "epoch": 0.8132132132132132, + "grad_norm": 0.4417066276073456, + "learning_rate": 8.293923953122098e-05, + "loss": 0.3141, + "step": 6770 + }, + { + "epoch": 0.8138138138138138, + "grad_norm": 0.44005995988845825, + "learning_rate": 
8.291557416015208e-05, + "loss": 0.3032, + "step": 6775 + }, + { + "epoch": 0.8144144144144144, + "grad_norm": 0.457770437002182, + "learning_rate": 8.289189576849722e-05, + "loss": 0.2836, + "step": 6780 + }, + { + "epoch": 0.815015015015015, + "grad_norm": 0.48804140090942383, + "learning_rate": 8.286820436562301e-05, + "loss": 0.3044, + "step": 6785 + }, + { + "epoch": 0.8156156156156156, + "grad_norm": 0.3784973919391632, + "learning_rate": 8.284449996090115e-05, + "loss": 0.3148, + "step": 6790 + }, + { + "epoch": 0.8162162162162162, + "grad_norm": 0.4097319543361664, + "learning_rate": 8.282078256370853e-05, + "loss": 0.2991, + "step": 6795 + }, + { + "epoch": 0.8168168168168168, + "grad_norm": 0.45843109488487244, + "learning_rate": 8.279705218342718e-05, + "loss": 0.2848, + "step": 6800 + }, + { + "epoch": 0.8174174174174175, + "grad_norm": 0.4852583110332489, + "learning_rate": 8.277330882944422e-05, + "loss": 0.3338, + "step": 6805 + }, + { + "epoch": 0.818018018018018, + "grad_norm": 0.43163394927978516, + "learning_rate": 8.274955251115195e-05, + "loss": 0.3061, + "step": 6810 + }, + { + "epoch": 0.8186186186186186, + "grad_norm": 0.456047922372818, + "learning_rate": 8.272578323794778e-05, + "loss": 0.3251, + "step": 6815 + }, + { + "epoch": 0.8192192192192193, + "grad_norm": 0.42587724328041077, + "learning_rate": 8.270200101923427e-05, + "loss": 0.2674, + "step": 6820 + }, + { + "epoch": 0.8198198198198198, + "grad_norm": 0.5281959772109985, + "learning_rate": 8.267820586441901e-05, + "loss": 0.2946, + "step": 6825 + }, + { + "epoch": 0.8204204204204204, + "grad_norm": 0.4978925585746765, + "learning_rate": 8.265439778291485e-05, + "loss": 0.3074, + "step": 6830 + }, + { + "epoch": 0.821021021021021, + "grad_norm": 0.575862467288971, + "learning_rate": 8.263057678413963e-05, + "loss": 0.3383, + "step": 6835 + }, + { + "epoch": 0.8216216216216217, + "grad_norm": 0.38265636563301086, + "learning_rate": 8.260674287751637e-05, + "loss": 0.3126, + "step": 6840 + }, + { + "epoch": 0.8222222222222222, + "grad_norm": 0.48499342799186707, + "learning_rate": 8.258289607247314e-05, + "loss": 0.3055, + "step": 6845 + }, + { + "epoch": 0.8228228228228228, + "grad_norm": 0.44227099418640137, + "learning_rate": 8.25590363784432e-05, + "loss": 0.3088, + "step": 6850 + }, + { + "epoch": 0.8234234234234235, + "grad_norm": 0.5235266089439392, + "learning_rate": 8.253516380486481e-05, + "loss": 0.311, + "step": 6855 + }, + { + "epoch": 0.824024024024024, + "grad_norm": 0.4245067238807678, + "learning_rate": 8.251127836118138e-05, + "loss": 0.279, + "step": 6860 + }, + { + "epoch": 0.8246246246246246, + "grad_norm": 0.441022127866745, + "learning_rate": 8.24873800568414e-05, + "loss": 0.3058, + "step": 6865 + }, + { + "epoch": 0.8252252252252252, + "grad_norm": 0.5665812492370605, + "learning_rate": 8.246346890129846e-05, + "loss": 0.3386, + "step": 6870 + }, + { + "epoch": 0.8258258258258259, + "grad_norm": 0.40924161672592163, + "learning_rate": 8.24395449040112e-05, + "loss": 0.2884, + "step": 6875 + }, + { + "epoch": 0.8264264264264264, + "grad_norm": 0.5347919464111328, + "learning_rate": 8.241560807444338e-05, + "loss": 0.3203, + "step": 6880 + }, + { + "epoch": 0.827027027027027, + "grad_norm": 0.418349027633667, + "learning_rate": 8.239165842206381e-05, + "loss": 0.2994, + "step": 6885 + }, + { + "epoch": 0.8276276276276276, + "grad_norm": 0.36912989616394043, + "learning_rate": 8.236769595634636e-05, + "loss": 0.3056, + "step": 6890 + }, + { + "epoch": 0.8282282282282283, + "grad_norm": 
0.43872979283332825, + "learning_rate": 8.234372068677003e-05, + "loss": 0.3307, + "step": 6895 + }, + { + "epoch": 0.8288288288288288, + "grad_norm": 0.44087788462638855, + "learning_rate": 8.231973262281881e-05, + "loss": 0.297, + "step": 6900 + }, + { + "epoch": 0.8294294294294294, + "grad_norm": 0.4877927899360657, + "learning_rate": 8.22957317739818e-05, + "loss": 0.2723, + "step": 6905 + }, + { + "epoch": 0.8300300300300301, + "grad_norm": 0.5307222604751587, + "learning_rate": 8.227171814975312e-05, + "loss": 0.3188, + "step": 6910 + }, + { + "epoch": 0.8306306306306306, + "grad_norm": 0.49993765354156494, + "learning_rate": 8.224769175963199e-05, + "loss": 0.2528, + "step": 6915 + }, + { + "epoch": 0.8312312312312312, + "grad_norm": 0.5039234757423401, + "learning_rate": 8.222365261312264e-05, + "loss": 0.3006, + "step": 6920 + }, + { + "epoch": 0.8318318318318318, + "grad_norm": 0.4721224009990692, + "learning_rate": 8.219960071973436e-05, + "loss": 0.3348, + "step": 6925 + }, + { + "epoch": 0.8324324324324325, + "grad_norm": 0.4163762032985687, + "learning_rate": 8.21755360889815e-05, + "loss": 0.3302, + "step": 6930 + }, + { + "epoch": 0.833033033033033, + "grad_norm": 0.523654043674469, + "learning_rate": 8.215145873038341e-05, + "loss": 0.3182, + "step": 6935 + }, + { + "epoch": 0.8336336336336336, + "grad_norm": 0.45208466053009033, + "learning_rate": 8.212736865346451e-05, + "loss": 0.3073, + "step": 6940 + }, + { + "epoch": 0.8342342342342343, + "grad_norm": 0.45595982670783997, + "learning_rate": 8.210326586775423e-05, + "loss": 0.3296, + "step": 6945 + }, + { + "epoch": 0.8348348348348348, + "grad_norm": 0.3884303569793701, + "learning_rate": 8.207915038278705e-05, + "loss": 0.3124, + "step": 6950 + }, + { + "epoch": 0.8354354354354354, + "grad_norm": 0.38619038462638855, + "learning_rate": 8.205502220810244e-05, + "loss": 0.3087, + "step": 6955 + }, + { + "epoch": 0.836036036036036, + "grad_norm": 0.412649005651474, + "learning_rate": 8.203088135324493e-05, + "loss": 0.3136, + "step": 6960 + }, + { + "epoch": 0.8366366366366367, + "grad_norm": 0.388984352350235, + "learning_rate": 8.2006727827764e-05, + "loss": 0.3094, + "step": 6965 + }, + { + "epoch": 0.8372372372372372, + "grad_norm": 0.41695263981819153, + "learning_rate": 8.198256164121425e-05, + "loss": 0.3187, + "step": 6970 + }, + { + "epoch": 0.8378378378378378, + "grad_norm": 0.48263269662857056, + "learning_rate": 8.195838280315517e-05, + "loss": 0.2842, + "step": 6975 + }, + { + "epoch": 0.8384384384384385, + "grad_norm": 0.5374370813369751, + "learning_rate": 8.193419132315136e-05, + "loss": 0.2778, + "step": 6980 + }, + { + "epoch": 0.8390390390390391, + "grad_norm": 0.43641534447669983, + "learning_rate": 8.190998721077232e-05, + "loss": 0.3082, + "step": 6985 + }, + { + "epoch": 0.8396396396396396, + "grad_norm": 0.39405977725982666, + "learning_rate": 8.188577047559262e-05, + "loss": 0.2925, + "step": 6990 + }, + { + "epoch": 0.8402402402402402, + "grad_norm": 0.44023704528808594, + "learning_rate": 8.186154112719182e-05, + "loss": 0.3021, + "step": 6995 + }, + { + "epoch": 0.8408408408408409, + "grad_norm": 0.49988189339637756, + "learning_rate": 8.183729917515441e-05, + "loss": 0.2456, + "step": 7000 + }, + { + "epoch": 0.8408408408408409, + "eval_loss": 0.2692303955554962, + "eval_runtime": 35.5071, + "eval_samples_per_second": 22.531, + "eval_steps_per_second": 5.633, + "step": 7000 + }, + { + "epoch": 0.8414414414414414, + "grad_norm": 0.4299604296684265, + "learning_rate": 8.181304462906995e-05, + 
"loss": 0.2665, + "step": 7005 + }, + { + "epoch": 0.842042042042042, + "grad_norm": 0.47068995237350464, + "learning_rate": 8.178877749853289e-05, + "loss": 0.2925, + "step": 7010 + }, + { + "epoch": 0.8426426426426427, + "grad_norm": 0.42724624276161194, + "learning_rate": 8.176449779314275e-05, + "loss": 0.2625, + "step": 7015 + }, + { + "epoch": 0.8432432432432433, + "grad_norm": 0.4183838367462158, + "learning_rate": 8.174020552250395e-05, + "loss": 0.2865, + "step": 7020 + }, + { + "epoch": 0.8438438438438438, + "grad_norm": 0.4842805862426758, + "learning_rate": 8.171590069622592e-05, + "loss": 0.2766, + "step": 7025 + }, + { + "epoch": 0.8444444444444444, + "grad_norm": 0.42208966612815857, + "learning_rate": 8.169158332392306e-05, + "loss": 0.3053, + "step": 7030 + }, + { + "epoch": 0.8450450450450451, + "grad_norm": 0.4593719244003296, + "learning_rate": 8.166725341521468e-05, + "loss": 0.2857, + "step": 7035 + }, + { + "epoch": 0.8456456456456456, + "grad_norm": 0.5179038643836975, + "learning_rate": 8.164291097972516e-05, + "loss": 0.2931, + "step": 7040 + }, + { + "epoch": 0.8462462462462462, + "grad_norm": 0.5054352879524231, + "learning_rate": 8.161855602708369e-05, + "loss": 0.2943, + "step": 7045 + }, + { + "epoch": 0.8468468468468469, + "grad_norm": 0.4590916633605957, + "learning_rate": 8.159418856692453e-05, + "loss": 0.266, + "step": 7050 + }, + { + "epoch": 0.8474474474474475, + "grad_norm": 0.5188509821891785, + "learning_rate": 8.156980860888683e-05, + "loss": 0.2912, + "step": 7055 + }, + { + "epoch": 0.848048048048048, + "grad_norm": 0.470938116312027, + "learning_rate": 8.154541616261471e-05, + "loss": 0.3015, + "step": 7060 + }, + { + "epoch": 0.8486486486486486, + "grad_norm": 0.47591304779052734, + "learning_rate": 8.152101123775719e-05, + "loss": 0.2815, + "step": 7065 + }, + { + "epoch": 0.8492492492492493, + "grad_norm": 0.45679664611816406, + "learning_rate": 8.149659384396828e-05, + "loss": 0.2826, + "step": 7070 + }, + { + "epoch": 0.8498498498498499, + "grad_norm": 0.45807573199272156, + "learning_rate": 8.14721639909069e-05, + "loss": 0.3041, + "step": 7075 + }, + { + "epoch": 0.8504504504504504, + "grad_norm": 0.4140782654285431, + "learning_rate": 8.144772168823686e-05, + "loss": 0.2855, + "step": 7080 + }, + { + "epoch": 0.851051051051051, + "grad_norm": 0.43625256419181824, + "learning_rate": 8.142326694562696e-05, + "loss": 0.2625, + "step": 7085 + }, + { + "epoch": 0.8516516516516517, + "grad_norm": 0.3945463299751282, + "learning_rate": 8.139879977275088e-05, + "loss": 0.3109, + "step": 7090 + }, + { + "epoch": 0.8522522522522522, + "grad_norm": 0.35140371322631836, + "learning_rate": 8.137432017928726e-05, + "loss": 0.265, + "step": 7095 + }, + { + "epoch": 0.8528528528528528, + "grad_norm": 0.5079392194747925, + "learning_rate": 8.134982817491956e-05, + "loss": 0.2919, + "step": 7100 + }, + { + "epoch": 0.8534534534534535, + "grad_norm": 0.5279013514518738, + "learning_rate": 8.132532376933626e-05, + "loss": 0.2684, + "step": 7105 + }, + { + "epoch": 0.8540540540540541, + "grad_norm": 0.4624316096305847, + "learning_rate": 8.130080697223065e-05, + "loss": 0.2912, + "step": 7110 + }, + { + "epoch": 0.8546546546546546, + "grad_norm": 0.434151828289032, + "learning_rate": 8.127627779330103e-05, + "loss": 0.2955, + "step": 7115 + }, + { + "epoch": 0.8552552552552553, + "grad_norm": 0.44436565041542053, + "learning_rate": 8.125173624225047e-05, + "loss": 0.3194, + "step": 7120 + }, + { + "epoch": 0.8558558558558559, + "grad_norm": 
0.48708978295326233, + "learning_rate": 8.122718232878705e-05, + "loss": 0.3302, + "step": 7125 + }, + { + "epoch": 0.8564564564564564, + "grad_norm": 0.48794692754745483, + "learning_rate": 8.120261606262367e-05, + "loss": 0.3095, + "step": 7130 + }, + { + "epoch": 0.857057057057057, + "grad_norm": 0.5296076536178589, + "learning_rate": 8.117803745347815e-05, + "loss": 0.3131, + "step": 7135 + }, + { + "epoch": 0.8576576576576577, + "grad_norm": 0.4305223524570465, + "learning_rate": 8.115344651107314e-05, + "loss": 0.3127, + "step": 7140 + }, + { + "epoch": 0.8582582582582583, + "grad_norm": 0.41394057869911194, + "learning_rate": 8.112884324513625e-05, + "loss": 0.2848, + "step": 7145 + }, + { + "epoch": 0.8588588588588588, + "grad_norm": 0.4637286067008972, + "learning_rate": 8.110422766539991e-05, + "loss": 0.3343, + "step": 7150 + }, + { + "epoch": 0.8594594594594595, + "grad_norm": 0.47156769037246704, + "learning_rate": 8.10795997816014e-05, + "loss": 0.2641, + "step": 7155 + }, + { + "epoch": 0.8600600600600601, + "grad_norm": 0.515585720539093, + "learning_rate": 8.105495960348297e-05, + "loss": 0.2931, + "step": 7160 + }, + { + "epoch": 0.8606606606606607, + "grad_norm": 0.5015640258789062, + "learning_rate": 8.10303071407916e-05, + "loss": 0.2683, + "step": 7165 + }, + { + "epoch": 0.8612612612612612, + "grad_norm": 0.4792843759059906, + "learning_rate": 8.100564240327924e-05, + "loss": 0.3325, + "step": 7170 + }, + { + "epoch": 0.8618618618618619, + "grad_norm": 0.47345492243766785, + "learning_rate": 8.098096540070262e-05, + "loss": 0.2901, + "step": 7175 + }, + { + "epoch": 0.8624624624624625, + "grad_norm": 0.4682060480117798, + "learning_rate": 8.095627614282336e-05, + "loss": 0.2767, + "step": 7180 + }, + { + "epoch": 0.863063063063063, + "grad_norm": 0.4313962161540985, + "learning_rate": 8.093157463940792e-05, + "loss": 0.3022, + "step": 7185 + }, + { + "epoch": 0.8636636636636636, + "grad_norm": 0.43570706248283386, + "learning_rate": 8.090686090022759e-05, + "loss": 0.2828, + "step": 7190 + }, + { + "epoch": 0.8642642642642643, + "grad_norm": 0.45561483502388, + "learning_rate": 8.088213493505854e-05, + "loss": 0.2596, + "step": 7195 + }, + { + "epoch": 0.8648648648648649, + "grad_norm": 0.41750627756118774, + "learning_rate": 8.085739675368174e-05, + "loss": 0.3473, + "step": 7200 + }, + { + "epoch": 0.8654654654654654, + "grad_norm": 0.4715013802051544, + "learning_rate": 8.083264636588299e-05, + "loss": 0.2798, + "step": 7205 + }, + { + "epoch": 0.8660660660660661, + "grad_norm": 0.38323187828063965, + "learning_rate": 8.080788378145291e-05, + "loss": 0.2644, + "step": 7210 + }, + { + "epoch": 0.8666666666666667, + "grad_norm": 0.46973270177841187, + "learning_rate": 8.078310901018703e-05, + "loss": 0.328, + "step": 7215 + }, + { + "epoch": 0.8672672672672672, + "grad_norm": 0.44707220792770386, + "learning_rate": 8.075832206188558e-05, + "loss": 0.2875, + "step": 7220 + }, + { + "epoch": 0.8678678678678678, + "grad_norm": 0.48569974303245544, + "learning_rate": 8.073352294635367e-05, + "loss": 0.2957, + "step": 7225 + }, + { + "epoch": 0.8684684684684685, + "grad_norm": 0.4617443382740021, + "learning_rate": 8.070871167340125e-05, + "loss": 0.2565, + "step": 7230 + }, + { + "epoch": 0.8690690690690691, + "grad_norm": 0.44316914677619934, + "learning_rate": 8.068388825284304e-05, + "loss": 0.2776, + "step": 7235 + }, + { + "epoch": 0.8696696696696696, + "grad_norm": 0.4461076855659485, + "learning_rate": 8.065905269449852e-05, + "loss": 0.2697, + "step": 7240 + }, 
+ { + "epoch": 0.8702702702702703, + "grad_norm": 0.568824827671051, + "learning_rate": 8.063420500819205e-05, + "loss": 0.3026, + "step": 7245 + }, + { + "epoch": 0.8708708708708709, + "grad_norm": 0.5436012744903564, + "learning_rate": 8.060934520375279e-05, + "loss": 0.3146, + "step": 7250 + }, + { + "epoch": 0.8708708708708709, + "eval_loss": 0.25483617186546326, + "eval_runtime": 35.4853, + "eval_samples_per_second": 22.545, + "eval_steps_per_second": 5.636, + "step": 7250 + }, + { + "epoch": 0.8714714714714715, + "grad_norm": 0.3968980610370636, + "learning_rate": 8.05844732910146e-05, + "loss": 0.2831, + "step": 7255 + }, + { + "epoch": 0.872072072072072, + "grad_norm": 0.4703877568244934, + "learning_rate": 8.055958927981627e-05, + "loss": 0.3158, + "step": 7260 + }, + { + "epoch": 0.8726726726726727, + "grad_norm": 0.47599485516548157, + "learning_rate": 8.053469318000122e-05, + "loss": 0.2853, + "step": 7265 + }, + { + "epoch": 0.8732732732732733, + "grad_norm": 0.43644478917121887, + "learning_rate": 8.050978500141778e-05, + "loss": 0.2971, + "step": 7270 + }, + { + "epoch": 0.8738738738738738, + "grad_norm": 0.5554895997047424, + "learning_rate": 8.048486475391901e-05, + "loss": 0.2946, + "step": 7275 + }, + { + "epoch": 0.8744744744744745, + "grad_norm": 0.4631296992301941, + "learning_rate": 8.045993244736271e-05, + "loss": 0.2915, + "step": 7280 + }, + { + "epoch": 0.8750750750750751, + "grad_norm": 0.47270381450653076, + "learning_rate": 8.043498809161152e-05, + "loss": 0.2789, + "step": 7285 + }, + { + "epoch": 0.8756756756756757, + "grad_norm": 0.5386652946472168, + "learning_rate": 8.041003169653278e-05, + "loss": 0.2964, + "step": 7290 + }, + { + "epoch": 0.8762762762762762, + "grad_norm": 0.589094340801239, + "learning_rate": 8.038506327199864e-05, + "loss": 0.2792, + "step": 7295 + }, + { + "epoch": 0.8768768768768769, + "grad_norm": 0.32584628462791443, + "learning_rate": 8.036008282788599e-05, + "loss": 0.256, + "step": 7300 + }, + { + "epoch": 0.8774774774774775, + "grad_norm": 0.49077874422073364, + "learning_rate": 8.033509037407646e-05, + "loss": 0.281, + "step": 7305 + }, + { + "epoch": 0.878078078078078, + "grad_norm": 0.45855236053466797, + "learning_rate": 8.031008592045645e-05, + "loss": 0.2673, + "step": 7310 + }, + { + "epoch": 0.8786786786786787, + "grad_norm": 0.5225518345832825, + "learning_rate": 8.028506947691712e-05, + "loss": 0.2563, + "step": 7315 + }, + { + "epoch": 0.8792792792792793, + "grad_norm": 0.4622911512851715, + "learning_rate": 8.026004105335434e-05, + "loss": 0.2897, + "step": 7320 + }, + { + "epoch": 0.8798798798798799, + "grad_norm": 0.5143116116523743, + "learning_rate": 8.023500065966874e-05, + "loss": 0.3078, + "step": 7325 + }, + { + "epoch": 0.8804804804804804, + "grad_norm": 0.39429354667663574, + "learning_rate": 8.020994830576566e-05, + "loss": 0.2681, + "step": 7330 + }, + { + "epoch": 0.8810810810810811, + "grad_norm": 0.47837233543395996, + "learning_rate": 8.018488400155524e-05, + "loss": 0.2708, + "step": 7335 + }, + { + "epoch": 0.8816816816816817, + "grad_norm": 0.5050178170204163, + "learning_rate": 8.015980775695223e-05, + "loss": 0.2509, + "step": 7340 + }, + { + "epoch": 0.8822822822822823, + "grad_norm": 0.6450315713882446, + "learning_rate": 8.013471958187624e-05, + "loss": 0.3039, + "step": 7345 + }, + { + "epoch": 0.8828828828828829, + "grad_norm": 0.5351130366325378, + "learning_rate": 8.010961948625147e-05, + "loss": 0.2596, + "step": 7350 + }, + { + "epoch": 0.8834834834834835, + "grad_norm": 
0.45827925205230713, + "learning_rate": 8.008450748000694e-05, + "loss": 0.2794, + "step": 7355 + }, + { + "epoch": 0.8840840840840841, + "grad_norm": 0.46393102407455444, + "learning_rate": 8.005938357307632e-05, + "loss": 0.2778, + "step": 7360 + }, + { + "epoch": 0.8846846846846846, + "grad_norm": 0.3539421856403351, + "learning_rate": 8.003424777539799e-05, + "loss": 0.2509, + "step": 7365 + }, + { + "epoch": 0.8852852852852853, + "grad_norm": 0.5830817222595215, + "learning_rate": 8.000910009691509e-05, + "loss": 0.2671, + "step": 7370 + }, + { + "epoch": 0.8858858858858859, + "grad_norm": 0.4311886429786682, + "learning_rate": 7.998394054757538e-05, + "loss": 0.2585, + "step": 7375 + }, + { + "epoch": 0.8864864864864865, + "grad_norm": 0.49385130405426025, + "learning_rate": 7.995876913733138e-05, + "loss": 0.2938, + "step": 7380 + }, + { + "epoch": 0.8870870870870871, + "grad_norm": 0.5206505060195923, + "learning_rate": 7.993358587614025e-05, + "loss": 0.2431, + "step": 7385 + }, + { + "epoch": 0.8876876876876877, + "grad_norm": 0.42388594150543213, + "learning_rate": 7.990839077396391e-05, + "loss": 0.2426, + "step": 7390 + }, + { + "epoch": 0.8882882882882883, + "grad_norm": 0.5296729207038879, + "learning_rate": 7.988318384076886e-05, + "loss": 0.2884, + "step": 7395 + }, + { + "epoch": 0.8888888888888888, + "grad_norm": 0.464364230632782, + "learning_rate": 7.985796508652638e-05, + "loss": 0.3027, + "step": 7400 + }, + { + "epoch": 0.8894894894894895, + "grad_norm": 0.49531444907188416, + "learning_rate": 7.983273452121237e-05, + "loss": 0.2494, + "step": 7405 + }, + { + "epoch": 0.8900900900900901, + "grad_norm": 0.5268791317939758, + "learning_rate": 7.98074921548074e-05, + "loss": 0.291, + "step": 7410 + }, + { + "epoch": 0.8906906906906907, + "grad_norm": 0.5387738943099976, + "learning_rate": 7.978223799729679e-05, + "loss": 0.2952, + "step": 7415 + }, + { + "epoch": 0.8912912912912913, + "grad_norm": 0.5476765036582947, + "learning_rate": 7.975697205867037e-05, + "loss": 0.2857, + "step": 7420 + }, + { + "epoch": 0.8918918918918919, + "grad_norm": 0.45993947982788086, + "learning_rate": 7.973169434892279e-05, + "loss": 0.2922, + "step": 7425 + }, + { + "epoch": 0.8924924924924925, + "grad_norm": 0.511158287525177, + "learning_rate": 7.970640487805324e-05, + "loss": 0.2863, + "step": 7430 + }, + { + "epoch": 0.893093093093093, + "grad_norm": 0.505344808101654, + "learning_rate": 7.968110365606564e-05, + "loss": 0.2788, + "step": 7435 + }, + { + "epoch": 0.8936936936936937, + "grad_norm": 0.4371821880340576, + "learning_rate": 7.96557906929685e-05, + "loss": 0.2868, + "step": 7440 + }, + { + "epoch": 0.8942942942942943, + "grad_norm": 0.523827850818634, + "learning_rate": 7.963046599877504e-05, + "loss": 0.3103, + "step": 7445 + }, + { + "epoch": 0.8948948948948949, + "grad_norm": 0.4492994546890259, + "learning_rate": 7.960512958350303e-05, + "loss": 0.2487, + "step": 7450 + }, + { + "epoch": 0.8954954954954955, + "grad_norm": 0.40934139490127563, + "learning_rate": 7.957978145717498e-05, + "loss": 0.2574, + "step": 7455 + }, + { + "epoch": 0.8960960960960961, + "grad_norm": 0.5381701588630676, + "learning_rate": 7.955442162981794e-05, + "loss": 0.2916, + "step": 7460 + }, + { + "epoch": 0.8966966966966967, + "grad_norm": 0.5402195453643799, + "learning_rate": 7.952905011146365e-05, + "loss": 0.2736, + "step": 7465 + }, + { + "epoch": 0.8972972972972973, + "grad_norm": 0.3956668972969055, + "learning_rate": 7.950366691214843e-05, + "loss": 0.2751, + "step": 7470 + }, + { + 
"epoch": 0.8978978978978979, + "grad_norm": 0.5187951326370239, + "learning_rate": 7.947827204191329e-05, + "loss": 0.2976, + "step": 7475 + }, + { + "epoch": 0.8984984984984985, + "grad_norm": 0.46738743782043457, + "learning_rate": 7.945286551080379e-05, + "loss": 0.2781, + "step": 7480 + }, + { + "epoch": 0.8990990990990991, + "grad_norm": 0.5121240019798279, + "learning_rate": 7.942744732887011e-05, + "loss": 0.2903, + "step": 7485 + }, + { + "epoch": 0.8996996996996997, + "grad_norm": 0.45248308777809143, + "learning_rate": 7.940201750616707e-05, + "loss": 0.2908, + "step": 7490 + }, + { + "epoch": 0.9003003003003003, + "grad_norm": 0.5397672653198242, + "learning_rate": 7.937657605275408e-05, + "loss": 0.2565, + "step": 7495 + }, + { + "epoch": 0.9009009009009009, + "grad_norm": 0.506698727607727, + "learning_rate": 7.935112297869513e-05, + "loss": 0.2824, + "step": 7500 + }, + { + "epoch": 0.9009009009009009, + "eval_loss": 0.24328570067882538, + "eval_runtime": 35.6281, + "eval_samples_per_second": 22.454, + "eval_steps_per_second": 5.614, + "step": 7500 + }, + { + "epoch": 0.9015015015015015, + "grad_norm": 0.5071007609367371, + "learning_rate": 7.932565829405883e-05, + "loss": 0.2777, + "step": 7505 + }, + { + "epoch": 0.9021021021021021, + "grad_norm": 0.41715240478515625, + "learning_rate": 7.93001820089184e-05, + "loss": 0.2832, + "step": 7510 + }, + { + "epoch": 0.9027027027027027, + "grad_norm": 0.4370177090167999, + "learning_rate": 7.927469413335163e-05, + "loss": 0.2977, + "step": 7515 + }, + { + "epoch": 0.9033033033033033, + "grad_norm": 0.5694493055343628, + "learning_rate": 7.924919467744085e-05, + "loss": 0.269, + "step": 7520 + }, + { + "epoch": 0.9039039039039038, + "grad_norm": 0.4566720426082611, + "learning_rate": 7.922368365127304e-05, + "loss": 0.2566, + "step": 7525 + }, + { + "epoch": 0.9045045045045045, + "grad_norm": 0.5011866092681885, + "learning_rate": 7.919816106493973e-05, + "loss": 0.2673, + "step": 7530 + }, + { + "epoch": 0.9051051051051051, + "grad_norm": 0.5244460701942444, + "learning_rate": 7.917262692853703e-05, + "loss": 0.2872, + "step": 7535 + }, + { + "epoch": 0.9057057057057057, + "grad_norm": 0.5451821684837341, + "learning_rate": 7.91470812521656e-05, + "loss": 0.2669, + "step": 7540 + }, + { + "epoch": 0.9063063063063063, + "grad_norm": 0.4076676368713379, + "learning_rate": 7.912152404593069e-05, + "loss": 0.2551, + "step": 7545 + }, + { + "epoch": 0.9069069069069069, + "grad_norm": 0.48945051431655884, + "learning_rate": 7.909595531994208e-05, + "loss": 0.2791, + "step": 7550 + }, + { + "epoch": 0.9075075075075075, + "grad_norm": 0.444084107875824, + "learning_rate": 7.907037508431414e-05, + "loss": 0.2687, + "step": 7555 + }, + { + "epoch": 0.9081081081081082, + "grad_norm": 0.48731303215026855, + "learning_rate": 7.904478334916576e-05, + "loss": 0.2764, + "step": 7560 + }, + { + "epoch": 0.9087087087087087, + "grad_norm": 0.48154494166374207, + "learning_rate": 7.901918012462042e-05, + "loss": 0.2636, + "step": 7565 + }, + { + "epoch": 0.9093093093093093, + "grad_norm": 0.6483919024467468, + "learning_rate": 7.899356542080612e-05, + "loss": 0.2545, + "step": 7570 + }, + { + "epoch": 0.9099099099099099, + "grad_norm": 0.4964236915111542, + "learning_rate": 7.896793924785537e-05, + "loss": 0.2716, + "step": 7575 + }, + { + "epoch": 0.9105105105105105, + "grad_norm": 0.5206024646759033, + "learning_rate": 7.894230161590528e-05, + "loss": 0.3074, + "step": 7580 + }, + { + "epoch": 0.9111111111111111, + "grad_norm": 0.5688350796699524, 
+ "learning_rate": 7.891665253509746e-05, + "loss": 0.3095, + "step": 7585 + }, + { + "epoch": 0.9117117117117117, + "grad_norm": 0.5272214412689209, + "learning_rate": 7.889099201557804e-05, + "loss": 0.2776, + "step": 7590 + }, + { + "epoch": 0.9123123123123124, + "grad_norm": 0.40869587659835815, + "learning_rate": 7.886532006749768e-05, + "loss": 0.2553, + "step": 7595 + }, + { + "epoch": 0.9129129129129129, + "grad_norm": 0.41617196798324585, + "learning_rate": 7.883963670101158e-05, + "loss": 0.3021, + "step": 7600 + }, + { + "epoch": 0.9135135135135135, + "grad_norm": 0.41561898589134216, + "learning_rate": 7.881394192627947e-05, + "loss": 0.2634, + "step": 7605 + }, + { + "epoch": 0.9141141141141141, + "grad_norm": 0.5376163125038147, + "learning_rate": 7.878823575346552e-05, + "loss": 0.2937, + "step": 7610 + }, + { + "epoch": 0.9147147147147147, + "grad_norm": 0.5017083287239075, + "learning_rate": 7.876251819273846e-05, + "loss": 0.2803, + "step": 7615 + }, + { + "epoch": 0.9153153153153153, + "grad_norm": 0.49231407046318054, + "learning_rate": 7.873678925427154e-05, + "loss": 0.2636, + "step": 7620 + }, + { + "epoch": 0.9159159159159159, + "grad_norm": 0.4994038939476013, + "learning_rate": 7.87110489482425e-05, + "loss": 0.2685, + "step": 7625 + }, + { + "epoch": 0.9165165165165166, + "grad_norm": 0.4108923375606537, + "learning_rate": 7.868529728483353e-05, + "loss": 0.2594, + "step": 7630 + }, + { + "epoch": 0.9171171171171171, + "grad_norm": 0.45832404494285583, + "learning_rate": 7.865953427423139e-05, + "loss": 0.2559, + "step": 7635 + }, + { + "epoch": 0.9177177177177177, + "grad_norm": 0.41517266631126404, + "learning_rate": 7.863375992662727e-05, + "loss": 0.2434, + "step": 7640 + }, + { + "epoch": 0.9183183183183183, + "grad_norm": 0.5447494387626648, + "learning_rate": 7.860797425221685e-05, + "loss": 0.2728, + "step": 7645 + }, + { + "epoch": 0.918918918918919, + "grad_norm": 0.44468387961387634, + "learning_rate": 7.858217726120033e-05, + "loss": 0.2426, + "step": 7650 + }, + { + "epoch": 0.9195195195195195, + "grad_norm": 0.5114275217056274, + "learning_rate": 7.855636896378236e-05, + "loss": 0.282, + "step": 7655 + }, + { + "epoch": 0.9201201201201201, + "grad_norm": 0.4041288197040558, + "learning_rate": 7.853054937017203e-05, + "loss": 0.2664, + "step": 7660 + }, + { + "epoch": 0.9207207207207208, + "grad_norm": 0.5267107486724854, + "learning_rate": 7.850471849058297e-05, + "loss": 0.2849, + "step": 7665 + }, + { + "epoch": 0.9213213213213213, + "grad_norm": 0.5837280750274658, + "learning_rate": 7.847887633523321e-05, + "loss": 0.3062, + "step": 7670 + }, + { + "epoch": 0.9219219219219219, + "grad_norm": 0.4394720494747162, + "learning_rate": 7.845302291434528e-05, + "loss": 0.2756, + "step": 7675 + }, + { + "epoch": 0.9225225225225225, + "grad_norm": 0.4582143723964691, + "learning_rate": 7.842715823814616e-05, + "loss": 0.263, + "step": 7680 + }, + { + "epoch": 0.9231231231231232, + "grad_norm": 0.5353251695632935, + "learning_rate": 7.840128231686727e-05, + "loss": 0.2802, + "step": 7685 + }, + { + "epoch": 0.9237237237237237, + "grad_norm": 0.5931753516197205, + "learning_rate": 7.837539516074448e-05, + "loss": 0.2698, + "step": 7690 + }, + { + "epoch": 0.9243243243243243, + "grad_norm": 0.4375219941139221, + "learning_rate": 7.83494967800181e-05, + "loss": 0.2491, + "step": 7695 + }, + { + "epoch": 0.924924924924925, + "grad_norm": 0.4502767324447632, + "learning_rate": 7.83235871849329e-05, + "loss": 0.2608, + "step": 7700 + }, + { + "epoch": 
0.9255255255255255, + "grad_norm": 0.47270476818084717, + "learning_rate": 7.829766638573805e-05, + "loss": 0.2882, + "step": 7705 + }, + { + "epoch": 0.9261261261261261, + "grad_norm": 0.47821709513664246, + "learning_rate": 7.827173439268723e-05, + "loss": 0.3019, + "step": 7710 + }, + { + "epoch": 0.9267267267267267, + "grad_norm": 0.543599009513855, + "learning_rate": 7.824579121603843e-05, + "loss": 0.29, + "step": 7715 + }, + { + "epoch": 0.9273273273273274, + "grad_norm": 0.5124878883361816, + "learning_rate": 7.821983686605416e-05, + "loss": 0.2691, + "step": 7720 + }, + { + "epoch": 0.9279279279279279, + "grad_norm": 0.4807772934436798, + "learning_rate": 7.819387135300134e-05, + "loss": 0.2558, + "step": 7725 + }, + { + "epoch": 0.9285285285285285, + "grad_norm": 0.4797044098377228, + "learning_rate": 7.816789468715124e-05, + "loss": 0.2526, + "step": 7730 + }, + { + "epoch": 0.9291291291291291, + "grad_norm": 0.4510939419269562, + "learning_rate": 7.81419068787796e-05, + "loss": 0.283, + "step": 7735 + }, + { + "epoch": 0.9297297297297298, + "grad_norm": 0.4945512115955353, + "learning_rate": 7.811590793816658e-05, + "loss": 0.2376, + "step": 7740 + }, + { + "epoch": 0.9303303303303303, + "grad_norm": 0.5905566215515137, + "learning_rate": 7.808989787559668e-05, + "loss": 0.2849, + "step": 7745 + }, + { + "epoch": 0.9309309309309309, + "grad_norm": 0.578352689743042, + "learning_rate": 7.806387670135886e-05, + "loss": 0.2847, + "step": 7750 + }, + { + "epoch": 0.9309309309309309, + "eval_loss": 0.23058752715587616, + "eval_runtime": 35.5677, + "eval_samples_per_second": 22.492, + "eval_steps_per_second": 5.623, + "step": 7750 + }, + { + "epoch": 0.9315315315315316, + "grad_norm": 0.47081324458122253, + "learning_rate": 7.803784442574646e-05, + "loss": 0.2275, + "step": 7755 + }, + { + "epoch": 0.9321321321321321, + "grad_norm": 0.4713752567768097, + "learning_rate": 7.801180105905716e-05, + "loss": 0.2742, + "step": 7760 + }, + { + "epoch": 0.9327327327327327, + "grad_norm": 0.5127248764038086, + "learning_rate": 7.798574661159313e-05, + "loss": 0.2729, + "step": 7765 + }, + { + "epoch": 0.9333333333333333, + "grad_norm": 0.48777830600738525, + "learning_rate": 7.79596810936608e-05, + "loss": 0.2683, + "step": 7770 + }, + { + "epoch": 0.933933933933934, + "grad_norm": 0.5385095477104187, + "learning_rate": 7.79336045155711e-05, + "loss": 0.2452, + "step": 7775 + }, + { + "epoch": 0.9345345345345345, + "grad_norm": 0.4768882691860199, + "learning_rate": 7.790751688763926e-05, + "loss": 0.2588, + "step": 7780 + }, + { + "epoch": 0.9351351351351351, + "grad_norm": 0.451249361038208, + "learning_rate": 7.788141822018488e-05, + "loss": 0.2491, + "step": 7785 + }, + { + "epoch": 0.9357357357357358, + "grad_norm": 0.5339096784591675, + "learning_rate": 7.785530852353196e-05, + "loss": 0.2273, + "step": 7790 + }, + { + "epoch": 0.9363363363363363, + "grad_norm": 0.5050491690635681, + "learning_rate": 7.782918780800885e-05, + "loss": 0.2476, + "step": 7795 + }, + { + "epoch": 0.9369369369369369, + "grad_norm": 0.626227080821991, + "learning_rate": 7.780305608394828e-05, + "loss": 0.2683, + "step": 7800 + }, + { + "epoch": 0.9375375375375375, + "grad_norm": 0.5088838338851929, + "learning_rate": 7.777691336168728e-05, + "loss": 0.2779, + "step": 7805 + }, + { + "epoch": 0.9381381381381382, + "grad_norm": 0.4789465367794037, + "learning_rate": 7.775075965156726e-05, + "loss": 0.2472, + "step": 7810 + }, + { + "epoch": 0.9387387387387387, + "grad_norm": 0.478678822517395, + "learning_rate": 
7.772459496393401e-05, + "loss": 0.2946, + "step": 7815 + }, + { + "epoch": 0.9393393393393393, + "grad_norm": 0.4863022565841675, + "learning_rate": 7.769841930913761e-05, + "loss": 0.2454, + "step": 7820 + }, + { + "epoch": 0.93993993993994, + "grad_norm": 0.5988958477973938, + "learning_rate": 7.767223269753253e-05, + "loss": 0.2678, + "step": 7825 + }, + { + "epoch": 0.9405405405405406, + "grad_norm": 0.6609881520271301, + "learning_rate": 7.76460351394775e-05, + "loss": 0.2362, + "step": 7830 + }, + { + "epoch": 0.9411411411411411, + "grad_norm": 0.4918898344039917, + "learning_rate": 7.761982664533567e-05, + "loss": 0.252, + "step": 7835 + }, + { + "epoch": 0.9417417417417417, + "grad_norm": 0.43102115392684937, + "learning_rate": 7.759360722547443e-05, + "loss": 0.2719, + "step": 7840 + }, + { + "epoch": 0.9423423423423424, + "grad_norm": 0.4987165331840515, + "learning_rate": 7.756737689026556e-05, + "loss": 0.2938, + "step": 7845 + }, + { + "epoch": 0.9429429429429429, + "grad_norm": 0.4932297170162201, + "learning_rate": 7.754113565008513e-05, + "loss": 0.2675, + "step": 7850 + }, + { + "epoch": 0.9435435435435435, + "grad_norm": 0.6924968957901001, + "learning_rate": 7.751488351531351e-05, + "loss": 0.2544, + "step": 7855 + }, + { + "epoch": 0.9441441441441442, + "grad_norm": 0.5172476172447205, + "learning_rate": 7.748862049633541e-05, + "loss": 0.2303, + "step": 7860 + }, + { + "epoch": 0.9447447447447448, + "grad_norm": 0.3794347047805786, + "learning_rate": 7.746234660353981e-05, + "loss": 0.2187, + "step": 7865 + }, + { + "epoch": 0.9453453453453453, + "grad_norm": 0.513824462890625, + "learning_rate": 7.743606184732004e-05, + "loss": 0.2549, + "step": 7870 + }, + { + "epoch": 0.9459459459459459, + "grad_norm": 0.4459366798400879, + "learning_rate": 7.740976623807367e-05, + "loss": 0.2298, + "step": 7875 + }, + { + "epoch": 0.9465465465465466, + "grad_norm": 0.49473559856414795, + "learning_rate": 7.73834597862026e-05, + "loss": 0.2406, + "step": 7880 + }, + { + "epoch": 0.9471471471471471, + "grad_norm": 0.4191824197769165, + "learning_rate": 7.735714250211304e-05, + "loss": 0.2586, + "step": 7885 + }, + { + "epoch": 0.9477477477477477, + "grad_norm": 0.47258394956588745, + "learning_rate": 7.733081439621542e-05, + "loss": 0.2461, + "step": 7890 + }, + { + "epoch": 0.9483483483483484, + "grad_norm": 0.5496533513069153, + "learning_rate": 7.73044754789245e-05, + "loss": 0.2664, + "step": 7895 + }, + { + "epoch": 0.948948948948949, + "grad_norm": 0.48711472749710083, + "learning_rate": 7.727812576065929e-05, + "loss": 0.2677, + "step": 7900 + }, + { + "epoch": 0.9495495495495495, + "grad_norm": 0.46601200103759766, + "learning_rate": 7.72517652518431e-05, + "loss": 0.275, + "step": 7905 + }, + { + "epoch": 0.9501501501501501, + "grad_norm": 0.43569445610046387, + "learning_rate": 7.722539396290349e-05, + "loss": 0.2428, + "step": 7910 + }, + { + "epoch": 0.9507507507507508, + "grad_norm": 0.5569596886634827, + "learning_rate": 7.71990119042723e-05, + "loss": 0.2315, + "step": 7915 + }, + { + "epoch": 0.9513513513513514, + "grad_norm": 0.4711536169052124, + "learning_rate": 7.717261908638562e-05, + "loss": 0.238, + "step": 7920 + }, + { + "epoch": 0.9519519519519519, + "grad_norm": 0.5207387804985046, + "learning_rate": 7.714621551968375e-05, + "loss": 0.2586, + "step": 7925 + }, + { + "epoch": 0.9525525525525526, + "grad_norm": 0.419005423784256, + "learning_rate": 7.711980121461136e-05, + "loss": 0.2645, + "step": 7930 + }, + { + "epoch": 0.9531531531531532, + "grad_norm": 
0.5372774600982666, + "learning_rate": 7.709337618161723e-05, + "loss": 0.239, + "step": 7935 + }, + { + "epoch": 0.9537537537537537, + "grad_norm": 0.470059335231781, + "learning_rate": 7.706694043115448e-05, + "loss": 0.2689, + "step": 7940 + }, + { + "epoch": 0.9543543543543543, + "grad_norm": 0.4631572365760803, + "learning_rate": 7.704049397368045e-05, + "loss": 0.2763, + "step": 7945 + }, + { + "epoch": 0.954954954954955, + "grad_norm": 0.4009503126144409, + "learning_rate": 7.701403681965666e-05, + "loss": 0.25, + "step": 7950 + }, + { + "epoch": 0.9555555555555556, + "grad_norm": 0.6539427638053894, + "learning_rate": 7.698756897954897e-05, + "loss": 0.2336, + "step": 7955 + }, + { + "epoch": 0.9561561561561561, + "grad_norm": 0.5389593839645386, + "learning_rate": 7.696109046382733e-05, + "loss": 0.2372, + "step": 7960 + }, + { + "epoch": 0.9567567567567568, + "grad_norm": 0.5966012477874756, + "learning_rate": 7.693460128296604e-05, + "loss": 0.2446, + "step": 7965 + }, + { + "epoch": 0.9573573573573574, + "grad_norm": 0.5658740401268005, + "learning_rate": 7.690810144744352e-05, + "loss": 0.2367, + "step": 7970 + }, + { + "epoch": 0.9579579579579579, + "grad_norm": 0.4769929051399231, + "learning_rate": 7.688159096774247e-05, + "loss": 0.2536, + "step": 7975 + }, + { + "epoch": 0.9585585585585585, + "grad_norm": 0.5098903179168701, + "learning_rate": 7.685506985434981e-05, + "loss": 0.2823, + "step": 7980 + }, + { + "epoch": 0.9591591591591592, + "grad_norm": 0.4759390354156494, + "learning_rate": 7.682853811775659e-05, + "loss": 0.2797, + "step": 7985 + }, + { + "epoch": 0.9597597597597598, + "grad_norm": 0.5201223492622375, + "learning_rate": 7.680199576845813e-05, + "loss": 0.2312, + "step": 7990 + }, + { + "epoch": 0.9603603603603603, + "grad_norm": 0.4938308000564575, + "learning_rate": 7.677544281695392e-05, + "loss": 0.2334, + "step": 7995 + }, + { + "epoch": 0.960960960960961, + "grad_norm": 0.5370311141014099, + "learning_rate": 7.674887927374765e-05, + "loss": 0.2612, + "step": 8000 + }, + { + "epoch": 0.960960960960961, + "eval_loss": 0.2181350737810135, + "eval_runtime": 35.7024, + "eval_samples_per_second": 22.407, + "eval_steps_per_second": 5.602, + "step": 8000 + }, + { + "epoch": 0.9615615615615616, + "grad_norm": 0.5911093950271606, + "learning_rate": 7.67223051493472e-05, + "loss": 0.2559, + "step": 8005 + }, + { + "epoch": 0.9621621621621622, + "grad_norm": 0.5361072421073914, + "learning_rate": 7.669572045426463e-05, + "loss": 0.2747, + "step": 8010 + }, + { + "epoch": 0.9627627627627627, + "grad_norm": 0.4973791241645813, + "learning_rate": 7.66691251990162e-05, + "loss": 0.2293, + "step": 8015 + }, + { + "epoch": 0.9633633633633634, + "grad_norm": 0.5274470448493958, + "learning_rate": 7.664251939412232e-05, + "loss": 0.2258, + "step": 8020 + }, + { + "epoch": 0.963963963963964, + "grad_norm": 0.5574861168861389, + "learning_rate": 7.661590305010759e-05, + "loss": 0.2704, + "step": 8025 + }, + { + "epoch": 0.9645645645645645, + "grad_norm": 0.5257890224456787, + "learning_rate": 7.65892761775008e-05, + "loss": 0.2243, + "step": 8030 + }, + { + "epoch": 0.9651651651651652, + "grad_norm": 0.5335781574249268, + "learning_rate": 7.656263878683485e-05, + "loss": 0.2834, + "step": 8035 + }, + { + "epoch": 0.9657657657657658, + "grad_norm": 0.5913284420967102, + "learning_rate": 7.653599088864685e-05, + "loss": 0.2434, + "step": 8040 + }, + { + "epoch": 0.9663663663663664, + "grad_norm": 0.4447031617164612, + "learning_rate": 7.650933249347803e-05, + "loss": 
0.2221, + "step": 8045 + }, + { + "epoch": 0.9669669669669669, + "grad_norm": 0.4803239703178406, + "learning_rate": 7.648266361187382e-05, + "loss": 0.2511, + "step": 8050 + }, + { + "epoch": 0.9675675675675676, + "grad_norm": 0.4980545938014984, + "learning_rate": 7.645598425438374e-05, + "loss": 0.2269, + "step": 8055 + }, + { + "epoch": 0.9681681681681682, + "grad_norm": 0.6442884206771851, + "learning_rate": 7.64292944315615e-05, + "loss": 0.2651, + "step": 8060 + }, + { + "epoch": 0.9687687687687687, + "grad_norm": 0.5565935969352722, + "learning_rate": 7.640259415396494e-05, + "loss": 0.2496, + "step": 8065 + }, + { + "epoch": 0.9693693693693693, + "grad_norm": 0.5555434823036194, + "learning_rate": 7.6375883432156e-05, + "loss": 0.2471, + "step": 8070 + }, + { + "epoch": 0.96996996996997, + "grad_norm": 0.5130464434623718, + "learning_rate": 7.634916227670081e-05, + "loss": 0.2215, + "step": 8075 + }, + { + "epoch": 0.9705705705705706, + "grad_norm": 0.4054581820964813, + "learning_rate": 7.632243069816957e-05, + "loss": 0.2236, + "step": 8080 + }, + { + "epoch": 0.9711711711711711, + "grad_norm": 0.628891110420227, + "learning_rate": 7.629568870713668e-05, + "loss": 0.2275, + "step": 8085 + }, + { + "epoch": 0.9717717717717718, + "grad_norm": 0.5326948761940002, + "learning_rate": 7.626893631418055e-05, + "loss": 0.2417, + "step": 8090 + }, + { + "epoch": 0.9723723723723724, + "grad_norm": 0.5320354700088501, + "learning_rate": 7.624217352988379e-05, + "loss": 0.2291, + "step": 8095 + }, + { + "epoch": 0.972972972972973, + "grad_norm": 0.49078038334846497, + "learning_rate": 7.621540036483308e-05, + "loss": 0.2395, + "step": 8100 + }, + { + "epoch": 0.9735735735735735, + "grad_norm": 0.5511507391929626, + "learning_rate": 7.618861682961927e-05, + "loss": 0.2246, + "step": 8105 + }, + { + "epoch": 0.9741741741741742, + "grad_norm": 0.5163986086845398, + "learning_rate": 7.61618229348372e-05, + "loss": 0.2337, + "step": 8110 + }, + { + "epoch": 0.9747747747747748, + "grad_norm": 0.5130642056465149, + "learning_rate": 7.613501869108589e-05, + "loss": 0.2465, + "step": 8115 + }, + { + "epoch": 0.9753753753753753, + "grad_norm": 0.5894377827644348, + "learning_rate": 7.610820410896847e-05, + "loss": 0.2954, + "step": 8120 + }, + { + "epoch": 0.975975975975976, + "grad_norm": 0.543550968170166, + "learning_rate": 7.608137919909208e-05, + "loss": 0.2444, + "step": 8125 + }, + { + "epoch": 0.9765765765765766, + "grad_norm": 0.48070836067199707, + "learning_rate": 7.605454397206802e-05, + "loss": 0.2234, + "step": 8130 + }, + { + "epoch": 0.9771771771771772, + "grad_norm": 0.6094707250595093, + "learning_rate": 7.602769843851163e-05, + "loss": 0.2473, + "step": 8135 + }, + { + "epoch": 0.9777777777777777, + "grad_norm": 0.5057632327079773, + "learning_rate": 7.600084260904232e-05, + "loss": 0.2432, + "step": 8140 + }, + { + "epoch": 0.9783783783783784, + "grad_norm": 0.6155203580856323, + "learning_rate": 7.597397649428364e-05, + "loss": 0.2365, + "step": 8145 + }, + { + "epoch": 0.978978978978979, + "grad_norm": 0.4686107635498047, + "learning_rate": 7.59471001048631e-05, + "loss": 0.2335, + "step": 8150 + }, + { + "epoch": 0.9795795795795795, + "grad_norm": 0.5839318633079529, + "learning_rate": 7.592021345141238e-05, + "loss": 0.2315, + "step": 8155 + }, + { + "epoch": 0.9801801801801802, + "grad_norm": 0.45957908034324646, + "learning_rate": 7.589331654456716e-05, + "loss": 0.216, + "step": 8160 + }, + { + "epoch": 0.9807807807807808, + "grad_norm": 0.4842189848423004, + 
"learning_rate": 7.586640939496717e-05, + "loss": 0.2067, + "step": 8165 + }, + { + "epoch": 0.9813813813813814, + "grad_norm": 0.5004525184631348, + "learning_rate": 7.583949201325623e-05, + "loss": 0.2249, + "step": 8170 + }, + { + "epoch": 0.9819819819819819, + "grad_norm": 0.5188431143760681, + "learning_rate": 7.58125644100822e-05, + "loss": 0.2414, + "step": 8175 + }, + { + "epoch": 0.9825825825825826, + "grad_norm": 0.5917747020721436, + "learning_rate": 7.578562659609696e-05, + "loss": 0.2738, + "step": 8180 + }, + { + "epoch": 0.9831831831831832, + "grad_norm": 0.48418867588043213, + "learning_rate": 7.575867858195644e-05, + "loss": 0.2446, + "step": 8185 + }, + { + "epoch": 0.9837837837837838, + "grad_norm": 0.5367460250854492, + "learning_rate": 7.573172037832062e-05, + "loss": 0.2584, + "step": 8190 + }, + { + "epoch": 0.9843843843843844, + "grad_norm": 0.5259852409362793, + "learning_rate": 7.570475199585348e-05, + "loss": 0.247, + "step": 8195 + }, + { + "epoch": 0.984984984984985, + "grad_norm": 0.6347830295562744, + "learning_rate": 7.567777344522304e-05, + "loss": 0.228, + "step": 8200 + }, + { + "epoch": 0.9855855855855856, + "grad_norm": 0.40549105405807495, + "learning_rate": 7.565078473710137e-05, + "loss": 0.2263, + "step": 8205 + }, + { + "epoch": 0.9861861861861861, + "grad_norm": 0.48162418603897095, + "learning_rate": 7.562378588216454e-05, + "loss": 0.2353, + "step": 8210 + }, + { + "epoch": 0.9867867867867868, + "grad_norm": 0.5109835863113403, + "learning_rate": 7.55967768910926e-05, + "loss": 0.2349, + "step": 8215 + }, + { + "epoch": 0.9873873873873874, + "grad_norm": 0.37698864936828613, + "learning_rate": 7.556975777456962e-05, + "loss": 0.2165, + "step": 8220 + }, + { + "epoch": 0.987987987987988, + "grad_norm": 0.5112248063087463, + "learning_rate": 7.554272854328377e-05, + "loss": 0.2562, + "step": 8225 + }, + { + "epoch": 0.9885885885885886, + "grad_norm": 0.44548270106315613, + "learning_rate": 7.551568920792708e-05, + "loss": 0.2314, + "step": 8230 + }, + { + "epoch": 0.9891891891891892, + "grad_norm": 0.44484102725982666, + "learning_rate": 7.548863977919567e-05, + "loss": 0.213, + "step": 8235 + }, + { + "epoch": 0.9897897897897898, + "grad_norm": 0.48750653862953186, + "learning_rate": 7.54615802677896e-05, + "loss": 0.2283, + "step": 8240 + }, + { + "epoch": 0.9903903903903903, + "grad_norm": 0.5573891997337341, + "learning_rate": 7.543451068441297e-05, + "loss": 0.27, + "step": 8245 + }, + { + "epoch": 0.990990990990991, + "grad_norm": 0.5790622234344482, + "learning_rate": 7.540743103977378e-05, + "loss": 0.2519, + "step": 8250 + }, + { + "epoch": 0.990990990990991, + "eval_loss": 0.20414112508296967, + "eval_runtime": 35.4459, + "eval_samples_per_second": 22.57, + "eval_steps_per_second": 5.642, + "step": 8250 + }, + { + "epoch": 0.9915915915915916, + "grad_norm": 0.5384604930877686, + "learning_rate": 7.538034134458414e-05, + "loss": 0.2538, + "step": 8255 + }, + { + "epoch": 0.9921921921921922, + "grad_norm": 0.47747641801834106, + "learning_rate": 7.535324160956003e-05, + "loss": 0.2567, + "step": 8260 + }, + { + "epoch": 0.9927927927927928, + "grad_norm": 0.576480507850647, + "learning_rate": 7.532613184542144e-05, + "loss": 0.2501, + "step": 8265 + }, + { + "epoch": 0.9933933933933934, + "grad_norm": 0.693172812461853, + "learning_rate": 7.529901206289231e-05, + "loss": 0.2771, + "step": 8270 + }, + { + "epoch": 0.993993993993994, + "grad_norm": 0.5924530625343323, + "learning_rate": 7.527188227270057e-05, + "loss": 0.2371, + "step": 8275 
+ }, + { + "epoch": 0.9945945945945946, + "grad_norm": 0.5856573581695557, + "learning_rate": 7.524474248557809e-05, + "loss": 0.2362, + "step": 8280 + }, + { + "epoch": 0.9951951951951952, + "grad_norm": 0.5145263671875, + "learning_rate": 7.521759271226068e-05, + "loss": 0.2495, + "step": 8285 + }, + { + "epoch": 0.9957957957957958, + "grad_norm": 0.6087602972984314, + "learning_rate": 7.519043296348813e-05, + "loss": 0.2678, + "step": 8290 + }, + { + "epoch": 0.9963963963963964, + "grad_norm": 0.45782071352005005, + "learning_rate": 7.516326325000418e-05, + "loss": 0.2515, + "step": 8295 + }, + { + "epoch": 0.996996996996997, + "grad_norm": 0.5238723158836365, + "learning_rate": 7.513608358255646e-05, + "loss": 0.2662, + "step": 8300 + }, + { + "epoch": 0.9975975975975976, + "grad_norm": 0.5170171856880188, + "learning_rate": 7.51088939718966e-05, + "loss": 0.2762, + "step": 8305 + }, + { + "epoch": 0.9981981981981982, + "grad_norm": 0.5645842552185059, + "learning_rate": 7.508169442878013e-05, + "loss": 0.2756, + "step": 8310 + }, + { + "epoch": 0.9987987987987988, + "grad_norm": 0.47060051560401917, + "learning_rate": 7.505448496396652e-05, + "loss": 0.2583, + "step": 8315 + }, + { + "epoch": 0.9993993993993994, + "grad_norm": 0.5723556876182556, + "learning_rate": 7.502726558821915e-05, + "loss": 0.2484, + "step": 8320 + }, + { + "epoch": 1.0, + "grad_norm": 0.5563798546791077, + "learning_rate": 7.500003631230534e-05, + "loss": 0.2431, + "step": 8325 + }, + { + "epoch": 1.0006006006006005, + "grad_norm": 0.48347780108451843, + "learning_rate": 7.497279714699632e-05, + "loss": 0.1432, + "step": 8330 + }, + { + "epoch": 1.0012012012012013, + "grad_norm": 0.4860369861125946, + "learning_rate": 7.494554810306721e-05, + "loss": 0.1562, + "step": 8335 + }, + { + "epoch": 1.0018018018018018, + "grad_norm": 0.5238087773323059, + "learning_rate": 7.491828919129709e-05, + "loss": 0.1624, + "step": 8340 + }, + { + "epoch": 1.0024024024024023, + "grad_norm": 0.5139604806900024, + "learning_rate": 7.489102042246888e-05, + "loss": 0.157, + "step": 8345 + }, + { + "epoch": 1.003003003003003, + "grad_norm": 0.5299177169799805, + "learning_rate": 7.486374180736944e-05, + "loss": 0.1704, + "step": 8350 + }, + { + "epoch": 1.0036036036036036, + "grad_norm": 0.4978356957435608, + "learning_rate": 7.48364533567895e-05, + "loss": 0.1538, + "step": 8355 + }, + { + "epoch": 1.0042042042042043, + "grad_norm": 0.4236242175102234, + "learning_rate": 7.480915508152372e-05, + "loss": 0.1607, + "step": 8360 + }, + { + "epoch": 1.0048048048048048, + "grad_norm": 0.3946767747402191, + "learning_rate": 7.478184699237061e-05, + "loss": 0.1365, + "step": 8365 + }, + { + "epoch": 1.0054054054054054, + "grad_norm": 0.47756099700927734, + "learning_rate": 7.475452910013259e-05, + "loss": 0.1566, + "step": 8370 + }, + { + "epoch": 1.006006006006006, + "grad_norm": 0.4343335032463074, + "learning_rate": 7.472720141561591e-05, + "loss": 0.1509, + "step": 8375 + }, + { + "epoch": 1.0066066066066066, + "grad_norm": 0.4657896161079407, + "learning_rate": 7.469986394963076e-05, + "loss": 0.1595, + "step": 8380 + }, + { + "epoch": 1.0072072072072071, + "grad_norm": 0.5368801355361938, + "learning_rate": 7.467251671299113e-05, + "loss": 0.1648, + "step": 8385 + }, + { + "epoch": 1.0078078078078079, + "grad_norm": 0.5660285353660583, + "learning_rate": 7.464515971651493e-05, + "loss": 0.1529, + "step": 8390 + }, + { + "epoch": 1.0084084084084084, + "grad_norm": 0.4559033513069153, + "learning_rate": 7.461779297102391e-05, + 
"loss": 0.1359, + "step": 8395 + }, + { + "epoch": 1.009009009009009, + "grad_norm": 0.48404526710510254, + "learning_rate": 7.459041648734368e-05, + "loss": 0.1593, + "step": 8400 + }, + { + "epoch": 1.0096096096096097, + "grad_norm": 0.4860776662826538, + "learning_rate": 7.456303027630366e-05, + "loss": 0.1693, + "step": 8405 + }, + { + "epoch": 1.0102102102102102, + "grad_norm": 0.5458460450172424, + "learning_rate": 7.453563434873722e-05, + "loss": 0.1479, + "step": 8410 + }, + { + "epoch": 1.0108108108108107, + "grad_norm": 0.5686392188072205, + "learning_rate": 7.450822871548148e-05, + "loss": 0.1694, + "step": 8415 + }, + { + "epoch": 1.0114114114114114, + "grad_norm": 0.5729146003723145, + "learning_rate": 7.448081338737742e-05, + "loss": 0.165, + "step": 8420 + }, + { + "epoch": 1.012012012012012, + "grad_norm": 0.5049359798431396, + "learning_rate": 7.445338837526988e-05, + "loss": 0.1461, + "step": 8425 + }, + { + "epoch": 1.0126126126126127, + "grad_norm": 0.4519275724887848, + "learning_rate": 7.442595369000749e-05, + "loss": 0.1336, + "step": 8430 + }, + { + "epoch": 1.0132132132132132, + "grad_norm": 0.48683297634124756, + "learning_rate": 7.439850934244279e-05, + "loss": 0.1429, + "step": 8435 + }, + { + "epoch": 1.0138138138138137, + "grad_norm": 0.4096581041812897, + "learning_rate": 7.437105534343202e-05, + "loss": 0.1646, + "step": 8440 + }, + { + "epoch": 1.0144144144144145, + "grad_norm": 0.4254092872142792, + "learning_rate": 7.434359170383533e-05, + "loss": 0.1389, + "step": 8445 + }, + { + "epoch": 1.015015015015015, + "grad_norm": 0.4360242784023285, + "learning_rate": 7.431611843451664e-05, + "loss": 0.1376, + "step": 8450 + }, + { + "epoch": 1.0156156156156155, + "grad_norm": 0.5856005549430847, + "learning_rate": 7.428863554634373e-05, + "loss": 0.1516, + "step": 8455 + }, + { + "epoch": 1.0162162162162163, + "grad_norm": 0.507146954536438, + "learning_rate": 7.426114305018812e-05, + "loss": 0.1512, + "step": 8460 + }, + { + "epoch": 1.0168168168168168, + "grad_norm": 0.5587776303291321, + "learning_rate": 7.423364095692518e-05, + "loss": 0.1592, + "step": 8465 + }, + { + "epoch": 1.0174174174174173, + "grad_norm": 0.5522329807281494, + "learning_rate": 7.420612927743404e-05, + "loss": 0.1397, + "step": 8470 + }, + { + "epoch": 1.018018018018018, + "grad_norm": 0.5756586790084839, + "learning_rate": 7.417860802259764e-05, + "loss": 0.1651, + "step": 8475 + }, + { + "epoch": 1.0186186186186186, + "grad_norm": 0.5764936804771423, + "learning_rate": 7.415107720330273e-05, + "loss": 0.1661, + "step": 8480 + }, + { + "epoch": 1.0192192192192193, + "grad_norm": 0.5376463532447815, + "learning_rate": 7.412353683043978e-05, + "loss": 0.1595, + "step": 8485 + }, + { + "epoch": 1.0198198198198198, + "grad_norm": 0.4412538409233093, + "learning_rate": 7.40959869149031e-05, + "loss": 0.1412, + "step": 8490 + }, + { + "epoch": 1.0204204204204204, + "grad_norm": 0.502080500125885, + "learning_rate": 7.406842746759077e-05, + "loss": 0.16, + "step": 8495 + }, + { + "epoch": 1.021021021021021, + "grad_norm": 0.594376802444458, + "learning_rate": 7.404085849940461e-05, + "loss": 0.159, + "step": 8500 + }, + { + "epoch": 1.021021021021021, + "eval_loss": 0.19814305007457733, + "eval_runtime": 35.6951, + "eval_samples_per_second": 22.412, + "eval_steps_per_second": 5.603, + "step": 8500 + }, + { + "epoch": 1.0216216216216216, + "grad_norm": 0.44608205556869507, + "learning_rate": 7.40132800212502e-05, + "loss": 0.152, + "step": 8505 + }, + { + "epoch": 1.0222222222222221, + 
"grad_norm": 0.4485919177532196, + "learning_rate": 7.398569204403696e-05, + "loss": 0.1393, + "step": 8510 + }, + { + "epoch": 1.0228228228228229, + "grad_norm": 0.4802648425102234, + "learning_rate": 7.395809457867795e-05, + "loss": 0.1436, + "step": 8515 + }, + { + "epoch": 1.0234234234234234, + "grad_norm": 0.45354655385017395, + "learning_rate": 7.39304876360901e-05, + "loss": 0.1389, + "step": 8520 + }, + { + "epoch": 1.024024024024024, + "grad_norm": 0.5193431973457336, + "learning_rate": 7.390287122719397e-05, + "loss": 0.1538, + "step": 8525 + }, + { + "epoch": 1.0246246246246247, + "grad_norm": 0.5162495374679565, + "learning_rate": 7.387524536291397e-05, + "loss": 0.1535, + "step": 8530 + }, + { + "epoch": 1.0252252252252252, + "grad_norm": 0.4876277446746826, + "learning_rate": 7.38476100541782e-05, + "loss": 0.1423, + "step": 8535 + }, + { + "epoch": 1.025825825825826, + "grad_norm": 0.5237493515014648, + "learning_rate": 7.38199653119185e-05, + "loss": 0.1544, + "step": 8540 + }, + { + "epoch": 1.0264264264264265, + "grad_norm": 0.49636128544807434, + "learning_rate": 7.379231114707043e-05, + "loss": 0.1597, + "step": 8545 + }, + { + "epoch": 1.027027027027027, + "grad_norm": 0.4116309881210327, + "learning_rate": 7.376464757057333e-05, + "loss": 0.1372, + "step": 8550 + }, + { + "epoch": 1.0276276276276277, + "grad_norm": 0.5808112025260925, + "learning_rate": 7.373697459337019e-05, + "loss": 0.1293, + "step": 8555 + }, + { + "epoch": 1.0282282282282282, + "grad_norm": 0.4400934875011444, + "learning_rate": 7.37092922264078e-05, + "loss": 0.1223, + "step": 8560 + }, + { + "epoch": 1.0288288288288288, + "grad_norm": 0.449886292219162, + "learning_rate": 7.368160048063654e-05, + "loss": 0.1318, + "step": 8565 + }, + { + "epoch": 1.0294294294294295, + "grad_norm": 0.5196372270584106, + "learning_rate": 7.365389936701066e-05, + "loss": 0.1558, + "step": 8570 + }, + { + "epoch": 1.03003003003003, + "grad_norm": 0.4798905551433563, + "learning_rate": 7.3626188896488e-05, + "loss": 0.1477, + "step": 8575 + }, + { + "epoch": 1.0306306306306305, + "grad_norm": 0.3740388751029968, + "learning_rate": 7.359846908003012e-05, + "loss": 0.1304, + "step": 8580 + }, + { + "epoch": 1.0312312312312313, + "grad_norm": 0.43643224239349365, + "learning_rate": 7.357073992860233e-05, + "loss": 0.1254, + "step": 8585 + }, + { + "epoch": 1.0318318318318318, + "grad_norm": 0.6022786498069763, + "learning_rate": 7.354300145317356e-05, + "loss": 0.1496, + "step": 8590 + }, + { + "epoch": 1.0324324324324325, + "grad_norm": 0.5783417820930481, + "learning_rate": 7.351525366471647e-05, + "loss": 0.1605, + "step": 8595 + }, + { + "epoch": 1.033033033033033, + "grad_norm": 0.3740118741989136, + "learning_rate": 7.348749657420744e-05, + "loss": 0.1435, + "step": 8600 + }, + { + "epoch": 1.0336336336336336, + "grad_norm": 0.5092105865478516, + "learning_rate": 7.345973019262645e-05, + "loss": 0.156, + "step": 8605 + }, + { + "epoch": 1.0342342342342343, + "grad_norm": 0.465665727853775, + "learning_rate": 7.343195453095719e-05, + "loss": 0.1636, + "step": 8610 + }, + { + "epoch": 1.0348348348348348, + "grad_norm": 0.6344625949859619, + "learning_rate": 7.340416960018701e-05, + "loss": 0.1466, + "step": 8615 + }, + { + "epoch": 1.0354354354354354, + "grad_norm": 0.5152485370635986, + "learning_rate": 7.337637541130699e-05, + "loss": 0.1453, + "step": 8620 + }, + { + "epoch": 1.0360360360360361, + "grad_norm": 0.4373214840888977, + "learning_rate": 7.334857197531178e-05, + "loss": 0.1586, + "step": 8625 + }, + { 
+ "epoch": 1.0366366366366366, + "grad_norm": 0.5414717197418213, + "learning_rate": 7.332075930319974e-05, + "loss": 0.1503, + "step": 8630 + }, + { + "epoch": 1.0372372372372372, + "grad_norm": 0.4952808618545532, + "learning_rate": 7.329293740597289e-05, + "loss": 0.1505, + "step": 8635 + }, + { + "epoch": 1.037837837837838, + "grad_norm": 0.549498975276947, + "learning_rate": 7.326510629463688e-05, + "loss": 0.1407, + "step": 8640 + }, + { + "epoch": 1.0384384384384384, + "grad_norm": 0.5551578402519226, + "learning_rate": 7.3237265980201e-05, + "loss": 0.1615, + "step": 8645 + }, + { + "epoch": 1.039039039039039, + "grad_norm": 0.43350839614868164, + "learning_rate": 7.320941647367819e-05, + "loss": 0.1452, + "step": 8650 + }, + { + "epoch": 1.0396396396396397, + "grad_norm": 0.5714775323867798, + "learning_rate": 7.318155778608502e-05, + "loss": 0.1659, + "step": 8655 + }, + { + "epoch": 1.0402402402402402, + "grad_norm": 0.49592894315719604, + "learning_rate": 7.31536899284417e-05, + "loss": 0.142, + "step": 8660 + }, + { + "epoch": 1.040840840840841, + "grad_norm": 0.491641640663147, + "learning_rate": 7.312581291177208e-05, + "loss": 0.1373, + "step": 8665 + }, + { + "epoch": 1.0414414414414415, + "grad_norm": 0.5000977516174316, + "learning_rate": 7.30979267471036e-05, + "loss": 0.1453, + "step": 8670 + }, + { + "epoch": 1.042042042042042, + "grad_norm": 0.5157973766326904, + "learning_rate": 7.307003144546733e-05, + "loss": 0.1369, + "step": 8675 + }, + { + "epoch": 1.0426426426426427, + "grad_norm": 0.38589340448379517, + "learning_rate": 7.304212701789797e-05, + "loss": 0.1372, + "step": 8680 + }, + { + "epoch": 1.0432432432432432, + "grad_norm": 0.4763352572917938, + "learning_rate": 7.301421347543384e-05, + "loss": 0.1456, + "step": 8685 + }, + { + "epoch": 1.0438438438438438, + "grad_norm": 0.424207478761673, + "learning_rate": 7.298629082911682e-05, + "loss": 0.1434, + "step": 8690 + }, + { + "epoch": 1.0444444444444445, + "grad_norm": 0.47599339485168457, + "learning_rate": 7.295835908999242e-05, + "loss": 0.1487, + "step": 8695 + }, + { + "epoch": 1.045045045045045, + "grad_norm": 0.42823123931884766, + "learning_rate": 7.293041826910976e-05, + "loss": 0.1452, + "step": 8700 + }, + { + "epoch": 1.0456456456456455, + "grad_norm": 0.6307981014251709, + "learning_rate": 7.290246837752152e-05, + "loss": 0.1358, + "step": 8705 + }, + { + "epoch": 1.0462462462462463, + "grad_norm": 0.4481140375137329, + "learning_rate": 7.287450942628397e-05, + "loss": 0.1451, + "step": 8710 + }, + { + "epoch": 1.0468468468468468, + "grad_norm": 0.5314701795578003, + "learning_rate": 7.284654142645704e-05, + "loss": 0.1526, + "step": 8715 + }, + { + "epoch": 1.0474474474474476, + "grad_norm": 0.44227954745292664, + "learning_rate": 7.28185643891041e-05, + "loss": 0.1455, + "step": 8720 + }, + { + "epoch": 1.048048048048048, + "grad_norm": 0.502838134765625, + "learning_rate": 7.279057832529224e-05, + "loss": 0.1398, + "step": 8725 + }, + { + "epoch": 1.0486486486486486, + "grad_norm": 0.6226637363433838, + "learning_rate": 7.2762583246092e-05, + "loss": 0.1531, + "step": 8730 + }, + { + "epoch": 1.0492492492492493, + "grad_norm": 0.452466756105423, + "learning_rate": 7.273457916257758e-05, + "loss": 0.139, + "step": 8735 + }, + { + "epoch": 1.0498498498498499, + "grad_norm": 0.4796338379383087, + "learning_rate": 7.270656608582668e-05, + "loss": 0.1551, + "step": 8740 + }, + { + "epoch": 1.0504504504504504, + "grad_norm": 0.47294482588768005, + "learning_rate": 7.26785440269206e-05, + "loss": 
0.1325, + "step": 8745 + }, + { + "epoch": 1.0510510510510511, + "grad_norm": 0.5365278124809265, + "learning_rate": 7.265051299694414e-05, + "loss": 0.1455, + "step": 8750 + }, + { + "epoch": 1.0510510510510511, + "eval_loss": 0.1942126303911209, + "eval_runtime": 35.5728, + "eval_samples_per_second": 22.489, + "eval_steps_per_second": 5.622, + "step": 8750 + }, + { + "epoch": 1.0516516516516516, + "grad_norm": 0.5005031228065491, + "learning_rate": 7.262247300698571e-05, + "loss": 0.1333, + "step": 8755 + }, + { + "epoch": 1.0522522522522522, + "grad_norm": 0.49393969774246216, + "learning_rate": 7.259442406813722e-05, + "loss": 0.1492, + "step": 8760 + }, + { + "epoch": 1.052852852852853, + "grad_norm": 0.6257902979850769, + "learning_rate": 7.256636619149413e-05, + "loss": 0.1778, + "step": 8765 + }, + { + "epoch": 1.0534534534534534, + "grad_norm": 0.46658793091773987, + "learning_rate": 7.253829938815546e-05, + "loss": 0.1312, + "step": 8770 + }, + { + "epoch": 1.054054054054054, + "grad_norm": 0.41227564215660095, + "learning_rate": 7.251022366922375e-05, + "loss": 0.1387, + "step": 8775 + }, + { + "epoch": 1.0546546546546547, + "grad_norm": 0.4338676929473877, + "learning_rate": 7.248213904580502e-05, + "loss": 0.1606, + "step": 8780 + }, + { + "epoch": 1.0552552552552552, + "grad_norm": 0.5233752131462097, + "learning_rate": 7.245404552900889e-05, + "loss": 0.1468, + "step": 8785 + }, + { + "epoch": 1.055855855855856, + "grad_norm": 0.5996452569961548, + "learning_rate": 7.242594312994843e-05, + "loss": 0.1646, + "step": 8790 + }, + { + "epoch": 1.0564564564564565, + "grad_norm": 0.5455158352851868, + "learning_rate": 7.239783185974029e-05, + "loss": 0.151, + "step": 8795 + }, + { + "epoch": 1.057057057057057, + "grad_norm": 0.46771520376205444, + "learning_rate": 7.236971172950455e-05, + "loss": 0.1611, + "step": 8800 + }, + { + "epoch": 1.0576576576576577, + "grad_norm": 0.5181732773780823, + "learning_rate": 7.234158275036487e-05, + "loss": 0.1218, + "step": 8805 + }, + { + "epoch": 1.0582582582582583, + "grad_norm": 0.5358600616455078, + "learning_rate": 7.231344493344834e-05, + "loss": 0.1763, + "step": 8810 + }, + { + "epoch": 1.0588588588588588, + "grad_norm": 0.4052846431732178, + "learning_rate": 7.228529828988563e-05, + "loss": 0.1369, + "step": 8815 + }, + { + "epoch": 1.0594594594594595, + "grad_norm": 0.4466336667537689, + "learning_rate": 7.225714283081083e-05, + "loss": 0.1711, + "step": 8820 + }, + { + "epoch": 1.06006006006006, + "grad_norm": 0.4244237542152405, + "learning_rate": 7.222897856736154e-05, + "loss": 0.1499, + "step": 8825 + }, + { + "epoch": 1.0606606606606606, + "grad_norm": 0.49825313687324524, + "learning_rate": 7.220080551067886e-05, + "loss": 0.139, + "step": 8830 + }, + { + "epoch": 1.0612612612612613, + "grad_norm": 0.5628939867019653, + "learning_rate": 7.217262367190733e-05, + "loss": 0.1466, + "step": 8835 + }, + { + "epoch": 1.0618618618618618, + "grad_norm": 0.5116236805915833, + "learning_rate": 7.214443306219501e-05, + "loss": 0.1537, + "step": 8840 + }, + { + "epoch": 1.0624624624624626, + "grad_norm": 0.4122700095176697, + "learning_rate": 7.21162336926934e-05, + "loss": 0.1237, + "step": 8845 + }, + { + "epoch": 1.063063063063063, + "grad_norm": 0.5503990650177002, + "learning_rate": 7.208802557455746e-05, + "loss": 0.1608, + "step": 8850 + }, + { + "epoch": 1.0636636636636636, + "grad_norm": 0.3909919559955597, + "learning_rate": 7.205980871894565e-05, + "loss": 0.1462, + "step": 8855 + }, + { + "epoch": 1.0642642642642643, + 
"grad_norm": 0.45528560876846313, + "learning_rate": 7.203158313701982e-05, + "loss": 0.1405, + "step": 8860 + }, + { + "epoch": 1.0648648648648649, + "grad_norm": 0.46630609035491943, + "learning_rate": 7.200334883994534e-05, + "loss": 0.1489, + "step": 8865 + }, + { + "epoch": 1.0654654654654654, + "grad_norm": 0.42296266555786133, + "learning_rate": 7.1975105838891e-05, + "loss": 0.1566, + "step": 8870 + }, + { + "epoch": 1.0660660660660661, + "grad_norm": 0.5201703906059265, + "learning_rate": 7.194685414502902e-05, + "loss": 0.1449, + "step": 8875 + }, + { + "epoch": 1.0666666666666667, + "grad_norm": 0.5390458106994629, + "learning_rate": 7.191859376953508e-05, + "loss": 0.1408, + "step": 8880 + }, + { + "epoch": 1.0672672672672672, + "grad_norm": 0.5914261937141418, + "learning_rate": 7.189032472358826e-05, + "loss": 0.1514, + "step": 8885 + }, + { + "epoch": 1.067867867867868, + "grad_norm": 0.5696099996566772, + "learning_rate": 7.186204701837114e-05, + "loss": 0.1555, + "step": 8890 + }, + { + "epoch": 1.0684684684684684, + "grad_norm": 0.45480287075042725, + "learning_rate": 7.183376066506964e-05, + "loss": 0.1434, + "step": 8895 + }, + { + "epoch": 1.069069069069069, + "grad_norm": 0.4600028693675995, + "learning_rate": 7.180546567487317e-05, + "loss": 0.1374, + "step": 8900 + }, + { + "epoch": 1.0696696696696697, + "grad_norm": 0.8249819278717041, + "learning_rate": 7.177716205897449e-05, + "loss": 0.1533, + "step": 8905 + }, + { + "epoch": 1.0702702702702702, + "grad_norm": 0.3818807899951935, + "learning_rate": 7.174884982856984e-05, + "loss": 0.1327, + "step": 8910 + }, + { + "epoch": 1.070870870870871, + "grad_norm": 0.5017231702804565, + "learning_rate": 7.172052899485883e-05, + "loss": 0.1359, + "step": 8915 + }, + { + "epoch": 1.0714714714714715, + "grad_norm": 0.534890353679657, + "learning_rate": 7.169219956904447e-05, + "loss": 0.1364, + "step": 8920 + }, + { + "epoch": 1.072072072072072, + "grad_norm": 0.4843946695327759, + "learning_rate": 7.16638615623332e-05, + "loss": 0.1476, + "step": 8925 + }, + { + "epoch": 1.0726726726726727, + "grad_norm": 0.515217661857605, + "learning_rate": 7.163551498593481e-05, + "loss": 0.1443, + "step": 8930 + }, + { + "epoch": 1.0732732732732733, + "grad_norm": 0.33950290083885193, + "learning_rate": 7.160715985106253e-05, + "loss": 0.1423, + "step": 8935 + }, + { + "epoch": 1.0738738738738738, + "grad_norm": 0.5174293518066406, + "learning_rate": 7.157879616893294e-05, + "loss": 0.1527, + "step": 8940 + }, + { + "epoch": 1.0744744744744745, + "grad_norm": 0.4591221511363983, + "learning_rate": 7.155042395076598e-05, + "loss": 0.168, + "step": 8945 + }, + { + "epoch": 1.075075075075075, + "grad_norm": 0.49486660957336426, + "learning_rate": 7.152204320778504e-05, + "loss": 0.1564, + "step": 8950 + }, + { + "epoch": 1.0756756756756758, + "grad_norm": 0.5360234379768372, + "learning_rate": 7.149365395121681e-05, + "loss": 0.1442, + "step": 8955 + }, + { + "epoch": 1.0762762762762763, + "grad_norm": 0.5851607918739319, + "learning_rate": 7.146525619229139e-05, + "loss": 0.1332, + "step": 8960 + }, + { + "epoch": 1.0768768768768768, + "grad_norm": 0.5005301833152771, + "learning_rate": 7.143684994224222e-05, + "loss": 0.1368, + "step": 8965 + }, + { + "epoch": 1.0774774774774776, + "grad_norm": 0.4584340751171112, + "learning_rate": 7.140843521230613e-05, + "loss": 0.1499, + "step": 8970 + }, + { + "epoch": 1.078078078078078, + "grad_norm": 0.45061230659484863, + "learning_rate": 7.138001201372327e-05, + "loss": 0.1291, + "step": 8975 + 
}, + { + "epoch": 1.0786786786786786, + "grad_norm": 0.4673006534576416, + "learning_rate": 7.135158035773714e-05, + "loss": 0.1383, + "step": 8980 + }, + { + "epoch": 1.0792792792792794, + "grad_norm": 0.36622899770736694, + "learning_rate": 7.132314025559464e-05, + "loss": 0.1409, + "step": 8985 + }, + { + "epoch": 1.0798798798798799, + "grad_norm": 0.4832969903945923, + "learning_rate": 7.12946917185459e-05, + "loss": 0.1417, + "step": 8990 + }, + { + "epoch": 1.0804804804804804, + "grad_norm": 0.4738680124282837, + "learning_rate": 7.126623475784453e-05, + "loss": 0.1373, + "step": 8995 + }, + { + "epoch": 1.0810810810810811, + "grad_norm": 0.5347143411636353, + "learning_rate": 7.123776938474735e-05, + "loss": 0.167, + "step": 9000 + }, + { + "epoch": 1.0810810810810811, + "eval_loss": 0.19050884246826172, + "eval_runtime": 36.0676, + "eval_samples_per_second": 22.181, + "eval_steps_per_second": 5.545, + "step": 9000 + }, + { + "epoch": 1.0816816816816817, + "grad_norm": 0.4649895131587982, + "learning_rate": 7.120929561051458e-05, + "loss": 0.1376, + "step": 9005 + }, + { + "epoch": 1.0822822822822822, + "grad_norm": 0.5208431482315063, + "learning_rate": 7.118081344640974e-05, + "loss": 0.1543, + "step": 9010 + }, + { + "epoch": 1.082882882882883, + "grad_norm": 0.4275619685649872, + "learning_rate": 7.115232290369967e-05, + "loss": 0.132, + "step": 9015 + }, + { + "epoch": 1.0834834834834834, + "grad_norm": 0.4782681465148926, + "learning_rate": 7.112382399365451e-05, + "loss": 0.132, + "step": 9020 + }, + { + "epoch": 1.0840840840840842, + "grad_norm": 0.508679986000061, + "learning_rate": 7.109531672754772e-05, + "loss": 0.1552, + "step": 9025 + }, + { + "epoch": 1.0846846846846847, + "grad_norm": 0.42764633893966675, + "learning_rate": 7.106680111665609e-05, + "loss": 0.1186, + "step": 9030 + }, + { + "epoch": 1.0852852852852852, + "grad_norm": 0.4004163444042206, + "learning_rate": 7.103827717225968e-05, + "loss": 0.1469, + "step": 9035 + }, + { + "epoch": 1.085885885885886, + "grad_norm": 0.48991858959198, + "learning_rate": 7.100974490564185e-05, + "loss": 0.1525, + "step": 9040 + }, + { + "epoch": 1.0864864864864865, + "grad_norm": 0.47079792618751526, + "learning_rate": 7.098120432808924e-05, + "loss": 0.116, + "step": 9045 + }, + { + "epoch": 1.087087087087087, + "grad_norm": 0.5363724231719971, + "learning_rate": 7.095265545089184e-05, + "loss": 0.1172, + "step": 9050 + }, + { + "epoch": 1.0876876876876878, + "grad_norm": 0.5028483867645264, + "learning_rate": 7.092409828534285e-05, + "loss": 0.1407, + "step": 9055 + }, + { + "epoch": 1.0882882882882883, + "grad_norm": 0.4808637201786041, + "learning_rate": 7.089553284273878e-05, + "loss": 0.1293, + "step": 9060 + }, + { + "epoch": 1.0888888888888888, + "grad_norm": 0.5302677154541016, + "learning_rate": 7.086695913437942e-05, + "loss": 0.142, + "step": 9065 + }, + { + "epoch": 1.0894894894894895, + "grad_norm": 0.48279350996017456, + "learning_rate": 7.083837717156781e-05, + "loss": 0.1477, + "step": 9070 + }, + { + "epoch": 1.09009009009009, + "grad_norm": 0.492736279964447, + "learning_rate": 7.080978696561028e-05, + "loss": 0.1542, + "step": 9075 + }, + { + "epoch": 1.0906906906906908, + "grad_norm": 0.41189101338386536, + "learning_rate": 7.07811885278164e-05, + "loss": 0.1292, + "step": 9080 + }, + { + "epoch": 1.0912912912912913, + "grad_norm": 0.5568517446517944, + "learning_rate": 7.0752581869499e-05, + "loss": 0.1356, + "step": 9085 + }, + { + "epoch": 1.0918918918918918, + "grad_norm": 0.48629894852638245, + 
"learning_rate": 7.072396700197416e-05, + "loss": 0.12, + "step": 9090 + }, + { + "epoch": 1.0924924924924926, + "grad_norm": 0.4972403645515442, + "learning_rate": 7.069534393656124e-05, + "loss": 0.1364, + "step": 9095 + }, + { + "epoch": 1.093093093093093, + "grad_norm": 0.5167415738105774, + "learning_rate": 7.06667126845828e-05, + "loss": 0.1402, + "step": 9100 + }, + { + "epoch": 1.0936936936936936, + "grad_norm": 0.466062068939209, + "learning_rate": 7.063807325736464e-05, + "loss": 0.1376, + "step": 9105 + }, + { + "epoch": 1.0942942942942944, + "grad_norm": 0.4689285457134247, + "learning_rate": 7.060942566623581e-05, + "loss": 0.1593, + "step": 9110 + }, + { + "epoch": 1.0948948948948949, + "grad_norm": 0.4900115430355072, + "learning_rate": 7.058076992252861e-05, + "loss": 0.1386, + "step": 9115 + }, + { + "epoch": 1.0954954954954954, + "grad_norm": 0.5853942036628723, + "learning_rate": 7.055210603757854e-05, + "loss": 0.124, + "step": 9120 + }, + { + "epoch": 1.0960960960960962, + "grad_norm": 0.46623408794403076, + "learning_rate": 7.052343402272431e-05, + "loss": 0.1322, + "step": 9125 + }, + { + "epoch": 1.0966966966966967, + "grad_norm": 0.365323930978775, + "learning_rate": 7.049475388930787e-05, + "loss": 0.1427, + "step": 9130 + }, + { + "epoch": 1.0972972972972972, + "grad_norm": 0.4380837678909302, + "learning_rate": 7.046606564867433e-05, + "loss": 0.1414, + "step": 9135 + }, + { + "epoch": 1.097897897897898, + "grad_norm": 0.476395845413208, + "learning_rate": 7.04373693121721e-05, + "loss": 0.1447, + "step": 9140 + }, + { + "epoch": 1.0984984984984985, + "grad_norm": 0.5355069637298584, + "learning_rate": 7.040866489115272e-05, + "loss": 0.1701, + "step": 9145 + }, + { + "epoch": 1.0990990990990992, + "grad_norm": 0.5215357542037964, + "learning_rate": 7.037995239697096e-05, + "loss": 0.1544, + "step": 9150 + }, + { + "epoch": 1.0996996996996997, + "grad_norm": 0.5702700018882751, + "learning_rate": 7.035123184098476e-05, + "loss": 0.1423, + "step": 9155 + }, + { + "epoch": 1.1003003003003002, + "grad_norm": 0.5936440825462341, + "learning_rate": 7.032250323455525e-05, + "loss": 0.1545, + "step": 9160 + }, + { + "epoch": 1.100900900900901, + "grad_norm": 0.4610235095024109, + "learning_rate": 7.029376658904676e-05, + "loss": 0.1269, + "step": 9165 + }, + { + "epoch": 1.1015015015015015, + "grad_norm": 0.5780545473098755, + "learning_rate": 7.026502191582683e-05, + "loss": 0.1342, + "step": 9170 + }, + { + "epoch": 1.102102102102102, + "grad_norm": 0.6412636637687683, + "learning_rate": 7.023626922626609e-05, + "loss": 0.1395, + "step": 9175 + }, + { + "epoch": 1.1027027027027028, + "grad_norm": 0.4099220037460327, + "learning_rate": 7.020750853173841e-05, + "loss": 0.1295, + "step": 9180 + }, + { + "epoch": 1.1033033033033033, + "grad_norm": 0.5327261090278625, + "learning_rate": 7.017873984362082e-05, + "loss": 0.1325, + "step": 9185 + }, + { + "epoch": 1.1039039039039038, + "grad_norm": 0.620705783367157, + "learning_rate": 7.014996317329349e-05, + "loss": 0.1448, + "step": 9190 + }, + { + "epoch": 1.1045045045045045, + "grad_norm": 0.5146176815032959, + "learning_rate": 7.012117853213977e-05, + "loss": 0.1461, + "step": 9195 + }, + { + "epoch": 1.105105105105105, + "grad_norm": 0.4332140386104584, + "learning_rate": 7.009238593154613e-05, + "loss": 0.1325, + "step": 9200 + }, + { + "epoch": 1.1057057057057058, + "grad_norm": 0.4083593189716339, + "learning_rate": 7.006358538290224e-05, + "loss": 0.1521, + "step": 9205 + }, + { + "epoch": 1.1063063063063063, + 
"grad_norm": 0.574815034866333, + "learning_rate": 7.003477689760084e-05, + "loss": 0.1311, + "step": 9210 + }, + { + "epoch": 1.1069069069069069, + "grad_norm": 0.492655873298645, + "learning_rate": 7.00059604870379e-05, + "loss": 0.125, + "step": 9215 + }, + { + "epoch": 1.1075075075075076, + "grad_norm": 0.4560936689376831, + "learning_rate": 6.997713616261246e-05, + "loss": 0.1349, + "step": 9220 + }, + { + "epoch": 1.1081081081081081, + "grad_norm": 0.5473215579986572, + "learning_rate": 6.994830393572669e-05, + "loss": 0.1242, + "step": 9225 + }, + { + "epoch": 1.1087087087087086, + "grad_norm": 0.4096076190471649, + "learning_rate": 6.991946381778593e-05, + "loss": 0.1293, + "step": 9230 + }, + { + "epoch": 1.1093093093093094, + "grad_norm": 0.6029496192932129, + "learning_rate": 6.98906158201986e-05, + "loss": 0.1312, + "step": 9235 + }, + { + "epoch": 1.10990990990991, + "grad_norm": 0.44497260451316833, + "learning_rate": 6.986175995437628e-05, + "loss": 0.1254, + "step": 9240 + }, + { + "epoch": 1.1105105105105104, + "grad_norm": 0.5495989918708801, + "learning_rate": 6.983289623173361e-05, + "loss": 0.1158, + "step": 9245 + }, + { + "epoch": 1.1111111111111112, + "grad_norm": 0.4961594045162201, + "learning_rate": 6.980402466368835e-05, + "loss": 0.1548, + "step": 9250 + }, + { + "epoch": 1.1111111111111112, + "eval_loss": 0.18500061333179474, + "eval_runtime": 35.9632, + "eval_samples_per_second": 22.245, + "eval_steps_per_second": 5.561, + "step": 9250 + }, + { + "epoch": 1.1117117117117117, + "grad_norm": 0.3899019956588745, + "learning_rate": 6.977514526166143e-05, + "loss": 0.1306, + "step": 9255 + }, + { + "epoch": 1.1123123123123122, + "grad_norm": 0.49679896235466003, + "learning_rate": 6.974625803707677e-05, + "loss": 0.1348, + "step": 9260 + }, + { + "epoch": 1.112912912912913, + "grad_norm": 0.4366336166858673, + "learning_rate": 6.971736300136147e-05, + "loss": 0.1452, + "step": 9265 + }, + { + "epoch": 1.1135135135135135, + "grad_norm": 0.5401057600975037, + "learning_rate": 6.96884601659457e-05, + "loss": 0.1508, + "step": 9270 + }, + { + "epoch": 1.1141141141141142, + "grad_norm": 0.5785194039344788, + "learning_rate": 6.965954954226268e-05, + "loss": 0.135, + "step": 9275 + }, + { + "epoch": 1.1147147147147147, + "grad_norm": 0.5360791087150574, + "learning_rate": 6.963063114174875e-05, + "loss": 0.1385, + "step": 9280 + }, + { + "epoch": 1.1153153153153152, + "grad_norm": 0.4878944754600525, + "learning_rate": 6.960170497584331e-05, + "loss": 0.1254, + "step": 9285 + }, + { + "epoch": 1.115915915915916, + "grad_norm": 0.4216470718383789, + "learning_rate": 6.957277105598884e-05, + "loss": 0.1329, + "step": 9290 + }, + { + "epoch": 1.1165165165165165, + "grad_norm": 0.4470789432525635, + "learning_rate": 6.954382939363086e-05, + "loss": 0.122, + "step": 9295 + }, + { + "epoch": 1.117117117117117, + "grad_norm": 0.54117751121521, + "learning_rate": 6.951488000021803e-05, + "loss": 0.1188, + "step": 9300 + }, + { + "epoch": 1.1177177177177178, + "grad_norm": 0.5519031882286072, + "learning_rate": 6.948592288720194e-05, + "loss": 0.1232, + "step": 9305 + }, + { + "epoch": 1.1183183183183183, + "grad_norm": 0.5523632168769836, + "learning_rate": 6.945695806603736e-05, + "loss": 0.148, + "step": 9310 + }, + { + "epoch": 1.118918918918919, + "grad_norm": 0.5505694150924683, + "learning_rate": 6.942798554818204e-05, + "loss": 0.148, + "step": 9315 + }, + { + "epoch": 1.1195195195195196, + "grad_norm": 0.4322364032268524, + "learning_rate": 6.939900534509678e-05, + 
"loss": 0.1319, + "step": 9320 + }, + { + "epoch": 1.12012012012012, + "grad_norm": 0.3765978217124939, + "learning_rate": 6.937001746824545e-05, + "loss": 0.1169, + "step": 9325 + }, + { + "epoch": 1.1207207207207208, + "grad_norm": 0.5289835333824158, + "learning_rate": 6.934102192909492e-05, + "loss": 0.1318, + "step": 9330 + }, + { + "epoch": 1.1213213213213213, + "grad_norm": 0.4689262807369232, + "learning_rate": 6.931201873911511e-05, + "loss": 0.1236, + "step": 9335 + }, + { + "epoch": 1.1219219219219219, + "grad_norm": 0.5069834589958191, + "learning_rate": 6.928300790977897e-05, + "loss": 0.1444, + "step": 9340 + }, + { + "epoch": 1.1225225225225226, + "grad_norm": 0.46477627754211426, + "learning_rate": 6.925398945256247e-05, + "loss": 0.1366, + "step": 9345 + }, + { + "epoch": 1.1231231231231231, + "grad_norm": 0.4537050127983093, + "learning_rate": 6.922496337894457e-05, + "loss": 0.1385, + "step": 9350 + }, + { + "epoch": 1.1237237237237236, + "grad_norm": 0.5252853035926819, + "learning_rate": 6.919592970040731e-05, + "loss": 0.1438, + "step": 9355 + }, + { + "epoch": 1.1243243243243244, + "grad_norm": 0.46386396884918213, + "learning_rate": 6.916688842843565e-05, + "loss": 0.1298, + "step": 9360 + }, + { + "epoch": 1.124924924924925, + "grad_norm": 0.5454824566841125, + "learning_rate": 6.913783957451759e-05, + "loss": 0.1404, + "step": 9365 + }, + { + "epoch": 1.1255255255255254, + "grad_norm": 0.4597674012184143, + "learning_rate": 6.91087831501442e-05, + "loss": 0.1334, + "step": 9370 + }, + { + "epoch": 1.1261261261261262, + "grad_norm": 0.48023849725723267, + "learning_rate": 6.90797191668094e-05, + "loss": 0.1337, + "step": 9375 + }, + { + "epoch": 1.1267267267267267, + "grad_norm": 0.48772919178009033, + "learning_rate": 6.905064763601026e-05, + "loss": 0.1167, + "step": 9380 + }, + { + "epoch": 1.1273273273273272, + "grad_norm": 0.4505400061607361, + "learning_rate": 6.902156856924671e-05, + "loss": 0.1514, + "step": 9385 + }, + { + "epoch": 1.127927927927928, + "grad_norm": 0.528593122959137, + "learning_rate": 6.899248197802172e-05, + "loss": 0.1328, + "step": 9390 + }, + { + "epoch": 1.1285285285285285, + "grad_norm": 0.6304023265838623, + "learning_rate": 6.896338787384124e-05, + "loss": 0.1505, + "step": 9395 + }, + { + "epoch": 1.1291291291291292, + "grad_norm": 0.49781641364097595, + "learning_rate": 6.893428626821413e-05, + "loss": 0.1306, + "step": 9400 + }, + { + "epoch": 1.1297297297297297, + "grad_norm": 0.5521763563156128, + "learning_rate": 6.890517717265233e-05, + "loss": 0.1448, + "step": 9405 + }, + { + "epoch": 1.1303303303303303, + "grad_norm": 0.5749943852424622, + "learning_rate": 6.887606059867065e-05, + "loss": 0.1438, + "step": 9410 + }, + { + "epoch": 1.130930930930931, + "grad_norm": 0.45946699380874634, + "learning_rate": 6.884693655778685e-05, + "loss": 0.1301, + "step": 9415 + }, + { + "epoch": 1.1315315315315315, + "grad_norm": 0.5034966468811035, + "learning_rate": 6.881780506152172e-05, + "loss": 0.1471, + "step": 9420 + }, + { + "epoch": 1.132132132132132, + "grad_norm": 0.5998401045799255, + "learning_rate": 6.878866612139895e-05, + "loss": 0.1561, + "step": 9425 + }, + { + "epoch": 1.1327327327327328, + "grad_norm": 0.4555962085723877, + "learning_rate": 6.875951974894519e-05, + "loss": 0.1342, + "step": 9430 + }, + { + "epoch": 1.1333333333333333, + "grad_norm": 0.6748508810997009, + "learning_rate": 6.873036595569e-05, + "loss": 0.1412, + "step": 9435 + }, + { + "epoch": 1.133933933933934, + "grad_norm": 0.4627913534641266, + 
"learning_rate": 6.870120475316592e-05, + "loss": 0.1393, + "step": 9440 + }, + { + "epoch": 1.1345345345345346, + "grad_norm": 0.4778697192668915, + "learning_rate": 6.86720361529084e-05, + "loss": 0.15, + "step": 9445 + }, + { + "epoch": 1.135135135135135, + "grad_norm": 0.4317777454853058, + "learning_rate": 6.86428601664558e-05, + "loss": 0.1138, + "step": 9450 + }, + { + "epoch": 1.1357357357357358, + "grad_norm": 0.5334888696670532, + "learning_rate": 6.861367680534942e-05, + "loss": 0.1275, + "step": 9455 + }, + { + "epoch": 1.1363363363363364, + "grad_norm": 0.44648465514183044, + "learning_rate": 6.85844860811335e-05, + "loss": 0.1269, + "step": 9460 + }, + { + "epoch": 1.1369369369369369, + "grad_norm": 0.5643259286880493, + "learning_rate": 6.855528800535513e-05, + "loss": 0.1256, + "step": 9465 + }, + { + "epoch": 1.1375375375375376, + "grad_norm": 0.5081361532211304, + "learning_rate": 6.852608258956437e-05, + "loss": 0.1561, + "step": 9470 + }, + { + "epoch": 1.1381381381381381, + "grad_norm": 0.5348806977272034, + "learning_rate": 6.849686984531416e-05, + "loss": 0.1282, + "step": 9475 + }, + { + "epoch": 1.1387387387387387, + "grad_norm": 0.4955986738204956, + "learning_rate": 6.846764978416031e-05, + "loss": 0.1355, + "step": 9480 + }, + { + "epoch": 1.1393393393393394, + "grad_norm": 0.6122444868087769, + "learning_rate": 6.843842241766158e-05, + "loss": 0.1671, + "step": 9485 + }, + { + "epoch": 1.13993993993994, + "grad_norm": 0.502400279045105, + "learning_rate": 6.84091877573796e-05, + "loss": 0.13, + "step": 9490 + }, + { + "epoch": 1.1405405405405404, + "grad_norm": 0.5369451642036438, + "learning_rate": 6.837994581487888e-05, + "loss": 0.1318, + "step": 9495 + }, + { + "epoch": 1.1411411411411412, + "grad_norm": 0.5451071858406067, + "learning_rate": 6.835069660172678e-05, + "loss": 0.1388, + "step": 9500 + }, + { + "epoch": 1.1411411411411412, + "eval_loss": 0.18366886675357819, + "eval_runtime": 35.8223, + "eval_samples_per_second": 22.332, + "eval_steps_per_second": 5.583, + "step": 9500 + }, + { + "epoch": 1.1417417417417417, + "grad_norm": 0.49755939841270447, + "learning_rate": 6.832144012949356e-05, + "loss": 0.1286, + "step": 9505 + }, + { + "epoch": 1.1423423423423422, + "grad_norm": 0.5574833154678345, + "learning_rate": 6.829217640975238e-05, + "loss": 0.1277, + "step": 9510 + }, + { + "epoch": 1.142942942942943, + "grad_norm": 0.4342218041419983, + "learning_rate": 6.826290545407923e-05, + "loss": 0.1269, + "step": 9515 + }, + { + "epoch": 1.1435435435435435, + "grad_norm": 0.4537164270877838, + "learning_rate": 6.823362727405298e-05, + "loss": 0.1365, + "step": 9520 + }, + { + "epoch": 1.1441441441441442, + "grad_norm": 0.5578528642654419, + "learning_rate": 6.820434188125536e-05, + "loss": 0.1343, + "step": 9525 + }, + { + "epoch": 1.1447447447447447, + "grad_norm": 0.48114368319511414, + "learning_rate": 6.817504928727094e-05, + "loss": 0.129, + "step": 9530 + }, + { + "epoch": 1.1453453453453453, + "grad_norm": 0.46689116954803467, + "learning_rate": 6.814574950368715e-05, + "loss": 0.1249, + "step": 9535 + }, + { + "epoch": 1.145945945945946, + "grad_norm": 0.5292294025421143, + "learning_rate": 6.811644254209423e-05, + "loss": 0.1503, + "step": 9540 + }, + { + "epoch": 1.1465465465465465, + "grad_norm": 0.45598042011260986, + "learning_rate": 6.808712841408533e-05, + "loss": 0.1284, + "step": 9545 + }, + { + "epoch": 1.147147147147147, + "grad_norm": 0.6221840977668762, + "learning_rate": 6.805780713125638e-05, + "loss": 0.1312, + "step": 9550 + 
}, + { + "epoch": 1.1477477477477478, + "grad_norm": 0.4730389714241028, + "learning_rate": 6.802847870520614e-05, + "loss": 0.1311, + "step": 9555 + }, + { + "epoch": 1.1483483483483483, + "grad_norm": 0.563447117805481, + "learning_rate": 6.799914314753622e-05, + "loss": 0.1382, + "step": 9560 + }, + { + "epoch": 1.148948948948949, + "grad_norm": 0.48742344975471497, + "learning_rate": 6.796980046985102e-05, + "loss": 0.1403, + "step": 9565 + }, + { + "epoch": 1.1495495495495496, + "grad_norm": 0.5710808038711548, + "learning_rate": 6.79404506837578e-05, + "loss": 0.1459, + "step": 9570 + }, + { + "epoch": 1.15015015015015, + "grad_norm": 0.6052168011665344, + "learning_rate": 6.79110938008666e-05, + "loss": 0.1464, + "step": 9575 + }, + { + "epoch": 1.1507507507507508, + "grad_norm": 0.6504958868026733, + "learning_rate": 6.788172983279028e-05, + "loss": 0.1512, + "step": 9580 + }, + { + "epoch": 1.1513513513513514, + "grad_norm": 0.49774986505508423, + "learning_rate": 6.78523587911445e-05, + "loss": 0.1284, + "step": 9585 + }, + { + "epoch": 1.1519519519519519, + "grad_norm": 0.5204494595527649, + "learning_rate": 6.782298068754772e-05, + "loss": 0.1356, + "step": 9590 + }, + { + "epoch": 1.1525525525525526, + "grad_norm": 0.42971011996269226, + "learning_rate": 6.779359553362118e-05, + "loss": 0.1282, + "step": 9595 + }, + { + "epoch": 1.1531531531531531, + "grad_norm": 0.6139044165611267, + "learning_rate": 6.776420334098891e-05, + "loss": 0.1272, + "step": 9600 + }, + { + "epoch": 1.1537537537537537, + "grad_norm": 0.439885675907135, + "learning_rate": 6.773480412127776e-05, + "loss": 0.1202, + "step": 9605 + }, + { + "epoch": 1.1543543543543544, + "grad_norm": 0.5982673764228821, + "learning_rate": 6.77053978861173e-05, + "loss": 0.153, + "step": 9610 + }, + { + "epoch": 1.154954954954955, + "grad_norm": 0.4638057351112366, + "learning_rate": 6.767598464713994e-05, + "loss": 0.1324, + "step": 9615 + }, + { + "epoch": 1.1555555555555554, + "grad_norm": 0.5485991835594177, + "learning_rate": 6.764656441598081e-05, + "loss": 0.1349, + "step": 9620 + }, + { + "epoch": 1.1561561561561562, + "grad_norm": 0.6574414968490601, + "learning_rate": 6.761713720427782e-05, + "loss": 0.1252, + "step": 9625 + }, + { + "epoch": 1.1567567567567567, + "grad_norm": 0.533554196357727, + "learning_rate": 6.758770302367168e-05, + "loss": 0.1275, + "step": 9630 + }, + { + "epoch": 1.1573573573573575, + "grad_norm": 0.4326445460319519, + "learning_rate": 6.755826188580579e-05, + "loss": 0.1056, + "step": 9635 + }, + { + "epoch": 1.157957957957958, + "grad_norm": 0.47175487875938416, + "learning_rate": 6.752881380232634e-05, + "loss": 0.1284, + "step": 9640 + }, + { + "epoch": 1.1585585585585585, + "grad_norm": 0.5621086359024048, + "learning_rate": 6.749935878488227e-05, + "loss": 0.1409, + "step": 9645 + }, + { + "epoch": 1.1591591591591592, + "grad_norm": 0.47167137265205383, + "learning_rate": 6.746989684512525e-05, + "loss": 0.1348, + "step": 9650 + }, + { + "epoch": 1.1597597597597598, + "grad_norm": 0.38340097665786743, + "learning_rate": 6.74404279947097e-05, + "loss": 0.1096, + "step": 9655 + }, + { + "epoch": 1.1603603603603603, + "grad_norm": 0.4628503918647766, + "learning_rate": 6.741095224529277e-05, + "loss": 0.1325, + "step": 9660 + }, + { + "epoch": 1.160960960960961, + "grad_norm": 0.47487297654151917, + "learning_rate": 6.738146960853433e-05, + "loss": 0.1251, + "step": 9665 + }, + { + "epoch": 1.1615615615615615, + "grad_norm": 0.5550730228424072, + "learning_rate": 
6.735198009609697e-05, + "loss": 0.1366, + "step": 9670 + }, + { + "epoch": 1.1621621621621623, + "grad_norm": 0.45188426971435547, + "learning_rate": 6.732248371964602e-05, + "loss": 0.1318, + "step": 9675 + }, + { + "epoch": 1.1627627627627628, + "grad_norm": 0.38361048698425293, + "learning_rate": 6.729298049084953e-05, + "loss": 0.1336, + "step": 9680 + }, + { + "epoch": 1.1633633633633633, + "grad_norm": 0.5466015338897705, + "learning_rate": 6.726347042137824e-05, + "loss": 0.1468, + "step": 9685 + }, + { + "epoch": 1.163963963963964, + "grad_norm": 0.5949849486351013, + "learning_rate": 6.723395352290558e-05, + "loss": 0.1526, + "step": 9690 + }, + { + "epoch": 1.1645645645645646, + "grad_norm": 0.551245391368866, + "learning_rate": 6.720442980710773e-05, + "loss": 0.1302, + "step": 9695 + }, + { + "epoch": 1.165165165165165, + "grad_norm": 0.6451214551925659, + "learning_rate": 6.717489928566355e-05, + "loss": 0.1336, + "step": 9700 + }, + { + "epoch": 1.1657657657657658, + "grad_norm": 0.4822372496128082, + "learning_rate": 6.714536197025452e-05, + "loss": 0.133, + "step": 9705 + }, + { + "epoch": 1.1663663663663664, + "grad_norm": 0.5520108342170715, + "learning_rate": 6.711581787256494e-05, + "loss": 0.1284, + "step": 9710 + }, + { + "epoch": 1.166966966966967, + "grad_norm": 0.6228134632110596, + "learning_rate": 6.70862670042817e-05, + "loss": 0.1428, + "step": 9715 + }, + { + "epoch": 1.1675675675675676, + "grad_norm": 0.4633306860923767, + "learning_rate": 6.70567093770944e-05, + "loss": 0.1209, + "step": 9720 + }, + { + "epoch": 1.1681681681681682, + "grad_norm": 0.6069526672363281, + "learning_rate": 6.702714500269528e-05, + "loss": 0.133, + "step": 9725 + }, + { + "epoch": 1.1687687687687687, + "grad_norm": 0.6277711987495422, + "learning_rate": 6.69975738927793e-05, + "loss": 0.1327, + "step": 9730 + }, + { + "epoch": 1.1693693693693694, + "grad_norm": 0.4462870657444, + "learning_rate": 6.696799605904405e-05, + "loss": 0.1161, + "step": 9735 + }, + { + "epoch": 1.16996996996997, + "grad_norm": 0.5560117363929749, + "learning_rate": 6.693841151318978e-05, + "loss": 0.1532, + "step": 9740 + }, + { + "epoch": 1.1705705705705705, + "grad_norm": 0.5032148957252502, + "learning_rate": 6.690882026691941e-05, + "loss": 0.1297, + "step": 9745 + }, + { + "epoch": 1.1711711711711712, + "grad_norm": 0.5112046003341675, + "learning_rate": 6.687922233193851e-05, + "loss": 0.1243, + "step": 9750 + }, + { + "epoch": 1.1711711711711712, + "eval_loss": 0.17193254828453064, + "eval_runtime": 35.8517, + "eval_samples_per_second": 22.314, + "eval_steps_per_second": 5.579, + "step": 9750 + }, + { + "epoch": 1.1717717717717717, + "grad_norm": 0.46110421419143677, + "learning_rate": 6.684961771995529e-05, + "loss": 0.1238, + "step": 9755 + }, + { + "epoch": 1.1723723723723725, + "grad_norm": 0.6364412307739258, + "learning_rate": 6.682000644268058e-05, + "loss": 0.114, + "step": 9760 + }, + { + "epoch": 1.172972972972973, + "grad_norm": 0.5057504773139954, + "learning_rate": 6.67903885118279e-05, + "loss": 0.1165, + "step": 9765 + }, + { + "epoch": 1.1735735735735735, + "grad_norm": 0.5055462718009949, + "learning_rate": 6.676076393911335e-05, + "loss": 0.13, + "step": 9770 + }, + { + "epoch": 1.1741741741741742, + "grad_norm": 0.4475913941860199, + "learning_rate": 6.673113273625566e-05, + "loss": 0.1297, + "step": 9775 + }, + { + "epoch": 1.1747747747747748, + "grad_norm": 0.6086392998695374, + "learning_rate": 6.670149491497625e-05, + "loss": 0.1488, + "step": 9780 + }, + { + "epoch": 
1.1753753753753753, + "grad_norm": 0.5319909453392029, + "learning_rate": 6.667185048699909e-05, + "loss": 0.1419, + "step": 9785 + }, + { + "epoch": 1.175975975975976, + "grad_norm": 0.4748077094554901, + "learning_rate": 6.664219946405075e-05, + "loss": 0.1344, + "step": 9790 + }, + { + "epoch": 1.1765765765765765, + "grad_norm": 0.4510291814804077, + "learning_rate": 6.661254185786047e-05, + "loss": 0.0885, + "step": 9795 + }, + { + "epoch": 1.1771771771771773, + "grad_norm": 0.4001372754573822, + "learning_rate": 6.658287768016004e-05, + "loss": 0.1206, + "step": 9800 + }, + { + "epoch": 1.1777777777777778, + "grad_norm": 0.45367351174354553, + "learning_rate": 6.65532069426839e-05, + "loss": 0.1346, + "step": 9805 + }, + { + "epoch": 1.1783783783783783, + "grad_norm": 0.47267067432403564, + "learning_rate": 6.652352965716905e-05, + "loss": 0.1159, + "step": 9810 + }, + { + "epoch": 1.178978978978979, + "grad_norm": 0.5264884829521179, + "learning_rate": 6.64938458353551e-05, + "loss": 0.127, + "step": 9815 + }, + { + "epoch": 1.1795795795795796, + "grad_norm": 0.592536449432373, + "learning_rate": 6.646415548898419e-05, + "loss": 0.1401, + "step": 9820 + }, + { + "epoch": 1.1801801801801801, + "grad_norm": 0.481799453496933, + "learning_rate": 6.643445862980115e-05, + "loss": 0.114, + "step": 9825 + }, + { + "epoch": 1.1807807807807809, + "grad_norm": 0.4007432758808136, + "learning_rate": 6.640475526955329e-05, + "loss": 0.1041, + "step": 9830 + }, + { + "epoch": 1.1813813813813814, + "grad_norm": 0.44925785064697266, + "learning_rate": 6.637504541999051e-05, + "loss": 0.1322, + "step": 9835 + }, + { + "epoch": 1.181981981981982, + "grad_norm": 0.5656154155731201, + "learning_rate": 6.634532909286532e-05, + "loss": 0.1241, + "step": 9840 + }, + { + "epoch": 1.1825825825825826, + "grad_norm": 0.4835910201072693, + "learning_rate": 6.631560629993274e-05, + "loss": 0.1364, + "step": 9845 + }, + { + "epoch": 1.1831831831831832, + "grad_norm": 0.4915154278278351, + "learning_rate": 6.628587705295037e-05, + "loss": 0.1215, + "step": 9850 + }, + { + "epoch": 1.1837837837837837, + "grad_norm": 0.4441297650337219, + "learning_rate": 6.625614136367838e-05, + "loss": 0.1261, + "step": 9855 + }, + { + "epoch": 1.1843843843843844, + "grad_norm": 0.3880355954170227, + "learning_rate": 6.622639924387945e-05, + "loss": 0.1264, + "step": 9860 + }, + { + "epoch": 1.184984984984985, + "grad_norm": 0.6732304096221924, + "learning_rate": 6.619665070531887e-05, + "loss": 0.149, + "step": 9865 + }, + { + "epoch": 1.1855855855855855, + "grad_norm": 0.4434427320957184, + "learning_rate": 6.61668957597644e-05, + "loss": 0.108, + "step": 9870 + }, + { + "epoch": 1.1861861861861862, + "grad_norm": 0.4159413278102875, + "learning_rate": 6.613713441898634e-05, + "loss": 0.1095, + "step": 9875 + }, + { + "epoch": 1.1867867867867867, + "grad_norm": 0.4756458103656769, + "learning_rate": 6.610736669475755e-05, + "loss": 0.1441, + "step": 9880 + }, + { + "epoch": 1.1873873873873875, + "grad_norm": 0.5100711584091187, + "learning_rate": 6.607759259885341e-05, + "loss": 0.1341, + "step": 9885 + }, + { + "epoch": 1.187987987987988, + "grad_norm": 0.4814146161079407, + "learning_rate": 6.604781214305181e-05, + "loss": 0.1338, + "step": 9890 + }, + { + "epoch": 1.1885885885885885, + "grad_norm": 0.5011911988258362, + "learning_rate": 6.601802533913317e-05, + "loss": 0.1277, + "step": 9895 + }, + { + "epoch": 1.1891891891891893, + "grad_norm": 0.4508677124977112, + "learning_rate": 6.598823219888042e-05, + "loss": 0.1226, 
+ "step": 9900 + }, + { + "epoch": 1.1897897897897898, + "grad_norm": 0.49636122584342957, + "learning_rate": 6.595843273407895e-05, + "loss": 0.1251, + "step": 9905 + }, + { + "epoch": 1.1903903903903903, + "grad_norm": 0.5991763472557068, + "learning_rate": 6.592862695651674e-05, + "loss": 0.1259, + "step": 9910 + }, + { + "epoch": 1.190990990990991, + "grad_norm": 0.6228018403053284, + "learning_rate": 6.589881487798418e-05, + "loss": 0.1402, + "step": 9915 + }, + { + "epoch": 1.1915915915915916, + "grad_norm": 0.5502091646194458, + "learning_rate": 6.586899651027421e-05, + "loss": 0.1151, + "step": 9920 + }, + { + "epoch": 1.1921921921921923, + "grad_norm": 0.47402524948120117, + "learning_rate": 6.583917186518223e-05, + "loss": 0.1217, + "step": 9925 + }, + { + "epoch": 1.1927927927927928, + "grad_norm": 0.5979349613189697, + "learning_rate": 6.580934095450613e-05, + "loss": 0.1065, + "step": 9930 + }, + { + "epoch": 1.1933933933933933, + "grad_norm": 0.41192102432250977, + "learning_rate": 6.57795037900463e-05, + "loss": 0.1135, + "step": 9935 + }, + { + "epoch": 1.193993993993994, + "grad_norm": 0.4513644278049469, + "learning_rate": 6.574966038360553e-05, + "loss": 0.1224, + "step": 9940 + }, + { + "epoch": 1.1945945945945946, + "grad_norm": 0.47603142261505127, + "learning_rate": 6.57198107469892e-05, + "loss": 0.129, + "step": 9945 + }, + { + "epoch": 1.1951951951951951, + "grad_norm": 0.40763887763023376, + "learning_rate": 6.568995489200503e-05, + "loss": 0.1185, + "step": 9950 + }, + { + "epoch": 1.1957957957957959, + "grad_norm": 0.5000661015510559, + "learning_rate": 6.566009283046329e-05, + "loss": 0.1214, + "step": 9955 + }, + { + "epoch": 1.1963963963963964, + "grad_norm": 0.451659619808197, + "learning_rate": 6.563022457417666e-05, + "loss": 0.1212, + "step": 9960 + }, + { + "epoch": 1.196996996996997, + "grad_norm": 0.5542309880256653, + "learning_rate": 6.56003501349603e-05, + "loss": 0.1351, + "step": 9965 + }, + { + "epoch": 1.1975975975975977, + "grad_norm": 0.634504497051239, + "learning_rate": 6.557046952463178e-05, + "loss": 0.1228, + "step": 9970 + }, + { + "epoch": 1.1981981981981982, + "grad_norm": 0.44421622157096863, + "learning_rate": 6.554058275501112e-05, + "loss": 0.1066, + "step": 9975 + }, + { + "epoch": 1.1987987987987987, + "grad_norm": 0.43437567353248596, + "learning_rate": 6.55106898379208e-05, + "loss": 0.1176, + "step": 9980 + }, + { + "epoch": 1.1993993993993994, + "grad_norm": 0.4679704010486603, + "learning_rate": 6.548079078518572e-05, + "loss": 0.109, + "step": 9985 + }, + { + "epoch": 1.2, + "grad_norm": 0.47817036509513855, + "learning_rate": 6.54508856086332e-05, + "loss": 0.1204, + "step": 9990 + }, + { + "epoch": 1.2006006006006005, + "grad_norm": 0.46039843559265137, + "learning_rate": 6.5420974320093e-05, + "loss": 0.1223, + "step": 9995 + }, + { + "epoch": 1.2012012012012012, + "grad_norm": 0.6263976693153381, + "learning_rate": 6.539105693139726e-05, + "loss": 0.1211, + "step": 10000 + }, + { + "epoch": 1.2012012012012012, + "eval_loss": 0.16594858467578888, + "eval_runtime": 35.7813, + "eval_samples_per_second": 22.358, + "eval_steps_per_second": 5.59, + "step": 10000 + }, + { + "epoch": 1.2018018018018017, + "grad_norm": 0.5672759413719177, + "learning_rate": 6.536113345438058e-05, + "loss": 0.1308, + "step": 10005 + }, + { + "epoch": 1.2024024024024025, + "grad_norm": 0.561998188495636, + "learning_rate": 6.533120390087995e-05, + "loss": 0.1633, + "step": 10010 + }, + { + "epoch": 1.203003003003003, + "grad_norm": 
0.4906715750694275, + "learning_rate": 6.530126828273472e-05, + "loss": 0.1206, + "step": 10015 + }, + { + "epoch": 1.2036036036036035, + "grad_norm": 0.554155707359314, + "learning_rate": 6.527132661178673e-05, + "loss": 0.1155, + "step": 10020 + }, + { + "epoch": 1.2042042042042043, + "grad_norm": 0.5317109823226929, + "learning_rate": 6.524137889988013e-05, + "loss": 0.1276, + "step": 10025 + }, + { + "epoch": 1.2048048048048048, + "grad_norm": 0.602896511554718, + "learning_rate": 6.521142515886151e-05, + "loss": 0.1309, + "step": 10030 + }, + { + "epoch": 1.2054054054054055, + "grad_norm": 0.4094832241535187, + "learning_rate": 6.518146540057981e-05, + "loss": 0.1236, + "step": 10035 + }, + { + "epoch": 1.206006006006006, + "grad_norm": 0.5748536586761475, + "learning_rate": 6.51514996368864e-05, + "loss": 0.1356, + "step": 10040 + }, + { + "epoch": 1.2066066066066066, + "grad_norm": 0.4584883153438568, + "learning_rate": 6.512152787963496e-05, + "loss": 0.1343, + "step": 10045 + }, + { + "epoch": 1.2072072072072073, + "grad_norm": 0.4517095685005188, + "learning_rate": 6.50915501406816e-05, + "loss": 0.1015, + "step": 10050 + }, + { + "epoch": 1.2078078078078078, + "grad_norm": 0.5176427364349365, + "learning_rate": 6.506156643188475e-05, + "loss": 0.1451, + "step": 10055 + }, + { + "epoch": 1.2084084084084084, + "grad_norm": 0.5709592700004578, + "learning_rate": 6.503157676510523e-05, + "loss": 0.1186, + "step": 10060 + }, + { + "epoch": 1.209009009009009, + "grad_norm": 0.5626363754272461, + "learning_rate": 6.500158115220624e-05, + "loss": 0.1119, + "step": 10065 + }, + { + "epoch": 1.2096096096096096, + "grad_norm": 0.4673994779586792, + "learning_rate": 6.497157960505324e-05, + "loss": 0.1179, + "step": 10070 + }, + { + "epoch": 1.2102102102102101, + "grad_norm": 0.41876932978630066, + "learning_rate": 6.494157213551416e-05, + "loss": 0.1314, + "step": 10075 + }, + { + "epoch": 1.2108108108108109, + "grad_norm": 0.4630737006664276, + "learning_rate": 6.491155875545918e-05, + "loss": 0.1158, + "step": 10080 + }, + { + "epoch": 1.2114114114114114, + "grad_norm": 0.6827098727226257, + "learning_rate": 6.488153947676085e-05, + "loss": 0.1418, + "step": 10085 + }, + { + "epoch": 1.212012012012012, + "grad_norm": 0.5200510025024414, + "learning_rate": 6.485151431129405e-05, + "loss": 0.1162, + "step": 10090 + }, + { + "epoch": 1.2126126126126127, + "grad_norm": 0.5516908168792725, + "learning_rate": 6.482148327093601e-05, + "loss": 0.1177, + "step": 10095 + }, + { + "epoch": 1.2132132132132132, + "grad_norm": 0.44765716791152954, + "learning_rate": 6.479144636756624e-05, + "loss": 0.11, + "step": 10100 + }, + { + "epoch": 1.2138138138138137, + "grad_norm": 0.46599870920181274, + "learning_rate": 6.476140361306665e-05, + "loss": 0.1136, + "step": 10105 + }, + { + "epoch": 1.2144144144144144, + "grad_norm": 0.5948307514190674, + "learning_rate": 6.473135501932134e-05, + "loss": 0.1267, + "step": 10110 + }, + { + "epoch": 1.215015015015015, + "grad_norm": 0.47781988978385925, + "learning_rate": 6.470130059821681e-05, + "loss": 0.0974, + "step": 10115 + }, + { + "epoch": 1.2156156156156157, + "grad_norm": 0.42919135093688965, + "learning_rate": 6.467124036164188e-05, + "loss": 0.1225, + "step": 10120 + }, + { + "epoch": 1.2162162162162162, + "grad_norm": 0.5844857096672058, + "learning_rate": 6.464117432148759e-05, + "loss": 0.1407, + "step": 10125 + }, + { + "epoch": 1.2168168168168167, + "grad_norm": 0.5320519804954529, + "learning_rate": 6.461110248964737e-05, + "loss": 0.1305, + 
"step": 10130 + }, + { + "epoch": 1.2174174174174175, + "grad_norm": 0.4706374406814575, + "learning_rate": 6.458102487801684e-05, + "loss": 0.1116, + "step": 10135 + }, + { + "epoch": 1.218018018018018, + "grad_norm": 0.6610530614852905, + "learning_rate": 6.455094149849398e-05, + "loss": 0.1264, + "step": 10140 + }, + { + "epoch": 1.2186186186186185, + "grad_norm": 0.3942696750164032, + "learning_rate": 6.452085236297904e-05, + "loss": 0.1115, + "step": 10145 + }, + { + "epoch": 1.2192192192192193, + "grad_norm": 0.6189076900482178, + "learning_rate": 6.449075748337451e-05, + "loss": 0.1303, + "step": 10150 + }, + { + "epoch": 1.2198198198198198, + "grad_norm": 0.5192115306854248, + "learning_rate": 6.446065687158522e-05, + "loss": 0.1206, + "step": 10155 + }, + { + "epoch": 1.2204204204204205, + "grad_norm": 0.4605419933795929, + "learning_rate": 6.44305505395182e-05, + "loss": 0.1194, + "step": 10160 + }, + { + "epoch": 1.221021021021021, + "grad_norm": 0.5050801038742065, + "learning_rate": 6.440043849908277e-05, + "loss": 0.1121, + "step": 10165 + }, + { + "epoch": 1.2216216216216216, + "grad_norm": 0.5062825679779053, + "learning_rate": 6.43703207621905e-05, + "loss": 0.1175, + "step": 10170 + }, + { + "epoch": 1.2222222222222223, + "grad_norm": 0.6098731756210327, + "learning_rate": 6.434019734075523e-05, + "loss": 0.1326, + "step": 10175 + }, + { + "epoch": 1.2228228228228228, + "grad_norm": 0.6831001043319702, + "learning_rate": 6.431006824669305e-05, + "loss": 0.1289, + "step": 10180 + }, + { + "epoch": 1.2234234234234234, + "grad_norm": 0.5312969088554382, + "learning_rate": 6.427993349192226e-05, + "loss": 0.1089, + "step": 10185 + }, + { + "epoch": 1.224024024024024, + "grad_norm": 0.5393247604370117, + "learning_rate": 6.424979308836346e-05, + "loss": 0.1352, + "step": 10190 + }, + { + "epoch": 1.2246246246246246, + "grad_norm": 0.6124199032783508, + "learning_rate": 6.42196470479394e-05, + "loss": 0.1273, + "step": 10195 + }, + { + "epoch": 1.2252252252252251, + "grad_norm": 0.45156264305114746, + "learning_rate": 6.418949538257515e-05, + "loss": 0.1155, + "step": 10200 + }, + { + "epoch": 1.2258258258258259, + "grad_norm": 0.47056496143341064, + "learning_rate": 6.415933810419794e-05, + "loss": 0.1354, + "step": 10205 + }, + { + "epoch": 1.2264264264264264, + "grad_norm": 0.3897019028663635, + "learning_rate": 6.412917522473722e-05, + "loss": 0.1378, + "step": 10210 + }, + { + "epoch": 1.227027027027027, + "grad_norm": 0.530168354511261, + "learning_rate": 6.409900675612475e-05, + "loss": 0.1124, + "step": 10215 + }, + { + "epoch": 1.2276276276276277, + "grad_norm": 0.5184093117713928, + "learning_rate": 6.406883271029434e-05, + "loss": 0.1349, + "step": 10220 + }, + { + "epoch": 1.2282282282282282, + "grad_norm": 0.3887346386909485, + "learning_rate": 6.403865309918216e-05, + "loss": 0.1099, + "step": 10225 + }, + { + "epoch": 1.2288288288288287, + "grad_norm": 0.5268977880477905, + "learning_rate": 6.400846793472648e-05, + "loss": 0.1156, + "step": 10230 + }, + { + "epoch": 1.2294294294294295, + "grad_norm": 0.4158872663974762, + "learning_rate": 6.397827722886782e-05, + "loss": 0.105, + "step": 10235 + }, + { + "epoch": 1.23003003003003, + "grad_norm": 0.4172183573246002, + "learning_rate": 6.394808099354888e-05, + "loss": 0.1115, + "step": 10240 + }, + { + "epoch": 1.2306306306306307, + "grad_norm": 0.5329809188842773, + "learning_rate": 6.391787924071454e-05, + "loss": 0.1058, + "step": 10245 + }, + { + "epoch": 1.2312312312312312, + "grad_norm": 0.5237940549850464, + 
"learning_rate": 6.388767198231187e-05, + "loss": 0.1364, + "step": 10250 + }, + { + "epoch": 1.2312312312312312, + "eval_loss": 0.1568148285150528, + "eval_runtime": 35.9579, + "eval_samples_per_second": 22.248, + "eval_steps_per_second": 5.562, + "step": 10250 + }, + { + "epoch": 1.2318318318318318, + "grad_norm": 0.433456152677536, + "learning_rate": 6.385745923029008e-05, + "loss": 0.1168, + "step": 10255 + }, + { + "epoch": 1.2324324324324325, + "grad_norm": 0.4698168635368347, + "learning_rate": 6.382724099660063e-05, + "loss": 0.1221, + "step": 10260 + }, + { + "epoch": 1.233033033033033, + "grad_norm": 0.3991837501525879, + "learning_rate": 6.379701729319707e-05, + "loss": 0.1029, + "step": 10265 + }, + { + "epoch": 1.2336336336336335, + "grad_norm": 0.42745494842529297, + "learning_rate": 6.376678813203517e-05, + "loss": 0.1037, + "step": 10270 + }, + { + "epoch": 1.2342342342342343, + "grad_norm": 0.5544251799583435, + "learning_rate": 6.373655352507284e-05, + "loss": 0.1207, + "step": 10275 + }, + { + "epoch": 1.2348348348348348, + "grad_norm": 0.5787578821182251, + "learning_rate": 6.370631348427012e-05, + "loss": 0.1276, + "step": 10280 + }, + { + "epoch": 1.2354354354354355, + "grad_norm": 0.4974392056465149, + "learning_rate": 6.367606802158925e-05, + "loss": 0.1074, + "step": 10285 + }, + { + "epoch": 1.236036036036036, + "grad_norm": 0.630325198173523, + "learning_rate": 6.364581714899457e-05, + "loss": 0.1348, + "step": 10290 + }, + { + "epoch": 1.2366366366366366, + "grad_norm": 0.5606110692024231, + "learning_rate": 6.361556087845258e-05, + "loss": 0.1237, + "step": 10295 + }, + { + "epoch": 1.2372372372372373, + "grad_norm": 0.4720557928085327, + "learning_rate": 6.358529922193191e-05, + "loss": 0.0998, + "step": 10300 + }, + { + "epoch": 1.2378378378378379, + "grad_norm": 0.5140436291694641, + "learning_rate": 6.35550321914033e-05, + "loss": 0.1122, + "step": 10305 + }, + { + "epoch": 1.2384384384384384, + "grad_norm": 0.45371970534324646, + "learning_rate": 6.352475979883967e-05, + "loss": 0.1095, + "step": 10310 + }, + { + "epoch": 1.2390390390390391, + "grad_norm": 0.6673846244812012, + "learning_rate": 6.349448205621602e-05, + "loss": 0.1274, + "step": 10315 + }, + { + "epoch": 1.2396396396396396, + "grad_norm": 0.4930076599121094, + "learning_rate": 6.346419897550946e-05, + "loss": 0.1263, + "step": 10320 + }, + { + "epoch": 1.2402402402402402, + "grad_norm": 0.5467943549156189, + "learning_rate": 6.343391056869925e-05, + "loss": 0.1112, + "step": 10325 + }, + { + "epoch": 1.240840840840841, + "grad_norm": 0.5361719131469727, + "learning_rate": 6.340361684776673e-05, + "loss": 0.11, + "step": 10330 + }, + { + "epoch": 1.2414414414414414, + "grad_norm": 0.539135754108429, + "learning_rate": 6.337331782469532e-05, + "loss": 0.1243, + "step": 10335 + }, + { + "epoch": 1.242042042042042, + "grad_norm": 0.4833470582962036, + "learning_rate": 6.334301351147061e-05, + "loss": 0.1153, + "step": 10340 + }, + { + "epoch": 1.2426426426426427, + "grad_norm": 0.49930423498153687, + "learning_rate": 6.331270392008019e-05, + "loss": 0.1257, + "step": 10345 + }, + { + "epoch": 1.2432432432432432, + "grad_norm": 0.47665318846702576, + "learning_rate": 6.328238906251378e-05, + "loss": 0.1163, + "step": 10350 + }, + { + "epoch": 1.2438438438438437, + "grad_norm": 0.5809736847877502, + "learning_rate": 6.325206895076323e-05, + "loss": 0.1286, + "step": 10355 + }, + { + "epoch": 1.2444444444444445, + "grad_norm": 0.5782884955406189, + "learning_rate": 6.32217435968224e-05, + "loss": 
0.1287, + "step": 10360 + }, + { + "epoch": 1.245045045045045, + "grad_norm": 0.6237002611160278, + "learning_rate": 6.319141301268725e-05, + "loss": 0.1359, + "step": 10365 + }, + { + "epoch": 1.2456456456456457, + "grad_norm": 0.5011314153671265, + "learning_rate": 6.316107721035581e-05, + "loss": 0.1022, + "step": 10370 + }, + { + "epoch": 1.2462462462462462, + "grad_norm": 0.4131128191947937, + "learning_rate": 6.313073620182816e-05, + "loss": 0.1088, + "step": 10375 + }, + { + "epoch": 1.2468468468468468, + "grad_norm": 0.5163730978965759, + "learning_rate": 6.310038999910648e-05, + "loss": 0.1046, + "step": 10380 + }, + { + "epoch": 1.2474474474474475, + "grad_norm": 0.4977834224700928, + "learning_rate": 6.307003861419498e-05, + "loss": 0.1283, + "step": 10385 + }, + { + "epoch": 1.248048048048048, + "grad_norm": 0.6062964797019958, + "learning_rate": 6.303968205909985e-05, + "loss": 0.131, + "step": 10390 + }, + { + "epoch": 1.2486486486486488, + "grad_norm": 0.472942978143692, + "learning_rate": 6.300932034582947e-05, + "loss": 0.1219, + "step": 10395 + }, + { + "epoch": 1.2492492492492493, + "grad_norm": 0.4516295790672302, + "learning_rate": 6.297895348639415e-05, + "loss": 0.1202, + "step": 10400 + }, + { + "epoch": 1.2498498498498498, + "grad_norm": 0.5245740413665771, + "learning_rate": 6.294858149280625e-05, + "loss": 0.1097, + "step": 10405 + }, + { + "epoch": 1.2504504504504506, + "grad_norm": 0.5840051770210266, + "learning_rate": 6.291820437708018e-05, + "loss": 0.1359, + "step": 10410 + }, + { + "epoch": 1.251051051051051, + "grad_norm": 0.704037606716156, + "learning_rate": 6.288782215123242e-05, + "loss": 0.1243, + "step": 10415 + }, + { + "epoch": 1.2516516516516516, + "grad_norm": 0.4552038311958313, + "learning_rate": 6.285743482728138e-05, + "loss": 0.139, + "step": 10420 + }, + { + "epoch": 1.2522522522522523, + "grad_norm": 0.5076271891593933, + "learning_rate": 6.282704241724756e-05, + "loss": 0.1306, + "step": 10425 + }, + { + "epoch": 1.2528528528528529, + "grad_norm": 0.514893651008606, + "learning_rate": 6.279664493315343e-05, + "loss": 0.1206, + "step": 10430 + }, + { + "epoch": 1.2534534534534534, + "grad_norm": 0.4690745770931244, + "learning_rate": 6.276624238702347e-05, + "loss": 0.1217, + "step": 10435 + }, + { + "epoch": 1.2540540540540541, + "grad_norm": 0.47772398591041565, + "learning_rate": 6.273583479088422e-05, + "loss": 0.1172, + "step": 10440 + }, + { + "epoch": 1.2546546546546546, + "grad_norm": 0.49891212582588196, + "learning_rate": 6.270542215676411e-05, + "loss": 0.126, + "step": 10445 + }, + { + "epoch": 1.2552552552552552, + "grad_norm": 0.40780380368232727, + "learning_rate": 6.267500449669367e-05, + "loss": 0.12, + "step": 10450 + }, + { + "epoch": 1.255855855855856, + "grad_norm": 0.5549760460853577, + "learning_rate": 6.264458182270536e-05, + "loss": 0.1375, + "step": 10455 + }, + { + "epoch": 1.2564564564564564, + "grad_norm": 0.5111986994743347, + "learning_rate": 6.261415414683365e-05, + "loss": 0.1206, + "step": 10460 + }, + { + "epoch": 1.257057057057057, + "grad_norm": 0.5683580040931702, + "learning_rate": 6.258372148111493e-05, + "loss": 0.127, + "step": 10465 + }, + { + "epoch": 1.2576576576576577, + "grad_norm": 0.4563046991825104, + "learning_rate": 6.255328383758768e-05, + "loss": 0.0997, + "step": 10470 + }, + { + "epoch": 1.2582582582582582, + "grad_norm": 0.5216641426086426, + "learning_rate": 6.25228412282922e-05, + "loss": 0.1185, + "step": 10475 + }, + { + "epoch": 1.2588588588588587, + "grad_norm": 
0.45824185013771057, + "learning_rate": 6.249239366527088e-05, + "loss": 0.1145, + "step": 10480 + }, + { + "epoch": 1.2594594594594595, + "grad_norm": 0.5047664046287537, + "learning_rate": 6.246194116056803e-05, + "loss": 0.1249, + "step": 10485 + }, + { + "epoch": 1.26006006006006, + "grad_norm": 0.4454682767391205, + "learning_rate": 6.243148372622986e-05, + "loss": 0.1012, + "step": 10490 + }, + { + "epoch": 1.2606606606606607, + "grad_norm": 0.5449995994567871, + "learning_rate": 6.240102137430463e-05, + "loss": 0.1177, + "step": 10495 + }, + { + "epoch": 1.2612612612612613, + "grad_norm": 0.4220269024372101, + "learning_rate": 6.237055411684245e-05, + "loss": 0.1177, + "step": 10500 + }, + { + "epoch": 1.2612612612612613, + "eval_loss": 0.13997453451156616, + "eval_runtime": 35.894, + "eval_samples_per_second": 22.288, + "eval_steps_per_second": 5.572, + "step": 10500 + }, + { + "epoch": 1.261861861861862, + "grad_norm": 0.7029174566268921, + "learning_rate": 6.234008196589545e-05, + "loss": 0.1257, + "step": 10505 + }, + { + "epoch": 1.2624624624624625, + "grad_norm": 0.4518882632255554, + "learning_rate": 6.230960493351761e-05, + "loss": 0.1183, + "step": 10510 + }, + { + "epoch": 1.263063063063063, + "grad_norm": 0.46593165397644043, + "learning_rate": 6.227912303176493e-05, + "loss": 0.1118, + "step": 10515 + }, + { + "epoch": 1.2636636636636638, + "grad_norm": 0.5268282294273376, + "learning_rate": 6.224863627269529e-05, + "loss": 0.129, + "step": 10520 + }, + { + "epoch": 1.2642642642642643, + "grad_norm": 0.4126332700252533, + "learning_rate": 6.221814466836848e-05, + "loss": 0.0942, + "step": 10525 + }, + { + "epoch": 1.2648648648648648, + "grad_norm": 0.4374845325946808, + "learning_rate": 6.218764823084624e-05, + "loss": 0.1007, + "step": 10530 + }, + { + "epoch": 1.2654654654654656, + "grad_norm": 0.48468881845474243, + "learning_rate": 6.21571469721922e-05, + "loss": 0.1248, + "step": 10535 + }, + { + "epoch": 1.266066066066066, + "grad_norm": 0.5242211818695068, + "learning_rate": 6.21266409044719e-05, + "loss": 0.1234, + "step": 10540 + }, + { + "epoch": 1.2666666666666666, + "grad_norm": 0.48668068647384644, + "learning_rate": 6.209613003975278e-05, + "loss": 0.111, + "step": 10545 + }, + { + "epoch": 1.2672672672672673, + "grad_norm": 0.5469307899475098, + "learning_rate": 6.206561439010418e-05, + "loss": 0.0973, + "step": 10550 + }, + { + "epoch": 1.2678678678678679, + "grad_norm": 0.5339605212211609, + "learning_rate": 6.203509396759736e-05, + "loss": 0.1105, + "step": 10555 + }, + { + "epoch": 1.2684684684684684, + "grad_norm": 0.49797797203063965, + "learning_rate": 6.200456878430542e-05, + "loss": 0.1043, + "step": 10560 + }, + { + "epoch": 1.2690690690690691, + "grad_norm": 0.442689448595047, + "learning_rate": 6.197403885230337e-05, + "loss": 0.0849, + "step": 10565 + }, + { + "epoch": 1.2696696696696697, + "grad_norm": 0.4138694703578949, + "learning_rate": 6.19435041836681e-05, + "loss": 0.0989, + "step": 10570 + }, + { + "epoch": 1.2702702702702702, + "grad_norm": 0.45859405398368835, + "learning_rate": 6.191296479047837e-05, + "loss": 0.1008, + "step": 10575 + }, + { + "epoch": 1.270870870870871, + "grad_norm": 0.4636090397834778, + "learning_rate": 6.188242068481481e-05, + "loss": 0.116, + "step": 10580 + }, + { + "epoch": 1.2714714714714714, + "grad_norm": 0.48628488183021545, + "learning_rate": 6.185187187875989e-05, + "loss": 0.1138, + "step": 10585 + }, + { + "epoch": 1.272072072072072, + "grad_norm": 0.4805254638195038, + "learning_rate": 
6.182131838439799e-05, + "loss": 0.1153, + "step": 10590 + }, + { + "epoch": 1.2726726726726727, + "grad_norm": 0.4614923298358917, + "learning_rate": 6.17907602138153e-05, + "loss": 0.1061, + "step": 10595 + }, + { + "epoch": 1.2732732732732732, + "grad_norm": 0.523812472820282, + "learning_rate": 6.176019737909989e-05, + "loss": 0.1067, + "step": 10600 + }, + { + "epoch": 1.2738738738738737, + "grad_norm": 0.4345147907733917, + "learning_rate": 6.172962989234162e-05, + "loss": 0.1158, + "step": 10605 + }, + { + "epoch": 1.2744744744744745, + "grad_norm": 0.5423539876937866, + "learning_rate": 6.169905776563229e-05, + "loss": 0.1207, + "step": 10610 + }, + { + "epoch": 1.275075075075075, + "grad_norm": 0.5352399945259094, + "learning_rate": 6.166848101106543e-05, + "loss": 0.1206, + "step": 10615 + }, + { + "epoch": 1.2756756756756757, + "grad_norm": 0.47472965717315674, + "learning_rate": 6.163789964073647e-05, + "loss": 0.1311, + "step": 10620 + }, + { + "epoch": 1.2762762762762763, + "grad_norm": 0.48953425884246826, + "learning_rate": 6.160731366674264e-05, + "loss": 0.1231, + "step": 10625 + }, + { + "epoch": 1.276876876876877, + "grad_norm": 0.565130889415741, + "learning_rate": 6.157672310118297e-05, + "loss": 0.1138, + "step": 10630 + }, + { + "epoch": 1.2774774774774775, + "grad_norm": 0.42137813568115234, + "learning_rate": 6.154612795615836e-05, + "loss": 0.0997, + "step": 10635 + }, + { + "epoch": 1.278078078078078, + "grad_norm": 0.5966997146606445, + "learning_rate": 6.151552824377148e-05, + "loss": 0.1034, + "step": 10640 + }, + { + "epoch": 1.2786786786786788, + "grad_norm": 0.49393218755722046, + "learning_rate": 6.148492397612683e-05, + "loss": 0.1254, + "step": 10645 + }, + { + "epoch": 1.2792792792792793, + "grad_norm": 0.4611142873764038, + "learning_rate": 6.145431516533068e-05, + "loss": 0.1067, + "step": 10650 + }, + { + "epoch": 1.2798798798798798, + "grad_norm": 0.5526403188705444, + "learning_rate": 6.142370182349113e-05, + "loss": 0.1147, + "step": 10655 + }, + { + "epoch": 1.2804804804804806, + "grad_norm": 0.454584002494812, + "learning_rate": 6.139308396271804e-05, + "loss": 0.111, + "step": 10660 + }, + { + "epoch": 1.281081081081081, + "grad_norm": 0.48215189576148987, + "learning_rate": 6.136246159512311e-05, + "loss": 0.1082, + "step": 10665 + }, + { + "epoch": 1.2816816816816816, + "grad_norm": 0.49625515937805176, + "learning_rate": 6.133183473281978e-05, + "loss": 0.0919, + "step": 10670 + }, + { + "epoch": 1.2822822822822824, + "grad_norm": 0.557956337928772, + "learning_rate": 6.130120338792327e-05, + "loss": 0.129, + "step": 10675 + }, + { + "epoch": 1.2828828828828829, + "grad_norm": 0.46650010347366333, + "learning_rate": 6.127056757255059e-05, + "loss": 0.1067, + "step": 10680 + }, + { + "epoch": 1.2834834834834834, + "grad_norm": 0.4905858635902405, + "learning_rate": 6.12399272988205e-05, + "loss": 0.112, + "step": 10685 + }, + { + "epoch": 1.2840840840840841, + "grad_norm": 0.5483079552650452, + "learning_rate": 6.120928257885354e-05, + "loss": 0.0999, + "step": 10690 + }, + { + "epoch": 1.2846846846846847, + "grad_norm": 0.5385778546333313, + "learning_rate": 6.117863342477199e-05, + "loss": 0.1065, + "step": 10695 + }, + { + "epoch": 1.2852852852852852, + "grad_norm": 0.40807682275772095, + "learning_rate": 6.114797984869992e-05, + "loss": 0.1099, + "step": 10700 + }, + { + "epoch": 1.285885885885886, + "grad_norm": 0.5240715742111206, + "learning_rate": 6.11173218627631e-05, + "loss": 0.107, + "step": 10705 + }, + { + "epoch": 
1.2864864864864864, + "grad_norm": 0.6276156306266785, + "learning_rate": 6.108665947908909e-05, + "loss": 0.112, + "step": 10710 + }, + { + "epoch": 1.287087087087087, + "grad_norm": 0.5692849159240723, + "learning_rate": 6.105599270980716e-05, + "loss": 0.1166, + "step": 10715 + }, + { + "epoch": 1.2876876876876877, + "grad_norm": 0.5226588249206543, + "learning_rate": 6.102532156704832e-05, + "loss": 0.0986, + "step": 10720 + }, + { + "epoch": 1.2882882882882882, + "grad_norm": 0.4636688828468323, + "learning_rate": 6.099464606294533e-05, + "loss": 0.104, + "step": 10725 + }, + { + "epoch": 1.2888888888888888, + "grad_norm": 0.5031043887138367, + "learning_rate": 6.096396620963264e-05, + "loss": 0.1117, + "step": 10730 + }, + { + "epoch": 1.2894894894894895, + "grad_norm": 0.5021368861198425, + "learning_rate": 6.093328201924645e-05, + "loss": 0.0998, + "step": 10735 + }, + { + "epoch": 1.29009009009009, + "grad_norm": 0.4384394586086273, + "learning_rate": 6.090259350392468e-05, + "loss": 0.1088, + "step": 10740 + }, + { + "epoch": 1.2906906906906908, + "grad_norm": 0.3991621732711792, + "learning_rate": 6.087190067580691e-05, + "loss": 0.0943, + "step": 10745 + }, + { + "epoch": 1.2912912912912913, + "grad_norm": 0.4799728989601135, + "learning_rate": 6.0841203547034495e-05, + "loss": 0.0976, + "step": 10750 + }, + { + "epoch": 1.2912912912912913, + "eval_loss": 0.12925225496292114, + "eval_runtime": 35.8181, + "eval_samples_per_second": 22.335, + "eval_steps_per_second": 5.584, + "step": 10750 + }, + { + "epoch": 1.291891891891892, + "grad_norm": 0.5584620237350464, + "learning_rate": 6.081050212975047e-05, + "loss": 0.1053, + "step": 10755 + }, + { + "epoch": 1.2924924924924925, + "grad_norm": 0.5148786902427673, + "learning_rate": 6.077979643609952e-05, + "loss": 0.1092, + "step": 10760 + }, + { + "epoch": 1.293093093093093, + "grad_norm": 0.5329381823539734, + "learning_rate": 6.0749086478228066e-05, + "loss": 0.1068, + "step": 10765 + }, + { + "epoch": 1.2936936936936938, + "grad_norm": 0.5270235538482666, + "learning_rate": 6.0718372268284216e-05, + "loss": 0.1021, + "step": 10770 + }, + { + "epoch": 1.2942942942942943, + "grad_norm": 0.41905245184898376, + "learning_rate": 6.068765381841776e-05, + "loss": 0.1232, + "step": 10775 + }, + { + "epoch": 1.2948948948948948, + "grad_norm": 0.4731054902076721, + "learning_rate": 6.065693114078012e-05, + "loss": 0.1049, + "step": 10780 + }, + { + "epoch": 1.2954954954954956, + "grad_norm": 0.4915459454059601, + "learning_rate": 6.062620424752446e-05, + "loss": 0.1081, + "step": 10785 + }, + { + "epoch": 1.296096096096096, + "grad_norm": 0.46217599511146545, + "learning_rate": 6.059547315080557e-05, + "loss": 0.1113, + "step": 10790 + }, + { + "epoch": 1.2966966966966966, + "grad_norm": 0.5027230381965637, + "learning_rate": 6.0564737862779894e-05, + "loss": 0.1014, + "step": 10795 + }, + { + "epoch": 1.2972972972972974, + "grad_norm": 0.49862319231033325, + "learning_rate": 6.053399839560559e-05, + "loss": 0.1044, + "step": 10800 + }, + { + "epoch": 1.297897897897898, + "grad_norm": 0.44932979345321655, + "learning_rate": 6.0503254761442384e-05, + "loss": 0.0887, + "step": 10805 + }, + { + "epoch": 1.2984984984984984, + "grad_norm": 0.5156846046447754, + "learning_rate": 6.0472506972451724e-05, + "loss": 0.1093, + "step": 10810 + }, + { + "epoch": 1.2990990990990992, + "grad_norm": 0.5869203209877014, + "learning_rate": 6.0441755040796676e-05, + "loss": 0.1189, + "step": 10815 + }, + { + "epoch": 1.2996996996996997, + "grad_norm": 
0.5359819531440735, + "learning_rate": 6.041099897864192e-05, + "loss": 0.0929, + "step": 10820 + }, + { + "epoch": 1.3003003003003002, + "grad_norm": 0.42127725481987, + "learning_rate": 6.038023879815382e-05, + "loss": 0.1256, + "step": 10825 + }, + { + "epoch": 1.300900900900901, + "grad_norm": 0.4706318974494934, + "learning_rate": 6.034947451150032e-05, + "loss": 0.1094, + "step": 10830 + }, + { + "epoch": 1.3015015015015015, + "grad_norm": 0.4898918867111206, + "learning_rate": 6.0318706130851024e-05, + "loss": 0.1089, + "step": 10835 + }, + { + "epoch": 1.302102102102102, + "grad_norm": 0.4475489854812622, + "learning_rate": 6.028793366837712e-05, + "loss": 0.0878, + "step": 10840 + }, + { + "epoch": 1.3027027027027027, + "grad_norm": 0.5071420669555664, + "learning_rate": 6.025715713625146e-05, + "loss": 0.1083, + "step": 10845 + }, + { + "epoch": 1.3033033033033032, + "grad_norm": 0.4866783618927002, + "learning_rate": 6.022637654664846e-05, + "loss": 0.1085, + "step": 10850 + }, + { + "epoch": 1.303903903903904, + "grad_norm": 0.5027065277099609, + "learning_rate": 6.019559191174416e-05, + "loss": 0.1021, + "step": 10855 + }, + { + "epoch": 1.3045045045045045, + "grad_norm": 0.5954436659812927, + "learning_rate": 6.016480324371622e-05, + "loss": 0.1194, + "step": 10860 + }, + { + "epoch": 1.305105105105105, + "grad_norm": 0.5378061532974243, + "learning_rate": 6.013401055474384e-05, + "loss": 0.1081, + "step": 10865 + }, + { + "epoch": 1.3057057057057058, + "grad_norm": 0.4428832530975342, + "learning_rate": 6.0103213857007864e-05, + "loss": 0.1146, + "step": 10870 + }, + { + "epoch": 1.3063063063063063, + "grad_norm": 0.6072458028793335, + "learning_rate": 6.00724131626907e-05, + "loss": 0.1112, + "step": 10875 + }, + { + "epoch": 1.306906906906907, + "grad_norm": 0.3962419033050537, + "learning_rate": 6.004160848397635e-05, + "loss": 0.1133, + "step": 10880 + }, + { + "epoch": 1.3075075075075075, + "grad_norm": 0.49185970425605774, + "learning_rate": 6.001079983305037e-05, + "loss": 0.108, + "step": 10885 + }, + { + "epoch": 1.308108108108108, + "grad_norm": 0.5775038003921509, + "learning_rate": 5.9979987222099895e-05, + "loss": 0.1059, + "step": 10890 + }, + { + "epoch": 1.3087087087087088, + "grad_norm": 0.49251505732536316, + "learning_rate": 5.994917066331366e-05, + "loss": 0.108, + "step": 10895 + }, + { + "epoch": 1.3093093093093093, + "grad_norm": 0.5165398716926575, + "learning_rate": 5.9918350168881885e-05, + "loss": 0.1058, + "step": 10900 + }, + { + "epoch": 1.3099099099099099, + "grad_norm": 0.571881890296936, + "learning_rate": 5.988752575099644e-05, + "loss": 0.1248, + "step": 10905 + }, + { + "epoch": 1.3105105105105106, + "grad_norm": 0.4895869791507721, + "learning_rate": 5.985669742185068e-05, + "loss": 0.1187, + "step": 10910 + }, + { + "epoch": 1.3111111111111111, + "grad_norm": 0.467013418674469, + "learning_rate": 5.982586519363954e-05, + "loss": 0.1275, + "step": 10915 + }, + { + "epoch": 1.3117117117117116, + "grad_norm": 0.4548030495643616, + "learning_rate": 5.979502907855945e-05, + "loss": 0.116, + "step": 10920 + }, + { + "epoch": 1.3123123123123124, + "grad_norm": 0.419702410697937, + "learning_rate": 5.976418908880845e-05, + "loss": 0.0964, + "step": 10925 + }, + { + "epoch": 1.312912912912913, + "grad_norm": 0.531001627445221, + "learning_rate": 5.973334523658607e-05, + "loss": 0.0963, + "step": 10930 + }, + { + "epoch": 1.3135135135135134, + "grad_norm": 0.5297698974609375, + "learning_rate": 5.970249753409336e-05, + "loss": 0.1035, + "step": 
10935 + }, + { + "epoch": 1.3141141141141142, + "grad_norm": 0.40090298652648926, + "learning_rate": 5.967164599353293e-05, + "loss": 0.0923, + "step": 10940 + }, + { + "epoch": 1.3147147147147147, + "grad_norm": 0.5055508017539978, + "learning_rate": 5.9640790627108865e-05, + "loss": 0.1082, + "step": 10945 + }, + { + "epoch": 1.3153153153153152, + "grad_norm": 0.5552372932434082, + "learning_rate": 5.96099314470268e-05, + "loss": 0.1268, + "step": 10950 + }, + { + "epoch": 1.315915915915916, + "grad_norm": 0.43672850728034973, + "learning_rate": 5.957906846549385e-05, + "loss": 0.0996, + "step": 10955 + }, + { + "epoch": 1.3165165165165165, + "grad_norm": 0.4018603265285492, + "learning_rate": 5.954820169471864e-05, + "loss": 0.0981, + "step": 10960 + }, + { + "epoch": 1.317117117117117, + "grad_norm": 0.5597862005233765, + "learning_rate": 5.951733114691132e-05, + "loss": 0.109, + "step": 10965 + }, + { + "epoch": 1.3177177177177177, + "grad_norm": 0.6404910683631897, + "learning_rate": 5.948645683428349e-05, + "loss": 0.1129, + "step": 10970 + }, + { + "epoch": 1.3183183183183182, + "grad_norm": 0.3674793243408203, + "learning_rate": 5.945557876904829e-05, + "loss": 0.091, + "step": 10975 + }, + { + "epoch": 1.318918918918919, + "grad_norm": 0.6048736572265625, + "learning_rate": 5.94246969634203e-05, + "loss": 0.1173, + "step": 10980 + }, + { + "epoch": 1.3195195195195195, + "grad_norm": 0.38514524698257446, + "learning_rate": 5.939381142961561e-05, + "loss": 0.0887, + "step": 10985 + }, + { + "epoch": 1.3201201201201203, + "grad_norm": 0.5462285280227661, + "learning_rate": 5.936292217985175e-05, + "loss": 0.0979, + "step": 10990 + }, + { + "epoch": 1.3207207207207208, + "grad_norm": 0.4344751536846161, + "learning_rate": 5.9332029226347776e-05, + "loss": 0.0829, + "step": 10995 + }, + { + "epoch": 1.3213213213213213, + "grad_norm": 0.5517582893371582, + "learning_rate": 5.930113258132415e-05, + "loss": 0.1085, + "step": 11000 + }, + { + "epoch": 1.3213213213213213, + "eval_loss": 0.12636400759220123, + "eval_runtime": 35.8467, + "eval_samples_per_second": 22.317, + "eval_steps_per_second": 5.579, + "step": 11000 + }, + { + "epoch": 1.321921921921922, + "grad_norm": 0.622289776802063, + "learning_rate": 5.927023225700282e-05, + "loss": 0.1156, + "step": 11005 + }, + { + "epoch": 1.3225225225225226, + "grad_norm": 0.544995129108429, + "learning_rate": 5.9239328265607195e-05, + "loss": 0.1059, + "step": 11010 + }, + { + "epoch": 1.323123123123123, + "grad_norm": 0.4979136288166046, + "learning_rate": 5.9208420619362135e-05, + "loss": 0.1104, + "step": 11015 + }, + { + "epoch": 1.3237237237237238, + "grad_norm": 0.5703704357147217, + "learning_rate": 5.917750933049393e-05, + "loss": 0.0891, + "step": 11020 + }, + { + "epoch": 1.3243243243243243, + "grad_norm": 0.4206981062889099, + "learning_rate": 5.914659441123032e-05, + "loss": 0.1, + "step": 11025 + }, + { + "epoch": 1.3249249249249249, + "grad_norm": 0.5549303293228149, + "learning_rate": 5.911567587380048e-05, + "loss": 0.1016, + "step": 11030 + }, + { + "epoch": 1.3255255255255256, + "grad_norm": 0.45054033398628235, + "learning_rate": 5.908475373043504e-05, + "loss": 0.0935, + "step": 11035 + }, + { + "epoch": 1.3261261261261261, + "grad_norm": 0.46710655093193054, + "learning_rate": 5.905382799336601e-05, + "loss": 0.1064, + "step": 11040 + }, + { + "epoch": 1.3267267267267266, + "grad_norm": 0.4755312204360962, + "learning_rate": 5.902289867482684e-05, + "loss": 0.0951, + "step": 11045 + }, + { + "epoch": 1.3273273273273274, + 
"grad_norm": 0.5991599559783936, + "learning_rate": 5.899196578705244e-05, + "loss": 0.1126, + "step": 11050 + }, + { + "epoch": 1.327927927927928, + "grad_norm": 0.4284519553184509, + "learning_rate": 5.896102934227905e-05, + "loss": 0.1034, + "step": 11055 + }, + { + "epoch": 1.3285285285285284, + "grad_norm": 0.5215682983398438, + "learning_rate": 5.893008935274438e-05, + "loss": 0.1146, + "step": 11060 + }, + { + "epoch": 1.3291291291291292, + "grad_norm": 0.38606658577919006, + "learning_rate": 5.889914583068752e-05, + "loss": 0.1045, + "step": 11065 + }, + { + "epoch": 1.3297297297297297, + "grad_norm": 0.5296543836593628, + "learning_rate": 5.886819878834898e-05, + "loss": 0.097, + "step": 11070 + }, + { + "epoch": 1.3303303303303302, + "grad_norm": 0.4166487455368042, + "learning_rate": 5.8837248237970624e-05, + "loss": 0.1028, + "step": 11075 + }, + { + "epoch": 1.330930930930931, + "grad_norm": 0.43563294410705566, + "learning_rate": 5.880629419179573e-05, + "loss": 0.1033, + "step": 11080 + }, + { + "epoch": 1.3315315315315315, + "grad_norm": 0.5101214051246643, + "learning_rate": 5.8775336662068936e-05, + "loss": 0.1046, + "step": 11085 + }, + { + "epoch": 1.332132132132132, + "grad_norm": 0.6170099973678589, + "learning_rate": 5.87443756610363e-05, + "loss": 0.0988, + "step": 11090 + }, + { + "epoch": 1.3327327327327327, + "grad_norm": 0.5087183117866516, + "learning_rate": 5.871341120094522e-05, + "loss": 0.1191, + "step": 11095 + }, + { + "epoch": 1.3333333333333333, + "grad_norm": 0.5283128619194031, + "learning_rate": 5.8682443294044455e-05, + "loss": 0.0971, + "step": 11100 + }, + { + "epoch": 1.333933933933934, + "grad_norm": 0.4020695984363556, + "learning_rate": 5.8651471952584155e-05, + "loss": 0.1006, + "step": 11105 + }, + { + "epoch": 1.3345345345345345, + "grad_norm": 0.5804185271263123, + "learning_rate": 5.8620497188815805e-05, + "loss": 0.1019, + "step": 11110 + }, + { + "epoch": 1.3351351351351353, + "grad_norm": 0.5557799935340881, + "learning_rate": 5.858951901499228e-05, + "loss": 0.0986, + "step": 11115 + }, + { + "epoch": 1.3357357357357358, + "grad_norm": 0.5129476189613342, + "learning_rate": 5.8558537443367734e-05, + "loss": 0.096, + "step": 11120 + }, + { + "epoch": 1.3363363363363363, + "grad_norm": 0.47615566849708557, + "learning_rate": 5.8527552486197746e-05, + "loss": 0.1057, + "step": 11125 + }, + { + "epoch": 1.336936936936937, + "grad_norm": 0.4600653052330017, + "learning_rate": 5.84965641557392e-05, + "loss": 0.0856, + "step": 11130 + }, + { + "epoch": 1.3375375375375376, + "grad_norm": 0.3756698668003082, + "learning_rate": 5.846557246425028e-05, + "loss": 0.085, + "step": 11135 + }, + { + "epoch": 1.338138138138138, + "grad_norm": 0.40878963470458984, + "learning_rate": 5.8434577423990577e-05, + "loss": 0.09, + "step": 11140 + }, + { + "epoch": 1.3387387387387388, + "grad_norm": 0.5921834111213684, + "learning_rate": 5.8403579047220915e-05, + "loss": 0.0947, + "step": 11145 + }, + { + "epoch": 1.3393393393393394, + "grad_norm": 0.4594131112098694, + "learning_rate": 5.8372577346203515e-05, + "loss": 0.0899, + "step": 11150 + }, + { + "epoch": 1.3399399399399399, + "grad_norm": 0.44041335582733154, + "learning_rate": 5.834157233320186e-05, + "loss": 0.0919, + "step": 11155 + }, + { + "epoch": 1.3405405405405406, + "grad_norm": 0.5850462317466736, + "learning_rate": 5.8310564020480774e-05, + "loss": 0.0771, + "step": 11160 + }, + { + "epoch": 1.3411411411411411, + "grad_norm": 0.545671284198761, + "learning_rate": 5.8279552420306394e-05, + 
"loss": 0.0919, + "step": 11165 + }, + { + "epoch": 1.3417417417417417, + "grad_norm": 0.44084644317626953, + "learning_rate": 5.824853754494611e-05, + "loss": 0.0894, + "step": 11170 + }, + { + "epoch": 1.3423423423423424, + "grad_norm": 0.49509671330451965, + "learning_rate": 5.821751940666867e-05, + "loss": 0.1111, + "step": 11175 + }, + { + "epoch": 1.342942942942943, + "grad_norm": 0.5806544423103333, + "learning_rate": 5.8186498017744063e-05, + "loss": 0.116, + "step": 11180 + }, + { + "epoch": 1.3435435435435434, + "grad_norm": 0.46228882670402527, + "learning_rate": 5.815547339044359e-05, + "loss": 0.0919, + "step": 11185 + }, + { + "epoch": 1.3441441441441442, + "grad_norm": 0.4910290539264679, + "learning_rate": 5.8124445537039826e-05, + "loss": 0.1098, + "step": 11190 + }, + { + "epoch": 1.3447447447447447, + "grad_norm": 0.5441747903823853, + "learning_rate": 5.809341446980661e-05, + "loss": 0.101, + "step": 11195 + }, + { + "epoch": 1.3453453453453452, + "grad_norm": 0.4423178434371948, + "learning_rate": 5.8062380201019086e-05, + "loss": 0.1156, + "step": 11200 + }, + { + "epoch": 1.345945945945946, + "grad_norm": 0.5379671454429626, + "learning_rate": 5.80313427429536e-05, + "loss": 0.1028, + "step": 11205 + }, + { + "epoch": 1.3465465465465465, + "grad_norm": 0.572998046875, + "learning_rate": 5.800030210788785e-05, + "loss": 0.0994, + "step": 11210 + }, + { + "epoch": 1.3471471471471472, + "grad_norm": 0.5262696146965027, + "learning_rate": 5.7969258308100705e-05, + "loss": 0.0943, + "step": 11215 + }, + { + "epoch": 1.3477477477477477, + "grad_norm": 0.5349236130714417, + "learning_rate": 5.793821135587235e-05, + "loss": 0.1126, + "step": 11220 + }, + { + "epoch": 1.3483483483483483, + "grad_norm": 0.6178399920463562, + "learning_rate": 5.790716126348417e-05, + "loss": 0.099, + "step": 11225 + }, + { + "epoch": 1.348948948948949, + "grad_norm": 0.5026535987854004, + "learning_rate": 5.7876108043218835e-05, + "loss": 0.1036, + "step": 11230 + }, + { + "epoch": 1.3495495495495495, + "grad_norm": 0.45836958289146423, + "learning_rate": 5.784505170736022e-05, + "loss": 0.0998, + "step": 11235 + }, + { + "epoch": 1.3501501501501503, + "grad_norm": 0.534871518611908, + "learning_rate": 5.781399226819342e-05, + "loss": 0.1069, + "step": 11240 + }, + { + "epoch": 1.3507507507507508, + "grad_norm": 0.33303356170654297, + "learning_rate": 5.778292973800482e-05, + "loss": 0.0877, + "step": 11245 + }, + { + "epoch": 1.3513513513513513, + "grad_norm": 0.5539909601211548, + "learning_rate": 5.7751864129081946e-05, + "loss": 0.0913, + "step": 11250 + }, + { + "epoch": 1.3513513513513513, + "eval_loss": 0.11916637420654297, + "eval_runtime": 35.8503, + "eval_samples_per_second": 22.315, + "eval_steps_per_second": 5.579, + "step": 11250 + }, + { + "epoch": 1.351951951951952, + "grad_norm": 0.4414505362510681, + "learning_rate": 5.772079545371363e-05, + "loss": 0.0952, + "step": 11255 + }, + { + "epoch": 1.3525525525525526, + "grad_norm": 0.4687088429927826, + "learning_rate": 5.768972372418981e-05, + "loss": 0.0853, + "step": 11260 + }, + { + "epoch": 1.353153153153153, + "grad_norm": 0.5335027575492859, + "learning_rate": 5.765864895280175e-05, + "loss": 0.0925, + "step": 11265 + }, + { + "epoch": 1.3537537537537538, + "grad_norm": 0.46961474418640137, + "learning_rate": 5.7627571151841855e-05, + "loss": 0.0987, + "step": 11270 + }, + { + "epoch": 1.3543543543543544, + "grad_norm": 0.43510282039642334, + "learning_rate": 5.759649033360369e-05, + "loss": 0.1019, + "step": 11275 + }, + { + 
"epoch": 1.3549549549549549, + "grad_norm": 0.4986173212528229, + "learning_rate": 5.7565406510382094e-05, + "loss": 0.0981, + "step": 11280 + }, + { + "epoch": 1.3555555555555556, + "grad_norm": 0.610945999622345, + "learning_rate": 5.753431969447305e-05, + "loss": 0.1075, + "step": 11285 + }, + { + "epoch": 1.3561561561561561, + "grad_norm": 0.4827207326889038, + "learning_rate": 5.750322989817373e-05, + "loss": 0.0872, + "step": 11290 + }, + { + "epoch": 1.3567567567567567, + "grad_norm": 0.5450891852378845, + "learning_rate": 5.747213713378248e-05, + "loss": 0.1096, + "step": 11295 + }, + { + "epoch": 1.3573573573573574, + "grad_norm": 0.5981200337409973, + "learning_rate": 5.7441041413598815e-05, + "loss": 0.0983, + "step": 11300 + }, + { + "epoch": 1.357957957957958, + "grad_norm": 0.40616247057914734, + "learning_rate": 5.740994274992348e-05, + "loss": 0.089, + "step": 11305 + }, + { + "epoch": 1.3585585585585584, + "grad_norm": 0.45179131627082825, + "learning_rate": 5.737884115505829e-05, + "loss": 0.0843, + "step": 11310 + }, + { + "epoch": 1.3591591591591592, + "grad_norm": 0.6250925064086914, + "learning_rate": 5.73477366413063e-05, + "loss": 0.1029, + "step": 11315 + }, + { + "epoch": 1.3597597597597597, + "grad_norm": 0.4998907744884491, + "learning_rate": 5.731662922097165e-05, + "loss": 0.093, + "step": 11320 + }, + { + "epoch": 1.3603603603603602, + "grad_norm": 0.47104600071907043, + "learning_rate": 5.7285518906359706e-05, + "loss": 0.0964, + "step": 11325 + }, + { + "epoch": 1.360960960960961, + "grad_norm": 0.439411461353302, + "learning_rate": 5.72544057097769e-05, + "loss": 0.0895, + "step": 11330 + }, + { + "epoch": 1.3615615615615615, + "grad_norm": 0.6331053972244263, + "learning_rate": 5.722328964353085e-05, + "loss": 0.1038, + "step": 11335 + }, + { + "epoch": 1.3621621621621622, + "grad_norm": 0.47725340723991394, + "learning_rate": 5.719217071993033e-05, + "loss": 0.0896, + "step": 11340 + }, + { + "epoch": 1.3627627627627628, + "grad_norm": 0.44108104705810547, + "learning_rate": 5.716104895128518e-05, + "loss": 0.081, + "step": 11345 + }, + { + "epoch": 1.3633633633633635, + "grad_norm": 0.4921906888484955, + "learning_rate": 5.712992434990642e-05, + "loss": 0.0881, + "step": 11350 + }, + { + "epoch": 1.363963963963964, + "grad_norm": 0.5186919569969177, + "learning_rate": 5.7098796928106156e-05, + "loss": 0.1097, + "step": 11355 + }, + { + "epoch": 1.3645645645645645, + "grad_norm": 0.49276605248451233, + "learning_rate": 5.7067666698197654e-05, + "loss": 0.0965, + "step": 11360 + }, + { + "epoch": 1.3651651651651653, + "grad_norm": 0.4950788915157318, + "learning_rate": 5.703653367249522e-05, + "loss": 0.0912, + "step": 11365 + }, + { + "epoch": 1.3657657657657658, + "grad_norm": 0.588308572769165, + "learning_rate": 5.700539786331436e-05, + "loss": 0.1028, + "step": 11370 + }, + { + "epoch": 1.3663663663663663, + "grad_norm": 0.4782957434654236, + "learning_rate": 5.6974259282971585e-05, + "loss": 0.0753, + "step": 11375 + }, + { + "epoch": 1.366966966966967, + "grad_norm": 0.5269047617912292, + "learning_rate": 5.6943117943784554e-05, + "loss": 0.0916, + "step": 11380 + }, + { + "epoch": 1.3675675675675676, + "grad_norm": 0.4562937915325165, + "learning_rate": 5.691197385807203e-05, + "loss": 0.0996, + "step": 11385 + }, + { + "epoch": 1.368168168168168, + "grad_norm": 0.45184147357940674, + "learning_rate": 5.688082703815382e-05, + "loss": 0.0858, + "step": 11390 + }, + { + "epoch": 1.3687687687687689, + "grad_norm": 0.5662463307380676, + 
"learning_rate": 5.684967749635085e-05, + "loss": 0.0923, + "step": 11395 + }, + { + "epoch": 1.3693693693693694, + "grad_norm": 0.5363159775733948, + "learning_rate": 5.6818525244985096e-05, + "loss": 0.0989, + "step": 11400 + }, + { + "epoch": 1.36996996996997, + "grad_norm": 0.47409459948539734, + "learning_rate": 5.6787370296379616e-05, + "loss": 0.0816, + "step": 11405 + }, + { + "epoch": 1.3705705705705706, + "grad_norm": 0.4212060868740082, + "learning_rate": 5.675621266285855e-05, + "loss": 0.0909, + "step": 11410 + }, + { + "epoch": 1.3711711711711712, + "grad_norm": 0.5455527305603027, + "learning_rate": 5.6725052356747074e-05, + "loss": 0.0888, + "step": 11415 + }, + { + "epoch": 1.3717717717717717, + "grad_norm": 0.5372433662414551, + "learning_rate": 5.669388939037146e-05, + "loss": 0.0899, + "step": 11420 + }, + { + "epoch": 1.3723723723723724, + "grad_norm": 0.4447285830974579, + "learning_rate": 5.666272377605897e-05, + "loss": 0.0709, + "step": 11425 + }, + { + "epoch": 1.372972972972973, + "grad_norm": 0.46429330110549927, + "learning_rate": 5.663155552613797e-05, + "loss": 0.0814, + "step": 11430 + }, + { + "epoch": 1.3735735735735735, + "grad_norm": 0.5037614107131958, + "learning_rate": 5.660038465293782e-05, + "loss": 0.0894, + "step": 11435 + }, + { + "epoch": 1.3741741741741742, + "grad_norm": 0.5420588850975037, + "learning_rate": 5.656921116878897e-05, + "loss": 0.0943, + "step": 11440 + }, + { + "epoch": 1.3747747747747747, + "grad_norm": 0.4849311113357544, + "learning_rate": 5.6538035086022886e-05, + "loss": 0.0971, + "step": 11445 + }, + { + "epoch": 1.3753753753753752, + "grad_norm": 0.5062885284423828, + "learning_rate": 5.650685641697203e-05, + "loss": 0.089, + "step": 11450 + }, + { + "epoch": 1.375975975975976, + "grad_norm": 0.5417320132255554, + "learning_rate": 5.647567517396993e-05, + "loss": 0.1063, + "step": 11455 + }, + { + "epoch": 1.3765765765765765, + "grad_norm": 0.5244467258453369, + "learning_rate": 5.64444913693511e-05, + "loss": 0.0966, + "step": 11460 + }, + { + "epoch": 1.3771771771771772, + "grad_norm": 0.5427935719490051, + "learning_rate": 5.641330501545111e-05, + "loss": 0.1022, + "step": 11465 + }, + { + "epoch": 1.3777777777777778, + "grad_norm": 0.4481619596481323, + "learning_rate": 5.6382116124606475e-05, + "loss": 0.0845, + "step": 11470 + }, + { + "epoch": 1.3783783783783785, + "grad_norm": 0.5331611037254333, + "learning_rate": 5.635092470915476e-05, + "loss": 0.0923, + "step": 11475 + }, + { + "epoch": 1.378978978978979, + "grad_norm": 0.44660672545433044, + "learning_rate": 5.631973078143452e-05, + "loss": 0.0995, + "step": 11480 + }, + { + "epoch": 1.3795795795795796, + "grad_norm": 0.40920501947402954, + "learning_rate": 5.628853435378528e-05, + "loss": 0.0844, + "step": 11485 + }, + { + "epoch": 1.3801801801801803, + "grad_norm": 0.4937179386615753, + "learning_rate": 5.625733543854762e-05, + "loss": 0.1036, + "step": 11490 + }, + { + "epoch": 1.3807807807807808, + "grad_norm": 0.4166986048221588, + "learning_rate": 5.622613404806301e-05, + "loss": 0.0855, + "step": 11495 + }, + { + "epoch": 1.3813813813813813, + "grad_norm": 0.45914360880851746, + "learning_rate": 5.619493019467397e-05, + "loss": 0.0791, + "step": 11500 + }, + { + "epoch": 1.3813813813813813, + "eval_loss": 0.11311966925859451, + "eval_runtime": 35.9989, + "eval_samples_per_second": 22.223, + "eval_steps_per_second": 5.556, + "step": 11500 + }, + { + "epoch": 1.381981981981982, + "grad_norm": 0.5590953826904297, + "learning_rate": 5.6163723890723966e-05, 
+ "loss": 0.0941, + "step": 11505 + }, + { + "epoch": 1.3825825825825826, + "grad_norm": 0.5889363884925842, + "learning_rate": 5.613251514855744e-05, + "loss": 0.1024, + "step": 11510 + }, + { + "epoch": 1.3831831831831831, + "grad_norm": 0.4771200120449066, + "learning_rate": 5.61013039805198e-05, + "loss": 0.0995, + "step": 11515 + }, + { + "epoch": 1.3837837837837839, + "grad_norm": 0.5556389689445496, + "learning_rate": 5.607009039895742e-05, + "loss": 0.097, + "step": 11520 + }, + { + "epoch": 1.3843843843843844, + "grad_norm": 0.468735009431839, + "learning_rate": 5.60388744162176e-05, + "loss": 0.0982, + "step": 11525 + }, + { + "epoch": 1.384984984984985, + "grad_norm": 0.548349142074585, + "learning_rate": 5.600765604464861e-05, + "loss": 0.1022, + "step": 11530 + }, + { + "epoch": 1.3855855855855856, + "grad_norm": 0.470237135887146, + "learning_rate": 5.59764352965997e-05, + "loss": 0.0873, + "step": 11535 + }, + { + "epoch": 1.3861861861861862, + "grad_norm": 0.47610995173454285, + "learning_rate": 5.594521218442097e-05, + "loss": 0.0953, + "step": 11540 + }, + { + "epoch": 1.3867867867867867, + "grad_norm": 0.5809658169746399, + "learning_rate": 5.5913986720463554e-05, + "loss": 0.1012, + "step": 11545 + }, + { + "epoch": 1.3873873873873874, + "grad_norm": 0.5556275248527527, + "learning_rate": 5.588275891707946e-05, + "loss": 0.0864, + "step": 11550 + }, + { + "epoch": 1.387987987987988, + "grad_norm": 0.4226778447628021, + "learning_rate": 5.585152878662161e-05, + "loss": 0.0802, + "step": 11555 + }, + { + "epoch": 1.3885885885885885, + "grad_norm": 0.5059261322021484, + "learning_rate": 5.5820296341443915e-05, + "loss": 0.0972, + "step": 11560 + }, + { + "epoch": 1.3891891891891892, + "grad_norm": 0.5257276892662048, + "learning_rate": 5.5789061593901126e-05, + "loss": 0.0818, + "step": 11565 + }, + { + "epoch": 1.3897897897897897, + "grad_norm": 0.4813799560070038, + "learning_rate": 5.575782455634895e-05, + "loss": 0.0833, + "step": 11570 + }, + { + "epoch": 1.3903903903903903, + "grad_norm": 0.4413239061832428, + "learning_rate": 5.572658524114396e-05, + "loss": 0.0911, + "step": 11575 + }, + { + "epoch": 1.390990990990991, + "grad_norm": 0.5474624633789062, + "learning_rate": 5.569534366064367e-05, + "loss": 0.0936, + "step": 11580 + }, + { + "epoch": 1.3915915915915915, + "grad_norm": 0.4176657795906067, + "learning_rate": 5.566409982720649e-05, + "loss": 0.0978, + "step": 11585 + }, + { + "epoch": 1.3921921921921923, + "grad_norm": 0.40932703018188477, + "learning_rate": 5.56328537531917e-05, + "loss": 0.1017, + "step": 11590 + }, + { + "epoch": 1.3927927927927928, + "grad_norm": 0.5456036925315857, + "learning_rate": 5.560160545095945e-05, + "loss": 0.0892, + "step": 11595 + }, + { + "epoch": 1.3933933933933935, + "grad_norm": 0.44474583864212036, + "learning_rate": 5.557035493287082e-05, + "loss": 0.1029, + "step": 11600 + }, + { + "epoch": 1.393993993993994, + "grad_norm": 0.5171622633934021, + "learning_rate": 5.5539102211287744e-05, + "loss": 0.0941, + "step": 11605 + }, + { + "epoch": 1.3945945945945946, + "grad_norm": 0.4478524923324585, + "learning_rate": 5.5507847298573015e-05, + "loss": 0.085, + "step": 11610 + }, + { + "epoch": 1.3951951951951953, + "grad_norm": 0.3781989514827728, + "learning_rate": 5.547659020709028e-05, + "loss": 0.0875, + "step": 11615 + }, + { + "epoch": 1.3957957957957958, + "grad_norm": 0.5028838515281677, + "learning_rate": 5.544533094920411e-05, + "loss": 0.0769, + "step": 11620 + }, + { + "epoch": 1.3963963963963963, + 
"grad_norm": 0.4439200758934021, + "learning_rate": 5.541406953727987e-05, + "loss": 0.1135, + "step": 11625 + }, + { + "epoch": 1.396996996996997, + "grad_norm": 0.43673768639564514, + "learning_rate": 5.538280598368382e-05, + "loss": 0.0711, + "step": 11630 + }, + { + "epoch": 1.3975975975975976, + "grad_norm": 0.4030311703681946, + "learning_rate": 5.5351540300783e-05, + "loss": 0.1047, + "step": 11635 + }, + { + "epoch": 1.3981981981981981, + "grad_norm": 0.36829933524131775, + "learning_rate": 5.532027250094539e-05, + "loss": 0.0877, + "step": 11640 + }, + { + "epoch": 1.3987987987987989, + "grad_norm": 0.5318426489830017, + "learning_rate": 5.528900259653975e-05, + "loss": 0.0978, + "step": 11645 + }, + { + "epoch": 1.3993993993993994, + "grad_norm": 0.45535722374916077, + "learning_rate": 5.525773059993566e-05, + "loss": 0.0831, + "step": 11650 + }, + { + "epoch": 1.4, + "grad_norm": 0.4648362398147583, + "learning_rate": 5.522645652350357e-05, + "loss": 0.0882, + "step": 11655 + }, + { + "epoch": 1.4006006006006007, + "grad_norm": 0.4772687256336212, + "learning_rate": 5.519518037961471e-05, + "loss": 0.0894, + "step": 11660 + }, + { + "epoch": 1.4012012012012012, + "grad_norm": 0.40186789631843567, + "learning_rate": 5.516390218064115e-05, + "loss": 0.0895, + "step": 11665 + }, + { + "epoch": 1.4018018018018017, + "grad_norm": 0.630961000919342, + "learning_rate": 5.5132621938955774e-05, + "loss": 0.1021, + "step": 11670 + }, + { + "epoch": 1.4024024024024024, + "grad_norm": 0.5480983853340149, + "learning_rate": 5.510133966693227e-05, + "loss": 0.0921, + "step": 11675 + }, + { + "epoch": 1.403003003003003, + "grad_norm": 0.395860493183136, + "learning_rate": 5.507005537694515e-05, + "loss": 0.0875, + "step": 11680 + }, + { + "epoch": 1.4036036036036035, + "grad_norm": 0.454228013753891, + "learning_rate": 5.5038769081369665e-05, + "loss": 0.0784, + "step": 11685 + }, + { + "epoch": 1.4042042042042042, + "grad_norm": 0.42922738194465637, + "learning_rate": 5.5007480792581946e-05, + "loss": 0.088, + "step": 11690 + }, + { + "epoch": 1.4048048048048047, + "grad_norm": 0.5611411929130554, + "learning_rate": 5.497619052295882e-05, + "loss": 0.0848, + "step": 11695 + }, + { + "epoch": 1.4054054054054055, + "grad_norm": 0.5752083659172058, + "learning_rate": 5.4944898284877974e-05, + "loss": 0.08, + "step": 11700 + }, + { + "epoch": 1.406006006006006, + "grad_norm": 0.6316119432449341, + "learning_rate": 5.491360409071784e-05, + "loss": 0.0837, + "step": 11705 + }, + { + "epoch": 1.4066066066066065, + "grad_norm": 0.527634859085083, + "learning_rate": 5.4882307952857605e-05, + "loss": 0.0815, + "step": 11710 + }, + { + "epoch": 1.4072072072072073, + "grad_norm": 0.5850387811660767, + "learning_rate": 5.4851009883677265e-05, + "loss": 0.0875, + "step": 11715 + }, + { + "epoch": 1.4078078078078078, + "grad_norm": 0.4735187888145447, + "learning_rate": 5.4819709895557545e-05, + "loss": 0.0892, + "step": 11720 + }, + { + "epoch": 1.4084084084084085, + "grad_norm": 0.4472479820251465, + "learning_rate": 5.4788408000879966e-05, + "loss": 0.0829, + "step": 11725 + }, + { + "epoch": 1.409009009009009, + "grad_norm": 0.49625205993652344, + "learning_rate": 5.4757104212026755e-05, + "loss": 0.0825, + "step": 11730 + }, + { + "epoch": 1.4096096096096096, + "grad_norm": 0.515315592288971, + "learning_rate": 5.472579854138096e-05, + "loss": 0.0985, + "step": 11735 + }, + { + "epoch": 1.4102102102102103, + "grad_norm": 0.4145263135433197, + "learning_rate": 5.4694491001326276e-05, + "loss": 0.0919, + 
"step": 11740 + }, + { + "epoch": 1.4108108108108108, + "grad_norm": 0.4036807715892792, + "learning_rate": 5.4663181604247226e-05, + "loss": 0.0929, + "step": 11745 + }, + { + "epoch": 1.4114114114114114, + "grad_norm": 0.4930163621902466, + "learning_rate": 5.463187036252902e-05, + "loss": 0.0922, + "step": 11750 + }, + { + "epoch": 1.4114114114114114, + "eval_loss": 0.10833635926246643, + "eval_runtime": 35.806, + "eval_samples_per_second": 22.343, + "eval_steps_per_second": 5.586, + "step": 11750 + }, + { + "epoch": 1.412012012012012, + "grad_norm": 0.5043427348136902, + "learning_rate": 5.4600557288557606e-05, + "loss": 0.0835, + "step": 11755 + }, + { + "epoch": 1.4126126126126126, + "grad_norm": 0.35307490825653076, + "learning_rate": 5.456924239471968e-05, + "loss": 0.0712, + "step": 11760 + }, + { + "epoch": 1.4132132132132131, + "grad_norm": 0.4506548047065735, + "learning_rate": 5.4537925693402604e-05, + "loss": 0.0996, + "step": 11765 + }, + { + "epoch": 1.4138138138138139, + "grad_norm": 0.5748133063316345, + "learning_rate": 5.450660719699452e-05, + "loss": 0.1143, + "step": 11770 + }, + { + "epoch": 1.4144144144144144, + "grad_norm": 0.4180302917957306, + "learning_rate": 5.4475286917884236e-05, + "loss": 0.0895, + "step": 11775 + }, + { + "epoch": 1.415015015015015, + "grad_norm": 0.43286964297294617, + "learning_rate": 5.4443964868461286e-05, + "loss": 0.087, + "step": 11780 + }, + { + "epoch": 1.4156156156156157, + "grad_norm": 0.46737971901893616, + "learning_rate": 5.441264106111589e-05, + "loss": 0.0906, + "step": 11785 + }, + { + "epoch": 1.4162162162162162, + "grad_norm": 0.3804052174091339, + "learning_rate": 5.4381315508238974e-05, + "loss": 0.0865, + "step": 11790 + }, + { + "epoch": 1.4168168168168167, + "grad_norm": 0.5925003886222839, + "learning_rate": 5.434998822222215e-05, + "loss": 0.0968, + "step": 11795 + }, + { + "epoch": 1.4174174174174174, + "grad_norm": 0.41046711802482605, + "learning_rate": 5.4318659215457724e-05, + "loss": 0.0826, + "step": 11800 + }, + { + "epoch": 1.418018018018018, + "grad_norm": 0.4627430737018585, + "learning_rate": 5.428732850033866e-05, + "loss": 0.0902, + "step": 11805 + }, + { + "epoch": 1.4186186186186185, + "grad_norm": 0.4553092420101166, + "learning_rate": 5.4255996089258624e-05, + "loss": 0.0904, + "step": 11810 + }, + { + "epoch": 1.4192192192192192, + "grad_norm": 0.36503249406814575, + "learning_rate": 5.4224661994611934e-05, + "loss": 0.0669, + "step": 11815 + }, + { + "epoch": 1.4198198198198198, + "grad_norm": 0.4996648132801056, + "learning_rate": 5.4193326228793593e-05, + "loss": 0.0908, + "step": 11820 + }, + { + "epoch": 1.4204204204204205, + "grad_norm": 0.4459191858768463, + "learning_rate": 5.416198880419924e-05, + "loss": 0.0777, + "step": 11825 + }, + { + "epoch": 1.421021021021021, + "grad_norm": 0.4893633723258972, + "learning_rate": 5.41306497332252e-05, + "loss": 0.1203, + "step": 11830 + }, + { + "epoch": 1.4216216216216218, + "grad_norm": 0.4804662764072418, + "learning_rate": 5.409930902826842e-05, + "loss": 0.0859, + "step": 11835 + }, + { + "epoch": 1.4222222222222223, + "grad_norm": 0.46695834398269653, + "learning_rate": 5.406796670172651e-05, + "loss": 0.0825, + "step": 11840 + }, + { + "epoch": 1.4228228228228228, + "grad_norm": 0.5017839074134827, + "learning_rate": 5.4036622765997736e-05, + "loss": 0.0821, + "step": 11845 + }, + { + "epoch": 1.4234234234234235, + "grad_norm": 0.4748544692993164, + "learning_rate": 5.4005277233480945e-05, + "loss": 0.0933, + "step": 11850 + }, + { + 
"epoch": 1.424024024024024, + "grad_norm": 0.5358333587646484, + "learning_rate": 5.397393011657569e-05, + "loss": 0.0891, + "step": 11855 + }, + { + "epoch": 1.4246246246246246, + "grad_norm": 0.616308331489563, + "learning_rate": 5.394258142768208e-05, + "loss": 0.1044, + "step": 11860 + }, + { + "epoch": 1.4252252252252253, + "grad_norm": 0.4904516637325287, + "learning_rate": 5.3911231179200924e-05, + "loss": 0.082, + "step": 11865 + }, + { + "epoch": 1.4258258258258258, + "grad_norm": 0.4936327040195465, + "learning_rate": 5.387987938353356e-05, + "loss": 0.0889, + "step": 11870 + }, + { + "epoch": 1.4264264264264264, + "grad_norm": 0.5046936869621277, + "learning_rate": 5.384852605308202e-05, + "loss": 0.0868, + "step": 11875 + }, + { + "epoch": 1.427027027027027, + "grad_norm": 0.3366491198539734, + "learning_rate": 5.381717120024886e-05, + "loss": 0.0926, + "step": 11880 + }, + { + "epoch": 1.4276276276276276, + "grad_norm": 0.5158485770225525, + "learning_rate": 5.378581483743732e-05, + "loss": 0.0872, + "step": 11885 + }, + { + "epoch": 1.4282282282282281, + "grad_norm": 0.4886568784713745, + "learning_rate": 5.3754456977051205e-05, + "loss": 0.0812, + "step": 11890 + }, + { + "epoch": 1.428828828828829, + "grad_norm": 0.500196099281311, + "learning_rate": 5.372309763149487e-05, + "loss": 0.0782, + "step": 11895 + }, + { + "epoch": 1.4294294294294294, + "grad_norm": 0.588158905506134, + "learning_rate": 5.369173681317333e-05, + "loss": 0.0908, + "step": 11900 + }, + { + "epoch": 1.43003003003003, + "grad_norm": 0.3685852587223053, + "learning_rate": 5.366037453449213e-05, + "loss": 0.0759, + "step": 11905 + }, + { + "epoch": 1.4306306306306307, + "grad_norm": 0.45046189427375793, + "learning_rate": 5.3629010807857414e-05, + "loss": 0.076, + "step": 11910 + }, + { + "epoch": 1.4312312312312312, + "grad_norm": 0.4529851973056793, + "learning_rate": 5.359764564567591e-05, + "loss": 0.0815, + "step": 11915 + }, + { + "epoch": 1.4318318318318317, + "grad_norm": 0.42264431715011597, + "learning_rate": 5.356627906035488e-05, + "loss": 0.0893, + "step": 11920 + }, + { + "epoch": 1.4324324324324325, + "grad_norm": 0.4683610498905182, + "learning_rate": 5.353491106430217e-05, + "loss": 0.0842, + "step": 11925 + }, + { + "epoch": 1.433033033033033, + "grad_norm": 0.40563976764678955, + "learning_rate": 5.350354166992619e-05, + "loss": 0.0844, + "step": 11930 + }, + { + "epoch": 1.4336336336336335, + "grad_norm": 0.430910587310791, + "learning_rate": 5.347217088963591e-05, + "loss": 0.0873, + "step": 11935 + }, + { + "epoch": 1.4342342342342342, + "grad_norm": 0.46596115827560425, + "learning_rate": 5.3440798735840804e-05, + "loss": 0.0853, + "step": 11940 + }, + { + "epoch": 1.4348348348348348, + "grad_norm": 0.44316524267196655, + "learning_rate": 5.340942522095095e-05, + "loss": 0.082, + "step": 11945 + }, + { + "epoch": 1.4354354354354355, + "grad_norm": 0.5083211660385132, + "learning_rate": 5.337805035737689e-05, + "loss": 0.1029, + "step": 11950 + }, + { + "epoch": 1.436036036036036, + "grad_norm": 0.449158251285553, + "learning_rate": 5.3346674157529776e-05, + "loss": 0.0944, + "step": 11955 + }, + { + "epoch": 1.4366366366366368, + "grad_norm": 0.47196316719055176, + "learning_rate": 5.331529663382125e-05, + "loss": 0.0943, + "step": 11960 + }, + { + "epoch": 1.4372372372372373, + "grad_norm": 0.4012751579284668, + "learning_rate": 5.328391779866348e-05, + "loss": 0.0901, + "step": 11965 + }, + { + "epoch": 1.4378378378378378, + "grad_norm": 0.5194813013076782, + "learning_rate": 
5.3252537664469185e-05, + "loss": 0.0924, + "step": 11970 + }, + { + "epoch": 1.4384384384384385, + "grad_norm": 0.5404078364372253, + "learning_rate": 5.3221156243651505e-05, + "loss": 0.0873, + "step": 11975 + }, + { + "epoch": 1.439039039039039, + "grad_norm": 0.38077086210250854, + "learning_rate": 5.318977354862421e-05, + "loss": 0.0727, + "step": 11980 + }, + { + "epoch": 1.4396396396396396, + "grad_norm": 0.4743851125240326, + "learning_rate": 5.31583895918015e-05, + "loss": 0.0927, + "step": 11985 + }, + { + "epoch": 1.4402402402402403, + "grad_norm": 0.46732330322265625, + "learning_rate": 5.312700438559808e-05, + "loss": 0.096, + "step": 11990 + }, + { + "epoch": 1.4408408408408409, + "grad_norm": 0.537147045135498, + "learning_rate": 5.309561794242918e-05, + "loss": 0.0771, + "step": 11995 + }, + { + "epoch": 1.4414414414414414, + "grad_norm": 0.4657417833805084, + "learning_rate": 5.306423027471046e-05, + "loss": 0.0765, + "step": 12000 + }, + { + "epoch": 1.4414414414414414, + "eval_loss": 0.10528457909822464, + "eval_runtime": 36.0279, + "eval_samples_per_second": 22.205, + "eval_steps_per_second": 5.551, + "step": 12000 + }, + { + "epoch": 1.4420420420420421, + "grad_norm": 0.5093770623207092, + "learning_rate": 5.3032841394858154e-05, + "loss": 0.0864, + "step": 12005 + }, + { + "epoch": 1.4426426426426426, + "grad_norm": 0.4265924096107483, + "learning_rate": 5.3001451315288895e-05, + "loss": 0.0909, + "step": 12010 + }, + { + "epoch": 1.4432432432432432, + "grad_norm": 0.47986164689064026, + "learning_rate": 5.297006004841983e-05, + "loss": 0.0814, + "step": 12015 + }, + { + "epoch": 1.443843843843844, + "grad_norm": 0.5935015082359314, + "learning_rate": 5.293866760666857e-05, + "loss": 0.0855, + "step": 12020 + }, + { + "epoch": 1.4444444444444444, + "grad_norm": 0.4596727192401886, + "learning_rate": 5.290727400245319e-05, + "loss": 0.0787, + "step": 12025 + }, + { + "epoch": 1.445045045045045, + "grad_norm": 0.5177743434906006, + "learning_rate": 5.2875879248192196e-05, + "loss": 0.0867, + "step": 12030 + }, + { + "epoch": 1.4456456456456457, + "grad_norm": 0.4075930118560791, + "learning_rate": 5.284448335630462e-05, + "loss": 0.0868, + "step": 12035 + }, + { + "epoch": 1.4462462462462462, + "grad_norm": 0.4848853647708893, + "learning_rate": 5.281308633920986e-05, + "loss": 0.0647, + "step": 12040 + }, + { + "epoch": 1.4468468468468467, + "grad_norm": 0.374552458524704, + "learning_rate": 5.278168820932782e-05, + "loss": 0.0723, + "step": 12045 + }, + { + "epoch": 1.4474474474474475, + "grad_norm": 0.4907662570476532, + "learning_rate": 5.27502889790788e-05, + "loss": 0.0873, + "step": 12050 + }, + { + "epoch": 1.448048048048048, + "grad_norm": 0.5037271976470947, + "learning_rate": 5.2718888660883594e-05, + "loss": 0.0884, + "step": 12055 + }, + { + "epoch": 1.4486486486486487, + "grad_norm": 0.5089353919029236, + "learning_rate": 5.268748726716335e-05, + "loss": 0.0953, + "step": 12060 + }, + { + "epoch": 1.4492492492492492, + "grad_norm": 0.4179510176181793, + "learning_rate": 5.265608481033971e-05, + "loss": 0.0768, + "step": 12065 + }, + { + "epoch": 1.4498498498498498, + "grad_norm": 0.5133365988731384, + "learning_rate": 5.26246813028347e-05, + "loss": 0.0915, + "step": 12070 + }, + { + "epoch": 1.4504504504504505, + "grad_norm": 0.5832177996635437, + "learning_rate": 5.2593276757070775e-05, + "loss": 0.0984, + "step": 12075 + }, + { + "epoch": 1.451051051051051, + "grad_norm": 0.424835205078125, + "learning_rate": 5.256187118547079e-05, + "loss": 0.0844, + 
"step": 12080 + }, + { + "epoch": 1.4516516516516518, + "grad_norm": 0.40258777141571045, + "learning_rate": 5.253046460045799e-05, + "loss": 0.0816, + "step": 12085 + }, + { + "epoch": 1.4522522522522523, + "grad_norm": 0.4845524728298187, + "learning_rate": 5.249905701445609e-05, + "loss": 0.0761, + "step": 12090 + }, + { + "epoch": 1.4528528528528528, + "grad_norm": 0.44638118147850037, + "learning_rate": 5.24676484398891e-05, + "loss": 0.0815, + "step": 12095 + }, + { + "epoch": 1.4534534534534536, + "grad_norm": 0.5267859697341919, + "learning_rate": 5.243623888918153e-05, + "loss": 0.0742, + "step": 12100 + }, + { + "epoch": 1.454054054054054, + "grad_norm": 0.5129780769348145, + "learning_rate": 5.2404828374758174e-05, + "loss": 0.0718, + "step": 12105 + }, + { + "epoch": 1.4546546546546546, + "grad_norm": 0.4829353094100952, + "learning_rate": 5.237341690904428e-05, + "loss": 0.08, + "step": 12110 + }, + { + "epoch": 1.4552552552552553, + "grad_norm": 0.4809311032295227, + "learning_rate": 5.2342004504465426e-05, + "loss": 0.0719, + "step": 12115 + }, + { + "epoch": 1.4558558558558559, + "grad_norm": 0.5456272959709167, + "learning_rate": 5.2310591173447596e-05, + "loss": 0.074, + "step": 12120 + }, + { + "epoch": 1.4564564564564564, + "grad_norm": 0.4419856071472168, + "learning_rate": 5.2279176928417127e-05, + "loss": 0.063, + "step": 12125 + }, + { + "epoch": 1.4570570570570571, + "grad_norm": 0.41237685084342957, + "learning_rate": 5.224776178180071e-05, + "loss": 0.0824, + "step": 12130 + }, + { + "epoch": 1.4576576576576576, + "grad_norm": 0.5364236235618591, + "learning_rate": 5.22163457460254e-05, + "loss": 0.0999, + "step": 12135 + }, + { + "epoch": 1.4582582582582582, + "grad_norm": 0.4944758415222168, + "learning_rate": 5.218492883351859e-05, + "loss": 0.0775, + "step": 12140 + }, + { + "epoch": 1.458858858858859, + "grad_norm": 0.4452052414417267, + "learning_rate": 5.215351105670806e-05, + "loss": 0.0739, + "step": 12145 + }, + { + "epoch": 1.4594594594594594, + "grad_norm": 0.41491255164146423, + "learning_rate": 5.2122092428021874e-05, + "loss": 0.0785, + "step": 12150 + }, + { + "epoch": 1.46006006006006, + "grad_norm": 0.39245888590812683, + "learning_rate": 5.209067295988849e-05, + "loss": 0.0763, + "step": 12155 + }, + { + "epoch": 1.4606606606606607, + "grad_norm": 0.4065698981285095, + "learning_rate": 5.205925266473666e-05, + "loss": 0.0959, + "step": 12160 + }, + { + "epoch": 1.4612612612612612, + "grad_norm": 0.5166289806365967, + "learning_rate": 5.2027831554995464e-05, + "loss": 0.0966, + "step": 12165 + }, + { + "epoch": 1.4618618618618617, + "grad_norm": 0.4691830575466156, + "learning_rate": 5.199640964309434e-05, + "loss": 0.094, + "step": 12170 + }, + { + "epoch": 1.4624624624624625, + "grad_norm": 0.5310338735580444, + "learning_rate": 5.196498694146301e-05, + "loss": 0.0825, + "step": 12175 + }, + { + "epoch": 1.463063063063063, + "grad_norm": 0.41804417967796326, + "learning_rate": 5.193356346253151e-05, + "loss": 0.0696, + "step": 12180 + }, + { + "epoch": 1.4636636636636637, + "grad_norm": 0.5980838537216187, + "learning_rate": 5.190213921873017e-05, + "loss": 0.0741, + "step": 12185 + }, + { + "epoch": 1.4642642642642643, + "grad_norm": 0.4227926731109619, + "learning_rate": 5.187071422248968e-05, + "loss": 0.0805, + "step": 12190 + }, + { + "epoch": 1.464864864864865, + "grad_norm": 0.48485782742500305, + "learning_rate": 5.1839288486240975e-05, + "loss": 0.0666, + "step": 12195 + }, + { + "epoch": 1.4654654654654655, + "grad_norm": 
0.3551216423511505, + "learning_rate": 5.1807862022415275e-05, + "loss": 0.0899, + "step": 12200 + }, + { + "epoch": 1.466066066066066, + "grad_norm": 0.5003598928451538, + "learning_rate": 5.1776434843444164e-05, + "loss": 0.0685, + "step": 12205 + }, + { + "epoch": 1.4666666666666668, + "grad_norm": 0.5011917948722839, + "learning_rate": 5.17450069617594e-05, + "loss": 0.076, + "step": 12210 + }, + { + "epoch": 1.4672672672672673, + "grad_norm": 0.48780590295791626, + "learning_rate": 5.1713578389793116e-05, + "loss": 0.0751, + "step": 12215 + }, + { + "epoch": 1.4678678678678678, + "grad_norm": 0.48917055130004883, + "learning_rate": 5.1682149139977655e-05, + "loss": 0.0926, + "step": 12220 + }, + { + "epoch": 1.4684684684684686, + "grad_norm": 0.4546566307544708, + "learning_rate": 5.165071922474564e-05, + "loss": 0.0723, + "step": 12225 + }, + { + "epoch": 1.469069069069069, + "grad_norm": 0.4893244504928589, + "learning_rate": 5.1619288656529995e-05, + "loss": 0.0798, + "step": 12230 + }, + { + "epoch": 1.4696696696696696, + "grad_norm": 0.4245280921459198, + "learning_rate": 5.158785744776385e-05, + "loss": 0.0831, + "step": 12235 + }, + { + "epoch": 1.4702702702702704, + "grad_norm": 0.4717468023300171, + "learning_rate": 5.155642561088063e-05, + "loss": 0.0937, + "step": 12240 + }, + { + "epoch": 1.4708708708708709, + "grad_norm": 0.4630936086177826, + "learning_rate": 5.152499315831398e-05, + "loss": 0.0755, + "step": 12245 + }, + { + "epoch": 1.4714714714714714, + "grad_norm": 0.5492383241653442, + "learning_rate": 5.149356010249782e-05, + "loss": 0.0997, + "step": 12250 + }, + { + "epoch": 1.4714714714714714, + "eval_loss": 0.10047008842229843, + "eval_runtime": 35.8826, + "eval_samples_per_second": 22.295, + "eval_steps_per_second": 5.574, + "step": 12250 + }, + { + "epoch": 1.4720720720720721, + "grad_norm": 0.5058827996253967, + "learning_rate": 5.1462126455866255e-05, + "loss": 0.0765, + "step": 12255 + }, + { + "epoch": 1.4726726726726727, + "grad_norm": 0.4335672855377197, + "learning_rate": 5.143069223085368e-05, + "loss": 0.0781, + "step": 12260 + }, + { + "epoch": 1.4732732732732732, + "grad_norm": 0.5172309875488281, + "learning_rate": 5.139925743989471e-05, + "loss": 0.1014, + "step": 12265 + }, + { + "epoch": 1.473873873873874, + "grad_norm": 0.45344531536102295, + "learning_rate": 5.136782209542412e-05, + "loss": 0.0718, + "step": 12270 + }, + { + "epoch": 1.4744744744744744, + "grad_norm": 0.4879536032676697, + "learning_rate": 5.133638620987701e-05, + "loss": 0.0715, + "step": 12275 + }, + { + "epoch": 1.475075075075075, + "grad_norm": 0.45065709948539734, + "learning_rate": 5.130494979568859e-05, + "loss": 0.0793, + "step": 12280 + }, + { + "epoch": 1.4756756756756757, + "grad_norm": 0.5382446646690369, + "learning_rate": 5.127351286529436e-05, + "loss": 0.0925, + "step": 12285 + }, + { + "epoch": 1.4762762762762762, + "grad_norm": 0.4322677552700043, + "learning_rate": 5.124207543112998e-05, + "loss": 0.084, + "step": 12290 + }, + { + "epoch": 1.4768768768768767, + "grad_norm": 0.4754551649093628, + "learning_rate": 5.121063750563131e-05, + "loss": 0.1005, + "step": 12295 + }, + { + "epoch": 1.4774774774774775, + "grad_norm": 0.4164341986179352, + "learning_rate": 5.117919910123444e-05, + "loss": 0.0896, + "step": 12300 + }, + { + "epoch": 1.478078078078078, + "grad_norm": 0.44691160321235657, + "learning_rate": 5.114776023037561e-05, + "loss": 0.075, + "step": 12305 + }, + { + "epoch": 1.4786786786786787, + "grad_norm": 0.41045352816581726, + "learning_rate": 
5.111632090549126e-05, + "loss": 0.071, + "step": 12310 + }, + { + "epoch": 1.4792792792792793, + "grad_norm": 0.48388731479644775, + "learning_rate": 5.108488113901799e-05, + "loss": 0.0802, + "step": 12315 + }, + { + "epoch": 1.47987987987988, + "grad_norm": 0.5359501242637634, + "learning_rate": 5.1053440943392626e-05, + "loss": 0.1017, + "step": 12320 + }, + { + "epoch": 1.4804804804804805, + "grad_norm": 0.5539635419845581, + "learning_rate": 5.102200033105211e-05, + "loss": 0.0962, + "step": 12325 + }, + { + "epoch": 1.481081081081081, + "grad_norm": 0.4830450117588043, + "learning_rate": 5.099055931443356e-05, + "loss": 0.0804, + "step": 12330 + }, + { + "epoch": 1.4816816816816818, + "grad_norm": 0.5440949201583862, + "learning_rate": 5.0959117905974295e-05, + "loss": 0.0932, + "step": 12335 + }, + { + "epoch": 1.4822822822822823, + "grad_norm": 0.41771113872528076, + "learning_rate": 5.092767611811172e-05, + "loss": 0.0792, + "step": 12340 + }, + { + "epoch": 1.4828828828828828, + "grad_norm": 0.4817197918891907, + "learning_rate": 5.089623396328347e-05, + "loss": 0.0794, + "step": 12345 + }, + { + "epoch": 1.4834834834834836, + "grad_norm": 0.4798753261566162, + "learning_rate": 5.086479145392725e-05, + "loss": 0.0917, + "step": 12350 + }, + { + "epoch": 1.484084084084084, + "grad_norm": 0.6235416531562805, + "learning_rate": 5.0833348602480954e-05, + "loss": 0.0787, + "step": 12355 + }, + { + "epoch": 1.4846846846846846, + "grad_norm": 0.5768137574195862, + "learning_rate": 5.080190542138259e-05, + "loss": 0.0827, + "step": 12360 + }, + { + "epoch": 1.4852852852852854, + "grad_norm": 0.4930762052536011, + "learning_rate": 5.0770461923070286e-05, + "loss": 0.0716, + "step": 12365 + }, + { + "epoch": 1.4858858858858859, + "grad_norm": 0.4362534284591675, + "learning_rate": 5.073901811998234e-05, + "loss": 0.0683, + "step": 12370 + }, + { + "epoch": 1.4864864864864864, + "grad_norm": 0.48355865478515625, + "learning_rate": 5.070757402455712e-05, + "loss": 0.0635, + "step": 12375 + }, + { + "epoch": 1.4870870870870871, + "grad_norm": 0.5248156785964966, + "learning_rate": 5.067612964923315e-05, + "loss": 0.0669, + "step": 12380 + }, + { + "epoch": 1.4876876876876877, + "grad_norm": 0.5497010350227356, + "learning_rate": 5.064468500644903e-05, + "loss": 0.076, + "step": 12385 + }, + { + "epoch": 1.4882882882882882, + "grad_norm": 0.4656374454498291, + "learning_rate": 5.061324010864349e-05, + "loss": 0.0598, + "step": 12390 + }, + { + "epoch": 1.488888888888889, + "grad_norm": 0.5101540088653564, + "learning_rate": 5.058179496825535e-05, + "loss": 0.0773, + "step": 12395 + }, + { + "epoch": 1.4894894894894894, + "grad_norm": 0.4140486717224121, + "learning_rate": 5.055034959772352e-05, + "loss": 0.0633, + "step": 12400 + }, + { + "epoch": 1.49009009009009, + "grad_norm": 0.39580681920051575, + "learning_rate": 5.051890400948703e-05, + "loss": 0.0754, + "step": 12405 + }, + { + "epoch": 1.4906906906906907, + "grad_norm": 0.36044490337371826, + "learning_rate": 5.0487458215984964e-05, + "loss": 0.0733, + "step": 12410 + }, + { + "epoch": 1.4912912912912912, + "grad_norm": 0.3795586824417114, + "learning_rate": 5.04560122296565e-05, + "loss": 0.0607, + "step": 12415 + }, + { + "epoch": 1.491891891891892, + "grad_norm": 0.49337899684906006, + "learning_rate": 5.042456606294088e-05, + "loss": 0.0691, + "step": 12420 + }, + { + "epoch": 1.4924924924924925, + "grad_norm": 0.4806540012359619, + "learning_rate": 5.039311972827746e-05, + "loss": 0.0736, + "step": 12425 + }, + { + "epoch": 
1.493093093093093, + "grad_norm": 0.4080536365509033, + "learning_rate": 5.03616732381056e-05, + "loss": 0.0681, + "step": 12430 + }, + { + "epoch": 1.4936936936936938, + "grad_norm": 0.4222785532474518, + "learning_rate": 5.033022660486475e-05, + "loss": 0.0698, + "step": 12435 + }, + { + "epoch": 1.4942942942942943, + "grad_norm": 0.4748011529445648, + "learning_rate": 5.029877984099446e-05, + "loss": 0.0726, + "step": 12440 + }, + { + "epoch": 1.494894894894895, + "grad_norm": 0.3876466751098633, + "learning_rate": 5.0267332958934246e-05, + "loss": 0.0768, + "step": 12445 + }, + { + "epoch": 1.4954954954954955, + "grad_norm": 0.4406418800354004, + "learning_rate": 5.023588597112374e-05, + "loss": 0.0794, + "step": 12450 + }, + { + "epoch": 1.496096096096096, + "grad_norm": 0.5697823166847229, + "learning_rate": 5.020443889000259e-05, + "loss": 0.0802, + "step": 12455 + }, + { + "epoch": 1.4966966966966968, + "grad_norm": 0.42151203751564026, + "learning_rate": 5.017299172801049e-05, + "loss": 0.0846, + "step": 12460 + }, + { + "epoch": 1.4972972972972973, + "grad_norm": 0.39392387866973877, + "learning_rate": 5.014154449758712e-05, + "loss": 0.078, + "step": 12465 + }, + { + "epoch": 1.4978978978978978, + "grad_norm": 0.5100020170211792, + "learning_rate": 5.011009721117226e-05, + "loss": 0.0641, + "step": 12470 + }, + { + "epoch": 1.4984984984984986, + "grad_norm": 0.5471884608268738, + "learning_rate": 5.0078649881205684e-05, + "loss": 0.0765, + "step": 12475 + }, + { + "epoch": 1.499099099099099, + "grad_norm": 0.42572712898254395, + "learning_rate": 5.0047202520127144e-05, + "loss": 0.0848, + "step": 12480 + }, + { + "epoch": 1.4996996996996996, + "grad_norm": 0.4461537003517151, + "learning_rate": 5.001575514037647e-05, + "loss": 0.0732, + "step": 12485 + }, + { + "epoch": 1.5003003003003004, + "grad_norm": 0.40868768095970154, + "learning_rate": 4.9984307754393456e-05, + "loss": 0.0807, + "step": 12490 + }, + { + "epoch": 1.500900900900901, + "grad_norm": 0.4080524742603302, + "learning_rate": 4.995286037461789e-05, + "loss": 0.0494, + "step": 12495 + }, + { + "epoch": 1.5015015015015014, + "grad_norm": 0.4617525637149811, + "learning_rate": 4.9921413013489604e-05, + "loss": 0.082, + "step": 12500 + }, + { + "epoch": 1.5015015015015014, + "eval_loss": 0.09793703258037567, + "eval_runtime": 36.0537, + "eval_samples_per_second": 22.189, + "eval_steps_per_second": 5.547, + "step": 12500 + }, + { + "epoch": 1.5021021021021022, + "grad_norm": 0.47754546999931335, + "learning_rate": 4.988996568344838e-05, + "loss": 0.071, + "step": 12505 + }, + { + "epoch": 1.5027027027027027, + "grad_norm": 0.5164263248443604, + "learning_rate": 4.9858518396934e-05, + "loss": 0.0709, + "step": 12510 + }, + { + "epoch": 1.5033033033033032, + "grad_norm": 0.3986726999282837, + "learning_rate": 4.982707116638625e-05, + "loss": 0.0621, + "step": 12515 + }, + { + "epoch": 1.503903903903904, + "grad_norm": 0.4838113486766815, + "learning_rate": 4.9795624004244855e-05, + "loss": 0.0867, + "step": 12520 + }, + { + "epoch": 1.5045045045045045, + "grad_norm": 0.5557221174240112, + "learning_rate": 4.976417692294954e-05, + "loss": 0.0788, + "step": 12525 + }, + { + "epoch": 1.505105105105105, + "grad_norm": 0.45177948474884033, + "learning_rate": 4.973272993493999e-05, + "loss": 0.069, + "step": 12530 + }, + { + "epoch": 1.5057057057057057, + "grad_norm": 0.4850604832172394, + "learning_rate": 4.9701283052655876e-05, + "loss": 0.0876, + "step": 12535 + }, + { + "epoch": 1.5063063063063065, + "grad_norm": 
0.4056200087070465, + "learning_rate": 4.966983628853679e-05, + "loss": 0.0745, + "step": 12540 + }, + { + "epoch": 1.5069069069069068, + "grad_norm": 0.45170727372169495, + "learning_rate": 4.963838965502227e-05, + "loss": 0.0801, + "step": 12545 + }, + { + "epoch": 1.5075075075075075, + "grad_norm": 0.6197972297668457, + "learning_rate": 4.960694316455187e-05, + "loss": 0.076, + "step": 12550 + }, + { + "epoch": 1.5081081081081082, + "grad_norm": 0.5713878870010376, + "learning_rate": 4.9575496829564996e-05, + "loss": 0.0849, + "step": 12555 + }, + { + "epoch": 1.5087087087087085, + "grad_norm": 0.4119352102279663, + "learning_rate": 4.954405066250109e-05, + "loss": 0.0614, + "step": 12560 + }, + { + "epoch": 1.5093093093093093, + "grad_norm": 0.35041505098342896, + "learning_rate": 4.951260467579943e-05, + "loss": 0.0585, + "step": 12565 + }, + { + "epoch": 1.50990990990991, + "grad_norm": 0.7188326120376587, + "learning_rate": 4.948115888189929e-05, + "loss": 0.0865, + "step": 12570 + }, + { + "epoch": 1.5105105105105106, + "grad_norm": 0.5880985260009766, + "learning_rate": 4.944971329323985e-05, + "loss": 0.0846, + "step": 12575 + }, + { + "epoch": 1.511111111111111, + "grad_norm": 0.6127240657806396, + "learning_rate": 4.941826792226019e-05, + "loss": 0.0846, + "step": 12580 + }, + { + "epoch": 1.5117117117117118, + "grad_norm": 0.4787910580635071, + "learning_rate": 4.9386822781399366e-05, + "loss": 0.087, + "step": 12585 + }, + { + "epoch": 1.5123123123123123, + "grad_norm": 0.35946008563041687, + "learning_rate": 4.935537788309624e-05, + "loss": 0.0624, + "step": 12590 + }, + { + "epoch": 1.5129129129129129, + "grad_norm": 0.4871029555797577, + "learning_rate": 4.932393323978967e-05, + "loss": 0.09, + "step": 12595 + }, + { + "epoch": 1.5135135135135136, + "grad_norm": 0.5066032409667969, + "learning_rate": 4.929248886391835e-05, + "loss": 0.0708, + "step": 12600 + }, + { + "epoch": 1.5141141141141141, + "grad_norm": 0.470005065202713, + "learning_rate": 4.926104476792092e-05, + "loss": 0.0727, + "step": 12605 + }, + { + "epoch": 1.5147147147147146, + "grad_norm": 0.5710893869400024, + "learning_rate": 4.92296009642359e-05, + "loss": 0.0747, + "step": 12610 + }, + { + "epoch": 1.5153153153153154, + "grad_norm": 0.42586013674736023, + "learning_rate": 4.9198157465301634e-05, + "loss": 0.07, + "step": 12615 + }, + { + "epoch": 1.515915915915916, + "grad_norm": 0.412024587392807, + "learning_rate": 4.916671428355641e-05, + "loss": 0.0739, + "step": 12620 + }, + { + "epoch": 1.5165165165165164, + "grad_norm": 0.46768057346343994, + "learning_rate": 4.91352714314384e-05, + "loss": 0.0683, + "step": 12625 + }, + { + "epoch": 1.5171171171171172, + "grad_norm": 0.45962801575660706, + "learning_rate": 4.91038289213856e-05, + "loss": 0.0791, + "step": 12630 + }, + { + "epoch": 1.5177177177177177, + "grad_norm": 0.494783490896225, + "learning_rate": 4.9072386765835864e-05, + "loss": 0.0786, + "step": 12635 + }, + { + "epoch": 1.5183183183183182, + "grad_norm": 0.43444254994392395, + "learning_rate": 4.904094497722696e-05, + "loss": 0.0796, + "step": 12640 + }, + { + "epoch": 1.518918918918919, + "grad_norm": 0.3887892961502075, + "learning_rate": 4.900950356799647e-05, + "loss": 0.066, + "step": 12645 + }, + { + "epoch": 1.5195195195195195, + "grad_norm": 0.4380321800708771, + "learning_rate": 4.8978062550581825e-05, + "loss": 0.0813, + "step": 12650 + }, + { + "epoch": 1.52012012012012, + "grad_norm": 0.43813252449035645, + "learning_rate": 4.8946621937420356e-05, + "loss": 0.0791, + 
"step": 12655 + }, + { + "epoch": 1.5207207207207207, + "grad_norm": 0.5307855606079102, + "learning_rate": 4.891518174094914e-05, + "loss": 0.0898, + "step": 12660 + }, + { + "epoch": 1.5213213213213215, + "grad_norm": 0.39943432807922363, + "learning_rate": 4.8883741973605155e-05, + "loss": 0.0809, + "step": 12665 + }, + { + "epoch": 1.5219219219219218, + "grad_norm": 0.41812124848365784, + "learning_rate": 4.88523026478252e-05, + "loss": 0.072, + "step": 12670 + }, + { + "epoch": 1.5225225225225225, + "grad_norm": 0.44765257835388184, + "learning_rate": 4.88208637760459e-05, + "loss": 0.0709, + "step": 12675 + }, + { + "epoch": 1.5231231231231233, + "grad_norm": 0.4284890294075012, + "learning_rate": 4.8789425370703704e-05, + "loss": 0.0681, + "step": 12680 + }, + { + "epoch": 1.5237237237237238, + "grad_norm": 0.5334815382957458, + "learning_rate": 4.875798744423483e-05, + "loss": 0.0914, + "step": 12685 + }, + { + "epoch": 1.5243243243243243, + "grad_norm": 0.4032062292098999, + "learning_rate": 4.872655000907538e-05, + "loss": 0.0882, + "step": 12690 + }, + { + "epoch": 1.524924924924925, + "grad_norm": 0.5700933933258057, + "learning_rate": 4.8695113077661195e-05, + "loss": 0.0743, + "step": 12695 + }, + { + "epoch": 1.5255255255255256, + "grad_norm": 0.42844948172569275, + "learning_rate": 4.866367666242798e-05, + "loss": 0.0768, + "step": 12700 + }, + { + "epoch": 1.526126126126126, + "grad_norm": 0.5078549385070801, + "learning_rate": 4.863224077581115e-05, + "loss": 0.09, + "step": 12705 + }, + { + "epoch": 1.5267267267267268, + "grad_norm": 0.4741128087043762, + "learning_rate": 4.860080543024601e-05, + "loss": 0.064, + "step": 12710 + }, + { + "epoch": 1.5273273273273273, + "grad_norm": 0.5249758362770081, + "learning_rate": 4.856937063816758e-05, + "loss": 0.0949, + "step": 12715 + }, + { + "epoch": 1.5279279279279279, + "grad_norm": 0.43285176157951355, + "learning_rate": 4.85379364120107e-05, + "loss": 0.0872, + "step": 12720 + }, + { + "epoch": 1.5285285285285286, + "grad_norm": 0.4995947480201721, + "learning_rate": 4.850650276420999e-05, + "loss": 0.0961, + "step": 12725 + }, + { + "epoch": 1.5291291291291291, + "grad_norm": 0.3818022310733795, + "learning_rate": 4.847506970719977e-05, + "loss": 0.0653, + "step": 12730 + }, + { + "epoch": 1.5297297297297296, + "grad_norm": 0.39495208859443665, + "learning_rate": 4.844363725341422e-05, + "loss": 0.0686, + "step": 12735 + }, + { + "epoch": 1.5303303303303304, + "grad_norm": 0.4333951771259308, + "learning_rate": 4.841220541528722e-05, + "loss": 0.0658, + "step": 12740 + }, + { + "epoch": 1.530930930930931, + "grad_norm": 0.5166015625, + "learning_rate": 4.838077420525243e-05, + "loss": 0.0712, + "step": 12745 + }, + { + "epoch": 1.5315315315315314, + "grad_norm": 0.40091192722320557, + "learning_rate": 4.834934363574329e-05, + "loss": 0.07, + "step": 12750 + }, + { + "epoch": 1.5315315315315314, + "eval_loss": 0.08978892117738724, + "eval_runtime": 35.9101, + "eval_samples_per_second": 22.278, + "eval_steps_per_second": 5.569, + "step": 12750 + }, + { + "epoch": 1.5321321321321322, + "grad_norm": 0.43245700001716614, + "learning_rate": 4.83179137191929e-05, + "loss": 0.0847, + "step": 12755 + }, + { + "epoch": 1.5327327327327327, + "grad_norm": 0.42760345339775085, + "learning_rate": 4.828648446803419e-05, + "loss": 0.069, + "step": 12760 + }, + { + "epoch": 1.5333333333333332, + "grad_norm": 0.39846494793891907, + "learning_rate": 4.825505589469978e-05, + "loss": 0.0689, + "step": 12765 + }, + { + "epoch": 
1.533933933933934, + "grad_norm": 0.3754206597805023, + "learning_rate": 4.8223628011622065e-05, + "loss": 0.0635, + "step": 12770 + }, + { + "epoch": 1.5345345345345347, + "grad_norm": 0.48671194911003113, + "learning_rate": 4.819220083123311e-05, + "loss": 0.0628, + "step": 12775 + }, + { + "epoch": 1.535135135135135, + "grad_norm": 0.49007099866867065, + "learning_rate": 4.8160774365964736e-05, + "loss": 0.0774, + "step": 12780 + }, + { + "epoch": 1.5357357357357357, + "grad_norm": 0.332457035779953, + "learning_rate": 4.8129348628248455e-05, + "loss": 0.0556, + "step": 12785 + }, + { + "epoch": 1.5363363363363365, + "grad_norm": 0.4247899651527405, + "learning_rate": 4.809792363051553e-05, + "loss": 0.0746, + "step": 12790 + }, + { + "epoch": 1.5369369369369368, + "grad_norm": 0.38403886556625366, + "learning_rate": 4.806649938519694e-05, + "loss": 0.073, + "step": 12795 + }, + { + "epoch": 1.5375375375375375, + "grad_norm": 0.460376501083374, + "learning_rate": 4.803507590472328e-05, + "loss": 0.0722, + "step": 12800 + }, + { + "epoch": 1.5381381381381383, + "grad_norm": 0.38304775953292847, + "learning_rate": 4.800365320152493e-05, + "loss": 0.0696, + "step": 12805 + }, + { + "epoch": 1.5387387387387388, + "grad_norm": 0.41683074831962585, + "learning_rate": 4.797223128803193e-05, + "loss": 0.0595, + "step": 12810 + }, + { + "epoch": 1.5393393393393393, + "grad_norm": 0.2956644296646118, + "learning_rate": 4.794081017667401e-05, + "loss": 0.0602, + "step": 12815 + }, + { + "epoch": 1.53993993993994, + "grad_norm": 0.47854262590408325, + "learning_rate": 4.7909389879880616e-05, + "loss": 0.0694, + "step": 12820 + }, + { + "epoch": 1.5405405405405406, + "grad_norm": 0.4116211533546448, + "learning_rate": 4.7877970410080785e-05, + "loss": 0.0743, + "step": 12825 + }, + { + "epoch": 1.541141141141141, + "grad_norm": 0.4726904332637787, + "learning_rate": 4.784655177970332e-05, + "loss": 0.076, + "step": 12830 + }, + { + "epoch": 1.5417417417417418, + "grad_norm": 0.33450406789779663, + "learning_rate": 4.781513400117662e-05, + "loss": 0.0484, + "step": 12835 + }, + { + "epoch": 1.5423423423423424, + "grad_norm": 0.4221593737602234, + "learning_rate": 4.7783717086928804e-05, + "loss": 0.0601, + "step": 12840 + }, + { + "epoch": 1.5429429429429429, + "grad_norm": 0.5398896932601929, + "learning_rate": 4.775230104938764e-05, + "loss": 0.0695, + "step": 12845 + }, + { + "epoch": 1.5435435435435436, + "grad_norm": 0.49371886253356934, + "learning_rate": 4.7720885900980494e-05, + "loss": 0.0627, + "step": 12850 + }, + { + "epoch": 1.5441441441441441, + "grad_norm": 0.4454942047595978, + "learning_rate": 4.7689471654134447e-05, + "loss": 0.074, + "step": 12855 + }, + { + "epoch": 1.5447447447447447, + "grad_norm": 0.4327937364578247, + "learning_rate": 4.765805832127618e-05, + "loss": 0.0751, + "step": 12860 + }, + { + "epoch": 1.5453453453453454, + "grad_norm": 0.3502213656902313, + "learning_rate": 4.762664591483207e-05, + "loss": 0.0719, + "step": 12865 + }, + { + "epoch": 1.545945945945946, + "grad_norm": 0.42783692479133606, + "learning_rate": 4.759523444722803e-05, + "loss": 0.0676, + "step": 12870 + }, + { + "epoch": 1.5465465465465464, + "grad_norm": 0.4517103135585785, + "learning_rate": 4.75638239308897e-05, + "loss": 0.077, + "step": 12875 + }, + { + "epoch": 1.5471471471471472, + "grad_norm": 0.42299434542655945, + "learning_rate": 4.753241437824228e-05, + "loss": 0.0779, + "step": 12880 + }, + { + "epoch": 1.5477477477477477, + "grad_norm": 0.5330052375793457, + "learning_rate": 
4.750100580171062e-05, + "loss": 0.0673, + "step": 12885 + }, + { + "epoch": 1.5483483483483482, + "grad_norm": 0.5104110836982727, + "learning_rate": 4.74695982137192e-05, + "loss": 0.0661, + "step": 12890 + }, + { + "epoch": 1.548948948948949, + "grad_norm": 0.507620632648468, + "learning_rate": 4.743819162669202e-05, + "loss": 0.0619, + "step": 12895 + }, + { + "epoch": 1.5495495495495497, + "grad_norm": 0.5362961888313293, + "learning_rate": 4.740678605305281e-05, + "loss": 0.0699, + "step": 12900 + }, + { + "epoch": 1.55015015015015, + "grad_norm": 0.42854073643684387, + "learning_rate": 4.73753815052248e-05, + "loss": 0.0564, + "step": 12905 + }, + { + "epoch": 1.5507507507507508, + "grad_norm": 0.47177237272262573, + "learning_rate": 4.734397799563088e-05, + "loss": 0.0686, + "step": 12910 + }, + { + "epoch": 1.5513513513513515, + "grad_norm": 0.4933551251888275, + "learning_rate": 4.731257553669348e-05, + "loss": 0.07, + "step": 12915 + }, + { + "epoch": 1.5519519519519518, + "grad_norm": 0.5049030184745789, + "learning_rate": 4.7281174140834636e-05, + "loss": 0.0729, + "step": 12920 + }, + { + "epoch": 1.5525525525525525, + "grad_norm": 0.4711320996284485, + "learning_rate": 4.7249773820475987e-05, + "loss": 0.0658, + "step": 12925 + }, + { + "epoch": 1.5531531531531533, + "grad_norm": 0.5016087889671326, + "learning_rate": 4.7218374588038675e-05, + "loss": 0.075, + "step": 12930 + }, + { + "epoch": 1.5537537537537538, + "grad_norm": 0.5302407145500183, + "learning_rate": 4.718697645594352e-05, + "loss": 0.0592, + "step": 12935 + }, + { + "epoch": 1.5543543543543543, + "grad_norm": 0.5613186359405518, + "learning_rate": 4.7155579436610785e-05, + "loss": 0.075, + "step": 12940 + }, + { + "epoch": 1.554954954954955, + "grad_norm": 0.44849929213523865, + "learning_rate": 4.712418354246038e-05, + "loss": 0.0603, + "step": 12945 + }, + { + "epoch": 1.5555555555555556, + "grad_norm": 0.442667156457901, + "learning_rate": 4.7092788785911746e-05, + "loss": 0.0596, + "step": 12950 + }, + { + "epoch": 1.556156156156156, + "grad_norm": 0.6306538581848145, + "learning_rate": 4.7061395179383875e-05, + "loss": 0.071, + "step": 12955 + }, + { + "epoch": 1.5567567567567568, + "grad_norm": 0.401419073343277, + "learning_rate": 4.70300027352953e-05, + "loss": 0.0638, + "step": 12960 + }, + { + "epoch": 1.5573573573573574, + "grad_norm": 0.4501631259918213, + "learning_rate": 4.699861146606408e-05, + "loss": 0.0808, + "step": 12965 + }, + { + "epoch": 1.5579579579579579, + "grad_norm": 0.37132251262664795, + "learning_rate": 4.6967221384107836e-05, + "loss": 0.0613, + "step": 12970 + }, + { + "epoch": 1.5585585585585586, + "grad_norm": 0.42588886618614197, + "learning_rate": 4.693583250184369e-05, + "loss": 0.0649, + "step": 12975 + }, + { + "epoch": 1.5591591591591591, + "grad_norm": 0.41766226291656494, + "learning_rate": 4.690444483168833e-05, + "loss": 0.0632, + "step": 12980 + }, + { + "epoch": 1.5597597597597597, + "grad_norm": 0.39264434576034546, + "learning_rate": 4.687305838605794e-05, + "loss": 0.0656, + "step": 12985 + }, + { + "epoch": 1.5603603603603604, + "grad_norm": 0.42573830485343933, + "learning_rate": 4.684167317736819e-05, + "loss": 0.0614, + "step": 12990 + }, + { + "epoch": 1.560960960960961, + "grad_norm": 0.4736945629119873, + "learning_rate": 4.681028921803432e-05, + "loss": 0.0691, + "step": 12995 + }, + { + "epoch": 1.5615615615615615, + "grad_norm": 0.4946117699146271, + "learning_rate": 4.677890652047103e-05, + "loss": 0.063, + "step": 13000 + }, + { + "epoch": 
1.5615615615615615, + "eval_loss": 0.08609585464000702, + "eval_runtime": 35.6392, + "eval_samples_per_second": 22.447, + "eval_steps_per_second": 5.612, + "step": 13000 + }, + { + "epoch": 1.5621621621621622, + "grad_norm": 0.4591105580329895, + "learning_rate": 4.6747525097092576e-05, + "loss": 0.0857, + "step": 13005 + }, + { + "epoch": 1.5627627627627627, + "grad_norm": 0.3890763223171234, + "learning_rate": 4.671614496031262e-05, + "loss": 0.0667, + "step": 13010 + }, + { + "epoch": 1.5633633633633632, + "grad_norm": 0.3825789988040924, + "learning_rate": 4.66847661225444e-05, + "loss": 0.0656, + "step": 13015 + }, + { + "epoch": 1.563963963963964, + "grad_norm": 0.4688428044319153, + "learning_rate": 4.665338859620059e-05, + "loss": 0.0715, + "step": 13020 + }, + { + "epoch": 1.5645645645645647, + "grad_norm": 0.5024927854537964, + "learning_rate": 4.662201239369336e-05, + "loss": 0.0813, + "step": 13025 + }, + { + "epoch": 1.565165165165165, + "grad_norm": 0.37907472252845764, + "learning_rate": 4.6590637527434394e-05, + "loss": 0.0783, + "step": 13030 + }, + { + "epoch": 1.5657657657657658, + "grad_norm": 0.48286497592926025, + "learning_rate": 4.6559264009834765e-05, + "loss": 0.0677, + "step": 13035 + }, + { + "epoch": 1.5663663663663665, + "grad_norm": 0.3524858355522156, + "learning_rate": 4.6527891853305085e-05, + "loss": 0.0568, + "step": 13040 + }, + { + "epoch": 1.5669669669669668, + "grad_norm": 0.5434703826904297, + "learning_rate": 4.6496521070255395e-05, + "loss": 0.0685, + "step": 13045 + }, + { + "epoch": 1.5675675675675675, + "grad_norm": 0.36601707339286804, + "learning_rate": 4.6465151673095195e-05, + "loss": 0.0659, + "step": 13050 + }, + { + "epoch": 1.5681681681681683, + "grad_norm": 0.33329319953918457, + "learning_rate": 4.6433783674233485e-05, + "loss": 0.0607, + "step": 13055 + }, + { + "epoch": 1.5687687687687688, + "grad_norm": 0.4235140383243561, + "learning_rate": 4.64024170860786e-05, + "loss": 0.0594, + "step": 13060 + }, + { + "epoch": 1.5693693693693693, + "grad_norm": 0.4310402572154999, + "learning_rate": 4.637105192103843e-05, + "loss": 0.0678, + "step": 13065 + }, + { + "epoch": 1.56996996996997, + "grad_norm": 0.45694392919540405, + "learning_rate": 4.633968819152024e-05, + "loss": 0.0875, + "step": 13070 + }, + { + "epoch": 1.5705705705705706, + "grad_norm": 0.4142746329307556, + "learning_rate": 4.6308325909930775e-05, + "loss": 0.0581, + "step": 13075 + }, + { + "epoch": 1.571171171171171, + "grad_norm": 0.5073320865631104, + "learning_rate": 4.6276965088676125e-05, + "loss": 0.0664, + "step": 13080 + }, + { + "epoch": 1.5717717717717719, + "grad_norm": 0.4360414743423462, + "learning_rate": 4.624560574016188e-05, + "loss": 0.0641, + "step": 13085 + }, + { + "epoch": 1.5723723723723724, + "grad_norm": 0.4940846860408783, + "learning_rate": 4.621424787679303e-05, + "loss": 0.0697, + "step": 13090 + }, + { + "epoch": 1.572972972972973, + "grad_norm": 0.4336908459663391, + "learning_rate": 4.618289151097395e-05, + "loss": 0.0718, + "step": 13095 + }, + { + "epoch": 1.5735735735735736, + "grad_norm": 0.42787450551986694, + "learning_rate": 4.615153665510849e-05, + "loss": 0.0683, + "step": 13100 + }, + { + "epoch": 1.5741741741741742, + "grad_norm": 0.509724497795105, + "learning_rate": 4.612018332159979e-05, + "loss": 0.064, + "step": 13105 + }, + { + "epoch": 1.5747747747747747, + "grad_norm": 0.4025382399559021, + "learning_rate": 4.6088831522850483e-05, + "loss": 0.0632, + "step": 13110 + }, + { + "epoch": 1.5753753753753754, + "grad_norm": 
0.45108699798583984, + "learning_rate": 4.605748127126256e-05, + "loss": 0.0724, + "step": 13115 + }, + { + "epoch": 1.575975975975976, + "grad_norm": 0.36071160435676575, + "learning_rate": 4.6026132579237407e-05, + "loss": 0.056, + "step": 13120 + }, + { + "epoch": 1.5765765765765765, + "grad_norm": 0.3282938301563263, + "learning_rate": 4.599478545917581e-05, + "loss": 0.0519, + "step": 13125 + }, + { + "epoch": 1.5771771771771772, + "grad_norm": 0.3939158320426941, + "learning_rate": 4.596343992347787e-05, + "loss": 0.0582, + "step": 13130 + }, + { + "epoch": 1.5777777777777777, + "grad_norm": 0.3940871059894562, + "learning_rate": 4.593209598454313e-05, + "loss": 0.0726, + "step": 13135 + }, + { + "epoch": 1.5783783783783782, + "grad_norm": 0.4171847105026245, + "learning_rate": 4.5900753654770465e-05, + "loss": 0.0695, + "step": 13140 + }, + { + "epoch": 1.578978978978979, + "grad_norm": 0.4544351100921631, + "learning_rate": 4.586941294655816e-05, + "loss": 0.0688, + "step": 13145 + }, + { + "epoch": 1.5795795795795797, + "grad_norm": 0.3604528307914734, + "learning_rate": 4.583807387230377e-05, + "loss": 0.0534, + "step": 13150 + }, + { + "epoch": 1.58018018018018, + "grad_norm": 0.4082534611225128, + "learning_rate": 4.5806736444404294e-05, + "loss": 0.0808, + "step": 13155 + }, + { + "epoch": 1.5807807807807808, + "grad_norm": 0.3927527070045471, + "learning_rate": 4.577540067525602e-05, + "loss": 0.0636, + "step": 13160 + }, + { + "epoch": 1.5813813813813815, + "grad_norm": 0.4562763571739197, + "learning_rate": 4.5744066577254615e-05, + "loss": 0.0791, + "step": 13165 + }, + { + "epoch": 1.581981981981982, + "grad_norm": 0.4318278431892395, + "learning_rate": 4.57127341627951e-05, + "loss": 0.0648, + "step": 13170 + }, + { + "epoch": 1.5825825825825826, + "grad_norm": 0.3479551374912262, + "learning_rate": 4.5681403444271736e-05, + "loss": 0.0519, + "step": 13175 + }, + { + "epoch": 1.5831831831831833, + "grad_norm": 0.5226278305053711, + "learning_rate": 4.565007443407822e-05, + "loss": 0.0629, + "step": 13180 + }, + { + "epoch": 1.5837837837837838, + "grad_norm": 0.34603849053382874, + "learning_rate": 4.561874714460753e-05, + "loss": 0.0503, + "step": 13185 + }, + { + "epoch": 1.5843843843843843, + "grad_norm": 0.46688374876976013, + "learning_rate": 4.558742158825197e-05, + "loss": 0.0591, + "step": 13190 + }, + { + "epoch": 1.584984984984985, + "grad_norm": 0.43887630105018616, + "learning_rate": 4.5556097777403154e-05, + "loss": 0.0722, + "step": 13195 + }, + { + "epoch": 1.5855855855855856, + "grad_norm": 0.42593011260032654, + "learning_rate": 4.552477572445199e-05, + "loss": 0.0594, + "step": 13200 + }, + { + "epoch": 1.5861861861861861, + "grad_norm": 0.4000687897205353, + "learning_rate": 4.549345544178873e-05, + "loss": 0.0729, + "step": 13205 + }, + { + "epoch": 1.5867867867867869, + "grad_norm": 0.4771914482116699, + "learning_rate": 4.546213694180286e-05, + "loss": 0.0646, + "step": 13210 + }, + { + "epoch": 1.5873873873873874, + "grad_norm": 0.4418584108352661, + "learning_rate": 4.543082023688324e-05, + "loss": 0.0657, + "step": 13215 + }, + { + "epoch": 1.587987987987988, + "grad_norm": 0.40844422578811646, + "learning_rate": 4.5399505339418e-05, + "loss": 0.0598, + "step": 13220 + }, + { + "epoch": 1.5885885885885886, + "grad_norm": 0.4462772011756897, + "learning_rate": 4.536819226179449e-05, + "loss": 0.0741, + "step": 13225 + }, + { + "epoch": 1.5891891891891892, + "grad_norm": 0.45399248600006104, + "learning_rate": 4.5336881016399416e-05, + "loss": 
0.0626, + "step": 13230 + }, + { + "epoch": 1.5897897897897897, + "grad_norm": 0.5026511549949646, + "learning_rate": 4.530557161561871e-05, + "loss": 0.0714, + "step": 13235 + }, + { + "epoch": 1.5903903903903904, + "grad_norm": 0.4819416105747223, + "learning_rate": 4.5274264071837646e-05, + "loss": 0.0764, + "step": 13240 + }, + { + "epoch": 1.590990990990991, + "grad_norm": 0.37962061166763306, + "learning_rate": 4.524295839744065e-05, + "loss": 0.0627, + "step": 13245 + }, + { + "epoch": 1.5915915915915915, + "grad_norm": 0.5724044442176819, + "learning_rate": 4.521165460481151e-05, + "loss": 0.0727, + "step": 13250 + }, + { + "epoch": 1.5915915915915915, + "eval_loss": 0.07784755527973175, + "eval_runtime": 35.7222, + "eval_samples_per_second": 22.395, + "eval_steps_per_second": 5.599, + "step": 13250 + }, + { + "epoch": 1.5921921921921922, + "grad_norm": 0.40502458810806274, + "learning_rate": 4.518035270633321e-05, + "loss": 0.0611, + "step": 13255 + }, + { + "epoch": 1.592792792792793, + "grad_norm": 0.3639732599258423, + "learning_rate": 4.514905271438802e-05, + "loss": 0.0559, + "step": 13260 + }, + { + "epoch": 1.5933933933933933, + "grad_norm": 0.5118473172187805, + "learning_rate": 4.5117754641357455e-05, + "loss": 0.066, + "step": 13265 + }, + { + "epoch": 1.593993993993994, + "grad_norm": 0.46344873309135437, + "learning_rate": 4.508645849962222e-05, + "loss": 0.0582, + "step": 13270 + }, + { + "epoch": 1.5945945945945947, + "grad_norm": 0.5766790509223938, + "learning_rate": 4.505516430156232e-05, + "loss": 0.0626, + "step": 13275 + }, + { + "epoch": 1.595195195195195, + "grad_norm": 0.37287622690200806, + "learning_rate": 4.502387205955695e-05, + "loss": 0.0707, + "step": 13280 + }, + { + "epoch": 1.5957957957957958, + "grad_norm": 0.4307993948459625, + "learning_rate": 4.4992581785984574e-05, + "loss": 0.063, + "step": 13285 + }, + { + "epoch": 1.5963963963963965, + "grad_norm": 0.42525312304496765, + "learning_rate": 4.496129349322282e-05, + "loss": 0.0716, + "step": 13290 + }, + { + "epoch": 1.596996996996997, + "grad_norm": 0.5460354685783386, + "learning_rate": 4.493000719364857e-05, + "loss": 0.0709, + "step": 13295 + }, + { + "epoch": 1.5975975975975976, + "grad_norm": 0.3980024755001068, + "learning_rate": 4.489872289963792e-05, + "loss": 0.0592, + "step": 13300 + }, + { + "epoch": 1.5981981981981983, + "grad_norm": 0.42124074697494507, + "learning_rate": 4.486744062356614e-05, + "loss": 0.076, + "step": 13305 + }, + { + "epoch": 1.5987987987987988, + "grad_norm": 0.5677942037582397, + "learning_rate": 4.483616037780776e-05, + "loss": 0.0667, + "step": 13310 + }, + { + "epoch": 1.5993993993993993, + "grad_norm": 0.4418148398399353, + "learning_rate": 4.4804882174736425e-05, + "loss": 0.0667, + "step": 13315 + }, + { + "epoch": 1.6, + "grad_norm": 0.5681725740432739, + "learning_rate": 4.477360602672504e-05, + "loss": 0.0699, + "step": 13320 + }, + { + "epoch": 1.6006006006006006, + "grad_norm": 0.43010297417640686, + "learning_rate": 4.4742331946145673e-05, + "loss": 0.0674, + "step": 13325 + }, + { + "epoch": 1.6012012012012011, + "grad_norm": 0.5609509348869324, + "learning_rate": 4.471105994536958e-05, + "loss": 0.071, + "step": 13330 + }, + { + "epoch": 1.6018018018018019, + "grad_norm": 0.4860755205154419, + "learning_rate": 4.4679790036767205e-05, + "loss": 0.0648, + "step": 13335 + }, + { + "epoch": 1.6024024024024024, + "grad_norm": 0.487942099571228, + "learning_rate": 4.464852223270811e-05, + "loss": 0.0681, + "step": 13340 + }, + { + "epoch": 
1.603003003003003, + "grad_norm": 0.48040616512298584, + "learning_rate": 4.46172565455611e-05, + "loss": 0.0573, + "step": 13345 + }, + { + "epoch": 1.6036036036036037, + "grad_norm": 0.36271554231643677, + "learning_rate": 4.458599298769407e-05, + "loss": 0.0625, + "step": 13350 + }, + { + "epoch": 1.6042042042042042, + "grad_norm": 0.474427193403244, + "learning_rate": 4.455473157147414e-05, + "loss": 0.0513, + "step": 13355 + }, + { + "epoch": 1.6048048048048047, + "grad_norm": 0.34347042441368103, + "learning_rate": 4.452347230926757e-05, + "loss": 0.0555, + "step": 13360 + }, + { + "epoch": 1.6054054054054054, + "grad_norm": 0.4109949767589569, + "learning_rate": 4.44922152134397e-05, + "loss": 0.0632, + "step": 13365 + }, + { + "epoch": 1.606006006006006, + "grad_norm": 0.35330331325531006, + "learning_rate": 4.4460960296355074e-05, + "loss": 0.0563, + "step": 13370 + }, + { + "epoch": 1.6066066066066065, + "grad_norm": 0.24096760153770447, + "learning_rate": 4.442970757037739e-05, + "loss": 0.0563, + "step": 13375 + }, + { + "epoch": 1.6072072072072072, + "grad_norm": 0.3883642256259918, + "learning_rate": 4.439845704786945e-05, + "loss": 0.0624, + "step": 13380 + }, + { + "epoch": 1.607807807807808, + "grad_norm": 0.5604926943778992, + "learning_rate": 4.436720874119316e-05, + "loss": 0.07, + "step": 13385 + }, + { + "epoch": 1.6084084084084083, + "grad_norm": 0.4338735342025757, + "learning_rate": 4.433596266270959e-05, + "loss": 0.0548, + "step": 13390 + }, + { + "epoch": 1.609009009009009, + "grad_norm": 0.4170035421848297, + "learning_rate": 4.430471882477891e-05, + "loss": 0.066, + "step": 13395 + }, + { + "epoch": 1.6096096096096097, + "grad_norm": 0.39704200625419617, + "learning_rate": 4.427347723976042e-05, + "loss": 0.0713, + "step": 13400 + }, + { + "epoch": 1.61021021021021, + "grad_norm": 0.382910817861557, + "learning_rate": 4.424223792001253e-05, + "loss": 0.0567, + "step": 13405 + }, + { + "epoch": 1.6108108108108108, + "grad_norm": 0.4715207815170288, + "learning_rate": 4.42110008778927e-05, + "loss": 0.054, + "step": 13410 + }, + { + "epoch": 1.6114114114114115, + "grad_norm": 0.3665212392807007, + "learning_rate": 4.417976612575755e-05, + "loss": 0.0554, + "step": 13415 + }, + { + "epoch": 1.612012012012012, + "grad_norm": 0.4208891689777374, + "learning_rate": 4.4148533675962774e-05, + "loss": 0.0689, + "step": 13420 + }, + { + "epoch": 1.6126126126126126, + "grad_norm": 0.4575747847557068, + "learning_rate": 4.411730354086318e-05, + "loss": 0.0714, + "step": 13425 + }, + { + "epoch": 1.6132132132132133, + "grad_norm": 0.42173752188682556, + "learning_rate": 4.408607573281261e-05, + "loss": 0.0634, + "step": 13430 + }, + { + "epoch": 1.6138138138138138, + "grad_norm": 0.45820704102516174, + "learning_rate": 4.4054850264164e-05, + "loss": 0.0697, + "step": 13435 + }, + { + "epoch": 1.6144144144144144, + "grad_norm": 0.4981316030025482, + "learning_rate": 4.402362714726941e-05, + "loss": 0.0629, + "step": 13440 + }, + { + "epoch": 1.615015015015015, + "grad_norm": 0.4847612679004669, + "learning_rate": 4.399240639447989e-05, + "loss": 0.0536, + "step": 13445 + }, + { + "epoch": 1.6156156156156156, + "grad_norm": 0.38429567217826843, + "learning_rate": 4.3961188018145644e-05, + "loss": 0.0674, + "step": 13450 + }, + { + "epoch": 1.6162162162162161, + "grad_norm": 0.45922940969467163, + "learning_rate": 4.3929972030615834e-05, + "loss": 0.0707, + "step": 13455 + }, + { + "epoch": 1.6168168168168169, + "grad_norm": 0.4565389156341553, + "learning_rate": 
4.389875844423876e-05, + "loss": 0.0674, + "step": 13460 + }, + { + "epoch": 1.6174174174174174, + "grad_norm": 0.46716755628585815, + "learning_rate": 4.3867547271361745e-05, + "loss": 0.0632, + "step": 13465 + }, + { + "epoch": 1.618018018018018, + "grad_norm": 0.4541369676589966, + "learning_rate": 4.383633852433116e-05, + "loss": 0.0819, + "step": 13470 + }, + { + "epoch": 1.6186186186186187, + "grad_norm": 0.427727073431015, + "learning_rate": 4.380513221549242e-05, + "loss": 0.0653, + "step": 13475 + }, + { + "epoch": 1.6192192192192192, + "grad_norm": 0.4517422020435333, + "learning_rate": 4.377392835718993e-05, + "loss": 0.0643, + "step": 13480 + }, + { + "epoch": 1.6198198198198197, + "grad_norm": 0.41785985231399536, + "learning_rate": 4.37427269617672e-05, + "loss": 0.0772, + "step": 13485 + }, + { + "epoch": 1.6204204204204204, + "grad_norm": 0.5061773657798767, + "learning_rate": 4.3711528041566705e-05, + "loss": 0.0597, + "step": 13490 + }, + { + "epoch": 1.621021021021021, + "grad_norm": 0.37822848558425903, + "learning_rate": 4.368033160892998e-05, + "loss": 0.0533, + "step": 13495 + }, + { + "epoch": 1.6216216216216215, + "grad_norm": 0.4786316156387329, + "learning_rate": 4.364913767619758e-05, + "loss": 0.0591, + "step": 13500 + }, + { + "epoch": 1.6216216216216215, + "eval_loss": 0.07384883612394333, + "eval_runtime": 35.7195, + "eval_samples_per_second": 22.397, + "eval_steps_per_second": 5.599, + "step": 13500 + }, + { + "epoch": 1.6222222222222222, + "grad_norm": 0.5200320482254028, + "learning_rate": 4.361794625570901e-05, + "loss": 0.064, + "step": 13505 + }, + { + "epoch": 1.622822822822823, + "grad_norm": 0.4395144581794739, + "learning_rate": 4.3586757359802835e-05, + "loss": 0.0669, + "step": 13510 + }, + { + "epoch": 1.6234234234234233, + "grad_norm": 0.3707331120967865, + "learning_rate": 4.355557100081663e-05, + "loss": 0.0507, + "step": 13515 + }, + { + "epoch": 1.624024024024024, + "grad_norm": 0.4052436053752899, + "learning_rate": 4.352438719108695e-05, + "loss": 0.0515, + "step": 13520 + }, + { + "epoch": 1.6246246246246248, + "grad_norm": 0.549052894115448, + "learning_rate": 4.34932059429493e-05, + "loss": 0.0661, + "step": 13525 + }, + { + "epoch": 1.6252252252252253, + "grad_norm": 0.4563562572002411, + "learning_rate": 4.346202726873825e-05, + "loss": 0.0545, + "step": 13530 + }, + { + "epoch": 1.6258258258258258, + "grad_norm": 0.5008108615875244, + "learning_rate": 4.3430851180787274e-05, + "loss": 0.0569, + "step": 13535 + }, + { + "epoch": 1.6264264264264265, + "grad_norm": 0.5426075458526611, + "learning_rate": 4.339967769142889e-05, + "loss": 0.0797, + "step": 13540 + }, + { + "epoch": 1.627027027027027, + "grad_norm": 0.4764677584171295, + "learning_rate": 4.3368506812994555e-05, + "loss": 0.0726, + "step": 13545 + }, + { + "epoch": 1.6276276276276276, + "grad_norm": 0.463705450296402, + "learning_rate": 4.333733855781468e-05, + "loss": 0.0614, + "step": 13550 + }, + { + "epoch": 1.6282282282282283, + "grad_norm": 0.43991366028785706, + "learning_rate": 4.330617293821866e-05, + "loss": 0.0702, + "step": 13555 + }, + { + "epoch": 1.6288288288288288, + "grad_norm": 0.5054963231086731, + "learning_rate": 4.327500996653485e-05, + "loss": 0.0444, + "step": 13560 + }, + { + "epoch": 1.6294294294294294, + "grad_norm": 0.4601389467716217, + "learning_rate": 4.324384965509054e-05, + "loss": 0.0626, + "step": 13565 + }, + { + "epoch": 1.63003003003003, + "grad_norm": 0.4446878135204315, + "learning_rate": 4.3212692016212006e-05, + "loss": 0.0577, + 
"step": 13570 + }, + { + "epoch": 1.6306306306306306, + "grad_norm": 0.3773181736469269, + "learning_rate": 4.31815370622244e-05, + "loss": 0.0589, + "step": 13575 + }, + { + "epoch": 1.6312312312312311, + "grad_norm": 0.5154123306274414, + "learning_rate": 4.315038480545188e-05, + "loss": 0.0561, + "step": 13580 + }, + { + "epoch": 1.631831831831832, + "grad_norm": 0.46419796347618103, + "learning_rate": 4.3119235258217473e-05, + "loss": 0.0561, + "step": 13585 + }, + { + "epoch": 1.6324324324324324, + "grad_norm": 0.47348397970199585, + "learning_rate": 4.308808843284322e-05, + "loss": 0.0639, + "step": 13590 + }, + { + "epoch": 1.633033033033033, + "grad_norm": 0.5072610378265381, + "learning_rate": 4.305694434164997e-05, + "loss": 0.0637, + "step": 13595 + }, + { + "epoch": 1.6336336336336337, + "grad_norm": 0.4275527894496918, + "learning_rate": 4.30258029969576e-05, + "loss": 0.0658, + "step": 13600 + }, + { + "epoch": 1.6342342342342342, + "grad_norm": 0.36067524552345276, + "learning_rate": 4.299466441108484e-05, + "loss": 0.0512, + "step": 13605 + }, + { + "epoch": 1.6348348348348347, + "grad_norm": 0.45838814973831177, + "learning_rate": 4.296352859634934e-05, + "loss": 0.0711, + "step": 13610 + }, + { + "epoch": 1.6354354354354355, + "grad_norm": 0.4847932755947113, + "learning_rate": 4.293239556506768e-05, + "loss": 0.0587, + "step": 13615 + }, + { + "epoch": 1.6360360360360362, + "grad_norm": 0.48203244805336, + "learning_rate": 4.290126532955529e-05, + "loss": 0.0654, + "step": 13620 + }, + { + "epoch": 1.6366366366366365, + "grad_norm": 0.4438318908214569, + "learning_rate": 4.2870137902126534e-05, + "loss": 0.0582, + "step": 13625 + }, + { + "epoch": 1.6372372372372372, + "grad_norm": 0.4480768144130707, + "learning_rate": 4.2839013295094634e-05, + "loss": 0.0521, + "step": 13630 + }, + { + "epoch": 1.637837837837838, + "grad_norm": 0.4005804657936096, + "learning_rate": 4.280789152077173e-05, + "loss": 0.0543, + "step": 13635 + }, + { + "epoch": 1.6384384384384383, + "grad_norm": 0.3628470003604889, + "learning_rate": 4.277677259146884e-05, + "loss": 0.0559, + "step": 13640 + }, + { + "epoch": 1.639039039039039, + "grad_norm": 0.5323807001113892, + "learning_rate": 4.2745656519495796e-05, + "loss": 0.0631, + "step": 13645 + }, + { + "epoch": 1.6396396396396398, + "grad_norm": 0.5428612232208252, + "learning_rate": 4.2714543317161374e-05, + "loss": 0.0609, + "step": 13650 + }, + { + "epoch": 1.6402402402402403, + "grad_norm": 0.5070378184318542, + "learning_rate": 4.268343299677319e-05, + "loss": 0.0522, + "step": 13655 + }, + { + "epoch": 1.6408408408408408, + "grad_norm": 0.40902385115623474, + "learning_rate": 4.265232557063772e-05, + "loss": 0.0589, + "step": 13660 + }, + { + "epoch": 1.6414414414414416, + "grad_norm": 0.412062406539917, + "learning_rate": 4.262122105106028e-05, + "loss": 0.0638, + "step": 13665 + }, + { + "epoch": 1.642042042042042, + "grad_norm": 0.445654958486557, + "learning_rate": 4.259011945034504e-05, + "loss": 0.0729, + "step": 13670 + }, + { + "epoch": 1.6426426426426426, + "grad_norm": 0.4355279803276062, + "learning_rate": 4.2559020780795044e-05, + "loss": 0.0561, + "step": 13675 + }, + { + "epoch": 1.6432432432432433, + "grad_norm": 0.45573675632476807, + "learning_rate": 4.252792505471214e-05, + "loss": 0.0554, + "step": 13680 + }, + { + "epoch": 1.6438438438438439, + "grad_norm": 0.49397581815719604, + "learning_rate": 4.249683228439704e-05, + "loss": 0.0579, + "step": 13685 + }, + { + "epoch": 1.6444444444444444, + "grad_norm": 
0.5173845291137695, + "learning_rate": 4.246574248214925e-05, + "loss": 0.0522, + "step": 13690 + }, + { + "epoch": 1.6450450450450451, + "grad_norm": 0.45187321305274963, + "learning_rate": 4.243465566026714e-05, + "loss": 0.0629, + "step": 13695 + }, + { + "epoch": 1.6456456456456456, + "grad_norm": 0.5049217939376831, + "learning_rate": 4.240357183104789e-05, + "loss": 0.0548, + "step": 13700 + }, + { + "epoch": 1.6462462462462462, + "grad_norm": 0.3371676504611969, + "learning_rate": 4.2372491006787495e-05, + "loss": 0.0686, + "step": 13705 + }, + { + "epoch": 1.646846846846847, + "grad_norm": 0.4380626678466797, + "learning_rate": 4.2341413199780774e-05, + "loss": 0.0609, + "step": 13710 + }, + { + "epoch": 1.6474474474474474, + "grad_norm": 0.48923227190971375, + "learning_rate": 4.2310338422321294e-05, + "loss": 0.056, + "step": 13715 + }, + { + "epoch": 1.648048048048048, + "grad_norm": 0.48743122816085815, + "learning_rate": 4.22792666867015e-05, + "loss": 0.0558, + "step": 13720 + }, + { + "epoch": 1.6486486486486487, + "grad_norm": 0.4722169041633606, + "learning_rate": 4.2248198005212594e-05, + "loss": 0.0545, + "step": 13725 + }, + { + "epoch": 1.6492492492492492, + "grad_norm": 0.5330705046653748, + "learning_rate": 4.221713239014456e-05, + "loss": 0.0596, + "step": 13730 + }, + { + "epoch": 1.6498498498498497, + "grad_norm": 0.39164403080940247, + "learning_rate": 4.218606985378624e-05, + "loss": 0.0527, + "step": 13735 + }, + { + "epoch": 1.6504504504504505, + "grad_norm": 0.5172113180160522, + "learning_rate": 4.2155010408425145e-05, + "loss": 0.0546, + "step": 13740 + }, + { + "epoch": 1.6510510510510512, + "grad_norm": 0.5502013564109802, + "learning_rate": 4.2123954066347636e-05, + "loss": 0.0554, + "step": 13745 + }, + { + "epoch": 1.6516516516516515, + "grad_norm": 0.38821086287498474, + "learning_rate": 4.2092900839838844e-05, + "loss": 0.0543, + "step": 13750 + }, + { + "epoch": 1.6516516516516515, + "eval_loss": 0.06733085960149765, + "eval_runtime": 35.9799, + "eval_samples_per_second": 22.235, + "eval_steps_per_second": 5.559, + "step": 13750 + }, + { + "epoch": 1.6522522522522523, + "grad_norm": 0.43382570147514343, + "learning_rate": 4.2061850741182677e-05, + "loss": 0.0619, + "step": 13755 + }, + { + "epoch": 1.652852852852853, + "grad_norm": 0.33621084690093994, + "learning_rate": 4.203080378266173e-05, + "loss": 0.0629, + "step": 13760 + }, + { + "epoch": 1.6534534534534533, + "grad_norm": 0.34766513109207153, + "learning_rate": 4.199975997655746e-05, + "loss": 0.0477, + "step": 13765 + }, + { + "epoch": 1.654054054054054, + "grad_norm": 0.45685961842536926, + "learning_rate": 4.196871933515e-05, + "loss": 0.0626, + "step": 13770 + }, + { + "epoch": 1.6546546546546548, + "grad_norm": 0.40836599469184875, + "learning_rate": 4.193768187071826e-05, + "loss": 0.0474, + "step": 13775 + }, + { + "epoch": 1.6552552552552553, + "grad_norm": 0.49722006916999817, + "learning_rate": 4.190664759553993e-05, + "loss": 0.0562, + "step": 13780 + }, + { + "epoch": 1.6558558558558558, + "grad_norm": 0.4253612458705902, + "learning_rate": 4.1875616521891354e-05, + "loss": 0.0479, + "step": 13785 + }, + { + "epoch": 1.6564564564564566, + "grad_norm": 0.4923230707645416, + "learning_rate": 4.184458866204767e-05, + "loss": 0.0603, + "step": 13790 + }, + { + "epoch": 1.657057057057057, + "grad_norm": 0.4575973153114319, + "learning_rate": 4.181356402828274e-05, + "loss": 0.0563, + "step": 13795 + }, + { + "epoch": 1.6576576576576576, + "grad_norm": 0.467523455619812, + 
"learning_rate": 4.178254263286914e-05, + "loss": 0.057, + "step": 13800 + }, + { + "epoch": 1.6582582582582583, + "grad_norm": 0.4888710379600525, + "learning_rate": 4.175152448807816e-05, + "loss": 0.0565, + "step": 13805 + }, + { + "epoch": 1.6588588588588589, + "grad_norm": 0.4854770004749298, + "learning_rate": 4.1720509606179806e-05, + "loss": 0.0607, + "step": 13810 + }, + { + "epoch": 1.6594594594594594, + "grad_norm": 0.47110292315483093, + "learning_rate": 4.1689497999442804e-05, + "loss": 0.0509, + "step": 13815 + }, + { + "epoch": 1.6600600600600601, + "grad_norm": 0.35635825991630554, + "learning_rate": 4.165848968013457e-05, + "loss": 0.0622, + "step": 13820 + }, + { + "epoch": 1.6606606606606606, + "grad_norm": 0.3194577395915985, + "learning_rate": 4.162748466052126e-05, + "loss": 0.0562, + "step": 13825 + }, + { + "epoch": 1.6612612612612612, + "grad_norm": 0.4404977262020111, + "learning_rate": 4.1596482952867645e-05, + "loss": 0.0429, + "step": 13830 + }, + { + "epoch": 1.661861861861862, + "grad_norm": 0.45366171002388, + "learning_rate": 4.156548456943724e-05, + "loss": 0.0594, + "step": 13835 + }, + { + "epoch": 1.6624624624624624, + "grad_norm": 0.3973150849342346, + "learning_rate": 4.1534489522492263e-05, + "loss": 0.0479, + "step": 13840 + }, + { + "epoch": 1.663063063063063, + "grad_norm": 0.4103251099586487, + "learning_rate": 4.150349782429357e-05, + "loss": 0.0499, + "step": 13845 + }, + { + "epoch": 1.6636636636636637, + "grad_norm": 0.5832968354225159, + "learning_rate": 4.1472509487100734e-05, + "loss": 0.044, + "step": 13850 + }, + { + "epoch": 1.6642642642642642, + "grad_norm": 0.3851698935031891, + "learning_rate": 4.144152452317194e-05, + "loss": 0.0533, + "step": 13855 + }, + { + "epoch": 1.6648648648648647, + "grad_norm": 0.5267958045005798, + "learning_rate": 4.1410542944764084e-05, + "loss": 0.0743, + "step": 13860 + }, + { + "epoch": 1.6654654654654655, + "grad_norm": 0.36634647846221924, + "learning_rate": 4.137956476413271e-05, + "loss": 0.0599, + "step": 13865 + }, + { + "epoch": 1.6660660660660662, + "grad_norm": 0.4430139362812042, + "learning_rate": 4.134858999353202e-05, + "loss": 0.0642, + "step": 13870 + }, + { + "epoch": 1.6666666666666665, + "grad_norm": 0.5938952565193176, + "learning_rate": 4.1317618645214894e-05, + "loss": 0.0616, + "step": 13875 + }, + { + "epoch": 1.6672672672672673, + "grad_norm": 0.3854408860206604, + "learning_rate": 4.128665073143278e-05, + "loss": 0.0534, + "step": 13880 + }, + { + "epoch": 1.667867867867868, + "grad_norm": 0.4133352041244507, + "learning_rate": 4.1255686264435846e-05, + "loss": 0.0561, + "step": 13885 + }, + { + "epoch": 1.6684684684684683, + "grad_norm": 0.49693113565444946, + "learning_rate": 4.1224725256472856e-05, + "loss": 0.0647, + "step": 13890 + }, + { + "epoch": 1.669069069069069, + "grad_norm": 0.3022277057170868, + "learning_rate": 4.119376771979125e-05, + "loss": 0.0626, + "step": 13895 + }, + { + "epoch": 1.6696696696696698, + "grad_norm": 0.3540533185005188, + "learning_rate": 4.116281366663702e-05, + "loss": 0.0561, + "step": 13900 + }, + { + "epoch": 1.6702702702702703, + "grad_norm": 0.4460397958755493, + "learning_rate": 4.113186310925487e-05, + "loss": 0.0531, + "step": 13905 + }, + { + "epoch": 1.6708708708708708, + "grad_norm": 0.40761151909828186, + "learning_rate": 4.110091605988801e-05, + "loss": 0.049, + "step": 13910 + }, + { + "epoch": 1.6714714714714716, + "grad_norm": 0.4298311769962311, + "learning_rate": 4.106997253077837e-05, + "loss": 0.0654, + "step": 13915 + 
}, + { + "epoch": 1.672072072072072, + "grad_norm": 0.3780181407928467, + "learning_rate": 4.103903253416647e-05, + "loss": 0.0598, + "step": 13920 + }, + { + "epoch": 1.6726726726726726, + "grad_norm": 0.42668402194976807, + "learning_rate": 4.100809608229134e-05, + "loss": 0.0531, + "step": 13925 + }, + { + "epoch": 1.6732732732732734, + "grad_norm": 0.4466029107570648, + "learning_rate": 4.09771631873907e-05, + "loss": 0.0599, + "step": 13930 + }, + { + "epoch": 1.6738738738738739, + "grad_norm": 0.3829571604728699, + "learning_rate": 4.0946233861700856e-05, + "loss": 0.0553, + "step": 13935 + }, + { + "epoch": 1.6744744744744744, + "grad_norm": 0.36364227533340454, + "learning_rate": 4.091530811745667e-05, + "loss": 0.065, + "step": 13940 + }, + { + "epoch": 1.6750750750750751, + "grad_norm": 0.43205782771110535, + "learning_rate": 4.088438596689162e-05, + "loss": 0.0586, + "step": 13945 + }, + { + "epoch": 1.6756756756756757, + "grad_norm": 0.44587403535842896, + "learning_rate": 4.0853467422237705e-05, + "loss": 0.0506, + "step": 13950 + }, + { + "epoch": 1.6762762762762762, + "grad_norm": 0.36174246668815613, + "learning_rate": 4.082255249572557e-05, + "loss": 0.0494, + "step": 13955 + }, + { + "epoch": 1.676876876876877, + "grad_norm": 0.4244382977485657, + "learning_rate": 4.0791641199584364e-05, + "loss": 0.0497, + "step": 13960 + }, + { + "epoch": 1.6774774774774774, + "grad_norm": 0.5707062482833862, + "learning_rate": 4.0760733546041864e-05, + "loss": 0.0536, + "step": 13965 + }, + { + "epoch": 1.678078078078078, + "grad_norm": 0.3987087309360504, + "learning_rate": 4.0729829547324335e-05, + "loss": 0.043, + "step": 13970 + }, + { + "epoch": 1.6786786786786787, + "grad_norm": 0.4837443232536316, + "learning_rate": 4.069892921565666e-05, + "loss": 0.0582, + "step": 13975 + }, + { + "epoch": 1.6792792792792792, + "grad_norm": 0.41558149456977844, + "learning_rate": 4.066803256326223e-05, + "loss": 0.0588, + "step": 13980 + }, + { + "epoch": 1.6798798798798797, + "grad_norm": 0.6192619800567627, + "learning_rate": 4.0637139602363006e-05, + "loss": 0.0582, + "step": 13985 + }, + { + "epoch": 1.6804804804804805, + "grad_norm": 0.45900431275367737, + "learning_rate": 4.06062503451795e-05, + "loss": 0.0616, + "step": 13990 + }, + { + "epoch": 1.6810810810810812, + "grad_norm": 0.5000198483467102, + "learning_rate": 4.057536480393068e-05, + "loss": 0.0608, + "step": 13995 + }, + { + "epoch": 1.6816816816816815, + "grad_norm": 0.48604995012283325, + "learning_rate": 4.054448299083415e-05, + "loss": 0.0533, + "step": 14000 + }, + { + "epoch": 1.6816816816816815, + "eval_loss": 0.06777658313512802, + "eval_runtime": 35.9853, + "eval_samples_per_second": 22.231, + "eval_steps_per_second": 5.558, + "step": 14000 + }, + { + "epoch": 1.6822822822822823, + "grad_norm": 0.46982526779174805, + "learning_rate": 4.0513604918105966e-05, + "loss": 0.0569, + "step": 14005 + }, + { + "epoch": 1.682882882882883, + "grad_norm": 0.29595673084259033, + "learning_rate": 4.048273059796074e-05, + "loss": 0.0429, + "step": 14010 + }, + { + "epoch": 1.6834834834834835, + "grad_norm": 0.5032521486282349, + "learning_rate": 4.04518600426116e-05, + "loss": 0.0484, + "step": 14015 + }, + { + "epoch": 1.684084084084084, + "grad_norm": 0.4738074839115143, + "learning_rate": 4.042099326427014e-05, + "loss": 0.0511, + "step": 14020 + }, + { + "epoch": 1.6846846846846848, + "grad_norm": 0.5076733231544495, + "learning_rate": 4.039013027514651e-05, + "loss": 0.0563, + "step": 14025 + }, + { + "epoch": 
1.6852852852852853, + "grad_norm": 0.45880991220474243, + "learning_rate": 4.035927108744935e-05, + "loss": 0.0474, + "step": 14030 + }, + { + "epoch": 1.6858858858858858, + "grad_norm": 0.3994792699813843, + "learning_rate": 4.0328415713385805e-05, + "loss": 0.0481, + "step": 14035 + }, + { + "epoch": 1.6864864864864866, + "grad_norm": 0.5211197137832642, + "learning_rate": 4.029756416516145e-05, + "loss": 0.0543, + "step": 14040 + }, + { + "epoch": 1.687087087087087, + "grad_norm": 0.3836585283279419, + "learning_rate": 4.026671645498044e-05, + "loss": 0.0536, + "step": 14045 + }, + { + "epoch": 1.6876876876876876, + "grad_norm": 0.45789793133735657, + "learning_rate": 4.023587259504533e-05, + "loss": 0.0495, + "step": 14050 + }, + { + "epoch": 1.6882882882882884, + "grad_norm": 0.4536382853984833, + "learning_rate": 4.0205032597557214e-05, + "loss": 0.0537, + "step": 14055 + }, + { + "epoch": 1.6888888888888889, + "grad_norm": 0.45672139525413513, + "learning_rate": 4.017419647471564e-05, + "loss": 0.0475, + "step": 14060 + }, + { + "epoch": 1.6894894894894894, + "grad_norm": 0.4610535502433777, + "learning_rate": 4.0143364238718564e-05, + "loss": 0.0578, + "step": 14065 + }, + { + "epoch": 1.6900900900900901, + "grad_norm": 0.3575139045715332, + "learning_rate": 4.011253590176249e-05, + "loss": 0.0474, + "step": 14070 + }, + { + "epoch": 1.6906906906906907, + "grad_norm": 0.4532896876335144, + "learning_rate": 4.0081711476042345e-05, + "loss": 0.0518, + "step": 14075 + }, + { + "epoch": 1.6912912912912912, + "grad_norm": 0.41786789894104004, + "learning_rate": 4.00508909737515e-05, + "loss": 0.0579, + "step": 14080 + }, + { + "epoch": 1.691891891891892, + "grad_norm": 0.3306769132614136, + "learning_rate": 4.0020074407081814e-05, + "loss": 0.0541, + "step": 14085 + }, + { + "epoch": 1.6924924924924925, + "grad_norm": 0.38540539145469666, + "learning_rate": 3.998926178822351e-05, + "loss": 0.0446, + "step": 14090 + }, + { + "epoch": 1.693093093093093, + "grad_norm": 0.4451224207878113, + "learning_rate": 3.995845312936532e-05, + "loss": 0.0501, + "step": 14095 + }, + { + "epoch": 1.6936936936936937, + "grad_norm": 0.49187320470809937, + "learning_rate": 3.9927648442694375e-05, + "loss": 0.0673, + "step": 14100 + }, + { + "epoch": 1.6942942942942945, + "grad_norm": 0.3813941478729248, + "learning_rate": 3.989684774039626e-05, + "loss": 0.0519, + "step": 14105 + }, + { + "epoch": 1.6948948948948948, + "grad_norm": 0.34133368730545044, + "learning_rate": 3.9866051034654985e-05, + "loss": 0.0622, + "step": 14110 + }, + { + "epoch": 1.6954954954954955, + "grad_norm": 0.4624498784542084, + "learning_rate": 3.983525833765292e-05, + "loss": 0.0495, + "step": 14115 + }, + { + "epoch": 1.6960960960960962, + "grad_norm": 0.39160802960395813, + "learning_rate": 3.980446966157091e-05, + "loss": 0.0554, + "step": 14120 + }, + { + "epoch": 1.6966966966966965, + "grad_norm": 0.3328275680541992, + "learning_rate": 3.977368501858821e-05, + "loss": 0.0529, + "step": 14125 + }, + { + "epoch": 1.6972972972972973, + "grad_norm": 0.4307607114315033, + "learning_rate": 3.974290442088248e-05, + "loss": 0.0498, + "step": 14130 + }, + { + "epoch": 1.697897897897898, + "grad_norm": 0.41966062784194946, + "learning_rate": 3.97121278806297e-05, + "loss": 0.0501, + "step": 14135 + }, + { + "epoch": 1.6984984984984985, + "grad_norm": 0.4566977918148041, + "learning_rate": 3.968135541000435e-05, + "loss": 0.0525, + "step": 14140 + }, + { + "epoch": 1.699099099099099, + "grad_norm": 0.36930787563323975, + 
"learning_rate": 3.965058702117925e-05, + "loss": 0.044, + "step": 14145 + }, + { + "epoch": 1.6996996996996998, + "grad_norm": 0.46633774042129517, + "learning_rate": 3.961982272632561e-05, + "loss": 0.0537, + "step": 14150 + }, + { + "epoch": 1.7003003003003003, + "grad_norm": 0.5075660347938538, + "learning_rate": 3.9589062537613044e-05, + "loss": 0.0535, + "step": 14155 + }, + { + "epoch": 1.7009009009009008, + "grad_norm": 0.4399413764476776, + "learning_rate": 3.955830646720948e-05, + "loss": 0.0443, + "step": 14160 + }, + { + "epoch": 1.7015015015015016, + "grad_norm": 0.33186614513397217, + "learning_rate": 3.952755452728128e-05, + "loss": 0.0537, + "step": 14165 + }, + { + "epoch": 1.702102102102102, + "grad_norm": 0.4482559263706207, + "learning_rate": 3.949680672999315e-05, + "loss": 0.0401, + "step": 14170 + }, + { + "epoch": 1.7027027027027026, + "grad_norm": 0.34379854798316956, + "learning_rate": 3.9466063087508156e-05, + "loss": 0.0497, + "step": 14175 + }, + { + "epoch": 1.7033033033033034, + "grad_norm": 0.2629309892654419, + "learning_rate": 3.9435323611987736e-05, + "loss": 0.0488, + "step": 14180 + }, + { + "epoch": 1.703903903903904, + "grad_norm": 0.48224538564682007, + "learning_rate": 3.9404588315591634e-05, + "loss": 0.0467, + "step": 14185 + }, + { + "epoch": 1.7045045045045044, + "grad_norm": 0.6717689037322998, + "learning_rate": 3.9373857210478e-05, + "loss": 0.0719, + "step": 14190 + }, + { + "epoch": 1.7051051051051052, + "grad_norm": 0.48857033252716064, + "learning_rate": 3.934313030880328e-05, + "loss": 0.0556, + "step": 14195 + }, + { + "epoch": 1.7057057057057057, + "grad_norm": 0.38591575622558594, + "learning_rate": 3.93124076227223e-05, + "loss": 0.0458, + "step": 14200 + }, + { + "epoch": 1.7063063063063062, + "grad_norm": 0.34425076842308044, + "learning_rate": 3.928168916438815e-05, + "loss": 0.0492, + "step": 14205 + }, + { + "epoch": 1.706906906906907, + "grad_norm": 0.427339106798172, + "learning_rate": 3.925097494595233e-05, + "loss": 0.0556, + "step": 14210 + }, + { + "epoch": 1.7075075075075075, + "grad_norm": 0.4965413510799408, + "learning_rate": 3.9220264979564604e-05, + "loss": 0.054, + "step": 14215 + }, + { + "epoch": 1.708108108108108, + "grad_norm": 0.5718177556991577, + "learning_rate": 3.918955927737308e-05, + "loss": 0.0571, + "step": 14220 + }, + { + "epoch": 1.7087087087087087, + "grad_norm": 0.45786628127098083, + "learning_rate": 3.91588578515242e-05, + "loss": 0.0498, + "step": 14225 + }, + { + "epoch": 1.7093093093093095, + "grad_norm": 0.35944420099258423, + "learning_rate": 3.912816071416264e-05, + "loss": 0.0546, + "step": 14230 + }, + { + "epoch": 1.7099099099099098, + "grad_norm": 0.39434975385665894, + "learning_rate": 3.909746787743147e-05, + "loss": 0.0507, + "step": 14235 + }, + { + "epoch": 1.7105105105105105, + "grad_norm": 0.4029919505119324, + "learning_rate": 3.906677935347197e-05, + "loss": 0.0489, + "step": 14240 + }, + { + "epoch": 1.7111111111111112, + "grad_norm": 0.4398517310619354, + "learning_rate": 3.903609515442379e-05, + "loss": 0.0551, + "step": 14245 + }, + { + "epoch": 1.7117117117117115, + "grad_norm": 0.47671040892601013, + "learning_rate": 3.900541529242487e-05, + "loss": 0.0511, + "step": 14250 + }, + { + "epoch": 1.7117117117117115, + "eval_loss": 0.06355729699134827, + "eval_runtime": 35.8789, + "eval_samples_per_second": 22.297, + "eval_steps_per_second": 5.574, + "step": 14250 + }, + { + "epoch": 1.7123123123123123, + "grad_norm": 0.30654531717300415, + "learning_rate": 
3.897473977961134e-05, + "loss": 0.0506, + "step": 14255 + }, + { + "epoch": 1.712912912912913, + "grad_norm": 0.49727270007133484, + "learning_rate": 3.89440686281177e-05, + "loss": 0.0593, + "step": 14260 + }, + { + "epoch": 1.7135135135135136, + "grad_norm": 0.40689897537231445, + "learning_rate": 3.89134018500767e-05, + "loss": 0.05, + "step": 14265 + }, + { + "epoch": 1.714114114114114, + "grad_norm": 0.38134849071502686, + "learning_rate": 3.8882739457619375e-05, + "loss": 0.0519, + "step": 14270 + }, + { + "epoch": 1.7147147147147148, + "grad_norm": 0.45295876264572144, + "learning_rate": 3.885208146287498e-05, + "loss": 0.0519, + "step": 14275 + }, + { + "epoch": 1.7153153153153153, + "grad_norm": 0.3134695887565613, + "learning_rate": 3.882142787797108e-05, + "loss": 0.0438, + "step": 14280 + }, + { + "epoch": 1.7159159159159159, + "grad_norm": 0.4081864655017853, + "learning_rate": 3.879077871503344e-05, + "loss": 0.0499, + "step": 14285 + }, + { + "epoch": 1.7165165165165166, + "grad_norm": 0.3634887933731079, + "learning_rate": 3.876013398618615e-05, + "loss": 0.0386, + "step": 14290 + }, + { + "epoch": 1.7171171171171171, + "grad_norm": 0.5056455135345459, + "learning_rate": 3.87294937035515e-05, + "loss": 0.0562, + "step": 14295 + }, + { + "epoch": 1.7177177177177176, + "grad_norm": 0.39153170585632324, + "learning_rate": 3.869885787925e-05, + "loss": 0.0466, + "step": 14300 + }, + { + "epoch": 1.7183183183183184, + "grad_norm": 0.5252231359481812, + "learning_rate": 3.866822652540044e-05, + "loss": 0.0501, + "step": 14305 + }, + { + "epoch": 1.718918918918919, + "grad_norm": 0.6176850199699402, + "learning_rate": 3.863759965411981e-05, + "loss": 0.0549, + "step": 14310 + }, + { + "epoch": 1.7195195195195194, + "grad_norm": 0.9495644569396973, + "learning_rate": 3.8606977277523374e-05, + "loss": 0.0515, + "step": 14315 + }, + { + "epoch": 1.7201201201201202, + "grad_norm": 0.5057100653648376, + "learning_rate": 3.857635940772459e-05, + "loss": 0.044, + "step": 14320 + }, + { + "epoch": 1.7207207207207207, + "grad_norm": 0.4010032117366791, + "learning_rate": 3.854574605683508e-05, + "loss": 0.0542, + "step": 14325 + }, + { + "epoch": 1.7213213213213212, + "grad_norm": 0.3846031725406647, + "learning_rate": 3.8515137236964766e-05, + "loss": 0.054, + "step": 14330 + }, + { + "epoch": 1.721921921921922, + "grad_norm": 0.3945229649543762, + "learning_rate": 3.848453296022172e-05, + "loss": 0.0583, + "step": 14335 + }, + { + "epoch": 1.7225225225225225, + "grad_norm": 0.3763466477394104, + "learning_rate": 3.845393323871226e-05, + "loss": 0.061, + "step": 14340 + }, + { + "epoch": 1.723123123123123, + "grad_norm": 0.3965177536010742, + "learning_rate": 3.842333808454084e-05, + "loss": 0.0441, + "step": 14345 + }, + { + "epoch": 1.7237237237237237, + "grad_norm": 0.4570268988609314, + "learning_rate": 3.839274750981017e-05, + "loss": 0.0512, + "step": 14350 + }, + { + "epoch": 1.7243243243243245, + "grad_norm": 0.4227088987827301, + "learning_rate": 3.8362161526621115e-05, + "loss": 0.0453, + "step": 14355 + }, + { + "epoch": 1.7249249249249248, + "grad_norm": 0.40480488538742065, + "learning_rate": 3.8331580147072746e-05, + "loss": 0.0454, + "step": 14360 + }, + { + "epoch": 1.7255255255255255, + "grad_norm": 0.37806281447410583, + "learning_rate": 3.830100338326231e-05, + "loss": 0.0516, + "step": 14365 + }, + { + "epoch": 1.7261261261261263, + "grad_norm": 0.3402007520198822, + "learning_rate": 3.8270431247285174e-05, + "loss": 0.0398, + "step": 14370 + }, + { + "epoch": 
1.7267267267267268, + "grad_norm": 0.4493679702281952, + "learning_rate": 3.8239863751234956e-05, + "loss": 0.0539, + "step": 14375 + }, + { + "epoch": 1.7273273273273273, + "grad_norm": 0.4124431312084198, + "learning_rate": 3.8209300907203376e-05, + "loss": 0.043, + "step": 14380 + }, + { + "epoch": 1.727927927927928, + "grad_norm": 0.41435253620147705, + "learning_rate": 3.817874272728035e-05, + "loss": 0.0432, + "step": 14385 + }, + { + "epoch": 1.7285285285285286, + "grad_norm": 0.43934547901153564, + "learning_rate": 3.814818922355396e-05, + "loss": 0.0524, + "step": 14390 + }, + { + "epoch": 1.729129129129129, + "grad_norm": 0.30731597542762756, + "learning_rate": 3.8117640408110366e-05, + "loss": 0.0405, + "step": 14395 + }, + { + "epoch": 1.7297297297297298, + "grad_norm": 0.5254538655281067, + "learning_rate": 3.8087096293033954e-05, + "loss": 0.0502, + "step": 14400 + }, + { + "epoch": 1.7303303303303303, + "grad_norm": 0.4425089359283447, + "learning_rate": 3.805655689040721e-05, + "loss": 0.0506, + "step": 14405 + }, + { + "epoch": 1.7309309309309309, + "grad_norm": 0.305950790643692, + "learning_rate": 3.80260222123108e-05, + "loss": 0.0504, + "step": 14410 + }, + { + "epoch": 1.7315315315315316, + "grad_norm": 0.32326582074165344, + "learning_rate": 3.799549227082343e-05, + "loss": 0.0387, + "step": 14415 + }, + { + "epoch": 1.7321321321321321, + "grad_norm": 0.37440550327301025, + "learning_rate": 3.796496707802202e-05, + "loss": 0.041, + "step": 14420 + }, + { + "epoch": 1.7327327327327327, + "grad_norm": 0.5314804315567017, + "learning_rate": 3.7934446645981566e-05, + "loss": 0.0556, + "step": 14425 + }, + { + "epoch": 1.7333333333333334, + "grad_norm": 0.3879922330379486, + "learning_rate": 3.7903930986775206e-05, + "loss": 0.0486, + "step": 14430 + }, + { + "epoch": 1.733933933933934, + "grad_norm": 0.45070913434028625, + "learning_rate": 3.78734201124742e-05, + "loss": 0.0492, + "step": 14435 + }, + { + "epoch": 1.7345345345345344, + "grad_norm": 0.4279508590698242, + "learning_rate": 3.784291403514785e-05, + "loss": 0.0446, + "step": 14440 + }, + { + "epoch": 1.7351351351351352, + "grad_norm": 0.3520037829875946, + "learning_rate": 3.781241276686362e-05, + "loss": 0.0504, + "step": 14445 + }, + { + "epoch": 1.7357357357357357, + "grad_norm": 0.4254855811595917, + "learning_rate": 3.778191631968707e-05, + "loss": 0.0467, + "step": 14450 + }, + { + "epoch": 1.7363363363363362, + "grad_norm": 0.4110926687717438, + "learning_rate": 3.775142470568183e-05, + "loss": 0.044, + "step": 14455 + }, + { + "epoch": 1.736936936936937, + "grad_norm": 0.4648517072200775, + "learning_rate": 3.772093793690964e-05, + "loss": 0.0519, + "step": 14460 + }, + { + "epoch": 1.7375375375375377, + "grad_norm": 0.3212833106517792, + "learning_rate": 3.7690456025430295e-05, + "loss": 0.0561, + "step": 14465 + }, + { + "epoch": 1.738138138138138, + "grad_norm": 0.4225824177265167, + "learning_rate": 3.765997898330169e-05, + "loss": 0.0435, + "step": 14470 + }, + { + "epoch": 1.7387387387387387, + "grad_norm": 0.3474774956703186, + "learning_rate": 3.762950682257978e-05, + "loss": 0.0505, + "step": 14475 + }, + { + "epoch": 1.7393393393393395, + "grad_norm": 0.4199853837490082, + "learning_rate": 3.7599039555318626e-05, + "loss": 0.0493, + "step": 14480 + }, + { + "epoch": 1.7399399399399398, + "grad_norm": 0.44869092106819153, + "learning_rate": 3.756857719357027e-05, + "loss": 0.0411, + "step": 14485 + }, + { + "epoch": 1.7405405405405405, + "grad_norm": 0.45615318417549133, + "learning_rate": 
3.753811974938491e-05, + "loss": 0.0507, + "step": 14490 + }, + { + "epoch": 1.7411411411411413, + "grad_norm": 0.43175235390663147, + "learning_rate": 3.750766723481076e-05, + "loss": 0.0599, + "step": 14495 + }, + { + "epoch": 1.7417417417417418, + "grad_norm": 0.42501696944236755, + "learning_rate": 3.747721966189405e-05, + "loss": 0.0462, + "step": 14500 + }, + { + "epoch": 1.7417417417417418, + "eval_loss": 0.060369253158569336, + "eval_runtime": 35.8652, + "eval_samples_per_second": 22.306, + "eval_steps_per_second": 5.576, + "step": 14500 + }, + { + "epoch": 1.7423423423423423, + "grad_norm": 0.4761236011981964, + "learning_rate": 3.744677704267913e-05, + "loss": 0.0526, + "step": 14505 + }, + { + "epoch": 1.742942942942943, + "grad_norm": 0.4406774640083313, + "learning_rate": 3.74163393892083e-05, + "loss": 0.0474, + "step": 14510 + }, + { + "epoch": 1.7435435435435436, + "grad_norm": 0.5392323732376099, + "learning_rate": 3.738590671352197e-05, + "loss": 0.0529, + "step": 14515 + }, + { + "epoch": 1.744144144144144, + "grad_norm": 0.37301626801490784, + "learning_rate": 3.735547902765853e-05, + "loss": 0.0469, + "step": 14520 + }, + { + "epoch": 1.7447447447447448, + "grad_norm": 0.5395182967185974, + "learning_rate": 3.732505634365443e-05, + "loss": 0.0435, + "step": 14525 + }, + { + "epoch": 1.7453453453453454, + "grad_norm": 0.44104713201522827, + "learning_rate": 3.729463867354417e-05, + "loss": 0.0456, + "step": 14530 + }, + { + "epoch": 1.7459459459459459, + "grad_norm": 0.3940773606300354, + "learning_rate": 3.726422602936016e-05, + "loss": 0.0501, + "step": 14535 + }, + { + "epoch": 1.7465465465465466, + "grad_norm": 0.3776192367076874, + "learning_rate": 3.7233818423132915e-05, + "loss": 0.0433, + "step": 14540 + }, + { + "epoch": 1.7471471471471471, + "grad_norm": 0.43645375967025757, + "learning_rate": 3.720341586689095e-05, + "loss": 0.0536, + "step": 14545 + }, + { + "epoch": 1.7477477477477477, + "grad_norm": 0.4166800081729889, + "learning_rate": 3.7173018372660745e-05, + "loss": 0.0438, + "step": 14550 + }, + { + "epoch": 1.7483483483483484, + "grad_norm": 0.32177260518074036, + "learning_rate": 3.7142625952466805e-05, + "loss": 0.0443, + "step": 14555 + }, + { + "epoch": 1.748948948948949, + "grad_norm": 0.5205382108688354, + "learning_rate": 3.71122386183316e-05, + "loss": 0.0444, + "step": 14560 + }, + { + "epoch": 1.7495495495495494, + "grad_norm": 0.37180835008621216, + "learning_rate": 3.708185638227564e-05, + "loss": 0.0368, + "step": 14565 + }, + { + "epoch": 1.7501501501501502, + "grad_norm": 0.39908167719841003, + "learning_rate": 3.7051479256317345e-05, + "loss": 0.054, + "step": 14570 + }, + { + "epoch": 1.7507507507507507, + "grad_norm": 0.39011678099632263, + "learning_rate": 3.7021107252473196e-05, + "loss": 0.0476, + "step": 14575 + }, + { + "epoch": 1.7513513513513512, + "grad_norm": 0.29879021644592285, + "learning_rate": 3.699074038275756e-05, + "loss": 0.04, + "step": 14580 + }, + { + "epoch": 1.751951951951952, + "grad_norm": 0.42431557178497314, + "learning_rate": 3.696037865918285e-05, + "loss": 0.0525, + "step": 14585 + }, + { + "epoch": 1.7525525525525527, + "grad_norm": 0.384691059589386, + "learning_rate": 3.69300220937594e-05, + "loss": 0.0459, + "step": 14590 + }, + { + "epoch": 1.753153153153153, + "grad_norm": 0.4510609805583954, + "learning_rate": 3.689967069849552e-05, + "loss": 0.0571, + "step": 14595 + }, + { + "epoch": 1.7537537537537538, + "grad_norm": 0.3073751628398895, + "learning_rate": 3.686932448539749e-05, + "loss": 
0.0455, + "step": 14600 + }, + { + "epoch": 1.7543543543543545, + "grad_norm": 0.443437397480011, + "learning_rate": 3.683898346646948e-05, + "loss": 0.0393, + "step": 14605 + }, + { + "epoch": 1.7549549549549548, + "grad_norm": 0.36913490295410156, + "learning_rate": 3.6808647653713676e-05, + "loss": 0.0387, + "step": 14610 + }, + { + "epoch": 1.7555555555555555, + "grad_norm": 0.3835843801498413, + "learning_rate": 3.677831705913016e-05, + "loss": 0.0448, + "step": 14615 + }, + { + "epoch": 1.7561561561561563, + "grad_norm": 0.5466217994689941, + "learning_rate": 3.674799169471695e-05, + "loss": 0.0567, + "step": 14620 + }, + { + "epoch": 1.7567567567567568, + "grad_norm": 0.4442767798900604, + "learning_rate": 3.671767157247007e-05, + "loss": 0.0536, + "step": 14625 + }, + { + "epoch": 1.7573573573573573, + "grad_norm": 0.405453622341156, + "learning_rate": 3.668735670438333e-05, + "loss": 0.0459, + "step": 14630 + }, + { + "epoch": 1.757957957957958, + "grad_norm": 0.35475534200668335, + "learning_rate": 3.665704710244859e-05, + "loss": 0.0406, + "step": 14635 + }, + { + "epoch": 1.7585585585585586, + "grad_norm": 0.38387325406074524, + "learning_rate": 3.6626742778655556e-05, + "loss": 0.0416, + "step": 14640 + }, + { + "epoch": 1.759159159159159, + "grad_norm": 0.5198370814323425, + "learning_rate": 3.65964437449919e-05, + "loss": 0.0497, + "step": 14645 + }, + { + "epoch": 1.7597597597597598, + "grad_norm": 0.3503769636154175, + "learning_rate": 3.656615001344313e-05, + "loss": 0.0485, + "step": 14650 + }, + { + "epoch": 1.7603603603603604, + "grad_norm": 0.42356932163238525, + "learning_rate": 3.653586159599272e-05, + "loss": 0.0528, + "step": 14655 + }, + { + "epoch": 1.7609609609609609, + "grad_norm": 0.33685001730918884, + "learning_rate": 3.6505578504622004e-05, + "loss": 0.0425, + "step": 14660 + }, + { + "epoch": 1.7615615615615616, + "grad_norm": 0.36791616678237915, + "learning_rate": 3.647530075131023e-05, + "loss": 0.0528, + "step": 14665 + }, + { + "epoch": 1.7621621621621621, + "grad_norm": 0.4632832407951355, + "learning_rate": 3.644502834803454e-05, + "loss": 0.0399, + "step": 14670 + }, + { + "epoch": 1.7627627627627627, + "grad_norm": 0.5057429075241089, + "learning_rate": 3.641476130676992e-05, + "loss": 0.046, + "step": 14675 + }, + { + "epoch": 1.7633633633633634, + "grad_norm": 0.41748347878456116, + "learning_rate": 3.638449963948927e-05, + "loss": 0.0415, + "step": 14680 + }, + { + "epoch": 1.763963963963964, + "grad_norm": 0.3554884195327759, + "learning_rate": 3.6354243358163365e-05, + "loss": 0.0393, + "step": 14685 + }, + { + "epoch": 1.7645645645645645, + "grad_norm": 0.4177655279636383, + "learning_rate": 3.632399247476084e-05, + "loss": 0.0424, + "step": 14690 + }, + { + "epoch": 1.7651651651651652, + "grad_norm": 0.3438865542411804, + "learning_rate": 3.629374700124821e-05, + "loss": 0.0404, + "step": 14695 + }, + { + "epoch": 1.7657657657657657, + "grad_norm": 0.4452444314956665, + "learning_rate": 3.62635069495898e-05, + "loss": 0.0458, + "step": 14700 + }, + { + "epoch": 1.7663663663663662, + "grad_norm": 0.4158708453178406, + "learning_rate": 3.6233272331747854e-05, + "loss": 0.0473, + "step": 14705 + }, + { + "epoch": 1.766966966966967, + "grad_norm": 0.33571508526802063, + "learning_rate": 3.620304315968242e-05, + "loss": 0.0445, + "step": 14710 + }, + { + "epoch": 1.7675675675675677, + "grad_norm": 0.44375666975975037, + "learning_rate": 3.617281944535144e-05, + "loss": 0.0561, + "step": 14715 + }, + { + "epoch": 1.768168168168168, + 
"grad_norm": 0.4546729326248169, + "learning_rate": 3.6142601200710614e-05, + "loss": 0.0489, + "step": 14720 + }, + { + "epoch": 1.7687687687687688, + "grad_norm": 0.3842746615409851, + "learning_rate": 3.611238843771356e-05, + "loss": 0.043, + "step": 14725 + }, + { + "epoch": 1.7693693693693695, + "grad_norm": 0.41635391116142273, + "learning_rate": 3.608218116831171e-05, + "loss": 0.0422, + "step": 14730 + }, + { + "epoch": 1.76996996996997, + "grad_norm": 0.39130958914756775, + "learning_rate": 3.60519794044543e-05, + "loss": 0.0506, + "step": 14735 + }, + { + "epoch": 1.7705705705705705, + "grad_norm": 0.3856130838394165, + "learning_rate": 3.6021783158088415e-05, + "loss": 0.0439, + "step": 14740 + }, + { + "epoch": 1.7711711711711713, + "grad_norm": 0.28414976596832275, + "learning_rate": 3.599159244115892e-05, + "loss": 0.0502, + "step": 14745 + }, + { + "epoch": 1.7717717717717718, + "grad_norm": 0.42334309220314026, + "learning_rate": 3.596140726560853e-05, + "loss": 0.0494, + "step": 14750 + }, + { + "epoch": 1.7717717717717718, + "eval_loss": 0.057185668498277664, + "eval_runtime": 35.8904, + "eval_samples_per_second": 22.29, + "eval_steps_per_second": 5.573, + "step": 14750 + }, + { + "epoch": 1.7723723723723723, + "grad_norm": 0.42764347791671753, + "learning_rate": 3.593122764337773e-05, + "loss": 0.0419, + "step": 14755 + }, + { + "epoch": 1.772972972972973, + "grad_norm": 0.4755389988422394, + "learning_rate": 3.590105358640485e-05, + "loss": 0.0529, + "step": 14760 + }, + { + "epoch": 1.7735735735735736, + "grad_norm": 0.34451594948768616, + "learning_rate": 3.587088510662603e-05, + "loss": 0.0352, + "step": 14765 + }, + { + "epoch": 1.7741741741741741, + "grad_norm": 0.3233650326728821, + "learning_rate": 3.584072221597511e-05, + "loss": 0.0477, + "step": 14770 + }, + { + "epoch": 1.7747747747747749, + "grad_norm": 0.44847461581230164, + "learning_rate": 3.5810564926383814e-05, + "loss": 0.0457, + "step": 14775 + }, + { + "epoch": 1.7753753753753754, + "grad_norm": 0.3650940954685211, + "learning_rate": 3.5780413249781616e-05, + "loss": 0.0406, + "step": 14780 + }, + { + "epoch": 1.775975975975976, + "grad_norm": 0.4203718304634094, + "learning_rate": 3.5750267198095804e-05, + "loss": 0.0463, + "step": 14785 + }, + { + "epoch": 1.7765765765765766, + "grad_norm": 0.4171884059906006, + "learning_rate": 3.5720126783251354e-05, + "loss": 0.0455, + "step": 14790 + }, + { + "epoch": 1.7771771771771772, + "grad_norm": 0.2850398123264313, + "learning_rate": 3.568999201717111e-05, + "loss": 0.0452, + "step": 14795 + }, + { + "epoch": 1.7777777777777777, + "grad_norm": 0.4151475727558136, + "learning_rate": 3.565986291177561e-05, + "loss": 0.0461, + "step": 14800 + }, + { + "epoch": 1.7783783783783784, + "grad_norm": 0.40226131677627563, + "learning_rate": 3.5629739478983195e-05, + "loss": 0.0386, + "step": 14805 + }, + { + "epoch": 1.778978978978979, + "grad_norm": 0.4193638861179352, + "learning_rate": 3.559962173070996e-05, + "loss": 0.0512, + "step": 14810 + }, + { + "epoch": 1.7795795795795795, + "grad_norm": 0.2949560284614563, + "learning_rate": 3.556950967886973e-05, + "loss": 0.0327, + "step": 14815 + }, + { + "epoch": 1.7801801801801802, + "grad_norm": 0.452722430229187, + "learning_rate": 3.5539403335374065e-05, + "loss": 0.0437, + "step": 14820 + }, + { + "epoch": 1.7807807807807807, + "grad_norm": 0.335397332906723, + "learning_rate": 3.5509302712132326e-05, + "loss": 0.0414, + "step": 14825 + }, + { + "epoch": 1.7813813813813812, + "grad_norm": 0.3416590690612793, + 
"learning_rate": 3.547920782105155e-05, + "loss": 0.0336, + "step": 14830 + }, + { + "epoch": 1.781981981981982, + "grad_norm": 0.40698111057281494, + "learning_rate": 3.5449118674036566e-05, + "loss": 0.0442, + "step": 14835 + }, + { + "epoch": 1.7825825825825827, + "grad_norm": 0.4348471760749817, + "learning_rate": 3.541903528298984e-05, + "loss": 0.0452, + "step": 14840 + }, + { + "epoch": 1.783183183183183, + "grad_norm": 0.41390636563301086, + "learning_rate": 3.538895765981166e-05, + "loss": 0.0541, + "step": 14845 + }, + { + "epoch": 1.7837837837837838, + "grad_norm": 0.36194291710853577, + "learning_rate": 3.5358885816399964e-05, + "loss": 0.0357, + "step": 14850 + }, + { + "epoch": 1.7843843843843845, + "grad_norm": 0.4427974820137024, + "learning_rate": 3.532881976465045e-05, + "loss": 0.0396, + "step": 14855 + }, + { + "epoch": 1.784984984984985, + "grad_norm": 0.3613729774951935, + "learning_rate": 3.529875951645648e-05, + "loss": 0.0413, + "step": 14860 + }, + { + "epoch": 1.7855855855855856, + "grad_norm": 0.48508042097091675, + "learning_rate": 3.526870508370915e-05, + "loss": 0.0431, + "step": 14865 + }, + { + "epoch": 1.7861861861861863, + "grad_norm": 0.34934136271476746, + "learning_rate": 3.5238656478297264e-05, + "loss": 0.0492, + "step": 14870 + }, + { + "epoch": 1.7867867867867868, + "grad_norm": 0.4539346992969513, + "learning_rate": 3.520861371210729e-05, + "loss": 0.043, + "step": 14875 + }, + { + "epoch": 1.7873873873873873, + "grad_norm": 0.532275915145874, + "learning_rate": 3.5178576797023434e-05, + "loss": 0.0405, + "step": 14880 + }, + { + "epoch": 1.787987987987988, + "grad_norm": 0.34528690576553345, + "learning_rate": 3.51485457449275e-05, + "loss": 0.0366, + "step": 14885 + }, + { + "epoch": 1.7885885885885886, + "grad_norm": 0.35474127531051636, + "learning_rate": 3.511852056769907e-05, + "loss": 0.0384, + "step": 14890 + }, + { + "epoch": 1.7891891891891891, + "grad_norm": 0.4059201180934906, + "learning_rate": 3.5088501277215355e-05, + "loss": 0.0397, + "step": 14895 + }, + { + "epoch": 1.7897897897897899, + "grad_norm": 0.430829793214798, + "learning_rate": 3.505848788535123e-05, + "loss": 0.0408, + "step": 14900 + }, + { + "epoch": 1.7903903903903904, + "grad_norm": 0.426645427942276, + "learning_rate": 3.5028480403979276e-05, + "loss": 0.039, + "step": 14905 + }, + { + "epoch": 1.790990990990991, + "grad_norm": 0.5083160400390625, + "learning_rate": 3.499847884496967e-05, + "loss": 0.0521, + "step": 14910 + }, + { + "epoch": 1.7915915915915916, + "grad_norm": 0.4045082926750183, + "learning_rate": 3.496848322019031e-05, + "loss": 0.0361, + "step": 14915 + }, + { + "epoch": 1.7921921921921922, + "grad_norm": 0.38742151856422424, + "learning_rate": 3.4938493541506705e-05, + "loss": 0.043, + "step": 14920 + }, + { + "epoch": 1.7927927927927927, + "grad_norm": 0.4921604096889496, + "learning_rate": 3.490850982078208e-05, + "loss": 0.0427, + "step": 14925 + }, + { + "epoch": 1.7933933933933934, + "grad_norm": 0.40055954456329346, + "learning_rate": 3.487853206987718e-05, + "loss": 0.0439, + "step": 14930 + }, + { + "epoch": 1.793993993993994, + "grad_norm": 0.4105166792869568, + "learning_rate": 3.4848560300650503e-05, + "loss": 0.0433, + "step": 14935 + }, + { + "epoch": 1.7945945945945945, + "grad_norm": 0.34285271167755127, + "learning_rate": 3.4818594524958145e-05, + "loss": 0.043, + "step": 14940 + }, + { + "epoch": 1.7951951951951952, + "grad_norm": 0.4157925844192505, + "learning_rate": 3.478863475465379e-05, + "loss": 0.0393, + "step": 14945 + 
}, + { + "epoch": 1.795795795795796, + "grad_norm": 0.4255988895893097, + "learning_rate": 3.4758681001588835e-05, + "loss": 0.0424, + "step": 14950 + }, + { + "epoch": 1.7963963963963963, + "grad_norm": 0.40526896715164185, + "learning_rate": 3.472873327761219e-05, + "loss": 0.0488, + "step": 14955 + }, + { + "epoch": 1.796996996996997, + "grad_norm": 0.44084569811820984, + "learning_rate": 3.469879159457044e-05, + "loss": 0.0493, + "step": 14960 + }, + { + "epoch": 1.7975975975975977, + "grad_norm": 0.5121871829032898, + "learning_rate": 3.46688559643078e-05, + "loss": 0.0443, + "step": 14965 + }, + { + "epoch": 1.798198198198198, + "grad_norm": 0.40672311186790466, + "learning_rate": 3.463892639866605e-05, + "loss": 0.0385, + "step": 14970 + }, + { + "epoch": 1.7987987987987988, + "grad_norm": 0.3255296051502228, + "learning_rate": 3.46090029094846e-05, + "loss": 0.0372, + "step": 14975 + }, + { + "epoch": 1.7993993993993995, + "grad_norm": 0.30151480436325073, + "learning_rate": 3.4579085508600406e-05, + "loss": 0.0419, + "step": 14980 + }, + { + "epoch": 1.8, + "grad_norm": 0.4027726352214813, + "learning_rate": 3.4549174207848075e-05, + "loss": 0.0423, + "step": 14985 + }, + { + "epoch": 1.8006006006006006, + "grad_norm": 0.48146966099739075, + "learning_rate": 3.4519269019059775e-05, + "loss": 0.0461, + "step": 14990 + }, + { + "epoch": 1.8012012012012013, + "grad_norm": 0.46603134274482727, + "learning_rate": 3.4489369954065275e-05, + "loss": 0.0403, + "step": 14995 + }, + { + "epoch": 1.8018018018018018, + "grad_norm": 0.3438381850719452, + "learning_rate": 3.4459477024691866e-05, + "loss": 0.0371, + "step": 15000 + }, + { + "epoch": 1.8018018018018018, + "eval_loss": 0.057668328285217285, + "eval_runtime": 35.8535, + "eval_samples_per_second": 22.313, + "eval_steps_per_second": 5.578, + "step": 15000 + }, + { + "epoch": 1.8024024024024023, + "grad_norm": 0.47141313552856445, + "learning_rate": 3.4429590242764467e-05, + "loss": 0.0374, + "step": 15005 + }, + { + "epoch": 1.803003003003003, + "grad_norm": 0.37017688155174255, + "learning_rate": 3.439970962010555e-05, + "loss": 0.033, + "step": 15010 + }, + { + "epoch": 1.8036036036036036, + "grad_norm": 0.453005850315094, + "learning_rate": 3.4369835168535155e-05, + "loss": 0.0486, + "step": 15015 + }, + { + "epoch": 1.8042042042042041, + "grad_norm": 0.4695863425731659, + "learning_rate": 3.433996689987089e-05, + "loss": 0.0418, + "step": 15020 + }, + { + "epoch": 1.8048048048048049, + "grad_norm": 0.316760390996933, + "learning_rate": 3.431010482592787e-05, + "loss": 0.0357, + "step": 15025 + }, + { + "epoch": 1.8054054054054054, + "grad_norm": 0.41289475560188293, + "learning_rate": 3.428024895851881e-05, + "loss": 0.0432, + "step": 15030 + }, + { + "epoch": 1.806006006006006, + "grad_norm": 0.32462531328201294, + "learning_rate": 3.425039930945394e-05, + "loss": 0.0389, + "step": 15035 + }, + { + "epoch": 1.8066066066066067, + "grad_norm": 0.38858461380004883, + "learning_rate": 3.422055589054105e-05, + "loss": 0.0466, + "step": 15040 + }, + { + "epoch": 1.8072072072072072, + "grad_norm": 0.3275511562824249, + "learning_rate": 3.4190718713585475e-05, + "loss": 0.0453, + "step": 15045 + }, + { + "epoch": 1.8078078078078077, + "grad_norm": 0.44248396158218384, + "learning_rate": 3.416088779039003e-05, + "loss": 0.0335, + "step": 15050 + }, + { + "epoch": 1.8084084084084084, + "grad_norm": 0.39229512214660645, + "learning_rate": 3.413106313275509e-05, + "loss": 0.0435, + "step": 15055 + }, + { + "epoch": 1.809009009009009, + 
"grad_norm": 0.33925944566726685, + "learning_rate": 3.4101244752478576e-05, + "loss": 0.0375, + "step": 15060 + }, + { + "epoch": 1.8096096096096095, + "grad_norm": 0.306045800447464, + "learning_rate": 3.4071432661355907e-05, + "loss": 0.0448, + "step": 15065 + }, + { + "epoch": 1.8102102102102102, + "grad_norm": 0.3433867394924164, + "learning_rate": 3.404162687117999e-05, + "loss": 0.033, + "step": 15070 + }, + { + "epoch": 1.810810810810811, + "grad_norm": 0.3375025689601898, + "learning_rate": 3.401182739374124e-05, + "loss": 0.0406, + "step": 15075 + }, + { + "epoch": 1.8114114114114113, + "grad_norm": 0.3570587635040283, + "learning_rate": 3.398203424082762e-05, + "loss": 0.0494, + "step": 15080 + }, + { + "epoch": 1.812012012012012, + "grad_norm": 0.3204568028450012, + "learning_rate": 3.395224742422455e-05, + "loss": 0.0355, + "step": 15085 + }, + { + "epoch": 1.8126126126126128, + "grad_norm": 0.39254069328308105, + "learning_rate": 3.3922466955714985e-05, + "loss": 0.0365, + "step": 15090 + }, + { + "epoch": 1.813213213213213, + "grad_norm": 0.29781073331832886, + "learning_rate": 3.389269284707929e-05, + "loss": 0.0421, + "step": 15095 + }, + { + "epoch": 1.8138138138138138, + "grad_norm": 0.431906521320343, + "learning_rate": 3.3862925110095403e-05, + "loss": 0.04, + "step": 15100 + }, + { + "epoch": 1.8144144144144145, + "grad_norm": 0.44331666827201843, + "learning_rate": 3.383316375653868e-05, + "loss": 0.0434, + "step": 15105 + }, + { + "epoch": 1.815015015015015, + "grad_norm": 0.3456747531890869, + "learning_rate": 3.380340879818199e-05, + "loss": 0.0389, + "step": 15110 + }, + { + "epoch": 1.8156156156156156, + "grad_norm": 0.3462885320186615, + "learning_rate": 3.377366024679568e-05, + "loss": 0.0355, + "step": 15115 + }, + { + "epoch": 1.8162162162162163, + "grad_norm": 0.3487900197505951, + "learning_rate": 3.374391811414749e-05, + "loss": 0.0431, + "step": 15120 + }, + { + "epoch": 1.8168168168168168, + "grad_norm": 0.3203073740005493, + "learning_rate": 3.3714182412002715e-05, + "loss": 0.0348, + "step": 15125 + }, + { + "epoch": 1.8174174174174174, + "grad_norm": 0.4262768626213074, + "learning_rate": 3.368445315212403e-05, + "loss": 0.0399, + "step": 15130 + }, + { + "epoch": 1.818018018018018, + "grad_norm": 0.3522956073284149, + "learning_rate": 3.365473034627161e-05, + "loss": 0.046, + "step": 15135 + }, + { + "epoch": 1.8186186186186186, + "grad_norm": 0.40157803893089294, + "learning_rate": 3.362501400620308e-05, + "loss": 0.0316, + "step": 15140 + }, + { + "epoch": 1.8192192192192191, + "grad_norm": 0.46878618001937866, + "learning_rate": 3.359530414367345e-05, + "loss": 0.0451, + "step": 15145 + }, + { + "epoch": 1.8198198198198199, + "grad_norm": 0.34287646412849426, + "learning_rate": 3.356560077043521e-05, + "loss": 0.0391, + "step": 15150 + }, + { + "epoch": 1.8204204204204204, + "grad_norm": 0.4138559103012085, + "learning_rate": 3.3535903898238305e-05, + "loss": 0.0496, + "step": 15155 + }, + { + "epoch": 1.821021021021021, + "grad_norm": 0.3784990608692169, + "learning_rate": 3.350621353883009e-05, + "loss": 0.0461, + "step": 15160 + }, + { + "epoch": 1.8216216216216217, + "grad_norm": 0.44947463274002075, + "learning_rate": 3.347652970395528e-05, + "loss": 0.0434, + "step": 15165 + }, + { + "epoch": 1.8222222222222222, + "grad_norm": 0.340086430311203, + "learning_rate": 3.344685240535612e-05, + "loss": 0.0359, + "step": 15170 + }, + { + "epoch": 1.8228228228228227, + "grad_norm": 0.3417467772960663, + "learning_rate": 3.3417181654772186e-05, + 
"loss": 0.0486, + "step": 15175 + }, + { + "epoch": 1.8234234234234235, + "grad_norm": 0.37504029273986816, + "learning_rate": 3.3387517463940496e-05, + "loss": 0.0411, + "step": 15180 + }, + { + "epoch": 1.824024024024024, + "grad_norm": 0.2572895884513855, + "learning_rate": 3.33578598445955e-05, + "loss": 0.0369, + "step": 15185 + }, + { + "epoch": 1.8246246246246245, + "grad_norm": 0.3781915307044983, + "learning_rate": 3.3328208808468966e-05, + "loss": 0.0415, + "step": 15190 + }, + { + "epoch": 1.8252252252252252, + "grad_norm": 0.3484686613082886, + "learning_rate": 3.3298564367290144e-05, + "loss": 0.0393, + "step": 15195 + }, + { + "epoch": 1.825825825825826, + "grad_norm": 0.42965856194496155, + "learning_rate": 3.3268926532785615e-05, + "loss": 0.0428, + "step": 15200 + }, + { + "epoch": 1.8264264264264263, + "grad_norm": 0.3399992287158966, + "learning_rate": 3.32392953166794e-05, + "loss": 0.0446, + "step": 15205 + }, + { + "epoch": 1.827027027027027, + "grad_norm": 0.3631638288497925, + "learning_rate": 3.320967073069289e-05, + "loss": 0.0368, + "step": 15210 + }, + { + "epoch": 1.8276276276276278, + "grad_norm": 0.4014107286930084, + "learning_rate": 3.318005278654478e-05, + "loss": 0.0438, + "step": 15215 + }, + { + "epoch": 1.8282282282282283, + "grad_norm": 0.3967078924179077, + "learning_rate": 3.3150441495951234e-05, + "loss": 0.0368, + "step": 15220 + }, + { + "epoch": 1.8288288288288288, + "grad_norm": 0.5075540542602539, + "learning_rate": 3.312083687062573e-05, + "loss": 0.0358, + "step": 15225 + }, + { + "epoch": 1.8294294294294295, + "grad_norm": 0.5178868174552917, + "learning_rate": 3.3091238922279156e-05, + "loss": 0.0439, + "step": 15230 + }, + { + "epoch": 1.83003003003003, + "grad_norm": 0.4869144558906555, + "learning_rate": 3.3061647662619676e-05, + "loss": 0.0452, + "step": 15235 + }, + { + "epoch": 1.8306306306306306, + "grad_norm": 0.32434096932411194, + "learning_rate": 3.30320631033529e-05, + "loss": 0.0407, + "step": 15240 + }, + { + "epoch": 1.8312312312312313, + "grad_norm": 0.4024626612663269, + "learning_rate": 3.3002485256181713e-05, + "loss": 0.0398, + "step": 15245 + }, + { + "epoch": 1.8318318318318318, + "grad_norm": 0.4433274567127228, + "learning_rate": 3.297291413280641e-05, + "loss": 0.0404, + "step": 15250 + }, + { + "epoch": 1.8318318318318318, + "eval_loss": 0.052654825150966644, + "eval_runtime": 35.9691, + "eval_samples_per_second": 22.241, + "eval_steps_per_second": 5.56, + "step": 15250 + }, + { + "epoch": 1.8324324324324324, + "grad_norm": 0.4474799335002899, + "learning_rate": 3.294334974492461e-05, + "loss": 0.0413, + "step": 15255 + }, + { + "epoch": 1.833033033033033, + "grad_norm": 0.35214540362358093, + "learning_rate": 3.29137921042312e-05, + "loss": 0.0427, + "step": 15260 + }, + { + "epoch": 1.8336336336336336, + "grad_norm": 0.36843839287757874, + "learning_rate": 3.288424122241849e-05, + "loss": 0.042, + "step": 15265 + }, + { + "epoch": 1.8342342342342342, + "grad_norm": 0.4027945399284363, + "learning_rate": 3.285469711117606e-05, + "loss": 0.04, + "step": 15270 + }, + { + "epoch": 1.834834834834835, + "grad_norm": 0.4439586102962494, + "learning_rate": 3.282515978219082e-05, + "loss": 0.0452, + "step": 15275 + }, + { + "epoch": 1.8354354354354354, + "grad_norm": 0.32045507431030273, + "learning_rate": 3.279562924714705e-05, + "loss": 0.0363, + "step": 15280 + }, + { + "epoch": 1.836036036036036, + "grad_norm": 0.29067376255989075, + "learning_rate": 3.276610551772624e-05, + "loss": 0.0318, + "step": 15285 + }, + { + 
"epoch": 1.8366366366366367, + "grad_norm": 0.46161335706710815, + "learning_rate": 3.273658860560728e-05, + "loss": 0.043, + "step": 15290 + }, + { + "epoch": 1.8372372372372372, + "grad_norm": 0.3770528733730316, + "learning_rate": 3.2707078522466324e-05, + "loss": 0.0394, + "step": 15295 + }, + { + "epoch": 1.8378378378378377, + "grad_norm": 0.3454394042491913, + "learning_rate": 3.2677575279976846e-05, + "loss": 0.0361, + "step": 15300 + }, + { + "epoch": 1.8384384384384385, + "grad_norm": 0.383668452501297, + "learning_rate": 3.2648078889809564e-05, + "loss": 0.0412, + "step": 15305 + }, + { + "epoch": 1.8390390390390392, + "grad_norm": 0.33942195773124695, + "learning_rate": 3.261858936363254e-05, + "loss": 0.0384, + "step": 15310 + }, + { + "epoch": 1.8396396396396395, + "grad_norm": 0.3384036421775818, + "learning_rate": 3.2589106713111095e-05, + "loss": 0.0384, + "step": 15315 + }, + { + "epoch": 1.8402402402402402, + "grad_norm": 0.4031851887702942, + "learning_rate": 3.2559630949907824e-05, + "loss": 0.0376, + "step": 15320 + }, + { + "epoch": 1.840840840840841, + "grad_norm": 0.36432915925979614, + "learning_rate": 3.253016208568266e-05, + "loss": 0.0451, + "step": 15325 + }, + { + "epoch": 1.8414414414414413, + "grad_norm": 0.4108644425868988, + "learning_rate": 3.2500700132092686e-05, + "loss": 0.0364, + "step": 15330 + }, + { + "epoch": 1.842042042042042, + "grad_norm": 0.35947027802467346, + "learning_rate": 3.247124510079236e-05, + "loss": 0.0344, + "step": 15335 + }, + { + "epoch": 1.8426426426426428, + "grad_norm": 0.3900996744632721, + "learning_rate": 3.2441797003433347e-05, + "loss": 0.0336, + "step": 15340 + }, + { + "epoch": 1.8432432432432433, + "grad_norm": 0.35430777072906494, + "learning_rate": 3.2412355851664596e-05, + "loss": 0.036, + "step": 15345 + }, + { + "epoch": 1.8438438438438438, + "grad_norm": 0.3509856164455414, + "learning_rate": 3.238292165713232e-05, + "loss": 0.0363, + "step": 15350 + }, + { + "epoch": 1.8444444444444446, + "grad_norm": 0.3149247169494629, + "learning_rate": 3.2353494431479916e-05, + "loss": 0.0344, + "step": 15355 + }, + { + "epoch": 1.845045045045045, + "grad_norm": 0.39669427275657654, + "learning_rate": 3.2324074186348095e-05, + "loss": 0.0381, + "step": 15360 + }, + { + "epoch": 1.8456456456456456, + "grad_norm": 0.36199426651000977, + "learning_rate": 3.229466093337474e-05, + "loss": 0.0405, + "step": 15365 + }, + { + "epoch": 1.8462462462462463, + "grad_norm": 0.4065885543823242, + "learning_rate": 3.226525468419507e-05, + "loss": 0.0468, + "step": 15370 + }, + { + "epoch": 1.8468468468468469, + "grad_norm": 0.3283202052116394, + "learning_rate": 3.22358554504414e-05, + "loss": 0.034, + "step": 15375 + }, + { + "epoch": 1.8474474474474474, + "grad_norm": 0.4245908558368683, + "learning_rate": 3.220646324374337e-05, + "loss": 0.038, + "step": 15380 + }, + { + "epoch": 1.8480480480480481, + "grad_norm": 0.42148643732070923, + "learning_rate": 3.2177078075727795e-05, + "loss": 0.0349, + "step": 15385 + }, + { + "epoch": 1.8486486486486486, + "grad_norm": 0.543740451335907, + "learning_rate": 3.214769995801875e-05, + "loss": 0.0465, + "step": 15390 + }, + { + "epoch": 1.8492492492492492, + "grad_norm": 0.3897549510002136, + "learning_rate": 3.211832890223748e-05, + "loss": 0.0503, + "step": 15395 + }, + { + "epoch": 1.84984984984985, + "grad_norm": 0.47316253185272217, + "learning_rate": 3.208896492000243e-05, + "loss": 0.0401, + "step": 15400 + }, + { + "epoch": 1.8504504504504504, + "grad_norm": 0.36027249693870544, + 
"learning_rate": 3.205960802292928e-05, + "loss": 0.0318, + "step": 15405 + }, + { + "epoch": 1.851051051051051, + "grad_norm": 0.3004674017429352, + "learning_rate": 3.203025822263087e-05, + "loss": 0.0371, + "step": 15410 + }, + { + "epoch": 1.8516516516516517, + "grad_norm": 0.31336236000061035, + "learning_rate": 3.200091553071727e-05, + "loss": 0.0344, + "step": 15415 + }, + { + "epoch": 1.8522522522522522, + "grad_norm": 0.3068157732486725, + "learning_rate": 3.197157995879575e-05, + "loss": 0.0353, + "step": 15420 + }, + { + "epoch": 1.8528528528528527, + "grad_norm": 0.5400185585021973, + "learning_rate": 3.1942251518470676e-05, + "loss": 0.0409, + "step": 15425 + }, + { + "epoch": 1.8534534534534535, + "grad_norm": 0.4230722188949585, + "learning_rate": 3.1912930221343695e-05, + "loss": 0.0383, + "step": 15430 + }, + { + "epoch": 1.8540540540540542, + "grad_norm": 0.3303622901439667, + "learning_rate": 3.1883616079013576e-05, + "loss": 0.0315, + "step": 15435 + }, + { + "epoch": 1.8546546546546545, + "grad_norm": 0.35778260231018066, + "learning_rate": 3.185430910307628e-05, + "loss": 0.0363, + "step": 15440 + }, + { + "epoch": 1.8552552552552553, + "grad_norm": 0.31309306621551514, + "learning_rate": 3.1825009305124895e-05, + "loss": 0.0316, + "step": 15445 + }, + { + "epoch": 1.855855855855856, + "grad_norm": 0.40511417388916016, + "learning_rate": 3.17957166967497e-05, + "loss": 0.043, + "step": 15450 + }, + { + "epoch": 1.8564564564564563, + "grad_norm": 0.3628949820995331, + "learning_rate": 3.176643128953815e-05, + "loss": 0.0336, + "step": 15455 + }, + { + "epoch": 1.857057057057057, + "grad_norm": 0.3685472011566162, + "learning_rate": 3.1737153095074796e-05, + "loss": 0.0452, + "step": 15460 + }, + { + "epoch": 1.8576576576576578, + "grad_norm": 0.4899274706840515, + "learning_rate": 3.1707882124941404e-05, + "loss": 0.0393, + "step": 15465 + }, + { + "epoch": 1.8582582582582583, + "grad_norm": 0.3528103530406952, + "learning_rate": 3.1678618390716804e-05, + "loss": 0.0329, + "step": 15470 + }, + { + "epoch": 1.8588588588588588, + "grad_norm": 0.38148975372314453, + "learning_rate": 3.1649361903977016e-05, + "loss": 0.0408, + "step": 15475 + }, + { + "epoch": 1.8594594594594596, + "grad_norm": 0.338446706533432, + "learning_rate": 3.162011267629519e-05, + "loss": 0.032, + "step": 15480 + }, + { + "epoch": 1.86006006006006, + "grad_norm": 0.3764857351779938, + "learning_rate": 3.15908707192416e-05, + "loss": 0.038, + "step": 15485 + }, + { + "epoch": 1.8606606606606606, + "grad_norm": 0.43757164478302, + "learning_rate": 3.1561636044383643e-05, + "loss": 0.0407, + "step": 15490 + }, + { + "epoch": 1.8612612612612613, + "grad_norm": 0.4748579263687134, + "learning_rate": 3.153240866328582e-05, + "loss": 0.04, + "step": 15495 + }, + { + "epoch": 1.8618618618618619, + "grad_norm": 0.35937410593032837, + "learning_rate": 3.150318858750976e-05, + "loss": 0.0403, + "step": 15500 + }, + { + "epoch": 1.8618618618618619, + "eval_loss": 0.05141396448016167, + "eval_runtime": 36.0092, + "eval_samples_per_second": 22.217, + "eval_steps_per_second": 5.554, + "step": 15500 + }, + { + "epoch": 1.8624624624624624, + "grad_norm": 0.34568673372268677, + "learning_rate": 3.14739758286142e-05, + "loss": 0.0375, + "step": 15505 + }, + { + "epoch": 1.8630630630630631, + "grad_norm": 0.49455657601356506, + "learning_rate": 3.1444770398154985e-05, + "loss": 0.038, + "step": 15510 + }, + { + "epoch": 1.8636636636636636, + "grad_norm": 0.4506688117980957, + "learning_rate": 3.141557230768508e-05, + 
"loss": 0.0437, + "step": 15515 + }, + { + "epoch": 1.8642642642642642, + "grad_norm": 0.4464740753173828, + "learning_rate": 3.13863815687545e-05, + "loss": 0.0428, + "step": 15520 + }, + { + "epoch": 1.864864864864865, + "grad_norm": 0.297534316778183, + "learning_rate": 3.135719819291038e-05, + "loss": 0.0382, + "step": 15525 + }, + { + "epoch": 1.8654654654654654, + "grad_norm": 0.33353927731513977, + "learning_rate": 3.132802219169695e-05, + "loss": 0.038, + "step": 15530 + }, + { + "epoch": 1.866066066066066, + "grad_norm": 0.24684344232082367, + "learning_rate": 3.129885357665553e-05, + "loss": 0.0344, + "step": 15535 + }, + { + "epoch": 1.8666666666666667, + "grad_norm": 0.32198721170425415, + "learning_rate": 3.126969235932446e-05, + "loss": 0.035, + "step": 15540 + }, + { + "epoch": 1.8672672672672672, + "grad_norm": 0.4634726047515869, + "learning_rate": 3.1240538551239234e-05, + "loss": 0.0383, + "step": 15545 + }, + { + "epoch": 1.8678678678678677, + "grad_norm": 0.4412508010864258, + "learning_rate": 3.121139216393235e-05, + "loss": 0.0352, + "step": 15550 + }, + { + "epoch": 1.8684684684684685, + "grad_norm": 0.4202616512775421, + "learning_rate": 3.118225320893341e-05, + "loss": 0.0382, + "step": 15555 + }, + { + "epoch": 1.8690690690690692, + "grad_norm": 0.3001733422279358, + "learning_rate": 3.115312169776908e-05, + "loss": 0.0397, + "step": 15560 + }, + { + "epoch": 1.8696696696696695, + "grad_norm": 0.3958297669887543, + "learning_rate": 3.112399764196303e-05, + "loss": 0.0354, + "step": 15565 + }, + { + "epoch": 1.8702702702702703, + "grad_norm": 0.3859178423881531, + "learning_rate": 3.109488105303603e-05, + "loss": 0.0429, + "step": 15570 + }, + { + "epoch": 1.870870870870871, + "grad_norm": 0.4340052604675293, + "learning_rate": 3.1065771942505884e-05, + "loss": 0.0362, + "step": 15575 + }, + { + "epoch": 1.8714714714714715, + "grad_norm": 0.2764926254749298, + "learning_rate": 3.1036670321887426e-05, + "loss": 0.0379, + "step": 15580 + }, + { + "epoch": 1.872072072072072, + "grad_norm": 0.3525054454803467, + "learning_rate": 3.100757620269257e-05, + "loss": 0.0367, + "step": 15585 + }, + { + "epoch": 1.8726726726726728, + "grad_norm": 0.3726922869682312, + "learning_rate": 3.0978489596430184e-05, + "loss": 0.0366, + "step": 15590 + }, + { + "epoch": 1.8732732732732733, + "grad_norm": 0.3272388279438019, + "learning_rate": 3.0949410514606234e-05, + "loss": 0.0312, + "step": 15595 + }, + { + "epoch": 1.8738738738738738, + "grad_norm": 0.34497085213661194, + "learning_rate": 3.092033896872367e-05, + "loss": 0.0319, + "step": 15600 + }, + { + "epoch": 1.8744744744744746, + "grad_norm": 0.36654847860336304, + "learning_rate": 3.0891274970282505e-05, + "loss": 0.0366, + "step": 15605 + }, + { + "epoch": 1.875075075075075, + "grad_norm": 0.4943583011627197, + "learning_rate": 3.08622185307797e-05, + "loss": 0.0369, + "step": 15610 + }, + { + "epoch": 1.8756756756756756, + "grad_norm": 0.3475680649280548, + "learning_rate": 3.083316966170927e-05, + "loss": 0.0336, + "step": 15615 + }, + { + "epoch": 1.8762762762762764, + "grad_norm": 0.3712926506996155, + "learning_rate": 3.080412837456225e-05, + "loss": 0.0359, + "step": 15620 + }, + { + "epoch": 1.8768768768768769, + "grad_norm": 0.35502180457115173, + "learning_rate": 3.0775094680826624e-05, + "loss": 0.0302, + "step": 15625 + }, + { + "epoch": 1.8774774774774774, + "grad_norm": 0.3283778429031372, + "learning_rate": 3.074606859198746e-05, + "loss": 0.0352, + "step": 15630 + }, + { + "epoch": 1.8780780780780781, + 
"grad_norm": 0.3945302665233612, + "learning_rate": 3.071705011952668e-05, + "loss": 0.0254, + "step": 15635 + }, + { + "epoch": 1.8786786786786787, + "grad_norm": 0.3465094268321991, + "learning_rate": 3.068803927492333e-05, + "loss": 0.0314, + "step": 15640 + }, + { + "epoch": 1.8792792792792792, + "grad_norm": 0.3634204864501953, + "learning_rate": 3.065903606965336e-05, + "loss": 0.0358, + "step": 15645 + }, + { + "epoch": 1.87987987987988, + "grad_norm": 0.3066357970237732, + "learning_rate": 3.063004051518972e-05, + "loss": 0.0381, + "step": 15650 + }, + { + "epoch": 1.8804804804804804, + "grad_norm": 0.4703167974948883, + "learning_rate": 3.060105262300236e-05, + "loss": 0.0388, + "step": 15655 + }, + { + "epoch": 1.881081081081081, + "grad_norm": 0.4352303147315979, + "learning_rate": 3.057207240455812e-05, + "loss": 0.0349, + "step": 15660 + }, + { + "epoch": 1.8816816816816817, + "grad_norm": 0.3594943881034851, + "learning_rate": 3.054309987132089e-05, + "loss": 0.0411, + "step": 15665 + }, + { + "epoch": 1.8822822822822824, + "grad_norm": 0.34663334488868713, + "learning_rate": 3.051413503475149e-05, + "loss": 0.0321, + "step": 15670 + }, + { + "epoch": 1.8828828828828827, + "grad_norm": 0.508261501789093, + "learning_rate": 3.0485177906307694e-05, + "loss": 0.0384, + "step": 15675 + }, + { + "epoch": 1.8834834834834835, + "grad_norm": 0.46104949712753296, + "learning_rate": 3.0456228497444206e-05, + "loss": 0.0375, + "step": 15680 + }, + { + "epoch": 1.8840840840840842, + "grad_norm": 0.33268865942955017, + "learning_rate": 3.042728681961271e-05, + "loss": 0.0306, + "step": 15685 + }, + { + "epoch": 1.8846846846846845, + "grad_norm": 0.3267901837825775, + "learning_rate": 3.0398352884261792e-05, + "loss": 0.0319, + "step": 15690 + }, + { + "epoch": 1.8852852852852853, + "grad_norm": 0.3606499135494232, + "learning_rate": 3.0369426702837035e-05, + "loss": 0.035, + "step": 15695 + }, + { + "epoch": 1.885885885885886, + "grad_norm": 0.39046889543533325, + "learning_rate": 3.034050828678092e-05, + "loss": 0.031, + "step": 15700 + }, + { + "epoch": 1.8864864864864865, + "grad_norm": 0.30550241470336914, + "learning_rate": 3.031159764753282e-05, + "loss": 0.036, + "step": 15705 + }, + { + "epoch": 1.887087087087087, + "grad_norm": 0.4246421158313751, + "learning_rate": 3.0282694796529086e-05, + "loss": 0.0379, + "step": 15710 + }, + { + "epoch": 1.8876876876876878, + "grad_norm": 0.49292629957199097, + "learning_rate": 3.0253799745202977e-05, + "loss": 0.04, + "step": 15715 + }, + { + "epoch": 1.8882882882882883, + "grad_norm": 0.24794501066207886, + "learning_rate": 3.0224912504984653e-05, + "loss": 0.0371, + "step": 15720 + }, + { + "epoch": 1.8888888888888888, + "grad_norm": 0.3245203495025635, + "learning_rate": 3.0196033087301213e-05, + "loss": 0.0414, + "step": 15725 + }, + { + "epoch": 1.8894894894894896, + "grad_norm": 0.5247988104820251, + "learning_rate": 3.0167161503576603e-05, + "loss": 0.0376, + "step": 15730 + }, + { + "epoch": 1.89009009009009, + "grad_norm": 0.34899789094924927, + "learning_rate": 3.013829776523173e-05, + "loss": 0.0324, + "step": 15735 + }, + { + "epoch": 1.8906906906906906, + "grad_norm": 0.34728893637657166, + "learning_rate": 3.0109441883684363e-05, + "loss": 0.031, + "step": 15740 + }, + { + "epoch": 1.8912912912912914, + "grad_norm": 0.33809661865234375, + "learning_rate": 3.0080593870349195e-05, + "loss": 0.0323, + "step": 15745 + }, + { + "epoch": 1.8918918918918919, + "grad_norm": 0.4066483676433563, + "learning_rate": 3.005175373663775e-05, 
+ "loss": 0.0358, + "step": 15750 + }, + { + "epoch": 1.8918918918918919, + "eval_loss": 0.05024780333042145, + "eval_runtime": 35.6766, + "eval_samples_per_second": 22.424, + "eval_steps_per_second": 5.606, + "step": 15750 + }, + { + "epoch": 1.8924924924924924, + "grad_norm": 0.4036324918270111, + "learning_rate": 3.002292149395849e-05, + "loss": 0.0339, + "step": 15755 + }, + { + "epoch": 1.8930930930930931, + "grad_norm": 0.3037404716014862, + "learning_rate": 2.9994097153716737e-05, + "loss": 0.0341, + "step": 15760 + }, + { + "epoch": 1.8936936936936937, + "grad_norm": 0.3263702392578125, + "learning_rate": 2.996528072731468e-05, + "loss": 0.0316, + "step": 15765 + }, + { + "epoch": 1.8942942942942942, + "grad_norm": 0.3972446620464325, + "learning_rate": 2.9936472226151414e-05, + "loss": 0.0376, + "step": 15770 + }, + { + "epoch": 1.894894894894895, + "grad_norm": 0.3838525116443634, + "learning_rate": 2.990767166162282e-05, + "loss": 0.0332, + "step": 15775 + }, + { + "epoch": 1.8954954954954955, + "grad_norm": 0.3224433660507202, + "learning_rate": 2.987887904512172e-05, + "loss": 0.0316, + "step": 15780 + }, + { + "epoch": 1.896096096096096, + "grad_norm": 0.29355913400650024, + "learning_rate": 2.9850094388037747e-05, + "loss": 0.0349, + "step": 15785 + }, + { + "epoch": 1.8966966966966967, + "grad_norm": 0.4153788089752197, + "learning_rate": 2.98213177017574e-05, + "loss": 0.0314, + "step": 15790 + }, + { + "epoch": 1.8972972972972975, + "grad_norm": 0.44168180227279663, + "learning_rate": 2.979254899766405e-05, + "loss": 0.0356, + "step": 15795 + }, + { + "epoch": 1.8978978978978978, + "grad_norm": 0.3369816243648529, + "learning_rate": 2.9763788287137835e-05, + "loss": 0.0406, + "step": 15800 + }, + { + "epoch": 1.8984984984984985, + "grad_norm": 0.2655574083328247, + "learning_rate": 2.9735035581555805e-05, + "loss": 0.0335, + "step": 15805 + }, + { + "epoch": 1.8990990990990992, + "grad_norm": 0.44046923518180847, + "learning_rate": 2.970629089229182e-05, + "loss": 0.0379, + "step": 15810 + }, + { + "epoch": 1.8996996996996995, + "grad_norm": 0.38456791639328003, + "learning_rate": 2.9677554230716585e-05, + "loss": 0.0349, + "step": 15815 + }, + { + "epoch": 1.9003003003003003, + "grad_norm": 0.4505840241909027, + "learning_rate": 2.9648825608197572e-05, + "loss": 0.0323, + "step": 15820 + }, + { + "epoch": 1.900900900900901, + "grad_norm": 0.383492112159729, + "learning_rate": 2.9620105036099133e-05, + "loss": 0.0322, + "step": 15825 + }, + { + "epoch": 1.9015015015015015, + "grad_norm": 0.37796080112457275, + "learning_rate": 2.9591392525782425e-05, + "loss": 0.0347, + "step": 15830 + }, + { + "epoch": 1.902102102102102, + "grad_norm": 0.3788483440876007, + "learning_rate": 2.9562688088605384e-05, + "loss": 0.0279, + "step": 15835 + }, + { + "epoch": 1.9027027027027028, + "grad_norm": 0.36863160133361816, + "learning_rate": 2.9533991735922805e-05, + "loss": 0.0316, + "step": 15840 + }, + { + "epoch": 1.9033033033033033, + "grad_norm": 0.27120909094810486, + "learning_rate": 2.950530347908622e-05, + "loss": 0.0292, + "step": 15845 + }, + { + "epoch": 1.9039039039039038, + "grad_norm": 0.34686270356178284, + "learning_rate": 2.947662332944401e-05, + "loss": 0.0357, + "step": 15850 + }, + { + "epoch": 1.9045045045045046, + "grad_norm": 0.3707510232925415, + "learning_rate": 2.944795129834132e-05, + "loss": 0.033, + "step": 15855 + }, + { + "epoch": 1.9051051051051051, + "grad_norm": 0.2813546061515808, + "learning_rate": 2.941928739712011e-05, + "loss": 0.0283, + "step": 
15860 + }, + { + "epoch": 1.9057057057057056, + "grad_norm": 0.30208873748779297, + "learning_rate": 2.9390631637119126e-05, + "loss": 0.0315, + "step": 15865 + }, + { + "epoch": 1.9063063063063064, + "grad_norm": 0.4704820513725281, + "learning_rate": 2.9361984029673838e-05, + "loss": 0.0319, + "step": 15870 + }, + { + "epoch": 1.906906906906907, + "grad_norm": 0.3833897113800049, + "learning_rate": 2.9333344586116563e-05, + "loss": 0.0362, + "step": 15875 + }, + { + "epoch": 1.9075075075075074, + "grad_norm": 0.3160555362701416, + "learning_rate": 2.9304713317776323e-05, + "loss": 0.0347, + "step": 15880 + }, + { + "epoch": 1.9081081081081082, + "grad_norm": 0.3219717741012573, + "learning_rate": 2.9276090235978976e-05, + "loss": 0.034, + "step": 15885 + }, + { + "epoch": 1.9087087087087087, + "grad_norm": 0.2876552939414978, + "learning_rate": 2.9247475352047065e-05, + "loss": 0.0327, + "step": 15890 + }, + { + "epoch": 1.9093093093093092, + "grad_norm": 0.33561497926712036, + "learning_rate": 2.921886867729995e-05, + "loss": 0.0345, + "step": 15895 + }, + { + "epoch": 1.90990990990991, + "grad_norm": 0.3135334849357605, + "learning_rate": 2.9190270223053728e-05, + "loss": 0.0376, + "step": 15900 + }, + { + "epoch": 1.9105105105105105, + "grad_norm": 0.3284721374511719, + "learning_rate": 2.916168000062123e-05, + "loss": 0.0287, + "step": 15905 + }, + { + "epoch": 1.911111111111111, + "grad_norm": 0.3291323184967041, + "learning_rate": 2.9133098021312056e-05, + "loss": 0.0341, + "step": 15910 + }, + { + "epoch": 1.9117117117117117, + "grad_norm": 0.3968559503555298, + "learning_rate": 2.91045242964325e-05, + "loss": 0.0354, + "step": 15915 + }, + { + "epoch": 1.9123123123123125, + "grad_norm": 0.3673100769519806, + "learning_rate": 2.9075958837285644e-05, + "loss": 0.0289, + "step": 15920 + }, + { + "epoch": 1.9129129129129128, + "grad_norm": 0.32350149750709534, + "learning_rate": 2.904740165517126e-05, + "loss": 0.0356, + "step": 15925 + }, + { + "epoch": 1.9135135135135135, + "grad_norm": 0.42222660779953003, + "learning_rate": 2.9018852761385874e-05, + "loss": 0.0309, + "step": 15930 + }, + { + "epoch": 1.9141141141141143, + "grad_norm": 0.4391559362411499, + "learning_rate": 2.8990312167222737e-05, + "loss": 0.029, + "step": 15935 + }, + { + "epoch": 1.9147147147147145, + "grad_norm": 0.28281211853027344, + "learning_rate": 2.8961779883971763e-05, + "loss": 0.0313, + "step": 15940 + }, + { + "epoch": 1.9153153153153153, + "grad_norm": 0.3404557704925537, + "learning_rate": 2.8933255922919655e-05, + "loss": 0.0314, + "step": 15945 + }, + { + "epoch": 1.915915915915916, + "grad_norm": 0.326612263917923, + "learning_rate": 2.8904740295349747e-05, + "loss": 0.0291, + "step": 15950 + }, + { + "epoch": 1.9165165165165166, + "grad_norm": 0.3266064524650574, + "learning_rate": 2.8876233012542132e-05, + "loss": 0.0326, + "step": 15955 + }, + { + "epoch": 1.917117117117117, + "grad_norm": 0.43275049328804016, + "learning_rate": 2.8847734085773614e-05, + "loss": 0.0359, + "step": 15960 + }, + { + "epoch": 1.9177177177177178, + "grad_norm": 0.37904635071754456, + "learning_rate": 2.8819243526317608e-05, + "loss": 0.0292, + "step": 15965 + }, + { + "epoch": 1.9183183183183183, + "grad_norm": 0.4266570210456848, + "learning_rate": 2.8790761345444307e-05, + "loss": 0.0294, + "step": 15970 + }, + { + "epoch": 1.9189189189189189, + "grad_norm": 0.4261925518512726, + "learning_rate": 2.8762287554420552e-05, + "loss": 0.0312, + "step": 15975 + }, + { + "epoch": 1.9195195195195196, + "grad_norm": 
0.41322338581085205, + "learning_rate": 2.873382216450988e-05, + "loss": 0.0389, + "step": 15980 + }, + { + "epoch": 1.9201201201201201, + "grad_norm": 0.31788089871406555, + "learning_rate": 2.8705365186972473e-05, + "loss": 0.0299, + "step": 15985 + }, + { + "epoch": 1.9207207207207206, + "grad_norm": 0.33438605070114136, + "learning_rate": 2.867691663306521e-05, + "loss": 0.0336, + "step": 15990 + }, + { + "epoch": 1.9213213213213214, + "grad_norm": 0.3594795763492584, + "learning_rate": 2.8648476514041646e-05, + "loss": 0.0354, + "step": 15995 + }, + { + "epoch": 1.921921921921922, + "grad_norm": 0.2548877000808716, + "learning_rate": 2.862004484115198e-05, + "loss": 0.0284, + "step": 16000 + }, + { + "epoch": 1.921921921921922, + "eval_loss": 0.04871018975973129, + "eval_runtime": 36.0509, + "eval_samples_per_second": 22.191, + "eval_steps_per_second": 5.548, + "step": 16000 + }, + { + "epoch": 1.9225225225225224, + "grad_norm": 0.5414465069770813, + "learning_rate": 2.85916216256431e-05, + "loss": 0.0398, + "step": 16005 + }, + { + "epoch": 1.9231231231231232, + "grad_norm": 0.2570936679840088, + "learning_rate": 2.8563206878758486e-05, + "loss": 0.0322, + "step": 16010 + }, + { + "epoch": 1.9237237237237237, + "grad_norm": 0.31128424406051636, + "learning_rate": 2.853480061173833e-05, + "loss": 0.0323, + "step": 16015 + }, + { + "epoch": 1.9243243243243242, + "grad_norm": 0.36437293887138367, + "learning_rate": 2.8506402835819445e-05, + "loss": 0.0285, + "step": 16020 + }, + { + "epoch": 1.924924924924925, + "grad_norm": 0.44023168087005615, + "learning_rate": 2.847801356223529e-05, + "loss": 0.0314, + "step": 16025 + }, + { + "epoch": 1.9255255255255255, + "grad_norm": 0.3434520661830902, + "learning_rate": 2.8449632802215974e-05, + "loss": 0.0329, + "step": 16030 + }, + { + "epoch": 1.926126126126126, + "grad_norm": 0.5105292201042175, + "learning_rate": 2.8421260566988194e-05, + "loss": 0.0315, + "step": 16035 + }, + { + "epoch": 1.9267267267267267, + "grad_norm": 0.38238635659217834, + "learning_rate": 2.839289686777533e-05, + "loss": 0.0335, + "step": 16040 + }, + { + "epoch": 1.9273273273273275, + "grad_norm": 0.36520659923553467, + "learning_rate": 2.8364541715797333e-05, + "loss": 0.0299, + "step": 16045 + }, + { + "epoch": 1.9279279279279278, + "grad_norm": 0.3082834482192993, + "learning_rate": 2.833619512227082e-05, + "loss": 0.0312, + "step": 16050 + }, + { + "epoch": 1.9285285285285285, + "grad_norm": 0.3804808557033539, + "learning_rate": 2.8307857098408975e-05, + "loss": 0.0292, + "step": 16055 + }, + { + "epoch": 1.9291291291291293, + "grad_norm": 0.33361175656318665, + "learning_rate": 2.827952765542164e-05, + "loss": 0.032, + "step": 16060 + }, + { + "epoch": 1.9297297297297298, + "grad_norm": 0.5027971863746643, + "learning_rate": 2.8251206804515235e-05, + "loss": 0.0339, + "step": 16065 + }, + { + "epoch": 1.9303303303303303, + "grad_norm": 0.3659258186817169, + "learning_rate": 2.8222894556892786e-05, + "loss": 0.03, + "step": 16070 + }, + { + "epoch": 1.930930930930931, + "grad_norm": 0.3499879240989685, + "learning_rate": 2.8194590923753944e-05, + "loss": 0.0316, + "step": 16075 + }, + { + "epoch": 1.9315315315315316, + "grad_norm": 0.38677674531936646, + "learning_rate": 2.8166295916294884e-05, + "loss": 0.0334, + "step": 16080 + }, + { + "epoch": 1.932132132132132, + "grad_norm": 0.3069120943546295, + "learning_rate": 2.8138009545708422e-05, + "loss": 0.0278, + "step": 16085 + }, + { + "epoch": 1.9327327327327328, + "grad_norm": 0.5202799439430237, + 
"learning_rate": 2.810973182318395e-05, + "loss": 0.0325, + "step": 16090 + }, + { + "epoch": 1.9333333333333333, + "grad_norm": 0.345246285200119, + "learning_rate": 2.808146275990744e-05, + "loss": 0.0284, + "step": 16095 + }, + { + "epoch": 1.9339339339339339, + "grad_norm": 0.3261059522628784, + "learning_rate": 2.805320236706145e-05, + "loss": 0.0322, + "step": 16100 + }, + { + "epoch": 1.9345345345345346, + "grad_norm": 0.3136518895626068, + "learning_rate": 2.8024950655825044e-05, + "loss": 0.0314, + "step": 16105 + }, + { + "epoch": 1.9351351351351351, + "grad_norm": 0.4654875695705414, + "learning_rate": 2.799670763737393e-05, + "loss": 0.0388, + "step": 16110 + }, + { + "epoch": 1.9357357357357357, + "grad_norm": 0.3657342791557312, + "learning_rate": 2.796847332288034e-05, + "loss": 0.0321, + "step": 16115 + }, + { + "epoch": 1.9363363363363364, + "grad_norm": 0.347788006067276, + "learning_rate": 2.7940247723513096e-05, + "loss": 0.0312, + "step": 16120 + }, + { + "epoch": 1.936936936936937, + "grad_norm": 0.35710352659225464, + "learning_rate": 2.79120308504375e-05, + "loss": 0.0338, + "step": 16125 + }, + { + "epoch": 1.9375375375375374, + "grad_norm": 0.2987470030784607, + "learning_rate": 2.7883822714815494e-05, + "loss": 0.0267, + "step": 16130 + }, + { + "epoch": 1.9381381381381382, + "grad_norm": 0.42691853642463684, + "learning_rate": 2.785562332780547e-05, + "loss": 0.0326, + "step": 16135 + }, + { + "epoch": 1.9387387387387387, + "grad_norm": 0.4340433180332184, + "learning_rate": 2.7827432700562433e-05, + "loss": 0.033, + "step": 16140 + }, + { + "epoch": 1.9393393393393392, + "grad_norm": 0.4037269949913025, + "learning_rate": 2.779925084423791e-05, + "loss": 0.0281, + "step": 16145 + }, + { + "epoch": 1.93993993993994, + "grad_norm": 0.33447209000587463, + "learning_rate": 2.7771077769979925e-05, + "loss": 0.0344, + "step": 16150 + }, + { + "epoch": 1.9405405405405407, + "grad_norm": 0.43541401624679565, + "learning_rate": 2.7742913488933042e-05, + "loss": 0.0333, + "step": 16155 + }, + { + "epoch": 1.941141141141141, + "grad_norm": 0.2964218258857727, + "learning_rate": 2.771475801223837e-05, + "loss": 0.0258, + "step": 16160 + }, + { + "epoch": 1.9417417417417417, + "grad_norm": 0.3433111608028412, + "learning_rate": 2.768661135103351e-05, + "loss": 0.0274, + "step": 16165 + }, + { + "epoch": 1.9423423423423425, + "grad_norm": 0.37273839116096497, + "learning_rate": 2.765847351645261e-05, + "loss": 0.0283, + "step": 16170 + }, + { + "epoch": 1.9429429429429428, + "grad_norm": 0.3430827260017395, + "learning_rate": 2.7630344519626255e-05, + "loss": 0.0296, + "step": 16175 + }, + { + "epoch": 1.9435435435435435, + "grad_norm": 0.5047633051872253, + "learning_rate": 2.7602224371681605e-05, + "loss": 0.0299, + "step": 16180 + }, + { + "epoch": 1.9441441441441443, + "grad_norm": 0.3661489486694336, + "learning_rate": 2.757411308374229e-05, + "loss": 0.0277, + "step": 16185 + }, + { + "epoch": 1.9447447447447448, + "grad_norm": 0.32280826568603516, + "learning_rate": 2.7546010666928468e-05, + "loss": 0.0357, + "step": 16190 + }, + { + "epoch": 1.9453453453453453, + "grad_norm": 0.4482559859752655, + "learning_rate": 2.7517917132356707e-05, + "loss": 0.0333, + "step": 16195 + }, + { + "epoch": 1.945945945945946, + "grad_norm": 0.3581385910511017, + "learning_rate": 2.7489832491140138e-05, + "loss": 0.0309, + "step": 16200 + }, + { + "epoch": 1.9465465465465466, + "grad_norm": 0.3367556631565094, + "learning_rate": 2.746175675438835e-05, + "loss": 0.0302, + "step": 16205 
+ }, + { + "epoch": 1.947147147147147, + "grad_norm": 0.4131978154182434, + "learning_rate": 2.7433689933207407e-05, + "loss": 0.0393, + "step": 16210 + }, + { + "epoch": 1.9477477477477478, + "grad_norm": 0.4304249882698059, + "learning_rate": 2.740563203869988e-05, + "loss": 0.0301, + "step": 16215 + }, + { + "epoch": 1.9483483483483484, + "grad_norm": 0.4164542257785797, + "learning_rate": 2.737758308196472e-05, + "loss": 0.0366, + "step": 16220 + }, + { + "epoch": 1.9489489489489489, + "grad_norm": 0.6016157865524292, + "learning_rate": 2.734954307409745e-05, + "loss": 0.0341, + "step": 16225 + }, + { + "epoch": 1.9495495495495496, + "grad_norm": 0.25138306617736816, + "learning_rate": 2.7321512026189956e-05, + "loss": 0.027, + "step": 16230 + }, + { + "epoch": 1.9501501501501501, + "grad_norm": 0.28494128584861755, + "learning_rate": 2.7293489949330653e-05, + "loss": 0.0239, + "step": 16235 + }, + { + "epoch": 1.9507507507507507, + "grad_norm": 0.3030911982059479, + "learning_rate": 2.7265476854604398e-05, + "loss": 0.0294, + "step": 16240 + }, + { + "epoch": 1.9513513513513514, + "grad_norm": 0.3640327751636505, + "learning_rate": 2.723747275309244e-05, + "loss": 0.0315, + "step": 16245 + }, + { + "epoch": 1.951951951951952, + "grad_norm": 0.3330000638961792, + "learning_rate": 2.7209477655872527e-05, + "loss": 0.0389, + "step": 16250 + }, + { + "epoch": 1.951951951951952, + "eval_loss": 0.04595092684030533, + "eval_runtime": 36.109, + "eval_samples_per_second": 22.155, + "eval_steps_per_second": 5.539, + "step": 16250 + }, + { + "epoch": 1.9525525525525524, + "grad_norm": 0.3315175175666809, + "learning_rate": 2.7181491574018825e-05, + "loss": 0.041, + "step": 16255 + }, + { + "epoch": 1.9531531531531532, + "grad_norm": 0.3240010440349579, + "learning_rate": 2.715351451860195e-05, + "loss": 0.0305, + "step": 16260 + }, + { + "epoch": 1.9537537537537537, + "grad_norm": 0.32649537920951843, + "learning_rate": 2.71255465006889e-05, + "loss": 0.0316, + "step": 16265 + }, + { + "epoch": 1.9543543543543542, + "grad_norm": 0.34833481907844543, + "learning_rate": 2.7097587531343145e-05, + "loss": 0.0373, + "step": 16270 + }, + { + "epoch": 1.954954954954955, + "grad_norm": 0.28214502334594727, + "learning_rate": 2.7069637621624565e-05, + "loss": 0.0258, + "step": 16275 + }, + { + "epoch": 1.9555555555555557, + "grad_norm": 0.3369249403476715, + "learning_rate": 2.7041696782589442e-05, + "loss": 0.0345, + "step": 16280 + }, + { + "epoch": 1.956156156156156, + "grad_norm": 0.511701226234436, + "learning_rate": 2.7013765025290516e-05, + "loss": 0.0252, + "step": 16285 + }, + { + "epoch": 1.9567567567567568, + "grad_norm": 0.31603947281837463, + "learning_rate": 2.698584236077685e-05, + "loss": 0.0311, + "step": 16290 + }, + { + "epoch": 1.9573573573573575, + "grad_norm": 0.338832825422287, + "learning_rate": 2.6957928800093978e-05, + "loss": 0.0313, + "step": 16295 + }, + { + "epoch": 1.9579579579579578, + "grad_norm": 0.31404685974121094, + "learning_rate": 2.6930024354283816e-05, + "loss": 0.0258, + "step": 16300 + }, + { + "epoch": 1.9585585585585585, + "grad_norm": 0.4568083584308624, + "learning_rate": 2.690212903438467e-05, + "loss": 0.033, + "step": 16305 + }, + { + "epoch": 1.9591591591591593, + "grad_norm": 0.40261292457580566, + "learning_rate": 2.6874242851431253e-05, + "loss": 0.0338, + "step": 16310 + }, + { + "epoch": 1.9597597597597598, + "grad_norm": 0.30654647946357727, + "learning_rate": 2.6846365816454623e-05, + "loss": 0.0297, + "step": 16315 + }, + { + "epoch": 
1.9603603603603603, + "grad_norm": 0.32817405462265015, + "learning_rate": 2.6818497940482266e-05, + "loss": 0.0325, + "step": 16320 + }, + { + "epoch": 1.960960960960961, + "grad_norm": 0.32023903727531433, + "learning_rate": 2.6790639234537996e-05, + "loss": 0.0272, + "step": 16325 + }, + { + "epoch": 1.9615615615615616, + "grad_norm": 0.33003783226013184, + "learning_rate": 2.6762789709642056e-05, + "loss": 0.0277, + "step": 16330 + }, + { + "epoch": 1.962162162162162, + "grad_norm": 0.2898213267326355, + "learning_rate": 2.6734949376811004e-05, + "loss": 0.0273, + "step": 16335 + }, + { + "epoch": 1.9627627627627628, + "grad_norm": 0.3364682197570801, + "learning_rate": 2.6707118247057793e-05, + "loss": 0.0293, + "step": 16340 + }, + { + "epoch": 1.9633633633633634, + "grad_norm": 0.3233550488948822, + "learning_rate": 2.6679296331391733e-05, + "loss": 0.0292, + "step": 16345 + }, + { + "epoch": 1.9639639639639639, + "grad_norm": 0.27092820405960083, + "learning_rate": 2.6651483640818488e-05, + "loss": 0.0251, + "step": 16350 + }, + { + "epoch": 1.9645645645645646, + "grad_norm": 0.3484068512916565, + "learning_rate": 2.662368018634009e-05, + "loss": 0.0302, + "step": 16355 + }, + { + "epoch": 1.9651651651651652, + "grad_norm": 0.2783617675304413, + "learning_rate": 2.659588597895485e-05, + "loss": 0.0285, + "step": 16360 + }, + { + "epoch": 1.9657657657657657, + "grad_norm": 0.32954466342926025, + "learning_rate": 2.656810102965749e-05, + "loss": 0.0298, + "step": 16365 + }, + { + "epoch": 1.9663663663663664, + "grad_norm": 0.3614228069782257, + "learning_rate": 2.6540325349439054e-05, + "loss": 0.0335, + "step": 16370 + }, + { + "epoch": 1.966966966966967, + "grad_norm": 0.3480527400970459, + "learning_rate": 2.6512558949286903e-05, + "loss": 0.0342, + "step": 16375 + }, + { + "epoch": 1.9675675675675675, + "grad_norm": 0.3006151020526886, + "learning_rate": 2.648480184018477e-05, + "loss": 0.0299, + "step": 16380 + }, + { + "epoch": 1.9681681681681682, + "grad_norm": 0.3213295638561249, + "learning_rate": 2.645705403311261e-05, + "loss": 0.0287, + "step": 16385 + }, + { + "epoch": 1.9687687687687687, + "grad_norm": 0.3097591698169708, + "learning_rate": 2.642931553904685e-05, + "loss": 0.0268, + "step": 16390 + }, + { + "epoch": 1.9693693693693692, + "grad_norm": 0.3181003928184509, + "learning_rate": 2.6401586368960098e-05, + "loss": 0.0301, + "step": 16395 + }, + { + "epoch": 1.96996996996997, + "grad_norm": 0.2848129868507385, + "learning_rate": 2.637386653382134e-05, + "loss": 0.0246, + "step": 16400 + }, + { + "epoch": 1.9705705705705707, + "grad_norm": 0.4496921896934509, + "learning_rate": 2.634615604459587e-05, + "loss": 0.0294, + "step": 16405 + }, + { + "epoch": 1.971171171171171, + "grad_norm": 0.2925359308719635, + "learning_rate": 2.6318454912245248e-05, + "loss": 0.0222, + "step": 16410 + }, + { + "epoch": 1.9717717717717718, + "grad_norm": 0.37273648381233215, + "learning_rate": 2.6290763147727372e-05, + "loss": 0.0302, + "step": 16415 + }, + { + "epoch": 1.9723723723723725, + "grad_norm": 0.3598352074623108, + "learning_rate": 2.626308076199642e-05, + "loss": 0.0315, + "step": 16420 + }, + { + "epoch": 1.972972972972973, + "grad_norm": 0.3557840883731842, + "learning_rate": 2.623540776600284e-05, + "loss": 0.026, + "step": 16425 + }, + { + "epoch": 1.9735735735735735, + "grad_norm": 0.35025104880332947, + "learning_rate": 2.6207744170693392e-05, + "loss": 0.0288, + "step": 16430 + }, + { + "epoch": 1.9741741741741743, + "grad_norm": 0.35492363572120667, + 
"learning_rate": 2.6180089987011115e-05, + "loss": 0.0268, + "step": 16435 + }, + { + "epoch": 1.9747747747747748, + "grad_norm": 0.3412788510322571, + "learning_rate": 2.615244522589534e-05, + "loss": 0.0321, + "step": 16440 + }, + { + "epoch": 1.9753753753753753, + "grad_norm": 0.3852511942386627, + "learning_rate": 2.61248098982816e-05, + "loss": 0.0299, + "step": 16445 + }, + { + "epoch": 1.975975975975976, + "grad_norm": 0.4844917953014374, + "learning_rate": 2.6097184015101772e-05, + "loss": 0.0284, + "step": 16450 + }, + { + "epoch": 1.9765765765765766, + "grad_norm": 0.2313963621854782, + "learning_rate": 2.6069567587283977e-05, + "loss": 0.0245, + "step": 16455 + }, + { + "epoch": 1.9771771771771771, + "grad_norm": 0.29417532682418823, + "learning_rate": 2.6041960625752582e-05, + "loss": 0.0261, + "step": 16460 + }, + { + "epoch": 1.9777777777777779, + "grad_norm": 0.2930136024951935, + "learning_rate": 2.6014363141428242e-05, + "loss": 0.0287, + "step": 16465 + }, + { + "epoch": 1.9783783783783784, + "grad_norm": 0.3059934377670288, + "learning_rate": 2.598677514522779e-05, + "loss": 0.0293, + "step": 16470 + }, + { + "epoch": 1.978978978978979, + "grad_norm": 0.39566734433174133, + "learning_rate": 2.5959196648064392e-05, + "loss": 0.031, + "step": 16475 + }, + { + "epoch": 1.9795795795795796, + "grad_norm": 0.32728102803230286, + "learning_rate": 2.5931627660847402e-05, + "loss": 0.025, + "step": 16480 + }, + { + "epoch": 1.9801801801801802, + "grad_norm": 0.35917210578918457, + "learning_rate": 2.5904068194482455e-05, + "loss": 0.0292, + "step": 16485 + }, + { + "epoch": 1.9807807807807807, + "grad_norm": 0.32126685976982117, + "learning_rate": 2.5876518259871353e-05, + "loss": 0.027, + "step": 16490 + }, + { + "epoch": 1.9813813813813814, + "grad_norm": 0.27140477299690247, + "learning_rate": 2.58489778679122e-05, + "loss": 0.0279, + "step": 16495 + }, + { + "epoch": 1.981981981981982, + "grad_norm": 0.3503856658935547, + "learning_rate": 2.582144702949927e-05, + "loss": 0.0279, + "step": 16500 + }, + { + "epoch": 1.981981981981982, + "eval_loss": 0.044447824358940125, + "eval_runtime": 35.9264, + "eval_samples_per_second": 22.268, + "eval_steps_per_second": 5.567, + "step": 16500 + }, + { + "epoch": 1.9825825825825825, + "grad_norm": 0.38946840167045593, + "learning_rate": 2.579392575552308e-05, + "loss": 0.0249, + "step": 16505 + }, + { + "epoch": 1.9831831831831832, + "grad_norm": 0.3265872001647949, + "learning_rate": 2.57664140568704e-05, + "loss": 0.0276, + "step": 16510 + }, + { + "epoch": 1.983783783783784, + "grad_norm": 0.3320499062538147, + "learning_rate": 2.5738911944424133e-05, + "loss": 0.0317, + "step": 16515 + }, + { + "epoch": 1.9843843843843842, + "grad_norm": 0.27032506465911865, + "learning_rate": 2.571141942906345e-05, + "loss": 0.0295, + "step": 16520 + }, + { + "epoch": 1.984984984984985, + "grad_norm": 0.3871995210647583, + "learning_rate": 2.568393652166371e-05, + "loss": 0.0286, + "step": 16525 + }, + { + "epoch": 1.9855855855855857, + "grad_norm": 0.3653221130371094, + "learning_rate": 2.5656463233096474e-05, + "loss": 0.0258, + "step": 16530 + }, + { + "epoch": 1.986186186186186, + "grad_norm": 0.3507516086101532, + "learning_rate": 2.562899957422952e-05, + "loss": 0.0281, + "step": 16535 + }, + { + "epoch": 1.9867867867867868, + "grad_norm": 0.32332906126976013, + "learning_rate": 2.5601545555926733e-05, + "loss": 0.0273, + "step": 16540 + }, + { + "epoch": 1.9873873873873875, + "grad_norm": 0.41225871443748474, + "learning_rate": 
2.5574101189048276e-05, + "loss": 0.0265, + "step": 16545 + }, + { + "epoch": 1.987987987987988, + "grad_norm": 0.33340364694595337, + "learning_rate": 2.554666648445046e-05, + "loss": 0.033, + "step": 16550 + }, + { + "epoch": 1.9885885885885886, + "grad_norm": 0.4418809413909912, + "learning_rate": 2.5519241452985777e-05, + "loss": 0.0248, + "step": 16555 + }, + { + "epoch": 1.9891891891891893, + "grad_norm": 0.30932164192199707, + "learning_rate": 2.5491826105502897e-05, + "loss": 0.0297, + "step": 16560 + }, + { + "epoch": 1.9897897897897898, + "grad_norm": 0.33863547444343567, + "learning_rate": 2.5464420452846627e-05, + "loss": 0.0302, + "step": 16565 + }, + { + "epoch": 1.9903903903903903, + "grad_norm": 0.3218494653701782, + "learning_rate": 2.5437024505857983e-05, + "loss": 0.0309, + "step": 16570 + }, + { + "epoch": 1.990990990990991, + "grad_norm": 0.3258943259716034, + "learning_rate": 2.5409638275374113e-05, + "loss": 0.0286, + "step": 16575 + }, + { + "epoch": 1.9915915915915916, + "grad_norm": 0.34837913513183594, + "learning_rate": 2.538226177222836e-05, + "loss": 0.0251, + "step": 16580 + }, + { + "epoch": 1.9921921921921921, + "grad_norm": 0.37401098012924194, + "learning_rate": 2.535489500725015e-05, + "loss": 0.0306, + "step": 16585 + }, + { + "epoch": 1.9927927927927929, + "grad_norm": 0.2671530246734619, + "learning_rate": 2.5327537991265137e-05, + "loss": 0.0261, + "step": 16590 + }, + { + "epoch": 1.9933933933933934, + "grad_norm": 0.3131420910358429, + "learning_rate": 2.5300190735095038e-05, + "loss": 0.0252, + "step": 16595 + }, + { + "epoch": 1.993993993993994, + "grad_norm": 0.28659263253211975, + "learning_rate": 2.527285324955777e-05, + "loss": 0.0333, + "step": 16600 + }, + { + "epoch": 1.9945945945945946, + "grad_norm": 0.37140652537345886, + "learning_rate": 2.524552554546738e-05, + "loss": 0.0271, + "step": 16605 + }, + { + "epoch": 1.9951951951951952, + "grad_norm": 0.3571425974369049, + "learning_rate": 2.5218207633634005e-05, + "loss": 0.0267, + "step": 16610 + }, + { + "epoch": 1.9957957957957957, + "grad_norm": 0.2770904004573822, + "learning_rate": 2.5190899524863942e-05, + "loss": 0.0253, + "step": 16615 + }, + { + "epoch": 1.9963963963963964, + "grad_norm": 0.33116596937179565, + "learning_rate": 2.5163601229959606e-05, + "loss": 0.0285, + "step": 16620 + }, + { + "epoch": 1.996996996996997, + "grad_norm": 0.4822072684764862, + "learning_rate": 2.5136312759719525e-05, + "loss": 0.0308, + "step": 16625 + }, + { + "epoch": 1.9975975975975975, + "grad_norm": 0.3068985641002655, + "learning_rate": 2.510903412493837e-05, + "loss": 0.0302, + "step": 16630 + }, + { + "epoch": 1.9981981981981982, + "grad_norm": 0.267890989780426, + "learning_rate": 2.5081765336406838e-05, + "loss": 0.025, + "step": 16635 + }, + { + "epoch": 1.998798798798799, + "grad_norm": 0.3101384937763214, + "learning_rate": 2.5054506404911827e-05, + "loss": 0.0257, + "step": 16640 + }, + { + "epoch": 1.9993993993993993, + "grad_norm": 0.3918618857860565, + "learning_rate": 2.5027257341236275e-05, + "loss": 0.0303, + "step": 16645 + }, + { + "epoch": 2.0, + "grad_norm": 0.43106088042259216, + "learning_rate": 2.5000018156159266e-05, + "loss": 0.0318, + "step": 16650 + }, + { + "epoch": 2.0006006006006007, + "grad_norm": 0.3042386472225189, + "learning_rate": 2.497278886045591e-05, + "loss": 0.0189, + "step": 16655 + }, + { + "epoch": 2.001201201201201, + "grad_norm": 0.2652837932109833, + "learning_rate": 2.4945569464897458e-05, + "loss": 0.02, + "step": 16660 + }, + { + "epoch": 
2.001801801801802, + "grad_norm": 0.21576140820980072, + "learning_rate": 2.4918359980251226e-05, + "loss": 0.018, + "step": 16665 + }, + { + "epoch": 2.0024024024024025, + "grad_norm": 0.2946082055568695, + "learning_rate": 2.4891160417280617e-05, + "loss": 0.018, + "step": 16670 + }, + { + "epoch": 2.003003003003003, + "grad_norm": 0.2486870139837265, + "learning_rate": 2.486397078674513e-05, + "loss": 0.0184, + "step": 16675 + }, + { + "epoch": 2.0036036036036036, + "grad_norm": 0.21376429498195648, + "learning_rate": 2.4836791099400253e-05, + "loss": 0.0185, + "step": 16680 + }, + { + "epoch": 2.0042042042042043, + "grad_norm": 0.4368433356285095, + "learning_rate": 2.480962136599765e-05, + "loss": 0.021, + "step": 16685 + }, + { + "epoch": 2.0048048048048046, + "grad_norm": 0.22392675280570984, + "learning_rate": 2.478246159728495e-05, + "loss": 0.0178, + "step": 16690 + }, + { + "epoch": 2.0054054054054054, + "grad_norm": 0.3195738196372986, + "learning_rate": 2.475531180400591e-05, + "loss": 0.0177, + "step": 16695 + }, + { + "epoch": 2.006006006006006, + "grad_norm": 0.27009859681129456, + "learning_rate": 2.472817199690033e-05, + "loss": 0.0153, + "step": 16700 + }, + { + "epoch": 2.0066066066066064, + "grad_norm": 0.26511505246162415, + "learning_rate": 2.470104218670401e-05, + "loss": 0.02, + "step": 16705 + }, + { + "epoch": 2.007207207207207, + "grad_norm": 0.2634662389755249, + "learning_rate": 2.4673922384148847e-05, + "loss": 0.0211, + "step": 16710 + }, + { + "epoch": 2.007807807807808, + "grad_norm": 0.2865537106990814, + "learning_rate": 2.4646812599962766e-05, + "loss": 0.019, + "step": 16715 + }, + { + "epoch": 2.0084084084084086, + "grad_norm": 0.38468220829963684, + "learning_rate": 2.461971284486974e-05, + "loss": 0.0171, + "step": 16720 + }, + { + "epoch": 2.009009009009009, + "grad_norm": 0.20631487667560577, + "learning_rate": 2.459262312958973e-05, + "loss": 0.0165, + "step": 16725 + }, + { + "epoch": 2.0096096096096097, + "grad_norm": 0.2423275113105774, + "learning_rate": 2.456554346483877e-05, + "loss": 0.0169, + "step": 16730 + }, + { + "epoch": 2.0102102102102104, + "grad_norm": 0.3133910000324249, + "learning_rate": 2.453847386132891e-05, + "loss": 0.0179, + "step": 16735 + }, + { + "epoch": 2.0108108108108107, + "grad_norm": 0.24423719942569733, + "learning_rate": 2.451141432976821e-05, + "loss": 0.0178, + "step": 16740 + }, + { + "epoch": 2.0114114114114114, + "grad_norm": 0.37259042263031006, + "learning_rate": 2.4484364880860777e-05, + "loss": 0.0183, + "step": 16745 + }, + { + "epoch": 2.012012012012012, + "grad_norm": 0.2786225378513336, + "learning_rate": 2.445732552530665e-05, + "loss": 0.0175, + "step": 16750 + }, + { + "epoch": 2.012012012012012, + "eval_loss": 0.04507996514439583, + "eval_runtime": 35.9044, + "eval_samples_per_second": 22.281, + "eval_steps_per_second": 5.57, + "step": 16750 + }, + { + "epoch": 2.0126126126126125, + "grad_norm": 0.3350470960140228, + "learning_rate": 2.4430296273801968e-05, + "loss": 0.0201, + "step": 16755 + }, + { + "epoch": 2.0132132132132132, + "grad_norm": 0.1945425271987915, + "learning_rate": 2.4403277137038815e-05, + "loss": 0.0197, + "step": 16760 + }, + { + "epoch": 2.013813813813814, + "grad_norm": 0.29737523198127747, + "learning_rate": 2.4376268125705322e-05, + "loss": 0.0177, + "step": 16765 + }, + { + "epoch": 2.0144144144144143, + "grad_norm": 0.3143198788166046, + "learning_rate": 2.434926925048554e-05, + "loss": 0.0165, + "step": 16770 + }, + { + "epoch": 2.015015015015015, + "grad_norm": 
0.2871094048023224, + "learning_rate": 2.4322280522059583e-05, + "loss": 0.0153, + "step": 16775 + }, + { + "epoch": 2.0156156156156158, + "grad_norm": 0.3184792399406433, + "learning_rate": 2.4295301951103532e-05, + "loss": 0.0195, + "step": 16780 + }, + { + "epoch": 2.016216216216216, + "grad_norm": 0.18864959478378296, + "learning_rate": 2.4268333548289417e-05, + "loss": 0.0198, + "step": 16785 + }, + { + "epoch": 2.016816816816817, + "grad_norm": 0.25299298763275146, + "learning_rate": 2.4241375324285276e-05, + "loss": 0.0175, + "step": 16790 + }, + { + "epoch": 2.0174174174174175, + "grad_norm": 0.31384697556495667, + "learning_rate": 2.4214427289755142e-05, + "loss": 0.0177, + "step": 16795 + }, + { + "epoch": 2.018018018018018, + "grad_norm": 0.24413210153579712, + "learning_rate": 2.4187489455358948e-05, + "loss": 0.0175, + "step": 16800 + }, + { + "epoch": 2.0186186186186186, + "grad_norm": 0.23016425967216492, + "learning_rate": 2.4160561831752653e-05, + "loss": 0.0156, + "step": 16805 + }, + { + "epoch": 2.0192192192192193, + "grad_norm": 0.35870787501335144, + "learning_rate": 2.4133644429588164e-05, + "loss": 0.0166, + "step": 16810 + }, + { + "epoch": 2.0198198198198196, + "grad_norm": 0.22207961976528168, + "learning_rate": 2.410673725951335e-05, + "loss": 0.0173, + "step": 16815 + }, + { + "epoch": 2.0204204204204204, + "grad_norm": 0.2038796991109848, + "learning_rate": 2.407984033217199e-05, + "loss": 0.0167, + "step": 16820 + }, + { + "epoch": 2.021021021021021, + "grad_norm": 0.22484976053237915, + "learning_rate": 2.405295365820385e-05, + "loss": 0.0174, + "step": 16825 + }, + { + "epoch": 2.0216216216216214, + "grad_norm": 0.20852796733379364, + "learning_rate": 2.4026077248244642e-05, + "loss": 0.0174, + "step": 16830 + }, + { + "epoch": 2.022222222222222, + "grad_norm": 0.23906341195106506, + "learning_rate": 2.399921111292601e-05, + "loss": 0.0161, + "step": 16835 + }, + { + "epoch": 2.022822822822823, + "grad_norm": 0.21950270235538483, + "learning_rate": 2.3972355262875545e-05, + "loss": 0.0167, + "step": 16840 + }, + { + "epoch": 2.0234234234234236, + "grad_norm": 0.23244847357273102, + "learning_rate": 2.3945509708716723e-05, + "loss": 0.0173, + "step": 16845 + }, + { + "epoch": 2.024024024024024, + "grad_norm": 0.3130114674568176, + "learning_rate": 2.391867446106899e-05, + "loss": 0.0186, + "step": 16850 + }, + { + "epoch": 2.0246246246246247, + "grad_norm": 0.30832698941230774, + "learning_rate": 2.389184953054772e-05, + "loss": 0.015, + "step": 16855 + }, + { + "epoch": 2.0252252252252254, + "grad_norm": 0.2931411564350128, + "learning_rate": 2.3865034927764195e-05, + "loss": 0.0172, + "step": 16860 + }, + { + "epoch": 2.0258258258258257, + "grad_norm": 0.2423478215932846, + "learning_rate": 2.3838230663325582e-05, + "loss": 0.0184, + "step": 16865 + }, + { + "epoch": 2.0264264264264265, + "grad_norm": 0.23603129386901855, + "learning_rate": 2.3811436747835014e-05, + "loss": 0.0181, + "step": 16870 + }, + { + "epoch": 2.027027027027027, + "grad_norm": 0.23850959539413452, + "learning_rate": 2.3784653191891466e-05, + "loss": 0.0147, + "step": 16875 + }, + { + "epoch": 2.0276276276276275, + "grad_norm": 0.19226299226284027, + "learning_rate": 2.3757880006089866e-05, + "loss": 0.0179, + "step": 16880 + }, + { + "epoch": 2.0282282282282282, + "grad_norm": 0.1841306984424591, + "learning_rate": 2.3731117201021048e-05, + "loss": 0.0159, + "step": 16885 + }, + { + "epoch": 2.028828828828829, + "grad_norm": 0.2256769835948944, + "learning_rate": 
2.370436478727167e-05, + "loss": 0.016, + "step": 16890 + }, + { + "epoch": 2.0294294294294293, + "grad_norm": 0.1400245726108551, + "learning_rate": 2.3677622775424347e-05, + "loss": 0.0113, + "step": 16895 + }, + { + "epoch": 2.03003003003003, + "grad_norm": 0.30644339323043823, + "learning_rate": 2.3650891176057556e-05, + "loss": 0.0197, + "step": 16900 + }, + { + "epoch": 2.0306306306306308, + "grad_norm": 0.2984524071216583, + "learning_rate": 2.3624169999745654e-05, + "loss": 0.0161, + "step": 16905 + }, + { + "epoch": 2.031231231231231, + "grad_norm": 0.30133652687072754, + "learning_rate": 2.3597459257058897e-05, + "loss": 0.0179, + "step": 16910 + }, + { + "epoch": 2.031831831831832, + "grad_norm": 0.3699958622455597, + "learning_rate": 2.3570758958563354e-05, + "loss": 0.0203, + "step": 16915 + }, + { + "epoch": 2.0324324324324325, + "grad_norm": 0.22517237067222595, + "learning_rate": 2.3544069114821027e-05, + "loss": 0.0166, + "step": 16920 + }, + { + "epoch": 2.033033033033033, + "grad_norm": 0.24874772131443024, + "learning_rate": 2.351738973638975e-05, + "loss": 0.0198, + "step": 16925 + }, + { + "epoch": 2.0336336336336336, + "grad_norm": 0.26324090361595154, + "learning_rate": 2.3490720833823228e-05, + "loss": 0.0152, + "step": 16930 + }, + { + "epoch": 2.0342342342342343, + "grad_norm": 0.22640137374401093, + "learning_rate": 2.3464062417671035e-05, + "loss": 0.016, + "step": 16935 + }, + { + "epoch": 2.0348348348348346, + "grad_norm": 0.19408166408538818, + "learning_rate": 2.3437414498478542e-05, + "loss": 0.0179, + "step": 16940 + }, + { + "epoch": 2.0354354354354354, + "grad_norm": 0.25801095366477966, + "learning_rate": 2.341077708678703e-05, + "loss": 0.0179, + "step": 16945 + }, + { + "epoch": 2.036036036036036, + "grad_norm": 0.1984955370426178, + "learning_rate": 2.3384150193133587e-05, + "loss": 0.0196, + "step": 16950 + }, + { + "epoch": 2.036636636636637, + "grad_norm": 0.3280339241027832, + "learning_rate": 2.3357533828051188e-05, + "loss": 0.0172, + "step": 16955 + }, + { + "epoch": 2.037237237237237, + "grad_norm": 0.4044642150402069, + "learning_rate": 2.333092800206856e-05, + "loss": 0.0177, + "step": 16960 + }, + { + "epoch": 2.037837837837838, + "grad_norm": 0.3203864097595215, + "learning_rate": 2.330433272571035e-05, + "loss": 0.0176, + "step": 16965 + }, + { + "epoch": 2.0384384384384386, + "grad_norm": 0.19249285757541656, + "learning_rate": 2.3277748009496947e-05, + "loss": 0.0172, + "step": 16970 + }, + { + "epoch": 2.039039039039039, + "grad_norm": 0.3457205891609192, + "learning_rate": 2.3251173863944636e-05, + "loss": 0.017, + "step": 16975 + }, + { + "epoch": 2.0396396396396397, + "grad_norm": 0.24442507326602936, + "learning_rate": 2.3224610299565503e-05, + "loss": 0.0149, + "step": 16980 + }, + { + "epoch": 2.0402402402402404, + "grad_norm": 0.2592158317565918, + "learning_rate": 2.3198057326867395e-05, + "loss": 0.0189, + "step": 16985 + }, + { + "epoch": 2.0408408408408407, + "grad_norm": 0.3123731315135956, + "learning_rate": 2.3171514956354033e-05, + "loss": 0.0167, + "step": 16990 + }, + { + "epoch": 2.0414414414414415, + "grad_norm": 0.28269627690315247, + "learning_rate": 2.314498319852493e-05, + "loss": 0.0174, + "step": 16995 + }, + { + "epoch": 2.042042042042042, + "grad_norm": 0.3151399493217468, + "learning_rate": 2.3118462063875373e-05, + "loss": 0.0156, + "step": 17000 + }, + { + "epoch": 2.042042042042042, + "eval_loss": 0.04410775005817413, + "eval_runtime": 35.8994, + "eval_samples_per_second": 22.284, + 
"eval_steps_per_second": 5.571, + "step": 17000 + }, + { + "epoch": 2.0426426426426425, + "grad_norm": 0.22928915917873383, + "learning_rate": 2.3091951562896502e-05, + "loss": 0.0169, + "step": 17005 + }, + { + "epoch": 2.0432432432432432, + "grad_norm": 0.22066111862659454, + "learning_rate": 2.306545170607517e-05, + "loss": 0.0166, + "step": 17010 + }, + { + "epoch": 2.043843843843844, + "grad_norm": 0.16600383818149567, + "learning_rate": 2.3038962503894086e-05, + "loss": 0.0153, + "step": 17015 + }, + { + "epoch": 2.0444444444444443, + "grad_norm": 0.21326720714569092, + "learning_rate": 2.3012483966831718e-05, + "loss": 0.0159, + "step": 17020 + }, + { + "epoch": 2.045045045045045, + "grad_norm": 0.3878456652164459, + "learning_rate": 2.298601610536234e-05, + "loss": 0.017, + "step": 17025 + }, + { + "epoch": 2.0456456456456458, + "grad_norm": 0.23753908276557922, + "learning_rate": 2.2959558929955943e-05, + "loss": 0.0151, + "step": 17030 + }, + { + "epoch": 2.046246246246246, + "grad_norm": 0.25556668639183044, + "learning_rate": 2.293311245107836e-05, + "loss": 0.0162, + "step": 17035 + }, + { + "epoch": 2.046846846846847, + "grad_norm": 0.29425516724586487, + "learning_rate": 2.2906676679191146e-05, + "loss": 0.0191, + "step": 17040 + }, + { + "epoch": 2.0474474474474476, + "grad_norm": 0.22631238400936127, + "learning_rate": 2.288025162475165e-05, + "loss": 0.0164, + "step": 17045 + }, + { + "epoch": 2.048048048048048, + "grad_norm": 0.2662695348262787, + "learning_rate": 2.285383729821298e-05, + "loss": 0.0155, + "step": 17050 + }, + { + "epoch": 2.0486486486486486, + "grad_norm": 0.31905195116996765, + "learning_rate": 2.2827433710023967e-05, + "loss": 0.0171, + "step": 17055 + }, + { + "epoch": 2.0492492492492493, + "grad_norm": 0.27785614132881165, + "learning_rate": 2.2801040870629232e-05, + "loss": 0.0163, + "step": 17060 + }, + { + "epoch": 2.0498498498498496, + "grad_norm": 0.19194890558719635, + "learning_rate": 2.2774658790469106e-05, + "loss": 0.0152, + "step": 17065 + }, + { + "epoch": 2.0504504504504504, + "grad_norm": 0.32209452986717224, + "learning_rate": 2.27482874799797e-05, + "loss": 0.0193, + "step": 17070 + }, + { + "epoch": 2.051051051051051, + "grad_norm": 0.2572319507598877, + "learning_rate": 2.2721926949592877e-05, + "loss": 0.0146, + "step": 17075 + }, + { + "epoch": 2.051651651651652, + "grad_norm": 0.3050822913646698, + "learning_rate": 2.2695577209736163e-05, + "loss": 0.0177, + "step": 17080 + }, + { + "epoch": 2.052252252252252, + "grad_norm": 0.1752171516418457, + "learning_rate": 2.2669238270832883e-05, + "loss": 0.0138, + "step": 17085 + }, + { + "epoch": 2.052852852852853, + "grad_norm": 0.32753434777259827, + "learning_rate": 2.264291014330207e-05, + "loss": 0.0202, + "step": 17090 + }, + { + "epoch": 2.0534534534534536, + "grad_norm": 0.2691376209259033, + "learning_rate": 2.2616592837558502e-05, + "loss": 0.018, + "step": 17095 + }, + { + "epoch": 2.054054054054054, + "grad_norm": 0.27995845675468445, + "learning_rate": 2.259028636401262e-05, + "loss": 0.0186, + "step": 17100 + }, + { + "epoch": 2.0546546546546547, + "grad_norm": 0.20545314252376556, + "learning_rate": 2.2563990733070616e-05, + "loss": 0.0143, + "step": 17105 + }, + { + "epoch": 2.0552552552552554, + "grad_norm": 0.24307379126548767, + "learning_rate": 2.2537705955134402e-05, + "loss": 0.0156, + "step": 17110 + }, + { + "epoch": 2.0558558558558557, + "grad_norm": 0.2568596303462982, + "learning_rate": 2.251143204060159e-05, + "loss": 0.0161, + "step": 17115 + }, + { + 
"epoch": 2.0564564564564565, + "grad_norm": 0.3136351704597473, + "learning_rate": 2.2485168999865493e-05, + "loss": 0.0154, + "step": 17120 + }, + { + "epoch": 2.057057057057057, + "grad_norm": 0.2579204738140106, + "learning_rate": 2.2458916843315092e-05, + "loss": 0.0171, + "step": 17125 + }, + { + "epoch": 2.0576576576576575, + "grad_norm": 0.2734680771827698, + "learning_rate": 2.2432675581335105e-05, + "loss": 0.0187, + "step": 17130 + }, + { + "epoch": 2.0582582582582583, + "grad_norm": 0.2730713486671448, + "learning_rate": 2.2406445224305928e-05, + "loss": 0.0164, + "step": 17135 + }, + { + "epoch": 2.058858858858859, + "grad_norm": 0.29078954458236694, + "learning_rate": 2.2380225782603665e-05, + "loss": 0.0169, + "step": 17140 + }, + { + "epoch": 2.0594594594594593, + "grad_norm": 0.21432556211948395, + "learning_rate": 2.2354017266600032e-05, + "loss": 0.0184, + "step": 17145 + }, + { + "epoch": 2.06006006006006, + "grad_norm": 0.26164016127586365, + "learning_rate": 2.2327819686662504e-05, + "loss": 0.0157, + "step": 17150 + }, + { + "epoch": 2.060660660660661, + "grad_norm": 0.27027854323387146, + "learning_rate": 2.2301633053154164e-05, + "loss": 0.0165, + "step": 17155 + }, + { + "epoch": 2.061261261261261, + "grad_norm": 0.269004762172699, + "learning_rate": 2.227545737643381e-05, + "loss": 0.0159, + "step": 17160 + }, + { + "epoch": 2.061861861861862, + "grad_norm": 0.2210037261247635, + "learning_rate": 2.2249292666855916e-05, + "loss": 0.0153, + "step": 17165 + }, + { + "epoch": 2.0624624624624626, + "grad_norm": 0.3236193358898163, + "learning_rate": 2.222313893477055e-05, + "loss": 0.0172, + "step": 17170 + }, + { + "epoch": 2.063063063063063, + "grad_norm": 0.25057151913642883, + "learning_rate": 2.2196996190523507e-05, + "loss": 0.0166, + "step": 17175 + }, + { + "epoch": 2.0636636636636636, + "grad_norm": 0.27561861276626587, + "learning_rate": 2.2170864444456195e-05, + "loss": 0.0151, + "step": 17180 + }, + { + "epoch": 2.0642642642642643, + "grad_norm": 0.2706261873245239, + "learning_rate": 2.21447437069057e-05, + "loss": 0.0189, + "step": 17185 + }, + { + "epoch": 2.064864864864865, + "grad_norm": 0.3415422737598419, + "learning_rate": 2.2118633988204753e-05, + "loss": 0.0158, + "step": 17190 + }, + { + "epoch": 2.0654654654654654, + "grad_norm": 0.2783031761646271, + "learning_rate": 2.2092535298681667e-05, + "loss": 0.0182, + "step": 17195 + }, + { + "epoch": 2.066066066066066, + "grad_norm": 0.2432776540517807, + "learning_rate": 2.2066447648660465e-05, + "loss": 0.0205, + "step": 17200 + }, + { + "epoch": 2.066666666666667, + "grad_norm": 0.236969456076622, + "learning_rate": 2.2040371048460778e-05, + "loss": 0.0173, + "step": 17205 + }, + { + "epoch": 2.067267267267267, + "grad_norm": 0.23664186894893646, + "learning_rate": 2.2014305508397853e-05, + "loss": 0.0164, + "step": 17210 + }, + { + "epoch": 2.067867867867868, + "grad_norm": 0.31544530391693115, + "learning_rate": 2.1988251038782593e-05, + "loss": 0.0168, + "step": 17215 + }, + { + "epoch": 2.0684684684684687, + "grad_norm": 0.18375684320926666, + "learning_rate": 2.1962207649921472e-05, + "loss": 0.0166, + "step": 17220 + }, + { + "epoch": 2.069069069069069, + "grad_norm": 0.24297571182250977, + "learning_rate": 2.193617535211662e-05, + "loss": 0.0164, + "step": 17225 + }, + { + "epoch": 2.0696696696696697, + "grad_norm": 0.3058975040912628, + "learning_rate": 2.1910154155665774e-05, + "loss": 0.0182, + "step": 17230 + }, + { + "epoch": 2.0702702702702704, + "grad_norm": 0.3959430158138275, + 
"learning_rate": 2.1884144070862288e-05, + "loss": 0.0188, + "step": 17235 + }, + { + "epoch": 2.0708708708708707, + "grad_norm": 0.31866326928138733, + "learning_rate": 2.1858145107995078e-05, + "loss": 0.0191, + "step": 17240 + }, + { + "epoch": 2.0714714714714715, + "grad_norm": 0.2213527113199234, + "learning_rate": 2.183215727734872e-05, + "loss": 0.0163, + "step": 17245 + }, + { + "epoch": 2.0720720720720722, + "grad_norm": 0.30789023637771606, + "learning_rate": 2.1806180589203318e-05, + "loss": 0.0191, + "step": 17250 + }, + { + "epoch": 2.0720720720720722, + "eval_loss": 0.04393164813518524, + "eval_runtime": 35.9507, + "eval_samples_per_second": 22.253, + "eval_steps_per_second": 5.563, + "step": 17250 + }, + { + "epoch": 2.0726726726726725, + "grad_norm": 0.37385308742523193, + "learning_rate": 2.1780215053834635e-05, + "loss": 0.0165, + "step": 17255 + }, + { + "epoch": 2.0732732732732733, + "grad_norm": 0.2261199951171875, + "learning_rate": 2.1754260681514e-05, + "loss": 0.0168, + "step": 17260 + }, + { + "epoch": 2.073873873873874, + "grad_norm": 0.2935144901275635, + "learning_rate": 2.1728317482508293e-05, + "loss": 0.0171, + "step": 17265 + }, + { + "epoch": 2.0744744744744743, + "grad_norm": 0.25347235798835754, + "learning_rate": 2.170238546708001e-05, + "loss": 0.0172, + "step": 17270 + }, + { + "epoch": 2.075075075075075, + "grad_norm": 0.30140021443367004, + "learning_rate": 2.1676464645487227e-05, + "loss": 0.0142, + "step": 17275 + }, + { + "epoch": 2.075675675675676, + "grad_norm": 0.26232364773750305, + "learning_rate": 2.1650555027983566e-05, + "loss": 0.0169, + "step": 17280 + }, + { + "epoch": 2.076276276276276, + "grad_norm": 0.2933792471885681, + "learning_rate": 2.1624656624818258e-05, + "loss": 0.0152, + "step": 17285 + }, + { + "epoch": 2.076876876876877, + "grad_norm": 0.23124483227729797, + "learning_rate": 2.159876944623602e-05, + "loss": 0.0175, + "step": 17290 + }, + { + "epoch": 2.0774774774774776, + "grad_norm": 0.18243825435638428, + "learning_rate": 2.1572893502477216e-05, + "loss": 0.0142, + "step": 17295 + }, + { + "epoch": 2.078078078078078, + "grad_norm": 0.2552858293056488, + "learning_rate": 2.1547028803777718e-05, + "loss": 0.0152, + "step": 17300 + }, + { + "epoch": 2.0786786786786786, + "grad_norm": 0.19179783761501312, + "learning_rate": 2.1521175360368956e-05, + "loss": 0.0148, + "step": 17305 + }, + { + "epoch": 2.0792792792792794, + "grad_norm": 0.2637120187282562, + "learning_rate": 2.149533318247794e-05, + "loss": 0.0159, + "step": 17310 + }, + { + "epoch": 2.07987987987988, + "grad_norm": 0.34350207448005676, + "learning_rate": 2.1469502280327147e-05, + "loss": 0.02, + "step": 17315 + }, + { + "epoch": 2.0804804804804804, + "grad_norm": 0.32478630542755127, + "learning_rate": 2.1443682664134675e-05, + "loss": 0.0219, + "step": 17320 + }, + { + "epoch": 2.081081081081081, + "grad_norm": 0.25575026869773865, + "learning_rate": 2.1417874344114114e-05, + "loss": 0.0172, + "step": 17325 + }, + { + "epoch": 2.081681681681682, + "grad_norm": 0.23104359209537506, + "learning_rate": 2.139207733047462e-05, + "loss": 0.0223, + "step": 17330 + }, + { + "epoch": 2.082282282282282, + "grad_norm": 0.2576219439506531, + "learning_rate": 2.136629163342081e-05, + "loss": 0.0164, + "step": 17335 + }, + { + "epoch": 2.082882882882883, + "grad_norm": 0.259834885597229, + "learning_rate": 2.134051726315291e-05, + "loss": 0.0147, + "step": 17340 + }, + { + "epoch": 2.0834834834834837, + "grad_norm": 0.26843321323394775, + "learning_rate": 
2.131475422986658e-05, + "loss": 0.0172, + "step": 17345 + }, + { + "epoch": 2.084084084084084, + "grad_norm": 0.2459273487329483, + "learning_rate": 2.1289002543753062e-05, + "loss": 0.0174, + "step": 17350 + }, + { + "epoch": 2.0846846846846847, + "grad_norm": 0.29039624333381653, + "learning_rate": 2.1263262214999103e-05, + "loss": 0.0167, + "step": 17355 + }, + { + "epoch": 2.0852852852852855, + "grad_norm": 0.31065475940704346, + "learning_rate": 2.1237533253786902e-05, + "loss": 0.0172, + "step": 17360 + }, + { + "epoch": 2.0858858858858857, + "grad_norm": 0.2677178680896759, + "learning_rate": 2.121181567029421e-05, + "loss": 0.0166, + "step": 17365 + }, + { + "epoch": 2.0864864864864865, + "grad_norm": 0.22189515829086304, + "learning_rate": 2.1186109474694277e-05, + "loss": 0.0176, + "step": 17370 + }, + { + "epoch": 2.0870870870870872, + "grad_norm": 0.27094221115112305, + "learning_rate": 2.116041467715583e-05, + "loss": 0.0186, + "step": 17375 + }, + { + "epoch": 2.0876876876876875, + "grad_norm": 0.2697623074054718, + "learning_rate": 2.1134731287843125e-05, + "loss": 0.0194, + "step": 17380 + }, + { + "epoch": 2.0882882882882883, + "grad_norm": 0.32814136147499084, + "learning_rate": 2.1109059316915817e-05, + "loss": 0.0207, + "step": 17385 + }, + { + "epoch": 2.088888888888889, + "grad_norm": 0.32178542017936707, + "learning_rate": 2.108339877452914e-05, + "loss": 0.017, + "step": 17390 + }, + { + "epoch": 2.0894894894894893, + "grad_norm": 0.29471707344055176, + "learning_rate": 2.105774967083376e-05, + "loss": 0.0168, + "step": 17395 + }, + { + "epoch": 2.09009009009009, + "grad_norm": 0.252353698015213, + "learning_rate": 2.1032112015975845e-05, + "loss": 0.0166, + "step": 17400 + }, + { + "epoch": 2.090690690690691, + "grad_norm": 0.24682864546775818, + "learning_rate": 2.1006485820096983e-05, + "loss": 0.0163, + "step": 17405 + }, + { + "epoch": 2.091291291291291, + "grad_norm": 0.2174302339553833, + "learning_rate": 2.0980871093334274e-05, + "loss": 0.0166, + "step": 17410 + }, + { + "epoch": 2.091891891891892, + "grad_norm": 0.2508426010608673, + "learning_rate": 2.095526784582028e-05, + "loss": 0.0176, + "step": 17415 + }, + { + "epoch": 2.0924924924924926, + "grad_norm": 0.19831790030002594, + "learning_rate": 2.092967608768301e-05, + "loss": 0.0141, + "step": 17420 + }, + { + "epoch": 2.093093093093093, + "grad_norm": 0.2547295391559601, + "learning_rate": 2.0904095829045933e-05, + "loss": 0.0178, + "step": 17425 + }, + { + "epoch": 2.0936936936936936, + "grad_norm": 0.3118404746055603, + "learning_rate": 2.0878527080027948e-05, + "loss": 0.0157, + "step": 17430 + }, + { + "epoch": 2.0942942942942944, + "grad_norm": 0.18359223008155823, + "learning_rate": 2.0852969850743447e-05, + "loss": 0.0169, + "step": 17435 + }, + { + "epoch": 2.094894894894895, + "grad_norm": 0.2618004083633423, + "learning_rate": 2.08274241513022e-05, + "loss": 0.0181, + "step": 17440 + }, + { + "epoch": 2.0954954954954954, + "grad_norm": 0.33545413613319397, + "learning_rate": 2.0801889991809477e-05, + "loss": 0.0181, + "step": 17445 + }, + { + "epoch": 2.096096096096096, + "grad_norm": 0.1954948455095291, + "learning_rate": 2.077636738236597e-05, + "loss": 0.0124, + "step": 17450 + }, + { + "epoch": 2.096696696696697, + "grad_norm": 0.2551083564758301, + "learning_rate": 2.075085633306776e-05, + "loss": 0.0128, + "step": 17455 + }, + { + "epoch": 2.097297297297297, + "grad_norm": 0.1947358250617981, + "learning_rate": 2.0725356854006396e-05, + "loss": 0.0163, + "step": 17460 + }, + { + 
"epoch": 2.097897897897898, + "grad_norm": 0.24798135459423065, + "learning_rate": 2.0699868955268854e-05, + "loss": 0.0182, + "step": 17465 + }, + { + "epoch": 2.0984984984984987, + "grad_norm": 0.12923765182495117, + "learning_rate": 2.067439264693752e-05, + "loss": 0.0156, + "step": 17470 + }, + { + "epoch": 2.099099099099099, + "grad_norm": 0.21395176649093628, + "learning_rate": 2.0648927939090164e-05, + "loss": 0.0142, + "step": 17475 + }, + { + "epoch": 2.0996996996996997, + "grad_norm": 0.284847229719162, + "learning_rate": 2.0623474841800007e-05, + "loss": 0.0164, + "step": 17480 + }, + { + "epoch": 2.1003003003003005, + "grad_norm": 0.38137951493263245, + "learning_rate": 2.0598033365135665e-05, + "loss": 0.0157, + "step": 17485 + }, + { + "epoch": 2.1009009009009008, + "grad_norm": 0.3931626081466675, + "learning_rate": 2.057260351916116e-05, + "loss": 0.0167, + "step": 17490 + }, + { + "epoch": 2.1015015015015015, + "grad_norm": 0.2608947157859802, + "learning_rate": 2.0547185313935924e-05, + "loss": 0.0152, + "step": 17495 + }, + { + "epoch": 2.1021021021021022, + "grad_norm": 0.21299488842487335, + "learning_rate": 2.0521778759514732e-05, + "loss": 0.0153, + "step": 17500 + }, + { + "epoch": 2.1021021021021022, + "eval_loss": 0.04359756410121918, + "eval_runtime": 35.8185, + "eval_samples_per_second": 22.335, + "eval_steps_per_second": 5.584, + "step": 17500 + }, + { + "epoch": 2.1027027027027025, + "grad_norm": 0.2490372657775879, + "learning_rate": 2.0496383865947806e-05, + "loss": 0.0143, + "step": 17505 + }, + { + "epoch": 2.1033033033033033, + "grad_norm": 0.2208997756242752, + "learning_rate": 2.0471000643280735e-05, + "loss": 0.018, + "step": 17510 + }, + { + "epoch": 2.103903903903904, + "grad_norm": 0.2556515634059906, + "learning_rate": 2.044562910155452e-05, + "loss": 0.0154, + "step": 17515 + }, + { + "epoch": 2.1045045045045043, + "grad_norm": 0.23904521763324738, + "learning_rate": 2.042026925080547e-05, + "loss": 0.0178, + "step": 17520 + }, + { + "epoch": 2.105105105105105, + "grad_norm": 0.3773913085460663, + "learning_rate": 2.039492110106535e-05, + "loss": 0.0186, + "step": 17525 + }, + { + "epoch": 2.105705705705706, + "grad_norm": 0.284322053194046, + "learning_rate": 2.0369584662361234e-05, + "loss": 0.0155, + "step": 17530 + }, + { + "epoch": 2.106306306306306, + "grad_norm": 0.19863803684711456, + "learning_rate": 2.0344259944715594e-05, + "loss": 0.0181, + "step": 17535 + }, + { + "epoch": 2.106906906906907, + "grad_norm": 0.3278902471065521, + "learning_rate": 2.031894695814629e-05, + "loss": 0.015, + "step": 17540 + }, + { + "epoch": 2.1075075075075076, + "grad_norm": 0.25113287568092346, + "learning_rate": 2.029364571266647e-05, + "loss": 0.0138, + "step": 17545 + }, + { + "epoch": 2.108108108108108, + "grad_norm": 0.3565726578235626, + "learning_rate": 2.02683562182847e-05, + "loss": 0.0173, + "step": 17550 + }, + { + "epoch": 2.1087087087087086, + "grad_norm": 0.2991584539413452, + "learning_rate": 2.0243078485004885e-05, + "loss": 0.0157, + "step": 17555 + }, + { + "epoch": 2.1093093093093094, + "grad_norm": 0.27290019392967224, + "learning_rate": 2.0217812522826256e-05, + "loss": 0.0154, + "step": 17560 + }, + { + "epoch": 2.10990990990991, + "grad_norm": 0.29226356744766235, + "learning_rate": 2.0192558341743427e-05, + "loss": 0.0167, + "step": 17565 + }, + { + "epoch": 2.1105105105105104, + "grad_norm": 0.21249055862426758, + "learning_rate": 2.0167315951746298e-05, + "loss": 0.0135, + "step": 17570 + }, + { + "epoch": 2.111111111111111, + 
"grad_norm": 0.2446049600839615, + "learning_rate": 2.014208536282014e-05, + "loss": 0.015, + "step": 17575 + }, + { + "epoch": 2.111711711711712, + "grad_norm": 0.21438179910182953, + "learning_rate": 2.011686658494555e-05, + "loss": 0.0172, + "step": 17580 + }, + { + "epoch": 2.112312312312312, + "grad_norm": 0.3306219279766083, + "learning_rate": 2.0091659628098458e-05, + "loss": 0.0184, + "step": 17585 + }, + { + "epoch": 2.112912912912913, + "grad_norm": 0.20943187177181244, + "learning_rate": 2.0066464502250127e-05, + "loss": 0.0118, + "step": 17590 + }, + { + "epoch": 2.1135135135135137, + "grad_norm": 0.3500710129737854, + "learning_rate": 2.004128121736709e-05, + "loss": 0.0159, + "step": 17595 + }, + { + "epoch": 2.114114114114114, + "grad_norm": 0.24584412574768066, + "learning_rate": 2.0016109783411246e-05, + "loss": 0.015, + "step": 17600 + }, + { + "epoch": 2.1147147147147147, + "grad_norm": 0.25247928500175476, + "learning_rate": 1.9990950210339794e-05, + "loss": 0.0166, + "step": 17605 + }, + { + "epoch": 2.1153153153153155, + "grad_norm": 0.2562161982059479, + "learning_rate": 1.9965802508105253e-05, + "loss": 0.0134, + "step": 17610 + }, + { + "epoch": 2.1159159159159158, + "grad_norm": 0.1872304528951645, + "learning_rate": 1.99406666866554e-05, + "loss": 0.018, + "step": 17615 + }, + { + "epoch": 2.1165165165165165, + "grad_norm": 0.2885187566280365, + "learning_rate": 1.9915542755933376e-05, + "loss": 0.0169, + "step": 17620 + }, + { + "epoch": 2.1171171171171173, + "grad_norm": 0.3037269711494446, + "learning_rate": 1.9890430725877546e-05, + "loss": 0.0148, + "step": 17625 + }, + { + "epoch": 2.1177177177177176, + "grad_norm": 0.35903996229171753, + "learning_rate": 1.9865330606421634e-05, + "loss": 0.0165, + "step": 17630 + }, + { + "epoch": 2.1183183183183183, + "grad_norm": 0.41558587551116943, + "learning_rate": 1.9840242407494637e-05, + "loss": 0.0185, + "step": 17635 + }, + { + "epoch": 2.118918918918919, + "grad_norm": 0.2805681526660919, + "learning_rate": 1.981516613902079e-05, + "loss": 0.0168, + "step": 17640 + }, + { + "epoch": 2.1195195195195193, + "grad_norm": 0.31591591238975525, + "learning_rate": 1.9790101810919665e-05, + "loss": 0.0161, + "step": 17645 + }, + { + "epoch": 2.12012012012012, + "grad_norm": 0.29931020736694336, + "learning_rate": 1.976504943310608e-05, + "loss": 0.0181, + "step": 17650 + }, + { + "epoch": 2.120720720720721, + "grad_norm": 0.22215649485588074, + "learning_rate": 1.974000901549015e-05, + "loss": 0.016, + "step": 17655 + }, + { + "epoch": 2.121321321321321, + "grad_norm": 0.19030337035655975, + "learning_rate": 1.9714980567977254e-05, + "loss": 0.0164, + "step": 17660 + }, + { + "epoch": 2.121921921921922, + "grad_norm": 0.30670931935310364, + "learning_rate": 1.968996410046799e-05, + "loss": 0.0157, + "step": 17665 + }, + { + "epoch": 2.1225225225225226, + "grad_norm": 0.3222202658653259, + "learning_rate": 1.966495962285827e-05, + "loss": 0.0173, + "step": 17670 + }, + { + "epoch": 2.123123123123123, + "grad_norm": 0.23883286118507385, + "learning_rate": 1.9639967145039252e-05, + "loss": 0.0166, + "step": 17675 + }, + { + "epoch": 2.1237237237237236, + "grad_norm": 0.23918716609477997, + "learning_rate": 1.961498667689733e-05, + "loss": 0.0182, + "step": 17680 + }, + { + "epoch": 2.1243243243243244, + "grad_norm": 0.26056933403015137, + "learning_rate": 1.959001822831419e-05, + "loss": 0.0178, + "step": 17685 + }, + { + "epoch": 2.124924924924925, + "grad_norm": 0.2223052978515625, + "learning_rate": 
1.9565061809166685e-05, + "loss": 0.0157, + "step": 17690 + }, + { + "epoch": 2.1255255255255254, + "grad_norm": 0.30344024300575256, + "learning_rate": 1.9540117429326977e-05, + "loss": 0.0191, + "step": 17695 + }, + { + "epoch": 2.126126126126126, + "grad_norm": 0.38834190368652344, + "learning_rate": 1.9515185098662447e-05, + "loss": 0.0174, + "step": 17700 + }, + { + "epoch": 2.126726726726727, + "grad_norm": 0.26158246397972107, + "learning_rate": 1.9490264827035733e-05, + "loss": 0.0181, + "step": 17705 + }, + { + "epoch": 2.127327327327327, + "grad_norm": 0.1912645399570465, + "learning_rate": 1.9465356624304625e-05, + "loss": 0.02, + "step": 17710 + }, + { + "epoch": 2.127927927927928, + "grad_norm": 0.20378795266151428, + "learning_rate": 1.944046050032224e-05, + "loss": 0.0177, + "step": 17715 + }, + { + "epoch": 2.1285285285285287, + "grad_norm": 0.2622855603694916, + "learning_rate": 1.9415576464936824e-05, + "loss": 0.0172, + "step": 17720 + }, + { + "epoch": 2.129129129129129, + "grad_norm": 0.2557198107242584, + "learning_rate": 1.9390704527991904e-05, + "loss": 0.019, + "step": 17725 + }, + { + "epoch": 2.1297297297297297, + "grad_norm": 0.22268472611904144, + "learning_rate": 1.936584469932623e-05, + "loss": 0.0163, + "step": 17730 + }, + { + "epoch": 2.1303303303303305, + "grad_norm": 0.28545132279396057, + "learning_rate": 1.9340996988773685e-05, + "loss": 0.0144, + "step": 17735 + }, + { + "epoch": 2.1309309309309308, + "grad_norm": 0.2868815064430237, + "learning_rate": 1.931616140616344e-05, + "loss": 0.0172, + "step": 17740 + }, + { + "epoch": 2.1315315315315315, + "grad_norm": 0.26733162999153137, + "learning_rate": 1.929133796131982e-05, + "loss": 0.0173, + "step": 17745 + }, + { + "epoch": 2.1321321321321323, + "grad_norm": 0.33977895975112915, + "learning_rate": 1.9266526664062386e-05, + "loss": 0.0159, + "step": 17750 + }, + { + "epoch": 2.1321321321321323, + "eval_loss": 0.04252379387617111, + "eval_runtime": 35.7755, + "eval_samples_per_second": 22.362, + "eval_steps_per_second": 5.59, + "step": 17750 + }, + { + "epoch": 2.1327327327327326, + "grad_norm": 0.2979249954223633, + "learning_rate": 1.9241727524205865e-05, + "loss": 0.0149, + "step": 17755 + }, + { + "epoch": 2.1333333333333333, + "grad_norm": 0.36992010474205017, + "learning_rate": 1.921694055156017e-05, + "loss": 0.0162, + "step": 17760 + }, + { + "epoch": 2.133933933933934, + "grad_norm": 0.34459370374679565, + "learning_rate": 1.919216575593042e-05, + "loss": 0.0154, + "step": 17765 + }, + { + "epoch": 2.1345345345345343, + "grad_norm": 0.3337080180644989, + "learning_rate": 1.9167403147116908e-05, + "loss": 0.0151, + "step": 17770 + }, + { + "epoch": 2.135135135135135, + "grad_norm": 0.2594672739505768, + "learning_rate": 1.9142652734915134e-05, + "loss": 0.0163, + "step": 17775 + }, + { + "epoch": 2.135735735735736, + "grad_norm": 0.21162456274032593, + "learning_rate": 1.9117914529115706e-05, + "loss": 0.0167, + "step": 17780 + }, + { + "epoch": 2.1363363363363366, + "grad_norm": 0.304496169090271, + "learning_rate": 1.909318853950447e-05, + "loss": 0.0155, + "step": 17785 + }, + { + "epoch": 2.136936936936937, + "grad_norm": 0.30738282203674316, + "learning_rate": 1.906847477586241e-05, + "loss": 0.015, + "step": 17790 + }, + { + "epoch": 2.1375375375375376, + "grad_norm": 0.21666869521141052, + "learning_rate": 1.9043773247965678e-05, + "loss": 0.0169, + "step": 17795 + }, + { + "epoch": 2.138138138138138, + "grad_norm": 0.29714494943618774, + "learning_rate": 1.901908396558561e-05, + 
"loss": 0.016, + "step": 17800 + }, + { + "epoch": 2.1387387387387387, + "grad_norm": 0.22285744547843933, + "learning_rate": 1.899440693848864e-05, + "loss": 0.0145, + "step": 17805 + }, + { + "epoch": 2.1393393393393394, + "grad_norm": 0.24469813704490662, + "learning_rate": 1.896974217643641e-05, + "loss": 0.017, + "step": 17810 + }, + { + "epoch": 2.13993993993994, + "grad_norm": 0.3498464822769165, + "learning_rate": 1.8945089689185673e-05, + "loss": 0.0153, + "step": 17815 + }, + { + "epoch": 2.1405405405405404, + "grad_norm": 0.22742371261119843, + "learning_rate": 1.8920449486488352e-05, + "loss": 0.0176, + "step": 17820 + }, + { + "epoch": 2.141141141141141, + "grad_norm": 0.3167477250099182, + "learning_rate": 1.889582157809151e-05, + "loss": 0.0147, + "step": 17825 + }, + { + "epoch": 2.141741741741742, + "grad_norm": 0.2402694821357727, + "learning_rate": 1.8871205973737316e-05, + "loss": 0.0158, + "step": 17830 + }, + { + "epoch": 2.142342342342342, + "grad_norm": 0.34336015582084656, + "learning_rate": 1.8846602683163106e-05, + "loss": 0.0179, + "step": 17835 + }, + { + "epoch": 2.142942942942943, + "grad_norm": 0.36309754848480225, + "learning_rate": 1.882201171610133e-05, + "loss": 0.0165, + "step": 17840 + }, + { + "epoch": 2.1435435435435437, + "grad_norm": 0.299583375453949, + "learning_rate": 1.8797433082279582e-05, + "loss": 0.0146, + "step": 17845 + }, + { + "epoch": 2.144144144144144, + "grad_norm": 0.17966710031032562, + "learning_rate": 1.877286679142053e-05, + "loss": 0.015, + "step": 17850 + }, + { + "epoch": 2.1447447447447447, + "grad_norm": 0.26901188492774963, + "learning_rate": 1.8748312853242005e-05, + "loss": 0.0171, + "step": 17855 + }, + { + "epoch": 2.1453453453453455, + "grad_norm": 0.2321656048297882, + "learning_rate": 1.872377127745694e-05, + "loss": 0.0165, + "step": 17860 + }, + { + "epoch": 2.145945945945946, + "grad_norm": 0.2894320487976074, + "learning_rate": 1.869924207377337e-05, + "loss": 0.0163, + "step": 17865 + }, + { + "epoch": 2.1465465465465465, + "grad_norm": 0.22987785935401917, + "learning_rate": 1.8674725251894464e-05, + "loss": 0.0166, + "step": 17870 + }, + { + "epoch": 2.1471471471471473, + "grad_norm": 0.2710099518299103, + "learning_rate": 1.8650220821518432e-05, + "loss": 0.0167, + "step": 17875 + }, + { + "epoch": 2.1477477477477476, + "grad_norm": 0.2632339596748352, + "learning_rate": 1.862572879233863e-05, + "loss": 0.0155, + "step": 17880 + }, + { + "epoch": 2.1483483483483483, + "grad_norm": 0.17593629658222198, + "learning_rate": 1.860124917404351e-05, + "loss": 0.0129, + "step": 17885 + }, + { + "epoch": 2.148948948948949, + "grad_norm": 0.3508222997188568, + "learning_rate": 1.8576781976316615e-05, + "loss": 0.0174, + "step": 17890 + }, + { + "epoch": 2.1495495495495494, + "grad_norm": 0.28124260902404785, + "learning_rate": 1.8552327208836528e-05, + "loss": 0.0167, + "step": 17895 + }, + { + "epoch": 2.15015015015015, + "grad_norm": 0.2841458320617676, + "learning_rate": 1.852788488127698e-05, + "loss": 0.0172, + "step": 17900 + }, + { + "epoch": 2.150750750750751, + "grad_norm": 0.19693568348884583, + "learning_rate": 1.850345500330672e-05, + "loss": 0.0163, + "step": 17905 + }, + { + "epoch": 2.1513513513513516, + "grad_norm": 0.2247840017080307, + "learning_rate": 1.847903758458962e-05, + "loss": 0.0137, + "step": 17910 + }, + { + "epoch": 2.151951951951952, + "grad_norm": 0.2549600899219513, + "learning_rate": 1.8454632634784626e-05, + "loss": 0.0159, + "step": 17915 + }, + { + "epoch": 2.1525525525525526, + 
"grad_norm": 0.2734972834587097, + "learning_rate": 1.8430240163545685e-05, + "loss": 0.0155, + "step": 17920 + }, + { + "epoch": 2.153153153153153, + "grad_norm": 0.31436336040496826, + "learning_rate": 1.8405860180521888e-05, + "loss": 0.0172, + "step": 17925 + }, + { + "epoch": 2.1537537537537537, + "grad_norm": 0.18977700173854828, + "learning_rate": 1.8381492695357344e-05, + "loss": 0.0168, + "step": 17930 + }, + { + "epoch": 2.1543543543543544, + "grad_norm": 0.25732213258743286, + "learning_rate": 1.8357137717691232e-05, + "loss": 0.0177, + "step": 17935 + }, + { + "epoch": 2.154954954954955, + "grad_norm": 0.18424879014492035, + "learning_rate": 1.8332795257157788e-05, + "loss": 0.0147, + "step": 17940 + }, + { + "epoch": 2.1555555555555554, + "grad_norm": 0.23075073957443237, + "learning_rate": 1.8308465323386253e-05, + "loss": 0.0157, + "step": 17945 + }, + { + "epoch": 2.156156156156156, + "grad_norm": 0.2986072897911072, + "learning_rate": 1.8284147926000967e-05, + "loss": 0.0168, + "step": 17950 + }, + { + "epoch": 2.156756756756757, + "grad_norm": 0.2990579307079315, + "learning_rate": 1.8259843074621287e-05, + "loss": 0.0153, + "step": 17955 + }, + { + "epoch": 2.1573573573573572, + "grad_norm": 0.31677985191345215, + "learning_rate": 1.8235550778861617e-05, + "loss": 0.015, + "step": 17960 + }, + { + "epoch": 2.157957957957958, + "grad_norm": 0.30569136142730713, + "learning_rate": 1.8211271048331392e-05, + "loss": 0.0164, + "step": 17965 + }, + { + "epoch": 2.1585585585585587, + "grad_norm": 0.20969319343566895, + "learning_rate": 1.8187003892635052e-05, + "loss": 0.0125, + "step": 17970 + }, + { + "epoch": 2.159159159159159, + "grad_norm": 0.2510526180267334, + "learning_rate": 1.816274932137209e-05, + "loss": 0.0166, + "step": 17975 + }, + { + "epoch": 2.1597597597597598, + "grad_norm": 0.2927291691303253, + "learning_rate": 1.8138507344137016e-05, + "loss": 0.0158, + "step": 17980 + }, + { + "epoch": 2.1603603603603605, + "grad_norm": 0.20787876844406128, + "learning_rate": 1.8114277970519378e-05, + "loss": 0.0165, + "step": 17985 + }, + { + "epoch": 2.160960960960961, + "grad_norm": 0.17879043519496918, + "learning_rate": 1.8090061210103675e-05, + "loss": 0.0176, + "step": 17990 + }, + { + "epoch": 2.1615615615615615, + "grad_norm": 0.24847956001758575, + "learning_rate": 1.80658570724695e-05, + "loss": 0.0137, + "step": 17995 + }, + { + "epoch": 2.1621621621621623, + "grad_norm": 0.20084017515182495, + "learning_rate": 1.804166556719137e-05, + "loss": 0.0147, + "step": 18000 + }, + { + "epoch": 2.1621621621621623, + "eval_loss": 0.04187976196408272, + "eval_runtime": 35.7108, + "eval_samples_per_second": 22.402, + "eval_steps_per_second": 5.601, + "step": 18000 + }, + { + "epoch": 2.1627627627627626, + "grad_norm": 0.24123363196849823, + "learning_rate": 1.801748670383886e-05, + "loss": 0.0142, + "step": 18005 + }, + { + "epoch": 2.1633633633633633, + "grad_norm": 0.24969927966594696, + "learning_rate": 1.799332049197655e-05, + "loss": 0.0142, + "step": 18010 + }, + { + "epoch": 2.163963963963964, + "grad_norm": 0.24504432082176208, + "learning_rate": 1.796916694116396e-05, + "loss": 0.0159, + "step": 18015 + }, + { + "epoch": 2.1645645645645644, + "grad_norm": 0.22552812099456787, + "learning_rate": 1.7945026060955662e-05, + "loss": 0.0139, + "step": 18020 + }, + { + "epoch": 2.165165165165165, + "grad_norm": 0.26121315360069275, + "learning_rate": 1.7920897860901175e-05, + "loss": 0.0138, + "step": 18025 + }, + { + "epoch": 2.165765765765766, + "grad_norm": 
0.3007679879665375, + "learning_rate": 1.7896782350545034e-05, + "loss": 0.0154, + "step": 18030 + }, + { + "epoch": 2.1663663663663666, + "grad_norm": 0.208225816488266, + "learning_rate": 1.787267953942674e-05, + "loss": 0.0145, + "step": 18035 + }, + { + "epoch": 2.166966966966967, + "grad_norm": 0.24646946787834167, + "learning_rate": 1.7848589437080738e-05, + "loss": 0.0163, + "step": 18040 + }, + { + "epoch": 2.1675675675675676, + "grad_norm": 0.2667646110057831, + "learning_rate": 1.7824512053036495e-05, + "loss": 0.0168, + "step": 18045 + }, + { + "epoch": 2.1681681681681684, + "grad_norm": 0.3075443506240845, + "learning_rate": 1.780044739681843e-05, + "loss": 0.0172, + "step": 18050 + }, + { + "epoch": 2.1687687687687687, + "grad_norm": 0.2193213403224945, + "learning_rate": 1.7776395477945945e-05, + "loss": 0.0119, + "step": 18055 + }, + { + "epoch": 2.1693693693693694, + "grad_norm": 0.34656471014022827, + "learning_rate": 1.7752356305933338e-05, + "loss": 0.0168, + "step": 18060 + }, + { + "epoch": 2.16996996996997, + "grad_norm": 0.2753620743751526, + "learning_rate": 1.7728329890289934e-05, + "loss": 0.0173, + "step": 18065 + }, + { + "epoch": 2.1705705705705705, + "grad_norm": 0.2601529359817505, + "learning_rate": 1.7704316240519992e-05, + "loss": 0.0152, + "step": 18070 + }, + { + "epoch": 2.171171171171171, + "grad_norm": 0.22908732295036316, + "learning_rate": 1.768031536612271e-05, + "loss": 0.0148, + "step": 18075 + }, + { + "epoch": 2.171771771771772, + "grad_norm": 0.31950539350509644, + "learning_rate": 1.7656327276592256e-05, + "loss": 0.0171, + "step": 18080 + }, + { + "epoch": 2.1723723723723722, + "grad_norm": 0.16539399325847626, + "learning_rate": 1.7632351981417692e-05, + "loss": 0.0135, + "step": 18085 + }, + { + "epoch": 2.172972972972973, + "grad_norm": 0.2653179168701172, + "learning_rate": 1.7608389490083088e-05, + "loss": 0.0166, + "step": 18090 + }, + { + "epoch": 2.1735735735735737, + "grad_norm": 0.28123563528060913, + "learning_rate": 1.7584439812067384e-05, + "loss": 0.0151, + "step": 18095 + }, + { + "epoch": 2.174174174174174, + "grad_norm": 0.20468199253082275, + "learning_rate": 1.7560502956844478e-05, + "loss": 0.012, + "step": 18100 + }, + { + "epoch": 2.1747747747747748, + "grad_norm": 0.168495774269104, + "learning_rate": 1.753657893388323e-05, + "loss": 0.0147, + "step": 18105 + }, + { + "epoch": 2.1753753753753755, + "grad_norm": 0.2905060946941376, + "learning_rate": 1.751266775264735e-05, + "loss": 0.0131, + "step": 18110 + }, + { + "epoch": 2.175975975975976, + "grad_norm": 0.26909762620925903, + "learning_rate": 1.748876942259553e-05, + "loss": 0.0175, + "step": 18115 + }, + { + "epoch": 2.1765765765765765, + "grad_norm": 0.19784219563007355, + "learning_rate": 1.7464883953181354e-05, + "loss": 0.0161, + "step": 18120 + }, + { + "epoch": 2.1771771771771773, + "grad_norm": 0.3387581706047058, + "learning_rate": 1.744101135385332e-05, + "loss": 0.0177, + "step": 18125 + }, + { + "epoch": 2.1777777777777776, + "grad_norm": 0.21490976214408875, + "learning_rate": 1.7417151634054864e-05, + "loss": 0.0146, + "step": 18130 + }, + { + "epoch": 2.1783783783783783, + "grad_norm": 0.27226021885871887, + "learning_rate": 1.739330480322426e-05, + "loss": 0.0178, + "step": 18135 + }, + { + "epoch": 2.178978978978979, + "grad_norm": 0.3881310522556305, + "learning_rate": 1.7369470870794734e-05, + "loss": 0.019, + "step": 18140 + }, + { + "epoch": 2.1795795795795794, + "grad_norm": 0.249447301030159, + "learning_rate": 1.734564984619441e-05, + 
"loss": 0.0176, + "step": 18145 + }, + { + "epoch": 2.18018018018018, + "grad_norm": 0.18503446877002716, + "learning_rate": 1.7321841738846307e-05, + "loss": 0.016, + "step": 18150 + }, + { + "epoch": 2.180780780780781, + "grad_norm": 0.19159843027591705, + "learning_rate": 1.729804655816829e-05, + "loss": 0.0157, + "step": 18155 + }, + { + "epoch": 2.1813813813813816, + "grad_norm": 0.25416600704193115, + "learning_rate": 1.7274264313573162e-05, + "loss": 0.0149, + "step": 18160 + }, + { + "epoch": 2.181981981981982, + "grad_norm": 0.36102357506752014, + "learning_rate": 1.7250495014468586e-05, + "loss": 0.0204, + "step": 18165 + }, + { + "epoch": 2.1825825825825826, + "grad_norm": 0.25126516819000244, + "learning_rate": 1.7226738670257113e-05, + "loss": 0.0154, + "step": 18170 + }, + { + "epoch": 2.1831831831831834, + "grad_norm": 0.2282605916261673, + "learning_rate": 1.7202995290336176e-05, + "loss": 0.0148, + "step": 18175 + }, + { + "epoch": 2.1837837837837837, + "grad_norm": 0.22766336798667908, + "learning_rate": 1.717926488409804e-05, + "loss": 0.015, + "step": 18180 + }, + { + "epoch": 2.1843843843843844, + "grad_norm": 0.23042090237140656, + "learning_rate": 1.7155547460929892e-05, + "loss": 0.016, + "step": 18185 + }, + { + "epoch": 2.184984984984985, + "grad_norm": 0.30710074305534363, + "learning_rate": 1.7131843030213735e-05, + "loss": 0.0134, + "step": 18190 + }, + { + "epoch": 2.1855855855855855, + "grad_norm": 0.24370187520980835, + "learning_rate": 1.710815160132646e-05, + "loss": 0.0132, + "step": 18195 + }, + { + "epoch": 2.186186186186186, + "grad_norm": 0.37445372343063354, + "learning_rate": 1.708447318363983e-05, + "loss": 0.0208, + "step": 18200 + }, + { + "epoch": 2.186786786786787, + "grad_norm": 0.2919495701789856, + "learning_rate": 1.7060807786520412e-05, + "loss": 0.0158, + "step": 18205 + }, + { + "epoch": 2.1873873873873872, + "grad_norm": 0.38046374917030334, + "learning_rate": 1.7037155419329658e-05, + "loss": 0.0173, + "step": 18210 + }, + { + "epoch": 2.187987987987988, + "grad_norm": 0.194807767868042, + "learning_rate": 1.701351609142387e-05, + "loss": 0.0149, + "step": 18215 + }, + { + "epoch": 2.1885885885885887, + "grad_norm": 0.2758733928203583, + "learning_rate": 1.6989889812154192e-05, + "loss": 0.0141, + "step": 18220 + }, + { + "epoch": 2.189189189189189, + "grad_norm": 0.27009743452072144, + "learning_rate": 1.6966276590866553e-05, + "loss": 0.0149, + "step": 18225 + }, + { + "epoch": 2.1897897897897898, + "grad_norm": 0.39949873089790344, + "learning_rate": 1.6942676436901794e-05, + "loss": 0.0195, + "step": 18230 + }, + { + "epoch": 2.1903903903903905, + "grad_norm": 0.23725907504558563, + "learning_rate": 1.6919089359595537e-05, + "loss": 0.0164, + "step": 18235 + }, + { + "epoch": 2.190990990990991, + "grad_norm": 0.22770358622074127, + "learning_rate": 1.689551536827825e-05, + "loss": 0.0139, + "step": 18240 + }, + { + "epoch": 2.1915915915915916, + "grad_norm": 0.2741831839084625, + "learning_rate": 1.6871954472275232e-05, + "loss": 0.0163, + "step": 18245 + }, + { + "epoch": 2.1921921921921923, + "grad_norm": 0.2730298638343811, + "learning_rate": 1.6848406680906563e-05, + "loss": 0.0177, + "step": 18250 + }, + { + "epoch": 2.1921921921921923, + "eval_loss": 0.040937528014183044, + "eval_runtime": 35.8008, + "eval_samples_per_second": 22.346, + "eval_steps_per_second": 5.586, + "step": 18250 + }, + { + "epoch": 2.1927927927927926, + "grad_norm": 0.2571689486503601, + "learning_rate": 1.6824872003487186e-05, + "loss": 0.0184, + 
"step": 18255 + }, + { + "epoch": 2.1933933933933933, + "grad_norm": 0.29271450638771057, + "learning_rate": 1.680135044932683e-05, + "loss": 0.015, + "step": 18260 + }, + { + "epoch": 2.193993993993994, + "grad_norm": 0.1757679283618927, + "learning_rate": 1.6777842027730055e-05, + "loss": 0.0145, + "step": 18265 + }, + { + "epoch": 2.1945945945945944, + "grad_norm": 0.22230972349643707, + "learning_rate": 1.675434674799618e-05, + "loss": 0.017, + "step": 18270 + }, + { + "epoch": 2.195195195195195, + "grad_norm": 0.2796711325645447, + "learning_rate": 1.673086461941939e-05, + "loss": 0.0158, + "step": 18275 + }, + { + "epoch": 2.195795795795796, + "grad_norm": 0.28163281083106995, + "learning_rate": 1.670739565128859e-05, + "loss": 0.0171, + "step": 18280 + }, + { + "epoch": 2.1963963963963966, + "grad_norm": 0.2757713794708252, + "learning_rate": 1.668393985288756e-05, + "loss": 0.0132, + "step": 18285 + }, + { + "epoch": 2.196996996996997, + "grad_norm": 0.19740013778209686, + "learning_rate": 1.6660497233494833e-05, + "loss": 0.015, + "step": 18290 + }, + { + "epoch": 2.1975975975975977, + "grad_norm": 0.3601270914077759, + "learning_rate": 1.66370678023837e-05, + "loss": 0.0178, + "step": 18295 + }, + { + "epoch": 2.1981981981981984, + "grad_norm": 0.20853619277477264, + "learning_rate": 1.661365156882228e-05, + "loss": 0.0143, + "step": 18300 + }, + { + "epoch": 2.1987987987987987, + "grad_norm": 0.18104422092437744, + "learning_rate": 1.6590248542073457e-05, + "loss": 0.014, + "step": 18305 + }, + { + "epoch": 2.1993993993993994, + "grad_norm": 0.16723549365997314, + "learning_rate": 1.656685873139488e-05, + "loss": 0.0151, + "step": 18310 + }, + { + "epoch": 2.2, + "grad_norm": 0.2669627070426941, + "learning_rate": 1.6543482146039006e-05, + "loss": 0.0143, + "step": 18315 + }, + { + "epoch": 2.2006006006006005, + "grad_norm": 0.19926106929779053, + "learning_rate": 1.6520118795253004e-05, + "loss": 0.0183, + "step": 18320 + }, + { + "epoch": 2.201201201201201, + "grad_norm": 0.2758762538433075, + "learning_rate": 1.6496768688278834e-05, + "loss": 0.0175, + "step": 18325 + }, + { + "epoch": 2.201801801801802, + "grad_norm": 0.25652170181274414, + "learning_rate": 1.6473431834353242e-05, + "loss": 0.0143, + "step": 18330 + }, + { + "epoch": 2.2024024024024023, + "grad_norm": 0.25897717475891113, + "learning_rate": 1.6450108242707695e-05, + "loss": 0.0164, + "step": 18335 + }, + { + "epoch": 2.203003003003003, + "grad_norm": 0.34619587659835815, + "learning_rate": 1.6426797922568447e-05, + "loss": 0.0186, + "step": 18340 + }, + { + "epoch": 2.2036036036036037, + "grad_norm": 0.2068956345319748, + "learning_rate": 1.640350088315646e-05, + "loss": 0.0127, + "step": 18345 + }, + { + "epoch": 2.204204204204204, + "grad_norm": 0.2720538377761841, + "learning_rate": 1.638021713368747e-05, + "loss": 0.0145, + "step": 18350 + }, + { + "epoch": 2.204804804804805, + "grad_norm": 0.26894304156303406, + "learning_rate": 1.635694668337196e-05, + "loss": 0.014, + "step": 18355 + }, + { + "epoch": 2.2054054054054055, + "grad_norm": 0.29101064801216125, + "learning_rate": 1.6333689541415155e-05, + "loss": 0.0165, + "step": 18360 + }, + { + "epoch": 2.206006006006006, + "grad_norm": 0.3419967293739319, + "learning_rate": 1.631044571701697e-05, + "loss": 0.0187, + "step": 18365 + }, + { + "epoch": 2.2066066066066066, + "grad_norm": 0.27701854705810547, + "learning_rate": 1.6287215219372128e-05, + "loss": 0.0164, + "step": 18370 + }, + { + "epoch": 2.2072072072072073, + "grad_norm": 
0.20764677226543427, + "learning_rate": 1.6263998057669994e-05, + "loss": 0.0158, + "step": 18375 + }, + { + "epoch": 2.2078078078078076, + "grad_norm": 0.14065375924110413, + "learning_rate": 1.6240794241094727e-05, + "loss": 0.0116, + "step": 18380 + }, + { + "epoch": 2.2084084084084084, + "grad_norm": 0.2389654666185379, + "learning_rate": 1.6217603778825196e-05, + "loss": 0.0147, + "step": 18385 + }, + { + "epoch": 2.209009009009009, + "grad_norm": 0.1625911295413971, + "learning_rate": 1.6194426680034947e-05, + "loss": 0.0148, + "step": 18390 + }, + { + "epoch": 2.2096096096096094, + "grad_norm": 0.2317337989807129, + "learning_rate": 1.6171262953892267e-05, + "loss": 0.0178, + "step": 18395 + }, + { + "epoch": 2.21021021021021, + "grad_norm": 0.23551051318645477, + "learning_rate": 1.6148112609560174e-05, + "loss": 0.0137, + "step": 18400 + }, + { + "epoch": 2.210810810810811, + "grad_norm": 0.2810523211956024, + "learning_rate": 1.612497565619635e-05, + "loss": 0.0162, + "step": 18405 + }, + { + "epoch": 2.2114114114114116, + "grad_norm": 0.2298928052186966, + "learning_rate": 1.610185210295323e-05, + "loss": 0.0163, + "step": 18410 + }, + { + "epoch": 2.212012012012012, + "grad_norm": 0.30341780185699463, + "learning_rate": 1.6078741958977877e-05, + "loss": 0.0157, + "step": 18415 + }, + { + "epoch": 2.2126126126126127, + "grad_norm": 0.28883466124534607, + "learning_rate": 1.605564523341212e-05, + "loss": 0.0125, + "step": 18420 + }, + { + "epoch": 2.2132132132132134, + "grad_norm": 0.29520153999328613, + "learning_rate": 1.6032561935392442e-05, + "loss": 0.0156, + "step": 18425 + }, + { + "epoch": 2.2138138138138137, + "grad_norm": 0.2968877851963043, + "learning_rate": 1.600949207405004e-05, + "loss": 0.0177, + "step": 18430 + }, + { + "epoch": 2.2144144144144144, + "grad_norm": 0.230467289686203, + "learning_rate": 1.5986435658510758e-05, + "loss": 0.0156, + "step": 18435 + }, + { + "epoch": 2.215015015015015, + "grad_norm": 0.30984342098236084, + "learning_rate": 1.5963392697895147e-05, + "loss": 0.016, + "step": 18440 + }, + { + "epoch": 2.2156156156156155, + "grad_norm": 0.20223285257816315, + "learning_rate": 1.5940363201318443e-05, + "loss": 0.0158, + "step": 18445 + }, + { + "epoch": 2.2162162162162162, + "grad_norm": 0.23069490492343903, + "learning_rate": 1.591734717789053e-05, + "loss": 0.0161, + "step": 18450 + }, + { + "epoch": 2.216816816816817, + "grad_norm": 0.20524130761623383, + "learning_rate": 1.5894344636716003e-05, + "loss": 0.0139, + "step": 18455 + }, + { + "epoch": 2.2174174174174173, + "grad_norm": 0.28721746802330017, + "learning_rate": 1.5871355586894064e-05, + "loss": 0.0164, + "step": 18460 + }, + { + "epoch": 2.218018018018018, + "grad_norm": 0.2776319682598114, + "learning_rate": 1.584838003751864e-05, + "loss": 0.0153, + "step": 18465 + }, + { + "epoch": 2.2186186186186188, + "grad_norm": 0.2575279176235199, + "learning_rate": 1.5825417997678264e-05, + "loss": 0.0152, + "step": 18470 + }, + { + "epoch": 2.219219219219219, + "grad_norm": 0.1764586865901947, + "learning_rate": 1.580246947645616e-05, + "loss": 0.0179, + "step": 18475 + }, + { + "epoch": 2.21981981981982, + "grad_norm": 0.2602247893810272, + "learning_rate": 1.577953448293022e-05, + "loss": 0.0145, + "step": 18480 + }, + { + "epoch": 2.2204204204204205, + "grad_norm": 0.28490474820137024, + "learning_rate": 1.575661302617291e-05, + "loss": 0.0138, + "step": 18485 + }, + { + "epoch": 2.221021021021021, + "grad_norm": 0.2959938943386078, + "learning_rate": 1.5733705115251428e-05, + 
"loss": 0.0152, + "step": 18490 + }, + { + "epoch": 2.2216216216216216, + "grad_norm": 0.14737527072429657, + "learning_rate": 1.5710810759227563e-05, + "loss": 0.0111, + "step": 18495 + }, + { + "epoch": 2.2222222222222223, + "grad_norm": 0.37371930480003357, + "learning_rate": 1.5687929967157766e-05, + "loss": 0.0162, + "step": 18500 + }, + { + "epoch": 2.2222222222222223, + "eval_loss": 0.0416293703019619, + "eval_runtime": 35.9144, + "eval_samples_per_second": 22.275, + "eval_steps_per_second": 5.569, + "step": 18500 + }, + { + "epoch": 2.2228228228228226, + "grad_norm": 0.17344534397125244, + "learning_rate": 1.5665062748093095e-05, + "loss": 0.0131, + "step": 18505 + }, + { + "epoch": 2.2234234234234234, + "grad_norm": 0.29084810614585876, + "learning_rate": 1.5642209111079266e-05, + "loss": 0.0151, + "step": 18510 + }, + { + "epoch": 2.224024024024024, + "grad_norm": 0.20893439650535583, + "learning_rate": 1.5619369065156604e-05, + "loss": 0.0138, + "step": 18515 + }, + { + "epoch": 2.2246246246246244, + "grad_norm": 0.307782918214798, + "learning_rate": 1.5596542619360073e-05, + "loss": 0.0177, + "step": 18520 + }, + { + "epoch": 2.225225225225225, + "grad_norm": 0.3421696126461029, + "learning_rate": 1.5573729782719266e-05, + "loss": 0.0143, + "step": 18525 + }, + { + "epoch": 2.225825825825826, + "grad_norm": 0.23515865206718445, + "learning_rate": 1.5550930564258336e-05, + "loss": 0.019, + "step": 18530 + }, + { + "epoch": 2.2264264264264266, + "grad_norm": 0.27856892347335815, + "learning_rate": 1.5528144972996116e-05, + "loss": 0.0161, + "step": 18535 + }, + { + "epoch": 2.227027027027027, + "grad_norm": 0.3409879505634308, + "learning_rate": 1.5505373017946024e-05, + "loss": 0.0125, + "step": 18540 + }, + { + "epoch": 2.2276276276276277, + "grad_norm": 0.2655802369117737, + "learning_rate": 1.5482614708116083e-05, + "loss": 0.0156, + "step": 18545 + }, + { + "epoch": 2.2282282282282284, + "grad_norm": 0.22531571984291077, + "learning_rate": 1.545987005250889e-05, + "loss": 0.0173, + "step": 18550 + }, + { + "epoch": 2.2288288288288287, + "grad_norm": 0.24592383205890656, + "learning_rate": 1.5437139060121692e-05, + "loss": 0.0155, + "step": 18555 + }, + { + "epoch": 2.2294294294294295, + "grad_norm": 0.3500378429889679, + "learning_rate": 1.5414421739946312e-05, + "loss": 0.0153, + "step": 18560 + }, + { + "epoch": 2.23003003003003, + "grad_norm": 0.2400636225938797, + "learning_rate": 1.5391718100969132e-05, + "loss": 0.0153, + "step": 18565 + }, + { + "epoch": 2.2306306306306305, + "grad_norm": 0.32631024718284607, + "learning_rate": 1.5369028152171178e-05, + "loss": 0.0129, + "step": 18570 + }, + { + "epoch": 2.2312312312312312, + "grad_norm": 0.2156463861465454, + "learning_rate": 1.5346351902528007e-05, + "loss": 0.0182, + "step": 18575 + }, + { + "epoch": 2.231831831831832, + "grad_norm": 0.22314335405826569, + "learning_rate": 1.532368936100979e-05, + "loss": 0.0205, + "step": 18580 + }, + { + "epoch": 2.2324324324324323, + "grad_norm": 0.3144841194152832, + "learning_rate": 1.5301040536581275e-05, + "loss": 0.0139, + "step": 18585 + }, + { + "epoch": 2.233033033033033, + "grad_norm": 0.27216628193855286, + "learning_rate": 1.527840543820177e-05, + "loss": 0.0173, + "step": 18590 + }, + { + "epoch": 2.2336336336336338, + "grad_norm": 0.24075335264205933, + "learning_rate": 1.5255784074825175e-05, + "loss": 0.016, + "step": 18595 + }, + { + "epoch": 2.234234234234234, + "grad_norm": 0.18056322634220123, + "learning_rate": 1.5233176455399916e-05, + "loss": 0.0148, + 
"step": 18600 + }, + { + "epoch": 2.234834834834835, + "grad_norm": 0.18213163316249847, + "learning_rate": 1.5210582588869016e-05, + "loss": 0.0156, + "step": 18605 + }, + { + "epoch": 2.2354354354354355, + "grad_norm": 0.23092873394489288, + "learning_rate": 1.518800248417005e-05, + "loss": 0.0174, + "step": 18610 + }, + { + "epoch": 2.236036036036036, + "grad_norm": 0.2707630693912506, + "learning_rate": 1.5165436150235146e-05, + "loss": 0.0122, + "step": 18615 + }, + { + "epoch": 2.2366366366366366, + "grad_norm": 0.20673693716526031, + "learning_rate": 1.5142883595991014e-05, + "loss": 0.0129, + "step": 18620 + }, + { + "epoch": 2.2372372372372373, + "grad_norm": 0.22654296457767487, + "learning_rate": 1.512034483035884e-05, + "loss": 0.0139, + "step": 18625 + }, + { + "epoch": 2.237837837837838, + "grad_norm": 0.24357669055461884, + "learning_rate": 1.5097819862254426e-05, + "loss": 0.0149, + "step": 18630 + }, + { + "epoch": 2.2384384384384384, + "grad_norm": 0.2146761119365692, + "learning_rate": 1.5075308700588093e-05, + "loss": 0.0139, + "step": 18635 + }, + { + "epoch": 2.239039039039039, + "grad_norm": 0.2448071390390396, + "learning_rate": 1.5052811354264706e-05, + "loss": 0.0127, + "step": 18640 + }, + { + "epoch": 2.2396396396396394, + "grad_norm": 0.22823013365268707, + "learning_rate": 1.5030327832183633e-05, + "loss": 0.0176, + "step": 18645 + }, + { + "epoch": 2.24024024024024, + "grad_norm": 0.3258076012134552, + "learning_rate": 1.5007858143238834e-05, + "loss": 0.0144, + "step": 18650 + }, + { + "epoch": 2.240840840840841, + "grad_norm": 0.27186718583106995, + "learning_rate": 1.4985402296318718e-05, + "loss": 0.0132, + "step": 18655 + }, + { + "epoch": 2.2414414414414416, + "grad_norm": 0.2503277063369751, + "learning_rate": 1.496296030030629e-05, + "loss": 0.0139, + "step": 18660 + }, + { + "epoch": 2.242042042042042, + "grad_norm": 0.19911149144172668, + "learning_rate": 1.4940532164079052e-05, + "loss": 0.0153, + "step": 18665 + }, + { + "epoch": 2.2426426426426427, + "grad_norm": 0.28745782375335693, + "learning_rate": 1.4918117896508999e-05, + "loss": 0.0139, + "step": 18670 + }, + { + "epoch": 2.2432432432432434, + "grad_norm": 0.21540391445159912, + "learning_rate": 1.4895717506462665e-05, + "loss": 0.0137, + "step": 18675 + }, + { + "epoch": 2.2438438438438437, + "grad_norm": 0.23623935878276825, + "learning_rate": 1.48733310028011e-05, + "loss": 0.0133, + "step": 18680 + }, + { + "epoch": 2.2444444444444445, + "grad_norm": 0.29339754581451416, + "learning_rate": 1.4850958394379844e-05, + "loss": 0.0128, + "step": 18685 + }, + { + "epoch": 2.245045045045045, + "grad_norm": 0.3102782666683197, + "learning_rate": 1.4828599690048961e-05, + "loss": 0.0136, + "step": 18690 + }, + { + "epoch": 2.2456456456456455, + "grad_norm": 0.31585463881492615, + "learning_rate": 1.4806254898652977e-05, + "loss": 0.0153, + "step": 18695 + }, + { + "epoch": 2.2462462462462462, + "grad_norm": 0.2358711212873459, + "learning_rate": 1.478392402903095e-05, + "loss": 0.0139, + "step": 18700 + }, + { + "epoch": 2.246846846846847, + "grad_norm": 0.24576008319854736, + "learning_rate": 1.476160709001641e-05, + "loss": 0.0139, + "step": 18705 + }, + { + "epoch": 2.2474474474474473, + "grad_norm": 0.2061036229133606, + "learning_rate": 1.47393040904374e-05, + "loss": 0.0177, + "step": 18710 + }, + { + "epoch": 2.248048048048048, + "grad_norm": 0.25496989488601685, + "learning_rate": 1.4717015039116445e-05, + "loss": 0.0146, + "step": 18715 + }, + { + "epoch": 2.2486486486486488, + 
"grad_norm": 0.21531526744365692, + "learning_rate": 1.4694739944870506e-05, + "loss": 0.0147, + "step": 18720 + }, + { + "epoch": 2.249249249249249, + "grad_norm": 0.22462278604507446, + "learning_rate": 1.4672478816511076e-05, + "loss": 0.0129, + "step": 18725 + }, + { + "epoch": 2.24984984984985, + "grad_norm": 0.20219901204109192, + "learning_rate": 1.465023166284411e-05, + "loss": 0.0135, + "step": 18730 + }, + { + "epoch": 2.2504504504504506, + "grad_norm": 0.23641879856586456, + "learning_rate": 1.4627998492670042e-05, + "loss": 0.0131, + "step": 18735 + }, + { + "epoch": 2.251051051051051, + "grad_norm": 0.17973272502422333, + "learning_rate": 1.4605779314783736e-05, + "loss": 0.0153, + "step": 18740 + }, + { + "epoch": 2.2516516516516516, + "grad_norm": 0.2191532552242279, + "learning_rate": 1.458357413797457e-05, + "loss": 0.0149, + "step": 18745 + }, + { + "epoch": 2.2522522522522523, + "grad_norm": 0.21080400049686432, + "learning_rate": 1.456138297102635e-05, + "loss": 0.0141, + "step": 18750 + }, + { + "epoch": 2.2522522522522523, + "eval_loss": 0.04001760855317116, + "eval_runtime": 35.8843, + "eval_samples_per_second": 22.294, + "eval_steps_per_second": 5.573, + "step": 18750 + }, + { + "epoch": 2.252852852852853, + "grad_norm": 0.26723742485046387, + "learning_rate": 1.453920582271735e-05, + "loss": 0.0157, + "step": 18755 + }, + { + "epoch": 2.2534534534534534, + "grad_norm": 0.25108709931373596, + "learning_rate": 1.451704270182032e-05, + "loss": 0.0129, + "step": 18760 + }, + { + "epoch": 2.254054054054054, + "grad_norm": 0.19916701316833496, + "learning_rate": 1.4494893617102418e-05, + "loss": 0.0126, + "step": 18765 + }, + { + "epoch": 2.2546546546546544, + "grad_norm": 0.19446638226509094, + "learning_rate": 1.447275857732528e-05, + "loss": 0.0137, + "step": 18770 + }, + { + "epoch": 2.255255255255255, + "grad_norm": 0.21993963420391083, + "learning_rate": 1.4450637591244987e-05, + "loss": 0.0127, + "step": 18775 + }, + { + "epoch": 2.255855855855856, + "grad_norm": 0.23572473227977753, + "learning_rate": 1.442853066761205e-05, + "loss": 0.0134, + "step": 18780 + }, + { + "epoch": 2.2564564564564566, + "grad_norm": 0.2728061378002167, + "learning_rate": 1.4406437815171431e-05, + "loss": 0.0128, + "step": 18785 + }, + { + "epoch": 2.257057057057057, + "grad_norm": 0.18432195484638214, + "learning_rate": 1.4384359042662493e-05, + "loss": 0.0129, + "step": 18790 + }, + { + "epoch": 2.2576576576576577, + "grad_norm": 0.26017066836357117, + "learning_rate": 1.4362294358819062e-05, + "loss": 0.0157, + "step": 18795 + }, + { + "epoch": 2.2582582582582584, + "grad_norm": 0.25586754083633423, + "learning_rate": 1.4340243772369383e-05, + "loss": 0.0149, + "step": 18800 + }, + { + "epoch": 2.2588588588588587, + "grad_norm": 0.1822763830423355, + "learning_rate": 1.4318207292036134e-05, + "loss": 0.0144, + "step": 18805 + }, + { + "epoch": 2.2594594594594595, + "grad_norm": 0.3942986726760864, + "learning_rate": 1.4296184926536371e-05, + "loss": 0.0132, + "step": 18810 + }, + { + "epoch": 2.26006006006006, + "grad_norm": 0.23238366842269897, + "learning_rate": 1.4274176684581619e-05, + "loss": 0.0128, + "step": 18815 + }, + { + "epoch": 2.2606606606606605, + "grad_norm": 0.1807553619146347, + "learning_rate": 1.4252182574877781e-05, + "loss": 0.0134, + "step": 18820 + }, + { + "epoch": 2.2612612612612613, + "grad_norm": 0.22502589225769043, + "learning_rate": 1.4230202606125186e-05, + "loss": 0.013, + "step": 18825 + }, + { + "epoch": 2.261861861861862, + "grad_norm": 
0.15234749019145966, + "learning_rate": 1.4208236787018592e-05, + "loss": 0.0135, + "step": 18830 + }, + { + "epoch": 2.2624624624624623, + "grad_norm": 0.17176970839500427, + "learning_rate": 1.4186285126247083e-05, + "loss": 0.0156, + "step": 18835 + }, + { + "epoch": 2.263063063063063, + "grad_norm": 0.2273995280265808, + "learning_rate": 1.416434763249424e-05, + "loss": 0.0139, + "step": 18840 + }, + { + "epoch": 2.263663663663664, + "grad_norm": 0.26800450682640076, + "learning_rate": 1.4142424314437957e-05, + "loss": 0.0147, + "step": 18845 + }, + { + "epoch": 2.264264264264264, + "grad_norm": 0.2593317925930023, + "learning_rate": 1.4120515180750565e-05, + "loss": 0.0127, + "step": 18850 + }, + { + "epoch": 2.264864864864865, + "grad_norm": 0.2639774978160858, + "learning_rate": 1.4098620240098793e-05, + "loss": 0.0149, + "step": 18855 + }, + { + "epoch": 2.2654654654654656, + "grad_norm": 0.15085655450820923, + "learning_rate": 1.407673950114371e-05, + "loss": 0.0142, + "step": 18860 + }, + { + "epoch": 2.266066066066066, + "grad_norm": 0.3097408413887024, + "learning_rate": 1.4054872972540806e-05, + "loss": 0.0182, + "step": 18865 + }, + { + "epoch": 2.2666666666666666, + "grad_norm": 0.27929165959358215, + "learning_rate": 1.4033020662939939e-05, + "loss": 0.0174, + "step": 18870 + }, + { + "epoch": 2.2672672672672673, + "grad_norm": 0.17149385809898376, + "learning_rate": 1.4011182580985355e-05, + "loss": 0.0135, + "step": 18875 + }, + { + "epoch": 2.267867867867868, + "grad_norm": 0.4180721938610077, + "learning_rate": 1.3989358735315633e-05, + "loss": 0.0153, + "step": 18880 + }, + { + "epoch": 2.2684684684684684, + "grad_norm": 0.18978695571422577, + "learning_rate": 1.3967549134563757e-05, + "loss": 0.0127, + "step": 18885 + }, + { + "epoch": 2.269069069069069, + "grad_norm": 0.2437458336353302, + "learning_rate": 1.3945753787357068e-05, + "loss": 0.0125, + "step": 18890 + }, + { + "epoch": 2.2696696696696694, + "grad_norm": 0.21342216432094574, + "learning_rate": 1.3923972702317262e-05, + "loss": 0.0142, + "step": 18895 + }, + { + "epoch": 2.27027027027027, + "grad_norm": 0.23508882522583008, + "learning_rate": 1.3902205888060415e-05, + "loss": 0.0143, + "step": 18900 + }, + { + "epoch": 2.270870870870871, + "grad_norm": 0.3441837430000305, + "learning_rate": 1.3880453353196905e-05, + "loss": 0.0154, + "step": 18905 + }, + { + "epoch": 2.2714714714714717, + "grad_norm": 0.2490258663892746, + "learning_rate": 1.3858715106331516e-05, + "loss": 0.0147, + "step": 18910 + }, + { + "epoch": 2.272072072072072, + "grad_norm": 0.20941585302352905, + "learning_rate": 1.3836991156063361e-05, + "loss": 0.0121, + "step": 18915 + }, + { + "epoch": 2.2726726726726727, + "grad_norm": 0.26372185349464417, + "learning_rate": 1.3815281510985906e-05, + "loss": 0.0136, + "step": 18920 + }, + { + "epoch": 2.2732732732732734, + "grad_norm": 0.22036220133304596, + "learning_rate": 1.3793586179686923e-05, + "loss": 0.0139, + "step": 18925 + }, + { + "epoch": 2.2738738738738737, + "grad_norm": 0.21494606137275696, + "learning_rate": 1.3771905170748562e-05, + "loss": 0.0152, + "step": 18930 + }, + { + "epoch": 2.2744744744744745, + "grad_norm": 0.22034406661987305, + "learning_rate": 1.3750238492747302e-05, + "loss": 0.013, + "step": 18935 + }, + { + "epoch": 2.2750750750750752, + "grad_norm": 0.2613002061843872, + "learning_rate": 1.3728586154253925e-05, + "loss": 0.0154, + "step": 18940 + }, + { + "epoch": 2.2756756756756755, + "grad_norm": 0.2192060500383377, + "learning_rate": 
1.370694816383359e-05, + "loss": 0.016, + "step": 18945 + }, + { + "epoch": 2.2762762762762763, + "grad_norm": 0.26106199622154236, + "learning_rate": 1.3685324530045707e-05, + "loss": 0.0133, + "step": 18950 + }, + { + "epoch": 2.276876876876877, + "grad_norm": 0.2007218450307846, + "learning_rate": 1.3663715261444077e-05, + "loss": 0.0149, + "step": 18955 + }, + { + "epoch": 2.2774774774774773, + "grad_norm": 0.20750047266483307, + "learning_rate": 1.3642120366576789e-05, + "loss": 0.0126, + "step": 18960 + }, + { + "epoch": 2.278078078078078, + "grad_norm": 0.39288845658302307, + "learning_rate": 1.3620539853986242e-05, + "loss": 0.0154, + "step": 18965 + }, + { + "epoch": 2.278678678678679, + "grad_norm": 0.3260713517665863, + "learning_rate": 1.3598973732209175e-05, + "loss": 0.0139, + "step": 18970 + }, + { + "epoch": 2.279279279279279, + "grad_norm": 0.22316108644008636, + "learning_rate": 1.357742200977658e-05, + "loss": 0.0149, + "step": 18975 + }, + { + "epoch": 2.27987987987988, + "grad_norm": 0.3133355677127838, + "learning_rate": 1.3555884695213799e-05, + "loss": 0.0141, + "step": 18980 + }, + { + "epoch": 2.2804804804804806, + "grad_norm": 0.2897303104400635, + "learning_rate": 1.3534361797040457e-05, + "loss": 0.0154, + "step": 18985 + }, + { + "epoch": 2.281081081081081, + "grad_norm": 0.3254048228263855, + "learning_rate": 1.3512853323770486e-05, + "loss": 0.0152, + "step": 18990 + }, + { + "epoch": 2.2816816816816816, + "grad_norm": 0.26747003197669983, + "learning_rate": 1.3491359283912114e-05, + "loss": 0.0185, + "step": 18995 + }, + { + "epoch": 2.2822822822822824, + "grad_norm": 0.14857745170593262, + "learning_rate": 1.3469879685967824e-05, + "loss": 0.0144, + "step": 19000 + }, + { + "epoch": 2.2822822822822824, + "eval_loss": 0.03958373889327049, + "eval_runtime": 35.7399, + "eval_samples_per_second": 22.384, + "eval_steps_per_second": 5.596, + "step": 19000 + }, + { + "epoch": 2.282882882882883, + "grad_norm": 0.25740018486976624, + "learning_rate": 1.3448414538434428e-05, + "loss": 0.0119, + "step": 19005 + }, + { + "epoch": 2.2834834834834834, + "grad_norm": 0.23623026907444, + "learning_rate": 1.342696384980301e-05, + "loss": 0.0131, + "step": 19010 + }, + { + "epoch": 2.284084084084084, + "grad_norm": 0.25876763463020325, + "learning_rate": 1.340552762855894e-05, + "loss": 0.0145, + "step": 19015 + }, + { + "epoch": 2.2846846846846844, + "grad_norm": 0.2639022469520569, + "learning_rate": 1.338410588318183e-05, + "loss": 0.0155, + "step": 19020 + }, + { + "epoch": 2.285285285285285, + "grad_norm": 0.23563387989997864, + "learning_rate": 1.336269862214562e-05, + "loss": 0.0142, + "step": 19025 + }, + { + "epoch": 2.285885885885886, + "grad_norm": 0.32455793023109436, + "learning_rate": 1.3341305853918462e-05, + "loss": 0.015, + "step": 19030 + }, + { + "epoch": 2.2864864864864867, + "grad_norm": 0.22415214776992798, + "learning_rate": 1.331992758696282e-05, + "loss": 0.0135, + "step": 19035 + }, + { + "epoch": 2.287087087087087, + "grad_norm": 0.2912454307079315, + "learning_rate": 1.3298563829735427e-05, + "loss": 0.0146, + "step": 19040 + }, + { + "epoch": 2.2876876876876877, + "grad_norm": 0.18687337636947632, + "learning_rate": 1.327721459068722e-05, + "loss": 0.0138, + "step": 19045 + }, + { + "epoch": 2.2882882882882885, + "grad_norm": 0.23039725422859192, + "learning_rate": 1.3255879878263449e-05, + "loss": 0.0126, + "step": 19050 + }, + { + "epoch": 2.2888888888888888, + "grad_norm": 0.3004606366157532, + "learning_rate": 1.3234559700903592e-05, + 
"loss": 0.0134, + "step": 19055 + }, + { + "epoch": 2.2894894894894895, + "grad_norm": 0.22218632698059082, + "learning_rate": 1.3213254067041392e-05, + "loss": 0.0149, + "step": 19060 + }, + { + "epoch": 2.2900900900900902, + "grad_norm": 0.16605770587921143, + "learning_rate": 1.3191962985104838e-05, + "loss": 0.0112, + "step": 19065 + }, + { + "epoch": 2.2906906906906905, + "grad_norm": 0.3117055892944336, + "learning_rate": 1.3170686463516125e-05, + "loss": 0.0148, + "step": 19070 + }, + { + "epoch": 2.2912912912912913, + "grad_norm": 0.3259162902832031, + "learning_rate": 1.3149424510691738e-05, + "loss": 0.015, + "step": 19075 + }, + { + "epoch": 2.291891891891892, + "grad_norm": 0.2495785802602768, + "learning_rate": 1.3128177135042374e-05, + "loss": 0.0141, + "step": 19080 + }, + { + "epoch": 2.2924924924924923, + "grad_norm": 0.30315738916397095, + "learning_rate": 1.3106944344972965e-05, + "loss": 0.0158, + "step": 19085 + }, + { + "epoch": 2.293093093093093, + "grad_norm": 0.19319814443588257, + "learning_rate": 1.3085726148882704e-05, + "loss": 0.016, + "step": 19090 + }, + { + "epoch": 2.293693693693694, + "grad_norm": 0.2703433036804199, + "learning_rate": 1.3064522555164948e-05, + "loss": 0.0148, + "step": 19095 + }, + { + "epoch": 2.294294294294294, + "grad_norm": 0.2594946324825287, + "learning_rate": 1.3043333572207322e-05, + "loss": 0.0177, + "step": 19100 + }, + { + "epoch": 2.294894894894895, + "grad_norm": 0.251132071018219, + "learning_rate": 1.302215920839167e-05, + "loss": 0.0128, + "step": 19105 + }, + { + "epoch": 2.2954954954954956, + "grad_norm": 0.18883265554904938, + "learning_rate": 1.300099947209406e-05, + "loss": 0.0139, + "step": 19110 + }, + { + "epoch": 2.296096096096096, + "grad_norm": 0.30045944452285767, + "learning_rate": 1.297985437168473e-05, + "loss": 0.0129, + "step": 19115 + }, + { + "epoch": 2.2966966966966966, + "grad_norm": 0.22515608370304108, + "learning_rate": 1.2958723915528187e-05, + "loss": 0.0141, + "step": 19120 + }, + { + "epoch": 2.2972972972972974, + "grad_norm": 0.19037845730781555, + "learning_rate": 1.2937608111983085e-05, + "loss": 0.0124, + "step": 19125 + }, + { + "epoch": 2.297897897897898, + "grad_norm": 0.3264663815498352, + "learning_rate": 1.291650696940233e-05, + "loss": 0.0135, + "step": 19130 + }, + { + "epoch": 2.2984984984984984, + "grad_norm": 0.34269478917121887, + "learning_rate": 1.2895420496133027e-05, + "loss": 0.014, + "step": 19135 + }, + { + "epoch": 2.299099099099099, + "grad_norm": 0.24214765429496765, + "learning_rate": 1.2874348700516432e-05, + "loss": 0.0141, + "step": 19140 + }, + { + "epoch": 2.2996996996996995, + "grad_norm": 0.18193790316581726, + "learning_rate": 1.2853291590888034e-05, + "loss": 0.0136, + "step": 19145 + }, + { + "epoch": 2.3003003003003, + "grad_norm": 0.19907261431217194, + "learning_rate": 1.2832249175577515e-05, + "loss": 0.0122, + "step": 19150 + }, + { + "epoch": 2.300900900900901, + "grad_norm": 0.17116400599479675, + "learning_rate": 1.2811221462908723e-05, + "loss": 0.0126, + "step": 19155 + }, + { + "epoch": 2.3015015015015017, + "grad_norm": 0.2657437026500702, + "learning_rate": 1.2790208461199726e-05, + "loss": 0.0146, + "step": 19160 + }, + { + "epoch": 2.302102102102102, + "grad_norm": 0.2539885938167572, + "learning_rate": 1.2769210178762709e-05, + "loss": 0.0117, + "step": 19165 + }, + { + "epoch": 2.3027027027027027, + "grad_norm": 0.22538946568965912, + "learning_rate": 1.2748226623904092e-05, + "loss": 0.0142, + "step": 19170 + }, + { + "epoch": 
2.3033033033033035, + "grad_norm": 0.1703753024339676, + "learning_rate": 1.2727257804924447e-05, + "loss": 0.0126, + "step": 19175 + }, + { + "epoch": 2.3039039039039038, + "grad_norm": 0.19249996542930603, + "learning_rate": 1.270630373011853e-05, + "loss": 0.0145, + "step": 19180 + }, + { + "epoch": 2.3045045045045045, + "grad_norm": 0.33727169036865234, + "learning_rate": 1.2685364407775236e-05, + "loss": 0.0149, + "step": 19185 + }, + { + "epoch": 2.3051051051051052, + "grad_norm": 0.2722015380859375, + "learning_rate": 1.2664439846177644e-05, + "loss": 0.0128, + "step": 19190 + }, + { + "epoch": 2.3057057057057055, + "grad_norm": 0.21558161079883575, + "learning_rate": 1.2643530053603003e-05, + "loss": 0.0166, + "step": 19195 + }, + { + "epoch": 2.3063063063063063, + "grad_norm": 0.16690169274806976, + "learning_rate": 1.2622635038322705e-05, + "loss": 0.0111, + "step": 19200 + }, + { + "epoch": 2.306906906906907, + "grad_norm": 0.26086804270744324, + "learning_rate": 1.2601754808602318e-05, + "loss": 0.0166, + "step": 19205 + }, + { + "epoch": 2.3075075075075073, + "grad_norm": 0.19430798292160034, + "learning_rate": 1.2580889372701503e-05, + "loss": 0.013, + "step": 19210 + }, + { + "epoch": 2.308108108108108, + "grad_norm": 0.22891777753829956, + "learning_rate": 1.2560038738874157e-05, + "loss": 0.014, + "step": 19215 + }, + { + "epoch": 2.308708708708709, + "grad_norm": 0.20051150023937225, + "learning_rate": 1.253920291536823e-05, + "loss": 0.013, + "step": 19220 + }, + { + "epoch": 2.3093093093093096, + "grad_norm": 0.3059994876384735, + "learning_rate": 1.251838191042588e-05, + "loss": 0.0144, + "step": 19225 + }, + { + "epoch": 2.30990990990991, + "grad_norm": 0.24633249640464783, + "learning_rate": 1.249757573228339e-05, + "loss": 0.0114, + "step": 19230 + }, + { + "epoch": 2.3105105105105106, + "grad_norm": 0.3297385275363922, + "learning_rate": 1.2476784389171148e-05, + "loss": 0.0171, + "step": 19235 + }, + { + "epoch": 2.311111111111111, + "grad_norm": 0.2548968195915222, + "learning_rate": 1.2456007889313703e-05, + "loss": 0.0138, + "step": 19240 + }, + { + "epoch": 2.3117117117117116, + "grad_norm": 0.26288342475891113, + "learning_rate": 1.2435246240929726e-05, + "loss": 0.0137, + "step": 19245 + }, + { + "epoch": 2.3123123123123124, + "grad_norm": 0.2402072250843048, + "learning_rate": 1.241449945223202e-05, + "loss": 0.0152, + "step": 19250 + }, + { + "epoch": 2.3123123123123124, + "eval_loss": 0.04008075222373009, + "eval_runtime": 35.795, + "eval_samples_per_second": 22.35, + "eval_steps_per_second": 5.587, + "step": 19250 + }, + { + "epoch": 2.312912912912913, + "grad_norm": 0.19226953387260437, + "learning_rate": 1.239376753142748e-05, + "loss": 0.0141, + "step": 19255 + }, + { + "epoch": 2.3135135135135134, + "grad_norm": 0.2099335491657257, + "learning_rate": 1.2373050486717153e-05, + "loss": 0.0134, + "step": 19260 + }, + { + "epoch": 2.314114114114114, + "grad_norm": 0.24056895077228546, + "learning_rate": 1.2352348326296182e-05, + "loss": 0.0133, + "step": 19265 + }, + { + "epoch": 2.314714714714715, + "grad_norm": 0.3099709451198578, + "learning_rate": 1.2331661058353834e-05, + "loss": 0.0129, + "step": 19270 + }, + { + "epoch": 2.315315315315315, + "grad_norm": 0.20563752949237823, + "learning_rate": 1.2310988691073494e-05, + "loss": 0.0123, + "step": 19275 + }, + { + "epoch": 2.315915915915916, + "grad_norm": 0.33363813161849976, + "learning_rate": 1.2290331232632613e-05, + "loss": 0.0156, + "step": 19280 + }, + { + "epoch": 2.3165165165165167, + 
"grad_norm": 0.27251991629600525, + "learning_rate": 1.2269688691202779e-05, + "loss": 0.014, + "step": 19285 + }, + { + "epoch": 2.317117117117117, + "grad_norm": 0.33110061287879944, + "learning_rate": 1.2249061074949674e-05, + "loss": 0.0137, + "step": 19290 + }, + { + "epoch": 2.3177177177177177, + "grad_norm": 0.22270050644874573, + "learning_rate": 1.2228448392033087e-05, + "loss": 0.0136, + "step": 19295 + }, + { + "epoch": 2.3183183183183185, + "grad_norm": 0.26671379804611206, + "learning_rate": 1.220785065060685e-05, + "loss": 0.0137, + "step": 19300 + }, + { + "epoch": 2.3189189189189188, + "grad_norm": 0.25220632553100586, + "learning_rate": 1.218726785881894e-05, + "loss": 0.0133, + "step": 19305 + }, + { + "epoch": 2.3195195195195195, + "grad_norm": 0.23740239441394806, + "learning_rate": 1.2166700024811411e-05, + "loss": 0.0142, + "step": 19310 + }, + { + "epoch": 2.3201201201201203, + "grad_norm": 0.26378750801086426, + "learning_rate": 1.2146147156720361e-05, + "loss": 0.0123, + "step": 19315 + }, + { + "epoch": 2.3207207207207206, + "grad_norm": 0.18735115230083466, + "learning_rate": 1.2125609262676024e-05, + "loss": 0.0126, + "step": 19320 + }, + { + "epoch": 2.3213213213213213, + "grad_norm": 0.25290191173553467, + "learning_rate": 1.2105086350802653e-05, + "loss": 0.0117, + "step": 19325 + }, + { + "epoch": 2.321921921921922, + "grad_norm": 0.17708788812160492, + "learning_rate": 1.2084578429218617e-05, + "loss": 0.012, + "step": 19330 + }, + { + "epoch": 2.3225225225225223, + "grad_norm": 0.23459002375602722, + "learning_rate": 1.2064085506036349e-05, + "loss": 0.0138, + "step": 19335 + }, + { + "epoch": 2.323123123123123, + "grad_norm": 0.28581830859184265, + "learning_rate": 1.2043607589362332e-05, + "loss": 0.018, + "step": 19340 + }, + { + "epoch": 2.323723723723724, + "grad_norm": 0.2849411368370056, + "learning_rate": 1.2023144687297144e-05, + "loss": 0.0124, + "step": 19345 + }, + { + "epoch": 2.3243243243243246, + "grad_norm": 0.1840084195137024, + "learning_rate": 1.2002696807935365e-05, + "loss": 0.0135, + "step": 19350 + }, + { + "epoch": 2.324924924924925, + "grad_norm": 0.3442290425300598, + "learning_rate": 1.1982263959365697e-05, + "loss": 0.0142, + "step": 19355 + }, + { + "epoch": 2.3255255255255256, + "grad_norm": 0.28359463810920715, + "learning_rate": 1.1961846149670858e-05, + "loss": 0.0145, + "step": 19360 + }, + { + "epoch": 2.326126126126126, + "grad_norm": 0.16738919913768768, + "learning_rate": 1.1941443386927637e-05, + "loss": 0.0135, + "step": 19365 + }, + { + "epoch": 2.3267267267267266, + "grad_norm": 0.22982092201709747, + "learning_rate": 1.1921055679206866e-05, + "loss": 0.0139, + "step": 19370 + }, + { + "epoch": 2.3273273273273274, + "grad_norm": 0.1813298910856247, + "learning_rate": 1.1900683034573396e-05, + "loss": 0.016, + "step": 19375 + }, + { + "epoch": 2.327927927927928, + "grad_norm": 0.21618647873401642, + "learning_rate": 1.188032546108615e-05, + "loss": 0.0138, + "step": 19380 + }, + { + "epoch": 2.3285285285285284, + "grad_norm": 0.22631651163101196, + "learning_rate": 1.1859982966798084e-05, + "loss": 0.0153, + "step": 19385 + }, + { + "epoch": 2.329129129129129, + "grad_norm": 0.26831868290901184, + "learning_rate": 1.1839655559756197e-05, + "loss": 0.0123, + "step": 19390 + }, + { + "epoch": 2.32972972972973, + "grad_norm": 0.224435955286026, + "learning_rate": 1.181934324800148e-05, + "loss": 0.0127, + "step": 19395 + }, + { + "epoch": 2.33033033033033, + "grad_norm": 0.27236878871917725, + "learning_rate": 
1.1799046039569006e-05, + "loss": 0.0115, + "step": 19400 + }, + { + "epoch": 2.330930930930931, + "grad_norm": 0.2713559865951538, + "learning_rate": 1.1778763942487825e-05, + "loss": 0.0143, + "step": 19405 + }, + { + "epoch": 2.3315315315315317, + "grad_norm": 0.21134965121746063, + "learning_rate": 1.1758496964781045e-05, + "loss": 0.0137, + "step": 19410 + }, + { + "epoch": 2.332132132132132, + "grad_norm": 0.20588813722133636, + "learning_rate": 1.173824511446579e-05, + "loss": 0.0127, + "step": 19415 + }, + { + "epoch": 2.3327327327327327, + "grad_norm": 0.2838841378688812, + "learning_rate": 1.1718008399553165e-05, + "loss": 0.0146, + "step": 19420 + }, + { + "epoch": 2.3333333333333335, + "grad_norm": 0.25593286752700806, + "learning_rate": 1.1697786828048335e-05, + "loss": 0.012, + "step": 19425 + }, + { + "epoch": 2.333933933933934, + "grad_norm": 0.1968752145767212, + "learning_rate": 1.1677580407950439e-05, + "loss": 0.0141, + "step": 19430 + }, + { + "epoch": 2.3345345345345345, + "grad_norm": 0.21549277007579803, + "learning_rate": 1.1657389147252645e-05, + "loss": 0.0144, + "step": 19435 + }, + { + "epoch": 2.3351351351351353, + "grad_norm": 0.34446537494659424, + "learning_rate": 1.1637213053942126e-05, + "loss": 0.0128, + "step": 19440 + }, + { + "epoch": 2.3357357357357356, + "grad_norm": 0.17235496640205383, + "learning_rate": 1.1617052136000023e-05, + "loss": 0.0117, + "step": 19445 + }, + { + "epoch": 2.3363363363363363, + "grad_norm": 0.27614420652389526, + "learning_rate": 1.1596906401401503e-05, + "loss": 0.0124, + "step": 19450 + }, + { + "epoch": 2.336936936936937, + "grad_norm": 0.17631419003009796, + "learning_rate": 1.1576775858115718e-05, + "loss": 0.0136, + "step": 19455 + }, + { + "epoch": 2.3375375375375373, + "grad_norm": 0.24650456011295319, + "learning_rate": 1.1556660514105839e-05, + "loss": 0.0112, + "step": 19460 + }, + { + "epoch": 2.338138138138138, + "grad_norm": 0.2779387831687927, + "learning_rate": 1.1536560377328952e-05, + "loss": 0.0132, + "step": 19465 + }, + { + "epoch": 2.338738738738739, + "grad_norm": 0.22595803439617157, + "learning_rate": 1.1516475455736203e-05, + "loss": 0.0132, + "step": 19470 + }, + { + "epoch": 2.3393393393393396, + "grad_norm": 0.21261551976203918, + "learning_rate": 1.1496405757272682e-05, + "loss": 0.015, + "step": 19475 + }, + { + "epoch": 2.33993993993994, + "grad_norm": 0.2531772553920746, + "learning_rate": 1.1476351289877468e-05, + "loss": 0.0141, + "step": 19480 + }, + { + "epoch": 2.3405405405405406, + "grad_norm": 0.21187564730644226, + "learning_rate": 1.145631206148362e-05, + "loss": 0.0136, + "step": 19485 + }, + { + "epoch": 2.341141141141141, + "grad_norm": 0.2373325675725937, + "learning_rate": 1.1436288080018137e-05, + "loss": 0.0116, + "step": 19490 + }, + { + "epoch": 2.3417417417417417, + "grad_norm": 0.2928980886936188, + "learning_rate": 1.1416279353402038e-05, + "loss": 0.0142, + "step": 19495 + }, + { + "epoch": 2.3423423423423424, + "grad_norm": 0.21609313786029816, + "learning_rate": 1.139628588955025e-05, + "loss": 0.0124, + "step": 19500 + }, + { + "epoch": 2.3423423423423424, + "eval_loss": 0.03953830152750015, + "eval_runtime": 35.8563, + "eval_samples_per_second": 22.311, + "eval_steps_per_second": 5.578, + "step": 19500 + }, + { + "epoch": 2.342942942942943, + "grad_norm": 0.2801402509212494, + "learning_rate": 1.1376307696371707e-05, + "loss": 0.0136, + "step": 19505 + }, + { + "epoch": 2.3435435435435434, + "grad_norm": 0.22694651782512665, + "learning_rate": 
1.1356344781769301e-05, + "loss": 0.0142, + "step": 19510 + }, + { + "epoch": 2.344144144144144, + "grad_norm": 0.27109283208847046, + "learning_rate": 1.1336397153639844e-05, + "loss": 0.0138, + "step": 19515 + }, + { + "epoch": 2.344744744744745, + "grad_norm": 0.20073270797729492, + "learning_rate": 1.1316464819874129e-05, + "loss": 0.0143, + "step": 19520 + }, + { + "epoch": 2.3453453453453452, + "grad_norm": 0.3076915442943573, + "learning_rate": 1.1296547788356898e-05, + "loss": 0.0133, + "step": 19525 + }, + { + "epoch": 2.345945945945946, + "grad_norm": 0.2712023854255676, + "learning_rate": 1.1276646066966834e-05, + "loss": 0.0144, + "step": 19530 + }, + { + "epoch": 2.3465465465465467, + "grad_norm": 0.21880017220973969, + "learning_rate": 1.1256759663576576e-05, + "loss": 0.0126, + "step": 19535 + }, + { + "epoch": 2.347147147147147, + "grad_norm": 0.16714221239089966, + "learning_rate": 1.1236888586052673e-05, + "loss": 0.0128, + "step": 19540 + }, + { + "epoch": 2.3477477477477477, + "grad_norm": 0.284549742937088, + "learning_rate": 1.1217032842255643e-05, + "loss": 0.015, + "step": 19545 + }, + { + "epoch": 2.3483483483483485, + "grad_norm": 0.15571853518486023, + "learning_rate": 1.1197192440039921e-05, + "loss": 0.0118, + "step": 19550 + }, + { + "epoch": 2.348948948948949, + "grad_norm": 0.23313234746456146, + "learning_rate": 1.1177367387253896e-05, + "loss": 0.0133, + "step": 19555 + }, + { + "epoch": 2.3495495495495495, + "grad_norm": 0.19322821497917175, + "learning_rate": 1.115755769173984e-05, + "loss": 0.0127, + "step": 19560 + }, + { + "epoch": 2.3501501501501503, + "grad_norm": 0.26968666911125183, + "learning_rate": 1.1137763361333992e-05, + "loss": 0.0123, + "step": 19565 + }, + { + "epoch": 2.3507507507507506, + "grad_norm": 0.22539116442203522, + "learning_rate": 1.1117984403866499e-05, + "loss": 0.0129, + "step": 19570 + }, + { + "epoch": 2.3513513513513513, + "grad_norm": 0.2742008566856384, + "learning_rate": 1.1098220827161427e-05, + "loss": 0.0164, + "step": 19575 + }, + { + "epoch": 2.351951951951952, + "grad_norm": 0.21559664607048035, + "learning_rate": 1.1078472639036769e-05, + "loss": 0.013, + "step": 19580 + }, + { + "epoch": 2.3525525525525524, + "grad_norm": 0.2503921091556549, + "learning_rate": 1.1058739847304394e-05, + "loss": 0.0125, + "step": 19585 + }, + { + "epoch": 2.353153153153153, + "grad_norm": 0.16386358439922333, + "learning_rate": 1.1039022459770132e-05, + "loss": 0.0119, + "step": 19590 + }, + { + "epoch": 2.353753753753754, + "grad_norm": 0.22208142280578613, + "learning_rate": 1.101932048423367e-05, + "loss": 0.0142, + "step": 19595 + }, + { + "epoch": 2.3543543543543546, + "grad_norm": 0.23939143121242523, + "learning_rate": 1.0999633928488629e-05, + "loss": 0.0121, + "step": 19600 + }, + { + "epoch": 2.354954954954955, + "grad_norm": 0.1505102664232254, + "learning_rate": 1.0979962800322535e-05, + "loss": 0.0123, + "step": 19605 + }, + { + "epoch": 2.3555555555555556, + "grad_norm": 0.3065943419933319, + "learning_rate": 1.0960307107516782e-05, + "loss": 0.0131, + "step": 19610 + }, + { + "epoch": 2.356156156156156, + "grad_norm": 0.19549353420734406, + "learning_rate": 1.0940666857846682e-05, + "loss": 0.0134, + "step": 19615 + }, + { + "epoch": 2.3567567567567567, + "grad_norm": 0.23805366456508636, + "learning_rate": 1.0921042059081426e-05, + "loss": 0.0129, + "step": 19620 + }, + { + "epoch": 2.3573573573573574, + "grad_norm": 0.2263217568397522, + "learning_rate": 1.0901432718984128e-05, + "loss": 0.0142, + "step": 19625 
+ }, + { + "epoch": 2.357957957957958, + "grad_norm": 0.24713027477264404, + "learning_rate": 1.0881838845311714e-05, + "loss": 0.0134, + "step": 19630 + }, + { + "epoch": 2.3585585585585584, + "grad_norm": 0.22670410573482513, + "learning_rate": 1.0862260445815053e-05, + "loss": 0.0122, + "step": 19635 + }, + { + "epoch": 2.359159159159159, + "grad_norm": 0.27957212924957275, + "learning_rate": 1.0842697528238883e-05, + "loss": 0.0126, + "step": 19640 + }, + { + "epoch": 2.35975975975976, + "grad_norm": 0.25959333777427673, + "learning_rate": 1.08231501003218e-05, + "loss": 0.0117, + "step": 19645 + }, + { + "epoch": 2.3603603603603602, + "grad_norm": 0.16630196571350098, + "learning_rate": 1.0803618169796298e-05, + "loss": 0.013, + "step": 19650 + }, + { + "epoch": 2.360960960960961, + "grad_norm": 0.24647246301174164, + "learning_rate": 1.0784101744388702e-05, + "loss": 0.0142, + "step": 19655 + }, + { + "epoch": 2.3615615615615617, + "grad_norm": 0.3010789453983307, + "learning_rate": 1.0764600831819238e-05, + "loss": 0.0135, + "step": 19660 + }, + { + "epoch": 2.362162162162162, + "grad_norm": 0.2706170976161957, + "learning_rate": 1.0745115439801984e-05, + "loss": 0.0139, + "step": 19665 + }, + { + "epoch": 2.3627627627627628, + "grad_norm": 0.17888972163200378, + "learning_rate": 1.072564557604489e-05, + "loss": 0.0118, + "step": 19670 + }, + { + "epoch": 2.3633633633633635, + "grad_norm": 0.21044613420963287, + "learning_rate": 1.0706191248249725e-05, + "loss": 0.012, + "step": 19675 + }, + { + "epoch": 2.363963963963964, + "grad_norm": 0.3188132345676422, + "learning_rate": 1.0686752464112153e-05, + "loss": 0.0129, + "step": 19680 + }, + { + "epoch": 2.3645645645645645, + "grad_norm": 0.2343267947435379, + "learning_rate": 1.0667329231321699e-05, + "loss": 0.0131, + "step": 19685 + }, + { + "epoch": 2.3651651651651653, + "grad_norm": 0.15735098719596863, + "learning_rate": 1.0647921557561668e-05, + "loss": 0.0118, + "step": 19690 + }, + { + "epoch": 2.3657657657657656, + "grad_norm": 0.221736878156662, + "learning_rate": 1.06285294505093e-05, + "loss": 0.0133, + "step": 19695 + }, + { + "epoch": 2.3663663663663663, + "grad_norm": 0.19816137850284576, + "learning_rate": 1.0609152917835591e-05, + "loss": 0.0121, + "step": 19700 + }, + { + "epoch": 2.366966966966967, + "grad_norm": 0.17485511302947998, + "learning_rate": 1.0589791967205437e-05, + "loss": 0.0129, + "step": 19705 + }, + { + "epoch": 2.3675675675675674, + "grad_norm": 0.27220287919044495, + "learning_rate": 1.0570446606277551e-05, + "loss": 0.0142, + "step": 19710 + }, + { + "epoch": 2.368168168168168, + "grad_norm": 0.20521901547908783, + "learning_rate": 1.0551116842704479e-05, + "loss": 0.0152, + "step": 19715 + }, + { + "epoch": 2.368768768768769, + "grad_norm": 0.3141981363296509, + "learning_rate": 1.0531802684132608e-05, + "loss": 0.0133, + "step": 19720 + }, + { + "epoch": 2.3693693693693696, + "grad_norm": 0.2389078289270401, + "learning_rate": 1.0512504138202112e-05, + "loss": 0.0141, + "step": 19725 + }, + { + "epoch": 2.36996996996997, + "grad_norm": 0.19536085426807404, + "learning_rate": 1.0493221212547038e-05, + "loss": 0.012, + "step": 19730 + }, + { + "epoch": 2.3705705705705706, + "grad_norm": 0.22260215878486633, + "learning_rate": 1.0473953914795225e-05, + "loss": 0.0124, + "step": 19735 + }, + { + "epoch": 2.371171171171171, + "grad_norm": 0.20631587505340576, + "learning_rate": 1.0454702252568349e-05, + "loss": 0.0134, + "step": 19740 + }, + { + "epoch": 2.3717717717717717, + "grad_norm": 
0.1789875328540802, + "learning_rate": 1.04354662334819e-05, + "loss": 0.0133, + "step": 19745 + }, + { + "epoch": 2.3723723723723724, + "grad_norm": 0.27481332421302795, + "learning_rate": 1.0416245865145141e-05, + "loss": 0.0143, + "step": 19750 + }, + { + "epoch": 2.3723723723723724, + "eval_loss": 0.03786711394786835, + "eval_runtime": 35.9806, + "eval_samples_per_second": 22.234, + "eval_steps_per_second": 5.559, + "step": 19750 + }, + { + "epoch": 2.372972972972973, + "grad_norm": 0.19248706102371216, + "learning_rate": 1.0397041155161185e-05, + "loss": 0.012, + "step": 19755 + }, + { + "epoch": 2.3735735735735735, + "grad_norm": 0.25902336835861206, + "learning_rate": 1.0377852111126951e-05, + "loss": 0.0131, + "step": 19760 + }, + { + "epoch": 2.374174174174174, + "grad_norm": 0.16944050788879395, + "learning_rate": 1.0358678740633154e-05, + "loss": 0.0104, + "step": 19765 + }, + { + "epoch": 2.374774774774775, + "grad_norm": 0.30373936891555786, + "learning_rate": 1.0339521051264278e-05, + "loss": 0.013, + "step": 19770 + }, + { + "epoch": 2.3753753753753752, + "grad_norm": 0.3525901138782501, + "learning_rate": 1.0320379050598654e-05, + "loss": 0.0135, + "step": 19775 + }, + { + "epoch": 2.375975975975976, + "grad_norm": 0.1787666529417038, + "learning_rate": 1.0301252746208367e-05, + "loss": 0.0106, + "step": 19780 + }, + { + "epoch": 2.3765765765765767, + "grad_norm": 0.17433393001556396, + "learning_rate": 1.0282142145659319e-05, + "loss": 0.0139, + "step": 19785 + }, + { + "epoch": 2.377177177177177, + "grad_norm": 0.21298392117023468, + "learning_rate": 1.0263047256511199e-05, + "loss": 0.0123, + "step": 19790 + }, + { + "epoch": 2.3777777777777778, + "grad_norm": 0.27403366565704346, + "learning_rate": 1.0243968086317446e-05, + "loss": 0.012, + "step": 19795 + }, + { + "epoch": 2.3783783783783785, + "grad_norm": 0.224418506026268, + "learning_rate": 1.0224904642625327e-05, + "loss": 0.0122, + "step": 19800 + }, + { + "epoch": 2.378978978978979, + "grad_norm": 0.19469419121742249, + "learning_rate": 1.020585693297586e-05, + "loss": 0.0138, + "step": 19805 + }, + { + "epoch": 2.3795795795795796, + "grad_norm": 0.25854915380477905, + "learning_rate": 1.0186824964903851e-05, + "loss": 0.0129, + "step": 19810 + }, + { + "epoch": 2.3801801801801803, + "grad_norm": 0.3246530294418335, + "learning_rate": 1.0167808745937891e-05, + "loss": 0.0154, + "step": 19815 + }, + { + "epoch": 2.3807807807807806, + "grad_norm": 0.2533751428127289, + "learning_rate": 1.0148808283600297e-05, + "loss": 0.0125, + "step": 19820 + }, + { + "epoch": 2.3813813813813813, + "grad_norm": 0.22221092879772186, + "learning_rate": 1.0129823585407194e-05, + "loss": 0.0143, + "step": 19825 + }, + { + "epoch": 2.381981981981982, + "grad_norm": 0.266520619392395, + "learning_rate": 1.0110854658868457e-05, + "loss": 0.0148, + "step": 19830 + }, + { + "epoch": 2.3825825825825824, + "grad_norm": 0.3407699167728424, + "learning_rate": 1.0091901511487738e-05, + "loss": 0.0123, + "step": 19835 + }, + { + "epoch": 2.383183183183183, + "grad_norm": 0.19929823279380798, + "learning_rate": 1.007296415076241e-05, + "loss": 0.0132, + "step": 19840 + }, + { + "epoch": 2.383783783783784, + "grad_norm": 0.24646946787834167, + "learning_rate": 1.0054042584183632e-05, + "loss": 0.0116, + "step": 19845 + }, + { + "epoch": 2.3843843843843846, + "grad_norm": 0.20933978259563446, + "learning_rate": 1.0035136819236307e-05, + "loss": 0.0112, + "step": 19850 + }, + { + "epoch": 2.384984984984985, + "grad_norm": 0.15630047023296356, + 
"learning_rate": 1.0016246863399087e-05, + "loss": 0.0139, + "step": 19855 + }, + { + "epoch": 2.3855855855855856, + "grad_norm": 0.1657629907131195, + "learning_rate": 9.997372724144388e-06, + "loss": 0.0139, + "step": 19860 + }, + { + "epoch": 2.386186186186186, + "grad_norm": 0.20281122624874115, + "learning_rate": 9.978514408938328e-06, + "loss": 0.0124, + "step": 19865 + }, + { + "epoch": 2.3867867867867867, + "grad_norm": 0.17628662288188934, + "learning_rate": 9.95967192524081e-06, + "loss": 0.0125, + "step": 19870 + }, + { + "epoch": 2.3873873873873874, + "grad_norm": 0.21708153188228607, + "learning_rate": 9.940845280505423e-06, + "loss": 0.0122, + "step": 19875 + }, + { + "epoch": 2.387987987987988, + "grad_norm": 0.22435113787651062, + "learning_rate": 9.922034482179549e-06, + "loss": 0.0132, + "step": 19880 + }, + { + "epoch": 2.3885885885885885, + "grad_norm": 0.28971534967422485, + "learning_rate": 9.903239537704272e-06, + "loss": 0.0124, + "step": 19885 + }, + { + "epoch": 2.389189189189189, + "grad_norm": 0.17425502836704254, + "learning_rate": 9.884460454514389e-06, + "loss": 0.013, + "step": 19890 + }, + { + "epoch": 2.38978978978979, + "grad_norm": 0.12470649927854538, + "learning_rate": 9.865697240038452e-06, + "loss": 0.0123, + "step": 19895 + }, + { + "epoch": 2.3903903903903903, + "grad_norm": 0.1936112493276596, + "learning_rate": 9.846949901698727e-06, + "loss": 0.0135, + "step": 19900 + }, + { + "epoch": 2.390990990990991, + "grad_norm": 0.15805506706237793, + "learning_rate": 9.828218446911203e-06, + "loss": 0.0118, + "step": 19905 + }, + { + "epoch": 2.3915915915915917, + "grad_norm": 0.32184869050979614, + "learning_rate": 9.809502883085553e-06, + "loss": 0.0134, + "step": 19910 + }, + { + "epoch": 2.392192192192192, + "grad_norm": 0.21912485361099243, + "learning_rate": 9.79080321762521e-06, + "loss": 0.0134, + "step": 19915 + }, + { + "epoch": 2.3927927927927928, + "grad_norm": 0.24116133153438568, + "learning_rate": 9.772119457927298e-06, + "loss": 0.0122, + "step": 19920 + }, + { + "epoch": 2.3933933933933935, + "grad_norm": 0.20128588378429413, + "learning_rate": 9.753451611382647e-06, + "loss": 0.0107, + "step": 19925 + }, + { + "epoch": 2.393993993993994, + "grad_norm": 0.24893133342266083, + "learning_rate": 9.734799685375806e-06, + "loss": 0.0127, + "step": 19930 + }, + { + "epoch": 2.3945945945945946, + "grad_norm": 0.16727831959724426, + "learning_rate": 9.71616368728499e-06, + "loss": 0.0119, + "step": 19935 + }, + { + "epoch": 2.3951951951951953, + "grad_norm": 0.3045255243778229, + "learning_rate": 9.697543624482158e-06, + "loss": 0.0131, + "step": 19940 + }, + { + "epoch": 2.3957957957957956, + "grad_norm": 0.23958060145378113, + "learning_rate": 9.678939504332934e-06, + "loss": 0.0135, + "step": 19945 + }, + { + "epoch": 2.3963963963963963, + "grad_norm": 0.18694347143173218, + "learning_rate": 9.66035133419666e-06, + "loss": 0.0138, + "step": 19950 + }, + { + "epoch": 2.396996996996997, + "grad_norm": 0.1764531284570694, + "learning_rate": 9.641779121426358e-06, + "loss": 0.0126, + "step": 19955 + }, + { + "epoch": 2.3975975975975974, + "grad_norm": 0.21081571280956268, + "learning_rate": 9.623222873368714e-06, + "loss": 0.0125, + "step": 19960 + }, + { + "epoch": 2.398198198198198, + "grad_norm": 0.21348458528518677, + "learning_rate": 9.604682597364145e-06, + "loss": 0.0125, + "step": 19965 + }, + { + "epoch": 2.398798798798799, + "grad_norm": 0.1940726935863495, + "learning_rate": 9.5861583007467e-06, + "loss": 0.013, + "step": 19970 + }, + 
{ + "epoch": 2.3993993993993996, + "grad_norm": 0.1993122398853302, + "learning_rate": 9.567649990844146e-06, + "loss": 0.0125, + "step": 19975 + }, + { + "epoch": 2.4, + "grad_norm": 0.3673401176929474, + "learning_rate": 9.54915767497792e-06, + "loss": 0.0142, + "step": 19980 + }, + { + "epoch": 2.4006006006006007, + "grad_norm": 0.23906080424785614, + "learning_rate": 9.530681360463107e-06, + "loss": 0.0127, + "step": 19985 + }, + { + "epoch": 2.401201201201201, + "grad_norm": 0.15805503726005554, + "learning_rate": 9.512221054608483e-06, + "loss": 0.0131, + "step": 19990 + }, + { + "epoch": 2.4018018018018017, + "grad_norm": 0.1870601326227188, + "learning_rate": 9.493776764716495e-06, + "loss": 0.0116, + "step": 19995 + }, + { + "epoch": 2.4024024024024024, + "grad_norm": 0.2301345020532608, + "learning_rate": 9.47534849808326e-06, + "loss": 0.0128, + "step": 20000 + }, + { + "epoch": 2.4024024024024024, + "eval_loss": 0.038246750831604004, + "eval_runtime": 35.8886, + "eval_samples_per_second": 22.291, + "eval_steps_per_second": 5.573, + "step": 20000 + } + ], + "logging_steps": 5, + "max_steps": 24975, + "num_input_tokens_seen": 0, + "num_train_epochs": 3, + "save_steps": 500, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 1.0321032618337056e+20, + "train_batch_size": 24, + "trial_name": null, + "trial_params": null +}