{ "best_metric": null, "best_model_checkpoint": null, "epoch": 1.0, "eval_steps": 500, "global_step": 3248, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0, "grad_norm": 0.2709156318785633, "learning_rate": 6.153846153846154e-07, "loss": 0.9934, "step": 1 }, { "epoch": 0.0, "grad_norm": 0.4419044510206839, "learning_rate": 3.0769230769230774e-06, "loss": 1.0584, "step": 5 }, { "epoch": 0.0, "grad_norm": 0.5233721395581817, "learning_rate": 6.153846153846155e-06, "loss": 1.1615, "step": 10 }, { "epoch": 0.0, "grad_norm": 0.6649670185877594, "learning_rate": 9.230769230769232e-06, "loss": 1.1631, "step": 15 }, { "epoch": 0.01, "grad_norm": 0.4305302514113907, "learning_rate": 1.230769230769231e-05, "loss": 1.2207, "step": 20 }, { "epoch": 0.01, "grad_norm": 0.5755996114609079, "learning_rate": 1.5384615384615387e-05, "loss": 0.9927, "step": 25 }, { "epoch": 0.01, "grad_norm": 0.20368238004024766, "learning_rate": 1.8461538461538465e-05, "loss": 0.9844, "step": 30 }, { "epoch": 0.01, "grad_norm": 0.24375178341672513, "learning_rate": 2.1538461538461542e-05, "loss": 0.9985, "step": 35 }, { "epoch": 0.01, "grad_norm": 0.38432351721666425, "learning_rate": 2.461538461538462e-05, "loss": 1.0881, "step": 40 }, { "epoch": 0.01, "grad_norm": 0.8927024097300557, "learning_rate": 2.7692307692307694e-05, "loss": 1.0362, "step": 45 }, { "epoch": 0.02, "grad_norm": 0.24949485533837065, "learning_rate": 3.0769230769230774e-05, "loss": 0.9051, "step": 50 }, { "epoch": 0.02, "grad_norm": 0.3003048201124618, "learning_rate": 3.384615384615385e-05, "loss": 0.9026, "step": 55 }, { "epoch": 0.02, "grad_norm": 0.3884135265695224, "learning_rate": 3.692307692307693e-05, "loss": 0.8955, "step": 60 }, { "epoch": 0.02, "grad_norm": 0.5578919802311338, "learning_rate": 4e-05, "loss": 0.7901, "step": 65 }, { "epoch": 0.02, "grad_norm": 0.2289752603512456, "learning_rate": 4.3076923076923084e-05, "loss": 0.8754, "step": 70 }, { "epoch": 0.02, "grad_norm": 0.459950526314893, "learning_rate": 4.615384615384616e-05, "loss": 0.7449, "step": 75 }, { "epoch": 0.02, "grad_norm": 0.3061061622913128, "learning_rate": 4.923076923076924e-05, "loss": 0.8067, "step": 80 }, { "epoch": 0.03, "grad_norm": 0.26485910183284767, "learning_rate": 5.230769230769231e-05, "loss": 0.8737, "step": 85 }, { "epoch": 0.03, "grad_norm": 0.21605949900797208, "learning_rate": 5.538461538461539e-05, "loss": 0.8169, "step": 90 }, { "epoch": 0.03, "grad_norm": 0.2286110237629433, "learning_rate": 5.846153846153847e-05, "loss": 0.8132, "step": 95 }, { "epoch": 0.03, "grad_norm": 0.24556746259970036, "learning_rate": 6.153846153846155e-05, "loss": 0.8198, "step": 100 }, { "epoch": 0.03, "grad_norm": 0.2948151232563899, "learning_rate": 6.461538461538462e-05, "loss": 0.8163, "step": 105 }, { "epoch": 0.03, "grad_norm": 0.2659100532190652, "learning_rate": 6.76923076923077e-05, "loss": 0.8979, "step": 110 }, { "epoch": 0.04, "grad_norm": 0.24575597154935636, "learning_rate": 7.076923076923078e-05, "loss": 0.8273, "step": 115 }, { "epoch": 0.04, "grad_norm": 0.3093299661054237, "learning_rate": 7.384615384615386e-05, "loss": 0.8091, "step": 120 }, { "epoch": 0.04, "grad_norm": 0.2908225328310054, "learning_rate": 7.692307692307693e-05, "loss": 0.8313, "step": 125 }, { "epoch": 0.04, "grad_norm": 0.3020091724093868, "learning_rate": 8e-05, "loss": 0.8358, "step": 130 }, { "epoch": 0.04, "grad_norm": 0.2694408106299054, "learning_rate": 8.307692307692309e-05, "loss": 0.8349, "step": 135 }, { "epoch": 0.04, "grad_norm": 0.3000515761359836, "learning_rate": 8.615384615384617e-05, "loss": 0.8707, "step": 140 }, { "epoch": 0.04, "grad_norm": 0.3079582972721868, "learning_rate": 8.923076923076924e-05, "loss": 0.8617, "step": 145 }, { "epoch": 0.05, "grad_norm": 0.3795146820147972, "learning_rate": 9.230769230769232e-05, "loss": 0.7768, "step": 150 }, { "epoch": 0.05, "grad_norm": 0.4775337525844143, "learning_rate": 9.53846153846154e-05, "loss": 0.8528, "step": 155 }, { "epoch": 0.05, "grad_norm": 0.27382955396233616, "learning_rate": 9.846153846153848e-05, "loss": 0.9321, "step": 160 }, { "epoch": 0.05, "grad_norm": 0.3493866683153211, "learning_rate": 0.00010153846153846153, "loss": 0.8745, "step": 165 }, { "epoch": 0.05, "grad_norm": 0.392311201345868, "learning_rate": 0.00010461538461538463, "loss": 0.8214, "step": 170 }, { "epoch": 0.05, "grad_norm": 0.2542347730845665, "learning_rate": 0.0001076923076923077, "loss": 0.8266, "step": 175 }, { "epoch": 0.06, "grad_norm": 0.30567410806640644, "learning_rate": 0.00011076923076923077, "loss": 0.8421, "step": 180 }, { "epoch": 0.06, "grad_norm": 0.3347043226775438, "learning_rate": 0.00011384615384615384, "loss": 0.8482, "step": 185 }, { "epoch": 0.06, "grad_norm": 0.39125501413574576, "learning_rate": 0.00011692307692307694, "loss": 0.7707, "step": 190 }, { "epoch": 0.06, "grad_norm": 0.27082032316598875, "learning_rate": 0.00012, "loss": 0.8802, "step": 195 }, { "epoch": 0.06, "grad_norm": 0.2655311149315157, "learning_rate": 0.0001230769230769231, "loss": 0.8666, "step": 200 }, { "epoch": 0.06, "grad_norm": 0.3211818059226096, "learning_rate": 0.00012615384615384615, "loss": 0.8397, "step": 205 }, { "epoch": 0.06, "grad_norm": 0.2924195950677733, "learning_rate": 0.00012923076923076923, "loss": 0.8067, "step": 210 }, { "epoch": 0.07, "grad_norm": 0.27236793564577827, "learning_rate": 0.0001323076923076923, "loss": 0.8266, "step": 215 }, { "epoch": 0.07, "grad_norm": 0.31068056017205964, "learning_rate": 0.0001353846153846154, "loss": 0.727, "step": 220 }, { "epoch": 0.07, "grad_norm": 0.24520172222389908, "learning_rate": 0.00013846153846153847, "loss": 0.7937, "step": 225 }, { "epoch": 0.07, "grad_norm": 0.28996280021429405, "learning_rate": 0.00014153846153846156, "loss": 0.8405, "step": 230 }, { "epoch": 0.07, "grad_norm": 0.26492992094167794, "learning_rate": 0.0001446153846153846, "loss": 0.8417, "step": 235 }, { "epoch": 0.07, "grad_norm": 0.28305833332691566, "learning_rate": 0.00014769230769230772, "loss": 0.9148, "step": 240 }, { "epoch": 0.08, "grad_norm": 0.31032712702750226, "learning_rate": 0.00015076923076923077, "loss": 0.8458, "step": 245 }, { "epoch": 0.08, "grad_norm": 0.32447015298077714, "learning_rate": 0.00015384615384615385, "loss": 0.7821, "step": 250 }, { "epoch": 0.08, "grad_norm": 0.2547532649722294, "learning_rate": 0.00015692307692307693, "loss": 0.8207, "step": 255 }, { "epoch": 0.08, "grad_norm": 0.2735833452730825, "learning_rate": 0.00016, "loss": 0.8249, "step": 260 }, { "epoch": 0.08, "grad_norm": 0.22729853067013947, "learning_rate": 0.0001630769230769231, "loss": 0.8387, "step": 265 }, { "epoch": 0.08, "grad_norm": 0.3343853232959839, "learning_rate": 0.00016615384615384617, "loss": 0.8648, "step": 270 }, { "epoch": 0.08, "grad_norm": 0.3108160988317094, "learning_rate": 0.00016923076923076923, "loss": 0.804, "step": 275 }, { "epoch": 0.09, "grad_norm": 0.2609065872820603, "learning_rate": 0.00017230769230769234, "loss": 0.8216, "step": 280 }, { "epoch": 0.09, "grad_norm": 0.24454880681449043, "learning_rate": 0.0001753846153846154, "loss": 0.8174, "step": 285 }, { "epoch": 0.09, "grad_norm": 0.22180414129702308, "learning_rate": 0.00017846153846153847, "loss": 0.8579, "step": 290 }, { "epoch": 0.09, "grad_norm": 0.26081939474045385, "learning_rate": 0.00018153846153846155, "loss": 0.8432, "step": 295 }, { "epoch": 0.09, "grad_norm": 0.2751512686500224, "learning_rate": 0.00018461538461538463, "loss": 0.6994, "step": 300 }, { "epoch": 0.09, "grad_norm": 0.24286008540174067, "learning_rate": 0.0001876923076923077, "loss": 0.8409, "step": 305 }, { "epoch": 0.1, "grad_norm": 0.2306911443540912, "learning_rate": 0.0001907692307692308, "loss": 0.8241, "step": 310 }, { "epoch": 0.1, "grad_norm": 0.3568984623630479, "learning_rate": 0.00019384615384615385, "loss": 0.7153, "step": 315 }, { "epoch": 0.1, "grad_norm": 0.36681138166946803, "learning_rate": 0.00019692307692307696, "loss": 0.8065, "step": 320 }, { "epoch": 0.1, "grad_norm": 0.22369081814221262, "learning_rate": 0.0002, "loss": 0.757, "step": 325 }, { "epoch": 0.1, "grad_norm": 0.32740968759147726, "learning_rate": 0.00019999855605356607, "loss": 0.785, "step": 330 }, { "epoch": 0.1, "grad_norm": 0.38663458318983307, "learning_rate": 0.0001999942242559639, "loss": 0.7893, "step": 335 }, { "epoch": 0.1, "grad_norm": 0.3744195353028169, "learning_rate": 0.00019998700473229113, "loss": 0.8817, "step": 340 }, { "epoch": 0.11, "grad_norm": 0.2937020154962458, "learning_rate": 0.00019997689769103992, "loss": 0.8068, "step": 345 }, { "epoch": 0.11, "grad_norm": 0.31130184115081005, "learning_rate": 0.00019996390342409071, "loss": 0.8888, "step": 350 }, { "epoch": 0.11, "grad_norm": 0.27717910910942206, "learning_rate": 0.00019994802230670415, "loss": 0.8296, "step": 355 }, { "epoch": 0.11, "grad_norm": 0.28929048963159804, "learning_rate": 0.00019992925479750978, "loss": 0.8375, "step": 360 }, { "epoch": 0.11, "grad_norm": 0.30278441435173964, "learning_rate": 0.00019990760143849317, "loss": 0.7978, "step": 365 }, { "epoch": 0.11, "grad_norm": 0.3926711605744842, "learning_rate": 0.00019988306285498018, "loss": 0.8156, "step": 370 }, { "epoch": 0.12, "grad_norm": 0.2056371002966051, "learning_rate": 0.0001998556397556188, "loss": 0.8492, "step": 375 }, { "epoch": 0.12, "grad_norm": 0.3386258840898327, "learning_rate": 0.00019982533293235873, "loss": 0.7553, "step": 380 }, { "epoch": 0.12, "grad_norm": 0.24704095019032765, "learning_rate": 0.00019979214326042857, "loss": 0.8032, "step": 385 }, { "epoch": 0.12, "grad_norm": 0.3027504102198928, "learning_rate": 0.0001997560716983105, "loss": 0.8777, "step": 390 }, { "epoch": 0.12, "grad_norm": 0.25850106416138335, "learning_rate": 0.00019971711928771257, "loss": 0.8353, "step": 395 }, { "epoch": 0.12, "grad_norm": 0.42457411611908963, "learning_rate": 0.0001996752871535387, "loss": 0.7962, "step": 400 }, { "epoch": 0.12, "grad_norm": 0.32389294595176554, "learning_rate": 0.00019963057650385606, "loss": 0.8473, "step": 405 }, { "epoch": 0.13, "grad_norm": 0.2232805240763939, "learning_rate": 0.0001995829886298604, "loss": 0.7768, "step": 410 }, { "epoch": 0.13, "grad_norm": 0.26355868762668144, "learning_rate": 0.00019953252490583843, "loss": 0.8432, "step": 415 }, { "epoch": 0.13, "grad_norm": 0.2479995279505114, "learning_rate": 0.00019947918678912848, "loss": 0.8742, "step": 420 }, { "epoch": 0.13, "grad_norm": 0.26547854409221383, "learning_rate": 0.0001994229758200783, "loss": 0.8072, "step": 425 }, { "epoch": 0.13, "grad_norm": 0.23642626786417162, "learning_rate": 0.00019936389362200033, "loss": 0.7956, "step": 430 }, { "epoch": 0.13, "grad_norm": 0.25913092229555307, "learning_rate": 0.00019930194190112522, "loss": 0.7345, "step": 435 }, { "epoch": 0.14, "grad_norm": 0.2966162150235158, "learning_rate": 0.00019923712244655225, "loss": 0.8089, "step": 440 }, { "epoch": 0.14, "grad_norm": 0.2187595004800295, "learning_rate": 0.00019916943713019794, "loss": 0.7427, "step": 445 }, { "epoch": 0.14, "grad_norm": 0.29051575591401246, "learning_rate": 0.00019909888790674155, "loss": 0.8768, "step": 450 }, { "epoch": 0.14, "grad_norm": 0.3296601997445316, "learning_rate": 0.00019902547681356923, "loss": 0.8616, "step": 455 }, { "epoch": 0.14, "grad_norm": 0.29185622420361307, "learning_rate": 0.0001989492059707146, "loss": 0.7993, "step": 460 }, { "epoch": 0.14, "grad_norm": 0.285867084295898, "learning_rate": 0.00019887007758079793, "loss": 0.8207, "step": 465 }, { "epoch": 0.14, "grad_norm": 0.30952870458662307, "learning_rate": 0.00019878809392896235, "loss": 0.8668, "step": 470 }, { "epoch": 0.15, "grad_norm": 0.3381740373711063, "learning_rate": 0.00019870325738280785, "loss": 0.8842, "step": 475 }, { "epoch": 0.15, "grad_norm": 0.21684296837932523, "learning_rate": 0.0001986155703923231, "loss": 0.7966, "step": 480 }, { "epoch": 0.15, "grad_norm": 0.3040871521339894, "learning_rate": 0.0001985250354898143, "loss": 0.8622, "step": 485 }, { "epoch": 0.15, "grad_norm": 0.26978651830594724, "learning_rate": 0.0001984316552898326, "loss": 0.8748, "step": 490 }, { "epoch": 0.15, "grad_norm": 0.29082578689683647, "learning_rate": 0.00019833543248909798, "loss": 0.8407, "step": 495 }, { "epoch": 0.15, "grad_norm": 0.301663442193365, "learning_rate": 0.00019823636986642199, "loss": 0.8568, "step": 500 }, { "epoch": 0.16, "grad_norm": 0.2552544076755423, "learning_rate": 0.0001981344702826269, "loss": 0.8286, "step": 505 }, { "epoch": 0.16, "grad_norm": 0.24913640355204184, "learning_rate": 0.00019802973668046363, "loss": 0.8022, "step": 510 }, { "epoch": 0.16, "grad_norm": 0.2217941168846133, "learning_rate": 0.00019792217208452635, "loss": 0.8674, "step": 515 }, { "epoch": 0.16, "grad_norm": 0.2891487359747499, "learning_rate": 0.00019781177960116538, "loss": 0.8123, "step": 520 }, { "epoch": 0.16, "grad_norm": 0.34655206684809864, "learning_rate": 0.00019769856241839737, "loss": 0.8517, "step": 525 }, { "epoch": 0.16, "grad_norm": 0.3053447288771597, "learning_rate": 0.00019758252380581328, "loss": 0.8821, "step": 530 }, { "epoch": 0.16, "grad_norm": 0.3307139329014054, "learning_rate": 0.00019746366711448387, "loss": 0.8677, "step": 535 }, { "epoch": 0.17, "grad_norm": 0.306144694096585, "learning_rate": 0.00019734199577686314, "loss": 0.7189, "step": 540 }, { "epoch": 0.17, "grad_norm": 0.2774735539484507, "learning_rate": 0.0001972175133066889, "loss": 0.7494, "step": 545 }, { "epoch": 0.17, "grad_norm": 0.3140012878663545, "learning_rate": 0.00019709022329888155, "loss": 0.7943, "step": 550 }, { "epoch": 0.17, "grad_norm": 0.2646845744217625, "learning_rate": 0.00019696012942944013, "loss": 0.836, "step": 555 }, { "epoch": 0.17, "grad_norm": 0.2308386439333217, "learning_rate": 0.00019682723545533628, "loss": 0.8478, "step": 560 }, { "epoch": 0.17, "grad_norm": 0.262138689846067, "learning_rate": 0.00019669154521440553, "loss": 0.7914, "step": 565 }, { "epoch": 0.18, "grad_norm": 0.6748339885003066, "learning_rate": 0.0001965530626252367, "loss": 0.8494, "step": 570 }, { "epoch": 0.18, "grad_norm": 0.33850537974316935, "learning_rate": 0.00019641179168705862, "loss": 0.6988, "step": 575 }, { "epoch": 0.18, "grad_norm": 0.2655667830205273, "learning_rate": 0.00019626773647962457, "loss": 0.8944, "step": 580 }, { "epoch": 0.18, "grad_norm": 0.266738555121118, "learning_rate": 0.0001961209011630947, "loss": 0.8797, "step": 585 }, { "epoch": 0.18, "grad_norm": 0.2867657573604784, "learning_rate": 0.0001959712899779156, "loss": 0.8718, "step": 590 }, { "epoch": 0.18, "grad_norm": 0.3370395857653061, "learning_rate": 0.00019581890724469802, "loss": 0.8289, "step": 595 }, { "epoch": 0.18, "grad_norm": 0.29934897150076484, "learning_rate": 0.00019566375736409204, "loss": 0.822, "step": 600 }, { "epoch": 0.19, "grad_norm": 0.28585720261383735, "learning_rate": 0.00019550584481666002, "loss": 0.8579, "step": 605 }, { "epoch": 0.19, "grad_norm": 0.3285211504524654, "learning_rate": 0.0001953451741627471, "loss": 0.8795, "step": 610 }, { "epoch": 0.19, "grad_norm": 0.24511909912168928, "learning_rate": 0.0001951817500423497, "loss": 0.7862, "step": 615 }, { "epoch": 0.19, "grad_norm": 0.253601728108672, "learning_rate": 0.0001950155771749813, "loss": 0.8076, "step": 620 }, { "epoch": 0.19, "grad_norm": 0.3092833252431494, "learning_rate": 0.00019484666035953632, "loss": 0.7513, "step": 625 }, { "epoch": 0.19, "grad_norm": 0.26385573322113004, "learning_rate": 0.00019467500447415138, "loss": 0.8263, "step": 630 }, { "epoch": 0.2, "grad_norm": 0.28938768331562686, "learning_rate": 0.00019450061447606455, "loss": 0.7777, "step": 635 }, { "epoch": 0.2, "grad_norm": 0.2529231145412997, "learning_rate": 0.00019432349540147222, "loss": 0.8287, "step": 640 }, { "epoch": 0.2, "grad_norm": 0.21789719255683243, "learning_rate": 0.00019414365236538342, "loss": 0.789, "step": 645 }, { "epoch": 0.2, "grad_norm": 0.2926240350717249, "learning_rate": 0.00019396109056147242, "loss": 0.8396, "step": 650 }, { "epoch": 0.2, "grad_norm": 0.21569069093149024, "learning_rate": 0.00019377581526192853, "loss": 0.7599, "step": 655 }, { "epoch": 0.2, "grad_norm": 0.26361195643008684, "learning_rate": 0.00019358783181730387, "loss": 0.8687, "step": 660 }, { "epoch": 0.2, "grad_norm": 0.285068572985004, "learning_rate": 0.00019339714565635898, "loss": 0.7735, "step": 665 }, { "epoch": 0.21, "grad_norm": 0.32266517654300897, "learning_rate": 0.0001932037622859059, "loss": 0.754, "step": 670 }, { "epoch": 0.21, "grad_norm": 0.2678959128715121, "learning_rate": 0.00019300768729064912, "loss": 0.8024, "step": 675 }, { "epoch": 0.21, "grad_norm": 0.2808314964971424, "learning_rate": 0.00019280892633302454, "loss": 0.767, "step": 680 }, { "epoch": 0.21, "grad_norm": 0.25044630314755334, "learning_rate": 0.00019260748515303563, "loss": 0.8454, "step": 685 }, { "epoch": 0.21, "grad_norm": 0.30242650123792125, "learning_rate": 0.00019240336956808786, "loss": 0.8812, "step": 690 }, { "epoch": 0.21, "grad_norm": 0.2701055207795336, "learning_rate": 0.00019219658547282067, "loss": 0.7791, "step": 695 }, { "epoch": 0.22, "grad_norm": 0.2232333034817994, "learning_rate": 0.0001919871388389372, "loss": 0.7782, "step": 700 }, { "epoch": 0.22, "grad_norm": 0.2578136050088398, "learning_rate": 0.0001917750357150318, "loss": 0.7164, "step": 705 }, { "epoch": 0.22, "grad_norm": 0.2974468917976116, "learning_rate": 0.00019156028222641554, "loss": 0.8559, "step": 710 }, { "epoch": 0.22, "grad_norm": 0.2811089697192464, "learning_rate": 0.00019134288457493904, "loss": 0.7352, "step": 715 }, { "epoch": 0.22, "grad_norm": 0.3892901045304661, "learning_rate": 0.0001911228490388136, "loss": 0.7775, "step": 720 }, { "epoch": 0.22, "grad_norm": 0.32835288605201257, "learning_rate": 0.00019090018197242972, "loss": 0.8125, "step": 725 }, { "epoch": 0.22, "grad_norm": 0.3396081375822814, "learning_rate": 0.00019067488980617384, "loss": 0.8498, "step": 730 }, { "epoch": 0.23, "grad_norm": 0.2725077722420475, "learning_rate": 0.00019044697904624226, "loss": 0.8652, "step": 735 }, { "epoch": 0.23, "grad_norm": 0.26882238969800315, "learning_rate": 0.0001902164562744536, "loss": 0.8316, "step": 740 }, { "epoch": 0.23, "grad_norm": 0.2942779783614407, "learning_rate": 0.00018998332814805852, "loss": 0.8937, "step": 745 }, { "epoch": 0.23, "grad_norm": 0.23318828090848456, "learning_rate": 0.0001897476013995476, "loss": 0.7247, "step": 750 }, { "epoch": 0.23, "grad_norm": 0.3207657739253934, "learning_rate": 0.00018950928283645676, "loss": 0.8168, "step": 755 }, { "epoch": 0.23, "grad_norm": 0.27343106758276103, "learning_rate": 0.00018926837934117084, "loss": 0.7436, "step": 760 }, { "epoch": 0.24, "grad_norm": 0.2380153711122644, "learning_rate": 0.0001890248978707246, "loss": 0.845, "step": 765 }, { "epoch": 0.24, "grad_norm": 0.28205742315414456, "learning_rate": 0.00018877884545660215, "loss": 0.8329, "step": 770 }, { "epoch": 0.24, "grad_norm": 0.248001179467568, "learning_rate": 0.0001885302292045336, "loss": 0.8322, "step": 775 }, { "epoch": 0.24, "grad_norm": 0.512811292371171, "learning_rate": 0.0001882790562942899, "loss": 0.7778, "step": 780 }, { "epoch": 0.24, "grad_norm": 0.3027018813313023, "learning_rate": 0.00018802533397947567, "loss": 0.8338, "step": 785 }, { "epoch": 0.24, "grad_norm": 0.2940436466186521, "learning_rate": 0.00018776906958731953, "loss": 0.6823, "step": 790 }, { "epoch": 0.24, "grad_norm": 0.33833283166239086, "learning_rate": 0.00018751027051846258, "loss": 0.7669, "step": 795 }, { "epoch": 0.25, "grad_norm": 0.31248954654812844, "learning_rate": 0.00018724894424674467, "loss": 0.7851, "step": 800 }, { "epoch": 0.25, "grad_norm": 0.3080103407030162, "learning_rate": 0.00018698509831898853, "loss": 0.8465, "step": 805 }, { "epoch": 0.25, "grad_norm": 0.27928229688289624, "learning_rate": 0.00018671874035478195, "loss": 0.7708, "step": 810 }, { "epoch": 0.25, "grad_norm": 0.2926934897262978, "learning_rate": 0.00018644987804625757, "loss": 0.8816, "step": 815 }, { "epoch": 0.25, "grad_norm": 0.2564445050641534, "learning_rate": 0.00018617851915787078, "loss": 0.8748, "step": 820 }, { "epoch": 0.25, "grad_norm": 0.3167275363170148, "learning_rate": 0.0001859046715261756, "loss": 0.7955, "step": 825 }, { "epoch": 0.26, "grad_norm": 0.4082416585797153, "learning_rate": 0.00018562834305959824, "loss": 0.7464, "step": 830 }, { "epoch": 0.26, "grad_norm": 0.3030422995435233, "learning_rate": 0.0001853495417382088, "loss": 0.9046, "step": 835 }, { "epoch": 0.26, "grad_norm": 0.2536332149798187, "learning_rate": 0.00018506827561349073, "loss": 0.7143, "step": 840 }, { "epoch": 0.26, "grad_norm": 0.3272764072322209, "learning_rate": 0.00018478455280810838, "loss": 0.8358, "step": 845 }, { "epoch": 0.26, "grad_norm": 0.28310232183891465, "learning_rate": 0.00018449838151567244, "loss": 0.842, "step": 850 }, { "epoch": 0.26, "grad_norm": 0.26576658481713733, "learning_rate": 0.00018420977000050323, "loss": 0.7563, "step": 855 }, { "epoch": 0.26, "grad_norm": 0.2612196906596331, "learning_rate": 0.00018391872659739215, "loss": 0.7631, "step": 860 }, { "epoch": 0.27, "grad_norm": 0.43199033496139155, "learning_rate": 0.00018362525971136082, "loss": 0.8585, "step": 865 }, { "epoch": 0.27, "grad_norm": 0.3011184188491384, "learning_rate": 0.00018332937781741858, "loss": 0.807, "step": 870 }, { "epoch": 0.27, "grad_norm": 0.3432230727339861, "learning_rate": 0.00018303108946031747, "loss": 0.806, "step": 875 }, { "epoch": 0.27, "grad_norm": 0.28699539333378254, "learning_rate": 0.00018273040325430574, "loss": 0.8063, "step": 880 }, { "epoch": 0.27, "grad_norm": 0.2895901648006327, "learning_rate": 0.00018242732788287884, "loss": 0.7773, "step": 885 }, { "epoch": 0.27, "grad_norm": 0.30393045217103676, "learning_rate": 0.00018212187209852888, "loss": 0.7721, "step": 890 }, { "epoch": 0.28, "grad_norm": 0.4409757200332159, "learning_rate": 0.00018181404472249158, "loss": 0.805, "step": 895 }, { "epoch": 0.28, "grad_norm": 0.36679860235251033, "learning_rate": 0.00018150385464449183, "loss": 0.7759, "step": 900 }, { "epoch": 0.28, "grad_norm": 0.276840442597116, "learning_rate": 0.00018119131082248676, "loss": 0.8182, "step": 905 }, { "epoch": 0.28, "grad_norm": 0.2365689665357522, "learning_rate": 0.00018087642228240713, "loss": 0.7851, "step": 910 }, { "epoch": 0.28, "grad_norm": 0.30377473821055756, "learning_rate": 0.00018055919811789658, "loss": 0.7467, "step": 915 }, { "epoch": 0.28, "grad_norm": 0.40215196146679155, "learning_rate": 0.00018023964749004921, "loss": 0.7436, "step": 920 }, { "epoch": 0.28, "grad_norm": 0.24424508927481137, "learning_rate": 0.00017991777962714472, "loss": 0.8502, "step": 925 }, { "epoch": 0.29, "grad_norm": 0.3549699023868391, "learning_rate": 0.00017959360382438226, "loss": 0.8607, "step": 930 }, { "epoch": 0.29, "grad_norm": 0.29288526726309294, "learning_rate": 0.00017926712944361164, "loss": 0.7812, "step": 935 }, { "epoch": 0.29, "grad_norm": 0.38300023494160845, "learning_rate": 0.00017893836591306326, "loss": 0.965, "step": 940 }, { "epoch": 0.29, "grad_norm": 0.3400552848154392, "learning_rate": 0.00017860732272707565, "loss": 0.9296, "step": 945 }, { "epoch": 0.29, "grad_norm": 0.33946589539162436, "learning_rate": 0.0001782740094458214, "loss": 0.7948, "step": 950 }, { "epoch": 0.29, "grad_norm": 0.3106409161075979, "learning_rate": 0.00017793843569503096, "loss": 0.9234, "step": 955 }, { "epoch": 0.3, "grad_norm": 0.3048504659492406, "learning_rate": 0.00017760061116571472, "loss": 0.735, "step": 960 }, { "epoch": 0.3, "grad_norm": 0.3699006267760609, "learning_rate": 0.00017726054561388325, "loss": 0.8097, "step": 965 }, { "epoch": 0.3, "grad_norm": 0.3812470781886498, "learning_rate": 0.0001769182488602653, "loss": 0.7924, "step": 970 }, { "epoch": 0.3, "grad_norm": 0.31391420913653745, "learning_rate": 0.0001765737307900244, "loss": 0.8468, "step": 975 }, { "epoch": 0.3, "grad_norm": 0.2798144915599161, "learning_rate": 0.00017622700135247336, "loss": 0.7466, "step": 980 }, { "epoch": 0.3, "grad_norm": 0.4373736539748376, "learning_rate": 0.0001758780705607869, "loss": 0.7782, "step": 985 }, { "epoch": 0.3, "grad_norm": 0.31499380584195635, "learning_rate": 0.00017552694849171238, "loss": 0.7623, "step": 990 }, { "epoch": 0.31, "grad_norm": 0.34084198516003655, "learning_rate": 0.00017517364528527905, "loss": 0.7643, "step": 995 }, { "epoch": 0.31, "grad_norm": 0.220259377380433, "learning_rate": 0.00017481817114450504, "loss": 0.7041, "step": 1000 }, { "epoch": 0.31, "grad_norm": 0.3741361155303008, "learning_rate": 0.00017446053633510267, "loss": 0.8331, "step": 1005 }, { "epoch": 0.31, "grad_norm": 0.25186403738162744, "learning_rate": 0.00017410075118518207, "loss": 0.7746, "step": 1010 }, { "epoch": 0.31, "grad_norm": 0.3381680931423155, "learning_rate": 0.000173738826084953, "loss": 0.8091, "step": 1015 }, { "epoch": 0.31, "grad_norm": 0.30332385851807925, "learning_rate": 0.00017337477148642453, "loss": 0.8123, "step": 1020 }, { "epoch": 0.32, "grad_norm": 0.4189781620141866, "learning_rate": 0.0001730085979031035, "loss": 0.7662, "step": 1025 }, { "epoch": 0.32, "grad_norm": 0.20916757459764715, "learning_rate": 0.0001726403159096907, "loss": 0.7658, "step": 1030 }, { "epoch": 0.32, "grad_norm": 0.25400278359501394, "learning_rate": 0.0001722699361417755, "loss": 0.7761, "step": 1035 }, { "epoch": 0.32, "grad_norm": 0.29634266788228114, "learning_rate": 0.00017189746929552885, "loss": 0.7712, "step": 1040 }, { "epoch": 0.32, "grad_norm": 0.34416521475533296, "learning_rate": 0.00017152292612739427, "loss": 0.8657, "step": 1045 }, { "epoch": 0.32, "grad_norm": 0.26477218894974247, "learning_rate": 0.00017114631745377716, "loss": 0.7979, "step": 1050 }, { "epoch": 0.32, "grad_norm": 0.39575314169996223, "learning_rate": 0.00017076765415073252, "loss": 0.7657, "step": 1055 }, { "epoch": 0.33, "grad_norm": 0.3060703163684423, "learning_rate": 0.0001703869471536509, "loss": 0.7758, "step": 1060 }, { "epoch": 0.33, "grad_norm": 0.3041979747156484, "learning_rate": 0.00017000420745694254, "loss": 0.8641, "step": 1065 }, { "epoch": 0.33, "grad_norm": 0.2891225024964161, "learning_rate": 0.0001696194461137198, "loss": 0.8824, "step": 1070 }, { "epoch": 0.33, "grad_norm": 0.2718444108580119, "learning_rate": 0.0001692326742354781, "loss": 0.7924, "step": 1075 }, { "epoch": 0.33, "grad_norm": 0.259262158014512, "learning_rate": 0.00016884390299177492, "loss": 0.8369, "step": 1080 }, { "epoch": 0.33, "grad_norm": 0.3499348899825314, "learning_rate": 0.00016845314360990727, "loss": 0.8346, "step": 1085 }, { "epoch": 0.34, "grad_norm": 0.2715739588374178, "learning_rate": 0.00016806040737458745, "loss": 0.8032, "step": 1090 }, { "epoch": 0.34, "grad_norm": 0.31767202890667356, "learning_rate": 0.00016766570562761726, "loss": 0.7771, "step": 1095 }, { "epoch": 0.34, "grad_norm": 0.2775818452285115, "learning_rate": 0.00016726904976756024, "loss": 0.7571, "step": 1100 }, { "epoch": 0.34, "grad_norm": 0.40533408177768837, "learning_rate": 0.00016687045124941268, "loss": 0.7487, "step": 1105 }, { "epoch": 0.34, "grad_norm": 0.23186261109536282, "learning_rate": 0.0001664699215842728, "loss": 0.7442, "step": 1110 }, { "epoch": 0.34, "grad_norm": 0.39092343811338354, "learning_rate": 0.00016606747233900815, "loss": 0.8009, "step": 1115 }, { "epoch": 0.34, "grad_norm": 0.3086065524408362, "learning_rate": 0.00016566311513592188, "loss": 0.8045, "step": 1120 }, { "epoch": 0.35, "grad_norm": 0.3367565478870619, "learning_rate": 0.00016525686165241673, "loss": 0.767, "step": 1125 }, { "epoch": 0.35, "grad_norm": 0.3086208201423001, "learning_rate": 0.00016484872362065818, "loss": 0.8297, "step": 1130 }, { "epoch": 0.35, "grad_norm": 0.29118078471035397, "learning_rate": 0.0001644387128272353, "loss": 0.73, "step": 1135 }, { "epoch": 0.35, "grad_norm": 0.2276647259111584, "learning_rate": 0.00016402684111282048, "loss": 0.7594, "step": 1140 }, { "epoch": 0.35, "grad_norm": 0.31848819925406663, "learning_rate": 0.00016361312037182764, "loss": 0.7175, "step": 1145 }, { "epoch": 0.35, "grad_norm": 0.3783361351946572, "learning_rate": 0.00016319756255206856, "loss": 0.8027, "step": 1150 }, { "epoch": 0.36, "grad_norm": 0.266088499578184, "learning_rate": 0.00016278017965440787, "loss": 0.7452, "step": 1155 }, { "epoch": 0.36, "grad_norm": 0.29706113106114784, "learning_rate": 0.0001623609837324165, "loss": 0.8534, "step": 1160 }, { "epoch": 0.36, "grad_norm": 0.3339471167119851, "learning_rate": 0.00016193998689202358, "loss": 0.8144, "step": 1165 }, { "epoch": 0.36, "grad_norm": 0.28540623867963755, "learning_rate": 0.00016151720129116686, "loss": 0.7651, "step": 1170 }, { "epoch": 0.36, "grad_norm": 0.2639670831879774, "learning_rate": 0.00016109263913944154, "loss": 0.7034, "step": 1175 }, { "epoch": 0.36, "grad_norm": 0.33170223538556726, "learning_rate": 0.00016066631269774767, "loss": 0.7217, "step": 1180 }, { "epoch": 0.36, "grad_norm": 0.3941911797947879, "learning_rate": 0.00016023823427793626, "loss": 0.7772, "step": 1185 }, { "epoch": 0.37, "grad_norm": 0.3440363876277395, "learning_rate": 0.00015980841624245335, "loss": 0.727, "step": 1190 }, { "epoch": 0.37, "grad_norm": 0.27303644578696656, "learning_rate": 0.00015937687100398343, "loss": 0.7976, "step": 1195 }, { "epoch": 0.37, "grad_norm": 0.19363446071730062, "learning_rate": 0.0001589436110250906, "loss": 0.7601, "step": 1200 }, { "epoch": 0.37, "grad_norm": 0.3220033440998593, "learning_rate": 0.00015850864881785892, "loss": 0.8059, "step": 1205 }, { "epoch": 0.37, "grad_norm": 0.24875841441874524, "learning_rate": 0.00015807199694353093, "loss": 0.7766, "step": 1210 }, { "epoch": 0.37, "grad_norm": 0.27144712486585415, "learning_rate": 0.000157633668012145, "loss": 0.9517, "step": 1215 }, { "epoch": 0.38, "grad_norm": 0.29984291637849303, "learning_rate": 0.00015719367468217102, "loss": 0.7078, "step": 1220 }, { "epoch": 0.38, "grad_norm": 0.42717086324378223, "learning_rate": 0.00015675202966014502, "loss": 0.6811, "step": 1225 }, { "epoch": 0.38, "grad_norm": 0.34151608330461036, "learning_rate": 0.0001563087457003021, "loss": 0.7748, "step": 1230 }, { "epoch": 0.38, "grad_norm": 0.3709398696526256, "learning_rate": 0.0001558638356042081, "loss": 0.7182, "step": 1235 }, { "epoch": 0.38, "grad_norm": 0.34922444682335496, "learning_rate": 0.00015541731222038998, "loss": 0.8094, "step": 1240 }, { "epoch": 0.38, "grad_norm": 0.3980009403514347, "learning_rate": 0.00015496918844396467, "loss": 0.8039, "step": 1245 }, { "epoch": 0.38, "grad_norm": 0.3512099804126243, "learning_rate": 0.00015451947721626676, "loss": 0.79, "step": 1250 }, { "epoch": 0.39, "grad_norm": 0.40719330570424017, "learning_rate": 0.00015406819152447474, "loss": 0.6692, "step": 1255 }, { "epoch": 0.39, "grad_norm": 0.32754689412178806, "learning_rate": 0.0001536153444012359, "loss": 0.7442, "step": 1260 }, { "epoch": 0.39, "grad_norm": 0.3468017225536143, "learning_rate": 0.00015316094892428995, "loss": 0.7848, "step": 1265 }, { "epoch": 0.39, "grad_norm": 0.3434551451114442, "learning_rate": 0.00015270501821609158, "loss": 0.7438, "step": 1270 }, { "epoch": 0.39, "grad_norm": 0.34114725338586066, "learning_rate": 0.00015224756544343114, "loss": 0.6742, "step": 1275 }, { "epoch": 0.39, "grad_norm": 0.35582931502840126, "learning_rate": 0.00015178860381705457, "loss": 0.6642, "step": 1280 }, { "epoch": 0.4, "grad_norm": 0.3951492370139673, "learning_rate": 0.00015132814659128205, "loss": 0.7963, "step": 1285 }, { "epoch": 0.4, "grad_norm": 0.3017429849876104, "learning_rate": 0.00015086620706362486, "loss": 0.7752, "step": 1290 }, { "epoch": 0.4, "grad_norm": 0.2626021589785786, "learning_rate": 0.00015040279857440176, "loss": 0.7782, "step": 1295 }, { "epoch": 0.4, "grad_norm": 0.33978693007893296, "learning_rate": 0.0001499379345063534, "loss": 0.7799, "step": 1300 }, { "epoch": 0.4, "grad_norm": 0.3251454729934429, "learning_rate": 0.00014947162828425606, "loss": 0.7907, "step": 1305 }, { "epoch": 0.4, "grad_norm": 0.46699196417323946, "learning_rate": 0.00014900389337453392, "loss": 0.8757, "step": 1310 }, { "epoch": 0.4, "grad_norm": 0.30759216383899046, "learning_rate": 0.00014853474328487, "loss": 0.8248, "step": 1315 }, { "epoch": 0.41, "grad_norm": 0.5240595026063802, "learning_rate": 0.00014806419156381632, "loss": 0.8153, "step": 1320 }, { "epoch": 0.41, "grad_norm": 0.2984711481085597, "learning_rate": 0.0001475922518004025, "loss": 0.8307, "step": 1325 }, { "epoch": 0.41, "grad_norm": 0.34139646796135426, "learning_rate": 0.00014711893762374322, "loss": 0.7983, "step": 1330 }, { "epoch": 0.41, "grad_norm": 0.2571926484335572, "learning_rate": 0.00014664426270264493, "loss": 0.6837, "step": 1335 }, { "epoch": 0.41, "grad_norm": 0.3257702661543834, "learning_rate": 0.00014616824074521075, "loss": 0.7656, "step": 1340 }, { "epoch": 0.41, "grad_norm": 0.5681687224975429, "learning_rate": 0.00014569088549844488, "loss": 0.8412, "step": 1345 }, { "epoch": 0.42, "grad_norm": 0.3442468618645148, "learning_rate": 0.00014521221074785542, "loss": 0.7408, "step": 1350 }, { "epoch": 0.42, "grad_norm": 0.3889043102333772, "learning_rate": 0.00014473223031705637, "loss": 0.7891, "step": 1355 }, { "epoch": 0.42, "grad_norm": 0.3512289539889666, "learning_rate": 0.0001442509580673684, "loss": 0.7438, "step": 1360 }, { "epoch": 0.42, "grad_norm": 0.3124271122113035, "learning_rate": 0.00014376840789741838, "loss": 0.7047, "step": 1365 }, { "epoch": 0.42, "grad_norm": 0.2200391690908901, "learning_rate": 0.00014328459374273833, "loss": 0.7432, "step": 1370 }, { "epoch": 0.42, "grad_norm": 0.32400034100164815, "learning_rate": 0.00014279952957536266, "loss": 0.8155, "step": 1375 }, { "epoch": 0.42, "grad_norm": 0.3003484274407438, "learning_rate": 0.00014231322940342492, "loss": 0.7521, "step": 1380 }, { "epoch": 0.43, "grad_norm": 0.4116598695778175, "learning_rate": 0.00014182570727075308, "loss": 0.8548, "step": 1385 }, { "epoch": 0.43, "grad_norm": 0.42125576864395314, "learning_rate": 0.00014133697725646403, "loss": 0.8552, "step": 1390 }, { "epoch": 0.43, "grad_norm": 0.32506737333947255, "learning_rate": 0.000140847053474557, "loss": 0.7796, "step": 1395 }, { "epoch": 0.43, "grad_norm": 0.3558852515623043, "learning_rate": 0.00014035595007350592, "loss": 0.782, "step": 1400 }, { "epoch": 0.43, "grad_norm": 0.32892065566412354, "learning_rate": 0.00013986368123585093, "loss": 0.7912, "step": 1405 }, { "epoch": 0.43, "grad_norm": 0.3309987342740096, "learning_rate": 0.00013937026117778867, "loss": 0.7852, "step": 1410 }, { "epoch": 0.44, "grad_norm": 0.317076816745732, "learning_rate": 0.00013887570414876176, "loss": 0.8792, "step": 1415 }, { "epoch": 0.44, "grad_norm": 0.3888229597038326, "learning_rate": 0.00013838002443104742, "loss": 0.7537, "step": 1420 }, { "epoch": 0.44, "grad_norm": 0.3505522947043339, "learning_rate": 0.00013788323633934484, "loss": 0.7765, "step": 1425 }, { "epoch": 0.44, "grad_norm": 0.30255809120744814, "learning_rate": 0.0001373853542203619, "loss": 0.7445, "step": 1430 }, { "epoch": 0.44, "grad_norm": 0.38394599313950495, "learning_rate": 0.00013688639245240078, "loss": 0.717, "step": 1435 }, { "epoch": 0.44, "grad_norm": 0.3546082273774911, "learning_rate": 0.00013638636544494287, "loss": 0.7088, "step": 1440 }, { "epoch": 0.44, "grad_norm": 0.46456400202121617, "learning_rate": 0.00013588528763823233, "loss": 0.6481, "step": 1445 }, { "epoch": 0.45, "grad_norm": 0.38142306418882993, "learning_rate": 0.0001353831735028595, "loss": 0.8121, "step": 1450 }, { "epoch": 0.45, "grad_norm": 0.34062042874830745, "learning_rate": 0.00013488003753934263, "loss": 0.7098, "step": 1455 }, { "epoch": 0.45, "grad_norm": 0.19799193048705183, "learning_rate": 0.0001343758942777094, "loss": 0.6883, "step": 1460 }, { "epoch": 0.45, "grad_norm": 0.3696985192619358, "learning_rate": 0.000133870758277077, "loss": 0.8092, "step": 1465 }, { "epoch": 0.45, "grad_norm": 0.2874954359019885, "learning_rate": 0.00013336464412523207, "loss": 0.8209, "step": 1470 }, { "epoch": 0.45, "grad_norm": 0.3592024936010695, "learning_rate": 0.000132857566438209, "loss": 0.854, "step": 1475 }, { "epoch": 0.46, "grad_norm": 0.29409773858597665, "learning_rate": 0.00013234953985986824, "loss": 0.798, "step": 1480 }, { "epoch": 0.46, "grad_norm": 0.2415718204855592, "learning_rate": 0.0001318405790614731, "loss": 0.7382, "step": 1485 }, { "epoch": 0.46, "grad_norm": 0.2584643780619029, "learning_rate": 0.0001313306987412661, "loss": 0.8092, "step": 1490 }, { "epoch": 0.46, "grad_norm": 0.34126538154076436, "learning_rate": 0.00013081991362404475, "loss": 0.789, "step": 1495 }, { "epoch": 0.46, "grad_norm": 0.32753475635130697, "learning_rate": 0.00013030823846073595, "loss": 0.8413, "step": 1500 }, { "epoch": 0.46, "grad_norm": 0.3285555673315335, "learning_rate": 0.00012979568802797022, "loss": 0.7092, "step": 1505 }, { "epoch": 0.46, "grad_norm": 0.2947608781251718, "learning_rate": 0.00012928227712765504, "loss": 0.645, "step": 1510 }, { "epoch": 0.47, "grad_norm": 0.33949478474040173, "learning_rate": 0.00012876802058654714, "loss": 0.804, "step": 1515 }, { "epoch": 0.47, "grad_norm": 0.43727181136357957, "learning_rate": 0.0001282529332558245, "loss": 0.8041, "step": 1520 }, { "epoch": 0.47, "grad_norm": 0.3609023630640718, "learning_rate": 0.00012773703001065737, "loss": 0.8356, "step": 1525 }, { "epoch": 0.47, "grad_norm": 0.3494948390700119, "learning_rate": 0.00012722032574977881, "loss": 0.7872, "step": 1530 }, { "epoch": 0.47, "grad_norm": 0.3275549957683315, "learning_rate": 0.0001267028353950543, "loss": 0.7883, "step": 1535 }, { "epoch": 0.47, "grad_norm": 0.2434171834573686, "learning_rate": 0.00012618457389105094, "loss": 0.7766, "step": 1540 }, { "epoch": 0.48, "grad_norm": 0.35813509273993893, "learning_rate": 0.00012566555620460569, "loss": 0.7723, "step": 1545 }, { "epoch": 0.48, "grad_norm": 0.3850234800177591, "learning_rate": 0.00012514579732439323, "loss": 0.7127, "step": 1550 }, { "epoch": 0.48, "grad_norm": 0.2990175481928644, "learning_rate": 0.00012462531226049335, "loss": 0.8027, "step": 1555 }, { "epoch": 0.48, "grad_norm": 0.26743125802211676, "learning_rate": 0.00012410411604395696, "loss": 0.7775, "step": 1560 }, { "epoch": 0.48, "grad_norm": 0.3003015429775997, "learning_rate": 0.00012358222372637248, "loss": 0.8003, "step": 1565 }, { "epoch": 0.48, "grad_norm": 0.25952231751732324, "learning_rate": 0.00012305965037943096, "loss": 0.7946, "step": 1570 }, { "epoch": 0.48, "grad_norm": 0.3571723160585395, "learning_rate": 0.00012253641109449074, "loss": 0.7369, "step": 1575 }, { "epoch": 0.49, "grad_norm": 0.3502660576927713, "learning_rate": 0.00012201252098214186, "loss": 0.8105, "step": 1580 }, { "epoch": 0.49, "grad_norm": 0.3925450057088276, "learning_rate": 0.00012148799517176948, "loss": 0.7664, "step": 1585 }, { "epoch": 0.49, "grad_norm": 0.2894085765012847, "learning_rate": 0.00012096284881111711, "loss": 0.8213, "step": 1590 }, { "epoch": 0.49, "grad_norm": 0.29374369830200575, "learning_rate": 0.00012043709706584902, "loss": 0.7723, "step": 1595 }, { "epoch": 0.49, "grad_norm": 0.2863311083269218, "learning_rate": 0.00011991075511911236, "loss": 0.696, "step": 1600 }, { "epoch": 0.49, "grad_norm": 0.3036662438900221, "learning_rate": 0.00011938383817109868, "loss": 0.8753, "step": 1605 }, { "epoch": 0.5, "grad_norm": 0.3020605197833583, "learning_rate": 0.00011885636143860492, "loss": 0.8759, "step": 1610 }, { "epoch": 0.5, "grad_norm": 0.3639681427966891, "learning_rate": 0.00011832834015459404, "loss": 0.8606, "step": 1615 }, { "epoch": 0.5, "grad_norm": 0.37953818216433793, "learning_rate": 0.00011779978956775506, "loss": 0.7051, "step": 1620 }, { "epoch": 0.5, "grad_norm": 0.3184781493318525, "learning_rate": 0.00011727072494206262, "loss": 0.7916, "step": 1625 }, { "epoch": 0.5, "grad_norm": 0.35142683733387886, "learning_rate": 0.00011674116155633637, "loss": 0.8831, "step": 1630 }, { "epoch": 0.5, "grad_norm": 0.3117830556752173, "learning_rate": 0.00011621111470379951, "loss": 0.8306, "step": 1635 }, { "epoch": 0.5, "grad_norm": 0.4495145775092123, "learning_rate": 0.00011568059969163734, "loss": 0.7767, "step": 1640 }, { "epoch": 0.51, "grad_norm": 0.29751872220308234, "learning_rate": 0.00011514963184055503, "loss": 0.7627, "step": 1645 }, { "epoch": 0.51, "grad_norm": 0.5069779219255514, "learning_rate": 0.00011461822648433527, "loss": 0.7007, "step": 1650 }, { "epoch": 0.51, "grad_norm": 0.3685939765535684, "learning_rate": 0.00011408639896939548, "loss": 0.7903, "step": 1655 }, { "epoch": 0.51, "grad_norm": 0.35043401596057283, "learning_rate": 0.0001135541646543445, "loss": 0.8195, "step": 1660 }, { "epoch": 0.51, "grad_norm": 0.43437482478281425, "learning_rate": 0.00011302153890953917, "loss": 0.7474, "step": 1665 }, { "epoch": 0.51, "grad_norm": 0.424740143766434, "learning_rate": 0.00011248853711664037, "loss": 0.7487, "step": 1670 }, { "epoch": 0.52, "grad_norm": 0.4206812162224315, "learning_rate": 0.00011195517466816892, "loss": 0.7663, "step": 1675 }, { "epoch": 0.52, "grad_norm": 0.3528935885168195, "learning_rate": 0.00011142146696706086, "loss": 0.7075, "step": 1680 }, { "epoch": 0.52, "grad_norm": 0.3022231077132756, "learning_rate": 0.00011088742942622285, "loss": 0.7005, "step": 1685 }, { "epoch": 0.52, "grad_norm": 0.24230122499008153, "learning_rate": 0.00011035307746808696, "loss": 0.7103, "step": 1690 }, { "epoch": 0.52, "grad_norm": 0.3274240826179655, "learning_rate": 0.00010981842652416525, "loss": 0.7585, "step": 1695 }, { "epoch": 0.52, "grad_norm": 0.3226818393613587, "learning_rate": 0.00010928349203460421, "loss": 0.6873, "step": 1700 }, { "epoch": 0.52, "grad_norm": 0.42160428435071856, "learning_rate": 0.00010874828944773884, "loss": 0.7033, "step": 1705 }, { "epoch": 0.53, "grad_norm": 0.3673664828653425, "learning_rate": 0.0001082128342196464, "loss": 0.7568, "step": 1710 }, { "epoch": 0.53, "grad_norm": 0.35369832231150045, "learning_rate": 0.00010767714181370032, "loss": 0.7459, "step": 1715 }, { "epoch": 0.53, "grad_norm": 0.3648184560113796, "learning_rate": 0.00010714122770012332, "loss": 0.7744, "step": 1720 }, { "epoch": 0.53, "grad_norm": 0.4505619268522559, "learning_rate": 0.0001066051073555409, "loss": 0.7257, "step": 1725 }, { "epoch": 0.53, "grad_norm": 0.43443202284742777, "learning_rate": 0.00010606879626253425, "loss": 0.7188, "step": 1730 }, { "epoch": 0.53, "grad_norm": 0.3553258041770261, "learning_rate": 0.00010553230990919316, "loss": 0.7459, "step": 1735 }, { "epoch": 0.54, "grad_norm": 0.4661654069610038, "learning_rate": 0.00010499566378866879, "loss": 0.7836, "step": 1740 }, { "epoch": 0.54, "grad_norm": 0.37584682327967367, "learning_rate": 0.00010445887339872613, "loss": 0.7602, "step": 1745 }, { "epoch": 0.54, "grad_norm": 0.39145966702225243, "learning_rate": 0.00010392195424129663, "loss": 0.7742, "step": 1750 }, { "epoch": 0.54, "grad_norm": 0.3393184813627934, "learning_rate": 0.0001033849218220303, "loss": 0.7641, "step": 1755 }, { "epoch": 0.54, "grad_norm": 0.3324768161048583, "learning_rate": 0.00010284779164984808, "loss": 0.7084, "step": 1760 }, { "epoch": 0.54, "grad_norm": 0.4536643875844217, "learning_rate": 0.00010231057923649395, "loss": 0.7546, "step": 1765 }, { "epoch": 0.54, "grad_norm": 0.3383053206020978, "learning_rate": 0.00010177330009608679, "loss": 0.7897, "step": 1770 }, { "epoch": 0.55, "grad_norm": 0.3291950908164226, "learning_rate": 0.00010123596974467267, "loss": 0.837, "step": 1775 }, { "epoch": 0.55, "grad_norm": 0.40591985948567333, "learning_rate": 0.00010069860369977644, "loss": 0.7881, "step": 1780 }, { "epoch": 0.55, "grad_norm": 0.3947516646576018, "learning_rate": 0.0001001612174799538, "loss": 0.7554, "step": 1785 }, { "epoch": 0.55, "grad_norm": 0.48999744278201957, "learning_rate": 9.962382660434302e-05, "loss": 0.7049, "step": 1790 }, { "epoch": 0.55, "grad_norm": 0.27763093083945417, "learning_rate": 9.908644659221692e-05, "loss": 0.7906, "step": 1795 }, { "epoch": 0.55, "grad_norm": 0.36597705216081855, "learning_rate": 9.854909296253454e-05, "loss": 0.7717, "step": 1800 }, { "epoch": 0.56, "grad_norm": 0.361260421586406, "learning_rate": 9.801178123349298e-05, "loss": 0.8052, "step": 1805 }, { "epoch": 0.56, "grad_norm": 0.40479237805543866, "learning_rate": 9.747452692207944e-05, "loss": 0.6528, "step": 1810 }, { "epoch": 0.56, "grad_norm": 0.3337778576325595, "learning_rate": 9.693734554362274e-05, "loss": 0.7956, "step": 1815 }, { "epoch": 0.56, "grad_norm": 0.352206821846608, "learning_rate": 9.640025261134566e-05, "loss": 0.8004, "step": 1820 }, { "epoch": 0.56, "grad_norm": 0.3007022043507481, "learning_rate": 9.586326363591667e-05, "loss": 0.6586, "step": 1825 }, { "epoch": 0.56, "grad_norm": 0.32806169397898344, "learning_rate": 9.532639412500214e-05, "loss": 0.6469, "step": 1830 }, { "epoch": 0.56, "grad_norm": 0.2948353441185244, "learning_rate": 9.478965958281831e-05, "loss": 0.772, "step": 1835 }, { "epoch": 0.57, "grad_norm": 0.29433563822493, "learning_rate": 9.425307550968379e-05, "loss": 0.7587, "step": 1840 }, { "epoch": 0.57, "grad_norm": 0.2929390819806653, "learning_rate": 9.371665740157177e-05, "loss": 0.7641, "step": 1845 }, { "epoch": 0.57, "grad_norm": 0.36587748924129493, "learning_rate": 9.318042074966249e-05, "loss": 0.7423, "step": 1850 }, { "epoch": 0.57, "grad_norm": 0.3157914575950516, "learning_rate": 9.2644381039896e-05, "loss": 0.7802, "step": 1855 }, { "epoch": 0.57, "grad_norm": 0.3083734823157643, "learning_rate": 9.210855375252488e-05, "loss": 0.6806, "step": 1860 }, { "epoch": 0.57, "grad_norm": 0.37273540588458964, "learning_rate": 9.157295436166706e-05, "loss": 0.8018, "step": 1865 }, { "epoch": 0.58, "grad_norm": 0.2891457780890995, "learning_rate": 9.103759833485921e-05, "loss": 0.7924, "step": 1870 }, { "epoch": 0.58, "grad_norm": 0.31880678342943103, "learning_rate": 9.050250113260988e-05, "loss": 0.6784, "step": 1875 }, { "epoch": 0.58, "grad_norm": 0.38652296771171907, "learning_rate": 8.996767820795295e-05, "loss": 0.8423, "step": 1880 }, { "epoch": 0.58, "grad_norm": 0.36151176691802633, "learning_rate": 8.943314500600153e-05, "loss": 0.7657, "step": 1885 }, { "epoch": 0.58, "grad_norm": 0.3630982909299649, "learning_rate": 8.889891696350182e-05, "loss": 0.7316, "step": 1890 }, { "epoch": 0.58, "grad_norm": 0.346561432187551, "learning_rate": 8.836500950838743e-05, "loss": 0.7937, "step": 1895 }, { "epoch": 0.58, "grad_norm": 0.3552138882564471, "learning_rate": 8.783143805933356e-05, "loss": 0.7688, "step": 1900 }, { "epoch": 0.59, "grad_norm": 0.3883059056946058, "learning_rate": 8.729821802531212e-05, "loss": 0.8022, "step": 1905 }, { "epoch": 0.59, "grad_norm": 0.3654198034463761, "learning_rate": 8.676536480514646e-05, "loss": 0.6797, "step": 1910 }, { "epoch": 0.59, "grad_norm": 0.3437677388699394, "learning_rate": 8.623289378706665e-05, "loss": 0.8503, "step": 1915 }, { "epoch": 0.59, "grad_norm": 0.23831382342326574, "learning_rate": 8.570082034826525e-05, "loss": 0.725, "step": 1920 }, { "epoch": 0.59, "grad_norm": 0.4978109719850785, "learning_rate": 8.51691598544532e-05, "loss": 0.8173, "step": 1925 }, { "epoch": 0.59, "grad_norm": 0.3849488236961706, "learning_rate": 8.463792765941598e-05, "loss": 0.7935, "step": 1930 }, { "epoch": 0.6, "grad_norm": 0.2564830548422943, "learning_rate": 8.410713910457022e-05, "loss": 0.7616, "step": 1935 }, { "epoch": 0.6, "grad_norm": 0.3883061081379379, "learning_rate": 8.357680951852074e-05, "loss": 0.7351, "step": 1940 }, { "epoch": 0.6, "grad_norm": 0.3506058449194061, "learning_rate": 8.30469542166179e-05, "loss": 0.7693, "step": 1945 }, { "epoch": 0.6, "grad_norm": 0.30929676711753123, "learning_rate": 8.25175885005151e-05, "loss": 0.7873, "step": 1950 }, { "epoch": 0.6, "grad_norm": 0.40865576499509826, "learning_rate": 8.19887276577271e-05, "loss": 0.8042, "step": 1955 }, { "epoch": 0.6, "grad_norm": 0.5195995711187212, "learning_rate": 8.146038696118855e-05, "loss": 0.7973, "step": 1960 }, { "epoch": 0.6, "grad_norm": 0.42197209504725125, "learning_rate": 8.093258166881262e-05, "loss": 0.7533, "step": 1965 }, { "epoch": 0.61, "grad_norm": 0.30422958535775585, "learning_rate": 8.04053270230508e-05, "loss": 0.779, "step": 1970 }, { "epoch": 0.61, "grad_norm": 0.3455509672202836, "learning_rate": 7.987863825045234e-05, "loss": 0.8111, "step": 1975 }, { "epoch": 0.61, "grad_norm": 0.23483119931888347, "learning_rate": 7.935253056122478e-05, "loss": 0.6691, "step": 1980 }, { "epoch": 0.61, "grad_norm": 0.5096365631799179, "learning_rate": 7.882701914879454e-05, "loss": 0.8173, "step": 1985 }, { "epoch": 0.61, "grad_norm": 0.27454232980225196, "learning_rate": 7.83021191893682e-05, "loss": 0.8318, "step": 1990 }, { "epoch": 0.61, "grad_norm": 0.25340939505328935, "learning_rate": 7.777784584149431e-05, "loss": 0.7749, "step": 1995 }, { "epoch": 0.62, "grad_norm": 0.41218592002469173, "learning_rate": 7.725421424562541e-05, "loss": 0.7486, "step": 2000 }, { "epoch": 0.62, "grad_norm": 0.3166340386687328, "learning_rate": 7.673123952368105e-05, "loss": 0.7371, "step": 2005 }, { "epoch": 0.62, "grad_norm": 0.37125596925100546, "learning_rate": 7.620893677861097e-05, "loss": 0.8205, "step": 2010 }, { "epoch": 0.62, "grad_norm": 0.389340046711289, "learning_rate": 7.568732109395882e-05, "loss": 0.8052, "step": 2015 }, { "epoch": 0.62, "grad_norm": 0.297511489273485, "learning_rate": 7.516640753342677e-05, "loss": 0.8116, "step": 2020 }, { "epoch": 0.62, "grad_norm": 0.3551932787364764, "learning_rate": 7.464621114044041e-05, "loss": 0.7256, "step": 2025 }, { "epoch": 0.62, "grad_norm": 0.32976411229944613, "learning_rate": 7.41267469377143e-05, "loss": 0.7779, "step": 2030 }, { "epoch": 0.63, "grad_norm": 0.36641305934003204, "learning_rate": 7.360802992681803e-05, "loss": 0.7769, "step": 2035 }, { "epoch": 0.63, "grad_norm": 0.32848405433392913, "learning_rate": 7.309007508774319e-05, "loss": 0.7449, "step": 2040 }, { "epoch": 0.63, "grad_norm": 0.3818192607183943, "learning_rate": 7.257289737847067e-05, "loss": 0.7298, "step": 2045 }, { "epoch": 0.63, "grad_norm": 0.3956889666509929, "learning_rate": 7.205651173453859e-05, "loss": 0.7438, "step": 2050 }, { "epoch": 0.63, "grad_norm": 0.3186630869883142, "learning_rate": 7.154093306861115e-05, "loss": 0.8091, "step": 2055 }, { "epoch": 0.63, "grad_norm": 0.33431044470129717, "learning_rate": 7.102617627004795e-05, "loss": 0.7518, "step": 2060 }, { "epoch": 0.64, "grad_norm": 0.32535315210688565, "learning_rate": 7.051225620447375e-05, "loss": 0.8321, "step": 2065 }, { "epoch": 0.64, "grad_norm": 0.4508357834061351, "learning_rate": 6.999918771334952e-05, "loss": 0.7282, "step": 2070 }, { "epoch": 0.64, "grad_norm": 0.3512613827114045, "learning_rate": 6.948698561354363e-05, "loss": 0.7826, "step": 2075 }, { "epoch": 0.64, "grad_norm": 0.49837144853088533, "learning_rate": 6.897566469690397e-05, "loss": 0.795, "step": 2080 }, { "epoch": 0.64, "grad_norm": 0.4003697684296247, "learning_rate": 6.846523972983085e-05, "loss": 0.7951, "step": 2085 }, { "epoch": 0.64, "grad_norm": 0.3815043269956921, "learning_rate": 6.795572545285044e-05, "loss": 0.826, "step": 2090 }, { "epoch": 0.65, "grad_norm": 0.3291683320960395, "learning_rate": 6.74471365801893e-05, "loss": 0.7708, "step": 2095 }, { "epoch": 0.65, "grad_norm": 0.41704151240520887, "learning_rate": 6.693948779934911e-05, "loss": 0.7386, "step": 2100 }, { "epoch": 0.65, "grad_norm": 0.463623793653466, "learning_rate": 6.643279377068283e-05, "loss": 0.7713, "step": 2105 }, { "epoch": 0.65, "grad_norm": 0.3658375594477012, "learning_rate": 6.592706912697124e-05, "loss": 0.7786, "step": 2110 }, { "epoch": 0.65, "grad_norm": 0.4059447230155753, "learning_rate": 6.542232847300015e-05, "loss": 0.798, "step": 2115 }, { "epoch": 0.65, "grad_norm": 0.3927246312306725, "learning_rate": 6.491858638513899e-05, "loss": 0.8166, "step": 2120 }, { "epoch": 0.65, "grad_norm": 0.35333239481209877, "learning_rate": 6.441585741091955e-05, "loss": 0.7539, "step": 2125 }, { "epoch": 0.66, "grad_norm": 0.3623671701689697, "learning_rate": 6.391415606861608e-05, "loss": 0.8162, "step": 2130 }, { "epoch": 0.66, "grad_norm": 0.430064026231262, "learning_rate": 6.341349684682576e-05, "loss": 0.7593, "step": 2135 }, { "epoch": 0.66, "grad_norm": 0.30707444492883157, "learning_rate": 6.291389420405062e-05, "loss": 0.7593, "step": 2140 }, { "epoch": 0.66, "grad_norm": 0.29281767006409765, "learning_rate": 6.241536256827978e-05, "loss": 0.7074, "step": 2145 }, { "epoch": 0.66, "grad_norm": 0.3397684880342664, "learning_rate": 6.191791633657268e-05, "loss": 0.7077, "step": 2150 }, { "epoch": 0.66, "grad_norm": 0.35070530863747645, "learning_rate": 6.142156987464367e-05, "loss": 0.7888, "step": 2155 }, { "epoch": 0.67, "grad_norm": 0.31884184127852916, "learning_rate": 6.0926337516446784e-05, "loss": 0.8045, "step": 2160 }, { "epoch": 0.67, "grad_norm": 0.34522310174070975, "learning_rate": 6.043223356376197e-05, "loss": 0.8115, "step": 2165 }, { "epoch": 0.67, "grad_norm": 0.35929303458552225, "learning_rate": 5.9939272285782066e-05, "loss": 0.8234, "step": 2170 }, { "epoch": 0.67, "grad_norm": 0.3835859771257563, "learning_rate": 5.9447467918700614e-05, "loss": 0.7295, "step": 2175 }, { "epoch": 0.67, "grad_norm": 0.33889717245375717, "learning_rate": 5.895683466530091e-05, "loss": 0.7491, "step": 2180 }, { "epoch": 0.67, "grad_norm": 0.34625485711737686, "learning_rate": 5.8467386694545635e-05, "loss": 0.7882, "step": 2185 }, { "epoch": 0.67, "grad_norm": 0.3834886156777842, "learning_rate": 5.797913814116781e-05, "loss": 0.7093, "step": 2190 }, { "epoch": 0.68, "grad_norm": 0.3892980195228429, "learning_rate": 5.7492103105262715e-05, "loss": 0.794, "step": 2195 }, { "epoch": 0.68, "grad_norm": 0.39210633693040825, "learning_rate": 5.7006295651880246e-05, "loss": 0.7566, "step": 2200 }, { "epoch": 0.68, "grad_norm": 0.3582797057469045, "learning_rate": 5.6521729810619317e-05, "loss": 0.8021, "step": 2205 }, { "epoch": 0.68, "grad_norm": 0.3542924342264584, "learning_rate": 5.603841957522227e-05, "loss": 0.756, "step": 2210 }, { "epoch": 0.68, "grad_norm": 0.36575349651181366, "learning_rate": 5.555637890317091e-05, "loss": 0.7921, "step": 2215 }, { "epoch": 0.68, "grad_norm": 0.38535314462569586, "learning_rate": 5.507562171528342e-05, "loss": 0.7781, "step": 2220 }, { "epoch": 0.69, "grad_norm": 0.39735016460723493, "learning_rate": 5.459616189531234e-05, "loss": 0.6632, "step": 2225 }, { "epoch": 0.69, "grad_norm": 0.4056677466996733, "learning_rate": 5.411801328954368e-05, "loss": 0.7334, "step": 2230 }, { "epoch": 0.69, "grad_norm": 0.42376106078500364, "learning_rate": 5.36411897063968e-05, "loss": 0.8772, "step": 2235 }, { "epoch": 0.69, "grad_norm": 0.35144323747646544, "learning_rate": 5.316570491602606e-05, "loss": 0.7793, "step": 2240 }, { "epoch": 0.69, "grad_norm": 0.3783769784963828, "learning_rate": 5.269157264992276e-05, "loss": 0.8655, "step": 2245 }, { "epoch": 0.69, "grad_norm": 0.44209683459363136, "learning_rate": 5.221880660051881e-05, "loss": 0.8032, "step": 2250 }, { "epoch": 0.69, "grad_norm": 0.4882374682401987, "learning_rate": 5.1747420420791196e-05, "loss": 0.7007, "step": 2255 }, { "epoch": 0.7, "grad_norm": 0.3237759848919934, "learning_rate": 5.127742772386786e-05, "loss": 0.7897, "step": 2260 }, { "epoch": 0.7, "grad_norm": 0.36606432111465076, "learning_rate": 5.0808842082634314e-05, "loss": 0.8064, "step": 2265 }, { "epoch": 0.7, "grad_norm": 0.40999182095921494, "learning_rate": 5.0341677029341895e-05, "loss": 0.7103, "step": 2270 }, { "epoch": 0.7, "grad_norm": 0.3272955637327382, "learning_rate": 4.987594605521682e-05, "loss": 0.6785, "step": 2275 }, { "epoch": 0.7, "grad_norm": 0.3490487483696679, "learning_rate": 4.941166261007077e-05, "loss": 0.7292, "step": 2280 }, { "epoch": 0.7, "grad_norm": 0.3433624374602265, "learning_rate": 4.894884010191211e-05, "loss": 0.6762, "step": 2285 }, { "epoch": 0.71, "grad_norm": 0.325285651430037, "learning_rate": 4.848749189655915e-05, "loss": 0.7659, "step": 2290 }, { "epoch": 0.71, "grad_norm": 0.31571712296306303, "learning_rate": 4.802763131725378e-05, "loss": 0.7736, "step": 2295 }, { "epoch": 0.71, "grad_norm": 0.35722394621197917, "learning_rate": 4.756927164427685e-05, "loss": 0.7155, "step": 2300 }, { "epoch": 0.71, "grad_norm": 0.36377115960758405, "learning_rate": 4.711242611456469e-05, "loss": 0.7326, "step": 2305 }, { "epoch": 0.71, "grad_norm": 0.4323956507240235, "learning_rate": 4.665710792132671e-05, "loss": 0.7775, "step": 2310 }, { "epoch": 0.71, "grad_norm": 0.4046174615365396, "learning_rate": 4.620333021366463e-05, "loss": 0.7643, "step": 2315 }, { "epoch": 0.71, "grad_norm": 0.3796515442594094, "learning_rate": 4.5751106096192476e-05, "loss": 0.6264, "step": 2320 }, { "epoch": 0.72, "grad_norm": 0.46275385650934453, "learning_rate": 4.5300448628658254e-05, "loss": 0.688, "step": 2325 }, { "epoch": 0.72, "grad_norm": 0.4343032727751153, "learning_rate": 4.485137082556685e-05, "loss": 0.7238, "step": 2330 }, { "epoch": 0.72, "grad_norm": 0.42658163696603996, "learning_rate": 4.4403885655804115e-05, "loss": 0.7691, "step": 2335 }, { "epoch": 0.72, "grad_norm": 0.5374930429188296, "learning_rate": 4.395800604226229e-05, "loss": 0.8293, "step": 2340 }, { "epoch": 0.72, "grad_norm": 0.5178849424936606, "learning_rate": 4.351374486146706e-05, "loss": 0.6683, "step": 2345 }, { "epoch": 0.72, "grad_norm": 0.2580914515654273, "learning_rate": 4.307111494320524e-05, "loss": 0.6295, "step": 2350 }, { "epoch": 0.73, "grad_norm": 0.38787306791139886, "learning_rate": 4.263012907015477e-05, "loss": 0.6748, "step": 2355 }, { "epoch": 0.73, "grad_norm": 0.4320242881816677, "learning_rate": 4.219079997751515e-05, "loss": 0.6848, "step": 2360 }, { "epoch": 0.73, "grad_norm": 0.39451448787145293, "learning_rate": 4.175314035264002e-05, "loss": 0.7691, "step": 2365 }, { "epoch": 0.73, "grad_norm": 0.3024402074904783, "learning_rate": 4.131716283467034e-05, "loss": 0.7674, "step": 2370 }, { "epoch": 0.73, "grad_norm": 0.39413439687935803, "learning_rate": 4.0882880014169865e-05, "loss": 0.83, "step": 2375 }, { "epoch": 0.73, "grad_norm": 0.45210137336011785, "learning_rate": 4.045030443276115e-05, "loss": 0.7117, "step": 2380 }, { "epoch": 0.73, "grad_norm": 0.4767595879985179, "learning_rate": 4.001944858276356e-05, "loss": 0.7424, "step": 2385 }, { "epoch": 0.74, "grad_norm": 0.3405433186754331, "learning_rate": 3.9590324906832435e-05, "loss": 0.7944, "step": 2390 }, { "epoch": 0.74, "grad_norm": 0.38698863622073953, "learning_rate": 3.9162945797599895e-05, "loss": 0.7486, "step": 2395 }, { "epoch": 0.74, "grad_norm": 0.33122873819033993, "learning_rate": 3.873732359731661e-05, "loss": 0.7339, "step": 2400 }, { "epoch": 0.74, "grad_norm": 0.3593189874663698, "learning_rate": 3.831347059749587e-05, "loss": 0.8308, "step": 2405 }, { "epoch": 0.74, "grad_norm": 0.32020688896625343, "learning_rate": 3.78913990385582e-05, "loss": 0.7932, "step": 2410 }, { "epoch": 0.74, "grad_norm": 0.35408477803883764, "learning_rate": 3.7471121109478004e-05, "loss": 0.6155, "step": 2415 }, { "epoch": 0.75, "grad_norm": 0.33845060971026897, "learning_rate": 3.705264894743167e-05, "loss": 0.7798, "step": 2420 }, { "epoch": 0.75, "grad_norm": 0.5583150637853672, "learning_rate": 3.6635994637446845e-05, "loss": 0.6673, "step": 2425 }, { "epoch": 0.75, "grad_norm": 0.4929136893319016, "learning_rate": 3.6221170212053766e-05, "loss": 0.8048, "step": 2430 }, { "epoch": 0.75, "grad_norm": 0.5429072066090833, "learning_rate": 3.5808187650937276e-05, "loss": 0.7507, "step": 2435 }, { "epoch": 0.75, "grad_norm": 0.46201271372482866, "learning_rate": 3.53970588805914e-05, "loss": 0.7259, "step": 2440 }, { "epoch": 0.75, "grad_norm": 0.3129228737179152, "learning_rate": 3.498779577397453e-05, "loss": 0.7715, "step": 2445 }, { "epoch": 0.75, "grad_norm": 0.32996488069999697, "learning_rate": 3.458041015016681e-05, "loss": 0.7797, "step": 2450 }, { "epoch": 0.76, "grad_norm": 0.3214733415000198, "learning_rate": 3.4174913774028485e-05, "loss": 0.7226, "step": 2455 }, { "epoch": 0.76, "grad_norm": 0.42589629830207104, "learning_rate": 3.3771318355860593e-05, "loss": 0.7218, "step": 2460 }, { "epoch": 0.76, "grad_norm": 0.3047848353555366, "learning_rate": 3.336963555106638e-05, "loss": 0.7956, "step": 2465 }, { "epoch": 0.76, "grad_norm": 0.34359240246923894, "learning_rate": 3.296987695981493e-05, "loss": 0.666, "step": 2470 }, { "epoch": 0.76, "grad_norm": 0.4144993432032501, "learning_rate": 3.257205412670605e-05, "loss": 0.7416, "step": 2475 }, { "epoch": 0.76, "grad_norm": 0.3731255551783685, "learning_rate": 3.217617854043707e-05, "loss": 0.8345, "step": 2480 }, { "epoch": 0.77, "grad_norm": 0.4050140213208934, "learning_rate": 3.178226163347067e-05, "loss": 0.7122, "step": 2485 }, { "epoch": 0.77, "grad_norm": 0.4002439491991807, "learning_rate": 3.139031478170522e-05, "loss": 0.6805, "step": 2490 }, { "epoch": 0.77, "grad_norm": 0.42917304751384394, "learning_rate": 3.100034930414585e-05, "loss": 0.733, "step": 2495 }, { "epoch": 0.77, "grad_norm": 0.4136643224459766, "learning_rate": 3.0612376462577784e-05, "loss": 0.7807, "step": 2500 }, { "epoch": 0.77, "grad_norm": 0.3890309362984174, "learning_rate": 3.0226407461241056e-05, "loss": 0.643, "step": 2505 }, { "epoch": 0.77, "grad_norm": 0.4262911129142299, "learning_rate": 2.9842453446506868e-05, "loss": 0.823, "step": 2510 }, { "epoch": 0.77, "grad_norm": 0.30650720385697705, "learning_rate": 2.9460525506555947e-05, "loss": 0.7002, "step": 2515 }, { "epoch": 0.78, "grad_norm": 0.3980581468888342, "learning_rate": 2.9080634671057892e-05, "loss": 0.7899, "step": 2520 }, { "epoch": 0.78, "grad_norm": 0.4056498788574052, "learning_rate": 2.8702791910853144e-05, "loss": 0.701, "step": 2525 }, { "epoch": 0.78, "grad_norm": 0.4548787143471859, "learning_rate": 2.832700813763579e-05, "loss": 0.8386, "step": 2530 }, { "epoch": 0.78, "grad_norm": 0.4404855029802744, "learning_rate": 2.7953294203638625e-05, "loss": 0.7813, "step": 2535 }, { "epoch": 0.78, "grad_norm": 0.40369488587415225, "learning_rate": 2.7581660901319663e-05, "loss": 0.7886, "step": 2540 }, { "epoch": 0.78, "grad_norm": 0.44025189268752124, "learning_rate": 2.7212118963050592e-05, "loss": 0.6854, "step": 2545 }, { "epoch": 0.79, "grad_norm": 0.38769461649930276, "learning_rate": 2.6844679060806666e-05, "loss": 0.7533, "step": 2550 }, { "epoch": 0.79, "grad_norm": 0.39043309802901266, "learning_rate": 2.647935180585861e-05, "loss": 0.7324, "step": 2555 }, { "epoch": 0.79, "grad_norm": 0.4031278133263342, "learning_rate": 2.6116147748466136e-05, "loss": 0.8095, "step": 2560 }, { "epoch": 0.79, "grad_norm": 0.31989753369104523, "learning_rate": 2.575507737757341e-05, "loss": 0.7635, "step": 2565 }, { "epoch": 0.79, "grad_norm": 0.3825808697754477, "learning_rate": 2.5396151120505797e-05, "loss": 0.7067, "step": 2570 }, { "epoch": 0.79, "grad_norm": 0.34014553791218866, "learning_rate": 2.5039379342669156e-05, "loss": 0.7454, "step": 2575 }, { "epoch": 0.79, "grad_norm": 0.3441742765574342, "learning_rate": 2.4684772347250194e-05, "loss": 0.7269, "step": 2580 }, { "epoch": 0.8, "grad_norm": 0.268563145640876, "learning_rate": 2.433234037491904e-05, "loss": 0.7188, "step": 2585 }, { "epoch": 0.8, "grad_norm": 0.44327204267255527, "learning_rate": 2.3982093603533485e-05, "loss": 0.6476, "step": 2590 }, { "epoch": 0.8, "grad_norm": 0.47944546289888046, "learning_rate": 2.3634042147845036e-05, "loss": 0.7312, "step": 2595 }, { "epoch": 0.8, "grad_norm": 0.3735226907786184, "learning_rate": 2.3288196059206936e-05, "loss": 0.8098, "step": 2600 }, { "epoch": 0.8, "grad_norm": 0.48173286401906895, "learning_rate": 2.2944565325283608e-05, "loss": 0.7692, "step": 2605 }, { "epoch": 0.8, "grad_norm": 0.454018593107754, "learning_rate": 2.260315986976258e-05, "loss": 0.7258, "step": 2610 }, { "epoch": 0.81, "grad_norm": 0.4102026616293206, "learning_rate": 2.2263989552067644e-05, "loss": 0.8175, "step": 2615 }, { "epoch": 0.81, "grad_norm": 0.36721285813725996, "learning_rate": 2.1927064167074197e-05, "loss": 0.7741, "step": 2620 }, { "epoch": 0.81, "grad_norm": 0.5058554441705722, "learning_rate": 2.1592393444826377e-05, "loss": 0.7664, "step": 2625 }, { "epoch": 0.81, "grad_norm": 0.41227200976610034, "learning_rate": 2.125998705025619e-05, "loss": 0.7922, "step": 2630 }, { "epoch": 0.81, "grad_norm": 0.3739806966013022, "learning_rate": 2.0929854582904095e-05, "loss": 0.6827, "step": 2635 }, { "epoch": 0.81, "grad_norm": 0.3526698208142984, "learning_rate": 2.060200557664215e-05, "loss": 0.7712, "step": 2640 }, { "epoch": 0.81, "grad_norm": 0.355624340580361, "learning_rate": 2.0276449499398352e-05, "loss": 0.7217, "step": 2645 }, { "epoch": 0.82, "grad_norm": 0.35879322380857276, "learning_rate": 1.9953195752883535e-05, "loss": 0.8101, "step": 2650 }, { "epoch": 0.82, "grad_norm": 0.4381419678532357, "learning_rate": 1.9632253672319466e-05, "loss": 0.7784, "step": 2655 }, { "epoch": 0.82, "grad_norm": 0.3338214481525901, "learning_rate": 1.9313632526169713e-05, "loss": 0.7633, "step": 2660 }, { "epoch": 0.82, "grad_norm": 0.3419374997650999, "learning_rate": 1.899734151587157e-05, "loss": 0.6726, "step": 2665 }, { "epoch": 0.82, "grad_norm": 0.39283447932176424, "learning_rate": 1.868338977557058e-05, "loss": 0.7787, "step": 2670 }, { "epoch": 0.82, "grad_norm": 0.37003709150492736, "learning_rate": 1.837178637185666e-05, "loss": 0.7466, "step": 2675 }, { "epoch": 0.83, "grad_norm": 0.3961613001539733, "learning_rate": 1.8062540303502284e-05, "loss": 0.7097, "step": 2680 }, { "epoch": 0.83, "grad_norm": 0.35142355910690376, "learning_rate": 1.7755660501202565e-05, "loss": 0.6774, "step": 2685 }, { "epoch": 0.83, "grad_norm": 0.41038393576069904, "learning_rate": 1.745115582731749e-05, "loss": 0.7496, "step": 2690 }, { "epoch": 0.83, "grad_norm": 0.39409588845344945, "learning_rate": 1.7149035075615794e-05, "loss": 0.7187, "step": 2695 }, { "epoch": 0.83, "grad_norm": 0.44791745932431604, "learning_rate": 1.6849306971021116e-05, "loss": 0.7898, "step": 2700 }, { "epoch": 0.83, "grad_norm": 0.3525758016199936, "learning_rate": 1.6551980169360005e-05, "loss": 0.7511, "step": 2705 }, { "epoch": 0.83, "grad_norm": 0.36563645526797983, "learning_rate": 1.6257063257111938e-05, "loss": 0.7397, "step": 2710 }, { "epoch": 0.84, "grad_norm": 0.3351064266182499, "learning_rate": 1.596456475116147e-05, "loss": 0.7379, "step": 2715 }, { "epoch": 0.84, "grad_norm": 0.5068000232220052, "learning_rate": 1.567449309855199e-05, "loss": 0.751, "step": 2720 }, { "epoch": 0.84, "grad_norm": 0.43979138745697033, "learning_rate": 1.5386856676242146e-05, "loss": 0.8085, "step": 2725 }, { "epoch": 0.84, "grad_norm": 0.356641089597573, "learning_rate": 1.5101663790863596e-05, "loss": 0.6256, "step": 2730 }, { "epoch": 0.84, "grad_norm": 0.44817921454892296, "learning_rate": 1.4818922678481429e-05, "loss": 0.7675, "step": 2735 }, { "epoch": 0.84, "grad_norm": 0.38078400449692273, "learning_rate": 1.4538641504355965e-05, "loss": 0.689, "step": 2740 }, { "epoch": 0.85, "grad_norm": 0.39634310148432367, "learning_rate": 1.4260828362707301e-05, "loss": 0.7727, "step": 2745 }, { "epoch": 0.85, "grad_norm": 0.3227302452177864, "learning_rate": 1.3985491276481323e-05, "loss": 0.6711, "step": 2750 }, { "epoch": 0.85, "grad_norm": 0.35229071646321697, "learning_rate": 1.3712638197118111e-05, "loss": 0.7711, "step": 2755 }, { "epoch": 0.85, "grad_norm": 0.3536271561860169, "learning_rate": 1.3442277004322257e-05, "loss": 0.8075, "step": 2760 }, { "epoch": 0.85, "grad_norm": 0.348867332644309, "learning_rate": 1.3174415505835436e-05, "loss": 0.7561, "step": 2765 }, { "epoch": 0.85, "grad_norm": 0.3535699706794319, "learning_rate": 1.2909061437210669e-05, "loss": 0.6532, "step": 2770 }, { "epoch": 0.85, "grad_norm": 0.29622730647422324, "learning_rate": 1.264622246158924e-05, "loss": 0.7651, "step": 2775 }, { "epoch": 0.86, "grad_norm": 0.36090199758429575, "learning_rate": 1.2385906169479167e-05, "loss": 0.8015, "step": 2780 }, { "epoch": 0.86, "grad_norm": 0.402156004342123, "learning_rate": 1.2128120078536076e-05, "loss": 0.6387, "step": 2785 }, { "epoch": 0.86, "grad_norm": 0.42634964362612304, "learning_rate": 1.1872871633346094e-05, "loss": 0.7452, "step": 2790 }, { "epoch": 0.86, "grad_norm": 0.4198409057077085, "learning_rate": 1.1620168205210869e-05, "loss": 0.7722, "step": 2795 }, { "epoch": 0.86, "grad_norm": 0.32753834269003024, "learning_rate": 1.1370017091934714e-05, "loss": 0.6906, "step": 2800 }, { "epoch": 0.86, "grad_norm": 0.32874585989735094, "learning_rate": 1.1122425517613722e-05, "loss": 0.6583, "step": 2805 }, { "epoch": 0.87, "grad_norm": 0.40652778318077754, "learning_rate": 1.0877400632427359e-05, "loss": 0.674, "step": 2810 }, { "epoch": 0.87, "grad_norm": 0.507341912219104, "learning_rate": 1.0634949512431814e-05, "loss": 0.7677, "step": 2815 }, { "epoch": 0.87, "grad_norm": 0.3696792851429222, "learning_rate": 1.0395079159355658e-05, "loss": 0.7034, "step": 2820 }, { "epoch": 0.87, "grad_norm": 0.4493375882862214, "learning_rate": 1.0157796500397699e-05, "loss": 0.7487, "step": 2825 }, { "epoch": 0.87, "grad_norm": 0.41029746791384736, "learning_rate": 9.92310838802698e-06, "loss": 0.7405, "step": 2830 }, { "epoch": 0.87, "grad_norm": 0.2972360830137905, "learning_rate": 9.691021599784711e-06, "loss": 0.6979, "step": 2835 }, { "epoch": 0.87, "grad_norm": 0.4032566104643188, "learning_rate": 9.461542838088722e-06, "loss": 0.7898, "step": 2840 }, { "epoch": 0.88, "grad_norm": 0.27918369114356745, "learning_rate": 9.23467873003977e-06, "loss": 0.8092, "step": 2845 }, { "epoch": 0.88, "grad_norm": 0.36249726345459377, "learning_rate": 9.010435827230313e-06, "loss": 0.6445, "step": 2850 }, { "epoch": 0.88, "grad_norm": 0.3964708844545875, "learning_rate": 8.788820605555082e-06, "loss": 0.7462, "step": 2855 }, { "epoch": 0.88, "grad_norm": 0.3502801511645061, "learning_rate": 8.569839465024299e-06, "loss": 0.7233, "step": 2860 }, { "epoch": 0.88, "grad_norm": 0.4033170990560745, "learning_rate": 8.35349872957869e-06, "loss": 0.8105, "step": 2865 }, { "epoch": 0.88, "grad_norm": 0.3954070605648136, "learning_rate": 8.139804646906923e-06, "loss": 0.7059, "step": 2870 }, { "epoch": 0.89, "grad_norm": 0.37474529564255277, "learning_rate": 7.928763388265181e-06, "loss": 0.8582, "step": 2875 }, { "epoch": 0.89, "grad_norm": 0.32809524672734786, "learning_rate": 7.720381048298897e-06, "loss": 0.7581, "step": 2880 }, { "epoch": 0.89, "grad_norm": 0.4149259283243587, "learning_rate": 7.5146636448668485e-06, "loss": 0.7735, "step": 2885 }, { "epoch": 0.89, "grad_norm": 0.4263926138100582, "learning_rate": 7.3116171188671865e-06, "loss": 0.8028, "step": 2890 }, { "epoch": 0.89, "grad_norm": 0.45104438156690363, "learning_rate": 7.111247334066129e-06, "loss": 0.752, "step": 2895 }, { "epoch": 0.89, "grad_norm": 0.43860221599692545, "learning_rate": 6.913560076928361e-06, "loss": 0.7119, "step": 2900 }, { "epoch": 0.89, "grad_norm": 0.3597066697079075, "learning_rate": 6.71856105645009e-06, "loss": 0.7666, "step": 2905 }, { "epoch": 0.9, "grad_norm": 0.5298228542098834, "learning_rate": 6.526255903994105e-06, "loss": 0.6903, "step": 2910 }, { "epoch": 0.9, "grad_norm": 0.348385379079852, "learning_rate": 6.336650173127223e-06, "loss": 0.7291, "step": 2915 }, { "epoch": 0.9, "grad_norm": 0.40348728722999555, "learning_rate": 6.149749339459787e-06, "loss": 0.6929, "step": 2920 }, { "epoch": 0.9, "grad_norm": 0.42152434212067974, "learning_rate": 5.96555880048767e-06, "loss": 0.7092, "step": 2925 }, { "epoch": 0.9, "grad_norm": 0.3418766857878975, "learning_rate": 5.784083875436286e-06, "loss": 0.7017, "step": 2930 }, { "epoch": 0.9, "grad_norm": 0.4129577303513544, "learning_rate": 5.605329805107084e-06, "loss": 0.8389, "step": 2935 }, { "epoch": 0.91, "grad_norm": 0.34322551067313023, "learning_rate": 5.429301751726068e-06, "loss": 0.8468, "step": 2940 }, { "epoch": 0.91, "grad_norm": 0.42129646163250906, "learning_rate": 5.256004798794889e-06, "loss": 0.7467, "step": 2945 }, { "epoch": 0.91, "grad_norm": 0.38112811696400956, "learning_rate": 5.085443950943858e-06, "loss": 0.6878, "step": 2950 }, { "epoch": 0.91, "grad_norm": 0.38823071010954757, "learning_rate": 4.917624133787535e-06, "loss": 0.839, "step": 2955 }, { "epoch": 0.91, "grad_norm": 0.41462150440141854, "learning_rate": 4.752550193782457e-06, "loss": 0.7937, "step": 2960 }, { "epoch": 0.91, "grad_norm": 0.47038565975041013, "learning_rate": 4.590226898087169e-06, "loss": 0.7394, "step": 2965 }, { "epoch": 0.91, "grad_norm": 0.42313697531774797, "learning_rate": 4.430658934424536e-06, "loss": 0.7365, "step": 2970 }, { "epoch": 0.92, "grad_norm": 0.37056389807076956, "learning_rate": 4.2738509109464194e-06, "loss": 0.7771, "step": 2975 }, { "epoch": 0.92, "grad_norm": 0.4282991333253571, "learning_rate": 4.119807356100536e-06, "loss": 0.8332, "step": 2980 }, { "epoch": 0.92, "grad_norm": 0.3399316066449957, "learning_rate": 3.968532718499718e-06, "loss": 0.719, "step": 2985 }, { "epoch": 0.92, "grad_norm": 0.3045272007549396, "learning_rate": 3.8200313667934415e-06, "loss": 0.7398, "step": 2990 }, { "epoch": 0.92, "grad_norm": 0.3940052052861745, "learning_rate": 3.674307589541637e-06, "loss": 0.6926, "step": 2995 }, { "epoch": 0.92, "grad_norm": 0.5430719327118287, "learning_rate": 3.5313655950908964e-06, "loss": 0.724, "step": 3000 }, { "epoch": 0.93, "grad_norm": 0.3409882746688945, "learning_rate": 3.391209511452853e-06, "loss": 0.7768, "step": 3005 }, { "epoch": 0.93, "grad_norm": 0.32988045764813406, "learning_rate": 3.253843386185085e-06, "loss": 0.7503, "step": 3010 }, { "epoch": 0.93, "grad_norm": 0.30435984793513565, "learning_rate": 3.1192711862740865e-06, "loss": 0.7373, "step": 3015 }, { "epoch": 0.93, "grad_norm": 0.4219409570685578, "learning_rate": 2.9874967980208724e-06, "loss": 0.7532, "step": 3020 }, { "epoch": 0.93, "grad_norm": 0.40198268281516975, "learning_rate": 2.858524026928555e-06, "loss": 0.746, "step": 3025 }, { "epoch": 0.93, "grad_norm": 0.5529913137285583, "learning_rate": 2.7323565975926222e-06, "loss": 0.8412, "step": 3030 }, { "epoch": 0.93, "grad_norm": 0.5641310624198427, "learning_rate": 2.6089981535932453e-06, "loss": 0.7381, "step": 3035 }, { "epoch": 0.94, "grad_norm": 0.4070763719787106, "learning_rate": 2.4884522573901505e-06, "loss": 0.7469, "step": 3040 }, { "epoch": 0.94, "grad_norm": 0.412932484046869, "learning_rate": 2.3707223902196595e-06, "loss": 0.8027, "step": 3045 }, { "epoch": 0.94, "grad_norm": 0.3698602015181267, "learning_rate": 2.2558119519942357e-06, "loss": 0.7422, "step": 3050 }, { "epoch": 0.94, "grad_norm": 0.41987825527789996, "learning_rate": 2.143724261204194e-06, "loss": 0.7901, "step": 3055 }, { "epoch": 0.94, "grad_norm": 0.4558561938965998, "learning_rate": 2.034462554821992e-06, "loss": 0.7254, "step": 3060 }, { "epoch": 0.94, "grad_norm": 0.31521663581005915, "learning_rate": 1.928029988208635e-06, "loss": 0.7234, "step": 3065 }, { "epoch": 0.95, "grad_norm": 0.4536950604685854, "learning_rate": 1.8244296350226398e-06, "loss": 0.8439, "step": 3070 }, { "epoch": 0.95, "grad_norm": 0.3324327843215253, "learning_rate": 1.7236644871312047e-06, "loss": 0.7318, "step": 3075 }, { "epoch": 0.95, "grad_norm": 0.3441401957085473, "learning_rate": 1.6257374545238457e-06, "loss": 0.7219, "step": 3080 }, { "epoch": 0.95, "grad_norm": 0.31548933651921574, "learning_rate": 1.530651365228375e-06, "loss": 0.6584, "step": 3085 }, { "epoch": 0.95, "grad_norm": 0.43658195529889465, "learning_rate": 1.4384089652291543e-06, "loss": 0.8155, "step": 3090 }, { "epoch": 0.95, "grad_norm": 0.28926411213003883, "learning_rate": 1.349012918387904e-06, "loss": 0.7126, "step": 3095 }, { "epoch": 0.95, "grad_norm": 0.5240863579149662, "learning_rate": 1.2624658063666639e-06, "loss": 0.8585, "step": 3100 }, { "epoch": 0.96, "grad_norm": 0.34319193062085684, "learning_rate": 1.1787701285533193e-06, "loss": 0.7302, "step": 3105 }, { "epoch": 0.96, "grad_norm": 0.37091855665958207, "learning_rate": 1.0979283019893704e-06, "loss": 0.8102, "step": 3110 }, { "epoch": 0.96, "grad_norm": 0.4173368619893207, "learning_rate": 1.019942661300166e-06, "loss": 0.8052, "step": 3115 }, { "epoch": 0.96, "grad_norm": 0.2740737663998489, "learning_rate": 9.448154586274794e-07, "loss": 0.6628, "step": 3120 }, { "epoch": 0.96, "grad_norm": 0.39165587634667004, "learning_rate": 8.725488635644152e-07, "loss": 0.8068, "step": 3125 }, { "epoch": 0.96, "grad_norm": 0.3462783087024963, "learning_rate": 8.031449630928167e-07, "loss": 0.755, "step": 3130 }, { "epoch": 0.97, "grad_norm": 0.3501973539234669, "learning_rate": 7.366057615229904e-07, "loss": 0.8435, "step": 3135 }, { "epoch": 0.97, "grad_norm": 0.3089654640513169, "learning_rate": 6.729331804357863e-07, "loss": 0.7804, "step": 3140 }, { "epoch": 0.97, "grad_norm": 0.3732310282262484, "learning_rate": 6.121290586271311e-07, "loss": 0.7352, "step": 3145 }, { "epoch": 0.97, "grad_norm": 0.35885721051799435, "learning_rate": 5.54195152054926e-07, "loss": 0.719, "step": 3150 }, { "epoch": 0.97, "grad_norm": 0.4350143969198028, "learning_rate": 4.99133133788332e-07, "loss": 0.7083, "step": 3155 }, { "epoch": 0.97, "grad_norm": 0.3878552023049681, "learning_rate": 4.4694459395943077e-07, "loss": 0.787, "step": 3160 }, { "epoch": 0.97, "grad_norm": 0.5569927297196836, "learning_rate": 3.9763103971734993e-07, "loss": 0.8012, "step": 3165 }, { "epoch": 0.98, "grad_norm": 0.4146895070331553, "learning_rate": 3.5119389518470936e-07, "loss": 0.6918, "step": 3170 }, { "epoch": 0.98, "grad_norm": 0.3438997693511246, "learning_rate": 3.076345014164872e-07, "loss": 0.7113, "step": 3175 }, { "epoch": 0.98, "grad_norm": 0.3948501011870127, "learning_rate": 2.669541163613176e-07, "loss": 0.7671, "step": 3180 }, { "epoch": 0.98, "grad_norm": 0.3840771316522947, "learning_rate": 2.2915391482514204e-07, "loss": 0.7372, "step": 3185 }, { "epoch": 0.98, "grad_norm": 0.34038583545962225, "learning_rate": 1.9423498843726962e-07, "loss": 0.6481, "step": 3190 }, { "epoch": 0.98, "grad_norm": 0.3931737526809346, "learning_rate": 1.6219834561889136e-07, "loss": 0.7199, "step": 3195 }, { "epoch": 0.99, "grad_norm": 0.38986041242847186, "learning_rate": 1.3304491155393674e-07, "loss": 0.7816, "step": 3200 }, { "epoch": 0.99, "grad_norm": 0.4355430101066633, "learning_rate": 1.0677552816233949e-07, "loss": 0.6702, "step": 3205 }, { "epoch": 0.99, "grad_norm": 0.388098975061164, "learning_rate": 8.339095407575715e-08, "loss": 0.6688, "step": 3210 }, { "epoch": 0.99, "grad_norm": 0.3699381979359166, "learning_rate": 6.28918646156329e-08, "loss": 0.8179, "step": 3215 }, { "epoch": 0.99, "grad_norm": 0.374170534262767, "learning_rate": 4.5278851773711274e-08, "loss": 0.7136, "step": 3220 }, { "epoch": 0.99, "grad_norm": 0.41255583678961755, "learning_rate": 3.055242419492954e-08, "loss": 0.7418, "step": 3225 }, { "epoch": 0.99, "grad_norm": 0.45777314712192396, "learning_rate": 1.8713007162740605e-08, "loss": 0.7344, "step": 3230 }, { "epoch": 1.0, "grad_norm": 0.47083864745606346, "learning_rate": 9.760942586822808e-09, "loss": 0.8361, "step": 3235 }, { "epoch": 1.0, "grad_norm": 0.4272718560438237, "learning_rate": 3.69648899322117e-09, "loss": 0.7523, "step": 3240 }, { "epoch": 1.0, "grad_norm": 0.4516813250246949, "learning_rate": 5.198215168533693e-10, "loss": 0.7346, "step": 3245 }, { "epoch": 1.0, "eval_loss": 1.2166675329208374, "eval_runtime": 1667.535, "eval_samples_per_second": 1.385, "eval_steps_per_second": 0.347, "step": 3248 }, { "epoch": 1.0, "step": 3248, "total_flos": 6658064344678400.0, "train_loss": 0.6257000189696627, "train_runtime": 38581.5438, "train_samples_per_second": 1.347, "train_steps_per_second": 0.084 } ], "logging_steps": 5, "max_steps": 3248, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 100, "total_flos": 6658064344678400.0, "train_batch_size": 2, "trial_name": null, "trial_params": null }