zephyr-7b-sft-full / trainer_state.json
{
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 1.0,
"eval_steps": 500,
"global_step": 4358,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.0,
"grad_norm": 9.394675251276627,
"learning_rate": 4.587155963302753e-08,
"loss": 1.0722,
"step": 1
},
{
"epoch": 0.0,
"grad_norm": 11.510146873139346,
"learning_rate": 2.2935779816513764e-07,
"loss": 1.1568,
"step": 5
},
{
"epoch": 0.0,
"grad_norm": 8.09186869433803,
"learning_rate": 4.587155963302753e-07,
"loss": 1.1267,
"step": 10
},
{
"epoch": 0.0,
"grad_norm": 5.001305949141049,
"learning_rate": 6.880733944954129e-07,
"loss": 1.0408,
"step": 15
},
{
"epoch": 0.0,
"grad_norm": 5.089979244080159,
"learning_rate": 9.174311926605506e-07,
"loss": 1.0286,
"step": 20
},
{
"epoch": 0.01,
"grad_norm": 3.857643916857598,
"learning_rate": 1.1467889908256882e-06,
"loss": 1.0247,
"step": 25
},
{
"epoch": 0.01,
"grad_norm": 3.6352518195110446,
"learning_rate": 1.3761467889908258e-06,
"loss": 0.9997,
"step": 30
},
{
"epoch": 0.01,
"grad_norm": 3.498581149423037,
"learning_rate": 1.6055045871559635e-06,
"loss": 0.9847,
"step": 35
},
{
"epoch": 0.01,
"grad_norm": 3.337414380712645,
"learning_rate": 1.8348623853211011e-06,
"loss": 0.9918,
"step": 40
},
{
"epoch": 0.01,
"grad_norm": 3.5774352168297394,
"learning_rate": 2.064220183486239e-06,
"loss": 1.0183,
"step": 45
},
{
"epoch": 0.01,
"grad_norm": 3.472559365553104,
"learning_rate": 2.2935779816513764e-06,
"loss": 1.015,
"step": 50
},
{
"epoch": 0.01,
"grad_norm": 3.33817794356789,
"learning_rate": 2.522935779816514e-06,
"loss": 0.9892,
"step": 55
},
{
"epoch": 0.01,
"grad_norm": 6.135442418177604,
"learning_rate": 2.7522935779816517e-06,
"loss": 0.9965,
"step": 60
},
{
"epoch": 0.01,
"grad_norm": 3.858279898663446,
"learning_rate": 2.981651376146789e-06,
"loss": 0.9898,
"step": 65
},
{
"epoch": 0.02,
"grad_norm": 3.435351371137228,
"learning_rate": 3.211009174311927e-06,
"loss": 0.9854,
"step": 70
},
{
"epoch": 0.02,
"grad_norm": 3.7508665634033758,
"learning_rate": 3.4403669724770644e-06,
"loss": 1.0167,
"step": 75
},
{
"epoch": 0.02,
"grad_norm": 3.3955349867095177,
"learning_rate": 3.6697247706422022e-06,
"loss": 0.9613,
"step": 80
},
{
"epoch": 0.02,
"grad_norm": 3.240473789973621,
"learning_rate": 3.89908256880734e-06,
"loss": 0.9584,
"step": 85
},
{
"epoch": 0.02,
"grad_norm": 3.585344808953774,
"learning_rate": 4.128440366972478e-06,
"loss": 0.9908,
"step": 90
},
{
"epoch": 0.02,
"grad_norm": 3.362297510865859,
"learning_rate": 4.357798165137615e-06,
"loss": 0.9994,
"step": 95
},
{
"epoch": 0.02,
"grad_norm": 3.3222849745943717,
"learning_rate": 4.587155963302753e-06,
"loss": 1.0184,
"step": 100
},
{
"epoch": 0.02,
"grad_norm": 3.322973143553916,
"learning_rate": 4.816513761467891e-06,
"loss": 0.9319,
"step": 105
},
{
"epoch": 0.03,
"grad_norm": 3.676944381124791,
"learning_rate": 5.045871559633028e-06,
"loss": 0.9762,
"step": 110
},
{
"epoch": 0.03,
"grad_norm": 3.4468638326854797,
"learning_rate": 5.275229357798165e-06,
"loss": 0.9759,
"step": 115
},
{
"epoch": 0.03,
"grad_norm": 3.315867018218443,
"learning_rate": 5.504587155963303e-06,
"loss": 0.9617,
"step": 120
},
{
"epoch": 0.03,
"grad_norm": 3.486244791929344,
"learning_rate": 5.733944954128441e-06,
"loss": 1.0092,
"step": 125
},
{
"epoch": 0.03,
"grad_norm": 3.5300892522492577,
"learning_rate": 5.963302752293578e-06,
"loss": 0.9802,
"step": 130
},
{
"epoch": 0.03,
"grad_norm": 3.1417248587005067,
"learning_rate": 6.192660550458715e-06,
"loss": 0.9852,
"step": 135
},
{
"epoch": 0.03,
"grad_norm": 3.180858225250927,
"learning_rate": 6.422018348623854e-06,
"loss": 0.9823,
"step": 140
},
{
"epoch": 0.03,
"grad_norm": 3.3289726314603283,
"learning_rate": 6.651376146788992e-06,
"loss": 0.9894,
"step": 145
},
{
"epoch": 0.03,
"grad_norm": 3.2711775527420084,
"learning_rate": 6.880733944954129e-06,
"loss": 1.0085,
"step": 150
},
{
"epoch": 0.04,
"grad_norm": 3.224762131634632,
"learning_rate": 7.110091743119267e-06,
"loss": 0.9885,
"step": 155
},
{
"epoch": 0.04,
"grad_norm": 3.2576863695830527,
"learning_rate": 7.3394495412844045e-06,
"loss": 0.9887,
"step": 160
},
{
"epoch": 0.04,
"grad_norm": 3.108725515279286,
"learning_rate": 7.568807339449542e-06,
"loss": 0.9546,
"step": 165
},
{
"epoch": 0.04,
"grad_norm": 3.3107498026119355,
"learning_rate": 7.79816513761468e-06,
"loss": 0.9938,
"step": 170
},
{
"epoch": 0.04,
"grad_norm": 3.3240424189638356,
"learning_rate": 8.027522935779817e-06,
"loss": 1.024,
"step": 175
},
{
"epoch": 0.04,
"grad_norm": 3.154260812846157,
"learning_rate": 8.256880733944956e-06,
"loss": 1.0029,
"step": 180
},
{
"epoch": 0.04,
"grad_norm": 3.441212795928307,
"learning_rate": 8.486238532110093e-06,
"loss": 0.9668,
"step": 185
},
{
"epoch": 0.04,
"grad_norm": 3.6266522820185063,
"learning_rate": 8.71559633027523e-06,
"loss": 0.9973,
"step": 190
},
{
"epoch": 0.04,
"grad_norm": 3.350159055683761,
"learning_rate": 8.944954128440367e-06,
"loss": 1.0421,
"step": 195
},
{
"epoch": 0.05,
"grad_norm": 3.205900107365007,
"learning_rate": 9.174311926605506e-06,
"loss": 0.9982,
"step": 200
},
{
"epoch": 0.05,
"grad_norm": 3.2252222521575464,
"learning_rate": 9.403669724770643e-06,
"loss": 1.0121,
"step": 205
},
{
"epoch": 0.05,
"grad_norm": 3.3039077242433996,
"learning_rate": 9.633027522935781e-06,
"loss": 1.0222,
"step": 210
},
{
"epoch": 0.05,
"grad_norm": 3.196932072104769,
"learning_rate": 9.862385321100918e-06,
"loss": 1.0575,
"step": 215
},
{
"epoch": 0.05,
"grad_norm": 4.286375011174814,
"learning_rate": 1.0091743119266055e-05,
"loss": 0.9753,
"step": 220
},
{
"epoch": 0.05,
"grad_norm": 3.0473780635111942,
"learning_rate": 1.0321100917431192e-05,
"loss": 1.0052,
"step": 225
},
{
"epoch": 0.05,
"grad_norm": 2.926738004897812,
"learning_rate": 1.055045871559633e-05,
"loss": 1.0091,
"step": 230
},
{
"epoch": 0.05,
"grad_norm": 3.9780839869679707,
"learning_rate": 1.077981651376147e-05,
"loss": 1.0237,
"step": 235
},
{
"epoch": 0.06,
"grad_norm": 3.371486237167096,
"learning_rate": 1.1009174311926607e-05,
"loss": 1.0224,
"step": 240
},
{
"epoch": 0.06,
"grad_norm": 3.3486037926379106,
"learning_rate": 1.1238532110091744e-05,
"loss": 1.0563,
"step": 245
},
{
"epoch": 0.06,
"grad_norm": 3.231176251781338,
"learning_rate": 1.1467889908256882e-05,
"loss": 1.0541,
"step": 250
},
{
"epoch": 0.06,
"grad_norm": 3.335545232558565,
"learning_rate": 1.169724770642202e-05,
"loss": 1.0375,
"step": 255
},
{
"epoch": 0.06,
"grad_norm": 3.1805921107957467,
"learning_rate": 1.1926605504587156e-05,
"loss": 1.033,
"step": 260
},
{
"epoch": 0.06,
"grad_norm": 3.0269986709638537,
"learning_rate": 1.2155963302752293e-05,
"loss": 1.0143,
"step": 265
},
{
"epoch": 0.06,
"grad_norm": 3.1030715735729024,
"learning_rate": 1.238532110091743e-05,
"loss": 1.0232,
"step": 270
},
{
"epoch": 0.06,
"grad_norm": 3.1577072382180664,
"learning_rate": 1.261467889908257e-05,
"loss": 1.0391,
"step": 275
},
{
"epoch": 0.06,
"grad_norm": 3.032906066233452,
"learning_rate": 1.2844036697247708e-05,
"loss": 1.0034,
"step": 280
},
{
"epoch": 0.07,
"grad_norm": 3.426516468568716,
"learning_rate": 1.3073394495412845e-05,
"loss": 1.0713,
"step": 285
},
{
"epoch": 0.07,
"grad_norm": 3.115183010494265,
"learning_rate": 1.3302752293577984e-05,
"loss": 1.036,
"step": 290
},
{
"epoch": 0.07,
"grad_norm": 3.2288498137146586,
"learning_rate": 1.353211009174312e-05,
"loss": 1.0215,
"step": 295
},
{
"epoch": 0.07,
"grad_norm": 3.223471739538807,
"learning_rate": 1.3761467889908258e-05,
"loss": 1.0256,
"step": 300
},
{
"epoch": 0.07,
"grad_norm": 3.2891011086195214,
"learning_rate": 1.3990825688073395e-05,
"loss": 1.0366,
"step": 305
},
{
"epoch": 0.07,
"grad_norm": 3.0537956353875324,
"learning_rate": 1.4220183486238533e-05,
"loss": 1.0817,
"step": 310
},
{
"epoch": 0.07,
"grad_norm": 3.100613029348784,
"learning_rate": 1.4449541284403672e-05,
"loss": 1.0531,
"step": 315
},
{
"epoch": 0.07,
"grad_norm": 3.127100337039988,
"learning_rate": 1.4678899082568809e-05,
"loss": 1.0594,
"step": 320
},
{
"epoch": 0.07,
"grad_norm": 3.2040550452600325,
"learning_rate": 1.4908256880733946e-05,
"loss": 1.0814,
"step": 325
},
{
"epoch": 0.08,
"grad_norm": 3.164126270067494,
"learning_rate": 1.5137614678899085e-05,
"loss": 1.0609,
"step": 330
},
{
"epoch": 0.08,
"grad_norm": 2.8307375736866796,
"learning_rate": 1.536697247706422e-05,
"loss": 1.0418,
"step": 335
},
{
"epoch": 0.08,
"grad_norm": 3.0304190806703972,
"learning_rate": 1.559633027522936e-05,
"loss": 1.0655,
"step": 340
},
{
"epoch": 0.08,
"grad_norm": 3.1653216968924633,
"learning_rate": 1.5825688073394497e-05,
"loss": 1.037,
"step": 345
},
{
"epoch": 0.08,
"grad_norm": 3.058091371029834,
"learning_rate": 1.6055045871559634e-05,
"loss": 1.0899,
"step": 350
},
{
"epoch": 0.08,
"grad_norm": 3.020116940253991,
"learning_rate": 1.628440366972477e-05,
"loss": 1.0358,
"step": 355
},
{
"epoch": 0.08,
"grad_norm": 3.003561000700209,
"learning_rate": 1.6513761467889912e-05,
"loss": 1.0367,
"step": 360
},
{
"epoch": 0.08,
"grad_norm": 3.030349207340203,
"learning_rate": 1.674311926605505e-05,
"loss": 1.0779,
"step": 365
},
{
"epoch": 0.08,
"grad_norm": 2.972268792440487,
"learning_rate": 1.6972477064220186e-05,
"loss": 1.0587,
"step": 370
},
{
"epoch": 0.09,
"grad_norm": 3.0024168971293586,
"learning_rate": 1.7201834862385323e-05,
"loss": 1.0621,
"step": 375
},
{
"epoch": 0.09,
"grad_norm": 3.204045198122664,
"learning_rate": 1.743119266055046e-05,
"loss": 1.0539,
"step": 380
},
{
"epoch": 0.09,
"grad_norm": 2.967217430578547,
"learning_rate": 1.7660550458715597e-05,
"loss": 1.0734,
"step": 385
},
{
"epoch": 0.09,
"grad_norm": 2.9810040743388173,
"learning_rate": 1.7889908256880734e-05,
"loss": 1.08,
"step": 390
},
{
"epoch": 0.09,
"grad_norm": 2.9561283294791445,
"learning_rate": 1.811926605504587e-05,
"loss": 1.0549,
"step": 395
},
{
"epoch": 0.09,
"grad_norm": 3.103685050292982,
"learning_rate": 1.834862385321101e-05,
"loss": 1.0536,
"step": 400
},
{
"epoch": 0.09,
"grad_norm": 2.966374643255888,
"learning_rate": 1.8577981651376148e-05,
"loss": 1.0493,
"step": 405
},
{
"epoch": 0.09,
"grad_norm": 2.961623318533173,
"learning_rate": 1.8807339449541285e-05,
"loss": 1.1001,
"step": 410
},
{
"epoch": 0.1,
"grad_norm": 3.213995630508863,
"learning_rate": 1.9036697247706422e-05,
"loss": 1.0964,
"step": 415
},
{
"epoch": 0.1,
"grad_norm": 3.058722713545753,
"learning_rate": 1.9266055045871563e-05,
"loss": 1.0958,
"step": 420
},
{
"epoch": 0.1,
"grad_norm": 3.100037959558587,
"learning_rate": 1.94954128440367e-05,
"loss": 1.0735,
"step": 425
},
{
"epoch": 0.1,
"grad_norm": 3.1066528399698305,
"learning_rate": 1.9724770642201837e-05,
"loss": 1.0932,
"step": 430
},
{
"epoch": 0.1,
"grad_norm": 2.962622864501778,
"learning_rate": 1.9954128440366974e-05,
"loss": 1.0906,
"step": 435
},
{
"epoch": 0.1,
"grad_norm": 3.0108264145191432,
"learning_rate": 1.9999948669655127e-05,
"loss": 1.0644,
"step": 440
},
{
"epoch": 0.1,
"grad_norm": 2.833061974778976,
"learning_rate": 1.9999740141032216e-05,
"loss": 1.0696,
"step": 445
},
{
"epoch": 0.1,
"grad_norm": 2.9158581052830965,
"learning_rate": 1.999937120932709e-05,
"loss": 1.1006,
"step": 450
},
{
"epoch": 0.1,
"grad_norm": 2.856147725205616,
"learning_rate": 1.9998841880457682e-05,
"loss": 1.0769,
"step": 455
},
{
"epoch": 0.11,
"grad_norm": 2.9755007034045593,
"learning_rate": 1.9998152162914807e-05,
"loss": 1.1161,
"step": 460
},
{
"epoch": 0.11,
"grad_norm": 3.645560434344824,
"learning_rate": 1.9997302067762044e-05,
"loss": 1.1022,
"step": 465
},
{
"epoch": 0.11,
"grad_norm": 3.122685192865999,
"learning_rate": 1.9996291608635527e-05,
"loss": 1.0537,
"step": 470
},
{
"epoch": 0.11,
"grad_norm": 2.937474072999667,
"learning_rate": 1.999512080174375e-05,
"loss": 1.0876,
"step": 475
},
{
"epoch": 0.11,
"grad_norm": 3.3759125922583513,
"learning_rate": 1.9993789665867316e-05,
"loss": 1.1046,
"step": 480
},
{
"epoch": 0.11,
"grad_norm": 3.214821660194427,
"learning_rate": 1.9992298222358603e-05,
"loss": 1.1342,
"step": 485
},
{
"epoch": 0.11,
"grad_norm": 3.6555429390099374,
"learning_rate": 1.9990646495141445e-05,
"loss": 1.1175,
"step": 490
},
{
"epoch": 0.11,
"grad_norm": 2.9606668287180455,
"learning_rate": 1.9988834510710747e-05,
"loss": 1.0842,
"step": 495
},
{
"epoch": 0.11,
"grad_norm": 3.1350054453428213,
"learning_rate": 1.998686229813205e-05,
"loss": 1.0979,
"step": 500
},
{
"epoch": 0.12,
"grad_norm": 2.7934482490231054,
"learning_rate": 1.9984729889041077e-05,
"loss": 1.0637,
"step": 505
},
{
"epoch": 0.12,
"grad_norm": 2.91038630187397,
"learning_rate": 1.9982437317643218e-05,
"loss": 1.1089,
"step": 510
},
{
"epoch": 0.12,
"grad_norm": 3.4360032792740673,
"learning_rate": 1.9979984620712972e-05,
"loss": 1.1245,
"step": 515
},
{
"epoch": 0.12,
"grad_norm": 3.073630199634191,
"learning_rate": 1.9977371837593382e-05,
"loss": 1.0963,
"step": 520
},
{
"epoch": 0.12,
"grad_norm": 3.244084086033738,
"learning_rate": 1.9974599010195384e-05,
"loss": 1.1517,
"step": 525
},
{
"epoch": 0.12,
"grad_norm": 3.036785127574316,
"learning_rate": 1.997166618299714e-05,
"loss": 1.1162,
"step": 530
},
{
"epoch": 0.12,
"grad_norm": 3.5966815313979446,
"learning_rate": 1.9968573403043325e-05,
"loss": 1.0828,
"step": 535
},
{
"epoch": 0.12,
"grad_norm": 2.85584309172754,
"learning_rate": 1.9965320719944366e-05,
"loss": 1.1187,
"step": 540
},
{
"epoch": 0.13,
"grad_norm": 3.210724272586593,
"learning_rate": 1.9961908185875662e-05,
"loss": 1.1095,
"step": 545
},
{
"epoch": 0.13,
"grad_norm": 3.0107803370726685,
"learning_rate": 1.995833585557674e-05,
"loss": 1.0474,
"step": 550
},
{
"epoch": 0.13,
"grad_norm": 3.084146667029137,
"learning_rate": 1.9954603786350353e-05,
"loss": 1.1063,
"step": 555
},
{
"epoch": 0.13,
"grad_norm": 3.2688781509444476,
"learning_rate": 1.9950712038061617e-05,
"loss": 1.1266,
"step": 560
},
{
"epoch": 0.13,
"grad_norm": 680.7081090329712,
"learning_rate": 1.994666067313698e-05,
"loss": 1.1471,
"step": 565
},
{
"epoch": 0.13,
"grad_norm": 149.93179306713003,
"learning_rate": 1.994244975656328e-05,
"loss": 1.7807,
"step": 570
},
{
"epoch": 0.13,
"grad_norm": 220.01504858608797,
"learning_rate": 1.9938079355886674e-05,
"loss": 6.4289,
"step": 575
},
{
"epoch": 0.13,
"grad_norm": 496.48020483148116,
"learning_rate": 1.993354954121155e-05,
"loss": 12.59,
"step": 580
},
{
"epoch": 0.13,
"grad_norm": 100.33483837207477,
"learning_rate": 1.992886038519943e-05,
"loss": 10.3831,
"step": 585
},
{
"epoch": 0.14,
"grad_norm": 34.991765615273025,
"learning_rate": 1.9924011963067765e-05,
"loss": 8.1883,
"step": 590
},
{
"epoch": 0.14,
"grad_norm": 45.90912397238394,
"learning_rate": 1.9919004352588768e-05,
"loss": 7.508,
"step": 595
},
{
"epoch": 0.14,
"grad_norm": 25.835640875802444,
"learning_rate": 1.9913837634088143e-05,
"loss": 7.4129,
"step": 600
},
{
"epoch": 0.14,
"grad_norm": 15.174156610898672,
"learning_rate": 1.99085118904438e-05,
"loss": 7.3342,
"step": 605
},
{
"epoch": 0.14,
"grad_norm": 17.635001034280123,
"learning_rate": 1.9903027207084525e-05,
"loss": 7.2874,
"step": 610
},
{
"epoch": 0.14,
"grad_norm": 9.893720942330273,
"learning_rate": 1.989738367198862e-05,
"loss": 7.2536,
"step": 615
},
{
"epoch": 0.14,
"grad_norm": 9.867615007061273,
"learning_rate": 1.9891581375682472e-05,
"loss": 7.1948,
"step": 620
},
{
"epoch": 0.14,
"grad_norm": 9.030991653289398,
"learning_rate": 1.9885620411239134e-05,
"loss": 7.2219,
"step": 625
},
{
"epoch": 0.14,
"grad_norm": 7.379829275629753,
"learning_rate": 1.9879500874276788e-05,
"loss": 7.2081,
"step": 630
},
{
"epoch": 0.15,
"grad_norm": 6.130413517671043,
"learning_rate": 1.9873222862957243e-05,
"loss": 7.241,
"step": 635
},
{
"epoch": 0.15,
"grad_norm": 7.032182637604816,
"learning_rate": 1.9866786477984357e-05,
"loss": 7.2104,
"step": 640
},
{
"epoch": 0.15,
"grad_norm": 5.450500360030072,
"learning_rate": 1.9860191822602415e-05,
"loss": 7.2306,
"step": 645
},
{
"epoch": 0.15,
"grad_norm": 6.241894562599629,
"learning_rate": 1.985343900259446e-05,
"loss": 7.2092,
"step": 650
},
{
"epoch": 0.15,
"grad_norm": 7.704992268267875,
"learning_rate": 1.9846528126280632e-05,
"loss": 7.2195,
"step": 655
},
{
"epoch": 0.15,
"grad_norm": 5.892577300152109,
"learning_rate": 1.983945930451639e-05,
"loss": 7.2134,
"step": 660
},
{
"epoch": 0.15,
"grad_norm": 7.162244013604885,
"learning_rate": 1.9832232650690765e-05,
"loss": 7.2153,
"step": 665
},
{
"epoch": 0.15,
"grad_norm": 5.49392312570169,
"learning_rate": 1.982484828072452e-05,
"loss": 7.2018,
"step": 670
},
{
"epoch": 0.15,
"grad_norm": 5.954680533596231,
"learning_rate": 1.981730631306831e-05,
"loss": 7.1981,
"step": 675
},
{
"epoch": 0.16,
"grad_norm": 7.245712488666381,
"learning_rate": 1.9809606868700755e-05,
"loss": 7.2166,
"step": 680
},
{
"epoch": 0.16,
"grad_norm": 6.280016322704388,
"learning_rate": 1.9801750071126536e-05,
"loss": 7.2043,
"step": 685
},
{
"epoch": 0.16,
"grad_norm": 6.1226575129071215,
"learning_rate": 1.9793736046374375e-05,
"loss": 7.1994,
"step": 690
},
{
"epoch": 0.16,
"grad_norm": 5.1738890947124965,
"learning_rate": 1.9785564922995042e-05,
"loss": 7.197,
"step": 695
},
{
"epoch": 0.16,
"grad_norm": 7.070513738096005,
"learning_rate": 1.977723683205928e-05,
"loss": 7.1694,
"step": 700
},
{
"epoch": 0.16,
"grad_norm": 7.1998596365209995,
"learning_rate": 1.9768751907155707e-05,
"loss": 7.2087,
"step": 705
},
{
"epoch": 0.16,
"grad_norm": 6.8756525556203885,
"learning_rate": 1.9760110284388667e-05,
"loss": 7.2004,
"step": 710
},
{
"epoch": 0.16,
"grad_norm": 5.673754116753309,
"learning_rate": 1.9751312102376062e-05,
"loss": 7.1969,
"step": 715
},
{
"epoch": 0.17,
"grad_norm": 5.928999080043428,
"learning_rate": 1.9742357502247104e-05,
"loss": 7.1754,
"step": 720
},
{
"epoch": 0.17,
"grad_norm": 7.534058043728272,
"learning_rate": 1.9733246627640072e-05,
"loss": 7.2245,
"step": 725
},
{
"epoch": 0.17,
"grad_norm": 6.419671206121361,
"learning_rate": 1.9723979624700004e-05,
"loss": 7.1981,
"step": 730
},
{
"epoch": 0.17,
"grad_norm": 5.014238279563543,
"learning_rate": 1.9714556642076347e-05,
"loss": 7.2059,
"step": 735
},
{
"epoch": 0.17,
"grad_norm": 5.4286747899069745,
"learning_rate": 1.970497783092057e-05,
"loss": 7.1769,
"step": 740
},
{
"epoch": 0.17,
"grad_norm": 5.105148382009604,
"learning_rate": 1.969524334488375e-05,
"loss": 7.2066,
"step": 745
},
{
"epoch": 0.17,
"grad_norm": 5.826988284774489,
"learning_rate": 1.9685353340114104e-05,
"loss": 7.1971,
"step": 750
},
{
"epoch": 0.17,
"grad_norm": 5.244080325535858,
"learning_rate": 1.9675307975254478e-05,
"loss": 7.2065,
"step": 755
},
{
"epoch": 0.17,
"grad_norm": 7.248352747427355,
"learning_rate": 1.9665107411439805e-05,
"loss": 7.1707,
"step": 760
},
{
"epoch": 0.18,
"grad_norm": 5.693767897081214,
"learning_rate": 1.965475181229453e-05,
"loss": 7.1989,
"step": 765
},
{
"epoch": 0.18,
"grad_norm": 5.256405796849654,
"learning_rate": 1.9644241343929966e-05,
"loss": 7.2026,
"step": 770
},
{
"epoch": 0.18,
"grad_norm": 5.230559774612038,
"learning_rate": 1.963357617494165e-05,
"loss": 7.1968,
"step": 775
},
{
"epoch": 0.18,
"grad_norm": 5.299356891163277,
"learning_rate": 1.9622756476406625e-05,
"loss": 7.2201,
"step": 780
},
{
"epoch": 0.18,
"grad_norm": 5.771781395899692,
"learning_rate": 1.9611782421880702e-05,
"loss": 7.2188,
"step": 785
},
{
"epoch": 0.18,
"grad_norm": 4.975609755551546,
"learning_rate": 1.9600654187395666e-05,
"loss": 7.2074,
"step": 790
},
{
"epoch": 0.18,
"grad_norm": 6.486489059003917,
"learning_rate": 1.958937195145647e-05,
"loss": 7.223,
"step": 795
},
{
"epoch": 0.18,
"grad_norm": 5.4870554264978235,
"learning_rate": 1.9577935895038363e-05,
"loss": 7.2093,
"step": 800
},
{
"epoch": 0.18,
"grad_norm": 5.297769552074883,
"learning_rate": 1.9566346201583974e-05,
"loss": 7.1872,
"step": 805
},
{
"epoch": 0.19,
"grad_norm": 4.767621827384491,
"learning_rate": 1.9554603057000397e-05,
"loss": 7.1857,
"step": 810
},
{
"epoch": 0.19,
"grad_norm": 5.953451938027194,
"learning_rate": 1.954270664965618e-05,
"loss": 7.1737,
"step": 815
},
{
"epoch": 0.19,
"grad_norm": 5.758676615210085,
"learning_rate": 1.953065717037832e-05,
"loss": 7.1809,
"step": 820
},
{
"epoch": 0.19,
"grad_norm": 6.385168274540292,
"learning_rate": 1.951845481244921e-05,
"loss": 7.1792,
"step": 825
},
{
"epoch": 0.19,
"grad_norm": 4.254446787862434,
"learning_rate": 1.9506099771603515e-05,
"loss": 7.2077,
"step": 830
},
{
"epoch": 0.19,
"grad_norm": 5.197281648875432,
"learning_rate": 1.9493592246025047e-05,
"loss": 7.2155,
"step": 835
},
{
"epoch": 0.19,
"grad_norm": 5.78819455170524,
"learning_rate": 1.9480932436343584e-05,
"loss": 7.1863,
"step": 840
},
{
"epoch": 0.19,
"grad_norm": 6.163370463039743,
"learning_rate": 1.9468120545631647e-05,
"loss": 7.2101,
"step": 845
},
{
"epoch": 0.2,
"grad_norm": 6.7662949673961315,
"learning_rate": 1.945515677940127e-05,
"loss": 7.1567,
"step": 850
},
{
"epoch": 0.2,
"grad_norm": 5.75746195424063,
"learning_rate": 1.944204134560064e-05,
"loss": 7.1651,
"step": 855
},
{
"epoch": 0.2,
"grad_norm": 5.382060329721597,
"learning_rate": 1.9428774454610845e-05,
"loss": 7.1916,
"step": 860
},
{
"epoch": 0.2,
"grad_norm": 4.893754566211905,
"learning_rate": 1.941535631924242e-05,
"loss": 7.2095,
"step": 865
},
{
"epoch": 0.2,
"grad_norm": 5.477578724305367,
"learning_rate": 1.9401787154731993e-05,
"loss": 7.2044,
"step": 870
},
{
"epoch": 0.2,
"grad_norm": 6.61002124085074,
"learning_rate": 1.9388067178738807e-05,
"loss": 7.195,
"step": 875
},
{
"epoch": 0.2,
"grad_norm": 6.116708741280613,
"learning_rate": 1.9374196611341212e-05,
"loss": 7.1967,
"step": 880
},
{
"epoch": 0.2,
"grad_norm": 6.753967686244243,
"learning_rate": 1.936017567503317e-05,
"loss": 7.199,
"step": 885
},
{
"epoch": 0.2,
"grad_norm": 7.364972728350276,
"learning_rate": 1.934600459472067e-05,
"loss": 7.1762,
"step": 890
},
{
"epoch": 0.21,
"grad_norm": 6.603911277491834,
"learning_rate": 1.933168359771811e-05,
"loss": 7.2118,
"step": 895
},
{
"epoch": 0.21,
"grad_norm": 7.012396533406363,
"learning_rate": 1.931721291374467e-05,
"loss": 7.2058,
"step": 900
},
{
"epoch": 0.21,
"grad_norm": 7.895351473028401,
"learning_rate": 1.9302592774920606e-05,
"loss": 7.1931,
"step": 905
},
{
"epoch": 0.21,
"grad_norm": 5.280257845408824,
"learning_rate": 1.9287823415763552e-05,
"loss": 7.1738,
"step": 910
},
{
"epoch": 0.21,
"grad_norm": 6.876634320902484,
"learning_rate": 1.9272905073184734e-05,
"loss": 7.192,
"step": 915
},
{
"epoch": 0.21,
"grad_norm": 4.854212629080888,
"learning_rate": 1.9257837986485187e-05,
"loss": 7.1925,
"step": 920
},
{
"epoch": 0.21,
"grad_norm": 5.092400379079062,
"learning_rate": 1.92426223973519e-05,
"loss": 7.1856,
"step": 925
},
{
"epoch": 0.21,
"grad_norm": 5.428211058950048,
"learning_rate": 1.922725854985396e-05,
"loss": 7.1597,
"step": 930
},
{
"epoch": 0.21,
"grad_norm": 4.794758754464533,
"learning_rate": 1.921174669043862e-05,
"loss": 7.2268,
"step": 935
},
{
"epoch": 0.22,
"grad_norm": 5.101883671966147,
"learning_rate": 1.9196087067927348e-05,
"loss": 7.1848,
"step": 940
},
{
"epoch": 0.22,
"grad_norm": 5.317894374914432,
"learning_rate": 1.918027993351185e-05,
"loss": 7.1811,
"step": 945
},
{
"epoch": 0.22,
"grad_norm": 5.305336773894683,
"learning_rate": 1.916432554075002e-05,
"loss": 7.1873,
"step": 950
},
{
"epoch": 0.22,
"grad_norm": 4.6840416735309915,
"learning_rate": 1.9148224145561876e-05,
"loss": 7.1889,
"step": 955
},
{
"epoch": 0.22,
"grad_norm": 5.867312525781805,
"learning_rate": 1.913197600622549e-05,
"loss": 7.2023,
"step": 960
},
{
"epoch": 0.22,
"grad_norm": 4.758609581127356,
"learning_rate": 1.9115581383372782e-05,
"loss": 7.1905,
"step": 965
},
{
"epoch": 0.22,
"grad_norm": 6.244788780284041,
"learning_rate": 1.9099040539985395e-05,
"loss": 7.1896,
"step": 970
},
{
"epoch": 0.22,
"grad_norm": 7.35187418176669,
"learning_rate": 1.9082353741390453e-05,
"loss": 7.1811,
"step": 975
},
{
"epoch": 0.22,
"grad_norm": 5.6595340281862825,
"learning_rate": 1.90655212552563e-05,
"loss": 7.1919,
"step": 980
},
{
"epoch": 0.23,
"grad_norm": 4.892032669535677,
"learning_rate": 1.904854335158822e-05,
"loss": 7.1865,
"step": 985
},
{
"epoch": 0.23,
"grad_norm": 5.7552292559003035,
"learning_rate": 1.9031420302724093e-05,
"loss": 7.1996,
"step": 990
},
{
"epoch": 0.23,
"grad_norm": 4.674540158335838,
"learning_rate": 1.901415238333005e-05,
"loss": 7.1851,
"step": 995
},
{
"epoch": 0.23,
"grad_norm": 4.803373360265408,
"learning_rate": 1.8996739870396027e-05,
"loss": 7.2195,
"step": 1000
},
{
"epoch": 0.23,
"grad_norm": 4.740149041137212,
"learning_rate": 1.897918304323136e-05,
"loss": 7.186,
"step": 1005
},
{
"epoch": 0.23,
"grad_norm": 5.394971774083842,
"learning_rate": 1.896148218346028e-05,
"loss": 7.2,
"step": 1010
},
{
"epoch": 0.23,
"grad_norm": 4.8368244052167375,
"learning_rate": 1.8943637575017428e-05,
"loss": 7.1863,
"step": 1015
},
{
"epoch": 0.23,
"grad_norm": 4.795222702764058,
"learning_rate": 1.8925649504143244e-05,
"loss": 7.194,
"step": 1020
},
{
"epoch": 0.24,
"grad_norm": 6.091441424838663,
"learning_rate": 1.890751825937944e-05,
"loss": 7.1919,
"step": 1025
},
{
"epoch": 0.24,
"grad_norm": 5.2139746246710965,
"learning_rate": 1.888924413156432e-05,
"loss": 7.1813,
"step": 1030
},
{
"epoch": 0.24,
"grad_norm": 5.924868386178008,
"learning_rate": 1.8870827413828148e-05,
"loss": 7.1969,
"step": 1035
},
{
"epoch": 0.24,
"grad_norm": 4.75305228923696,
"learning_rate": 1.885226840158843e-05,
"loss": 7.2101,
"step": 1040
},
{
"epoch": 0.24,
"grad_norm": 5.751123883354145,
"learning_rate": 1.8833567392545177e-05,
"loss": 7.1988,
"step": 1045
},
{
"epoch": 0.24,
"grad_norm": 7.371173831840808,
"learning_rate": 1.8814724686676133e-05,
"loss": 7.2179,
"step": 1050
},
{
"epoch": 0.24,
"grad_norm": 6.00599017571554,
"learning_rate": 1.879574058623196e-05,
"loss": 7.1914,
"step": 1055
},
{
"epoch": 0.24,
"grad_norm": 5.991137258758085,
"learning_rate": 1.8776615395731398e-05,
"loss": 7.183,
"step": 1060
},
{
"epoch": 0.24,
"grad_norm": 5.718123489352958,
"learning_rate": 1.875734942195637e-05,
"loss": 7.1905,
"step": 1065
},
{
"epoch": 0.25,
"grad_norm": 4.487539169972883,
"learning_rate": 1.8737942973947062e-05,
"loss": 7.1581,
"step": 1070
},
{
"epoch": 0.25,
"grad_norm": 4.825603371326703,
"learning_rate": 1.8718396362996968e-05,
"loss": 7.1935,
"step": 1075
},
{
"epoch": 0.25,
"grad_norm": 4.813620283639029,
"learning_rate": 1.8698709902647903e-05,
"loss": 7.1977,
"step": 1080
},
{
"epoch": 0.25,
"grad_norm": 8.758806033943968,
"learning_rate": 1.8678883908684964e-05,
"loss": 7.1901,
"step": 1085
},
{
"epoch": 0.25,
"grad_norm": 5.36268133923744,
"learning_rate": 1.865891869913147e-05,
"loss": 7.1914,
"step": 1090
},
{
"epoch": 0.25,
"grad_norm": 5.610339067780085,
"learning_rate": 1.863881459424386e-05,
"loss": 7.1798,
"step": 1095
},
{
"epoch": 0.25,
"grad_norm": 5.469361658862883,
"learning_rate": 1.8618571916506548e-05,
"loss": 7.1721,
"step": 1100
},
{
"epoch": 0.25,
"grad_norm": 5.07301012439838,
"learning_rate": 1.8598190990626764e-05,
"loss": 7.2065,
"step": 1105
},
{
"epoch": 0.25,
"grad_norm": 6.39877570039683,
"learning_rate": 1.8577672143529337e-05,
"loss": 7.1823,
"step": 1110
},
{
"epoch": 0.26,
"grad_norm": 5.823362939728546,
"learning_rate": 1.8557015704351453e-05,
"loss": 7.1601,
"step": 1115
},
{
"epoch": 0.26,
"grad_norm": 6.353964897246578,
"learning_rate": 1.853622200443737e-05,
"loss": 7.1801,
"step": 1120
},
{
"epoch": 0.26,
"grad_norm": 4.4888019416686795,
"learning_rate": 1.8515291377333114e-05,
"loss": 7.1615,
"step": 1125
},
{
"epoch": 0.26,
"grad_norm": 4.737996647818345,
"learning_rate": 1.849422415878112e-05,
"loss": 7.1752,
"step": 1130
},
{
"epoch": 0.26,
"grad_norm": 5.655355199762672,
"learning_rate": 1.8473020686714847e-05,
"loss": 7.1897,
"step": 1135
},
{
"epoch": 0.26,
"grad_norm": 4.905574751971008,
"learning_rate": 1.8451681301253363e-05,
"loss": 7.1759,
"step": 1140
},
{
"epoch": 0.26,
"grad_norm": 5.093954229069838,
"learning_rate": 1.8430206344695875e-05,
"loss": 7.1841,
"step": 1145
},
{
"epoch": 0.26,
"grad_norm": 4.659167952013244,
"learning_rate": 1.840859616151627e-05,
"loss": 7.1793,
"step": 1150
},
{
"epoch": 0.27,
"grad_norm": 4.779633769093793,
"learning_rate": 1.8386851098357538e-05,
"loss": 7.1827,
"step": 1155
},
{
"epoch": 0.27,
"grad_norm": 6.011930861735435,
"learning_rate": 1.8364971504026273e-05,
"loss": 7.1792,
"step": 1160
},
{
"epoch": 0.27,
"grad_norm": 5.881425426906034,
"learning_rate": 1.834295772948703e-05,
"loss": 7.1934,
"step": 1165
},
{
"epoch": 0.27,
"grad_norm": 4.491821561313667,
"learning_rate": 1.8320810127856706e-05,
"loss": 7.1638,
"step": 1170
},
{
"epoch": 0.27,
"grad_norm": 4.4905503941670535,
"learning_rate": 1.8298529054398896e-05,
"loss": 7.1787,
"step": 1175
},
{
"epoch": 0.27,
"grad_norm": 6.456686168415449,
"learning_rate": 1.827611486651817e-05,
"loss": 7.1807,
"step": 1180
},
{
"epoch": 0.27,
"grad_norm": 4.7472408032814695,
"learning_rate": 1.8253567923754353e-05,
"loss": 7.2154,
"step": 1185
},
{
"epoch": 0.27,
"grad_norm": 6.260242429793549,
"learning_rate": 1.8230888587776758e-05,
"loss": 7.2009,
"step": 1190
},
{
"epoch": 0.27,
"grad_norm": 4.459555242885236,
"learning_rate": 1.8208077222378376e-05,
"loss": 7.1827,
"step": 1195
},
{
"epoch": 0.28,
"grad_norm": 5.311364125445347,
"learning_rate": 1.8185134193470043e-05,
"loss": 7.1902,
"step": 1200
},
{
"epoch": 0.28,
"grad_norm": 8.45135390718489,
"learning_rate": 1.8162059869074586e-05,
"loss": 7.1864,
"step": 1205
},
{
"epoch": 0.28,
"grad_norm": 4.379082505010177,
"learning_rate": 1.8138854619320893e-05,
"loss": 7.2273,
"step": 1210
},
{
"epoch": 0.28,
"grad_norm": 5.710277796266043,
"learning_rate": 1.8115518816437997e-05,
"loss": 7.1802,
"step": 1215
},
{
"epoch": 0.28,
"grad_norm": 4.500870680883128,
"learning_rate": 1.8092052834749094e-05,
"loss": 7.1981,
"step": 1220
},
{
"epoch": 0.28,
"grad_norm": 6.202612921478623,
"learning_rate": 1.8068457050665547e-05,
"loss": 7.2037,
"step": 1225
},
{
"epoch": 0.28,
"grad_norm": 5.334951680536002,
"learning_rate": 1.804473184268084e-05,
"loss": 7.2078,
"step": 1230
},
{
"epoch": 0.28,
"grad_norm": 4.668688696015915,
"learning_rate": 1.8020877591364508e-05,
"loss": 7.1816,
"step": 1235
},
{
"epoch": 0.28,
"grad_norm": 5.76363061015334,
"learning_rate": 1.799689467935604e-05,
"loss": 7.1904,
"step": 1240
},
{
"epoch": 0.29,
"grad_norm": 4.299305529851326,
"learning_rate": 1.797278349135874e-05,
"loss": 7.2004,
"step": 1245
},
{
"epoch": 0.29,
"grad_norm": 6.0714518763544225,
"learning_rate": 1.7948544414133534e-05,
"loss": 7.2004,
"step": 1250
},
{
"epoch": 0.29,
"grad_norm": 5.397050722956672,
"learning_rate": 1.7924177836492802e-05,
"loss": 7.1913,
"step": 1255
},
{
"epoch": 0.29,
"grad_norm": 7.384985978864621,
"learning_rate": 1.7899684149294118e-05,
"loss": 7.2051,
"step": 1260
},
{
"epoch": 0.29,
"grad_norm": 6.435771900748507,
"learning_rate": 1.7875063745433978e-05,
"loss": 7.1817,
"step": 1265
},
{
"epoch": 0.29,
"grad_norm": 5.075431695444233,
"learning_rate": 1.7850317019841514e-05,
"loss": 7.2229,
"step": 1270
},
{
"epoch": 0.29,
"grad_norm": 4.750020994304407,
"learning_rate": 1.7825444369472147e-05,
"loss": 7.2127,
"step": 1275
},
{
"epoch": 0.29,
"grad_norm": 5.765962718023732,
"learning_rate": 1.7800446193301225e-05,
"loss": 7.2135,
"step": 1280
},
{
"epoch": 0.29,
"grad_norm": 4.801689882588788,
"learning_rate": 1.7775322892317618e-05,
"loss": 7.2023,
"step": 1285
},
{
"epoch": 0.3,
"grad_norm": 5.012853900353026,
"learning_rate": 1.7750074869517285e-05,
"loss": 7.1841,
"step": 1290
},
{
"epoch": 0.3,
"grad_norm": 5.146195314914873,
"learning_rate": 1.7724702529896824e-05,
"loss": 7.2267,
"step": 1295
},
{
"epoch": 0.3,
"grad_norm": 5.3192085523839205,
"learning_rate": 1.7699206280446955e-05,
"loss": 7.1775,
"step": 1300
},
{
"epoch": 0.3,
"grad_norm": 5.5101183654984816,
"learning_rate": 1.767358653014601e-05,
"loss": 7.2029,
"step": 1305
},
{
"epoch": 0.3,
"grad_norm": 6.5468845839854914,
"learning_rate": 1.7647843689953352e-05,
"loss": 7.1753,
"step": 1310
},
{
"epoch": 0.3,
"grad_norm": 4.353192953649322,
"learning_rate": 1.762197817280281e-05,
"loss": 7.1881,
"step": 1315
},
{
"epoch": 0.3,
"grad_norm": 4.6727420241772,
"learning_rate": 1.759599039359603e-05,
"loss": 7.1746,
"step": 1320
},
{
"epoch": 0.3,
"grad_norm": 6.204254264607091,
"learning_rate": 1.756988076919583e-05,
"loss": 7.1543,
"step": 1325
},
{
"epoch": 0.31,
"grad_norm": 4.416954900150789,
"learning_rate": 1.754364971841952e-05,
"loss": 7.2003,
"step": 1330
},
{
"epoch": 0.31,
"grad_norm": 5.866999572748804,
"learning_rate": 1.7517297662032174e-05,
"loss": 7.1931,
"step": 1335
},
{
"epoch": 0.31,
"grad_norm": 5.7422281580185714,
"learning_rate": 1.749082502273988e-05,
"loss": 7.1866,
"step": 1340
},
{
"epoch": 0.31,
"grad_norm": 5.574328843512533,
"learning_rate": 1.746423222518297e-05,
"loss": 7.209,
"step": 1345
},
{
"epoch": 0.31,
"grad_norm": 4.825095531858083,
"learning_rate": 1.7437519695929194e-05,
"loss": 7.2021,
"step": 1350
},
{
"epoch": 0.31,
"grad_norm": 4.918401678159191,
"learning_rate": 1.741068786346689e-05,
"loss": 7.1856,
"step": 1355
},
{
"epoch": 0.31,
"grad_norm": 4.7129421004109515,
"learning_rate": 1.738373715819811e-05,
"loss": 7.1646,
"step": 1360
},
{
"epoch": 0.31,
"grad_norm": 6.2682617034576635,
"learning_rate": 1.7356668012431705e-05,
"loss": 7.1869,
"step": 1365
},
{
"epoch": 0.31,
"grad_norm": 6.142810873086463,
"learning_rate": 1.7329480860376392e-05,
"loss": 7.1795,
"step": 1370
},
{
"epoch": 0.32,
"grad_norm": 4.7006273967413215,
"learning_rate": 1.7302176138133814e-05,
"loss": 7.211,
"step": 1375
},
{
"epoch": 0.32,
"grad_norm": 5.497329345480043,
"learning_rate": 1.7274754283691507e-05,
"loss": 7.1711,
"step": 1380
},
{
"epoch": 0.32,
"grad_norm": 5.806714944962353,
"learning_rate": 1.72472157369159e-05,
"loss": 7.1923,
"step": 1385
},
{
"epoch": 0.32,
"grad_norm": 6.801596277714087,
"learning_rate": 1.7219560939545246e-05,
"loss": 7.1905,
"step": 1390
},
{
"epoch": 0.32,
"grad_norm": 4.996882387174238,
"learning_rate": 1.719179033518255e-05,
"loss": 7.1942,
"step": 1395
},
{
"epoch": 0.32,
"grad_norm": 4.829570844242962,
"learning_rate": 1.7163904369288443e-05,
"loss": 7.1832,
"step": 1400
},
{
"epoch": 0.32,
"grad_norm": 5.477705999486753,
"learning_rate": 1.7135903489174034e-05,
"loss": 7.1766,
"step": 1405
},
{
"epoch": 0.32,
"grad_norm": 4.267188678316321,
"learning_rate": 1.710778814399374e-05,
"loss": 7.1899,
"step": 1410
},
{
"epoch": 0.32,
"grad_norm": 5.064274909871023,
"learning_rate": 1.7079558784738092e-05,
"loss": 7.2137,
"step": 1415
},
{
"epoch": 0.33,
"grad_norm": 5.290438730448353,
"learning_rate": 1.705121586422647e-05,
"loss": 7.201,
"step": 1420
},
{
"epoch": 0.33,
"grad_norm": 5.517582652147351,
"learning_rate": 1.702275983709987e-05,
"loss": 7.178,
"step": 1425
},
{
"epoch": 0.33,
"grad_norm": 5.324522216215293,
"learning_rate": 1.699419115981361e-05,
"loss": 7.1811,
"step": 1430
},
{
"epoch": 0.33,
"grad_norm": 5.4511667927982215,
"learning_rate": 1.6965510290629973e-05,
"loss": 7.1675,
"step": 1435
},
{
"epoch": 0.33,
"grad_norm": 5.273917433416757,
"learning_rate": 1.69367176896109e-05,
"loss": 7.2079,
"step": 1440
},
{
"epoch": 0.33,
"grad_norm": 4.543337661243557,
"learning_rate": 1.6907813818610597e-05,
"loss": 7.1508,
"step": 1445
},
{
"epoch": 0.33,
"grad_norm": 6.433592856571139,
"learning_rate": 1.6878799141268107e-05,
"loss": 7.1795,
"step": 1450
},
{
"epoch": 0.33,
"grad_norm": 6.031774153730769,
"learning_rate": 1.6849674122999878e-05,
"loss": 7.1793,
"step": 1455
},
{
"epoch": 0.34,
"grad_norm": 5.455052489494696,
"learning_rate": 1.682043923099234e-05,
"loss": 7.1835,
"step": 1460
},
{
"epoch": 0.34,
"grad_norm": 4.523617138804165,
"learning_rate": 1.679109493419435e-05,
"loss": 7.1809,
"step": 1465
},
{
"epoch": 0.34,
"grad_norm": 5.187074166481253,
"learning_rate": 1.6761641703309702e-05,
"loss": 7.151,
"step": 1470
},
{
"epoch": 0.34,
"grad_norm": 6.86249092476398,
"learning_rate": 1.673208001078958e-05,
"loss": 7.193,
"step": 1475
},
{
"epoch": 0.34,
"grad_norm": 6.567170673390032,
"learning_rate": 1.6702410330824962e-05,
"loss": 7.179,
"step": 1480
},
{
"epoch": 0.34,
"grad_norm": 5.073442019585416,
"learning_rate": 1.6672633139339028e-05,
"loss": 7.1656,
"step": 1485
},
{
"epoch": 0.34,
"grad_norm": 3.9925808755541996,
"learning_rate": 1.6642748913979515e-05,
"loss": 7.18,
"step": 1490
},
{
"epoch": 0.34,
"grad_norm": 4.80371655505946,
"learning_rate": 1.6612758134111072e-05,
"loss": 7.1768,
"step": 1495
},
{
"epoch": 0.34,
"grad_norm": 4.733455824267269,
"learning_rate": 1.6582661280807553e-05,
"loss": 7.2038,
"step": 1500
},
{
"epoch": 0.35,
"grad_norm": 3.906745836511784,
"learning_rate": 1.65524588368443e-05,
"loss": 7.1664,
"step": 1505
},
{
"epoch": 0.35,
"grad_norm": 5.163199284772482,
"learning_rate": 1.652215128669042e-05,
"loss": 7.2011,
"step": 1510
},
{
"epoch": 0.35,
"grad_norm": 3.9325541368096313,
"learning_rate": 1.649173911650099e-05,
"loss": 7.1661,
"step": 1515
},
{
"epoch": 0.35,
"grad_norm": 5.541114208005493,
"learning_rate": 1.646122281410927e-05,
"loss": 7.1731,
"step": 1520
},
{
"epoch": 0.35,
"grad_norm": 4.645120765156564,
"learning_rate": 1.6430602869018867e-05,
"loss": 7.1854,
"step": 1525
},
{
"epoch": 0.35,
"grad_norm": 5.396492917895077,
"learning_rate": 1.6399879772395915e-05,
"loss": 7.1975,
"step": 1530
},
{
"epoch": 0.35,
"grad_norm": 6.111332313811058,
"learning_rate": 1.636905401706116e-05,
"loss": 7.1962,
"step": 1535
},
{
"epoch": 0.35,
"grad_norm": 4.5879994028450355,
"learning_rate": 1.633812609748206e-05,
"loss": 7.1896,
"step": 1540
},
{
"epoch": 0.35,
"grad_norm": 4.777276796655454,
"learning_rate": 1.630709650976487e-05,
"loss": 7.196,
"step": 1545
},
{
"epoch": 0.36,
"grad_norm": 5.754696932989834,
"learning_rate": 1.6275965751646682e-05,
"loss": 7.1952,
"step": 1550
},
{
"epoch": 0.36,
"grad_norm": 4.820867978838945,
"learning_rate": 1.6244734322487415e-05,
"loss": 7.1951,
"step": 1555
},
{
"epoch": 0.36,
"grad_norm": 4.5062148240565385,
"learning_rate": 1.6213402723261852e-05,
"loss": 7.1925,
"step": 1560
},
{
"epoch": 0.36,
"grad_norm": 4.9221473358752,
"learning_rate": 1.618197145655155e-05,
"loss": 7.1882,
"step": 1565
},
{
"epoch": 0.36,
"grad_norm": 6.248482149727314,
"learning_rate": 1.6150441026536827e-05,
"loss": 7.163,
"step": 1570
},
{
"epoch": 0.36,
"grad_norm": 6.521139746786196,
"learning_rate": 1.6118811938988632e-05,
"loss": 7.1897,
"step": 1575
},
{
"epoch": 0.36,
"grad_norm": 4.793529660386469,
"learning_rate": 1.6087084701260468e-05,
"loss": 7.1675,
"step": 1580
},
{
"epoch": 0.36,
"grad_norm": 4.630271784366099,
"learning_rate": 1.605525982228023e-05,
"loss": 7.171,
"step": 1585
},
{
"epoch": 0.36,
"grad_norm": 4.653150385236314,
"learning_rate": 1.6023337812542048e-05,
"loss": 7.1867,
"step": 1590
},
{
"epoch": 0.37,
"grad_norm": 6.004405747433293,
"learning_rate": 1.5991319184098107e-05,
"loss": 7.1813,
"step": 1595
},
{
"epoch": 0.37,
"grad_norm": 5.924373425919494,
"learning_rate": 1.5959204450550427e-05,
"loss": 7.1775,
"step": 1600
},
{
"epoch": 0.37,
"grad_norm": 7.753697903529501,
"learning_rate": 1.5926994127042615e-05,
"loss": 7.1672,
"step": 1605
},
{
"epoch": 0.37,
"grad_norm": 8.078702081068387,
"learning_rate": 1.5894688730251613e-05,
"loss": 7.1701,
"step": 1610
},
{
"epoch": 0.37,
"grad_norm": 9.526882240137281,
"learning_rate": 1.586228877837941e-05,
"loss": 7.1323,
"step": 1615
},
{
"epoch": 0.37,
"grad_norm": 37.28886157765147,
"learning_rate": 1.5829794791144723e-05,
"loss": 7.1004,
"step": 1620
},
{
"epoch": 0.37,
"grad_norm": 23.093005264330223,
"learning_rate": 1.5797207289774668e-05,
"loss": 7.1948,
"step": 1625
},
{
"epoch": 0.37,
"grad_norm": 25.898784884168748,
"learning_rate": 1.57645267969964e-05,
"loss": 7.1653,
"step": 1630
},
{
"epoch": 0.38,
"grad_norm": 16.78438950960542,
"learning_rate": 1.5731753837028714e-05,
"loss": 7.1468,
"step": 1635
},
{
"epoch": 0.38,
"grad_norm": 10.923555549438724,
"learning_rate": 1.569888893557365e-05,
"loss": 7.0813,
"step": 1640
},
{
"epoch": 0.38,
"grad_norm": 11.108288539909235,
"learning_rate": 1.5665932619808058e-05,
"loss": 7.0424,
"step": 1645
},
{
"epoch": 0.38,
"grad_norm": 15.199836700972632,
"learning_rate": 1.5632885418375136e-05,
"loss": 6.9435,
"step": 1650
},
{
"epoch": 0.38,
"grad_norm": 10.04303401418099,
"learning_rate": 1.5599747861375957e-05,
"loss": 6.9432,
"step": 1655
},
{
"epoch": 0.38,
"grad_norm": 6.925107402391229,
"learning_rate": 1.556652048036096e-05,
"loss": 6.8624,
"step": 1660
},
{
"epoch": 0.38,
"grad_norm": 13.70186301929785,
"learning_rate": 1.553320380832143e-05,
"loss": 6.8157,
"step": 1665
},
{
"epoch": 0.38,
"grad_norm": 15.620537966762095,
"learning_rate": 1.549979837968094e-05,
"loss": 6.7753,
"step": 1670
},
{
"epoch": 0.38,
"grad_norm": 30.677693169182618,
"learning_rate": 1.5466304730286795e-05,
"loss": 6.794,
"step": 1675
},
{
"epoch": 0.39,
"grad_norm": 7.848469368296769,
"learning_rate": 1.5432723397401406e-05,
"loss": 6.7671,
"step": 1680
},
{
"epoch": 0.39,
"grad_norm": 21.469195766575073,
"learning_rate": 1.5399054919693704e-05,
"loss": 6.7119,
"step": 1685
},
{
"epoch": 0.39,
"grad_norm": 24.46255165124564,
"learning_rate": 1.5365299837230483e-05,
"loss": 6.6899,
"step": 1690
},
{
"epoch": 0.39,
"grad_norm": 23.20384615490851,
"learning_rate": 1.5331458691467742e-05,
"loss": 6.6424,
"step": 1695
},
{
"epoch": 0.39,
"grad_norm": 18.350112389930576,
"learning_rate": 1.5297532025241993e-05,
"loss": 6.6069,
"step": 1700
},
{
"epoch": 0.39,
"grad_norm": 35.95084330385222,
"learning_rate": 1.5263520382761563e-05,
"loss": 6.5677,
"step": 1705
},
{
"epoch": 0.39,
"grad_norm": 32.90819956258818,
"learning_rate": 1.5229424309597853e-05,
"loss": 6.5251,
"step": 1710
},
{
"epoch": 0.39,
"grad_norm": 54.76562189780166,
"learning_rate": 1.5195244352676606e-05,
"loss": 6.4826,
"step": 1715
},
{
"epoch": 0.39,
"grad_norm": 12.591984595179603,
"learning_rate": 1.5160981060269107e-05,
"loss": 6.5287,
"step": 1720
},
{
"epoch": 0.4,
"grad_norm": 10.351716266476027,
"learning_rate": 1.5126634981983412e-05,
"loss": 6.4656,
"step": 1725
},
{
"epoch": 0.4,
"grad_norm": 12.622397404252,
"learning_rate": 1.5092206668755518e-05,
"loss": 6.3774,
"step": 1730
},
{
"epoch": 0.4,
"grad_norm": 23.45116611899055,
"learning_rate": 1.5057696672840529e-05,
"loss": 6.4034,
"step": 1735
},
{
"epoch": 0.4,
"grad_norm": 40.24642870474456,
"learning_rate": 1.5023105547803807e-05,
"loss": 6.3587,
"step": 1740
},
{
"epoch": 0.4,
"grad_norm": 42.78142739794163,
"learning_rate": 1.4988433848512074e-05,
"loss": 6.3162,
"step": 1745
},
{
"epoch": 0.4,
"grad_norm": 33.07779044777228,
"learning_rate": 1.4953682131124527e-05,
"loss": 6.2552,
"step": 1750
},
{
"epoch": 0.4,
"grad_norm": 16.884418478781473,
"learning_rate": 1.491885095308391e-05,
"loss": 6.1878,
"step": 1755
},
{
"epoch": 0.4,
"grad_norm": 26.06314374849514,
"learning_rate": 1.4883940873107572e-05,
"loss": 6.2067,
"step": 1760
},
{
"epoch": 0.41,
"grad_norm": 11.772139032290678,
"learning_rate": 1.4848952451178508e-05,
"loss": 6.1506,
"step": 1765
},
{
"epoch": 0.41,
"grad_norm": 7.890512493835399,
"learning_rate": 1.4813886248536376e-05,
"loss": 6.1331,
"step": 1770
},
{
"epoch": 0.41,
"grad_norm": 12.62470607783592,
"learning_rate": 1.4778742827668484e-05,
"loss": 6.1142,
"step": 1775
},
{
"epoch": 0.41,
"grad_norm": 36.700960091806486,
"learning_rate": 1.4743522752300793e-05,
"loss": 6.0802,
"step": 1780
},
{
"epoch": 0.41,
"grad_norm": 14.397456689103558,
"learning_rate": 1.4708226587388845e-05,
"loss": 6.0312,
"step": 1785
},
{
"epoch": 0.41,
"grad_norm": 33.258017170458196,
"learning_rate": 1.467285489910872e-05,
"loss": 6.0318,
"step": 1790
},
{
"epoch": 0.41,
"grad_norm": 22.65861713891252,
"learning_rate": 1.4637408254847936e-05,
"loss": 6.0082,
"step": 1795
},
{
"epoch": 0.41,
"grad_norm": 27.453970567083232,
"learning_rate": 1.4601887223196374e-05,
"loss": 5.9184,
"step": 1800
},
{
"epoch": 0.41,
"grad_norm": 22.483790124784434,
"learning_rate": 1.4566292373937133e-05,
"loss": 5.9385,
"step": 1805
},
{
"epoch": 0.42,
"grad_norm": 76.714301112878,
"learning_rate": 1.4530624278037406e-05,
"loss": 5.8839,
"step": 1810
},
{
"epoch": 0.42,
"grad_norm": 60.99442830394419,
"learning_rate": 1.449488350763931e-05,
"loss": 5.9291,
"step": 1815
},
{
"epoch": 0.42,
"grad_norm": 43.48487974907191,
"learning_rate": 1.4459070636050721e-05,
"loss": 5.9295,
"step": 1820
},
{
"epoch": 0.42,
"grad_norm": 8.849205696409507,
"learning_rate": 1.4423186237736063e-05,
"loss": 5.8609,
"step": 1825
},
{
"epoch": 0.42,
"grad_norm": 46.120560612475195,
"learning_rate": 1.4387230888307098e-05,
"loss": 5.8535,
"step": 1830
},
{
"epoch": 0.42,
"grad_norm": 42.42359692143847,
"learning_rate": 1.4351205164513708e-05,
"loss": 5.8279,
"step": 1835
},
{
"epoch": 0.42,
"grad_norm": 33.64892053133189,
"learning_rate": 1.4315109644234619e-05,
"loss": 5.8832,
"step": 1840
},
{
"epoch": 0.42,
"grad_norm": 44.342036592354745,
"learning_rate": 1.427894490646815e-05,
"loss": 5.7869,
"step": 1845
},
{
"epoch": 0.42,
"grad_norm": 23.531884493857213,
"learning_rate": 1.4242711531322912e-05,
"loss": 5.8184,
"step": 1850
},
{
"epoch": 0.43,
"grad_norm": 24.495321259837898,
"learning_rate": 1.420641010000852e-05,
"loss": 5.7591,
"step": 1855
},
{
"epoch": 0.43,
"grad_norm": 101.90422975423697,
"learning_rate": 1.4170041194826247e-05,
"loss": 5.8044,
"step": 1860
},
{
"epoch": 0.43,
"grad_norm": 63.98708014495446,
"learning_rate": 1.4133605399159706e-05,
"loss": 5.9446,
"step": 1865
},
{
"epoch": 0.43,
"grad_norm": 29.38341129380048,
"learning_rate": 1.4097103297465471e-05,
"loss": 5.9626,
"step": 1870
},
{
"epoch": 0.43,
"grad_norm": 16.457857993310515,
"learning_rate": 1.4060535475263725e-05,
"loss": 5.8796,
"step": 1875
},
{
"epoch": 0.43,
"grad_norm": 12.75715712434224,
"learning_rate": 1.402390251912885e-05,
"loss": 5.8067,
"step": 1880
},
{
"epoch": 0.43,
"grad_norm": 10.553879277739714,
"learning_rate": 1.398720501668002e-05,
"loss": 5.791,
"step": 1885
},
{
"epoch": 0.43,
"grad_norm": 23.985007630134017,
"learning_rate": 1.395044355657178e-05,
"loss": 5.736,
"step": 1890
},
{
"epoch": 0.43,
"grad_norm": 20.71153720384459,
"learning_rate": 1.391361872848461e-05,
"loss": 5.7062,
"step": 1895
},
{
"epoch": 0.44,
"grad_norm": 33.58186355970371,
"learning_rate": 1.387673112311545e-05,
"loss": 5.7455,
"step": 1900
},
{
"epoch": 0.44,
"grad_norm": 24.602274943269077,
"learning_rate": 1.3839781332168236e-05,
"loss": 5.6321,
"step": 1905
},
{
"epoch": 0.44,
"grad_norm": 18.305365670645493,
"learning_rate": 1.3802769948344406e-05,
"loss": 5.6455,
"step": 1910
},
{
"epoch": 0.44,
"grad_norm": 17.656269054544428,
"learning_rate": 1.3765697565333387e-05,
"loss": 5.6137,
"step": 1915
},
{
"epoch": 0.44,
"grad_norm": 33.06252808092646,
"learning_rate": 1.3728564777803089e-05,
"loss": 5.6283,
"step": 1920
},
{
"epoch": 0.44,
"grad_norm": 7.31153267089378,
"learning_rate": 1.369137218139034e-05,
"loss": 5.6687,
"step": 1925
},
{
"epoch": 0.44,
"grad_norm": 43.46939760510257,
"learning_rate": 1.3654120372691361e-05,
"loss": 5.6522,
"step": 1930
},
{
"epoch": 0.44,
"grad_norm": 40.352268702600746,
"learning_rate": 1.3616809949252168e-05,
"loss": 5.6521,
"step": 1935
},
{
"epoch": 0.45,
"grad_norm": 14.07491035131935,
"learning_rate": 1.3579441509559007e-05,
"loss": 5.6476,
"step": 1940
},
{
"epoch": 0.45,
"grad_norm": 13.1869662531745,
"learning_rate": 1.3542015653028742e-05,
"loss": 5.5999,
"step": 1945
},
{
"epoch": 0.45,
"grad_norm": 12.602728660576666,
"learning_rate": 1.350453297999925e-05,
"loss": 5.5798,
"step": 1950
},
{
"epoch": 0.45,
"grad_norm": 47.72655669632253,
"learning_rate": 1.3466994091719782e-05,
"loss": 5.6063,
"step": 1955
},
{
"epoch": 0.45,
"grad_norm": 44.8093903764745,
"learning_rate": 1.3429399590341325e-05,
"loss": 5.604,
"step": 1960
},
{
"epoch": 0.45,
"grad_norm": 18.97308595224727,
"learning_rate": 1.3391750078906939e-05,
"loss": 5.5722,
"step": 1965
},
{
"epoch": 0.45,
"grad_norm": 85.6251743171489,
"learning_rate": 1.3354046161342087e-05,
"loss": 5.5877,
"step": 1970
},
{
"epoch": 0.45,
"grad_norm": 30.512861408284476,
"learning_rate": 1.3316288442444943e-05,
"loss": 5.5643,
"step": 1975
},
{
"epoch": 0.45,
"grad_norm": 12.905340157899301,
"learning_rate": 1.327847752787669e-05,
"loss": 5.5623,
"step": 1980
},
{
"epoch": 0.46,
"grad_norm": 60.35647636456591,
"learning_rate": 1.324061402415182e-05,
"loss": 5.5357,
"step": 1985
},
{
"epoch": 0.46,
"grad_norm": 28.424225727617344,
"learning_rate": 1.3202698538628376e-05,
"loss": 5.5233,
"step": 1990
},
{
"epoch": 0.46,
"grad_norm": 153.36892036409608,
"learning_rate": 1.3164731679498249e-05,
"loss": 5.4883,
"step": 1995
},
{
"epoch": 0.46,
"grad_norm": 15.941356320454116,
"learning_rate": 1.3126714055777378e-05,
"loss": 5.551,
"step": 2000
},
{
"epoch": 0.46,
"grad_norm": 53.360743928106146,
"learning_rate": 1.3088646277296018e-05,
"loss": 5.5101,
"step": 2005
},
{
"epoch": 0.46,
"grad_norm": 22.283754442776264,
"learning_rate": 1.3050528954688932e-05,
"loss": 5.4968,
"step": 2010
},
{
"epoch": 0.46,
"grad_norm": 15.309834032348661,
"learning_rate": 1.3012362699385616e-05,
"loss": 5.4641,
"step": 2015
},
{
"epoch": 0.46,
"grad_norm": 48.765379913872955,
"learning_rate": 1.2974148123600477e-05,
"loss": 5.4745,
"step": 2020
},
{
"epoch": 0.46,
"grad_norm": 85.68051399317197,
"learning_rate": 1.2935885840323015e-05,
"loss": 5.532,
"step": 2025
},
{
"epoch": 0.47,
"grad_norm": 33.710633120635386,
"learning_rate": 1.2897576463307999e-05,
"loss": 5.4799,
"step": 2030
},
{
"epoch": 0.47,
"grad_norm": 34.47592415932075,
"learning_rate": 1.285922060706561e-05,
"loss": 5.482,
"step": 2035
},
{
"epoch": 0.47,
"grad_norm": 14.767073605394202,
"learning_rate": 1.2820818886851599e-05,
"loss": 5.4112,
"step": 2040
},
{
"epoch": 0.47,
"grad_norm": 12.482712560989532,
"learning_rate": 1.2782371918657393e-05,
"loss": 5.3771,
"step": 2045
},
{
"epoch": 0.47,
"grad_norm": 41.50415361625991,
"learning_rate": 1.2743880319200241e-05,
"loss": 5.3874,
"step": 2050
},
{
"epoch": 0.47,
"grad_norm": 31.642237047280826,
"learning_rate": 1.270534470591331e-05,
"loss": 5.3966,
"step": 2055
},
{
"epoch": 0.47,
"grad_norm": 69.19319134724441,
"learning_rate": 1.2666765696935773e-05,
"loss": 5.3924,
"step": 2060
},
{
"epoch": 0.47,
"grad_norm": 32.008395804279004,
"learning_rate": 1.2628143911102905e-05,
"loss": 5.4084,
"step": 2065
},
{
"epoch": 0.47,
"grad_norm": 50.15983811581157,
"learning_rate": 1.2589479967936163e-05,
"loss": 5.382,
"step": 2070
},
{
"epoch": 0.48,
"grad_norm": 13.619109989883537,
"learning_rate": 1.2550774487633218e-05,
"loss": 5.3693,
"step": 2075
},
{
"epoch": 0.48,
"grad_norm": 84.80172491530355,
"learning_rate": 1.2512028091058044e-05,
"loss": 5.3354,
"step": 2080
},
{
"epoch": 0.48,
"grad_norm": 116.07832106775594,
"learning_rate": 1.2473241399730931e-05,
"loss": 5.3473,
"step": 2085
},
{
"epoch": 0.48,
"grad_norm": 26.694652075068255,
"learning_rate": 1.2434415035818535e-05,
"loss": 5.345,
"step": 2090
},
{
"epoch": 0.48,
"grad_norm": 54.00503230741141,
"learning_rate": 1.239554962212388e-05,
"loss": 5.3973,
"step": 2095
},
{
"epoch": 0.48,
"grad_norm": 10.543680083461279,
"learning_rate": 1.2356645782076384e-05,
"loss": 5.3688,
"step": 2100
},
{
"epoch": 0.48,
"grad_norm": 65.51859427381903,
"learning_rate": 1.2317704139721847e-05,
"loss": 5.3773,
"step": 2105
},
{
"epoch": 0.48,
"grad_norm": 29.71675462869479,
"learning_rate": 1.2278725319712449e-05,
"loss": 5.2786,
"step": 2110
},
{
"epoch": 0.49,
"grad_norm": 33.01336130546269,
"learning_rate": 1.2239709947296722e-05,
"loss": 5.311,
"step": 2115
},
{
"epoch": 0.49,
"grad_norm": 29.973987092234548,
"learning_rate": 1.2200658648309531e-05,
"loss": 5.2992,
"step": 2120
},
{
"epoch": 0.49,
"grad_norm": 48.926488754680314,
"learning_rate": 1.2161572049162027e-05,
"loss": 5.2774,
"step": 2125
},
{
"epoch": 0.49,
"grad_norm": 8.5731820792718,
"learning_rate": 1.2122450776831593e-05,
"loss": 5.2921,
"step": 2130
},
{
"epoch": 0.49,
"grad_norm": 54.271928916848765,
"learning_rate": 1.208329545885181e-05,
"loss": 5.2721,
"step": 2135
},
{
"epoch": 0.49,
"grad_norm": 58.51752529939886,
"learning_rate": 1.2044106723302364e-05,
"loss": 5.3084,
"step": 2140
},
{
"epoch": 0.49,
"grad_norm": 33.27476309879864,
"learning_rate": 1.200488519879899e-05,
"loss": 5.2501,
"step": 2145
},
{
"epoch": 0.49,
"grad_norm": 25.846871549849688,
"learning_rate": 1.1965631514483376e-05,
"loss": 5.273,
"step": 2150
},
{
"epoch": 0.49,
"grad_norm": 29.71630100350262,
"learning_rate": 1.1926346300013078e-05,
"loss": 5.1903,
"step": 2155
},
{
"epoch": 0.5,
"grad_norm": 48.29209358595899,
"learning_rate": 1.1887030185551427e-05,
"loss": 5.202,
"step": 2160
},
{
"epoch": 0.5,
"grad_norm": 57.498341779085,
"learning_rate": 1.18476838017574e-05,
"loss": 5.2558,
"step": 2165
},
{
"epoch": 0.5,
"grad_norm": 37.88134720461833,
"learning_rate": 1.1808307779775518e-05,
"loss": 5.2759,
"step": 2170
},
{
"epoch": 0.5,
"grad_norm": 21.238832228632518,
"learning_rate": 1.176890275122573e-05,
"loss": 5.2207,
"step": 2175
},
{
"epoch": 0.5,
"grad_norm": 58.74754679184001,
"learning_rate": 1.1729469348193263e-05,
"loss": 5.1915,
"step": 2180
},
{
"epoch": 0.5,
"grad_norm": 85.34069836046139,
"learning_rate": 1.1690008203218493e-05,
"loss": 5.2966,
"step": 2185
},
{
"epoch": 0.5,
"grad_norm": 35.44463556250631,
"learning_rate": 1.1650519949286797e-05,
"loss": 5.2205,
"step": 2190
},
{
"epoch": 0.5,
"grad_norm": 29.508279045032964,
"learning_rate": 1.1611005219818392e-05,
"loss": 5.2509,
"step": 2195
},
{
"epoch": 0.5,
"grad_norm": 19.983013642914806,
"learning_rate": 1.1571464648658201e-05,
"loss": 5.2294,
"step": 2200
},
{
"epoch": 0.51,
"grad_norm": 51.50574440943992,
"learning_rate": 1.1531898870065645e-05,
"loss": 5.1938,
"step": 2205
},
{
"epoch": 0.51,
"grad_norm": 59.492851827921314,
"learning_rate": 1.1492308518704507e-05,
"loss": 5.1673,
"step": 2210
},
{
"epoch": 0.51,
"grad_norm": 40.117703874194646,
"learning_rate": 1.145269422963272e-05,
"loss": 5.1442,
"step": 2215
},
{
"epoch": 0.51,
"grad_norm": 43.459311512165996,
"learning_rate": 1.1413056638292215e-05,
"loss": 5.1993,
"step": 2220
},
{
"epoch": 0.51,
"grad_norm": 82.49562635086012,
"learning_rate": 1.1373396380498683e-05,
"loss": 5.1647,
"step": 2225
},
{
"epoch": 0.51,
"grad_norm": 49.800451164925974,
"learning_rate": 1.1333714092431423e-05,
"loss": 5.194,
"step": 2230
},
{
"epoch": 0.51,
"grad_norm": 25.30211289206568,
"learning_rate": 1.1294010410623107e-05,
"loss": 5.1499,
"step": 2235
},
{
"epoch": 0.51,
"grad_norm": 77.40197466561355,
"learning_rate": 1.1254285971949574e-05,
"loss": 5.1234,
"step": 2240
},
{
"epoch": 0.52,
"grad_norm": 25.94865795704941,
"learning_rate": 1.1214541413619628e-05,
"loss": 5.1313,
"step": 2245
},
{
"epoch": 0.52,
"grad_norm": 42.470163548722276,
"learning_rate": 1.1174777373164797e-05,
"loss": 5.0979,
"step": 2250
},
{
"epoch": 0.52,
"grad_norm": 52.3446908357727,
"learning_rate": 1.1134994488429128e-05,
"loss": 5.1355,
"step": 2255
},
{
"epoch": 0.52,
"grad_norm": 40.38483541097707,
"learning_rate": 1.109519339755893e-05,
"loss": 5.1091,
"step": 2260
},
{
"epoch": 0.52,
"grad_norm": 73.05590392589481,
"learning_rate": 1.1055374738992561e-05,
"loss": 5.094,
"step": 2265
},
{
"epoch": 0.52,
"grad_norm": 14.70864089128146,
"learning_rate": 1.1015539151450172e-05,
"loss": 5.1089,
"step": 2270
},
{
"epoch": 0.52,
"grad_norm": 126.77678907405712,
"learning_rate": 1.0975687273923474e-05,
"loss": 5.1169,
"step": 2275
},
{
"epoch": 0.52,
"grad_norm": 116.95168890571357,
"learning_rate": 1.0935819745665477e-05,
"loss": 5.137,
"step": 2280
},
{
"epoch": 0.52,
"grad_norm": 16.051304830755644,
"learning_rate": 1.0895937206180243e-05,
"loss": 5.0797,
"step": 2285
},
{
"epoch": 0.53,
"grad_norm": 22.43120059083249,
"learning_rate": 1.0856040295212614e-05,
"loss": 5.0401,
"step": 2290
},
{
"epoch": 0.53,
"grad_norm": 39.29902824176953,
"learning_rate": 1.0816129652737976e-05,
"loss": 5.0754,
"step": 2295
},
{
"epoch": 0.53,
"grad_norm": 48.77985418941213,
"learning_rate": 1.077620591895197e-05,
"loss": 5.0088,
"step": 2300
},
{
"epoch": 0.53,
"grad_norm": 28.967042464927275,
"learning_rate": 1.0736269734260232e-05,
"loss": 5.0327,
"step": 2305
},
{
"epoch": 0.53,
"grad_norm": 35.80838537119951,
"learning_rate": 1.069632173926812e-05,
"loss": 4.949,
"step": 2310
},
{
"epoch": 0.53,
"grad_norm": 25.37744948872279,
"learning_rate": 1.0656362574770442e-05,
"loss": 5.0487,
"step": 2315
},
{
"epoch": 0.53,
"grad_norm": 27.443743147851325,
"learning_rate": 1.0616392881741166e-05,
"loss": 5.0757,
"step": 2320
},
{
"epoch": 0.53,
"grad_norm": 95.45635298424027,
"learning_rate": 1.0576413301323148e-05,
"loss": 5.0677,
"step": 2325
},
{
"epoch": 0.53,
"grad_norm": 47.6117313918869,
"learning_rate": 1.0536424474817848e-05,
"loss": 4.9705,
"step": 2330
},
{
"epoch": 0.54,
"grad_norm": 39.12748920114918,
"learning_rate": 1.0496427043675032e-05,
"loss": 5.0286,
"step": 2335
},
{
"epoch": 0.54,
"grad_norm": 73.58917778375972,
"learning_rate": 1.0456421649482502e-05,
"loss": 4.9928,
"step": 2340
},
{
"epoch": 0.54,
"grad_norm": 78.45734276993822,
"learning_rate": 1.041640893395578e-05,
"loss": 5.0972,
"step": 2345
},
{
"epoch": 0.54,
"grad_norm": 25.26009599076755,
"learning_rate": 1.0376389538927841e-05,
"loss": 5.0298,
"step": 2350
},
{
"epoch": 0.54,
"grad_norm": 70.6590336000904,
"learning_rate": 1.0336364106338793e-05,
"loss": 4.9628,
"step": 2355
},
{
"epoch": 0.54,
"grad_norm": 107.78270188957804,
"learning_rate": 1.0296333278225599e-05,
"loss": 5.0169,
"step": 2360
},
{
"epoch": 0.54,
"grad_norm": 52.33879582194398,
"learning_rate": 1.0256297696711764e-05,
"loss": 5.0315,
"step": 2365
},
{
"epoch": 0.54,
"grad_norm": 16.249102954138092,
"learning_rate": 1.0216258003997044e-05,
"loss": 4.9982,
"step": 2370
},
{
"epoch": 0.54,
"grad_norm": 20.332719936580876,
"learning_rate": 1.0176214842347143e-05,
"loss": 4.9946,
"step": 2375
},
{
"epoch": 0.55,
"grad_norm": 37.984031001896334,
"learning_rate": 1.0136168854083401e-05,
"loss": 4.9295,
"step": 2380
},
{
"epoch": 0.55,
"grad_norm": 53.098834473437336,
"learning_rate": 1.0096120681572513e-05,
"loss": 4.9064,
"step": 2385
},
{
"epoch": 0.55,
"grad_norm": 54.783283517303545,
"learning_rate": 1.0056070967216199e-05,
"loss": 4.9895,
"step": 2390
},
{
"epoch": 0.55,
"grad_norm": 37.5165014648596,
"learning_rate": 1.0016020353440916e-05,
"loss": 4.9422,
"step": 2395
},
{
"epoch": 0.55,
"grad_norm": 108.68042109667304,
"learning_rate": 9.975969482687547e-06,
"loss": 4.9495,
"step": 2400
},
{
"epoch": 0.55,
"grad_norm": 123.58611812164843,
"learning_rate": 9.935918997401104e-06,
"loss": 4.9624,
"step": 2405
},
{
"epoch": 0.55,
"grad_norm": 76.39873130451743,
"learning_rate": 9.8958695400204e-06,
"loss": 4.9523,
"step": 2410
},
{
"epoch": 0.55,
"grad_norm": 61.8471682011305,
"learning_rate": 9.855821752967779e-06,
"loss": 4.9636,
"step": 2415
},
{
"epoch": 0.56,
"grad_norm": 59.995751706401286,
"learning_rate": 9.815776278638772e-06,
"loss": 4.9458,
"step": 2420
},
{
"epoch": 0.56,
"grad_norm": 16.402048254533458,
"learning_rate": 9.775733759391833e-06,
"loss": 4.9456,
"step": 2425
},
{
"epoch": 0.56,
"grad_norm": 28.336679722259976,
"learning_rate": 9.735694837537993e-06,
"loss": 4.9485,
"step": 2430
},
{
"epoch": 0.56,
"grad_norm": 34.684944838819,
"learning_rate": 9.695660155330598e-06,
"loss": 4.8956,
"step": 2435
},
{
"epoch": 0.56,
"grad_norm": 55.40359426382184,
"learning_rate": 9.655630354954974e-06,
"loss": 4.9379,
"step": 2440
},
{
"epoch": 0.56,
"grad_norm": 56.22243606993078,
"learning_rate": 9.615606078518143e-06,
"loss": 4.8888,
"step": 2445
},
{
"epoch": 0.56,
"grad_norm": 25.444922627514334,
"learning_rate": 9.57558796803852e-06,
"loss": 4.9219,
"step": 2450
},
{
"epoch": 0.56,
"grad_norm": 27.49053795893979,
"learning_rate": 9.535576665435606e-06,
"loss": 4.9364,
"step": 2455
},
{
"epoch": 0.56,
"grad_norm": 23.530923406419333,
"learning_rate": 9.495572812519718e-06,
"loss": 4.8681,
"step": 2460
},
{
"epoch": 0.57,
"grad_norm": 49.62532394537909,
"learning_rate": 9.455577050981648e-06,
"loss": 4.8465,
"step": 2465
},
{
"epoch": 0.57,
"grad_norm": 38.36145744939352,
"learning_rate": 9.41559002238242e-06,
"loss": 4.8363,
"step": 2470
},
{
"epoch": 0.57,
"grad_norm": 60.0717352423416,
"learning_rate": 9.375612368142962e-06,
"loss": 4.8311,
"step": 2475
},
{
"epoch": 0.57,
"grad_norm": 80.43091159408323,
"learning_rate": 9.33564472953383e-06,
"loss": 4.856,
"step": 2480
},
{
"epoch": 0.57,
"grad_norm": 157.04490281080777,
"learning_rate": 9.295687747664935e-06,
"loss": 4.9268,
"step": 2485
},
{
"epoch": 0.57,
"grad_norm": 40.77389952062912,
"learning_rate": 9.255742063475228e-06,
"loss": 4.8845,
"step": 2490
},
{
"epoch": 0.57,
"grad_norm": 50.41517786447708,
"learning_rate": 9.215808317722453e-06,
"loss": 4.8417,
"step": 2495
},
{
"epoch": 0.57,
"grad_norm": 43.470119721373855,
"learning_rate": 9.175887150972841e-06,
"loss": 4.8295,
"step": 2500
},
{
"epoch": 0.57,
"grad_norm": 38.52488378294851,
"learning_rate": 9.135979203590852e-06,
"loss": 4.7927,
"step": 2505
},
{
"epoch": 0.58,
"grad_norm": 50.05829822932659,
"learning_rate": 9.096085115728902e-06,
"loss": 4.7938,
"step": 2510
},
{
"epoch": 0.58,
"grad_norm": 32.417062147957665,
"learning_rate": 9.056205527317082e-06,
"loss": 4.7832,
"step": 2515
},
{
"epoch": 0.58,
"grad_norm": 43.17389049870212,
"learning_rate": 9.016341078052908e-06,
"loss": 4.8322,
"step": 2520
},
{
"epoch": 0.58,
"grad_norm": 26.175168734109757,
"learning_rate": 8.976492407391046e-06,
"loss": 4.7375,
"step": 2525
},
{
"epoch": 0.58,
"grad_norm": 54.56821168706554,
"learning_rate": 8.93666015453307e-06,
"loss": 4.777,
"step": 2530
},
{
"epoch": 0.58,
"grad_norm": 55.92901066668165,
"learning_rate": 8.89684495841719e-06,
"loss": 4.8629,
"step": 2535
},
{
"epoch": 0.58,
"grad_norm": 60.84437729594054,
"learning_rate": 8.857047457708023e-06,
"loss": 4.7472,
"step": 2540
},
{
"epoch": 0.58,
"grad_norm": 66.07551312053982,
"learning_rate": 8.817268290786343e-06,
"loss": 4.8064,
"step": 2545
},
{
"epoch": 0.59,
"grad_norm": 70.80552970949772,
"learning_rate": 8.777508095738818e-06,
"loss": 4.7755,
"step": 2550
},
{
"epoch": 0.59,
"grad_norm": 40.034281163404245,
"learning_rate": 8.737767510347816e-06,
"loss": 4.7675,
"step": 2555
},
{
"epoch": 0.59,
"grad_norm": 43.61238525728124,
"learning_rate": 8.698047172081129e-06,
"loss": 4.7917,
"step": 2560
},
{
"epoch": 0.59,
"grad_norm": 70.59672678835062,
"learning_rate": 8.658347718081791e-06,
"loss": 4.7439,
"step": 2565
},
{
"epoch": 0.59,
"grad_norm": 66.1516485301477,
"learning_rate": 8.618669785157825e-06,
"loss": 4.7205,
"step": 2570
},
{
"epoch": 0.59,
"grad_norm": 51.425818625655715,
"learning_rate": 8.579014009772045e-06,
"loss": 4.765,
"step": 2575
},
{
"epoch": 0.59,
"grad_norm": 59.5563139018077,
"learning_rate": 8.539381028031838e-06,
"loss": 4.7086,
"step": 2580
},
{
"epoch": 0.59,
"grad_norm": 32.02533818205619,
"learning_rate": 8.499771475678968e-06,
"loss": 4.7159,
"step": 2585
},
{
"epoch": 0.59,
"grad_norm": 28.169693520409528,
"learning_rate": 8.46018598807938e-06,
"loss": 4.781,
"step": 2590
},
{
"epoch": 0.6,
"grad_norm": 33.43326529222529,
"learning_rate": 8.420625200212985e-06,
"loss": 4.7727,
"step": 2595
},
{
"epoch": 0.6,
"grad_norm": 15.602721631920888,
"learning_rate": 8.381089746663517e-06,
"loss": 4.7277,
"step": 2600
},
{
"epoch": 0.6,
"grad_norm": 75.75678646235137,
"learning_rate": 8.341580261608305e-06,
"loss": 4.7178,
"step": 2605
},
{
"epoch": 0.6,
"grad_norm": 105.35921413917552,
"learning_rate": 8.302097378808147e-06,
"loss": 4.7169,
"step": 2610
},
{
"epoch": 0.6,
"grad_norm": 66.6503863002048,
"learning_rate": 8.262641731597097e-06,
"loss": 4.7065,
"step": 2615
},
{
"epoch": 0.6,
"grad_norm": 63.36937965279217,
"learning_rate": 8.223213952872353e-06,
"loss": 4.7571,
"step": 2620
},
{
"epoch": 0.6,
"grad_norm": 42.26449627514292,
"learning_rate": 8.183814675084074e-06,
"loss": 4.7193,
"step": 2625
},
{
"epoch": 0.6,
"grad_norm": 51.922201070153356,
"learning_rate": 8.144444530225237e-06,
"loss": 4.645,
"step": 2630
},
{
"epoch": 0.6,
"grad_norm": 49.62760310535778,
"learning_rate": 8.105104149821515e-06,
"loss": 4.6761,
"step": 2635
},
{
"epoch": 0.61,
"grad_norm": 26.063474264685297,
"learning_rate": 8.065794164921128e-06,
"loss": 4.7211,
"step": 2640
},
{
"epoch": 0.61,
"grad_norm": 37.10041174063637,
"learning_rate": 8.026515206084744e-06,
"loss": 4.62,
"step": 2645
},
{
"epoch": 0.61,
"grad_norm": 49.537074028126945,
"learning_rate": 7.987267903375331e-06,
"loss": 4.6471,
"step": 2650
},
{
"epoch": 0.61,
"grad_norm": 51.18992061136639,
"learning_rate": 7.948052886348091e-06,
"loss": 4.7218,
"step": 2655
},
{
"epoch": 0.61,
"grad_norm": 32.615492742378834,
"learning_rate": 7.90887078404033e-06,
"loss": 4.6906,
"step": 2660
},
{
"epoch": 0.61,
"grad_norm": 31.099865231660658,
"learning_rate": 7.869722224961372e-06,
"loss": 4.6481,
"step": 2665
},
{
"epoch": 0.61,
"grad_norm": 56.24729430957337,
"learning_rate": 7.830607837082494e-06,
"loss": 4.5412,
"step": 2670
},
{
"epoch": 0.61,
"grad_norm": 53.552077180701694,
"learning_rate": 7.791528247826832e-06,
"loss": 4.6727,
"step": 2675
},
{
"epoch": 0.61,
"grad_norm": 22.552847832781552,
"learning_rate": 7.75248408405934e-06,
"loss": 4.6075,
"step": 2680
},
{
"epoch": 0.62,
"grad_norm": 25.173048725283913,
"learning_rate": 7.71347597207671e-06,
"loss": 4.6629,
"step": 2685
},
{
"epoch": 0.62,
"grad_norm": 23.941386790396614,
"learning_rate": 7.674504537597336e-06,
"loss": 4.6419,
"step": 2690
},
{
"epoch": 0.62,
"grad_norm": 97.73934134607612,
"learning_rate": 7.635570405751297e-06,
"loss": 4.686,
"step": 2695
},
{
"epoch": 0.62,
"grad_norm": 25.939426037429264,
"learning_rate": 7.596674201070282e-06,
"loss": 4.6312,
"step": 2700
},
{
"epoch": 0.62,
"grad_norm": 60.83860372254808,
"learning_rate": 7.557816547477627e-06,
"loss": 4.6386,
"step": 2705
},
{
"epoch": 0.62,
"grad_norm": 32.30676478489584,
"learning_rate": 7.518998068278266e-06,
"loss": 4.613,
"step": 2710
},
{
"epoch": 0.62,
"grad_norm": 25.044495875697613,
"learning_rate": 7.480219386148751e-06,
"loss": 4.5508,
"step": 2715
},
{
"epoch": 0.62,
"grad_norm": 43.24371720695532,
"learning_rate": 7.441481123127257e-06,
"loss": 4.5489,
"step": 2720
},
{
"epoch": 0.63,
"grad_norm": 12.562426692181319,
"learning_rate": 7.402783900603612e-06,
"loss": 4.6438,
"step": 2725
},
{
"epoch": 0.63,
"grad_norm": 60.56989492512174,
"learning_rate": 7.364128339309326e-06,
"loss": 4.532,
"step": 2730
},
{
"epoch": 0.63,
"grad_norm": 26.419914483143693,
"learning_rate": 7.325515059307622e-06,
"loss": 4.5474,
"step": 2735
},
{
"epoch": 0.63,
"grad_norm": 64.0140334222756,
"learning_rate": 7.286944679983521e-06,
"loss": 4.5868,
"step": 2740
},
{
"epoch": 0.63,
"grad_norm": 47.227122182136696,
"learning_rate": 7.248417820033857e-06,
"loss": 4.4863,
"step": 2745
},
{
"epoch": 0.63,
"grad_norm": 57.003929679910804,
"learning_rate": 7.209935097457412e-06,
"loss": 4.5547,
"step": 2750
},
{
"epoch": 0.63,
"grad_norm": 51.97090726817012,
"learning_rate": 7.171497129544946e-06,
"loss": 4.5544,
"step": 2755
},
{
"epoch": 0.63,
"grad_norm": 87.12591293798738,
"learning_rate": 7.133104532869342e-06,
"loss": 4.4572,
"step": 2760
},
{
"epoch": 0.63,
"grad_norm": 31.837006106829726,
"learning_rate": 7.094757923275688e-06,
"loss": 4.4516,
"step": 2765
},
{
"epoch": 0.64,
"grad_norm": 34.74652280757694,
"learning_rate": 7.056457915871399e-06,
"loss": 4.4672,
"step": 2770
},
{
"epoch": 0.64,
"grad_norm": 51.35076516856966,
"learning_rate": 7.018205125016369e-06,
"loss": 4.479,
"step": 2775
},
{
"epoch": 0.64,
"grad_norm": 63.95419645820714,
"learning_rate": 6.980000164313093e-06,
"loss": 4.5476,
"step": 2780
},
{
"epoch": 0.64,
"grad_norm": 64.70406060026058,
"learning_rate": 6.9418436465968485e-06,
"loss": 4.5368,
"step": 2785
},
{
"epoch": 0.64,
"grad_norm": 33.66827494802027,
"learning_rate": 6.903736183925835e-06,
"loss": 4.5201,
"step": 2790
},
{
"epoch": 0.64,
"grad_norm": 52.74134921214354,
"learning_rate": 6.865678387571394e-06,
"loss": 4.4905,
"step": 2795
},
{
"epoch": 0.64,
"grad_norm": 56.22271055622349,
"learning_rate": 6.82767086800817e-06,
"loss": 4.4965,
"step": 2800
},
{
"epoch": 0.64,
"grad_norm": 16.41040693265605,
"learning_rate": 6.789714234904332e-06,
"loss": 4.4832,
"step": 2805
},
{
"epoch": 0.64,
"grad_norm": 60.85653173977498,
"learning_rate": 6.751809097111799e-06,
"loss": 4.3844,
"step": 2810
},
{
"epoch": 0.65,
"grad_norm": 32.72687745774018,
"learning_rate": 6.71395606265646e-06,
"loss": 4.494,
"step": 2815
},
{
"epoch": 0.65,
"grad_norm": 24.316547206805122,
"learning_rate": 6.676155738728438e-06,
"loss": 4.4608,
"step": 2820
},
{
"epoch": 0.65,
"grad_norm": 14.434036241184234,
"learning_rate": 6.638408731672332e-06,
"loss": 4.4666,
"step": 2825
},
{
"epoch": 0.65,
"grad_norm": 57.148441922309786,
"learning_rate": 6.600715646977503e-06,
"loss": 4.4279,
"step": 2830
},
{
"epoch": 0.65,
"grad_norm": 27.612312611508564,
"learning_rate": 6.5630770892683656e-06,
"loss": 4.3871,
"step": 2835
},
{
"epoch": 0.65,
"grad_norm": 46.055770557265205,
"learning_rate": 6.525493662294669e-06,
"loss": 4.3828,
"step": 2840
},
{
"epoch": 0.65,
"grad_norm": 29.944780931656958,
"learning_rate": 6.487965968921834e-06,
"loss": 4.3734,
"step": 2845
},
{
"epoch": 0.65,
"grad_norm": 65.19612839352436,
"learning_rate": 6.450494611121274e-06,
"loss": 4.3356,
"step": 2850
},
{
"epoch": 0.66,
"grad_norm": 29.427807906606667,
"learning_rate": 6.413080189960734e-06,
"loss": 4.4448,
"step": 2855
},
{
"epoch": 0.66,
"grad_norm": 34.62611381334959,
"learning_rate": 6.375723305594658e-06,
"loss": 4.3736,
"step": 2860
},
{
"epoch": 0.66,
"grad_norm": 40.05866733756267,
"learning_rate": 6.338424557254556e-06,
"loss": 4.3007,
"step": 2865
},
{
"epoch": 0.66,
"grad_norm": 29.52996151229796,
"learning_rate": 6.301184543239398e-06,
"loss": 4.3379,
"step": 2870
},
{
"epoch": 0.66,
"grad_norm": 53.268001034947524,
"learning_rate": 6.264003860906003e-06,
"loss": 4.3931,
"step": 2875
},
{
"epoch": 0.66,
"grad_norm": 54.62261873319705,
"learning_rate": 6.2268831066594846e-06,
"loss": 4.3074,
"step": 2880
},
{
"epoch": 0.66,
"grad_norm": 126.40837022827374,
"learning_rate": 6.189822875943644e-06,
"loss": 4.3585,
"step": 2885
},
{
"epoch": 0.66,
"grad_norm": 38.42244306123947,
"learning_rate": 6.152823763231463e-06,
"loss": 4.4187,
"step": 2890
},
{
"epoch": 0.66,
"grad_norm": 99.40712122912547,
"learning_rate": 6.115886362015525e-06,
"loss": 4.3485,
"step": 2895
},
{
"epoch": 0.67,
"grad_norm": 29.73588763253472,
"learning_rate": 6.079011264798534e-06,
"loss": 4.4134,
"step": 2900
},
{
"epoch": 0.67,
"grad_norm": 44.79201001634174,
"learning_rate": 6.042199063083787e-06,
"loss": 4.3128,
"step": 2905
},
{
"epoch": 0.67,
"grad_norm": 16.491851726212843,
"learning_rate": 6.005450347365687e-06,
"loss": 4.2906,
"step": 2910
},
{
"epoch": 0.67,
"grad_norm": 54.87856940808512,
"learning_rate": 5.96876570712028e-06,
"loss": 4.2281,
"step": 2915
},
{
"epoch": 0.67,
"grad_norm": 79.43830358158179,
"learning_rate": 5.932145730795793e-06,
"loss": 4.3322,
"step": 2920
},
{
"epoch": 0.67,
"grad_norm": 10.817241852028406,
"learning_rate": 5.895591005803198e-06,
"loss": 4.2711,
"step": 2925
},
{
"epoch": 0.67,
"grad_norm": 35.67244995828527,
"learning_rate": 5.859102118506787e-06,
"loss": 4.2798,
"step": 2930
},
{
"epoch": 0.67,
"grad_norm": 37.49555978702204,
"learning_rate": 5.822679654214771e-06,
"loss": 4.3644,
"step": 2935
},
{
"epoch": 0.67,
"grad_norm": 34.7133878312333,
"learning_rate": 5.786324197169887e-06,
"loss": 4.3002,
"step": 2940
},
{
"epoch": 0.68,
"grad_norm": 44.151270816410126,
"learning_rate": 5.7500363305400185e-06,
"loss": 4.3286,
"step": 2945
},
{
"epoch": 0.68,
"grad_norm": 17.03079214584477,
"learning_rate": 5.713816636408871e-06,
"loss": 4.2349,
"step": 2950
},
{
"epoch": 0.68,
"grad_norm": 24.552884846518282,
"learning_rate": 5.677665695766581e-06,
"loss": 4.2901,
"step": 2955
},
{
"epoch": 0.68,
"grad_norm": 33.95441883738904,
"learning_rate": 5.641584088500461e-06,
"loss": 4.2871,
"step": 2960
},
{
"epoch": 0.68,
"grad_norm": 25.835754131711642,
"learning_rate": 5.605572393385645e-06,
"loss": 4.265,
"step": 2965
},
{
"epoch": 0.68,
"grad_norm": 25.26568170761081,
"learning_rate": 5.569631188075842e-06,
"loss": 4.2861,
"step": 2970
},
{
"epoch": 0.68,
"grad_norm": 76.32391957126073,
"learning_rate": 5.5337610490940375e-06,
"loss": 4.2465,
"step": 2975
},
{
"epoch": 0.68,
"grad_norm": 28.611274776347827,
"learning_rate": 5.497962551823266e-06,
"loss": 4.2638,
"step": 2980
},
{
"epoch": 0.68,
"grad_norm": 51.74402041961238,
"learning_rate": 5.46223627049739e-06,
"loss": 4.2331,
"step": 2985
},
{
"epoch": 0.69,
"grad_norm": 31.717225193208684,
"learning_rate": 5.426582778191858e-06,
"loss": 4.3613,
"step": 2990
},
{
"epoch": 0.69,
"grad_norm": 93.29031808462936,
"learning_rate": 5.3910026468145384e-06,
"loss": 4.2825,
"step": 2995
},
{
"epoch": 0.69,
"grad_norm": 45.06093242733675,
"learning_rate": 5.355496447096533e-06,
"loss": 4.1915,
"step": 3000
},
{
"epoch": 0.69,
"grad_norm": 143.69932721172492,
"learning_rate": 5.320064748583031e-06,
"loss": 4.2229,
"step": 3005
},
{
"epoch": 0.69,
"grad_norm": 43.33436292395085,
"learning_rate": 5.284708119624173e-06,
"loss": 4.1983,
"step": 3010
},
{
"epoch": 0.69,
"grad_norm": 34.00278112862677,
"learning_rate": 5.249427127365918e-06,
"loss": 4.24,
"step": 3015
},
{
"epoch": 0.69,
"grad_norm": 47.614893220448685,
"learning_rate": 5.2142223377409616e-06,
"loss": 4.2645,
"step": 3020
},
{
"epoch": 0.69,
"grad_norm": 35.06663560378835,
"learning_rate": 5.179094315459652e-06,
"loss": 4.2547,
"step": 3025
},
{
"epoch": 0.7,
"grad_norm": 20.809033630860146,
"learning_rate": 5.144043624000944e-06,
"loss": 4.2138,
"step": 3030
},
{
"epoch": 0.7,
"grad_norm": 57.39876741653422,
"learning_rate": 5.109070825603338e-06,
"loss": 4.213,
"step": 3035
},
{
"epoch": 0.7,
"grad_norm": 26.21823422312812,
"learning_rate": 5.074176481255873e-06,
"loss": 4.1925,
"step": 3040
},
{
"epoch": 0.7,
"grad_norm": 39.3403157676951,
"learning_rate": 5.039361150689141e-06,
"loss": 4.2599,
"step": 3045
},
{
"epoch": 0.7,
"grad_norm": 39.47336093394705,
"learning_rate": 5.00462539236628e-06,
"loss": 4.1208,
"step": 3050
},
{
"epoch": 0.7,
"grad_norm": 52.22125643489011,
"learning_rate": 4.969969763474047e-06,
"loss": 4.1573,
"step": 3055
},
{
"epoch": 0.7,
"grad_norm": 54.28036221168733,
"learning_rate": 4.935394819913849e-06,
"loss": 4.1955,
"step": 3060
},
{
"epoch": 0.7,
"grad_norm": 34.034655711045716,
"learning_rate": 4.900901116292854e-06,
"loss": 4.1996,
"step": 3065
},
{
"epoch": 0.7,
"grad_norm": 26.78872189890714,
"learning_rate": 4.866489205915072e-06,
"loss": 4.1856,
"step": 3070
},
{
"epoch": 0.71,
"grad_norm": 16.312287518234115,
"learning_rate": 4.8321596407725044e-06,
"loss": 4.1166,
"step": 3075
},
{
"epoch": 0.71,
"grad_norm": 75.08013865287577,
"learning_rate": 4.7979129715362625e-06,
"loss": 4.0856,
"step": 3080
},
{
"epoch": 0.71,
"grad_norm": 12.006364091554866,
"learning_rate": 4.7637497475477465e-06,
"loss": 4.1962,
"step": 3085
},
{
"epoch": 0.71,
"grad_norm": 60.3078722361271,
"learning_rate": 4.72967051680985e-06,
"loss": 4.1743,
"step": 3090
},
{
"epoch": 0.71,
"grad_norm": 71.3931741313261,
"learning_rate": 4.695675825978133e-06,
"loss": 4.2264,
"step": 3095
},
{
"epoch": 0.71,
"grad_norm": 39.88478916067746,
"learning_rate": 4.661766220352098e-06,
"loss": 4.1791,
"step": 3100
},
{
"epoch": 0.71,
"grad_norm": 35.51853711087642,
"learning_rate": 4.627942243866387e-06,
"loss": 4.2068,
"step": 3105
},
{
"epoch": 0.71,
"grad_norm": 22.525777126158957,
"learning_rate": 4.594204439082122e-06,
"loss": 4.1823,
"step": 3110
},
{
"epoch": 0.71,
"grad_norm": 27.12535016689027,
"learning_rate": 4.560553347178144e-06,
"loss": 4.1541,
"step": 3115
},
{
"epoch": 0.72,
"grad_norm": 30.924051240195272,
"learning_rate": 4.526989507942374e-06,
"loss": 4.1083,
"step": 3120
},
{
"epoch": 0.72,
"grad_norm": 36.007531222594395,
"learning_rate": 4.493513459763126e-06,
"loss": 4.1531,
"step": 3125
},
{
"epoch": 0.72,
"grad_norm": 43.057060831713464,
"learning_rate": 4.460125739620479e-06,
"loss": 4.0741,
"step": 3130
},
{
"epoch": 0.72,
"grad_norm": 55.48363364948151,
"learning_rate": 4.426826883077681e-06,
"loss": 4.1667,
"step": 3135
},
{
"epoch": 0.72,
"grad_norm": 35.8318271641625,
"learning_rate": 4.393617424272527e-06,
"loss": 4.1549,
"step": 3140
},
{
"epoch": 0.72,
"grad_norm": 23.77098245342959,
"learning_rate": 4.360497895908826e-06,
"loss": 4.1396,
"step": 3145
},
{
"epoch": 0.72,
"grad_norm": 47.72018152839063,
"learning_rate": 4.3274688292478105e-06,
"loss": 4.0997,
"step": 3150
},
{
"epoch": 0.72,
"grad_norm": 62.64419565990156,
"learning_rate": 4.294530754099666e-06,
"loss": 4.1044,
"step": 3155
},
{
"epoch": 0.73,
"grad_norm": 115.91048946848494,
"learning_rate": 4.261684198815004e-06,
"loss": 4.0457,
"step": 3160
},
{
"epoch": 0.73,
"grad_norm": 51.14718657604795,
"learning_rate": 4.228929690276381e-06,
"loss": 4.0961,
"step": 3165
},
{
"epoch": 0.73,
"grad_norm": 43.71547478412355,
"learning_rate": 4.196267753889864e-06,
"loss": 4.1202,
"step": 3170
},
{
"epoch": 0.73,
"grad_norm": 24.62288935078393,
"learning_rate": 4.163698913576592e-06,
"loss": 4.1129,
"step": 3175
},
{
"epoch": 0.73,
"grad_norm": 20.18023214978946,
"learning_rate": 4.131223691764384e-06,
"loss": 4.0219,
"step": 3180
},
{
"epoch": 0.73,
"grad_norm": 18.01338344676861,
"learning_rate": 4.098842609379339e-06,
"loss": 4.1014,
"step": 3185
},
{
"epoch": 0.73,
"grad_norm": 27.60045755810515,
"learning_rate": 4.066556185837494e-06,
"loss": 4.1146,
"step": 3190
},
{
"epoch": 0.73,
"grad_norm": 34.42048003123422,
"learning_rate": 4.0343649390365e-06,
"loss": 4.0762,
"step": 3195
},
{
"epoch": 0.73,
"grad_norm": 20.689902728976875,
"learning_rate": 4.002269385347289e-06,
"loss": 4.0448,
"step": 3200
},
{
"epoch": 0.74,
"grad_norm": 18.015958502412772,
"learning_rate": 3.970270039605818e-06,
"loss": 4.0524,
"step": 3205
},
{
"epoch": 0.74,
"grad_norm": 61.6572445957151,
"learning_rate": 3.9383674151047936e-06,
"loss": 4.0754,
"step": 3210
},
{
"epoch": 0.74,
"grad_norm": 58.461465621421034,
"learning_rate": 3.906562023585442e-06,
"loss": 4.051,
"step": 3215
},
{
"epoch": 0.74,
"grad_norm": 31.812316184769323,
"learning_rate": 3.8748543752293e-06,
"loss": 4.0391,
"step": 3220
},
{
"epoch": 0.74,
"grad_norm": 62.678768499001514,
"learning_rate": 3.843244978650045e-06,
"loss": 4.0376,
"step": 3225
},
{
"epoch": 0.74,
"grad_norm": 28.498015835842963,
"learning_rate": 3.8117343408853124e-06,
"loss": 4.1165,
"step": 3230
},
{
"epoch": 0.74,
"grad_norm": 35.579180059381116,
"learning_rate": 3.780322967388577e-06,
"loss": 4.0979,
"step": 3235
},
{
"epoch": 0.74,
"grad_norm": 43.80592325623231,
"learning_rate": 3.7490113620210487e-06,
"loss": 3.9952,
"step": 3240
},
{
"epoch": 0.74,
"grad_norm": 69.85816894896105,
"learning_rate": 3.7178000270435765e-06,
"loss": 3.9794,
"step": 3245
},
{
"epoch": 0.75,
"grad_norm": 83.09539466736378,
"learning_rate": 3.686689463108608e-06,
"loss": 4.0066,
"step": 3250
},
{
"epoch": 0.75,
"grad_norm": 29.653561320118907,
"learning_rate": 3.6556801692521426e-06,
"loss": 4.0893,
"step": 3255
},
{
"epoch": 0.75,
"grad_norm": 44.601159546521934,
"learning_rate": 3.6247726428857344e-06,
"loss": 3.9974,
"step": 3260
},
{
"epoch": 0.75,
"grad_norm": 32.63133900722214,
"learning_rate": 3.593967379788522e-06,
"loss": 4.0271,
"step": 3265
},
{
"epoch": 0.75,
"grad_norm": 26.804136313740308,
"learning_rate": 3.563264874099258e-06,
"loss": 4.0592,
"step": 3270
},
{
"epoch": 0.75,
"grad_norm": 57.97164352032171,
"learning_rate": 3.532665618308395e-06,
"loss": 3.9575,
"step": 3275
},
{
"epoch": 0.75,
"grad_norm": 30.365309058990356,
"learning_rate": 3.5021701032501777e-06,
"loss": 3.943,
"step": 3280
},
{
"epoch": 0.75,
"grad_norm": 19.20476555535661,
"learning_rate": 3.4717788180947855e-06,
"loss": 4.0183,
"step": 3285
},
{
"epoch": 0.75,
"grad_norm": 26.969291231079545,
"learning_rate": 3.441492250340461e-06,
"loss": 3.943,
"step": 3290
},
{
"epoch": 0.76,
"grad_norm": 53.27848011595771,
"learning_rate": 3.4113108858057175e-06,
"loss": 3.9395,
"step": 3295
},
{
"epoch": 0.76,
"grad_norm": 23.697016529967343,
"learning_rate": 3.3812352086215216e-06,
"loss": 3.9381,
"step": 3300
},
{
"epoch": 0.76,
"grad_norm": 23.821110733096624,
"learning_rate": 3.3512657012235396e-06,
"loss": 3.9144,
"step": 3305
},
{
"epoch": 0.76,
"grad_norm": 14.6960350856719,
"learning_rate": 3.3214028443444034e-06,
"loss": 3.9815,
"step": 3310
},
{
"epoch": 0.76,
"grad_norm": 38.22586864203478,
"learning_rate": 3.2916471170059895e-06,
"loss": 4.0093,
"step": 3315
},
{
"epoch": 0.76,
"grad_norm": 51.93090441245013,
"learning_rate": 3.261998996511736e-06,
"loss": 3.971,
"step": 3320
},
{
"epoch": 0.76,
"grad_norm": 21.215271536556212,
"learning_rate": 3.232458958438992e-06,
"loss": 3.9256,
"step": 3325
},
{
"epoch": 0.76,
"grad_norm": 27.686900367908216,
"learning_rate": 3.203027476631386e-06,
"loss": 3.9097,
"step": 3330
},
{
"epoch": 0.77,
"grad_norm": 22.1101543095489,
"learning_rate": 3.1737050231912324e-06,
"loss": 4.0827,
"step": 3335
},
{
"epoch": 0.77,
"grad_norm": 21.295283181859492,
"learning_rate": 3.1444920684719394e-06,
"loss": 3.896,
"step": 3340
},
{
"epoch": 0.77,
"grad_norm": 21.99467485644529,
"learning_rate": 3.115389081070481e-06,
"loss": 3.9685,
"step": 3345
},
{
"epoch": 0.77,
"grad_norm": 46.127703111002745,
"learning_rate": 3.086396527819876e-06,
"loss": 3.9347,
"step": 3350
},
{
"epoch": 0.77,
"grad_norm": 65.73981490894823,
"learning_rate": 3.057514873781703e-06,
"loss": 3.992,
"step": 3355
},
{
"epoch": 0.77,
"grad_norm": 47.02561208426134,
"learning_rate": 3.028744582238633e-06,
"loss": 3.9291,
"step": 3360
},
{
"epoch": 0.77,
"grad_norm": 37.63324176122822,
"learning_rate": 3.0000861146869963e-06,
"loss": 3.9341,
"step": 3365
},
{
"epoch": 0.77,
"grad_norm": 35.919928715936734,
"learning_rate": 2.9715399308294003e-06,
"loss": 3.9403,
"step": 3370
},
{
"epoch": 0.77,
"grad_norm": 26.76480814686508,
"learning_rate": 2.9431064885673245e-06,
"loss": 3.9465,
"step": 3375
},
{
"epoch": 0.78,
"grad_norm": 29.416416160949314,
"learning_rate": 2.914786243993808e-06,
"loss": 3.8873,
"step": 3380
},
{
"epoch": 0.78,
"grad_norm": 37.14000936405318,
"learning_rate": 2.8865796513860933e-06,
"loss": 3.8889,
"step": 3385
},
{
"epoch": 0.78,
"grad_norm": 29.815072807879385,
"learning_rate": 2.858487163198389e-06,
"loss": 3.9574,
"step": 3390
},
{
"epoch": 0.78,
"grad_norm": 62.26541335752987,
"learning_rate": 2.8305092300545668e-06,
"loss": 3.9163,
"step": 3395
},
{
"epoch": 0.78,
"grad_norm": 58.06457655612948,
"learning_rate": 2.8026463007409665e-06,
"loss": 3.8697,
"step": 3400
},
{
"epoch": 0.78,
"grad_norm": 45.73491570077404,
"learning_rate": 2.7748988221991722e-06,
"loss": 3.9373,
"step": 3405
},
{
"epoch": 0.78,
"grad_norm": 36.275458403222174,
"learning_rate": 2.747267239518857e-06,
"loss": 3.9232,
"step": 3410
},
{
"epoch": 0.78,
"grad_norm": 22.988083070741016,
"learning_rate": 2.719751995930645e-06,
"loss": 3.9188,
"step": 3415
},
{
"epoch": 0.78,
"grad_norm": 22.974384854653206,
"learning_rate": 2.6923535327989925e-06,
"loss": 3.8638,
"step": 3420
},
{
"epoch": 0.79,
"grad_norm": 45.882590739178596,
"learning_rate": 2.6650722896151126e-06,
"loss": 3.8769,
"step": 3425
},
{
"epoch": 0.79,
"grad_norm": 40.954221331076866,
"learning_rate": 2.637908703989924e-06,
"loss": 3.9264,
"step": 3430
},
{
"epoch": 0.79,
"grad_norm": 26.599677518965485,
"learning_rate": 2.610863211647038e-06,
"loss": 3.9088,
"step": 3435
},
{
"epoch": 0.79,
"grad_norm": 35.47565296693497,
"learning_rate": 2.5839362464157635e-06,
"loss": 3.8627,
"step": 3440
},
{
"epoch": 0.79,
"grad_norm": 41.40869117005486,
"learning_rate": 2.5571282402241435e-06,
"loss": 3.9094,
"step": 3445
},
{
"epoch": 0.79,
"grad_norm": 68.17036804468498,
"learning_rate": 2.5304396230920346e-06,
"loss": 3.8402,
"step": 3450
},
{
"epoch": 0.79,
"grad_norm": 83.47999334447974,
"learning_rate": 2.5038708231242047e-06,
"loss": 3.9403,
"step": 3455
},
{
"epoch": 0.79,
"grad_norm": 77.05079977066599,
"learning_rate": 2.477422266503473e-06,
"loss": 3.9137,
"step": 3460
},
{
"epoch": 0.8,
"grad_norm": 51.46036104014942,
"learning_rate": 2.4510943774838624e-06,
"loss": 3.8816,
"step": 3465
},
{
"epoch": 0.8,
"grad_norm": 27.50749097944802,
"learning_rate": 2.424887578383799e-06,
"loss": 3.84,
"step": 3470
},
{
"epoch": 0.8,
"grad_norm": 41.66172111681471,
"learning_rate": 2.398802289579347e-06,
"loss": 3.7918,
"step": 3475
},
{
"epoch": 0.8,
"grad_norm": 80.68457553134964,
"learning_rate": 2.3728389294974472e-06,
"loss": 3.8675,
"step": 3480
},
{
"epoch": 0.8,
"grad_norm": 33.59208488462572,
"learning_rate": 2.346997914609226e-06,
"loss": 3.8922,
"step": 3485
},
{
"epoch": 0.8,
"grad_norm": 64.96350685792753,
"learning_rate": 2.3212796594232947e-06,
"loss": 3.9088,
"step": 3490
},
{
"epoch": 0.8,
"grad_norm": 20.84613398845108,
"learning_rate": 2.2956845764791126e-06,
"loss": 3.8694,
"step": 3495
},
{
"epoch": 0.8,
"grad_norm": 79.71883116991208,
"learning_rate": 2.2702130763403674e-06,
"loss": 3.8558,
"step": 3500
},
{
"epoch": 0.8,
"grad_norm": 16.048059898233294,
"learning_rate": 2.2448655675883936e-06,
"loss": 3.8667,
"step": 3505
},
{
"epoch": 0.81,
"grad_norm": 28.03725607393679,
"learning_rate": 2.2196424568156073e-06,
"loss": 3.8559,
"step": 3510
},
{
"epoch": 0.81,
"grad_norm": 18.840441075178965,
"learning_rate": 2.1945441486189913e-06,
"loss": 3.7797,
"step": 3515
},
{
"epoch": 0.81,
"grad_norm": 40.18702021213058,
"learning_rate": 2.1695710455936115e-06,
"loss": 3.8923,
"step": 3520
},
{
"epoch": 0.81,
"grad_norm": 21.072274094013498,
"learning_rate": 2.144723548326142e-06,
"loss": 3.8318,
"step": 3525
},
{
"epoch": 0.81,
"grad_norm": 34.134477250167194,
"learning_rate": 2.1200020553884603e-06,
"loss": 3.8564,
"step": 3530
},
{
"epoch": 0.81,
"grad_norm": 27.2459014612492,
"learning_rate": 2.095406963331236e-06,
"loss": 3.8176,
"step": 3535
},
{
"epoch": 0.81,
"grad_norm": 31.566520170408914,
"learning_rate": 2.0709386666775732e-06,
"loss": 3.8081,
"step": 3540
},
{
"epoch": 0.81,
"grad_norm": 26.095568886047694,
"learning_rate": 2.0465975579166984e-06,
"loss": 3.8181,
"step": 3545
},
{
"epoch": 0.81,
"grad_norm": 38.14381147775237,
"learning_rate": 2.0223840274976413e-06,
"loss": 3.8871,
"step": 3550
},
{
"epoch": 0.82,
"grad_norm": 21.22373392273956,
"learning_rate": 1.998298463822986e-06,
"loss": 3.8263,
"step": 3555
},
{
"epoch": 0.82,
"grad_norm": 12.56697575734541,
"learning_rate": 1.9743412532426355e-06,
"loss": 3.7559,
"step": 3560
},
{
"epoch": 0.82,
"grad_norm": 29.10671316471521,
"learning_rate": 1.950512780047622e-06,
"loss": 3.8685,
"step": 3565
},
{
"epoch": 0.82,
"grad_norm": 32.741627262783176,
"learning_rate": 1.9268134264639273e-06,
"loss": 3.7997,
"step": 3570
},
{
"epoch": 0.82,
"grad_norm": 30.45945628820104,
"learning_rate": 1.9032435726463716e-06,
"loss": 3.8634,
"step": 3575
},
{
"epoch": 0.82,
"grad_norm": 22.91093812019858,
"learning_rate": 1.879803596672497e-06,
"loss": 3.8075,
"step": 3580
},
{
"epoch": 0.82,
"grad_norm": 47.862363303838954,
"learning_rate": 1.8564938745365102e-06,
"loss": 3.7731,
"step": 3585
},
{
"epoch": 0.82,
"grad_norm": 33.53396034332934,
"learning_rate": 1.8333147801432616e-06,
"loss": 3.8076,
"step": 3590
},
{
"epoch": 0.82,
"grad_norm": 42.040944658368346,
"learning_rate": 1.8102666853022277e-06,
"loss": 3.8322,
"step": 3595
},
{
"epoch": 0.83,
"grad_norm": 21.193540791343914,
"learning_rate": 1.7873499597215604e-06,
"loss": 3.8067,
"step": 3600
},
{
"epoch": 0.83,
"grad_norm": 44.81510993536675,
"learning_rate": 1.7645649710021528e-06,
"loss": 3.8462,
"step": 3605
},
{
"epoch": 0.83,
"grad_norm": 29.535086551021763,
"learning_rate": 1.7419120846317462e-06,
"loss": 3.8056,
"step": 3610
},
{
"epoch": 0.83,
"grad_norm": 25.498349063798265,
"learning_rate": 1.7193916639790665e-06,
"loss": 3.7899,
"step": 3615
},
{
"epoch": 0.83,
"grad_norm": 51.21765240200761,
"learning_rate": 1.697004070287982e-06,
"loss": 3.8017,
"step": 3620
},
{
"epoch": 0.83,
"grad_norm": 19.225579683734967,
"learning_rate": 1.6747496626717318e-06,
"loss": 3.7372,
"step": 3625
},
{
"epoch": 0.83,
"grad_norm": 12.71969214880765,
"learning_rate": 1.6526287981071477e-06,
"loss": 3.737,
"step": 3630
},
{
"epoch": 0.83,
"grad_norm": 44.04789051079506,
"learning_rate": 1.6306418314289408e-06,
"loss": 3.7432,
"step": 3635
},
{
"epoch": 0.84,
"grad_norm": 22.156761731139095,
"learning_rate": 1.6087891153239932e-06,
"loss": 3.7768,
"step": 3640
},
{
"epoch": 0.84,
"grad_norm": 15.43891391835237,
"learning_rate": 1.5870710003257162e-06,
"loss": 3.7451,
"step": 3645
},
{
"epoch": 0.84,
"grad_norm": 31.42896775673814,
"learning_rate": 1.5654878348084246e-06,
"loss": 3.7385,
"step": 3650
},
{
"epoch": 0.84,
"grad_norm": 27.228741759625965,
"learning_rate": 1.5440399649817384e-06,
"loss": 3.7595,
"step": 3655
},
{
"epoch": 0.84,
"grad_norm": 71.63638200049408,
"learning_rate": 1.5227277348850466e-06,
"loss": 3.7062,
"step": 3660
},
{
"epoch": 0.84,
"grad_norm": 26.887275059592724,
"learning_rate": 1.5015514863819625e-06,
"loss": 3.8185,
"step": 3665
},
{
"epoch": 0.84,
"grad_norm": 19.83325501228405,
"learning_rate": 1.4805115591548746e-06,
"loss": 3.8578,
"step": 3670
},
{
"epoch": 0.84,
"grad_norm": 34.539575677278755,
"learning_rate": 1.4596082906994658e-06,
"loss": 3.8065,
"step": 3675
},
{
"epoch": 0.84,
"grad_norm": 33.170185299027224,
"learning_rate": 1.4388420163193217e-06,
"loss": 3.7483,
"step": 3680
},
{
"epoch": 0.85,
"grad_norm": 27.730066097249708,
"learning_rate": 1.4182130691205399e-06,
"loss": 3.7441,
"step": 3685
},
{
"epoch": 0.85,
"grad_norm": 33.489727448755154,
"learning_rate": 1.3977217800063847e-06,
"loss": 3.798,
"step": 3690
},
{
"epoch": 0.85,
"grad_norm": 48.01255191546678,
"learning_rate": 1.3773684776719987e-06,
"loss": 3.7754,
"step": 3695
},
{
"epoch": 0.85,
"grad_norm": 41.97717842787009,
"learning_rate": 1.3571534885991044e-06,
"loss": 3.7466,
"step": 3700
},
{
"epoch": 0.85,
"grad_norm": 36.296648212146444,
"learning_rate": 1.337077137050784e-06,
"loss": 3.7657,
"step": 3705
},
{
"epoch": 0.85,
"grad_norm": 41.91557775464321,
"learning_rate": 1.3171397450662716e-06,
"loss": 3.7902,
"step": 3710
},
{
"epoch": 0.85,
"grad_norm": 73.28373291496773,
"learning_rate": 1.297341632455793e-06,
"loss": 3.7137,
"step": 3715
},
{
"epoch": 0.85,
"grad_norm": 27.703907254747342,
"learning_rate": 1.2776831167954252e-06,
"loss": 3.7574,
"step": 3720
},
{
"epoch": 0.85,
"grad_norm": 32.47665767602999,
"learning_rate": 1.258164513422019e-06,
"loss": 3.6842,
"step": 3725
},
{
"epoch": 0.86,
"grad_norm": 30.127478496239906,
"learning_rate": 1.2387861354281194e-06,
"loss": 3.7497,
"step": 3730
},
{
"epoch": 0.86,
"grad_norm": 30.31251538683249,
"learning_rate": 1.2195482936569603e-06,
"loss": 3.7801,
"step": 3735
},
{
"epoch": 0.86,
"grad_norm": 32.52496481302236,
"learning_rate": 1.2004512966974746e-06,
"loss": 3.7157,
"step": 3740
},
{
"epoch": 0.86,
"grad_norm": 14.156403859014825,
"learning_rate": 1.1814954508793397e-06,
"loss": 3.839,
"step": 3745
},
{
"epoch": 0.86,
"grad_norm": 37.50877570394944,
"learning_rate": 1.162681060268065e-06,
"loss": 3.6964,
"step": 3750
},
{
"epoch": 0.86,
"grad_norm": 19.32986922764744,
"learning_rate": 1.1440084266601148e-06,
"loss": 3.7188,
"step": 3755
},
{
"epoch": 0.86,
"grad_norm": 24.332267876030233,
"learning_rate": 1.1254778495780749e-06,
"loss": 3.7324,
"step": 3760
},
{
"epoch": 0.86,
"grad_norm": 34.29097555764843,
"learning_rate": 1.1070896262658381e-06,
"loss": 3.7136,
"step": 3765
},
{
"epoch": 0.87,
"grad_norm": 20.828700764112394,
"learning_rate": 1.0888440516838373e-06,
"loss": 3.7861,
"step": 3770
},
{
"epoch": 0.87,
"grad_norm": 16.25551955958299,
"learning_rate": 1.0707414185043163e-06,
"loss": 3.7257,
"step": 3775
},
{
"epoch": 0.87,
"grad_norm": 17.428505907748793,
"learning_rate": 1.0527820171066372e-06,
"loss": 3.7063,
"step": 3780
},
{
"epoch": 0.87,
"grad_norm": 16.776980287582877,
"learning_rate": 1.0349661355726215e-06,
"loss": 3.7172,
"step": 3785
},
{
"epoch": 0.87,
"grad_norm": 22.39618908121105,
"learning_rate": 1.0172940596819258e-06,
"loss": 3.7102,
"step": 3790
},
{
"epoch": 0.87,
"grad_norm": 29.720064640396235,
"learning_rate": 9.997660729074587e-07,
"loss": 3.7362,
"step": 3795
},
{
"epoch": 0.87,
"grad_norm": 12.610115583045804,
"learning_rate": 9.823824564108408e-07,
"loss": 3.7097,
"step": 3800
},
{
"epoch": 0.87,
"grad_norm": 15.909574598713629,
"learning_rate": 9.651434890378797e-07,
"loss": 3.6483,
"step": 3805
},
{
"epoch": 0.87,
"grad_norm": 12.590177297776139,
"learning_rate": 9.480494473141189e-07,
"loss": 3.755,
"step": 3810
},
{
"epoch": 0.88,
"grad_norm": 34.813242896296885,
"learning_rate": 9.311006054403726e-07,
"loss": 3.7565,
"step": 3815
},
{
"epoch": 0.88,
"grad_norm": 25.00551994408005,
"learning_rate": 9.142972352883595e-07,
"loss": 3.7124,
"step": 3820
},
{
"epoch": 0.88,
"grad_norm": 27.98697623414369,
"learning_rate": 8.976396063963156e-07,
"loss": 3.7042,
"step": 3825
},
{
"epoch": 0.88,
"grad_norm": 17.034734259958352,
"learning_rate": 8.811279859646915e-07,
"loss": 3.7073,
"step": 3830
},
{
"epoch": 0.88,
"grad_norm": 13.422751386569267,
"learning_rate": 8.647626388518471e-07,
"loss": 3.7712,
"step": 3835
},
{
"epoch": 0.88,
"grad_norm": 24.8158518349583,
"learning_rate": 8.485438275698154e-07,
"loss": 3.7182,
"step": 3840
},
{
"epoch": 0.88,
"grad_norm": 18.715838846810584,
"learning_rate": 8.324718122800912e-07,
"loss": 3.6951,
"step": 3845
},
{
"epoch": 0.88,
"grad_norm": 13.452940566527365,
"learning_rate": 8.165468507894514e-07,
"loss": 3.6549,
"step": 3850
},
{
"epoch": 0.88,
"grad_norm": 13.545934881206449,
"learning_rate": 8.007691985458277e-07,
"loss": 3.6982,
"step": 3855
},
{
"epoch": 0.89,
"grad_norm": 14.27044438801209,
"learning_rate": 7.851391086341953e-07,
"loss": 3.7319,
"step": 3860
},
{
"epoch": 0.89,
"grad_norm": 26.361556662611267,
"learning_rate": 7.696568317725339e-07,
"loss": 3.6546,
"step": 3865
},
{
"epoch": 0.89,
"grad_norm": 20.180688580230548,
"learning_rate": 7.543226163077899e-07,
"loss": 3.6958,
"step": 3870
},
{
"epoch": 0.89,
"grad_norm": 19.613411785549815,
"learning_rate": 7.391367082118961e-07,
"loss": 3.7838,
"step": 3875
},
{
"epoch": 0.89,
"grad_norm": 11.201677788887183,
"learning_rate": 7.240993510778304e-07,
"loss": 3.7625,
"step": 3880
},
{
"epoch": 0.89,
"grad_norm": 18.496564500858582,
"learning_rate": 7.092107861157004e-07,
"loss": 3.6805,
"step": 3885
},
{
"epoch": 0.89,
"grad_norm": 13.038218490522087,
"learning_rate": 6.944712521488884e-07,
"loss": 3.7393,
"step": 3890
},
{
"epoch": 0.89,
"grad_norm": 27.280200290755396,
"learning_rate": 6.798809856102028e-07,
"loss": 3.7157,
"step": 3895
},
{
"epoch": 0.89,
"grad_norm": 15.2881947610183,
"learning_rate": 6.654402205380961e-07,
"loss": 3.6811,
"step": 3900
},
{
"epoch": 0.9,
"grad_norm": 11.770606575689413,
"learning_rate": 6.511491885729149e-07,
"loss": 3.7428,
"step": 3905
},
{
"epoch": 0.9,
"grad_norm": 22.301488201013317,
"learning_rate": 6.370081189531707e-07,
"loss": 3.6475,
"step": 3910
},
{
"epoch": 0.9,
"grad_norm": 21.077284580886506,
"learning_rate": 6.230172385118738e-07,
"loss": 3.6893,
"step": 3915
},
{
"epoch": 0.9,
"grad_norm": 15.076688760938024,
"learning_rate": 6.091767716728924e-07,
"loss": 3.5956,
"step": 3920
},
{
"epoch": 0.9,
"grad_norm": 19.018811518390564,
"learning_rate": 5.954869404473473e-07,
"loss": 3.691,
"step": 3925
},
{
"epoch": 0.9,
"grad_norm": 20.79504311040266,
"learning_rate": 5.819479644300563e-07,
"loss": 3.6939,
"step": 3930
},
{
"epoch": 0.9,
"grad_norm": 14.766741254863161,
"learning_rate": 5.685600607960129e-07,
"loss": 3.5967,
"step": 3935
},
{
"epoch": 0.9,
"grad_norm": 21.241474366469944,
"learning_rate": 5.553234442969014e-07,
"loss": 3.6332,
"step": 3940
},
{
"epoch": 0.91,
"grad_norm": 16.355235705781315,
"learning_rate": 5.422383272576426e-07,
"loss": 3.7295,
"step": 3945
},
{
"epoch": 0.91,
"grad_norm": 16.264682212634607,
"learning_rate": 5.293049195730038e-07,
"loss": 3.6247,
"step": 3950
},
{
"epoch": 0.91,
"grad_norm": 12.47936237691352,
"learning_rate": 5.165234287042198e-07,
"loss": 3.6133,
"step": 3955
},
{
"epoch": 0.91,
"grad_norm": 13.306179294534777,
"learning_rate": 5.038940596756747e-07,
"loss": 3.6881,
"step": 3960
},
{
"epoch": 0.91,
"grad_norm": 16.391206536288802,
"learning_rate": 4.914170150716024e-07,
"loss": 3.6579,
"step": 3965
},
{
"epoch": 0.91,
"grad_norm": 14.242791211418306,
"learning_rate": 4.790924950328435e-07,
"loss": 3.631,
"step": 3970
},
{
"epoch": 0.91,
"grad_norm": 24.849350152016854,
"learning_rate": 4.6692069725363887e-07,
"loss": 3.6937,
"step": 3975
},
{
"epoch": 0.91,
"grad_norm": 21.64209756625074,
"learning_rate": 4.5490181697844916e-07,
"loss": 3.6635,
"step": 3980
},
{
"epoch": 0.91,
"grad_norm": 11.723108661682744,
"learning_rate": 4.4303604699882594e-07,
"loss": 3.6442,
"step": 3985
},
{
"epoch": 0.92,
"grad_norm": 23.715955779604574,
"learning_rate": 4.313235776503244e-07,
"loss": 3.7092,
"step": 3990
},
{
"epoch": 0.92,
"grad_norm": 26.33500590884361,
"learning_rate": 4.197645968094466e-07,
"loss": 3.7199,
"step": 3995
},
{
"epoch": 0.92,
"grad_norm": 15.97634043977573,
"learning_rate": 4.08359289890623e-07,
"loss": 3.7013,
"step": 4000
},
{
"epoch": 0.92,
"grad_norm": 16.249998954911213,
"learning_rate": 3.971078398432482e-07,
"loss": 3.692,
"step": 4005
},
{
"epoch": 0.92,
"grad_norm": 12.650307490766737,
"learning_rate": 3.860104271487397e-07,
"loss": 3.7514,
"step": 4010
},
{
"epoch": 0.92,
"grad_norm": 20.944524374009152,
"learning_rate": 3.750672298176405e-07,
"loss": 3.6776,
"step": 4015
},
{
"epoch": 0.92,
"grad_norm": 31.837250069023384,
"learning_rate": 3.6427842338677353e-07,
"loss": 3.6802,
"step": 4020
},
{
"epoch": 0.92,
"grad_norm": 35.16277225180415,
"learning_rate": 3.5364418091641374e-07,
"loss": 3.6035,
"step": 4025
},
{
"epoch": 0.92,
"grad_norm": 35.67667244362796,
"learning_rate": 3.4316467298752264e-07,
"loss": 3.6372,
"step": 4030
},
{
"epoch": 0.93,
"grad_norm": 17.219392618044115,
"learning_rate": 3.328400676990029e-07,
"loss": 3.6292,
"step": 4035
},
{
"epoch": 0.93,
"grad_norm": 10.04557723669283,
"learning_rate": 3.226705306650113e-07,
"loss": 3.72,
"step": 4040
},
{
"epoch": 0.93,
"grad_norm": 21.846859098930196,
"learning_rate": 3.1265622501229554e-07,
"loss": 3.6557,
"step": 4045
},
{
"epoch": 0.93,
"grad_norm": 17.605374506200285,
"learning_rate": 3.027973113775795e-07,
"loss": 3.6747,
"step": 4050
},
{
"epoch": 0.93,
"grad_norm": 25.49080172625827,
"learning_rate": 2.9309394790498547e-07,
"loss": 3.7104,
"step": 4055
},
{
"epoch": 0.93,
"grad_norm": 12.882615183890971,
"learning_rate": 2.835462902434971e-07,
"loss": 3.674,
"step": 4060
},
{
"epoch": 0.93,
"grad_norm": 20.504280922780172,
"learning_rate": 2.741544915444694e-07,
"loss": 3.6457,
"step": 4065
},
{
"epoch": 0.93,
"grad_norm": 16.681593532660717,
"learning_rate": 2.649187024591604e-07,
"loss": 3.6835,
"step": 4070
},
{
"epoch": 0.94,
"grad_norm": 12.650054676447523,
"learning_rate": 2.5583907113632456e-07,
"loss": 3.647,
"step": 4075
},
{
"epoch": 0.94,
"grad_norm": 17.534906906242455,
"learning_rate": 2.4691574321983216e-07,
"loss": 3.6579,
"step": 4080
},
{
"epoch": 0.94,
"grad_norm": 19.926506010778407,
"learning_rate": 2.3814886184633012e-07,
"loss": 3.6499,
"step": 4085
},
{
"epoch": 0.94,
"grad_norm": 12.234267069451622,
"learning_rate": 2.2953856764295623e-07,
"loss": 3.6078,
"step": 4090
},
{
"epoch": 0.94,
"grad_norm": 8.223939533474807,
"learning_rate": 2.210849987250685e-07,
"loss": 3.6654,
"step": 4095
},
{
"epoch": 0.94,
"grad_norm": 18.599130278136133,
"learning_rate": 2.1278829069404483e-07,
"loss": 3.6817,
"step": 4100
},
{
"epoch": 0.94,
"grad_norm": 16.196978860217815,
"learning_rate": 2.0464857663509473e-07,
"loss": 3.6475,
"step": 4105
},
{
"epoch": 0.94,
"grad_norm": 13.396466803933027,
"learning_rate": 1.9666598711513663e-07,
"loss": 3.6074,
"step": 4110
},
{
"epoch": 0.94,
"grad_norm": 14.768338009628959,
"learning_rate": 1.8884065018069165e-07,
"loss": 3.6512,
"step": 4115
},
{
"epoch": 0.95,
"grad_norm": 21.524152342417754,
"learning_rate": 1.811726913558387e-07,
"loss": 3.7483,
"step": 4120
},
{
"epoch": 0.95,
"grad_norm": 18.22167319217679,
"learning_rate": 1.736622336401983e-07,
"loss": 3.7415,
"step": 4125
},
{
"epoch": 0.95,
"grad_norm": 19.595031034548562,
"learning_rate": 1.663093975069552e-07,
"loss": 3.6581,
"step": 4130
},
{
"epoch": 0.95,
"grad_norm": 14.772246875655348,
"learning_rate": 1.5911430090093437e-07,
"loss": 3.6186,
"step": 4135
},
{
"epoch": 0.95,
"grad_norm": 14.004789266507018,
"learning_rate": 1.5207705923670158e-07,
"loss": 3.6816,
"step": 4140
},
{
"epoch": 0.95,
"grad_norm": 17.056919214526435,
"learning_rate": 1.451977853967146e-07,
"loss": 3.6623,
"step": 4145
},
{
"epoch": 0.95,
"grad_norm": 11.302137776127884,
"learning_rate": 1.3847658972951482e-07,
"loss": 3.5906,
"step": 4150
},
{
"epoch": 0.95,
"grad_norm": 12.07905744766456,
"learning_rate": 1.319135800479543e-07,
"loss": 3.5944,
"step": 4155
},
{
"epoch": 0.95,
"grad_norm": 18.674654546847137,
"learning_rate": 1.2550886162746468e-07,
"loss": 3.6017,
"step": 4160
},
{
"epoch": 0.96,
"grad_norm": 11.839458481793278,
"learning_rate": 1.192625372043754e-07,
"loss": 3.6178,
"step": 4165
},
{
"epoch": 0.96,
"grad_norm": 19.786389992269886,
"learning_rate": 1.1317470697425837e-07,
"loss": 3.6542,
"step": 4170
},
{
"epoch": 0.96,
"grad_norm": 11.174068584947278,
"learning_rate": 1.072454685903257e-07,
"loss": 3.733,
"step": 4175
},
{
"epoch": 0.96,
"grad_norm": 24.21761073466553,
"learning_rate": 1.0147491716185675e-07,
"loss": 3.6381,
"step": 4180
},
{
"epoch": 0.96,
"grad_norm": 19.459674614347303,
"learning_rate": 9.586314525268369e-08,
"loss": 3.6084,
"step": 4185
},
{
"epoch": 0.96,
"grad_norm": 15.59530798472988,
"learning_rate": 9.041024287969491e-08,
"loss": 3.6231,
"step": 4190
},
{
"epoch": 0.96,
"grad_norm": 30.42366766942627,
"learning_rate": 8.511629751139949e-08,
"loss": 3.6688,
"step": 4195
},
{
"epoch": 0.96,
"grad_norm": 9.11994003002298,
"learning_rate": 7.99813940665195e-08,
"loss": 3.681,
"step": 4200
},
{
"epoch": 0.96,
"grad_norm": 29.254431985701988,
"learning_rate": 7.50056149126277e-08,
"loss": 3.6489,
"step": 4205
},
{
"epoch": 0.97,
"grad_norm": 8.244989458828204,
"learning_rate": 7.018903986483083e-08,
"loss": 3.6852,
"step": 4210
},
{
"epoch": 0.97,
"grad_norm": 23.642383946399335,
"learning_rate": 6.553174618448399e-08,
"loss": 3.6476,
"step": 4215
},
{
"epoch": 0.97,
"grad_norm": 11.497305087171618,
"learning_rate": 6.103380857795604e-08,
"loss": 3.6077,
"step": 4220
},
{
"epoch": 0.97,
"grad_norm": 11.260541601085492,
"learning_rate": 5.6695299195425045e-08,
"loss": 3.6514,
"step": 4225
},
{
"epoch": 0.97,
"grad_norm": 15.021990993208474,
"learning_rate": 5.251628762972916e-08,
"loss": 3.6486,
"step": 4230
},
{
"epoch": 0.97,
"grad_norm": 11.79501214076045,
"learning_rate": 4.84968409152442e-08,
"loss": 3.6583,
"step": 4235
},
{
"epoch": 0.97,
"grad_norm": 11.469889869893892,
"learning_rate": 4.4637023526807875e-08,
"loss": 3.6266,
"step": 4240
},
{
"epoch": 0.97,
"grad_norm": 10.951279521137277,
"learning_rate": 4.0936897378691664e-08,
"loss": 3.6709,
"step": 4245
},
{
"epoch": 0.98,
"grad_norm": 16.923113614818572,
"learning_rate": 3.739652182360054e-08,
"loss": 3.6802,
"step": 4250
},
{
"epoch": 0.98,
"grad_norm": 12.114560682787932,
"learning_rate": 3.401595365172483e-08,
"loss": 3.6402,
"step": 4255
},
{
"epoch": 0.98,
"grad_norm": 9.182946295232345,
"learning_rate": 3.079524708983095e-08,
"loss": 3.6225,
"step": 4260
},
{
"epoch": 0.98,
"grad_norm": 10.451056436364329,
"learning_rate": 2.773445380038653e-08,
"loss": 3.6414,
"step": 4265
},
{
"epoch": 0.98,
"grad_norm": 8.236622614247617,
"learning_rate": 2.483362288073443e-08,
"loss": 3.6163,
"step": 4270
},
{
"epoch": 0.98,
"grad_norm": 14.14954738204664,
"learning_rate": 2.2092800862305587e-08,
"loss": 3.6195,
"step": 4275
},
{
"epoch": 0.98,
"grad_norm": 21.05844392360743,
"learning_rate": 1.9512031709874037e-08,
"loss": 3.6474,
"step": 4280
},
{
"epoch": 0.98,
"grad_norm": 9.31164701024037,
"learning_rate": 1.7091356820848616e-08,
"loss": 3.6775,
"step": 4285
},
{
"epoch": 0.98,
"grad_norm": 10.110842718868811,
"learning_rate": 1.4830815024606815e-08,
"loss": 3.618,
"step": 4290
},
{
"epoch": 0.99,
"grad_norm": 21.53619047566387,
"learning_rate": 1.2730442581879721e-08,
"loss": 3.6245,
"step": 4295
},
{
"epoch": 0.99,
"grad_norm": 13.23611241300099,
"learning_rate": 1.0790273184164701e-08,
"loss": 3.6271,
"step": 4300
},
{
"epoch": 0.99,
"grad_norm": 15.48506813893137,
"learning_rate": 9.010337953185843e-09,
"loss": 3.6317,
"step": 4305
},
{
"epoch": 0.99,
"grad_norm": 12.562935111145112,
"learning_rate": 7.390665440393241e-09,
"loss": 3.6198,
"step": 4310
},
{
"epoch": 0.99,
"grad_norm": 12.689542859801007,
"learning_rate": 5.931281626508911e-09,
"loss": 3.6293,
"step": 4315
},
{
"epoch": 0.99,
"grad_norm": 13.307479835934826,
"learning_rate": 4.632209921107133e-09,
"loss": 3.6791,
"step": 4320
},
{
"epoch": 0.99,
"grad_norm": 15.251068214534937,
"learning_rate": 3.493471162241413e-09,
"loss": 3.6444,
"step": 4325
},
{
"epoch": 0.99,
"grad_norm": 12.13951542897477,
"learning_rate": 2.5150836161058624e-09,
"loss": 3.5564,
"step": 4330
},
{
"epoch": 0.99,
"grad_norm": 9.08622318333974,
"learning_rate": 1.6970629767465441e-09,
"loss": 3.5891,
"step": 4335
},
{
"epoch": 1.0,
"grad_norm": 11.684988146759082,
"learning_rate": 1.03942236580723e-09,
"loss": 3.6092,
"step": 4340
},
{
"epoch": 1.0,
"grad_norm": 17.508480063342134,
"learning_rate": 5.421723323195682e-10,
"loss": 3.591,
"step": 4345
},
{
"epoch": 1.0,
"grad_norm": 19.286758978873294,
"learning_rate": 2.053208525365502e-10,
"loss": 3.6626,
"step": 4350
},
{
"epoch": 1.0,
"grad_norm": 11.364851389553667,
"learning_rate": 2.8873329798173588e-11,
"loss": 3.614,
"step": 4355
},
{
"epoch": 1.0,
"eval_loss": 3.6477067470550537,
"eval_runtime": 315.4083,
"eval_samples_per_second": 48.924,
"eval_steps_per_second": 0.767,
"step": 4358
},
{
"epoch": 1.0,
"step": 4358,
"total_flos": 456238269726720.0,
"train_loss": 4.517249699085335,
"train_runtime": 13676.9113,
"train_samples_per_second": 10.194,
"train_steps_per_second": 0.319
}
],
"logging_steps": 5,
"max_steps": 4358,
"num_input_tokens_seen": 0,
"num_train_epochs": 1,
"save_steps": 100,
"total_flos": 456238269726720.0,
"train_batch_size": 4,
"trial_name": null,
"trial_params": null
}