llama3-poison-50p / trainer_state.json
terry69's picture
Model save
fb70aa6 verified
{
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 1.0,
"eval_steps": 500,
"global_step": 3248,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.0,
"grad_norm": 0.2709156318785633,
"learning_rate": 6.153846153846154e-07,
"loss": 0.9934,
"step": 1
},
{
"epoch": 0.0,
"grad_norm": 0.4419044510206839,
"learning_rate": 3.0769230769230774e-06,
"loss": 1.0584,
"step": 5
},
{
"epoch": 0.0,
"grad_norm": 0.5233721395581817,
"learning_rate": 6.153846153846155e-06,
"loss": 1.1615,
"step": 10
},
{
"epoch": 0.0,
"grad_norm": 0.6649670185877594,
"learning_rate": 9.230769230769232e-06,
"loss": 1.1631,
"step": 15
},
{
"epoch": 0.01,
"grad_norm": 0.4305302514113907,
"learning_rate": 1.230769230769231e-05,
"loss": 1.2207,
"step": 20
},
{
"epoch": 0.01,
"grad_norm": 0.5755996114609079,
"learning_rate": 1.5384615384615387e-05,
"loss": 0.9927,
"step": 25
},
{
"epoch": 0.01,
"grad_norm": 0.20368238004024766,
"learning_rate": 1.8461538461538465e-05,
"loss": 0.9844,
"step": 30
},
{
"epoch": 0.01,
"grad_norm": 0.24375178341672513,
"learning_rate": 2.1538461538461542e-05,
"loss": 0.9985,
"step": 35
},
{
"epoch": 0.01,
"grad_norm": 0.38432351721666425,
"learning_rate": 2.461538461538462e-05,
"loss": 1.0881,
"step": 40
},
{
"epoch": 0.01,
"grad_norm": 0.8927024097300557,
"learning_rate": 2.7692307692307694e-05,
"loss": 1.0362,
"step": 45
},
{
"epoch": 0.02,
"grad_norm": 0.24949485533837065,
"learning_rate": 3.0769230769230774e-05,
"loss": 0.9051,
"step": 50
},
{
"epoch": 0.02,
"grad_norm": 0.3003048201124618,
"learning_rate": 3.384615384615385e-05,
"loss": 0.9026,
"step": 55
},
{
"epoch": 0.02,
"grad_norm": 0.3884135265695224,
"learning_rate": 3.692307692307693e-05,
"loss": 0.8955,
"step": 60
},
{
"epoch": 0.02,
"grad_norm": 0.5578919802311338,
"learning_rate": 4e-05,
"loss": 0.7901,
"step": 65
},
{
"epoch": 0.02,
"grad_norm": 0.2289752603512456,
"learning_rate": 4.3076923076923084e-05,
"loss": 0.8754,
"step": 70
},
{
"epoch": 0.02,
"grad_norm": 0.459950526314893,
"learning_rate": 4.615384615384616e-05,
"loss": 0.7449,
"step": 75
},
{
"epoch": 0.02,
"grad_norm": 0.3061061622913128,
"learning_rate": 4.923076923076924e-05,
"loss": 0.8067,
"step": 80
},
{
"epoch": 0.03,
"grad_norm": 0.26485910183284767,
"learning_rate": 5.230769230769231e-05,
"loss": 0.8737,
"step": 85
},
{
"epoch": 0.03,
"grad_norm": 0.21605949900797208,
"learning_rate": 5.538461538461539e-05,
"loss": 0.8169,
"step": 90
},
{
"epoch": 0.03,
"grad_norm": 0.2286110237629433,
"learning_rate": 5.846153846153847e-05,
"loss": 0.8132,
"step": 95
},
{
"epoch": 0.03,
"grad_norm": 0.24556746259970036,
"learning_rate": 6.153846153846155e-05,
"loss": 0.8198,
"step": 100
},
{
"epoch": 0.03,
"grad_norm": 0.2948151232563899,
"learning_rate": 6.461538461538462e-05,
"loss": 0.8163,
"step": 105
},
{
"epoch": 0.03,
"grad_norm": 0.2659100532190652,
"learning_rate": 6.76923076923077e-05,
"loss": 0.8979,
"step": 110
},
{
"epoch": 0.04,
"grad_norm": 0.24575597154935636,
"learning_rate": 7.076923076923078e-05,
"loss": 0.8273,
"step": 115
},
{
"epoch": 0.04,
"grad_norm": 0.3093299661054237,
"learning_rate": 7.384615384615386e-05,
"loss": 0.8091,
"step": 120
},
{
"epoch": 0.04,
"grad_norm": 0.2908225328310054,
"learning_rate": 7.692307692307693e-05,
"loss": 0.8313,
"step": 125
},
{
"epoch": 0.04,
"grad_norm": 0.3020091724093868,
"learning_rate": 8e-05,
"loss": 0.8358,
"step": 130
},
{
"epoch": 0.04,
"grad_norm": 0.2694408106299054,
"learning_rate": 8.307692307692309e-05,
"loss": 0.8349,
"step": 135
},
{
"epoch": 0.04,
"grad_norm": 0.3000515761359836,
"learning_rate": 8.615384615384617e-05,
"loss": 0.8707,
"step": 140
},
{
"epoch": 0.04,
"grad_norm": 0.3079582972721868,
"learning_rate": 8.923076923076924e-05,
"loss": 0.8617,
"step": 145
},
{
"epoch": 0.05,
"grad_norm": 0.3795146820147972,
"learning_rate": 9.230769230769232e-05,
"loss": 0.7768,
"step": 150
},
{
"epoch": 0.05,
"grad_norm": 0.4775337525844143,
"learning_rate": 9.53846153846154e-05,
"loss": 0.8528,
"step": 155
},
{
"epoch": 0.05,
"grad_norm": 0.27382955396233616,
"learning_rate": 9.846153846153848e-05,
"loss": 0.9321,
"step": 160
},
{
"epoch": 0.05,
"grad_norm": 0.3493866683153211,
"learning_rate": 0.00010153846153846153,
"loss": 0.8745,
"step": 165
},
{
"epoch": 0.05,
"grad_norm": 0.392311201345868,
"learning_rate": 0.00010461538461538463,
"loss": 0.8214,
"step": 170
},
{
"epoch": 0.05,
"grad_norm": 0.2542347730845665,
"learning_rate": 0.0001076923076923077,
"loss": 0.8266,
"step": 175
},
{
"epoch": 0.06,
"grad_norm": 0.30567410806640644,
"learning_rate": 0.00011076923076923077,
"loss": 0.8421,
"step": 180
},
{
"epoch": 0.06,
"grad_norm": 0.3347043226775438,
"learning_rate": 0.00011384615384615384,
"loss": 0.8482,
"step": 185
},
{
"epoch": 0.06,
"grad_norm": 0.39125501413574576,
"learning_rate": 0.00011692307692307694,
"loss": 0.7707,
"step": 190
},
{
"epoch": 0.06,
"grad_norm": 0.27082032316598875,
"learning_rate": 0.00012,
"loss": 0.8802,
"step": 195
},
{
"epoch": 0.06,
"grad_norm": 0.2655311149315157,
"learning_rate": 0.0001230769230769231,
"loss": 0.8666,
"step": 200
},
{
"epoch": 0.06,
"grad_norm": 0.3211818059226096,
"learning_rate": 0.00012615384615384615,
"loss": 0.8397,
"step": 205
},
{
"epoch": 0.06,
"grad_norm": 0.2924195950677733,
"learning_rate": 0.00012923076923076923,
"loss": 0.8067,
"step": 210
},
{
"epoch": 0.07,
"grad_norm": 0.27236793564577827,
"learning_rate": 0.0001323076923076923,
"loss": 0.8266,
"step": 215
},
{
"epoch": 0.07,
"grad_norm": 0.31068056017205964,
"learning_rate": 0.0001353846153846154,
"loss": 0.727,
"step": 220
},
{
"epoch": 0.07,
"grad_norm": 0.24520172222389908,
"learning_rate": 0.00013846153846153847,
"loss": 0.7937,
"step": 225
},
{
"epoch": 0.07,
"grad_norm": 0.28996280021429405,
"learning_rate": 0.00014153846153846156,
"loss": 0.8405,
"step": 230
},
{
"epoch": 0.07,
"grad_norm": 0.26492992094167794,
"learning_rate": 0.0001446153846153846,
"loss": 0.8417,
"step": 235
},
{
"epoch": 0.07,
"grad_norm": 0.28305833332691566,
"learning_rate": 0.00014769230769230772,
"loss": 0.9148,
"step": 240
},
{
"epoch": 0.08,
"grad_norm": 0.31032712702750226,
"learning_rate": 0.00015076923076923077,
"loss": 0.8458,
"step": 245
},
{
"epoch": 0.08,
"grad_norm": 0.32447015298077714,
"learning_rate": 0.00015384615384615385,
"loss": 0.7821,
"step": 250
},
{
"epoch": 0.08,
"grad_norm": 0.2547532649722294,
"learning_rate": 0.00015692307692307693,
"loss": 0.8207,
"step": 255
},
{
"epoch": 0.08,
"grad_norm": 0.2735833452730825,
"learning_rate": 0.00016,
"loss": 0.8249,
"step": 260
},
{
"epoch": 0.08,
"grad_norm": 0.22729853067013947,
"learning_rate": 0.0001630769230769231,
"loss": 0.8387,
"step": 265
},
{
"epoch": 0.08,
"grad_norm": 0.3343853232959839,
"learning_rate": 0.00016615384615384617,
"loss": 0.8648,
"step": 270
},
{
"epoch": 0.08,
"grad_norm": 0.3108160988317094,
"learning_rate": 0.00016923076923076923,
"loss": 0.804,
"step": 275
},
{
"epoch": 0.09,
"grad_norm": 0.2609065872820603,
"learning_rate": 0.00017230769230769234,
"loss": 0.8216,
"step": 280
},
{
"epoch": 0.09,
"grad_norm": 0.24454880681449043,
"learning_rate": 0.0001753846153846154,
"loss": 0.8174,
"step": 285
},
{
"epoch": 0.09,
"grad_norm": 0.22180414129702308,
"learning_rate": 0.00017846153846153847,
"loss": 0.8579,
"step": 290
},
{
"epoch": 0.09,
"grad_norm": 0.26081939474045385,
"learning_rate": 0.00018153846153846155,
"loss": 0.8432,
"step": 295
},
{
"epoch": 0.09,
"grad_norm": 0.2751512686500224,
"learning_rate": 0.00018461538461538463,
"loss": 0.6994,
"step": 300
},
{
"epoch": 0.09,
"grad_norm": 0.24286008540174067,
"learning_rate": 0.0001876923076923077,
"loss": 0.8409,
"step": 305
},
{
"epoch": 0.1,
"grad_norm": 0.2306911443540912,
"learning_rate": 0.0001907692307692308,
"loss": 0.8241,
"step": 310
},
{
"epoch": 0.1,
"grad_norm": 0.3568984623630479,
"learning_rate": 0.00019384615384615385,
"loss": 0.7153,
"step": 315
},
{
"epoch": 0.1,
"grad_norm": 0.36681138166946803,
"learning_rate": 0.00019692307692307696,
"loss": 0.8065,
"step": 320
},
{
"epoch": 0.1,
"grad_norm": 0.22369081814221262,
"learning_rate": 0.0002,
"loss": 0.757,
"step": 325
},
{
"epoch": 0.1,
"grad_norm": 0.32740968759147726,
"learning_rate": 0.00019999855605356607,
"loss": 0.785,
"step": 330
},
{
"epoch": 0.1,
"grad_norm": 0.38663458318983307,
"learning_rate": 0.0001999942242559639,
"loss": 0.7893,
"step": 335
},
{
"epoch": 0.1,
"grad_norm": 0.3744195353028169,
"learning_rate": 0.00019998700473229113,
"loss": 0.8817,
"step": 340
},
{
"epoch": 0.11,
"grad_norm": 0.2937020154962458,
"learning_rate": 0.00019997689769103992,
"loss": 0.8068,
"step": 345
},
{
"epoch": 0.11,
"grad_norm": 0.31130184115081005,
"learning_rate": 0.00019996390342409071,
"loss": 0.8888,
"step": 350
},
{
"epoch": 0.11,
"grad_norm": 0.27717910910942206,
"learning_rate": 0.00019994802230670415,
"loss": 0.8296,
"step": 355
},
{
"epoch": 0.11,
"grad_norm": 0.28929048963159804,
"learning_rate": 0.00019992925479750978,
"loss": 0.8375,
"step": 360
},
{
"epoch": 0.11,
"grad_norm": 0.30278441435173964,
"learning_rate": 0.00019990760143849317,
"loss": 0.7978,
"step": 365
},
{
"epoch": 0.11,
"grad_norm": 0.3926711605744842,
"learning_rate": 0.00019988306285498018,
"loss": 0.8156,
"step": 370
},
{
"epoch": 0.12,
"grad_norm": 0.2056371002966051,
"learning_rate": 0.0001998556397556188,
"loss": 0.8492,
"step": 375
},
{
"epoch": 0.12,
"grad_norm": 0.3386258840898327,
"learning_rate": 0.00019982533293235873,
"loss": 0.7553,
"step": 380
},
{
"epoch": 0.12,
"grad_norm": 0.24704095019032765,
"learning_rate": 0.00019979214326042857,
"loss": 0.8032,
"step": 385
},
{
"epoch": 0.12,
"grad_norm": 0.3027504102198928,
"learning_rate": 0.0001997560716983105,
"loss": 0.8777,
"step": 390
},
{
"epoch": 0.12,
"grad_norm": 0.25850106416138335,
"learning_rate": 0.00019971711928771257,
"loss": 0.8353,
"step": 395
},
{
"epoch": 0.12,
"grad_norm": 0.42457411611908963,
"learning_rate": 0.0001996752871535387,
"loss": 0.7962,
"step": 400
},
{
"epoch": 0.12,
"grad_norm": 0.32389294595176554,
"learning_rate": 0.00019963057650385606,
"loss": 0.8473,
"step": 405
},
{
"epoch": 0.13,
"grad_norm": 0.2232805240763939,
"learning_rate": 0.0001995829886298604,
"loss": 0.7768,
"step": 410
},
{
"epoch": 0.13,
"grad_norm": 0.26355868762668144,
"learning_rate": 0.00019953252490583843,
"loss": 0.8432,
"step": 415
},
{
"epoch": 0.13,
"grad_norm": 0.2479995279505114,
"learning_rate": 0.00019947918678912848,
"loss": 0.8742,
"step": 420
},
{
"epoch": 0.13,
"grad_norm": 0.26547854409221383,
"learning_rate": 0.0001994229758200783,
"loss": 0.8072,
"step": 425
},
{
"epoch": 0.13,
"grad_norm": 0.23642626786417162,
"learning_rate": 0.00019936389362200033,
"loss": 0.7956,
"step": 430
},
{
"epoch": 0.13,
"grad_norm": 0.25913092229555307,
"learning_rate": 0.00019930194190112522,
"loss": 0.7345,
"step": 435
},
{
"epoch": 0.14,
"grad_norm": 0.2966162150235158,
"learning_rate": 0.00019923712244655225,
"loss": 0.8089,
"step": 440
},
{
"epoch": 0.14,
"grad_norm": 0.2187595004800295,
"learning_rate": 0.00019916943713019794,
"loss": 0.7427,
"step": 445
},
{
"epoch": 0.14,
"grad_norm": 0.29051575591401246,
"learning_rate": 0.00019909888790674155,
"loss": 0.8768,
"step": 450
},
{
"epoch": 0.14,
"grad_norm": 0.3296601997445316,
"learning_rate": 0.00019902547681356923,
"loss": 0.8616,
"step": 455
},
{
"epoch": 0.14,
"grad_norm": 0.29185622420361307,
"learning_rate": 0.0001989492059707146,
"loss": 0.7993,
"step": 460
},
{
"epoch": 0.14,
"grad_norm": 0.285867084295898,
"learning_rate": 0.00019887007758079793,
"loss": 0.8207,
"step": 465
},
{
"epoch": 0.14,
"grad_norm": 0.30952870458662307,
"learning_rate": 0.00019878809392896235,
"loss": 0.8668,
"step": 470
},
{
"epoch": 0.15,
"grad_norm": 0.3381740373711063,
"learning_rate": 0.00019870325738280785,
"loss": 0.8842,
"step": 475
},
{
"epoch": 0.15,
"grad_norm": 0.21684296837932523,
"learning_rate": 0.0001986155703923231,
"loss": 0.7966,
"step": 480
},
{
"epoch": 0.15,
"grad_norm": 0.3040871521339894,
"learning_rate": 0.0001985250354898143,
"loss": 0.8622,
"step": 485
},
{
"epoch": 0.15,
"grad_norm": 0.26978651830594724,
"learning_rate": 0.0001984316552898326,
"loss": 0.8748,
"step": 490
},
{
"epoch": 0.15,
"grad_norm": 0.29082578689683647,
"learning_rate": 0.00019833543248909798,
"loss": 0.8407,
"step": 495
},
{
"epoch": 0.15,
"grad_norm": 0.301663442193365,
"learning_rate": 0.00019823636986642199,
"loss": 0.8568,
"step": 500
},
{
"epoch": 0.16,
"grad_norm": 0.2552544076755423,
"learning_rate": 0.0001981344702826269,
"loss": 0.8286,
"step": 505
},
{
"epoch": 0.16,
"grad_norm": 0.24913640355204184,
"learning_rate": 0.00019802973668046363,
"loss": 0.8022,
"step": 510
},
{
"epoch": 0.16,
"grad_norm": 0.2217941168846133,
"learning_rate": 0.00019792217208452635,
"loss": 0.8674,
"step": 515
},
{
"epoch": 0.16,
"grad_norm": 0.2891487359747499,
"learning_rate": 0.00019781177960116538,
"loss": 0.8123,
"step": 520
},
{
"epoch": 0.16,
"grad_norm": 0.34655206684809864,
"learning_rate": 0.00019769856241839737,
"loss": 0.8517,
"step": 525
},
{
"epoch": 0.16,
"grad_norm": 0.3053447288771597,
"learning_rate": 0.00019758252380581328,
"loss": 0.8821,
"step": 530
},
{
"epoch": 0.16,
"grad_norm": 0.3307139329014054,
"learning_rate": 0.00019746366711448387,
"loss": 0.8677,
"step": 535
},
{
"epoch": 0.17,
"grad_norm": 0.306144694096585,
"learning_rate": 0.00019734199577686314,
"loss": 0.7189,
"step": 540
},
{
"epoch": 0.17,
"grad_norm": 0.2774735539484507,
"learning_rate": 0.0001972175133066889,
"loss": 0.7494,
"step": 545
},
{
"epoch": 0.17,
"grad_norm": 0.3140012878663545,
"learning_rate": 0.00019709022329888155,
"loss": 0.7943,
"step": 550
},
{
"epoch": 0.17,
"grad_norm": 0.2646845744217625,
"learning_rate": 0.00019696012942944013,
"loss": 0.836,
"step": 555
},
{
"epoch": 0.17,
"grad_norm": 0.2308386439333217,
"learning_rate": 0.00019682723545533628,
"loss": 0.8478,
"step": 560
},
{
"epoch": 0.17,
"grad_norm": 0.262138689846067,
"learning_rate": 0.00019669154521440553,
"loss": 0.7914,
"step": 565
},
{
"epoch": 0.18,
"grad_norm": 0.6748339885003066,
"learning_rate": 0.0001965530626252367,
"loss": 0.8494,
"step": 570
},
{
"epoch": 0.18,
"grad_norm": 0.33850537974316935,
"learning_rate": 0.00019641179168705862,
"loss": 0.6988,
"step": 575
},
{
"epoch": 0.18,
"grad_norm": 0.2655667830205273,
"learning_rate": 0.00019626773647962457,
"loss": 0.8944,
"step": 580
},
{
"epoch": 0.18,
"grad_norm": 0.266738555121118,
"learning_rate": 0.0001961209011630947,
"loss": 0.8797,
"step": 585
},
{
"epoch": 0.18,
"grad_norm": 0.2867657573604784,
"learning_rate": 0.0001959712899779156,
"loss": 0.8718,
"step": 590
},
{
"epoch": 0.18,
"grad_norm": 0.3370395857653061,
"learning_rate": 0.00019581890724469802,
"loss": 0.8289,
"step": 595
},
{
"epoch": 0.18,
"grad_norm": 0.29934897150076484,
"learning_rate": 0.00019566375736409204,
"loss": 0.822,
"step": 600
},
{
"epoch": 0.19,
"grad_norm": 0.28585720261383735,
"learning_rate": 0.00019550584481666002,
"loss": 0.8579,
"step": 605
},
{
"epoch": 0.19,
"grad_norm": 0.3285211504524654,
"learning_rate": 0.0001953451741627471,
"loss": 0.8795,
"step": 610
},
{
"epoch": 0.19,
"grad_norm": 0.24511909912168928,
"learning_rate": 0.0001951817500423497,
"loss": 0.7862,
"step": 615
},
{
"epoch": 0.19,
"grad_norm": 0.253601728108672,
"learning_rate": 0.0001950155771749813,
"loss": 0.8076,
"step": 620
},
{
"epoch": 0.19,
"grad_norm": 0.3092833252431494,
"learning_rate": 0.00019484666035953632,
"loss": 0.7513,
"step": 625
},
{
"epoch": 0.19,
"grad_norm": 0.26385573322113004,
"learning_rate": 0.00019467500447415138,
"loss": 0.8263,
"step": 630
},
{
"epoch": 0.2,
"grad_norm": 0.28938768331562686,
"learning_rate": 0.00019450061447606455,
"loss": 0.7777,
"step": 635
},
{
"epoch": 0.2,
"grad_norm": 0.2529231145412997,
"learning_rate": 0.00019432349540147222,
"loss": 0.8287,
"step": 640
},
{
"epoch": 0.2,
"grad_norm": 0.21789719255683243,
"learning_rate": 0.00019414365236538342,
"loss": 0.789,
"step": 645
},
{
"epoch": 0.2,
"grad_norm": 0.2926240350717249,
"learning_rate": 0.00019396109056147242,
"loss": 0.8396,
"step": 650
},
{
"epoch": 0.2,
"grad_norm": 0.21569069093149024,
"learning_rate": 0.00019377581526192853,
"loss": 0.7599,
"step": 655
},
{
"epoch": 0.2,
"grad_norm": 0.26361195643008684,
"learning_rate": 0.00019358783181730387,
"loss": 0.8687,
"step": 660
},
{
"epoch": 0.2,
"grad_norm": 0.285068572985004,
"learning_rate": 0.00019339714565635898,
"loss": 0.7735,
"step": 665
},
{
"epoch": 0.21,
"grad_norm": 0.32266517654300897,
"learning_rate": 0.0001932037622859059,
"loss": 0.754,
"step": 670
},
{
"epoch": 0.21,
"grad_norm": 0.2678959128715121,
"learning_rate": 0.00019300768729064912,
"loss": 0.8024,
"step": 675
},
{
"epoch": 0.21,
"grad_norm": 0.2808314964971424,
"learning_rate": 0.00019280892633302454,
"loss": 0.767,
"step": 680
},
{
"epoch": 0.21,
"grad_norm": 0.25044630314755334,
"learning_rate": 0.00019260748515303563,
"loss": 0.8454,
"step": 685
},
{
"epoch": 0.21,
"grad_norm": 0.30242650123792125,
"learning_rate": 0.00019240336956808786,
"loss": 0.8812,
"step": 690
},
{
"epoch": 0.21,
"grad_norm": 0.2701055207795336,
"learning_rate": 0.00019219658547282067,
"loss": 0.7791,
"step": 695
},
{
"epoch": 0.22,
"grad_norm": 0.2232333034817994,
"learning_rate": 0.0001919871388389372,
"loss": 0.7782,
"step": 700
},
{
"epoch": 0.22,
"grad_norm": 0.2578136050088398,
"learning_rate": 0.0001917750357150318,
"loss": 0.7164,
"step": 705
},
{
"epoch": 0.22,
"grad_norm": 0.2974468917976116,
"learning_rate": 0.00019156028222641554,
"loss": 0.8559,
"step": 710
},
{
"epoch": 0.22,
"grad_norm": 0.2811089697192464,
"learning_rate": 0.00019134288457493904,
"loss": 0.7352,
"step": 715
},
{
"epoch": 0.22,
"grad_norm": 0.3892901045304661,
"learning_rate": 0.0001911228490388136,
"loss": 0.7775,
"step": 720
},
{
"epoch": 0.22,
"grad_norm": 0.32835288605201257,
"learning_rate": 0.00019090018197242972,
"loss": 0.8125,
"step": 725
},
{
"epoch": 0.22,
"grad_norm": 0.3396081375822814,
"learning_rate": 0.00019067488980617384,
"loss": 0.8498,
"step": 730
},
{
"epoch": 0.23,
"grad_norm": 0.2725077722420475,
"learning_rate": 0.00019044697904624226,
"loss": 0.8652,
"step": 735
},
{
"epoch": 0.23,
"grad_norm": 0.26882238969800315,
"learning_rate": 0.0001902164562744536,
"loss": 0.8316,
"step": 740
},
{
"epoch": 0.23,
"grad_norm": 0.2942779783614407,
"learning_rate": 0.00018998332814805852,
"loss": 0.8937,
"step": 745
},
{
"epoch": 0.23,
"grad_norm": 0.23318828090848456,
"learning_rate": 0.0001897476013995476,
"loss": 0.7247,
"step": 750
},
{
"epoch": 0.23,
"grad_norm": 0.3207657739253934,
"learning_rate": 0.00018950928283645676,
"loss": 0.8168,
"step": 755
},
{
"epoch": 0.23,
"grad_norm": 0.27343106758276103,
"learning_rate": 0.00018926837934117084,
"loss": 0.7436,
"step": 760
},
{
"epoch": 0.24,
"grad_norm": 0.2380153711122644,
"learning_rate": 0.0001890248978707246,
"loss": 0.845,
"step": 765
},
{
"epoch": 0.24,
"grad_norm": 0.28205742315414456,
"learning_rate": 0.00018877884545660215,
"loss": 0.8329,
"step": 770
},
{
"epoch": 0.24,
"grad_norm": 0.248001179467568,
"learning_rate": 0.0001885302292045336,
"loss": 0.8322,
"step": 775
},
{
"epoch": 0.24,
"grad_norm": 0.512811292371171,
"learning_rate": 0.0001882790562942899,
"loss": 0.7778,
"step": 780
},
{
"epoch": 0.24,
"grad_norm": 0.3027018813313023,
"learning_rate": 0.00018802533397947567,
"loss": 0.8338,
"step": 785
},
{
"epoch": 0.24,
"grad_norm": 0.2940436466186521,
"learning_rate": 0.00018776906958731953,
"loss": 0.6823,
"step": 790
},
{
"epoch": 0.24,
"grad_norm": 0.33833283166239086,
"learning_rate": 0.00018751027051846258,
"loss": 0.7669,
"step": 795
},
{
"epoch": 0.25,
"grad_norm": 0.31248954654812844,
"learning_rate": 0.00018724894424674467,
"loss": 0.7851,
"step": 800
},
{
"epoch": 0.25,
"grad_norm": 0.3080103407030162,
"learning_rate": 0.00018698509831898853,
"loss": 0.8465,
"step": 805
},
{
"epoch": 0.25,
"grad_norm": 0.27928229688289624,
"learning_rate": 0.00018671874035478195,
"loss": 0.7708,
"step": 810
},
{
"epoch": 0.25,
"grad_norm": 0.2926934897262978,
"learning_rate": 0.00018644987804625757,
"loss": 0.8816,
"step": 815
},
{
"epoch": 0.25,
"grad_norm": 0.2564445050641534,
"learning_rate": 0.00018617851915787078,
"loss": 0.8748,
"step": 820
},
{
"epoch": 0.25,
"grad_norm": 0.3167275363170148,
"learning_rate": 0.0001859046715261756,
"loss": 0.7955,
"step": 825
},
{
"epoch": 0.26,
"grad_norm": 0.4082416585797153,
"learning_rate": 0.00018562834305959824,
"loss": 0.7464,
"step": 830
},
{
"epoch": 0.26,
"grad_norm": 0.3030422995435233,
"learning_rate": 0.0001853495417382088,
"loss": 0.9046,
"step": 835
},
{
"epoch": 0.26,
"grad_norm": 0.2536332149798187,
"learning_rate": 0.00018506827561349073,
"loss": 0.7143,
"step": 840
},
{
"epoch": 0.26,
"grad_norm": 0.3272764072322209,
"learning_rate": 0.00018478455280810838,
"loss": 0.8358,
"step": 845
},
{
"epoch": 0.26,
"grad_norm": 0.28310232183891465,
"learning_rate": 0.00018449838151567244,
"loss": 0.842,
"step": 850
},
{
"epoch": 0.26,
"grad_norm": 0.26576658481713733,
"learning_rate": 0.00018420977000050323,
"loss": 0.7563,
"step": 855
},
{
"epoch": 0.26,
"grad_norm": 0.2612196906596331,
"learning_rate": 0.00018391872659739215,
"loss": 0.7631,
"step": 860
},
{
"epoch": 0.27,
"grad_norm": 0.43199033496139155,
"learning_rate": 0.00018362525971136082,
"loss": 0.8585,
"step": 865
},
{
"epoch": 0.27,
"grad_norm": 0.3011184188491384,
"learning_rate": 0.00018332937781741858,
"loss": 0.807,
"step": 870
},
{
"epoch": 0.27,
"grad_norm": 0.3432230727339861,
"learning_rate": 0.00018303108946031747,
"loss": 0.806,
"step": 875
},
{
"epoch": 0.27,
"grad_norm": 0.28699539333378254,
"learning_rate": 0.00018273040325430574,
"loss": 0.8063,
"step": 880
},
{
"epoch": 0.27,
"grad_norm": 0.2895901648006327,
"learning_rate": 0.00018242732788287884,
"loss": 0.7773,
"step": 885
},
{
"epoch": 0.27,
"grad_norm": 0.30393045217103676,
"learning_rate": 0.00018212187209852888,
"loss": 0.7721,
"step": 890
},
{
"epoch": 0.28,
"grad_norm": 0.4409757200332159,
"learning_rate": 0.00018181404472249158,
"loss": 0.805,
"step": 895
},
{
"epoch": 0.28,
"grad_norm": 0.36679860235251033,
"learning_rate": 0.00018150385464449183,
"loss": 0.7759,
"step": 900
},
{
"epoch": 0.28,
"grad_norm": 0.276840442597116,
"learning_rate": 0.00018119131082248676,
"loss": 0.8182,
"step": 905
},
{
"epoch": 0.28,
"grad_norm": 0.2365689665357522,
"learning_rate": 0.00018087642228240713,
"loss": 0.7851,
"step": 910
},
{
"epoch": 0.28,
"grad_norm": 0.30377473821055756,
"learning_rate": 0.00018055919811789658,
"loss": 0.7467,
"step": 915
},
{
"epoch": 0.28,
"grad_norm": 0.40215196146679155,
"learning_rate": 0.00018023964749004921,
"loss": 0.7436,
"step": 920
},
{
"epoch": 0.28,
"grad_norm": 0.24424508927481137,
"learning_rate": 0.00017991777962714472,
"loss": 0.8502,
"step": 925
},
{
"epoch": 0.29,
"grad_norm": 0.3549699023868391,
"learning_rate": 0.00017959360382438226,
"loss": 0.8607,
"step": 930
},
{
"epoch": 0.29,
"grad_norm": 0.29288526726309294,
"learning_rate": 0.00017926712944361164,
"loss": 0.7812,
"step": 935
},
{
"epoch": 0.29,
"grad_norm": 0.38300023494160845,
"learning_rate": 0.00017893836591306326,
"loss": 0.965,
"step": 940
},
{
"epoch": 0.29,
"grad_norm": 0.3400552848154392,
"learning_rate": 0.00017860732272707565,
"loss": 0.9296,
"step": 945
},
{
"epoch": 0.29,
"grad_norm": 0.33946589539162436,
"learning_rate": 0.0001782740094458214,
"loss": 0.7948,
"step": 950
},
{
"epoch": 0.29,
"grad_norm": 0.3106409161075979,
"learning_rate": 0.00017793843569503096,
"loss": 0.9234,
"step": 955
},
{
"epoch": 0.3,
"grad_norm": 0.3048504659492406,
"learning_rate": 0.00017760061116571472,
"loss": 0.735,
"step": 960
},
{
"epoch": 0.3,
"grad_norm": 0.3699006267760609,
"learning_rate": 0.00017726054561388325,
"loss": 0.8097,
"step": 965
},
{
"epoch": 0.3,
"grad_norm": 0.3812470781886498,
"learning_rate": 0.0001769182488602653,
"loss": 0.7924,
"step": 970
},
{
"epoch": 0.3,
"grad_norm": 0.31391420913653745,
"learning_rate": 0.0001765737307900244,
"loss": 0.8468,
"step": 975
},
{
"epoch": 0.3,
"grad_norm": 0.2798144915599161,
"learning_rate": 0.00017622700135247336,
"loss": 0.7466,
"step": 980
},
{
"epoch": 0.3,
"grad_norm": 0.4373736539748376,
"learning_rate": 0.0001758780705607869,
"loss": 0.7782,
"step": 985
},
{
"epoch": 0.3,
"grad_norm": 0.31499380584195635,
"learning_rate": 0.00017552694849171238,
"loss": 0.7623,
"step": 990
},
{
"epoch": 0.31,
"grad_norm": 0.34084198516003655,
"learning_rate": 0.00017517364528527905,
"loss": 0.7643,
"step": 995
},
{
"epoch": 0.31,
"grad_norm": 0.220259377380433,
"learning_rate": 0.00017481817114450504,
"loss": 0.7041,
"step": 1000
},
{
"epoch": 0.31,
"grad_norm": 0.3741361155303008,
"learning_rate": 0.00017446053633510267,
"loss": 0.8331,
"step": 1005
},
{
"epoch": 0.31,
"grad_norm": 0.25186403738162744,
"learning_rate": 0.00017410075118518207,
"loss": 0.7746,
"step": 1010
},
{
"epoch": 0.31,
"grad_norm": 0.3381680931423155,
"learning_rate": 0.000173738826084953,
"loss": 0.8091,
"step": 1015
},
{
"epoch": 0.31,
"grad_norm": 0.30332385851807925,
"learning_rate": 0.00017337477148642453,
"loss": 0.8123,
"step": 1020
},
{
"epoch": 0.32,
"grad_norm": 0.4189781620141866,
"learning_rate": 0.0001730085979031035,
"loss": 0.7662,
"step": 1025
},
{
"epoch": 0.32,
"grad_norm": 0.20916757459764715,
"learning_rate": 0.0001726403159096907,
"loss": 0.7658,
"step": 1030
},
{
"epoch": 0.32,
"grad_norm": 0.25400278359501394,
"learning_rate": 0.0001722699361417755,
"loss": 0.7761,
"step": 1035
},
{
"epoch": 0.32,
"grad_norm": 0.29634266788228114,
"learning_rate": 0.00017189746929552885,
"loss": 0.7712,
"step": 1040
},
{
"epoch": 0.32,
"grad_norm": 0.34416521475533296,
"learning_rate": 0.00017152292612739427,
"loss": 0.8657,
"step": 1045
},
{
"epoch": 0.32,
"grad_norm": 0.26477218894974247,
"learning_rate": 0.00017114631745377716,
"loss": 0.7979,
"step": 1050
},
{
"epoch": 0.32,
"grad_norm": 0.39575314169996223,
"learning_rate": 0.00017076765415073252,
"loss": 0.7657,
"step": 1055
},
{
"epoch": 0.33,
"grad_norm": 0.3060703163684423,
"learning_rate": 0.0001703869471536509,
"loss": 0.7758,
"step": 1060
},
{
"epoch": 0.33,
"grad_norm": 0.3041979747156484,
"learning_rate": 0.00017000420745694254,
"loss": 0.8641,
"step": 1065
},
{
"epoch": 0.33,
"grad_norm": 0.2891225024964161,
"learning_rate": 0.0001696194461137198,
"loss": 0.8824,
"step": 1070
},
{
"epoch": 0.33,
"grad_norm": 0.2718444108580119,
"learning_rate": 0.0001692326742354781,
"loss": 0.7924,
"step": 1075
},
{
"epoch": 0.33,
"grad_norm": 0.259262158014512,
"learning_rate": 0.00016884390299177492,
"loss": 0.8369,
"step": 1080
},
{
"epoch": 0.33,
"grad_norm": 0.3499348899825314,
"learning_rate": 0.00016845314360990727,
"loss": 0.8346,
"step": 1085
},
{
"epoch": 0.34,
"grad_norm": 0.2715739588374178,
"learning_rate": 0.00016806040737458745,
"loss": 0.8032,
"step": 1090
},
{
"epoch": 0.34,
"grad_norm": 0.31767202890667356,
"learning_rate": 0.00016766570562761726,
"loss": 0.7771,
"step": 1095
},
{
"epoch": 0.34,
"grad_norm": 0.2775818452285115,
"learning_rate": 0.00016726904976756024,
"loss": 0.7571,
"step": 1100
},
{
"epoch": 0.34,
"grad_norm": 0.40533408177768837,
"learning_rate": 0.00016687045124941268,
"loss": 0.7487,
"step": 1105
},
{
"epoch": 0.34,
"grad_norm": 0.23186261109536282,
"learning_rate": 0.0001664699215842728,
"loss": 0.7442,
"step": 1110
},
{
"epoch": 0.34,
"grad_norm": 0.39092343811338354,
"learning_rate": 0.00016606747233900815,
"loss": 0.8009,
"step": 1115
},
{
"epoch": 0.34,
"grad_norm": 0.3086065524408362,
"learning_rate": 0.00016566311513592188,
"loss": 0.8045,
"step": 1120
},
{
"epoch": 0.35,
"grad_norm": 0.3367565478870619,
"learning_rate": 0.00016525686165241673,
"loss": 0.767,
"step": 1125
},
{
"epoch": 0.35,
"grad_norm": 0.3086208201423001,
"learning_rate": 0.00016484872362065818,
"loss": 0.8297,
"step": 1130
},
{
"epoch": 0.35,
"grad_norm": 0.29118078471035397,
"learning_rate": 0.0001644387128272353,
"loss": 0.73,
"step": 1135
},
{
"epoch": 0.35,
"grad_norm": 0.2276647259111584,
"learning_rate": 0.00016402684111282048,
"loss": 0.7594,
"step": 1140
},
{
"epoch": 0.35,
"grad_norm": 0.31848819925406663,
"learning_rate": 0.00016361312037182764,
"loss": 0.7175,
"step": 1145
},
{
"epoch": 0.35,
"grad_norm": 0.3783361351946572,
"learning_rate": 0.00016319756255206856,
"loss": 0.8027,
"step": 1150
},
{
"epoch": 0.36,
"grad_norm": 0.266088499578184,
"learning_rate": 0.00016278017965440787,
"loss": 0.7452,
"step": 1155
},
{
"epoch": 0.36,
"grad_norm": 0.29706113106114784,
"learning_rate": 0.0001623609837324165,
"loss": 0.8534,
"step": 1160
},
{
"epoch": 0.36,
"grad_norm": 0.3339471167119851,
"learning_rate": 0.00016193998689202358,
"loss": 0.8144,
"step": 1165
},
{
"epoch": 0.36,
"grad_norm": 0.28540623867963755,
"learning_rate": 0.00016151720129116686,
"loss": 0.7651,
"step": 1170
},
{
"epoch": 0.36,
"grad_norm": 0.2639670831879774,
"learning_rate": 0.00016109263913944154,
"loss": 0.7034,
"step": 1175
},
{
"epoch": 0.36,
"grad_norm": 0.33170223538556726,
"learning_rate": 0.00016066631269774767,
"loss": 0.7217,
"step": 1180
},
{
"epoch": 0.36,
"grad_norm": 0.3941911797947879,
"learning_rate": 0.00016023823427793626,
"loss": 0.7772,
"step": 1185
},
{
"epoch": 0.37,
"grad_norm": 0.3440363876277395,
"learning_rate": 0.00015980841624245335,
"loss": 0.727,
"step": 1190
},
{
"epoch": 0.37,
"grad_norm": 0.27303644578696656,
"learning_rate": 0.00015937687100398343,
"loss": 0.7976,
"step": 1195
},
{
"epoch": 0.37,
"grad_norm": 0.19363446071730062,
"learning_rate": 0.0001589436110250906,
"loss": 0.7601,
"step": 1200
},
{
"epoch": 0.37,
"grad_norm": 0.3220033440998593,
"learning_rate": 0.00015850864881785892,
"loss": 0.8059,
"step": 1205
},
{
"epoch": 0.37,
"grad_norm": 0.24875841441874524,
"learning_rate": 0.00015807199694353093,
"loss": 0.7766,
"step": 1210
},
{
"epoch": 0.37,
"grad_norm": 0.27144712486585415,
"learning_rate": 0.000157633668012145,
"loss": 0.9517,
"step": 1215
},
{
"epoch": 0.38,
"grad_norm": 0.29984291637849303,
"learning_rate": 0.00015719367468217102,
"loss": 0.7078,
"step": 1220
},
{
"epoch": 0.38,
"grad_norm": 0.42717086324378223,
"learning_rate": 0.00015675202966014502,
"loss": 0.6811,
"step": 1225
},
{
"epoch": 0.38,
"grad_norm": 0.34151608330461036,
"learning_rate": 0.0001563087457003021,
"loss": 0.7748,
"step": 1230
},
{
"epoch": 0.38,
"grad_norm": 0.3709398696526256,
"learning_rate": 0.0001558638356042081,
"loss": 0.7182,
"step": 1235
},
{
"epoch": 0.38,
"grad_norm": 0.34922444682335496,
"learning_rate": 0.00015541731222038998,
"loss": 0.8094,
"step": 1240
},
{
"epoch": 0.38,
"grad_norm": 0.3980009403514347,
"learning_rate": 0.00015496918844396467,
"loss": 0.8039,
"step": 1245
},
{
"epoch": 0.38,
"grad_norm": 0.3512099804126243,
"learning_rate": 0.00015451947721626676,
"loss": 0.79,
"step": 1250
},
{
"epoch": 0.39,
"grad_norm": 0.40719330570424017,
"learning_rate": 0.00015406819152447474,
"loss": 0.6692,
"step": 1255
},
{
"epoch": 0.39,
"grad_norm": 0.32754689412178806,
"learning_rate": 0.0001536153444012359,
"loss": 0.7442,
"step": 1260
},
{
"epoch": 0.39,
"grad_norm": 0.3468017225536143,
"learning_rate": 0.00015316094892428995,
"loss": 0.7848,
"step": 1265
},
{
"epoch": 0.39,
"grad_norm": 0.3434551451114442,
"learning_rate": 0.00015270501821609158,
"loss": 0.7438,
"step": 1270
},
{
"epoch": 0.39,
"grad_norm": 0.34114725338586066,
"learning_rate": 0.00015224756544343114,
"loss": 0.6742,
"step": 1275
},
{
"epoch": 0.39,
"grad_norm": 0.35582931502840126,
"learning_rate": 0.00015178860381705457,
"loss": 0.6642,
"step": 1280
},
{
"epoch": 0.4,
"grad_norm": 0.3951492370139673,
"learning_rate": 0.00015132814659128205,
"loss": 0.7963,
"step": 1285
},
{
"epoch": 0.4,
"grad_norm": 0.3017429849876104,
"learning_rate": 0.00015086620706362486,
"loss": 0.7752,
"step": 1290
},
{
"epoch": 0.4,
"grad_norm": 0.2626021589785786,
"learning_rate": 0.00015040279857440176,
"loss": 0.7782,
"step": 1295
},
{
"epoch": 0.4,
"grad_norm": 0.33978693007893296,
"learning_rate": 0.0001499379345063534,
"loss": 0.7799,
"step": 1300
},
{
"epoch": 0.4,
"grad_norm": 0.3251454729934429,
"learning_rate": 0.00014947162828425606,
"loss": 0.7907,
"step": 1305
},
{
"epoch": 0.4,
"grad_norm": 0.46699196417323946,
"learning_rate": 0.00014900389337453392,
"loss": 0.8757,
"step": 1310
},
{
"epoch": 0.4,
"grad_norm": 0.30759216383899046,
"learning_rate": 0.00014853474328487,
"loss": 0.8248,
"step": 1315
},
{
"epoch": 0.41,
"grad_norm": 0.5240595026063802,
"learning_rate": 0.00014806419156381632,
"loss": 0.8153,
"step": 1320
},
{
"epoch": 0.41,
"grad_norm": 0.2984711481085597,
"learning_rate": 0.0001475922518004025,
"loss": 0.8307,
"step": 1325
},
{
"epoch": 0.41,
"grad_norm": 0.34139646796135426,
"learning_rate": 0.00014711893762374322,
"loss": 0.7983,
"step": 1330
},
{
"epoch": 0.41,
"grad_norm": 0.2571926484335572,
"learning_rate": 0.00014664426270264493,
"loss": 0.6837,
"step": 1335
},
{
"epoch": 0.41,
"grad_norm": 0.3257702661543834,
"learning_rate": 0.00014616824074521075,
"loss": 0.7656,
"step": 1340
},
{
"epoch": 0.41,
"grad_norm": 0.5681687224975429,
"learning_rate": 0.00014569088549844488,
"loss": 0.8412,
"step": 1345
},
{
"epoch": 0.42,
"grad_norm": 0.3442468618645148,
"learning_rate": 0.00014521221074785542,
"loss": 0.7408,
"step": 1350
},
{
"epoch": 0.42,
"grad_norm": 0.3889043102333772,
"learning_rate": 0.00014473223031705637,
"loss": 0.7891,
"step": 1355
},
{
"epoch": 0.42,
"grad_norm": 0.3512289539889666,
"learning_rate": 0.0001442509580673684,
"loss": 0.7438,
"step": 1360
},
{
"epoch": 0.42,
"grad_norm": 0.3124271122113035,
"learning_rate": 0.00014376840789741838,
"loss": 0.7047,
"step": 1365
},
{
"epoch": 0.42,
"grad_norm": 0.2200391690908901,
"learning_rate": 0.00014328459374273833,
"loss": 0.7432,
"step": 1370
},
{
"epoch": 0.42,
"grad_norm": 0.32400034100164815,
"learning_rate": 0.00014279952957536266,
"loss": 0.8155,
"step": 1375
},
{
"epoch": 0.42,
"grad_norm": 0.3003484274407438,
"learning_rate": 0.00014231322940342492,
"loss": 0.7521,
"step": 1380
},
{
"epoch": 0.43,
"grad_norm": 0.4116598695778175,
"learning_rate": 0.00014182570727075308,
"loss": 0.8548,
"step": 1385
},
{
"epoch": 0.43,
"grad_norm": 0.42125576864395314,
"learning_rate": 0.00014133697725646403,
"loss": 0.8552,
"step": 1390
},
{
"epoch": 0.43,
"grad_norm": 0.32506737333947255,
"learning_rate": 0.000140847053474557,
"loss": 0.7796,
"step": 1395
},
{
"epoch": 0.43,
"grad_norm": 0.3558852515623043,
"learning_rate": 0.00014035595007350592,
"loss": 0.782,
"step": 1400
},
{
"epoch": 0.43,
"grad_norm": 0.32892065566412354,
"learning_rate": 0.00013986368123585093,
"loss": 0.7912,
"step": 1405
},
{
"epoch": 0.43,
"grad_norm": 0.3309987342740096,
"learning_rate": 0.00013937026117778867,
"loss": 0.7852,
"step": 1410
},
{
"epoch": 0.44,
"grad_norm": 0.317076816745732,
"learning_rate": 0.00013887570414876176,
"loss": 0.8792,
"step": 1415
},
{
"epoch": 0.44,
"grad_norm": 0.3888229597038326,
"learning_rate": 0.00013838002443104742,
"loss": 0.7537,
"step": 1420
},
{
"epoch": 0.44,
"grad_norm": 0.3505522947043339,
"learning_rate": 0.00013788323633934484,
"loss": 0.7765,
"step": 1425
},
{
"epoch": 0.44,
"grad_norm": 0.30255809120744814,
"learning_rate": 0.0001373853542203619,
"loss": 0.7445,
"step": 1430
},
{
"epoch": 0.44,
"grad_norm": 0.38394599313950495,
"learning_rate": 0.00013688639245240078,
"loss": 0.717,
"step": 1435
},
{
"epoch": 0.44,
"grad_norm": 0.3546082273774911,
"learning_rate": 0.00013638636544494287,
"loss": 0.7088,
"step": 1440
},
{
"epoch": 0.44,
"grad_norm": 0.46456400202121617,
"learning_rate": 0.00013588528763823233,
"loss": 0.6481,
"step": 1445
},
{
"epoch": 0.45,
"grad_norm": 0.38142306418882993,
"learning_rate": 0.0001353831735028595,
"loss": 0.8121,
"step": 1450
},
{
"epoch": 0.45,
"grad_norm": 0.34062042874830745,
"learning_rate": 0.00013488003753934263,
"loss": 0.7098,
"step": 1455
},
{
"epoch": 0.45,
"grad_norm": 0.19799193048705183,
"learning_rate": 0.0001343758942777094,
"loss": 0.6883,
"step": 1460
},
{
"epoch": 0.45,
"grad_norm": 0.3696985192619358,
"learning_rate": 0.000133870758277077,
"loss": 0.8092,
"step": 1465
},
{
"epoch": 0.45,
"grad_norm": 0.2874954359019885,
"learning_rate": 0.00013336464412523207,
"loss": 0.8209,
"step": 1470
},
{
"epoch": 0.45,
"grad_norm": 0.3592024936010695,
"learning_rate": 0.000132857566438209,
"loss": 0.854,
"step": 1475
},
{
"epoch": 0.46,
"grad_norm": 0.29409773858597665,
"learning_rate": 0.00013234953985986824,
"loss": 0.798,
"step": 1480
},
{
"epoch": 0.46,
"grad_norm": 0.2415718204855592,
"learning_rate": 0.0001318405790614731,
"loss": 0.7382,
"step": 1485
},
{
"epoch": 0.46,
"grad_norm": 0.2584643780619029,
"learning_rate": 0.0001313306987412661,
"loss": 0.8092,
"step": 1490
},
{
"epoch": 0.46,
"grad_norm": 0.34126538154076436,
"learning_rate": 0.00013081991362404475,
"loss": 0.789,
"step": 1495
},
{
"epoch": 0.46,
"grad_norm": 0.32753475635130697,
"learning_rate": 0.00013030823846073595,
"loss": 0.8413,
"step": 1500
},
{
"epoch": 0.46,
"grad_norm": 0.3285555673315335,
"learning_rate": 0.00012979568802797022,
"loss": 0.7092,
"step": 1505
},
{
"epoch": 0.46,
"grad_norm": 0.2947608781251718,
"learning_rate": 0.00012928227712765504,
"loss": 0.645,
"step": 1510
},
{
"epoch": 0.47,
"grad_norm": 0.33949478474040173,
"learning_rate": 0.00012876802058654714,
"loss": 0.804,
"step": 1515
},
{
"epoch": 0.47,
"grad_norm": 0.43727181136357957,
"learning_rate": 0.0001282529332558245,
"loss": 0.8041,
"step": 1520
},
{
"epoch": 0.47,
"grad_norm": 0.3609023630640718,
"learning_rate": 0.00012773703001065737,
"loss": 0.8356,
"step": 1525
},
{
"epoch": 0.47,
"grad_norm": 0.3494948390700119,
"learning_rate": 0.00012722032574977881,
"loss": 0.7872,
"step": 1530
},
{
"epoch": 0.47,
"grad_norm": 0.3275549957683315,
"learning_rate": 0.0001267028353950543,
"loss": 0.7883,
"step": 1535
},
{
"epoch": 0.47,
"grad_norm": 0.2434171834573686,
"learning_rate": 0.00012618457389105094,
"loss": 0.7766,
"step": 1540
},
{
"epoch": 0.48,
"grad_norm": 0.35813509273993893,
"learning_rate": 0.00012566555620460569,
"loss": 0.7723,
"step": 1545
},
{
"epoch": 0.48,
"grad_norm": 0.3850234800177591,
"learning_rate": 0.00012514579732439323,
"loss": 0.7127,
"step": 1550
},
{
"epoch": 0.48,
"grad_norm": 0.2990175481928644,
"learning_rate": 0.00012462531226049335,
"loss": 0.8027,
"step": 1555
},
{
"epoch": 0.48,
"grad_norm": 0.26743125802211676,
"learning_rate": 0.00012410411604395696,
"loss": 0.7775,
"step": 1560
},
{
"epoch": 0.48,
"grad_norm": 0.3003015429775997,
"learning_rate": 0.00012358222372637248,
"loss": 0.8003,
"step": 1565
},
{
"epoch": 0.48,
"grad_norm": 0.25952231751732324,
"learning_rate": 0.00012305965037943096,
"loss": 0.7946,
"step": 1570
},
{
"epoch": 0.48,
"grad_norm": 0.3571723160585395,
"learning_rate": 0.00012253641109449074,
"loss": 0.7369,
"step": 1575
},
{
"epoch": 0.49,
"grad_norm": 0.3502660576927713,
"learning_rate": 0.00012201252098214186,
"loss": 0.8105,
"step": 1580
},
{
"epoch": 0.49,
"grad_norm": 0.3925450057088276,
"learning_rate": 0.00012148799517176948,
"loss": 0.7664,
"step": 1585
},
{
"epoch": 0.49,
"grad_norm": 0.2894085765012847,
"learning_rate": 0.00012096284881111711,
"loss": 0.8213,
"step": 1590
},
{
"epoch": 0.49,
"grad_norm": 0.29374369830200575,
"learning_rate": 0.00012043709706584902,
"loss": 0.7723,
"step": 1595
},
{
"epoch": 0.49,
"grad_norm": 0.2863311083269218,
"learning_rate": 0.00011991075511911236,
"loss": 0.696,
"step": 1600
},
{
"epoch": 0.49,
"grad_norm": 0.3036662438900221,
"learning_rate": 0.00011938383817109868,
"loss": 0.8753,
"step": 1605
},
{
"epoch": 0.5,
"grad_norm": 0.3020605197833583,
"learning_rate": 0.00011885636143860492,
"loss": 0.8759,
"step": 1610
},
{
"epoch": 0.5,
"grad_norm": 0.3639681427966891,
"learning_rate": 0.00011832834015459404,
"loss": 0.8606,
"step": 1615
},
{
"epoch": 0.5,
"grad_norm": 0.37953818216433793,
"learning_rate": 0.00011779978956775506,
"loss": 0.7051,
"step": 1620
},
{
"epoch": 0.5,
"grad_norm": 0.3184781493318525,
"learning_rate": 0.00011727072494206262,
"loss": 0.7916,
"step": 1625
},
{
"epoch": 0.5,
"grad_norm": 0.35142683733387886,
"learning_rate": 0.00011674116155633637,
"loss": 0.8831,
"step": 1630
},
{
"epoch": 0.5,
"grad_norm": 0.3117830556752173,
"learning_rate": 0.00011621111470379951,
"loss": 0.8306,
"step": 1635
},
{
"epoch": 0.5,
"grad_norm": 0.4495145775092123,
"learning_rate": 0.00011568059969163734,
"loss": 0.7767,
"step": 1640
},
{
"epoch": 0.51,
"grad_norm": 0.29751872220308234,
"learning_rate": 0.00011514963184055503,
"loss": 0.7627,
"step": 1645
},
{
"epoch": 0.51,
"grad_norm": 0.5069779219255514,
"learning_rate": 0.00011461822648433527,
"loss": 0.7007,
"step": 1650
},
{
"epoch": 0.51,
"grad_norm": 0.3685939765535684,
"learning_rate": 0.00011408639896939548,
"loss": 0.7903,
"step": 1655
},
{
"epoch": 0.51,
"grad_norm": 0.35043401596057283,
"learning_rate": 0.0001135541646543445,
"loss": 0.8195,
"step": 1660
},
{
"epoch": 0.51,
"grad_norm": 0.43437482478281425,
"learning_rate": 0.00011302153890953917,
"loss": 0.7474,
"step": 1665
},
{
"epoch": 0.51,
"grad_norm": 0.424740143766434,
"learning_rate": 0.00011248853711664037,
"loss": 0.7487,
"step": 1670
},
{
"epoch": 0.52,
"grad_norm": 0.4206812162224315,
"learning_rate": 0.00011195517466816892,
"loss": 0.7663,
"step": 1675
},
{
"epoch": 0.52,
"grad_norm": 0.3528935885168195,
"learning_rate": 0.00011142146696706086,
"loss": 0.7075,
"step": 1680
},
{
"epoch": 0.52,
"grad_norm": 0.3022231077132756,
"learning_rate": 0.00011088742942622285,
"loss": 0.7005,
"step": 1685
},
{
"epoch": 0.52,
"grad_norm": 0.24230122499008153,
"learning_rate": 0.00011035307746808696,
"loss": 0.7103,
"step": 1690
},
{
"epoch": 0.52,
"grad_norm": 0.3274240826179655,
"learning_rate": 0.00010981842652416525,
"loss": 0.7585,
"step": 1695
},
{
"epoch": 0.52,
"grad_norm": 0.3226818393613587,
"learning_rate": 0.00010928349203460421,
"loss": 0.6873,
"step": 1700
},
{
"epoch": 0.52,
"grad_norm": 0.42160428435071856,
"learning_rate": 0.00010874828944773884,
"loss": 0.7033,
"step": 1705
},
{
"epoch": 0.53,
"grad_norm": 0.3673664828653425,
"learning_rate": 0.0001082128342196464,
"loss": 0.7568,
"step": 1710
},
{
"epoch": 0.53,
"grad_norm": 0.35369832231150045,
"learning_rate": 0.00010767714181370032,
"loss": 0.7459,
"step": 1715
},
{
"epoch": 0.53,
"grad_norm": 0.3648184560113796,
"learning_rate": 0.00010714122770012332,
"loss": 0.7744,
"step": 1720
},
{
"epoch": 0.53,
"grad_norm": 0.4505619268522559,
"learning_rate": 0.0001066051073555409,
"loss": 0.7257,
"step": 1725
},
{
"epoch": 0.53,
"grad_norm": 0.43443202284742777,
"learning_rate": 0.00010606879626253425,
"loss": 0.7188,
"step": 1730
},
{
"epoch": 0.53,
"grad_norm": 0.3553258041770261,
"learning_rate": 0.00010553230990919316,
"loss": 0.7459,
"step": 1735
},
{
"epoch": 0.54,
"grad_norm": 0.4661654069610038,
"learning_rate": 0.00010499566378866879,
"loss": 0.7836,
"step": 1740
},
{
"epoch": 0.54,
"grad_norm": 0.37584682327967367,
"learning_rate": 0.00010445887339872613,
"loss": 0.7602,
"step": 1745
},
{
"epoch": 0.54,
"grad_norm": 0.39145966702225243,
"learning_rate": 0.00010392195424129663,
"loss": 0.7742,
"step": 1750
},
{
"epoch": 0.54,
"grad_norm": 0.3393184813627934,
"learning_rate": 0.0001033849218220303,
"loss": 0.7641,
"step": 1755
},
{
"epoch": 0.54,
"grad_norm": 0.3324768161048583,
"learning_rate": 0.00010284779164984808,
"loss": 0.7084,
"step": 1760
},
{
"epoch": 0.54,
"grad_norm": 0.4536643875844217,
"learning_rate": 0.00010231057923649395,
"loss": 0.7546,
"step": 1765
},
{
"epoch": 0.54,
"grad_norm": 0.3383053206020978,
"learning_rate": 0.00010177330009608679,
"loss": 0.7897,
"step": 1770
},
{
"epoch": 0.55,
"grad_norm": 0.3291950908164226,
"learning_rate": 0.00010123596974467267,
"loss": 0.837,
"step": 1775
},
{
"epoch": 0.55,
"grad_norm": 0.40591985948567333,
"learning_rate": 0.00010069860369977644,
"loss": 0.7881,
"step": 1780
},
{
"epoch": 0.55,
"grad_norm": 0.3947516646576018,
"learning_rate": 0.0001001612174799538,
"loss": 0.7554,
"step": 1785
},
{
"epoch": 0.55,
"grad_norm": 0.48999744278201957,
"learning_rate": 9.962382660434302e-05,
"loss": 0.7049,
"step": 1790
},
{
"epoch": 0.55,
"grad_norm": 0.27763093083945417,
"learning_rate": 9.908644659221692e-05,
"loss": 0.7906,
"step": 1795
},
{
"epoch": 0.55,
"grad_norm": 0.36597705216081855,
"learning_rate": 9.854909296253454e-05,
"loss": 0.7717,
"step": 1800
},
{
"epoch": 0.56,
"grad_norm": 0.361260421586406,
"learning_rate": 9.801178123349298e-05,
"loss": 0.8052,
"step": 1805
},
{
"epoch": 0.56,
"grad_norm": 0.40479237805543866,
"learning_rate": 9.747452692207944e-05,
"loss": 0.6528,
"step": 1810
},
{
"epoch": 0.56,
"grad_norm": 0.3337778576325595,
"learning_rate": 9.693734554362274e-05,
"loss": 0.7956,
"step": 1815
},
{
"epoch": 0.56,
"grad_norm": 0.352206821846608,
"learning_rate": 9.640025261134566e-05,
"loss": 0.8004,
"step": 1820
},
{
"epoch": 0.56,
"grad_norm": 0.3007022043507481,
"learning_rate": 9.586326363591667e-05,
"loss": 0.6586,
"step": 1825
},
{
"epoch": 0.56,
"grad_norm": 0.32806169397898344,
"learning_rate": 9.532639412500214e-05,
"loss": 0.6469,
"step": 1830
},
{
"epoch": 0.56,
"grad_norm": 0.2948353441185244,
"learning_rate": 9.478965958281831e-05,
"loss": 0.772,
"step": 1835
},
{
"epoch": 0.57,
"grad_norm": 0.29433563822493,
"learning_rate": 9.425307550968379e-05,
"loss": 0.7587,
"step": 1840
},
{
"epoch": 0.57,
"grad_norm": 0.2929390819806653,
"learning_rate": 9.371665740157177e-05,
"loss": 0.7641,
"step": 1845
},
{
"epoch": 0.57,
"grad_norm": 0.36587748924129493,
"learning_rate": 9.318042074966249e-05,
"loss": 0.7423,
"step": 1850
},
{
"epoch": 0.57,
"grad_norm": 0.3157914575950516,
"learning_rate": 9.2644381039896e-05,
"loss": 0.7802,
"step": 1855
},
{
"epoch": 0.57,
"grad_norm": 0.3083734823157643,
"learning_rate": 9.210855375252488e-05,
"loss": 0.6806,
"step": 1860
},
{
"epoch": 0.57,
"grad_norm": 0.37273540588458964,
"learning_rate": 9.157295436166706e-05,
"loss": 0.8018,
"step": 1865
},
{
"epoch": 0.58,
"grad_norm": 0.2891457780890995,
"learning_rate": 9.103759833485921e-05,
"loss": 0.7924,
"step": 1870
},
{
"epoch": 0.58,
"grad_norm": 0.31880678342943103,
"learning_rate": 9.050250113260988e-05,
"loss": 0.6784,
"step": 1875
},
{
"epoch": 0.58,
"grad_norm": 0.38652296771171907,
"learning_rate": 8.996767820795295e-05,
"loss": 0.8423,
"step": 1880
},
{
"epoch": 0.58,
"grad_norm": 0.36151176691802633,
"learning_rate": 8.943314500600153e-05,
"loss": 0.7657,
"step": 1885
},
{
"epoch": 0.58,
"grad_norm": 0.3630982909299649,
"learning_rate": 8.889891696350182e-05,
"loss": 0.7316,
"step": 1890
},
{
"epoch": 0.58,
"grad_norm": 0.346561432187551,
"learning_rate": 8.836500950838743e-05,
"loss": 0.7937,
"step": 1895
},
{
"epoch": 0.58,
"grad_norm": 0.3552138882564471,
"learning_rate": 8.783143805933356e-05,
"loss": 0.7688,
"step": 1900
},
{
"epoch": 0.59,
"grad_norm": 0.3883059056946058,
"learning_rate": 8.729821802531212e-05,
"loss": 0.8022,
"step": 1905
},
{
"epoch": 0.59,
"grad_norm": 0.3654198034463761,
"learning_rate": 8.676536480514646e-05,
"loss": 0.6797,
"step": 1910
},
{
"epoch": 0.59,
"grad_norm": 0.3437677388699394,
"learning_rate": 8.623289378706665e-05,
"loss": 0.8503,
"step": 1915
},
{
"epoch": 0.59,
"grad_norm": 0.23831382342326574,
"learning_rate": 8.570082034826525e-05,
"loss": 0.725,
"step": 1920
},
{
"epoch": 0.59,
"grad_norm": 0.4978109719850785,
"learning_rate": 8.51691598544532e-05,
"loss": 0.8173,
"step": 1925
},
{
"epoch": 0.59,
"grad_norm": 0.3849488236961706,
"learning_rate": 8.463792765941598e-05,
"loss": 0.7935,
"step": 1930
},
{
"epoch": 0.6,
"grad_norm": 0.2564830548422943,
"learning_rate": 8.410713910457022e-05,
"loss": 0.7616,
"step": 1935
},
{
"epoch": 0.6,
"grad_norm": 0.3883061081379379,
"learning_rate": 8.357680951852074e-05,
"loss": 0.7351,
"step": 1940
},
{
"epoch": 0.6,
"grad_norm": 0.3506058449194061,
"learning_rate": 8.30469542166179e-05,
"loss": 0.7693,
"step": 1945
},
{
"epoch": 0.6,
"grad_norm": 0.30929676711753123,
"learning_rate": 8.25175885005151e-05,
"loss": 0.7873,
"step": 1950
},
{
"epoch": 0.6,
"grad_norm": 0.40865576499509826,
"learning_rate": 8.19887276577271e-05,
"loss": 0.8042,
"step": 1955
},
{
"epoch": 0.6,
"grad_norm": 0.5195995711187212,
"learning_rate": 8.146038696118855e-05,
"loss": 0.7973,
"step": 1960
},
{
"epoch": 0.6,
"grad_norm": 0.42197209504725125,
"learning_rate": 8.093258166881262e-05,
"loss": 0.7533,
"step": 1965
},
{
"epoch": 0.61,
"grad_norm": 0.30422958535775585,
"learning_rate": 8.04053270230508e-05,
"loss": 0.779,
"step": 1970
},
{
"epoch": 0.61,
"grad_norm": 0.3455509672202836,
"learning_rate": 7.987863825045234e-05,
"loss": 0.8111,
"step": 1975
},
{
"epoch": 0.61,
"grad_norm": 0.23483119931888347,
"learning_rate": 7.935253056122478e-05,
"loss": 0.6691,
"step": 1980
},
{
"epoch": 0.61,
"grad_norm": 0.5096365631799179,
"learning_rate": 7.882701914879454e-05,
"loss": 0.8173,
"step": 1985
},
{
"epoch": 0.61,
"grad_norm": 0.27454232980225196,
"learning_rate": 7.83021191893682e-05,
"loss": 0.8318,
"step": 1990
},
{
"epoch": 0.61,
"grad_norm": 0.25340939505328935,
"learning_rate": 7.777784584149431e-05,
"loss": 0.7749,
"step": 1995
},
{
"epoch": 0.62,
"grad_norm": 0.41218592002469173,
"learning_rate": 7.725421424562541e-05,
"loss": 0.7486,
"step": 2000
},
{
"epoch": 0.62,
"grad_norm": 0.3166340386687328,
"learning_rate": 7.673123952368105e-05,
"loss": 0.7371,
"step": 2005
},
{
"epoch": 0.62,
"grad_norm": 0.37125596925100546,
"learning_rate": 7.620893677861097e-05,
"loss": 0.8205,
"step": 2010
},
{
"epoch": 0.62,
"grad_norm": 0.389340046711289,
"learning_rate": 7.568732109395882e-05,
"loss": 0.8052,
"step": 2015
},
{
"epoch": 0.62,
"grad_norm": 0.297511489273485,
"learning_rate": 7.516640753342677e-05,
"loss": 0.8116,
"step": 2020
},
{
"epoch": 0.62,
"grad_norm": 0.3551932787364764,
"learning_rate": 7.464621114044041e-05,
"loss": 0.7256,
"step": 2025
},
{
"epoch": 0.62,
"grad_norm": 0.32976411229944613,
"learning_rate": 7.41267469377143e-05,
"loss": 0.7779,
"step": 2030
},
{
"epoch": 0.63,
"grad_norm": 0.36641305934003204,
"learning_rate": 7.360802992681803e-05,
"loss": 0.7769,
"step": 2035
},
{
"epoch": 0.63,
"grad_norm": 0.32848405433392913,
"learning_rate": 7.309007508774319e-05,
"loss": 0.7449,
"step": 2040
},
{
"epoch": 0.63,
"grad_norm": 0.3818192607183943,
"learning_rate": 7.257289737847067e-05,
"loss": 0.7298,
"step": 2045
},
{
"epoch": 0.63,
"grad_norm": 0.3956889666509929,
"learning_rate": 7.205651173453859e-05,
"loss": 0.7438,
"step": 2050
},
{
"epoch": 0.63,
"grad_norm": 0.3186630869883142,
"learning_rate": 7.154093306861115e-05,
"loss": 0.8091,
"step": 2055
},
{
"epoch": 0.63,
"grad_norm": 0.33431044470129717,
"learning_rate": 7.102617627004795e-05,
"loss": 0.7518,
"step": 2060
},
{
"epoch": 0.64,
"grad_norm": 0.32535315210688565,
"learning_rate": 7.051225620447375e-05,
"loss": 0.8321,
"step": 2065
},
{
"epoch": 0.64,
"grad_norm": 0.4508357834061351,
"learning_rate": 6.999918771334952e-05,
"loss": 0.7282,
"step": 2070
},
{
"epoch": 0.64,
"grad_norm": 0.3512613827114045,
"learning_rate": 6.948698561354363e-05,
"loss": 0.7826,
"step": 2075
},
{
"epoch": 0.64,
"grad_norm": 0.49837144853088533,
"learning_rate": 6.897566469690397e-05,
"loss": 0.795,
"step": 2080
},
{
"epoch": 0.64,
"grad_norm": 0.4003697684296247,
"learning_rate": 6.846523972983085e-05,
"loss": 0.7951,
"step": 2085
},
{
"epoch": 0.64,
"grad_norm": 0.3815043269956921,
"learning_rate": 6.795572545285044e-05,
"loss": 0.826,
"step": 2090
},
{
"epoch": 0.65,
"grad_norm": 0.3291683320960395,
"learning_rate": 6.74471365801893e-05,
"loss": 0.7708,
"step": 2095
},
{
"epoch": 0.65,
"grad_norm": 0.41704151240520887,
"learning_rate": 6.693948779934911e-05,
"loss": 0.7386,
"step": 2100
},
{
"epoch": 0.65,
"grad_norm": 0.463623793653466,
"learning_rate": 6.643279377068283e-05,
"loss": 0.7713,
"step": 2105
},
{
"epoch": 0.65,
"grad_norm": 0.3658375594477012,
"learning_rate": 6.592706912697124e-05,
"loss": 0.7786,
"step": 2110
},
{
"epoch": 0.65,
"grad_norm": 0.4059447230155753,
"learning_rate": 6.542232847300015e-05,
"loss": 0.798,
"step": 2115
},
{
"epoch": 0.65,
"grad_norm": 0.3927246312306725,
"learning_rate": 6.491858638513899e-05,
"loss": 0.8166,
"step": 2120
},
{
"epoch": 0.65,
"grad_norm": 0.35333239481209877,
"learning_rate": 6.441585741091955e-05,
"loss": 0.7539,
"step": 2125
},
{
"epoch": 0.66,
"grad_norm": 0.3623671701689697,
"learning_rate": 6.391415606861608e-05,
"loss": 0.8162,
"step": 2130
},
{
"epoch": 0.66,
"grad_norm": 0.430064026231262,
"learning_rate": 6.341349684682576e-05,
"loss": 0.7593,
"step": 2135
},
{
"epoch": 0.66,
"grad_norm": 0.30707444492883157,
"learning_rate": 6.291389420405062e-05,
"loss": 0.7593,
"step": 2140
},
{
"epoch": 0.66,
"grad_norm": 0.29281767006409765,
"learning_rate": 6.241536256827978e-05,
"loss": 0.7074,
"step": 2145
},
{
"epoch": 0.66,
"grad_norm": 0.3397684880342664,
"learning_rate": 6.191791633657268e-05,
"loss": 0.7077,
"step": 2150
},
{
"epoch": 0.66,
"grad_norm": 0.35070530863747645,
"learning_rate": 6.142156987464367e-05,
"loss": 0.7888,
"step": 2155
},
{
"epoch": 0.67,
"grad_norm": 0.31884184127852916,
"learning_rate": 6.0926337516446784e-05,
"loss": 0.8045,
"step": 2160
},
{
"epoch": 0.67,
"grad_norm": 0.34522310174070975,
"learning_rate": 6.043223356376197e-05,
"loss": 0.8115,
"step": 2165
},
{
"epoch": 0.67,
"grad_norm": 0.35929303458552225,
"learning_rate": 5.9939272285782066e-05,
"loss": 0.8234,
"step": 2170
},
{
"epoch": 0.67,
"grad_norm": 0.3835859771257563,
"learning_rate": 5.9447467918700614e-05,
"loss": 0.7295,
"step": 2175
},
{
"epoch": 0.67,
"grad_norm": 0.33889717245375717,
"learning_rate": 5.895683466530091e-05,
"loss": 0.7491,
"step": 2180
},
{
"epoch": 0.67,
"grad_norm": 0.34625485711737686,
"learning_rate": 5.8467386694545635e-05,
"loss": 0.7882,
"step": 2185
},
{
"epoch": 0.67,
"grad_norm": 0.3834886156777842,
"learning_rate": 5.797913814116781e-05,
"loss": 0.7093,
"step": 2190
},
{
"epoch": 0.68,
"grad_norm": 0.3892980195228429,
"learning_rate": 5.7492103105262715e-05,
"loss": 0.794,
"step": 2195
},
{
"epoch": 0.68,
"grad_norm": 0.39210633693040825,
"learning_rate": 5.7006295651880246e-05,
"loss": 0.7566,
"step": 2200
},
{
"epoch": 0.68,
"grad_norm": 0.3582797057469045,
"learning_rate": 5.6521729810619317e-05,
"loss": 0.8021,
"step": 2205
},
{
"epoch": 0.68,
"grad_norm": 0.3542924342264584,
"learning_rate": 5.603841957522227e-05,
"loss": 0.756,
"step": 2210
},
{
"epoch": 0.68,
"grad_norm": 0.36575349651181366,
"learning_rate": 5.555637890317091e-05,
"loss": 0.7921,
"step": 2215
},
{
"epoch": 0.68,
"grad_norm": 0.38535314462569586,
"learning_rate": 5.507562171528342e-05,
"loss": 0.7781,
"step": 2220
},
{
"epoch": 0.69,
"grad_norm": 0.39735016460723493,
"learning_rate": 5.459616189531234e-05,
"loss": 0.6632,
"step": 2225
},
{
"epoch": 0.69,
"grad_norm": 0.4056677466996733,
"learning_rate": 5.411801328954368e-05,
"loss": 0.7334,
"step": 2230
},
{
"epoch": 0.69,
"grad_norm": 0.42376106078500364,
"learning_rate": 5.36411897063968e-05,
"loss": 0.8772,
"step": 2235
},
{
"epoch": 0.69,
"grad_norm": 0.35144323747646544,
"learning_rate": 5.316570491602606e-05,
"loss": 0.7793,
"step": 2240
},
{
"epoch": 0.69,
"grad_norm": 0.3783769784963828,
"learning_rate": 5.269157264992276e-05,
"loss": 0.8655,
"step": 2245
},
{
"epoch": 0.69,
"grad_norm": 0.44209683459363136,
"learning_rate": 5.221880660051881e-05,
"loss": 0.8032,
"step": 2250
},
{
"epoch": 0.69,
"grad_norm": 0.4882374682401987,
"learning_rate": 5.1747420420791196e-05,
"loss": 0.7007,
"step": 2255
},
{
"epoch": 0.7,
"grad_norm": 0.3237759848919934,
"learning_rate": 5.127742772386786e-05,
"loss": 0.7897,
"step": 2260
},
{
"epoch": 0.7,
"grad_norm": 0.36606432111465076,
"learning_rate": 5.0808842082634314e-05,
"loss": 0.8064,
"step": 2265
},
{
"epoch": 0.7,
"grad_norm": 0.40999182095921494,
"learning_rate": 5.0341677029341895e-05,
"loss": 0.7103,
"step": 2270
},
{
"epoch": 0.7,
"grad_norm": 0.3272955637327382,
"learning_rate": 4.987594605521682e-05,
"loss": 0.6785,
"step": 2275
},
{
"epoch": 0.7,
"grad_norm": 0.3490487483696679,
"learning_rate": 4.941166261007077e-05,
"loss": 0.7292,
"step": 2280
},
{
"epoch": 0.7,
"grad_norm": 0.3433624374602265,
"learning_rate": 4.894884010191211e-05,
"loss": 0.6762,
"step": 2285
},
{
"epoch": 0.71,
"grad_norm": 0.325285651430037,
"learning_rate": 4.848749189655915e-05,
"loss": 0.7659,
"step": 2290
},
{
"epoch": 0.71,
"grad_norm": 0.31571712296306303,
"learning_rate": 4.802763131725378e-05,
"loss": 0.7736,
"step": 2295
},
{
"epoch": 0.71,
"grad_norm": 0.35722394621197917,
"learning_rate": 4.756927164427685e-05,
"loss": 0.7155,
"step": 2300
},
{
"epoch": 0.71,
"grad_norm": 0.36377115960758405,
"learning_rate": 4.711242611456469e-05,
"loss": 0.7326,
"step": 2305
},
{
"epoch": 0.71,
"grad_norm": 0.4323956507240235,
"learning_rate": 4.665710792132671e-05,
"loss": 0.7775,
"step": 2310
},
{
"epoch": 0.71,
"grad_norm": 0.4046174615365396,
"learning_rate": 4.620333021366463e-05,
"loss": 0.7643,
"step": 2315
},
{
"epoch": 0.71,
"grad_norm": 0.3796515442594094,
"learning_rate": 4.5751106096192476e-05,
"loss": 0.6264,
"step": 2320
},
{
"epoch": 0.72,
"grad_norm": 0.46275385650934453,
"learning_rate": 4.5300448628658254e-05,
"loss": 0.688,
"step": 2325
},
{
"epoch": 0.72,
"grad_norm": 0.4343032727751153,
"learning_rate": 4.485137082556685e-05,
"loss": 0.7238,
"step": 2330
},
{
"epoch": 0.72,
"grad_norm": 0.42658163696603996,
"learning_rate": 4.4403885655804115e-05,
"loss": 0.7691,
"step": 2335
},
{
"epoch": 0.72,
"grad_norm": 0.5374930429188296,
"learning_rate": 4.395800604226229e-05,
"loss": 0.8293,
"step": 2340
},
{
"epoch": 0.72,
"grad_norm": 0.5178849424936606,
"learning_rate": 4.351374486146706e-05,
"loss": 0.6683,
"step": 2345
},
{
"epoch": 0.72,
"grad_norm": 0.2580914515654273,
"learning_rate": 4.307111494320524e-05,
"loss": 0.6295,
"step": 2350
},
{
"epoch": 0.73,
"grad_norm": 0.38787306791139886,
"learning_rate": 4.263012907015477e-05,
"loss": 0.6748,
"step": 2355
},
{
"epoch": 0.73,
"grad_norm": 0.4320242881816677,
"learning_rate": 4.219079997751515e-05,
"loss": 0.6848,
"step": 2360
},
{
"epoch": 0.73,
"grad_norm": 0.39451448787145293,
"learning_rate": 4.175314035264002e-05,
"loss": 0.7691,
"step": 2365
},
{
"epoch": 0.73,
"grad_norm": 0.3024402074904783,
"learning_rate": 4.131716283467034e-05,
"loss": 0.7674,
"step": 2370
},
{
"epoch": 0.73,
"grad_norm": 0.39413439687935803,
"learning_rate": 4.0882880014169865e-05,
"loss": 0.83,
"step": 2375
},
{
"epoch": 0.73,
"grad_norm": 0.45210137336011785,
"learning_rate": 4.045030443276115e-05,
"loss": 0.7117,
"step": 2380
},
{
"epoch": 0.73,
"grad_norm": 0.4767595879985179,
"learning_rate": 4.001944858276356e-05,
"loss": 0.7424,
"step": 2385
},
{
"epoch": 0.74,
"grad_norm": 0.3405433186754331,
"learning_rate": 3.9590324906832435e-05,
"loss": 0.7944,
"step": 2390
},
{
"epoch": 0.74,
"grad_norm": 0.38698863622073953,
"learning_rate": 3.9162945797599895e-05,
"loss": 0.7486,
"step": 2395
},
{
"epoch": 0.74,
"grad_norm": 0.33122873819033993,
"learning_rate": 3.873732359731661e-05,
"loss": 0.7339,
"step": 2400
},
{
"epoch": 0.74,
"grad_norm": 0.3593189874663698,
"learning_rate": 3.831347059749587e-05,
"loss": 0.8308,
"step": 2405
},
{
"epoch": 0.74,
"grad_norm": 0.32020688896625343,
"learning_rate": 3.78913990385582e-05,
"loss": 0.7932,
"step": 2410
},
{
"epoch": 0.74,
"grad_norm": 0.35408477803883764,
"learning_rate": 3.7471121109478004e-05,
"loss": 0.6155,
"step": 2415
},
{
"epoch": 0.75,
"grad_norm": 0.33845060971026897,
"learning_rate": 3.705264894743167e-05,
"loss": 0.7798,
"step": 2420
},
{
"epoch": 0.75,
"grad_norm": 0.5583150637853672,
"learning_rate": 3.6635994637446845e-05,
"loss": 0.6673,
"step": 2425
},
{
"epoch": 0.75,
"grad_norm": 0.4929136893319016,
"learning_rate": 3.6221170212053766e-05,
"loss": 0.8048,
"step": 2430
},
{
"epoch": 0.75,
"grad_norm": 0.5429072066090833,
"learning_rate": 3.5808187650937276e-05,
"loss": 0.7507,
"step": 2435
},
{
"epoch": 0.75,
"grad_norm": 0.46201271372482866,
"learning_rate": 3.53970588805914e-05,
"loss": 0.7259,
"step": 2440
},
{
"epoch": 0.75,
"grad_norm": 0.3129228737179152,
"learning_rate": 3.498779577397453e-05,
"loss": 0.7715,
"step": 2445
},
{
"epoch": 0.75,
"grad_norm": 0.32996488069999697,
"learning_rate": 3.458041015016681e-05,
"loss": 0.7797,
"step": 2450
},
{
"epoch": 0.76,
"grad_norm": 0.3214733415000198,
"learning_rate": 3.4174913774028485e-05,
"loss": 0.7226,
"step": 2455
},
{
"epoch": 0.76,
"grad_norm": 0.42589629830207104,
"learning_rate": 3.3771318355860593e-05,
"loss": 0.7218,
"step": 2460
},
{
"epoch": 0.76,
"grad_norm": 0.3047848353555366,
"learning_rate": 3.336963555106638e-05,
"loss": 0.7956,
"step": 2465
},
{
"epoch": 0.76,
"grad_norm": 0.34359240246923894,
"learning_rate": 3.296987695981493e-05,
"loss": 0.666,
"step": 2470
},
{
"epoch": 0.76,
"grad_norm": 0.4144993432032501,
"learning_rate": 3.257205412670605e-05,
"loss": 0.7416,
"step": 2475
},
{
"epoch": 0.76,
"grad_norm": 0.3731255551783685,
"learning_rate": 3.217617854043707e-05,
"loss": 0.8345,
"step": 2480
},
{
"epoch": 0.77,
"grad_norm": 0.4050140213208934,
"learning_rate": 3.178226163347067e-05,
"loss": 0.7122,
"step": 2485
},
{
"epoch": 0.77,
"grad_norm": 0.4002439491991807,
"learning_rate": 3.139031478170522e-05,
"loss": 0.6805,
"step": 2490
},
{
"epoch": 0.77,
"grad_norm": 0.42917304751384394,
"learning_rate": 3.100034930414585e-05,
"loss": 0.733,
"step": 2495
},
{
"epoch": 0.77,
"grad_norm": 0.4136643224459766,
"learning_rate": 3.0612376462577784e-05,
"loss": 0.7807,
"step": 2500
},
{
"epoch": 0.77,
"grad_norm": 0.3890309362984174,
"learning_rate": 3.0226407461241056e-05,
"loss": 0.643,
"step": 2505
},
{
"epoch": 0.77,
"grad_norm": 0.4262911129142299,
"learning_rate": 2.9842453446506868e-05,
"loss": 0.823,
"step": 2510
},
{
"epoch": 0.77,
"grad_norm": 0.30650720385697705,
"learning_rate": 2.9460525506555947e-05,
"loss": 0.7002,
"step": 2515
},
{
"epoch": 0.78,
"grad_norm": 0.3980581468888342,
"learning_rate": 2.9080634671057892e-05,
"loss": 0.7899,
"step": 2520
},
{
"epoch": 0.78,
"grad_norm": 0.4056498788574052,
"learning_rate": 2.8702791910853144e-05,
"loss": 0.701,
"step": 2525
},
{
"epoch": 0.78,
"grad_norm": 0.4548787143471859,
"learning_rate": 2.832700813763579e-05,
"loss": 0.8386,
"step": 2530
},
{
"epoch": 0.78,
"grad_norm": 0.4404855029802744,
"learning_rate": 2.7953294203638625e-05,
"loss": 0.7813,
"step": 2535
},
{
"epoch": 0.78,
"grad_norm": 0.40369488587415225,
"learning_rate": 2.7581660901319663e-05,
"loss": 0.7886,
"step": 2540
},
{
"epoch": 0.78,
"grad_norm": 0.44025189268752124,
"learning_rate": 2.7212118963050592e-05,
"loss": 0.6854,
"step": 2545
},
{
"epoch": 0.79,
"grad_norm": 0.38769461649930276,
"learning_rate": 2.6844679060806666e-05,
"loss": 0.7533,
"step": 2550
},
{
"epoch": 0.79,
"grad_norm": 0.39043309802901266,
"learning_rate": 2.647935180585861e-05,
"loss": 0.7324,
"step": 2555
},
{
"epoch": 0.79,
"grad_norm": 0.4031278133263342,
"learning_rate": 2.6116147748466136e-05,
"loss": 0.8095,
"step": 2560
},
{
"epoch": 0.79,
"grad_norm": 0.31989753369104523,
"learning_rate": 2.575507737757341e-05,
"loss": 0.7635,
"step": 2565
},
{
"epoch": 0.79,
"grad_norm": 0.3825808697754477,
"learning_rate": 2.5396151120505797e-05,
"loss": 0.7067,
"step": 2570
},
{
"epoch": 0.79,
"grad_norm": 0.34014553791218866,
"learning_rate": 2.5039379342669156e-05,
"loss": 0.7454,
"step": 2575
},
{
"epoch": 0.79,
"grad_norm": 0.3441742765574342,
"learning_rate": 2.4684772347250194e-05,
"loss": 0.7269,
"step": 2580
},
{
"epoch": 0.8,
"grad_norm": 0.268563145640876,
"learning_rate": 2.433234037491904e-05,
"loss": 0.7188,
"step": 2585
},
{
"epoch": 0.8,
"grad_norm": 0.44327204267255527,
"learning_rate": 2.3982093603533485e-05,
"loss": 0.6476,
"step": 2590
},
{
"epoch": 0.8,
"grad_norm": 0.47944546289888046,
"learning_rate": 2.3634042147845036e-05,
"loss": 0.7312,
"step": 2595
},
{
"epoch": 0.8,
"grad_norm": 0.3735226907786184,
"learning_rate": 2.3288196059206936e-05,
"loss": 0.8098,
"step": 2600
},
{
"epoch": 0.8,
"grad_norm": 0.48173286401906895,
"learning_rate": 2.2944565325283608e-05,
"loss": 0.7692,
"step": 2605
},
{
"epoch": 0.8,
"grad_norm": 0.454018593107754,
"learning_rate": 2.260315986976258e-05,
"loss": 0.7258,
"step": 2610
},
{
"epoch": 0.81,
"grad_norm": 0.4102026616293206,
"learning_rate": 2.2263989552067644e-05,
"loss": 0.8175,
"step": 2615
},
{
"epoch": 0.81,
"grad_norm": 0.36721285813725996,
"learning_rate": 2.1927064167074197e-05,
"loss": 0.7741,
"step": 2620
},
{
"epoch": 0.81,
"grad_norm": 0.5058554441705722,
"learning_rate": 2.1592393444826377e-05,
"loss": 0.7664,
"step": 2625
},
{
"epoch": 0.81,
"grad_norm": 0.41227200976610034,
"learning_rate": 2.125998705025619e-05,
"loss": 0.7922,
"step": 2630
},
{
"epoch": 0.81,
"grad_norm": 0.3739806966013022,
"learning_rate": 2.0929854582904095e-05,
"loss": 0.6827,
"step": 2635
},
{
"epoch": 0.81,
"grad_norm": 0.3526698208142984,
"learning_rate": 2.060200557664215e-05,
"loss": 0.7712,
"step": 2640
},
{
"epoch": 0.81,
"grad_norm": 0.355624340580361,
"learning_rate": 2.0276449499398352e-05,
"loss": 0.7217,
"step": 2645
},
{
"epoch": 0.82,
"grad_norm": 0.35879322380857276,
"learning_rate": 1.9953195752883535e-05,
"loss": 0.8101,
"step": 2650
},
{
"epoch": 0.82,
"grad_norm": 0.4381419678532357,
"learning_rate": 1.9632253672319466e-05,
"loss": 0.7784,
"step": 2655
},
{
"epoch": 0.82,
"grad_norm": 0.3338214481525901,
"learning_rate": 1.9313632526169713e-05,
"loss": 0.7633,
"step": 2660
},
{
"epoch": 0.82,
"grad_norm": 0.3419374997650999,
"learning_rate": 1.899734151587157e-05,
"loss": 0.6726,
"step": 2665
},
{
"epoch": 0.82,
"grad_norm": 0.39283447932176424,
"learning_rate": 1.868338977557058e-05,
"loss": 0.7787,
"step": 2670
},
{
"epoch": 0.82,
"grad_norm": 0.37003709150492736,
"learning_rate": 1.837178637185666e-05,
"loss": 0.7466,
"step": 2675
},
{
"epoch": 0.83,
"grad_norm": 0.3961613001539733,
"learning_rate": 1.8062540303502284e-05,
"loss": 0.7097,
"step": 2680
},
{
"epoch": 0.83,
"grad_norm": 0.35142355910690376,
"learning_rate": 1.7755660501202565e-05,
"loss": 0.6774,
"step": 2685
},
{
"epoch": 0.83,
"grad_norm": 0.41038393576069904,
"learning_rate": 1.745115582731749e-05,
"loss": 0.7496,
"step": 2690
},
{
"epoch": 0.83,
"grad_norm": 0.39409588845344945,
"learning_rate": 1.7149035075615794e-05,
"loss": 0.7187,
"step": 2695
},
{
"epoch": 0.83,
"grad_norm": 0.44791745932431604,
"learning_rate": 1.6849306971021116e-05,
"loss": 0.7898,
"step": 2700
},
{
"epoch": 0.83,
"grad_norm": 0.3525758016199936,
"learning_rate": 1.6551980169360005e-05,
"loss": 0.7511,
"step": 2705
},
{
"epoch": 0.83,
"grad_norm": 0.36563645526797983,
"learning_rate": 1.6257063257111938e-05,
"loss": 0.7397,
"step": 2710
},
{
"epoch": 0.84,
"grad_norm": 0.3351064266182499,
"learning_rate": 1.596456475116147e-05,
"loss": 0.7379,
"step": 2715
},
{
"epoch": 0.84,
"grad_norm": 0.5068000232220052,
"learning_rate": 1.567449309855199e-05,
"loss": 0.751,
"step": 2720
},
{
"epoch": 0.84,
"grad_norm": 0.43979138745697033,
"learning_rate": 1.5386856676242146e-05,
"loss": 0.8085,
"step": 2725
},
{
"epoch": 0.84,
"grad_norm": 0.356641089597573,
"learning_rate": 1.5101663790863596e-05,
"loss": 0.6256,
"step": 2730
},
{
"epoch": 0.84,
"grad_norm": 0.44817921454892296,
"learning_rate": 1.4818922678481429e-05,
"loss": 0.7675,
"step": 2735
},
{
"epoch": 0.84,
"grad_norm": 0.38078400449692273,
"learning_rate": 1.4538641504355965e-05,
"loss": 0.689,
"step": 2740
},
{
"epoch": 0.85,
"grad_norm": 0.39634310148432367,
"learning_rate": 1.4260828362707301e-05,
"loss": 0.7727,
"step": 2745
},
{
"epoch": 0.85,
"grad_norm": 0.3227302452177864,
"learning_rate": 1.3985491276481323e-05,
"loss": 0.6711,
"step": 2750
},
{
"epoch": 0.85,
"grad_norm": 0.35229071646321697,
"learning_rate": 1.3712638197118111e-05,
"loss": 0.7711,
"step": 2755
},
{
"epoch": 0.85,
"grad_norm": 0.3536271561860169,
"learning_rate": 1.3442277004322257e-05,
"loss": 0.8075,
"step": 2760
},
{
"epoch": 0.85,
"grad_norm": 0.348867332644309,
"learning_rate": 1.3174415505835436e-05,
"loss": 0.7561,
"step": 2765
},
{
"epoch": 0.85,
"grad_norm": 0.3535699706794319,
"learning_rate": 1.2909061437210669e-05,
"loss": 0.6532,
"step": 2770
},
{
"epoch": 0.85,
"grad_norm": 0.29622730647422324,
"learning_rate": 1.264622246158924e-05,
"loss": 0.7651,
"step": 2775
},
{
"epoch": 0.86,
"grad_norm": 0.36090199758429575,
"learning_rate": 1.2385906169479167e-05,
"loss": 0.8015,
"step": 2780
},
{
"epoch": 0.86,
"grad_norm": 0.402156004342123,
"learning_rate": 1.2128120078536076e-05,
"loss": 0.6387,
"step": 2785
},
{
"epoch": 0.86,
"grad_norm": 0.42634964362612304,
"learning_rate": 1.1872871633346094e-05,
"loss": 0.7452,
"step": 2790
},
{
"epoch": 0.86,
"grad_norm": 0.4198409057077085,
"learning_rate": 1.1620168205210869e-05,
"loss": 0.7722,
"step": 2795
},
{
"epoch": 0.86,
"grad_norm": 0.32753834269003024,
"learning_rate": 1.1370017091934714e-05,
"loss": 0.6906,
"step": 2800
},
{
"epoch": 0.86,
"grad_norm": 0.32874585989735094,
"learning_rate": 1.1122425517613722e-05,
"loss": 0.6583,
"step": 2805
},
{
"epoch": 0.87,
"grad_norm": 0.40652778318077754,
"learning_rate": 1.0877400632427359e-05,
"loss": 0.674,
"step": 2810
},
{
"epoch": 0.87,
"grad_norm": 0.507341912219104,
"learning_rate": 1.0634949512431814e-05,
"loss": 0.7677,
"step": 2815
},
{
"epoch": 0.87,
"grad_norm": 0.3696792851429222,
"learning_rate": 1.0395079159355658e-05,
"loss": 0.7034,
"step": 2820
},
{
"epoch": 0.87,
"grad_norm": 0.4493375882862214,
"learning_rate": 1.0157796500397699e-05,
"loss": 0.7487,
"step": 2825
},
{
"epoch": 0.87,
"grad_norm": 0.41029746791384736,
"learning_rate": 9.92310838802698e-06,
"loss": 0.7405,
"step": 2830
},
{
"epoch": 0.87,
"grad_norm": 0.2972360830137905,
"learning_rate": 9.691021599784711e-06,
"loss": 0.6979,
"step": 2835
},
{
"epoch": 0.87,
"grad_norm": 0.4032566104643188,
"learning_rate": 9.461542838088722e-06,
"loss": 0.7898,
"step": 2840
},
{
"epoch": 0.88,
"grad_norm": 0.27918369114356745,
"learning_rate": 9.23467873003977e-06,
"loss": 0.8092,
"step": 2845
},
{
"epoch": 0.88,
"grad_norm": 0.36249726345459377,
"learning_rate": 9.010435827230313e-06,
"loss": 0.6445,
"step": 2850
},
{
"epoch": 0.88,
"grad_norm": 0.3964708844545875,
"learning_rate": 8.788820605555082e-06,
"loss": 0.7462,
"step": 2855
},
{
"epoch": 0.88,
"grad_norm": 0.3502801511645061,
"learning_rate": 8.569839465024299e-06,
"loss": 0.7233,
"step": 2860
},
{
"epoch": 0.88,
"grad_norm": 0.4033170990560745,
"learning_rate": 8.35349872957869e-06,
"loss": 0.8105,
"step": 2865
},
{
"epoch": 0.88,
"grad_norm": 0.3954070605648136,
"learning_rate": 8.139804646906923e-06,
"loss": 0.7059,
"step": 2870
},
{
"epoch": 0.89,
"grad_norm": 0.37474529564255277,
"learning_rate": 7.928763388265181e-06,
"loss": 0.8582,
"step": 2875
},
{
"epoch": 0.89,
"grad_norm": 0.32809524672734786,
"learning_rate": 7.720381048298897e-06,
"loss": 0.7581,
"step": 2880
},
{
"epoch": 0.89,
"grad_norm": 0.4149259283243587,
"learning_rate": 7.5146636448668485e-06,
"loss": 0.7735,
"step": 2885
},
{
"epoch": 0.89,
"grad_norm": 0.4263926138100582,
"learning_rate": 7.3116171188671865e-06,
"loss": 0.8028,
"step": 2890
},
{
"epoch": 0.89,
"grad_norm": 0.45104438156690363,
"learning_rate": 7.111247334066129e-06,
"loss": 0.752,
"step": 2895
},
{
"epoch": 0.89,
"grad_norm": 0.43860221599692545,
"learning_rate": 6.913560076928361e-06,
"loss": 0.7119,
"step": 2900
},
{
"epoch": 0.89,
"grad_norm": 0.3597066697079075,
"learning_rate": 6.71856105645009e-06,
"loss": 0.7666,
"step": 2905
},
{
"epoch": 0.9,
"grad_norm": 0.5298228542098834,
"learning_rate": 6.526255903994105e-06,
"loss": 0.6903,
"step": 2910
},
{
"epoch": 0.9,
"grad_norm": 0.348385379079852,
"learning_rate": 6.336650173127223e-06,
"loss": 0.7291,
"step": 2915
},
{
"epoch": 0.9,
"grad_norm": 0.40348728722999555,
"learning_rate": 6.149749339459787e-06,
"loss": 0.6929,
"step": 2920
},
{
"epoch": 0.9,
"grad_norm": 0.42152434212067974,
"learning_rate": 5.96555880048767e-06,
"loss": 0.7092,
"step": 2925
},
{
"epoch": 0.9,
"grad_norm": 0.3418766857878975,
"learning_rate": 5.784083875436286e-06,
"loss": 0.7017,
"step": 2930
},
{
"epoch": 0.9,
"grad_norm": 0.4129577303513544,
"learning_rate": 5.605329805107084e-06,
"loss": 0.8389,
"step": 2935
},
{
"epoch": 0.91,
"grad_norm": 0.34322551067313023,
"learning_rate": 5.429301751726068e-06,
"loss": 0.8468,
"step": 2940
},
{
"epoch": 0.91,
"grad_norm": 0.42129646163250906,
"learning_rate": 5.256004798794889e-06,
"loss": 0.7467,
"step": 2945
},
{
"epoch": 0.91,
"grad_norm": 0.38112811696400956,
"learning_rate": 5.085443950943858e-06,
"loss": 0.6878,
"step": 2950
},
{
"epoch": 0.91,
"grad_norm": 0.38823071010954757,
"learning_rate": 4.917624133787535e-06,
"loss": 0.839,
"step": 2955
},
{
"epoch": 0.91,
"grad_norm": 0.41462150440141854,
"learning_rate": 4.752550193782457e-06,
"loss": 0.7937,
"step": 2960
},
{
"epoch": 0.91,
"grad_norm": 0.47038565975041013,
"learning_rate": 4.590226898087169e-06,
"loss": 0.7394,
"step": 2965
},
{
"epoch": 0.91,
"grad_norm": 0.42313697531774797,
"learning_rate": 4.430658934424536e-06,
"loss": 0.7365,
"step": 2970
},
{
"epoch": 0.92,
"grad_norm": 0.37056389807076956,
"learning_rate": 4.2738509109464194e-06,
"loss": 0.7771,
"step": 2975
},
{
"epoch": 0.92,
"grad_norm": 0.4282991333253571,
"learning_rate": 4.119807356100536e-06,
"loss": 0.8332,
"step": 2980
},
{
"epoch": 0.92,
"grad_norm": 0.3399316066449957,
"learning_rate": 3.968532718499718e-06,
"loss": 0.719,
"step": 2985
},
{
"epoch": 0.92,
"grad_norm": 0.3045272007549396,
"learning_rate": 3.8200313667934415e-06,
"loss": 0.7398,
"step": 2990
},
{
"epoch": 0.92,
"grad_norm": 0.3940052052861745,
"learning_rate": 3.674307589541637e-06,
"loss": 0.6926,
"step": 2995
},
{
"epoch": 0.92,
"grad_norm": 0.5430719327118287,
"learning_rate": 3.5313655950908964e-06,
"loss": 0.724,
"step": 3000
},
{
"epoch": 0.93,
"grad_norm": 0.3409882746688945,
"learning_rate": 3.391209511452853e-06,
"loss": 0.7768,
"step": 3005
},
{
"epoch": 0.93,
"grad_norm": 0.32988045764813406,
"learning_rate": 3.253843386185085e-06,
"loss": 0.7503,
"step": 3010
},
{
"epoch": 0.93,
"grad_norm": 0.30435984793513565,
"learning_rate": 3.1192711862740865e-06,
"loss": 0.7373,
"step": 3015
},
{
"epoch": 0.93,
"grad_norm": 0.4219409570685578,
"learning_rate": 2.9874967980208724e-06,
"loss": 0.7532,
"step": 3020
},
{
"epoch": 0.93,
"grad_norm": 0.40198268281516975,
"learning_rate": 2.858524026928555e-06,
"loss": 0.746,
"step": 3025
},
{
"epoch": 0.93,
"grad_norm": 0.5529913137285583,
"learning_rate": 2.7323565975926222e-06,
"loss": 0.8412,
"step": 3030
},
{
"epoch": 0.93,
"grad_norm": 0.5641310624198427,
"learning_rate": 2.6089981535932453e-06,
"loss": 0.7381,
"step": 3035
},
{
"epoch": 0.94,
"grad_norm": 0.4070763719787106,
"learning_rate": 2.4884522573901505e-06,
"loss": 0.7469,
"step": 3040
},
{
"epoch": 0.94,
"grad_norm": 0.412932484046869,
"learning_rate": 2.3707223902196595e-06,
"loss": 0.8027,
"step": 3045
},
{
"epoch": 0.94,
"grad_norm": 0.3698602015181267,
"learning_rate": 2.2558119519942357e-06,
"loss": 0.7422,
"step": 3050
},
{
"epoch": 0.94,
"grad_norm": 0.41987825527789996,
"learning_rate": 2.143724261204194e-06,
"loss": 0.7901,
"step": 3055
},
{
"epoch": 0.94,
"grad_norm": 0.4558561938965998,
"learning_rate": 2.034462554821992e-06,
"loss": 0.7254,
"step": 3060
},
{
"epoch": 0.94,
"grad_norm": 0.31521663581005915,
"learning_rate": 1.928029988208635e-06,
"loss": 0.7234,
"step": 3065
},
{
"epoch": 0.95,
"grad_norm": 0.4536950604685854,
"learning_rate": 1.8244296350226398e-06,
"loss": 0.8439,
"step": 3070
},
{
"epoch": 0.95,
"grad_norm": 0.3324327843215253,
"learning_rate": 1.7236644871312047e-06,
"loss": 0.7318,
"step": 3075
},
{
"epoch": 0.95,
"grad_norm": 0.3441401957085473,
"learning_rate": 1.6257374545238457e-06,
"loss": 0.7219,
"step": 3080
},
{
"epoch": 0.95,
"grad_norm": 0.31548933651921574,
"learning_rate": 1.530651365228375e-06,
"loss": 0.6584,
"step": 3085
},
{
"epoch": 0.95,
"grad_norm": 0.43658195529889465,
"learning_rate": 1.4384089652291543e-06,
"loss": 0.8155,
"step": 3090
},
{
"epoch": 0.95,
"grad_norm": 0.28926411213003883,
"learning_rate": 1.349012918387904e-06,
"loss": 0.7126,
"step": 3095
},
{
"epoch": 0.95,
"grad_norm": 0.5240863579149662,
"learning_rate": 1.2624658063666639e-06,
"loss": 0.8585,
"step": 3100
},
{
"epoch": 0.96,
"grad_norm": 0.34319193062085684,
"learning_rate": 1.1787701285533193e-06,
"loss": 0.7302,
"step": 3105
},
{
"epoch": 0.96,
"grad_norm": 0.37091855665958207,
"learning_rate": 1.0979283019893704e-06,
"loss": 0.8102,
"step": 3110
},
{
"epoch": 0.96,
"grad_norm": 0.4173368619893207,
"learning_rate": 1.019942661300166e-06,
"loss": 0.8052,
"step": 3115
},
{
"epoch": 0.96,
"grad_norm": 0.2740737663998489,
"learning_rate": 9.448154586274794e-07,
"loss": 0.6628,
"step": 3120
},
{
"epoch": 0.96,
"grad_norm": 0.39165587634667004,
"learning_rate": 8.725488635644152e-07,
"loss": 0.8068,
"step": 3125
},
{
"epoch": 0.96,
"grad_norm": 0.3462783087024963,
"learning_rate": 8.031449630928167e-07,
"loss": 0.755,
"step": 3130
},
{
"epoch": 0.97,
"grad_norm": 0.3501973539234669,
"learning_rate": 7.366057615229904e-07,
"loss": 0.8435,
"step": 3135
},
{
"epoch": 0.97,
"grad_norm": 0.3089654640513169,
"learning_rate": 6.729331804357863e-07,
"loss": 0.7804,
"step": 3140
},
{
"epoch": 0.97,
"grad_norm": 0.3732310282262484,
"learning_rate": 6.121290586271311e-07,
"loss": 0.7352,
"step": 3145
},
{
"epoch": 0.97,
"grad_norm": 0.35885721051799435,
"learning_rate": 5.54195152054926e-07,
"loss": 0.719,
"step": 3150
},
{
"epoch": 0.97,
"grad_norm": 0.4350143969198028,
"learning_rate": 4.99133133788332e-07,
"loss": 0.7083,
"step": 3155
},
{
"epoch": 0.97,
"grad_norm": 0.3878552023049681,
"learning_rate": 4.4694459395943077e-07,
"loss": 0.787,
"step": 3160
},
{
"epoch": 0.97,
"grad_norm": 0.5569927297196836,
"learning_rate": 3.9763103971734993e-07,
"loss": 0.8012,
"step": 3165
},
{
"epoch": 0.98,
"grad_norm": 0.4146895070331553,
"learning_rate": 3.5119389518470936e-07,
"loss": 0.6918,
"step": 3170
},
{
"epoch": 0.98,
"grad_norm": 0.3438997693511246,
"learning_rate": 3.076345014164872e-07,
"loss": 0.7113,
"step": 3175
},
{
"epoch": 0.98,
"grad_norm": 0.3948501011870127,
"learning_rate": 2.669541163613176e-07,
"loss": 0.7671,
"step": 3180
},
{
"epoch": 0.98,
"grad_norm": 0.3840771316522947,
"learning_rate": 2.2915391482514204e-07,
"loss": 0.7372,
"step": 3185
},
{
"epoch": 0.98,
"grad_norm": 0.34038583545962225,
"learning_rate": 1.9423498843726962e-07,
"loss": 0.6481,
"step": 3190
},
{
"epoch": 0.98,
"grad_norm": 0.3931737526809346,
"learning_rate": 1.6219834561889136e-07,
"loss": 0.7199,
"step": 3195
},
{
"epoch": 0.99,
"grad_norm": 0.38986041242847186,
"learning_rate": 1.3304491155393674e-07,
"loss": 0.7816,
"step": 3200
},
{
"epoch": 0.99,
"grad_norm": 0.4355430101066633,
"learning_rate": 1.0677552816233949e-07,
"loss": 0.6702,
"step": 3205
},
{
"epoch": 0.99,
"grad_norm": 0.388098975061164,
"learning_rate": 8.339095407575715e-08,
"loss": 0.6688,
"step": 3210
},
{
"epoch": 0.99,
"grad_norm": 0.3699381979359166,
"learning_rate": 6.28918646156329e-08,
"loss": 0.8179,
"step": 3215
},
{
"epoch": 0.99,
"grad_norm": 0.374170534262767,
"learning_rate": 4.5278851773711274e-08,
"loss": 0.7136,
"step": 3220
},
{
"epoch": 0.99,
"grad_norm": 0.41255583678961755,
"learning_rate": 3.055242419492954e-08,
"loss": 0.7418,
"step": 3225
},
{
"epoch": 0.99,
"grad_norm": 0.45777314712192396,
"learning_rate": 1.8713007162740605e-08,
"loss": 0.7344,
"step": 3230
},
{
"epoch": 1.0,
"grad_norm": 0.47083864745606346,
"learning_rate": 9.760942586822808e-09,
"loss": 0.8361,
"step": 3235
},
{
"epoch": 1.0,
"grad_norm": 0.4272718560438237,
"learning_rate": 3.69648899322117e-09,
"loss": 0.7523,
"step": 3240
},
{
"epoch": 1.0,
"grad_norm": 0.4516813250246949,
"learning_rate": 5.198215168533693e-10,
"loss": 0.7346,
"step": 3245
},
{
"epoch": 1.0,
"eval_loss": 1.2166675329208374,
"eval_runtime": 1667.535,
"eval_samples_per_second": 1.385,
"eval_steps_per_second": 0.347,
"step": 3248
},
{
"epoch": 1.0,
"step": 3248,
"total_flos": 6658064344678400.0,
"train_loss": 0.6257000189696627,
"train_runtime": 38581.5438,
"train_samples_per_second": 1.347,
"train_steps_per_second": 0.084
}
],
"logging_steps": 5,
"max_steps": 3248,
"num_input_tokens_seen": 0,
"num_train_epochs": 1,
"save_steps": 100,
"total_flos": 6658064344678400.0,
"train_batch_size": 2,
"trial_name": null,
"trial_params": null
}