llama3-sudo-10epochs-mask / trainer_state.json
Qin Liu
Model save
08ad122 verified
{
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 10.0,
"eval_steps": 500,
"global_step": 2290,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.004366812227074236,
"grad_norm": 1.8752956704333947,
"learning_rate": 8.733624454148472e-07,
"loss": 3.7085,
"step": 1
},
{
"epoch": 0.021834061135371178,
"grad_norm": 1.8028899921940393,
"learning_rate": 4.3668122270742355e-06,
"loss": 3.6291,
"step": 5
},
{
"epoch": 0.043668122270742356,
"grad_norm": 1.75783914939411,
"learning_rate": 8.733624454148471e-06,
"loss": 3.547,
"step": 10
},
{
"epoch": 0.06550218340611354,
"grad_norm": 2.1823195400139026,
"learning_rate": 1.3100436681222708e-05,
"loss": 3.6303,
"step": 15
},
{
"epoch": 0.08733624454148471,
"grad_norm": 2.7703299276693256,
"learning_rate": 1.7467248908296942e-05,
"loss": 3.5805,
"step": 20
},
{
"epoch": 0.1091703056768559,
"grad_norm": 3.0501154969163244,
"learning_rate": 2.183406113537118e-05,
"loss": 3.277,
"step": 25
},
{
"epoch": 0.13100436681222707,
"grad_norm": 2.0575132099752795,
"learning_rate": 2.6200873362445416e-05,
"loss": 2.8289,
"step": 30
},
{
"epoch": 0.15283842794759825,
"grad_norm": 1.2985876224099822,
"learning_rate": 3.056768558951965e-05,
"loss": 2.6017,
"step": 35
},
{
"epoch": 0.17467248908296942,
"grad_norm": 0.7330786544465717,
"learning_rate": 3.4934497816593884e-05,
"loss": 2.4112,
"step": 40
},
{
"epoch": 0.1965065502183406,
"grad_norm": 0.7933262369374595,
"learning_rate": 3.930131004366812e-05,
"loss": 2.2821,
"step": 45
},
{
"epoch": 0.2183406113537118,
"grad_norm": 0.7790118551391272,
"learning_rate": 4.366812227074236e-05,
"loss": 2.0793,
"step": 50
},
{
"epoch": 0.24017467248908297,
"grad_norm": 0.7082020251604142,
"learning_rate": 4.8034934497816594e-05,
"loss": 2.0505,
"step": 55
},
{
"epoch": 0.26200873362445415,
"grad_norm": 0.5696342576311191,
"learning_rate": 5.240174672489083e-05,
"loss": 1.8173,
"step": 60
},
{
"epoch": 0.2838427947598253,
"grad_norm": 0.5023621326635465,
"learning_rate": 5.6768558951965065e-05,
"loss": 1.8912,
"step": 65
},
{
"epoch": 0.3056768558951965,
"grad_norm": 0.4787415611645182,
"learning_rate": 6.11353711790393e-05,
"loss": 1.7972,
"step": 70
},
{
"epoch": 0.32751091703056767,
"grad_norm": 0.48422783401495123,
"learning_rate": 6.550218340611354e-05,
"loss": 1.7183,
"step": 75
},
{
"epoch": 0.34934497816593885,
"grad_norm": 0.3910949318663936,
"learning_rate": 6.986899563318777e-05,
"loss": 1.6366,
"step": 80
},
{
"epoch": 0.37117903930131,
"grad_norm": 0.3902520624627953,
"learning_rate": 7.423580786026201e-05,
"loss": 1.7019,
"step": 85
},
{
"epoch": 0.3930131004366812,
"grad_norm": 0.3853385163155022,
"learning_rate": 7.860262008733625e-05,
"loss": 1.7606,
"step": 90
},
{
"epoch": 0.4148471615720524,
"grad_norm": 0.3911758987224519,
"learning_rate": 8.296943231441049e-05,
"loss": 1.6395,
"step": 95
},
{
"epoch": 0.4366812227074236,
"grad_norm": 0.42385759736776996,
"learning_rate": 8.733624454148472e-05,
"loss": 1.6328,
"step": 100
},
{
"epoch": 0.4585152838427948,
"grad_norm": 0.3975926298253002,
"learning_rate": 9.170305676855896e-05,
"loss": 1.6775,
"step": 105
},
{
"epoch": 0.48034934497816595,
"grad_norm": 0.4355288111027398,
"learning_rate": 9.606986899563319e-05,
"loss": 1.6008,
"step": 110
},
{
"epoch": 0.5021834061135371,
"grad_norm": 0.44774118787630435,
"learning_rate": 0.00010043668122270742,
"loss": 1.6046,
"step": 115
},
{
"epoch": 0.5240174672489083,
"grad_norm": 0.45365568335085893,
"learning_rate": 0.00010480349344978167,
"loss": 1.6348,
"step": 120
},
{
"epoch": 0.5458515283842795,
"grad_norm": 0.3985940875605887,
"learning_rate": 0.00010917030567685591,
"loss": 1.616,
"step": 125
},
{
"epoch": 0.5676855895196506,
"grad_norm": 0.4103041259632963,
"learning_rate": 0.00011353711790393013,
"loss": 1.6063,
"step": 130
},
{
"epoch": 0.5895196506550219,
"grad_norm": 0.45594857125060284,
"learning_rate": 0.00011790393013100438,
"loss": 1.5782,
"step": 135
},
{
"epoch": 0.611353711790393,
"grad_norm": 0.406753522288533,
"learning_rate": 0.0001222707423580786,
"loss": 1.5361,
"step": 140
},
{
"epoch": 0.6331877729257642,
"grad_norm": 0.45489448779672886,
"learning_rate": 0.00012663755458515284,
"loss": 1.6416,
"step": 145
},
{
"epoch": 0.6550218340611353,
"grad_norm": 0.4232268425851412,
"learning_rate": 0.00013100436681222707,
"loss": 1.5449,
"step": 150
},
{
"epoch": 0.6768558951965066,
"grad_norm": 0.4008200720858846,
"learning_rate": 0.00013537117903930133,
"loss": 1.6322,
"step": 155
},
{
"epoch": 0.6986899563318777,
"grad_norm": 0.41073046435729793,
"learning_rate": 0.00013973799126637554,
"loss": 1.6144,
"step": 160
},
{
"epoch": 0.7205240174672489,
"grad_norm": 0.43150859357010535,
"learning_rate": 0.0001441048034934498,
"loss": 1.6816,
"step": 165
},
{
"epoch": 0.74235807860262,
"grad_norm": 0.4209643739546475,
"learning_rate": 0.00014847161572052403,
"loss": 1.6049,
"step": 170
},
{
"epoch": 0.7641921397379913,
"grad_norm": 0.4646259107508816,
"learning_rate": 0.00015283842794759826,
"loss": 1.6193,
"step": 175
},
{
"epoch": 0.7860262008733624,
"grad_norm": 0.42132133209440126,
"learning_rate": 0.0001572052401746725,
"loss": 1.5542,
"step": 180
},
{
"epoch": 0.8078602620087336,
"grad_norm": 0.4068655673248684,
"learning_rate": 0.00016157205240174672,
"loss": 1.5172,
"step": 185
},
{
"epoch": 0.8296943231441049,
"grad_norm": 0.45022442420363395,
"learning_rate": 0.00016593886462882098,
"loss": 1.65,
"step": 190
},
{
"epoch": 0.851528384279476,
"grad_norm": 0.4218769408186785,
"learning_rate": 0.00017030567685589521,
"loss": 1.7073,
"step": 195
},
{
"epoch": 0.8733624454148472,
"grad_norm": 0.44363856749896563,
"learning_rate": 0.00017467248908296945,
"loss": 1.6647,
"step": 200
},
{
"epoch": 0.8951965065502183,
"grad_norm": 0.39452894148369905,
"learning_rate": 0.00017903930131004368,
"loss": 1.4932,
"step": 205
},
{
"epoch": 0.9170305676855895,
"grad_norm": 0.407234590774645,
"learning_rate": 0.0001834061135371179,
"loss": 1.5987,
"step": 210
},
{
"epoch": 0.9388646288209607,
"grad_norm": 0.4299787387863718,
"learning_rate": 0.00018777292576419214,
"loss": 1.606,
"step": 215
},
{
"epoch": 0.9606986899563319,
"grad_norm": 0.4459993359246055,
"learning_rate": 0.00019213973799126638,
"loss": 1.6248,
"step": 220
},
{
"epoch": 0.982532751091703,
"grad_norm": 0.42477289910814814,
"learning_rate": 0.0001965065502183406,
"loss": 1.5145,
"step": 225
},
{
"epoch": 1.0043668122270741,
"grad_norm": 0.4524366274873438,
"learning_rate": 0.00019999988382473225,
"loss": 1.6031,
"step": 230
},
{
"epoch": 1.0262008733624455,
"grad_norm": 0.4084729303647835,
"learning_rate": 0.00019999581771870396,
"loss": 1.5467,
"step": 235
},
{
"epoch": 1.0480349344978166,
"grad_norm": 0.5172909354442642,
"learning_rate": 0.0001999859431192192,
"loss": 1.4636,
"step": 240
},
{
"epoch": 1.0698689956331877,
"grad_norm": 0.4686179138565006,
"learning_rate": 0.00019997026059986742,
"loss": 1.5244,
"step": 245
},
{
"epoch": 1.091703056768559,
"grad_norm": 0.47437945549478044,
"learning_rate": 0.00019994877107160482,
"loss": 1.4414,
"step": 250
},
{
"epoch": 1.1135371179039302,
"grad_norm": 0.45613574001819357,
"learning_rate": 0.00019992147578270142,
"loss": 1.4545,
"step": 255
},
{
"epoch": 1.1353711790393013,
"grad_norm": 0.4440788329210085,
"learning_rate": 0.00019988837631866864,
"loss": 1.4727,
"step": 260
},
{
"epoch": 1.1572052401746724,
"grad_norm": 0.43804218551099794,
"learning_rate": 0.00019984947460216707,
"loss": 1.5721,
"step": 265
},
{
"epoch": 1.1790393013100438,
"grad_norm": 0.4712591099454086,
"learning_rate": 0.0001998047728928949,
"loss": 1.45,
"step": 270
},
{
"epoch": 1.2008733624454149,
"grad_norm": 0.46682322093255346,
"learning_rate": 0.00019975427378745659,
"loss": 1.5364,
"step": 275
},
{
"epoch": 1.222707423580786,
"grad_norm": 0.44776219185560584,
"learning_rate": 0.00019969798021921201,
"loss": 1.4799,
"step": 280
},
{
"epoch": 1.244541484716157,
"grad_norm": 0.43694406427786026,
"learning_rate": 0.0001996358954581062,
"loss": 1.3916,
"step": 285
},
{
"epoch": 1.2663755458515285,
"grad_norm": 0.4369949578213358,
"learning_rate": 0.00019956802311047925,
"loss": 1.5629,
"step": 290
},
{
"epoch": 1.2882096069868996,
"grad_norm": 0.4507447731242532,
"learning_rate": 0.00019949436711885686,
"loss": 1.5553,
"step": 295
},
{
"epoch": 1.3100436681222707,
"grad_norm": 0.4238763083224714,
"learning_rate": 0.00019941493176172154,
"loss": 1.555,
"step": 300
},
{
"epoch": 1.3318777292576418,
"grad_norm": 0.44296845533811896,
"learning_rate": 0.0001993297216532637,
"loss": 1.5952,
"step": 305
},
{
"epoch": 1.3537117903930131,
"grad_norm": 0.44395910006889616,
"learning_rate": 0.00019923874174311394,
"loss": 1.4769,
"step": 310
},
{
"epoch": 1.3755458515283843,
"grad_norm": 0.44391981727033414,
"learning_rate": 0.00019914199731605546,
"loss": 1.5458,
"step": 315
},
{
"epoch": 1.3973799126637554,
"grad_norm": 0.46314929089631696,
"learning_rate": 0.00019903949399171692,
"loss": 1.5994,
"step": 320
},
{
"epoch": 1.4192139737991267,
"grad_norm": 0.4902612617938975,
"learning_rate": 0.0001989312377242463,
"loss": 1.5253,
"step": 325
},
{
"epoch": 1.4410480349344978,
"grad_norm": 0.4264196408790441,
"learning_rate": 0.0001988172348019648,
"loss": 1.5378,
"step": 330
},
{
"epoch": 1.462882096069869,
"grad_norm": 0.4125915654790707,
"learning_rate": 0.00019869749184700156,
"loss": 1.4231,
"step": 335
},
{
"epoch": 1.48471615720524,
"grad_norm": 0.4121352169920894,
"learning_rate": 0.00019857201581490933,
"loss": 1.4937,
"step": 340
},
{
"epoch": 1.5065502183406112,
"grad_norm": 0.46111501316021164,
"learning_rate": 0.00019844081399425997,
"loss": 1.6366,
"step": 345
},
{
"epoch": 1.5283842794759825,
"grad_norm": 0.41020501591518393,
"learning_rate": 0.0001983038940062214,
"loss": 1.5345,
"step": 350
},
{
"epoch": 1.5502183406113537,
"grad_norm": 0.42368595569558,
"learning_rate": 0.00019816126380411476,
"loss": 1.5478,
"step": 355
},
{
"epoch": 1.572052401746725,
"grad_norm": 0.41418005567795424,
"learning_rate": 0.0001980129316729526,
"loss": 1.5202,
"step": 360
},
{
"epoch": 1.5938864628820961,
"grad_norm": 0.4293992491929074,
"learning_rate": 0.0001978589062289573,
"loss": 1.4605,
"step": 365
},
{
"epoch": 1.6157205240174672,
"grad_norm": 0.4361713850313702,
"learning_rate": 0.00019769919641906097,
"loss": 1.5154,
"step": 370
},
{
"epoch": 1.6375545851528384,
"grad_norm": 0.3915518977683478,
"learning_rate": 0.0001975338115203854,
"loss": 1.3845,
"step": 375
},
{
"epoch": 1.6593886462882095,
"grad_norm": 0.40952561447487074,
"learning_rate": 0.0001973627611397034,
"loss": 1.5663,
"step": 380
},
{
"epoch": 1.6812227074235808,
"grad_norm": 0.41401676412381255,
"learning_rate": 0.00019718605521288073,
"loss": 1.5892,
"step": 385
},
{
"epoch": 1.703056768558952,
"grad_norm": 0.40948687994041144,
"learning_rate": 0.00019700370400429885,
"loss": 1.5853,
"step": 390
},
{
"epoch": 1.7248908296943233,
"grad_norm": 0.39912451760472517,
"learning_rate": 0.00019681571810625873,
"loss": 1.5086,
"step": 395
},
{
"epoch": 1.7467248908296944,
"grad_norm": 0.4274906140041681,
"learning_rate": 0.00019662210843836574,
"loss": 1.5361,
"step": 400
},
{
"epoch": 1.7685589519650655,
"grad_norm": 0.45496243462203195,
"learning_rate": 0.00019642288624689501,
"loss": 1.5281,
"step": 405
},
{
"epoch": 1.7903930131004366,
"grad_norm": 0.40754588577831136,
"learning_rate": 0.00019621806310413857,
"loss": 1.4146,
"step": 410
},
{
"epoch": 1.8122270742358078,
"grad_norm": 0.46020764826270205,
"learning_rate": 0.00019600765090773282,
"loss": 1.509,
"step": 415
},
{
"epoch": 1.8340611353711789,
"grad_norm": 0.41433203928546175,
"learning_rate": 0.0001957916618799676,
"loss": 1.4521,
"step": 420
},
{
"epoch": 1.8558951965065502,
"grad_norm": 0.4076208140451916,
"learning_rate": 0.00019557010856707617,
"loss": 1.5177,
"step": 425
},
{
"epoch": 1.8777292576419216,
"grad_norm": 0.3861676767334766,
"learning_rate": 0.00019534300383850642,
"loss": 1.5334,
"step": 430
},
{
"epoch": 1.8995633187772927,
"grad_norm": 0.38833060515579254,
"learning_rate": 0.00019511036088617342,
"loss": 1.5405,
"step": 435
},
{
"epoch": 1.9213973799126638,
"grad_norm": 0.42695215319286783,
"learning_rate": 0.000194872193223693,
"loss": 1.5458,
"step": 440
},
{
"epoch": 1.943231441048035,
"grad_norm": 0.4089286925168508,
"learning_rate": 0.0001946285146855968,
"loss": 1.5466,
"step": 445
},
{
"epoch": 1.965065502183406,
"grad_norm": 0.42155330336215135,
"learning_rate": 0.00019437933942652885,
"loss": 1.566,
"step": 450
},
{
"epoch": 1.9868995633187772,
"grad_norm": 0.3751384149186164,
"learning_rate": 0.000194124681920423,
"loss": 1.4511,
"step": 455
},
{
"epoch": 2.0087336244541483,
"grad_norm": 0.4161005311145859,
"learning_rate": 0.00019386455695966253,
"loss": 1.4751,
"step": 460
},
{
"epoch": 2.03056768558952,
"grad_norm": 0.4391122093349958,
"learning_rate": 0.0001935989796542207,
"loss": 1.4673,
"step": 465
},
{
"epoch": 2.052401746724891,
"grad_norm": 0.4822422647436377,
"learning_rate": 0.00019332796543078314,
"loss": 1.4212,
"step": 470
},
{
"epoch": 2.074235807860262,
"grad_norm": 0.47440157836581254,
"learning_rate": 0.00019305153003185165,
"loss": 1.4117,
"step": 475
},
{
"epoch": 2.096069868995633,
"grad_norm": 0.5171235330183517,
"learning_rate": 0.00019276968951482986,
"loss": 1.377,
"step": 480
},
{
"epoch": 2.1179039301310043,
"grad_norm": 0.5213453701595562,
"learning_rate": 0.00019248246025109045,
"loss": 1.3892,
"step": 485
},
{
"epoch": 2.1397379912663754,
"grad_norm": 0.5031391500788147,
"learning_rate": 0.0001921898589250242,
"loss": 1.3859,
"step": 490
},
{
"epoch": 2.1615720524017465,
"grad_norm": 0.48354639648956227,
"learning_rate": 0.00019189190253307082,
"loss": 1.3916,
"step": 495
},
{
"epoch": 2.183406113537118,
"grad_norm": 0.5173445212952421,
"learning_rate": 0.00019158860838273172,
"loss": 1.3977,
"step": 500
},
{
"epoch": 2.2052401746724892,
"grad_norm": 0.513094289634623,
"learning_rate": 0.00019127999409156453,
"loss": 1.3707,
"step": 505
},
{
"epoch": 2.2270742358078603,
"grad_norm": 0.5024827997518101,
"learning_rate": 0.00019096607758615998,
"loss": 1.3482,
"step": 510
},
{
"epoch": 2.2489082969432315,
"grad_norm": 0.5798563945761437,
"learning_rate": 0.0001906468771011003,
"loss": 1.4178,
"step": 515
},
{
"epoch": 2.2707423580786026,
"grad_norm": 0.5150755731275006,
"learning_rate": 0.00019032241117790028,
"loss": 1.4191,
"step": 520
},
{
"epoch": 2.2925764192139737,
"grad_norm": 0.5437625450565541,
"learning_rate": 0.00018999269866393006,
"loss": 1.3817,
"step": 525
},
{
"epoch": 2.314410480349345,
"grad_norm": 0.4924027196915137,
"learning_rate": 0.00018965775871132044,
"loss": 1.3745,
"step": 530
},
{
"epoch": 2.3362445414847164,
"grad_norm": 0.5287496980456949,
"learning_rate": 0.00018931761077585035,
"loss": 1.3749,
"step": 535
},
{
"epoch": 2.3580786026200875,
"grad_norm": 0.5123509887446156,
"learning_rate": 0.00018897227461581672,
"loss": 1.4476,
"step": 540
},
{
"epoch": 2.3799126637554586,
"grad_norm": 0.5054089069691902,
"learning_rate": 0.00018862177029088675,
"loss": 1.4103,
"step": 545
},
{
"epoch": 2.4017467248908297,
"grad_norm": 0.5339659464397467,
"learning_rate": 0.00018826611816093273,
"loss": 1.421,
"step": 550
},
{
"epoch": 2.423580786026201,
"grad_norm": 0.5367901761635062,
"learning_rate": 0.00018790533888484937,
"loss": 1.4725,
"step": 555
},
{
"epoch": 2.445414847161572,
"grad_norm": 0.5453912898301467,
"learning_rate": 0.00018753945341935376,
"loss": 1.4671,
"step": 560
},
{
"epoch": 2.467248908296943,
"grad_norm": 0.5114254543022997,
"learning_rate": 0.0001871684830177681,
"loss": 1.5483,
"step": 565
},
{
"epoch": 2.489082969432314,
"grad_norm": 0.5183339258600979,
"learning_rate": 0.00018679244922878516,
"loss": 1.4277,
"step": 570
},
{
"epoch": 2.5109170305676853,
"grad_norm": 0.5202383438539178,
"learning_rate": 0.00018641137389521645,
"loss": 1.4767,
"step": 575
},
{
"epoch": 2.532751091703057,
"grad_norm": 0.5293048970285786,
"learning_rate": 0.0001860252791527236,
"loss": 1.4691,
"step": 580
},
{
"epoch": 2.554585152838428,
"grad_norm": 0.5307661790603934,
"learning_rate": 0.0001856341874285324,
"loss": 1.484,
"step": 585
},
{
"epoch": 2.576419213973799,
"grad_norm": 0.5298444162459437,
"learning_rate": 0.0001852381214401302,
"loss": 1.3704,
"step": 590
},
{
"epoch": 2.5982532751091703,
"grad_norm": 0.5045622495753448,
"learning_rate": 0.00018483710419394615,
"loss": 1.4273,
"step": 595
},
{
"epoch": 2.6200873362445414,
"grad_norm": 0.516020284616684,
"learning_rate": 0.00018443115898401504,
"loss": 1.5253,
"step": 600
},
{
"epoch": 2.641921397379913,
"grad_norm": 0.5212259321852013,
"learning_rate": 0.000184020309390624,
"loss": 1.4966,
"step": 605
},
{
"epoch": 2.6637554585152836,
"grad_norm": 0.571739861550278,
"learning_rate": 0.00018360457927894287,
"loss": 1.489,
"step": 610
},
{
"epoch": 2.685589519650655,
"grad_norm": 0.5165260257002361,
"learning_rate": 0.00018318399279763797,
"loss": 1.419,
"step": 615
},
{
"epoch": 2.7074235807860263,
"grad_norm": 0.5022734014528262,
"learning_rate": 0.00018275857437746932,
"loss": 1.5218,
"step": 620
},
{
"epoch": 2.7292576419213974,
"grad_norm": 0.5065667081884927,
"learning_rate": 0.00018232834872987147,
"loss": 1.3765,
"step": 625
},
{
"epoch": 2.7510917030567685,
"grad_norm": 0.5208990486632071,
"learning_rate": 0.00018189334084551826,
"loss": 1.4514,
"step": 630
},
{
"epoch": 2.7729257641921397,
"grad_norm": 0.4881264484974513,
"learning_rate": 0.00018145357599287095,
"loss": 1.4477,
"step": 635
},
{
"epoch": 2.7947598253275108,
"grad_norm": 0.513600593429205,
"learning_rate": 0.00018100907971671054,
"loss": 1.4449,
"step": 640
},
{
"epoch": 2.816593886462882,
"grad_norm": 0.6046158817500052,
"learning_rate": 0.00018055987783665404,
"loss": 1.3161,
"step": 645
},
{
"epoch": 2.8384279475982535,
"grad_norm": 0.5208894858087225,
"learning_rate": 0.00018010599644565457,
"loss": 1.4693,
"step": 650
},
{
"epoch": 2.8602620087336246,
"grad_norm": 0.5746066357275653,
"learning_rate": 0.0001796474619084856,
"loss": 1.4347,
"step": 655
},
{
"epoch": 2.8820960698689957,
"grad_norm": 0.5569260278390025,
"learning_rate": 0.00017918430086020975,
"loss": 1.4628,
"step": 660
},
{
"epoch": 2.903930131004367,
"grad_norm": 0.5015533731915597,
"learning_rate": 0.0001787165402046313,
"loss": 1.4082,
"step": 665
},
{
"epoch": 2.925764192139738,
"grad_norm": 0.4930090755602876,
"learning_rate": 0.0001782442071127338,
"loss": 1.4412,
"step": 670
},
{
"epoch": 2.947598253275109,
"grad_norm": 0.5351113450288878,
"learning_rate": 0.0001777673290211014,
"loss": 1.3765,
"step": 675
},
{
"epoch": 2.96943231441048,
"grad_norm": 0.5333783396073188,
"learning_rate": 0.00017728593363032532,
"loss": 1.4074,
"step": 680
},
{
"epoch": 2.9912663755458517,
"grad_norm": 0.5148622195538735,
"learning_rate": 0.0001768000489033949,
"loss": 1.355,
"step": 685
},
{
"epoch": 3.013100436681223,
"grad_norm": 0.5136549354887358,
"learning_rate": 0.00017630970306407311,
"loss": 1.33,
"step": 690
},
{
"epoch": 3.034934497816594,
"grad_norm": 0.587330448131839,
"learning_rate": 0.00017581492459525712,
"loss": 1.267,
"step": 695
},
{
"epoch": 3.056768558951965,
"grad_norm": 0.5781502863776671,
"learning_rate": 0.00017531574223732396,
"loss": 1.3391,
"step": 700
},
{
"epoch": 3.078602620087336,
"grad_norm": 0.5801593817600947,
"learning_rate": 0.0001748121849864609,
"loss": 1.3398,
"step": 705
},
{
"epoch": 3.1004366812227073,
"grad_norm": 0.6358481598657785,
"learning_rate": 0.00017430428209298126,
"loss": 1.3191,
"step": 710
},
{
"epoch": 3.1222707423580784,
"grad_norm": 0.635414065898168,
"learning_rate": 0.00017379206305962526,
"loss": 1.3233,
"step": 715
},
{
"epoch": 3.14410480349345,
"grad_norm": 0.6721891418870005,
"learning_rate": 0.0001732755576398463,
"loss": 1.2795,
"step": 720
},
{
"epoch": 3.165938864628821,
"grad_norm": 0.6262467411308055,
"learning_rate": 0.00017275479583608261,
"loss": 1.3117,
"step": 725
},
{
"epoch": 3.1877729257641922,
"grad_norm": 0.7147271334112754,
"learning_rate": 0.00017222980789801477,
"loss": 1.3604,
"step": 730
},
{
"epoch": 3.2096069868995634,
"grad_norm": 0.677887647709634,
"learning_rate": 0.00017170062432080805,
"loss": 1.3356,
"step": 735
},
{
"epoch": 3.2314410480349345,
"grad_norm": 0.6529188195262589,
"learning_rate": 0.00017116727584334159,
"loss": 1.3092,
"step": 740
},
{
"epoch": 3.2532751091703056,
"grad_norm": 0.6545432757758792,
"learning_rate": 0.00017062979344642244,
"loss": 1.3272,
"step": 745
},
{
"epoch": 3.2751091703056767,
"grad_norm": 0.6417623946150004,
"learning_rate": 0.00017008820835098627,
"loss": 1.3712,
"step": 750
},
{
"epoch": 3.2969432314410483,
"grad_norm": 0.6419119938295037,
"learning_rate": 0.00016954255201628358,
"loss": 1.372,
"step": 755
},
{
"epoch": 3.3187772925764194,
"grad_norm": 0.6643469655488602,
"learning_rate": 0.00016899285613805246,
"loss": 1.3883,
"step": 760
},
{
"epoch": 3.3406113537117905,
"grad_norm": 0.6617592247751748,
"learning_rate": 0.00016843915264667746,
"loss": 1.3131,
"step": 765
},
{
"epoch": 3.3624454148471616,
"grad_norm": 0.6721660291620549,
"learning_rate": 0.00016788147370533482,
"loss": 1.3677,
"step": 770
},
{
"epoch": 3.3842794759825328,
"grad_norm": 0.6696065641348963,
"learning_rate": 0.00016731985170812414,
"loss": 1.3612,
"step": 775
},
{
"epoch": 3.406113537117904,
"grad_norm": 0.6442936479861974,
"learning_rate": 0.00016675431927818678,
"loss": 1.3288,
"step": 780
},
{
"epoch": 3.427947598253275,
"grad_norm": 0.6665292623524364,
"learning_rate": 0.00016618490926581086,
"loss": 1.3302,
"step": 785
},
{
"epoch": 3.449781659388646,
"grad_norm": 0.6832138001277586,
"learning_rate": 0.00016561165474652292,
"loss": 1.296,
"step": 790
},
{
"epoch": 3.4716157205240172,
"grad_norm": 0.6676829472258946,
"learning_rate": 0.0001650345890191669,
"loss": 1.258,
"step": 795
},
{
"epoch": 3.493449781659389,
"grad_norm": 0.6462889175077688,
"learning_rate": 0.00016445374560396974,
"loss": 1.3108,
"step": 800
},
{
"epoch": 3.51528384279476,
"grad_norm": 0.6538188867916314,
"learning_rate": 0.00016386915824059427,
"loss": 1.2225,
"step": 805
},
{
"epoch": 3.537117903930131,
"grad_norm": 0.6573367032320154,
"learning_rate": 0.0001632808608861794,
"loss": 1.2692,
"step": 810
},
{
"epoch": 3.558951965065502,
"grad_norm": 0.6707468426806011,
"learning_rate": 0.0001626888877133677,
"loss": 1.2621,
"step": 815
},
{
"epoch": 3.5807860262008733,
"grad_norm": 0.6607806582929415,
"learning_rate": 0.00016209327310832028,
"loss": 1.3217,
"step": 820
},
{
"epoch": 3.6026200873362444,
"grad_norm": 0.6695542325826566,
"learning_rate": 0.00016149405166871947,
"loss": 1.2445,
"step": 825
},
{
"epoch": 3.6244541484716155,
"grad_norm": 0.6854945885270477,
"learning_rate": 0.00016089125820175913,
"loss": 1.2334,
"step": 830
},
{
"epoch": 3.646288209606987,
"grad_norm": 0.6998882406491346,
"learning_rate": 0.00016028492772212277,
"loss": 1.3228,
"step": 835
},
{
"epoch": 3.668122270742358,
"grad_norm": 0.6481594617699246,
"learning_rate": 0.00015967509544994959,
"loss": 1.3119,
"step": 840
},
{
"epoch": 3.6899563318777293,
"grad_norm": 0.6775238829866298,
"learning_rate": 0.00015906179680878876,
"loss": 1.2587,
"step": 845
},
{
"epoch": 3.7117903930131004,
"grad_norm": 0.667250254807951,
"learning_rate": 0.00015844506742354164,
"loss": 1.335,
"step": 850
},
{
"epoch": 3.7336244541484715,
"grad_norm": 0.6850864766225458,
"learning_rate": 0.00015782494311839248,
"loss": 1.3585,
"step": 855
},
{
"epoch": 3.7554585152838427,
"grad_norm": 0.7100136757699026,
"learning_rate": 0.00015720145991472746,
"loss": 1.3494,
"step": 860
},
{
"epoch": 3.777292576419214,
"grad_norm": 0.6455753266089419,
"learning_rate": 0.00015657465402904239,
"loss": 1.32,
"step": 865
},
{
"epoch": 3.7991266375545854,
"grad_norm": 0.6774008615272976,
"learning_rate": 0.00015594456187083887,
"loss": 1.3053,
"step": 870
},
{
"epoch": 3.8209606986899565,
"grad_norm": 0.6412117270718072,
"learning_rate": 0.0001553112200405094,
"loss": 1.3468,
"step": 875
},
{
"epoch": 3.8427947598253276,
"grad_norm": 0.6299987093163749,
"learning_rate": 0.00015467466532721136,
"loss": 1.2464,
"step": 880
},
{
"epoch": 3.8646288209606987,
"grad_norm": 0.6648608762625429,
"learning_rate": 0.00015403493470673006,
"loss": 1.4054,
"step": 885
},
{
"epoch": 3.88646288209607,
"grad_norm": 0.6973437539106749,
"learning_rate": 0.00015339206533933087,
"loss": 1.3005,
"step": 890
},
{
"epoch": 3.908296943231441,
"grad_norm": 0.6609891540439728,
"learning_rate": 0.00015274609456760073,
"loss": 1.3751,
"step": 895
},
{
"epoch": 3.930131004366812,
"grad_norm": 0.6243806427317503,
"learning_rate": 0.0001520970599142789,
"loss": 1.309,
"step": 900
},
{
"epoch": 3.9519650655021836,
"grad_norm": 0.6701604629698161,
"learning_rate": 0.00015144499908007757,
"loss": 1.3302,
"step": 905
},
{
"epoch": 3.9737991266375547,
"grad_norm": 0.6417002641455068,
"learning_rate": 0.00015078994994149167,
"loss": 1.3244,
"step": 910
},
{
"epoch": 3.995633187772926,
"grad_norm": 0.6227627176738441,
"learning_rate": 0.00015013195054859894,
"loss": 1.3739,
"step": 915
},
{
"epoch": 4.0174672489082965,
"grad_norm": 0.69565512329475,
"learning_rate": 0.00014947103912284958,
"loss": 1.1587,
"step": 920
},
{
"epoch": 4.039301310043668,
"grad_norm": 0.9168936710349829,
"learning_rate": 0.0001488072540548461,
"loss": 1.183,
"step": 925
},
{
"epoch": 4.06113537117904,
"grad_norm": 0.768969897857725,
"learning_rate": 0.00014814063390211334,
"loss": 1.1114,
"step": 930
},
{
"epoch": 4.08296943231441,
"grad_norm": 0.8549092972669946,
"learning_rate": 0.00014747121738685874,
"loss": 1.2111,
"step": 935
},
{
"epoch": 4.104803493449782,
"grad_norm": 0.8316566189643829,
"learning_rate": 0.00014679904339372302,
"loss": 1.1581,
"step": 940
},
{
"epoch": 4.126637554585153,
"grad_norm": 0.8226115993114171,
"learning_rate": 0.00014612415096752155,
"loss": 1.1881,
"step": 945
},
{
"epoch": 4.148471615720524,
"grad_norm": 0.8505226801184157,
"learning_rate": 0.0001454465793109763,
"loss": 1.135,
"step": 950
},
{
"epoch": 4.170305676855895,
"grad_norm": 0.8021211984624651,
"learning_rate": 0.00014476636778243878,
"loss": 1.1768,
"step": 955
},
{
"epoch": 4.192139737991266,
"grad_norm": 0.8578093150750806,
"learning_rate": 0.00014408355589360348,
"loss": 1.0631,
"step": 960
},
{
"epoch": 4.213973799126638,
"grad_norm": 0.8812497659618362,
"learning_rate": 0.00014339818330721314,
"loss": 1.1288,
"step": 965
},
{
"epoch": 4.235807860262009,
"grad_norm": 0.7816446218878502,
"learning_rate": 0.0001427102898347546,
"loss": 1.1777,
"step": 970
},
{
"epoch": 4.25764192139738,
"grad_norm": 0.8163412581216741,
"learning_rate": 0.0001420199154341464,
"loss": 1.1469,
"step": 975
},
{
"epoch": 4.279475982532751,
"grad_norm": 0.8410801009958802,
"learning_rate": 0.0001413271002074176,
"loss": 1.1547,
"step": 980
},
{
"epoch": 4.301310043668122,
"grad_norm": 0.8957226442397316,
"learning_rate": 0.00014063188439837832,
"loss": 1.1054,
"step": 985
},
{
"epoch": 4.323144104803493,
"grad_norm": 0.8533966074014762,
"learning_rate": 0.0001399343083902824,
"loss": 1.1468,
"step": 990
},
{
"epoch": 4.344978165938865,
"grad_norm": 0.7969709395883895,
"learning_rate": 0.00013923441270348124,
"loss": 1.1661,
"step": 995
},
{
"epoch": 4.366812227074236,
"grad_norm": 0.830675985424985,
"learning_rate": 0.00013853223799307031,
"loss": 1.1714,
"step": 1000
},
{
"epoch": 4.388646288209607,
"grad_norm": 0.879665119495671,
"learning_rate": 0.00013782782504652763,
"loss": 1.2237,
"step": 1005
},
{
"epoch": 4.4104803493449785,
"grad_norm": 0.8513132295064585,
"learning_rate": 0.0001371212147813443,
"loss": 1.2524,
"step": 1010
},
{
"epoch": 4.432314410480349,
"grad_norm": 0.8345543594033096,
"learning_rate": 0.00013641244824264803,
"loss": 1.2055,
"step": 1015
},
{
"epoch": 4.454148471615721,
"grad_norm": 0.8449282094486232,
"learning_rate": 0.00013570156660081868,
"loss": 1.1459,
"step": 1020
},
{
"epoch": 4.475982532751091,
"grad_norm": 0.8491089050635324,
"learning_rate": 0.00013498861114909685,
"loss": 1.165,
"step": 1025
},
{
"epoch": 4.497816593886463,
"grad_norm": 0.8675954453498238,
"learning_rate": 0.00013427362330118543,
"loss": 1.1048,
"step": 1030
},
{
"epoch": 4.5196506550218345,
"grad_norm": 0.9120386243780424,
"learning_rate": 0.0001335566445888437,
"loss": 1.2427,
"step": 1035
},
{
"epoch": 4.541484716157205,
"grad_norm": 0.8105081633081175,
"learning_rate": 0.00013283771665947505,
"loss": 1.278,
"step": 1040
},
{
"epoch": 4.563318777292577,
"grad_norm": 0.8869239311496004,
"learning_rate": 0.00013211688127370784,
"loss": 1.1099,
"step": 1045
},
{
"epoch": 4.585152838427947,
"grad_norm": 0.8873085989909458,
"learning_rate": 0.00013139418030296937,
"loss": 1.1783,
"step": 1050
},
{
"epoch": 4.606986899563319,
"grad_norm": 0.80023844006387,
"learning_rate": 0.00013066965572705401,
"loss": 1.1504,
"step": 1055
},
{
"epoch": 4.62882096069869,
"grad_norm": 0.8438162486126547,
"learning_rate": 0.00012994334963168443,
"loss": 1.2292,
"step": 1060
},
{
"epoch": 4.650655021834061,
"grad_norm": 0.8687319952846376,
"learning_rate": 0.00012921530420606714,
"loss": 1.2132,
"step": 1065
},
{
"epoch": 4.672489082969433,
"grad_norm": 0.8481724475183398,
"learning_rate": 0.00012848556174044183,
"loss": 1.2114,
"step": 1070
},
{
"epoch": 4.6943231441048034,
"grad_norm": 0.8170589588250686,
"learning_rate": 0.00012775416462362457,
"loss": 1.2152,
"step": 1075
},
{
"epoch": 4.716157205240175,
"grad_norm": 0.8800579649975868,
"learning_rate": 0.00012702115534054593,
"loss": 1.1693,
"step": 1080
},
{
"epoch": 4.737991266375546,
"grad_norm": 0.8601610801550544,
"learning_rate": 0.0001262865764697829,
"loss": 1.1846,
"step": 1085
},
{
"epoch": 4.759825327510917,
"grad_norm": 0.8386680065719362,
"learning_rate": 0.00012555047068108568,
"loss": 1.249,
"step": 1090
},
{
"epoch": 4.781659388646288,
"grad_norm": 0.8517305343726155,
"learning_rate": 0.00012481288073289912,
"loss": 1.1364,
"step": 1095
},
{
"epoch": 4.8034934497816595,
"grad_norm": 0.8088860139786535,
"learning_rate": 0.00012407384946987898,
"loss": 1.1527,
"step": 1100
},
{
"epoch": 4.825327510917031,
"grad_norm": 0.8583326581924249,
"learning_rate": 0.00012333341982040323,
"loss": 1.1515,
"step": 1105
},
{
"epoch": 4.847161572052402,
"grad_norm": 0.9360072468671379,
"learning_rate": 0.00012259163479407832,
"loss": 1.0865,
"step": 1110
},
{
"epoch": 4.868995633187773,
"grad_norm": 0.8650317997007926,
"learning_rate": 0.00012184853747924112,
"loss": 1.131,
"step": 1115
},
{
"epoch": 4.890829694323144,
"grad_norm": 0.8102946053666945,
"learning_rate": 0.00012110417104045575,
"loss": 1.111,
"step": 1120
},
{
"epoch": 4.9126637554585155,
"grad_norm": 0.9358255576727259,
"learning_rate": 0.00012035857871600649,
"loss": 1.2429,
"step": 1125
},
{
"epoch": 4.934497816593886,
"grad_norm": 0.8877109383416729,
"learning_rate": 0.00011961180381538599,
"loss": 1.1798,
"step": 1130
},
{
"epoch": 4.956331877729258,
"grad_norm": 0.8523579756065384,
"learning_rate": 0.0001188638897167797,
"loss": 1.1524,
"step": 1135
},
{
"epoch": 4.978165938864628,
"grad_norm": 0.8309844177132272,
"learning_rate": 0.00011811487986454612,
"loss": 1.2469,
"step": 1140
},
{
"epoch": 5.0,
"grad_norm": 0.8482507314409842,
"learning_rate": 0.00011736481776669306,
"loss": 1.1823,
"step": 1145
},
{
"epoch": 5.021834061135372,
"grad_norm": 1.0322243914151594,
"learning_rate": 0.00011661374699235057,
"loss": 1.0325,
"step": 1150
},
{
"epoch": 5.043668122270742,
"grad_norm": 1.0324089865002601,
"learning_rate": 0.00011586171116924014,
"loss": 1.0234,
"step": 1155
},
{
"epoch": 5.065502183406114,
"grad_norm": 0.9762094077290032,
"learning_rate": 0.00011510875398114027,
"loss": 1.0794,
"step": 1160
},
{
"epoch": 5.0873362445414845,
"grad_norm": 1.1952296969892835,
"learning_rate": 0.00011435491916534919,
"loss": 1.0145,
"step": 1165
},
{
"epoch": 5.109170305676856,
"grad_norm": 1.0895050768888697,
"learning_rate": 0.0001136002505101442,
"loss": 1.0151,
"step": 1170
},
{
"epoch": 5.131004366812227,
"grad_norm": 1.0235853080588493,
"learning_rate": 0.00011284479185223812,
"loss": 1.0388,
"step": 1175
},
{
"epoch": 5.152838427947598,
"grad_norm": 1.0490311233025102,
"learning_rate": 0.00011208858707423299,
"loss": 1.0072,
"step": 1180
},
{
"epoch": 5.17467248908297,
"grad_norm": 1.1856197953264118,
"learning_rate": 0.00011133168010207091,
"loss": 1.0504,
"step": 1185
},
{
"epoch": 5.1965065502183405,
"grad_norm": 1.0065811557379292,
"learning_rate": 0.00011057411490248266,
"loss": 0.9977,
"step": 1190
},
{
"epoch": 5.218340611353712,
"grad_norm": 1.1043632872539175,
"learning_rate": 0.00010981593548043374,
"loss": 0.9932,
"step": 1195
},
{
"epoch": 5.240174672489083,
"grad_norm": 1.0753914347833422,
"learning_rate": 0.00010905718587656811,
"loss": 1.092,
"step": 1200
},
{
"epoch": 5.262008733624454,
"grad_norm": 1.0266820326577377,
"learning_rate": 0.0001082979101646502,
"loss": 1.0655,
"step": 1205
},
{
"epoch": 5.283842794759825,
"grad_norm": 0.9941194317158725,
"learning_rate": 0.00010753815244900458,
"loss": 0.9828,
"step": 1210
},
{
"epoch": 5.3056768558951966,
"grad_norm": 1.084324048580049,
"learning_rate": 0.00010677795686195422,
"loss": 1.0229,
"step": 1215
},
{
"epoch": 5.327510917030567,
"grad_norm": 1.051439528201926,
"learning_rate": 0.00010601736756125685,
"loss": 1.0168,
"step": 1220
},
{
"epoch": 5.349344978165939,
"grad_norm": 1.1580814102197374,
"learning_rate": 0.00010525642872753996,
"loss": 0.935,
"step": 1225
},
{
"epoch": 5.37117903930131,
"grad_norm": 1.0710336894680983,
"learning_rate": 0.00010449518456173456,
"loss": 1.067,
"step": 1230
},
{
"epoch": 5.393013100436681,
"grad_norm": 1.0463850478020345,
"learning_rate": 0.00010373367928250749,
"loss": 1.0489,
"step": 1235
},
{
"epoch": 5.414847161572053,
"grad_norm": 1.1057562523337745,
"learning_rate": 0.00010297195712369311,
"loss": 0.954,
"step": 1240
},
{
"epoch": 5.436681222707423,
"grad_norm": 1.0647508212261871,
"learning_rate": 0.0001022100623317237,
"loss": 0.9094,
"step": 1245
},
{
"epoch": 5.458515283842795,
"grad_norm": 1.0854376270937827,
"learning_rate": 0.00010144803916305925,
"loss": 0.9996,
"step": 1250
},
{
"epoch": 5.4803493449781655,
"grad_norm": 1.0768231071114585,
"learning_rate": 0.00010068593188161697,
"loss": 1.0098,
"step": 1255
},
{
"epoch": 5.502183406113537,
"grad_norm": 1.0629470347878223,
"learning_rate": 9.992378475619981e-05,
"loss": 1.0252,
"step": 1260
},
{
"epoch": 5.524017467248909,
"grad_norm": 1.051250411684801,
"learning_rate": 9.916164205792527e-05,
"loss": 0.9879,
"step": 1265
},
{
"epoch": 5.545851528384279,
"grad_norm": 1.0100574406024927,
"learning_rate": 9.839954805765364e-05,
"loss": 1.0638,
"step": 1270
},
{
"epoch": 5.567685589519651,
"grad_norm": 1.0416962539798005,
"learning_rate": 9.763754702341646e-05,
"loss": 0.9556,
"step": 1275
},
{
"epoch": 5.5895196506550215,
"grad_norm": 1.041656249717949,
"learning_rate": 9.687568321784509e-05,
"loss": 1.0295,
"step": 1280
},
{
"epoch": 5.611353711790393,
"grad_norm": 1.057095267929098,
"learning_rate": 9.611400089559975e-05,
"loss": 1.0233,
"step": 1285
},
{
"epoch": 5.633187772925764,
"grad_norm": 1.0153521926215252,
"learning_rate": 9.535254430079864e-05,
"loss": 0.9867,
"step": 1290
},
{
"epoch": 5.655021834061135,
"grad_norm": 1.1345238181227135,
"learning_rate": 9.459135766444815e-05,
"loss": 1.0027,
"step": 1295
},
{
"epoch": 5.676855895196507,
"grad_norm": 1.1187108189925221,
"learning_rate": 9.383048520187344e-05,
"loss": 0.9987,
"step": 1300
},
{
"epoch": 5.698689956331878,
"grad_norm": 1.0636989170846824,
"learning_rate": 9.306997111015014e-05,
"loss": 1.0486,
"step": 1305
},
{
"epoch": 5.720524017467249,
"grad_norm": 1.0706396029944643,
"learning_rate": 9.23098595655371e-05,
"loss": 0.9931,
"step": 1310
},
{
"epoch": 5.74235807860262,
"grad_norm": 1.0475360658888055,
"learning_rate": 9.155019472091022e-05,
"loss": 0.9749,
"step": 1315
},
{
"epoch": 5.764192139737991,
"grad_norm": 1.0214139834738702,
"learning_rate": 9.079102070319786e-05,
"loss": 1.0693,
"step": 1320
},
{
"epoch": 5.786026200873362,
"grad_norm": 1.0530973134949948,
"learning_rate": 9.003238161081743e-05,
"loss": 1.0228,
"step": 1325
},
{
"epoch": 5.807860262008734,
"grad_norm": 1.103677474707846,
"learning_rate": 8.9274321511114e-05,
"loss": 0.9761,
"step": 1330
},
{
"epoch": 5.829694323144105,
"grad_norm": 1.0644633976825475,
"learning_rate": 8.851688443780043e-05,
"loss": 1.0239,
"step": 1335
},
{
"epoch": 5.851528384279476,
"grad_norm": 1.0555551246349646,
"learning_rate": 8.776011438839977e-05,
"loss": 1.0473,
"step": 1340
},
{
"epoch": 5.873362445414847,
"grad_norm": 1.122056836899885,
"learning_rate": 8.70040553216892e-05,
"loss": 0.9723,
"step": 1345
},
{
"epoch": 5.895196506550218,
"grad_norm": 0.9963873601564418,
"learning_rate": 8.624875115514697e-05,
"loss": 1.0268,
"step": 1350
},
{
"epoch": 5.91703056768559,
"grad_norm": 1.0336940380478288,
"learning_rate": 8.549424576240102e-05,
"loss": 0.9574,
"step": 1355
},
{
"epoch": 5.93886462882096,
"grad_norm": 1.0760501661341102,
"learning_rate": 8.474058297068071e-05,
"loss": 1.0979,
"step": 1360
},
{
"epoch": 5.960698689956332,
"grad_norm": 1.0890942343064978,
"learning_rate": 8.398780655827096e-05,
"loss": 0.9427,
"step": 1365
},
{
"epoch": 5.9825327510917035,
"grad_norm": 1.1415589599070428,
"learning_rate": 8.323596025196911e-05,
"loss": 1.0041,
"step": 1370
},
{
"epoch": 6.004366812227074,
"grad_norm": 1.1509480354124522,
"learning_rate": 8.248508772454529e-05,
"loss": 0.9545,
"step": 1375
},
{
"epoch": 6.026200873362446,
"grad_norm": 1.4252192004046074,
"learning_rate": 8.173523259220521e-05,
"loss": 0.8584,
"step": 1380
},
{
"epoch": 6.048034934497816,
"grad_norm": 1.37449264500959,
"learning_rate": 8.098643841205685e-05,
"loss": 0.8417,
"step": 1385
},
{
"epoch": 6.069868995633188,
"grad_norm": 1.1455224268481572,
"learning_rate": 8.023874867958027e-05,
"loss": 0.8365,
"step": 1390
},
{
"epoch": 6.091703056768559,
"grad_norm": 1.4517829308762,
"learning_rate": 7.949220682610109e-05,
"loss": 0.8772,
"step": 1395
},
{
"epoch": 6.11353711790393,
"grad_norm": 1.2198167511326543,
"learning_rate": 7.874685621626767e-05,
"loss": 0.7638,
"step": 1400
},
{
"epoch": 6.135371179039302,
"grad_norm": 1.2285129578196883,
"learning_rate": 7.80027401455321e-05,
"loss": 0.8632,
"step": 1405
},
{
"epoch": 6.157205240174672,
"grad_norm": 1.3128304307256697,
"learning_rate": 7.725990183763541e-05,
"loss": 0.7864,
"step": 1410
},
{
"epoch": 6.179039301310044,
"grad_norm": 1.2223078968334138,
"learning_rate": 7.651838444209678e-05,
"loss": 0.8107,
"step": 1415
},
{
"epoch": 6.200873362445415,
"grad_norm": 1.2116494464520935,
"learning_rate": 7.577823103170695e-05,
"loss": 0.7665,
"step": 1420
},
{
"epoch": 6.222707423580786,
"grad_norm": 1.2710441404657735,
"learning_rate": 7.503948460002651e-05,
"loss": 0.8755,
"step": 1425
},
{
"epoch": 6.244541484716157,
"grad_norm": 1.3090624013120402,
"learning_rate": 7.430218805888831e-05,
"loss": 0.8635,
"step": 1430
},
{
"epoch": 6.2663755458515285,
"grad_norm": 1.3421074934086965,
"learning_rate": 7.356638423590485e-05,
"loss": 0.8408,
"step": 1435
},
{
"epoch": 6.2882096069869,
"grad_norm": 1.2190845509165837,
"learning_rate": 7.283211587198056e-05,
"loss": 0.901,
"step": 1440
},
{
"epoch": 6.310043668122271,
"grad_norm": 1.2848215636192764,
"learning_rate": 7.209942561882914e-05,
"loss": 0.8183,
"step": 1445
},
{
"epoch": 6.331877729257642,
"grad_norm": 1.2782891875098124,
"learning_rate": 7.136835603649599e-05,
"loss": 0.8144,
"step": 1450
},
{
"epoch": 6.353711790393013,
"grad_norm": 1.3643650683015465,
"learning_rate": 7.0638949590886e-05,
"loss": 0.815,
"step": 1455
},
{
"epoch": 6.3755458515283845,
"grad_norm": 1.3747527932453862,
"learning_rate": 6.991124865129683e-05,
"loss": 0.8058,
"step": 1460
},
{
"epoch": 6.397379912663755,
"grad_norm": 1.324272429862148,
"learning_rate": 6.918529548795781e-05,
"loss": 0.8359,
"step": 1465
},
{
"epoch": 6.419213973799127,
"grad_norm": 1.329733488330745,
"learning_rate": 6.846113226957456e-05,
"loss": 0.8081,
"step": 1470
},
{
"epoch": 6.441048034934497,
"grad_norm": 1.3973165024504683,
"learning_rate": 6.773880106087945e-05,
"loss": 0.9255,
"step": 1475
},
{
"epoch": 6.462882096069869,
"grad_norm": 1.2579802684349493,
"learning_rate": 6.701834382018832e-05,
"loss": 0.8932,
"step": 1480
},
{
"epoch": 6.4847161572052405,
"grad_norm": 1.3102021281865468,
"learning_rate": 6.629980239696315e-05,
"loss": 0.8651,
"step": 1485
},
{
"epoch": 6.506550218340611,
"grad_norm": 1.3096833467828455,
"learning_rate": 6.558321852938099e-05,
"loss": 0.8145,
"step": 1490
},
{
"epoch": 6.528384279475983,
"grad_norm": 1.3466234774293075,
"learning_rate": 6.486863384190987e-05,
"loss": 0.8885,
"step": 1495
},
{
"epoch": 6.550218340611353,
"grad_norm": 1.296177045867574,
"learning_rate": 6.415608984289052e-05,
"loss": 0.8546,
"step": 1500
},
{
"epoch": 6.572052401746725,
"grad_norm": 1.2633240958805778,
"learning_rate": 6.344562792212554e-05,
"loss": 0.8685,
"step": 1505
},
{
"epoch": 6.593886462882097,
"grad_norm": 1.3534959976317207,
"learning_rate": 6.273728934847516e-05,
"loss": 0.7986,
"step": 1510
},
{
"epoch": 6.615720524017467,
"grad_norm": 1.252619828877404,
"learning_rate": 6.203111526745985e-05,
"loss": 0.8332,
"step": 1515
},
{
"epoch": 6.637554585152839,
"grad_norm": 1.3249258619095206,
"learning_rate": 6.132714669887044e-05,
"loss": 0.8308,
"step": 1520
},
{
"epoch": 6.6593886462882095,
"grad_norm": 1.2054299066608396,
"learning_rate": 6.0625424534385425e-05,
"loss": 0.8697,
"step": 1525
},
{
"epoch": 6.681222707423581,
"grad_norm": 1.1955192517041973,
"learning_rate": 5.99259895351955e-05,
"loss": 0.8591,
"step": 1530
},
{
"epoch": 6.703056768558952,
"grad_norm": 1.2570222378899258,
"learning_rate": 5.9228882329636094e-05,
"loss": 0.7953,
"step": 1535
},
{
"epoch": 6.724890829694323,
"grad_norm": 1.3956159571082007,
"learning_rate": 5.8534143410827104e-05,
"loss": 0.8367,
"step": 1540
},
{
"epoch": 6.746724890829694,
"grad_norm": 1.3649251909287845,
"learning_rate": 5.7841813134320975e-05,
"loss": 0.8553,
"step": 1545
},
{
"epoch": 6.7685589519650655,
"grad_norm": 1.3726673140188062,
"learning_rate": 5.715193171575842e-05,
"loss": 0.8649,
"step": 1550
},
{
"epoch": 6.790393013100436,
"grad_norm": 1.2184325480564675,
"learning_rate": 5.64645392285325e-05,
"loss": 0.8222,
"step": 1555
},
{
"epoch": 6.812227074235808,
"grad_norm": 1.3383861408747,
"learning_rate": 5.577967560146077e-05,
"loss": 0.851,
"step": 1560
},
{
"epoch": 6.834061135371179,
"grad_norm": 1.3171908370457348,
"learning_rate": 5.5097380616466057e-05,
"loss": 0.8662,
"step": 1565
},
{
"epoch": 6.85589519650655,
"grad_norm": 1.3054511507141975,
"learning_rate": 5.4417693906265365e-05,
"loss": 0.8979,
"step": 1570
},
{
"epoch": 6.877729257641922,
"grad_norm": 1.2387745464762083,
"learning_rate": 5.374065495206805e-05,
"loss": 0.8119,
"step": 1575
},
{
"epoch": 6.899563318777292,
"grad_norm": 1.3948067425415336,
"learning_rate": 5.306630308128229e-05,
"loss": 0.8409,
"step": 1580
},
{
"epoch": 6.921397379912664,
"grad_norm": 1.3502073303444249,
"learning_rate": 5.239467746523048e-05,
"loss": 0.8391,
"step": 1585
},
{
"epoch": 6.9432314410480345,
"grad_norm": 1.3334535387387385,
"learning_rate": 5.172581711687438e-05,
"loss": 0.8577,
"step": 1590
},
{
"epoch": 6.965065502183406,
"grad_norm": 1.3381286547460995,
"learning_rate": 5.105976088854842e-05,
"loss": 0.8925,
"step": 1595
},
{
"epoch": 6.986899563318778,
"grad_norm": 1.2107404270853181,
"learning_rate": 5.0396547469703106e-05,
"loss": 0.8894,
"step": 1600
},
{
"epoch": 7.008733624454148,
"grad_norm": 1.2856577830397042,
"learning_rate": 4.973621538465768e-05,
"loss": 0.8269,
"step": 1605
},
{
"epoch": 7.03056768558952,
"grad_norm": 1.5720811121732976,
"learning_rate": 4.907880299036234e-05,
"loss": 0.6532,
"step": 1610
},
{
"epoch": 7.0524017467248905,
"grad_norm": 1.513790910492325,
"learning_rate": 4.8424348474170014e-05,
"loss": 0.6398,
"step": 1615
},
{
"epoch": 7.074235807860262,
"grad_norm": 1.3731923874867311,
"learning_rate": 4.7772889851618405e-05,
"loss": 0.7323,
"step": 1620
},
{
"epoch": 7.096069868995633,
"grad_norm": 1.3538833010462115,
"learning_rate": 4.712446496422165e-05,
"loss": 0.6906,
"step": 1625
},
{
"epoch": 7.117903930131004,
"grad_norm": 1.4454754507310241,
"learning_rate": 4.647911147727209e-05,
"loss": 0.7328,
"step": 1630
},
{
"epoch": 7.139737991266376,
"grad_norm": 1.449171514767732,
"learning_rate": 4.583686687765264e-05,
"loss": 0.6782,
"step": 1635
},
{
"epoch": 7.1615720524017465,
"grad_norm": 1.6050731544814476,
"learning_rate": 4.5197768471659104e-05,
"loss": 0.7385,
"step": 1640
},
{
"epoch": 7.183406113537118,
"grad_norm": 1.3388820740639182,
"learning_rate": 4.4561853382833206e-05,
"loss": 0.6937,
"step": 1645
},
{
"epoch": 7.205240174672489,
"grad_norm": 1.3724590934014314,
"learning_rate": 4.3929158549806096e-05,
"loss": 0.6899,
"step": 1650
},
{
"epoch": 7.22707423580786,
"grad_norm": 1.5165378554181088,
"learning_rate": 4.32997207241528e-05,
"loss": 0.7044,
"step": 1655
},
{
"epoch": 7.248908296943231,
"grad_norm": 1.6117091341741596,
"learning_rate": 4.267357646825746e-05,
"loss": 0.7093,
"step": 1660
},
{
"epoch": 7.270742358078603,
"grad_norm": 1.4490430208656846,
"learning_rate": 4.205076215318925e-05,
"loss": 0.6967,
"step": 1665
},
{
"epoch": 7.292576419213974,
"grad_norm": 1.469380632694007,
"learning_rate": 4.143131395658996e-05,
"loss": 0.7164,
"step": 1670
},
{
"epoch": 7.314410480349345,
"grad_norm": 1.519813749480573,
"learning_rate": 4.081526786057254e-05,
"loss": 0.6724,
"step": 1675
},
{
"epoch": 7.336244541484716,
"grad_norm": 1.4588612796193572,
"learning_rate": 4.020265964963066e-05,
"loss": 0.731,
"step": 1680
},
{
"epoch": 7.358078602620087,
"grad_norm": 1.429975103899558,
"learning_rate": 3.9593524908560464e-05,
"loss": 0.7327,
"step": 1685
},
{
"epoch": 7.379912663755459,
"grad_norm": 1.6113371519439155,
"learning_rate": 3.898789902039338e-05,
"loss": 0.709,
"step": 1690
},
{
"epoch": 7.401746724890829,
"grad_norm": 1.5463250020435173,
"learning_rate": 3.8385817164340723e-05,
"loss": 0.7246,
"step": 1695
},
{
"epoch": 7.423580786026201,
"grad_norm": 1.463616658449462,
"learning_rate": 3.778731431375041e-05,
"loss": 0.7013,
"step": 1700
},
{
"epoch": 7.445414847161572,
"grad_norm": 1.5566283347360372,
"learning_rate": 3.719242523407539e-05,
"loss": 0.7344,
"step": 1705
},
{
"epoch": 7.467248908296943,
"grad_norm": 1.4398876621434327,
"learning_rate": 3.6601184480854066e-05,
"loss": 0.7323,
"step": 1710
},
{
"epoch": 7.489082969432315,
"grad_norm": 1.5252195085267406,
"learning_rate": 3.601362639770328e-05,
"loss": 0.7091,
"step": 1715
},
{
"epoch": 7.510917030567685,
"grad_norm": 1.6881923973765582,
"learning_rate": 3.542978511432325e-05,
"loss": 0.7585,
"step": 1720
},
{
"epoch": 7.532751091703057,
"grad_norm": 1.450411669170985,
"learning_rate": 3.484969454451511e-05,
"loss": 0.7258,
"step": 1725
},
{
"epoch": 7.554585152838428,
"grad_norm": 1.523075629927926,
"learning_rate": 3.4273388384210855e-05,
"loss": 0.6716,
"step": 1730
},
{
"epoch": 7.576419213973799,
"grad_norm": 1.4565661875305633,
"learning_rate": 3.3700900109516184e-05,
"loss": 0.6586,
"step": 1735
},
{
"epoch": 7.598253275109171,
"grad_norm": 1.6164061159781367,
"learning_rate": 3.3132262974765906e-05,
"loss": 0.7123,
"step": 1740
},
{
"epoch": 7.620087336244541,
"grad_norm": 1.5157276652461606,
"learning_rate": 3.256751001059214e-05,
"loss": 0.723,
"step": 1745
},
{
"epoch": 7.641921397379913,
"grad_norm": 1.5600828466807661,
"learning_rate": 3.200667402200586e-05,
"loss": 0.7477,
"step": 1750
},
{
"epoch": 7.663755458515284,
"grad_norm": 1.5728766811598356,
"learning_rate": 3.144978758649133e-05,
"loss": 0.7001,
"step": 1755
},
{
"epoch": 7.685589519650655,
"grad_norm": 1.458458820326098,
"learning_rate": 3.0896883052113525e-05,
"loss": 0.7066,
"step": 1760
},
{
"epoch": 7.707423580786026,
"grad_norm": 1.5980960571332508,
"learning_rate": 3.034799253563939e-05,
"loss": 0.6878,
"step": 1765
},
{
"epoch": 7.729257641921397,
"grad_norm": 1.544990160417705,
"learning_rate": 2.9803147920672146e-05,
"loss": 0.6894,
"step": 1770
},
{
"epoch": 7.751091703056769,
"grad_norm": 1.6353637199811462,
"learning_rate": 2.9262380855799164e-05,
"loss": 0.7297,
"step": 1775
},
{
"epoch": 7.77292576419214,
"grad_norm": 1.5830327061313785,
"learning_rate": 2.872572275275379e-05,
"loss": 0.6983,
"step": 1780
},
{
"epoch": 7.794759825327511,
"grad_norm": 1.4630236788620592,
"learning_rate": 2.8193204784590597e-05,
"loss": 0.7176,
"step": 1785
},
{
"epoch": 7.816593886462882,
"grad_norm": 1.3928429612080049,
"learning_rate": 2.766485788387455e-05,
"loss": 0.7269,
"step": 1790
},
{
"epoch": 7.8384279475982535,
"grad_norm": 1.5318894677152983,
"learning_rate": 2.7140712740884376e-05,
"loss": 0.7094,
"step": 1795
},
{
"epoch": 7.860262008733624,
"grad_norm": 1.5006081477924398,
"learning_rate": 2.6620799801829765e-05,
"loss": 0.7356,
"step": 1800
},
{
"epoch": 7.882096069868996,
"grad_norm": 1.4913103972222401,
"learning_rate": 2.610514926708285e-05,
"loss": 0.7563,
"step": 1805
},
{
"epoch": 7.903930131004367,
"grad_norm": 1.5640971425352013,
"learning_rate": 2.5593791089423858e-05,
"loss": 0.6974,
"step": 1810
},
{
"epoch": 7.925764192139738,
"grad_norm": 1.5288946416263425,
"learning_rate": 2.5086754972301384e-05,
"loss": 0.7597,
"step": 1815
},
{
"epoch": 7.9475982532751095,
"grad_norm": 1.4850580236939783,
"learning_rate": 2.4584070368106928e-05,
"loss": 0.731,
"step": 1820
},
{
"epoch": 7.96943231441048,
"grad_norm": 1.5394321618591782,
"learning_rate": 2.4085766476463967e-05,
"loss": 0.712,
"step": 1825
},
{
"epoch": 7.991266375545852,
"grad_norm": 1.6849009773020913,
"learning_rate": 2.3591872242532066e-05,
"loss": 0.7327,
"step": 1830
},
{
"epoch": 8.013100436681222,
"grad_norm": 1.4739518847631212,
"learning_rate": 2.310241635532531e-05,
"loss": 0.6777,
"step": 1835
},
{
"epoch": 8.034934497816593,
"grad_norm": 1.4710344726002955,
"learning_rate": 2.2617427246045973e-05,
"loss": 0.5886,
"step": 1840
},
{
"epoch": 8.056768558951966,
"grad_norm": 1.8620287227429924,
"learning_rate": 2.2136933086432955e-05,
"loss": 0.6258,
"step": 1845
},
{
"epoch": 8.078602620087336,
"grad_norm": 1.4930799825826104,
"learning_rate": 2.1660961787125388e-05,
"loss": 0.6041,
"step": 1850
},
{
"epoch": 8.100436681222707,
"grad_norm": 1.469008850941141,
"learning_rate": 2.1189540996041313e-05,
"loss": 0.647,
"step": 1855
},
{
"epoch": 8.12227074235808,
"grad_norm": 1.6154095310822976,
"learning_rate": 2.0722698096771832e-05,
"loss": 0.5866,
"step": 1860
},
{
"epoch": 8.14410480349345,
"grad_norm": 1.7217187185906804,
"learning_rate": 2.026046020699035e-05,
"loss": 0.6718,
"step": 1865
},
{
"epoch": 8.16593886462882,
"grad_norm": 1.5816769270478201,
"learning_rate": 1.980285417687735e-05,
"loss": 0.6303,
"step": 1870
},
{
"epoch": 8.187772925764191,
"grad_norm": 1.5330680888020691,
"learning_rate": 1.9349906587560862e-05,
"loss": 0.6166,
"step": 1875
},
{
"epoch": 8.209606986899564,
"grad_norm": 1.4811035124378678,
"learning_rate": 1.8901643749572374e-05,
"loss": 0.6245,
"step": 1880
},
{
"epoch": 8.231441048034934,
"grad_norm": 1.6432943675024339,
"learning_rate": 1.8458091701318504e-05,
"loss": 0.6261,
"step": 1885
},
{
"epoch": 8.253275109170305,
"grad_norm": 1.6763677385421005,
"learning_rate": 1.801927620756847e-05,
"loss": 0.6468,
"step": 1890
},
{
"epoch": 8.275109170305678,
"grad_norm": 1.5425807262553166,
"learning_rate": 1.7585222757957576e-05,
"loss": 0.6059,
"step": 1895
},
{
"epoch": 8.296943231441048,
"grad_norm": 1.7341819757802275,
"learning_rate": 1.7155956565506547e-05,
"loss": 0.6728,
"step": 1900
},
{
"epoch": 8.318777292576419,
"grad_norm": 1.4483276727621572,
"learning_rate": 1.6731502565156875e-05,
"loss": 0.6033,
"step": 1905
},
{
"epoch": 8.34061135371179,
"grad_norm": 1.5596252995782947,
"learning_rate": 1.6311885412322602e-05,
"loss": 0.63,
"step": 1910
},
{
"epoch": 8.362445414847162,
"grad_norm": 1.747814876036389,
"learning_rate": 1.5897129481457996e-05,
"loss": 0.5621,
"step": 1915
},
{
"epoch": 8.384279475982533,
"grad_norm": 1.5550499771354138,
"learning_rate": 1.5487258864641717e-05,
"loss": 0.6306,
"step": 1920
},
{
"epoch": 8.406113537117903,
"grad_norm": 1.5708977251660243,
"learning_rate": 1.50822973701775e-05,
"loss": 0.6281,
"step": 1925
},
{
"epoch": 8.427947598253276,
"grad_norm": 1.5052322890768695,
"learning_rate": 1.4682268521211073e-05,
"loss": 0.5805,
"step": 1930
},
{
"epoch": 8.449781659388647,
"grad_norm": 1.5682376306714874,
"learning_rate": 1.4287195554363718e-05,
"loss": 0.6103,
"step": 1935
},
{
"epoch": 8.471615720524017,
"grad_norm": 1.5926672273219815,
"learning_rate": 1.3897101418382663e-05,
"loss": 0.6086,
"step": 1940
},
{
"epoch": 8.493449781659388,
"grad_norm": 1.6920842457267133,
"learning_rate": 1.3512008772807993e-05,
"loss": 0.6075,
"step": 1945
},
{
"epoch": 8.51528384279476,
"grad_norm": 1.7066301478594386,
"learning_rate": 1.3131939986656305e-05,
"loss": 0.6037,
"step": 1950
},
{
"epoch": 8.537117903930131,
"grad_norm": 1.6263309583877439,
"learning_rate": 1.2756917137121527e-05,
"loss": 0.6137,
"step": 1955
},
{
"epoch": 8.558951965065502,
"grad_norm": 1.63476708848148,
"learning_rate": 1.2386962008292413e-05,
"loss": 0.5858,
"step": 1960
},
{
"epoch": 8.580786026200874,
"grad_norm": 1.558434256481925,
"learning_rate": 1.2022096089887191e-05,
"loss": 0.6426,
"step": 1965
},
{
"epoch": 8.602620087336245,
"grad_norm": 1.551619834712178,
"learning_rate": 1.1662340576005216e-05,
"loss": 0.6084,
"step": 1970
},
{
"epoch": 8.624454148471616,
"grad_norm": 1.555097964766268,
"learning_rate": 1.130771636389596e-05,
"loss": 0.6687,
"step": 1975
},
{
"epoch": 8.646288209606986,
"grad_norm": 1.6643047208497839,
"learning_rate": 1.0958244052745126e-05,
"loss": 0.6155,
"step": 1980
},
{
"epoch": 8.668122270742359,
"grad_norm": 1.611005114630636,
"learning_rate": 1.0613943942478e-05,
"loss": 0.6089,
"step": 1985
},
{
"epoch": 8.68995633187773,
"grad_norm": 1.5244955674562928,
"learning_rate": 1.0274836032580415e-05,
"loss": 0.6487,
"step": 1990
},
{
"epoch": 8.7117903930131,
"grad_norm": 1.6030907080208872,
"learning_rate": 9.940940020936951e-06,
"loss": 0.6293,
"step": 1995
},
{
"epoch": 8.733624454148472,
"grad_norm": 1.5700446980317904,
"learning_rate": 9.612275302686713e-06,
"loss": 0.6326,
"step": 2000
},
{
"epoch": 8.755458515283843,
"grad_norm": 1.5394752226532917,
"learning_rate": 9.288860969096857e-06,
"loss": 0.6107,
"step": 2005
},
{
"epoch": 8.777292576419214,
"grad_norm": 1.667355715027956,
"learning_rate": 8.970715806453489e-06,
"loss": 0.636,
"step": 2010
},
{
"epoch": 8.799126637554584,
"grad_norm": 1.7421757272877927,
"learning_rate": 8.657858294970412e-06,
"loss": 0.6358,
"step": 2015
},
{
"epoch": 8.820960698689957,
"grad_norm": 1.48526713960321,
"learning_rate": 8.350306607715774e-06,
"loss": 0.6456,
"step": 2020
},
{
"epoch": 8.842794759825328,
"grad_norm": 1.7150442937256956,
"learning_rate": 8.048078609556386e-06,
"loss": 0.6443,
"step": 2025
},
{
"epoch": 8.864628820960698,
"grad_norm": 1.6511037454437418,
"learning_rate": 7.751191856119932e-06,
"loss": 0.671,
"step": 2030
},
{
"epoch": 8.886462882096069,
"grad_norm": 1.6987081532253472,
"learning_rate": 7.459663592775334e-06,
"loss": 0.6577,
"step": 2035
},
{
"epoch": 8.908296943231441,
"grad_norm": 1.534952723839,
"learning_rate": 7.173510753630919e-06,
"loss": 0.6233,
"step": 2040
},
{
"epoch": 8.930131004366812,
"grad_norm": 1.641736331583932,
"learning_rate": 6.892749960550815e-06,
"loss": 0.6289,
"step": 2045
},
{
"epoch": 8.951965065502183,
"grad_norm": 1.5438647483260877,
"learning_rate": 6.6173975221893615e-06,
"loss": 0.5888,
"step": 2050
},
{
"epoch": 8.973799126637555,
"grad_norm": 1.5243321966646095,
"learning_rate": 6.347469433043851e-06,
"loss": 0.6707,
"step": 2055
},
{
"epoch": 8.995633187772926,
"grad_norm": 1.6062799594767991,
"learning_rate": 6.082981372525487e-06,
"loss": 0.5971,
"step": 2060
},
{
"epoch": 9.017467248908297,
"grad_norm": 1.462091614630793,
"learning_rate": 5.823948704048443e-06,
"loss": 0.5631,
"step": 2065
},
{
"epoch": 9.039301310043669,
"grad_norm": 1.4923902769131498,
"learning_rate": 5.570386474137623e-06,
"loss": 0.5617,
"step": 2070
},
{
"epoch": 9.06113537117904,
"grad_norm": 1.6248158642620525,
"learning_rate": 5.322309411554582e-06,
"loss": 0.6111,
"step": 2075
},
{
"epoch": 9.08296943231441,
"grad_norm": 1.6615679454812948,
"learning_rate": 5.0797319264419105e-06,
"loss": 0.563,
"step": 2080
},
{
"epoch": 9.104803493449781,
"grad_norm": 1.6126706246375568,
"learning_rate": 4.84266810948627e-06,
"loss": 0.5686,
"step": 2085
},
{
"epoch": 9.126637554585153,
"grad_norm": 1.5809643369161743,
"learning_rate": 4.611131731099905e-06,
"loss": 0.5533,
"step": 2090
},
{
"epoch": 9.148471615720524,
"grad_norm": 1.6161716990207524,
"learning_rate": 4.385136240620657e-06,
"loss": 0.5962,
"step": 2095
},
{
"epoch": 9.170305676855895,
"grad_norm": 1.5154717234389132,
"learning_rate": 4.164694765530841e-06,
"loss": 0.5946,
"step": 2100
},
{
"epoch": 9.192139737991265,
"grad_norm": 1.5768496797034472,
"learning_rate": 3.94982011069468e-06,
"loss": 0.5383,
"step": 2105
},
{
"epoch": 9.213973799126638,
"grad_norm": 1.6583634682510404,
"learning_rate": 3.7405247576144054e-06,
"loss": 0.6018,
"step": 2110
},
{
"epoch": 9.235807860262009,
"grad_norm": 1.5801691223578864,
"learning_rate": 3.5368208637053702e-06,
"loss": 0.5564,
"step": 2115
},
{
"epoch": 9.25764192139738,
"grad_norm": 1.702349073192532,
"learning_rate": 3.338720261589823e-06,
"loss": 0.578,
"step": 2120
},
{
"epoch": 9.279475982532752,
"grad_norm": 1.5184620025021562,
"learning_rate": 3.146234458409525e-06,
"loss": 0.5649,
"step": 2125
},
{
"epoch": 9.301310043668122,
"grad_norm": 1.4782870440067606,
"learning_rate": 2.959374635157364e-06,
"loss": 0.5708,
"step": 2130
},
{
"epoch": 9.323144104803493,
"grad_norm": 1.5908300327141753,
"learning_rate": 2.7781516460279157e-06,
"loss": 0.5719,
"step": 2135
},
{
"epoch": 9.344978165938866,
"grad_norm": 1.653067982991365,
"learning_rate": 2.6025760177869063e-06,
"loss": 0.5914,
"step": 2140
},
{
"epoch": 9.366812227074236,
"grad_norm": 1.5475103396862675,
"learning_rate": 2.4326579491597333e-06,
"loss": 0.5931,
"step": 2145
},
{
"epoch": 9.388646288209607,
"grad_norm": 1.6475600779910462,
"learning_rate": 2.2684073102391066e-06,
"loss": 0.5978,
"step": 2150
},
{
"epoch": 9.410480349344978,
"grad_norm": 1.5891384582888524,
"learning_rate": 2.1098336419116625e-06,
"loss": 0.5656,
"step": 2155
},
{
"epoch": 9.43231441048035,
"grad_norm": 1.5178759944544706,
"learning_rate": 1.956946155303785e-06,
"loss": 0.6198,
"step": 2160
},
{
"epoch": 9.45414847161572,
"grad_norm": 1.641246543950594,
"learning_rate": 1.809753731246544e-06,
"loss": 0.5829,
"step": 2165
},
{
"epoch": 9.475982532751091,
"grad_norm": 1.665331469610374,
"learning_rate": 1.6682649197598433e-06,
"loss": 0.5871,
"step": 2170
},
{
"epoch": 9.497816593886462,
"grad_norm": 1.823654408381571,
"learning_rate": 1.5324879395557933e-06,
"loss": 0.5906,
"step": 2175
},
{
"epoch": 9.519650655021834,
"grad_norm": 1.564912693530831,
"learning_rate": 1.4024306775612283e-06,
"loss": 0.6207,
"step": 2180
},
{
"epoch": 9.541484716157205,
"grad_norm": 1.5863861199272236,
"learning_rate": 1.2781006884596825e-06,
"loss": 0.6267,
"step": 2185
},
{
"epoch": 9.563318777292576,
"grad_norm": 1.6063472205433305,
"learning_rate": 1.1595051942524637e-06,
"loss": 0.5755,
"step": 2190
},
{
"epoch": 9.585152838427948,
"grad_norm": 1.5366202264694289,
"learning_rate": 1.0466510838392229e-06,
"loss": 0.5384,
"step": 2195
},
{
"epoch": 9.606986899563319,
"grad_norm": 1.61763818006577,
"learning_rate": 9.395449126177291e-07,
"loss": 0.6435,
"step": 2200
},
{
"epoch": 9.62882096069869,
"grad_norm": 1.5959225303743334,
"learning_rate": 8.381929021031409e-07,
"loss": 0.5587,
"step": 2205
},
{
"epoch": 9.65065502183406,
"grad_norm": 1.5317193865637022,
"learning_rate": 7.426009395665734e-07,
"loss": 0.6166,
"step": 2210
},
{
"epoch": 9.672489082969433,
"grad_norm": 1.7050859904255076,
"learning_rate": 6.527745776931382e-07,
"loss": 0.6078,
"step": 2215
},
{
"epoch": 9.694323144104803,
"grad_norm": 1.626127532872475,
"learning_rate": 5.687190342594239e-07,
"loss": 0.6022,
"step": 2220
},
{
"epoch": 9.716157205240174,
"grad_norm": 1.6752520068014993,
"learning_rate": 4.904391918303608e-07,
"loss": 0.6126,
"step": 2225
},
{
"epoch": 9.737991266375547,
"grad_norm": 1.6226696728292287,
"learning_rate": 4.1793959747565836e-07,
"loss": 0.5671,
"step": 2230
},
{
"epoch": 9.759825327510917,
"grad_norm": 1.6182868533567345,
"learning_rate": 3.5122446250562825e-07,
"loss": 0.5859,
"step": 2235
},
{
"epoch": 9.781659388646288,
"grad_norm": 1.686162675556006,
"learning_rate": 2.902976622265907e-07,
"loss": 0.5634,
"step": 2240
},
{
"epoch": 9.803493449781659,
"grad_norm": 1.5169542450593143,
"learning_rate": 2.3516273571577708e-07,
"loss": 0.5578,
"step": 2245
},
{
"epoch": 9.825327510917031,
"grad_norm": 1.6287222229266196,
"learning_rate": 1.8582288561573847e-07,
"loss": 0.5543,
"step": 2250
},
{
"epoch": 9.847161572052402,
"grad_norm": 1.4791363351235698,
"learning_rate": 1.4228097794828366e-07,
"loss": 0.5705,
"step": 2255
},
{
"epoch": 9.868995633187772,
"grad_norm": 1.5782969298269662,
"learning_rate": 1.045395419480677e-07,
"loss": 0.5742,
"step": 2260
},
{
"epoch": 9.890829694323145,
"grad_norm": 1.5934772169541918,
"learning_rate": 7.260076991560949e-08,
"loss": 0.6509,
"step": 2265
},
{
"epoch": 9.912663755458516,
"grad_norm": 1.5072353913111483,
"learning_rate": 4.646651708998251e-08,
"loss": 0.5509,
"step": 2270
},
{
"epoch": 9.934497816593886,
"grad_norm": 1.5448684833958992,
"learning_rate": 2.6138301541056564e-08,
"loss": 0.5666,
"step": 2275
},
{
"epoch": 9.956331877729257,
"grad_norm": 1.504682951530182,
"learning_rate": 1.1617304081268376e-08,
"loss": 0.5865,
"step": 2280
},
{
"epoch": 9.97816593886463,
"grad_norm": 1.5933513163764192,
"learning_rate": 2.9043681970875035e-09,
"loss": 0.5653,
"step": 2285
},
{
"epoch": 10.0,
"grad_norm": 1.5918112804762306,
"learning_rate": 0.0,
"loss": 0.5502,
"step": 2290
},
{
"epoch": 10.0,
"step": 2290,
"total_flos": 5211452815179776.0,
"train_loss": 1.1147898718779785,
"train_runtime": 4502.6314,
"train_samples_per_second": 32.519,
"train_steps_per_second": 0.509
}
],
"logging_steps": 5,
"max_steps": 2290,
"num_input_tokens_seen": 0,
"num_train_epochs": 10,
"save_steps": 100,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": true
},
"attributes": {}
}
},
"total_flos": 5211452815179776.0,
"train_batch_size": 8,
"trial_name": null,
"trial_params": null
}