zephyr-7b-sft-full / trainer_state.json
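The JSON below is the Trainer checkpoint state for the SFT run: `log_history` records the loss, gradient norm, and learning rate every 5 optimizer steps for one epoch (3653 steps total). As a minimal sketch (not part of the original file), the log can be loaded and the loss curve plotted as follows; it assumes the complete file has been downloaded locally as `trainer_state.json`, and the field names (`log_history`, `step`, `loss`) are taken directly from the data shown here.

```python
# Sketch: read trainer_state.json and plot the logged training loss.
import json

import matplotlib.pyplot as plt

with open("trainer_state.json") as f:
    state = json.load(f)

# Each log_history entry holds the metrics recorded at one logging step.
steps = [e["step"] for e in state["log_history"] if "loss" in e]
losses = [e["loss"] for e in state["log_history"] if "loss" in e]

plt.plot(steps, losses)
plt.xlabel("global step")
plt.ylabel("training loss")
plt.title("zephyr-7b-sft-full SFT loss")
plt.show()
```
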
{
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 1.0,
"eval_steps": 500,
"global_step": 3653,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.001368738023542294,
"grad_norm": 1.1110930744661907,
"learning_rate": 2.73224043715847e-07,
"loss": 1.71,
"step": 5
},
{
"epoch": 0.002737476047084588,
"grad_norm": 1.0450218593816867,
"learning_rate": 5.46448087431694e-07,
"loss": 1.7206,
"step": 10
},
{
"epoch": 0.004106214070626882,
"grad_norm": 1.2104539142499893,
"learning_rate": 8.196721311475409e-07,
"loss": 1.8131,
"step": 15
},
{
"epoch": 0.005474952094169176,
"grad_norm": 1.1014228621000113,
"learning_rate": 1.092896174863388e-06,
"loss": 1.7571,
"step": 20
},
{
"epoch": 0.00684369011771147,
"grad_norm": 1.0213348170440788,
"learning_rate": 1.3661202185792352e-06,
"loss": 1.7504,
"step": 25
},
{
"epoch": 0.008212428141253765,
"grad_norm": 0.907763844914787,
"learning_rate": 1.6393442622950819e-06,
"loss": 1.726,
"step": 30
},
{
"epoch": 0.009581166164796057,
"grad_norm": 0.9051570945453739,
"learning_rate": 1.912568306010929e-06,
"loss": 1.7158,
"step": 35
},
{
"epoch": 0.010949904188338352,
"grad_norm": 0.7396508509648158,
"learning_rate": 2.185792349726776e-06,
"loss": 1.7692,
"step": 40
},
{
"epoch": 0.012318642211880646,
"grad_norm": 0.6316750205627214,
"learning_rate": 2.459016393442623e-06,
"loss": 1.7118,
"step": 45
},
{
"epoch": 0.01368738023542294,
"grad_norm": 0.6071126407750962,
"learning_rate": 2.7322404371584705e-06,
"loss": 1.6533,
"step": 50
},
{
"epoch": 0.015056118258965233,
"grad_norm": 0.6129917793535419,
"learning_rate": 3.0054644808743173e-06,
"loss": 1.6471,
"step": 55
},
{
"epoch": 0.01642485628250753,
"grad_norm": 0.6342687394025227,
"learning_rate": 3.2786885245901638e-06,
"loss": 1.6505,
"step": 60
},
{
"epoch": 0.017793594306049824,
"grad_norm": 0.5732154266113143,
"learning_rate": 3.551912568306011e-06,
"loss": 1.6261,
"step": 65
},
{
"epoch": 0.019162332329592115,
"grad_norm": 0.5200300967595174,
"learning_rate": 3.825136612021858e-06,
"loss": 1.5861,
"step": 70
},
{
"epoch": 0.02053107035313441,
"grad_norm": 0.5356030837166187,
"learning_rate": 4.098360655737705e-06,
"loss": 1.6002,
"step": 75
},
{
"epoch": 0.021899808376676703,
"grad_norm": 0.4893081141758957,
"learning_rate": 4.371584699453552e-06,
"loss": 1.5388,
"step": 80
},
{
"epoch": 0.023268546400218998,
"grad_norm": 0.5042585034665032,
"learning_rate": 4.6448087431694e-06,
"loss": 1.5788,
"step": 85
},
{
"epoch": 0.024637284423761292,
"grad_norm": 0.4756627065397568,
"learning_rate": 4.918032786885246e-06,
"loss": 1.5832,
"step": 90
},
{
"epoch": 0.026006022447303587,
"grad_norm": 0.4712228645477175,
"learning_rate": 5.191256830601094e-06,
"loss": 1.5221,
"step": 95
},
{
"epoch": 0.02737476047084588,
"grad_norm": 0.47712827241082845,
"learning_rate": 5.464480874316941e-06,
"loss": 1.5554,
"step": 100
},
{
"epoch": 0.028743498494388176,
"grad_norm": 0.4747245775681728,
"learning_rate": 5.737704918032787e-06,
"loss": 1.5051,
"step": 105
},
{
"epoch": 0.030112236517930466,
"grad_norm": 0.4619152125132191,
"learning_rate": 6.010928961748635e-06,
"loss": 1.5731,
"step": 110
},
{
"epoch": 0.031480974541472764,
"grad_norm": 0.46656513563983254,
"learning_rate": 6.284153005464482e-06,
"loss": 1.5719,
"step": 115
},
{
"epoch": 0.03284971256501506,
"grad_norm": 0.45441304688271805,
"learning_rate": 6.5573770491803276e-06,
"loss": 1.5359,
"step": 120
},
{
"epoch": 0.03421845058855735,
"grad_norm": 0.4492251581811985,
"learning_rate": 6.830601092896175e-06,
"loss": 1.5254,
"step": 125
},
{
"epoch": 0.03558718861209965,
"grad_norm": 0.4901256909260675,
"learning_rate": 7.103825136612022e-06,
"loss": 1.5531,
"step": 130
},
{
"epoch": 0.036955926635641935,
"grad_norm": 0.4521871797566739,
"learning_rate": 7.3770491803278695e-06,
"loss": 1.5203,
"step": 135
},
{
"epoch": 0.03832466465918423,
"grad_norm": 0.46301007561132373,
"learning_rate": 7.650273224043716e-06,
"loss": 1.5597,
"step": 140
},
{
"epoch": 0.039693402682726524,
"grad_norm": 0.4443455697452246,
"learning_rate": 7.923497267759564e-06,
"loss": 1.4751,
"step": 145
},
{
"epoch": 0.04106214070626882,
"grad_norm": 0.4529255518299648,
"learning_rate": 8.19672131147541e-06,
"loss": 1.5645,
"step": 150
},
{
"epoch": 0.04243087872981111,
"grad_norm": 0.45575204993786755,
"learning_rate": 8.469945355191259e-06,
"loss": 1.5297,
"step": 155
},
{
"epoch": 0.04379961675335341,
"grad_norm": 0.4619195990067295,
"learning_rate": 8.743169398907103e-06,
"loss": 1.4945,
"step": 160
},
{
"epoch": 0.0451683547768957,
"grad_norm": 0.43620516601045234,
"learning_rate": 9.016393442622952e-06,
"loss": 1.5064,
"step": 165
},
{
"epoch": 0.046537092800437996,
"grad_norm": 0.48507057683395305,
"learning_rate": 9.2896174863388e-06,
"loss": 1.4873,
"step": 170
},
{
"epoch": 0.04790583082398029,
"grad_norm": 0.4591779126577447,
"learning_rate": 9.562841530054644e-06,
"loss": 1.4989,
"step": 175
},
{
"epoch": 0.049274568847522585,
"grad_norm": 0.4477837953889451,
"learning_rate": 9.836065573770493e-06,
"loss": 1.4755,
"step": 180
},
{
"epoch": 0.05064330687106488,
"grad_norm": 0.4580627295319435,
"learning_rate": 1.0109289617486339e-05,
"loss": 1.4545,
"step": 185
},
{
"epoch": 0.05201204489460717,
"grad_norm": 0.45313033315292356,
"learning_rate": 1.0382513661202187e-05,
"loss": 1.4985,
"step": 190
},
{
"epoch": 0.05338078291814947,
"grad_norm": 0.42708720087605334,
"learning_rate": 1.0655737704918034e-05,
"loss": 1.4229,
"step": 195
},
{
"epoch": 0.05474952094169176,
"grad_norm": 0.4211311153329928,
"learning_rate": 1.0928961748633882e-05,
"loss": 1.4669,
"step": 200
},
{
"epoch": 0.05611825896523406,
"grad_norm": 0.4553582543296074,
"learning_rate": 1.1202185792349727e-05,
"loss": 1.4444,
"step": 205
},
{
"epoch": 0.05748699698877635,
"grad_norm": 0.43888054055275466,
"learning_rate": 1.1475409836065575e-05,
"loss": 1.4622,
"step": 210
},
{
"epoch": 0.058855735012318645,
"grad_norm": 0.43839040680573754,
"learning_rate": 1.1748633879781421e-05,
"loss": 1.3921,
"step": 215
},
{
"epoch": 0.06022447303586093,
"grad_norm": 0.442929256824184,
"learning_rate": 1.202185792349727e-05,
"loss": 1.4695,
"step": 220
},
{
"epoch": 0.06159321105940323,
"grad_norm": 0.42979267539703714,
"learning_rate": 1.2295081967213116e-05,
"loss": 1.4444,
"step": 225
},
{
"epoch": 0.06296194908294553,
"grad_norm": 0.43942113048062365,
"learning_rate": 1.2568306010928964e-05,
"loss": 1.4581,
"step": 230
},
{
"epoch": 0.06433068710648782,
"grad_norm": 0.43105155511527676,
"learning_rate": 1.284153005464481e-05,
"loss": 1.4335,
"step": 235
},
{
"epoch": 0.06569942513003012,
"grad_norm": 0.4347334195115935,
"learning_rate": 1.3114754098360655e-05,
"loss": 1.5437,
"step": 240
},
{
"epoch": 0.0670681631535724,
"grad_norm": 0.41558733251657665,
"learning_rate": 1.3387978142076503e-05,
"loss": 1.4246,
"step": 245
},
{
"epoch": 0.0684369011771147,
"grad_norm": 0.4166574660442,
"learning_rate": 1.366120218579235e-05,
"loss": 1.4672,
"step": 250
},
{
"epoch": 0.069805639200657,
"grad_norm": 0.40343261677226544,
"learning_rate": 1.3934426229508198e-05,
"loss": 1.4695,
"step": 255
},
{
"epoch": 0.0711743772241993,
"grad_norm": 0.41945935612278784,
"learning_rate": 1.4207650273224044e-05,
"loss": 1.4173,
"step": 260
},
{
"epoch": 0.07254311524774158,
"grad_norm": 0.4091410585881606,
"learning_rate": 1.4480874316939892e-05,
"loss": 1.3669,
"step": 265
},
{
"epoch": 0.07391185327128387,
"grad_norm": 0.40428264801464686,
"learning_rate": 1.4754098360655739e-05,
"loss": 1.4712,
"step": 270
},
{
"epoch": 0.07528059129482617,
"grad_norm": 0.4074017153449847,
"learning_rate": 1.5027322404371585e-05,
"loss": 1.4477,
"step": 275
},
{
"epoch": 0.07664932931836846,
"grad_norm": 0.41299926058270664,
"learning_rate": 1.5300546448087432e-05,
"loss": 1.4015,
"step": 280
},
{
"epoch": 0.07801806734191076,
"grad_norm": 0.39622800283246956,
"learning_rate": 1.5573770491803278e-05,
"loss": 1.443,
"step": 285
},
{
"epoch": 0.07938680536545305,
"grad_norm": 0.4050370810855124,
"learning_rate": 1.5846994535519128e-05,
"loss": 1.4454,
"step": 290
},
{
"epoch": 0.08075554338899535,
"grad_norm": 0.405591352468174,
"learning_rate": 1.6120218579234975e-05,
"loss": 1.4077,
"step": 295
},
{
"epoch": 0.08212428141253764,
"grad_norm": 0.4188674346008248,
"learning_rate": 1.639344262295082e-05,
"loss": 1.4503,
"step": 300
},
{
"epoch": 0.08349301943607994,
"grad_norm": 0.4015518431467613,
"learning_rate": 1.6666666666666667e-05,
"loss": 1.4027,
"step": 305
},
{
"epoch": 0.08486175745962223,
"grad_norm": 0.40825550305371383,
"learning_rate": 1.6939890710382517e-05,
"loss": 1.4021,
"step": 310
},
{
"epoch": 0.08623049548316453,
"grad_norm": 0.4103687720629476,
"learning_rate": 1.721311475409836e-05,
"loss": 1.4304,
"step": 315
},
{
"epoch": 0.08759923350670681,
"grad_norm": 0.4048205442390437,
"learning_rate": 1.7486338797814207e-05,
"loss": 1.4,
"step": 320
},
{
"epoch": 0.08896797153024912,
"grad_norm": 0.41661892344943224,
"learning_rate": 1.7759562841530057e-05,
"loss": 1.4035,
"step": 325
},
{
"epoch": 0.0903367095537914,
"grad_norm": 0.4308332237873975,
"learning_rate": 1.8032786885245903e-05,
"loss": 1.4302,
"step": 330
},
{
"epoch": 0.0917054475773337,
"grad_norm": 0.41012677210314796,
"learning_rate": 1.830601092896175e-05,
"loss": 1.3969,
"step": 335
},
{
"epoch": 0.09307418560087599,
"grad_norm": 0.40743633096989823,
"learning_rate": 1.85792349726776e-05,
"loss": 1.4425,
"step": 340
},
{
"epoch": 0.09444292362441829,
"grad_norm": 0.39899819661715447,
"learning_rate": 1.8852459016393446e-05,
"loss": 1.4303,
"step": 345
},
{
"epoch": 0.09581166164796058,
"grad_norm": 0.41870768132807307,
"learning_rate": 1.912568306010929e-05,
"loss": 1.4002,
"step": 350
},
{
"epoch": 0.09718039967150287,
"grad_norm": 0.40412869941796903,
"learning_rate": 1.939890710382514e-05,
"loss": 1.403,
"step": 355
},
{
"epoch": 0.09854913769504517,
"grad_norm": 0.40478167238457397,
"learning_rate": 1.9672131147540985e-05,
"loss": 1.4227,
"step": 360
},
{
"epoch": 0.09991787571858746,
"grad_norm": 0.41341003482817107,
"learning_rate": 1.994535519125683e-05,
"loss": 1.4302,
"step": 365
},
{
"epoch": 0.10128661374212976,
"grad_norm": 0.41658269655258945,
"learning_rate": 1.999992692147127e-05,
"loss": 1.445,
"step": 370
},
{
"epoch": 0.10265535176567205,
"grad_norm": 0.40746899077621546,
"learning_rate": 1.999963004177886e-05,
"loss": 1.458,
"step": 375
},
{
"epoch": 0.10402408978921435,
"grad_norm": 0.3960893049170255,
"learning_rate": 1.9999104800289367e-05,
"loss": 1.4382,
"step": 380
},
{
"epoch": 0.10539282781275663,
"grad_norm": 0.3950983233516754,
"learning_rate": 1.9998351208997734e-05,
"loss": 1.4365,
"step": 385
},
{
"epoch": 0.10676156583629894,
"grad_norm": 0.38845890924227394,
"learning_rate": 1.9997369285113754e-05,
"loss": 1.3731,
"step": 390
},
{
"epoch": 0.10813030385984122,
"grad_norm": 0.39526066204949717,
"learning_rate": 1.9996159051061638e-05,
"loss": 1.4111,
"step": 395
},
{
"epoch": 0.10949904188338352,
"grad_norm": 0.4133763360642962,
"learning_rate": 1.9994720534479543e-05,
"loss": 1.4294,
"step": 400
},
{
"epoch": 0.11086777990692581,
"grad_norm": 0.39671784657884285,
"learning_rate": 1.999305376821889e-05,
"loss": 1.3708,
"step": 405
},
{
"epoch": 0.11223651793046811,
"grad_norm": 0.3847825062400942,
"learning_rate": 1.999115879034368e-05,
"loss": 1.4638,
"step": 410
},
{
"epoch": 0.1136052559540104,
"grad_norm": 0.41459555705408424,
"learning_rate": 1.9989035644129553e-05,
"loss": 1.4105,
"step": 415
},
{
"epoch": 0.1149739939775527,
"grad_norm": 0.4270268157501032,
"learning_rate": 1.998668437806286e-05,
"loss": 1.4243,
"step": 420
},
{
"epoch": 0.11634273200109499,
"grad_norm": 0.42083549100480927,
"learning_rate": 1.998410504583952e-05,
"loss": 1.4285,
"step": 425
},
{
"epoch": 0.11771147002463729,
"grad_norm": 0.3929543512101535,
"learning_rate": 1.998129770636381e-05,
"loss": 1.3692,
"step": 430
},
{
"epoch": 0.11908020804817958,
"grad_norm": 0.40163841769109054,
"learning_rate": 1.9978262423747003e-05,
"loss": 1.3842,
"step": 435
},
{
"epoch": 0.12044894607172187,
"grad_norm": 0.40750648380479343,
"learning_rate": 1.997499926730593e-05,
"loss": 1.4412,
"step": 440
},
{
"epoch": 0.12181768409526417,
"grad_norm": 0.4029670313800167,
"learning_rate": 1.9971508311561373e-05,
"loss": 1.4481,
"step": 445
},
{
"epoch": 0.12318642211880645,
"grad_norm": 0.3934576991168861,
"learning_rate": 1.996778963623637e-05,
"loss": 1.3706,
"step": 450
},
{
"epoch": 0.12455516014234876,
"grad_norm": 0.4126991514447529,
"learning_rate": 1.9963843326254406e-05,
"loss": 1.4096,
"step": 455
},
{
"epoch": 0.12592389816589106,
"grad_norm": 0.400896785065692,
"learning_rate": 1.9959669471737456e-05,
"loss": 1.3284,
"step": 460
},
{
"epoch": 0.12729263618943334,
"grad_norm": 0.4084884002554967,
"learning_rate": 1.9955268168003938e-05,
"loss": 1.4346,
"step": 465
},
{
"epoch": 0.12866137421297563,
"grad_norm": 0.4031311283565323,
"learning_rate": 1.9950639515566537e-05,
"loss": 1.3632,
"step": 470
},
{
"epoch": 0.13003011223651792,
"grad_norm": 0.4017916786924823,
"learning_rate": 1.99457836201299e-05,
"loss": 1.4083,
"step": 475
},
{
"epoch": 0.13139885026006023,
"grad_norm": 0.38801834482390074,
"learning_rate": 1.9940700592588228e-05,
"loss": 1.3823,
"step": 480
},
{
"epoch": 0.13276758828360252,
"grad_norm": 0.40725954819087723,
"learning_rate": 1.993539054902275e-05,
"loss": 1.369,
"step": 485
},
{
"epoch": 0.1341363263071448,
"grad_norm": 0.42191731863948084,
"learning_rate": 1.992985361069906e-05,
"loss": 1.3907,
"step": 490
},
{
"epoch": 0.1355050643306871,
"grad_norm": 0.3981980013765845,
"learning_rate": 1.9924089904064354e-05,
"loss": 1.4253,
"step": 495
},
{
"epoch": 0.1368738023542294,
"grad_norm": 0.3968358847514907,
"learning_rate": 1.9918099560744545e-05,
"loss": 1.4478,
"step": 500
},
{
"epoch": 0.1382425403777717,
"grad_norm": 0.4020407861005804,
"learning_rate": 1.991188271754125e-05,
"loss": 1.4063,
"step": 505
},
{
"epoch": 0.139611278401314,
"grad_norm": 0.3983008409093381,
"learning_rate": 1.990543951642866e-05,
"loss": 1.3838,
"step": 510
},
{
"epoch": 0.14098001642485627,
"grad_norm": 0.38772247745474425,
"learning_rate": 1.9898770104550335e-05,
"loss": 1.4193,
"step": 515
},
{
"epoch": 0.1423487544483986,
"grad_norm": 0.4172192092889838,
"learning_rate": 1.9891874634215784e-05,
"loss": 1.3831,
"step": 520
},
{
"epoch": 0.14371749247194088,
"grad_norm": 0.39316534732966774,
"learning_rate": 1.9884753262897042e-05,
"loss": 1.3813,
"step": 525
},
{
"epoch": 0.14508623049548316,
"grad_norm": 0.40825756476362335,
"learning_rate": 1.9877406153225028e-05,
"loss": 1.4233,
"step": 530
},
{
"epoch": 0.14645496851902545,
"grad_norm": 0.4303738354233706,
"learning_rate": 1.9869833472985882e-05,
"loss": 1.4267,
"step": 535
},
{
"epoch": 0.14782370654256774,
"grad_norm": 0.39536263390575654,
"learning_rate": 1.9862035395117075e-05,
"loss": 1.3688,
"step": 540
},
{
"epoch": 0.14919244456611006,
"grad_norm": 0.39303998157704906,
"learning_rate": 1.9854012097703515e-05,
"loss": 1.4259,
"step": 545
},
{
"epoch": 0.15056118258965234,
"grad_norm": 0.3939411519174778,
"learning_rate": 1.9845763763973433e-05,
"loss": 1.354,
"step": 550
},
{
"epoch": 0.15192992061319463,
"grad_norm": 0.39277763892363776,
"learning_rate": 1.9837290582294233e-05,
"loss": 1.3468,
"step": 555
},
{
"epoch": 0.15329865863673692,
"grad_norm": 0.39121990838351217,
"learning_rate": 1.9828592746168172e-05,
"loss": 1.3508,
"step": 560
},
{
"epoch": 0.15466739666027923,
"grad_norm": 0.4035101254769364,
"learning_rate": 1.981967045422795e-05,
"loss": 1.369,
"step": 565
},
{
"epoch": 0.15603613468382152,
"grad_norm": 0.38407623227911974,
"learning_rate": 1.9810523910232165e-05,
"loss": 1.4039,
"step": 570
},
{
"epoch": 0.1574048727073638,
"grad_norm": 0.3915340445821982,
"learning_rate": 1.9801153323060667e-05,
"loss": 1.3634,
"step": 575
},
{
"epoch": 0.1587736107309061,
"grad_norm": 0.39241210575593644,
"learning_rate": 1.9791558906709787e-05,
"loss": 1.4257,
"step": 580
},
{
"epoch": 0.1601423487544484,
"grad_norm": 0.3946963973614525,
"learning_rate": 1.9781740880287444e-05,
"loss": 1.3136,
"step": 585
},
{
"epoch": 0.1615110867779907,
"grad_norm": 0.39148042578465725,
"learning_rate": 1.9771699468008156e-05,
"loss": 1.3654,
"step": 590
},
{
"epoch": 0.16287982480153299,
"grad_norm": 0.3895106088217061,
"learning_rate": 1.9761434899187893e-05,
"loss": 1.3762,
"step": 595
},
{
"epoch": 0.16424856282507527,
"grad_norm": 0.3997937267692177,
"learning_rate": 1.9750947408238872e-05,
"loss": 1.4064,
"step": 600
},
{
"epoch": 0.1656173008486176,
"grad_norm": 0.4002294154735745,
"learning_rate": 1.974023723466418e-05,
"loss": 1.3771,
"step": 605
},
{
"epoch": 0.16698603887215988,
"grad_norm": 0.40550620529076886,
"learning_rate": 1.9729304623052315e-05,
"loss": 1.4551,
"step": 610
},
{
"epoch": 0.16835477689570216,
"grad_norm": 0.40410760524836825,
"learning_rate": 1.9718149823071592e-05,
"loss": 1.3394,
"step": 615
},
{
"epoch": 0.16972351491924445,
"grad_norm": 0.40142195594131513,
"learning_rate": 1.970677308946446e-05,
"loss": 1.3872,
"step": 620
},
{
"epoch": 0.17109225294278674,
"grad_norm": 0.4041136169142662,
"learning_rate": 1.9695174682041652e-05,
"loss": 1.3644,
"step": 625
},
{
"epoch": 0.17246099096632905,
"grad_norm": 0.39436310577761385,
"learning_rate": 1.9683354865676298e-05,
"loss": 1.434,
"step": 630
},
{
"epoch": 0.17382972898987134,
"grad_norm": 0.400517293758074,
"learning_rate": 1.9671313910297826e-05,
"loss": 1.3941,
"step": 635
},
{
"epoch": 0.17519846701341363,
"grad_norm": 0.4059408812570113,
"learning_rate": 1.9659052090885834e-05,
"loss": 1.3907,
"step": 640
},
{
"epoch": 0.17656720503695592,
"grad_norm": 0.3929001302562383,
"learning_rate": 1.9646569687463796e-05,
"loss": 1.351,
"step": 645
},
{
"epoch": 0.17793594306049823,
"grad_norm": 0.4090718300758474,
"learning_rate": 1.9633866985092655e-05,
"loss": 1.4102,
"step": 650
},
{
"epoch": 0.17930468108404052,
"grad_norm": 0.40995200812054255,
"learning_rate": 1.9620944273864343e-05,
"loss": 1.3575,
"step": 655
},
{
"epoch": 0.1806734191075828,
"grad_norm": 0.40276642674265933,
"learning_rate": 1.960780184889514e-05,
"loss": 1.3941,
"step": 660
},
{
"epoch": 0.1820421571311251,
"grad_norm": 0.400136397383414,
"learning_rate": 1.9594440010318924e-05,
"loss": 1.4171,
"step": 665
},
{
"epoch": 0.1834108951546674,
"grad_norm": 0.40872717576906,
"learning_rate": 1.9580859063280326e-05,
"loss": 1.391,
"step": 670
},
{
"epoch": 0.1847796331782097,
"grad_norm": 0.39405384932557985,
"learning_rate": 1.956705931792777e-05,
"loss": 1.4042,
"step": 675
},
{
"epoch": 0.18614837120175198,
"grad_norm": 0.396632571946212,
"learning_rate": 1.9553041089406387e-05,
"loss": 1.3598,
"step": 680
},
{
"epoch": 0.18751710922529427,
"grad_norm": 0.39535225715229844,
"learning_rate": 1.95388046978508e-05,
"loss": 1.3652,
"step": 685
},
{
"epoch": 0.18888584724883659,
"grad_norm": 0.39747240968211434,
"learning_rate": 1.9524350468377828e-05,
"loss": 1.3857,
"step": 690
},
{
"epoch": 0.19025458527237887,
"grad_norm": 0.4045442697980633,
"learning_rate": 1.9509678731079074e-05,
"loss": 1.3724,
"step": 695
},
{
"epoch": 0.19162332329592116,
"grad_norm": 0.39648112930634666,
"learning_rate": 1.949478982101336e-05,
"loss": 1.3642,
"step": 700
},
{
"epoch": 0.19299206131946345,
"grad_norm": 0.3966559822195587,
"learning_rate": 1.947968407819909e-05,
"loss": 1.3704,
"step": 705
},
{
"epoch": 0.19436079934300574,
"grad_norm": 0.3803799928500003,
"learning_rate": 1.9464361847606486e-05,
"loss": 1.3718,
"step": 710
},
{
"epoch": 0.19572953736654805,
"grad_norm": 0.3920580621341347,
"learning_rate": 1.9448823479149705e-05,
"loss": 1.3994,
"step": 715
},
{
"epoch": 0.19709827539009034,
"grad_norm": 0.38159551727148144,
"learning_rate": 1.9433069327678847e-05,
"loss": 1.3539,
"step": 720
},
{
"epoch": 0.19846701341363263,
"grad_norm": 0.3979848267470623,
"learning_rate": 1.9417099752971858e-05,
"loss": 1.3824,
"step": 725
},
{
"epoch": 0.1998357514371749,
"grad_norm": 0.36228811525683013,
"learning_rate": 1.9400915119726305e-05,
"loss": 1.2942,
"step": 730
},
{
"epoch": 0.20120448946071723,
"grad_norm": 0.40336839547536973,
"learning_rate": 1.938451579755106e-05,
"loss": 1.3305,
"step": 735
},
{
"epoch": 0.20257322748425952,
"grad_norm": 0.4093706132368206,
"learning_rate": 1.9367902160957843e-05,
"loss": 1.4047,
"step": 740
},
{
"epoch": 0.2039419655078018,
"grad_norm": 0.4012678664539993,
"learning_rate": 1.9351074589352684e-05,
"loss": 1.34,
"step": 745
},
{
"epoch": 0.2053107035313441,
"grad_norm": 0.3982341789573293,
"learning_rate": 1.933403346702725e-05,
"loss": 1.3518,
"step": 750
},
{
"epoch": 0.2066794415548864,
"grad_norm": 0.3895851509088621,
"learning_rate": 1.931677918315007e-05,
"loss": 1.3698,
"step": 755
},
{
"epoch": 0.2080481795784287,
"grad_norm": 0.40873783451123424,
"learning_rate": 1.9299312131757645e-05,
"loss": 1.3768,
"step": 760
},
{
"epoch": 0.20941691760197098,
"grad_norm": 0.38863227885235524,
"learning_rate": 1.928163271174546e-05,
"loss": 1.368,
"step": 765
},
{
"epoch": 0.21078565562551327,
"grad_norm": 0.40218624042286666,
"learning_rate": 1.9263741326858866e-05,
"loss": 1.4002,
"step": 770
},
{
"epoch": 0.21215439364905558,
"grad_norm": 0.38750285278794777,
"learning_rate": 1.9245638385683857e-05,
"loss": 1.3808,
"step": 775
},
{
"epoch": 0.21352313167259787,
"grad_norm": 0.39332958062090156,
"learning_rate": 1.9227324301637747e-05,
"loss": 1.3991,
"step": 780
},
{
"epoch": 0.21489186969614016,
"grad_norm": 0.39759945858946194,
"learning_rate": 1.9208799492959723e-05,
"loss": 1.3765,
"step": 785
},
{
"epoch": 0.21626060771968245,
"grad_norm": 0.3779639266645249,
"learning_rate": 1.9190064382701296e-05,
"loss": 1.378,
"step": 790
},
{
"epoch": 0.21762934574322473,
"grad_norm": 0.3980685187213535,
"learning_rate": 1.917111939871664e-05,
"loss": 1.3559,
"step": 795
},
{
"epoch": 0.21899808376676705,
"grad_norm": 0.3955990129038042,
"learning_rate": 1.915196497365282e-05,
"loss": 1.3688,
"step": 800
},
{
"epoch": 0.22036682179030934,
"grad_norm": 0.3814772041353902,
"learning_rate": 1.9132601544939914e-05,
"loss": 1.3106,
"step": 805
},
{
"epoch": 0.22173555981385162,
"grad_norm": 0.4033095650227253,
"learning_rate": 1.9113029554781014e-05,
"loss": 1.3389,
"step": 810
},
{
"epoch": 0.2231042978373939,
"grad_norm": 0.3934860150762494,
"learning_rate": 1.9093249450142144e-05,
"loss": 1.3343,
"step": 815
},
{
"epoch": 0.22447303586093623,
"grad_norm": 0.4008166941226203,
"learning_rate": 1.907326168274204e-05,
"loss": 1.3627,
"step": 820
},
{
"epoch": 0.2258417738844785,
"grad_norm": 0.38863913805386574,
"learning_rate": 1.905306670904184e-05,
"loss": 1.3978,
"step": 825
},
{
"epoch": 0.2272105119080208,
"grad_norm": 0.39887884072448215,
"learning_rate": 1.9032664990234648e-05,
"loss": 1.3548,
"step": 830
},
{
"epoch": 0.2285792499315631,
"grad_norm": 0.3942528539065832,
"learning_rate": 1.9012056992235025e-05,
"loss": 1.3589,
"step": 835
},
{
"epoch": 0.2299479879551054,
"grad_norm": 0.4003093485695292,
"learning_rate": 1.899124318566832e-05,
"loss": 1.4008,
"step": 840
},
{
"epoch": 0.2313167259786477,
"grad_norm": 0.38851656838431264,
"learning_rate": 1.897022404585996e-05,
"loss": 1.3396,
"step": 845
},
{
"epoch": 0.23268546400218998,
"grad_norm": 0.39282411404685663,
"learning_rate": 1.894900005282454e-05,
"loss": 1.3118,
"step": 850
},
{
"epoch": 0.23405420202573227,
"grad_norm": 0.3995600743628729,
"learning_rate": 1.892757169125492e-05,
"loss": 1.3261,
"step": 855
},
{
"epoch": 0.23542294004927458,
"grad_norm": 0.40585943733175783,
"learning_rate": 1.8905939450511117e-05,
"loss": 1.2896,
"step": 860
},
{
"epoch": 0.23679167807281687,
"grad_norm": 0.38676778157652897,
"learning_rate": 1.888410382460915e-05,
"loss": 1.3769,
"step": 865
},
{
"epoch": 0.23816041609635916,
"grad_norm": 0.4006135132967053,
"learning_rate": 1.8862065312209735e-05,
"loss": 1.3744,
"step": 870
},
{
"epoch": 0.23952915411990144,
"grad_norm": 0.38183538010624907,
"learning_rate": 1.8839824416606932e-05,
"loss": 1.3994,
"step": 875
},
{
"epoch": 0.24089789214344373,
"grad_norm": 0.40012172515979694,
"learning_rate": 1.8817381645716613e-05,
"loss": 1.3736,
"step": 880
},
{
"epoch": 0.24226663016698605,
"grad_norm": 0.4036320450863606,
"learning_rate": 1.879473751206489e-05,
"loss": 1.3679,
"step": 885
},
{
"epoch": 0.24363536819052833,
"grad_norm": 0.39250548017399456,
"learning_rate": 1.8771892532776406e-05,
"loss": 1.352,
"step": 890
},
{
"epoch": 0.24500410621407062,
"grad_norm": 0.384618237208868,
"learning_rate": 1.8748847229562504e-05,
"loss": 1.3415,
"step": 895
},
{
"epoch": 0.2463728442376129,
"grad_norm": 0.3847231065307316,
"learning_rate": 1.8725602128709348e-05,
"loss": 1.436,
"step": 900
},
{
"epoch": 0.24774158226115522,
"grad_norm": 0.39329176880661687,
"learning_rate": 1.8702157761065877e-05,
"loss": 1.344,
"step": 905
},
{
"epoch": 0.2491103202846975,
"grad_norm": 0.4202514383981607,
"learning_rate": 1.8678514662031688e-05,
"loss": 1.3384,
"step": 910
},
{
"epoch": 0.2504790583082398,
"grad_norm": 0.3866724615350219,
"learning_rate": 1.8654673371544815e-05,
"loss": 1.3374,
"step": 915
},
{
"epoch": 0.2518477963317821,
"grad_norm": 0.4109333525553728,
"learning_rate": 1.8630634434069397e-05,
"loss": 1.3949,
"step": 920
},
{
"epoch": 0.2532165343553244,
"grad_norm": 0.3782421450027903,
"learning_rate": 1.860639839858324e-05,
"loss": 1.3162,
"step": 925
},
{
"epoch": 0.2545852723788667,
"grad_norm": 0.3909532902784873,
"learning_rate": 1.8581965818565278e-05,
"loss": 1.3829,
"step": 930
},
{
"epoch": 0.255954010402409,
"grad_norm": 0.4051170609748319,
"learning_rate": 1.855733725198295e-05,
"loss": 1.3462,
"step": 935
},
{
"epoch": 0.25732274842595126,
"grad_norm": 0.39349552836368845,
"learning_rate": 1.8532513261279433e-05,
"loss": 1.4015,
"step": 940
},
{
"epoch": 0.2586914864494936,
"grad_norm": 0.39347934180028793,
"learning_rate": 1.8507494413360808e-05,
"loss": 1.3367,
"step": 945
},
{
"epoch": 0.26006022447303584,
"grad_norm": 0.4158855876544813,
"learning_rate": 1.848228127958312e-05,
"loss": 1.353,
"step": 950
},
{
"epoch": 0.26142896249657815,
"grad_norm": 0.3793682443730692,
"learning_rate": 1.8456874435739337e-05,
"loss": 1.3398,
"step": 955
},
{
"epoch": 0.26279770052012047,
"grad_norm": 0.38047885709566037,
"learning_rate": 1.843127446204616e-05,
"loss": 1.329,
"step": 960
},
{
"epoch": 0.26416643854366273,
"grad_norm": 0.3991275596683987,
"learning_rate": 1.8405481943130827e-05,
"loss": 1.3115,
"step": 965
},
{
"epoch": 0.26553517656720504,
"grad_norm": 0.389649193690539,
"learning_rate": 1.8379497468017726e-05,
"loss": 1.3746,
"step": 970
},
{
"epoch": 0.2669039145907473,
"grad_norm": 0.40470694231468296,
"learning_rate": 1.8353321630114952e-05,
"loss": 1.3354,
"step": 975
},
{
"epoch": 0.2682726526142896,
"grad_norm": 0.40734933283960667,
"learning_rate": 1.832695502720076e-05,
"loss": 1.3295,
"step": 980
},
{
"epoch": 0.26964139063783193,
"grad_norm": 0.39323434740751423,
"learning_rate": 1.8300398261409912e-05,
"loss": 1.3069,
"step": 985
},
{
"epoch": 0.2710101286613742,
"grad_norm": 0.3837841492160018,
"learning_rate": 1.8273651939219914e-05,
"loss": 1.3543,
"step": 990
},
{
"epoch": 0.2723788666849165,
"grad_norm": 0.38795485386053835,
"learning_rate": 1.8246716671437186e-05,
"loss": 1.3798,
"step": 995
},
{
"epoch": 0.2737476047084588,
"grad_norm": 0.38509661706339016,
"learning_rate": 1.8219593073183106e-05,
"loss": 1.3604,
"step": 1000
},
{
"epoch": 0.2751163427320011,
"grad_norm": 0.3882331224756399,
"learning_rate": 1.8192281763879946e-05,
"loss": 1.3417,
"step": 1005
},
{
"epoch": 0.2764850807555434,
"grad_norm": 0.3842715356979672,
"learning_rate": 1.816478336723675e-05,
"loss": 1.4061,
"step": 1010
},
{
"epoch": 0.27785381877908566,
"grad_norm": 0.4013303335538485,
"learning_rate": 1.8137098511235084e-05,
"loss": 1.3509,
"step": 1015
},
{
"epoch": 0.279222556802628,
"grad_norm": 0.389250409772092,
"learning_rate": 1.810922782811468e-05,
"loss": 1.3371,
"step": 1020
},
{
"epoch": 0.2805912948261703,
"grad_norm": 0.3891214912082097,
"learning_rate": 1.808117195435901e-05,
"loss": 1.3756,
"step": 1025
},
{
"epoch": 0.28196003284971255,
"grad_norm": 0.39295322317665116,
"learning_rate": 1.805293153068076e-05,
"loss": 1.3525,
"step": 1030
},
{
"epoch": 0.28332877087325486,
"grad_norm": 0.38744159952014173,
"learning_rate": 1.802450720200718e-05,
"loss": 1.3508,
"step": 1035
},
{
"epoch": 0.2846975088967972,
"grad_norm": 0.3971694811346686,
"learning_rate": 1.7995899617465357e-05,
"loss": 1.3242,
"step": 1040
},
{
"epoch": 0.28606624692033944,
"grad_norm": 0.3892979964335795,
"learning_rate": 1.7967109430367406e-05,
"loss": 1.2919,
"step": 1045
},
{
"epoch": 0.28743498494388176,
"grad_norm": 0.40651116778504365,
"learning_rate": 1.793813729819553e-05,
"loss": 1.4047,
"step": 1050
},
{
"epoch": 0.288803722967424,
"grad_norm": 0.39365601735181455,
"learning_rate": 1.7908983882587038e-05,
"loss": 1.3622,
"step": 1055
},
{
"epoch": 0.29017246099096633,
"grad_norm": 0.38324138023204,
"learning_rate": 1.787964984931919e-05,
"loss": 1.3773,
"step": 1060
},
{
"epoch": 0.29154119901450865,
"grad_norm": 0.3887104304366927,
"learning_rate": 1.7850135868294023e-05,
"loss": 1.3973,
"step": 1065
},
{
"epoch": 0.2929099370380509,
"grad_norm": 0.4087532681641359,
"learning_rate": 1.782044261352305e-05,
"loss": 1.3246,
"step": 1070
},
{
"epoch": 0.2942786750615932,
"grad_norm": 0.39381567774906023,
"learning_rate": 1.7790570763111864e-05,
"loss": 1.3683,
"step": 1075
},
{
"epoch": 0.2956474130851355,
"grad_norm": 0.38720936830053604,
"learning_rate": 1.7760520999244638e-05,
"loss": 1.375,
"step": 1080
},
{
"epoch": 0.2970161511086778,
"grad_norm": 0.38699812271583683,
"learning_rate": 1.7730294008168578e-05,
"loss": 1.403,
"step": 1085
},
{
"epoch": 0.2983848891322201,
"grad_norm": 0.3938155567988771,
"learning_rate": 1.7699890480178216e-05,
"loss": 1.3567,
"step": 1090
},
{
"epoch": 0.29975362715576237,
"grad_norm": 0.4051470823072413,
"learning_rate": 1.766931110959967e-05,
"loss": 1.4228,
"step": 1095
},
{
"epoch": 0.3011223651793047,
"grad_norm": 0.4013404711196811,
"learning_rate": 1.763855659477478e-05,
"loss": 1.3689,
"step": 1100
},
{
"epoch": 0.302491103202847,
"grad_norm": 0.4239789885244797,
"learning_rate": 1.7607627638045156e-05,
"loss": 1.3988,
"step": 1105
},
{
"epoch": 0.30385984122638926,
"grad_norm": 0.4005302271519552,
"learning_rate": 1.7576524945736137e-05,
"loss": 1.3368,
"step": 1110
},
{
"epoch": 0.3052285792499316,
"grad_norm": 0.3900827670562639,
"learning_rate": 1.754524922814068e-05,
"loss": 1.3633,
"step": 1115
},
{
"epoch": 0.30659731727347384,
"grad_norm": 0.39004513777273,
"learning_rate": 1.751380119950311e-05,
"loss": 1.4024,
"step": 1120
},
{
"epoch": 0.30796605529701615,
"grad_norm": 0.3955440868581025,
"learning_rate": 1.7482181578002837e-05,
"loss": 1.3667,
"step": 1125
},
{
"epoch": 0.30933479332055847,
"grad_norm": 0.41261816602720663,
"learning_rate": 1.745039108573793e-05,
"loss": 1.357,
"step": 1130
},
{
"epoch": 0.3107035313441007,
"grad_norm": 0.37732770567847945,
"learning_rate": 1.7418430448708644e-05,
"loss": 1.3337,
"step": 1135
},
{
"epoch": 0.31207226936764304,
"grad_norm": 0.3764972884033891,
"learning_rate": 1.738630039680083e-05,
"loss": 1.3556,
"step": 1140
},
{
"epoch": 0.3134410073911853,
"grad_norm": 0.40764805847336166,
"learning_rate": 1.7354001663769278e-05,
"loss": 1.3679,
"step": 1145
},
{
"epoch": 0.3148097454147276,
"grad_norm": 0.3794101029629702,
"learning_rate": 1.7321534987220942e-05,
"loss": 1.3176,
"step": 1150
},
{
"epoch": 0.31617848343826993,
"grad_norm": 0.40248164169870704,
"learning_rate": 1.728890110859812e-05,
"loss": 1.3458,
"step": 1155
},
{
"epoch": 0.3175472214618122,
"grad_norm": 0.39012890957455026,
"learning_rate": 1.7256100773161492e-05,
"loss": 1.296,
"step": 1160
},
{
"epoch": 0.3189159594853545,
"grad_norm": 0.39856381680118136,
"learning_rate": 1.7223134729973134e-05,
"loss": 1.3614,
"step": 1165
},
{
"epoch": 0.3202846975088968,
"grad_norm": 0.40493526767721577,
"learning_rate": 1.7190003731879375e-05,
"loss": 1.3533,
"step": 1170
},
{
"epoch": 0.3216534355324391,
"grad_norm": 0.38868285184435786,
"learning_rate": 1.715670853549364e-05,
"loss": 1.3377,
"step": 1175
},
{
"epoch": 0.3230221735559814,
"grad_norm": 0.3840977803535061,
"learning_rate": 1.7123249901179142e-05,
"loss": 1.3753,
"step": 1180
},
{
"epoch": 0.32439091157952366,
"grad_norm": 0.37510550805682613,
"learning_rate": 1.708962859303154e-05,
"loss": 1.3557,
"step": 1185
},
{
"epoch": 0.32575964960306597,
"grad_norm": 0.39070855119805253,
"learning_rate": 1.7055845378861476e-05,
"loss": 1.3584,
"step": 1190
},
{
"epoch": 0.3271283876266083,
"grad_norm": 0.39021160718672115,
"learning_rate": 1.7021901030177036e-05,
"loss": 1.3399,
"step": 1195
},
{
"epoch": 0.32849712565015055,
"grad_norm": 0.3983342506003189,
"learning_rate": 1.698779632216615e-05,
"loss": 1.3965,
"step": 1200
},
{
"epoch": 0.32986586367369286,
"grad_norm": 0.38690562360273995,
"learning_rate": 1.6953532033678874e-05,
"loss": 1.4221,
"step": 1205
},
{
"epoch": 0.3312346016972352,
"grad_norm": 0.39029540911089444,
"learning_rate": 1.69191089472096e-05,
"loss": 1.3201,
"step": 1210
},
{
"epoch": 0.33260333972077744,
"grad_norm": 0.40512131600219015,
"learning_rate": 1.688452784887921e-05,
"loss": 1.3495,
"step": 1215
},
{
"epoch": 0.33397207774431975,
"grad_norm": 0.3804586676740124,
"learning_rate": 1.684978952841709e-05,
"loss": 1.3471,
"step": 1220
},
{
"epoch": 0.335340815767862,
"grad_norm": 0.38428741373037245,
"learning_rate": 1.681489477914312e-05,
"loss": 1.3196,
"step": 1225
},
{
"epoch": 0.3367095537914043,
"grad_norm": 0.384047691017285,
"learning_rate": 1.677984439794954e-05,
"loss": 1.3303,
"step": 1230
},
{
"epoch": 0.33807829181494664,
"grad_norm": 0.3839900646299871,
"learning_rate": 1.6744639185282784e-05,
"loss": 1.2792,
"step": 1235
},
{
"epoch": 0.3394470298384889,
"grad_norm": 0.3884042208952324,
"learning_rate": 1.670927994512514e-05,
"loss": 1.3223,
"step": 1240
},
{
"epoch": 0.3408157678620312,
"grad_norm": 0.3911385295369844,
"learning_rate": 1.667376748497646e-05,
"loss": 1.3546,
"step": 1245
},
{
"epoch": 0.3421845058855735,
"grad_norm": 0.3807410219808285,
"learning_rate": 1.6638102615835658e-05,
"loss": 1.3148,
"step": 1250
},
{
"epoch": 0.3435532439091158,
"grad_norm": 0.39423887853753,
"learning_rate": 1.6602286152182236e-05,
"loss": 1.361,
"step": 1255
},
{
"epoch": 0.3449219819326581,
"grad_norm": 0.39187255483162625,
"learning_rate": 1.6566318911957647e-05,
"loss": 1.339,
"step": 1260
},
{
"epoch": 0.34629071995620037,
"grad_norm": 0.38859511955741677,
"learning_rate": 1.6530201716546647e-05,
"loss": 1.3556,
"step": 1265
},
{
"epoch": 0.3476594579797427,
"grad_norm": 0.3993643770992966,
"learning_rate": 1.649393539075851e-05,
"loss": 1.3476,
"step": 1270
},
{
"epoch": 0.349028196003285,
"grad_norm": 0.39035176617866785,
"learning_rate": 1.6457520762808217e-05,
"loss": 1.3228,
"step": 1275
},
{
"epoch": 0.35039693402682726,
"grad_norm": 0.3830707286890397,
"learning_rate": 1.6420958664297514e-05,
"loss": 1.3094,
"step": 1280
},
{
"epoch": 0.35176567205036957,
"grad_norm": 0.36321444690348953,
"learning_rate": 1.638424993019595e-05,
"loss": 1.3853,
"step": 1285
},
{
"epoch": 0.35313441007391183,
"grad_norm": 0.371835147945527,
"learning_rate": 1.634739539882178e-05,
"loss": 1.3477,
"step": 1290
},
{
"epoch": 0.35450314809745415,
"grad_norm": 0.3906744323143432,
"learning_rate": 1.6310395911822848e-05,
"loss": 1.3149,
"step": 1295
},
{
"epoch": 0.35587188612099646,
"grad_norm": 0.37408867437468385,
"learning_rate": 1.6273252314157352e-05,
"loss": 1.3135,
"step": 1300
},
{
"epoch": 0.3572406241445387,
"grad_norm": 0.37403440281560435,
"learning_rate": 1.6235965454074535e-05,
"loss": 1.2301,
"step": 1305
},
{
"epoch": 0.35860936216808104,
"grad_norm": 0.3867708656040568,
"learning_rate": 1.619853618309535e-05,
"loss": 1.3716,
"step": 1310
},
{
"epoch": 0.3599781001916233,
"grad_norm": 0.3885328796144267,
"learning_rate": 1.6160965355992966e-05,
"loss": 1.366,
"step": 1315
},
{
"epoch": 0.3613468382151656,
"grad_norm": 0.403719948973195,
"learning_rate": 1.6123253830773293e-05,
"loss": 1.3661,
"step": 1320
},
{
"epoch": 0.3627155762387079,
"grad_norm": 0.3827196192323514,
"learning_rate": 1.6085402468655356e-05,
"loss": 1.3567,
"step": 1325
},
{
"epoch": 0.3640843142622502,
"grad_norm": 0.3968500270407259,
"learning_rate": 1.6047412134051645e-05,
"loss": 1.4044,
"step": 1330
},
{
"epoch": 0.3654530522857925,
"grad_norm": 0.38847711656237954,
"learning_rate": 1.6009283694548365e-05,
"loss": 1.3591,
"step": 1335
},
{
"epoch": 0.3668217903093348,
"grad_norm": 0.4036707276972636,
"learning_rate": 1.5971018020885623e-05,
"loss": 1.3916,
"step": 1340
},
{
"epoch": 0.3681905283328771,
"grad_norm": 0.39526777473833846,
"learning_rate": 1.593261598693755e-05,
"loss": 1.3478,
"step": 1345
},
{
"epoch": 0.3695592663564194,
"grad_norm": 0.3837685918142903,
"learning_rate": 1.5894078469692343e-05,
"loss": 1.3604,
"step": 1350
},
{
"epoch": 0.37092800437996165,
"grad_norm": 0.4184389090754904,
"learning_rate": 1.585540634923223e-05,
"loss": 1.4042,
"step": 1355
},
{
"epoch": 0.37229674240350397,
"grad_norm": 0.4132734637825316,
"learning_rate": 1.5816600508713372e-05,
"loss": 1.3901,
"step": 1360
},
{
"epoch": 0.3736654804270463,
"grad_norm": 0.3876929955451876,
"learning_rate": 1.5777661834345708e-05,
"loss": 1.3296,
"step": 1365
},
{
"epoch": 0.37503421845058854,
"grad_norm": 0.39294627819783745,
"learning_rate": 1.57385912153727e-05,
"loss": 1.3581,
"step": 1370
},
{
"epoch": 0.37640295647413086,
"grad_norm": 0.3875865415567979,
"learning_rate": 1.5699389544051028e-05,
"loss": 1.3167,
"step": 1375
},
{
"epoch": 0.37777169449767317,
"grad_norm": 0.3863898154888909,
"learning_rate": 1.566005771563023e-05,
"loss": 1.3694,
"step": 1380
},
{
"epoch": 0.37914043252121543,
"grad_norm": 0.38577432820095847,
"learning_rate": 1.5620596628332242e-05,
"loss": 1.323,
"step": 1385
},
{
"epoch": 0.38050917054475775,
"grad_norm": 0.3971392209603939,
"learning_rate": 1.5581007183330877e-05,
"loss": 1.3432,
"step": 1390
},
{
"epoch": 0.3818779085683,
"grad_norm": 0.40311344444707337,
"learning_rate": 1.554129028473127e-05,
"loss": 1.3802,
"step": 1395
},
{
"epoch": 0.3832466465918423,
"grad_norm": 0.40527858159368374,
"learning_rate": 1.5501446839549207e-05,
"loss": 1.3445,
"step": 1400
},
{
"epoch": 0.38461538461538464,
"grad_norm": 0.4025758901452214,
"learning_rate": 1.5461477757690424e-05,
"loss": 1.3321,
"step": 1405
},
{
"epoch": 0.3859841226389269,
"grad_norm": 0.3855393398790419,
"learning_rate": 1.542138395192983e-05,
"loss": 1.3405,
"step": 1410
},
{
"epoch": 0.3873528606624692,
"grad_norm": 0.38944762738859545,
"learning_rate": 1.538116633789065e-05,
"loss": 1.3289,
"step": 1415
},
{
"epoch": 0.38872159868601147,
"grad_norm": 0.4028684971128393,
"learning_rate": 1.5340825834023526e-05,
"loss": 1.3798,
"step": 1420
},
{
"epoch": 0.3900903367095538,
"grad_norm": 0.37654892707807947,
"learning_rate": 1.530036336158553e-05,
"loss": 1.3432,
"step": 1425
},
{
"epoch": 0.3914590747330961,
"grad_norm": 0.3986727790830818,
"learning_rate": 1.5259779844619152e-05,
"loss": 1.3422,
"step": 1430
},
{
"epoch": 0.39282781275663836,
"grad_norm": 0.4029735444716236,
"learning_rate": 1.5219076209931159e-05,
"loss": 1.3136,
"step": 1435
},
{
"epoch": 0.3941965507801807,
"grad_norm": 0.41012078556001347,
"learning_rate": 1.5178253387071458e-05,
"loss": 1.4002,
"step": 1440
},
{
"epoch": 0.395565288803723,
"grad_norm": 0.40771281882367433,
"learning_rate": 1.5137312308311857e-05,
"loss": 1.3684,
"step": 1445
},
{
"epoch": 0.39693402682726525,
"grad_norm": 0.38445172076552614,
"learning_rate": 1.5096253908624778e-05,
"loss": 1.3137,
"step": 1450
},
{
"epoch": 0.39830276485080757,
"grad_norm": 0.3869608798791883,
"learning_rate": 1.5055079125661908e-05,
"loss": 1.2812,
"step": 1455
},
{
"epoch": 0.3996715028743498,
"grad_norm": 0.3987871366347409,
"learning_rate": 1.5013788899732775e-05,
"loss": 1.3394,
"step": 1460
},
{
"epoch": 0.40104024089789214,
"grad_norm": 0.3970490183809672,
"learning_rate": 1.4972384173783284e-05,
"loss": 1.3544,
"step": 1465
},
{
"epoch": 0.40240897892143446,
"grad_norm": 0.3925074770480503,
"learning_rate": 1.493086589337418e-05,
"loss": 1.3294,
"step": 1470
},
{
"epoch": 0.4037777169449767,
"grad_norm": 0.40973551864547075,
"learning_rate": 1.4889235006659448e-05,
"loss": 1.3675,
"step": 1475
},
{
"epoch": 0.40514645496851903,
"grad_norm": 0.3812235133361238,
"learning_rate": 1.484749246436468e-05,
"loss": 1.3288,
"step": 1480
},
{
"epoch": 0.40651519299206135,
"grad_norm": 0.3827389411340829,
"learning_rate": 1.4805639219765337e-05,
"loss": 1.4128,
"step": 1485
},
{
"epoch": 0.4078839310156036,
"grad_norm": 0.37343264715636293,
"learning_rate": 1.476367622866499e-05,
"loss": 1.3419,
"step": 1490
},
{
"epoch": 0.4092526690391459,
"grad_norm": 0.4116326320287375,
"learning_rate": 1.4721604449373505e-05,
"loss": 1.297,
"step": 1495
},
{
"epoch": 0.4106214070626882,
"grad_norm": 0.38955441577160277,
"learning_rate": 1.4679424842685137e-05,
"loss": 1.3138,
"step": 1500
},
{
"epoch": 0.4119901450862305,
"grad_norm": 0.38444568811838686,
"learning_rate": 1.4637138371856601e-05,
"loss": 1.3284,
"step": 1505
},
{
"epoch": 0.4133588831097728,
"grad_norm": 0.3938144538178601,
"learning_rate": 1.4594746002585072e-05,
"loss": 1.3498,
"step": 1510
},
{
"epoch": 0.41472762113331507,
"grad_norm": 0.39696555787686244,
"learning_rate": 1.4552248702986127e-05,
"loss": 1.3524,
"step": 1515
},
{
"epoch": 0.4160963591568574,
"grad_norm": 0.3802714110333375,
"learning_rate": 1.4509647443571643e-05,
"loss": 1.288,
"step": 1520
},
{
"epoch": 0.41746509718039965,
"grad_norm": 0.38532238774246486,
"learning_rate": 1.446694319722763e-05,
"loss": 1.3656,
"step": 1525
},
{
"epoch": 0.41883383520394196,
"grad_norm": 0.3866193343216763,
"learning_rate": 1.4424136939192009e-05,
"loss": 1.3696,
"step": 1530
},
{
"epoch": 0.4202025732274843,
"grad_norm": 0.37501111401637777,
"learning_rate": 1.4381229647032346e-05,
"loss": 1.3512,
"step": 1535
},
{
"epoch": 0.42157131125102654,
"grad_norm": 0.3700012749063157,
"learning_rate": 1.4338222300623533e-05,
"loss": 1.3092,
"step": 1540
},
{
"epoch": 0.42294004927456885,
"grad_norm": 0.38806617391675347,
"learning_rate": 1.4295115882125393e-05,
"loss": 1.3471,
"step": 1545
},
{
"epoch": 0.42430878729811117,
"grad_norm": 0.39116631927262135,
"learning_rate": 1.4251911375960261e-05,
"loss": 1.4043,
"step": 1550
},
{
"epoch": 0.4256775253216534,
"grad_norm": 0.3907489868883888,
"learning_rate": 1.4208609768790513e-05,
"loss": 1.3476,
"step": 1555
},
{
"epoch": 0.42704626334519574,
"grad_norm": 0.40188011896979126,
"learning_rate": 1.4165212049496013e-05,
"loss": 1.3398,
"step": 1560
},
{
"epoch": 0.428415001368738,
"grad_norm": 0.39583117671628015,
"learning_rate": 1.4121719209151545e-05,
"loss": 1.3827,
"step": 1565
},
{
"epoch": 0.4297837393922803,
"grad_norm": 0.3948036088686505,
"learning_rate": 1.4078132241004174e-05,
"loss": 1.3509,
"step": 1570
},
{
"epoch": 0.43115247741582263,
"grad_norm": 0.3844018510427473,
"learning_rate": 1.4034452140450561e-05,
"loss": 1.3619,
"step": 1575
},
{
"epoch": 0.4325212154393649,
"grad_norm": 0.40654356409937986,
"learning_rate": 1.3990679905014235e-05,
"loss": 1.2983,
"step": 1580
},
{
"epoch": 0.4338899534629072,
"grad_norm": 0.401731234750035,
"learning_rate": 1.3946816534322815e-05,
"loss": 1.3573,
"step": 1585
},
{
"epoch": 0.43525869148644947,
"grad_norm": 0.40604771069832035,
"learning_rate": 1.3902863030085176e-05,
"loss": 1.333,
"step": 1590
},
{
"epoch": 0.4366274295099918,
"grad_norm": 0.3821888564061559,
"learning_rate": 1.3858820396068572e-05,
"loss": 1.3062,
"step": 1595
},
{
"epoch": 0.4379961675335341,
"grad_norm": 0.3992586017031061,
"learning_rate": 1.3814689638075725e-05,
"loss": 1.3671,
"step": 1600
},
{
"epoch": 0.43936490555707636,
"grad_norm": 0.4018110425948897,
"learning_rate": 1.3770471763921833e-05,
"loss": 1.3709,
"step": 1605
},
{
"epoch": 0.4407336435806187,
"grad_norm": 0.387994622633552,
"learning_rate": 1.372616778341158e-05,
"loss": 1.3138,
"step": 1610
},
{
"epoch": 0.442102381604161,
"grad_norm": 0.3974604574879118,
"learning_rate": 1.3681778708316054e-05,
"loss": 1.4087,
"step": 1615
},
{
"epoch": 0.44347111962770325,
"grad_norm": 0.39844593969419934,
"learning_rate": 1.3637305552349656e-05,
"loss": 1.3599,
"step": 1620
},
{
"epoch": 0.44483985765124556,
"grad_norm": 0.3857194594528982,
"learning_rate": 1.3592749331146941e-05,
"loss": 1.2897,
"step": 1625
},
{
"epoch": 0.4462085956747878,
"grad_norm": 0.3750305160936592,
"learning_rate": 1.3548111062239432e-05,
"loss": 1.2819,
"step": 1630
},
{
"epoch": 0.44757733369833014,
"grad_norm": 0.4007647993356723,
"learning_rate": 1.350339176503237e-05,
"loss": 1.3194,
"step": 1635
},
{
"epoch": 0.44894607172187245,
"grad_norm": 0.4013887994126227,
"learning_rate": 1.3458592460781446e-05,
"loss": 1.3199,
"step": 1640
},
{
"epoch": 0.4503148097454147,
"grad_norm": 0.3981164766269796,
"learning_rate": 1.341371417256947e-05,
"loss": 1.3464,
"step": 1645
},
{
"epoch": 0.451683547768957,
"grad_norm": 0.39477092552678616,
"learning_rate": 1.3368757925283015e-05,
"loss": 1.3529,
"step": 1650
},
{
"epoch": 0.45305228579249934,
"grad_norm": 0.3896314211042654,
"learning_rate": 1.3323724745589007e-05,
"loss": 1.3881,
"step": 1655
},
{
"epoch": 0.4544210238160416,
"grad_norm": 0.3954835469300208,
"learning_rate": 1.3278615661911274e-05,
"loss": 1.3124,
"step": 1660
},
{
"epoch": 0.4557897618395839,
"grad_norm": 0.3708379387210574,
"learning_rate": 1.3233431704407072e-05,
"loss": 1.2866,
"step": 1665
},
{
"epoch": 0.4571584998631262,
"grad_norm": 0.38510327105983166,
"learning_rate": 1.318817390494355e-05,
"loss": 1.2968,
"step": 1670
},
{
"epoch": 0.4585272378866685,
"grad_norm": 0.38362958034859684,
"learning_rate": 1.3142843297074182e-05,
"loss": 1.3155,
"step": 1675
},
{
"epoch": 0.4598959759102108,
"grad_norm": 0.4082500339960413,
"learning_rate": 1.3097440916015179e-05,
"loss": 1.3646,
"step": 1680
},
{
"epoch": 0.46126471393375307,
"grad_norm": 0.3927808793669975,
"learning_rate": 1.3051967798621834e-05,
"loss": 1.3165,
"step": 1685
},
{
"epoch": 0.4626334519572954,
"grad_norm": 0.4002328011893471,
"learning_rate": 1.300642498336484e-05,
"loss": 1.3287,
"step": 1690
},
{
"epoch": 0.46400218998083764,
"grad_norm": 0.38096569725907414,
"learning_rate": 1.2960813510306599e-05,
"loss": 1.319,
"step": 1695
},
{
"epoch": 0.46537092800437996,
"grad_norm": 0.38869356480860534,
"learning_rate": 1.2915134421077433e-05,
"loss": 1.3763,
"step": 1700
},
{
"epoch": 0.4667396660279223,
"grad_norm": 0.38290589898614796,
"learning_rate": 1.2869388758851828e-05,
"loss": 1.3204,
"step": 1705
},
{
"epoch": 0.46810840405146453,
"grad_norm": 0.41880465304512116,
"learning_rate": 1.2823577568324604e-05,
"loss": 1.3858,
"step": 1710
},
{
"epoch": 0.46947714207500685,
"grad_norm": 0.4031243358335159,
"learning_rate": 1.2777701895687034e-05,
"loss": 1.3346,
"step": 1715
},
{
"epoch": 0.47084588009854916,
"grad_norm": 0.3919138760842289,
"learning_rate": 1.2731762788602988e-05,
"loss": 1.4046,
"step": 1720
},
{
"epoch": 0.4722146181220914,
"grad_norm": 0.40045145219664335,
"learning_rate": 1.2685761296184987e-05,
"loss": 1.3808,
"step": 1725
},
{
"epoch": 0.47358335614563374,
"grad_norm": 0.38631426280799075,
"learning_rate": 1.2639698468970237e-05,
"loss": 1.3563,
"step": 1730
},
{
"epoch": 0.474952094169176,
"grad_norm": 0.3891224496732836,
"learning_rate": 1.259357535889666e-05,
"loss": 1.368,
"step": 1735
},
{
"epoch": 0.4763208321927183,
"grad_norm": 0.40044455029632475,
"learning_rate": 1.2547393019278853e-05,
"loss": 1.3548,
"step": 1740
},
{
"epoch": 0.47768957021626063,
"grad_norm": 0.3899915581292829,
"learning_rate": 1.2501152504784044e-05,
"loss": 1.3291,
"step": 1745
},
{
"epoch": 0.4790583082398029,
"grad_norm": 0.3869143385442828,
"learning_rate": 1.2454854871407993e-05,
"loss": 1.3473,
"step": 1750
},
{
"epoch": 0.4804270462633452,
"grad_norm": 0.3824047072867315,
"learning_rate": 1.2408501176450898e-05,
"loss": 1.3196,
"step": 1755
},
{
"epoch": 0.48179578428688746,
"grad_norm": 0.4057108906200233,
"learning_rate": 1.2362092478493226e-05,
"loss": 1.3554,
"step": 1760
},
{
"epoch": 0.4831645223104298,
"grad_norm": 0.3776742288321122,
"learning_rate": 1.2315629837371556e-05,
"loss": 1.3014,
"step": 1765
},
{
"epoch": 0.4845332603339721,
"grad_norm": 0.3977510874849243,
"learning_rate": 1.2269114314154365e-05,
"loss": 1.3967,
"step": 1770
},
{
"epoch": 0.48590199835751435,
"grad_norm": 0.3697505648941807,
"learning_rate": 1.2222546971117797e-05,
"loss": 1.3171,
"step": 1775
},
{
"epoch": 0.48727073638105667,
"grad_norm": 0.40788370625821474,
"learning_rate": 1.2175928871721411e-05,
"loss": 1.3222,
"step": 1780
},
{
"epoch": 0.488639474404599,
"grad_norm": 0.39090111193092664,
"learning_rate": 1.2129261080583897e-05,
"loss": 1.3676,
"step": 1785
},
{
"epoch": 0.49000821242814124,
"grad_norm": 0.3983751803246069,
"learning_rate": 1.2082544663458736e-05,
"loss": 1.3364,
"step": 1790
},
{
"epoch": 0.49137695045168356,
"grad_norm": 0.3848833412781912,
"learning_rate": 1.20357806872099e-05,
"loss": 1.2978,
"step": 1795
},
{
"epoch": 0.4927456884752258,
"grad_norm": 0.37897812252926655,
"learning_rate": 1.1988970219787467e-05,
"loss": 1.3333,
"step": 1800
},
{
"epoch": 0.49411442649876813,
"grad_norm": 0.3948100327011319,
"learning_rate": 1.1942114330203227e-05,
"loss": 1.3665,
"step": 1805
},
{
"epoch": 0.49548316452231045,
"grad_norm": 0.3844309877664426,
"learning_rate": 1.1895214088506284e-05,
"loss": 1.3383,
"step": 1810
},
{
"epoch": 0.4968519025458527,
"grad_norm": 0.3785571762413702,
"learning_rate": 1.1848270565758616e-05,
"loss": 1.2997,
"step": 1815
},
{
"epoch": 0.498220640569395,
"grad_norm": 0.3873700740980167,
"learning_rate": 1.1801284834010596e-05,
"loss": 1.3332,
"step": 1820
},
{
"epoch": 0.49958937859293734,
"grad_norm": 0.3971195600650439,
"learning_rate": 1.1754257966276544e-05,
"loss": 1.3468,
"step": 1825
},
{
"epoch": 0.5009581166164796,
"grad_norm": 0.37924897770225724,
"learning_rate": 1.1707191036510189e-05,
"loss": 1.3152,
"step": 1830
},
{
"epoch": 0.5023268546400219,
"grad_norm": 0.37889309262468557,
"learning_rate": 1.1660085119580165e-05,
"loss": 1.3074,
"step": 1835
},
{
"epoch": 0.5036955926635642,
"grad_norm": 0.38526267448234497,
"learning_rate": 1.1612941291245456e-05,
"loss": 1.2863,
"step": 1840
},
{
"epoch": 0.5050643306871064,
"grad_norm": 0.4007327343298376,
"learning_rate": 1.1565760628130824e-05,
"loss": 1.348,
"step": 1845
},
{
"epoch": 0.5064330687106487,
"grad_norm": 0.3917232580410847,
"learning_rate": 1.1518544207702238e-05,
"loss": 1.3193,
"step": 1850
},
{
"epoch": 0.5078018067341911,
"grad_norm": 0.41154960813532804,
"learning_rate": 1.1471293108242251e-05,
"loss": 1.3921,
"step": 1855
},
{
"epoch": 0.5091705447577334,
"grad_norm": 0.3808027664016428,
"learning_rate": 1.1424008408825383e-05,
"loss": 1.337,
"step": 1860
},
{
"epoch": 0.5105392827812757,
"grad_norm": 0.38767901381709907,
"learning_rate": 1.1376691189293474e-05,
"loss": 1.3142,
"step": 1865
},
{
"epoch": 0.511908020804818,
"grad_norm": 0.39402951617560983,
"learning_rate": 1.1329342530231036e-05,
"loss": 1.358,
"step": 1870
},
{
"epoch": 0.5132767588283602,
"grad_norm": 0.38538496638703307,
"learning_rate": 1.128196351294055e-05,
"loss": 1.2994,
"step": 1875
},
{
"epoch": 0.5146454968519025,
"grad_norm": 0.38082093170114384,
"learning_rate": 1.1234555219417804e-05,
"loss": 1.3208,
"step": 1880
},
{
"epoch": 0.5160142348754448,
"grad_norm": 0.3748307863634792,
"learning_rate": 1.1187118732327167e-05,
"loss": 1.3506,
"step": 1885
},
{
"epoch": 0.5173829728989872,
"grad_norm": 0.38705593276141625,
"learning_rate": 1.1139655134976855e-05,
"loss": 1.2906,
"step": 1890
},
{
"epoch": 0.5187517109225295,
"grad_norm": 0.39888481423655353,
"learning_rate": 1.1092165511294206e-05,
"loss": 1.367,
"step": 1895
},
{
"epoch": 0.5201204489460717,
"grad_norm": 0.3971141048112466,
"learning_rate": 1.104465094580093e-05,
"loss": 1.3405,
"step": 1900
},
{
"epoch": 0.521489186969614,
"grad_norm": 0.3837313790185174,
"learning_rate": 1.0997112523588322e-05,
"loss": 1.3351,
"step": 1905
},
{
"epoch": 0.5228579249931563,
"grad_norm": 0.39019433157562,
"learning_rate": 1.0949551330292502e-05,
"loss": 1.2482,
"step": 1910
},
{
"epoch": 0.5242266630166986,
"grad_norm": 0.39062012523465395,
"learning_rate": 1.090196845206961e-05,
"loss": 1.343,
"step": 1915
},
{
"epoch": 0.5255954010402409,
"grad_norm": 0.3948259084399907,
"learning_rate": 1.0854364975571004e-05,
"loss": 1.3001,
"step": 1920
},
{
"epoch": 0.5269641390637831,
"grad_norm": 0.3845620013681871,
"learning_rate": 1.0806741987918448e-05,
"loss": 1.3449,
"step": 1925
},
{
"epoch": 0.5283328770873255,
"grad_norm": 0.3976520791295208,
"learning_rate": 1.075910057667928e-05,
"loss": 1.3007,
"step": 1930
},
{
"epoch": 0.5297016151108678,
"grad_norm": 0.3817709747936168,
"learning_rate": 1.071144182984158e-05,
"loss": 1.3453,
"step": 1935
},
{
"epoch": 0.5310703531344101,
"grad_norm": 0.38542609340341144,
"learning_rate": 1.0663766835789327e-05,
"loss": 1.3187,
"step": 1940
},
{
"epoch": 0.5324390911579524,
"grad_norm": 0.3775171570445251,
"learning_rate": 1.0616076683277524e-05,
"loss": 1.3315,
"step": 1945
},
{
"epoch": 0.5338078291814946,
"grad_norm": 0.40024298975169065,
"learning_rate": 1.056837246140736e-05,
"loss": 1.3772,
"step": 1950
},
{
"epoch": 0.5351765672050369,
"grad_norm": 0.3882397104456436,
"learning_rate": 1.0520655259601325e-05,
"loss": 1.3284,
"step": 1955
},
{
"epoch": 0.5365453052285792,
"grad_norm": 0.3892143818918897,
"learning_rate": 1.0472926167578323e-05,
"loss": 1.302,
"step": 1960
},
{
"epoch": 0.5379140432521216,
"grad_norm": 0.41282669532669847,
"learning_rate": 1.042518627532881e-05,
"loss": 1.3261,
"step": 1965
},
{
"epoch": 0.5392827812756639,
"grad_norm": 0.37154889529358315,
"learning_rate": 1.0377436673089873e-05,
"loss": 1.3142,
"step": 1970
},
{
"epoch": 0.5406515192992062,
"grad_norm": 0.4001478843028637,
"learning_rate": 1.0329678451320352e-05,
"loss": 1.3894,
"step": 1975
},
{
"epoch": 0.5420202573227484,
"grad_norm": 0.4082737912884333,
"learning_rate": 1.0281912700675937e-05,
"loss": 1.38,
"step": 1980
},
{
"epoch": 0.5433889953462907,
"grad_norm": 0.3976694377900842,
"learning_rate": 1.0234140511984246e-05,
"loss": 1.337,
"step": 1985
},
{
"epoch": 0.544757733369833,
"grad_norm": 0.4090490835246688,
"learning_rate": 1.0186362976219926e-05,
"loss": 1.3389,
"step": 1990
},
{
"epoch": 0.5461264713933753,
"grad_norm": 0.39123091240448754,
"learning_rate": 1.0138581184479737e-05,
"loss": 1.2726,
"step": 1995
},
{
"epoch": 0.5474952094169176,
"grad_norm": 0.383808685203924,
"learning_rate": 1.0090796227957633e-05,
"loss": 1.3344,
"step": 2000
},
{
"epoch": 0.5488639474404599,
"grad_norm": 0.39282920042398406,
"learning_rate": 1.0043009197919836e-05,
"loss": 1.3188,
"step": 2005
},
{
"epoch": 0.5502326854640022,
"grad_norm": 0.3877735838132022,
"learning_rate": 9.99522118567993e-06,
"loss": 1.3412,
"step": 2010
},
{
"epoch": 0.5516014234875445,
"grad_norm": 0.3855977732261345,
"learning_rate": 9.947433282573926e-06,
"loss": 1.31,
"step": 2015
},
{
"epoch": 0.5529701615110868,
"grad_norm": 0.39182359203627265,
"learning_rate": 9.899646579935336e-06,
"loss": 1.3288,
"step": 2020
},
{
"epoch": 0.5543388995346291,
"grad_norm": 0.37896193597955086,
"learning_rate": 9.85186216907027e-06,
"loss": 1.3222,
"step": 2025
},
{
"epoch": 0.5557076375581713,
"grad_norm": 0.39627324276578396,
"learning_rate": 9.80408114123249e-06,
"loss": 1.3659,
"step": 2030
},
{
"epoch": 0.5570763755817136,
"grad_norm": 0.39331176053433153,
"learning_rate": 9.756304587598503e-06,
"loss": 1.4234,
"step": 2035
},
{
"epoch": 0.558445113605256,
"grad_norm": 0.38413569943245757,
"learning_rate": 9.708533599242643e-06,
"loss": 1.2809,
"step": 2040
},
{
"epoch": 0.5598138516287983,
"grad_norm": 0.378050582923272,
"learning_rate": 9.660769267112137e-06,
"loss": 1.3,
"step": 2045
},
{
"epoch": 0.5611825896523406,
"grad_norm": 0.3810287903511996,
"learning_rate": 9.61301268200222e-06,
"loss": 1.328,
"step": 2050
},
{
"epoch": 0.5625513276758828,
"grad_norm": 0.3968883040797777,
"learning_rate": 9.5652649345312e-06,
"loss": 1.3861,
"step": 2055
},
{
"epoch": 0.5639200656994251,
"grad_norm": 0.381754630880566,
"learning_rate": 9.517527115115554e-06,
"loss": 1.2931,
"step": 2060
},
{
"epoch": 0.5652888037229674,
"grad_norm": 0.3934475062551264,
"learning_rate": 9.46980031394504e-06,
"loss": 1.357,
"step": 2065
},
{
"epoch": 0.5666575417465097,
"grad_norm": 0.3764960791780743,
"learning_rate": 9.422085620957795e-06,
"loss": 1.3112,
"step": 2070
},
{
"epoch": 0.568026279770052,
"grad_norm": 0.3819475902039454,
"learning_rate": 9.374384125815427e-06,
"loss": 1.317,
"step": 2075
},
{
"epoch": 0.5693950177935944,
"grad_norm": 0.39075935188345634,
"learning_rate": 9.326696917878159e-06,
"loss": 1.329,
"step": 2080
},
{
"epoch": 0.5707637558171366,
"grad_norm": 0.4115410325839856,
"learning_rate": 9.27902508617993e-06,
"loss": 1.3405,
"step": 2085
},
{
"epoch": 0.5721324938406789,
"grad_norm": 0.3929327529237431,
"learning_rate": 9.23136971940353e-06,
"loss": 1.2811,
"step": 2090
},
{
"epoch": 0.5735012318642212,
"grad_norm": 0.38257057379328274,
"learning_rate": 9.183731905855746e-06,
"loss": 1.3234,
"step": 2095
},
{
"epoch": 0.5748699698877635,
"grad_norm": 0.38674820150451145,
"learning_rate": 9.136112733442493e-06,
"loss": 1.3466,
"step": 2100
},
{
"epoch": 0.5762387079113058,
"grad_norm": 0.39368922489348585,
"learning_rate": 9.088513289643982e-06,
"loss": 1.3149,
"step": 2105
},
{
"epoch": 0.577607445934848,
"grad_norm": 0.38199335563674713,
"learning_rate": 9.040934661489885e-06,
"loss": 1.3576,
"step": 2110
},
{
"epoch": 0.5789761839583903,
"grad_norm": 0.39323361965983067,
"learning_rate": 8.9933779355345e-06,
"loss": 1.3159,
"step": 2115
},
{
"epoch": 0.5803449219819327,
"grad_norm": 0.39256654497554794,
"learning_rate": 8.945844197831945e-06,
"loss": 1.3623,
"step": 2120
},
{
"epoch": 0.581713660005475,
"grad_norm": 0.3905410660434604,
"learning_rate": 8.898334533911362e-06,
"loss": 1.3074,
"step": 2125
},
{
"epoch": 0.5830823980290173,
"grad_norm": 0.3700729231212675,
"learning_rate": 8.850850028752108e-06,
"loss": 1.3436,
"step": 2130
},
{
"epoch": 0.5844511360525595,
"grad_norm": 0.3701852614302805,
"learning_rate": 8.803391766758998e-06,
"loss": 1.2672,
"step": 2135
},
{
"epoch": 0.5858198740761018,
"grad_norm": 0.3809658344783075,
"learning_rate": 8.755960831737529e-06,
"loss": 1.3638,
"step": 2140
},
{
"epoch": 0.5871886120996441,
"grad_norm": 0.3852501406589481,
"learning_rate": 8.708558306869125e-06,
"loss": 1.3389,
"step": 2145
},
{
"epoch": 0.5885573501231864,
"grad_norm": 0.4003501899309122,
"learning_rate": 8.661185274686418e-06,
"loss": 1.3228,
"step": 2150
},
{
"epoch": 0.5899260881467288,
"grad_norm": 0.38933438340786924,
"learning_rate": 8.613842817048503e-06,
"loss": 1.3627,
"step": 2155
},
{
"epoch": 0.591294826170271,
"grad_norm": 0.39608781513600605,
"learning_rate": 8.566532015116251e-06,
"loss": 1.3626,
"step": 2160
},
{
"epoch": 0.5926635641938133,
"grad_norm": 0.37582004076749886,
"learning_rate": 8.51925394932761e-06,
"loss": 1.2766,
"step": 2165
},
{
"epoch": 0.5940323022173556,
"grad_norm": 0.39046439420533485,
"learning_rate": 8.47200969937293e-06,
"loss": 1.3342,
"step": 2170
},
{
"epoch": 0.5954010402408979,
"grad_norm": 0.3846853640345239,
"learning_rate": 8.424800344170308e-06,
"loss": 1.2973,
"step": 2175
},
{
"epoch": 0.5967697782644402,
"grad_norm": 0.3794031254673053,
"learning_rate": 8.37762696184095e-06,
"loss": 1.3288,
"step": 2180
},
{
"epoch": 0.5981385162879824,
"grad_norm": 0.38487683969290326,
"learning_rate": 8.330490629684547e-06,
"loss": 1.3374,
"step": 2185
},
{
"epoch": 0.5995072543115247,
"grad_norm": 0.38666671699682115,
"learning_rate": 8.28339242415468e-06,
"loss": 1.3202,
"step": 2190
},
{
"epoch": 0.600875992335067,
"grad_norm": 0.3788541652969095,
"learning_rate": 8.236333420834216e-06,
"loss": 1.2778,
"step": 2195
},
{
"epoch": 0.6022447303586094,
"grad_norm": 0.4003559394681289,
"learning_rate": 8.189314694410781e-06,
"loss": 1.3349,
"step": 2200
},
{
"epoch": 0.6036134683821517,
"grad_norm": 0.40672129713209243,
"learning_rate": 8.14233731865218e-06,
"loss": 1.3343,
"step": 2205
},
{
"epoch": 0.604982206405694,
"grad_norm": 0.392344465653996,
"learning_rate": 8.0954023663819e-06,
"loss": 1.2804,
"step": 2210
},
{
"epoch": 0.6063509444292362,
"grad_norm": 0.396931681160349,
"learning_rate": 8.048510909454601e-06,
"loss": 1.3712,
"step": 2215
},
{
"epoch": 0.6077196824527785,
"grad_norm": 0.3882127247657791,
"learning_rate": 8.001664018731637e-06,
"loss": 1.3313,
"step": 2220
},
{
"epoch": 0.6090884204763208,
"grad_norm": 0.3727086035483801,
"learning_rate": 7.954862764056602e-06,
"loss": 1.2715,
"step": 2225
},
{
"epoch": 0.6104571584998632,
"grad_norm": 0.39519191088539024,
"learning_rate": 7.908108214230902e-06,
"loss": 1.3018,
"step": 2230
},
{
"epoch": 0.6118258965234055,
"grad_norm": 0.39954515244878724,
"learning_rate": 7.861401436989337e-06,
"loss": 1.3697,
"step": 2235
},
{
"epoch": 0.6131946345469477,
"grad_norm": 0.3961862899698687,
"learning_rate": 7.814743498975731e-06,
"loss": 1.3651,
"step": 2240
},
{
"epoch": 0.61456337257049,
"grad_norm": 0.38565731038310297,
"learning_rate": 7.768135465718559e-06,
"loss": 1.3462,
"step": 2245
},
{
"epoch": 0.6159321105940323,
"grad_norm": 0.38796255525719986,
"learning_rate": 7.72157840160662e-06,
"loss": 1.3087,
"step": 2250
},
{
"epoch": 0.6173008486175746,
"grad_norm": 0.3947254708668997,
"learning_rate": 7.67507336986474e-06,
"loss": 1.262,
"step": 2255
},
{
"epoch": 0.6186695866411169,
"grad_norm": 0.39167259958488637,
"learning_rate": 7.628621432529461e-06,
"loss": 1.3117,
"step": 2260
},
{
"epoch": 0.6200383246646591,
"grad_norm": 0.39458361491505556,
"learning_rate": 7.582223650424825e-06,
"loss": 1.3067,
"step": 2265
},
{
"epoch": 0.6214070626882015,
"grad_norm": 0.3948240515553033,
"learning_rate": 7.5358810831381225e-06,
"loss": 1.3177,
"step": 2270
},
{
"epoch": 0.6227758007117438,
"grad_norm": 0.3814626352546649,
"learning_rate": 7.489594788995698e-06,
"loss": 1.3259,
"step": 2275
},
{
"epoch": 0.6241445387352861,
"grad_norm": 0.39070667312634827,
"learning_rate": 7.443365825038793e-06,
"loss": 1.3212,
"step": 2280
},
{
"epoch": 0.6255132767588284,
"grad_norm": 0.38913877038236866,
"learning_rate": 7.397195246999391e-06,
"loss": 1.346,
"step": 2285
},
{
"epoch": 0.6268820147823706,
"grad_norm": 0.4070153012541291,
"learning_rate": 7.351084109276119e-06,
"loss": 1.3826,
"step": 2290
},
{
"epoch": 0.6282507528059129,
"grad_norm": 0.36428744429147014,
"learning_rate": 7.305033464910164e-06,
"loss": 1.2659,
"step": 2295
},
{
"epoch": 0.6296194908294552,
"grad_norm": 0.3877290507395246,
"learning_rate": 7.25904436556122e-06,
"loss": 1.304,
"step": 2300
},
{
"epoch": 0.6309882288529975,
"grad_norm": 0.3803920075270922,
"learning_rate": 7.21311786148348e-06,
"loss": 1.3333,
"step": 2305
},
{
"epoch": 0.6323569668765399,
"grad_norm": 0.3822541348646928,
"learning_rate": 7.167255001501651e-06,
"loss": 1.3307,
"step": 2310
},
{
"epoch": 0.6337257049000822,
"grad_norm": 0.3977624877197784,
"learning_rate": 7.121456832986988e-06,
"loss": 1.3329,
"step": 2315
},
{
"epoch": 0.6350944429236244,
"grad_norm": 0.38990040940646387,
"learning_rate": 7.075724401833395e-06,
"loss": 1.3647,
"step": 2320
},
{
"epoch": 0.6364631809471667,
"grad_norm": 0.39550975070062716,
"learning_rate": 7.030058752433526e-06,
"loss": 1.2715,
"step": 2325
},
{
"epoch": 0.637831918970709,
"grad_norm": 0.3857148917491234,
"learning_rate": 6.984460927654937e-06,
"loss": 1.3191,
"step": 2330
},
{
"epoch": 0.6392006569942513,
"grad_norm": 0.38855941306596253,
"learning_rate": 6.938931968816275e-06,
"loss": 1.2979,
"step": 2335
},
{
"epoch": 0.6405693950177936,
"grad_norm": 0.3892088605550479,
"learning_rate": 6.893472915663493e-06,
"loss": 1.3826,
"step": 2340
},
{
"epoch": 0.6419381330413358,
"grad_norm": 0.4024108620394257,
"learning_rate": 6.8480848063461035e-06,
"loss": 1.2943,
"step": 2345
},
{
"epoch": 0.6433068710648782,
"grad_norm": 0.3976600247063458,
"learning_rate": 6.8027686773934765e-06,
"loss": 1.3241,
"step": 2350
},
{
"epoch": 0.6446756090884205,
"grad_norm": 0.38538373822690386,
"learning_rate": 6.7575255636911626e-06,
"loss": 1.3316,
"step": 2355
},
{
"epoch": 0.6460443471119628,
"grad_norm": 0.3961847974725629,
"learning_rate": 6.7123564984572596e-06,
"loss": 1.3481,
"step": 2360
},
{
"epoch": 0.6474130851355051,
"grad_norm": 0.3739018408786073,
"learning_rate": 6.667262513218824e-06,
"loss": 1.2857,
"step": 2365
},
{
"epoch": 0.6487818231590473,
"grad_norm": 0.4058027362822848,
"learning_rate": 6.622244637788302e-06,
"loss": 1.3623,
"step": 2370
},
{
"epoch": 0.6501505611825896,
"grad_norm": 0.3887614780397367,
"learning_rate": 6.577303900240023e-06,
"loss": 1.3466,
"step": 2375
},
{
"epoch": 0.6515192992061319,
"grad_norm": 0.39583219449776136,
"learning_rate": 6.532441326886716e-06,
"loss": 1.3826,
"step": 2380
},
{
"epoch": 0.6528880372296743,
"grad_norm": 0.38600694323078566,
"learning_rate": 6.487657942256069e-06,
"loss": 1.3124,
"step": 2385
},
{
"epoch": 0.6542567752532166,
"grad_norm": 0.35581732986644143,
"learning_rate": 6.442954769067341e-06,
"loss": 1.3352,
"step": 2390
},
{
"epoch": 0.6556255132767588,
"grad_norm": 0.3977530272775962,
"learning_rate": 6.398332828207996e-06,
"loss": 1.3226,
"step": 2395
},
{
"epoch": 0.6569942513003011,
"grad_norm": 0.3928175408666658,
"learning_rate": 6.3537931387103925e-06,
"loss": 1.3262,
"step": 2400
},
{
"epoch": 0.6583629893238434,
"grad_norm": 0.3795840583774513,
"learning_rate": 6.309336717728516e-06,
"loss": 1.3203,
"step": 2405
},
{
"epoch": 0.6597317273473857,
"grad_norm": 0.396679060826067,
"learning_rate": 6.264964580514745e-06,
"loss": 1.2842,
"step": 2410
},
{
"epoch": 0.661100465370928,
"grad_norm": 0.3699325537006795,
"learning_rate": 6.220677740396668e-06,
"loss": 1.2936,
"step": 2415
},
{
"epoch": 0.6624692033944704,
"grad_norm": 0.38098356916961845,
"learning_rate": 6.176477208753944e-06,
"loss": 1.276,
"step": 2420
},
{
"epoch": 0.6638379414180126,
"grad_norm": 0.38873579028440075,
"learning_rate": 6.132363994995194e-06,
"loss": 1.2348,
"step": 2425
},
{
"epoch": 0.6652066794415549,
"grad_norm": 0.39405073870487267,
"learning_rate": 6.088339106534971e-06,
"loss": 1.2659,
"step": 2430
},
{
"epoch": 0.6665754174650972,
"grad_norm": 0.3833620202872981,
"learning_rate": 6.044403548770735e-06,
"loss": 1.3103,
"step": 2435
},
{
"epoch": 0.6679441554886395,
"grad_norm": 0.39527725366864974,
"learning_rate": 6.000558325059894e-06,
"loss": 1.3423,
"step": 2440
},
{
"epoch": 0.6693128935121818,
"grad_norm": 0.38996615233504806,
"learning_rate": 5.956804436696904e-06,
"loss": 1.303,
"step": 2445
},
{
"epoch": 0.670681631535724,
"grad_norm": 0.3943489977366666,
"learning_rate": 5.9131428828903905e-06,
"loss": 1.3547,
"step": 2450
},
{
"epoch": 0.6720503695592663,
"grad_norm": 0.3899208460332646,
"learning_rate": 5.8695746607403285e-06,
"loss": 1.2969,
"step": 2455
},
{
"epoch": 0.6734191075828087,
"grad_norm": 0.3940611248671062,
"learning_rate": 5.826100765215273e-06,
"loss": 1.318,
"step": 2460
},
{
"epoch": 0.674787845606351,
"grad_norm": 0.3930083338553073,
"learning_rate": 5.782722189129655e-06,
"loss": 1.2906,
"step": 2465
},
{
"epoch": 0.6761565836298933,
"grad_norm": 0.39311675749157776,
"learning_rate": 5.739439923121077e-06,
"loss": 1.3569,
"step": 2470
},
{
"epoch": 0.6775253216534355,
"grad_norm": 0.4114954929599597,
"learning_rate": 5.6962549556277134e-06,
"loss": 1.3773,
"step": 2475
},
{
"epoch": 0.6788940596769778,
"grad_norm": 0.38652001787654966,
"learning_rate": 5.653168272865724e-06,
"loss": 1.3008,
"step": 2480
},
{
"epoch": 0.6802627977005201,
"grad_norm": 0.3878760723325068,
"learning_rate": 5.6101808588067505e-06,
"loss": 1.2992,
"step": 2485
},
{
"epoch": 0.6816315357240624,
"grad_norm": 0.39884745029373364,
"learning_rate": 5.56729369515542e-06,
"loss": 1.3088,
"step": 2490
},
{
"epoch": 0.6830002737476047,
"grad_norm": 0.38961302860625757,
"learning_rate": 5.52450776132694e-06,
"loss": 1.2904,
"step": 2495
},
{
"epoch": 0.684369011771147,
"grad_norm": 0.3970778873479824,
"learning_rate": 5.481824034424741e-06,
"loss": 1.3464,
"step": 2500
},
{
"epoch": 0.6857377497946893,
"grad_norm": 0.4014161218820373,
"learning_rate": 5.439243489218138e-06,
"loss": 1.3376,
"step": 2505
},
{
"epoch": 0.6871064878182316,
"grad_norm": 0.3874962119825426,
"learning_rate": 5.396767098120087e-06,
"loss": 1.3078,
"step": 2510
},
{
"epoch": 0.6884752258417739,
"grad_norm": 0.37396226223807555,
"learning_rate": 5.354395831164982e-06,
"loss": 1.3097,
"step": 2515
},
{
"epoch": 0.6898439638653162,
"grad_norm": 0.3792883560955228,
"learning_rate": 5.312130655986485e-06,
"loss": 1.3158,
"step": 2520
},
{
"epoch": 0.6912127018888585,
"grad_norm": 0.3798203691318429,
"learning_rate": 5.269972537795434e-06,
"loss": 1.3087,
"step": 2525
},
{
"epoch": 0.6925814399124007,
"grad_norm": 0.38253235315284295,
"learning_rate": 5.227922439357823e-06,
"loss": 1.3198,
"step": 2530
},
{
"epoch": 0.693950177935943,
"grad_norm": 0.3817014501814054,
"learning_rate": 5.1859813209727775e-06,
"loss": 1.321,
"step": 2535
},
{
"epoch": 0.6953189159594854,
"grad_norm": 0.3950376035714224,
"learning_rate": 5.144150140450643e-06,
"loss": 1.3024,
"step": 2540
},
{
"epoch": 0.6966876539830277,
"grad_norm": 0.4084629435066235,
"learning_rate": 5.102429853091128e-06,
"loss": 1.3437,
"step": 2545
},
{
"epoch": 0.69805639200657,
"grad_norm": 0.3808185831587325,
"learning_rate": 5.060821411661459e-06,
"loss": 1.3264,
"step": 2550
},
{
"epoch": 0.6994251300301122,
"grad_norm": 0.3937638284493571,
"learning_rate": 5.019325766374625e-06,
"loss": 1.314,
"step": 2555
},
{
"epoch": 0.7007938680536545,
"grad_norm": 0.3920386340433558,
"learning_rate": 4.977943864867712e-06,
"loss": 1.3271,
"step": 2560
},
{
"epoch": 0.7021626060771968,
"grad_norm": 0.3898729624487372,
"learning_rate": 4.936676652180215e-06,
"loss": 1.3467,
"step": 2565
},
{
"epoch": 0.7035313441007391,
"grad_norm": 0.39902316286587364,
"learning_rate": 4.89552507073248e-06,
"loss": 1.3102,
"step": 2570
},
{
"epoch": 0.7049000821242815,
"grad_norm": 0.3673934948431135,
"learning_rate": 4.854490060304192e-06,
"loss": 1.2612,
"step": 2575
},
{
"epoch": 0.7062688201478237,
"grad_norm": 0.3922497888421968,
"learning_rate": 4.813572558012892e-06,
"loss": 1.329,
"step": 2580
},
{
"epoch": 0.707637558171366,
"grad_norm": 0.3860080742782407,
"learning_rate": 4.772773498292579e-06,
"loss": 1.3332,
"step": 2585
},
{
"epoch": 0.7090062961949083,
"grad_norm": 0.3695087849805836,
"learning_rate": 4.732093812872391e-06,
"loss": 1.2697,
"step": 2590
},
{
"epoch": 0.7103750342184506,
"grad_norm": 0.4012769440751328,
"learning_rate": 4.691534430755302e-06,
"loss": 1.3213,
"step": 2595
},
{
"epoch": 0.7117437722419929,
"grad_norm": 0.38657730746893026,
"learning_rate": 4.651096278196916e-06,
"loss": 1.3665,
"step": 2600
},
{
"epoch": 0.7131125102655351,
"grad_norm": 0.38560469814966725,
"learning_rate": 4.610780278684315e-06,
"loss": 1.3369,
"step": 2605
},
{
"epoch": 0.7144812482890774,
"grad_norm": 0.39542221048440374,
"learning_rate": 4.570587352914977e-06,
"loss": 1.3048,
"step": 2610
},
{
"epoch": 0.7158499863126198,
"grad_norm": 0.3910002548641959,
"learning_rate": 4.530518418775734e-06,
"loss": 1.3247,
"step": 2615
},
{
"epoch": 0.7172187243361621,
"grad_norm": 0.3825608407076146,
"learning_rate": 4.490574391321814e-06,
"loss": 1.2967,
"step": 2620
},
{
"epoch": 0.7185874623597044,
"grad_norm": 0.4028460389272689,
"learning_rate": 4.450756182755963e-06,
"loss": 1.3671,
"step": 2625
},
{
"epoch": 0.7199562003832466,
"grad_norm": 0.40485919090431155,
"learning_rate": 4.411064702407585e-06,
"loss": 1.3556,
"step": 2630
},
{
"epoch": 0.7213249384067889,
"grad_norm": 0.39021669719654245,
"learning_rate": 4.371500856711988e-06,
"loss": 1.3774,
"step": 2635
},
{
"epoch": 0.7226936764303312,
"grad_norm": 0.3958279714364836,
"learning_rate": 4.332065549189697e-06,
"loss": 1.2982,
"step": 2640
},
{
"epoch": 0.7240624144538735,
"grad_norm": 0.4060690524084766,
"learning_rate": 4.292759680425794e-06,
"loss": 1.3767,
"step": 2645
},
{
"epoch": 0.7254311524774159,
"grad_norm": 0.40926421108177974,
"learning_rate": 4.253584148049369e-06,
"loss": 1.339,
"step": 2650
},
{
"epoch": 0.7267998905009582,
"grad_norm": 0.37375698755072967,
"learning_rate": 4.214539846713024e-06,
"loss": 1.2961,
"step": 2655
},
{
"epoch": 0.7281686285245004,
"grad_norm": 0.3968852233360893,
"learning_rate": 4.175627668072425e-06,
"loss": 1.2962,
"step": 2660
},
{
"epoch": 0.7295373665480427,
"grad_norm": 0.4073445114415598,
"learning_rate": 4.136848500765948e-06,
"loss": 1.4056,
"step": 2665
},
{
"epoch": 0.730906104571585,
"grad_norm": 0.3950383171619063,
"learning_rate": 4.098203230394399e-06,
"loss": 1.3192,
"step": 2670
},
{
"epoch": 0.7322748425951273,
"grad_norm": 0.3986322017273241,
"learning_rate": 4.059692739500761e-06,
"loss": 1.3485,
"step": 2675
},
{
"epoch": 0.7336435806186696,
"grad_norm": 0.3848930065792978,
"learning_rate": 4.02131790755006e-06,
"loss": 1.378,
"step": 2680
},
{
"epoch": 0.7350123186422118,
"grad_norm": 0.39139666434515563,
"learning_rate": 3.983079610909283e-06,
"loss": 1.3334,
"step": 2685
},
{
"epoch": 0.7363810566657542,
"grad_norm": 0.38858454704969564,
"learning_rate": 3.944978722827347e-06,
"loss": 1.3235,
"step": 2690
},
{
"epoch": 0.7377497946892965,
"grad_norm": 0.390122324459421,
"learning_rate": 3.907016113415166e-06,
"loss": 1.3583,
"step": 2695
},
{
"epoch": 0.7391185327128388,
"grad_norm": 0.3919502066927978,
"learning_rate": 3.869192649625792e-06,
"loss": 1.3092,
"step": 2700
},
{
"epoch": 0.7404872707363811,
"grad_norm": 0.39777490757009926,
"learning_rate": 3.831509195234598e-06,
"loss": 1.3358,
"step": 2705
},
{
"epoch": 0.7418560087599233,
"grad_norm": 0.39353868348346743,
"learning_rate": 3.793966610819545e-06,
"loss": 1.3515,
"step": 2710
},
{
"epoch": 0.7432247467834656,
"grad_norm": 0.39342447998988617,
"learning_rate": 3.756565753741569e-06,
"loss": 1.2885,
"step": 2715
},
{
"epoch": 0.7445934848070079,
"grad_norm": 0.37781062124467246,
"learning_rate": 3.7193074781249585e-06,
"loss": 1.276,
"step": 2720
},
{
"epoch": 0.7459622228305502,
"grad_norm": 0.38397833073923665,
"learning_rate": 3.6821926348378666e-06,
"loss": 1.3031,
"step": 2725
},
{
"epoch": 0.7473309608540926,
"grad_norm": 0.3993776914034358,
"learning_rate": 3.6452220714728883e-06,
"loss": 1.3719,
"step": 2730
},
{
"epoch": 0.7486996988776348,
"grad_norm": 0.387591612397714,
"learning_rate": 3.608396632327684e-06,
"loss": 1.3006,
"step": 2735
},
{
"epoch": 0.7500684369011771,
"grad_norm": 0.4059102325008591,
"learning_rate": 3.5717171583857115e-06,
"loss": 1.3689,
"step": 2740
},
{
"epoch": 0.7514371749247194,
"grad_norm": 0.3772375625162727,
"learning_rate": 3.5351844872970233e-06,
"loss": 1.3091,
"step": 2745
},
{
"epoch": 0.7528059129482617,
"grad_norm": 0.3853940006094772,
"learning_rate": 3.498799453359124e-06,
"loss": 1.3454,
"step": 2750
},
{
"epoch": 0.754174650971804,
"grad_norm": 0.3771144405512011,
"learning_rate": 3.462562887497927e-06,
"loss": 1.2998,
"step": 2755
},
{
"epoch": 0.7555433889953463,
"grad_norm": 0.39604150906533137,
"learning_rate": 3.4264756172487813e-06,
"loss": 1.3129,
"step": 2760
},
{
"epoch": 0.7569121270188885,
"grad_norm": 0.3843657920576998,
"learning_rate": 3.390538466737564e-06,
"loss": 1.2981,
"step": 2765
},
{
"epoch": 0.7582808650424309,
"grad_norm": 0.39238834291334906,
"learning_rate": 3.3547522566618593e-06,
"loss": 1.2697,
"step": 2770
},
{
"epoch": 0.7596496030659732,
"grad_norm": 0.4030855832137369,
"learning_rate": 3.319117804272236e-06,
"loss": 1.3152,
"step": 2775
},
{
"epoch": 0.7610183410895155,
"grad_norm": 0.38853914068753576,
"learning_rate": 3.283635923353553e-06,
"loss": 1.3125,
"step": 2780
},
{
"epoch": 0.7623870791130578,
"grad_norm": 0.39575307616938343,
"learning_rate": 3.248307424206395e-06,
"loss": 1.4215,
"step": 2785
},
{
"epoch": 0.7637558171366,
"grad_norm": 0.3903008557955737,
"learning_rate": 3.2131331136285717e-06,
"loss": 1.3034,
"step": 2790
},
{
"epoch": 0.7651245551601423,
"grad_norm": 0.38760587663102863,
"learning_rate": 3.1781137948966754e-06,
"loss": 1.291,
"step": 2795
},
{
"epoch": 0.7664932931836846,
"grad_norm": 0.3917779069091076,
"learning_rate": 3.1432502677477494e-06,
"loss": 1.3318,
"step": 2800
},
{
"epoch": 0.767862031207227,
"grad_norm": 0.3978532288818969,
"learning_rate": 3.108543328361017e-06,
"loss": 1.3074,
"step": 2805
},
{
"epoch": 0.7692307692307693,
"grad_norm": 0.3931552217963087,
"learning_rate": 3.0739937693397113e-06,
"loss": 1.3181,
"step": 2810
},
{
"epoch": 0.7705995072543115,
"grad_norm": 0.38132685696899987,
"learning_rate": 3.0396023796929597e-06,
"loss": 1.3202,
"step": 2815
},
{
"epoch": 0.7719682452778538,
"grad_norm": 0.3895674318092009,
"learning_rate": 3.0053699448177687e-06,
"loss": 1.2912,
"step": 2820
},
{
"epoch": 0.7733369833013961,
"grad_norm": 0.3923145835781509,
"learning_rate": 2.971297246481101e-06,
"loss": 1.2874,
"step": 2825
},
{
"epoch": 0.7747057213249384,
"grad_norm": 0.3964424700594031,
"learning_rate": 2.937385062802004e-06,
"loss": 1.3723,
"step": 2830
},
{
"epoch": 0.7760744593484807,
"grad_norm": 0.39752354179970156,
"learning_rate": 2.9036341682338466e-06,
"loss": 1.3329,
"step": 2835
},
{
"epoch": 0.7774431973720229,
"grad_norm": 0.39323110376722525,
"learning_rate": 2.870045333546644e-06,
"loss": 1.2905,
"step": 2840
},
{
"epoch": 0.7788119353955653,
"grad_norm": 0.3924911670687984,
"learning_rate": 2.8366193258094355e-06,
"loss": 1.292,
"step": 2845
},
{
"epoch": 0.7801806734191076,
"grad_norm": 0.372337059578975,
"learning_rate": 2.8033569083727797e-06,
"loss": 1.3368,
"step": 2850
},
{
"epoch": 0.7815494114426499,
"grad_norm": 0.386256983183401,
"learning_rate": 2.7702588408513276e-06,
"loss": 1.3236,
"step": 2855
},
{
"epoch": 0.7829181494661922,
"grad_norm": 0.4151085195929508,
"learning_rate": 2.7373258791064572e-06,
"loss": 1.3342,
"step": 2860
},
{
"epoch": 0.7842868874897345,
"grad_norm": 0.39410387314037304,
"learning_rate": 2.7045587752290224e-06,
"loss": 1.28,
"step": 2865
},
{
"epoch": 0.7856556255132767,
"grad_norm": 0.40070665770653285,
"learning_rate": 2.6719582775221862e-06,
"loss": 1.3336,
"step": 2870
},
{
"epoch": 0.787024363536819,
"grad_norm": 0.3842777541641571,
"learning_rate": 2.6395251304843137e-06,
"loss": 1.2757,
"step": 2875
},
{
"epoch": 0.7883931015603614,
"grad_norm": 0.40580643578053277,
"learning_rate": 2.6072600747919773e-06,
"loss": 1.3196,
"step": 2880
},
{
"epoch": 0.7897618395839037,
"grad_norm": 0.388179506322075,
"learning_rate": 2.575163847283053e-06,
"loss": 1.293,
"step": 2885
},
{
"epoch": 0.791130577607446,
"grad_norm": 0.3732450173203748,
"learning_rate": 2.543237180939875e-06,
"loss": 1.2751,
"step": 2890
},
{
"epoch": 0.7924993156309882,
"grad_norm": 0.38506878802774175,
"learning_rate": 2.5114808048725035e-06,
"loss": 1.3083,
"step": 2895
},
{
"epoch": 0.7938680536545305,
"grad_norm": 0.4075860384902042,
"learning_rate": 2.479895444302086e-06,
"loss": 1.307,
"step": 2900
},
{
"epoch": 0.7952367916780728,
"grad_norm": 0.39287683770161946,
"learning_rate": 2.4484818205442763e-06,
"loss": 1.3645,
"step": 2905
},
{
"epoch": 0.7966055297016151,
"grad_norm": 0.3882005380231924,
"learning_rate": 2.417240650992767e-06,
"loss": 1.3512,
"step": 2910
},
{
"epoch": 0.7979742677251574,
"grad_norm": 0.3866431713970654,
"learning_rate": 2.3861726491029237e-06,
"loss": 1.2793,
"step": 2915
},
{
"epoch": 0.7993430057486997,
"grad_norm": 0.3798968085022952,
"learning_rate": 2.355278524375465e-06,
"loss": 1.2865,
"step": 2920
},
{
"epoch": 0.800711743772242,
"grad_norm": 0.39453328832547074,
"learning_rate": 2.324558982340275e-06,
"loss": 1.2576,
"step": 2925
},
{
"epoch": 0.8020804817957843,
"grad_norm": 0.3764906658011197,
"learning_rate": 2.2940147245402944e-06,
"loss": 1.3167,
"step": 2930
},
{
"epoch": 0.8034492198193266,
"grad_norm": 0.3823354264503374,
"learning_rate": 2.2636464485154875e-06,
"loss": 1.334,
"step": 2935
},
{
"epoch": 0.8048179578428689,
"grad_norm": 0.3911098766218853,
"learning_rate": 2.23345484778692e-06,
"loss": 1.3856,
"step": 2940
},
{
"epoch": 0.8061866958664111,
"grad_norm": 0.4024307217930752,
"learning_rate": 2.2034406118409178e-06,
"loss": 1.3704,
"step": 2945
},
{
"epoch": 0.8075554338899534,
"grad_norm": 0.38312668558103635,
"learning_rate": 2.1736044261133305e-06,
"loss": 1.32,
"step": 2950
},
{
"epoch": 0.8089241719134957,
"grad_norm": 0.39944235808095113,
"learning_rate": 2.1439469719738615e-06,
"loss": 1.3348,
"step": 2955
},
{
"epoch": 0.8102929099370381,
"grad_norm": 0.37362368240209753,
"learning_rate": 2.1144689267105213e-06,
"loss": 1.3058,
"step": 2960
},
{
"epoch": 0.8116616479605804,
"grad_norm": 0.3977426993128578,
"learning_rate": 2.0851709635141526e-06,
"loss": 1.329,
"step": 2965
},
{
"epoch": 0.8130303859841227,
"grad_norm": 0.390250388707541,
"learning_rate": 2.0560537514630595e-06,
"loss": 1.3435,
"step": 2970
},
{
"epoch": 0.8143991240076649,
"grad_norm": 0.41070469591969644,
"learning_rate": 2.0271179555077357e-06,
"loss": 1.3172,
"step": 2975
},
{
"epoch": 0.8157678620312072,
"grad_norm": 0.378348117809942,
"learning_rate": 1.998364236455661e-06,
"loss": 1.3027,
"step": 2980
},
{
"epoch": 0.8171366000547495,
"grad_norm": 0.380190609171277,
"learning_rate": 1.969793250956221e-06,
"loss": 1.2577,
"step": 2985
},
{
"epoch": 0.8185053380782918,
"grad_norm": 0.38655656053220244,
"learning_rate": 1.9414056514857205e-06,
"loss": 1.3137,
"step": 2990
},
{
"epoch": 0.8198740761018342,
"grad_norm": 0.38867016212920535,
"learning_rate": 1.913202086332463e-06,
"loss": 1.3597,
"step": 2995
},
{
"epoch": 0.8212428141253764,
"grad_norm": 0.38641157633799067,
"learning_rate": 1.8851831995819569e-06,
"loss": 1.3184,
"step": 3000
},
{
"epoch": 0.8226115521489187,
"grad_norm": 0.39081114681314566,
"learning_rate": 1.8573496311022133e-06,
"loss": 1.3219,
"step": 3005
},
{
"epoch": 0.823980290172461,
"grad_norm": 0.37733544335013924,
"learning_rate": 1.8297020165291158e-06,
"loss": 1.2927,
"step": 3010
},
{
"epoch": 0.8253490281960033,
"grad_norm": 0.38748075262103254,
"learning_rate": 1.8022409872519197e-06,
"loss": 1.3184,
"step": 3015
},
{
"epoch": 0.8267177662195456,
"grad_norm": 0.40138106139790847,
"learning_rate": 1.7749671703988226e-06,
"loss": 1.3312,
"step": 3020
},
{
"epoch": 0.8280865042430878,
"grad_norm": 0.3876362883781664,
"learning_rate": 1.7478811888226555e-06,
"loss": 1.3101,
"step": 3025
},
{
"epoch": 0.8294552422666301,
"grad_norm": 0.3936459314036851,
"learning_rate": 1.7209836610866426e-06,
"loss": 1.3431,
"step": 3030
},
{
"epoch": 0.8308239802901725,
"grad_norm": 0.38764243587502206,
"learning_rate": 1.694275201450284e-06,
"loss": 1.3386,
"step": 3035
},
{
"epoch": 0.8321927183137148,
"grad_norm": 0.3908568598595201,
"learning_rate": 1.6677564198553332e-06,
"loss": 1.3342,
"step": 3040
},
{
"epoch": 0.8335614563372571,
"grad_norm": 0.38588972696708185,
"learning_rate": 1.6414279219118568e-06,
"loss": 1.3527,
"step": 3045
},
{
"epoch": 0.8349301943607993,
"grad_norm": 0.3907806499233958,
"learning_rate": 1.6152903088844051e-06,
"loss": 1.3104,
"step": 3050
},
{
"epoch": 0.8362989323843416,
"grad_norm": 0.3825824287507951,
"learning_rate": 1.5893441776782947e-06,
"loss": 1.3062,
"step": 3055
},
{
"epoch": 0.8376676704078839,
"grad_norm": 0.3842728055099776,
"learning_rate": 1.5635901208259608e-06,
"loss": 1.3581,
"step": 3060
},
{
"epoch": 0.8390364084314262,
"grad_norm": 0.3905623616859072,
"learning_rate": 1.5380287264734285e-06,
"loss": 1.3148,
"step": 3065
},
{
"epoch": 0.8404051464549686,
"grad_norm": 0.39844460827488454,
"learning_rate": 1.5126605783668945e-06,
"loss": 1.3074,
"step": 3070
},
{
"epoch": 0.8417738844785108,
"grad_norm": 0.39217288885390583,
"learning_rate": 1.4874862558393787e-06,
"loss": 1.3171,
"step": 3075
},
{
"epoch": 0.8431426225020531,
"grad_norm": 0.38303308206260517,
"learning_rate": 1.462506333797501e-06,
"loss": 1.2985,
"step": 3080
},
{
"epoch": 0.8445113605255954,
"grad_norm": 0.37178133903556354,
"learning_rate": 1.4377213827083602e-06,
"loss": 1.278,
"step": 3085
},
{
"epoch": 0.8458800985491377,
"grad_norm": 0.383873936282195,
"learning_rate": 1.413131968586491e-06,
"loss": 1.2989,
"step": 3090
},
{
"epoch": 0.84724883657268,
"grad_norm": 0.3877611566994199,
"learning_rate": 1.3887386529809454e-06,
"loss": 1.3543,
"step": 3095
},
{
"epoch": 0.8486175745962223,
"grad_norm": 0.3844140889489012,
"learning_rate": 1.364541992962476e-06,
"loss": 1.345,
"step": 3100
},
{
"epoch": 0.8499863126197645,
"grad_norm": 0.3791779816963577,
"learning_rate": 1.3405425411108008e-06,
"loss": 1.3202,
"step": 3105
},
{
"epoch": 0.8513550506433069,
"grad_norm": 0.37802806800903216,
"learning_rate": 1.3167408455019903e-06,
"loss": 1.3,
"step": 3110
},
{
"epoch": 0.8527237886668492,
"grad_norm": 0.38421255363489804,
"learning_rate": 1.2931374496959548e-06,
"loss": 1.3033,
"step": 3115
},
{
"epoch": 0.8540925266903915,
"grad_norm": 0.3890245703911996,
"learning_rate": 1.2697328927240238e-06,
"loss": 1.3155,
"step": 3120
},
{
"epoch": 0.8554612647139338,
"grad_norm": 0.38965532355017113,
"learning_rate": 1.2465277090766381e-06,
"loss": 1.3408,
"step": 3125
},
{
"epoch": 0.856830002737476,
"grad_norm": 0.38847581936098935,
"learning_rate": 1.2235224286911495e-06,
"loss": 1.3619,
"step": 3130
},
{
"epoch": 0.8581987407610183,
"grad_norm": 0.3901578768967818,
"learning_rate": 1.2007175769397117e-06,
"loss": 1.3714,
"step": 3135
},
{
"epoch": 0.8595674787845606,
"grad_norm": 0.38170228292380465,
"learning_rate": 1.178113674617285e-06,
"loss": 1.3144,
"step": 3140
},
{
"epoch": 0.860936216808103,
"grad_norm": 0.3852269233134472,
"learning_rate": 1.1557112379297385e-06,
"loss": 1.3542,
"step": 3145
},
{
"epoch": 0.8623049548316453,
"grad_norm": 0.385512160462019,
"learning_rate": 1.1335107784820741e-06,
"loss": 1.3556,
"step": 3150
},
{
"epoch": 0.8636736928551875,
"grad_norm": 0.39758307605071247,
"learning_rate": 1.1115128032667288e-06,
"loss": 1.2992,
"step": 3155
},
{
"epoch": 0.8650424308787298,
"grad_norm": 0.3702170793773961,
"learning_rate": 1.0897178146520014e-06,
"loss": 1.3861,
"step": 3160
},
{
"epoch": 0.8664111689022721,
"grad_norm": 0.3862887179246134,
"learning_rate": 1.0681263103705853e-06,
"loss": 1.3317,
"step": 3165
},
{
"epoch": 0.8677799069258144,
"grad_norm": 0.3979382501364706,
"learning_rate": 1.0467387835081944e-06,
"loss": 1.351,
"step": 3170
},
{
"epoch": 0.8691486449493567,
"grad_norm": 0.3853783562962658,
"learning_rate": 1.0255557224923018e-06,
"loss": 1.3474,
"step": 3175
},
{
"epoch": 0.8705173829728989,
"grad_norm": 0.3921947417156507,
"learning_rate": 1.004577611080998e-06,
"loss": 1.3162,
"step": 3180
},
{
"epoch": 0.8718861209964412,
"grad_norm": 0.38229199053031476,
"learning_rate": 9.838049283519258e-07,
"loss": 1.3265,
"step": 3185
},
{
"epoch": 0.8732548590199836,
"grad_norm": 0.38057187484097027,
"learning_rate": 9.63238148691351e-07,
"loss": 1.3087,
"step": 3190
},
{
"epoch": 0.8746235970435259,
"grad_norm": 0.3942222628186198,
"learning_rate": 9.42877741783328e-07,
"loss": 1.366,
"step": 3195
},
{
"epoch": 0.8759923350670682,
"grad_norm": 0.38514993886291565,
"learning_rate": 9.227241725989699e-07,
"loss": 1.3212,
"step": 3200
},
{
"epoch": 0.8773610730906105,
"grad_norm": 0.3836381492014198,
"learning_rate": 9.027779013858284e-07,
"loss": 1.2787,
"step": 3205
},
{
"epoch": 0.8787298111141527,
"grad_norm": 0.38734970197899504,
"learning_rate": 8.830393836573947e-07,
"loss": 1.3387,
"step": 3210
},
{
"epoch": 0.880098549137695,
"grad_norm": 0.3817865137800505,
"learning_rate": 8.635090701826799e-07,
"loss": 1.3753,
"step": 3215
},
{
"epoch": 0.8814672871612373,
"grad_norm": 0.38200788300985267,
"learning_rate": 8.441874069759337e-07,
"loss": 1.2776,
"step": 3220
},
{
"epoch": 0.8828360251847797,
"grad_norm": 0.37300363369723033,
"learning_rate": 8.250748352864546e-07,
"loss": 1.317,
"step": 3225
},
{
"epoch": 0.884204763208322,
"grad_norm": 0.39737199946658025,
"learning_rate": 8.061717915885103e-07,
"loss": 1.3048,
"step": 3230
},
{
"epoch": 0.8855735012318642,
"grad_norm": 0.3771135467998678,
"learning_rate": 7.874787075713742e-07,
"loss": 1.2507,
"step": 3235
},
{
"epoch": 0.8869422392554065,
"grad_norm": 0.37723109683472905,
"learning_rate": 7.689960101294691e-07,
"loss": 1.3081,
"step": 3240
},
{
"epoch": 0.8883109772789488,
"grad_norm": 0.3871833961992763,
"learning_rate": 7.507241213526073e-07,
"loss": 1.3122,
"step": 3245
},
{
"epoch": 0.8896797153024911,
"grad_norm": 0.3774090621861952,
"learning_rate": 7.326634585163617e-07,
"loss": 1.3243,
"step": 3250
},
{
"epoch": 0.8910484533260334,
"grad_norm": 0.39061075194420053,
"learning_rate": 7.148144340725371e-07,
"loss": 1.3123,
"step": 3255
},
{
"epoch": 0.8924171913495756,
"grad_norm": 0.3844913895203542,
"learning_rate": 6.971774556397415e-07,
"loss": 1.3238,
"step": 3260
},
{
"epoch": 0.893785929373118,
"grad_norm": 0.3954658749141559,
"learning_rate": 6.797529259940827e-07,
"loss": 1.3421,
"step": 3265
},
{
"epoch": 0.8951546673966603,
"grad_norm": 0.38848733821104914,
"learning_rate": 6.625412430599765e-07,
"loss": 1.3485,
"step": 3270
},
{
"epoch": 0.8965234054202026,
"grad_norm": 0.38275187348582623,
"learning_rate": 6.455427999010466e-07,
"loss": 1.3343,
"step": 3275
},
{
"epoch": 0.8978921434437449,
"grad_norm": 0.3946581641011151,
"learning_rate": 6.287579847111569e-07,
"loss": 1.3362,
"step": 3280
},
{
"epoch": 0.8992608814672871,
"grad_norm": 0.3931302904815353,
"learning_rate": 6.121871808055479e-07,
"loss": 1.3095,
"step": 3285
},
{
"epoch": 0.9006296194908294,
"grad_norm": 0.39874206377284055,
"learning_rate": 5.958307666120733e-07,
"loss": 1.2925,
"step": 3290
},
{
"epoch": 0.9019983575143717,
"grad_norm": 0.38856468686205425,
"learning_rate": 5.796891156625639e-07,
"loss": 1.2878,
"step": 3295
},
{
"epoch": 0.903367095537914,
"grad_norm": 0.3830311422743617,
"learning_rate": 5.637625965843041e-07,
"loss": 1.3247,
"step": 3300
},
{
"epoch": 0.9047358335614564,
"grad_norm": 0.39286283244773124,
"learning_rate": 5.480515730915992e-07,
"loss": 1.2902,
"step": 3305
},
{
"epoch": 0.9061045715849987,
"grad_norm": 0.37864973007608094,
"learning_rate": 5.325564039774777e-07,
"loss": 1.308,
"step": 3310
},
{
"epoch": 0.9074733096085409,
"grad_norm": 0.3932055048918823,
"learning_rate": 5.172774431054995e-07,
"loss": 1.3245,
"step": 3315
},
{
"epoch": 0.9088420476320832,
"grad_norm": 0.3861464572512248,
"learning_rate": 5.022150394016701e-07,
"loss": 1.3345,
"step": 3320
},
{
"epoch": 0.9102107856556255,
"grad_norm": 0.3849211658260205,
"learning_rate": 4.873695368464693e-07,
"loss": 1.3522,
"step": 3325
},
{
"epoch": 0.9115795236791678,
"grad_norm": 0.3850036124623499,
"learning_rate": 4.72741274467009e-07,
"loss": 1.3205,
"step": 3330
},
{
"epoch": 0.9129482617027102,
"grad_norm": 0.3858389082786252,
"learning_rate": 4.5833058632927417e-07,
"loss": 1.2984,
"step": 3335
},
{
"epoch": 0.9143169997262524,
"grad_norm": 0.3974200110164499,
"learning_rate": 4.441378015305031e-07,
"loss": 1.2907,
"step": 3340
},
{
"epoch": 0.9156857377497947,
"grad_norm": 0.3885026941845055,
"learning_rate": 4.3016324419167365e-07,
"loss": 1.3571,
"step": 3345
},
{
"epoch": 0.917054475773337,
"grad_norm": 0.391701334822345,
"learning_rate": 4.164072334500935e-07,
"loss": 1.2946,
"step": 3350
},
{
"epoch": 0.9184232137968793,
"grad_norm": 0.371298260058105,
"learning_rate": 4.028700834521193e-07,
"loss": 1.2734,
"step": 3355
},
{
"epoch": 0.9197919518204216,
"grad_norm": 0.39257123679148415,
"learning_rate": 3.8955210334597595e-07,
"loss": 1.3792,
"step": 3360
},
{
"epoch": 0.9211606898439638,
"grad_norm": 0.37739382436504915,
"learning_rate": 3.764535972747052e-07,
"loss": 1.3182,
"step": 3365
},
{
"epoch": 0.9225294278675061,
"grad_norm": 0.4040354445639437,
"learning_rate": 3.6357486436921164e-07,
"loss": 1.3149,
"step": 3370
},
{
"epoch": 0.9238981658910485,
"grad_norm": 0.3885892463097669,
"learning_rate": 3.5091619874143446e-07,
"loss": 1.3612,
"step": 3375
},
{
"epoch": 0.9252669039145908,
"grad_norm": 0.39478973344601664,
"learning_rate": 3.3847788947763194e-07,
"loss": 1.3338,
"step": 3380
},
{
"epoch": 0.9266356419381331,
"grad_norm": 0.4002415191432085,
"learning_rate": 3.2626022063177997e-07,
"loss": 1.3854,
"step": 3385
},
{
"epoch": 0.9280043799616753,
"grad_norm": 0.3871640152525417,
"learning_rate": 3.142634712190795e-07,
"loss": 1.2663,
"step": 3390
},
{
"epoch": 0.9293731179852176,
"grad_norm": 0.3844217189319342,
"learning_rate": 3.0248791520959387e-07,
"loss": 1.3304,
"step": 3395
},
{
"epoch": 0.9307418560087599,
"grad_norm": 0.40010784379733844,
"learning_rate": 2.909338215219859e-07,
"loss": 1.3458,
"step": 3400
},
{
"epoch": 0.9321105940323022,
"grad_norm": 0.38274676057072776,
"learning_rate": 2.7960145401737415e-07,
"loss": 1.2606,
"step": 3405
},
{
"epoch": 0.9334793320558445,
"grad_norm": 0.39275920250834023,
"learning_rate": 2.6849107149331756e-07,
"loss": 1.2825,
"step": 3410
},
{
"epoch": 0.9348480700793868,
"grad_norm": 0.3947633255379109,
"learning_rate": 2.576029276778924e-07,
"loss": 1.3441,
"step": 3415
},
{
"epoch": 0.9362168081029291,
"grad_norm": 0.41407416506322803,
"learning_rate": 2.4693727122390597e-07,
"loss": 1.371,
"step": 3420
},
{
"epoch": 0.9375855461264714,
"grad_norm": 0.37625661449174036,
"learning_rate": 2.3649434570321984e-07,
"loss": 1.2862,
"step": 3425
},
{
"epoch": 0.9389542841500137,
"grad_norm": 0.37104631834710733,
"learning_rate": 2.2627438960117876e-07,
"loss": 1.2833,
"step": 3430
},
{
"epoch": 0.940323022173556,
"grad_norm": 0.3868409112001441,
"learning_rate": 2.1627763631117182e-07,
"loss": 1.3551,
"step": 3435
},
{
"epoch": 0.9416917601970983,
"grad_norm": 0.4061569745095073,
"learning_rate": 2.0650431412930104e-07,
"loss": 1.3273,
"step": 3440
},
{
"epoch": 0.9430604982206405,
"grad_norm": 0.3923199404039606,
"learning_rate": 1.969546462491634e-07,
"loss": 1.3093,
"step": 3445
},
{
"epoch": 0.9444292362441828,
"grad_norm": 0.40035298460473323,
"learning_rate": 1.876288507567592e-07,
"loss": 1.2859,
"step": 3450
},
{
"epoch": 0.9457979742677252,
"grad_norm": 0.39270377743419116,
"learning_rate": 1.785271406255107e-07,
"loss": 1.3086,
"step": 3455
},
{
"epoch": 0.9471667122912675,
"grad_norm": 0.391272234134139,
"learning_rate": 1.6964972371139588e-07,
"loss": 1.3324,
"step": 3460
},
{
"epoch": 0.9485354503148098,
"grad_norm": 0.3971408419550116,
"learning_rate": 1.609968027482012e-07,
"loss": 1.3241,
"step": 3465
},
{
"epoch": 0.949904188338352,
"grad_norm": 0.397310674294107,
"learning_rate": 1.5256857534289626e-07,
"loss": 1.344,
"step": 3470
},
{
"epoch": 0.9512729263618943,
"grad_norm": 0.37737586107823284,
"learning_rate": 1.443652339711199e-07,
"loss": 1.3227,
"step": 3475
},
{
"epoch": 0.9526416643854366,
"grad_norm": 0.3953334287452581,
"learning_rate": 1.3638696597277678e-07,
"loss": 1.3323,
"step": 3480
},
{
"epoch": 0.9540104024089789,
"grad_norm": 0.39727863290285664,
"learning_rate": 1.2863395354777097e-07,
"loss": 1.2965,
"step": 3485
},
{
"epoch": 0.9553791404325213,
"grad_norm": 0.3915026545665889,
"learning_rate": 1.211063737518392e-07,
"loss": 1.2945,
"step": 3490
},
{
"epoch": 0.9567478784560635,
"grad_norm": 0.3963237143375255,
"learning_rate": 1.1380439849250414e-07,
"loss": 1.3079,
"step": 3495
},
{
"epoch": 0.9581166164796058,
"grad_norm": 0.38628531323230814,
"learning_rate": 1.0672819452515526e-07,
"loss": 1.347,
"step": 3500
},
{
"epoch": 0.9594853545031481,
"grad_norm": 0.39250735459020125,
"learning_rate": 9.987792344923753e-08,
"loss": 1.3292,
"step": 3505
},
{
"epoch": 0.9608540925266904,
"grad_norm": 0.39214485753203543,
"learning_rate": 9.32537417045576e-08,
"loss": 1.2703,
"step": 3510
},
{
"epoch": 0.9622228305502327,
"grad_norm": 0.38156832143297204,
"learning_rate": 8.685580056771781e-08,
"loss": 1.3404,
"step": 3515
},
{
"epoch": 0.9635915685737749,
"grad_norm": 0.3907257773875925,
"learning_rate": 8.0684246148659e-08,
"loss": 1.2681,
"step": 3520
},
{
"epoch": 0.9649603065973172,
"grad_norm": 0.4194193748219591,
"learning_rate": 7.473921938731865e-08,
"loss": 1.382,
"step": 3525
},
{
"epoch": 0.9663290446208596,
"grad_norm": 0.3858101635490962,
"learning_rate": 6.902085605042019e-08,
"loss": 1.3671,
"step": 3530
},
{
"epoch": 0.9676977826444019,
"grad_norm": 0.3813102160301827,
"learning_rate": 6.352928672836767e-08,
"loss": 1.3013,
"step": 3535
},
{
"epoch": 0.9690665206679442,
"grad_norm": 0.38117598807083153,
"learning_rate": 5.82646368322648e-08,
"loss": 1.3406,
"step": 3540
},
{
"epoch": 0.9704352586914865,
"grad_norm": 0.3869856138402919,
"learning_rate": 5.3227026591049505e-08,
"loss": 1.3311,
"step": 3545
},
{
"epoch": 0.9718039967150287,
"grad_norm": 0.3866858707608126,
"learning_rate": 4.841657104875275e-08,
"loss": 1.3593,
"step": 3550
},
{
"epoch": 0.973172734738571,
"grad_norm": 0.3866568363141188,
"learning_rate": 4.3833380061865104e-08,
"loss": 1.3318,
"step": 3555
},
{
"epoch": 0.9745414727621133,
"grad_norm": 0.39553603732869785,
"learning_rate": 3.947755829683097e-08,
"loss": 1.3403,
"step": 3560
},
{
"epoch": 0.9759102107856557,
"grad_norm": 0.3990993940103786,
"learning_rate": 3.5349205227660496e-08,
"loss": 1.3812,
"step": 3565
},
{
"epoch": 0.977278948809198,
"grad_norm": 0.3925614431869845,
"learning_rate": 3.144841513365249e-08,
"loss": 1.3025,
"step": 3570
},
{
"epoch": 0.9786476868327402,
"grad_norm": 0.3883373657835178,
"learning_rate": 2.7775277097247255e-08,
"loss": 1.313,
"step": 3575
},
{
"epoch": 0.9800164248562825,
"grad_norm": 0.38572501548529475,
"learning_rate": 2.4329875001989356e-08,
"loss": 1.3058,
"step": 3580
},
{
"epoch": 0.9813851628798248,
"grad_norm": 0.3838975342711645,
"learning_rate": 2.1112287530609122e-08,
"loss": 1.3165,
"step": 3585
},
{
"epoch": 0.9827539009033671,
"grad_norm": 0.3918735272830008,
"learning_rate": 1.812258816323187e-08,
"loss": 1.3388,
"step": 3590
},
{
"epoch": 0.9841226389269094,
"grad_norm": 0.38644635827545804,
"learning_rate": 1.5360845175695916e-08,
"loss": 1.3378,
"step": 3595
},
{
"epoch": 0.9854913769504516,
"grad_norm": 0.4046028492421605,
"learning_rate": 1.2827121637992712e-08,
"loss": 1.3104,
"step": 3600
},
{
"epoch": 0.986860114973994,
"grad_norm": 0.37588594630122674,
"learning_rate": 1.0521475412830218e-08,
"loss": 1.3345,
"step": 3605
},
{
"epoch": 0.9882288529975363,
"grad_norm": 0.39556552792879707,
"learning_rate": 8.44395915430729e-09,
"loss": 1.3184,
"step": 3610
},
{
"epoch": 0.9895975910210786,
"grad_norm": 0.3745912876489376,
"learning_rate": 6.5946203067135395e-09,
"loss": 1.3036,
"step": 3615
},
{
"epoch": 0.9909663290446209,
"grad_norm": 0.39112472329549625,
"learning_rate": 4.9735011034457434e-09,
"loss": 1.2682,
"step": 3620
},
{
"epoch": 0.9923350670681631,
"grad_norm": 0.380307545731869,
"learning_rate": 3.580638566043071e-09,
"loss": 1.2837,
"step": 3625
},
{
"epoch": 0.9937038050917054,
"grad_norm": 0.3955743424810463,
"learning_rate": 2.416064503342197e-09,
"loss": 1.3092,
"step": 3630
},
{
"epoch": 0.9950725431152477,
"grad_norm": 0.3853245568354318,
"learning_rate": 1.4798055107489996e-09,
"loss": 1.3025,
"step": 3635
},
{
"epoch": 0.99644128113879,
"grad_norm": 0.39071516823719027,
"learning_rate": 7.718829696334862e-10,
"loss": 1.3332,
"step": 3640
},
{
"epoch": 0.9978100191623324,
"grad_norm": 0.38346889130554324,
"learning_rate": 2.9231304683907667e-10,
"loss": 1.3369,
"step": 3645
},
{
"epoch": 0.9991787571858747,
"grad_norm": 0.4139215786504167,
"learning_rate": 4.1106694317338826e-11,
"loss": 1.3518,
"step": 3650
},
{
"epoch": 1.0,
"eval_loss": 1.320330262184143,
"eval_runtime": 951.5767,
"eval_samples_per_second": 92.065,
"eval_steps_per_second": 5.755,
"step": 3653
},
{
"epoch": 1.0,
"step": 3653,
"total_flos": 66190143651840.0,
"train_loss": 1.3573657579367173,
"train_runtime": 8062.8425,
"train_samples_per_second": 7.249,
"train_steps_per_second": 0.453
}
],
"logging_steps": 5,
"max_steps": 3653,
"num_input_tokens_seen": 0,
"num_train_epochs": 1,
"save_steps": 100,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": false
},
"attributes": {}
}
},
"total_flos": 66190143651840.0,
"train_batch_size": 4,
"trial_name": null,
"trial_params": null
}