PrefixTuning_FlanT5_v1 / trainer_state.json
{
"best_metric": 3.9158332347869873,
"best_model_checkpoint": "/content/drive/MyDrive/checkpoints/checkpoint-3342",
"epoch": 3.0,
"eval_steps": 500,
"global_step": 3342,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.008976660682226212,
"grad_norm": 0.31800901889801025,
"learning_rate": 0.0004997755834829443,
"loss": 45.5673,
"step": 10
},
{
"epoch": 0.017953321364452424,
"grad_norm": 0.35396715998649597,
"learning_rate": 0.0004995511669658887,
"loss": 45.3249,
"step": 20
},
{
"epoch": 0.026929982046678635,
"grad_norm": 0.37757596373558044,
"learning_rate": 0.000499326750448833,
"loss": 44.7853,
"step": 30
},
{
"epoch": 0.03590664272890485,
"grad_norm": 0.38259994983673096,
"learning_rate": 0.0004991023339317774,
"loss": 44.8996,
"step": 40
},
{
"epoch": 0.04488330341113106,
"grad_norm": 0.38321229815483093,
"learning_rate": 0.0004988779174147217,
"loss": 44.1242,
"step": 50
},
{
"epoch": 0.05385996409335727,
"grad_norm": 0.42167848348617554,
"learning_rate": 0.0004986535008976661,
"loss": 44.4739,
"step": 60
},
{
"epoch": 0.06283662477558348,
"grad_norm": 0.40018683671951294,
"learning_rate": 0.0004984290843806105,
"loss": 44.4548,
"step": 70
},
{
"epoch": 0.0718132854578097,
"grad_norm": 0.39394208788871765,
"learning_rate": 0.0004982046678635547,
"loss": 43.4622,
"step": 80
},
{
"epoch": 0.0807899461400359,
"grad_norm": 0.3660307228565216,
"learning_rate": 0.0004979802513464991,
"loss": 43.0475,
"step": 90
},
{
"epoch": 0.08976660682226212,
"grad_norm": 0.36663416028022766,
"learning_rate": 0.0004977558348294434,
"loss": 42.8994,
"step": 100
},
{
"epoch": 0.09874326750448834,
"grad_norm": 0.40418022871017456,
"learning_rate": 0.0004975314183123878,
"loss": 43.0499,
"step": 110
},
{
"epoch": 0.10771992818671454,
"grad_norm": 0.32946863770484924,
"learning_rate": 0.0004973070017953322,
"loss": 43.0516,
"step": 120
},
{
"epoch": 0.11669658886894076,
"grad_norm": 0.36427420377731323,
"learning_rate": 0.0004970825852782765,
"loss": 43.0005,
"step": 130
},
{
"epoch": 0.12567324955116696,
"grad_norm": 0.3754049837589264,
"learning_rate": 0.0004968581687612209,
"loss": 42.3461,
"step": 140
},
{
"epoch": 0.13464991023339318,
"grad_norm": 0.3867158889770508,
"learning_rate": 0.0004966337522441652,
"loss": 42.5113,
"step": 150
},
{
"epoch": 0.1436265709156194,
"grad_norm": 0.35019099712371826,
"learning_rate": 0.0004964093357271095,
"loss": 41.8364,
"step": 160
},
{
"epoch": 0.1526032315978456,
"grad_norm": 0.37168896198272705,
"learning_rate": 0.0004961849192100539,
"loss": 42.4202,
"step": 170
},
{
"epoch": 0.1615798922800718,
"grad_norm": 0.36585116386413574,
"learning_rate": 0.0004959605026929982,
"loss": 41.5454,
"step": 180
},
{
"epoch": 0.17055655296229802,
"grad_norm": 0.4089430272579193,
"learning_rate": 0.0004957360861759426,
"loss": 41.0863,
"step": 190
},
{
"epoch": 0.17953321364452424,
"grad_norm": 0.3681723475456238,
"learning_rate": 0.0004955116696588868,
"loss": 41.0869,
"step": 200
},
{
"epoch": 0.18850987432675045,
"grad_norm": 0.3823374807834625,
"learning_rate": 0.0004952872531418312,
"loss": 41.0254,
"step": 210
},
{
"epoch": 0.19748653500897667,
"grad_norm": 0.3909670412540436,
"learning_rate": 0.0004950628366247755,
"loss": 41.0026,
"step": 220
},
{
"epoch": 0.20646319569120286,
"grad_norm": 0.39831164479255676,
"learning_rate": 0.0004948384201077199,
"loss": 40.3224,
"step": 230
},
{
"epoch": 0.21543985637342908,
"grad_norm": 0.3801274597644806,
"learning_rate": 0.0004946140035906643,
"loss": 40.4711,
"step": 240
},
{
"epoch": 0.2244165170556553,
"grad_norm": 0.39255771040916443,
"learning_rate": 0.0004943895870736086,
"loss": 39.9713,
"step": 250
},
{
"epoch": 0.2333931777378815,
"grad_norm": 0.400642067193985,
"learning_rate": 0.000494165170556553,
"loss": 39.3574,
"step": 260
},
{
"epoch": 0.24236983842010773,
"grad_norm": 0.44542375206947327,
"learning_rate": 0.0004939407540394973,
"loss": 39.4756,
"step": 270
},
{
"epoch": 0.2513464991023339,
"grad_norm": 0.41471394896507263,
"learning_rate": 0.0004937163375224417,
"loss": 39.4551,
"step": 280
},
{
"epoch": 0.26032315978456017,
"grad_norm": 0.3956909775733948,
"learning_rate": 0.000493491921005386,
"loss": 39.0815,
"step": 290
},
{
"epoch": 0.26929982046678635,
"grad_norm": 0.5405673384666443,
"learning_rate": 0.0004932675044883304,
"loss": 38.7405,
"step": 300
},
{
"epoch": 0.27827648114901254,
"grad_norm": 0.4720427691936493,
"learning_rate": 0.0004930430879712747,
"loss": 38.1905,
"step": 310
},
{
"epoch": 0.2872531418312388,
"grad_norm": 0.4677943289279938,
"learning_rate": 0.0004928186714542191,
"loss": 38.023,
"step": 320
},
{
"epoch": 0.296229802513465,
"grad_norm": 0.4742816090583801,
"learning_rate": 0.0004925942549371633,
"loss": 37.6844,
"step": 330
},
{
"epoch": 0.3052064631956912,
"grad_norm": 0.463733047246933,
"learning_rate": 0.0004923698384201077,
"loss": 37.4262,
"step": 340
},
{
"epoch": 0.3141831238779174,
"grad_norm": 0.48447635769844055,
"learning_rate": 0.000492145421903052,
"loss": 37.1974,
"step": 350
},
{
"epoch": 0.3231597845601436,
"grad_norm": 0.5126340389251709,
"learning_rate": 0.0004919210053859964,
"loss": 36.4875,
"step": 360
},
{
"epoch": 0.33213644524236985,
"grad_norm": 0.5128099322319031,
"learning_rate": 0.0004916965888689407,
"loss": 36.8894,
"step": 370
},
{
"epoch": 0.34111310592459604,
"grad_norm": 0.5677986741065979,
"learning_rate": 0.0004914721723518851,
"loss": 36.0053,
"step": 380
},
{
"epoch": 0.3500897666068223,
"grad_norm": 0.6088815927505493,
"learning_rate": 0.0004912477558348294,
"loss": 36.2308,
"step": 390
},
{
"epoch": 0.3590664272890485,
"grad_norm": 0.5765969157218933,
"learning_rate": 0.0004910233393177738,
"loss": 35.4461,
"step": 400
},
{
"epoch": 0.36804308797127466,
"grad_norm": 0.5858592391014099,
"learning_rate": 0.0004907989228007182,
"loss": 35.0571,
"step": 410
},
{
"epoch": 0.3770197486535009,
"grad_norm": 0.6825990080833435,
"learning_rate": 0.0004905745062836625,
"loss": 34.717,
"step": 420
},
{
"epoch": 0.3859964093357271,
"grad_norm": 0.7166014313697815,
"learning_rate": 0.0004903500897666069,
"loss": 33.8366,
"step": 430
},
{
"epoch": 0.39497307001795334,
"grad_norm": 0.6887209415435791,
"learning_rate": 0.0004901256732495512,
"loss": 33.7563,
"step": 440
},
{
"epoch": 0.40394973070017953,
"grad_norm": 0.7413772344589233,
"learning_rate": 0.0004899012567324956,
"loss": 33.1205,
"step": 450
},
{
"epoch": 0.4129263913824057,
"grad_norm": 0.7537035942077637,
"learning_rate": 0.0004896768402154398,
"loss": 32.9826,
"step": 460
},
{
"epoch": 0.42190305206463197,
"grad_norm": 0.730989396572113,
"learning_rate": 0.0004894524236983842,
"loss": 31.9265,
"step": 470
},
{
"epoch": 0.43087971274685816,
"grad_norm": 0.9165148735046387,
"learning_rate": 0.0004892280071813285,
"loss": 31.3014,
"step": 480
},
{
"epoch": 0.4398563734290844,
"grad_norm": 0.8587144613265991,
"learning_rate": 0.0004890035906642729,
"loss": 30.889,
"step": 490
},
{
"epoch": 0.4488330341113106,
"grad_norm": 0.9183847904205322,
"learning_rate": 0.0004887791741472172,
"loss": 30.1653,
"step": 500
},
{
"epoch": 0.4578096947935368,
"grad_norm": 0.9044579863548279,
"learning_rate": 0.0004885547576301616,
"loss": 29.9274,
"step": 510
},
{
"epoch": 0.466786355475763,
"grad_norm": 0.8621285557746887,
"learning_rate": 0.0004883303411131059,
"loss": 29.5479,
"step": 520
},
{
"epoch": 0.4757630161579892,
"grad_norm": 1.1030315160751343,
"learning_rate": 0.0004881059245960503,
"loss": 29.1674,
"step": 530
},
{
"epoch": 0.48473967684021546,
"grad_norm": 1.071616768836975,
"learning_rate": 0.00048788150807899463,
"loss": 28.5656,
"step": 540
},
{
"epoch": 0.49371633752244165,
"grad_norm": 0.9452396035194397,
"learning_rate": 0.000487657091561939,
"loss": 28.1162,
"step": 550
},
{
"epoch": 0.5026929982046678,
"grad_norm": 0.9999839067459106,
"learning_rate": 0.0004874326750448833,
"loss": 27.1627,
"step": 560
},
{
"epoch": 0.5116696588868941,
"grad_norm": 1.5522288084030151,
"learning_rate": 0.00048720825852782766,
"loss": 26.8812,
"step": 570
},
{
"epoch": 0.5206463195691203,
"grad_norm": 1.1541786193847656,
"learning_rate": 0.000486983842010772,
"loss": 26.2589,
"step": 580
},
{
"epoch": 0.5296229802513465,
"grad_norm": 0.9977880120277405,
"learning_rate": 0.00048675942549371634,
"loss": 26.168,
"step": 590
},
{
"epoch": 0.5385996409335727,
"grad_norm": 0.9028811454772949,
"learning_rate": 0.0004865350089766607,
"loss": 24.9378,
"step": 600
},
{
"epoch": 0.547576301615799,
"grad_norm": 1.0026092529296875,
"learning_rate": 0.00048631059245960503,
"loss": 25.0135,
"step": 610
},
{
"epoch": 0.5565529622980251,
"grad_norm": 1.0198203325271606,
"learning_rate": 0.00048608617594254937,
"loss": 24.6053,
"step": 620
},
{
"epoch": 0.5655296229802513,
"grad_norm": 1.1564388275146484,
"learning_rate": 0.0004858617594254937,
"loss": 24.0813,
"step": 630
},
{
"epoch": 0.5745062836624776,
"grad_norm": 0.8892808556556702,
"learning_rate": 0.0004856373429084381,
"loss": 23.7441,
"step": 640
},
{
"epoch": 0.5834829443447038,
"grad_norm": 1.1114846467971802,
"learning_rate": 0.0004854129263913824,
"loss": 22.8734,
"step": 650
},
{
"epoch": 0.59245960502693,
"grad_norm": 1.0558847188949585,
"learning_rate": 0.0004851885098743268,
"loss": 22.3045,
"step": 660
},
{
"epoch": 0.6014362657091562,
"grad_norm": 0.8897343277931213,
"learning_rate": 0.0004849640933572711,
"loss": 21.977,
"step": 670
},
{
"epoch": 0.6104129263913824,
"grad_norm": 0.9796168208122253,
"learning_rate": 0.0004847396768402155,
"loss": 21.5113,
"step": 680
},
{
"epoch": 0.6193895870736086,
"grad_norm": 0.8519884943962097,
"learning_rate": 0.0004845152603231598,
"loss": 20.9744,
"step": 690
},
{
"epoch": 0.6283662477558348,
"grad_norm": 1.1632051467895508,
"learning_rate": 0.00048429084380610416,
"loss": 20.271,
"step": 700
},
{
"epoch": 0.6373429084380611,
"grad_norm": 0.9868700504302979,
"learning_rate": 0.0004840664272890485,
"loss": 19.9961,
"step": 710
},
{
"epoch": 0.6463195691202872,
"grad_norm": 0.9679480791091919,
"learning_rate": 0.0004838420107719928,
"loss": 19.4405,
"step": 720
},
{
"epoch": 0.6552962298025135,
"grad_norm": 1.0145677328109741,
"learning_rate": 0.0004836175942549372,
"loss": 19.2046,
"step": 730
},
{
"epoch": 0.6642728904847397,
"grad_norm": 1.0279533863067627,
"learning_rate": 0.00048339317773788147,
"loss": 18.2792,
"step": 740
},
{
"epoch": 0.6732495511669659,
"grad_norm": 1.2876602411270142,
"learning_rate": 0.00048316876122082587,
"loss": 17.8022,
"step": 750
},
{
"epoch": 0.6822262118491921,
"grad_norm": 1.0419774055480957,
"learning_rate": 0.0004829443447037702,
"loss": 17.4577,
"step": 760
},
{
"epoch": 0.6912028725314183,
"grad_norm": 1.0887730121612549,
"learning_rate": 0.00048271992818671455,
"loss": 16.5106,
"step": 770
},
{
"epoch": 0.7001795332136446,
"grad_norm": 1.1203436851501465,
"learning_rate": 0.0004824955116696589,
"loss": 16.4582,
"step": 780
},
{
"epoch": 0.7091561938958707,
"grad_norm": 1.0770111083984375,
"learning_rate": 0.00048227109515260324,
"loss": 16.003,
"step": 790
},
{
"epoch": 0.718132854578097,
"grad_norm": 1.2158771753311157,
"learning_rate": 0.0004820466786355476,
"loss": 15.2694,
"step": 800
},
{
"epoch": 0.7271095152603232,
"grad_norm": 1.1706403493881226,
"learning_rate": 0.000481822262118492,
"loss": 14.9252,
"step": 810
},
{
"epoch": 0.7360861759425493,
"grad_norm": 1.189310908317566,
"learning_rate": 0.00048159784560143626,
"loss": 14.4921,
"step": 820
},
{
"epoch": 0.7450628366247756,
"grad_norm": 1.6199108362197876,
"learning_rate": 0.00048137342908438066,
"loss": 13.9443,
"step": 830
},
{
"epoch": 0.7540394973070018,
"grad_norm": 1.1757200956344604,
"learning_rate": 0.00048114901256732494,
"loss": 13.8288,
"step": 840
},
{
"epoch": 0.7630161579892281,
"grad_norm": 1.2064054012298584,
"learning_rate": 0.00048092459605026934,
"loss": 12.9563,
"step": 850
},
{
"epoch": 0.7719928186714542,
"grad_norm": 1.1954108476638794,
"learning_rate": 0.00048070017953321363,
"loss": 12.1382,
"step": 860
},
{
"epoch": 0.7809694793536804,
"grad_norm": 1.5387598276138306,
"learning_rate": 0.00048047576301615797,
"loss": 12.1248,
"step": 870
},
{
"epoch": 0.7899461400359067,
"grad_norm": 1.2923359870910645,
"learning_rate": 0.00048025134649910237,
"loss": 11.7902,
"step": 880
},
{
"epoch": 0.7989228007181328,
"grad_norm": 0.9865145683288574,
"learning_rate": 0.00048002692998204665,
"loss": 10.7329,
"step": 890
},
{
"epoch": 0.8078994614003591,
"grad_norm": 1.140541672706604,
"learning_rate": 0.00047980251346499105,
"loss": 10.5986,
"step": 900
},
{
"epoch": 0.8168761220825853,
"grad_norm": 1.1022454500198364,
"learning_rate": 0.00047957809694793534,
"loss": 10.2782,
"step": 910
},
{
"epoch": 0.8258527827648114,
"grad_norm": 0.8876429200172424,
"learning_rate": 0.00047935368043087973,
"loss": 9.3573,
"step": 920
},
{
"epoch": 0.8348294434470377,
"grad_norm": 0.9144046306610107,
"learning_rate": 0.0004791292639138241,
"loss": 9.4616,
"step": 930
},
{
"epoch": 0.8438061041292639,
"grad_norm": 1.022176742553711,
"learning_rate": 0.0004789048473967684,
"loss": 9.0571,
"step": 940
},
{
"epoch": 0.8527827648114902,
"grad_norm": 0.9050130248069763,
"learning_rate": 0.00047868043087971276,
"loss": 8.4811,
"step": 950
},
{
"epoch": 0.8617594254937163,
"grad_norm": 0.8372008800506592,
"learning_rate": 0.0004784560143626571,
"loss": 8.3873,
"step": 960
},
{
"epoch": 0.8707360861759426,
"grad_norm": 0.8663610816001892,
"learning_rate": 0.00047823159784560144,
"loss": 8.0233,
"step": 970
},
{
"epoch": 0.8797127468581688,
"grad_norm": 0.6936354637145996,
"learning_rate": 0.00047800718132854584,
"loss": 7.8054,
"step": 980
},
{
"epoch": 0.8886894075403949,
"grad_norm": 0.5529871582984924,
"learning_rate": 0.00047778276481149013,
"loss": 7.6013,
"step": 990
},
{
"epoch": 0.8976660682226212,
"grad_norm": 0.6260952353477478,
"learning_rate": 0.00047755834829443447,
"loss": 7.4237,
"step": 1000
},
{
"epoch": 0.9066427289048474,
"grad_norm": 0.851337730884552,
"learning_rate": 0.0004773339317773788,
"loss": 7.2549,
"step": 1010
},
{
"epoch": 0.9156193895870736,
"grad_norm": 0.6702756285667419,
"learning_rate": 0.00047710951526032315,
"loss": 7.0967,
"step": 1020
},
{
"epoch": 0.9245960502692998,
"grad_norm": 0.6650304794311523,
"learning_rate": 0.0004768850987432675,
"loss": 6.9988,
"step": 1030
},
{
"epoch": 0.933572710951526,
"grad_norm": 0.551717221736908,
"learning_rate": 0.00047666068222621184,
"loss": 6.5465,
"step": 1040
},
{
"epoch": 0.9425493716337523,
"grad_norm": 0.4560067653656006,
"learning_rate": 0.00047643626570915623,
"loss": 6.5641,
"step": 1050
},
{
"epoch": 0.9515260323159784,
"grad_norm": 0.4556948244571686,
"learning_rate": 0.0004762118491921005,
"loss": 6.6911,
"step": 1060
},
{
"epoch": 0.9605026929982047,
"grad_norm": 0.8652740716934204,
"learning_rate": 0.0004759874326750449,
"loss": 6.7453,
"step": 1070
},
{
"epoch": 0.9694793536804309,
"grad_norm": 0.32210618257522583,
"learning_rate": 0.0004757630161579892,
"loss": 6.5263,
"step": 1080
},
{
"epoch": 0.9784560143626571,
"grad_norm": 1.9738398790359497,
"learning_rate": 0.0004755385996409336,
"loss": 6.4019,
"step": 1090
},
{
"epoch": 0.9874326750448833,
"grad_norm": 0.31478866934776306,
"learning_rate": 0.00047531418312387794,
"loss": 6.266,
"step": 1100
},
{
"epoch": 0.9964093357271095,
"grad_norm": 0.39359068870544434,
"learning_rate": 0.0004750897666068223,
"loss": 6.2422,
"step": 1110
},
{
"epoch": 1.0,
"eval_loss": 5.025014400482178,
"eval_runtime": 436.9889,
"eval_samples_per_second": 10.197,
"eval_steps_per_second": 1.275,
"step": 1114
},
{
"epoch": 1.0053859964093357,
"grad_norm": 0.3087250888347626,
"learning_rate": 0.0004748653500897666,
"loss": 6.1059,
"step": 1120
},
{
"epoch": 1.014362657091562,
"grad_norm": 0.4997764825820923,
"learning_rate": 0.00047464093357271097,
"loss": 6.1567,
"step": 1130
},
{
"epoch": 1.0233393177737882,
"grad_norm": 0.4492017328739166,
"learning_rate": 0.0004744165170556553,
"loss": 6.0689,
"step": 1140
},
{
"epoch": 1.0323159784560143,
"grad_norm": 0.35565611720085144,
"learning_rate": 0.00047419210053859965,
"loss": 5.9551,
"step": 1150
},
{
"epoch": 1.0412926391382407,
"grad_norm": 0.28686025738716125,
"learning_rate": 0.000473967684021544,
"loss": 5.9306,
"step": 1160
},
{
"epoch": 1.0502692998204668,
"grad_norm": 0.28098103404045105,
"learning_rate": 0.00047374326750448834,
"loss": 5.8205,
"step": 1170
},
{
"epoch": 1.059245960502693,
"grad_norm": 0.3124157190322876,
"learning_rate": 0.0004735188509874327,
"loss": 5.7734,
"step": 1180
},
{
"epoch": 1.0682226211849193,
"grad_norm": 0.27604150772094727,
"learning_rate": 0.000473294434470377,
"loss": 5.8549,
"step": 1190
},
{
"epoch": 1.0771992818671454,
"grad_norm": 0.48105934262275696,
"learning_rate": 0.00047307001795332136,
"loss": 5.8208,
"step": 1200
},
{
"epoch": 1.0861759425493716,
"grad_norm": 0.33073532581329346,
"learning_rate": 0.0004728456014362657,
"loss": 5.7798,
"step": 1210
},
{
"epoch": 1.095152603231598,
"grad_norm": 0.24770517647266388,
"learning_rate": 0.0004726211849192101,
"loss": 5.6513,
"step": 1220
},
{
"epoch": 1.104129263913824,
"grad_norm": 0.23116350173950195,
"learning_rate": 0.0004723967684021544,
"loss": 5.6458,
"step": 1230
},
{
"epoch": 1.1131059245960502,
"grad_norm": 0.2757456302642822,
"learning_rate": 0.0004721723518850988,
"loss": 5.7592,
"step": 1240
},
{
"epoch": 1.1220825852782765,
"grad_norm": 0.23286688327789307,
"learning_rate": 0.00047194793536804307,
"loss": 5.6889,
"step": 1250
},
{
"epoch": 1.1310592459605027,
"grad_norm": 0.1967301219701767,
"learning_rate": 0.00047172351885098747,
"loss": 5.5865,
"step": 1260
},
{
"epoch": 1.140035906642729,
"grad_norm": 0.22576653957366943,
"learning_rate": 0.0004714991023339318,
"loss": 5.4764,
"step": 1270
},
{
"epoch": 1.1490125673249552,
"grad_norm": 0.217813640832901,
"learning_rate": 0.00047127468581687615,
"loss": 5.6309,
"step": 1280
},
{
"epoch": 1.1579892280071813,
"grad_norm": 0.1798250824213028,
"learning_rate": 0.0004710502692998205,
"loss": 5.4452,
"step": 1290
},
{
"epoch": 1.1669658886894076,
"grad_norm": 0.22210471332073212,
"learning_rate": 0.0004708258527827648,
"loss": 5.5905,
"step": 1300
},
{
"epoch": 1.1759425493716338,
"grad_norm": 0.24236564338207245,
"learning_rate": 0.0004706014362657092,
"loss": 5.5106,
"step": 1310
},
{
"epoch": 1.18491921005386,
"grad_norm": 0.205738365650177,
"learning_rate": 0.00047037701974865346,
"loss": 5.4863,
"step": 1320
},
{
"epoch": 1.1938958707360863,
"grad_norm": 0.2275596708059311,
"learning_rate": 0.00047015260323159786,
"loss": 5.4782,
"step": 1330
},
{
"epoch": 1.2028725314183124,
"grad_norm": 0.40637847781181335,
"learning_rate": 0.0004699281867145422,
"loss": 5.4103,
"step": 1340
},
{
"epoch": 1.2118491921005385,
"grad_norm": 0.17678338289260864,
"learning_rate": 0.00046970377019748654,
"loss": 5.3858,
"step": 1350
},
{
"epoch": 1.220825852782765,
"grad_norm": 0.1862853765487671,
"learning_rate": 0.0004694793536804309,
"loss": 5.379,
"step": 1360
},
{
"epoch": 1.229802513464991,
"grad_norm": 0.12334032356739044,
"learning_rate": 0.0004692549371633752,
"loss": 5.396,
"step": 1370
},
{
"epoch": 1.2387791741472172,
"grad_norm": 0.15632939338684082,
"learning_rate": 0.00046903052064631957,
"loss": 5.3853,
"step": 1380
},
{
"epoch": 1.2477558348294435,
"grad_norm": 0.18021011352539062,
"learning_rate": 0.00046880610412926396,
"loss": 5.2905,
"step": 1390
},
{
"epoch": 1.2567324955116697,
"grad_norm": 0.15651032328605652,
"learning_rate": 0.00046858168761220825,
"loss": 5.4102,
"step": 1400
},
{
"epoch": 1.2657091561938958,
"grad_norm": 0.15990717709064484,
"learning_rate": 0.00046835727109515265,
"loss": 5.3213,
"step": 1410
},
{
"epoch": 1.2746858168761221,
"grad_norm": 0.23683366179466248,
"learning_rate": 0.00046813285457809694,
"loss": 5.3596,
"step": 1420
},
{
"epoch": 1.2836624775583483,
"grad_norm": 0.17186540365219116,
"learning_rate": 0.0004679084380610413,
"loss": 5.2734,
"step": 1430
},
{
"epoch": 1.2926391382405744,
"grad_norm": 0.12084522843360901,
"learning_rate": 0.0004676840215439857,
"loss": 5.2741,
"step": 1440
},
{
"epoch": 1.3016157989228008,
"grad_norm": 0.13929304480552673,
"learning_rate": 0.00046745960502692996,
"loss": 5.2734,
"step": 1450
},
{
"epoch": 1.310592459605027,
"grad_norm": 0.22931580245494843,
"learning_rate": 0.00046723518850987436,
"loss": 5.2281,
"step": 1460
},
{
"epoch": 1.319569120287253,
"grad_norm": 0.13986773788928986,
"learning_rate": 0.00046701077199281865,
"loss": 5.2185,
"step": 1470
},
{
"epoch": 1.3285457809694794,
"grad_norm": 0.11496925354003906,
"learning_rate": 0.00046678635547576304,
"loss": 5.2082,
"step": 1480
},
{
"epoch": 1.3375224416517055,
"grad_norm": 0.2594555616378784,
"learning_rate": 0.00046656193895870733,
"loss": 5.1917,
"step": 1490
},
{
"epoch": 1.3464991023339317,
"grad_norm": 0.13332834839820862,
"learning_rate": 0.0004663375224416517,
"loss": 5.1701,
"step": 1500
},
{
"epoch": 1.355475763016158,
"grad_norm": 0.1260669082403183,
"learning_rate": 0.00046611310592459607,
"loss": 5.1703,
"step": 1510
},
{
"epoch": 1.3644524236983842,
"grad_norm": 0.17557017505168915,
"learning_rate": 0.0004658886894075404,
"loss": 5.1374,
"step": 1520
},
{
"epoch": 1.3734290843806103,
"grad_norm": 0.1354808807373047,
"learning_rate": 0.00046566427289048475,
"loss": 5.1732,
"step": 1530
},
{
"epoch": 1.3824057450628366,
"grad_norm": 0.16720908880233765,
"learning_rate": 0.0004654398563734291,
"loss": 5.3396,
"step": 1540
},
{
"epoch": 1.3913824057450628,
"grad_norm": 0.19078396260738373,
"learning_rate": 0.00046521543985637343,
"loss": 5.1455,
"step": 1550
},
{
"epoch": 1.400359066427289,
"grad_norm": 0.2168230563402176,
"learning_rate": 0.00046499102333931783,
"loss": 5.1026,
"step": 1560
},
{
"epoch": 1.4093357271095153,
"grad_norm": 0.12317873537540436,
"learning_rate": 0.0004647666068222621,
"loss": 5.1632,
"step": 1570
},
{
"epoch": 1.4183123877917414,
"grad_norm": 0.16298305988311768,
"learning_rate": 0.00046454219030520646,
"loss": 5.1489,
"step": 1580
},
{
"epoch": 1.4272890484739678,
"grad_norm": 0.09502866864204407,
"learning_rate": 0.0004643177737881508,
"loss": 5.1068,
"step": 1590
},
{
"epoch": 1.436265709156194,
"grad_norm": 0.15911273658275604,
"learning_rate": 0.00046409335727109514,
"loss": 5.0888,
"step": 1600
},
{
"epoch": 1.44524236983842,
"grad_norm": 0.12198328226804733,
"learning_rate": 0.00046386894075403954,
"loss": 5.071,
"step": 1610
},
{
"epoch": 1.4542190305206464,
"grad_norm": 0.11831381171941757,
"learning_rate": 0.00046364452423698383,
"loss": 5.0809,
"step": 1620
},
{
"epoch": 1.4631956912028725,
"grad_norm": 0.1053285300731659,
"learning_rate": 0.0004634201077199282,
"loss": 5.0774,
"step": 1630
},
{
"epoch": 1.4721723518850989,
"grad_norm": 0.1193586066365242,
"learning_rate": 0.0004631956912028725,
"loss": 5.0553,
"step": 1640
},
{
"epoch": 1.481149012567325,
"grad_norm": 0.16306863725185394,
"learning_rate": 0.0004629712746858169,
"loss": 5.0607,
"step": 1650
},
{
"epoch": 1.4901256732495511,
"grad_norm": 0.12861207127571106,
"learning_rate": 0.0004627468581687612,
"loss": 5.0656,
"step": 1660
},
{
"epoch": 1.4991023339317775,
"grad_norm": 0.08006058633327484,
"learning_rate": 0.0004625224416517056,
"loss": 5.0515,
"step": 1670
},
{
"epoch": 1.5080789946140036,
"grad_norm": 0.11404240876436234,
"learning_rate": 0.00046229802513464993,
"loss": 5.0098,
"step": 1680
},
{
"epoch": 1.5170556552962298,
"grad_norm": 0.13075587153434753,
"learning_rate": 0.0004620736086175943,
"loss": 4.9911,
"step": 1690
},
{
"epoch": 1.5260323159784561,
"grad_norm": 0.17212539911270142,
"learning_rate": 0.0004618491921005386,
"loss": 5.0541,
"step": 1700
},
{
"epoch": 1.5350089766606823,
"grad_norm": 0.07674333453178406,
"learning_rate": 0.00046162477558348296,
"loss": 5.0126,
"step": 1710
},
{
"epoch": 1.5439856373429084,
"grad_norm": 0.1121719628572464,
"learning_rate": 0.0004614003590664273,
"loss": 5.0082,
"step": 1720
},
{
"epoch": 1.5529622980251347,
"grad_norm": 0.16214531660079956,
"learning_rate": 0.00046117594254937164,
"loss": 4.9905,
"step": 1730
},
{
"epoch": 1.5619389587073609,
"grad_norm": 0.12353977560997009,
"learning_rate": 0.000460951526032316,
"loss": 4.9644,
"step": 1740
},
{
"epoch": 1.570915619389587,
"grad_norm": 0.15267392992973328,
"learning_rate": 0.0004607271095152603,
"loss": 4.9708,
"step": 1750
},
{
"epoch": 1.5798922800718134,
"grad_norm": 0.17361833155155182,
"learning_rate": 0.00046050269299820467,
"loss": 4.9869,
"step": 1760
},
{
"epoch": 1.5888689407540395,
"grad_norm": 0.2920306622982025,
"learning_rate": 0.000460278276481149,
"loss": 4.9322,
"step": 1770
},
{
"epoch": 1.5978456014362656,
"grad_norm": 0.09478717297315598,
"learning_rate": 0.0004600538599640934,
"loss": 4.9247,
"step": 1780
},
{
"epoch": 1.606822262118492,
"grad_norm": 0.09164275228977203,
"learning_rate": 0.0004598294434470377,
"loss": 4.9086,
"step": 1790
},
{
"epoch": 1.6157989228007181,
"grad_norm": 0.07962439954280853,
"learning_rate": 0.0004596050269299821,
"loss": 4.9412,
"step": 1800
},
{
"epoch": 1.6247755834829443,
"grad_norm": 0.08752849698066711,
"learning_rate": 0.0004593806104129264,
"loss": 4.9291,
"step": 1810
},
{
"epoch": 1.6337522441651706,
"grad_norm": 0.09293937683105469,
"learning_rate": 0.0004591561938958708,
"loss": 4.9652,
"step": 1820
},
{
"epoch": 1.6427289048473968,
"grad_norm": 0.09523571282625198,
"learning_rate": 0.00045893177737881506,
"loss": 4.9137,
"step": 1830
},
{
"epoch": 1.6517055655296229,
"grad_norm": 0.09075015783309937,
"learning_rate": 0.00045870736086175946,
"loss": 4.8925,
"step": 1840
},
{
"epoch": 1.6606822262118492,
"grad_norm": 0.14088210463523865,
"learning_rate": 0.0004584829443447038,
"loss": 4.8941,
"step": 1850
},
{
"epoch": 1.6696588868940754,
"grad_norm": 0.06859997659921646,
"learning_rate": 0.0004582585278276481,
"loss": 4.8731,
"step": 1860
},
{
"epoch": 1.6786355475763015,
"grad_norm": 0.06676523387432098,
"learning_rate": 0.0004580341113105925,
"loss": 4.8615,
"step": 1870
},
{
"epoch": 1.6876122082585279,
"grad_norm": 0.08721990138292313,
"learning_rate": 0.00045780969479353677,
"loss": 4.8847,
"step": 1880
},
{
"epoch": 1.696588868940754,
"grad_norm": 0.08681096136569977,
"learning_rate": 0.00045758527827648117,
"loss": 4.884,
"step": 1890
},
{
"epoch": 1.7055655296229801,
"grad_norm": 0.1754937767982483,
"learning_rate": 0.0004573608617594255,
"loss": 4.8663,
"step": 1900
},
{
"epoch": 1.7145421903052065,
"grad_norm": 0.07060963660478592,
"learning_rate": 0.00045713644524236985,
"loss": 4.9142,
"step": 1910
},
{
"epoch": 1.7235188509874326,
"grad_norm": 0.12035933881998062,
"learning_rate": 0.0004569120287253142,
"loss": 4.8599,
"step": 1920
},
{
"epoch": 1.7324955116696588,
"grad_norm": 0.11212557554244995,
"learning_rate": 0.00045668761220825853,
"loss": 4.8899,
"step": 1930
},
{
"epoch": 1.7414721723518851,
"grad_norm": 0.058452803641557693,
"learning_rate": 0.0004564631956912029,
"loss": 4.8454,
"step": 1940
},
{
"epoch": 1.7504488330341115,
"grad_norm": 0.1073731780052185,
"learning_rate": 0.0004562387791741472,
"loss": 4.8546,
"step": 1950
},
{
"epoch": 1.7594254937163374,
"grad_norm": 0.12025927007198334,
"learning_rate": 0.00045601436265709156,
"loss": 4.8446,
"step": 1960
},
{
"epoch": 1.7684021543985637,
"grad_norm": 0.08838968724012375,
"learning_rate": 0.00045578994614003596,
"loss": 4.8625,
"step": 1970
},
{
"epoch": 1.77737881508079,
"grad_norm": 0.0963386595249176,
"learning_rate": 0.00045556552962298024,
"loss": 4.8518,
"step": 1980
},
{
"epoch": 1.786355475763016,
"grad_norm": 0.10317738354206085,
"learning_rate": 0.00045534111310592464,
"loss": 4.8161,
"step": 1990
},
{
"epoch": 1.7953321364452424,
"grad_norm": 2.792144536972046,
"learning_rate": 0.00045511669658886893,
"loss": 4.8589,
"step": 2000
},
{
"epoch": 1.8043087971274687,
"grad_norm": 0.08297235518693924,
"learning_rate": 0.00045489228007181327,
"loss": 4.8135,
"step": 2010
},
{
"epoch": 1.8132854578096946,
"grad_norm": 0.080784372985363,
"learning_rate": 0.00045466786355475767,
"loss": 4.8436,
"step": 2020
},
{
"epoch": 1.822262118491921,
"grad_norm": 0.08878181129693985,
"learning_rate": 0.00045444344703770195,
"loss": 4.8532,
"step": 2030
},
{
"epoch": 1.8312387791741473,
"grad_norm": 0.0814034566283226,
"learning_rate": 0.00045421903052064635,
"loss": 4.7992,
"step": 2040
},
{
"epoch": 1.8402154398563735,
"grad_norm": 0.05908200889825821,
"learning_rate": 0.00045399461400359064,
"loss": 4.8005,
"step": 2050
},
{
"epoch": 1.8491921005385996,
"grad_norm": 0.06837856769561768,
"learning_rate": 0.00045377019748653503,
"loss": 4.802,
"step": 2060
},
{
"epoch": 1.858168761220826,
"grad_norm": 0.06775591522455215,
"learning_rate": 0.0004535457809694794,
"loss": 4.7847,
"step": 2070
},
{
"epoch": 1.867145421903052,
"grad_norm": 0.27018266916275024,
"learning_rate": 0.0004533213644524237,
"loss": 4.7918,
"step": 2080
},
{
"epoch": 1.8761220825852782,
"grad_norm": 0.21435914933681488,
"learning_rate": 0.00045309694793536806,
"loss": 4.8033,
"step": 2090
},
{
"epoch": 1.8850987432675046,
"grad_norm": 0.07224582880735397,
"learning_rate": 0.0004528725314183124,
"loss": 4.7639,
"step": 2100
},
{
"epoch": 1.8940754039497307,
"grad_norm": 0.08708648383617401,
"learning_rate": 0.00045264811490125674,
"loss": 4.7894,
"step": 2110
},
{
"epoch": 1.9030520646319569,
"grad_norm": 0.08637712150812149,
"learning_rate": 0.0004524236983842011,
"loss": 4.7745,
"step": 2120
},
{
"epoch": 1.9120287253141832,
"grad_norm": 0.06233949586749077,
"learning_rate": 0.0004521992818671454,
"loss": 4.783,
"step": 2130
},
{
"epoch": 1.9210053859964094,
"grad_norm": 0.07999356091022491,
"learning_rate": 0.0004519748653500898,
"loss": 4.7585,
"step": 2140
},
{
"epoch": 1.9299820466786355,
"grad_norm": 0.09440754354000092,
"learning_rate": 0.0004517504488330341,
"loss": 4.7653,
"step": 2150
},
{
"epoch": 1.9389587073608618,
"grad_norm": 0.09272520244121552,
"learning_rate": 0.00045152603231597845,
"loss": 4.7523,
"step": 2160
},
{
"epoch": 1.947935368043088,
"grad_norm": 0.08041410148143768,
"learning_rate": 0.0004513016157989228,
"loss": 4.7593,
"step": 2170
},
{
"epoch": 1.9569120287253141,
"grad_norm": 0.048107001930475235,
"learning_rate": 0.00045107719928186714,
"loss": 4.7392,
"step": 2180
},
{
"epoch": 1.9658886894075405,
"grad_norm": 0.07445549219846725,
"learning_rate": 0.00045085278276481153,
"loss": 4.7288,
"step": 2190
},
{
"epoch": 1.9748653500897666,
"grad_norm": 0.06540877372026443,
"learning_rate": 0.0004506283662477558,
"loss": 4.7311,
"step": 2200
},
{
"epoch": 1.9838420107719927,
"grad_norm": 0.05422632023692131,
"learning_rate": 0.0004504039497307002,
"loss": 4.728,
"step": 2210
},
{
"epoch": 1.992818671454219,
"grad_norm": 0.05353199318051338,
"learning_rate": 0.0004501795332136445,
"loss": 4.7274,
"step": 2220
},
{
"epoch": 2.0,
"eval_loss": 4.539531707763672,
"eval_runtime": 437.126,
"eval_samples_per_second": 10.194,
"eval_steps_per_second": 1.274,
"step": 2228
},
{
"epoch": 2.0017953321364454,
"grad_norm": 0.05564208701252937,
"learning_rate": 0.0004499551166965889,
"loss": 4.7397,
"step": 2230
},
{
"epoch": 2.0107719928186714,
"grad_norm": 0.05997077003121376,
"learning_rate": 0.00044973070017953324,
"loss": 4.714,
"step": 2240
},
{
"epoch": 2.0197486535008977,
"grad_norm": 0.0496087446808815,
"learning_rate": 0.0004495062836624776,
"loss": 4.7431,
"step": 2250
},
{
"epoch": 2.028725314183124,
"grad_norm": 0.08797594904899597,
"learning_rate": 0.0004492818671454219,
"loss": 4.7186,
"step": 2260
},
{
"epoch": 2.03770197486535,
"grad_norm": 0.05270407721400261,
"learning_rate": 0.00044905745062836627,
"loss": 4.7419,
"step": 2270
},
{
"epoch": 2.0466786355475763,
"grad_norm": 0.06538320332765579,
"learning_rate": 0.0004488330341113106,
"loss": 4.714,
"step": 2280
},
{
"epoch": 2.0556552962298027,
"grad_norm": 0.060536161065101624,
"learning_rate": 0.0004486086175942549,
"loss": 4.691,
"step": 2290
},
{
"epoch": 2.0646319569120286,
"grad_norm": 0.10158341377973557,
"learning_rate": 0.0004483842010771993,
"loss": 4.702,
"step": 2300
},
{
"epoch": 2.073608617594255,
"grad_norm": 0.08171387016773224,
"learning_rate": 0.00044815978456014363,
"loss": 4.7029,
"step": 2310
},
{
"epoch": 2.0825852782764813,
"grad_norm": 0.07701843976974487,
"learning_rate": 0.000447935368043088,
"loss": 4.6957,
"step": 2320
},
{
"epoch": 2.0915619389587072,
"grad_norm": 0.06302302330732346,
"learning_rate": 0.0004477109515260323,
"loss": 4.6855,
"step": 2330
},
{
"epoch": 2.1005385996409336,
"grad_norm": 0.12679466605186462,
"learning_rate": 0.00044748653500897666,
"loss": 4.7147,
"step": 2340
},
{
"epoch": 2.10951526032316,
"grad_norm": 0.17339470982551575,
"learning_rate": 0.000447262118491921,
"loss": 4.6697,
"step": 2350
},
{
"epoch": 2.118491921005386,
"grad_norm": 0.07397322356700897,
"learning_rate": 0.0004470377019748654,
"loss": 4.6642,
"step": 2360
},
{
"epoch": 2.127468581687612,
"grad_norm": 0.0524037629365921,
"learning_rate": 0.0004468132854578097,
"loss": 4.6511,
"step": 2370
},
{
"epoch": 2.1364452423698386,
"grad_norm": 0.06674987077713013,
"learning_rate": 0.0004465888689407541,
"loss": 4.6374,
"step": 2380
},
{
"epoch": 2.1454219030520645,
"grad_norm": 0.04827852547168732,
"learning_rate": 0.00044636445242369837,
"loss": 4.6531,
"step": 2390
},
{
"epoch": 2.154398563734291,
"grad_norm": 0.05094282329082489,
"learning_rate": 0.00044614003590664277,
"loss": 4.6554,
"step": 2400
},
{
"epoch": 2.163375224416517,
"grad_norm": 0.0653914213180542,
"learning_rate": 0.00044591561938958705,
"loss": 4.6568,
"step": 2410
},
{
"epoch": 2.172351885098743,
"grad_norm": 0.06652519851922989,
"learning_rate": 0.00044569120287253145,
"loss": 4.6533,
"step": 2420
},
{
"epoch": 2.1813285457809695,
"grad_norm": 0.051527008414268494,
"learning_rate": 0.0004454667863554758,
"loss": 4.6438,
"step": 2430
},
{
"epoch": 2.190305206463196,
"grad_norm": 0.047185543924570084,
"learning_rate": 0.0004452423698384201,
"loss": 4.6157,
"step": 2440
},
{
"epoch": 2.1992818671454217,
"grad_norm": 0.0524996742606163,
"learning_rate": 0.0004450179533213645,
"loss": 4.6583,
"step": 2450
},
{
"epoch": 2.208258527827648,
"grad_norm": 0.055864688009023666,
"learning_rate": 0.00044479353680430876,
"loss": 4.611,
"step": 2460
},
{
"epoch": 2.2172351885098744,
"grad_norm": 0.055937688797712326,
"learning_rate": 0.00044456912028725316,
"loss": 4.6302,
"step": 2470
},
{
"epoch": 2.2262118491921004,
"grad_norm": 0.07318311929702759,
"learning_rate": 0.0004443447037701975,
"loss": 4.6383,
"step": 2480
},
{
"epoch": 2.2351885098743267,
"grad_norm": 0.05302512273192406,
"learning_rate": 0.00044412028725314184,
"loss": 4.6142,
"step": 2490
},
{
"epoch": 2.244165170556553,
"grad_norm": 0.050843581557273865,
"learning_rate": 0.0004438958707360862,
"loss": 4.5937,
"step": 2500
},
{
"epoch": 2.253141831238779,
"grad_norm": 0.0519312284886837,
"learning_rate": 0.0004436714542190305,
"loss": 4.6083,
"step": 2510
},
{
"epoch": 2.2621184919210053,
"grad_norm": 0.05857894569635391,
"learning_rate": 0.00044344703770197487,
"loss": 4.5765,
"step": 2520
},
{
"epoch": 2.2710951526032317,
"grad_norm": 0.05550041422247887,
"learning_rate": 0.00044322262118491926,
"loss": 4.5859,
"step": 2530
},
{
"epoch": 2.280071813285458,
"grad_norm": 0.10349979996681213,
"learning_rate": 0.00044299820466786355,
"loss": 4.5765,
"step": 2540
},
{
"epoch": 2.289048473967684,
"grad_norm": 0.1185607761144638,
"learning_rate": 0.00044277378815080795,
"loss": 4.5946,
"step": 2550
},
{
"epoch": 2.2980251346499103,
"grad_norm": 0.09133188426494598,
"learning_rate": 0.00044254937163375224,
"loss": 4.578,
"step": 2560
},
{
"epoch": 2.3070017953321367,
"grad_norm": 0.08713024109601974,
"learning_rate": 0.00044232495511669663,
"loss": 4.6011,
"step": 2570
},
{
"epoch": 2.3159784560143626,
"grad_norm": 0.05465725436806679,
"learning_rate": 0.0004421005385996409,
"loss": 4.5755,
"step": 2580
},
{
"epoch": 2.324955116696589,
"grad_norm": 0.056493621319532394,
"learning_rate": 0.00044187612208258526,
"loss": 4.5855,
"step": 2590
},
{
"epoch": 2.3339317773788153,
"grad_norm": 0.047107528895139694,
"learning_rate": 0.00044165170556552966,
"loss": 4.5681,
"step": 2600
},
{
"epoch": 2.342908438061041,
"grad_norm": 0.05533495545387268,
"learning_rate": 0.00044142728904847394,
"loss": 4.5581,
"step": 2610
},
{
"epoch": 2.3518850987432676,
"grad_norm": 0.0478278249502182,
"learning_rate": 0.00044120287253141834,
"loss": 4.5425,
"step": 2620
},
{
"epoch": 2.360861759425494,
"grad_norm": 0.06553395092487335,
"learning_rate": 0.00044097845601436263,
"loss": 4.5484,
"step": 2630
},
{
"epoch": 2.36983842010772,
"grad_norm": 0.07375505566596985,
"learning_rate": 0.000440754039497307,
"loss": 4.541,
"step": 2640
},
{
"epoch": 2.378815080789946,
"grad_norm": 0.20693852007389069,
"learning_rate": 0.00044052962298025137,
"loss": 4.5521,
"step": 2650
},
{
"epoch": 2.3877917414721725,
"grad_norm": 0.056829433888196945,
"learning_rate": 0.0004403052064631957,
"loss": 4.5588,
"step": 2660
},
{
"epoch": 2.3967684021543985,
"grad_norm": 0.05583564192056656,
"learning_rate": 0.00044008078994614005,
"loss": 4.5358,
"step": 2670
},
{
"epoch": 2.405745062836625,
"grad_norm": 0.07319542020559311,
"learning_rate": 0.0004398563734290844,
"loss": 4.523,
"step": 2680
},
{
"epoch": 2.414721723518851,
"grad_norm": 0.052402835339307785,
"learning_rate": 0.00043963195691202873,
"loss": 4.5096,
"step": 2690
},
{
"epoch": 2.423698384201077,
"grad_norm": 0.05206010863184929,
"learning_rate": 0.00043940754039497313,
"loss": 4.5053,
"step": 2700
},
{
"epoch": 2.4326750448833034,
"grad_norm": 0.05443358048796654,
"learning_rate": 0.0004391831238779174,
"loss": 4.5501,
"step": 2710
},
{
"epoch": 2.44165170556553,
"grad_norm": 0.07843279093503952,
"learning_rate": 0.00043895870736086176,
"loss": 4.5027,
"step": 2720
},
{
"epoch": 2.4506283662477557,
"grad_norm": 0.046305350959300995,
"learning_rate": 0.0004387342908438061,
"loss": 4.4975,
"step": 2730
},
{
"epoch": 2.459605026929982,
"grad_norm": 0.22592291235923767,
"learning_rate": 0.00043850987432675044,
"loss": 4.5183,
"step": 2740
},
{
"epoch": 2.4685816876122084,
"grad_norm": 0.05082382634282112,
"learning_rate": 0.0004382854578096948,
"loss": 4.4864,
"step": 2750
},
{
"epoch": 2.4775583482944343,
"grad_norm": 0.06731193512678146,
"learning_rate": 0.00043806104129263913,
"loss": 4.4982,
"step": 2760
},
{
"epoch": 2.4865350089766607,
"grad_norm": 185.7747039794922,
"learning_rate": 0.0004378366247755835,
"loss": 4.6692,
"step": 2770
},
{
"epoch": 2.495511669658887,
"grad_norm": 0.058124568313360214,
"learning_rate": 0.0004376122082585278,
"loss": 4.4775,
"step": 2780
},
{
"epoch": 2.504488330341113,
"grad_norm": 0.08968983590602875,
"learning_rate": 0.0004373877917414722,
"loss": 4.4944,
"step": 2790
},
{
"epoch": 2.5134649910233393,
"grad_norm": 0.2788603901863098,
"learning_rate": 0.0004371633752244165,
"loss": 4.503,
"step": 2800
},
{
"epoch": 2.5224416517055657,
"grad_norm": 0.05559522658586502,
"learning_rate": 0.0004369389587073609,
"loss": 4.4733,
"step": 2810
},
{
"epoch": 2.5314183123877916,
"grad_norm": 0.05935097113251686,
"learning_rate": 0.00043671454219030523,
"loss": 4.4686,
"step": 2820
},
{
"epoch": 2.540394973070018,
"grad_norm": 0.05860767886042595,
"learning_rate": 0.0004364901256732496,
"loss": 4.4593,
"step": 2830
},
{
"epoch": 2.5493716337522443,
"grad_norm": 0.047259800136089325,
"learning_rate": 0.0004362657091561939,
"loss": 4.4479,
"step": 2840
},
{
"epoch": 2.55834829443447,
"grad_norm": 0.04901234060525894,
"learning_rate": 0.00043604129263913826,
"loss": 4.4621,
"step": 2850
},
{
"epoch": 2.5673249551166966,
"grad_norm": 0.05742761120200157,
"learning_rate": 0.0004358168761220826,
"loss": 4.4422,
"step": 2860
},
{
"epoch": 2.576301615798923,
"grad_norm": 0.05717416852712631,
"learning_rate": 0.0004355924596050269,
"loss": 4.4248,
"step": 2870
},
{
"epoch": 2.585278276481149,
"grad_norm": 0.0896502435207367,
"learning_rate": 0.0004353680430879713,
"loss": 4.4368,
"step": 2880
},
{
"epoch": 2.594254937163375,
"grad_norm": 0.08746081590652466,
"learning_rate": 0.0004351436265709156,
"loss": 4.4282,
"step": 2890
},
{
"epoch": 2.6032315978456015,
"grad_norm": 0.07144750654697418,
"learning_rate": 0.00043491921005385997,
"loss": 4.4794,
"step": 2900
},
{
"epoch": 2.6122082585278275,
"grad_norm": 0.05990668013691902,
"learning_rate": 0.0004346947935368043,
"loss": 4.4117,
"step": 2910
},
{
"epoch": 2.621184919210054,
"grad_norm": 0.07920947670936584,
"learning_rate": 0.00043447037701974865,
"loss": 4.4179,
"step": 2920
},
{
"epoch": 2.63016157989228,
"grad_norm": 0.053824532777071,
"learning_rate": 0.000434245960502693,
"loss": 4.3963,
"step": 2930
},
{
"epoch": 2.639138240574506,
"grad_norm": 0.06394129246473312,
"learning_rate": 0.0004340215439856374,
"loss": 4.4045,
"step": 2940
},
{
"epoch": 2.6481149012567324,
"grad_norm": 0.2640804648399353,
"learning_rate": 0.0004337971274685817,
"loss": 4.3916,
"step": 2950
},
{
"epoch": 2.657091561938959,
"grad_norm": 0.04887564107775688,
"learning_rate": 0.0004335727109515261,
"loss": 4.3892,
"step": 2960
},
{
"epoch": 2.6660682226211847,
"grad_norm": 0.05104290321469307,
"learning_rate": 0.00043334829443447036,
"loss": 4.3883,
"step": 2970
},
{
"epoch": 2.675044883303411,
"grad_norm": 0.18991751968860626,
"learning_rate": 0.00043312387791741476,
"loss": 4.3747,
"step": 2980
},
{
"epoch": 2.6840215439856374,
"grad_norm": 0.3262752294540405,
"learning_rate": 0.0004328994614003591,
"loss": 4.3755,
"step": 2990
},
{
"epoch": 2.6929982046678633,
"grad_norm": 0.6619095802307129,
"learning_rate": 0.0004326750448833034,
"loss": 4.3711,
"step": 3000
},
{
"epoch": 2.7019748653500897,
"grad_norm": 0.06734511256217957,
"learning_rate": 0.0004324506283662478,
"loss": 4.3606,
"step": 3010
},
{
"epoch": 2.710951526032316,
"grad_norm": 0.06055251508951187,
"learning_rate": 0.00043222621184919207,
"loss": 4.3639,
"step": 3020
},
{
"epoch": 2.719928186714542,
"grad_norm": 0.08325715363025665,
"learning_rate": 0.00043200179533213647,
"loss": 4.3439,
"step": 3030
},
{
"epoch": 2.7289048473967683,
"grad_norm": 0.06473597139120102,
"learning_rate": 0.00043177737881508075,
"loss": 4.3448,
"step": 3040
},
{
"epoch": 2.7378815080789947,
"grad_norm": 0.062395766377449036,
"learning_rate": 0.00043155296229802515,
"loss": 4.3262,
"step": 3050
},
{
"epoch": 2.7468581687612206,
"grad_norm": 0.054201096296310425,
"learning_rate": 0.0004313285457809695,
"loss": 4.3466,
"step": 3060
},
{
"epoch": 2.755834829443447,
"grad_norm": 0.05278482288122177,
"learning_rate": 0.00043110412926391383,
"loss": 4.3493,
"step": 3070
},
{
"epoch": 2.7648114901256733,
"grad_norm": 0.05334211513400078,
"learning_rate": 0.0004308797127468582,
"loss": 4.3308,
"step": 3080
},
{
"epoch": 2.773788150807899,
"grad_norm": 0.06164594739675522,
"learning_rate": 0.0004306552962298025,
"loss": 4.3166,
"step": 3090
},
{
"epoch": 2.7827648114901256,
"grad_norm": 0.07043807953596115,
"learning_rate": 0.00043043087971274686,
"loss": 4.3009,
"step": 3100
},
{
"epoch": 2.791741472172352,
"grad_norm": 0.05904858186841011,
"learning_rate": 0.00043020646319569126,
"loss": 4.2982,
"step": 3110
},
{
"epoch": 2.800718132854578,
"grad_norm": 0.3487374782562256,
"learning_rate": 0.00042998204667863554,
"loss": 4.2992,
"step": 3120
},
{
"epoch": 2.809694793536804,
"grad_norm": 0.06090310215950012,
"learning_rate": 0.00042975763016157994,
"loss": 4.3199,
"step": 3130
},
{
"epoch": 2.8186714542190305,
"grad_norm": 0.0674201026558876,
"learning_rate": 0.0004295332136445242,
"loss": 4.2813,
"step": 3140
},
{
"epoch": 2.827648114901257,
"grad_norm": 0.0564940869808197,
"learning_rate": 0.00042930879712746857,
"loss": 4.2875,
"step": 3150
},
{
"epoch": 2.836624775583483,
"grad_norm": 0.08277291059494019,
"learning_rate": 0.00042908438061041297,
"loss": 4.2741,
"step": 3160
},
{
"epoch": 2.845601436265709,
"grad_norm": 0.05882051959633827,
"learning_rate": 0.00042885996409335725,
"loss": 4.2606,
"step": 3170
},
{
"epoch": 2.8545780969479355,
"grad_norm": 0.056912124156951904,
"learning_rate": 0.00042863554757630165,
"loss": 4.2387,
"step": 3180
},
{
"epoch": 2.8635547576301614,
"grad_norm": 0.06803829967975616,
"learning_rate": 0.00042841113105924594,
"loss": 4.2595,
"step": 3190
},
{
"epoch": 2.872531418312388,
"grad_norm": 0.38242146372795105,
"learning_rate": 0.00042818671454219033,
"loss": 4.252,
"step": 3200
},
{
"epoch": 2.881508078994614,
"grad_norm": 0.06552311778068542,
"learning_rate": 0.0004279622980251346,
"loss": 4.2386,
"step": 3210
},
{
"epoch": 2.89048473967684,
"grad_norm": 0.06190953776240349,
"learning_rate": 0.000427737881508079,
"loss": 4.2279,
"step": 3220
},
{
"epoch": 2.8994614003590664,
"grad_norm": 0.07202804833650589,
"learning_rate": 0.00042751346499102336,
"loss": 4.2397,
"step": 3230
},
{
"epoch": 2.9084380610412928,
"grad_norm": 0.06415878981351852,
"learning_rate": 0.0004272890484739677,
"loss": 4.2124,
"step": 3240
},
{
"epoch": 2.917414721723519,
"grad_norm": 0.06290468573570251,
"learning_rate": 0.00042706463195691204,
"loss": 4.2267,
"step": 3250
},
{
"epoch": 2.926391382405745,
"grad_norm": 0.05975602567195892,
"learning_rate": 0.0004268402154398564,
"loss": 4.21,
"step": 3260
},
{
"epoch": 2.9353680430879714,
"grad_norm": 0.08022774755954742,
"learning_rate": 0.0004266157989228007,
"loss": 4.2127,
"step": 3270
},
{
"epoch": 2.9443447037701977,
"grad_norm": 0.11041318625211716,
"learning_rate": 0.0004263913824057451,
"loss": 4.1903,
"step": 3280
},
{
"epoch": 2.9533213644524237,
"grad_norm": 0.06093136593699455,
"learning_rate": 0.0004261669658886894,
"loss": 4.1882,
"step": 3290
},
{
"epoch": 2.96229802513465,
"grad_norm": 0.060306135565042496,
"learning_rate": 0.00042594254937163375,
"loss": 4.1874,
"step": 3300
},
{
"epoch": 2.9712746858168764,
"grad_norm": 0.0592743381857872,
"learning_rate": 0.0004257181328545781,
"loss": 4.1881,
"step": 3310
},
{
"epoch": 2.9802513464991023,
"grad_norm": 0.06113787367939949,
"learning_rate": 0.00042549371633752244,
"loss": 4.1788,
"step": 3320
},
{
"epoch": 2.9892280071813286,
"grad_norm": 0.0978228747844696,
"learning_rate": 0.0004252692998204668,
"loss": 4.1714,
"step": 3330
},
{
"epoch": 2.998204667863555,
"grad_norm": 0.06704937666654587,
"learning_rate": 0.0004250448833034111,
"loss": 4.1375,
"step": 3340
},
{
"epoch": 3.0,
"eval_loss": 3.9158332347869873,
"eval_runtime": 437.3013,
"eval_samples_per_second": 10.19,
"eval_steps_per_second": 1.274,
"step": 3342
}
],
"logging_steps": 10,
"max_steps": 22280,
"num_input_tokens_seen": 0,
"num_train_epochs": 20,
"save_steps": 500,
"stateful_callbacks": {
"EarlyStoppingCallback": {
"args": {
"early_stopping_patience": 3,
"early_stopping_threshold": 0.0
},
"attributes": {
"early_stopping_patience_counter": 0
}
},
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": false
},
"attributes": {}
}
},
"total_flos": 3.661123660598477e+16,
"train_batch_size": 8,
"trial_name": null,
"trial_params": null
}
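
A minimal sketch (not part of the state file itself) of how this JSON can be inspected: it assumes the file above has been saved locally as `trainer_state.json`, loads it with the standard library, and splits `log_history` into the per-10-step training-loss records and the per-epoch evaluation records. The local path is an assumption; adjust it to wherever the file lives.

```python
# Sketch: parse this trainer_state.json and summarize training vs. evaluation logs.
# Assumes the file has been downloaded locally as "trainer_state.json".
import json

with open("trainer_state.json") as f:
    state = json.load(f)

# Training entries carry "loss" (logged every 10 steps here, per "logging_steps": 10);
# evaluation entries carry "eval_loss" (logged once per epoch).
train_logs = [e for e in state["log_history"] if "loss" in e]
eval_logs = [e for e in state["log_history"] if "eval_loss" in e]

print(f"best eval loss: {state['best_metric']:.4f}")
print(f"best checkpoint: {state['best_model_checkpoint']}")
for e in eval_logs:
    print(f"epoch {e['epoch']:.0f}  step {e['step']:>5}  eval_loss {e['eval_loss']:.4f}")
```

Run against this file, the loop would print the three per-epoch evaluation losses recorded above (5.025, 4.540, and 3.916 at steps 1114, 2228, and 3342).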