{
"best_metric": 1.9664931297302246,
"best_model_checkpoint": "./lora_bn_resume/checkpoint-3000",
"epoch": 1.9292604501607717,
"eval_steps": 200,
"global_step": 3000,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.006430868167202572,
"grad_norm": 0.7529953718185425,
"learning_rate": 2.9999999999999997e-05,
"loss": 2.01,
"step": 10
},
{
"epoch": 0.012861736334405145,
"grad_norm": 0.8143910765647888,
"learning_rate": 5.9999999999999995e-05,
"loss": 1.9794,
"step": 20
},
{
"epoch": 0.01929260450160772,
"grad_norm": 0.7554563283920288,
"learning_rate": 8.999999999999999e-05,
"loss": 1.9687,
"step": 30
},
{
"epoch": 0.02572347266881029,
"grad_norm": 0.701172411441803,
"learning_rate": 0.00011999999999999999,
"loss": 2.0374,
"step": 40
},
{
"epoch": 0.03215434083601286,
"grad_norm": 0.7426002621650696,
"learning_rate": 0.00015,
"loss": 1.8484,
"step": 50
},
{
"epoch": 0.03858520900321544,
"grad_norm": 0.7900332808494568,
"learning_rate": 0.00017999999999999998,
"loss": 1.91,
"step": 60
},
{
"epoch": 0.04501607717041801,
"grad_norm": 0.7825136184692383,
"learning_rate": 0.00020999999999999998,
"loss": 1.9625,
"step": 70
},
{
"epoch": 0.05144694533762058,
"grad_norm": 0.9338003993034363,
"learning_rate": 0.00023999999999999998,
"loss": 1.9668,
"step": 80
},
{
"epoch": 0.05787781350482315,
"grad_norm": 0.8660485148429871,
"learning_rate": 0.00027,
"loss": 2.0447,
"step": 90
},
{
"epoch": 0.06430868167202572,
"grad_norm": 0.8631746768951416,
"learning_rate": 0.0003,
"loss": 2.0347,
"step": 100
},
{
"epoch": 0.0707395498392283,
"grad_norm": 0.9202760457992554,
"learning_rate": 0.00029934282584884994,
"loss": 2.0218,
"step": 110
},
{
"epoch": 0.07717041800643087,
"grad_norm": 0.8508992791175842,
"learning_rate": 0.00029868565169769985,
"loss": 1.9808,
"step": 120
},
{
"epoch": 0.08360128617363344,
"grad_norm": 0.9962050914764404,
"learning_rate": 0.0002980284775465498,
"loss": 1.9586,
"step": 130
},
{
"epoch": 0.09003215434083602,
"grad_norm": 0.9159810543060303,
"learning_rate": 0.00029737130339539973,
"loss": 2.0257,
"step": 140
},
{
"epoch": 0.09646302250803858,
"grad_norm": 0.8135138750076294,
"learning_rate": 0.0002967141292442497,
"loss": 2.0103,
"step": 150
},
{
"epoch": 0.10289389067524116,
"grad_norm": 0.7933633327484131,
"learning_rate": 0.00029605695509309966,
"loss": 2.028,
"step": 160
},
{
"epoch": 0.10932475884244373,
"grad_norm": 0.9258368611335754,
"learning_rate": 0.00029539978094194957,
"loss": 2.0654,
"step": 170
},
{
"epoch": 0.1157556270096463,
"grad_norm": 0.8758969902992249,
"learning_rate": 0.00029474260679079954,
"loss": 1.9928,
"step": 180
},
{
"epoch": 0.12218649517684887,
"grad_norm": 0.8316165804862976,
"learning_rate": 0.00029408543263964945,
"loss": 1.9748,
"step": 190
},
{
"epoch": 0.12861736334405144,
"grad_norm": 0.8353763222694397,
"learning_rate": 0.0002934282584884994,
"loss": 2.0167,
"step": 200
},
{
"epoch": 0.12861736334405144,
"eval_loss": 2.0699551105499268,
"eval_runtime": 131.8406,
"eval_samples_per_second": 15.17,
"eval_steps_per_second": 1.896,
"step": 200
},
{
"epoch": 0.13504823151125403,
"grad_norm": 0.8024882078170776,
"learning_rate": 0.0002927710843373494,
"loss": 2.1039,
"step": 210
},
{
"epoch": 0.1414790996784566,
"grad_norm": 0.861377477645874,
"learning_rate": 0.0002921139101861993,
"loss": 2.023,
"step": 220
},
{
"epoch": 0.14790996784565916,
"grad_norm": 0.8247071504592896,
"learning_rate": 0.00029145673603504926,
"loss": 1.9341,
"step": 230
},
{
"epoch": 0.15434083601286175,
"grad_norm": 0.8182681202888489,
"learning_rate": 0.0002907995618838992,
"loss": 2.0137,
"step": 240
},
{
"epoch": 0.1607717041800643,
"grad_norm": 0.8556217551231384,
"learning_rate": 0.00029014238773274913,
"loss": 2.0638,
"step": 250
},
{
"epoch": 0.16720257234726688,
"grad_norm": 0.7721512913703918,
"learning_rate": 0.0002894852135815991,
"loss": 2.0061,
"step": 260
},
{
"epoch": 0.17363344051446947,
"grad_norm": 0.7948784828186035,
"learning_rate": 0.000288828039430449,
"loss": 1.9751,
"step": 270
},
{
"epoch": 0.18006430868167203,
"grad_norm": 0.7582404613494873,
"learning_rate": 0.000288170865279299,
"loss": 2.0254,
"step": 280
},
{
"epoch": 0.1864951768488746,
"grad_norm": 0.9620535969734192,
"learning_rate": 0.00028751369112814894,
"loss": 1.9978,
"step": 290
},
{
"epoch": 0.19292604501607716,
"grad_norm": 0.7374221682548523,
"learning_rate": 0.00028685651697699885,
"loss": 2.0631,
"step": 300
},
{
"epoch": 0.19935691318327975,
"grad_norm": 0.794651210308075,
"learning_rate": 0.0002861993428258488,
"loss": 1.9507,
"step": 310
},
{
"epoch": 0.2057877813504823,
"grad_norm": 0.7450920939445496,
"learning_rate": 0.00028554216867469873,
"loss": 2.0363,
"step": 320
},
{
"epoch": 0.21221864951768488,
"grad_norm": 0.7574348449707031,
"learning_rate": 0.0002848849945235487,
"loss": 2.0508,
"step": 330
},
{
"epoch": 0.21864951768488747,
"grad_norm": 0.9118533134460449,
"learning_rate": 0.00028422782037239866,
"loss": 2.0118,
"step": 340
},
{
"epoch": 0.22508038585209003,
"grad_norm": 0.8136394023895264,
"learning_rate": 0.0002835706462212486,
"loss": 2.1211,
"step": 350
},
{
"epoch": 0.2315112540192926,
"grad_norm": 0.9099079966545105,
"learning_rate": 0.00028291347207009854,
"loss": 2.0346,
"step": 360
},
{
"epoch": 0.2379421221864952,
"grad_norm": 0.830896258354187,
"learning_rate": 0.0002822562979189485,
"loss": 2.0494,
"step": 370
},
{
"epoch": 0.24437299035369775,
"grad_norm": 0.789002001285553,
"learning_rate": 0.0002815991237677984,
"loss": 1.9791,
"step": 380
},
{
"epoch": 0.2508038585209003,
"grad_norm": 0.8194644451141357,
"learning_rate": 0.0002809419496166484,
"loss": 2.0106,
"step": 390
},
{
"epoch": 0.2572347266881029,
"grad_norm": 0.8226191401481628,
"learning_rate": 0.00028028477546549835,
"loss": 2.0268,
"step": 400
},
{
"epoch": 0.2572347266881029,
"eval_loss": 2.057727575302124,
"eval_runtime": 127.2637,
"eval_samples_per_second": 15.715,
"eval_steps_per_second": 1.964,
"step": 400
},
{
"epoch": 0.26366559485530544,
"grad_norm": 0.796454668045044,
"learning_rate": 0.00027962760131434826,
"loss": 2.0376,
"step": 410
},
{
"epoch": 0.27009646302250806,
"grad_norm": 0.8327352404594421,
"learning_rate": 0.0002789704271631982,
"loss": 2.0481,
"step": 420
},
{
"epoch": 0.2765273311897106,
"grad_norm": 0.8051420450210571,
"learning_rate": 0.0002783132530120482,
"loss": 1.99,
"step": 430
},
{
"epoch": 0.2829581993569132,
"grad_norm": 0.7519128322601318,
"learning_rate": 0.0002776560788608981,
"loss": 2.0339,
"step": 440
},
{
"epoch": 0.28938906752411575,
"grad_norm": 0.8251495957374573,
"learning_rate": 0.00027699890470974807,
"loss": 2.0289,
"step": 450
},
{
"epoch": 0.2958199356913183,
"grad_norm": 0.7058277130126953,
"learning_rate": 0.000276341730558598,
"loss": 2.0669,
"step": 460
},
{
"epoch": 0.3022508038585209,
"grad_norm": 0.8475114107131958,
"learning_rate": 0.00027568455640744795,
"loss": 2.0506,
"step": 470
},
{
"epoch": 0.3086816720257235,
"grad_norm": 0.7855744957923889,
"learning_rate": 0.0002750273822562979,
"loss": 1.97,
"step": 480
},
{
"epoch": 0.31511254019292606,
"grad_norm": 0.727988064289093,
"learning_rate": 0.0002743702081051478,
"loss": 2.0705,
"step": 490
},
{
"epoch": 0.3215434083601286,
"grad_norm": 0.7662935853004456,
"learning_rate": 0.0002737130339539978,
"loss": 1.9678,
"step": 500
},
{
"epoch": 0.3279742765273312,
"grad_norm": 0.9171555638313293,
"learning_rate": 0.00027305585980284776,
"loss": 1.9818,
"step": 510
},
{
"epoch": 0.33440514469453375,
"grad_norm": 0.7959179282188416,
"learning_rate": 0.00027239868565169767,
"loss": 2.0014,
"step": 520
},
{
"epoch": 0.3408360128617363,
"grad_norm": 0.9359775185585022,
"learning_rate": 0.00027174151150054763,
"loss": 2.0244,
"step": 530
},
{
"epoch": 0.34726688102893893,
"grad_norm": 0.7740966081619263,
"learning_rate": 0.0002710843373493976,
"loss": 2.0883,
"step": 540
},
{
"epoch": 0.3536977491961415,
"grad_norm": 0.868601381778717,
"learning_rate": 0.0002704271631982475,
"loss": 2.0226,
"step": 550
},
{
"epoch": 0.36012861736334406,
"grad_norm": 0.8721134662628174,
"learning_rate": 0.0002697699890470975,
"loss": 2.0965,
"step": 560
},
{
"epoch": 0.3665594855305466,
"grad_norm": 0.8080394268035889,
"learning_rate": 0.00026911281489594744,
"loss": 2.0082,
"step": 570
},
{
"epoch": 0.3729903536977492,
"grad_norm": 1.7169413566589355,
"learning_rate": 0.00026845564074479735,
"loss": 2.039,
"step": 580
},
{
"epoch": 0.37942122186495175,
"grad_norm": 0.8220880031585693,
"learning_rate": 0.0002677984665936473,
"loss": 2.0696,
"step": 590
},
{
"epoch": 0.3858520900321543,
"grad_norm": 0.7639694213867188,
"learning_rate": 0.00026714129244249723,
"loss": 2.0014,
"step": 600
},
{
"epoch": 0.3858520900321543,
"eval_loss": 2.0443177223205566,
"eval_runtime": 133.8726,
"eval_samples_per_second": 14.94,
"eval_steps_per_second": 1.867,
"step": 600
},
{
"epoch": 0.39228295819935693,
"grad_norm": 0.817965567111969,
"learning_rate": 0.0002664841182913472,
"loss": 2.0553,
"step": 610
},
{
"epoch": 0.3987138263665595,
"grad_norm": 0.871166467666626,
"learning_rate": 0.00026582694414019716,
"loss": 2.0027,
"step": 620
},
{
"epoch": 0.40514469453376206,
"grad_norm": 0.7483948469161987,
"learning_rate": 0.00026516976998904707,
"loss": 2.0355,
"step": 630
},
{
"epoch": 0.4115755627009646,
"grad_norm": 0.8223303556442261,
"learning_rate": 0.00026451259583789704,
"loss": 2.0076,
"step": 640
},
{
"epoch": 0.4180064308681672,
"grad_norm": 0.80986088514328,
"learning_rate": 0.00026385542168674695,
"loss": 2.0781,
"step": 650
},
{
"epoch": 0.42443729903536975,
"grad_norm": 0.7527362704277039,
"learning_rate": 0.0002631982475355969,
"loss": 1.9727,
"step": 660
},
{
"epoch": 0.43086816720257237,
"grad_norm": 0.7571489810943604,
"learning_rate": 0.0002625410733844469,
"loss": 2.0205,
"step": 670
},
{
"epoch": 0.43729903536977494,
"grad_norm": 0.7976600527763367,
"learning_rate": 0.0002618838992332968,
"loss": 2.0505,
"step": 680
},
{
"epoch": 0.4437299035369775,
"grad_norm": 0.8057394623756409,
"learning_rate": 0.00026122672508214676,
"loss": 2.0351,
"step": 690
},
{
"epoch": 0.45016077170418006,
"grad_norm": 0.8420009016990662,
"learning_rate": 0.0002605695509309967,
"loss": 1.9655,
"step": 700
},
{
"epoch": 0.4565916398713826,
"grad_norm": 0.853597104549408,
"learning_rate": 0.00025991237677984664,
"loss": 1.9939,
"step": 710
},
{
"epoch": 0.4630225080385852,
"grad_norm": 0.7588443160057068,
"learning_rate": 0.0002592552026286966,
"loss": 2.032,
"step": 720
},
{
"epoch": 0.4694533762057878,
"grad_norm": 0.8099080920219421,
"learning_rate": 0.0002585980284775465,
"loss": 1.9817,
"step": 730
},
{
"epoch": 0.4758842443729904,
"grad_norm": 0.7894070148468018,
"learning_rate": 0.0002579408543263965,
"loss": 2.0001,
"step": 740
},
{
"epoch": 0.48231511254019294,
"grad_norm": 0.7474116683006287,
"learning_rate": 0.00025728368017524644,
"loss": 2.0077,
"step": 750
},
{
"epoch": 0.4887459807073955,
"grad_norm": 0.8076878786087036,
"learning_rate": 0.00025662650602409636,
"loss": 2.0394,
"step": 760
},
{
"epoch": 0.49517684887459806,
"grad_norm": 0.7559667825698853,
"learning_rate": 0.0002559693318729463,
"loss": 1.9753,
"step": 770
},
{
"epoch": 0.5016077170418006,
"grad_norm": 0.7402215600013733,
"learning_rate": 0.00025531215772179623,
"loss": 2.0353,
"step": 780
},
{
"epoch": 0.5080385852090032,
"grad_norm": 0.7112523317337036,
"learning_rate": 0.0002546549835706462,
"loss": 1.989,
"step": 790
},
{
"epoch": 0.5144694533762058,
"grad_norm": 0.7255666255950928,
"learning_rate": 0.00025399780941949616,
"loss": 1.9912,
"step": 800
},
{
"epoch": 0.5144694533762058,
"eval_loss": 2.0358893871307373,
"eval_runtime": 131.9747,
"eval_samples_per_second": 15.154,
"eval_steps_per_second": 1.894,
"step": 800
},
{
"epoch": 0.5209003215434084,
"grad_norm": 0.7614848613739014,
"learning_rate": 0.0002533406352683461,
"loss": 1.9507,
"step": 810
},
{
"epoch": 0.5273311897106109,
"grad_norm": 0.7834282517433167,
"learning_rate": 0.00025268346111719604,
"loss": 2.0572,
"step": 820
},
{
"epoch": 0.5337620578778135,
"grad_norm": 0.8642615079879761,
"learning_rate": 0.00025202628696604595,
"loss": 1.9766,
"step": 830
},
{
"epoch": 0.5401929260450161,
"grad_norm": 0.7937222123146057,
"learning_rate": 0.0002513691128148959,
"loss": 1.9718,
"step": 840
},
{
"epoch": 0.5466237942122186,
"grad_norm": 0.7922580242156982,
"learning_rate": 0.0002507119386637459,
"loss": 2.0098,
"step": 850
},
{
"epoch": 0.5530546623794212,
"grad_norm": 0.7464605569839478,
"learning_rate": 0.0002500547645125958,
"loss": 1.9529,
"step": 860
},
{
"epoch": 0.5594855305466238,
"grad_norm": 0.7568275332450867,
"learning_rate": 0.00024939759036144576,
"loss": 1.989,
"step": 870
},
{
"epoch": 0.5659163987138264,
"grad_norm": 0.7011362910270691,
"learning_rate": 0.00024874041621029573,
"loss": 2.031,
"step": 880
},
{
"epoch": 0.572347266881029,
"grad_norm": 0.7106270790100098,
"learning_rate": 0.00024808324205914564,
"loss": 2.022,
"step": 890
},
{
"epoch": 0.5787781350482315,
"grad_norm": 0.7415210604667664,
"learning_rate": 0.0002474260679079956,
"loss": 2.0595,
"step": 900
},
{
"epoch": 0.5852090032154341,
"grad_norm": 0.7313567399978638,
"learning_rate": 0.0002467688937568455,
"loss": 2.0293,
"step": 910
},
{
"epoch": 0.5916398713826366,
"grad_norm": 0.692523181438446,
"learning_rate": 0.0002461117196056955,
"loss": 2.0746,
"step": 920
},
{
"epoch": 0.5980707395498392,
"grad_norm": 0.6929277181625366,
"learning_rate": 0.00024545454545454545,
"loss": 1.955,
"step": 930
},
{
"epoch": 0.6045016077170418,
"grad_norm": 0.7199161648750305,
"learning_rate": 0.00024479737130339536,
"loss": 2.0454,
"step": 940
},
{
"epoch": 0.6109324758842444,
"grad_norm": 0.767314076423645,
"learning_rate": 0.00024414019715224533,
"loss": 2.0428,
"step": 950
},
{
"epoch": 0.617363344051447,
"grad_norm": 0.8044443130493164,
"learning_rate": 0.00024348302300109526,
"loss": 1.9423,
"step": 960
},
{
"epoch": 0.6237942122186495,
"grad_norm": 0.702936589717865,
"learning_rate": 0.0002428258488499452,
"loss": 1.9271,
"step": 970
},
{
"epoch": 0.6302250803858521,
"grad_norm": 0.7394160032272339,
"learning_rate": 0.00024216867469879517,
"loss": 1.9674,
"step": 980
},
{
"epoch": 0.6366559485530546,
"grad_norm": 0.7981842160224915,
"learning_rate": 0.0002415115005476451,
"loss": 1.9932,
"step": 990
},
{
"epoch": 0.6430868167202572,
"grad_norm": 0.871896505355835,
"learning_rate": 0.00024085432639649505,
"loss": 2.0182,
"step": 1000
},
{
"epoch": 0.6430868167202572,
"eval_loss": 2.024224281311035,
"eval_runtime": 130.1041,
"eval_samples_per_second": 15.372,
"eval_steps_per_second": 1.922,
"step": 1000
},
{
"epoch": 0.6495176848874598,
"grad_norm": 0.7123499512672424,
"learning_rate": 0.00024019715224534498,
"loss": 2.0923,
"step": 1010
},
{
"epoch": 0.6559485530546624,
"grad_norm": 0.7226546406745911,
"learning_rate": 0.00023953997809419495,
"loss": 2.0035,
"step": 1020
},
{
"epoch": 0.662379421221865,
"grad_norm": 0.7627468109130859,
"learning_rate": 0.0002388828039430449,
"loss": 1.9667,
"step": 1030
},
{
"epoch": 0.6688102893890675,
"grad_norm": 0.8175467252731323,
"learning_rate": 0.00023822562979189483,
"loss": 1.948,
"step": 1040
},
{
"epoch": 0.6752411575562701,
"grad_norm": 0.690073549747467,
"learning_rate": 0.0002375684556407448,
"loss": 2.0498,
"step": 1050
},
{
"epoch": 0.6816720257234726,
"grad_norm": 0.9848446249961853,
"learning_rate": 0.0002369112814895947,
"loss": 1.9874,
"step": 1060
},
{
"epoch": 0.6881028938906752,
"grad_norm": 0.7157571315765381,
"learning_rate": 0.00023625410733844467,
"loss": 2.0488,
"step": 1070
},
{
"epoch": 0.6945337620578779,
"grad_norm": 0.8503302931785583,
"learning_rate": 0.00023559693318729464,
"loss": 1.9958,
"step": 1080
},
{
"epoch": 0.7009646302250804,
"grad_norm": 0.7864677906036377,
"learning_rate": 0.00023493975903614455,
"loss": 2.0212,
"step": 1090
},
{
"epoch": 0.707395498392283,
"grad_norm": 1.7837698459625244,
"learning_rate": 0.0002342825848849945,
"loss": 1.9828,
"step": 1100
},
{
"epoch": 0.7138263665594855,
"grad_norm": 0.7183972001075745,
"learning_rate": 0.00023362541073384445,
"loss": 2.0652,
"step": 1110
},
{
"epoch": 0.7202572347266881,
"grad_norm": 0.7377676963806152,
"learning_rate": 0.0002329682365826944,
"loss": 2.0123,
"step": 1120
},
{
"epoch": 0.7266881028938906,
"grad_norm": 0.7170071601867676,
"learning_rate": 0.00023231106243154436,
"loss": 1.9759,
"step": 1130
},
{
"epoch": 0.7331189710610932,
"grad_norm": 0.6442170143127441,
"learning_rate": 0.00023165388828039427,
"loss": 2.047,
"step": 1140
},
{
"epoch": 0.7395498392282959,
"grad_norm": 0.7356306910514832,
"learning_rate": 0.00023099671412924423,
"loss": 2.0438,
"step": 1150
},
{
"epoch": 0.7459807073954984,
"grad_norm": 0.7483031153678894,
"learning_rate": 0.0002303395399780942,
"loss": 2.0274,
"step": 1160
},
{
"epoch": 0.752411575562701,
"grad_norm": 0.7624642848968506,
"learning_rate": 0.0002296823658269441,
"loss": 1.9938,
"step": 1170
},
{
"epoch": 0.7588424437299035,
"grad_norm": 0.7435073256492615,
"learning_rate": 0.00022902519167579408,
"loss": 1.9848,
"step": 1180
},
{
"epoch": 0.7652733118971061,
"grad_norm": 0.7327163219451904,
"learning_rate": 0.000228368017524644,
"loss": 2.0286,
"step": 1190
},
{
"epoch": 0.7717041800643086,
"grad_norm": 0.8398700952529907,
"learning_rate": 0.00022771084337349395,
"loss": 1.999,
"step": 1200
},
{
"epoch": 0.7717041800643086,
"eval_loss": 2.0166773796081543,
"eval_runtime": 129.989,
"eval_samples_per_second": 15.386,
"eval_steps_per_second": 1.923,
"step": 1200
},
{
"epoch": 0.7781350482315113,
"grad_norm": 0.6727181673049927,
"learning_rate": 0.00022705366922234392,
"loss": 2.0044,
"step": 1210
},
{
"epoch": 0.7845659163987139,
"grad_norm": 0.8738404512405396,
"learning_rate": 0.00022639649507119383,
"loss": 2.0246,
"step": 1220
},
{
"epoch": 0.7909967845659164,
"grad_norm": 0.760010302066803,
"learning_rate": 0.0002257393209200438,
"loss": 2.0058,
"step": 1230
},
{
"epoch": 0.797427652733119,
"grad_norm": 0.701081395149231,
"learning_rate": 0.00022508214676889373,
"loss": 1.9974,
"step": 1240
},
{
"epoch": 0.8038585209003215,
"grad_norm": 0.7346913814544678,
"learning_rate": 0.00022442497261774367,
"loss": 2.0884,
"step": 1250
},
{
"epoch": 0.8102893890675241,
"grad_norm": 0.7433114647865295,
"learning_rate": 0.00022376779846659364,
"loss": 1.9927,
"step": 1260
},
{
"epoch": 0.8167202572347267,
"grad_norm": 0.7781444787979126,
"learning_rate": 0.00022311062431544358,
"loss": 2.001,
"step": 1270
},
{
"epoch": 0.8231511254019293,
"grad_norm": 0.7538995742797852,
"learning_rate": 0.00022245345016429352,
"loss": 1.9947,
"step": 1280
},
{
"epoch": 0.8295819935691319,
"grad_norm": 0.7132537961006165,
"learning_rate": 0.00022179627601314345,
"loss": 1.9781,
"step": 1290
},
{
"epoch": 0.8360128617363344,
"grad_norm": 0.7174340486526489,
"learning_rate": 0.0002211391018619934,
"loss": 1.9848,
"step": 1300
},
{
"epoch": 0.842443729903537,
"grad_norm": 0.7245258092880249,
"learning_rate": 0.00022048192771084336,
"loss": 2.005,
"step": 1310
},
{
"epoch": 0.8488745980707395,
"grad_norm": 0.667892336845398,
"learning_rate": 0.0002198247535596933,
"loss": 1.9939,
"step": 1320
},
{
"epoch": 0.8553054662379421,
"grad_norm": 0.7173146605491638,
"learning_rate": 0.00021916757940854324,
"loss": 2.0636,
"step": 1330
},
{
"epoch": 0.8617363344051447,
"grad_norm": 0.7765901684761047,
"learning_rate": 0.0002185104052573932,
"loss": 1.9966,
"step": 1340
},
{
"epoch": 0.8681672025723473,
"grad_norm": 0.7077351808547974,
"learning_rate": 0.00021785323110624314,
"loss": 2.0078,
"step": 1350
},
{
"epoch": 0.8745980707395499,
"grad_norm": 0.736723780632019,
"learning_rate": 0.00021719605695509308,
"loss": 2.0292,
"step": 1360
},
{
"epoch": 0.8810289389067524,
"grad_norm": 0.732185959815979,
"learning_rate": 0.00021653888280394302,
"loss": 2.0223,
"step": 1370
},
{
"epoch": 0.887459807073955,
"grad_norm": 0.7002454400062561,
"learning_rate": 0.00021588170865279298,
"loss": 2.0068,
"step": 1380
},
{
"epoch": 0.8938906752411575,
"grad_norm": 0.75859534740448,
"learning_rate": 0.00021522453450164292,
"loss": 1.9556,
"step": 1390
},
{
"epoch": 0.9003215434083601,
"grad_norm": 0.7475289106369019,
"learning_rate": 0.00021456736035049286,
"loss": 1.9792,
"step": 1400
},
{
"epoch": 0.9003215434083601,
"eval_loss": 2.0089023113250732,
"eval_runtime": 130.0325,
"eval_samples_per_second": 15.381,
"eval_steps_per_second": 1.923,
"step": 1400
},
{
"epoch": 0.9067524115755627,
"grad_norm": 0.7917546629905701,
"learning_rate": 0.00021391018619934283,
"loss": 1.9999,
"step": 1410
},
{
"epoch": 0.9131832797427653,
"grad_norm": 0.7062447667121887,
"learning_rate": 0.00021325301204819274,
"loss": 1.9779,
"step": 1420
},
{
"epoch": 0.9196141479099679,
"grad_norm": 0.6973288655281067,
"learning_rate": 0.0002125958378970427,
"loss": 2.0511,
"step": 1430
},
{
"epoch": 0.9260450160771704,
"grad_norm": 0.7297340035438538,
"learning_rate": 0.00021193866374589267,
"loss": 1.9764,
"step": 1440
},
{
"epoch": 0.932475884244373,
"grad_norm": 0.9256350994110107,
"learning_rate": 0.00021128148959474258,
"loss": 1.9559,
"step": 1450
},
{
"epoch": 0.9389067524115756,
"grad_norm": 0.6994000673294067,
"learning_rate": 0.00021062431544359255,
"loss": 2.0152,
"step": 1460
},
{
"epoch": 0.9453376205787781,
"grad_norm": 0.7412806749343872,
"learning_rate": 0.00020996714129244246,
"loss": 1.9494,
"step": 1470
},
{
"epoch": 0.9517684887459807,
"grad_norm": 0.729680061340332,
"learning_rate": 0.00020930996714129242,
"loss": 2.0272,
"step": 1480
},
{
"epoch": 0.9581993569131833,
"grad_norm": 0.7601342797279358,
"learning_rate": 0.0002086527929901424,
"loss": 1.9714,
"step": 1490
},
{
"epoch": 0.9646302250803859,
"grad_norm": 0.6875161528587341,
"learning_rate": 0.0002079956188389923,
"loss": 1.993,
"step": 1500
},
{
"epoch": 0.9710610932475884,
"grad_norm": 0.7520968317985535,
"learning_rate": 0.00020733844468784227,
"loss": 2.0471,
"step": 1510
},
{
"epoch": 0.977491961414791,
"grad_norm": 0.8061411380767822,
"learning_rate": 0.00020668127053669218,
"loss": 2.0145,
"step": 1520
},
{
"epoch": 0.9839228295819936,
"grad_norm": 0.7837228775024414,
"learning_rate": 0.00020602409638554214,
"loss": 1.9889,
"step": 1530
},
{
"epoch": 0.9903536977491961,
"grad_norm": 0.744296133518219,
"learning_rate": 0.0002053669222343921,
"loss": 1.9834,
"step": 1540
},
{
"epoch": 0.9967845659163987,
"grad_norm": 0.7137749791145325,
"learning_rate": 0.00020470974808324202,
"loss": 2.0582,
"step": 1550
},
{
"epoch": 1.0032154340836013,
"grad_norm": 0.718320906162262,
"learning_rate": 0.000204052573932092,
"loss": 1.9576,
"step": 1560
},
{
"epoch": 1.0096463022508038,
"grad_norm": 0.719998836517334,
"learning_rate": 0.00020339539978094195,
"loss": 1.9138,
"step": 1570
},
{
"epoch": 1.0160771704180065,
"grad_norm": 0.7154316306114197,
"learning_rate": 0.00020273822562979186,
"loss": 1.875,
"step": 1580
},
{
"epoch": 1.022508038585209,
"grad_norm": 0.6565534472465515,
"learning_rate": 0.00020208105147864183,
"loss": 1.9994,
"step": 1590
},
{
"epoch": 1.0289389067524115,
"grad_norm": 0.7222368121147156,
"learning_rate": 0.00020142387732749177,
"loss": 1.9591,
"step": 1600
},
{
"epoch": 1.0289389067524115,
"eval_loss": 2.002497673034668,
"eval_runtime": 131.2869,
"eval_samples_per_second": 15.234,
"eval_steps_per_second": 1.904,
"step": 1600
},
{
"epoch": 1.0353697749196142,
"grad_norm": 0.7213057279586792,
"learning_rate": 0.0002007667031763417,
"loss": 1.9464,
"step": 1610
},
{
"epoch": 1.0418006430868167,
"grad_norm": 0.6436830163002014,
"learning_rate": 0.00020010952902519167,
"loss": 1.8951,
"step": 1620
},
{
"epoch": 1.0482315112540193,
"grad_norm": 0.7160071134567261,
"learning_rate": 0.00019945235487404158,
"loss": 1.9062,
"step": 1630
},
{
"epoch": 1.0546623794212218,
"grad_norm": 0.6585739850997925,
"learning_rate": 0.00019879518072289155,
"loss": 1.9514,
"step": 1640
},
{
"epoch": 1.0610932475884245,
"grad_norm": 0.7445241808891296,
"learning_rate": 0.0001981380065717415,
"loss": 1.8301,
"step": 1650
},
{
"epoch": 1.067524115755627,
"grad_norm": 0.6654142141342163,
"learning_rate": 0.00019748083242059143,
"loss": 1.9048,
"step": 1660
},
{
"epoch": 1.0739549839228295,
"grad_norm": 0.7550114393234253,
"learning_rate": 0.0001968236582694414,
"loss": 1.9266,
"step": 1670
},
{
"epoch": 1.0803858520900322,
"grad_norm": 0.7276896834373474,
"learning_rate": 0.00019616648411829133,
"loss": 1.8942,
"step": 1680
},
{
"epoch": 1.0868167202572347,
"grad_norm": 0.7431575059890747,
"learning_rate": 0.00019550930996714127,
"loss": 1.9148,
"step": 1690
},
{
"epoch": 1.0932475884244373,
"grad_norm": 0.74256831407547,
"learning_rate": 0.0001948521358159912,
"loss": 1.942,
"step": 1700
},
{
"epoch": 1.09967845659164,
"grad_norm": 0.7295734286308289,
"learning_rate": 0.00019419496166484117,
"loss": 1.9331,
"step": 1710
},
{
"epoch": 1.1061093247588425,
"grad_norm": 0.7749672532081604,
"learning_rate": 0.0001935377875136911,
"loss": 1.9373,
"step": 1720
},
{
"epoch": 1.112540192926045,
"grad_norm": 0.6896611452102661,
"learning_rate": 0.00019288061336254105,
"loss": 1.8813,
"step": 1730
},
{
"epoch": 1.1189710610932475,
"grad_norm": 0.7282217741012573,
"learning_rate": 0.00019222343921139102,
"loss": 1.9634,
"step": 1740
},
{
"epoch": 1.1254019292604502,
"grad_norm": 0.7761743068695068,
"learning_rate": 0.00019156626506024093,
"loss": 1.8708,
"step": 1750
},
{
"epoch": 1.1318327974276527,
"grad_norm": 0.7596757411956787,
"learning_rate": 0.0001909090909090909,
"loss": 1.9446,
"step": 1760
},
{
"epoch": 1.1382636655948553,
"grad_norm": 0.7023797631263733,
"learning_rate": 0.00019025191675794086,
"loss": 1.8837,
"step": 1770
},
{
"epoch": 1.144694533762058,
"grad_norm": 0.7191573977470398,
"learning_rate": 0.00018959474260679077,
"loss": 1.9141,
"step": 1780
},
{
"epoch": 1.1511254019292605,
"grad_norm": 0.784885048866272,
"learning_rate": 0.00018893756845564074,
"loss": 1.9506,
"step": 1790
},
{
"epoch": 1.157556270096463,
"grad_norm": 0.710903525352478,
"learning_rate": 0.00018828039430449068,
"loss": 1.9157,
"step": 1800
},
{
"epoch": 1.157556270096463,
"eval_loss": 1.998835563659668,
"eval_runtime": 121.0458,
"eval_samples_per_second": 16.523,
"eval_steps_per_second": 2.065,
"step": 1800
},
{
"epoch": 1.1639871382636655,
"grad_norm": 0.7552351355552673,
"learning_rate": 0.00018762322015334062,
"loss": 1.9139,
"step": 1810
},
{
"epoch": 1.1704180064308682,
"grad_norm": 0.7722271084785461,
"learning_rate": 0.00018696604600219058,
"loss": 1.863,
"step": 1820
},
{
"epoch": 1.1768488745980707,
"grad_norm": 0.7195548415184021,
"learning_rate": 0.0001863088718510405,
"loss": 1.8697,
"step": 1830
},
{
"epoch": 1.1832797427652733,
"grad_norm": 0.7423893809318542,
"learning_rate": 0.00018565169769989046,
"loss": 1.9772,
"step": 1840
},
{
"epoch": 1.189710610932476,
"grad_norm": 0.7222315073013306,
"learning_rate": 0.00018499452354874042,
"loss": 1.9308,
"step": 1850
},
{
"epoch": 1.1961414790996785,
"grad_norm": 0.6815035939216614,
"learning_rate": 0.00018433734939759034,
"loss": 1.9675,
"step": 1860
},
{
"epoch": 1.202572347266881,
"grad_norm": 0.7621594071388245,
"learning_rate": 0.0001836801752464403,
"loss": 1.9295,
"step": 1870
},
{
"epoch": 1.2090032154340835,
"grad_norm": 0.7405025959014893,
"learning_rate": 0.0001830230010952902,
"loss": 1.9088,
"step": 1880
},
{
"epoch": 1.2154340836012862,
"grad_norm": 0.6729809641838074,
"learning_rate": 0.00018236582694414018,
"loss": 1.9446,
"step": 1890
},
{
"epoch": 1.2218649517684887,
"grad_norm": 0.7389471530914307,
"learning_rate": 0.00018170865279299014,
"loss": 1.8841,
"step": 1900
},
{
"epoch": 1.2282958199356913,
"grad_norm": 0.6453628540039062,
"learning_rate": 0.00018105147864184006,
"loss": 1.8661,
"step": 1910
},
{
"epoch": 1.234726688102894,
"grad_norm": 0.6971079111099243,
"learning_rate": 0.00018039430449069002,
"loss": 1.9807,
"step": 1920
},
{
"epoch": 1.2411575562700965,
"grad_norm": 0.7807840704917908,
"learning_rate": 0.00017973713033953996,
"loss": 1.9475,
"step": 1930
},
{
"epoch": 1.247588424437299,
"grad_norm": 0.78909832239151,
"learning_rate": 0.0001790799561883899,
"loss": 1.8439,
"step": 1940
},
{
"epoch": 1.2540192926045015,
"grad_norm": 0.7715321183204651,
"learning_rate": 0.00017842278203723986,
"loss": 1.9478,
"step": 1950
},
{
"epoch": 1.2604501607717042,
"grad_norm": 0.7786479592323303,
"learning_rate": 0.0001777656078860898,
"loss": 1.8773,
"step": 1960
},
{
"epoch": 1.2668810289389068,
"grad_norm": 0.6935726404190063,
"learning_rate": 0.00017710843373493974,
"loss": 1.94,
"step": 1970
},
{
"epoch": 1.2733118971061093,
"grad_norm": 0.7824066877365112,
"learning_rate": 0.00017645125958378968,
"loss": 1.8996,
"step": 1980
},
{
"epoch": 1.279742765273312,
"grad_norm": 0.7019379138946533,
"learning_rate": 0.00017579408543263962,
"loss": 1.9114,
"step": 1990
},
{
"epoch": 1.2861736334405145,
"grad_norm": 0.8215466737747192,
"learning_rate": 0.00017513691128148958,
"loss": 1.8294,
"step": 2000
},
{
"epoch": 1.2861736334405145,
"eval_loss": 1.9947528839111328,
"eval_runtime": 132.3397,
"eval_samples_per_second": 15.113,
"eval_steps_per_second": 1.889,
"step": 2000
},
{
"epoch": 1.292604501607717,
"grad_norm": 0.7088531851768494,
"learning_rate": 0.00017447973713033952,
"loss": 1.9497,
"step": 2010
},
{
"epoch": 1.2990353697749195,
"grad_norm": 0.7754150032997131,
"learning_rate": 0.00017382256297918946,
"loss": 1.9047,
"step": 2020
},
{
"epoch": 1.3054662379421222,
"grad_norm": 0.7185202836990356,
"learning_rate": 0.00017316538882803943,
"loss": 1.8529,
"step": 2030
},
{
"epoch": 1.3118971061093248,
"grad_norm": 0.7496573328971863,
"learning_rate": 0.00017250821467688937,
"loss": 1.8618,
"step": 2040
},
{
"epoch": 1.3183279742765273,
"grad_norm": 0.6794284582138062,
"learning_rate": 0.0001718510405257393,
"loss": 1.898,
"step": 2050
},
{
"epoch": 1.32475884244373,
"grad_norm": 0.7059448957443237,
"learning_rate": 0.00017119386637458924,
"loss": 1.9594,
"step": 2060
},
{
"epoch": 1.3311897106109325,
"grad_norm": 0.7007871866226196,
"learning_rate": 0.0001705366922234392,
"loss": 1.9476,
"step": 2070
},
{
"epoch": 1.337620578778135,
"grad_norm": 0.6973986029624939,
"learning_rate": 0.00016987951807228915,
"loss": 1.9567,
"step": 2080
},
{
"epoch": 1.3440514469453375,
"grad_norm": 0.7169969081878662,
"learning_rate": 0.00016922234392113909,
"loss": 1.9685,
"step": 2090
},
{
"epoch": 1.3504823151125402,
"grad_norm": 0.7009272575378418,
"learning_rate": 0.00016856516976998905,
"loss": 1.9714,
"step": 2100
},
{
"epoch": 1.3569131832797428,
"grad_norm": 0.7070193290710449,
"learning_rate": 0.00016790799561883896,
"loss": 1.9695,
"step": 2110
},
{
"epoch": 1.3633440514469453,
"grad_norm": 0.7268947958946228,
"learning_rate": 0.00016725082146768893,
"loss": 1.9107,
"step": 2120
},
{
"epoch": 1.369774919614148,
"grad_norm": 0.7544928789138794,
"learning_rate": 0.00016659364731653887,
"loss": 1.8658,
"step": 2130
},
{
"epoch": 1.3762057877813505,
"grad_norm": 0.6320627927780151,
"learning_rate": 0.0001659364731653888,
"loss": 1.8917,
"step": 2140
},
{
"epoch": 1.382636655948553,
"grad_norm": 0.6863923668861389,
"learning_rate": 0.00016527929901423877,
"loss": 1.9237,
"step": 2150
},
{
"epoch": 1.3890675241157555,
"grad_norm": 0.7775669097900391,
"learning_rate": 0.00016462212486308868,
"loss": 1.8548,
"step": 2160
},
{
"epoch": 1.3954983922829582,
"grad_norm": 0.7198719382286072,
"learning_rate": 0.00016396495071193865,
"loss": 1.9145,
"step": 2170
},
{
"epoch": 1.4019292604501608,
"grad_norm": 0.7938317656517029,
"learning_rate": 0.00016330777656078861,
"loss": 1.8939,
"step": 2180
},
{
"epoch": 1.4083601286173635,
"grad_norm": 0.7361711263656616,
"learning_rate": 0.00016265060240963853,
"loss": 1.9642,
"step": 2190
},
{
"epoch": 1.414790996784566,
"grad_norm": 0.7385576963424683,
"learning_rate": 0.0001619934282584885,
"loss": 1.9134,
"step": 2200
},
{
"epoch": 1.414790996784566,
"eval_loss": 1.9883830547332764,
"eval_runtime": 130.0767,
"eval_samples_per_second": 15.376,
"eval_steps_per_second": 1.922,
"step": 2200
},
{
"epoch": 1.4212218649517685,
"grad_norm": 0.7863461971282959,
"learning_rate": 0.0001613362541073384,
"loss": 2.0157,
"step": 2210
},
{
"epoch": 1.427652733118971,
"grad_norm": 0.7755898237228394,
"learning_rate": 0.00016067907995618837,
"loss": 1.8973,
"step": 2220
},
{
"epoch": 1.4340836012861735,
"grad_norm": 0.7090388536453247,
"learning_rate": 0.00016002190580503833,
"loss": 1.9034,
"step": 2230
},
{
"epoch": 1.4405144694533762,
"grad_norm": 0.6487644910812378,
"learning_rate": 0.00015936473165388825,
"loss": 1.906,
"step": 2240
},
{
"epoch": 1.4469453376205788,
"grad_norm": 0.6597898006439209,
"learning_rate": 0.0001587075575027382,
"loss": 1.843,
"step": 2250
},
{
"epoch": 1.4533762057877815,
"grad_norm": 0.7069796323776245,
"learning_rate": 0.00015805038335158818,
"loss": 1.9554,
"step": 2260
},
{
"epoch": 1.459807073954984,
"grad_norm": 0.7358680367469788,
"learning_rate": 0.0001573932092004381,
"loss": 1.9268,
"step": 2270
},
{
"epoch": 1.4662379421221865,
"grad_norm": 0.675457775592804,
"learning_rate": 0.00015673603504928806,
"loss": 1.8981,
"step": 2280
},
{
"epoch": 1.472668810289389,
"grad_norm": 0.7369397878646851,
"learning_rate": 0.000156078860898138,
"loss": 1.9535,
"step": 2290
},
{
"epoch": 1.4790996784565915,
"grad_norm": 0.666994035243988,
"learning_rate": 0.00015542168674698793,
"loss": 1.8657,
"step": 2300
},
{
"epoch": 1.4855305466237942,
"grad_norm": 0.7241340279579163,
"learning_rate": 0.0001547645125958379,
"loss": 1.8097,
"step": 2310
},
{
"epoch": 1.4919614147909968,
"grad_norm": 0.7224936485290527,
"learning_rate": 0.0001541073384446878,
"loss": 1.8397,
"step": 2320
},
{
"epoch": 1.4983922829581995,
"grad_norm": 0.7167637348175049,
"learning_rate": 0.00015345016429353778,
"loss": 1.9225,
"step": 2330
},
{
"epoch": 1.504823151125402,
"grad_norm": 0.7176666259765625,
"learning_rate": 0.00015279299014238771,
"loss": 1.8764,
"step": 2340
},
{
"epoch": 1.5112540192926045,
"grad_norm": 0.735252857208252,
"learning_rate": 0.00015213581599123765,
"loss": 1.8935,
"step": 2350
},
{
"epoch": 1.517684887459807,
"grad_norm": 0.6805827021598816,
"learning_rate": 0.00015147864184008762,
"loss": 1.9212,
"step": 2360
},
{
"epoch": 1.5241157556270095,
"grad_norm": 0.7019375562667847,
"learning_rate": 0.00015082146768893756,
"loss": 1.9318,
"step": 2370
},
{
"epoch": 1.5305466237942122,
"grad_norm": 0.6795372366905212,
"learning_rate": 0.0001501642935377875,
"loss": 1.9023,
"step": 2380
},
{
"epoch": 1.5369774919614148,
"grad_norm": 0.6497982144355774,
"learning_rate": 0.00014950711938663743,
"loss": 1.9721,
"step": 2390
},
{
"epoch": 1.5434083601286175,
"grad_norm": 0.7713346481323242,
"learning_rate": 0.0001488499452354874,
"loss": 1.9906,
"step": 2400
},
{
"epoch": 1.5434083601286175,
"eval_loss": 1.9822700023651123,
"eval_runtime": 130.376,
"eval_samples_per_second": 15.34,
"eval_steps_per_second": 1.918,
"step": 2400
},
{
"epoch": 1.54983922829582,
"grad_norm": 0.7202898263931274,
"learning_rate": 0.00014819277108433734,
"loss": 1.8816,
"step": 2410
},
{
"epoch": 1.5562700964630225,
"grad_norm": 0.7167313694953918,
"learning_rate": 0.00014753559693318728,
"loss": 1.9316,
"step": 2420
},
{
"epoch": 1.562700964630225,
"grad_norm": 0.7133712768554688,
"learning_rate": 0.00014687842278203724,
"loss": 2.0053,
"step": 2430
},
{
"epoch": 1.5691318327974275,
"grad_norm": 0.76304692029953,
"learning_rate": 0.00014622124863088718,
"loss": 1.8718,
"step": 2440
},
{
"epoch": 1.5755627009646302,
"grad_norm": 0.667654812335968,
"learning_rate": 0.00014556407447973712,
"loss": 1.8727,
"step": 2450
},
{
"epoch": 1.5819935691318328,
"grad_norm": 0.7308873534202576,
"learning_rate": 0.00014490690032858706,
"loss": 1.8918,
"step": 2460
},
{
"epoch": 1.5884244372990355,
"grad_norm": 0.9376251697540283,
"learning_rate": 0.00014424972617743702,
"loss": 1.96,
"step": 2470
},
{
"epoch": 1.594855305466238,
"grad_norm": 0.6924982666969299,
"learning_rate": 0.00014359255202628696,
"loss": 1.8744,
"step": 2480
},
{
"epoch": 1.6012861736334405,
"grad_norm": 0.7420899868011475,
"learning_rate": 0.0001429353778751369,
"loss": 1.9112,
"step": 2490
},
{
"epoch": 1.607717041800643,
"grad_norm": 0.7384818196296692,
"learning_rate": 0.00014227820372398684,
"loss": 1.9562,
"step": 2500
},
{
"epoch": 1.6141479099678455,
"grad_norm": 0.7550799250602722,
"learning_rate": 0.0001416210295728368,
"loss": 1.891,
"step": 2510
},
{
"epoch": 1.6205787781350482,
"grad_norm": 0.7184371948242188,
"learning_rate": 0.00014096385542168674,
"loss": 1.9361,
"step": 2520
},
{
"epoch": 1.6270096463022508,
"grad_norm": 0.770914614200592,
"learning_rate": 0.00014030668127053668,
"loss": 1.9132,
"step": 2530
},
{
"epoch": 1.6334405144694535,
"grad_norm": 0.7566716074943542,
"learning_rate": 0.00013964950711938662,
"loss": 1.8982,
"step": 2540
},
{
"epoch": 1.639871382636656,
"grad_norm": 0.6670147776603699,
"learning_rate": 0.00013899233296823656,
"loss": 1.9211,
"step": 2550
},
{
"epoch": 1.6463022508038585,
"grad_norm": 0.7093060612678528,
"learning_rate": 0.00013833515881708653,
"loss": 1.8881,
"step": 2560
},
{
"epoch": 1.652733118971061,
"grad_norm": 0.6549977660179138,
"learning_rate": 0.00013767798466593646,
"loss": 1.9187,
"step": 2570
},
{
"epoch": 1.6591639871382635,
"grad_norm": 0.7039531469345093,
"learning_rate": 0.0001370208105147864,
"loss": 1.9165,
"step": 2580
},
{
"epoch": 1.6655948553054662,
"grad_norm": 0.7216307520866394,
"learning_rate": 0.00013636363636363634,
"loss": 1.9228,
"step": 2590
},
{
"epoch": 1.6720257234726688,
"grad_norm": 0.6866537928581238,
"learning_rate": 0.00013570646221248628,
"loss": 1.9003,
"step": 2600
},
{
"epoch": 1.6720257234726688,
"eval_loss": 1.977206826210022,
"eval_runtime": 131.9243,
"eval_samples_per_second": 15.16,
"eval_steps_per_second": 1.895,
"step": 2600
},
{
"epoch": 1.6784565916398715,
"grad_norm": 0.7328875660896301,
"learning_rate": 0.00013504928806133625,
"loss": 1.9,
"step": 2610
},
{
"epoch": 1.684887459807074,
"grad_norm": 0.7623500227928162,
"learning_rate": 0.00013439211391018618,
"loss": 1.9117,
"step": 2620
},
{
"epoch": 1.6913183279742765,
"grad_norm": 0.6996557712554932,
"learning_rate": 0.00013373493975903612,
"loss": 1.8342,
"step": 2630
},
{
"epoch": 1.697749196141479,
"grad_norm": 0.6597011685371399,
"learning_rate": 0.00013307776560788606,
"loss": 1.911,
"step": 2640
},
{
"epoch": 1.7041800643086815,
"grad_norm": 0.7154627442359924,
"learning_rate": 0.00013242059145673603,
"loss": 1.8955,
"step": 2650
},
{
"epoch": 1.7106109324758842,
"grad_norm": 0.6822642087936401,
"learning_rate": 0.00013176341730558597,
"loss": 1.928,
"step": 2660
},
{
"epoch": 1.717041800643087,
"grad_norm": 0.6770340204238892,
"learning_rate": 0.0001311062431544359,
"loss": 1.934,
"step": 2670
},
{
"epoch": 1.7234726688102895,
"grad_norm": 0.7235671877861023,
"learning_rate": 0.00013044906900328584,
"loss": 1.9248,
"step": 2680
},
{
"epoch": 1.729903536977492,
"grad_norm": 0.6428620219230652,
"learning_rate": 0.0001297918948521358,
"loss": 1.8998,
"step": 2690
},
{
"epoch": 1.7363344051446945,
"grad_norm": 0.7132564783096313,
"learning_rate": 0.00012913472070098575,
"loss": 1.9353,
"step": 2700
},
{
"epoch": 1.742765273311897,
"grad_norm": 0.7110019326210022,
"learning_rate": 0.0001284775465498357,
"loss": 1.8877,
"step": 2710
},
{
"epoch": 1.7491961414790995,
"grad_norm": 0.7546197772026062,
"learning_rate": 0.00012782037239868565,
"loss": 1.9219,
"step": 2720
},
{
"epoch": 1.7556270096463023,
"grad_norm": 0.8485615253448486,
"learning_rate": 0.0001271631982475356,
"loss": 1.9238,
"step": 2730
},
{
"epoch": 1.762057877813505,
"grad_norm": 0.7058401703834534,
"learning_rate": 0.00012650602409638553,
"loss": 1.9012,
"step": 2740
},
{
"epoch": 1.7684887459807075,
"grad_norm": 0.7222112417221069,
"learning_rate": 0.00012584884994523547,
"loss": 1.8442,
"step": 2750
},
{
"epoch": 1.77491961414791,
"grad_norm": 0.7010639905929565,
"learning_rate": 0.00012519167579408543,
"loss": 1.9322,
"step": 2760
},
{
"epoch": 1.7813504823151125,
"grad_norm": 0.6908234357833862,
"learning_rate": 0.00012453450164293537,
"loss": 1.9456,
"step": 2770
},
{
"epoch": 1.787781350482315,
"grad_norm": 0.6615903973579407,
"learning_rate": 0.0001238773274917853,
"loss": 1.9052,
"step": 2780
},
{
"epoch": 1.7942122186495175,
"grad_norm": 0.6688089370727539,
"learning_rate": 0.00012322015334063528,
"loss": 1.87,
"step": 2790
},
{
"epoch": 1.8006430868167203,
"grad_norm": 0.7396994233131409,
"learning_rate": 0.00012256297918948522,
"loss": 1.9243,
"step": 2800
},
{
"epoch": 1.8006430868167203,
"eval_loss": 1.974278450012207,
"eval_runtime": 144.2243,
"eval_samples_per_second": 13.867,
"eval_steps_per_second": 1.733,
"step": 2800
},
{
"epoch": 1.807073954983923,
"grad_norm": 0.6520466208457947,
"learning_rate": 0.00012190580503833514,
"loss": 1.902,
"step": 2810
},
{
"epoch": 1.8135048231511255,
"grad_norm": 0.7591603398323059,
"learning_rate": 0.00012124863088718509,
"loss": 1.9079,
"step": 2820
},
{
"epoch": 1.819935691318328,
"grad_norm": 0.6622514128684998,
"learning_rate": 0.00012059145673603504,
"loss": 1.9288,
"step": 2830
},
{
"epoch": 1.8263665594855305,
"grad_norm": 0.7578607797622681,
"learning_rate": 0.00011993428258488498,
"loss": 1.8936,
"step": 2840
},
{
"epoch": 1.832797427652733,
"grad_norm": 0.730093240737915,
"learning_rate": 0.00011927710843373494,
"loss": 1.8809,
"step": 2850
},
{
"epoch": 1.8392282958199357,
"grad_norm": 0.6403250098228455,
"learning_rate": 0.00011861993428258487,
"loss": 1.8866,
"step": 2860
},
{
"epoch": 1.8456591639871383,
"grad_norm": 0.7032350897789001,
"learning_rate": 0.00011796276013143481,
"loss": 1.938,
"step": 2870
},
{
"epoch": 1.852090032154341,
"grad_norm": 0.7376342415809631,
"learning_rate": 0.00011730558598028478,
"loss": 1.8925,
"step": 2880
},
{
"epoch": 1.8585209003215435,
"grad_norm": 0.7093110680580139,
"learning_rate": 0.00011664841182913472,
"loss": 1.9029,
"step": 2890
},
{
"epoch": 1.864951768488746,
"grad_norm": 0.6826250553131104,
"learning_rate": 0.00011599123767798466,
"loss": 1.8956,
"step": 2900
},
{
"epoch": 1.8713826366559485,
"grad_norm": 0.7709969282150269,
"learning_rate": 0.0001153340635268346,
"loss": 1.92,
"step": 2910
},
{
"epoch": 1.877813504823151,
"grad_norm": 0.6641222238540649,
"learning_rate": 0.00011467688937568453,
"loss": 1.8998,
"step": 2920
},
{
"epoch": 1.8842443729903537,
"grad_norm": 0.7321887612342834,
"learning_rate": 0.0001140197152245345,
"loss": 1.9257,
"step": 2930
},
{
"epoch": 1.8906752411575563,
"grad_norm": 0.7000001668930054,
"learning_rate": 0.00011336254107338444,
"loss": 1.8944,
"step": 2940
},
{
"epoch": 1.897106109324759,
"grad_norm": 0.7347818613052368,
"learning_rate": 0.00011270536692223438,
"loss": 1.9256,
"step": 2950
},
{
"epoch": 1.9035369774919615,
"grad_norm": 0.708888590335846,
"learning_rate": 0.00011204819277108433,
"loss": 1.9307,
"step": 2960
},
{
"epoch": 1.909967845659164,
"grad_norm": 0.6980915665626526,
"learning_rate": 0.00011139101861993428,
"loss": 1.883,
"step": 2970
},
{
"epoch": 1.9163987138263665,
"grad_norm": 0.8052535653114319,
"learning_rate": 0.00011073384446878422,
"loss": 1.899,
"step": 2980
},
{
"epoch": 1.922829581993569,
"grad_norm": 0.707011878490448,
"learning_rate": 0.00011007667031763416,
"loss": 1.9263,
"step": 2990
},
{
"epoch": 1.9292604501607717,
"grad_norm": 0.7086938619613647,
"learning_rate": 0.00010941949616648411,
"loss": 1.883,
"step": 3000
},
{
"epoch": 1.9292604501607717,
"eval_loss": 1.9664931297302246,
"eval_runtime": 133.023,
"eval_samples_per_second": 15.035,
"eval_steps_per_second": 1.879,
"step": 3000
}
],
"logging_steps": 10,
"max_steps": 4665,
"num_input_tokens_seen": 0,
"num_train_epochs": 3,
"save_steps": 200,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": false
},
"attributes": {}
}
},
"total_flos": 3.0137669676957696e+17,
"train_batch_size": 16,
"trial_name": null,
"trial_params": null
}