llama3.1-cpo_j-full-0912 / trainer_state.json
jbjeong91's picture
Model save
b7e74d3 verified
{
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 0.9985553308292401,
"eval_steps": 100,
"global_step": 432,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.023114706732158336,
"grad_norm": 68.13874053955078,
"learning_rate": 2.2727272727272726e-07,
"logits/chosen": -0.33626726269721985,
"logits/rejected": -0.31605297327041626,
"logps/chosen": -269.3142395019531,
"logps/rejected": -267.5635681152344,
"loss": 2.9227,
"nll_loss": 1.0585803985595703,
"rewards/accuracies": 0.4906249940395355,
"rewards/chosen": -26.93142318725586,
"rewards/margins": -0.17506682872772217,
"rewards/rejected": -26.756357192993164,
"step": 10
},
{
"epoch": 0.04622941346431667,
"grad_norm": 57.248863220214844,
"learning_rate": 4.545454545454545e-07,
"logits/chosen": -0.3509574234485626,
"logits/rejected": -0.3329581320285797,
"logps/chosen": -260.00225830078125,
"logps/rejected": -266.528076171875,
"loss": 2.8595,
"nll_loss": 0.9751935005187988,
"rewards/accuracies": 0.565625011920929,
"rewards/chosen": -26.000228881835938,
"rewards/margins": 0.6525786519050598,
"rewards/rejected": -26.652807235717773,
"step": 20
},
{
"epoch": 0.06934412019647501,
"grad_norm": 57.46038055419922,
"learning_rate": 6.818181818181817e-07,
"logits/chosen": -0.38174495100975037,
"logits/rejected": -0.3690889775753021,
"logps/chosen": -243.15576171875,
"logps/rejected": -246.4366455078125,
"loss": 2.6573,
"nll_loss": 1.0130800008773804,
"rewards/accuracies": 0.512499988079071,
"rewards/chosen": -24.315576553344727,
"rewards/margins": 0.32808613777160645,
"rewards/rejected": -24.64366340637207,
"step": 30
},
{
"epoch": 0.09245882692863334,
"grad_norm": 51.81425476074219,
"learning_rate": 9.09090909090909e-07,
"logits/chosen": -0.6905701756477356,
"logits/rejected": -0.673626184463501,
"logps/chosen": -202.0510711669922,
"logps/rejected": -203.85264587402344,
"loss": 2.3834,
"nll_loss": 0.8630102872848511,
"rewards/accuracies": 0.518750011920929,
"rewards/chosen": -20.20510482788086,
"rewards/margins": 0.18015719950199127,
"rewards/rejected": -20.385265350341797,
"step": 40
},
{
"epoch": 0.11557353366079168,
"grad_norm": 46.2398796081543,
"learning_rate": 9.845360824742267e-07,
"logits/chosen": -0.8133252263069153,
"logits/rejected": -0.7886686325073242,
"logps/chosen": -176.6295623779297,
"logps/rejected": -175.64236450195312,
"loss": 2.1663,
"nll_loss": 0.46288958191871643,
"rewards/accuracies": 0.512499988079071,
"rewards/chosen": -17.662960052490234,
"rewards/margins": -0.09872126579284668,
"rewards/rejected": -17.56423568725586,
"step": 50
},
{
"epoch": 0.13868824039295002,
"grad_norm": 55.56767272949219,
"learning_rate": 9.587628865979382e-07,
"logits/chosen": -0.6391716003417969,
"logits/rejected": -0.6422410607337952,
"logps/chosen": -158.78402709960938,
"logps/rejected": -159.15992736816406,
"loss": 1.9369,
"nll_loss": 0.4064036011695862,
"rewards/accuracies": 0.5406249761581421,
"rewards/chosen": -15.878405570983887,
"rewards/margins": 0.037588153034448624,
"rewards/rejected": -15.915992736816406,
"step": 60
},
{
"epoch": 0.16180294712510834,
"grad_norm": 53.74827575683594,
"learning_rate": 9.329896907216495e-07,
"logits/chosen": -0.4799535274505615,
"logits/rejected": -0.45562925934791565,
"logps/chosen": -153.95602416992188,
"logps/rejected": -156.0488739013672,
"loss": 1.8829,
"nll_loss": 0.33092719316482544,
"rewards/accuracies": 0.5218750238418579,
"rewards/chosen": -15.395601272583008,
"rewards/margins": 0.20928561687469482,
"rewards/rejected": -15.604887008666992,
"step": 70
},
{
"epoch": 0.1849176538572667,
"grad_norm": 45.28865432739258,
"learning_rate": 9.072164948453608e-07,
"logits/chosen": -0.39702308177948,
"logits/rejected": -0.3713148832321167,
"logps/chosen": -158.48983764648438,
"logps/rejected": -161.58985900878906,
"loss": 1.7248,
"nll_loss": 0.2892971634864807,
"rewards/accuracies": 0.515625,
"rewards/chosen": -15.848983764648438,
"rewards/margins": 0.31000271439552307,
"rewards/rejected": -16.158987045288086,
"step": 80
},
{
"epoch": 0.208032360589425,
"grad_norm": 47.74916076660156,
"learning_rate": 8.814432989690721e-07,
"logits/chosen": -0.39265576004981995,
"logits/rejected": -0.3789977431297302,
"logps/chosen": -151.72657775878906,
"logps/rejected": -159.8419189453125,
"loss": 1.625,
"nll_loss": 0.24877457320690155,
"rewards/accuracies": 0.59375,
"rewards/chosen": -15.17265796661377,
"rewards/margins": 0.8115337491035461,
"rewards/rejected": -15.98419189453125,
"step": 90
},
{
"epoch": 0.23114706732158335,
"grad_norm": 46.02494430541992,
"learning_rate": 8.556701030927834e-07,
"logits/chosen": -0.3637830317020416,
"logits/rejected": -0.34602683782577515,
"logps/chosen": -156.66448974609375,
"logps/rejected": -159.7339630126953,
"loss": 1.7848,
"nll_loss": 0.3014821708202362,
"rewards/accuracies": 0.5375000238418579,
"rewards/chosen": -15.666448593139648,
"rewards/margins": 0.306946337223053,
"rewards/rejected": -15.973396301269531,
"step": 100
},
{
"epoch": 0.23114706732158335,
"eval_logits/chosen": -0.3793767988681793,
"eval_logits/rejected": -0.35158297419548035,
"eval_logps/chosen": -153.7521209716797,
"eval_logps/rejected": -157.6624755859375,
"eval_loss": 1.6452385187149048,
"eval_nll_loss": 0.27188077569007874,
"eval_rewards/accuracies": 0.5804347991943359,
"eval_rewards/chosen": -15.375213623046875,
"eval_rewards/margins": 0.39103466272354126,
"eval_rewards/rejected": -15.766247749328613,
"eval_runtime": 77.4102,
"eval_samples_per_second": 23.589,
"eval_steps_per_second": 1.486,
"step": 100
},
{
"epoch": 0.2542617740537417,
"grad_norm": 47.49018096923828,
"learning_rate": 8.298969072164948e-07,
"logits/chosen": -0.35000625252723694,
"logits/rejected": -0.31752458214759827,
"logps/chosen": -150.97109985351562,
"logps/rejected": -155.34848022460938,
"loss": 1.5701,
"nll_loss": 0.26048144698143005,
"rewards/accuracies": 0.5625,
"rewards/chosen": -15.0971097946167,
"rewards/margins": 0.4377376139163971,
"rewards/rejected": -15.534846305847168,
"step": 110
},
{
"epoch": 0.27737648078590005,
"grad_norm": 45.2691535949707,
"learning_rate": 8.041237113402062e-07,
"logits/chosen": -0.3519677221775055,
"logits/rejected": -0.32791006565093994,
"logps/chosen": -157.8858642578125,
"logps/rejected": -159.35427856445312,
"loss": 1.6168,
"nll_loss": 0.3137766718864441,
"rewards/accuracies": 0.534375011920929,
"rewards/chosen": -15.788586616516113,
"rewards/margins": 0.14684121310710907,
"rewards/rejected": -15.93542766571045,
"step": 120
},
{
"epoch": 0.30049118751805837,
"grad_norm": 57.24121856689453,
"learning_rate": 7.783505154639175e-07,
"logits/chosen": -0.42790165543556213,
"logits/rejected": -0.40469226241111755,
"logps/chosen": -157.43922424316406,
"logps/rejected": -165.82485961914062,
"loss": 1.6817,
"nll_loss": 0.3016583323478699,
"rewards/accuracies": 0.590624988079071,
"rewards/chosen": -15.743922233581543,
"rewards/margins": 0.8385635614395142,
"rewards/rejected": -16.58248519897461,
"step": 130
},
{
"epoch": 0.3236058942502167,
"grad_norm": 58.36777114868164,
"learning_rate": 7.525773195876288e-07,
"logits/chosen": -0.5298252105712891,
"logits/rejected": -0.5220087170600891,
"logps/chosen": -148.60704040527344,
"logps/rejected": -154.1773681640625,
"loss": 1.6595,
"nll_loss": 0.2847565710544586,
"rewards/accuracies": 0.581250011920929,
"rewards/chosen": -14.86070442199707,
"rewards/margins": 0.5570319294929504,
"rewards/rejected": -15.41773796081543,
"step": 140
},
{
"epoch": 0.34672060098237506,
"grad_norm": 68.2182846069336,
"learning_rate": 7.268041237113402e-07,
"logits/chosen": -0.5616439580917358,
"logits/rejected": -0.5490089654922485,
"logps/chosen": -149.12344360351562,
"logps/rejected": -154.98716735839844,
"loss": 1.7257,
"nll_loss": 0.2902756631374359,
"rewards/accuracies": 0.59375,
"rewards/chosen": -14.912343978881836,
"rewards/margins": 0.5863727331161499,
"rewards/rejected": -15.49871826171875,
"step": 150
},
{
"epoch": 0.3698353077145334,
"grad_norm": 39.74797821044922,
"learning_rate": 7.010309278350515e-07,
"logits/chosen": -0.5092633962631226,
"logits/rejected": -0.480968177318573,
"logps/chosen": -163.02232360839844,
"logps/rejected": -166.95147705078125,
"loss": 1.5552,
"nll_loss": 0.28835493326187134,
"rewards/accuracies": 0.59375,
"rewards/chosen": -16.302234649658203,
"rewards/margins": 0.39291518926620483,
"rewards/rejected": -16.695148468017578,
"step": 160
},
{
"epoch": 0.3929500144466917,
"grad_norm": 47.49101257324219,
"learning_rate": 6.752577319587629e-07,
"logits/chosen": -0.3924413323402405,
"logits/rejected": -0.3796409070491791,
"logps/chosen": -158.3751220703125,
"logps/rejected": -165.7510223388672,
"loss": 1.4469,
"nll_loss": 0.2873372733592987,
"rewards/accuracies": 0.59375,
"rewards/chosen": -15.837512016296387,
"rewards/margins": 0.7375894784927368,
"rewards/rejected": -16.575103759765625,
"step": 170
},
{
"epoch": 0.41606472117885,
"grad_norm": 47.435874938964844,
"learning_rate": 6.494845360824742e-07,
"logits/chosen": -0.47805255651474,
"logits/rejected": -0.4726547598838806,
"logps/chosen": -152.86453247070312,
"logps/rejected": -159.31887817382812,
"loss": 1.4799,
"nll_loss": 0.2764199376106262,
"rewards/accuracies": 0.5562499761581421,
"rewards/chosen": -15.286453247070312,
"rewards/margins": 0.6454333066940308,
"rewards/rejected": -15.93188762664795,
"step": 180
},
{
"epoch": 0.4391794279110084,
"grad_norm": 53.53675842285156,
"learning_rate": 6.237113402061855e-07,
"logits/chosen": -0.5260998606681824,
"logits/rejected": -0.5136414766311646,
"logps/chosen": -162.34234619140625,
"logps/rejected": -168.7532196044922,
"loss": 1.5476,
"nll_loss": 0.28763675689697266,
"rewards/accuracies": 0.5843750238418579,
"rewards/chosen": -16.234233856201172,
"rewards/margins": 0.6410863995552063,
"rewards/rejected": -16.875320434570312,
"step": 190
},
{
"epoch": 0.4622941346431667,
"grad_norm": 45.457393646240234,
"learning_rate": 5.979381443298969e-07,
"logits/chosen": -0.4183273911476135,
"logits/rejected": -0.4106271266937256,
"logps/chosen": -158.37742614746094,
"logps/rejected": -164.9311065673828,
"loss": 1.5276,
"nll_loss": 0.2878963351249695,
"rewards/accuracies": 0.6187499761581421,
"rewards/chosen": -15.837743759155273,
"rewards/margins": 0.6553663015365601,
"rewards/rejected": -16.49310874938965,
"step": 200
},
{
"epoch": 0.4622941346431667,
"eval_logits/chosen": -0.4236670732498169,
"eval_logits/rejected": -0.39834392070770264,
"eval_logps/chosen": -158.0997314453125,
"eval_logps/rejected": -164.43028259277344,
"eval_loss": 1.5229449272155762,
"eval_nll_loss": 0.27477577328681946,
"eval_rewards/accuracies": 0.604347825050354,
"eval_rewards/chosen": -15.809972763061523,
"eval_rewards/margins": 0.6330567002296448,
"eval_rewards/rejected": -16.443029403686523,
"eval_runtime": 77.51,
"eval_samples_per_second": 23.558,
"eval_steps_per_second": 1.484,
"step": 200
},
{
"epoch": 0.48540884137532503,
"grad_norm": 51.9904899597168,
"learning_rate": 5.721649484536082e-07,
"logits/chosen": -0.412663996219635,
"logits/rejected": -0.385576069355011,
"logps/chosen": -155.5344696044922,
"logps/rejected": -157.98362731933594,
"loss": 1.5112,
"nll_loss": 0.2793150544166565,
"rewards/accuracies": 0.5406249761581421,
"rewards/chosen": -15.553448677062988,
"rewards/margins": 0.24491462111473083,
"rewards/rejected": -15.798362731933594,
"step": 210
},
{
"epoch": 0.5085235481074833,
"grad_norm": 49.318214416503906,
"learning_rate": 5.463917525773195e-07,
"logits/chosen": -0.41389769315719604,
"logits/rejected": -0.38615840673446655,
"logps/chosen": -168.82632446289062,
"logps/rejected": -174.5264129638672,
"loss": 1.5696,
"nll_loss": 0.2951294779777527,
"rewards/accuracies": 0.612500011920929,
"rewards/chosen": -16.882633209228516,
"rewards/margins": 0.5700088143348694,
"rewards/rejected": -17.4526424407959,
"step": 220
},
{
"epoch": 0.5316382548396418,
"grad_norm": 47.010562896728516,
"learning_rate": 5.20618556701031e-07,
"logits/chosen": -0.3984927237033844,
"logits/rejected": -0.3791876435279846,
"logps/chosen": -166.37368774414062,
"logps/rejected": -173.02516174316406,
"loss": 1.583,
"nll_loss": 0.29688653349876404,
"rewards/accuracies": 0.5843750238418579,
"rewards/chosen": -16.637371063232422,
"rewards/margins": 0.6651442050933838,
"rewards/rejected": -17.302515029907227,
"step": 230
},
{
"epoch": 0.5547529615718001,
"grad_norm": 50.885963439941406,
"learning_rate": 4.948453608247422e-07,
"logits/chosen": -0.37319958209991455,
"logits/rejected": -0.35784170031547546,
"logps/chosen": -166.8050537109375,
"logps/rejected": -172.14004516601562,
"loss": 1.4338,
"nll_loss": 0.28680044412612915,
"rewards/accuracies": 0.559374988079071,
"rewards/chosen": -16.68050765991211,
"rewards/margins": 0.5334986448287964,
"rewards/rejected": -17.214006423950195,
"step": 240
},
{
"epoch": 0.5778676683039584,
"grad_norm": 60.3321418762207,
"learning_rate": 4.6907216494845357e-07,
"logits/chosen": -0.41152358055114746,
"logits/rejected": -0.3965645730495453,
"logps/chosen": -161.13088989257812,
"logps/rejected": -164.7410888671875,
"loss": 1.5649,
"nll_loss": 0.25410208106040955,
"rewards/accuracies": 0.559374988079071,
"rewards/chosen": -16.113088607788086,
"rewards/margins": 0.361021488904953,
"rewards/rejected": -16.474109649658203,
"step": 250
},
{
"epoch": 0.6009823750361167,
"grad_norm": 49.894596099853516,
"learning_rate": 4.432989690721649e-07,
"logits/chosen": -0.518215537071228,
"logits/rejected": -0.504486083984375,
"logps/chosen": -157.352294921875,
"logps/rejected": -166.1837921142578,
"loss": 1.4283,
"nll_loss": 0.2800624370574951,
"rewards/accuracies": 0.637499988079071,
"rewards/chosen": -15.7352294921875,
"rewards/margins": 0.8831487894058228,
"rewards/rejected": -16.618377685546875,
"step": 260
},
{
"epoch": 0.624097081768275,
"grad_norm": 44.237037658691406,
"learning_rate": 4.175257731958763e-07,
"logits/chosen": -0.4858783185482025,
"logits/rejected": -0.47672492265701294,
"logps/chosen": -158.05084228515625,
"logps/rejected": -165.44720458984375,
"loss": 1.4637,
"nll_loss": 0.2714413106441498,
"rewards/accuracies": 0.628125011920929,
"rewards/chosen": -15.805084228515625,
"rewards/margins": 0.739635169506073,
"rewards/rejected": -16.544719696044922,
"step": 270
},
{
"epoch": 0.6472117885004334,
"grad_norm": 52.68519973754883,
"learning_rate": 3.917525773195876e-07,
"logits/chosen": -0.4353243410587311,
"logits/rejected": -0.4047181010246277,
"logps/chosen": -162.55743408203125,
"logps/rejected": -171.99114990234375,
"loss": 1.4694,
"nll_loss": 0.2985231876373291,
"rewards/accuracies": 0.6468750238418579,
"rewards/chosen": -16.2557430267334,
"rewards/margins": 0.9433721303939819,
"rewards/rejected": -17.199115753173828,
"step": 280
},
{
"epoch": 0.6703264952325917,
"grad_norm": 54.75222396850586,
"learning_rate": 3.659793814432989e-07,
"logits/chosen": -0.4927333891391754,
"logits/rejected": -0.46107035875320435,
"logps/chosen": -165.6143035888672,
"logps/rejected": -171.44073486328125,
"loss": 1.4147,
"nll_loss": 0.2945927381515503,
"rewards/accuracies": 0.6031249761581421,
"rewards/chosen": -16.561431884765625,
"rewards/margins": 0.5826419591903687,
"rewards/rejected": -17.144071578979492,
"step": 290
},
{
"epoch": 0.6934412019647501,
"grad_norm": 44.112884521484375,
"learning_rate": 3.402061855670103e-07,
"logits/chosen": -0.5113806128501892,
"logits/rejected": -0.5009027719497681,
"logps/chosen": -168.73641967773438,
"logps/rejected": -178.53773498535156,
"loss": 1.4811,
"nll_loss": 0.3009123206138611,
"rewards/accuracies": 0.6312500238418579,
"rewards/chosen": -16.873641967773438,
"rewards/margins": 0.9801331758499146,
"rewards/rejected": -17.853775024414062,
"step": 300
},
{
"epoch": 0.6934412019647501,
"eval_logits/chosen": -0.4338991940021515,
"eval_logits/rejected": -0.40692025423049927,
"eval_logps/chosen": -160.7057342529297,
"eval_logps/rejected": -168.00125122070312,
"eval_loss": 1.463964819908142,
"eval_nll_loss": 0.28037506341934204,
"eval_rewards/accuracies": 0.613043487071991,
"eval_rewards/chosen": -16.070573806762695,
"eval_rewards/margins": 0.7295539975166321,
"eval_rewards/rejected": -16.800127029418945,
"eval_runtime": 77.4432,
"eval_samples_per_second": 23.579,
"eval_steps_per_second": 1.485,
"step": 300
},
{
"epoch": 0.7165559086969084,
"grad_norm": 47.5919303894043,
"learning_rate": 3.1443298969072163e-07,
"logits/chosen": -0.47527360916137695,
"logits/rejected": -0.4661695957183838,
"logps/chosen": -170.21279907226562,
"logps/rejected": -174.0437774658203,
"loss": 1.5403,
"nll_loss": 0.3031921982765198,
"rewards/accuracies": 0.5562499761581421,
"rewards/chosen": -17.021282196044922,
"rewards/margins": 0.38309773802757263,
"rewards/rejected": -17.404376983642578,
"step": 310
},
{
"epoch": 0.7396706154290668,
"grad_norm": 45.896297454833984,
"learning_rate": 2.8865979381443296e-07,
"logits/chosen": -0.4675866961479187,
"logits/rejected": -0.4545253813266754,
"logps/chosen": -159.13848876953125,
"logps/rejected": -168.03915405273438,
"loss": 1.5256,
"nll_loss": 0.28240206837654114,
"rewards/accuracies": 0.628125011920929,
"rewards/chosen": -15.913850784301758,
"rewards/margins": 0.8900658488273621,
"rewards/rejected": -16.80391502380371,
"step": 320
},
{
"epoch": 0.7627853221612251,
"grad_norm": 46.80994415283203,
"learning_rate": 2.6288659793814435e-07,
"logits/chosen": -0.4704606533050537,
"logits/rejected": -0.4714192748069763,
"logps/chosen": -158.95199584960938,
"logps/rejected": -165.59017944335938,
"loss": 1.5045,
"nll_loss": 0.2972811162471771,
"rewards/accuracies": 0.6031249761581421,
"rewards/chosen": -15.8951997756958,
"rewards/margins": 0.6638190150260925,
"rewards/rejected": -16.559019088745117,
"step": 330
},
{
"epoch": 0.7859000288933834,
"grad_norm": 44.699893951416016,
"learning_rate": 2.3711340206185566e-07,
"logits/chosen": -0.47059255838394165,
"logits/rejected": -0.45767131447792053,
"logps/chosen": -162.30491638183594,
"logps/rejected": -167.87808227539062,
"loss": 1.3925,
"nll_loss": 0.2751705050468445,
"rewards/accuracies": 0.578125,
"rewards/chosen": -16.230493545532227,
"rewards/margins": 0.5573164820671082,
"rewards/rejected": -16.787809371948242,
"step": 340
},
{
"epoch": 0.8090147356255417,
"grad_norm": 46.135292053222656,
"learning_rate": 2.11340206185567e-07,
"logits/chosen": -0.4483928680419922,
"logits/rejected": -0.4473996162414551,
"logps/chosen": -163.79013061523438,
"logps/rejected": -172.65423583984375,
"loss": 1.4903,
"nll_loss": 0.2803964912891388,
"rewards/accuracies": 0.609375,
"rewards/chosen": -16.379013061523438,
"rewards/margins": 0.8864116668701172,
"rewards/rejected": -17.265422821044922,
"step": 350
},
{
"epoch": 0.8321294423577,
"grad_norm": 56.81486511230469,
"learning_rate": 1.8556701030927835e-07,
"logits/chosen": -0.4406563639640808,
"logits/rejected": -0.4262049198150635,
"logps/chosen": -168.64210510253906,
"logps/rejected": -174.23231506347656,
"loss": 1.535,
"nll_loss": 0.27415817975997925,
"rewards/accuracies": 0.590624988079071,
"rewards/chosen": -16.86421012878418,
"rewards/margins": 0.559019923210144,
"rewards/rejected": -17.423233032226562,
"step": 360
},
{
"epoch": 0.8552441490898585,
"grad_norm": 50.85494613647461,
"learning_rate": 1.5979381443298966e-07,
"logits/chosen": -0.4312410354614258,
"logits/rejected": -0.4031241834163666,
"logps/chosen": -162.82699584960938,
"logps/rejected": -172.81155395507812,
"loss": 1.4015,
"nll_loss": 0.2656095623970032,
"rewards/accuracies": 0.671875,
"rewards/chosen": -16.282699584960938,
"rewards/margins": 0.9984554052352905,
"rewards/rejected": -17.28115463256836,
"step": 370
},
{
"epoch": 0.8783588558220168,
"grad_norm": 54.44953918457031,
"learning_rate": 1.3402061855670102e-07,
"logits/chosen": -0.38503849506378174,
"logits/rejected": -0.36510857939720154,
"logps/chosen": -159.20938110351562,
"logps/rejected": -166.0490264892578,
"loss": 1.4196,
"nll_loss": 0.2949269711971283,
"rewards/accuracies": 0.6031249761581421,
"rewards/chosen": -15.920938491821289,
"rewards/margins": 0.6839638352394104,
"rewards/rejected": -16.604902267456055,
"step": 380
},
{
"epoch": 0.9014735625541751,
"grad_norm": 53.84355926513672,
"learning_rate": 1.0824742268041237e-07,
"logits/chosen": -0.4367571473121643,
"logits/rejected": -0.41515684127807617,
"logps/chosen": -162.70645141601562,
"logps/rejected": -173.05177307128906,
"loss": 1.4525,
"nll_loss": 0.27491894364356995,
"rewards/accuracies": 0.6156250238418579,
"rewards/chosen": -16.27064323425293,
"rewards/margins": 1.0345335006713867,
"rewards/rejected": -17.30517578125,
"step": 390
},
{
"epoch": 0.9245882692863334,
"grad_norm": 51.0332145690918,
"learning_rate": 8.24742268041237e-08,
"logits/chosen": -0.4105447232723236,
"logits/rejected": -0.38223332166671753,
"logps/chosen": -167.57431030273438,
"logps/rejected": -172.93923950195312,
"loss": 1.4642,
"nll_loss": 0.3052617907524109,
"rewards/accuracies": 0.578125,
"rewards/chosen": -16.757429122924805,
"rewards/margins": 0.5364928841590881,
"rewards/rejected": -17.293922424316406,
"step": 400
},
{
"epoch": 0.9245882692863334,
"eval_logits/chosen": -0.38119494915008545,
"eval_logits/rejected": -0.35089170932769775,
"eval_logps/chosen": -161.57652282714844,
"eval_logps/rejected": -169.120361328125,
"eval_loss": 1.4428884983062744,
"eval_nll_loss": 0.2844657897949219,
"eval_rewards/accuracies": 0.6304348111152649,
"eval_rewards/chosen": -16.157651901245117,
"eval_rewards/margins": 0.7543851137161255,
"eval_rewards/rejected": -16.912038803100586,
"eval_runtime": 77.5027,
"eval_samples_per_second": 23.56,
"eval_steps_per_second": 1.484,
"step": 400
},
{
"epoch": 0.9477029760184917,
"grad_norm": 46.96026611328125,
"learning_rate": 5.670103092783505e-08,
"logits/chosen": -0.3439410626888275,
"logits/rejected": -0.3307141661643982,
"logps/chosen": -154.14857482910156,
"logps/rejected": -163.5908966064453,
"loss": 1.3967,
"nll_loss": 0.2838439345359802,
"rewards/accuracies": 0.612500011920929,
"rewards/chosen": -15.4148588180542,
"rewards/margins": 0.9442328214645386,
"rewards/rejected": -16.35909080505371,
"step": 410
},
{
"epoch": 0.9708176827506501,
"grad_norm": 44.309818267822266,
"learning_rate": 3.092783505154639e-08,
"logits/chosen": -0.3803669214248657,
"logits/rejected": -0.3566874861717224,
"logps/chosen": -156.36138916015625,
"logps/rejected": -162.5567626953125,
"loss": 1.4043,
"nll_loss": 0.28173336386680603,
"rewards/accuracies": 0.5874999761581421,
"rewards/chosen": -15.636138916015625,
"rewards/margins": 0.6195372939109802,
"rewards/rejected": -16.25567626953125,
"step": 420
},
{
"epoch": 0.9939323894828085,
"grad_norm": 48.28641128540039,
"learning_rate": 5.154639175257731e-09,
"logits/chosen": -0.3961271345615387,
"logits/rejected": -0.40863722562789917,
"logps/chosen": -164.62960815429688,
"logps/rejected": -171.9886474609375,
"loss": 1.4854,
"nll_loss": 0.2709375023841858,
"rewards/accuracies": 0.6000000238418579,
"rewards/chosen": -16.46295928955078,
"rewards/margins": 0.7359048128128052,
"rewards/rejected": -17.198863983154297,
"step": 430
},
{
"epoch": 0.9985553308292401,
"step": 432,
"total_flos": 0.0,
"train_loss": 1.6653077276768509,
"train_runtime": 9934.0195,
"train_samples_per_second": 5.574,
"train_steps_per_second": 0.043
}
],
"logging_steps": 10,
"max_steps": 432,
"num_input_tokens_seen": 0,
"num_train_epochs": 1,
"save_steps": 500,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": true
},
"attributes": {}
}
},
"total_flos": 0.0,
"train_batch_size": 4,
"trial_name": null,
"trial_params": null
}