llama3.1-cpo-full-0912 / trainer_state.json
jbjeong91's picture
Model save
050f60c verified
{
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 0.9985553308292401,
"eval_steps": 100,
"global_step": 432,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.023114706732158336,
"grad_norm": 65.15836334228516,
"learning_rate": 2.2727272727272726e-07,
"logits/chosen": -0.335565984249115,
"logits/rejected": -0.31526079773902893,
"logps/chosen": -269.28985595703125,
"logps/rejected": -267.5926818847656,
"loss": 2.6152,
"nll_loss": 0.7412666082382202,
"rewards/accuracies": 0.48750001192092896,
"rewards/chosen": -26.92898941040039,
"rewards/margins": -0.1697184145450592,
"rewards/rejected": -26.7592716217041,
"step": 10
},
{
"epoch": 0.04622941346431667,
"grad_norm": 55.07333755493164,
"learning_rate": 4.545454545454545e-07,
"logits/chosen": -0.3471914827823639,
"logits/rejected": -0.32920125126838684,
"logps/chosen": -260.79205322265625,
"logps/rejected": -267.349853515625,
"loss": 2.5239,
"nll_loss": 0.7186842560768127,
"rewards/accuracies": 0.565625011920929,
"rewards/chosen": -26.079208374023438,
"rewards/margins": 0.6557787656784058,
"rewards/rejected": -26.734989166259766,
"step": 20
},
{
"epoch": 0.06934412019647501,
"grad_norm": 57.19869613647461,
"learning_rate": 6.818181818181817e-07,
"logits/chosen": -0.34477299451828003,
"logits/rejected": -0.33347639441490173,
"logps/chosen": -247.47900390625,
"logps/rejected": -250.7107391357422,
"loss": 2.3552,
"nll_loss": 0.703576922416687,
"rewards/accuracies": 0.515625,
"rewards/chosen": -24.74790382385254,
"rewards/margins": 0.3231719732284546,
"rewards/rejected": -25.071073532104492,
"step": 30
},
{
"epoch": 0.09245882692863334,
"grad_norm": 47.48102569580078,
"learning_rate": 9.09090909090909e-07,
"logits/chosen": -0.5700438618659973,
"logits/rejected": -0.556909441947937,
"logps/chosen": -215.1627197265625,
"logps/rejected": -217.0400848388672,
"loss": 2.1715,
"nll_loss": 0.6503027081489563,
"rewards/accuracies": 0.518750011920929,
"rewards/chosen": -21.51627540588379,
"rewards/margins": 0.18773558735847473,
"rewards/rejected": -21.704008102416992,
"step": 40
},
{
"epoch": 0.11557353366079168,
"grad_norm": 48.25373458862305,
"learning_rate": 9.845360824742267e-07,
"logits/chosen": -0.8266013264656067,
"logits/rejected": -0.8015046119689941,
"logps/chosen": -196.6488800048828,
"logps/rejected": -195.6967010498047,
"loss": 2.1841,
"nll_loss": 0.5290184020996094,
"rewards/accuracies": 0.503125011920929,
"rewards/chosen": -19.664888381958008,
"rewards/margins": -0.09521917253732681,
"rewards/rejected": -19.56966781616211,
"step": 50
},
{
"epoch": 0.13868824039295002,
"grad_norm": 55.580039978027344,
"learning_rate": 9.587628865979382e-07,
"logits/chosen": -0.6845192313194275,
"logits/rejected": -0.689314067363739,
"logps/chosen": -164.92901611328125,
"logps/rejected": -165.1588592529297,
"loss": 2.0022,
"nll_loss": 0.4657168388366699,
"rewards/accuracies": 0.53125,
"rewards/chosen": -16.492902755737305,
"rewards/margins": 0.022982392460107803,
"rewards/rejected": -16.515884399414062,
"step": 60
},
{
"epoch": 0.16180294712510834,
"grad_norm": 50.51268768310547,
"learning_rate": 9.329896907216495e-07,
"logits/chosen": -0.5050565004348755,
"logits/rejected": -0.4807310998439789,
"logps/chosen": -155.29498291015625,
"logps/rejected": -157.361328125,
"loss": 1.9881,
"nll_loss": 0.44492220878601074,
"rewards/accuracies": 0.5218750238418579,
"rewards/chosen": -15.529500007629395,
"rewards/margins": 0.20663371682167053,
"rewards/rejected": -15.736132621765137,
"step": 70
},
{
"epoch": 0.1849176538572667,
"grad_norm": 46.45564651489258,
"learning_rate": 9.072164948453608e-07,
"logits/chosen": -0.48326191306114197,
"logits/rejected": -0.457420289516449,
"logps/chosen": -158.08729553222656,
"logps/rejected": -161.24571228027344,
"loss": 1.8567,
"nll_loss": 0.42924928665161133,
"rewards/accuracies": 0.5093749761581421,
"rewards/chosen": -15.80872917175293,
"rewards/margins": 0.31584271788597107,
"rewards/rejected": -16.124568939208984,
"step": 80
},
{
"epoch": 0.208032360589425,
"grad_norm": 50.26318359375,
"learning_rate": 8.814432989690721e-07,
"logits/chosen": -0.4506359100341797,
"logits/rejected": -0.43782296776771545,
"logps/chosen": -152.2831573486328,
"logps/rejected": -160.30429077148438,
"loss": 1.7674,
"nll_loss": 0.4159914553165436,
"rewards/accuracies": 0.59375,
"rewards/chosen": -15.228317260742188,
"rewards/margins": 0.8021124005317688,
"rewards/rejected": -16.03042984008789,
"step": 90
},
{
"epoch": 0.23114706732158335,
"grad_norm": 45.81875991821289,
"learning_rate": 8.556701030927834e-07,
"logits/chosen": -0.40928536653518677,
"logits/rejected": -0.39079341292381287,
"logps/chosen": -153.24673461914062,
"logps/rejected": -156.20919799804688,
"loss": 1.9362,
"nll_loss": 0.4179740846157074,
"rewards/accuracies": 0.53125,
"rewards/chosen": -15.324671745300293,
"rewards/margins": 0.29624658823013306,
"rewards/rejected": -15.620920181274414,
"step": 100
},
{
"epoch": 0.23114706732158335,
"eval_logits/chosen": -0.4377523362636566,
"eval_logits/rejected": -0.4122772812843323,
"eval_logps/chosen": -149.33935546875,
"eval_logps/rejected": -152.84754943847656,
"eval_loss": 1.7930248975753784,
"eval_nll_loss": 0.40668219327926636,
"eval_rewards/accuracies": 0.5760869383811951,
"eval_rewards/chosen": -14.933935165405273,
"eval_rewards/margins": 0.35081860423088074,
"eval_rewards/rejected": -15.28475284576416,
"eval_runtime": 74.3015,
"eval_samples_per_second": 24.576,
"eval_steps_per_second": 1.548,
"step": 100
},
{
"epoch": 0.2542617740537417,
"grad_norm": 45.55659103393555,
"learning_rate": 8.298969072164948e-07,
"logits/chosen": -0.38547706604003906,
"logits/rejected": -0.3579915165901184,
"logps/chosen": -146.1110382080078,
"logps/rejected": -150.4032745361328,
"loss": 1.7214,
"nll_loss": 0.39803242683410645,
"rewards/accuracies": 0.5625,
"rewards/chosen": -14.611104011535645,
"rewards/margins": 0.42922306060791016,
"rewards/rejected": -15.040326118469238,
"step": 110
},
{
"epoch": 0.27737648078590005,
"grad_norm": 44.77095031738281,
"learning_rate": 8.041237113402062e-07,
"logits/chosen": -0.376223623752594,
"logits/rejected": -0.3552733063697815,
"logps/chosen": -155.74005126953125,
"logps/rejected": -157.14755249023438,
"loss": 1.753,
"nll_loss": 0.4237498342990875,
"rewards/accuracies": 0.5406249761581421,
"rewards/chosen": -15.574007034301758,
"rewards/margins": 0.14074988663196564,
"rewards/rejected": -15.714755058288574,
"step": 120
},
{
"epoch": 0.30049118751805837,
"grad_norm": 54.516483306884766,
"learning_rate": 7.783505154639175e-07,
"logits/chosen": -0.39556393027305603,
"logits/rejected": -0.3727474808692932,
"logps/chosen": -152.9895477294922,
"logps/rejected": -161.13479614257812,
"loss": 1.8165,
"nll_loss": 0.42241328954696655,
"rewards/accuracies": 0.581250011920929,
"rewards/chosen": -15.298955917358398,
"rewards/margins": 0.8145230412483215,
"rewards/rejected": -16.11347770690918,
"step": 130
},
{
"epoch": 0.3236058942502167,
"grad_norm": 58.50905227661133,
"learning_rate": 7.525773195876288e-07,
"logits/chosen": -0.41800642013549805,
"logits/rejected": -0.41197213530540466,
"logps/chosen": -143.42355346679688,
"logps/rejected": -148.9073486328125,
"loss": 1.8037,
"nll_loss": 0.41033467650413513,
"rewards/accuracies": 0.590624988079071,
"rewards/chosen": -14.342355728149414,
"rewards/margins": 0.5483782291412354,
"rewards/rejected": -14.890734672546387,
"step": 140
},
{
"epoch": 0.34672060098237506,
"grad_norm": 59.64632034301758,
"learning_rate": 7.268041237113402e-07,
"logits/chosen": -0.40256112813949585,
"logits/rejected": -0.3912666440010071,
"logps/chosen": -143.48622131347656,
"logps/rejected": -148.83050537109375,
"loss": 1.8835,
"nll_loss": 0.41666117310523987,
"rewards/accuracies": 0.596875011920929,
"rewards/chosen": -14.34862232208252,
"rewards/margins": 0.5344293117523193,
"rewards/rejected": -14.883050918579102,
"step": 150
},
{
"epoch": 0.3698353077145334,
"grad_norm": 41.37995529174805,
"learning_rate": 7.010309278350515e-07,
"logits/chosen": -0.3729507327079773,
"logits/rejected": -0.34710609912872314,
"logps/chosen": -155.8257598876953,
"logps/rejected": -159.4755096435547,
"loss": 1.7067,
"nll_loss": 0.41083773970603943,
"rewards/accuracies": 0.5718749761581421,
"rewards/chosen": -15.582575798034668,
"rewards/margins": 0.36497658491134644,
"rewards/rejected": -15.947550773620605,
"step": 160
},
{
"epoch": 0.3929500144466917,
"grad_norm": 50.4566535949707,
"learning_rate": 6.752577319587629e-07,
"logits/chosen": -0.3252796530723572,
"logits/rejected": -0.31979063153266907,
"logps/chosen": -154.66848754882812,
"logps/rejected": -161.5574951171875,
"loss": 1.6017,
"nll_loss": 0.42361512780189514,
"rewards/accuracies": 0.578125,
"rewards/chosen": -15.46684741973877,
"rewards/margins": 0.6889010071754456,
"rewards/rejected": -16.15574836730957,
"step": 170
},
{
"epoch": 0.41606472117885,
"grad_norm": 48.24229431152344,
"learning_rate": 6.494845360824742e-07,
"logits/chosen": -0.3405265212059021,
"logits/rejected": -0.33944639563560486,
"logps/chosen": -147.56602478027344,
"logps/rejected": -154.09613037109375,
"loss": 1.6478,
"nll_loss": 0.424372136592865,
"rewards/accuracies": 0.581250011920929,
"rewards/chosen": -14.756604194641113,
"rewards/margins": 0.653009295463562,
"rewards/rejected": -15.409612655639648,
"step": 180
},
{
"epoch": 0.4391794279110084,
"grad_norm": 50.57717514038086,
"learning_rate": 6.237113402061855e-07,
"logits/chosen": -0.3636409640312195,
"logits/rejected": -0.3508070111274719,
"logps/chosen": -156.1150360107422,
"logps/rejected": -162.10330200195312,
"loss": 1.7155,
"nll_loss": 0.4282284379005432,
"rewards/accuracies": 0.5843750238418579,
"rewards/chosen": -15.611505508422852,
"rewards/margins": 0.5988240838050842,
"rewards/rejected": -16.210330963134766,
"step": 190
},
{
"epoch": 0.4622941346431667,
"grad_norm": 44.41514205932617,
"learning_rate": 5.979381443298969e-07,
"logits/chosen": -0.32660025358200073,
"logits/rejected": -0.3209044337272644,
"logps/chosen": -156.2790985107422,
"logps/rejected": -162.4671173095703,
"loss": 1.7019,
"nll_loss": 0.4315672516822815,
"rewards/accuracies": 0.596875011920929,
"rewards/chosen": -15.627909660339355,
"rewards/margins": 0.6188000440597534,
"rewards/rejected": -16.2467098236084,
"step": 200
},
{
"epoch": 0.4622941346431667,
"eval_logits/chosen": -0.3579607307910919,
"eval_logits/rejected": -0.3357972204685211,
"eval_logps/chosen": -154.3026885986328,
"eval_logps/rejected": -160.1311492919922,
"eval_loss": 1.678566575050354,
"eval_nll_loss": 0.4193345308303833,
"eval_rewards/accuracies": 0.6086956262588501,
"eval_rewards/chosen": -15.430268287658691,
"eval_rewards/margins": 0.5828461647033691,
"eval_rewards/rejected": -16.01311492919922,
"eval_runtime": 74.1864,
"eval_samples_per_second": 24.614,
"eval_steps_per_second": 1.55,
"step": 200
},
{
"epoch": 0.48540884137532503,
"grad_norm": 51.62085723876953,
"learning_rate": 5.721649484536082e-07,
"logits/chosen": -0.3630141615867615,
"logits/rejected": -0.3378238081932068,
"logps/chosen": -150.49215698242188,
"logps/rejected": -152.28367614746094,
"loss": 1.6739,
"nll_loss": 0.41899624466896057,
"rewards/accuracies": 0.5249999761581421,
"rewards/chosen": -15.049214363098145,
"rewards/margins": 0.17915421724319458,
"rewards/rejected": -15.228368759155273,
"step": 210
},
{
"epoch": 0.5085235481074833,
"grad_norm": 49.88188552856445,
"learning_rate": 5.463917525773195e-07,
"logits/chosen": -0.37590575218200684,
"logits/rejected": -0.3511108160018921,
"logps/chosen": -159.89659118652344,
"logps/rejected": -165.49131774902344,
"loss": 1.7447,
"nll_loss": 0.42955484986305237,
"rewards/accuracies": 0.606249988079071,
"rewards/chosen": -15.989659309387207,
"rewards/margins": 0.5594727993011475,
"rewards/rejected": -16.549131393432617,
"step": 220
},
{
"epoch": 0.5316382548396418,
"grad_norm": 46.68313217163086,
"learning_rate": 5.20618556701031e-07,
"logits/chosen": -0.37392115592956543,
"logits/rejected": -0.3575811982154846,
"logps/chosen": -162.5522918701172,
"logps/rejected": -168.78067016601562,
"loss": 1.7586,
"nll_loss": 0.4414497911930084,
"rewards/accuracies": 0.5874999761581421,
"rewards/chosen": -16.255229949951172,
"rewards/margins": 0.6228369474411011,
"rewards/rejected": -16.878068923950195,
"step": 230
},
{
"epoch": 0.5547529615718001,
"grad_norm": 54.655609130859375,
"learning_rate": 4.948453608247422e-07,
"logits/chosen": -0.3484077453613281,
"logits/rejected": -0.3337170481681824,
"logps/chosen": -159.63836669921875,
"logps/rejected": -164.4112091064453,
"loss": 1.6017,
"nll_loss": 0.4336668848991394,
"rewards/accuracies": 0.574999988079071,
"rewards/chosen": -15.963836669921875,
"rewards/margins": 0.47728481888771057,
"rewards/rejected": -16.44112205505371,
"step": 240
},
{
"epoch": 0.5778676683039584,
"grad_norm": 50.76809310913086,
"learning_rate": 4.6907216494845357e-07,
"logits/chosen": -0.30525675415992737,
"logits/rejected": -0.2880803048610687,
"logps/chosen": -156.4806365966797,
"logps/rejected": -159.4465789794922,
"loss": 1.7451,
"nll_loss": 0.4165531098842621,
"rewards/accuracies": 0.5375000238418579,
"rewards/chosen": -15.648063659667969,
"rewards/margins": 0.29659539461135864,
"rewards/rejected": -15.944659233093262,
"step": 250
},
{
"epoch": 0.6009823750361167,
"grad_norm": 51.902610778808594,
"learning_rate": 4.432989690721649e-07,
"logits/chosen": -0.3701649308204651,
"logits/rejected": -0.3554461896419525,
"logps/chosen": -152.5877685546875,
"logps/rejected": -160.40426635742188,
"loss": 1.6025,
"nll_loss": 0.4253969192504883,
"rewards/accuracies": 0.59375,
"rewards/chosen": -15.258776664733887,
"rewards/margins": 0.7816492319107056,
"rewards/rejected": -16.04042625427246,
"step": 260
},
{
"epoch": 0.624097081768275,
"grad_norm": 44.464599609375,
"learning_rate": 4.175257731958763e-07,
"logits/chosen": -0.3865426182746887,
"logits/rejected": -0.3753945231437683,
"logps/chosen": -153.08734130859375,
"logps/rejected": -159.94705200195312,
"loss": 1.628,
"nll_loss": 0.4174048900604248,
"rewards/accuracies": 0.6156250238418579,
"rewards/chosen": -15.308735847473145,
"rewards/margins": 0.6859728097915649,
"rewards/rejected": -15.994707107543945,
"step": 270
},
{
"epoch": 0.6472117885004334,
"grad_norm": 50.29905700683594,
"learning_rate": 3.917525773195876e-07,
"logits/chosen": -0.35409292578697205,
"logits/rejected": -0.3260190784931183,
"logps/chosen": -154.6301727294922,
"logps/rejected": -163.79635620117188,
"loss": 1.6203,
"nll_loss": 0.4250774383544922,
"rewards/accuracies": 0.628125011920929,
"rewards/chosen": -15.463017463684082,
"rewards/margins": 0.916618824005127,
"rewards/rejected": -16.379634857177734,
"step": 280
},
{
"epoch": 0.6703264952325917,
"grad_norm": 54.7519416809082,
"learning_rate": 3.659793814432989e-07,
"logits/chosen": -0.42501506209373474,
"logits/rejected": -0.39394429326057434,
"logps/chosen": -159.5155487060547,
"logps/rejected": -164.74307250976562,
"loss": 1.5987,
"nll_loss": 0.4190928339958191,
"rewards/accuracies": 0.612500011920929,
"rewards/chosen": -15.951556205749512,
"rewards/margins": 0.5227512717247009,
"rewards/rejected": -16.474306106567383,
"step": 290
},
{
"epoch": 0.6934412019647501,
"grad_norm": 44.03036880493164,
"learning_rate": 3.402061855670103e-07,
"logits/chosen": -0.4323659837245941,
"logits/rejected": -0.4210866391658783,
"logps/chosen": -163.0435333251953,
"logps/rejected": -172.29119873046875,
"loss": 1.6388,
"nll_loss": 0.4356729984283447,
"rewards/accuracies": 0.6156250238418579,
"rewards/chosen": -16.304353713989258,
"rewards/margins": 0.9247667193412781,
"rewards/rejected": -17.229122161865234,
"step": 300
},
{
"epoch": 0.6934412019647501,
"eval_logits/chosen": -0.38277825713157654,
"eval_logits/rejected": -0.35816264152526855,
"eval_logps/chosen": -155.46498107910156,
"eval_logps/rejected": -162.12692260742188,
"eval_loss": 1.6232643127441406,
"eval_nll_loss": 0.4229773283004761,
"eval_rewards/accuracies": 0.613043487071991,
"eval_rewards/chosen": -15.546499252319336,
"eval_rewards/margins": 0.6661920547485352,
"eval_rewards/rejected": -16.212690353393555,
"eval_runtime": 74.1312,
"eval_samples_per_second": 24.632,
"eval_steps_per_second": 1.551,
"step": 300
},
{
"epoch": 0.7165559086969084,
"grad_norm": 47.341087341308594,
"learning_rate": 3.1443298969072163e-07,
"logits/chosen": -0.4356638789176941,
"logits/rejected": -0.4280335307121277,
"logps/chosen": -164.1811065673828,
"logps/rejected": -167.7774200439453,
"loss": 1.6949,
"nll_loss": 0.4244704246520996,
"rewards/accuracies": 0.546875,
"rewards/chosen": -16.41811180114746,
"rewards/margins": 0.3596319258213043,
"rewards/rejected": -16.77774429321289,
"step": 310
},
{
"epoch": 0.7396706154290668,
"grad_norm": 43.78164291381836,
"learning_rate": 2.8865979381443296e-07,
"logits/chosen": -0.4178016781806946,
"logits/rejected": -0.40296635031700134,
"logps/chosen": -152.5771484375,
"logps/rejected": -160.88571166992188,
"loss": 1.6922,
"nll_loss": 0.4172099232673645,
"rewards/accuracies": 0.6156250238418579,
"rewards/chosen": -15.257715225219727,
"rewards/margins": 0.8308565020561218,
"rewards/rejected": -16.088571548461914,
"step": 320
},
{
"epoch": 0.7627853221612251,
"grad_norm": 48.753013610839844,
"learning_rate": 2.6288659793814435e-07,
"logits/chosen": -0.4328450560569763,
"logits/rejected": -0.43247896432876587,
"logps/chosen": -153.868896484375,
"logps/rejected": -160.49305725097656,
"loss": 1.6731,
"nll_loss": 0.4279722571372986,
"rewards/accuracies": 0.621874988079071,
"rewards/chosen": -15.38688850402832,
"rewards/margins": 0.6624161601066589,
"rewards/rejected": -16.049304962158203,
"step": 330
},
{
"epoch": 0.7859000288933834,
"grad_norm": 48.8376350402832,
"learning_rate": 2.3711340206185566e-07,
"logits/chosen": -0.4575740694999695,
"logits/rejected": -0.44574373960494995,
"logps/chosen": -157.2711944580078,
"logps/rejected": -161.98927307128906,
"loss": 1.5679,
"nll_loss": 0.4292600154876709,
"rewards/accuracies": 0.5687500238418579,
"rewards/chosen": -15.727119445800781,
"rewards/margins": 0.4718071520328522,
"rewards/rejected": -16.198925018310547,
"step": 340
},
{
"epoch": 0.8090147356255417,
"grad_norm": 46.211063385009766,
"learning_rate": 2.11340206185567e-07,
"logits/chosen": -0.44085240364074707,
"logits/rejected": -0.44065386056900024,
"logps/chosen": -157.3097686767578,
"logps/rejected": -166.4695281982422,
"loss": 1.6698,
"nll_loss": 0.4102792739868164,
"rewards/accuracies": 0.612500011920929,
"rewards/chosen": -15.730977058410645,
"rewards/margins": 0.9159765243530273,
"rewards/rejected": -16.646953582763672,
"step": 350
},
{
"epoch": 0.8321294423577,
"grad_norm": 52.41377639770508,
"learning_rate": 1.8556701030927835e-07,
"logits/chosen": -0.4241538941860199,
"logits/rejected": -0.4094991087913513,
"logps/chosen": -160.124267578125,
"logps/rejected": -165.10821533203125,
"loss": 1.7134,
"nll_loss": 0.42789340019226074,
"rewards/accuracies": 0.6000000238418579,
"rewards/chosen": -16.012426376342773,
"rewards/margins": 0.4983920156955719,
"rewards/rejected": -16.510820388793945,
"step": 360
},
{
"epoch": 0.8552441490898585,
"grad_norm": 48.11139678955078,
"learning_rate": 1.5979381443298966e-07,
"logits/chosen": -0.43041014671325684,
"logits/rejected": -0.4028114676475525,
"logps/chosen": -154.57138061523438,
"logps/rejected": -164.22232055664062,
"loss": 1.5667,
"nll_loss": 0.4179977774620056,
"rewards/accuracies": 0.65625,
"rewards/chosen": -15.45713996887207,
"rewards/margins": 0.9650918841362,
"rewards/rejected": -16.422229766845703,
"step": 370
},
{
"epoch": 0.8783588558220168,
"grad_norm": 47.23114776611328,
"learning_rate": 1.3402061855670102e-07,
"logits/chosen": -0.427821546792984,
"logits/rejected": -0.4097885191440582,
"logps/chosen": -154.52496337890625,
"logps/rejected": -161.27987670898438,
"loss": 1.5921,
"nll_loss": 0.4322156012058258,
"rewards/accuracies": 0.5874999761581421,
"rewards/chosen": -15.452497482299805,
"rewards/margins": 0.6754907369613647,
"rewards/rejected": -16.127986907958984,
"step": 380
},
{
"epoch": 0.9014735625541751,
"grad_norm": 55.62732696533203,
"learning_rate": 1.0824742268041237e-07,
"logits/chosen": -0.461261123418808,
"logits/rejected": -0.44340047240257263,
"logps/chosen": -157.7149658203125,
"logps/rejected": -168.34735107421875,
"loss": 1.6161,
"nll_loss": 0.42217200994491577,
"rewards/accuracies": 0.637499988079071,
"rewards/chosen": -15.771496772766113,
"rewards/margins": 1.0632401704788208,
"rewards/rejected": -16.834735870361328,
"step": 390
},
{
"epoch": 0.9245882692863334,
"grad_norm": 52.596492767333984,
"learning_rate": 8.24742268041237e-08,
"logits/chosen": -0.43360406160354614,
"logits/rejected": -0.41087478399276733,
"logps/chosen": -162.21621704101562,
"logps/rejected": -167.1909637451172,
"loss": 1.632,
"nll_loss": 0.4444475769996643,
"rewards/accuracies": 0.5874999761581421,
"rewards/chosen": -16.221622467041016,
"rewards/margins": 0.4974748194217682,
"rewards/rejected": -16.719097137451172,
"step": 400
},
{
"epoch": 0.9245882692863334,
"eval_logits/chosen": -0.40716680884361267,
"eval_logits/rejected": -0.3811309337615967,
"eval_logps/chosen": -156.50477600097656,
"eval_logps/rejected": -163.44790649414062,
"eval_loss": 1.6007416248321533,
"eval_nll_loss": 0.42774829268455505,
"eval_rewards/accuracies": 0.636956512928009,
"eval_rewards/chosen": -15.65047550201416,
"eval_rewards/margins": 0.6943140625953674,
"eval_rewards/rejected": -16.344789505004883,
"eval_runtime": 74.2865,
"eval_samples_per_second": 24.581,
"eval_steps_per_second": 1.548,
"step": 400
},
{
"epoch": 0.9477029760184917,
"grad_norm": 50.8940315246582,
"learning_rate": 5.670103092783505e-08,
"logits/chosen": -0.36925220489501953,
"logits/rejected": -0.35820272564888,
"logps/chosen": -148.66673278808594,
"logps/rejected": -157.42532348632812,
"loss": 1.566,
"nll_loss": 0.42418622970581055,
"rewards/accuracies": 0.6156250238418579,
"rewards/chosen": -14.866673469543457,
"rewards/margins": 0.8758570551872253,
"rewards/rejected": -15.742530822753906,
"step": 410
},
{
"epoch": 0.9708176827506501,
"grad_norm": 44.86955642700195,
"learning_rate": 3.092783505154639e-08,
"logits/chosen": -0.40748652815818787,
"logits/rejected": -0.383215069770813,
"logps/chosen": -150.21824645996094,
"logps/rejected": -155.44349670410156,
"loss": 1.5783,
"nll_loss": 0.4278343617916107,
"rewards/accuracies": 0.5843750238418579,
"rewards/chosen": -15.021825790405273,
"rewards/margins": 0.5225244760513306,
"rewards/rejected": -15.544349670410156,
"step": 420
},
{
"epoch": 0.9939323894828085,
"grad_norm": 48.80271911621094,
"learning_rate": 5.154639175257731e-09,
"logits/chosen": -0.41907650232315063,
"logits/rejected": -0.4291330873966217,
"logps/chosen": -157.33888244628906,
"logps/rejected": -164.2548370361328,
"loss": 1.655,
"nll_loss": 0.4265294075012207,
"rewards/accuracies": 0.59375,
"rewards/chosen": -15.733888626098633,
"rewards/margins": 0.6915954351425171,
"rewards/rejected": -16.425485610961914,
"step": 430
},
{
"epoch": 0.9985553308292401,
"step": 432,
"total_flos": 0.0,
"train_loss": 1.77929983039697,
"train_runtime": 9807.604,
"train_samples_per_second": 5.646,
"train_steps_per_second": 0.044
}
],
"logging_steps": 10,
"max_steps": 432,
"num_input_tokens_seen": 0,
"num_train_epochs": 1,
"save_steps": 500,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": true
},
"attributes": {}
}
},
"total_flos": 0.0,
"train_batch_size": 4,
"trial_name": null,
"trial_params": null
}