phi-2-kto-i0 / trainer_state.json
BraylonDash's picture
Model save
db0f377 verified
raw
history blame
92.5 kB
{
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 0.9997382884061764,
"eval_steps": 500,
"global_step": 1910,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.0,
"learning_rate": 2.617801047120419e-08,
"logits/chosen": 0.8436492085456848,
"logits/rejected": 1.1560968160629272,
"logps/chosen": -330.2955322265625,
"logps/rejected": -239.8994140625,
"loss": 0.5,
"rewards/accuracies": 0.0,
"rewards/chosen": 0.0,
"rewards/margins": 0.0,
"rewards/rejected": 0.0,
"step": 1
},
{
"epoch": 0.01,
"learning_rate": 2.617801047120419e-07,
"logits/chosen": 1.0090492963790894,
"logits/rejected": 1.0627849102020264,
"logps/chosen": -279.4153137207031,
"logps/rejected": -249.27322387695312,
"loss": 0.5,
"rewards/accuracies": 0.375,
"rewards/chosen": -8.76396952662617e-05,
"rewards/margins": -9.456619591219351e-05,
"rewards/rejected": 6.926496553205652e-06,
"step": 10
},
{
"epoch": 0.01,
"learning_rate": 5.235602094240838e-07,
"logits/chosen": 1.0303412675857544,
"logits/rejected": 1.0532195568084717,
"logps/chosen": -321.72723388671875,
"logps/rejected": -270.56353759765625,
"loss": 0.5,
"rewards/accuracies": 0.4000000059604645,
"rewards/chosen": -6.834287341916934e-05,
"rewards/margins": -4.8897858505370095e-05,
"rewards/rejected": -1.9445011275820434e-05,
"step": 20
},
{
"epoch": 0.02,
"learning_rate": 7.853403141361258e-07,
"logits/chosen": 1.002454400062561,
"logits/rejected": 1.06557297706604,
"logps/chosen": -252.0704345703125,
"logps/rejected": -246.32705688476562,
"loss": 0.5,
"rewards/accuracies": 0.48750001192092896,
"rewards/chosen": 1.5753510524518788e-05,
"rewards/margins": 5.4146301408763975e-05,
"rewards/rejected": -3.83927981602028e-05,
"step": 30
},
{
"epoch": 0.02,
"learning_rate": 1.0471204188481676e-06,
"logits/chosen": 1.0041682720184326,
"logits/rejected": 1.1504443883895874,
"logps/chosen": -235.38217163085938,
"logps/rejected": -230.2617645263672,
"loss": 0.5,
"rewards/accuracies": 0.45625001192092896,
"rewards/chosen": 7.3400560722802766e-06,
"rewards/margins": 2.9947289021947654e-06,
"rewards/rejected": 4.3453355829115026e-06,
"step": 40
},
{
"epoch": 0.03,
"learning_rate": 1.3089005235602096e-06,
"logits/chosen": 0.9595837593078613,
"logits/rejected": 1.0130202770233154,
"logps/chosen": -294.26007080078125,
"logps/rejected": -249.2256317138672,
"loss": 0.5,
"rewards/accuracies": 0.48750001192092896,
"rewards/chosen": 0.00017269175441469997,
"rewards/margins": 9.17307916097343e-05,
"rewards/rejected": 8.096096280496567e-05,
"step": 50
},
{
"epoch": 0.03,
"learning_rate": 1.5706806282722515e-06,
"logits/chosen": 0.9245076179504395,
"logits/rejected": 1.023485779762268,
"logps/chosen": -242.47689819335938,
"logps/rejected": -230.57373046875,
"loss": 0.5,
"rewards/accuracies": 0.4749999940395355,
"rewards/chosen": 0.0002746728132478893,
"rewards/margins": 0.00012865502503700554,
"rewards/rejected": 0.00014601778821088374,
"step": 60
},
{
"epoch": 0.04,
"learning_rate": 1.8324607329842933e-06,
"logits/chosen": 0.9357272386550903,
"logits/rejected": 1.0410839319229126,
"logps/chosen": -257.8460388183594,
"logps/rejected": -238.37973022460938,
"loss": 0.5,
"rewards/accuracies": 0.5062500238418579,
"rewards/chosen": 0.00047300319420173764,
"rewards/margins": 0.00019578025967348367,
"rewards/rejected": 0.0002772229490801692,
"step": 70
},
{
"epoch": 0.04,
"learning_rate": 2.094240837696335e-06,
"logits/chosen": 1.0097007751464844,
"logits/rejected": 1.0268934965133667,
"logps/chosen": -263.69903564453125,
"logps/rejected": -256.5643615722656,
"loss": 0.4999,
"rewards/accuracies": 0.518750011920929,
"rewards/chosen": 0.0005936628440394998,
"rewards/margins": 0.00022301140415947884,
"rewards/rejected": 0.0003706514835357666,
"step": 80
},
{
"epoch": 0.05,
"learning_rate": 2.356020942408377e-06,
"logits/chosen": 0.9857368469238281,
"logits/rejected": 1.050782561302185,
"logps/chosen": -252.1823272705078,
"logps/rejected": -253.6891326904297,
"loss": 0.4999,
"rewards/accuracies": 0.6000000238418579,
"rewards/chosen": 0.000997263239696622,
"rewards/margins": 0.00041304732440039515,
"rewards/rejected": 0.0005842159152962267,
"step": 90
},
{
"epoch": 0.05,
"learning_rate": 2.617801047120419e-06,
"logits/chosen": 1.0416964292526245,
"logits/rejected": 1.0389362573623657,
"logps/chosen": -254.76235961914062,
"logps/rejected": -224.39559936523438,
"loss": 0.4998,
"rewards/accuracies": 0.48124998807907104,
"rewards/chosen": 0.0013410584069788456,
"rewards/margins": 0.0005450797034427524,
"rewards/rejected": 0.0007959787035360932,
"step": 100
},
{
"epoch": 0.06,
"learning_rate": 2.8795811518324613e-06,
"logits/chosen": 1.0654562711715698,
"logits/rejected": 1.1301515102386475,
"logps/chosen": -294.14031982421875,
"logps/rejected": -258.11077880859375,
"loss": 0.4998,
"rewards/accuracies": 0.574999988079071,
"rewards/chosen": 0.001866974518634379,
"rewards/margins": 0.0006641600048169494,
"rewards/rejected": 0.0012028145138174295,
"step": 110
},
{
"epoch": 0.06,
"learning_rate": 3.141361256544503e-06,
"logits/chosen": 0.9807151556015015,
"logits/rejected": 1.125035285949707,
"logps/chosen": -303.8504943847656,
"logps/rejected": -249.7647705078125,
"loss": 0.4997,
"rewards/accuracies": 0.6000000238418579,
"rewards/chosen": 0.002772308187559247,
"rewards/margins": 0.0014164599124342203,
"rewards/rejected": 0.001355848042294383,
"step": 120
},
{
"epoch": 0.07,
"learning_rate": 3.403141361256545e-06,
"logits/chosen": 1.096975564956665,
"logits/rejected": 1.1348248720169067,
"logps/chosen": -278.3834533691406,
"logps/rejected": -245.82968139648438,
"loss": 0.4995,
"rewards/accuracies": 0.6499999761581421,
"rewards/chosen": 0.003960819449275732,
"rewards/margins": 0.0022526984103024006,
"rewards/rejected": 0.0017081208061426878,
"step": 130
},
{
"epoch": 0.07,
"learning_rate": 3.6649214659685865e-06,
"logits/chosen": 1.0514932870864868,
"logits/rejected": 1.1338948011398315,
"logps/chosen": -275.76031494140625,
"logps/rejected": -258.5254821777344,
"loss": 0.4995,
"rewards/accuracies": 0.581250011920929,
"rewards/chosen": 0.004108738619834185,
"rewards/margins": 0.001808557310141623,
"rewards/rejected": 0.0023001814261078835,
"step": 140
},
{
"epoch": 0.08,
"learning_rate": 3.926701570680629e-06,
"logits/chosen": 0.9971652030944824,
"logits/rejected": 1.0917918682098389,
"logps/chosen": -291.89044189453125,
"logps/rejected": -254.80679321289062,
"loss": 0.4993,
"rewards/accuracies": 0.637499988079071,
"rewards/chosen": 0.0049656108021736145,
"rewards/margins": 0.002658768789842725,
"rewards/rejected": 0.0023068420123308897,
"step": 150
},
{
"epoch": 0.08,
"learning_rate": 4.18848167539267e-06,
"logits/chosen": 1.0320146083831787,
"logits/rejected": 1.053504228591919,
"logps/chosen": -285.04559326171875,
"logps/rejected": -244.1322784423828,
"loss": 0.4993,
"rewards/accuracies": 0.606249988079071,
"rewards/chosen": 0.005488889757543802,
"rewards/margins": 0.002789679216220975,
"rewards/rejected": 0.00269921007566154,
"step": 160
},
{
"epoch": 0.09,
"learning_rate": 4.450261780104713e-06,
"logits/chosen": 1.0273762941360474,
"logits/rejected": 1.062558650970459,
"logps/chosen": -287.9652099609375,
"logps/rejected": -232.247314453125,
"loss": 0.4993,
"rewards/accuracies": 0.6187499761581421,
"rewards/chosen": 0.007104066200554371,
"rewards/margins": 0.0033009883482009172,
"rewards/rejected": 0.0038030785508453846,
"step": 170
},
{
"epoch": 0.09,
"learning_rate": 4.712041884816754e-06,
"logits/chosen": 1.0532909631729126,
"logits/rejected": 1.1673284769058228,
"logps/chosen": -274.5193786621094,
"logps/rejected": -238.21286010742188,
"loss": 0.499,
"rewards/accuracies": 0.606249988079071,
"rewards/chosen": 0.00742004532366991,
"rewards/margins": 0.003918725997209549,
"rewards/rejected": 0.003501318860799074,
"step": 180
},
{
"epoch": 0.1,
"learning_rate": 4.9738219895287965e-06,
"logits/chosen": 1.1504939794540405,
"logits/rejected": 1.1638376712799072,
"logps/chosen": -237.76797485351562,
"logps/rejected": -211.50613403320312,
"loss": 0.499,
"rewards/accuracies": 0.543749988079071,
"rewards/chosen": 0.007698298431932926,
"rewards/margins": 0.003723274450749159,
"rewards/rejected": 0.00397502351552248,
"step": 190
},
{
"epoch": 0.1,
"learning_rate": 4.999661831436499e-06,
"logits/chosen": 1.0712188482284546,
"logits/rejected": 1.0771671533584595,
"logps/chosen": -288.3528747558594,
"logps/rejected": -265.5425109863281,
"loss": 0.4989,
"rewards/accuracies": 0.675000011920929,
"rewards/chosen": 0.008598363026976585,
"rewards/margins": 0.005429488606750965,
"rewards/rejected": 0.003168874653056264,
"step": 200
},
{
"epoch": 0.11,
"learning_rate": 4.9984929711403395e-06,
"logits/chosen": 1.1236344575881958,
"logits/rejected": 1.2009334564208984,
"logps/chosen": -254.3011932373047,
"logps/rejected": -224.9448699951172,
"loss": 0.4988,
"rewards/accuracies": 0.581250011920929,
"rewards/chosen": 0.007906198501586914,
"rewards/margins": 0.005368872079998255,
"rewards/rejected": 0.0025373264215886593,
"step": 210
},
{
"epoch": 0.12,
"learning_rate": 4.996489634487865e-06,
"logits/chosen": 1.0867538452148438,
"logits/rejected": 1.2004356384277344,
"logps/chosen": -258.08062744140625,
"logps/rejected": -240.8439483642578,
"loss": 0.4988,
"rewards/accuracies": 0.625,
"rewards/chosen": 0.007928581908345222,
"rewards/margins": 0.004648840986192226,
"rewards/rejected": 0.003279739525169134,
"step": 220
},
{
"epoch": 0.12,
"learning_rate": 4.9936524905772466e-06,
"logits/chosen": 1.0192543268203735,
"logits/rejected": 1.2005066871643066,
"logps/chosen": -274.07037353515625,
"logps/rejected": -256.2618713378906,
"loss": 0.4988,
"rewards/accuracies": 0.5375000238418579,
"rewards/chosen": 0.006800153758376837,
"rewards/margins": 0.003369166050106287,
"rewards/rejected": 0.0034309872426092625,
"step": 230
},
{
"epoch": 0.13,
"learning_rate": 4.9899824869915e-06,
"logits/chosen": 1.111426830291748,
"logits/rejected": 1.1554086208343506,
"logps/chosen": -243.208984375,
"logps/rejected": -205.7252655029297,
"loss": 0.4984,
"rewards/accuracies": 0.612500011920929,
"rewards/chosen": 0.007739508058875799,
"rewards/margins": 0.0072770556434988976,
"rewards/rejected": 0.0004624520370271057,
"step": 240
},
{
"epoch": 0.13,
"learning_rate": 4.985480849482012e-06,
"logits/chosen": 1.1005799770355225,
"logits/rejected": 1.230799913406372,
"logps/chosen": -272.18597412109375,
"logps/rejected": -257.9790954589844,
"loss": 0.4988,
"rewards/accuracies": 0.5249999761581421,
"rewards/chosen": 0.005292638670653105,
"rewards/margins": 0.0029095064383000135,
"rewards/rejected": 0.0023831322323530912,
"step": 250
},
{
"epoch": 0.14,
"learning_rate": 4.980149081559142e-06,
"logits/chosen": 1.0777183771133423,
"logits/rejected": 1.155970573425293,
"logps/chosen": -294.93328857421875,
"logps/rejected": -261.9263610839844,
"loss": 0.4982,
"rewards/accuracies": 0.6312500238418579,
"rewards/chosen": 0.009707033634185791,
"rewards/margins": 0.008258306421339512,
"rewards/rejected": 0.00144872663076967,
"step": 260
},
{
"epoch": 0.14,
"learning_rate": 4.9739889639900655e-06,
"logits/chosen": 1.1088669300079346,
"logits/rejected": 1.1434690952301025,
"logps/chosen": -254.5012664794922,
"logps/rejected": -254.6510009765625,
"loss": 0.4979,
"rewards/accuracies": 0.65625,
"rewards/chosen": 0.009705386124551296,
"rewards/margins": 0.009683574549853802,
"rewards/rejected": 2.181164381909184e-05,
"step": 270
},
{
"epoch": 0.15,
"learning_rate": 4.967002554204009e-06,
"logits/chosen": 1.0548467636108398,
"logits/rejected": 1.1509649753570557,
"logps/chosen": -245.9481964111328,
"logps/rejected": -229.8827362060547,
"loss": 0.4985,
"rewards/accuracies": 0.59375,
"rewards/chosen": 0.009255246259272099,
"rewards/margins": 0.006528814323246479,
"rewards/rejected": 0.0027264312375336885,
"step": 280
},
{
"epoch": 0.15,
"learning_rate": 4.959192185605089e-06,
"logits/chosen": 1.0842396020889282,
"logits/rejected": 1.1220932006835938,
"logps/chosen": -266.4988708496094,
"logps/rejected": -246.9526824951172,
"loss": 0.4988,
"rewards/accuracies": 0.574999988079071,
"rewards/chosen": 0.009208474308252335,
"rewards/margins": 0.007726150564849377,
"rewards/rejected": 0.0014823225792497396,
"step": 290
},
{
"epoch": 0.16,
"learning_rate": 4.950560466792969e-06,
"logits/chosen": 1.1049131155014038,
"logits/rejected": 1.1441484689712524,
"logps/chosen": -275.13421630859375,
"logps/rejected": -246.1587677001953,
"loss": 0.4984,
"rewards/accuracies": 0.574999988079071,
"rewards/chosen": 0.006010602228343487,
"rewards/margins": 0.00892153661698103,
"rewards/rejected": -0.0029109339229762554,
"step": 300
},
{
"epoch": 0.16,
"learning_rate": 4.9411102806916185e-06,
"logits/chosen": 1.021583080291748,
"logits/rejected": 1.047163963317871,
"logps/chosen": -323.06097412109375,
"logps/rejected": -254.7588653564453,
"loss": 0.4977,
"rewards/accuracies": 0.6499999761581421,
"rewards/chosen": 0.008619217202067375,
"rewards/margins": 0.012051543220877647,
"rewards/rejected": -0.003432326018810272,
"step": 310
},
{
"epoch": 0.17,
"learning_rate": 4.930844783586424e-06,
"logits/chosen": 1.024611473083496,
"logits/rejected": 1.0655776262283325,
"logps/chosen": -238.3491668701172,
"logps/rejected": -231.0393829345703,
"loss": 0.498,
"rewards/accuracies": 0.59375,
"rewards/chosen": 0.006629918701946735,
"rewards/margins": 0.010882768779993057,
"rewards/rejected": -0.004252850078046322,
"step": 320
},
{
"epoch": 0.17,
"learning_rate": 4.919767404070033e-06,
"logits/chosen": 1.04720139503479,
"logits/rejected": 1.0630711317062378,
"logps/chosen": -261.62982177734375,
"logps/rejected": -247.97607421875,
"loss": 0.4981,
"rewards/accuracies": 0.5874999761581421,
"rewards/chosen": 0.0036115895491093397,
"rewards/margins": 0.009240304119884968,
"rewards/rejected": -0.005628715269267559,
"step": 330
},
{
"epoch": 0.18,
"learning_rate": 4.907881841897216e-06,
"logits/chosen": 1.0087223052978516,
"logits/rejected": 1.059715986251831,
"logps/chosen": -314.62408447265625,
"logps/rejected": -248.10879516601562,
"loss": 0.4979,
"rewards/accuracies": 0.637499988079071,
"rewards/chosen": 0.003107100958004594,
"rewards/margins": 0.013965976424515247,
"rewards/rejected": -0.010858876630663872,
"step": 340
},
{
"epoch": 0.18,
"learning_rate": 4.89519206674919e-06,
"logits/chosen": 0.9633463621139526,
"logits/rejected": 1.0100409984588623,
"logps/chosen": -241.84793090820312,
"logps/rejected": -252.7783203125,
"loss": 0.4976,
"rewards/accuracies": 0.625,
"rewards/chosen": 0.0028146414551883936,
"rewards/margins": 0.013054436072707176,
"rewards/rejected": -0.010239794850349426,
"step": 350
},
{
"epoch": 0.19,
"learning_rate": 4.881702316907769e-06,
"logits/chosen": 0.9069837331771851,
"logits/rejected": 1.0270668268203735,
"logps/chosen": -210.9730987548828,
"logps/rejected": -243.6437225341797,
"loss": 0.4983,
"rewards/accuracies": 0.5062500238418579,
"rewards/chosen": 0.0008282591588795185,
"rewards/margins": 0.010188087821006775,
"rewards/rejected": -0.009359828196465969,
"step": 360
},
{
"epoch": 0.19,
"learning_rate": 4.86741709783982e-06,
"logits/chosen": 0.8630668520927429,
"logits/rejected": 0.9914480447769165,
"logps/chosen": -332.7330627441406,
"logps/rejected": -281.46807861328125,
"loss": 0.4977,
"rewards/accuracies": 0.574999988079071,
"rewards/chosen": 0.0035103503614664078,
"rewards/margins": 0.01303508598357439,
"rewards/rejected": -0.009524735622107983,
"step": 370
},
{
"epoch": 0.2,
"learning_rate": 4.852341180692471e-06,
"logits/chosen": 0.9135398864746094,
"logits/rejected": 0.9984884262084961,
"logps/chosen": -284.92620849609375,
"logps/rejected": -252.03970336914062,
"loss": 0.4976,
"rewards/accuracies": 0.606249988079071,
"rewards/chosen": 0.0040648458525538445,
"rewards/margins": 0.0157476756721735,
"rewards/rejected": -0.011682827956974506,
"step": 380
},
{
"epoch": 0.2,
"learning_rate": 4.836479600699579e-06,
"logits/chosen": 0.9406082034111023,
"logits/rejected": 0.9047748446464539,
"logps/chosen": -278.61248779296875,
"logps/rejected": -284.1888732910156,
"loss": 0.4972,
"rewards/accuracies": 0.643750011920929,
"rewards/chosen": 0.006144961342215538,
"rewards/margins": 0.017049867659807205,
"rewards/rejected": -0.010904906317591667,
"step": 390
},
{
"epoch": 0.21,
"learning_rate": 4.819837655500014e-06,
"logits/chosen": 0.8400663137435913,
"logits/rejected": 0.9222391843795776,
"logps/chosen": -230.8615264892578,
"logps/rejected": -221.2638397216797,
"loss": 0.4984,
"rewards/accuracies": 0.625,
"rewards/chosen": 0.0005404525436460972,
"rewards/margins": 0.011931750923395157,
"rewards/rejected": -0.011391298845410347,
"step": 400
},
{
"epoch": 0.21,
"learning_rate": 4.802420903368286e-06,
"logits/chosen": 0.8889272809028625,
"logits/rejected": 0.8912805318832397,
"logps/chosen": -268.0902099609375,
"logps/rejected": -250.4331512451172,
"loss": 0.4979,
"rewards/accuracies": 0.53125,
"rewards/chosen": 0.0009850022615864873,
"rewards/margins": 0.010063153691589832,
"rewards/rejected": -0.009078151546418667,
"step": 410
},
{
"epoch": 0.22,
"learning_rate": 4.784235161358124e-06,
"logits/chosen": 0.8787338137626648,
"logits/rejected": 0.9284510612487793,
"logps/chosen": -288.6819152832031,
"logps/rejected": -265.958984375,
"loss": 0.4971,
"rewards/accuracies": 0.6812499761581421,
"rewards/chosen": 0.0032099136151373386,
"rewards/margins": 0.021029185503721237,
"rewards/rejected": -0.017819274216890335,
"step": 420
},
{
"epoch": 0.23,
"learning_rate": 4.765286503359632e-06,
"logits/chosen": 0.8820232152938843,
"logits/rejected": 0.9475772976875305,
"logps/chosen": -270.6169738769531,
"logps/rejected": -259.78839111328125,
"loss": 0.4973,
"rewards/accuracies": 0.6499999761581421,
"rewards/chosen": -0.0021267482079565525,
"rewards/margins": 0.019616421312093735,
"rewards/rejected": -0.021743169054389,
"step": 430
},
{
"epoch": 0.23,
"learning_rate": 4.745581258070654e-06,
"logits/chosen": 0.7767919301986694,
"logits/rejected": 0.87933349609375,
"logps/chosen": -254.14315795898438,
"logps/rejected": -252.87222290039062,
"loss": 0.498,
"rewards/accuracies": 0.5625,
"rewards/chosen": -0.0043312786146998405,
"rewards/margins": 0.013770043849945068,
"rewards/rejected": -0.018101321533322334,
"step": 440
},
{
"epoch": 0.24,
"learning_rate": 4.725126006883047e-06,
"logits/chosen": 0.7937654256820679,
"logits/rejected": 0.8364180326461792,
"logps/chosen": -238.3746337890625,
"logps/rejected": -241.1796875,
"loss": 0.4977,
"rewards/accuracies": 0.5375000238418579,
"rewards/chosen": -0.00781493354588747,
"rewards/margins": 0.011845615692436695,
"rewards/rejected": -0.019660547375679016,
"step": 450
},
{
"epoch": 0.24,
"learning_rate": 4.70392758168454e-06,
"logits/chosen": 0.7985974550247192,
"logits/rejected": 0.8068701028823853,
"logps/chosen": -345.21343994140625,
"logps/rejected": -304.43817138671875,
"loss": 0.4965,
"rewards/accuracies": 0.6312500238418579,
"rewards/chosen": -0.007276026997715235,
"rewards/margins": 0.02650422975420952,
"rewards/rejected": -0.033780258148908615,
"step": 460
},
{
"epoch": 0.25,
"learning_rate": 4.68199306257695e-06,
"logits/chosen": 0.7760607004165649,
"logits/rejected": 0.773891806602478,
"logps/chosen": -327.35369873046875,
"logps/rejected": -314.1829528808594,
"loss": 0.4961,
"rewards/accuracies": 0.65625,
"rewards/chosen": -0.014054256491363049,
"rewards/margins": 0.03367748484015465,
"rewards/rejected": -0.04773174598813057,
"step": 470
},
{
"epoch": 0.25,
"learning_rate": 4.659329775511478e-06,
"logits/chosen": 0.7017660140991211,
"logits/rejected": 0.7137667536735535,
"logps/chosen": -287.37652587890625,
"logps/rejected": -271.36358642578125,
"loss": 0.497,
"rewards/accuracies": 0.581250011920929,
"rewards/chosen": -0.021392906084656715,
"rewards/margins": 0.025359559804201126,
"rewards/rejected": -0.04675246775150299,
"step": 480
},
{
"epoch": 0.26,
"learning_rate": 4.635945289841902e-06,
"logits/chosen": 0.5314046144485474,
"logits/rejected": 0.5452633500099182,
"logps/chosen": -337.0295104980469,
"logps/rejected": -379.64593505859375,
"loss": 0.4958,
"rewards/accuracies": 0.612500011920929,
"rewards/chosen": -0.05135764926671982,
"rewards/margins": 0.04310908168554306,
"rewards/rejected": -0.09446673840284348,
"step": 490
},
{
"epoch": 0.26,
"learning_rate": 4.611847415796476e-06,
"logits/chosen": 0.29375532269477844,
"logits/rejected": 0.2797163724899292,
"logps/chosen": -427.3785095214844,
"logps/rejected": -405.41461181640625,
"loss": 0.4932,
"rewards/accuracies": 0.5625,
"rewards/chosen": -0.12762612104415894,
"rewards/margins": 0.03379129245877266,
"rewards/rejected": -0.16141743957996368,
"step": 500
},
{
"epoch": 0.27,
"learning_rate": 4.587044201869378e-06,
"logits/chosen": -0.2227209359407425,
"logits/rejected": -0.20223090052604675,
"logps/chosen": -787.1062622070312,
"logps/rejected": -1045.249267578125,
"loss": 0.4818,
"rewards/accuracies": 0.5249999761581421,
"rewards/chosen": -0.5222705602645874,
"rewards/margins": 0.26146870851516724,
"rewards/rejected": -0.7837392687797546,
"step": 510
},
{
"epoch": 0.27,
"learning_rate": 4.561543932132574e-06,
"logits/chosen": -0.11980749666690826,
"logits/rejected": -0.12788312137126923,
"logps/chosen": -732.790283203125,
"logps/rejected": -833.4085693359375,
"loss": 0.4873,
"rewards/accuracies": 0.5062500238418579,
"rewards/chosen": -0.45401230454444885,
"rewards/margins": 0.1505609005689621,
"rewards/rejected": -0.6045731902122498,
"step": 520
},
{
"epoch": 0.28,
"learning_rate": 4.535355123469009e-06,
"logits/chosen": -0.14909827709197998,
"logits/rejected": -0.18795037269592285,
"logps/chosen": -696.719482421875,
"logps/rejected": -1046.4390869140625,
"loss": 0.4836,
"rewards/accuracies": 0.581250011920929,
"rewards/chosen": -0.42250218987464905,
"rewards/margins": 0.3782690167427063,
"rewards/rejected": -0.8007712364196777,
"step": 530
},
{
"epoch": 0.28,
"learning_rate": 4.508486522728037e-06,
"logits/chosen": -0.18408063054084778,
"logits/rejected": -0.14851421117782593,
"logps/chosen": -893.5338134765625,
"logps/rejected": -1111.295654296875,
"loss": 0.4841,
"rewards/accuracies": 0.550000011920929,
"rewards/chosen": -0.629914402961731,
"rewards/margins": 0.23182418942451477,
"rewards/rejected": -0.8617385625839233,
"step": 540
},
{
"epoch": 0.29,
"learning_rate": 4.480947103804044e-06,
"logits/chosen": -0.20195765793323517,
"logits/rejected": -0.2249602973461151,
"logps/chosen": -970.3370971679688,
"logps/rejected": -1377.3724365234375,
"loss": 0.4747,
"rewards/accuracies": 0.625,
"rewards/chosen": -0.7055306434631348,
"rewards/margins": 0.46008825302124023,
"rewards/rejected": -1.165618896484375,
"step": 550
},
{
"epoch": 0.29,
"learning_rate": 4.452746064639239e-06,
"logits/chosen": -0.27148136496543884,
"logits/rejected": -0.24398574233055115,
"logps/chosen": -1213.016357421875,
"logps/rejected": -1345.276123046875,
"loss": 0.4846,
"rewards/accuracies": 0.48750001192092896,
"rewards/chosen": -0.8822624087333679,
"rewards/margins": 0.20509858429431915,
"rewards/rejected": -1.0873609781265259,
"step": 560
},
{
"epoch": 0.3,
"learning_rate": 4.423892824151617e-06,
"logits/chosen": -0.32366353273391724,
"logits/rejected": -0.3419601321220398,
"logps/chosen": -1556.5191650390625,
"logps/rejected": -1787.370361328125,
"loss": 0.4843,
"rewards/accuracies": 0.550000011920929,
"rewards/chosen": -1.2833073139190674,
"rewards/margins": 0.28631919622421265,
"rewards/rejected": -1.5696265697479248,
"step": 570
},
{
"epoch": 0.3,
"learning_rate": 4.3943970190891164e-06,
"logits/chosen": -0.23246267437934875,
"logits/rejected": -0.25014322996139526,
"logps/chosen": -1218.4473876953125,
"logps/rejected": -1182.8861083984375,
"loss": 0.4809,
"rewards/accuracies": 0.48124998807907104,
"rewards/chosen": -0.9385588765144348,
"rewards/margins": 0.02590467967092991,
"rewards/rejected": -0.9644634127616882,
"step": 580
},
{
"epoch": 0.31,
"learning_rate": 4.364268500811025e-06,
"logits/chosen": -0.17416557669639587,
"logits/rejected": -0.17902135848999023,
"logps/chosen": -985.0808715820312,
"logps/rejected": -1348.008056640625,
"loss": 0.4847,
"rewards/accuracies": 0.606249988079071,
"rewards/chosen": -0.6908355951309204,
"rewards/margins": 0.41670989990234375,
"rewards/rejected": -1.1075454950332642,
"step": 590
},
{
"epoch": 0.31,
"learning_rate": 4.333517331997704e-06,
"logits/chosen": -0.14922045171260834,
"logits/rejected": -0.19132760167121887,
"logps/chosen": -1128.3173828125,
"logps/rejected": -1489.837158203125,
"loss": 0.4745,
"rewards/accuracies": 0.5562499761581421,
"rewards/chosen": -0.8316856622695923,
"rewards/margins": 0.4227636754512787,
"rewards/rejected": -1.2544492483139038,
"step": 600
},
{
"epoch": 0.32,
"learning_rate": 4.302153783289737e-06,
"logits/chosen": -0.1274535059928894,
"logits/rejected": -0.17803938686847687,
"logps/chosen": -1048.890625,
"logps/rejected": -1506.1158447265625,
"loss": 0.4743,
"rewards/accuracies": 0.625,
"rewards/chosen": -0.7957647442817688,
"rewards/margins": 0.4669608175754547,
"rewards/rejected": -1.2627254724502563,
"step": 610
},
{
"epoch": 0.32,
"learning_rate": 4.270188329857613e-06,
"logits/chosen": -0.14815063774585724,
"logits/rejected": -0.15499570965766907,
"logps/chosen": -1084.8118896484375,
"logps/rejected": -1618.885009765625,
"loss": 0.4711,
"rewards/accuracies": 0.5625,
"rewards/chosen": -0.7856907844543457,
"rewards/margins": 0.5791957974433899,
"rewards/rejected": -1.3648868799209595,
"step": 620
},
{
"epoch": 0.33,
"learning_rate": 4.237631647903115e-06,
"logits/chosen": -0.024261217564344406,
"logits/rejected": -0.038342759013175964,
"logps/chosen": -723.5900268554688,
"logps/rejected": -1155.1717529296875,
"loss": 0.4678,
"rewards/accuracies": 0.5874999761581421,
"rewards/chosen": -0.46949324011802673,
"rewards/margins": 0.45854002237319946,
"rewards/rejected": -0.9280332326889038,
"step": 630
},
{
"epoch": 0.33,
"learning_rate": 4.204494611093548e-06,
"logits/chosen": -0.05518772080540657,
"logits/rejected": -0.100825235247612,
"logps/chosen": -1270.6005859375,
"logps/rejected": -1703.8551025390625,
"loss": 0.4819,
"rewards/accuracies": 0.48124998807907104,
"rewards/chosen": -0.9448369145393372,
"rewards/margins": 0.4941697120666504,
"rewards/rejected": -1.4390065670013428,
"step": 640
},
{
"epoch": 0.34,
"learning_rate": 4.170788286930024e-06,
"logits/chosen": -0.06449203193187714,
"logits/rejected": -0.1527264416217804,
"logps/chosen": -1250.4991455078125,
"logps/rejected": -1752.0111083984375,
"loss": 0.4822,
"rewards/accuracies": 0.550000011920929,
"rewards/chosen": -0.9863438606262207,
"rewards/margins": 0.5237391591072083,
"rewards/rejected": -1.5100830793380737,
"step": 650
},
{
"epoch": 0.35,
"learning_rate": 4.136523933051005e-06,
"logits/chosen": -0.10980840772390366,
"logits/rejected": -0.13391873240470886,
"logps/chosen": -1053.7823486328125,
"logps/rejected": -1614.2884521484375,
"loss": 0.4762,
"rewards/accuracies": 0.518750011920929,
"rewards/chosen": -0.8304306864738464,
"rewards/margins": 0.5787540078163147,
"rewards/rejected": -1.4091846942901611,
"step": 660
},
{
"epoch": 0.35,
"learning_rate": 4.101712993472348e-06,
"logits/chosen": -0.10138118267059326,
"logits/rejected": -0.13220438361167908,
"logps/chosen": -1581.559326171875,
"logps/rejected": -1862.4993896484375,
"loss": 0.481,
"rewards/accuracies": 0.512499988079071,
"rewards/chosen": -1.2885770797729492,
"rewards/margins": 0.32578420639038086,
"rewards/rejected": -1.6143611669540405,
"step": 670
},
{
"epoch": 0.36,
"learning_rate": 4.066367094765091e-06,
"logits/chosen": -0.06212924048304558,
"logits/rejected": -0.09771373122930527,
"logps/chosen": -1470.7352294921875,
"logps/rejected": -1844.652587890625,
"loss": 0.4783,
"rewards/accuracies": 0.53125,
"rewards/chosen": -1.204660177230835,
"rewards/margins": 0.3980388641357422,
"rewards/rejected": -1.6026990413665771,
"step": 680
},
{
"epoch": 0.36,
"learning_rate": 4.030498042172277e-06,
"logits/chosen": 0.01754361391067505,
"logits/rejected": -0.048445507884025574,
"logps/chosen": -979.1268310546875,
"logps/rejected": -1244.6566162109375,
"loss": 0.4726,
"rewards/accuracies": 0.5687500238418579,
"rewards/chosen": -0.7009618878364563,
"rewards/margins": 0.3003775477409363,
"rewards/rejected": -1.0013394355773926,
"step": 690
},
{
"epoch": 0.37,
"learning_rate": 3.994117815666095e-06,
"logits/chosen": -0.04728760942816734,
"logits/rejected": -0.0919174998998642,
"logps/chosen": -1344.916015625,
"logps/rejected": -1900.5986328125,
"loss": 0.472,
"rewards/accuracies": 0.5249999761581421,
"rewards/chosen": -1.0581436157226562,
"rewards/margins": 0.5791832208633423,
"rewards/rejected": -1.6373268365859985,
"step": 700
},
{
"epoch": 0.37,
"learning_rate": 3.957238565946672e-06,
"logits/chosen": 0.004687662236392498,
"logits/rejected": -0.06074858829379082,
"logps/chosen": -1193.521240234375,
"logps/rejected": -2065.345947265625,
"loss": 0.4653,
"rewards/accuracies": 0.637499988079071,
"rewards/chosen": -0.8945425152778625,
"rewards/margins": 0.8969429731369019,
"rewards/rejected": -1.7914857864379883,
"step": 710
},
{
"epoch": 0.38,
"learning_rate": 3.919872610383831e-06,
"logits/chosen": 0.07505255192518234,
"logits/rejected": -0.015723228454589844,
"logps/chosen": -1065.49365234375,
"logps/rejected": -1707.6328125,
"loss": 0.4739,
"rewards/accuracies": 0.550000011920929,
"rewards/chosen": -0.7956100702285767,
"rewards/margins": 0.6891741752624512,
"rewards/rejected": -1.4847842454910278,
"step": 720
},
{
"epoch": 0.38,
"learning_rate": 3.882032428903195e-06,
"logits/chosen": 0.02505052089691162,
"logits/rejected": -0.009700920432806015,
"logps/chosen": -1372.3634033203125,
"logps/rejected": -2129.860595703125,
"loss": 0.4656,
"rewards/accuracies": 0.5562499761581421,
"rewards/chosen": -1.0954824686050415,
"rewards/margins": 0.7768491506576538,
"rewards/rejected": -1.8723316192626953,
"step": 730
},
{
"epoch": 0.39,
"learning_rate": 3.84373065981799e-06,
"logits/chosen": 0.1249980553984642,
"logits/rejected": 0.04747745767235756,
"logps/chosen": -956.18115234375,
"logps/rejected": -1541.792724609375,
"loss": 0.4693,
"rewards/accuracies": 0.625,
"rewards/chosen": -0.6708589792251587,
"rewards/margins": 0.6129963994026184,
"rewards/rejected": -1.2838553190231323,
"step": 740
},
{
"epoch": 0.39,
"learning_rate": 3.8049800956079552e-06,
"logits/chosen": 0.23526708781719208,
"logits/rejected": 0.19636312127113342,
"logps/chosen": -1106.01513671875,
"logps/rejected": -1326.5162353515625,
"loss": 0.4752,
"rewards/accuracies": 0.518750011920929,
"rewards/chosen": -0.8387149572372437,
"rewards/margins": 0.24964456260204315,
"rewards/rejected": -1.0883597135543823,
"step": 750
},
{
"epoch": 0.4,
"learning_rate": 3.765793678646753e-06,
"logits/chosen": 0.19188269972801208,
"logits/rejected": 0.1782020926475525,
"logps/chosen": -802.7251586914062,
"logps/rejected": -1634.812255859375,
"loss": 0.4647,
"rewards/accuracies": 0.606249988079071,
"rewards/chosen": -0.5356382727622986,
"rewards/margins": 0.8580523729324341,
"rewards/rejected": -1.3936904668807983,
"step": 760
},
{
"epoch": 0.4,
"learning_rate": 3.726184496879323e-06,
"logits/chosen": 0.14159968495368958,
"logits/rejected": 0.08811040967702866,
"logps/chosen": -1127.4029541015625,
"logps/rejected": -1502.1641845703125,
"loss": 0.4756,
"rewards/accuracies": 0.550000011920929,
"rewards/chosen": -0.8773125410079956,
"rewards/margins": 0.3974476158618927,
"rewards/rejected": -1.274760365486145,
"step": 770
},
{
"epoch": 0.41,
"learning_rate": 3.686165779450619e-06,
"logits/chosen": 0.1939581334590912,
"logits/rejected": 0.1522776186466217,
"logps/chosen": -968.0919799804688,
"logps/rejected": -1507.5386962890625,
"loss": 0.4793,
"rewards/accuracies": 0.48750001192092896,
"rewards/chosen": -0.7149516344070435,
"rewards/margins": 0.5672934055328369,
"rewards/rejected": -1.2822450399398804,
"step": 780
},
{
"epoch": 0.41,
"learning_rate": 3.645750892287178e-06,
"logits/chosen": 0.1306479275226593,
"logits/rejected": 0.05887848883867264,
"logps/chosen": -1289.082275390625,
"logps/rejected": -1864.7164306640625,
"loss": 0.4721,
"rewards/accuracies": 0.5562499761581421,
"rewards/chosen": -0.9888286590576172,
"rewards/margins": 0.6331573724746704,
"rewards/rejected": -1.6219860315322876,
"step": 790
},
{
"epoch": 0.42,
"learning_rate": 3.604953333633009e-06,
"logits/chosen": 0.205116868019104,
"logits/rejected": 0.15303435921669006,
"logps/chosen": -848.7705078125,
"logps/rejected": -1336.090576171875,
"loss": 0.4708,
"rewards/accuracies": 0.574999988079071,
"rewards/chosen": -0.5991231799125671,
"rewards/margins": 0.5247097015380859,
"rewards/rejected": -1.1238329410552979,
"step": 800
},
{
"epoch": 0.42,
"learning_rate": 3.56378672954129e-06,
"logits/chosen": 0.22229023277759552,
"logits/rejected": 0.17705193161964417,
"logps/chosen": -1094.6126708984375,
"logps/rejected": -1681.7445068359375,
"loss": 0.469,
"rewards/accuracies": 0.5562499761581421,
"rewards/chosen": -0.8267385363578796,
"rewards/margins": 0.6382580995559692,
"rewards/rejected": -1.4649966955184937,
"step": 810
},
{
"epoch": 0.43,
"learning_rate": 3.5222648293233806e-06,
"logits/chosen": 0.1940724402666092,
"logits/rejected": 0.1474287211894989,
"logps/chosen": -1133.5128173828125,
"logps/rejected": -1901.333984375,
"loss": 0.4687,
"rewards/accuracies": 0.5687500238418579,
"rewards/chosen": -0.8616431951522827,
"rewards/margins": 0.8072026968002319,
"rewards/rejected": -1.6688458919525146,
"step": 820
},
{
"epoch": 0.43,
"learning_rate": 3.4804015009566573e-06,
"logits/chosen": 0.14867620170116425,
"logits/rejected": 0.050886522978544235,
"logps/chosen": -1169.879638671875,
"logps/rejected": -2415.080078125,
"loss": 0.4639,
"rewards/accuracies": 0.6499999761581421,
"rewards/chosen": -0.9197257161140442,
"rewards/margins": 1.2611197233200073,
"rewards/rejected": -2.180845260620117,
"step": 830
},
{
"epoch": 0.44,
"learning_rate": 3.4382107264527244e-06,
"logits/chosen": 0.16670770943164825,
"logits/rejected": 0.11358609050512314,
"logps/chosen": -1215.7694091796875,
"logps/rejected": -1938.170654296875,
"loss": 0.4701,
"rewards/accuracies": 0.5562499761581421,
"rewards/chosen": -0.9481611251831055,
"rewards/margins": 0.7456313967704773,
"rewards/rejected": -1.6937923431396484,
"step": 840
},
{
"epoch": 0.44,
"learning_rate": 3.3957065971875387e-06,
"logits/chosen": 0.24467554688453674,
"logits/rejected": 0.1815129816532135,
"logps/chosen": -1700.726806640625,
"logps/rejected": -2238.2724609375,
"loss": 0.4738,
"rewards/accuracies": 0.4749999940395355,
"rewards/chosen": -1.4334746599197388,
"rewards/margins": 0.5637392997741699,
"rewards/rejected": -1.9972139596939087,
"step": 850
},
{
"epoch": 0.45,
"learning_rate": 3.352903309194999e-06,
"logits/chosen": 0.25681573152542114,
"logits/rejected": 0.22445912659168243,
"logps/chosen": -1175.2008056640625,
"logps/rejected": -1852.9886474609375,
"loss": 0.476,
"rewards/accuracies": 0.6000000238418579,
"rewards/chosen": -0.9056652784347534,
"rewards/margins": 0.6970826387405396,
"rewards/rejected": -1.6027476787567139,
"step": 860
},
{
"epoch": 0.46,
"learning_rate": 3.309815158425591e-06,
"logits/chosen": 0.35658639669418335,
"logits/rejected": 0.23468701541423798,
"logps/chosen": -1126.1968994140625,
"logps/rejected": -1490.5289306640625,
"loss": 0.4765,
"rewards/accuracies": 0.53125,
"rewards/chosen": -0.8638699650764465,
"rewards/margins": 0.4017399847507477,
"rewards/rejected": -1.265609860420227,
"step": 870
},
{
"epoch": 0.46,
"learning_rate": 3.266456535971654e-06,
"logits/chosen": 0.29603832960128784,
"logits/rejected": 0.2804957330226898,
"logps/chosen": -1391.908447265625,
"logps/rejected": -1630.26220703125,
"loss": 0.4842,
"rewards/accuracies": 0.5625,
"rewards/chosen": -1.103570580482483,
"rewards/margins": 0.2950761914253235,
"rewards/rejected": -1.3986468315124512,
"step": 880
},
{
"epoch": 0.47,
"learning_rate": 3.2228419232608692e-06,
"logits/chosen": 0.25324004888534546,
"logits/rejected": 0.19424840807914734,
"logps/chosen": -1254.45947265625,
"logps/rejected": -1625.465087890625,
"loss": 0.483,
"rewards/accuracies": 0.4437499940395355,
"rewards/chosen": -1.0103174448013306,
"rewards/margins": 0.3879779279232025,
"rewards/rejected": -1.3982954025268555,
"step": 890
},
{
"epoch": 0.47,
"learning_rate": 3.1789858872195888e-06,
"logits/chosen": 0.35612553358078003,
"logits/rejected": 0.2640685737133026,
"logps/chosen": -1018.1083984375,
"logps/rejected": -1447.966796875,
"loss": 0.4713,
"rewards/accuracies": 0.612500011920929,
"rewards/chosen": -0.7736637592315674,
"rewards/margins": 0.4553070068359375,
"rewards/rejected": -1.2289707660675049,
"step": 900
},
{
"epoch": 0.48,
"learning_rate": 3.1349030754075945e-06,
"logits/chosen": 0.32709187269210815,
"logits/rejected": 0.27523329854011536,
"logps/chosen": -996.7443237304688,
"logps/rejected": -1309.497802734375,
"loss": 0.4674,
"rewards/accuracies": 0.550000011920929,
"rewards/chosen": -0.6882850527763367,
"rewards/margins": 0.3720100224018097,
"rewards/rejected": -1.0602951049804688,
"step": 910
},
{
"epoch": 0.48,
"learning_rate": 3.0906082111259313e-06,
"logits/chosen": 0.28385213017463684,
"logits/rejected": 0.26248598098754883,
"logps/chosen": -1238.9512939453125,
"logps/rejected": -1446.0545654296875,
"loss": 0.4729,
"rewards/accuracies": 0.53125,
"rewards/chosen": -0.9621122479438782,
"rewards/margins": 0.24497418105602264,
"rewards/rejected": -1.207086443901062,
"step": 920
},
{
"epoch": 0.49,
"learning_rate": 3.046116088499449e-06,
"logits/chosen": 0.20961081981658936,
"logits/rejected": 0.12288858741521835,
"logps/chosen": -1385.43359375,
"logps/rejected": -2388.202392578125,
"loss": 0.4591,
"rewards/accuracies": 0.581250011920929,
"rewards/chosen": -1.1082097291946411,
"rewards/margins": 1.0280786752700806,
"rewards/rejected": -2.1362884044647217,
"step": 930
},
{
"epoch": 0.49,
"learning_rate": 3.0014415675356813e-06,
"logits/chosen": 0.2143702507019043,
"logits/rejected": 0.12640917301177979,
"logps/chosen": -1842.924072265625,
"logps/rejected": -2572.03759765625,
"loss": 0.4703,
"rewards/accuracies": 0.518750011920929,
"rewards/chosen": -1.5414974689483643,
"rewards/margins": 0.7958974838256836,
"rewards/rejected": -2.337394952774048,
"step": 940
},
{
"epoch": 0.5,
"learning_rate": 2.9565995691617242e-06,
"logits/chosen": 0.2267983853816986,
"logits/rejected": 0.19906947016716003,
"logps/chosen": -1659.0390625,
"logps/rejected": -1897.612548828125,
"loss": 0.4796,
"rewards/accuracies": 0.45625001192092896,
"rewards/chosen": -1.4338902235031128,
"rewards/margins": 0.23841390013694763,
"rewards/rejected": -1.672304391860962,
"step": 950
},
{
"epoch": 0.5,
"learning_rate": 2.9116050702407706e-06,
"logits/chosen": 0.2076607495546341,
"logits/rejected": 0.15953665971755981,
"logps/chosen": -1761.4957275390625,
"logps/rejected": -2119.157470703125,
"loss": 0.4733,
"rewards/accuracies": 0.518750011920929,
"rewards/chosen": -1.521126627922058,
"rewards/margins": 0.37375563383102417,
"rewards/rejected": -1.8948824405670166,
"step": 960
},
{
"epoch": 0.51,
"learning_rate": 2.8664730985699537e-06,
"logits/chosen": 0.2155609130859375,
"logits/rejected": 0.15363694727420807,
"logps/chosen": -1374.277587890625,
"logps/rejected": -2335.11962890625,
"loss": 0.4691,
"rewards/accuracies": 0.512499988079071,
"rewards/chosen": -1.131838083267212,
"rewards/margins": 0.9804404973983765,
"rewards/rejected": -2.112278461456299,
"step": 970
},
{
"epoch": 0.51,
"learning_rate": 2.8212187278611907e-06,
"logits/chosen": 0.3766547739505768,
"logits/rejected": 0.23996052145957947,
"logps/chosen": -978.6238403320312,
"logps/rejected": -1637.2352294921875,
"loss": 0.4666,
"rewards/accuracies": 0.5874999761581421,
"rewards/chosen": -0.6910273432731628,
"rewards/margins": 0.7083319425582886,
"rewards/rejected": -1.3993593454360962,
"step": 980
},
{
"epoch": 0.52,
"learning_rate": 2.7758570727066843e-06,
"logits/chosen": 0.3515971899032593,
"logits/rejected": 0.2718420922756195,
"logps/chosen": -945.19921875,
"logps/rejected": -1549.3182373046875,
"loss": 0.4667,
"rewards/accuracies": 0.543749988079071,
"rewards/chosen": -0.6832455396652222,
"rewards/margins": 0.6422259211540222,
"rewards/rejected": -1.3254715204238892,
"step": 990
},
{
"epoch": 0.52,
"learning_rate": 2.730403283530767e-06,
"logits/chosen": 0.3331068158149719,
"logits/rejected": 0.21990351378917694,
"logps/chosen": -957.8298950195312,
"logps/rejected": -1847.413330078125,
"loss": 0.4687,
"rewards/accuracies": 0.6000000238418579,
"rewards/chosen": -0.6945260167121887,
"rewards/margins": 0.9040031433105469,
"rewards/rejected": -1.5985292196273804,
"step": 1000
},
{
"epoch": 0.53,
"learning_rate": 2.6848725415297888e-06,
"logits/chosen": 0.24949748814105988,
"logits/rejected": 0.1596693992614746,
"logps/chosen": -1084.4876708984375,
"logps/rejected": -1898.144287109375,
"loss": 0.4618,
"rewards/accuracies": 0.59375,
"rewards/chosen": -0.8169393539428711,
"rewards/margins": 0.8545991778373718,
"rewards/rejected": -1.6715381145477295,
"step": 1010
},
{
"epoch": 0.53,
"learning_rate": 2.639280053601719e-06,
"logits/chosen": 0.22901049256324768,
"logits/rejected": 0.1595744788646698,
"logps/chosen": -1491.752197265625,
"logps/rejected": -2144.299560546875,
"loss": 0.4707,
"rewards/accuracies": 0.59375,
"rewards/chosen": -1.2246992588043213,
"rewards/margins": 0.6539346575737,
"rewards/rejected": -1.8786340951919556,
"step": 1020
},
{
"epoch": 0.54,
"learning_rate": 2.59364104726716e-06,
"logits/chosen": 0.31597059965133667,
"logits/rejected": 0.21497178077697754,
"logps/chosen": -1171.93212890625,
"logps/rejected": -1925.6861572265625,
"loss": 0.465,
"rewards/accuracies": 0.5562499761581421,
"rewards/chosen": -0.8903388977050781,
"rewards/margins": 0.8044350743293762,
"rewards/rejected": -1.6947739124298096,
"step": 1030
},
{
"epoch": 0.54,
"learning_rate": 2.547970765583491e-06,
"logits/chosen": 0.35459914803504944,
"logits/rejected": 0.21209494769573212,
"logps/chosen": -1010.3555908203125,
"logps/rejected": -1694.515869140625,
"loss": 0.4642,
"rewards/accuracies": 0.6187499761581421,
"rewards/chosen": -0.7338335514068604,
"rewards/margins": 0.7184348106384277,
"rewards/rejected": -1.452268362045288,
"step": 1040
},
{
"epoch": 0.55,
"learning_rate": 2.502284462053799e-06,
"logits/chosen": 0.2834840416908264,
"logits/rejected": 0.19832350313663483,
"logps/chosen": -1069.5567626953125,
"logps/rejected": -1713.046142578125,
"loss": 0.4653,
"rewards/accuracies": 0.574999988079071,
"rewards/chosen": -0.781063437461853,
"rewards/margins": 0.6842104196548462,
"rewards/rejected": -1.4652738571166992,
"step": 1050
},
{
"epoch": 0.55,
"learning_rate": 2.456597395532338e-06,
"logits/chosen": 0.23369982838630676,
"logits/rejected": 0.15703235566616058,
"logps/chosen": -1476.560546875,
"logps/rejected": -2163.74267578125,
"loss": 0.4708,
"rewards/accuracies": 0.581250011920929,
"rewards/chosen": -1.1821922063827515,
"rewards/margins": 0.7186304330825806,
"rewards/rejected": -1.900822639465332,
"step": 1060
},
{
"epoch": 0.56,
"learning_rate": 2.4109248251281953e-06,
"logits/chosen": 0.2690127491950989,
"logits/rejected": 0.1083533763885498,
"logps/chosen": -1436.783447265625,
"logps/rejected": -2573.591064453125,
"loss": 0.4639,
"rewards/accuracies": 0.6000000238418579,
"rewards/chosen": -1.1651248931884766,
"rewards/margins": 1.1516902446746826,
"rewards/rejected": -2.316815137863159,
"step": 1070
},
{
"epoch": 0.57,
"learning_rate": 2.365282005108875e-06,
"logits/chosen": 0.2598133087158203,
"logits/rejected": 0.17415449023246765,
"logps/chosen": -1348.472412109375,
"logps/rejected": -1934.5325927734375,
"loss": 0.4721,
"rewards/accuracies": 0.5562499761581421,
"rewards/chosen": -1.0647830963134766,
"rewards/margins": 0.6361646056175232,
"rewards/rejected": -1.7009475231170654,
"step": 1080
},
{
"epoch": 0.57,
"learning_rate": 2.319684179805491e-06,
"logits/chosen": 0.28293731808662415,
"logits/rejected": 0.16613037884235382,
"logps/chosen": -1299.3868408203125,
"logps/rejected": -2169.75830078125,
"loss": 0.4726,
"rewards/accuracies": 0.53125,
"rewards/chosen": -1.0253424644470215,
"rewards/margins": 0.8982712626457214,
"rewards/rejected": -1.9236137866973877,
"step": 1090
},
{
"epoch": 0.58,
"learning_rate": 2.2741465785212905e-06,
"logits/chosen": 0.3770299553871155,
"logits/rejected": 0.3206137418746948,
"logps/chosen": -845.8580322265625,
"logps/rejected": -1371.0318603515625,
"loss": 0.4754,
"rewards/accuracies": 0.6000000238418579,
"rewards/chosen": -0.573067843914032,
"rewards/margins": 0.5587201714515686,
"rewards/rejected": -1.1317881345748901,
"step": 1100
},
{
"epoch": 0.58,
"learning_rate": 2.2286844104451848e-06,
"logits/chosen": 0.29950836300849915,
"logits/rejected": 0.2572200298309326,
"logps/chosen": -1225.456298828125,
"logps/rejected": -1701.4114990234375,
"loss": 0.4717,
"rewards/accuracies": 0.574999988079071,
"rewards/chosen": -0.9399654269218445,
"rewards/margins": 0.5411953926086426,
"rewards/rejected": -1.4811608791351318,
"step": 1110
},
{
"epoch": 0.59,
"learning_rate": 2.183312859572008e-06,
"logits/chosen": 0.2056627720594406,
"logits/rejected": 0.13243384659290314,
"logps/chosen": -1311.5948486328125,
"logps/rejected": -2090.031494140625,
"loss": 0.473,
"rewards/accuracies": 0.574999988079071,
"rewards/chosen": -1.03976309299469,
"rewards/margins": 0.8429223895072937,
"rewards/rejected": -1.8826854228973389,
"step": 1120
},
{
"epoch": 0.59,
"learning_rate": 2.1380470796311843e-06,
"logits/chosen": 0.26897698640823364,
"logits/rejected": 0.19322913885116577,
"logps/chosen": -1409.248779296875,
"logps/rejected": -1968.114013671875,
"loss": 0.4624,
"rewards/accuracies": 0.637499988079071,
"rewards/chosen": -1.1417442560195923,
"rewards/margins": 0.591802716255188,
"rewards/rejected": -1.7335469722747803,
"step": 1130
},
{
"epoch": 0.6,
"learning_rate": 2.092902189025507e-06,
"logits/chosen": 0.298466295003891,
"logits/rejected": 0.1567627638578415,
"logps/chosen": -1206.5018310546875,
"logps/rejected": -2206.86767578125,
"loss": 0.4604,
"rewards/accuracies": 0.59375,
"rewards/chosen": -0.9475823640823364,
"rewards/margins": 1.0231791734695435,
"rewards/rejected": -1.9707612991333008,
"step": 1140
},
{
"epoch": 0.6,
"learning_rate": 2.0478932657817105e-06,
"logits/chosen": 0.31211769580841064,
"logits/rejected": 0.1320025771856308,
"logps/chosen": -1475.707275390625,
"logps/rejected": -2485.997802734375,
"loss": 0.4686,
"rewards/accuracies": 0.581250011920929,
"rewards/chosen": -1.1953158378601074,
"rewards/margins": 1.0596123933792114,
"rewards/rejected": -2.2549283504486084,
"step": 1150
},
{
"epoch": 0.61,
"learning_rate": 2.0030353425145376e-06,
"logits/chosen": 0.29154402017593384,
"logits/rejected": 0.20484980940818787,
"logps/chosen": -1307.7490234375,
"logps/rejected": -1891.5804443359375,
"loss": 0.475,
"rewards/accuracies": 0.6000000238418579,
"rewards/chosen": -1.068193793296814,
"rewards/margins": 0.5871996879577637,
"rewards/rejected": -1.6553936004638672,
"step": 1160
},
{
"epoch": 0.61,
"learning_rate": 1.958343401405964e-06,
"logits/chosen": 0.2675972282886505,
"logits/rejected": 0.20726804435253143,
"logps/chosen": -1136.7181396484375,
"logps/rejected": -1507.15234375,
"loss": 0.4705,
"rewards/accuracies": 0.53125,
"rewards/chosen": -0.8827505111694336,
"rewards/margins": 0.3945409953594208,
"rewards/rejected": -1.2772915363311768,
"step": 1170
},
{
"epoch": 0.62,
"learning_rate": 1.9138323692012734e-06,
"logits/chosen": 0.273415207862854,
"logits/rejected": 0.16786028444766998,
"logps/chosen": -1736.076416015625,
"logps/rejected": -2560.149169921875,
"loss": 0.4705,
"rewards/accuracies": 0.5687500238418579,
"rewards/chosen": -1.4262146949768066,
"rewards/margins": 0.8943548202514648,
"rewards/rejected": -2.3205695152282715,
"step": 1180
},
{
"epoch": 0.62,
"learning_rate": 1.8695171122236443e-06,
"logits/chosen": 0.20894399285316467,
"logits/rejected": 0.10228965431451797,
"logps/chosen": -1324.954345703125,
"logps/rejected": -2638.982666015625,
"loss": 0.4668,
"rewards/accuracies": 0.606249988079071,
"rewards/chosen": -1.0546363592147827,
"rewards/margins": 1.3503773212432861,
"rewards/rejected": -2.4050137996673584,
"step": 1190
},
{
"epoch": 0.63,
"learning_rate": 1.8254124314089225e-06,
"logits/chosen": 0.3430663049221039,
"logits/rejected": 0.2673262655735016,
"logps/chosen": -861.5838623046875,
"logps/rejected": -1974.1458740234375,
"loss": 0.4518,
"rewards/accuracies": 0.625,
"rewards/chosen": -0.5944491624832153,
"rewards/margins": 1.1017727851867676,
"rewards/rejected": -1.696221947669983,
"step": 1200
},
{
"epoch": 0.63,
"learning_rate": 1.781533057362221e-06,
"logits/chosen": 0.3156498670578003,
"logits/rejected": 0.185347780585289,
"logps/chosen": -1168.6217041015625,
"logps/rejected": -1924.7115478515625,
"loss": 0.4583,
"rewards/accuracies": 0.59375,
"rewards/chosen": -0.8821272850036621,
"rewards/margins": 0.8161047101020813,
"rewards/rejected": -1.6982319355010986,
"step": 1210
},
{
"epoch": 0.64,
"learning_rate": 1.7378936454380277e-06,
"logits/chosen": 0.36333876848220825,
"logits/rejected": 0.26434630155563354,
"logps/chosen": -1027.678466796875,
"logps/rejected": -1634.684814453125,
"loss": 0.4654,
"rewards/accuracies": 0.581250011920929,
"rewards/chosen": -0.7384849786758423,
"rewards/margins": 0.6454702615737915,
"rewards/rejected": -1.383955478668213,
"step": 1220
},
{
"epoch": 0.64,
"learning_rate": 1.6945087708454273e-06,
"logits/chosen": 0.27189189195632935,
"logits/rejected": 0.18399885296821594,
"logps/chosen": -1334.14990234375,
"logps/rejected": -1880.106201171875,
"loss": 0.4767,
"rewards/accuracies": 0.5062500238418579,
"rewards/chosen": -1.090384840965271,
"rewards/margins": 0.5727940797805786,
"rewards/rejected": -1.66317880153656,
"step": 1230
},
{
"epoch": 0.65,
"learning_rate": 1.651392923780105e-06,
"logits/chosen": 0.4100673794746399,
"logits/rejected": 0.2657643258571625,
"logps/chosen": -1111.376220703125,
"logps/rejected": -1941.037353515625,
"loss": 0.46,
"rewards/accuracies": 0.574999988079071,
"rewards/chosen": -0.8117152452468872,
"rewards/margins": 0.8833802938461304,
"rewards/rejected": -1.695095419883728,
"step": 1240
},
{
"epoch": 0.65,
"learning_rate": 1.608560504584737e-06,
"logits/chosen": 0.301455020904541,
"logits/rejected": 0.22863301634788513,
"logps/chosen": -1159.3509521484375,
"logps/rejected": -2089.1953125,
"loss": 0.4631,
"rewards/accuracies": 0.5562499761581421,
"rewards/chosen": -0.8735660314559937,
"rewards/margins": 0.9659594297409058,
"rewards/rejected": -1.839525580406189,
"step": 1250
},
{
"epoch": 0.66,
"learning_rate": 1.5660258189393945e-06,
"logits/chosen": 0.19146260619163513,
"logits/rejected": 0.14353962242603302,
"logps/chosen": -1484.316650390625,
"logps/rejected": -2343.659423828125,
"loss": 0.4687,
"rewards/accuracies": 0.5625,
"rewards/chosen": -1.2383463382720947,
"rewards/margins": 0.8734383583068848,
"rewards/rejected": -2.1117844581604004,
"step": 1260
},
{
"epoch": 0.66,
"learning_rate": 1.5238030730835578e-06,
"logits/chosen": 0.31026071310043335,
"logits/rejected": 0.19475135207176208,
"logps/chosen": -1738.6246337890625,
"logps/rejected": -2328.933349609375,
"loss": 0.4693,
"rewards/accuracies": 0.5375000238418579,
"rewards/chosen": -1.462491750717163,
"rewards/margins": 0.6261566281318665,
"rewards/rejected": -2.0886483192443848,
"step": 1270
},
{
"epoch": 0.67,
"learning_rate": 1.4819063690713565e-06,
"logits/chosen": 0.26937440037727356,
"logits/rejected": 0.15669001638889313,
"logps/chosen": -1396.046630859375,
"logps/rejected": -2102.262939453125,
"loss": 0.4598,
"rewards/accuracies": 0.5625,
"rewards/chosen": -1.1547179222106934,
"rewards/margins": 0.7373046278953552,
"rewards/rejected": -1.892022728919983,
"step": 1280
},
{
"epoch": 0.68,
"learning_rate": 1.4403497000615885e-06,
"logits/chosen": 0.3091123700141907,
"logits/rejected": 0.204463392496109,
"logps/chosen": -1624.702392578125,
"logps/rejected": -2571.47412109375,
"loss": 0.4654,
"rewards/accuracies": 0.550000011920929,
"rewards/chosen": -1.3447537422180176,
"rewards/margins": 0.9728642702102661,
"rewards/rejected": -2.3176181316375732,
"step": 1290
},
{
"epoch": 0.68,
"learning_rate": 1.3991469456441273e-06,
"logits/chosen": 0.31638103723526,
"logits/rejected": 0.23879094421863556,
"logps/chosen": -1413.901123046875,
"logps/rejected": -2330.88330078125,
"loss": 0.4546,
"rewards/accuracies": 0.637499988079071,
"rewards/chosen": -1.1106030941009521,
"rewards/margins": 0.9651702642440796,
"rewards/rejected": -2.075773239135742,
"step": 1300
},
{
"epoch": 0.69,
"learning_rate": 1.3583118672042441e-06,
"logits/chosen": 0.274738609790802,
"logits/rejected": 0.18945345282554626,
"logps/chosen": -1652.8541259765625,
"logps/rejected": -2093.89990234375,
"loss": 0.4704,
"rewards/accuracies": 0.512499988079071,
"rewards/chosen": -1.371697187423706,
"rewards/margins": 0.5045391917228699,
"rewards/rejected": -1.8762363195419312,
"step": 1310
},
{
"epoch": 0.69,
"learning_rate": 1.3178581033264218e-06,
"logits/chosen": 0.27669447660446167,
"logits/rejected": 0.16615034639835358,
"logps/chosen": -1164.213134765625,
"logps/rejected": -2034.477783203125,
"loss": 0.4566,
"rewards/accuracies": 0.5625,
"rewards/chosen": -0.9321501851081848,
"rewards/margins": 0.888770580291748,
"rewards/rejected": -1.820920705795288,
"step": 1320
},
{
"epoch": 0.7,
"learning_rate": 1.2777991652391757e-06,
"logits/chosen": 0.31228479743003845,
"logits/rejected": 0.21845977008342743,
"logps/chosen": -1202.439697265625,
"logps/rejected": -1930.3785400390625,
"loss": 0.4661,
"rewards/accuracies": 0.550000011920929,
"rewards/chosen": -0.9665547609329224,
"rewards/margins": 0.7526635527610779,
"rewards/rejected": -1.7192184925079346,
"step": 1330
},
{
"epoch": 0.7,
"learning_rate": 1.2381484323024178e-06,
"logits/chosen": 0.35927221179008484,
"logits/rejected": 0.2287793606519699,
"logps/chosen": -1124.3046875,
"logps/rejected": -2056.501708984375,
"loss": 0.4623,
"rewards/accuracies": 0.6312500238418579,
"rewards/chosen": -0.8218294382095337,
"rewards/margins": 0.98065185546875,
"rewards/rejected": -1.8024810552597046,
"step": 1340
},
{
"epoch": 0.71,
"learning_rate": 1.1989191475388518e-06,
"logits/chosen": 0.3698425889015198,
"logits/rejected": 0.2954414486885071,
"logps/chosen": -1166.0611572265625,
"logps/rejected": -1549.630126953125,
"loss": 0.4695,
"rewards/accuracies": 0.5625,
"rewards/chosen": -0.8860333561897278,
"rewards/margins": 0.42343559861183167,
"rewards/rejected": -1.3094689846038818,
"step": 1350
},
{
"epoch": 0.71,
"learning_rate": 1.160124413210918e-06,
"logits/chosen": 0.35506299138069153,
"logits/rejected": 0.2409767210483551,
"logps/chosen": -1092.040283203125,
"logps/rejected": -1912.9970703125,
"loss": 0.4582,
"rewards/accuracies": 0.550000011920929,
"rewards/chosen": -0.802148163318634,
"rewards/margins": 0.8678997755050659,
"rewards/rejected": -1.6700481176376343,
"step": 1360
},
{
"epoch": 0.72,
"learning_rate": 1.1217771864447396e-06,
"logits/chosen": 0.3243677616119385,
"logits/rejected": 0.17696735262870789,
"logps/chosen": -991.9781494140625,
"logps/rejected": -2300.01806640625,
"loss": 0.4563,
"rewards/accuracies": 0.637499988079071,
"rewards/chosen": -0.6990408897399902,
"rewards/margins": 1.3517526388168335,
"rewards/rejected": -2.050793409347534,
"step": 1370
},
{
"epoch": 0.72,
"learning_rate": 1.08389027490255e-06,
"logits/chosen": 0.27917546033859253,
"logits/rejected": 0.13479743897914886,
"logps/chosen": -1405.369384765625,
"logps/rejected": -2042.785888671875,
"loss": 0.4724,
"rewards/accuracies": 0.643750011920929,
"rewards/chosen": -1.150667428970337,
"rewards/margins": 0.6895908713340759,
"rewards/rejected": -1.8402583599090576,
"step": 1380
},
{
"epoch": 0.73,
"learning_rate": 1.046476332505036e-06,
"logits/chosen": 0.3343364894390106,
"logits/rejected": 0.23037847876548767,
"logps/chosen": -1098.7138671875,
"logps/rejected": -2268.34130859375,
"loss": 0.463,
"rewards/accuracies": 0.5874999761581421,
"rewards/chosen": -0.8331190347671509,
"rewards/margins": 1.2149721384048462,
"rewards/rejected": -2.048090934753418,
"step": 1390
},
{
"epoch": 0.73,
"learning_rate": 1.0095478552050348e-06,
"logits/chosen": 0.26566246151924133,
"logits/rejected": 0.2032911777496338,
"logps/chosen": -956.0791015625,
"logps/rejected": -1707.116943359375,
"loss": 0.4575,
"rewards/accuracies": 0.59375,
"rewards/chosen": -0.7228320837020874,
"rewards/margins": 0.797810435295105,
"rewards/rejected": -1.5206425189971924,
"step": 1400
},
{
"epoch": 0.74,
"learning_rate": 9.731171768139808e-07,
"logits/chosen": 0.3556608557701111,
"logits/rejected": 0.28849393129348755,
"logps/chosen": -1063.8748779296875,
"logps/rejected": -1503.830322265625,
"loss": 0.4712,
"rewards/accuracies": 0.59375,
"rewards/chosen": -0.8245790600776672,
"rewards/margins": 0.47296270728111267,
"rewards/rejected": -1.2975417375564575,
"step": 1410
},
{
"epoch": 0.74,
"learning_rate": 9.371964648825221e-07,
"logits/chosen": 0.3505791425704956,
"logits/rejected": 0.22841492295265198,
"logps/chosen": -1045.915283203125,
"logps/rejected": -2175.98876953125,
"loss": 0.4619,
"rewards/accuracies": 0.637499988079071,
"rewards/chosen": -0.7385074496269226,
"rewards/margins": 1.1998847723007202,
"rewards/rejected": -1.9383922815322876,
"step": 1420
},
{
"epoch": 0.75,
"learning_rate": 9.017977166366445e-07,
"logits/chosen": 0.2420744001865387,
"logits/rejected": 0.17516903579235077,
"logps/chosen": -1474.8148193359375,
"logps/rejected": -1958.3011474609375,
"loss": 0.4696,
"rewards/accuracies": 0.581250011920929,
"rewards/chosen": -1.2239506244659424,
"rewards/margins": 0.5244899988174438,
"rewards/rejected": -1.7484405040740967,
"step": 1430
},
{
"epoch": 0.75,
"learning_rate": 8.669327549707096e-07,
"logits/chosen": 0.2893267571926117,
"logits/rejected": 0.18889647722244263,
"logps/chosen": -1455.797607421875,
"logps/rejected": -2097.127685546875,
"loss": 0.467,
"rewards/accuracies": 0.6000000238418579,
"rewards/chosen": -1.1924570798873901,
"rewards/margins": 0.6752533912658691,
"rewards/rejected": -1.8677103519439697,
"step": 1440
},
{
"epoch": 0.76,
"learning_rate": 8.326132244986932e-07,
"logits/chosen": 0.24378347396850586,
"logits/rejected": 0.050407588481903076,
"logps/chosen": -1516.9559326171875,
"logps/rejected": -2759.12353515625,
"loss": 0.4595,
"rewards/accuracies": 0.581250011920929,
"rewards/chosen": -1.2377126216888428,
"rewards/margins": 1.2976287603378296,
"rewards/rejected": -2.535341262817383,
"step": 1450
},
{
"epoch": 0.76,
"learning_rate": 7.988505876649863e-07,
"logits/chosen": 0.2632651925086975,
"logits/rejected": 0.18519091606140137,
"logps/chosen": -1460.3145751953125,
"logps/rejected": -1786.594482421875,
"loss": 0.4719,
"rewards/accuracies": 0.53125,
"rewards/chosen": -1.1815235614776611,
"rewards/margins": 0.385366290807724,
"rewards/rejected": -1.566890001296997,
"step": 1460
},
{
"epoch": 0.77,
"learning_rate": 7.656561209160248e-07,
"logits/chosen": 0.2761257290840149,
"logits/rejected": 0.16277745366096497,
"logps/chosen": -1284.623291015625,
"logps/rejected": -2513.677734375,
"loss": 0.4624,
"rewards/accuracies": 0.612500011920929,
"rewards/chosen": -0.9854960441589355,
"rewards/margins": 1.282775640487671,
"rewards/rejected": -2.2682716846466064,
"step": 1470
},
{
"epoch": 0.77,
"learning_rate": 7.330409109340563e-07,
"logits/chosen": 0.2461864948272705,
"logits/rejected": 0.16639626026153564,
"logps/chosen": -1436.0191650390625,
"logps/rejected": -2242.687744140625,
"loss": 0.4672,
"rewards/accuracies": 0.5687500238418579,
"rewards/chosen": -1.1464489698410034,
"rewards/margins": 0.8480439186096191,
"rewards/rejected": -1.9944928884506226,
"step": 1480
},
{
"epoch": 0.78,
"learning_rate": 7.010158509342682e-07,
"logits/chosen": 0.2478228360414505,
"logits/rejected": 0.1436949521303177,
"logps/chosen": -1367.794921875,
"logps/rejected": -2203.85107421875,
"loss": 0.4721,
"rewards/accuracies": 0.5249999761581421,
"rewards/chosen": -1.1351174116134644,
"rewards/margins": 0.8538748621940613,
"rewards/rejected": -1.9889923334121704,
"step": 1490
},
{
"epoch": 0.79,
"learning_rate": 6.695916370265529e-07,
"logits/chosen": 0.27428361773490906,
"logits/rejected": 0.18057170510292053,
"logps/chosen": -1440.5146484375,
"logps/rejected": -2161.803466796875,
"loss": 0.4628,
"rewards/accuracies": 0.5874999761581421,
"rewards/chosen": -1.1880239248275757,
"rewards/margins": 0.7238161563873291,
"rewards/rejected": -1.9118402004241943,
"step": 1500
},
{
"epoch": 0.79,
"learning_rate": 6.387787646430854e-07,
"logits/chosen": 0.25450989603996277,
"logits/rejected": 0.1020331159234047,
"logps/chosen": -1392.365966796875,
"logps/rejected": -2656.48974609375,
"loss": 0.4575,
"rewards/accuracies": 0.637499988079071,
"rewards/chosen": -1.118877649307251,
"rewards/margins": 1.2837135791778564,
"rewards/rejected": -2.4025912284851074,
"step": 1510
},
{
"epoch": 0.8,
"learning_rate": 6.085875250329401e-07,
"logits/chosen": 0.3250389099121094,
"logits/rejected": 0.23088189959526062,
"logps/chosen": -1277.065673828125,
"logps/rejected": -2237.5419921875,
"loss": 0.4588,
"rewards/accuracies": 0.637499988079071,
"rewards/chosen": -0.98078453540802,
"rewards/margins": 1.0101826190948486,
"rewards/rejected": -1.9909673929214478,
"step": 1520
},
{
"epoch": 0.8,
"learning_rate": 5.79028001824894e-07,
"logits/chosen": 0.34990447759628296,
"logits/rejected": 0.1642770618200302,
"logps/chosen": -1346.687744140625,
"logps/rejected": -3187.396484375,
"loss": 0.4642,
"rewards/accuracies": 0.7250000238418579,
"rewards/chosen": -1.0415836572647095,
"rewards/margins": 1.8741792440414429,
"rewards/rejected": -2.9157626628875732,
"step": 1530
},
{
"epoch": 0.81,
"learning_rate": 5.501100676595761e-07,
"logits/chosen": 0.2536852955818176,
"logits/rejected": 0.1401246041059494,
"logps/chosen": -1562.163818359375,
"logps/rejected": -2294.75732421875,
"loss": 0.4614,
"rewards/accuracies": 0.625,
"rewards/chosen": -1.2489855289459229,
"rewards/margins": 0.7926613092422485,
"rewards/rejected": -2.041646957397461,
"step": 1540
},
{
"epoch": 0.81,
"learning_rate": 5.218433808920884e-07,
"logits/chosen": 0.2926151752471924,
"logits/rejected": 0.09962544590234756,
"logps/chosen": -1433.572509765625,
"logps/rejected": -2299.615478515625,
"loss": 0.4524,
"rewards/accuracies": 0.5687500238418579,
"rewards/chosen": -1.1709363460540771,
"rewards/margins": 0.9143635630607605,
"rewards/rejected": -2.0853002071380615,
"step": 1550
},
{
"epoch": 0.82,
"learning_rate": 4.942373823661928e-07,
"logits/chosen": 0.23216836154460907,
"logits/rejected": 0.19754758477210999,
"logps/chosen": -1521.939208984375,
"logps/rejected": -2178.3291015625,
"loss": 0.4698,
"rewards/accuracies": 0.5874999761581421,
"rewards/chosen": -1.2342188358306885,
"rewards/margins": 0.6809908151626587,
"rewards/rejected": -1.9152095317840576,
"step": 1560
},
{
"epoch": 0.82,
"learning_rate": 4.6730129226114363e-07,
"logits/chosen": 0.19226306676864624,
"logits/rejected": 0.13501006364822388,
"logps/chosen": -1532.320068359375,
"logps/rejected": -2355.093505859375,
"loss": 0.4712,
"rewards/accuracies": 0.574999988079071,
"rewards/chosen": -1.2820327281951904,
"rewards/margins": 0.8527010679244995,
"rewards/rejected": -2.1347339153289795,
"step": 1570
},
{
"epoch": 0.83,
"learning_rate": 4.4104410701222703e-07,
"logits/chosen": 0.15366807579994202,
"logits/rejected": 0.11835174262523651,
"logps/chosen": -1608.6761474609375,
"logps/rejected": -2489.91455078125,
"loss": 0.469,
"rewards/accuracies": 0.5687500238418579,
"rewards/chosen": -1.338335394859314,
"rewards/margins": 0.8876321911811829,
"rewards/rejected": -2.2259676456451416,
"step": 1580
},
{
"epoch": 0.83,
"learning_rate": 4.154745963060197e-07,
"logits/chosen": 0.21381524205207825,
"logits/rejected": 0.0645713359117508,
"logps/chosen": -1354.0247802734375,
"logps/rejected": -2909.98828125,
"loss": 0.4559,
"rewards/accuracies": 0.637499988079071,
"rewards/chosen": -1.0845643281936646,
"rewards/margins": 1.571176290512085,
"rewards/rejected": -2.655740737915039,
"step": 1590
},
{
"epoch": 0.84,
"learning_rate": 3.9060130015138863e-07,
"logits/chosen": 0.25924235582351685,
"logits/rejected": 0.1109732836484909,
"logps/chosen": -1437.39501953125,
"logps/rejected": -2759.584228515625,
"loss": 0.4559,
"rewards/accuracies": 0.65625,
"rewards/chosen": -1.1647388935089111,
"rewards/margins": 1.3532658815383911,
"rewards/rejected": -2.5180046558380127,
"step": 1600
},
{
"epoch": 0.84,
"learning_rate": 3.664325260271953e-07,
"logits/chosen": 0.22887060046195984,
"logits/rejected": 0.09053263813257217,
"logps/chosen": -1473.260009765625,
"logps/rejected": -1995.0269775390625,
"loss": 0.4712,
"rewards/accuracies": 0.48750001192092896,
"rewards/chosen": -1.2130258083343506,
"rewards/margins": 0.5937215089797974,
"rewards/rejected": -1.8067471981048584,
"step": 1610
},
{
"epoch": 0.85,
"learning_rate": 3.429763461076677e-07,
"logits/chosen": 0.1899276226758957,
"logits/rejected": 0.12356813251972198,
"logps/chosen": -1743.064453125,
"logps/rejected": -2304.783203125,
"loss": 0.4677,
"rewards/accuracies": 0.574999988079071,
"rewards/chosen": -1.4444160461425781,
"rewards/margins": 0.6152055859565735,
"rewards/rejected": -2.059621572494507,
"step": 1620
},
{
"epoch": 0.85,
"learning_rate": 3.202405945663556e-07,
"logits/chosen": 0.22914421558380127,
"logits/rejected": 0.09422020614147186,
"logps/chosen": -1509.6998291015625,
"logps/rejected": -2195.837646484375,
"loss": 0.4638,
"rewards/accuracies": 0.4937500059604645,
"rewards/chosen": -1.2225978374481201,
"rewards/margins": 0.7319514155387878,
"rewards/rejected": -1.9545494318008423,
"step": 1630
},
{
"epoch": 0.86,
"learning_rate": 2.982328649595856e-07,
"logits/chosen": 0.24722608923912048,
"logits/rejected": 0.10591373592615128,
"logps/chosen": -1233.9052734375,
"logps/rejected": -2268.322509765625,
"loss": 0.4653,
"rewards/accuracies": 0.59375,
"rewards/chosen": -0.9642356634140015,
"rewards/margins": 1.0845736265182495,
"rewards/rejected": -2.04880952835083,
"step": 1640
},
{
"epoch": 0.86,
"learning_rate": 2.7696050769026954e-07,
"logits/chosen": 0.21008674800395966,
"logits/rejected": 0.05934596806764603,
"logps/chosen": -1442.0106201171875,
"logps/rejected": -2874.48388671875,
"loss": 0.4615,
"rewards/accuracies": 0.6187499761581421,
"rewards/chosen": -1.2054154872894287,
"rewards/margins": 1.4088830947875977,
"rewards/rejected": -2.6142985820770264,
"step": 1650
},
{
"epoch": 0.87,
"learning_rate": 2.564306275529341e-07,
"logits/chosen": 0.18529877066612244,
"logits/rejected": 0.12559422850608826,
"logps/chosen": -1704.0299072265625,
"logps/rejected": -2808.08349609375,
"loss": 0.4591,
"rewards/accuracies": 0.5874999761581421,
"rewards/chosen": -1.4349021911621094,
"rewards/margins": 1.1065049171447754,
"rewards/rejected": -2.5414071083068848,
"step": 1660
},
{
"epoch": 0.87,
"learning_rate": 2.3665008136077332e-07,
"logits/chosen": 0.19881121814250946,
"logits/rejected": 0.17202343046665192,
"logps/chosen": -1710.2633056640625,
"logps/rejected": -1984.5556640625,
"loss": 0.473,
"rewards/accuracies": 0.574999988079071,
"rewards/chosen": -1.3971118927001953,
"rewards/margins": 0.33112001419067383,
"rewards/rejected": -1.7282320261001587,
"step": 1670
},
{
"epoch": 0.88,
"learning_rate": 2.1762547565553293e-07,
"logits/chosen": 0.17657816410064697,
"logits/rejected": 0.11265295743942261,
"logps/chosen": -1725.0482177734375,
"logps/rejected": -1982.76953125,
"loss": 0.466,
"rewards/accuracies": 0.4937500059604645,
"rewards/chosen": -1.5021684169769287,
"rewards/margins": 0.2558698058128357,
"rewards/rejected": -1.7580381631851196,
"step": 1680
},
{
"epoch": 0.88,
"learning_rate": 1.993631645009747e-07,
"logits/chosen": 0.19522444903850555,
"logits/rejected": 0.058800529688596725,
"logps/chosen": -1578.4208984375,
"logps/rejected": -2554.65185546875,
"loss": 0.4675,
"rewards/accuracies": 0.606249988079071,
"rewards/chosen": -1.2845125198364258,
"rewards/margins": 1.006216049194336,
"rewards/rejected": -2.290728807449341,
"step": 1690
},
{
"epoch": 0.89,
"learning_rate": 1.818692473606748e-07,
"logits/chosen": 0.2271948605775833,
"logits/rejected": 0.18108686804771423,
"logps/chosen": -1478.1927490234375,
"logps/rejected": -2156.734375,
"loss": 0.4747,
"rewards/accuracies": 0.53125,
"rewards/chosen": -1.2414519786834717,
"rewards/margins": 0.7026728391647339,
"rewards/rejected": -1.9441248178482056,
"step": 1700
},
{
"epoch": 0.9,
"learning_rate": 1.6514956706084885e-07,
"logits/chosen": 0.23735575377941132,
"logits/rejected": 0.11482490599155426,
"logps/chosen": -1801.324462890625,
"logps/rejected": -2704.887939453125,
"loss": 0.4736,
"rewards/accuracies": 0.59375,
"rewards/chosen": -1.552073359489441,
"rewards/margins": 0.8944045901298523,
"rewards/rejected": -2.4464781284332275,
"step": 1710
},
{
"epoch": 0.9,
"learning_rate": 1.4920970783889737e-07,
"logits/chosen": 0.22280173003673553,
"logits/rejected": 0.11919368803501129,
"logps/chosen": -1566.2589111328125,
"logps/rejected": -2471.8125,
"loss": 0.4684,
"rewards/accuracies": 0.5625,
"rewards/chosen": -1.2630040645599365,
"rewards/margins": 0.966931164264679,
"rewards/rejected": -2.229935646057129,
"step": 1720
},
{
"epoch": 0.91,
"learning_rate": 1.340549934783164e-07,
"logits/chosen": 0.2689998745918274,
"logits/rejected": 0.12245997041463852,
"logps/chosen": -1098.0716552734375,
"logps/rejected": -2332.2578125,
"loss": 0.4659,
"rewards/accuracies": 0.6187499761581421,
"rewards/chosen": -0.8609301447868347,
"rewards/margins": 1.2441167831420898,
"rewards/rejected": -2.1050469875335693,
"step": 1730
},
{
"epoch": 0.91,
"learning_rate": 1.196904855305961e-07,
"logits/chosen": 0.2383730709552765,
"logits/rejected": 0.15037932991981506,
"logps/chosen": -1544.904052734375,
"logps/rejected": -2499.219482421875,
"loss": 0.4561,
"rewards/accuracies": 0.5874999761581421,
"rewards/chosen": -1.2684749364852905,
"rewards/margins": 0.9886786341667175,
"rewards/rejected": -2.2571537494659424,
"step": 1740
},
{
"epoch": 0.92,
"learning_rate": 1.0612098162470302e-07,
"logits/chosen": 0.20837631821632385,
"logits/rejected": 0.1260487288236618,
"logps/chosen": -1376.4371337890625,
"logps/rejected": -2311.586669921875,
"loss": 0.4467,
"rewards/accuracies": 0.612500011920929,
"rewards/chosen": -1.126542329788208,
"rewards/margins": 0.9472710490226746,
"rewards/rejected": -2.0738134384155273,
"step": 1750
},
{
"epoch": 0.92,
"learning_rate": 9.335101386471285e-08,
"logits/chosen": 0.2322504222393036,
"logits/rejected": 0.06627029925584793,
"logps/chosen": -1435.283447265625,
"logps/rejected": -2674.15576171875,
"loss": 0.4715,
"rewards/accuracies": 0.581250011920929,
"rewards/chosen": -1.151064157485962,
"rewards/margins": 1.2704349756240845,
"rewards/rejected": -2.421499252319336,
"step": 1760
},
{
"epoch": 0.93,
"learning_rate": 8.138484731612273e-08,
"logits/chosen": 0.2155352383852005,
"logits/rejected": 0.12622274458408356,
"logps/chosen": -1182.394287109375,
"logps/rejected": -2245.24462890625,
"loss": 0.4629,
"rewards/accuracies": 0.550000011920929,
"rewards/chosen": -0.9620729684829712,
"rewards/margins": 1.0421679019927979,
"rewards/rejected": -2.0042405128479004,
"step": 1770
},
{
"epoch": 0.93,
"learning_rate": 7.022647858135501e-08,
"logits/chosen": 0.30309510231018066,
"logits/rejected": 0.18017789721488953,
"logps/chosen": -1599.3291015625,
"logps/rejected": -2475.38720703125,
"loss": 0.465,
"rewards/accuracies": 0.5687500238418579,
"rewards/chosen": -1.3132779598236084,
"rewards/margins": 0.8995476961135864,
"rewards/rejected": -2.2128255367279053,
"step": 1780
},
{
"epoch": 0.94,
"learning_rate": 5.987963446492384e-08,
"logits/chosen": 0.23334476351737976,
"logits/rejected": 0.17126549780368805,
"logps/chosen": -1491.8856201171875,
"logps/rejected": -2064.55078125,
"loss": 0.4679,
"rewards/accuracies": 0.574999988079071,
"rewards/chosen": -1.2208709716796875,
"rewards/margins": 0.6093058586120605,
"rewards/rejected": -1.8301767110824585,
"step": 1790
},
{
"epoch": 0.94,
"learning_rate": 5.034777072871394e-08,
"logits/chosen": 0.23894283175468445,
"logits/rejected": 0.16225464642047882,
"logps/chosen": -1209.31494140625,
"logps/rejected": -1923.9974365234375,
"loss": 0.4748,
"rewards/accuracies": 0.5874999761581421,
"rewards/chosen": -0.9387443661689758,
"rewards/margins": 0.7473281621932983,
"rewards/rejected": -1.6860727071762085,
"step": 1800
},
{
"epoch": 0.95,
"learning_rate": 4.163407093778243e-08,
"logits/chosen": 0.30054157972335815,
"logits/rejected": 0.17386284470558167,
"logps/chosen": -1040.991455078125,
"logps/rejected": -2445.29541015625,
"loss": 0.4516,
"rewards/accuracies": 0.65625,
"rewards/chosen": -0.7577398419380188,
"rewards/margins": 1.4329960346221924,
"rewards/rejected": -2.1907360553741455,
"step": 1810
},
{
"epoch": 0.95,
"learning_rate": 3.37414453970758e-08,
"logits/chosen": 0.303236186504364,
"logits/rejected": 0.1971709430217743,
"logps/chosen": -1248.239501953125,
"logps/rejected": -2541.384033203125,
"loss": 0.4512,
"rewards/accuracies": 0.6312500238418579,
"rewards/chosen": -0.9271729588508606,
"rewards/margins": 1.3627898693084717,
"rewards/rejected": -2.2899627685546875,
"step": 1820
},
{
"epoch": 0.96,
"learning_rate": 2.6672530179410183e-08,
"logits/chosen": 0.25464674830436707,
"logits/rejected": 0.13462017476558685,
"logps/chosen": -1484.759521484375,
"logps/rejected": -2381.3857421875,
"loss": 0.4582,
"rewards/accuracies": 0.6499999761581421,
"rewards/chosen": -1.2104871273040771,
"rewards/margins": 0.955074667930603,
"rewards/rejected": -2.165562152862549,
"step": 1830
},
{
"epoch": 0.96,
"learning_rate": 2.04296862450451e-08,
"logits/chosen": 0.34345191717147827,
"logits/rejected": 0.1766502857208252,
"logps/chosen": -1336.6195068359375,
"logps/rejected": -2531.597412109375,
"loss": 0.4675,
"rewards/accuracies": 0.574999988079071,
"rewards/chosen": -1.0585607290267944,
"rewards/margins": 1.2248413562774658,
"rewards/rejected": -2.28340220451355,
"step": 1840
},
{
"epoch": 0.97,
"learning_rate": 1.501499865314171e-08,
"logits/chosen": 0.31596893072128296,
"logits/rejected": 0.17752663791179657,
"logps/chosen": -1208.4625244140625,
"logps/rejected": -2460.017578125,
"loss": 0.4534,
"rewards/accuracies": 0.606249988079071,
"rewards/chosen": -0.9176605939865112,
"rewards/margins": 1.2910888195037842,
"rewards/rejected": -2.208749294281006,
"step": 1850
},
{
"epoch": 0.97,
"learning_rate": 1.0430275865371265e-08,
"logits/chosen": 0.30796024203300476,
"logits/rejected": 0.15131710469722748,
"logps/chosen": -1164.2542724609375,
"logps/rejected": -2230.33056640625,
"loss": 0.4555,
"rewards/accuracies": 0.643750011920929,
"rewards/chosen": -0.8924150466918945,
"rewards/margins": 1.0938717126846313,
"rewards/rejected": -1.9862868785858154,
"step": 1860
},
{
"epoch": 0.98,
"learning_rate": 6.677049141901315e-09,
"logits/chosen": 0.26449787616729736,
"logits/rejected": 0.12270595878362656,
"logps/chosen": -1493.645263671875,
"logps/rejected": -2633.17626953125,
"loss": 0.4614,
"rewards/accuracies": 0.581250011920929,
"rewards/chosen": -1.2356170415878296,
"rewards/margins": 1.1605656147003174,
"rewards/rejected": -2.3961825370788574,
"step": 1870
},
{
"epoch": 0.98,
"learning_rate": 3.756572029968708e-09,
"logits/chosen": 0.23211045563220978,
"logits/rejected": 0.13400281965732574,
"logps/chosen": -1511.829345703125,
"logps/rejected": -2489.31494140625,
"loss": 0.4594,
"rewards/accuracies": 0.6187499761581421,
"rewards/chosen": -1.2328951358795166,
"rewards/margins": 1.0022671222686768,
"rewards/rejected": -2.2351622581481934,
"step": 1880
},
{
"epoch": 0.99,
"learning_rate": 1.6698199452053199e-09,
"logits/chosen": 0.19983918964862823,
"logits/rejected": 0.11516892910003662,
"logps/chosen": -1396.664306640625,
"logps/rejected": -2378.28857421875,
"loss": 0.4543,
"rewards/accuracies": 0.606249988079071,
"rewards/chosen": -1.1454143524169922,
"rewards/margins": 1.0044304132461548,
"rewards/rejected": -2.1498446464538574,
"step": 1890
},
{
"epoch": 0.99,
"learning_rate": 4.1748984585560094e-10,
"logits/chosen": 0.2773471474647522,
"logits/rejected": 0.1175018697977066,
"logps/chosen": -1402.577392578125,
"logps/rejected": -2661.568603515625,
"loss": 0.4649,
"rewards/accuracies": 0.581250011920929,
"rewards/chosen": -1.132912039756775,
"rewards/margins": 1.262406587600708,
"rewards/rejected": -2.3953185081481934,
"step": 1900
},
{
"epoch": 1.0,
"learning_rate": 0.0,
"logits/chosen": 0.21633613109588623,
"logits/rejected": 0.111175537109375,
"logps/chosen": -1688.2955322265625,
"logps/rejected": -2615.567626953125,
"loss": 0.4707,
"rewards/accuracies": 0.5062500238418579,
"rewards/chosen": -1.4329384565353394,
"rewards/margins": 0.9476302862167358,
"rewards/rejected": -2.380568504333496,
"step": 1910
},
{
"epoch": 1.0,
"step": 1910,
"total_flos": 0.0,
"train_loss": 0.09940854217369519,
"train_runtime": 5146.4957,
"train_samples_per_second": 11.879,
"train_steps_per_second": 0.371
}
],
"logging_steps": 10,
"max_steps": 1910,
"num_input_tokens_seen": 0,
"num_train_epochs": 1,
"save_steps": 20,
"total_flos": 0.0,
"train_batch_size": 4,
"trial_name": null,
"trial_params": null
}