Meta-Llama-3-8B-Base-MI-1e-6 / trainer_state.json
{
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 0.998691442030882,
"eval_steps": 500,
"global_step": 477,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.010468463752944255,
"grad_norm": 31.324190504537746,
"learning_rate": 1.0416666666666667e-07,
"logits/chosen": -0.49775856733322144,
"logits/rejected": -0.5134874582290649,
"logps/chosen": -1.1746575832366943,
"logps/rejected": -1.3592634201049805,
"loss": 2.1738,
"rewards/accuracies": 0.5625,
"rewards/chosen": -1.1746575832366943,
"rewards/margins": 0.18460586667060852,
"rewards/rejected": -1.3592634201049805,
"step": 5
},
{
"epoch": 0.02093692750588851,
"grad_norm": 17.522763098577006,
"learning_rate": 2.0833333333333333e-07,
"logits/chosen": -0.5211091637611389,
"logits/rejected": -0.49808019399642944,
"logps/chosen": -1.1585900783538818,
"logps/rejected": -1.2622541189193726,
"loss": 2.1407,
"rewards/accuracies": 0.5,
"rewards/chosen": -1.1585900783538818,
"rewards/margins": 0.10366388410329819,
"rewards/rejected": -1.2622541189193726,
"step": 10
},
{
"epoch": 0.031405391258832765,
"grad_norm": 25.192278194697494,
"learning_rate": 3.1249999999999997e-07,
"logits/chosen": -0.461596816778183,
"logits/rejected": -0.45038098096847534,
"logps/chosen": -1.1062204837799072,
"logps/rejected": -1.3620827198028564,
"loss": 2.1074,
"rewards/accuracies": 0.65625,
"rewards/chosen": -1.1062204837799072,
"rewards/margins": 0.255862295627594,
"rewards/rejected": -1.3620827198028564,
"step": 15
},
{
"epoch": 0.04187385501177702,
"grad_norm": 44.544789847879194,
"learning_rate": 4.1666666666666667e-07,
"logits/chosen": -0.4408242106437683,
"logits/rejected": -0.45246267318725586,
"logps/chosen": -1.1579445600509644,
"logps/rejected": -1.2627536058425903,
"loss": 2.1651,
"rewards/accuracies": 0.5625,
"rewards/chosen": -1.1579445600509644,
"rewards/margins": 0.10480908304452896,
"rewards/rejected": -1.2627536058425903,
"step": 20
},
{
"epoch": 0.05234231876472128,
"grad_norm": 11.346692540130856,
"learning_rate": 5.208333333333334e-07,
"logits/chosen": -0.5032289028167725,
"logits/rejected": -0.4789913296699524,
"logps/chosen": -1.166441559791565,
"logps/rejected": -1.2368651628494263,
"loss": 2.1373,
"rewards/accuracies": 0.5249999761581421,
"rewards/chosen": -1.166441559791565,
"rewards/margins": 0.07042353600263596,
"rewards/rejected": -1.2368651628494263,
"step": 25
},
{
"epoch": 0.06281078251766553,
"grad_norm": 28.570034370144306,
"learning_rate": 6.249999999999999e-07,
"logits/chosen": -0.49172288179397583,
"logits/rejected": -0.4948248267173767,
"logps/chosen": -1.1403913497924805,
"logps/rejected": -1.275451898574829,
"loss": 2.163,
"rewards/accuracies": 0.518750011920929,
"rewards/chosen": -1.1403913497924805,
"rewards/margins": 0.13506053388118744,
"rewards/rejected": -1.275451898574829,
"step": 30
},
{
"epoch": 0.07327924627060979,
"grad_norm": 19.91642226793408,
"learning_rate": 7.291666666666666e-07,
"logits/chosen": -0.47831740975379944,
"logits/rejected": -0.4338778853416443,
"logps/chosen": -1.1529806852340698,
"logps/rejected": -1.3276116847991943,
"loss": 2.1154,
"rewards/accuracies": 0.5562499761581421,
"rewards/chosen": -1.1529806852340698,
"rewards/margins": 0.1746309995651245,
"rewards/rejected": -1.3276116847991943,
"step": 35
},
{
"epoch": 0.08374771002355404,
"grad_norm": 26.52326580399366,
"learning_rate": 8.333333333333333e-07,
"logits/chosen": -0.4782256484031677,
"logits/rejected": -0.4668501019477844,
"logps/chosen": -1.108135461807251,
"logps/rejected": -1.4614675045013428,
"loss": 2.0666,
"rewards/accuracies": 0.643750011920929,
"rewards/chosen": -1.108135461807251,
"rewards/margins": 0.353331983089447,
"rewards/rejected": -1.4614675045013428,
"step": 40
},
{
"epoch": 0.0942161737764983,
"grad_norm": 13.796799671660693,
"learning_rate": 9.374999999999999e-07,
"logits/chosen": -0.44356870651245117,
"logits/rejected": -0.4471743702888489,
"logps/chosen": -1.0965029001235962,
"logps/rejected": -1.3664577007293701,
"loss": 2.0864,
"rewards/accuracies": 0.574999988079071,
"rewards/chosen": -1.0965029001235962,
"rewards/margins": 0.26995497941970825,
"rewards/rejected": -1.3664577007293701,
"step": 45
},
{
"epoch": 0.10468463752944256,
"grad_norm": 30.371297005919416,
"learning_rate": 9.999463737538052e-07,
"logits/chosen": -0.461489200592041,
"logits/rejected": -0.4655645489692688,
"logps/chosen": -1.1575626134872437,
"logps/rejected": -1.4973771572113037,
"loss": 2.1199,
"rewards/accuracies": 0.612500011920929,
"rewards/chosen": -1.1575626134872437,
"rewards/margins": 0.3398147225379944,
"rewards/rejected": -1.4973771572113037,
"step": 50
},
{
"epoch": 0.11515310128238682,
"grad_norm": 26.67718500476433,
"learning_rate": 9.993432105822034e-07,
"logits/chosen": -0.4001489281654358,
"logits/rejected": -0.37682315707206726,
"logps/chosen": -1.1248127222061157,
"logps/rejected": -1.4001871347427368,
"loss": 2.0897,
"rewards/accuracies": 0.5625,
"rewards/chosen": -1.1248127222061157,
"rewards/margins": 0.27537447214126587,
"rewards/rejected": -1.4001871347427368,
"step": 55
},
{
"epoch": 0.12562156503533106,
"grad_norm": 15.812441875154704,
"learning_rate": 9.980706626858607e-07,
"logits/chosen": -0.43878427147865295,
"logits/rejected": -0.4231850504875183,
"logps/chosen": -1.2165329456329346,
"logps/rejected": -1.3715764284133911,
"loss": 2.0665,
"rewards/accuracies": 0.59375,
"rewards/chosen": -1.2165329456329346,
"rewards/margins": 0.1550435572862625,
"rewards/rejected": -1.3715764284133911,
"step": 60
},
{
"epoch": 0.1360900287882753,
"grad_norm": 32.69892893599103,
"learning_rate": 9.961304359538434e-07,
"logits/chosen": -0.38188332319259644,
"logits/rejected": -0.30855393409729004,
"logps/chosen": -1.1145586967468262,
"logps/rejected": -1.7429344654083252,
"loss": 2.0414,
"rewards/accuracies": 0.6937500238418579,
"rewards/chosen": -1.1145586967468262,
"rewards/margins": 0.6283758878707886,
"rewards/rejected": -1.7429344654083252,
"step": 65
},
{
"epoch": 0.14655849254121958,
"grad_norm": 44.90817025126785,
"learning_rate": 9.935251313189563e-07,
"logits/chosen": -0.27111151814460754,
"logits/rejected": -0.24608612060546875,
"logps/chosen": -1.1660597324371338,
"logps/rejected": -1.5309925079345703,
"loss": 2.0234,
"rewards/accuracies": 0.574999988079071,
"rewards/chosen": -1.1660597324371338,
"rewards/margins": 0.3649328947067261,
"rewards/rejected": -1.5309925079345703,
"step": 70
},
{
"epoch": 0.15702695629416383,
"grad_norm": 38.268073195027156,
"learning_rate": 9.902582412711118e-07,
"logits/chosen": -0.28683459758758545,
"logits/rejected": -0.25514599680900574,
"logps/chosen": -1.1409043073654175,
"logps/rejected": -1.5740129947662354,
"loss": 2.0488,
"rewards/accuracies": 0.625,
"rewards/chosen": -1.1409043073654175,
"rewards/margins": 0.4331088066101074,
"rewards/rejected": -1.5740129947662354,
"step": 75
},
{
"epoch": 0.16749542004710807,
"grad_norm": 23.626638109106274,
"learning_rate": 9.86334145175542e-07,
"logits/chosen": -0.40040236711502075,
"logits/rejected": -0.3598732650279999,
"logps/chosen": -1.1197240352630615,
"logps/rejected": -1.6543350219726562,
"loss": 2.0889,
"rewards/accuracies": 0.6312500238418579,
"rewards/chosen": -1.1197240352630615,
"rewards/margins": 0.5346111059188843,
"rewards/rejected": -1.6543350219726562,
"step": 80
},
{
"epoch": 0.17796388380005235,
"grad_norm": 21.67675055841775,
"learning_rate": 9.817581034021272e-07,
"logits/chosen": -0.4968738555908203,
"logits/rejected": -0.4568953514099121,
"logps/chosen": -1.1042544841766357,
"logps/rejected": -1.4778095483779907,
"loss": 2.0732,
"rewards/accuracies": 0.606249988079071,
"rewards/chosen": -1.1042544841766357,
"rewards/margins": 0.37355509400367737,
"rewards/rejected": -1.4778095483779907,
"step": 85
},
{
"epoch": 0.1884323475529966,
"grad_norm": 32.61370646153053,
"learning_rate": 9.765362502737097e-07,
"logits/chosen": -0.4779502749443054,
"logits/rejected": -0.44491392374038696,
"logps/chosen": -1.144523024559021,
"logps/rejected": -1.4939491748809814,
"loss": 2.0171,
"rewards/accuracies": 0.6499999761581421,
"rewards/chosen": -1.144523024559021,
"rewards/margins": 0.3494262099266052,
"rewards/rejected": -1.4939491748809814,
"step": 90
},
{
"epoch": 0.19890081130594087,
"grad_norm": 29.010011255606027,
"learning_rate": 9.706755858428485e-07,
"logits/chosen": -0.4942244589328766,
"logits/rejected": -0.39027169346809387,
"logps/chosen": -1.2216947078704834,
"logps/rejected": -1.6423091888427734,
"loss": 2.0511,
"rewards/accuracies": 0.59375,
"rewards/chosen": -1.2216947078704834,
"rewards/margins": 0.4206143319606781,
"rewards/rejected": -1.6423091888427734,
"step": 95
},
{
"epoch": 0.2093692750588851,
"grad_norm": 25.050588840086288,
"learning_rate": 9.641839665080363e-07,
"logits/chosen": -0.46108850836753845,
"logits/rejected": -0.423541396856308,
"logps/chosen": -1.1832860708236694,
"logps/rejected": -1.7398521900177002,
"loss": 2.0554,
"rewards/accuracies": 0.5874999761581421,
"rewards/chosen": -1.1832860708236694,
"rewards/margins": 0.5565661787986755,
"rewards/rejected": -1.7398521900177002,
"step": 100
},
{
"epoch": 0.21983773881182936,
"grad_norm": 76.09509812922548,
"learning_rate": 9.570700944819582e-07,
"logits/chosen": -0.48844489455223083,
"logits/rejected": -0.47664815187454224,
"logps/chosen": -1.065321683883667,
"logps/rejected": -1.5008853673934937,
"loss": 2.0306,
"rewards/accuracies": 0.6499999761581421,
"rewards/chosen": -1.065321683883667,
"rewards/margins": 0.4355636537075043,
"rewards/rejected": -1.5008853673934937,
"step": 105
},
{
"epoch": 0.23030620256477363,
"grad_norm": 87.9539848283412,
"learning_rate": 9.493435061259129e-07,
"logits/chosen": -0.5218511819839478,
"logits/rejected": -0.49293455481529236,
"logps/chosen": -1.0804827213287354,
"logps/rejected": -1.5555989742279053,
"loss": 2.0182,
"rewards/accuracies": 0.6625000238418579,
"rewards/chosen": -1.0804827213287354,
"rewards/margins": 0.4751162528991699,
"rewards/rejected": -1.5555989742279053,
"step": 110
},
{
"epoch": 0.24077466631771788,
"grad_norm": 24.95587592194343,
"learning_rate": 9.4101455916603e-07,
"logits/chosen": -0.4004356265068054,
"logits/rejected": -0.34801220893859863,
"logps/chosen": -1.1054725646972656,
"logps/rejected": -1.7531585693359375,
"loss": 1.9992,
"rewards/accuracies": 0.6937500238418579,
"rewards/chosen": -1.1054725646972656,
"rewards/margins": 0.6476858854293823,
"rewards/rejected": -1.7531585693359375,
"step": 115
},
{
"epoch": 0.2512431300706621,
"grad_norm": 53.12789958164912,
"learning_rate": 9.320944188084241e-07,
"logits/chosen": -0.3867969810962677,
"logits/rejected": -0.3542706072330475,
"logps/chosen": -1.3296326398849487,
"logps/rejected": -1.7101236581802368,
"loss": 2.069,
"rewards/accuracies": 0.6187499761581421,
"rewards/chosen": -1.3296326398849487,
"rewards/margins": 0.3804909884929657,
"rewards/rejected": -1.7101236581802368,
"step": 120
},
{
"epoch": 0.26171159382360637,
"grad_norm": 25.68062394354381,
"learning_rate": 9.225950427718974e-07,
"logits/chosen": -0.4343915581703186,
"logits/rejected": -0.40751656889915466,
"logps/chosen": -1.1859281063079834,
"logps/rejected": -1.5661814212799072,
"loss": 2.0229,
"rewards/accuracies": 0.6187499761581421,
"rewards/chosen": -1.1859281063079834,
"rewards/margins": 0.3802531659603119,
"rewards/rejected": -1.5661814212799072,
"step": 125
},
{
"epoch": 0.2721800575765506,
"grad_norm": 146.99732744643043,
"learning_rate": 9.125291652582547e-07,
"logits/chosen": -0.43255624175071716,
"logits/rejected": -0.42008519172668457,
"logps/chosen": -1.1270229816436768,
"logps/rejected": -1.3844034671783447,
"loss": 2.0368,
"rewards/accuracies": 0.637499988079071,
"rewards/chosen": -1.1270229816436768,
"rewards/margins": 0.2573803663253784,
"rewards/rejected": -1.3844034671783447,
"step": 130
},
{
"epoch": 0.2826485213294949,
"grad_norm": 42.69972929682183,
"learning_rate": 9.019102798817195e-07,
"logits/chosen": -0.5087494254112244,
"logits/rejected": -0.4200964570045471,
"logps/chosen": -1.1956226825714111,
"logps/rejected": -1.9745105504989624,
"loss": 1.9952,
"rewards/accuracies": 0.78125,
"rewards/chosen": -1.1956226825714111,
"rewards/margins": 0.7788880467414856,
"rewards/rejected": -1.9745105504989624,
"step": 135
},
{
"epoch": 0.29311698508243916,
"grad_norm": 19.87017547277629,
"learning_rate": 8.90752621580335e-07,
"logits/chosen": -0.4257656931877136,
"logits/rejected": -0.364449143409729,
"logps/chosen": -1.2079570293426514,
"logps/rejected": -1.8338918685913086,
"loss": 1.9605,
"rewards/accuracies": 0.6812499761581421,
"rewards/chosen": -1.2079570293426514,
"rewards/margins": 0.6259347200393677,
"rewards/rejected": -1.8338918685913086,
"step": 140
},
{
"epoch": 0.3035854488353834,
"grad_norm": 15.24234201577276,
"learning_rate": 8.79071147533597e-07,
"logits/chosen": -0.47194284200668335,
"logits/rejected": -0.44540295004844666,
"logps/chosen": -1.2036808729171753,
"logps/rejected": -1.6797609329223633,
"loss": 2.0129,
"rewards/accuracies": 0.637499988079071,
"rewards/chosen": -1.2036808729171753,
"rewards/margins": 0.4760800004005432,
"rewards/rejected": -1.6797609329223633,
"step": 145
},
{
"epoch": 0.31405391258832765,
"grad_norm": 41.583916372931604,
"learning_rate": 8.668815171119019e-07,
"logits/chosen": -0.4502836763858795,
"logits/rejected": -0.416980117559433,
"logps/chosen": -1.0764203071594238,
"logps/rejected": -1.5866191387176514,
"loss": 1.9679,
"rewards/accuracies": 0.612500011920929,
"rewards/chosen": -1.0764203071594238,
"rewards/margins": 0.5101990699768066,
"rewards/rejected": -1.5866191387176514,
"step": 150
},
{
"epoch": 0.3245223763412719,
"grad_norm": 17.97044676211115,
"learning_rate": 8.54200070884685e-07,
"logits/chosen": -0.4577752947807312,
"logits/rejected": -0.4022301733493805,
"logps/chosen": -1.1599218845367432,
"logps/rejected": -1.6104686260223389,
"loss": 1.9736,
"rewards/accuracies": 0.625,
"rewards/chosen": -1.1599218845367432,
"rewards/margins": 0.45054665207862854,
"rewards/rejected": -1.6104686260223389,
"step": 155
},
{
"epoch": 0.33499084009421615,
"grad_norm": 37.67621637306142,
"learning_rate": 8.410438087153911e-07,
"logits/chosen": -0.33586519956588745,
"logits/rejected": -0.2821674942970276,
"logps/chosen": -1.2303192615509033,
"logps/rejected": -1.7895514965057373,
"loss": 2.0104,
"rewards/accuracies": 0.6625000238418579,
"rewards/chosen": -1.2303192615509033,
"rewards/margins": 0.5592321753501892,
"rewards/rejected": -1.7895514965057373,
"step": 160
},
{
"epoch": 0.34545930384716045,
"grad_norm": 16.05482538779056,
"learning_rate": 8.274303669726426e-07,
"logits/chosen": -0.4002958834171295,
"logits/rejected": -0.34722983837127686,
"logps/chosen": -1.1306252479553223,
"logps/rejected": -1.6940090656280518,
"loss": 2.0112,
"rewards/accuracies": 0.637499988079071,
"rewards/chosen": -1.1306252479553223,
"rewards/margins": 0.5633838176727295,
"rewards/rejected": -1.6940090656280518,
"step": 165
},
{
"epoch": 0.3559277676001047,
"grad_norm": 15.217072980607172,
"learning_rate": 8.133779948881513e-07,
"logits/chosen": -0.45079272985458374,
"logits/rejected": -0.37534087896347046,
"logps/chosen": -1.1774274110794067,
"logps/rejected": -1.6361265182495117,
"loss": 2.0148,
"rewards/accuracies": 0.606249988079071,
"rewards/chosen": -1.1774274110794067,
"rewards/margins": 0.4586990773677826,
"rewards/rejected": -1.6361265182495117,
"step": 170
},
{
"epoch": 0.36639623135304894,
"grad_norm": 19.691952142371672,
"learning_rate": 7.989055300930704e-07,
"logits/chosen": -0.42495885491371155,
"logits/rejected": -0.3137228488922119,
"logps/chosen": -1.2254281044006348,
"logps/rejected": -1.73735773563385,
"loss": 2.0104,
"rewards/accuracies": 0.6187499761581421,
"rewards/chosen": -1.2254281044006348,
"rewards/margins": 0.5119296312332153,
"rewards/rejected": -1.73735773563385,
"step": 175
},
{
"epoch": 0.3768646951059932,
"grad_norm": 30.34827875211837,
"learning_rate": 7.840323733655778e-07,
"logits/chosen": -0.3100610673427582,
"logits/rejected": -0.25817859172821045,
"logps/chosen": -1.2358551025390625,
"logps/rejected": -1.8043813705444336,
"loss": 1.9916,
"rewards/accuracies": 0.6625000238418579,
"rewards/chosen": -1.2358551025390625,
"rewards/margins": 0.5685264468193054,
"rewards/rejected": -1.8043813705444336,
"step": 180
},
{
"epoch": 0.38733315885893743,
"grad_norm": 21.54243489627896,
"learning_rate": 7.687784626235447e-07,
"logits/chosen": -0.24814710021018982,
"logits/rejected": -0.12512032687664032,
"logps/chosen": -1.2242952585220337,
"logps/rejected": -1.974454641342163,
"loss": 1.9456,
"rewards/accuracies": 0.668749988079071,
"rewards/chosen": -1.2242952585220337,
"rewards/margins": 0.7501593828201294,
"rewards/rejected": -1.974454641342163,
"step": 185
},
{
"epoch": 0.39780162261188173,
"grad_norm": 27.936855442626964,
"learning_rate": 7.531642461971514e-07,
"logits/chosen": -0.2731862962245941,
"logits/rejected": -0.18622538447380066,
"logps/chosen": -1.176733136177063,
"logps/rejected": -1.7295942306518555,
"loss": 2.0622,
"rewards/accuracies": 0.612500011920929,
"rewards/chosen": -1.176733136177063,
"rewards/margins": 0.5528609752655029,
"rewards/rejected": -1.7295942306518555,
"step": 190
},
{
"epoch": 0.408270086364826,
"grad_norm": 18.211412725741514,
"learning_rate": 7.372106554172801e-07,
"logits/chosen": -0.21031120419502258,
"logits/rejected": -0.14914147555828094,
"logps/chosen": -1.2273377180099487,
"logps/rejected": -1.6471458673477173,
"loss": 1.9975,
"rewards/accuracies": 0.6875,
"rewards/chosen": -1.2273377180099487,
"rewards/margins": 0.41980820894241333,
"rewards/rejected": -1.6471458673477173,
"step": 195
},
{
"epoch": 0.4187385501177702,
"grad_norm": 28.304585509307277,
"learning_rate": 7.209390765564318e-07,
"logits/chosen": -0.13628198206424713,
"logits/rejected": -0.0973358079791069,
"logps/chosen": -1.2455083131790161,
"logps/rejected": -1.753761649131775,
"loss": 2.0029,
"rewards/accuracies": 0.668749988079071,
"rewards/chosen": -1.2455083131790161,
"rewards/margins": 0.5082534551620483,
"rewards/rejected": -1.753761649131775,
"step": 200
},
{
"epoch": 0.42920701387071447,
"grad_norm": 23.204471353515586,
"learning_rate": 7.043713221597773e-07,
"logits/chosen": -0.07737751305103302,
"logits/rejected": -0.005436101462692022,
"logps/chosen": -1.0530147552490234,
"logps/rejected": -1.7120428085327148,
"loss": 1.9468,
"rewards/accuracies": 0.65625,
"rewards/chosen": -1.0530147552490234,
"rewards/margins": 0.6590279340744019,
"rewards/rejected": -1.7120428085327148,
"step": 205
},
{
"epoch": 0.4396754776236587,
"grad_norm": 19.22100285707222,
"learning_rate": 6.875296018047809e-07,
"logits/chosen": -0.14544904232025146,
"logits/rejected": -0.09322938323020935,
"logps/chosen": -1.25759756565094,
"logps/rejected": -1.6059818267822266,
"loss": 2.0319,
"rewards/accuracies": 0.625,
"rewards/chosen": -1.25759756565094,
"rewards/margins": 0.3483843505382538,
"rewards/rejected": -1.6059818267822266,
"step": 210
},
{
"epoch": 0.45014394137660296,
"grad_norm": 28.866239067564,
"learning_rate": 6.704364923285857e-07,
"logits/chosen": -0.21608710289001465,
"logits/rejected": -0.135384202003479,
"logps/chosen": -1.1534065008163452,
"logps/rejected": -1.7110164165496826,
"loss": 1.9831,
"rewards/accuracies": 0.7875000238418579,
"rewards/chosen": -1.1534065008163452,
"rewards/margins": 0.5576101541519165,
"rewards/rejected": -1.7110164165496826,
"step": 215
},
{
"epoch": 0.46061240512954726,
"grad_norm": 83.00316897734959,
"learning_rate": 6.531149075630796e-07,
"logits/chosen": -0.22518062591552734,
"logits/rejected": -0.04796000197529793,
"logps/chosen": -1.2540584802627563,
"logps/rejected": -1.8683173656463623,
"loss": 1.9781,
"rewards/accuracies": 0.6312500238418579,
"rewards/chosen": -1.2540584802627563,
"rewards/margins": 0.6142589449882507,
"rewards/rejected": -1.8683173656463623,
"step": 220
},
{
"epoch": 0.4710808688824915,
"grad_norm": 15.63146505822897,
"learning_rate": 6.355880676182085e-07,
"logits/chosen": -0.24729761481285095,
"logits/rejected": -0.10253201425075531,
"logps/chosen": -1.148567795753479,
"logps/rejected": -1.861864447593689,
"loss": 1.9337,
"rewards/accuracies": 0.706250011920929,
"rewards/chosen": -1.148567795753479,
"rewards/margins": 0.7132967114448547,
"rewards/rejected": -1.861864447593689,
"step": 225
},
{
"epoch": 0.48154933263543576,
"grad_norm": 1092.6523686417404,
"learning_rate": 6.178794677547137e-07,
"logits/chosen": -0.33141931891441345,
"logits/rejected": -0.1571967899799347,
"logps/chosen": -1.125832200050354,
"logps/rejected": -1.9030935764312744,
"loss": 1.9444,
"rewards/accuracies": 0.6937500238418579,
"rewards/chosen": -1.125832200050354,
"rewards/margins": 0.7772611379623413,
"rewards/rejected": -1.9030935764312744,
"step": 230
},
{
"epoch": 0.49201779638838,
"grad_norm": 18.438187215474308,
"learning_rate": 6.000128468880222e-07,
"logits/chosen": -0.19492967426776886,
"logits/rejected": -0.088912233710289,
"logps/chosen": -1.1279089450836182,
"logps/rejected": -1.7057428359985352,
"loss": 1.9794,
"rewards/accuracies": 0.675000011920929,
"rewards/chosen": -1.1279089450836182,
"rewards/margins": 0.5778340101242065,
"rewards/rejected": -1.7057428359985352,
"step": 235
},
{
"epoch": 0.5024862601413242,
"grad_norm": 45.11647792728952,
"learning_rate": 5.820121557655108e-07,
"logits/chosen": -0.17841561138629913,
"logits/rejected": -0.08987215161323547,
"logps/chosen": -1.1346948146820068,
"logps/rejected": -1.8120676279067993,
"loss": 1.9898,
"rewards/accuracies": 0.6499999761581421,
"rewards/chosen": -1.1346948146820068,
"rewards/margins": 0.6773727536201477,
"rewards/rejected": -1.8120676279067993,
"step": 240
},
{
"epoch": 0.5129547238942685,
"grad_norm": 53.80136713305279,
"learning_rate": 5.639015248598023e-07,
"logits/chosen": -0.2315063774585724,
"logits/rejected": -0.11919162422418594,
"logps/chosen": -1.254396677017212,
"logps/rejected": -1.7449557781219482,
"loss": 1.9968,
"rewards/accuracies": 0.625,
"rewards/chosen": -1.254396677017212,
"rewards/margins": 0.49055904150009155,
"rewards/rejected": -1.7449557781219482,
"step": 245
},
{
"epoch": 0.5234231876472127,
"grad_norm": 30.376240963875347,
"learning_rate": 5.457052320211339e-07,
"logits/chosen": -0.2132711410522461,
"logits/rejected": -0.11911521106958389,
"logps/chosen": -1.1606347560882568,
"logps/rejected": -1.8521320819854736,
"loss": 1.9963,
"rewards/accuracies": 0.6812499761581421,
"rewards/chosen": -1.1606347560882568,
"rewards/margins": 0.6914970874786377,
"rewards/rejected": -1.8521320819854736,
"step": 250
},
{
"epoch": 0.533891651400157,
"grad_norm": 24.612321850210826,
"learning_rate": 5.274476699321637e-07,
"logits/chosen": -0.17434340715408325,
"logits/rejected": -0.02575433813035488,
"logps/chosen": -1.2206462621688843,
"logps/rejected": -1.893471121788025,
"loss": 1.9294,
"rewards/accuracies": 0.675000011920929,
"rewards/chosen": -1.2206462621688843,
"rewards/margins": 0.6728248000144958,
"rewards/rejected": -1.893471121788025,
"step": 255
},
{
"epoch": 0.5443601151531012,
"grad_norm": 23.578174980485148,
"learning_rate": 5.091533134088387e-07,
"logits/chosen": -0.19827161729335785,
"logits/rejected": -0.10442183911800385,
"logps/chosen": -1.1325616836547852,
"logps/rejected": -1.894374132156372,
"loss": 1.9889,
"rewards/accuracies": 0.675000011920929,
"rewards/chosen": -1.1325616836547852,
"rewards/margins": 0.7618124485015869,
"rewards/rejected": -1.894374132156372,
"step": 260
},
{
"epoch": 0.5548285789060455,
"grad_norm": 23.363765551953982,
"learning_rate": 4.908466865911614e-07,
"logits/chosen": -0.22801117599010468,
"logits/rejected": -0.15166376531124115,
"logps/chosen": -1.2147762775421143,
"logps/rejected": -1.6708816289901733,
"loss": 1.9391,
"rewards/accuracies": 0.6875,
"rewards/chosen": -1.2147762775421143,
"rewards/margins": 0.45610541105270386,
"rewards/rejected": -1.6708816289901733,
"step": 265
},
{
"epoch": 0.5652970426589898,
"grad_norm": 20.86303085584383,
"learning_rate": 4.7255233006783624e-07,
"logits/chosen": -0.22982990741729736,
"logits/rejected": -0.13931187987327576,
"logps/chosen": -1.2865099906921387,
"logps/rejected": -1.766331434249878,
"loss": 1.9878,
"rewards/accuracies": 0.71875,
"rewards/chosen": -1.2865099906921387,
"rewards/margins": 0.47982144355773926,
"rewards/rejected": -1.766331434249878,
"step": 270
},
{
"epoch": 0.575765506411934,
"grad_norm": 12.144303285220628,
"learning_rate": 4.5429476797886617e-07,
"logits/chosen": -0.2274014949798584,
"logits/rejected": -0.07431206852197647,
"logps/chosen": -1.1824675798416138,
"logps/rejected": -1.998253583908081,
"loss": 1.962,
"rewards/accuracies": 0.6937500238418579,
"rewards/chosen": -1.1824675798416138,
"rewards/margins": 0.8157860040664673,
"rewards/rejected": -1.998253583908081,
"step": 275
},
{
"epoch": 0.5862339701648783,
"grad_norm": 37.56330617572613,
"learning_rate": 4.3609847514019763e-07,
"logits/chosen": -0.2594318687915802,
"logits/rejected": -0.14403223991394043,
"logps/chosen": -1.1071598529815674,
"logps/rejected": -1.610290765762329,
"loss": 1.957,
"rewards/accuracies": 0.699999988079071,
"rewards/chosen": -1.1071598529815674,
"rewards/margins": 0.5031307935714722,
"rewards/rejected": -1.610290765762329,
"step": 280
},
{
"epoch": 0.5967024339178225,
"grad_norm": 55.56290292891477,
"learning_rate": 4.179878442344892e-07,
"logits/chosen": -0.2227039635181427,
"logits/rejected": -0.1900090128183365,
"logps/chosen": -1.1886059045791626,
"logps/rejected": -1.7931125164031982,
"loss": 1.9481,
"rewards/accuracies": 0.675000011920929,
"rewards/chosen": -1.1886059045791626,
"rewards/margins": 0.60450679063797,
"rewards/rejected": -1.7931125164031982,
"step": 285
},
{
"epoch": 0.6071708976707668,
"grad_norm": 24.48468402537705,
"learning_rate": 3.9998715311197783e-07,
"logits/chosen": -0.26827192306518555,
"logits/rejected": -0.17545387148857117,
"logps/chosen": -1.1850652694702148,
"logps/rejected": -1.8715204000473022,
"loss": 1.9349,
"rewards/accuracies": 0.668749988079071,
"rewards/chosen": -1.1850652694702148,
"rewards/margins": 0.6864550709724426,
"rewards/rejected": -1.8715204000473022,
"step": 290
},
{
"epoch": 0.6176393614237111,
"grad_norm": 19.0989416435893,
"learning_rate": 3.821205322452863e-07,
"logits/chosen": -0.2373635321855545,
"logits/rejected": -0.1607808768749237,
"logps/chosen": -1.1796191930770874,
"logps/rejected": -1.9065383672714233,
"loss": 1.9901,
"rewards/accuracies": 0.6812499761581421,
"rewards/chosen": -1.1796191930770874,
"rewards/margins": 0.7269191741943359,
"rewards/rejected": -1.9065383672714233,
"step": 295
},
{
"epoch": 0.6281078251766553,
"grad_norm": 35.51594817128474,
"learning_rate": 3.6441193238179146e-07,
"logits/chosen": -0.28120699524879456,
"logits/rejected": -0.2147771418094635,
"logps/chosen": -1.2024883031845093,
"logps/rejected": -1.7524086236953735,
"loss": 1.9577,
"rewards/accuracies": 0.675000011920929,
"rewards/chosen": -1.2024883031845093,
"rewards/margins": 0.5499202013015747,
"rewards/rejected": -1.7524086236953735,
"step": 300
},
{
"epoch": 0.6385762889295996,
"grad_norm": 19.807409901213642,
"learning_rate": 3.4688509243692034e-07,
"logits/chosen": -0.1579556167125702,
"logits/rejected": -0.09319324791431427,
"logps/chosen": -1.2312943935394287,
"logps/rejected": -1.9326064586639404,
"loss": 1.9317,
"rewards/accuracies": 0.668749988079071,
"rewards/chosen": -1.2312943935394287,
"rewards/margins": 0.7013120055198669,
"rewards/rejected": -1.9326064586639404,
"step": 305
},
{
"epoch": 0.6490447526825438,
"grad_norm": 26.79163246884692,
"learning_rate": 3.295635076714144e-07,
"logits/chosen": -0.13611330091953278,
"logits/rejected": -0.1433105766773224,
"logps/chosen": -1.1258060932159424,
"logps/rejected": -1.763738989830017,
"loss": 1.9276,
"rewards/accuracies": 0.706250011920929,
"rewards/chosen": -1.1258060932159424,
"rewards/margins": 0.6379327774047852,
"rewards/rejected": -1.763738989830017,
"step": 310
},
{
"epoch": 0.6595132164354881,
"grad_norm": 26.007353485880714,
"learning_rate": 3.12470398195219e-07,
"logits/chosen": -0.1855328381061554,
"logits/rejected": -0.06350420415401459,
"logps/chosen": -1.1226041316986084,
"logps/rejected": -1.979421854019165,
"loss": 1.9461,
"rewards/accuracies": 0.7437499761581421,
"rewards/chosen": -1.1226041316986084,
"rewards/margins": 0.8568177223205566,
"rewards/rejected": -1.979421854019165,
"step": 315
},
{
"epoch": 0.6699816801884323,
"grad_norm": 25.93600538288609,
"learning_rate": 2.956286778402226e-07,
"logits/chosen": -0.16057109832763672,
"logits/rejected": -0.10531453043222427,
"logps/chosen": -1.1869053840637207,
"logps/rejected": -1.7816956043243408,
"loss": 1.8982,
"rewards/accuracies": 0.6312500238418579,
"rewards/chosen": -1.1869053840637207,
"rewards/margins": 0.5947902798652649,
"rewards/rejected": -1.7816956043243408,
"step": 320
},
{
"epoch": 0.6804501439413766,
"grad_norm": 41.1877461903664,
"learning_rate": 2.7906092344356826e-07,
"logits/chosen": -0.16566753387451172,
"logits/rejected": -0.06549857556819916,
"logps/chosen": -1.1580512523651123,
"logps/rejected": -1.8924694061279297,
"loss": 1.9157,
"rewards/accuracies": 0.6812499761581421,
"rewards/chosen": -1.1580512523651123,
"rewards/margins": 0.7344181537628174,
"rewards/rejected": -1.8924694061279297,
"step": 325
},
{
"epoch": 0.6909186076943209,
"grad_norm": 13.497224748766067,
"learning_rate": 2.6278934458271996e-07,
"logits/chosen": -0.09990070015192032,
"logits/rejected": -0.019180208444595337,
"logps/chosen": -1.1130152940750122,
"logps/rejected": -1.6457436084747314,
"loss": 1.9451,
"rewards/accuracies": 0.6187499761581421,
"rewards/chosen": -1.1130152940750122,
"rewards/margins": 0.5327284932136536,
"rewards/rejected": -1.6457436084747314,
"step": 330
},
{
"epoch": 0.7013870714472651,
"grad_norm": 20.73440619316291,
"learning_rate": 2.468357538028487e-07,
"logits/chosen": -0.17166391015052795,
"logits/rejected": -0.08680696785449982,
"logps/chosen": -1.109227180480957,
"logps/rejected": -1.7418838739395142,
"loss": 1.9573,
"rewards/accuracies": 0.706250011920929,
"rewards/chosen": -1.109227180480957,
"rewards/margins": 0.6326566934585571,
"rewards/rejected": -1.7418838739395142,
"step": 335
},
{
"epoch": 0.7118555352002094,
"grad_norm": 20.25166204813565,
"learning_rate": 2.312215373764551e-07,
"logits/chosen": -0.155477374792099,
"logits/rejected": -0.05189569666981697,
"logps/chosen": -1.3119245767593384,
"logps/rejected": -1.9228538274765015,
"loss": 1.9728,
"rewards/accuracies": 0.699999988079071,
"rewards/chosen": -1.3119245767593384,
"rewards/margins": 0.6109293103218079,
"rewards/rejected": -1.9228538274765015,
"step": 340
},
{
"epoch": 0.7223239989531536,
"grad_norm": 35.62472752098736,
"learning_rate": 2.1596762663442213e-07,
"logits/chosen": -0.18124118447303772,
"logits/rejected": -0.04932355508208275,
"logps/chosen": -1.2099921703338623,
"logps/rejected": -1.9292633533477783,
"loss": 1.9751,
"rewards/accuracies": 0.6812499761581421,
"rewards/chosen": -1.2099921703338623,
"rewards/margins": 0.719271183013916,
"rewards/rejected": -1.9292633533477783,
"step": 345
},
{
"epoch": 0.7327924627060979,
"grad_norm": 19.36102520036485,
"learning_rate": 2.0109446990692963e-07,
"logits/chosen": -0.048113010823726654,
"logits/rejected": -0.02143859677016735,
"logps/chosen": -1.227217197418213,
"logps/rejected": -1.7735779285430908,
"loss": 2.0111,
"rewards/accuracies": 0.6187499761581421,
"rewards/chosen": -1.227217197418213,
"rewards/margins": 0.5463606715202332,
"rewards/rejected": -1.7735779285430908,
"step": 350
},
{
"epoch": 0.7432609264590422,
"grad_norm": 16.299019547207138,
"learning_rate": 1.8662200511184872e-07,
"logits/chosen": -0.09398343414068222,
"logits/rejected": -0.01715996116399765,
"logps/chosen": -1.061127781867981,
"logps/rejected": -1.851822853088379,
"loss": 1.8894,
"rewards/accuracies": 0.7562500238418579,
"rewards/chosen": -1.061127781867981,
"rewards/margins": 0.7906948328018188,
"rewards/rejected": -1.851822853088379,
"step": 355
},
{
"epoch": 0.7537293902119864,
"grad_norm": 21.325612236488393,
"learning_rate": 1.725696330273575e-07,
"logits/chosen": -0.19810739159584045,
"logits/rejected": -0.09949172288179398,
"logps/chosen": -1.0794689655303955,
"logps/rejected": -1.6091794967651367,
"loss": 1.8836,
"rewards/accuracies": 0.6625000238418579,
"rewards/chosen": -1.0794689655303955,
"rewards/margins": 0.529710590839386,
"rewards/rejected": -1.6091794967651367,
"step": 360
},
{
"epoch": 0.7641978539649307,
"grad_norm": 17.67539053293725,
"learning_rate": 1.589561912846089e-07,
"logits/chosen": -0.19371333718299866,
"logits/rejected": -0.06843050569295883,
"logps/chosen": -1.2321817874908447,
"logps/rejected": -1.8411308526992798,
"loss": 1.9833,
"rewards/accuracies": 0.675000011920929,
"rewards/chosen": -1.2321817874908447,
"rewards/margins": 0.6089491844177246,
"rewards/rejected": -1.8411308526992798,
"step": 365
},
{
"epoch": 0.7746663177178749,
"grad_norm": 23.235373655195016,
"learning_rate": 1.4579992911531496e-07,
"logits/chosen": -0.11578913033008575,
"logits/rejected": -0.025940338149666786,
"logps/chosen": -1.196590781211853,
"logps/rejected": -1.895391821861267,
"loss": 1.9263,
"rewards/accuracies": 0.6875,
"rewards/chosen": -1.196590781211853,
"rewards/margins": 0.6988012790679932,
"rewards/rejected": -1.895391821861267,
"step": 370
},
{
"epoch": 0.7851347814708192,
"grad_norm": 19.259561354186946,
"learning_rate": 1.3311848288809813e-07,
"logits/chosen": -0.11768321692943573,
"logits/rejected": -0.1705169379711151,
"logps/chosen": -1.2138588428497314,
"logps/rejected": -1.7918386459350586,
"loss": 1.9695,
"rewards/accuracies": 0.6187499761581421,
"rewards/chosen": -1.2138588428497314,
"rewards/margins": 0.5779798030853271,
"rewards/rejected": -1.7918386459350586,
"step": 375
},
{
"epoch": 0.7956032452237635,
"grad_norm": 19.09434464976567,
"learning_rate": 1.209288524664029e-07,
"logits/chosen": -0.1390591561794281,
"logits/rejected": -0.08628968149423599,
"logps/chosen": -1.211247444152832,
"logps/rejected": -1.7502481937408447,
"loss": 1.9086,
"rewards/accuracies": 0.612500011920929,
"rewards/chosen": -1.211247444152832,
"rewards/margins": 0.5390007495880127,
"rewards/rejected": -1.7502481937408447,
"step": 380
},
{
"epoch": 0.8060717089767077,
"grad_norm": 22.75496669970745,
"learning_rate": 1.0924737841966497e-07,
"logits/chosen": -0.14960381388664246,
"logits/rejected": -0.08989100158214569,
"logps/chosen": -1.1806560754776,
"logps/rejected": -1.799631118774414,
"loss": 1.9473,
"rewards/accuracies": 0.6000000238418579,
"rewards/chosen": -1.1806560754776,
"rewards/margins": 0.6189749240875244,
"rewards/rejected": -1.799631118774414,
"step": 385
},
{
"epoch": 0.816540172729652,
"grad_norm": 21.199803415422714,
"learning_rate": 9.808972011828054e-08,
"logits/chosen": -0.13692599534988403,
"logits/rejected": -0.04226923733949661,
"logps/chosen": -1.1819908618927002,
"logps/rejected": -1.9731757640838623,
"loss": 1.9367,
"rewards/accuracies": 0.731249988079071,
"rewards/chosen": -1.1819908618927002,
"rewards/margins": 0.7911848425865173,
"rewards/rejected": -1.9731757640838623,
"step": 390
},
{
"epoch": 0.8270086364825961,
"grad_norm": 33.49806758309421,
"learning_rate": 8.747083474174527e-08,
"logits/chosen": -0.13622619211673737,
"logits/rejected": 0.037842754274606705,
"logps/chosen": -1.2155778408050537,
"logps/rejected": -1.890428900718689,
"loss": 1.9388,
"rewards/accuracies": 0.706250011920929,
"rewards/chosen": -1.2155778408050537,
"rewards/margins": 0.6748510599136353,
"rewards/rejected": -1.890428900718689,
"step": 395
},
{
"epoch": 0.8374771002355405,
"grad_norm": 15.890713698381443,
"learning_rate": 7.740495722810269e-08,
"logits/chosen": -0.05593853071331978,
"logits/rejected": -0.004029959440231323,
"logps/chosen": -1.112066388130188,
"logps/rejected": -1.8403129577636719,
"loss": 1.9207,
"rewards/accuracies": 0.6625000238418579,
"rewards/chosen": -1.112066388130188,
"rewards/margins": 0.7282465696334839,
"rewards/rejected": -1.8403129577636719,
"step": 400
},
{
"epoch": 0.8479455639884846,
"grad_norm": 19.88967649390424,
"learning_rate": 6.790558119157597e-08,
"logits/chosen": -0.18492689728736877,
"logits/rejected": -0.10850385576486588,
"logps/chosen": -1.2788586616516113,
"logps/rejected": -2.0290207862854004,
"loss": 1.9523,
"rewards/accuracies": 0.65625,
"rewards/chosen": -1.2788586616516113,
"rewards/margins": 0.7501621842384338,
"rewards/rejected": -2.0290207862854004,
"step": 405
},
{
"epoch": 0.8584140277414289,
"grad_norm": 22.56741853126592,
"learning_rate": 5.898544083397e-08,
"logits/chosen": -0.14272233843803406,
"logits/rejected": -0.0651661604642868,
"logps/chosen": -1.1273430585861206,
"logps/rejected": -1.6827017068862915,
"loss": 1.9304,
"rewards/accuracies": 0.6312500238418579,
"rewards/chosen": -1.1273430585861206,
"rewards/margins": 0.5553585290908813,
"rewards/rejected": -1.6827017068862915,
"step": 410
},
{
"epoch": 0.8688824914943732,
"grad_norm": 15.199788752886258,
"learning_rate": 5.065649387408705e-08,
"logits/chosen": -0.14387831091880798,
"logits/rejected": -0.009860972873866558,
"logps/chosen": -1.161084771156311,
"logps/rejected": -1.8390836715698242,
"loss": 1.9141,
"rewards/accuracies": 0.675000011920929,
"rewards/chosen": -1.161084771156311,
"rewards/margins": 0.6779987812042236,
"rewards/rejected": -1.8390836715698242,
"step": 415
},
{
"epoch": 0.8793509552473174,
"grad_norm": 14.485810825336134,
"learning_rate": 4.292990551804171e-08,
"logits/chosen": -0.12360888719558716,
"logits/rejected": -0.05216851085424423,
"logps/chosen": -1.1394500732421875,
"logps/rejected": -1.831883192062378,
"loss": 1.9578,
"rewards/accuracies": 0.6187499761581421,
"rewards/chosen": -1.1394500732421875,
"rewards/margins": 0.69243323802948,
"rewards/rejected": -1.831883192062378,
"step": 420
},
{
"epoch": 0.8898194190002617,
"grad_norm": 22.957524299991945,
"learning_rate": 3.581603349196371e-08,
"logits/chosen": -0.08880945295095444,
"logits/rejected": -0.02426137961447239,
"logps/chosen": -1.296489953994751,
"logps/rejected": -1.8570985794067383,
"loss": 1.9254,
"rewards/accuracies": 0.637499988079071,
"rewards/chosen": -1.296489953994751,
"rewards/margins": 0.5606086254119873,
"rewards/rejected": -1.8570985794067383,
"step": 425
},
{
"epoch": 0.9002878827532059,
"grad_norm": 17.939695720657745,
"learning_rate": 2.9324414157151367e-08,
"logits/chosen": -0.10626481473445892,
"logits/rejected": -0.055657435208559036,
"logps/chosen": -1.219440221786499,
"logps/rejected": -1.922663688659668,
"loss": 1.9204,
"rewards/accuracies": 0.6499999761581421,
"rewards/chosen": -1.219440221786499,
"rewards/margins": 0.7032233476638794,
"rewards/rejected": -1.922663688659668,
"step": 430
},
{
"epoch": 0.9107563465061502,
"grad_norm": 19.609830420854962,
"learning_rate": 2.3463749726290284e-08,
"logits/chosen": -0.14449790120124817,
"logits/rejected": -0.08098597824573517,
"logps/chosen": -1.1550737619400024,
"logps/rejected": -1.9791103601455688,
"loss": 1.9163,
"rewards/accuracies": 0.668749988079071,
"rewards/chosen": -1.1550737619400024,
"rewards/margins": 0.8240365982055664,
"rewards/rejected": -1.9791103601455688,
"step": 435
},
{
"epoch": 0.9212248102590945,
"grad_norm": 31.437744158726638,
"learning_rate": 1.824189659787284e-08,
"logits/chosen": 0.0060030072927474976,
"logits/rejected": 0.009024476632475853,
"logps/chosen": -1.1824986934661865,
"logps/rejected": -1.7867063283920288,
"loss": 1.9724,
"rewards/accuracies": 0.6499999761581421,
"rewards/chosen": -1.1824986934661865,
"rewards/margins": 0.6042075157165527,
"rewards/rejected": -1.7867063283920288,
"step": 440
},
{
"epoch": 0.9316932740120387,
"grad_norm": 34.49114038658599,
"learning_rate": 1.3665854824458035e-08,
"logits/chosen": -0.15822723507881165,
"logits/rejected": -0.08658315241336823,
"logps/chosen": -1.1747385263442993,
"logps/rejected": -1.7831497192382812,
"loss": 1.9708,
"rewards/accuracies": 0.675000011920929,
"rewards/chosen": -1.1747385263442993,
"rewards/margins": 0.6084113121032715,
"rewards/rejected": -1.7831497192382812,
"step": 445
},
{
"epoch": 0.942161737764983,
"grad_norm": 22.736368343788918,
"learning_rate": 9.741758728888217e-09,
"logits/chosen": -0.05001335218548775,
"logits/rejected": -0.013674241490662098,
"logps/chosen": -1.179164171218872,
"logps/rejected": -1.8373947143554688,
"loss": 1.909,
"rewards/accuracies": 0.699999988079071,
"rewards/chosen": -1.179164171218872,
"rewards/margins": 0.6582303643226624,
"rewards/rejected": -1.8373947143554688,
"step": 450
},
{
"epoch": 0.9526302015179272,
"grad_norm": 21.67558731525575,
"learning_rate": 6.474868681043577e-09,
"logits/chosen": -0.10400988906621933,
"logits/rejected": -0.05608060210943222,
"logps/chosen": -1.3397135734558105,
"logps/rejected": -1.716301679611206,
"loss": 1.9844,
"rewards/accuracies": 0.6499999761581421,
"rewards/chosen": -1.3397135734558105,
"rewards/margins": 0.3765881657600403,
"rewards/rejected": -1.716301679611206,
"step": 455
},
{
"epoch": 0.9630986652708715,
"grad_norm": 22.99781560062592,
"learning_rate": 3.869564046156459e-09,
"logits/chosen": -0.06456808745861053,
"logits/rejected": -0.012792855501174927,
"logps/chosen": -1.0940654277801514,
"logps/rejected": -1.805354356765747,
"loss": 1.8916,
"rewards/accuracies": 0.675000011920929,
"rewards/chosen": -1.0940654277801514,
"rewards/margins": 0.7112888097763062,
"rewards/rejected": -1.805354356765747,
"step": 460
},
{
"epoch": 0.9735671290238157,
"grad_norm": 17.285058450470018,
"learning_rate": 1.929337314139412e-09,
"logits/chosen": -0.19403138756752014,
"logits/rejected": -0.07949899882078171,
"logps/chosen": -1.2133488655090332,
"logps/rejected": -1.8430767059326172,
"loss": 1.9376,
"rewards/accuracies": 0.6875,
"rewards/chosen": -1.2133488655090332,
"rewards/margins": 0.6297277808189392,
"rewards/rejected": -1.8430767059326172,
"step": 465
},
{
"epoch": 0.98403559277676,
"grad_norm": 19.37788975464885,
"learning_rate": 6.567894177967325e-10,
"logits/chosen": -0.1643257737159729,
"logits/rejected": -0.06100650504231453,
"logps/chosen": -1.181461215019226,
"logps/rejected": -1.707772970199585,
"loss": 1.9914,
"rewards/accuracies": 0.6499999761581421,
"rewards/chosen": -1.181461215019226,
"rewards/margins": 0.5263119339942932,
"rewards/rejected": -1.707772970199585,
"step": 470
},
{
"epoch": 0.9945040565297043,
"grad_norm": 24.408366857719134,
"learning_rate": 5.3626246194704575e-11,
"logits/chosen": -0.20142404735088348,
"logits/rejected": -0.07068441808223724,
"logps/chosen": -1.2009718418121338,
"logps/rejected": -1.803815245628357,
"loss": 1.9479,
"rewards/accuracies": 0.612500011920929,
"rewards/chosen": -1.2009718418121338,
"rewards/margins": 0.6028433442115784,
"rewards/rejected": -1.803815245628357,
"step": 475
},
{
"epoch": 0.998691442030882,
"step": 477,
"total_flos": 0.0,
"train_loss": 0.0,
"train_runtime": 4.3143,
"train_samples_per_second": 14170.447,
"train_steps_per_second": 110.564
}
],
"logging_steps": 5,
"max_steps": 477,
"num_input_tokens_seen": 0,
"num_train_epochs": 1,
"save_steps": 1000000,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": true
},
"attributes": {}
}
},
"total_flos": 0.0,
"train_batch_size": 2,
"trial_name": null,
"trial_params": null
}
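
The log_history entries above follow the schema the Hugging Face Trainer writes to trainer_state.json: one record per logging step (every 5 steps here, per "logging_steps") carrying the preference-optimization metrics (loss, rewards/accuracies, rewards/margins, logps/chosen, logps/rejected), plus a final summary record. A minimal Python sketch for loading and summarizing these metrics, assuming the file is saved locally as trainer_state.json (the path and the printed fields are illustrative choices, not part of the training setup):

import json

# Load the trainer state exported by the Hugging Face Trainer.
# The path is an assumption; point it at this trainer_state.json.
with open("trainer_state.json") as f:
    state = json.load(f)

print(f"global_step={state['global_step']}  epochs={state['num_train_epochs']}")

# Per-step records carry the metrics logged above. The final summary
# record has no per-step "loss" key, so it is skipped here.
for entry in state["log_history"]:
    if "loss" in entry:
        print(
            f"step {entry['step']:>3}  "
            f"loss {entry['loss']:.4f}  "
            f"acc {entry['rewards/accuracies']:.3f}  "
            f"margin {entry['rewards/margins']:.3f}"
        )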