{ "best_metric": null, "best_model_checkpoint": null, "epoch": 0.998691442030882, "eval_steps": 500, "global_step": 477, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.010468463752944255, "grad_norm": 31.324190504537746, "learning_rate": 1.0416666666666667e-07, "logits/chosen": -0.49775856733322144, "logits/rejected": -0.5134874582290649, "logps/chosen": -1.1746575832366943, "logps/rejected": -1.3592634201049805, "loss": 2.1738, "rewards/accuracies": 0.5625, "rewards/chosen": -1.1746575832366943, "rewards/margins": 0.18460586667060852, "rewards/rejected": -1.3592634201049805, "step": 5 }, { "epoch": 0.02093692750588851, "grad_norm": 17.522763098577006, "learning_rate": 2.0833333333333333e-07, "logits/chosen": -0.5211091637611389, "logits/rejected": -0.49808019399642944, "logps/chosen": -1.1585900783538818, "logps/rejected": -1.2622541189193726, "loss": 2.1407, "rewards/accuracies": 0.5, "rewards/chosen": -1.1585900783538818, "rewards/margins": 0.10366388410329819, "rewards/rejected": -1.2622541189193726, "step": 10 }, { "epoch": 0.031405391258832765, "grad_norm": 25.192278194697494, "learning_rate": 3.1249999999999997e-07, "logits/chosen": -0.461596816778183, "logits/rejected": -0.45038098096847534, "logps/chosen": -1.1062204837799072, "logps/rejected": -1.3620827198028564, "loss": 2.1074, "rewards/accuracies": 0.65625, "rewards/chosen": -1.1062204837799072, "rewards/margins": 0.255862295627594, "rewards/rejected": -1.3620827198028564, "step": 15 }, { "epoch": 0.04187385501177702, "grad_norm": 44.544789847879194, "learning_rate": 4.1666666666666667e-07, "logits/chosen": -0.4408242106437683, "logits/rejected": -0.45246267318725586, "logps/chosen": -1.1579445600509644, "logps/rejected": -1.2627536058425903, "loss": 2.1651, "rewards/accuracies": 0.5625, "rewards/chosen": -1.1579445600509644, "rewards/margins": 0.10480908304452896, "rewards/rejected": -1.2627536058425903, "step": 20 }, { "epoch": 0.05234231876472128, "grad_norm": 11.346692540130856, "learning_rate": 5.208333333333334e-07, "logits/chosen": -0.5032289028167725, "logits/rejected": -0.4789913296699524, "logps/chosen": -1.166441559791565, "logps/rejected": -1.2368651628494263, "loss": 2.1373, "rewards/accuracies": 0.5249999761581421, "rewards/chosen": -1.166441559791565, "rewards/margins": 0.07042353600263596, "rewards/rejected": -1.2368651628494263, "step": 25 }, { "epoch": 0.06281078251766553, "grad_norm": 28.570034370144306, "learning_rate": 6.249999999999999e-07, "logits/chosen": -0.49172288179397583, "logits/rejected": -0.4948248267173767, "logps/chosen": -1.1403913497924805, "logps/rejected": -1.275451898574829, "loss": 2.163, "rewards/accuracies": 0.518750011920929, "rewards/chosen": -1.1403913497924805, "rewards/margins": 0.13506053388118744, "rewards/rejected": -1.275451898574829, "step": 30 }, { "epoch": 0.07327924627060979, "grad_norm": 19.91642226793408, "learning_rate": 7.291666666666666e-07, "logits/chosen": -0.47831740975379944, "logits/rejected": -0.4338778853416443, "logps/chosen": -1.1529806852340698, "logps/rejected": -1.3276116847991943, "loss": 2.1154, "rewards/accuracies": 0.5562499761581421, "rewards/chosen": -1.1529806852340698, "rewards/margins": 0.1746309995651245, "rewards/rejected": -1.3276116847991943, "step": 35 }, { "epoch": 0.08374771002355404, "grad_norm": 26.52326580399366, "learning_rate": 8.333333333333333e-07, "logits/chosen": -0.4782256484031677, "logits/rejected": -0.4668501019477844, "logps/chosen": -1.108135461807251, "logps/rejected": -1.4614675045013428, "loss": 2.0666, "rewards/accuracies": 0.643750011920929, "rewards/chosen": -1.108135461807251, "rewards/margins": 0.353331983089447, "rewards/rejected": -1.4614675045013428, "step": 40 }, { "epoch": 0.0942161737764983, "grad_norm": 13.796799671660693, "learning_rate": 9.374999999999999e-07, "logits/chosen": -0.44356870651245117, "logits/rejected": -0.4471743702888489, "logps/chosen": -1.0965029001235962, "logps/rejected": -1.3664577007293701, "loss": 2.0864, "rewards/accuracies": 0.574999988079071, "rewards/chosen": -1.0965029001235962, "rewards/margins": 0.26995497941970825, "rewards/rejected": -1.3664577007293701, "step": 45 }, { "epoch": 0.10468463752944256, "grad_norm": 30.371297005919416, "learning_rate": 9.999463737538052e-07, "logits/chosen": -0.461489200592041, "logits/rejected": -0.4655645489692688, "logps/chosen": -1.1575626134872437, "logps/rejected": -1.4973771572113037, "loss": 2.1199, "rewards/accuracies": 0.612500011920929, "rewards/chosen": -1.1575626134872437, "rewards/margins": 0.3398147225379944, "rewards/rejected": -1.4973771572113037, "step": 50 }, { "epoch": 0.11515310128238682, "grad_norm": 26.67718500476433, "learning_rate": 9.993432105822034e-07, "logits/chosen": -0.4001489281654358, "logits/rejected": -0.37682315707206726, "logps/chosen": -1.1248127222061157, "logps/rejected": -1.4001871347427368, "loss": 2.0897, "rewards/accuracies": 0.5625, "rewards/chosen": -1.1248127222061157, "rewards/margins": 0.27537447214126587, "rewards/rejected": -1.4001871347427368, "step": 55 }, { "epoch": 0.12562156503533106, "grad_norm": 15.812441875154704, "learning_rate": 9.980706626858607e-07, "logits/chosen": -0.43878427147865295, "logits/rejected": -0.4231850504875183, "logps/chosen": -1.2165329456329346, "logps/rejected": -1.3715764284133911, "loss": 2.0665, "rewards/accuracies": 0.59375, "rewards/chosen": -1.2165329456329346, "rewards/margins": 0.1550435572862625, "rewards/rejected": -1.3715764284133911, "step": 60 }, { "epoch": 0.1360900287882753, "grad_norm": 32.69892893599103, "learning_rate": 9.961304359538434e-07, "logits/chosen": -0.38188332319259644, "logits/rejected": -0.30855393409729004, "logps/chosen": -1.1145586967468262, "logps/rejected": -1.7429344654083252, "loss": 2.0414, "rewards/accuracies": 0.6937500238418579, "rewards/chosen": -1.1145586967468262, "rewards/margins": 0.6283758878707886, "rewards/rejected": -1.7429344654083252, "step": 65 }, { "epoch": 0.14655849254121958, "grad_norm": 44.90817025126785, "learning_rate": 9.935251313189563e-07, "logits/chosen": -0.27111151814460754, "logits/rejected": -0.24608612060546875, "logps/chosen": -1.1660597324371338, "logps/rejected": -1.5309925079345703, "loss": 2.0234, "rewards/accuracies": 0.574999988079071, "rewards/chosen": -1.1660597324371338, "rewards/margins": 0.3649328947067261, "rewards/rejected": -1.5309925079345703, "step": 70 }, { "epoch": 0.15702695629416383, "grad_norm": 38.268073195027156, "learning_rate": 9.902582412711118e-07, "logits/chosen": -0.28683459758758545, "logits/rejected": -0.25514599680900574, "logps/chosen": -1.1409043073654175, "logps/rejected": -1.5740129947662354, "loss": 2.0488, "rewards/accuracies": 0.625, "rewards/chosen": -1.1409043073654175, "rewards/margins": 0.4331088066101074, "rewards/rejected": -1.5740129947662354, "step": 75 }, { "epoch": 0.16749542004710807, "grad_norm": 23.626638109106274, "learning_rate": 9.86334145175542e-07, "logits/chosen": -0.40040236711502075, "logits/rejected": -0.3598732650279999, "logps/chosen": -1.1197240352630615, "logps/rejected": -1.6543350219726562, "loss": 2.0889, "rewards/accuracies": 0.6312500238418579, "rewards/chosen": -1.1197240352630615, "rewards/margins": 0.5346111059188843, "rewards/rejected": -1.6543350219726562, "step": 80 }, { "epoch": 0.17796388380005235, "grad_norm": 21.67675055841775, "learning_rate": 9.817581034021272e-07, "logits/chosen": -0.4968738555908203, "logits/rejected": -0.4568953514099121, "logps/chosen": -1.1042544841766357, "logps/rejected": -1.4778095483779907, "loss": 2.0732, "rewards/accuracies": 0.606249988079071, "rewards/chosen": -1.1042544841766357, "rewards/margins": 0.37355509400367737, "rewards/rejected": -1.4778095483779907, "step": 85 }, { "epoch": 0.1884323475529966, "grad_norm": 32.61370646153053, "learning_rate": 9.765362502737097e-07, "logits/chosen": -0.4779502749443054, "logits/rejected": -0.44491392374038696, "logps/chosen": -1.144523024559021, "logps/rejected": -1.4939491748809814, "loss": 2.0171, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -1.144523024559021, "rewards/margins": 0.3494262099266052, "rewards/rejected": -1.4939491748809814, "step": 90 }, { "epoch": 0.19890081130594087, "grad_norm": 29.010011255606027, "learning_rate": 9.706755858428485e-07, "logits/chosen": -0.4942244589328766, "logits/rejected": -0.39027169346809387, "logps/chosen": -1.2216947078704834, "logps/rejected": -1.6423091888427734, "loss": 2.0511, "rewards/accuracies": 0.59375, "rewards/chosen": -1.2216947078704834, "rewards/margins": 0.4206143319606781, "rewards/rejected": -1.6423091888427734, "step": 95 }, { "epoch": 0.2093692750588851, "grad_norm": 25.050588840086288, "learning_rate": 9.641839665080363e-07, "logits/chosen": -0.46108850836753845, "logits/rejected": -0.423541396856308, "logps/chosen": -1.1832860708236694, "logps/rejected": -1.7398521900177002, "loss": 2.0554, "rewards/accuracies": 0.5874999761581421, "rewards/chosen": -1.1832860708236694, "rewards/margins": 0.5565661787986755, "rewards/rejected": -1.7398521900177002, "step": 100 }, { "epoch": 0.21983773881182936, "grad_norm": 76.09509812922548, "learning_rate": 9.570700944819582e-07, "logits/chosen": -0.48844489455223083, "logits/rejected": -0.47664815187454224, "logps/chosen": -1.065321683883667, "logps/rejected": -1.5008853673934937, "loss": 2.0306, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -1.065321683883667, "rewards/margins": 0.4355636537075043, "rewards/rejected": -1.5008853673934937, "step": 105 }, { "epoch": 0.23030620256477363, "grad_norm": 87.9539848283412, "learning_rate": 9.493435061259129e-07, "logits/chosen": -0.5218511819839478, "logits/rejected": -0.49293455481529236, "logps/chosen": -1.0804827213287354, "logps/rejected": -1.5555989742279053, "loss": 2.0182, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": -1.0804827213287354, "rewards/margins": 0.4751162528991699, "rewards/rejected": -1.5555989742279053, "step": 110 }, { "epoch": 0.24077466631771788, "grad_norm": 24.95587592194343, "learning_rate": 9.4101455916603e-07, "logits/chosen": -0.4004356265068054, "logits/rejected": -0.34801220893859863, "logps/chosen": -1.1054725646972656, "logps/rejected": -1.7531585693359375, "loss": 1.9992, "rewards/accuracies": 0.6937500238418579, "rewards/chosen": -1.1054725646972656, "rewards/margins": 0.6476858854293823, "rewards/rejected": -1.7531585693359375, "step": 115 }, { "epoch": 0.2512431300706621, "grad_norm": 53.12789958164912, "learning_rate": 9.320944188084241e-07, "logits/chosen": -0.3867969810962677, "logits/rejected": -0.3542706072330475, "logps/chosen": -1.3296326398849487, "logps/rejected": -1.7101236581802368, "loss": 2.069, "rewards/accuracies": 0.6187499761581421, "rewards/chosen": -1.3296326398849487, "rewards/margins": 0.3804909884929657, "rewards/rejected": -1.7101236581802368, "step": 120 }, { "epoch": 0.26171159382360637, "grad_norm": 25.68062394354381, "learning_rate": 9.225950427718974e-07, "logits/chosen": -0.4343915581703186, "logits/rejected": -0.40751656889915466, "logps/chosen": -1.1859281063079834, "logps/rejected": -1.5661814212799072, "loss": 2.0229, "rewards/accuracies": 0.6187499761581421, "rewards/chosen": -1.1859281063079834, "rewards/margins": 0.3802531659603119, "rewards/rejected": -1.5661814212799072, "step": 125 }, { "epoch": 0.2721800575765506, "grad_norm": 146.99732744643043, "learning_rate": 9.125291652582547e-07, "logits/chosen": -0.43255624175071716, "logits/rejected": -0.42008519172668457, "logps/chosen": -1.1270229816436768, "logps/rejected": -1.3844034671783447, "loss": 2.0368, "rewards/accuracies": 0.637499988079071, "rewards/chosen": -1.1270229816436768, "rewards/margins": 0.2573803663253784, "rewards/rejected": -1.3844034671783447, "step": 130 }, { "epoch": 0.2826485213294949, "grad_norm": 42.69972929682183, "learning_rate": 9.019102798817195e-07, "logits/chosen": -0.5087494254112244, "logits/rejected": -0.4200964570045471, "logps/chosen": -1.1956226825714111, "logps/rejected": -1.9745105504989624, "loss": 1.9952, "rewards/accuracies": 0.78125, "rewards/chosen": -1.1956226825714111, "rewards/margins": 0.7788880467414856, "rewards/rejected": -1.9745105504989624, "step": 135 }, { "epoch": 0.29311698508243916, "grad_norm": 19.87017547277629, "learning_rate": 8.90752621580335e-07, "logits/chosen": -0.4257656931877136, "logits/rejected": -0.364449143409729, "logps/chosen": -1.2079570293426514, "logps/rejected": -1.8338918685913086, "loss": 1.9605, "rewards/accuracies": 0.6812499761581421, "rewards/chosen": -1.2079570293426514, "rewards/margins": 0.6259347200393677, "rewards/rejected": -1.8338918685913086, "step": 140 }, { "epoch": 0.3035854488353834, "grad_norm": 15.24234201577276, "learning_rate": 8.79071147533597e-07, "logits/chosen": -0.47194284200668335, "logits/rejected": -0.44540295004844666, "logps/chosen": -1.2036808729171753, "logps/rejected": -1.6797609329223633, "loss": 2.0129, "rewards/accuracies": 0.637499988079071, "rewards/chosen": -1.2036808729171753, "rewards/margins": 0.4760800004005432, "rewards/rejected": -1.6797609329223633, "step": 145 }, { "epoch": 0.31405391258832765, "grad_norm": 41.583916372931604, "learning_rate": 8.668815171119019e-07, "logits/chosen": -0.4502836763858795, "logits/rejected": -0.416980117559433, "logps/chosen": -1.0764203071594238, "logps/rejected": -1.5866191387176514, "loss": 1.9679, "rewards/accuracies": 0.612500011920929, "rewards/chosen": -1.0764203071594238, "rewards/margins": 0.5101990699768066, "rewards/rejected": -1.5866191387176514, "step": 150 }, { "epoch": 0.3245223763412719, "grad_norm": 17.97044676211115, "learning_rate": 8.54200070884685e-07, "logits/chosen": -0.4577752947807312, "logits/rejected": -0.4022301733493805, "logps/chosen": -1.1599218845367432, "logps/rejected": -1.6104686260223389, "loss": 1.9736, "rewards/accuracies": 0.625, "rewards/chosen": -1.1599218845367432, "rewards/margins": 0.45054665207862854, "rewards/rejected": -1.6104686260223389, "step": 155 }, { "epoch": 0.33499084009421615, "grad_norm": 37.67621637306142, "learning_rate": 8.410438087153911e-07, "logits/chosen": -0.33586519956588745, "logits/rejected": -0.2821674942970276, "logps/chosen": -1.2303192615509033, "logps/rejected": -1.7895514965057373, "loss": 2.0104, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": -1.2303192615509033, "rewards/margins": 0.5592321753501892, "rewards/rejected": -1.7895514965057373, "step": 160 }, { "epoch": 0.34545930384716045, "grad_norm": 16.05482538779056, "learning_rate": 8.274303669726426e-07, "logits/chosen": -0.4002958834171295, "logits/rejected": -0.34722983837127686, "logps/chosen": -1.1306252479553223, "logps/rejected": -1.6940090656280518, "loss": 2.0112, "rewards/accuracies": 0.637499988079071, "rewards/chosen": -1.1306252479553223, "rewards/margins": 0.5633838176727295, "rewards/rejected": -1.6940090656280518, "step": 165 }, { "epoch": 0.3559277676001047, "grad_norm": 15.217072980607172, "learning_rate": 8.133779948881513e-07, "logits/chosen": -0.45079272985458374, "logits/rejected": -0.37534087896347046, "logps/chosen": -1.1774274110794067, "logps/rejected": -1.6361265182495117, "loss": 2.0148, "rewards/accuracies": 0.606249988079071, "rewards/chosen": -1.1774274110794067, "rewards/margins": 0.4586990773677826, "rewards/rejected": -1.6361265182495117, "step": 170 }, { "epoch": 0.36639623135304894, "grad_norm": 19.691952142371672, "learning_rate": 7.989055300930704e-07, "logits/chosen": -0.42495885491371155, "logits/rejected": -0.3137228488922119, "logps/chosen": -1.2254281044006348, "logps/rejected": -1.73735773563385, "loss": 2.0104, "rewards/accuracies": 0.6187499761581421, "rewards/chosen": -1.2254281044006348, "rewards/margins": 0.5119296312332153, "rewards/rejected": -1.73735773563385, "step": 175 }, { "epoch": 0.3768646951059932, "grad_norm": 30.34827875211837, "learning_rate": 7.840323733655778e-07, "logits/chosen": -0.3100610673427582, "logits/rejected": -0.25817859172821045, "logps/chosen": -1.2358551025390625, "logps/rejected": -1.8043813705444336, "loss": 1.9916, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": -1.2358551025390625, "rewards/margins": 0.5685264468193054, "rewards/rejected": -1.8043813705444336, "step": 180 }, { "epoch": 0.38733315885893743, "grad_norm": 21.54243489627896, "learning_rate": 7.687784626235447e-07, "logits/chosen": -0.24814710021018982, "logits/rejected": -0.12512032687664032, "logps/chosen": -1.2242952585220337, "logps/rejected": -1.974454641342163, "loss": 1.9456, "rewards/accuracies": 0.668749988079071, "rewards/chosen": -1.2242952585220337, "rewards/margins": 0.7501593828201294, "rewards/rejected": -1.974454641342163, "step": 185 }, { "epoch": 0.39780162261188173, "grad_norm": 27.936855442626964, "learning_rate": 7.531642461971514e-07, "logits/chosen": -0.2731862962245941, "logits/rejected": -0.18622538447380066, "logps/chosen": -1.176733136177063, "logps/rejected": -1.7295942306518555, "loss": 2.0622, "rewards/accuracies": 0.612500011920929, "rewards/chosen": -1.176733136177063, "rewards/margins": 0.5528609752655029, "rewards/rejected": -1.7295942306518555, "step": 190 }, { "epoch": 0.408270086364826, "grad_norm": 18.211412725741514, "learning_rate": 7.372106554172801e-07, "logits/chosen": -0.21031120419502258, "logits/rejected": -0.14914147555828094, "logps/chosen": -1.2273377180099487, "logps/rejected": -1.6471458673477173, "loss": 1.9975, "rewards/accuracies": 0.6875, "rewards/chosen": -1.2273377180099487, "rewards/margins": 0.41980820894241333, "rewards/rejected": -1.6471458673477173, "step": 195 }, { "epoch": 0.4187385501177702, "grad_norm": 28.304585509307277, "learning_rate": 7.209390765564318e-07, "logits/chosen": -0.13628198206424713, "logits/rejected": -0.0973358079791069, "logps/chosen": -1.2455083131790161, "logps/rejected": -1.753761649131775, "loss": 2.0029, "rewards/accuracies": 0.668749988079071, "rewards/chosen": -1.2455083131790161, "rewards/margins": 0.5082534551620483, "rewards/rejected": -1.753761649131775, "step": 200 }, { "epoch": 0.42920701387071447, "grad_norm": 23.204471353515586, "learning_rate": 7.043713221597773e-07, "logits/chosen": -0.07737751305103302, "logits/rejected": -0.005436101462692022, "logps/chosen": -1.0530147552490234, "logps/rejected": -1.7120428085327148, "loss": 1.9468, "rewards/accuracies": 0.65625, "rewards/chosen": -1.0530147552490234, "rewards/margins": 0.6590279340744019, "rewards/rejected": -1.7120428085327148, "step": 205 }, { "epoch": 0.4396754776236587, "grad_norm": 19.22100285707222, "learning_rate": 6.875296018047809e-07, "logits/chosen": -0.14544904232025146, "logits/rejected": -0.09322938323020935, "logps/chosen": -1.25759756565094, "logps/rejected": -1.6059818267822266, "loss": 2.0319, "rewards/accuracies": 0.625, "rewards/chosen": -1.25759756565094, "rewards/margins": 0.3483843505382538, "rewards/rejected": -1.6059818267822266, "step": 210 }, { "epoch": 0.45014394137660296, "grad_norm": 28.866239067564, "learning_rate": 6.704364923285857e-07, "logits/chosen": -0.21608710289001465, "logits/rejected": -0.135384202003479, "logps/chosen": -1.1534065008163452, "logps/rejected": -1.7110164165496826, "loss": 1.9831, "rewards/accuracies": 0.7875000238418579, "rewards/chosen": -1.1534065008163452, "rewards/margins": 0.5576101541519165, "rewards/rejected": -1.7110164165496826, "step": 215 }, { "epoch": 0.46061240512954726, "grad_norm": 83.00316897734959, "learning_rate": 6.531149075630796e-07, "logits/chosen": -0.22518062591552734, "logits/rejected": -0.04796000197529793, "logps/chosen": -1.2540584802627563, "logps/rejected": -1.8683173656463623, "loss": 1.9781, "rewards/accuracies": 0.6312500238418579, "rewards/chosen": -1.2540584802627563, "rewards/margins": 0.6142589449882507, "rewards/rejected": -1.8683173656463623, "step": 220 }, { "epoch": 0.4710808688824915, "grad_norm": 15.63146505822897, "learning_rate": 6.355880676182085e-07, "logits/chosen": -0.24729761481285095, "logits/rejected": -0.10253201425075531, "logps/chosen": -1.148567795753479, "logps/rejected": -1.861864447593689, "loss": 1.9337, "rewards/accuracies": 0.706250011920929, "rewards/chosen": -1.148567795753479, "rewards/margins": 0.7132967114448547, "rewards/rejected": -1.861864447593689, "step": 225 }, { "epoch": 0.48154933263543576, "grad_norm": 1092.6523686417404, "learning_rate": 6.178794677547137e-07, "logits/chosen": -0.33141931891441345, "logits/rejected": -0.1571967899799347, "logps/chosen": -1.125832200050354, "logps/rejected": -1.9030935764312744, "loss": 1.9444, "rewards/accuracies": 0.6937500238418579, "rewards/chosen": -1.125832200050354, "rewards/margins": 0.7772611379623413, "rewards/rejected": -1.9030935764312744, "step": 230 }, { "epoch": 0.49201779638838, "grad_norm": 18.438187215474308, "learning_rate": 6.000128468880222e-07, "logits/chosen": -0.19492967426776886, "logits/rejected": -0.088912233710289, "logps/chosen": -1.1279089450836182, "logps/rejected": -1.7057428359985352, "loss": 1.9794, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -1.1279089450836182, "rewards/margins": 0.5778340101242065, "rewards/rejected": -1.7057428359985352, "step": 235 }, { "epoch": 0.5024862601413242, "grad_norm": 45.11647792728952, "learning_rate": 5.820121557655108e-07, "logits/chosen": -0.17841561138629913, "logits/rejected": -0.08987215161323547, "logps/chosen": -1.1346948146820068, "logps/rejected": -1.8120676279067993, "loss": 1.9898, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -1.1346948146820068, "rewards/margins": 0.6773727536201477, "rewards/rejected": -1.8120676279067993, "step": 240 }, { "epoch": 0.5129547238942685, "grad_norm": 53.80136713305279, "learning_rate": 5.639015248598023e-07, "logits/chosen": -0.2315063774585724, "logits/rejected": -0.11919162422418594, "logps/chosen": -1.254396677017212, "logps/rejected": -1.7449557781219482, "loss": 1.9968, "rewards/accuracies": 0.625, "rewards/chosen": -1.254396677017212, "rewards/margins": 0.49055904150009155, "rewards/rejected": -1.7449557781219482, "step": 245 }, { "epoch": 0.5234231876472127, "grad_norm": 30.376240963875347, "learning_rate": 5.457052320211339e-07, "logits/chosen": -0.2132711410522461, "logits/rejected": -0.11911521106958389, "logps/chosen": -1.1606347560882568, "logps/rejected": -1.8521320819854736, "loss": 1.9963, "rewards/accuracies": 0.6812499761581421, "rewards/chosen": -1.1606347560882568, "rewards/margins": 0.6914970874786377, "rewards/rejected": -1.8521320819854736, "step": 250 }, { "epoch": 0.533891651400157, "grad_norm": 24.612321850210826, "learning_rate": 5.274476699321637e-07, "logits/chosen": -0.17434340715408325, "logits/rejected": -0.02575433813035488, "logps/chosen": -1.2206462621688843, "logps/rejected": -1.893471121788025, "loss": 1.9294, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -1.2206462621688843, "rewards/margins": 0.6728248000144958, "rewards/rejected": -1.893471121788025, "step": 255 }, { "epoch": 0.5443601151531012, "grad_norm": 23.578174980485148, "learning_rate": 5.091533134088387e-07, "logits/chosen": -0.19827161729335785, "logits/rejected": -0.10442183911800385, "logps/chosen": -1.1325616836547852, "logps/rejected": -1.894374132156372, "loss": 1.9889, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -1.1325616836547852, "rewards/margins": 0.7618124485015869, "rewards/rejected": -1.894374132156372, "step": 260 }, { "epoch": 0.5548285789060455, "grad_norm": 23.363765551953982, "learning_rate": 4.908466865911614e-07, "logits/chosen": -0.22801117599010468, "logits/rejected": -0.15166376531124115, "logps/chosen": -1.2147762775421143, "logps/rejected": -1.6708816289901733, "loss": 1.9391, "rewards/accuracies": 0.6875, "rewards/chosen": -1.2147762775421143, "rewards/margins": 0.45610541105270386, "rewards/rejected": -1.6708816289901733, "step": 265 }, { "epoch": 0.5652970426589898, "grad_norm": 20.86303085584383, "learning_rate": 4.7255233006783624e-07, "logits/chosen": -0.22982990741729736, "logits/rejected": -0.13931187987327576, "logps/chosen": -1.2865099906921387, "logps/rejected": -1.766331434249878, "loss": 1.9878, "rewards/accuracies": 0.71875, "rewards/chosen": -1.2865099906921387, "rewards/margins": 0.47982144355773926, "rewards/rejected": -1.766331434249878, "step": 270 }, { "epoch": 0.575765506411934, "grad_norm": 12.144303285220628, "learning_rate": 4.5429476797886617e-07, "logits/chosen": -0.2274014949798584, "logits/rejected": -0.07431206852197647, "logps/chosen": -1.1824675798416138, "logps/rejected": -1.998253583908081, "loss": 1.962, "rewards/accuracies": 0.6937500238418579, "rewards/chosen": -1.1824675798416138, "rewards/margins": 0.8157860040664673, "rewards/rejected": -1.998253583908081, "step": 275 }, { "epoch": 0.5862339701648783, "grad_norm": 37.56330617572613, "learning_rate": 4.3609847514019763e-07, "logits/chosen": -0.2594318687915802, "logits/rejected": -0.14403223991394043, "logps/chosen": -1.1071598529815674, "logps/rejected": -1.610290765762329, "loss": 1.957, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -1.1071598529815674, "rewards/margins": 0.5031307935714722, "rewards/rejected": -1.610290765762329, "step": 280 }, { "epoch": 0.5967024339178225, "grad_norm": 55.56290292891477, "learning_rate": 4.179878442344892e-07, "logits/chosen": -0.2227039635181427, "logits/rejected": -0.1900090128183365, "logps/chosen": -1.1886059045791626, "logps/rejected": -1.7931125164031982, "loss": 1.9481, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -1.1886059045791626, "rewards/margins": 0.60450679063797, "rewards/rejected": -1.7931125164031982, "step": 285 }, { "epoch": 0.6071708976707668, "grad_norm": 24.48468402537705, "learning_rate": 3.9998715311197783e-07, "logits/chosen": -0.26827192306518555, "logits/rejected": -0.17545387148857117, "logps/chosen": -1.1850652694702148, "logps/rejected": -1.8715204000473022, "loss": 1.9349, "rewards/accuracies": 0.668749988079071, "rewards/chosen": -1.1850652694702148, "rewards/margins": 0.6864550709724426, "rewards/rejected": -1.8715204000473022, "step": 290 }, { "epoch": 0.6176393614237111, "grad_norm": 19.0989416435893, "learning_rate": 3.821205322452863e-07, "logits/chosen": -0.2373635321855545, "logits/rejected": -0.1607808768749237, "logps/chosen": -1.1796191930770874, "logps/rejected": -1.9065383672714233, "loss": 1.9901, "rewards/accuracies": 0.6812499761581421, "rewards/chosen": -1.1796191930770874, "rewards/margins": 0.7269191741943359, "rewards/rejected": -1.9065383672714233, "step": 295 }, { "epoch": 0.6281078251766553, "grad_norm": 35.51594817128474, "learning_rate": 3.6441193238179146e-07, "logits/chosen": -0.28120699524879456, "logits/rejected": -0.2147771418094635, "logps/chosen": -1.2024883031845093, "logps/rejected": -1.7524086236953735, "loss": 1.9577, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -1.2024883031845093, "rewards/margins": 0.5499202013015747, "rewards/rejected": -1.7524086236953735, "step": 300 }, { "epoch": 0.6385762889295996, "grad_norm": 19.807409901213642, "learning_rate": 3.4688509243692034e-07, "logits/chosen": -0.1579556167125702, "logits/rejected": -0.09319324791431427, "logps/chosen": -1.2312943935394287, "logps/rejected": -1.9326064586639404, "loss": 1.9317, "rewards/accuracies": 0.668749988079071, "rewards/chosen": -1.2312943935394287, "rewards/margins": 0.7013120055198669, "rewards/rejected": -1.9326064586639404, "step": 305 }, { "epoch": 0.6490447526825438, "grad_norm": 26.79163246884692, "learning_rate": 3.295635076714144e-07, "logits/chosen": -0.13611330091953278, "logits/rejected": -0.1433105766773224, "logps/chosen": -1.1258060932159424, "logps/rejected": -1.763738989830017, "loss": 1.9276, "rewards/accuracies": 0.706250011920929, "rewards/chosen": -1.1258060932159424, "rewards/margins": 0.6379327774047852, "rewards/rejected": -1.763738989830017, "step": 310 }, { "epoch": 0.6595132164354881, "grad_norm": 26.007353485880714, "learning_rate": 3.12470398195219e-07, "logits/chosen": -0.1855328381061554, "logits/rejected": -0.06350420415401459, "logps/chosen": -1.1226041316986084, "logps/rejected": -1.979421854019165, "loss": 1.9461, "rewards/accuracies": 0.7437499761581421, "rewards/chosen": -1.1226041316986084, "rewards/margins": 0.8568177223205566, "rewards/rejected": -1.979421854019165, "step": 315 }, { "epoch": 0.6699816801884323, "grad_norm": 25.93600538288609, "learning_rate": 2.956286778402226e-07, "logits/chosen": -0.16057109832763672, "logits/rejected": -0.10531453043222427, "logps/chosen": -1.1869053840637207, "logps/rejected": -1.7816956043243408, "loss": 1.8982, "rewards/accuracies": 0.6312500238418579, "rewards/chosen": -1.1869053840637207, "rewards/margins": 0.5947902798652649, "rewards/rejected": -1.7816956043243408, "step": 320 }, { "epoch": 0.6804501439413766, "grad_norm": 41.1877461903664, "learning_rate": 2.7906092344356826e-07, "logits/chosen": -0.16566753387451172, "logits/rejected": -0.06549857556819916, "logps/chosen": -1.1580512523651123, "logps/rejected": -1.8924694061279297, "loss": 1.9157, "rewards/accuracies": 0.6812499761581421, "rewards/chosen": -1.1580512523651123, "rewards/margins": 0.7344181537628174, "rewards/rejected": -1.8924694061279297, "step": 325 }, { "epoch": 0.6909186076943209, "grad_norm": 13.497224748766067, "learning_rate": 2.6278934458271996e-07, "logits/chosen": -0.09990070015192032, "logits/rejected": -0.019180208444595337, "logps/chosen": -1.1130152940750122, "logps/rejected": -1.6457436084747314, "loss": 1.9451, "rewards/accuracies": 0.6187499761581421, "rewards/chosen": -1.1130152940750122, "rewards/margins": 0.5327284932136536, "rewards/rejected": -1.6457436084747314, "step": 330 }, { "epoch": 0.7013870714472651, "grad_norm": 20.73440619316291, "learning_rate": 2.468357538028487e-07, "logits/chosen": -0.17166391015052795, "logits/rejected": -0.08680696785449982, "logps/chosen": -1.109227180480957, "logps/rejected": -1.7418838739395142, "loss": 1.9573, "rewards/accuracies": 0.706250011920929, "rewards/chosen": -1.109227180480957, "rewards/margins": 0.6326566934585571, "rewards/rejected": -1.7418838739395142, "step": 335 }, { "epoch": 0.7118555352002094, "grad_norm": 20.25166204813565, "learning_rate": 2.312215373764551e-07, "logits/chosen": -0.155477374792099, "logits/rejected": -0.05189569666981697, "logps/chosen": -1.3119245767593384, "logps/rejected": -1.9228538274765015, "loss": 1.9728, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -1.3119245767593384, "rewards/margins": 0.6109293103218079, "rewards/rejected": -1.9228538274765015, "step": 340 }, { "epoch": 0.7223239989531536, "grad_norm": 35.62472752098736, "learning_rate": 2.1596762663442213e-07, "logits/chosen": -0.18124118447303772, "logits/rejected": -0.04932355508208275, "logps/chosen": -1.2099921703338623, "logps/rejected": -1.9292633533477783, "loss": 1.9751, "rewards/accuracies": 0.6812499761581421, "rewards/chosen": -1.2099921703338623, "rewards/margins": 0.719271183013916, "rewards/rejected": -1.9292633533477783, "step": 345 }, { "epoch": 0.7327924627060979, "grad_norm": 19.36102520036485, "learning_rate": 2.0109446990692963e-07, "logits/chosen": -0.048113010823726654, "logits/rejected": -0.02143859677016735, "logps/chosen": -1.227217197418213, "logps/rejected": -1.7735779285430908, "loss": 2.0111, "rewards/accuracies": 0.6187499761581421, "rewards/chosen": -1.227217197418213, "rewards/margins": 0.5463606715202332, "rewards/rejected": -1.7735779285430908, "step": 350 }, { "epoch": 0.7432609264590422, "grad_norm": 16.299019547207138, "learning_rate": 1.8662200511184872e-07, "logits/chosen": -0.09398343414068222, "logits/rejected": -0.01715996116399765, "logps/chosen": -1.061127781867981, "logps/rejected": -1.851822853088379, "loss": 1.8894, "rewards/accuracies": 0.7562500238418579, "rewards/chosen": -1.061127781867981, "rewards/margins": 0.7906948328018188, "rewards/rejected": -1.851822853088379, "step": 355 }, { "epoch": 0.7537293902119864, "grad_norm": 21.325612236488393, "learning_rate": 1.725696330273575e-07, "logits/chosen": -0.19810739159584045, "logits/rejected": -0.09949172288179398, "logps/chosen": -1.0794689655303955, "logps/rejected": -1.6091794967651367, "loss": 1.8836, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": -1.0794689655303955, "rewards/margins": 0.529710590839386, "rewards/rejected": -1.6091794967651367, "step": 360 }, { "epoch": 0.7641978539649307, "grad_norm": 17.67539053293725, "learning_rate": 1.589561912846089e-07, "logits/chosen": -0.19371333718299866, "logits/rejected": -0.06843050569295883, "logps/chosen": -1.2321817874908447, "logps/rejected": -1.8411308526992798, "loss": 1.9833, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -1.2321817874908447, "rewards/margins": 0.6089491844177246, "rewards/rejected": -1.8411308526992798, "step": 365 }, { "epoch": 0.7746663177178749, "grad_norm": 23.235373655195016, "learning_rate": 1.4579992911531496e-07, "logits/chosen": -0.11578913033008575, "logits/rejected": -0.025940338149666786, "logps/chosen": -1.196590781211853, "logps/rejected": -1.895391821861267, "loss": 1.9263, "rewards/accuracies": 0.6875, "rewards/chosen": -1.196590781211853, "rewards/margins": 0.6988012790679932, "rewards/rejected": -1.895391821861267, "step": 370 }, { "epoch": 0.7851347814708192, "grad_norm": 19.259561354186946, "learning_rate": 1.3311848288809813e-07, "logits/chosen": -0.11768321692943573, "logits/rejected": -0.1705169379711151, "logps/chosen": -1.2138588428497314, "logps/rejected": -1.7918386459350586, "loss": 1.9695, "rewards/accuracies": 0.6187499761581421, "rewards/chosen": -1.2138588428497314, "rewards/margins": 0.5779798030853271, "rewards/rejected": -1.7918386459350586, "step": 375 }, { "epoch": 0.7956032452237635, "grad_norm": 19.09434464976567, "learning_rate": 1.209288524664029e-07, "logits/chosen": -0.1390591561794281, "logits/rejected": -0.08628968149423599, "logps/chosen": -1.211247444152832, "logps/rejected": -1.7502481937408447, "loss": 1.9086, "rewards/accuracies": 0.612500011920929, "rewards/chosen": -1.211247444152832, "rewards/margins": 0.5390007495880127, "rewards/rejected": -1.7502481937408447, "step": 380 }, { "epoch": 0.8060717089767077, "grad_norm": 22.75496669970745, "learning_rate": 1.0924737841966497e-07, "logits/chosen": -0.14960381388664246, "logits/rejected": -0.08989100158214569, "logps/chosen": -1.1806560754776, "logps/rejected": -1.799631118774414, "loss": 1.9473, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -1.1806560754776, "rewards/margins": 0.6189749240875244, "rewards/rejected": -1.799631118774414, "step": 385 }, { "epoch": 0.816540172729652, "grad_norm": 21.199803415422714, "learning_rate": 9.808972011828054e-08, "logits/chosen": -0.13692599534988403, "logits/rejected": -0.04226923733949661, "logps/chosen": -1.1819908618927002, "logps/rejected": -1.9731757640838623, "loss": 1.9367, "rewards/accuracies": 0.731249988079071, "rewards/chosen": -1.1819908618927002, "rewards/margins": 0.7911848425865173, "rewards/rejected": -1.9731757640838623, "step": 390 }, { "epoch": 0.8270086364825961, "grad_norm": 33.49806758309421, "learning_rate": 8.747083474174527e-08, "logits/chosen": -0.13622619211673737, "logits/rejected": 0.037842754274606705, "logps/chosen": -1.2155778408050537, "logps/rejected": -1.890428900718689, "loss": 1.9388, "rewards/accuracies": 0.706250011920929, "rewards/chosen": -1.2155778408050537, "rewards/margins": 0.6748510599136353, "rewards/rejected": -1.890428900718689, "step": 395 }, { "epoch": 0.8374771002355405, "grad_norm": 15.890713698381443, "learning_rate": 7.740495722810269e-08, "logits/chosen": -0.05593853071331978, "logits/rejected": -0.004029959440231323, "logps/chosen": -1.112066388130188, "logps/rejected": -1.8403129577636719, "loss": 1.9207, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": -1.112066388130188, "rewards/margins": 0.7282465696334839, "rewards/rejected": -1.8403129577636719, "step": 400 }, { "epoch": 0.8479455639884846, "grad_norm": 19.88967649390424, "learning_rate": 6.790558119157597e-08, "logits/chosen": -0.18492689728736877, "logits/rejected": -0.10850385576486588, "logps/chosen": -1.2788586616516113, "logps/rejected": -2.0290207862854004, "loss": 1.9523, "rewards/accuracies": 0.65625, "rewards/chosen": -1.2788586616516113, "rewards/margins": 0.7501621842384338, "rewards/rejected": -2.0290207862854004, "step": 405 }, { "epoch": 0.8584140277414289, "grad_norm": 22.56741853126592, "learning_rate": 5.898544083397e-08, "logits/chosen": -0.14272233843803406, "logits/rejected": -0.0651661604642868, "logps/chosen": -1.1273430585861206, "logps/rejected": -1.6827017068862915, "loss": 1.9304, "rewards/accuracies": 0.6312500238418579, "rewards/chosen": -1.1273430585861206, "rewards/margins": 0.5553585290908813, "rewards/rejected": -1.6827017068862915, "step": 410 }, { "epoch": 0.8688824914943732, "grad_norm": 15.199788752886258, "learning_rate": 5.065649387408705e-08, "logits/chosen": -0.14387831091880798, "logits/rejected": -0.009860972873866558, "logps/chosen": -1.161084771156311, "logps/rejected": -1.8390836715698242, "loss": 1.9141, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -1.161084771156311, "rewards/margins": 0.6779987812042236, "rewards/rejected": -1.8390836715698242, "step": 415 }, { "epoch": 0.8793509552473174, "grad_norm": 14.485810825336134, "learning_rate": 4.292990551804171e-08, "logits/chosen": -0.12360888719558716, "logits/rejected": -0.05216851085424423, "logps/chosen": -1.1394500732421875, "logps/rejected": -1.831883192062378, "loss": 1.9578, "rewards/accuracies": 0.6187499761581421, "rewards/chosen": -1.1394500732421875, "rewards/margins": 0.69243323802948, "rewards/rejected": -1.831883192062378, "step": 420 }, { "epoch": 0.8898194190002617, "grad_norm": 22.957524299991945, "learning_rate": 3.581603349196371e-08, "logits/chosen": -0.08880945295095444, "logits/rejected": -0.02426137961447239, "logps/chosen": -1.296489953994751, "logps/rejected": -1.8570985794067383, "loss": 1.9254, "rewards/accuracies": 0.637499988079071, "rewards/chosen": -1.296489953994751, "rewards/margins": 0.5606086254119873, "rewards/rejected": -1.8570985794067383, "step": 425 }, { "epoch": 0.9002878827532059, "grad_norm": 17.939695720657745, "learning_rate": 2.9324414157151367e-08, "logits/chosen": -0.10626481473445892, "logits/rejected": -0.055657435208559036, "logps/chosen": -1.219440221786499, "logps/rejected": -1.922663688659668, "loss": 1.9204, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -1.219440221786499, "rewards/margins": 0.7032233476638794, "rewards/rejected": -1.922663688659668, "step": 430 }, { "epoch": 0.9107563465061502, "grad_norm": 19.609830420854962, "learning_rate": 2.3463749726290284e-08, "logits/chosen": -0.14449790120124817, "logits/rejected": -0.08098597824573517, "logps/chosen": -1.1550737619400024, "logps/rejected": -1.9791103601455688, "loss": 1.9163, "rewards/accuracies": 0.668749988079071, "rewards/chosen": -1.1550737619400024, "rewards/margins": 0.8240365982055664, "rewards/rejected": -1.9791103601455688, "step": 435 }, { "epoch": 0.9212248102590945, "grad_norm": 31.437744158726638, "learning_rate": 1.824189659787284e-08, "logits/chosen": 0.0060030072927474976, "logits/rejected": 0.009024476632475853, "logps/chosen": -1.1824986934661865, "logps/rejected": -1.7867063283920288, "loss": 1.9724, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -1.1824986934661865, "rewards/margins": 0.6042075157165527, "rewards/rejected": -1.7867063283920288, "step": 440 }, { "epoch": 0.9316932740120387, "grad_norm": 34.49114038658599, "learning_rate": 1.3665854824458035e-08, "logits/chosen": -0.15822723507881165, "logits/rejected": -0.08658315241336823, "logps/chosen": -1.1747385263442993, "logps/rejected": -1.7831497192382812, "loss": 1.9708, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -1.1747385263442993, "rewards/margins": 0.6084113121032715, "rewards/rejected": -1.7831497192382812, "step": 445 }, { "epoch": 0.942161737764983, "grad_norm": 22.736368343788918, "learning_rate": 9.741758728888217e-09, "logits/chosen": -0.05001335218548775, "logits/rejected": -0.013674241490662098, "logps/chosen": -1.179164171218872, "logps/rejected": -1.8373947143554688, "loss": 1.909, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -1.179164171218872, "rewards/margins": 0.6582303643226624, "rewards/rejected": -1.8373947143554688, "step": 450 }, { "epoch": 0.9526302015179272, "grad_norm": 21.67558731525575, "learning_rate": 6.474868681043577e-09, "logits/chosen": -0.10400988906621933, "logits/rejected": -0.05608060210943222, "logps/chosen": -1.3397135734558105, "logps/rejected": -1.716301679611206, "loss": 1.9844, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -1.3397135734558105, "rewards/margins": 0.3765881657600403, "rewards/rejected": -1.716301679611206, "step": 455 }, { "epoch": 0.9630986652708715, "grad_norm": 22.99781560062592, "learning_rate": 3.869564046156459e-09, "logits/chosen": -0.06456808745861053, "logits/rejected": -0.012792855501174927, "logps/chosen": -1.0940654277801514, "logps/rejected": -1.805354356765747, "loss": 1.8916, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -1.0940654277801514, "rewards/margins": 0.7112888097763062, "rewards/rejected": -1.805354356765747, "step": 460 }, { "epoch": 0.9735671290238157, "grad_norm": 17.285058450470018, "learning_rate": 1.929337314139412e-09, "logits/chosen": -0.19403138756752014, "logits/rejected": -0.07949899882078171, "logps/chosen": -1.2133488655090332, "logps/rejected": -1.8430767059326172, "loss": 1.9376, "rewards/accuracies": 0.6875, "rewards/chosen": -1.2133488655090332, "rewards/margins": 0.6297277808189392, "rewards/rejected": -1.8430767059326172, "step": 465 }, { "epoch": 0.98403559277676, "grad_norm": 19.37788975464885, "learning_rate": 6.567894177967325e-10, "logits/chosen": -0.1643257737159729, "logits/rejected": -0.06100650504231453, "logps/chosen": -1.181461215019226, "logps/rejected": -1.707772970199585, "loss": 1.9914, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -1.181461215019226, "rewards/margins": 0.5263119339942932, "rewards/rejected": -1.707772970199585, "step": 470 }, { "epoch": 0.9945040565297043, "grad_norm": 24.408366857719134, "learning_rate": 5.3626246194704575e-11, "logits/chosen": -0.20142404735088348, "logits/rejected": -0.07068441808223724, "logps/chosen": -1.2009718418121338, "logps/rejected": -1.803815245628357, "loss": 1.9479, "rewards/accuracies": 0.612500011920929, "rewards/chosen": -1.2009718418121338, "rewards/margins": 0.6028433442115784, "rewards/rejected": -1.803815245628357, "step": 475 }, { "epoch": 0.998691442030882, "step": 477, "total_flos": 0.0, "train_loss": 0.0, "train_runtime": 4.3143, "train_samples_per_second": 14170.447, "train_steps_per_second": 110.564 } ], "logging_steps": 5, "max_steps": 477, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 1000000, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 0.0, "train_batch_size": 2, "trial_name": null, "trial_params": null }