{ "best_metric": null, "best_model_checkpoint": null, "epoch": 0.9997382884061764, "eval_steps": 100, "global_step": 955, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0, "learning_rate": 5.208333333333333e-08, "logps/chosen": -304.8013916015625, "logps/rejected": -229.5030517578125, "loss": 0.6931, "neglected": 10.0, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "selected": 0.0, "step": 1 }, { "epoch": 0.01, "learning_rate": 5.208333333333334e-07, "logps/chosen": -313.4251708984375, "logps/rejected": -277.2637023925781, "loss": 0.693, "neglected": 90.0, "rewards/accuracies": 0.4861111044883728, "rewards/chosen": 8.512949716532603e-05, "rewards/margins": 0.0002041187253780663, "rewards/rejected": -0.00011898923548869789, "selected": 0.0, "step": 10 }, { "epoch": 0.02, "learning_rate": 1.0416666666666667e-06, "logps/chosen": -229.010986328125, "logps/rejected": -232.58932495117188, "loss": 0.6931, "neglected": 242.0, "rewards/accuracies": 0.4749999940395355, "rewards/chosen": 0.00024582125479355454, "rewards/margins": 0.0001970421290025115, "rewards/rejected": 4.8779074859339744e-05, "selected": 0.0, "step": 20 }, { "epoch": 0.03, "learning_rate": 1.5625e-06, "logps/chosen": -270.5987548828125, "logps/rejected": -244.8210906982422, "loss": 0.693, "neglected": 402.0, "rewards/accuracies": 0.5, "rewards/chosen": -2.5821285817073658e-05, "rewards/margins": 0.00013345989282242954, "rewards/rejected": -0.00015928114589769393, "selected": 0.0, "step": 30 }, { "epoch": 0.04, "learning_rate": 2.0833333333333334e-06, "logps/chosen": -270.58099365234375, "logps/rejected": -263.6953125, "loss": 0.6928, "neglected": 562.0, "rewards/accuracies": 0.5375000238418579, "rewards/chosen": 0.0004904457600787282, "rewards/margins": 0.0006916436250321567, "rewards/rejected": -0.00020119785040151328, "selected": 0.0, "step": 40 }, { "epoch": 0.05, "learning_rate": 2.604166666666667e-06, "logps/chosen": -255.3240509033203, "logps/rejected": -249.5677490234375, "loss": 0.6925, "neglected": 722.0, "rewards/accuracies": 0.581250011920929, "rewards/chosen": 0.0003948220401071012, "rewards/margins": 0.001298406976275146, "rewards/rejected": -0.0009035851107910275, "selected": 0.0, "step": 50 }, { "epoch": 0.06, "learning_rate": 3.125e-06, "logps/chosen": -278.1262512207031, "logps/rejected": -257.60858154296875, "loss": 0.6921, "neglected": 882.0, "rewards/accuracies": 0.6187499761581421, "rewards/chosen": 0.001432361314073205, "rewards/margins": 0.0019829857628792524, "rewards/rejected": -0.000550624099560082, "selected": 0.0, "step": 60 }, { "epoch": 0.07, "learning_rate": 3.6458333333333333e-06, "logps/chosen": -285.159423828125, "logps/rejected": -263.88519287109375, "loss": 0.6909, "neglected": 1042.0, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": 0.0027831769548356533, "rewards/margins": 0.004454310052096844, "rewards/rejected": -0.0016711335629224777, "selected": 0.0, "step": 70 }, { "epoch": 0.08, "learning_rate": 4.166666666666667e-06, "logps/chosen": -292.46826171875, "logps/rejected": -263.4407043457031, "loss": 0.6879, "neglected": 1202.0, "rewards/accuracies": 0.706250011920929, "rewards/chosen": 0.0038231350481510162, "rewards/margins": 0.009836939163506031, "rewards/rejected": -0.006013805046677589, "selected": 0.0, "step": 80 }, { "epoch": 0.09, "learning_rate": 4.6875000000000004e-06, "logps/chosen": -280.253662109375, "logps/rejected": -257.877197265625, "loss": 0.6827, "neglected": 1362.0, "rewards/accuracies": 0.71875, "rewards/chosen": 0.007530213333666325, "rewards/margins": 0.01851402223110199, "rewards/rejected": -0.010983810760080814, "selected": 0.0, "step": 90 }, { "epoch": 0.1, "learning_rate": 4.9997324926814375e-06, "logps/chosen": -262.7320251464844, "logps/rejected": -284.05206298828125, "loss": 0.6727, "neglected": 1522.0, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 0.008031011559069157, "rewards/margins": 0.026352444663643837, "rewards/rejected": -0.018321430310606956, "selected": 0.0, "step": 100 }, { "epoch": 0.1, "eval_logps/chosen": -272.2622985839844, "eval_logps/rejected": -256.4744873046875, "eval_loss": 0.6630775332450867, "eval_neglected": 256.0, "eval_rewards/accuracies": 0.7023809552192688, "eval_rewards/chosen": 0.0073691424913704395, "eval_rewards/margins": 0.040533702820539474, "eval_rewards/rejected": -0.03316456079483032, "eval_runtime": 667.065, "eval_samples_per_second": 2.998, "eval_selected": 0.0, "eval_steps_per_second": 0.094, "step": 100 }, { "epoch": 0.12, "learning_rate": 4.996723692767927e-06, "logps/chosen": -256.07342529296875, "logps/rejected": -245.7953643798828, "loss": 0.6539, "neglected": 586.0, "rewards/accuracies": 0.6937500238418579, "rewards/chosen": -0.0058533623814582825, "rewards/margins": 0.04092119634151459, "rewards/rejected": -0.04677455872297287, "selected": 0.0, "step": 110 }, { "epoch": 0.13, "learning_rate": 4.9903757462135984e-06, "logps/chosen": -270.7391357421875, "logps/rejected": -248.25411987304688, "loss": 0.6261, "neglected": 746.0, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -0.039820872247219086, "rewards/margins": 0.06930369138717651, "rewards/rejected": -0.1091245636343956, "selected": 0.0, "step": 120 }, { "epoch": 0.14, "learning_rate": 4.980697142834315e-06, "logps/chosen": -312.620849609375, "logps/rejected": -286.03900146484375, "loss": 0.588, "neglected": 906.0, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -0.1690162718296051, "rewards/margins": 0.05675806850194931, "rewards/rejected": -0.2257743626832962, "selected": 0.0, "step": 130 }, { "epoch": 0.15, "learning_rate": 4.967700826904229e-06, "logps/chosen": -297.30609130859375, "logps/rejected": -288.460693359375, "loss": 0.5194, "neglected": 1066.0, "rewards/accuracies": 0.6312500238418579, "rewards/chosen": -0.4745141863822937, "rewards/margins": 0.12637066841125488, "rewards/rejected": -0.6008848547935486, "selected": 0.0, "step": 140 }, { "epoch": 0.16, "learning_rate": 4.951404179843963e-06, "logps/chosen": -450.36004638671875, "logps/rejected": -442.3370056152344, "loss": 0.3643, "neglected": 1226.0, "rewards/accuracies": 0.606249988079071, "rewards/chosen": -1.6727848052978516, "rewards/margins": 0.14853505790233612, "rewards/rejected": -1.821319818496704, "selected": 0.0, "step": 150 }, { "epoch": 0.17, "learning_rate": 4.931828996974498e-06, "logps/chosen": -864.3917846679688, "logps/rejected": -883.9728393554688, "loss": 0.1924, "neglected": 1386.0, "rewards/accuracies": 0.48750001192092896, "rewards/chosen": -6.123291492462158, "rewards/margins": 0.1330125331878662, "rewards/rejected": -6.2563042640686035, "selected": 0.0, "step": 160 }, { "epoch": 0.18, "learning_rate": 4.909001458367867e-06, "logps/chosen": -2515.188232421875, "logps/rejected": -2068.07666015625, "loss": 0.0935, "neglected": 1546.0, "rewards/accuracies": 0.5, "rewards/chosen": -22.28314781188965, "rewards/margins": -4.024808406829834, "rewards/rejected": -18.25834083557129, "selected": 0.0, "step": 170 }, { "epoch": 0.19, "learning_rate": 4.882952093833628e-06, "logps/chosen": -4048.93603515625, "logps/rejected": -3594.38134765625, "loss": 0.0695, "neglected": 1706.0, "rewards/accuracies": 0.42500001192092896, "rewards/chosen": -38.03619384765625, "rewards/margins": -4.711598873138428, "rewards/rejected": -33.32460021972656, "selected": 0.0, "step": 180 }, { "epoch": 0.2, "learning_rate": 4.853715742087947e-06, "logps/chosen": -5398.57421875, "logps/rejected": -4498.2265625, "loss": 0.0402, "neglected": 1866.0, "rewards/accuracies": 0.4375, "rewards/chosen": -51.07048416137695, "rewards/margins": -8.596471786499023, "rewards/rejected": -42.47401428222656, "selected": 0.0, "step": 190 }, { "epoch": 0.21, "learning_rate": 4.821331504159906e-06, "logps/chosen": -8471.2490234375, "logps/rejected": -7905.1455078125, "loss": 0.0392, "neglected": 2026.0, "rewards/accuracies": 0.4375, "rewards/chosen": -82.20130920410156, "rewards/margins": -5.739879131317139, "rewards/rejected": -76.46143341064453, "selected": 0.0, "step": 200 }, { "epoch": 0.21, "eval_logps/chosen": -12272.142578125, "eval_logps/rejected": -10795.0419921875, "eval_loss": 0.027584508061408997, "eval_neglected": 256.0, "eval_rewards/accuracies": 0.4464285671710968, "eval_rewards/chosen": -119.99142456054688, "eval_rewards/margins": -14.572598457336426, "eval_rewards/rejected": -105.4188232421875, "eval_runtime": 670.5193, "eval_samples_per_second": 2.983, "eval_selected": 0.0, "eval_steps_per_second": 0.094, "step": 200 }, { "epoch": 0.22, "learning_rate": 4.7858426910973435e-06, "logps/chosen": -12588.5732421875, "logps/rejected": -11012.65625, "loss": 0.0212, "neglected": 586.0, "rewards/accuracies": 0.44999998807907104, "rewards/chosen": -122.9972152709961, "rewards/margins": -15.338798522949219, "rewards/rejected": -107.6584243774414, "selected": 0.0, "step": 210 }, { "epoch": 0.23, "learning_rate": 4.747296766042161e-06, "logps/chosen": -9843.9521484375, "logps/rejected": -9481.474609375, "loss": 0.0355, "neglected": 746.0, "rewards/accuracies": 0.48750001192092896, "rewards/chosen": -95.80561065673828, "rewards/margins": -3.6035759449005127, "rewards/rejected": -92.20204162597656, "selected": 0.0, "step": 220 }, { "epoch": 0.24, "learning_rate": 4.705745280752586e-06, "logps/chosen": -9174.8818359375, "logps/rejected": -8592.8603515625, "loss": 0.0293, "neglected": 906.0, "rewards/accuracies": 0.46875, "rewards/chosen": -88.80033874511719, "rewards/margins": -5.414186000823975, "rewards/rejected": -83.38614654541016, "selected": 0.0, "step": 230 }, { "epoch": 0.25, "learning_rate": 4.661243806657256e-06, "logps/chosen": -10734.61328125, "logps/rejected": -10024.224609375, "loss": 0.0237, "neglected": 1066.0, "rewards/accuracies": 0.4437499940395355, "rewards/chosen": -104.76686096191406, "rewards/margins": -6.675856113433838, "rewards/rejected": -98.09098815917969, "selected": 0.0, "step": 240 }, { "epoch": 0.26, "learning_rate": 4.613851860533367e-06, "logps/chosen": -12696.328125, "logps/rejected": -12253.8046875, "loss": 0.0216, "neglected": 1226.0, "rewards/accuracies": 0.5062500238418579, "rewards/chosen": -124.18550872802734, "rewards/margins": -4.2357869148254395, "rewards/rejected": -119.94972229003906, "selected": 0.0, "step": 250 }, { "epoch": 0.27, "learning_rate": 4.563632824908252e-06, "logps/chosen": -16011.0830078125, "logps/rejected": -15822.650390625, "loss": 0.0224, "neglected": 1386.0, "rewards/accuracies": 0.4937500059604645, "rewards/chosen": -157.71505737304688, "rewards/margins": -1.8953163623809814, "rewards/rejected": -155.81973266601562, "selected": 0.0, "step": 260 }, { "epoch": 0.28, "learning_rate": 4.510653863290871e-06, "logps/chosen": -21082.57421875, "logps/rejected": -18679.11328125, "loss": 0.0214, "neglected": 1546.0, "rewards/accuracies": 0.48124998807907104, "rewards/chosen": -208.0191192626953, "rewards/margins": -23.887981414794922, "rewards/rejected": -184.1311492919922, "selected": 0.0, "step": 270 }, { "epoch": 0.29, "learning_rate": 4.454985830346574e-06, "logps/chosen": -24180.8984375, "logps/rejected": -22673.47265625, "loss": 0.0205, "neglected": 1706.0, "rewards/accuracies": 0.4312500059604645, "rewards/chosen": -238.95950317382812, "rewards/margins": -14.777926445007324, "rewards/rejected": -224.18154907226562, "selected": 0.0, "step": 280 }, { "epoch": 0.3, "learning_rate": 4.396703177135262e-06, "logps/chosen": -25520.37890625, "logps/rejected": -20765.65234375, "loss": 0.0181, "neglected": 1866.0, "rewards/accuracies": 0.375, "rewards/chosen": -252.4154815673828, "rewards/margins": -46.95801544189453, "rewards/rejected": -205.4574432373047, "selected": 0.0, "step": 290 }, { "epoch": 0.31, "learning_rate": 4.335883851539693e-06, "logps/chosen": -26487.619140625, "logps/rejected": -21971.16796875, "loss": 0.0208, "neglected": 2026.0, "rewards/accuracies": 0.4124999940395355, "rewards/chosen": -262.32843017578125, "rewards/margins": -44.91776657104492, "rewards/rejected": -217.41067504882812, "selected": 0.0, "step": 300 }, { "epoch": 0.31, "eval_logps/chosen": -28411.646484375, "eval_logps/rejected": -24774.666015625, "eval_loss": 0.019909363240003586, "eval_neglected": 256.0, "eval_rewards/accuracies": 0.4444444477558136, "eval_rewards/chosen": -281.386474609375, "eval_rewards/margins": -36.1713981628418, "eval_rewards/rejected": -245.21505737304688, "eval_runtime": 505.6466, "eval_samples_per_second": 3.955, "eval_selected": 0.0, "eval_steps_per_second": 0.125, "step": 300 }, { "epoch": 0.32, "learning_rate": 4.2726091940171055e-06, "logps/chosen": -23575.23046875, "logps/rejected": -23384.05078125, "loss": 0.0186, "neglected": 586.0, "rewards/accuracies": 0.5062500238418579, "rewards/chosen": -233.19082641601562, "rewards/margins": -2.1331238746643066, "rewards/rejected": -231.0576934814453, "selected": 0.0, "step": 310 }, { "epoch": 0.33, "learning_rate": 4.206963828813555e-06, "logps/chosen": -25525.6015625, "logps/rejected": -22454.76953125, "loss": 0.0147, "neglected": 746.0, "rewards/accuracies": 0.4375, "rewards/chosen": -252.456787109375, "rewards/margins": -30.628952026367188, "rewards/rejected": -221.8278350830078, "selected": 0.0, "step": 320 }, { "epoch": 0.35, "learning_rate": 4.139035550786495e-06, "logps/chosen": -23703.537109375, "logps/rejected": -19463.728515625, "loss": 0.022, "neglected": 906.0, "rewards/accuracies": 0.4749999940395355, "rewards/chosen": -234.44473266601562, "rewards/margins": -42.07261657714844, "rewards/rejected": -192.3721160888672, "selected": 0.0, "step": 330 }, { "epoch": 0.36, "learning_rate": 4.068915207986931e-06, "logps/chosen": -25306.37109375, "logps/rejected": -21047.587890625, "loss": 0.0215, "neglected": 1066.0, "rewards/accuracies": 0.39375001192092896, "rewards/chosen": -250.39120483398438, "rewards/margins": -42.19620132446289, "rewards/rejected": -208.1950225830078, "selected": 0.0, "step": 340 }, { "epoch": 0.37, "learning_rate": 3.996696580158211e-06, "logps/chosen": -30706.958984375, "logps/rejected": -28025.75390625, "loss": 0.0174, "neglected": 1226.0, "rewards/accuracies": 0.4124999940395355, "rewards/chosen": -304.2236328125, "rewards/margins": -26.65300941467285, "rewards/rejected": -277.57061767578125, "selected": 0.0, "step": 350 }, { "epoch": 0.38, "learning_rate": 3.922476253313921e-06, "logps/chosen": -30064.09375, "logps/rejected": -27983.521484375, "loss": 0.0134, "neglected": 1386.0, "rewards/accuracies": 0.4749999940395355, "rewards/chosen": -298.0165710449219, "rewards/margins": -20.747739791870117, "rewards/rejected": -277.268798828125, "selected": 0.0, "step": 360 }, { "epoch": 0.39, "learning_rate": 3.846353490562664e-06, "logps/chosen": -31143.537109375, "logps/rejected": -27110.509765625, "loss": 0.016, "neglected": 1546.0, "rewards/accuracies": 0.4625000059604645, "rewards/chosen": -308.7967529296875, "rewards/margins": -40.30244827270508, "rewards/rejected": -268.49432373046875, "selected": 0.0, "step": 370 }, { "epoch": 0.4, "learning_rate": 3.768430099352445e-06, "logps/chosen": -32500.40234375, "logps/rejected": -29200.681640625, "loss": 0.0166, "neglected": 1706.0, "rewards/accuracies": 0.4375, "rewards/chosen": -322.29559326171875, "rewards/margins": -32.805747985839844, "rewards/rejected": -289.4898376464844, "selected": 0.0, "step": 380 }, { "epoch": 0.41, "learning_rate": 3.6888102953122307e-06, "logps/chosen": -30651.95703125, "logps/rejected": -26855.337890625, "loss": 0.0167, "neglected": 1866.0, "rewards/accuracies": 0.4312500059604645, "rewards/chosen": -303.84075927734375, "rewards/margins": -37.63256072998047, "rewards/rejected": -266.208251953125, "selected": 0.0, "step": 390 }, { "epoch": 0.42, "learning_rate": 3.607600562872785e-06, "logps/chosen": -34714.859375, "logps/rejected": -29928.291015625, "loss": 0.0157, "neglected": 2026.0, "rewards/accuracies": 0.41874998807907104, "rewards/chosen": -344.1798400878906, "rewards/margins": -47.40506362915039, "rewards/rejected": -296.77471923828125, "selected": 0.0, "step": 400 }, { "epoch": 0.42, "eval_logps/chosen": -35648.6171875, "eval_logps/rejected": -30971.783203125, "eval_loss": 0.01606718823313713, "eval_neglected": 256.0, "eval_rewards/accuracies": 0.4563491940498352, "eval_rewards/chosen": -353.75616455078125, "eval_rewards/margins": -46.569923400878906, "eval_rewards/rejected": -307.1862487792969, "eval_runtime": 506.2643, "eval_samples_per_second": 3.951, "eval_selected": 0.0, "eval_steps_per_second": 0.124, "step": 400 }, { "epoch": 0.43, "learning_rate": 3.5249095128531863e-06, "logps/chosen": -35328.84375, "logps/rejected": -30211.0, "loss": 0.0123, "neglected": 586.0, "rewards/accuracies": 0.4437499940395355, "rewards/chosen": -350.4105529785156, "rewards/margins": -50.80678176879883, "rewards/rejected": -299.60382080078125, "selected": 0.0, "step": 410 }, { "epoch": 0.44, "learning_rate": 3.4408477372034743e-06, "logps/chosen": -28428.931640625, "logps/rejected": -24432.90234375, "loss": 0.0282, "neglected": 746.0, "rewards/accuracies": 0.4437499940395355, "rewards/chosen": -281.9709777832031, "rewards/margins": -40.07168960571289, "rewards/rejected": -241.89932250976562, "selected": 0.0, "step": 420 }, { "epoch": 0.45, "learning_rate": 3.355527661097728e-06, "logps/chosen": -32917.82421875, "logps/rejected": -31499.80078125, "loss": 0.0179, "neglected": 906.0, "rewards/accuracies": 0.4749999940395355, "rewards/chosen": -326.5057678222656, "rewards/margins": -14.0189790725708, "rewards/rejected": -312.4867858886719, "selected": 0.0, "step": 430 }, { "epoch": 0.46, "learning_rate": 3.269063392575352e-06, "logps/chosen": -32941.2421875, "logps/rejected": -28788.244140625, "loss": 0.018, "neglected": 1066.0, "rewards/accuracies": 0.45625001192092896, "rewards/chosen": -326.6683349609375, "rewards/margins": -41.23744583129883, "rewards/rejected": -285.4308776855469, "selected": 0.0, "step": 440 }, { "epoch": 0.47, "learning_rate": 3.181570569931697e-06, "logps/chosen": -28051.92578125, "logps/rejected": -26678.359375, "loss": 0.025, "neglected": 1226.0, "rewards/accuracies": 0.41874998807907104, "rewards/chosen": -278.347900390625, "rewards/margins": -13.726778984069824, "rewards/rejected": -264.62115478515625, "selected": 0.0, "step": 450 }, { "epoch": 0.48, "learning_rate": 3.09316620706208e-06, "logps/chosen": -38179.28515625, "logps/rejected": -31792.712890625, "loss": 0.0135, "neglected": 1386.0, "rewards/accuracies": 0.4124999940395355, "rewards/chosen": -378.8819885253906, "rewards/margins": -63.55814743041992, "rewards/rejected": -315.3238220214844, "selected": 0.0, "step": 460 }, { "epoch": 0.49, "learning_rate": 3.0039685369660785e-06, "logps/chosen": -34295.48046875, "logps/rejected": -30349.291015625, "loss": 0.0128, "neglected": 1546.0, "rewards/accuracies": 0.48124998807907104, "rewards/chosen": -340.40753173828125, "rewards/margins": -39.258506774902344, "rewards/rejected": -301.1490173339844, "selected": 0.0, "step": 470 }, { "epoch": 0.5, "learning_rate": 2.91409685362137e-06, "logps/chosen": -30752.1875, "logps/rejected": -30045.68359375, "loss": 0.0118, "neglected": 1706.0, "rewards/accuracies": 0.45625001192092896, "rewards/chosen": -305.1645202636719, "rewards/margins": -7.10043478012085, "rewards/rejected": -298.06414794921875, "selected": 0.0, "step": 480 }, { "epoch": 0.51, "learning_rate": 2.8236713524386085e-06, "logps/chosen": -32674.708984375, "logps/rejected": -27154.64453125, "loss": 0.0173, "neglected": 1866.0, "rewards/accuracies": 0.42500001192092896, "rewards/chosen": -324.16522216796875, "rewards/margins": -54.92888641357422, "rewards/rejected": -269.236328125, "selected": 0.0, "step": 490 }, { "epoch": 0.52, "learning_rate": 2.7328129695107205e-06, "logps/chosen": -33942.0390625, "logps/rejected": -26798.068359375, "loss": 0.0182, "neglected": 2026.0, "rewards/accuracies": 0.39375001192092896, "rewards/chosen": -336.67034912109375, "rewards/margins": -71.16191864013672, "rewards/rejected": -265.5083923339844, "selected": 0.0, "step": 500 }, { "epoch": 0.52, "eval_logps/chosen": -33432.5625, "eval_logps/rejected": -29219.611328125, "eval_loss": 0.014793259091675282, "eval_neglected": 256.0, "eval_rewards/accuracies": 0.4464285671710968, "eval_rewards/chosen": -331.59564208984375, "eval_rewards/margins": -41.931129455566406, "eval_rewards/rejected": -289.6645202636719, "eval_runtime": 505.8824, "eval_samples_per_second": 3.953, "eval_selected": 0.0, "eval_steps_per_second": 0.125, "step": 500 }, { "epoch": 0.53, "learning_rate": 2.641643219871597e-06, "logps/chosen": -31474.45703125, "logps/rejected": -25789.703125, "loss": 0.0149, "neglected": 586.0, "rewards/accuracies": 0.46875, "rewards/chosen": -312.119384765625, "rewards/margins": -56.47794723510742, "rewards/rejected": -255.6414337158203, "selected": 0.0, "step": 510 }, { "epoch": 0.54, "learning_rate": 2.5502840349805074e-06, "logps/chosen": -32054.20703125, "logps/rejected": -27586.98046875, "loss": 0.0173, "neglected": 746.0, "rewards/accuracies": 0.4437499940395355, "rewards/chosen": -317.7613830566406, "rewards/margins": -44.480018615722656, "rewards/rejected": -273.2813720703125, "selected": 0.0, "step": 520 }, { "epoch": 0.55, "learning_rate": 2.4588575996495797e-06, "logps/chosen": -34159.5703125, "logps/rejected": -30448.79296875, "loss": 0.0138, "neglected": 906.0, "rewards/accuracies": 0.40625, "rewards/chosen": -338.7040710449219, "rewards/margins": -37.0126953125, "rewards/rejected": -301.69134521484375, "selected": 0.0, "step": 530 }, { "epoch": 0.57, "learning_rate": 2.367486188632446e-06, "logps/chosen": -33792.76171875, "logps/rejected": -29700.40625, "loss": 0.0078, "neglected": 1066.0, "rewards/accuracies": 0.40625, "rewards/chosen": -335.1407775878906, "rewards/margins": -40.700645446777344, "rewards/rejected": -294.440185546875, "selected": 0.0, "step": 540 }, { "epoch": 0.58, "learning_rate": 2.276292003092593e-06, "logps/chosen": -33080.16796875, "logps/rejected": -29123.927734375, "loss": 0.0145, "neglected": 1226.0, "rewards/accuracies": 0.4312500059604645, "rewards/chosen": -328.01007080078125, "rewards/margins": -39.278717041015625, "rewards/rejected": -288.7312927246094, "selected": 0.0, "step": 550 }, { "epoch": 0.59, "learning_rate": 2.1853970071701415e-06, "logps/chosen": -29881.09375, "logps/rejected": -24806.671875, "loss": 0.0136, "neglected": 1386.0, "rewards/accuracies": 0.4437499940395355, "rewards/chosen": -296.17803955078125, "rewards/margins": -50.19990539550781, "rewards/rejected": -245.9781494140625, "selected": 0.0, "step": 560 }, { "epoch": 0.6, "learning_rate": 2.0949227648656194e-06, "logps/chosen": -34481.3203125, "logps/rejected": -30493.02734375, "loss": 0.0172, "neglected": 1546.0, "rewards/accuracies": 0.4312500059604645, "rewards/chosen": -342.2419738769531, "rewards/margins": -39.841697692871094, "rewards/rejected": -302.4002685546875, "selected": 0.0, "step": 570 }, { "epoch": 0.61, "learning_rate": 2.00499027745888e-06, "logps/chosen": -34967.98828125, "logps/rejected": -29546.712890625, "loss": 0.0141, "neglected": 1706.0, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -347.0220031738281, "rewards/margins": -54.10175323486328, "rewards/rejected": -292.9202575683594, "selected": 0.0, "step": 580 }, { "epoch": 0.62, "learning_rate": 1.915719821680624e-06, "logps/chosen": -32696.44921875, "logps/rejected": -30430.255859375, "loss": 0.0167, "neglected": 1866.0, "rewards/accuracies": 0.45625001192092896, "rewards/chosen": -324.4429626464844, "rewards/margins": -22.577922821044922, "rewards/rejected": -301.8650207519531, "selected": 0.0, "step": 590 }, { "epoch": 0.63, "learning_rate": 1.8272307888529276e-06, "logps/chosen": -36048.4453125, "logps/rejected": -33494.70703125, "loss": 0.013, "neglected": 2026.0, "rewards/accuracies": 0.4749999940395355, "rewards/chosen": -357.53070068359375, "rewards/margins": -25.526748657226562, "rewards/rejected": -332.0039978027344, "selected": 0.0, "step": 600 }, { "epoch": 0.63, "eval_logps/chosen": -35941.4140625, "eval_logps/rejected": -31495.03125, "eval_loss": 0.014261237345635891, "eval_neglected": 256.0, "eval_rewards/accuracies": 0.454365074634552, "eval_rewards/chosen": -356.68414306640625, "eval_rewards/margins": -44.265377044677734, "eval_rewards/rejected": -312.41876220703125, "eval_runtime": 505.9708, "eval_samples_per_second": 3.953, "eval_selected": 0.0, "eval_steps_per_second": 0.125, "step": 600 }, { "epoch": 0.64, "learning_rate": 1.739641525213929e-06, "logps/chosen": -36224.6953125, "logps/rejected": -28970.59765625, "loss": 0.0178, "neglected": 586.0, "rewards/accuracies": 0.4124999940395355, "rewards/chosen": -359.3634948730469, "rewards/margins": -71.96965789794922, "rewards/rejected": -287.39385986328125, "selected": 0.0, "step": 610 }, { "epoch": 0.65, "learning_rate": 1.6530691736402317e-06, "logps/chosen": -33294.37109375, "logps/rejected": -29921.0, "loss": 0.0107, "neglected": 746.0, "rewards/accuracies": 0.45625001192092896, "rewards/chosen": -330.2255859375, "rewards/margins": -33.42811965942383, "rewards/rejected": -296.7974548339844, "selected": 0.0, "step": 620 }, { "epoch": 0.66, "learning_rate": 1.5676295169786864e-06, "logps/chosen": -33129.5703125, "logps/rejected": -29490.744140625, "loss": 0.0122, "neglected": 906.0, "rewards/accuracies": 0.4437499940395355, "rewards/chosen": -328.56658935546875, "rewards/margins": -36.11338806152344, "rewards/rejected": -292.45318603515625, "selected": 0.0, "step": 630 }, { "epoch": 0.67, "learning_rate": 1.4834368231970922e-06, "logps/chosen": -31400.677734375, "logps/rejected": -29454.287109375, "loss": 0.0158, "neglected": 1066.0, "rewards/accuracies": 0.46875, "rewards/chosen": -311.5920104980469, "rewards/margins": -19.44215202331543, "rewards/rejected": -292.14984130859375, "selected": 0.0, "step": 640 }, { "epoch": 0.68, "learning_rate": 1.4006036925609245e-06, "logps/chosen": -37346.015625, "logps/rejected": -32676.51171875, "loss": 0.0115, "neglected": 1226.0, "rewards/accuracies": 0.4124999940395355, "rewards/chosen": -370.58929443359375, "rewards/margins": -46.472129821777344, "rewards/rejected": -324.11724853515625, "selected": 0.0, "step": 650 }, { "epoch": 0.69, "learning_rate": 1.3192409070404582e-06, "logps/chosen": -35501.5234375, "logps/rejected": -31327.859375, "loss": 0.0195, "neglected": 1386.0, "rewards/accuracies": 0.4312500059604645, "rewards/chosen": -352.1575622558594, "rewards/margins": -41.29918670654297, "rewards/rejected": -310.8583984375, "selected": 0.0, "step": 660 }, { "epoch": 0.7, "learning_rate": 1.2394572821496953e-06, "logps/chosen": -34126.65234375, "logps/rejected": -28228.837890625, "loss": 0.0213, "neglected": 1546.0, "rewards/accuracies": 0.4375, "rewards/chosen": -338.5702209472656, "rewards/margins": -58.62493896484375, "rewards/rejected": -279.9453125, "selected": 0.0, "step": 670 }, { "epoch": 0.71, "learning_rate": 1.1613595214152713e-06, "logps/chosen": -33340.77734375, "logps/rejected": -30065.52734375, "loss": 0.0155, "neglected": 1706.0, "rewards/accuracies": 0.45625001192092896, "rewards/chosen": -330.6611633300781, "rewards/margins": -32.5056037902832, "rewards/rejected": -298.15557861328125, "selected": 0.0, "step": 680 }, { "epoch": 0.72, "learning_rate": 1.0850520736699362e-06, "logps/chosen": -30460.240234375, "logps/rejected": -24525.88671875, "loss": 0.0166, "neglected": 1866.0, "rewards/accuracies": 0.38749998807907104, "rewards/chosen": -302.1998291015625, "rewards/margins": -58.95698165893555, "rewards/rejected": -243.2428436279297, "selected": 0.0, "step": 690 }, { "epoch": 0.73, "learning_rate": 1.0106369933615043e-06, "logps/chosen": -34674.8515625, "logps/rejected": -27218.890625, "loss": 0.0165, "neglected": 2026.0, "rewards/accuracies": 0.40625, "rewards/chosen": -344.1193542480469, "rewards/margins": -74.25706481933594, "rewards/rejected": -269.8622741699219, "selected": 0.0, "step": 700 }, { "epoch": 0.73, "eval_logps/chosen": -35642.40234375, "eval_logps/rejected": -31306.609375, "eval_loss": 0.014253102242946625, "eval_neglected": 256.0, "eval_rewards/accuracies": 0.4503968358039856, "eval_rewards/chosen": -353.6939697265625, "eval_rewards/margins": -43.15950012207031, "eval_rewards/rejected": -310.53448486328125, "eval_runtime": 505.4591, "eval_samples_per_second": 3.957, "eval_selected": 0.0, "eval_steps_per_second": 0.125, "step": 700 }, { "epoch": 0.74, "learning_rate": 9.382138040640714e-07, "logps/chosen": -35340.9296875, "logps/rejected": -29694.728515625, "loss": 0.0173, "neglected": 586.0, "rewards/accuracies": 0.41874998807907104, "rewards/chosen": -350.610595703125, "rewards/margins": -55.991127014160156, "rewards/rejected": -294.61944580078125, "selected": 0.0, "step": 710 }, { "epoch": 0.75, "learning_rate": 8.678793653740633e-07, "logps/chosen": -33239.89453125, "logps/rejected": -29252.65234375, "loss": 0.0194, "neglected": 746.0, "rewards/accuracies": 0.41874998807907104, "rewards/chosen": -329.77923583984375, "rewards/margins": -39.71256637573242, "rewards/rejected": -290.0666809082031, "selected": 0.0, "step": 720 }, { "epoch": 0.76, "learning_rate": 7.997277433690984e-07, "logps/chosen": -33704.45703125, "logps/rejected": -27931.458984375, "loss": 0.0181, "neglected": 906.0, "rewards/accuracies": 0.36250001192092896, "rewards/chosen": -334.1654052734375, "rewards/margins": -57.19194793701172, "rewards/rejected": -276.97344970703125, "selected": 0.0, "step": 730 }, { "epoch": 0.77, "learning_rate": 7.338500848029603e-07, "logps/chosen": -38638.05078125, "logps/rejected": -33758.7890625, "loss": 0.0091, "neglected": 1066.0, "rewards/accuracies": 0.4625000059604645, "rewards/chosen": -383.42559814453125, "rewards/margins": -48.62553024291992, "rewards/rejected": -334.800048828125, "selected": 0.0, "step": 740 }, { "epoch": 0.79, "learning_rate": 6.70334495204884e-07, "logps/chosen": -35638.7421875, "logps/rejected": -31301.58984375, "loss": 0.0156, "neglected": 1226.0, "rewards/accuracies": 0.39375001192092896, "rewards/chosen": -353.8833312988281, "rewards/margins": -43.371212005615234, "rewards/rejected": -310.51214599609375, "selected": 0.0, "step": 750 }, { "epoch": 0.8, "learning_rate": 6.092659210462232e-07, "logps/chosen": -38116.8828125, "logps/rejected": -33497.234375, "loss": 0.012, "neglected": 1386.0, "rewards/accuracies": 0.4437499940395355, "rewards/chosen": -378.3582763671875, "rewards/margins": -45.90812301635742, "rewards/rejected": -332.45013427734375, "selected": 0.0, "step": 760 }, { "epoch": 0.81, "learning_rate": 5.507260361320738e-07, "logps/chosen": -36556.7890625, "logps/rejected": -33627.23828125, "loss": 0.0106, "neglected": 1546.0, "rewards/accuracies": 0.4312500059604645, "rewards/chosen": -362.6523742675781, "rewards/margins": -29.13754653930664, "rewards/rejected": -333.51483154296875, "selected": 0.0, "step": 770 }, { "epoch": 0.82, "learning_rate": 4.947931323697983e-07, "logps/chosen": -33210.11328125, "logps/rejected": -29063.90234375, "loss": 0.013, "neglected": 1706.0, "rewards/accuracies": 0.41874998807907104, "rewards/chosen": -329.4010925292969, "rewards/margins": -41.132415771484375, "rewards/rejected": -288.2686462402344, "selected": 0.0, "step": 780 }, { "epoch": 0.83, "learning_rate": 4.4154201506053985e-07, "logps/chosen": -35076.3515625, "logps/rejected": -33320.109375, "loss": 0.0146, "neglected": 1866.0, "rewards/accuracies": 0.45625001192092896, "rewards/chosen": -348.1772155761719, "rewards/margins": -17.483577728271484, "rewards/rejected": -330.693603515625, "selected": 0.0, "step": 790 }, { "epoch": 0.84, "learning_rate": 3.910439028537638e-07, "logps/chosen": -37705.09375, "logps/rejected": -33144.11328125, "loss": 0.0145, "neglected": 2026.0, "rewards/accuracies": 0.42500001192092896, "rewards/chosen": -374.12744140625, "rewards/margins": -45.31665802001953, "rewards/rejected": -328.810791015625, "selected": 0.0, "step": 800 }, { "epoch": 0.84, "eval_logps/chosen": -37680.9765625, "eval_logps/rejected": -33080.87890625, "eval_loss": 0.013520442880690098, "eval_neglected": 256.0, "eval_rewards/accuracies": 0.454365074634552, "eval_rewards/chosen": -374.0797424316406, "eval_rewards/margins": -45.80255889892578, "eval_rewards/rejected": -328.27716064453125, "eval_runtime": 506.1329, "eval_samples_per_second": 3.952, "eval_selected": 0.0, "eval_steps_per_second": 0.124, "step": 800 }, { "epoch": 0.85, "learning_rate": 3.4336633249862084e-07, "logps/chosen": -34459.21484375, "logps/rejected": -27397.837890625, "loss": 0.0064, "neglected": 586.0, "rewards/accuracies": 0.375, "rewards/chosen": -341.8682556152344, "rewards/margins": -70.08294677734375, "rewards/rejected": -271.78533935546875, "selected": 0.0, "step": 810 }, { "epoch": 0.86, "learning_rate": 2.98573068519539e-07, "logps/chosen": -37666.25390625, "logps/rejected": -29870.04296875, "loss": 0.0225, "neglected": 746.0, "rewards/accuracies": 0.4124999940395355, "rewards/chosen": -373.8772888183594, "rewards/margins": -77.50922393798828, "rewards/rejected": -296.3680725097656, "selected": 0.0, "step": 820 }, { "epoch": 0.87, "learning_rate": 2.5672401793681854e-07, "logps/chosen": -35177.1796875, "logps/rejected": -35886.7734375, "loss": 0.0127, "neglected": 906.0, "rewards/accuracies": 0.550000011920929, "rewards/chosen": -349.24237060546875, "rewards/margins": 6.9106035232543945, "rewards/rejected": -356.1529235839844, "selected": 0.0, "step": 830 }, { "epoch": 0.88, "learning_rate": 2.178751501463036e-07, "logps/chosen": -35737.97265625, "logps/rejected": -33345.86328125, "loss": 0.0107, "neglected": 1066.0, "rewards/accuracies": 0.45625001192092896, "rewards/chosen": -354.8319396972656, "rewards/margins": -23.900835037231445, "rewards/rejected": -330.93109130859375, "selected": 0.0, "step": 840 }, { "epoch": 0.89, "learning_rate": 1.820784220652766e-07, "logps/chosen": -36996.6171875, "logps/rejected": -31543.29296875, "loss": 0.0169, "neglected": 1226.0, "rewards/accuracies": 0.40625, "rewards/chosen": -367.1126708984375, "rewards/margins": -54.04352951049805, "rewards/rejected": -313.0691833496094, "selected": 0.0, "step": 850 }, { "epoch": 0.9, "learning_rate": 1.4938170864468636e-07, "logps/chosen": -35994.3359375, "logps/rejected": -32652.32421875, "loss": 0.0158, "neglected": 1386.0, "rewards/accuracies": 0.4312500059604645, "rewards/chosen": -357.20001220703125, "rewards/margins": -33.3302001953125, "rewards/rejected": -323.8697814941406, "selected": 0.0, "step": 860 }, { "epoch": 0.91, "learning_rate": 1.1982873884064466e-07, "logps/chosen": -31830.9375, "logps/rejected": -30581.083984375, "loss": 0.0158, "neglected": 1546.0, "rewards/accuracies": 0.543749988079071, "rewards/chosen": -315.9645690917969, "rewards/margins": -12.488165855407715, "rewards/rejected": -303.4764404296875, "selected": 0.0, "step": 870 }, { "epoch": 0.92, "learning_rate": 9.345903713082305e-08, "logps/chosen": -37537.7265625, "logps/rejected": -32952.95703125, "loss": 0.0108, "neglected": 1706.0, "rewards/accuracies": 0.48124998807907104, "rewards/chosen": -372.6186218261719, "rewards/margins": -45.61473846435547, "rewards/rejected": -327.00390625, "selected": 0.0, "step": 880 }, { "epoch": 0.93, "learning_rate": 7.030787065396866e-08, "logps/chosen": -39424.8828125, "logps/rejected": -34691.7265625, "loss": 0.0135, "neglected": 1866.0, "rewards/accuracies": 0.4312500059604645, "rewards/chosen": -391.5462341308594, "rewards/margins": -47.24760437011719, "rewards/rejected": -344.29864501953125, "selected": 0.0, "step": 890 }, { "epoch": 0.94, "learning_rate": 5.0406202043228604e-08, "logps/chosen": -36448.4765625, "logps/rejected": -30919.54296875, "loss": 0.0195, "neglected": 2026.0, "rewards/accuracies": 0.44999998807907104, "rewards/chosen": -361.8363037109375, "rewards/margins": -55.14497756958008, "rewards/rejected": -306.69134521484375, "selected": 0.0, "step": 900 }, { "epoch": 0.94, "eval_logps/chosen": -37924.83984375, "eval_logps/rejected": -33293.47265625, "eval_loss": 0.013714035972952843, "eval_neglected": 256.0, "eval_rewards/accuracies": 0.454365074634552, "eval_rewards/chosen": -376.51837158203125, "eval_rewards/margins": -46.11524200439453, "eval_rewards/rejected": -330.4031677246094, "eval_runtime": 967.1825, "eval_samples_per_second": 2.068, "eval_selected": 0.0, "eval_steps_per_second": 0.065, "step": 900 }, { "epoch": 0.95, "learning_rate": 3.378064801637687e-08, "logps/chosen": -37312.3515625, "logps/rejected": -31717.978515625, "loss": 0.0123, "neglected": 586.0, "rewards/accuracies": 0.4375, "rewards/chosen": -370.2989196777344, "rewards/margins": -55.649620056152344, "rewards/rejected": -314.6493225097656, "selected": 0.0, "step": 910 }, { "epoch": 0.96, "learning_rate": 2.0453443778310766e-08, "logps/chosen": -38356.51953125, "logps/rejected": -32435.177734375, "loss": 0.0172, "neglected": 746.0, "rewards/accuracies": 0.36250001192092896, "rewards/chosen": -380.7662048339844, "rewards/margins": -58.904090881347656, "rewards/rejected": -321.86212158203125, "selected": 0.0, "step": 920 }, { "epoch": 0.97, "learning_rate": 1.0442413283435759e-08, "logps/chosen": -37030.9140625, "logps/rejected": -29036.62109375, "loss": 0.0174, "neglected": 906.0, "rewards/accuracies": 0.45625001192092896, "rewards/chosen": -367.7157897949219, "rewards/margins": -79.69428253173828, "rewards/rejected": -288.02154541015625, "selected": 0.0, "step": 930 }, { "epoch": 0.98, "learning_rate": 3.760945397705828e-09, "logps/chosen": -39078.89453125, "logps/rejected": -32606.041015625, "loss": 0.0123, "neglected": 1066.0, "rewards/accuracies": 0.4312500059604645, "rewards/chosen": -388.01318359375, "rewards/margins": -64.55225372314453, "rewards/rejected": -323.4609375, "selected": 0.0, "step": 940 }, { "epoch": 0.99, "learning_rate": 4.1797599220405605e-10, "logps/chosen": -34646.1640625, "logps/rejected": -31544.458984375, "loss": 0.0168, "neglected": 1226.0, "rewards/accuracies": 0.48750001192092896, "rewards/chosen": -343.90411376953125, "rewards/margins": -30.896869659423828, "rewards/rejected": -313.00726318359375, "selected": 0.0, "step": 950 }, { "epoch": 1.0, "step": 955, "total_flos": 0.0, "train_loss": 0.11865393293933718, "train_runtime": 42177.7253, "train_samples_per_second": 1.449, "train_steps_per_second": 0.023 } ], "logging_steps": 10, "max_steps": 955, "num_train_epochs": 1, "save_steps": 100, "total_flos": 0.0, "trial_name": null, "trial_params": null }