{ "best_metric": null, "best_model_checkpoint": null, "epoch": 8.0, "eval_steps": 100, "global_step": 2208, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "debug/policy_chosen_logits": 0.8079685568809509, "debug/policy_chosen_logps": -5.034485816955566, "debug/policy_rejected_logits": 0.6268295645713806, "debug/policy_rejected_logps": -2.0584616661071777, "debug/reference_chosen_logps": -5.034485816955566, "debug/reference_rejected_logps": -2.0584616661071777, "debug/sppo_chosen_loss": 2500.0, "debug/sppo_chosen_reward_in_loss": 0.0, "debug/sppo_rej_reward_in_loss": 0.0, "debug/sppo_reject_loss": 2500.0, "epoch": 0.0036231884057971015, "grad_norm": 47446.78972903909, "learning_rate": 1e-09, "logits/chosen": 0.8079685568809509, "logits/rejected": 0.6268295645713806, "logps/chosen": -5.034485816955566, "logps/rejected": -2.0584616661071777, "loss": 5000.0, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 1 }, { "debug/policy_chosen_logits": 1.1306993961334229, "debug/policy_chosen_logps": -5.146453857421875, "debug/policy_rejected_logits": 1.368752360343933, "debug/policy_rejected_logps": -47.807464599609375, "debug/reference_chosen_logps": -5.141074180603027, "debug/reference_rejected_logps": -47.71269607543945, "debug/sppo_chosen_loss": 2500.540283203125, "debug/sppo_chosen_reward_in_loss": -0.005379434674978256, "debug/sppo_rej_reward_in_loss": -0.09476511180400848, "debug/sppo_reject_loss": 2490.82568359375, "epoch": 0.018115942028985508, "grad_norm": 86988.1218000221, "learning_rate": 5e-09, "logits/chosen": 1.1306993961334229, "logits/rejected": 1.368752360343933, "logps/chosen": -5.146453857421875, "logps/rejected": -47.807464599609375, "loss": 5005.1191, "rewards/accuracies": 0.15625, "rewards/chosen": -5.379434514907189e-05, "rewards/margins": 0.0008938567480072379, "rewards/rejected": -0.0009476511622779071, "step": 5 }, { "debug/policy_chosen_logits": 1.3424609899520874, "debug/policy_chosen_logps": -37.58217239379883, "debug/policy_rejected_logits": 1.6095311641693115, "debug/policy_rejected_logps": -8.872511863708496, "debug/reference_chosen_logps": -37.657630920410156, "debug/reference_rejected_logps": -8.866057395935059, "debug/sppo_chosen_loss": 2492.56103515625, "debug/sppo_chosen_reward_in_loss": 0.07545705139636993, "debug/sppo_rej_reward_in_loss": -0.006454542279243469, "debug/sppo_reject_loss": 2499.407470703125, "epoch": 0.036231884057971016, "grad_norm": 26571.836221466936, "learning_rate": 1e-08, "logits/chosen": 1.3424609899520874, "logits/rejected": 1.6095311641693115, "logps/chosen": -37.58217239379883, "logps/rejected": -8.872511863708496, "loss": 5003.4242, "rewards/accuracies": 0.32499998807907104, "rewards/chosen": 0.0007545704720541835, "rewards/margins": 0.000819115899503231, "rewards/rejected": -6.454542744904757e-05, "step": 10 }, { "debug/policy_chosen_logits": 1.2810733318328857, "debug/policy_chosen_logps": -7.365481376647949, "debug/policy_rejected_logits": 1.6369354724884033, "debug/policy_rejected_logps": -5.41949462890625, "debug/reference_chosen_logps": -7.438336372375488, "debug/reference_rejected_logps": -5.3998122215271, "debug/sppo_chosen_loss": 2492.84228515625, "debug/sppo_chosen_reward_in_loss": 0.07285457104444504, "debug/sppo_rej_reward_in_loss": -0.019682347774505615, "debug/sppo_reject_loss": 2498.06396484375, "epoch": 0.05434782608695652, "grad_norm": 46548.55053972237, "learning_rate": 1.5e-08, "logits/chosen": 1.2810733318328857, "logits/rejected": 1.6369354724884033, "logps/chosen": -7.365481376647949, "logps/rejected": -5.41949462890625, "loss": 4994.0027, "rewards/accuracies": 0.30000001192092896, "rewards/chosen": 0.0007285457104444504, "rewards/margins": 0.0009253692114725709, "rewards/rejected": -0.00019682347192429006, "step": 15 }, { "debug/policy_chosen_logits": 1.2527421712875366, "debug/policy_chosen_logps": -4.222568035125732, "debug/policy_rejected_logits": 1.5992556810379028, "debug/policy_rejected_logps": -88.94862365722656, "debug/reference_chosen_logps": -4.198297023773193, "debug/reference_rejected_logps": -88.761962890625, "debug/sppo_chosen_loss": 2502.44970703125, "debug/sppo_chosen_reward_in_loss": -0.024270813912153244, "debug/sppo_rej_reward_in_loss": -0.1866498440504074, "debug/sppo_reject_loss": 2482.566162109375, "epoch": 0.07246376811594203, "grad_norm": 20714.973298759876, "learning_rate": 2e-08, "logits/chosen": 1.2527421712875366, "logits/rejected": 1.5992556810379028, "logps/chosen": -4.222568035125732, "logps/rejected": -88.94862365722656, "loss": 4995.7758, "rewards/accuracies": 0.25, "rewards/chosen": -0.00024270816356875002, "rewards/margins": 0.0016237900126725435, "rewards/rejected": -0.0018664983799681067, "step": 20 }, { "debug/policy_chosen_logits": 1.3294768333435059, "debug/policy_chosen_logps": -208.87466430664062, "debug/policy_rejected_logits": 1.7009861469268799, "debug/policy_rejected_logps": -267.98822021484375, "debug/reference_chosen_logps": -208.846923828125, "debug/reference_rejected_logps": -267.5577697753906, "debug/sppo_chosen_loss": 2504.19287109375, "debug/sppo_chosen_reward_in_loss": -0.027720510959625244, "debug/sppo_rej_reward_in_loss": -0.43049392104148865, "debug/sppo_reject_loss": 2464.42333984375, "epoch": 0.09057971014492754, "grad_norm": 19541.66662305683, "learning_rate": 2.5e-08, "logits/chosen": 1.3294768333435059, "logits/rejected": 1.7009861469268799, "logps/chosen": -208.87466430664062, "logps/rejected": -267.98822021484375, "loss": 4989.9965, "rewards/accuracies": 0.2750000059604645, "rewards/chosen": -0.0002772051375359297, "rewards/margins": 0.0040277340449392796, "rewards/rejected": -0.004304938949644566, "step": 25 }, { "debug/policy_chosen_logits": 1.496190071105957, "debug/policy_chosen_logps": -61.30437469482422, "debug/policy_rejected_logits": 1.8422586917877197, "debug/policy_rejected_logps": -5.341076850891113, "debug/reference_chosen_logps": -61.356834411621094, "debug/reference_rejected_logps": -5.380406379699707, "debug/sppo_chosen_loss": 2494.81396484375, "debug/sppo_chosen_reward_in_loss": 0.05245880037546158, "debug/sppo_rej_reward_in_loss": 0.039329804480075836, "debug/sppo_reject_loss": 2503.96142578125, "epoch": 0.10869565217391304, "grad_norm": 34229.97647594211, "learning_rate": 3e-08, "logits/chosen": 1.496190071105957, "logits/rejected": 1.8422586917877197, "logps/chosen": -61.30437469482422, "logps/rejected": -5.341076850891113, "loss": 4996.6758, "rewards/accuracies": 0.375, "rewards/chosen": 0.0005245879874564707, "rewards/margins": 0.00013128995487932116, "rewards/rejected": 0.0003932980471290648, "step": 30 }, { "debug/policy_chosen_logits": 1.5899847745895386, "debug/policy_chosen_logps": -67.06717681884766, "debug/policy_rejected_logits": 1.9133622646331787, "debug/policy_rejected_logps": -7.41262149810791, "debug/reference_chosen_logps": -67.15159606933594, "debug/reference_rejected_logps": -7.461578369140625, "debug/sppo_chosen_loss": 2491.75341796875, "debug/sppo_chosen_reward_in_loss": 0.08442829549312592, "debug/sppo_rej_reward_in_loss": 0.04895613715052605, "debug/sppo_reject_loss": 2504.94482421875, "epoch": 0.12681159420289856, "grad_norm": 76077.34148629114, "learning_rate": 3.4999999999999996e-08, "logits/chosen": 1.5899847745895386, "logits/rejected": 1.9133622646331787, "logps/chosen": -67.06717681884766, "logps/rejected": -7.41262149810791, "loss": 4993.0664, "rewards/accuracies": 0.2750000059604645, "rewards/chosen": 0.0008442830294370651, "rewards/margins": 0.00035472161835059524, "rewards/rejected": 0.0004895614110864699, "step": 35 }, { "debug/policy_chosen_logits": 1.4952932596206665, "debug/policy_chosen_logps": -90.9863510131836, "debug/policy_rejected_logits": 1.8197733163833618, "debug/policy_rejected_logps": -81.26140594482422, "debug/reference_chosen_logps": -90.82229614257812, "debug/reference_rejected_logps": -81.13166809082031, "debug/sppo_chosen_loss": 2516.98828125, "debug/sppo_chosen_reward_in_loss": -0.16405606269836426, "debug/sppo_rej_reward_in_loss": -0.12973186373710632, "debug/sppo_reject_loss": 2488.0302734375, "epoch": 0.14492753623188406, "grad_norm": 118160.8838705892, "learning_rate": 4e-08, "logits/chosen": 1.4952932596206665, "logits/rejected": 1.8197733163833618, "logps/chosen": -90.9863510131836, "logps/rejected": -81.26140594482422, "loss": 5003.6594, "rewards/accuracies": 0.20000000298023224, "rewards/chosen": -0.00164056068751961, "rewards/margins": -0.0003432419034652412, "rewards/rejected": -0.001297318609431386, "step": 40 }, { "debug/policy_chosen_logits": 1.4438402652740479, "debug/policy_chosen_logps": -43.4237060546875, "debug/policy_rejected_logits": 1.6279480457305908, "debug/policy_rejected_logps": -7.545201301574707, "debug/reference_chosen_logps": -43.420326232910156, "debug/reference_rejected_logps": -7.612727165222168, "debug/sppo_chosen_loss": 2500.438232421875, "debug/sppo_chosen_reward_in_loss": -0.0033864795695990324, "debug/sppo_rej_reward_in_loss": 0.06752587854862213, "debug/sppo_reject_loss": 2506.79541015625, "epoch": 0.16304347826086957, "grad_norm": 124385.93511806424, "learning_rate": 4.5e-08, "logits/chosen": 1.4438402652740479, "logits/rejected": 1.6279480457305908, "logps/chosen": -43.4237060546875, "logps/rejected": -7.545201301574707, "loss": 5001.2043, "rewards/accuracies": 0.22499999403953552, "rewards/chosen": -3.3864831493701786e-05, "rewards/margins": -0.0007091236184351146, "rewards/rejected": 0.0006752588087692857, "step": 45 }, { "debug/policy_chosen_logits": 1.3551827669143677, "debug/policy_chosen_logps": -88.38333129882812, "debug/policy_rejected_logits": 1.7912641763687134, "debug/policy_rejected_logps": -3.3702595233917236, "debug/reference_chosen_logps": -88.64862823486328, "debug/reference_rejected_logps": -3.3678181171417236, "debug/sppo_chosen_loss": 2475.676513671875, "debug/sppo_chosen_reward_in_loss": 0.2652997672557831, "debug/sppo_rej_reward_in_loss": -0.0024413815699517727, "debug/sppo_reject_loss": 2499.763916015625, "epoch": 0.18115942028985507, "grad_norm": 53378.910480323204, "learning_rate": 5e-08, "logits/chosen": 1.3551827669143677, "logits/rejected": 1.7912641763687134, "logps/chosen": -88.38333129882812, "logps/rejected": -3.3702595233917236, "loss": 4991.7719, "rewards/accuracies": 0.30000001192092896, "rewards/chosen": 0.0026529976166784763, "rewards/margins": 0.00267741153948009, "rewards/rejected": -2.441380638629198e-05, "step": 50 }, { "debug/policy_chosen_logits": 1.3388534784317017, "debug/policy_chosen_logps": -11.796781539916992, "debug/policy_rejected_logits": 1.8775174617767334, "debug/policy_rejected_logps": -3.491015672683716, "debug/reference_chosen_logps": -11.87784194946289, "debug/reference_rejected_logps": -3.4915835857391357, "debug/sppo_chosen_loss": 2491.936767578125, "debug/sppo_chosen_reward_in_loss": 0.08106164634227753, "debug/sppo_rej_reward_in_loss": 0.000567974173463881, "debug/sppo_reject_loss": 2500.0693359375, "epoch": 0.19927536231884058, "grad_norm": 118072.71131945525, "learning_rate": 5.5e-08, "logits/chosen": 1.3388534784317017, "logits/rejected": 1.8775174617767334, "logps/chosen": -11.796781539916992, "logps/rejected": -3.491015672683716, "loss": 4995.9875, "rewards/accuracies": 0.44999998807907104, "rewards/chosen": 0.0008106164750643075, "rewards/margins": 0.000804936804343015, "rewards/rejected": 5.6797434808686376e-06, "step": 55 }, { "debug/policy_chosen_logits": 1.5325512886047363, "debug/policy_chosen_logps": -4.535607814788818, "debug/policy_rejected_logits": 1.8329050540924072, "debug/policy_rejected_logps": -7.996115684509277, "debug/reference_chosen_logps": -4.53155517578125, "debug/reference_rejected_logps": -8.040096282958984, "debug/sppo_chosen_loss": 2500.421875, "debug/sppo_chosen_reward_in_loss": -0.0040526925586164, "debug/sppo_rej_reward_in_loss": 0.043981026858091354, "debug/sppo_reject_loss": 2504.436279296875, "epoch": 0.21739130434782608, "grad_norm": 87318.06319857825, "learning_rate": 6e-08, "logits/chosen": 1.5325512886047363, "logits/rejected": 1.8329050540924072, "logps/chosen": -4.535607814788818, "logps/rejected": -7.996115684509277, "loss": 5011.9992, "rewards/accuracies": 0.30000001192092896, "rewards/chosen": -4.0526916563976556e-05, "rewards/margins": -0.00048033715575002134, "rewards/rejected": 0.00043981027556583285, "step": 60 }, { "debug/policy_chosen_logits": 1.4887205362319946, "debug/policy_chosen_logps": -133.07762145996094, "debug/policy_rejected_logits": 1.864768624305725, "debug/policy_rejected_logps": -12.535041809082031, "debug/reference_chosen_logps": -133.0397491455078, "debug/reference_rejected_logps": -12.575475692749023, "debug/sppo_chosen_loss": 2503.914794921875, "debug/sppo_chosen_reward_in_loss": -0.037886202335357666, "debug/sppo_rej_reward_in_loss": 0.04043303057551384, "debug/sppo_reject_loss": 2504.06494140625, "epoch": 0.23550724637681159, "grad_norm": 19201.222948479794, "learning_rate": 6.5e-08, "logits/chosen": 1.4887205362319946, "logits/rejected": 1.864768624305725, "logps/chosen": -133.07762145996094, "logps/rejected": -12.535041809082031, "loss": 5001.0832, "rewards/accuracies": 0.20000000298023224, "rewards/chosen": -0.00037886202335357666, "rewards/margins": -0.0007831922848708928, "rewards/rejected": 0.0004043303488288075, "step": 65 }, { "debug/policy_chosen_logits": 1.653656005859375, "debug/policy_chosen_logps": -7.038564205169678, "debug/policy_rejected_logits": 1.6395747661590576, "debug/policy_rejected_logps": -95.44420623779297, "debug/reference_chosen_logps": -7.0487470626831055, "debug/reference_rejected_logps": -95.66188049316406, "debug/sppo_chosen_loss": 2499.0263671875, "debug/sppo_chosen_reward_in_loss": 0.01018262468278408, "debug/sppo_rej_reward_in_loss": 0.2176676243543625, "debug/sppo_reject_loss": 2524.071044921875, "epoch": 0.2536231884057971, "grad_norm": 76320.37177734253, "learning_rate": 6.999999999999999e-08, "logits/chosen": 1.653656005859375, "logits/rejected": 1.6395747661590576, "logps/chosen": -7.038564205169678, "logps/rejected": -95.44420623779297, "loss": 5002.3164, "rewards/accuracies": 0.20000000298023224, "rewards/chosen": 0.00010182622645515949, "rewards/margins": -0.0020748500246554613, "rewards/rejected": 0.0021766764111816883, "step": 70 }, { "debug/policy_chosen_logits": 1.4142253398895264, "debug/policy_chosen_logps": -122.62004089355469, "debug/policy_rejected_logits": 1.8007938861846924, "debug/policy_rejected_logps": -41.24224853515625, "debug/reference_chosen_logps": -122.72642517089844, "debug/reference_rejected_logps": -41.365840911865234, "debug/sppo_chosen_loss": 2489.86474609375, "debug/sppo_chosen_reward_in_loss": 0.10638396441936493, "debug/sppo_rej_reward_in_loss": 0.12358677387237549, "debug/sppo_reject_loss": 2512.81982421875, "epoch": 0.2717391304347826, "grad_norm": 246375.2452522303, "learning_rate": 7.5e-08, "logits/chosen": 1.4142253398895264, "logits/rejected": 1.8007938861846924, "logps/chosen": -122.62004089355469, "logps/rejected": -41.24224853515625, "loss": 5015.5594, "rewards/accuracies": 0.25, "rewards/chosen": 0.0010638395324349403, "rewards/margins": -0.00017202818708028644, "rewards/rejected": 0.0012358678504824638, "step": 75 }, { "debug/policy_chosen_logits": 1.3784531354904175, "debug/policy_chosen_logps": -5.820089817047119, "debug/policy_rejected_logits": 1.759643793106079, "debug/policy_rejected_logps": -4.801230430603027, "debug/reference_chosen_logps": -5.829151630401611, "debug/reference_rejected_logps": -4.762129306793213, "debug/sppo_chosen_loss": 2499.149658203125, "debug/sppo_chosen_reward_in_loss": 0.009061263874173164, "debug/sppo_rej_reward_in_loss": -0.039101071655750275, "debug/sppo_reject_loss": 2496.10791015625, "epoch": 0.2898550724637681, "grad_norm": 15570.962766998295, "learning_rate": 8e-08, "logits/chosen": 1.3784531354904175, "logits/rejected": 1.759643793106079, "logps/chosen": -5.820089817047119, "logps/rejected": -4.801230430603027, "loss": 4992.4242, "rewards/accuracies": 0.30000001192092896, "rewards/chosen": 9.061263699550182e-05, "rewards/margins": 0.0004816233122255653, "rewards/rejected": -0.00039101074798963964, "step": 80 }, { "debug/policy_chosen_logits": 1.5179107189178467, "debug/policy_chosen_logps": -64.65785217285156, "debug/policy_rejected_logits": 2.1373679637908936, "debug/policy_rejected_logps": -7.893537998199463, "debug/reference_chosen_logps": -64.85469055175781, "debug/reference_rejected_logps": -7.927921295166016, "debug/sppo_chosen_loss": 2482.44580078125, "debug/sppo_chosen_reward_in_loss": 0.1968422681093216, "debug/sppo_rej_reward_in_loss": 0.03438347578048706, "debug/sppo_reject_loss": 2503.471923828125, "epoch": 0.3079710144927536, "grad_norm": 52086.12616840031, "learning_rate": 8.5e-08, "logits/chosen": 1.5179107189178467, "logits/rejected": 2.1373679637908936, "logps/chosen": -64.65785217285156, "logps/rejected": -7.893537998199463, "loss": 4994.7273, "rewards/accuracies": 0.25, "rewards/chosen": 0.0019684224389493465, "rewards/margins": 0.0016245876904577017, "rewards/rejected": 0.00034383474849164486, "step": 85 }, { "debug/policy_chosen_logits": 1.2522896528244019, "debug/policy_chosen_logps": -121.9262466430664, "debug/policy_rejected_logits": 1.4387035369873047, "debug/policy_rejected_logps": -7.519499778747559, "debug/reference_chosen_logps": -122.35246276855469, "debug/reference_rejected_logps": -7.5182623863220215, "debug/sppo_chosen_loss": 2462.625, "debug/sppo_chosen_reward_in_loss": 0.4262300431728363, "debug/sppo_rej_reward_in_loss": -0.0012376547092571855, "debug/sppo_reject_loss": 2499.88916015625, "epoch": 0.32608695652173914, "grad_norm": 16700.98081474839, "learning_rate": 9e-08, "logits/chosen": 1.2522896528244019, "logits/rejected": 1.4387035369873047, "logps/chosen": -121.9262466430664, "logps/rejected": -7.519499778747559, "loss": 5001.143, "rewards/accuracies": 0.4749999940395355, "rewards/chosen": 0.004262299742549658, "rewards/margins": 0.00427467655390501, "rewards/rejected": -1.237654396391008e-05, "step": 90 }, { "debug/policy_chosen_logits": 1.4466530084609985, "debug/policy_chosen_logps": -149.0825653076172, "debug/policy_rejected_logits": 1.7712924480438232, "debug/policy_rejected_logps": -8.136910438537598, "debug/reference_chosen_logps": -149.43124389648438, "debug/reference_rejected_logps": -8.184698104858398, "debug/sppo_chosen_loss": 2467.141845703125, "debug/sppo_chosen_reward_in_loss": 0.3486630320549011, "debug/sppo_rej_reward_in_loss": 0.04778692126274109, "debug/sppo_reject_loss": 2504.84228515625, "epoch": 0.3442028985507246, "grad_norm": 13190.559176247856, "learning_rate": 9.499999999999999e-08, "logits/chosen": 1.4466530084609985, "logits/rejected": 1.7712924480438232, "logps/chosen": -149.0825653076172, "logps/rejected": -8.136910438537598, "loss": 4991.9758, "rewards/accuracies": 0.2750000059604645, "rewards/chosen": 0.00348663073964417, "rewards/margins": 0.003008761443197727, "rewards/rejected": 0.00047786920913495123, "step": 95 }, { "debug/policy_chosen_logits": 1.210399866104126, "debug/policy_chosen_logps": -6.510025978088379, "debug/policy_rejected_logits": 1.8930845260620117, "debug/policy_rejected_logps": -1.8170474767684937, "debug/reference_chosen_logps": -6.515327453613281, "debug/reference_rejected_logps": -1.8118356466293335, "debug/sppo_chosen_loss": 2499.52197265625, "debug/sppo_chosen_reward_in_loss": 0.0053020804189145565, "debug/sppo_rej_reward_in_loss": -0.005211913492530584, "debug/sppo_reject_loss": 2499.489013671875, "epoch": 0.36231884057971014, "grad_norm": 48326.079597574775, "learning_rate": 1e-07, "logits/chosen": 1.210399866104126, "logits/rejected": 1.8930845260620117, "logps/chosen": -6.510025978088379, "logps/rejected": -1.8170474767684937, "loss": 4999.5461, "rewards/accuracies": 0.25, "rewards/chosen": 5.302080535329878e-05, "rewards/margins": 0.00010513995948713273, "rewards/rejected": -5.21191323059611e-05, "step": 100 }, { "epoch": 0.36231884057971014, "eval_debug/policy_chosen_logits": 1.6641840934753418, "eval_debug/policy_chosen_logps": -122.6432113647461, "eval_debug/policy_rejected_logits": 1.7268767356872559, "eval_debug/policy_rejected_logps": -63.68825149536133, "eval_debug/reference_chosen_logps": -123.14806365966797, "eval_debug/reference_rejected_logps": -63.887054443359375, "eval_debug/sppo_chosen_loss": 2453.15234375, "eval_debug/sppo_chosen_reward_in_loss": 0.5048502087593079, "eval_debug/sppo_rej_reward_in_loss": 0.1988052874803543, "eval_debug/sppo_reject_loss": 2523.21435546875, "eval_logits/chosen": 1.6641840934753418, "eval_logits/rejected": 1.7268767356872559, "eval_logps/chosen": -122.6432113647461, "eval_logps/rejected": -63.68825149536133, "eval_loss": 4988.09521484375, "eval_rewards/accuracies": 0.2763157784938812, "eval_rewards/chosen": 0.005048501770943403, "eval_rewards/margins": 0.0030604489147663116, "eval_rewards/rejected": 0.001988052623346448, "eval_runtime": 28.7537, "eval_samples_per_second": 20.867, "eval_steps_per_second": 0.661, "step": 100 }, { "debug/policy_chosen_logits": 1.1344143152236938, "debug/policy_chosen_logps": -5.457496643066406, "debug/policy_rejected_logits": 1.2507808208465576, "debug/policy_rejected_logps": -100.43708801269531, "debug/reference_chosen_logps": -5.478997230529785, "debug/reference_rejected_logps": -100.13005828857422, "debug/sppo_chosen_loss": 2497.888671875, "debug/sppo_chosen_reward_in_loss": 0.021500717848539352, "debug/sppo_rej_reward_in_loss": -0.30704087018966675, "debug/sppo_reject_loss": 2476.33740234375, "epoch": 0.3804347826086957, "grad_norm": 195735.53101143672, "learning_rate": 9.999861184954399e-08, "logits/chosen": 1.1344143152236938, "logits/rejected": 1.2507808208465576, "logps/chosen": -5.457496643066406, "logps/rejected": -100.43708801269531, "loss": 5019.4508, "rewards/accuracies": 0.22499999403953552, "rewards/chosen": 0.00021500718139577657, "rewards/margins": 0.003285416169092059, "rewards/rejected": -0.003070408944040537, "step": 105 }, { "debug/policy_chosen_logits": 1.2915951013565063, "debug/policy_chosen_logps": -69.68765258789062, "debug/policy_rejected_logits": 1.5798096656799316, "debug/policy_rejected_logps": -6.682408332824707, "debug/reference_chosen_logps": -70.15248107910156, "debug/reference_rejected_logps": -6.677098274230957, "debug/sppo_chosen_loss": 2461.51318359375, "debug/sppo_chosen_reward_in_loss": 0.46482428908348083, "debug/sppo_rej_reward_in_loss": -0.0053092241287231445, "debug/sppo_reject_loss": 2499.48974609375, "epoch": 0.39855072463768115, "grad_norm": 50077.94286045815, "learning_rate": 9.999444747525447e-08, "logits/chosen": 1.2915951013565063, "logits/rejected": 1.5798096656799316, "logps/chosen": -69.68765258789062, "logps/rejected": -6.682408332824707, "loss": 4991.7578, "rewards/accuracies": 0.2750000059604645, "rewards/chosen": 0.004648244008421898, "rewards/margins": 0.0047013359144330025, "rewards/rejected": -5.309223706717603e-05, "step": 110 }, { "debug/policy_chosen_logits": 1.5621331930160522, "debug/policy_chosen_logps": -72.80657196044922, "debug/policy_rejected_logits": 1.9434821605682373, "debug/policy_rejected_logps": -17.86495590209961, "debug/reference_chosen_logps": -73.14714050292969, "debug/reference_rejected_logps": -17.959739685058594, "debug/sppo_chosen_loss": 2469.55419921875, "debug/sppo_chosen_reward_in_loss": 0.340567409992218, "debug/sppo_rej_reward_in_loss": 0.09478273242712021, "debug/sppo_reject_loss": 2509.90625, "epoch": 0.4166666666666667, "grad_norm": 46640.34842108522, "learning_rate": 9.998750710836255e-08, "logits/chosen": 1.5621331930160522, "logits/rejected": 1.9434821605682373, "logps/chosen": -72.80657196044922, "logps/rejected": -17.86495590209961, "loss": 4993.5625, "rewards/accuracies": 0.3499999940395355, "rewards/chosen": 0.0034056738950312138, "rewards/margins": 0.0024578471202403307, "rewards/rejected": 0.0009478272986598313, "step": 115 }, { "debug/policy_chosen_logits": 1.3487129211425781, "debug/policy_chosen_logps": -47.41932678222656, "debug/policy_rejected_logits": 1.7025136947631836, "debug/policy_rejected_logps": -2.4432568550109863, "debug/reference_chosen_logps": -47.64037322998047, "debug/reference_rejected_logps": -2.4745194911956787, "debug/sppo_chosen_loss": 2479.402099609375, "debug/sppo_chosen_reward_in_loss": 0.22104182839393616, "debug/sppo_rej_reward_in_loss": 0.03126268461346626, "debug/sppo_reject_loss": 2503.14013671875, "epoch": 0.43478260869565216, "grad_norm": 81955.17702947302, "learning_rate": 9.997779113423914e-08, "logits/chosen": 1.3487129211425781, "logits/rejected": 1.7025136947631836, "logps/chosen": -47.41932678222656, "logps/rejected": -2.4432568550109863, "loss": 4997.5547, "rewards/accuracies": 0.20000000298023224, "rewards/chosen": 0.0022104184608906507, "rewards/margins": 0.0018977917497977614, "rewards/rejected": 0.0003126268566120416, "step": 120 }, { "debug/policy_chosen_logits": 1.4282780885696411, "debug/policy_chosen_logps": -6.312525749206543, "debug/policy_rejected_logits": 1.669171929359436, "debug/policy_rejected_logps": -58.1739387512207, "debug/reference_chosen_logps": -6.291218280792236, "debug/reference_rejected_logps": -58.57655715942383, "debug/sppo_chosen_loss": 2502.173095703125, "debug/sppo_chosen_reward_in_loss": -0.021307198330760002, "debug/sppo_rej_reward_in_loss": 0.4026147723197937, "debug/sppo_reject_loss": 2546.1806640625, "epoch": 0.4528985507246377, "grad_norm": 35277.88079837917, "learning_rate": 9.996530009237363e-08, "logits/chosen": 1.4282780885696411, "logits/rejected": 1.669171929359436, "logps/chosen": -6.312525749206543, "logps/rejected": -58.1739387512207, "loss": 5003.2082, "rewards/accuracies": 0.2750000059604645, "rewards/chosen": -0.00021307200950104743, "rewards/margins": -0.00423921924084425, "rewards/rejected": 0.004026147536933422, "step": 125 }, { "debug/policy_chosen_logits": 1.215766191482544, "debug/policy_chosen_logps": -4.787589073181152, "debug/policy_rejected_logits": 1.6413600444793701, "debug/policy_rejected_logps": -7.680710792541504, "debug/reference_chosen_logps": -4.811456203460693, "debug/reference_rejected_logps": -7.753373622894287, "debug/sppo_chosen_loss": 2497.645751953125, "debug/sppo_chosen_reward_in_loss": 0.02386767603456974, "debug/sppo_rej_reward_in_loss": 0.07266347110271454, "debug/sppo_reject_loss": 2507.34765625, "epoch": 0.47101449275362317, "grad_norm": 63892.61735071012, "learning_rate": 9.995003467634381e-08, "logits/chosen": 1.215766191482544, "logits/rejected": 1.6413600444793701, "logps/chosen": -4.787589073181152, "logps/rejected": -7.680710792541504, "loss": 4967.5477, "rewards/accuracies": 0.20000000298023224, "rewards/chosen": 0.00023867675918154418, "rewards/margins": -0.00048795799375511706, "rewards/rejected": 0.0007266346947290003, "step": 130 }, { "debug/policy_chosen_logits": 1.353515386581421, "debug/policy_chosen_logps": -7.581971645355225, "debug/policy_rejected_logits": 1.8122670650482178, "debug/policy_rejected_logps": -45.84088897705078, "debug/reference_chosen_logps": -7.610405921936035, "debug/reference_rejected_logps": -46.041690826416016, "debug/sppo_chosen_loss": 2497.179443359375, "debug/sppo_chosen_reward_in_loss": 0.028433550149202347, "debug/sppo_rej_reward_in_loss": 0.20080089569091797, "debug/sppo_reject_loss": 2521.134765625, "epoch": 0.4891304347826087, "grad_norm": 21808.144692485934, "learning_rate": 9.99319957337775e-08, "logits/chosen": 1.353515386581421, "logits/rejected": 1.8122670650482178, "logps/chosen": -7.581971645355225, "logps/rejected": -45.84088897705078, "loss": 5004.7598, "rewards/accuracies": 0.2750000059604645, "rewards/chosen": 0.0002843355177901685, "rewards/margins": -0.001723673427477479, "rewards/rejected": 0.0020080087706446648, "step": 135 }, { "debug/policy_chosen_logits": 1.7343065738677979, "debug/policy_chosen_logps": -10.862150192260742, "debug/policy_rejected_logits": 2.0277140140533447, "debug/policy_rejected_logps": -1.9356765747070312, "debug/reference_chosen_logps": -10.920201301574707, "debug/reference_rejected_logps": -1.9509022235870361, "debug/sppo_chosen_loss": 2494.29736328125, "debug/sppo_chosen_reward_in_loss": 0.058051250874996185, "debug/sppo_rej_reward_in_loss": 0.015225586481392384, "debug/sppo_reject_loss": 2501.537109375, "epoch": 0.5072463768115942, "grad_norm": 184174.1935477376, "learning_rate": 9.991118426630531e-08, "logits/chosen": 1.7343065738677979, "logits/rejected": 2.0277140140533447, "logps/chosen": -10.862150192260742, "logps/rejected": -1.9356765747070312, "loss": 4997.9758, "rewards/accuracies": 0.375, "rewards/chosen": 0.0005805124528706074, "rewards/margins": 0.00042825666605494916, "rewards/rejected": 0.00015225585957523435, "step": 140 }, { "debug/policy_chosen_logits": 1.248162031173706, "debug/policy_chosen_logps": -226.30712890625, "debug/policy_rejected_logits": 1.5711052417755127, "debug/policy_rejected_logps": -5.885363578796387, "debug/reference_chosen_logps": -228.1366729736328, "debug/reference_rejected_logps": -5.889040946960449, "debug/sppo_chosen_loss": 2364.07470703125, "debug/sppo_chosen_reward_in_loss": 1.8295139074325562, "debug/sppo_rej_reward_in_loss": 0.0036771714221686125, "debug/sppo_reject_loss": 2500.3818359375, "epoch": 0.5253623188405797, "grad_norm": 46357.11462364378, "learning_rate": 9.988760142950516e-08, "logits/chosen": 1.248162031173706, "logits/rejected": 1.5711052417755127, "logps/chosen": -226.30712890625, "logps/rejected": -5.885363578796387, "loss": 4990.2539, "rewards/accuracies": 0.32499998807907104, "rewards/chosen": 0.01829513907432556, "rewards/margins": 0.018258368596434593, "rewards/rejected": 3.677170752780512e-05, "step": 145 }, { "debug/policy_chosen_logits": 1.61501944065094, "debug/policy_chosen_logps": -195.53323364257812, "debug/policy_rejected_logits": 2.077481746673584, "debug/policy_rejected_logps": -69.79290771484375, "debug/reference_chosen_logps": -198.35165405273438, "debug/reference_rejected_logps": -70.21018981933594, "debug/sppo_chosen_loss": 2389.683349609375, "debug/sppo_chosen_reward_in_loss": 2.818429470062256, "debug/sppo_rej_reward_in_loss": 0.417285680770874, "debug/sppo_reject_loss": 2549.34130859375, "epoch": 0.5434782608695652, "grad_norm": 28713.592266637042, "learning_rate": 9.98612485328381e-08, "logits/chosen": 1.61501944065094, "logits/rejected": 2.077481746673584, "logps/chosen": -195.53323364257812, "logps/rejected": -69.79290771484375, "loss": 4972.5906, "rewards/accuracies": 0.3499999940395355, "rewards/chosen": 0.02818429097533226, "rewards/margins": 0.024011436849832535, "rewards/rejected": 0.004172856919467449, "step": 150 }, { "debug/policy_chosen_logits": 1.3936008214950562, "debug/policy_chosen_logps": -186.7118682861328, "debug/policy_rejected_logits": 1.7444528341293335, "debug/policy_rejected_logps": -11.318367004394531, "debug/reference_chosen_logps": -188.67599487304688, "debug/reference_rejected_logps": -11.309015274047852, "debug/sppo_chosen_loss": 2415.69140625, "debug/sppo_chosen_reward_in_loss": 1.9641516208648682, "debug/sppo_rej_reward_in_loss": -0.009350456297397614, "debug/sppo_reject_loss": 2499.109130859375, "epoch": 0.5615942028985508, "grad_norm": 55501.62480336653, "learning_rate": 9.983212703957554e-08, "logits/chosen": 1.3936008214950562, "logits/rejected": 1.7444528341293335, "logps/chosen": -186.7118682861328, "logps/rejected": -11.318367004394531, "loss": 4965.591, "rewards/accuracies": 0.3499999940395355, "rewards/chosen": 0.019641516730189323, "rewards/margins": 0.019735019654035568, "rewards/rejected": -9.350453910883516e-05, "step": 155 }, { "debug/policy_chosen_logits": 1.12738835811615, "debug/policy_chosen_logps": -41.19074249267578, "debug/policy_rejected_logits": 1.589592695236206, "debug/policy_rejected_logps": -7.270390510559082, "debug/reference_chosen_logps": -41.46500778198242, "debug/reference_rejected_logps": -7.236231803894043, "debug/sppo_chosen_loss": 2476.40478515625, "debug/sppo_chosen_reward_in_loss": 0.27427077293395996, "debug/sppo_rej_reward_in_loss": -0.03415922075510025, "debug/sppo_reject_loss": 2496.604248046875, "epoch": 0.5797101449275363, "grad_norm": 13483.19188792202, "learning_rate": 9.980023856671804e-08, "logits/chosen": 1.12738835811615, "logits/rejected": 1.589592695236206, "logps/chosen": -41.19074249267578, "logps/rejected": -7.270390510559082, "loss": 4963.8273, "rewards/accuracies": 0.375, "rewards/chosen": 0.002742707496508956, "rewards/margins": 0.0030842998530715704, "rewards/rejected": -0.0003415921819396317, "step": 160 }, { "debug/policy_chosen_logits": 1.6330093145370483, "debug/policy_chosen_logps": -46.82362365722656, "debug/policy_rejected_logits": 1.4521421194076538, "debug/policy_rejected_logps": -26.289093017578125, "debug/reference_chosen_logps": -47.207061767578125, "debug/reference_rejected_logps": -26.676687240600586, "debug/sppo_chosen_loss": 2465.522216796875, "debug/sppo_chosen_reward_in_loss": 0.3834393620491028, "debug/sppo_rej_reward_in_loss": 0.3875953257083893, "debug/sppo_reject_loss": 2543.467529296875, "epoch": 0.5978260869565217, "grad_norm": 40577.152085164584, "learning_rate": 9.976558488490555e-08, "logits/chosen": 1.6330093145370483, "logits/rejected": 1.4521421194076538, "logps/chosen": -46.82362365722656, "logps/rejected": -26.289093017578125, "loss": 5033.4609, "rewards/accuracies": 0.20000000298023224, "rewards/chosen": 0.003834393573924899, "rewards/margins": -4.155974602326751e-05, "rewards/rejected": 0.0038759533781558275, "step": 165 }, { "debug/policy_chosen_logits": 1.3022582530975342, "debug/policy_chosen_logps": -50.278297424316406, "debug/policy_rejected_logits": 1.609271764755249, "debug/policy_rejected_logps": -6.32607364654541, "debug/reference_chosen_logps": -49.824684143066406, "debug/reference_rejected_logps": -6.327885627746582, "debug/sppo_chosen_loss": 2552.67724609375, "debug/sppo_chosen_reward_in_loss": -0.45361679792404175, "debug/sppo_rej_reward_in_loss": 0.0018122732872143388, "debug/sppo_reject_loss": 2500.300048828125, "epoch": 0.6159420289855072, "grad_norm": 200849.77727708206, "learning_rate": 9.972816791831899e-08, "logits/chosen": 1.3022582530975342, "logits/rejected": 1.609271764755249, "logps/chosen": -50.278297424316406, "logps/rejected": -6.32607364654541, "loss": 5056.8414, "rewards/accuracies": 0.22499999403953552, "rewards/chosen": -0.004536167718470097, "rewards/margins": -0.004554290324449539, "rewards/rejected": 1.8122711480828002e-05, "step": 170 }, { "debug/policy_chosen_logits": 1.4527250528335571, "debug/policy_chosen_logps": -11.62114429473877, "debug/policy_rejected_logits": 1.8912639617919922, "debug/policy_rejected_logps": -112.84718322753906, "debug/reference_chosen_logps": -11.554305076599121, "debug/reference_rejected_logps": -113.70204162597656, "debug/sppo_chosen_loss": 2506.783935546875, "debug/sppo_chosen_reward_in_loss": -0.06683876365423203, "debug/sppo_rej_reward_in_loss": 0.8548470735549927, "debug/sppo_reject_loss": 2617.95166015625, "epoch": 0.6340579710144928, "grad_norm": 123695.54642316517, "learning_rate": 9.968798974457359e-08, "logits/chosen": 1.4527250528335571, "logits/rejected": 1.8912639617919922, "logps/chosen": -11.62114429473877, "logps/rejected": -112.84718322753906, "loss": 5001.0488, "rewards/accuracies": 0.25, "rewards/chosen": -0.0006683876854367554, "rewards/margins": -0.009216858074069023, "rewards/rejected": 0.008548470214009285, "step": 175 }, { "debug/policy_chosen_logits": 1.263779878616333, "debug/policy_chosen_logps": -6.504377841949463, "debug/policy_rejected_logits": 1.6323928833007812, "debug/policy_rejected_logps": -11.183588981628418, "debug/reference_chosen_logps": -6.455367088317871, "debug/reference_rejected_logps": -11.164003372192383, "debug/sppo_chosen_loss": 2505.044921875, "debug/sppo_chosen_reward_in_loss": -0.0490097776055336, "debug/sppo_rej_reward_in_loss": -0.019586723297834396, "debug/sppo_reject_loss": 2498.0791015625, "epoch": 0.6521739130434783, "grad_norm": 38655.0777029622, "learning_rate": 9.964505259460332e-08, "logits/chosen": 1.263779878616333, "logits/rejected": 1.6323928833007812, "logps/chosen": -6.504377841949463, "logps/rejected": -11.183588981628418, "loss": 5011.5578, "rewards/accuracies": 0.2750000059604645, "rewards/chosen": -0.0004900977946817875, "rewards/margins": -0.00029423058731481433, "rewards/rejected": -0.00019586723647080362, "step": 180 }, { "debug/policy_chosen_logits": 1.248523473739624, "debug/policy_chosen_logps": -7.896877288818359, "debug/policy_rejected_logits": 1.699440598487854, "debug/policy_rejected_logps": -15.617756843566895, "debug/reference_chosen_logps": -7.793820858001709, "debug/reference_rejected_logps": -15.452303886413574, "debug/sppo_chosen_loss": 2510.38232421875, "debug/sppo_chosen_reward_in_loss": -0.10305584967136383, "debug/sppo_rej_reward_in_loss": -0.1654536873102188, "debug/sppo_reject_loss": 2483.616943359375, "epoch": 0.6702898550724637, "grad_norm": 32620.35688431207, "learning_rate": 9.959935885253715e-08, "logits/chosen": 1.248523473739624, "logits/rejected": 1.699440598487854, "logps/chosen": -7.896877288818359, "logps/rejected": -15.617756843566895, "loss": 5009.3246, "rewards/accuracies": 0.32499998807907104, "rewards/chosen": -0.0010305584874004126, "rewards/margins": 0.0006239783251658082, "rewards/rejected": -0.0016545368125662208, "step": 185 }, { "debug/policy_chosen_logits": 1.2533260583877563, "debug/policy_chosen_logps": -55.031654357910156, "debug/policy_rejected_logits": 1.324285626411438, "debug/policy_rejected_logps": -3.648160219192505, "debug/reference_chosen_logps": -55.711204528808594, "debug/reference_rejected_logps": -3.6667861938476562, "debug/sppo_chosen_loss": 2448.38818359375, "debug/sppo_chosen_reward_in_loss": 0.6795614361763, "debug/sppo_rej_reward_in_loss": 0.01862628385424614, "debug/sppo_reject_loss": 2501.91064453125, "epoch": 0.6884057971014492, "grad_norm": 45927.79597339519, "learning_rate": 9.955091105556664e-08, "logits/chosen": 1.2533260583877563, "logits/rejected": 1.324285626411438, "logps/chosen": -55.031654357910156, "logps/rejected": -3.648160219192505, "loss": 5011.5492, "rewards/accuracies": 0.30000001192092896, "rewards/chosen": 0.006795613560825586, "rewards/margins": 0.0066093518398702145, "rewards/rejected": 0.00018626284145284444, "step": 190 }, { "debug/policy_chosen_logits": 1.8112897872924805, "debug/policy_chosen_logps": -100.58317565917969, "debug/policy_rejected_logits": 2.179638624191284, "debug/policy_rejected_logps": -83.06779479980469, "debug/reference_chosen_logps": -101.08064270019531, "debug/reference_rejected_logps": -83.28086853027344, "debug/sppo_chosen_loss": 2454.91259765625, "debug/sppo_chosen_reward_in_loss": 0.4974687993526459, "debug/sppo_rej_reward_in_loss": 0.21308830380439758, "debug/sppo_reject_loss": 2524.06103515625, "epoch": 0.7065217391304348, "grad_norm": 54560.619792140395, "learning_rate": 9.949971189380507e-08, "logits/chosen": 1.8112897872924805, "logits/rejected": 2.179638624191284, "logps/chosen": -100.58317565917969, "logps/rejected": -83.06779479980469, "loss": 4988.9668, "rewards/accuracies": 0.375, "rewards/chosen": 0.004974688403308392, "rewards/margins": 0.0028438051231205463, "rewards/rejected": 0.002130882814526558, "step": 195 }, { "debug/policy_chosen_logits": 1.266251802444458, "debug/policy_chosen_logps": -4.813408851623535, "debug/policy_rejected_logits": 1.5832109451293945, "debug/policy_rejected_logps": -52.66980743408203, "debug/reference_chosen_logps": -4.822422027587891, "debug/reference_rejected_logps": -53.10357666015625, "debug/sppo_chosen_loss": 2499.11328125, "debug/sppo_chosen_reward_in_loss": 0.009012925438582897, "debug/sppo_rej_reward_in_loss": 0.4337654709815979, "debug/sppo_reject_loss": 2550.784423828125, "epoch": 0.7246376811594203, "grad_norm": 71334.45511788806, "learning_rate": 9.944576421013802e-08, "logits/chosen": 1.266251802444458, "logits/rejected": 1.5832109451293945, "logps/chosen": -4.813408851623535, "logps/rejected": -52.66980743408203, "loss": 5011.4531, "rewards/accuracies": 0.2750000059604645, "rewards/chosen": 9.012925147544593e-05, "rewards/margins": -0.0042475247755646706, "rewards/rejected": 0.004337654449045658, "step": 200 }, { "epoch": 0.7246376811594203, "eval_debug/policy_chosen_logits": 1.6732391119003296, "eval_debug/policy_chosen_logps": -121.37860107421875, "eval_debug/policy_rejected_logits": 1.733038067817688, "eval_debug/policy_rejected_logps": -63.30970001220703, "eval_debug/reference_chosen_logps": -123.14806365966797, "eval_debug/reference_rejected_logps": -63.887054443359375, "eval_debug/sppo_chosen_loss": 2386.03955078125, "eval_debug/sppo_chosen_reward_in_loss": 1.769453763961792, "eval_debug/sppo_rej_reward_in_loss": 0.5773504972457886, "eval_debug/sppo_reject_loss": 2582.69482421875, "eval_logits/chosen": 1.6732391119003296, "eval_logits/rejected": 1.733038067817688, "eval_logps/chosen": -121.37860107421875, "eval_logps/rejected": -63.30970001220703, "eval_loss": 4990.56103515625, "eval_rewards/accuracies": 0.31578946113586426, "eval_rewards/chosen": 0.017694536596536636, "eval_rewards/margins": 0.011921032331883907, "eval_rewards/rejected": 0.005773505661636591, "eval_runtime": 28.7496, "eval_samples_per_second": 20.87, "eval_steps_per_second": 0.661, "step": 200 }, { "debug/policy_chosen_logits": 1.3786900043487549, "debug/policy_chosen_logps": -4.881337642669678, "debug/policy_rejected_logits": 1.7196651697158813, "debug/policy_rejected_logps": -85.54505920410156, "debug/reference_chosen_logps": -4.833613872528076, "debug/reference_rejected_logps": -85.68836212158203, "debug/sppo_chosen_loss": 2504.79931640625, "debug/sppo_chosen_reward_in_loss": -0.047723717987537384, "debug/sppo_rej_reward_in_loss": 0.14329808950424194, "debug/sppo_reject_loss": 2516.5068359375, "epoch": 0.7427536231884058, "grad_norm": 173806.55476719516, "learning_rate": 9.938907100006552e-08, "logits/chosen": 1.3786900043487549, "logits/rejected": 1.7196651697158813, "logps/chosen": -4.881337642669678, "logps/rejected": -85.54505920410156, "loss": 5008.4223, "rewards/accuracies": 0.25, "rewards/chosen": -0.00047723716124892235, "rewards/margins": -0.001910218270495534, "rewards/rejected": 0.0014329809928312898, "step": 205 }, { "debug/policy_chosen_logits": 1.2093603610992432, "debug/policy_chosen_logps": -63.24913787841797, "debug/policy_rejected_logits": 1.508322834968567, "debug/policy_rejected_logps": -45.30883026123047, "debug/reference_chosen_logps": -63.378448486328125, "debug/reference_rejected_logps": -45.54474639892578, "debug/sppo_chosen_loss": 2488.00537109375, "debug/sppo_chosen_reward_in_loss": 0.12931086122989655, "debug/sppo_rej_reward_in_loss": 0.23591335117816925, "debug/sppo_reject_loss": 2525.66552734375, "epoch": 0.7608695652173914, "grad_norm": 49685.39908078443, "learning_rate": 9.932963541153584e-08, "logits/chosen": 1.2093603610992432, "logits/rejected": 1.508322834968567, "logps/chosen": -63.24913787841797, "logps/rejected": -45.30883026123047, "loss": 5018.6305, "rewards/accuracies": 0.15000000596046448, "rewards/chosen": 0.0012931086821481586, "rewards/margins": -0.0010660247644409537, "rewards/rejected": 0.002359133679419756, "step": 210 }, { "debug/policy_chosen_logits": 1.493700623512268, "debug/policy_chosen_logps": -46.357460021972656, "debug/policy_rejected_logits": 1.8851032257080078, "debug/policy_rejected_logps": -8.006715774536133, "debug/reference_chosen_logps": -46.571571350097656, "debug/reference_rejected_logps": -7.923023223876953, "debug/sppo_chosen_loss": 2481.19482421875, "debug/sppo_chosen_reward_in_loss": 0.21411189436912537, "debug/sppo_rej_reward_in_loss": -0.08369234204292297, "debug/sppo_reject_loss": 2491.713134765625, "epoch": 0.7789855072463768, "grad_norm": 54093.60168531016, "learning_rate": 9.926746074477053e-08, "logits/chosen": 1.493700623512268, "logits/rejected": 1.8851032257080078, "logps/chosen": -46.357460021972656, "logps/rejected": -8.006715774536133, "loss": 4960.9219, "rewards/accuracies": 0.32499998807907104, "rewards/chosen": 0.002141118748113513, "rewards/margins": 0.0029780424665659666, "rewards/rejected": -0.0008369233692064881, "step": 215 }, { "debug/policy_chosen_logits": 0.9914774894714355, "debug/policy_chosen_logps": -26.690719604492188, "debug/policy_rejected_logits": 1.316943883895874, "debug/policy_rejected_logps": -10.111607551574707, "debug/reference_chosen_logps": -26.971553802490234, "debug/reference_rejected_logps": -10.127599716186523, "debug/sppo_chosen_loss": 2475.00048828125, "debug/sppo_chosen_reward_in_loss": 0.2808329463005066, "debug/sppo_rej_reward_in_loss": 0.015992391854524612, "debug/sppo_reject_loss": 2501.6474609375, "epoch": 0.7971014492753623, "grad_norm": 35172.37430087494, "learning_rate": 9.920255045208128e-08, "logits/chosen": 0.9914774894714355, "logits/rejected": 1.316943883895874, "logps/chosen": -26.690719604492188, "logps/rejected": -10.111607551574707, "loss": 5010.748, "rewards/accuracies": 0.22499999403953552, "rewards/chosen": 0.002808329416438937, "rewards/margins": 0.0026484052650630474, "rewards/rejected": 0.0001599239039933309, "step": 220 }, { "debug/policy_chosen_logits": 1.3616772890090942, "debug/policy_chosen_logps": -7.503972053527832, "debug/policy_rejected_logits": 1.6508338451385498, "debug/policy_rejected_logps": -117.13150787353516, "debug/reference_chosen_logps": -7.492678165435791, "debug/reference_rejected_logps": -117.96882629394531, "debug/sppo_chosen_loss": 2501.196044921875, "debug/sppo_chosen_reward_in_loss": -0.011294806376099586, "debug/sppo_rej_reward_in_loss": 0.8373193740844727, "debug/sppo_reject_loss": 2609.630126953125, "epoch": 0.8152173913043478, "grad_norm": 33681.17088519409, "learning_rate": 9.913490813767816e-08, "logits/chosen": 1.3616772890090942, "logits/rejected": 1.6508338451385498, "logps/chosen": -7.503972053527832, "logps/rejected": -117.13150787353516, "loss": 5015.6332, "rewards/accuracies": 0.22499999403953552, "rewards/chosen": -0.00011294805153738707, "rewards/margins": -0.008486142382025719, "rewards/rejected": 0.008373195305466652, "step": 225 }, { "debug/policy_chosen_logits": 1.2211979627609253, "debug/policy_chosen_logps": -44.33199691772461, "debug/policy_rejected_logits": 1.4328826665878296, "debug/policy_rejected_logps": -9.628314971923828, "debug/reference_chosen_logps": -44.462120056152344, "debug/reference_rejected_logps": -9.55119514465332, "debug/sppo_chosen_loss": 2488.130126953125, "debug/sppo_chosen_reward_in_loss": 0.13011744618415833, "debug/sppo_rej_reward_in_loss": -0.0771201103925705, "debug/sppo_reject_loss": 2492.480224609375, "epoch": 0.8333333333333334, "grad_norm": 178662.08925744414, "learning_rate": 9.906453755746957e-08, "logits/chosen": 1.2211979627609253, "logits/rejected": 1.4328826665878296, "logps/chosen": -44.33199691772461, "logps/rejected": -9.628314971923828, "loss": 4973.0992, "rewards/accuracies": 0.22499999403953552, "rewards/chosen": 0.001301174284890294, "rewards/margins": 0.0020723752677440643, "rewards/rejected": -0.0007712010992690921, "step": 230 }, { "debug/policy_chosen_logits": 1.4040919542312622, "debug/policy_chosen_logps": -3.8263511657714844, "debug/policy_rejected_logits": 1.5735127925872803, "debug/policy_rejected_logps": -11.321832656860352, "debug/reference_chosen_logps": -3.7865378856658936, "debug/reference_rejected_logps": -11.303999900817871, "debug/sppo_chosen_loss": 2504.001953125, "debug/sppo_chosen_reward_in_loss": -0.03981299325823784, "debug/sppo_rej_reward_in_loss": -0.017831813544034958, "debug/sppo_reject_loss": 2498.27197265625, "epoch": 0.8514492753623188, "grad_norm": 144538.62516345055, "learning_rate": 9.899144261885363e-08, "logits/chosen": 1.4040919542312622, "logits/rejected": 1.5735127925872803, "logps/chosen": -3.8263511657714844, "logps/rejected": -11.321832656860352, "loss": 4986.9398, "rewards/accuracies": 0.17499999701976776, "rewards/chosen": -0.00039812998147681355, "rewards/margins": -0.00021981183090247214, "rewards/rejected": -0.00017831812147051096, "step": 235 }, { "debug/policy_chosen_logits": 1.4072325229644775, "debug/policy_chosen_logps": -3.5935165882110596, "debug/policy_rejected_logits": 1.9952967166900635, "debug/policy_rejected_logps": -36.11383819580078, "debug/reference_chosen_logps": -3.5993995666503906, "debug/reference_rejected_logps": -36.39490509033203, "debug/sppo_chosen_loss": 2499.442138671875, "debug/sppo_chosen_reward_in_loss": 0.005883321166038513, "debug/sppo_rej_reward_in_loss": 0.281069278717041, "debug/sppo_reject_loss": 2532.0126953125, "epoch": 0.8695652173913043, "grad_norm": 61561.99660500412, "learning_rate": 9.891562738050125e-08, "logits/chosen": 1.4072325229644775, "logits/rejected": 1.9952967166900635, "logps/chosen": -3.5935165882110596, "logps/rejected": -36.11383819580078, "loss": 5021.4039, "rewards/accuracies": 0.2750000059604645, "rewards/chosen": 5.883322955924086e-05, "rewards/margins": -0.0027518596034497023, "rewards/rejected": 0.0028106928803026676, "step": 240 }, { "debug/policy_chosen_logits": 1.1142200231552124, "debug/policy_chosen_logps": -3.6962780952453613, "debug/policy_rejected_logits": 1.4678951501846313, "debug/policy_rejected_logps": -171.041259765625, "debug/reference_chosen_logps": -3.6889584064483643, "debug/reference_rejected_logps": -171.53671264648438, "debug/sppo_chosen_loss": 2500.814453125, "debug/sppo_chosen_reward_in_loss": -0.007319542579352856, "debug/sppo_rej_reward_in_loss": 0.4954606592655182, "debug/sppo_reject_loss": 2555.048095703125, "epoch": 0.8876811594202898, "grad_norm": 82653.33032738155, "learning_rate": 9.883709605213071e-08, "logits/chosen": 1.1142200231552124, "logits/rejected": 1.4678951501846313, "logps/chosen": -3.6962780952453613, "logps/rejected": -171.041259765625, "loss": 4998.6938, "rewards/accuracies": 0.22499999403953552, "rewards/chosen": -7.319539145100862e-05, "rewards/margins": -0.005027801729738712, "rewards/rejected": 0.004954606294631958, "step": 245 }, { "debug/policy_chosen_logits": 1.2410731315612793, "debug/policy_chosen_logps": -5.444675445556641, "debug/policy_rejected_logits": 1.6047786474227905, "debug/policy_rejected_logps": -4.875066757202148, "debug/reference_chosen_logps": -5.442919731140137, "debug/reference_rejected_logps": -4.868186950683594, "debug/sppo_chosen_loss": 2500.20849609375, "debug/sppo_chosen_reward_in_loss": -0.001756325364112854, "debug/sppo_rej_reward_in_loss": -0.006879883818328381, "debug/sppo_reject_loss": 2499.33935546875, "epoch": 0.9057971014492754, "grad_norm": 67798.96882578227, "learning_rate": 9.8755852994274e-08, "logits/chosen": 1.2410731315612793, "logits/rejected": 1.6047786474227905, "logps/chosen": -5.444675445556641, "logps/rejected": -4.875066757202148, "loss": 5035.3043, "rewards/accuracies": 0.22499999403953552, "rewards/chosen": -1.7563277651788667e-05, "rewards/margins": 5.123559822095558e-05, "rewards/rejected": -6.87988504068926e-05, "step": 250 }, { "debug/policy_chosen_logits": 1.4319255352020264, "debug/policy_chosen_logps": -6.509596824645996, "debug/policy_rejected_logits": 1.6014362573623657, "debug/policy_rejected_logps": -2.829972743988037, "debug/reference_chosen_logps": -6.483206748962402, "debug/reference_rejected_logps": -2.824536085128784, "debug/sppo_chosen_loss": 2502.6669921875, "debug/sppo_chosen_reward_in_loss": -0.026390206068754196, "debug/sppo_rej_reward_in_loss": -0.005436348728835583, "debug/sppo_reject_loss": 2499.470703125, "epoch": 0.9239130434782609, "grad_norm": 121783.20276785568, "learning_rate": 9.867190271803463e-08, "logits/chosen": 1.4319255352020264, "logits/rejected": 1.6014362573623657, "logps/chosen": -6.509596824645996, "logps/rejected": -2.829972743988037, "loss": 5004.7781, "rewards/accuracies": 0.25, "rewards/chosen": -0.00026390206767246127, "rewards/margins": -0.00020953858620487154, "rewards/rejected": -5.436349238152616e-05, "step": 255 }, { "debug/policy_chosen_logits": 1.355271577835083, "debug/policy_chosen_logps": -67.56895446777344, "debug/policy_rejected_logits": 1.9250675439834595, "debug/policy_rejected_logps": -47.16566467285156, "debug/reference_chosen_logps": -67.80909729003906, "debug/reference_rejected_logps": -47.41716384887695, "debug/sppo_chosen_loss": 2477.47900390625, "debug/sppo_chosen_reward_in_loss": 0.24013932049274445, "debug/sppo_rej_reward_in_loss": 0.2515000104904175, "debug/sppo_reject_loss": 2528.0703125, "epoch": 0.9420289855072463, "grad_norm": 175494.6132913126, "learning_rate": 9.858524988483717e-08, "logits/chosen": 1.355271577835083, "logits/rejected": 1.9250675439834595, "logps/chosen": -67.56895446777344, "logps/rejected": -47.16566467285156, "loss": 4985.1531, "rewards/accuracies": 0.2750000059604645, "rewards/chosen": 0.002401393372565508, "rewards/margins": -0.0001136069986387156, "rewards/rejected": 0.002515000058338046, "step": 260 }, { "debug/policy_chosen_logits": 1.5709632635116577, "debug/policy_chosen_logps": -38.5024528503418, "debug/policy_rejected_logits": 1.721308946609497, "debug/policy_rejected_logps": -131.8822784423828, "debug/reference_chosen_logps": -38.708030700683594, "debug/reference_rejected_logps": -132.49644470214844, "debug/sppo_chosen_loss": 2480.50537109375, "debug/sppo_chosen_reward_in_loss": 0.20557840168476105, "debug/sppo_rej_reward_in_loss": 0.6141676902770996, "debug/sppo_reject_loss": 2565.62353515625, "epoch": 0.9601449275362319, "grad_norm": 62932.29201922026, "learning_rate": 9.849589930616841e-08, "logits/chosen": 1.5709632635116577, "logits/rejected": 1.721308946609497, "logps/chosen": -38.5024528503418, "logps/rejected": -131.8822784423828, "loss": 4977.1621, "rewards/accuracies": 0.25, "rewards/chosen": 0.0020557839889079332, "rewards/margins": -0.004085893277078867, "rewards/rejected": 0.006141676567494869, "step": 265 }, { "debug/policy_chosen_logits": 1.5259959697723389, "debug/policy_chosen_logps": -5.877644062042236, "debug/policy_rejected_logits": 2.049582004547119, "debug/policy_rejected_logps": -3.5420100688934326, "debug/reference_chosen_logps": -5.859536170959473, "debug/reference_rejected_logps": -3.5223605632781982, "debug/sppo_chosen_loss": 2501.84912109375, "debug/sppo_chosen_reward_in_loss": -0.018107330426573753, "debug/sppo_rej_reward_in_loss": -0.019649498164653778, "debug/sppo_reject_loss": 2498.049072265625, "epoch": 0.9782608695652174, "grad_norm": 42987.56807220796, "learning_rate": 9.840385594331021e-08, "logits/chosen": 1.5259959697723389, "logits/rejected": 2.049582004547119, "logps/chosen": -5.877644062042236, "logps/rejected": -3.5420100688934326, "loss": 4982.1203, "rewards/accuracies": 0.3499999940395355, "rewards/chosen": -0.00018107329378835857, "rewards/margins": 1.5421677744598128e-05, "rewards/rejected": -0.0001964949769899249, "step": 270 }, { "debug/policy_chosen_logits": 1.1668039560317993, "debug/policy_chosen_logps": -4.2212419509887695, "debug/policy_rejected_logits": 1.433667778968811, "debug/policy_rejected_logps": -12.042573928833008, "debug/reference_chosen_logps": -4.2250447273254395, "debug/reference_rejected_logps": -11.967859268188477, "debug/sppo_chosen_loss": 2499.632080078125, "debug/sppo_chosen_reward_in_loss": 0.003803357481956482, "debug/sppo_rej_reward_in_loss": -0.07471342384815216, "debug/sppo_reject_loss": 2492.65234375, "epoch": 0.9963768115942029, "grad_norm": 78261.36031246795, "learning_rate": 9.830912490706402e-08, "logits/chosen": 1.1668039560317993, "logits/rejected": 1.433667778968811, "logps/chosen": -4.2212419509887695, "logps/rejected": -12.042573928833008, "loss": 5011.2508, "rewards/accuracies": 0.3499999940395355, "rewards/chosen": 3.803357685683295e-05, "rewards/margins": 0.0007851679110899568, "rewards/rejected": -0.0007471342687495053, "step": 275 }, { "debug/policy_chosen_logits": 1.4105700254440308, "debug/policy_chosen_logps": -134.09332275390625, "debug/policy_rejected_logits": 1.8624595403671265, "debug/policy_rejected_logps": -10.357294082641602, "debug/reference_chosen_logps": -135.3412628173828, "debug/reference_rejected_logps": -10.286213874816895, "debug/sppo_chosen_loss": 2440.96435546875, "debug/sppo_chosen_reward_in_loss": 1.2479249238967896, "debug/sppo_rej_reward_in_loss": -0.07108066231012344, "debug/sppo_reject_loss": 2493.04248046875, "epoch": 1.0144927536231885, "grad_norm": 14565.086909959728, "learning_rate": 9.821171145746709e-08, "logits/chosen": 1.4105700254440308, "logits/rejected": 1.8624595403671265, "logps/chosen": -134.09332275390625, "logps/rejected": -10.357294082641602, "loss": 4985.3234, "rewards/accuracies": 0.2750000059604645, "rewards/chosen": 0.012479249387979507, "rewards/margins": 0.013190053403377533, "rewards/rejected": -0.0007108066347427666, "step": 280 }, { "debug/policy_chosen_logits": 1.2903943061828613, "debug/policy_chosen_logps": -40.79714584350586, "debug/policy_rejected_logits": 1.5260273218154907, "debug/policy_rejected_logps": -7.4305925369262695, "debug/reference_chosen_logps": -40.90019989013672, "debug/reference_rejected_logps": -7.467019557952881, "debug/sppo_chosen_loss": 2490.119873046875, "debug/sppo_chosen_reward_in_loss": 0.1030566543340683, "debug/sppo_rej_reward_in_loss": 0.03642674535512924, "debug/sppo_reject_loss": 2503.66455078125, "epoch": 1.0326086956521738, "grad_norm": 81517.82462180834, "learning_rate": 9.811162100350039e-08, "logits/chosen": 1.2903943061828613, "logits/rejected": 1.5260273218154907, "logps/chosen": -40.79714584350586, "logps/rejected": -7.4305925369262695, "loss": 4979.7484, "rewards/accuracies": 0.25, "rewards/chosen": 0.0010305665200576186, "rewards/margins": 0.0006662990781478584, "rewards/rejected": 0.00036426744190976024, "step": 285 }, { "debug/policy_chosen_logits": 1.5964186191558838, "debug/policy_chosen_logps": -127.29988861083984, "debug/policy_rejected_logits": 1.718488097190857, "debug/policy_rejected_logps": -2.0937576293945312, "debug/reference_chosen_logps": -128.1494140625, "debug/reference_rejected_logps": -2.083263397216797, "debug/sppo_chosen_loss": 2435.686279296875, "debug/sppo_chosen_reward_in_loss": 0.849535346031189, "debug/sppo_rej_reward_in_loss": -0.010494199581444263, "debug/sppo_reject_loss": 2498.955810546875, "epoch": 1.0507246376811594, "grad_norm": 30230.894609034844, "learning_rate": 9.80088591027883e-08, "logits/chosen": 1.5964186191558838, "logits/rejected": 1.718488097190857, "logps/chosen": -127.29988861083984, "logps/rejected": -2.0937576293945312, "loss": 4958.2203, "rewards/accuracies": 0.30000001192092896, "rewards/chosen": 0.008495353162288666, "rewards/margins": 0.008600296452641487, "rewards/rejected": -0.00010494198795640841, "step": 290 }, { "debug/policy_chosen_logits": 1.3330150842666626, "debug/policy_chosen_logps": -3.2329049110412598, "debug/policy_rejected_logits": 1.7619152069091797, "debug/policy_rejected_logps": -5.186083793640137, "debug/reference_chosen_logps": -3.226156711578369, "debug/reference_rejected_logps": -5.182940483093262, "debug/sppo_chosen_loss": 2500.681884765625, "debug/sppo_chosen_reward_in_loss": -0.00674806535243988, "debug/sppo_rej_reward_in_loss": -0.003144294023513794, "debug/sppo_reject_loss": 2499.6923828125, "epoch": 1.068840579710145, "grad_norm": 33593.09315150871, "learning_rate": 9.790343146128999e-08, "logits/chosen": 1.3330150842666626, "logits/rejected": 1.7619152069091797, "logps/chosen": -3.2329049110412598, "logps/rejected": -5.186083793640137, "loss": 4969.8094, "rewards/accuracies": 0.15000000596046448, "rewards/chosen": -6.748065061401576e-05, "rewards/margins": -3.6037723475601524e-05, "rewards/rejected": -3.1442927138414234e-05, "step": 295 }, { "debug/policy_chosen_logits": 1.1224730014801025, "debug/policy_chosen_logps": -6.240724086761475, "debug/policy_rejected_logits": 1.4011281728744507, "debug/policy_rejected_logps": -92.62322235107422, "debug/reference_chosen_logps": -6.221102237701416, "debug/reference_rejected_logps": -93.07762145996094, "debug/sppo_chosen_loss": 2502.00146484375, "debug/sppo_chosen_reward_in_loss": -0.0196220763027668, "debug/sppo_rej_reward_in_loss": 0.4543977379798889, "debug/sppo_reject_loss": 2554.57177734375, "epoch": 1.0869565217391304, "grad_norm": 14055.162880344345, "learning_rate": 9.779534393298261e-08, "logits/chosen": 1.1224730014801025, "logits/rejected": 1.4011281728744507, "logps/chosen": -6.240724086761475, "logps/rejected": -92.62322235107422, "loss": 4987.3762, "rewards/accuracies": 0.25, "rewards/chosen": -0.00019622074614744633, "rewards/margins": -0.0047401972115039825, "rewards/rejected": 0.00454397639259696, "step": 300 }, { "epoch": 1.0869565217391304, "eval_debug/policy_chosen_logits": 1.682997703552246, "eval_debug/policy_chosen_logps": -121.15852355957031, "eval_debug/policy_rejected_logits": 1.7421146631240845, "eval_debug/policy_rejected_logps": -63.27250671386719, "eval_debug/reference_chosen_logps": -123.14806365966797, "eval_debug/reference_rejected_logps": -63.887054443359375, "eval_debug/sppo_chosen_loss": 2385.26953125, "eval_debug/sppo_chosen_reward_in_loss": 1.9895236492156982, "eval_debug/sppo_rej_reward_in_loss": 0.6145469546318054, "eval_debug/sppo_reject_loss": 2590.797607421875, "eval_logits/chosen": 1.682997703552246, "eval_logits/rejected": 1.7421146631240845, "eval_logps/chosen": -121.15852355957031, "eval_logps/rejected": -63.27250671386719, "eval_loss": 4987.791015625, "eval_rewards/accuracies": 0.2631579041481018, "eval_rewards/chosen": 0.019895238801836967, "eval_rewards/margins": 0.013749766163527966, "eval_rewards/rejected": 0.006145468447357416, "eval_runtime": 28.7047, "eval_samples_per_second": 20.903, "eval_steps_per_second": 0.662, "step": 300 }, { "debug/policy_chosen_logits": 1.7497295141220093, "debug/policy_chosen_logps": -8.143346786499023, "debug/policy_rejected_logits": 1.957035779953003, "debug/policy_rejected_logps": -63.610557556152344, "debug/reference_chosen_logps": -8.141159057617188, "debug/reference_rejected_logps": -64.1399154663086, "debug/sppo_chosen_loss": 2500.2724609375, "debug/sppo_chosen_reward_in_loss": -0.0021881223656237125, "debug/sppo_rej_reward_in_loss": 0.5293562412261963, "debug/sppo_reject_loss": 2565.92529296875, "epoch": 1.105072463768116, "grad_norm": 34133.624255543014, "learning_rate": 9.768460251953622e-08, "logits/chosen": 1.7497295141220093, "logits/rejected": 1.957035779953003, "logps/chosen": -8.143346786499023, "logps/rejected": -63.610557556152344, "loss": 4998.8664, "rewards/accuracies": 0.25, "rewards/chosen": -2.188121470680926e-05, "rewards/margins": -0.005315443500876427, "rewards/rejected": 0.005293562542647123, "step": 305 }, { "debug/policy_chosen_logits": 1.4633691310882568, "debug/policy_chosen_logps": -44.50762939453125, "debug/policy_rejected_logits": 1.713273286819458, "debug/policy_rejected_logps": -67.6939926147461, "debug/reference_chosen_logps": -44.858192443847656, "debug/reference_rejected_logps": -68.05248260498047, "debug/sppo_chosen_loss": 2469.2236328125, "debug/sppo_chosen_reward_in_loss": 0.3505615293979645, "debug/sppo_rej_reward_in_loss": 0.3584887385368347, "debug/sppo_reject_loss": 2541.96533203125, "epoch": 1.1231884057971016, "grad_norm": 14676.387899828507, "learning_rate": 9.757121336998056e-08, "logits/chosen": 1.4633691310882568, "logits/rejected": 1.713273286819458, "logps/chosen": -44.50762939453125, "logps/rejected": -67.6939926147461, "loss": 4993.5055, "rewards/accuracies": 0.30000001192092896, "rewards/chosen": 0.0035056150518357754, "rewards/margins": -7.927203841973096e-05, "rewards/rejected": 0.003584887133911252, "step": 310 }, { "debug/policy_chosen_logits": 1.0026930570602417, "debug/policy_chosen_logps": -143.84512329101562, "debug/policy_rejected_logits": 1.394247055053711, "debug/policy_rejected_logps": -44.47309112548828, "debug/reference_chosen_logps": -142.84730529785156, "debug/reference_rejected_logps": -44.07756805419922, "debug/sppo_chosen_loss": 2685.460205078125, "debug/sppo_chosen_reward_in_loss": -0.9978082776069641, "debug/sppo_rej_reward_in_loss": -0.3955157399177551, "debug/sppo_reject_loss": 2462.848876953125, "epoch": 1.141304347826087, "grad_norm": 110184.15262431938, "learning_rate": 9.745518278036364e-08, "logits/chosen": 1.0026930570602417, "logits/rejected": 1.394247055053711, "logps/chosen": -143.84512329101562, "logps/rejected": -44.47309112548828, "loss": 5180.4508, "rewards/accuracies": 0.2750000059604645, "rewards/chosen": -0.009978082962334156, "rewards/margins": -0.006022926419973373, "rewards/rejected": -0.003955157473683357, "step": 315 }, { "debug/policy_chosen_logits": 1.370428442955017, "debug/policy_chosen_logps": -8.47814655303955, "debug/policy_rejected_logits": 1.5973377227783203, "debug/policy_rejected_logps": -5.688195705413818, "debug/reference_chosen_logps": -7.988713264465332, "debug/reference_rejected_logps": -5.633814811706543, "debug/sppo_chosen_loss": 2552.051513671875, "debug/sppo_chosen_reward_in_loss": -0.4894336760044098, "debug/sppo_rej_reward_in_loss": -0.054381467401981354, "debug/sppo_reject_loss": 2494.6552734375, "epoch": 1.1594202898550725, "grad_norm": 107459.50178194568, "learning_rate": 9.733651719340206e-08, "logits/chosen": 1.370428442955017, "logits/rejected": 1.5973377227783203, "logps/chosen": -8.47814655303955, "logps/rejected": -5.688195705413818, "loss": 5681.6391, "rewards/accuracies": 0.20000000298023224, "rewards/chosen": -0.004894336219877005, "rewards/margins": -0.0043505216017365456, "rewards/rejected": -0.0005438146181404591, "step": 320 }, { "debug/policy_chosen_logits": 1.0244429111480713, "debug/policy_chosen_logps": -8.453259468078613, "debug/policy_rejected_logits": 1.3034971952438354, "debug/policy_rejected_logps": -9.284985542297363, "debug/reference_chosen_logps": -8.260573387145996, "debug/reference_rejected_logps": -9.093328475952148, "debug/sppo_chosen_loss": 2519.63818359375, "debug/sppo_chosen_reward_in_loss": -0.1926857978105545, "debug/sppo_rej_reward_in_loss": -0.19165608286857605, "debug/sppo_reject_loss": 2481.523193359375, "epoch": 1.177536231884058, "grad_norm": 153235.36999245975, "learning_rate": 9.721522319812339e-08, "logits/chosen": 1.0244429111480713, "logits/rejected": 1.3034971952438354, "logps/chosen": -8.453259468078613, "logps/rejected": -9.284985542297363, "loss": 5010.5242, "rewards/accuracies": 0.25, "rewards/chosen": -0.001926857978105545, "rewards/margins": -1.0297144399373792e-05, "rewards/rejected": -0.0019165606936439872, "step": 325 }, { "debug/policy_chosen_logits": 0.9440256953239441, "debug/policy_chosen_logps": -46.329376220703125, "debug/policy_rejected_logits": 1.418959140777588, "debug/policy_rejected_logps": -3.674494504928589, "debug/reference_chosen_logps": -46.528934478759766, "debug/reference_rejected_logps": -3.632153034210205, "debug/sppo_chosen_loss": 2483.08837890625, "debug/sppo_chosen_reward_in_loss": 0.19955678284168243, "debug/sppo_rej_reward_in_loss": -0.04234158992767334, "debug/sppo_reject_loss": 2495.803955078125, "epoch": 1.1956521739130435, "grad_norm": 255547.45419809004, "learning_rate": 9.709130752950023e-08, "logits/chosen": 0.9440256953239441, "logits/rejected": 1.418959140777588, "logps/chosen": -46.329376220703125, "logps/rejected": -3.674494504928589, "loss": 5056.682, "rewards/accuracies": 0.32499998807907104, "rewards/chosen": 0.001995567698031664, "rewards/margins": 0.002418983494862914, "rewards/rejected": -0.00042341588414274156, "step": 330 }, { "debug/policy_chosen_logits": 1.3271191120147705, "debug/policy_chosen_logps": -61.30743408203125, "debug/policy_rejected_logits": 1.510683298110962, "debug/policy_rejected_logps": -6.224665641784668, "debug/reference_chosen_logps": -62.81486129760742, "debug/reference_rejected_logps": -6.199158668518066, "debug/sppo_chosen_loss": 2445.33935546875, "debug/sppo_chosen_reward_in_loss": 1.5074307918548584, "debug/sppo_rej_reward_in_loss": -0.025507647544145584, "debug/sppo_reject_loss": 2497.513427734375, "epoch": 1.213768115942029, "grad_norm": 53618.54126158761, "learning_rate": 9.696477706807624e-08, "logits/chosen": 1.3271191120147705, "logits/rejected": 1.510683298110962, "logps/chosen": -61.30743408203125, "logps/rejected": -6.224665641784668, "loss": 4968.4492, "rewards/accuracies": 0.22499999403953552, "rewards/chosen": 0.015074307098984718, "rewards/margins": 0.015329385176301003, "rewards/rejected": -0.0002550765057094395, "step": 335 }, { "debug/policy_chosen_logits": 1.691650629043579, "debug/policy_chosen_logps": -118.9373550415039, "debug/policy_rejected_logits": 1.850358247756958, "debug/policy_rejected_logps": -6.650857448577881, "debug/reference_chosen_logps": -120.7728271484375, "debug/reference_rejected_logps": -6.6778130531311035, "debug/sppo_chosen_loss": 2385.220703125, "debug/sppo_chosen_reward_in_loss": 1.8354644775390625, "debug/sppo_rej_reward_in_loss": 0.02695630118250847, "debug/sppo_reject_loss": 2502.734375, "epoch": 1.2318840579710144, "grad_norm": 130333.1547743022, "learning_rate": 9.683563883958413e-08, "logits/chosen": 1.691650629043579, "logits/rejected": 1.850358247756958, "logps/chosen": -118.9373550415039, "logps/rejected": -6.650857448577881, "loss": 4940.8477, "rewards/accuracies": 0.2750000059604645, "rewards/chosen": 0.018354643136262894, "rewards/margins": 0.01808508113026619, "rewards/rejected": 0.00026956299552693963, "step": 340 }, { "debug/policy_chosen_logits": 1.3374946117401123, "debug/policy_chosen_logps": -127.35295104980469, "debug/policy_rejected_logits": 1.5628750324249268, "debug/policy_rejected_logps": -9.430685043334961, "debug/reference_chosen_logps": -128.16708374023438, "debug/reference_rejected_logps": -9.438863754272461, "debug/sppo_chosen_loss": 2430.123779296875, "debug/sppo_chosen_reward_in_loss": 0.8141282796859741, "debug/sppo_rej_reward_in_loss": 0.008177550509572029, "debug/sppo_reject_loss": 2500.85595703125, "epoch": 1.25, "grad_norm": 160434.02122114887, "learning_rate": 9.670390001455554e-08, "logits/chosen": 1.3374946117401123, "logits/rejected": 1.5628750324249268, "logps/chosen": -127.35295104980469, "logps/rejected": -9.430685043334961, "loss": 5021.2414, "rewards/accuracies": 0.3499999940395355, "rewards/chosen": 0.008141283877193928, "rewards/margins": 0.008059507235884666, "rewards/rejected": 8.17754989839159e-05, "step": 345 }, { "debug/policy_chosen_logits": 1.122680425643921, "debug/policy_chosen_logps": -16.481828689575195, "debug/policy_rejected_logits": 1.4836502075195312, "debug/policy_rejected_logps": -145.7734375, "debug/reference_chosen_logps": -16.5395565032959, "debug/reference_rejected_logps": -146.82769775390625, "debug/sppo_chosen_loss": 2494.484375, "debug/sppo_chosen_reward_in_loss": 0.05773244425654411, "debug/sppo_rej_reward_in_loss": 1.0542521476745605, "debug/sppo_reject_loss": 2639.715576171875, "epoch": 1.2681159420289856, "grad_norm": 23559.40185231292, "learning_rate": 9.656956790792285e-08, "logits/chosen": 1.122680425643921, "logits/rejected": 1.4836502075195312, "logps/chosen": -16.481828689575195, "logps/rejected": -145.7734375, "loss": 5000.1555, "rewards/accuracies": 0.25, "rewards/chosen": 0.0005773244774900377, "rewards/margins": -0.009965196251869202, "rewards/rejected": 0.010542521253228188, "step": 350 }, { "debug/policy_chosen_logits": 1.597556471824646, "debug/policy_chosen_logps": -65.97279357910156, "debug/policy_rejected_logits": 1.7873481512069702, "debug/policy_rejected_logps": -63.13127899169922, "debug/reference_chosen_logps": -66.51875305175781, "debug/reference_rejected_logps": -63.463356018066406, "debug/sppo_chosen_loss": 2455.319580078125, "debug/sppo_chosen_reward_in_loss": 0.5459603071212769, "debug/sppo_rej_reward_in_loss": 0.332086980342865, "debug/sppo_reject_loss": 2535.92041015625, "epoch": 1.286231884057971, "grad_norm": 72589.03219809674, "learning_rate": 9.643264997861311e-08, "logits/chosen": 1.597556471824646, "logits/rejected": 1.7873481512069702, "logps/chosen": -65.97279357910156, "logps/rejected": -63.13127899169922, "loss": 5002.4812, "rewards/accuracies": 0.32499998807907104, "rewards/chosen": 0.005459602922201157, "rewards/margins": 0.002138732932507992, "rewards/rejected": 0.003320869756862521, "step": 355 }, { "debug/policy_chosen_logits": 0.953771710395813, "debug/policy_chosen_logps": -46.30854415893555, "debug/policy_rejected_logits": 1.2139240503311157, "debug/policy_rejected_logps": -14.151881217956543, "debug/reference_chosen_logps": -46.70463180541992, "debug/reference_rejected_logps": -14.1810302734375, "debug/sppo_chosen_loss": 2466.861328125, "debug/sppo_chosen_reward_in_loss": 0.3960918188095093, "debug/sppo_rej_reward_in_loss": 0.029148459434509277, "debug/sppo_reject_loss": 2503.0625, "epoch": 1.3043478260869565, "grad_norm": 18955.75102371124, "learning_rate": 9.62931538291337e-08, "logits/chosen": 0.953771710395813, "logits/rejected": 1.2139240503311157, "logps/chosen": -46.30854415893555, "logps/rejected": -14.151881217956543, "loss": 5020.1805, "rewards/accuracies": 0.3499999940395355, "rewards/chosen": 0.003960918635129929, "rewards/margins": 0.0036694337613880634, "rewards/rejected": 0.0002914845827035606, "step": 360 }, { "debug/policy_chosen_logits": 1.412501335144043, "debug/policy_chosen_logps": -84.13123321533203, "debug/policy_rejected_logits": 1.9537162780761719, "debug/policy_rejected_logps": -3.2911763191223145, "debug/reference_chosen_logps": -84.7721939086914, "debug/reference_rejected_logps": -3.231293201446533, "debug/sppo_chosen_loss": 2446.66064453125, "debug/sppo_chosen_reward_in_loss": 0.6409674286842346, "debug/sppo_rej_reward_in_loss": -0.05988314747810364, "debug/sppo_reject_loss": 2494.05029296875, "epoch": 1.322463768115942, "grad_norm": 156549.79949802379, "learning_rate": 9.615108720515041e-08, "logits/chosen": 1.412501335144043, "logits/rejected": 1.9537162780761719, "logps/chosen": -84.13123321533203, "logps/rejected": -3.2911763191223145, "loss": 5023.6352, "rewards/accuracies": 0.25, "rewards/chosen": 0.006409673951566219, "rewards/margins": 0.007008505053818226, "rewards/rejected": -0.000598831451497972, "step": 365 }, { "debug/policy_chosen_logits": 1.4894671440124512, "debug/policy_chosen_logps": -87.32758331298828, "debug/policy_rejected_logits": 1.7298352718353271, "debug/policy_rejected_logps": -8.41947078704834, "debug/reference_chosen_logps": -87.77662658691406, "debug/reference_rejected_logps": -8.368696212768555, "debug/sppo_chosen_loss": 2461.70458984375, "debug/sppo_chosen_reward_in_loss": 0.44905930757522583, "debug/sppo_rej_reward_in_loss": -0.05077431723475456, "debug/sppo_reject_loss": 2494.939453125, "epoch": 1.3405797101449275, "grad_norm": 14733.4489628871, "learning_rate": 9.600645799505717e-08, "logits/chosen": 1.4894671440124512, "logits/rejected": 1.7298352718353271, "logps/chosen": -87.32758331298828, "logps/rejected": -8.41947078704834, "loss": 5020.6863, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": 0.004490592982620001, "rewards/margins": 0.0049983360804617405, "rewards/rejected": -0.0005077432142570615, "step": 370 }, { "debug/policy_chosen_logits": 1.288341760635376, "debug/policy_chosen_logps": -5.893795013427734, "debug/policy_rejected_logits": 1.5736020803451538, "debug/policy_rejected_logps": -6.31109619140625, "debug/reference_chosen_logps": -5.9267425537109375, "debug/reference_rejected_logps": -6.289887428283691, "debug/sppo_chosen_loss": 2496.726318359375, "debug/sppo_chosen_reward_in_loss": 0.0329475998878479, "debug/sppo_rej_reward_in_loss": -0.021208759397268295, "debug/sppo_reject_loss": 2497.89892578125, "epoch": 1.358695652173913, "grad_norm": 124574.25953796296, "learning_rate": 9.585927422953815e-08, "logits/chosen": 1.288341760635376, "logits/rejected": 1.5736020803451538, "logps/chosen": -5.893795013427734, "logps/rejected": -6.31109619140625, "loss": 5010.1195, "rewards/accuracies": 0.2750000059604645, "rewards/chosen": 0.00032947599538601935, "rewards/margins": 0.0005415636114776134, "rewards/rejected": -0.00021208760153967887, "step": 375 }, { "debug/policy_chosen_logits": 1.269359827041626, "debug/policy_chosen_logps": -38.63573455810547, "debug/policy_rejected_logits": 1.6088886260986328, "debug/policy_rejected_logps": -10.015954971313477, "debug/reference_chosen_logps": -39.23326110839844, "debug/reference_rejected_logps": -10.048616409301758, "debug/sppo_chosen_loss": 2452.991455078125, "debug/sppo_chosen_reward_in_loss": 0.5975297093391418, "debug/sppo_rej_reward_in_loss": 0.03266085311770439, "debug/sppo_reject_loss": 2503.306640625, "epoch": 1.3768115942028984, "grad_norm": 37698.3097811622, "learning_rate": 9.570954408112178e-08, "logits/chosen": 1.269359827041626, "logits/rejected": 1.6088886260986328, "logps/chosen": -38.63573455810547, "logps/rejected": -10.015954971313477, "loss": 4974.3117, "rewards/accuracies": 0.25, "rewards/chosen": 0.005975296720862389, "rewards/margins": 0.005648687947541475, "rewards/rejected": 0.00032660854049026966, "step": 380 }, { "debug/policy_chosen_logits": 1.6295541524887085, "debug/policy_chosen_logps": -70.42657470703125, "debug/policy_rejected_logits": 1.7486521005630493, "debug/policy_rejected_logps": -6.265525817871094, "debug/reference_chosen_logps": -71.53660583496094, "debug/reference_rejected_logps": -6.130346775054932, "debug/sppo_chosen_loss": 2444.93408203125, "debug/sppo_chosen_reward_in_loss": 1.1100376844406128, "debug/sppo_rej_reward_in_loss": -0.13517935574054718, "debug/sppo_reject_loss": 2486.710693359375, "epoch": 1.394927536231884, "grad_norm": 76736.39646895205, "learning_rate": 9.555727586372702e-08, "logits/chosen": 1.6295541524887085, "logits/rejected": 1.7486521005630493, "logps/chosen": -70.42657470703125, "logps/rejected": -6.265525817871094, "loss": 4958.0344, "rewards/accuracies": 0.375, "rewards/chosen": 0.011100376956164837, "rewards/margins": 0.012452170252799988, "rewards/rejected": -0.0013517935294657946, "step": 385 }, { "debug/policy_chosen_logits": 1.7042818069458008, "debug/policy_chosen_logps": -299.77423095703125, "debug/policy_rejected_logits": 1.5136713981628418, "debug/policy_rejected_logps": -161.97178649902344, "debug/reference_chosen_logps": -303.56982421875, "debug/reference_rejected_logps": -163.62176513671875, "debug/sppo_chosen_loss": 2350.44677734375, "debug/sppo_chosen_reward_in_loss": 3.7955734729766846, "debug/sppo_rej_reward_in_loss": 1.6499783992767334, "debug/sppo_reject_loss": 2721.290771484375, "epoch": 1.4130434782608696, "grad_norm": 84471.48718695647, "learning_rate": 9.540247803220169e-08, "logits/chosen": 1.7042818069458008, "logits/rejected": 1.5136713981628418, "logps/chosen": -299.77423095703125, "logps/rejected": -161.97178649902344, "loss": 4963.8687, "rewards/accuracies": 0.3499999940395355, "rewards/chosen": 0.037955738604068756, "rewards/margins": 0.021455949172377586, "rewards/rejected": 0.016499783843755722, "step": 390 }, { "debug/policy_chosen_logits": 1.0722544193267822, "debug/policy_chosen_logps": -73.74263000488281, "debug/policy_rejected_logits": 1.3228495121002197, "debug/policy_rejected_logps": -6.77597188949585, "debug/reference_chosen_logps": -74.72174835205078, "debug/reference_rejected_logps": -6.651388645172119, "debug/sppo_chosen_loss": 2438.303955078125, "debug/sppo_chosen_reward_in_loss": 0.9791187047958374, "debug/sppo_rej_reward_in_loss": -0.12458336353302002, "debug/sppo_reject_loss": 2487.65234375, "epoch": 1.431159420289855, "grad_norm": 115791.39472397226, "learning_rate": 9.524515918185301e-08, "logits/chosen": 1.0722544193267822, "logits/rejected": 1.3228495121002197, "logps/chosen": -73.74263000488281, "logps/rejected": -6.77597188949585, "loss": 4981.4145, "rewards/accuracies": 0.375, "rewards/chosen": 0.009791186079382896, "rewards/margins": 0.01103702001273632, "rewards/rejected": -0.0012458335841074586, "step": 395 }, { "debug/policy_chosen_logits": 1.3299223184585571, "debug/policy_chosen_logps": -4.213925361633301, "debug/policy_rejected_logits": 1.4513185024261475, "debug/policy_rejected_logps": -3.2649288177490234, "debug/reference_chosen_logps": -4.213369846343994, "debug/reference_rejected_logps": -3.2468135356903076, "debug/sppo_chosen_loss": 2500.088623046875, "debug/sppo_chosen_reward_in_loss": -0.0005550414207391441, "debug/sppo_rej_reward_in_loss": -0.018115734681487083, "debug/sppo_reject_loss": 2498.21044921875, "epoch": 1.4492753623188406, "grad_norm": 174824.51357316782, "learning_rate": 9.508532804797034e-08, "logits/chosen": 1.3299223184585571, "logits/rejected": 1.4513185024261475, "logps/chosen": -4.213925361633301, "logps/rejected": -3.2649288177490234, "loss": 5014.9531, "rewards/accuracies": 0.2750000059604645, "rewards/chosen": -5.55041469851858e-06, "rewards/margins": 0.00017560695414431393, "rewards/rejected": -0.00018115734565071762, "step": 400 }, { "epoch": 1.4492753623188406, "eval_debug/policy_chosen_logits": 1.671087622642517, "eval_debug/policy_chosen_logps": -121.15187072753906, "eval_debug/policy_rejected_logits": 1.7307648658752441, "eval_debug/policy_rejected_logps": -63.41480255126953, "eval_debug/reference_chosen_logps": -123.14806365966797, "eval_debug/reference_rejected_logps": -63.887054443359375, "eval_debug/sppo_chosen_loss": 2383.670654296875, "eval_debug/sppo_chosen_reward_in_loss": 1.9961920976638794, "eval_debug/sppo_rej_reward_in_loss": 0.4722493886947632, "eval_debug/sppo_reject_loss": 2565.975341796875, "eval_logits/chosen": 1.671087622642517, "eval_logits/rejected": 1.7307648658752441, "eval_logps/chosen": -121.15187072753906, "eval_logps/rejected": -63.41480255126953, "eval_loss": 4983.84228515625, "eval_rewards/accuracies": 0.2631579041481018, "eval_rewards/chosen": 0.019961921498179436, "eval_rewards/margins": 0.015239425003528595, "eval_rewards/rejected": 0.00472249323502183, "eval_runtime": 29.1395, "eval_samples_per_second": 20.591, "eval_steps_per_second": 0.652, "step": 400 }, { "debug/policy_chosen_logits": 1.4486382007598877, "debug/policy_chosen_logps": -46.97897720336914, "debug/policy_rejected_logits": 1.9581079483032227, "debug/policy_rejected_logps": -4.515638828277588, "debug/reference_chosen_logps": -47.18468475341797, "debug/reference_rejected_logps": -4.431929111480713, "debug/sppo_chosen_loss": 2481.34619140625, "debug/sppo_chosen_reward_in_loss": 0.20570294559001923, "debug/sppo_rej_reward_in_loss": -0.08370967209339142, "debug/sppo_reject_loss": 2491.882568359375, "epoch": 1.4673913043478262, "grad_norm": 51579.2008215819, "learning_rate": 9.49229935053401e-08, "logits/chosen": 1.4486382007598877, "logits/rejected": 1.9581079483032227, "logps/chosen": -46.97897720336914, "logps/rejected": -4.515638828277588, "loss": 5010.3117, "rewards/accuracies": 0.30000001192092896, "rewards/chosen": 0.002057029400020838, "rewards/margins": 0.0028941261116415262, "rewards/rejected": -0.0008370967698283494, "step": 405 }, { "debug/policy_chosen_logits": 1.4875986576080322, "debug/policy_chosen_logps": -69.13442993164062, "debug/policy_rejected_logits": 1.7215429544448853, "debug/policy_rejected_logps": -88.55685424804688, "debug/reference_chosen_logps": -69.37290954589844, "debug/reference_rejected_logps": -88.96483612060547, "debug/sppo_chosen_loss": 2478.44970703125, "debug/sppo_chosen_reward_in_loss": 0.23847372829914093, "debug/sppo_rej_reward_in_loss": 0.40798917412757874, "debug/sppo_reject_loss": 2544.380615234375, "epoch": 1.4855072463768115, "grad_norm": 49022.24817353761, "learning_rate": 9.475816456775311e-08, "logits/chosen": 1.4875986576080322, "logits/rejected": 1.7215429544448853, "logps/chosen": -69.13442993164062, "logps/rejected": -88.55685424804688, "loss": 4993.2547, "rewards/accuracies": 0.32499998807907104, "rewards/chosen": 0.002384737366810441, "rewards/margins": -0.0016951540019363165, "rewards/rejected": 0.00407989090308547, "step": 410 }, { "debug/policy_chosen_logits": 1.0342118740081787, "debug/policy_chosen_logps": -118.620849609375, "debug/policy_rejected_logits": 1.4359633922576904, "debug/policy_rejected_logps": -10.003622055053711, "debug/reference_chosen_logps": -119.50848388671875, "debug/reference_rejected_logps": -10.004406929016113, "debug/sppo_chosen_loss": 2436.148681640625, "debug/sppo_chosen_reward_in_loss": 0.887627899646759, "debug/sppo_rej_reward_in_loss": 0.000785432755947113, "debug/sppo_reject_loss": 2500.10498046875, "epoch": 1.5036231884057971, "grad_norm": 42617.747759851016, "learning_rate": 9.459085038750394e-08, "logits/chosen": 1.0342118740081787, "logits/rejected": 1.4359633922576904, "logps/chosen": -118.620849609375, "logps/rejected": -10.003622055053711, "loss": 4961.6234, "rewards/accuracies": 0.3499999940395355, "rewards/chosen": 0.008876277133822441, "rewards/margins": 0.008868424221873283, "rewards/rejected": 7.854320756450761e-06, "step": 415 }, { "debug/policy_chosen_logits": 1.6918962001800537, "debug/policy_chosen_logps": -130.5413360595703, "debug/policy_rejected_logits": 2.267665386199951, "debug/policy_rejected_logps": -4.041918754577637, "debug/reference_chosen_logps": -132.0562286376953, "debug/reference_rejected_logps": -4.032851219177246, "debug/sppo_chosen_loss": 2404.010986328125, "debug/sppo_chosen_reward_in_loss": 1.5148848295211792, "debug/sppo_rej_reward_in_loss": -0.009067353792488575, "debug/sppo_reject_loss": 2499.10595703125, "epoch": 1.5217391304347827, "grad_norm": 175047.2476467055, "learning_rate": 9.442106025488283e-08, "logits/chosen": 1.6918962001800537, "logits/rejected": 2.267665386199951, "logps/chosen": -130.5413360595703, "logps/rejected": -4.041918754577637, "loss": 5008.1953, "rewards/accuracies": 0.30000001192092896, "rewards/chosen": 0.015148850157856941, "rewards/margins": 0.015239521861076355, "rewards/rejected": -9.06735222088173e-05, "step": 420 }, { "debug/policy_chosen_logits": 1.5057414770126343, "debug/policy_chosen_logps": -10.860461235046387, "debug/policy_rejected_logits": 2.052546262741089, "debug/policy_rejected_logps": -7.867926597595215, "debug/reference_chosen_logps": -10.919036865234375, "debug/reference_rejected_logps": -7.864465236663818, "debug/sppo_chosen_loss": 2494.17578125, "debug/sppo_chosen_reward_in_loss": 0.05857586860656738, "debug/sppo_rej_reward_in_loss": -0.003462588880211115, "debug/sppo_reject_loss": 2499.682861328125, "epoch": 1.539855072463768, "grad_norm": 11173.206523699306, "learning_rate": 9.424880359765976e-08, "logits/chosen": 1.5057414770126343, "logits/rejected": 2.052546262741089, "logps/chosen": -10.860461235046387, "logps/rejected": -7.867926597595215, "loss": 5013.4879, "rewards/accuracies": 0.375, "rewards/chosen": 0.0005857586511410773, "rewards/margins": 0.0006203845841810107, "rewards/rejected": -3.462588574620895e-05, "step": 425 }, { "debug/policy_chosen_logits": 1.0262612104415894, "debug/policy_chosen_logps": -8.376487731933594, "debug/policy_rejected_logits": 1.3994934558868408, "debug/policy_rejected_logps": -10.841070175170898, "debug/reference_chosen_logps": -8.375343322753906, "debug/reference_rejected_logps": -10.8840970993042, "debug/sppo_chosen_loss": 2500.147216796875, "debug/sppo_chosen_reward_in_loss": -0.0011449723970144987, "debug/sppo_rej_reward_in_loss": 0.043027814477682114, "debug/sppo_reject_loss": 2504.35302734375, "epoch": 1.5579710144927537, "grad_norm": 152968.24031416993, "learning_rate": 9.407408998056104e-08, "logits/chosen": 1.0262612104415894, "logits/rejected": 1.3994934558868408, "logps/chosen": -8.376487731933594, "logps/rejected": -10.841070175170898, "loss": 5026.2105, "rewards/accuracies": 0.30000001192092896, "rewards/chosen": -1.1449685189290904e-05, "rewards/margins": -0.00044172778143547475, "rewards/rejected": 0.00043027810170315206, "step": 430 }, { "debug/policy_chosen_logits": 1.455554485321045, "debug/policy_chosen_logps": -6.7334418296813965, "debug/policy_rejected_logits": 1.6406171321868896, "debug/policy_rejected_logps": -13.025958061218262, "debug/reference_chosen_logps": -6.73760461807251, "debug/reference_rejected_logps": -13.031814575195312, "debug/sppo_chosen_loss": 2499.63037109375, "debug/sppo_chosen_reward_in_loss": 0.004162711091339588, "debug/sppo_rej_reward_in_loss": 0.0058563947677612305, "debug/sppo_reject_loss": 2500.63037109375, "epoch": 1.5760869565217392, "grad_norm": 109688.91026494751, "learning_rate": 9.389692910473814e-08, "logits/chosen": 1.455554485321045, "logits/rejected": 1.6406171321868896, "logps/chosen": -6.7334418296813965, "logps/rejected": -13.025958061218262, "loss": 5026.4758, "rewards/accuracies": 0.25, "rewards/chosen": 4.162711411481723e-05, "rewards/margins": -1.693684680503793e-05, "rewards/rejected": 5.8563931816024706e-05, "step": 435 }, { "debug/policy_chosen_logits": 1.4137792587280273, "debug/policy_chosen_logps": -6.53154993057251, "debug/policy_rejected_logits": 1.8845860958099365, "debug/policy_rejected_logps": -5.826476097106934, "debug/reference_chosen_logps": -6.581674098968506, "debug/reference_rejected_logps": -5.773827075958252, "debug/sppo_chosen_loss": 2495.0634765625, "debug/sppo_chosen_reward_in_loss": 0.05012422055006027, "debug/sppo_rej_reward_in_loss": -0.052649516612291336, "debug/sppo_reject_loss": 2494.764404296875, "epoch": 1.5942028985507246, "grad_norm": 41741.86511984669, "learning_rate": 9.37173308072291e-08, "logits/chosen": 1.4137792587280273, "logits/rejected": 1.8845860958099365, "logps/chosen": -6.53154993057251, "logps/rejected": -5.826476097106934, "loss": 4984.9031, "rewards/accuracies": 0.375, "rewards/chosen": 0.0005012422916479409, "rewards/margins": 0.0010277373949065804, "rewards/rejected": -0.0005264951614663005, "step": 440 }, { "debug/policy_chosen_logits": 1.448659062385559, "debug/policy_chosen_logps": -87.86599731445312, "debug/policy_rejected_logits": 1.6381359100341797, "debug/policy_rejected_logps": -94.04246520996094, "debug/reference_chosen_logps": -88.7278823852539, "debug/reference_rejected_logps": -94.71257019042969, "debug/sppo_chosen_loss": 2440.04052734375, "debug/sppo_chosen_reward_in_loss": 0.8618772625923157, "debug/sppo_rej_reward_in_loss": 0.6700990796089172, "debug/sppo_reject_loss": 2584.06298828125, "epoch": 1.6123188405797102, "grad_norm": 51927.9449696722, "learning_rate": 9.353530506041226e-08, "logits/chosen": 1.448659062385559, "logits/rejected": 1.6381359100341797, "logps/chosen": -87.86599731445312, "logps/rejected": -94.04246520996094, "loss": 5011.218, "rewards/accuracies": 0.3499999940395355, "rewards/chosen": 0.008618771098554134, "rewards/margins": 0.0019177815411239862, "rewards/rejected": 0.006700989790260792, "step": 445 }, { "debug/policy_chosen_logits": 1.3283036947250366, "debug/policy_chosen_logps": -7.102647304534912, "debug/policy_rejected_logits": 1.4235103130340576, "debug/policy_rejected_logps": -8.258275985717773, "debug/reference_chosen_logps": -7.1367597579956055, "debug/reference_rejected_logps": -8.24354362487793, "debug/sppo_chosen_loss": 2496.60693359375, "debug/sppo_chosen_reward_in_loss": 0.03411325812339783, "debug/sppo_rej_reward_in_loss": -0.01473400928080082, "debug/sppo_reject_loss": 2498.552734375, "epoch": 1.6304347826086958, "grad_norm": 74922.26573009959, "learning_rate": 9.335086197145254e-08, "logits/chosen": 1.3283036947250366, "logits/rejected": 1.4235103130340576, "logps/chosen": -7.102647304534912, "logps/rejected": -8.258275985717773, "loss": 4969.9434, "rewards/accuracies": 0.375, "rewards/chosen": 0.00034113257424905896, "rewards/margins": 0.0004884726367890835, "rewards/rejected": -0.00014734007709193975, "step": 450 }, { "debug/policy_chosen_logits": 1.2878503799438477, "debug/policy_chosen_logps": -9.6986722946167, "debug/policy_rejected_logits": 1.5618960857391357, "debug/policy_rejected_logps": -2.9802498817443848, "debug/reference_chosen_logps": -9.742944717407227, "debug/reference_rejected_logps": -2.9481358528137207, "debug/sppo_chosen_loss": 2495.665283203125, "debug/sppo_chosen_reward_in_loss": 0.04427263140678406, "debug/sppo_rej_reward_in_loss": -0.03211439028382301, "debug/sppo_reject_loss": 2496.798095703125, "epoch": 1.6485507246376812, "grad_norm": 23359.63900194764, "learning_rate": 9.31640117817403e-08, "logits/chosen": 1.2878503799438477, "logits/rejected": 1.5618960857391357, "logps/chosen": -9.6986722946167, "logps/rejected": -2.9802498817443848, "loss": 5031.1742, "rewards/accuracies": 0.3499999940395355, "rewards/chosen": 0.00044272636296227574, "rewards/margins": 0.0007638701936230063, "rewards/rejected": -0.00032114385976456106, "step": 455 }, { "debug/policy_chosen_logits": 1.4598870277404785, "debug/policy_chosen_logps": -7.750649452209473, "debug/policy_rejected_logits": 1.663448691368103, "debug/policy_rejected_logps": -5.442452907562256, "debug/reference_chosen_logps": -7.756170749664307, "debug/reference_rejected_logps": -5.4446258544921875, "debug/sppo_chosen_loss": 2499.549560546875, "debug/sppo_chosen_reward_in_loss": 0.005521661136299372, "debug/sppo_rej_reward_in_loss": 0.002173125743865967, "debug/sppo_reject_loss": 2500.23681640625, "epoch": 1.6666666666666665, "grad_norm": 58467.618956239334, "learning_rate": 9.297476486632254e-08, "logits/chosen": 1.4598870277404785, "logits/rejected": 1.663448691368103, "logps/chosen": -7.750649452209473, "logps/rejected": -5.442452907562256, "loss": 4997.1562, "rewards/accuracies": 0.2750000059604645, "rewards/chosen": 5.5216623877640814e-05, "rewards/margins": 3.3485335734440014e-05, "rewards/rejected": 2.1731253582402132e-05, "step": 460 }, { "debug/policy_chosen_logits": 1.4006640911102295, "debug/policy_chosen_logps": -7.651175022125244, "debug/policy_rejected_logits": 1.677852988243103, "debug/policy_rejected_logps": -3.8987457752227783, "debug/reference_chosen_logps": -7.709532737731934, "debug/reference_rejected_logps": -3.8724441528320312, "debug/sppo_chosen_loss": 2494.216552734375, "debug/sppo_chosen_reward_in_loss": 0.05835745483636856, "debug/sppo_rej_reward_in_loss": -0.0263015516102314, "debug/sppo_reject_loss": 2497.393798828125, "epoch": 1.6847826086956523, "grad_norm": 46533.44005719886, "learning_rate": 9.278313173332697e-08, "logits/chosen": 1.4006640911102295, "logits/rejected": 1.677852988243103, "logps/chosen": -7.651175022125244, "logps/rejected": -3.8987457752227783, "loss": 4983.8816, "rewards/accuracies": 0.2750000059604645, "rewards/chosen": 0.0005835745250806212, "rewards/margins": 0.0008465900318697095, "rewards/rejected": -0.00026301550678908825, "step": 465 }, { "debug/policy_chosen_logits": 1.3789927959442139, "debug/policy_chosen_logps": -103.65354919433594, "debug/policy_rejected_logits": 1.97138249874115, "debug/policy_rejected_logps": -9.05884075164795, "debug/reference_chosen_logps": -104.1981201171875, "debug/reference_rejected_logps": -9.050098419189453, "debug/sppo_chosen_loss": 2455.534423828125, "debug/sppo_chosen_reward_in_loss": 0.5445730090141296, "debug/sppo_rej_reward_in_loss": -0.008742582984268665, "debug/sppo_reject_loss": 2499.20556640625, "epoch": 1.7028985507246377, "grad_norm": 213712.8539683823, "learning_rate": 9.25891230233784e-08, "logits/chosen": 1.3789927959442139, "logits/rejected": 1.97138249874115, "logps/chosen": -103.65354919433594, "logps/rejected": -9.05884075164795, "loss": 4952.0137, "rewards/accuracies": 0.32499998807907104, "rewards/chosen": 0.005445730872452259, "rewards/margins": 0.005533156450837851, "rewards/rejected": -8.74258839758113e-05, "step": 470 }, { "debug/policy_chosen_logits": 1.4862921237945557, "debug/policy_chosen_logps": -7.328289985656738, "debug/policy_rejected_logits": 1.6887321472167969, "debug/policy_rejected_logps": -5.346103668212891, "debug/reference_chosen_logps": -7.227102756500244, "debug/reference_rejected_logps": -5.338844299316406, "debug/sppo_chosen_loss": 2510.253662109375, "debug/sppo_chosen_reward_in_loss": -0.10118672996759415, "debug/sppo_rej_reward_in_loss": -0.007259574718773365, "debug/sppo_reject_loss": 2499.27978515625, "epoch": 1.721014492753623, "grad_norm": 17916.65671053826, "learning_rate": 9.239274950900804e-08, "logits/chosen": 1.4862921237945557, "logits/rejected": 1.6887321472167969, "logps/chosen": -7.328289985656738, "logps/rejected": -5.346103668212891, "loss": 4989.702, "rewards/accuracies": 0.15000000596046448, "rewards/chosen": -0.0010118673089891672, "rewards/margins": -0.0009392714127898216, "rewards/rejected": -7.259573612827808e-05, "step": 475 }, { "debug/policy_chosen_logits": 1.2880994081497192, "debug/policy_chosen_logps": -82.17488098144531, "debug/policy_rejected_logits": 1.463820457458496, "debug/policy_rejected_logps": -110.88824462890625, "debug/reference_chosen_logps": -80.98526000976562, "debug/reference_rejected_logps": -109.05181884765625, "debug/sppo_chosen_loss": 2648.33984375, "debug/sppo_chosen_reward_in_loss": -1.1896164417266846, "debug/sppo_rej_reward_in_loss": -1.8364288806915283, "debug/sppo_reject_loss": 2423.713134765625, "epoch": 1.7391304347826086, "grad_norm": 63303.196162746026, "learning_rate": 9.219402209405519e-08, "logits/chosen": 1.2880994081497192, "logits/rejected": 1.463820457458496, "logps/chosen": -82.17488098144531, "logps/rejected": -110.88824462890625, "loss": 5339.8383, "rewards/accuracies": 0.375, "rewards/chosen": -0.01189616322517395, "rewards/margins": 0.006468124687671661, "rewards/rejected": -0.01836428791284561, "step": 480 }, { "debug/policy_chosen_logits": 1.540368914604187, "debug/policy_chosen_logps": -41.15214157104492, "debug/policy_rejected_logits": 1.7952556610107422, "debug/policy_rejected_logps": -4.985678672790527, "debug/reference_chosen_logps": -40.81630325317383, "debug/reference_rejected_logps": -4.973757266998291, "debug/sppo_chosen_loss": 2535.805908203125, "debug/sppo_chosen_reward_in_loss": -0.3358452618122101, "debug/sppo_rej_reward_in_loss": -0.011920953169465065, "debug/sppo_reject_loss": 2498.871337890625, "epoch": 1.7572463768115942, "grad_norm": 42104.168054579364, "learning_rate": 9.19929518130619e-08, "logits/chosen": 1.540368914604187, "logits/rejected": 1.7952556610107422, "logps/chosen": -41.15214157104492, "logps/rejected": -4.985678672790527, "loss": 4989.6789, "rewards/accuracies": 0.25, "rewards/chosen": -0.003358452348038554, "rewards/margins": -0.003239243058487773, "rewards/rejected": -0.00011920950782950968, "step": 485 }, { "debug/policy_chosen_logits": 1.550143837928772, "debug/policy_chosen_logps": -6.626770973205566, "debug/policy_rejected_logits": 1.8395893573760986, "debug/policy_rejected_logps": -5.070040225982666, "debug/reference_chosen_logps": -6.592990875244141, "debug/reference_rejected_logps": -5.0225749015808105, "debug/sppo_chosen_loss": 2503.404296875, "debug/sppo_chosen_reward_in_loss": -0.033779919147491455, "debug/sppo_rej_reward_in_loss": -0.04746539518237114, "debug/sppo_reject_loss": 2495.298095703125, "epoch": 1.7753623188405796, "grad_norm": 15193.057330276955, "learning_rate": 9.178954983066031e-08, "logits/chosen": 1.550143837928772, "logits/rejected": 1.8395893573760986, "logps/chosen": -6.626770973205566, "logps/rejected": -5.070040225982666, "loss": 4983.8785, "rewards/accuracies": 0.30000001192092896, "rewards/chosen": -0.00033779916702769697, "rewards/margins": 0.0001368547382298857, "rewards/rejected": -0.00047465390525758266, "step": 490 }, { "debug/policy_chosen_logits": 1.2941255569458008, "debug/policy_chosen_logps": -22.95914649963379, "debug/policy_rejected_logits": 1.5828083753585815, "debug/policy_rejected_logps": -65.86346435546875, "debug/reference_chosen_logps": -23.14602279663086, "debug/reference_rejected_logps": -66.37723541259766, "debug/sppo_chosen_loss": 2482.569091796875, "debug/sppo_chosen_reward_in_loss": 0.18687725067138672, "debug/sppo_rej_reward_in_loss": 0.5137732625007629, "debug/sppo_reject_loss": 2561.97900390625, "epoch": 1.7934782608695652, "grad_norm": 88121.3617189524, "learning_rate": 9.15838274409526e-08, "logits/chosen": 1.2941255569458008, "logits/rejected": 1.5828083753585815, "logps/chosen": -22.95914649963379, "logps/rejected": -65.86346435546875, "loss": 5002.2465, "rewards/accuracies": 0.2750000059604645, "rewards/chosen": 0.0018687723204493523, "rewards/margins": -0.003268960863351822, "rewards/rejected": 0.005137733183801174, "step": 495 }, { "debug/policy_chosen_logits": 1.3350841999053955, "debug/policy_chosen_logps": -101.71826934814453, "debug/policy_rejected_logits": 1.585424780845642, "debug/policy_rejected_logps": -52.58026123046875, "debug/reference_chosen_logps": -102.49649810791016, "debug/reference_rejected_logps": -53.1512451171875, "debug/sppo_chosen_loss": 2436.214111328125, "debug/sppo_chosen_reward_in_loss": 0.778225839138031, "debug/sppo_rej_reward_in_loss": 0.5709857940673828, "debug/sppo_reject_loss": 2575.375732421875, "epoch": 1.8115942028985508, "grad_norm": 172017.29987280557, "learning_rate": 9.13757960668839e-08, "logits/chosen": 1.3350841999053955, "logits/rejected": 1.585424780845642, "logps/chosen": -101.71826934814453, "logps/rejected": -52.58026123046875, "loss": 5006.941, "rewards/accuracies": 0.30000001192092896, "rewards/chosen": 0.007782258093357086, "rewards/margins": 0.0020724008791148663, "rewards/rejected": 0.005709857679903507, "step": 500 }, { "epoch": 1.8115942028985508, "eval_debug/policy_chosen_logits": 1.6503223180770874, "eval_debug/policy_chosen_logps": -121.97325897216797, "eval_debug/policy_rejected_logits": 1.7112685441970825, "eval_debug/policy_rejected_logps": -63.9327507019043, "eval_debug/reference_chosen_logps": -123.14806365966797, "eval_debug/reference_rejected_logps": -63.887054443359375, "eval_debug/sppo_chosen_loss": 2416.376953125, "eval_debug/sppo_chosen_reward_in_loss": 1.1747816801071167, "eval_debug/sppo_rej_reward_in_loss": -0.04569563642144203, "eval_debug/sppo_reject_loss": 2495.625244140625, "eval_logits/chosen": 1.6503223180770874, "eval_logits/rejected": 1.7112685441970825, "eval_logps/chosen": -121.97325897216797, "eval_logps/rejected": -63.9327507019043, "eval_loss": 4965.4326171875, "eval_rewards/accuracies": 0.31578946113586426, "eval_rewards/chosen": 0.011747815646231174, "eval_rewards/margins": 0.012204772792756557, "eval_rewards/rejected": -0.00045695636072196066, "eval_runtime": 28.8045, "eval_samples_per_second": 20.83, "eval_steps_per_second": 0.66, "step": 500 }, { "debug/policy_chosen_logits": 1.3786325454711914, "debug/policy_chosen_logps": -7.279969692230225, "debug/policy_rejected_logits": 1.5108921527862549, "debug/policy_rejected_logps": -122.37618255615234, "debug/reference_chosen_logps": -7.294312953948975, "debug/reference_rejected_logps": -122.4188232421875, "debug/sppo_chosen_loss": 2498.588134765625, "debug/sppo_chosen_reward_in_loss": 0.014342248439788818, "debug/sppo_rej_reward_in_loss": 0.042636167258024216, "debug/sppo_reject_loss": 2504.616943359375, "epoch": 1.8297101449275361, "grad_norm": 233187.52109567318, "learning_rate": 9.11654672596081e-08, "logits/chosen": 1.3786325454711914, "logits/rejected": 1.5108921527862549, "logps/chosen": -7.279969692230225, "logps/rejected": -122.37618255615234, "loss": 4994.732, "rewards/accuracies": 0.30000001192092896, "rewards/chosen": 0.00014342248323373497, "rewards/margins": -0.00028293923242017627, "rewards/rejected": 0.00042636171565391123, "step": 505 }, { "debug/policy_chosen_logits": 1.5226022005081177, "debug/policy_chosen_logps": -101.56987762451172, "debug/policy_rejected_logits": 1.7057052850723267, "debug/policy_rejected_logps": -11.258237838745117, "debug/reference_chosen_logps": -102.32771301269531, "debug/reference_rejected_logps": -11.211788177490234, "debug/sppo_chosen_loss": 2450.17333984375, "debug/sppo_chosen_reward_in_loss": 0.7578359246253967, "debug/sppo_rej_reward_in_loss": -0.046448830515146255, "debug/sppo_reject_loss": 2495.43115234375, "epoch": 1.8478260869565217, "grad_norm": 46795.00293216267, "learning_rate": 9.095285269784641e-08, "logits/chosen": 1.5226022005081177, "logits/rejected": 1.7057052850723267, "logps/chosen": -101.56987762451172, "logps/rejected": -11.258237838745117, "loss": 4986.0867, "rewards/accuracies": 0.30000001192092896, "rewards/chosen": 0.007578360382467508, "rewards/margins": 0.008042847737669945, "rewards/rejected": -0.00046448828652501106, "step": 510 }, { "debug/policy_chosen_logits": 1.5528548955917358, "debug/policy_chosen_logps": -25.6921443939209, "debug/policy_rejected_logits": 1.7076743841171265, "debug/policy_rejected_logps": -27.83782958984375, "debug/reference_chosen_logps": -25.6488094329834, "debug/reference_rejected_logps": -27.818252563476562, "debug/sppo_chosen_loss": 2504.416015625, "debug/sppo_chosen_reward_in_loss": -0.04333125427365303, "debug/sppo_rej_reward_in_loss": -0.019580483436584473, "debug/sppo_reject_loss": 2498.138671875, "epoch": 1.8659420289855073, "grad_norm": 30001.557286497267, "learning_rate": 9.073796418723882e-08, "logits/chosen": 1.5528548955917358, "logits/rejected": 1.7076743841171265, "logps/chosen": -25.6921443939209, "logps/rejected": -27.83782958984375, "loss": 4991.8574, "rewards/accuracies": 0.25, "rewards/chosen": -0.0004333124961704016, "rewards/margins": -0.00023750770196784288, "rewards/rejected": -0.00019580486696213484, "step": 515 }, { "debug/policy_chosen_logits": 1.657427191734314, "debug/policy_chosen_logps": -233.5861358642578, "debug/policy_rejected_logits": 1.7426013946533203, "debug/policy_rejected_logps": -211.1157684326172, "debug/reference_chosen_logps": -234.6603546142578, "debug/reference_rejected_logps": -211.98037719726562, "debug/sppo_chosen_loss": 2416.29296875, "debug/sppo_chosen_reward_in_loss": 1.0742230415344238, "debug/sppo_rej_reward_in_loss": 0.8645865321159363, "debug/sppo_reject_loss": 2598.1376953125, "epoch": 1.8840579710144927, "grad_norm": 116479.66302407594, "learning_rate": 9.05208136596887e-08, "logits/chosen": 1.657427191734314, "logits/rejected": 1.7426013946533203, "logps/chosen": -233.5861358642578, "logps/rejected": -211.1157684326172, "loss": 4989.6113, "rewards/accuracies": 0.22499999403953552, "rewards/chosen": 0.010742230340838432, "rewards/margins": 0.002096364740282297, "rewards/rejected": 0.008645866066217422, "step": 520 }, { "debug/policy_chosen_logits": 1.3473526239395142, "debug/policy_chosen_logps": -49.44628143310547, "debug/policy_rejected_logits": 1.7877445220947266, "debug/policy_rejected_logps": -45.069190979003906, "debug/reference_chosen_logps": -49.641510009765625, "debug/reference_rejected_logps": -45.34258270263672, "debug/sppo_chosen_loss": 2481.927734375, "debug/sppo_chosen_reward_in_loss": 0.19523265957832336, "debug/sppo_rej_reward_in_loss": 0.2733902335166931, "debug/sppo_reject_loss": 2531.37255859375, "epoch": 1.9021739130434783, "grad_norm": 196749.11602219281, "learning_rate": 9.030141317270026e-08, "logits/chosen": 1.3473526239395142, "logits/rejected": 1.7877445220947266, "logps/chosen": -49.44628143310547, "logps/rejected": -45.069190979003906, "loss": 5003.5402, "rewards/accuracies": 0.3499999940395355, "rewards/chosen": 0.001952326507307589, "rewards/margins": -0.0007815755670890212, "rewards/rejected": 0.002733902307227254, "step": 525 }, { "debug/policy_chosen_logits": 1.295692801475525, "debug/policy_chosen_logps": -304.587158203125, "debug/policy_rejected_logits": 1.4055874347686768, "debug/policy_rejected_logps": -84.8868637084961, "debug/reference_chosen_logps": -306.0018005371094, "debug/reference_rejected_logps": -85.27592468261719, "debug/sppo_chosen_loss": 2384.4462890625, "debug/sppo_chosen_reward_in_loss": 1.4146642684936523, "debug/sppo_rej_reward_in_loss": 0.3890670835971832, "debug/sppo_reject_loss": 2545.71826171875, "epoch": 1.9202898550724639, "grad_norm": 123146.8823897493, "learning_rate": 9.007977490870885e-08, "logits/chosen": 1.295692801475525, "logits/rejected": 1.4055874347686768, "logps/chosen": -304.587158203125, "logps/rejected": -84.8868637084961, "loss": 4962.9609, "rewards/accuracies": 0.3499999940395355, "rewards/chosen": 0.01414664089679718, "rewards/margins": 0.010255971923470497, "rewards/rejected": 0.003890670370310545, "step": 530 }, { "debug/policy_chosen_logits": 1.5718975067138672, "debug/policy_chosen_logps": -85.6647720336914, "debug/policy_rejected_logits": 1.8537788391113281, "debug/policy_rejected_logps": -4.973170280456543, "debug/reference_chosen_logps": -86.55270385742188, "debug/reference_rejected_logps": -4.980034828186035, "debug/sppo_chosen_loss": 2430.51318359375, "debug/sppo_chosen_reward_in_loss": 0.8879335522651672, "debug/sppo_rej_reward_in_loss": 0.006863677408546209, "debug/sppo_reject_loss": 2500.72119140625, "epoch": 1.9384057971014492, "grad_norm": 110293.70890303263, "learning_rate": 8.985591117440483e-08, "logits/chosen": 1.5718975067138672, "logits/rejected": 1.8537788391113281, "logps/chosen": -85.6647720336914, "logps/rejected": -4.973170280456543, "loss": 4991.8945, "rewards/accuracies": 0.20000000298023224, "rewards/chosen": 0.008879335597157478, "rewards/margins": 0.008810698986053467, "rewards/rejected": 6.863677117507905e-05, "step": 535 }, { "debug/policy_chosen_logits": 1.1742925643920898, "debug/policy_chosen_logps": -6.625035285949707, "debug/policy_rejected_logits": 1.5054795742034912, "debug/policy_rejected_logps": -13.368057250976562, "debug/reference_chosen_logps": -6.643076419830322, "debug/reference_rejected_logps": -13.368135452270508, "debug/sppo_chosen_loss": 2498.23779296875, "debug/sppo_chosen_reward_in_loss": 0.018040591850876808, "debug/sppo_rej_reward_in_loss": 7.841289334464818e-05, "debug/sppo_reject_loss": 2500.033935546875, "epoch": 1.9565217391304348, "grad_norm": 148544.360577327, "learning_rate": 8.962983440004998e-08, "logits/chosen": 1.1742925643920898, "logits/rejected": 1.5054795742034912, "logps/chosen": -6.625035285949707, "logps/rejected": -13.368057250976562, "loss": 4971.3461, "rewards/accuracies": 0.375, "rewards/chosen": 0.00018040589930023998, "rewards/margins": 0.00017962176934815943, "rewards/rejected": 7.841154001653194e-07, "step": 540 }, { "debug/policy_chosen_logits": 1.3015177249908447, "debug/policy_chosen_logps": -7.803164005279541, "debug/policy_rejected_logits": 1.5490944385528564, "debug/policy_rejected_logps": -5.706773281097412, "debug/reference_chosen_logps": -7.7561540603637695, "debug/reference_rejected_logps": -5.6011643409729, "debug/sppo_chosen_loss": 2504.740966796875, "debug/sppo_chosen_reward_in_loss": -0.047010183334350586, "debug/sppo_rej_reward_in_loss": -0.10560951381921768, "debug/sppo_reject_loss": 2489.6650390625, "epoch": 1.9746376811594204, "grad_norm": 144478.87888735058, "learning_rate": 8.940155713878738e-08, "logits/chosen": 1.3015177249908447, "logits/rejected": 1.5490944385528564, "logps/chosen": -7.803164005279541, "logps/rejected": -5.706773281097412, "loss": 4983.6199, "rewards/accuracies": 0.2750000059604645, "rewards/chosen": -0.00047010177513584495, "rewards/margins": 0.0005859933444298804, "rewards/rejected": -0.0010560952359810472, "step": 545 }, { "debug/policy_chosen_logits": 1.075130581855774, "debug/policy_chosen_logps": -107.33253479003906, "debug/policy_rejected_logits": 1.1537376642227173, "debug/policy_rejected_logps": -47.780128479003906, "debug/reference_chosen_logps": -108.74848937988281, "debug/reference_rejected_logps": -48.089027404785156, "debug/sppo_chosen_loss": 2437.177734375, "debug/sppo_chosen_reward_in_loss": 1.4159678220748901, "debug/sppo_rej_reward_in_loss": 0.30889958143234253, "debug/sppo_reject_loss": 2536.8515625, "epoch": 1.9927536231884058, "grad_norm": 50259.35447171487, "learning_rate": 8.91710920659444e-08, "logits/chosen": 1.075130581855774, "logits/rejected": 1.1537376642227173, "logps/chosen": -107.33253479003906, "logps/rejected": -47.780128479003906, "loss": 4971.0098, "rewards/accuracies": 0.2750000059604645, "rewards/chosen": 0.014159679412841797, "rewards/margins": 0.011070680804550648, "rewards/rejected": 0.0030889958143234253, "step": 550 }, { "debug/policy_chosen_logits": 1.1352243423461914, "debug/policy_chosen_logps": -182.50460815429688, "debug/policy_rejected_logits": 1.8139286041259766, "debug/policy_rejected_logps": -3.0896496772766113, "debug/reference_chosen_logps": -184.52658081054688, "debug/reference_rejected_logps": -3.0741100311279297, "debug/sppo_chosen_loss": 2388.943115234375, "debug/sppo_chosen_reward_in_loss": 2.021972894668579, "debug/sppo_rej_reward_in_loss": -0.015539753250777721, "debug/sppo_reject_loss": 2498.46044921875, "epoch": 2.010869565217391, "grad_norm": 49649.56682933092, "learning_rate": 8.89384519783289e-08, "logits/chosen": 1.1352243423461914, "logits/rejected": 1.8139286041259766, "logps/chosen": -182.50460815429688, "logps/rejected": -3.0896496772766113, "loss": 4967.3969, "rewards/accuracies": 0.30000001192092896, "rewards/chosen": 0.020219724625349045, "rewards/margins": 0.020375125110149384, "rewards/rejected": -0.0001553975307615474, "step": 555 }, { "debug/policy_chosen_logits": 1.670299768447876, "debug/policy_chosen_logps": -181.0923614501953, "debug/policy_rejected_logits": 1.915454626083374, "debug/policy_rejected_logps": -6.760970115661621, "debug/reference_chosen_logps": -182.94830322265625, "debug/reference_rejected_logps": -6.724902153015137, "debug/sppo_chosen_loss": 2381.651611328125, "debug/sppo_chosen_reward_in_loss": 1.8559490442276, "debug/sppo_rej_reward_in_loss": -0.03606755658984184, "debug/sppo_reject_loss": 2496.419189453125, "epoch": 2.028985507246377, "grad_norm": 30482.05807316755, "learning_rate": 8.87036497935186e-08, "logits/chosen": 1.670299768447876, "logits/rejected": 1.915454626083374, "logps/chosen": -181.0923614501953, "logps/rejected": -6.760970115661621, "loss": 4958.1211, "rewards/accuracies": 0.375, "rewards/chosen": 0.018559489399194717, "rewards/margins": 0.018920164555311203, "rewards/rejected": -0.000360675563570112, "step": 560 }, { "debug/policy_chosen_logits": 1.4729254245758057, "debug/policy_chosen_logps": -6.2717108726501465, "debug/policy_rejected_logits": 1.8031227588653564, "debug/policy_rejected_logps": -65.36786651611328, "debug/reference_chosen_logps": -6.2621893882751465, "debug/reference_rejected_logps": -65.73162078857422, "debug/sppo_chosen_loss": 2500.97509765625, "debug/sppo_chosen_reward_in_loss": -0.009521400555968285, "debug/sppo_rej_reward_in_loss": 0.36376261711120605, "debug/sppo_reject_loss": 2541.224365234375, "epoch": 2.0471014492753623, "grad_norm": 203795.44514949693, "learning_rate": 8.846669854914395e-08, "logits/chosen": 1.4729254245758057, "logits/rejected": 1.8031227588653564, "logps/chosen": -6.2717108726501465, "logps/rejected": -65.36786651611328, "loss": 4950.2574, "rewards/accuracies": 0.2750000059604645, "rewards/chosen": -9.521401807432994e-05, "rewards/margins": -0.0037328400649130344, "rewards/rejected": 0.0036376260686665773, "step": 565 }, { "debug/policy_chosen_logits": 1.4667936563491821, "debug/policy_chosen_logps": -6.170874118804932, "debug/policy_rejected_logits": 1.7932231426239014, "debug/policy_rejected_logps": -13.33043098449707, "debug/reference_chosen_logps": -6.186774253845215, "debug/reference_rejected_logps": -13.234382629394531, "debug/sppo_chosen_loss": 2498.48046875, "debug/sppo_chosen_reward_in_loss": 0.015900660306215286, "debug/sppo_rej_reward_in_loss": -0.09604871273040771, "debug/sppo_reject_loss": 2490.5224609375, "epoch": 2.0652173913043477, "grad_norm": 40680.96901777434, "learning_rate": 8.8227611402164e-08, "logits/chosen": 1.4667936563491821, "logits/rejected": 1.7932231426239014, "logps/chosen": -6.170874118804932, "logps/rejected": -13.33043098449707, "loss": 4956.7676, "rewards/accuracies": 0.30000001192092896, "rewards/chosen": 0.00015900659491308033, "rewards/margins": 0.0011194937396794558, "rewards/rejected": -0.000960487115662545, "step": 570 }, { "debug/policy_chosen_logits": 1.4026988744735718, "debug/policy_chosen_logps": -6.8413496017456055, "debug/policy_rejected_logits": 1.7165952920913696, "debug/policy_rejected_logps": -5.553938865661621, "debug/reference_chosen_logps": -6.833249092102051, "debug/reference_rejected_logps": -5.51195764541626, "debug/sppo_chosen_loss": 2500.821044921875, "debug/sppo_chosen_reward_in_loss": -0.008101532235741615, "debug/sppo_rej_reward_in_loss": -0.04198155552148819, "debug/sppo_reject_loss": 2495.84033203125, "epoch": 2.0833333333333335, "grad_norm": 38117.71154382252, "learning_rate": 8.798640162813607e-08, "logits/chosen": 1.4026988744735718, "logits/rejected": 1.7165952920913696, "logps/chosen": -6.8413496017456055, "logps/rejected": -5.553938865661621, "loss": 4978.9121, "rewards/accuracies": 0.22499999403953552, "rewards/chosen": -8.101532148430124e-05, "rewards/margins": 0.0003388002223800868, "rewards/rejected": -0.0004198155365884304, "step": 575 }, { "debug/policy_chosen_logits": 0.9549234509468079, "debug/policy_chosen_logps": -8.150744438171387, "debug/policy_rejected_logits": 1.3587868213653564, "debug/policy_rejected_logps": -3.6934826374053955, "debug/reference_chosen_logps": -8.1328763961792, "debug/reference_rejected_logps": -3.7049155235290527, "debug/sppo_chosen_loss": 2501.79931640625, "debug/sppo_chosen_reward_in_loss": -0.01786838099360466, "debug/sppo_rej_reward_in_loss": 0.011432814411818981, "debug/sppo_reject_loss": 2501.164306640625, "epoch": 2.101449275362319, "grad_norm": 23425.05066375264, "learning_rate": 8.774308262047847e-08, "logits/chosen": 0.9549234509468079, "logits/rejected": 1.3587868213653564, "logps/chosen": -8.150744438171387, "logps/rejected": -3.6934826374053955, "loss": 4984.6969, "rewards/accuracies": 0.20000000298023224, "rewards/chosen": -0.00017868381110019982, "rewards/margins": -0.00029301195172593, "rewards/rejected": 0.00011432813334977254, "step": 580 }, { "debug/policy_chosen_logits": 1.5237423181533813, "debug/policy_chosen_logps": -24.07058334350586, "debug/policy_rejected_logits": 1.7695128917694092, "debug/policy_rejected_logps": -20.570938110351562, "debug/reference_chosen_logps": -24.254491806030273, "debug/reference_rejected_logps": -20.668045043945312, "debug/sppo_chosen_loss": 2482.30712890625, "debug/sppo_chosen_reward_in_loss": 0.1839061677455902, "debug/sppo_rej_reward_in_loss": 0.0971066802740097, "debug/sppo_reject_loss": 2510.917236328125, "epoch": 2.119565217391304, "grad_norm": 262399.6367462418, "learning_rate": 8.749766788972685e-08, "logits/chosen": 1.5237423181533813, "logits/rejected": 1.7695128917694092, "logps/chosen": -24.07058334350586, "logps/rejected": -20.570938110351562, "loss": 4996.8371, "rewards/accuracies": 0.30000001192092896, "rewards/chosen": 0.0018390618497505784, "rewards/margins": 0.0008679949678480625, "rewards/rejected": 0.0009710669401101768, "step": 585 }, { "debug/policy_chosen_logits": 1.4658873081207275, "debug/policy_chosen_logps": -152.597900390625, "debug/policy_rejected_logits": 1.8432172536849976, "debug/policy_rejected_logps": -120.95674133300781, "debug/reference_chosen_logps": -153.42547607421875, "debug/reference_rejected_logps": -121.61518859863281, "debug/sppo_chosen_loss": 2434.657470703125, "debug/sppo_chosen_reward_in_loss": 0.8275735974311829, "debug/sppo_rej_reward_in_loss": 0.6584376692771912, "debug/sppo_reject_loss": 2584.908447265625, "epoch": 2.13768115942029, "grad_norm": 132098.8654189129, "learning_rate": 8.725017106278406e-08, "logits/chosen": 1.4658873081207275, "logits/rejected": 1.8432172536849976, "logps/chosen": -152.597900390625, "logps/rejected": -120.95674133300781, "loss": 4985.0195, "rewards/accuracies": 0.3499999940395355, "rewards/chosen": 0.008275735192000866, "rewards/margins": 0.0016913587460294366, "rewards/rejected": 0.006584376096725464, "step": 590 }, { "debug/policy_chosen_logits": 1.142416000366211, "debug/policy_chosen_logps": -3.1027462482452393, "debug/policy_rejected_logits": 1.4878085851669312, "debug/policy_rejected_logps": -40.01049041748047, "debug/reference_chosen_logps": -3.100511074066162, "debug/reference_rejected_logps": -39.13794708251953, "debug/sppo_chosen_loss": 2500.24609375, "debug/sppo_chosen_reward_in_loss": -0.0022350430954247713, "debug/sppo_rej_reward_in_loss": -0.8725408315658569, "debug/sppo_reject_loss": 2442.185546875, "epoch": 2.1557971014492754, "grad_norm": 57677.43561431882, "learning_rate": 8.700060588216336e-08, "logits/chosen": 1.142416000366211, "logits/rejected": 1.4878085851669312, "logps/chosen": -3.1027462482452393, "logps/rejected": -40.01049041748047, "loss": 4966.9574, "rewards/accuracies": 0.15000000596046448, "rewards/chosen": -2.235042120446451e-05, "rewards/margins": 0.008703058585524559, "rewards/rejected": -0.008725408464670181, "step": 595 }, { "debug/policy_chosen_logits": 1.3228609561920166, "debug/policy_chosen_logps": -89.22603607177734, "debug/policy_rejected_logits": 1.5925976037979126, "debug/policy_rejected_logps": -8.777485847473145, "debug/reference_chosen_logps": -90.09474182128906, "debug/reference_rejected_logps": -8.721701622009277, "debug/sppo_chosen_loss": 2428.719482421875, "debug/sppo_chosen_reward_in_loss": 0.8686982989311218, "debug/sppo_rej_reward_in_loss": -0.05578421801328659, "debug/sppo_reject_loss": 2494.53466796875, "epoch": 2.1739130434782608, "grad_norm": 26456.846770028904, "learning_rate": 8.674898620522557e-08, "logits/chosen": 1.3228609561920166, "logits/rejected": 1.5925976037979126, "logps/chosen": -89.22603607177734, "logps/rejected": -8.777485847473145, "loss": 4945.2656, "rewards/accuracies": 0.3499999940395355, "rewards/chosen": 0.008686983026564121, "rewards/margins": 0.009244824759662151, "rewards/rejected": -0.0005578421987593174, "step": 600 }, { "epoch": 2.1739130434782608, "eval_debug/policy_chosen_logits": 1.6724315881729126, "eval_debug/policy_chosen_logps": -121.49956512451172, "eval_debug/policy_rejected_logits": 1.7309714555740356, "eval_debug/policy_rejected_logps": -63.582584381103516, "eval_debug/reference_chosen_logps": -123.14806365966797, "eval_debug/reference_rejected_logps": -63.887054443359375, "eval_debug/sppo_chosen_loss": 2391.6708984375, "eval_debug/sppo_chosen_reward_in_loss": 1.648483157157898, "eval_debug/sppo_rej_reward_in_loss": 0.30446669459342957, "eval_debug/sppo_reject_loss": 2537.979736328125, "eval_logits/chosen": 1.6724315881729126, "eval_logits/rejected": 1.7309714555740356, "eval_logps/chosen": -121.49956512451172, "eval_logps/rejected": -63.582584381103516, "eval_loss": 4971.419921875, "eval_rewards/accuracies": 0.2631579041481018, "eval_rewards/chosen": 0.016484834253787994, "eval_rewards/margins": 0.013440164737403393, "eval_rewards/rejected": 0.0030446667224168777, "eval_runtime": 28.7421, "eval_samples_per_second": 20.875, "eval_steps_per_second": 0.661, "step": 600 }, { "debug/policy_chosen_logits": 1.3927724361419678, "debug/policy_chosen_logps": -101.41067504882812, "debug/policy_rejected_logits": 1.9825413227081299, "debug/policy_rejected_logps": -194.62008666992188, "debug/reference_chosen_logps": -102.28282928466797, "debug/reference_rejected_logps": -195.50665283203125, "debug/sppo_chosen_loss": 2443.44384765625, "debug/sppo_chosen_reward_in_loss": 0.8721572756767273, "debug/sppo_rej_reward_in_loss": 0.8865618705749512, "debug/sppo_reject_loss": 2603.52490234375, "epoch": 2.1920289855072466, "grad_norm": 175518.1998228211, "learning_rate": 8.649532600340945e-08, "logits/chosen": 1.3927724361419678, "logits/rejected": 1.9825413227081299, "logps/chosen": -101.41067504882812, "logps/rejected": -194.62008666992188, "loss": 5020.6316, "rewards/accuracies": 0.30000001192092896, "rewards/chosen": 0.00872157420963049, "rewards/margins": -0.0001440450141672045, "rewards/rejected": 0.008865619078278542, "step": 605 }, { "debug/policy_chosen_logits": 1.3799394369125366, "debug/policy_chosen_logps": -4.568610191345215, "debug/policy_rejected_logits": 1.4788639545440674, "debug/policy_rejected_logps": -7.467223167419434, "debug/reference_chosen_logps": -4.522200584411621, "debug/reference_rejected_logps": -7.417989253997803, "debug/sppo_chosen_loss": 2504.758056640625, "debug/sppo_chosen_reward_in_loss": -0.04641067236661911, "debug/sppo_rej_reward_in_loss": -0.04923428222537041, "debug/sppo_reject_loss": 2495.16455078125, "epoch": 2.210144927536232, "grad_norm": 28836.031191275946, "learning_rate": 8.6239639361456e-08, "logits/chosen": 1.3799394369125366, "logits/rejected": 1.4788639545440674, "logps/chosen": -4.568610191345215, "logps/rejected": -7.467223167419434, "loss": 4994.4496, "rewards/accuracies": 0.2750000059604645, "rewards/chosen": -0.00046410676441155374, "rewards/margins": 2.823607064783573e-05, "rewards/rejected": -0.000492342805955559, "step": 610 }, { "debug/policy_chosen_logits": 1.4084560871124268, "debug/policy_chosen_logps": -94.44200897216797, "debug/policy_rejected_logits": 1.5994064807891846, "debug/policy_rejected_logps": -140.82565307617188, "debug/reference_chosen_logps": -94.77066802978516, "debug/reference_rejected_logps": -140.92489624023438, "debug/sppo_chosen_loss": 2469.033203125, "debug/sppo_chosen_reward_in_loss": 0.32865768671035767, "debug/sppo_rej_reward_in_loss": 0.09925515949726105, "debug/sppo_reject_loss": 2513.01171875, "epoch": 2.2282608695652173, "grad_norm": 18299.95734609692, "learning_rate": 8.598194047662634e-08, "logits/chosen": 1.4084560871124268, "logits/rejected": 1.5994064807891846, "logps/chosen": -94.44200897216797, "logps/rejected": -140.82565307617188, "loss": 4993.9336, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": 0.00328657659702003, "rewards/margins": 0.002294025616720319, "rewards/rejected": 0.0009925514459609985, "step": 615 }, { "debug/policy_chosen_logits": 1.0762665271759033, "debug/policy_chosen_logps": -5.409428119659424, "debug/policy_rejected_logits": 1.567284107208252, "debug/policy_rejected_logps": -10.24363899230957, "debug/reference_chosen_logps": -5.394388675689697, "debug/reference_rejected_logps": -10.254476547241211, "debug/sppo_chosen_loss": 2501.52978515625, "debug/sppo_chosen_reward_in_loss": -0.015039680525660515, "debug/sppo_rej_reward_in_loss": 0.010836672969162464, "debug/sppo_reject_loss": 2501.1484375, "epoch": 2.246376811594203, "grad_norm": 20795.532608772475, "learning_rate": 8.572224365791348e-08, "logits/chosen": 1.0762665271759033, "logits/rejected": 1.567284107208252, "logps/chosen": -5.409428119659424, "logps/rejected": -10.24363899230957, "loss": 4980.1465, "rewards/accuracies": 0.2750000059604645, "rewards/chosen": -0.00015039679419714957, "rewards/margins": -0.00025876349536702037, "rewards/rejected": 0.00010836673754965886, "step": 620 }, { "debug/policy_chosen_logits": 1.2370128631591797, "debug/policy_chosen_logps": -68.23397827148438, "debug/policy_rejected_logits": 1.3979778289794922, "debug/policy_rejected_logps": -121.84098052978516, "debug/reference_chosen_logps": -68.59146118164062, "debug/reference_rejected_logps": -121.55644226074219, "debug/sppo_chosen_loss": 2468.83984375, "debug/sppo_chosen_reward_in_loss": 0.3574833571910858, "debug/sppo_rej_reward_in_loss": -0.2845422327518463, "debug/sppo_reject_loss": 2473.54052734375, "epoch": 2.2644927536231885, "grad_norm": 50042.69603249167, "learning_rate": 8.546056332524771e-08, "logits/chosen": 1.2370128631591797, "logits/rejected": 1.3979778289794922, "logps/chosen": -68.23397827148438, "logps/rejected": -121.84098052978516, "loss": 4990.7344, "rewards/accuracies": 0.375, "rewards/chosen": 0.0035748339723795652, "rewards/margins": 0.006420256104320288, "rewards/rejected": -0.002845422364771366, "step": 625 }, { "debug/policy_chosen_logits": 1.5076558589935303, "debug/policy_chosen_logps": -94.11219024658203, "debug/policy_rejected_logits": 1.6326602697372437, "debug/policy_rejected_logps": -8.206840515136719, "debug/reference_chosen_logps": -95.49501037597656, "debug/reference_rejected_logps": -8.245024681091309, "debug/sppo_chosen_loss": 2439.8701171875, "debug/sppo_chosen_reward_in_loss": 1.382814645767212, "debug/sppo_rej_reward_in_loss": 0.03818460553884506, "debug/sppo_reject_loss": 2503.880126953125, "epoch": 2.282608695652174, "grad_norm": 90885.63243722945, "learning_rate": 8.519691400869593e-08, "logits/chosen": 1.5076558589935303, "logits/rejected": 1.6326602697372437, "logps/chosen": -94.11219024658203, "logps/rejected": -8.206840515136719, "loss": 4966.5055, "rewards/accuracies": 0.17499999701976776, "rewards/chosen": 0.013828148134052753, "rewards/margins": 0.013446303084492683, "rewards/rejected": 0.00038184603909030557, "step": 630 }, { "debug/policy_chosen_logits": 1.3542203903198242, "debug/policy_chosen_logps": -90.0873794555664, "debug/policy_rejected_logits": 1.5896203517913818, "debug/policy_rejected_logps": -11.76610279083252, "debug/reference_chosen_logps": -90.65675354003906, "debug/reference_rejected_logps": -11.677980422973633, "debug/sppo_chosen_loss": 2453.387451171875, "debug/sppo_chosen_reward_in_loss": 0.5693702101707458, "debug/sppo_rej_reward_in_loss": -0.08812247216701508, "debug/sppo_reject_loss": 2491.30224609375, "epoch": 2.300724637681159, "grad_norm": 19411.98506738936, "learning_rate": 8.493131034765493e-08, "logits/chosen": 1.3542203903198242, "logits/rejected": 1.5896203517913818, "logps/chosen": -90.0873794555664, "logps/rejected": -11.76610279083252, "loss": 4986.6453, "rewards/accuracies": 0.2750000059604645, "rewards/chosen": 0.0056937020272016525, "rewards/margins": 0.006574926432222128, "rewards/rejected": -0.00088122469605878, "step": 635 }, { "debug/policy_chosen_logits": 1.1856319904327393, "debug/policy_chosen_logps": -114.78668212890625, "debug/policy_rejected_logits": 1.5505703687667847, "debug/policy_rejected_logps": -7.685643196105957, "debug/reference_chosen_logps": -115.78373718261719, "debug/reference_rejected_logps": -7.740739345550537, "debug/sppo_chosen_loss": 2438.772216796875, "debug/sppo_chosen_reward_in_loss": 0.9970676302909851, "debug/sppo_rej_reward_in_loss": 0.05509599298238754, "debug/sppo_reject_loss": 2505.55517578125, "epoch": 2.318840579710145, "grad_norm": 29618.681197439273, "learning_rate": 8.46637670900384e-08, "logits/chosen": 1.1856319904327393, "logits/rejected": 1.5505703687667847, "logps/chosen": -114.78668212890625, "logps/rejected": -7.685643196105957, "loss": 4971.8281, "rewards/accuracies": 0.20000000298023224, "rewards/chosen": 0.00997067615389824, "rewards/margins": 0.009419716894626617, "rewards/rejected": 0.0005509598995558918, "step": 640 }, { "debug/policy_chosen_logits": 1.0289747714996338, "debug/policy_chosen_logps": -36.51557540893555, "debug/policy_rejected_logits": 1.003749132156372, "debug/policy_rejected_logps": -4.6930084228515625, "debug/reference_chosen_logps": -36.7678108215332, "debug/reference_rejected_logps": -4.662884712219238, "debug/sppo_chosen_loss": 2477.410400390625, "debug/sppo_chosen_reward_in_loss": 0.2522297501564026, "debug/sppo_rej_reward_in_loss": -0.030124317854642868, "debug/sppo_reject_loss": 2497.0859375, "epoch": 2.3369565217391304, "grad_norm": 22920.09823430301, "learning_rate": 8.439429909145816e-08, "logits/chosen": 1.0289747714996338, "logits/rejected": 1.003749132156372, "logps/chosen": -36.51557540893555, "logps/rejected": -4.6930084228515625, "loss": 5015.5141, "rewards/accuracies": 0.22499999403953552, "rewards/chosen": 0.0025222976692020893, "rewards/margins": 0.002823540708050132, "rewards/rejected": -0.00030124321347102523, "step": 645 }, { "debug/policy_chosen_logits": 0.9863542318344116, "debug/policy_chosen_logps": -197.14846801757812, "debug/policy_rejected_logits": 1.0776666402816772, "debug/policy_rejected_logps": -7.684308052062988, "debug/reference_chosen_logps": -173.88534545898438, "debug/reference_rejected_logps": -7.317915916442871, "debug/sppo_chosen_loss": 12122.8720703125, "debug/sppo_chosen_reward_in_loss": -23.263113021850586, "debug/sppo_rej_reward_in_loss": -0.3663920760154724, "debug/sppo_reject_loss": 2464.9951171875, "epoch": 2.355072463768116, "grad_norm": 684166.1661533532, "learning_rate": 8.412292131439924e-08, "logits/chosen": 0.9863542318344116, "logits/rejected": 1.0776666402816772, "logps/chosen": -197.14846801757812, "logps/rejected": -7.684308052062988, "loss": 13862.5594, "rewards/accuracies": 0.25, "rewards/chosen": -0.2326311320066452, "rewards/margins": -0.22896721959114075, "rewards/rejected": -0.0036639210302382708, "step": 650 }, { "debug/policy_chosen_logits": 1.3219021558761597, "debug/policy_chosen_logps": -73.93031311035156, "debug/policy_rejected_logits": 1.630967378616333, "debug/policy_rejected_logps": -6.003985404968262, "debug/reference_chosen_logps": -72.91654205322266, "debug/reference_rejected_logps": -5.8764119148254395, "debug/sppo_chosen_loss": 2638.525634765625, "debug/sppo_chosen_reward_in_loss": -1.013765811920166, "debug/sppo_rej_reward_in_loss": -0.12757425010204315, "debug/sppo_reject_loss": 2487.59423828125, "epoch": 2.3731884057971016, "grad_norm": 293010.06647534797, "learning_rate": 8.3849648827389e-08, "logits/chosen": 1.3219021558761597, "logits/rejected": 1.630967378616333, "logps/chosen": -73.93031311035156, "logps/rejected": -6.003985404968262, "loss": 5156.391, "rewards/accuracies": 0.30000001192092896, "rewards/chosen": -0.010137656703591347, "rewards/margins": -0.008861915208399296, "rewards/rejected": -0.0012757425429299474, "step": 655 }, { "debug/policy_chosen_logits": 1.1500542163848877, "debug/policy_chosen_logps": -157.37864685058594, "debug/policy_rejected_logits": 1.3998603820800781, "debug/policy_rejected_logps": -3.760720729827881, "debug/reference_chosen_logps": -157.91355895996094, "debug/reference_rejected_logps": -3.7736668586730957, "debug/sppo_chosen_loss": 2453.85302734375, "debug/sppo_chosen_reward_in_loss": 0.5348891019821167, "debug/sppo_rej_reward_in_loss": 0.012946033850312233, "debug/sppo_reject_loss": 2501.311767578125, "epoch": 2.391304347826087, "grad_norm": 296086.1205906361, "learning_rate": 8.357449680416058e-08, "logits/chosen": 1.1500542163848877, "logits/rejected": 1.3998603820800781, "logps/chosen": -157.37864685058594, "logps/rejected": -3.760720729827881, "loss": 4984.7086, "rewards/accuracies": 0.2750000059604645, "rewards/chosen": 0.005348891019821167, "rewards/margins": 0.005219430662691593, "rewards/rejected": 0.00012946032802574337, "step": 660 }, { "debug/policy_chosen_logits": 1.3848161697387695, "debug/policy_chosen_logps": -218.70840454101562, "debug/policy_rejected_logits": 1.7765977382659912, "debug/policy_rejected_logps": -49.826717376708984, "debug/reference_chosen_logps": -219.96499633789062, "debug/reference_rejected_logps": -50.28038787841797, "debug/sppo_chosen_loss": 2407.37548828125, "debug/sppo_chosen_reward_in_loss": 1.2566018104553223, "debug/sppo_rej_reward_in_loss": 0.4536685049533844, "debug/sppo_reject_loss": 2554.99267578125, "epoch": 2.4094202898550723, "grad_norm": 27987.495677897332, "learning_rate": 8.32974805228102e-08, "logits/chosen": 1.3848161697387695, "logits/rejected": 1.7765977382659912, "logps/chosen": -218.70840454101562, "logps/rejected": -49.826717376708984, "loss": 4991.1609, "rewards/accuracies": 0.2750000059604645, "rewards/chosen": 0.012566019780933857, "rewards/margins": 0.0080293333157897, "rewards/rejected": 0.004536684602499008, "step": 665 }, { "debug/policy_chosen_logits": 1.7411706447601318, "debug/policy_chosen_logps": -100.1866683959961, "debug/policy_rejected_logits": 1.9671207666397095, "debug/policy_rejected_logps": -2.9916300773620605, "debug/reference_chosen_logps": -100.52616882324219, "debug/reference_rejected_logps": -2.9794654846191406, "debug/sppo_chosen_loss": 2471.088134765625, "debug/sppo_chosen_reward_in_loss": 0.3394917845726013, "debug/sppo_rej_reward_in_loss": -0.01216481626033783, "debug/sppo_reject_loss": 2498.791748046875, "epoch": 2.427536231884058, "grad_norm": 16029.879876874074, "learning_rate": 8.301861536494898e-08, "logits/chosen": 1.7411706447601318, "logits/rejected": 1.9671207666397095, "logps/chosen": -100.1866683959961, "logps/rejected": -2.9916300773620605, "loss": 4966.6785, "rewards/accuracies": 0.17499999701976776, "rewards/chosen": 0.003394917817786336, "rewards/margins": 0.003516565542668104, "rewards/rejected": -0.00012164816871518269, "step": 670 }, { "debug/policy_chosen_logits": 1.366699457168579, "debug/policy_chosen_logps": -18.988161087036133, "debug/policy_rejected_logits": 1.5202587842941284, "debug/policy_rejected_logps": -7.101962089538574, "debug/reference_chosen_logps": -19.44676971435547, "debug/reference_rejected_logps": -7.09658145904541, "debug/sppo_chosen_loss": 2461.796630859375, "debug/sppo_chosen_reward_in_loss": 0.4586108326911926, "debug/sppo_rej_reward_in_loss": -0.0053813280537724495, "debug/sppo_reject_loss": 2499.51025390625, "epoch": 2.4456521739130435, "grad_norm": 17519.687278823465, "learning_rate": 8.273791681484874e-08, "logits/chosen": 1.366699457168579, "logits/rejected": 1.5202587842941284, "logps/chosen": -18.988161087036133, "logps/rejected": -7.101962089538574, "loss": 4969.4477, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": 0.004586108960211277, "rewards/margins": 0.004639922175556421, "rewards/rejected": -5.381325172493234e-05, "step": 675 }, { "debug/policy_chosen_logits": 1.7357063293457031, "debug/policy_chosen_logps": -8.619532585144043, "debug/policy_rejected_logits": 1.7283729314804077, "debug/policy_rejected_logps": -6.644012451171875, "debug/reference_chosen_logps": -8.653813362121582, "debug/reference_rejected_logps": -6.547207832336426, "debug/sppo_chosen_loss": 2496.63037109375, "debug/sppo_chosen_reward_in_loss": 0.034280020743608475, "debug/sppo_rej_reward_in_loss": -0.096805639564991, "debug/sppo_reject_loss": 2490.397705078125, "epoch": 2.463768115942029, "grad_norm": 29898.56256046382, "learning_rate": 8.245540045858228e-08, "logits/chosen": 1.7357063293457031, "logits/rejected": 1.7283729314804077, "logps/chosen": -8.619532585144043, "logps/rejected": -6.644012451171875, "loss": 5039.0094, "rewards/accuracies": 0.42500001192092896, "rewards/chosen": 0.0003428002237342298, "rewards/margins": 0.001310856663621962, "rewards/rejected": -0.0009680563816800714, "step": 680 }, { "debug/policy_chosen_logits": 1.397537350654602, "debug/policy_chosen_logps": -8.459815979003906, "debug/policy_rejected_logits": 1.6885648965835571, "debug/policy_rejected_logps": -8.780861854553223, "debug/reference_chosen_logps": -8.536941528320312, "debug/reference_rejected_logps": -8.682905197143555, "debug/sppo_chosen_loss": 2492.366455078125, "debug/sppo_chosen_reward_in_loss": 0.07712530344724655, "debug/sppo_rej_reward_in_loss": -0.09795691072940826, "debug/sppo_reject_loss": 2490.386962890625, "epoch": 2.4818840579710146, "grad_norm": 28367.54690240989, "learning_rate": 8.2171081983158e-08, "logits/chosen": 1.397537350654602, "logits/rejected": 1.6885648965835571, "logps/chosen": -8.459815979003906, "logps/rejected": -8.780861854553223, "loss": 4975.252, "rewards/accuracies": 0.375, "rewards/chosen": 0.0007712530205026269, "rewards/margins": 0.0017508221790194511, "rewards/rejected": -0.0009795691585168242, "step": 685 }, { "debug/policy_chosen_logits": 1.323064923286438, "debug/policy_chosen_logps": -87.09587097167969, "debug/policy_rejected_logits": 1.6259129047393799, "debug/policy_rejected_logps": -2.9550280570983887, "debug/reference_chosen_logps": -88.01163482666016, "debug/reference_rejected_logps": -2.9231133460998535, "debug/sppo_chosen_loss": 2444.694091796875, "debug/sppo_chosen_reward_in_loss": 0.9157629013061523, "debug/sppo_rej_reward_in_loss": -0.03191450238227844, "debug/sppo_reject_loss": 2496.852783203125, "epoch": 2.5, "grad_norm": 19170.56108204549, "learning_rate": 8.188497717564871e-08, "logits/chosen": 1.323064923286438, "logits/rejected": 1.6259129047393799, "logps/chosen": -87.09587097167969, "logps/rejected": -2.9550280570983887, "loss": 4982.8348, "rewards/accuracies": 0.2750000059604645, "rewards/chosen": 0.009157629683613777, "rewards/margins": 0.009476774372160435, "rewards/rejected": -0.00031914503779262304, "step": 690 }, { "debug/policy_chosen_logits": 1.3540644645690918, "debug/policy_chosen_logps": -39.49969482421875, "debug/policy_rejected_logits": 1.4677890539169312, "debug/policy_rejected_logps": -5.493186950683594, "debug/reference_chosen_logps": -40.38248062133789, "debug/reference_rejected_logps": -5.549628257751465, "debug/sppo_chosen_loss": 2439.629638671875, "debug/sppo_chosen_reward_in_loss": 0.8827875256538391, "debug/sppo_rej_reward_in_loss": 0.05644185096025467, "debug/sppo_reject_loss": 2505.71240234375, "epoch": 2.5181159420289854, "grad_norm": 13443.833494994296, "learning_rate": 8.159710192231519e-08, "logits/chosen": 1.3540644645690918, "logits/rejected": 1.4677890539169312, "logps/chosen": -39.49969482421875, "logps/rejected": -5.493186950683594, "loss": 4983.9488, "rewards/accuracies": 0.2750000059604645, "rewards/chosen": 0.008827874436974525, "rewards/margins": 0.008263456635177135, "rewards/rejected": 0.00056441844208166, "step": 695 }, { "debug/policy_chosen_logits": 1.5028470754623413, "debug/policy_chosen_logps": -6.181028842926025, "debug/policy_rejected_logits": 1.7701442241668701, "debug/policy_rejected_logps": -293.8742980957031, "debug/reference_chosen_logps": -6.198336601257324, "debug/reference_rejected_logps": -296.1982116699219, "debug/sppo_chosen_loss": 2498.30908203125, "debug/sppo_chosen_reward_in_loss": 0.01730785146355629, "debug/sppo_rej_reward_in_loss": 2.3239176273345947, "debug/sppo_reject_loss": 2835.44775390625, "epoch": 2.536231884057971, "grad_norm": 198869.9368081879, "learning_rate": 8.130747220772401e-08, "logits/chosen": 1.5028470754623413, "logits/rejected": 1.7701442241668701, "logps/chosen": -6.181028842926025, "logps/rejected": -293.8742980957031, "loss": 5016.1723, "rewards/accuracies": 0.2750000059604645, "rewards/chosen": 0.00017307850066572428, "rewards/margins": -0.023066096007823944, "rewards/rejected": 0.023239172995090485, "step": 700 }, { "epoch": 2.536231884057971, "eval_debug/policy_chosen_logits": 1.6918941736221313, "eval_debug/policy_chosen_logps": -121.2217788696289, "eval_debug/policy_rejected_logits": 1.7527785301208496, "eval_debug/policy_rejected_logps": -63.50968933105469, "eval_debug/reference_chosen_logps": -123.14806365966797, "eval_debug/reference_rejected_logps": -63.887054443359375, "eval_debug/sppo_chosen_loss": 2372.3935546875, "eval_debug/sppo_chosen_reward_in_loss": 1.9262654781341553, "eval_debug/sppo_rej_reward_in_loss": 0.37736475467681885, "eval_debug/sppo_reject_loss": 2549.70458984375, "eval_logits/chosen": 1.6918941736221313, "eval_logits/rejected": 1.7527785301208496, "eval_logps/chosen": -121.2217788696289, "eval_logps/rejected": -63.50968933105469, "eval_loss": 4956.60546875, "eval_rewards/accuracies": 0.3684210479259491, "eval_rewards/chosen": 0.019262652844190598, "eval_rewards/margins": 0.015489005483686924, "eval_rewards/rejected": 0.0037736473605036736, "eval_runtime": 28.6917, "eval_samples_per_second": 20.912, "eval_steps_per_second": 0.662, "step": 700 }, { "debug/policy_chosen_logits": 1.5237798690795898, "debug/policy_chosen_logps": -64.85299682617188, "debug/policy_rejected_logits": 1.7822554111480713, "debug/policy_rejected_logps": -53.100791931152344, "debug/reference_chosen_logps": -65.52136993408203, "debug/reference_rejected_logps": -53.42998504638672, "debug/sppo_chosen_loss": 2450.79150390625, "debug/sppo_chosen_reward_in_loss": 0.6683730483055115, "debug/sppo_rej_reward_in_loss": 0.32918962836265564, "debug/sppo_reject_loss": 2537.10498046875, "epoch": 2.5543478260869565, "grad_norm": 143379.79187457016, "learning_rate": 8.101610411385998e-08, "logits/chosen": 1.5237798690795898, "logits/rejected": 1.7822554111480713, "logps/chosen": -64.85299682617188, "logps/rejected": -53.100791931152344, "loss": 4997.3418, "rewards/accuracies": 0.2750000059604645, "rewards/chosen": 0.006683730520308018, "rewards/margins": 0.003391833510249853, "rewards/rejected": 0.00329189607873559, "step": 705 }, { "debug/policy_chosen_logits": 1.4197657108306885, "debug/policy_chosen_logps": -38.90526580810547, "debug/policy_rejected_logits": 1.690158486366272, "debug/policy_rejected_logps": -59.02082061767578, "debug/reference_chosen_logps": -39.279052734375, "debug/reference_rejected_logps": -59.38611602783203, "debug/sppo_chosen_loss": 2466.936767578125, "debug/sppo_chosen_reward_in_loss": 0.37378019094467163, "debug/sppo_rej_reward_in_loss": 0.3652983009815216, "debug/sppo_reject_loss": 2543.61572265625, "epoch": 2.572463768115942, "grad_norm": 82603.33231204493, "learning_rate": 8.072301381923319e-08, "logits/chosen": 1.4197657108306885, "logits/rejected": 1.690158486366272, "logps/chosen": -38.90526580810547, "logps/rejected": -59.02082061767578, "loss": 4954.934, "rewards/accuracies": 0.32499998807907104, "rewards/chosen": 0.0037378016859292984, "rewards/margins": 8.481879194732755e-05, "rewards/rejected": 0.0036529828794300556, "step": 710 }, { "debug/policy_chosen_logits": 1.1540336608886719, "debug/policy_chosen_logps": -65.85258483886719, "debug/policy_rejected_logits": 1.452874779701233, "debug/policy_rejected_logps": -97.27635192871094, "debug/reference_chosen_logps": -66.37857818603516, "debug/reference_rejected_logps": -97.83031463623047, "debug/sppo_chosen_loss": 2457.917236328125, "debug/sppo_chosen_reward_in_loss": 0.525994598865509, "debug/sppo_rej_reward_in_loss": 0.5539580583572388, "debug/sppo_reject_loss": 2563.376708984375, "epoch": 2.5905797101449277, "grad_norm": 151638.6033962213, "learning_rate": 8.042821759798069e-08, "logits/chosen": 1.1540336608886719, "logits/rejected": 1.452874779701233, "logps/chosen": -65.85258483886719, "logps/rejected": -97.27635192871094, "loss": 5043.066, "rewards/accuracies": 0.42500001192092896, "rewards/chosen": 0.00525994598865509, "rewards/margins": -0.00027963408501818776, "rewards/rejected": 0.005539580248296261, "step": 715 }, { "debug/policy_chosen_logits": 1.2361079454421997, "debug/policy_chosen_logps": -190.7241668701172, "debug/policy_rejected_logits": 1.4423071146011353, "debug/policy_rejected_logps": -50.05461883544922, "debug/reference_chosen_logps": -192.1670684814453, "debug/reference_rejected_logps": -50.335174560546875, "debug/sppo_chosen_loss": 2387.143798828125, "debug/sppo_chosen_reward_in_loss": 1.4429091215133667, "debug/sppo_rej_reward_in_loss": 0.28055450320243835, "debug/sppo_reject_loss": 2530.188720703125, "epoch": 2.608695652173913, "grad_norm": 114355.47541385186, "learning_rate": 8.013173181896283e-08, "logits/chosen": 1.2361079454421997, "logits/rejected": 1.4423071146011353, "logps/chosen": -190.7241668701172, "logps/rejected": -50.05461883544922, "loss": 5014.6727, "rewards/accuracies": 0.30000001192092896, "rewards/chosen": 0.014429089613258839, "rewards/margins": 0.011623546481132507, "rewards/rejected": 0.002805545227602124, "step": 720 }, { "debug/policy_chosen_logits": 1.4188085794448853, "debug/policy_chosen_logps": -3.2541916370391846, "debug/policy_rejected_logits": 2.0764987468719482, "debug/policy_rejected_logps": -6.639716148376465, "debug/reference_chosen_logps": -3.298539400100708, "debug/reference_rejected_logps": -6.633472442626953, "debug/sppo_chosen_loss": 2495.631591796875, "debug/sppo_chosen_reward_in_loss": 0.04434752091765404, "debug/sppo_rej_reward_in_loss": -0.006244021467864513, "debug/sppo_reject_loss": 2499.39990234375, "epoch": 2.6268115942028984, "grad_norm": 33276.3183913755, "learning_rate": 7.983357294485438e-08, "logits/chosen": 1.4188085794448853, "logits/rejected": 2.0764987468719482, "logps/chosen": -3.2541916370391846, "logps/rejected": -6.639716148376465, "loss": 4974.9684, "rewards/accuracies": 0.32499998807907104, "rewards/chosen": 0.00044347523362375796, "rewards/margins": 0.0005059154354967177, "rewards/rejected": -6.244022370083258e-05, "step": 725 }, { "debug/policy_chosen_logits": 1.2809937000274658, "debug/policy_chosen_logps": -95.74784851074219, "debug/policy_rejected_logits": 1.5771560668945312, "debug/policy_rejected_logps": -6.52874755859375, "debug/reference_chosen_logps": -96.49998474121094, "debug/reference_rejected_logps": -6.519639492034912, "debug/sppo_chosen_loss": 2443.24755859375, "debug/sppo_chosen_reward_in_loss": 0.752140998840332, "debug/sppo_rej_reward_in_loss": -0.009107852354645729, "debug/sppo_reject_loss": 2499.21533203125, "epoch": 2.644927536231884, "grad_norm": 19272.62726317121, "learning_rate": 7.953375753123043e-08, "logits/chosen": 1.2809937000274658, "logits/rejected": 1.5771560668945312, "logps/chosen": -95.74784851074219, "logps/rejected": -6.52874755859375, "loss": 4976.9359, "rewards/accuracies": 0.375, "rewards/chosen": 0.007521410472691059, "rewards/margins": 0.007612487766891718, "rewards/rejected": -9.107850200962275e-05, "step": 730 }, { "debug/policy_chosen_logits": 1.3743760585784912, "debug/policy_chosen_logps": -7.804142951965332, "debug/policy_rejected_logits": 1.8149964809417725, "debug/policy_rejected_logps": -8.986127853393555, "debug/reference_chosen_logps": -7.975428581237793, "debug/reference_rejected_logps": -9.003293991088867, "debug/sppo_chosen_loss": 2483.430908203125, "debug/sppo_chosen_reward_in_loss": 0.1712864637374878, "debug/sppo_rej_reward_in_loss": 0.01716712675988674, "debug/sppo_reject_loss": 2501.77734375, "epoch": 2.6630434782608696, "grad_norm": 40733.81048853685, "learning_rate": 7.923230222564714e-08, "logits/chosen": 1.3743760585784912, "logits/rejected": 1.8149964809417725, "logps/chosen": -7.804142951965332, "logps/rejected": -8.986127853393555, "loss": 4977.1977, "rewards/accuracies": 0.30000001192092896, "rewards/chosen": 0.0017128646140918136, "rewards/margins": 0.0015411933418363333, "rewards/rejected": 0.00017167125770356506, "step": 735 }, { "debug/policy_chosen_logits": 1.503496766090393, "debug/policy_chosen_logps": -51.18857192993164, "debug/policy_rejected_logits": 1.3035733699798584, "debug/policy_rejected_logps": -99.83528137207031, "debug/reference_chosen_logps": -51.798500061035156, "debug/reference_rejected_logps": -100.50444030761719, "debug/sppo_chosen_loss": 2453.153564453125, "debug/sppo_chosen_reward_in_loss": 0.6099240779876709, "debug/sppo_rej_reward_in_loss": 0.6691586375236511, "debug/sppo_reject_loss": 2574.427490234375, "epoch": 2.681159420289855, "grad_norm": 42171.6864593525, "learning_rate": 7.892922376671725e-08, "logits/chosen": 1.503496766090393, "logits/rejected": 1.3035733699798584, "logps/chosen": -51.18857192993164, "logps/rejected": -99.83528137207031, "loss": 4985.3766, "rewards/accuracies": 0.25, "rewards/chosen": 0.006099240854382515, "rewards/margins": -0.0005923454882577062, "rewards/rejected": 0.006691586226224899, "step": 740 }, { "debug/policy_chosen_logits": 1.429189920425415, "debug/policy_chosen_logps": -6.224120140075684, "debug/policy_rejected_logits": 1.3991663455963135, "debug/policy_rejected_logps": -9.770666122436523, "debug/reference_chosen_logps": -6.19936466217041, "debug/reference_rejected_logps": -9.75121021270752, "debug/sppo_chosen_loss": 2502.48876953125, "debug/sppo_chosen_reward_in_loss": -0.02475614845752716, "debug/sppo_rej_reward_in_loss": -0.019455324858427048, "debug/sppo_reject_loss": 2498.18798828125, "epoch": 2.699275362318841, "grad_norm": 24792.576815406654, "learning_rate": 7.862453898318082e-08, "logits/chosen": 1.429189920425415, "logits/rejected": 1.3991663455963135, "logps/chosen": -6.224120140075684, "logps/rejected": -9.770666122436523, "loss": 4984.5469, "rewards/accuracies": 0.25, "rewards/chosen": -0.000247561460128054, "rewards/margins": -5.3008232498541474e-05, "rewards/rejected": -0.0001945531985256821, "step": 745 }, { "debug/policy_chosen_logits": 1.2401114702224731, "debug/policy_chosen_logps": -131.48655700683594, "debug/policy_rejected_logits": 1.3504592180252075, "debug/policy_rejected_logps": -5.182490348815918, "debug/reference_chosen_logps": -132.68191528320312, "debug/reference_rejected_logps": -5.177205562591553, "debug/sppo_chosen_loss": 2408.582275390625, "debug/sppo_chosen_reward_in_loss": 1.1953630447387695, "debug/sppo_rej_reward_in_loss": -0.005285191349685192, "debug/sppo_reject_loss": 2499.49658203125, "epoch": 2.717391304347826, "grad_norm": 74083.92332842606, "learning_rate": 7.83182647929707e-08, "logits/chosen": 1.2401114702224731, "logits/rejected": 1.3504592180252075, "logps/chosen": -131.48655700683594, "logps/rejected": -5.182490348815918, "loss": 4977.15, "rewards/accuracies": 0.32499998807907104, "rewards/chosen": 0.011953630484640598, "rewards/margins": 0.012006482109427452, "rewards/rejected": -5.285191582515836e-05, "step": 750 }, { "debug/policy_chosen_logits": 1.3811697959899902, "debug/policy_chosen_logps": -11.790118217468262, "debug/policy_rejected_logits": 1.489039421081543, "debug/policy_rejected_logps": -6.24985408782959, "debug/reference_chosen_logps": -11.79719352722168, "debug/reference_rejected_logps": -6.178770542144775, "debug/sppo_chosen_loss": 2499.384765625, "debug/sppo_chosen_reward_in_loss": 0.007074213121086359, "debug/sppo_rej_reward_in_loss": -0.07108394801616669, "debug/sppo_reject_loss": 2492.964599609375, "epoch": 2.7355072463768115, "grad_norm": 23278.854578509196, "learning_rate": 7.801041820227318e-08, "logits/chosen": 1.3811697959899902, "logits/rejected": 1.489039421081543, "logps/chosen": -11.790118217468262, "logps/rejected": -6.24985408782959, "loss": 4997.6656, "rewards/accuracies": 0.30000001192092896, "rewards/chosen": 7.074214954627678e-05, "rewards/margins": 0.0007815815624780953, "rewards/rejected": -0.0007108395220711827, "step": 755 }, { "debug/policy_chosen_logits": 1.3936350345611572, "debug/policy_chosen_logps": -75.14498901367188, "debug/policy_rejected_logits": 1.888794183731079, "debug/policy_rejected_logps": -5.494256019592285, "debug/reference_chosen_logps": -75.92655181884766, "debug/reference_rejected_logps": -5.4222517013549805, "debug/sppo_chosen_loss": 2445.77734375, "debug/sppo_chosen_reward_in_loss": 0.7815699577331543, "debug/sppo_rej_reward_in_loss": -0.07200449705123901, "debug/sppo_reject_loss": 2492.99658203125, "epoch": 2.753623188405797, "grad_norm": 29840.001702239286, "learning_rate": 7.770101630458363e-08, "logits/chosen": 1.3936350345611572, "logits/rejected": 1.888794183731079, "logps/chosen": -75.14498901367188, "logps/rejected": -5.494256019592285, "loss": 4991.5336, "rewards/accuracies": 0.32499998807907104, "rewards/chosen": 0.007815699093043804, "rewards/margins": 0.008535744622349739, "rewards/rejected": -0.0007200449472293258, "step": 760 }, { "debug/policy_chosen_logits": 1.5029393434524536, "debug/policy_chosen_logps": -159.31239318847656, "debug/policy_rejected_logits": 1.6016242504119873, "debug/policy_rejected_logps": -104.73112487792969, "debug/reference_chosen_logps": -160.2303009033203, "debug/reference_rejected_logps": -105.09793853759766, "debug/sppo_chosen_loss": 2427.389892578125, "debug/sppo_chosen_reward_in_loss": 0.917911171913147, "debug/sppo_rej_reward_in_loss": 0.36682644486427307, "debug/sppo_reject_loss": 2546.86279296875, "epoch": 2.7717391304347827, "grad_norm": 83986.53622769123, "learning_rate": 7.73900762797575e-08, "logits/chosen": 1.5029393434524536, "logits/rejected": 1.6016242504119873, "logps/chosen": -159.31239318847656, "logps/rejected": -104.73112487792969, "loss": 4972.0297, "rewards/accuracies": 0.3499999940395355, "rewards/chosen": 0.009179111570119858, "rewards/margins": 0.005510847084224224, "rewards/rejected": 0.00366826425306499, "step": 765 }, { "debug/policy_chosen_logits": 1.6192662715911865, "debug/policy_chosen_logps": -6.2633233070373535, "debug/policy_rejected_logits": 2.133389949798584, "debug/policy_rejected_logps": -100.17567443847656, "debug/reference_chosen_logps": -6.2502970695495605, "debug/reference_rejected_logps": -100.2561264038086, "debug/sppo_chosen_loss": 2501.37646484375, "debug/sppo_chosen_reward_in_loss": -0.013025665655732155, "debug/sppo_rej_reward_in_loss": 0.08045091480016708, "debug/sppo_reject_loss": 2508.57470703125, "epoch": 2.789855072463768, "grad_norm": 56626.66595335295, "learning_rate": 7.707761539305629e-08, "logits/chosen": 1.6192662715911865, "logits/rejected": 2.133389949798584, "logps/chosen": -6.2633233070373535, "logps/rejected": -100.17567443847656, "loss": 4981.4305, "rewards/accuracies": 0.2750000059604645, "rewards/chosen": -0.0001302566088270396, "rewards/margins": -0.0009347657905891538, "rewards/rejected": 0.0008045091526582837, "step": 770 }, { "debug/policy_chosen_logits": 1.7449042797088623, "debug/policy_chosen_logps": -8.103353500366211, "debug/policy_rejected_logits": 2.027343511581421, "debug/policy_rejected_logps": -11.463616371154785, "debug/reference_chosen_logps": -8.093632698059082, "debug/reference_rejected_logps": -11.414515495300293, "debug/sppo_chosen_loss": 2501.008544921875, "debug/sppo_chosen_reward_in_loss": -0.009721839800477028, "debug/sppo_rej_reward_in_loss": -0.04910029098391533, "debug/sppo_reject_loss": 2495.2451171875, "epoch": 2.807971014492754, "grad_norm": 37051.72556855272, "learning_rate": 7.676365099418883e-08, "logits/chosen": 1.7449042797088623, "logits/rejected": 2.027343511581421, "logps/chosen": -8.103353500366211, "logps/rejected": -11.463616371154785, "loss": 4985.1809, "rewards/accuracies": 0.30000001192092896, "rewards/chosen": -9.721839160192758e-05, "rewards/margins": 0.0003937845176551491, "rewards/rejected": -0.0004910029238089919, "step": 775 }, { "debug/policy_chosen_logits": 1.2894489765167236, "debug/policy_chosen_logps": -6.635662078857422, "debug/policy_rejected_logits": 1.7854931354522705, "debug/policy_rejected_logps": -7.891185760498047, "debug/reference_chosen_logps": -6.597743034362793, "debug/reference_rejected_logps": -7.864927768707275, "debug/sppo_chosen_loss": 2503.895751953125, "debug/sppo_chosen_reward_in_loss": -0.037918753921985626, "debug/sppo_rej_reward_in_loss": -0.026258278638124466, "debug/sppo_reject_loss": 2497.450439453125, "epoch": 2.8260869565217392, "grad_norm": 70854.1432228094, "learning_rate": 7.644820051634812e-08, "logits/chosen": 1.2894489765167236, "logits/rejected": 1.7854931354522705, "logps/chosen": -6.635662078857422, "logps/rejected": -7.891185760498047, "loss": 4985.7082, "rewards/accuracies": 0.2750000059604645, "rewards/chosen": -0.00037918752059340477, "rewards/margins": -0.00011660467134788632, "rewards/rejected": -0.0002625827619340271, "step": 780 }, { "debug/policy_chosen_logits": 1.285874605178833, "debug/policy_chosen_logps": -162.66134643554688, "debug/policy_rejected_logits": 1.6362215280532837, "debug/policy_rejected_logps": -28.97531509399414, "debug/reference_chosen_logps": -164.0539093017578, "debug/reference_rejected_logps": -28.904144287109375, "debug/sppo_chosen_loss": 2434.125732421875, "debug/sppo_chosen_reward_in_loss": 1.3925551176071167, "debug/sppo_rej_reward_in_loss": -0.07116986811161041, "debug/sppo_reject_loss": 2493.13525390625, "epoch": 2.8442028985507246, "grad_norm": 12614.495350006166, "learning_rate": 7.613128147524313e-08, "logits/chosen": 1.285874605178833, "logits/rejected": 1.6362215280532837, "logps/chosen": -162.66134643554688, "logps/rejected": -28.97531509399414, "loss": 4989.4074, "rewards/accuracies": 0.22499999403953552, "rewards/chosen": 0.013925550505518913, "rewards/margins": 0.01463724859058857, "rewards/rejected": -0.0007116986089386046, "step": 785 }, { "debug/policy_chosen_logits": 1.5040881633758545, "debug/policy_chosen_logps": -6.494426727294922, "debug/policy_rejected_logits": 1.8926185369491577, "debug/policy_rejected_logps": -8.781606674194336, "debug/reference_chosen_logps": -6.4667768478393555, "debug/reference_rejected_logps": -8.72008991241455, "debug/sppo_chosen_loss": 2502.7958984375, "debug/sppo_chosen_reward_in_loss": -0.02764938771724701, "debug/sppo_rej_reward_in_loss": -0.061516910791397095, "debug/sppo_reject_loss": 2493.894775390625, "epoch": 2.86231884057971, "grad_norm": 17012.863848366116, "learning_rate": 7.581291146812631e-08, "logits/chosen": 1.5040881633758545, "logits/rejected": 1.8926185369491577, "logps/chosen": -6.494426727294922, "logps/rejected": -8.781606674194336, "loss": 4962.2969, "rewards/accuracies": 0.30000001192092896, "rewards/chosen": -0.0002764938399195671, "rewards/margins": 0.000338675279635936, "rewards/rejected": -0.0006151691195555031, "step": 790 }, { "debug/policy_chosen_logits": 1.390342116355896, "debug/policy_chosen_logps": -199.53041076660156, "debug/policy_rejected_logits": 1.637220025062561, "debug/policy_rejected_logps": -7.687531471252441, "debug/reference_chosen_logps": -202.3122100830078, "debug/reference_rejected_logps": -7.594631195068359, "debug/sppo_chosen_loss": 2333.692626953125, "debug/sppo_chosen_reward_in_loss": 2.781808376312256, "debug/sppo_rej_reward_in_loss": -0.09290023148059845, "debug/sppo_reject_loss": 2490.8046875, "epoch": 2.880434782608696, "grad_norm": 42195.63865309199, "learning_rate": 7.549310817281647e-08, "logits/chosen": 1.390342116355896, "logits/rejected": 1.637220025062561, "logps/chosen": -199.53041076660156, "logps/rejected": -7.687531471252441, "loss": 4943.05, "rewards/accuracies": 0.42500001192092896, "rewards/chosen": 0.02781808003783226, "rewards/margins": 0.028747087344527245, "rewards/rejected": -0.0009290023008361459, "step": 795 }, { "debug/policy_chosen_logits": 1.495915412902832, "debug/policy_chosen_logps": -74.32989501953125, "debug/policy_rejected_logits": 1.6923925876617432, "debug/policy_rejected_logps": -109.06733703613281, "debug/reference_chosen_logps": -75.12210845947266, "debug/reference_rejected_logps": -110.198974609375, "debug/sppo_chosen_loss": 2446.45458984375, "debug/sppo_chosen_reward_in_loss": 0.7922137379646301, "debug/sppo_rej_reward_in_loss": 1.1316362619400024, "debug/sppo_reject_loss": 2659.77587890625, "epoch": 2.898550724637681, "grad_norm": 29224.335171583865, "learning_rate": 7.517188934671725e-08, "logits/chosen": 1.495915412902832, "logits/rejected": 1.6923925876617432, "logps/chosen": -74.32989501953125, "logps/rejected": -109.06733703613281, "loss": 4980.475, "rewards/accuracies": 0.22499999403953552, "rewards/chosen": 0.007922137156128883, "rewards/margins": -0.0033942267764359713, "rewards/rejected": 0.011316363699734211, "step": 800 }, { "epoch": 2.898550724637681, "eval_debug/policy_chosen_logits": 1.6936593055725098, "eval_debug/policy_chosen_logps": -120.97956848144531, "eval_debug/policy_rejected_logits": 1.7533241510391235, "eval_debug/policy_rejected_logps": -63.4107551574707, "eval_debug/reference_chosen_logps": -123.14806365966797, "eval_debug/reference_rejected_logps": -63.887054443359375, "eval_debug/sppo_chosen_loss": 2370.336181640625, "eval_debug/sppo_chosen_reward_in_loss": 2.1684792041778564, "eval_debug/sppo_rej_reward_in_loss": 0.47630366683006287, "eval_debug/sppo_reject_loss": 2566.853515625, "eval_logits/chosen": 1.6936593055725098, "eval_logits/rejected": 1.7533241510391235, "eval_logps/chosen": -120.97956848144531, "eval_logps/rejected": -63.4107551574707, "eval_loss": 4967.69921875, "eval_rewards/accuracies": 0.34210526943206787, "eval_rewards/chosen": 0.021684790030121803, "eval_rewards/margins": 0.0169217512011528, "eval_rewards/rejected": 0.0047630369663238525, "eval_runtime": 28.6137, "eval_samples_per_second": 20.969, "eval_steps_per_second": 0.664, "step": 800 }, { "debug/policy_chosen_logits": 1.4139964580535889, "debug/policy_chosen_logps": -5.254188060760498, "debug/policy_rejected_logits": 1.8479747772216797, "debug/policy_rejected_logps": -80.65660095214844, "debug/reference_chosen_logps": -5.291328430175781, "debug/reference_rejected_logps": -81.42278289794922, "debug/sppo_chosen_loss": 2496.327392578125, "debug/sppo_chosen_reward_in_loss": 0.03714003041386604, "debug/sppo_rej_reward_in_loss": 0.766180157661438, "debug/sppo_reject_loss": 2602.15673828125, "epoch": 2.9166666666666665, "grad_norm": 23344.606675734554, "learning_rate": 7.484927282583103e-08, "logits/chosen": 1.4139964580535889, "logits/rejected": 1.8479747772216797, "logps/chosen": -5.254188060760498, "logps/rejected": -80.65660095214844, "loss": 4984.5766, "rewards/accuracies": 0.42500001192092896, "rewards/chosen": 0.00037140032509341836, "rewards/margins": -0.007290400564670563, "rewards/rejected": 0.00766180083155632, "step": 805 }, { "debug/policy_chosen_logits": 1.4258983135223389, "debug/policy_chosen_logps": -9.582771301269531, "debug/policy_rejected_logits": 1.7011148929595947, "debug/policy_rejected_logps": -80.58012390136719, "debug/reference_chosen_logps": -9.574371337890625, "debug/reference_rejected_logps": -81.211669921875, "debug/sppo_chosen_loss": 2500.90185546875, "debug/sppo_chosen_reward_in_loss": -0.008400765247642994, "debug/sppo_rej_reward_in_loss": 0.6315392255783081, "debug/sppo_reject_loss": 2582.731201171875, "epoch": 2.9347826086956523, "grad_norm": 203434.12457277052, "learning_rate": 7.452527652376863e-08, "logits/chosen": 1.4258983135223389, "logits/rejected": 1.7011148929595947, "logps/chosen": -9.582771301269531, "logps/rejected": -80.58012390136719, "loss": 5000.9332, "rewards/accuracies": 0.30000001192092896, "rewards/chosen": -8.400764636462554e-05, "rewards/margins": -0.006399400532245636, "rewards/rejected": 0.006315392442047596, "step": 810 }, { "debug/policy_chosen_logits": 1.1923292875289917, "debug/policy_chosen_logps": -232.3626708984375, "debug/policy_rejected_logits": 1.5656123161315918, "debug/policy_rejected_logps": -86.41791534423828, "debug/reference_chosen_logps": -234.8483428955078, "debug/reference_rejected_logps": -86.75303649902344, "debug/sppo_chosen_loss": 2323.52294921875, "debug/sppo_chosen_reward_in_loss": 2.4856557846069336, "debug/sppo_rej_reward_in_loss": 0.33512455224990845, "debug/sppo_reject_loss": 2539.06201171875, "epoch": 2.9528985507246377, "grad_norm": 64583.61749037997, "learning_rate": 7.419991843075463e-08, "logits/chosen": 1.1923292875289917, "logits/rejected": 1.5656123161315918, "logps/chosen": -232.3626708984375, "logps/rejected": -86.41791534423828, "loss": 4988.2164, "rewards/accuracies": 0.550000011920929, "rewards/chosen": 0.024856556206941605, "rewards/margins": 0.021505311131477356, "rewards/rejected": 0.003351245541125536, "step": 815 }, { "debug/policy_chosen_logits": 1.5343048572540283, "debug/policy_chosen_logps": -4.10895299911499, "debug/policy_rejected_logits": 2.1222310066223145, "debug/policy_rejected_logps": -43.48785400390625, "debug/reference_chosen_logps": -4.089119911193848, "debug/reference_rejected_logps": -43.911102294921875, "debug/sppo_chosen_loss": 2502.04833984375, "debug/sppo_chosen_reward_in_loss": -0.019832782447338104, "debug/sppo_rej_reward_in_loss": 0.4232407510280609, "debug/sppo_reject_loss": 2551.0126953125, "epoch": 2.971014492753623, "grad_norm": 23129.93757107744, "learning_rate": 7.387321661262844e-08, "logits/chosen": 1.5343048572540283, "logits/rejected": 2.1222310066223145, "logps/chosen": -4.10895299911499, "logps/rejected": -43.48785400390625, "loss": 4987.3191, "rewards/accuracies": 0.15000000596046448, "rewards/chosen": -0.00019832784892059863, "rewards/margins": -0.004430735018104315, "rewards/rejected": 0.004232407547533512, "step": 820 }, { "debug/policy_chosen_logits": 1.634982705116272, "debug/policy_chosen_logps": -136.30104064941406, "debug/policy_rejected_logits": 1.9212862253189087, "debug/policy_rejected_logps": -2.298676013946533, "debug/reference_chosen_logps": -137.60731506347656, "debug/reference_rejected_logps": -2.2748847007751465, "debug/sppo_chosen_loss": 2409.412841796875, "debug/sppo_chosen_reward_in_loss": 1.306257724761963, "debug/sppo_rej_reward_in_loss": -0.023791467770934105, "debug/sppo_reject_loss": 2497.64306640625, "epoch": 2.9891304347826084, "grad_norm": 141358.7304244266, "learning_rate": 7.354518920984119e-08, "logits/chosen": 1.634982705116272, "logits/rejected": 1.9212862253189087, "logps/chosen": -136.30104064941406, "logps/rejected": -2.298676013946533, "loss": 4989.5301, "rewards/accuracies": 0.25, "rewards/chosen": 0.01306257676333189, "rewards/margins": 0.013300491496920586, "rewards/rejected": -0.00023791468993294984, "step": 825 }, { "debug/policy_chosen_logits": 1.4530479907989502, "debug/policy_chosen_logps": -173.53610229492188, "debug/policy_rejected_logits": 1.9338299036026, "debug/policy_rejected_logps": -120.071533203125, "debug/reference_chosen_logps": -175.3301239013672, "debug/reference_rejected_logps": -120.7976303100586, "debug/sppo_chosen_loss": 2389.144775390625, "debug/sppo_chosen_reward_in_loss": 1.7940285205841064, "debug/sppo_rej_reward_in_loss": 0.7260942459106445, "debug/sppo_reject_loss": 2583.453125, "epoch": 3.0072463768115942, "grad_norm": 135192.97365103906, "learning_rate": 7.32158544364484e-08, "logits/chosen": 1.4530479907989502, "logits/rejected": 1.9338299036026, "logps/chosen": -173.53610229492188, "logps/rejected": -120.071533203125, "loss": 4991.182, "rewards/accuracies": 0.32499998807907104, "rewards/chosen": 0.017940282821655273, "rewards/margins": 0.010679340921342373, "rewards/rejected": 0.0072609419003129005, "step": 830 }, { "debug/policy_chosen_logits": 1.534420132637024, "debug/policy_chosen_logps": -44.638526916503906, "debug/policy_rejected_logits": 1.7307441234588623, "debug/policy_rejected_logps": -5.520297527313232, "debug/reference_chosen_logps": -45.01803970336914, "debug/reference_rejected_logps": -5.35309362411499, "debug/sppo_chosen_loss": 2468.969970703125, "debug/sppo_chosen_reward_in_loss": 0.37951406836509705, "debug/sppo_rej_reward_in_loss": -0.16720393300056458, "debug/sppo_reject_loss": 2483.90185546875, "epoch": 3.0253623188405796, "grad_norm": 48554.29610792757, "learning_rate": 7.28852305790987e-08, "logits/chosen": 1.534420132637024, "logits/rejected": 1.7307441234588623, "logps/chosen": -44.638526916503906, "logps/rejected": -5.520297527313232, "loss": 4978.2723, "rewards/accuracies": 0.32499998807907104, "rewards/chosen": 0.0037951406557112932, "rewards/margins": 0.005467180162668228, "rewards/rejected": -0.0016720391577109694, "step": 835 }, { "debug/policy_chosen_logits": 1.738743782043457, "debug/policy_chosen_logps": -6.872030735015869, "debug/policy_rejected_logits": 1.5160869359970093, "debug/policy_rejected_logps": -48.826316833496094, "debug/reference_chosen_logps": -6.925424098968506, "debug/reference_rejected_logps": -48.94139862060547, "debug/sppo_chosen_loss": 2494.70556640625, "debug/sppo_chosen_reward_in_loss": 0.05339394882321358, "debug/sppo_rej_reward_in_loss": 0.11508418619632721, "debug/sppo_reject_loss": 2513.247314453125, "epoch": 3.0434782608695654, "grad_norm": 58056.918038302334, "learning_rate": 7.255333599601847e-08, "logits/chosen": 1.738743782043457, "logits/rejected": 1.5160869359970093, "logps/chosen": -6.872030735015869, "logps/rejected": -48.826316833496094, "loss": 4979.0797, "rewards/accuracies": 0.44999998807907104, "rewards/chosen": 0.0005339394556358457, "rewards/margins": -0.0006169023690745234, "rewards/rejected": 0.0011508417082950473, "step": 840 }, { "debug/policy_chosen_logits": 1.530464768409729, "debug/policy_chosen_logps": -5.381126880645752, "debug/policy_rejected_logits": 1.68129563331604, "debug/policy_rejected_logps": -106.15922546386719, "debug/reference_chosen_logps": -5.335769176483154, "debug/reference_rejected_logps": -104.54981994628906, "debug/sppo_chosen_loss": 2504.649658203125, "debug/sppo_chosen_reward_in_loss": -0.04535723477602005, "debug/sppo_rej_reward_in_loss": -1.6094074249267578, "debug/sppo_reject_loss": 2437.71337890625, "epoch": 3.0615942028985508, "grad_norm": 442414.9779694248, "learning_rate": 7.222018911599233e-08, "logits/chosen": 1.530464768409729, "logits/rejected": 1.68129563331604, "logps/chosen": -5.381126880645752, "logps/rejected": -106.15922546386719, "loss": 4985.4902, "rewards/accuracies": 0.375, "rewards/chosen": -0.0004535723419394344, "rewards/margins": 0.0156405009329319, "rewards/rejected": -0.01609407365322113, "step": 845 }, { "debug/policy_chosen_logits": 1.275017499923706, "debug/policy_chosen_logps": -156.49697875976562, "debug/policy_rejected_logits": 1.7312116622924805, "debug/policy_rejected_logps": -6.046191215515137, "debug/reference_chosen_logps": -155.05332946777344, "debug/reference_rejected_logps": -6.033763408660889, "debug/sppo_chosen_loss": 2739.3701171875, "debug/sppo_chosen_reward_in_loss": -1.4436414241790771, "debug/sppo_rej_reward_in_loss": -0.012427590787410736, "debug/sppo_reject_loss": 2498.781982421875, "epoch": 3.079710144927536, "grad_norm": 11109.098013746996, "learning_rate": 7.188580843734004e-08, "logits/chosen": 1.275017499923706, "logits/rejected": 1.7312116622924805, "logps/chosen": -156.49697875976562, "logps/rejected": -6.046191215515137, "loss": 5026.35, "rewards/accuracies": 0.3499999940395355, "rewards/chosen": -0.014436411671340466, "rewards/margins": -0.014312135986983776, "rewards/rejected": -0.00012427588808350265, "step": 850 }, { "debug/policy_chosen_logits": 1.6546322107315063, "debug/policy_chosen_logps": -65.32437896728516, "debug/policy_rejected_logits": 1.7387187480926514, "debug/policy_rejected_logps": -54.997344970703125, "debug/reference_chosen_logps": -65.62288665771484, "debug/reference_rejected_logps": -55.23314666748047, "debug/sppo_chosen_loss": 2474.333740234375, "debug/sppo_chosen_reward_in_loss": 0.2985118329524994, "debug/sppo_rej_reward_in_loss": 0.23580560088157654, "debug/sppo_reject_loss": 2526.950439453125, "epoch": 3.097826086956522, "grad_norm": 69204.93950224378, "learning_rate": 7.155021252688928e-08, "logits/chosen": 1.6546322107315063, "logits/rejected": 1.7387187480926514, "logps/chosen": -65.32437896728516, "logps/rejected": -54.997344970703125, "loss": 4982.6203, "rewards/accuracies": 0.32499998807907104, "rewards/chosen": 0.0029851181898266077, "rewards/margins": 0.0006270622834563255, "rewards/rejected": 0.002358056139200926, "step": 855 }, { "debug/policy_chosen_logits": 1.3846735954284668, "debug/policy_chosen_logps": -5.223902225494385, "debug/policy_rejected_logits": 1.431933045387268, "debug/policy_rejected_logps": -11.20073127746582, "debug/reference_chosen_logps": -5.2109761238098145, "debug/reference_rejected_logps": -11.119375228881836, "debug/sppo_chosen_loss": 2501.326171875, "debug/sppo_chosen_reward_in_loss": -0.012926379218697548, "debug/sppo_rej_reward_in_loss": -0.08135490119457245, "debug/sppo_reject_loss": 2491.940673828125, "epoch": 3.1159420289855073, "grad_norm": 37701.26854382551, "learning_rate": 7.121342001894466e-08, "logits/chosen": 1.3846735954284668, "logits/rejected": 1.431933045387268, "logps/chosen": -5.223902225494385, "logps/rejected": -11.20073127746582, "loss": 4978.2109, "rewards/accuracies": 0.32499998807907104, "rewards/chosen": -0.00012926380441058427, "rewards/margins": 0.0006842851871624589, "rewards/rejected": -0.0008135490352287889, "step": 860 }, { "debug/policy_chosen_logits": 1.4681646823883057, "debug/policy_chosen_logps": -129.4868927001953, "debug/policy_rejected_logits": 1.829880714416504, "debug/policy_rejected_logps": -37.24610900878906, "debug/reference_chosen_logps": -131.31796264648438, "debug/reference_rejected_logps": -37.071983337402344, "debug/sppo_chosen_loss": 2449.047119140625, "debug/sppo_chosen_reward_in_loss": 1.8310648202896118, "debug/sppo_rej_reward_in_loss": -0.17412912845611572, "debug/sppo_reject_loss": 2483.40576171875, "epoch": 3.1340579710144927, "grad_norm": 99821.60439992421, "learning_rate": 7.087544961425316e-08, "logits/chosen": 1.4681646823883057, "logits/rejected": 1.829880714416504, "logps/chosen": -129.4868927001953, "logps/rejected": -37.24610900878906, "loss": 4986.7359, "rewards/accuracies": 0.44999998807907104, "rewards/chosen": 0.01831064745783806, "rewards/margins": 0.02005193755030632, "rewards/rejected": -0.00174129125662148, "step": 865 }, { "debug/policy_chosen_logits": 1.681283712387085, "debug/policy_chosen_logps": -132.54238891601562, "debug/policy_rejected_logits": 1.9492746591567993, "debug/policy_rejected_logps": -5.482358932495117, "debug/reference_chosen_logps": -134.39736938476562, "debug/reference_rejected_logps": -5.434831142425537, "debug/sppo_chosen_loss": 2388.852783203125, "debug/sppo_chosen_reward_in_loss": 1.8550058603286743, "debug/sppo_rej_reward_in_loss": -0.04752717167139053, "debug/sppo_reject_loss": 2495.307861328125, "epoch": 3.1521739130434785, "grad_norm": 550789.1177861865, "learning_rate": 7.05363200789656e-08, "logits/chosen": 1.681283712387085, "logits/rejected": 1.9492746591567993, "logps/chosen": -132.54238891601562, "logps/rejected": -5.482358932495117, "loss": 4989.0504, "rewards/accuracies": 0.32499998807907104, "rewards/chosen": 0.018550056964159012, "rewards/margins": 0.019025329500436783, "rewards/rejected": -0.00047527169226668775, "step": 870 }, { "debug/policy_chosen_logits": 1.6098753213882446, "debug/policy_chosen_logps": -139.66818237304688, "debug/policy_rejected_logits": 1.653414011001587, "debug/policy_rejected_logps": -4.20380163192749, "debug/reference_chosen_logps": -140.52279663085938, "debug/reference_rejected_logps": -4.109394550323486, "debug/sppo_chosen_loss": 2437.76513671875, "debug/sppo_chosen_reward_in_loss": 0.854594886302948, "debug/sppo_rej_reward_in_loss": -0.09440730512142181, "debug/sppo_reject_loss": 2490.6650390625, "epoch": 3.170289855072464, "grad_norm": 48958.889482783205, "learning_rate": 7.019605024359474e-08, "logits/chosen": 1.6098753213882446, "logits/rejected": 1.653414011001587, "logps/chosen": -139.66818237304688, "logps/rejected": -4.20380163192749, "loss": 4978.1699, "rewards/accuracies": 0.2750000059604645, "rewards/chosen": 0.008545948192477226, "rewards/margins": 0.009490021504461765, "rewards/rejected": -0.0009440731373615563, "step": 875 }, { "debug/policy_chosen_logits": 1.5032477378845215, "debug/policy_chosen_logps": -28.241653442382812, "debug/policy_rejected_logits": 1.8489735126495361, "debug/policy_rejected_logps": -8.052087783813477, "debug/reference_chosen_logps": -28.440570831298828, "debug/reference_rejected_logps": -8.051187515258789, "debug/sppo_chosen_loss": 2481.45458984375, "debug/sppo_chosen_reward_in_loss": 0.19891729950904846, "debug/sppo_rej_reward_in_loss": -0.0009006023174151778, "debug/sppo_reject_loss": 2499.945556640625, "epoch": 3.1884057971014492, "grad_norm": 62593.19407353733, "learning_rate": 6.98546590019697e-08, "logits/chosen": 1.5032477378845215, "logits/rejected": 1.8489735126495361, "logps/chosen": -28.241653442382812, "logps/rejected": -8.052087783813477, "loss": 4963.157, "rewards/accuracies": 0.22499999403953552, "rewards/chosen": 0.0019891727715730667, "rewards/margins": 0.0019981791265308857, "rewards/rejected": -9.006034815683961e-06, "step": 880 }, { "debug/policy_chosen_logits": 1.147802710533142, "debug/policy_chosen_logps": -6.319035530090332, "debug/policy_rejected_logits": 1.4721888303756714, "debug/policy_rejected_logps": -71.1871566772461, "debug/reference_chosen_logps": -6.258852481842041, "debug/reference_rejected_logps": -71.7362289428711, "debug/sppo_chosen_loss": 2506.15478515625, "debug/sppo_chosen_reward_in_loss": -0.06018335744738579, "debug/sppo_rej_reward_in_loss": 0.5490648746490479, "debug/sppo_reject_loss": 2567.87939453125, "epoch": 3.2065217391304346, "grad_norm": 161981.32299852488, "learning_rate": 6.951216531018677e-08, "logits/chosen": 1.147802710533142, "logits/rejected": 1.4721888303756714, "logps/chosen": -6.319035530090332, "logps/rejected": -71.1871566772461, "loss": 4983.9688, "rewards/accuracies": 0.30000001192092896, "rewards/chosen": -0.0006018335116095841, "rewards/margins": -0.00609248224645853, "rewards/rejected": 0.005490648560225964, "step": 885 }, { "debug/policy_chosen_logits": 1.4311046600341797, "debug/policy_chosen_logps": -7.13021183013916, "debug/policy_rejected_logits": 1.7070996761322021, "debug/policy_rejected_logps": -58.46037673950195, "debug/reference_chosen_logps": -7.0944318771362305, "debug/reference_rejected_logps": -57.73896408081055, "debug/sppo_chosen_loss": 2503.66845703125, "debug/sppo_chosen_reward_in_loss": -0.0357792004942894, "debug/sppo_rej_reward_in_loss": -0.7214129567146301, "debug/sppo_reject_loss": 2440.578369140625, "epoch": 3.2246376811594204, "grad_norm": 135450.79939417174, "learning_rate": 6.91685881855569e-08, "logits/chosen": 1.4311046600341797, "logits/rejected": 1.7070996761322021, "logps/chosen": -7.13021183013916, "logps/rejected": -58.46037673950195, "loss": 4962.2988, "rewards/accuracies": 0.3499999940395355, "rewards/chosen": -0.00035779201425611973, "rewards/margins": 0.0068563371896743774, "rewards/rejected": -0.0072141289710998535, "step": 890 }, { "debug/policy_chosen_logits": 1.5483067035675049, "debug/policy_chosen_logps": -142.58644104003906, "debug/policy_rejected_logits": 1.7533977031707764, "debug/policy_rejected_logps": -9.595304489135742, "debug/reference_chosen_logps": -144.22349548339844, "debug/reference_rejected_logps": -9.525235176086426, "debug/sppo_chosen_loss": 2416.18798828125, "debug/sppo_chosen_reward_in_loss": 1.6370826959609985, "debug/sppo_rej_reward_in_loss": -0.07006971538066864, "debug/sppo_reject_loss": 2493.2314453125, "epoch": 3.2427536231884058, "grad_norm": 95663.08629385695, "learning_rate": 6.882394670554983e-08, "logits/chosen": 1.5483067035675049, "logits/rejected": 1.7533977031707764, "logps/chosen": -142.58644104003906, "logps/rejected": -9.595304489135742, "loss": 4957.9945, "rewards/accuracies": 0.42500001192092896, "rewards/chosen": 0.016370827332139015, "rewards/margins": 0.017071522772312164, "rewards/rejected": -0.0007006971864029765, "step": 895 }, { "debug/policy_chosen_logits": 1.2544628381729126, "debug/policy_chosen_logps": -12.271692276000977, "debug/policy_rejected_logits": 1.7263704538345337, "debug/policy_rejected_logps": -5.130194664001465, "debug/reference_chosen_logps": -12.303363800048828, "debug/reference_rejected_logps": -5.078009605407715, "debug/sppo_chosen_loss": 2496.895751953125, "debug/sppo_chosen_reward_in_loss": 0.031671054661273956, "debug/sppo_rej_reward_in_loss": -0.0521845817565918, "debug/sppo_reject_loss": 2494.86962890625, "epoch": 3.260869565217391, "grad_norm": 27376.88996289447, "learning_rate": 6.847826000673463e-08, "logits/chosen": 1.2544628381729126, "logits/rejected": 1.7263704538345337, "logps/chosen": -12.271692276000977, "logps/rejected": -5.130194664001465, "loss": 4962.825, "rewards/accuracies": 0.2750000059604645, "rewards/chosen": 0.00031671050237491727, "rewards/margins": 0.0008385563269257545, "rewards/rejected": -0.0005218457663431764, "step": 900 }, { "epoch": 3.260869565217391, "eval_debug/policy_chosen_logits": 1.6754149198532104, "eval_debug/policy_chosen_logps": -120.75407409667969, "eval_debug/policy_rejected_logits": 1.734663963317871, "eval_debug/policy_rejected_logps": -63.416831970214844, "eval_debug/reference_chosen_logps": -123.14806365966797, "eval_debug/reference_rejected_logps": -63.887054443359375, "eval_debug/sppo_chosen_loss": 2374.9814453125, "eval_debug/sppo_chosen_reward_in_loss": 2.3939788341522217, "eval_debug/sppo_rej_reward_in_loss": 0.47022318840026855, "eval_debug/sppo_reject_loss": 2564.927734375, "eval_logits/chosen": 1.6754149198532104, "eval_logits/rejected": 1.734663963317871, "eval_logps/chosen": -120.75407409667969, "eval_logps/rejected": -63.416831970214844, "eval_loss": 4973.931640625, "eval_rewards/accuracies": 0.30263158679008484, "eval_rewards/chosen": 0.023939788341522217, "eval_rewards/margins": 0.01923755370080471, "eval_rewards/rejected": 0.004702231381088495, "eval_runtime": 28.8099, "eval_samples_per_second": 20.826, "eval_steps_per_second": 0.659, "step": 900 }, { "debug/policy_chosen_logits": 1.5319929122924805, "debug/policy_chosen_logps": -92.01017761230469, "debug/policy_rejected_logits": 1.8365590572357178, "debug/policy_rejected_logps": -6.147439479827881, "debug/reference_chosen_logps": -93.24861145019531, "debug/reference_rejected_logps": -6.105729579925537, "debug/sppo_chosen_loss": 2433.957275390625, "debug/sppo_chosen_reward_in_loss": 1.2384226322174072, "debug/sppo_rej_reward_in_loss": -0.041709840297698975, "debug/sppo_reject_loss": 2495.8828125, "epoch": 3.278985507246377, "grad_norm": 32822.40527938065, "learning_rate": 6.813154728371727e-08, "logits/chosen": 1.5319929122924805, "logits/rejected": 1.8365590572357178, "logps/chosen": -92.01017761230469, "logps/rejected": -6.147439479827881, "loss": 4930.5828, "rewards/accuracies": 0.20000000298023224, "rewards/chosen": 0.012384224683046341, "rewards/margins": 0.012801324017345905, "rewards/rejected": -0.00041709840297698975, "step": 905 }, { "debug/policy_chosen_logits": 1.4001384973526, "debug/policy_chosen_logps": -10.247201919555664, "debug/policy_rejected_logits": 1.5040111541748047, "debug/policy_rejected_logps": -56.53535079956055, "debug/reference_chosen_logps": -10.355507850646973, "debug/reference_rejected_logps": -56.988853454589844, "debug/sppo_chosen_loss": 2489.294189453125, "debug/sppo_chosen_reward_in_loss": 0.10830533504486084, "debug/sppo_rej_reward_in_loss": 0.4534986913204193, "debug/sppo_reject_loss": 2558.11083984375, "epoch": 3.2971014492753623, "grad_norm": 28942.014444050405, "learning_rate": 6.77838277880747e-08, "logits/chosen": 1.4001384973526, "logits/rejected": 1.5040111541748047, "logps/chosen": -10.247201919555664, "logps/rejected": -56.53535079956055, "loss": 4983.3313, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": 0.0010830534156411886, "rewards/margins": -0.0034519329201430082, "rewards/rejected": 0.004534986801445484, "step": 910 }, { "debug/policy_chosen_logits": 1.7952178716659546, "debug/policy_chosen_logps": -8.348150253295898, "debug/policy_rejected_logits": 1.9302947521209717, "debug/policy_rejected_logps": -6.760983467102051, "debug/reference_chosen_logps": -8.424813270568848, "debug/reference_rejected_logps": -6.69986629486084, "debug/sppo_chosen_loss": 2492.595947265625, "debug/sppo_chosen_reward_in_loss": 0.07666263729333878, "debug/sppo_rej_reward_in_loss": -0.06111597269773483, "debug/sppo_reject_loss": 2493.96435546875, "epoch": 3.3152173913043477, "grad_norm": 24580.04788876407, "learning_rate": 6.743512082728601e-08, "logits/chosen": 1.7952178716659546, "logits/rejected": 1.9302947521209717, "logps/chosen": -8.348150253295898, "logps/rejected": -6.760983467102051, "loss": 5015.4937, "rewards/accuracies": 0.375, "rewards/chosen": 0.0007666262681595981, "rewards/margins": 0.0013777860440313816, "rewards/rejected": -0.0006111597758717835, "step": 915 }, { "debug/policy_chosen_logits": 1.4187487363815308, "debug/policy_chosen_logps": -60.065940856933594, "debug/policy_rejected_logits": 1.4213262796401978, "debug/policy_rejected_logps": -8.208401679992676, "debug/reference_chosen_logps": -60.804039001464844, "debug/reference_rejected_logps": -8.141077995300293, "debug/sppo_chosen_loss": 2451.47265625, "debug/sppo_chosen_reward_in_loss": 0.7380932569503784, "debug/sppo_rej_reward_in_loss": -0.06732266396284103, "debug/sppo_reject_loss": 2493.366943359375, "epoch": 3.3333333333333335, "grad_norm": 94068.85239545858, "learning_rate": 6.708544576366023e-08, "logits/chosen": 1.4187487363815308, "logits/rejected": 1.4213262796401978, "logps/chosen": -60.065940856933594, "logps/rejected": -8.208401679992676, "loss": 4956.3301, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": 0.007380933500826359, "rewards/margins": 0.008054159581661224, "rewards/rejected": -0.0006732266047038138, "step": 920 }, { "debug/policy_chosen_logits": 1.3455225229263306, "debug/policy_chosen_logps": -11.98884391784668, "debug/policy_rejected_logits": 1.4567105770111084, "debug/policy_rejected_logps": -5.1883463859558105, "debug/reference_chosen_logps": -12.01789665222168, "debug/reference_rejected_logps": -5.1840620040893555, "debug/sppo_chosen_loss": 2497.13720703125, "debug/sppo_chosen_reward_in_loss": 0.029053032398223877, "debug/sppo_rej_reward_in_loss": -0.004284513182938099, "debug/sppo_reject_loss": 2499.62255859375, "epoch": 3.351449275362319, "grad_norm": 21182.57721411257, "learning_rate": 6.673482201326134e-08, "logits/chosen": 1.3455225229263306, "logits/rejected": 1.4567105770111084, "logps/chosen": -11.98884391784668, "logps/rejected": -5.1883463859558105, "loss": 4994.7363, "rewards/accuracies": 0.32499998807907104, "rewards/chosen": 0.0002905303263105452, "rewards/margins": 0.0003333754721097648, "rewards/rejected": -4.2845123971346766e-05, "step": 925 }, { "debug/policy_chosen_logits": 1.2485249042510986, "debug/policy_chosen_logps": -7.425021171569824, "debug/policy_rejected_logits": 1.5507420301437378, "debug/policy_rejected_logps": -5.726008892059326, "debug/reference_chosen_logps": -7.4084978103637695, "debug/reference_rejected_logps": -5.688941478729248, "debug/sppo_chosen_loss": 2501.70751953125, "debug/sppo_chosen_reward_in_loss": -0.016523806378245354, "debug/sppo_rej_reward_in_loss": -0.03706775978207588, "debug/sppo_reject_loss": 2496.3525390625, "epoch": 3.369565217391304, "grad_norm": 19122.709153454154, "learning_rate": 6.638326904483011e-08, "logits/chosen": 1.2485249042510986, "logits/rejected": 1.5507420301437378, "logps/chosen": -7.425021171569824, "logps/rejected": -5.726008892059326, "loss": 5020.5883, "rewards/accuracies": 0.2750000059604645, "rewards/chosen": -0.0001652380742598325, "rewards/margins": 0.00020543955906759948, "rewards/rejected": -0.00037067761877551675, "step": 930 }, { "debug/policy_chosen_logits": 1.2247673273086548, "debug/policy_chosen_logps": -86.65150451660156, "debug/policy_rejected_logits": 1.7062976360321045, "debug/policy_rejected_logps": -20.9200496673584, "debug/reference_chosen_logps": -87.83734893798828, "debug/reference_rejected_logps": -21.036624908447266, "debug/sppo_chosen_loss": 2410.58251953125, "debug/sppo_chosen_reward_in_loss": 1.185845971107483, "debug/sppo_rej_reward_in_loss": 0.11657413095235825, "debug/sppo_reject_loss": 2512.258056640625, "epoch": 3.38768115942029, "grad_norm": 38217.4159970108, "learning_rate": 6.603080637870306e-08, "logits/chosen": 1.2247673273086548, "logits/rejected": 1.7062976360321045, "logps/chosen": -86.65150451660156, "logps/rejected": -20.9200496673584, "loss": 4958.7309, "rewards/accuracies": 0.30000001192092896, "rewards/chosen": 0.011858460493385792, "rewards/margins": 0.01069271843880415, "rewards/rejected": 0.0011657412396743894, "step": 935 }, { "debug/policy_chosen_logits": 1.3352899551391602, "debug/policy_chosen_logps": -122.0180435180664, "debug/policy_rejected_logits": 1.483473300933838, "debug/policy_rejected_logps": -11.162653923034668, "debug/reference_chosen_logps": -123.60845947265625, "debug/reference_rejected_logps": -11.164692878723145, "debug/sppo_chosen_loss": 2397.21875, "debug/sppo_chosen_reward_in_loss": 1.5904266834259033, "debug/sppo_rej_reward_in_loss": 0.002038282109424472, "debug/sppo_reject_loss": 2500.35693359375, "epoch": 3.4057971014492754, "grad_norm": 50184.480295075125, "learning_rate": 6.567745358572863e-08, "logits/chosen": 1.3352899551391602, "logits/rejected": 1.483473300933838, "logps/chosen": -122.0180435180664, "logps/rejected": -11.162653923034668, "loss": 4978.9664, "rewards/accuracies": 0.42500001192092896, "rewards/chosen": 0.015904268249869347, "rewards/margins": 0.015883883461356163, "rewards/rejected": 2.0382798538776115e-05, "step": 940 }, { "debug/policy_chosen_logits": 1.3300280570983887, "debug/policy_chosen_logps": -9.331120491027832, "debug/policy_rejected_logits": 1.696936845779419, "debug/policy_rejected_logps": -3.8738656044006348, "debug/reference_chosen_logps": -9.403645515441895, "debug/reference_rejected_logps": -3.788437604904175, "debug/sppo_chosen_loss": 2492.847412109375, "debug/sppo_chosen_reward_in_loss": 0.07252510637044907, "debug/sppo_rej_reward_in_loss": -0.0854276567697525, "debug/sppo_reject_loss": 2491.583984375, "epoch": 3.4239130434782608, "grad_norm": 103169.33067958256, "learning_rate": 6.532323028618045e-08, "logits/chosen": 1.3300280570983887, "logits/rejected": 1.696936845779419, "logps/chosen": -9.331120491027832, "logps/rejected": -3.8738656044006348, "loss": 4984.443, "rewards/accuracies": 0.42500001192092896, "rewards/chosen": 0.0007252510986290872, "rewards/margins": 0.0015795277431607246, "rewards/rejected": -0.0008542767027392983, "step": 945 }, { "debug/policy_chosen_logits": 1.0924346446990967, "debug/policy_chosen_logps": -3.215465545654297, "debug/policy_rejected_logits": 1.3993812799453735, "debug/policy_rejected_logps": -73.68339538574219, "debug/reference_chosen_logps": -3.216484785079956, "debug/reference_rejected_logps": -73.69139099121094, "debug/sppo_chosen_loss": 2499.912841796875, "debug/sppo_chosen_reward_in_loss": 0.0010193288326263428, "debug/sppo_rej_reward_in_loss": 0.008005738258361816, "debug/sppo_reject_loss": 2509.82861328125, "epoch": 3.4420289855072466, "grad_norm": 144279.71193241715, "learning_rate": 6.496815614866791e-08, "logits/chosen": 1.0924346446990967, "logits/rejected": 1.3993812799453735, "logps/chosen": -3.215465545654297, "logps/rejected": -73.68339538574219, "loss": 4982.2168, "rewards/accuracies": 0.30000001192092896, "rewards/chosen": 1.019327282847371e-05, "rewards/margins": -6.986409425735474e-05, "rewards/rejected": 8.005723066162318e-05, "step": 950 }, { "debug/policy_chosen_logits": 1.4003221988677979, "debug/policy_chosen_logps": -3.923161745071411, "debug/policy_rejected_logits": 1.6568844318389893, "debug/policy_rejected_logps": -175.89572143554688, "debug/reference_chosen_logps": -3.9458231925964355, "debug/reference_rejected_logps": -176.2043914794922, "debug/sppo_chosen_loss": 2497.77392578125, "debug/sppo_chosen_reward_in_loss": 0.022661328315734863, "debug/sppo_rej_reward_in_loss": 0.3086616098880768, "debug/sppo_reject_loss": 2548.55322265625, "epoch": 3.460144927536232, "grad_norm": 197002.61813742446, "learning_rate": 6.461225088904402e-08, "logits/chosen": 1.4003221988677979, "logits/rejected": 1.6568844318389893, "logps/chosen": -3.923161745071411, "logps/rejected": -175.89572143554688, "loss": 4977.5516, "rewards/accuracies": 0.32499998807907104, "rewards/chosen": 0.00022661327966488898, "rewards/margins": -0.002860002452507615, "rewards/rejected": 0.0030866158194839954, "step": 955 }, { "debug/policy_chosen_logits": 1.4162993431091309, "debug/policy_chosen_logps": -102.50821685791016, "debug/policy_rejected_logits": 1.6783605813980103, "debug/policy_rejected_logps": -101.08860778808594, "debug/reference_chosen_logps": -103.34651947021484, "debug/reference_rejected_logps": -101.48641967773438, "debug/sppo_chosen_loss": 2443.6142578125, "debug/sppo_chosen_reward_in_loss": 0.8383097648620605, "debug/sppo_rej_reward_in_loss": 0.39780789613723755, "debug/sppo_reject_loss": 2546.668212890625, "epoch": 3.4782608695652173, "grad_norm": 149836.9337657968, "learning_rate": 6.425553426931074e-08, "logits/chosen": 1.4162993431091309, "logits/rejected": 1.6783605813980103, "logps/chosen": -102.50821685791016, "logps/rejected": -101.08860778808594, "loss": 4989.5258, "rewards/accuracies": 0.32499998807907104, "rewards/chosen": 0.008383098989725113, "rewards/margins": 0.0044050198048353195, "rewards/rejected": 0.003978079184889793, "step": 960 }, { "debug/policy_chosen_logits": 1.5620143413543701, "debug/policy_chosen_logps": -45.25919723510742, "debug/policy_rejected_logits": 1.9142471551895142, "debug/policy_rejected_logps": -48.91422653198242, "debug/reference_chosen_logps": -44.924034118652344, "debug/reference_rejected_logps": -48.36165237426758, "debug/sppo_chosen_loss": 2541.82861328125, "debug/sppo_chosen_reward_in_loss": -0.3351626992225647, "debug/sppo_rej_reward_in_loss": -0.5525776743888855, "debug/sppo_reject_loss": 2454.18115234375, "epoch": 3.496376811594203, "grad_norm": 178793.26151823337, "learning_rate": 6.389802609652162e-08, "logits/chosen": 1.5620143413543701, "logits/rejected": 1.9142471551895142, "logps/chosen": -45.25919723510742, "logps/rejected": -48.91422653198242, "loss": 4964.8625, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -0.003351626917719841, "rewards/margins": 0.0021741497330367565, "rewards/rejected": -0.005525777116417885, "step": 965 }, { "debug/policy_chosen_logits": 1.4095633029937744, "debug/policy_chosen_logps": -76.5968017578125, "debug/policy_rejected_logits": 1.7205696105957031, "debug/policy_rejected_logps": -9.44313907623291, "debug/reference_chosen_logps": -77.12386322021484, "debug/reference_rejected_logps": -9.334480285644531, "debug/sppo_chosen_loss": 2457.120361328125, "debug/sppo_chosen_reward_in_loss": 0.5270634889602661, "debug/sppo_rej_reward_in_loss": -0.10865894705057144, "debug/sppo_reject_loss": 2489.608154296875, "epoch": 3.5144927536231885, "grad_norm": 140604.6892442817, "learning_rate": 6.353974622168195e-08, "logits/chosen": 1.4095633029937744, "logits/rejected": 1.7205696105957031, "logps/chosen": -76.5968017578125, "logps/rejected": -9.44313907623291, "loss": 4968.2918, "rewards/accuracies": 0.25, "rewards/chosen": 0.005270634777843952, "rewards/margins": 0.006357223726809025, "rewards/rejected": -0.0010865895310416818, "step": 970 }, { "debug/policy_chosen_logits": 1.416648507118225, "debug/policy_chosen_logps": -83.42720031738281, "debug/policy_rejected_logits": 1.6831588745117188, "debug/policy_rejected_logps": -7.553765773773193, "debug/reference_chosen_logps": -84.18830871582031, "debug/reference_rejected_logps": -7.56585168838501, "debug/sppo_chosen_loss": 2445.76318359375, "debug/sppo_chosen_reward_in_loss": 0.761110246181488, "debug/sppo_rej_reward_in_loss": 0.012085432186722755, "debug/sppo_reject_loss": 2501.31982421875, "epoch": 3.532608695652174, "grad_norm": 23927.49505793401, "learning_rate": 6.318071453864662e-08, "logits/chosen": 1.416648507118225, "logits/rejected": 1.6831588745117188, "logps/chosen": -83.42720031738281, "logps/rejected": -7.553765773773193, "loss": 4942.6359, "rewards/accuracies": 0.30000001192092896, "rewards/chosen": 0.007611102424561977, "rewards/margins": 0.007490247488021851, "rewards/rejected": 0.00012085431080777198, "step": 975 }, { "debug/policy_chosen_logits": 1.383797287940979, "debug/policy_chosen_logps": -7.178348541259766, "debug/policy_rejected_logits": 1.9433517456054688, "debug/policy_rejected_logps": -4.049078941345215, "debug/reference_chosen_logps": -7.126869201660156, "debug/reference_rejected_logps": -4.064708709716797, "debug/sppo_chosen_loss": 2505.275146484375, "debug/sppo_chosen_reward_in_loss": -0.05147979408502579, "debug/sppo_rej_reward_in_loss": 0.015629494562745094, "debug/sppo_reject_loss": 2501.583251953125, "epoch": 3.550724637681159, "grad_norm": 12659.540632684862, "learning_rate": 6.282095098301539e-08, "logits/chosen": 1.383797287940979, "logits/rejected": 1.9433517456054688, "logps/chosen": -7.178348541259766, "logps/rejected": -4.049078941345215, "loss": 4979.3664, "rewards/accuracies": 0.17499999701976776, "rewards/chosen": -0.0005147979827597737, "rewards/margins": -0.000671092770062387, "rewards/rejected": 0.00015629493282176554, "step": 980 }, { "debug/policy_chosen_logits": 1.442685842514038, "debug/policy_chosen_logps": -7.381352424621582, "debug/policy_rejected_logits": 1.629230260848999, "debug/policy_rejected_logps": -2.113102436065674, "debug/reference_chosen_logps": -7.488625526428223, "debug/reference_rejected_logps": -2.104032516479492, "debug/sppo_chosen_loss": 2489.37890625, "debug/sppo_chosen_reward_in_loss": 0.10727301985025406, "debug/sppo_rej_reward_in_loss": -0.00906982459127903, "debug/sppo_reject_loss": 2499.10498046875, "epoch": 3.568840579710145, "grad_norm": 91833.33571946724, "learning_rate": 6.246047553102603e-08, "logits/chosen": 1.442685842514038, "logits/rejected": 1.629230260848999, "logps/chosen": -7.381352424621582, "logps/rejected": -2.113102436065674, "loss": 4977.3875, "rewards/accuracies": 0.3499999940395355, "rewards/chosen": 0.0010727301705628633, "rewards/margins": 0.0011634284164756536, "rewards/rejected": -9.069825318874791e-05, "step": 985 }, { "debug/policy_chosen_logits": 1.598938226699829, "debug/policy_chosen_logps": -112.53471374511719, "debug/policy_rejected_logits": 1.5688692331314087, "debug/policy_rejected_logps": -6.433315277099609, "debug/reference_chosen_logps": -113.3122787475586, "debug/reference_rejected_logps": -6.36132287979126, "debug/sppo_chosen_loss": 2437.538818359375, "debug/sppo_chosen_reward_in_loss": 0.7775698900222778, "debug/sppo_rej_reward_in_loss": -0.0719926729798317, "debug/sppo_reject_loss": 2492.946533203125, "epoch": 3.5869565217391304, "grad_norm": 21827.903143224208, "learning_rate": 6.209930819844507e-08, "logits/chosen": 1.598938226699829, "logits/rejected": 1.5688692331314087, "logps/chosen": -112.53471374511719, "logps/rejected": -6.433315277099609, "loss": 4944.0035, "rewards/accuracies": 0.25, "rewards/chosen": 0.007775699254125357, "rewards/margins": 0.008495626039803028, "rewards/rejected": -0.0007199266692623496, "step": 990 }, { "debug/policy_chosen_logits": 1.3111555576324463, "debug/policy_chosen_logps": -3.1619713306427, "debug/policy_rejected_logits": 1.8154757022857666, "debug/policy_rejected_logps": -7.095166206359863, "debug/reference_chosen_logps": -3.14458966255188, "debug/reference_rejected_logps": -7.091891288757324, "debug/sppo_chosen_loss": 2501.7578125, "debug/sppo_chosen_reward_in_loss": -0.017381509765982628, "debug/sppo_rej_reward_in_loss": -0.003274583723396063, "debug/sppo_reject_loss": 2499.736083984375, "epoch": 3.605072463768116, "grad_norm": 14545.56635807843, "learning_rate": 6.173746903945638e-08, "logits/chosen": 1.3111555576324463, "logits/rejected": 1.8154757022857666, "logps/chosen": -3.1619713306427, "logps/rejected": -7.095166206359863, "loss": 4989.5094, "rewards/accuracies": 0.22499999403953552, "rewards/chosen": -0.00017381510406266898, "rewards/margins": -0.0001410692639183253, "rewards/rejected": -3.2745814678492025e-05, "step": 995 }, { "debug/policy_chosen_logits": 1.641177773475647, "debug/policy_chosen_logps": -37.600616455078125, "debug/policy_rejected_logits": 1.8367513418197632, "debug/policy_rejected_logps": -5.523456573486328, "debug/reference_chosen_logps": -38.062889099121094, "debug/reference_rejected_logps": -5.441411018371582, "debug/sppo_chosen_loss": 2461.43115234375, "debug/sppo_chosen_reward_in_loss": 0.4622744619846344, "debug/sppo_rej_reward_in_loss": -0.08204521238803864, "debug/sppo_reject_loss": 2492.31787109375, "epoch": 3.6231884057971016, "grad_norm": 19563.908830881417, "learning_rate": 6.137497814554771e-08, "logits/chosen": 1.641177773475647, "logits/rejected": 1.8367513418197632, "logps/chosen": -37.600616455078125, "logps/rejected": -5.523456573486328, "loss": 4960.6797, "rewards/accuracies": 0.2750000059604645, "rewards/chosen": 0.004622744861990213, "rewards/margins": 0.00544319674372673, "rewards/rejected": -0.0008204520563594997, "step": 1000 }, { "epoch": 3.6231884057971016, "eval_debug/policy_chosen_logits": 1.6772539615631104, "eval_debug/policy_chosen_logps": -121.2982406616211, "eval_debug/policy_rejected_logits": 1.7362858057022095, "eval_debug/policy_rejected_logps": -63.62192153930664, "eval_debug/reference_chosen_logps": -123.14806365966797, "eval_debug/reference_rejected_logps": -63.887054443359375, "eval_debug/sppo_chosen_loss": 2376.774169921875, "eval_debug/sppo_chosen_reward_in_loss": 1.8498167991638184, "eval_debug/sppo_rej_reward_in_loss": 0.26512840390205383, "eval_debug/sppo_reject_loss": 2531.566162109375, "eval_logits/chosen": 1.6772539615631104, "eval_logits/rejected": 1.7362858057022095, "eval_logps/chosen": -121.2982406616211, "eval_logps/rejected": -63.62192153930664, "eval_loss": 4954.90625, "eval_rewards/accuracies": 0.3552631437778473, "eval_rewards/chosen": 0.018498167395591736, "eval_rewards/margins": 0.01584688387811184, "eval_rewards/rejected": 0.0026512842159718275, "eval_runtime": 28.798, "eval_samples_per_second": 20.835, "eval_steps_per_second": 0.66, "step": 1000 }, { "debug/policy_chosen_logits": 1.3107197284698486, "debug/policy_chosen_logps": -17.90933609008789, "debug/policy_rejected_logits": 1.9283233880996704, "debug/policy_rejected_logps": -21.076196670532227, "debug/reference_chosen_logps": -18.150279998779297, "debug/reference_rejected_logps": -21.3471736907959, "debug/sppo_chosen_loss": 2478.620361328125, "debug/sppo_chosen_reward_in_loss": 0.2409447431564331, "debug/sppo_rej_reward_in_loss": 0.2709776759147644, "debug/sppo_reject_loss": 2529.163330078125, "epoch": 3.641304347826087, "grad_norm": 54788.294461105914, "learning_rate": 6.101185564439507e-08, "logits/chosen": 1.3107197284698486, "logits/rejected": 1.9283233880996704, "logps/chosen": -17.90933609008789, "logps/rejected": -21.076196670532227, "loss": 4962.2664, "rewards/accuracies": 0.25, "rewards/chosen": 0.0024094474501907825, "rewards/margins": -0.00030032964423298836, "rewards/rejected": 0.0027097768615931273, "step": 1005 }, { "debug/policy_chosen_logits": 1.3410780429840088, "debug/policy_chosen_logps": -144.1995849609375, "debug/policy_rejected_logits": 1.9149768352508545, "debug/policy_rejected_logps": -91.39128112792969, "debug/reference_chosen_logps": -145.67306518554688, "debug/reference_rejected_logps": -91.565673828125, "debug/sppo_chosen_loss": 2394.4208984375, "debug/sppo_chosen_reward_in_loss": 1.4734869003295898, "debug/sppo_rej_reward_in_loss": 0.174391970038414, "debug/sppo_reject_loss": 2519.900390625, "epoch": 3.6594202898550723, "grad_norm": 162432.09862152304, "learning_rate": 6.064812169874505e-08, "logits/chosen": 1.3410780429840088, "logits/rejected": 1.9149768352508545, "logps/chosen": -144.1995849609375, "logps/rejected": -91.39128112792969, "loss": 4995.3125, "rewards/accuracies": 0.3499999940395355, "rewards/chosen": 0.014734869822859764, "rewards/margins": 0.012990949675440788, "rewards/rejected": 0.0017439197981730103, "step": 1010 }, { "debug/policy_chosen_logits": 1.6520534753799438, "debug/policy_chosen_logps": -38.33971405029297, "debug/policy_rejected_logits": 1.4942057132720947, "debug/policy_rejected_logps": -42.747718811035156, "debug/reference_chosen_logps": -38.78121566772461, "debug/reference_rejected_logps": -42.984474182128906, "debug/sppo_chosen_loss": 2461.59228515625, "debug/sppo_chosen_reward_in_loss": 0.44149914383888245, "debug/sppo_rej_reward_in_loss": 0.23675867915153503, "debug/sppo_reject_loss": 2525.84326171875, "epoch": 3.677536231884058, "grad_norm": 12922.993207888096, "learning_rate": 6.028379650529536e-08, "logits/chosen": 1.6520534753799438, "logits/rejected": 1.4942057132720947, "logps/chosen": -38.33971405029297, "logps/rejected": -42.747718811035156, "loss": 4970.468, "rewards/accuracies": 0.30000001192092896, "rewards/chosen": 0.00441499100998044, "rewards/margins": 0.002047403948381543, "rewards/rejected": 0.0023675865959376097, "step": 1015 }, { "debug/policy_chosen_logits": 1.1207090616226196, "debug/policy_chosen_logps": -129.29237365722656, "debug/policy_rejected_logits": 1.569268822669983, "debug/policy_rejected_logps": -7.01397180557251, "debug/reference_chosen_logps": -130.89085388183594, "debug/reference_rejected_logps": -7.041855812072754, "debug/sppo_chosen_loss": 2396.17626953125, "debug/sppo_chosen_reward_in_loss": 1.598466396331787, "debug/sppo_rej_reward_in_loss": 0.027884578332304955, "debug/sppo_reject_loss": 2502.80908203125, "epoch": 3.6956521739130435, "grad_norm": 28521.087396950876, "learning_rate": 5.991890029357334e-08, "logits/chosen": 1.1207090616226196, "logits/rejected": 1.569268822669983, "logps/chosen": -129.29237365722656, "logps/rejected": -7.01397180557251, "loss": 4943.3207, "rewards/accuracies": 0.22499999403953552, "rewards/chosen": 0.015984663739800453, "rewards/margins": 0.01570582017302513, "rewards/rejected": 0.00027884577866643667, "step": 1020 }, { "debug/policy_chosen_logits": 1.6262223720550537, "debug/policy_chosen_logps": -6.06890344619751, "debug/policy_rejected_logits": 2.0924668312072754, "debug/policy_rejected_logps": -63.56451416015625, "debug/reference_chosen_logps": -6.047281265258789, "debug/reference_rejected_logps": -63.94500732421875, "debug/sppo_chosen_loss": 2502.19970703125, "debug/sppo_chosen_reward_in_loss": -0.02162191830575466, "debug/sppo_rej_reward_in_loss": 0.38049551844596863, "debug/sppo_reject_loss": 2543.684326171875, "epoch": 3.713768115942029, "grad_norm": 76883.81226029202, "learning_rate": 5.9553453324812716e-08, "logits/chosen": 1.6262223720550537, "logits/rejected": 2.0924668312072754, "logps/chosen": -6.06890344619751, "logps/rejected": -63.56451416015625, "loss": 4996.8648, "rewards/accuracies": 0.30000001192092896, "rewards/chosen": -0.00021621919586323202, "rewards/margins": -0.004021174740046263, "rewards/rejected": 0.0038049560971558094, "step": 1025 }, { "debug/policy_chosen_logits": 1.42290461063385, "debug/policy_chosen_logps": -4.187399864196777, "debug/policy_rejected_logits": 1.6779588460922241, "debug/policy_rejected_logps": -11.891450881958008, "debug/reference_chosen_logps": -4.215153217315674, "debug/reference_rejected_logps": -11.835853576660156, "debug/sppo_chosen_loss": 2497.26953125, "debug/sppo_chosen_reward_in_loss": 0.027753179892897606, "debug/sppo_rej_reward_in_loss": -0.05559650808572769, "debug/sppo_reject_loss": 2494.495361328125, "epoch": 3.7318840579710146, "grad_norm": 59971.605951203645, "learning_rate": 5.918747589082852e-08, "logits/chosen": 1.42290461063385, "logits/rejected": 1.6779588460922241, "logps/chosen": -4.187399864196777, "logps/rejected": -11.891450881958008, "loss": 4997.2922, "rewards/accuracies": 0.375, "rewards/chosen": 0.00027753179892897606, "rewards/margins": 0.0008334968006238341, "rewards/rejected": -0.0005559650016948581, "step": 1030 }, { "debug/policy_chosen_logits": 1.2106631994247437, "debug/policy_chosen_logps": -118.35246276855469, "debug/policy_rejected_logits": 1.325660228729248, "debug/policy_rejected_logps": -3.5711700916290283, "debug/reference_chosen_logps": -119.53863525390625, "debug/reference_rejected_logps": -3.519488573074341, "debug/sppo_chosen_loss": 2435.480712890625, "debug/sppo_chosen_reward_in_loss": 1.1861621141433716, "debug/sppo_rej_reward_in_loss": -0.051680900156497955, "debug/sppo_reject_loss": 2494.873779296875, "epoch": 3.75, "grad_norm": 11298.184488025197, "learning_rate": 5.882098831289043e-08, "logits/chosen": 1.2106631994247437, "logits/rejected": 1.325660228729248, "logps/chosen": -118.35246276855469, "logps/rejected": -3.5711700916290283, "loss": 4957.1719, "rewards/accuracies": 0.32499998807907104, "rewards/chosen": 0.011861620470881462, "rewards/margins": 0.012378430925309658, "rewards/rejected": -0.0005168089992366731, "step": 1035 }, { "debug/policy_chosen_logits": 1.9664922952651978, "debug/policy_chosen_logps": -5.417148590087891, "debug/policy_rejected_logits": 2.2743256092071533, "debug/policy_rejected_logps": -97.33494567871094, "debug/reference_chosen_logps": -5.451835632324219, "debug/reference_rejected_logps": -97.595947265625, "debug/sppo_chosen_loss": 2496.56005859375, "debug/sppo_chosen_reward_in_loss": 0.03468700498342514, "debug/sppo_rej_reward_in_loss": 0.2610108554363251, "debug/sppo_reject_loss": 2529.059814453125, "epoch": 3.7681159420289854, "grad_norm": 35799.7053202882, "learning_rate": 5.845401094059438e-08, "logits/chosen": 1.9664922952651978, "logits/rejected": 2.2743256092071533, "logps/chosen": -5.417148590087891, "logps/rejected": -97.33494567871094, "loss": 4980.9273, "rewards/accuracies": 0.30000001192092896, "rewards/chosen": 0.000346870074281469, "rewards/margins": -0.0022632384207099676, "rewards/rejected": 0.002610108582302928, "step": 1040 }, { "debug/policy_chosen_logits": 1.7527313232421875, "debug/policy_chosen_logps": -10.68443775177002, "debug/policy_rejected_logits": 2.0008606910705566, "debug/policy_rejected_logps": -23.338722229003906, "debug/reference_chosen_logps": -10.742517471313477, "debug/reference_rejected_logps": -23.387409210205078, "debug/sppo_chosen_loss": 2494.250732421875, "debug/sppo_chosen_reward_in_loss": 0.05808082967996597, "debug/sppo_rej_reward_in_loss": 0.04868631809949875, "debug/sppo_reject_loss": 2505.852783203125, "epoch": 3.786231884057971, "grad_norm": 49904.94403131573, "learning_rate": 5.808656415073263e-08, "logits/chosen": 1.7527313232421875, "logits/rejected": 2.0008606910705566, "logps/chosen": -10.68443775177002, "logps/rejected": -23.338722229003906, "loss": 4987.3695, "rewards/accuracies": 0.375, "rewards/chosen": 0.0005808082642033696, "rewards/margins": 9.394520748173818e-05, "rewards/rejected": 0.00048686322406865656, "step": 1045 }, { "debug/policy_chosen_logits": 1.4339758157730103, "debug/policy_chosen_logps": -6.624693393707275, "debug/policy_rejected_logits": 1.8763647079467773, "debug/policy_rejected_logps": -4.869560241699219, "debug/reference_chosen_logps": -6.616199493408203, "debug/reference_rejected_logps": -4.851325988769531, "debug/sppo_chosen_loss": 2500.89404296875, "debug/sppo_chosen_reward_in_loss": -0.008493724279105663, "debug/sppo_rej_reward_in_loss": -0.018234139308333397, "debug/sppo_reject_loss": 2498.192138671875, "epoch": 3.8043478260869565, "grad_norm": 15836.643628654465, "learning_rate": 5.7718668346162357e-08, "logits/chosen": 1.4339758157730103, "logits/rejected": 1.8763647079467773, "logps/chosen": -6.624693393707275, "logps/rejected": -4.869560241699219, "loss": 4990.3969, "rewards/accuracies": 0.2750000059604645, "rewards/chosen": -8.493725908920169e-05, "rewards/margins": 9.740416135173291e-05, "rewards/rejected": -0.00018234140588901937, "step": 1050 }, { "debug/policy_chosen_logits": 1.619794487953186, "debug/policy_chosen_logps": -65.96073150634766, "debug/policy_rejected_logits": 1.9365803003311157, "debug/policy_rejected_logps": -5.874847888946533, "debug/reference_chosen_logps": -66.7884292602539, "debug/reference_rejected_logps": -5.737187385559082, "debug/sppo_chosen_loss": 2443.231201171875, "debug/sppo_chosen_reward_in_loss": 0.8276926279067993, "debug/sppo_rej_reward_in_loss": -0.13766086101531982, "debug/sppo_reject_loss": 2486.42431640625, "epoch": 3.822463768115942, "grad_norm": 43718.42476723501, "learning_rate": 5.735034395467271e-08, "logits/chosen": 1.619794487953186, "logits/rejected": 1.9365803003311157, "logps/chosen": -65.96073150634766, "logps/rejected": -5.874847888946533, "loss": 4987.2109, "rewards/accuracies": 0.375, "rewards/chosen": 0.0082769263535738, "rewards/margins": 0.009653533808887005, "rewards/rejected": -0.001376608619466424, "step": 1055 }, { "debug/policy_chosen_logits": 1.5370725393295288, "debug/policy_chosen_logps": -77.93592071533203, "debug/policy_rejected_logits": 1.730516791343689, "debug/policy_rejected_logps": -54.92805862426758, "debug/reference_chosen_logps": -78.50907897949219, "debug/reference_rejected_logps": -55.16875457763672, "debug/sppo_chosen_loss": 2450.21728515625, "debug/sppo_chosen_reward_in_loss": 0.5731583833694458, "debug/sppo_rej_reward_in_loss": 0.24069073796272278, "debug/sppo_reject_loss": 2526.795166015625, "epoch": 3.8405797101449277, "grad_norm": 55740.024917307426, "learning_rate": 5.698161142785058e-08, "logits/chosen": 1.5370725393295288, "logits/rejected": 1.730516791343689, "logps/chosen": -77.93592071533203, "logps/rejected": -54.92805862426758, "loss": 5006.2063, "rewards/accuracies": 0.3499999940395355, "rewards/chosen": 0.005731584038585424, "rewards/margins": 0.0033246767707169056, "rewards/rejected": 0.002406907267868519, "step": 1060 }, { "debug/policy_chosen_logits": 1.6740124225616455, "debug/policy_chosen_logps": -71.5028305053711, "debug/policy_rejected_logits": 1.7378705739974976, "debug/policy_rejected_logps": -13.004046440124512, "debug/reference_chosen_logps": -72.53044128417969, "debug/reference_rejected_logps": -12.9281587600708, "debug/sppo_chosen_loss": 2440.041015625, "debug/sppo_chosen_reward_in_loss": 1.0275957584381104, "debug/sppo_rej_reward_in_loss": -0.07588809728622437, "debug/sppo_reject_loss": 2492.650390625, "epoch": 3.858695652173913, "grad_norm": 85242.61023067872, "learning_rate": 5.661249123994495e-08, "logits/chosen": 1.6740124225616455, "logits/rejected": 1.7378705739974976, "logps/chosen": -71.5028305053711, "logps/rejected": -13.004046440124512, "loss": 4975.35, "rewards/accuracies": 0.25, "rewards/chosen": 0.01027595717459917, "rewards/margins": 0.011034837923943996, "rewards/rejected": -0.0007588809239678085, "step": 1065 }, { "debug/policy_chosen_logits": 1.4391615390777588, "debug/policy_chosen_logps": -128.398193359375, "debug/policy_rejected_logits": 1.686471700668335, "debug/policy_rejected_logps": -3.1188156604766846, "debug/reference_chosen_logps": -129.23959350585938, "debug/reference_rejected_logps": -3.075037717819214, "debug/sppo_chosen_loss": 2426.225830078125, "debug/sppo_chosen_reward_in_loss": 0.8414149284362793, "debug/sppo_rej_reward_in_loss": -0.043777965009212494, "debug/sppo_reject_loss": 2495.640869140625, "epoch": 3.8768115942028984, "grad_norm": 56181.295485855626, "learning_rate": 5.624300388673012e-08, "logits/chosen": 1.4391615390777588, "logits/rejected": 1.686471700668335, "logps/chosen": -128.398193359375, "logps/rejected": -3.1188156604766846, "loss": 4957.4633, "rewards/accuracies": 0.42500001192092896, "rewards/chosen": 0.008414149284362793, "rewards/margins": 0.008851928636431694, "rewards/rejected": -0.00043777964310720563, "step": 1070 }, { "debug/policy_chosen_logits": 1.263810157775879, "debug/policy_chosen_logps": -6.219008445739746, "debug/policy_rejected_logits": 1.5881799459457397, "debug/policy_rejected_logps": -2.813546657562256, "debug/reference_chosen_logps": -6.159215450286865, "debug/reference_rejected_logps": -2.7770676612854004, "debug/sppo_chosen_loss": 2506.015625, "debug/sppo_chosen_reward_in_loss": -0.05979280546307564, "debug/sppo_rej_reward_in_loss": -0.03647901862859726, "debug/sppo_reject_loss": 2496.407470703125, "epoch": 3.894927536231884, "grad_norm": 18758.982232600774, "learning_rate": 5.5873169884367596e-08, "logits/chosen": 1.263810157775879, "logits/rejected": 1.5881799459457397, "logps/chosen": -6.219008445739746, "logps/rejected": -2.813546657562256, "loss": 4972.5805, "rewards/accuracies": 0.20000000298023224, "rewards/chosen": -0.000597928068600595, "rewards/margins": -0.0002331378636881709, "rewards/rejected": -0.0003647901467047632, "step": 1075 }, { "debug/policy_chosen_logits": 1.089515209197998, "debug/policy_chosen_logps": -8.586580276489258, "debug/policy_rejected_logits": 1.5352369546890259, "debug/policy_rejected_logps": -48.5157585144043, "debug/reference_chosen_logps": -8.5914306640625, "debug/reference_rejected_logps": -46.42311477661133, "debug/sppo_chosen_loss": 2499.690673828125, "debug/sppo_chosen_reward_in_loss": 0.004849851131439209, "debug/sppo_rej_reward_in_loss": -2.092644214630127, "debug/sppo_reject_loss": 2430.58154296875, "epoch": 3.9130434782608696, "grad_norm": 122627.82625988509, "learning_rate": 5.550300976826696e-08, "logits/chosen": 1.089515209197998, "logits/rejected": 1.5352369546890259, "logps/chosen": -8.586580276489258, "logps/rejected": -48.5157585144043, "loss": 6918.868, "rewards/accuracies": 0.2750000059604645, "rewards/chosen": 4.849850665777922e-05, "rewards/margins": 0.020974941551685333, "rewards/rejected": -0.020926441997289658, "step": 1080 }, { "debug/policy_chosen_logits": 1.199636459350586, "debug/policy_chosen_logps": -24.127920150756836, "debug/policy_rejected_logits": 1.6900031566619873, "debug/policy_rejected_logps": -72.46263885498047, "debug/reference_chosen_logps": -23.595714569091797, "debug/reference_rejected_logps": -69.52255249023438, "debug/sppo_chosen_loss": 2559.35791015625, "debug/sppo_chosen_reward_in_loss": -0.5322056412696838, "debug/sppo_rej_reward_in_loss": -2.940089702606201, "debug/sppo_reject_loss": 2554.7861328125, "epoch": 3.931159420289855, "grad_norm": 33230.32184098702, "learning_rate": 5.513254409194554e-08, "logits/chosen": 1.199636459350586, "logits/rejected": 1.6900031566619873, "logps/chosen": -24.127920150756836, "logps/rejected": -72.46263885498047, "loss": 6726.4039, "rewards/accuracies": 0.30000001192092896, "rewards/chosen": -0.005322056356817484, "rewards/margins": 0.024078840389847755, "rewards/rejected": -0.029400896281003952, "step": 1085 }, { "debug/policy_chosen_logits": 1.1701465845108032, "debug/policy_chosen_logps": -3.776742935180664, "debug/policy_rejected_logits": 1.678185224533081, "debug/policy_rejected_logps": -98.24723052978516, "debug/reference_chosen_logps": -3.756516695022583, "debug/reference_rejected_logps": -96.35320281982422, "debug/sppo_chosen_loss": 2502.0458984375, "debug/sppo_chosen_reward_in_loss": -0.020226484164595604, "debug/sppo_rej_reward_in_loss": -1.8940198421478271, "debug/sppo_reject_loss": 2430.40771484375, "epoch": 3.949275362318841, "grad_norm": 117522.55522735156, "learning_rate": 5.4761793425887274e-08, "logits/chosen": 1.1701465845108032, "logits/rejected": 1.678185224533081, "logps/chosen": -3.776742935180664, "logps/rejected": -98.24723052978516, "loss": 5037.0734, "rewards/accuracies": 0.30000001192092896, "rewards/chosen": -0.00020226484048180282, "rewards/margins": 0.018737932667136192, "rewards/rejected": -0.01894019916653633, "step": 1090 }, { "debug/policy_chosen_logits": 1.0843307971954346, "debug/policy_chosen_logps": -95.433837890625, "debug/policy_rejected_logits": 1.361130714416504, "debug/policy_rejected_logps": -124.06141662597656, "debug/reference_chosen_logps": -94.8940658569336, "debug/reference_rejected_logps": -120.67181396484375, "debug/sppo_chosen_loss": 2557.362060546875, "debug/sppo_chosen_reward_in_loss": -0.5397787094116211, "debug/sppo_rej_reward_in_loss": -3.3895785808563232, "debug/sppo_reject_loss": 2550.26904296875, "epoch": 3.967391304347826, "grad_norm": 26851.84280806889, "learning_rate": 5.439077835640038e-08, "logits/chosen": 1.0843307971954346, "logits/rejected": 1.361130714416504, "logps/chosen": -95.433837890625, "logps/rejected": -124.06141662597656, "loss": 5040.2961, "rewards/accuracies": 0.32499998807907104, "rewards/chosen": -0.005397786386311054, "rewards/margins": 0.028497997671365738, "rewards/rejected": -0.033895786851644516, "step": 1095 }, { "debug/policy_chosen_logits": 1.5177290439605713, "debug/policy_chosen_logps": -148.07943725585938, "debug/policy_rejected_logits": 1.742882490158081, "debug/policy_rejected_logps": -44.79954528808594, "debug/reference_chosen_logps": -147.768310546875, "debug/reference_rejected_logps": -44.236473083496094, "debug/sppo_chosen_loss": 2549.940185546875, "debug/sppo_chosen_reward_in_loss": -0.3111165761947632, "debug/sppo_rej_reward_in_loss": -0.5630791187286377, "debug/sppo_reject_loss": 2449.529296875, "epoch": 3.9855072463768115, "grad_norm": 57946.96393753477, "learning_rate": 5.4019519484474376e-08, "logits/chosen": 1.5177290439605713, "logits/rejected": 1.742882490158081, "logps/chosen": -148.07943725585938, "logps/rejected": -44.79954528808594, "loss": 4996.0746, "rewards/accuracies": 0.42500001192092896, "rewards/chosen": -0.003111165016889572, "rewards/margins": 0.0025196250062435865, "rewards/rejected": -0.00563079072162509, "step": 1100 }, { "epoch": 3.9855072463768115, "eval_debug/policy_chosen_logits": 1.6290916204452515, "eval_debug/policy_chosen_logps": -122.25318908691406, "eval_debug/policy_rejected_logits": 1.6883715391159058, "eval_debug/policy_rejected_logps": -64.11194610595703, "eval_debug/reference_chosen_logps": -123.14806365966797, "eval_debug/reference_rejected_logps": -63.887054443359375, "eval_debug/sppo_chosen_loss": 2438.27734375, "eval_debug/sppo_chosen_reward_in_loss": 0.8948830962181091, "eval_debug/sppo_rej_reward_in_loss": -0.22488436102867126, "eval_debug/sppo_reject_loss": 2479.807373046875, "eval_logits/chosen": 1.6290916204452515, "eval_logits/rejected": 1.6883715391159058, "eval_logps/chosen": -122.25318908691406, "eval_logps/rejected": -64.11194610595703, "eval_loss": 4978.2021484375, "eval_rewards/accuracies": 0.3684210479259491, "eval_rewards/chosen": 0.00894882995635271, "eval_rewards/margins": 0.01119767315685749, "eval_rewards/rejected": -0.0022488434333354235, "eval_runtime": 28.8354, "eval_samples_per_second": 20.808, "eval_steps_per_second": 0.659, "step": 1100 }, { "debug/policy_chosen_logits": 1.2726986408233643, "debug/policy_chosen_logps": -31.461612701416016, "debug/policy_rejected_logits": 1.2194486856460571, "debug/policy_rejected_logps": -41.34273147583008, "debug/reference_chosen_logps": -31.847381591796875, "debug/reference_rejected_logps": -41.46240234375, "debug/sppo_chosen_loss": 2467.671630859375, "debug/sppo_chosen_reward_in_loss": 0.3857712745666504, "debug/sppo_rej_reward_in_loss": 0.11967315524816513, "debug/sppo_reject_loss": 2513.02490234375, "epoch": 4.003623188405797, "grad_norm": 14390.631883943946, "learning_rate": 5.364803742463616e-08, "logits/chosen": 1.2726986408233643, "logits/rejected": 1.2194486856460571, "logps/chosen": -31.461612701416016, "logps/rejected": -41.34273147583008, "loss": 4960.7297, "rewards/accuracies": 0.30000001192092896, "rewards/chosen": 0.0038577124942094088, "rewards/margins": 0.00266098091378808, "rewards/rejected": 0.0011967314640060067, "step": 1105 }, { "debug/policy_chosen_logits": 1.3380236625671387, "debug/policy_chosen_logps": -16.757080078125, "debug/policy_rejected_logits": 1.8596937656402588, "debug/policy_rejected_logps": -73.24615478515625, "debug/reference_chosen_logps": -16.892589569091797, "debug/reference_rejected_logps": -73.21855926513672, "debug/sppo_chosen_loss": 2487.00537109375, "debug/sppo_chosen_reward_in_loss": 0.135512575507164, "debug/sppo_rej_reward_in_loss": -0.027594005689024925, "debug/sppo_reject_loss": 2497.5625, "epoch": 4.021739130434782, "grad_norm": 174385.9038673273, "learning_rate": 5.327635280380538e-08, "logits/chosen": 1.3380236625671387, "logits/rejected": 1.8596937656402588, "logps/chosen": -16.757080078125, "logps/rejected": -73.24615478515625, "loss": 4997.8684, "rewards/accuracies": 0.375, "rewards/chosen": 0.001355125685222447, "rewards/margins": 0.0016310656210407615, "rewards/rejected": -0.0002759400231298059, "step": 1110 }, { "debug/policy_chosen_logits": 1.4081073999404907, "debug/policy_chosen_logps": -4.7746782302856445, "debug/policy_rejected_logits": 1.539998173713684, "debug/policy_rejected_logps": -9.67628002166748, "debug/reference_chosen_logps": -4.702683448791504, "debug/reference_rejected_logps": -9.626226425170898, "debug/sppo_chosen_loss": 2507.261962890625, "debug/sppo_chosen_reward_in_loss": -0.07199463248252869, "debug/sppo_rej_reward_in_loss": -0.05005418509244919, "debug/sppo_reject_loss": 2495.11767578125, "epoch": 4.0398550724637685, "grad_norm": 12845.722912349, "learning_rate": 5.290448626014904e-08, "logits/chosen": 1.4081073999404907, "logits/rejected": 1.539998173713684, "logps/chosen": -4.7746782302856445, "logps/rejected": -9.67628002166748, "loss": 4972.6246, "rewards/accuracies": 0.25, "rewards/chosen": -0.0007199462852440774, "rewards/margins": -0.00021940446458756924, "rewards/rejected": -0.0005005418206565082, "step": 1115 }, { "debug/policy_chosen_logits": 1.394338846206665, "debug/policy_chosen_logps": -93.58735656738281, "debug/policy_rejected_logits": 1.4865351915359497, "debug/policy_rejected_logps": -50.265098571777344, "debug/reference_chosen_logps": -93.95970153808594, "debug/reference_rejected_logps": -50.229618072509766, "debug/sppo_chosen_loss": 2469.903564453125, "debug/sppo_chosen_reward_in_loss": 0.3723534643650055, "debug/sppo_rej_reward_in_loss": -0.03548173978924751, "debug/sppo_reject_loss": 2496.9140625, "epoch": 4.057971014492754, "grad_norm": 17415.662622386862, "learning_rate": 5.253245844193563e-08, "logits/chosen": 1.394338846206665, "logits/rejected": 1.4865351915359497, "logps/chosen": -93.58735656738281, "logps/rejected": -50.265098571777344, "loss": 4960.0031, "rewards/accuracies": 0.44999998807907104, "rewards/chosen": 0.0037235342897474766, "rewards/margins": 0.004078351892530918, "rewards/rejected": -0.00035481739905662835, "step": 1120 }, { "debug/policy_chosen_logits": 1.3554985523223877, "debug/policy_chosen_logps": -7.912178993225098, "debug/policy_rejected_logits": 1.6807849407196045, "debug/policy_rejected_logps": -11.890970230102539, "debug/reference_chosen_logps": -7.895465850830078, "debug/reference_rejected_logps": -11.934921264648438, "debug/sppo_chosen_loss": 2501.777587890625, "debug/sppo_chosen_reward_in_loss": -0.01671208068728447, "debug/sppo_rej_reward_in_loss": 0.04395123943686485, "debug/sppo_reject_loss": 2504.53662109375, "epoch": 4.076086956521739, "grad_norm": 103713.50097817494, "learning_rate": 5.21602900063886e-08, "logits/chosen": 1.3554985523223877, "logits/rejected": 1.6807849407196045, "logps/chosen": -7.912178993225098, "logps/rejected": -11.890970230102539, "loss": 4984.6324, "rewards/accuracies": 0.2750000059604645, "rewards/chosen": -0.00016712083015590906, "rewards/margins": -0.0006066331407055259, "rewards/rejected": 0.00043951236875727773, "step": 1125 }, { "debug/policy_chosen_logits": 1.517746090888977, "debug/policy_chosen_logps": -8.11583137512207, "debug/policy_rejected_logits": 1.7350282669067383, "debug/policy_rejected_logps": -7.305092811584473, "debug/reference_chosen_logps": -8.121663093566895, "debug/reference_rejected_logps": -7.3379998207092285, "debug/sppo_chosen_loss": 2499.482666015625, "debug/sppo_chosen_reward_in_loss": 0.005830919835716486, "debug/sppo_rej_reward_in_loss": 0.03290678188204765, "debug/sppo_reject_loss": 2503.31640625, "epoch": 4.094202898550725, "grad_norm": 18856.269373546747, "learning_rate": 5.1788001618539276e-08, "logits/chosen": 1.517746090888977, "logits/rejected": 1.7350282669067383, "logps/chosen": -8.11583137512207, "logps/rejected": -7.305092811584473, "loss": 4974.3199, "rewards/accuracies": 0.25, "rewards/chosen": 5.830921509186737e-05, "rewards/margins": -0.0002707586099859327, "rewards/rejected": 0.00032906781416386366, "step": 1130 }, { "debug/policy_chosen_logits": 1.1660598516464233, "debug/policy_chosen_logps": -234.23757934570312, "debug/policy_rejected_logits": 1.4418781995773315, "debug/policy_rejected_logps": -6.014125823974609, "debug/reference_chosen_logps": -236.1083984375, "debug/reference_rejected_logps": -5.9557414054870605, "debug/sppo_chosen_loss": 2370.38916015625, "debug/sppo_chosen_reward_in_loss": 1.8707882165908813, "debug/sppo_rej_reward_in_loss": -0.05838487297296524, "debug/sppo_reject_loss": 2494.24267578125, "epoch": 4.11231884057971, "grad_norm": 42251.384892899616, "learning_rate": 5.141561395007945e-08, "logits/chosen": 1.1660598516464233, "logits/rejected": 1.4418781995773315, "logps/chosen": -234.23757934570312, "logps/rejected": -6.014125823974609, "loss": 4964.8172, "rewards/accuracies": 0.3499999940395355, "rewards/chosen": 0.01870787888765335, "rewards/margins": 0.019291730597615242, "rewards/rejected": -0.0005838487413711846, "step": 1135 }, { "debug/policy_chosen_logits": 1.1201825141906738, "debug/policy_chosen_logps": -3.2453339099884033, "debug/policy_rejected_logits": 1.6488778591156006, "debug/policy_rejected_logps": -3.550246000289917, "debug/reference_chosen_logps": -3.2428536415100098, "debug/reference_rejected_logps": -3.4997127056121826, "debug/sppo_chosen_loss": 2500.299560546875, "debug/sppo_chosen_reward_in_loss": -0.002480667782947421, "debug/sppo_rej_reward_in_loss": -0.050533294677734375, "debug/sppo_reject_loss": 2494.99853515625, "epoch": 4.130434782608695, "grad_norm": 28041.110258157885, "learning_rate": 5.104314767821363e-08, "logits/chosen": 1.1201825141906738, "logits/rejected": 1.6488778591156006, "logps/chosen": -3.2453339099884033, "logps/rejected": -3.550246000289917, "loss": 4963.5508, "rewards/accuracies": 0.2750000059604645, "rewards/chosen": -2.480667353665922e-05, "rewards/margins": 0.0004805262142326683, "rewards/rejected": -0.0005053329514339566, "step": 1140 }, { "debug/policy_chosen_logits": 1.3151696920394897, "debug/policy_chosen_logps": -8.421621322631836, "debug/policy_rejected_logits": 1.582086205482483, "debug/policy_rejected_logps": -98.97586822509766, "debug/reference_chosen_logps": -8.481250762939453, "debug/reference_rejected_logps": -99.89872741699219, "debug/sppo_chosen_loss": 2494.12939453125, "debug/sppo_chosen_reward_in_loss": 0.059628378599882126, "debug/sppo_rej_reward_in_loss": 0.92286616563797, "debug/sppo_reject_loss": 2614.516357421875, "epoch": 4.148550724637682, "grad_norm": 49447.562521618194, "learning_rate": 5.067062348451078e-08, "logits/chosen": 1.3151696920394897, "logits/rejected": 1.582086205482483, "logps/chosen": -8.421621322631836, "logps/rejected": -98.97586822509766, "loss": 4971.2035, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": 0.0005962837603874505, "rewards/margins": -0.008632375858724117, "rewards/rejected": 0.00922865979373455, "step": 1145 }, { "debug/policy_chosen_logits": 1.3772354125976562, "debug/policy_chosen_logps": -8.230802536010742, "debug/policy_rejected_logits": 1.7520824670791626, "debug/policy_rejected_logps": -15.763114929199219, "debug/reference_chosen_logps": -8.285252571105957, "debug/reference_rejected_logps": -15.81109619140625, "debug/sppo_chosen_loss": 2494.632080078125, "debug/sppo_chosen_reward_in_loss": 0.05445030331611633, "debug/sppo_rej_reward_in_loss": 0.04798306152224541, "debug/sppo_reject_loss": 2505.33544921875, "epoch": 4.166666666666667, "grad_norm": 39027.58952313228, "learning_rate": 5.029806205375612e-08, "logits/chosen": 1.3772354125976562, "logits/rejected": 1.7520824670791626, "logps/chosen": -8.230802536010742, "logps/rejected": -15.763114929199219, "loss": 4928.2445, "rewards/accuracies": 0.2750000059604645, "rewards/chosen": 0.0005445030401460826, "rewards/margins": 6.467229832196608e-05, "rewards/rejected": 0.0004798306617885828, "step": 1150 }, { "debug/policy_chosen_logits": 1.506194829940796, "debug/policy_chosen_logps": -64.88957214355469, "debug/policy_rejected_logits": 1.9257148504257202, "debug/policy_rejected_logps": -57.977745056152344, "debug/reference_chosen_logps": -65.75659942626953, "debug/reference_rejected_logps": -58.31553268432617, "debug/sppo_chosen_loss": 2440.390625, "debug/sppo_chosen_reward_in_loss": 0.8670161366462708, "debug/sppo_rej_reward_in_loss": 0.33778566122055054, "debug/sppo_reject_loss": 2537.008544921875, "epoch": 4.184782608695652, "grad_norm": 124407.66920706173, "learning_rate": 4.9925484072802416e-08, "logits/chosen": 1.506194829940796, "logits/rejected": 1.9257148504257202, "logps/chosen": -64.88957214355469, "logps/rejected": -57.977745056152344, "loss": 4976.1289, "rewards/accuracies": 0.30000001192092896, "rewards/chosen": 0.008670160546898842, "rewards/margins": 0.005292304325848818, "rewards/rejected": 0.0033778566867113113, "step": 1155 }, { "debug/policy_chosen_logits": 1.3619253635406494, "debug/policy_chosen_logps": -6.0907135009765625, "debug/policy_rejected_logits": 1.9251205921173096, "debug/policy_rejected_logps": -9.874423027038574, "debug/reference_chosen_logps": -6.167381286621094, "debug/reference_rejected_logps": -9.863636016845703, "debug/sppo_chosen_loss": 2492.4296875, "debug/sppo_chosen_reward_in_loss": 0.07666828483343124, "debug/sppo_rej_reward_in_loss": -0.010786160826683044, "debug/sppo_reject_loss": 2499.029052734375, "epoch": 4.202898550724638, "grad_norm": 22190.683016201292, "learning_rate": 4.955291022942145e-08, "logits/chosen": 1.3619253635406494, "logits/rejected": 1.9251205921173096, "logps/chosen": -6.0907135009765625, "logps/rejected": -9.874423027038574, "loss": 4974.7156, "rewards/accuracies": 0.375, "rewards/chosen": 0.0007666827877983451, "rewards/margins": 0.0008745444938540459, "rewards/rejected": -0.0001078616114682518, "step": 1160 }, { "debug/policy_chosen_logits": 1.2866249084472656, "debug/policy_chosen_logps": -8.930032730102539, "debug/policy_rejected_logits": 1.7079235315322876, "debug/policy_rejected_logps": -43.02538299560547, "debug/reference_chosen_logps": -9.041211128234863, "debug/reference_rejected_logps": -43.405174255371094, "debug/sppo_chosen_loss": 2489.014892578125, "debug/sppo_chosen_reward_in_loss": 0.1111774668097496, "debug/sppo_rej_reward_in_loss": 0.3797917366027832, "debug/sppo_reject_loss": 2545.455810546875, "epoch": 4.221014492753623, "grad_norm": 32505.852256733157, "learning_rate": 4.918036121115522e-08, "logits/chosen": 1.2866249084472656, "logits/rejected": 1.7079235315322876, "logps/chosen": -8.930032730102539, "logps/rejected": -43.02538299560547, "loss": 4973.9523, "rewards/accuracies": 0.4749999940395355, "rewards/chosen": 0.0011117748217657208, "rewards/margins": -0.002686142222955823, "rewards/rejected": 0.0037979166954755783, "step": 1165 }, { "debug/policy_chosen_logits": 1.465065836906433, "debug/policy_chosen_logps": -217.9519805908203, "debug/policy_rejected_logits": 1.463714361190796, "debug/policy_rejected_logps": -7.929549217224121, "debug/reference_chosen_logps": -220.4656524658203, "debug/reference_rejected_logps": -7.890463829040527, "debug/sppo_chosen_loss": 2341.96142578125, "debug/sppo_chosen_reward_in_loss": 2.513664484024048, "debug/sppo_rej_reward_in_loss": -0.039085645228624344, "debug/sppo_reject_loss": 2496.15966796875, "epoch": 4.239130434782608, "grad_norm": 72702.98564243142, "learning_rate": 4.8807857704167354e-08, "logits/chosen": 1.465065836906433, "logits/rejected": 1.463714361190796, "logps/chosen": -217.9519805908203, "logps/rejected": -7.929549217224121, "loss": 4925.4531, "rewards/accuracies": 0.375, "rewards/chosen": 0.025136644020676613, "rewards/margins": 0.02552749775350094, "rewards/rejected": -0.00039085643948055804, "step": 1170 }, { "debug/policy_chosen_logits": 1.3393514156341553, "debug/policy_chosen_logps": -36.42089080810547, "debug/policy_rejected_logits": 1.355088472366333, "debug/policy_rejected_logps": -5.211674690246582, "debug/reference_chosen_logps": -36.75517654418945, "debug/reference_rejected_logps": -5.078423976898193, "debug/sppo_chosen_loss": 2470.31103515625, "debug/sppo_chosen_reward_in_loss": 0.33427920937538147, "debug/sppo_rej_reward_in_loss": -0.1332508623600006, "debug/sppo_reject_loss": 2486.86865234375, "epoch": 4.257246376811594, "grad_norm": 33139.00528925964, "learning_rate": 4.843542039209433e-08, "logits/chosen": 1.3393514156341553, "logits/rejected": 1.355088472366333, "logps/chosen": -36.42089080810547, "logps/rejected": -5.211674690246582, "loss": 4967.2695, "rewards/accuracies": 0.42500001192092896, "rewards/chosen": 0.0033427923917770386, "rewards/margins": 0.0046753003261983395, "rewards/rejected": -0.00133250851649791, "step": 1175 }, { "debug/policy_chosen_logits": 1.5113728046417236, "debug/policy_chosen_logps": -5.965389728546143, "debug/policy_rejected_logits": 1.702850103378296, "debug/policy_rejected_logps": -50.03304672241211, "debug/reference_chosen_logps": -5.976899147033691, "debug/reference_rejected_logps": -50.49010467529297, "debug/sppo_chosen_loss": 2498.88427734375, "debug/sppo_chosen_reward_in_loss": 0.011508792638778687, "debug/sppo_rej_reward_in_loss": 0.4570561945438385, "debug/sppo_reject_loss": 2554.249267578125, "epoch": 4.27536231884058, "grad_norm": 53191.90854419258, "learning_rate": 4.806306995489717e-08, "logits/chosen": 1.5113728046417236, "logits/rejected": 1.702850103378296, "logps/chosen": -5.965389728546143, "logps/rejected": -50.03304672241211, "loss": 5031.8117, "rewards/accuracies": 0.22499999403953552, "rewards/chosen": 0.00011508789611980319, "rewards/margins": -0.0044554732739925385, "rewards/rejected": 0.004570561461150646, "step": 1180 }, { "debug/policy_chosen_logits": 1.0482234954833984, "debug/policy_chosen_logps": -6.115649223327637, "debug/policy_rejected_logits": 1.4473512172698975, "debug/policy_rejected_logps": -6.680502414703369, "debug/reference_chosen_logps": -6.171689033508301, "debug/reference_rejected_logps": -6.672621726989746, "debug/sppo_chosen_loss": 2494.41943359375, "debug/sppo_chosen_reward_in_loss": 0.05603953078389168, "debug/sppo_rej_reward_in_loss": -0.007880300283432007, "debug/sppo_reject_loss": 2499.238037109375, "epoch": 4.293478260869565, "grad_norm": 82370.38963375674, "learning_rate": 4.769082706771303e-08, "logits/chosen": 1.0482234954833984, "logits/rejected": 1.4473512172698975, "logps/chosen": -6.115649223327637, "logps/rejected": -6.680502414703369, "loss": 4981.8988, "rewards/accuracies": 0.3499999940395355, "rewards/chosen": 0.0005603953031823039, "rewards/margins": 0.000639198231510818, "rewards/rejected": -7.880300108809024e-05, "step": 1185 }, { "debug/policy_chosen_logits": 1.4925187826156616, "debug/policy_chosen_logps": -12.615328788757324, "debug/policy_rejected_logits": 1.646032691001892, "debug/policy_rejected_logps": -8.762921333312988, "debug/reference_chosen_logps": -12.709831237792969, "debug/reference_rejected_logps": -8.655733108520508, "debug/sppo_chosen_loss": 2490.93701171875, "debug/sppo_chosen_reward_in_loss": 0.09450257569551468, "debug/sppo_rej_reward_in_loss": -0.10718753188848495, "debug/sppo_reject_loss": 2489.41943359375, "epoch": 4.311594202898551, "grad_norm": 17342.831448047742, "learning_rate": 4.731871239970723e-08, "logits/chosen": 1.4925187826156616, "logits/rejected": 1.646032691001892, "logps/chosen": -12.615328788757324, "logps/rejected": -8.762921333312988, "loss": 4988.416, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": 0.0009450257639400661, "rewards/margins": 0.0020169010385870934, "rewards/rejected": -0.00107187544927001, "step": 1190 }, { "debug/policy_chosen_logits": 1.2318902015686035, "debug/policy_chosen_logps": -3.509345293045044, "debug/policy_rejected_logits": 1.5467309951782227, "debug/policy_rejected_logps": -110.87577819824219, "debug/reference_chosen_logps": -3.5323424339294434, "debug/reference_rejected_logps": -111.6783676147461, "debug/sppo_chosen_loss": 2497.713623046875, "debug/sppo_chosen_reward_in_loss": 0.022996854037046432, "debug/sppo_rej_reward_in_loss": 0.8025868535041809, "debug/sppo_reject_loss": 2596.442626953125, "epoch": 4.329710144927536, "grad_norm": 84040.45593026832, "learning_rate": 4.694674661292563e-08, "logits/chosen": 1.2318902015686035, "logits/rejected": 1.5467309951782227, "logps/chosen": -3.509345293045044, "logps/rejected": -110.87577819824219, "loss": 4999.8031, "rewards/accuracies": 0.22499999403953552, "rewards/chosen": 0.00022996852931100875, "rewards/margins": -0.007795900106430054, "rewards/rejected": 0.008025867864489555, "step": 1195 }, { "debug/policy_chosen_logits": 1.8548692464828491, "debug/policy_chosen_logps": -9.570596694946289, "debug/policy_rejected_logits": 2.2644782066345215, "debug/policy_rejected_logps": -92.725341796875, "debug/reference_chosen_logps": -9.619321823120117, "debug/reference_rejected_logps": -92.8711166381836, "debug/sppo_chosen_loss": 2495.21044921875, "debug/sppo_chosen_reward_in_loss": 0.0487261600792408, "debug/sppo_rej_reward_in_loss": 0.14577673375606537, "debug/sppo_reject_loss": 2515.51611328125, "epoch": 4.3478260869565215, "grad_norm": 108224.5908545546, "learning_rate": 4.6574950361147296e-08, "logits/chosen": 1.8548692464828491, "logits/rejected": 2.2644782066345215, "logps/chosen": -9.570596694946289, "logps/rejected": -92.725341796875, "loss": 4988.032, "rewards/accuracies": 0.3499999940395355, "rewards/chosen": 0.0004872615390922874, "rewards/margins": -0.0009705059346742928, "rewards/rejected": 0.001457767328247428, "step": 1200 }, { "epoch": 4.3478260869565215, "eval_debug/policy_chosen_logits": 1.6634116172790527, "eval_debug/policy_chosen_logps": -121.4332504272461, "eval_debug/policy_rejected_logits": 1.7223279476165771, "eval_debug/policy_rejected_logps": -63.91315841674805, "eval_debug/reference_chosen_logps": -123.14806365966797, "eval_debug/reference_rejected_logps": -63.887054443359375, "eval_debug/sppo_chosen_loss": 2381.583984375, "eval_debug/sppo_chosen_reward_in_loss": 1.7147976160049438, "eval_debug/sppo_rej_reward_in_loss": -0.026104070246219635, "eval_debug/sppo_reject_loss": 2497.433837890625, "eval_logits/chosen": 1.6634116172790527, "eval_logits/rejected": 1.7223279476165771, "eval_logps/chosen": -121.4332504272461, "eval_logps/rejected": -63.91315841674805, "eval_loss": 4952.40185546875, "eval_rewards/accuracies": 0.3815789520740509, "eval_rewards/chosen": 0.01714797504246235, "eval_rewards/margins": 0.017409013584256172, "eval_rewards/rejected": -0.0002610406663734466, "eval_runtime": 28.7268, "eval_samples_per_second": 20.886, "eval_steps_per_second": 0.661, "step": 1200 }, { "debug/policy_chosen_logits": 1.2625658512115479, "debug/policy_chosen_logps": -104.67900085449219, "debug/policy_rejected_logits": 1.5109230279922485, "debug/policy_rejected_logps": -119.05806732177734, "debug/reference_chosen_logps": -104.79878234863281, "debug/reference_rejected_logps": -119.53370666503906, "debug/sppo_chosen_loss": 2488.929931640625, "debug/sppo_chosen_reward_in_loss": 0.11978638172149658, "debug/sppo_rej_reward_in_loss": 0.475635826587677, "debug/sppo_reject_loss": 2553.296875, "epoch": 4.365942028985507, "grad_norm": 83697.58265847957, "learning_rate": 4.6203344288737694e-08, "logits/chosen": 1.2625658512115479, "logits/rejected": 1.5109230279922485, "logps/chosen": -104.67900085449219, "logps/rejected": -119.05806732177734, "loss": 4984.5461, "rewards/accuracies": 0.25, "rewards/chosen": 0.0011978638358414173, "rewards/margins": -0.0035584941506385803, "rewards/rejected": 0.004756357986479998, "step": 1205 }, { "debug/policy_chosen_logits": 1.564095377922058, "debug/policy_chosen_logps": -136.6577606201172, "debug/policy_rejected_logits": 1.9398508071899414, "debug/policy_rejected_logps": -105.86729431152344, "debug/reference_chosen_logps": -138.21644592285156, "debug/reference_rejected_logps": -105.77984619140625, "debug/sppo_chosen_loss": 2396.899658203125, "debug/sppo_chosen_reward_in_loss": 1.5586907863616943, "debug/sppo_rej_reward_in_loss": -0.08747353404760361, "debug/sppo_reject_loss": 2491.5009765625, "epoch": 4.384057971014493, "grad_norm": 53515.422913985036, "learning_rate": 4.583194902950234e-08, "logits/chosen": 1.564095377922058, "logits/rejected": 1.9398508071899414, "logps/chosen": -136.6577606201172, "logps/rejected": -105.86729431152344, "loss": 4950.4969, "rewards/accuracies": 0.3499999940395355, "rewards/chosen": 0.015586909838020802, "rewards/margins": 0.01646164432168007, "rewards/rejected": -0.0008747352985665202, "step": 1210 }, { "debug/policy_chosen_logits": 1.2941845655441284, "debug/policy_chosen_logps": -6.256102561950684, "debug/policy_rejected_logits": 1.3666229248046875, "debug/policy_rejected_logps": -2.8788514137268066, "debug/reference_chosen_logps": -6.278141021728516, "debug/reference_rejected_logps": -2.8032212257385254, "debug/sppo_chosen_loss": 2497.870849609375, "debug/sppo_chosen_reward_in_loss": 0.02203887328505516, "debug/sppo_rej_reward_in_loss": -0.07563024759292603, "debug/sppo_reject_loss": 2492.65673828125, "epoch": 4.4021739130434785, "grad_norm": 173823.9493048742, "learning_rate": 4.546078520554123e-08, "logits/chosen": 1.2941845655441284, "logits/rejected": 1.3666229248046875, "logps/chosen": -6.256102561950684, "logps/rejected": -2.8788514137268066, "loss": 4986.5516, "rewards/accuracies": 0.3499999940395355, "rewards/chosen": 0.00022038875613361597, "rewards/margins": 0.0009766912553459406, "rewards/rejected": -0.0007563024410046637, "step": 1215 }, { "debug/policy_chosen_logits": 0.9879053831100464, "debug/policy_chosen_logps": -3.2489356994628906, "debug/policy_rejected_logits": 1.3216302394866943, "debug/policy_rejected_logps": -6.594407558441162, "debug/reference_chosen_logps": -3.2298636436462402, "debug/reference_rejected_logps": -6.5863037109375, "debug/sppo_chosen_loss": 2501.91650390625, "debug/sppo_chosen_reward_in_loss": -0.0190723929554224, "debug/sppo_rej_reward_in_loss": -0.0081039909273386, "debug/sppo_reject_loss": 2499.23583984375, "epoch": 4.420289855072464, "grad_norm": 93139.60160339653, "learning_rate": 4.5089873426103575e-08, "logits/chosen": 0.9879053831100464, "logits/rejected": 1.3216302394866943, "logps/chosen": -3.2489356994628906, "logps/rejected": -6.594407558441162, "loss": 4996.0926, "rewards/accuracies": 0.15000000596046448, "rewards/chosen": -0.00019072392024099827, "rewards/margins": -0.00010968399874400347, "rewards/rejected": -8.103992877295241e-05, "step": 1220 }, { "debug/policy_chosen_logits": 1.4566363096237183, "debug/policy_chosen_logps": -98.23893737792969, "debug/policy_rejected_logits": 1.5736243724822998, "debug/policy_rejected_logps": -63.44609832763672, "debug/reference_chosen_logps": -98.9661865234375, "debug/reference_rejected_logps": -63.7095832824707, "debug/sppo_chosen_loss": 2452.74853515625, "debug/sppo_chosen_reward_in_loss": 0.7272627353668213, "debug/sppo_rej_reward_in_loss": 0.2634854316711426, "debug/sppo_reject_loss": 2532.317138671875, "epoch": 4.438405797101449, "grad_norm": 42485.04790901766, "learning_rate": 4.471923428644361e-08, "logits/chosen": 1.4566363096237183, "logits/rejected": 1.5736243724822998, "logps/chosen": -98.23893737792969, "logps/rejected": -63.44609832763672, "loss": 4969.5867, "rewards/accuracies": 0.25, "rewards/chosen": 0.007272626273334026, "rewards/margins": 0.004637773614376783, "rewards/rejected": 0.0026348542887717485, "step": 1225 }, { "debug/policy_chosen_logits": 1.3912606239318848, "debug/policy_chosen_logps": -10.750028610229492, "debug/policy_rejected_logits": 1.7333920001983643, "debug/policy_rejected_logps": -44.13505935668945, "debug/reference_chosen_logps": -10.939241409301758, "debug/reference_rejected_logps": -44.20545196533203, "debug/sppo_chosen_loss": 2481.545654296875, "debug/sppo_chosen_reward_in_loss": 0.18921318650245667, "debug/sppo_rej_reward_in_loss": 0.07039423286914825, "debug/sppo_reject_loss": 2507.452880859375, "epoch": 4.456521739130435, "grad_norm": 62889.09649007921, "learning_rate": 4.4348888366677e-08, "logits/chosen": 1.3912606239318848, "logits/rejected": 1.7333920001983643, "logps/chosen": -10.750028610229492, "logps/rejected": -44.13505935668945, "loss": 4940.2406, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": 0.0018921317532658577, "rewards/margins": 0.0011881894897669554, "rewards/rejected": 0.0007039423217065632, "step": 1230 }, { "debug/policy_chosen_logits": 1.3826220035552979, "debug/policy_chosen_logps": -43.23428726196289, "debug/policy_rejected_logits": 1.7299890518188477, "debug/policy_rejected_logps": -10.38016414642334, "debug/reference_chosen_logps": -43.48090744018555, "debug/reference_rejected_logps": -10.295641899108887, "debug/sppo_chosen_loss": 2476.89794921875, "debug/sppo_chosen_reward_in_loss": 0.24661867320537567, "debug/sppo_rej_reward_in_loss": -0.0845230221748352, "debug/sppo_reject_loss": 2491.640869140625, "epoch": 4.47463768115942, "grad_norm": 43290.7943303566, "learning_rate": 4.3978856230638006e-08, "logits/chosen": 1.3826220035552979, "logits/rejected": 1.7299890518188477, "logps/chosen": -43.23428726196289, "logps/rejected": -10.38016414642334, "loss": 4942.0086, "rewards/accuracies": 0.25, "rewards/chosen": 0.0024661868810653687, "rewards/margins": 0.0033114172983914614, "rewards/rejected": -0.0008452301844954491, "step": 1235 }, { "debug/policy_chosen_logits": 1.140354871749878, "debug/policy_chosen_logps": -2.2498490810394287, "debug/policy_rejected_logits": 1.7514533996582031, "debug/policy_rejected_logps": -7.117529392242432, "debug/reference_chosen_logps": -2.2591500282287598, "debug/reference_rejected_logps": -7.0815110206604, "debug/sppo_chosen_loss": 2499.13720703125, "debug/sppo_chosen_reward_in_loss": 0.009300893172621727, "debug/sppo_rej_reward_in_loss": -0.03601868823170662, "debug/sppo_reject_loss": 2496.442138671875, "epoch": 4.492753623188406, "grad_norm": 55766.08036341901, "learning_rate": 4.360915842473778e-08, "logits/chosen": 1.140354871749878, "logits/rejected": 1.7514533996582031, "logps/chosen": -2.2498490810394287, "logps/rejected": -7.117529392242432, "loss": 5001.0477, "rewards/accuracies": 0.30000001192092896, "rewards/chosen": 9.300892997998744e-05, "rewards/margins": 0.00045319582568481565, "rewards/rejected": -0.00036018691025674343, "step": 1240 }, { "debug/policy_chosen_logits": 1.4584836959838867, "debug/policy_chosen_logps": -8.69191837310791, "debug/policy_rejected_logits": 1.6307458877563477, "debug/policy_rejected_logps": -6.163792133331299, "debug/reference_chosen_logps": -8.682868957519531, "debug/reference_rejected_logps": -6.076182842254639, "debug/sppo_chosen_loss": 2500.9580078125, "debug/sppo_chosen_reward_in_loss": -0.009048166684806347, "debug/sppo_rej_reward_in_loss": -0.08760930597782135, "debug/sppo_reject_loss": 2491.5205078125, "epoch": 4.510869565217392, "grad_norm": 58435.35153947338, "learning_rate": 4.323981547682341e-08, "logits/chosen": 1.4584836959838867, "logits/rejected": 1.6307458877563477, "logps/chosen": -8.69191837310791, "logps/rejected": -6.163792133331299, "loss": 4954.7418, "rewards/accuracies": 0.3499999940395355, "rewards/chosen": -9.048165520653129e-05, "rewards/margins": 0.0007856113952584565, "rewards/rejected": -0.0008760929922573268, "step": 1245 }, { "debug/policy_chosen_logits": 1.0195560455322266, "debug/policy_chosen_logps": -221.4563446044922, "debug/policy_rejected_logits": 1.488187551498413, "debug/policy_rejected_logps": -3.5700771808624268, "debug/reference_chosen_logps": -222.7086181640625, "debug/reference_rejected_logps": -3.507493257522583, "debug/sppo_chosen_loss": 2407.380615234375, "debug/sppo_chosen_reward_in_loss": 1.2522696256637573, "debug/sppo_rej_reward_in_loss": -0.06258374452590942, "debug/sppo_reject_loss": 2493.918212890625, "epoch": 4.528985507246377, "grad_norm": 40810.196959335975, "learning_rate": 4.287084789503821e-08, "logits/chosen": 1.0195560455322266, "logits/rejected": 1.488187551498413, "logps/chosen": -221.4563446044922, "logps/rejected": -3.5700771808624268, "loss": 4969.1547, "rewards/accuracies": 0.22499999403953552, "rewards/chosen": 0.01252269558608532, "rewards/margins": 0.013148533180356026, "rewards/rejected": -0.0006258374196477234, "step": 1250 }, { "debug/policy_chosen_logits": 1.3192247152328491, "debug/policy_chosen_logps": -59.671043395996094, "debug/policy_rejected_logits": 1.411346197128296, "debug/policy_rejected_logps": -74.88134765625, "debug/reference_chosen_logps": -60.365997314453125, "debug/reference_rejected_logps": -74.52059173583984, "debug/sppo_chosen_loss": 2451.70263671875, "debug/sppo_chosen_reward_in_loss": 0.6949574947357178, "debug/sppo_rej_reward_in_loss": -0.36074990034103394, "debug/sppo_reject_loss": 2468.903076171875, "epoch": 4.547101449275362, "grad_norm": 70655.09583558, "learning_rate": 4.25022761666828e-08, "logits/chosen": 1.3192247152328491, "logits/rejected": 1.411346197128296, "logps/chosen": -59.671043395996094, "logps/rejected": -74.88134765625, "loss": 4962.475, "rewards/accuracies": 0.25, "rewards/chosen": 0.006949573755264282, "rewards/margins": 0.010557073168456554, "rewards/rejected": -0.0036074991803616285, "step": 1255 }, { "debug/policy_chosen_logits": 1.4108763933181763, "debug/policy_chosen_logps": -81.37194061279297, "debug/policy_rejected_logits": 1.4654836654663086, "debug/policy_rejected_logps": -46.3926887512207, "debug/reference_chosen_logps": -82.198974609375, "debug/reference_rejected_logps": -46.564697265625, "debug/sppo_chosen_loss": 2432.73681640625, "debug/sppo_chosen_reward_in_loss": 0.8270298838615417, "debug/sppo_rej_reward_in_loss": 0.1720089167356491, "debug/sppo_reject_loss": 2520.19091796875, "epoch": 4.565217391304348, "grad_norm": 171830.112021159, "learning_rate": 4.2134120757077734e-08, "logits/chosen": 1.4108763933181763, "logits/rejected": 1.4654836654663086, "logps/chosen": -81.37194061279297, "logps/rejected": -46.3926887512207, "loss": 4966.8379, "rewards/accuracies": 0.375, "rewards/chosen": 0.008270299062132835, "rewards/margins": 0.006550210062414408, "rewards/rejected": 0.0017200892325490713, "step": 1260 }, { "debug/policy_chosen_logits": 1.5830053091049194, "debug/policy_chosen_logps": -59.827484130859375, "debug/policy_rejected_logits": 1.7552168369293213, "debug/policy_rejected_logps": -5.3651838302612305, "debug/reference_chosen_logps": -60.350914001464844, "debug/reference_rejected_logps": -5.316500186920166, "debug/sppo_chosen_loss": 2459.37744140625, "debug/sppo_chosen_reward_in_loss": 0.523438572883606, "debug/sppo_rej_reward_in_loss": -0.04868318513035774, "debug/sppo_reject_loss": 2495.2138671875, "epoch": 4.583333333333333, "grad_norm": 118053.55600928616, "learning_rate": 4.176640210842699e-08, "logits/chosen": 1.5830053091049194, "logits/rejected": 1.7552168369293213, "logps/chosen": -59.827484130859375, "logps/rejected": -5.3651838302612305, "loss": 4983.1359, "rewards/accuracies": 0.20000000298023224, "rewards/chosen": 0.0052343858405947685, "rewards/margins": 0.0057212174870073795, "rewards/rejected": -0.00048683182103559375, "step": 1265 }, { "debug/policy_chosen_logits": 1.5887081623077393, "debug/policy_chosen_logps": -7.698332786560059, "debug/policy_rejected_logits": 1.689623236656189, "debug/policy_rejected_logps": -9.404260635375977, "debug/reference_chosen_logps": -7.772337436676025, "debug/reference_rejected_logps": -9.234907150268555, "debug/sppo_chosen_loss": 2492.68994140625, "debug/sppo_chosen_reward_in_loss": 0.07400371134281158, "debug/sppo_rej_reward_in_loss": -0.1693534404039383, "debug/sppo_reject_loss": 2483.37890625, "epoch": 4.601449275362318, "grad_norm": 32881.04113353441, "learning_rate": 4.139914063868293e-08, "logits/chosen": 1.5887081623077393, "logits/rejected": 1.689623236656189, "logps/chosen": -7.698332786560059, "logps/rejected": -9.404260635375977, "loss": 4986.0195, "rewards/accuracies": 0.44999998807907104, "rewards/chosen": 0.0007400370668619871, "rewards/margins": 0.0024335714988410473, "rewards/rejected": -0.0016935344319790602, "step": 1270 }, { "debug/policy_chosen_logits": 1.3465834856033325, "debug/policy_chosen_logps": -41.00519943237305, "debug/policy_rejected_logits": 1.6631807088851929, "debug/policy_rejected_logps": -170.9693603515625, "debug/reference_chosen_logps": -41.317928314208984, "debug/reference_rejected_logps": -171.03085327148438, "debug/sppo_chosen_loss": 2471.82666015625, "debug/sppo_chosen_reward_in_loss": 0.312730610370636, "debug/sppo_rej_reward_in_loss": 0.06150226667523384, "debug/sppo_reject_loss": 2529.27001953125, "epoch": 4.619565217391305, "grad_norm": 59882.690953666315, "learning_rate": 4.103235674041266e-08, "logits/chosen": 1.3465834856033325, "logits/rejected": 1.6631807088851929, "logps/chosen": -41.00519943237305, "logps/rejected": -170.9693603515625, "loss": 4958.9504, "rewards/accuracies": 0.42500001192092896, "rewards/chosen": 0.0031273060012608767, "rewards/margins": 0.0025122829247266054, "rewards/rejected": 0.0006150230765342712, "step": 1275 }, { "debug/policy_chosen_logits": 1.631051778793335, "debug/policy_chosen_logps": -4.740657329559326, "debug/policy_rejected_logits": 1.5207099914550781, "debug/policy_rejected_logps": -86.55484771728516, "debug/reference_chosen_logps": -4.780890464782715, "debug/reference_rejected_logps": -86.65788269042969, "debug/sppo_chosen_loss": 2496.043212890625, "debug/sppo_chosen_reward_in_loss": 0.04023274779319763, "debug/sppo_rej_reward_in_loss": 0.1030365452170372, "debug/sppo_reject_loss": 2510.56689453125, "epoch": 4.63768115942029, "grad_norm": 19294.14415598518, "learning_rate": 4.066607077966558e-08, "logits/chosen": 1.631051778793335, "logits/rejected": 1.5207099914550781, "logps/chosen": -4.740657329559326, "logps/rejected": -86.55484771728516, "loss": 4968.8836, "rewards/accuracies": 0.25, "rewards/chosen": 0.0004023274523206055, "rewards/margins": -0.0006280379020608962, "rewards/rejected": 0.0010303652379661798, "step": 1280 }, { "debug/policy_chosen_logits": 1.387338399887085, "debug/policy_chosen_logps": -173.78817749023438, "debug/policy_rejected_logits": 1.8178889751434326, "debug/policy_rejected_logps": -4.233766555786133, "debug/reference_chosen_logps": -176.03570556640625, "debug/reference_rejected_logps": -4.204698085784912, "debug/sppo_chosen_loss": 2344.955810546875, "debug/sppo_chosen_reward_in_loss": 2.2475287914276123, "debug/sppo_rej_reward_in_loss": -0.029068315401673317, "debug/sppo_reject_loss": 2497.22607421875, "epoch": 4.655797101449275, "grad_norm": 89091.06024682595, "learning_rate": 4.030030309484266e-08, "logits/chosen": 1.387338399887085, "logits/rejected": 1.8178889751434326, "logps/chosen": -173.78817749023438, "logps/rejected": -4.233766555786133, "loss": 4987.3121, "rewards/accuracies": 0.375, "rewards/chosen": 0.022475287318229675, "rewards/margins": 0.02276597172021866, "rewards/rejected": -0.00029068312142044306, "step": 1285 }, { "debug/policy_chosen_logits": 1.2566910982131958, "debug/policy_chosen_logps": -51.799217224121094, "debug/policy_rejected_logits": 1.6671909093856812, "debug/policy_rejected_logps": -109.68305969238281, "debug/reference_chosen_logps": -52.22807693481445, "debug/reference_rejected_logps": -110.21329498291016, "debug/sppo_chosen_loss": 2464.20361328125, "debug/sppo_chosen_reward_in_loss": 0.428859144449234, "debug/sppo_rej_reward_in_loss": 0.5302290916442871, "debug/sppo_reject_loss": 2560.167724609375, "epoch": 4.673913043478261, "grad_norm": 107053.68268230498, "learning_rate": 3.9935073995566984e-08, "logits/chosen": 1.2566910982131958, "logits/rejected": 1.6671909093856812, "logps/chosen": -51.799217224121094, "logps/rejected": -109.68305969238281, "loss": 4971.6789, "rewards/accuracies": 0.25, "rewards/chosen": 0.00428859144449234, "rewards/margins": -0.0010136992204934359, "rewards/rejected": 0.005302290432155132, "step": 1290 }, { "debug/policy_chosen_logits": 1.4166959524154663, "debug/policy_chosen_logps": -55.22007369995117, "debug/policy_rejected_logits": 1.9318931102752686, "debug/policy_rejected_logps": -5.746127128601074, "debug/reference_chosen_logps": -55.77851486206055, "debug/reference_rejected_logps": -5.727991580963135, "debug/sppo_chosen_loss": 2452.86083984375, "debug/sppo_chosen_reward_in_loss": 0.5584417581558228, "debug/sppo_rej_reward_in_loss": -0.018135327845811844, "debug/sppo_reject_loss": 2498.22412109375, "epoch": 4.692028985507246, "grad_norm": 97066.94860922363, "learning_rate": 3.957040376155625e-08, "logits/chosen": 1.4166959524154663, "logits/rejected": 1.9318931102752686, "logps/chosen": -55.22007369995117, "logps/rejected": -5.746127128601074, "loss": 4982.6426, "rewards/accuracies": 0.32499998807907104, "rewards/chosen": 0.005584417376667261, "rewards/margins": 0.005765770561993122, "rewards/rejected": -0.00018135327263735235, "step": 1295 }, { "debug/policy_chosen_logits": 1.464232087135315, "debug/policy_chosen_logps": -8.02291202545166, "debug/policy_rejected_logits": 1.5606266260147095, "debug/policy_rejected_logps": -103.11021423339844, "debug/reference_chosen_logps": -8.032278060913086, "debug/reference_rejected_logps": -102.90438079833984, "debug/sppo_chosen_loss": 2499.16552734375, "debug/sppo_chosen_reward_in_loss": 0.009366476908326149, "debug/sppo_rej_reward_in_loss": -0.20583924651145935, "debug/sppo_reject_loss": 2480.649658203125, "epoch": 4.710144927536232, "grad_norm": 14508.559003332673, "learning_rate": 3.920631264149647e-08, "logits/chosen": 1.464232087135315, "logits/rejected": 1.5606266260147095, "logps/chosen": -8.02291202545166, "logps/rejected": -103.11021423339844, "loss": 4982.1008, "rewards/accuracies": 0.32499998807907104, "rewards/chosen": 9.366478479932994e-05, "rewards/margins": 0.0021520573645830154, "rewards/rejected": -0.0020583923906087875, "step": 1300 }, { "epoch": 4.710144927536232, "eval_debug/policy_chosen_logits": 1.6602166891098022, "eval_debug/policy_chosen_logps": -121.43695831298828, "eval_debug/policy_rejected_logits": 1.71922767162323, "eval_debug/policy_rejected_logps": -63.9127197265625, "eval_debug/reference_chosen_logps": -123.14806365966797, "eval_debug/reference_rejected_logps": -63.887054443359375, "eval_debug/sppo_chosen_loss": 2388.193359375, "eval_debug/sppo_chosen_reward_in_loss": 1.711087703704834, "eval_debug/sppo_rej_reward_in_loss": -0.025658784434199333, "eval_debug/sppo_reject_loss": 2497.482421875, "eval_logits/chosen": 1.6602166891098022, "eval_logits/rejected": 1.71922767162323, "eval_logps/chosen": -121.43695831298828, "eval_logps/rejected": -63.9127197265625, "eval_loss": 4951.431640625, "eval_rewards/accuracies": 0.3552631437778473, "eval_rewards/chosen": 0.017110876739025116, "eval_rewards/margins": 0.017367463558912277, "eval_rewards/rejected": -0.00025658783852122724, "eval_runtime": 28.7518, "eval_samples_per_second": 20.868, "eval_steps_per_second": 0.661, "step": 1300 }, { "debug/policy_chosen_logits": 1.230797529220581, "debug/policy_chosen_logps": -40.214088439941406, "debug/policy_rejected_logits": 1.6269108057022095, "debug/policy_rejected_logps": -63.09296798706055, "debug/reference_chosen_logps": -40.53647994995117, "debug/reference_rejected_logps": -63.32228469848633, "debug/sppo_chosen_loss": 2471.490478515625, "debug/sppo_chosen_reward_in_loss": 0.32239586114883423, "debug/sppo_rej_reward_in_loss": 0.2293127328157425, "debug/sppo_reject_loss": 2526.587890625, "epoch": 4.728260869565218, "grad_norm": 39875.52883919624, "learning_rate": 3.884282085191782e-08, "logits/chosen": 1.230797529220581, "logits/rejected": 1.6269108057022095, "logps/chosen": -40.214088439941406, "logps/rejected": -63.09296798706055, "loss": 4932.1164, "rewards/accuracies": 0.2750000059604645, "rewards/chosen": 0.003223958658054471, "rewards/margins": 0.0009308316512033343, "rewards/rejected": 0.0022931271232664585, "step": 1305 }, { "debug/policy_chosen_logits": 1.4519037008285522, "debug/policy_chosen_logps": -3.3636975288391113, "debug/policy_rejected_logits": 1.6874501705169678, "debug/policy_rejected_logps": -5.398979187011719, "debug/reference_chosen_logps": -3.3413052558898926, "debug/reference_rejected_logps": -5.367804527282715, "debug/sppo_chosen_loss": 2502.26513671875, "debug/sppo_chosen_reward_in_loss": -0.02239195629954338, "debug/sppo_rej_reward_in_loss": -0.031174475327134132, "debug/sppo_reject_loss": 2496.99560546875, "epoch": 4.746376811594203, "grad_norm": 87716.95432278488, "learning_rate": 3.847994857607208e-08, "logits/chosen": 1.4519037008285522, "logits/rejected": 1.6874501705169678, "logps/chosen": -3.3636975288391113, "logps/rejected": -5.398979187011719, "loss": 4950.9809, "rewards/accuracies": 0.32499998807907104, "rewards/chosen": -0.00022391956008505076, "rewards/margins": 8.782518852967769e-05, "rewards/rejected": -0.00031174480682238936, "step": 1310 }, { "debug/policy_chosen_logits": 1.7106530666351318, "debug/policy_chosen_logps": -117.10188293457031, "debug/policy_rejected_logits": 2.0245490074157715, "debug/policy_rejected_logps": -139.97669982910156, "debug/reference_chosen_logps": -118.21076965332031, "debug/reference_rejected_logps": -140.56314086914062, "debug/sppo_chosen_loss": 2412.84326171875, "debug/sppo_chosen_reward_in_loss": 1.1088967323303223, "debug/sppo_rej_reward_in_loss": 0.5864347815513611, "debug/sppo_reject_loss": 2565.67822265625, "epoch": 4.7644927536231885, "grad_norm": 27388.447729584295, "learning_rate": 3.811771596281181e-08, "logits/chosen": 1.7106530666351318, "logits/rejected": 2.0245490074157715, "logps/chosen": -117.10188293457031, "logps/rejected": -139.97669982910156, "loss": 4963.977, "rewards/accuracies": 0.375, "rewards/chosen": 0.011088966391980648, "rewards/margins": 0.005224618129432201, "rewards/rejected": 0.005864348262548447, "step": 1315 }, { "debug/policy_chosen_logits": 1.3039146661758423, "debug/policy_chosen_logps": -105.98665618896484, "debug/policy_rejected_logits": 1.6506599187850952, "debug/policy_rejected_logps": -87.28633880615234, "debug/reference_chosen_logps": -106.98687744140625, "debug/reference_rejected_logps": -87.4347152709961, "debug/sppo_chosen_loss": 2435.859375, "debug/sppo_chosen_reward_in_loss": 1.0002405643463135, "debug/sppo_rej_reward_in_loss": 0.14837436378002167, "debug/sppo_reject_loss": 2516.51708984375, "epoch": 4.782608695652174, "grad_norm": 59042.910850701286, "learning_rate": 3.775614312547174e-08, "logits/chosen": 1.3039146661758423, "logits/rejected": 1.6506599187850952, "logps/chosen": -105.98665618896484, "logps/rejected": -87.28633880615234, "loss": 4972.8613, "rewards/accuracies": 0.375, "rewards/chosen": 0.010002405382692814, "rewards/margins": 0.008518660441040993, "rewards/rejected": 0.0014837437774986029, "step": 1320 }, { "debug/policy_chosen_logits": 1.3661106824874878, "debug/policy_chosen_logps": -87.15650939941406, "debug/policy_rejected_logits": 1.580627202987671, "debug/policy_rejected_logps": -4.730563163757324, "debug/reference_chosen_logps": -87.5677719116211, "debug/reference_rejected_logps": -4.6185102462768555, "debug/sppo_chosen_loss": 2462.53515625, "debug/sppo_chosen_reward_in_loss": 0.4112638831138611, "debug/sppo_rej_reward_in_loss": -0.11205343902111053, "debug/sppo_reject_loss": 2488.926513671875, "epoch": 4.800724637681159, "grad_norm": 62960.21840594305, "learning_rate": 3.739525014075178e-08, "logits/chosen": 1.3661106824874878, "logits/rejected": 1.580627202987671, "logps/chosen": -87.15650939941406, "logps/rejected": -4.730563163757324, "loss": 4934.282, "rewards/accuracies": 0.32499998807907104, "rewards/chosen": 0.004112638533115387, "rewards/margins": 0.005233173258602619, "rewards/rejected": -0.0011205343762412667, "step": 1325 }, { "debug/policy_chosen_logits": 1.2207223176956177, "debug/policy_chosen_logps": -9.838308334350586, "debug/policy_rejected_logits": 1.5549678802490234, "debug/policy_rejected_logps": -4.913496971130371, "debug/reference_chosen_logps": -9.850369453430176, "debug/reference_rejected_logps": -4.753626823425293, "debug/sppo_chosen_loss": 2498.87255859375, "debug/sppo_chosen_reward_in_loss": 0.01206052303314209, "debug/sppo_rej_reward_in_loss": -0.15986987948417664, "debug/sppo_reject_loss": 2484.53955078125, "epoch": 4.818840579710145, "grad_norm": 19636.755810878683, "learning_rate": 3.7035057047602446e-08, "logits/chosen": 1.2207223176956177, "logits/rejected": 1.5549678802490234, "logps/chosen": -9.838308334350586, "logps/rejected": -4.913496971130371, "loss": 4959.743, "rewards/accuracies": 0.32499998807907104, "rewards/chosen": 0.00012060524750268087, "rewards/margins": 0.001719303778372705, "rewards/rejected": -0.0015986986691132188, "step": 1330 }, { "debug/policy_chosen_logits": 1.4991430044174194, "debug/policy_chosen_logps": -64.02582550048828, "debug/policy_rejected_logits": 1.8418090343475342, "debug/policy_rejected_logps": -123.17252349853516, "debug/reference_chosen_logps": -64.95429992675781, "debug/reference_rejected_logps": -123.97188568115234, "debug/sppo_chosen_loss": 2441.60107421875, "debug/sppo_chosen_reward_in_loss": 0.9284830093383789, "debug/sppo_rej_reward_in_loss": 0.7993800640106201, "debug/sppo_reject_loss": 2604.604736328125, "epoch": 4.836956521739131, "grad_norm": 122864.65786371518, "learning_rate": 3.6675583846111964e-08, "logits/chosen": 1.4991430044174194, "logits/rejected": 1.8418090343475342, "logps/chosen": -64.02582550048828, "logps/rejected": -123.17252349853516, "loss": 4964.0172, "rewards/accuracies": 0.30000001192092896, "rewards/chosen": 0.00928482972085476, "rewards/margins": 0.0012910297373309731, "rewards/rejected": 0.007993800565600395, "step": 1335 }, { "debug/policy_chosen_logits": 1.318119764328003, "debug/policy_chosen_logps": -6.59066915512085, "debug/policy_rejected_logits": 1.749263048171997, "debug/policy_rejected_logps": -40.836952209472656, "debug/reference_chosen_logps": -6.516819953918457, "debug/reference_rejected_logps": -41.23200225830078, "debug/sppo_chosen_loss": 2507.443359375, "debug/sppo_chosen_reward_in_loss": -0.07384970039129257, "debug/sppo_rej_reward_in_loss": 0.39504918456077576, "debug/sppo_reject_loss": 2545.98095703125, "epoch": 4.855072463768116, "grad_norm": 67075.63174516647, "learning_rate": 3.6316850496395855e-08, "logits/chosen": 1.318119764328003, "logits/rejected": 1.749263048171997, "logps/chosen": -6.59066915512085, "logps/rejected": -40.836952209472656, "loss": 4970.3449, "rewards/accuracies": 0.15000000596046448, "rewards/chosen": -0.0007384970085695386, "rewards/margins": -0.004688988905400038, "rewards/rejected": 0.003950492013245821, "step": 1340 }, { "debug/policy_chosen_logits": 1.5387303829193115, "debug/policy_chosen_logps": -45.710426330566406, "debug/policy_rejected_logits": 1.8789077997207642, "debug/policy_rejected_logps": -163.49185180664062, "debug/reference_chosen_logps": -46.10194778442383, "debug/reference_rejected_logps": -163.77232360839844, "debug/sppo_chosen_loss": 2465.898193359375, "debug/sppo_chosen_reward_in_loss": 0.3915260434150696, "debug/sppo_rej_reward_in_loss": 0.28048887848854065, "debug/sppo_reject_loss": 2530.44580078125, "epoch": 4.8731884057971016, "grad_norm": 141492.75971524185, "learning_rate": 3.595887691748868e-08, "logits/chosen": 1.5387303829193115, "logits/rejected": 1.8789077997207642, "logps/chosen": -45.710426330566406, "logps/rejected": -163.49185180664062, "loss": 4980.1398, "rewards/accuracies": 0.3499999940395355, "rewards/chosen": 0.0039152600802481174, "rewards/margins": 0.0011103710858151317, "rewards/rejected": 0.0028048886451870203, "step": 1345 }, { "debug/policy_chosen_logits": 1.3324600458145142, "debug/policy_chosen_logps": -87.28633117675781, "debug/policy_rejected_logits": 1.6665313243865967, "debug/policy_rejected_logps": -87.04779052734375, "debug/reference_chosen_logps": -87.97621154785156, "debug/reference_rejected_logps": -86.86080932617188, "debug/sppo_chosen_loss": 2448.89892578125, "debug/sppo_chosen_reward_in_loss": 0.689897358417511, "debug/sppo_rej_reward_in_loss": -0.1869787871837616, "debug/sppo_reject_loss": 2481.780029296875, "epoch": 4.891304347826087, "grad_norm": 21304.836540870867, "learning_rate": 3.560168298623788e-08, "logits/chosen": 1.3324600458145142, "logits/rejected": 1.6665313243865967, "logps/chosen": -87.28633117675781, "logps/rejected": -87.04779052734375, "loss": 4983.0137, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": 0.006898973137140274, "rewards/margins": 0.008768761530518532, "rewards/rejected": -0.0018697878113016486, "step": 1350 }, { "debug/policy_chosen_logits": 1.4361904859542847, "debug/policy_chosen_logps": -89.75782012939453, "debug/policy_rejected_logits": 1.7363128662109375, "debug/policy_rejected_logps": -48.72291946411133, "debug/reference_chosen_logps": -91.06925964355469, "debug/reference_rejected_logps": -48.99866485595703, "debug/sppo_chosen_loss": 2434.47265625, "debug/sppo_chosen_reward_in_loss": 1.3114492893218994, "debug/sppo_rej_reward_in_loss": 0.27574411034584045, "debug/sppo_reject_loss": 2529.968017578125, "epoch": 4.909420289855072, "grad_norm": 45422.41816913489, "learning_rate": 3.524528853620023e-08, "logits/chosen": 1.4361904859542847, "logits/rejected": 1.7363128662109375, "logps/chosen": -89.75782012939453, "logps/rejected": -48.72291946411133, "loss": 4958.9566, "rewards/accuracies": 0.32499998807907104, "rewards/chosen": 0.01311449147760868, "rewards/margins": 0.010357052087783813, "rewards/rejected": 0.002757440786808729, "step": 1355 }, { "debug/policy_chosen_logits": 1.239624261856079, "debug/policy_chosen_logps": -6.533757209777832, "debug/policy_rejected_logits": 1.8003345727920532, "debug/policy_rejected_logps": -221.8753662109375, "debug/reference_chosen_logps": -6.552236080169678, "debug/reference_rejected_logps": -222.52499389648438, "debug/sppo_chosen_loss": 2498.17529296875, "debug/sppo_chosen_reward_in_loss": 0.018478691577911377, "debug/sppo_rej_reward_in_loss": 0.6496568918228149, "debug/sppo_reject_loss": 2580.92041015625, "epoch": 4.927536231884058, "grad_norm": 44839.07402057305, "learning_rate": 3.488971335654043e-08, "logits/chosen": 1.239624261856079, "logits/rejected": 1.8003345727920532, "logps/chosen": -6.533757209777832, "logps/rejected": -221.8753662109375, "loss": 4956.4266, "rewards/accuracies": 0.25, "rewards/chosen": 0.00018478692800272256, "rewards/margins": -0.006311782635748386, "rewards/rejected": 0.006496569607406855, "step": 1360 }, { "debug/policy_chosen_logits": 1.4441231489181519, "debug/policy_chosen_logps": -51.7540168762207, "debug/policy_rejected_logits": 1.6184126138687134, "debug/policy_rejected_logps": -10.866933822631836, "debug/reference_chosen_logps": -52.23685836791992, "debug/reference_rejected_logps": -10.776144027709961, "debug/sppo_chosen_loss": 2462.35205078125, "debug/sppo_chosen_reward_in_loss": 0.4828435778617859, "debug/sppo_rej_reward_in_loss": -0.09079112857580185, "debug/sppo_reject_loss": 2491.034423828125, "epoch": 4.945652173913043, "grad_norm": 18507.073770942294, "learning_rate": 3.453497719093242e-08, "logits/chosen": 1.4441231489181519, "logits/rejected": 1.6184126138687134, "logps/chosen": -51.7540168762207, "logps/rejected": -10.866933822631836, "loss": 4962.8289, "rewards/accuracies": 0.32499998807907104, "rewards/chosen": 0.004828435834497213, "rewards/margins": 0.005736346822232008, "rewards/rejected": -0.0009079112787730992, "step": 1365 }, { "debug/policy_chosen_logits": 1.3185319900512695, "debug/policy_chosen_logps": -3.4400722980499268, "debug/policy_rejected_logits": 1.7543909549713135, "debug/policy_rejected_logps": -10.64331340789795, "debug/reference_chosen_logps": -3.437901258468628, "debug/reference_rejected_logps": -10.604125022888184, "debug/sppo_chosen_loss": 2500.22119140625, "debug/sppo_chosen_reward_in_loss": -0.0021708994172513485, "debug/sppo_rej_reward_in_loss": -0.0391872301697731, "debug/sppo_reject_loss": 2496.134765625, "epoch": 4.963768115942029, "grad_norm": 18503.157462592393, "learning_rate": 3.418109973646298e-08, "logits/chosen": 1.3185319900512695, "logits/rejected": 1.7543909549713135, "logps/chosen": -3.4400722980499268, "logps/rejected": -10.64331340789795, "loss": 4956.5137, "rewards/accuracies": 0.2750000059604645, "rewards/chosen": -2.1709001885028556e-05, "rewards/margins": 0.00037016329588368535, "rewards/rejected": -0.00039187230868265033, "step": 1370 }, { "debug/policy_chosen_logits": 1.6731418371200562, "debug/policy_chosen_logps": -92.99998474121094, "debug/policy_rejected_logits": 1.9045652151107788, "debug/policy_rejected_logps": -7.382849216461182, "debug/reference_chosen_logps": -93.96662139892578, "debug/reference_rejected_logps": -7.362084865570068, "debug/sppo_chosen_loss": 2441.77734375, "debug/sppo_chosen_reward_in_loss": 0.9666271209716797, "debug/sppo_rej_reward_in_loss": -0.02076394483447075, "debug/sppo_reject_loss": 2497.94140625, "epoch": 4.981884057971015, "grad_norm": 130910.55400750034, "learning_rate": 3.382810064253809e-08, "logits/chosen": 1.6731418371200562, "logits/rejected": 1.9045652151107788, "logps/chosen": -92.99998474121094, "logps/rejected": -7.382849216461182, "loss": 4987.2437, "rewards/accuracies": 0.32499998807907104, "rewards/chosen": 0.009666272439062595, "rewards/margins": 0.009873910807073116, "rewards/rejected": -0.00020763944485224783, "step": 1375 }, { "debug/policy_chosen_logits": 1.620819091796875, "debug/policy_chosen_logps": -42.269256591796875, "debug/policy_rejected_logits": 1.8143199682235718, "debug/policy_rejected_logps": -43.09006118774414, "debug/reference_chosen_logps": -42.54169464111328, "debug/reference_rejected_logps": -42.27654266357422, "debug/sppo_chosen_loss": 2476.973388671875, "debug/sppo_chosen_reward_in_loss": 0.2724388539791107, "debug/sppo_rej_reward_in_loss": -0.8135198354721069, "debug/sppo_reject_loss": 2441.762451171875, "epoch": 5.0, "grad_norm": 155116.98948182954, "learning_rate": 3.3475999509791925e-08, "logits/chosen": 1.620819091796875, "logits/rejected": 1.8143199682235718, "logps/chosen": -42.269256591796875, "logps/rejected": -43.09006118774414, "loss": 4971.9367, "rewards/accuracies": 0.32499998807907104, "rewards/chosen": 0.0027243883814662695, "rewards/margins": 0.01085958443582058, "rewards/rejected": -0.008135197684168816, "step": 1380 }, { "debug/policy_chosen_logits": 1.260926604270935, "debug/policy_chosen_logps": -42.514923095703125, "debug/policy_rejected_logits": 1.5364911556243896, "debug/policy_rejected_logps": -80.88084411621094, "debug/reference_chosen_logps": -42.970298767089844, "debug/reference_rejected_logps": -80.71112060546875, "debug/sppo_chosen_loss": 2461.684814453125, "debug/sppo_chosen_reward_in_loss": 0.4553709030151367, "debug/sppo_rej_reward_in_loss": -0.1697220504283905, "debug/sppo_reject_loss": 2489.947265625, "epoch": 5.018115942028985, "grad_norm": 102530.42437778349, "learning_rate": 3.3124815888998345e-08, "logits/chosen": 1.260926604270935, "logits/rejected": 1.5364911556243896, "logps/chosen": -42.514923095703125, "logps/rejected": -80.88084411621094, "loss": 4971.3895, "rewards/accuracies": 0.42500001192092896, "rewards/chosen": 0.004553708713501692, "rewards/margins": 0.006250930018723011, "rewards/rejected": -0.0016972202574834228, "step": 1385 }, { "debug/policy_chosen_logits": 1.4940139055252075, "debug/policy_chosen_logps": -117.74589538574219, "debug/policy_rejected_logits": 1.9126107692718506, "debug/policy_rejected_logps": -59.227630615234375, "debug/reference_chosen_logps": -119.01092529296875, "debug/reference_rejected_logps": -59.480979919433594, "debug/sppo_chosen_loss": 2433.262451171875, "debug/sppo_chosen_reward_in_loss": 1.2650257349014282, "debug/sppo_rej_reward_in_loss": 0.2533467710018158, "debug/sppo_reject_loss": 2529.0869140625, "epoch": 5.036231884057971, "grad_norm": 24814.915242020114, "learning_rate": 3.277456927998554e-08, "logits/chosen": 1.4940139055252075, "logits/rejected": 1.9126107692718506, "logps/chosen": -117.74589538574219, "logps/rejected": -59.227630615234375, "loss": 4934.1328, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": 0.012650258839130402, "rewards/margins": 0.010116788558661938, "rewards/rejected": 0.0025334679521620274, "step": 1390 }, { "debug/policy_chosen_logits": 1.6011402606964111, "debug/policy_chosen_logps": -81.42131805419922, "debug/policy_rejected_logits": 2.059464454650879, "debug/policy_rejected_logps": -69.45082092285156, "debug/reference_chosen_logps": -81.94474792480469, "debug/reference_rejected_logps": -69.47213745117188, "debug/sppo_chosen_loss": 2457.69384765625, "debug/sppo_chosen_reward_in_loss": 0.5234284996986389, "debug/sppo_rej_reward_in_loss": 0.021312737837433815, "debug/sppo_reject_loss": 2503.0224609375, "epoch": 5.054347826086956, "grad_norm": 79468.50560047722, "learning_rate": 3.2425279130553076e-08, "logits/chosen": 1.6011402606964111, "logits/rejected": 2.059464454650879, "logps/chosen": -81.42131805419922, "logps/rejected": -69.45082092285156, "loss": 4965.5277, "rewards/accuracies": 0.32499998807907104, "rewards/chosen": 0.0052342843264341354, "rewards/margins": 0.005021157208830118, "rewards/rejected": 0.0002131273940904066, "step": 1395 }, { "debug/policy_chosen_logits": 0.9660686254501343, "debug/policy_chosen_logps": -6.722316741943359, "debug/policy_rejected_logits": 1.4823577404022217, "debug/policy_rejected_logps": -8.81474494934082, "debug/reference_chosen_logps": -6.740177154541016, "debug/reference_rejected_logps": -8.824418067932129, "debug/sppo_chosen_loss": 2498.24853515625, "debug/sppo_chosen_reward_in_loss": 0.01786055602133274, "debug/sppo_rej_reward_in_loss": 0.009673374705016613, "debug/sppo_reject_loss": 2500.98681640625, "epoch": 5.072463768115942, "grad_norm": 38092.331300582664, "learning_rate": 3.2076964835392185e-08, "logits/chosen": 0.9660686254501343, "logits/rejected": 1.4823577404022217, "logps/chosen": -6.722316741943359, "logps/rejected": -8.81474494934082, "loss": 4966.7375, "rewards/accuracies": 0.25, "rewards/chosen": 0.00017860555090010166, "rewards/margins": 8.187181083485484e-05, "rewards/rejected": 9.673374734120443e-05, "step": 1400 }, { "epoch": 5.072463768115942, "eval_debug/policy_chosen_logits": 1.6630836725234985, "eval_debug/policy_chosen_logps": -121.30001068115234, "eval_debug/policy_rejected_logits": 1.7215626239776611, "eval_debug/policy_rejected_logps": -63.81118392944336, "eval_debug/reference_chosen_logps": -123.14806365966797, "eval_debug/reference_rejected_logps": -63.887054443359375, "eval_debug/sppo_chosen_loss": 2383.47265625, "eval_debug/sppo_chosen_reward_in_loss": 1.848038911819458, "eval_debug/sppo_rej_reward_in_loss": 0.07587086409330368, "eval_debug/sppo_reject_loss": 2508.167236328125, "eval_logits/chosen": 1.6630836725234985, "eval_logits/rejected": 1.7215626239776611, "eval_logps/chosen": -121.30001068115234, "eval_logps/rejected": -63.81118392944336, "eval_loss": 4954.5615234375, "eval_rewards/accuracies": 0.32894736528396606, "eval_rewards/chosen": 0.018480388447642326, "eval_rewards/margins": 0.01772168092429638, "eval_rewards/rejected": 0.0007587086874991655, "eval_runtime": 28.5999, "eval_samples_per_second": 20.979, "eval_steps_per_second": 0.664, "step": 1400 }, { "debug/policy_chosen_logits": 1.263564109802246, "debug/policy_chosen_logps": -7.162301540374756, "debug/policy_rejected_logits": 1.3726948499679565, "debug/policy_rejected_logps": -12.884763717651367, "debug/reference_chosen_logps": -7.235052585601807, "debug/reference_rejected_logps": -12.837717056274414, "debug/sppo_chosen_loss": 2492.81787109375, "debug/sppo_chosen_reward_in_loss": 0.07275019586086273, "debug/sppo_rej_reward_in_loss": -0.047046225517988205, "debug/sppo_reject_loss": 2495.34912109375, "epoch": 5.090579710144928, "grad_norm": 87870.29951945957, "learning_rate": 3.1729645735008747e-08, "logits/chosen": 1.263564109802246, "logits/rejected": 1.3726948499679565, "logps/chosen": -7.162301540374756, "logps/rejected": -12.884763717651367, "loss": 4933.7195, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": 0.0007275020470842719, "rewards/margins": 0.0011979641858488321, "rewards/rejected": -0.00047046225517988205, "step": 1405 }, { "debug/policy_chosen_logits": 1.4458630084991455, "debug/policy_chosen_logps": -7.43654727935791, "debug/policy_rejected_logits": 1.725818395614624, "debug/policy_rejected_logps": -48.61075973510742, "debug/reference_chosen_logps": -7.446101188659668, "debug/reference_rejected_logps": -48.94523239135742, "debug/sppo_chosen_loss": 2499.11181640625, "debug/sppo_chosen_reward_in_loss": 0.009554145857691765, "debug/sppo_rej_reward_in_loss": 0.33447298407554626, "debug/sppo_reject_loss": 2543.507568359375, "epoch": 5.108695652173913, "grad_norm": 44868.43771764957, "learning_rate": 3.1383341114649466e-08, "logits/chosen": 1.4458630084991455, "logits/rejected": 1.725818395614624, "logps/chosen": -7.43654727935791, "logps/rejected": -48.61075973510742, "loss": 4976.9219, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": 9.554145071888342e-05, "rewards/margins": -0.0032491893507540226, "rewards/rejected": 0.0033447302412241697, "step": 1410 }, { "debug/policy_chosen_logits": 1.523651361465454, "debug/policy_chosen_logps": -37.519222259521484, "debug/policy_rejected_logits": 1.5594791173934937, "debug/policy_rejected_logps": -7.28360652923584, "debug/reference_chosen_logps": -37.77952194213867, "debug/reference_rejected_logps": -7.246865749359131, "debug/sppo_chosen_loss": 2476.875, "debug/sppo_chosen_reward_in_loss": 0.26029694080352783, "debug/sppo_rej_reward_in_loss": -0.03674084693193436, "debug/sppo_reject_loss": 2496.37060546875, "epoch": 5.1268115942028984, "grad_norm": 15700.324908220333, "learning_rate": 3.103807020323103e-08, "logits/chosen": 1.523651361465454, "logits/rejected": 1.5594791173934937, "logps/chosen": -37.519222259521484, "logps/rejected": -7.28360652923584, "loss": 4972.832, "rewards/accuracies": 0.32499998807907104, "rewards/chosen": 0.0026029690634459257, "rewards/margins": 0.0029703774489462376, "rewards/rejected": -0.00036740844370797276, "step": 1415 }, { "debug/policy_chosen_logits": 1.50445556640625, "debug/policy_chosen_logps": -44.503395080566406, "debug/policy_rejected_logits": 1.775874137878418, "debug/policy_rejected_logps": -3.673715591430664, "debug/reference_chosen_logps": -45.08138656616211, "debug/reference_rejected_logps": -3.631639003753662, "debug/sppo_chosen_loss": 2458.071533203125, "debug/sppo_chosen_reward_in_loss": 0.5779950022697449, "debug/sppo_rej_reward_in_loss": -0.04207686334848404, "debug/sppo_reject_loss": 2495.871826171875, "epoch": 5.144927536231884, "grad_norm": 34151.714340891085, "learning_rate": 3.0693852172272336e-08, "logits/chosen": 1.50445556640625, "logits/rejected": 1.775874137878418, "logps/chosen": -44.503395080566406, "logps/rejected": -3.673715591430664, "loss": 4965.2797, "rewards/accuracies": 0.30000001192092896, "rewards/chosen": 0.005779949016869068, "rewards/margins": 0.006200718227773905, "rewards/rejected": -0.000420768657932058, "step": 1420 }, { "debug/policy_chosen_logits": 1.2192208766937256, "debug/policy_chosen_logps": -9.350908279418945, "debug/policy_rejected_logits": 1.4083795547485352, "debug/policy_rejected_logps": -56.97005081176758, "debug/reference_chosen_logps": -9.44443416595459, "debug/reference_rejected_logps": -57.086700439453125, "debug/sppo_chosen_loss": 2490.8037109375, "debug/sppo_chosen_reward_in_loss": 0.09352628141641617, "debug/sppo_rej_reward_in_loss": 0.11664694547653198, "debug/sppo_reject_loss": 2512.870361328125, "epoch": 5.163043478260869, "grad_norm": 201659.98549099246, "learning_rate": 3.035070613483009e-08, "logits/chosen": 1.2192208766937256, "logits/rejected": 1.4083795547485352, "logps/chosen": -9.350908279418945, "logps/rejected": -56.97005081176758, "loss": 4987.5887, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": 0.0009352628258056939, "rewards/margins": -0.0002312064461875707, "rewards/rejected": 0.0011664694175124168, "step": 1425 }, { "debug/policy_chosen_logits": 1.247273325920105, "debug/policy_chosen_logps": -135.34231567382812, "debug/policy_rejected_logits": 1.6571645736694336, "debug/policy_rejected_logps": -3.9415409564971924, "debug/reference_chosen_logps": -137.0669403076172, "debug/reference_rejected_logps": -3.9132537841796875, "debug/sppo_chosen_loss": 2389.262451171875, "debug/sppo_chosen_reward_in_loss": 1.7246224880218506, "debug/sppo_rej_reward_in_loss": -0.028286689892411232, "debug/sppo_reject_loss": 2497.1953125, "epoch": 5.181159420289855, "grad_norm": 20746.435828936956, "learning_rate": 3.0008651144437394e-08, "logits/chosen": 1.247273325920105, "logits/rejected": 1.6571645736694336, "logps/chosen": -135.34231567382812, "logps/rejected": -3.9415409564971924, "loss": 4958.9523, "rewards/accuracies": 0.25, "rewards/chosen": 0.017246225848793983, "rewards/margins": 0.017529090866446495, "rewards/rejected": -0.00028286693850532174, "step": 1430 }, { "debug/policy_chosen_logits": 0.9180505871772766, "debug/policy_chosen_logps": -87.34517669677734, "debug/policy_rejected_logits": 1.3577024936676025, "debug/policy_rejected_logps": -113.94576263427734, "debug/reference_chosen_logps": -88.36367797851562, "debug/reference_rejected_logps": -114.4028549194336, "debug/sppo_chosen_loss": 2435.109375, "debug/sppo_chosen_reward_in_loss": 1.0185062885284424, "debug/sppo_rej_reward_in_loss": 0.45710498094558716, "debug/sppo_reject_loss": 2555.91845703125, "epoch": 5.199275362318841, "grad_norm": 40359.386901904065, "learning_rate": 2.9667706194045895e-08, "logits/chosen": 0.9180505871772766, "logits/rejected": 1.3577024936676025, "logps/chosen": -87.34517669677734, "logps/rejected": -113.94576263427734, "loss": 4951.1922, "rewards/accuracies": 0.2750000059604645, "rewards/chosen": 0.010185063816606998, "rewards/margins": 0.005614013411104679, "rewards/rejected": 0.004571049474179745, "step": 1435 }, { "debug/policy_chosen_logits": 1.3888263702392578, "debug/policy_chosen_logps": -38.0069694519043, "debug/policy_rejected_logits": 1.4886255264282227, "debug/policy_rejected_logps": -98.35933685302734, "debug/reference_chosen_logps": -38.32720947265625, "debug/reference_rejected_logps": -97.77687072753906, "debug/sppo_chosen_loss": 2470.5048828125, "debug/sppo_chosen_reward_in_loss": 0.3202434182167053, "debug/sppo_rej_reward_in_loss": -0.5824726819992065, "debug/sppo_reject_loss": 2454.067138671875, "epoch": 5.217391304347826, "grad_norm": 102306.17458836819, "learning_rate": 2.932789021497113e-08, "logits/chosen": 1.3888263702392578, "logits/rejected": 1.4886255264282227, "logps/chosen": -38.0069694519043, "logps/rejected": -98.35933685302734, "loss": 4976.9648, "rewards/accuracies": 0.375, "rewards/chosen": 0.0032024341635406017, "rewards/margins": 0.009027160704135895, "rewards/rejected": -0.005824726540595293, "step": 1440 }, { "debug/policy_chosen_logits": 1.1596992015838623, "debug/policy_chosen_logps": -304.5762939453125, "debug/policy_rejected_logits": 1.8658138513565063, "debug/policy_rejected_logps": -76.65251922607422, "debug/reference_chosen_logps": -306.47906494140625, "debug/reference_rejected_logps": -73.67118835449219, "debug/sppo_chosen_loss": 2346.65869140625, "debug/sppo_chosen_reward_in_loss": 1.9027868509292603, "debug/sppo_rej_reward_in_loss": -2.9813292026519775, "debug/sppo_reject_loss": 2521.363525390625, "epoch": 5.2355072463768115, "grad_norm": 174816.26328619532, "learning_rate": 2.898922207584133e-08, "logits/chosen": 1.1596992015838623, "logits/rejected": 1.8658138513565063, "logps/chosen": -304.5762939453125, "logps/rejected": -76.65251922607422, "loss": 4940.8094, "rewards/accuracies": 0.375, "rewards/chosen": 0.019027868285775185, "rewards/margins": 0.04884115606546402, "rewards/rejected": -0.029813289642333984, "step": 1445 }, { "debug/policy_chosen_logits": 1.468611717224121, "debug/policy_chosen_logps": -6.327389240264893, "debug/policy_rejected_logits": 1.804332971572876, "debug/policy_rejected_logps": -31.989837646484375, "debug/reference_chosen_logps": -6.336493492126465, "debug/reference_rejected_logps": -31.913803100585938, "debug/sppo_chosen_loss": 2499.13037109375, "debug/sppo_chosen_reward_in_loss": 0.00910444837063551, "debug/sppo_rej_reward_in_loss": -0.07603711634874344, "debug/sppo_reject_loss": 2492.576416015625, "epoch": 5.253623188405797, "grad_norm": 88243.21066753895, "learning_rate": 2.8651720581549797e-08, "logits/chosen": 1.468611717224121, "logits/rejected": 1.804332971572876, "logps/chosen": -6.327389240264893, "logps/rejected": -31.989837646484375, "loss": 5000.843, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": 9.104450145969167e-05, "rewards/margins": 0.0008514156797900796, "rewards/rejected": -0.0007603711565025151, "step": 1450 }, { "debug/policy_chosen_logits": 1.0762215852737427, "debug/policy_chosen_logps": -5.938260555267334, "debug/policy_rejected_logits": 1.4214346408843994, "debug/policy_rejected_logps": -3.833444118499756, "debug/reference_chosen_logps": -5.939286231994629, "debug/reference_rejected_logps": -3.8238983154296875, "debug/sppo_chosen_loss": 2499.92138671875, "debug/sppo_chosen_reward_in_loss": 0.0010257899994030595, "debug/sppo_rej_reward_in_loss": -0.009545820765197277, "debug/sppo_reject_loss": 2499.067626953125, "epoch": 5.271739130434782, "grad_norm": 34003.001028799015, "learning_rate": 2.8315404472210646e-08, "logits/chosen": 1.0762215852737427, "logits/rejected": 1.4214346408843994, "logps/chosen": -5.938260555267334, "logps/rejected": -3.833444118499756, "loss": 4997.2461, "rewards/accuracies": 0.25, "rewards/chosen": 1.0257912435918115e-05, "rewards/margins": 0.00010571611346676946, "rewards/rejected": -9.545819193590432e-05, "step": 1455 }, { "debug/policy_chosen_logits": 1.1473190784454346, "debug/policy_chosen_logps": -154.47604370117188, "debug/policy_rejected_logits": 1.4396816492080688, "debug/policy_rejected_logps": -46.66912841796875, "debug/reference_chosen_logps": -155.37582397460938, "debug/reference_rejected_logps": -45.943355560302734, "debug/sppo_chosen_loss": 2426.6806640625, "debug/sppo_chosen_reward_in_loss": 0.8997659683227539, "debug/sppo_rej_reward_in_loss": -0.7257744073867798, "debug/sppo_reject_loss": 2435.462158203125, "epoch": 5.2898550724637685, "grad_norm": 25815.935940843035, "learning_rate": 2.798029242211828e-08, "logits/chosen": 1.1473190784454346, "logits/rejected": 1.4396816492080688, "logps/chosen": -154.47604370117188, "logps/rejected": -46.66912841796875, "loss": 4971.1812, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": 0.008997658267617226, "rewards/margins": 0.01625540480017662, "rewards/rejected": -0.007257743738591671, "step": 1460 }, { "debug/policy_chosen_logits": 1.4703161716461182, "debug/policy_chosen_logps": -5.166600227355957, "debug/policy_rejected_logits": 1.7033302783966064, "debug/policy_rejected_logps": -7.808079719543457, "debug/reference_chosen_logps": -5.220461368560791, "debug/reference_rejected_logps": -7.778662204742432, "debug/sppo_chosen_loss": 2494.688232421875, "debug/sppo_chosen_reward_in_loss": 0.053861040621995926, "debug/sppo_rej_reward_in_loss": -0.029417354613542557, "debug/sppo_reject_loss": 2497.150390625, "epoch": 5.307971014492754, "grad_norm": 31133.426328736365, "learning_rate": 2.7646403038710535e-08, "logits/chosen": 1.4703161716461182, "logits/rejected": 1.7033302783966064, "logps/chosen": -5.166600227355957, "logps/rejected": -7.808079719543457, "loss": 4952.575, "rewards/accuracies": 0.2750000059604645, "rewards/chosen": 0.0005386103875935078, "rewards/margins": 0.000832783873192966, "rewards/rejected": -0.00029417354380711913, "step": 1465 }, { "debug/policy_chosen_logits": 1.207201361656189, "debug/policy_chosen_logps": -4.800561428070068, "debug/policy_rejected_logits": 2.0751421451568604, "debug/policy_rejected_logps": -39.52562713623047, "debug/reference_chosen_logps": -4.839789390563965, "debug/reference_rejected_logps": -39.348365783691406, "debug/sppo_chosen_loss": 2496.11572265625, "debug/sppo_chosen_reward_in_loss": 0.03922843933105469, "debug/sppo_rej_reward_in_loss": -0.17726507782936096, "debug/sppo_reject_loss": 2483.154296875, "epoch": 5.326086956521739, "grad_norm": 113822.0058425963, "learning_rate": 2.73137548615354e-08, "logits/chosen": 1.207201361656189, "logits/rejected": 2.0751421451568604, "logps/chosen": -4.800561428070068, "logps/rejected": -39.52562713623047, "loss": 4958.3809, "rewards/accuracies": 0.30000001192092896, "rewards/chosen": 0.0003922843898180872, "rewards/margins": 0.002164935227483511, "rewards/rejected": -0.0017726507503539324, "step": 1470 }, { "debug/policy_chosen_logits": 1.3511595726013184, "debug/policy_chosen_logps": -6.863890647888184, "debug/policy_rejected_logits": 1.6475614309310913, "debug/policy_rejected_logps": -11.602303504943848, "debug/reference_chosen_logps": -6.857815742492676, "debug/reference_rejected_logps": -11.570013999938965, "debug/sppo_chosen_loss": 2500.66064453125, "debug/sppo_chosen_reward_in_loss": -0.0060753049328923225, "debug/sppo_rej_reward_in_loss": -0.03228985145688057, "debug/sppo_reject_loss": 2496.8466796875, "epoch": 5.344202898550725, "grad_norm": 74469.45251769363, "learning_rate": 2.6982366361221608e-08, "logits/chosen": 1.3511595726013184, "logits/rejected": 1.6475614309310913, "logps/chosen": -6.863890647888184, "logps/rejected": -11.602303504943848, "loss": 4967.2383, "rewards/accuracies": 0.32499998807907104, "rewards/chosen": -6.0753012803616e-05, "rewards/margins": 0.00026214547688141465, "rewards/rejected": -0.0003228984714951366, "step": 1475 }, { "debug/policy_chosen_logits": 1.508335828781128, "debug/policy_chosen_logps": -91.82176208496094, "debug/policy_rejected_logits": 1.8259601593017578, "debug/policy_rejected_logps": -7.642992973327637, "debug/reference_chosen_logps": -91.68228912353516, "debug/reference_rejected_logps": -7.644742488861084, "debug/sppo_chosen_loss": 2514.510986328125, "debug/sppo_chosen_reward_in_loss": -0.1394740641117096, "debug/sppo_rej_reward_in_loss": 0.0017492175102233887, "debug/sppo_reject_loss": 2500.24609375, "epoch": 5.36231884057971, "grad_norm": 60475.79714699111, "learning_rate": 2.6652255938453066e-08, "logits/chosen": 1.508335828781128, "logits/rejected": 1.8259601593017578, "logps/chosen": -91.82176208496094, "logps/rejected": -7.642992973327637, "loss": 4990.225, "rewards/accuracies": 0.17499999701976776, "rewards/chosen": -0.0013947406550869346, "rewards/margins": -0.0014122327556833625, "rewards/rejected": 1.74921933648875e-05, "step": 1480 }, { "debug/policy_chosen_logits": 1.23860764503479, "debug/policy_chosen_logps": -1.5549476146697998, "debug/policy_rejected_logits": 1.4271659851074219, "debug/policy_rejected_logps": -6.305294990539551, "debug/reference_chosen_logps": -1.591352939605713, "debug/reference_rejected_logps": -6.144689083099365, "debug/sppo_chosen_loss": 2496.39111328125, "debug/sppo_chosen_reward_in_loss": 0.036405403167009354, "debug/sppo_rej_reward_in_loss": -0.1606057584285736, "debug/sppo_reject_loss": 2484.45751953125, "epoch": 5.380434782608695, "grad_norm": 16096.718760006366, "learning_rate": 2.6323441922947165e-08, "logits/chosen": 1.23860764503479, "logits/rejected": 1.4271659851074219, "logps/chosen": -1.5549476146697998, "logps/rejected": -6.305294990539551, "loss": 4993.3313, "rewards/accuracies": 0.2750000059604645, "rewards/chosen": 0.000364054023521021, "rewards/margins": 0.0019701116252690554, "rewards/rejected": -0.0016060576308518648, "step": 1485 }, { "debug/policy_chosen_logits": 1.4922605752944946, "debug/policy_chosen_logps": -3.7463607788085938, "debug/policy_rejected_logits": 1.7682536840438843, "debug/policy_rejected_logps": -5.208930015563965, "debug/reference_chosen_logps": -3.7803280353546143, "debug/reference_rejected_logps": -5.147512435913086, "debug/sppo_chosen_loss": 2496.62890625, "debug/sppo_chosen_reward_in_loss": 0.033967334777116776, "debug/sppo_rej_reward_in_loss": -0.06141723319888115, "debug/sppo_reject_loss": 2493.93359375, "epoch": 5.398550724637682, "grad_norm": 52884.19999815421, "learning_rate": 2.599594257243689e-08, "logits/chosen": 1.4922605752944946, "logits/rejected": 1.7682536840438843, "logps/chosen": -3.7463607788085938, "logps/rejected": -5.208930015563965, "loss": 4973.1719, "rewards/accuracies": 0.44999998807907104, "rewards/chosen": 0.00033967330818995833, "rewards/margins": 0.0009538456797599792, "rewards/rejected": -0.00061417231336236, "step": 1490 }, { "debug/policy_chosen_logits": 1.44706130027771, "debug/policy_chosen_logps": -98.2606201171875, "debug/policy_rejected_logits": 1.587388277053833, "debug/policy_rejected_logps": -102.90731048583984, "debug/reference_chosen_logps": -98.66349792480469, "debug/reference_rejected_logps": -102.67134094238281, "debug/sppo_chosen_loss": 2466.350830078125, "debug/sppo_chosen_reward_in_loss": 0.402879536151886, "debug/sppo_rej_reward_in_loss": -0.23598095774650574, "debug/sppo_reject_loss": 2478.0322265625, "epoch": 5.416666666666667, "grad_norm": 40384.904653695776, "learning_rate": 2.566977607165719e-08, "logits/chosen": 1.44706130027771, "logits/rejected": 1.587388277053833, "logps/chosen": -98.2606201171875, "logps/rejected": -102.90731048583984, "loss": 4966.3234, "rewards/accuracies": 0.42500001192092896, "rewards/chosen": 0.004028795287013054, "rewards/margins": 0.006388605572283268, "rewards/rejected": -0.0023598098196089268, "step": 1495 }, { "debug/policy_chosen_logits": 1.2098346948623657, "debug/policy_chosen_logps": -66.90326690673828, "debug/policy_rejected_logits": 1.4877550601959229, "debug/policy_rejected_logps": -3.8573672771453857, "debug/reference_chosen_logps": -67.42205810546875, "debug/reference_rejected_logps": -3.8296828269958496, "debug/sppo_chosen_loss": 2455.467041015625, "debug/sppo_chosen_reward_in_loss": 0.5187984704971313, "debug/sppo_rej_reward_in_loss": -0.027684330940246582, "debug/sppo_reject_loss": 2497.25390625, "epoch": 5.434782608695652, "grad_norm": 113722.38086063277, "learning_rate": 2.5344960531335102e-08, "logits/chosen": 1.2098346948623657, "logits/rejected": 1.4877550601959229, "logps/chosen": -66.90326690673828, "logps/rejected": -3.8573672771453857, "loss": 4937.6176, "rewards/accuracies": 0.30000001192092896, "rewards/chosen": 0.005187984090298414, "rewards/margins": 0.005464828107506037, "rewards/rejected": -0.00027684326050803065, "step": 1500 }, { "epoch": 5.434782608695652, "eval_debug/policy_chosen_logits": 1.6507964134216309, "eval_debug/policy_chosen_logps": -121.57608032226562, "eval_debug/policy_rejected_logits": 1.7099039554595947, "eval_debug/policy_rejected_logps": -64.07382202148438, "eval_debug/reference_chosen_logps": -123.14806365966797, "eval_debug/reference_rejected_logps": -63.887054443359375, "eval_debug/sppo_chosen_loss": 2396.666748046875, "eval_debug/sppo_chosen_reward_in_loss": 1.571982502937317, "eval_debug/sppo_rej_reward_in_loss": -0.1867600828409195, "eval_debug/sppo_reject_loss": 2483.373779296875, "eval_logits/chosen": 1.6507964134216309, "eval_logits/rejected": 1.7099039554595947, "eval_logps/chosen": -121.57608032226562, "eval_logps/rejected": -64.07382202148438, "eval_loss": 4952.794921875, "eval_rewards/accuracies": 0.32894736528396606, "eval_rewards/chosen": 0.0157198254019022, "eval_rewards/margins": 0.01758742704987526, "eval_rewards/rejected": -0.001867600716650486, "eval_runtime": 28.7118, "eval_samples_per_second": 20.897, "eval_steps_per_second": 0.662, "step": 1500 }, { "debug/policy_chosen_logits": 1.3756154775619507, "debug/policy_chosen_logps": -91.82910919189453, "debug/policy_rejected_logits": 1.6417385339736938, "debug/policy_rejected_logps": -9.595918655395508, "debug/reference_chosen_logps": -92.66380310058594, "debug/reference_rejected_logps": -9.599748611450195, "debug/sppo_chosen_loss": 2444.77587890625, "debug/sppo_chosen_reward_in_loss": 0.8346956968307495, "debug/sppo_rej_reward_in_loss": 0.003830662462860346, "debug/sppo_reject_loss": 2500.419189453125, "epoch": 5.452898550724638, "grad_norm": 28811.329860734953, "learning_rate": 2.5021513987184274e-08, "logits/chosen": 1.3756154775619507, "logits/rejected": 1.6417385339736938, "logps/chosen": -91.82910919189453, "logps/rejected": -9.595918655395508, "loss": 4964.0762, "rewards/accuracies": 0.30000001192092896, "rewards/chosen": 0.00834695715457201, "rewards/margins": 0.0083086509257555, "rewards/rejected": 3.830664718407206e-05, "step": 1505 }, { "debug/policy_chosen_logits": 1.3515055179595947, "debug/policy_chosen_logps": -117.73277282714844, "debug/policy_rejected_logits": 1.781686544418335, "debug/policy_rejected_logps": -3.768369674682617, "debug/reference_chosen_logps": -119.2478256225586, "debug/reference_rejected_logps": -3.7696566581726074, "debug/sppo_chosen_loss": 2400.694580078125, "debug/sppo_chosen_reward_in_loss": 1.515067219734192, "debug/sppo_rej_reward_in_loss": 0.0012873649829998612, "debug/sppo_reject_loss": 2500.185546875, "epoch": 5.471014492753623, "grad_norm": 60044.81482657004, "learning_rate": 2.469945439890339e-08, "logits/chosen": 1.3515055179595947, "logits/rejected": 1.781686544418335, "logps/chosen": -117.73277282714844, "logps/rejected": -3.768369674682617, "loss": 4970.3941, "rewards/accuracies": 0.30000001192092896, "rewards/chosen": 0.015150671824812889, "rewards/margins": 0.015137799084186554, "rewards/rejected": 1.2873625564679969e-05, "step": 1510 }, { "debug/policy_chosen_logits": 1.2596395015716553, "debug/policy_chosen_logps": -135.70750427246094, "debug/policy_rejected_logits": 1.7082059383392334, "debug/policy_rejected_logps": -3.1411919593811035, "debug/reference_chosen_logps": -137.1483917236328, "debug/reference_rejected_logps": -3.143246650695801, "debug/sppo_chosen_loss": 2399.11572265625, "debug/sppo_chosen_reward_in_loss": 1.4408762454986572, "debug/sppo_rej_reward_in_loss": 0.0020549341570585966, "debug/sppo_reject_loss": 2500.251953125, "epoch": 5.489130434782608, "grad_norm": 19892.24602111211, "learning_rate": 2.4378799649179023e-08, "logits/chosen": 1.2596395015716553, "logits/rejected": 1.7082059383392334, "logps/chosen": -135.70750427246094, "logps/rejected": -3.1411919593811035, "loss": 4974.0191, "rewards/accuracies": 0.3499999940395355, "rewards/chosen": 0.014408761635422707, "rewards/margins": 0.01438821293413639, "rewards/rejected": 2.0549334294628352e-05, "step": 1515 }, { "debug/policy_chosen_logits": 1.7696821689605713, "debug/policy_chosen_logps": -39.84080123901367, "debug/policy_rejected_logits": 2.0549137592315674, "debug/policy_rejected_logps": -37.69769287109375, "debug/reference_chosen_logps": -40.2617301940918, "debug/reference_rejected_logps": -38.05076599121094, "debug/sppo_chosen_loss": 2462.45947265625, "debug/sppo_chosen_reward_in_loss": 0.42092904448509216, "debug/sppo_rej_reward_in_loss": 0.3530712127685547, "debug/sppo_reject_loss": 2540.897705078125, "epoch": 5.507246376811594, "grad_norm": 10927.809107387751, "learning_rate": 2.4059567542692682e-08, "logits/chosen": 1.7696821689605713, "logits/rejected": 2.0549137592315674, "logps/chosen": -39.84080123901367, "logps/rejected": -37.69769287109375, "loss": 4968.6633, "rewards/accuracies": 0.30000001192092896, "rewards/chosen": 0.004209290258586407, "rewards/margins": 0.0006785778095945716, "rewards/rejected": 0.0035307123325765133, "step": 1520 }, { "debug/policy_chosen_logits": 1.4420033693313599, "debug/policy_chosen_logps": -130.97665405273438, "debug/policy_rejected_logits": 1.7147963047027588, "debug/policy_rejected_logps": -5.494109153747559, "debug/reference_chosen_logps": -132.44015502929688, "debug/reference_rejected_logps": -5.4024763107299805, "debug/sppo_chosen_loss": 2403.02783203125, "debug/sppo_chosen_reward_in_loss": 1.4635010957717896, "debug/sppo_rej_reward_in_loss": -0.09163336455821991, "debug/sppo_reject_loss": 2490.93603515625, "epoch": 5.52536231884058, "grad_norm": 42754.147933008375, "learning_rate": 2.3741775805132096e-08, "logits/chosen": 1.4420033693313599, "logits/rejected": 1.7147963047027588, "logps/chosen": -130.97665405273438, "logps/rejected": -5.494109153747559, "loss": 4937.6957, "rewards/accuracies": 0.375, "rewards/chosen": 0.014635011553764343, "rewards/margins": 0.015551343560218811, "rewards/rejected": -0.0009163336944766343, "step": 1525 }, { "debug/policy_chosen_logits": 0.9784383773803711, "debug/policy_chosen_logps": -3.5851662158966064, "debug/policy_rejected_logits": 1.0781360864639282, "debug/policy_rejected_logps": -3.1338181495666504, "debug/reference_chosen_logps": -3.6078414916992188, "debug/reference_rejected_logps": -3.0336270332336426, "debug/sppo_chosen_loss": 2497.748046875, "debug/sppo_chosen_reward_in_loss": 0.02267526462674141, "debug/sppo_rej_reward_in_loss": -0.1001911386847496, "debug/sppo_reject_loss": 2490.076904296875, "epoch": 5.543478260869565, "grad_norm": 94364.70960662822, "learning_rate": 2.342544208220712e-08, "logits/chosen": 0.9784383773803711, "logits/rejected": 1.0781360864639282, "logps/chosen": -3.5851662158966064, "logps/rejected": -3.1338181495666504, "loss": 4948.1461, "rewards/accuracies": 0.22499999403953552, "rewards/chosen": 0.00022675264335703105, "rewards/margins": 0.0012286640703678131, "rewards/rejected": -0.0010019114706665277, "step": 1530 }, { "debug/policy_chosen_logits": 1.2283227443695068, "debug/policy_chosen_logps": -5.095221519470215, "debug/policy_rejected_logits": 1.604286551475525, "debug/policy_rejected_logps": -5.098898887634277, "debug/reference_chosen_logps": -5.104105472564697, "debug/reference_rejected_logps": -5.033654689788818, "debug/sppo_chosen_loss": 2499.12890625, "debug/sppo_chosen_reward_in_loss": 0.008884263224899769, "debug/sppo_rej_reward_in_loss": -0.06524410098791122, "debug/sppo_reject_loss": 2493.52880859375, "epoch": 5.561594202898551, "grad_norm": 31375.294164562067, "learning_rate": 2.311058393866981e-08, "logits/chosen": 1.2283227443695068, "logits/rejected": 1.604286551475525, "logps/chosen": -5.095221519470215, "logps/rejected": -5.098898887634277, "loss": 4971.3152, "rewards/accuracies": 0.2750000059604645, "rewards/chosen": 8.884264389052987e-05, "rewards/margins": 0.0007412837003357708, "rewards/rejected": -0.0006524409982375801, "step": 1535 }, { "debug/policy_chosen_logits": 1.2428638935089111, "debug/policy_chosen_logps": -3.204160690307617, "debug/policy_rejected_logits": 1.8065483570098877, "debug/policy_rejected_logps": -39.54701614379883, "debug/reference_chosen_logps": -3.233112335205078, "debug/reference_rejected_logps": -39.82624816894531, "debug/sppo_chosen_loss": 2497.147705078125, "debug/sppo_chosen_reward_in_loss": 0.028951648622751236, "debug/sppo_rej_reward_in_loss": 0.27923327684402466, "debug/sppo_reject_loss": 2531.39453125, "epoch": 5.579710144927536, "grad_norm": 113449.50424985864, "learning_rate": 2.2797218857339163e-08, "logits/chosen": 1.2428638935089111, "logits/rejected": 1.8065483570098877, "logps/chosen": -3.204160690307617, "logps/rejected": -39.54701614379883, "loss": 4978.8164, "rewards/accuracies": 0.22499999403953552, "rewards/chosen": 0.000289516436168924, "rewards/margins": -0.0025028162635862827, "rewards/rejected": 0.002792332787066698, "step": 1540 }, { "debug/policy_chosen_logits": 1.2723385095596313, "debug/policy_chosen_logps": -5.52636194229126, "debug/policy_rejected_logits": 1.5404714345932007, "debug/policy_rejected_logps": -81.84542083740234, "debug/reference_chosen_logps": -5.5925612449646, "debug/reference_rejected_logps": -82.02436828613281, "debug/sppo_chosen_loss": 2493.4482421875, "debug/sppo_chosen_reward_in_loss": 0.06619935482740402, "debug/sppo_rej_reward_in_loss": 0.178949773311615, "debug/sppo_reject_loss": 2519.896240234375, "epoch": 5.5978260869565215, "grad_norm": 48324.04633195426, "learning_rate": 2.2485364238130432e-08, "logits/chosen": 1.2723385095596313, "logits/rejected": 1.5404714345932007, "logps/chosen": -5.52636194229126, "logps/rejected": -81.84542083740234, "loss": 4950.6531, "rewards/accuracies": 0.2750000059604645, "rewards/chosen": 0.000661993573885411, "rewards/margins": -0.0011275041615590453, "rewards/rejected": 0.0017894977936521173, "step": 1545 }, { "debug/policy_chosen_logits": 1.4062082767486572, "debug/policy_chosen_logps": -44.904937744140625, "debug/policy_rejected_logits": 2.0090537071228027, "debug/policy_rejected_logps": -57.84265899658203, "debug/reference_chosen_logps": -44.77842712402344, "debug/reference_rejected_logps": -57.672447204589844, "debug/sppo_chosen_loss": 2513.40478515625, "debug/sppo_chosen_reward_in_loss": -0.1265116035938263, "debug/sppo_rej_reward_in_loss": -0.17020754516124725, "debug/sppo_reject_loss": 2484.35400390625, "epoch": 5.615942028985507, "grad_norm": 86547.78902474005, "learning_rate": 2.2175037397088887e-08, "logits/chosen": 1.4062082767486572, "logits/rejected": 2.0090537071228027, "logps/chosen": -44.904937744140625, "logps/rejected": -57.84265899658203, "loss": 4973.593, "rewards/accuracies": 0.25, "rewards/chosen": -0.0012651159195229411, "rewards/margins": 0.0004369594971649349, "rewards/rejected": -0.001702075474895537, "step": 1550 }, { "debug/policy_chosen_logits": 1.3488719463348389, "debug/policy_chosen_logps": -103.87060546875, "debug/policy_rejected_logits": 1.7295091152191162, "debug/policy_rejected_logps": -5.074273109436035, "debug/reference_chosen_logps": -104.84139251708984, "debug/reference_rejected_logps": -4.836784362792969, "debug/sppo_chosen_loss": 2434.17333984375, "debug/sppo_chosen_reward_in_loss": 0.9708011746406555, "debug/sppo_rej_reward_in_loss": -0.23748907446861267, "debug/sppo_reject_loss": 2477.032958984375, "epoch": 5.634057971014493, "grad_norm": 16952.48691619026, "learning_rate": 2.1866255565428348e-08, "logits/chosen": 1.3488719463348389, "logits/rejected": 1.7295091152191162, "logps/chosen": -103.87060546875, "logps/rejected": -5.074273109436035, "loss": 4969.0188, "rewards/accuracies": 0.42500001192092896, "rewards/chosen": 0.009708011522889137, "rewards/margins": 0.0120829027146101, "rewards/rejected": -0.002374890726059675, "step": 1555 }, { "debug/policy_chosen_logits": 1.477739930152893, "debug/policy_chosen_logps": -143.45654296875, "debug/policy_rejected_logits": 1.8630015850067139, "debug/policy_rejected_logps": -6.69199275970459, "debug/reference_chosen_logps": -144.6988067626953, "debug/reference_rejected_logps": -6.722146034240723, "debug/sppo_chosen_loss": 2407.891845703125, "debug/sppo_chosen_reward_in_loss": 1.242272138595581, "debug/sppo_rej_reward_in_loss": 0.030152231454849243, "debug/sppo_reject_loss": 2503.066162109375, "epoch": 5.6521739130434785, "grad_norm": 122067.55604950039, "learning_rate": 2.1559035888574427e-08, "logits/chosen": 1.477739930152893, "logits/rejected": 1.8630015850067139, "logps/chosen": -143.45654296875, "logps/rejected": -6.69199275970459, "loss": 4943.3527, "rewards/accuracies": 0.3499999940395355, "rewards/chosen": 0.012422721832990646, "rewards/margins": 0.012121197767555714, "rewards/rejected": 0.0003015223192051053, "step": 1560 }, { "debug/policy_chosen_logits": 0.9819734692573547, "debug/policy_chosen_logps": -142.57333374023438, "debug/policy_rejected_logits": 1.2570345401763916, "debug/policy_rejected_logps": -4.001745223999023, "debug/reference_chosen_logps": -143.37928771972656, "debug/reference_rejected_logps": -3.9735684394836426, "debug/sppo_chosen_loss": 2434.35595703125, "debug/sppo_chosen_reward_in_loss": 0.8059395551681519, "debug/sppo_rej_reward_in_loss": -0.028176825493574142, "debug/sppo_reject_loss": 2497.1953125, "epoch": 5.670289855072464, "grad_norm": 35113.08272461505, "learning_rate": 2.125339542521254e-08, "logits/chosen": 0.9819734692573547, "logits/rejected": 1.2570345401763916, "logps/chosen": -142.57333374023438, "logps/rejected": -4.001745223999023, "loss": 4968.5063, "rewards/accuracies": 0.25, "rewards/chosen": 0.008059395477175713, "rewards/margins": 0.008341163396835327, "rewards/rejected": -0.0002817682398017496, "step": 1565 }, { "debug/policy_chosen_logits": 1.5321605205535889, "debug/policy_chosen_logps": -36.463138580322266, "debug/policy_rejected_logits": 1.8616657257080078, "debug/policy_rejected_logps": -10.840460777282715, "debug/reference_chosen_logps": -36.733680725097656, "debug/reference_rejected_logps": -10.768438339233398, "debug/sppo_chosen_loss": 2475.48974609375, "debug/sppo_chosen_reward_in_loss": 0.27053922414779663, "debug/sppo_rej_reward_in_loss": -0.07202298939228058, "debug/sppo_reject_loss": 2492.84619140625, "epoch": 5.688405797101449, "grad_norm": 34054.68447112709, "learning_rate": 2.0949351146340583e-08, "logits/chosen": 1.5321605205535889, "logits/rejected": 1.8616657257080078, "logps/chosen": -36.463138580322266, "logps/rejected": -10.840460777282715, "loss": 4953.1039, "rewards/accuracies": 0.4749999940395355, "rewards/chosen": 0.002705392660573125, "rewards/margins": 0.0034256228245794773, "rewards/rejected": -0.0007202298147603869, "step": 1570 }, { "debug/policy_chosen_logits": 1.6852152347564697, "debug/policy_chosen_logps": -127.41075134277344, "debug/policy_rejected_logits": 1.835598349571228, "debug/policy_rejected_logps": -12.836685180664062, "debug/reference_chosen_logps": -128.343994140625, "debug/reference_rejected_logps": -12.645792007446289, "debug/sppo_chosen_loss": 2423.934326171875, "debug/sppo_chosen_reward_in_loss": 0.9332368969917297, "debug/sppo_rej_reward_in_loss": -0.1908944994211197, "debug/sppo_reject_loss": 2481.622314453125, "epoch": 5.706521739130435, "grad_norm": 66311.10933740414, "learning_rate": 2.064691993432678e-08, "logits/chosen": 1.6852152347564697, "logits/rejected": 1.835598349571228, "logps/chosen": -127.41075134277344, "logps/rejected": -12.836685180664062, "loss": 4960.1531, "rewards/accuracies": 0.32499998807907104, "rewards/chosen": 0.009332368150353432, "rewards/margins": 0.011241314001381397, "rewards/rejected": -0.001908944919705391, "step": 1575 }, { "debug/policy_chosen_logits": 1.5353212356567383, "debug/policy_chosen_logps": -29.207284927368164, "debug/policy_rejected_logits": 1.856058120727539, "debug/policy_rejected_logps": -112.31217956542969, "debug/reference_chosen_logps": -29.465951919555664, "debug/reference_rejected_logps": -111.66182708740234, "debug/sppo_chosen_loss": 2477.1689453125, "debug/sppo_chosen_reward_in_loss": 0.2586670517921448, "debug/sppo_rej_reward_in_loss": -0.6503463983535767, "debug/sppo_reject_loss": 2446.81396484375, "epoch": 5.72463768115942, "grad_norm": 68310.56043010576, "learning_rate": 2.0346118581972095e-08, "logits/chosen": 1.5353212356567383, "logits/rejected": 1.856058120727539, "logps/chosen": -29.207284927368164, "logps/rejected": -112.31217956542969, "loss": 4946.3703, "rewards/accuracies": 0.25, "rewards/chosen": 0.0025866704527288675, "rewards/margins": 0.00909013394266367, "rewards/rejected": -0.006503463722765446, "step": 1580 }, { "debug/policy_chosen_logits": 1.1937682628631592, "debug/policy_chosen_logps": -8.574786186218262, "debug/policy_rejected_logits": 1.6493606567382812, "debug/policy_rejected_logps": -5.910714626312256, "debug/reference_chosen_logps": -8.66431999206543, "debug/reference_rejected_logps": -5.612049102783203, "debug/sppo_chosen_loss": 2491.16064453125, "debug/sppo_chosen_reward_in_loss": 0.08953367918729782, "debug/sppo_rej_reward_in_loss": -0.2986653447151184, "debug/sppo_reject_loss": 2472.857666015625, "epoch": 5.742753623188406, "grad_norm": 48942.931194586854, "learning_rate": 2.0046963791577898e-08, "logits/chosen": 1.1937682628631592, "logits/rejected": 1.6493606567382812, "logps/chosen": -8.574786186218262, "logps/rejected": -5.910714626312256, "loss": 4980.3055, "rewards/accuracies": 0.3499999940395355, "rewards/chosen": 0.0008953368524089456, "rewards/margins": 0.0038819897454231977, "rewards/rejected": -0.0029866532422602177, "step": 1585 }, { "debug/policy_chosen_logits": 1.1924240589141846, "debug/policy_chosen_logps": -4.262330055236816, "debug/policy_rejected_logits": 1.827478051185608, "debug/policy_rejected_logps": -4.214280128479004, "debug/reference_chosen_logps": -4.240781784057617, "debug/reference_rejected_logps": -4.2187275886535645, "debug/sppo_chosen_loss": 2502.18212890625, "debug/sppo_chosen_reward_in_loss": -0.021547485142946243, "debug/sppo_rej_reward_in_loss": 0.004447087645530701, "debug/sppo_reject_loss": 2500.471923828125, "epoch": 5.760869565217392, "grad_norm": 90776.20357763865, "learning_rate": 1.9749472174018567e-08, "logits/chosen": 1.1924240589141846, "logits/rejected": 1.827478051185608, "logps/chosen": -4.262330055236816, "logps/rejected": -4.214280128479004, "loss": 4967.0945, "rewards/accuracies": 0.17499999701976776, "rewards/chosen": -0.0002154748362954706, "rewards/margins": -0.00025994572206400335, "rewards/rejected": 4.447086394065991e-05, "step": 1590 }, { "debug/policy_chosen_logits": 1.219681978225708, "debug/policy_chosen_logps": -5.209751605987549, "debug/policy_rejected_logits": 1.4254045486450195, "debug/policy_rejected_logps": -39.709861755371094, "debug/reference_chosen_logps": -5.168766021728516, "debug/reference_rejected_logps": -39.74993896484375, "debug/sppo_chosen_loss": 2504.1240234375, "debug/sppo_chosen_reward_in_loss": -0.04098554700613022, "debug/sppo_rej_reward_in_loss": 0.04007842391729355, "debug/sppo_reject_loss": 2504.6416015625, "epoch": 5.778985507246377, "grad_norm": 62582.41924389442, "learning_rate": 1.9453660247819054e-08, "logits/chosen": 1.219681978225708, "logits/rejected": 1.4254045486450195, "logps/chosen": -5.209751605987549, "logps/rejected": -39.709861755371094, "loss": 4977.907, "rewards/accuracies": 0.3499999940395355, "rewards/chosen": -0.00040985550731420517, "rewards/margins": -0.0008106398163363338, "rewards/rejected": 0.00040078425081446767, "step": 1595 }, { "debug/policy_chosen_logits": 1.196634292602539, "debug/policy_chosen_logps": -114.95948791503906, "debug/policy_rejected_logits": 1.610282301902771, "debug/policy_rejected_logps": -38.30967330932617, "debug/reference_chosen_logps": -116.2649154663086, "debug/reference_rejected_logps": -38.35938262939453, "debug/sppo_chosen_loss": 2439.80126953125, "debug/sppo_chosen_reward_in_loss": 1.3054416179656982, "debug/sppo_rej_reward_in_loss": 0.04970797151327133, "debug/sppo_reject_loss": 2506.794921875, "epoch": 5.797101449275362, "grad_norm": 49066.01515019851, "learning_rate": 1.9159544438237795e-08, "logits/chosen": 1.196634292602539, "logits/rejected": 1.610282301902771, "logps/chosen": -114.95948791503906, "logps/rejected": -38.30967330932617, "loss": 4969.5398, "rewards/accuracies": 0.32499998807907104, "rewards/chosen": 0.013054417446255684, "rewards/margins": 0.012557337991893291, "rewards/rejected": 0.0004970798036083579, "step": 1600 }, { "epoch": 5.797101449275362, "eval_debug/policy_chosen_logits": 1.6601438522338867, "eval_debug/policy_chosen_logps": -121.30487823486328, "eval_debug/policy_rejected_logits": 1.7189868688583374, "eval_debug/policy_rejected_logps": -63.89989471435547, "eval_debug/reference_chosen_logps": -123.14806365966797, "eval_debug/reference_rejected_logps": -63.887054443359375, "eval_debug/sppo_chosen_loss": 2383.505615234375, "eval_debug/sppo_chosen_reward_in_loss": 1.8431689739227295, "eval_debug/sppo_rej_reward_in_loss": -0.012839298695325851, "eval_debug/sppo_reject_loss": 2498.8603515625, "eval_logits/chosen": 1.6601438522338867, "eval_logits/rejected": 1.7189868688583374, "eval_logps/chosen": -121.30487823486328, "eval_logps/rejected": -63.89989471435547, "eval_loss": 4948.79248046875, "eval_rewards/accuracies": 0.32894736528396606, "eval_rewards/chosen": 0.018431689590215683, "eval_rewards/margins": 0.018560083582997322, "eval_rewards/rejected": -0.00012839300325140357, "eval_runtime": 28.6862, "eval_samples_per_second": 20.916, "eval_steps_per_second": 0.662, "step": 1600 }, { "debug/policy_chosen_logits": 1.646305799484253, "debug/policy_chosen_logps": -7.4169464111328125, "debug/policy_rejected_logits": 1.6483768224716187, "debug/policy_rejected_logps": -3.287621021270752, "debug/reference_chosen_logps": -7.392036437988281, "debug/reference_rejected_logps": -3.246826648712158, "debug/sppo_chosen_loss": 2502.554931640625, "debug/sppo_chosen_reward_in_loss": -0.024910276755690575, "debug/sppo_rej_reward_in_loss": -0.0407944992184639, "debug/sppo_reject_loss": 2495.96533203125, "epoch": 5.815217391304348, "grad_norm": 39638.46836244049, "learning_rate": 1.8867141076354575e-08, "logits/chosen": 1.646305799484253, "logits/rejected": 1.6483768224716187, "logps/chosen": -7.4169464111328125, "logps/rejected": -3.287621021270752, "loss": 4960.8277, "rewards/accuracies": 0.20000000298023224, "rewards/chosen": -0.0002491027698852122, "rewards/margins": 0.0001588421582709998, "rewards/rejected": -0.00040794495726004243, "step": 1605 }, { "debug/policy_chosen_logits": 1.5078829526901245, "debug/policy_chosen_logps": -6.721767425537109, "debug/policy_rejected_logits": 1.6727834939956665, "debug/policy_rejected_logps": -12.31233024597168, "debug/reference_chosen_logps": -6.868844509124756, "debug/reference_rejected_logps": -12.332254409790039, "debug/sppo_chosen_loss": 2485.453125, "debug/sppo_chosen_reward_in_loss": 0.14707691967487335, "debug/sppo_rej_reward_in_loss": 0.019924622029066086, "debug/sppo_reject_loss": 2502.11572265625, "epoch": 5.833333333333333, "grad_norm": 80964.07965783261, "learning_rate": 1.8576466398163825e-08, "logits/chosen": 1.5078829526901245, "logits/rejected": 1.6727834939956665, "logps/chosen": -6.721767425537109, "logps/rejected": -12.31233024597168, "loss": 4969.4906, "rewards/accuracies": 0.3499999940395355, "rewards/chosen": 0.0014707691734656692, "rewards/margins": 0.0012715229531750083, "rewards/rejected": 0.00019924622029066086, "step": 1610 }, { "debug/policy_chosen_logits": 1.3070571422576904, "debug/policy_chosen_logps": -180.6649169921875, "debug/policy_rejected_logits": 1.4238245487213135, "debug/policy_rejected_logps": -6.752913475036621, "debug/reference_chosen_logps": -182.20623779296875, "debug/reference_rejected_logps": -6.71105432510376, "debug/sppo_chosen_loss": 2404.08935546875, "debug/sppo_chosen_reward_in_loss": 1.5413240194320679, "debug/sppo_rej_reward_in_loss": -0.04185943678021431, "debug/sppo_reject_loss": 2495.949462890625, "epoch": 5.851449275362318, "grad_norm": 42745.54004827307, "learning_rate": 1.828753654367301e-08, "logits/chosen": 1.3070571422576904, "logits/rejected": 1.4238245487213135, "logps/chosen": -180.6649169921875, "logps/rejected": -6.752913475036621, "loss": 4952.7008, "rewards/accuracies": 0.25, "rewards/chosen": 0.015413239598274231, "rewards/margins": 0.015831835567951202, "rewards/rejected": -0.00041859433986246586, "step": 1615 }, { "debug/policy_chosen_logits": 1.2102620601654053, "debug/policy_chosen_logps": -80.89813995361328, "debug/policy_rejected_logits": 1.483999490737915, "debug/policy_rejected_logps": -7.6511359214782715, "debug/reference_chosen_logps": -82.38662719726562, "debug/reference_rejected_logps": -7.408883571624756, "debug/sppo_chosen_loss": 2424.609619140625, "debug/sppo_chosen_reward_in_loss": 1.4884874820709229, "debug/sppo_rej_reward_in_loss": -0.24225196242332458, "debug/sppo_reject_loss": 2476.855224609375, "epoch": 5.869565217391305, "grad_norm": 75330.57654822464, "learning_rate": 1.800036755600649e-08, "logits/chosen": 1.2102620601654053, "logits/rejected": 1.483999490737915, "logps/chosen": -80.89813995361328, "logps/rejected": -7.6511359214782715, "loss": 4970.0746, "rewards/accuracies": 0.3499999940395355, "rewards/chosen": 0.01488487422466278, "rewards/margins": 0.01730739325284958, "rewards/rejected": -0.0024225199595093727, "step": 1620 }, { "debug/policy_chosen_logits": 1.5239629745483398, "debug/policy_chosen_logps": -3.7502646446228027, "debug/policy_rejected_logits": 1.8067678213119507, "debug/policy_rejected_logps": -3.719534397125244, "debug/reference_chosen_logps": -3.66951060295105, "debug/reference_rejected_logps": -3.7241806983947754, "debug/sppo_chosen_loss": 2508.1845703125, "debug/sppo_chosen_reward_in_loss": -0.08075402677059174, "debug/sppo_rej_reward_in_loss": 0.00464663514867425, "debug/sppo_reject_loss": 2500.53271484375, "epoch": 5.88768115942029, "grad_norm": 16391.062531334497, "learning_rate": 1.7714975380514747e-08, "logits/chosen": 1.5239629745483398, "logits/rejected": 1.8067678213119507, "logps/chosen": -3.7502646446228027, "logps/rejected": -3.719534397125244, "loss": 4989.9801, "rewards/accuracies": 0.125, "rewards/chosen": -0.0008075403166003525, "rewards/margins": -0.0008540066774003208, "rewards/rejected": 4.646631350624375e-05, "step": 1625 }, { "debug/policy_chosen_logits": 1.2187312841415405, "debug/policy_chosen_logps": -5.0353240966796875, "debug/policy_rejected_logits": 1.888061761856079, "debug/policy_rejected_logps": -7.782090187072754, "debug/reference_chosen_logps": -5.09005069732666, "debug/reference_rejected_logps": -7.698686122894287, "debug/sppo_chosen_loss": 2494.61279296875, "debug/sppo_chosen_reward_in_loss": 0.05472656339406967, "debug/sppo_rej_reward_in_loss": -0.08340326696634293, "debug/sppo_reject_loss": 2491.93603515625, "epoch": 5.905797101449275, "grad_norm": 27282.517385064813, "learning_rate": 1.74313758638889e-08, "logits/chosen": 1.2187312841415405, "logits/rejected": 1.888061761856079, "logps/chosen": -5.0353240966796875, "logps/rejected": -7.782090187072754, "loss": 5002.2801, "rewards/accuracies": 0.17499999701976776, "rewards/chosen": 0.0005472655757330358, "rewards/margins": 0.0013812981778755784, "rewards/rejected": -0.0008340327185578644, "step": 1630 }, { "debug/policy_chosen_logits": 1.2369142770767212, "debug/policy_chosen_logps": -103.79841613769531, "debug/policy_rejected_logits": 1.7102587223052979, "debug/policy_rejected_logps": -3.7868709564208984, "debug/reference_chosen_logps": -105.0621337890625, "debug/reference_rejected_logps": -3.808579921722412, "debug/sppo_chosen_loss": 2407.60693359375, "debug/sppo_chosen_reward_in_loss": 1.2637252807617188, "debug/sppo_rej_reward_in_loss": 0.0217093825340271, "debug/sppo_reject_loss": 2502.196533203125, "epoch": 5.923913043478261, "grad_norm": 111368.3422774197, "learning_rate": 1.7149584753280877e-08, "logits/chosen": 1.2369142770767212, "logits/rejected": 1.7102587223052979, "logps/chosen": -103.79841613769531, "logps/rejected": -3.7868709564208984, "loss": 4993.5477, "rewards/accuracies": 0.30000001192092896, "rewards/chosen": 0.012637251988053322, "rewards/margins": 0.01242015790194273, "rewards/rejected": 0.00021709380962420255, "step": 1635 }, { "debug/policy_chosen_logits": 1.109013319015503, "debug/policy_chosen_logps": -10.810483932495117, "debug/policy_rejected_logits": 1.179882287979126, "debug/policy_rejected_logps": -6.817288875579834, "debug/reference_chosen_logps": -10.813862800598145, "debug/reference_rejected_logps": -6.787085056304932, "debug/sppo_chosen_loss": 2499.72509765625, "debug/sppo_chosen_reward_in_loss": 0.0033795118797570467, "debug/sppo_rej_reward_in_loss": -0.03020372986793518, "debug/sppo_reject_loss": 2497.005126953125, "epoch": 5.942028985507246, "grad_norm": 49605.73934186764, "learning_rate": 1.6869617695429024e-08, "logits/chosen": 1.109013319015503, "logits/rejected": 1.179882287979126, "logps/chosen": -10.810483932495117, "logps/rejected": -6.817288875579834, "loss": 4980.6281, "rewards/accuracies": 0.32499998807907104, "rewards/chosen": 3.37951241817791e-05, "rewards/margins": 0.0003358324174769223, "rewards/rejected": -0.00030203728238120675, "step": 1640 }, { "debug/policy_chosen_logits": 1.5205527544021606, "debug/policy_chosen_logps": -6.074230194091797, "debug/policy_rejected_logits": 1.9895918369293213, "debug/policy_rejected_logps": -7.170431613922119, "debug/reference_chosen_logps": -6.074235439300537, "debug/reference_rejected_logps": -7.147464752197266, "debug/sppo_chosen_loss": 2500.06396484375, "debug/sppo_chosen_reward_in_loss": 5.422532467491692e-06, "debug/sppo_rej_reward_in_loss": -0.022966548800468445, "debug/sppo_reject_loss": 2497.725830078125, "epoch": 5.960144927536232, "grad_norm": 20646.385076483217, "learning_rate": 1.659149023578932e-08, "logits/chosen": 1.5205527544021606, "logits/rejected": 1.9895918369293213, "logps/chosen": -6.074230194091797, "logps/rejected": -7.170431613922119, "loss": 4987.4703, "rewards/accuracies": 0.2750000059604645, "rewards/chosen": 5.417969006771273e-08, "rewards/margins": 0.0002297197061125189, "rewards/rejected": -0.00022966550022829324, "step": 1645 }, { "debug/policy_chosen_logits": 1.3919899463653564, "debug/policy_chosen_logps": -185.52378845214844, "debug/policy_rejected_logits": 1.7907886505126953, "debug/policy_rejected_logps": -9.291855812072754, "debug/reference_chosen_logps": -187.60826110839844, "debug/reference_rejected_logps": -9.161067008972168, "debug/sppo_chosen_loss": 2373.19775390625, "debug/sppo_chosen_reward_in_loss": 2.0844624042510986, "debug/sppo_rej_reward_in_loss": -0.13078823685646057, "debug/sppo_reject_loss": 2487.045654296875, "epoch": 5.978260869565218, "grad_norm": 22834.800414878046, "learning_rate": 1.631521781767214e-08, "logits/chosen": 1.3919899463653564, "logits/rejected": 1.7907886505126953, "logps/chosen": -185.52378845214844, "logps/rejected": -9.291855812072754, "loss": 4937.6363, "rewards/accuracies": 0.44999998807907104, "rewards/chosen": 0.02084462344646454, "rewards/margins": 0.022152505815029144, "rewards/rejected": -0.0013078822521492839, "step": 1650 }, { "debug/policy_chosen_logits": 1.2088714838027954, "debug/policy_chosen_logps": -7.3690619468688965, "debug/policy_rejected_logits": 1.3337514400482178, "debug/policy_rejected_logps": -85.87855529785156, "debug/reference_chosen_logps": -7.433223724365234, "debug/reference_rejected_logps": -85.72587585449219, "debug/sppo_chosen_loss": 2493.611328125, "debug/sppo_chosen_reward_in_loss": 0.06416082382202148, "debug/sppo_rej_reward_in_loss": -0.15267474949359894, "debug/sppo_reject_loss": 2485.583740234375, "epoch": 5.996376811594203, "grad_norm": 19416.46289066037, "learning_rate": 1.6040815781384835e-08, "logits/chosen": 1.2088714838027954, "logits/rejected": 1.3337514400482178, "logps/chosen": -7.3690619468688965, "logps/rejected": -85.87855529785156, "loss": 4952.1059, "rewards/accuracies": 0.3499999940395355, "rewards/chosen": 0.0006416082615032792, "rewards/margins": 0.00216835574246943, "rewards/rejected": -0.0015267474809661508, "step": 1655 }, { "debug/policy_chosen_logits": 1.5499736070632935, "debug/policy_chosen_logps": -66.28557586669922, "debug/policy_rejected_logits": 1.6676782369613647, "debug/policy_rejected_logps": -7.251202583312988, "debug/reference_chosen_logps": -66.9842300415039, "debug/reference_rejected_logps": -7.152446746826172, "debug/sppo_chosen_loss": 2450.07763671875, "debug/sppo_chosen_reward_in_loss": 0.6986457109451294, "debug/sppo_rej_reward_in_loss": -0.0987558513879776, "debug/sppo_reject_loss": 2490.287109375, "epoch": 6.0144927536231885, "grad_norm": 59859.51539404919, "learning_rate": 1.5768299363379873e-08, "logits/chosen": 1.5499736070632935, "logits/rejected": 1.6676782369613647, "logps/chosen": -66.28557586669922, "logps/rejected": -7.251202583312988, "loss": 4996.2133, "rewards/accuracies": 0.3499999940395355, "rewards/chosen": 0.006986456923186779, "rewards/margins": 0.007974015548825264, "rewards/rejected": -0.0009875585092231631, "step": 1660 }, { "debug/policy_chosen_logits": 1.3529396057128906, "debug/policy_chosen_logps": -83.43315124511719, "debug/policy_rejected_logits": 1.4442598819732666, "debug/policy_rejected_logps": -5.616559028625488, "debug/reference_chosen_logps": -84.40177154541016, "debug/reference_rejected_logps": -5.586115837097168, "debug/sppo_chosen_loss": 2433.258544921875, "debug/sppo_chosen_reward_in_loss": 0.9686270952224731, "debug/sppo_rej_reward_in_loss": -0.03044305369257927, "debug/sppo_reject_loss": 2497.00732421875, "epoch": 6.032608695652174, "grad_norm": 17008.392369189205, "learning_rate": 1.549768369540882e-08, "logits/chosen": 1.3529396057128906, "logits/rejected": 1.4442598819732666, "logps/chosen": -83.43315124511719, "logps/rejected": -5.616559028625488, "loss": 4962.4039, "rewards/accuracies": 0.32499998807907104, "rewards/chosen": 0.009686270728707314, "rewards/margins": 0.009990701451897621, "rewards/rejected": -0.0003044305194634944, "step": 1665 }, { "debug/policy_chosen_logits": 1.2111425399780273, "debug/policy_chosen_logps": -8.826377868652344, "debug/policy_rejected_logits": 1.5931882858276367, "debug/policy_rejected_logps": -10.089231491088867, "debug/reference_chosen_logps": -8.831872940063477, "debug/reference_rejected_logps": -9.832635879516602, "debug/sppo_chosen_loss": 2499.543701171875, "debug/sppo_chosen_reward_in_loss": 0.005494547076523304, "debug/sppo_rej_reward_in_loss": -0.25659456849098206, "debug/sppo_reject_loss": 2474.903564453125, "epoch": 6.050724637681159, "grad_norm": 91928.50603702008, "learning_rate": 1.5228983803682233e-08, "logits/chosen": 1.2111425399780273, "logits/rejected": 1.5931882858276367, "logps/chosen": -8.826377868652344, "logps/rejected": -10.089231491088867, "loss": 4971.4375, "rewards/accuracies": 0.32499998807907104, "rewards/chosen": 5.494547076523304e-05, "rewards/margins": 0.0026208912022411823, "rewards/rejected": -0.002565945964306593, "step": 1670 }, { "debug/policy_chosen_logits": 1.316004753112793, "debug/policy_chosen_logps": -8.941271781921387, "debug/policy_rejected_logits": 1.5742324590682983, "debug/policy_rejected_logps": -104.9312973022461, "debug/reference_chosen_logps": -8.899450302124023, "debug/reference_rejected_logps": -105.12371826171875, "debug/sppo_chosen_loss": 2504.29931640625, "debug/sppo_chosen_reward_in_loss": -0.0418214276432991, "debug/sppo_rej_reward_in_loss": 0.1924237310886383, "debug/sppo_reject_loss": 2521.781494140625, "epoch": 6.068840579710145, "grad_norm": 23841.03068137901, "learning_rate": 1.4962214608035174e-08, "logits/chosen": 1.316004753112793, "logits/rejected": 1.5742324590682983, "logps/chosen": -8.941271781921387, "logps/rejected": -104.9312973022461, "loss": 4985.0203, "rewards/accuracies": 0.32499998807907104, "rewards/chosen": -0.0004182142729405314, "rewards/margins": -0.0023424515966326, "rewards/rejected": 0.001924237236380577, "step": 1675 }, { "debug/policy_chosen_logits": 1.1703146696090698, "debug/policy_chosen_logps": -34.33024978637695, "debug/policy_rejected_logits": 1.4100024700164795, "debug/policy_rejected_logps": -6.018087863922119, "debug/reference_chosen_logps": -34.69602584838867, "debug/reference_rejected_logps": -5.9452972412109375, "debug/sppo_chosen_loss": 2467.084716796875, "debug/sppo_chosen_reward_in_loss": 0.3657784163951874, "debug/sppo_rej_reward_in_loss": -0.07279090583324432, "debug/sppo_reject_loss": 2492.968017578125, "epoch": 6.086956521739131, "grad_norm": 64459.95215283099, "learning_rate": 1.4697390921098884e-08, "logits/chosen": 1.1703146696090698, "logits/rejected": 1.4100024700164795, "logps/chosen": -34.33024978637695, "logps/rejected": -6.018087863922119, "loss": 4958.3562, "rewards/accuracies": 0.3499999940395355, "rewards/chosen": 0.0036577838473021984, "rewards/margins": 0.004385693464428186, "rewards/rejected": -0.000727908976841718, "step": 1680 }, { "debug/policy_chosen_logits": 1.2827374935150146, "debug/policy_chosen_logps": -7.593850612640381, "debug/policy_rejected_logits": 1.7532732486724854, "debug/policy_rejected_logps": -7.625147342681885, "debug/reference_chosen_logps": -7.592371463775635, "debug/reference_rejected_logps": -7.573750972747803, "debug/sppo_chosen_loss": 2500.18115234375, "debug/sppo_chosen_reward_in_loss": -0.0014788091648370028, "debug/sppo_rej_reward_in_loss": -0.0513957254588604, "debug/sppo_reject_loss": 2494.90771484375, "epoch": 6.105072463768116, "grad_norm": 29640.476422377305, "learning_rate": 1.4434527447478211e-08, "logits/chosen": 1.2827374935150146, "logits/rejected": 1.7532732486724854, "logps/chosen": -7.593850612640381, "logps/rejected": -7.625147342681885, "loss": 4962.9441, "rewards/accuracies": 0.375, "rewards/chosen": -1.4788075532123912e-05, "rewards/margins": 0.0004991692258045077, "rewards/rejected": -0.0005139572313055396, "step": 1685 }, { "debug/policy_chosen_logits": 1.40358304977417, "debug/policy_chosen_logps": -121.5127944946289, "debug/policy_rejected_logits": 1.5667502880096436, "debug/policy_rejected_logps": -5.4873127937316895, "debug/reference_chosen_logps": -123.18696594238281, "debug/reference_rejected_logps": -5.408583641052246, "debug/sppo_chosen_loss": 2434.274169921875, "debug/sppo_chosen_reward_in_loss": 1.6741540431976318, "debug/sppo_rej_reward_in_loss": -0.07872933149337769, "debug/sppo_reject_loss": 2492.261474609375, "epoch": 6.1231884057971016, "grad_norm": 66838.30315050815, "learning_rate": 1.4173638782935222e-08, "logits/chosen": 1.40358304977417, "logits/rejected": 1.5667502880096436, "logps/chosen": -121.5127944946289, "logps/rejected": -5.4873127937316895, "loss": 4980.4492, "rewards/accuracies": 0.550000011920929, "rewards/chosen": 0.016741540282964706, "rewards/margins": 0.017528831958770752, "rewards/rejected": -0.000787293363828212, "step": 1690 }, { "debug/policy_chosen_logits": 1.351097822189331, "debug/policy_chosen_logps": -5.877753257751465, "debug/policy_rejected_logits": 1.7722618579864502, "debug/policy_rejected_logps": -4.840622901916504, "debug/reference_chosen_logps": -5.891690254211426, "debug/reference_rejected_logps": -4.724798679351807, "debug/sppo_chosen_loss": 2498.634765625, "debug/sppo_chosen_reward_in_loss": 0.013937163166701794, "debug/sppo_rej_reward_in_loss": -0.11582396179437637, "debug/sppo_reject_loss": 2488.51708984375, "epoch": 6.141304347826087, "grad_norm": 163089.54838260455, "learning_rate": 1.3914739413578635e-08, "logits/chosen": 1.351097822189331, "logits/rejected": 1.7722618579864502, "logps/chosen": -5.877753257751465, "logps/rejected": -4.840622901916504, "loss": 4937.6969, "rewards/accuracies": 0.42500001192092896, "rewards/chosen": 0.0001393716229358688, "rewards/margins": 0.001297611161135137, "rewards/rejected": -0.0011582396691665053, "step": 1695 }, { "debug/policy_chosen_logits": 1.2490367889404297, "debug/policy_chosen_logps": -5.262415409088135, "debug/policy_rejected_logits": 1.4162667989730835, "debug/policy_rejected_logps": -5.6771368980407715, "debug/reference_chosen_logps": -5.239500999450684, "debug/reference_rejected_logps": -5.618521213531494, "debug/sppo_chosen_loss": 2502.337158203125, "debug/sppo_chosen_reward_in_loss": -0.02291507087647915, "debug/sppo_rej_reward_in_loss": -0.058615513145923615, "debug/sppo_reject_loss": 2494.166748046875, "epoch": 6.159420289855072, "grad_norm": 62909.51237939224, "learning_rate": 1.3657843715059546e-08, "logits/chosen": 1.2490367889404297, "logits/rejected": 1.4162667989730835, "logps/chosen": -5.262415409088135, "logps/rejected": -5.6771368980407715, "loss": 4931.8516, "rewards/accuracies": 0.32499998807907104, "rewards/chosen": -0.00022915071167517453, "rewards/margins": 0.0003570044063962996, "rewards/rejected": -0.0005861551035195589, "step": 1700 }, { "epoch": 6.159420289855072, "eval_debug/policy_chosen_logits": 1.6596750020980835, "eval_debug/policy_chosen_logps": -121.01417541503906, "eval_debug/policy_rejected_logits": 1.7205696105957031, "eval_debug/policy_rejected_logps": -63.6300048828125, "eval_debug/reference_chosen_logps": -123.14806365966797, "eval_debug/reference_rejected_logps": -63.887054443359375, "eval_debug/sppo_chosen_loss": 2381.447509765625, "eval_debug/sppo_chosen_reward_in_loss": 2.133875846862793, "eval_debug/sppo_rej_reward_in_loss": 0.25704607367515564, "eval_debug/sppo_reject_loss": 2532.861572265625, "eval_logits/chosen": 1.6596750020980835, "eval_logits/rejected": 1.7205696105957031, "eval_logps/chosen": -121.01417541503906, "eval_logps/rejected": -63.6300048828125, "eval_loss": 4959.40234375, "eval_rewards/accuracies": 0.2631579041481018, "eval_rewards/chosen": 0.02133875899016857, "eval_rewards/margins": 0.018768297508358955, "eval_rewards/rejected": 0.002570460783317685, "eval_runtime": 28.9734, "eval_samples_per_second": 20.709, "eval_steps_per_second": 0.656, "step": 1700 }, { "debug/policy_chosen_logits": 1.3703433275222778, "debug/policy_chosen_logps": -81.84334564208984, "debug/policy_rejected_logits": 1.7938333749771118, "debug/policy_rejected_logps": -16.49462890625, "debug/reference_chosen_logps": -82.63568878173828, "debug/reference_rejected_logps": -16.59540557861328, "debug/sppo_chosen_loss": 2433.10791015625, "debug/sppo_chosen_reward_in_loss": 0.7923446893692017, "debug/sppo_rej_reward_in_loss": 0.10077603906393051, "debug/sppo_reject_loss": 2510.62548828125, "epoch": 6.177536231884058, "grad_norm": 14951.240983492704, "learning_rate": 1.3402965951773231e-08, "logits/chosen": 1.3703433275222778, "logits/rejected": 1.7938333749771118, "logps/chosen": -81.84334564208984, "logps/rejected": -16.49462890625, "loss": 4947.6234, "rewards/accuracies": 0.2750000059604645, "rewards/chosen": 0.007923447526991367, "rewards/margins": 0.006915686186403036, "rewards/rejected": 0.0010077605256810784, "step": 1705 }, { "debug/policy_chosen_logits": 1.2998325824737549, "debug/policy_chosen_logps": -12.14608097076416, "debug/policy_rejected_logits": 1.7567226886749268, "debug/policy_rejected_logps": -27.60630226135254, "debug/reference_chosen_logps": -12.21338176727295, "debug/reference_rejected_logps": -27.601428985595703, "debug/sppo_chosen_loss": 2493.40576171875, "debug/sppo_chosen_reward_in_loss": 0.06730131804943085, "debug/sppo_rej_reward_in_loss": -0.00487400870770216, "debug/sppo_reject_loss": 2499.742919921875, "epoch": 6.195652173913044, "grad_norm": 45002.25815377738, "learning_rate": 1.3150120276067005e-08, "logits/chosen": 1.2998325824737549, "logits/rejected": 1.7567226886749268, "logps/chosen": -12.14608097076416, "logps/rejected": -27.60630226135254, "loss": 4988.4074, "rewards/accuracies": 0.42500001192092896, "rewards/chosen": 0.0006730131572112441, "rewards/margins": 0.0007217532838694751, "rewards/rejected": -4.874013757216744e-05, "step": 1710 }, { "debug/policy_chosen_logits": 1.0907835960388184, "debug/policy_chosen_logps": -93.25957489013672, "debug/policy_rejected_logits": 1.5257341861724854, "debug/policy_rejected_logps": -2.9405274391174316, "debug/reference_chosen_logps": -94.5040283203125, "debug/reference_rejected_logps": -2.808331251144409, "debug/sppo_chosen_loss": 2436.52685546875, "debug/sppo_chosen_reward_in_loss": 1.244462013244629, "debug/sppo_rej_reward_in_loss": -0.13219599425792694, "debug/sppo_reject_loss": 2486.90576171875, "epoch": 6.213768115942029, "grad_norm": 78490.81524588111, "learning_rate": 1.2899320727454472e-08, "logits/chosen": 1.0907835960388184, "logits/rejected": 1.5257341861724854, "logps/chosen": -93.25957489013672, "logps/rejected": -2.9405274391174316, "loss": 4980.668, "rewards/accuracies": 0.32499998807907104, "rewards/chosen": 0.012444620952010155, "rewards/margins": 0.013766579329967499, "rewards/rejected": -0.0013219600077718496, "step": 1715 }, { "debug/policy_chosen_logits": 1.462038278579712, "debug/policy_chosen_logps": -46.9264030456543, "debug/policy_rejected_logits": 1.577406883239746, "debug/policy_rejected_logps": -6.186155319213867, "debug/reference_chosen_logps": -47.269874572753906, "debug/reference_rejected_logps": -6.084237098693848, "debug/sppo_chosen_loss": 2469.14013671875, "debug/sppo_chosen_reward_in_loss": 0.343473345041275, "debug/sppo_rej_reward_in_loss": -0.10191808640956879, "debug/sppo_reject_loss": 2489.91943359375, "epoch": 6.231884057971015, "grad_norm": 47521.68527150804, "learning_rate": 1.2650581231835921e-08, "logits/chosen": 1.462038278579712, "logits/rejected": 1.577406883239746, "logps/chosen": -46.9264030456543, "logps/rejected": -6.186155319213867, "loss": 4976.3578, "rewards/accuracies": 0.42500001192092896, "rewards/chosen": 0.0034347332548350096, "rewards/margins": 0.004453913774341345, "rewards/rejected": -0.0010191809851676226, "step": 1720 }, { "debug/policy_chosen_logits": 1.4536765813827515, "debug/policy_chosen_logps": -5.371994495391846, "debug/policy_rejected_logits": 1.726032018661499, "debug/policy_rejected_logps": -11.83240032196045, "debug/reference_chosen_logps": -5.359372138977051, "debug/reference_rejected_logps": -11.723997116088867, "debug/sppo_chosen_loss": 2501.31396484375, "debug/sppo_chosen_reward_in_loss": -0.012622115202248096, "debug/sppo_rej_reward_in_loss": -0.10840407758951187, "debug/sppo_reject_loss": 2489.285400390625, "epoch": 6.25, "grad_norm": 19863.151932196044, "learning_rate": 1.2403915600725157e-08, "logits/chosen": 1.4536765813827515, "logits/rejected": 1.726032018661499, "logps/chosen": -5.371994495391846, "logps/rejected": -11.83240032196045, "loss": 4973.6211, "rewards/accuracies": 0.3499999940395355, "rewards/chosen": -0.00012622112990356982, "rewards/margins": 0.0009578196331858635, "rewards/rejected": -0.001084040617570281, "step": 1725 }, { "debug/policy_chosen_logits": 1.4391934871673584, "debug/policy_chosen_logps": -6.197887420654297, "debug/policy_rejected_logits": 1.6112970113754272, "debug/policy_rejected_logps": -6.456604957580566, "debug/reference_chosen_logps": -6.293431758880615, "debug/reference_rejected_logps": -6.414093971252441, "debug/sppo_chosen_loss": 2490.48974609375, "debug/sppo_chosen_reward_in_loss": 0.09554453194141388, "debug/sppo_rej_reward_in_loss": -0.04251168295741081, "debug/sppo_reject_loss": 2495.76806640625, "epoch": 6.268115942028985, "grad_norm": 35954.85828078031, "learning_rate": 1.2159337530482494e-08, "logits/chosen": 1.4391934871673584, "logits/rejected": 1.6112970113754272, "logps/chosen": -6.197887420654297, "logps/rejected": -6.456604957580566, "loss": 4946.5695, "rewards/accuracies": 0.5, "rewards/chosen": 0.0009554452262818813, "rewards/margins": 0.0013805620837956667, "rewards/rejected": -0.00042511679930612445, "step": 1730 }, { "debug/policy_chosen_logits": 1.232269048690796, "debug/policy_chosen_logps": -9.366016387939453, "debug/policy_rejected_logits": 1.5713814496994019, "debug/policy_rejected_logps": -171.44876098632812, "debug/reference_chosen_logps": -9.423843383789062, "debug/reference_rejected_logps": -171.1486053466797, "debug/sppo_chosen_loss": 2494.317626953125, "debug/sppo_chosen_reward_in_loss": 0.057827699929475784, "debug/sppo_rej_reward_in_loss": -0.30015993118286133, "debug/sppo_reject_loss": 2523.0087890625, "epoch": 6.286231884057971, "grad_norm": 37988.770141737434, "learning_rate": 1.1916860601554312e-08, "logits/chosen": 1.232269048690796, "logits/rejected": 1.5713814496994019, "logps/chosen": -9.366016387939453, "logps/rejected": -171.44876098632812, "loss": 4943.5426, "rewards/accuracies": 0.375, "rewards/chosen": 0.0005782769876532257, "rewards/margins": 0.0035798773169517517, "rewards/rejected": -0.003001599106937647, "step": 1735 }, { "debug/policy_chosen_logits": 1.485622763633728, "debug/policy_chosen_logps": -92.58248901367188, "debug/policy_rejected_logits": 1.8550310134887695, "debug/policy_rejected_logps": -46.930030822753906, "debug/reference_chosen_logps": -93.40971374511719, "debug/reference_rejected_logps": -47.21750259399414, "debug/sppo_chosen_loss": 2430.24951171875, "debug/sppo_chosen_reward_in_loss": 0.8272258639335632, "debug/sppo_rej_reward_in_loss": 0.28746458888053894, "debug/sppo_reject_loss": 2533.79833984375, "epoch": 6.304347826086957, "grad_norm": 46267.3607143779, "learning_rate": 1.1676498277719017e-08, "logits/chosen": 1.485622763633728, "logits/rejected": 1.8550310134887695, "logps/chosen": -92.58248901367188, "logps/rejected": -46.930030822753906, "loss": 4976.6016, "rewards/accuracies": 0.42500001192092896, "rewards/chosen": 0.008272258564829826, "rewards/margins": 0.0053976126946508884, "rewards/rejected": 0.0028746456373482943, "step": 1740 }, { "debug/policy_chosen_logits": 1.071164608001709, "debug/policy_chosen_logps": -28.790414810180664, "debug/policy_rejected_logits": 1.2676937580108643, "debug/policy_rejected_logps": -7.485450744628906, "debug/reference_chosen_logps": -29.042627334594727, "debug/reference_rejected_logps": -7.397076606750488, "debug/sppo_chosen_loss": 2477.00244140625, "debug/sppo_chosen_reward_in_loss": 0.25221022963523865, "debug/sppo_rej_reward_in_loss": -0.08837547898292542, "debug/sppo_reject_loss": 2491.236083984375, "epoch": 6.322463768115942, "grad_norm": 40642.82620036158, "learning_rate": 1.1438263905339358e-08, "logits/chosen": 1.071164608001709, "logits/rejected": 1.2676937580108643, "logps/chosen": -28.790414810180664, "logps/rejected": -7.485450744628906, "loss": 4962.7656, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": 0.00252210209146142, "rewards/margins": 0.0034058571327477694, "rewards/rejected": -0.0008837546920403838, "step": 1745 }, { "debug/policy_chosen_logits": 1.449456810951233, "debug/policy_chosen_logps": -67.2823257446289, "debug/policy_rejected_logits": 1.6647388935089111, "debug/policy_rejected_logps": -62.98200607299805, "debug/reference_chosen_logps": -67.7783203125, "debug/reference_rejected_logps": -62.94593048095703, "debug/sppo_chosen_loss": 2457.11669921875, "debug/sppo_chosen_reward_in_loss": 0.4959987699985504, "debug/sppo_rej_reward_in_loss": -0.03608518838882446, "debug/sppo_reject_loss": 2497.629638671875, "epoch": 6.340579710144928, "grad_norm": 93559.37413403038, "learning_rate": 1.1202170712621467e-08, "logits/chosen": 1.449456810951233, "logits/rejected": 1.6647388935089111, "logps/chosen": -67.2823257446289, "logps/rejected": -62.98200607299805, "loss": 4976.6578, "rewards/accuracies": 0.32499998807907104, "rewards/chosen": 0.0049599879421293736, "rewards/margins": 0.005320839583873749, "rewards/rejected": -0.0003608519327826798, "step": 1750 }, { "debug/policy_chosen_logits": 1.1579824686050415, "debug/policy_chosen_logps": -92.89985656738281, "debug/policy_rejected_logits": 1.4179567098617554, "debug/policy_rejected_logps": -8.033535957336426, "debug/reference_chosen_logps": -94.22650909423828, "debug/reference_rejected_logps": -7.949918270111084, "debug/sppo_chosen_loss": 2439.9619140625, "debug/sppo_chosen_reward_in_loss": 1.3266427516937256, "debug/sppo_rej_reward_in_loss": -0.08361731469631195, "debug/sppo_reject_loss": 2491.69677734375, "epoch": 6.358695652173913, "grad_norm": 17044.334965689755, "learning_rate": 1.0968231808880241e-08, "logits/chosen": 1.1579824686050415, "logits/rejected": 1.4179567098617554, "logps/chosen": -92.89985656738281, "logps/rejected": -8.033535957336426, "loss": 4974.5734, "rewards/accuracies": 0.3499999940395355, "rewards/chosen": 0.013266431167721748, "rewards/margins": 0.014102603308856487, "rewards/rejected": -0.0008361730724573135, "step": 1755 }, { "debug/policy_chosen_logits": 1.3427035808563232, "debug/policy_chosen_logps": -77.3194580078125, "debug/policy_rejected_logits": 1.6082401275634766, "debug/policy_rejected_logps": -3.9593892097473145, "debug/reference_chosen_logps": -78.01481628417969, "debug/reference_rejected_logps": -3.9095966815948486, "debug/sppo_chosen_loss": 2439.074462890625, "debug/sppo_chosen_reward_in_loss": 0.6953679323196411, "debug/sppo_rej_reward_in_loss": -0.049792028963565826, "debug/sppo_reject_loss": 2495.048583984375, "epoch": 6.3768115942028984, "grad_norm": 34719.28752700949, "learning_rate": 1.0736460183811546e-08, "logits/chosen": 1.3427035808563232, "logits/rejected": 1.6082401275634766, "logps/chosen": -77.3194580078125, "logps/rejected": -3.9593892097473145, "loss": 4948.2844, "rewards/accuracies": 0.42500001192092896, "rewards/chosen": 0.006953679025173187, "rewards/margins": 0.00745159899815917, "rewards/rejected": -0.0004979203222319484, "step": 1760 }, { "debug/policy_chosen_logits": 1.547303318977356, "debug/policy_chosen_logps": -80.71726989746094, "debug/policy_rejected_logits": 1.6058326959609985, "debug/policy_rejected_logps": -4.3524065017700195, "debug/reference_chosen_logps": -81.39645385742188, "debug/reference_rejected_logps": -4.305384635925293, "debug/sppo_chosen_loss": 2450.37158203125, "debug/sppo_chosen_reward_in_loss": 0.6791876554489136, "debug/sppo_rej_reward_in_loss": -0.04702133685350418, "debug/sppo_reject_loss": 2495.509765625, "epoch": 6.394927536231884, "grad_norm": 32521.525350586937, "learning_rate": 1.0506868706770844e-08, "logits/chosen": 1.547303318977356, "logits/rejected": 1.6058326959609985, "logps/chosen": -80.71726989746094, "logps/rejected": -4.3524065017700195, "loss": 4981.0656, "rewards/accuracies": 0.30000001192092896, "rewards/chosen": 0.006791877094656229, "rewards/margins": 0.007262089289724827, "rewards/rejected": -0.000470213417429477, "step": 1765 }, { "debug/policy_chosen_logits": 1.31116783618927, "debug/policy_chosen_logps": -70.85577392578125, "debug/policy_rejected_logits": 1.7284290790557861, "debug/policy_rejected_logps": -135.3305206298828, "debug/reference_chosen_logps": -71.41020202636719, "debug/reference_rejected_logps": -136.6486053466797, "debug/sppo_chosen_loss": 2456.178466796875, "debug/sppo_chosen_reward_in_loss": 0.5544347167015076, "debug/sppo_rej_reward_in_loss": 1.318110704421997, "debug/sppo_reject_loss": 2655.50439453125, "epoch": 6.413043478260869, "grad_norm": 133465.56232908618, "learning_rate": 1.0279470126058676e-08, "logits/chosen": 1.31116783618927, "logits/rejected": 1.7284290790557861, "logps/chosen": -70.85577392578125, "logps/rejected": -135.3305206298828, "loss": 4962.3023, "rewards/accuracies": 0.2750000059604645, "rewards/chosen": 0.005544346757233143, "rewards/margins": -0.007636760361492634, "rewards/rejected": 0.013181107118725777, "step": 1770 }, { "debug/policy_chosen_logits": 1.4565809965133667, "debug/policy_chosen_logps": -111.69891357421875, "debug/policy_rejected_logits": 1.4828486442565918, "debug/policy_rejected_logps": -28.409770965576172, "debug/reference_chosen_logps": -112.68605041503906, "debug/reference_rejected_logps": -28.50644874572754, "debug/sppo_chosen_loss": 2424.48779296875, "debug/sppo_chosen_reward_in_loss": 0.9871335029602051, "debug/sppo_rej_reward_in_loss": 0.09667740762233734, "debug/sppo_reject_loss": 2510.657470703125, "epoch": 6.431159420289855, "grad_norm": 22320.355015060846, "learning_rate": 1.0054277068212797e-08, "logits/chosen": 1.4565809965133667, "logits/rejected": 1.4828486442565918, "logps/chosen": -111.69891357421875, "logps/rejected": -28.409770965576172, "loss": 4958.4504, "rewards/accuracies": 0.32499998807907104, "rewards/chosen": 0.009871335700154305, "rewards/margins": 0.008904560469090939, "rewards/rejected": 0.0009667740087024868, "step": 1775 }, { "debug/policy_chosen_logits": 1.0780291557312012, "debug/policy_chosen_logps": -3.783858060836792, "debug/policy_rejected_logits": 1.4318087100982666, "debug/policy_rejected_logps": -10.056068420410156, "debug/reference_chosen_logps": -3.734755039215088, "debug/reference_rejected_logps": -9.857994079589844, "debug/sppo_chosen_loss": 2504.95849609375, "debug/sppo_chosen_reward_in_loss": -0.049103133380413055, "debug/sppo_rej_reward_in_loss": -0.1980745792388916, "debug/sppo_reject_loss": 2480.594482421875, "epoch": 6.449275362318841, "grad_norm": 37180.6381103719, "learning_rate": 9.831302037307021e-09, "logits/chosen": 1.0780291557312012, "logits/rejected": 1.4318087100982666, "logps/chosen": -3.783858060836792, "logps/rejected": -10.056068420410156, "loss": 4990.1324, "rewards/accuracies": 0.32499998807907104, "rewards/chosen": -0.0004910313291475177, "rewards/margins": 0.0014897143701091409, "rewards/rejected": -0.0019807456992566586, "step": 1780 }, { "debug/policy_chosen_logits": 1.710719108581543, "debug/policy_chosen_logps": -56.6034049987793, "debug/policy_rejected_logits": 1.7720228433609009, "debug/policy_rejected_logps": -109.38954162597656, "debug/reference_chosen_logps": -57.20923614501953, "debug/reference_rejected_logps": -108.96925354003906, "debug/sppo_chosen_loss": 2455.64306640625, "debug/sppo_chosen_reward_in_loss": 0.6058371663093567, "debug/sppo_rej_reward_in_loss": -0.4202834665775299, "debug/sppo_reject_loss": 2462.017333984375, "epoch": 6.467391304347826, "grad_norm": 93062.52102557954, "learning_rate": 9.610557414257009e-09, "logits/chosen": 1.710719108581543, "logits/rejected": 1.7720228433609009, "logps/chosen": -56.6034049987793, "logps/rejected": -109.38954162597656, "loss": 4958.8195, "rewards/accuracies": 0.30000001192092896, "rewards/chosen": 0.006058371625840664, "rewards/margins": 0.010261205956339836, "rewards/rejected": -0.004202834330499172, "step": 1785 }, { "debug/policy_chosen_logits": 1.0839542150497437, "debug/policy_chosen_logps": -113.24333190917969, "debug/policy_rejected_logits": 1.4320446252822876, "debug/policy_rejected_logps": -16.650415420532227, "debug/reference_chosen_logps": -114.78114318847656, "debug/reference_rejected_logps": -16.70585060119629, "debug/sppo_chosen_loss": 2395.4326171875, "debug/sppo_chosen_reward_in_loss": 1.5378117561340332, "debug/sppo_rej_reward_in_loss": 0.05543426424264908, "debug/sppo_reject_loss": 2505.8603515625, "epoch": 6.4855072463768115, "grad_norm": 45870.12494449873, "learning_rate": 9.392055456132713e-09, "logits/chosen": 1.0839542150497437, "logits/rejected": 1.4320446252822876, "logps/chosen": -113.24333190917969, "logps/rejected": -16.650415420532227, "loss": 4949.2418, "rewards/accuracies": 0.375, "rewards/chosen": 0.015378117561340332, "rewards/margins": 0.014823774807155132, "rewards/rejected": 0.0005543426377698779, "step": 1790 }, { "debug/policy_chosen_logits": 1.4075796604156494, "debug/policy_chosen_logps": -177.176025390625, "debug/policy_rejected_logits": 1.5782158374786377, "debug/policy_rejected_logps": -196.8184051513672, "debug/reference_chosen_logps": -178.44227600097656, "debug/reference_rejected_logps": -196.77088928222656, "debug/sppo_chosen_loss": 2415.40283203125, "debug/sppo_chosen_reward_in_loss": 1.2662612199783325, "debug/sppo_rej_reward_in_loss": -0.04751582071185112, "debug/sppo_reject_loss": 2501.083740234375, "epoch": 6.503623188405797, "grad_norm": 68764.55007615987, "learning_rate": 9.175808295477849e-09, "logits/chosen": 1.4075796604156494, "logits/rejected": 1.5782158374786377, "logps/chosen": -177.176025390625, "logps/rejected": -196.8184051513672, "loss": 4960.5887, "rewards/accuracies": 0.30000001192092896, "rewards/chosen": 0.012662611901760101, "rewards/margins": 0.013137768022716045, "rewards/rejected": -0.00047515815822407603, "step": 1795 }, { "debug/policy_chosen_logits": 1.3861515522003174, "debug/policy_chosen_logps": -133.00692749023438, "debug/policy_rejected_logits": 1.6806089878082275, "debug/policy_rejected_logps": -5.027390003204346, "debug/reference_chosen_logps": -134.6995391845703, "debug/reference_rejected_logps": -4.946457862854004, "debug/sppo_chosen_loss": 2394.70849609375, "debug/sppo_chosen_reward_in_loss": 1.6926031112670898, "debug/sppo_rej_reward_in_loss": -0.0809326022863388, "debug/sppo_reject_loss": 2491.97900390625, "epoch": 6.521739130434782, "grad_norm": 13857.539334061381, "learning_rate": 8.961827939636196e-09, "logits/chosen": 1.3861515522003174, "logits/rejected": 1.6806089878082275, "logps/chosen": -133.00692749023438, "logps/rejected": -5.027390003204346, "loss": 4953.9797, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": 0.016926029697060585, "rewards/margins": 0.017735354602336884, "rewards/rejected": -0.0008093260112218559, "step": 1800 }, { "epoch": 6.521739130434782, "eval_debug/policy_chosen_logits": 1.6601722240447998, "eval_debug/policy_chosen_logps": -121.04447174072266, "eval_debug/policy_rejected_logits": 1.7200790643692017, "eval_debug/policy_rejected_logps": -63.84328842163086, "eval_debug/reference_chosen_logps": -123.14806365966797, "eval_debug/reference_rejected_logps": -63.887054443359375, "eval_debug/sppo_chosen_loss": 2382.340576171875, "eval_debug/sppo_chosen_reward_in_loss": 2.1035938262939453, "eval_debug/sppo_rej_reward_in_loss": 0.04376450926065445, "eval_debug/sppo_reject_loss": 2504.533447265625, "eval_logits/chosen": 1.6601722240447998, "eval_logits/rejected": 1.7200790643692017, "eval_logps/chosen": -121.04447174072266, "eval_logps/rejected": -63.84328842163086, "eval_loss": 4962.03173828125, "eval_rewards/accuracies": 0.28947368264198303, "eval_rewards/chosen": 0.02103593945503235, "eval_rewards/margins": 0.020598294213414192, "eval_rewards/rejected": 0.00043764509609900415, "eval_runtime": 28.7001, "eval_samples_per_second": 20.906, "eval_steps_per_second": 0.662, "step": 1800 }, { "debug/policy_chosen_logits": 1.2995086908340454, "debug/policy_chosen_logps": -103.15284729003906, "debug/policy_rejected_logits": 1.3403165340423584, "debug/policy_rejected_logps": -4.50433349609375, "debug/reference_chosen_logps": -104.10665130615234, "debug/reference_rejected_logps": -4.467912197113037, "debug/sppo_chosen_loss": 2442.026123046875, "debug/sppo_chosen_reward_in_loss": 0.9538156390190125, "debug/sppo_rej_reward_in_loss": -0.03642112761735916, "debug/sppo_reject_loss": 2496.39892578125, "epoch": 6.539855072463768, "grad_norm": 232163.60657949795, "learning_rate": 8.75012627008489e-09, "logits/chosen": 1.2995086908340454, "logits/rejected": 1.3403165340423584, "logps/chosen": -103.15284729003906, "logps/rejected": -4.50433349609375, "loss": 4973.9258, "rewards/accuracies": 0.3499999940395355, "rewards/chosen": 0.009538156911730766, "rewards/margins": 0.009902368299663067, "rewards/rejected": -0.0003642113006208092, "step": 1805 }, { "debug/policy_chosen_logits": 1.3399465084075928, "debug/policy_chosen_logps": -93.3157730102539, "debug/policy_rejected_logits": 1.8695213794708252, "debug/policy_rejected_logps": -96.85462951660156, "debug/reference_chosen_logps": -94.25801849365234, "debug/reference_rejected_logps": -96.91804504394531, "debug/sppo_chosen_loss": 2424.90234375, "debug/sppo_chosen_reward_in_loss": 0.9422407150268555, "debug/sppo_rej_reward_in_loss": 0.06341278553009033, "debug/sppo_reject_loss": 2508.17822265625, "epoch": 6.557971014492754, "grad_norm": 29403.531785865885, "learning_rate": 8.540715041774716e-09, "logits/chosen": 1.3399465084075928, "logits/rejected": 1.8695213794708252, "logps/chosen": -93.3157730102539, "logps/rejected": -96.85462951660156, "loss": 4931.4, "rewards/accuracies": 0.3499999940395355, "rewards/chosen": 0.009422407485544682, "rewards/margins": 0.008788278326392174, "rewards/rejected": 0.0006341278785839677, "step": 1810 }, { "debug/policy_chosen_logits": 1.7488319873809814, "debug/policy_chosen_logps": -6.826148986816406, "debug/policy_rejected_logits": 1.9633022546768188, "debug/policy_rejected_logps": -159.08013916015625, "debug/reference_chosen_logps": -6.899423122406006, "debug/reference_rejected_logps": -159.81626892089844, "debug/sppo_chosen_loss": 2492.733642578125, "debug/sppo_chosen_reward_in_loss": 0.0732741430401802, "debug/sppo_rej_reward_in_loss": 0.7361389398574829, "debug/sppo_reject_loss": 2588.43310546875, "epoch": 6.576086956521739, "grad_norm": 52605.38863327159, "learning_rate": 8.333605882477334e-09, "logits/chosen": 1.7488319873809814, "logits/rejected": 1.9633022546768188, "logps/chosen": -6.826148986816406, "logps/rejected": -159.08013916015625, "loss": 4973.7809, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": 0.0007327414350584149, "rewards/margins": -0.006628647446632385, "rewards/rejected": 0.007361388765275478, "step": 1815 }, { "debug/policy_chosen_logits": 1.2105982303619385, "debug/policy_chosen_logps": -6.728180885314941, "debug/policy_rejected_logits": 1.6417105197906494, "debug/policy_rejected_logps": -1.4647002220153809, "debug/reference_chosen_logps": -6.754980564117432, "debug/reference_rejected_logps": -1.4245069026947021, "debug/sppo_chosen_loss": 2497.39013671875, "debug/sppo_chosen_reward_in_loss": 0.026798833161592484, "debug/sppo_rej_reward_in_loss": -0.040193162858486176, "debug/sppo_reject_loss": 2496.04345703125, "epoch": 6.594202898550725, "grad_norm": 51059.263377887626, "learning_rate": 8.128810292139726e-09, "logits/chosen": 1.2105982303619385, "logits/rejected": 1.6417105197906494, "logps/chosen": -6.728180885314941, "logps/rejected": -1.4647002220153809, "loss": 4986.3105, "rewards/accuracies": 0.2750000059604645, "rewards/chosen": 0.0002679883036762476, "rewards/margins": 0.0006699199439026415, "rewards/rejected": -0.00040193164022639394, "step": 1820 }, { "debug/policy_chosen_logits": 1.1504125595092773, "debug/policy_chosen_logps": -116.09504699707031, "debug/policy_rejected_logits": 1.5865685939788818, "debug/policy_rejected_logps": -29.11785316467285, "debug/reference_chosen_logps": -117.39445495605469, "debug/reference_rejected_logps": -28.985509872436523, "debug/sppo_chosen_loss": 2397.369873046875, "debug/sppo_chosen_reward_in_loss": 1.2994048595428467, "debug/sppo_rej_reward_in_loss": -0.13234791159629822, "debug/sppo_reject_loss": 2487.07568359375, "epoch": 6.61231884057971, "grad_norm": 101492.65501818733, "learning_rate": 7.926339642245555e-09, "logits/chosen": 1.1504125595092773, "logits/rejected": 1.5865685939788818, "logps/chosen": -116.09504699707031, "logps/rejected": -29.11785316467285, "loss": 4946.0242, "rewards/accuracies": 0.32499998807907104, "rewards/chosen": 0.012994048185646534, "rewards/margins": 0.014317525550723076, "rewards/rejected": -0.0013234791113063693, "step": 1825 }, { "debug/policy_chosen_logits": 1.4600260257720947, "debug/policy_chosen_logps": -57.27390670776367, "debug/policy_rejected_logits": 1.795204758644104, "debug/policy_rejected_logps": -11.209230422973633, "debug/reference_chosen_logps": -57.94880294799805, "debug/reference_rejected_logps": -11.025108337402344, "debug/sppo_chosen_loss": 2448.365234375, "debug/sppo_chosen_reward_in_loss": 0.6748945116996765, "debug/sppo_rej_reward_in_loss": -0.18412268161773682, "debug/sppo_reject_loss": 2482.366455078125, "epoch": 6.630434782608695, "grad_norm": 34975.16633075631, "learning_rate": 7.726205175183837e-09, "logits/chosen": 1.4600260257720947, "logits/rejected": 1.795204758644104, "logps/chosen": -57.27390670776367, "logps/rejected": -11.209230422973633, "loss": 4951.8664, "rewards/accuracies": 0.375, "rewards/chosen": 0.006748943589627743, "rewards/margins": 0.008590172044932842, "rewards/rejected": -0.0018412265926599503, "step": 1830 }, { "debug/policy_chosen_logits": 1.6779327392578125, "debug/policy_chosen_logps": -185.33444213867188, "debug/policy_rejected_logits": 2.0866103172302246, "debug/policy_rejected_logps": -39.333656311035156, "debug/reference_chosen_logps": -187.2967987060547, "debug/reference_rejected_logps": -39.620033264160156, "debug/sppo_chosen_loss": 2357.04345703125, "debug/sppo_chosen_reward_in_loss": 1.9623523950576782, "debug/sppo_rej_reward_in_loss": 0.28637897968292236, "debug/sppo_reject_loss": 2534.167724609375, "epoch": 6.648550724637682, "grad_norm": 35717.58879929515, "learning_rate": 7.528418003624632e-09, "logits/chosen": 1.6779327392578125, "logits/rejected": 2.0866103172302246, "logps/chosen": -185.33444213867188, "logps/rejected": -39.333656311035156, "loss": 4972.4289, "rewards/accuracies": 0.30000001192092896, "rewards/chosen": 0.019623523578047752, "rewards/margins": 0.016759734600782394, "rewards/rejected": 0.0028637899085879326, "step": 1835 }, { "debug/policy_chosen_logits": 1.5565046072006226, "debug/policy_chosen_logps": -9.357144355773926, "debug/policy_rejected_logits": 1.729261040687561, "debug/policy_rejected_logps": -59.85749435424805, "debug/reference_chosen_logps": -9.433355331420898, "debug/reference_rejected_logps": -60.15769577026367, "debug/sppo_chosen_loss": 2492.625, "debug/sppo_chosen_reward_in_loss": 0.07621130347251892, "debug/sppo_rej_reward_in_loss": 0.30020034313201904, "debug/sppo_reject_loss": 2535.39013671875, "epoch": 6.666666666666667, "grad_norm": 79150.22055889097, "learning_rate": 7.332989109902027e-09, "logits/chosen": 1.5565046072006226, "logits/rejected": 1.729261040687561, "logps/chosen": -9.357144355773926, "logps/rejected": -59.85749435424805, "loss": 4957.7422, "rewards/accuracies": 0.30000001192092896, "rewards/chosen": 0.0007621130207553506, "rewards/margins": -0.002239889930933714, "rewards/rejected": 0.0030020037665963173, "step": 1840 }, { "debug/policy_chosen_logits": 1.373781681060791, "debug/policy_chosen_logps": -8.933435440063477, "debug/policy_rejected_logits": 1.3789024353027344, "debug/policy_rejected_logps": -49.28669738769531, "debug/reference_chosen_logps": -8.973352432250977, "debug/reference_rejected_logps": -49.62647247314453, "debug/sppo_chosen_loss": 2496.12158203125, "debug/sppo_chosen_reward_in_loss": 0.03991802781820297, "debug/sppo_rej_reward_in_loss": 0.33977818489074707, "debug/sppo_reject_loss": 2537.918212890625, "epoch": 6.684782608695652, "grad_norm": 52914.801414132875, "learning_rate": 7.139929345404355e-09, "logits/chosen": 1.373781681060791, "logits/rejected": 1.3789024353027344, "logps/chosen": -8.933435440063477, "logps/rejected": -49.28669738769531, "loss": 4942.4, "rewards/accuracies": 0.22499999403953552, "rewards/chosen": 0.00039918028051033616, "rewards/margins": -0.002998600946739316, "rewards/rejected": 0.003397781867533922, "step": 1845 }, { "debug/policy_chosen_logits": 1.5857799053192139, "debug/policy_chosen_logps": -8.181180953979492, "debug/policy_rejected_logits": 1.7160711288452148, "debug/policy_rejected_logps": -48.743385314941406, "debug/reference_chosen_logps": -8.133310317993164, "debug/reference_rejected_logps": -48.77833938598633, "debug/sppo_chosen_loss": 2504.904296875, "debug/sppo_chosen_reward_in_loss": -0.04787103459239006, "debug/sppo_rej_reward_in_loss": 0.03495336323976517, "debug/sppo_reject_loss": 2506.09521484375, "epoch": 6.702898550724638, "grad_norm": 42818.14913238259, "learning_rate": 6.94924942997161e-09, "logits/chosen": 1.5857799053192139, "logits/rejected": 1.7160711288452148, "logps/chosen": -8.181180953979492, "logps/rejected": -48.743385314941406, "loss": 4959.0766, "rewards/accuracies": 0.32499998807907104, "rewards/chosen": -0.0004787102807313204, "rewards/margins": -0.0008282443741336465, "rewards/rejected": 0.00034953386057168245, "step": 1850 }, { "debug/policy_chosen_logits": 1.2619130611419678, "debug/policy_chosen_logps": -206.5513916015625, "debug/policy_rejected_logits": 1.305058240890503, "debug/policy_rejected_logps": -49.41690444946289, "debug/reference_chosen_logps": -207.99520874023438, "debug/reference_rejected_logps": -49.724403381347656, "debug/sppo_chosen_loss": 2385.61572265625, "debug/sppo_chosen_reward_in_loss": 1.4438073635101318, "debug/sppo_rej_reward_in_loss": 0.3074987530708313, "debug/sppo_reject_loss": 2540.949462890625, "epoch": 6.721014492753623, "grad_norm": 33172.9312159423, "learning_rate": 6.760959951300266e-09, "logits/chosen": 1.2619130611419678, "logits/rejected": 1.305058240890503, "logps/chosen": -206.5513916015625, "logps/rejected": -49.41690444946289, "loss": 4962.0422, "rewards/accuracies": 0.375, "rewards/chosen": 0.014438075013458729, "rewards/margins": 0.011363087221980095, "rewards/rejected": 0.0030749875586479902, "step": 1855 }, { "debug/policy_chosen_logits": 1.3062779903411865, "debug/policy_chosen_logps": -6.350187301635742, "debug/policy_rejected_logits": 1.4955189228057861, "debug/policy_rejected_logps": -84.61576080322266, "debug/reference_chosen_logps": -6.3128814697265625, "debug/reference_rejected_logps": -84.64005279541016, "debug/sppo_chosen_loss": 2503.76416015625, "debug/sppo_chosen_reward_in_loss": -0.03730545565485954, "debug/sppo_rej_reward_in_loss": 0.024292271584272385, "debug/sppo_reject_loss": 2502.453369140625, "epoch": 6.739130434782608, "grad_norm": 79946.04928582598, "learning_rate": 6.575071364355334e-09, "logits/chosen": 1.3062779903411865, "logits/rejected": 1.4955189228057861, "logps/chosen": -6.350187301635742, "logps/rejected": -84.61576080322266, "loss": 4979.5609, "rewards/accuracies": 0.17499999701976776, "rewards/chosen": -0.0003730545868165791, "rewards/margins": -0.0006159773329272866, "rewards/rejected": 0.0002429227315587923, "step": 1860 }, { "debug/policy_chosen_logits": 1.5117706060409546, "debug/policy_chosen_logps": -12.266745567321777, "debug/policy_rejected_logits": 1.6946170330047607, "debug/policy_rejected_logps": -31.560510635375977, "debug/reference_chosen_logps": -12.45335865020752, "debug/reference_rejected_logps": -31.73923683166504, "debug/sppo_chosen_loss": 2481.663818359375, "debug/sppo_chosen_reward_in_loss": 0.1866125762462616, "debug/sppo_rej_reward_in_loss": 0.17872247099876404, "debug/sppo_reject_loss": 2520.138671875, "epoch": 6.757246376811594, "grad_norm": 50232.091526532335, "learning_rate": 6.3915939907899005e-09, "logits/chosen": 1.5117706060409546, "logits/rejected": 1.6946170330047607, "logps/chosen": -12.266745567321777, "logps/rejected": -31.560510635375977, "loss": 4973.9902, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": 0.0018661257345229387, "rewards/margins": 7.890113920439035e-05, "rewards/rejected": 0.0017872245516628027, "step": 1865 }, { "debug/policy_chosen_logits": 1.5194110870361328, "debug/policy_chosen_logps": -6.500889778137207, "debug/policy_rejected_logits": 1.9726336002349854, "debug/policy_rejected_logps": -53.24345779418945, "debug/reference_chosen_logps": -6.544236183166504, "debug/reference_rejected_logps": -53.29387283325195, "debug/sppo_chosen_loss": 2495.716064453125, "debug/sppo_chosen_reward_in_loss": 0.04334554821252823, "debug/sppo_rej_reward_in_loss": 0.050412945449352264, "debug/sppo_reject_loss": 2505.23095703125, "epoch": 6.77536231884058, "grad_norm": 138093.30292108277, "learning_rate": 6.210538018371947e-09, "logits/chosen": 1.5194110870361328, "logits/rejected": 1.9726336002349854, "logps/chosen": -6.500889778137207, "logps/rejected": -53.24345779418945, "loss": 4979.6133, "rewards/accuracies": 0.30000001192092896, "rewards/chosen": 0.00043345545418560505, "rewards/margins": -7.067407568683848e-05, "rewards/rejected": 0.0005041294498369098, "step": 1870 }, { "debug/policy_chosen_logits": 1.3560199737548828, "debug/policy_chosen_logps": -99.5659408569336, "debug/policy_rejected_logits": 1.5991158485412598, "debug/policy_rejected_logps": -68.61759185791016, "debug/reference_chosen_logps": -101.13746643066406, "debug/reference_rejected_logps": -68.70438385009766, "debug/sppo_chosen_loss": 2393.70947265625, "debug/sppo_chosen_reward_in_loss": 1.5715234279632568, "debug/sppo_rej_reward_in_loss": 0.08679278194904327, "debug/sppo_reject_loss": 2510.05859375, "epoch": 6.793478260869565, "grad_norm": 81739.07076406304, "learning_rate": 6.031913500418706e-09, "logits/chosen": 1.3560199737548828, "logits/rejected": 1.5991158485412598, "logps/chosen": -99.5659408569336, "logps/rejected": -68.61759185791016, "loss": 4951.8965, "rewards/accuracies": 0.375, "rewards/chosen": 0.015715233981609344, "rewards/margins": 0.014847305603325367, "rewards/rejected": 0.0008679277962073684, "step": 1875 }, { "debug/policy_chosen_logits": 1.5198544263839722, "debug/policy_chosen_logps": -11.458330154418945, "debug/policy_rejected_logits": 1.7217159271240234, "debug/policy_rejected_logps": -5.870211601257324, "debug/reference_chosen_logps": -11.4380521774292, "debug/reference_rejected_logps": -5.718893527984619, "debug/sppo_chosen_loss": 2502.083251953125, "debug/sppo_chosen_reward_in_loss": -0.020276492461562157, "debug/sppo_rej_reward_in_loss": -0.15131817758083344, "debug/sppo_reject_loss": 2485.108154296875, "epoch": 6.811594202898551, "grad_norm": 15831.326179336696, "learning_rate": 5.855730355238414e-09, "logits/chosen": 1.5198544263839722, "logits/rejected": 1.7217159271240234, "logps/chosen": -11.458330154418945, "logps/rejected": -5.870211601257324, "loss": 4973.0461, "rewards/accuracies": 0.3499999940395355, "rewards/chosen": -0.00020276490249671042, "rewards/margins": 0.001310416730120778, "rewards/rejected": -0.0015131818363443017, "step": 1880 }, { "debug/policy_chosen_logits": 1.4488341808319092, "debug/policy_chosen_logps": -4.607147693634033, "debug/policy_rejected_logits": 1.5726737976074219, "debug/policy_rejected_logps": -5.2993035316467285, "debug/reference_chosen_logps": -4.665145397186279, "debug/reference_rejected_logps": -5.2398681640625, "debug/sppo_chosen_loss": 2494.266845703125, "debug/sppo_chosen_reward_in_loss": 0.05799790471792221, "debug/sppo_rej_reward_in_loss": -0.05943550541996956, "debug/sppo_reject_loss": 2494.17578125, "epoch": 6.829710144927536, "grad_norm": 90620.67852450555, "learning_rate": 5.681998365579593e-09, "logits/chosen": 1.4488341808319092, "logits/rejected": 1.5726737976074219, "logps/chosen": -4.607147693634033, "logps/rejected": -5.2993035316467285, "loss": 4947.2133, "rewards/accuracies": 0.32499998807907104, "rewards/chosen": 0.0005799789796583354, "rewards/margins": 0.001174334087409079, "rewards/rejected": -0.0005943549913354218, "step": 1885 }, { "debug/policy_chosen_logits": 1.5017623901367188, "debug/policy_chosen_logps": -105.869140625, "debug/policy_rejected_logits": 1.7325398921966553, "debug/policy_rejected_logps": -3.299717426300049, "debug/reference_chosen_logps": -107.45467376708984, "debug/reference_rejected_logps": -3.288224458694458, "debug/sppo_chosen_loss": 2430.885986328125, "debug/sppo_chosen_reward_in_loss": 1.585521936416626, "debug/sppo_rej_reward_in_loss": -0.01149294339120388, "debug/sppo_reject_loss": 2498.860595703125, "epoch": 6.8478260869565215, "grad_norm": 54094.38063781058, "learning_rate": 5.5107271780878875e-09, "logits/chosen": 1.5017623901367188, "logits/rejected": 1.7325398921966553, "logps/chosen": -105.869140625, "logps/rejected": -3.299717426300049, "loss": 4985.4461, "rewards/accuracies": 0.44999998807907104, "rewards/chosen": 0.015855219215154648, "rewards/margins": 0.015970150008797646, "rewards/rejected": -0.0001149294403148815, "step": 1890 }, { "debug/policy_chosen_logits": 1.5512073040008545, "debug/policy_chosen_logps": -58.420372009277344, "debug/policy_rejected_logits": 1.8119548559188843, "debug/policy_rejected_logps": -64.79940032958984, "debug/reference_chosen_logps": -59.187416076660156, "debug/reference_rejected_logps": -65.53862762451172, "debug/sppo_chosen_loss": 2446.90576171875, "debug/sppo_chosen_reward_in_loss": 0.7670369744300842, "debug/sppo_rej_reward_in_loss": 0.7392305731773376, "debug/sppo_reject_loss": 2595.95458984375, "epoch": 6.865942028985507, "grad_norm": 21846.49873184378, "learning_rate": 5.3419263027703665e-09, "logits/chosen": 1.5512073040008545, "logits/rejected": 1.8119548559188843, "logps/chosen": -58.420372009277344, "logps/rejected": -64.79940032958984, "loss": 4981.8445, "rewards/accuracies": 0.32499998807907104, "rewards/chosen": 0.007670368999242783, "rewards/margins": 0.0002780636423267424, "rewards/rejected": 0.007392305880784988, "step": 1895 }, { "debug/policy_chosen_logits": 1.598944067955017, "debug/policy_chosen_logps": -7.6433000564575195, "debug/policy_rejected_logits": 1.8712600469589233, "debug/policy_rejected_logps": -9.256098747253418, "debug/reference_chosen_logps": -7.718649387359619, "debug/reference_rejected_logps": -9.06689739227295, "debug/sppo_chosen_loss": 2492.526611328125, "debug/sppo_chosen_reward_in_loss": 0.07534961402416229, "debug/sppo_rej_reward_in_loss": -0.18920165300369263, "debug/sppo_reject_loss": 2481.498046875, "epoch": 6.884057971014493, "grad_norm": 42971.28975732039, "learning_rate": 5.175605112467529e-09, "logits/chosen": 1.598944067955017, "logits/rejected": 1.8712600469589233, "logps/chosen": -7.6433000564575195, "logps/rejected": -9.256098747253418, "loss": 4965.893, "rewards/accuracies": 0.32499998807907104, "rewards/chosen": 0.0007534961914643645, "rewards/margins": 0.0026455125771462917, "rewards/rejected": -0.001892016502097249, "step": 1900 }, { "epoch": 6.884057971014493, "eval_debug/policy_chosen_logits": 1.6619244813919067, "eval_debug/policy_chosen_logps": -121.27937316894531, "eval_debug/policy_rejected_logits": 1.7206820249557495, "eval_debug/policy_rejected_logps": -63.83897018432617, "eval_debug/reference_chosen_logps": -123.14806365966797, "eval_debug/reference_rejected_logps": -63.887054443359375, "eval_debug/sppo_chosen_loss": 2383.25341796875, "eval_debug/sppo_chosen_reward_in_loss": 1.8686622381210327, "eval_debug/sppo_rej_reward_in_loss": 0.048085130751132965, "eval_debug/sppo_reject_loss": 2505.0400390625, "eval_logits/chosen": 1.6619244813919067, "eval_logits/rejected": 1.7206820249557495, "eval_logps/chosen": -121.27937316894531, "eval_logps/rejected": -63.83897018432617, "eval_loss": 4953.71923828125, "eval_rewards/accuracies": 0.32894736528396606, "eval_rewards/chosen": 0.018686622381210327, "eval_rewards/margins": 0.01820576936006546, "eval_rewards/rejected": 0.00048085130401887, "eval_runtime": 28.9186, "eval_samples_per_second": 20.748, "eval_steps_per_second": 0.657, "step": 1900 }, { "debug/policy_chosen_logits": 1.3151332139968872, "debug/policy_chosen_logps": -196.119384765625, "debug/policy_rejected_logits": 1.70207941532135, "debug/policy_rejected_logps": -8.450445175170898, "debug/reference_chosen_logps": -197.6197967529297, "debug/reference_rejected_logps": -8.366586685180664, "debug/sppo_chosen_loss": 2396.025146484375, "debug/sppo_chosen_reward_in_loss": 1.5004112720489502, "debug/sppo_rej_reward_in_loss": -0.08385799080133438, "debug/sppo_reject_loss": 2491.7724609375, "epoch": 6.9021739130434785, "grad_norm": 344605.9794571025, "learning_rate": 5.011772842332812e-09, "logits/chosen": 1.3151332139968872, "logits/rejected": 1.70207941532135, "logps/chosen": -196.119384765625, "logps/rejected": -8.450445175170898, "loss": 4978.2461, "rewards/accuracies": 0.22499999403953552, "rewards/chosen": 0.015004110522568226, "rewards/margins": 0.01584269106388092, "rewards/rejected": -0.0008385797846131027, "step": 1905 }, { "debug/policy_chosen_logits": 1.4751882553100586, "debug/policy_chosen_logps": -18.409475326538086, "debug/policy_rejected_logits": 1.8412799835205078, "debug/policy_rejected_logps": -9.695950508117676, "debug/reference_chosen_logps": -18.452165603637695, "debug/reference_rejected_logps": -9.541703224182129, "debug/sppo_chosen_loss": 2497.104736328125, "debug/sppo_chosen_reward_in_loss": 0.04269067570567131, "debug/sppo_rej_reward_in_loss": -0.15424777567386627, "debug/sppo_reject_loss": 2484.8056640625, "epoch": 6.920289855072464, "grad_norm": 25008.55126084477, "learning_rate": 4.850438589319817e-09, "logits/chosen": 1.4751882553100586, "logits/rejected": 1.8412799835205078, "logps/chosen": -18.409475326538086, "logps/rejected": -9.695950508117676, "loss": 4988.3641, "rewards/accuracies": 0.32499998807907104, "rewards/chosen": 0.0004269068595021963, "rewards/margins": 0.0019693844951689243, "rewards/rejected": -0.001542477635666728, "step": 1910 }, { "debug/policy_chosen_logits": 1.1970794200897217, "debug/policy_chosen_logps": -114.7748794555664, "debug/policy_rejected_logits": 1.488607406616211, "debug/policy_rejected_logps": -3.9021098613739014, "debug/reference_chosen_logps": -115.864013671875, "debug/reference_rejected_logps": -3.795975923538208, "debug/sppo_chosen_loss": 2432.29931640625, "debug/sppo_chosen_reward_in_loss": 1.089134931564331, "debug/sppo_rej_reward_in_loss": -0.10613443702459335, "debug/sppo_reject_loss": 2489.446044921875, "epoch": 6.938405797101449, "grad_norm": 89866.73269935897, "learning_rate": 4.691611311677252e-09, "logits/chosen": 1.1970794200897217, "logits/rejected": 1.488607406616211, "logps/chosen": -114.7748794555664, "logps/rejected": -3.9021098613739014, "loss": 4968.5992, "rewards/accuracies": 0.44999998807907104, "rewards/chosen": 0.01089134905487299, "rewards/margins": 0.01195269264280796, "rewards/rejected": -0.0010613442864269018, "step": 1915 }, { "debug/policy_chosen_logits": 1.413727045059204, "debug/policy_chosen_logps": -7.005964756011963, "debug/policy_rejected_logits": 1.4010305404663086, "debug/policy_rejected_logps": -103.01905822753906, "debug/reference_chosen_logps": -6.958459377288818, "debug/reference_rejected_logps": -103.35543060302734, "debug/sppo_chosen_loss": 2504.821044921875, "debug/sppo_chosen_reward_in_loss": -0.047505300492048264, "debug/sppo_rej_reward_in_loss": 0.33637818694114685, "debug/sppo_reject_loss": 2547.64453125, "epoch": 6.956521739130435, "grad_norm": 29784.77188219884, "learning_rate": 4.5352998284514e-09, "logits/chosen": 1.413727045059204, "logits/rejected": 1.4010305404663086, "logps/chosen": -7.005964756011963, "logps/rejected": -103.01905822753906, "loss": 4988.8727, "rewards/accuracies": 0.2750000059604645, "rewards/chosen": -0.0004750529187731445, "rewards/margins": -0.003838835284113884, "rewards/rejected": 0.0033637825399637222, "step": 1920 }, { "debug/policy_chosen_logits": 1.646676778793335, "debug/policy_chosen_logps": -114.084228515625, "debug/policy_rejected_logits": 1.9740114212036133, "debug/policy_rejected_logps": -11.519428253173828, "debug/reference_chosen_logps": -114.66383361816406, "debug/reference_rejected_logps": -11.365381240844727, "debug/sppo_chosen_loss": 2456.304443359375, "debug/sppo_chosen_reward_in_loss": 0.5796173810958862, "debug/sppo_rej_reward_in_loss": -0.1540469527244568, "debug/sppo_reject_loss": 2484.80615234375, "epoch": 6.97463768115942, "grad_norm": 45353.276422883, "learning_rate": 4.381512818996564e-09, "logits/chosen": 1.646676778793335, "logits/rejected": 1.9740114212036133, "logps/chosen": -114.084228515625, "logps/rejected": -11.519428253173828, "loss": 4931.0711, "rewards/accuracies": 0.32499998807907104, "rewards/chosen": 0.005796174053102732, "rewards/margins": 0.0073366425931453705, "rewards/rejected": -0.0015404695877805352, "step": 1925 }, { "debug/policy_chosen_logits": 1.5430171489715576, "debug/policy_chosen_logps": -7.83358907699585, "debug/policy_rejected_logits": 1.9725978374481201, "debug/policy_rejected_logps": -9.83609390258789, "debug/reference_chosen_logps": -7.876543998718262, "debug/reference_rejected_logps": -9.774301528930664, "debug/sppo_chosen_loss": 2495.756103515625, "debug/sppo_chosen_reward_in_loss": 0.04295391961932182, "debug/sppo_rej_reward_in_loss": -0.061791546642780304, "debug/sppo_reject_loss": 2493.89013671875, "epoch": 6.992753623188406, "grad_norm": 76091.99754978222, "learning_rate": 4.230258822492999e-09, "logits/chosen": 1.5430171489715576, "logits/rejected": 1.9725978374481201, "logps/chosen": -7.83358907699585, "logps/rejected": -9.83609390258789, "loss": 4924.5797, "rewards/accuracies": 0.44999998807907104, "rewards/chosen": 0.00042953918455168605, "rewards/margins": 0.0010474545415490866, "rewards/rejected": -0.0006179154152050614, "step": 1930 }, { "debug/policy_chosen_logits": 1.2076952457427979, "debug/policy_chosen_logps": -75.2942886352539, "debug/policy_rejected_logits": 1.4182268381118774, "debug/policy_rejected_logps": -95.08870697021484, "debug/reference_chosen_logps": -76.09022521972656, "debug/reference_rejected_logps": -94.5392074584961, "debug/sppo_chosen_loss": 2443.27685546875, "debug/sppo_chosen_reward_in_loss": 0.7959342002868652, "debug/sppo_rej_reward_in_loss": -0.5495109558105469, "debug/sppo_reject_loss": 2453.36572265625, "epoch": 7.010869565217392, "grad_norm": 134882.31845909625, "learning_rate": 4.08154623747291e-09, "logits/chosen": 1.2076952457427979, "logits/rejected": 1.4182268381118774, "logps/chosen": -75.2942886352539, "logps/rejected": -95.08870697021484, "loss": 4946.7008, "rewards/accuracies": 0.32499998807907104, "rewards/chosen": 0.007959342561662197, "rewards/margins": 0.01345445029437542, "rewards/rejected": -0.005495109595358372, "step": 1935 }, { "debug/policy_chosen_logits": 1.5721310377120972, "debug/policy_chosen_logps": -8.771934509277344, "debug/policy_rejected_logits": 1.6766021251678467, "debug/policy_rejected_logps": -11.332067489624023, "debug/reference_chosen_logps": -8.80463981628418, "debug/reference_rejected_logps": -11.33636474609375, "debug/sppo_chosen_loss": 2496.836669921875, "debug/sppo_chosen_reward_in_loss": 0.03270546346902847, "debug/sppo_rej_reward_in_loss": 0.0042982459999620914, "debug/sppo_reject_loss": 2500.466796875, "epoch": 7.028985507246377, "grad_norm": 52758.98029037163, "learning_rate": 3.935383321353974e-09, "logits/chosen": 1.5721310377120972, "logits/rejected": 1.6766021251678467, "logps/chosen": -8.771934509277344, "logps/rejected": -11.332067489624023, "loss": 4983.7531, "rewards/accuracies": 0.20000000298023224, "rewards/chosen": 0.00032705458579584956, "rewards/margins": 0.0002840720990207046, "rewards/rejected": 4.29824685852509e-05, "step": 1940 }, { "debug/policy_chosen_logits": 1.3476965427398682, "debug/policy_chosen_logps": -71.43219757080078, "debug/policy_rejected_logits": 1.754065752029419, "debug/policy_rejected_logps": -24.273134231567383, "debug/reference_chosen_logps": -72.51323699951172, "debug/reference_rejected_logps": -24.24868392944336, "debug/sppo_chosen_loss": 2416.68994140625, "debug/sppo_chosen_reward_in_loss": 1.0810452699661255, "debug/sppo_rej_reward_in_loss": -0.024448633193969727, "debug/sppo_reject_loss": 2497.874755859375, "epoch": 7.047101449275362, "grad_norm": 27855.018852172558, "learning_rate": 3.79177818998096e-09, "logits/chosen": 1.3476965427398682, "logits/rejected": 1.754065752029419, "logps/chosen": -71.43219757080078, "logps/rejected": -24.273134231567383, "loss": 4945.7527, "rewards/accuracies": 0.30000001192092896, "rewards/chosen": 0.010810451582074165, "rewards/margins": 0.011054937727749348, "rewards/rejected": -0.0002444863202981651, "step": 1945 }, { "debug/policy_chosen_logits": 1.3279770612716675, "debug/policy_chosen_logps": -3.2820205688476562, "debug/policy_rejected_logits": 1.6869817972183228, "debug/policy_rejected_logps": -23.908376693725586, "debug/reference_chosen_logps": -3.290344715118408, "debug/reference_rejected_logps": -23.97683334350586, "debug/sppo_chosen_loss": 2499.235107421875, "debug/sppo_chosen_reward_in_loss": 0.008324271067976952, "debug/sppo_rej_reward_in_loss": 0.0684545487165451, "debug/sppo_reject_loss": 2507.13427734375, "epoch": 7.065217391304348, "grad_norm": 40259.906785748564, "learning_rate": 3.6507388171750085e-09, "logits/chosen": 1.3279770612716675, "logits/rejected": 1.6869817972183228, "logps/chosen": -3.2820205688476562, "logps/rejected": -23.908376693725586, "loss": 4964.6969, "rewards/accuracies": 0.25, "rewards/chosen": 8.324269583681598e-05, "rewards/margins": -0.0006013027741573751, "rewards/rejected": 0.0006845454918220639, "step": 1950 }, { "debug/policy_chosen_logits": 1.5937316417694092, "debug/policy_chosen_logps": -6.354199409484863, "debug/policy_rejected_logits": 1.5936843156814575, "debug/policy_rejected_logps": -71.20558166503906, "debug/reference_chosen_logps": -6.436964988708496, "debug/reference_rejected_logps": -71.70610046386719, "debug/sppo_chosen_loss": 2491.802734375, "debug/sppo_chosen_reward_in_loss": 0.08276697248220444, "debug/sppo_rej_reward_in_loss": 0.500520646572113, "debug/sppo_reject_loss": 2559.892333984375, "epoch": 7.083333333333333, "grad_norm": 18405.851869641454, "learning_rate": 3.512273034290897e-09, "logits/chosen": 1.5937316417694092, "logits/rejected": 1.5936843156814575, "logps/chosen": -6.354199409484863, "logps/rejected": -71.20558166503906, "loss": 4963.8676, "rewards/accuracies": 0.30000001192092896, "rewards/chosen": 0.0008276697481051087, "rewards/margins": -0.004177537281066179, "rewards/rejected": 0.005005206912755966, "step": 1955 }, { "debug/policy_chosen_logits": 1.562395691871643, "debug/policy_chosen_logps": -92.8037109375, "debug/policy_rejected_logits": 1.645330786705017, "debug/policy_rejected_logps": -5.119664192199707, "debug/reference_chosen_logps": -94.31362915039062, "debug/reference_rejected_logps": -5.15993070602417, "debug/sppo_chosen_loss": 2430.552001953125, "debug/sppo_chosen_reward_in_loss": 1.50991952419281, "debug/sppo_rej_reward_in_loss": 0.04026692360639572, "debug/sppo_reject_loss": 2504.11376953125, "epoch": 7.101449275362318, "grad_norm": 17219.269739131698, "learning_rate": 3.376388529782215e-09, "logits/chosen": 1.562395691871643, "logits/rejected": 1.645330786705017, "logps/chosen": -92.8037109375, "logps/rejected": -5.119664192199707, "loss": 4975.5121, "rewards/accuracies": 0.375, "rewards/chosen": 0.015099194832146168, "rewards/margins": 0.014696525409817696, "rewards/rejected": 0.0004026692477054894, "step": 1960 }, { "debug/policy_chosen_logits": 1.4849071502685547, "debug/policy_chosen_logps": -7.2184858322143555, "debug/policy_rejected_logits": 1.9326832294464111, "debug/policy_rejected_logps": -8.318476676940918, "debug/reference_chosen_logps": -7.292272090911865, "debug/reference_rejected_logps": -8.146242141723633, "debug/sppo_chosen_loss": 2492.69580078125, "debug/sppo_chosen_reward_in_loss": 0.07378659397363663, "debug/sppo_rej_reward_in_loss": -0.17223384976387024, "debug/sppo_reject_loss": 2483.416748046875, "epoch": 7.119565217391305, "grad_norm": 19707.672949436143, "learning_rate": 3.243092848774437e-09, "logits/chosen": 1.4849071502685547, "logits/rejected": 1.9326832294464111, "logps/chosen": -7.2184858322143555, "logps/rejected": -8.318476676940918, "loss": 4955.3687, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": 0.000737865804694593, "rewards/margins": 0.002460204064846039, "rewards/rejected": -0.0017223382601514459, "step": 1965 }, { "debug/policy_chosen_logits": 1.0677505731582642, "debug/policy_chosen_logps": -68.87767791748047, "debug/policy_rejected_logits": 1.5847123861312866, "debug/policy_rejected_logps": -6.216179847717285, "debug/reference_chosen_logps": -69.42707824707031, "debug/reference_rejected_logps": -6.121747016906738, "debug/sppo_chosen_loss": 2455.88134765625, "debug/sppo_chosen_reward_in_loss": 0.5493996739387512, "debug/sppo_rej_reward_in_loss": -0.09443233907222748, "debug/sppo_reject_loss": 2490.632080078125, "epoch": 7.13768115942029, "grad_norm": 43463.093853412334, "learning_rate": 3.1123933926459844e-09, "logits/chosen": 1.0677505731582642, "logits/rejected": 1.5847123861312866, "logps/chosen": -68.87767791748047, "logps/rejected": -6.216179847717285, "loss": 4969.4148, "rewards/accuracies": 0.2750000059604645, "rewards/chosen": 0.005493996199220419, "rewards/margins": 0.006438320968300104, "rewards/rejected": -0.0009443233720958233, "step": 1970 }, { "debug/policy_chosen_logits": 1.6951192617416382, "debug/policy_chosen_logps": -28.68180274963379, "debug/policy_rejected_logits": 1.7396814823150635, "debug/policy_rejected_logps": -7.060442924499512, "debug/reference_chosen_logps": -28.961029052734375, "debug/reference_rejected_logps": -6.966675758361816, "debug/sppo_chosen_loss": 2474.169921875, "debug/sppo_chosen_reward_in_loss": 0.279226154088974, "debug/sppo_rej_reward_in_loss": -0.09376756846904755, "debug/sppo_reject_loss": 2491.10302734375, "epoch": 7.155797101449275, "grad_norm": 25830.26190072128, "learning_rate": 2.9842974186172264e-09, "logits/chosen": 1.6951192617416382, "logits/rejected": 1.7396814823150635, "logps/chosen": -28.68180274963379, "logps/rejected": -7.060442924499512, "loss": 4985.9414, "rewards/accuracies": 0.32499998807907104, "rewards/chosen": 0.0027922613080590963, "rewards/margins": 0.003729937132447958, "rewards/rejected": -0.0009376758825965226, "step": 1975 }, { "debug/policy_chosen_logits": 1.3979766368865967, "debug/policy_chosen_logps": -5.364877223968506, "debug/policy_rejected_logits": 1.73822021484375, "debug/policy_rejected_logps": -7.082423210144043, "debug/reference_chosen_logps": -5.378409385681152, "debug/reference_rejected_logps": -6.849114894866943, "debug/sppo_chosen_loss": 2498.72412109375, "debug/sppo_chosen_reward_in_loss": 0.013532241806387901, "debug/sppo_rej_reward_in_loss": -0.2333078384399414, "debug/sppo_reject_loss": 2477.42333984375, "epoch": 7.173913043478261, "grad_norm": 32499.430699635028, "learning_rate": 2.8588120393475745e-09, "logits/chosen": 1.3979766368865967, "logits/rejected": 1.73822021484375, "logps/chosen": -5.364877223968506, "logps/rejected": -7.082423210144043, "loss": 4968.6375, "rewards/accuracies": 0.32499998807907104, "rewards/chosen": 0.00013532240700442344, "rewards/margins": 0.0024684006348252296, "rewards/rejected": -0.0023330783005803823, "step": 1980 }, { "debug/policy_chosen_logits": 1.617895483970642, "debug/policy_chosen_logps": -10.363906860351562, "debug/policy_rejected_logits": 1.6561801433563232, "debug/policy_rejected_logps": -79.88899993896484, "debug/reference_chosen_logps": -10.41511058807373, "debug/reference_rejected_logps": -80.3069839477539, "debug/sppo_chosen_loss": 2494.94873046875, "debug/sppo_chosen_reward_in_loss": 0.051203448325395584, "debug/sppo_rej_reward_in_loss": 0.41797810792922974, "debug/sppo_reject_loss": 2548.5205078125, "epoch": 7.192028985507246, "grad_norm": 112641.55082382417, "learning_rate": 2.7359442225404815e-09, "logits/chosen": 1.617895483970642, "logits/rejected": 1.6561801433563232, "logps/chosen": -10.363906860351562, "logps/rejected": -79.88899993896484, "loss": 4962.1551, "rewards/accuracies": 0.44999998807907104, "rewards/chosen": 0.000512034457642585, "rewards/margins": -0.00366774620488286, "rewards/rejected": 0.004179780837148428, "step": 1985 }, { "debug/policy_chosen_logits": 1.1359546184539795, "debug/policy_chosen_logps": -31.22540283203125, "debug/policy_rejected_logits": 1.722290277481079, "debug/policy_rejected_logps": -2.322068214416504, "debug/reference_chosen_logps": -31.72493553161621, "debug/reference_rejected_logps": -2.3256211280822754, "debug/sppo_chosen_loss": 2456.843017578125, "debug/sppo_chosen_reward_in_loss": 0.49953514337539673, "debug/sppo_rej_reward_in_loss": 0.0035528182052075863, "debug/sppo_reject_loss": 2500.45458984375, "epoch": 7.2101449275362315, "grad_norm": 17569.68750796473, "learning_rate": 2.615700790556569e-09, "logits/chosen": 1.1359546184539795, "logits/rejected": 1.722290277481079, "logps/chosen": -31.22540283203125, "logps/rejected": -2.322068214416504, "loss": 4938.5555, "rewards/accuracies": 0.30000001192092896, "rewards/chosen": 0.004995351191610098, "rewards/margins": 0.004959822632372379, "rewards/rejected": 3.552816269802861e-05, "step": 1990 }, { "debug/policy_chosen_logits": 1.5556493997573853, "debug/policy_chosen_logps": -7.839386940002441, "debug/policy_rejected_logits": 1.7251060009002686, "debug/policy_rejected_logps": -41.27373504638672, "debug/reference_chosen_logps": -7.837969779968262, "debug/reference_rejected_logps": -41.54066848754883, "debug/sppo_chosen_loss": 2500.193359375, "debug/sppo_chosen_reward_in_loss": -0.0014158368576318026, "debug/sppo_rej_reward_in_loss": 0.2669321894645691, "debug/sppo_reject_loss": 2528.64208984375, "epoch": 7.228260869565218, "grad_norm": 25680.07227051324, "learning_rate": 2.498088420034855e-09, "logits/chosen": 1.5556493997573853, "logits/rejected": 1.7251060009002686, "logps/chosen": -7.839386940002441, "logps/rejected": -41.27373504638672, "loss": 4982.9617, "rewards/accuracies": 0.17499999701976776, "rewards/chosen": -1.4158390513330232e-05, "rewards/margins": -0.002683480503037572, "rewards/rejected": 0.002669321605935693, "step": 1995 }, { "debug/policy_chosen_logits": 1.378981351852417, "debug/policy_chosen_logps": -67.80242919921875, "debug/policy_rejected_logits": 1.6534407138824463, "debug/policy_rejected_logps": -2.966207981109619, "debug/reference_chosen_logps": -68.45893096923828, "debug/reference_rejected_logps": -2.8858752250671387, "debug/sppo_chosen_loss": 2452.061279296875, "debug/sppo_chosen_reward_in_loss": 0.6565004587173462, "debug/sppo_rej_reward_in_loss": -0.08033265918493271, "debug/sppo_reject_loss": 2492.261962890625, "epoch": 7.246376811594203, "grad_norm": 22185.151403178796, "learning_rate": 2.3831136415219554e-09, "logits/chosen": 1.378981351852417, "logits/rejected": 1.6534407138824463, "logps/chosen": -67.80242919921875, "logps/rejected": -2.966207981109619, "loss": 4950.5336, "rewards/accuracies": 0.30000001192092896, "rewards/chosen": 0.006565003655850887, "rewards/margins": 0.0073683299124240875, "rewards/rejected": -0.0008033266058191657, "step": 2000 }, { "epoch": 7.246376811594203, "eval_debug/policy_chosen_logits": 1.6611069440841675, "eval_debug/policy_chosen_logps": -121.03800964355469, "eval_debug/policy_rejected_logits": 1.7193275690078735, "eval_debug/policy_rejected_logps": -63.848331451416016, "eval_debug/reference_chosen_logps": -123.14806365966797, "eval_debug/reference_rejected_logps": -63.887054443359375, "eval_debug/sppo_chosen_loss": 2382.793701171875, "eval_debug/sppo_chosen_reward_in_loss": 2.110050916671753, "eval_debug/sppo_rej_reward_in_loss": 0.03871466591954231, "eval_debug/sppo_reject_loss": 2504.2783203125, "eval_logits/chosen": 1.6611069440841675, "eval_logits/rejected": 1.7193275690078735, "eval_logps/chosen": -121.03800964355469, "eval_logps/rejected": -63.848331451416016, "eval_loss": 4958.17333984375, "eval_rewards/accuracies": 0.31578946113586426, "eval_rewards/chosen": 0.02110050618648529, "eval_rewards/margins": 0.020713360980153084, "eval_rewards/rejected": 0.00038714674883522093, "eval_runtime": 28.8651, "eval_samples_per_second": 20.786, "eval_steps_per_second": 0.658, "step": 2000 }, { "debug/policy_chosen_logits": 1.334449052810669, "debug/policy_chosen_logps": -47.005836486816406, "debug/policy_rejected_logits": 1.6832910776138306, "debug/policy_rejected_logps": -84.16943359375, "debug/reference_chosen_logps": -47.29503631591797, "debug/reference_rejected_logps": -84.16597747802734, "debug/sppo_chosen_loss": 2475.026611328125, "debug/sppo_chosen_reward_in_loss": 0.28920143842697144, "debug/sppo_rej_reward_in_loss": -0.00345133850350976, "debug/sppo_reject_loss": 2500.001220703125, "epoch": 7.2644927536231885, "grad_norm": 28197.55258917959, "learning_rate": 2.2707828391095307e-09, "logits/chosen": 1.334449052810669, "logits/rejected": 1.6832910776138306, "logps/chosen": -47.005836486816406, "logps/rejected": -84.16943359375, "loss": 4952.0859, "rewards/accuracies": 0.32499998807907104, "rewards/chosen": 0.002892014104872942, "rewards/margins": 0.0029265275225043297, "rewards/rejected": -3.4513323043938726e-05, "step": 2005 }, { "debug/policy_chosen_logits": 1.3075889348983765, "debug/policy_chosen_logps": -5.680796146392822, "debug/policy_rejected_logits": 1.6421716213226318, "debug/policy_rejected_logps": -16.327085494995117, "debug/reference_chosen_logps": -5.665097236633301, "debug/reference_rejected_logps": -16.296802520751953, "debug/sppo_chosen_loss": 2501.62890625, "debug/sppo_chosen_reward_in_loss": -0.015698790550231934, "debug/sppo_rej_reward_in_loss": -0.030284658074378967, "debug/sppo_reject_loss": 2497.1787109375, "epoch": 7.282608695652174, "grad_norm": 80480.02839006041, "learning_rate": 2.1611022500797495e-09, "logits/chosen": 1.3075889348983765, "logits/rejected": 1.6421716213226318, "logps/chosen": -5.680796146392822, "logps/rejected": -16.327085494995117, "loss": 4923.6852, "rewards/accuracies": 0.32499998807907104, "rewards/chosen": -0.00015698788047302514, "rewards/margins": 0.000145858692121692, "rewards/rejected": -0.00030284651438705623, "step": 2010 }, { "debug/policy_chosen_logits": 1.3101032972335815, "debug/policy_chosen_logps": -9.938323020935059, "debug/policy_rejected_logits": 1.5853919982910156, "debug/policy_rejected_logps": -7.298520088195801, "debug/reference_chosen_logps": -10.036648750305176, "debug/reference_rejected_logps": -7.27008056640625, "debug/sppo_chosen_loss": 2490.873779296875, "debug/sppo_chosen_reward_in_loss": 0.09832565486431122, "debug/sppo_rej_reward_in_loss": -0.028439437970519066, "debug/sppo_reject_loss": 2497.19384765625, "epoch": 7.300724637681159, "grad_norm": 15937.533105456187, "learning_rate": 2.0540779645590146e-09, "logits/chosen": 1.3101032972335815, "logits/rejected": 1.5853919982910156, "logps/chosen": -9.938323020935059, "logps/rejected": -7.298520088195801, "loss": 4993.9605, "rewards/accuracies": 0.32499998807907104, "rewards/chosen": 0.0009832566138356924, "rewards/margins": 0.0012676508631557226, "rewards/rejected": -0.0002843943948391825, "step": 2015 }, { "debug/policy_chosen_logits": 1.6539599895477295, "debug/policy_chosen_logps": -3.874885082244873, "debug/policy_rejected_logits": 1.9012521505355835, "debug/policy_rejected_logps": -53.810874938964844, "debug/reference_chosen_logps": -3.9204821586608887, "debug/reference_rejected_logps": -54.2545280456543, "debug/sppo_chosen_loss": 2495.48876953125, "debug/sppo_chosen_reward_in_loss": 0.045596878975629807, "debug/sppo_rej_reward_in_loss": 0.44365444779396057, "debug/sppo_reject_loss": 2551.696533203125, "epoch": 7.318840579710145, "grad_norm": 219081.70523670744, "learning_rate": 1.9497159251797514e-09, "logits/chosen": 1.6539599895477295, "logits/rejected": 1.9012521505355835, "logps/chosen": -3.874885082244873, "logps/rejected": -53.810874938964844, "loss": 4983.9703, "rewards/accuracies": 0.25, "rewards/chosen": 0.0004559687804430723, "rewards/margins": -0.003980574663728476, "rewards/rejected": 0.004436544142663479, "step": 2020 }, { "debug/policy_chosen_logits": 1.3193690776824951, "debug/policy_chosen_logps": -106.75616455078125, "debug/policy_rejected_logits": 1.5243886709213257, "debug/policy_rejected_logps": -66.73078155517578, "debug/reference_chosen_logps": -108.31185150146484, "debug/reference_rejected_logps": -67.40676879882812, "debug/sppo_chosen_loss": 2398.130126953125, "debug/sppo_chosen_reward_in_loss": 1.5556986331939697, "debug/sppo_rej_reward_in_loss": 0.675986111164093, "debug/sppo_reject_loss": 2588.185302734375, "epoch": 7.336956521739131, "grad_norm": 18212.093611129352, "learning_rate": 1.8480219267504537e-09, "logits/chosen": 1.3193690776824951, "logits/rejected": 1.5243886709213257, "logps/chosen": -106.75616455078125, "logps/rejected": -66.73078155517578, "loss": 4975.1203, "rewards/accuracies": 0.32499998807907104, "rewards/chosen": 0.015556985512375832, "rewards/margins": 0.00879712589085102, "rewards/rejected": 0.006759860552847385, "step": 2025 }, { "debug/policy_chosen_logits": 1.5854101181030273, "debug/policy_chosen_logps": -57.868019104003906, "debug/policy_rejected_logits": 1.5196328163146973, "debug/policy_rejected_logps": -165.49179077148438, "debug/reference_chosen_logps": -58.72990036010742, "debug/reference_rejected_logps": -166.75540161132812, "debug/sppo_chosen_loss": 2437.488525390625, "debug/sppo_chosen_reward_in_loss": 0.8618768453598022, "debug/sppo_rej_reward_in_loss": 1.263615369796753, "debug/sppo_reject_loss": 2653.237548828125, "epoch": 7.355072463768116, "grad_norm": 20434.53838869497, "learning_rate": 1.7490016159339482e-09, "logits/chosen": 1.5854101181030273, "logits/rejected": 1.5196328163146973, "logps/chosen": -57.868019104003906, "logps/rejected": -165.49179077148438, "loss": 5006.3223, "rewards/accuracies": 0.30000001192092896, "rewards/chosen": 0.00861876830458641, "rewards/margins": -0.004017384722828865, "rewards/rejected": 0.012636152096092701, "step": 2030 }, { "debug/policy_chosen_logits": 1.2507524490356445, "debug/policy_chosen_logps": -4.431945323944092, "debug/policy_rejected_logits": 1.8344917297363281, "debug/policy_rejected_logps": -3.940941333770752, "debug/reference_chosen_logps": -4.469573020935059, "debug/reference_rejected_logps": -3.8946938514709473, "debug/sppo_chosen_loss": 2496.252685546875, "debug/sppo_chosen_reward_in_loss": 0.03762750327587128, "debug/sppo_rej_reward_in_loss": -0.046247877180576324, "debug/sppo_reject_loss": 2495.398193359375, "epoch": 7.3731884057971016, "grad_norm": 148865.13044599458, "learning_rate": 1.6526604909338049e-09, "logits/chosen": 1.2507524490356445, "logits/rejected": 1.8344917297363281, "logps/chosen": -4.431945323944092, "logps/rejected": -3.940941333770752, "loss": 4979.9312, "rewards/accuracies": 0.3499999940395355, "rewards/chosen": 0.00037627501296810806, "rewards/margins": 0.000838753767311573, "rewards/rejected": -0.0004624787252396345, "step": 2035 }, { "debug/policy_chosen_logits": 1.1575661897659302, "debug/policy_chosen_logps": -8.020904541015625, "debug/policy_rejected_logits": 1.633065938949585, "debug/policy_rejected_logps": -7.339245796203613, "debug/reference_chosen_logps": -8.049311637878418, "debug/reference_rejected_logps": -7.240313529968262, "debug/sppo_chosen_loss": 2497.212890625, "debug/sppo_chosen_reward_in_loss": 0.028407037258148193, "debug/sppo_rej_reward_in_loss": -0.09893138706684113, "debug/sppo_reject_loss": 2490.25146484375, "epoch": 7.391304347826087, "grad_norm": 55309.24171919323, "learning_rate": 1.5590039011890987e-09, "logits/chosen": 1.1575661897659302, "logits/rejected": 1.633065938949585, "logps/chosen": -8.020904541015625, "logps/rejected": -7.339245796203613, "loss": 4967.1578, "rewards/accuracies": 0.32499998807907104, "rewards/chosen": 0.00028407032368704677, "rewards/margins": 0.0012733842013403773, "rewards/rejected": -0.0009893138194456697, "step": 2040 }, { "debug/policy_chosen_logits": 1.6919845342636108, "debug/policy_chosen_logps": -193.3542938232422, "debug/policy_rejected_logits": 1.7625573873519897, "debug/policy_rejected_logps": -72.6007308959961, "debug/reference_chosen_logps": -195.5371551513672, "debug/reference_rejected_logps": -71.1019058227539, "debug/sppo_chosen_loss": 2359.487060546875, "debug/sppo_chosen_reward_in_loss": 2.1828505992889404, "debug/sppo_rej_reward_in_loss": -1.4988242387771606, "debug/sppo_reject_loss": 2429.931640625, "epoch": 7.409420289855072, "grad_norm": 58376.62312425554, "learning_rate": 1.4680370470773251e-09, "logits/chosen": 1.6919845342636108, "logits/rejected": 1.7625573873519897, "logps/chosen": -193.3542938232422, "logps/rejected": -72.6007308959961, "loss": 4920.8977, "rewards/accuracies": 0.17499999701976776, "rewards/chosen": 0.021828506141901016, "rewards/margins": 0.03681674599647522, "rewards/rejected": -0.014988240785896778, "step": 2045 }, { "debug/policy_chosen_logits": 1.5771560668945312, "debug/policy_chosen_logps": -43.043434143066406, "debug/policy_rejected_logits": 1.6918752193450928, "debug/policy_rejected_logps": -42.778587341308594, "debug/reference_chosen_logps": -43.37819290161133, "debug/reference_rejected_logps": -42.858863830566406, "debug/sppo_chosen_loss": 2470.401611328125, "debug/sppo_chosen_reward_in_loss": 0.33475473523139954, "debug/sppo_rej_reward_in_loss": 0.08027082681655884, "debug/sppo_reject_loss": 2508.87939453125, "epoch": 7.427536231884058, "grad_norm": 50301.2544034016, "learning_rate": 1.3797649796257027e-09, "logits/chosen": 1.5771560668945312, "logits/rejected": 1.6918752193450928, "logps/chosen": -43.043434143066406, "logps/rejected": -42.778587341308594, "loss": 4967.6637, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": 0.003347547724843025, "rewards/margins": 0.0025448394007980824, "rewards/rejected": 0.0008027080330066383, "step": 2050 }, { "debug/policy_chosen_logits": 1.4068259000778198, "debug/policy_chosen_logps": -85.03541564941406, "debug/policy_rejected_logits": 1.4162929058074951, "debug/policy_rejected_logps": -4.39855432510376, "debug/reference_chosen_logps": -85.24625396728516, "debug/reference_rejected_logps": -4.37467622756958, "debug/sppo_chosen_loss": 2481.27587890625, "debug/sppo_chosen_reward_in_loss": 0.21083812415599823, "debug/sppo_rej_reward_in_loss": -0.023878157138824463, "debug/sppo_reject_loss": 2497.63525390625, "epoch": 7.445652173913043, "grad_norm": 42489.64460782813, "learning_rate": 1.2941926002306536e-09, "logits/chosen": 1.4068259000778198, "logits/rejected": 1.4162929058074951, "logps/chosen": -85.03541564941406, "logps/rejected": -4.39855432510376, "loss": 4955.0344, "rewards/accuracies": 0.30000001192092896, "rewards/chosen": 0.0021083815954625607, "rewards/margins": 0.002347162924706936, "rewards/rejected": -0.00023878156207501888, "step": 2055 }, { "debug/policy_chosen_logits": 1.6689949035644531, "debug/policy_chosen_logps": -86.88020324707031, "debug/policy_rejected_logits": 1.9532970190048218, "debug/policy_rejected_logps": -66.47990417480469, "debug/reference_chosen_logps": -87.58280944824219, "debug/reference_rejected_logps": -67.03447723388672, "debug/sppo_chosen_loss": 2440.328857421875, "debug/sppo_chosen_reward_in_loss": 0.7026048302650452, "debug/sppo_rej_reward_in_loss": 0.5545698404312134, "debug/sppo_reject_loss": 2566.56689453125, "epoch": 7.463768115942029, "grad_norm": 99239.66788334773, "learning_rate": 1.2113246603856653e-09, "logits/chosen": 1.6689949035644531, "logits/rejected": 1.9532970190048218, "logps/chosen": -86.88020324707031, "logps/rejected": -66.47990417480469, "loss": 4963.5922, "rewards/accuracies": 0.375, "rewards/chosen": 0.007026048842817545, "rewards/margins": 0.0014803505036979914, "rewards/rejected": 0.00554569810628891, "step": 2060 }, { "debug/policy_chosen_logits": 1.5571136474609375, "debug/policy_chosen_logps": -101.85566711425781, "debug/policy_rejected_logits": 1.8737516403198242, "debug/policy_rejected_logps": -7.925866603851318, "debug/reference_chosen_logps": -102.98133850097656, "debug/reference_rejected_logps": -7.96884822845459, "debug/sppo_chosen_loss": 2412.94140625, "debug/sppo_chosen_reward_in_loss": 1.1256725788116455, "debug/sppo_rej_reward_in_loss": 0.04298214986920357, "debug/sppo_reject_loss": 2504.38330078125, "epoch": 7.481884057971015, "grad_norm": 53368.37163939332, "learning_rate": 1.1311657614174907e-09, "logits/chosen": 1.5571136474609375, "logits/rejected": 1.8737516403198242, "logps/chosen": -101.85566711425781, "logps/rejected": -7.925866603851318, "loss": 4952.066, "rewards/accuracies": 0.32499998807907104, "rewards/chosen": 0.01125672459602356, "rewards/margins": 0.010826903395354748, "rewards/rejected": 0.00042982149170711637, "step": 2065 }, { "debug/policy_chosen_logits": 1.4775936603546143, "debug/policy_chosen_logps": -4.901671409606934, "debug/policy_rejected_logits": 1.8943735361099243, "debug/policy_rejected_logps": -8.222626686096191, "debug/reference_chosen_logps": -4.927979469299316, "debug/reference_rejected_logps": -8.165047645568848, "debug/sppo_chosen_loss": 2497.412841796875, "debug/sppo_chosen_reward_in_loss": 0.02630743384361267, "debug/sppo_rej_reward_in_loss": -0.057579897344112396, "debug/sppo_reject_loss": 2494.337158203125, "epoch": 7.5, "grad_norm": 39960.50407559724, "learning_rate": 1.0537203542306083e-09, "logits/chosen": 1.4775936603546143, "logits/rejected": 1.8943735361099243, "logps/chosen": -4.901671409606934, "logps/rejected": -8.222626686096191, "loss": 4957.1836, "rewards/accuracies": 0.30000001192092896, "rewards/chosen": 0.0002630743256304413, "rewards/margins": 0.0008388733258470893, "rewards/rejected": -0.0005757989711128175, "step": 2070 }, { "debug/policy_chosen_logits": 1.5643664598464966, "debug/policy_chosen_logps": -17.637718200683594, "debug/policy_rejected_logits": 1.8542522192001343, "debug/policy_rejected_logps": -3.963315486907959, "debug/reference_chosen_logps": -17.670480728149414, "debug/reference_rejected_logps": -3.907533645629883, "debug/sppo_chosen_loss": 2496.928955078125, "debug/sppo_chosen_reward_in_loss": 0.032761022448539734, "debug/sppo_rej_reward_in_loss": -0.05578208714723587, "debug/sppo_reject_loss": 2494.459228515625, "epoch": 7.518115942028985, "grad_norm": 28345.506490378466, "learning_rate": 9.78992739060114e-10, "logits/chosen": 1.5643664598464966, "logits/rejected": 1.8542522192001343, "logps/chosen": -17.637718200683594, "logps/rejected": -3.963315486907959, "loss": 4935.1836, "rewards/accuracies": 0.42500001192092896, "rewards/chosen": 0.0003276102361269295, "rewards/margins": 0.0008854310726746917, "rewards/rejected": -0.0005578208947554231, "step": 2075 }, { "debug/policy_chosen_logits": 1.3403733968734741, "debug/policy_chosen_logps": -5.345233917236328, "debug/policy_rejected_logits": 1.2960846424102783, "debug/policy_rejected_logps": -51.7831916809082, "debug/reference_chosen_logps": -5.416479587554932, "debug/reference_rejected_logps": -52.12139129638672, "debug/sppo_chosen_loss": 2493.102783203125, "debug/sppo_chosen_reward_in_loss": 0.07124529033899307, "debug/sppo_rej_reward_in_loss": 0.3382047712802887, "debug/sppo_reject_loss": 2541.473388671875, "epoch": 7.536231884057971, "grad_norm": 94955.37808876166, "learning_rate": 9.069870652329281e-10, "logits/chosen": 1.3403733968734741, "logits/rejected": 1.2960846424102783, "logps/chosen": -5.345233917236328, "logps/rejected": -51.7831916809082, "loss": 4967.0781, "rewards/accuracies": 0.375, "rewards/chosen": 0.0007124529802240431, "rewards/margins": -0.0026695942506194115, "rewards/rejected": 0.0033820471726357937, "step": 2080 }, { "debug/policy_chosen_logits": 1.144832730293274, "debug/policy_chosen_logps": -160.45376586914062, "debug/policy_rejected_logits": 1.7088559865951538, "debug/policy_rejected_logps": -55.5744743347168, "debug/reference_chosen_logps": -161.3338165283203, "debug/reference_rejected_logps": -55.968605041503906, "debug/sppo_chosen_loss": 2427.180908203125, "debug/sppo_chosen_reward_in_loss": 0.8800380825996399, "debug/sppo_rej_reward_in_loss": 0.3941327929496765, "debug/sppo_reject_loss": 2547.92529296875, "epoch": 7.554347826086957, "grad_norm": 16569.004287662552, "learning_rate": 8.377073309374149e-10, "logits/chosen": 1.144832730293274, "logits/rejected": 1.7088559865951538, "logps/chosen": -160.45376586914062, "logps/rejected": -55.5744743347168, "loss": 4969.9805, "rewards/accuracies": 0.25, "rewards/chosen": 0.008800379931926727, "rewards/margins": 0.00485905259847641, "rewards/rejected": 0.003941327333450317, "step": 2085 }, { "debug/policy_chosen_logits": 1.1751086711883545, "debug/policy_chosen_logps": -8.354531288146973, "debug/policy_rejected_logits": 1.748895287513733, "debug/policy_rejected_logps": -4.76932430267334, "debug/reference_chosen_logps": -8.452306747436523, "debug/reference_rejected_logps": -4.780450820922852, "debug/sppo_chosen_loss": 2490.37109375, "debug/sppo_chosen_reward_in_loss": 0.09777592122554779, "debug/sppo_rej_reward_in_loss": 0.01112664956599474, "debug/sppo_reject_loss": 2501.253173828125, "epoch": 7.572463768115942, "grad_norm": 76191.27187078352, "learning_rate": 7.711573830013584e-10, "logits/chosen": 1.1751086711883545, "logits/rejected": 1.748895287513733, "logps/chosen": -8.354531288146973, "logps/rejected": -4.76932430267334, "loss": 4984.5789, "rewards/accuracies": 0.2750000059604645, "rewards/chosen": 0.0009777592495083809, "rewards/margins": 0.0008664926281198859, "rewards/rejected": 0.00011126654862891883, "step": 2090 }, { "debug/policy_chosen_logits": 1.0294712781906128, "debug/policy_chosen_logps": -222.2999725341797, "debug/policy_rejected_logits": 1.4768553972244263, "debug/policy_rejected_logps": -85.98731231689453, "debug/reference_chosen_logps": -224.5519256591797, "debug/reference_rejected_logps": -85.32658386230469, "debug/sppo_chosen_loss": 2367.41748046875, "debug/sppo_chosen_reward_in_loss": 2.2519757747650146, "debug/sppo_rej_reward_in_loss": -0.6607199907302856, "debug/sppo_reject_loss": 2448.37548828125, "epoch": 7.590579710144928, "grad_norm": 42262.16624197945, "learning_rate": 7.073409166783839e-10, "logits/chosen": 1.0294712781906128, "logits/rejected": 1.4768553972244263, "logps/chosen": -222.2999725341797, "logps/rejected": -85.98731231689453, "loss": 4948.4563, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": 0.022519756108522415, "rewards/margins": 0.029126957058906555, "rewards/rejected": -0.006607199553400278, "step": 2095 }, { "debug/policy_chosen_logits": 1.5281635522842407, "debug/policy_chosen_logps": -4.486605167388916, "debug/policy_rejected_logits": 1.6393181085586548, "debug/policy_rejected_logps": -10.155960083007812, "debug/reference_chosen_logps": -4.5533342361450195, "debug/reference_rejected_logps": -10.117715835571289, "debug/sppo_chosen_loss": 2493.39404296875, "debug/sppo_chosen_reward_in_loss": 0.06672950088977814, "debug/sppo_rej_reward_in_loss": -0.038244642317295074, "debug/sppo_reject_loss": 2496.24609375, "epoch": 7.608695652173913, "grad_norm": 17733.68097482202, "learning_rate": 6.462614754427665e-10, "logits/chosen": 1.5281635522842407, "logits/rejected": 1.6393181085586548, "logps/chosen": -4.486605167388916, "logps/rejected": -10.155960083007812, "loss": 4966.3176, "rewards/accuracies": 0.375, "rewards/chosen": 0.0006672950112260878, "rewards/margins": 0.0010497414041310549, "rewards/rejected": -0.00038244639290496707, "step": 2100 }, { "epoch": 7.608695652173913, "eval_debug/policy_chosen_logits": 1.6607288122177124, "eval_debug/policy_chosen_logps": -121.20295715332031, "eval_debug/policy_rejected_logits": 1.7189559936523438, "eval_debug/policy_rejected_logps": -63.9396858215332, "eval_debug/reference_chosen_logps": -123.14806365966797, "eval_debug/reference_rejected_logps": -63.887054443359375, "eval_debug/sppo_chosen_loss": 2381.825927734375, "eval_debug/sppo_chosen_reward_in_loss": 1.9450958967208862, "eval_debug/sppo_rej_reward_in_loss": -0.052633725106716156, "eval_debug/sppo_reject_loss": 2494.81396484375, "eval_logits/chosen": 1.6607288122177124, "eval_logits/rejected": 1.7189559936523438, "eval_logps/chosen": -121.20295715332031, "eval_logps/rejected": -63.9396858215332, "eval_loss": 4951.517578125, "eval_rewards/accuracies": 0.3815789520740509, "eval_rewards/chosen": 0.0194509606808424, "eval_rewards/margins": 0.019977295771241188, "eval_rewards/rejected": -0.0005263372440822423, "eval_runtime": 28.6147, "eval_samples_per_second": 20.968, "eval_steps_per_second": 0.664, "step": 2100 }, { "debug/policy_chosen_logits": 0.8863218426704407, "debug/policy_chosen_logps": -151.0982666015625, "debug/policy_rejected_logits": 1.1469491720199585, "debug/policy_rejected_logps": -153.2440948486328, "debug/reference_chosen_logps": -152.16748046875, "debug/reference_rejected_logps": -152.31759643554688, "debug/sppo_chosen_loss": 2414.37548828125, "debug/sppo_chosen_reward_in_loss": 1.0692083835601807, "debug/sppo_rej_reward_in_loss": -0.9264828562736511, "debug/sppo_reject_loss": 2426.324951171875, "epoch": 7.6268115942028984, "grad_norm": 16513.740373182336, "learning_rate": 5.879224507926661e-10, "logits/chosen": 0.8863218426704407, "logits/rejected": 1.1469491720199585, "logps/chosen": -151.0982666015625, "logps/rejected": -153.2440948486328, "loss": 4951.6277, "rewards/accuracies": 0.22499999403953552, "rewards/chosen": 0.010692082345485687, "rewards/margins": 0.019956910982728004, "rewards/rejected": -0.009264828637242317, "step": 2105 }, { "debug/policy_chosen_logits": 1.4576388597488403, "debug/policy_chosen_logps": -91.89716339111328, "debug/policy_rejected_logits": 1.6273235082626343, "debug/policy_rejected_logps": -28.347070693969727, "debug/reference_chosen_logps": -93.01536560058594, "debug/reference_rejected_logps": -28.119426727294922, "debug/sppo_chosen_loss": 2416.934814453125, "debug/sppo_chosen_reward_in_loss": 1.118211030960083, "debug/sppo_rej_reward_in_loss": -0.22764325141906738, "debug/sppo_reject_loss": 2478.69287109375, "epoch": 7.644927536231884, "grad_norm": 22333.020502010528, "learning_rate": 5.323270820618398e-10, "logits/chosen": 1.4576388597488403, "logits/rejected": 1.6273235082626343, "logps/chosen": -91.89716339111328, "logps/rejected": -28.347070693969727, "loss": 4973.691, "rewards/accuracies": 0.375, "rewards/chosen": 0.011182109825313091, "rewards/margins": 0.01345854252576828, "rewards/rejected": -0.002276432467624545, "step": 2110 }, { "debug/policy_chosen_logits": 1.3041584491729736, "debug/policy_chosen_logps": -5.307699203491211, "debug/policy_rejected_logits": 1.428374171257019, "debug/policy_rejected_logps": -49.849998474121094, "debug/reference_chosen_logps": -5.272663593292236, "debug/reference_rejected_logps": -50.06426239013672, "debug/sppo_chosen_loss": 2503.54345703125, "debug/sppo_chosen_reward_in_loss": -0.0350353829562664, "debug/sppo_rej_reward_in_loss": 0.21425704658031464, "debug/sppo_reject_loss": 2525.06005859375, "epoch": 7.663043478260869, "grad_norm": 55515.3032975634, "learning_rate": 4.794784562397458e-10, "logits/chosen": 1.3041584491729736, "logits/rejected": 1.428374171257019, "logps/chosen": -5.307699203491211, "logps/rejected": -49.849998474121094, "loss": 4985.0938, "rewards/accuracies": 0.3499999940395355, "rewards/chosen": -0.00035035383189097047, "rewards/margins": -0.0024929242208600044, "rewards/rejected": 0.002142570214346051, "step": 2115 }, { "debug/policy_chosen_logits": 1.2195727825164795, "debug/policy_chosen_logps": -119.41800689697266, "debug/policy_rejected_logits": 1.574762225151062, "debug/policy_rejected_logps": -7.585010528564453, "debug/reference_chosen_logps": -120.79359436035156, "debug/reference_rejected_logps": -7.519468784332275, "debug/sppo_chosen_loss": 2408.789794921875, "debug/sppo_chosen_reward_in_loss": 1.3755899667739868, "debug/sppo_rej_reward_in_loss": -0.06554163992404938, "debug/sppo_reject_loss": 2493.525146484375, "epoch": 7.681159420289855, "grad_norm": 60586.33455216261, "learning_rate": 4.293795078001317e-10, "logits/chosen": 1.2195727825164795, "logits/rejected": 1.574762225151062, "logps/chosen": -119.41800689697266, "logps/rejected": -7.585010528564453, "loss": 4978.3258, "rewards/accuracies": 0.32499998807907104, "rewards/chosen": 0.013755899854004383, "rewards/margins": 0.014411315321922302, "rewards/rejected": -0.0006554163410328329, "step": 2120 }, { "debug/policy_chosen_logits": 1.1966047286987305, "debug/policy_chosen_logps": -321.14874267578125, "debug/policy_rejected_logits": 1.6263717412948608, "debug/policy_rejected_logps": -123.89298248291016, "debug/reference_chosen_logps": -324.15594482421875, "debug/reference_rejected_logps": -123.63587951660156, "debug/sppo_chosen_loss": 2315.09619140625, "debug/sppo_chosen_reward_in_loss": 3.007225275039673, "debug/sppo_rej_reward_in_loss": -0.25710296630859375, "debug/sppo_reject_loss": 2475.29638671875, "epoch": 7.699275362318841, "grad_norm": 77994.49366422728, "learning_rate": 3.8203301853813594e-10, "logits/chosen": 1.1966047286987305, "logits/rejected": 1.6263717412948608, "logps/chosen": -321.14874267578125, "logps/rejected": -123.89298248291016, "loss": 4952.218, "rewards/accuracies": 0.5249999761581421, "rewards/chosen": 0.030072247609496117, "rewards/margins": 0.03264328092336655, "rewards/rejected": -0.0025710291229188442, "step": 2125 }, { "debug/policy_chosen_logits": 1.554592251777649, "debug/policy_chosen_logps": -59.23078536987305, "debug/policy_rejected_logits": 1.9070403575897217, "debug/policy_rejected_logps": -11.090579986572266, "debug/reference_chosen_logps": -59.7724494934082, "debug/reference_rejected_logps": -11.075045585632324, "debug/sppo_chosen_loss": 2456.537841796875, "debug/sppo_chosen_reward_in_loss": 0.5416635274887085, "debug/sppo_rej_reward_in_loss": -0.015533894300460815, "debug/sppo_reject_loss": 2498.547607421875, "epoch": 7.717391304347826, "grad_norm": 12136.219767734032, "learning_rate": 3.3744161741577905e-10, "logits/chosen": 1.554592251777649, "logits/rejected": 1.9070403575897217, "logps/chosen": -59.23078536987305, "logps/rejected": -11.090579986572266, "loss": 4975.9988, "rewards/accuracies": 0.32499998807907104, "rewards/chosen": 0.005416635423898697, "rewards/margins": 0.005571974441409111, "rewards/rejected": -0.00015533894475083798, "step": 2130 }, { "debug/policy_chosen_logits": 1.3897290229797363, "debug/policy_chosen_logps": -6.960751533508301, "debug/policy_rejected_logits": 1.8097299337387085, "debug/policy_rejected_logps": -44.101280212402344, "debug/reference_chosen_logps": -6.974306583404541, "debug/reference_rejected_logps": -44.45081329345703, "debug/sppo_chosen_loss": 2498.709716796875, "debug/sppo_chosen_reward_in_loss": 0.013555025681853294, "debug/sppo_rej_reward_in_loss": 0.34952980279922485, "debug/sppo_reject_loss": 2539.877685546875, "epoch": 7.7355072463768115, "grad_norm": 37982.14069984209, "learning_rate": 2.956077804160184e-10, "logits/chosen": 1.3897290229797363, "logits/rejected": 1.8097299337387085, "logps/chosen": -6.960751533508301, "logps/rejected": -44.101280212402344, "loss": 4967.4977, "rewards/accuracies": 0.2750000059604645, "rewards/chosen": 0.00013555024634115398, "rewards/margins": -0.0033597475849092007, "rewards/rejected": 0.003495297860354185, "step": 2135 }, { "debug/policy_chosen_logits": 1.4668445587158203, "debug/policy_chosen_logps": -46.630943298339844, "debug/policy_rejected_logits": 1.8659709692001343, "debug/policy_rejected_logps": -110.12214660644531, "debug/reference_chosen_logps": -46.85969161987305, "debug/reference_rejected_logps": -109.70494079589844, "debug/sppo_chosen_loss": 2479.55615234375, "debug/sppo_chosen_reward_in_loss": 0.22875066101551056, "debug/sppo_rej_reward_in_loss": -0.4171935021877289, "debug/sppo_reject_loss": 2461.060302734375, "epoch": 7.753623188405797, "grad_norm": 128032.58166241871, "learning_rate": 2.5653383040524224e-10, "logits/chosen": 1.4668445587158203, "logits/rejected": 1.8659709692001343, "logps/chosen": -46.630943298339844, "logps/rejected": -110.12214660644531, "loss": 4970.1238, "rewards/accuracies": 0.3499999940395355, "rewards/chosen": 0.002287506591528654, "rewards/margins": 0.006459442432969809, "rewards/rejected": -0.00417193491011858, "step": 2140 }, { "debug/policy_chosen_logits": 1.5519484281539917, "debug/policy_chosen_logps": -23.999021530151367, "debug/policy_rejected_logits": 1.6031992435455322, "debug/policy_rejected_logps": -5.421201229095459, "debug/reference_chosen_logps": -24.1997013092041, "debug/reference_rejected_logps": -5.325526237487793, "debug/sppo_chosen_loss": 2481.80126953125, "debug/sppo_chosen_reward_in_loss": 0.20067811012268066, "debug/sppo_rej_reward_in_loss": -0.09567555040121078, "debug/sppo_reject_loss": 2490.593017578125, "epoch": 7.771739130434782, "grad_norm": 64134.64898312743, "learning_rate": 2.202219370043168e-10, "logits/chosen": 1.5519484281539917, "logits/rejected": 1.6031992435455322, "logps/chosen": -23.999021530151367, "logps/rejected": -5.421201229095459, "loss": 4941.8523, "rewards/accuracies": 0.3499999940395355, "rewards/chosen": 0.0020067808218300343, "rewards/margins": 0.002963536186143756, "rewards/rejected": -0.0009567554807290435, "step": 2145 }, { "debug/policy_chosen_logits": 1.2649717330932617, "debug/policy_chosen_logps": -116.90604400634766, "debug/policy_rejected_logits": 1.3826146125793457, "debug/policy_rejected_logps": -5.580374240875244, "debug/reference_chosen_logps": -117.74879455566406, "debug/reference_rejected_logps": -5.538172721862793, "debug/sppo_chosen_loss": 2431.57177734375, "debug/sppo_chosen_reward_in_loss": 0.8427442312240601, "debug/sppo_rej_reward_in_loss": -0.04220107942819595, "debug/sppo_reject_loss": 2495.842041015625, "epoch": 7.789855072463768, "grad_norm": 51972.56525598102, "learning_rate": 1.866741164680996e-10, "logits/chosen": 1.2649717330932617, "logits/rejected": 1.3826146125793457, "logps/chosen": -116.90604400634766, "logps/rejected": -5.580374240875244, "loss": 4961.6664, "rewards/accuracies": 0.3499999940395355, "rewards/chosen": 0.008427442982792854, "rewards/margins": 0.008849453181028366, "rewards/rejected": -0.00042201075120829046, "step": 2150 }, { "debug/policy_chosen_logits": 1.320969581604004, "debug/policy_chosen_logps": -53.896141052246094, "debug/policy_rejected_logits": 1.4506657123565674, "debug/policy_rejected_logps": -115.2258529663086, "debug/reference_chosen_logps": -54.441184997558594, "debug/reference_rejected_logps": -115.64637756347656, "debug/sppo_chosen_loss": 2454.39453125, "debug/sppo_chosen_reward_in_loss": 0.5450425148010254, "debug/sppo_rej_reward_in_loss": 0.42051178216934204, "debug/sppo_reject_loss": 2550.57080078125, "epoch": 7.807971014492754, "grad_norm": 48095.01900926162, "learning_rate": 1.5589223157347896e-10, "logits/chosen": 1.320969581604004, "logits/rejected": 1.4506657123565674, "logps/chosen": -53.896141052246094, "logps/rejected": -115.2258529663086, "loss": 4989.3406, "rewards/accuracies": 0.32499998807907104, "rewards/chosen": 0.005450425203889608, "rewards/margins": 0.001245307270437479, "rewards/rejected": 0.004205117933452129, "step": 2155 }, { "debug/policy_chosen_logits": 1.4193211793899536, "debug/policy_chosen_logps": -99.2984848022461, "debug/policy_rejected_logits": 2.197362184524536, "debug/policy_rejected_logps": -11.946706771850586, "debug/reference_chosen_logps": -99.80180358886719, "debug/reference_rejected_logps": -11.882006645202637, "debug/sppo_chosen_loss": 2459.90185546875, "debug/sppo_chosen_reward_in_loss": 0.5033254623413086, "debug/sppo_rej_reward_in_loss": -0.0646994560956955, "debug/sppo_reject_loss": 2493.612548828125, "epoch": 7.826086956521739, "grad_norm": 21238.252517439265, "learning_rate": 1.2787799151596224e-10, "logits/chosen": 1.4193211793899536, "logits/rejected": 2.197362184524536, "logps/chosen": -99.2984848022461, "logps/rejected": -11.946706771850586, "loss": 4975.0094, "rewards/accuracies": 0.375, "rewards/chosen": 0.005033254623413086, "rewards/margins": 0.0056802500039339066, "rewards/rejected": -0.0006469946238212287, "step": 2160 }, { "debug/policy_chosen_logits": 1.4936144351959229, "debug/policy_chosen_logps": -71.16127014160156, "debug/policy_rejected_logits": 1.4722700119018555, "debug/policy_rejected_logps": -3.080197811126709, "debug/reference_chosen_logps": -71.87583923339844, "debug/reference_rejected_logps": -3.0672378540039062, "debug/sppo_chosen_loss": 2444.332763671875, "debug/sppo_chosen_reward_in_loss": 0.7145590782165527, "debug/sppo_rej_reward_in_loss": -0.012960237450897694, "debug/sppo_reject_loss": 2498.7158203125, "epoch": 7.844202898550725, "grad_norm": 32976.823472238815, "learning_rate": 1.0263295181475174e-10, "logits/chosen": 1.4936144351959229, "logits/rejected": 1.4722700119018555, "logps/chosen": -71.16127014160156, "logps/rejected": -3.080197811126709, "loss": 4933.2437, "rewards/accuracies": 0.32499998807907104, "rewards/chosen": 0.007145591080188751, "rewards/margins": 0.007275192998349667, "rewards/rejected": -0.00012960235471837223, "step": 2165 }, { "debug/policy_chosen_logits": 1.2984204292297363, "debug/policy_chosen_logps": -78.85707092285156, "debug/policy_rejected_logits": 1.487083077430725, "debug/policy_rejected_logps": -91.02099609375, "debug/reference_chosen_logps": -79.83101654052734, "debug/reference_rejected_logps": -91.58612060546875, "debug/sppo_chosen_loss": 2443.150390625, "debug/sppo_chosen_reward_in_loss": 0.9739497303962708, "debug/sppo_rej_reward_in_loss": 0.5651189088821411, "debug/sppo_reject_loss": 2565.64111328125, "epoch": 7.86231884057971, "grad_norm": 13898.710994654932, "learning_rate": 8.015851422638053e-11, "logits/chosen": 1.2984204292297363, "logits/rejected": 1.487083077430725, "logps/chosen": -78.85707092285156, "logps/rejected": -91.02099609375, "loss": 4946.6707, "rewards/accuracies": 0.3499999940395355, "rewards/chosen": 0.009739495813846588, "rewards/margins": 0.004088307730853558, "rewards/rejected": 0.005651188548654318, "step": 2170 }, { "debug/policy_chosen_logits": 1.599073886871338, "debug/policy_chosen_logps": -67.3268051147461, "debug/policy_rejected_logits": 1.7592706680297852, "debug/policy_rejected_logps": -4.544020652770996, "debug/reference_chosen_logps": -68.33360290527344, "debug/reference_rejected_logps": -4.408754348754883, "debug/sppo_chosen_loss": 2433.844970703125, "debug/sppo_chosen_reward_in_loss": 1.0067921876907349, "debug/sppo_rej_reward_in_loss": -0.13526609539985657, "debug/sppo_reject_loss": 2486.622802734375, "epoch": 7.880434782608695, "grad_norm": 43523.80025194495, "learning_rate": 6.045592666688581e-11, "logits/chosen": 1.599073886871338, "logits/rejected": 1.7592706680297852, "logps/chosen": -67.3268051147461, "logps/rejected": -4.544020652770996, "loss": 4965.3559, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": 0.010067922063171864, "rewards/margins": 0.011420582421123981, "rewards/rejected": -0.001352660940028727, "step": 2175 }, { "debug/policy_chosen_logits": 1.2306917905807495, "debug/policy_chosen_logps": -7.127419948577881, "debug/policy_rejected_logits": 1.494236946105957, "debug/policy_rejected_logps": -5.229714870452881, "debug/reference_chosen_logps": -7.189650535583496, "debug/reference_rejected_logps": -5.031564235687256, "debug/sppo_chosen_loss": 2493.880126953125, "debug/sppo_chosen_reward_in_loss": 0.062230490148067474, "debug/sppo_rej_reward_in_loss": -0.19815053045749664, "debug/sppo_reject_loss": 2480.84326171875, "epoch": 7.898550724637682, "grad_norm": 36279.389376456544, "learning_rate": 4.352628314249762e-11, "logits/chosen": 1.2306917905807495, "logits/rejected": 1.494236946105957, "logps/chosen": -7.127419948577881, "logps/rejected": -5.229714870452881, "loss": 4971.9434, "rewards/accuracies": 0.3499999940395355, "rewards/chosen": 0.0006223049713298678, "rewards/margins": 0.0026038100477308035, "rewards/rejected": -0.0019815051928162575, "step": 2180 }, { "debug/policy_chosen_logits": 1.5966850519180298, "debug/policy_chosen_logps": -7.728903293609619, "debug/policy_rejected_logits": 1.7997925281524658, "debug/policy_rejected_logps": -8.765410423278809, "debug/reference_chosen_logps": -7.779355525970459, "debug/reference_rejected_logps": -8.762332916259766, "debug/sppo_chosen_loss": 2495.04736328125, "debug/sppo_chosen_reward_in_loss": 0.050452686846256256, "debug/sppo_rej_reward_in_loss": -0.003078100038692355, "debug/sppo_reject_loss": 2499.779541015625, "epoch": 7.916666666666667, "grad_norm": 59183.94302363507, "learning_rate": 2.9370523688915237e-11, "logits/chosen": 1.5966850519180298, "logits/rejected": 1.7997925281524658, "logps/chosen": -7.728903293609619, "logps/rejected": -8.765410423278809, "loss": 4965.4145, "rewards/accuracies": 0.375, "rewards/chosen": 0.0005045267753303051, "rewards/margins": 0.0005353078013285995, "rewards/rejected": -3.078100417042151e-05, "step": 2185 }, { "debug/policy_chosen_logits": 0.9988452196121216, "debug/policy_chosen_logps": -170.04989624023438, "debug/policy_rejected_logits": 1.4539412260055542, "debug/policy_rejected_logps": -45.55120086669922, "debug/reference_chosen_logps": -172.30654907226562, "debug/reference_rejected_logps": -45.706825256347656, "debug/sppo_chosen_loss": 2349.8671875, "debug/sppo_chosen_reward_in_loss": 2.256650924682617, "debug/sppo_rej_reward_in_loss": 0.1556188017129898, "debug/sppo_reject_loss": 2518.135986328125, "epoch": 7.934782608695652, "grad_norm": 62623.203297711116, "learning_rate": 1.7989434319093387e-11, "logits/chosen": 0.9988452196121216, "logits/rejected": 1.4539412260055542, "logps/chosen": -170.04989624023438, "logps/rejected": -45.55120086669922, "loss": 4958.1398, "rewards/accuracies": 0.30000001192092896, "rewards/chosen": 0.022566508501768112, "rewards/margins": 0.021010320633649826, "rewards/rejected": 0.0015561877517029643, "step": 2190 }, { "debug/policy_chosen_logits": 1.526430368423462, "debug/policy_chosen_logps": -185.01708984375, "debug/policy_rejected_logits": 1.6621005535125732, "debug/policy_rejected_logps": -12.181209564208984, "debug/reference_chosen_logps": -187.01004028320312, "debug/reference_rejected_logps": -12.076559066772461, "debug/sppo_chosen_loss": 2384.4912109375, "debug/sppo_chosen_reward_in_loss": 1.9929542541503906, "debug/sppo_rej_reward_in_loss": -0.10464892536401749, "debug/sppo_reject_loss": 2489.60205078125, "epoch": 7.952898550724638, "grad_norm": 20509.023708193017, "learning_rate": 9.38364697961047e-12, "logits/chosen": 1.526430368423462, "logits/rejected": 1.6621005535125732, "logps/chosen": -185.01708984375, "logps/rejected": -12.181209564208984, "loss": 4943.2129, "rewards/accuracies": 0.44999998807907104, "rewards/chosen": 0.019929539412260056, "rewards/margins": 0.020976031199097633, "rewards/rejected": -0.0010464892257004976, "step": 2195 }, { "debug/policy_chosen_logits": 1.359118103981018, "debug/policy_chosen_logps": -7.3567352294921875, "debug/policy_rejected_logits": 1.7415691614151, "debug/policy_rejected_logps": -5.697220802307129, "debug/reference_chosen_logps": -7.411065578460693, "debug/reference_rejected_logps": -5.5189714431762695, "debug/sppo_chosen_loss": 2495.0263671875, "debug/sppo_chosen_reward_in_loss": 0.05433068796992302, "debug/sppo_rej_reward_in_loss": -0.17824828624725342, "debug/sppo_reject_loss": 2482.53271484375, "epoch": 7.971014492753623, "grad_norm": 45338.52419547441, "learning_rate": 3.5536395155744138e-12, "logits/chosen": 1.359118103981018, "logits/rejected": 1.7415691614151, "logps/chosen": -7.3567352294921875, "logps/rejected": -5.697220802307129, "loss": 4946.1824, "rewards/accuracies": 0.30000001192092896, "rewards/chosen": 0.0005433068727143109, "rewards/margins": 0.002325789537280798, "rewards/rejected": -0.0017824828391894698, "step": 2200 }, { "epoch": 7.971014492753623, "eval_debug/policy_chosen_logits": 1.6603082418441772, "eval_debug/policy_chosen_logps": -121.0837173461914, "eval_debug/policy_rejected_logits": 1.7198450565338135, "eval_debug/policy_rejected_logps": -63.905799865722656, "eval_debug/reference_chosen_logps": -123.14806365966797, "eval_debug/reference_rejected_logps": -63.887054443359375, "eval_debug/sppo_chosen_loss": 2387.424560546875, "eval_debug/sppo_chosen_reward_in_loss": 2.064335584640503, "eval_debug/sppo_rej_reward_in_loss": -0.018739407882094383, "eval_debug/sppo_reject_loss": 2498.160888671875, "eval_logits/chosen": 1.6603082418441772, "eval_logits/rejected": 1.7198450565338135, "eval_logps/chosen": -121.0837173461914, "eval_logps/rejected": -63.905799865722656, "eval_loss": 4957.30810546875, "eval_rewards/accuracies": 0.30263158679008484, "eval_rewards/chosen": 0.020643359050154686, "eval_rewards/margins": 0.020830752328038216, "eval_rewards/rejected": -0.00018739403458312154, "eval_runtime": 28.7159, "eval_samples_per_second": 20.894, "eval_steps_per_second": 0.662, "step": 2200 }, { "debug/policy_chosen_logits": 1.5274391174316406, "debug/policy_chosen_logps": -40.76905059814453, "debug/policy_rejected_logits": 1.8320732116699219, "debug/policy_rejected_logps": -6.004141330718994, "debug/reference_chosen_logps": -41.25218200683594, "debug/reference_rejected_logps": -5.933625221252441, "debug/sppo_chosen_loss": 2461.10791015625, "debug/sppo_chosen_reward_in_loss": 0.4831283688545227, "debug/sppo_rej_reward_in_loss": -0.07051616162061691, "debug/sppo_reject_loss": 2492.98486328125, "epoch": 7.989130434782608, "grad_norm": 203503.92951456137, "learning_rate": 4.997356440772371e-13, "logits/chosen": 1.5274391174316406, "logits/rejected": 1.8320732116699219, "logps/chosen": -40.76905059814453, "logps/rejected": -6.004141330718994, "loss": 5007.7508, "rewards/accuracies": 0.3499999940395355, "rewards/chosen": 0.0048312833532691, "rewards/margins": 0.005536444950848818, "rewards/rejected": -0.0007051616557873785, "step": 2205 }, { "epoch": 8.0, "step": 2208, "total_flos": 0.0, "train_loss": 5009.442099750906, "train_runtime": 15540.9203, "train_samples_per_second": 9.073, "train_steps_per_second": 0.142 } ], "logging_steps": 5, "max_steps": 2208, "num_input_tokens_seen": 0, "num_train_epochs": 8, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 0.0, "train_batch_size": 8, "trial_name": null, "trial_params": null }