diff --git "a/trainer_state.json" "b/trainer_state.json" new file mode 100644--- /dev/null +++ "b/trainer_state.json" @@ -0,0 +1,11664 @@ +{ + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 8.0, + "eval_steps": 100, + "global_step": 2208, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "debug/policy_chosen_logits": 0.8079685568809509, + "debug/policy_chosen_logps": -298.0812683105469, + "debug/policy_rejected_logits": 0.6268295645713806, + "debug/policy_rejected_logps": -240.20742797851562, + "debug/reference_chosen_logps": -298.0812683105469, + "debug/reference_rejected_logps": -240.20742797851562, + "debug/sppo_chosen_loss": 2500.0, + "debug/sppo_chosen_reward_in_loss": 0.0, + "debug/sppo_rej_reward_in_loss": 0.0, + "debug/sppo_reject_loss": 2500.0, + "epoch": 0.0036231884057971015, + "grad_norm": 63517.94720887712, + "learning_rate": 1e-09, + "logits/chosen": 0.8079685568809509, + "logits/rejected": 0.6268295645713806, + "logps/chosen": -298.0812683105469, + "logps/rejected": -240.20742797851562, + "loss": 5000.0, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.0, + "rewards/margins": 0.0, + "rewards/rejected": 0.0, + "step": 1 + }, + { + "debug/policy_chosen_logits": 1.1307491064071655, + "debug/policy_chosen_logps": -262.5867614746094, + "debug/policy_rejected_logits": 1.3688106536865234, + "debug/policy_rejected_logps": -295.9023742675781, + "debug/reference_chosen_logps": -262.58026123046875, + "debug/reference_rejected_logps": -296.0579528808594, + "debug/sppo_chosen_loss": 2501.32470703125, + "debug/sppo_chosen_reward_in_loss": -0.006488680839538574, + "debug/sppo_rej_reward_in_loss": 0.15558385848999023, + "debug/sppo_reject_loss": 2515.9296875, + "epoch": 0.018115942028985508, + "grad_norm": 58736.14839713635, + "learning_rate": 5e-09, + "logits/chosen": 1.1307491064071655, + "logits/rejected": 1.3688106536865234, + "logps/chosen": -262.5867614746094, + "logps/rejected": -295.9023742675781, + "loss": 4991.5791, + "rewards/accuracies": 0.34375, + "rewards/chosen": -6.488675717264414e-05, + "rewards/margins": -0.0016207253793254495, + "rewards/rejected": 0.0015558383893221617, + "step": 5 + }, + { + "debug/policy_chosen_logits": 1.3411222696304321, + "debug/policy_chosen_logps": -282.69219970703125, + "debug/policy_rejected_logits": 1.611016869544983, + "debug/policy_rejected_logps": -287.430908203125, + "debug/reference_chosen_logps": -282.7684631347656, + "debug/reference_rejected_logps": -287.81396484375, + "debug/sppo_chosen_loss": 2493.212646484375, + "debug/sppo_chosen_reward_in_loss": 0.07623787224292755, + "debug/sppo_rej_reward_in_loss": 0.38306236267089844, + "debug/sppo_reject_loss": 2539.445556640625, + "epoch": 0.036231884057971016, + "grad_norm": 68115.07386767172, + "learning_rate": 1e-08, + "logits/chosen": 1.3411222696304321, + "logits/rejected": 1.611016869544983, + "logps/chosen": -282.69219970703125, + "logps/rejected": -287.430908203125, + "loss": 5009.1367, + "rewards/accuracies": 0.4000000059604645, + "rewards/chosen": 0.0007623785641044378, + "rewards/margins": -0.0030682452488690615, + "rewards/rejected": 0.0038306235801428556, + "step": 10 + }, + { + "debug/policy_chosen_logits": 1.2816027402877808, + "debug/policy_chosen_logps": -247.15579223632812, + "debug/policy_rejected_logits": 1.63693368434906, + "debug/policy_rejected_logps": -285.6268615722656, + "debug/reference_chosen_logps": -247.40646362304688, + "debug/reference_rejected_logps": -286.155029296875, + "debug/sppo_chosen_loss": 2475.64404296875, + "debug/sppo_chosen_reward_in_loss": 0.25067728757858276, + "debug/sppo_rej_reward_in_loss": 0.5281627774238586, + "debug/sppo_reject_loss": 2554.003662109375, + "epoch": 0.05434782608695652, + "grad_norm": 54977.353764878644, + "learning_rate": 1.5e-08, + "logits/chosen": 1.2816027402877808, + "logits/rejected": 1.63693368434906, + "logps/chosen": -247.15579223632812, + "logps/rejected": -285.6268615722656, + "loss": 5003.7672, + "rewards/accuracies": 0.42500001192092896, + "rewards/chosen": 0.0025067729875445366, + "rewards/margins": -0.0027748546563088894, + "rewards/rejected": 0.005281627178192139, + "step": 15 + }, + { + "debug/policy_chosen_logits": 1.2541359663009644, + "debug/policy_chosen_logps": -275.11187744140625, + "debug/policy_rejected_logits": 1.6011371612548828, + "debug/policy_rejected_logps": -277.25579833984375, + "debug/reference_chosen_logps": -275.2658386230469, + "debug/reference_rejected_logps": -277.7287292480469, + "debug/sppo_chosen_loss": 2485.12744140625, + "debug/sppo_chosen_reward_in_loss": 0.15397301316261292, + "debug/sppo_rej_reward_in_loss": 0.47296810150146484, + "debug/sppo_reject_loss": 2548.18603515625, + "epoch": 0.07246376811594203, + "grad_norm": 57534.90143937713, + "learning_rate": 2e-08, + "logits/chosen": 1.2541359663009644, + "logits/rejected": 1.6011371612548828, + "logps/chosen": -275.11187744140625, + "logps/rejected": -277.25579833984375, + "loss": 5019.257, + "rewards/accuracies": 0.375, + "rewards/chosen": 0.0015397300012409687, + "rewards/margins": -0.003189950715750456, + "rewards/rejected": 0.004729681182652712, + "step": 20 + }, + { + "debug/policy_chosen_logits": 1.3301985263824463, + "debug/policy_chosen_logps": -256.6197204589844, + "debug/policy_rejected_logits": 1.700563669204712, + "debug/policy_rejected_logps": -268.60101318359375, + "debug/reference_chosen_logps": -257.0242614746094, + "debug/reference_rejected_logps": -269.0206604003906, + "debug/sppo_chosen_loss": 2460.305419921875, + "debug/sppo_chosen_reward_in_loss": 0.40453624725341797, + "debug/sppo_rej_reward_in_loss": 0.41967296600341797, + "debug/sppo_reject_loss": 2542.882080078125, + "epoch": 0.09057971014492754, + "grad_norm": 60821.795107824626, + "learning_rate": 2.5e-08, + "logits/chosen": 1.3301985263824463, + "logits/rejected": 1.700563669204712, + "logps/chosen": -256.6197204589844, + "logps/rejected": -268.60101318359375, + "loss": 5002.3281, + "rewards/accuracies": 0.44999998807907104, + "rewards/chosen": 0.004045362584292889, + "rewards/margins": -0.00015136711590457708, + "rewards/rejected": 0.004196729511022568, + "step": 25 + }, + { + "debug/policy_chosen_logits": 1.4955631494522095, + "debug/policy_chosen_logps": -226.08700561523438, + "debug/policy_rejected_logits": 1.840157151222229, + "debug/policy_rejected_logps": -272.52227783203125, + "debug/reference_chosen_logps": -226.3221893310547, + "debug/reference_rejected_logps": -272.57330322265625, + "debug/sppo_chosen_loss": 2477.2919921875, + "debug/sppo_chosen_reward_in_loss": 0.23515930771827698, + "debug/sppo_rej_reward_in_loss": 0.05106544494628906, + "debug/sppo_reject_loss": 2505.71826171875, + "epoch": 0.10869565217391304, + "grad_norm": 59806.97677705937, + "learning_rate": 3e-08, + "logits/chosen": 1.4955631494522095, + "logits/rejected": 1.840157151222229, + "logps/chosen": -226.08700561523438, + "logps/rejected": -272.52227783203125, + "loss": 4999.6945, + "rewards/accuracies": 0.5249999761581421, + "rewards/chosen": 0.0023515927605330944, + "rewards/margins": 0.001840938231907785, + "rewards/rejected": 0.0005106544704176486, + "step": 30 + }, + { + "debug/policy_chosen_logits": 1.5929896831512451, + "debug/policy_chosen_logps": -250.57470703125, + "debug/policy_rejected_logits": 1.9173164367675781, + "debug/policy_rejected_logps": -278.4027404785156, + "debug/reference_chosen_logps": -250.8056640625, + "debug/reference_rejected_logps": -278.73834228515625, + "debug/sppo_chosen_loss": 2477.46240234375, + "debug/sppo_chosen_reward_in_loss": 0.2309425324201584, + "debug/sppo_rej_reward_in_loss": 0.335653692483902, + "debug/sppo_reject_loss": 2534.33251953125, + "epoch": 0.12681159420289856, + "grad_norm": 60289.32214413659, + "learning_rate": 3.4999999999999996e-08, + "logits/chosen": 1.5929896831512451, + "logits/rejected": 1.9173164367675781, + "logps/chosen": -250.57470703125, + "logps/rejected": -278.4027404785156, + "loss": 5009.1906, + "rewards/accuracies": 0.44999998807907104, + "rewards/chosen": 0.0023094252683222294, + "rewards/margins": -0.0010471114655956626, + "rewards/rejected": 0.003356536850333214, + "step": 35 + }, + { + "debug/policy_chosen_logits": 1.4920421838760376, + "debug/policy_chosen_logps": -258.7696228027344, + "debug/policy_rejected_logits": 1.8183902502059937, + "debug/policy_rejected_logps": -306.36370849609375, + "debug/reference_chosen_logps": -259.2444152832031, + "debug/reference_rejected_logps": -306.8253479003906, + "debug/sppo_chosen_loss": 2453.486572265625, + "debug/sppo_chosen_reward_in_loss": 0.4747522473335266, + "debug/sppo_rej_reward_in_loss": 0.46161746978759766, + "debug/sppo_reject_loss": 2546.947265625, + "epoch": 0.14492753623188406, + "grad_norm": 59391.41080156482, + "learning_rate": 4e-08, + "logits/chosen": 1.4920421838760376, + "logits/rejected": 1.8183902502059937, + "logps/chosen": -258.7696228027344, + "logps/rejected": -306.36370849609375, + "loss": 5000.5938, + "rewards/accuracies": 0.42500001192092896, + "rewards/chosen": 0.004747522063553333, + "rewards/margins": 0.00013134740584064275, + "rewards/rejected": 0.004616174381226301, + "step": 40 + }, + { + "debug/policy_chosen_logits": 1.4446805715560913, + "debug/policy_chosen_logps": -251.4736785888672, + "debug/policy_rejected_logits": 1.6244945526123047, + "debug/policy_rejected_logps": -283.0340270996094, + "debug/reference_chosen_logps": -251.584228515625, + "debug/reference_rejected_logps": -283.26763916015625, + "debug/sppo_chosen_loss": 2489.77587890625, + "debug/sppo_chosen_reward_in_loss": 0.11053428798913956, + "debug/sppo_rej_reward_in_loss": 0.2336265593767166, + "debug/sppo_reject_loss": 2524.171142578125, + "epoch": 0.16304347826086957, + "grad_norm": 69631.29334784736, + "learning_rate": 4.5e-08, + "logits/chosen": 1.4446805715560913, + "logits/rejected": 1.6244945526123047, + "logps/chosen": -251.4736785888672, + "logps/rejected": -283.0340270996094, + "loss": 5000.6957, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.0011053427588194609, + "rewards/margins": -0.0012309231096878648, + "rewards/rejected": 0.002336265752092004, + "step": 45 + }, + { + "debug/policy_chosen_logits": 1.348311185836792, + "debug/policy_chosen_logps": -244.5071258544922, + "debug/policy_rejected_logits": 1.7887471914291382, + "debug/policy_rejected_logps": -292.06756591796875, + "debug/reference_chosen_logps": -244.6476287841797, + "debug/reference_rejected_logps": -292.14788818359375, + "debug/sppo_chosen_loss": 2486.54541015625, + "debug/sppo_chosen_reward_in_loss": 0.14052048325538635, + "debug/sppo_rej_reward_in_loss": 0.080322265625, + "debug/sppo_reject_loss": 2508.5693359375, + "epoch": 0.18115942028985507, + "grad_norm": 60893.65741162143, + "learning_rate": 5e-08, + "logits/chosen": 1.348311185836792, + "logits/rejected": 1.7887471914291382, + "logps/chosen": -244.5071258544922, + "logps/rejected": -292.06756591796875, + "loss": 4994.2488, + "rewards/accuracies": 0.550000011920929, + "rewards/chosen": 0.0014052048791199923, + "rewards/margins": 0.0006019821157678962, + "rewards/rejected": 0.0008032227051444352, + "step": 50 + }, + { + "debug/policy_chosen_logits": 1.3296738862991333, + "debug/policy_chosen_logps": -256.0931396484375, + "debug/policy_rejected_logits": 1.8709113597869873, + "debug/policy_rejected_logps": -305.2488098144531, + "debug/reference_chosen_logps": -255.957275390625, + "debug/reference_rejected_logps": -305.47186279296875, + "debug/sppo_chosen_loss": 2514.22216796875, + "debug/sppo_chosen_reward_in_loss": -0.1358652114868164, + "debug/sppo_rej_reward_in_loss": 0.22305870056152344, + "debug/sppo_reject_loss": 2523.094970703125, + "epoch": 0.19927536231884058, + "grad_norm": 68084.35003371171, + "learning_rate": 5.5e-08, + "logits/chosen": 1.3296738862991333, + "logits/rejected": 1.8709113597869873, + "logps/chosen": -256.0931396484375, + "logps/rejected": -305.2488098144531, + "loss": 4993.8352, + "rewards/accuracies": 0.375, + "rewards/chosen": -0.0013586520217359066, + "rewards/margins": -0.0035892389714717865, + "rewards/rejected": 0.00223058694973588, + "step": 55 + }, + { + "debug/policy_chosen_logits": 1.5263268947601318, + "debug/policy_chosen_logps": -259.1288146972656, + "debug/policy_rejected_logits": 1.8259985446929932, + "debug/policy_rejected_logps": -309.2344970703125, + "debug/reference_chosen_logps": -259.2587585449219, + "debug/reference_rejected_logps": -309.2173767089844, + "debug/sppo_chosen_loss": 2487.46142578125, + "debug/sppo_chosen_reward_in_loss": 0.12995243072509766, + "debug/sppo_rej_reward_in_loss": -0.01710205152630806, + "debug/sppo_reject_loss": 2498.983642578125, + "epoch": 0.21739130434782608, + "grad_norm": 66891.2264132623, + "learning_rate": 6e-08, + "logits/chosen": 1.5263268947601318, + "logits/rejected": 1.8259985446929932, + "logps/chosen": -259.1288146972656, + "logps/rejected": -309.2344970703125, + "loss": 4989.0859, + "rewards/accuracies": 0.574999988079071, + "rewards/chosen": 0.001299524214118719, + "rewards/margins": 0.001470544608309865, + "rewards/rejected": -0.00017102043784689158, + "step": 60 + }, + { + "debug/policy_chosen_logits": 1.4817605018615723, + "debug/policy_chosen_logps": -272.45111083984375, + "debug/policy_rejected_logits": 1.8564655780792236, + "debug/policy_rejected_logps": -295.9479675292969, + "debug/reference_chosen_logps": -272.57183837890625, + "debug/reference_rejected_logps": -295.93487548828125, + "debug/sppo_chosen_loss": 2488.473388671875, + "debug/sppo_chosen_reward_in_loss": 0.12075519561767578, + "debug/sppo_rej_reward_in_loss": -0.013109969906508923, + "debug/sppo_reject_loss": 2499.827392578125, + "epoch": 0.23550724637681159, + "grad_norm": 65457.17891751278, + "learning_rate": 6.5e-08, + "logits/chosen": 1.4817605018615723, + "logits/rejected": 1.8564655780792236, + "logps/chosen": -272.45111083984375, + "logps/rejected": -295.9479675292969, + "loss": 4989.7848, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": 0.0012075519189238548, + "rewards/margins": 0.0013386515202000737, + "rewards/rejected": -0.00013109967403579503, + "step": 65 + }, + { + "debug/policy_chosen_logits": 1.643053412437439, + "debug/policy_chosen_logps": -266.3773193359375, + "debug/policy_rejected_logits": 1.6303755044937134, + "debug/policy_rejected_logps": -279.2357482910156, + "debug/reference_chosen_logps": -266.346435546875, + "debug/reference_rejected_logps": -279.0445861816406, + "debug/sppo_chosen_loss": 2503.91015625, + "debug/sppo_chosen_reward_in_loss": -0.0308837890625, + "debug/sppo_rej_reward_in_loss": -0.19117030501365662, + "debug/sppo_reject_loss": 2481.839111328125, + "epoch": 0.2536231884057971, + "grad_norm": 56546.95847698017, + "learning_rate": 6.999999999999999e-08, + "logits/chosen": 1.643053412437439, + "logits/rejected": 1.6303755044937134, + "logps/chosen": -266.3773193359375, + "logps/rejected": -279.2357482910156, + "loss": 4996.5586, + "rewards/accuracies": 0.550000011920929, + "rewards/chosen": -0.0003088379744440317, + "rewards/margins": 0.0016028654063120484, + "rewards/rejected": -0.0019117031479254365, + "step": 70 + }, + { + "debug/policy_chosen_logits": 1.3959085941314697, + "debug/policy_chosen_logps": -241.9213409423828, + "debug/policy_rejected_logits": 1.7832151651382446, + "debug/policy_rejected_logps": -272.53082275390625, + "debug/reference_chosen_logps": -241.78036499023438, + "debug/reference_rejected_logps": -272.10552978515625, + "debug/sppo_chosen_loss": 2515.092529296875, + "debug/sppo_chosen_reward_in_loss": -0.14095115661621094, + "debug/sppo_rej_reward_in_loss": -0.4252597689628601, + "debug/sppo_reject_loss": 2458.417724609375, + "epoch": 0.2717391304347826, + "grad_norm": 56288.35239166632, + "learning_rate": 7.5e-08, + "logits/chosen": 1.3959085941314697, + "logits/rejected": 1.7832151651382446, + "logps/chosen": -241.9213409423828, + "logps/rejected": -272.53082275390625, + "loss": 4976.884, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -0.0014095116639509797, + "rewards/margins": 0.002843086142092943, + "rewards/rejected": -0.004252597689628601, + "step": 75 + }, + { + "debug/policy_chosen_logits": 1.3575584888458252, + "debug/policy_chosen_logps": -263.7048034667969, + "debug/policy_rejected_logits": 1.7402280569076538, + "debug/policy_rejected_logps": -292.9505310058594, + "debug/reference_chosen_logps": -263.3627014160156, + "debug/reference_rejected_logps": -292.67999267578125, + "debug/sppo_chosen_loss": 2535.425537109375, + "debug/sppo_chosen_reward_in_loss": -0.3420942425727844, + "debug/sppo_rej_reward_in_loss": -0.2705673277378082, + "debug/sppo_reject_loss": 2473.82861328125, + "epoch": 0.2898550724637681, + "grad_norm": 58321.48565606706, + "learning_rate": 8e-08, + "logits/chosen": 1.3575584888458252, + "logits/rejected": 1.7402280569076538, + "logps/chosen": -263.7048034667969, + "logps/rejected": -292.9505310058594, + "loss": 4986.0457, + "rewards/accuracies": 0.550000011920929, + "rewards/chosen": -0.0034209422301501036, + "rewards/margins": -0.0007152691250666976, + "rewards/rejected": -0.002705672988668084, + "step": 80 + }, + { + "debug/policy_chosen_logits": 1.495825171470642, + "debug/policy_chosen_logps": -239.78091430664062, + "debug/policy_rejected_logits": 2.119466781616211, + "debug/policy_rejected_logps": -303.726318359375, + "debug/reference_chosen_logps": -239.67935180664062, + "debug/reference_rejected_logps": -303.4689636230469, + "debug/sppo_chosen_loss": 2511.033203125, + "debug/sppo_chosen_reward_in_loss": -0.10157432407140732, + "debug/sppo_rej_reward_in_loss": -0.25736045837402344, + "debug/sppo_reject_loss": 2474.99365234375, + "epoch": 0.3079710144927536, + "grad_norm": 70326.60868995877, + "learning_rate": 8.5e-08, + "logits/chosen": 1.495825171470642, + "logits/rejected": 2.119466781616211, + "logps/chosen": -239.78091430664062, + "logps/rejected": -303.726318359375, + "loss": 4978.4234, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -0.0010157432407140732, + "rewards/margins": 0.0015578612219542265, + "rewards/rejected": -0.0025736044626682997, + "step": 85 + }, + { + "debug/policy_chosen_logits": 1.2334277629852295, + "debug/policy_chosen_logps": -251.84042358398438, + "debug/policy_rejected_logits": 1.418172836303711, + "debug/policy_rejected_logps": -281.77276611328125, + "debug/reference_chosen_logps": -251.71041870117188, + "debug/reference_rejected_logps": -281.41546630859375, + "debug/sppo_chosen_loss": 2513.858154296875, + "debug/sppo_chosen_reward_in_loss": -0.1299985945224762, + "debug/sppo_rej_reward_in_loss": -0.3572982847690582, + "debug/sppo_reject_loss": 2465.033935546875, + "epoch": 0.32608695652173914, + "grad_norm": 64153.45121330528, + "learning_rate": 9e-08, + "logits/chosen": 1.2334277629852295, + "logits/rejected": 1.418172836303711, + "logps/chosen": -251.84042358398438, + "logps/rejected": -281.77276611328125, + "loss": 4968.8496, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.001299985800869763, + "rewards/margins": 0.002272996585816145, + "rewards/rejected": -0.0035729825031012297, + "step": 90 + }, + { + "debug/policy_chosen_logits": 1.4227510690689087, + "debug/policy_chosen_logps": -250.6143035888672, + "debug/policy_rejected_logits": 1.7439839839935303, + "debug/policy_rejected_logps": -265.55133056640625, + "debug/reference_chosen_logps": -250.3922119140625, + "debug/reference_rejected_logps": -265.2495422363281, + "debug/sppo_chosen_loss": 2522.94677734375, + "debug/sppo_chosen_reward_in_loss": -0.22208480536937714, + "debug/sppo_rej_reward_in_loss": -0.3018133044242859, + "debug/sppo_reject_loss": 2470.69970703125, + "epoch": 0.3442028985507246, + "grad_norm": 69704.93653164264, + "learning_rate": 9.499999999999999e-08, + "logits/chosen": 1.4227510690689087, + "logits/rejected": 1.7439839839935303, + "logps/chosen": -250.6143035888672, + "logps/rejected": -265.55133056640625, + "loss": 4967.1359, + "rewards/accuracies": 0.5249999761581421, + "rewards/chosen": -0.002220848109573126, + "rewards/margins": 0.0007972849416546524, + "rewards/rejected": -0.0030181333422660828, + "step": 95 + }, + { + "debug/policy_chosen_logits": 1.181106448173523, + "debug/policy_chosen_logps": -229.50833129882812, + "debug/policy_rejected_logits": 1.8651745319366455, + "debug/policy_rejected_logps": -291.35162353515625, + "debug/reference_chosen_logps": -229.11221313476562, + "debug/reference_rejected_logps": -290.591552734375, + "debug/sppo_chosen_loss": 2540.667724609375, + "debug/sppo_chosen_reward_in_loss": -0.3961181640625, + "debug/sppo_rej_reward_in_loss": -0.7600471377372742, + "debug/sppo_reject_loss": 2425.351318359375, + "epoch": 0.36231884057971014, + "grad_norm": 61806.31661274745, + "learning_rate": 1e-07, + "logits/chosen": 1.181106448173523, + "logits/rejected": 1.8651745319366455, + "logps/chosen": -229.50833129882812, + "logps/rejected": -291.35162353515625, + "loss": 4970.1539, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -0.0039611817337572575, + "rewards/margins": 0.0036392901092767715, + "rewards/rejected": -0.007600471377372742, + "step": 100 + }, + { + "epoch": 0.36231884057971014, + "eval_debug/policy_chosen_logits": 1.6355112791061401, + "eval_debug/policy_chosen_logps": -253.23245239257812, + "eval_debug/policy_rejected_logits": 1.6972817182540894, + "eval_debug/policy_rejected_logps": -260.1171875, + "eval_debug/reference_chosen_logps": -252.91845703125, + "eval_debug/reference_rejected_logps": -259.6585998535156, + "eval_debug/sppo_chosen_loss": 2532.337158203125, + "eval_debug/sppo_chosen_reward_in_loss": -0.3139660954475403, + "eval_debug/sppo_rej_reward_in_loss": -0.45856496691703796, + "eval_debug/sppo_reject_loss": 2455.31591796875, + "eval_logits/chosen": 1.6355112791061401, + "eval_logits/rejected": 1.6972817182540894, + "eval_logps/chosen": -253.23245239257812, + "eval_logps/rejected": -260.1171875, + "eval_loss": 4979.080078125, + "eval_rewards/accuracies": 0.5657894611358643, + "eval_rewards/chosen": -0.0031396609265357256, + "eval_rewards/margins": 0.0014459885424003005, + "eval_rewards/rejected": -0.004585649818181992, + "eval_runtime": 28.5359, + "eval_samples_per_second": 21.026, + "eval_steps_per_second": 0.666, + "step": 100 + }, + { + "debug/policy_chosen_logits": 1.1145771741867065, + "debug/policy_chosen_logps": -245.8112030029297, + "debug/policy_rejected_logits": 1.2326147556304932, + "debug/policy_rejected_logps": -293.15155029296875, + "debug/reference_chosen_logps": -246.10745239257812, + "debug/reference_rejected_logps": -293.17578125, + "debug/sppo_chosen_loss": 2471.974365234375, + "debug/sppo_chosen_reward_in_loss": 0.2962339520454407, + "debug/sppo_rej_reward_in_loss": 0.024216841906309128, + "debug/sppo_reject_loss": 2503.906982421875, + "epoch": 0.3804347826086957, + "grad_norm": 67474.7309879143, + "learning_rate": 9.999861184954399e-08, + "logits/chosen": 1.1145771741867065, + "logits/rejected": 1.2326147556304932, + "logps/chosen": -245.8112030029297, + "logps/rejected": -293.15155029296875, + "loss": 4954.6148, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": 0.0029623392038047314, + "rewards/margins": 0.002720171120017767, + "rewards/rejected": 0.00024216854944825172, + "step": 105 + }, + { + "debug/policy_chosen_logits": 1.262205719947815, + "debug/policy_chosen_logps": -252.00979614257812, + "debug/policy_rejected_logits": 1.5525165796279907, + "debug/policy_rejected_logps": -277.41937255859375, + "debug/reference_chosen_logps": -251.8131561279297, + "debug/reference_rejected_logps": -276.50152587890625, + "debug/sppo_chosen_loss": 2520.6708984375, + "debug/sppo_chosen_reward_in_loss": -0.19663181900978088, + "debug/sppo_rej_reward_in_loss": -0.9178388714790344, + "debug/sppo_reject_loss": 2410.38037109375, + "epoch": 0.39855072463768115, + "grad_norm": 54919.44910846822, + "learning_rate": 9.999444747525447e-08, + "logits/chosen": 1.262205719947815, + "logits/rejected": 1.5525165796279907, + "logps/chosen": -252.00979614257812, + "logps/rejected": -277.41937255859375, + "loss": 4954.4176, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.001966318115592003, + "rewards/margins": 0.007212069816887379, + "rewards/rejected": -0.009178387932479382, + "step": 110 + }, + { + "debug/policy_chosen_logits": 1.517817735671997, + "debug/policy_chosen_logps": -240.8375244140625, + "debug/policy_rejected_logits": 1.9009748697280884, + "debug/policy_rejected_logps": -302.6877746582031, + "debug/reference_chosen_logps": -240.13211059570312, + "debug/reference_rejected_logps": -301.23028564453125, + "debug/sppo_chosen_loss": 2572.050048828125, + "debug/sppo_chosen_reward_in_loss": -0.7054191827774048, + "debug/sppo_rej_reward_in_loss": -1.457501769065857, + "debug/sppo_reject_loss": 2358.313720703125, + "epoch": 0.4166666666666667, + "grad_norm": 60632.14793206507, + "learning_rate": 9.998750710836255e-08, + "logits/chosen": 1.517817735671997, + "logits/rejected": 1.9009748697280884, + "logps/chosen": -240.8375244140625, + "logps/rejected": -302.6877746582031, + "loss": 4951.5859, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -0.007054192014038563, + "rewards/margins": 0.007520826067775488, + "rewards/rejected": -0.014575016684830189, + "step": 115 + }, + { + "debug/policy_chosen_logits": 1.3009164333343506, + "debug/policy_chosen_logps": -239.3059539794922, + "debug/policy_rejected_logits": 1.6617835760116577, + "debug/policy_rejected_logps": -276.4163513183594, + "debug/reference_chosen_logps": -238.712158203125, + "debug/reference_rejected_logps": -275.2547302246094, + "debug/sppo_chosen_loss": 2561.05419921875, + "debug/sppo_chosen_reward_in_loss": -0.5937992334365845, + "debug/sppo_rej_reward_in_loss": -1.1615955829620361, + "debug/sppo_reject_loss": 2387.34130859375, + "epoch": 0.43478260869565216, + "grad_norm": 71791.48071321512, + "learning_rate": 9.997779113423914e-08, + "logits/chosen": 1.3009164333343506, + "logits/rejected": 1.6617835760116577, + "logps/chosen": -239.3059539794922, + "logps/rejected": -276.4163513183594, + "loss": 4950.5766, + "rewards/accuracies": 0.574999988079071, + "rewards/chosen": -0.005937992129474878, + "rewards/margins": 0.005677963141351938, + "rewards/rejected": -0.011615955270826817, + "step": 120 + }, + { + "debug/policy_chosen_logits": 1.38350510597229, + "debug/policy_chosen_logps": -240.8602752685547, + "debug/policy_rejected_logits": 1.6243913173675537, + "debug/policy_rejected_logps": -272.2510681152344, + "debug/reference_chosen_logps": -240.1095428466797, + "debug/reference_rejected_logps": -271.2296447753906, + "debug/sppo_chosen_loss": 2576.484130859375, + "debug/sppo_chosen_reward_in_loss": -0.7507423162460327, + "debug/sppo_rej_reward_in_loss": -1.021427869796753, + "debug/sppo_reject_loss": 2400.478271484375, + "epoch": 0.4528985507246377, + "grad_norm": 151701.3681973675, + "learning_rate": 9.996530009237363e-08, + "logits/chosen": 1.38350510597229, + "logits/rejected": 1.6243913173675537, + "logps/chosen": -240.8602752685547, + "logps/rejected": -272.2510681152344, + "loss": 4954.3414, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.007507423870265484, + "rewards/margins": 0.0027068553026765585, + "rewards/rejected": -0.010214278474450111, + "step": 125 + }, + { + "debug/policy_chosen_logits": 1.1746623516082764, + "debug/policy_chosen_logps": -251.74844360351562, + "debug/policy_rejected_logits": 1.6008373498916626, + "debug/policy_rejected_logps": -297.63568115234375, + "debug/reference_chosen_logps": -250.86929321289062, + "debug/reference_rejected_logps": -296.35479736328125, + "debug/sppo_chosen_loss": 2590.117919921875, + "debug/sppo_chosen_reward_in_loss": -0.8791602849960327, + "debug/sppo_rej_reward_in_loss": -1.2808887958526611, + "debug/sppo_reject_loss": 2374.90869140625, + "epoch": 0.47101449275362317, + "grad_norm": 70424.18585342077, + "learning_rate": 9.995003467634381e-08, + "logits/chosen": 1.1746623516082764, + "logits/rejected": 1.6008373498916626, + "logps/chosen": -251.74844360351562, + "logps/rejected": -297.63568115234375, + "loss": 4954.9508, + "rewards/accuracies": 0.574999988079071, + "rewards/chosen": -0.008791603147983551, + "rewards/margins": 0.004017284605652094, + "rewards/rejected": -0.012808887287974358, + "step": 130 + }, + { + "debug/policy_chosen_logits": 1.3090918064117432, + "debug/policy_chosen_logps": -248.5448455810547, + "debug/policy_rejected_logits": 1.7777769565582275, + "debug/policy_rejected_logps": -299.4295349121094, + "debug/reference_chosen_logps": -247.79171752929688, + "debug/reference_rejected_logps": -298.0932312011719, + "debug/sppo_chosen_loss": 2577.368896484375, + "debug/sppo_chosen_reward_in_loss": -0.7531425356864929, + "debug/sppo_rej_reward_in_loss": -1.3363120555877686, + "debug/sppo_reject_loss": 2369.63916015625, + "epoch": 0.4891304347826087, + "grad_norm": 55772.8366539, + "learning_rate": 9.99319957337775e-08, + "logits/chosen": 1.3090918064117432, + "logits/rejected": 1.7777769565582275, + "logps/chosen": -248.5448455810547, + "logps/rejected": -299.4295349121094, + "loss": 4953.4102, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -0.007531425449997187, + "rewards/margins": 0.005831695627421141, + "rewards/rejected": -0.013363120146095753, + "step": 135 + }, + { + "debug/policy_chosen_logits": 1.6867640018463135, + "debug/policy_chosen_logps": -267.0757751464844, + "debug/policy_rejected_logits": 1.9828193187713623, + "debug/policy_rejected_logps": -309.44317626953125, + "debug/reference_chosen_logps": -266.6741638183594, + "debug/reference_rejected_logps": -308.5638427734375, + "debug/sppo_chosen_loss": 2542.128662109375, + "debug/sppo_chosen_reward_in_loss": -0.40160447359085083, + "debug/sppo_rej_reward_in_loss": -0.8793373107910156, + "debug/sppo_reject_loss": 2414.30419921875, + "epoch": 0.5072463768115942, + "grad_norm": 87804.69325443542, + "learning_rate": 9.991118426630531e-08, + "logits/chosen": 1.6867640018463135, + "logits/rejected": 1.9828193187713623, + "logps/chosen": -267.0757751464844, + "logps/rejected": -309.44317626953125, + "loss": 4954.575, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.004016044549643993, + "rewards/margins": 0.004777328111231327, + "rewards/rejected": -0.00879337266087532, + "step": 140 + }, + { + "debug/policy_chosen_logits": 1.2096970081329346, + "debug/policy_chosen_logps": -240.99169921875, + "debug/policy_rejected_logits": 1.5321658849716187, + "debug/policy_rejected_logps": -282.29827880859375, + "debug/reference_chosen_logps": -240.69192504882812, + "debug/reference_rejected_logps": -281.6559753417969, + "debug/sppo_chosen_loss": 2530.816162109375, + "debug/sppo_chosen_reward_in_loss": -0.2997651994228363, + "debug/sppo_rej_reward_in_loss": -0.6423038244247437, + "debug/sppo_reject_loss": 2437.80322265625, + "epoch": 0.5253623188405797, + "grad_norm": 62100.452115897504, + "learning_rate": 9.988760142950516e-08, + "logits/chosen": 1.2096970081329346, + "logits/rejected": 1.5321658849716187, + "logps/chosen": -240.99169921875, + "logps/rejected": -282.29827880859375, + "loss": 4960.2316, + "rewards/accuracies": 0.574999988079071, + "rewards/chosen": -0.0029976521618664265, + "rewards/margins": 0.0034253865014761686, + "rewards/rejected": -0.0064230384305119514, + "step": 145 + }, + { + "debug/policy_chosen_logits": 1.5637954473495483, + "debug/policy_chosen_logps": -266.9645080566406, + "debug/policy_rejected_logits": 2.029578924179077, + "debug/policy_rejected_logps": -296.61737060546875, + "debug/reference_chosen_logps": -266.7818298339844, + "debug/reference_rejected_logps": -295.389892578125, + "debug/sppo_chosen_loss": 2519.594970703125, + "debug/sppo_chosen_reward_in_loss": -0.18268242478370667, + "debug/sppo_rej_reward_in_loss": -1.2275073528289795, + "debug/sppo_reject_loss": 2380.341796875, + "epoch": 0.5434782608695652, + "grad_norm": 105223.1261169405, + "learning_rate": 9.98612485328381e-08, + "logits/chosen": 1.5637954473495483, + "logits/rejected": 2.029578924179077, + "logps/chosen": -266.9645080566406, + "logps/rejected": -296.61737060546875, + "loss": 4930.8102, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -0.0018268240382894874, + "rewards/margins": 0.010448249988257885, + "rewards/rejected": -0.012275073677301407, + "step": 150 + }, + { + "debug/policy_chosen_logits": 1.3448460102081299, + "debug/policy_chosen_logps": -241.264892578125, + "debug/policy_rejected_logits": 1.6951926946640015, + "debug/policy_rejected_logps": -277.68731689453125, + "debug/reference_chosen_logps": -240.8543701171875, + "debug/reference_rejected_logps": -276.788818359375, + "debug/sppo_chosen_loss": 2542.348876953125, + "debug/sppo_chosen_reward_in_loss": -0.4105297029018402, + "debug/sppo_rej_reward_in_loss": -0.8985021710395813, + "debug/sppo_reject_loss": 2412.73291015625, + "epoch": 0.5615942028985508, + "grad_norm": 72896.69066553783, + "learning_rate": 9.983212703957554e-08, + "logits/chosen": 1.3448460102081299, + "logits/rejected": 1.6951926946640015, + "logps/chosen": -241.264892578125, + "logps/rejected": -277.68731689453125, + "loss": 4944.0086, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.004105296917259693, + "rewards/margins": 0.004879724234342575, + "rewards/rejected": -0.008985022082924843, + "step": 155 + }, + { + "debug/policy_chosen_logits": 1.077109932899475, + "debug/policy_chosen_logps": -255.0774383544922, + "debug/policy_rejected_logits": 1.5379952192306519, + "debug/policy_rejected_logps": -297.5126953125, + "debug/reference_chosen_logps": -254.5387420654297, + "debug/reference_rejected_logps": -295.9001770019531, + "debug/sppo_chosen_loss": 2555.77099609375, + "debug/sppo_chosen_reward_in_loss": -0.5386981964111328, + "debug/sppo_rej_reward_in_loss": -1.6125160455703735, + "debug/sppo_reject_loss": 2343.542724609375, + "epoch": 0.5797101449275363, + "grad_norm": 93485.02078548388, + "learning_rate": 9.980023856671804e-08, + "logits/chosen": 1.077109932899475, + "logits/rejected": 1.5379952192306519, + "logps/chosen": -255.0774383544922, + "logps/rejected": -297.5126953125, + "loss": 4954.2586, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.005386981647461653, + "rewards/margins": 0.010738177224993706, + "rewards/rejected": -0.016125161200761795, + "step": 160 + }, + { + "debug/policy_chosen_logits": 1.5786542892456055, + "debug/policy_chosen_logps": -269.99346923828125, + "debug/policy_rejected_logits": 1.402779221534729, + "debug/policy_rejected_logps": -263.34893798828125, + "debug/reference_chosen_logps": -269.4091491699219, + "debug/reference_rejected_logps": -262.3013610839844, + "debug/sppo_chosen_loss": 2560.51904296875, + "debug/sppo_chosen_reward_in_loss": -0.5842826962471008, + "debug/sppo_rej_reward_in_loss": -1.0475749969482422, + "debug/sppo_reject_loss": 2399.6630859375, + "epoch": 0.5978260869565217, + "grad_norm": 92597.71266050417, + "learning_rate": 9.976558488490555e-08, + "logits/chosen": 1.5786542892456055, + "logits/rejected": 1.402779221534729, + "logps/chosen": -269.99346923828125, + "logps/rejected": -263.34893798828125, + "loss": 4936.507, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -0.0058428263291716576, + "rewards/margins": 0.004632922820746899, + "rewards/rejected": -0.01047575008124113, + "step": 165 + }, + { + "debug/policy_chosen_logits": 1.290741205215454, + "debug/policy_chosen_logps": -236.67715454101562, + "debug/policy_rejected_logits": 1.5977189540863037, + "debug/policy_rejected_logps": -291.2699279785156, + "debug/reference_chosen_logps": -238.73080444335938, + "debug/reference_rejected_logps": -293.11700439453125, + "debug/sppo_chosen_loss": 2302.3115234375, + "debug/sppo_chosen_reward_in_loss": 2.053657054901123, + "debug/sppo_rej_reward_in_loss": 1.8470966815948486, + "debug/sppo_reject_loss": 2691.03515625, + "epoch": 0.6159420289855072, + "grad_norm": 98094.66729862805, + "learning_rate": 9.972816791831899e-08, + "logits/chosen": 1.290741205215454, + "logits/rejected": 1.5977189540863037, + "logps/chosen": -236.67715454101562, + "logps/rejected": -291.2699279785156, + "loss": 4975.907, + "rewards/accuracies": 0.574999988079071, + "rewards/chosen": 0.020536571741104126, + "rewards/margins": 0.002065605018287897, + "rewards/rejected": 0.018470967188477516, + "step": 170 + }, + { + "debug/policy_chosen_logits": 1.4175643920898438, + "debug/policy_chosen_logps": -263.4878845214844, + "debug/policy_rejected_logits": 1.8533436059951782, + "debug/policy_rejected_logps": -309.7564697265625, + "debug/reference_chosen_logps": -263.7447814941406, + "debug/reference_rejected_logps": -309.0677795410156, + "debug/sppo_chosen_loss": 2477.23681640625, + "debug/sppo_chosen_reward_in_loss": 0.2568736970424652, + "debug/sppo_rej_reward_in_loss": -0.6886796951293945, + "debug/sppo_reject_loss": 2437.29541015625, + "epoch": 0.6340579710144928, + "grad_norm": 63312.73342726648, + "learning_rate": 9.968798974457359e-08, + "logits/chosen": 1.4175643920898438, + "logits/rejected": 1.8533436059951782, + "logps/chosen": -263.4878845214844, + "logps/rejected": -309.7564697265625, + "loss": 4920.0543, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.0025687366724014282, + "rewards/margins": 0.00945553369820118, + "rewards/rejected": -0.006886796560138464, + "step": 175 + }, + { + "debug/policy_chosen_logits": 1.2178828716278076, + "debug/policy_chosen_logps": -246.9923095703125, + "debug/policy_rejected_logits": 1.5837467908859253, + "debug/policy_rejected_logps": -289.5892028808594, + "debug/reference_chosen_logps": -246.3313446044922, + "debug/reference_rejected_logps": -288.1348876953125, + "debug/sppo_chosen_loss": 2569.297119140625, + "debug/sppo_chosen_reward_in_loss": -0.6609573364257812, + "debug/sppo_rej_reward_in_loss": -1.4543180465698242, + "debug/sppo_reject_loss": 2363.291015625, + "epoch": 0.6521739130434783, + "grad_norm": 73130.10544261185, + "learning_rate": 9.964505259460332e-08, + "logits/chosen": 1.2178828716278076, + "logits/rejected": 1.5837467908859253, + "logps/chosen": -246.9923095703125, + "logps/rejected": -289.5892028808594, + "loss": 4913.6719, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.006609573028981686, + "rewards/margins": 0.007933606393635273, + "rewards/rejected": -0.014543181285262108, + "step": 180 + }, + { + "debug/policy_chosen_logits": 1.19364333152771, + "debug/policy_chosen_logps": -248.1702423095703, + "debug/policy_rejected_logits": 1.6476013660430908, + "debug/policy_rejected_logps": -322.3099365234375, + "debug/reference_chosen_logps": -246.93405151367188, + "debug/reference_rejected_logps": -319.759765625, + "debug/sppo_chosen_loss": 2628.030029296875, + "debug/sppo_chosen_reward_in_loss": -1.2361927032470703, + "debug/sppo_rej_reward_in_loss": -2.5501551628112793, + "debug/sppo_reject_loss": 2257.75341796875, + "epoch": 0.6702898550724637, + "grad_norm": 62164.347458530305, + "learning_rate": 9.959935885253715e-08, + "logits/chosen": 1.19364333152771, + "logits/rejected": 1.6476013660430908, + "logps/chosen": -248.1702423095703, + "logps/rejected": -322.3099365234375, + "loss": 4902.393, + "rewards/accuracies": 0.5249999761581421, + "rewards/chosen": -0.012361926957964897, + "rewards/margins": 0.013139625079929829, + "rewards/rejected": -0.025501549243927002, + "step": 185 + }, + { + "debug/policy_chosen_logits": 1.20594322681427, + "debug/policy_chosen_logps": -251.36129760742188, + "debug/policy_rejected_logits": 1.2766417264938354, + "debug/policy_rejected_logps": -275.73919677734375, + "debug/reference_chosen_logps": -250.4452362060547, + "debug/reference_rejected_logps": -273.4372863769531, + "debug/sppo_chosen_loss": 2595.31591796875, + "debug/sppo_chosen_reward_in_loss": -0.9160749316215515, + "debug/sppo_rej_reward_in_loss": -2.3019137382507324, + "debug/sppo_reject_loss": 2283.59375, + "epoch": 0.6884057971014492, + "grad_norm": 61551.26752821376, + "learning_rate": 9.955091105556664e-08, + "logits/chosen": 1.20594322681427, + "logits/rejected": 1.2766417264938354, + "logps/chosen": -251.36129760742188, + "logps/rejected": -275.73919677734375, + "loss": 4926.9672, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -0.009160749614238739, + "rewards/margins": 0.013858387246727943, + "rewards/rejected": -0.023019134998321533, + "step": 190 + }, + { + "debug/policy_chosen_logits": 1.7529083490371704, + "debug/policy_chosen_logps": -272.32366943359375, + "debug/policy_rejected_logits": 2.1265456676483154, + "debug/policy_rejected_logps": -313.41241455078125, + "debug/reference_chosen_logps": -270.6104736328125, + "debug/reference_rejected_logps": -310.8916320800781, + "debug/sppo_chosen_loss": 2683.697021484375, + "debug/sppo_chosen_reward_in_loss": -1.7131826877593994, + "debug/sppo_rej_reward_in_loss": -2.5207762718200684, + "debug/sppo_reject_loss": 2260.98779296875, + "epoch": 0.7065217391304348, + "grad_norm": 64279.10209786374, + "learning_rate": 9.949971189380507e-08, + "logits/chosen": 1.7529083490371704, + "logits/rejected": 2.1265456676483154, + "logps/chosen": -272.32366943359375, + "logps/rejected": -313.41241455078125, + "loss": 4930.4125, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -0.017131825909018517, + "rewards/margins": 0.008075936697423458, + "rewards/rejected": -0.02520776353776455, + "step": 195 + }, + { + "debug/policy_chosen_logits": 1.2168382406234741, + "debug/policy_chosen_logps": -283.47161865234375, + "debug/policy_rejected_logits": 1.5349094867706299, + "debug/policy_rejected_logps": -290.456787109375, + "debug/reference_chosen_logps": -282.74090576171875, + "debug/reference_rejected_logps": -288.225341796875, + "debug/sppo_chosen_loss": 2577.8662109375, + "debug/sppo_chosen_reward_in_loss": -0.7306663393974304, + "debug/sppo_rej_reward_in_loss": -2.2314419746398926, + "debug/sppo_reject_loss": 2285.576171875, + "epoch": 0.7246376811594203, + "grad_norm": 60882.04316379648, + "learning_rate": 9.944576421013802e-08, + "logits/chosen": 1.2168382406234741, + "logits/rejected": 1.5349094867706299, + "logps/chosen": -283.47161865234375, + "logps/rejected": -290.456787109375, + "loss": 4913.6875, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": -0.007306662853807211, + "rewards/margins": 0.015007754787802696, + "rewards/rejected": -0.022314418107271194, + "step": 200 + }, + { + "epoch": 0.7246376811594203, + "eval_debug/policy_chosen_logits": 1.6047344207763672, + "eval_debug/policy_chosen_logps": -253.59323120117188, + "eval_debug/policy_rejected_logits": 1.6657979488372803, + "eval_debug/policy_rejected_logps": -260.560546875, + "eval_debug/reference_chosen_logps": -252.91845703125, + "eval_debug/reference_rejected_logps": -259.6585998535156, + "eval_debug/sppo_chosen_loss": 2570.339111328125, + "eval_debug/sppo_chosen_reward_in_loss": -0.6747744083404541, + "eval_debug/sppo_rej_reward_in_loss": -0.9019157886505127, + "eval_debug/sppo_reject_loss": 2415.142578125, + "eval_logits/chosen": 1.6047344207763672, + "eval_logits/rejected": 1.6657979488372803, + "eval_logps/chosen": -253.59323120117188, + "eval_logps/rejected": -260.560546875, + "eval_loss": 4922.29638671875, + "eval_rewards/accuracies": 0.5394737124443054, + "eval_rewards/chosen": -0.006747743580490351, + "eval_rewards/margins": 0.002271413803100586, + "eval_rewards/rejected": -0.009019157849252224, + "eval_runtime": 28.6024, + "eval_samples_per_second": 20.977, + "eval_steps_per_second": 0.664, + "step": 200 + }, + { + "debug/policy_chosen_logits": 1.3171964883804321, + "debug/policy_chosen_logps": -250.88916015625, + "debug/policy_rejected_logits": 1.6703647375106812, + "debug/policy_rejected_logps": -307.4503479003906, + "debug/reference_chosen_logps": -250.2000274658203, + "debug/reference_rejected_logps": -305.8931579589844, + "debug/sppo_chosen_loss": 2571.575439453125, + "debug/sppo_chosen_reward_in_loss": -0.6891248822212219, + "debug/sppo_rej_reward_in_loss": -1.5571798086166382, + "debug/sppo_reject_loss": 2350.726806640625, + "epoch": 0.7427536231884058, + "grad_norm": 77669.5503273904, + "learning_rate": 9.938907100006552e-08, + "logits/chosen": 1.3171964883804321, + "logits/rejected": 1.6703647375106812, + "logps/chosen": -250.88916015625, + "logps/rejected": -307.4503479003906, + "loss": 4928.9711, + "rewards/accuracies": 0.574999988079071, + "rewards/chosen": -0.006891248282045126, + "rewards/margins": 0.008680549450218678, + "rewards/rejected": -0.015571797266602516, + "step": 205 + }, + { + "debug/policy_chosen_logits": 1.1587716341018677, + "debug/policy_chosen_logps": -259.26190185546875, + "debug/policy_rejected_logits": 1.460614562034607, + "debug/policy_rejected_logps": -255.4847869873047, + "debug/reference_chosen_logps": -258.54693603515625, + "debug/reference_rejected_logps": -254.0518035888672, + "debug/sppo_chosen_loss": 2574.91796875, + "debug/sppo_chosen_reward_in_loss": -0.7149562835693359, + "debug/sppo_rej_reward_in_loss": -1.4329769611358643, + "debug/sppo_reject_loss": 2362.268798828125, + "epoch": 0.7608695652173914, + "grad_norm": 56509.01147720915, + "learning_rate": 9.932963541153584e-08, + "logits/chosen": 1.1587716341018677, + "logits/rejected": 1.460614562034607, + "logps/chosen": -259.26190185546875, + "logps/rejected": -255.4847869873047, + "loss": 4912.7797, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -0.007149563170969486, + "rewards/margins": 0.007180205546319485, + "rewards/rejected": -0.014329768717288971, + "step": 210 + }, + { + "debug/policy_chosen_logits": 1.4320160150527954, + "debug/policy_chosen_logps": -268.22747802734375, + "debug/policy_rejected_logits": 1.8260990381240845, + "debug/policy_rejected_logps": -298.08526611328125, + "debug/reference_chosen_logps": -267.0002136230469, + "debug/reference_rejected_logps": -295.2477111816406, + "debug/sppo_chosen_loss": 2628.953857421875, + "debug/sppo_chosen_reward_in_loss": -1.227246642112732, + "debug/sppo_rej_reward_in_loss": -2.8375353813171387, + "debug/sppo_reject_loss": 2234.884033203125, + "epoch": 0.7789855072463768, + "grad_norm": 57238.35237205925, + "learning_rate": 9.926746074477053e-08, + "logits/chosen": 1.4320160150527954, + "logits/rejected": 1.8260990381240845, + "logps/chosen": -268.22747802734375, + "logps/rejected": -298.08526611328125, + "loss": 4883.1227, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -0.012272466905415058, + "rewards/margins": 0.01610288769006729, + "rewards/rejected": -0.028375351801514626, + "step": 215 + }, + { + "debug/policy_chosen_logits": 0.9355718493461609, + "debug/policy_chosen_logps": -250.63589477539062, + "debug/policy_rejected_logits": 1.2643530368804932, + "debug/policy_rejected_logps": -276.95855712890625, + "debug/reference_chosen_logps": -249.66796875, + "debug/reference_rejected_logps": -274.86138916015625, + "debug/sppo_chosen_loss": 2604.803466796875, + "debug/sppo_chosen_reward_in_loss": -0.9679214358329773, + "debug/sppo_rej_reward_in_loss": -2.0971500873565674, + "debug/sppo_reject_loss": 2302.240966796875, + "epoch": 0.7971014492753623, + "grad_norm": 62378.00382376367, + "learning_rate": 9.920255045208128e-08, + "logits/chosen": 0.9355718493461609, + "logits/rejected": 1.2643530368804932, + "logps/chosen": -250.63589477539062, + "logps/rejected": -276.95855712890625, + "loss": 4879.9465, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -0.009679214097559452, + "rewards/margins": 0.011292284354567528, + "rewards/rejected": -0.020971499383449554, + "step": 220 + }, + { + "debug/policy_chosen_logits": 1.3104256391525269, + "debug/policy_chosen_logps": -283.29400634765625, + "debug/policy_rejected_logits": 1.6018108129501343, + "debug/policy_rejected_logps": -291.35772705078125, + "debug/reference_chosen_logps": -282.43328857421875, + "debug/reference_rejected_logps": -289.2051696777344, + "debug/sppo_chosen_loss": 2590.73828125, + "debug/sppo_chosen_reward_in_loss": -0.8607318997383118, + "debug/sppo_rej_reward_in_loss": -2.1526076793670654, + "debug/sppo_reject_loss": 2297.698974609375, + "epoch": 0.8152173913043478, + "grad_norm": 65152.871244207236, + "learning_rate": 9.913490813767816e-08, + "logits/chosen": 1.3104256391525269, + "logits/rejected": 1.6018108129501343, + "logps/chosen": -283.29400634765625, + "logps/rejected": -291.35772705078125, + "loss": 4881.9191, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.008607318624854088, + "rewards/margins": 0.01291875820606947, + "rewards/rejected": -0.021526077762246132, + "step": 225 + }, + { + "debug/policy_chosen_logits": 1.1648050546646118, + "debug/policy_chosen_logps": -233.7320098876953, + "debug/policy_rejected_logits": 1.381399393081665, + "debug/policy_rejected_logps": -276.8841552734375, + "debug/reference_chosen_logps": -233.37783813476562, + "debug/reference_rejected_logps": -275.42218017578125, + "debug/sppo_chosen_loss": 2539.67919921875, + "debug/sppo_chosen_reward_in_loss": -0.35420626401901245, + "debug/sppo_rej_reward_in_loss": -1.4619684219360352, + "debug/sppo_reject_loss": 2360.643310546875, + "epoch": 0.8333333333333334, + "grad_norm": 55449.34911003317, + "learning_rate": 9.906453755746957e-08, + "logits/chosen": 1.1648050546646118, + "logits/rejected": 1.381399393081665, + "logps/chosen": -233.7320098876953, + "logps/rejected": -276.8841552734375, + "loss": 4885.1031, + "rewards/accuracies": 0.550000011920929, + "rewards/chosen": -0.0035420632921159267, + "rewards/margins": 0.011077621951699257, + "rewards/rejected": -0.014619683846831322, + "step": 230 + }, + { + "debug/policy_chosen_logits": 1.3417161703109741, + "debug/policy_chosen_logps": -252.96591186523438, + "debug/policy_rejected_logits": 1.5157549381256104, + "debug/policy_rejected_logps": -298.88677978515625, + "debug/reference_chosen_logps": -251.4791259765625, + "debug/reference_rejected_logps": -296.46527099609375, + "debug/sppo_chosen_loss": 2657.89697265625, + "debug/sppo_chosen_reward_in_loss": -1.4867690801620483, + "debug/sppo_rej_reward_in_loss": -2.4215126037597656, + "debug/sppo_reject_loss": 2273.120849609375, + "epoch": 0.8514492753623188, + "grad_norm": 100941.82559973896, + "learning_rate": 9.899144261885363e-08, + "logits/chosen": 1.3417161703109741, + "logits/rejected": 1.5157549381256104, + "logps/chosen": -252.96591186523438, + "logps/rejected": -298.88677978515625, + "loss": 4858.2457, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -0.01486769039183855, + "rewards/margins": 0.009347434155642986, + "rewards/rejected": -0.024215126410126686, + "step": 235 + }, + { + "debug/policy_chosen_logits": 1.3339102268218994, + "debug/policy_chosen_logps": -243.60916137695312, + "debug/policy_rejected_logits": 1.9277820587158203, + "debug/policy_rejected_logps": -299.7930908203125, + "debug/reference_chosen_logps": -242.9955291748047, + "debug/reference_rejected_logps": -297.2693786621094, + "debug/sppo_chosen_loss": 2566.054931640625, + "debug/sppo_chosen_reward_in_loss": -0.6136573553085327, + "debug/sppo_rej_reward_in_loss": -2.523709535598755, + "debug/sppo_reject_loss": 2259.904052734375, + "epoch": 0.8695652173913043, + "grad_norm": 59001.90623118747, + "learning_rate": 9.891562738050125e-08, + "logits/chosen": 1.3339102268218994, + "logits/rejected": 1.9277820587158203, + "logps/chosen": -243.60916137695312, + "logps/rejected": -299.7930908203125, + "loss": 4878.6145, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.006136573851108551, + "rewards/margins": 0.01910051889717579, + "rewards/rejected": -0.02523709461092949, + "step": 240 + }, + { + "debug/policy_chosen_logits": 1.0627963542938232, + "debug/policy_chosen_logps": -232.9734344482422, + "debug/policy_rejected_logits": 1.4089231491088867, + "debug/policy_rejected_logps": -283.24371337890625, + "debug/reference_chosen_logps": -233.43692016601562, + "debug/reference_rejected_logps": -282.2871398925781, + "debug/sppo_chosen_loss": 2458.481201171875, + "debug/sppo_chosen_reward_in_loss": 0.46344834566116333, + "debug/sppo_rej_reward_in_loss": -0.9565681219100952, + "debug/sppo_reject_loss": 2416.12548828125, + "epoch": 0.8876811594202898, + "grad_norm": 62196.217799154256, + "learning_rate": 9.883709605213071e-08, + "logits/chosen": 1.0627963542938232, + "logits/rejected": 1.4089231491088867, + "logps/chosen": -232.9734344482422, + "logps/rejected": -283.24371337890625, + "loss": 4866.8133, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": 0.004634483251720667, + "rewards/margins": 0.014200164005160332, + "rewards/rejected": -0.009565682150423527, + "step": 245 + }, + { + "debug/policy_chosen_logits": 1.1752718687057495, + "debug/policy_chosen_logps": -277.11181640625, + "debug/policy_rejected_logits": 1.5447168350219727, + "debug/policy_rejected_logps": -283.4997863769531, + "debug/reference_chosen_logps": -276.27154541015625, + "debug/reference_rejected_logps": -282.2914123535156, + "debug/sppo_chosen_loss": 2592.04931640625, + "debug/sppo_chosen_reward_in_loss": -0.8402732610702515, + "debug/sppo_rej_reward_in_loss": -1.2084100246429443, + "debug/sppo_reject_loss": 2386.884033203125, + "epoch": 0.9057971014492754, + "grad_norm": 59626.787092260776, + "learning_rate": 9.8755852994274e-08, + "logits/chosen": 1.1752718687057495, + "logits/rejected": 1.5447168350219727, + "logps/chosen": -277.11181640625, + "logps/rejected": -283.4997863769531, + "loss": 4915.7211, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -0.008402733132243156, + "rewards/margins": 0.003681367728859186, + "rewards/rejected": -0.012084100395441055, + "step": 250 + }, + { + "debug/policy_chosen_logits": 1.3651785850524902, + "debug/policy_chosen_logps": -236.29833984375, + "debug/policy_rejected_logits": 1.5351622104644775, + "debug/policy_rejected_logps": -271.6980285644531, + "debug/reference_chosen_logps": -235.9077606201172, + "debug/reference_rejected_logps": -269.55755615234375, + "debug/sppo_chosen_loss": 2543.5859375, + "debug/sppo_chosen_reward_in_loss": -0.3905603289604187, + "debug/sppo_rej_reward_in_loss": -2.140444755554199, + "debug/sppo_reject_loss": 2298.95458984375, + "epoch": 0.9239130434782609, + "grad_norm": 68845.86649779897, + "learning_rate": 9.867190271803463e-08, + "logits/chosen": 1.3651785850524902, + "logits/rejected": 1.5351622104644775, + "logps/chosen": -236.29833984375, + "logps/rejected": -271.6980285644531, + "loss": 4868.9133, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.0039056031964719296, + "rewards/margins": 0.017498845234513283, + "rewards/rejected": -0.02140444703400135, + "step": 255 + }, + { + "debug/policy_chosen_logits": 1.2891263961791992, + "debug/policy_chosen_logps": -253.2111053466797, + "debug/policy_rejected_logits": 1.8494764566421509, + "debug/policy_rejected_logps": -307.9271545410156, + "debug/reference_chosen_logps": -253.3511199951172, + "debug/reference_rejected_logps": -305.5581359863281, + "debug/sppo_chosen_loss": 2489.207763671875, + "debug/sppo_chosen_reward_in_loss": 0.1400173157453537, + "debug/sppo_rej_reward_in_loss": -2.3690409660339355, + "debug/sppo_reject_loss": 2279.093017578125, + "epoch": 0.9420289855072463, + "grad_norm": 77776.03745480823, + "learning_rate": 9.858524988483717e-08, + "logits/chosen": 1.2891263961791992, + "logits/rejected": 1.8494764566421509, + "logps/chosen": -253.2111053466797, + "logps/rejected": -307.9271545410156, + "loss": 4828.5781, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": 0.0014001730596646667, + "rewards/margins": 0.02509058080613613, + "rewards/rejected": -0.02369040809571743, + "step": 260 + }, + { + "debug/policy_chosen_logits": 1.4969512224197388, + "debug/policy_chosen_logps": -305.0471496582031, + "debug/policy_rejected_logits": 1.644335150718689, + "debug/policy_rejected_logps": -293.63507080078125, + "debug/reference_chosen_logps": -304.60321044921875, + "debug/reference_rejected_logps": -291.7633056640625, + "debug/sppo_chosen_loss": 2549.2109375, + "debug/sppo_chosen_reward_in_loss": -0.44389915466308594, + "debug/sppo_rej_reward_in_loss": -1.8717502355575562, + "debug/sppo_reject_loss": 2324.712158203125, + "epoch": 0.9601449275362319, + "grad_norm": 57117.242259449704, + "learning_rate": 9.849589930616841e-08, + "logits/chosen": 1.4969512224197388, + "logits/rejected": 1.644335150718689, + "logps/chosen": -305.0471496582031, + "logps/rejected": -293.63507080078125, + "loss": 4841.2102, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -0.004438991658389568, + "rewards/margins": 0.014278510585427284, + "rewards/rejected": -0.018717501312494278, + "step": 265 + }, + { + "debug/policy_chosen_logits": 1.4527175426483154, + "debug/policy_chosen_logps": -268.75811767578125, + "debug/policy_rejected_logits": 1.9718250036239624, + "debug/policy_rejected_logps": -277.8961486816406, + "debug/reference_chosen_logps": -267.8411560058594, + "debug/reference_rejected_logps": -275.6684875488281, + "debug/sppo_chosen_loss": 2599.503662109375, + "debug/sppo_chosen_reward_in_loss": -0.9169847369194031, + "debug/sppo_rej_reward_in_loss": -2.2276530265808105, + "debug/sppo_reject_loss": 2295.29248046875, + "epoch": 0.9782608695652174, + "grad_norm": 65236.81774306339, + "learning_rate": 9.840385594331021e-08, + "logits/chosen": 1.4527175426483154, + "logits/rejected": 1.9718250036239624, + "logps/chosen": -268.75811767578125, + "logps/rejected": -277.8961486816406, + "loss": 4872.7801, + "rewards/accuracies": 0.574999988079071, + "rewards/chosen": -0.009169846773147583, + "rewards/margins": 0.013106681406497955, + "rewards/rejected": -0.022276530042290688, + "step": 270 + }, + { + "debug/policy_chosen_logits": 1.0952537059783936, + "debug/policy_chosen_logps": -257.93865966796875, + "debug/policy_rejected_logits": 1.3622313737869263, + "debug/policy_rejected_logps": -256.76025390625, + "debug/reference_chosen_logps": -257.44097900390625, + "debug/reference_rejected_logps": -254.19161987304688, + "debug/sppo_chosen_loss": 2554.732421875, + "debug/sppo_chosen_reward_in_loss": -0.49769458174705505, + "debug/sppo_rej_reward_in_loss": -2.5686464309692383, + "debug/sppo_reject_loss": 2260.558349609375, + "epoch": 0.9963768115942029, + "grad_norm": 74696.15311144819, + "learning_rate": 9.830912490706402e-08, + "logits/chosen": 1.0952537059783936, + "logits/rejected": 1.3622313737869263, + "logps/chosen": -257.93865966796875, + "logps/rejected": -256.76025390625, + "loss": 4826.0516, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -0.0049769459292292595, + "rewards/margins": 0.02070951648056507, + "rewards/rejected": -0.025686467066407204, + "step": 275 + }, + { + "debug/policy_chosen_logits": 1.3448244333267212, + "debug/policy_chosen_logps": -261.9836730957031, + "debug/policy_rejected_logits": 1.7952938079833984, + "debug/policy_rejected_logps": -297.6934814453125, + "debug/reference_chosen_logps": -260.2370300292969, + "debug/reference_rejected_logps": -293.6090393066406, + "debug/sppo_chosen_loss": 2686.778564453125, + "debug/sppo_chosen_reward_in_loss": -1.746645212173462, + "debug/sppo_rej_reward_in_loss": -4.084471225738525, + "debug/sppo_reject_loss": 2124.232421875, + "epoch": 1.0144927536231885, + "grad_norm": 64113.010757584976, + "learning_rate": 9.821171145746709e-08, + "logits/chosen": 1.3448244333267212, + "logits/rejected": 1.7952938079833984, + "logps/chosen": -261.9836730957031, + "logps/rejected": -297.6934814453125, + "loss": 4835.6227, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -0.017466451972723007, + "rewards/margins": 0.02337825670838356, + "rewards/rejected": -0.04084470868110657, + "step": 280 + }, + { + "debug/policy_chosen_logits": 1.2289044857025146, + "debug/policy_chosen_logps": -260.09234619140625, + "debug/policy_rejected_logits": 1.4668129682540894, + "debug/policy_rejected_logps": -285.3551940917969, + "debug/reference_chosen_logps": -259.2257385253906, + "debug/reference_rejected_logps": -282.3316955566406, + "debug/sppo_chosen_loss": 2595.444091796875, + "debug/sppo_chosen_reward_in_loss": -0.8666107058525085, + "debug/sppo_rej_reward_in_loss": -3.0234737396240234, + "debug/sppo_reject_loss": 2219.795166015625, + "epoch": 1.0326086956521738, + "grad_norm": 73460.26213970502, + "learning_rate": 9.811162100350039e-08, + "logits/chosen": 1.2289044857025146, + "logits/rejected": 1.4668129682540894, + "logps/chosen": -260.09234619140625, + "logps/rejected": -285.3551940917969, + "loss": 4821.3758, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -0.008666107431054115, + "rewards/margins": 0.021568629890680313, + "rewards/rejected": -0.03023473545908928, + "step": 285 + }, + { + "debug/policy_chosen_logits": 1.5169494152069092, + "debug/policy_chosen_logps": -275.1809997558594, + "debug/policy_rejected_logits": 1.6399990320205688, + "debug/policy_rejected_logps": -274.2919921875, + "debug/reference_chosen_logps": -274.32757568359375, + "debug/reference_rejected_logps": -271.07623291015625, + "debug/sppo_chosen_loss": 2591.622314453125, + "debug/sppo_chosen_reward_in_loss": -0.8534218072891235, + "debug/sppo_rej_reward_in_loss": -3.2157363891601562, + "debug/sppo_reject_loss": 2204.520263671875, + "epoch": 1.0507246376811594, + "grad_norm": 78833.13341982767, + "learning_rate": 9.80088591027883e-08, + "logits/chosen": 1.5169494152069092, + "logits/rejected": 1.6399990320205688, + "logps/chosen": -275.1809997558594, + "logps/rejected": -274.2919921875, + "loss": 4839.1961, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -0.00853421725332737, + "rewards/margins": 0.023623144254088402, + "rewards/rejected": -0.03215736150741577, + "step": 290 + }, + { + "debug/policy_chosen_logits": 1.2514266967773438, + "debug/policy_chosen_logps": -246.48178100585938, + "debug/policy_rejected_logits": 1.6836073398590088, + "debug/policy_rejected_logps": -315.2175598144531, + "debug/reference_chosen_logps": -245.95834350585938, + "debug/reference_rejected_logps": -312.770751953125, + "debug/sppo_chosen_loss": 2557.650634765625, + "debug/sppo_chosen_reward_in_loss": -0.5234573483467102, + "debug/sppo_rej_reward_in_loss": -2.4468045234680176, + "debug/sppo_reject_loss": 2270.903564453125, + "epoch": 1.068840579710145, + "grad_norm": 58519.22419342189, + "learning_rate": 9.790343146128999e-08, + "logits/chosen": 1.2514266967773438, + "logits/rejected": 1.6836073398590088, + "logps/chosen": -246.48178100585938, + "logps/rejected": -315.2175598144531, + "loss": 4841.0684, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -0.005234573967754841, + "rewards/margins": 0.019233472645282745, + "rewards/rejected": -0.02446804568171501, + "step": 295 + }, + { + "debug/policy_chosen_logits": 1.0419118404388428, + "debug/policy_chosen_logps": -253.39053344726562, + "debug/policy_rejected_logits": 1.3241182565689087, + "debug/policy_rejected_logps": -298.21099853515625, + "debug/reference_chosen_logps": -251.8573455810547, + "debug/reference_rejected_logps": -295.08416748046875, + "debug/sppo_chosen_loss": 2667.366943359375, + "debug/sppo_chosen_reward_in_loss": -1.5331599712371826, + "debug/sppo_rej_reward_in_loss": -3.1267876625061035, + "debug/sppo_reject_loss": 2213.57275390625, + "epoch": 1.0869565217391304, + "grad_norm": 58950.492438903406, + "learning_rate": 9.779534393298261e-08, + "logits/chosen": 1.0419118404388428, + "logits/rejected": 1.3241182565689087, + "logps/chosen": -253.39053344726562, + "logps/rejected": -298.21099853515625, + "loss": 4852.6547, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.015331600792706013, + "rewards/margins": 0.01593627780675888, + "rewards/rejected": -0.03126787766814232, + "step": 300 + }, + { + "epoch": 1.0869565217391304, + "eval_debug/policy_chosen_logits": 1.5894911289215088, + "eval_debug/policy_chosen_logps": -253.82180786132812, + "eval_debug/policy_rejected_logits": 1.6477311849594116, + "eval_debug/policy_rejected_logps": -261.3568115234375, + "eval_debug/reference_chosen_logps": -252.91845703125, + "eval_debug/reference_rejected_logps": -259.6585998535156, + "eval_debug/sppo_chosen_loss": 2599.375244140625, + "eval_debug/sppo_chosen_reward_in_loss": -0.9033258557319641, + "eval_debug/sppo_rej_reward_in_loss": -1.6981514692306519, + "eval_debug/sppo_reject_loss": 2346.007080078125, + "eval_logits/chosen": 1.5894911289215088, + "eval_logits/rejected": 1.6477311849594116, + "eval_logps/chosen": -253.82180786132812, + "eval_logps/rejected": -261.3568115234375, + "eval_loss": 4861.89599609375, + "eval_rewards/accuracies": 0.46052631735801697, + "eval_rewards/chosen": -0.009033258073031902, + "eval_rewards/margins": 0.007948257029056549, + "eval_rewards/rejected": -0.016981516033411026, + "eval_runtime": 28.419, + "eval_samples_per_second": 21.113, + "eval_steps_per_second": 0.669, + "step": 300 + }, + { + "debug/policy_chosen_logits": 1.674957036972046, + "debug/policy_chosen_logps": -274.6082458496094, + "debug/policy_rejected_logits": 1.868452787399292, + "debug/policy_rejected_logps": -293.33526611328125, + "debug/reference_chosen_logps": -273.9171142578125, + "debug/reference_rejected_logps": -290.7630920410156, + "debug/sppo_chosen_loss": 2573.90234375, + "debug/sppo_chosen_reward_in_loss": -0.6911390423774719, + "debug/sppo_rej_reward_in_loss": -2.5721654891967773, + "debug/sppo_reject_loss": 2263.828125, + "epoch": 1.105072463768116, + "grad_norm": 66459.25152549255, + "learning_rate": 9.768460251953622e-08, + "logits/chosen": 1.674957036972046, + "logits/rejected": 1.868452787399292, + "logps/chosen": -274.6082458496094, + "logps/rejected": -293.33526611328125, + "loss": 4867.6438, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -0.006911390461027622, + "rewards/margins": 0.018810266628861427, + "rewards/rejected": -0.025721654295921326, + "step": 305 + }, + { + "debug/policy_chosen_logits": 1.3893417119979858, + "debug/policy_chosen_logps": -289.6875915527344, + "debug/policy_rejected_logits": 1.6382023096084595, + "debug/policy_rejected_logps": -310.7853698730469, + "debug/reference_chosen_logps": -288.9293212890625, + "debug/reference_rejected_logps": -308.27606201171875, + "debug/sppo_chosen_loss": 2581.68017578125, + "debug/sppo_chosen_reward_in_loss": -0.7582954168319702, + "debug/sppo_rej_reward_in_loss": -2.50929594039917, + "debug/sppo_reject_loss": 2264.07470703125, + "epoch": 1.1231884057971016, + "grad_norm": 75849.96054845196, + "learning_rate": 9.757121336998056e-08, + "logits/chosen": 1.3893417119979858, + "logits/rejected": 1.6382023096084595, + "logps/chosen": -289.6875915527344, + "logps/rejected": -310.7853698730469, + "loss": 4836.8883, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -0.007582955062389374, + "rewards/margins": 0.017510006204247475, + "rewards/rejected": -0.0250929594039917, + "step": 310 + }, + { + "debug/policy_chosen_logits": 0.9951621294021606, + "debug/policy_chosen_logps": -221.96142578125, + "debug/policy_rejected_logits": 1.386580228805542, + "debug/policy_rejected_logps": -279.8023681640625, + "debug/reference_chosen_logps": -227.9397430419922, + "debug/reference_rejected_logps": -283.32073974609375, + "debug/sppo_chosen_loss": 1973.23828125, + "debug/sppo_chosen_reward_in_loss": 5.978316783905029, + "debug/sppo_rej_reward_in_loss": 3.5184273719787598, + "debug/sppo_reject_loss": 2925.00048828125, + "epoch": 1.141304347826087, + "grad_norm": 127556.90854651273, + "learning_rate": 9.745518278036364e-08, + "logits/chosen": 0.9951621294021606, + "logits/rejected": 1.386580228805542, + "logps/chosen": -221.96142578125, + "logps/rejected": -279.8023681640625, + "loss": 4991.3445, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": 0.059783171862363815, + "rewards/margins": 0.024598896503448486, + "rewards/rejected": 0.03518427163362503, + "step": 315 + }, + { + "debug/policy_chosen_logits": 1.4349600076675415, + "debug/policy_chosen_logps": -244.4615936279297, + "debug/policy_rejected_logits": 1.6520965099334717, + "debug/policy_rejected_logps": -296.48944091796875, + "debug/reference_chosen_logps": -250.67617797851562, + "debug/reference_rejected_logps": -301.82122802734375, + "debug/sppo_chosen_loss": 1937.580322265625, + "debug/sppo_chosen_reward_in_loss": 6.214603900909424, + "debug/sppo_rej_reward_in_loss": 5.331799507141113, + "debug/sppo_reject_loss": 3087.521240234375, + "epoch": 1.1594202898550725, + "grad_norm": 82651.7113274058, + "learning_rate": 9.733651719340206e-08, + "logits/chosen": 1.4349600076675415, + "logits/rejected": 1.6520965099334717, + "logps/chosen": -244.4615936279297, + "logps/rejected": -296.48944091796875, + "loss": 4934.0418, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.06214603781700134, + "rewards/margins": 0.008828045800328255, + "rewards/rejected": 0.05331799387931824, + "step": 320 + }, + { + "debug/policy_chosen_logits": 0.9610234498977661, + "debug/policy_chosen_logps": -239.1781005859375, + "debug/policy_rejected_logits": 1.2378222942352295, + "debug/policy_rejected_logps": -296.514892578125, + "debug/reference_chosen_logps": -239.85531616210938, + "debug/reference_rejected_logps": -296.14971923828125, + "debug/sppo_chosen_loss": 2439.22607421875, + "debug/sppo_chosen_reward_in_loss": 0.6772235631942749, + "debug/sppo_rej_reward_in_loss": -0.3651662766933441, + "debug/sppo_reject_loss": 2480.13232421875, + "epoch": 1.177536231884058, + "grad_norm": 62900.497256271476, + "learning_rate": 9.721522319812339e-08, + "logits/chosen": 0.9610234498977661, + "logits/rejected": 1.2378222942352295, + "logps/chosen": -239.1781005859375, + "logps/rejected": -296.514892578125, + "loss": 4778.1418, + "rewards/accuracies": 0.5249999761581421, + "rewards/chosen": 0.006772235967218876, + "rewards/margins": 0.010423899628221989, + "rewards/rejected": -0.003651663661003113, + "step": 325 + }, + { + "debug/policy_chosen_logits": 0.8668224215507507, + "debug/policy_chosen_logps": -240.64358520507812, + "debug/policy_rejected_logits": 1.338693380355835, + "debug/policy_rejected_logps": -296.9376525878906, + "debug/reference_chosen_logps": -240.9998321533203, + "debug/reference_rejected_logps": -293.64276123046875, + "debug/sppo_chosen_loss": 2472.85888671875, + "debug/sppo_chosen_reward_in_loss": 0.35626524686813354, + "debug/sppo_rej_reward_in_loss": -3.2948780059814453, + "debug/sppo_reject_loss": 2214.165771484375, + "epoch": 1.1956521739130435, + "grad_norm": 78794.22306053036, + "learning_rate": 9.709130752950023e-08, + "logits/chosen": 0.8668224215507507, + "logits/rejected": 1.338693380355835, + "logps/chosen": -240.64358520507812, + "logps/rejected": -296.9376525878906, + "loss": 4845.3301, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": 0.0035626522731035948, + "rewards/margins": 0.03651143237948418, + "rewards/rejected": -0.03294878080487251, + "step": 330 + }, + { + "debug/policy_chosen_logits": 1.226872444152832, + "debug/policy_chosen_logps": -249.2202911376953, + "debug/policy_rejected_logits": 1.4146177768707275, + "debug/policy_rejected_logps": -284.56500244140625, + "debug/reference_chosen_logps": -247.591064453125, + "debug/reference_rejected_logps": -282.10699462890625, + "debug/sppo_chosen_loss": 2689.522705078125, + "debug/sppo_chosen_reward_in_loss": -1.629233956336975, + "debug/sppo_rej_reward_in_loss": -2.4580025672912598, + "debug/sppo_reject_loss": 2273.725830078125, + "epoch": 1.213768115942029, + "grad_norm": 72224.73272645177, + "learning_rate": 9.696477706807624e-08, + "logits/chosen": 1.226872444152832, + "logits/rejected": 1.4146177768707275, + "logps/chosen": -249.2202911376953, + "logps/rejected": -284.56500244140625, + "loss": 4783.2437, + "rewards/accuracies": 0.550000011920929, + "rewards/chosen": -0.016292337328195572, + "rewards/margins": 0.008287685923278332, + "rewards/rejected": -0.02458002418279648, + "step": 335 + }, + { + "debug/policy_chosen_logits": 1.5981292724609375, + "debug/policy_chosen_logps": -253.11019897460938, + "debug/policy_rejected_logits": 1.7621173858642578, + "debug/policy_rejected_logps": -273.3591003417969, + "debug/reference_chosen_logps": -252.81381225585938, + "debug/reference_rejected_logps": -270.9890441894531, + "debug/sppo_chosen_loss": 2534.176025390625, + "debug/sppo_chosen_reward_in_loss": -0.2963891923427582, + "debug/sppo_rej_reward_in_loss": -2.3700661659240723, + "debug/sppo_reject_loss": 2284.81640625, + "epoch": 1.2318840579710144, + "grad_norm": 58031.91363262894, + "learning_rate": 9.683563883958413e-08, + "logits/chosen": 1.5981292724609375, + "logits/rejected": 1.7621173858642578, + "logps/chosen": -253.11019897460938, + "logps/rejected": -273.3591003417969, + "loss": 4819.1984, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -0.0029638917185366154, + "rewards/margins": 0.020736772567033768, + "rewards/rejected": -0.023700661957263947, + "step": 340 + }, + { + "debug/policy_chosen_logits": 1.2439069747924805, + "debug/policy_chosen_logps": -270.01458740234375, + "debug/policy_rejected_logits": 1.4633022546768188, + "debug/policy_rejected_logps": -274.4990539550781, + "debug/reference_chosen_logps": -269.8506774902344, + "debug/reference_rejected_logps": -272.15484619140625, + "debug/sppo_chosen_loss": 2524.285888671875, + "debug/sppo_chosen_reward_in_loss": -0.1639108657836914, + "debug/sppo_rej_reward_in_loss": -2.344228982925415, + "debug/sppo_reject_loss": 2287.3037109375, + "epoch": 1.25, + "grad_norm": 62603.32397855888, + "learning_rate": 9.670390001455554e-08, + "logits/chosen": 1.2439069747924805, + "logits/rejected": 1.4633022546768188, + "logps/chosen": -270.01458740234375, + "logps/rejected": -274.4990539550781, + "loss": 4809.4809, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -0.001639108406379819, + "rewards/margins": 0.021803181618452072, + "rewards/rejected": -0.023442288860678673, + "step": 345 + }, + { + "debug/policy_chosen_logits": 1.034299612045288, + "debug/policy_chosen_logps": -244.1306610107422, + "debug/policy_rejected_logits": 1.3921207189559937, + "debug/policy_rejected_logps": -290.91131591796875, + "debug/reference_chosen_logps": -243.61593627929688, + "debug/reference_rejected_logps": -288.1432189941406, + "debug/sppo_chosen_loss": 2561.4248046875, + "debug/sppo_chosen_reward_in_loss": -0.5147092938423157, + "debug/sppo_rej_reward_in_loss": -2.7680869102478027, + "debug/sppo_reject_loss": 2245.42236328125, + "epoch": 1.2681159420289856, + "grad_norm": 99341.70642745105, + "learning_rate": 9.656956790792285e-08, + "logits/chosen": 1.034299612045288, + "logits/rejected": 1.3921207189559937, + "logps/chosen": -244.1306610107422, + "logps/rejected": -290.91131591796875, + "loss": 4828.575, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": -0.00514709297567606, + "rewards/margins": 0.022533774375915527, + "rewards/rejected": -0.02768086828291416, + "step": 350 + }, + { + "debug/policy_chosen_logits": 1.4998633861541748, + "debug/policy_chosen_logps": -259.9757385253906, + "debug/policy_rejected_logits": 1.6802341938018799, + "debug/policy_rejected_logps": -296.2747497558594, + "debug/reference_chosen_logps": -260.3787841796875, + "debug/reference_rejected_logps": -293.5425109863281, + "debug/sppo_chosen_loss": 2463.8203125, + "debug/sppo_chosen_reward_in_loss": 0.40305614471435547, + "debug/sppo_rej_reward_in_loss": -2.7322874069213867, + "debug/sppo_reject_loss": 2261.598388671875, + "epoch": 1.286231884057971, + "grad_norm": 59802.412832959686, + "learning_rate": 9.643264997861311e-08, + "logits/chosen": 1.4998633861541748, + "logits/rejected": 1.6802341938018799, + "logps/chosen": -259.9757385253906, + "logps/rejected": -296.2747497558594, + "loss": 4807.6715, + "rewards/accuracies": 0.8500000238418579, + "rewards/chosen": 0.004030561074614525, + "rewards/margins": 0.031353436410427094, + "rewards/rejected": -0.027322877198457718, + "step": 355 + }, + { + "debug/policy_chosen_logits": 0.8489713668823242, + "debug/policy_chosen_logps": -229.5171356201172, + "debug/policy_rejected_logits": 1.1113895177841187, + "debug/policy_rejected_logps": -260.78619384765625, + "debug/reference_chosen_logps": -229.55331420898438, + "debug/reference_rejected_logps": -256.6519470214844, + "debug/sppo_chosen_loss": 2506.991455078125, + "debug/sppo_chosen_reward_in_loss": 0.03618621826171875, + "debug/sppo_rej_reward_in_loss": -4.134216785430908, + "debug/sppo_reject_loss": 2127.05078125, + "epoch": 1.3043478260869565, + "grad_norm": 56753.91600403247, + "learning_rate": 9.62931538291337e-08, + "logits/chosen": 0.8489713668823242, + "logits/rejected": 1.1113895177841187, + "logps/chosen": -229.5171356201172, + "logps/rejected": -260.78619384765625, + "loss": 4780.2891, + "rewards/accuracies": 0.824999988079071, + "rewards/chosen": 0.0003618622140493244, + "rewards/margins": 0.04170403257012367, + "rewards/rejected": -0.04134216904640198, + "step": 360 + }, + { + "debug/policy_chosen_logits": 1.2941102981567383, + "debug/policy_chosen_logps": -229.6419677734375, + "debug/policy_rejected_logits": 1.855833649635315, + "debug/policy_rejected_logps": -298.4671325683594, + "debug/reference_chosen_logps": -229.5476531982422, + "debug/reference_rejected_logps": -296.23944091796875, + "debug/sppo_chosen_loss": 2519.103515625, + "debug/sppo_chosen_reward_in_loss": -0.09428653866052628, + "debug/sppo_rej_reward_in_loss": -2.227700710296631, + "debug/sppo_reject_loss": 2297.76025390625, + "epoch": 1.322463768115942, + "grad_norm": 64235.39187696898, + "learning_rate": 9.615108720515041e-08, + "logits/chosen": 1.2941102981567383, + "logits/rejected": 1.855833649635315, + "logps/chosen": -229.6419677734375, + "logps/rejected": -298.4671325683594, + "loss": 4844.5879, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -0.00094286521198228, + "rewards/margins": 0.021334141492843628, + "rewards/rejected": -0.02227700687944889, + "step": 365 + }, + { + "debug/policy_chosen_logits": 1.3823152780532837, + "debug/policy_chosen_logps": -261.19049072265625, + "debug/policy_rejected_logits": 1.6258795261383057, + "debug/policy_rejected_logps": -302.4359436035156, + "debug/reference_chosen_logps": -260.66339111328125, + "debug/reference_rejected_logps": -298.43231201171875, + "debug/sppo_chosen_loss": 2561.10693359375, + "debug/sppo_chosen_reward_in_loss": -0.5271316766738892, + "debug/sppo_rej_reward_in_loss": -4.003632545471191, + "debug/sppo_reject_loss": 2143.478759765625, + "epoch": 1.3405797101449275, + "grad_norm": 62125.362422127815, + "learning_rate": 9.600645799505717e-08, + "logits/chosen": 1.3823152780532837, + "logits/rejected": 1.6258795261383057, + "logps/chosen": -261.19049072265625, + "logps/rejected": -302.4359436035156, + "loss": 4781.7582, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -0.005271316505968571, + "rewards/margins": 0.034765005111694336, + "rewards/rejected": -0.04003632068634033, + "step": 370 + }, + { + "debug/policy_chosen_logits": 1.1877999305725098, + "debug/policy_chosen_logps": -255.3175506591797, + "debug/policy_rejected_logits": 1.4676315784454346, + "debug/policy_rejected_logps": -262.8641662597656, + "debug/reference_chosen_logps": -254.73141479492188, + "debug/reference_rejected_logps": -260.73736572265625, + "debug/sppo_chosen_loss": 2564.98974609375, + "debug/sppo_chosen_reward_in_loss": -0.586154580116272, + "debug/sppo_rej_reward_in_loss": -2.1267733573913574, + "debug/sppo_reject_loss": 2307.00927734375, + "epoch": 1.358695652173913, + "grad_norm": 58105.264293713124, + "learning_rate": 9.585927422953815e-08, + "logits/chosen": 1.1877999305725098, + "logits/rejected": 1.4676315784454346, + "logps/chosen": -255.3175506591797, + "logps/rejected": -262.8641662597656, + "loss": 4838.052, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -0.005861545447260141, + "rewards/margins": 0.015406189486384392, + "rewards/rejected": -0.021267732605338097, + "step": 375 + }, + { + "debug/policy_chosen_logits": 1.14890456199646, + "debug/policy_chosen_logps": -236.107177734375, + "debug/policy_rejected_logits": 1.4956175088882446, + "debug/policy_rejected_logps": -260.3258056640625, + "debug/reference_chosen_logps": -235.4017333984375, + "debug/reference_rejected_logps": -256.94818115234375, + "debug/sppo_chosen_loss": 2575.2421875, + "debug/sppo_chosen_reward_in_loss": -0.7054517865180969, + "debug/sppo_rej_reward_in_loss": -3.3776297569274902, + "debug/sppo_reject_loss": 2199.29345703125, + "epoch": 1.3768115942028984, + "grad_norm": 61814.61417535102, + "learning_rate": 9.570954408112178e-08, + "logits/chosen": 1.14890456199646, + "logits/rejected": 1.4956175088882446, + "logps/chosen": -236.107177734375, + "logps/rejected": -260.3258056640625, + "loss": 4763.4984, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -0.007054517511278391, + "rewards/margins": 0.026721779257059097, + "rewards/rejected": -0.03377629444003105, + "step": 380 + }, + { + "debug/policy_chosen_logits": 1.5179424285888672, + "debug/policy_chosen_logps": -263.9978942871094, + "debug/policy_rejected_logits": 1.6211563348770142, + "debug/policy_rejected_logps": -294.53656005859375, + "debug/reference_chosen_logps": -262.984375, + "debug/reference_rejected_logps": -292.22747802734375, + "debug/sppo_chosen_loss": 2611.53662109375, + "debug/sppo_chosen_reward_in_loss": -1.0135478973388672, + "debug/sppo_rej_reward_in_loss": -2.3091111183166504, + "debug/sppo_reject_loss": 2293.55908203125, + "epoch": 1.394927536231884, + "grad_norm": 75427.62511951616, + "learning_rate": 9.555727586372702e-08, + "logits/chosen": 1.5179424285888672, + "logits/rejected": 1.6211563348770142, + "logps/chosen": -263.9978942871094, + "logps/rejected": -294.53656005859375, + "loss": 4820.6531, + "rewards/accuracies": 0.550000011920929, + "rewards/chosen": -0.010135479271411896, + "rewards/margins": 0.012955631129443645, + "rewards/rejected": -0.023091109469532967, + "step": 385 + }, + { + "debug/policy_chosen_logits": 1.590163230895996, + "debug/policy_chosen_logps": -296.49066162109375, + "debug/policy_rejected_logits": 1.3836915493011475, + "debug/policy_rejected_logps": -283.4267883300781, + "debug/reference_chosen_logps": -296.08233642578125, + "debug/reference_rejected_logps": -280.37335205078125, + "debug/sppo_chosen_loss": 2550.162109375, + "debug/sppo_chosen_reward_in_loss": -0.40835076570510864, + "debug/sppo_rej_reward_in_loss": -3.0534708499908447, + "debug/sppo_reject_loss": 2222.74755859375, + "epoch": 1.4130434782608696, + "grad_norm": 62180.159416556795, + "learning_rate": 9.540247803220169e-08, + "logits/chosen": 1.590163230895996, + "logits/rejected": 1.3836915493011475, + "logps/chosen": -296.49066162109375, + "logps/rejected": -283.4267883300781, + "loss": 4742.4914, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -0.004083507228642702, + "rewards/margins": 0.026451200246810913, + "rewards/rejected": -0.030534708872437477, + "step": 390 + }, + { + "debug/policy_chosen_logits": 0.9762474894523621, + "debug/policy_chosen_logps": -274.21893310546875, + "debug/policy_rejected_logits": 1.2334016561508179, + "debug/policy_rejected_logps": -305.20465087890625, + "debug/reference_chosen_logps": -273.4296875, + "debug/reference_rejected_logps": -303.46014404296875, + "debug/sppo_chosen_loss": 2594.09521484375, + "debug/sppo_chosen_reward_in_loss": -0.7892316579818726, + "debug/sppo_rej_reward_in_loss": -1.744462251663208, + "debug/sppo_reject_loss": 2350.6826171875, + "epoch": 1.431159420289855, + "grad_norm": 94722.70795318228, + "learning_rate": 9.524515918185301e-08, + "logits/chosen": 0.9762474894523621, + "logits/rejected": 1.2334016561508179, + "logps/chosen": -274.21893310546875, + "logps/rejected": -305.20465087890625, + "loss": 4802.1969, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.00789231713861227, + "rewards/margins": 0.009552305564284325, + "rewards/rejected": -0.01744462177157402, + "step": 395 + }, + { + "debug/policy_chosen_logits": 1.2278473377227783, + "debug/policy_chosen_logps": -283.450927734375, + "debug/policy_rejected_logits": 1.3526257276535034, + "debug/policy_rejected_logps": -314.7835388183594, + "debug/reference_chosen_logps": -283.3064270019531, + "debug/reference_rejected_logps": -312.33880615234375, + "debug/sppo_chosen_loss": 2527.92822265625, + "debug/sppo_chosen_reward_in_loss": -0.1445016860961914, + "debug/sppo_rej_reward_in_loss": -2.444725275039673, + "debug/sppo_reject_loss": 2280.696533203125, + "epoch": 1.4492753623188406, + "grad_norm": 70038.44434880349, + "learning_rate": 9.508532804797034e-08, + "logits/chosen": 1.2278473377227783, + "logits/rejected": 1.3526257276535034, + "logps/chosen": -283.450927734375, + "logps/rejected": -314.7835388183594, + "loss": 4810.0602, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -0.0014450167072936893, + "rewards/margins": 0.02300223708152771, + "rewards/rejected": -0.024447252973914146, + "step": 400 + }, + { + "epoch": 1.4492753623188406, + "eval_debug/policy_chosen_logits": 1.5488909482955933, + "eval_debug/policy_chosen_logps": -253.56918334960938, + "eval_debug/policy_rejected_logits": 1.603257656097412, + "eval_debug/policy_rejected_logps": -261.84649658203125, + "eval_debug/reference_chosen_logps": -252.91845703125, + "eval_debug/reference_rejected_logps": -259.6585998535156, + "eval_debug/sppo_chosen_loss": 2584.198486328125, + "eval_debug/sppo_chosen_reward_in_loss": -0.6506962776184082, + "eval_debug/sppo_rej_reward_in_loss": -2.187859296798706, + "eval_debug/sppo_reject_loss": 2322.553466796875, + "eval_logits/chosen": 1.5488909482955933, + "eval_logits/rejected": 1.603257656097412, + "eval_logps/chosen": -253.56918334960938, + "eval_logps/rejected": -261.84649658203125, + "eval_loss": 4799.115234375, + "eval_rewards/accuracies": 0.5394737124443054, + "eval_rewards/chosen": -0.006506962701678276, + "eval_rewards/margins": 0.015371627174317837, + "eval_rewards/rejected": -0.021878590807318687, + "eval_runtime": 28.7384, + "eval_samples_per_second": 20.878, + "eval_steps_per_second": 0.661, + "step": 400 + }, + { + "debug/policy_chosen_logits": 1.3403141498565674, + "debug/policy_chosen_logps": -248.9816436767578, + "debug/policy_rejected_logits": 1.8409442901611328, + "debug/policy_rejected_logps": -277.715576171875, + "debug/reference_chosen_logps": -250.1258544921875, + "debug/reference_rejected_logps": -276.1705627441406, + "debug/sppo_chosen_loss": 2390.42578125, + "debug/sppo_chosen_reward_in_loss": 1.1441965103149414, + "debug/sppo_rej_reward_in_loss": -1.5450060367584229, + "debug/sppo_reject_loss": 2368.06884765625, + "epoch": 1.4673913043478262, + "grad_norm": 70367.72710076011, + "learning_rate": 9.49229935053401e-08, + "logits/chosen": 1.3403141498565674, + "logits/rejected": 1.8409442901611328, + "logps/chosen": -248.9816436767578, + "logps/rejected": -277.715576171875, + "loss": 4720.8113, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": 0.011441965587437153, + "rewards/margins": 0.026892025023698807, + "rewards/rejected": -0.015450060367584229, + "step": 405 + }, + { + "debug/policy_chosen_logits": 1.3786104917526245, + "debug/policy_chosen_logps": -244.37557983398438, + "debug/policy_rejected_logits": 1.605938196182251, + "debug/policy_rejected_logps": -292.2764587402344, + "debug/reference_chosen_logps": -244.8103485107422, + "debug/reference_rejected_logps": -288.8402404785156, + "debug/sppo_chosen_loss": 2460.53564453125, + "debug/sppo_chosen_reward_in_loss": 0.4347648620605469, + "debug/sppo_rej_reward_in_loss": -3.43621826171875, + "debug/sppo_reject_loss": 2192.02734375, + "epoch": 1.4855072463768115, + "grad_norm": 77871.27716914566, + "learning_rate": 9.475816456775311e-08, + "logits/chosen": 1.3786104917526245, + "logits/rejected": 1.605938196182251, + "logps/chosen": -244.37557983398438, + "logps/rejected": -292.2764587402344, + "loss": 4758.2246, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": 0.004347648937255144, + "rewards/margins": 0.03870982676744461, + "rewards/rejected": -0.034362178295850754, + "step": 410 + }, + { + "debug/policy_chosen_logits": 0.9305599927902222, + "debug/policy_chosen_logps": -243.4819793701172, + "debug/policy_rejected_logits": 1.324118971824646, + "debug/policy_rejected_logps": -293.31402587890625, + "debug/reference_chosen_logps": -243.03857421875, + "debug/reference_rejected_logps": -289.7604064941406, + "debug/sppo_chosen_loss": 2563.819580078125, + "debug/sppo_chosen_reward_in_loss": -0.44340628385543823, + "debug/sppo_rej_reward_in_loss": -3.5536065101623535, + "debug/sppo_reject_loss": 2186.4287109375, + "epoch": 1.5036231884057971, + "grad_norm": 84731.09766930714, + "learning_rate": 9.459085038750394e-08, + "logits/chosen": 0.9305599927902222, + "logits/rejected": 1.324118971824646, + "logps/chosen": -243.4819793701172, + "logps/rejected": -293.31402587890625, + "loss": 4734.184, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -0.004434062633663416, + "rewards/margins": 0.031102001667022705, + "rewards/rejected": -0.035536061972379684, + "step": 415 + }, + { + "debug/policy_chosen_logits": 1.5847978591918945, + "debug/policy_chosen_logps": -257.9969787597656, + "debug/policy_rejected_logits": 2.149386167526245, + "debug/policy_rejected_logps": -308.333251953125, + "debug/reference_chosen_logps": -255.08334350585938, + "debug/reference_rejected_logps": -302.2669372558594, + "debug/sppo_chosen_loss": 2824.914794921875, + "debug/sppo_chosen_reward_in_loss": -2.9136509895324707, + "debug/sppo_rej_reward_in_loss": -6.066329479217529, + "debug/sppo_reject_loss": 1976.962646484375, + "epoch": 1.5217391304347827, + "grad_norm": 104826.24697622382, + "learning_rate": 9.442106025488283e-08, + "logits/chosen": 1.5847978591918945, + "logits/rejected": 2.149386167526245, + "logps/chosen": -257.9969787597656, + "logps/rejected": -308.333251953125, + "loss": 4816.7281, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -0.02913650870323181, + "rewards/margins": 0.03152678534388542, + "rewards/rejected": -0.06066329404711723, + "step": 420 + }, + { + "debug/policy_chosen_logits": 1.3920035362243652, + "debug/policy_chosen_logps": -255.4589080810547, + "debug/policy_rejected_logits": 1.937829613685608, + "debug/policy_rejected_logps": -310.65008544921875, + "debug/reference_chosen_logps": -254.27566528320312, + "debug/reference_rejected_logps": -306.13055419921875, + "debug/sppo_chosen_loss": 2635.071533203125, + "debug/sppo_chosen_reward_in_loss": -1.1832473278045654, + "debug/sppo_rej_reward_in_loss": -4.519493579864502, + "debug/sppo_reject_loss": 2109.51611328125, + "epoch": 1.539855072463768, + "grad_norm": 87017.94217456014, + "learning_rate": 9.424880359765976e-08, + "logits/chosen": 1.3920035362243652, + "logits/rejected": 1.937829613685608, + "logps/chosen": -255.4589080810547, + "logps/rejected": -310.65008544921875, + "loss": 4719.7344, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -0.011832473799586296, + "rewards/margins": 0.033362459391355515, + "rewards/rejected": -0.04519493132829666, + "step": 425 + }, + { + "debug/policy_chosen_logits": 0.9032995104789734, + "debug/policy_chosen_logps": -243.0791473388672, + "debug/policy_rejected_logits": 1.2816048860549927, + "debug/policy_rejected_logps": -295.67889404296875, + "debug/reference_chosen_logps": -243.094970703125, + "debug/reference_rejected_logps": -292.0074462890625, + "debug/sppo_chosen_loss": 2512.8564453125, + "debug/sppo_chosen_reward_in_loss": 0.015825461596250534, + "debug/sppo_rej_reward_in_loss": -3.671428680419922, + "debug/sppo_reject_loss": 2182.8984375, + "epoch": 1.5579710144927537, + "grad_norm": 63772.18252754498, + "learning_rate": 9.407408998056104e-08, + "logits/chosen": 0.9032995104789734, + "logits/rejected": 1.2816048860549927, + "logps/chosen": -243.0791473388672, + "logps/rejected": -295.67889404296875, + "loss": 4704.9863, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.00015825479931663722, + "rewards/margins": 0.036872539669275284, + "rewards/rejected": -0.036714281886816025, + "step": 430 + }, + { + "debug/policy_chosen_logits": 1.313698410987854, + "debug/policy_chosen_logps": -236.9527130126953, + "debug/policy_rejected_logits": 1.505110502243042, + "debug/policy_rejected_logps": -270.3215637207031, + "debug/reference_chosen_logps": -235.29800415039062, + "debug/reference_rejected_logps": -266.57958984375, + "debug/sppo_chosen_loss": 2689.590087890625, + "debug/sppo_chosen_reward_in_loss": -1.6547034978866577, + "debug/sppo_rej_reward_in_loss": -3.7419822216033936, + "debug/sppo_reject_loss": 2173.85205078125, + "epoch": 1.5760869565217392, + "grad_norm": 73743.00558373412, + "learning_rate": 9.389692910473814e-08, + "logits/chosen": 1.313698410987854, + "logits/rejected": 1.505110502243042, + "logps/chosen": -236.9527130126953, + "logps/rejected": -270.3215637207031, + "loss": 4785.4102, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.016547035425901413, + "rewards/margins": 0.02087278850376606, + "rewards/rejected": -0.037419818341732025, + "step": 435 + }, + { + "debug/policy_chosen_logits": 1.3025128841400146, + "debug/policy_chosen_logps": -247.7681121826172, + "debug/policy_rejected_logits": 1.7691015005111694, + "debug/policy_rejected_logps": -306.4404296875, + "debug/reference_chosen_logps": -246.77597045898438, + "debug/reference_rejected_logps": -302.6429138183594, + "debug/sppo_chosen_loss": 2613.018310546875, + "debug/sppo_chosen_reward_in_loss": -0.9921543002128601, + "debug/sppo_rej_reward_in_loss": -3.7975406646728516, + "debug/sppo_reject_loss": 2169.7216796875, + "epoch": 1.5942028985507246, + "grad_norm": 60935.995908769066, + "learning_rate": 9.37173308072291e-08, + "logits/chosen": 1.3025128841400146, + "logits/rejected": 1.7691015005111694, + "logps/chosen": -247.7681121826172, + "logps/rejected": -306.4404296875, + "loss": 4807.8977, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -0.009921541437506676, + "rewards/margins": 0.02805386111140251, + "rewards/rejected": -0.03797540441155434, + "step": 440 + }, + { + "debug/policy_chosen_logits": 1.3220393657684326, + "debug/policy_chosen_logps": -262.1022033691406, + "debug/policy_rejected_logits": 1.4996846914291382, + "debug/policy_rejected_logps": -287.05889892578125, + "debug/reference_chosen_logps": -260.79449462890625, + "debug/reference_rejected_logps": -283.6168518066406, + "debug/sppo_chosen_loss": 2663.69580078125, + "debug/sppo_chosen_reward_in_loss": -1.3077014684677124, + "debug/sppo_rej_reward_in_loss": -3.442105531692505, + "debug/sppo_reject_loss": 2197.2099609375, + "epoch": 1.6123188405797102, + "grad_norm": 88654.99949107536, + "learning_rate": 9.353530506041226e-08, + "logits/chosen": 1.3220393657684326, + "logits/rejected": 1.4996846914291382, + "logps/chosen": -262.1022033691406, + "logps/rejected": -287.05889892578125, + "loss": 4738.5777, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -0.013077013194561005, + "rewards/margins": 0.02134403958916664, + "rewards/rejected": -0.034421052783727646, + "step": 445 + }, + { + "debug/policy_chosen_logits": 1.200622797012329, + "debug/policy_chosen_logps": -262.21710205078125, + "debug/policy_rejected_logits": 1.3046382665634155, + "debug/policy_rejected_logps": -259.45977783203125, + "debug/reference_chosen_logps": -261.75518798828125, + "debug/reference_rejected_logps": -257.6763916015625, + "debug/sppo_chosen_loss": 2556.416748046875, + "debug/sppo_chosen_reward_in_loss": -0.46194133162498474, + "debug/sppo_rej_reward_in_loss": -1.7833735942840576, + "debug/sppo_reject_loss": 2343.46044921875, + "epoch": 1.6304347826086958, + "grad_norm": 61443.35825745383, + "learning_rate": 9.335086197145254e-08, + "logits/chosen": 1.200622797012329, + "logits/rejected": 1.3046382665634155, + "logps/chosen": -262.21710205078125, + "logps/rejected": -259.45977783203125, + "loss": 4789.7859, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -0.004619413521140814, + "rewards/margins": 0.013214322738349438, + "rewards/rejected": -0.017833735793828964, + "step": 450 + }, + { + "debug/policy_chosen_logits": 1.166078805923462, + "debug/policy_chosen_logps": -261.41748046875, + "debug/policy_rejected_logits": 1.4401895999908447, + "debug/policy_rejected_logps": -309.0025329589844, + "debug/reference_chosen_logps": -260.24456787109375, + "debug/reference_rejected_logps": -304.6410827636719, + "debug/sppo_chosen_loss": 2636.66455078125, + "debug/sppo_chosen_reward_in_loss": -1.172864317893982, + "debug/sppo_rej_reward_in_loss": -4.361422538757324, + "debug/sppo_reject_loss": 2135.68115234375, + "epoch": 1.6485507246376812, + "grad_norm": 64189.45844600206, + "learning_rate": 9.31640117817403e-08, + "logits/chosen": 1.166078805923462, + "logits/rejected": 1.4401895999908447, + "logps/chosen": -261.41748046875, + "logps/rejected": -309.0025329589844, + "loss": 4745.2758, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.01172864343971014, + "rewards/margins": 0.03188558667898178, + "rewards/rejected": -0.0436142273247242, + "step": 455 + }, + { + "debug/policy_chosen_logits": 1.3456027507781982, + "debug/policy_chosen_logps": -230.1571502685547, + "debug/policy_rejected_logits": 1.5562714338302612, + "debug/policy_rejected_logps": -313.42950439453125, + "debug/reference_chosen_logps": -229.99716186523438, + "debug/reference_rejected_logps": -310.053955078125, + "debug/sppo_chosen_loss": 2534.66064453125, + "debug/sppo_chosen_reward_in_loss": -0.15998229384422302, + "debug/sppo_rej_reward_in_loss": -3.375528335571289, + "debug/sppo_reject_loss": 2215.09814453125, + "epoch": 1.6666666666666665, + "grad_norm": 67332.59952507944, + "learning_rate": 9.297476486632254e-08, + "logits/chosen": 1.3456027507781982, + "logits/rejected": 1.5562714338302612, + "logps/chosen": -230.1571502685547, + "logps/rejected": -313.42950439453125, + "loss": 4755.9258, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -0.0015998227754607797, + "rewards/margins": 0.032155461609363556, + "rewards/rejected": -0.03375528007745743, + "step": 460 + }, + { + "debug/policy_chosen_logits": 1.2817943096160889, + "debug/policy_chosen_logps": -256.0505676269531, + "debug/policy_rejected_logits": 1.5631263256072998, + "debug/policy_rejected_logps": -262.54083251953125, + "debug/reference_chosen_logps": -254.5850372314453, + "debug/reference_rejected_logps": -259.1615905761719, + "debug/sppo_chosen_loss": 2667.42919921875, + "debug/sppo_chosen_reward_in_loss": -1.4655250310897827, + "debug/sppo_rej_reward_in_loss": -3.3792197704315186, + "debug/sppo_reject_loss": 2197.87646484375, + "epoch": 1.6847826086956523, + "grad_norm": 60994.94822797016, + "learning_rate": 9.278313173332697e-08, + "logits/chosen": 1.2817943096160889, + "logits/rejected": 1.5631263256072998, + "logps/chosen": -256.0505676269531, + "logps/rejected": -262.54083251953125, + "loss": 4761.1359, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -0.014655251987278461, + "rewards/margins": 0.019136948511004448, + "rewards/rejected": -0.033792201429605484, + "step": 465 + }, + { + "debug/policy_chosen_logits": 1.2423388957977295, + "debug/policy_chosen_logps": -234.31753540039062, + "debug/policy_rejected_logits": 1.8380489349365234, + "debug/policy_rejected_logps": -291.81353759765625, + "debug/reference_chosen_logps": -234.3294677734375, + "debug/reference_rejected_logps": -288.486083984375, + "debug/sppo_chosen_loss": 2512.909912109375, + "debug/sppo_chosen_reward_in_loss": 0.011934471316635609, + "debug/sppo_rej_reward_in_loss": -3.327467441558838, + "debug/sppo_reject_loss": 2219.69287109375, + "epoch": 1.7028985507246377, + "grad_norm": 59392.99089032479, + "learning_rate": 9.25891230233784e-08, + "logits/chosen": 1.2423388957977295, + "logits/rejected": 1.8380489349365234, + "logps/chosen": -234.31753540039062, + "logps/rejected": -291.81353759765625, + "loss": 4827.7793, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": 0.00011934452049899846, + "rewards/margins": 0.033394016325473785, + "rewards/rejected": -0.03327467292547226, + "step": 470 + }, + { + "debug/policy_chosen_logits": 1.3661912679672241, + "debug/policy_chosen_logps": -267.3763427734375, + "debug/policy_rejected_logits": 1.5575473308563232, + "debug/policy_rejected_logps": -279.25225830078125, + "debug/reference_chosen_logps": -268.66082763671875, + "debug/reference_rejected_logps": -276.802490234375, + "debug/sppo_chosen_loss": 2395.666748046875, + "debug/sppo_chosen_reward_in_loss": 1.284515142440796, + "debug/sppo_rej_reward_in_loss": -2.449761390686035, + "debug/sppo_reject_loss": 2320.32177734375, + "epoch": 1.721014492753623, + "grad_norm": 137145.32835366024, + "learning_rate": 9.239274950900804e-08, + "logits/chosen": 1.3661912679672241, + "logits/rejected": 1.5575473308563232, + "logps/chosen": -267.3763427734375, + "logps/rejected": -279.25225830078125, + "loss": 4731.8875, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.012845151126384735, + "rewards/margins": 0.03734276443719864, + "rewards/rejected": -0.024497613310813904, + "step": 475 + }, + { + "debug/policy_chosen_logits": 1.221142053604126, + "debug/policy_chosen_logps": -253.66561889648438, + "debug/policy_rejected_logits": 1.3845961093902588, + "debug/policy_rejected_logps": -257.0927734375, + "debug/reference_chosen_logps": -257.324951171875, + "debug/reference_rejected_logps": -257.91546630859375, + "debug/sppo_chosen_loss": 2159.9609375, + "debug/sppo_chosen_reward_in_loss": 3.659325122833252, + "debug/sppo_rej_reward_in_loss": 0.8226556777954102, + "debug/sppo_reject_loss": 2633.194091796875, + "epoch": 1.7391304347826086, + "grad_norm": 100541.81048766572, + "learning_rate": 9.219402209405519e-08, + "logits/chosen": 1.221142053604126, + "logits/rejected": 1.3845961093902588, + "logps/chosen": -253.66561889648438, + "logps/rejected": -257.0927734375, + "loss": 4748.7293, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": 0.036593250930309296, + "rewards/margins": 0.0283666905015707, + "rewards/rejected": 0.008226556703448296, + "step": 480 + }, + { + "debug/policy_chosen_logits": 1.4586317539215088, + "debug/policy_chosen_logps": -274.37286376953125, + "debug/policy_rejected_logits": 1.7072410583496094, + "debug/policy_rejected_logps": -322.7748107910156, + "debug/reference_chosen_logps": -275.3488464355469, + "debug/reference_rejected_logps": -318.627685546875, + "debug/sppo_chosen_loss": 2421.141357421875, + "debug/sppo_chosen_reward_in_loss": 0.975965678691864, + "debug/sppo_rej_reward_in_loss": -4.1471357345581055, + "debug/sppo_reject_loss": 2191.396484375, + "epoch": 1.7572463768115942, + "grad_norm": 59327.87743261923, + "learning_rate": 9.19929518130619e-08, + "logits/chosen": 1.4586317539215088, + "logits/rejected": 1.7072410583496094, + "logps/chosen": -274.37286376953125, + "logps/rejected": -322.7748107910156, + "loss": 4686.2148, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": 0.00975965615361929, + "rewards/margins": 0.0512310154736042, + "rewards/rejected": -0.04147135466337204, + "step": 485 + }, + { + "debug/policy_chosen_logits": 1.4661785364151, + "debug/policy_chosen_logps": -258.8686828613281, + "debug/policy_rejected_logits": 1.7450615167617798, + "debug/policy_rejected_logps": -294.00946044921875, + "debug/reference_chosen_logps": -258.397216796875, + "debug/reference_rejected_logps": -290.43145751953125, + "debug/sppo_chosen_loss": 2561.934814453125, + "debug/sppo_chosen_reward_in_loss": -0.47144660353660583, + "debug/sppo_rej_reward_in_loss": -3.578030824661255, + "debug/sppo_reject_loss": 2190.13330078125, + "epoch": 1.7753623188405796, + "grad_norm": 116417.92157649665, + "learning_rate": 9.178954983066031e-08, + "logits/chosen": 1.4661785364151, + "logits/rejected": 1.7450615167617798, + "logps/chosen": -258.8686828613281, + "logps/rejected": -294.00946044921875, + "loss": 4626.4297, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -0.004714466631412506, + "rewards/margins": 0.031065840274095535, + "rewards/rejected": -0.03578030690550804, + "step": 490 + }, + { + "debug/policy_chosen_logits": 1.1955199241638184, + "debug/policy_chosen_logps": -232.2951202392578, + "debug/policy_rejected_logits": 1.4762752056121826, + "debug/policy_rejected_logps": -290.5370178222656, + "debug/reference_chosen_logps": -231.7759246826172, + "debug/reference_rejected_logps": -286.0237121582031, + "debug/sppo_chosen_loss": 2573.547607421875, + "debug/sppo_chosen_reward_in_loss": -0.5191976428031921, + "debug/sppo_rej_reward_in_loss": -4.513314247131348, + "debug/sppo_reject_loss": 2124.11328125, + "epoch": 1.7934782608695652, + "grad_norm": 65420.3826874141, + "learning_rate": 9.15838274409526e-08, + "logits/chosen": 1.1955199241638184, + "logits/rejected": 1.4762752056121826, + "logps/chosen": -232.2951202392578, + "logps/rejected": -290.5370178222656, + "loss": 4745.8953, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -0.005191975738853216, + "rewards/margins": 0.03994116559624672, + "rewards/rejected": -0.045133139938116074, + "step": 495 + }, + { + "debug/policy_chosen_logits": 1.2134708166122437, + "debug/policy_chosen_logps": -261.5220031738281, + "debug/policy_rejected_logits": 1.4601694345474243, + "debug/policy_rejected_logps": -268.22821044921875, + "debug/reference_chosen_logps": -260.53173828125, + "debug/reference_rejected_logps": -265.97222900390625, + "debug/sppo_chosen_loss": 2622.430419921875, + "debug/sppo_chosen_reward_in_loss": -0.9902515411376953, + "debug/sppo_rej_reward_in_loss": -2.25596022605896, + "debug/sppo_reject_loss": 2303.806884765625, + "epoch": 1.8115942028985508, + "grad_norm": 73796.98161312056, + "learning_rate": 9.13757960668839e-08, + "logits/chosen": 1.2134708166122437, + "logits/rejected": 1.4601694345474243, + "logps/chosen": -261.5220031738281, + "logps/rejected": -268.22821044921875, + "loss": 4686.3855, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -0.00990251637995243, + "rewards/margins": 0.012657088227570057, + "rewards/rejected": -0.022559601813554764, + "step": 500 + }, + { + "epoch": 1.8115942028985508, + "eval_debug/policy_chosen_logits": 1.5348409414291382, + "eval_debug/policy_chosen_logps": -254.37594604492188, + "eval_debug/policy_rejected_logits": 1.5898981094360352, + "eval_debug/policy_rejected_logps": -263.16796875, + "eval_debug/reference_chosen_logps": -252.91845703125, + "eval_debug/reference_rejected_logps": -259.6585998535156, + "eval_debug/sppo_chosen_loss": 2678.08642578125, + "eval_debug/sppo_chosen_reward_in_loss": -1.4574708938598633, + "eval_debug/sppo_rej_reward_in_loss": -3.509323835372925, + "eval_debug/sppo_reject_loss": 2224.341552734375, + "eval_logits/chosen": 1.5348409414291382, + "eval_logits/rejected": 1.5898981094360352, + "eval_logps/chosen": -254.37594604492188, + "eval_logps/rejected": -263.16796875, + "eval_loss": 4767.90185546875, + "eval_rewards/accuracies": 0.5131579041481018, + "eval_rewards/chosen": -0.014574708417057991, + "eval_rewards/margins": 0.020518526434898376, + "eval_rewards/rejected": -0.03509323671460152, + "eval_runtime": 28.3258, + "eval_samples_per_second": 21.182, + "eval_steps_per_second": 0.671, + "step": 500 + }, + { + "debug/policy_chosen_logits": 1.278747320175171, + "debug/policy_chosen_logps": -251.4324188232422, + "debug/policy_rejected_logits": 1.3868252038955688, + "debug/policy_rejected_logps": -285.3130798339844, + "debug/reference_chosen_logps": -250.66650390625, + "debug/reference_rejected_logps": -280.2650451660156, + "debug/sppo_chosen_loss": 2590.232177734375, + "debug/sppo_chosen_reward_in_loss": -0.7659379839897156, + "debug/sppo_rej_reward_in_loss": -5.048047065734863, + "debug/sppo_reject_loss": 2071.307373046875, + "epoch": 1.8297101449275361, + "grad_norm": 65502.62694895854, + "learning_rate": 9.11654672596081e-08, + "logits/chosen": 1.278747320175171, + "logits/rejected": 1.3868252038955688, + "logps/chosen": -251.4324188232422, + "logps/rejected": -285.3130798339844, + "loss": 4641.1789, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -0.0076593803241848946, + "rewards/margins": 0.04282108694314957, + "rewards/rejected": -0.05048046633601189, + "step": 505 + }, + { + "debug/policy_chosen_logits": 1.4022386074066162, + "debug/policy_chosen_logps": -257.87750244140625, + "debug/policy_rejected_logits": 1.592950701713562, + "debug/policy_rejected_logps": -300.3731689453125, + "debug/reference_chosen_logps": -257.995361328125, + "debug/reference_rejected_logps": -295.6236267089844, + "debug/sppo_chosen_loss": 2500.1318359375, + "debug/sppo_chosen_reward_in_loss": 0.11788959801197052, + "debug/sppo_rej_reward_in_loss": -4.749524116516113, + "debug/sppo_reject_loss": 2101.89306640625, + "epoch": 1.8478260869565217, + "grad_norm": 90656.15151946226, + "learning_rate": 9.095285269784641e-08, + "logits/chosen": 1.4022386074066162, + "logits/rejected": 1.592950701713562, + "logps/chosen": -257.87750244140625, + "logps/rejected": -300.3731689453125, + "loss": 4618.7891, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": 0.0011788962874561548, + "rewards/margins": 0.04867414012551308, + "rewards/rejected": -0.047495242208242416, + "step": 510 + }, + { + "debug/policy_chosen_logits": 1.4164687395095825, + "debug/policy_chosen_logps": -256.8494873046875, + "debug/policy_rejected_logits": 1.567403793334961, + "debug/policy_rejected_logps": -271.0446472167969, + "debug/reference_chosen_logps": -255.5850830078125, + "debug/reference_rejected_logps": -265.862548828125, + "debug/sppo_chosen_loss": 2653.49267578125, + "debug/sppo_chosen_reward_in_loss": -1.2644188404083252, + "debug/sppo_rej_reward_in_loss": -5.182098388671875, + "debug/sppo_reject_loss": 2071.83935546875, + "epoch": 1.8659420289855073, + "grad_norm": 58337.062881640195, + "learning_rate": 9.073796418723882e-08, + "logits/chosen": 1.4164687395095825, + "logits/rejected": 1.567403793334961, + "logps/chosen": -256.8494873046875, + "logps/rejected": -271.0446472167969, + "loss": 4786.3289, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -0.012644186615943909, + "rewards/margins": 0.03917679563164711, + "rewards/rejected": -0.05182098224759102, + "step": 515 + }, + { + "debug/policy_chosen_logits": 1.5307259559631348, + "debug/policy_chosen_logps": -268.090576171875, + "debug/policy_rejected_logits": 1.6045730113983154, + "debug/policy_rejected_logps": -283.01275634765625, + "debug/reference_chosen_logps": -267.4107360839844, + "debug/reference_rejected_logps": -279.47845458984375, + "debug/sppo_chosen_loss": 2597.373046875, + "debug/sppo_chosen_reward_in_loss": -0.6798439025878906, + "debug/sppo_rej_reward_in_loss": -3.5343170166015625, + "debug/sppo_reject_loss": 2190.690673828125, + "epoch": 1.8840579710144927, + "grad_norm": 61099.75772656539, + "learning_rate": 9.05208136596887e-08, + "logits/chosen": 1.5307259559631348, + "logits/rejected": 1.6045730113983154, + "logps/chosen": -268.090576171875, + "logps/rejected": -283.01275634765625, + "loss": 4765.5648, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -0.006798438727855682, + "rewards/margins": 0.028544727712869644, + "rewards/rejected": -0.03534316644072533, + "step": 520 + }, + { + "debug/policy_chosen_logits": 1.2059051990509033, + "debug/policy_chosen_logps": -255.1666717529297, + "debug/policy_rejected_logits": 1.6417697668075562, + "debug/policy_rejected_logps": -289.1039123535156, + "debug/reference_chosen_logps": -254.3158416748047, + "debug/reference_rejected_logps": -288.393310546875, + "debug/sppo_chosen_loss": 2654.73095703125, + "debug/sppo_chosen_reward_in_loss": -0.8508337140083313, + "debug/sppo_rej_reward_in_loss": -0.7106183767318726, + "debug/sppo_reject_loss": 2452.39599609375, + "epoch": 1.9021739130434783, + "grad_norm": 69150.2726035533, + "learning_rate": 9.030141317270026e-08, + "logits/chosen": 1.2059051990509033, + "logits/rejected": 1.6417697668075562, + "logps/chosen": -255.1666717529297, + "logps/rejected": -289.1039123535156, + "loss": 4758.4645, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -0.00850833673030138, + "rewards/margins": -0.0014021530514582992, + "rewards/rejected": -0.0071061840280890465, + "step": 525 + }, + { + "debug/policy_chosen_logits": 1.1592168807983398, + "debug/policy_chosen_logps": -254.8201446533203, + "debug/policy_rejected_logits": 1.2738714218139648, + "debug/policy_rejected_logps": -289.788330078125, + "debug/reference_chosen_logps": -253.6742401123047, + "debug/reference_rejected_logps": -286.6373596191406, + "debug/sppo_chosen_loss": 2657.622314453125, + "debug/sppo_chosen_reward_in_loss": -1.1458953619003296, + "debug/sppo_rej_reward_in_loss": -3.150979518890381, + "debug/sppo_reject_loss": 2247.098388671875, + "epoch": 1.9202898550724639, + "grad_norm": 62523.02200109494, + "learning_rate": 9.007977490870885e-08, + "logits/chosen": 1.1592168807983398, + "logits/rejected": 1.2738714218139648, + "logps/chosen": -254.8201446533203, + "logps/rejected": -289.788330078125, + "loss": 4778.1098, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.011458953842520714, + "rewards/margins": 0.020050838589668274, + "rewards/rejected": -0.03150979429483414, + "step": 530 + }, + { + "debug/policy_chosen_logits": 1.4269134998321533, + "debug/policy_chosen_logps": -235.1399688720703, + "debug/policy_rejected_logits": 1.690899133682251, + "debug/policy_rejected_logps": -264.2394104003906, + "debug/reference_chosen_logps": -234.9927520751953, + "debug/reference_rejected_logps": -260.7703552246094, + "debug/sppo_chosen_loss": 2530.692138671875, + "debug/sppo_chosen_reward_in_loss": -0.14720706641674042, + "debug/sppo_rej_reward_in_loss": -3.4690022468566895, + "debug/sppo_reject_loss": 2216.19677734375, + "epoch": 1.9384057971014492, + "grad_norm": 78624.62327767034, + "learning_rate": 8.985591117440483e-08, + "logits/chosen": 1.4269134998321533, + "logits/rejected": 1.690899133682251, + "logps/chosen": -235.1399688720703, + "logps/rejected": -264.2394104003906, + "loss": 4685.6562, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.0014720701146870852, + "rewards/margins": 0.03321795165538788, + "rewards/rejected": -0.034690018743276596, + "step": 535 + }, + { + "debug/policy_chosen_logits": 1.0426114797592163, + "debug/policy_chosen_logps": -249.03506469726562, + "debug/policy_rejected_logits": 1.3688905239105225, + "debug/policy_rejected_logps": -278.43450927734375, + "debug/reference_chosen_logps": -248.1215362548828, + "debug/reference_rejected_logps": -276.47796630859375, + "debug/sppo_chosen_loss": 2628.769775390625, + "debug/sppo_chosen_reward_in_loss": -0.9134899377822876, + "debug/sppo_rej_reward_in_loss": -1.9565985202789307, + "debug/sppo_reject_loss": 2334.08935546875, + "epoch": 1.9565217391304348, + "grad_norm": 64105.71961637225, + "learning_rate": 8.962983440004998e-08, + "logits/chosen": 1.0426114797592163, + "logits/rejected": 1.3688905239105225, + "logps/chosen": -249.03506469726562, + "logps/rejected": -278.43450927734375, + "loss": 4719.7977, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -0.009134897962212563, + "rewards/margins": 0.010431085713207722, + "rewards/rejected": -0.01956598274409771, + "step": 540 + }, + { + "debug/policy_chosen_logits": 1.1802746057510376, + "debug/policy_chosen_logps": -262.9992980957031, + "debug/policy_rejected_logits": 1.4178202152252197, + "debug/policy_rejected_logps": -295.21051025390625, + "debug/reference_chosen_logps": -261.45489501953125, + "debug/reference_rejected_logps": -292.7149658203125, + "debug/sppo_chosen_loss": 2691.10888671875, + "debug/sppo_chosen_reward_in_loss": -1.5444284677505493, + "debug/sppo_rej_reward_in_loss": -2.495530128479004, + "debug/sppo_reject_loss": 2297.989501953125, + "epoch": 1.9746376811594204, + "grad_norm": 58590.81549818591, + "learning_rate": 8.940155713878738e-08, + "logits/chosen": 1.1802746057510376, + "logits/rejected": 1.4178202152252197, + "logps/chosen": -262.9992980957031, + "logps/rejected": -295.21051025390625, + "loss": 4644.4012, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.015444284304976463, + "rewards/margins": 0.009511016309261322, + "rewards/rejected": -0.024955300614237785, + "step": 545 + }, + { + "debug/policy_chosen_logits": 0.9551759958267212, + "debug/policy_chosen_logps": -247.09033203125, + "debug/policy_rejected_logits": 1.022687315940857, + "debug/policy_rejected_logps": -282.8631591796875, + "debug/reference_chosen_logps": -247.67520141601562, + "debug/reference_rejected_logps": -279.9501647949219, + "debug/sppo_chosen_loss": 2454.7587890625, + "debug/sppo_chosen_reward_in_loss": 0.5848686099052429, + "debug/sppo_rej_reward_in_loss": -2.9129879474639893, + "debug/sppo_reject_loss": 2258.426513671875, + "epoch": 1.9927536231884058, + "grad_norm": 64771.314710374965, + "learning_rate": 8.91710920659444e-08, + "logits/chosen": 0.9551759958267212, + "logits/rejected": 1.022687315940857, + "logps/chosen": -247.09033203125, + "logps/rejected": -282.8631591796875, + "loss": 4761.4836, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": 0.005848685745149851, + "rewards/margins": 0.03497856482863426, + "rewards/rejected": -0.02912987396121025, + "step": 550 + }, + { + "debug/policy_chosen_logits": 1.0029628276824951, + "debug/policy_chosen_logps": -234.5623016357422, + "debug/policy_rejected_logits": 1.6541814804077148, + "debug/policy_rejected_logps": -285.01568603515625, + "debug/reference_chosen_logps": -234.8700714111328, + "debug/reference_rejected_logps": -281.872314453125, + "debug/sppo_chosen_loss": 2493.84619140625, + "debug/sppo_chosen_reward_in_loss": 0.307760626077652, + "debug/sppo_rej_reward_in_loss": -3.143383502960205, + "debug/sppo_reject_loss": 2232.835205078125, + "epoch": 2.010869565217391, + "grad_norm": 69525.29332871876, + "learning_rate": 8.89384519783289e-08, + "logits/chosen": 1.0029628276824951, + "logits/rejected": 1.6541814804077148, + "logps/chosen": -234.5623016357422, + "logps/rejected": -285.01568603515625, + "loss": 4813.4883, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": 0.003077606437727809, + "rewards/margins": 0.03451143950223923, + "rewards/rejected": -0.0314338319003582, + "step": 555 + }, + { + "debug/policy_chosen_logits": 1.5291016101837158, + "debug/policy_chosen_logps": -235.267822265625, + "debug/policy_rejected_logits": 1.787105917930603, + "debug/policy_rejected_logps": -272.7067565917969, + "debug/reference_chosen_logps": -236.3900604248047, + "debug/reference_rejected_logps": -269.92254638671875, + "debug/sppo_chosen_loss": 2397.44677734375, + "debug/sppo_chosen_reward_in_loss": 1.1222379207611084, + "debug/sppo_rej_reward_in_loss": -2.7842187881469727, + "debug/sppo_reject_loss": 2279.22802734375, + "epoch": 2.028985507246377, + "grad_norm": 62880.48737554153, + "learning_rate": 8.87036497935186e-08, + "logits/chosen": 1.5291016101837158, + "logits/rejected": 1.787105917930603, + "logps/chosen": -235.267822265625, + "logps/rejected": -272.7067565917969, + "loss": 4683.1055, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.01122237928211689, + "rewards/margins": 0.03906456381082535, + "rewards/rejected": -0.027842188253998756, + "step": 560 + }, + { + "debug/policy_chosen_logits": 1.333370566368103, + "debug/policy_chosen_logps": -250.8878173828125, + "debug/policy_rejected_logits": 1.6568689346313477, + "debug/policy_rejected_logps": -294.1556091308594, + "debug/reference_chosen_logps": -251.97714233398438, + "debug/reference_rejected_logps": -290.07403564453125, + "debug/sppo_chosen_loss": 2402.43994140625, + "debug/sppo_chosen_reward_in_loss": 1.0893455743789673, + "debug/sppo_rej_reward_in_loss": -4.081561088562012, + "debug/sppo_reject_loss": 2164.110107421875, + "epoch": 2.0471014492753623, + "grad_norm": 67652.64805516161, + "learning_rate": 8.846669854914395e-08, + "logits/chosen": 1.333370566368103, + "logits/rejected": 1.6568689346313477, + "logps/chosen": -250.8878173828125, + "logps/rejected": -294.1556091308594, + "loss": 4661.5477, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": 0.01089345570653677, + "rewards/margins": 0.051709067076444626, + "rewards/rejected": -0.04081561043858528, + "step": 565 + }, + { + "debug/policy_chosen_logits": 1.3098406791687012, + "debug/policy_chosen_logps": -255.63565063476562, + "debug/policy_rejected_logits": 1.6657825708389282, + "debug/policy_rejected_logps": -292.25701904296875, + "debug/reference_chosen_logps": -254.9364776611328, + "debug/reference_rejected_logps": -289.62884521484375, + "debug/sppo_chosen_loss": 2624.39404296875, + "debug/sppo_chosen_reward_in_loss": -0.6991499066352844, + "debug/sppo_rej_reward_in_loss": -2.6281533241271973, + "debug/sppo_reject_loss": 2296.69287109375, + "epoch": 2.0652173913043477, + "grad_norm": 72051.52919968299, + "learning_rate": 8.8227611402164e-08, + "logits/chosen": 1.3098406791687012, + "logits/rejected": 1.6657825708389282, + "logps/chosen": -255.63565063476562, + "logps/rejected": -292.25701904296875, + "loss": 4720.9078, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -0.006991499103605747, + "rewards/margins": 0.019290033727884293, + "rewards/rejected": -0.026281530037522316, + "step": 570 + }, + { + "debug/policy_chosen_logits": 1.2542155981063843, + "debug/policy_chosen_logps": -253.7539825439453, + "debug/policy_rejected_logits": 1.5640454292297363, + "debug/policy_rejected_logps": -275.55792236328125, + "debug/reference_chosen_logps": -253.73959350585938, + "debug/reference_rejected_logps": -272.0418395996094, + "debug/sppo_chosen_loss": 2514.656982421875, + "debug/sppo_chosen_reward_in_loss": -0.014367866329848766, + "debug/sppo_rej_reward_in_loss": -3.5160465240478516, + "debug/sppo_reject_loss": 2216.046630859375, + "epoch": 2.0833333333333335, + "grad_norm": 71283.46672505948, + "learning_rate": 8.798640162813607e-08, + "logits/chosen": 1.2542155981063843, + "logits/rejected": 1.5640454292297363, + "logps/chosen": -253.7539825439453, + "logps/rejected": -275.55792236328125, + "loss": 4628.4422, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -0.00014367885887622833, + "rewards/margins": 0.03501678630709648, + "rewards/rejected": -0.03516046330332756, + "step": 575 + }, + { + "debug/policy_chosen_logits": 0.8257444500923157, + "debug/policy_chosen_logps": -239.25830078125, + "debug/policy_rejected_logits": 1.225967526435852, + "debug/policy_rejected_logps": -295.7880859375, + "debug/reference_chosen_logps": -238.2715606689453, + "debug/reference_rejected_logps": -293.7236022949219, + "debug/sppo_chosen_loss": 2639.497802734375, + "debug/sppo_chosen_reward_in_loss": -0.9867492914199829, + "debug/sppo_rej_reward_in_loss": -2.0644497871398926, + "debug/sppo_reject_loss": 2316.61572265625, + "epoch": 2.101449275362319, + "grad_norm": 64002.07229866191, + "learning_rate": 8.774308262047847e-08, + "logits/chosen": 0.8257444500923157, + "logits/rejected": 1.225967526435852, + "logps/chosen": -239.25830078125, + "logps/rejected": -295.7880859375, + "loss": 4701.6742, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -0.009867492131888866, + "rewards/margins": 0.010777004063129425, + "rewards/rejected": -0.020644497126340866, + "step": 580 + }, + { + "debug/policy_chosen_logits": 1.3923568725585938, + "debug/policy_chosen_logps": -267.9619140625, + "debug/policy_rejected_logits": 1.6333366632461548, + "debug/policy_rejected_logps": -270.2325134277344, + "debug/reference_chosen_logps": -268.8836975097656, + "debug/reference_rejected_logps": -266.7135314941406, + "debug/sppo_chosen_loss": 2432.814453125, + "debug/sppo_chosen_reward_in_loss": 0.9217990636825562, + "debug/sppo_rej_reward_in_loss": -3.519031047821045, + "debug/sppo_reject_loss": 2213.14453125, + "epoch": 2.119565217391304, + "grad_norm": 75378.71430621752, + "learning_rate": 8.749766788972685e-08, + "logits/chosen": 1.3923568725585938, + "logits/rejected": 1.6333366632461548, + "logps/chosen": -267.9619140625, + "logps/rejected": -270.2325134277344, + "loss": 4584.3477, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": 0.009217990562319756, + "rewards/margins": 0.04440830275416374, + "rewards/rejected": -0.03519031032919884, + "step": 585 + }, + { + "debug/policy_chosen_logits": 1.3195666074752808, + "debug/policy_chosen_logps": -250.1842803955078, + "debug/policy_rejected_logits": 1.6939647197723389, + "debug/policy_rejected_logps": -320.55340576171875, + "debug/reference_chosen_logps": -250.12686157226562, + "debug/reference_rejected_logps": -315.68890380859375, + "debug/sppo_chosen_loss": 2529.84912109375, + "debug/sppo_chosen_reward_in_loss": -0.057431600987911224, + "debug/sppo_rej_reward_in_loss": -4.864499092102051, + "debug/sppo_reject_loss": 2121.0732421875, + "epoch": 2.13768115942029, + "grad_norm": 61422.39294040227, + "learning_rate": 8.725017106278406e-08, + "logits/chosen": 1.3195666074752808, + "logits/rejected": 1.6939647197723389, + "logps/chosen": -250.1842803955078, + "logps/rejected": -320.55340576171875, + "loss": 4650.8539, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.0005743157234974205, + "rewards/margins": 0.04807067662477493, + "rewards/rejected": -0.04864499717950821, + "step": 590 + }, + { + "debug/policy_chosen_logits": 0.9939780235290527, + "debug/policy_chosen_logps": -235.31338500976562, + "debug/policy_rejected_logits": 1.3324755430221558, + "debug/policy_rejected_logps": -293.82421875, + "debug/reference_chosen_logps": -235.8861541748047, + "debug/reference_rejected_logps": -288.54498291015625, + "debug/sppo_chosen_loss": 2456.348876953125, + "debug/sppo_chosen_reward_in_loss": 0.5727742910385132, + "debug/sppo_rej_reward_in_loss": -5.279238700866699, + "debug/sppo_reject_loss": 2089.032470703125, + "epoch": 2.1557971014492754, + "grad_norm": 120561.10121499117, + "learning_rate": 8.700060588216336e-08, + "logits/chosen": 0.9939780235290527, + "logits/rejected": 1.3324755430221558, + "logps/chosen": -235.31338500976562, + "logps/rejected": -293.82421875, + "loss": 4629.7258, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": 0.005727742798626423, + "rewards/margins": 0.0585201270878315, + "rewards/rejected": -0.05279238149523735, + "step": 595 + }, + { + "debug/policy_chosen_logits": 1.170904278755188, + "debug/policy_chosen_logps": -241.5663604736328, + "debug/policy_rejected_logits": 1.3933851718902588, + "debug/policy_rejected_logps": -263.28515625, + "debug/reference_chosen_logps": -241.9515838623047, + "debug/reference_rejected_logps": -258.27471923828125, + "debug/sppo_chosen_loss": 2477.68212890625, + "debug/sppo_chosen_reward_in_loss": 0.38524895906448364, + "debug/sppo_rej_reward_in_loss": -5.010422706604004, + "debug/sppo_reject_loss": 2074.12890625, + "epoch": 2.1739130434782608, + "grad_norm": 69013.1201950737, + "learning_rate": 8.674898620522557e-08, + "logits/chosen": 1.170904278755188, + "logits/rejected": 1.3933851718902588, + "logps/chosen": -241.5663604736328, + "logps/rejected": -263.28515625, + "loss": 4647.1707, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": 0.003852488938719034, + "rewards/margins": 0.05395671725273132, + "rewards/rejected": -0.050104230642318726, + "step": 600 + }, + { + "epoch": 2.1739130434782608, + "eval_debug/policy_chosen_logits": 1.5053505897521973, + "eval_debug/policy_chosen_logps": -253.22561645507812, + "eval_debug/policy_rejected_logits": 1.5586278438568115, + "eval_debug/policy_rejected_logps": -262.3002624511719, + "eval_debug/reference_chosen_logps": -252.91845703125, + "eval_debug/reference_rejected_logps": -259.6585998535156, + "eval_debug/sppo_chosen_loss": 2562.319091796875, + "eval_debug/sppo_chosen_reward_in_loss": -0.30712229013442993, + "eval_debug/sppo_rej_reward_in_loss": -2.6416518688201904, + "eval_debug/sppo_reject_loss": 2304.574462890625, + "eval_logits/chosen": 1.5053505897521973, + "eval_logits/rejected": 1.5586278438568115, + "eval_logps/chosen": -253.22561645507812, + "eval_logps/rejected": -262.3002624511719, + "eval_loss": 4725.65478515625, + "eval_rewards/accuracies": 0.5394737124443054, + "eval_rewards/chosen": -0.0030712224543094635, + "eval_rewards/margins": 0.02334529533982277, + "eval_rewards/rejected": -0.026416515931487083, + "eval_runtime": 28.4753, + "eval_samples_per_second": 21.071, + "eval_steps_per_second": 0.667, + "step": 600 + }, + { + "debug/policy_chosen_logits": 1.2302472591400146, + "debug/policy_chosen_logps": -245.91958618164062, + "debug/policy_rejected_logits": 1.8347580432891846, + "debug/policy_rejected_logps": -305.92706298828125, + "debug/reference_chosen_logps": -245.0672607421875, + "debug/reference_rejected_logps": -303.65692138671875, + "debug/sppo_chosen_loss": 2615.14404296875, + "debug/sppo_chosen_reward_in_loss": -0.8523017764091492, + "debug/sppo_rej_reward_in_loss": -2.2700839042663574, + "debug/sppo_reject_loss": 2321.53369140625, + "epoch": 2.1920289855072466, + "grad_norm": 68805.59854839399, + "learning_rate": 8.649532600340945e-08, + "logits/chosen": 1.2302472591400146, + "logits/rejected": 1.8347580432891846, + "logps/chosen": -245.91958618164062, + "logps/rejected": -305.92706298828125, + "loss": 4716.0961, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -0.008523017168045044, + "rewards/margins": 0.01417782437056303, + "rewards/rejected": -0.0227008406072855, + "step": 605 + }, + { + "debug/policy_chosen_logits": 1.2323944568634033, + "debug/policy_chosen_logps": -251.2030487060547, + "debug/policy_rejected_logits": 1.3216893672943115, + "debug/policy_rejected_logps": -264.74530029296875, + "debug/reference_chosen_logps": -251.7701873779297, + "debug/reference_rejected_logps": -263.05694580078125, + "debug/sppo_chosen_loss": 2460.92529296875, + "debug/sppo_chosen_reward_in_loss": 0.5671443939208984, + "debug/sppo_rej_reward_in_loss": -1.688367247581482, + "debug/sppo_reject_loss": 2375.023681640625, + "epoch": 2.210144927536232, + "grad_norm": 89575.39371679806, + "learning_rate": 8.6239639361456e-08, + "logits/chosen": 1.2323944568634033, + "logits/rejected": 1.3216893672943115, + "logps/chosen": -251.2030487060547, + "logps/rejected": -264.74530029296875, + "loss": 4727.2539, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": 0.005671444348990917, + "rewards/margins": 0.022555116564035416, + "rewards/rejected": -0.016883673146367073, + "step": 610 + }, + { + "debug/policy_chosen_logits": 1.2631947994232178, + "debug/policy_chosen_logps": -281.2205505371094, + "debug/policy_rejected_logits": 1.4563754796981812, + "debug/policy_rejected_logps": -297.29949951171875, + "debug/reference_chosen_logps": -281.66461181640625, + "debug/reference_rejected_logps": -295.0270080566406, + "debug/sppo_chosen_loss": 2473.56298828125, + "debug/sppo_chosen_reward_in_loss": 0.44406241178512573, + "debug/sppo_rej_reward_in_loss": -2.272524356842041, + "debug/sppo_reject_loss": 2317.353515625, + "epoch": 2.2282608695652173, + "grad_norm": 88394.34834173425, + "learning_rate": 8.598194047662634e-08, + "logits/chosen": 1.2631947994232178, + "logits/rejected": 1.4563754796981812, + "logps/chosen": -281.2205505371094, + "logps/rejected": -297.29949951171875, + "loss": 4709.5641, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": 0.004440623801201582, + "rewards/margins": 0.027165865525603294, + "rewards/rejected": -0.022725243121385574, + "step": 615 + }, + { + "debug/policy_chosen_logits": 0.9352153539657593, + "debug/policy_chosen_logps": -233.7841339111328, + "debug/policy_rejected_logits": 1.4436912536621094, + "debug/policy_rejected_logps": -316.3915100097656, + "debug/reference_chosen_logps": -233.95346069335938, + "debug/reference_rejected_logps": -311.99176025390625, + "debug/sppo_chosen_loss": 2502.010498046875, + "debug/sppo_chosen_reward_in_loss": 0.16931553184986115, + "debug/sppo_rej_reward_in_loss": -4.399728298187256, + "debug/sppo_reject_loss": 2132.748779296875, + "epoch": 2.246376811594203, + "grad_norm": 57675.208745475386, + "learning_rate": 8.572224365791348e-08, + "logits/chosen": 0.9352153539657593, + "logits/rejected": 1.4436912536621094, + "logps/chosen": -233.7841339111328, + "logps/rejected": -316.3915100097656, + "loss": 4686.8801, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": 0.0016931556165218353, + "rewards/margins": 0.04569043964147568, + "rewards/rejected": -0.043997280299663544, + "step": 620 + }, + { + "debug/policy_chosen_logits": 1.0854655504226685, + "debug/policy_chosen_logps": -250.3056182861328, + "debug/policy_rejected_logits": 1.247107744216919, + "debug/policy_rejected_logps": -271.73699951171875, + "debug/reference_chosen_logps": -250.57568359375, + "debug/reference_rejected_logps": -266.9862976074219, + "debug/sppo_chosen_loss": 2503.90576171875, + "debug/sppo_chosen_reward_in_loss": 0.2700786590576172, + "debug/sppo_rej_reward_in_loss": -4.750700950622559, + "debug/sppo_reject_loss": 2117.43603515625, + "epoch": 2.2644927536231885, + "grad_norm": 65162.73558403237, + "learning_rate": 8.546056332524771e-08, + "logits/chosen": 1.0854655504226685, + "logits/rejected": 1.247107744216919, + "logps/chosen": -250.3056182861328, + "logps/rejected": -271.73699951171875, + "loss": 4696.8086, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": 0.0027007872704416513, + "rewards/margins": 0.05020779371261597, + "rewards/rejected": -0.04750701040029526, + "step": 625 + }, + { + "debug/policy_chosen_logits": 1.3634555339813232, + "debug/policy_chosen_logps": -284.90277099609375, + "debug/policy_rejected_logits": 1.4885923862457275, + "debug/policy_rejected_logps": -262.86126708984375, + "debug/reference_chosen_logps": -283.0180969238281, + "debug/reference_rejected_logps": -259.73748779296875, + "debug/sppo_chosen_loss": 2754.628662109375, + "debug/sppo_chosen_reward_in_loss": -1.8847014904022217, + "debug/sppo_rej_reward_in_loss": -3.123795986175537, + "debug/sppo_reject_loss": 2240.63037109375, + "epoch": 2.282608695652174, + "grad_norm": 64391.60616811912, + "learning_rate": 8.519691400869593e-08, + "logits/chosen": 1.3634555339813232, + "logits/rejected": 1.4885923862457275, + "logps/chosen": -284.90277099609375, + "logps/rejected": -262.86126708984375, + "loss": 4721.7711, + "rewards/accuracies": 0.574999988079071, + "rewards/chosen": -0.018847014755010605, + "rewards/margins": 0.012390943244099617, + "rewards/rejected": -0.03123795986175537, + "step": 630 + }, + { + "debug/policy_chosen_logits": 1.2123847007751465, + "debug/policy_chosen_logps": -235.73715209960938, + "debug/policy_rejected_logits": 1.4528264999389648, + "debug/policy_rejected_logps": -287.69244384765625, + "debug/reference_chosen_logps": -236.39944458007812, + "debug/reference_rejected_logps": -285.35406494140625, + "debug/sppo_chosen_loss": 2444.96435546875, + "debug/sppo_chosen_reward_in_loss": 0.6622905731201172, + "debug/sppo_rej_reward_in_loss": -2.338413953781128, + "debug/sppo_reject_loss": 2319.29736328125, + "epoch": 2.300724637681159, + "grad_norm": 104523.93148426696, + "learning_rate": 8.493131034765493e-08, + "logits/chosen": 1.2123847007751465, + "logits/rejected": 1.4528264999389648, + "logps/chosen": -235.73715209960938, + "logps/rejected": -287.69244384765625, + "loss": 4711.3289, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": 0.006622905842959881, + "rewards/margins": 0.030007043853402138, + "rewards/rejected": -0.02338413894176483, + "step": 635 + }, + { + "debug/policy_chosen_logits": 1.0598101615905762, + "debug/policy_chosen_logps": -259.14007568359375, + "debug/policy_rejected_logits": 1.4087841510772705, + "debug/policy_rejected_logps": -289.23944091796875, + "debug/reference_chosen_logps": -259.8706359863281, + "debug/reference_rejected_logps": -284.48968505859375, + "debug/sppo_chosen_loss": 2438.184326171875, + "debug/sppo_chosen_reward_in_loss": 0.7305816411972046, + "debug/sppo_rej_reward_in_loss": -4.749767780303955, + "debug/sppo_reject_loss": 2118.009765625, + "epoch": 2.318840579710145, + "grad_norm": 72785.5043936409, + "learning_rate": 8.46637670900384e-08, + "logits/chosen": 1.0598101615905762, + "logits/rejected": 1.4087841510772705, + "logps/chosen": -259.14007568359375, + "logps/rejected": -289.23944091796875, + "loss": 4633.6555, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": 0.00730581721290946, + "rewards/margins": 0.05480349063873291, + "rewards/rejected": -0.04749767482280731, + "step": 640 + }, + { + "debug/policy_chosen_logits": 1.2692564725875854, + "debug/policy_chosen_logps": -271.09075927734375, + "debug/policy_rejected_logits": 1.205157995223999, + "debug/policy_rejected_logps": -245.5635223388672, + "debug/reference_chosen_logps": -280.4898681640625, + "debug/reference_rejected_logps": -251.0946807861328, + "debug/sppo_chosen_loss": 1798.6510009765625, + "debug/sppo_chosen_reward_in_loss": 9.399101257324219, + "debug/sppo_rej_reward_in_loss": 5.531121730804443, + "debug/sppo_reject_loss": 3291.958984375, + "epoch": 2.3369565217391304, + "grad_norm": 86753.47801949162, + "learning_rate": 8.439429909145816e-08, + "logits/chosen": 1.2692564725875854, + "logits/rejected": 1.205157995223999, + "logps/chosen": -271.09075927734375, + "logps/rejected": -245.5635223388672, + "loss": 5429.5953, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": 0.0939910039305687, + "rewards/margins": 0.038679786026477814, + "rewards/rejected": 0.05531121417880058, + "step": 645 + }, + { + "debug/policy_chosen_logits": 1.5434187650680542, + "debug/policy_chosen_logps": -259.09027099609375, + "debug/policy_rejected_logits": 1.58522367477417, + "debug/policy_rejected_logps": -283.14508056640625, + "debug/reference_chosen_logps": -268.76470947265625, + "debug/reference_rejected_logps": -293.04833984375, + "debug/sppo_chosen_loss": 1690.5296630859375, + "debug/sppo_chosen_reward_in_loss": 9.674398422241211, + "debug/sppo_rej_reward_in_loss": 9.90326976776123, + "debug/sppo_reject_loss": 3680.02392578125, + "epoch": 2.355072463768116, + "grad_norm": 71014.8256938341, + "learning_rate": 8.412292131439924e-08, + "logits/chosen": 1.5434187650680542, + "logits/rejected": 1.58522367477417, + "logps/chosen": -259.09027099609375, + "logps/rejected": -283.14508056640625, + "loss": 5486.1242, + "rewards/accuracies": 0.5249999761581421, + "rewards/chosen": 0.09674398601055145, + "rewards/margins": -0.002288705902174115, + "rewards/rejected": 0.0990326926112175, + "step": 650 + }, + { + "debug/policy_chosen_logits": 1.3143525123596191, + "debug/policy_chosen_logps": -252.29421997070312, + "debug/policy_rejected_logits": 1.6215347051620483, + "debug/policy_rejected_logps": -290.984130859375, + "debug/reference_chosen_logps": -257.20928955078125, + "debug/reference_rejected_logps": -293.06396484375, + "debug/sppo_chosen_loss": 2047.283203125, + "debug/sppo_chosen_reward_in_loss": 4.915032863616943, + "debug/sppo_rej_reward_in_loss": 2.0798301696777344, + "debug/sppo_reject_loss": 2748.5693359375, + "epoch": 2.3731884057971016, + "grad_norm": 63977.00550042232, + "learning_rate": 8.3849648827389e-08, + "logits/chosen": 1.3143525123596191, + "logits/rejected": 1.6215347051620483, + "logps/chosen": -252.29421997070312, + "logps/rejected": -290.984130859375, + "loss": 4779.2102, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": 0.04915032535791397, + "rewards/margins": 0.028352027758955956, + "rewards/rejected": 0.020798301324248314, + "step": 655 + }, + { + "debug/policy_chosen_logits": 1.0396404266357422, + "debug/policy_chosen_logps": -263.76708984375, + "debug/policy_rejected_logits": 1.2717006206512451, + "debug/policy_rejected_logps": -282.8274841308594, + "debug/reference_chosen_logps": -265.6632080078125, + "debug/reference_rejected_logps": -280.2948303222656, + "debug/sppo_chosen_loss": 2353.92333984375, + "debug/sppo_chosen_reward_in_loss": 1.8960940837860107, + "debug/sppo_rej_reward_in_loss": -2.5326685905456543, + "debug/sppo_reject_loss": 2308.527099609375, + "epoch": 2.391304347826087, + "grad_norm": 59400.02624948723, + "learning_rate": 8.357449680416058e-08, + "logits/chosen": 1.0396404266357422, + "logits/rejected": 1.2717006206512451, + "logps/chosen": -263.76708984375, + "logps/rejected": -282.8274841308594, + "loss": 4731.8664, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": 0.018960941582918167, + "rewards/margins": 0.044287629425525665, + "rewards/rejected": -0.025326687842607498, + "step": 660 + }, + { + "debug/policy_chosen_logits": 1.234412670135498, + "debug/policy_chosen_logps": -243.5044403076172, + "debug/policy_rejected_logits": 1.6132608652114868, + "debug/policy_rejected_logps": -273.7642822265625, + "debug/reference_chosen_logps": -245.8137969970703, + "debug/reference_rejected_logps": -272.16265869140625, + "debug/sppo_chosen_loss": 2285.48583984375, + "debug/sppo_chosen_reward_in_loss": 2.3093769550323486, + "debug/sppo_rej_reward_in_loss": -1.601636290550232, + "debug/sppo_reject_loss": 2426.29541015625, + "epoch": 2.4094202898550723, + "grad_norm": 58602.8192263023, + "learning_rate": 8.32974805228102e-08, + "logits/chosen": 1.234412670135498, + "logits/rejected": 1.6132608652114868, + "logps/chosen": -243.5044403076172, + "logps/rejected": -273.7642822265625, + "loss": 4634.8891, + "rewards/accuracies": 0.574999988079071, + "rewards/chosen": 0.02309376932680607, + "rewards/margins": 0.039110131561756134, + "rewards/rejected": -0.016016360372304916, + "step": 665 + }, + { + "debug/policy_chosen_logits": 1.563645601272583, + "debug/policy_chosen_logps": -278.4286804199219, + "debug/policy_rejected_logits": 1.7810630798339844, + "debug/policy_rejected_logps": -304.18304443359375, + "debug/reference_chosen_logps": -281.46917724609375, + "debug/reference_rejected_logps": -302.9301452636719, + "debug/sppo_chosen_loss": 2210.00927734375, + "debug/sppo_chosen_reward_in_loss": 3.040518045425415, + "debug/sppo_rej_reward_in_loss": -1.2529163360595703, + "debug/sppo_reject_loss": 2420.52783203125, + "epoch": 2.427536231884058, + "grad_norm": 68478.8669991549, + "learning_rate": 8.301861536494898e-08, + "logits/chosen": 1.563645601272583, + "logits/rejected": 1.7810630798339844, + "logps/chosen": -278.4286804199219, + "logps/rejected": -304.18304443359375, + "loss": 4713.7953, + "rewards/accuracies": 0.824999988079071, + "rewards/chosen": 0.030405178666114807, + "rewards/margins": 0.04293433949351311, + "rewards/rejected": -0.012529164552688599, + "step": 670 + }, + { + "debug/policy_chosen_logits": 1.1350958347320557, + "debug/policy_chosen_logps": -234.85903930664062, + "debug/policy_rejected_logits": 1.3057701587677002, + "debug/policy_rejected_logps": -273.68170166015625, + "debug/reference_chosen_logps": -235.17245483398438, + "debug/reference_rejected_logps": -270.46295166015625, + "debug/sppo_chosen_loss": 2496.69873046875, + "debug/sppo_chosen_reward_in_loss": 0.31343594193458557, + "debug/sppo_rej_reward_in_loss": -3.2187705039978027, + "debug/sppo_reject_loss": 2251.916259765625, + "epoch": 2.4456521739130435, + "grad_norm": 68306.26071631578, + "learning_rate": 8.273791681484874e-08, + "logits/chosen": 1.1350958347320557, + "logits/rejected": 1.3057701587677002, + "logps/chosen": -234.85903930664062, + "logps/rejected": -273.68170166015625, + "loss": 4683.6773, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": 0.003134358674287796, + "rewards/margins": 0.03532206267118454, + "rewards/rejected": -0.03218770772218704, + "step": 675 + }, + { + "debug/policy_chosen_logits": 1.5269218683242798, + "debug/policy_chosen_logps": -283.9064025878906, + "debug/policy_rejected_logits": 1.508864402770996, + "debug/policy_rejected_logps": -261.4410095214844, + "debug/reference_chosen_logps": -283.28729248046875, + "debug/reference_rejected_logps": -259.95697021484375, + "debug/sppo_chosen_loss": 2623.616943359375, + "debug/sppo_chosen_reward_in_loss": -0.6191161870956421, + "debug/sppo_rej_reward_in_loss": -1.484053373336792, + "debug/sppo_reject_loss": 2396.083251953125, + "epoch": 2.463768115942029, + "grad_norm": 82805.1153427768, + "learning_rate": 8.245540045858228e-08, + "logits/chosen": 1.5269218683242798, + "logits/rejected": 1.508864402770996, + "logps/chosen": -283.9064025878906, + "logps/rejected": -261.4410095214844, + "loss": 4723.4434, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.006191161461174488, + "rewards/margins": 0.008649373427033424, + "rewards/rejected": -0.014840533025562763, + "step": 680 + }, + { + "debug/policy_chosen_logits": 1.1852375268936157, + "debug/policy_chosen_logps": -252.340087890625, + "debug/policy_rejected_logits": 1.4935983419418335, + "debug/policy_rejected_logps": -282.58343505859375, + "debug/reference_chosen_logps": -253.1710662841797, + "debug/reference_rejected_logps": -280.0072021484375, + "debug/sppo_chosen_loss": 2442.70654296875, + "debug/sppo_chosen_reward_in_loss": 0.8309797048568726, + "debug/sppo_rej_reward_in_loss": -2.5762057304382324, + "debug/sppo_reject_loss": 2296.81982421875, + "epoch": 2.4818840579710146, + "grad_norm": 63401.64185770447, + "learning_rate": 8.2171081983158e-08, + "logits/chosen": 1.1852375268936157, + "logits/rejected": 1.4935983419418335, + "logps/chosen": -252.340087890625, + "logps/rejected": -282.58343505859375, + "loss": 4596.534, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": 0.008309796452522278, + "rewards/margins": 0.03407185524702072, + "rewards/rejected": -0.025762056931853294, + "step": 685 + }, + { + "debug/policy_chosen_logits": 1.1588332653045654, + "debug/policy_chosen_logps": -248.93905639648438, + "debug/policy_rejected_logits": 1.4399282932281494, + "debug/policy_rejected_logps": -285.43927001953125, + "debug/reference_chosen_logps": -250.5298309326172, + "debug/reference_rejected_logps": -281.73443603515625, + "debug/sppo_chosen_loss": 2347.6015625, + "debug/sppo_chosen_reward_in_loss": 1.5907951593399048, + "debug/sppo_rej_reward_in_loss": -3.7047877311706543, + "debug/sppo_reject_loss": 2226.80322265625, + "epoch": 2.5, + "grad_norm": 64223.88240269286, + "learning_rate": 8.188497717564871e-08, + "logits/chosen": 1.1588332653045654, + "logits/rejected": 1.4399282932281494, + "logps/chosen": -248.93905639648438, + "logps/rejected": -285.43927001953125, + "loss": 4621.6918, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.015907950699329376, + "rewards/margins": 0.05295582860708237, + "rewards/rejected": -0.03704787790775299, + "step": 690 + }, + { + "debug/policy_chosen_logits": 1.1629136800765991, + "debug/policy_chosen_logps": -258.650390625, + "debug/policy_rejected_logits": 1.2538448572158813, + "debug/policy_rejected_logps": -269.53631591796875, + "debug/reference_chosen_logps": -258.3566589355469, + "debug/reference_rejected_logps": -263.8230285644531, + "debug/sppo_chosen_loss": 2563.53466796875, + "debug/sppo_chosen_reward_in_loss": -0.29371222853660583, + "debug/sppo_rej_reward_in_loss": -5.7132792472839355, + "debug/sppo_reject_loss": 2022.1617431640625, + "epoch": 2.5181159420289854, + "grad_norm": 58930.82852049117, + "learning_rate": 8.159710192231519e-08, + "logits/chosen": 1.1629136800765991, + "logits/rejected": 1.2538448572158813, + "logps/chosen": -258.650390625, + "logps/rejected": -269.53631591796875, + "loss": 4662.3734, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.0029371220152825117, + "rewards/margins": 0.05419566482305527, + "rewards/rejected": -0.057132791727781296, + "step": 695 + }, + { + "debug/policy_chosen_logits": 1.296156644821167, + "debug/policy_chosen_logps": -247.963134765625, + "debug/policy_rejected_logits": 1.5709506273269653, + "debug/policy_rejected_logps": -284.2712097167969, + "debug/reference_chosen_logps": -248.04214477539062, + "debug/reference_rejected_logps": -281.76080322265625, + "debug/sppo_chosen_loss": 2526.7412109375, + "debug/sppo_chosen_reward_in_loss": 0.0790136307477951, + "debug/sppo_rej_reward_in_loss": -2.510422706604004, + "debug/sppo_reject_loss": 2294.28271484375, + "epoch": 2.536231884057971, + "grad_norm": 83941.42671095916, + "learning_rate": 8.130747220772401e-08, + "logits/chosen": 1.296156644821167, + "logits/rejected": 1.5709506273269653, + "logps/chosen": -247.963134765625, + "logps/rejected": -284.2712097167969, + "loss": 4590.507, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.0007901365170255303, + "rewards/margins": 0.02589436247944832, + "rewards/rejected": -0.025104224681854248, + "step": 700 + }, + { + "epoch": 2.536231884057971, + "eval_debug/policy_chosen_logits": 1.480161190032959, + "eval_debug/policy_chosen_logps": -253.2023468017578, + "eval_debug/policy_rejected_logits": 1.5311214923858643, + "eval_debug/policy_rejected_logps": -262.8334655761719, + "eval_debug/reference_chosen_logps": -252.91845703125, + "eval_debug/reference_rejected_logps": -259.6585998535156, + "eval_debug/sppo_chosen_loss": 2563.16015625, + "eval_debug/sppo_chosen_reward_in_loss": -0.2838987112045288, + "eval_debug/sppo_rej_reward_in_loss": -3.1748383045196533, + "eval_debug/sppo_reject_loss": 2266.701904296875, + "eval_logits/chosen": 1.480161190032959, + "eval_logits/rejected": 1.5311214923858643, + "eval_logps/chosen": -253.2023468017578, + "eval_logps/rejected": -262.8334655761719, + "eval_loss": 4709.8720703125, + "eval_rewards/accuracies": 0.5657894611358643, + "eval_rewards/chosen": -0.0028389859944581985, + "eval_rewards/margins": 0.02890939824283123, + "eval_rewards/rejected": -0.03174838423728943, + "eval_runtime": 28.36, + "eval_samples_per_second": 21.157, + "eval_steps_per_second": 0.67, + "step": 700 + }, + { + "debug/policy_chosen_logits": 1.316695213317871, + "debug/policy_chosen_logps": -262.3095397949219, + "debug/policy_rejected_logits": 1.5722310543060303, + "debug/policy_rejected_logps": -308.7387390136719, + "debug/reference_chosen_logps": -263.1445007324219, + "debug/reference_rejected_logps": -301.7698059082031, + "debug/sppo_chosen_loss": 2425.18603515625, + "debug/sppo_chosen_reward_in_loss": 0.8349674344062805, + "debug/sppo_rej_reward_in_loss": -6.9689836502075195, + "debug/sppo_reject_loss": 1957.8179931640625, + "epoch": 2.5543478260869565, + "grad_norm": 67599.29085197397, + "learning_rate": 8.101610411385998e-08, + "logits/chosen": 1.316695213317871, + "logits/rejected": 1.5722310543060303, + "logps/chosen": -262.3095397949219, + "logps/rejected": -308.7387390136719, + "loss": 4607.5508, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": 0.008349673822522163, + "rewards/margins": 0.0780395045876503, + "rewards/rejected": -0.06968982517719269, + "step": 705 + }, + { + "debug/policy_chosen_logits": 1.2466450929641724, + "debug/policy_chosen_logps": -267.37567138671875, + "debug/policy_rejected_logits": 1.5063731670379639, + "debug/policy_rejected_logps": -330.72149658203125, + "debug/reference_chosen_logps": -267.941162109375, + "debug/reference_rejected_logps": -326.2596740722656, + "debug/sppo_chosen_loss": 2450.247314453125, + "debug/sppo_chosen_reward_in_loss": 0.5654850006103516, + "debug/sppo_rej_reward_in_loss": -4.461817264556885, + "debug/sppo_reject_loss": 2120.1513671875, + "epoch": 2.572463768115942, + "grad_norm": 64522.98009611156, + "learning_rate": 8.072301381923319e-08, + "logits/chosen": 1.2466450929641724, + "logits/rejected": 1.5063731670379639, + "logps/chosen": -267.37567138671875, + "logps/rejected": -330.72149658203125, + "loss": 4674.4805, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.005654850043356419, + "rewards/margins": 0.050273019820451736, + "rewards/rejected": -0.04461817070841789, + "step": 710 + }, + { + "debug/policy_chosen_logits": 0.9615011215209961, + "debug/policy_chosen_logps": -229.64877319335938, + "debug/policy_rejected_logits": 1.251741647720337, + "debug/policy_rejected_logps": -282.9585266113281, + "debug/reference_chosen_logps": -229.4971923828125, + "debug/reference_rejected_logps": -279.83587646484375, + "debug/sppo_chosen_loss": 2542.4091796875, + "debug/sppo_chosen_reward_in_loss": -0.1516149491071701, + "debug/sppo_rej_reward_in_loss": -3.1226649284362793, + "debug/sppo_reject_loss": 2246.38330078125, + "epoch": 2.5905797101449277, + "grad_norm": 77147.82344419556, + "learning_rate": 8.042821759798069e-08, + "logits/chosen": 0.9615011215209961, + "logits/rejected": 1.251741647720337, + "logps/chosen": -229.64877319335938, + "logps/rejected": -282.9585266113281, + "loss": 4721.9094, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -0.0015161499613896012, + "rewards/margins": 0.029710497707128525, + "rewards/rejected": -0.031226646155118942, + "step": 715 + }, + { + "debug/policy_chosen_logits": 1.0589511394500732, + "debug/policy_chosen_logps": -245.3362579345703, + "debug/policy_rejected_logits": 1.2744250297546387, + "debug/policy_rejected_logps": -274.80999755859375, + "debug/reference_chosen_logps": -246.52197265625, + "debug/reference_rejected_logps": -272.2623596191406, + "debug/sppo_chosen_loss": 2395.19580078125, + "debug/sppo_chosen_reward_in_loss": 1.1857404708862305, + "debug/sppo_rej_reward_in_loss": -2.547642707824707, + "debug/sppo_reject_loss": 2284.42041015625, + "epoch": 2.608695652173913, + "grad_norm": 68760.10675885266, + "learning_rate": 8.013173181896283e-08, + "logits/chosen": 1.0589511394500732, + "logits/rejected": 1.2744250297546387, + "logps/chosen": -245.3362579345703, + "logps/rejected": -274.80999755859375, + "loss": 4566.4594, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": 0.011857403442263603, + "rewards/margins": 0.03733383119106293, + "rewards/rejected": -0.025476425886154175, + "step": 720 + }, + { + "debug/policy_chosen_logits": 1.229744553565979, + "debug/policy_chosen_logps": -231.814208984375, + "debug/policy_rejected_logits": 1.8818788528442383, + "debug/policy_rejected_logps": -300.50482177734375, + "debug/reference_chosen_logps": -232.2402801513672, + "debug/reference_rejected_logps": -296.17901611328125, + "debug/sppo_chosen_loss": 2466.02392578125, + "debug/sppo_chosen_reward_in_loss": 0.42609596252441406, + "debug/sppo_rej_reward_in_loss": -4.325751304626465, + "debug/sppo_reject_loss": 2153.456298828125, + "epoch": 2.6268115942028984, + "grad_norm": 81272.2688889121, + "learning_rate": 7.983357294485438e-08, + "logits/chosen": 1.229744553565979, + "logits/rejected": 1.8818788528442383, + "logps/chosen": -231.814208984375, + "logps/rejected": -300.50482177734375, + "loss": 4581.5086, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": 0.004260959569364786, + "rewards/margins": 0.04751847684383392, + "rewards/rejected": -0.043257515877485275, + "step": 725 + }, + { + "debug/policy_chosen_logits": 1.1009639501571655, + "debug/policy_chosen_logps": -244.9521026611328, + "debug/policy_rejected_logits": 1.392866849899292, + "debug/policy_rejected_logps": -281.6346740722656, + "debug/reference_chosen_logps": -244.76611328125, + "debug/reference_rejected_logps": -277.59765625, + "debug/sppo_chosen_loss": 2543.88134765625, + "debug/sppo_chosen_reward_in_loss": -0.185984805226326, + "debug/sppo_rej_reward_in_loss": -4.037027359008789, + "debug/sppo_reject_loss": 2177.278076171875, + "epoch": 2.644927536231884, + "grad_norm": 66672.84499784847, + "learning_rate": 7.953375753123043e-08, + "logits/chosen": 1.1009639501571655, + "logits/rejected": 1.392866849899292, + "logps/chosen": -244.9521026611328, + "logps/rejected": -281.6346740722656, + "loss": 4614.0613, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -0.0018598471069708467, + "rewards/margins": 0.03851042687892914, + "rewards/rejected": -0.04037027060985565, + "step": 730 + }, + { + "debug/policy_chosen_logits": 1.2034881114959717, + "debug/policy_chosen_logps": -262.13726806640625, + "debug/policy_rejected_logits": 1.6349296569824219, + "debug/policy_rejected_logps": -300.8846740722656, + "debug/reference_chosen_logps": -262.77691650390625, + "debug/reference_rejected_logps": -295.9397277832031, + "debug/sppo_chosen_loss": 2470.29443359375, + "debug/sppo_chosen_reward_in_loss": 0.6396778225898743, + "debug/sppo_rej_reward_in_loss": -4.944947719573975, + "debug/sppo_reject_loss": 2114.920166015625, + "epoch": 2.6630434782608696, + "grad_norm": 59913.553582771856, + "learning_rate": 7.923230222564714e-08, + "logits/chosen": 1.2034881114959717, + "logits/rejected": 1.6349296569824219, + "logps/chosen": -262.13726806640625, + "logps/rejected": -300.8846740722656, + "loss": 4603.5797, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": 0.006396777927875519, + "rewards/margins": 0.05584625154733658, + "rewards/rejected": -0.04944947734475136, + "step": 735 + }, + { + "debug/policy_chosen_logits": 1.3139673471450806, + "debug/policy_chosen_logps": -275.12811279296875, + "debug/policy_rejected_logits": 1.1129451990127563, + "debug/policy_rejected_logps": -263.61767578125, + "debug/reference_chosen_logps": -274.85089111328125, + "debug/reference_rejected_logps": -262.1892395019531, + "debug/sppo_chosen_loss": 2561.744140625, + "debug/sppo_chosen_reward_in_loss": -0.27727144956588745, + "debug/sppo_rej_reward_in_loss": -1.4284439086914062, + "debug/sppo_reject_loss": 2384.283935546875, + "epoch": 2.681159420289855, + "grad_norm": 60968.86609888939, + "learning_rate": 7.892922376671725e-08, + "logits/chosen": 1.3139673471450806, + "logits/rejected": 1.1129451990127563, + "logps/chosen": -275.12811279296875, + "logps/rejected": -263.61767578125, + "loss": 4665.1484, + "rewards/accuracies": 0.574999988079071, + "rewards/chosen": -0.002772714477032423, + "rewards/margins": 0.011511723510921001, + "rewards/rejected": -0.014284437522292137, + "step": 740 + }, + { + "debug/policy_chosen_logits": 1.2646352052688599, + "debug/policy_chosen_logps": -249.6303253173828, + "debug/policy_rejected_logits": 1.2260851860046387, + "debug/policy_rejected_logps": -264.775390625, + "debug/reference_chosen_logps": -249.9891357421875, + "debug/reference_rejected_logps": -260.71539306640625, + "debug/sppo_chosen_loss": 2489.34375, + "debug/sppo_chosen_reward_in_loss": 0.35877054929733276, + "debug/sppo_rej_reward_in_loss": -4.0600104331970215, + "debug/sppo_reject_loss": 2168.15087890625, + "epoch": 2.699275362318841, + "grad_norm": 70768.43523511974, + "learning_rate": 7.862453898318082e-08, + "logits/chosen": 1.2646352052688599, + "logits/rejected": 1.2260851860046387, + "logps/chosen": -249.6303253173828, + "logps/rejected": -264.775390625, + "loss": 4640.3344, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": 0.003587705548852682, + "rewards/margins": 0.04418780654668808, + "rewards/rejected": -0.04060010612010956, + "step": 745 + }, + { + "debug/policy_chosen_logits": 1.0691126585006714, + "debug/policy_chosen_logps": -255.4532012939453, + "debug/policy_rejected_logits": 1.1531434059143066, + "debug/policy_rejected_logps": -261.6268005371094, + "debug/reference_chosen_logps": -257.0854797363281, + "debug/reference_rejected_logps": -259.08953857421875, + "debug/sppo_chosen_loss": 2347.645751953125, + "debug/sppo_chosen_reward_in_loss": 1.6322675943374634, + "debug/sppo_rej_reward_in_loss": -2.53729248046875, + "debug/sppo_reject_loss": 2319.710205078125, + "epoch": 2.717391304347826, + "grad_norm": 63375.54807649489, + "learning_rate": 7.83182647929707e-08, + "logits/chosen": 1.0691126585006714, + "logits/rejected": 1.1531434059143066, + "logps/chosen": -255.4532012939453, + "logps/rejected": -261.6268005371094, + "loss": 4591.8281, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": 0.016322676092386246, + "rewards/margins": 0.04169560223817825, + "rewards/rejected": -0.02537292242050171, + "step": 750 + }, + { + "debug/policy_chosen_logits": 1.1826550960540771, + "debug/policy_chosen_logps": -275.0767517089844, + "debug/policy_rejected_logits": 1.3011400699615479, + "debug/policy_rejected_logps": -286.65875244140625, + "debug/reference_chosen_logps": -274.70001220703125, + "debug/reference_rejected_logps": -281.486083984375, + "debug/sppo_chosen_loss": 2563.9345703125, + "debug/sppo_chosen_reward_in_loss": -0.3767387270927429, + "debug/sppo_rej_reward_in_loss": -5.172691822052002, + "debug/sppo_reject_loss": 2118.130126953125, + "epoch": 2.7355072463768115, + "grad_norm": 73693.41186252929, + "learning_rate": 7.801041820227318e-08, + "logits/chosen": 1.1826550960540771, + "logits/rejected": 1.3011400699615479, + "logps/chosen": -275.0767517089844, + "logps/rejected": -286.65875244140625, + "loss": 4697.5813, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -0.0037673874758183956, + "rewards/margins": 0.04795952886343002, + "rewards/rejected": -0.05172691494226456, + "step": 755 + }, + { + "debug/policy_chosen_logits": 1.2178585529327393, + "debug/policy_chosen_logps": -258.1893310546875, + "debug/policy_rejected_logits": 1.7031934261322021, + "debug/policy_rejected_logps": -302.24322509765625, + "debug/reference_chosen_logps": -258.6191711425781, + "debug/reference_rejected_logps": -298.0517883300781, + "debug/sppo_chosen_loss": 2476.55078125, + "debug/sppo_chosen_reward_in_loss": 0.4298551678657532, + "debug/sppo_rej_reward_in_loss": -4.191437244415283, + "debug/sppo_reject_loss": 2163.860107421875, + "epoch": 2.753623188405797, + "grad_norm": 122726.8457950013, + "learning_rate": 7.770101630458363e-08, + "logits/chosen": 1.2178585529327393, + "logits/rejected": 1.7031934261322021, + "logps/chosen": -258.1893310546875, + "logps/rejected": -302.24322509765625, + "loss": 4691.1711, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": 0.004298551939427853, + "rewards/margins": 0.046212922781705856, + "rewards/rejected": -0.04191437363624573, + "step": 760 + }, + { + "debug/policy_chosen_logits": 1.3344509601593018, + "debug/policy_chosen_logps": -262.7269592285156, + "debug/policy_rejected_logits": 1.428648591041565, + "debug/policy_rejected_logps": -281.42303466796875, + "debug/reference_chosen_logps": -263.6646728515625, + "debug/reference_rejected_logps": -276.64959716796875, + "debug/sppo_chosen_loss": 2433.05029296875, + "debug/sppo_chosen_reward_in_loss": 0.9377063512802124, + "debug/sppo_rej_reward_in_loss": -4.773464202880859, + "debug/sppo_reject_loss": 2114.662841796875, + "epoch": 2.7717391304347827, + "grad_norm": 70208.33518676949, + "learning_rate": 7.73900762797575e-08, + "logits/chosen": 1.3344509601593018, + "logits/rejected": 1.428648591041565, + "logps/chosen": -262.7269592285156, + "logps/rejected": -281.42303466796875, + "loss": 4616.6656, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": 0.009377063252031803, + "rewards/margins": 0.0571117028594017, + "rewards/rejected": -0.047734636813402176, + "step": 765 + }, + { + "debug/policy_chosen_logits": 1.4318565130233765, + "debug/policy_chosen_logps": -246.78271484375, + "debug/policy_rejected_logits": 1.9473499059677124, + "debug/policy_rejected_logps": -292.7750549316406, + "debug/reference_chosen_logps": -248.1265869140625, + "debug/reference_rejected_logps": -289.3108825683594, + "debug/sppo_chosen_loss": 2377.390869140625, + "debug/sppo_chosen_reward_in_loss": 1.3438713550567627, + "debug/sppo_rej_reward_in_loss": -3.4641430377960205, + "debug/sppo_reject_loss": 2215.130126953125, + "epoch": 2.789855072463768, + "grad_norm": 56045.35340217485, + "learning_rate": 7.707761539305629e-08, + "logits/chosen": 1.4318565130233765, + "logits/rejected": 1.9473499059677124, + "logps/chosen": -246.78271484375, + "logps/rejected": -292.7750549316406, + "loss": 4641.2305, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": 0.013438713736832142, + "rewards/margins": 0.048080142587423325, + "rewards/rejected": -0.03464142605662346, + "step": 770 + }, + { + "debug/policy_chosen_logits": 1.5560919046401978, + "debug/policy_chosen_logps": -253.90786743164062, + "debug/policy_rejected_logits": 1.8329626321792603, + "debug/policy_rejected_logps": -306.51739501953125, + "debug/reference_chosen_logps": -254.22415161132812, + "debug/reference_rejected_logps": -302.6522216796875, + "debug/sppo_chosen_loss": 2491.45654296875, + "debug/sppo_chosen_reward_in_loss": 0.3162704408168793, + "debug/sppo_rej_reward_in_loss": -3.8651795387268066, + "debug/sppo_reject_loss": 2216.763916015625, + "epoch": 2.807971014492754, + "grad_norm": 74056.59942683075, + "learning_rate": 7.676365099418883e-08, + "logits/chosen": 1.5560919046401978, + "logits/rejected": 1.8329626321792603, + "logps/chosen": -253.90786743164062, + "logps/rejected": -306.51739501953125, + "loss": 4730.9008, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": 0.0031627051066607237, + "rewards/margins": 0.041814498603343964, + "rewards/rejected": -0.03865179046988487, + "step": 775 + }, + { + "debug/policy_chosen_logits": 1.1131713390350342, + "debug/policy_chosen_logps": -242.6931610107422, + "debug/policy_rejected_logits": 1.5814272165298462, + "debug/policy_rejected_logps": -303.05401611328125, + "debug/reference_chosen_logps": -242.7500762939453, + "debug/reference_rejected_logps": -299.9582824707031, + "debug/sppo_chosen_loss": 2545.71142578125, + "debug/sppo_chosen_reward_in_loss": 0.05693111568689346, + "debug/sppo_rej_reward_in_loss": -3.095724105834961, + "debug/sppo_reject_loss": 2284.80517578125, + "epoch": 2.8260869565217392, + "grad_norm": 62252.28679752256, + "learning_rate": 7.644820051634812e-08, + "logits/chosen": 1.1131713390350342, + "logits/rejected": 1.5814272165298462, + "logps/chosen": -242.6931610107422, + "logps/rejected": -303.05401611328125, + "loss": 4659.9199, + "rewards/accuracies": 0.574999988079071, + "rewards/chosen": 0.0005693117855116725, + "rewards/margins": 0.03152655437588692, + "rewards/rejected": -0.030957240611314774, + "step": 780 + }, + { + "debug/policy_chosen_logits": 1.1171921491622925, + "debug/policy_chosen_logps": -248.11083984375, + "debug/policy_rejected_logits": 1.436684012413025, + "debug/policy_rejected_logps": -278.32916259765625, + "debug/reference_chosen_logps": -248.8385467529297, + "debug/reference_rejected_logps": -271.8460693359375, + "debug/sppo_chosen_loss": 2448.23193359375, + "debug/sppo_chosen_reward_in_loss": 0.7277113199234009, + "debug/sppo_rej_reward_in_loss": -6.483117580413818, + "debug/sppo_reject_loss": 2002.7955322265625, + "epoch": 2.8442028985507246, + "grad_norm": 65694.1491439372, + "learning_rate": 7.613128147524313e-08, + "logits/chosen": 1.1171921491622925, + "logits/rejected": 1.436684012413025, + "logps/chosen": -248.11083984375, + "logps/rejected": -278.32916259765625, + "loss": 4572.9023, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.007277113385498524, + "rewards/margins": 0.07210828363895416, + "rewards/rejected": -0.06483118236064911, + "step": 785 + }, + { + "debug/policy_chosen_logits": 1.31461501121521, + "debug/policy_chosen_logps": -246.47268676757812, + "debug/policy_rejected_logits": 1.7004801034927368, + "debug/policy_rejected_logps": -302.381103515625, + "debug/reference_chosen_logps": -246.872802734375, + "debug/reference_rejected_logps": -299.05181884765625, + "debug/sppo_chosen_loss": 2485.46142578125, + "debug/sppo_chosen_reward_in_loss": 0.40012186765670776, + "debug/sppo_rej_reward_in_loss": -3.329306125640869, + "debug/sppo_reject_loss": 2230.423583984375, + "epoch": 2.86231884057971, + "grad_norm": 78799.29136632854, + "learning_rate": 7.581291146812631e-08, + "logits/chosen": 1.31461501121521, + "logits/rejected": 1.7004801034927368, + "logps/chosen": -246.47268676757812, + "logps/rejected": -302.381103515625, + "loss": 4617.8523, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": 0.00400121882557869, + "rewards/margins": 0.03729427605867386, + "rewards/rejected": -0.03329306095838547, + "step": 790 + }, + { + "debug/policy_chosen_logits": 1.1879009008407593, + "debug/policy_chosen_logps": -256.074951171875, + "debug/policy_rejected_logits": 1.436812162399292, + "debug/policy_rejected_logps": -293.434814453125, + "debug/reference_chosen_logps": -256.1133117675781, + "debug/reference_rejected_logps": -289.4981994628906, + "debug/sppo_chosen_loss": 2532.99658203125, + "debug/sppo_chosen_reward_in_loss": 0.03835143893957138, + "debug/sppo_rej_reward_in_loss": -3.936647891998291, + "debug/sppo_reject_loss": 2198.875732421875, + "epoch": 2.880434782608696, + "grad_norm": 67427.30031192664, + "learning_rate": 7.549310817281647e-08, + "logits/chosen": 1.1879009008407593, + "logits/rejected": 1.436812162399292, + "logps/chosen": -256.074951171875, + "logps/rejected": -293.434814453125, + "loss": 4572.7508, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": 0.000383514619898051, + "rewards/margins": 0.03974999114871025, + "rewards/rejected": -0.039366476237773895, + "step": 795 + }, + { + "debug/policy_chosen_logits": 1.2889708280563354, + "debug/policy_chosen_logps": -275.5391540527344, + "debug/policy_rejected_logits": 1.4859613180160522, + "debug/policy_rejected_logps": -283.573486328125, + "debug/reference_chosen_logps": -275.1273193359375, + "debug/reference_rejected_logps": -279.24957275390625, + "debug/sppo_chosen_loss": 2589.01220703125, + "debug/sppo_chosen_reward_in_loss": -0.4118543565273285, + "debug/sppo_rej_reward_in_loss": -4.323914527893066, + "debug/sppo_reject_loss": 2165.736572265625, + "epoch": 2.898550724637681, + "grad_norm": 82038.69191055794, + "learning_rate": 7.517188934671725e-08, + "logits/chosen": 1.2889708280563354, + "logits/rejected": 1.4859613180160522, + "logps/chosen": -275.5391540527344, + "logps/rejected": -283.573486328125, + "loss": 4624.6344, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -0.004118544049561024, + "rewards/margins": 0.03912060707807541, + "rewards/rejected": -0.04323914647102356, + "step": 800 + }, + { + "epoch": 2.898550724637681, + "eval_debug/policy_chosen_logits": 1.4659922122955322, + "eval_debug/policy_chosen_logps": -253.1265106201172, + "eval_debug/policy_rejected_logits": 1.5167869329452515, + "eval_debug/policy_rejected_logps": -262.9391784667969, + "eval_debug/reference_chosen_logps": -252.91845703125, + "eval_debug/reference_rejected_logps": -259.6585998535156, + "eval_debug/sppo_chosen_loss": 2564.37353515625, + "eval_debug/sppo_chosen_reward_in_loss": -0.20804350078105927, + "eval_debug/sppo_rej_reward_in_loss": -3.2805538177490234, + "eval_debug/sppo_reject_loss": 2277.46337890625, + "eval_logits/chosen": 1.4659922122955322, + "eval_logits/rejected": 1.5167869329452515, + "eval_logps/chosen": -253.1265106201172, + "eval_logps/rejected": -262.9391784667969, + "eval_loss": 4685.78759765625, + "eval_rewards/accuracies": 0.6315789222717285, + "eval_rewards/chosen": -0.002080434700474143, + "eval_rewards/margins": 0.030725106596946716, + "eval_rewards/rejected": -0.032805539667606354, + "eval_runtime": 28.3849, + "eval_samples_per_second": 21.138, + "eval_steps_per_second": 0.669, + "step": 800 + }, + { + "debug/policy_chosen_logits": 1.221453070640564, + "debug/policy_chosen_logps": -251.9959259033203, + "debug/policy_rejected_logits": 1.6424691677093506, + "debug/policy_rejected_logps": -314.9390563964844, + "debug/reference_chosen_logps": -253.0550079345703, + "debug/reference_rejected_logps": -309.57537841796875, + "debug/sppo_chosen_loss": 2406.28515625, + "debug/sppo_chosen_reward_in_loss": 1.059086799621582, + "debug/sppo_rej_reward_in_loss": -5.363643646240234, + "debug/sppo_reject_loss": 2103.30810546875, + "epoch": 2.9166666666666665, + "grad_norm": 69167.85516381508, + "learning_rate": 7.484927282583103e-08, + "logits/chosen": 1.221453070640564, + "logits/rejected": 1.6424691677093506, + "logps/chosen": -251.9959259033203, + "logps/rejected": -314.9390563964844, + "loss": 4556.7664, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": 0.010590868070721626, + "rewards/margins": 0.06422730535268784, + "rewards/rejected": -0.05363643169403076, + "step": 805 + }, + { + "debug/policy_chosen_logits": 1.2087690830230713, + "debug/policy_chosen_logps": -248.7294464111328, + "debug/policy_rejected_logits": 1.498429536819458, + "debug/policy_rejected_logps": -316.4676818847656, + "debug/reference_chosen_logps": -249.0221710205078, + "debug/reference_rejected_logps": -309.80548095703125, + "debug/sppo_chosen_loss": 2494.62451171875, + "debug/sppo_chosen_reward_in_loss": 0.2927181124687195, + "debug/sppo_rej_reward_in_loss": -6.6621809005737305, + "debug/sppo_reject_loss": 1967.0299072265625, + "epoch": 2.9347826086956523, + "grad_norm": 68731.1622774303, + "learning_rate": 7.452527652376863e-08, + "logits/chosen": 1.2087690830230713, + "logits/rejected": 1.498429536819458, + "logps/chosen": -248.7294464111328, + "logps/rejected": -316.4676818847656, + "loss": 4657.1684, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.002927180379629135, + "rewards/margins": 0.06954899430274963, + "rewards/rejected": -0.0666218176484108, + "step": 810 + }, + { + "debug/policy_chosen_logits": 1.0130951404571533, + "debug/policy_chosen_logps": -237.8261260986328, + "debug/policy_rejected_logits": 1.4029724597930908, + "debug/policy_rejected_logps": -298.6962890625, + "debug/reference_chosen_logps": -238.4188232421875, + "debug/reference_rejected_logps": -296.5367126464844, + "debug/sppo_chosen_loss": 2480.473388671875, + "debug/sppo_chosen_reward_in_loss": 0.5927131772041321, + "debug/sppo_rej_reward_in_loss": -2.159576892852783, + "debug/sppo_reject_loss": 2338.80712890625, + "epoch": 2.9528985507246377, + "grad_norm": 73180.34487996876, + "learning_rate": 7.419991843075463e-08, + "logits/chosen": 1.0130951404571533, + "logits/rejected": 1.4029724597930908, + "logps/chosen": -237.8261260986328, + "logps/rejected": -298.6962890625, + "loss": 4632.7711, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": 0.005927131976932287, + "rewards/margins": 0.027522901073098183, + "rewards/rejected": -0.021595770493149757, + "step": 815 + }, + { + "debug/policy_chosen_logits": 1.3386638164520264, + "debug/policy_chosen_logps": -250.01718139648438, + "debug/policy_rejected_logits": 1.921316385269165, + "debug/policy_rejected_logps": -306.84710693359375, + "debug/reference_chosen_logps": -251.37051391601562, + "debug/reference_rejected_logps": -302.274658203125, + "debug/sppo_chosen_loss": 2387.706298828125, + "debug/sppo_chosen_reward_in_loss": 1.3533411026000977, + "debug/sppo_rej_reward_in_loss": -4.572475910186768, + "debug/sppo_reject_loss": 2148.51806640625, + "epoch": 2.971014492753623, + "grad_norm": 57116.2603627131, + "learning_rate": 7.387321661262844e-08, + "logits/chosen": 1.3386638164520264, + "logits/rejected": 1.921316385269165, + "logps/chosen": -250.01718139648438, + "logps/rejected": -306.84710693359375, + "loss": 4651.9684, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": 0.013533410616219044, + "rewards/margins": 0.059258170425891876, + "rewards/rejected": -0.04572475701570511, + "step": 820 + }, + { + "debug/policy_chosen_logits": 1.445495843887329, + "debug/policy_chosen_logps": -283.45599365234375, + "debug/policy_rejected_logits": 1.7299737930297852, + "debug/policy_rejected_logps": -301.90399169921875, + "debug/reference_chosen_logps": -284.3904724121094, + "debug/reference_rejected_logps": -295.550537109375, + "debug/sppo_chosen_loss": 2426.180419921875, + "debug/sppo_chosen_reward_in_loss": 0.9344981908798218, + "debug/sppo_rej_reward_in_loss": -6.353468894958496, + "debug/sppo_reject_loss": 2020.0521240234375, + "epoch": 2.9891304347826084, + "grad_norm": 65796.06205811449, + "learning_rate": 7.354518920984119e-08, + "logits/chosen": 1.445495843887329, + "logits/rejected": 1.7299737930297852, + "logps/chosen": -283.45599365234375, + "logps/rejected": -301.90399169921875, + "loss": 4561.118, + "rewards/accuracies": 0.8500000238418579, + "rewards/chosen": 0.009344981983304024, + "rewards/margins": 0.07287967205047607, + "rewards/rejected": -0.0635346919298172, + "step": 825 + }, + { + "debug/policy_chosen_logits": 1.2625454664230347, + "debug/policy_chosen_logps": -236.8692169189453, + "debug/policy_rejected_logits": 1.7302738428115845, + "debug/policy_rejected_logps": -306.16729736328125, + "debug/reference_chosen_logps": -238.1107177734375, + "debug/reference_rejected_logps": -302.297607421875, + "debug/sppo_chosen_loss": 2399.04638671875, + "debug/sppo_chosen_reward_in_loss": 1.2415053844451904, + "debug/sppo_rej_reward_in_loss": -3.869725465774536, + "debug/sppo_reject_loss": 2204.76416015625, + "epoch": 3.0072463768115942, + "grad_norm": 61193.65334425578, + "learning_rate": 7.32158544364484e-08, + "logits/chosen": 1.2625454664230347, + "logits/rejected": 1.7302738428115845, + "logps/chosen": -236.8692169189453, + "logps/rejected": -306.16729736328125, + "loss": 4634.2973, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": 0.012415053322911263, + "rewards/margins": 0.051112305372953415, + "rewards/rejected": -0.0386972539126873, + "step": 830 + }, + { + "debug/policy_chosen_logits": 1.3355305194854736, + "debug/policy_chosen_logps": -260.23663330078125, + "debug/policy_rejected_logits": 1.5196969509124756, + "debug/policy_rejected_logps": -283.5062255859375, + "debug/reference_chosen_logps": -261.3477478027344, + "debug/reference_rejected_logps": -279.7190856933594, + "debug/sppo_chosen_loss": 2415.00537109375, + "debug/sppo_chosen_reward_in_loss": 1.111120581626892, + "debug/sppo_rej_reward_in_loss": -3.787144184112549, + "debug/sppo_reject_loss": 2210.35009765625, + "epoch": 3.0253623188405796, + "grad_norm": 72850.47897855999, + "learning_rate": 7.28852305790987e-08, + "logits/chosen": 1.3355305194854736, + "logits/rejected": 1.5196969509124756, + "logps/chosen": -260.23663330078125, + "logps/rejected": -283.5062255859375, + "loss": 4518.6047, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": 0.011111205443739891, + "rewards/margins": 0.0489826463162899, + "rewards/rejected": -0.03787143900990486, + "step": 835 + }, + { + "debug/policy_chosen_logits": 1.5348484516143799, + "debug/policy_chosen_logps": -273.3497619628906, + "debug/policy_rejected_logits": 1.320349931716919, + "debug/policy_rejected_logps": -276.6268005371094, + "debug/reference_chosen_logps": -274.346923828125, + "debug/reference_rejected_logps": -274.3509216308594, + "debug/sppo_chosen_loss": 2448.82177734375, + "debug/sppo_chosen_reward_in_loss": 0.9972000122070312, + "debug/sppo_rej_reward_in_loss": -2.2758421897888184, + "debug/sppo_reject_loss": 2341.25048828125, + "epoch": 3.0434782608695654, + "grad_norm": 67740.2764421149, + "learning_rate": 7.255333599601847e-08, + "logits/chosen": 1.5348484516143799, + "logits/rejected": 1.320349931716919, + "logps/chosen": -273.3497619628906, + "logps/rejected": -276.6268005371094, + "loss": 4595.4688, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": 0.009972000494599342, + "rewards/margins": 0.03273041918873787, + "rewards/rejected": -0.022758418694138527, + "step": 840 + }, + { + "debug/policy_chosen_logits": 1.3700846433639526, + "debug/policy_chosen_logps": -254.62109375, + "debug/policy_rejected_logits": 1.505577802658081, + "debug/policy_rejected_logps": -268.71044921875, + "debug/reference_chosen_logps": -258.35516357421875, + "debug/reference_rejected_logps": -267.05230712890625, + "debug/sppo_chosen_loss": 2162.54052734375, + "debug/sppo_chosen_reward_in_loss": 3.7340476512908936, + "debug/sppo_rej_reward_in_loss": -1.6581627130508423, + "debug/sppo_reject_loss": 2429.60986328125, + "epoch": 3.0615942028985508, + "grad_norm": 70335.9560176472, + "learning_rate": 7.222018911599233e-08, + "logits/chosen": 1.3700846433639526, + "logits/rejected": 1.505577802658081, + "logps/chosen": -254.62109375, + "logps/rejected": -268.71044921875, + "loss": 4590.6805, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.03734047710895538, + "rewards/margins": 0.053922105580568314, + "rewards/rejected": -0.01658162660896778, + "step": 845 + }, + { + "debug/policy_chosen_logits": 1.1331188678741455, + "debug/policy_chosen_logps": -258.7958068847656, + "debug/policy_rejected_logits": 1.5421819686889648, + "debug/policy_rejected_logps": -287.38116455078125, + "debug/reference_chosen_logps": -262.4872131347656, + "debug/reference_rejected_logps": -282.98260498046875, + "debug/sppo_chosen_loss": 2155.307373046875, + "debug/sppo_chosen_reward_in_loss": 3.6914010047912598, + "debug/sppo_rej_reward_in_loss": -4.398567199707031, + "debug/sppo_reject_loss": 2190.74169921875, + "epoch": 3.079710144927536, + "grad_norm": 98115.78335956299, + "learning_rate": 7.188580843734004e-08, + "logits/chosen": 1.1331188678741455, + "logits/rejected": 1.5421819686889648, + "logps/chosen": -258.7958068847656, + "logps/rejected": -287.38116455078125, + "loss": 4541.2988, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": 0.03691400960087776, + "rewards/margins": 0.080899678170681, + "rewards/rejected": -0.043985672295093536, + "step": 850 + }, + { + "debug/policy_chosen_logits": 1.476320505142212, + "debug/policy_chosen_logps": -272.7628479003906, + "debug/policy_rejected_logits": 1.561833381652832, + "debug/policy_rejected_logps": -317.9288024902344, + "debug/reference_chosen_logps": -275.04705810546875, + "debug/reference_rejected_logps": -311.6878356933594, + "debug/sppo_chosen_loss": 2295.048583984375, + "debug/sppo_chosen_reward_in_loss": 2.2842469215393066, + "debug/sppo_rej_reward_in_loss": -6.240972995758057, + "debug/sppo_reject_loss": 2041.3408203125, + "epoch": 3.097826086956522, + "grad_norm": 65203.51672900081, + "learning_rate": 7.155021252688928e-08, + "logits/chosen": 1.476320505142212, + "logits/rejected": 1.561833381652832, + "logps/chosen": -272.7628479003906, + "logps/rejected": -317.9288024902344, + "loss": 4582.6109, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": 0.022842470556497574, + "rewards/margins": 0.08525218814611435, + "rewards/rejected": -0.06240972876548767, + "step": 855 + }, + { + "debug/policy_chosen_logits": 1.192882776260376, + "debug/policy_chosen_logps": -258.40386962890625, + "debug/policy_rejected_logits": 1.2437019348144531, + "debug/policy_rejected_logps": -273.90728759765625, + "debug/reference_chosen_logps": -260.32659912109375, + "debug/reference_rejected_logps": -270.2201232910156, + "debug/sppo_chosen_loss": 2330.452392578125, + "debug/sppo_chosen_reward_in_loss": 1.9227139949798584, + "debug/sppo_rej_reward_in_loss": -3.687180757522583, + "debug/sppo_reject_loss": 2217.678466796875, + "epoch": 3.1159420289855073, + "grad_norm": 70023.40665554293, + "learning_rate": 7.121342001894466e-08, + "logits/chosen": 1.192882776260376, + "logits/rejected": 1.2437019348144531, + "logps/chosen": -258.40386962890625, + "logps/rejected": -273.90728759765625, + "loss": 4706.7301, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": 0.01922714151442051, + "rewards/margins": 0.056098949164152145, + "rewards/rejected": -0.03687180578708649, + "step": 860 + }, + { + "debug/policy_chosen_logits": 1.283287525177002, + "debug/policy_chosen_logps": -259.46160888671875, + "debug/policy_rejected_logits": 1.6224002838134766, + "debug/policy_rejected_logps": -277.697998046875, + "debug/reference_chosen_logps": -260.6236877441406, + "debug/reference_rejected_logps": -272.70477294921875, + "debug/sppo_chosen_loss": 2419.24853515625, + "debug/sppo_chosen_reward_in_loss": 1.1620738506317139, + "debug/sppo_rej_reward_in_loss": -4.993208885192871, + "debug/sppo_reject_loss": 2130.47021484375, + "epoch": 3.1340579710144927, + "grad_norm": 65909.5441623718, + "learning_rate": 7.087544961425316e-08, + "logits/chosen": 1.283287525177002, + "logits/rejected": 1.6224002838134766, + "logps/chosen": -259.46160888671875, + "logps/rejected": -277.697998046875, + "loss": 4560.4773, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": 0.011620739474892616, + "rewards/margins": 0.061552830040454865, + "rewards/rejected": -0.0499320924282074, + "step": 865 + }, + { + "debug/policy_chosen_logits": 1.4900705814361572, + "debug/policy_chosen_logps": -271.69781494140625, + "debug/policy_rejected_logits": 1.7472995519638062, + "debug/policy_rejected_logps": -323.4566955566406, + "debug/reference_chosen_logps": -273.129150390625, + "debug/reference_rejected_logps": -319.109130859375, + "debug/sppo_chosen_loss": 2371.99560546875, + "debug/sppo_chosen_reward_in_loss": 1.4313232898712158, + "debug/sppo_rej_reward_in_loss": -4.347558498382568, + "debug/sppo_reject_loss": 2175.521728515625, + "epoch": 3.1521739130434785, + "grad_norm": 98449.00469359074, + "learning_rate": 7.05363200789656e-08, + "logits/chosen": 1.4900705814361572, + "logits/rejected": 1.7472995519638062, + "logps/chosen": -271.69781494140625, + "logps/rejected": -323.4566955566406, + "loss": 4573.332, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": 0.014313233084976673, + "rewards/margins": 0.05778881907463074, + "rewards/rejected": -0.04347558692097664, + "step": 870 + }, + { + "debug/policy_chosen_logits": 1.4324753284454346, + "debug/policy_chosen_logps": -273.22796630859375, + "debug/policy_rejected_logits": 1.4399211406707764, + "debug/policy_rejected_logps": -269.93475341796875, + "debug/reference_chosen_logps": -273.6427307128906, + "debug/reference_rejected_logps": -265.6277160644531, + "debug/sppo_chosen_loss": 2514.471435546875, + "debug/sppo_chosen_reward_in_loss": 0.41477125883102417, + "debug/sppo_rej_reward_in_loss": -4.3070831298828125, + "debug/sppo_reject_loss": 2171.365234375, + "epoch": 3.170289855072464, + "grad_norm": 62355.19727690433, + "learning_rate": 7.019605024359474e-08, + "logits/chosen": 1.4324753284454346, + "logits/rejected": 1.4399211406707764, + "logps/chosen": -273.22796630859375, + "logps/rejected": -269.93475341796875, + "loss": 4654.0688, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": 0.0041477130725979805, + "rewards/margins": 0.04721853882074356, + "rewards/rejected": -0.043070826679468155, + "step": 875 + }, + { + "debug/policy_chosen_logits": 1.3117127418518066, + "debug/policy_chosen_logps": -256.207763671875, + "debug/policy_rejected_logits": 1.6756629943847656, + "debug/policy_rejected_logps": -342.5567321777344, + "debug/reference_chosen_logps": -257.4051818847656, + "debug/reference_rejected_logps": -340.0152587890625, + "debug/sppo_chosen_loss": 2410.57080078125, + "debug/sppo_chosen_reward_in_loss": 1.1974289417266846, + "debug/sppo_rej_reward_in_loss": -2.541478157043457, + "debug/sppo_reject_loss": 2331.065185546875, + "epoch": 3.1884057971014492, + "grad_norm": 92334.78755239808, + "learning_rate": 6.98546590019697e-08, + "logits/chosen": 1.3117127418518066, + "logits/rejected": 1.6756629943847656, + "logps/chosen": -256.207763671875, + "logps/rejected": -342.5567321777344, + "loss": 4476.9602, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.011974288150668144, + "rewards/margins": 0.03738906979560852, + "rewards/rejected": -0.025414779782295227, + "step": 880 + }, + { + "debug/policy_chosen_logits": 0.9446808099746704, + "debug/policy_chosen_logps": -233.10348510742188, + "debug/policy_rejected_logits": 1.277573585510254, + "debug/policy_rejected_logps": -285.7528991699219, + "debug/reference_chosen_logps": -232.969482421875, + "debug/reference_rejected_logps": -279.6332702636719, + "debug/sppo_chosen_loss": 2578.521484375, + "debug/sppo_chosen_reward_in_loss": -0.13400498032569885, + "debug/sppo_rej_reward_in_loss": -6.119626045227051, + "debug/sppo_reject_loss": 2058.451171875, + "epoch": 3.2065217391304346, + "grad_norm": 101465.78772268088, + "learning_rate": 6.951216531018677e-08, + "logits/chosen": 0.9446808099746704, + "logits/rejected": 1.277573585510254, + "logps/chosen": -233.10348510742188, + "logps/rejected": -285.7528991699219, + "loss": 4723.9219, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -0.0013400499010458589, + "rewards/margins": 0.05985620617866516, + "rewards/rejected": -0.061196256428956985, + "step": 885 + }, + { + "debug/policy_chosen_logits": 1.2341248989105225, + "debug/policy_chosen_logps": -257.8463134765625, + "debug/policy_rejected_logits": 1.4800808429718018, + "debug/policy_rejected_logps": -290.77386474609375, + "debug/reference_chosen_logps": -256.938720703125, + "debug/reference_rejected_logps": -284.2198486328125, + "debug/sppo_chosen_loss": 2649.87548828125, + "debug/sppo_chosen_reward_in_loss": -0.9075664281845093, + "debug/sppo_rej_reward_in_loss": -6.554051399230957, + "debug/sppo_reject_loss": 1996.097900390625, + "epoch": 3.2246376811594204, + "grad_norm": 60303.7658421852, + "learning_rate": 6.91685881855569e-08, + "logits/chosen": 1.2341248989105225, + "logits/rejected": 1.4800808429718018, + "logps/chosen": -257.8463134765625, + "logps/rejected": -290.77386474609375, + "loss": 4511.9859, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -0.009075663983821869, + "rewards/margins": 0.056464843451976776, + "rewards/rejected": -0.06554051488637924, + "step": 890 + }, + { + "debug/policy_chosen_logits": 1.3582326173782349, + "debug/policy_chosen_logps": -245.34634399414062, + "debug/policy_rejected_logits": 1.5517940521240234, + "debug/policy_rejected_logps": -263.07659912109375, + "debug/reference_chosen_logps": -245.36166381835938, + "debug/reference_rejected_logps": -260.2655334472656, + "debug/sppo_chosen_loss": 2533.37841796875, + "debug/sppo_chosen_reward_in_loss": 0.015349959954619408, + "debug/sppo_rej_reward_in_loss": -2.811088800430298, + "debug/sppo_reject_loss": 2292.26513671875, + "epoch": 3.2427536231884058, + "grad_norm": 67855.48725470214, + "learning_rate": 6.882394670554983e-08, + "logits/chosen": 1.3582326173782349, + "logits/rejected": 1.5517940521240234, + "logps/chosen": -245.34634399414062, + "logps/rejected": -263.07659912109375, + "loss": 4732.8016, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": 0.00015349910245276988, + "rewards/margins": 0.02826438844203949, + "rewards/rejected": -0.028110885992646217, + "step": 895 + }, + { + "debug/policy_chosen_logits": 1.0621987581253052, + "debug/policy_chosen_logps": -241.30923461914062, + "debug/policy_rejected_logits": 1.5428647994995117, + "debug/policy_rejected_logps": -297.16314697265625, + "debug/reference_chosen_logps": -242.0436553955078, + "debug/reference_rejected_logps": -291.9529724121094, + "debug/sppo_chosen_loss": 2451.71728515625, + "debug/sppo_chosen_reward_in_loss": 0.7344198226928711, + "debug/sppo_rej_reward_in_loss": -5.210179328918457, + "debug/sppo_reject_loss": 2117.42138671875, + "epoch": 3.260869565217391, + "grad_norm": 66758.97499411159, + "learning_rate": 6.847826000673463e-08, + "logits/chosen": 1.0621987581253052, + "logits/rejected": 1.5428647994995117, + "logps/chosen": -241.30923461914062, + "logps/rejected": -297.16314697265625, + "loss": 4526.798, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": 0.007344198413193226, + "rewards/margins": 0.05944598838686943, + "rewards/rejected": -0.05210179090499878, + "step": 900 + }, + { + "epoch": 3.260869565217391, + "eval_debug/policy_chosen_logits": 1.4543341398239136, + "eval_debug/policy_chosen_logps": -253.01719665527344, + "eval_debug/policy_rejected_logits": 1.5043613910675049, + "eval_debug/policy_rejected_logps": -263.0449523925781, + "eval_debug/reference_chosen_logps": -252.91845703125, + "eval_debug/reference_rejected_logps": -259.6585998535156, + "eval_debug/sppo_chosen_loss": 2560.71923828125, + "eval_debug/sppo_chosen_reward_in_loss": -0.0987061932682991, + "eval_debug/sppo_rej_reward_in_loss": -3.386300802230835, + "eval_debug/sppo_reject_loss": 2277.551513671875, + "eval_logits/chosen": 1.4543341398239136, + "eval_logits/rejected": 1.5043613910675049, + "eval_logps/chosen": -253.01719665527344, + "eval_logps/rejected": -263.0449523925781, + "eval_loss": 4673.5791015625, + "eval_rewards/accuracies": 0.5921052694320679, + "eval_rewards/chosen": -0.000987062114290893, + "eval_rewards/margins": 0.032875943928956985, + "eval_rewards/rejected": -0.03386300429701805, + "eval_runtime": 28.4446, + "eval_samples_per_second": 21.094, + "eval_steps_per_second": 0.668, + "step": 900 + }, + { + "debug/policy_chosen_logits": 1.3216747045516968, + "debug/policy_chosen_logps": -273.6351318359375, + "debug/policy_rejected_logits": 1.6194353103637695, + "debug/policy_rejected_logps": -297.20819091796875, + "debug/reference_chosen_logps": -273.14617919921875, + "debug/reference_rejected_logps": -291.0046691894531, + "debug/sppo_chosen_loss": 2630.943359375, + "debug/sppo_chosen_reward_in_loss": -0.488912969827652, + "debug/sppo_rej_reward_in_loss": -6.203536033630371, + "debug/sppo_reject_loss": 2036.7193603515625, + "epoch": 3.278985507246377, + "grad_norm": 86282.95682551128, + "learning_rate": 6.813154728371727e-08, + "logits/chosen": 1.3216747045516968, + "logits/rejected": 1.6194353103637695, + "logps/chosen": -273.6351318359375, + "logps/rejected": -297.20819091796875, + "loss": 4647.6363, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -0.004889129661023617, + "rewards/margins": 0.05714622884988785, + "rewards/rejected": -0.06203535944223404, + "step": 905 + }, + { + "debug/policy_chosen_logits": 1.189056396484375, + "debug/policy_chosen_logps": -247.99783325195312, + "debug/policy_rejected_logits": 1.2917503118515015, + "debug/policy_rejected_logps": -281.1531982421875, + "debug/reference_chosen_logps": -248.80142211914062, + "debug/reference_rejected_logps": -276.1498107910156, + "debug/sppo_chosen_loss": 2473.15283203125, + "debug/sppo_chosen_reward_in_loss": 0.8035877346992493, + "debug/sppo_rej_reward_in_loss": -5.003408908843994, + "debug/sppo_reject_loss": 2130.677490234375, + "epoch": 3.2971014492753623, + "grad_norm": 85369.33504264854, + "learning_rate": 6.77838277880747e-08, + "logits/chosen": 1.189056396484375, + "logits/rejected": 1.2917503118515015, + "logps/chosen": -247.99783325195312, + "logps/rejected": -281.1531982421875, + "loss": 4709.9395, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.008035877719521523, + "rewards/margins": 0.05806996300816536, + "rewards/rejected": -0.050034087151288986, + "step": 910 + }, + { + "debug/policy_chosen_logits": 1.5871620178222656, + "debug/policy_chosen_logps": -265.612060546875, + "debug/policy_rejected_logits": 1.6904337406158447, + "debug/policy_rejected_logps": -276.8652648925781, + "debug/reference_chosen_logps": -264.2405700683594, + "debug/reference_rejected_logps": -274.06915283203125, + "debug/sppo_chosen_loss": 2715.077880859375, + "debug/sppo_chosen_reward_in_loss": -1.3715009689331055, + "debug/sppo_rej_reward_in_loss": -2.7960915565490723, + "debug/sppo_reject_loss": 2292.55712890625, + "epoch": 3.3152173913043477, + "grad_norm": 62428.39352075046, + "learning_rate": 6.743512082728601e-08, + "logits/chosen": 1.5871620178222656, + "logits/rejected": 1.6904337406158447, + "logps/chosen": -265.612060546875, + "logps/rejected": -276.8652648925781, + "loss": 4642.7937, + "rewards/accuracies": 0.5249999761581421, + "rewards/chosen": -0.013715009205043316, + "rewards/margins": 0.014245906844735146, + "rewards/rejected": -0.02796091139316559, + "step": 915 + }, + { + "debug/policy_chosen_logits": 1.2219802141189575, + "debug/policy_chosen_logps": -264.1886291503906, + "debug/policy_rejected_logits": 1.2222537994384766, + "debug/policy_rejected_logps": -267.58038330078125, + "debug/reference_chosen_logps": -265.7757263183594, + "debug/reference_rejected_logps": -265.7680358886719, + "debug/sppo_chosen_loss": 2358.83642578125, + "debug/sppo_chosen_reward_in_loss": 1.5870968103408813, + "debug/sppo_rej_reward_in_loss": -1.8123528957366943, + "debug/sppo_reject_loss": 2367.68115234375, + "epoch": 3.3333333333333335, + "grad_norm": 60585.62405711025, + "learning_rate": 6.708544576366023e-08, + "logits/chosen": 1.2219802141189575, + "logits/rejected": 1.2222537994384766, + "logps/chosen": -264.1886291503906, + "logps/rejected": -267.58038330078125, + "loss": 4629.0422, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.015870967879891396, + "rewards/margins": 0.03399449959397316, + "rewards/rejected": -0.018123529851436615, + "step": 920 + }, + { + "debug/policy_chosen_logits": 1.1675989627838135, + "debug/policy_chosen_logps": -278.4476013183594, + "debug/policy_rejected_logits": 1.2601947784423828, + "debug/policy_rejected_logps": -310.80718994140625, + "debug/reference_chosen_logps": -278.3129577636719, + "debug/reference_rejected_logps": -303.284423828125, + "debug/sppo_chosen_loss": 2571.292236328125, + "debug/sppo_chosen_reward_in_loss": -0.1346588134765625, + "debug/sppo_rej_reward_in_loss": -7.522784233093262, + "debug/sppo_reject_loss": 1986.8385009765625, + "epoch": 3.351449275362319, + "grad_norm": 65406.092156402694, + "learning_rate": 6.673482201326134e-08, + "logits/chosen": 1.1675989627838135, + "logits/rejected": 1.2601947784423828, + "logps/chosen": -278.4476013183594, + "logps/rejected": -310.80718994140625, + "loss": 4602.2395, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -0.001346587436273694, + "rewards/margins": 0.07388125360012054, + "rewards/rejected": -0.07522784918546677, + "step": 925 + }, + { + "debug/policy_chosen_logits": 1.0340900421142578, + "debug/policy_chosen_logps": -243.76431274414062, + "debug/policy_rejected_logits": 1.3360588550567627, + "debug/policy_rejected_logps": -282.24517822265625, + "debug/reference_chosen_logps": -244.71994018554688, + "debug/reference_rejected_logps": -276.67999267578125, + "debug/sppo_chosen_loss": 2420.22119140625, + "debug/sppo_chosen_reward_in_loss": 0.9556635022163391, + "debug/sppo_rej_reward_in_loss": -5.565131187438965, + "debug/sppo_reject_loss": 2063.99853515625, + "epoch": 3.369565217391304, + "grad_norm": 62121.8313624803, + "learning_rate": 6.638326904483011e-08, + "logits/chosen": 1.0340900421142578, + "logits/rejected": 1.3360588550567627, + "logps/chosen": -243.76431274414062, + "logps/rejected": -282.24517822265625, + "loss": 4595.0086, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": 0.009556634351611137, + "rewards/margins": 0.06520794332027435, + "rewards/rejected": -0.055651307106018066, + "step": 930 + }, + { + "debug/policy_chosen_logits": 1.0084972381591797, + "debug/policy_chosen_logps": -231.0150604248047, + "debug/policy_rejected_logits": 1.483633041381836, + "debug/policy_rejected_logps": -263.78753662109375, + "debug/reference_chosen_logps": -233.00662231445312, + "debug/reference_rejected_logps": -258.6335144042969, + "debug/sppo_chosen_loss": 2313.91064453125, + "debug/sppo_chosen_reward_in_loss": 1.9915668964385986, + "debug/sppo_rej_reward_in_loss": -5.154005527496338, + "debug/sppo_reject_loss": 2143.50732421875, + "epoch": 3.38768115942029, + "grad_norm": 58778.16609152914, + "learning_rate": 6.603080637870306e-08, + "logits/chosen": 1.0084972381591797, + "logits/rejected": 1.483633041381836, + "logps/chosen": -231.0150604248047, + "logps/rejected": -263.78753662109375, + "loss": 4569.0508, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.019915666431188583, + "rewards/margins": 0.07145573198795319, + "rewards/rejected": -0.05154005438089371, + "step": 935 + }, + { + "debug/policy_chosen_logits": 1.1337854862213135, + "debug/policy_chosen_logps": -245.05838012695312, + "debug/policy_rejected_logits": 1.2576329708099365, + "debug/policy_rejected_logps": -268.67169189453125, + "debug/reference_chosen_logps": -245.1829071044922, + "debug/reference_rejected_logps": -262.12872314453125, + "debug/sppo_chosen_loss": 2540.96142578125, + "debug/sppo_chosen_reward_in_loss": 0.12454567104578018, + "debug/sppo_rej_reward_in_loss": -6.542975425720215, + "debug/sppo_reject_loss": 1980.4417724609375, + "epoch": 3.4057971014492754, + "grad_norm": 79533.11373377292, + "learning_rate": 6.567745358572863e-08, + "logits/chosen": 1.1337854862213135, + "logits/rejected": 1.2576329708099365, + "logps/chosen": -245.05838012695312, + "logps/rejected": -268.67169189453125, + "loss": 4578.5586, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": 0.0012454565148800611, + "rewards/margins": 0.06667520850896835, + "rewards/rejected": -0.06542975455522537, + "step": 940 + }, + { + "debug/policy_chosen_logits": 1.131906270980835, + "debug/policy_chosen_logps": -226.1370849609375, + "debug/policy_rejected_logits": 1.4897301197052002, + "debug/policy_rejected_logps": -268.9652099609375, + "debug/reference_chosen_logps": -227.4475555419922, + "debug/reference_rejected_logps": -264.8956298828125, + "debug/sppo_chosen_loss": 2411.512451171875, + "debug/sppo_chosen_reward_in_loss": 1.3104562759399414, + "debug/sppo_rej_reward_in_loss": -4.069582939147949, + "debug/sppo_reject_loss": 2181.656982421875, + "epoch": 3.4239130434782608, + "grad_norm": 106805.97530068812, + "learning_rate": 6.532323028618045e-08, + "logits/chosen": 1.131906270980835, + "logits/rejected": 1.4897301197052002, + "logps/chosen": -226.1370849609375, + "logps/rejected": -268.9652099609375, + "loss": 4561.7008, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": 0.013104562647640705, + "rewards/margins": 0.053800392895936966, + "rewards/rejected": -0.04069582745432854, + "step": 945 + }, + { + "debug/policy_chosen_logits": 0.8874229192733765, + "debug/policy_chosen_logps": -253.674560546875, + "debug/policy_rejected_logits": 1.1931557655334473, + "debug/policy_rejected_logps": -277.47772216796875, + "debug/reference_chosen_logps": -252.0839385986328, + "debug/reference_rejected_logps": -270.8529052734375, + "debug/sppo_chosen_loss": 2753.93359375, + "debug/sppo_chosen_reward_in_loss": -1.5906407833099365, + "debug/sppo_rej_reward_in_loss": -6.624871253967285, + "debug/sppo_reject_loss": 1967.4906005859375, + "epoch": 3.4420289855072466, + "grad_norm": 71768.84768061835, + "learning_rate": 6.496815614866791e-08, + "logits/chosen": 0.8874229192733765, + "logits/rejected": 1.1931557655334473, + "logps/chosen": -253.674560546875, + "logps/rejected": -277.47772216796875, + "loss": 4537.0883, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -0.015906408429145813, + "rewards/margins": 0.05034229904413223, + "rewards/rejected": -0.06624870747327805, + "step": 950 + }, + { + "debug/policy_chosen_logits": 1.2168424129486084, + "debug/policy_chosen_logps": -248.585693359375, + "debug/policy_rejected_logits": 1.4745090007781982, + "debug/policy_rejected_logps": -255.3431854248047, + "debug/reference_chosen_logps": -250.8568572998047, + "debug/reference_rejected_logps": -252.77029418945312, + "debug/sppo_chosen_loss": 2288.59375, + "debug/sppo_chosen_reward_in_loss": 2.271167516708374, + "debug/sppo_rej_reward_in_loss": -2.5729286670684814, + "debug/sppo_reject_loss": 2316.78564453125, + "epoch": 3.460144927536232, + "grad_norm": 104690.3662571117, + "learning_rate": 6.461225088904402e-08, + "logits/chosen": 1.2168424129486084, + "logits/rejected": 1.4745090007781982, + "logps/chosen": -248.585693359375, + "logps/rejected": -255.3431854248047, + "loss": 4551.0578, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": 0.022711673751473427, + "rewards/margins": 0.04844096302986145, + "rewards/rejected": -0.025729287415742874, + "step": 955 + }, + { + "debug/policy_chosen_logits": 1.2385737895965576, + "debug/policy_chosen_logps": -241.7220458984375, + "debug/policy_rejected_logits": 1.4867292642593384, + "debug/policy_rejected_logps": -280.5006103515625, + "debug/reference_chosen_logps": -242.78359985351562, + "debug/reference_rejected_logps": -274.2628479003906, + "debug/sppo_chosen_loss": 2402.5537109375, + "debug/sppo_chosen_reward_in_loss": 1.0615535974502563, + "debug/sppo_rej_reward_in_loss": -6.237776756286621, + "debug/sppo_reject_loss": 2054.988525390625, + "epoch": 3.4782608695652173, + "grad_norm": 55443.77261586784, + "learning_rate": 6.425553426931074e-08, + "logits/chosen": 1.2385737895965576, + "logits/rejected": 1.4867292642593384, + "logps/chosen": -241.7220458984375, + "logps/rejected": -280.5006103515625, + "loss": 4659.8496, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": 0.010615535080432892, + "rewards/margins": 0.07299330085515976, + "rewards/rejected": -0.06237776204943657, + "step": 960 + }, + { + "debug/policy_chosen_logits": 1.3841091394424438, + "debug/policy_chosen_logps": -256.5181579589844, + "debug/policy_rejected_logits": 1.741140365600586, + "debug/policy_rejected_logps": -308.02252197265625, + "debug/reference_chosen_logps": -256.64630126953125, + "debug/reference_rejected_logps": -303.5486755371094, + "debug/sppo_chosen_loss": 2536.14306640625, + "debug/sppo_chosen_reward_in_loss": 0.12813511490821838, + "debug/sppo_rej_reward_in_loss": -4.473842144012451, + "debug/sppo_reject_loss": 2165.02587890625, + "epoch": 3.496376811594203, + "grad_norm": 67083.57000441544, + "learning_rate": 6.389802609652162e-08, + "logits/chosen": 1.3841091394424438, + "logits/rejected": 1.741140365600586, + "logps/chosen": -256.5181579589844, + "logps/rejected": -308.02252197265625, + "loss": 4516.9992, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": 0.001281349454075098, + "rewards/margins": 0.046019770205020905, + "rewards/rejected": -0.044738419353961945, + "step": 965 + }, + { + "debug/policy_chosen_logits": 1.2212345600128174, + "debug/policy_chosen_logps": -264.72930908203125, + "debug/policy_rejected_logits": 1.5062768459320068, + "debug/policy_rejected_logps": -296.52203369140625, + "debug/reference_chosen_logps": -266.0826721191406, + "debug/reference_rejected_logps": -290.98681640625, + "debug/sppo_chosen_loss": 2382.504150390625, + "debug/sppo_chosen_reward_in_loss": 1.3533518314361572, + "debug/sppo_rej_reward_in_loss": -5.535216331481934, + "debug/sppo_reject_loss": 2121.054443359375, + "epoch": 3.5144927536231885, + "grad_norm": 71337.47534161789, + "learning_rate": 6.353974622168195e-08, + "logits/chosen": 1.2212345600128174, + "logits/rejected": 1.5062768459320068, + "logps/chosen": -264.72930908203125, + "logps/rejected": -296.52203369140625, + "loss": 4735.7297, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.013533517718315125, + "rewards/margins": 0.0688856840133667, + "rewards/rejected": -0.05535217002034187, + "step": 970 + }, + { + "debug/policy_chosen_logits": 1.2449438571929932, + "debug/policy_chosen_logps": -274.74798583984375, + "debug/policy_rejected_logits": 1.4908430576324463, + "debug/policy_rejected_logps": -294.26043701171875, + "debug/reference_chosen_logps": -276.3829040527344, + "debug/reference_rejected_logps": -289.47418212890625, + "debug/sppo_chosen_loss": 2366.81787109375, + "debug/sppo_chosen_reward_in_loss": 1.6348743438720703, + "debug/sppo_rej_reward_in_loss": -4.786262035369873, + "debug/sppo_reject_loss": 2130.755615234375, + "epoch": 3.532608695652174, + "grad_norm": 81418.27012097696, + "learning_rate": 6.318071453864662e-08, + "logits/chosen": 1.2449438571929932, + "logits/rejected": 1.4908430576324463, + "logps/chosen": -274.74798583984375, + "logps/rejected": -294.26043701171875, + "loss": 4559.5645, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": 0.016348743811249733, + "rewards/margins": 0.06421136111021042, + "rewards/rejected": -0.047862619161605835, + "step": 975 + }, + { + "debug/policy_chosen_logits": 1.1797082424163818, + "debug/policy_chosen_logps": -226.591796875, + "debug/policy_rejected_logits": 1.7147998809814453, + "debug/policy_rejected_logps": -287.06842041015625, + "debug/reference_chosen_logps": -229.1777801513672, + "debug/reference_rejected_logps": -279.7626953125, + "debug/sppo_chosen_loss": 2254.087646484375, + "debug/sppo_chosen_reward_in_loss": 2.585974931716919, + "debug/sppo_rej_reward_in_loss": -7.305711269378662, + "debug/sppo_reject_loss": 1962.0999755859375, + "epoch": 3.550724637681159, + "grad_norm": 67774.91197865033, + "learning_rate": 6.282095098301539e-08, + "logits/chosen": 1.1797082424163818, + "logits/rejected": 1.7147998809814453, + "logps/chosen": -226.591796875, + "logps/rejected": -287.06842041015625, + "loss": 4500.5508, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.02585974894464016, + "rewards/margins": 0.09891685843467712, + "rewards/rejected": -0.07305711507797241, + "step": 980 + }, + { + "debug/policy_chosen_logits": 1.225523591041565, + "debug/policy_chosen_logps": -247.4475555419922, + "debug/policy_rejected_logits": 1.4108251333236694, + "debug/policy_rejected_logps": -291.97711181640625, + "debug/reference_chosen_logps": -249.59201049804688, + "debug/reference_rejected_logps": -285.00433349609375, + "debug/sppo_chosen_loss": 2312.104736328125, + "debug/sppo_chosen_reward_in_loss": 2.144468069076538, + "debug/sppo_rej_reward_in_loss": -6.972817897796631, + "debug/sppo_reject_loss": 1981.7515869140625, + "epoch": 3.568840579710145, + "grad_norm": 66697.2543743667, + "learning_rate": 6.246047553102603e-08, + "logits/chosen": 1.225523591041565, + "logits/rejected": 1.4108251333236694, + "logps/chosen": -247.4475555419922, + "logps/rejected": -291.97711181640625, + "loss": 4519.1367, + "rewards/accuracies": 0.8500000238418579, + "rewards/chosen": 0.02144468203186989, + "rewards/margins": 0.09117285907268524, + "rewards/rejected": -0.06972817331552505, + "step": 985 + }, + { + "debug/policy_chosen_logits": 1.4036672115325928, + "debug/policy_chosen_logps": -270.65557861328125, + "debug/policy_rejected_logits": 1.3850643634796143, + "debug/policy_rejected_logps": -292.19305419921875, + "debug/reference_chosen_logps": -272.51019287109375, + "debug/reference_rejected_logps": -290.306884765625, + "debug/sppo_chosen_loss": 2350.90185546875, + "debug/sppo_chosen_reward_in_loss": 1.854596734046936, + "debug/sppo_rej_reward_in_loss": -1.886178731918335, + "debug/sppo_reject_loss": 2385.397705078125, + "epoch": 3.5869565217391304, + "grad_norm": 87720.9428138047, + "learning_rate": 6.209930819844507e-08, + "logits/chosen": 1.4036672115325928, + "logits/rejected": 1.3850643634796143, + "logps/chosen": -270.65557861328125, + "logps/rejected": -292.19305419921875, + "loss": 4641.4867, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.0185459665954113, + "rewards/margins": 0.037407755851745605, + "rewards/rejected": -0.018861789256334305, + "step": 990 + }, + { + "debug/policy_chosen_logits": 1.102624535560608, + "debug/policy_chosen_logps": -255.6370849609375, + "debug/policy_rejected_logits": 1.6262718439102173, + "debug/policy_rejected_logps": -312.22467041015625, + "debug/reference_chosen_logps": -255.8466339111328, + "debug/reference_rejected_logps": -307.6287841796875, + "debug/sppo_chosen_loss": 2506.376708984375, + "debug/sppo_chosen_reward_in_loss": 0.20953139662742615, + "debug/sppo_rej_reward_in_loss": -4.5958662033081055, + "debug/sppo_reject_loss": 2155.5029296875, + "epoch": 3.605072463768116, + "grad_norm": 79109.53179825464, + "learning_rate": 6.173746903945638e-08, + "logits/chosen": 1.102624535560608, + "logits/rejected": 1.6262718439102173, + "logps/chosen": -255.6370849609375, + "logps/rejected": -312.22467041015625, + "loss": 4548.2949, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": 0.002095314208418131, + "rewards/margins": 0.04805397614836693, + "rewards/rejected": -0.045958660542964935, + "step": 995 + }, + { + "debug/policy_chosen_logits": 1.402718186378479, + "debug/policy_chosen_logps": -263.24212646484375, + "debug/policy_rejected_logits": 1.5921787023544312, + "debug/policy_rejected_logps": -289.51226806640625, + "debug/reference_chosen_logps": -265.0359802246094, + "debug/reference_rejected_logps": -285.32977294921875, + "debug/sppo_chosen_loss": 2352.199462890625, + "debug/sppo_chosen_reward_in_loss": 1.7938053607940674, + "debug/sppo_rej_reward_in_loss": -4.182505130767822, + "debug/sppo_reject_loss": 2218.07470703125, + "epoch": 3.6231884057971016, + "grad_norm": 64425.89693535943, + "learning_rate": 6.137497814554771e-08, + "logits/chosen": 1.402718186378479, + "logits/rejected": 1.5921787023544312, + "logps/chosen": -263.24212646484375, + "logps/rejected": -289.51226806640625, + "loss": 4599.7109, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": 0.017938053235411644, + "rewards/margins": 0.059763096272945404, + "rewards/rejected": -0.04182504862546921, + "step": 1000 + }, + { + "epoch": 3.6231884057971016, + "eval_debug/policy_chosen_logits": 1.4479906558990479, + "eval_debug/policy_chosen_logps": -252.73809814453125, + "eval_debug/policy_rejected_logits": 1.4972885847091675, + "eval_debug/policy_rejected_logps": -262.917236328125, + "eval_debug/reference_chosen_logps": -252.91845703125, + "eval_debug/reference_rejected_logps": -259.6585998535156, + "eval_debug/sppo_chosen_loss": 2535.936767578125, + "eval_debug/sppo_chosen_reward_in_loss": 0.18038207292556763, + "eval_debug/sppo_rej_reward_in_loss": -3.2586004734039307, + "eval_debug/sppo_reject_loss": 2302.096923828125, + "eval_logits/chosen": 1.4479906558990479, + "eval_logits/rejected": 1.4972885847091675, + "eval_logps/chosen": -252.73809814453125, + "eval_logps/rejected": -262.917236328125, + "eval_loss": 4664.81689453125, + "eval_rewards/accuracies": 0.5657894611358643, + "eval_rewards/chosen": 0.0018038200214505196, + "eval_rewards/margins": 0.03438982367515564, + "eval_rewards/rejected": -0.032586004585027695, + "eval_runtime": 28.5745, + "eval_samples_per_second": 20.998, + "eval_steps_per_second": 0.665, + "step": 1000 + }, + { + "debug/policy_chosen_logits": 1.068411111831665, + "debug/policy_chosen_logps": -243.59439086914062, + "debug/policy_rejected_logits": 1.6734635829925537, + "debug/policy_rejected_logps": -307.09027099609375, + "debug/reference_chosen_logps": -243.37484741210938, + "debug/reference_rejected_logps": -299.6080017089844, + "debug/sppo_chosen_loss": 2582.09765625, + "debug/sppo_chosen_reward_in_loss": -0.21955490112304688, + "debug/sppo_rej_reward_in_loss": -7.482234001159668, + "debug/sppo_reject_loss": 2003.4375, + "epoch": 3.641304347826087, + "grad_norm": 72879.08819563665, + "learning_rate": 6.101185564439507e-08, + "logits/chosen": 1.068411111831665, + "logits/rejected": 1.6734635829925537, + "logps/chosen": -243.59439086914062, + "logps/rejected": -307.09027099609375, + "loss": 4494.1641, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -0.0021955487318336964, + "rewards/margins": 0.07262678444385529, + "rewards/rejected": -0.074822336435318, + "step": 1005 + }, + { + "debug/policy_chosen_logits": 1.1344451904296875, + "debug/policy_chosen_logps": -240.4451446533203, + "debug/policy_rejected_logits": 1.7183849811553955, + "debug/policy_rejected_logps": -306.75244140625, + "debug/reference_chosen_logps": -241.87393188476562, + "debug/reference_rejected_logps": -300.93377685546875, + "debug/sppo_chosen_loss": 2374.83251953125, + "debug/sppo_chosen_reward_in_loss": 1.4287716150283813, + "debug/sppo_rej_reward_in_loss": -5.818660259246826, + "debug/sppo_reject_loss": 2069.93798828125, + "epoch": 3.6594202898550723, + "grad_norm": 58992.621556186234, + "learning_rate": 6.064812169874505e-08, + "logits/chosen": 1.1344451904296875, + "logits/rejected": 1.7183849811553955, + "logps/chosen": -240.4451446533203, + "logps/rejected": -306.75244140625, + "loss": 4562.4156, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": 0.014287715777754784, + "rewards/margins": 0.07247431576251984, + "rewards/rejected": -0.0581866018474102, + "step": 1010 + }, + { + "debug/policy_chosen_logits": 1.4492950439453125, + "debug/policy_chosen_logps": -258.4239807128906, + "debug/policy_rejected_logits": 1.282820224761963, + "debug/policy_rejected_logps": -271.81219482421875, + "debug/reference_chosen_logps": -259.12908935546875, + "debug/reference_rejected_logps": -265.5401611328125, + "debug/sppo_chosen_loss": 2455.122314453125, + "debug/sppo_chosen_reward_in_loss": 0.7051193118095398, + "debug/sppo_rej_reward_in_loss": -6.272065162658691, + "debug/sppo_reject_loss": 2034.664306640625, + "epoch": 3.677536231884058, + "grad_norm": 77077.84401835203, + "learning_rate": 6.028379650529536e-08, + "logits/chosen": 1.4492950439453125, + "logits/rejected": 1.282820224761963, + "logps/chosen": -258.4239807128906, + "logps/rejected": -271.81219482421875, + "loss": 4588.6488, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": 0.007051193621009588, + "rewards/margins": 0.06977184116840363, + "rewards/rejected": -0.0627206414937973, + "step": 1015 + }, + { + "debug/policy_chosen_logits": 0.9079713821411133, + "debug/policy_chosen_logps": -240.22738647460938, + "debug/policy_rejected_logits": 1.334970474243164, + "debug/policy_rejected_logps": -296.3286437988281, + "debug/reference_chosen_logps": -241.02230834960938, + "debug/reference_rejected_logps": -289.7048034667969, + "debug/sppo_chosen_loss": 2454.479736328125, + "debug/sppo_chosen_reward_in_loss": 0.7949361801147461, + "debug/sppo_rej_reward_in_loss": -6.62381649017334, + "debug/sppo_reject_loss": 2016.115966796875, + "epoch": 3.6956521739130435, + "grad_norm": 61592.09809646041, + "learning_rate": 5.991890029357334e-08, + "logits/chosen": 0.9079713821411133, + "logits/rejected": 1.334970474243164, + "logps/chosen": -240.22738647460938, + "logps/rejected": -296.3286437988281, + "loss": 4505.7461, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.007949361577630043, + "rewards/margins": 0.07418752461671829, + "rewards/rejected": -0.0662381649017334, + "step": 1020 + }, + { + "debug/policy_chosen_logits": 1.4035258293151855, + "debug/policy_chosen_logps": -254.43551635742188, + "debug/policy_rejected_logits": 1.8539726734161377, + "debug/policy_rejected_logps": -316.92572021484375, + "debug/reference_chosen_logps": -256.17486572265625, + "debug/reference_rejected_logps": -310.9232482910156, + "debug/sppo_chosen_loss": 2340.42724609375, + "debug/sppo_chosen_reward_in_loss": 1.739314317703247, + "debug/sppo_rej_reward_in_loss": -6.002488613128662, + "debug/sppo_reject_loss": 2059.16162109375, + "epoch": 3.713768115942029, + "grad_norm": 64473.59022347943, + "learning_rate": 5.9553453324812716e-08, + "logits/chosen": 1.4035258293151855, + "logits/rejected": 1.8539726734161377, + "logps/chosen": -254.43551635742188, + "logps/rejected": -316.92572021484375, + "loss": 4521.1812, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": 0.017393141984939575, + "rewards/margins": 0.07741802930831909, + "rewards/rejected": -0.06002488732337952, + "step": 1025 + }, + { + "debug/policy_chosen_logits": 1.2367794513702393, + "debug/policy_chosen_logps": -276.3914489746094, + "debug/policy_rejected_logits": 1.4828026294708252, + "debug/policy_rejected_logps": -286.67523193359375, + "debug/reference_chosen_logps": -277.847412109375, + "debug/reference_rejected_logps": -283.011962890625, + "debug/sppo_chosen_loss": 2384.060791015625, + "debug/sppo_chosen_reward_in_loss": 1.4559547901153564, + "debug/sppo_rej_reward_in_loss": -3.6632473468780518, + "debug/sppo_reject_loss": 2214.15087890625, + "epoch": 3.7318840579710146, + "grad_norm": 73969.22297967359, + "learning_rate": 5.918747589082852e-08, + "logits/chosen": 1.2367794513702393, + "logits/rejected": 1.4828026294708252, + "logps/chosen": -276.3914489746094, + "logps/rejected": -286.67523193359375, + "loss": 4686.9672, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": 0.014559546485543251, + "rewards/margins": 0.0511920228600502, + "rewards/rejected": -0.0366324707865715, + "step": 1030 + }, + { + "debug/policy_chosen_logits": 1.0385093688964844, + "debug/policy_chosen_logps": -269.85687255859375, + "debug/policy_rejected_logits": 1.1317791938781738, + "debug/policy_rejected_logps": -279.90869140625, + "debug/reference_chosen_logps": -271.19110107421875, + "debug/reference_rejected_logps": -272.58306884765625, + "debug/sppo_chosen_loss": 2418.24658203125, + "debug/sppo_chosen_reward_in_loss": 1.3342158794403076, + "debug/sppo_rej_reward_in_loss": -7.325618743896484, + "debug/sppo_reject_loss": 1939.541748046875, + "epoch": 3.75, + "grad_norm": 61579.2486295793, + "learning_rate": 5.882098831289043e-08, + "logits/chosen": 1.0385093688964844, + "logits/rejected": 1.1317791938781738, + "logps/chosen": -269.85687255859375, + "logps/rejected": -279.90869140625, + "loss": 4513.3438, + "rewards/accuracies": 0.824999988079071, + "rewards/chosen": 0.013342161662876606, + "rewards/margins": 0.08659834414720535, + "rewards/rejected": -0.07325618714094162, + "step": 1035 + }, + { + "debug/policy_chosen_logits": 1.7431812286376953, + "debug/policy_chosen_logps": -261.0448913574219, + "debug/policy_rejected_logits": 2.046821117401123, + "debug/policy_rejected_logps": -301.9384460449219, + "debug/reference_chosen_logps": -260.8301696777344, + "debug/reference_rejected_logps": -295.55316162109375, + "debug/sppo_chosen_loss": 2600.938232421875, + "debug/sppo_chosen_reward_in_loss": -0.2147224396467209, + "debug/sppo_rej_reward_in_loss": -6.385306358337402, + "debug/sppo_reject_loss": 2045.309326171875, + "epoch": 3.7681159420289854, + "grad_norm": 60098.91869574073, + "learning_rate": 5.845401094059438e-08, + "logits/chosen": 1.7431812286376953, + "logits/rejected": 2.046821117401123, + "logps/chosen": -261.0448913574219, + "logps/rejected": -301.9384460449219, + "loss": 4533.4078, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -0.002147223800420761, + "rewards/margins": 0.06170583888888359, + "rewards/rejected": -0.06385305523872375, + "step": 1040 + }, + { + "debug/policy_chosen_logits": 1.536130666732788, + "debug/policy_chosen_logps": -244.9329376220703, + "debug/policy_rejected_logits": 1.77217698097229, + "debug/policy_rejected_logps": -294.2804260253906, + "debug/reference_chosen_logps": -246.18161010742188, + "debug/reference_rejected_logps": -288.33087158203125, + "debug/sppo_chosen_loss": 2424.373779296875, + "debug/sppo_chosen_reward_in_loss": 1.2486801147460938, + "debug/sppo_rej_reward_in_loss": -5.949560642242432, + "debug/sppo_reject_loss": 2032.7587890625, + "epoch": 3.786231884057971, + "grad_norm": 69585.66644369718, + "learning_rate": 5.808656415073263e-08, + "logits/chosen": 1.536130666732788, + "logits/rejected": 1.77217698097229, + "logps/chosen": -244.9329376220703, + "logps/rejected": -294.2804260253906, + "loss": 4555.6328, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": 0.012486802414059639, + "rewards/margins": 0.07198240607976913, + "rewards/rejected": -0.059495605528354645, + "step": 1045 + }, + { + "debug/policy_chosen_logits": 1.2508312463760376, + "debug/policy_chosen_logps": -263.029052734375, + "debug/policy_rejected_logits": 1.6683003902435303, + "debug/policy_rejected_logps": -312.407958984375, + "debug/reference_chosen_logps": -265.32025146484375, + "debug/reference_rejected_logps": -303.7322998046875, + "debug/sppo_chosen_loss": 2288.44775390625, + "debug/sppo_chosen_reward_in_loss": 2.291184663772583, + "debug/sppo_rej_reward_in_loss": -8.675673484802246, + "debug/sppo_reject_loss": 1858.896240234375, + "epoch": 3.8043478260869565, + "grad_norm": 127296.46824033877, + "learning_rate": 5.7718668346162357e-08, + "logits/chosen": 1.2508312463760376, + "logits/rejected": 1.6683003902435303, + "logps/chosen": -263.029052734375, + "logps/rejected": -312.407958984375, + "loss": 4507.2723, + "rewards/accuracies": 0.824999988079071, + "rewards/chosen": 0.02291184663772583, + "rewards/margins": 0.10966857522726059, + "rewards/rejected": -0.08675673604011536, + "step": 1050 + }, + { + "debug/policy_chosen_logits": 1.4180892705917358, + "debug/policy_chosen_logps": -273.88018798828125, + "debug/policy_rejected_logits": 1.6986258029937744, + "debug/policy_rejected_logps": -292.30291748046875, + "debug/reference_chosen_logps": -276.7886962890625, + "debug/reference_rejected_logps": -286.98443603515625, + "debug/sppo_chosen_loss": 2226.932861328125, + "debug/sppo_chosen_reward_in_loss": 2.9084973335266113, + "debug/sppo_rej_reward_in_loss": -5.318492412567139, + "debug/sppo_reject_loss": 2071.72412109375, + "epoch": 3.822463768115942, + "grad_norm": 108931.73433353592, + "learning_rate": 5.735034395467271e-08, + "logits/chosen": 1.4180892705917358, + "logits/rejected": 1.6986258029937744, + "logps/chosen": -273.88018798828125, + "logps/rejected": -292.30291748046875, + "loss": 4483.5527, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": 0.02908497489988804, + "rewards/margins": 0.08226989209651947, + "rewards/rejected": -0.05318492650985718, + "step": 1055 + }, + { + "debug/policy_chosen_logits": 1.3309862613677979, + "debug/policy_chosen_logps": -267.27777099609375, + "debug/policy_rejected_logits": 1.5425870418548584, + "debug/policy_rejected_logps": -288.1453552246094, + "debug/reference_chosen_logps": -266.3821716308594, + "debug/reference_rejected_logps": -285.1313171386719, + "debug/sppo_chosen_loss": 2662.57666015625, + "debug/sppo_chosen_reward_in_loss": -0.8955985903739929, + "debug/sppo_rej_reward_in_loss": -3.014031171798706, + "debug/sppo_reject_loss": 2292.829345703125, + "epoch": 3.8405797101449277, + "grad_norm": 95367.0531697411, + "learning_rate": 5.698161142785058e-08, + "logits/chosen": 1.3309862613677979, + "logits/rejected": 1.5425870418548584, + "logps/chosen": -267.27777099609375, + "logps/rejected": -288.1453552246094, + "loss": 4694.8043, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -0.008955985307693481, + "rewards/margins": 0.021184323355555534, + "rewards/rejected": -0.030140310525894165, + "step": 1060 + }, + { + "debug/policy_chosen_logits": 1.4495620727539062, + "debug/policy_chosen_logps": -266.0342712402344, + "debug/policy_rejected_logits": 1.4949467182159424, + "debug/policy_rejected_logps": -267.76898193359375, + "debug/reference_chosen_logps": -266.792236328125, + "debug/reference_rejected_logps": -261.2626953125, + "debug/sppo_chosen_loss": 2474.59521484375, + "debug/sppo_chosen_reward_in_loss": 0.7579633593559265, + "debug/sppo_rej_reward_in_loss": -6.50632381439209, + "debug/sppo_reject_loss": 2022.4810791015625, + "epoch": 3.858695652173913, + "grad_norm": 61935.53835577618, + "learning_rate": 5.661249123994495e-08, + "logits/chosen": 1.4495620727539062, + "logits/rejected": 1.4949467182159424, + "logps/chosen": -266.0342712402344, + "logps/rejected": -267.76898193359375, + "loss": 4553.7563, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": 0.007579633500427008, + "rewards/margins": 0.07264287769794464, + "rewards/rejected": -0.0650632381439209, + "step": 1065 + }, + { + "debug/policy_chosen_logits": 1.2382694482803345, + "debug/policy_chosen_logps": -242.1920623779297, + "debug/policy_rejected_logits": 1.4644126892089844, + "debug/policy_rejected_logps": -260.42327880859375, + "debug/reference_chosen_logps": -243.15463256835938, + "debug/reference_rejected_logps": -256.77166748046875, + "debug/sppo_chosen_loss": 2455.119140625, + "debug/sppo_chosen_reward_in_loss": 0.9625652432441711, + "debug/sppo_rej_reward_in_loss": -3.651651382446289, + "debug/sppo_reject_loss": 2244.75634765625, + "epoch": 3.8768115942028984, + "grad_norm": 62560.529926078765, + "learning_rate": 5.624300388673012e-08, + "logits/chosen": 1.2382694482803345, + "logits/rejected": 1.4644126892089844, + "logps/chosen": -242.1920623779297, + "logps/rejected": -260.42327880859375, + "loss": 4614.7484, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": 0.009625652804970741, + "rewards/margins": 0.04614216461777687, + "rewards/rejected": -0.03651650995016098, + "step": 1070 + }, + { + "debug/policy_chosen_logits": 1.057923674583435, + "debug/policy_chosen_logps": -227.30722045898438, + "debug/policy_rejected_logits": 1.3531568050384521, + "debug/policy_rejected_logps": -271.60675048828125, + "debug/reference_chosen_logps": -231.5152130126953, + "debug/reference_rejected_logps": -270.4071350097656, + "debug/sppo_chosen_loss": 2120.608154296875, + "debug/sppo_chosen_reward_in_loss": 4.2079925537109375, + "debug/sppo_rej_reward_in_loss": -1.1996062994003296, + "debug/sppo_reject_loss": 2496.496337890625, + "epoch": 3.894927536231884, + "grad_norm": 212962.5914278675, + "learning_rate": 5.5873169884367596e-08, + "logits/chosen": 1.057923674583435, + "logits/rejected": 1.3531568050384521, + "logps/chosen": -227.30722045898438, + "logps/rejected": -271.60675048828125, + "loss": 4589.2344, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": 0.042079925537109375, + "rewards/margins": 0.05407598614692688, + "rewards/rejected": -0.011996065266430378, + "step": 1075 + }, + { + "debug/policy_chosen_logits": 1.2656968832015991, + "debug/policy_chosen_logps": -228.71029663085938, + "debug/policy_rejected_logits": 1.6959224939346313, + "debug/policy_rejected_logps": -311.6900939941406, + "debug/reference_chosen_logps": -238.9965362548828, + "debug/reference_rejected_logps": -321.0993957519531, + "debug/sppo_chosen_loss": 1639.9117431640625, + "debug/sppo_chosen_reward_in_loss": 10.28625202178955, + "debug/sppo_rej_reward_in_loss": 9.409296035766602, + "debug/sppo_reject_loss": 3562.62158203125, + "epoch": 3.9130434782608696, + "grad_norm": 134964.29261057146, + "learning_rate": 5.550300976826696e-08, + "logits/chosen": 1.2656968832015991, + "logits/rejected": 1.6959224939346313, + "logps/chosen": -228.71029663085938, + "logps/rejected": -311.6900939941406, + "loss": 5403.2051, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": 0.10286252200603485, + "rewards/margins": 0.008769561536610126, + "rewards/rejected": 0.094092957675457, + "step": 1080 + }, + { + "debug/policy_chosen_logits": 1.2611393928527832, + "debug/policy_chosen_logps": -254.24649047851562, + "debug/policy_rejected_logits": 1.7364327907562256, + "debug/policy_rejected_logps": -283.56488037109375, + "debug/reference_chosen_logps": -259.33831787109375, + "debug/reference_rejected_logps": -290.1636962890625, + "debug/sppo_chosen_loss": 2095.25146484375, + "debug/sppo_chosen_reward_in_loss": 5.091801643371582, + "debug/sppo_rej_reward_in_loss": 6.598813056945801, + "debug/sppo_reject_loss": 3243.10498046875, + "epoch": 3.931159420289855, + "grad_norm": 80427.36638938684, + "learning_rate": 5.513254409194554e-08, + "logits/chosen": 1.2611393928527832, + "logits/rejected": 1.7364327907562256, + "logps/chosen": -254.24649047851562, + "logps/rejected": -283.56488037109375, + "loss": 5105.2883, + "rewards/accuracies": 0.44999998807907104, + "rewards/chosen": 0.05091802030801773, + "rewards/margins": -0.015070107765495777, + "rewards/rejected": 0.06598811596632004, + "step": 1085 + }, + { + "debug/policy_chosen_logits": 1.1099778413772583, + "debug/policy_chosen_logps": -226.66616821289062, + "debug/policy_rejected_logits": 1.6032660007476807, + "debug/policy_rejected_logps": -317.4427490234375, + "debug/reference_chosen_logps": -233.14584350585938, + "debug/reference_rejected_logps": -319.9851989746094, + "debug/sppo_chosen_loss": 1903.612548828125, + "debug/sppo_chosen_reward_in_loss": 6.479703426361084, + "debug/sppo_rej_reward_in_loss": 2.5424790382385254, + "debug/sppo_reject_loss": 2803.58642578125, + "epoch": 3.949275362318841, + "grad_norm": 60879.37374870972, + "learning_rate": 5.4761793425887274e-08, + "logits/chosen": 1.1099778413772583, + "logits/rejected": 1.6032660007476807, + "logps/chosen": -226.66616821289062, + "logps/rejected": -317.4427490234375, + "loss": 4794.3469, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": 0.06479702889919281, + "rewards/margins": 0.039372242987155914, + "rewards/rejected": 0.025424787774682045, + "step": 1090 + }, + { + "debug/policy_chosen_logits": 0.9639069437980652, + "debug/policy_chosen_logps": -227.4275665283203, + "debug/policy_rejected_logits": 1.2056257724761963, + "debug/policy_rejected_logps": -266.2969665527344, + "debug/reference_chosen_logps": -232.79541015625, + "debug/reference_rejected_logps": -265.0921325683594, + "debug/sppo_chosen_loss": 2008.8134765625, + "debug/sppo_chosen_reward_in_loss": 5.367839336395264, + "debug/sppo_rej_reward_in_loss": -1.2048313617706299, + "debug/sppo_reject_loss": 2482.92919921875, + "epoch": 3.967391304347826, + "grad_norm": 62988.92224608488, + "learning_rate": 5.439077835640038e-08, + "logits/chosen": 0.9639069437980652, + "logits/rejected": 1.2056257724761963, + "logps/chosen": -227.4275665283203, + "logps/rejected": -266.2969665527344, + "loss": 4757.8082, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.05367839336395264, + "rewards/margins": 0.06572670489549637, + "rewards/rejected": -0.012048312462866306, + "step": 1095 + }, + { + "debug/policy_chosen_logits": 1.3540475368499756, + "debug/policy_chosen_logps": -268.8951721191406, + "debug/policy_rejected_logits": 1.5584566593170166, + "debug/policy_rejected_logps": -272.23944091796875, + "debug/reference_chosen_logps": -273.50811767578125, + "debug/reference_rejected_logps": -270.3443298339844, + "debug/sppo_chosen_loss": 2084.36181640625, + "debug/sppo_chosen_reward_in_loss": 4.612961769104004, + "debug/sppo_rej_reward_in_loss": -1.8950939178466797, + "debug/sppo_reject_loss": 2381.415283203125, + "epoch": 3.9855072463768115, + "grad_norm": 78353.44336278702, + "learning_rate": 5.4019519484474376e-08, + "logits/chosen": 1.3540475368499756, + "logits/rejected": 1.5584566593170166, + "logps/chosen": -268.8951721191406, + "logps/rejected": -272.23944091796875, + "loss": 4598.4699, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.04612961411476135, + "rewards/margins": 0.06508056074380875, + "rewards/rejected": -0.018950939178466797, + "step": 1100 + }, + { + "epoch": 3.9855072463768115, + "eval_debug/policy_chosen_logits": 1.4246468544006348, + "eval_debug/policy_chosen_logps": -250.67323303222656, + "eval_debug/policy_rejected_logits": 1.4704437255859375, + "eval_debug/policy_rejected_logps": -261.152099609375, + "eval_debug/reference_chosen_logps": -252.91845703125, + "eval_debug/reference_rejected_logps": -259.6585998535156, + "eval_debug/sppo_chosen_loss": 2330.43505859375, + "eval_debug/sppo_chosen_reward_in_loss": 2.245246648788452, + "eval_debug/sppo_rej_reward_in_loss": -1.4934808015823364, + "eval_debug/sppo_reject_loss": 2454.228515625, + "eval_logits/chosen": 1.4246468544006348, + "eval_logits/rejected": 1.4704437255859375, + "eval_logps/chosen": -250.67323303222656, + "eval_logps/rejected": -261.152099609375, + "eval_loss": 4659.80908203125, + "eval_rewards/accuracies": 0.6578947305679321, + "eval_rewards/chosen": 0.022452462464571, + "eval_rewards/margins": 0.03738727420568466, + "eval_rewards/rejected": -0.014934806153178215, + "eval_runtime": 28.6024, + "eval_samples_per_second": 20.977, + "eval_steps_per_second": 0.664, + "step": 1100 + }, + { + "debug/policy_chosen_logits": 1.1118760108947754, + "debug/policy_chosen_logps": -255.175048828125, + "debug/policy_rejected_logits": 1.0309432744979858, + "debug/policy_rejected_logps": -250.95205688476562, + "debug/reference_chosen_logps": -258.0804748535156, + "debug/reference_rejected_logps": -249.09701538085938, + "debug/sppo_chosen_loss": 2245.64990234375, + "debug/sppo_chosen_reward_in_loss": 2.9053878784179688, + "debug/sppo_rej_reward_in_loss": -1.855006456375122, + "debug/sppo_reject_loss": 2433.066162109375, + "epoch": 4.003623188405797, + "grad_norm": 66540.11198134471, + "learning_rate": 5.364803742463616e-08, + "logits/chosen": 1.1118760108947754, + "logits/rejected": 1.0309432744979858, + "logps/chosen": -255.175048828125, + "logps/rejected": -250.95205688476562, + "loss": 4605.5047, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": 0.02905387617647648, + "rewards/margins": 0.04760394245386124, + "rewards/rejected": -0.01855006255209446, + "step": 1105 + }, + { + "debug/policy_chosen_logits": 1.1205860376358032, + "debug/policy_chosen_logps": -227.9105987548828, + "debug/policy_rejected_logits": 1.6373428106307983, + "debug/policy_rejected_logps": -315.283203125, + "debug/reference_chosen_logps": -230.3061981201172, + "debug/reference_rejected_logps": -310.6712341308594, + "debug/sppo_chosen_loss": 2305.743408203125, + "debug/sppo_chosen_reward_in_loss": 2.3956267833709717, + "debug/sppo_rej_reward_in_loss": -4.611940383911133, + "debug/sppo_reject_loss": 2187.669189453125, + "epoch": 4.021739130434782, + "grad_norm": 117309.90344802302, + "learning_rate": 5.327635280380538e-08, + "logits/chosen": 1.1205860376358032, + "logits/rejected": 1.6373428106307983, + "logps/chosen": -227.9105987548828, + "logps/rejected": -315.283203125, + "loss": 4562.8648, + "rewards/accuracies": 0.824999988079071, + "rewards/chosen": 0.023956269025802612, + "rewards/margins": 0.07007567584514618, + "rewards/rejected": -0.04611939936876297, + "step": 1110 + }, + { + "debug/policy_chosen_logits": 1.216569185256958, + "debug/policy_chosen_logps": -245.4633331298828, + "debug/policy_rejected_logits": 1.344404935836792, + "debug/policy_rejected_logps": -275.8775939941406, + "debug/reference_chosen_logps": -246.9245147705078, + "debug/reference_rejected_logps": -274.3244323730469, + "debug/sppo_chosen_loss": 2414.36669921875, + "debug/sppo_chosen_reward_in_loss": 1.4611823558807373, + "debug/sppo_rej_reward_in_loss": -1.5531257390975952, + "debug/sppo_reject_loss": 2421.377197265625, + "epoch": 4.0398550724637685, + "grad_norm": 65295.14624639694, + "learning_rate": 5.290448626014904e-08, + "logits/chosen": 1.216569185256958, + "logits/rejected": 1.344404935836792, + "logps/chosen": -245.4633331298828, + "logps/rejected": -275.8775939941406, + "loss": 4631.8605, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.014611823484301567, + "rewards/margins": 0.03014308586716652, + "rewards/rejected": -0.015531256794929504, + "step": 1115 + }, + { + "debug/policy_chosen_logits": 1.2135611772537231, + "debug/policy_chosen_logps": -256.9342041015625, + "debug/policy_rejected_logits": 1.3099197149276733, + "debug/policy_rejected_logps": -286.2529296875, + "debug/reference_chosen_logps": -259.2249450683594, + "debug/reference_rejected_logps": -282.4788818359375, + "debug/sppo_chosen_loss": 2306.509521484375, + "debug/sppo_chosen_reward_in_loss": 2.2907767295837402, + "debug/sppo_rej_reward_in_loss": -3.77405047416687, + "debug/sppo_reject_loss": 2217.175537109375, + "epoch": 4.057971014492754, + "grad_norm": 105412.84079895554, + "learning_rate": 5.253245844193563e-08, + "logits/chosen": 1.2135611772537231, + "logits/rejected": 1.3099197149276733, + "logps/chosen": -256.9342041015625, + "logps/rejected": -286.2529296875, + "loss": 4491.2824, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.022907767444849014, + "rewards/margins": 0.060648269951343536, + "rewards/rejected": -0.03774050623178482, + "step": 1120 + }, + { + "debug/policy_chosen_logits": 1.1843010187149048, + "debug/policy_chosen_logps": -252.4182586669922, + "debug/policy_rejected_logits": 1.4800398349761963, + "debug/policy_rejected_logps": -295.4695739746094, + "debug/reference_chosen_logps": -255.17752075195312, + "debug/reference_rejected_logps": -289.46148681640625, + "debug/sppo_chosen_loss": 2242.72412109375, + "debug/sppo_chosen_reward_in_loss": 2.7592289447784424, + "debug/sppo_rej_reward_in_loss": -6.00807523727417, + "debug/sppo_reject_loss": 2072.38037109375, + "epoch": 4.076086956521739, + "grad_norm": 77597.85285256788, + "learning_rate": 5.21602900063886e-08, + "logits/chosen": 1.1843010187149048, + "logits/rejected": 1.4800398349761963, + "logps/chosen": -252.4182586669922, + "logps/rejected": -295.4695739746094, + "loss": 4496.1145, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": 0.027592290192842484, + "rewards/margins": 0.08767304569482803, + "rewards/rejected": -0.06008074805140495, + "step": 1125 + }, + { + "debug/policy_chosen_logits": 1.3048770427703857, + "debug/policy_chosen_logps": -276.3455505371094, + "debug/policy_rejected_logits": 1.5092872381210327, + "debug/policy_rejected_logps": -294.993896484375, + "debug/reference_chosen_logps": -276.34832763671875, + "debug/reference_rejected_logps": -289.8849182128906, + "debug/sppo_chosen_loss": 2554.799560546875, + "debug/sppo_chosen_reward_in_loss": 0.0027608871459960938, + "debug/sppo_rej_reward_in_loss": -5.10897159576416, + "debug/sppo_reject_loss": 2119.67626953125, + "epoch": 4.094202898550725, + "grad_norm": 97216.84067192367, + "learning_rate": 5.1788001618539276e-08, + "logits/chosen": 1.3048770427703857, + "logits/rejected": 1.5092872381210327, + "logps/chosen": -276.3455505371094, + "logps/rejected": -294.993896484375, + "loss": 4531.5531, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": 2.7608499294728972e-05, + "rewards/margins": 0.05111732333898544, + "rewards/rejected": -0.05108971521258354, + "step": 1130 + }, + { + "debug/policy_chosen_logits": 0.9452874064445496, + "debug/policy_chosen_logps": -240.93484497070312, + "debug/policy_rejected_logits": 1.2251718044281006, + "debug/policy_rejected_logps": -279.31304931640625, + "debug/reference_chosen_logps": -242.5782012939453, + "debug/reference_rejected_logps": -273.7024841308594, + "debug/sppo_chosen_loss": 2375.52734375, + "debug/sppo_chosen_reward_in_loss": 1.643363356590271, + "debug/sppo_rej_reward_in_loss": -5.610522270202637, + "debug/sppo_reject_loss": 2086.57177734375, + "epoch": 4.11231884057971, + "grad_norm": 79223.02835815644, + "learning_rate": 5.141561395007945e-08, + "logits/chosen": 0.9452874064445496, + "logits/rejected": 1.2251718044281006, + "logps/chosen": -240.93484497070312, + "logps/rejected": -279.31304931640625, + "loss": 4587.2625, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": 0.016433632001280785, + "rewards/margins": 0.07253885269165039, + "rewards/rejected": -0.05610521882772446, + "step": 1135 + }, + { + "debug/policy_chosen_logits": 0.9107455015182495, + "debug/policy_chosen_logps": -257.3502197265625, + "debug/policy_rejected_logits": 1.4192378520965576, + "debug/policy_rejected_logps": -306.44573974609375, + "debug/reference_chosen_logps": -257.38555908203125, + "debug/reference_rejected_logps": -298.5243225097656, + "debug/sppo_chosen_loss": 2568.659912109375, + "debug/sppo_chosen_reward_in_loss": 0.035347748547792435, + "debug/sppo_rej_reward_in_loss": -7.921439170837402, + "debug/sppo_reject_loss": 1932.2445068359375, + "epoch": 4.130434782608695, + "grad_norm": 120058.59485182083, + "learning_rate": 5.104314767821363e-08, + "logits/chosen": 0.9107455015182495, + "logits/rejected": 1.4192378520965576, + "logps/chosen": -257.3502197265625, + "logps/rejected": -306.44573974609375, + "loss": 4574.6625, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.000353475654264912, + "rewards/margins": 0.07956786453723907, + "rewards/rejected": -0.07921438664197922, + "step": 1140 + }, + { + "debug/policy_chosen_logits": 1.1127492189407349, + "debug/policy_chosen_logps": -247.59640502929688, + "debug/policy_rejected_logits": 1.3368191719055176, + "debug/policy_rejected_logps": -298.4590148925781, + "debug/reference_chosen_logps": -248.70059204101562, + "debug/reference_rejected_logps": -292.8433532714844, + "debug/sppo_chosen_loss": 2429.892333984375, + "debug/sppo_chosen_reward_in_loss": 1.1041476726531982, + "debug/sppo_rej_reward_in_loss": -5.615652561187744, + "debug/sppo_reject_loss": 2096.07275390625, + "epoch": 4.148550724637682, + "grad_norm": 57946.196357600886, + "learning_rate": 5.067062348451078e-08, + "logits/chosen": 1.1127492189407349, + "logits/rejected": 1.3368191719055176, + "logps/chosen": -247.59640502929688, + "logps/rejected": -298.4590148925781, + "loss": 4558.4766, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.011041476391255856, + "rewards/margins": 0.0671980008482933, + "rewards/rejected": -0.05615652725100517, + "step": 1145 + }, + { + "debug/policy_chosen_logits": 1.174381971359253, + "debug/policy_chosen_logps": -249.43716430664062, + "debug/policy_rejected_logits": 1.5160521268844604, + "debug/policy_rejected_logps": -294.8294982910156, + "debug/reference_chosen_logps": -251.27197265625, + "debug/reference_rejected_logps": -290.90399169921875, + "debug/sppo_chosen_loss": 2353.6328125, + "debug/sppo_chosen_reward_in_loss": 1.8347762823104858, + "debug/sppo_rej_reward_in_loss": -3.92549467086792, + "debug/sppo_reject_loss": 2237.70751953125, + "epoch": 4.166666666666667, + "grad_norm": 59812.82756526929, + "learning_rate": 5.029806205375612e-08, + "logits/chosen": 1.174381971359253, + "logits/rejected": 1.5160521268844604, + "logps/chosen": -249.43716430664062, + "logps/rejected": -294.8294982910156, + "loss": 4607.048, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.018347764387726784, + "rewards/margins": 0.057602714747190475, + "rewards/rejected": -0.039254944771528244, + "step": 1150 + }, + { + "debug/policy_chosen_logits": 1.2691490650177002, + "debug/policy_chosen_logps": -268.0215759277344, + "debug/policy_rejected_logits": 1.664320707321167, + "debug/policy_rejected_logps": -268.55377197265625, + "debug/reference_chosen_logps": -268.6834716796875, + "debug/reference_rejected_logps": -264.5723571777344, + "debug/sppo_chosen_loss": 2495.21826171875, + "debug/sppo_chosen_reward_in_loss": 0.661870002746582, + "debug/sppo_rej_reward_in_loss": -3.981393814086914, + "debug/sppo_reject_loss": 2244.986328125, + "epoch": 4.184782608695652, + "grad_norm": 66401.13018831579, + "learning_rate": 4.9925484072802416e-08, + "logits/chosen": 1.2691490650177002, + "logits/rejected": 1.664320707321167, + "logps/chosen": -268.0215759277344, + "logps/rejected": -268.55377197265625, + "loss": 4537.3687, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": 0.00661869952455163, + "rewards/margins": 0.04643263667821884, + "rewards/rejected": -0.03981393203139305, + "step": 1155 + }, + { + "debug/policy_chosen_logits": 1.1173344850540161, + "debug/policy_chosen_logps": -228.13916015625, + "debug/policy_rejected_logits": 1.6752166748046875, + "debug/policy_rejected_logps": -300.904296875, + "debug/reference_chosen_logps": -228.95333862304688, + "debug/reference_rejected_logps": -298.1351623535156, + "debug/sppo_chosen_loss": 2445.013671875, + "debug/sppo_chosen_reward_in_loss": 0.8141956329345703, + "debug/sppo_rej_reward_in_loss": -2.769136428833008, + "debug/sppo_reject_loss": 2345.28369140625, + "epoch": 4.202898550724638, + "grad_norm": 60564.715991142424, + "learning_rate": 4.955291022942145e-08, + "logits/chosen": 1.1173344850540161, + "logits/rejected": 1.6752166748046875, + "logps/chosen": -228.13916015625, + "logps/rejected": -300.904296875, + "loss": 4666.5391, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": 0.008141955360770226, + "rewards/margins": 0.035833317786455154, + "rewards/rejected": -0.027691364288330078, + "step": 1160 + }, + { + "debug/policy_chosen_logits": 1.0344722270965576, + "debug/policy_chosen_logps": -238.86990356445312, + "debug/policy_rejected_logits": 1.4619827270507812, + "debug/policy_rejected_logps": -276.3166809082031, + "debug/reference_chosen_logps": -240.12442016601562, + "debug/reference_rejected_logps": -271.47845458984375, + "debug/sppo_chosen_loss": 2413.0107421875, + "debug/sppo_chosen_reward_in_loss": 1.2545111179351807, + "debug/sppo_rej_reward_in_loss": -4.838225364685059, + "debug/sppo_reject_loss": 2133.81298828125, + "epoch": 4.221014492753623, + "grad_norm": 95972.71297880122, + "learning_rate": 4.918036121115522e-08, + "logits/chosen": 1.0344722270965576, + "logits/rejected": 1.4619827270507812, + "logps/chosen": -238.86990356445312, + "logps/rejected": -276.3166809082031, + "loss": 4568.5465, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": 0.012545110657811165, + "rewards/margins": 0.060927361249923706, + "rewards/rejected": -0.04838225245475769, + "step": 1165 + }, + { + "debug/policy_chosen_logits": 1.2485231161117554, + "debug/policy_chosen_logps": -272.33270263671875, + "debug/policy_rejected_logits": 1.2228999137878418, + "debug/policy_rejected_logps": -272.02862548828125, + "debug/reference_chosen_logps": -274.59075927734375, + "debug/reference_rejected_logps": -266.5238037109375, + "debug/sppo_chosen_loss": 2290.04931640625, + "debug/sppo_chosen_reward_in_loss": 2.2580807209014893, + "debug/sppo_rej_reward_in_loss": -5.504796028137207, + "debug/sppo_reject_loss": 2098.5244140625, + "epoch": 4.239130434782608, + "grad_norm": 61518.25577334932, + "learning_rate": 4.8807857704167354e-08, + "logits/chosen": 1.2485231161117554, + "logits/rejected": 1.2228999137878418, + "logps/chosen": -272.33270263671875, + "logps/rejected": -272.02862548828125, + "loss": 4555.2211, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": 0.02258080616593361, + "rewards/margins": 0.07762876898050308, + "rewards/rejected": -0.05504796653985977, + "step": 1170 + }, + { + "debug/policy_chosen_logits": 1.1164934635162354, + "debug/policy_chosen_logps": -260.64654541015625, + "debug/policy_rejected_logits": 1.1424113512039185, + "debug/policy_rejected_logps": -283.02191162109375, + "debug/reference_chosen_logps": -261.7978515625, + "debug/reference_rejected_logps": -278.51116943359375, + "debug/sppo_chosen_loss": 2429.515625, + "debug/sppo_chosen_reward_in_loss": 1.1512893438339233, + "debug/sppo_rej_reward_in_loss": -4.510707378387451, + "debug/sppo_reject_loss": 2166.89794921875, + "epoch": 4.257246376811594, + "grad_norm": 63260.42201895485, + "learning_rate": 4.843542039209433e-08, + "logits/chosen": 1.1164934635162354, + "logits/rejected": 1.1424113512039185, + "logps/chosen": -260.64654541015625, + "logps/rejected": -283.02191162109375, + "loss": 4533.0938, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.011512893252074718, + "rewards/margins": 0.05661996454000473, + "rewards/rejected": -0.04510707035660744, + "step": 1175 + }, + { + "debug/policy_chosen_logits": 1.2684993743896484, + "debug/policy_chosen_logps": -276.10369873046875, + "debug/policy_rejected_logits": 1.4684734344482422, + "debug/policy_rejected_logps": -307.7631530761719, + "debug/reference_chosen_logps": -273.1227722167969, + "debug/reference_rejected_logps": -302.58367919921875, + "debug/sppo_chosen_loss": 2924.57373046875, + "debug/sppo_chosen_reward_in_loss": -2.980940341949463, + "debug/sppo_rej_reward_in_loss": -5.179494380950928, + "debug/sppo_reject_loss": 2151.70654296875, + "epoch": 4.27536231884058, + "grad_norm": 93177.70691280684, + "learning_rate": 4.806306995489717e-08, + "logits/chosen": 1.2684993743896484, + "logits/rejected": 1.4684734344482422, + "logps/chosen": -276.10369873046875, + "logps/rejected": -307.7631530761719, + "loss": 4602.8684, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -0.02980940416455269, + "rewards/margins": 0.02198554016649723, + "rewards/rejected": -0.05179494619369507, + "step": 1180 + }, + { + "debug/policy_chosen_logits": 0.8586879968643188, + "debug/policy_chosen_logps": -242.4465789794922, + "debug/policy_rejected_logits": 1.266420602798462, + "debug/policy_rejected_logps": -309.1039733886719, + "debug/reference_chosen_logps": -243.1239013671875, + "debug/reference_rejected_logps": -302.6136169433594, + "debug/sppo_chosen_loss": 2482.39892578125, + "debug/sppo_chosen_reward_in_loss": 0.6773250699043274, + "debug/sppo_rej_reward_in_loss": -6.490335941314697, + "debug/sppo_reject_loss": 2004.927734375, + "epoch": 4.293478260869565, + "grad_norm": 64413.494297686426, + "learning_rate": 4.769082706771303e-08, + "logits/chosen": 0.8586879968643188, + "logits/rejected": 1.266420602798462, + "logps/chosen": -242.4465789794922, + "logps/rejected": -309.1039733886719, + "loss": 4494.5984, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": 0.006773251108825207, + "rewards/margins": 0.07167660444974899, + "rewards/rejected": -0.06490335613489151, + "step": 1185 + }, + { + "debug/policy_chosen_logits": 1.2695519924163818, + "debug/policy_chosen_logps": -249.88650512695312, + "debug/policy_rejected_logits": 1.4001498222351074, + "debug/policy_rejected_logps": -259.56414794921875, + "debug/reference_chosen_logps": -252.074951171875, + "debug/reference_rejected_logps": -256.25079345703125, + "debug/sppo_chosen_loss": 2297.40478515625, + "debug/sppo_chosen_reward_in_loss": 2.188446521759033, + "debug/sppo_rej_reward_in_loss": -3.3133347034454346, + "debug/sppo_reject_loss": 2301.43310546875, + "epoch": 4.311594202898551, + "grad_norm": 68571.0234947558, + "learning_rate": 4.731871239970723e-08, + "logits/chosen": 1.2695519924163818, + "logits/rejected": 1.4001498222351074, + "logps/chosen": -249.88650512695312, + "logps/rejected": -259.56414794921875, + "loss": 4506.7273, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": 0.02188446745276451, + "rewards/margins": 0.05501781031489372, + "rewards/rejected": -0.03313334658741951, + "step": 1190 + }, + { + "debug/policy_chosen_logits": 1.0433049201965332, + "debug/policy_chosen_logps": -260.4845275878906, + "debug/policy_rejected_logits": 1.3592349290847778, + "debug/policy_rejected_logps": -304.9561462402344, + "debug/reference_chosen_logps": -261.36907958984375, + "debug/reference_rejected_logps": -301.53497314453125, + "debug/sppo_chosen_loss": 2446.2802734375, + "debug/sppo_chosen_reward_in_loss": 0.8845428228378296, + "debug/sppo_rej_reward_in_loss": -3.4211902618408203, + "debug/sppo_reject_loss": 2262.201904296875, + "epoch": 4.329710144927536, + "grad_norm": 60012.33571932537, + "learning_rate": 4.694674661292563e-08, + "logits/chosen": 1.0433049201965332, + "logits/rejected": 1.3592349290847778, + "logps/chosen": -260.4845275878906, + "logps/rejected": -304.9561462402344, + "loss": 4682.9477, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": 0.008845428004860878, + "rewards/margins": 0.04305732995271683, + "rewards/rejected": -0.0342118963599205, + "step": 1195 + }, + { + "debug/policy_chosen_logits": 1.639452338218689, + "debug/policy_chosen_logps": -251.68209838867188, + "debug/policy_rejected_logits": 2.0136008262634277, + "debug/policy_rejected_logps": -309.7662048339844, + "debug/reference_chosen_logps": -252.7598114013672, + "debug/reference_rejected_logps": -300.9691467285156, + "debug/sppo_chosen_loss": 2417.675048828125, + "debug/sppo_chosen_reward_in_loss": 1.0777438879013062, + "debug/sppo_rej_reward_in_loss": -8.797119140625, + "debug/sppo_reject_loss": 1863.860595703125, + "epoch": 4.3478260869565215, + "grad_norm": 59513.496975803915, + "learning_rate": 4.6574950361147296e-08, + "logits/chosen": 1.639452338218689, + "logits/rejected": 2.0136008262634277, + "logps/chosen": -251.68209838867188, + "logps/rejected": -309.7662048339844, + "loss": 4434.3441, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": 0.010777438059449196, + "rewards/margins": 0.09874863177537918, + "rewards/rejected": -0.08797118812799454, + "step": 1200 + }, + { + "epoch": 4.3478260869565215, + "eval_debug/policy_chosen_logits": 1.4176274538040161, + "eval_debug/policy_chosen_logps": -253.55946350097656, + "eval_debug/policy_rejected_logits": 1.4647811651229858, + "eval_debug/policy_rejected_logps": -264.1338806152344, + "eval_debug/reference_chosen_logps": -252.91845703125, + "eval_debug/reference_rejected_logps": -259.6585998535156, + "eval_debug/sppo_chosen_loss": 2633.100830078125, + "eval_debug/sppo_chosen_reward_in_loss": -0.6409844160079956, + "eval_debug/sppo_rej_reward_in_loss": -4.475238800048828, + "eval_debug/sppo_reject_loss": 2222.516357421875, + "eval_logits/chosen": 1.4176274538040161, + "eval_logits/rejected": 1.4647811651229858, + "eval_logps/chosen": -253.55946350097656, + "eval_logps/rejected": -264.1338806152344, + "eval_loss": 4652.3701171875, + "eval_rewards/accuracies": 0.5789473652839661, + "eval_rewards/chosen": -0.006409844849258661, + "eval_rewards/margins": 0.03834254667162895, + "eval_rewards/rejected": -0.044752392917871475, + "eval_runtime": 28.2997, + "eval_samples_per_second": 21.202, + "eval_steps_per_second": 0.671, + "step": 1200 + }, + { + "debug/policy_chosen_logits": 1.046290397644043, + "debug/policy_chosen_logps": -251.0709991455078, + "debug/policy_rejected_logits": 1.3077054023742676, + "debug/policy_rejected_logps": -298.0001525878906, + "debug/reference_chosen_logps": -251.0874481201172, + "debug/reference_rejected_logps": -295.08001708984375, + "debug/sppo_chosen_loss": 2553.08642578125, + "debug/sppo_chosen_reward_in_loss": 0.016447830945253372, + "debug/sppo_rej_reward_in_loss": -2.9201416969299316, + "debug/sppo_reject_loss": 2284.946533203125, + "epoch": 4.365942028985507, + "grad_norm": 69233.98566803512, + "learning_rate": 4.6203344288737694e-08, + "logits/chosen": 1.046290397644043, + "logits/rejected": 1.3077054023742676, + "logps/chosen": -251.0709991455078, + "logps/rejected": -298.0001525878906, + "loss": 4569.1977, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.00016447734378743917, + "rewards/margins": 0.029365893453359604, + "rewards/rejected": -0.029201412573456764, + "step": 1205 + }, + { + "debug/policy_chosen_logits": 1.3429532051086426, + "debug/policy_chosen_logps": -232.9199981689453, + "debug/policy_rejected_logits": 1.702622652053833, + "debug/policy_rejected_logps": -267.97393798828125, + "debug/reference_chosen_logps": -235.09268188476562, + "debug/reference_rejected_logps": -262.50762939453125, + "debug/sppo_chosen_loss": 2300.420654296875, + "debug/sppo_chosen_reward_in_loss": 2.1726772785186768, + "debug/sppo_rej_reward_in_loss": -5.466324806213379, + "debug/sppo_reject_loss": 2110.576171875, + "epoch": 4.384057971014493, + "grad_norm": 86817.63430815592, + "learning_rate": 4.583194902950234e-08, + "logits/chosen": 1.3429532051086426, + "logits/rejected": 1.702622652053833, + "logps/chosen": -232.9199981689453, + "logps/rejected": -267.97393798828125, + "loss": 4474.6441, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": 0.02172677218914032, + "rewards/margins": 0.07639001309871674, + "rewards/rejected": -0.054663240909576416, + "step": 1210 + }, + { + "debug/policy_chosen_logits": 1.091511845588684, + "debug/policy_chosen_logps": -274.4998779296875, + "debug/policy_rejected_logits": 1.1359487771987915, + "debug/policy_rejected_logps": -276.3737487792969, + "debug/reference_chosen_logps": -274.5265197753906, + "debug/reference_rejected_logps": -268.24322509765625, + "debug/sppo_chosen_loss": 2567.46533203125, + "debug/sppo_chosen_reward_in_loss": 0.02658367156982422, + "debug/sppo_rej_reward_in_loss": -8.130558967590332, + "debug/sppo_reject_loss": 1878.1728515625, + "epoch": 4.4021739130434785, + "grad_norm": 57731.05906653854, + "learning_rate": 4.546078520554123e-08, + "logits/chosen": 1.091511845588684, + "logits/rejected": 1.1359487771987915, + "logps/chosen": -274.4998779296875, + "logps/rejected": -276.3737487792969, + "loss": 4550.2871, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": 0.00026583747239783406, + "rewards/margins": 0.08157142251729965, + "rewards/rejected": -0.08130558580160141, + "step": 1215 + }, + { + "debug/policy_chosen_logits": 0.7921018600463867, + "debug/policy_chosen_logps": -234.0077667236328, + "debug/policy_rejected_logits": 1.1153194904327393, + "debug/policy_rejected_logps": -288.6137390136719, + "debug/reference_chosen_logps": -235.14913940429688, + "debug/reference_rejected_logps": -283.31939697265625, + "debug/sppo_chosen_loss": 2416.521484375, + "debug/sppo_chosen_reward_in_loss": 1.1413967609405518, + "debug/sppo_rej_reward_in_loss": -5.294376850128174, + "debug/sppo_reject_loss": 2090.29541015625, + "epoch": 4.420289855072464, + "grad_norm": 64438.99635837363, + "learning_rate": 4.5089873426103575e-08, + "logits/chosen": 0.7921018600463867, + "logits/rejected": 1.1153194904327393, + "logps/chosen": -234.0077667236328, + "logps/rejected": -288.6137390136719, + "loss": 4561.3641, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": 0.011413967236876488, + "rewards/margins": 0.06435773521661758, + "rewards/rejected": -0.05294376611709595, + "step": 1220 + }, + { + "debug/policy_chosen_logits": 1.2593185901641846, + "debug/policy_chosen_logps": -251.8949432373047, + "debug/policy_rejected_logits": 1.3442823886871338, + "debug/policy_rejected_logps": -266.51556396484375, + "debug/reference_chosen_logps": -254.294189453125, + "debug/reference_rejected_logps": -261.4356689453125, + "debug/sppo_chosen_loss": 2281.20849609375, + "debug/sppo_chosen_reward_in_loss": 2.3992409706115723, + "debug/sppo_rej_reward_in_loss": -5.079881191253662, + "debug/sppo_reject_loss": 2105.07666015625, + "epoch": 4.438405797101449, + "grad_norm": 69135.99785715759, + "learning_rate": 4.471923428644361e-08, + "logits/chosen": 1.2593185901641846, + "logits/rejected": 1.3442823886871338, + "logps/chosen": -251.8949432373047, + "logps/rejected": -266.51556396484375, + "loss": 4308.8, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": 0.02399240806698799, + "rewards/margins": 0.07479121536016464, + "rewards/rejected": -0.05079881101846695, + "step": 1225 + }, + { + "debug/policy_chosen_logits": 1.1671477556228638, + "debug/policy_chosen_logps": -262.21832275390625, + "debug/policy_rejected_logits": 1.5277702808380127, + "debug/policy_rejected_logps": -303.25189208984375, + "debug/reference_chosen_logps": -262.35906982421875, + "debug/reference_rejected_logps": -297.26397705078125, + "debug/sppo_chosen_loss": 2544.049072265625, + "debug/sppo_chosen_reward_in_loss": 0.14074191451072693, + "debug/sppo_rej_reward_in_loss": -5.987893581390381, + "debug/sppo_reject_loss": 2118.998291015625, + "epoch": 4.456521739130435, + "grad_norm": 75122.42668605472, + "learning_rate": 4.4348888366677e-08, + "logits/chosen": 1.1671477556228638, + "logits/rejected": 1.5277702808380127, + "logps/chosen": -262.21832275390625, + "logps/rejected": -303.25189208984375, + "loss": 4490.9406, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": 0.0014074190985411406, + "rewards/margins": 0.06128635257482529, + "rewards/rejected": -0.059878937900066376, + "step": 1230 + }, + { + "debug/policy_chosen_logits": 1.1719297170639038, + "debug/policy_chosen_logps": -243.4971923828125, + "debug/policy_rejected_logits": 1.5150502920150757, + "debug/policy_rejected_logps": -290.97918701171875, + "debug/reference_chosen_logps": -245.56253051757812, + "debug/reference_rejected_logps": -283.6595153808594, + "debug/sppo_chosen_loss": 2315.97802734375, + "debug/sppo_chosen_reward_in_loss": 2.0653247833251953, + "debug/sppo_rej_reward_in_loss": -7.319671630859375, + "debug/sppo_reject_loss": 1958.665283203125, + "epoch": 4.47463768115942, + "grad_norm": 65485.326826669945, + "learning_rate": 4.3978856230638006e-08, + "logits/chosen": 1.1719297170639038, + "logits/rejected": 1.5150502920150757, + "logps/chosen": -243.4971923828125, + "logps/rejected": -290.97918701171875, + "loss": 4511.8867, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.020653247833251953, + "rewards/margins": 0.09384995698928833, + "rewards/rejected": -0.07319670915603638, + "step": 1235 + }, + { + "debug/policy_chosen_logits": 0.9258686304092407, + "debug/policy_chosen_logps": -252.1705780029297, + "debug/policy_rejected_logits": 1.5341691970825195, + "debug/policy_rejected_logps": -304.2678527832031, + "debug/reference_chosen_logps": -252.189697265625, + "debug/reference_rejected_logps": -299.5684814453125, + "debug/sppo_chosen_loss": 2564.078125, + "debug/sppo_chosen_reward_in_loss": 0.01913604699075222, + "debug/sppo_rej_reward_in_loss": -4.6993584632873535, + "debug/sppo_reject_loss": 2178.47705078125, + "epoch": 4.492753623188406, + "grad_norm": 67467.04571567297, + "learning_rate": 4.360915842473778e-08, + "logits/chosen": 0.9258686304092407, + "logits/rejected": 1.5341691970825195, + "logps/chosen": -252.1705780029297, + "logps/rejected": -304.2678527832031, + "loss": 4636.2949, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": 0.00019135959155391902, + "rewards/margins": 0.047184936702251434, + "rewards/rejected": -0.04699358344078064, + "step": 1240 + }, + { + "debug/policy_chosen_logits": 1.2430813312530518, + "debug/policy_chosen_logps": -246.3681182861328, + "debug/policy_rejected_logits": 1.4096765518188477, + "debug/policy_rejected_logps": -265.3592834472656, + "debug/reference_chosen_logps": -249.0845184326172, + "debug/reference_rejected_logps": -261.7829895019531, + "debug/sppo_chosen_loss": 2265.285400390625, + "debug/sppo_chosen_reward_in_loss": 2.7164249420166016, + "debug/sppo_rej_reward_in_loss": -3.576289415359497, + "debug/sppo_reject_loss": 2272.076171875, + "epoch": 4.510869565217392, + "grad_norm": 87505.91804699051, + "learning_rate": 4.323981547682341e-08, + "logits/chosen": 1.2430813312530518, + "logits/rejected": 1.4096765518188477, + "logps/chosen": -246.3681182861328, + "logps/rejected": -265.3592834472656, + "loss": 4565.9375, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.02716425061225891, + "rewards/margins": 0.06292714178562164, + "rewards/rejected": -0.03576289117336273, + "step": 1245 + }, + { + "debug/policy_chosen_logits": 0.8251383900642395, + "debug/policy_chosen_logps": -251.19461059570312, + "debug/policy_rejected_logits": 1.2840948104858398, + "debug/policy_rejected_logps": -310.5716247558594, + "debug/reference_chosen_logps": -252.380859375, + "debug/reference_rejected_logps": -305.73828125, + "debug/sppo_chosen_loss": 2430.56005859375, + "debug/sppo_chosen_reward_in_loss": 1.1862595081329346, + "debug/sppo_rej_reward_in_loss": -4.833325386047363, + "debug/sppo_reject_loss": 2147.44091796875, + "epoch": 4.528985507246377, + "grad_norm": 65261.528655588234, + "learning_rate": 4.287084789503821e-08, + "logits/chosen": 0.8251383900642395, + "logits/rejected": 1.2840948104858398, + "logps/chosen": -251.19461059570312, + "logps/rejected": -310.5716247558594, + "loss": 4561.8227, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": 0.011862593702971935, + "rewards/margins": 0.06019585207104683, + "rewards/rejected": -0.04833325743675232, + "step": 1250 + }, + { + "debug/policy_chosen_logits": 1.1032116413116455, + "debug/policy_chosen_logps": -236.8427276611328, + "debug/policy_rejected_logits": 1.1994361877441406, + "debug/policy_rejected_logps": -281.3302917480469, + "debug/reference_chosen_logps": -238.1943359375, + "debug/reference_rejected_logps": -277.8025817871094, + "debug/sppo_chosen_loss": 2413.16064453125, + "debug/sppo_chosen_reward_in_loss": 1.351604700088501, + "debug/sppo_rej_reward_in_loss": -3.5276970863342285, + "debug/sppo_reject_loss": 2266.24072265625, + "epoch": 4.547101449275362, + "grad_norm": 60726.047146706565, + "learning_rate": 4.25022761666828e-08, + "logits/chosen": 1.1032116413116455, + "logits/rejected": 1.1994361877441406, + "logps/chosen": -236.8427276611328, + "logps/rejected": -281.3302917480469, + "loss": 4491.7547, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.013516046106815338, + "rewards/margins": 0.048793014138936996, + "rewards/rejected": -0.03527696803212166, + "step": 1255 + }, + { + "debug/policy_chosen_logits": 1.1790199279785156, + "debug/policy_chosen_logps": -230.02743530273438, + "debug/policy_rejected_logits": 1.228280782699585, + "debug/policy_rejected_logps": -251.299072265625, + "debug/reference_chosen_logps": -231.77627563476562, + "debug/reference_rejected_logps": -250.15414428710938, + "debug/sppo_chosen_loss": 2367.98876953125, + "debug/sppo_chosen_reward_in_loss": 1.7488276958465576, + "debug/sppo_rej_reward_in_loss": -1.1449229717254639, + "debug/sppo_reject_loss": 2475.135009765625, + "epoch": 4.565217391304348, + "grad_norm": 64332.87644757188, + "learning_rate": 4.2134120757077734e-08, + "logits/chosen": 1.1790199279785156, + "logits/rejected": 1.228280782699585, + "logps/chosen": -230.02743530273438, + "logps/rejected": -251.299072265625, + "loss": 4465.0578, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": 0.017488278448581696, + "rewards/margins": 0.028937507420778275, + "rewards/rejected": -0.011449231766164303, + "step": 1260 + }, + { + "debug/policy_chosen_logits": 1.3369028568267822, + "debug/policy_chosen_logps": -251.6381378173828, + "debug/policy_rejected_logits": 1.5229181051254272, + "debug/policy_rejected_logps": -292.3419494628906, + "debug/reference_chosen_logps": -252.33349609375, + "debug/reference_rejected_logps": -288.603515625, + "debug/sppo_chosen_loss": 2490.442138671875, + "debug/sppo_chosen_reward_in_loss": 0.6953468322753906, + "debug/sppo_rej_reward_in_loss": -3.7384228706359863, + "debug/sppo_reject_loss": 2298.62744140625, + "epoch": 4.583333333333333, + "grad_norm": 64540.17470785349, + "learning_rate": 4.176640210842699e-08, + "logits/chosen": 1.3369028568267822, + "logits/rejected": 1.5229181051254272, + "logps/chosen": -251.6381378173828, + "logps/rejected": -292.3419494628906, + "loss": 4575.027, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": 0.0069534690119326115, + "rewards/margins": 0.04433769732713699, + "rewards/rejected": -0.03738423064351082, + "step": 1265 + }, + { + "debug/policy_chosen_logits": 1.377716302871704, + "debug/policy_chosen_logps": -278.78375244140625, + "debug/policy_rejected_logits": 1.4594463109970093, + "debug/policy_rejected_logps": -312.36566162109375, + "debug/reference_chosen_logps": -279.77496337890625, + "debug/reference_rejected_logps": -307.9085693359375, + "debug/sppo_chosen_loss": 2480.254150390625, + "debug/sppo_chosen_reward_in_loss": 0.9911910891532898, + "debug/sppo_rej_reward_in_loss": -4.457060813903809, + "debug/sppo_reject_loss": 2181.80517578125, + "epoch": 4.601449275362318, + "grad_norm": 61145.46098697978, + "learning_rate": 4.139914063868293e-08, + "logits/chosen": 1.377716302871704, + "logits/rejected": 1.4594463109970093, + "logps/chosen": -278.78375244140625, + "logps/rejected": -312.36566162109375, + "loss": 4571.3488, + "rewards/accuracies": 0.824999988079071, + "rewards/chosen": 0.009911911562085152, + "rewards/margins": 0.05448251962661743, + "rewards/rejected": -0.04457060620188713, + "step": 1270 + }, + { + "debug/policy_chosen_logits": 1.1420056819915771, + "debug/policy_chosen_logps": -258.6353454589844, + "debug/policy_rejected_logits": 1.4533131122589111, + "debug/policy_rejected_logps": -294.0809631347656, + "debug/reference_chosen_logps": -259.3207092285156, + "debug/reference_rejected_logps": -291.1746520996094, + "debug/sppo_chosen_loss": 2481.23779296875, + "debug/sppo_chosen_reward_in_loss": 0.6853691339492798, + "debug/sppo_rej_reward_in_loss": -2.906320571899414, + "debug/sppo_reject_loss": 2302.75341796875, + "epoch": 4.619565217391305, + "grad_norm": 83740.46552961235, + "learning_rate": 4.103235674041266e-08, + "logits/chosen": 1.1420056819915771, + "logits/rejected": 1.4533131122589111, + "logps/chosen": -258.6353454589844, + "logps/rejected": -294.0809631347656, + "loss": 4558.2504, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": 0.006853691302239895, + "rewards/margins": 0.03591689467430115, + "rewards/rejected": -0.02906320057809353, + "step": 1275 + }, + { + "debug/policy_chosen_logits": 1.4147323369979858, + "debug/policy_chosen_logps": -269.6281433105469, + "debug/policy_rejected_logits": 1.2961864471435547, + "debug/policy_rejected_logps": -270.24127197265625, + "debug/reference_chosen_logps": -270.97503662109375, + "debug/reference_rejected_logps": -266.56317138671875, + "debug/sppo_chosen_loss": 2403.29443359375, + "debug/sppo_chosen_reward_in_loss": 1.3468936681747437, + "debug/sppo_rej_reward_in_loss": -3.678117275238037, + "debug/sppo_reject_loss": 2250.15869140625, + "epoch": 4.63768115942029, + "grad_norm": 96431.02323300211, + "learning_rate": 4.066607077966558e-08, + "logits/chosen": 1.4147323369979858, + "logits/rejected": 1.2961864471435547, + "logps/chosen": -269.6281433105469, + "logps/rejected": -270.24127197265625, + "loss": 4642.2664, + "rewards/accuracies": 0.5249999761581421, + "rewards/chosen": 0.013468936085700989, + "rewards/margins": 0.0502501018345356, + "rewards/rejected": -0.03678116947412491, + "step": 1280 + }, + { + "debug/policy_chosen_logits": 1.1305530071258545, + "debug/policy_chosen_logps": -217.1304931640625, + "debug/policy_rejected_logits": 1.5472891330718994, + "debug/policy_rejected_logps": -268.34173583984375, + "debug/reference_chosen_logps": -220.8345947265625, + "debug/reference_rejected_logps": -263.44635009765625, + "debug/sppo_chosen_loss": 2151.916259765625, + "debug/sppo_chosen_reward_in_loss": 3.7041258811950684, + "debug/sppo_rej_reward_in_loss": -4.89541482925415, + "debug/sppo_reject_loss": 2223.81689453125, + "epoch": 4.655797101449275, + "grad_norm": 63172.904139896964, + "learning_rate": 4.030030309484266e-08, + "logits/chosen": 1.1305530071258545, + "logits/rejected": 1.5472891330718994, + "logps/chosen": -217.1304931640625, + "logps/rejected": -268.34173583984375, + "loss": 4577.1152, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": 0.037041254341602325, + "rewards/margins": 0.08599540591239929, + "rewards/rejected": -0.04895415157079697, + "step": 1285 + }, + { + "debug/policy_chosen_logits": 1.053808569908142, + "debug/policy_chosen_logps": -240.4136505126953, + "debug/policy_rejected_logits": 1.4557785987854004, + "debug/policy_rejected_logps": -289.88995361328125, + "debug/reference_chosen_logps": -240.63711547851562, + "debug/reference_rejected_logps": -285.86376953125, + "debug/sppo_chosen_loss": 2550.99853515625, + "debug/sppo_chosen_reward_in_loss": 0.22346897423267365, + "debug/sppo_rej_reward_in_loss": -4.026174068450928, + "debug/sppo_reject_loss": 2216.336669921875, + "epoch": 4.673913043478261, + "grad_norm": 58932.00259799572, + "learning_rate": 3.9935073995566984e-08, + "logits/chosen": 1.053808569908142, + "logits/rejected": 1.4557785987854004, + "logps/chosen": -240.4136505126953, + "logps/rejected": -289.88995361328125, + "loss": 4557.0328, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": 0.0022346898913383484, + "rewards/margins": 0.04249643161892891, + "rewards/rejected": -0.04026174172759056, + "step": 1290 + }, + { + "debug/policy_chosen_logits": 1.1827704906463623, + "debug/policy_chosen_logps": -250.3016815185547, + "debug/policy_rejected_logits": 1.66567063331604, + "debug/policy_rejected_logps": -292.5500793457031, + "debug/reference_chosen_logps": -250.30029296875, + "debug/reference_rejected_logps": -286.7928771972656, + "debug/sppo_chosen_loss": 2592.457763671875, + "debug/sppo_chosen_reward_in_loss": -0.0013914108276367188, + "debug/sppo_rej_reward_in_loss": -5.757199764251709, + "debug/sppo_reject_loss": 2131.22412109375, + "epoch": 4.692028985507246, + "grad_norm": 68959.83758611538, + "learning_rate": 3.957040376155625e-08, + "logits/chosen": 1.1827704906463623, + "logits/rejected": 1.66567063331604, + "logps/chosen": -250.3016815185547, + "logps/rejected": -292.5500793457031, + "loss": 4449.4266, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -1.3914332157582976e-05, + "rewards/margins": 0.0575580820441246, + "rewards/rejected": -0.05757199600338936, + "step": 1295 + }, + { + "debug/policy_chosen_logits": 1.2554280757904053, + "debug/policy_chosen_logps": -279.84320068359375, + "debug/policy_rejected_logits": 1.32145094871521, + "debug/policy_rejected_logps": -276.5966796875, + "debug/reference_chosen_logps": -281.0545959472656, + "debug/reference_rejected_logps": -272.4754333496094, + "debug/sppo_chosen_loss": 2443.668701171875, + "debug/sppo_chosen_reward_in_loss": 1.2113920450210571, + "debug/sppo_rej_reward_in_loss": -4.121267795562744, + "debug/sppo_reject_loss": 2225.25048828125, + "epoch": 4.710144927536232, + "grad_norm": 67623.47642834642, + "learning_rate": 3.920631264149647e-08, + "logits/chosen": 1.2554280757904053, + "logits/rejected": 1.32145094871521, + "logps/chosen": -279.84320068359375, + "logps/rejected": -276.5966796875, + "loss": 4673.5336, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": 0.012113918550312519, + "rewards/margins": 0.053326599299907684, + "rewards/rejected": -0.04121267795562744, + "step": 1300 + }, + { + "epoch": 4.710144927536232, + "eval_debug/policy_chosen_logits": 1.413673996925354, + "eval_debug/policy_chosen_logps": -252.32928466796875, + "eval_debug/policy_rejected_logits": 1.4597282409667969, + "eval_debug/policy_rejected_logps": -263.0262756347656, + "eval_debug/reference_chosen_logps": -252.91845703125, + "eval_debug/reference_rejected_logps": -259.6585998535156, + "eval_debug/sppo_chosen_loss": 2506.592041015625, + "eval_debug/sppo_chosen_reward_in_loss": 0.5891677737236023, + "eval_debug/sppo_rej_reward_in_loss": -3.3676302433013916, + "eval_debug/sppo_reject_loss": 2317.545654296875, + "eval_logits/chosen": 1.413673996925354, + "eval_logits/rejected": 1.4597282409667969, + "eval_logps/chosen": -252.32928466796875, + "eval_logps/rejected": -263.0262756347656, + "eval_loss": 4629.23583984375, + "eval_rewards/accuracies": 0.6052631735801697, + "eval_rewards/chosen": 0.005891676992177963, + "eval_rewards/margins": 0.039567980915308, + "eval_rewards/rejected": -0.033676303923130035, + "eval_runtime": 28.2834, + "eval_samples_per_second": 21.214, + "eval_steps_per_second": 0.672, + "step": 1300 + }, + { + "debug/policy_chosen_logits": 1.0135124921798706, + "debug/policy_chosen_logps": -241.3780059814453, + "debug/policy_rejected_logits": 1.3717260360717773, + "debug/policy_rejected_logps": -283.3448791503906, + "debug/reference_chosen_logps": -243.1513214111328, + "debug/reference_rejected_logps": -277.0658264160156, + "debug/sppo_chosen_loss": 2355.27001953125, + "debug/sppo_chosen_reward_in_loss": 1.773308515548706, + "debug/sppo_rej_reward_in_loss": -6.279069900512695, + "debug/sppo_reject_loss": 2043.216064453125, + "epoch": 4.728260869565218, + "grad_norm": 63917.706723514275, + "learning_rate": 3.884282085191782e-08, + "logits/chosen": 1.0135124921798706, + "logits/rejected": 1.3717260360717773, + "logps/chosen": -241.3780059814453, + "logps/rejected": -283.3448791503906, + "loss": 4531.777, + "rewards/accuracies": 0.824999988079071, + "rewards/chosen": 0.01773308590054512, + "rewards/margins": 0.080523781478405, + "rewards/rejected": -0.06279069930315018, + "step": 1305 + }, + { + "debug/policy_chosen_logits": 1.2251166105270386, + "debug/policy_chosen_logps": -254.70632934570312, + "debug/policy_rejected_logits": 1.443878412246704, + "debug/policy_rejected_logps": -293.47235107421875, + "debug/reference_chosen_logps": -256.298828125, + "debug/reference_rejected_logps": -287.5867614746094, + "debug/sppo_chosen_loss": 2382.60595703125, + "debug/sppo_chosen_reward_in_loss": 1.5925235748291016, + "debug/sppo_rej_reward_in_loss": -5.885610103607178, + "debug/sppo_reject_loss": 2065.21435546875, + "epoch": 4.746376811594203, + "grad_norm": 65263.12493107783, + "learning_rate": 3.847994857607208e-08, + "logits/chosen": 1.2251166105270386, + "logits/rejected": 1.443878412246704, + "logps/chosen": -254.70632934570312, + "logps/rejected": -293.47235107421875, + "loss": 4509.5789, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": 0.01592523418366909, + "rewards/margins": 0.07478133589029312, + "rewards/rejected": -0.05885609984397888, + "step": 1310 + }, + { + "debug/policy_chosen_logits": 1.4621236324310303, + "debug/policy_chosen_logps": -258.2713928222656, + "debug/policy_rejected_logits": 1.7599725723266602, + "debug/policy_rejected_logps": -283.0798034667969, + "debug/reference_chosen_logps": -258.59027099609375, + "debug/reference_rejected_logps": -279.9950256347656, + "debug/sppo_chosen_loss": 2545.721923828125, + "debug/sppo_chosen_reward_in_loss": 0.3188707232475281, + "debug/sppo_rej_reward_in_loss": -3.0847675800323486, + "debug/sppo_reject_loss": 2300.00830078125, + "epoch": 4.7644927536231885, + "grad_norm": 66844.54159609167, + "learning_rate": 3.811771596281181e-08, + "logits/chosen": 1.4621236324310303, + "logits/rejected": 1.7599725723266602, + "logps/chosen": -258.2713928222656, + "logps/rejected": -283.0798034667969, + "loss": 4572.9363, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.003188707632943988, + "rewards/margins": 0.03403637930750847, + "rewards/rejected": -0.030847672373056412, + "step": 1315 + }, + { + "debug/policy_chosen_logits": 1.078034520149231, + "debug/policy_chosen_logps": -244.0189666748047, + "debug/policy_rejected_logits": 1.416656255722046, + "debug/policy_rejected_logps": -286.8953552246094, + "debug/reference_chosen_logps": -244.8174591064453, + "debug/reference_rejected_logps": -283.5927429199219, + "debug/sppo_chosen_loss": 2442.52783203125, + "debug/sppo_chosen_reward_in_loss": 0.7984712719917297, + "debug/sppo_rej_reward_in_loss": -3.3026015758514404, + "debug/sppo_reject_loss": 2292.23291015625, + "epoch": 4.782608695652174, + "grad_norm": 59809.747670291195, + "learning_rate": 3.775614312547174e-08, + "logits/chosen": 1.078034520149231, + "logits/rejected": 1.416656255722046, + "logps/chosen": -244.0189666748047, + "logps/rejected": -286.8953552246094, + "loss": 4636.6781, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.007984711788594723, + "rewards/margins": 0.04101072996854782, + "rewards/rejected": -0.033026017248630524, + "step": 1320 + }, + { + "debug/policy_chosen_logits": 1.1492488384246826, + "debug/policy_chosen_logps": -244.71347045898438, + "debug/policy_rejected_logits": 1.3557655811309814, + "debug/policy_rejected_logps": -301.49017333984375, + "debug/reference_chosen_logps": -247.78457641601562, + "debug/reference_rejected_logps": -296.8087463378906, + "debug/sppo_chosen_loss": 2217.271240234375, + "debug/sppo_chosen_reward_in_loss": 3.0710842609405518, + "debug/sppo_rej_reward_in_loss": -4.681424140930176, + "debug/sppo_reject_loss": 2150.398681640625, + "epoch": 4.800724637681159, + "grad_norm": 60184.51204865287, + "learning_rate": 3.739525014075178e-08, + "logits/chosen": 1.1492488384246826, + "logits/rejected": 1.3557655811309814, + "logps/chosen": -244.71347045898438, + "logps/rejected": -301.49017333984375, + "loss": 4390.1812, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": 0.030710840597748756, + "rewards/margins": 0.0775250792503357, + "rewards/rejected": -0.046814244240522385, + "step": 1325 + }, + { + "debug/policy_chosen_logits": 1.0216914415359497, + "debug/policy_chosen_logps": -258.01416015625, + "debug/policy_rejected_logits": 1.3483374118804932, + "debug/policy_rejected_logps": -332.45635986328125, + "debug/reference_chosen_logps": -260.9222717285156, + "debug/reference_rejected_logps": -326.95977783203125, + "debug/sppo_chosen_loss": 2236.34765625, + "debug/sppo_chosen_reward_in_loss": 2.9081203937530518, + "debug/sppo_rej_reward_in_loss": -5.496593475341797, + "debug/sppo_reject_loss": 2135.439453125, + "epoch": 4.818840579710145, + "grad_norm": 71535.59444523143, + "learning_rate": 3.7035057047602446e-08, + "logits/chosen": 1.0216914415359497, + "logits/rejected": 1.3483374118804932, + "logps/chosen": -258.01416015625, + "logps/rejected": -332.45635986328125, + "loss": 4501.743, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": 0.029081201180815697, + "rewards/margins": 0.08404713124036789, + "rewards/rejected": -0.05496593192219734, + "step": 1330 + }, + { + "debug/policy_chosen_logits": 1.256682276725769, + "debug/policy_chosen_logps": -243.0992431640625, + "debug/policy_rejected_logits": 1.596100091934204, + "debug/policy_rejected_logps": -297.1000061035156, + "debug/reference_chosen_logps": -245.0940399169922, + "debug/reference_rejected_logps": -291.6605529785156, + "debug/sppo_chosen_loss": 2355.398681640625, + "debug/sppo_chosen_reward_in_loss": 1.994797706604004, + "debug/sppo_rej_reward_in_loss": -5.43942928314209, + "debug/sppo_reject_loss": 2168.495361328125, + "epoch": 4.836956521739131, + "grad_norm": 95313.27853471276, + "learning_rate": 3.6675583846111964e-08, + "logits/chosen": 1.256682276725769, + "logits/rejected": 1.596100091934204, + "logps/chosen": -243.0992431640625, + "logps/rejected": -297.1000061035156, + "loss": 4547.6633, + "rewards/accuracies": 0.824999988079071, + "rewards/chosen": 0.019947977736592293, + "rewards/margins": 0.0743422657251358, + "rewards/rejected": -0.05439429357647896, + "step": 1335 + }, + { + "debug/policy_chosen_logits": 1.0697505474090576, + "debug/policy_chosen_logps": -223.1968231201172, + "debug/policy_rejected_logits": 1.5175020694732666, + "debug/policy_rejected_logps": -285.0724792480469, + "debug/reference_chosen_logps": -223.690185546875, + "debug/reference_rejected_logps": -279.407958984375, + "debug/sppo_chosen_loss": 2535.14306640625, + "debug/sppo_chosen_reward_in_loss": 0.4933549761772156, + "debug/sppo_rej_reward_in_loss": -5.664527416229248, + "debug/sppo_reject_loss": 2116.50048828125, + "epoch": 4.855072463768116, + "grad_norm": 74487.85756637715, + "learning_rate": 3.6316850496395855e-08, + "logits/chosen": 1.0697505474090576, + "logits/rejected": 1.5175020694732666, + "logps/chosen": -223.1968231201172, + "logps/rejected": -285.0724792480469, + "loss": 4669.2, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.004933550488203764, + "rewards/margins": 0.06157882139086723, + "rewards/rejected": -0.05664527416229248, + "step": 1340 + }, + { + "debug/policy_chosen_logits": 1.3100662231445312, + "debug/policy_chosen_logps": -266.4246520996094, + "debug/policy_rejected_logits": 1.6305770874023438, + "debug/policy_rejected_logps": -269.2158203125, + "debug/reference_chosen_logps": -267.3473815917969, + "debug/reference_rejected_logps": -263.4280700683594, + "debug/sppo_chosen_loss": 2454.28466796875, + "debug/sppo_chosen_reward_in_loss": 0.9227026104927063, + "debug/sppo_rej_reward_in_loss": -5.787759304046631, + "debug/sppo_reject_loss": 2104.50732421875, + "epoch": 4.8731884057971016, + "grad_norm": 70525.65485246507, + "learning_rate": 3.595887691748868e-08, + "logits/chosen": 1.3100662231445312, + "logits/rejected": 1.6305770874023438, + "logps/chosen": -266.4246520996094, + "logps/rejected": -269.2158203125, + "loss": 4629.4301, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": 0.0092270253226161, + "rewards/margins": 0.06710462272167206, + "rewards/rejected": -0.057877592742443085, + "step": 1345 + }, + { + "debug/policy_chosen_logits": 1.0884641408920288, + "debug/policy_chosen_logps": -234.8406524658203, + "debug/policy_rejected_logits": 1.447321891784668, + "debug/policy_rejected_logps": -278.3136291503906, + "debug/reference_chosen_logps": -235.99813842773438, + "debug/reference_rejected_logps": -273.8133239746094, + "debug/sppo_chosen_loss": 2452.72119140625, + "debug/sppo_chosen_reward_in_loss": 1.1574690341949463, + "debug/sppo_rej_reward_in_loss": -4.500250816345215, + "debug/sppo_reject_loss": 2180.615234375, + "epoch": 4.891304347826087, + "grad_norm": 62332.34024417311, + "learning_rate": 3.560168298623788e-08, + "logits/chosen": 1.0884641408920288, + "logits/rejected": 1.447321891784668, + "logps/chosen": -234.8406524658203, + "logps/rejected": -278.3136291503906, + "loss": 4510.0766, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": 0.011574688367545605, + "rewards/margins": 0.05657719820737839, + "rewards/rejected": -0.04500251263380051, + "step": 1350 + }, + { + "debug/policy_chosen_logits": 1.2118421792984009, + "debug/policy_chosen_logps": -256.170654296875, + "debug/policy_rejected_logits": 1.509093999862671, + "debug/policy_rejected_logps": -273.8734436035156, + "debug/reference_chosen_logps": -257.6533203125, + "debug/reference_rejected_logps": -271.18536376953125, + "debug/sppo_chosen_loss": 2406.82421875, + "debug/sppo_chosen_reward_in_loss": 1.4826520681381226, + "debug/sppo_rej_reward_in_loss": -2.688032865524292, + "debug/sppo_reject_loss": 2327.357177734375, + "epoch": 4.909420289855072, + "grad_norm": 96533.06499373812, + "learning_rate": 3.524528853620023e-08, + "logits/chosen": 1.2118421792984009, + "logits/rejected": 1.509093999862671, + "logps/chosen": -256.170654296875, + "logps/rejected": -273.8734436035156, + "loss": 4591.6375, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": 0.014826519414782524, + "rewards/margins": 0.04170685261487961, + "rewards/rejected": -0.026880327612161636, + "step": 1355 + }, + { + "debug/policy_chosen_logits": 1.014077067375183, + "debug/policy_chosen_logps": -245.6739044189453, + "debug/policy_rejected_logits": 1.540024995803833, + "debug/policy_rejected_logps": -306.22271728515625, + "debug/reference_chosen_logps": -247.7864227294922, + "debug/reference_rejected_logps": -300.8656921386719, + "debug/sppo_chosen_loss": 2335.248779296875, + "debug/sppo_chosen_reward_in_loss": 2.1125502586364746, + "debug/sppo_rej_reward_in_loss": -5.357022285461426, + "debug/sppo_reject_loss": 2192.939453125, + "epoch": 4.927536231884058, + "grad_norm": 70736.48053184348, + "learning_rate": 3.488971335654043e-08, + "logits/chosen": 1.014077067375183, + "logits/rejected": 1.540024995803833, + "logps/chosen": -245.6739044189453, + "logps/rejected": -306.22271728515625, + "loss": 4537.7945, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": 0.02112550288438797, + "rewards/margins": 0.07469572126865387, + "rewards/rejected": -0.0535702221095562, + "step": 1360 + }, + { + "debug/policy_chosen_logits": 1.2282394170761108, + "debug/policy_chosen_logps": -254.85342407226562, + "debug/policy_rejected_logits": 1.4015997648239136, + "debug/policy_rejected_logps": -283.4310607910156, + "debug/reference_chosen_logps": -257.22283935546875, + "debug/reference_rejected_logps": -277.9129943847656, + "debug/sppo_chosen_loss": 2287.615966796875, + "debug/sppo_chosen_reward_in_loss": 2.369426727294922, + "debug/sppo_rej_reward_in_loss": -5.518064975738525, + "debug/sppo_reject_loss": 2086.158203125, + "epoch": 4.945652173913043, + "grad_norm": 67815.87349916976, + "learning_rate": 3.453497719093242e-08, + "logits/chosen": 1.2282394170761108, + "logits/rejected": 1.4015997648239136, + "logps/chosen": -254.85342407226562, + "logps/rejected": -283.4310607910156, + "loss": 4530.5863, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": 0.023694265633821487, + "rewards/margins": 0.07887491583824158, + "rewards/rejected": -0.05518064647912979, + "step": 1365 + }, + { + "debug/policy_chosen_logits": 1.1096584796905518, + "debug/policy_chosen_logps": -254.56942749023438, + "debug/policy_rejected_logits": 1.5298644304275513, + "debug/policy_rejected_logps": -323.55755615234375, + "debug/reference_chosen_logps": -254.72119140625, + "debug/reference_rejected_logps": -315.86517333984375, + "debug/sppo_chosen_loss": 2589.263671875, + "debug/sppo_chosen_reward_in_loss": 0.15175572037696838, + "debug/sppo_rej_reward_in_loss": -7.692338466644287, + "debug/sppo_reject_loss": 1989.229248046875, + "epoch": 4.963768115942029, + "grad_norm": 129529.1765103212, + "learning_rate": 3.418109973646298e-08, + "logits/chosen": 1.1096584796905518, + "logits/rejected": 1.5298644304275513, + "logps/chosen": -254.56942749023438, + "logps/rejected": -323.55755615234375, + "loss": 4388.4941, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": 0.0015175581211224198, + "rewards/margins": 0.07844093441963196, + "rewards/rejected": -0.07692337036132812, + "step": 1370 + }, + { + "debug/policy_chosen_logits": 1.4372981786727905, + "debug/policy_chosen_logps": -259.3402099609375, + "debug/policy_rejected_logits": 1.646761178970337, + "debug/policy_rejected_logps": -302.2501525878906, + "debug/reference_chosen_logps": -260.85552978515625, + "debug/reference_rejected_logps": -294.53472900390625, + "debug/sppo_chosen_loss": 2396.297607421875, + "debug/sppo_chosen_reward_in_loss": 1.5153119564056396, + "debug/sppo_rej_reward_in_loss": -7.715400695800781, + "debug/sppo_reject_loss": 1990.2379150390625, + "epoch": 4.981884057971015, + "grad_norm": 80680.59194094632, + "learning_rate": 3.382810064253809e-08, + "logits/chosen": 1.4372981786727905, + "logits/rejected": 1.646761178970337, + "logps/chosen": -259.3402099609375, + "logps/rejected": -302.2501525878906, + "loss": 4553.3805, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": 0.015153119340538979, + "rewards/margins": 0.09230712056159973, + "rewards/rejected": -0.0771540030837059, + "step": 1375 + }, + { + "debug/policy_chosen_logits": 1.3966089487075806, + "debug/policy_chosen_logps": -272.23431396484375, + "debug/policy_rejected_logits": 1.5716099739074707, + "debug/policy_rejected_logps": -295.2232360839844, + "debug/reference_chosen_logps": -273.52972412109375, + "debug/reference_rejected_logps": -288.4667053222656, + "debug/sppo_chosen_loss": 2394.87255859375, + "debug/sppo_chosen_reward_in_loss": 1.2953789234161377, + "debug/sppo_rej_reward_in_loss": -6.756533622741699, + "debug/sppo_reject_loss": 2009.9847412109375, + "epoch": 5.0, + "grad_norm": 57177.16481952865, + "learning_rate": 3.3475999509791925e-08, + "logits/chosen": 1.3966089487075806, + "logits/rejected": 1.5716099739074707, + "logps/chosen": -272.23431396484375, + "logps/rejected": -295.2232360839844, + "loss": 4452.3562, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": 0.012953788042068481, + "rewards/margins": 0.08051912486553192, + "rewards/rejected": -0.06756533682346344, + "step": 1380 + }, + { + "debug/policy_chosen_logits": 1.0597548484802246, + "debug/policy_chosen_logps": -262.56280517578125, + "debug/policy_rejected_logits": 1.3429030179977417, + "debug/policy_rejected_logps": -312.5959167480469, + "debug/reference_chosen_logps": -262.3939514160156, + "debug/reference_rejected_logps": -307.7593078613281, + "debug/sppo_chosen_loss": 2606.427734375, + "debug/sppo_chosen_reward_in_loss": -0.16884784400463104, + "debug/sppo_rej_reward_in_loss": -4.836643218994141, + "debug/sppo_reject_loss": 2142.93603515625, + "epoch": 5.018115942028985, + "grad_norm": 66797.05817653751, + "learning_rate": 3.3124815888998345e-08, + "logits/chosen": 1.0597548484802246, + "logits/rejected": 1.3429030179977417, + "logps/chosen": -262.56280517578125, + "logps/rejected": -312.5959167480469, + "loss": 4653.1094, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.0016884788637980819, + "rewards/margins": 0.046677954494953156, + "rewards/rejected": -0.04836643114686012, + "step": 1385 + }, + { + "debug/policy_chosen_logits": 1.2682220935821533, + "debug/policy_chosen_logps": -259.74493408203125, + "debug/policy_rejected_logits": 1.6833469867706299, + "debug/policy_rejected_logps": -294.34771728515625, + "debug/reference_chosen_logps": -261.4613342285156, + "debug/reference_rejected_logps": -290.87646484375, + "debug/sppo_chosen_loss": 2359.308349609375, + "debug/sppo_chosen_reward_in_loss": 1.716357946395874, + "debug/sppo_rej_reward_in_loss": -3.471240282058716, + "debug/sppo_reject_loss": 2267.60791015625, + "epoch": 5.036231884057971, + "grad_norm": 73249.23655353184, + "learning_rate": 3.277456927998554e-08, + "logits/chosen": 1.2682220935821533, + "logits/rejected": 1.6833469867706299, + "logps/chosen": -259.74493408203125, + "logps/rejected": -294.34771728515625, + "loss": 4654.3492, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.017163580283522606, + "rewards/margins": 0.05187598615884781, + "rewards/rejected": -0.034712404012680054, + "step": 1390 + }, + { + "debug/policy_chosen_logits": 1.388127088546753, + "debug/policy_chosen_logps": -271.39593505859375, + "debug/policy_rejected_logits": 1.7892773151397705, + "debug/policy_rejected_logps": -321.88311767578125, + "debug/reference_chosen_logps": -273.29937744140625, + "debug/reference_rejected_logps": -315.52386474609375, + "debug/sppo_chosen_loss": 2344.13427734375, + "debug/sppo_chosen_reward_in_loss": 1.9034183025360107, + "debug/sppo_rej_reward_in_loss": -6.359226703643799, + "debug/sppo_reject_loss": 2038.1265869140625, + "epoch": 5.054347826086956, + "grad_norm": 56563.963600837385, + "learning_rate": 3.2425279130553076e-08, + "logits/chosen": 1.388127088546753, + "logits/rejected": 1.7892773151397705, + "logps/chosen": -271.39593505859375, + "logps/rejected": -321.88311767578125, + "loss": 4476.5383, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": 0.019034182652831078, + "rewards/margins": 0.08262644708156586, + "rewards/rejected": -0.06359227001667023, + "step": 1395 + }, + { + "debug/policy_chosen_logits": 0.7481376528739929, + "debug/policy_chosen_logps": -235.9759521484375, + "debug/policy_rejected_logits": 1.258954405784607, + "debug/policy_rejected_logps": -309.2740173339844, + "debug/reference_chosen_logps": -236.7089080810547, + "debug/reference_rejected_logps": -302.9029235839844, + "debug/sppo_chosen_loss": 2485.295166015625, + "debug/sppo_chosen_reward_in_loss": 0.7329736948013306, + "debug/sppo_rej_reward_in_loss": -6.371078968048096, + "debug/sppo_reject_loss": 2072.038330078125, + "epoch": 5.072463768115942, + "grad_norm": 64677.330488287786, + "learning_rate": 3.2076964835392185e-08, + "logits/chosen": 0.7481376528739929, + "logits/rejected": 1.258954405784607, + "logps/chosen": -235.9759521484375, + "logps/rejected": -309.2740173339844, + "loss": 4551.7766, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": 0.007329737301915884, + "rewards/margins": 0.07104052603244781, + "rewards/rejected": -0.06371079385280609, + "step": 1400 + }, + { + "epoch": 5.072463768115942, + "eval_debug/policy_chosen_logits": 1.4144388437271118, + "eval_debug/policy_chosen_logps": -252.45864868164062, + "eval_debug/policy_rejected_logits": 1.4595340490341187, + "eval_debug/policy_rejected_logps": -263.1626892089844, + "eval_debug/reference_chosen_logps": -252.91845703125, + "eval_debug/reference_rejected_logps": -259.6585998535156, + "eval_debug/sppo_chosen_loss": 2524.455322265625, + "eval_debug/sppo_chosen_reward_in_loss": 0.45980995893478394, + "eval_debug/sppo_rej_reward_in_loss": -3.5040810108184814, + "eval_debug/sppo_reject_loss": 2311.046630859375, + "eval_logits/chosen": 1.4144388437271118, + "eval_logits/rejected": 1.4595340490341187, + "eval_logps/chosen": -252.45864868164062, + "eval_logps/rejected": -263.1626892089844, + "eval_loss": 4636.1591796875, + "eval_rewards/accuracies": 0.6052631735801697, + "eval_rewards/chosen": 0.004598099738359451, + "eval_rewards/margins": 0.039638906717300415, + "eval_rewards/rejected": -0.035040806978940964, + "eval_runtime": 28.4226, + "eval_samples_per_second": 21.11, + "eval_steps_per_second": 0.668, + "step": 1400 + }, + { + "debug/policy_chosen_logits": 1.073176383972168, + "debug/policy_chosen_logps": -253.0771484375, + "debug/policy_rejected_logits": 1.1446640491485596, + "debug/policy_rejected_logps": -265.46929931640625, + "debug/reference_chosen_logps": -255.2992401123047, + "debug/reference_rejected_logps": -260.98126220703125, + "debug/sppo_chosen_loss": 2313.059814453125, + "debug/sppo_chosen_reward_in_loss": 2.222093105316162, + "debug/sppo_rej_reward_in_loss": -4.48803186416626, + "debug/sppo_reject_loss": 2214.115966796875, + "epoch": 5.090579710144928, + "grad_norm": 57291.336214715986, + "learning_rate": 3.1729645735008747e-08, + "logits/chosen": 1.073176383972168, + "logits/rejected": 1.1446640491485596, + "logps/chosen": -253.0771484375, + "logps/rejected": -265.46929931640625, + "loss": 4430.0254, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": 0.022220930084586143, + "rewards/margins": 0.06710124760866165, + "rewards/rejected": -0.04488031938672066, + "step": 1405 + }, + { + "debug/policy_chosen_logits": 1.2134159803390503, + "debug/policy_chosen_logps": -260.3841552734375, + "debug/policy_rejected_logits": 1.4837480783462524, + "debug/policy_rejected_logps": -291.3798828125, + "debug/reference_chosen_logps": -260.37481689453125, + "debug/reference_rejected_logps": -285.4285888671875, + "debug/sppo_chosen_loss": 2577.62939453125, + "debug/sppo_chosen_reward_in_loss": -0.009347915649414062, + "debug/sppo_rej_reward_in_loss": -5.951307773590088, + "debug/sppo_reject_loss": 2098.92529296875, + "epoch": 5.108695652173913, + "grad_norm": 77453.72944389304, + "learning_rate": 3.1383341114649466e-08, + "logits/chosen": 1.2134159803390503, + "logits/rejected": 1.4837480783462524, + "logps/chosen": -260.3841552734375, + "logps/rejected": -291.3798828125, + "loss": 4625.1777, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -9.347964078187943e-05, + "rewards/margins": 0.05941959470510483, + "rewards/rejected": -0.05951308086514473, + "step": 1410 + }, + { + "debug/policy_chosen_logits": 1.3050518035888672, + "debug/policy_chosen_logps": -271.77227783203125, + "debug/policy_rejected_logits": 1.3268954753875732, + "debug/policy_rejected_logps": -281.19415283203125, + "debug/reference_chosen_logps": -272.76300048828125, + "debug/reference_rejected_logps": -275.5526428222656, + "debug/sppo_chosen_loss": 2437.26220703125, + "debug/sppo_chosen_reward_in_loss": 0.9907159805297852, + "debug/sppo_rej_reward_in_loss": -5.641491889953613, + "debug/sppo_reject_loss": 2117.996337890625, + "epoch": 5.1268115942028984, + "grad_norm": 62995.000833422404, + "learning_rate": 3.103807020323103e-08, + "logits/chosen": 1.3050518035888672, + "logits/rejected": 1.3268954753875732, + "logps/chosen": -271.77227783203125, + "logps/rejected": -281.19415283203125, + "loss": 4568.5047, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": 0.009907159022986889, + "rewards/margins": 0.06632207334041595, + "rewards/rejected": -0.05641491338610649, + "step": 1415 + }, + { + "debug/policy_chosen_logits": 1.2935222387313843, + "debug/policy_chosen_logps": -257.31121826171875, + "debug/policy_rejected_logits": 1.5443761348724365, + "debug/policy_rejected_logps": -304.0801696777344, + "debug/reference_chosen_logps": -260.3376159667969, + "debug/reference_rejected_logps": -298.91290283203125, + "debug/sppo_chosen_loss": 2233.71142578125, + "debug/sppo_chosen_reward_in_loss": 3.026392698287964, + "debug/sppo_rej_reward_in_loss": -5.167298316955566, + "debug/sppo_reject_loss": 2137.61669921875, + "epoch": 5.144927536231884, + "grad_norm": 76168.55447801237, + "learning_rate": 3.0693852172272336e-08, + "logits/chosen": 1.2935222387313843, + "logits/rejected": 1.5443761348724365, + "logps/chosen": -257.31121826171875, + "logps/rejected": -304.0801696777344, + "loss": 4492.1176, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.030263924971222878, + "rewards/margins": 0.08193691074848175, + "rewards/rejected": -0.051672983914613724, + "step": 1420 + }, + { + "debug/policy_chosen_logits": 1.0238162279129028, + "debug/policy_chosen_logps": -281.5020751953125, + "debug/policy_rejected_logits": 1.1858758926391602, + "debug/policy_rejected_logps": -284.3791198730469, + "debug/reference_chosen_logps": -281.7276306152344, + "debug/reference_rejected_logps": -278.9505615234375, + "debug/sppo_chosen_loss": 2551.8203125, + "debug/sppo_chosen_reward_in_loss": 0.22554931044578552, + "debug/sppo_rej_reward_in_loss": -5.4285502433776855, + "debug/sppo_reject_loss": 2039.286865234375, + "epoch": 5.163043478260869, + "grad_norm": 90342.22934886666, + "learning_rate": 3.035070613483009e-08, + "logits/chosen": 1.0238162279129028, + "logits/rejected": 1.1858758926391602, + "logps/chosen": -281.5020751953125, + "logps/rejected": -284.3791198730469, + "loss": 4486.2891, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": 0.0022554919123649597, + "rewards/margins": 0.056540995836257935, + "rewards/rejected": -0.054285503923892975, + "step": 1425 + }, + { + "debug/policy_chosen_logits": 1.0165376663208008, + "debug/policy_chosen_logps": -226.81124877929688, + "debug/policy_rejected_logits": 1.4223605394363403, + "debug/policy_rejected_logps": -333.42999267578125, + "debug/reference_chosen_logps": -230.421142578125, + "debug/reference_rejected_logps": -324.8152160644531, + "debug/sppo_chosen_loss": 2202.34912109375, + "debug/sppo_chosen_reward_in_loss": 3.6098670959472656, + "debug/sppo_rej_reward_in_loss": -8.614764213562012, + "debug/sppo_reject_loss": 1888.661376953125, + "epoch": 5.181159420289855, + "grad_norm": 61910.098949095074, + "learning_rate": 3.0008651144437394e-08, + "logits/chosen": 1.0165376663208008, + "logits/rejected": 1.4223605394363403, + "logps/chosen": -226.81124877929688, + "logps/rejected": -333.42999267578125, + "loss": 4456.3543, + "rewards/accuracies": 0.824999988079071, + "rewards/chosen": 0.0360986702144146, + "rewards/margins": 0.12224630266427994, + "rewards/rejected": -0.08614763617515564, + "step": 1430 + }, + { + "debug/policy_chosen_logits": 0.6908336877822876, + "debug/policy_chosen_logps": -236.1743621826172, + "debug/policy_rejected_logits": 1.1342148780822754, + "debug/policy_rejected_logps": -267.88836669921875, + "debug/reference_chosen_logps": -237.2069549560547, + "debug/reference_rejected_logps": -264.7642517089844, + "debug/sppo_chosen_loss": 2472.70166015625, + "debug/sppo_chosen_reward_in_loss": 1.0326130390167236, + "debug/sppo_rej_reward_in_loss": -3.124145746231079, + "debug/sppo_reject_loss": 2303.392822265625, + "epoch": 5.199275362318841, + "grad_norm": 70461.5400215946, + "learning_rate": 2.9667706194045895e-08, + "logits/chosen": 0.6908336877822876, + "logits/rejected": 1.1342148780822754, + "logps/chosen": -236.1743621826172, + "logps/rejected": -267.88836669921875, + "loss": 4527.0414, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": 0.01032613031566143, + "rewards/margins": 0.04156758636236191, + "rewards/rejected": -0.031241456046700478, + "step": 1435 + }, + { + "debug/policy_chosen_logits": 1.1813528537750244, + "debug/policy_chosen_logps": -278.6885681152344, + "debug/policy_rejected_logits": 1.2910782098770142, + "debug/policy_rejected_logps": -300.384521484375, + "debug/reference_chosen_logps": -281.6507873535156, + "debug/reference_rejected_logps": -296.9029235839844, + "debug/sppo_chosen_loss": 2274.62646484375, + "debug/sppo_chosen_reward_in_loss": 2.9621920585632324, + "debug/sppo_rej_reward_in_loss": -3.4816219806671143, + "debug/sppo_reject_loss": 2300.47705078125, + "epoch": 5.217391304347826, + "grad_norm": 158076.63899714782, + "learning_rate": 2.932789021497113e-08, + "logits/chosen": 1.1813528537750244, + "logits/rejected": 1.2910782098770142, + "logps/chosen": -278.6885681152344, + "logps/rejected": -300.384521484375, + "loss": 4592.1902, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": 0.029621923342347145, + "rewards/margins": 0.06443814188241959, + "rewards/rejected": -0.03481621667742729, + "step": 1440 + }, + { + "debug/policy_chosen_logits": 0.945541262626648, + "debug/policy_chosen_logps": -226.1212615966797, + "debug/policy_rejected_logits": 1.6461076736450195, + "debug/policy_rejected_logps": -293.69866943359375, + "debug/reference_chosen_logps": -230.6846466064453, + "debug/reference_rejected_logps": -292.0801086425781, + "debug/sppo_chosen_loss": 2121.76220703125, + "debug/sppo_chosen_reward_in_loss": 4.563372611999512, + "debug/sppo_rej_reward_in_loss": -1.6185725927352905, + "debug/sppo_reject_loss": 2431.987548828125, + "epoch": 5.2355072463768115, + "grad_norm": 90625.44811809929, + "learning_rate": 2.898922207584133e-08, + "logits/chosen": 0.945541262626648, + "logits/rejected": 1.6461076736450195, + "logps/chosen": -226.1212615966797, + "logps/rejected": -293.69866943359375, + "loss": 4473.3125, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": 0.04563372582197189, + "rewards/margins": 0.06181945651769638, + "rewards/rejected": -0.01618572697043419, + "step": 1445 + }, + { + "debug/policy_chosen_logits": 1.2806552648544312, + "debug/policy_chosen_logps": -251.5911102294922, + "debug/policy_rejected_logits": 1.614915132522583, + "debug/policy_rejected_logps": -315.8453674316406, + "debug/reference_chosen_logps": -254.13671875, + "debug/reference_rejected_logps": -314.6860656738281, + "debug/sppo_chosen_loss": 2278.430419921875, + "debug/sppo_chosen_reward_in_loss": 2.5456225872039795, + "debug/sppo_rej_reward_in_loss": -1.1593204736709595, + "debug/sppo_reject_loss": 2467.211181640625, + "epoch": 5.253623188405797, + "grad_norm": 62155.8076423089, + "learning_rate": 2.8651720581549797e-08, + "logits/chosen": 1.2806552648544312, + "logits/rejected": 1.614915132522583, + "logps/chosen": -251.5911102294922, + "logps/rejected": -315.8453674316406, + "loss": 4487.5734, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.025456225499510765, + "rewards/margins": 0.03704943135380745, + "rewards/rejected": -0.01159320492297411, + "step": 1450 + }, + { + "debug/policy_chosen_logits": 0.8935205340385437, + "debug/policy_chosen_logps": -230.4705047607422, + "debug/policy_rejected_logits": 1.229943871498108, + "debug/policy_rejected_logps": -296.24456787109375, + "debug/reference_chosen_logps": -233.8374481201172, + "debug/reference_rejected_logps": -290.72314453125, + "debug/sppo_chosen_loss": 2209.38916015625, + "debug/sppo_chosen_reward_in_loss": 3.366943359375, + "debug/sppo_rej_reward_in_loss": -5.5214385986328125, + "debug/sppo_reject_loss": 2192.26123046875, + "epoch": 5.271739130434782, + "grad_norm": 80665.85813537793, + "learning_rate": 2.8315404472210646e-08, + "logits/chosen": 0.8935205340385437, + "logits/rejected": 1.229943871498108, + "logps/chosen": -230.4705047607422, + "logps/rejected": -296.24456787109375, + "loss": 4589.9242, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": 0.03366943448781967, + "rewards/margins": 0.08888381719589233, + "rewards/rejected": -0.05521438643336296, + "step": 1455 + }, + { + "debug/policy_chosen_logits": 0.9543207287788391, + "debug/policy_chosen_logps": -228.46011352539062, + "debug/policy_rejected_logits": 1.2147928476333618, + "debug/policy_rejected_logps": -283.1182556152344, + "debug/reference_chosen_logps": -232.3076629638672, + "debug/reference_rejected_logps": -277.7240295410156, + "debug/sppo_chosen_loss": 2165.05224609375, + "debug/sppo_chosen_reward_in_loss": 3.8475775718688965, + "debug/sppo_rej_reward_in_loss": -5.394213676452637, + "debug/sppo_reject_loss": 2134.77587890625, + "epoch": 5.2898550724637685, + "grad_norm": 75556.9653345357, + "learning_rate": 2.798029242211828e-08, + "logits/chosen": 0.9543207287788391, + "logits/rejected": 1.2147928476333618, + "logps/chosen": -228.46011352539062, + "logps/rejected": -283.1182556152344, + "loss": 4569.8641, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": 0.038475774228572845, + "rewards/margins": 0.09241791069507599, + "rewards/rejected": -0.053942132741212845, + "step": 1460 + }, + { + "debug/policy_chosen_logits": 1.2613698244094849, + "debug/policy_chosen_logps": -245.041015625, + "debug/policy_rejected_logits": 1.4938691854476929, + "debug/policy_rejected_logps": -269.397705078125, + "debug/reference_chosen_logps": -248.71542358398438, + "debug/reference_rejected_logps": -269.37152099609375, + "debug/sppo_chosen_loss": 2186.452392578125, + "debug/sppo_chosen_reward_in_loss": 3.674414873123169, + "debug/sppo_rej_reward_in_loss": -0.02619953081011772, + "debug/sppo_reject_loss": 2560.8427734375, + "epoch": 5.307971014492754, + "grad_norm": 113836.35337531123, + "learning_rate": 2.7646403038710535e-08, + "logits/chosen": 1.2613698244094849, + "logits/rejected": 1.4938691854476929, + "logps/chosen": -245.041015625, + "logps/rejected": -269.397705078125, + "loss": 4434.6, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": 0.036744147539138794, + "rewards/margins": 0.037006136029958725, + "rewards/rejected": -0.0002619953884277493, + "step": 1465 + }, + { + "debug/policy_chosen_logits": 1.011094093322754, + "debug/policy_chosen_logps": -223.7213897705078, + "debug/policy_rejected_logits": 1.8676655292510986, + "debug/policy_rejected_logps": -326.70562744140625, + "debug/reference_chosen_logps": -227.2796173095703, + "debug/reference_rejected_logps": -324.4100036621094, + "debug/sppo_chosen_loss": 2167.9365234375, + "debug/sppo_chosen_reward_in_loss": 3.558199644088745, + "debug/sppo_rej_reward_in_loss": -2.2955880165100098, + "debug/sppo_reject_loss": 2348.977783203125, + "epoch": 5.326086956521739, + "grad_norm": 74309.78767829157, + "learning_rate": 2.73137548615354e-08, + "logits/chosen": 1.011094093322754, + "logits/rejected": 1.8676655292510986, + "logps/chosen": -223.7213897705078, + "logps/rejected": -326.70562744140625, + "loss": 4516.7555, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": 0.03558199480175972, + "rewards/margins": 0.05853787809610367, + "rewards/rejected": -0.02295587956905365, + "step": 1470 + }, + { + "debug/policy_chosen_logits": 1.1648266315460205, + "debug/policy_chosen_logps": -263.5667419433594, + "debug/policy_rejected_logits": 1.4224785566329956, + "debug/policy_rejected_logps": -294.29736328125, + "debug/reference_chosen_logps": -265.66778564453125, + "debug/reference_rejected_logps": -290.55096435546875, + "debug/sppo_chosen_loss": 2339.315185546875, + "debug/sppo_chosen_reward_in_loss": 2.1010348796844482, + "debug/sppo_rej_reward_in_loss": -3.7463951110839844, + "debug/sppo_reject_loss": 2271.000732421875, + "epoch": 5.344202898550725, + "grad_norm": 135696.9586156644, + "learning_rate": 2.6982366361221608e-08, + "logits/chosen": 1.1648266315460205, + "logits/rejected": 1.4224785566329956, + "logps/chosen": -263.5667419433594, + "logps/rejected": -294.29736328125, + "loss": 4515.3344, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": 0.021010348573327065, + "rewards/margins": 0.05847429484128952, + "rewards/rejected": -0.0374639518558979, + "step": 1475 + }, + { + "debug/policy_chosen_logits": 1.3069803714752197, + "debug/policy_chosen_logps": -282.95233154296875, + "debug/policy_rejected_logits": 1.629817247390747, + "debug/policy_rejected_logps": -290.300048828125, + "debug/reference_chosen_logps": -286.08929443359375, + "debug/reference_rejected_logps": -290.58465576171875, + "debug/sppo_chosen_loss": 2222.154052734375, + "debug/sppo_chosen_reward_in_loss": 3.136929988861084, + "debug/sppo_rej_reward_in_loss": 0.2846008241176605, + "debug/sppo_reject_loss": 2600.535400390625, + "epoch": 5.36231884057971, + "grad_norm": 92393.44114528071, + "learning_rate": 2.6652255938453066e-08, + "logits/chosen": 1.3069803714752197, + "logits/rejected": 1.629817247390747, + "logps/chosen": -282.95233154296875, + "logps/rejected": -290.300048828125, + "loss": 4634.9187, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.03136930242180824, + "rewards/margins": 0.028523290529847145, + "rewards/rejected": 0.002846005605533719, + "step": 1480 + }, + { + "debug/policy_chosen_logits": 1.0232958793640137, + "debug/policy_chosen_logps": -254.8156280517578, + "debug/policy_rejected_logits": 1.1778652667999268, + "debug/policy_rejected_logps": -276.6258239746094, + "debug/reference_chosen_logps": -258.20574951171875, + "debug/reference_rejected_logps": -270.19232177734375, + "debug/sppo_chosen_loss": 2210.439453125, + "debug/sppo_chosen_reward_in_loss": 3.3901278972625732, + "debug/sppo_rej_reward_in_loss": -6.4334917068481445, + "debug/sppo_reject_loss": 1996.9488525390625, + "epoch": 5.380434782608695, + "grad_norm": 66567.60725853742, + "learning_rate": 2.6323441922947165e-08, + "logits/chosen": 1.0232958793640137, + "logits/rejected": 1.1778652667999268, + "logps/chosen": -254.8156280517578, + "logps/rejected": -276.6258239746094, + "loss": 4421.8281, + "rewards/accuracies": 0.824999988079071, + "rewards/chosen": 0.03390127792954445, + "rewards/margins": 0.09823620319366455, + "rewards/rejected": -0.0643349215388298, + "step": 1485 + }, + { + "debug/policy_chosen_logits": 1.2483222484588623, + "debug/policy_chosen_logps": -242.7491455078125, + "debug/policy_rejected_logits": 1.4808881282806396, + "debug/policy_rejected_logps": -260.0889587402344, + "debug/reference_chosen_logps": -246.10781860351562, + "debug/reference_rejected_logps": -257.28558349609375, + "debug/sppo_chosen_loss": 2200.03564453125, + "debug/sppo_chosen_reward_in_loss": 3.3586738109588623, + "debug/sppo_rej_reward_in_loss": -2.803394079208374, + "debug/sppo_reject_loss": 2330.042724609375, + "epoch": 5.398550724637682, + "grad_norm": 74480.98281722279, + "learning_rate": 2.599594257243689e-08, + "logits/chosen": 1.2483222484588623, + "logits/rejected": 1.4808881282806396, + "logps/chosen": -242.7491455078125, + "logps/rejected": -260.0889587402344, + "loss": 4575.8328, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": 0.033586740493774414, + "rewards/margins": 0.06162068247795105, + "rewards/rejected": -0.028033941984176636, + "step": 1490 + }, + { + "debug/policy_chosen_logits": 1.2368654012680054, + "debug/policy_chosen_logps": -254.530517578125, + "debug/policy_rejected_logits": 1.3546712398529053, + "debug/policy_rejected_logps": -276.13037109375, + "debug/reference_chosen_logps": -257.09320068359375, + "debug/reference_rejected_logps": -271.80712890625, + "debug/sppo_chosen_loss": 2288.206298828125, + "debug/sppo_chosen_reward_in_loss": 2.5626778602600098, + "debug/sppo_rej_reward_in_loss": -4.323225975036621, + "debug/sppo_reject_loss": 2213.88671875, + "epoch": 5.416666666666667, + "grad_norm": 109713.85571536788, + "learning_rate": 2.566977607165719e-08, + "logits/chosen": 1.2368654012680054, + "logits/rejected": 1.3546712398529053, + "logps/chosen": -254.530517578125, + "logps/rejected": -276.13037109375, + "loss": 4559.675, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.025626778602600098, + "rewards/margins": 0.0688590407371521, + "rewards/rejected": -0.043232254683971405, + "step": 1495 + }, + { + "debug/policy_chosen_logits": 0.9920104742050171, + "debug/policy_chosen_logps": -228.82107543945312, + "debug/policy_rejected_logits": 1.250663161277771, + "debug/policy_rejected_logps": -247.8716278076172, + "debug/reference_chosen_logps": -232.8604278564453, + "debug/reference_rejected_logps": -243.4485321044922, + "debug/sppo_chosen_loss": 2136.54931640625, + "debug/sppo_chosen_reward_in_loss": 4.039345741271973, + "debug/sppo_rej_reward_in_loss": -4.423121452331543, + "debug/sppo_reject_loss": 2212.809326171875, + "epoch": 5.434782608695652, + "grad_norm": 56495.066508312746, + "learning_rate": 2.5344960531335102e-08, + "logits/chosen": 0.9920104742050171, + "logits/rejected": 1.250663161277771, + "logps/chosen": -228.82107543945312, + "logps/rejected": -247.8716278076172, + "loss": 4481.4781, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": 0.04039346054196358, + "rewards/margins": 0.08462467044591904, + "rewards/rejected": -0.04423121362924576, + "step": 1500 + }, + { + "epoch": 5.434782608695652, + "eval_debug/policy_chosen_logits": 1.402949333190918, + "eval_debug/policy_chosen_logps": -251.6733856201172, + "eval_debug/policy_rejected_logits": 1.4467878341674805, + "eval_debug/policy_rejected_logps": -262.5466613769531, + "eval_debug/reference_chosen_logps": -252.91845703125, + "eval_debug/reference_rejected_logps": -259.6585998535156, + "eval_debug/sppo_chosen_loss": 2446.67919921875, + "eval_debug/sppo_chosen_reward_in_loss": 1.2450839281082153, + "eval_debug/sppo_rej_reward_in_loss": -2.888056755065918, + "eval_debug/sppo_reject_loss": 2368.621826171875, + "eval_logits/chosen": 1.402949333190918, + "eval_logits/rejected": 1.4467878341674805, + "eval_logps/chosen": -251.6733856201172, + "eval_logps/rejected": -262.5466613769531, + "eval_loss": 4616.7265625, + "eval_rewards/accuracies": 0.5921052694320679, + "eval_rewards/chosen": 0.012450839392840862, + "eval_rewards/margins": 0.04133140668272972, + "eval_rewards/rejected": -0.028880568221211433, + "eval_runtime": 28.3883, + "eval_samples_per_second": 21.135, + "eval_steps_per_second": 0.669, + "step": 1500 + }, + { + "debug/policy_chosen_logits": 1.1282310485839844, + "debug/policy_chosen_logps": -251.6337890625, + "debug/policy_rejected_logits": 1.3733762502670288, + "debug/policy_rejected_logps": -292.90301513671875, + "debug/reference_chosen_logps": -252.951171875, + "debug/reference_rejected_logps": -290.30047607421875, + "debug/sppo_chosen_loss": 2449.819091796875, + "debug/sppo_chosen_reward_in_loss": 1.3173834085464478, + "debug/sppo_rej_reward_in_loss": -2.6024844646453857, + "debug/sppo_reject_loss": 2344.947998046875, + "epoch": 5.452898550724638, + "grad_norm": 71162.56835391422, + "learning_rate": 2.5021513987184274e-08, + "logits/chosen": 1.1282310485839844, + "logits/rejected": 1.3733762502670288, + "logps/chosen": -251.6337890625, + "logps/rejected": -292.90301513671875, + "loss": 4520.8465, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": 0.01317383348941803, + "rewards/margins": 0.039198677986860275, + "rewards/rejected": -0.026024844497442245, + "step": 1505 + }, + { + "debug/policy_chosen_logits": 1.0822408199310303, + "debug/policy_chosen_logps": -248.52548217773438, + "debug/policy_rejected_logits": 1.5081207752227783, + "debug/policy_rejected_logps": -279.8587341308594, + "debug/reference_chosen_logps": -251.04904174804688, + "debug/reference_rejected_logps": -274.56640625, + "debug/sppo_chosen_loss": 2292.804443359375, + "debug/sppo_chosen_reward_in_loss": 2.523580551147461, + "debug/sppo_rej_reward_in_loss": -5.292346000671387, + "debug/sppo_reject_loss": 2132.537109375, + "epoch": 5.471014492753623, + "grad_norm": 79190.04350242131, + "learning_rate": 2.469945439890339e-08, + "logits/chosen": 1.0822408199310303, + "logits/rejected": 1.5081207752227783, + "logps/chosen": -248.52548217773438, + "logps/rejected": -279.8587341308594, + "loss": 4436.1336, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": 0.02523580566048622, + "rewards/margins": 0.07815925776958466, + "rewards/rejected": -0.05292346328496933, + "step": 1510 + }, + { + "debug/policy_chosen_logits": 1.0114610195159912, + "debug/policy_chosen_logps": -219.73385620117188, + "debug/policy_rejected_logits": 1.4271812438964844, + "debug/policy_rejected_logps": -282.7474060058594, + "debug/reference_chosen_logps": -222.33889770507812, + "debug/reference_rejected_logps": -275.36151123046875, + "debug/sppo_chosen_loss": 2299.99951171875, + "debug/sppo_chosen_reward_in_loss": 2.6050164699554443, + "debug/sppo_rej_reward_in_loss": -7.385918617248535, + "debug/sppo_reject_loss": 2018.4224853515625, + "epoch": 5.489130434782608, + "grad_norm": 77773.38202681381, + "learning_rate": 2.4378799649179023e-08, + "logits/chosen": 1.0114610195159912, + "logits/rejected": 1.4271812438964844, + "logps/chosen": -219.73385620117188, + "logps/rejected": -282.7474060058594, + "loss": 4445.4719, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": 0.026050161570310593, + "rewards/margins": 0.09990935027599335, + "rewards/rejected": -0.07385918498039246, + "step": 1515 + }, + { + "debug/policy_chosen_logits": 1.517974853515625, + "debug/policy_chosen_logps": -278.83526611328125, + "debug/policy_rejected_logits": 1.8126938343048096, + "debug/policy_rejected_logps": -344.2177734375, + "debug/reference_chosen_logps": -279.59576416015625, + "debug/reference_rejected_logps": -340.62744140625, + "debug/sppo_chosen_loss": 2477.626220703125, + "debug/sppo_chosen_reward_in_loss": 0.7605171203613281, + "debug/sppo_rej_reward_in_loss": -3.5903029441833496, + "debug/sppo_reject_loss": 2266.54638671875, + "epoch": 5.507246376811594, + "grad_norm": 88506.91102373891, + "learning_rate": 2.4059567542692682e-08, + "logits/chosen": 1.517974853515625, + "logits/rejected": 1.8126938343048096, + "logps/chosen": -278.83526611328125, + "logps/rejected": -344.2177734375, + "loss": 4525.4527, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.007605170365422964, + "rewards/margins": 0.04350820183753967, + "rewards/rejected": -0.03590302914381027, + "step": 1520 + }, + { + "debug/policy_chosen_logits": 1.2230756282806396, + "debug/policy_chosen_logps": -253.2280731201172, + "debug/policy_rejected_logits": 1.4549987316131592, + "debug/policy_rejected_logps": -291.5811462402344, + "debug/reference_chosen_logps": -255.423095703125, + "debug/reference_rejected_logps": -287.0794372558594, + "debug/sppo_chosen_loss": 2331.87890625, + "debug/sppo_chosen_reward_in_loss": 2.194990634918213, + "debug/sppo_rej_reward_in_loss": -4.501686096191406, + "debug/sppo_reject_loss": 2184.75439453125, + "epoch": 5.52536231884058, + "grad_norm": 82290.86470328699, + "learning_rate": 2.3741775805132096e-08, + "logits/chosen": 1.2230756282806396, + "logits/rejected": 1.4549987316131592, + "logps/chosen": -253.2280731201172, + "logps/rejected": -291.5811462402344, + "loss": 4532.6773, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.021949905902147293, + "rewards/margins": 0.06696675717830658, + "rewards/rejected": -0.04501685872673988, + "step": 1525 + }, + { + "debug/policy_chosen_logits": 0.7856322526931763, + "debug/policy_chosen_logps": -267.90460205078125, + "debug/policy_rejected_logits": 0.8814099431037903, + "debug/policy_rejected_logps": -275.64141845703125, + "debug/reference_chosen_logps": -268.8172302246094, + "debug/reference_rejected_logps": -273.11907958984375, + "debug/sppo_chosen_loss": 2476.67333984375, + "debug/sppo_chosen_reward_in_loss": 0.9126449823379517, + "debug/sppo_rej_reward_in_loss": -2.522326707839966, + "debug/sppo_reject_loss": 2360.42529296875, + "epoch": 5.543478260869565, + "grad_norm": 64277.589583882305, + "learning_rate": 2.342544208220712e-08, + "logits/chosen": 0.7856322526931763, + "logits/rejected": 0.8814099431037903, + "logps/chosen": -267.90460205078125, + "logps/rejected": -275.64141845703125, + "loss": 4584.1656, + "rewards/accuracies": 0.550000011920929, + "rewards/chosen": 0.009126449935138226, + "rewards/margins": 0.0343497171998024, + "rewards/rejected": -0.0252232663333416, + "step": 1530 + }, + { + "debug/policy_chosen_logits": 0.983709454536438, + "debug/policy_chosen_logps": -250.71945190429688, + "debug/policy_rejected_logits": 1.3374192714691162, + "debug/policy_rejected_logps": -282.01995849609375, + "debug/reference_chosen_logps": -253.4926300048828, + "debug/reference_rejected_logps": -276.5571594238281, + "debug/sppo_chosen_loss": 2281.95556640625, + "debug/sppo_chosen_reward_in_loss": 2.7731635570526123, + "debug/sppo_rej_reward_in_loss": -5.462827682495117, + "debug/sppo_reject_loss": 2117.26953125, + "epoch": 5.561594202898551, + "grad_norm": 98220.17839286085, + "learning_rate": 2.311058393866981e-08, + "logits/chosen": 0.983709454536438, + "logits/rejected": 1.3374192714691162, + "logps/chosen": -250.71945190429688, + "logps/rejected": -282.01995849609375, + "loss": 4618.0863, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.0277316365391016, + "rewards/margins": 0.0823599100112915, + "rewards/rejected": -0.05462827533483505, + "step": 1535 + }, + { + "debug/policy_chosen_logits": 0.9909914135932922, + "debug/policy_chosen_logps": -228.23965454101562, + "debug/policy_rejected_logits": 1.5274099111557007, + "debug/policy_rejected_logps": -296.5209655761719, + "debug/reference_chosen_logps": -230.16903686523438, + "debug/reference_rejected_logps": -288.3876037597656, + "debug/sppo_chosen_loss": 2336.477783203125, + "debug/sppo_chosen_reward_in_loss": 1.9293806552886963, + "debug/sppo_rej_reward_in_loss": -8.133366584777832, + "debug/sppo_reject_loss": 1942.154296875, + "epoch": 5.579710144927536, + "grad_norm": 67134.8106267087, + "learning_rate": 2.2797218857339163e-08, + "logits/chosen": 0.9909914135932922, + "logits/rejected": 1.5274099111557007, + "logps/chosen": -228.23965454101562, + "logps/rejected": -296.5209655761719, + "loss": 4509.9066, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": 0.019293805584311485, + "rewards/margins": 0.10062746703624725, + "rewards/rejected": -0.08133365958929062, + "step": 1540 + }, + { + "debug/policy_chosen_logits": 1.036051869392395, + "debug/policy_chosen_logps": -239.81472778320312, + "debug/policy_rejected_logits": 1.2976715564727783, + "debug/policy_rejected_logps": -264.1811828613281, + "debug/reference_chosen_logps": -241.847900390625, + "debug/reference_rejected_logps": -261.27923583984375, + "debug/sppo_chosen_loss": 2345.96728515625, + "debug/sppo_chosen_reward_in_loss": 2.0331783294677734, + "debug/sppo_rej_reward_in_loss": -2.901949167251587, + "debug/sppo_reject_loss": 2336.490966796875, + "epoch": 5.5978260869565215, + "grad_norm": 60768.60879872343, + "learning_rate": 2.2485364238130432e-08, + "logits/chosen": 1.036051869392395, + "logits/rejected": 1.2976715564727783, + "logps/chosen": -239.81472778320312, + "logps/rejected": -264.1811828613281, + "loss": 4429.5215, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": 0.02033178135752678, + "rewards/margins": 0.049351271241903305, + "rewards/rejected": -0.029019493609666824, + "step": 1545 + }, + { + "debug/policy_chosen_logits": 1.1724830865859985, + "debug/policy_chosen_logps": -249.09274291992188, + "debug/policy_rejected_logits": 1.777655839920044, + "debug/policy_rejected_logps": -309.8492126464844, + "debug/reference_chosen_logps": -249.4769744873047, + "debug/reference_rejected_logps": -302.5712890625, + "debug/sppo_chosen_loss": 2510.03759765625, + "debug/sppo_chosen_reward_in_loss": 0.3842487335205078, + "debug/sppo_rej_reward_in_loss": -7.277923583984375, + "debug/sppo_reject_loss": 2033.0191650390625, + "epoch": 5.615942028985507, + "grad_norm": 81233.4541358366, + "learning_rate": 2.2175037397088887e-08, + "logits/chosen": 1.1724830865859985, + "logits/rejected": 1.777655839920044, + "logps/chosen": -249.09274291992188, + "logps/rejected": -309.8492126464844, + "loss": 4423.675, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": 0.003842487931251526, + "rewards/margins": 0.07662171125411987, + "rewards/rejected": -0.07277923822402954, + "step": 1550 + }, + { + "debug/policy_chosen_logits": 1.1309086084365845, + "debug/policy_chosen_logps": -253.1494903564453, + "debug/policy_rejected_logits": 1.5028681755065918, + "debug/policy_rejected_logps": -308.93731689453125, + "debug/reference_chosen_logps": -255.1627960205078, + "debug/reference_rejected_logps": -301.81512451171875, + "debug/sppo_chosen_loss": 2369.07177734375, + "debug/sppo_chosen_reward_in_loss": 2.013322353363037, + "debug/sppo_rej_reward_in_loss": -7.1221771240234375, + "debug/sppo_reject_loss": 1992.401611328125, + "epoch": 5.634057971014493, + "grad_norm": 100650.3146198026, + "learning_rate": 2.1866255565428348e-08, + "logits/chosen": 1.1309086084365845, + "logits/rejected": 1.5028681755065918, + "logps/chosen": -253.1494903564453, + "logps/rejected": -308.93731689453125, + "loss": 4557.2141, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": 0.02013322338461876, + "rewards/margins": 0.09135500341653824, + "rewards/rejected": -0.07122177630662918, + "step": 1555 + }, + { + "debug/policy_chosen_logits": 1.2297143936157227, + "debug/policy_chosen_logps": -257.30133056640625, + "debug/policy_rejected_logits": 1.589552879333496, + "debug/policy_rejected_logps": -279.32781982421875, + "debug/reference_chosen_logps": -259.53619384765625, + "debug/reference_rejected_logps": -274.4745178222656, + "debug/sppo_chosen_loss": 2346.412353515625, + "debug/sppo_chosen_reward_in_loss": 2.2348694801330566, + "debug/sppo_rej_reward_in_loss": -4.853362560272217, + "debug/sppo_reject_loss": 2156.526123046875, + "epoch": 5.6521739130434785, + "grad_norm": 67399.3140960085, + "learning_rate": 2.1559035888574427e-08, + "logits/chosen": 1.2297143936157227, + "logits/rejected": 1.589552879333496, + "logps/chosen": -257.30133056640625, + "logps/rejected": -279.32781982421875, + "loss": 4484.4062, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": 0.022348696365952492, + "rewards/margins": 0.07088232040405273, + "rewards/rejected": -0.04853362590074539, + "step": 1560 + }, + { + "debug/policy_chosen_logits": 0.7509094476699829, + "debug/policy_chosen_logps": -238.2117156982422, + "debug/policy_rejected_logits": 1.0019946098327637, + "debug/policy_rejected_logps": -310.4805603027344, + "debug/reference_chosen_logps": -238.13815307617188, + "debug/reference_rejected_logps": -302.5880126953125, + "debug/sppo_chosen_loss": 2602.240966796875, + "debug/sppo_chosen_reward_in_loss": -0.07359428703784943, + "debug/sppo_rej_reward_in_loss": -7.892542839050293, + "debug/sppo_reject_loss": 2018.501220703125, + "epoch": 5.670289855072464, + "grad_norm": 76036.36447701229, + "learning_rate": 2.125339542521254e-08, + "logits/chosen": 0.7509094476699829, + "logits/rejected": 1.0019946098327637, + "logps/chosen": -238.2117156982422, + "logps/rejected": -310.4805603027344, + "loss": 4549.6527, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.0007359433802776039, + "rewards/margins": 0.07818949222564697, + "rewards/rejected": -0.07892543077468872, + "step": 1565 + }, + { + "debug/policy_chosen_logits": 1.2730293273925781, + "debug/policy_chosen_logps": -263.2087707519531, + "debug/policy_rejected_logits": 1.5694999694824219, + "debug/policy_rejected_logps": -303.6869812011719, + "debug/reference_chosen_logps": -265.8478088378906, + "debug/reference_rejected_logps": -297.4637145996094, + "debug/sppo_chosen_loss": 2275.32275390625, + "debug/sppo_chosen_reward_in_loss": 2.639035701751709, + "debug/sppo_rej_reward_in_loss": -6.2232794761657715, + "debug/sppo_reject_loss": 2088.626220703125, + "epoch": 5.688405797101449, + "grad_norm": 80329.30357239723, + "learning_rate": 2.0949351146340583e-08, + "logits/chosen": 1.2730293273925781, + "logits/rejected": 1.5694999694824219, + "logps/chosen": -263.2087707519531, + "logps/rejected": -303.6869812011719, + "loss": 4487.0953, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.026390355080366135, + "rewards/margins": 0.08862314373254776, + "rewards/rejected": -0.062232792377471924, + "step": 1570 + }, + { + "debug/policy_chosen_logits": 1.4459768533706665, + "debug/policy_chosen_logps": -264.822021484375, + "debug/policy_rejected_logits": 1.6058681011199951, + "debug/policy_rejected_logps": -292.1711730957031, + "debug/reference_chosen_logps": -266.3921813964844, + "debug/reference_rejected_logps": -285.4745178222656, + "debug/sppo_chosen_loss": 2444.116455078125, + "debug/sppo_chosen_reward_in_loss": 1.5701853036880493, + "debug/sppo_rej_reward_in_loss": -6.696642875671387, + "debug/sppo_reject_loss": 2027.204345703125, + "epoch": 5.706521739130435, + "grad_norm": 73562.49283707654, + "learning_rate": 2.064691993432678e-08, + "logits/chosen": 1.4459768533706665, + "logits/rejected": 1.6058681011199951, + "logps/chosen": -264.822021484375, + "logps/rejected": -292.1711730957031, + "loss": 4437.1078, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": 0.01570185460150242, + "rewards/margins": 0.08266828209161758, + "rewards/rejected": -0.06696642935276031, + "step": 1575 + }, + { + "debug/policy_chosen_logits": 1.305905818939209, + "debug/policy_chosen_logps": -266.57647705078125, + "debug/policy_rejected_logits": 1.6066995859146118, + "debug/policy_rejected_logps": -286.05487060546875, + "debug/reference_chosen_logps": -268.0072937011719, + "debug/reference_rejected_logps": -280.94744873046875, + "debug/sppo_chosen_loss": 2399.1025390625, + "debug/sppo_chosen_reward_in_loss": 1.430829644203186, + "debug/sppo_rej_reward_in_loss": -5.107414722442627, + "debug/sppo_reject_loss": 2106.423828125, + "epoch": 5.72463768115942, + "grad_norm": 69774.80715783554, + "learning_rate": 2.0346118581972095e-08, + "logits/chosen": 1.305905818939209, + "logits/rejected": 1.6066995859146118, + "logps/chosen": -266.57647705078125, + "logps/rejected": -286.05487060546875, + "loss": 4559.0742, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": 0.014308296144008636, + "rewards/margins": 0.06538243591785431, + "rewards/rejected": -0.05107413977384567, + "step": 1580 + }, + { + "debug/policy_chosen_logits": 0.9529942274093628, + "debug/policy_chosen_logps": -245.853515625, + "debug/policy_rejected_logits": 1.3847246170043945, + "debug/policy_rejected_logps": -291.8622131347656, + "debug/reference_chosen_logps": -246.39022827148438, + "debug/reference_rejected_logps": -284.890625, + "debug/sppo_chosen_loss": 2535.155517578125, + "debug/sppo_chosen_reward_in_loss": 0.5367231369018555, + "debug/sppo_rej_reward_in_loss": -6.971585273742676, + "debug/sppo_reject_loss": 2033.044189453125, + "epoch": 5.742753623188406, + "grad_norm": 83134.39826419551, + "learning_rate": 2.0046963791577898e-08, + "logits/chosen": 0.9529942274093628, + "logits/rejected": 1.3847246170043945, + "logps/chosen": -245.853515625, + "logps/rejected": -291.8622131347656, + "loss": 4546.8141, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": 0.00536723155528307, + "rewards/margins": 0.07508309185504913, + "rewards/rejected": -0.06971585005521774, + "step": 1585 + }, + { + "debug/policy_chosen_logits": 0.9592103958129883, + "debug/policy_chosen_logps": -254.1472625732422, + "debug/policy_rejected_logits": 1.5464726686477661, + "debug/policy_rejected_logps": -303.38140869140625, + "debug/reference_chosen_logps": -256.5666198730469, + "debug/reference_rejected_logps": -295.99114990234375, + "debug/sppo_chosen_loss": 2300.235107421875, + "debug/sppo_chosen_reward_in_loss": 2.4193854331970215, + "debug/sppo_rej_reward_in_loss": -7.3902788162231445, + "debug/sppo_reject_loss": 1999.815185546875, + "epoch": 5.760869565217392, + "grad_norm": 60062.4219329617, + "learning_rate": 1.9749472174018567e-08, + "logits/chosen": 0.9592103958129883, + "logits/rejected": 1.5464726686477661, + "logps/chosen": -254.1472625732422, + "logps/rejected": -303.38140869140625, + "loss": 4369.1586, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": 0.02419385313987732, + "rewards/margins": 0.09809663146734238, + "rewards/rejected": -0.07390278577804565, + "step": 1590 + }, + { + "debug/policy_chosen_logits": 0.9911503791809082, + "debug/policy_chosen_logps": -244.70571899414062, + "debug/policy_rejected_logits": 1.137888789176941, + "debug/policy_rejected_logps": -251.33859252929688, + "debug/reference_chosen_logps": -245.8179168701172, + "debug/reference_rejected_logps": -247.0068817138672, + "debug/sppo_chosen_loss": 2453.41845703125, + "debug/sppo_chosen_reward_in_loss": 1.1122007369995117, + "debug/sppo_rej_reward_in_loss": -4.331699371337891, + "debug/sppo_reject_loss": 2246.73095703125, + "epoch": 5.778985507246377, + "grad_norm": 63515.44254699706, + "learning_rate": 1.9453660247819054e-08, + "logits/chosen": 0.9911503791809082, + "logits/rejected": 1.137888789176941, + "logps/chosen": -244.70571899414062, + "logps/rejected": -251.33859252929688, + "loss": 4569.682, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": 0.011122007854282856, + "rewards/margins": 0.0544389970600605, + "rewards/rejected": -0.04331699013710022, + "step": 1595 + }, + { + "debug/policy_chosen_logits": 0.9711323976516724, + "debug/policy_chosen_logps": -255.40786743164062, + "debug/policy_rejected_logits": 1.3977515697479248, + "debug/policy_rejected_logps": -322.133056640625, + "debug/reference_chosen_logps": -257.37481689453125, + "debug/reference_rejected_logps": -316.53289794921875, + "debug/sppo_chosen_loss": 2381.20361328125, + "debug/sppo_chosen_reward_in_loss": 1.9669723510742188, + "debug/sppo_rej_reward_in_loss": -5.600157260894775, + "debug/sppo_reject_loss": 2125.21435546875, + "epoch": 5.797101449275362, + "grad_norm": 104333.61833020138, + "learning_rate": 1.9159544438237795e-08, + "logits/chosen": 0.9711323976516724, + "logits/rejected": 1.3977515697479248, + "logps/chosen": -255.40786743164062, + "logps/rejected": -322.133056640625, + "loss": 4557.7566, + "rewards/accuracies": 0.824999988079071, + "rewards/chosen": 0.019669722765684128, + "rewards/margins": 0.07567129284143448, + "rewards/rejected": -0.05600156635046005, + "step": 1600 + }, + { + "epoch": 5.797101449275362, + "eval_debug/policy_chosen_logits": 1.3976449966430664, + "eval_debug/policy_chosen_logps": -252.77944946289062, + "eval_debug/policy_rejected_logits": 1.4427540302276611, + "eval_debug/policy_rejected_logps": -263.8221130371094, + "eval_debug/reference_chosen_logps": -252.91845703125, + "eval_debug/reference_rejected_logps": -259.6585998535156, + "eval_debug/sppo_chosen_loss": 2564.9140625, + "eval_debug/sppo_chosen_reward_in_loss": 0.13903316855430603, + "eval_debug/sppo_rej_reward_in_loss": -4.163466453552246, + "eval_debug/sppo_reject_loss": 2269.406982421875, + "eval_logits/chosen": 1.3976449966430664, + "eval_logits/rejected": 1.4427540302276611, + "eval_logps/chosen": -252.77944946289062, + "eval_logps/rejected": -263.8221130371094, + "eval_loss": 4618.0537109375, + "eval_rewards/accuracies": 0.5921052694320679, + "eval_rewards/chosen": 0.001390331657603383, + "eval_rewards/margins": 0.043024998158216476, + "eval_rewards/rejected": -0.04163466766476631, + "eval_runtime": 28.3722, + "eval_samples_per_second": 21.147, + "eval_steps_per_second": 0.67, + "step": 1600 + }, + { + "debug/policy_chosen_logits": 1.39915931224823, + "debug/policy_chosen_logps": -261.6456604003906, + "debug/policy_rejected_logits": 1.3916994333267212, + "debug/policy_rejected_logps": -283.9635009765625, + "debug/reference_chosen_logps": -262.82550048828125, + "debug/reference_rejected_logps": -277.431396484375, + "debug/sppo_chosen_loss": 2451.422607421875, + "debug/sppo_chosen_reward_in_loss": 1.1798160076141357, + "debug/sppo_rej_reward_in_loss": -6.532097816467285, + "debug/sppo_reject_loss": 2075.255859375, + "epoch": 5.815217391304348, + "grad_norm": 119684.09863826247, + "learning_rate": 1.8867141076354575e-08, + "logits/chosen": 1.39915931224823, + "logits/rejected": 1.3916994333267212, + "logps/chosen": -261.6456604003906, + "logps/rejected": -283.9635009765625, + "loss": 4475.0945, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": 0.011798160150647163, + "rewards/margins": 0.0771191343665123, + "rewards/rejected": -0.06532097607851028, + "step": 1605 + }, + { + "debug/policy_chosen_logits": 1.276993751525879, + "debug/policy_chosen_logps": -254.44296264648438, + "debug/policy_rejected_logits": 1.4243838787078857, + "debug/policy_rejected_logps": -278.84466552734375, + "debug/reference_chosen_logps": -256.94012451171875, + "debug/reference_rejected_logps": -272.6163024902344, + "debug/sppo_chosen_loss": 2287.19873046875, + "debug/sppo_chosen_reward_in_loss": 2.4971923828125, + "debug/sppo_rej_reward_in_loss": -6.2283549308776855, + "debug/sppo_reject_loss": 2048.57958984375, + "epoch": 5.833333333333333, + "grad_norm": 84232.22173943985, + "learning_rate": 1.8576466398163825e-08, + "logits/chosen": 1.276993751525879, + "logits/rejected": 1.4243838787078857, + "logps/chosen": -254.44296264648438, + "logps/rejected": -278.84466552734375, + "loss": 4472.1785, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": 0.024971922859549522, + "rewards/margins": 0.08725547045469284, + "rewards/rejected": -0.062283553183078766, + "step": 1610 + }, + { + "debug/policy_chosen_logits": 1.0857824087142944, + "debug/policy_chosen_logps": -261.70123291015625, + "debug/policy_rejected_logits": 1.2012531757354736, + "debug/policy_rejected_logps": -284.1413879394531, + "debug/reference_chosen_logps": -263.23260498046875, + "debug/reference_rejected_logps": -279.31488037109375, + "debug/sppo_chosen_loss": 2379.55908203125, + "debug/sppo_chosen_reward_in_loss": 1.5314220190048218, + "debug/sppo_rej_reward_in_loss": -4.826534748077393, + "debug/sppo_reject_loss": 2191.15673828125, + "epoch": 5.851449275362318, + "grad_norm": 75262.29467933044, + "learning_rate": 1.828753654367301e-08, + "logits/chosen": 1.0857824087142944, + "logits/rejected": 1.2012531757354736, + "logps/chosen": -261.70123291015625, + "logps/rejected": -284.1413879394531, + "loss": 4482.4117, + "rewards/accuracies": 0.574999988079071, + "rewards/chosen": 0.015314221382141113, + "rewards/margins": 0.06357955932617188, + "rewards/rejected": -0.04826534539461136, + "step": 1615 + }, + { + "debug/policy_chosen_logits": 0.9738510251045227, + "debug/policy_chosen_logps": -253.0663604736328, + "debug/policy_rejected_logits": 1.2081331014633179, + "debug/policy_rejected_logps": -289.32147216796875, + "debug/reference_chosen_logps": -253.2355194091797, + "debug/reference_rejected_logps": -283.28363037109375, + "debug/sppo_chosen_loss": 2559.809814453125, + "debug/sppo_chosen_reward_in_loss": 0.16917076706886292, + "debug/sppo_rej_reward_in_loss": -6.037837028503418, + "debug/sppo_reject_loss": 2085.84033203125, + "epoch": 5.869565217391305, + "grad_norm": 60666.13479644753, + "learning_rate": 1.800036755600649e-08, + "logits/chosen": 0.9738510251045227, + "logits/rejected": 1.2081331014633179, + "logps/chosen": -253.0663604736328, + "logps/rejected": -289.32147216796875, + "loss": 4566.7016, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.001691707642748952, + "rewards/margins": 0.06207007169723511, + "rewards/rejected": -0.060378365218639374, + "step": 1620 + }, + { + "debug/policy_chosen_logits": 1.2756688594818115, + "debug/policy_chosen_logps": -247.154541015625, + "debug/policy_rejected_logits": 1.5490895509719849, + "debug/policy_rejected_logps": -284.45733642578125, + "debug/reference_chosen_logps": -248.82217407226562, + "debug/reference_rejected_logps": -276.4673767089844, + "debug/sppo_chosen_loss": 2402.35302734375, + "debug/sppo_chosen_reward_in_loss": 1.6676151752471924, + "debug/sppo_rej_reward_in_loss": -7.989927768707275, + "debug/sppo_reject_loss": 2021.4124755859375, + "epoch": 5.88768115942029, + "grad_norm": 70184.62744062844, + "learning_rate": 1.7714975380514747e-08, + "logits/chosen": 1.2756688594818115, + "logits/rejected": 1.5490895509719849, + "logps/chosen": -247.154541015625, + "logps/rejected": -284.45733642578125, + "loss": 4444.1676, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": 0.016676150262355804, + "rewards/margins": 0.09657542407512665, + "rewards/rejected": -0.07989926636219025, + "step": 1625 + }, + { + "debug/policy_chosen_logits": 0.9647180438041687, + "debug/policy_chosen_logps": -231.06143188476562, + "debug/policy_rejected_logits": 1.6003284454345703, + "debug/policy_rejected_logps": -329.9085388183594, + "debug/reference_chosen_logps": -232.83535766601562, + "debug/reference_rejected_logps": -322.9118957519531, + "debug/sppo_chosen_loss": 2379.330322265625, + "debug/sppo_chosen_reward_in_loss": 1.773934006690979, + "debug/sppo_rej_reward_in_loss": -6.9966301918029785, + "debug/sppo_reject_loss": 2040.5396728515625, + "epoch": 5.905797101449275, + "grad_norm": 122076.70540811632, + "learning_rate": 1.74313758638889e-08, + "logits/chosen": 0.9647180438041687, + "logits/rejected": 1.6003284454345703, + "logps/chosen": -231.06143188476562, + "logps/rejected": -329.9085388183594, + "loss": 4578.8035, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": 0.01773933880031109, + "rewards/margins": 0.08770564943552017, + "rewards/rejected": -0.06996630132198334, + "step": 1630 + }, + { + "debug/policy_chosen_logits": 0.9913623929023743, + "debug/policy_chosen_logps": -235.04196166992188, + "debug/policy_rejected_logits": 1.44295334815979, + "debug/policy_rejected_logps": -325.648681640625, + "debug/reference_chosen_logps": -237.11782836914062, + "debug/reference_rejected_logps": -316.41522216796875, + "debug/sppo_chosen_loss": 2330.42529296875, + "debug/sppo_chosen_reward_in_loss": 2.075892448425293, + "debug/sppo_rej_reward_in_loss": -9.233416557312012, + "debug/sppo_reject_loss": 1873.272705078125, + "epoch": 5.923913043478261, + "grad_norm": 75527.0035979738, + "learning_rate": 1.7149584753280877e-08, + "logits/chosen": 0.9913623929023743, + "logits/rejected": 1.44295334815979, + "logps/chosen": -235.04196166992188, + "logps/rejected": -325.648681640625, + "loss": 4426.1695, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": 0.020758923143148422, + "rewards/margins": 0.11309309303760529, + "rewards/rejected": -0.09233416616916656, + "step": 1635 + }, + { + "debug/policy_chosen_logits": 0.8644050359725952, + "debug/policy_chosen_logps": -247.27706909179688, + "debug/policy_rejected_logits": 0.9515060186386108, + "debug/policy_rejected_logps": -264.0517272949219, + "debug/reference_chosen_logps": -246.76345825195312, + "debug/reference_rejected_logps": -256.6071472167969, + "debug/sppo_chosen_loss": 2653.48779296875, + "debug/sppo_chosen_reward_in_loss": -0.5136321783065796, + "debug/sppo_rej_reward_in_loss": -7.444613456726074, + "debug/sppo_reject_loss": 1968.6578369140625, + "epoch": 5.942028985507246, + "grad_norm": 111654.94401469397, + "learning_rate": 1.6869617695429024e-08, + "logits/chosen": 0.8644050359725952, + "logits/rejected": 0.9515060186386108, + "logps/chosen": -247.27706909179688, + "logps/rejected": -264.0517272949219, + "loss": 4457.9152, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.005136322230100632, + "rewards/margins": 0.06930981576442719, + "rewards/rejected": -0.07444612681865692, + "step": 1640 + }, + { + "debug/policy_chosen_logits": 1.2634034156799316, + "debug/policy_chosen_logps": -272.01629638671875, + "debug/policy_rejected_logits": 1.728417992591858, + "debug/policy_rejected_logps": -293.72198486328125, + "debug/reference_chosen_logps": -273.44683837890625, + "debug/reference_rejected_logps": -289.3643493652344, + "debug/sppo_chosen_loss": 2410.878173828125, + "debug/sppo_chosen_reward_in_loss": 1.4305458068847656, + "debug/sppo_rej_reward_in_loss": -4.357623100280762, + "debug/sppo_reject_loss": 2192.16552734375, + "epoch": 5.960144927536232, + "grad_norm": 92617.1431644493, + "learning_rate": 1.659149023578932e-08, + "logits/chosen": 1.2634034156799316, + "logits/rejected": 1.728417992591858, + "logps/chosen": -272.01629638671875, + "logps/rejected": -293.72198486328125, + "loss": 4681.8473, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": 0.014305457472801208, + "rewards/margins": 0.05788169056177139, + "rewards/rejected": -0.043576233088970184, + "step": 1645 + }, + { + "debug/policy_chosen_logits": 1.1534197330474854, + "debug/policy_chosen_logps": -257.92620849609375, + "debug/policy_rejected_logits": 1.5255687236785889, + "debug/policy_rejected_logps": -309.0951232910156, + "debug/reference_chosen_logps": -259.07080078125, + "debug/reference_rejected_logps": -302.33856201171875, + "debug/sppo_chosen_loss": 2448.565185546875, + "debug/sppo_chosen_reward_in_loss": 1.144627332687378, + "debug/sppo_rej_reward_in_loss": -6.756533145904541, + "debug/sppo_reject_loss": 2012.3671875, + "epoch": 5.978260869565218, + "grad_norm": 67416.56753397142, + "learning_rate": 1.631521781767214e-08, + "logits/chosen": 1.1534197330474854, + "logits/rejected": 1.5255687236785889, + "logps/chosen": -257.92620849609375, + "logps/rejected": -309.0951232910156, + "loss": 4506.9414, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": 0.011446274816989899, + "rewards/margins": 0.07901160418987274, + "rewards/rejected": -0.06756532937288284, + "step": 1650 + }, + { + "debug/policy_chosen_logits": 1.0105514526367188, + "debug/policy_chosen_logps": -266.85760498046875, + "debug/policy_rejected_logits": 1.1407248973846436, + "debug/policy_rejected_logps": -270.06201171875, + "debug/reference_chosen_logps": -269.12786865234375, + "debug/reference_rejected_logps": -266.747314453125, + "debug/sppo_chosen_loss": 2305.269775390625, + "debug/sppo_chosen_reward_in_loss": 2.270277738571167, + "debug/sppo_rej_reward_in_loss": -3.3147239685058594, + "debug/sppo_reject_loss": 2292.51220703125, + "epoch": 5.996376811594203, + "grad_norm": 71539.28058485997, + "learning_rate": 1.6040815781384835e-08, + "logits/chosen": 1.0105514526367188, + "logits/rejected": 1.1407248973846436, + "logps/chosen": -266.85760498046875, + "logps/rejected": -270.06201171875, + "loss": 4471.6687, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": 0.02270277589559555, + "rewards/margins": 0.05585001781582832, + "rewards/rejected": -0.033147238194942474, + "step": 1655 + }, + { + "debug/policy_chosen_logits": 1.3166652917861938, + "debug/policy_chosen_logps": -249.92971801757812, + "debug/policy_rejected_logits": 1.3984097242355347, + "debug/policy_rejected_logps": -264.4957275390625, + "debug/reference_chosen_logps": -252.8326416015625, + "debug/reference_rejected_logps": -259.8531494140625, + "debug/sppo_chosen_loss": 2243.153564453125, + "debug/sppo_chosen_reward_in_loss": 2.9029386043548584, + "debug/sppo_rej_reward_in_loss": -4.6425909996032715, + "debug/sppo_reject_loss": 2174.069091796875, + "epoch": 6.0144927536231885, + "grad_norm": 165564.9998152843, + "learning_rate": 1.5768299363379873e-08, + "logits/chosen": 1.3166652917861938, + "logits/rejected": 1.3984097242355347, + "logps/chosen": -249.92971801757812, + "logps/rejected": -264.4957275390625, + "loss": 4336.6781, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.02902938798069954, + "rewards/margins": 0.07545529305934906, + "rewards/rejected": -0.046425916254520416, + "step": 1660 + }, + { + "debug/policy_chosen_logits": 1.1228680610656738, + "debug/policy_chosen_logps": -256.03125, + "debug/policy_rejected_logits": 1.1986610889434814, + "debug/policy_rejected_logps": -284.936279296875, + "debug/reference_chosen_logps": -256.0284118652344, + "debug/reference_rejected_logps": -279.94073486328125, + "debug/sppo_chosen_loss": 2582.91455078125, + "debug/sppo_chosen_reward_in_loss": -0.0028770447243005037, + "debug/sppo_rej_reward_in_loss": -4.995522499084473, + "debug/sppo_reject_loss": 2207.716796875, + "epoch": 6.032608695652174, + "grad_norm": 62791.773762485485, + "learning_rate": 1.549768369540882e-08, + "logits/chosen": 1.1228680610656738, + "logits/rejected": 1.1986610889434814, + "logps/chosen": -256.03125, + "logps/rejected": -284.936279296875, + "loss": 4471.5508, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -2.8770416975021362e-05, + "rewards/margins": 0.049926456063985825, + "rewards/rejected": -0.049955226480960846, + "step": 1665 + }, + { + "debug/policy_chosen_logits": 0.9759367108345032, + "debug/policy_chosen_logps": -260.8306884765625, + "debug/policy_rejected_logits": 1.3370988368988037, + "debug/policy_rejected_logps": -274.1346740722656, + "debug/reference_chosen_logps": -260.8038330078125, + "debug/reference_rejected_logps": -269.924072265625, + "debug/sppo_chosen_loss": 2595.6005859375, + "debug/sppo_chosen_reward_in_loss": -0.026832008734345436, + "debug/sppo_rej_reward_in_loss": -4.210579872131348, + "debug/sppo_reject_loss": 2220.85302734375, + "epoch": 6.050724637681159, + "grad_norm": 85633.85924538625, + "learning_rate": 1.5228983803682233e-08, + "logits/chosen": 0.9759367108345032, + "logits/rejected": 1.3370988368988037, + "logps/chosen": -260.8306884765625, + "logps/rejected": -274.1346740722656, + "loss": 4617.566, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.0002683214843273163, + "rewards/margins": 0.04183747619390488, + "rewards/rejected": -0.042105793952941895, + "step": 1670 + }, + { + "debug/policy_chosen_logits": 1.043464183807373, + "debug/policy_chosen_logps": -237.03897094726562, + "debug/policy_rejected_logits": 1.3343417644500732, + "debug/policy_rejected_logps": -272.0693664550781, + "debug/reference_chosen_logps": -237.66552734375, + "debug/reference_rejected_logps": -266.9305725097656, + "debug/sppo_chosen_loss": 2517.232421875, + "debug/sppo_chosen_reward_in_loss": 0.6265815496444702, + "debug/sppo_rej_reward_in_loss": -5.138772487640381, + "debug/sppo_reject_loss": 2158.145751953125, + "epoch": 6.068840579710145, + "grad_norm": 106333.17908808027, + "learning_rate": 1.4962214608035174e-08, + "logits/chosen": 1.043464183807373, + "logits/rejected": 1.3343417644500732, + "logps/chosen": -237.03897094726562, + "logps/rejected": -272.0693664550781, + "loss": 4487.2164, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": 0.006265816278755665, + "rewards/margins": 0.05765353515744209, + "rewards/rejected": -0.0513877272605896, + "step": 1675 + }, + { + "debug/policy_chosen_logits": 0.9386157989501953, + "debug/policy_chosen_logps": -245.19601440429688, + "debug/policy_rejected_logits": 1.1581140756607056, + "debug/policy_rejected_logps": -282.74481201171875, + "debug/reference_chosen_logps": -246.68087768554688, + "debug/reference_rejected_logps": -274.8426208496094, + "debug/sppo_chosen_loss": 2406.94140625, + "debug/sppo_chosen_reward_in_loss": 1.484842300415039, + "debug/sppo_rej_reward_in_loss": -7.902211666107178, + "debug/sppo_reject_loss": 1954.858154296875, + "epoch": 6.086956521739131, + "grad_norm": 63533.333542387, + "learning_rate": 1.4697390921098884e-08, + "logits/chosen": 0.9386157989501953, + "logits/rejected": 1.1581140756607056, + "logps/chosen": -245.19601440429688, + "logps/rejected": -282.74481201171875, + "loss": 4453.4, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.014848423190414906, + "rewards/margins": 0.09387053549289703, + "rewards/rejected": -0.079022116959095, + "step": 1680 + }, + { + "debug/policy_chosen_logits": 1.0539305210113525, + "debug/policy_chosen_logps": -252.01614379882812, + "debug/policy_rejected_logits": 1.518951416015625, + "debug/policy_rejected_logps": -322.51824951171875, + "debug/reference_chosen_logps": -255.57870483398438, + "debug/reference_rejected_logps": -316.02362060546875, + "debug/sppo_chosen_loss": 2176.062255859375, + "debug/sppo_chosen_reward_in_loss": 3.562582492828369, + "debug/sppo_rej_reward_in_loss": -6.494635581970215, + "debug/sppo_reject_loss": 2063.71337890625, + "epoch": 6.105072463768116, + "grad_norm": 78076.75910880722, + "learning_rate": 1.4434527447478211e-08, + "logits/chosen": 1.0539305210113525, + "logits/rejected": 1.518951416015625, + "logps/chosen": -252.01614379882812, + "logps/rejected": -322.51824951171875, + "loss": 4314.0164, + "rewards/accuracies": 0.824999988079071, + "rewards/chosen": 0.03562582656741142, + "rewards/margins": 0.10057218372821808, + "rewards/rejected": -0.06494636088609695, + "step": 1685 + }, + { + "debug/policy_chosen_logits": 1.1778371334075928, + "debug/policy_chosen_logps": -240.6632843017578, + "debug/policy_rejected_logits": 1.3609716892242432, + "debug/policy_rejected_logps": -288.82550048828125, + "debug/reference_chosen_logps": -242.98355102539062, + "debug/reference_rejected_logps": -285.4348449707031, + "debug/sppo_chosen_loss": 2297.618408203125, + "debug/sppo_chosen_reward_in_loss": 2.320263147354126, + "debug/sppo_rej_reward_in_loss": -3.3906643390655518, + "debug/sppo_reject_loss": 2266.66259765625, + "epoch": 6.1231884057971016, + "grad_norm": 92007.95295488626, + "learning_rate": 1.4173638782935222e-08, + "logits/chosen": 1.1778371334075928, + "logits/rejected": 1.3609716892242432, + "logps/chosen": -240.6632843017578, + "logps/rejected": -288.82550048828125, + "loss": 4490.9922, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": 0.02320263162255287, + "rewards/margins": 0.057109273970127106, + "rewards/rejected": -0.033906638622283936, + "step": 1690 + }, + { + "debug/policy_chosen_logits": 1.1212691068649292, + "debug/policy_chosen_logps": -267.1830139160156, + "debug/policy_rejected_logits": 1.542589545249939, + "debug/policy_rejected_logps": -308.27496337890625, + "debug/reference_chosen_logps": -269.02978515625, + "debug/reference_rejected_logps": -303.439208984375, + "debug/sppo_chosen_loss": 2355.48974609375, + "debug/sppo_chosen_reward_in_loss": 1.8467466831207275, + "debug/sppo_rej_reward_in_loss": -4.83573055267334, + "debug/sppo_reject_loss": 2156.66015625, + "epoch": 6.141304347826087, + "grad_norm": 76139.17518191009, + "learning_rate": 1.3914739413578635e-08, + "logits/chosen": 1.1212691068649292, + "logits/rejected": 1.542589545249939, + "logps/chosen": -267.1830139160156, + "logps/rejected": -308.27496337890625, + "loss": 4502.1578, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.018467465415596962, + "rewards/margins": 0.0668247789144516, + "rewards/rejected": -0.04835730418562889, + "step": 1695 + }, + { + "debug/policy_chosen_logits": 1.0304549932479858, + "debug/policy_chosen_logps": -258.8855895996094, + "debug/policy_rejected_logits": 1.1785156726837158, + "debug/policy_rejected_logps": -277.87237548828125, + "debug/reference_chosen_logps": -261.2309875488281, + "debug/reference_rejected_logps": -270.38702392578125, + "debug/sppo_chosen_loss": 2296.740478515625, + "debug/sppo_chosen_reward_in_loss": 2.3453879356384277, + "debug/sppo_rej_reward_in_loss": -7.485389709472656, + "debug/sppo_reject_loss": 1961.5794677734375, + "epoch": 6.159420289855072, + "grad_norm": 63352.3130523474, + "learning_rate": 1.3657843715059546e-08, + "logits/chosen": 1.0304549932479858, + "logits/rejected": 1.1785156726837158, + "logps/chosen": -258.8855895996094, + "logps/rejected": -277.87237548828125, + "loss": 4507.4234, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": 0.023453880101442337, + "rewards/margins": 0.0983077734708786, + "rewards/rejected": -0.07485388219356537, + "step": 1700 + }, + { + "epoch": 6.159420289855072, + "eval_debug/policy_chosen_logits": 1.393418312072754, + "eval_debug/policy_chosen_logps": -252.83157348632812, + "eval_debug/policy_rejected_logits": 1.4381625652313232, + "eval_debug/policy_rejected_logps": -263.7893371582031, + "eval_debug/reference_chosen_logps": -252.91845703125, + "eval_debug/reference_rejected_logps": -259.6585998535156, + "eval_debug/sppo_chosen_loss": 2573.3212890625, + "eval_debug/sppo_chosen_reward_in_loss": 0.08691415935754776, + "eval_debug/sppo_rej_reward_in_loss": -4.130711078643799, + "eval_debug/sppo_reject_loss": 2274.951171875, + "eval_logits/chosen": 1.393418312072754, + "eval_logits/rejected": 1.4381625652313232, + "eval_logps/chosen": -252.83157348632812, + "eval_logps/rejected": -263.7893371582031, + "eval_loss": 4618.0, + "eval_rewards/accuracies": 0.5921052694320679, + "eval_rewards/chosen": 0.0008691417751833797, + "eval_rewards/margins": 0.042176254093647, + "eval_rewards/rejected": -0.04130711406469345, + "eval_runtime": 28.5957, + "eval_samples_per_second": 20.982, + "eval_steps_per_second": 0.664, + "step": 1700 + }, + { + "debug/policy_chosen_logits": 1.1354193687438965, + "debug/policy_chosen_logps": -249.91000366210938, + "debug/policy_rejected_logits": 1.5528957843780518, + "debug/policy_rejected_logps": -313.15106201171875, + "debug/reference_chosen_logps": -252.29525756835938, + "debug/reference_rejected_logps": -306.924072265625, + "debug/sppo_chosen_loss": 2300.11083984375, + "debug/sppo_chosen_reward_in_loss": 2.385272741317749, + "debug/sppo_rej_reward_in_loss": -6.227025032043457, + "debug/sppo_reject_loss": 2092.615234375, + "epoch": 6.177536231884058, + "grad_norm": 107904.8515031039, + "learning_rate": 1.3402965951773231e-08, + "logits/chosen": 1.1354193687438965, + "logits/rejected": 1.5528957843780518, + "logps/chosen": -249.91000366210938, + "logps/rejected": -313.15106201171875, + "loss": 4416.3258, + "rewards/accuracies": 0.8500000238418579, + "rewards/chosen": 0.023852726444602013, + "rewards/margins": 0.08612297475337982, + "rewards/rejected": -0.06227024644613266, + "step": 1705 + }, + { + "debug/policy_chosen_logits": 1.0563108921051025, + "debug/policy_chosen_logps": -254.9259033203125, + "debug/policy_rejected_logits": 1.5079724788665771, + "debug/policy_rejected_logps": -305.40533447265625, + "debug/reference_chosen_logps": -257.25634765625, + "debug/reference_rejected_logps": -300.0366516113281, + "debug/sppo_chosen_loss": 2320.1484375, + "debug/sppo_chosen_reward_in_loss": 2.330429792404175, + "debug/sppo_rej_reward_in_loss": -5.368690013885498, + "debug/sppo_reject_loss": 2095.70947265625, + "epoch": 6.195652173913044, + "grad_norm": 145999.2754310219, + "learning_rate": 1.3150120276067005e-08, + "logits/chosen": 1.0563108921051025, + "logits/rejected": 1.5079724788665771, + "logps/chosen": -254.9259033203125, + "logps/rejected": -305.40533447265625, + "loss": 4454.2812, + "rewards/accuracies": 0.824999988079071, + "rewards/chosen": 0.023304296657443047, + "rewards/margins": 0.07699120044708252, + "rewards/rejected": -0.053686898201704025, + "step": 1710 + }, + { + "debug/policy_chosen_logits": 0.8688338994979858, + "debug/policy_chosen_logps": -235.02902221679688, + "debug/policy_rejected_logits": 1.3255870342254639, + "debug/policy_rejected_logps": -299.3507385253906, + "debug/reference_chosen_logps": -234.290283203125, + "debug/reference_rejected_logps": -290.80792236328125, + "debug/sppo_chosen_loss": 2712.55908203125, + "debug/sppo_chosen_reward_in_loss": -0.7387531399726868, + "debug/sppo_rej_reward_in_loss": -8.542762756347656, + "debug/sppo_reject_loss": 1942.9124755859375, + "epoch": 6.213768115942029, + "grad_norm": 72598.38108351543, + "learning_rate": 1.2899320727454472e-08, + "logits/chosen": 0.8688338994979858, + "logits/rejected": 1.3255870342254639, + "logps/chosen": -235.02902221679688, + "logps/rejected": -299.3507385253906, + "loss": 4546.7039, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -0.007387531455606222, + "rewards/margins": 0.07804010063409805, + "rewards/rejected": -0.08542762696743011, + "step": 1715 + }, + { + "debug/policy_chosen_logits": 1.2419612407684326, + "debug/policy_chosen_logps": -231.77490234375, + "debug/policy_rejected_logits": 1.3306069374084473, + "debug/policy_rejected_logps": -259.4939880371094, + "debug/reference_chosen_logps": -235.42819213867188, + "debug/reference_rejected_logps": -255.14407348632812, + "debug/sppo_chosen_loss": 2156.22705078125, + "debug/sppo_chosen_reward_in_loss": 3.6533005237579346, + "debug/sppo_rej_reward_in_loss": -4.349907875061035, + "debug/sppo_reject_loss": 2229.787353515625, + "epoch": 6.231884057971015, + "grad_norm": 61898.363819015816, + "learning_rate": 1.2650581231835921e-08, + "logits/chosen": 1.2419612407684326, + "logits/rejected": 1.3306069374084473, + "logps/chosen": -231.77490234375, + "logps/rejected": -259.4939880371094, + "loss": 4444.5094, + "rewards/accuracies": 0.824999988079071, + "rewards/chosen": 0.03653300553560257, + "rewards/margins": 0.08003208041191101, + "rewards/rejected": -0.04349907115101814, + "step": 1720 + }, + { + "debug/policy_chosen_logits": 1.206857442855835, + "debug/policy_chosen_logps": -261.6423034667969, + "debug/policy_rejected_logits": 1.467166543006897, + "debug/policy_rejected_logps": -296.5067138671875, + "debug/reference_chosen_logps": -262.1924743652344, + "debug/reference_rejected_logps": -292.49652099609375, + "debug/sppo_chosen_loss": 2536.546630859375, + "debug/sppo_chosen_reward_in_loss": 0.5501596331596375, + "debug/sppo_rej_reward_in_loss": -4.010178565979004, + "debug/sppo_reject_loss": 2224.327392578125, + "epoch": 6.25, + "grad_norm": 72796.66106475785, + "learning_rate": 1.2403915600725157e-08, + "logits/chosen": 1.206857442855835, + "logits/rejected": 1.467166543006897, + "logps/chosen": -261.6423034667969, + "logps/rejected": -296.5067138671875, + "loss": 4472.7969, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.005501596722751856, + "rewards/margins": 0.04560338333249092, + "rewards/rejected": -0.0401017852127552, + "step": 1725 + }, + { + "debug/policy_chosen_logits": 1.1720632314682007, + "debug/policy_chosen_logps": -239.95504760742188, + "debug/policy_rejected_logits": 1.3450525999069214, + "debug/policy_rejected_logps": -271.2496643066406, + "debug/reference_chosen_logps": -239.88345336914062, + "debug/reference_rejected_logps": -264.8446350097656, + "debug/sppo_chosen_loss": 2585.338623046875, + "debug/sppo_chosen_reward_in_loss": -0.07162685692310333, + "debug/sppo_rej_reward_in_loss": -6.405016899108887, + "debug/sppo_reject_loss": 2089.844482421875, + "epoch": 6.268115942028985, + "grad_norm": 89512.0377820254, + "learning_rate": 1.2159337530482494e-08, + "logits/chosen": 1.1720632314682007, + "logits/rejected": 1.3450525999069214, + "logps/chosen": -239.95504760742188, + "logps/rejected": -271.2496643066406, + "loss": 4510.391, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -0.000716269772965461, + "rewards/margins": 0.0633339062333107, + "rewards/rejected": -0.06405016779899597, + "step": 1730 + }, + { + "debug/policy_chosen_logits": 0.982774555683136, + "debug/policy_chosen_logps": -246.8867950439453, + "debug/policy_rejected_logits": 1.3127689361572266, + "debug/policy_rejected_logps": -287.54180908203125, + "debug/reference_chosen_logps": -248.7151641845703, + "debug/reference_rejected_logps": -286.00408935546875, + "debug/sppo_chosen_loss": 2359.535888671875, + "debug/sppo_chosen_reward_in_loss": 1.8283694982528687, + "debug/sppo_rej_reward_in_loss": -1.5377200841903687, + "debug/sppo_reject_loss": 2389.627197265625, + "epoch": 6.286231884057971, + "grad_norm": 66847.64337209416, + "learning_rate": 1.1916860601554312e-08, + "logits/chosen": 0.982774555683136, + "logits/rejected": 1.3127689361572266, + "logps/chosen": -246.8867950439453, + "logps/rejected": -287.54180908203125, + "loss": 4579.4227, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": 0.018283694982528687, + "rewards/margins": 0.0336608961224556, + "rewards/rejected": -0.01537720113992691, + "step": 1735 + }, + { + "debug/policy_chosen_logits": 1.253943681716919, + "debug/policy_chosen_logps": -272.449951171875, + "debug/policy_rejected_logits": 1.6080108880996704, + "debug/policy_rejected_logps": -329.8172302246094, + "debug/reference_chosen_logps": -275.38238525390625, + "debug/reference_rejected_logps": -324.34478759765625, + "debug/sppo_chosen_loss": 2233.989501953125, + "debug/sppo_chosen_reward_in_loss": 2.9324352741241455, + "debug/sppo_rej_reward_in_loss": -5.472461223602295, + "debug/sppo_reject_loss": 2133.767333984375, + "epoch": 6.304347826086957, + "grad_norm": 68122.6119676196, + "learning_rate": 1.1676498277719017e-08, + "logits/chosen": 1.253943681716919, + "logits/rejected": 1.6080108880996704, + "logps/chosen": -272.449951171875, + "logps/rejected": -329.8172302246094, + "loss": 4587.9414, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": 0.029324352741241455, + "rewards/margins": 0.08404896408319473, + "rewards/rejected": -0.05472461134195328, + "step": 1740 + }, + { + "debug/policy_chosen_logits": 0.8677678108215332, + "debug/policy_chosen_logps": -261.25238037109375, + "debug/policy_rejected_logits": 1.049141526222229, + "debug/policy_rejected_logps": -305.6992492675781, + "debug/reference_chosen_logps": -263.1070861816406, + "debug/reference_rejected_logps": -298.8080139160156, + "debug/sppo_chosen_loss": 2364.94775390625, + "debug/sppo_chosen_reward_in_loss": 1.854697823524475, + "debug/sppo_rej_reward_in_loss": -6.891258239746094, + "debug/sppo_reject_loss": 2049.59375, + "epoch": 6.322463768115942, + "grad_norm": 76682.47648309536, + "learning_rate": 1.1438263905339358e-08, + "logits/chosen": 0.8677678108215332, + "logits/rejected": 1.049141526222229, + "logps/chosen": -261.25238037109375, + "logps/rejected": -305.6992492675781, + "loss": 4562.4586, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": 0.018546978011727333, + "rewards/margins": 0.08745956420898438, + "rewards/rejected": -0.0689125806093216, + "step": 1745 + }, + { + "debug/policy_chosen_logits": 1.2336132526397705, + "debug/policy_chosen_logps": -244.929443359375, + "debug/policy_rejected_logits": 1.4067014455795288, + "debug/policy_rejected_logps": -275.9320068359375, + "debug/reference_chosen_logps": -247.2993927001953, + "debug/reference_rejected_logps": -272.1546325683594, + "debug/sppo_chosen_loss": 2289.958984375, + "debug/sppo_chosen_reward_in_loss": 2.3699169158935547, + "debug/sppo_rej_reward_in_loss": -3.7773468494415283, + "debug/sppo_reject_loss": 2284.33447265625, + "epoch": 6.340579710144928, + "grad_norm": 72167.1173026926, + "learning_rate": 1.1202170712621467e-08, + "logits/chosen": 1.2336132526397705, + "logits/rejected": 1.4067014455795288, + "logps/chosen": -244.929443359375, + "logps/rejected": -275.9320068359375, + "loss": 4390.8207, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": 0.023699168115854263, + "rewards/margins": 0.061472635716199875, + "rewards/rejected": -0.03777346760034561, + "step": 1750 + }, + { + "debug/policy_chosen_logits": 0.9754334688186646, + "debug/policy_chosen_logps": -266.0594177246094, + "debug/policy_rejected_logits": 1.2265844345092773, + "debug/policy_rejected_logps": -309.459228515625, + "debug/reference_chosen_logps": -267.568359375, + "debug/reference_rejected_logps": -307.5111389160156, + "debug/sppo_chosen_loss": 2409.52978515625, + "debug/sppo_chosen_reward_in_loss": 1.5089489221572876, + "debug/sppo_rej_reward_in_loss": -1.9481090307235718, + "debug/sppo_reject_loss": 2377.966064453125, + "epoch": 6.358695652173913, + "grad_norm": 115786.13338678224, + "learning_rate": 1.0968231808880241e-08, + "logits/chosen": 0.9754334688186646, + "logits/rejected": 1.2265844345092773, + "logps/chosen": -266.0594177246094, + "logps/rejected": -309.459228515625, + "loss": 4526.4805, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": 0.01508948765695095, + "rewards/margins": 0.03457058221101761, + "rewards/rejected": -0.01948108896613121, + "step": 1755 + }, + { + "debug/policy_chosen_logits": 1.1159372329711914, + "debug/policy_chosen_logps": -262.75372314453125, + "debug/policy_rejected_logits": 1.3715074062347412, + "debug/policy_rejected_logps": -301.50225830078125, + "debug/reference_chosen_logps": -265.3297424316406, + "debug/reference_rejected_logps": -296.4901123046875, + "debug/sppo_chosen_loss": 2279.923828125, + "debug/sppo_chosen_reward_in_loss": 2.5760035514831543, + "debug/sppo_rej_reward_in_loss": -5.012146949768066, + "debug/sppo_reject_loss": 2145.450927734375, + "epoch": 6.3768115942028984, + "grad_norm": 70644.94503903398, + "learning_rate": 1.0736460183811546e-08, + "logits/chosen": 1.1159372329711914, + "logits/rejected": 1.3715074062347412, + "logps/chosen": -262.75372314453125, + "logps/rejected": -301.50225830078125, + "loss": 4568.8281, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.02576003409922123, + "rewards/margins": 0.07588149607181549, + "rewards/rejected": -0.05012146756052971, + "step": 1760 + }, + { + "debug/policy_chosen_logits": 1.30352783203125, + "debug/policy_chosen_logps": -274.08648681640625, + "debug/policy_rejected_logits": 1.3584903478622437, + "debug/policy_rejected_logps": -292.0806884765625, + "debug/reference_chosen_logps": -274.2155456542969, + "debug/reference_rejected_logps": -287.33245849609375, + "debug/sppo_chosen_loss": 2555.212890625, + "debug/sppo_chosen_reward_in_loss": 0.12903061509132385, + "debug/sppo_rej_reward_in_loss": -4.748242378234863, + "debug/sppo_reject_loss": 2203.50048828125, + "epoch": 6.394927536231884, + "grad_norm": 63618.12217875742, + "learning_rate": 1.0506868706770844e-08, + "logits/chosen": 1.30352783203125, + "logits/rejected": 1.3584903478622437, + "logps/chosen": -274.08648681640625, + "logps/rejected": -292.0806884765625, + "loss": 4544.6687, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.0012903057504445314, + "rewards/margins": 0.04877272993326187, + "rewards/rejected": -0.04748242348432541, + "step": 1765 + }, + { + "debug/policy_chosen_logits": 1.0495314598083496, + "debug/policy_chosen_logps": -239.2109375, + "debug/policy_rejected_logits": 1.478896141052246, + "debug/policy_rejected_logps": -283.77838134765625, + "debug/reference_chosen_logps": -239.0229034423828, + "debug/reference_rejected_logps": -279.0099182128906, + "debug/sppo_chosen_loss": 2597.17578125, + "debug/sppo_chosen_reward_in_loss": -0.18803825974464417, + "debug/sppo_rej_reward_in_loss": -4.768446922302246, + "debug/sppo_reject_loss": 2182.498046875, + "epoch": 6.413043478260869, + "grad_norm": 77066.58960116318, + "learning_rate": 1.0279470126058676e-08, + "logits/chosen": 1.0495314598083496, + "logits/rejected": 1.478896141052246, + "logps/chosen": -239.2109375, + "logps/rejected": -283.77838134765625, + "loss": 4643.443, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -0.0018803831189870834, + "rewards/margins": 0.04580408334732056, + "rewards/rejected": -0.04768446832895279, + "step": 1770 + }, + { + "debug/policy_chosen_logits": 1.2252495288848877, + "debug/policy_chosen_logps": -279.1947326660156, + "debug/policy_rejected_logits": 1.2340757846832275, + "debug/policy_rejected_logps": -279.371826171875, + "debug/reference_chosen_logps": -280.98284912109375, + "debug/reference_rejected_logps": -277.61669921875, + "debug/sppo_chosen_loss": 2366.054443359375, + "debug/sppo_chosen_reward_in_loss": 1.7880885601043701, + "debug/sppo_rej_reward_in_loss": -1.7551319599151611, + "debug/sppo_reject_loss": 2390.520751953125, + "epoch": 6.431159420289855, + "grad_norm": 72827.20873599048, + "learning_rate": 1.0054277068212797e-08, + "logits/chosen": 1.2252495288848877, + "logits/rejected": 1.2340757846832275, + "logps/chosen": -279.1947326660156, + "logps/rejected": -279.371826171875, + "loss": 4518.8156, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": 0.017880886793136597, + "rewards/margins": 0.035432200878858566, + "rewards/rejected": -0.017551319673657417, + "step": 1775 + }, + { + "debug/policy_chosen_logits": 0.8835108876228333, + "debug/policy_chosen_logps": -271.79595947265625, + "debug/policy_rejected_logits": 1.2131328582763672, + "debug/policy_rejected_logps": -312.07208251953125, + "debug/reference_chosen_logps": -274.2127685546875, + "debug/reference_rejected_logps": -305.23321533203125, + "debug/sppo_chosen_loss": 2285.39404296875, + "debug/sppo_chosen_reward_in_loss": 2.416809558868408, + "debug/sppo_rej_reward_in_loss": -6.838896751403809, + "debug/sppo_reject_loss": 2023.9400634765625, + "epoch": 6.449275362318841, + "grad_norm": 63623.52265960445, + "learning_rate": 9.831302037307021e-09, + "logits/chosen": 0.8835108876228333, + "logits/rejected": 1.2131328582763672, + "logps/chosen": -271.79595947265625, + "logps/rejected": -312.07208251953125, + "loss": 4477.0383, + "rewards/accuracies": 0.824999988079071, + "rewards/chosen": 0.024168096482753754, + "rewards/margins": 0.09255705773830414, + "rewards/rejected": -0.06838897615671158, + "step": 1780 + }, + { + "debug/policy_chosen_logits": 1.4466286897659302, + "debug/policy_chosen_logps": -270.4122009277344, + "debug/policy_rejected_logits": 1.503780722618103, + "debug/policy_rejected_logps": -285.6905212402344, + "debug/reference_chosen_logps": -270.66192626953125, + "debug/reference_rejected_logps": -283.3195495605469, + "debug/sppo_chosen_loss": 2587.914794921875, + "debug/sppo_chosen_reward_in_loss": 0.24976272881031036, + "debug/sppo_rej_reward_in_loss": -2.3709824085235596, + "debug/sppo_reject_loss": 2394.04248046875, + "epoch": 6.467391304347826, + "grad_norm": 68260.49272500667, + "learning_rate": 9.610557414257009e-09, + "logits/chosen": 1.4466286897659302, + "logits/rejected": 1.503780722618103, + "logps/chosen": -270.4122009277344, + "logps/rejected": -285.6905212402344, + "loss": 4458.1781, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": 0.0024976269342005253, + "rewards/margins": 0.026207447052001953, + "rewards/rejected": -0.023709822446107864, + "step": 1785 + }, + { + "debug/policy_chosen_logits": 0.844222903251648, + "debug/policy_chosen_logps": -243.48812866210938, + "debug/policy_rejected_logits": 1.2014129161834717, + "debug/policy_rejected_logps": -287.10198974609375, + "debug/reference_chosen_logps": -244.35690307617188, + "debug/reference_rejected_logps": -282.8691711425781, + "debug/sppo_chosen_loss": 2543.624267578125, + "debug/sppo_chosen_reward_in_loss": 0.8687904477119446, + "debug/sppo_rej_reward_in_loss": -4.232800483703613, + "debug/sppo_reject_loss": 2225.53271484375, + "epoch": 6.4855072463768115, + "grad_norm": 71785.22277961893, + "learning_rate": 9.392055456132713e-09, + "logits/chosen": 0.844222903251648, + "logits/rejected": 1.2014129161834717, + "logps/chosen": -243.48812866210938, + "logps/rejected": -287.10198974609375, + "loss": 4567.2234, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.008687904104590416, + "rewards/margins": 0.05101591348648071, + "rewards/rejected": -0.04232800751924515, + "step": 1790 + }, + { + "debug/policy_chosen_logits": 1.136611819267273, + "debug/policy_chosen_logps": -226.44253540039062, + "debug/policy_rejected_logits": 1.2988706827163696, + "debug/policy_rejected_logps": -250.56747436523438, + "debug/reference_chosen_logps": -227.589599609375, + "debug/reference_rejected_logps": -245.1594696044922, + "debug/sppo_chosen_loss": 2425.15869140625, + "debug/sppo_chosen_reward_in_loss": 1.147066354751587, + "debug/sppo_rej_reward_in_loss": -5.408025741577148, + "debug/sppo_reject_loss": 2152.47265625, + "epoch": 6.503623188405797, + "grad_norm": 104367.46962525073, + "learning_rate": 9.175808295477849e-09, + "logits/chosen": 1.136611819267273, + "logits/rejected": 1.2988706827163696, + "logps/chosen": -226.44253540039062, + "logps/rejected": -250.56747436523438, + "loss": 4551.068, + "rewards/accuracies": 0.5249999761581421, + "rewards/chosen": 0.011470664292573929, + "rewards/margins": 0.06555091589689255, + "rewards/rejected": -0.054080259054899216, + "step": 1795 + }, + { + "debug/policy_chosen_logits": 1.1030946969985962, + "debug/policy_chosen_logps": -233.5998077392578, + "debug/policy_rejected_logits": 1.4379918575286865, + "debug/policy_rejected_logps": -271.5467529296875, + "debug/reference_chosen_logps": -234.68594360351562, + "debug/reference_rejected_logps": -270.9393005371094, + "debug/sppo_chosen_loss": 2447.8017578125, + "debug/sppo_chosen_reward_in_loss": 1.0861365795135498, + "debug/sppo_rej_reward_in_loss": -0.607455849647522, + "debug/sppo_reject_loss": 2494.67431640625, + "epoch": 6.521739130434782, + "grad_norm": 81728.7219157688, + "learning_rate": 8.961827939636196e-09, + "logits/chosen": 1.1030946969985962, + "logits/rejected": 1.4379918575286865, + "logps/chosen": -233.5998077392578, + "logps/rejected": -271.5467529296875, + "loss": 4566.6648, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": 0.010861365124583244, + "rewards/margins": 0.01693592593073845, + "rewards/rejected": -0.006074557546526194, + "step": 1800 + }, + { + "epoch": 6.521739130434782, + "eval_debug/policy_chosen_logits": 1.3974790573120117, + "eval_debug/policy_chosen_logps": -252.31051635742188, + "eval_debug/policy_rejected_logits": 1.4412792921066284, + "eval_debug/policy_rejected_logps": -263.3516845703125, + "eval_debug/reference_chosen_logps": -252.91845703125, + "eval_debug/reference_rejected_logps": -259.6585998535156, + "eval_debug/sppo_chosen_loss": 2512.918701171875, + "eval_debug/sppo_chosen_reward_in_loss": 0.6079623103141785, + "eval_debug/sppo_rej_reward_in_loss": -3.6930320262908936, + "eval_debug/sppo_reject_loss": 2304.7548828125, + "eval_logits/chosen": 1.3974790573120117, + "eval_logits/rejected": 1.4412792921066284, + "eval_logps/chosen": -252.31051635742188, + "eval_logps/rejected": -263.3516845703125, + "eval_loss": 4619.33251953125, + "eval_rewards/accuracies": 0.5921052694320679, + "eval_rewards/chosen": 0.006079623010009527, + "eval_rewards/margins": 0.04300994426012039, + "eval_rewards/rejected": -0.03693031892180443, + "eval_runtime": 28.7328, + "eval_samples_per_second": 20.882, + "eval_steps_per_second": 0.661, + "step": 1800 + }, + { + "debug/policy_chosen_logits": 1.0640919208526611, + "debug/policy_chosen_logps": -241.22659301757812, + "debug/policy_rejected_logits": 1.1182310581207275, + "debug/policy_rejected_logps": -266.4372253417969, + "debug/reference_chosen_logps": -242.4794464111328, + "debug/reference_rejected_logps": -261.7477111816406, + "debug/sppo_chosen_loss": 2412.180908203125, + "debug/sppo_chosen_reward_in_loss": 1.2528440952301025, + "debug/sppo_rej_reward_in_loss": -4.689537048339844, + "debug/sppo_reject_loss": 2180.385986328125, + "epoch": 6.539855072463768, + "grad_norm": 61740.59994954301, + "learning_rate": 8.75012627008489e-09, + "logits/chosen": 1.0640919208526611, + "logits/rejected": 1.1182310581207275, + "logps/chosen": -241.22659301757812, + "logps/rejected": -266.4372253417969, + "loss": 4483.5254, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": 0.012528439983725548, + "rewards/margins": 0.059423815459012985, + "rewards/rejected": -0.04689536988735199, + "step": 1805 + }, + { + "debug/policy_chosen_logits": 1.0891189575195312, + "debug/policy_chosen_logps": -241.63992309570312, + "debug/policy_rejected_logits": 1.606302261352539, + "debug/policy_rejected_logps": -313.29656982421875, + "debug/reference_chosen_logps": -243.9291534423828, + "debug/reference_rejected_logps": -306.9833984375, + "debug/sppo_chosen_loss": 2318.420654296875, + "debug/sppo_chosen_reward_in_loss": 2.289240837097168, + "debug/sppo_rej_reward_in_loss": -6.313161373138428, + "debug/sppo_reject_loss": 2048.82080078125, + "epoch": 6.557971014492754, + "grad_norm": 78559.58495351419, + "learning_rate": 8.540715041774716e-09, + "logits/chosen": 1.0891189575195312, + "logits/rejected": 1.606302261352539, + "logps/chosen": -241.63992309570312, + "logps/rejected": -313.29656982421875, + "loss": 4586.9969, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.022892409935593605, + "rewards/margins": 0.08602402359247208, + "rewards/rejected": -0.06313161551952362, + "step": 1810 + }, + { + "debug/policy_chosen_logits": 1.4883325099945068, + "debug/policy_chosen_logps": -271.9356384277344, + "debug/policy_rejected_logits": 1.6947778463363647, + "debug/policy_rejected_logps": -284.2402648925781, + "debug/reference_chosen_logps": -273.122802734375, + "debug/reference_rejected_logps": -281.25311279296875, + "debug/sppo_chosen_loss": 2417.36767578125, + "debug/sppo_chosen_reward_in_loss": 1.1871535778045654, + "debug/sppo_rej_reward_in_loss": -2.987131118774414, + "debug/sppo_reject_loss": 2343.622314453125, + "epoch": 6.576086956521739, + "grad_norm": 70559.05343026349, + "learning_rate": 8.333605882477334e-09, + "logits/chosen": 1.4883325099945068, + "logits/rejected": 1.6947778463363647, + "logps/chosen": -271.9356384277344, + "logps/rejected": -284.2402648925781, + "loss": 4524.1789, + "rewards/accuracies": 0.550000011920929, + "rewards/chosen": 0.011871537193655968, + "rewards/margins": 0.041742850095033646, + "rewards/rejected": -0.02987130917608738, + "step": 1815 + }, + { + "debug/policy_chosen_logits": 0.9994763135910034, + "debug/policy_chosen_logps": -232.7861785888672, + "debug/policy_rejected_logits": 1.3966903686523438, + "debug/policy_rejected_logps": -286.7142639160156, + "debug/reference_chosen_logps": -235.10836791992188, + "debug/reference_rejected_logps": -282.6846008300781, + "debug/sppo_chosen_loss": 2292.59033203125, + "debug/sppo_chosen_reward_in_loss": 2.3221702575683594, + "debug/sppo_rej_reward_in_loss": -4.029662609100342, + "debug/sppo_reject_loss": 2255.48974609375, + "epoch": 6.594202898550725, + "grad_norm": 67555.36702872877, + "learning_rate": 8.128810292139726e-09, + "logits/chosen": 0.9994763135910034, + "logits/rejected": 1.3966903686523438, + "logps/chosen": -232.7861785888672, + "logps/rejected": -286.7142639160156, + "loss": 4530.3629, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": 0.023221701383590698, + "rewards/margins": 0.06351832300424576, + "rewards/rejected": -0.04029662907123566, + "step": 1820 + }, + { + "debug/policy_chosen_logits": 0.9176052808761597, + "debug/policy_chosen_logps": -236.9879913330078, + "debug/policy_rejected_logits": 1.3363244533538818, + "debug/policy_rejected_logps": -287.5779724121094, + "debug/reference_chosen_logps": -238.054443359375, + "debug/reference_rejected_logps": -277.24664306640625, + "debug/sppo_chosen_loss": 2428.644775390625, + "debug/sppo_chosen_reward_in_loss": 1.0664472579956055, + "debug/sppo_rej_reward_in_loss": -10.331335067749023, + "debug/sppo_reject_loss": 1826.340576171875, + "epoch": 6.61231884057971, + "grad_norm": 69235.59804149473, + "learning_rate": 7.926339642245555e-09, + "logits/chosen": 0.9176052808761597, + "logits/rejected": 1.3363244533538818, + "logps/chosen": -236.9879913330078, + "logps/rejected": -287.5779724121094, + "loss": 4397.3313, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": 0.010664473287761211, + "rewards/margins": 0.113977812230587, + "rewards/rejected": -0.10331334918737411, + "step": 1825 + }, + { + "debug/policy_chosen_logits": 1.2249447107315063, + "debug/policy_chosen_logps": -242.8837432861328, + "debug/policy_rejected_logits": 1.552274465560913, + "debug/policy_rejected_logps": -280.9314880371094, + "debug/reference_chosen_logps": -245.7642822265625, + "debug/reference_rejected_logps": -274.960693359375, + "debug/sppo_chosen_loss": 2240.804443359375, + "debug/sppo_chosen_reward_in_loss": 2.880551815032959, + "debug/sppo_rej_reward_in_loss": -5.97078275680542, + "debug/sppo_reject_loss": 2090.081298828125, + "epoch": 6.630434782608695, + "grad_norm": 69381.34153206859, + "learning_rate": 7.726205175183837e-09, + "logits/chosen": 1.2249447107315063, + "logits/rejected": 1.552274465560913, + "logps/chosen": -242.8837432861328, + "logps/rejected": -280.9314880371094, + "loss": 4404.8367, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": 0.02880551852285862, + "rewards/margins": 0.08851335197687149, + "rewards/rejected": -0.05970783159136772, + "step": 1830 + }, + { + "debug/policy_chosen_logits": 1.4106062650680542, + "debug/policy_chosen_logps": -276.819091796875, + "debug/policy_rejected_logits": 1.8558547496795654, + "debug/policy_rejected_logps": -322.76226806640625, + "debug/reference_chosen_logps": -277.9795837402344, + "debug/reference_rejected_logps": -316.0045471191406, + "debug/sppo_chosen_loss": 2473.76416015625, + "debug/sppo_chosen_reward_in_loss": 1.1604499816894531, + "debug/sppo_rej_reward_in_loss": -6.757748603820801, + "debug/sppo_reject_loss": 2023.562255859375, + "epoch": 6.648550724637682, + "grad_norm": 122245.8956762465, + "learning_rate": 7.528418003624632e-09, + "logits/chosen": 1.4106062650680542, + "logits/rejected": 1.8558547496795654, + "logps/chosen": -276.819091796875, + "logps/rejected": -322.76226806640625, + "loss": 4622.6406, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": 0.011604499071836472, + "rewards/margins": 0.0791819766163826, + "rewards/rejected": -0.06757748126983643, + "step": 1835 + }, + { + "debug/policy_chosen_logits": 1.3023555278778076, + "debug/policy_chosen_logps": -254.7684326171875, + "debug/policy_rejected_logits": 1.4576895236968994, + "debug/policy_rejected_logps": -289.526123046875, + "debug/reference_chosen_logps": -258.4832458496094, + "debug/reference_rejected_logps": -281.6604309082031, + "debug/sppo_chosen_loss": 2154.77294921875, + "debug/sppo_chosen_reward_in_loss": 3.7148489952087402, + "debug/sppo_rej_reward_in_loss": -7.865678310394287, + "debug/sppo_reject_loss": 1972.273681640625, + "epoch": 6.666666666666667, + "grad_norm": 68577.4154292618, + "learning_rate": 7.332989109902027e-09, + "logits/chosen": 1.3023555278778076, + "logits/rejected": 1.4576895236968994, + "logps/chosen": -254.7684326171875, + "logps/rejected": -289.526123046875, + "loss": 4562.9859, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": 0.03714849054813385, + "rewards/margins": 0.11580528318881989, + "rewards/rejected": -0.07865677773952484, + "step": 1840 + }, + { + "debug/policy_chosen_logits": 1.1200048923492432, + "debug/policy_chosen_logps": -264.84515380859375, + "debug/policy_rejected_logits": 1.1008622646331787, + "debug/policy_rejected_logps": -254.68212890625, + "debug/reference_chosen_logps": -264.2433166503906, + "debug/reference_rejected_logps": -252.0456085205078, + "debug/sppo_chosen_loss": 2682.3505859375, + "debug/sppo_chosen_reward_in_loss": -0.6018713116645813, + "debug/sppo_rej_reward_in_loss": -2.6365625858306885, + "debug/sppo_reject_loss": 2311.68212890625, + "epoch": 6.684782608695652, + "grad_norm": 81734.77191589169, + "learning_rate": 7.139929345404355e-09, + "logits/chosen": 1.1200048923492432, + "logits/rejected": 1.1008622646331787, + "logps/chosen": -264.84515380859375, + "logps/rejected": -254.68212890625, + "loss": 4443.2492, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.006018712185323238, + "rewards/margins": 0.020346911624073982, + "rewards/rejected": -0.026365626603364944, + "step": 1845 + }, + { + "debug/policy_chosen_logits": 1.310903787612915, + "debug/policy_chosen_logps": -277.4835205078125, + "debug/policy_rejected_logits": 1.4697941541671753, + "debug/policy_rejected_logps": -327.77044677734375, + "debug/reference_chosen_logps": -276.53717041015625, + "debug/reference_rejected_logps": -322.140380859375, + "debug/sppo_chosen_loss": 2686.08984375, + "debug/sppo_chosen_reward_in_loss": -0.9463611841201782, + "debug/sppo_rej_reward_in_loss": -5.630080223083496, + "debug/sppo_reject_loss": 2116.522705078125, + "epoch": 6.702898550724638, + "grad_norm": 77714.10615841726, + "learning_rate": 6.94924942997161e-09, + "logits/chosen": 1.310903787612915, + "logits/rejected": 1.4697941541671753, + "logps/chosen": -277.4835205078125, + "logps/rejected": -327.77044677734375, + "loss": 4527.9828, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.00946361105889082, + "rewards/margins": 0.04683718457818031, + "rewards/rejected": -0.056300800293684006, + "step": 1850 + }, + { + "debug/policy_chosen_logits": 1.0288848876953125, + "debug/policy_chosen_logps": -243.25289916992188, + "debug/policy_rejected_logits": 1.0793625116348267, + "debug/policy_rejected_logps": -290.67620849609375, + "debug/reference_chosen_logps": -245.24673461914062, + "debug/reference_rejected_logps": -287.10443115234375, + "debug/sppo_chosen_loss": 2362.96923828125, + "debug/sppo_chosen_reward_in_loss": 1.9938185214996338, + "debug/sppo_rej_reward_in_loss": -3.5717475414276123, + "debug/sppo_reject_loss": 2288.716552734375, + "epoch": 6.721014492753623, + "grad_norm": 67099.74986461057, + "learning_rate": 6.760959951300266e-09, + "logits/chosen": 1.0288848876953125, + "logits/rejected": 1.0793625116348267, + "logps/chosen": -243.25289916992188, + "logps/rejected": -290.67620849609375, + "loss": 4631.593, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": 0.019938183948397636, + "rewards/margins": 0.05565565824508667, + "rewards/rejected": -0.03571747615933418, + "step": 1855 + }, + { + "debug/policy_chosen_logits": 1.067305326461792, + "debug/policy_chosen_logps": -246.3300323486328, + "debug/policy_rejected_logits": 1.253506064414978, + "debug/policy_rejected_logps": -260.63360595703125, + "debug/reference_chosen_logps": -247.5150146484375, + "debug/reference_rejected_logps": -256.20477294921875, + "debug/sppo_chosen_loss": 2444.710693359375, + "debug/sppo_chosen_reward_in_loss": 1.1849710941314697, + "debug/sppo_rej_reward_in_loss": -4.428830146789551, + "debug/sppo_reject_loss": 2232.339111328125, + "epoch": 6.739130434782608, + "grad_norm": 77074.98043159636, + "learning_rate": 6.575071364355334e-09, + "logits/chosen": 1.067305326461792, + "logits/rejected": 1.253506064414978, + "logps/chosen": -246.3300323486328, + "logps/rejected": -260.63360595703125, + "loss": 4501.8156, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": 0.01184971071779728, + "rewards/margins": 0.056138016283512115, + "rewards/rejected": -0.04428829625248909, + "step": 1860 + }, + { + "debug/policy_chosen_logits": 1.2634141445159912, + "debug/policy_chosen_logps": -268.0980529785156, + "debug/policy_rejected_logits": 1.4198873043060303, + "debug/policy_rejected_logps": -294.7967834472656, + "debug/reference_chosen_logps": -270.0035705566406, + "debug/reference_rejected_logps": -285.4292907714844, + "debug/sppo_chosen_loss": 2364.331298828125, + "debug/sppo_chosen_reward_in_loss": 1.9055078029632568, + "debug/sppo_rej_reward_in_loss": -9.367478370666504, + "debug/sppo_reject_loss": 1887.01953125, + "epoch": 6.757246376811594, + "grad_norm": 95721.87361269009, + "learning_rate": 6.3915939907899005e-09, + "logits/chosen": 1.2634141445159912, + "logits/rejected": 1.4198873043060303, + "logps/chosen": -268.0980529785156, + "logps/rejected": -294.7967834472656, + "loss": 4398.0863, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": 0.01905507780611515, + "rewards/margins": 0.11272986233234406, + "rewards/rejected": -0.09367477893829346, + "step": 1865 + }, + { + "debug/policy_chosen_logits": 1.2589980363845825, + "debug/policy_chosen_logps": -256.4156799316406, + "debug/policy_rejected_logits": 1.69875967502594, + "debug/policy_rejected_logps": -319.1440734863281, + "debug/reference_chosen_logps": -257.7935485839844, + "debug/reference_rejected_logps": -313.9791564941406, + "debug/sppo_chosen_loss": 2404.74658203125, + "debug/sppo_chosen_reward_in_loss": 1.3778616189956665, + "debug/sppo_rej_reward_in_loss": -5.164914608001709, + "debug/sppo_reject_loss": 2155.24169921875, + "epoch": 6.77536231884058, + "grad_norm": 81792.11649399805, + "learning_rate": 6.210538018371947e-09, + "logits/chosen": 1.2589980363845825, + "logits/rejected": 1.69875967502594, + "logps/chosen": -256.4156799316406, + "logps/rejected": -319.1440734863281, + "loss": 4512.4551, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": 0.013778614811599255, + "rewards/margins": 0.065427765250206, + "rewards/rejected": -0.051649145781993866, + "step": 1870 + }, + { + "debug/policy_chosen_logits": 1.120521068572998, + "debug/policy_chosen_logps": -267.41448974609375, + "debug/policy_rejected_logits": 1.344630241394043, + "debug/policy_rejected_logps": -279.906005859375, + "debug/reference_chosen_logps": -268.8659362792969, + "debug/reference_rejected_logps": -275.41595458984375, + "debug/sppo_chosen_loss": 2384.952392578125, + "debug/sppo_chosen_reward_in_loss": 1.451454520225525, + "debug/sppo_rej_reward_in_loss": -4.490046501159668, + "debug/sppo_reject_loss": 2182.372802734375, + "epoch": 6.793478260869565, + "grad_norm": 103845.9387896819, + "learning_rate": 6.031913500418706e-09, + "logits/chosen": 1.120521068572998, + "logits/rejected": 1.344630241394043, + "logps/chosen": -267.41448974609375, + "logps/rejected": -279.906005859375, + "loss": 4566.6125, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": 0.014514544978737831, + "rewards/margins": 0.05941500514745712, + "rewards/rejected": -0.04490046575665474, + "step": 1875 + }, + { + "debug/policy_chosen_logits": 1.3000844717025757, + "debug/policy_chosen_logps": -271.90667724609375, + "debug/policy_rejected_logits": 1.4914687871932983, + "debug/policy_rejected_logps": -324.59991455078125, + "debug/reference_chosen_logps": -273.9796142578125, + "debug/reference_rejected_logps": -318.5680847167969, + "debug/sppo_chosen_loss": 2341.97265625, + "debug/sppo_chosen_reward_in_loss": 2.0729565620422363, + "debug/sppo_rej_reward_in_loss": -6.031794548034668, + "debug/sppo_reject_loss": 2074.671875, + "epoch": 6.811594202898551, + "grad_norm": 67645.82685745248, + "learning_rate": 5.855730355238414e-09, + "logits/chosen": 1.3000844717025757, + "logits/rejected": 1.4914687871932983, + "logps/chosen": -271.90667724609375, + "logps/rejected": -324.59991455078125, + "loss": 4528.9207, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": 0.020729564130306244, + "rewards/margins": 0.08104751259088516, + "rewards/rejected": -0.06031794473528862, + "step": 1880 + }, + { + "debug/policy_chosen_logits": 1.1996022462844849, + "debug/policy_chosen_logps": -247.4275360107422, + "debug/policy_rejected_logits": 1.309090495109558, + "debug/policy_rejected_logps": -274.24188232421875, + "debug/reference_chosen_logps": -251.1371307373047, + "debug/reference_rejected_logps": -267.4615478515625, + "debug/sppo_chosen_loss": 2163.852783203125, + "debug/sppo_chosen_reward_in_loss": 3.7095978260040283, + "debug/sppo_rej_reward_in_loss": -6.780303001403809, + "debug/sppo_reject_loss": 2039.903564453125, + "epoch": 6.829710144927536, + "grad_norm": 78775.22927602356, + "learning_rate": 5.681998365579593e-09, + "logits/chosen": 1.1996022462844849, + "logits/rejected": 1.309090495109558, + "logps/chosen": -247.4275360107422, + "logps/rejected": -274.24188232421875, + "loss": 4415.8047, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": 0.037095971405506134, + "rewards/margins": 0.10489901155233383, + "rewards/rejected": -0.0678030326962471, + "step": 1885 + }, + { + "debug/policy_chosen_logits": 1.2602416276931763, + "debug/policy_chosen_logps": -237.3834228515625, + "debug/policy_rejected_logits": 1.4726940393447876, + "debug/policy_rejected_logps": -279.3681335449219, + "debug/reference_chosen_logps": -239.3144073486328, + "debug/reference_rejected_logps": -273.20574951171875, + "debug/sppo_chosen_loss": 2332.995849609375, + "debug/sppo_chosen_reward_in_loss": 1.931006669998169, + "debug/sppo_rej_reward_in_loss": -6.162369728088379, + "debug/sppo_reject_loss": 2098.0224609375, + "epoch": 6.8478260869565215, + "grad_norm": 63587.04625732706, + "learning_rate": 5.5107271780878875e-09, + "logits/chosen": 1.2602416276931763, + "logits/rejected": 1.4726940393447876, + "logps/chosen": -237.3834228515625, + "logps/rejected": -279.3681335449219, + "loss": 4382.6273, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": 0.01931006647646427, + "rewards/margins": 0.08093376457691193, + "rewards/rejected": -0.061623699963092804, + "step": 1890 + }, + { + "debug/policy_chosen_logits": 1.3107913732528687, + "debug/policy_chosen_logps": -238.652099609375, + "debug/policy_rejected_logits": 1.534700870513916, + "debug/policy_rejected_logps": -300.97198486328125, + "debug/reference_chosen_logps": -241.06076049804688, + "debug/reference_rejected_logps": -292.98626708984375, + "debug/sppo_chosen_loss": 2309.5693359375, + "debug/sppo_chosen_reward_in_loss": 2.4086639881134033, + "debug/sppo_rej_reward_in_loss": -7.98569393157959, + "debug/sppo_reject_loss": 1979.941650390625, + "epoch": 6.865942028985507, + "grad_norm": 66804.78505725738, + "learning_rate": 5.3419263027703665e-09, + "logits/chosen": 1.3107913732528687, + "logits/rejected": 1.534700870513916, + "logps/chosen": -238.652099609375, + "logps/rejected": -300.97198486328125, + "loss": 4361.425, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": 0.024086639285087585, + "rewards/margins": 0.1039435863494873, + "rewards/rejected": -0.07985694706439972, + "step": 1895 + }, + { + "debug/policy_chosen_logits": 1.34364652633667, + "debug/policy_chosen_logps": -250.04843139648438, + "debug/policy_rejected_logits": 1.5993636846542358, + "debug/policy_rejected_logps": -282.2890319824219, + "debug/reference_chosen_logps": -251.6648712158203, + "debug/reference_rejected_logps": -276.7724609375, + "debug/sppo_chosen_loss": 2384.414306640625, + "debug/sppo_chosen_reward_in_loss": 1.6164261102676392, + "debug/sppo_rej_reward_in_loss": -5.516600608825684, + "debug/sppo_reject_loss": 2137.204345703125, + "epoch": 6.884057971014493, + "grad_norm": 86603.97993699458, + "learning_rate": 5.175605112467529e-09, + "logits/chosen": 1.34364652633667, + "logits/rejected": 1.5993636846542358, + "logps/chosen": -250.04843139648438, + "logps/rejected": -282.2890319824219, + "loss": 4682.7492, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.016164259985089302, + "rewards/margins": 0.07133026421070099, + "rewards/rejected": -0.055166006088256836, + "step": 1900 + }, + { + "epoch": 6.884057971014493, + "eval_debug/policy_chosen_logits": 1.3966501951217651, + "eval_debug/policy_chosen_logps": -252.25791931152344, + "eval_debug/policy_rejected_logits": 1.440749168395996, + "eval_debug/policy_rejected_logps": -263.3143615722656, + "eval_debug/reference_chosen_logps": -252.91845703125, + "eval_debug/reference_rejected_logps": -259.6585998535156, + "eval_debug/sppo_chosen_loss": 2507.00537109375, + "eval_debug/sppo_chosen_reward_in_loss": 0.6605623960494995, + "eval_debug/sppo_rej_reward_in_loss": -3.655715227127075, + "eval_debug/sppo_reject_loss": 2307.52392578125, + "eval_logits/chosen": 1.3966501951217651, + "eval_logits/rejected": 1.440749168395996, + "eval_logps/chosen": -252.25791931152344, + "eval_logps/rejected": -263.3143615722656, + "eval_loss": 4616.86865234375, + "eval_rewards/accuracies": 0.5921052694320679, + "eval_rewards/chosen": 0.006605625152587891, + "eval_rewards/margins": 0.04316277801990509, + "eval_rewards/rejected": -0.0365571528673172, + "eval_runtime": 28.3163, + "eval_samples_per_second": 21.189, + "eval_steps_per_second": 0.671, + "step": 1900 + }, + { + "debug/policy_chosen_logits": 1.0572196245193481, + "debug/policy_chosen_logps": -217.22860717773438, + "debug/policy_rejected_logits": 1.4193629026412964, + "debug/policy_rejected_logps": -282.8929443359375, + "debug/reference_chosen_logps": -220.3137969970703, + "debug/reference_rejected_logps": -278.0849914550781, + "debug/sppo_chosen_loss": 2222.53515625, + "debug/sppo_chosen_reward_in_loss": 3.085196018218994, + "debug/sppo_rej_reward_in_loss": -4.807944297790527, + "debug/sppo_reject_loss": 2180.68994140625, + "epoch": 6.9021739130434785, + "grad_norm": 60658.25859471416, + "learning_rate": 5.011772842332812e-09, + "logits/chosen": 1.0572196245193481, + "logits/rejected": 1.4193629026412964, + "logps/chosen": -217.22860717773438, + "logps/rejected": -282.8929443359375, + "loss": 4505.9348, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": 0.03085196018218994, + "rewards/margins": 0.07893140614032745, + "rewards/rejected": -0.048079442232847214, + "step": 1905 + }, + { + "debug/policy_chosen_logits": 1.2437039613723755, + "debug/policy_chosen_logps": -254.41989135742188, + "debug/policy_rejected_logits": 1.584726095199585, + "debug/policy_rejected_logps": -295.80438232421875, + "debug/reference_chosen_logps": -255.3196258544922, + "debug/reference_rejected_logps": -290.5936279296875, + "debug/sppo_chosen_loss": 2462.65966796875, + "debug/sppo_chosen_reward_in_loss": 0.8997413516044617, + "debug/sppo_rej_reward_in_loss": -5.2107648849487305, + "debug/sppo_reject_loss": 2128.06787109375, + "epoch": 6.920289855072464, + "grad_norm": 70172.35876578368, + "learning_rate": 4.850438589319817e-09, + "logits/chosen": 1.2437039613723755, + "logits/rejected": 1.584726095199585, + "logps/chosen": -254.41989135742188, + "logps/rejected": -295.80438232421875, + "loss": 4634.7203, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": 0.008997412398457527, + "rewards/margins": 0.06110506132245064, + "rewards/rejected": -0.05210765078663826, + "step": 1910 + }, + { + "debug/policy_chosen_logits": 0.9675963521003723, + "debug/policy_chosen_logps": -234.487548828125, + "debug/policy_rejected_logits": 1.2166943550109863, + "debug/policy_rejected_logps": -280.05853271484375, + "debug/reference_chosen_logps": -237.55307006835938, + "debug/reference_rejected_logps": -274.2191467285156, + "debug/sppo_chosen_loss": 2232.22705078125, + "debug/sppo_chosen_reward_in_loss": 3.065514087677002, + "debug/sppo_rej_reward_in_loss": -5.839382171630859, + "debug/sppo_reject_loss": 2107.266845703125, + "epoch": 6.938405797101449, + "grad_norm": 67291.21445508106, + "learning_rate": 4.691611311677252e-09, + "logits/chosen": 0.9675963521003723, + "logits/rejected": 1.2166943550109863, + "logps/chosen": -234.487548828125, + "logps/rejected": -280.05853271484375, + "loss": 4591.3695, + "rewards/accuracies": 0.8500000238418579, + "rewards/chosen": 0.030655140057206154, + "rewards/margins": 0.08904895931482315, + "rewards/rejected": -0.05839381739497185, + "step": 1915 + }, + { + "debug/policy_chosen_logits": 1.2087125778198242, + "debug/policy_chosen_logps": -288.37445068359375, + "debug/policy_rejected_logits": 1.162536382675171, + "debug/policy_rejected_logps": -266.4842224121094, + "debug/reference_chosen_logps": -291.00299072265625, + "debug/reference_rejected_logps": -263.9441223144531, + "debug/sppo_chosen_loss": 2261.277099609375, + "debug/sppo_chosen_reward_in_loss": 2.6285316944122314, + "debug/sppo_rej_reward_in_loss": -2.540130376815796, + "debug/sppo_reject_loss": 2301.51220703125, + "epoch": 6.956521739130435, + "grad_norm": 112826.91250668064, + "learning_rate": 4.5352998284514e-09, + "logits/chosen": 1.2087125778198242, + "logits/rejected": 1.162536382675171, + "logps/chosen": -288.37445068359375, + "logps/rejected": -266.4842224121094, + "loss": 4529.3742, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.026285316795110703, + "rewards/margins": 0.05168662220239639, + "rewards/rejected": -0.02540130354464054, + "step": 1920 + }, + { + "debug/policy_chosen_logits": 1.392529845237732, + "debug/policy_chosen_logps": -265.3560485839844, + "debug/policy_rejected_logits": 1.7413336038589478, + "debug/policy_rejected_logps": -323.5859375, + "debug/reference_chosen_logps": -266.50811767578125, + "debug/reference_rejected_logps": -318.15435791015625, + "debug/sppo_chosen_loss": 2445.50390625, + "debug/sppo_chosen_reward_in_loss": 1.1520637273788452, + "debug/sppo_rej_reward_in_loss": -5.431580543518066, + "debug/sppo_reject_loss": 2126.082763671875, + "epoch": 6.97463768115942, + "grad_norm": 84091.55009534623, + "learning_rate": 4.381512818996564e-09, + "logits/chosen": 1.392529845237732, + "logits/rejected": 1.7413336038589478, + "logps/chosen": -265.3560485839844, + "logps/rejected": -323.5859375, + "loss": 4485.8617, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": 0.011520637199282646, + "rewards/margins": 0.06583644449710846, + "rewards/rejected": -0.054315805435180664, + "step": 1925 + }, + { + "debug/policy_chosen_logits": 1.2974140644073486, + "debug/policy_chosen_logps": -249.2262420654297, + "debug/policy_rejected_logits": 1.7515869140625, + "debug/policy_rejected_logps": -300.5926818847656, + "debug/reference_chosen_logps": -250.64013671875, + "debug/reference_rejected_logps": -294.30718994140625, + "debug/sppo_chosen_loss": 2388.483642578125, + "debug/sppo_chosen_reward_in_loss": 1.4139083623886108, + "debug/sppo_rej_reward_in_loss": -6.2854905128479, + "debug/sppo_reject_loss": 2088.600341796875, + "epoch": 6.992753623188406, + "grad_norm": 67447.51991794909, + "learning_rate": 4.230258822492999e-09, + "logits/chosen": 1.2974140644073486, + "logits/rejected": 1.7515869140625, + "logps/chosen": -249.2262420654297, + "logps/rejected": -300.5926818847656, + "loss": 4532.4992, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.014139083214104176, + "rewards/margins": 0.07699398696422577, + "rewards/rejected": -0.06285490095615387, + "step": 1930 + }, + { + "debug/policy_chosen_logits": 0.9810554385185242, + "debug/policy_chosen_logps": -248.074462890625, + "debug/policy_rejected_logits": 1.1785837411880493, + "debug/policy_rejected_logps": -261.30096435546875, + "debug/reference_chosen_logps": -250.2155303955078, + "debug/reference_rejected_logps": -256.85382080078125, + "debug/sppo_chosen_loss": 2329.6708984375, + "debug/sppo_chosen_reward_in_loss": 2.141080856323242, + "debug/sppo_rej_reward_in_loss": -4.447126865386963, + "debug/sppo_reject_loss": 2196.126708984375, + "epoch": 7.010869565217392, + "grad_norm": 99318.8145119645, + "learning_rate": 4.08154623747291e-09, + "logits/chosen": 0.9810554385185242, + "logits/rejected": 1.1785837411880493, + "logps/chosen": -248.074462890625, + "logps/rejected": -261.30096435546875, + "loss": 4433.907, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": 0.021410807967185974, + "rewards/margins": 0.06588207185268402, + "rewards/rejected": -0.044471271336078644, + "step": 1935 + }, + { + "debug/policy_chosen_logits": 1.334702968597412, + "debug/policy_chosen_logps": -269.0505676269531, + "debug/policy_rejected_logits": 1.4345498085021973, + "debug/policy_rejected_logps": -289.39788818359375, + "debug/reference_chosen_logps": -270.40582275390625, + "debug/reference_rejected_logps": -283.4798278808594, + "debug/sppo_chosen_loss": 2434.93115234375, + "debug/sppo_chosen_reward_in_loss": 1.3552671670913696, + "debug/sppo_rej_reward_in_loss": -5.9180684089660645, + "debug/sppo_reject_loss": 2116.15771484375, + "epoch": 7.028985507246377, + "grad_norm": 67999.63530305258, + "learning_rate": 3.935383321353974e-09, + "logits/chosen": 1.334702968597412, + "logits/rejected": 1.4345498085021973, + "logps/chosen": -269.0505676269531, + "logps/rejected": -289.39788818359375, + "loss": 4522.9328, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": 0.013552670367062092, + "rewards/margins": 0.07273335009813309, + "rewards/rejected": -0.05918068438768387, + "step": 1940 + }, + { + "debug/policy_chosen_logits": 1.1086900234222412, + "debug/policy_chosen_logps": -287.78692626953125, + "debug/policy_rejected_logits": 1.5081000328063965, + "debug/policy_rejected_logps": -317.62725830078125, + "debug/reference_chosen_logps": -288.9532165527344, + "debug/reference_rejected_logps": -312.1133117675781, + "debug/sppo_chosen_loss": 2440.03857421875, + "debug/sppo_chosen_reward_in_loss": 1.1662803888320923, + "debug/sppo_rej_reward_in_loss": -5.513950824737549, + "debug/sppo_reject_loss": 2124.861328125, + "epoch": 7.047101449275362, + "grad_norm": 115917.52539654094, + "learning_rate": 3.79177818998096e-09, + "logits/chosen": 1.1086900234222412, + "logits/rejected": 1.5081000328063965, + "logps/chosen": -287.78692626953125, + "logps/rejected": -317.62725830078125, + "loss": 4435.0016, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": 0.011662803590297699, + "rewards/margins": 0.06680230796337128, + "rewards/rejected": -0.05513950437307358, + "step": 1945 + }, + { + "debug/policy_chosen_logits": 1.0770022869110107, + "debug/policy_chosen_logps": -236.9434356689453, + "debug/policy_rejected_logits": 1.4237271547317505, + "debug/policy_rejected_logps": -279.34930419921875, + "debug/reference_chosen_logps": -237.2868194580078, + "debug/reference_rejected_logps": -273.8456726074219, + "debug/sppo_chosen_loss": 2557.1015625, + "debug/sppo_chosen_reward_in_loss": 0.3433685302734375, + "debug/sppo_rej_reward_in_loss": -5.503598213195801, + "debug/sppo_reject_loss": 2134.468017578125, + "epoch": 7.065217391304348, + "grad_norm": 118959.58159264854, + "learning_rate": 3.6507388171750085e-09, + "logits/chosen": 1.0770022869110107, + "logits/rejected": 1.4237271547317505, + "logps/chosen": -236.9434356689453, + "logps/rejected": -279.34930419921875, + "loss": 4449.3938, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": 0.0034336864482611418, + "rewards/margins": 0.05846966430544853, + "rewards/rejected": -0.055035971105098724, + "step": 1950 + }, + { + "debug/policy_chosen_logits": 1.335742473602295, + "debug/policy_chosen_logps": -261.5475158691406, + "debug/policy_rejected_logits": 1.3126236200332642, + "debug/policy_rejected_logps": -269.31817626953125, + "debug/reference_chosen_logps": -263.124755859375, + "debug/reference_rejected_logps": -263.43121337890625, + "debug/sppo_chosen_loss": 2403.69775390625, + "debug/sppo_chosen_reward_in_loss": 1.5772308111190796, + "debug/sppo_rej_reward_in_loss": -5.886963844299316, + "debug/sppo_reject_loss": 2160.141357421875, + "epoch": 7.083333333333333, + "grad_norm": 65251.16742240301, + "learning_rate": 3.512273034290897e-09, + "logits/chosen": 1.335742473602295, + "logits/rejected": 1.3126236200332642, + "logps/chosen": -261.5475158691406, + "logps/rejected": -269.31817626953125, + "loss": 4585.6008, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": 0.01577230915427208, + "rewards/margins": 0.07464194297790527, + "rewards/rejected": -0.05886963754892349, + "step": 1955 + }, + { + "debug/policy_chosen_logits": 1.319954752922058, + "debug/policy_chosen_logps": -270.43988037109375, + "debug/policy_rejected_logits": 1.3581571578979492, + "debug/policy_rejected_logps": -277.28070068359375, + "debug/reference_chosen_logps": -272.287353515625, + "debug/reference_rejected_logps": -269.27130126953125, + "debug/sppo_chosen_loss": 2363.688720703125, + "debug/sppo_chosen_reward_in_loss": 1.8474743366241455, + "debug/sppo_rej_reward_in_loss": -8.009401321411133, + "debug/sppo_reject_loss": 1959.2318115234375, + "epoch": 7.101449275362318, + "grad_norm": 79167.78203604883, + "learning_rate": 3.376388529782215e-09, + "logits/chosen": 1.319954752922058, + "logits/rejected": 1.3581571578979492, + "logps/chosen": -270.43988037109375, + "logps/rejected": -277.28070068359375, + "loss": 4485.6289, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": 0.018474742770195007, + "rewards/margins": 0.09856875240802765, + "rewards/rejected": -0.08009400218725204, + "step": 1960 + }, + { + "debug/policy_chosen_logits": 1.2650543451309204, + "debug/policy_chosen_logps": -278.1651611328125, + "debug/policy_rejected_logits": 1.6884880065917969, + "debug/policy_rejected_logps": -295.658447265625, + "debug/reference_chosen_logps": -280.9836730957031, + "debug/reference_rejected_logps": -293.024658203125, + "debug/sppo_chosen_loss": 2284.28515625, + "debug/sppo_chosen_reward_in_loss": 2.8184852600097656, + "debug/sppo_rej_reward_in_loss": -2.6337947845458984, + "debug/sppo_reject_loss": 2347.244873046875, + "epoch": 7.119565217391305, + "grad_norm": 79959.42704998671, + "learning_rate": 3.243092848774437e-09, + "logits/chosen": 1.2650543451309204, + "logits/rejected": 1.6884880065917969, + "logps/chosen": -278.1651611328125, + "logps/rejected": -295.658447265625, + "loss": 4554.273, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.02818485163152218, + "rewards/margins": 0.0545228011906147, + "rewards/rejected": -0.026337945833802223, + "step": 1965 + }, + { + "debug/policy_chosen_logits": 0.8446614146232605, + "debug/policy_chosen_logps": -239.51779174804688, + "debug/policy_rejected_logits": 1.3469539880752563, + "debug/policy_rejected_logps": -299.9848937988281, + "debug/reference_chosen_logps": -239.8334503173828, + "debug/reference_rejected_logps": -297.77490234375, + "debug/sppo_chosen_loss": 2562.301513671875, + "debug/sppo_chosen_reward_in_loss": 0.31566277146339417, + "debug/sppo_rej_reward_in_loss": -2.2100167274475098, + "debug/sppo_reject_loss": 2351.84765625, + "epoch": 7.13768115942029, + "grad_norm": 68724.14540415957, + "learning_rate": 3.1123933926459844e-09, + "logits/chosen": 0.8446614146232605, + "logits/rejected": 1.3469539880752563, + "logps/chosen": -239.51779174804688, + "logps/rejected": -299.9848937988281, + "loss": 4567.302, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": 0.0031566284596920013, + "rewards/margins": 0.025256793946027756, + "rewards/rejected": -0.022100165486335754, + "step": 1970 + }, + { + "debug/policy_chosen_logits": 1.4603184461593628, + "debug/policy_chosen_logps": -259.0393371582031, + "debug/policy_rejected_logits": 1.4730253219604492, + "debug/policy_rejected_logps": -273.2935485839844, + "debug/reference_chosen_logps": -260.7829284667969, + "debug/reference_rejected_logps": -267.289794921875, + "debug/sppo_chosen_loss": 2366.275146484375, + "debug/sppo_chosen_reward_in_loss": 1.7436144351959229, + "debug/sppo_rej_reward_in_loss": -6.003744602203369, + "debug/sppo_reject_loss": 2106.14306640625, + "epoch": 7.155797101449275, + "grad_norm": 91343.06106383547, + "learning_rate": 2.9842974186172264e-09, + "logits/chosen": 1.4603184461593628, + "logits/rejected": 1.4730253219604492, + "logps/chosen": -259.0393371582031, + "logps/rejected": -273.2935485839844, + "loss": 4616.1953, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.01743614301085472, + "rewards/margins": 0.07747358083724976, + "rewards/rejected": -0.06003744527697563, + "step": 1975 + }, + { + "debug/policy_chosen_logits": 1.137643575668335, + "debug/policy_chosen_logps": -253.3531036376953, + "debug/policy_rejected_logits": 1.4637935161590576, + "debug/policy_rejected_logps": -280.31024169921875, + "debug/reference_chosen_logps": -254.67086791992188, + "debug/reference_rejected_logps": -273.9142150878906, + "debug/sppo_chosen_loss": 2418.249755859375, + "debug/sppo_chosen_reward_in_loss": 1.317787766456604, + "debug/sppo_rej_reward_in_loss": -6.3960418701171875, + "debug/sppo_reject_loss": 2044.1129150390625, + "epoch": 7.173913043478261, + "grad_norm": 68567.35829314639, + "learning_rate": 2.8588120393475745e-09, + "logits/chosen": 1.137643575668335, + "logits/rejected": 1.4637935161590576, + "logps/chosen": -253.3531036376953, + "logps/rejected": -280.31024169921875, + "loss": 4562.6047, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.013177876360714436, + "rewards/margins": 0.07713828980922699, + "rewards/rejected": -0.06396041810512543, + "step": 1980 + }, + { + "debug/policy_chosen_logits": 1.3646475076675415, + "debug/policy_chosen_logps": -257.977294921875, + "debug/policy_rejected_logits": 1.3902546167373657, + "debug/policy_rejected_logps": -273.2683410644531, + "debug/reference_chosen_logps": -258.584716796875, + "debug/reference_rejected_logps": -268.5025634765625, + "debug/sppo_chosen_loss": 2510.2548828125, + "debug/sppo_chosen_reward_in_loss": 0.6074390411376953, + "debug/sppo_rej_reward_in_loss": -4.765748023986816, + "debug/sppo_reject_loss": 2176.346435546875, + "epoch": 7.192028985507246, + "grad_norm": 54856.54889090561, + "learning_rate": 2.7359442225404815e-09, + "logits/chosen": 1.3646475076675415, + "logits/rejected": 1.3902546167373657, + "logps/chosen": -257.977294921875, + "logps/rejected": -273.2683410644531, + "loss": 4462.4723, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": 0.006074388977140188, + "rewards/margins": 0.053731877356767654, + "rewards/rejected": -0.04765748232603073, + "step": 1985 + }, + { + "debug/policy_chosen_logits": 0.9074499011039734, + "debug/policy_chosen_logps": -222.3174285888672, + "debug/policy_rejected_logits": 1.4654042720794678, + "debug/policy_rejected_logps": -317.33892822265625, + "debug/reference_chosen_logps": -224.1483154296875, + "debug/reference_rejected_logps": -308.30426025390625, + "debug/sppo_chosen_loss": 2361.79150390625, + "debug/sppo_chosen_reward_in_loss": 1.8308719396591187, + "debug/sppo_rej_reward_in_loss": -9.034707069396973, + "debug/sppo_reject_loss": 1857.283447265625, + "epoch": 7.2101449275362315, + "grad_norm": 65083.44357543864, + "learning_rate": 2.615700790556569e-09, + "logits/chosen": 0.9074499011039734, + "logits/rejected": 1.4654042720794678, + "logps/chosen": -222.3174285888672, + "logps/rejected": -317.33892822265625, + "loss": 4340.7336, + "rewards/accuracies": 0.824999988079071, + "rewards/chosen": 0.018308719620108604, + "rewards/margins": 0.10865578800439835, + "rewards/rejected": -0.09034706652164459, + "step": 1990 + }, + { + "debug/policy_chosen_logits": 1.3254536390304565, + "debug/policy_chosen_logps": -279.67266845703125, + "debug/policy_rejected_logits": 1.475126028060913, + "debug/policy_rejected_logps": -297.07989501953125, + "debug/reference_chosen_logps": -281.09307861328125, + "debug/reference_rejected_logps": -290.8413391113281, + "debug/sppo_chosen_loss": 2406.08935546875, + "debug/sppo_chosen_reward_in_loss": 1.4204126596450806, + "debug/sppo_rej_reward_in_loss": -6.238560676574707, + "debug/sppo_reject_loss": 2045.1383056640625, + "epoch": 7.228260869565218, + "grad_norm": 63741.71873725913, + "learning_rate": 2.498088420034855e-09, + "logits/chosen": 1.3254536390304565, + "logits/rejected": 1.475126028060913, + "logps/chosen": -279.67266845703125, + "logps/rejected": -297.07989501953125, + "loss": 4379.7883, + "rewards/accuracies": 0.824999988079071, + "rewards/chosen": 0.01420412678271532, + "rewards/margins": 0.07658973336219788, + "rewards/rejected": -0.06238560751080513, + "step": 1995 + }, + { + "debug/policy_chosen_logits": 1.1463087797164917, + "debug/policy_chosen_logps": -257.5396423339844, + "debug/policy_rejected_logits": 1.3835389614105225, + "debug/policy_rejected_logps": -300.2962646484375, + "debug/reference_chosen_logps": -260.25921630859375, + "debug/reference_rejected_logps": -291.9169921875, + "debug/sppo_chosen_loss": 2253.771484375, + "debug/sppo_chosen_reward_in_loss": 2.719552516937256, + "debug/sppo_rej_reward_in_loss": -8.379258155822754, + "debug/sppo_reject_loss": 1892.1109619140625, + "epoch": 7.246376811594203, + "grad_norm": 74910.59054714411, + "learning_rate": 2.3831136415219554e-09, + "logits/chosen": 1.1463087797164917, + "logits/rejected": 1.3835389614105225, + "logps/chosen": -257.5396423339844, + "logps/rejected": -300.2962646484375, + "loss": 4486.1707, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": 0.02719552256166935, + "rewards/margins": 0.11098810285329819, + "rewards/rejected": -0.08379258215427399, + "step": 2000 + }, + { + "epoch": 7.246376811594203, + "eval_debug/policy_chosen_logits": 1.3931907415390015, + "eval_debug/policy_chosen_logps": -252.2974853515625, + "eval_debug/policy_rejected_logits": 1.4378267526626587, + "eval_debug/policy_rejected_logps": -263.42547607421875, + "eval_debug/reference_chosen_logps": -252.91845703125, + "eval_debug/reference_rejected_logps": -259.6585998535156, + "eval_debug/sppo_chosen_loss": 2509.96337890625, + "eval_debug/sppo_chosen_reward_in_loss": 0.6209712028503418, + "eval_debug/sppo_rej_reward_in_loss": -3.7668421268463135, + "eval_debug/sppo_reject_loss": 2298.52587890625, + "eval_logits/chosen": 1.3931907415390015, + "eval_logits/rejected": 1.4378267526626587, + "eval_logps/chosen": -252.2974853515625, + "eval_logps/rejected": -263.42547607421875, + "eval_loss": 4616.38916015625, + "eval_rewards/accuracies": 0.5789473652839661, + "eval_rewards/chosen": 0.006209712475538254, + "eval_rewards/margins": 0.04387813061475754, + "eval_rewards/rejected": -0.03766842186450958, + "eval_runtime": 28.3323, + "eval_samples_per_second": 21.177, + "eval_steps_per_second": 0.671, + "step": 2000 + }, + { + "debug/policy_chosen_logits": 1.070395827293396, + "debug/policy_chosen_logps": -249.1089324951172, + "debug/policy_rejected_logits": 1.4063167572021484, + "debug/policy_rejected_logps": -281.3536376953125, + "debug/reference_chosen_logps": -250.95486450195312, + "debug/reference_rejected_logps": -275.25604248046875, + "debug/sppo_chosen_loss": 2398.890869140625, + "debug/sppo_chosen_reward_in_loss": 1.8459268808364868, + "debug/sppo_rej_reward_in_loss": -6.097577095031738, + "debug/sppo_reject_loss": 2119.35888671875, + "epoch": 7.2644927536231885, + "grad_norm": 78997.13816690067, + "learning_rate": 2.2707828391095307e-09, + "logits/chosen": 1.070395827293396, + "logits/rejected": 1.4063167572021484, + "logps/chosen": -249.1089324951172, + "logps/rejected": -281.3536376953125, + "loss": 4406.457, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.018459269776940346, + "rewards/margins": 0.07943503558635712, + "rewards/rejected": -0.06097576022148132, + "step": 2005 + }, + { + "debug/policy_chosen_logits": 1.077000617980957, + "debug/policy_chosen_logps": -255.35476684570312, + "debug/policy_rejected_logits": 1.4080921411514282, + "debug/policy_rejected_logps": -269.4011535644531, + "debug/reference_chosen_logps": -256.257080078125, + "debug/reference_rejected_logps": -267.22894287109375, + "debug/sppo_chosen_loss": 2449.634765625, + "debug/sppo_chosen_reward_in_loss": 0.9023283123970032, + "debug/sppo_rej_reward_in_loss": -2.1722145080566406, + "debug/sppo_reject_loss": 2369.26220703125, + "epoch": 7.282608695652174, + "grad_norm": 64792.2688143268, + "learning_rate": 2.1611022500797495e-09, + "logits/chosen": 1.077000617980957, + "logits/rejected": 1.4080921411514282, + "logps/chosen": -255.35476684570312, + "logps/rejected": -269.4011535644531, + "loss": 4526.691, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.009023282676935196, + "rewards/margins": 0.030745428055524826, + "rewards/rejected": -0.02172214351594448, + "step": 2010 + }, + { + "debug/policy_chosen_logits": 1.070713758468628, + "debug/policy_chosen_logps": -256.7032775878906, + "debug/policy_rejected_logits": 1.3287959098815918, + "debug/policy_rejected_logps": -307.5129089355469, + "debug/reference_chosen_logps": -256.99139404296875, + "debug/reference_rejected_logps": -301.73614501953125, + "debug/sppo_chosen_loss": 2563.2890625, + "debug/sppo_chosen_reward_in_loss": 0.28814584016799927, + "debug/sppo_rej_reward_in_loss": -5.776768684387207, + "debug/sppo_reject_loss": 2124.537353515625, + "epoch": 7.300724637681159, + "grad_norm": 73436.59483542369, + "learning_rate": 2.0540779645590146e-09, + "logits/chosen": 1.070713758468628, + "logits/rejected": 1.3287959098815918, + "logps/chosen": -256.7032775878906, + "logps/rejected": -307.5129089355469, + "loss": 4503.1719, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": 0.002881459193304181, + "rewards/margins": 0.060649145394563675, + "rewards/rejected": -0.05776768922805786, + "step": 2015 + }, + { + "debug/policy_chosen_logits": 1.38528311252594, + "debug/policy_chosen_logps": -259.69317626953125, + "debug/policy_rejected_logits": 1.6116775274276733, + "debug/policy_rejected_logps": -283.51806640625, + "debug/reference_chosen_logps": -261.0834045410156, + "debug/reference_rejected_logps": -275.83831787109375, + "debug/sppo_chosen_loss": 2445.71875, + "debug/sppo_chosen_reward_in_loss": 1.3902076482772827, + "debug/sppo_rej_reward_in_loss": -7.679726600646973, + "debug/sppo_reject_loss": 1981.787109375, + "epoch": 7.318840579710145, + "grad_norm": 120383.63503334566, + "learning_rate": 1.9497159251797514e-09, + "logits/chosen": 1.38528311252594, + "logits/rejected": 1.6116775274276733, + "logps/chosen": -259.69317626953125, + "logps/rejected": -283.51806640625, + "loss": 4654.2121, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": 0.013902077451348305, + "rewards/margins": 0.09069932997226715, + "rewards/rejected": -0.07679726183414459, + "step": 2020 + }, + { + "debug/policy_chosen_logits": 1.0787150859832764, + "debug/policy_chosen_logps": -250.2095184326172, + "debug/policy_rejected_logits": 1.3025166988372803, + "debug/policy_rejected_logps": -289.91925048828125, + "debug/reference_chosen_logps": -253.705322265625, + "debug/reference_rejected_logps": -283.3659973144531, + "debug/sppo_chosen_loss": 2171.8974609375, + "debug/sppo_chosen_reward_in_loss": 3.495814800262451, + "debug/sppo_rej_reward_in_loss": -6.55324649810791, + "debug/sppo_reject_loss": 1995.900390625, + "epoch": 7.336956521739131, + "grad_norm": 84149.40866936331, + "learning_rate": 1.8480219267504537e-09, + "logits/chosen": 1.0787150859832764, + "logits/rejected": 1.3025166988372803, + "logps/chosen": -250.2095184326172, + "logps/rejected": -289.91925048828125, + "loss": 4416.7891, + "rewards/accuracies": 0.8500000238418579, + "rewards/chosen": 0.03495814651250839, + "rewards/margins": 0.10049059242010117, + "rewards/rejected": -0.06553246080875397, + "step": 2025 + }, + { + "debug/policy_chosen_logits": 1.3572648763656616, + "debug/policy_chosen_logps": -261.5050048828125, + "debug/policy_rejected_logits": 1.3145654201507568, + "debug/policy_rejected_logps": -268.80584716796875, + "debug/reference_chosen_logps": -263.184814453125, + "debug/reference_rejected_logps": -266.7096252441406, + "debug/sppo_chosen_loss": 2395.189208984375, + "debug/sppo_chosen_reward_in_loss": 1.6798057556152344, + "debug/sppo_rej_reward_in_loss": -2.0962014198303223, + "debug/sppo_reject_loss": 2409.43310546875, + "epoch": 7.355072463768116, + "grad_norm": 65808.77111899166, + "learning_rate": 1.7490016159339482e-09, + "logits/chosen": 1.3572648763656616, + "logits/rejected": 1.3145654201507568, + "logps/chosen": -261.5050048828125, + "logps/rejected": -268.80584716796875, + "loss": 4628.1609, + "rewards/accuracies": 0.574999988079071, + "rewards/chosen": 0.016798056662082672, + "rewards/margins": 0.03776007145643234, + "rewards/rejected": -0.02096201293170452, + "step": 2030 + }, + { + "debug/policy_chosen_logits": 1.0232479572296143, + "debug/policy_chosen_logps": -248.12710571289062, + "debug/policy_rejected_logits": 1.5908982753753662, + "debug/policy_rejected_logps": -293.07110595703125, + "debug/reference_chosen_logps": -248.68417358398438, + "debug/reference_rejected_logps": -287.56488037109375, + "debug/sppo_chosen_loss": 2500.882080078125, + "debug/sppo_chosen_reward_in_loss": 0.5570594668388367, + "debug/sppo_rej_reward_in_loss": -5.506226062774658, + "debug/sppo_reject_loss": 2167.27783203125, + "epoch": 7.3731884057971016, + "grad_norm": 70769.16312278254, + "learning_rate": 1.6526604909338049e-09, + "logits/chosen": 1.0232479572296143, + "logits/rejected": 1.5908982753753662, + "logps/chosen": -248.12710571289062, + "logps/rejected": -293.07110595703125, + "loss": 4486.2609, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": 0.0055705951526761055, + "rewards/margins": 0.0606328621506691, + "rewards/rejected": -0.05506226420402527, + "step": 2035 + }, + { + "debug/policy_chosen_logits": 0.9465106129646301, + "debug/policy_chosen_logps": -268.14422607421875, + "debug/policy_rejected_logits": 1.3999744653701782, + "debug/policy_rejected_logps": -315.14910888671875, + "debug/reference_chosen_logps": -269.544921875, + "debug/reference_rejected_logps": -309.2588806152344, + "debug/sppo_chosen_loss": 2454.192138671875, + "debug/sppo_chosen_reward_in_loss": 1.400636911392212, + "debug/sppo_rej_reward_in_loss": -5.890233993530273, + "debug/sppo_reject_loss": 2120.572998046875, + "epoch": 7.391304347826087, + "grad_norm": 86036.02978408146, + "learning_rate": 1.5590039011890987e-09, + "logits/chosen": 0.9465106129646301, + "logits/rejected": 1.3999744653701782, + "logps/chosen": -268.14422607421875, + "logps/rejected": -315.14910888671875, + "loss": 4478.875, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.014006366953253746, + "rewards/margins": 0.07290870696306229, + "rewards/rejected": -0.05890233442187309, + "step": 2040 + }, + { + "debug/policy_chosen_logits": 1.4603159427642822, + "debug/policy_chosen_logps": -274.74029541015625, + "debug/policy_rejected_logits": 1.5231013298034668, + "debug/policy_rejected_logps": -305.5496520996094, + "debug/reference_chosen_logps": -276.53521728515625, + "debug/reference_rejected_logps": -299.4794921875, + "debug/sppo_chosen_loss": 2367.659912109375, + "debug/sppo_chosen_reward_in_loss": 1.794926643371582, + "debug/sppo_rej_reward_in_loss": -6.070174217224121, + "debug/sppo_reject_loss": 2107.4912109375, + "epoch": 7.409420289855072, + "grad_norm": 65704.93037044462, + "learning_rate": 1.4680370470773251e-09, + "logits/chosen": 1.4603159427642822, + "logits/rejected": 1.5231013298034668, + "logps/chosen": -274.74029541015625, + "logps/rejected": -305.5496520996094, + "loss": 4405.3949, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": 0.017949264496564865, + "rewards/margins": 0.07865099608898163, + "rewards/rejected": -0.06070173904299736, + "step": 2045 + }, + { + "debug/policy_chosen_logits": 1.3314096927642822, + "debug/policy_chosen_logps": -257.7485046386719, + "debug/policy_rejected_logits": 1.426235556602478, + "debug/policy_rejected_logps": -296.75225830078125, + "debug/reference_chosen_logps": -261.3354797363281, + "debug/reference_rejected_logps": -290.55609130859375, + "debug/sppo_chosen_loss": 2178.118408203125, + "debug/sppo_chosen_reward_in_loss": 3.586970806121826, + "debug/sppo_rej_reward_in_loss": -6.196176052093506, + "debug/sppo_reject_loss": 2050.711181640625, + "epoch": 7.427536231884058, + "grad_norm": 61529.338468656344, + "learning_rate": 1.3797649796257027e-09, + "logits/chosen": 1.3314096927642822, + "logits/rejected": 1.426235556602478, + "logps/chosen": -257.7485046386719, + "logps/rejected": -296.75225830078125, + "loss": 4368.5391, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": 0.03586970642209053, + "rewards/margins": 0.09783147275447845, + "rewards/rejected": -0.06196175888180733, + "step": 2050 + }, + { + "debug/policy_chosen_logits": 1.1443208456039429, + "debug/policy_chosen_logps": -267.1307678222656, + "debug/policy_rejected_logits": 1.1701858043670654, + "debug/policy_rejected_logps": -286.821533203125, + "debug/reference_chosen_logps": -266.78277587890625, + "debug/reference_rejected_logps": -282.5542297363281, + "debug/sppo_chosen_loss": 2642.63525390625, + "debug/sppo_chosen_reward_in_loss": -0.34796142578125, + "debug/sppo_rej_reward_in_loss": -4.267295837402344, + "debug/sppo_reject_loss": 2164.36669921875, + "epoch": 7.445652173913043, + "grad_norm": 70235.45670889581, + "learning_rate": 1.2941926002306536e-09, + "logits/chosen": 1.1443208456039429, + "logits/rejected": 1.1701858043670654, + "logps/chosen": -267.1307678222656, + "logps/rejected": -286.821533203125, + "loss": 4530.5602, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.0034796136897057295, + "rewards/margins": 0.03919333964586258, + "rewards/rejected": -0.04267295449972153, + "step": 2055 + }, + { + "debug/policy_chosen_logits": 1.4272644519805908, + "debug/policy_chosen_logps": -268.73114013671875, + "debug/policy_rejected_logits": 1.7103526592254639, + "debug/policy_rejected_logps": -310.28265380859375, + "debug/reference_chosen_logps": -269.88397216796875, + "debug/reference_rejected_logps": -305.17596435546875, + "debug/sppo_chosen_loss": 2464.18701171875, + "debug/sppo_chosen_reward_in_loss": 1.152845025062561, + "debug/sppo_rej_reward_in_loss": -5.106662750244141, + "debug/sppo_reject_loss": 2134.12744140625, + "epoch": 7.463768115942029, + "grad_norm": 68845.49769601325, + "learning_rate": 1.2113246603856653e-09, + "logits/chosen": 1.4272644519805908, + "logits/rejected": 1.7103526592254639, + "logps/chosen": -268.73114013671875, + "logps/rejected": -310.28265380859375, + "loss": 4599.1586, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.011528450064361095, + "rewards/margins": 0.06259507685899734, + "rewards/rejected": -0.051066625863313675, + "step": 2060 + }, + { + "debug/policy_chosen_logits": 1.303856611251831, + "debug/policy_chosen_logps": -258.29278564453125, + "debug/policy_rejected_logits": 1.5936082601547241, + "debug/policy_rejected_logps": -294.1533203125, + "debug/reference_chosen_logps": -261.1592102050781, + "debug/reference_rejected_logps": -289.24749755859375, + "debug/sppo_chosen_loss": 2248.81298828125, + "debug/sppo_chosen_reward_in_loss": 2.8664422035217285, + "debug/sppo_rej_reward_in_loss": -4.905792236328125, + "debug/sppo_reject_loss": 2147.85693359375, + "epoch": 7.481884057971015, + "grad_norm": 65626.6858130414, + "learning_rate": 1.1311657614174907e-09, + "logits/chosen": 1.303856611251831, + "logits/rejected": 1.5936082601547241, + "logps/chosen": -258.29278564453125, + "logps/rejected": -294.1533203125, + "loss": 4520.8699, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": 0.028664419427514076, + "rewards/margins": 0.07772234827280045, + "rewards/rejected": -0.04905792325735092, + "step": 2065 + }, + { + "debug/policy_chosen_logits": 1.229323148727417, + "debug/policy_chosen_logps": -253.285888671875, + "debug/policy_rejected_logits": 1.6499292850494385, + "debug/policy_rejected_logps": -311.3282165527344, + "debug/reference_chosen_logps": -255.8767852783203, + "debug/reference_rejected_logps": -305.1612243652344, + "debug/sppo_chosen_loss": 2267.26904296875, + "debug/sppo_chosen_reward_in_loss": 2.590872049331665, + "debug/sppo_rej_reward_in_loss": -6.166988849639893, + "debug/sppo_reject_loss": 2095.09814453125, + "epoch": 7.5, + "grad_norm": 63245.6052867994, + "learning_rate": 1.0537203542306083e-09, + "logits/chosen": 1.229323148727417, + "logits/rejected": 1.6499292850494385, + "logps/chosen": -253.285888671875, + "logps/rejected": -311.3282165527344, + "loss": 4523.8332, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": 0.02590871974825859, + "rewards/margins": 0.08757860958576202, + "rewards/rejected": -0.061669886112213135, + "step": 2070 + }, + { + "debug/policy_chosen_logits": 1.3278888463974, + "debug/policy_chosen_logps": -271.868408203125, + "debug/policy_rejected_logits": 1.6005455255508423, + "debug/policy_rejected_logps": -312.808349609375, + "debug/reference_chosen_logps": -274.61627197265625, + "debug/reference_rejected_logps": -309.5181579589844, + "debug/sppo_chosen_loss": 2292.456787109375, + "debug/sppo_chosen_reward_in_loss": 2.7478396892547607, + "debug/sppo_rej_reward_in_loss": -3.2901642322540283, + "debug/sppo_reject_loss": 2260.4921875, + "epoch": 7.518115942028985, + "grad_norm": 61033.563232917964, + "learning_rate": 9.78992739060114e-10, + "logits/chosen": 1.3278888463974, + "logits/rejected": 1.6005455255508423, + "logps/chosen": -271.868408203125, + "logps/rejected": -312.808349609375, + "loss": 4432.7875, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": 0.027478396892547607, + "rewards/margins": 0.06038004159927368, + "rewards/rejected": -0.032901640981435776, + "step": 2075 + }, + { + "debug/policy_chosen_logits": 1.1519010066986084, + "debug/policy_chosen_logps": -275.2718811035156, + "debug/policy_rejected_logits": 1.0840139389038086, + "debug/policy_rejected_logps": -284.2613830566406, + "debug/reference_chosen_logps": -277.41717529296875, + "debug/reference_rejected_logps": -277.25054931640625, + "debug/sppo_chosen_loss": 2323.05712890625, + "debug/sppo_chosen_reward_in_loss": 2.145270586013794, + "debug/sppo_rej_reward_in_loss": -7.010800361633301, + "debug/sppo_reject_loss": 2043.428466796875, + "epoch": 7.536231884057971, + "grad_norm": 62756.9248243736, + "learning_rate": 9.069870652329281e-10, + "logits/chosen": 1.1519010066986084, + "logits/rejected": 1.0840139389038086, + "logps/chosen": -275.2718811035156, + "logps/rejected": -284.2613830566406, + "loss": 4596.4141, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.021452704444527626, + "rewards/margins": 0.09156069904565811, + "rewards/rejected": -0.07010800391435623, + "step": 2080 + }, + { + "debug/policy_chosen_logits": 0.9182929992675781, + "debug/policy_chosen_logps": -248.80953979492188, + "debug/policy_rejected_logits": 1.4649393558502197, + "debug/policy_rejected_logps": -299.94677734375, + "debug/reference_chosen_logps": -247.97744750976562, + "debug/reference_rejected_logps": -295.57763671875, + "debug/sppo_chosen_loss": 2668.193359375, + "debug/sppo_chosen_reward_in_loss": -0.8321117162704468, + "debug/sppo_rej_reward_in_loss": -4.3691205978393555, + "debug/sppo_reject_loss": 2219.789794921875, + "epoch": 7.554347826086957, + "grad_norm": 61876.31012711566, + "learning_rate": 8.377073309374149e-10, + "logits/chosen": 0.9182929992675781, + "logits/rejected": 1.4649393558502197, + "logps/chosen": -248.80953979492188, + "logps/rejected": -299.94677734375, + "loss": 4586.6172, + "rewards/accuracies": 0.574999988079071, + "rewards/chosen": -0.008321116678416729, + "rewards/margins": 0.03537008911371231, + "rewards/rejected": -0.04369121044874191, + "step": 2085 + }, + { + "debug/policy_chosen_logits": 0.9342246055603027, + "debug/policy_chosen_logps": -230.5325469970703, + "debug/policy_rejected_logits": 1.470979928970337, + "debug/policy_rejected_logps": -288.772705078125, + "debug/reference_chosen_logps": -233.8635711669922, + "debug/reference_rejected_logps": -281.48101806640625, + "debug/sppo_chosen_loss": 2201.9716796875, + "debug/sppo_chosen_reward_in_loss": 3.331019639968872, + "debug/sppo_rej_reward_in_loss": -7.291647434234619, + "debug/sppo_reject_loss": 1990.5560302734375, + "epoch": 7.572463768115942, + "grad_norm": 73885.90071030497, + "learning_rate": 7.711573830013584e-10, + "logits/chosen": 0.9342246055603027, + "logits/rejected": 1.470979928970337, + "logps/chosen": -230.5325469970703, + "logps/rejected": -288.772705078125, + "loss": 4508.3344, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.033310193568468094, + "rewards/margins": 0.10622666031122208, + "rewards/rejected": -0.07291646301746368, + "step": 2090 + }, + { + "debug/policy_chosen_logits": 0.804153323173523, + "debug/policy_chosen_logps": -253.0216522216797, + "debug/policy_rejected_logits": 1.2514761686325073, + "debug/policy_rejected_logps": -313.05218505859375, + "debug/reference_chosen_logps": -256.2008972167969, + "debug/reference_rejected_logps": -307.97308349609375, + "debug/sppo_chosen_loss": 2224.22412109375, + "debug/sppo_chosen_reward_in_loss": 3.1792445182800293, + "debug/sppo_rej_reward_in_loss": -5.079104900360107, + "debug/sppo_reject_loss": 2183.387939453125, + "epoch": 7.590579710144928, + "grad_norm": 99151.52697996194, + "learning_rate": 7.073409166783839e-10, + "logits/chosen": 0.804153323173523, + "logits/rejected": 1.2514761686325073, + "logps/chosen": -253.0216522216797, + "logps/rejected": -313.05218505859375, + "loss": 4469.7992, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": 0.03179244324564934, + "rewards/margins": 0.08258350193500519, + "rewards/rejected": -0.050791043788194656, + "step": 2095 + }, + { + "debug/policy_chosen_logits": 1.292790174484253, + "debug/policy_chosen_logps": -273.0116271972656, + "debug/policy_rejected_logits": 1.3900998830795288, + "debug/policy_rejected_logps": -277.5364685058594, + "debug/reference_chosen_logps": -273.92059326171875, + "debug/reference_rejected_logps": -270.35479736328125, + "debug/sppo_chosen_loss": 2474.90185546875, + "debug/sppo_chosen_reward_in_loss": 0.9089992642402649, + "debug/sppo_rej_reward_in_loss": -7.1816864013671875, + "debug/sppo_reject_loss": 2031.3232421875, + "epoch": 7.608695652173913, + "grad_norm": 74327.26454997434, + "learning_rate": 6.462614754427665e-10, + "logits/chosen": 1.292790174484253, + "logits/rejected": 1.3900998830795288, + "logps/chosen": -273.0116271972656, + "logps/rejected": -277.5364685058594, + "loss": 4477.8289, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": 0.009089991450309753, + "rewards/margins": 0.08090685307979584, + "rewards/rejected": -0.07181687653064728, + "step": 2100 + }, + { + "epoch": 7.608695652173913, + "eval_debug/policy_chosen_logits": 1.3925397396087646, + "eval_debug/policy_chosen_logps": -252.22926330566406, + "eval_debug/policy_rejected_logits": 1.436280369758606, + "eval_debug/policy_rejected_logps": -263.1951599121094, + "eval_debug/reference_chosen_logps": -252.91845703125, + "eval_debug/reference_rejected_logps": -259.6585998535156, + "eval_debug/sppo_chosen_loss": 2506.2578125, + "eval_debug/sppo_chosen_reward_in_loss": 0.6891991496086121, + "eval_debug/sppo_rej_reward_in_loss": -3.5365147590637207, + "eval_debug/sppo_reject_loss": 2318.237548828125, + "eval_logits/chosen": 1.3925397396087646, + "eval_logits/rejected": 1.436280369758606, + "eval_logps/chosen": -252.22926330566406, + "eval_logps/rejected": -263.1951599121094, + "eval_loss": 4617.22900390625, + "eval_rewards/accuracies": 0.5789473652839661, + "eval_rewards/chosen": 0.006891992408782244, + "eval_rewards/margins": 0.04225713387131691, + "eval_rewards/rejected": -0.03536514192819595, + "eval_runtime": 28.4804, + "eval_samples_per_second": 21.067, + "eval_steps_per_second": 0.667, + "step": 2100 + }, + { + "debug/policy_chosen_logits": 0.6739364266395569, + "debug/policy_chosen_logps": -230.30337524414062, + "debug/policy_rejected_logits": 0.9180151224136353, + "debug/policy_rejected_logps": -246.10037231445312, + "debug/reference_chosen_logps": -233.5577392578125, + "debug/reference_rejected_logps": -242.1542510986328, + "debug/sppo_chosen_loss": 2197.70849609375, + "debug/sppo_chosen_reward_in_loss": 3.254357099533081, + "debug/sppo_rej_reward_in_loss": -3.946110486984253, + "debug/sppo_reject_loss": 2230.728515625, + "epoch": 7.6268115942028984, + "grad_norm": 64996.731517441374, + "learning_rate": 5.879224507926661e-10, + "logits/chosen": 0.6739364266395569, + "logits/rejected": 0.9180151224136353, + "logps/chosen": -230.30337524414062, + "logps/rejected": -246.10037231445312, + "loss": 4344.8258, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": 0.032543569803237915, + "rewards/margins": 0.07200466096401215, + "rewards/rejected": -0.039461102336645126, + "step": 2105 + }, + { + "debug/policy_chosen_logits": 1.2277402877807617, + "debug/policy_chosen_logps": -256.14288330078125, + "debug/policy_rejected_logits": 1.379900574684143, + "debug/policy_rejected_logps": -274.8895568847656, + "debug/reference_chosen_logps": -259.5997619628906, + "debug/reference_rejected_logps": -270.292724609375, + "debug/sppo_chosen_loss": 2177.25146484375, + "debug/sppo_chosen_reward_in_loss": 3.4568793773651123, + "debug/sppo_rej_reward_in_loss": -4.596831321716309, + "debug/sppo_reject_loss": 2206.194580078125, + "epoch": 7.644927536231884, + "grad_norm": 71585.54175772157, + "learning_rate": 5.323270820618398e-10, + "logits/chosen": 1.2277402877807617, + "logits/rejected": 1.379900574684143, + "logps/chosen": -256.14288330078125, + "logps/rejected": -274.8895568847656, + "loss": 4494.991, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": 0.03456879034638405, + "rewards/margins": 0.0805370956659317, + "rewards/rejected": -0.04596831649541855, + "step": 2110 + }, + { + "debug/policy_chosen_logits": 1.0671769380569458, + "debug/policy_chosen_logps": -243.13125610351562, + "debug/policy_rejected_logits": 1.1685346364974976, + "debug/policy_rejected_logps": -274.372802734375, + "debug/reference_chosen_logps": -243.8218231201172, + "debug/reference_rejected_logps": -267.530517578125, + "debug/sppo_chosen_loss": 2521.191162109375, + "debug/sppo_chosen_reward_in_loss": 0.6905729174613953, + "debug/sppo_rej_reward_in_loss": -6.842259407043457, + "debug/sppo_reject_loss": 2061.65771484375, + "epoch": 7.663043478260869, + "grad_norm": 103906.16698210256, + "learning_rate": 4.794784562397458e-10, + "logits/chosen": 1.0671769380569458, + "logits/rejected": 1.1685346364974976, + "logps/chosen": -243.13125610351562, + "logps/rejected": -274.372802734375, + "loss": 4520.9984, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": 0.006905729416757822, + "rewards/margins": 0.07532832771539688, + "rewards/rejected": -0.0684226006269455, + "step": 2115 + }, + { + "debug/policy_chosen_logits": 0.9971033334732056, + "debug/policy_chosen_logps": -227.75521850585938, + "debug/policy_rejected_logits": 1.3219413757324219, + "debug/policy_rejected_logps": -302.8537292480469, + "debug/reference_chosen_logps": -229.51358032226562, + "debug/reference_rejected_logps": -298.8193359375, + "debug/sppo_chosen_loss": 2390.484619140625, + "debug/sppo_chosen_reward_in_loss": 1.7583458423614502, + "debug/sppo_rej_reward_in_loss": -4.034407615661621, + "debug/sppo_reject_loss": 2209.649169921875, + "epoch": 7.681159420289855, + "grad_norm": 92487.56140396521, + "learning_rate": 4.293795078001317e-10, + "logits/chosen": 0.9971033334732056, + "logits/rejected": 1.3219413757324219, + "logps/chosen": -227.75521850585938, + "logps/rejected": -302.8537292480469, + "loss": 4604.8105, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.0175834558904171, + "rewards/margins": 0.057927537709474564, + "rewards/rejected": -0.040344081819057465, + "step": 2120 + }, + { + "debug/policy_chosen_logits": 0.9510968923568726, + "debug/policy_chosen_logps": -225.7450714111328, + "debug/policy_rejected_logits": 1.346861720085144, + "debug/policy_rejected_logps": -291.18988037109375, + "debug/reference_chosen_logps": -229.05801391601562, + "debug/reference_rejected_logps": -284.1490478515625, + "debug/sppo_chosen_loss": 2200.169921875, + "debug/sppo_chosen_reward_in_loss": 3.3129355907440186, + "debug/sppo_rej_reward_in_loss": -7.0408034324646, + "debug/sppo_reject_loss": 2031.1324462890625, + "epoch": 7.699275362318841, + "grad_norm": 78902.88545959517, + "learning_rate": 3.8203301853813594e-10, + "logits/chosen": 0.9510968923568726, + "logits/rejected": 1.346861720085144, + "logps/chosen": -225.7450714111328, + "logps/rejected": -291.18988037109375, + "loss": 4466.5586, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": 0.03312935680150986, + "rewards/margins": 0.10353739559650421, + "rewards/rejected": -0.07040803134441376, + "step": 2125 + }, + { + "debug/policy_chosen_logits": 1.301474690437317, + "debug/policy_chosen_logps": -252.5749969482422, + "debug/policy_rejected_logits": 1.6346708536148071, + "debug/policy_rejected_logps": -284.538330078125, + "debug/reference_chosen_logps": -253.86953735351562, + "debug/reference_rejected_logps": -282.350830078125, + "debug/sppo_chosen_loss": 2424.56005859375, + "debug/sppo_chosen_reward_in_loss": 1.2945072650909424, + "debug/sppo_rej_reward_in_loss": -2.187539577484131, + "debug/sppo_reject_loss": 2352.7392578125, + "epoch": 7.717391304347826, + "grad_norm": 64909.281348993565, + "learning_rate": 3.3744161741577905e-10, + "logits/chosen": 1.301474690437317, + "logits/rejected": 1.6346708536148071, + "logps/chosen": -252.5749969482422, + "logps/rejected": -284.538330078125, + "loss": 4599.1926, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": 0.01294507272541523, + "rewards/margins": 0.034820470958948135, + "rewards/rejected": -0.021875392645597458, + "step": 2130 + }, + { + "debug/policy_chosen_logits": 1.1287482976913452, + "debug/policy_chosen_logps": -258.11572265625, + "debug/policy_rejected_logits": 1.5572656393051147, + "debug/policy_rejected_logps": -299.67730712890625, + "debug/reference_chosen_logps": -259.0867004394531, + "debug/reference_rejected_logps": -294.621826171875, + "debug/sppo_chosen_loss": 2464.80810546875, + "debug/sppo_chosen_reward_in_loss": 0.970924973487854, + "debug/sppo_rej_reward_in_loss": -5.055464744567871, + "debug/sppo_reject_loss": 2166.364501953125, + "epoch": 7.7355072463768115, + "grad_norm": 67019.5485001634, + "learning_rate": 2.956077804160184e-10, + "logits/chosen": 1.1287482976913452, + "logits/rejected": 1.5572656393051147, + "logps/chosen": -258.11572265625, + "logps/rejected": -299.67730712890625, + "loss": 4541.4832, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": 0.009709248319268227, + "rewards/margins": 0.060263894498348236, + "rewards/rejected": -0.05055464431643486, + "step": 2135 + }, + { + "debug/policy_chosen_logits": 1.219551920890808, + "debug/policy_chosen_logps": -267.6524963378906, + "debug/policy_rejected_logits": 1.5995728969573975, + "debug/policy_rejected_logps": -285.8074951171875, + "debug/reference_chosen_logps": -269.52984619140625, + "debug/reference_rejected_logps": -281.32818603515625, + "debug/sppo_chosen_loss": 2368.73486328125, + "debug/sppo_chosen_reward_in_loss": 1.8773447275161743, + "debug/sppo_rej_reward_in_loss": -4.479327201843262, + "debug/sppo_reject_loss": 2177.294677734375, + "epoch": 7.753623188405797, + "grad_norm": 151513.1334928217, + "learning_rate": 2.5653383040524224e-10, + "logits/chosen": 1.219551920890808, + "logits/rejected": 1.5995728969573975, + "logps/chosen": -267.6524963378906, + "logps/rejected": -285.8074951171875, + "loss": 4615.0719, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.01877344772219658, + "rewards/margins": 0.06356671452522278, + "rewards/rejected": -0.0447932705283165, + "step": 2140 + }, + { + "debug/policy_chosen_logits": 1.3132517337799072, + "debug/policy_chosen_logps": -262.13079833984375, + "debug/policy_rejected_logits": 1.3344939947128296, + "debug/policy_rejected_logps": -284.14874267578125, + "debug/reference_chosen_logps": -263.3422546386719, + "debug/reference_rejected_logps": -277.1472473144531, + "debug/sppo_chosen_loss": 2454.830810546875, + "debug/sppo_chosen_reward_in_loss": 1.2114683389663696, + "debug/sppo_rej_reward_in_loss": -7.001499176025391, + "debug/sppo_reject_loss": 1998.1826171875, + "epoch": 7.771739130434782, + "grad_norm": 62465.41863525179, + "learning_rate": 2.202219370043168e-10, + "logits/chosen": 1.3132517337799072, + "logits/rejected": 1.3344939947128296, + "logps/chosen": -262.13079833984375, + "logps/rejected": -284.14874267578125, + "loss": 4556.0914, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": 0.012114683166146278, + "rewards/margins": 0.08212967216968536, + "rewards/rejected": -0.07001499086618423, + "step": 2145 + }, + { + "debug/policy_chosen_logits": 0.9999769330024719, + "debug/policy_chosen_logps": -222.53140258789062, + "debug/policy_rejected_logits": 1.1089446544647217, + "debug/policy_rejected_logps": -247.6454315185547, + "debug/reference_chosen_logps": -225.07754516601562, + "debug/reference_rejected_logps": -243.9304962158203, + "debug/sppo_chosen_loss": 2308.60986328125, + "debug/sppo_chosen_reward_in_loss": 2.546154737472534, + "debug/sppo_rej_reward_in_loss": -3.7149219512939453, + "debug/sppo_reject_loss": 2277.2587890625, + "epoch": 7.789855072463768, + "grad_norm": 66491.83051664101, + "learning_rate": 1.866741164680996e-10, + "logits/chosen": 0.9999769330024719, + "logits/rejected": 1.1089446544647217, + "logps/chosen": -222.53140258789062, + "logps/rejected": -247.6454315185547, + "loss": 4427.7336, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.025461548939347267, + "rewards/margins": 0.06261076033115387, + "rewards/rejected": -0.03714922070503235, + "step": 2150 + }, + { + "debug/policy_chosen_logits": 1.0805193185806274, + "debug/policy_chosen_logps": -241.43936157226562, + "debug/policy_rejected_logits": 1.2052781581878662, + "debug/policy_rejected_logps": -294.7074279785156, + "debug/reference_chosen_logps": -242.5572967529297, + "debug/reference_rejected_logps": -286.7196960449219, + "debug/sppo_chosen_loss": 2461.002197265625, + "debug/sppo_chosen_reward_in_loss": 1.1179357767105103, + "debug/sppo_rej_reward_in_loss": -7.987711429595947, + "debug/sppo_reject_loss": 1928.757080078125, + "epoch": 7.807971014492754, + "grad_norm": 60872.43048458311, + "learning_rate": 1.5589223157347896e-10, + "logits/chosen": 1.0805193185806274, + "logits/rejected": 1.2052781581878662, + "logps/chosen": -241.43936157226562, + "logps/rejected": -294.7074279785156, + "loss": 4587.5289, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.011179356835782528, + "rewards/margins": 0.0910564661026001, + "rewards/rejected": -0.079877108335495, + "step": 2155 + }, + { + "debug/policy_chosen_logits": 1.1456208229064941, + "debug/policy_chosen_logps": -230.3699951171875, + "debug/policy_rejected_logits": 1.9213443994522095, + "debug/policy_rejected_logps": -306.39056396484375, + "debug/reference_chosen_logps": -232.04678344726562, + "debug/reference_rejected_logps": -300.45745849609375, + "debug/sppo_chosen_loss": 2416.55029296875, + "debug/sppo_chosen_reward_in_loss": 1.6767866611480713, + "debug/sppo_rej_reward_in_loss": -5.933084487915039, + "debug/sppo_reject_loss": 2082.762939453125, + "epoch": 7.826086956521739, + "grad_norm": 64496.71112261495, + "learning_rate": 1.2787799151596224e-10, + "logits/chosen": 1.1456208229064941, + "logits/rejected": 1.9213443994522095, + "logps/chosen": -230.3699951171875, + "logps/rejected": -306.39056396484375, + "loss": 4590.8188, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": 0.016767865046858788, + "rewards/margins": 0.07609870284795761, + "rewards/rejected": -0.05933083966374397, + "step": 2160 + }, + { + "debug/policy_chosen_logits": 1.24051833152771, + "debug/policy_chosen_logps": -268.8450622558594, + "debug/policy_rejected_logits": 1.2047107219696045, + "debug/policy_rejected_logps": -266.3825378417969, + "debug/reference_chosen_logps": -269.70306396484375, + "debug/reference_rejected_logps": -261.1277160644531, + "debug/sppo_chosen_loss": 2478.1025390625, + "debug/sppo_chosen_reward_in_loss": 0.8580325841903687, + "debug/sppo_rej_reward_in_loss": -5.254827976226807, + "debug/sppo_reject_loss": 2119.91357421875, + "epoch": 7.844202898550725, + "grad_norm": 82175.45595980022, + "learning_rate": 1.0263295181475174e-10, + "logits/chosen": 1.24051833152771, + "logits/rejected": 1.2047107219696045, + "logps/chosen": -268.8450622558594, + "logps/rejected": -266.3825378417969, + "loss": 4478.3102, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": 0.008580325171351433, + "rewards/margins": 0.06112860515713692, + "rewards/rejected": -0.052548278123140335, + "step": 2165 + }, + { + "debug/policy_chosen_logits": 1.0681052207946777, + "debug/policy_chosen_logps": -255.635498046875, + "debug/policy_rejected_logits": 1.264644742012024, + "debug/policy_rejected_logps": -285.5237731933594, + "debug/reference_chosen_logps": -258.68817138671875, + "debug/reference_rejected_logps": -281.90374755859375, + "debug/sppo_chosen_loss": 2218.684326171875, + "debug/sppo_chosen_reward_in_loss": 3.0526726245880127, + "debug/sppo_rej_reward_in_loss": -3.6200504302978516, + "debug/sppo_reject_loss": 2281.26806640625, + "epoch": 7.86231884057971, + "grad_norm": 63881.06500488675, + "learning_rate": 8.015851422638053e-11, + "logits/chosen": 1.0681052207946777, + "logits/rejected": 1.264644742012024, + "logps/chosen": -255.635498046875, + "logps/rejected": -285.5237731933594, + "loss": 4553.4793, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": 0.030526721850037575, + "rewards/margins": 0.06672722846269608, + "rewards/rejected": -0.03620050102472305, + "step": 2170 + }, + { + "debug/policy_chosen_logits": 1.361900806427002, + "debug/policy_chosen_logps": -259.09490966796875, + "debug/policy_rejected_logits": 1.4846298694610596, + "debug/policy_rejected_logps": -282.2530212402344, + "debug/reference_chosen_logps": -259.996826171875, + "debug/reference_rejected_logps": -276.11199951171875, + "debug/sppo_chosen_loss": 2474.354248046875, + "debug/sppo_chosen_reward_in_loss": 0.9018945693969727, + "debug/sppo_rej_reward_in_loss": -6.1409759521484375, + "debug/sppo_reject_loss": 2052.828857421875, + "epoch": 7.880434782608695, + "grad_norm": 79371.0356298821, + "learning_rate": 6.045592666688581e-11, + "logits/chosen": 1.361900806427002, + "logits/rejected": 1.4846298694610596, + "logps/chosen": -259.09490966796875, + "logps/rejected": -282.2530212402344, + "loss": 4502.0855, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": 0.009018944576382637, + "rewards/margins": 0.07042870670557022, + "rewards/rejected": -0.061409760266542435, + "step": 2175 + }, + { + "debug/policy_chosen_logits": 1.0065410137176514, + "debug/policy_chosen_logps": -260.93023681640625, + "debug/policy_rejected_logits": 1.2652740478515625, + "debug/policy_rejected_logps": -291.34326171875, + "debug/reference_chosen_logps": -263.2150573730469, + "debug/reference_rejected_logps": -288.1213684082031, + "debug/sppo_chosen_loss": 2298.703857421875, + "debug/sppo_chosen_reward_in_loss": 2.2848479747772217, + "debug/sppo_rej_reward_in_loss": -3.221867322921753, + "debug/sppo_reject_loss": 2276.685546875, + "epoch": 7.898550724637682, + "grad_norm": 78849.01149914775, + "learning_rate": 4.352628314249762e-11, + "logits/chosen": 1.0065410137176514, + "logits/rejected": 1.2652740478515625, + "logps/chosen": -260.93023681640625, + "logps/rejected": -291.34326171875, + "loss": 4521.1391, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": 0.022848479449748993, + "rewards/margins": 0.05506715923547745, + "rewards/rejected": -0.03221867233514786, + "step": 2180 + }, + { + "debug/policy_chosen_logits": 1.3481090068817139, + "debug/policy_chosen_logps": -265.5303955078125, + "debug/policy_rejected_logits": 1.5380135774612427, + "debug/policy_rejected_logps": -304.78790283203125, + "debug/reference_chosen_logps": -267.88934326171875, + "debug/reference_rejected_logps": -299.74700927734375, + "debug/sppo_chosen_loss": 2301.35498046875, + "debug/sppo_chosen_reward_in_loss": 2.3589279651641846, + "debug/sppo_rej_reward_in_loss": -5.040875434875488, + "debug/sppo_reject_loss": 2140.15966796875, + "epoch": 7.916666666666667, + "grad_norm": 65384.04338134149, + "learning_rate": 2.9370523688915237e-11, + "logits/chosen": 1.3481090068817139, + "logits/rejected": 1.5380135774612427, + "logps/chosen": -265.5303955078125, + "logps/rejected": -304.78790283203125, + "loss": 4426.9734, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": 0.023589277639985085, + "rewards/margins": 0.07399802654981613, + "rewards/rejected": -0.050408750772476196, + "step": 2185 + }, + { + "debug/policy_chosen_logits": 0.7700341939926147, + "debug/policy_chosen_logps": -227.670654296875, + "debug/policy_rejected_logits": 1.2262274026870728, + "debug/policy_rejected_logps": -317.4136657714844, + "debug/reference_chosen_logps": -231.3727569580078, + "debug/reference_rejected_logps": -307.95062255859375, + "debug/sppo_chosen_loss": 2154.80126953125, + "debug/sppo_chosen_reward_in_loss": 3.7021331787109375, + "debug/sppo_rej_reward_in_loss": -9.463071823120117, + "debug/sppo_reject_loss": 1801.487060546875, + "epoch": 7.934782608695652, + "grad_norm": 115608.79716862518, + "learning_rate": 1.7989434319093387e-11, + "logits/chosen": 0.7700341939926147, + "logits/rejected": 1.2262274026870728, + "logps/chosen": -227.670654296875, + "logps/rejected": -317.4136657714844, + "loss": 4496.9297, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": 0.03702133148908615, + "rewards/margins": 0.13165204226970673, + "rewards/rejected": -0.09463071078062057, + "step": 2190 + }, + { + "debug/policy_chosen_logits": 1.2976726293563843, + "debug/policy_chosen_logps": -283.23236083984375, + "debug/policy_rejected_logits": 1.4259979724884033, + "debug/policy_rejected_logps": -286.2864685058594, + "debug/reference_chosen_logps": -283.5812072753906, + "debug/reference_rejected_logps": -284.0365905761719, + "debug/sppo_chosen_loss": 2530.442626953125, + "debug/sppo_chosen_reward_in_loss": 0.3488399386405945, + "debug/sppo_rej_reward_in_loss": -2.249875545501709, + "debug/sppo_reject_loss": 2347.236083984375, + "epoch": 7.952898550724638, + "grad_norm": 103819.46468325114, + "learning_rate": 9.38364697961047e-12, + "logits/chosen": 1.2976726293563843, + "logits/rejected": 1.4259979724884033, + "logps/chosen": -283.23236083984375, + "logps/rejected": -286.2864685058594, + "loss": 4557.3523, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": 0.003488400485366583, + "rewards/margins": 0.025987153872847557, + "rewards/rejected": -0.022498754784464836, + "step": 2195 + }, + { + "debug/policy_chosen_logits": 1.145102620124817, + "debug/policy_chosen_logps": -264.08642578125, + "debug/policy_rejected_logits": 1.5146633386611938, + "debug/policy_rejected_logps": -315.19097900390625, + "debug/reference_chosen_logps": -266.16912841796875, + "debug/reference_rejected_logps": -312.854248046875, + "debug/sppo_chosen_loss": 2318.06494140625, + "debug/sppo_chosen_reward_in_loss": 2.082681179046631, + "debug/sppo_rej_reward_in_loss": -2.3367526531219482, + "debug/sppo_reject_loss": 2359.71044921875, + "epoch": 7.971014492753623, + "grad_norm": 102200.71903476924, + "learning_rate": 3.5536395155744138e-12, + "logits/chosen": 1.145102620124817, + "logits/rejected": 1.5146633386611938, + "logps/chosen": -264.08642578125, + "logps/rejected": -315.19097900390625, + "loss": 4520.1934, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": 0.020826810970902443, + "rewards/margins": 0.044194333255290985, + "rewards/rejected": -0.02336752787232399, + "step": 2200 + }, + { + "epoch": 7.971014492753623, + "eval_debug/policy_chosen_logits": 1.3941218852996826, + "eval_debug/policy_chosen_logps": -252.2310333251953, + "eval_debug/policy_rejected_logits": 1.437137246131897, + "eval_debug/policy_rejected_logps": -263.2242431640625, + "eval_debug/reference_chosen_logps": -252.91845703125, + "eval_debug/reference_rejected_logps": -259.6585998535156, + "eval_debug/sppo_chosen_loss": 2507.325927734375, + "eval_debug/sppo_chosen_reward_in_loss": 0.6874253153800964, + "eval_debug/sppo_rej_reward_in_loss": -3.5656445026397705, + "eval_debug/sppo_reject_loss": 2312.91162109375, + "eval_logits/chosen": 1.3941218852996826, + "eval_logits/rejected": 1.437137246131897, + "eval_logps/chosen": -252.2310333251953, + "eval_logps/rejected": -263.2242431640625, + "eval_loss": 4613.583984375, + "eval_rewards/accuracies": 0.6052631735801697, + "eval_rewards/chosen": 0.006874253042042255, + "eval_rewards/margins": 0.04253069683909416, + "eval_rewards/rejected": -0.03565644472837448, + "eval_runtime": 28.5358, + "eval_samples_per_second": 21.026, + "eval_steps_per_second": 0.666, + "step": 2200 + }, + { + "debug/policy_chosen_logits": 1.3107990026474, + "debug/policy_chosen_logps": -250.88546752929688, + "debug/policy_rejected_logits": 1.5912861824035645, + "debug/policy_rejected_logps": -305.1792907714844, + "debug/reference_chosen_logps": -252.6299285888672, + "debug/reference_rejected_logps": -301.18548583984375, + "debug/sppo_chosen_loss": 2382.531982421875, + "debug/sppo_chosen_reward_in_loss": 1.7444469928741455, + "debug/sppo_rej_reward_in_loss": -3.993786573410034, + "debug/sppo_reject_loss": 2213.762939453125, + "epoch": 7.989130434782608, + "grad_norm": 65636.56438717897, + "learning_rate": 4.997356440772371e-13, + "logits/chosen": 1.3107990026474, + "logits/rejected": 1.5912861824035645, + "logps/chosen": -250.88546752929688, + "logps/rejected": -305.1792907714844, + "loss": 4429.5063, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": 0.017444469034671783, + "rewards/margins": 0.057382334023714066, + "rewards/rejected": -0.03993786498904228, + "step": 2205 + }, + { + "epoch": 8.0, + "step": 2208, + "total_flos": 0.0, + "train_loss": 4636.829430179319, + "train_runtime": 15508.5423, + "train_samples_per_second": 9.092, + "train_steps_per_second": 0.142 + } + ], + "logging_steps": 5, + "max_steps": 2208, + "num_input_tokens_seen": 0, + "num_train_epochs": 8, + "save_steps": 500, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": true + }, + "attributes": {} + } + }, + "total_flos": 0.0, + "train_batch_size": 8, + "trial_name": null, + "trial_params": null +}