{ "best_metric": null, "best_model_checkpoint": null, "epoch": 8.0, "eval_steps": 100, "global_step": 2208, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "debug/policy_chosen_logits": 0.8079685568809509, "debug/policy_chosen_logps": -298.0812683105469, "debug/policy_rejected_logits": 0.6268295645713806, "debug/policy_rejected_logps": -240.20742797851562, "debug/reference_chosen_logps": -298.0812683105469, "debug/reference_rejected_logps": -240.20742797851562, "debug/sppo_chosen_loss": 2500.0, "debug/sppo_chosen_reward_in_loss": 0.0, "debug/sppo_rej_reward_in_loss": 0.0, "debug/sppo_reject_loss": 2500.0, "epoch": 0.0036231884057971015, "grad_norm": 63517.94720887712, "learning_rate": 1e-09, "logits/chosen": 0.8079685568809509, "logits/rejected": 0.6268295645713806, "logps/chosen": -298.0812683105469, "logps/rejected": -240.20742797851562, "loss": 5000.0, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 1 }, { "debug/policy_chosen_logits": 1.1307491064071655, "debug/policy_chosen_logps": -262.5867614746094, "debug/policy_rejected_logits": 1.3688106536865234, "debug/policy_rejected_logps": -295.9023742675781, "debug/reference_chosen_logps": -262.58026123046875, "debug/reference_rejected_logps": -296.0579528808594, "debug/sppo_chosen_loss": 2501.32470703125, "debug/sppo_chosen_reward_in_loss": -0.006488680839538574, "debug/sppo_rej_reward_in_loss": 0.15558385848999023, "debug/sppo_reject_loss": 2515.9296875, "epoch": 0.018115942028985508, "grad_norm": 58736.14839713635, "learning_rate": 5e-09, "logits/chosen": 1.1307491064071655, "logits/rejected": 1.3688106536865234, "logps/chosen": -262.5867614746094, "logps/rejected": -295.9023742675781, "loss": 4991.5791, "rewards/accuracies": 0.34375, "rewards/chosen": -6.488675717264414e-05, "rewards/margins": -0.0016207253793254495, "rewards/rejected": 0.0015558383893221617, "step": 5 }, { "debug/policy_chosen_logits": 1.3411222696304321, "debug/policy_chosen_logps": -282.69219970703125, "debug/policy_rejected_logits": 1.611016869544983, "debug/policy_rejected_logps": -287.430908203125, "debug/reference_chosen_logps": -282.7684631347656, "debug/reference_rejected_logps": -287.81396484375, "debug/sppo_chosen_loss": 2493.212646484375, "debug/sppo_chosen_reward_in_loss": 0.07623787224292755, "debug/sppo_rej_reward_in_loss": 0.38306236267089844, "debug/sppo_reject_loss": 2539.445556640625, "epoch": 0.036231884057971016, "grad_norm": 68115.07386767172, "learning_rate": 1e-08, "logits/chosen": 1.3411222696304321, "logits/rejected": 1.611016869544983, "logps/chosen": -282.69219970703125, "logps/rejected": -287.430908203125, "loss": 5009.1367, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": 0.0007623785641044378, "rewards/margins": -0.0030682452488690615, "rewards/rejected": 0.0038306235801428556, "step": 10 }, { "debug/policy_chosen_logits": 1.2816027402877808, "debug/policy_chosen_logps": -247.15579223632812, "debug/policy_rejected_logits": 1.63693368434906, "debug/policy_rejected_logps": -285.6268615722656, "debug/reference_chosen_logps": -247.40646362304688, "debug/reference_rejected_logps": -286.155029296875, "debug/sppo_chosen_loss": 2475.64404296875, "debug/sppo_chosen_reward_in_loss": 0.25067728757858276, "debug/sppo_rej_reward_in_loss": 0.5281627774238586, "debug/sppo_reject_loss": 2554.003662109375, "epoch": 0.05434782608695652, "grad_norm": 54977.353764878644, "learning_rate": 1.5e-08, "logits/chosen": 1.2816027402877808, "logits/rejected": 1.63693368434906, "logps/chosen": -247.15579223632812, "logps/rejected": -285.6268615722656, "loss": 5003.7672, "rewards/accuracies": 0.42500001192092896, "rewards/chosen": 0.0025067729875445366, "rewards/margins": -0.0027748546563088894, "rewards/rejected": 0.005281627178192139, "step": 15 }, { "debug/policy_chosen_logits": 1.2541359663009644, "debug/policy_chosen_logps": -275.11187744140625, "debug/policy_rejected_logits": 1.6011371612548828, "debug/policy_rejected_logps": -277.25579833984375, "debug/reference_chosen_logps": -275.2658386230469, "debug/reference_rejected_logps": -277.7287292480469, "debug/sppo_chosen_loss": 2485.12744140625, "debug/sppo_chosen_reward_in_loss": 0.15397301316261292, "debug/sppo_rej_reward_in_loss": 0.47296810150146484, "debug/sppo_reject_loss": 2548.18603515625, "epoch": 0.07246376811594203, "grad_norm": 57534.90143937713, "learning_rate": 2e-08, "logits/chosen": 1.2541359663009644, "logits/rejected": 1.6011371612548828, "logps/chosen": -275.11187744140625, "logps/rejected": -277.25579833984375, "loss": 5019.257, "rewards/accuracies": 0.375, "rewards/chosen": 0.0015397300012409687, "rewards/margins": -0.003189950715750456, "rewards/rejected": 0.004729681182652712, "step": 20 }, { "debug/policy_chosen_logits": 1.3301985263824463, "debug/policy_chosen_logps": -256.6197204589844, "debug/policy_rejected_logits": 1.700563669204712, "debug/policy_rejected_logps": -268.60101318359375, "debug/reference_chosen_logps": -257.0242614746094, "debug/reference_rejected_logps": -269.0206604003906, "debug/sppo_chosen_loss": 2460.305419921875, "debug/sppo_chosen_reward_in_loss": 0.40453624725341797, "debug/sppo_rej_reward_in_loss": 0.41967296600341797, "debug/sppo_reject_loss": 2542.882080078125, "epoch": 0.09057971014492754, "grad_norm": 60821.795107824626, "learning_rate": 2.5e-08, "logits/chosen": 1.3301985263824463, "logits/rejected": 1.700563669204712, "logps/chosen": -256.6197204589844, "logps/rejected": -268.60101318359375, "loss": 5002.3281, "rewards/accuracies": 0.44999998807907104, "rewards/chosen": 0.004045362584292889, "rewards/margins": -0.00015136711590457708, "rewards/rejected": 0.004196729511022568, "step": 25 }, { "debug/policy_chosen_logits": 1.4955631494522095, "debug/policy_chosen_logps": -226.08700561523438, "debug/policy_rejected_logits": 1.840157151222229, "debug/policy_rejected_logps": -272.52227783203125, "debug/reference_chosen_logps": -226.3221893310547, "debug/reference_rejected_logps": -272.57330322265625, "debug/sppo_chosen_loss": 2477.2919921875, "debug/sppo_chosen_reward_in_loss": 0.23515930771827698, "debug/sppo_rej_reward_in_loss": 0.05106544494628906, "debug/sppo_reject_loss": 2505.71826171875, "epoch": 0.10869565217391304, "grad_norm": 59806.97677705937, "learning_rate": 3e-08, "logits/chosen": 1.4955631494522095, "logits/rejected": 1.840157151222229, "logps/chosen": -226.08700561523438, "logps/rejected": -272.52227783203125, "loss": 4999.6945, "rewards/accuracies": 0.5249999761581421, "rewards/chosen": 0.0023515927605330944, "rewards/margins": 0.001840938231907785, "rewards/rejected": 0.0005106544704176486, "step": 30 }, { "debug/policy_chosen_logits": 1.5929896831512451, "debug/policy_chosen_logps": -250.57470703125, "debug/policy_rejected_logits": 1.9173164367675781, "debug/policy_rejected_logps": -278.4027404785156, "debug/reference_chosen_logps": -250.8056640625, "debug/reference_rejected_logps": -278.73834228515625, "debug/sppo_chosen_loss": 2477.46240234375, "debug/sppo_chosen_reward_in_loss": 0.2309425324201584, "debug/sppo_rej_reward_in_loss": 0.335653692483902, "debug/sppo_reject_loss": 2534.33251953125, "epoch": 0.12681159420289856, "grad_norm": 60289.32214413659, "learning_rate": 3.4999999999999996e-08, "logits/chosen": 1.5929896831512451, "logits/rejected": 1.9173164367675781, "logps/chosen": -250.57470703125, "logps/rejected": -278.4027404785156, "loss": 5009.1906, "rewards/accuracies": 0.44999998807907104, "rewards/chosen": 0.0023094252683222294, "rewards/margins": -0.0010471114655956626, "rewards/rejected": 0.003356536850333214, "step": 35 }, { "debug/policy_chosen_logits": 1.4920421838760376, "debug/policy_chosen_logps": -258.7696228027344, "debug/policy_rejected_logits": 1.8183902502059937, "debug/policy_rejected_logps": -306.36370849609375, "debug/reference_chosen_logps": -259.2444152832031, "debug/reference_rejected_logps": -306.8253479003906, "debug/sppo_chosen_loss": 2453.486572265625, "debug/sppo_chosen_reward_in_loss": 0.4747522473335266, "debug/sppo_rej_reward_in_loss": 0.46161746978759766, "debug/sppo_reject_loss": 2546.947265625, "epoch": 0.14492753623188406, "grad_norm": 59391.41080156482, "learning_rate": 4e-08, "logits/chosen": 1.4920421838760376, "logits/rejected": 1.8183902502059937, "logps/chosen": -258.7696228027344, "logps/rejected": -306.36370849609375, "loss": 5000.5938, "rewards/accuracies": 0.42500001192092896, "rewards/chosen": 0.004747522063553333, "rewards/margins": 0.00013134740584064275, "rewards/rejected": 0.004616174381226301, "step": 40 }, { "debug/policy_chosen_logits": 1.4446805715560913, "debug/policy_chosen_logps": -251.4736785888672, "debug/policy_rejected_logits": 1.6244945526123047, "debug/policy_rejected_logps": -283.0340270996094, "debug/reference_chosen_logps": -251.584228515625, "debug/reference_rejected_logps": -283.26763916015625, "debug/sppo_chosen_loss": 2489.77587890625, "debug/sppo_chosen_reward_in_loss": 0.11053428798913956, "debug/sppo_rej_reward_in_loss": 0.2336265593767166, "debug/sppo_reject_loss": 2524.171142578125, "epoch": 0.16304347826086957, "grad_norm": 69631.29334784736, "learning_rate": 4.5e-08, "logits/chosen": 1.4446805715560913, "logits/rejected": 1.6244945526123047, "logps/chosen": -251.4736785888672, "logps/rejected": -283.0340270996094, "loss": 5000.6957, "rewards/accuracies": 0.5, "rewards/chosen": 0.0011053427588194609, "rewards/margins": -0.0012309231096878648, "rewards/rejected": 0.002336265752092004, "step": 45 }, { "debug/policy_chosen_logits": 1.348311185836792, "debug/policy_chosen_logps": -244.5071258544922, "debug/policy_rejected_logits": 1.7887471914291382, "debug/policy_rejected_logps": -292.06756591796875, "debug/reference_chosen_logps": -244.6476287841797, "debug/reference_rejected_logps": -292.14788818359375, "debug/sppo_chosen_loss": 2486.54541015625, "debug/sppo_chosen_reward_in_loss": 0.14052048325538635, "debug/sppo_rej_reward_in_loss": 0.080322265625, "debug/sppo_reject_loss": 2508.5693359375, "epoch": 0.18115942028985507, "grad_norm": 60893.65741162143, "learning_rate": 5e-08, "logits/chosen": 1.348311185836792, "logits/rejected": 1.7887471914291382, "logps/chosen": -244.5071258544922, "logps/rejected": -292.06756591796875, "loss": 4994.2488, "rewards/accuracies": 0.550000011920929, "rewards/chosen": 0.0014052048791199923, "rewards/margins": 0.0006019821157678962, "rewards/rejected": 0.0008032227051444352, "step": 50 }, { "debug/policy_chosen_logits": 1.3296738862991333, "debug/policy_chosen_logps": -256.0931396484375, "debug/policy_rejected_logits": 1.8709113597869873, "debug/policy_rejected_logps": -305.2488098144531, "debug/reference_chosen_logps": -255.957275390625, "debug/reference_rejected_logps": -305.47186279296875, "debug/sppo_chosen_loss": 2514.22216796875, "debug/sppo_chosen_reward_in_loss": -0.1358652114868164, "debug/sppo_rej_reward_in_loss": 0.22305870056152344, "debug/sppo_reject_loss": 2523.094970703125, "epoch": 0.19927536231884058, "grad_norm": 68084.35003371171, "learning_rate": 5.5e-08, "logits/chosen": 1.3296738862991333, "logits/rejected": 1.8709113597869873, "logps/chosen": -256.0931396484375, "logps/rejected": -305.2488098144531, "loss": 4993.8352, "rewards/accuracies": 0.375, "rewards/chosen": -0.0013586520217359066, "rewards/margins": -0.0035892389714717865, "rewards/rejected": 0.00223058694973588, "step": 55 }, { "debug/policy_chosen_logits": 1.5263268947601318, "debug/policy_chosen_logps": -259.1288146972656, "debug/policy_rejected_logits": 1.8259985446929932, "debug/policy_rejected_logps": -309.2344970703125, "debug/reference_chosen_logps": -259.2587585449219, "debug/reference_rejected_logps": -309.2173767089844, "debug/sppo_chosen_loss": 2487.46142578125, "debug/sppo_chosen_reward_in_loss": 0.12995243072509766, "debug/sppo_rej_reward_in_loss": -0.01710205152630806, "debug/sppo_reject_loss": 2498.983642578125, "epoch": 0.21739130434782608, "grad_norm": 66891.2264132623, "learning_rate": 6e-08, "logits/chosen": 1.5263268947601318, "logits/rejected": 1.8259985446929932, "logps/chosen": -259.1288146972656, "logps/rejected": -309.2344970703125, "loss": 4989.0859, "rewards/accuracies": 0.574999988079071, "rewards/chosen": 0.001299524214118719, "rewards/margins": 0.001470544608309865, "rewards/rejected": -0.00017102043784689158, "step": 60 }, { "debug/policy_chosen_logits": 1.4817605018615723, "debug/policy_chosen_logps": -272.45111083984375, "debug/policy_rejected_logits": 1.8564655780792236, "debug/policy_rejected_logps": -295.9479675292969, "debug/reference_chosen_logps": -272.57183837890625, "debug/reference_rejected_logps": -295.93487548828125, "debug/sppo_chosen_loss": 2488.473388671875, "debug/sppo_chosen_reward_in_loss": 0.12075519561767578, "debug/sppo_rej_reward_in_loss": -0.013109969906508923, "debug/sppo_reject_loss": 2499.827392578125, "epoch": 0.23550724637681159, "grad_norm": 65457.17891751278, "learning_rate": 6.5e-08, "logits/chosen": 1.4817605018615723, "logits/rejected": 1.8564655780792236, "logps/chosen": -272.45111083984375, "logps/rejected": -295.9479675292969, "loss": 4989.7848, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 0.0012075519189238548, "rewards/margins": 0.0013386515202000737, "rewards/rejected": -0.00013109967403579503, "step": 65 }, { "debug/policy_chosen_logits": 1.643053412437439, "debug/policy_chosen_logps": -266.3773193359375, "debug/policy_rejected_logits": 1.6303755044937134, "debug/policy_rejected_logps": -279.2357482910156, "debug/reference_chosen_logps": -266.346435546875, "debug/reference_rejected_logps": -279.0445861816406, "debug/sppo_chosen_loss": 2503.91015625, "debug/sppo_chosen_reward_in_loss": -0.0308837890625, "debug/sppo_rej_reward_in_loss": -0.19117030501365662, "debug/sppo_reject_loss": 2481.839111328125, "epoch": 0.2536231884057971, "grad_norm": 56546.95847698017, "learning_rate": 6.999999999999999e-08, "logits/chosen": 1.643053412437439, "logits/rejected": 1.6303755044937134, "logps/chosen": -266.3773193359375, "logps/rejected": -279.2357482910156, "loss": 4996.5586, "rewards/accuracies": 0.550000011920929, "rewards/chosen": -0.0003088379744440317, "rewards/margins": 0.0016028654063120484, "rewards/rejected": -0.0019117031479254365, "step": 70 }, { "debug/policy_chosen_logits": 1.3959085941314697, "debug/policy_chosen_logps": -241.9213409423828, "debug/policy_rejected_logits": 1.7832151651382446, "debug/policy_rejected_logps": -272.53082275390625, "debug/reference_chosen_logps": -241.78036499023438, "debug/reference_rejected_logps": -272.10552978515625, "debug/sppo_chosen_loss": 2515.092529296875, "debug/sppo_chosen_reward_in_loss": -0.14095115661621094, "debug/sppo_rej_reward_in_loss": -0.4252597689628601, "debug/sppo_reject_loss": 2458.417724609375, "epoch": 0.2717391304347826, "grad_norm": 56288.35239166632, "learning_rate": 7.5e-08, "logits/chosen": 1.3959085941314697, "logits/rejected": 1.7832151651382446, "logps/chosen": -241.9213409423828, "logps/rejected": -272.53082275390625, "loss": 4976.884, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -0.0014095116639509797, "rewards/margins": 0.002843086142092943, "rewards/rejected": -0.004252597689628601, "step": 75 }, { "debug/policy_chosen_logits": 1.3575584888458252, "debug/policy_chosen_logps": -263.7048034667969, "debug/policy_rejected_logits": 1.7402280569076538, "debug/policy_rejected_logps": -292.9505310058594, "debug/reference_chosen_logps": -263.3627014160156, "debug/reference_rejected_logps": -292.67999267578125, "debug/sppo_chosen_loss": 2535.425537109375, "debug/sppo_chosen_reward_in_loss": -0.3420942425727844, "debug/sppo_rej_reward_in_loss": -0.2705673277378082, "debug/sppo_reject_loss": 2473.82861328125, "epoch": 0.2898550724637681, "grad_norm": 58321.48565606706, "learning_rate": 8e-08, "logits/chosen": 1.3575584888458252, "logits/rejected": 1.7402280569076538, "logps/chosen": -263.7048034667969, "logps/rejected": -292.9505310058594, "loss": 4986.0457, "rewards/accuracies": 0.550000011920929, "rewards/chosen": -0.0034209422301501036, "rewards/margins": -0.0007152691250666976, "rewards/rejected": -0.002705672988668084, "step": 80 }, { "debug/policy_chosen_logits": 1.495825171470642, "debug/policy_chosen_logps": -239.78091430664062, "debug/policy_rejected_logits": 2.119466781616211, "debug/policy_rejected_logps": -303.726318359375, "debug/reference_chosen_logps": -239.67935180664062, "debug/reference_rejected_logps": -303.4689636230469, "debug/sppo_chosen_loss": 2511.033203125, "debug/sppo_chosen_reward_in_loss": -0.10157432407140732, "debug/sppo_rej_reward_in_loss": -0.25736045837402344, "debug/sppo_reject_loss": 2474.99365234375, "epoch": 0.3079710144927536, "grad_norm": 70326.60868995877, "learning_rate": 8.5e-08, "logits/chosen": 1.495825171470642, "logits/rejected": 2.119466781616211, "logps/chosen": -239.78091430664062, "logps/rejected": -303.726318359375, "loss": 4978.4234, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -0.0010157432407140732, "rewards/margins": 0.0015578612219542265, "rewards/rejected": -0.0025736044626682997, "step": 85 }, { "debug/policy_chosen_logits": 1.2334277629852295, "debug/policy_chosen_logps": -251.84042358398438, "debug/policy_rejected_logits": 1.418172836303711, "debug/policy_rejected_logps": -281.77276611328125, "debug/reference_chosen_logps": -251.71041870117188, "debug/reference_rejected_logps": -281.41546630859375, "debug/sppo_chosen_loss": 2513.858154296875, "debug/sppo_chosen_reward_in_loss": -0.1299985945224762, "debug/sppo_rej_reward_in_loss": -0.3572982847690582, "debug/sppo_reject_loss": 2465.033935546875, "epoch": 0.32608695652173914, "grad_norm": 64153.45121330528, "learning_rate": 9e-08, "logits/chosen": 1.2334277629852295, "logits/rejected": 1.418172836303711, "logps/chosen": -251.84042358398438, "logps/rejected": -281.77276611328125, "loss": 4968.8496, "rewards/accuracies": 0.625, "rewards/chosen": -0.001299985800869763, "rewards/margins": 0.002272996585816145, "rewards/rejected": -0.0035729825031012297, "step": 90 }, { "debug/policy_chosen_logits": 1.4227510690689087, "debug/policy_chosen_logps": -250.6143035888672, "debug/policy_rejected_logits": 1.7439839839935303, "debug/policy_rejected_logps": -265.55133056640625, "debug/reference_chosen_logps": -250.3922119140625, "debug/reference_rejected_logps": -265.2495422363281, "debug/sppo_chosen_loss": 2522.94677734375, "debug/sppo_chosen_reward_in_loss": -0.22208480536937714, "debug/sppo_rej_reward_in_loss": -0.3018133044242859, "debug/sppo_reject_loss": 2470.69970703125, "epoch": 0.3442028985507246, "grad_norm": 69704.93653164264, "learning_rate": 9.499999999999999e-08, "logits/chosen": 1.4227510690689087, "logits/rejected": 1.7439839839935303, "logps/chosen": -250.6143035888672, "logps/rejected": -265.55133056640625, "loss": 4967.1359, "rewards/accuracies": 0.5249999761581421, "rewards/chosen": -0.002220848109573126, "rewards/margins": 0.0007972849416546524, "rewards/rejected": -0.0030181333422660828, "step": 95 }, { "debug/policy_chosen_logits": 1.181106448173523, "debug/policy_chosen_logps": -229.50833129882812, "debug/policy_rejected_logits": 1.8651745319366455, "debug/policy_rejected_logps": -291.35162353515625, "debug/reference_chosen_logps": -229.11221313476562, "debug/reference_rejected_logps": -290.591552734375, "debug/sppo_chosen_loss": 2540.667724609375, "debug/sppo_chosen_reward_in_loss": -0.3961181640625, "debug/sppo_rej_reward_in_loss": -0.7600471377372742, "debug/sppo_reject_loss": 2425.351318359375, "epoch": 0.36231884057971014, "grad_norm": 61806.31661274745, "learning_rate": 1e-07, "logits/chosen": 1.181106448173523, "logits/rejected": 1.8651745319366455, "logps/chosen": -229.50833129882812, "logps/rejected": -291.35162353515625, "loss": 4970.1539, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -0.0039611817337572575, "rewards/margins": 0.0036392901092767715, "rewards/rejected": -0.007600471377372742, "step": 100 }, { "epoch": 0.36231884057971014, "eval_debug/policy_chosen_logits": 1.6355112791061401, "eval_debug/policy_chosen_logps": -253.23245239257812, "eval_debug/policy_rejected_logits": 1.6972817182540894, "eval_debug/policy_rejected_logps": -260.1171875, "eval_debug/reference_chosen_logps": -252.91845703125, "eval_debug/reference_rejected_logps": -259.6585998535156, "eval_debug/sppo_chosen_loss": 2532.337158203125, "eval_debug/sppo_chosen_reward_in_loss": -0.3139660954475403, "eval_debug/sppo_rej_reward_in_loss": -0.45856496691703796, "eval_debug/sppo_reject_loss": 2455.31591796875, "eval_logits/chosen": 1.6355112791061401, "eval_logits/rejected": 1.6972817182540894, "eval_logps/chosen": -253.23245239257812, "eval_logps/rejected": -260.1171875, "eval_loss": 4979.080078125, "eval_rewards/accuracies": 0.5657894611358643, "eval_rewards/chosen": -0.0031396609265357256, "eval_rewards/margins": 0.0014459885424003005, "eval_rewards/rejected": -0.004585649818181992, "eval_runtime": 28.5359, "eval_samples_per_second": 21.026, "eval_steps_per_second": 0.666, "step": 100 }, { "debug/policy_chosen_logits": 1.1145771741867065, "debug/policy_chosen_logps": -245.8112030029297, "debug/policy_rejected_logits": 1.2326147556304932, "debug/policy_rejected_logps": -293.15155029296875, "debug/reference_chosen_logps": -246.10745239257812, "debug/reference_rejected_logps": -293.17578125, "debug/sppo_chosen_loss": 2471.974365234375, "debug/sppo_chosen_reward_in_loss": 0.2962339520454407, "debug/sppo_rej_reward_in_loss": 0.024216841906309128, "debug/sppo_reject_loss": 2503.906982421875, "epoch": 0.3804347826086957, "grad_norm": 67474.7309879143, "learning_rate": 9.999861184954399e-08, "logits/chosen": 1.1145771741867065, "logits/rejected": 1.2326147556304932, "logps/chosen": -245.8112030029297, "logps/rejected": -293.15155029296875, "loss": 4954.6148, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 0.0029623392038047314, "rewards/margins": 0.002720171120017767, "rewards/rejected": 0.00024216854944825172, "step": 105 }, { "debug/policy_chosen_logits": 1.262205719947815, "debug/policy_chosen_logps": -252.00979614257812, "debug/policy_rejected_logits": 1.5525165796279907, "debug/policy_rejected_logps": -277.41937255859375, "debug/reference_chosen_logps": -251.8131561279297, "debug/reference_rejected_logps": -276.50152587890625, "debug/sppo_chosen_loss": 2520.6708984375, "debug/sppo_chosen_reward_in_loss": -0.19663181900978088, "debug/sppo_rej_reward_in_loss": -0.9178388714790344, "debug/sppo_reject_loss": 2410.38037109375, "epoch": 0.39855072463768115, "grad_norm": 54919.44910846822, "learning_rate": 9.999444747525447e-08, "logits/chosen": 1.262205719947815, "logits/rejected": 1.5525165796279907, "logps/chosen": -252.00979614257812, "logps/rejected": -277.41937255859375, "loss": 4954.4176, "rewards/accuracies": 0.75, "rewards/chosen": -0.001966318115592003, "rewards/margins": 0.007212069816887379, "rewards/rejected": -0.009178387932479382, "step": 110 }, { "debug/policy_chosen_logits": 1.517817735671997, "debug/policy_chosen_logps": -240.8375244140625, "debug/policy_rejected_logits": 1.9009748697280884, "debug/policy_rejected_logps": -302.6877746582031, "debug/reference_chosen_logps": -240.13211059570312, "debug/reference_rejected_logps": -301.23028564453125, "debug/sppo_chosen_loss": 2572.050048828125, "debug/sppo_chosen_reward_in_loss": -0.7054191827774048, "debug/sppo_rej_reward_in_loss": -1.457501769065857, "debug/sppo_reject_loss": 2358.313720703125, "epoch": 0.4166666666666667, "grad_norm": 60632.14793206507, "learning_rate": 9.998750710836255e-08, "logits/chosen": 1.517817735671997, "logits/rejected": 1.9009748697280884, "logps/chosen": -240.8375244140625, "logps/rejected": -302.6877746582031, "loss": 4951.5859, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -0.007054192014038563, "rewards/margins": 0.007520826067775488, "rewards/rejected": -0.014575016684830189, "step": 115 }, { "debug/policy_chosen_logits": 1.3009164333343506, "debug/policy_chosen_logps": -239.3059539794922, "debug/policy_rejected_logits": 1.6617835760116577, "debug/policy_rejected_logps": -276.4163513183594, "debug/reference_chosen_logps": -238.712158203125, "debug/reference_rejected_logps": -275.2547302246094, "debug/sppo_chosen_loss": 2561.05419921875, "debug/sppo_chosen_reward_in_loss": -0.5937992334365845, "debug/sppo_rej_reward_in_loss": -1.1615955829620361, "debug/sppo_reject_loss": 2387.34130859375, "epoch": 0.43478260869565216, "grad_norm": 71791.48071321512, "learning_rate": 9.997779113423914e-08, "logits/chosen": 1.3009164333343506, "logits/rejected": 1.6617835760116577, "logps/chosen": -239.3059539794922, "logps/rejected": -276.4163513183594, "loss": 4950.5766, "rewards/accuracies": 0.574999988079071, "rewards/chosen": -0.005937992129474878, "rewards/margins": 0.005677963141351938, "rewards/rejected": -0.011615955270826817, "step": 120 }, { "debug/policy_chosen_logits": 1.38350510597229, "debug/policy_chosen_logps": -240.8602752685547, "debug/policy_rejected_logits": 1.6243913173675537, "debug/policy_rejected_logps": -272.2510681152344, "debug/reference_chosen_logps": -240.1095428466797, "debug/reference_rejected_logps": -271.2296447753906, "debug/sppo_chosen_loss": 2576.484130859375, "debug/sppo_chosen_reward_in_loss": -0.7507423162460327, "debug/sppo_rej_reward_in_loss": -1.021427869796753, "debug/sppo_reject_loss": 2400.478271484375, "epoch": 0.4528985507246377, "grad_norm": 151701.3681973675, "learning_rate": 9.996530009237363e-08, "logits/chosen": 1.38350510597229, "logits/rejected": 1.6243913173675537, "logps/chosen": -240.8602752685547, "logps/rejected": -272.2510681152344, "loss": 4954.3414, "rewards/accuracies": 0.5, "rewards/chosen": -0.007507423870265484, "rewards/margins": 0.0027068553026765585, "rewards/rejected": -0.010214278474450111, "step": 125 }, { "debug/policy_chosen_logits": 1.1746623516082764, "debug/policy_chosen_logps": -251.74844360351562, "debug/policy_rejected_logits": 1.6008373498916626, "debug/policy_rejected_logps": -297.63568115234375, "debug/reference_chosen_logps": -250.86929321289062, "debug/reference_rejected_logps": -296.35479736328125, "debug/sppo_chosen_loss": 2590.117919921875, "debug/sppo_chosen_reward_in_loss": -0.8791602849960327, "debug/sppo_rej_reward_in_loss": -1.2808887958526611, "debug/sppo_reject_loss": 2374.90869140625, "epoch": 0.47101449275362317, "grad_norm": 70424.18585342077, "learning_rate": 9.995003467634381e-08, "logits/chosen": 1.1746623516082764, "logits/rejected": 1.6008373498916626, "logps/chosen": -251.74844360351562, "logps/rejected": -297.63568115234375, "loss": 4954.9508, "rewards/accuracies": 0.574999988079071, "rewards/chosen": -0.008791603147983551, "rewards/margins": 0.004017284605652094, "rewards/rejected": -0.012808887287974358, "step": 130 }, { "debug/policy_chosen_logits": 1.3090918064117432, "debug/policy_chosen_logps": -248.5448455810547, "debug/policy_rejected_logits": 1.7777769565582275, "debug/policy_rejected_logps": -299.4295349121094, "debug/reference_chosen_logps": -247.79171752929688, "debug/reference_rejected_logps": -298.0932312011719, "debug/sppo_chosen_loss": 2577.368896484375, "debug/sppo_chosen_reward_in_loss": -0.7531425356864929, "debug/sppo_rej_reward_in_loss": -1.3363120555877686, "debug/sppo_reject_loss": 2369.63916015625, "epoch": 0.4891304347826087, "grad_norm": 55772.8366539, "learning_rate": 9.99319957337775e-08, "logits/chosen": 1.3090918064117432, "logits/rejected": 1.7777769565582275, "logps/chosen": -248.5448455810547, "logps/rejected": -299.4295349121094, "loss": 4953.4102, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -0.007531425449997187, "rewards/margins": 0.005831695627421141, "rewards/rejected": -0.013363120146095753, "step": 135 }, { "debug/policy_chosen_logits": 1.6867640018463135, "debug/policy_chosen_logps": -267.0757751464844, "debug/policy_rejected_logits": 1.9828193187713623, "debug/policy_rejected_logps": -309.44317626953125, "debug/reference_chosen_logps": -266.6741638183594, "debug/reference_rejected_logps": -308.5638427734375, "debug/sppo_chosen_loss": 2542.128662109375, "debug/sppo_chosen_reward_in_loss": -0.40160447359085083, "debug/sppo_rej_reward_in_loss": -0.8793373107910156, "debug/sppo_reject_loss": 2414.30419921875, "epoch": 0.5072463768115942, "grad_norm": 87804.69325443542, "learning_rate": 9.991118426630531e-08, "logits/chosen": 1.6867640018463135, "logits/rejected": 1.9828193187713623, "logps/chosen": -267.0757751464844, "logps/rejected": -309.44317626953125, "loss": 4954.575, "rewards/accuracies": 0.625, "rewards/chosen": -0.004016044549643993, "rewards/margins": 0.004777328111231327, "rewards/rejected": -0.00879337266087532, "step": 140 }, { "debug/policy_chosen_logits": 1.2096970081329346, "debug/policy_chosen_logps": -240.99169921875, "debug/policy_rejected_logits": 1.5321658849716187, "debug/policy_rejected_logps": -282.29827880859375, "debug/reference_chosen_logps": -240.69192504882812, "debug/reference_rejected_logps": -281.6559753417969, "debug/sppo_chosen_loss": 2530.816162109375, "debug/sppo_chosen_reward_in_loss": -0.2997651994228363, "debug/sppo_rej_reward_in_loss": -0.6423038244247437, "debug/sppo_reject_loss": 2437.80322265625, "epoch": 0.5253623188405797, "grad_norm": 62100.452115897504, "learning_rate": 9.988760142950516e-08, "logits/chosen": 1.2096970081329346, "logits/rejected": 1.5321658849716187, "logps/chosen": -240.99169921875, "logps/rejected": -282.29827880859375, "loss": 4960.2316, "rewards/accuracies": 0.574999988079071, "rewards/chosen": -0.0029976521618664265, "rewards/margins": 0.0034253865014761686, "rewards/rejected": -0.0064230384305119514, "step": 145 }, { "debug/policy_chosen_logits": 1.5637954473495483, "debug/policy_chosen_logps": -266.9645080566406, "debug/policy_rejected_logits": 2.029578924179077, "debug/policy_rejected_logps": -296.61737060546875, "debug/reference_chosen_logps": -266.7818298339844, "debug/reference_rejected_logps": -295.389892578125, "debug/sppo_chosen_loss": 2519.594970703125, "debug/sppo_chosen_reward_in_loss": -0.18268242478370667, "debug/sppo_rej_reward_in_loss": -1.2275073528289795, "debug/sppo_reject_loss": 2380.341796875, "epoch": 0.5434782608695652, "grad_norm": 105223.1261169405, "learning_rate": 9.98612485328381e-08, "logits/chosen": 1.5637954473495483, "logits/rejected": 2.029578924179077, "logps/chosen": -266.9645080566406, "logps/rejected": -296.61737060546875, "loss": 4930.8102, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -0.0018268240382894874, "rewards/margins": 0.010448249988257885, "rewards/rejected": -0.012275073677301407, "step": 150 }, { "debug/policy_chosen_logits": 1.3448460102081299, "debug/policy_chosen_logps": -241.264892578125, "debug/policy_rejected_logits": 1.6951926946640015, "debug/policy_rejected_logps": -277.68731689453125, "debug/reference_chosen_logps": -240.8543701171875, "debug/reference_rejected_logps": -276.788818359375, "debug/sppo_chosen_loss": 2542.348876953125, "debug/sppo_chosen_reward_in_loss": -0.4105297029018402, "debug/sppo_rej_reward_in_loss": -0.8985021710395813, "debug/sppo_reject_loss": 2412.73291015625, "epoch": 0.5615942028985508, "grad_norm": 72896.69066553783, "learning_rate": 9.983212703957554e-08, "logits/chosen": 1.3448460102081299, "logits/rejected": 1.6951926946640015, "logps/chosen": -241.264892578125, "logps/rejected": -277.68731689453125, "loss": 4944.0086, "rewards/accuracies": 0.5, "rewards/chosen": -0.004105296917259693, "rewards/margins": 0.004879724234342575, "rewards/rejected": -0.008985022082924843, "step": 155 }, { "debug/policy_chosen_logits": 1.077109932899475, "debug/policy_chosen_logps": -255.0774383544922, "debug/policy_rejected_logits": 1.5379952192306519, "debug/policy_rejected_logps": -297.5126953125, "debug/reference_chosen_logps": -254.5387420654297, "debug/reference_rejected_logps": -295.9001770019531, "debug/sppo_chosen_loss": 2555.77099609375, "debug/sppo_chosen_reward_in_loss": -0.5386981964111328, "debug/sppo_rej_reward_in_loss": -1.6125160455703735, "debug/sppo_reject_loss": 2343.542724609375, "epoch": 0.5797101449275363, "grad_norm": 93485.02078548388, "learning_rate": 9.980023856671804e-08, "logits/chosen": 1.077109932899475, "logits/rejected": 1.5379952192306519, "logps/chosen": -255.0774383544922, "logps/rejected": -297.5126953125, "loss": 4954.2586, "rewards/accuracies": 0.625, "rewards/chosen": -0.005386981647461653, "rewards/margins": 0.010738177224993706, "rewards/rejected": -0.016125161200761795, "step": 160 }, { "debug/policy_chosen_logits": 1.5786542892456055, "debug/policy_chosen_logps": -269.99346923828125, "debug/policy_rejected_logits": 1.402779221534729, "debug/policy_rejected_logps": -263.34893798828125, "debug/reference_chosen_logps": -269.4091491699219, "debug/reference_rejected_logps": -262.3013610839844, "debug/sppo_chosen_loss": 2560.51904296875, "debug/sppo_chosen_reward_in_loss": -0.5842826962471008, "debug/sppo_rej_reward_in_loss": -1.0475749969482422, "debug/sppo_reject_loss": 2399.6630859375, "epoch": 0.5978260869565217, "grad_norm": 92597.71266050417, "learning_rate": 9.976558488490555e-08, "logits/chosen": 1.5786542892456055, "logits/rejected": 1.402779221534729, "logps/chosen": -269.99346923828125, "logps/rejected": -263.34893798828125, "loss": 4936.507, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -0.0058428263291716576, "rewards/margins": 0.004632922820746899, "rewards/rejected": -0.01047575008124113, "step": 165 }, { "debug/policy_chosen_logits": 1.290741205215454, "debug/policy_chosen_logps": -236.67715454101562, "debug/policy_rejected_logits": 1.5977189540863037, "debug/policy_rejected_logps": -291.2699279785156, "debug/reference_chosen_logps": -238.73080444335938, "debug/reference_rejected_logps": -293.11700439453125, "debug/sppo_chosen_loss": 2302.3115234375, "debug/sppo_chosen_reward_in_loss": 2.053657054901123, "debug/sppo_rej_reward_in_loss": 1.8470966815948486, "debug/sppo_reject_loss": 2691.03515625, "epoch": 0.6159420289855072, "grad_norm": 98094.66729862805, "learning_rate": 9.972816791831899e-08, "logits/chosen": 1.290741205215454, "logits/rejected": 1.5977189540863037, "logps/chosen": -236.67715454101562, "logps/rejected": -291.2699279785156, "loss": 4975.907, "rewards/accuracies": 0.574999988079071, "rewards/chosen": 0.020536571741104126, "rewards/margins": 0.002065605018287897, "rewards/rejected": 0.018470967188477516, "step": 170 }, { "debug/policy_chosen_logits": 1.4175643920898438, "debug/policy_chosen_logps": -263.4878845214844, "debug/policy_rejected_logits": 1.8533436059951782, "debug/policy_rejected_logps": -309.7564697265625, "debug/reference_chosen_logps": -263.7447814941406, "debug/reference_rejected_logps": -309.0677795410156, "debug/sppo_chosen_loss": 2477.23681640625, "debug/sppo_chosen_reward_in_loss": 0.2568736970424652, "debug/sppo_rej_reward_in_loss": -0.6886796951293945, "debug/sppo_reject_loss": 2437.29541015625, "epoch": 0.6340579710144928, "grad_norm": 63312.73342726648, "learning_rate": 9.968798974457359e-08, "logits/chosen": 1.4175643920898438, "logits/rejected": 1.8533436059951782, "logps/chosen": -263.4878845214844, "logps/rejected": -309.7564697265625, "loss": 4920.0543, "rewards/accuracies": 0.625, "rewards/chosen": 0.0025687366724014282, "rewards/margins": 0.00945553369820118, "rewards/rejected": -0.006886796560138464, "step": 175 }, { "debug/policy_chosen_logits": 1.2178828716278076, "debug/policy_chosen_logps": -246.9923095703125, "debug/policy_rejected_logits": 1.5837467908859253, "debug/policy_rejected_logps": -289.5892028808594, "debug/reference_chosen_logps": -246.3313446044922, "debug/reference_rejected_logps": -288.1348876953125, "debug/sppo_chosen_loss": 2569.297119140625, "debug/sppo_chosen_reward_in_loss": -0.6609573364257812, "debug/sppo_rej_reward_in_loss": -1.4543180465698242, "debug/sppo_reject_loss": 2363.291015625, "epoch": 0.6521739130434783, "grad_norm": 73130.10544261185, "learning_rate": 9.964505259460332e-08, "logits/chosen": 1.2178828716278076, "logits/rejected": 1.5837467908859253, "logps/chosen": -246.9923095703125, "logps/rejected": -289.5892028808594, "loss": 4913.6719, "rewards/accuracies": 0.625, "rewards/chosen": -0.006609573028981686, "rewards/margins": 0.007933606393635273, "rewards/rejected": -0.014543181285262108, "step": 180 }, { "debug/policy_chosen_logits": 1.19364333152771, "debug/policy_chosen_logps": -248.1702423095703, "debug/policy_rejected_logits": 1.6476013660430908, "debug/policy_rejected_logps": -322.3099365234375, "debug/reference_chosen_logps": -246.93405151367188, "debug/reference_rejected_logps": -319.759765625, "debug/sppo_chosen_loss": 2628.030029296875, "debug/sppo_chosen_reward_in_loss": -1.2361927032470703, "debug/sppo_rej_reward_in_loss": -2.5501551628112793, "debug/sppo_reject_loss": 2257.75341796875, "epoch": 0.6702898550724637, "grad_norm": 62164.347458530305, "learning_rate": 9.959935885253715e-08, "logits/chosen": 1.19364333152771, "logits/rejected": 1.6476013660430908, "logps/chosen": -248.1702423095703, "logps/rejected": -322.3099365234375, "loss": 4902.393, "rewards/accuracies": 0.5249999761581421, "rewards/chosen": -0.012361926957964897, "rewards/margins": 0.013139625079929829, "rewards/rejected": -0.025501549243927002, "step": 185 }, { "debug/policy_chosen_logits": 1.20594322681427, "debug/policy_chosen_logps": -251.36129760742188, "debug/policy_rejected_logits": 1.2766417264938354, "debug/policy_rejected_logps": -275.73919677734375, "debug/reference_chosen_logps": -250.4452362060547, "debug/reference_rejected_logps": -273.4372863769531, "debug/sppo_chosen_loss": 2595.31591796875, "debug/sppo_chosen_reward_in_loss": -0.9160749316215515, "debug/sppo_rej_reward_in_loss": -2.3019137382507324, "debug/sppo_reject_loss": 2283.59375, "epoch": 0.6884057971014492, "grad_norm": 61551.26752821376, "learning_rate": 9.955091105556664e-08, "logits/chosen": 1.20594322681427, "logits/rejected": 1.2766417264938354, "logps/chosen": -251.36129760742188, "logps/rejected": -275.73919677734375, "loss": 4926.9672, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -0.009160749614238739, "rewards/margins": 0.013858387246727943, "rewards/rejected": -0.023019134998321533, "step": 190 }, { "debug/policy_chosen_logits": 1.7529083490371704, "debug/policy_chosen_logps": -272.32366943359375, "debug/policy_rejected_logits": 2.1265456676483154, "debug/policy_rejected_logps": -313.41241455078125, "debug/reference_chosen_logps": -270.6104736328125, "debug/reference_rejected_logps": -310.8916320800781, "debug/sppo_chosen_loss": 2683.697021484375, "debug/sppo_chosen_reward_in_loss": -1.7131826877593994, "debug/sppo_rej_reward_in_loss": -2.5207762718200684, "debug/sppo_reject_loss": 2260.98779296875, "epoch": 0.7065217391304348, "grad_norm": 64279.10209786374, "learning_rate": 9.949971189380507e-08, "logits/chosen": 1.7529083490371704, "logits/rejected": 2.1265456676483154, "logps/chosen": -272.32366943359375, "logps/rejected": -313.41241455078125, "loss": 4930.4125, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -0.017131825909018517, "rewards/margins": 0.008075936697423458, "rewards/rejected": -0.02520776353776455, "step": 195 }, { "debug/policy_chosen_logits": 1.2168382406234741, "debug/policy_chosen_logps": -283.47161865234375, "debug/policy_rejected_logits": 1.5349094867706299, "debug/policy_rejected_logps": -290.456787109375, "debug/reference_chosen_logps": -282.74090576171875, "debug/reference_rejected_logps": -288.225341796875, "debug/sppo_chosen_loss": 2577.8662109375, "debug/sppo_chosen_reward_in_loss": -0.7306663393974304, "debug/sppo_rej_reward_in_loss": -2.2314419746398926, "debug/sppo_reject_loss": 2285.576171875, "epoch": 0.7246376811594203, "grad_norm": 60882.04316379648, "learning_rate": 9.944576421013802e-08, "logits/chosen": 1.2168382406234741, "logits/rejected": 1.5349094867706299, "logps/chosen": -283.47161865234375, "logps/rejected": -290.456787109375, "loss": 4913.6875, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -0.007306662853807211, "rewards/margins": 0.015007754787802696, "rewards/rejected": -0.022314418107271194, "step": 200 }, { "epoch": 0.7246376811594203, "eval_debug/policy_chosen_logits": 1.6047344207763672, "eval_debug/policy_chosen_logps": -253.59323120117188, "eval_debug/policy_rejected_logits": 1.6657979488372803, "eval_debug/policy_rejected_logps": -260.560546875, "eval_debug/reference_chosen_logps": -252.91845703125, "eval_debug/reference_rejected_logps": -259.6585998535156, "eval_debug/sppo_chosen_loss": 2570.339111328125, "eval_debug/sppo_chosen_reward_in_loss": -0.6747744083404541, "eval_debug/sppo_rej_reward_in_loss": -0.9019157886505127, "eval_debug/sppo_reject_loss": 2415.142578125, "eval_logits/chosen": 1.6047344207763672, "eval_logits/rejected": 1.6657979488372803, "eval_logps/chosen": -253.59323120117188, "eval_logps/rejected": -260.560546875, "eval_loss": 4922.29638671875, "eval_rewards/accuracies": 0.5394737124443054, "eval_rewards/chosen": -0.006747743580490351, "eval_rewards/margins": 0.002271413803100586, "eval_rewards/rejected": -0.009019157849252224, "eval_runtime": 28.6024, "eval_samples_per_second": 20.977, "eval_steps_per_second": 0.664, "step": 200 }, { "debug/policy_chosen_logits": 1.3171964883804321, "debug/policy_chosen_logps": -250.88916015625, "debug/policy_rejected_logits": 1.6703647375106812, "debug/policy_rejected_logps": -307.4503479003906, "debug/reference_chosen_logps": -250.2000274658203, "debug/reference_rejected_logps": -305.8931579589844, "debug/sppo_chosen_loss": 2571.575439453125, "debug/sppo_chosen_reward_in_loss": -0.6891248822212219, "debug/sppo_rej_reward_in_loss": -1.5571798086166382, "debug/sppo_reject_loss": 2350.726806640625, "epoch": 0.7427536231884058, "grad_norm": 77669.5503273904, "learning_rate": 9.938907100006552e-08, "logits/chosen": 1.3171964883804321, "logits/rejected": 1.6703647375106812, "logps/chosen": -250.88916015625, "logps/rejected": -307.4503479003906, "loss": 4928.9711, "rewards/accuracies": 0.574999988079071, "rewards/chosen": -0.006891248282045126, "rewards/margins": 0.008680549450218678, "rewards/rejected": -0.015571797266602516, "step": 205 }, { "debug/policy_chosen_logits": 1.1587716341018677, "debug/policy_chosen_logps": -259.26190185546875, "debug/policy_rejected_logits": 1.460614562034607, "debug/policy_rejected_logps": -255.4847869873047, "debug/reference_chosen_logps": -258.54693603515625, "debug/reference_rejected_logps": -254.0518035888672, "debug/sppo_chosen_loss": 2574.91796875, "debug/sppo_chosen_reward_in_loss": -0.7149562835693359, "debug/sppo_rej_reward_in_loss": -1.4329769611358643, "debug/sppo_reject_loss": 2362.268798828125, "epoch": 0.7608695652173914, "grad_norm": 56509.01147720915, "learning_rate": 9.932963541153584e-08, "logits/chosen": 1.1587716341018677, "logits/rejected": 1.460614562034607, "logps/chosen": -259.26190185546875, "logps/rejected": -255.4847869873047, "loss": 4912.7797, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -0.007149563170969486, "rewards/margins": 0.007180205546319485, "rewards/rejected": -0.014329768717288971, "step": 210 }, { "debug/policy_chosen_logits": 1.4320160150527954, "debug/policy_chosen_logps": -268.22747802734375, "debug/policy_rejected_logits": 1.8260990381240845, "debug/policy_rejected_logps": -298.08526611328125, "debug/reference_chosen_logps": -267.0002136230469, "debug/reference_rejected_logps": -295.2477111816406, "debug/sppo_chosen_loss": 2628.953857421875, "debug/sppo_chosen_reward_in_loss": -1.227246642112732, "debug/sppo_rej_reward_in_loss": -2.8375353813171387, "debug/sppo_reject_loss": 2234.884033203125, "epoch": 0.7789855072463768, "grad_norm": 57238.35237205925, "learning_rate": 9.926746074477053e-08, "logits/chosen": 1.4320160150527954, "logits/rejected": 1.8260990381240845, "logps/chosen": -268.22747802734375, "logps/rejected": -298.08526611328125, "loss": 4883.1227, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -0.012272466905415058, "rewards/margins": 0.01610288769006729, "rewards/rejected": -0.028375351801514626, "step": 215 }, { "debug/policy_chosen_logits": 0.9355718493461609, "debug/policy_chosen_logps": -250.63589477539062, "debug/policy_rejected_logits": 1.2643530368804932, "debug/policy_rejected_logps": -276.95855712890625, "debug/reference_chosen_logps": -249.66796875, "debug/reference_rejected_logps": -274.86138916015625, "debug/sppo_chosen_loss": 2604.803466796875, "debug/sppo_chosen_reward_in_loss": -0.9679214358329773, "debug/sppo_rej_reward_in_loss": -2.0971500873565674, "debug/sppo_reject_loss": 2302.240966796875, "epoch": 0.7971014492753623, "grad_norm": 62378.00382376367, "learning_rate": 9.920255045208128e-08, "logits/chosen": 0.9355718493461609, "logits/rejected": 1.2643530368804932, "logps/chosen": -250.63589477539062, "logps/rejected": -276.95855712890625, "loss": 4879.9465, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -0.009679214097559452, "rewards/margins": 0.011292284354567528, "rewards/rejected": -0.020971499383449554, "step": 220 }, { "debug/policy_chosen_logits": 1.3104256391525269, "debug/policy_chosen_logps": -283.29400634765625, "debug/policy_rejected_logits": 1.6018108129501343, "debug/policy_rejected_logps": -291.35772705078125, "debug/reference_chosen_logps": -282.43328857421875, "debug/reference_rejected_logps": -289.2051696777344, "debug/sppo_chosen_loss": 2590.73828125, "debug/sppo_chosen_reward_in_loss": -0.8607318997383118, "debug/sppo_rej_reward_in_loss": -2.1526076793670654, "debug/sppo_reject_loss": 2297.698974609375, "epoch": 0.8152173913043478, "grad_norm": 65152.871244207236, "learning_rate": 9.913490813767816e-08, "logits/chosen": 1.3104256391525269, "logits/rejected": 1.6018108129501343, "logps/chosen": -283.29400634765625, "logps/rejected": -291.35772705078125, "loss": 4881.9191, "rewards/accuracies": 0.625, "rewards/chosen": -0.008607318624854088, "rewards/margins": 0.01291875820606947, "rewards/rejected": -0.021526077762246132, "step": 225 }, { "debug/policy_chosen_logits": 1.1648050546646118, "debug/policy_chosen_logps": -233.7320098876953, "debug/policy_rejected_logits": 1.381399393081665, "debug/policy_rejected_logps": -276.8841552734375, "debug/reference_chosen_logps": -233.37783813476562, "debug/reference_rejected_logps": -275.42218017578125, "debug/sppo_chosen_loss": 2539.67919921875, "debug/sppo_chosen_reward_in_loss": -0.35420626401901245, "debug/sppo_rej_reward_in_loss": -1.4619684219360352, "debug/sppo_reject_loss": 2360.643310546875, "epoch": 0.8333333333333334, "grad_norm": 55449.34911003317, "learning_rate": 9.906453755746957e-08, "logits/chosen": 1.1648050546646118, "logits/rejected": 1.381399393081665, "logps/chosen": -233.7320098876953, "logps/rejected": -276.8841552734375, "loss": 4885.1031, "rewards/accuracies": 0.550000011920929, "rewards/chosen": -0.0035420632921159267, "rewards/margins": 0.011077621951699257, "rewards/rejected": -0.014619683846831322, "step": 230 }, { "debug/policy_chosen_logits": 1.3417161703109741, "debug/policy_chosen_logps": -252.96591186523438, "debug/policy_rejected_logits": 1.5157549381256104, "debug/policy_rejected_logps": -298.88677978515625, "debug/reference_chosen_logps": -251.4791259765625, "debug/reference_rejected_logps": -296.46527099609375, "debug/sppo_chosen_loss": 2657.89697265625, "debug/sppo_chosen_reward_in_loss": -1.4867690801620483, "debug/sppo_rej_reward_in_loss": -2.4215126037597656, "debug/sppo_reject_loss": 2273.120849609375, "epoch": 0.8514492753623188, "grad_norm": 100941.82559973896, "learning_rate": 9.899144261885363e-08, "logits/chosen": 1.3417161703109741, "logits/rejected": 1.5157549381256104, "logps/chosen": -252.96591186523438, "logps/rejected": -298.88677978515625, "loss": 4858.2457, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -0.01486769039183855, "rewards/margins": 0.009347434155642986, "rewards/rejected": -0.024215126410126686, "step": 235 }, { "debug/policy_chosen_logits": 1.3339102268218994, "debug/policy_chosen_logps": -243.60916137695312, "debug/policy_rejected_logits": 1.9277820587158203, "debug/policy_rejected_logps": -299.7930908203125, "debug/reference_chosen_logps": -242.9955291748047, "debug/reference_rejected_logps": -297.2693786621094, "debug/sppo_chosen_loss": 2566.054931640625, "debug/sppo_chosen_reward_in_loss": -0.6136573553085327, "debug/sppo_rej_reward_in_loss": -2.523709535598755, "debug/sppo_reject_loss": 2259.904052734375, "epoch": 0.8695652173913043, "grad_norm": 59001.90623118747, "learning_rate": 9.891562738050125e-08, "logits/chosen": 1.3339102268218994, "logits/rejected": 1.9277820587158203, "logps/chosen": -243.60916137695312, "logps/rejected": -299.7930908203125, "loss": 4878.6145, "rewards/accuracies": 0.75, "rewards/chosen": -0.006136573851108551, "rewards/margins": 0.01910051889717579, "rewards/rejected": -0.02523709461092949, "step": 240 }, { "debug/policy_chosen_logits": 1.0627963542938232, "debug/policy_chosen_logps": -232.9734344482422, "debug/policy_rejected_logits": 1.4089231491088867, "debug/policy_rejected_logps": -283.24371337890625, "debug/reference_chosen_logps": -233.43692016601562, "debug/reference_rejected_logps": -282.2871398925781, "debug/sppo_chosen_loss": 2458.481201171875, "debug/sppo_chosen_reward_in_loss": 0.46344834566116333, "debug/sppo_rej_reward_in_loss": -0.9565681219100952, "debug/sppo_reject_loss": 2416.12548828125, "epoch": 0.8876811594202898, "grad_norm": 62196.217799154256, "learning_rate": 9.883709605213071e-08, "logits/chosen": 1.0627963542938232, "logits/rejected": 1.4089231491088867, "logps/chosen": -232.9734344482422, "logps/rejected": -283.24371337890625, "loss": 4866.8133, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": 0.004634483251720667, "rewards/margins": 0.014200164005160332, "rewards/rejected": -0.009565682150423527, "step": 245 }, { "debug/policy_chosen_logits": 1.1752718687057495, "debug/policy_chosen_logps": -277.11181640625, "debug/policy_rejected_logits": 1.5447168350219727, "debug/policy_rejected_logps": -283.4997863769531, "debug/reference_chosen_logps": -276.27154541015625, "debug/reference_rejected_logps": -282.2914123535156, "debug/sppo_chosen_loss": 2592.04931640625, "debug/sppo_chosen_reward_in_loss": -0.8402732610702515, "debug/sppo_rej_reward_in_loss": -1.2084100246429443, "debug/sppo_reject_loss": 2386.884033203125, "epoch": 0.9057971014492754, "grad_norm": 59626.787092260776, "learning_rate": 9.8755852994274e-08, "logits/chosen": 1.1752718687057495, "logits/rejected": 1.5447168350219727, "logps/chosen": -277.11181640625, "logps/rejected": -283.4997863769531, "loss": 4915.7211, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -0.008402733132243156, "rewards/margins": 0.003681367728859186, "rewards/rejected": -0.012084100395441055, "step": 250 }, { "debug/policy_chosen_logits": 1.3651785850524902, "debug/policy_chosen_logps": -236.29833984375, "debug/policy_rejected_logits": 1.5351622104644775, "debug/policy_rejected_logps": -271.6980285644531, "debug/reference_chosen_logps": -235.9077606201172, "debug/reference_rejected_logps": -269.55755615234375, "debug/sppo_chosen_loss": 2543.5859375, "debug/sppo_chosen_reward_in_loss": -0.3905603289604187, "debug/sppo_rej_reward_in_loss": -2.140444755554199, "debug/sppo_reject_loss": 2298.95458984375, "epoch": 0.9239130434782609, "grad_norm": 68845.86649779897, "learning_rate": 9.867190271803463e-08, "logits/chosen": 1.3651785850524902, "logits/rejected": 1.5351622104644775, "logps/chosen": -236.29833984375, "logps/rejected": -271.6980285644531, "loss": 4868.9133, "rewards/accuracies": 0.75, "rewards/chosen": -0.0039056031964719296, "rewards/margins": 0.017498845234513283, "rewards/rejected": -0.02140444703400135, "step": 255 }, { "debug/policy_chosen_logits": 1.2891263961791992, "debug/policy_chosen_logps": -253.2111053466797, "debug/policy_rejected_logits": 1.8494764566421509, "debug/policy_rejected_logps": -307.9271545410156, "debug/reference_chosen_logps": -253.3511199951172, "debug/reference_rejected_logps": -305.5581359863281, "debug/sppo_chosen_loss": 2489.207763671875, "debug/sppo_chosen_reward_in_loss": 0.1400173157453537, "debug/sppo_rej_reward_in_loss": -2.3690409660339355, "debug/sppo_reject_loss": 2279.093017578125, "epoch": 0.9420289855072463, "grad_norm": 77776.03745480823, "learning_rate": 9.858524988483717e-08, "logits/chosen": 1.2891263961791992, "logits/rejected": 1.8494764566421509, "logps/chosen": -253.2111053466797, "logps/rejected": -307.9271545410156, "loss": 4828.5781, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": 0.0014001730596646667, "rewards/margins": 0.02509058080613613, "rewards/rejected": -0.02369040809571743, "step": 260 }, { "debug/policy_chosen_logits": 1.4969512224197388, "debug/policy_chosen_logps": -305.0471496582031, "debug/policy_rejected_logits": 1.644335150718689, "debug/policy_rejected_logps": -293.63507080078125, "debug/reference_chosen_logps": -304.60321044921875, "debug/reference_rejected_logps": -291.7633056640625, "debug/sppo_chosen_loss": 2549.2109375, "debug/sppo_chosen_reward_in_loss": -0.44389915466308594, "debug/sppo_rej_reward_in_loss": -1.8717502355575562, "debug/sppo_reject_loss": 2324.712158203125, "epoch": 0.9601449275362319, "grad_norm": 57117.242259449704, "learning_rate": 9.849589930616841e-08, "logits/chosen": 1.4969512224197388, "logits/rejected": 1.644335150718689, "logps/chosen": -305.0471496582031, "logps/rejected": -293.63507080078125, "loss": 4841.2102, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -0.004438991658389568, "rewards/margins": 0.014278510585427284, "rewards/rejected": -0.018717501312494278, "step": 265 }, { "debug/policy_chosen_logits": 1.4527175426483154, "debug/policy_chosen_logps": -268.75811767578125, "debug/policy_rejected_logits": 1.9718250036239624, "debug/policy_rejected_logps": -277.8961486816406, "debug/reference_chosen_logps": -267.8411560058594, "debug/reference_rejected_logps": -275.6684875488281, "debug/sppo_chosen_loss": 2599.503662109375, "debug/sppo_chosen_reward_in_loss": -0.9169847369194031, "debug/sppo_rej_reward_in_loss": -2.2276530265808105, "debug/sppo_reject_loss": 2295.29248046875, "epoch": 0.9782608695652174, "grad_norm": 65236.81774306339, "learning_rate": 9.840385594331021e-08, "logits/chosen": 1.4527175426483154, "logits/rejected": 1.9718250036239624, "logps/chosen": -268.75811767578125, "logps/rejected": -277.8961486816406, "loss": 4872.7801, "rewards/accuracies": 0.574999988079071, "rewards/chosen": -0.009169846773147583, "rewards/margins": 0.013106681406497955, "rewards/rejected": -0.022276530042290688, "step": 270 }, { "debug/policy_chosen_logits": 1.0952537059783936, "debug/policy_chosen_logps": -257.93865966796875, "debug/policy_rejected_logits": 1.3622313737869263, "debug/policy_rejected_logps": -256.76025390625, "debug/reference_chosen_logps": -257.44097900390625, "debug/reference_rejected_logps": -254.19161987304688, "debug/sppo_chosen_loss": 2554.732421875, "debug/sppo_chosen_reward_in_loss": -0.49769458174705505, "debug/sppo_rej_reward_in_loss": -2.5686464309692383, "debug/sppo_reject_loss": 2260.558349609375, "epoch": 0.9963768115942029, "grad_norm": 74696.15311144819, "learning_rate": 9.830912490706402e-08, "logits/chosen": 1.0952537059783936, "logits/rejected": 1.3622313737869263, "logps/chosen": -257.93865966796875, "logps/rejected": -256.76025390625, "loss": 4826.0516, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -0.0049769459292292595, "rewards/margins": 0.02070951648056507, "rewards/rejected": -0.025686467066407204, "step": 275 }, { "debug/policy_chosen_logits": 1.3448244333267212, "debug/policy_chosen_logps": -261.9836730957031, "debug/policy_rejected_logits": 1.7952938079833984, "debug/policy_rejected_logps": -297.6934814453125, "debug/reference_chosen_logps": -260.2370300292969, "debug/reference_rejected_logps": -293.6090393066406, "debug/sppo_chosen_loss": 2686.778564453125, "debug/sppo_chosen_reward_in_loss": -1.746645212173462, "debug/sppo_rej_reward_in_loss": -4.084471225738525, "debug/sppo_reject_loss": 2124.232421875, "epoch": 1.0144927536231885, "grad_norm": 64113.010757584976, "learning_rate": 9.821171145746709e-08, "logits/chosen": 1.3448244333267212, "logits/rejected": 1.7952938079833984, "logps/chosen": -261.9836730957031, "logps/rejected": -297.6934814453125, "loss": 4835.6227, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -0.017466451972723007, "rewards/margins": 0.02337825670838356, "rewards/rejected": -0.04084470868110657, "step": 280 }, { "debug/policy_chosen_logits": 1.2289044857025146, "debug/policy_chosen_logps": -260.09234619140625, "debug/policy_rejected_logits": 1.4668129682540894, "debug/policy_rejected_logps": -285.3551940917969, "debug/reference_chosen_logps": -259.2257385253906, "debug/reference_rejected_logps": -282.3316955566406, "debug/sppo_chosen_loss": 2595.444091796875, "debug/sppo_chosen_reward_in_loss": -0.8666107058525085, "debug/sppo_rej_reward_in_loss": -3.0234737396240234, "debug/sppo_reject_loss": 2219.795166015625, "epoch": 1.0326086956521738, "grad_norm": 73460.26213970502, "learning_rate": 9.811162100350039e-08, "logits/chosen": 1.2289044857025146, "logits/rejected": 1.4668129682540894, "logps/chosen": -260.09234619140625, "logps/rejected": -285.3551940917969, "loss": 4821.3758, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -0.008666107431054115, "rewards/margins": 0.021568629890680313, "rewards/rejected": -0.03023473545908928, "step": 285 }, { "debug/policy_chosen_logits": 1.5169494152069092, "debug/policy_chosen_logps": -275.1809997558594, "debug/policy_rejected_logits": 1.6399990320205688, "debug/policy_rejected_logps": -274.2919921875, "debug/reference_chosen_logps": -274.32757568359375, "debug/reference_rejected_logps": -271.07623291015625, "debug/sppo_chosen_loss": 2591.622314453125, "debug/sppo_chosen_reward_in_loss": -0.8534218072891235, "debug/sppo_rej_reward_in_loss": -3.2157363891601562, "debug/sppo_reject_loss": 2204.520263671875, "epoch": 1.0507246376811594, "grad_norm": 78833.13341982767, "learning_rate": 9.80088591027883e-08, "logits/chosen": 1.5169494152069092, "logits/rejected": 1.6399990320205688, "logps/chosen": -275.1809997558594, "logps/rejected": -274.2919921875, "loss": 4839.1961, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -0.00853421725332737, "rewards/margins": 0.023623144254088402, "rewards/rejected": -0.03215736150741577, "step": 290 }, { "debug/policy_chosen_logits": 1.2514266967773438, "debug/policy_chosen_logps": -246.48178100585938, "debug/policy_rejected_logits": 1.6836073398590088, "debug/policy_rejected_logps": -315.2175598144531, "debug/reference_chosen_logps": -245.95834350585938, "debug/reference_rejected_logps": -312.770751953125, "debug/sppo_chosen_loss": 2557.650634765625, "debug/sppo_chosen_reward_in_loss": -0.5234573483467102, "debug/sppo_rej_reward_in_loss": -2.4468045234680176, "debug/sppo_reject_loss": 2270.903564453125, "epoch": 1.068840579710145, "grad_norm": 58519.22419342189, "learning_rate": 9.790343146128999e-08, "logits/chosen": 1.2514266967773438, "logits/rejected": 1.6836073398590088, "logps/chosen": -246.48178100585938, "logps/rejected": -315.2175598144531, "loss": 4841.0684, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -0.005234573967754841, "rewards/margins": 0.019233472645282745, "rewards/rejected": -0.02446804568171501, "step": 295 }, { "debug/policy_chosen_logits": 1.0419118404388428, "debug/policy_chosen_logps": -253.39053344726562, "debug/policy_rejected_logits": 1.3241182565689087, "debug/policy_rejected_logps": -298.21099853515625, "debug/reference_chosen_logps": -251.8573455810547, "debug/reference_rejected_logps": -295.08416748046875, "debug/sppo_chosen_loss": 2667.366943359375, "debug/sppo_chosen_reward_in_loss": -1.5331599712371826, "debug/sppo_rej_reward_in_loss": -3.1267876625061035, "debug/sppo_reject_loss": 2213.57275390625, "epoch": 1.0869565217391304, "grad_norm": 58950.492438903406, "learning_rate": 9.779534393298261e-08, "logits/chosen": 1.0419118404388428, "logits/rejected": 1.3241182565689087, "logps/chosen": -253.39053344726562, "logps/rejected": -298.21099853515625, "loss": 4852.6547, "rewards/accuracies": 0.625, "rewards/chosen": -0.015331600792706013, "rewards/margins": 0.01593627780675888, "rewards/rejected": -0.03126787766814232, "step": 300 }, { "epoch": 1.0869565217391304, "eval_debug/policy_chosen_logits": 1.5894911289215088, "eval_debug/policy_chosen_logps": -253.82180786132812, "eval_debug/policy_rejected_logits": 1.6477311849594116, "eval_debug/policy_rejected_logps": -261.3568115234375, "eval_debug/reference_chosen_logps": -252.91845703125, "eval_debug/reference_rejected_logps": -259.6585998535156, "eval_debug/sppo_chosen_loss": 2599.375244140625, "eval_debug/sppo_chosen_reward_in_loss": -0.9033258557319641, "eval_debug/sppo_rej_reward_in_loss": -1.6981514692306519, "eval_debug/sppo_reject_loss": 2346.007080078125, "eval_logits/chosen": 1.5894911289215088, "eval_logits/rejected": 1.6477311849594116, "eval_logps/chosen": -253.82180786132812, "eval_logps/rejected": -261.3568115234375, "eval_loss": 4861.89599609375, "eval_rewards/accuracies": 0.46052631735801697, "eval_rewards/chosen": -0.009033258073031902, "eval_rewards/margins": 0.007948257029056549, "eval_rewards/rejected": -0.016981516033411026, "eval_runtime": 28.419, "eval_samples_per_second": 21.113, "eval_steps_per_second": 0.669, "step": 300 }, { "debug/policy_chosen_logits": 1.674957036972046, "debug/policy_chosen_logps": -274.6082458496094, "debug/policy_rejected_logits": 1.868452787399292, "debug/policy_rejected_logps": -293.33526611328125, "debug/reference_chosen_logps": -273.9171142578125, "debug/reference_rejected_logps": -290.7630920410156, "debug/sppo_chosen_loss": 2573.90234375, "debug/sppo_chosen_reward_in_loss": -0.6911390423774719, "debug/sppo_rej_reward_in_loss": -2.5721654891967773, "debug/sppo_reject_loss": 2263.828125, "epoch": 1.105072463768116, "grad_norm": 66459.25152549255, "learning_rate": 9.768460251953622e-08, "logits/chosen": 1.674957036972046, "logits/rejected": 1.868452787399292, "logps/chosen": -274.6082458496094, "logps/rejected": -293.33526611328125, "loss": 4867.6438, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -0.006911390461027622, "rewards/margins": 0.018810266628861427, "rewards/rejected": -0.025721654295921326, "step": 305 }, { "debug/policy_chosen_logits": 1.3893417119979858, "debug/policy_chosen_logps": -289.6875915527344, "debug/policy_rejected_logits": 1.6382023096084595, "debug/policy_rejected_logps": -310.7853698730469, "debug/reference_chosen_logps": -288.9293212890625, "debug/reference_rejected_logps": -308.27606201171875, "debug/sppo_chosen_loss": 2581.68017578125, "debug/sppo_chosen_reward_in_loss": -0.7582954168319702, "debug/sppo_rej_reward_in_loss": -2.50929594039917, "debug/sppo_reject_loss": 2264.07470703125, "epoch": 1.1231884057971016, "grad_norm": 75849.96054845196, "learning_rate": 9.757121336998056e-08, "logits/chosen": 1.3893417119979858, "logits/rejected": 1.6382023096084595, "logps/chosen": -289.6875915527344, "logps/rejected": -310.7853698730469, "loss": 4836.8883, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -0.007582955062389374, "rewards/margins": 0.017510006204247475, "rewards/rejected": -0.0250929594039917, "step": 310 }, { "debug/policy_chosen_logits": 0.9951621294021606, "debug/policy_chosen_logps": -221.96142578125, "debug/policy_rejected_logits": 1.386580228805542, "debug/policy_rejected_logps": -279.8023681640625, "debug/reference_chosen_logps": -227.9397430419922, "debug/reference_rejected_logps": -283.32073974609375, "debug/sppo_chosen_loss": 1973.23828125, "debug/sppo_chosen_reward_in_loss": 5.978316783905029, "debug/sppo_rej_reward_in_loss": 3.5184273719787598, "debug/sppo_reject_loss": 2925.00048828125, "epoch": 1.141304347826087, "grad_norm": 127556.90854651273, "learning_rate": 9.745518278036364e-08, "logits/chosen": 0.9951621294021606, "logits/rejected": 1.386580228805542, "logps/chosen": -221.96142578125, "logps/rejected": -279.8023681640625, "loss": 4991.3445, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": 0.059783171862363815, "rewards/margins": 0.024598896503448486, "rewards/rejected": 0.03518427163362503, "step": 315 }, { "debug/policy_chosen_logits": 1.4349600076675415, "debug/policy_chosen_logps": -244.4615936279297, "debug/policy_rejected_logits": 1.6520965099334717, "debug/policy_rejected_logps": -296.48944091796875, "debug/reference_chosen_logps": -250.67617797851562, "debug/reference_rejected_logps": -301.82122802734375, "debug/sppo_chosen_loss": 1937.580322265625, "debug/sppo_chosen_reward_in_loss": 6.214603900909424, "debug/sppo_rej_reward_in_loss": 5.331799507141113, "debug/sppo_reject_loss": 3087.521240234375, "epoch": 1.1594202898550725, "grad_norm": 82651.7113274058, "learning_rate": 9.733651719340206e-08, "logits/chosen": 1.4349600076675415, "logits/rejected": 1.6520965099334717, "logps/chosen": -244.4615936279297, "logps/rejected": -296.48944091796875, "loss": 4934.0418, "rewards/accuracies": 0.625, "rewards/chosen": 0.06214603781700134, "rewards/margins": 0.008828045800328255, "rewards/rejected": 0.05331799387931824, "step": 320 }, { "debug/policy_chosen_logits": 0.9610234498977661, "debug/policy_chosen_logps": -239.1781005859375, "debug/policy_rejected_logits": 1.2378222942352295, "debug/policy_rejected_logps": -296.514892578125, "debug/reference_chosen_logps": -239.85531616210938, "debug/reference_rejected_logps": -296.14971923828125, "debug/sppo_chosen_loss": 2439.22607421875, "debug/sppo_chosen_reward_in_loss": 0.6772235631942749, "debug/sppo_rej_reward_in_loss": -0.3651662766933441, "debug/sppo_reject_loss": 2480.13232421875, "epoch": 1.177536231884058, "grad_norm": 62900.497256271476, "learning_rate": 9.721522319812339e-08, "logits/chosen": 0.9610234498977661, "logits/rejected": 1.2378222942352295, "logps/chosen": -239.1781005859375, "logps/rejected": -296.514892578125, "loss": 4778.1418, "rewards/accuracies": 0.5249999761581421, "rewards/chosen": 0.006772235967218876, "rewards/margins": 0.010423899628221989, "rewards/rejected": -0.003651663661003113, "step": 325 }, { "debug/policy_chosen_logits": 0.8668224215507507, "debug/policy_chosen_logps": -240.64358520507812, "debug/policy_rejected_logits": 1.338693380355835, "debug/policy_rejected_logps": -296.9376525878906, "debug/reference_chosen_logps": -240.9998321533203, "debug/reference_rejected_logps": -293.64276123046875, "debug/sppo_chosen_loss": 2472.85888671875, "debug/sppo_chosen_reward_in_loss": 0.35626524686813354, "debug/sppo_rej_reward_in_loss": -3.2948780059814453, "debug/sppo_reject_loss": 2214.165771484375, "epoch": 1.1956521739130435, "grad_norm": 78794.22306053036, "learning_rate": 9.709130752950023e-08, "logits/chosen": 0.8668224215507507, "logits/rejected": 1.338693380355835, "logps/chosen": -240.64358520507812, "logps/rejected": -296.9376525878906, "loss": 4845.3301, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": 0.0035626522731035948, "rewards/margins": 0.03651143237948418, "rewards/rejected": -0.03294878080487251, "step": 330 }, { "debug/policy_chosen_logits": 1.226872444152832, "debug/policy_chosen_logps": -249.2202911376953, "debug/policy_rejected_logits": 1.4146177768707275, "debug/policy_rejected_logps": -284.56500244140625, "debug/reference_chosen_logps": -247.591064453125, "debug/reference_rejected_logps": -282.10699462890625, "debug/sppo_chosen_loss": 2689.522705078125, "debug/sppo_chosen_reward_in_loss": -1.629233956336975, "debug/sppo_rej_reward_in_loss": -2.4580025672912598, "debug/sppo_reject_loss": 2273.725830078125, "epoch": 1.213768115942029, "grad_norm": 72224.73272645177, "learning_rate": 9.696477706807624e-08, "logits/chosen": 1.226872444152832, "logits/rejected": 1.4146177768707275, "logps/chosen": -249.2202911376953, "logps/rejected": -284.56500244140625, "loss": 4783.2437, "rewards/accuracies": 0.550000011920929, "rewards/chosen": -0.016292337328195572, "rewards/margins": 0.008287685923278332, "rewards/rejected": -0.02458002418279648, "step": 335 }, { "debug/policy_chosen_logits": 1.5981292724609375, "debug/policy_chosen_logps": -253.11019897460938, "debug/policy_rejected_logits": 1.7621173858642578, "debug/policy_rejected_logps": -273.3591003417969, "debug/reference_chosen_logps": -252.81381225585938, "debug/reference_rejected_logps": -270.9890441894531, "debug/sppo_chosen_loss": 2534.176025390625, "debug/sppo_chosen_reward_in_loss": -0.2963891923427582, "debug/sppo_rej_reward_in_loss": -2.3700661659240723, "debug/sppo_reject_loss": 2284.81640625, "epoch": 1.2318840579710144, "grad_norm": 58031.91363262894, "learning_rate": 9.683563883958413e-08, "logits/chosen": 1.5981292724609375, "logits/rejected": 1.7621173858642578, "logps/chosen": -253.11019897460938, "logps/rejected": -273.3591003417969, "loss": 4819.1984, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -0.0029638917185366154, "rewards/margins": 0.020736772567033768, "rewards/rejected": -0.023700661957263947, "step": 340 }, { "debug/policy_chosen_logits": 1.2439069747924805, "debug/policy_chosen_logps": -270.01458740234375, "debug/policy_rejected_logits": 1.4633022546768188, "debug/policy_rejected_logps": -274.4990539550781, "debug/reference_chosen_logps": -269.8506774902344, "debug/reference_rejected_logps": -272.15484619140625, "debug/sppo_chosen_loss": 2524.285888671875, "debug/sppo_chosen_reward_in_loss": -0.1639108657836914, "debug/sppo_rej_reward_in_loss": -2.344228982925415, "debug/sppo_reject_loss": 2287.3037109375, "epoch": 1.25, "grad_norm": 62603.32397855888, "learning_rate": 9.670390001455554e-08, "logits/chosen": 1.2439069747924805, "logits/rejected": 1.4633022546768188, "logps/chosen": -270.01458740234375, "logps/rejected": -274.4990539550781, "loss": 4809.4809, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -0.001639108406379819, "rewards/margins": 0.021803181618452072, "rewards/rejected": -0.023442288860678673, "step": 345 }, { "debug/policy_chosen_logits": 1.034299612045288, "debug/policy_chosen_logps": -244.1306610107422, "debug/policy_rejected_logits": 1.3921207189559937, "debug/policy_rejected_logps": -290.91131591796875, "debug/reference_chosen_logps": -243.61593627929688, "debug/reference_rejected_logps": -288.1432189941406, "debug/sppo_chosen_loss": 2561.4248046875, "debug/sppo_chosen_reward_in_loss": -0.5147092938423157, "debug/sppo_rej_reward_in_loss": -2.7680869102478027, "debug/sppo_reject_loss": 2245.42236328125, "epoch": 1.2681159420289856, "grad_norm": 99341.70642745105, "learning_rate": 9.656956790792285e-08, "logits/chosen": 1.034299612045288, "logits/rejected": 1.3921207189559937, "logps/chosen": -244.1306610107422, "logps/rejected": -290.91131591796875, "loss": 4828.575, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -0.00514709297567606, "rewards/margins": 0.022533774375915527, "rewards/rejected": -0.02768086828291416, "step": 350 }, { "debug/policy_chosen_logits": 1.4998633861541748, "debug/policy_chosen_logps": -259.9757385253906, "debug/policy_rejected_logits": 1.6802341938018799, "debug/policy_rejected_logps": -296.2747497558594, "debug/reference_chosen_logps": -260.3787841796875, "debug/reference_rejected_logps": -293.5425109863281, "debug/sppo_chosen_loss": 2463.8203125, "debug/sppo_chosen_reward_in_loss": 0.40305614471435547, "debug/sppo_rej_reward_in_loss": -2.7322874069213867, "debug/sppo_reject_loss": 2261.598388671875, "epoch": 1.286231884057971, "grad_norm": 59802.412832959686, "learning_rate": 9.643264997861311e-08, "logits/chosen": 1.4998633861541748, "logits/rejected": 1.6802341938018799, "logps/chosen": -259.9757385253906, "logps/rejected": -296.2747497558594, "loss": 4807.6715, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": 0.004030561074614525, "rewards/margins": 0.031353436410427094, "rewards/rejected": -0.027322877198457718, "step": 355 }, { "debug/policy_chosen_logits": 0.8489713668823242, "debug/policy_chosen_logps": -229.5171356201172, "debug/policy_rejected_logits": 1.1113895177841187, "debug/policy_rejected_logps": -260.78619384765625, "debug/reference_chosen_logps": -229.55331420898438, "debug/reference_rejected_logps": -256.6519470214844, "debug/sppo_chosen_loss": 2506.991455078125, "debug/sppo_chosen_reward_in_loss": 0.03618621826171875, "debug/sppo_rej_reward_in_loss": -4.134216785430908, "debug/sppo_reject_loss": 2127.05078125, "epoch": 1.3043478260869565, "grad_norm": 56753.91600403247, "learning_rate": 9.62931538291337e-08, "logits/chosen": 0.8489713668823242, "logits/rejected": 1.1113895177841187, "logps/chosen": -229.5171356201172, "logps/rejected": -260.78619384765625, "loss": 4780.2891, "rewards/accuracies": 0.824999988079071, "rewards/chosen": 0.0003618622140493244, "rewards/margins": 0.04170403257012367, "rewards/rejected": -0.04134216904640198, "step": 360 }, { "debug/policy_chosen_logits": 1.2941102981567383, "debug/policy_chosen_logps": -229.6419677734375, "debug/policy_rejected_logits": 1.855833649635315, "debug/policy_rejected_logps": -298.4671325683594, "debug/reference_chosen_logps": -229.5476531982422, "debug/reference_rejected_logps": -296.23944091796875, "debug/sppo_chosen_loss": 2519.103515625, "debug/sppo_chosen_reward_in_loss": -0.09428653866052628, "debug/sppo_rej_reward_in_loss": -2.227700710296631, "debug/sppo_reject_loss": 2297.76025390625, "epoch": 1.322463768115942, "grad_norm": 64235.39187696898, "learning_rate": 9.615108720515041e-08, "logits/chosen": 1.2941102981567383, "logits/rejected": 1.855833649635315, "logps/chosen": -229.6419677734375, "logps/rejected": -298.4671325683594, "loss": 4844.5879, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -0.00094286521198228, "rewards/margins": 0.021334141492843628, "rewards/rejected": -0.02227700687944889, "step": 365 }, { "debug/policy_chosen_logits": 1.3823152780532837, "debug/policy_chosen_logps": -261.19049072265625, "debug/policy_rejected_logits": 1.6258795261383057, "debug/policy_rejected_logps": -302.4359436035156, "debug/reference_chosen_logps": -260.66339111328125, "debug/reference_rejected_logps": -298.43231201171875, "debug/sppo_chosen_loss": 2561.10693359375, "debug/sppo_chosen_reward_in_loss": -0.5271316766738892, "debug/sppo_rej_reward_in_loss": -4.003632545471191, "debug/sppo_reject_loss": 2143.478759765625, "epoch": 1.3405797101449275, "grad_norm": 62125.362422127815, "learning_rate": 9.600645799505717e-08, "logits/chosen": 1.3823152780532837, "logits/rejected": 1.6258795261383057, "logps/chosen": -261.19049072265625, "logps/rejected": -302.4359436035156, "loss": 4781.7582, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -0.005271316505968571, "rewards/margins": 0.034765005111694336, "rewards/rejected": -0.04003632068634033, "step": 370 }, { "debug/policy_chosen_logits": 1.1877999305725098, "debug/policy_chosen_logps": -255.3175506591797, "debug/policy_rejected_logits": 1.4676315784454346, "debug/policy_rejected_logps": -262.8641662597656, "debug/reference_chosen_logps": -254.73141479492188, "debug/reference_rejected_logps": -260.73736572265625, "debug/sppo_chosen_loss": 2564.98974609375, "debug/sppo_chosen_reward_in_loss": -0.586154580116272, "debug/sppo_rej_reward_in_loss": -2.1267733573913574, "debug/sppo_reject_loss": 2307.00927734375, "epoch": 1.358695652173913, "grad_norm": 58105.264293713124, "learning_rate": 9.585927422953815e-08, "logits/chosen": 1.1877999305725098, "logits/rejected": 1.4676315784454346, "logps/chosen": -255.3175506591797, "logps/rejected": -262.8641662597656, "loss": 4838.052, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -0.005861545447260141, "rewards/margins": 0.015406189486384392, "rewards/rejected": -0.021267732605338097, "step": 375 }, { "debug/policy_chosen_logits": 1.14890456199646, "debug/policy_chosen_logps": -236.107177734375, "debug/policy_rejected_logits": 1.4956175088882446, "debug/policy_rejected_logps": -260.3258056640625, "debug/reference_chosen_logps": -235.4017333984375, "debug/reference_rejected_logps": -256.94818115234375, "debug/sppo_chosen_loss": 2575.2421875, "debug/sppo_chosen_reward_in_loss": -0.7054517865180969, "debug/sppo_rej_reward_in_loss": -3.3776297569274902, "debug/sppo_reject_loss": 2199.29345703125, "epoch": 1.3768115942028984, "grad_norm": 61814.61417535102, "learning_rate": 9.570954408112178e-08, "logits/chosen": 1.14890456199646, "logits/rejected": 1.4956175088882446, "logps/chosen": -236.107177734375, "logps/rejected": -260.3258056640625, "loss": 4763.4984, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -0.007054517511278391, "rewards/margins": 0.026721779257059097, "rewards/rejected": -0.03377629444003105, "step": 380 }, { "debug/policy_chosen_logits": 1.5179424285888672, "debug/policy_chosen_logps": -263.9978942871094, "debug/policy_rejected_logits": 1.6211563348770142, "debug/policy_rejected_logps": -294.53656005859375, "debug/reference_chosen_logps": -262.984375, "debug/reference_rejected_logps": -292.22747802734375, "debug/sppo_chosen_loss": 2611.53662109375, "debug/sppo_chosen_reward_in_loss": -1.0135478973388672, "debug/sppo_rej_reward_in_loss": -2.3091111183166504, "debug/sppo_reject_loss": 2293.55908203125, "epoch": 1.394927536231884, "grad_norm": 75427.62511951616, "learning_rate": 9.555727586372702e-08, "logits/chosen": 1.5179424285888672, "logits/rejected": 1.6211563348770142, "logps/chosen": -263.9978942871094, "logps/rejected": -294.53656005859375, "loss": 4820.6531, "rewards/accuracies": 0.550000011920929, "rewards/chosen": -0.010135479271411896, "rewards/margins": 0.012955631129443645, "rewards/rejected": -0.023091109469532967, "step": 385 }, { "debug/policy_chosen_logits": 1.590163230895996, "debug/policy_chosen_logps": -296.49066162109375, "debug/policy_rejected_logits": 1.3836915493011475, "debug/policy_rejected_logps": -283.4267883300781, "debug/reference_chosen_logps": -296.08233642578125, "debug/reference_rejected_logps": -280.37335205078125, "debug/sppo_chosen_loss": 2550.162109375, "debug/sppo_chosen_reward_in_loss": -0.40835076570510864, "debug/sppo_rej_reward_in_loss": -3.0534708499908447, "debug/sppo_reject_loss": 2222.74755859375, "epoch": 1.4130434782608696, "grad_norm": 62180.159416556795, "learning_rate": 9.540247803220169e-08, "logits/chosen": 1.590163230895996, "logits/rejected": 1.3836915493011475, "logps/chosen": -296.49066162109375, "logps/rejected": -283.4267883300781, "loss": 4742.4914, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -0.004083507228642702, "rewards/margins": 0.026451200246810913, "rewards/rejected": -0.030534708872437477, "step": 390 }, { "debug/policy_chosen_logits": 0.9762474894523621, "debug/policy_chosen_logps": -274.21893310546875, "debug/policy_rejected_logits": 1.2334016561508179, "debug/policy_rejected_logps": -305.20465087890625, "debug/reference_chosen_logps": -273.4296875, "debug/reference_rejected_logps": -303.46014404296875, "debug/sppo_chosen_loss": 2594.09521484375, "debug/sppo_chosen_reward_in_loss": -0.7892316579818726, "debug/sppo_rej_reward_in_loss": -1.744462251663208, "debug/sppo_reject_loss": 2350.6826171875, "epoch": 1.431159420289855, "grad_norm": 94722.70795318228, "learning_rate": 9.524515918185301e-08, "logits/chosen": 0.9762474894523621, "logits/rejected": 1.2334016561508179, "logps/chosen": -274.21893310546875, "logps/rejected": -305.20465087890625, "loss": 4802.1969, "rewards/accuracies": 0.625, "rewards/chosen": -0.00789231713861227, "rewards/margins": 0.009552305564284325, "rewards/rejected": -0.01744462177157402, "step": 395 }, { "debug/policy_chosen_logits": 1.2278473377227783, "debug/policy_chosen_logps": -283.450927734375, "debug/policy_rejected_logits": 1.3526257276535034, "debug/policy_rejected_logps": -314.7835388183594, "debug/reference_chosen_logps": -283.3064270019531, "debug/reference_rejected_logps": -312.33880615234375, "debug/sppo_chosen_loss": 2527.92822265625, "debug/sppo_chosen_reward_in_loss": -0.1445016860961914, "debug/sppo_rej_reward_in_loss": -2.444725275039673, "debug/sppo_reject_loss": 2280.696533203125, "epoch": 1.4492753623188406, "grad_norm": 70038.44434880349, "learning_rate": 9.508532804797034e-08, "logits/chosen": 1.2278473377227783, "logits/rejected": 1.3526257276535034, "logps/chosen": -283.450927734375, "logps/rejected": -314.7835388183594, "loss": 4810.0602, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -0.0014450167072936893, "rewards/margins": 0.02300223708152771, "rewards/rejected": -0.024447252973914146, "step": 400 }, { "epoch": 1.4492753623188406, "eval_debug/policy_chosen_logits": 1.5488909482955933, "eval_debug/policy_chosen_logps": -253.56918334960938, "eval_debug/policy_rejected_logits": 1.603257656097412, "eval_debug/policy_rejected_logps": -261.84649658203125, "eval_debug/reference_chosen_logps": -252.91845703125, "eval_debug/reference_rejected_logps": -259.6585998535156, "eval_debug/sppo_chosen_loss": 2584.198486328125, "eval_debug/sppo_chosen_reward_in_loss": -0.6506962776184082, "eval_debug/sppo_rej_reward_in_loss": -2.187859296798706, "eval_debug/sppo_reject_loss": 2322.553466796875, "eval_logits/chosen": 1.5488909482955933, "eval_logits/rejected": 1.603257656097412, "eval_logps/chosen": -253.56918334960938, "eval_logps/rejected": -261.84649658203125, "eval_loss": 4799.115234375, "eval_rewards/accuracies": 0.5394737124443054, "eval_rewards/chosen": -0.006506962701678276, "eval_rewards/margins": 0.015371627174317837, "eval_rewards/rejected": -0.021878590807318687, "eval_runtime": 28.7384, "eval_samples_per_second": 20.878, "eval_steps_per_second": 0.661, "step": 400 }, { "debug/policy_chosen_logits": 1.3403141498565674, "debug/policy_chosen_logps": -248.9816436767578, "debug/policy_rejected_logits": 1.8409442901611328, "debug/policy_rejected_logps": -277.715576171875, "debug/reference_chosen_logps": -250.1258544921875, "debug/reference_rejected_logps": -276.1705627441406, "debug/sppo_chosen_loss": 2390.42578125, "debug/sppo_chosen_reward_in_loss": 1.1441965103149414, "debug/sppo_rej_reward_in_loss": -1.5450060367584229, "debug/sppo_reject_loss": 2368.06884765625, "epoch": 1.4673913043478262, "grad_norm": 70367.72710076011, "learning_rate": 9.49229935053401e-08, "logits/chosen": 1.3403141498565674, "logits/rejected": 1.8409442901611328, "logps/chosen": -248.9816436767578, "logps/rejected": -277.715576171875, "loss": 4720.8113, "rewards/accuracies": 0.675000011920929, "rewards/chosen": 0.011441965587437153, "rewards/margins": 0.026892025023698807, "rewards/rejected": -0.015450060367584229, "step": 405 }, { "debug/policy_chosen_logits": 1.3786104917526245, "debug/policy_chosen_logps": -244.37557983398438, "debug/policy_rejected_logits": 1.605938196182251, "debug/policy_rejected_logps": -292.2764587402344, "debug/reference_chosen_logps": -244.8103485107422, "debug/reference_rejected_logps": -288.8402404785156, "debug/sppo_chosen_loss": 2460.53564453125, "debug/sppo_chosen_reward_in_loss": 0.4347648620605469, "debug/sppo_rej_reward_in_loss": -3.43621826171875, "debug/sppo_reject_loss": 2192.02734375, "epoch": 1.4855072463768115, "grad_norm": 77871.27716914566, "learning_rate": 9.475816456775311e-08, "logits/chosen": 1.3786104917526245, "logits/rejected": 1.605938196182251, "logps/chosen": -244.37557983398438, "logps/rejected": -292.2764587402344, "loss": 4758.2246, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 0.004347648937255144, "rewards/margins": 0.03870982676744461, "rewards/rejected": -0.034362178295850754, "step": 410 }, { "debug/policy_chosen_logits": 0.9305599927902222, "debug/policy_chosen_logps": -243.4819793701172, "debug/policy_rejected_logits": 1.324118971824646, "debug/policy_rejected_logps": -293.31402587890625, "debug/reference_chosen_logps": -243.03857421875, "debug/reference_rejected_logps": -289.7604064941406, "debug/sppo_chosen_loss": 2563.819580078125, "debug/sppo_chosen_reward_in_loss": -0.44340628385543823, "debug/sppo_rej_reward_in_loss": -3.5536065101623535, "debug/sppo_reject_loss": 2186.4287109375, "epoch": 1.5036231884057971, "grad_norm": 84731.09766930714, "learning_rate": 9.459085038750394e-08, "logits/chosen": 0.9305599927902222, "logits/rejected": 1.324118971824646, "logps/chosen": -243.4819793701172, "logps/rejected": -293.31402587890625, "loss": 4734.184, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -0.004434062633663416, "rewards/margins": 0.031102001667022705, "rewards/rejected": -0.035536061972379684, "step": 415 }, { "debug/policy_chosen_logits": 1.5847978591918945, "debug/policy_chosen_logps": -257.9969787597656, "debug/policy_rejected_logits": 2.149386167526245, "debug/policy_rejected_logps": -308.333251953125, "debug/reference_chosen_logps": -255.08334350585938, "debug/reference_rejected_logps": -302.2669372558594, "debug/sppo_chosen_loss": 2824.914794921875, "debug/sppo_chosen_reward_in_loss": -2.9136509895324707, "debug/sppo_rej_reward_in_loss": -6.066329479217529, "debug/sppo_reject_loss": 1976.962646484375, "epoch": 1.5217391304347827, "grad_norm": 104826.24697622382, "learning_rate": 9.442106025488283e-08, "logits/chosen": 1.5847978591918945, "logits/rejected": 2.149386167526245, "logps/chosen": -257.9969787597656, "logps/rejected": -308.333251953125, "loss": 4816.7281, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -0.02913650870323181, "rewards/margins": 0.03152678534388542, "rewards/rejected": -0.06066329404711723, "step": 420 }, { "debug/policy_chosen_logits": 1.3920035362243652, "debug/policy_chosen_logps": -255.4589080810547, "debug/policy_rejected_logits": 1.937829613685608, "debug/policy_rejected_logps": -310.65008544921875, "debug/reference_chosen_logps": -254.27566528320312, "debug/reference_rejected_logps": -306.13055419921875, "debug/sppo_chosen_loss": 2635.071533203125, "debug/sppo_chosen_reward_in_loss": -1.1832473278045654, "debug/sppo_rej_reward_in_loss": -4.519493579864502, "debug/sppo_reject_loss": 2109.51611328125, "epoch": 1.539855072463768, "grad_norm": 87017.94217456014, "learning_rate": 9.424880359765976e-08, "logits/chosen": 1.3920035362243652, "logits/rejected": 1.937829613685608, "logps/chosen": -255.4589080810547, "logps/rejected": -310.65008544921875, "loss": 4719.7344, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -0.011832473799586296, "rewards/margins": 0.033362459391355515, "rewards/rejected": -0.04519493132829666, "step": 425 }, { "debug/policy_chosen_logits": 0.9032995104789734, "debug/policy_chosen_logps": -243.0791473388672, "debug/policy_rejected_logits": 1.2816048860549927, "debug/policy_rejected_logps": -295.67889404296875, "debug/reference_chosen_logps": -243.094970703125, "debug/reference_rejected_logps": -292.0074462890625, "debug/sppo_chosen_loss": 2512.8564453125, "debug/sppo_chosen_reward_in_loss": 0.015825461596250534, "debug/sppo_rej_reward_in_loss": -3.671428680419922, "debug/sppo_reject_loss": 2182.8984375, "epoch": 1.5579710144927537, "grad_norm": 63772.18252754498, "learning_rate": 9.407408998056104e-08, "logits/chosen": 0.9032995104789734, "logits/rejected": 1.2816048860549927, "logps/chosen": -243.0791473388672, "logps/rejected": -295.67889404296875, "loss": 4704.9863, "rewards/accuracies": 0.75, "rewards/chosen": 0.00015825479931663722, "rewards/margins": 0.036872539669275284, "rewards/rejected": -0.036714281886816025, "step": 430 }, { "debug/policy_chosen_logits": 1.313698410987854, "debug/policy_chosen_logps": -236.9527130126953, "debug/policy_rejected_logits": 1.505110502243042, "debug/policy_rejected_logps": -270.3215637207031, "debug/reference_chosen_logps": -235.29800415039062, "debug/reference_rejected_logps": -266.57958984375, "debug/sppo_chosen_loss": 2689.590087890625, "debug/sppo_chosen_reward_in_loss": -1.6547034978866577, "debug/sppo_rej_reward_in_loss": -3.7419822216033936, "debug/sppo_reject_loss": 2173.85205078125, "epoch": 1.5760869565217392, "grad_norm": 73743.00558373412, "learning_rate": 9.389692910473814e-08, "logits/chosen": 1.313698410987854, "logits/rejected": 1.505110502243042, "logps/chosen": -236.9527130126953, "logps/rejected": -270.3215637207031, "loss": 4785.4102, "rewards/accuracies": 0.625, "rewards/chosen": -0.016547035425901413, "rewards/margins": 0.02087278850376606, "rewards/rejected": -0.037419818341732025, "step": 435 }, { "debug/policy_chosen_logits": 1.3025128841400146, "debug/policy_chosen_logps": -247.7681121826172, "debug/policy_rejected_logits": 1.7691015005111694, "debug/policy_rejected_logps": -306.4404296875, "debug/reference_chosen_logps": -246.77597045898438, "debug/reference_rejected_logps": -302.6429138183594, "debug/sppo_chosen_loss": 2613.018310546875, "debug/sppo_chosen_reward_in_loss": -0.9921543002128601, "debug/sppo_rej_reward_in_loss": -3.7975406646728516, "debug/sppo_reject_loss": 2169.7216796875, "epoch": 1.5942028985507246, "grad_norm": 60935.995908769066, "learning_rate": 9.37173308072291e-08, "logits/chosen": 1.3025128841400146, "logits/rejected": 1.7691015005111694, "logps/chosen": -247.7681121826172, "logps/rejected": -306.4404296875, "loss": 4807.8977, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -0.009921541437506676, "rewards/margins": 0.02805386111140251, "rewards/rejected": -0.03797540441155434, "step": 440 }, { "debug/policy_chosen_logits": 1.3220393657684326, "debug/policy_chosen_logps": -262.1022033691406, "debug/policy_rejected_logits": 1.4996846914291382, "debug/policy_rejected_logps": -287.05889892578125, "debug/reference_chosen_logps": -260.79449462890625, "debug/reference_rejected_logps": -283.6168518066406, "debug/sppo_chosen_loss": 2663.69580078125, "debug/sppo_chosen_reward_in_loss": -1.3077014684677124, "debug/sppo_rej_reward_in_loss": -3.442105531692505, "debug/sppo_reject_loss": 2197.2099609375, "epoch": 1.6123188405797102, "grad_norm": 88654.99949107536, "learning_rate": 9.353530506041226e-08, "logits/chosen": 1.3220393657684326, "logits/rejected": 1.4996846914291382, "logps/chosen": -262.1022033691406, "logps/rejected": -287.05889892578125, "loss": 4738.5777, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -0.013077013194561005, "rewards/margins": 0.02134403958916664, "rewards/rejected": -0.034421052783727646, "step": 445 }, { "debug/policy_chosen_logits": 1.200622797012329, "debug/policy_chosen_logps": -262.21710205078125, "debug/policy_rejected_logits": 1.3046382665634155, "debug/policy_rejected_logps": -259.45977783203125, "debug/reference_chosen_logps": -261.75518798828125, "debug/reference_rejected_logps": -257.6763916015625, "debug/sppo_chosen_loss": 2556.416748046875, "debug/sppo_chosen_reward_in_loss": -0.46194133162498474, "debug/sppo_rej_reward_in_loss": -1.7833735942840576, "debug/sppo_reject_loss": 2343.46044921875, "epoch": 1.6304347826086958, "grad_norm": 61443.35825745383, "learning_rate": 9.335086197145254e-08, "logits/chosen": 1.200622797012329, "logits/rejected": 1.3046382665634155, "logps/chosen": -262.21710205078125, "logps/rejected": -259.45977783203125, "loss": 4789.7859, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -0.004619413521140814, "rewards/margins": 0.013214322738349438, "rewards/rejected": -0.017833735793828964, "step": 450 }, { "debug/policy_chosen_logits": 1.166078805923462, "debug/policy_chosen_logps": -261.41748046875, "debug/policy_rejected_logits": 1.4401895999908447, "debug/policy_rejected_logps": -309.0025329589844, "debug/reference_chosen_logps": -260.24456787109375, "debug/reference_rejected_logps": -304.6410827636719, "debug/sppo_chosen_loss": 2636.66455078125, "debug/sppo_chosen_reward_in_loss": -1.172864317893982, "debug/sppo_rej_reward_in_loss": -4.361422538757324, "debug/sppo_reject_loss": 2135.68115234375, "epoch": 1.6485507246376812, "grad_norm": 64189.45844600206, "learning_rate": 9.31640117817403e-08, "logits/chosen": 1.166078805923462, "logits/rejected": 1.4401895999908447, "logps/chosen": -261.41748046875, "logps/rejected": -309.0025329589844, "loss": 4745.2758, "rewards/accuracies": 0.625, "rewards/chosen": -0.01172864343971014, "rewards/margins": 0.03188558667898178, "rewards/rejected": -0.0436142273247242, "step": 455 }, { "debug/policy_chosen_logits": 1.3456027507781982, "debug/policy_chosen_logps": -230.1571502685547, "debug/policy_rejected_logits": 1.5562714338302612, "debug/policy_rejected_logps": -313.42950439453125, "debug/reference_chosen_logps": -229.99716186523438, "debug/reference_rejected_logps": -310.053955078125, "debug/sppo_chosen_loss": 2534.66064453125, "debug/sppo_chosen_reward_in_loss": -0.15998229384422302, "debug/sppo_rej_reward_in_loss": -3.375528335571289, "debug/sppo_reject_loss": 2215.09814453125, "epoch": 1.6666666666666665, "grad_norm": 67332.59952507944, "learning_rate": 9.297476486632254e-08, "logits/chosen": 1.3456027507781982, "logits/rejected": 1.5562714338302612, "logps/chosen": -230.1571502685547, "logps/rejected": -313.42950439453125, "loss": 4755.9258, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -0.0015998227754607797, "rewards/margins": 0.032155461609363556, "rewards/rejected": -0.03375528007745743, "step": 460 }, { "debug/policy_chosen_logits": 1.2817943096160889, "debug/policy_chosen_logps": -256.0505676269531, "debug/policy_rejected_logits": 1.5631263256072998, "debug/policy_rejected_logps": -262.54083251953125, "debug/reference_chosen_logps": -254.5850372314453, "debug/reference_rejected_logps": -259.1615905761719, "debug/sppo_chosen_loss": 2667.42919921875, "debug/sppo_chosen_reward_in_loss": -1.4655250310897827, "debug/sppo_rej_reward_in_loss": -3.3792197704315186, "debug/sppo_reject_loss": 2197.87646484375, "epoch": 1.6847826086956523, "grad_norm": 60994.94822797016, "learning_rate": 9.278313173332697e-08, "logits/chosen": 1.2817943096160889, "logits/rejected": 1.5631263256072998, "logps/chosen": -256.0505676269531, "logps/rejected": -262.54083251953125, "loss": 4761.1359, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -0.014655251987278461, "rewards/margins": 0.019136948511004448, "rewards/rejected": -0.033792201429605484, "step": 465 }, { "debug/policy_chosen_logits": 1.2423388957977295, "debug/policy_chosen_logps": -234.31753540039062, "debug/policy_rejected_logits": 1.8380489349365234, "debug/policy_rejected_logps": -291.81353759765625, "debug/reference_chosen_logps": -234.3294677734375, "debug/reference_rejected_logps": -288.486083984375, "debug/sppo_chosen_loss": 2512.909912109375, "debug/sppo_chosen_reward_in_loss": 0.011934471316635609, "debug/sppo_rej_reward_in_loss": -3.327467441558838, "debug/sppo_reject_loss": 2219.69287109375, "epoch": 1.7028985507246377, "grad_norm": 59392.99089032479, "learning_rate": 9.25891230233784e-08, "logits/chosen": 1.2423388957977295, "logits/rejected": 1.8380489349365234, "logps/chosen": -234.31753540039062, "logps/rejected": -291.81353759765625, "loss": 4827.7793, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": 0.00011934452049899846, "rewards/margins": 0.033394016325473785, "rewards/rejected": -0.03327467292547226, "step": 470 }, { "debug/policy_chosen_logits": 1.3661912679672241, "debug/policy_chosen_logps": -267.3763427734375, "debug/policy_rejected_logits": 1.5575473308563232, "debug/policy_rejected_logps": -279.25225830078125, "debug/reference_chosen_logps": -268.66082763671875, "debug/reference_rejected_logps": -276.802490234375, "debug/sppo_chosen_loss": 2395.666748046875, "debug/sppo_chosen_reward_in_loss": 1.284515142440796, "debug/sppo_rej_reward_in_loss": -2.449761390686035, "debug/sppo_reject_loss": 2320.32177734375, "epoch": 1.721014492753623, "grad_norm": 137145.32835366024, "learning_rate": 9.239274950900804e-08, "logits/chosen": 1.3661912679672241, "logits/rejected": 1.5575473308563232, "logps/chosen": -267.3763427734375, "logps/rejected": -279.25225830078125, "loss": 4731.8875, "rewards/accuracies": 0.625, "rewards/chosen": 0.012845151126384735, "rewards/margins": 0.03734276443719864, "rewards/rejected": -0.024497613310813904, "step": 475 }, { "debug/policy_chosen_logits": 1.221142053604126, "debug/policy_chosen_logps": -253.66561889648438, "debug/policy_rejected_logits": 1.3845961093902588, "debug/policy_rejected_logps": -257.0927734375, "debug/reference_chosen_logps": -257.324951171875, "debug/reference_rejected_logps": -257.91546630859375, "debug/sppo_chosen_loss": 2159.9609375, "debug/sppo_chosen_reward_in_loss": 3.659325122833252, "debug/sppo_rej_reward_in_loss": 0.8226556777954102, "debug/sppo_reject_loss": 2633.194091796875, "epoch": 1.7391304347826086, "grad_norm": 100541.81048766572, "learning_rate": 9.219402209405519e-08, "logits/chosen": 1.221142053604126, "logits/rejected": 1.3845961093902588, "logps/chosen": -253.66561889648438, "logps/rejected": -257.0927734375, "loss": 4748.7293, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 0.036593250930309296, "rewards/margins": 0.0283666905015707, "rewards/rejected": 0.008226556703448296, "step": 480 }, { "debug/policy_chosen_logits": 1.4586317539215088, "debug/policy_chosen_logps": -274.37286376953125, "debug/policy_rejected_logits": 1.7072410583496094, "debug/policy_rejected_logps": -322.7748107910156, "debug/reference_chosen_logps": -275.3488464355469, "debug/reference_rejected_logps": -318.627685546875, "debug/sppo_chosen_loss": 2421.141357421875, "debug/sppo_chosen_reward_in_loss": 0.975965678691864, "debug/sppo_rej_reward_in_loss": -4.1471357345581055, "debug/sppo_reject_loss": 2191.396484375, "epoch": 1.7572463768115942, "grad_norm": 59327.87743261923, "learning_rate": 9.19929518130619e-08, "logits/chosen": 1.4586317539215088, "logits/rejected": 1.7072410583496094, "logps/chosen": -274.37286376953125, "logps/rejected": -322.7748107910156, "loss": 4686.2148, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": 0.00975965615361929, "rewards/margins": 0.0512310154736042, "rewards/rejected": -0.04147135466337204, "step": 485 }, { "debug/policy_chosen_logits": 1.4661785364151, "debug/policy_chosen_logps": -258.8686828613281, "debug/policy_rejected_logits": 1.7450615167617798, "debug/policy_rejected_logps": -294.00946044921875, "debug/reference_chosen_logps": -258.397216796875, "debug/reference_rejected_logps": -290.43145751953125, "debug/sppo_chosen_loss": 2561.934814453125, "debug/sppo_chosen_reward_in_loss": -0.47144660353660583, "debug/sppo_rej_reward_in_loss": -3.578030824661255, "debug/sppo_reject_loss": 2190.13330078125, "epoch": 1.7753623188405796, "grad_norm": 116417.92157649665, "learning_rate": 9.178954983066031e-08, "logits/chosen": 1.4661785364151, "logits/rejected": 1.7450615167617798, "logps/chosen": -258.8686828613281, "logps/rejected": -294.00946044921875, "loss": 4626.4297, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -0.004714466631412506, "rewards/margins": 0.031065840274095535, "rewards/rejected": -0.03578030690550804, "step": 490 }, { "debug/policy_chosen_logits": 1.1955199241638184, "debug/policy_chosen_logps": -232.2951202392578, "debug/policy_rejected_logits": 1.4762752056121826, "debug/policy_rejected_logps": -290.5370178222656, "debug/reference_chosen_logps": -231.7759246826172, "debug/reference_rejected_logps": -286.0237121582031, "debug/sppo_chosen_loss": 2573.547607421875, "debug/sppo_chosen_reward_in_loss": -0.5191976428031921, "debug/sppo_rej_reward_in_loss": -4.513314247131348, "debug/sppo_reject_loss": 2124.11328125, "epoch": 1.7934782608695652, "grad_norm": 65420.3826874141, "learning_rate": 9.15838274409526e-08, "logits/chosen": 1.1955199241638184, "logits/rejected": 1.4762752056121826, "logps/chosen": -232.2951202392578, "logps/rejected": -290.5370178222656, "loss": 4745.8953, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -0.005191975738853216, "rewards/margins": 0.03994116559624672, "rewards/rejected": -0.045133139938116074, "step": 495 }, { "debug/policy_chosen_logits": 1.2134708166122437, "debug/policy_chosen_logps": -261.5220031738281, "debug/policy_rejected_logits": 1.4601694345474243, "debug/policy_rejected_logps": -268.22821044921875, "debug/reference_chosen_logps": -260.53173828125, "debug/reference_rejected_logps": -265.97222900390625, "debug/sppo_chosen_loss": 2622.430419921875, "debug/sppo_chosen_reward_in_loss": -0.9902515411376953, "debug/sppo_rej_reward_in_loss": -2.25596022605896, "debug/sppo_reject_loss": 2303.806884765625, "epoch": 1.8115942028985508, "grad_norm": 73796.98161312056, "learning_rate": 9.13757960668839e-08, "logits/chosen": 1.2134708166122437, "logits/rejected": 1.4601694345474243, "logps/chosen": -261.5220031738281, "logps/rejected": -268.22821044921875, "loss": 4686.3855, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -0.00990251637995243, "rewards/margins": 0.012657088227570057, "rewards/rejected": -0.022559601813554764, "step": 500 }, { "epoch": 1.8115942028985508, "eval_debug/policy_chosen_logits": 1.5348409414291382, "eval_debug/policy_chosen_logps": -254.37594604492188, "eval_debug/policy_rejected_logits": 1.5898981094360352, "eval_debug/policy_rejected_logps": -263.16796875, "eval_debug/reference_chosen_logps": -252.91845703125, "eval_debug/reference_rejected_logps": -259.6585998535156, "eval_debug/sppo_chosen_loss": 2678.08642578125, "eval_debug/sppo_chosen_reward_in_loss": -1.4574708938598633, "eval_debug/sppo_rej_reward_in_loss": -3.509323835372925, "eval_debug/sppo_reject_loss": 2224.341552734375, "eval_logits/chosen": 1.5348409414291382, "eval_logits/rejected": 1.5898981094360352, "eval_logps/chosen": -254.37594604492188, "eval_logps/rejected": -263.16796875, "eval_loss": 4767.90185546875, "eval_rewards/accuracies": 0.5131579041481018, "eval_rewards/chosen": -0.014574708417057991, "eval_rewards/margins": 0.020518526434898376, "eval_rewards/rejected": -0.03509323671460152, "eval_runtime": 28.3258, "eval_samples_per_second": 21.182, "eval_steps_per_second": 0.671, "step": 500 }, { "debug/policy_chosen_logits": 1.278747320175171, "debug/policy_chosen_logps": -251.4324188232422, "debug/policy_rejected_logits": 1.3868252038955688, "debug/policy_rejected_logps": -285.3130798339844, "debug/reference_chosen_logps": -250.66650390625, "debug/reference_rejected_logps": -280.2650451660156, "debug/sppo_chosen_loss": 2590.232177734375, "debug/sppo_chosen_reward_in_loss": -0.7659379839897156, "debug/sppo_rej_reward_in_loss": -5.048047065734863, "debug/sppo_reject_loss": 2071.307373046875, "epoch": 1.8297101449275361, "grad_norm": 65502.62694895854, "learning_rate": 9.11654672596081e-08, "logits/chosen": 1.278747320175171, "logits/rejected": 1.3868252038955688, "logps/chosen": -251.4324188232422, "logps/rejected": -285.3130798339844, "loss": 4641.1789, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -0.0076593803241848946, "rewards/margins": 0.04282108694314957, "rewards/rejected": -0.05048046633601189, "step": 505 }, { "debug/policy_chosen_logits": 1.4022386074066162, "debug/policy_chosen_logps": -257.87750244140625, "debug/policy_rejected_logits": 1.592950701713562, "debug/policy_rejected_logps": -300.3731689453125, "debug/reference_chosen_logps": -257.995361328125, "debug/reference_rejected_logps": -295.6236267089844, "debug/sppo_chosen_loss": 2500.1318359375, "debug/sppo_chosen_reward_in_loss": 0.11788959801197052, "debug/sppo_rej_reward_in_loss": -4.749524116516113, "debug/sppo_reject_loss": 2101.89306640625, "epoch": 1.8478260869565217, "grad_norm": 90656.15151946226, "learning_rate": 9.095285269784641e-08, "logits/chosen": 1.4022386074066162, "logits/rejected": 1.592950701713562, "logps/chosen": -257.87750244140625, "logps/rejected": -300.3731689453125, "loss": 4618.7891, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 0.0011788962874561548, "rewards/margins": 0.04867414012551308, "rewards/rejected": -0.047495242208242416, "step": 510 }, { "debug/policy_chosen_logits": 1.4164687395095825, "debug/policy_chosen_logps": -256.8494873046875, "debug/policy_rejected_logits": 1.567403793334961, "debug/policy_rejected_logps": -271.0446472167969, "debug/reference_chosen_logps": -255.5850830078125, "debug/reference_rejected_logps": -265.862548828125, "debug/sppo_chosen_loss": 2653.49267578125, "debug/sppo_chosen_reward_in_loss": -1.2644188404083252, "debug/sppo_rej_reward_in_loss": -5.182098388671875, "debug/sppo_reject_loss": 2071.83935546875, "epoch": 1.8659420289855073, "grad_norm": 58337.062881640195, "learning_rate": 9.073796418723882e-08, "logits/chosen": 1.4164687395095825, "logits/rejected": 1.567403793334961, "logps/chosen": -256.8494873046875, "logps/rejected": -271.0446472167969, "loss": 4786.3289, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -0.012644186615943909, "rewards/margins": 0.03917679563164711, "rewards/rejected": -0.05182098224759102, "step": 515 }, { "debug/policy_chosen_logits": 1.5307259559631348, "debug/policy_chosen_logps": -268.090576171875, "debug/policy_rejected_logits": 1.6045730113983154, "debug/policy_rejected_logps": -283.01275634765625, "debug/reference_chosen_logps": -267.4107360839844, "debug/reference_rejected_logps": -279.47845458984375, "debug/sppo_chosen_loss": 2597.373046875, "debug/sppo_chosen_reward_in_loss": -0.6798439025878906, "debug/sppo_rej_reward_in_loss": -3.5343170166015625, "debug/sppo_reject_loss": 2190.690673828125, "epoch": 1.8840579710144927, "grad_norm": 61099.75772656539, "learning_rate": 9.05208136596887e-08, "logits/chosen": 1.5307259559631348, "logits/rejected": 1.6045730113983154, "logps/chosen": -268.090576171875, "logps/rejected": -283.01275634765625, "loss": 4765.5648, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -0.006798438727855682, "rewards/margins": 0.028544727712869644, "rewards/rejected": -0.03534316644072533, "step": 520 }, { "debug/policy_chosen_logits": 1.2059051990509033, "debug/policy_chosen_logps": -255.1666717529297, "debug/policy_rejected_logits": 1.6417697668075562, "debug/policy_rejected_logps": -289.1039123535156, "debug/reference_chosen_logps": -254.3158416748047, "debug/reference_rejected_logps": -288.393310546875, "debug/sppo_chosen_loss": 2654.73095703125, "debug/sppo_chosen_reward_in_loss": -0.8508337140083313, "debug/sppo_rej_reward_in_loss": -0.7106183767318726, "debug/sppo_reject_loss": 2452.39599609375, "epoch": 1.9021739130434783, "grad_norm": 69150.2726035533, "learning_rate": 9.030141317270026e-08, "logits/chosen": 1.2059051990509033, "logits/rejected": 1.6417697668075562, "logps/chosen": -255.1666717529297, "logps/rejected": -289.1039123535156, "loss": 4758.4645, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -0.00850833673030138, "rewards/margins": -0.0014021530514582992, "rewards/rejected": -0.0071061840280890465, "step": 525 }, { "debug/policy_chosen_logits": 1.1592168807983398, "debug/policy_chosen_logps": -254.8201446533203, "debug/policy_rejected_logits": 1.2738714218139648, "debug/policy_rejected_logps": -289.788330078125, "debug/reference_chosen_logps": -253.6742401123047, "debug/reference_rejected_logps": -286.6373596191406, "debug/sppo_chosen_loss": 2657.622314453125, "debug/sppo_chosen_reward_in_loss": -1.1458953619003296, "debug/sppo_rej_reward_in_loss": -3.150979518890381, "debug/sppo_reject_loss": 2247.098388671875, "epoch": 1.9202898550724639, "grad_norm": 62523.02200109494, "learning_rate": 9.007977490870885e-08, "logits/chosen": 1.1592168807983398, "logits/rejected": 1.2738714218139648, "logps/chosen": -254.8201446533203, "logps/rejected": -289.788330078125, "loss": 4778.1098, "rewards/accuracies": 0.625, "rewards/chosen": -0.011458953842520714, "rewards/margins": 0.020050838589668274, "rewards/rejected": -0.03150979429483414, "step": 530 }, { "debug/policy_chosen_logits": 1.4269134998321533, "debug/policy_chosen_logps": -235.1399688720703, "debug/policy_rejected_logits": 1.690899133682251, "debug/policy_rejected_logps": -264.2394104003906, "debug/reference_chosen_logps": -234.9927520751953, "debug/reference_rejected_logps": -260.7703552246094, "debug/sppo_chosen_loss": 2530.692138671875, "debug/sppo_chosen_reward_in_loss": -0.14720706641674042, "debug/sppo_rej_reward_in_loss": -3.4690022468566895, "debug/sppo_reject_loss": 2216.19677734375, "epoch": 1.9384057971014492, "grad_norm": 78624.62327767034, "learning_rate": 8.985591117440483e-08, "logits/chosen": 1.4269134998321533, "logits/rejected": 1.690899133682251, "logps/chosen": -235.1399688720703, "logps/rejected": -264.2394104003906, "loss": 4685.6562, "rewards/accuracies": 0.625, "rewards/chosen": -0.0014720701146870852, "rewards/margins": 0.03321795165538788, "rewards/rejected": -0.034690018743276596, "step": 535 }, { "debug/policy_chosen_logits": 1.0426114797592163, "debug/policy_chosen_logps": -249.03506469726562, "debug/policy_rejected_logits": 1.3688905239105225, "debug/policy_rejected_logps": -278.43450927734375, "debug/reference_chosen_logps": -248.1215362548828, "debug/reference_rejected_logps": -276.47796630859375, "debug/sppo_chosen_loss": 2628.769775390625, "debug/sppo_chosen_reward_in_loss": -0.9134899377822876, "debug/sppo_rej_reward_in_loss": -1.9565985202789307, "debug/sppo_reject_loss": 2334.08935546875, "epoch": 1.9565217391304348, "grad_norm": 64105.71961637225, "learning_rate": 8.962983440004998e-08, "logits/chosen": 1.0426114797592163, "logits/rejected": 1.3688905239105225, "logps/chosen": -249.03506469726562, "logps/rejected": -278.43450927734375, "loss": 4719.7977, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -0.009134897962212563, "rewards/margins": 0.010431085713207722, "rewards/rejected": -0.01956598274409771, "step": 540 }, { "debug/policy_chosen_logits": 1.1802746057510376, "debug/policy_chosen_logps": -262.9992980957031, "debug/policy_rejected_logits": 1.4178202152252197, "debug/policy_rejected_logps": -295.21051025390625, "debug/reference_chosen_logps": -261.45489501953125, "debug/reference_rejected_logps": -292.7149658203125, "debug/sppo_chosen_loss": 2691.10888671875, "debug/sppo_chosen_reward_in_loss": -1.5444284677505493, "debug/sppo_rej_reward_in_loss": -2.495530128479004, "debug/sppo_reject_loss": 2297.989501953125, "epoch": 1.9746376811594204, "grad_norm": 58590.81549818591, "learning_rate": 8.940155713878738e-08, "logits/chosen": 1.1802746057510376, "logits/rejected": 1.4178202152252197, "logps/chosen": -262.9992980957031, "logps/rejected": -295.21051025390625, "loss": 4644.4012, "rewards/accuracies": 0.5, "rewards/chosen": -0.015444284304976463, "rewards/margins": 0.009511016309261322, "rewards/rejected": -0.024955300614237785, "step": 545 }, { "debug/policy_chosen_logits": 0.9551759958267212, "debug/policy_chosen_logps": -247.09033203125, "debug/policy_rejected_logits": 1.022687315940857, "debug/policy_rejected_logps": -282.8631591796875, "debug/reference_chosen_logps": -247.67520141601562, "debug/reference_rejected_logps": -279.9501647949219, "debug/sppo_chosen_loss": 2454.7587890625, "debug/sppo_chosen_reward_in_loss": 0.5848686099052429, "debug/sppo_rej_reward_in_loss": -2.9129879474639893, "debug/sppo_reject_loss": 2258.426513671875, "epoch": 1.9927536231884058, "grad_norm": 64771.314710374965, "learning_rate": 8.91710920659444e-08, "logits/chosen": 0.9551759958267212, "logits/rejected": 1.022687315940857, "logps/chosen": -247.09033203125, "logps/rejected": -282.8631591796875, "loss": 4761.4836, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 0.005848685745149851, "rewards/margins": 0.03497856482863426, "rewards/rejected": -0.02912987396121025, "step": 550 }, { "debug/policy_chosen_logits": 1.0029628276824951, "debug/policy_chosen_logps": -234.5623016357422, "debug/policy_rejected_logits": 1.6541814804077148, "debug/policy_rejected_logps": -285.01568603515625, "debug/reference_chosen_logps": -234.8700714111328, "debug/reference_rejected_logps": -281.872314453125, "debug/sppo_chosen_loss": 2493.84619140625, "debug/sppo_chosen_reward_in_loss": 0.307760626077652, "debug/sppo_rej_reward_in_loss": -3.143383502960205, "debug/sppo_reject_loss": 2232.835205078125, "epoch": 2.010869565217391, "grad_norm": 69525.29332871876, "learning_rate": 8.89384519783289e-08, "logits/chosen": 1.0029628276824951, "logits/rejected": 1.6541814804077148, "logps/chosen": -234.5623016357422, "logps/rejected": -285.01568603515625, "loss": 4813.4883, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": 0.003077606437727809, "rewards/margins": 0.03451143950223923, "rewards/rejected": -0.0314338319003582, "step": 555 }, { "debug/policy_chosen_logits": 1.5291016101837158, "debug/policy_chosen_logps": -235.267822265625, "debug/policy_rejected_logits": 1.787105917930603, "debug/policy_rejected_logps": -272.7067565917969, "debug/reference_chosen_logps": -236.3900604248047, "debug/reference_rejected_logps": -269.92254638671875, "debug/sppo_chosen_loss": 2397.44677734375, "debug/sppo_chosen_reward_in_loss": 1.1222379207611084, "debug/sppo_rej_reward_in_loss": -2.7842187881469727, "debug/sppo_reject_loss": 2279.22802734375, "epoch": 2.028985507246377, "grad_norm": 62880.48737554153, "learning_rate": 8.87036497935186e-08, "logits/chosen": 1.5291016101837158, "logits/rejected": 1.787105917930603, "logps/chosen": -235.267822265625, "logps/rejected": -272.7067565917969, "loss": 4683.1055, "rewards/accuracies": 0.625, "rewards/chosen": 0.01122237928211689, "rewards/margins": 0.03906456381082535, "rewards/rejected": -0.027842188253998756, "step": 560 }, { "debug/policy_chosen_logits": 1.333370566368103, "debug/policy_chosen_logps": -250.8878173828125, "debug/policy_rejected_logits": 1.6568689346313477, "debug/policy_rejected_logps": -294.1556091308594, "debug/reference_chosen_logps": -251.97714233398438, "debug/reference_rejected_logps": -290.07403564453125, "debug/sppo_chosen_loss": 2402.43994140625, "debug/sppo_chosen_reward_in_loss": 1.0893455743789673, "debug/sppo_rej_reward_in_loss": -4.081561088562012, "debug/sppo_reject_loss": 2164.110107421875, "epoch": 2.0471014492753623, "grad_norm": 67652.64805516161, "learning_rate": 8.846669854914395e-08, "logits/chosen": 1.333370566368103, "logits/rejected": 1.6568689346313477, "logps/chosen": -250.8878173828125, "logps/rejected": -294.1556091308594, "loss": 4661.5477, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": 0.01089345570653677, "rewards/margins": 0.051709067076444626, "rewards/rejected": -0.04081561043858528, "step": 565 }, { "debug/policy_chosen_logits": 1.3098406791687012, "debug/policy_chosen_logps": -255.63565063476562, "debug/policy_rejected_logits": 1.6657825708389282, "debug/policy_rejected_logps": -292.25701904296875, "debug/reference_chosen_logps": -254.9364776611328, "debug/reference_rejected_logps": -289.62884521484375, "debug/sppo_chosen_loss": 2624.39404296875, "debug/sppo_chosen_reward_in_loss": -0.6991499066352844, "debug/sppo_rej_reward_in_loss": -2.6281533241271973, "debug/sppo_reject_loss": 2296.69287109375, "epoch": 2.0652173913043477, "grad_norm": 72051.52919968299, "learning_rate": 8.8227611402164e-08, "logits/chosen": 1.3098406791687012, "logits/rejected": 1.6657825708389282, "logps/chosen": -255.63565063476562, "logps/rejected": -292.25701904296875, "loss": 4720.9078, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -0.006991499103605747, "rewards/margins": 0.019290033727884293, "rewards/rejected": -0.026281530037522316, "step": 570 }, { "debug/policy_chosen_logits": 1.2542155981063843, "debug/policy_chosen_logps": -253.7539825439453, "debug/policy_rejected_logits": 1.5640454292297363, "debug/policy_rejected_logps": -275.55792236328125, "debug/reference_chosen_logps": -253.73959350585938, "debug/reference_rejected_logps": -272.0418395996094, "debug/sppo_chosen_loss": 2514.656982421875, "debug/sppo_chosen_reward_in_loss": -0.014367866329848766, "debug/sppo_rej_reward_in_loss": -3.5160465240478516, "debug/sppo_reject_loss": 2216.046630859375, "epoch": 2.0833333333333335, "grad_norm": 71283.46672505948, "learning_rate": 8.798640162813607e-08, "logits/chosen": 1.2542155981063843, "logits/rejected": 1.5640454292297363, "logps/chosen": -253.7539825439453, "logps/rejected": -275.55792236328125, "loss": 4628.4422, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -0.00014367885887622833, "rewards/margins": 0.03501678630709648, "rewards/rejected": -0.03516046330332756, "step": 575 }, { "debug/policy_chosen_logits": 0.8257444500923157, "debug/policy_chosen_logps": -239.25830078125, "debug/policy_rejected_logits": 1.225967526435852, "debug/policy_rejected_logps": -295.7880859375, "debug/reference_chosen_logps": -238.2715606689453, "debug/reference_rejected_logps": -293.7236022949219, "debug/sppo_chosen_loss": 2639.497802734375, "debug/sppo_chosen_reward_in_loss": -0.9867492914199829, "debug/sppo_rej_reward_in_loss": -2.0644497871398926, "debug/sppo_reject_loss": 2316.61572265625, "epoch": 2.101449275362319, "grad_norm": 64002.07229866191, "learning_rate": 8.774308262047847e-08, "logits/chosen": 0.8257444500923157, "logits/rejected": 1.225967526435852, "logps/chosen": -239.25830078125, "logps/rejected": -295.7880859375, "loss": 4701.6742, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -0.009867492131888866, "rewards/margins": 0.010777004063129425, "rewards/rejected": -0.020644497126340866, "step": 580 }, { "debug/policy_chosen_logits": 1.3923568725585938, "debug/policy_chosen_logps": -267.9619140625, "debug/policy_rejected_logits": 1.6333366632461548, "debug/policy_rejected_logps": -270.2325134277344, "debug/reference_chosen_logps": -268.8836975097656, "debug/reference_rejected_logps": -266.7135314941406, "debug/sppo_chosen_loss": 2432.814453125, "debug/sppo_chosen_reward_in_loss": 0.9217990636825562, "debug/sppo_rej_reward_in_loss": -3.519031047821045, "debug/sppo_reject_loss": 2213.14453125, "epoch": 2.119565217391304, "grad_norm": 75378.71430621752, "learning_rate": 8.749766788972685e-08, "logits/chosen": 1.3923568725585938, "logits/rejected": 1.6333366632461548, "logps/chosen": -267.9619140625, "logps/rejected": -270.2325134277344, "loss": 4584.3477, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 0.009217990562319756, "rewards/margins": 0.04440830275416374, "rewards/rejected": -0.03519031032919884, "step": 585 }, { "debug/policy_chosen_logits": 1.3195666074752808, "debug/policy_chosen_logps": -250.1842803955078, "debug/policy_rejected_logits": 1.6939647197723389, "debug/policy_rejected_logps": -320.55340576171875, "debug/reference_chosen_logps": -250.12686157226562, "debug/reference_rejected_logps": -315.68890380859375, "debug/sppo_chosen_loss": 2529.84912109375, "debug/sppo_chosen_reward_in_loss": -0.057431600987911224, "debug/sppo_rej_reward_in_loss": -4.864499092102051, "debug/sppo_reject_loss": 2121.0732421875, "epoch": 2.13768115942029, "grad_norm": 61422.39294040227, "learning_rate": 8.725017106278406e-08, "logits/chosen": 1.3195666074752808, "logits/rejected": 1.6939647197723389, "logps/chosen": -250.1842803955078, "logps/rejected": -320.55340576171875, "loss": 4650.8539, "rewards/accuracies": 0.75, "rewards/chosen": -0.0005743157234974205, "rewards/margins": 0.04807067662477493, "rewards/rejected": -0.04864499717950821, "step": 590 }, { "debug/policy_chosen_logits": 0.9939780235290527, "debug/policy_chosen_logps": -235.31338500976562, "debug/policy_rejected_logits": 1.3324755430221558, "debug/policy_rejected_logps": -293.82421875, "debug/reference_chosen_logps": -235.8861541748047, "debug/reference_rejected_logps": -288.54498291015625, "debug/sppo_chosen_loss": 2456.348876953125, "debug/sppo_chosen_reward_in_loss": 0.5727742910385132, "debug/sppo_rej_reward_in_loss": -5.279238700866699, "debug/sppo_reject_loss": 2089.032470703125, "epoch": 2.1557971014492754, "grad_norm": 120561.10121499117, "learning_rate": 8.700060588216336e-08, "logits/chosen": 0.9939780235290527, "logits/rejected": 1.3324755430221558, "logps/chosen": -235.31338500976562, "logps/rejected": -293.82421875, "loss": 4629.7258, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": 0.005727742798626423, "rewards/margins": 0.0585201270878315, "rewards/rejected": -0.05279238149523735, "step": 595 }, { "debug/policy_chosen_logits": 1.170904278755188, "debug/policy_chosen_logps": -241.5663604736328, "debug/policy_rejected_logits": 1.3933851718902588, "debug/policy_rejected_logps": -263.28515625, "debug/reference_chosen_logps": -241.9515838623047, "debug/reference_rejected_logps": -258.27471923828125, "debug/sppo_chosen_loss": 2477.68212890625, "debug/sppo_chosen_reward_in_loss": 0.38524895906448364, "debug/sppo_rej_reward_in_loss": -5.010422706604004, "debug/sppo_reject_loss": 2074.12890625, "epoch": 2.1739130434782608, "grad_norm": 69013.1201950737, "learning_rate": 8.674898620522557e-08, "logits/chosen": 1.170904278755188, "logits/rejected": 1.3933851718902588, "logps/chosen": -241.5663604736328, "logps/rejected": -263.28515625, "loss": 4647.1707, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 0.003852488938719034, "rewards/margins": 0.05395671725273132, "rewards/rejected": -0.050104230642318726, "step": 600 }, { "epoch": 2.1739130434782608, "eval_debug/policy_chosen_logits": 1.5053505897521973, "eval_debug/policy_chosen_logps": -253.22561645507812, "eval_debug/policy_rejected_logits": 1.5586278438568115, "eval_debug/policy_rejected_logps": -262.3002624511719, "eval_debug/reference_chosen_logps": -252.91845703125, "eval_debug/reference_rejected_logps": -259.6585998535156, "eval_debug/sppo_chosen_loss": 2562.319091796875, "eval_debug/sppo_chosen_reward_in_loss": -0.30712229013442993, "eval_debug/sppo_rej_reward_in_loss": -2.6416518688201904, "eval_debug/sppo_reject_loss": 2304.574462890625, "eval_logits/chosen": 1.5053505897521973, "eval_logits/rejected": 1.5586278438568115, "eval_logps/chosen": -253.22561645507812, "eval_logps/rejected": -262.3002624511719, "eval_loss": 4725.65478515625, "eval_rewards/accuracies": 0.5394737124443054, "eval_rewards/chosen": -0.0030712224543094635, "eval_rewards/margins": 0.02334529533982277, "eval_rewards/rejected": -0.026416515931487083, "eval_runtime": 28.4753, "eval_samples_per_second": 21.071, "eval_steps_per_second": 0.667, "step": 600 }, { "debug/policy_chosen_logits": 1.2302472591400146, "debug/policy_chosen_logps": -245.91958618164062, "debug/policy_rejected_logits": 1.8347580432891846, "debug/policy_rejected_logps": -305.92706298828125, "debug/reference_chosen_logps": -245.0672607421875, "debug/reference_rejected_logps": -303.65692138671875, "debug/sppo_chosen_loss": 2615.14404296875, "debug/sppo_chosen_reward_in_loss": -0.8523017764091492, "debug/sppo_rej_reward_in_loss": -2.2700839042663574, "debug/sppo_reject_loss": 2321.53369140625, "epoch": 2.1920289855072466, "grad_norm": 68805.59854839399, "learning_rate": 8.649532600340945e-08, "logits/chosen": 1.2302472591400146, "logits/rejected": 1.8347580432891846, "logps/chosen": -245.91958618164062, "logps/rejected": -305.92706298828125, "loss": 4716.0961, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -0.008523017168045044, "rewards/margins": 0.01417782437056303, "rewards/rejected": -0.0227008406072855, "step": 605 }, { "debug/policy_chosen_logits": 1.2323944568634033, "debug/policy_chosen_logps": -251.2030487060547, "debug/policy_rejected_logits": 1.3216893672943115, "debug/policy_rejected_logps": -264.74530029296875, "debug/reference_chosen_logps": -251.7701873779297, "debug/reference_rejected_logps": -263.05694580078125, "debug/sppo_chosen_loss": 2460.92529296875, "debug/sppo_chosen_reward_in_loss": 0.5671443939208984, "debug/sppo_rej_reward_in_loss": -1.688367247581482, "debug/sppo_reject_loss": 2375.023681640625, "epoch": 2.210144927536232, "grad_norm": 89575.39371679806, "learning_rate": 8.6239639361456e-08, "logits/chosen": 1.2323944568634033, "logits/rejected": 1.3216893672943115, "logps/chosen": -251.2030487060547, "logps/rejected": -264.74530029296875, "loss": 4727.2539, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 0.005671444348990917, "rewards/margins": 0.022555116564035416, "rewards/rejected": -0.016883673146367073, "step": 610 }, { "debug/policy_chosen_logits": 1.2631947994232178, "debug/policy_chosen_logps": -281.2205505371094, "debug/policy_rejected_logits": 1.4563754796981812, "debug/policy_rejected_logps": -297.29949951171875, "debug/reference_chosen_logps": -281.66461181640625, "debug/reference_rejected_logps": -295.0270080566406, "debug/sppo_chosen_loss": 2473.56298828125, "debug/sppo_chosen_reward_in_loss": 0.44406241178512573, "debug/sppo_rej_reward_in_loss": -2.272524356842041, "debug/sppo_reject_loss": 2317.353515625, "epoch": 2.2282608695652173, "grad_norm": 88394.34834173425, "learning_rate": 8.598194047662634e-08, "logits/chosen": 1.2631947994232178, "logits/rejected": 1.4563754796981812, "logps/chosen": -281.2205505371094, "logps/rejected": -297.29949951171875, "loss": 4709.5641, "rewards/accuracies": 0.675000011920929, "rewards/chosen": 0.004440623801201582, "rewards/margins": 0.027165865525603294, "rewards/rejected": -0.022725243121385574, "step": 615 }, { "debug/policy_chosen_logits": 0.9352153539657593, "debug/policy_chosen_logps": -233.7841339111328, "debug/policy_rejected_logits": 1.4436912536621094, "debug/policy_rejected_logps": -316.3915100097656, "debug/reference_chosen_logps": -233.95346069335938, "debug/reference_rejected_logps": -311.99176025390625, "debug/sppo_chosen_loss": 2502.010498046875, "debug/sppo_chosen_reward_in_loss": 0.16931553184986115, "debug/sppo_rej_reward_in_loss": -4.399728298187256, "debug/sppo_reject_loss": 2132.748779296875, "epoch": 2.246376811594203, "grad_norm": 57675.208745475386, "learning_rate": 8.572224365791348e-08, "logits/chosen": 0.9352153539657593, "logits/rejected": 1.4436912536621094, "logps/chosen": -233.7841339111328, "logps/rejected": -316.3915100097656, "loss": 4686.8801, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": 0.0016931556165218353, "rewards/margins": 0.04569043964147568, "rewards/rejected": -0.043997280299663544, "step": 620 }, { "debug/policy_chosen_logits": 1.0854655504226685, "debug/policy_chosen_logps": -250.3056182861328, "debug/policy_rejected_logits": 1.247107744216919, "debug/policy_rejected_logps": -271.73699951171875, "debug/reference_chosen_logps": -250.57568359375, "debug/reference_rejected_logps": -266.9862976074219, "debug/sppo_chosen_loss": 2503.90576171875, "debug/sppo_chosen_reward_in_loss": 0.2700786590576172, "debug/sppo_rej_reward_in_loss": -4.750700950622559, "debug/sppo_reject_loss": 2117.43603515625, "epoch": 2.2644927536231885, "grad_norm": 65162.73558403237, "learning_rate": 8.546056332524771e-08, "logits/chosen": 1.0854655504226685, "logits/rejected": 1.247107744216919, "logps/chosen": -250.3056182861328, "logps/rejected": -271.73699951171875, "loss": 4696.8086, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 0.0027007872704416513, "rewards/margins": 0.05020779371261597, "rewards/rejected": -0.04750701040029526, "step": 625 }, { "debug/policy_chosen_logits": 1.3634555339813232, "debug/policy_chosen_logps": -284.90277099609375, "debug/policy_rejected_logits": 1.4885923862457275, "debug/policy_rejected_logps": -262.86126708984375, "debug/reference_chosen_logps": -283.0180969238281, "debug/reference_rejected_logps": -259.73748779296875, "debug/sppo_chosen_loss": 2754.628662109375, "debug/sppo_chosen_reward_in_loss": -1.8847014904022217, "debug/sppo_rej_reward_in_loss": -3.123795986175537, "debug/sppo_reject_loss": 2240.63037109375, "epoch": 2.282608695652174, "grad_norm": 64391.60616811912, "learning_rate": 8.519691400869593e-08, "logits/chosen": 1.3634555339813232, "logits/rejected": 1.4885923862457275, "logps/chosen": -284.90277099609375, "logps/rejected": -262.86126708984375, "loss": 4721.7711, "rewards/accuracies": 0.574999988079071, "rewards/chosen": -0.018847014755010605, "rewards/margins": 0.012390943244099617, "rewards/rejected": -0.03123795986175537, "step": 630 }, { "debug/policy_chosen_logits": 1.2123847007751465, "debug/policy_chosen_logps": -235.73715209960938, "debug/policy_rejected_logits": 1.4528264999389648, "debug/policy_rejected_logps": -287.69244384765625, "debug/reference_chosen_logps": -236.39944458007812, "debug/reference_rejected_logps": -285.35406494140625, "debug/sppo_chosen_loss": 2444.96435546875, "debug/sppo_chosen_reward_in_loss": 0.6622905731201172, "debug/sppo_rej_reward_in_loss": -2.338413953781128, "debug/sppo_reject_loss": 2319.29736328125, "epoch": 2.300724637681159, "grad_norm": 104523.93148426696, "learning_rate": 8.493131034765493e-08, "logits/chosen": 1.2123847007751465, "logits/rejected": 1.4528264999389648, "logps/chosen": -235.73715209960938, "logps/rejected": -287.69244384765625, "loss": 4711.3289, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": 0.006622905842959881, "rewards/margins": 0.030007043853402138, "rewards/rejected": -0.02338413894176483, "step": 635 }, { "debug/policy_chosen_logits": 1.0598101615905762, "debug/policy_chosen_logps": -259.14007568359375, "debug/policy_rejected_logits": 1.4087841510772705, "debug/policy_rejected_logps": -289.23944091796875, "debug/reference_chosen_logps": -259.8706359863281, "debug/reference_rejected_logps": -284.48968505859375, "debug/sppo_chosen_loss": 2438.184326171875, "debug/sppo_chosen_reward_in_loss": 0.7305816411972046, "debug/sppo_rej_reward_in_loss": -4.749767780303955, "debug/sppo_reject_loss": 2118.009765625, "epoch": 2.318840579710145, "grad_norm": 72785.5043936409, "learning_rate": 8.46637670900384e-08, "logits/chosen": 1.0598101615905762, "logits/rejected": 1.4087841510772705, "logps/chosen": -259.14007568359375, "logps/rejected": -289.23944091796875, "loss": 4633.6555, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 0.00730581721290946, "rewards/margins": 0.05480349063873291, "rewards/rejected": -0.04749767482280731, "step": 640 }, { "debug/policy_chosen_logits": 1.2692564725875854, "debug/policy_chosen_logps": -271.09075927734375, "debug/policy_rejected_logits": 1.205157995223999, "debug/policy_rejected_logps": -245.5635223388672, "debug/reference_chosen_logps": -280.4898681640625, "debug/reference_rejected_logps": -251.0946807861328, "debug/sppo_chosen_loss": 1798.6510009765625, "debug/sppo_chosen_reward_in_loss": 9.399101257324219, "debug/sppo_rej_reward_in_loss": 5.531121730804443, "debug/sppo_reject_loss": 3291.958984375, "epoch": 2.3369565217391304, "grad_norm": 86753.47801949162, "learning_rate": 8.439429909145816e-08, "logits/chosen": 1.2692564725875854, "logits/rejected": 1.205157995223999, "logps/chosen": -271.09075927734375, "logps/rejected": -245.5635223388672, "loss": 5429.5953, "rewards/accuracies": 0.675000011920929, "rewards/chosen": 0.0939910039305687, "rewards/margins": 0.038679786026477814, "rewards/rejected": 0.05531121417880058, "step": 645 }, { "debug/policy_chosen_logits": 1.5434187650680542, "debug/policy_chosen_logps": -259.09027099609375, "debug/policy_rejected_logits": 1.58522367477417, "debug/policy_rejected_logps": -283.14508056640625, "debug/reference_chosen_logps": -268.76470947265625, "debug/reference_rejected_logps": -293.04833984375, "debug/sppo_chosen_loss": 1690.5296630859375, "debug/sppo_chosen_reward_in_loss": 9.674398422241211, "debug/sppo_rej_reward_in_loss": 9.90326976776123, "debug/sppo_reject_loss": 3680.02392578125, "epoch": 2.355072463768116, "grad_norm": 71014.8256938341, "learning_rate": 8.412292131439924e-08, "logits/chosen": 1.5434187650680542, "logits/rejected": 1.58522367477417, "logps/chosen": -259.09027099609375, "logps/rejected": -283.14508056640625, "loss": 5486.1242, "rewards/accuracies": 0.5249999761581421, "rewards/chosen": 0.09674398601055145, "rewards/margins": -0.002288705902174115, "rewards/rejected": 0.0990326926112175, "step": 650 }, { "debug/policy_chosen_logits": 1.3143525123596191, "debug/policy_chosen_logps": -252.29421997070312, "debug/policy_rejected_logits": 1.6215347051620483, "debug/policy_rejected_logps": -290.984130859375, "debug/reference_chosen_logps": -257.20928955078125, "debug/reference_rejected_logps": -293.06396484375, "debug/sppo_chosen_loss": 2047.283203125, "debug/sppo_chosen_reward_in_loss": 4.915032863616943, "debug/sppo_rej_reward_in_loss": 2.0798301696777344, "debug/sppo_reject_loss": 2748.5693359375, "epoch": 2.3731884057971016, "grad_norm": 63977.00550042232, "learning_rate": 8.3849648827389e-08, "logits/chosen": 1.3143525123596191, "logits/rejected": 1.6215347051620483, "logps/chosen": -252.29421997070312, "logps/rejected": -290.984130859375, "loss": 4779.2102, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": 0.04915032535791397, "rewards/margins": 0.028352027758955956, "rewards/rejected": 0.020798301324248314, "step": 655 }, { "debug/policy_chosen_logits": 1.0396404266357422, "debug/policy_chosen_logps": -263.76708984375, "debug/policy_rejected_logits": 1.2717006206512451, "debug/policy_rejected_logps": -282.8274841308594, "debug/reference_chosen_logps": -265.6632080078125, "debug/reference_rejected_logps": -280.2948303222656, "debug/sppo_chosen_loss": 2353.92333984375, "debug/sppo_chosen_reward_in_loss": 1.8960940837860107, "debug/sppo_rej_reward_in_loss": -2.5326685905456543, "debug/sppo_reject_loss": 2308.527099609375, "epoch": 2.391304347826087, "grad_norm": 59400.02624948723, "learning_rate": 8.357449680416058e-08, "logits/chosen": 1.0396404266357422, "logits/rejected": 1.2717006206512451, "logps/chosen": -263.76708984375, "logps/rejected": -282.8274841308594, "loss": 4731.8664, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 0.018960941582918167, "rewards/margins": 0.044287629425525665, "rewards/rejected": -0.025326687842607498, "step": 660 }, { "debug/policy_chosen_logits": 1.234412670135498, "debug/policy_chosen_logps": -243.5044403076172, "debug/policy_rejected_logits": 1.6132608652114868, "debug/policy_rejected_logps": -273.7642822265625, "debug/reference_chosen_logps": -245.8137969970703, "debug/reference_rejected_logps": -272.16265869140625, "debug/sppo_chosen_loss": 2285.48583984375, "debug/sppo_chosen_reward_in_loss": 2.3093769550323486, "debug/sppo_rej_reward_in_loss": -1.601636290550232, "debug/sppo_reject_loss": 2426.29541015625, "epoch": 2.4094202898550723, "grad_norm": 58602.8192263023, "learning_rate": 8.32974805228102e-08, "logits/chosen": 1.234412670135498, "logits/rejected": 1.6132608652114868, "logps/chosen": -243.5044403076172, "logps/rejected": -273.7642822265625, "loss": 4634.8891, "rewards/accuracies": 0.574999988079071, "rewards/chosen": 0.02309376932680607, "rewards/margins": 0.039110131561756134, "rewards/rejected": -0.016016360372304916, "step": 665 }, { "debug/policy_chosen_logits": 1.563645601272583, "debug/policy_chosen_logps": -278.4286804199219, "debug/policy_rejected_logits": 1.7810630798339844, "debug/policy_rejected_logps": -304.18304443359375, "debug/reference_chosen_logps": -281.46917724609375, "debug/reference_rejected_logps": -302.9301452636719, "debug/sppo_chosen_loss": 2210.00927734375, "debug/sppo_chosen_reward_in_loss": 3.040518045425415, "debug/sppo_rej_reward_in_loss": -1.2529163360595703, "debug/sppo_reject_loss": 2420.52783203125, "epoch": 2.427536231884058, "grad_norm": 68478.8669991549, "learning_rate": 8.301861536494898e-08, "logits/chosen": 1.563645601272583, "logits/rejected": 1.7810630798339844, "logps/chosen": -278.4286804199219, "logps/rejected": -304.18304443359375, "loss": 4713.7953, "rewards/accuracies": 0.824999988079071, "rewards/chosen": 0.030405178666114807, "rewards/margins": 0.04293433949351311, "rewards/rejected": -0.012529164552688599, "step": 670 }, { "debug/policy_chosen_logits": 1.1350958347320557, "debug/policy_chosen_logps": -234.85903930664062, "debug/policy_rejected_logits": 1.3057701587677002, "debug/policy_rejected_logps": -273.68170166015625, "debug/reference_chosen_logps": -235.17245483398438, "debug/reference_rejected_logps": -270.46295166015625, "debug/sppo_chosen_loss": 2496.69873046875, "debug/sppo_chosen_reward_in_loss": 0.31343594193458557, "debug/sppo_rej_reward_in_loss": -3.2187705039978027, "debug/sppo_reject_loss": 2251.916259765625, "epoch": 2.4456521739130435, "grad_norm": 68306.26071631578, "learning_rate": 8.273791681484874e-08, "logits/chosen": 1.1350958347320557, "logits/rejected": 1.3057701587677002, "logps/chosen": -234.85903930664062, "logps/rejected": -273.68170166015625, "loss": 4683.6773, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 0.003134358674287796, "rewards/margins": 0.03532206267118454, "rewards/rejected": -0.03218770772218704, "step": 675 }, { "debug/policy_chosen_logits": 1.5269218683242798, "debug/policy_chosen_logps": -283.9064025878906, "debug/policy_rejected_logits": 1.508864402770996, "debug/policy_rejected_logps": -261.4410095214844, "debug/reference_chosen_logps": -283.28729248046875, "debug/reference_rejected_logps": -259.95697021484375, "debug/sppo_chosen_loss": 2623.616943359375, "debug/sppo_chosen_reward_in_loss": -0.6191161870956421, "debug/sppo_rej_reward_in_loss": -1.484053373336792, "debug/sppo_reject_loss": 2396.083251953125, "epoch": 2.463768115942029, "grad_norm": 82805.1153427768, "learning_rate": 8.245540045858228e-08, "logits/chosen": 1.5269218683242798, "logits/rejected": 1.508864402770996, "logps/chosen": -283.9064025878906, "logps/rejected": -261.4410095214844, "loss": 4723.4434, "rewards/accuracies": 0.625, "rewards/chosen": -0.006191161461174488, "rewards/margins": 0.008649373427033424, "rewards/rejected": -0.014840533025562763, "step": 680 }, { "debug/policy_chosen_logits": 1.1852375268936157, "debug/policy_chosen_logps": -252.340087890625, "debug/policy_rejected_logits": 1.4935983419418335, "debug/policy_rejected_logps": -282.58343505859375, "debug/reference_chosen_logps": -253.1710662841797, "debug/reference_rejected_logps": -280.0072021484375, "debug/sppo_chosen_loss": 2442.70654296875, "debug/sppo_chosen_reward_in_loss": 0.8309797048568726, "debug/sppo_rej_reward_in_loss": -2.5762057304382324, "debug/sppo_reject_loss": 2296.81982421875, "epoch": 2.4818840579710146, "grad_norm": 63401.64185770447, "learning_rate": 8.2171081983158e-08, "logits/chosen": 1.1852375268936157, "logits/rejected": 1.4935983419418335, "logps/chosen": -252.340087890625, "logps/rejected": -282.58343505859375, "loss": 4596.534, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 0.008309796452522278, "rewards/margins": 0.03407185524702072, "rewards/rejected": -0.025762056931853294, "step": 685 }, { "debug/policy_chosen_logits": 1.1588332653045654, "debug/policy_chosen_logps": -248.93905639648438, "debug/policy_rejected_logits": 1.4399282932281494, "debug/policy_rejected_logps": -285.43927001953125, "debug/reference_chosen_logps": -250.5298309326172, "debug/reference_rejected_logps": -281.73443603515625, "debug/sppo_chosen_loss": 2347.6015625, "debug/sppo_chosen_reward_in_loss": 1.5907951593399048, "debug/sppo_rej_reward_in_loss": -3.7047877311706543, "debug/sppo_reject_loss": 2226.80322265625, "epoch": 2.5, "grad_norm": 64223.88240269286, "learning_rate": 8.188497717564871e-08, "logits/chosen": 1.1588332653045654, "logits/rejected": 1.4399282932281494, "logps/chosen": -248.93905639648438, "logps/rejected": -285.43927001953125, "loss": 4621.6918, "rewards/accuracies": 0.75, "rewards/chosen": 0.015907950699329376, "rewards/margins": 0.05295582860708237, "rewards/rejected": -0.03704787790775299, "step": 690 }, { "debug/policy_chosen_logits": 1.1629136800765991, "debug/policy_chosen_logps": -258.650390625, "debug/policy_rejected_logits": 1.2538448572158813, "debug/policy_rejected_logps": -269.53631591796875, "debug/reference_chosen_logps": -258.3566589355469, "debug/reference_rejected_logps": -263.8230285644531, "debug/sppo_chosen_loss": 2563.53466796875, "debug/sppo_chosen_reward_in_loss": -0.29371222853660583, "debug/sppo_rej_reward_in_loss": -5.7132792472839355, "debug/sppo_reject_loss": 2022.1617431640625, "epoch": 2.5181159420289854, "grad_norm": 58930.82852049117, "learning_rate": 8.159710192231519e-08, "logits/chosen": 1.1629136800765991, "logits/rejected": 1.2538448572158813, "logps/chosen": -258.650390625, "logps/rejected": -269.53631591796875, "loss": 4662.3734, "rewards/accuracies": 0.75, "rewards/chosen": -0.0029371220152825117, "rewards/margins": 0.05419566482305527, "rewards/rejected": -0.057132791727781296, "step": 695 }, { "debug/policy_chosen_logits": 1.296156644821167, "debug/policy_chosen_logps": -247.963134765625, "debug/policy_rejected_logits": 1.5709506273269653, "debug/policy_rejected_logps": -284.2712097167969, "debug/reference_chosen_logps": -248.04214477539062, "debug/reference_rejected_logps": -281.76080322265625, "debug/sppo_chosen_loss": 2526.7412109375, "debug/sppo_chosen_reward_in_loss": 0.0790136307477951, "debug/sppo_rej_reward_in_loss": -2.510422706604004, "debug/sppo_reject_loss": 2294.28271484375, "epoch": 2.536231884057971, "grad_norm": 83941.42671095916, "learning_rate": 8.130747220772401e-08, "logits/chosen": 1.296156644821167, "logits/rejected": 1.5709506273269653, "logps/chosen": -247.963134765625, "logps/rejected": -284.2712097167969, "loss": 4590.507, "rewards/accuracies": 0.625, "rewards/chosen": 0.0007901365170255303, "rewards/margins": 0.02589436247944832, "rewards/rejected": -0.025104224681854248, "step": 700 }, { "epoch": 2.536231884057971, "eval_debug/policy_chosen_logits": 1.480161190032959, "eval_debug/policy_chosen_logps": -253.2023468017578, "eval_debug/policy_rejected_logits": 1.5311214923858643, "eval_debug/policy_rejected_logps": -262.8334655761719, "eval_debug/reference_chosen_logps": -252.91845703125, "eval_debug/reference_rejected_logps": -259.6585998535156, "eval_debug/sppo_chosen_loss": 2563.16015625, "eval_debug/sppo_chosen_reward_in_loss": -0.2838987112045288, "eval_debug/sppo_rej_reward_in_loss": -3.1748383045196533, "eval_debug/sppo_reject_loss": 2266.701904296875, "eval_logits/chosen": 1.480161190032959, "eval_logits/rejected": 1.5311214923858643, "eval_logps/chosen": -253.2023468017578, "eval_logps/rejected": -262.8334655761719, "eval_loss": 4709.8720703125, "eval_rewards/accuracies": 0.5657894611358643, "eval_rewards/chosen": -0.0028389859944581985, "eval_rewards/margins": 0.02890939824283123, "eval_rewards/rejected": -0.03174838423728943, "eval_runtime": 28.36, "eval_samples_per_second": 21.157, "eval_steps_per_second": 0.67, "step": 700 }, { "debug/policy_chosen_logits": 1.316695213317871, "debug/policy_chosen_logps": -262.3095397949219, "debug/policy_rejected_logits": 1.5722310543060303, "debug/policy_rejected_logps": -308.7387390136719, "debug/reference_chosen_logps": -263.1445007324219, "debug/reference_rejected_logps": -301.7698059082031, "debug/sppo_chosen_loss": 2425.18603515625, "debug/sppo_chosen_reward_in_loss": 0.8349674344062805, "debug/sppo_rej_reward_in_loss": -6.9689836502075195, "debug/sppo_reject_loss": 1957.8179931640625, "epoch": 2.5543478260869565, "grad_norm": 67599.29085197397, "learning_rate": 8.101610411385998e-08, "logits/chosen": 1.316695213317871, "logits/rejected": 1.5722310543060303, "logps/chosen": -262.3095397949219, "logps/rejected": -308.7387390136719, "loss": 4607.5508, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 0.008349673822522163, "rewards/margins": 0.0780395045876503, "rewards/rejected": -0.06968982517719269, "step": 705 }, { "debug/policy_chosen_logits": 1.2466450929641724, "debug/policy_chosen_logps": -267.37567138671875, "debug/policy_rejected_logits": 1.5063731670379639, "debug/policy_rejected_logps": -330.72149658203125, "debug/reference_chosen_logps": -267.941162109375, "debug/reference_rejected_logps": -326.2596740722656, "debug/sppo_chosen_loss": 2450.247314453125, "debug/sppo_chosen_reward_in_loss": 0.5654850006103516, "debug/sppo_rej_reward_in_loss": -4.461817264556885, "debug/sppo_reject_loss": 2120.1513671875, "epoch": 2.572463768115942, "grad_norm": 64522.98009611156, "learning_rate": 8.072301381923319e-08, "logits/chosen": 1.2466450929641724, "logits/rejected": 1.5063731670379639, "logps/chosen": -267.37567138671875, "logps/rejected": -330.72149658203125, "loss": 4674.4805, "rewards/accuracies": 0.75, "rewards/chosen": 0.005654850043356419, "rewards/margins": 0.050273019820451736, "rewards/rejected": -0.04461817070841789, "step": 710 }, { "debug/policy_chosen_logits": 0.9615011215209961, "debug/policy_chosen_logps": -229.64877319335938, "debug/policy_rejected_logits": 1.251741647720337, "debug/policy_rejected_logps": -282.9585266113281, "debug/reference_chosen_logps": -229.4971923828125, "debug/reference_rejected_logps": -279.83587646484375, "debug/sppo_chosen_loss": 2542.4091796875, "debug/sppo_chosen_reward_in_loss": -0.1516149491071701, "debug/sppo_rej_reward_in_loss": -3.1226649284362793, "debug/sppo_reject_loss": 2246.38330078125, "epoch": 2.5905797101449277, "grad_norm": 77147.82344419556, "learning_rate": 8.042821759798069e-08, "logits/chosen": 0.9615011215209961, "logits/rejected": 1.251741647720337, "logps/chosen": -229.64877319335938, "logps/rejected": -282.9585266113281, "loss": 4721.9094, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -0.0015161499613896012, "rewards/margins": 0.029710497707128525, "rewards/rejected": -0.031226646155118942, "step": 715 }, { "debug/policy_chosen_logits": 1.0589511394500732, "debug/policy_chosen_logps": -245.3362579345703, "debug/policy_rejected_logits": 1.2744250297546387, "debug/policy_rejected_logps": -274.80999755859375, "debug/reference_chosen_logps": -246.52197265625, "debug/reference_rejected_logps": -272.2623596191406, "debug/sppo_chosen_loss": 2395.19580078125, "debug/sppo_chosen_reward_in_loss": 1.1857404708862305, "debug/sppo_rej_reward_in_loss": -2.547642707824707, "debug/sppo_reject_loss": 2284.42041015625, "epoch": 2.608695652173913, "grad_norm": 68760.10675885266, "learning_rate": 8.013173181896283e-08, "logits/chosen": 1.0589511394500732, "logits/rejected": 1.2744250297546387, "logps/chosen": -245.3362579345703, "logps/rejected": -274.80999755859375, "loss": 4566.4594, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 0.011857403442263603, "rewards/margins": 0.03733383119106293, "rewards/rejected": -0.025476425886154175, "step": 720 }, { "debug/policy_chosen_logits": 1.229744553565979, "debug/policy_chosen_logps": -231.814208984375, "debug/policy_rejected_logits": 1.8818788528442383, "debug/policy_rejected_logps": -300.50482177734375, "debug/reference_chosen_logps": -232.2402801513672, "debug/reference_rejected_logps": -296.17901611328125, "debug/sppo_chosen_loss": 2466.02392578125, "debug/sppo_chosen_reward_in_loss": 0.42609596252441406, "debug/sppo_rej_reward_in_loss": -4.325751304626465, "debug/sppo_reject_loss": 2153.456298828125, "epoch": 2.6268115942028984, "grad_norm": 81272.2688889121, "learning_rate": 7.983357294485438e-08, "logits/chosen": 1.229744553565979, "logits/rejected": 1.8818788528442383, "logps/chosen": -231.814208984375, "logps/rejected": -300.50482177734375, "loss": 4581.5086, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": 0.004260959569364786, "rewards/margins": 0.04751847684383392, "rewards/rejected": -0.043257515877485275, "step": 725 }, { "debug/policy_chosen_logits": 1.1009639501571655, "debug/policy_chosen_logps": -244.9521026611328, "debug/policy_rejected_logits": 1.392866849899292, "debug/policy_rejected_logps": -281.6346740722656, "debug/reference_chosen_logps": -244.76611328125, "debug/reference_rejected_logps": -277.59765625, "debug/sppo_chosen_loss": 2543.88134765625, "debug/sppo_chosen_reward_in_loss": -0.185984805226326, "debug/sppo_rej_reward_in_loss": -4.037027359008789, "debug/sppo_reject_loss": 2177.278076171875, "epoch": 2.644927536231884, "grad_norm": 66672.84499784847, "learning_rate": 7.953375753123043e-08, "logits/chosen": 1.1009639501571655, "logits/rejected": 1.392866849899292, "logps/chosen": -244.9521026611328, "logps/rejected": -281.6346740722656, "loss": 4614.0613, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -0.0018598471069708467, "rewards/margins": 0.03851042687892914, "rewards/rejected": -0.04037027060985565, "step": 730 }, { "debug/policy_chosen_logits": 1.2034881114959717, "debug/policy_chosen_logps": -262.13726806640625, "debug/policy_rejected_logits": 1.6349296569824219, "debug/policy_rejected_logps": -300.8846740722656, "debug/reference_chosen_logps": -262.77691650390625, "debug/reference_rejected_logps": -295.9397277832031, "debug/sppo_chosen_loss": 2470.29443359375, "debug/sppo_chosen_reward_in_loss": 0.6396778225898743, "debug/sppo_rej_reward_in_loss": -4.944947719573975, "debug/sppo_reject_loss": 2114.920166015625, "epoch": 2.6630434782608696, "grad_norm": 59913.553582771856, "learning_rate": 7.923230222564714e-08, "logits/chosen": 1.2034881114959717, "logits/rejected": 1.6349296569824219, "logps/chosen": -262.13726806640625, "logps/rejected": -300.8846740722656, "loss": 4603.5797, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 0.006396777927875519, "rewards/margins": 0.05584625154733658, "rewards/rejected": -0.04944947734475136, "step": 735 }, { "debug/policy_chosen_logits": 1.3139673471450806, "debug/policy_chosen_logps": -275.12811279296875, "debug/policy_rejected_logits": 1.1129451990127563, "debug/policy_rejected_logps": -263.61767578125, "debug/reference_chosen_logps": -274.85089111328125, "debug/reference_rejected_logps": -262.1892395019531, "debug/sppo_chosen_loss": 2561.744140625, "debug/sppo_chosen_reward_in_loss": -0.27727144956588745, "debug/sppo_rej_reward_in_loss": -1.4284439086914062, "debug/sppo_reject_loss": 2384.283935546875, "epoch": 2.681159420289855, "grad_norm": 60968.86609888939, "learning_rate": 7.892922376671725e-08, "logits/chosen": 1.3139673471450806, "logits/rejected": 1.1129451990127563, "logps/chosen": -275.12811279296875, "logps/rejected": -263.61767578125, "loss": 4665.1484, "rewards/accuracies": 0.574999988079071, "rewards/chosen": -0.002772714477032423, "rewards/margins": 0.011511723510921001, "rewards/rejected": -0.014284437522292137, "step": 740 }, { "debug/policy_chosen_logits": 1.2646352052688599, "debug/policy_chosen_logps": -249.6303253173828, "debug/policy_rejected_logits": 1.2260851860046387, "debug/policy_rejected_logps": -264.775390625, "debug/reference_chosen_logps": -249.9891357421875, "debug/reference_rejected_logps": -260.71539306640625, "debug/sppo_chosen_loss": 2489.34375, "debug/sppo_chosen_reward_in_loss": 0.35877054929733276, "debug/sppo_rej_reward_in_loss": -4.0600104331970215, "debug/sppo_reject_loss": 2168.15087890625, "epoch": 2.699275362318841, "grad_norm": 70768.43523511974, "learning_rate": 7.862453898318082e-08, "logits/chosen": 1.2646352052688599, "logits/rejected": 1.2260851860046387, "logps/chosen": -249.6303253173828, "logps/rejected": -264.775390625, "loss": 4640.3344, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": 0.003587705548852682, "rewards/margins": 0.04418780654668808, "rewards/rejected": -0.04060010612010956, "step": 745 }, { "debug/policy_chosen_logits": 1.0691126585006714, "debug/policy_chosen_logps": -255.4532012939453, "debug/policy_rejected_logits": 1.1531434059143066, "debug/policy_rejected_logps": -261.6268005371094, "debug/reference_chosen_logps": -257.0854797363281, "debug/reference_rejected_logps": -259.08953857421875, "debug/sppo_chosen_loss": 2347.645751953125, "debug/sppo_chosen_reward_in_loss": 1.6322675943374634, "debug/sppo_rej_reward_in_loss": -2.53729248046875, "debug/sppo_reject_loss": 2319.710205078125, "epoch": 2.717391304347826, "grad_norm": 63375.54807649489, "learning_rate": 7.83182647929707e-08, "logits/chosen": 1.0691126585006714, "logits/rejected": 1.1531434059143066, "logps/chosen": -255.4532012939453, "logps/rejected": -261.6268005371094, "loss": 4591.8281, "rewards/accuracies": 0.675000011920929, "rewards/chosen": 0.016322676092386246, "rewards/margins": 0.04169560223817825, "rewards/rejected": -0.02537292242050171, "step": 750 }, { "debug/policy_chosen_logits": 1.1826550960540771, "debug/policy_chosen_logps": -275.0767517089844, "debug/policy_rejected_logits": 1.3011400699615479, "debug/policy_rejected_logps": -286.65875244140625, "debug/reference_chosen_logps": -274.70001220703125, "debug/reference_rejected_logps": -281.486083984375, "debug/sppo_chosen_loss": 2563.9345703125, "debug/sppo_chosen_reward_in_loss": -0.3767387270927429, "debug/sppo_rej_reward_in_loss": -5.172691822052002, "debug/sppo_reject_loss": 2118.130126953125, "epoch": 2.7355072463768115, "grad_norm": 73693.41186252929, "learning_rate": 7.801041820227318e-08, "logits/chosen": 1.1826550960540771, "logits/rejected": 1.3011400699615479, "logps/chosen": -275.0767517089844, "logps/rejected": -286.65875244140625, "loss": 4697.5813, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -0.0037673874758183956, "rewards/margins": 0.04795952886343002, "rewards/rejected": -0.05172691494226456, "step": 755 }, { "debug/policy_chosen_logits": 1.2178585529327393, "debug/policy_chosen_logps": -258.1893310546875, "debug/policy_rejected_logits": 1.7031934261322021, "debug/policy_rejected_logps": -302.24322509765625, "debug/reference_chosen_logps": -258.6191711425781, "debug/reference_rejected_logps": -298.0517883300781, "debug/sppo_chosen_loss": 2476.55078125, "debug/sppo_chosen_reward_in_loss": 0.4298551678657532, "debug/sppo_rej_reward_in_loss": -4.191437244415283, "debug/sppo_reject_loss": 2163.860107421875, "epoch": 2.753623188405797, "grad_norm": 122726.8457950013, "learning_rate": 7.770101630458363e-08, "logits/chosen": 1.2178585529327393, "logits/rejected": 1.7031934261322021, "logps/chosen": -258.1893310546875, "logps/rejected": -302.24322509765625, "loss": 4691.1711, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": 0.004298551939427853, "rewards/margins": 0.046212922781705856, "rewards/rejected": -0.04191437363624573, "step": 760 }, { "debug/policy_chosen_logits": 1.3344509601593018, "debug/policy_chosen_logps": -262.7269592285156, "debug/policy_rejected_logits": 1.428648591041565, "debug/policy_rejected_logps": -281.42303466796875, "debug/reference_chosen_logps": -263.6646728515625, "debug/reference_rejected_logps": -276.64959716796875, "debug/sppo_chosen_loss": 2433.05029296875, "debug/sppo_chosen_reward_in_loss": 0.9377063512802124, "debug/sppo_rej_reward_in_loss": -4.773464202880859, "debug/sppo_reject_loss": 2114.662841796875, "epoch": 2.7717391304347827, "grad_norm": 70208.33518676949, "learning_rate": 7.73900762797575e-08, "logits/chosen": 1.3344509601593018, "logits/rejected": 1.428648591041565, "logps/chosen": -262.7269592285156, "logps/rejected": -281.42303466796875, "loss": 4616.6656, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 0.009377063252031803, "rewards/margins": 0.0571117028594017, "rewards/rejected": -0.047734636813402176, "step": 765 }, { "debug/policy_chosen_logits": 1.4318565130233765, "debug/policy_chosen_logps": -246.78271484375, "debug/policy_rejected_logits": 1.9473499059677124, "debug/policy_rejected_logps": -292.7750549316406, "debug/reference_chosen_logps": -248.1265869140625, "debug/reference_rejected_logps": -289.3108825683594, "debug/sppo_chosen_loss": 2377.390869140625, "debug/sppo_chosen_reward_in_loss": 1.3438713550567627, "debug/sppo_rej_reward_in_loss": -3.4641430377960205, "debug/sppo_reject_loss": 2215.130126953125, "epoch": 2.789855072463768, "grad_norm": 56045.35340217485, "learning_rate": 7.707761539305629e-08, "logits/chosen": 1.4318565130233765, "logits/rejected": 1.9473499059677124, "logps/chosen": -246.78271484375, "logps/rejected": -292.7750549316406, "loss": 4641.2305, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": 0.013438713736832142, "rewards/margins": 0.048080142587423325, "rewards/rejected": -0.03464142605662346, "step": 770 }, { "debug/policy_chosen_logits": 1.5560919046401978, "debug/policy_chosen_logps": -253.90786743164062, "debug/policy_rejected_logits": 1.8329626321792603, "debug/policy_rejected_logps": -306.51739501953125, "debug/reference_chosen_logps": -254.22415161132812, "debug/reference_rejected_logps": -302.6522216796875, "debug/sppo_chosen_loss": 2491.45654296875, "debug/sppo_chosen_reward_in_loss": 0.3162704408168793, "debug/sppo_rej_reward_in_loss": -3.8651795387268066, "debug/sppo_reject_loss": 2216.763916015625, "epoch": 2.807971014492754, "grad_norm": 74056.59942683075, "learning_rate": 7.676365099418883e-08, "logits/chosen": 1.5560919046401978, "logits/rejected": 1.8329626321792603, "logps/chosen": -253.90786743164062, "logps/rejected": -306.51739501953125, "loss": 4730.9008, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 0.0031627051066607237, "rewards/margins": 0.041814498603343964, "rewards/rejected": -0.03865179046988487, "step": 775 }, { "debug/policy_chosen_logits": 1.1131713390350342, "debug/policy_chosen_logps": -242.6931610107422, "debug/policy_rejected_logits": 1.5814272165298462, "debug/policy_rejected_logps": -303.05401611328125, "debug/reference_chosen_logps": -242.7500762939453, "debug/reference_rejected_logps": -299.9582824707031, "debug/sppo_chosen_loss": 2545.71142578125, "debug/sppo_chosen_reward_in_loss": 0.05693111568689346, "debug/sppo_rej_reward_in_loss": -3.095724105834961, "debug/sppo_reject_loss": 2284.80517578125, "epoch": 2.8260869565217392, "grad_norm": 62252.28679752256, "learning_rate": 7.644820051634812e-08, "logits/chosen": 1.1131713390350342, "logits/rejected": 1.5814272165298462, "logps/chosen": -242.6931610107422, "logps/rejected": -303.05401611328125, "loss": 4659.9199, "rewards/accuracies": 0.574999988079071, "rewards/chosen": 0.0005693117855116725, "rewards/margins": 0.03152655437588692, "rewards/rejected": -0.030957240611314774, "step": 780 }, { "debug/policy_chosen_logits": 1.1171921491622925, "debug/policy_chosen_logps": -248.11083984375, "debug/policy_rejected_logits": 1.436684012413025, "debug/policy_rejected_logps": -278.32916259765625, "debug/reference_chosen_logps": -248.8385467529297, "debug/reference_rejected_logps": -271.8460693359375, "debug/sppo_chosen_loss": 2448.23193359375, "debug/sppo_chosen_reward_in_loss": 0.7277113199234009, "debug/sppo_rej_reward_in_loss": -6.483117580413818, "debug/sppo_reject_loss": 2002.7955322265625, "epoch": 2.8442028985507246, "grad_norm": 65694.1491439372, "learning_rate": 7.613128147524313e-08, "logits/chosen": 1.1171921491622925, "logits/rejected": 1.436684012413025, "logps/chosen": -248.11083984375, "logps/rejected": -278.32916259765625, "loss": 4572.9023, "rewards/accuracies": 0.75, "rewards/chosen": 0.007277113385498524, "rewards/margins": 0.07210828363895416, "rewards/rejected": -0.06483118236064911, "step": 785 }, { "debug/policy_chosen_logits": 1.31461501121521, "debug/policy_chosen_logps": -246.47268676757812, "debug/policy_rejected_logits": 1.7004801034927368, "debug/policy_rejected_logps": -302.381103515625, "debug/reference_chosen_logps": -246.872802734375, "debug/reference_rejected_logps": -299.05181884765625, "debug/sppo_chosen_loss": 2485.46142578125, "debug/sppo_chosen_reward_in_loss": 0.40012186765670776, "debug/sppo_rej_reward_in_loss": -3.329306125640869, "debug/sppo_reject_loss": 2230.423583984375, "epoch": 2.86231884057971, "grad_norm": 78799.29136632854, "learning_rate": 7.581291146812631e-08, "logits/chosen": 1.31461501121521, "logits/rejected": 1.7004801034927368, "logps/chosen": -246.47268676757812, "logps/rejected": -302.381103515625, "loss": 4617.8523, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": 0.00400121882557869, "rewards/margins": 0.03729427605867386, "rewards/rejected": -0.03329306095838547, "step": 790 }, { "debug/policy_chosen_logits": 1.1879009008407593, "debug/policy_chosen_logps": -256.074951171875, "debug/policy_rejected_logits": 1.436812162399292, "debug/policy_rejected_logps": -293.434814453125, "debug/reference_chosen_logps": -256.1133117675781, "debug/reference_rejected_logps": -289.4981994628906, "debug/sppo_chosen_loss": 2532.99658203125, "debug/sppo_chosen_reward_in_loss": 0.03835143893957138, "debug/sppo_rej_reward_in_loss": -3.936647891998291, "debug/sppo_reject_loss": 2198.875732421875, "epoch": 2.880434782608696, "grad_norm": 67427.30031192664, "learning_rate": 7.549310817281647e-08, "logits/chosen": 1.1879009008407593, "logits/rejected": 1.436812162399292, "logps/chosen": -256.074951171875, "logps/rejected": -293.434814453125, "loss": 4572.7508, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 0.000383514619898051, "rewards/margins": 0.03974999114871025, "rewards/rejected": -0.039366476237773895, "step": 795 }, { "debug/policy_chosen_logits": 1.2889708280563354, "debug/policy_chosen_logps": -275.5391540527344, "debug/policy_rejected_logits": 1.4859613180160522, "debug/policy_rejected_logps": -283.573486328125, "debug/reference_chosen_logps": -275.1273193359375, "debug/reference_rejected_logps": -279.24957275390625, "debug/sppo_chosen_loss": 2589.01220703125, "debug/sppo_chosen_reward_in_loss": -0.4118543565273285, "debug/sppo_rej_reward_in_loss": -4.323914527893066, "debug/sppo_reject_loss": 2165.736572265625, "epoch": 2.898550724637681, "grad_norm": 82038.69191055794, "learning_rate": 7.517188934671725e-08, "logits/chosen": 1.2889708280563354, "logits/rejected": 1.4859613180160522, "logps/chosen": -275.5391540527344, "logps/rejected": -283.573486328125, "loss": 4624.6344, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -0.004118544049561024, "rewards/margins": 0.03912060707807541, "rewards/rejected": -0.04323914647102356, "step": 800 }, { "epoch": 2.898550724637681, "eval_debug/policy_chosen_logits": 1.4659922122955322, "eval_debug/policy_chosen_logps": -253.1265106201172, "eval_debug/policy_rejected_logits": 1.5167869329452515, "eval_debug/policy_rejected_logps": -262.9391784667969, "eval_debug/reference_chosen_logps": -252.91845703125, "eval_debug/reference_rejected_logps": -259.6585998535156, "eval_debug/sppo_chosen_loss": 2564.37353515625, "eval_debug/sppo_chosen_reward_in_loss": -0.20804350078105927, "eval_debug/sppo_rej_reward_in_loss": -3.2805538177490234, "eval_debug/sppo_reject_loss": 2277.46337890625, "eval_logits/chosen": 1.4659922122955322, "eval_logits/rejected": 1.5167869329452515, "eval_logps/chosen": -253.1265106201172, "eval_logps/rejected": -262.9391784667969, "eval_loss": 4685.78759765625, "eval_rewards/accuracies": 0.6315789222717285, "eval_rewards/chosen": -0.002080434700474143, "eval_rewards/margins": 0.030725106596946716, "eval_rewards/rejected": -0.032805539667606354, "eval_runtime": 28.3849, "eval_samples_per_second": 21.138, "eval_steps_per_second": 0.669, "step": 800 }, { "debug/policy_chosen_logits": 1.221453070640564, "debug/policy_chosen_logps": -251.9959259033203, "debug/policy_rejected_logits": 1.6424691677093506, "debug/policy_rejected_logps": -314.9390563964844, "debug/reference_chosen_logps": -253.0550079345703, "debug/reference_rejected_logps": -309.57537841796875, "debug/sppo_chosen_loss": 2406.28515625, "debug/sppo_chosen_reward_in_loss": 1.059086799621582, "debug/sppo_rej_reward_in_loss": -5.363643646240234, "debug/sppo_reject_loss": 2103.30810546875, "epoch": 2.9166666666666665, "grad_norm": 69167.85516381508, "learning_rate": 7.484927282583103e-08, "logits/chosen": 1.221453070640564, "logits/rejected": 1.6424691677093506, "logps/chosen": -251.9959259033203, "logps/rejected": -314.9390563964844, "loss": 4556.7664, "rewards/accuracies": 0.675000011920929, "rewards/chosen": 0.010590868070721626, "rewards/margins": 0.06422730535268784, "rewards/rejected": -0.05363643169403076, "step": 805 }, { "debug/policy_chosen_logits": 1.2087690830230713, "debug/policy_chosen_logps": -248.7294464111328, "debug/policy_rejected_logits": 1.498429536819458, "debug/policy_rejected_logps": -316.4676818847656, "debug/reference_chosen_logps": -249.0221710205078, "debug/reference_rejected_logps": -309.80548095703125, "debug/sppo_chosen_loss": 2494.62451171875, "debug/sppo_chosen_reward_in_loss": 0.2927181124687195, "debug/sppo_rej_reward_in_loss": -6.6621809005737305, "debug/sppo_reject_loss": 1967.0299072265625, "epoch": 2.9347826086956523, "grad_norm": 68731.1622774303, "learning_rate": 7.452527652376863e-08, "logits/chosen": 1.2087690830230713, "logits/rejected": 1.498429536819458, "logps/chosen": -248.7294464111328, "logps/rejected": -316.4676818847656, "loss": 4657.1684, "rewards/accuracies": 0.75, "rewards/chosen": 0.002927180379629135, "rewards/margins": 0.06954899430274963, "rewards/rejected": -0.0666218176484108, "step": 810 }, { "debug/policy_chosen_logits": 1.0130951404571533, "debug/policy_chosen_logps": -237.8261260986328, "debug/policy_rejected_logits": 1.4029724597930908, "debug/policy_rejected_logps": -298.6962890625, "debug/reference_chosen_logps": -238.4188232421875, "debug/reference_rejected_logps": -296.5367126464844, "debug/sppo_chosen_loss": 2480.473388671875, "debug/sppo_chosen_reward_in_loss": 0.5927131772041321, "debug/sppo_rej_reward_in_loss": -2.159576892852783, "debug/sppo_reject_loss": 2338.80712890625, "epoch": 2.9528985507246377, "grad_norm": 73180.34487996876, "learning_rate": 7.419991843075463e-08, "logits/chosen": 1.0130951404571533, "logits/rejected": 1.4029724597930908, "logps/chosen": -237.8261260986328, "logps/rejected": -298.6962890625, "loss": 4632.7711, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": 0.005927131976932287, "rewards/margins": 0.027522901073098183, "rewards/rejected": -0.021595770493149757, "step": 815 }, { "debug/policy_chosen_logits": 1.3386638164520264, "debug/policy_chosen_logps": -250.01718139648438, "debug/policy_rejected_logits": 1.921316385269165, "debug/policy_rejected_logps": -306.84710693359375, "debug/reference_chosen_logps": -251.37051391601562, "debug/reference_rejected_logps": -302.274658203125, "debug/sppo_chosen_loss": 2387.706298828125, "debug/sppo_chosen_reward_in_loss": 1.3533411026000977, "debug/sppo_rej_reward_in_loss": -4.572475910186768, "debug/sppo_reject_loss": 2148.51806640625, "epoch": 2.971014492753623, "grad_norm": 57116.2603627131, "learning_rate": 7.387321661262844e-08, "logits/chosen": 1.3386638164520264, "logits/rejected": 1.921316385269165, "logps/chosen": -250.01718139648438, "logps/rejected": -306.84710693359375, "loss": 4651.9684, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": 0.013533410616219044, "rewards/margins": 0.059258170425891876, "rewards/rejected": -0.04572475701570511, "step": 820 }, { "debug/policy_chosen_logits": 1.445495843887329, "debug/policy_chosen_logps": -283.45599365234375, "debug/policy_rejected_logits": 1.7299737930297852, "debug/policy_rejected_logps": -301.90399169921875, "debug/reference_chosen_logps": -284.3904724121094, "debug/reference_rejected_logps": -295.550537109375, "debug/sppo_chosen_loss": 2426.180419921875, "debug/sppo_chosen_reward_in_loss": 0.9344981908798218, "debug/sppo_rej_reward_in_loss": -6.353468894958496, "debug/sppo_reject_loss": 2020.0521240234375, "epoch": 2.9891304347826084, "grad_norm": 65796.06205811449, "learning_rate": 7.354518920984119e-08, "logits/chosen": 1.445495843887329, "logits/rejected": 1.7299737930297852, "logps/chosen": -283.45599365234375, "logps/rejected": -301.90399169921875, "loss": 4561.118, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": 0.009344981983304024, "rewards/margins": 0.07287967205047607, "rewards/rejected": -0.0635346919298172, "step": 825 }, { "debug/policy_chosen_logits": 1.2625454664230347, "debug/policy_chosen_logps": -236.8692169189453, "debug/policy_rejected_logits": 1.7302738428115845, "debug/policy_rejected_logps": -306.16729736328125, "debug/reference_chosen_logps": -238.1107177734375, "debug/reference_rejected_logps": -302.297607421875, "debug/sppo_chosen_loss": 2399.04638671875, "debug/sppo_chosen_reward_in_loss": 1.2415053844451904, "debug/sppo_rej_reward_in_loss": -3.869725465774536, "debug/sppo_reject_loss": 2204.76416015625, "epoch": 3.0072463768115942, "grad_norm": 61193.65334425578, "learning_rate": 7.32158544364484e-08, "logits/chosen": 1.2625454664230347, "logits/rejected": 1.7302738428115845, "logps/chosen": -236.8692169189453, "logps/rejected": -306.16729736328125, "loss": 4634.2973, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": 0.012415053322911263, "rewards/margins": 0.051112305372953415, "rewards/rejected": -0.0386972539126873, "step": 830 }, { "debug/policy_chosen_logits": 1.3355305194854736, "debug/policy_chosen_logps": -260.23663330078125, "debug/policy_rejected_logits": 1.5196969509124756, "debug/policy_rejected_logps": -283.5062255859375, "debug/reference_chosen_logps": -261.3477478027344, "debug/reference_rejected_logps": -279.7190856933594, "debug/sppo_chosen_loss": 2415.00537109375, "debug/sppo_chosen_reward_in_loss": 1.111120581626892, "debug/sppo_rej_reward_in_loss": -3.787144184112549, "debug/sppo_reject_loss": 2210.35009765625, "epoch": 3.0253623188405796, "grad_norm": 72850.47897855999, "learning_rate": 7.28852305790987e-08, "logits/chosen": 1.3355305194854736, "logits/rejected": 1.5196969509124756, "logps/chosen": -260.23663330078125, "logps/rejected": -283.5062255859375, "loss": 4518.6047, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": 0.011111205443739891, "rewards/margins": 0.0489826463162899, "rewards/rejected": -0.03787143900990486, "step": 835 }, { "debug/policy_chosen_logits": 1.5348484516143799, "debug/policy_chosen_logps": -273.3497619628906, "debug/policy_rejected_logits": 1.320349931716919, "debug/policy_rejected_logps": -276.6268005371094, "debug/reference_chosen_logps": -274.346923828125, "debug/reference_rejected_logps": -274.3509216308594, "debug/sppo_chosen_loss": 2448.82177734375, "debug/sppo_chosen_reward_in_loss": 0.9972000122070312, "debug/sppo_rej_reward_in_loss": -2.2758421897888184, "debug/sppo_reject_loss": 2341.25048828125, "epoch": 3.0434782608695654, "grad_norm": 67740.2764421149, "learning_rate": 7.255333599601847e-08, "logits/chosen": 1.5348484516143799, "logits/rejected": 1.320349931716919, "logps/chosen": -273.3497619628906, "logps/rejected": -276.6268005371094, "loss": 4595.4688, "rewards/accuracies": 0.675000011920929, "rewards/chosen": 0.009972000494599342, "rewards/margins": 0.03273041918873787, "rewards/rejected": -0.022758418694138527, "step": 840 }, { "debug/policy_chosen_logits": 1.3700846433639526, "debug/policy_chosen_logps": -254.62109375, "debug/policy_rejected_logits": 1.505577802658081, "debug/policy_rejected_logps": -268.71044921875, "debug/reference_chosen_logps": -258.35516357421875, "debug/reference_rejected_logps": -267.05230712890625, "debug/sppo_chosen_loss": 2162.54052734375, "debug/sppo_chosen_reward_in_loss": 3.7340476512908936, "debug/sppo_rej_reward_in_loss": -1.6581627130508423, "debug/sppo_reject_loss": 2429.60986328125, "epoch": 3.0615942028985508, "grad_norm": 70335.9560176472, "learning_rate": 7.222018911599233e-08, "logits/chosen": 1.3700846433639526, "logits/rejected": 1.505577802658081, "logps/chosen": -254.62109375, "logps/rejected": -268.71044921875, "loss": 4590.6805, "rewards/accuracies": 0.75, "rewards/chosen": 0.03734047710895538, "rewards/margins": 0.053922105580568314, "rewards/rejected": -0.01658162660896778, "step": 845 }, { "debug/policy_chosen_logits": 1.1331188678741455, "debug/policy_chosen_logps": -258.7958068847656, "debug/policy_rejected_logits": 1.5421819686889648, "debug/policy_rejected_logps": -287.38116455078125, "debug/reference_chosen_logps": -262.4872131347656, "debug/reference_rejected_logps": -282.98260498046875, "debug/sppo_chosen_loss": 2155.307373046875, "debug/sppo_chosen_reward_in_loss": 3.6914010047912598, "debug/sppo_rej_reward_in_loss": -4.398567199707031, "debug/sppo_reject_loss": 2190.74169921875, "epoch": 3.079710144927536, "grad_norm": 98115.78335956299, "learning_rate": 7.188580843734004e-08, "logits/chosen": 1.1331188678741455, "logits/rejected": 1.5421819686889648, "logps/chosen": -258.7958068847656, "logps/rejected": -287.38116455078125, "loss": 4541.2988, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": 0.03691400960087776, "rewards/margins": 0.080899678170681, "rewards/rejected": -0.043985672295093536, "step": 850 }, { "debug/policy_chosen_logits": 1.476320505142212, "debug/policy_chosen_logps": -272.7628479003906, "debug/policy_rejected_logits": 1.561833381652832, "debug/policy_rejected_logps": -317.9288024902344, "debug/reference_chosen_logps": -275.04705810546875, "debug/reference_rejected_logps": -311.6878356933594, "debug/sppo_chosen_loss": 2295.048583984375, "debug/sppo_chosen_reward_in_loss": 2.2842469215393066, "debug/sppo_rej_reward_in_loss": -6.240972995758057, "debug/sppo_reject_loss": 2041.3408203125, "epoch": 3.097826086956522, "grad_norm": 65203.51672900081, "learning_rate": 7.155021252688928e-08, "logits/chosen": 1.476320505142212, "logits/rejected": 1.561833381652832, "logps/chosen": -272.7628479003906, "logps/rejected": -317.9288024902344, "loss": 4582.6109, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": 0.022842470556497574, "rewards/margins": 0.08525218814611435, "rewards/rejected": -0.06240972876548767, "step": 855 }, { "debug/policy_chosen_logits": 1.192882776260376, "debug/policy_chosen_logps": -258.40386962890625, "debug/policy_rejected_logits": 1.2437019348144531, "debug/policy_rejected_logps": -273.90728759765625, "debug/reference_chosen_logps": -260.32659912109375, "debug/reference_rejected_logps": -270.2201232910156, "debug/sppo_chosen_loss": 2330.452392578125, "debug/sppo_chosen_reward_in_loss": 1.9227139949798584, "debug/sppo_rej_reward_in_loss": -3.687180757522583, "debug/sppo_reject_loss": 2217.678466796875, "epoch": 3.1159420289855073, "grad_norm": 70023.40665554293, "learning_rate": 7.121342001894466e-08, "logits/chosen": 1.192882776260376, "logits/rejected": 1.2437019348144531, "logps/chosen": -258.40386962890625, "logps/rejected": -273.90728759765625, "loss": 4706.7301, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": 0.01922714151442051, "rewards/margins": 0.056098949164152145, "rewards/rejected": -0.03687180578708649, "step": 860 }, { "debug/policy_chosen_logits": 1.283287525177002, "debug/policy_chosen_logps": -259.46160888671875, "debug/policy_rejected_logits": 1.6224002838134766, "debug/policy_rejected_logps": -277.697998046875, "debug/reference_chosen_logps": -260.6236877441406, "debug/reference_rejected_logps": -272.70477294921875, "debug/sppo_chosen_loss": 2419.24853515625, "debug/sppo_chosen_reward_in_loss": 1.1620738506317139, "debug/sppo_rej_reward_in_loss": -4.993208885192871, "debug/sppo_reject_loss": 2130.47021484375, "epoch": 3.1340579710144927, "grad_norm": 65909.5441623718, "learning_rate": 7.087544961425316e-08, "logits/chosen": 1.283287525177002, "logits/rejected": 1.6224002838134766, "logps/chosen": -259.46160888671875, "logps/rejected": -277.697998046875, "loss": 4560.4773, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 0.011620739474892616, "rewards/margins": 0.061552830040454865, "rewards/rejected": -0.0499320924282074, "step": 865 }, { "debug/policy_chosen_logits": 1.4900705814361572, "debug/policy_chosen_logps": -271.69781494140625, "debug/policy_rejected_logits": 1.7472995519638062, "debug/policy_rejected_logps": -323.4566955566406, "debug/reference_chosen_logps": -273.129150390625, "debug/reference_rejected_logps": -319.109130859375, "debug/sppo_chosen_loss": 2371.99560546875, "debug/sppo_chosen_reward_in_loss": 1.4313232898712158, "debug/sppo_rej_reward_in_loss": -4.347558498382568, "debug/sppo_reject_loss": 2175.521728515625, "epoch": 3.1521739130434785, "grad_norm": 98449.00469359074, "learning_rate": 7.05363200789656e-08, "logits/chosen": 1.4900705814361572, "logits/rejected": 1.7472995519638062, "logps/chosen": -271.69781494140625, "logps/rejected": -323.4566955566406, "loss": 4573.332, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": 0.014313233084976673, "rewards/margins": 0.05778881907463074, "rewards/rejected": -0.04347558692097664, "step": 870 }, { "debug/policy_chosen_logits": 1.4324753284454346, "debug/policy_chosen_logps": -273.22796630859375, "debug/policy_rejected_logits": 1.4399211406707764, "debug/policy_rejected_logps": -269.93475341796875, "debug/reference_chosen_logps": -273.6427307128906, "debug/reference_rejected_logps": -265.6277160644531, "debug/sppo_chosen_loss": 2514.471435546875, "debug/sppo_chosen_reward_in_loss": 0.41477125883102417, "debug/sppo_rej_reward_in_loss": -4.3070831298828125, "debug/sppo_reject_loss": 2171.365234375, "epoch": 3.170289855072464, "grad_norm": 62355.19727690433, "learning_rate": 7.019605024359474e-08, "logits/chosen": 1.4324753284454346, "logits/rejected": 1.4399211406707764, "logps/chosen": -273.22796630859375, "logps/rejected": -269.93475341796875, "loss": 4654.0688, "rewards/accuracies": 0.675000011920929, "rewards/chosen": 0.0041477130725979805, "rewards/margins": 0.04721853882074356, "rewards/rejected": -0.043070826679468155, "step": 875 }, { "debug/policy_chosen_logits": 1.3117127418518066, "debug/policy_chosen_logps": -256.207763671875, "debug/policy_rejected_logits": 1.6756629943847656, "debug/policy_rejected_logps": -342.5567321777344, "debug/reference_chosen_logps": -257.4051818847656, "debug/reference_rejected_logps": -340.0152587890625, "debug/sppo_chosen_loss": 2410.57080078125, "debug/sppo_chosen_reward_in_loss": 1.1974289417266846, "debug/sppo_rej_reward_in_loss": -2.541478157043457, "debug/sppo_reject_loss": 2331.065185546875, "epoch": 3.1884057971014492, "grad_norm": 92334.78755239808, "learning_rate": 6.98546590019697e-08, "logits/chosen": 1.3117127418518066, "logits/rejected": 1.6756629943847656, "logps/chosen": -256.207763671875, "logps/rejected": -342.5567321777344, "loss": 4476.9602, "rewards/accuracies": 0.625, "rewards/chosen": 0.011974288150668144, "rewards/margins": 0.03738906979560852, "rewards/rejected": -0.025414779782295227, "step": 880 }, { "debug/policy_chosen_logits": 0.9446808099746704, "debug/policy_chosen_logps": -233.10348510742188, "debug/policy_rejected_logits": 1.277573585510254, "debug/policy_rejected_logps": -285.7528991699219, "debug/reference_chosen_logps": -232.969482421875, "debug/reference_rejected_logps": -279.6332702636719, "debug/sppo_chosen_loss": 2578.521484375, "debug/sppo_chosen_reward_in_loss": -0.13400498032569885, "debug/sppo_rej_reward_in_loss": -6.119626045227051, "debug/sppo_reject_loss": 2058.451171875, "epoch": 3.2065217391304346, "grad_norm": 101465.78772268088, "learning_rate": 6.951216531018677e-08, "logits/chosen": 0.9446808099746704, "logits/rejected": 1.277573585510254, "logps/chosen": -233.10348510742188, "logps/rejected": -285.7528991699219, "loss": 4723.9219, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -0.0013400499010458589, "rewards/margins": 0.05985620617866516, "rewards/rejected": -0.061196256428956985, "step": 885 }, { "debug/policy_chosen_logits": 1.2341248989105225, "debug/policy_chosen_logps": -257.8463134765625, "debug/policy_rejected_logits": 1.4800808429718018, "debug/policy_rejected_logps": -290.77386474609375, "debug/reference_chosen_logps": -256.938720703125, "debug/reference_rejected_logps": -284.2198486328125, "debug/sppo_chosen_loss": 2649.87548828125, "debug/sppo_chosen_reward_in_loss": -0.9075664281845093, "debug/sppo_rej_reward_in_loss": -6.554051399230957, "debug/sppo_reject_loss": 1996.097900390625, "epoch": 3.2246376811594204, "grad_norm": 60303.7658421852, "learning_rate": 6.91685881855569e-08, "logits/chosen": 1.2341248989105225, "logits/rejected": 1.4800808429718018, "logps/chosen": -257.8463134765625, "logps/rejected": -290.77386474609375, "loss": 4511.9859, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -0.009075663983821869, "rewards/margins": 0.056464843451976776, "rewards/rejected": -0.06554051488637924, "step": 890 }, { "debug/policy_chosen_logits": 1.3582326173782349, "debug/policy_chosen_logps": -245.34634399414062, "debug/policy_rejected_logits": 1.5517940521240234, "debug/policy_rejected_logps": -263.07659912109375, "debug/reference_chosen_logps": -245.36166381835938, "debug/reference_rejected_logps": -260.2655334472656, "debug/sppo_chosen_loss": 2533.37841796875, "debug/sppo_chosen_reward_in_loss": 0.015349959954619408, "debug/sppo_rej_reward_in_loss": -2.811088800430298, "debug/sppo_reject_loss": 2292.26513671875, "epoch": 3.2427536231884058, "grad_norm": 67855.48725470214, "learning_rate": 6.882394670554983e-08, "logits/chosen": 1.3582326173782349, "logits/rejected": 1.5517940521240234, "logps/chosen": -245.34634399414062, "logps/rejected": -263.07659912109375, "loss": 4732.8016, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 0.00015349910245276988, "rewards/margins": 0.02826438844203949, "rewards/rejected": -0.028110885992646217, "step": 895 }, { "debug/policy_chosen_logits": 1.0621987581253052, "debug/policy_chosen_logps": -241.30923461914062, "debug/policy_rejected_logits": 1.5428647994995117, "debug/policy_rejected_logps": -297.16314697265625, "debug/reference_chosen_logps": -242.0436553955078, "debug/reference_rejected_logps": -291.9529724121094, "debug/sppo_chosen_loss": 2451.71728515625, "debug/sppo_chosen_reward_in_loss": 0.7344198226928711, "debug/sppo_rej_reward_in_loss": -5.210179328918457, "debug/sppo_reject_loss": 2117.42138671875, "epoch": 3.260869565217391, "grad_norm": 66758.97499411159, "learning_rate": 6.847826000673463e-08, "logits/chosen": 1.0621987581253052, "logits/rejected": 1.5428647994995117, "logps/chosen": -241.30923461914062, "logps/rejected": -297.16314697265625, "loss": 4526.798, "rewards/accuracies": 0.675000011920929, "rewards/chosen": 0.007344198413193226, "rewards/margins": 0.05944598838686943, "rewards/rejected": -0.05210179090499878, "step": 900 }, { "epoch": 3.260869565217391, "eval_debug/policy_chosen_logits": 1.4543341398239136, "eval_debug/policy_chosen_logps": -253.01719665527344, "eval_debug/policy_rejected_logits": 1.5043613910675049, "eval_debug/policy_rejected_logps": -263.0449523925781, "eval_debug/reference_chosen_logps": -252.91845703125, "eval_debug/reference_rejected_logps": -259.6585998535156, "eval_debug/sppo_chosen_loss": 2560.71923828125, "eval_debug/sppo_chosen_reward_in_loss": -0.0987061932682991, "eval_debug/sppo_rej_reward_in_loss": -3.386300802230835, "eval_debug/sppo_reject_loss": 2277.551513671875, "eval_logits/chosen": 1.4543341398239136, "eval_logits/rejected": 1.5043613910675049, "eval_logps/chosen": -253.01719665527344, "eval_logps/rejected": -263.0449523925781, "eval_loss": 4673.5791015625, "eval_rewards/accuracies": 0.5921052694320679, "eval_rewards/chosen": -0.000987062114290893, "eval_rewards/margins": 0.032875943928956985, "eval_rewards/rejected": -0.03386300429701805, "eval_runtime": 28.4446, "eval_samples_per_second": 21.094, "eval_steps_per_second": 0.668, "step": 900 }, { "debug/policy_chosen_logits": 1.3216747045516968, "debug/policy_chosen_logps": -273.6351318359375, "debug/policy_rejected_logits": 1.6194353103637695, "debug/policy_rejected_logps": -297.20819091796875, "debug/reference_chosen_logps": -273.14617919921875, "debug/reference_rejected_logps": -291.0046691894531, "debug/sppo_chosen_loss": 2630.943359375, "debug/sppo_chosen_reward_in_loss": -0.488912969827652, "debug/sppo_rej_reward_in_loss": -6.203536033630371, "debug/sppo_reject_loss": 2036.7193603515625, "epoch": 3.278985507246377, "grad_norm": 86282.95682551128, "learning_rate": 6.813154728371727e-08, "logits/chosen": 1.3216747045516968, "logits/rejected": 1.6194353103637695, "logps/chosen": -273.6351318359375, "logps/rejected": -297.20819091796875, "loss": 4647.6363, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -0.004889129661023617, "rewards/margins": 0.05714622884988785, "rewards/rejected": -0.06203535944223404, "step": 905 }, { "debug/policy_chosen_logits": 1.189056396484375, "debug/policy_chosen_logps": -247.99783325195312, "debug/policy_rejected_logits": 1.2917503118515015, "debug/policy_rejected_logps": -281.1531982421875, "debug/reference_chosen_logps": -248.80142211914062, "debug/reference_rejected_logps": -276.1498107910156, "debug/sppo_chosen_loss": 2473.15283203125, "debug/sppo_chosen_reward_in_loss": 0.8035877346992493, "debug/sppo_rej_reward_in_loss": -5.003408908843994, "debug/sppo_reject_loss": 2130.677490234375, "epoch": 3.2971014492753623, "grad_norm": 85369.33504264854, "learning_rate": 6.77838277880747e-08, "logits/chosen": 1.189056396484375, "logits/rejected": 1.2917503118515015, "logps/chosen": -247.99783325195312, "logps/rejected": -281.1531982421875, "loss": 4709.9395, "rewards/accuracies": 0.75, "rewards/chosen": 0.008035877719521523, "rewards/margins": 0.05806996300816536, "rewards/rejected": -0.050034087151288986, "step": 910 }, { "debug/policy_chosen_logits": 1.5871620178222656, "debug/policy_chosen_logps": -265.612060546875, "debug/policy_rejected_logits": 1.6904337406158447, "debug/policy_rejected_logps": -276.8652648925781, "debug/reference_chosen_logps": -264.2405700683594, "debug/reference_rejected_logps": -274.06915283203125, "debug/sppo_chosen_loss": 2715.077880859375, "debug/sppo_chosen_reward_in_loss": -1.3715009689331055, "debug/sppo_rej_reward_in_loss": -2.7960915565490723, "debug/sppo_reject_loss": 2292.55712890625, "epoch": 3.3152173913043477, "grad_norm": 62428.39352075046, "learning_rate": 6.743512082728601e-08, "logits/chosen": 1.5871620178222656, "logits/rejected": 1.6904337406158447, "logps/chosen": -265.612060546875, "logps/rejected": -276.8652648925781, "loss": 4642.7937, "rewards/accuracies": 0.5249999761581421, "rewards/chosen": -0.013715009205043316, "rewards/margins": 0.014245906844735146, "rewards/rejected": -0.02796091139316559, "step": 915 }, { "debug/policy_chosen_logits": 1.2219802141189575, "debug/policy_chosen_logps": -264.1886291503906, "debug/policy_rejected_logits": 1.2222537994384766, "debug/policy_rejected_logps": -267.58038330078125, "debug/reference_chosen_logps": -265.7757263183594, "debug/reference_rejected_logps": -265.7680358886719, "debug/sppo_chosen_loss": 2358.83642578125, "debug/sppo_chosen_reward_in_loss": 1.5870968103408813, "debug/sppo_rej_reward_in_loss": -1.8123528957366943, "debug/sppo_reject_loss": 2367.68115234375, "epoch": 3.3333333333333335, "grad_norm": 60585.62405711025, "learning_rate": 6.708544576366023e-08, "logits/chosen": 1.2219802141189575, "logits/rejected": 1.2222537994384766, "logps/chosen": -264.1886291503906, "logps/rejected": -267.58038330078125, "loss": 4629.0422, "rewards/accuracies": 0.625, "rewards/chosen": 0.015870967879891396, "rewards/margins": 0.03399449959397316, "rewards/rejected": -0.018123529851436615, "step": 920 }, { "debug/policy_chosen_logits": 1.1675989627838135, "debug/policy_chosen_logps": -278.4476013183594, "debug/policy_rejected_logits": 1.2601947784423828, "debug/policy_rejected_logps": -310.80718994140625, "debug/reference_chosen_logps": -278.3129577636719, "debug/reference_rejected_logps": -303.284423828125, "debug/sppo_chosen_loss": 2571.292236328125, "debug/sppo_chosen_reward_in_loss": -0.1346588134765625, "debug/sppo_rej_reward_in_loss": -7.522784233093262, "debug/sppo_reject_loss": 1986.8385009765625, "epoch": 3.351449275362319, "grad_norm": 65406.092156402694, "learning_rate": 6.673482201326134e-08, "logits/chosen": 1.1675989627838135, "logits/rejected": 1.2601947784423828, "logps/chosen": -278.4476013183594, "logps/rejected": -310.80718994140625, "loss": 4602.2395, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -0.001346587436273694, "rewards/margins": 0.07388125360012054, "rewards/rejected": -0.07522784918546677, "step": 925 }, { "debug/policy_chosen_logits": 1.0340900421142578, "debug/policy_chosen_logps": -243.76431274414062, "debug/policy_rejected_logits": 1.3360588550567627, "debug/policy_rejected_logps": -282.24517822265625, "debug/reference_chosen_logps": -244.71994018554688, "debug/reference_rejected_logps": -276.67999267578125, "debug/sppo_chosen_loss": 2420.22119140625, "debug/sppo_chosen_reward_in_loss": 0.9556635022163391, "debug/sppo_rej_reward_in_loss": -5.565131187438965, "debug/sppo_reject_loss": 2063.99853515625, "epoch": 3.369565217391304, "grad_norm": 62121.8313624803, "learning_rate": 6.638326904483011e-08, "logits/chosen": 1.0340900421142578, "logits/rejected": 1.3360588550567627, "logps/chosen": -243.76431274414062, "logps/rejected": -282.24517822265625, "loss": 4595.0086, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 0.009556634351611137, "rewards/margins": 0.06520794332027435, "rewards/rejected": -0.055651307106018066, "step": 930 }, { "debug/policy_chosen_logits": 1.0084972381591797, "debug/policy_chosen_logps": -231.0150604248047, "debug/policy_rejected_logits": 1.483633041381836, "debug/policy_rejected_logps": -263.78753662109375, "debug/reference_chosen_logps": -233.00662231445312, "debug/reference_rejected_logps": -258.6335144042969, "debug/sppo_chosen_loss": 2313.91064453125, "debug/sppo_chosen_reward_in_loss": 1.9915668964385986, "debug/sppo_rej_reward_in_loss": -5.154005527496338, "debug/sppo_reject_loss": 2143.50732421875, "epoch": 3.38768115942029, "grad_norm": 58778.16609152914, "learning_rate": 6.603080637870306e-08, "logits/chosen": 1.0084972381591797, "logits/rejected": 1.483633041381836, "logps/chosen": -231.0150604248047, "logps/rejected": -263.78753662109375, "loss": 4569.0508, "rewards/accuracies": 0.75, "rewards/chosen": 0.019915666431188583, "rewards/margins": 0.07145573198795319, "rewards/rejected": -0.05154005438089371, "step": 935 }, { "debug/policy_chosen_logits": 1.1337854862213135, "debug/policy_chosen_logps": -245.05838012695312, "debug/policy_rejected_logits": 1.2576329708099365, "debug/policy_rejected_logps": -268.67169189453125, "debug/reference_chosen_logps": -245.1829071044922, "debug/reference_rejected_logps": -262.12872314453125, "debug/sppo_chosen_loss": 2540.96142578125, "debug/sppo_chosen_reward_in_loss": 0.12454567104578018, "debug/sppo_rej_reward_in_loss": -6.542975425720215, "debug/sppo_reject_loss": 1980.4417724609375, "epoch": 3.4057971014492754, "grad_norm": 79533.11373377292, "learning_rate": 6.567745358572863e-08, "logits/chosen": 1.1337854862213135, "logits/rejected": 1.2576329708099365, "logps/chosen": -245.05838012695312, "logps/rejected": -268.67169189453125, "loss": 4578.5586, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": 0.0012454565148800611, "rewards/margins": 0.06667520850896835, "rewards/rejected": -0.06542975455522537, "step": 940 }, { "debug/policy_chosen_logits": 1.131906270980835, "debug/policy_chosen_logps": -226.1370849609375, "debug/policy_rejected_logits": 1.4897301197052002, "debug/policy_rejected_logps": -268.9652099609375, "debug/reference_chosen_logps": -227.4475555419922, "debug/reference_rejected_logps": -264.8956298828125, "debug/sppo_chosen_loss": 2411.512451171875, "debug/sppo_chosen_reward_in_loss": 1.3104562759399414, "debug/sppo_rej_reward_in_loss": -4.069582939147949, "debug/sppo_reject_loss": 2181.656982421875, "epoch": 3.4239130434782608, "grad_norm": 106805.97530068812, "learning_rate": 6.532323028618045e-08, "logits/chosen": 1.131906270980835, "logits/rejected": 1.4897301197052002, "logps/chosen": -226.1370849609375, "logps/rejected": -268.9652099609375, "loss": 4561.7008, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": 0.013104562647640705, "rewards/margins": 0.053800392895936966, "rewards/rejected": -0.04069582745432854, "step": 945 }, { "debug/policy_chosen_logits": 0.8874229192733765, "debug/policy_chosen_logps": -253.674560546875, "debug/policy_rejected_logits": 1.1931557655334473, "debug/policy_rejected_logps": -277.47772216796875, "debug/reference_chosen_logps": -252.0839385986328, "debug/reference_rejected_logps": -270.8529052734375, "debug/sppo_chosen_loss": 2753.93359375, "debug/sppo_chosen_reward_in_loss": -1.5906407833099365, "debug/sppo_rej_reward_in_loss": -6.624871253967285, "debug/sppo_reject_loss": 1967.4906005859375, "epoch": 3.4420289855072466, "grad_norm": 71768.84768061835, "learning_rate": 6.496815614866791e-08, "logits/chosen": 0.8874229192733765, "logits/rejected": 1.1931557655334473, "logps/chosen": -253.674560546875, "logps/rejected": -277.47772216796875, "loss": 4537.0883, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -0.015906408429145813, "rewards/margins": 0.05034229904413223, "rewards/rejected": -0.06624870747327805, "step": 950 }, { "debug/policy_chosen_logits": 1.2168424129486084, "debug/policy_chosen_logps": -248.585693359375, "debug/policy_rejected_logits": 1.4745090007781982, "debug/policy_rejected_logps": -255.3431854248047, "debug/reference_chosen_logps": -250.8568572998047, "debug/reference_rejected_logps": -252.77029418945312, "debug/sppo_chosen_loss": 2288.59375, "debug/sppo_chosen_reward_in_loss": 2.271167516708374, "debug/sppo_rej_reward_in_loss": -2.5729286670684814, "debug/sppo_reject_loss": 2316.78564453125, "epoch": 3.460144927536232, "grad_norm": 104690.3662571117, "learning_rate": 6.461225088904402e-08, "logits/chosen": 1.2168424129486084, "logits/rejected": 1.4745090007781982, "logps/chosen": -248.585693359375, "logps/rejected": -255.3431854248047, "loss": 4551.0578, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 0.022711673751473427, "rewards/margins": 0.04844096302986145, "rewards/rejected": -0.025729287415742874, "step": 955 }, { "debug/policy_chosen_logits": 1.2385737895965576, "debug/policy_chosen_logps": -241.7220458984375, "debug/policy_rejected_logits": 1.4867292642593384, "debug/policy_rejected_logps": -280.5006103515625, "debug/reference_chosen_logps": -242.78359985351562, "debug/reference_rejected_logps": -274.2628479003906, "debug/sppo_chosen_loss": 2402.5537109375, "debug/sppo_chosen_reward_in_loss": 1.0615535974502563, "debug/sppo_rej_reward_in_loss": -6.237776756286621, "debug/sppo_reject_loss": 2054.988525390625, "epoch": 3.4782608695652173, "grad_norm": 55443.77261586784, "learning_rate": 6.425553426931074e-08, "logits/chosen": 1.2385737895965576, "logits/rejected": 1.4867292642593384, "logps/chosen": -241.7220458984375, "logps/rejected": -280.5006103515625, "loss": 4659.8496, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": 0.010615535080432892, "rewards/margins": 0.07299330085515976, "rewards/rejected": -0.06237776204943657, "step": 960 }, { "debug/policy_chosen_logits": 1.3841091394424438, "debug/policy_chosen_logps": -256.5181579589844, "debug/policy_rejected_logits": 1.741140365600586, "debug/policy_rejected_logps": -308.02252197265625, "debug/reference_chosen_logps": -256.64630126953125, "debug/reference_rejected_logps": -303.5486755371094, "debug/sppo_chosen_loss": 2536.14306640625, "debug/sppo_chosen_reward_in_loss": 0.12813511490821838, "debug/sppo_rej_reward_in_loss": -4.473842144012451, "debug/sppo_reject_loss": 2165.02587890625, "epoch": 3.496376811594203, "grad_norm": 67083.57000441544, "learning_rate": 6.389802609652162e-08, "logits/chosen": 1.3841091394424438, "logits/rejected": 1.741140365600586, "logps/chosen": -256.5181579589844, "logps/rejected": -308.02252197265625, "loss": 4516.9992, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 0.001281349454075098, "rewards/margins": 0.046019770205020905, "rewards/rejected": -0.044738419353961945, "step": 965 }, { "debug/policy_chosen_logits": 1.2212345600128174, "debug/policy_chosen_logps": -264.72930908203125, "debug/policy_rejected_logits": 1.5062768459320068, "debug/policy_rejected_logps": -296.52203369140625, "debug/reference_chosen_logps": -266.0826721191406, "debug/reference_rejected_logps": -290.98681640625, "debug/sppo_chosen_loss": 2382.504150390625, "debug/sppo_chosen_reward_in_loss": 1.3533518314361572, "debug/sppo_rej_reward_in_loss": -5.535216331481934, "debug/sppo_reject_loss": 2121.054443359375, "epoch": 3.5144927536231885, "grad_norm": 71337.47534161789, "learning_rate": 6.353974622168195e-08, "logits/chosen": 1.2212345600128174, "logits/rejected": 1.5062768459320068, "logps/chosen": -264.72930908203125, "logps/rejected": -296.52203369140625, "loss": 4735.7297, "rewards/accuracies": 0.75, "rewards/chosen": 0.013533517718315125, "rewards/margins": 0.0688856840133667, "rewards/rejected": -0.05535217002034187, "step": 970 }, { "debug/policy_chosen_logits": 1.2449438571929932, "debug/policy_chosen_logps": -274.74798583984375, "debug/policy_rejected_logits": 1.4908430576324463, "debug/policy_rejected_logps": -294.26043701171875, "debug/reference_chosen_logps": -276.3829040527344, "debug/reference_rejected_logps": -289.47418212890625, "debug/sppo_chosen_loss": 2366.81787109375, "debug/sppo_chosen_reward_in_loss": 1.6348743438720703, "debug/sppo_rej_reward_in_loss": -4.786262035369873, "debug/sppo_reject_loss": 2130.755615234375, "epoch": 3.532608695652174, "grad_norm": 81418.27012097696, "learning_rate": 6.318071453864662e-08, "logits/chosen": 1.2449438571929932, "logits/rejected": 1.4908430576324463, "logps/chosen": -274.74798583984375, "logps/rejected": -294.26043701171875, "loss": 4559.5645, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": 0.016348743811249733, "rewards/margins": 0.06421136111021042, "rewards/rejected": -0.047862619161605835, "step": 975 }, { "debug/policy_chosen_logits": 1.1797082424163818, "debug/policy_chosen_logps": -226.591796875, "debug/policy_rejected_logits": 1.7147998809814453, "debug/policy_rejected_logps": -287.06842041015625, "debug/reference_chosen_logps": -229.1777801513672, "debug/reference_rejected_logps": -279.7626953125, "debug/sppo_chosen_loss": 2254.087646484375, "debug/sppo_chosen_reward_in_loss": 2.585974931716919, "debug/sppo_rej_reward_in_loss": -7.305711269378662, "debug/sppo_reject_loss": 1962.0999755859375, "epoch": 3.550724637681159, "grad_norm": 67774.91197865033, "learning_rate": 6.282095098301539e-08, "logits/chosen": 1.1797082424163818, "logits/rejected": 1.7147998809814453, "logps/chosen": -226.591796875, "logps/rejected": -287.06842041015625, "loss": 4500.5508, "rewards/accuracies": 0.875, "rewards/chosen": 0.02585974894464016, "rewards/margins": 0.09891685843467712, "rewards/rejected": -0.07305711507797241, "step": 980 }, { "debug/policy_chosen_logits": 1.225523591041565, "debug/policy_chosen_logps": -247.4475555419922, "debug/policy_rejected_logits": 1.4108251333236694, "debug/policy_rejected_logps": -291.97711181640625, "debug/reference_chosen_logps": -249.59201049804688, "debug/reference_rejected_logps": -285.00433349609375, "debug/sppo_chosen_loss": 2312.104736328125, "debug/sppo_chosen_reward_in_loss": 2.144468069076538, "debug/sppo_rej_reward_in_loss": -6.972817897796631, "debug/sppo_reject_loss": 1981.7515869140625, "epoch": 3.568840579710145, "grad_norm": 66697.2543743667, "learning_rate": 6.246047553102603e-08, "logits/chosen": 1.225523591041565, "logits/rejected": 1.4108251333236694, "logps/chosen": -247.4475555419922, "logps/rejected": -291.97711181640625, "loss": 4519.1367, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": 0.02144468203186989, "rewards/margins": 0.09117285907268524, "rewards/rejected": -0.06972817331552505, "step": 985 }, { "debug/policy_chosen_logits": 1.4036672115325928, "debug/policy_chosen_logps": -270.65557861328125, "debug/policy_rejected_logits": 1.3850643634796143, "debug/policy_rejected_logps": -292.19305419921875, "debug/reference_chosen_logps": -272.51019287109375, "debug/reference_rejected_logps": -290.306884765625, "debug/sppo_chosen_loss": 2350.90185546875, "debug/sppo_chosen_reward_in_loss": 1.854596734046936, "debug/sppo_rej_reward_in_loss": -1.886178731918335, "debug/sppo_reject_loss": 2385.397705078125, "epoch": 3.5869565217391304, "grad_norm": 87720.9428138047, "learning_rate": 6.209930819844507e-08, "logits/chosen": 1.4036672115325928, "logits/rejected": 1.3850643634796143, "logps/chosen": -270.65557861328125, "logps/rejected": -292.19305419921875, "loss": 4641.4867, "rewards/accuracies": 0.75, "rewards/chosen": 0.0185459665954113, "rewards/margins": 0.037407755851745605, "rewards/rejected": -0.018861789256334305, "step": 990 }, { "debug/policy_chosen_logits": 1.102624535560608, "debug/policy_chosen_logps": -255.6370849609375, "debug/policy_rejected_logits": 1.6262718439102173, "debug/policy_rejected_logps": -312.22467041015625, "debug/reference_chosen_logps": -255.8466339111328, "debug/reference_rejected_logps": -307.6287841796875, "debug/sppo_chosen_loss": 2506.376708984375, "debug/sppo_chosen_reward_in_loss": 0.20953139662742615, "debug/sppo_rej_reward_in_loss": -4.5958662033081055, "debug/sppo_reject_loss": 2155.5029296875, "epoch": 3.605072463768116, "grad_norm": 79109.53179825464, "learning_rate": 6.173746903945638e-08, "logits/chosen": 1.102624535560608, "logits/rejected": 1.6262718439102173, "logps/chosen": -255.6370849609375, "logps/rejected": -312.22467041015625, "loss": 4548.2949, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": 0.002095314208418131, "rewards/margins": 0.04805397614836693, "rewards/rejected": -0.045958660542964935, "step": 995 }, { "debug/policy_chosen_logits": 1.402718186378479, "debug/policy_chosen_logps": -263.24212646484375, "debug/policy_rejected_logits": 1.5921787023544312, "debug/policy_rejected_logps": -289.51226806640625, "debug/reference_chosen_logps": -265.0359802246094, "debug/reference_rejected_logps": -285.32977294921875, "debug/sppo_chosen_loss": 2352.199462890625, "debug/sppo_chosen_reward_in_loss": 1.7938053607940674, "debug/sppo_rej_reward_in_loss": -4.182505130767822, "debug/sppo_reject_loss": 2218.07470703125, "epoch": 3.6231884057971016, "grad_norm": 64425.89693535943, "learning_rate": 6.137497814554771e-08, "logits/chosen": 1.402718186378479, "logits/rejected": 1.5921787023544312, "logps/chosen": -263.24212646484375, "logps/rejected": -289.51226806640625, "loss": 4599.7109, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": 0.017938053235411644, "rewards/margins": 0.059763096272945404, "rewards/rejected": -0.04182504862546921, "step": 1000 }, { "epoch": 3.6231884057971016, "eval_debug/policy_chosen_logits": 1.4479906558990479, "eval_debug/policy_chosen_logps": -252.73809814453125, "eval_debug/policy_rejected_logits": 1.4972885847091675, "eval_debug/policy_rejected_logps": -262.917236328125, "eval_debug/reference_chosen_logps": -252.91845703125, "eval_debug/reference_rejected_logps": -259.6585998535156, "eval_debug/sppo_chosen_loss": 2535.936767578125, "eval_debug/sppo_chosen_reward_in_loss": 0.18038207292556763, "eval_debug/sppo_rej_reward_in_loss": -3.2586004734039307, "eval_debug/sppo_reject_loss": 2302.096923828125, "eval_logits/chosen": 1.4479906558990479, "eval_logits/rejected": 1.4972885847091675, "eval_logps/chosen": -252.73809814453125, "eval_logps/rejected": -262.917236328125, "eval_loss": 4664.81689453125, "eval_rewards/accuracies": 0.5657894611358643, "eval_rewards/chosen": 0.0018038200214505196, "eval_rewards/margins": 0.03438982367515564, "eval_rewards/rejected": -0.032586004585027695, "eval_runtime": 28.5745, "eval_samples_per_second": 20.998, "eval_steps_per_second": 0.665, "step": 1000 }, { "debug/policy_chosen_logits": 1.068411111831665, "debug/policy_chosen_logps": -243.59439086914062, "debug/policy_rejected_logits": 1.6734635829925537, "debug/policy_rejected_logps": -307.09027099609375, "debug/reference_chosen_logps": -243.37484741210938, "debug/reference_rejected_logps": -299.6080017089844, "debug/sppo_chosen_loss": 2582.09765625, "debug/sppo_chosen_reward_in_loss": -0.21955490112304688, "debug/sppo_rej_reward_in_loss": -7.482234001159668, "debug/sppo_reject_loss": 2003.4375, "epoch": 3.641304347826087, "grad_norm": 72879.08819563665, "learning_rate": 6.101185564439507e-08, "logits/chosen": 1.068411111831665, "logits/rejected": 1.6734635829925537, "logps/chosen": -243.59439086914062, "logps/rejected": -307.09027099609375, "loss": 4494.1641, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -0.0021955487318336964, "rewards/margins": 0.07262678444385529, "rewards/rejected": -0.074822336435318, "step": 1005 }, { "debug/policy_chosen_logits": 1.1344451904296875, "debug/policy_chosen_logps": -240.4451446533203, "debug/policy_rejected_logits": 1.7183849811553955, "debug/policy_rejected_logps": -306.75244140625, "debug/reference_chosen_logps": -241.87393188476562, "debug/reference_rejected_logps": -300.93377685546875, "debug/sppo_chosen_loss": 2374.83251953125, "debug/sppo_chosen_reward_in_loss": 1.4287716150283813, "debug/sppo_rej_reward_in_loss": -5.818660259246826, "debug/sppo_reject_loss": 2069.93798828125, "epoch": 3.6594202898550723, "grad_norm": 58992.621556186234, "learning_rate": 6.064812169874505e-08, "logits/chosen": 1.1344451904296875, "logits/rejected": 1.7183849811553955, "logps/chosen": -240.4451446533203, "logps/rejected": -306.75244140625, "loss": 4562.4156, "rewards/accuracies": 0.675000011920929, "rewards/chosen": 0.014287715777754784, "rewards/margins": 0.07247431576251984, "rewards/rejected": -0.0581866018474102, "step": 1010 }, { "debug/policy_chosen_logits": 1.4492950439453125, "debug/policy_chosen_logps": -258.4239807128906, "debug/policy_rejected_logits": 1.282820224761963, "debug/policy_rejected_logps": -271.81219482421875, "debug/reference_chosen_logps": -259.12908935546875, "debug/reference_rejected_logps": -265.5401611328125, "debug/sppo_chosen_loss": 2455.122314453125, "debug/sppo_chosen_reward_in_loss": 0.7051193118095398, "debug/sppo_rej_reward_in_loss": -6.272065162658691, "debug/sppo_reject_loss": 2034.664306640625, "epoch": 3.677536231884058, "grad_norm": 77077.84401835203, "learning_rate": 6.028379650529536e-08, "logits/chosen": 1.4492950439453125, "logits/rejected": 1.282820224761963, "logps/chosen": -258.4239807128906, "logps/rejected": -271.81219482421875, "loss": 4588.6488, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": 0.007051193621009588, "rewards/margins": 0.06977184116840363, "rewards/rejected": -0.0627206414937973, "step": 1015 }, { "debug/policy_chosen_logits": 0.9079713821411133, "debug/policy_chosen_logps": -240.22738647460938, "debug/policy_rejected_logits": 1.334970474243164, "debug/policy_rejected_logps": -296.3286437988281, "debug/reference_chosen_logps": -241.02230834960938, "debug/reference_rejected_logps": -289.7048034667969, "debug/sppo_chosen_loss": 2454.479736328125, "debug/sppo_chosen_reward_in_loss": 0.7949361801147461, "debug/sppo_rej_reward_in_loss": -6.62381649017334, "debug/sppo_reject_loss": 2016.115966796875, "epoch": 3.6956521739130435, "grad_norm": 61592.09809646041, "learning_rate": 5.991890029357334e-08, "logits/chosen": 0.9079713821411133, "logits/rejected": 1.334970474243164, "logps/chosen": -240.22738647460938, "logps/rejected": -296.3286437988281, "loss": 4505.7461, "rewards/accuracies": 0.75, "rewards/chosen": 0.007949361577630043, "rewards/margins": 0.07418752461671829, "rewards/rejected": -0.0662381649017334, "step": 1020 }, { "debug/policy_chosen_logits": 1.4035258293151855, "debug/policy_chosen_logps": -254.43551635742188, "debug/policy_rejected_logits": 1.8539726734161377, "debug/policy_rejected_logps": -316.92572021484375, "debug/reference_chosen_logps": -256.17486572265625, "debug/reference_rejected_logps": -310.9232482910156, "debug/sppo_chosen_loss": 2340.42724609375, "debug/sppo_chosen_reward_in_loss": 1.739314317703247, "debug/sppo_rej_reward_in_loss": -6.002488613128662, "debug/sppo_reject_loss": 2059.16162109375, "epoch": 3.713768115942029, "grad_norm": 64473.59022347943, "learning_rate": 5.9553453324812716e-08, "logits/chosen": 1.4035258293151855, "logits/rejected": 1.8539726734161377, "logps/chosen": -254.43551635742188, "logps/rejected": -316.92572021484375, "loss": 4521.1812, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": 0.017393141984939575, "rewards/margins": 0.07741802930831909, "rewards/rejected": -0.06002488732337952, "step": 1025 }, { "debug/policy_chosen_logits": 1.2367794513702393, "debug/policy_chosen_logps": -276.3914489746094, "debug/policy_rejected_logits": 1.4828026294708252, "debug/policy_rejected_logps": -286.67523193359375, "debug/reference_chosen_logps": -277.847412109375, "debug/reference_rejected_logps": -283.011962890625, "debug/sppo_chosen_loss": 2384.060791015625, "debug/sppo_chosen_reward_in_loss": 1.4559547901153564, "debug/sppo_rej_reward_in_loss": -3.6632473468780518, "debug/sppo_reject_loss": 2214.15087890625, "epoch": 3.7318840579710146, "grad_norm": 73969.22297967359, "learning_rate": 5.918747589082852e-08, "logits/chosen": 1.2367794513702393, "logits/rejected": 1.4828026294708252, "logps/chosen": -276.3914489746094, "logps/rejected": -286.67523193359375, "loss": 4686.9672, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": 0.014559546485543251, "rewards/margins": 0.0511920228600502, "rewards/rejected": -0.0366324707865715, "step": 1030 }, { "debug/policy_chosen_logits": 1.0385093688964844, "debug/policy_chosen_logps": -269.85687255859375, "debug/policy_rejected_logits": 1.1317791938781738, "debug/policy_rejected_logps": -279.90869140625, "debug/reference_chosen_logps": -271.19110107421875, "debug/reference_rejected_logps": -272.58306884765625, "debug/sppo_chosen_loss": 2418.24658203125, "debug/sppo_chosen_reward_in_loss": 1.3342158794403076, "debug/sppo_rej_reward_in_loss": -7.325618743896484, "debug/sppo_reject_loss": 1939.541748046875, "epoch": 3.75, "grad_norm": 61579.2486295793, "learning_rate": 5.882098831289043e-08, "logits/chosen": 1.0385093688964844, "logits/rejected": 1.1317791938781738, "logps/chosen": -269.85687255859375, "logps/rejected": -279.90869140625, "loss": 4513.3438, "rewards/accuracies": 0.824999988079071, "rewards/chosen": 0.013342161662876606, "rewards/margins": 0.08659834414720535, "rewards/rejected": -0.07325618714094162, "step": 1035 }, { "debug/policy_chosen_logits": 1.7431812286376953, "debug/policy_chosen_logps": -261.0448913574219, "debug/policy_rejected_logits": 2.046821117401123, "debug/policy_rejected_logps": -301.9384460449219, "debug/reference_chosen_logps": -260.8301696777344, "debug/reference_rejected_logps": -295.55316162109375, "debug/sppo_chosen_loss": 2600.938232421875, "debug/sppo_chosen_reward_in_loss": -0.2147224396467209, "debug/sppo_rej_reward_in_loss": -6.385306358337402, "debug/sppo_reject_loss": 2045.309326171875, "epoch": 3.7681159420289854, "grad_norm": 60098.91869574073, "learning_rate": 5.845401094059438e-08, "logits/chosen": 1.7431812286376953, "logits/rejected": 2.046821117401123, "logps/chosen": -261.0448913574219, "logps/rejected": -301.9384460449219, "loss": 4533.4078, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -0.002147223800420761, "rewards/margins": 0.06170583888888359, "rewards/rejected": -0.06385305523872375, "step": 1040 }, { "debug/policy_chosen_logits": 1.536130666732788, "debug/policy_chosen_logps": -244.9329376220703, "debug/policy_rejected_logits": 1.77217698097229, "debug/policy_rejected_logps": -294.2804260253906, "debug/reference_chosen_logps": -246.18161010742188, "debug/reference_rejected_logps": -288.33087158203125, "debug/sppo_chosen_loss": 2424.373779296875, "debug/sppo_chosen_reward_in_loss": 1.2486801147460938, "debug/sppo_rej_reward_in_loss": -5.949560642242432, "debug/sppo_reject_loss": 2032.7587890625, "epoch": 3.786231884057971, "grad_norm": 69585.66644369718, "learning_rate": 5.808656415073263e-08, "logits/chosen": 1.536130666732788, "logits/rejected": 1.77217698097229, "logps/chosen": -244.9329376220703, "logps/rejected": -294.2804260253906, "loss": 4555.6328, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": 0.012486802414059639, "rewards/margins": 0.07198240607976913, "rewards/rejected": -0.059495605528354645, "step": 1045 }, { "debug/policy_chosen_logits": 1.2508312463760376, "debug/policy_chosen_logps": -263.029052734375, "debug/policy_rejected_logits": 1.6683003902435303, "debug/policy_rejected_logps": -312.407958984375, "debug/reference_chosen_logps": -265.32025146484375, "debug/reference_rejected_logps": -303.7322998046875, "debug/sppo_chosen_loss": 2288.44775390625, "debug/sppo_chosen_reward_in_loss": 2.291184663772583, "debug/sppo_rej_reward_in_loss": -8.675673484802246, "debug/sppo_reject_loss": 1858.896240234375, "epoch": 3.8043478260869565, "grad_norm": 127296.46824033877, "learning_rate": 5.7718668346162357e-08, "logits/chosen": 1.2508312463760376, "logits/rejected": 1.6683003902435303, "logps/chosen": -263.029052734375, "logps/rejected": -312.407958984375, "loss": 4507.2723, "rewards/accuracies": 0.824999988079071, "rewards/chosen": 0.02291184663772583, "rewards/margins": 0.10966857522726059, "rewards/rejected": -0.08675673604011536, "step": 1050 }, { "debug/policy_chosen_logits": 1.4180892705917358, "debug/policy_chosen_logps": -273.88018798828125, "debug/policy_rejected_logits": 1.6986258029937744, "debug/policy_rejected_logps": -292.30291748046875, "debug/reference_chosen_logps": -276.7886962890625, "debug/reference_rejected_logps": -286.98443603515625, "debug/sppo_chosen_loss": 2226.932861328125, "debug/sppo_chosen_reward_in_loss": 2.9084973335266113, "debug/sppo_rej_reward_in_loss": -5.318492412567139, "debug/sppo_reject_loss": 2071.72412109375, "epoch": 3.822463768115942, "grad_norm": 108931.73433353592, "learning_rate": 5.735034395467271e-08, "logits/chosen": 1.4180892705917358, "logits/rejected": 1.6986258029937744, "logps/chosen": -273.88018798828125, "logps/rejected": -292.30291748046875, "loss": 4483.5527, "rewards/accuracies": 0.949999988079071, "rewards/chosen": 0.02908497489988804, "rewards/margins": 0.08226989209651947, "rewards/rejected": -0.05318492650985718, "step": 1055 }, { "debug/policy_chosen_logits": 1.3309862613677979, "debug/policy_chosen_logps": -267.27777099609375, "debug/policy_rejected_logits": 1.5425870418548584, "debug/policy_rejected_logps": -288.1453552246094, "debug/reference_chosen_logps": -266.3821716308594, "debug/reference_rejected_logps": -285.1313171386719, "debug/sppo_chosen_loss": 2662.57666015625, "debug/sppo_chosen_reward_in_loss": -0.8955985903739929, "debug/sppo_rej_reward_in_loss": -3.014031171798706, "debug/sppo_reject_loss": 2292.829345703125, "epoch": 3.8405797101449277, "grad_norm": 95367.0531697411, "learning_rate": 5.698161142785058e-08, "logits/chosen": 1.3309862613677979, "logits/rejected": 1.5425870418548584, "logps/chosen": -267.27777099609375, "logps/rejected": -288.1453552246094, "loss": 4694.8043, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -0.008955985307693481, "rewards/margins": 0.021184323355555534, "rewards/rejected": -0.030140310525894165, "step": 1060 }, { "debug/policy_chosen_logits": 1.4495620727539062, "debug/policy_chosen_logps": -266.0342712402344, "debug/policy_rejected_logits": 1.4949467182159424, "debug/policy_rejected_logps": -267.76898193359375, "debug/reference_chosen_logps": -266.792236328125, "debug/reference_rejected_logps": -261.2626953125, "debug/sppo_chosen_loss": 2474.59521484375, "debug/sppo_chosen_reward_in_loss": 0.7579633593559265, "debug/sppo_rej_reward_in_loss": -6.50632381439209, "debug/sppo_reject_loss": 2022.4810791015625, "epoch": 3.858695652173913, "grad_norm": 61935.53835577618, "learning_rate": 5.661249123994495e-08, "logits/chosen": 1.4495620727539062, "logits/rejected": 1.4949467182159424, "logps/chosen": -266.0342712402344, "logps/rejected": -267.76898193359375, "loss": 4553.7563, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": 0.007579633500427008, "rewards/margins": 0.07264287769794464, "rewards/rejected": -0.0650632381439209, "step": 1065 }, { "debug/policy_chosen_logits": 1.2382694482803345, "debug/policy_chosen_logps": -242.1920623779297, "debug/policy_rejected_logits": 1.4644126892089844, "debug/policy_rejected_logps": -260.42327880859375, "debug/reference_chosen_logps": -243.15463256835938, "debug/reference_rejected_logps": -256.77166748046875, "debug/sppo_chosen_loss": 2455.119140625, "debug/sppo_chosen_reward_in_loss": 0.9625652432441711, "debug/sppo_rej_reward_in_loss": -3.651651382446289, "debug/sppo_reject_loss": 2244.75634765625, "epoch": 3.8768115942028984, "grad_norm": 62560.529926078765, "learning_rate": 5.624300388673012e-08, "logits/chosen": 1.2382694482803345, "logits/rejected": 1.4644126892089844, "logps/chosen": -242.1920623779297, "logps/rejected": -260.42327880859375, "loss": 4614.7484, "rewards/accuracies": 0.675000011920929, "rewards/chosen": 0.009625652804970741, "rewards/margins": 0.04614216461777687, "rewards/rejected": -0.03651650995016098, "step": 1070 }, { "debug/policy_chosen_logits": 1.057923674583435, "debug/policy_chosen_logps": -227.30722045898438, "debug/policy_rejected_logits": 1.3531568050384521, "debug/policy_rejected_logps": -271.60675048828125, "debug/reference_chosen_logps": -231.5152130126953, "debug/reference_rejected_logps": -270.4071350097656, "debug/sppo_chosen_loss": 2120.608154296875, "debug/sppo_chosen_reward_in_loss": 4.2079925537109375, "debug/sppo_rej_reward_in_loss": -1.1996062994003296, "debug/sppo_reject_loss": 2496.496337890625, "epoch": 3.894927536231884, "grad_norm": 212962.5914278675, "learning_rate": 5.5873169884367596e-08, "logits/chosen": 1.057923674583435, "logits/rejected": 1.3531568050384521, "logps/chosen": -227.30722045898438, "logps/rejected": -271.60675048828125, "loss": 4589.2344, "rewards/accuracies": 0.675000011920929, "rewards/chosen": 0.042079925537109375, "rewards/margins": 0.05407598614692688, "rewards/rejected": -0.011996065266430378, "step": 1075 }, { "debug/policy_chosen_logits": 1.2656968832015991, "debug/policy_chosen_logps": -228.71029663085938, "debug/policy_rejected_logits": 1.6959224939346313, "debug/policy_rejected_logps": -311.6900939941406, "debug/reference_chosen_logps": -238.9965362548828, "debug/reference_rejected_logps": -321.0993957519531, "debug/sppo_chosen_loss": 1639.9117431640625, "debug/sppo_chosen_reward_in_loss": 10.28625202178955, "debug/sppo_rej_reward_in_loss": 9.409296035766602, "debug/sppo_reject_loss": 3562.62158203125, "epoch": 3.9130434782608696, "grad_norm": 134964.29261057146, "learning_rate": 5.550300976826696e-08, "logits/chosen": 1.2656968832015991, "logits/rejected": 1.6959224939346313, "logps/chosen": -228.71029663085938, "logps/rejected": -311.6900939941406, "loss": 5403.2051, "rewards/accuracies": 0.675000011920929, "rewards/chosen": 0.10286252200603485, "rewards/margins": 0.008769561536610126, "rewards/rejected": 0.094092957675457, "step": 1080 }, { "debug/policy_chosen_logits": 1.2611393928527832, "debug/policy_chosen_logps": -254.24649047851562, "debug/policy_rejected_logits": 1.7364327907562256, "debug/policy_rejected_logps": -283.56488037109375, "debug/reference_chosen_logps": -259.33831787109375, "debug/reference_rejected_logps": -290.1636962890625, "debug/sppo_chosen_loss": 2095.25146484375, "debug/sppo_chosen_reward_in_loss": 5.091801643371582, "debug/sppo_rej_reward_in_loss": 6.598813056945801, "debug/sppo_reject_loss": 3243.10498046875, "epoch": 3.931159420289855, "grad_norm": 80427.36638938684, "learning_rate": 5.513254409194554e-08, "logits/chosen": 1.2611393928527832, "logits/rejected": 1.7364327907562256, "logps/chosen": -254.24649047851562, "logps/rejected": -283.56488037109375, "loss": 5105.2883, "rewards/accuracies": 0.44999998807907104, "rewards/chosen": 0.05091802030801773, "rewards/margins": -0.015070107765495777, "rewards/rejected": 0.06598811596632004, "step": 1085 }, { "debug/policy_chosen_logits": 1.1099778413772583, "debug/policy_chosen_logps": -226.66616821289062, "debug/policy_rejected_logits": 1.6032660007476807, "debug/policy_rejected_logps": -317.4427490234375, "debug/reference_chosen_logps": -233.14584350585938, "debug/reference_rejected_logps": -319.9851989746094, "debug/sppo_chosen_loss": 1903.612548828125, "debug/sppo_chosen_reward_in_loss": 6.479703426361084, "debug/sppo_rej_reward_in_loss": 2.5424790382385254, "debug/sppo_reject_loss": 2803.58642578125, "epoch": 3.949275362318841, "grad_norm": 60879.37374870972, "learning_rate": 5.4761793425887274e-08, "logits/chosen": 1.1099778413772583, "logits/rejected": 1.6032660007476807, "logps/chosen": -226.66616821289062, "logps/rejected": -317.4427490234375, "loss": 4794.3469, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": 0.06479702889919281, "rewards/margins": 0.039372242987155914, "rewards/rejected": 0.025424787774682045, "step": 1090 }, { "debug/policy_chosen_logits": 0.9639069437980652, "debug/policy_chosen_logps": -227.4275665283203, "debug/policy_rejected_logits": 1.2056257724761963, "debug/policy_rejected_logps": -266.2969665527344, "debug/reference_chosen_logps": -232.79541015625, "debug/reference_rejected_logps": -265.0921325683594, "debug/sppo_chosen_loss": 2008.8134765625, "debug/sppo_chosen_reward_in_loss": 5.367839336395264, "debug/sppo_rej_reward_in_loss": -1.2048313617706299, "debug/sppo_reject_loss": 2482.92919921875, "epoch": 3.967391304347826, "grad_norm": 62988.92224608488, "learning_rate": 5.439077835640038e-08, "logits/chosen": 0.9639069437980652, "logits/rejected": 1.2056257724761963, "logps/chosen": -227.4275665283203, "logps/rejected": -266.2969665527344, "loss": 4757.8082, "rewards/accuracies": 0.75, "rewards/chosen": 0.05367839336395264, "rewards/margins": 0.06572670489549637, "rewards/rejected": -0.012048312462866306, "step": 1095 }, { "debug/policy_chosen_logits": 1.3540475368499756, "debug/policy_chosen_logps": -268.8951721191406, "debug/policy_rejected_logits": 1.5584566593170166, "debug/policy_rejected_logps": -272.23944091796875, "debug/reference_chosen_logps": -273.50811767578125, "debug/reference_rejected_logps": -270.3443298339844, "debug/sppo_chosen_loss": 2084.36181640625, "debug/sppo_chosen_reward_in_loss": 4.612961769104004, "debug/sppo_rej_reward_in_loss": -1.8950939178466797, "debug/sppo_reject_loss": 2381.415283203125, "epoch": 3.9855072463768115, "grad_norm": 78353.44336278702, "learning_rate": 5.4019519484474376e-08, "logits/chosen": 1.3540475368499756, "logits/rejected": 1.5584566593170166, "logps/chosen": -268.8951721191406, "logps/rejected": -272.23944091796875, "loss": 4598.4699, "rewards/accuracies": 0.875, "rewards/chosen": 0.04612961411476135, "rewards/margins": 0.06508056074380875, "rewards/rejected": -0.018950939178466797, "step": 1100 }, { "epoch": 3.9855072463768115, "eval_debug/policy_chosen_logits": 1.4246468544006348, "eval_debug/policy_chosen_logps": -250.67323303222656, "eval_debug/policy_rejected_logits": 1.4704437255859375, "eval_debug/policy_rejected_logps": -261.152099609375, "eval_debug/reference_chosen_logps": -252.91845703125, "eval_debug/reference_rejected_logps": -259.6585998535156, "eval_debug/sppo_chosen_loss": 2330.43505859375, "eval_debug/sppo_chosen_reward_in_loss": 2.245246648788452, "eval_debug/sppo_rej_reward_in_loss": -1.4934808015823364, "eval_debug/sppo_reject_loss": 2454.228515625, "eval_logits/chosen": 1.4246468544006348, "eval_logits/rejected": 1.4704437255859375, "eval_logps/chosen": -250.67323303222656, "eval_logps/rejected": -261.152099609375, "eval_loss": 4659.80908203125, "eval_rewards/accuracies": 0.6578947305679321, "eval_rewards/chosen": 0.022452462464571, "eval_rewards/margins": 0.03738727420568466, "eval_rewards/rejected": -0.014934806153178215, "eval_runtime": 28.6024, "eval_samples_per_second": 20.977, "eval_steps_per_second": 0.664, "step": 1100 }, { "debug/policy_chosen_logits": 1.1118760108947754, "debug/policy_chosen_logps": -255.175048828125, "debug/policy_rejected_logits": 1.0309432744979858, "debug/policy_rejected_logps": -250.95205688476562, "debug/reference_chosen_logps": -258.0804748535156, "debug/reference_rejected_logps": -249.09701538085938, "debug/sppo_chosen_loss": 2245.64990234375, "debug/sppo_chosen_reward_in_loss": 2.9053878784179688, "debug/sppo_rej_reward_in_loss": -1.855006456375122, "debug/sppo_reject_loss": 2433.066162109375, "epoch": 4.003623188405797, "grad_norm": 66540.11198134471, "learning_rate": 5.364803742463616e-08, "logits/chosen": 1.1118760108947754, "logits/rejected": 1.0309432744979858, "logps/chosen": -255.175048828125, "logps/rejected": -250.95205688476562, "loss": 4605.5047, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": 0.02905387617647648, "rewards/margins": 0.04760394245386124, "rewards/rejected": -0.01855006255209446, "step": 1105 }, { "debug/policy_chosen_logits": 1.1205860376358032, "debug/policy_chosen_logps": -227.9105987548828, "debug/policy_rejected_logits": 1.6373428106307983, "debug/policy_rejected_logps": -315.283203125, "debug/reference_chosen_logps": -230.3061981201172, "debug/reference_rejected_logps": -310.6712341308594, "debug/sppo_chosen_loss": 2305.743408203125, "debug/sppo_chosen_reward_in_loss": 2.3956267833709717, "debug/sppo_rej_reward_in_loss": -4.611940383911133, "debug/sppo_reject_loss": 2187.669189453125, "epoch": 4.021739130434782, "grad_norm": 117309.90344802302, "learning_rate": 5.327635280380538e-08, "logits/chosen": 1.1205860376358032, "logits/rejected": 1.6373428106307983, "logps/chosen": -227.9105987548828, "logps/rejected": -315.283203125, "loss": 4562.8648, "rewards/accuracies": 0.824999988079071, "rewards/chosen": 0.023956269025802612, "rewards/margins": 0.07007567584514618, "rewards/rejected": -0.04611939936876297, "step": 1110 }, { "debug/policy_chosen_logits": 1.216569185256958, "debug/policy_chosen_logps": -245.4633331298828, "debug/policy_rejected_logits": 1.344404935836792, "debug/policy_rejected_logps": -275.8775939941406, "debug/reference_chosen_logps": -246.9245147705078, "debug/reference_rejected_logps": -274.3244323730469, "debug/sppo_chosen_loss": 2414.36669921875, "debug/sppo_chosen_reward_in_loss": 1.4611823558807373, "debug/sppo_rej_reward_in_loss": -1.5531257390975952, "debug/sppo_reject_loss": 2421.377197265625, "epoch": 4.0398550724637685, "grad_norm": 65295.14624639694, "learning_rate": 5.290448626014904e-08, "logits/chosen": 1.216569185256958, "logits/rejected": 1.344404935836792, "logps/chosen": -245.4633331298828, "logps/rejected": -275.8775939941406, "loss": 4631.8605, "rewards/accuracies": 0.625, "rewards/chosen": 0.014611823484301567, "rewards/margins": 0.03014308586716652, "rewards/rejected": -0.015531256794929504, "step": 1115 }, { "debug/policy_chosen_logits": 1.2135611772537231, "debug/policy_chosen_logps": -256.9342041015625, "debug/policy_rejected_logits": 1.3099197149276733, "debug/policy_rejected_logps": -286.2529296875, "debug/reference_chosen_logps": -259.2249450683594, "debug/reference_rejected_logps": -282.4788818359375, "debug/sppo_chosen_loss": 2306.509521484375, "debug/sppo_chosen_reward_in_loss": 2.2907767295837402, "debug/sppo_rej_reward_in_loss": -3.77405047416687, "debug/sppo_reject_loss": 2217.175537109375, "epoch": 4.057971014492754, "grad_norm": 105412.84079895554, "learning_rate": 5.253245844193563e-08, "logits/chosen": 1.2135611772537231, "logits/rejected": 1.3099197149276733, "logps/chosen": -256.9342041015625, "logps/rejected": -286.2529296875, "loss": 4491.2824, "rewards/accuracies": 0.75, "rewards/chosen": 0.022907767444849014, "rewards/margins": 0.060648269951343536, "rewards/rejected": -0.03774050623178482, "step": 1120 }, { "debug/policy_chosen_logits": 1.1843010187149048, "debug/policy_chosen_logps": -252.4182586669922, "debug/policy_rejected_logits": 1.4800398349761963, "debug/policy_rejected_logps": -295.4695739746094, "debug/reference_chosen_logps": -255.17752075195312, "debug/reference_rejected_logps": -289.46148681640625, "debug/sppo_chosen_loss": 2242.72412109375, "debug/sppo_chosen_reward_in_loss": 2.7592289447784424, "debug/sppo_rej_reward_in_loss": -6.00807523727417, "debug/sppo_reject_loss": 2072.38037109375, "epoch": 4.076086956521739, "grad_norm": 77597.85285256788, "learning_rate": 5.21602900063886e-08, "logits/chosen": 1.1843010187149048, "logits/rejected": 1.4800398349761963, "logps/chosen": -252.4182586669922, "logps/rejected": -295.4695739746094, "loss": 4496.1145, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": 0.027592290192842484, "rewards/margins": 0.08767304569482803, "rewards/rejected": -0.06008074805140495, "step": 1125 }, { "debug/policy_chosen_logits": 1.3048770427703857, "debug/policy_chosen_logps": -276.3455505371094, "debug/policy_rejected_logits": 1.5092872381210327, "debug/policy_rejected_logps": -294.993896484375, "debug/reference_chosen_logps": -276.34832763671875, "debug/reference_rejected_logps": -289.8849182128906, "debug/sppo_chosen_loss": 2554.799560546875, "debug/sppo_chosen_reward_in_loss": 0.0027608871459960938, "debug/sppo_rej_reward_in_loss": -5.10897159576416, "debug/sppo_reject_loss": 2119.67626953125, "epoch": 4.094202898550725, "grad_norm": 97216.84067192367, "learning_rate": 5.1788001618539276e-08, "logits/chosen": 1.3048770427703857, "logits/rejected": 1.5092872381210327, "logps/chosen": -276.3455505371094, "logps/rejected": -294.993896484375, "loss": 4531.5531, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": 2.7608499294728972e-05, "rewards/margins": 0.05111732333898544, "rewards/rejected": -0.05108971521258354, "step": 1130 }, { "debug/policy_chosen_logits": 0.9452874064445496, "debug/policy_chosen_logps": -240.93484497070312, "debug/policy_rejected_logits": 1.2251718044281006, "debug/policy_rejected_logps": -279.31304931640625, "debug/reference_chosen_logps": -242.5782012939453, "debug/reference_rejected_logps": -273.7024841308594, "debug/sppo_chosen_loss": 2375.52734375, "debug/sppo_chosen_reward_in_loss": 1.643363356590271, "debug/sppo_rej_reward_in_loss": -5.610522270202637, "debug/sppo_reject_loss": 2086.57177734375, "epoch": 4.11231884057971, "grad_norm": 79223.02835815644, "learning_rate": 5.141561395007945e-08, "logits/chosen": 0.9452874064445496, "logits/rejected": 1.2251718044281006, "logps/chosen": -240.93484497070312, "logps/rejected": -279.31304931640625, "loss": 4587.2625, "rewards/accuracies": 0.675000011920929, "rewards/chosen": 0.016433632001280785, "rewards/margins": 0.07253885269165039, "rewards/rejected": -0.05610521882772446, "step": 1135 }, { "debug/policy_chosen_logits": 0.9107455015182495, "debug/policy_chosen_logps": -257.3502197265625, "debug/policy_rejected_logits": 1.4192378520965576, "debug/policy_rejected_logps": -306.44573974609375, "debug/reference_chosen_logps": -257.38555908203125, "debug/reference_rejected_logps": -298.5243225097656, "debug/sppo_chosen_loss": 2568.659912109375, "debug/sppo_chosen_reward_in_loss": 0.035347748547792435, "debug/sppo_rej_reward_in_loss": -7.921439170837402, "debug/sppo_reject_loss": 1932.2445068359375, "epoch": 4.130434782608695, "grad_norm": 120058.59485182083, "learning_rate": 5.104314767821363e-08, "logits/chosen": 0.9107455015182495, "logits/rejected": 1.4192378520965576, "logps/chosen": -257.3502197265625, "logps/rejected": -306.44573974609375, "loss": 4574.6625, "rewards/accuracies": 0.75, "rewards/chosen": 0.000353475654264912, "rewards/margins": 0.07956786453723907, "rewards/rejected": -0.07921438664197922, "step": 1140 }, { "debug/policy_chosen_logits": 1.1127492189407349, "debug/policy_chosen_logps": -247.59640502929688, "debug/policy_rejected_logits": 1.3368191719055176, "debug/policy_rejected_logps": -298.4590148925781, "debug/reference_chosen_logps": -248.70059204101562, "debug/reference_rejected_logps": -292.8433532714844, "debug/sppo_chosen_loss": 2429.892333984375, "debug/sppo_chosen_reward_in_loss": 1.1041476726531982, "debug/sppo_rej_reward_in_loss": -5.615652561187744, "debug/sppo_reject_loss": 2096.07275390625, "epoch": 4.148550724637682, "grad_norm": 57946.196357600886, "learning_rate": 5.067062348451078e-08, "logits/chosen": 1.1127492189407349, "logits/rejected": 1.3368191719055176, "logps/chosen": -247.59640502929688, "logps/rejected": -298.4590148925781, "loss": 4558.4766, "rewards/accuracies": 0.75, "rewards/chosen": 0.011041476391255856, "rewards/margins": 0.0671980008482933, "rewards/rejected": -0.05615652725100517, "step": 1145 }, { "debug/policy_chosen_logits": 1.174381971359253, "debug/policy_chosen_logps": -249.43716430664062, "debug/policy_rejected_logits": 1.5160521268844604, "debug/policy_rejected_logps": -294.8294982910156, "debug/reference_chosen_logps": -251.27197265625, "debug/reference_rejected_logps": -290.90399169921875, "debug/sppo_chosen_loss": 2353.6328125, "debug/sppo_chosen_reward_in_loss": 1.8347762823104858, "debug/sppo_rej_reward_in_loss": -3.92549467086792, "debug/sppo_reject_loss": 2237.70751953125, "epoch": 4.166666666666667, "grad_norm": 59812.82756526929, "learning_rate": 5.029806205375612e-08, "logits/chosen": 1.174381971359253, "logits/rejected": 1.5160521268844604, "logps/chosen": -249.43716430664062, "logps/rejected": -294.8294982910156, "loss": 4607.048, "rewards/accuracies": 0.625, "rewards/chosen": 0.018347764387726784, "rewards/margins": 0.057602714747190475, "rewards/rejected": -0.039254944771528244, "step": 1150 }, { "debug/policy_chosen_logits": 1.2691490650177002, "debug/policy_chosen_logps": -268.0215759277344, "debug/policy_rejected_logits": 1.664320707321167, "debug/policy_rejected_logps": -268.55377197265625, "debug/reference_chosen_logps": -268.6834716796875, "debug/reference_rejected_logps": -264.5723571777344, "debug/sppo_chosen_loss": 2495.21826171875, "debug/sppo_chosen_reward_in_loss": 0.661870002746582, "debug/sppo_rej_reward_in_loss": -3.981393814086914, "debug/sppo_reject_loss": 2244.986328125, "epoch": 4.184782608695652, "grad_norm": 66401.13018831579, "learning_rate": 4.9925484072802416e-08, "logits/chosen": 1.2691490650177002, "logits/rejected": 1.664320707321167, "logps/chosen": -268.0215759277344, "logps/rejected": -268.55377197265625, "loss": 4537.3687, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": 0.00661869952455163, "rewards/margins": 0.04643263667821884, "rewards/rejected": -0.03981393203139305, "step": 1155 }, { "debug/policy_chosen_logits": 1.1173344850540161, "debug/policy_chosen_logps": -228.13916015625, "debug/policy_rejected_logits": 1.6752166748046875, "debug/policy_rejected_logps": -300.904296875, "debug/reference_chosen_logps": -228.95333862304688, "debug/reference_rejected_logps": -298.1351623535156, "debug/sppo_chosen_loss": 2445.013671875, "debug/sppo_chosen_reward_in_loss": 0.8141956329345703, "debug/sppo_rej_reward_in_loss": -2.769136428833008, "debug/sppo_reject_loss": 2345.28369140625, "epoch": 4.202898550724638, "grad_norm": 60564.715991142424, "learning_rate": 4.955291022942145e-08, "logits/chosen": 1.1173344850540161, "logits/rejected": 1.6752166748046875, "logps/chosen": -228.13916015625, "logps/rejected": -300.904296875, "loss": 4666.5391, "rewards/accuracies": 0.675000011920929, "rewards/chosen": 0.008141955360770226, "rewards/margins": 0.035833317786455154, "rewards/rejected": -0.027691364288330078, "step": 1160 }, { "debug/policy_chosen_logits": 1.0344722270965576, "debug/policy_chosen_logps": -238.86990356445312, "debug/policy_rejected_logits": 1.4619827270507812, "debug/policy_rejected_logps": -276.3166809082031, "debug/reference_chosen_logps": -240.12442016601562, "debug/reference_rejected_logps": -271.47845458984375, "debug/sppo_chosen_loss": 2413.0107421875, "debug/sppo_chosen_reward_in_loss": 1.2545111179351807, "debug/sppo_rej_reward_in_loss": -4.838225364685059, "debug/sppo_reject_loss": 2133.81298828125, "epoch": 4.221014492753623, "grad_norm": 95972.71297880122, "learning_rate": 4.918036121115522e-08, "logits/chosen": 1.0344722270965576, "logits/rejected": 1.4619827270507812, "logps/chosen": -238.86990356445312, "logps/rejected": -276.3166809082031, "loss": 4568.5465, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": 0.012545110657811165, "rewards/margins": 0.060927361249923706, "rewards/rejected": -0.04838225245475769, "step": 1165 }, { "debug/policy_chosen_logits": 1.2485231161117554, "debug/policy_chosen_logps": -272.33270263671875, "debug/policy_rejected_logits": 1.2228999137878418, "debug/policy_rejected_logps": -272.02862548828125, "debug/reference_chosen_logps": -274.59075927734375, "debug/reference_rejected_logps": -266.5238037109375, "debug/sppo_chosen_loss": 2290.04931640625, "debug/sppo_chosen_reward_in_loss": 2.2580807209014893, "debug/sppo_rej_reward_in_loss": -5.504796028137207, "debug/sppo_reject_loss": 2098.5244140625, "epoch": 4.239130434782608, "grad_norm": 61518.25577334932, "learning_rate": 4.8807857704167354e-08, "logits/chosen": 1.2485231161117554, "logits/rejected": 1.2228999137878418, "logps/chosen": -272.33270263671875, "logps/rejected": -272.02862548828125, "loss": 4555.2211, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 0.02258080616593361, "rewards/margins": 0.07762876898050308, "rewards/rejected": -0.05504796653985977, "step": 1170 }, { "debug/policy_chosen_logits": 1.1164934635162354, "debug/policy_chosen_logps": -260.64654541015625, "debug/policy_rejected_logits": 1.1424113512039185, "debug/policy_rejected_logps": -283.02191162109375, "debug/reference_chosen_logps": -261.7978515625, "debug/reference_rejected_logps": -278.51116943359375, "debug/sppo_chosen_loss": 2429.515625, "debug/sppo_chosen_reward_in_loss": 1.1512893438339233, "debug/sppo_rej_reward_in_loss": -4.510707378387451, "debug/sppo_reject_loss": 2166.89794921875, "epoch": 4.257246376811594, "grad_norm": 63260.42201895485, "learning_rate": 4.843542039209433e-08, "logits/chosen": 1.1164934635162354, "logits/rejected": 1.1424113512039185, "logps/chosen": -260.64654541015625, "logps/rejected": -283.02191162109375, "loss": 4533.0938, "rewards/accuracies": 0.75, "rewards/chosen": 0.011512893252074718, "rewards/margins": 0.05661996454000473, "rewards/rejected": -0.04510707035660744, "step": 1175 }, { "debug/policy_chosen_logits": 1.2684993743896484, "debug/policy_chosen_logps": -276.10369873046875, "debug/policy_rejected_logits": 1.4684734344482422, "debug/policy_rejected_logps": -307.7631530761719, "debug/reference_chosen_logps": -273.1227722167969, "debug/reference_rejected_logps": -302.58367919921875, "debug/sppo_chosen_loss": 2924.57373046875, "debug/sppo_chosen_reward_in_loss": -2.980940341949463, "debug/sppo_rej_reward_in_loss": -5.179494380950928, "debug/sppo_reject_loss": 2151.70654296875, "epoch": 4.27536231884058, "grad_norm": 93177.70691280684, "learning_rate": 4.806306995489717e-08, "logits/chosen": 1.2684993743896484, "logits/rejected": 1.4684734344482422, "logps/chosen": -276.10369873046875, "logps/rejected": -307.7631530761719, "loss": 4602.8684, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -0.02980940416455269, "rewards/margins": 0.02198554016649723, "rewards/rejected": -0.05179494619369507, "step": 1180 }, { "debug/policy_chosen_logits": 0.8586879968643188, "debug/policy_chosen_logps": -242.4465789794922, "debug/policy_rejected_logits": 1.266420602798462, "debug/policy_rejected_logps": -309.1039733886719, "debug/reference_chosen_logps": -243.1239013671875, "debug/reference_rejected_logps": -302.6136169433594, "debug/sppo_chosen_loss": 2482.39892578125, "debug/sppo_chosen_reward_in_loss": 0.6773250699043274, "debug/sppo_rej_reward_in_loss": -6.490335941314697, "debug/sppo_reject_loss": 2004.927734375, "epoch": 4.293478260869565, "grad_norm": 64413.494297686426, "learning_rate": 4.769082706771303e-08, "logits/chosen": 0.8586879968643188, "logits/rejected": 1.266420602798462, "logps/chosen": -242.4465789794922, "logps/rejected": -309.1039733886719, "loss": 4494.5984, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": 0.006773251108825207, "rewards/margins": 0.07167660444974899, "rewards/rejected": -0.06490335613489151, "step": 1185 }, { "debug/policy_chosen_logits": 1.2695519924163818, "debug/policy_chosen_logps": -249.88650512695312, "debug/policy_rejected_logits": 1.4001498222351074, "debug/policy_rejected_logps": -259.56414794921875, "debug/reference_chosen_logps": -252.074951171875, "debug/reference_rejected_logps": -256.25079345703125, "debug/sppo_chosen_loss": 2297.40478515625, "debug/sppo_chosen_reward_in_loss": 2.188446521759033, "debug/sppo_rej_reward_in_loss": -3.3133347034454346, "debug/sppo_reject_loss": 2301.43310546875, "epoch": 4.311594202898551, "grad_norm": 68571.0234947558, "learning_rate": 4.731871239970723e-08, "logits/chosen": 1.2695519924163818, "logits/rejected": 1.4001498222351074, "logps/chosen": -249.88650512695312, "logps/rejected": -259.56414794921875, "loss": 4506.7273, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": 0.02188446745276451, "rewards/margins": 0.05501781031489372, "rewards/rejected": -0.03313334658741951, "step": 1190 }, { "debug/policy_chosen_logits": 1.0433049201965332, "debug/policy_chosen_logps": -260.4845275878906, "debug/policy_rejected_logits": 1.3592349290847778, "debug/policy_rejected_logps": -304.9561462402344, "debug/reference_chosen_logps": -261.36907958984375, "debug/reference_rejected_logps": -301.53497314453125, "debug/sppo_chosen_loss": 2446.2802734375, "debug/sppo_chosen_reward_in_loss": 0.8845428228378296, "debug/sppo_rej_reward_in_loss": -3.4211902618408203, "debug/sppo_reject_loss": 2262.201904296875, "epoch": 4.329710144927536, "grad_norm": 60012.33571932537, "learning_rate": 4.694674661292563e-08, "logits/chosen": 1.0433049201965332, "logits/rejected": 1.3592349290847778, "logps/chosen": -260.4845275878906, "logps/rejected": -304.9561462402344, "loss": 4682.9477, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 0.008845428004860878, "rewards/margins": 0.04305732995271683, "rewards/rejected": -0.0342118963599205, "step": 1195 }, { "debug/policy_chosen_logits": 1.639452338218689, "debug/policy_chosen_logps": -251.68209838867188, "debug/policy_rejected_logits": 2.0136008262634277, "debug/policy_rejected_logps": -309.7662048339844, "debug/reference_chosen_logps": -252.7598114013672, "debug/reference_rejected_logps": -300.9691467285156, "debug/sppo_chosen_loss": 2417.675048828125, "debug/sppo_chosen_reward_in_loss": 1.0777438879013062, "debug/sppo_rej_reward_in_loss": -8.797119140625, "debug/sppo_reject_loss": 1863.860595703125, "epoch": 4.3478260869565215, "grad_norm": 59513.496975803915, "learning_rate": 4.6574950361147296e-08, "logits/chosen": 1.639452338218689, "logits/rejected": 2.0136008262634277, "logps/chosen": -251.68209838867188, "logps/rejected": -309.7662048339844, "loss": 4434.3441, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": 0.010777438059449196, "rewards/margins": 0.09874863177537918, "rewards/rejected": -0.08797118812799454, "step": 1200 }, { "epoch": 4.3478260869565215, "eval_debug/policy_chosen_logits": 1.4176274538040161, "eval_debug/policy_chosen_logps": -253.55946350097656, "eval_debug/policy_rejected_logits": 1.4647811651229858, "eval_debug/policy_rejected_logps": -264.1338806152344, "eval_debug/reference_chosen_logps": -252.91845703125, "eval_debug/reference_rejected_logps": -259.6585998535156, "eval_debug/sppo_chosen_loss": 2633.100830078125, "eval_debug/sppo_chosen_reward_in_loss": -0.6409844160079956, "eval_debug/sppo_rej_reward_in_loss": -4.475238800048828, "eval_debug/sppo_reject_loss": 2222.516357421875, "eval_logits/chosen": 1.4176274538040161, "eval_logits/rejected": 1.4647811651229858, "eval_logps/chosen": -253.55946350097656, "eval_logps/rejected": -264.1338806152344, "eval_loss": 4652.3701171875, "eval_rewards/accuracies": 0.5789473652839661, "eval_rewards/chosen": -0.006409844849258661, "eval_rewards/margins": 0.03834254667162895, "eval_rewards/rejected": -0.044752392917871475, "eval_runtime": 28.2997, "eval_samples_per_second": 21.202, "eval_steps_per_second": 0.671, "step": 1200 }, { "debug/policy_chosen_logits": 1.046290397644043, "debug/policy_chosen_logps": -251.0709991455078, "debug/policy_rejected_logits": 1.3077054023742676, "debug/policy_rejected_logps": -298.0001525878906, "debug/reference_chosen_logps": -251.0874481201172, "debug/reference_rejected_logps": -295.08001708984375, "debug/sppo_chosen_loss": 2553.08642578125, "debug/sppo_chosen_reward_in_loss": 0.016447830945253372, "debug/sppo_rej_reward_in_loss": -2.9201416969299316, "debug/sppo_reject_loss": 2284.946533203125, "epoch": 4.365942028985507, "grad_norm": 69233.98566803512, "learning_rate": 4.6203344288737694e-08, "logits/chosen": 1.046290397644043, "logits/rejected": 1.3077054023742676, "logps/chosen": -251.0709991455078, "logps/rejected": -298.0001525878906, "loss": 4569.1977, "rewards/accuracies": 0.625, "rewards/chosen": 0.00016447734378743917, "rewards/margins": 0.029365893453359604, "rewards/rejected": -0.029201412573456764, "step": 1205 }, { "debug/policy_chosen_logits": 1.3429532051086426, "debug/policy_chosen_logps": -232.9199981689453, "debug/policy_rejected_logits": 1.702622652053833, "debug/policy_rejected_logps": -267.97393798828125, "debug/reference_chosen_logps": -235.09268188476562, "debug/reference_rejected_logps": -262.50762939453125, "debug/sppo_chosen_loss": 2300.420654296875, "debug/sppo_chosen_reward_in_loss": 2.1726772785186768, "debug/sppo_rej_reward_in_loss": -5.466324806213379, "debug/sppo_reject_loss": 2110.576171875, "epoch": 4.384057971014493, "grad_norm": 86817.63430815592, "learning_rate": 4.583194902950234e-08, "logits/chosen": 1.3429532051086426, "logits/rejected": 1.702622652053833, "logps/chosen": -232.9199981689453, "logps/rejected": -267.97393798828125, "loss": 4474.6441, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": 0.02172677218914032, "rewards/margins": 0.07639001309871674, "rewards/rejected": -0.054663240909576416, "step": 1210 }, { "debug/policy_chosen_logits": 1.091511845588684, "debug/policy_chosen_logps": -274.4998779296875, "debug/policy_rejected_logits": 1.1359487771987915, "debug/policy_rejected_logps": -276.3737487792969, "debug/reference_chosen_logps": -274.5265197753906, "debug/reference_rejected_logps": -268.24322509765625, "debug/sppo_chosen_loss": 2567.46533203125, "debug/sppo_chosen_reward_in_loss": 0.02658367156982422, "debug/sppo_rej_reward_in_loss": -8.130558967590332, "debug/sppo_reject_loss": 1878.1728515625, "epoch": 4.4021739130434785, "grad_norm": 57731.05906653854, "learning_rate": 4.546078520554123e-08, "logits/chosen": 1.091511845588684, "logits/rejected": 1.1359487771987915, "logps/chosen": -274.4998779296875, "logps/rejected": -276.3737487792969, "loss": 4550.2871, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 0.00026583747239783406, "rewards/margins": 0.08157142251729965, "rewards/rejected": -0.08130558580160141, "step": 1215 }, { "debug/policy_chosen_logits": 0.7921018600463867, "debug/policy_chosen_logps": -234.0077667236328, "debug/policy_rejected_logits": 1.1153194904327393, "debug/policy_rejected_logps": -288.6137390136719, "debug/reference_chosen_logps": -235.14913940429688, "debug/reference_rejected_logps": -283.31939697265625, "debug/sppo_chosen_loss": 2416.521484375, "debug/sppo_chosen_reward_in_loss": 1.1413967609405518, "debug/sppo_rej_reward_in_loss": -5.294376850128174, "debug/sppo_reject_loss": 2090.29541015625, "epoch": 4.420289855072464, "grad_norm": 64438.99635837363, "learning_rate": 4.5089873426103575e-08, "logits/chosen": 0.7921018600463867, "logits/rejected": 1.1153194904327393, "logps/chosen": -234.0077667236328, "logps/rejected": -288.6137390136719, "loss": 4561.3641, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": 0.011413967236876488, "rewards/margins": 0.06435773521661758, "rewards/rejected": -0.05294376611709595, "step": 1220 }, { "debug/policy_chosen_logits": 1.2593185901641846, "debug/policy_chosen_logps": -251.8949432373047, "debug/policy_rejected_logits": 1.3442823886871338, "debug/policy_rejected_logps": -266.51556396484375, "debug/reference_chosen_logps": -254.294189453125, "debug/reference_rejected_logps": -261.4356689453125, "debug/sppo_chosen_loss": 2281.20849609375, "debug/sppo_chosen_reward_in_loss": 2.3992409706115723, "debug/sppo_rej_reward_in_loss": -5.079881191253662, "debug/sppo_reject_loss": 2105.07666015625, "epoch": 4.438405797101449, "grad_norm": 69135.99785715759, "learning_rate": 4.471923428644361e-08, "logits/chosen": 1.2593185901641846, "logits/rejected": 1.3442823886871338, "logps/chosen": -251.8949432373047, "logps/rejected": -266.51556396484375, "loss": 4308.8, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 0.02399240806698799, "rewards/margins": 0.07479121536016464, "rewards/rejected": -0.05079881101846695, "step": 1225 }, { "debug/policy_chosen_logits": 1.1671477556228638, "debug/policy_chosen_logps": -262.21832275390625, "debug/policy_rejected_logits": 1.5277702808380127, "debug/policy_rejected_logps": -303.25189208984375, "debug/reference_chosen_logps": -262.35906982421875, "debug/reference_rejected_logps": -297.26397705078125, "debug/sppo_chosen_loss": 2544.049072265625, "debug/sppo_chosen_reward_in_loss": 0.14074191451072693, "debug/sppo_rej_reward_in_loss": -5.987893581390381, "debug/sppo_reject_loss": 2118.998291015625, "epoch": 4.456521739130435, "grad_norm": 75122.42668605472, "learning_rate": 4.4348888366677e-08, "logits/chosen": 1.1671477556228638, "logits/rejected": 1.5277702808380127, "logps/chosen": -262.21832275390625, "logps/rejected": -303.25189208984375, "loss": 4490.9406, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 0.0014074190985411406, "rewards/margins": 0.06128635257482529, "rewards/rejected": -0.059878937900066376, "step": 1230 }, { "debug/policy_chosen_logits": 1.1719297170639038, "debug/policy_chosen_logps": -243.4971923828125, "debug/policy_rejected_logits": 1.5150502920150757, "debug/policy_rejected_logps": -290.97918701171875, "debug/reference_chosen_logps": -245.56253051757812, "debug/reference_rejected_logps": -283.6595153808594, "debug/sppo_chosen_loss": 2315.97802734375, "debug/sppo_chosen_reward_in_loss": 2.0653247833251953, "debug/sppo_rej_reward_in_loss": -7.319671630859375, "debug/sppo_reject_loss": 1958.665283203125, "epoch": 4.47463768115942, "grad_norm": 65485.326826669945, "learning_rate": 4.3978856230638006e-08, "logits/chosen": 1.1719297170639038, "logits/rejected": 1.5150502920150757, "logps/chosen": -243.4971923828125, "logps/rejected": -290.97918701171875, "loss": 4511.8867, "rewards/accuracies": 0.875, "rewards/chosen": 0.020653247833251953, "rewards/margins": 0.09384995698928833, "rewards/rejected": -0.07319670915603638, "step": 1235 }, { "debug/policy_chosen_logits": 0.9258686304092407, "debug/policy_chosen_logps": -252.1705780029297, "debug/policy_rejected_logits": 1.5341691970825195, "debug/policy_rejected_logps": -304.2678527832031, "debug/reference_chosen_logps": -252.189697265625, "debug/reference_rejected_logps": -299.5684814453125, "debug/sppo_chosen_loss": 2564.078125, "debug/sppo_chosen_reward_in_loss": 0.01913604699075222, "debug/sppo_rej_reward_in_loss": -4.6993584632873535, "debug/sppo_reject_loss": 2178.47705078125, "epoch": 4.492753623188406, "grad_norm": 67467.04571567297, "learning_rate": 4.360915842473778e-08, "logits/chosen": 0.9258686304092407, "logits/rejected": 1.5341691970825195, "logps/chosen": -252.1705780029297, "logps/rejected": -304.2678527832031, "loss": 4636.2949, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": 0.00019135959155391902, "rewards/margins": 0.047184936702251434, "rewards/rejected": -0.04699358344078064, "step": 1240 }, { "debug/policy_chosen_logits": 1.2430813312530518, "debug/policy_chosen_logps": -246.3681182861328, "debug/policy_rejected_logits": 1.4096765518188477, "debug/policy_rejected_logps": -265.3592834472656, "debug/reference_chosen_logps": -249.0845184326172, "debug/reference_rejected_logps": -261.7829895019531, "debug/sppo_chosen_loss": 2265.285400390625, "debug/sppo_chosen_reward_in_loss": 2.7164249420166016, "debug/sppo_rej_reward_in_loss": -3.576289415359497, "debug/sppo_reject_loss": 2272.076171875, "epoch": 4.510869565217392, "grad_norm": 87505.91804699051, "learning_rate": 4.323981547682341e-08, "logits/chosen": 1.2430813312530518, "logits/rejected": 1.4096765518188477, "logps/chosen": -246.3681182861328, "logps/rejected": -265.3592834472656, "loss": 4565.9375, "rewards/accuracies": 0.75, "rewards/chosen": 0.02716425061225891, "rewards/margins": 0.06292714178562164, "rewards/rejected": -0.03576289117336273, "step": 1245 }, { "debug/policy_chosen_logits": 0.8251383900642395, "debug/policy_chosen_logps": -251.19461059570312, "debug/policy_rejected_logits": 1.2840948104858398, "debug/policy_rejected_logps": -310.5716247558594, "debug/reference_chosen_logps": -252.380859375, "debug/reference_rejected_logps": -305.73828125, "debug/sppo_chosen_loss": 2430.56005859375, "debug/sppo_chosen_reward_in_loss": 1.1862595081329346, "debug/sppo_rej_reward_in_loss": -4.833325386047363, "debug/sppo_reject_loss": 2147.44091796875, "epoch": 4.528985507246377, "grad_norm": 65261.528655588234, "learning_rate": 4.287084789503821e-08, "logits/chosen": 0.8251383900642395, "logits/rejected": 1.2840948104858398, "logps/chosen": -251.19461059570312, "logps/rejected": -310.5716247558594, "loss": 4561.8227, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": 0.011862593702971935, "rewards/margins": 0.06019585207104683, "rewards/rejected": -0.04833325743675232, "step": 1250 }, { "debug/policy_chosen_logits": 1.1032116413116455, "debug/policy_chosen_logps": -236.8427276611328, "debug/policy_rejected_logits": 1.1994361877441406, "debug/policy_rejected_logps": -281.3302917480469, "debug/reference_chosen_logps": -238.1943359375, "debug/reference_rejected_logps": -277.8025817871094, "debug/sppo_chosen_loss": 2413.16064453125, "debug/sppo_chosen_reward_in_loss": 1.351604700088501, "debug/sppo_rej_reward_in_loss": -3.5276970863342285, "debug/sppo_reject_loss": 2266.24072265625, "epoch": 4.547101449275362, "grad_norm": 60726.047146706565, "learning_rate": 4.25022761666828e-08, "logits/chosen": 1.1032116413116455, "logits/rejected": 1.1994361877441406, "logps/chosen": -236.8427276611328, "logps/rejected": -281.3302917480469, "loss": 4491.7547, "rewards/accuracies": 0.625, "rewards/chosen": 0.013516046106815338, "rewards/margins": 0.048793014138936996, "rewards/rejected": -0.03527696803212166, "step": 1255 }, { "debug/policy_chosen_logits": 1.1790199279785156, "debug/policy_chosen_logps": -230.02743530273438, "debug/policy_rejected_logits": 1.228280782699585, "debug/policy_rejected_logps": -251.299072265625, "debug/reference_chosen_logps": -231.77627563476562, "debug/reference_rejected_logps": -250.15414428710938, "debug/sppo_chosen_loss": 2367.98876953125, "debug/sppo_chosen_reward_in_loss": 1.7488276958465576, "debug/sppo_rej_reward_in_loss": -1.1449229717254639, "debug/sppo_reject_loss": 2475.135009765625, "epoch": 4.565217391304348, "grad_norm": 64332.87644757188, "learning_rate": 4.2134120757077734e-08, "logits/chosen": 1.1790199279785156, "logits/rejected": 1.228280782699585, "logps/chosen": -230.02743530273438, "logps/rejected": -251.299072265625, "loss": 4465.0578, "rewards/accuracies": 0.675000011920929, "rewards/chosen": 0.017488278448581696, "rewards/margins": 0.028937507420778275, "rewards/rejected": -0.011449231766164303, "step": 1260 }, { "debug/policy_chosen_logits": 1.3369028568267822, "debug/policy_chosen_logps": -251.6381378173828, "debug/policy_rejected_logits": 1.5229181051254272, "debug/policy_rejected_logps": -292.3419494628906, "debug/reference_chosen_logps": -252.33349609375, "debug/reference_rejected_logps": -288.603515625, "debug/sppo_chosen_loss": 2490.442138671875, "debug/sppo_chosen_reward_in_loss": 0.6953468322753906, "debug/sppo_rej_reward_in_loss": -3.7384228706359863, "debug/sppo_reject_loss": 2298.62744140625, "epoch": 4.583333333333333, "grad_norm": 64540.17470785349, "learning_rate": 4.176640210842699e-08, "logits/chosen": 1.3369028568267822, "logits/rejected": 1.5229181051254272, "logps/chosen": -251.6381378173828, "logps/rejected": -292.3419494628906, "loss": 4575.027, "rewards/accuracies": 0.675000011920929, "rewards/chosen": 0.0069534690119326115, "rewards/margins": 0.04433769732713699, "rewards/rejected": -0.03738423064351082, "step": 1265 }, { "debug/policy_chosen_logits": 1.377716302871704, "debug/policy_chosen_logps": -278.78375244140625, "debug/policy_rejected_logits": 1.4594463109970093, "debug/policy_rejected_logps": -312.36566162109375, "debug/reference_chosen_logps": -279.77496337890625, "debug/reference_rejected_logps": -307.9085693359375, "debug/sppo_chosen_loss": 2480.254150390625, "debug/sppo_chosen_reward_in_loss": 0.9911910891532898, "debug/sppo_rej_reward_in_loss": -4.457060813903809, "debug/sppo_reject_loss": 2181.80517578125, "epoch": 4.601449275362318, "grad_norm": 61145.46098697978, "learning_rate": 4.139914063868293e-08, "logits/chosen": 1.377716302871704, "logits/rejected": 1.4594463109970093, "logps/chosen": -278.78375244140625, "logps/rejected": -312.36566162109375, "loss": 4571.3488, "rewards/accuracies": 0.824999988079071, "rewards/chosen": 0.009911911562085152, "rewards/margins": 0.05448251962661743, "rewards/rejected": -0.04457060620188713, "step": 1270 }, { "debug/policy_chosen_logits": 1.1420056819915771, "debug/policy_chosen_logps": -258.6353454589844, "debug/policy_rejected_logits": 1.4533131122589111, "debug/policy_rejected_logps": -294.0809631347656, "debug/reference_chosen_logps": -259.3207092285156, "debug/reference_rejected_logps": -291.1746520996094, "debug/sppo_chosen_loss": 2481.23779296875, "debug/sppo_chosen_reward_in_loss": 0.6853691339492798, "debug/sppo_rej_reward_in_loss": -2.906320571899414, "debug/sppo_reject_loss": 2302.75341796875, "epoch": 4.619565217391305, "grad_norm": 83740.46552961235, "learning_rate": 4.103235674041266e-08, "logits/chosen": 1.1420056819915771, "logits/rejected": 1.4533131122589111, "logps/chosen": -258.6353454589844, "logps/rejected": -294.0809631347656, "loss": 4558.2504, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": 0.006853691302239895, "rewards/margins": 0.03591689467430115, "rewards/rejected": -0.02906320057809353, "step": 1275 }, { "debug/policy_chosen_logits": 1.4147323369979858, "debug/policy_chosen_logps": -269.6281433105469, "debug/policy_rejected_logits": 1.2961864471435547, "debug/policy_rejected_logps": -270.24127197265625, "debug/reference_chosen_logps": -270.97503662109375, "debug/reference_rejected_logps": -266.56317138671875, "debug/sppo_chosen_loss": 2403.29443359375, "debug/sppo_chosen_reward_in_loss": 1.3468936681747437, "debug/sppo_rej_reward_in_loss": -3.678117275238037, "debug/sppo_reject_loss": 2250.15869140625, "epoch": 4.63768115942029, "grad_norm": 96431.02323300211, "learning_rate": 4.066607077966558e-08, "logits/chosen": 1.4147323369979858, "logits/rejected": 1.2961864471435547, "logps/chosen": -269.6281433105469, "logps/rejected": -270.24127197265625, "loss": 4642.2664, "rewards/accuracies": 0.5249999761581421, "rewards/chosen": 0.013468936085700989, "rewards/margins": 0.0502501018345356, "rewards/rejected": -0.03678116947412491, "step": 1280 }, { "debug/policy_chosen_logits": 1.1305530071258545, "debug/policy_chosen_logps": -217.1304931640625, "debug/policy_rejected_logits": 1.5472891330718994, "debug/policy_rejected_logps": -268.34173583984375, "debug/reference_chosen_logps": -220.8345947265625, "debug/reference_rejected_logps": -263.44635009765625, "debug/sppo_chosen_loss": 2151.916259765625, "debug/sppo_chosen_reward_in_loss": 3.7041258811950684, "debug/sppo_rej_reward_in_loss": -4.89541482925415, "debug/sppo_reject_loss": 2223.81689453125, "epoch": 4.655797101449275, "grad_norm": 63172.904139896964, "learning_rate": 4.030030309484266e-08, "logits/chosen": 1.1305530071258545, "logits/rejected": 1.5472891330718994, "logps/chosen": -217.1304931640625, "logps/rejected": -268.34173583984375, "loss": 4577.1152, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": 0.037041254341602325, "rewards/margins": 0.08599540591239929, "rewards/rejected": -0.04895415157079697, "step": 1285 }, { "debug/policy_chosen_logits": 1.053808569908142, "debug/policy_chosen_logps": -240.4136505126953, "debug/policy_rejected_logits": 1.4557785987854004, "debug/policy_rejected_logps": -289.88995361328125, "debug/reference_chosen_logps": -240.63711547851562, "debug/reference_rejected_logps": -285.86376953125, "debug/sppo_chosen_loss": 2550.99853515625, "debug/sppo_chosen_reward_in_loss": 0.22346897423267365, "debug/sppo_rej_reward_in_loss": -4.026174068450928, "debug/sppo_reject_loss": 2216.336669921875, "epoch": 4.673913043478261, "grad_norm": 58932.00259799572, "learning_rate": 3.9935073995566984e-08, "logits/chosen": 1.053808569908142, "logits/rejected": 1.4557785987854004, "logps/chosen": -240.4136505126953, "logps/rejected": -289.88995361328125, "loss": 4557.0328, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": 0.0022346898913383484, "rewards/margins": 0.04249643161892891, "rewards/rejected": -0.04026174172759056, "step": 1290 }, { "debug/policy_chosen_logits": 1.1827704906463623, "debug/policy_chosen_logps": -250.3016815185547, "debug/policy_rejected_logits": 1.66567063331604, "debug/policy_rejected_logps": -292.5500793457031, "debug/reference_chosen_logps": -250.30029296875, "debug/reference_rejected_logps": -286.7928771972656, "debug/sppo_chosen_loss": 2592.457763671875, "debug/sppo_chosen_reward_in_loss": -0.0013914108276367188, "debug/sppo_rej_reward_in_loss": -5.757199764251709, "debug/sppo_reject_loss": 2131.22412109375, "epoch": 4.692028985507246, "grad_norm": 68959.83758611538, "learning_rate": 3.957040376155625e-08, "logits/chosen": 1.1827704906463623, "logits/rejected": 1.66567063331604, "logps/chosen": -250.3016815185547, "logps/rejected": -292.5500793457031, "loss": 4449.4266, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -1.3914332157582976e-05, "rewards/margins": 0.0575580820441246, "rewards/rejected": -0.05757199600338936, "step": 1295 }, { "debug/policy_chosen_logits": 1.2554280757904053, "debug/policy_chosen_logps": -279.84320068359375, "debug/policy_rejected_logits": 1.32145094871521, "debug/policy_rejected_logps": -276.5966796875, "debug/reference_chosen_logps": -281.0545959472656, "debug/reference_rejected_logps": -272.4754333496094, "debug/sppo_chosen_loss": 2443.668701171875, "debug/sppo_chosen_reward_in_loss": 1.2113920450210571, "debug/sppo_rej_reward_in_loss": -4.121267795562744, "debug/sppo_reject_loss": 2225.25048828125, "epoch": 4.710144927536232, "grad_norm": 67623.47642834642, "learning_rate": 3.920631264149647e-08, "logits/chosen": 1.2554280757904053, "logits/rejected": 1.32145094871521, "logps/chosen": -279.84320068359375, "logps/rejected": -276.5966796875, "loss": 4673.5336, "rewards/accuracies": 0.675000011920929, "rewards/chosen": 0.012113918550312519, "rewards/margins": 0.053326599299907684, "rewards/rejected": -0.04121267795562744, "step": 1300 }, { "epoch": 4.710144927536232, "eval_debug/policy_chosen_logits": 1.413673996925354, "eval_debug/policy_chosen_logps": -252.32928466796875, "eval_debug/policy_rejected_logits": 1.4597282409667969, "eval_debug/policy_rejected_logps": -263.0262756347656, "eval_debug/reference_chosen_logps": -252.91845703125, "eval_debug/reference_rejected_logps": -259.6585998535156, "eval_debug/sppo_chosen_loss": 2506.592041015625, "eval_debug/sppo_chosen_reward_in_loss": 0.5891677737236023, "eval_debug/sppo_rej_reward_in_loss": -3.3676302433013916, "eval_debug/sppo_reject_loss": 2317.545654296875, "eval_logits/chosen": 1.413673996925354, "eval_logits/rejected": 1.4597282409667969, "eval_logps/chosen": -252.32928466796875, "eval_logps/rejected": -263.0262756347656, "eval_loss": 4629.23583984375, "eval_rewards/accuracies": 0.6052631735801697, "eval_rewards/chosen": 0.005891676992177963, "eval_rewards/margins": 0.039567980915308, "eval_rewards/rejected": -0.033676303923130035, "eval_runtime": 28.2834, "eval_samples_per_second": 21.214, "eval_steps_per_second": 0.672, "step": 1300 }, { "debug/policy_chosen_logits": 1.0135124921798706, "debug/policy_chosen_logps": -241.3780059814453, "debug/policy_rejected_logits": 1.3717260360717773, "debug/policy_rejected_logps": -283.3448791503906, "debug/reference_chosen_logps": -243.1513214111328, "debug/reference_rejected_logps": -277.0658264160156, "debug/sppo_chosen_loss": 2355.27001953125, "debug/sppo_chosen_reward_in_loss": 1.773308515548706, "debug/sppo_rej_reward_in_loss": -6.279069900512695, "debug/sppo_reject_loss": 2043.216064453125, "epoch": 4.728260869565218, "grad_norm": 63917.706723514275, "learning_rate": 3.884282085191782e-08, "logits/chosen": 1.0135124921798706, "logits/rejected": 1.3717260360717773, "logps/chosen": -241.3780059814453, "logps/rejected": -283.3448791503906, "loss": 4531.777, "rewards/accuracies": 0.824999988079071, "rewards/chosen": 0.01773308590054512, "rewards/margins": 0.080523781478405, "rewards/rejected": -0.06279069930315018, "step": 1305 }, { "debug/policy_chosen_logits": 1.2251166105270386, "debug/policy_chosen_logps": -254.70632934570312, "debug/policy_rejected_logits": 1.443878412246704, "debug/policy_rejected_logps": -293.47235107421875, "debug/reference_chosen_logps": -256.298828125, "debug/reference_rejected_logps": -287.5867614746094, "debug/sppo_chosen_loss": 2382.60595703125, "debug/sppo_chosen_reward_in_loss": 1.5925235748291016, "debug/sppo_rej_reward_in_loss": -5.885610103607178, "debug/sppo_reject_loss": 2065.21435546875, "epoch": 4.746376811594203, "grad_norm": 65263.12493107783, "learning_rate": 3.847994857607208e-08, "logits/chosen": 1.2251166105270386, "logits/rejected": 1.443878412246704, "logps/chosen": -254.70632934570312, "logps/rejected": -293.47235107421875, "loss": 4509.5789, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": 0.01592523418366909, "rewards/margins": 0.07478133589029312, "rewards/rejected": -0.05885609984397888, "step": 1310 }, { "debug/policy_chosen_logits": 1.4621236324310303, "debug/policy_chosen_logps": -258.2713928222656, "debug/policy_rejected_logits": 1.7599725723266602, "debug/policy_rejected_logps": -283.0798034667969, "debug/reference_chosen_logps": -258.59027099609375, "debug/reference_rejected_logps": -279.9950256347656, "debug/sppo_chosen_loss": 2545.721923828125, "debug/sppo_chosen_reward_in_loss": 0.3188707232475281, "debug/sppo_rej_reward_in_loss": -3.0847675800323486, "debug/sppo_reject_loss": 2300.00830078125, "epoch": 4.7644927536231885, "grad_norm": 66844.54159609167, "learning_rate": 3.811771596281181e-08, "logits/chosen": 1.4621236324310303, "logits/rejected": 1.7599725723266602, "logps/chosen": -258.2713928222656, "logps/rejected": -283.0798034667969, "loss": 4572.9363, "rewards/accuracies": 0.625, "rewards/chosen": 0.003188707632943988, "rewards/margins": 0.03403637930750847, "rewards/rejected": -0.030847672373056412, "step": 1315 }, { "debug/policy_chosen_logits": 1.078034520149231, "debug/policy_chosen_logps": -244.0189666748047, "debug/policy_rejected_logits": 1.416656255722046, "debug/policy_rejected_logps": -286.8953552246094, "debug/reference_chosen_logps": -244.8174591064453, "debug/reference_rejected_logps": -283.5927429199219, "debug/sppo_chosen_loss": 2442.52783203125, "debug/sppo_chosen_reward_in_loss": 0.7984712719917297, "debug/sppo_rej_reward_in_loss": -3.3026015758514404, "debug/sppo_reject_loss": 2292.23291015625, "epoch": 4.782608695652174, "grad_norm": 59809.747670291195, "learning_rate": 3.775614312547174e-08, "logits/chosen": 1.078034520149231, "logits/rejected": 1.416656255722046, "logps/chosen": -244.0189666748047, "logps/rejected": -286.8953552246094, "loss": 4636.6781, "rewards/accuracies": 0.625, "rewards/chosen": 0.007984711788594723, "rewards/margins": 0.04101072996854782, "rewards/rejected": -0.033026017248630524, "step": 1320 }, { "debug/policy_chosen_logits": 1.1492488384246826, "debug/policy_chosen_logps": -244.71347045898438, "debug/policy_rejected_logits": 1.3557655811309814, "debug/policy_rejected_logps": -301.49017333984375, "debug/reference_chosen_logps": -247.78457641601562, "debug/reference_rejected_logps": -296.8087463378906, "debug/sppo_chosen_loss": 2217.271240234375, "debug/sppo_chosen_reward_in_loss": 3.0710842609405518, "debug/sppo_rej_reward_in_loss": -4.681424140930176, "debug/sppo_reject_loss": 2150.398681640625, "epoch": 4.800724637681159, "grad_norm": 60184.51204865287, "learning_rate": 3.739525014075178e-08, "logits/chosen": 1.1492488384246826, "logits/rejected": 1.3557655811309814, "logps/chosen": -244.71347045898438, "logps/rejected": -301.49017333984375, "loss": 4390.1812, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 0.030710840597748756, "rewards/margins": 0.0775250792503357, "rewards/rejected": -0.046814244240522385, "step": 1325 }, { "debug/policy_chosen_logits": 1.0216914415359497, "debug/policy_chosen_logps": -258.01416015625, "debug/policy_rejected_logits": 1.3483374118804932, "debug/policy_rejected_logps": -332.45635986328125, "debug/reference_chosen_logps": -260.9222717285156, "debug/reference_rejected_logps": -326.95977783203125, "debug/sppo_chosen_loss": 2236.34765625, "debug/sppo_chosen_reward_in_loss": 2.9081203937530518, "debug/sppo_rej_reward_in_loss": -5.496593475341797, "debug/sppo_reject_loss": 2135.439453125, "epoch": 4.818840579710145, "grad_norm": 71535.59444523143, "learning_rate": 3.7035057047602446e-08, "logits/chosen": 1.0216914415359497, "logits/rejected": 1.3483374118804932, "logps/chosen": -258.01416015625, "logps/rejected": -332.45635986328125, "loss": 4501.743, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 0.029081201180815697, "rewards/margins": 0.08404713124036789, "rewards/rejected": -0.05496593192219734, "step": 1330 }, { "debug/policy_chosen_logits": 1.256682276725769, "debug/policy_chosen_logps": -243.0992431640625, "debug/policy_rejected_logits": 1.596100091934204, "debug/policy_rejected_logps": -297.1000061035156, "debug/reference_chosen_logps": -245.0940399169922, "debug/reference_rejected_logps": -291.6605529785156, "debug/sppo_chosen_loss": 2355.398681640625, "debug/sppo_chosen_reward_in_loss": 1.994797706604004, "debug/sppo_rej_reward_in_loss": -5.43942928314209, "debug/sppo_reject_loss": 2168.495361328125, "epoch": 4.836956521739131, "grad_norm": 95313.27853471276, "learning_rate": 3.6675583846111964e-08, "logits/chosen": 1.256682276725769, "logits/rejected": 1.596100091934204, "logps/chosen": -243.0992431640625, "logps/rejected": -297.1000061035156, "loss": 4547.6633, "rewards/accuracies": 0.824999988079071, "rewards/chosen": 0.019947977736592293, "rewards/margins": 0.0743422657251358, "rewards/rejected": -0.05439429357647896, "step": 1335 }, { "debug/policy_chosen_logits": 1.0697505474090576, "debug/policy_chosen_logps": -223.1968231201172, "debug/policy_rejected_logits": 1.5175020694732666, "debug/policy_rejected_logps": -285.0724792480469, "debug/reference_chosen_logps": -223.690185546875, "debug/reference_rejected_logps": -279.407958984375, "debug/sppo_chosen_loss": 2535.14306640625, "debug/sppo_chosen_reward_in_loss": 0.4933549761772156, "debug/sppo_rej_reward_in_loss": -5.664527416229248, "debug/sppo_reject_loss": 2116.50048828125, "epoch": 4.855072463768116, "grad_norm": 74487.85756637715, "learning_rate": 3.6316850496395855e-08, "logits/chosen": 1.0697505474090576, "logits/rejected": 1.5175020694732666, "logps/chosen": -223.1968231201172, "logps/rejected": -285.0724792480469, "loss": 4669.2, "rewards/accuracies": 0.75, "rewards/chosen": 0.004933550488203764, "rewards/margins": 0.06157882139086723, "rewards/rejected": -0.05664527416229248, "step": 1340 }, { "debug/policy_chosen_logits": 1.3100662231445312, "debug/policy_chosen_logps": -266.4246520996094, "debug/policy_rejected_logits": 1.6305770874023438, "debug/policy_rejected_logps": -269.2158203125, "debug/reference_chosen_logps": -267.3473815917969, "debug/reference_rejected_logps": -263.4280700683594, "debug/sppo_chosen_loss": 2454.28466796875, "debug/sppo_chosen_reward_in_loss": 0.9227026104927063, "debug/sppo_rej_reward_in_loss": -5.787759304046631, "debug/sppo_reject_loss": 2104.50732421875, "epoch": 4.8731884057971016, "grad_norm": 70525.65485246507, "learning_rate": 3.595887691748868e-08, "logits/chosen": 1.3100662231445312, "logits/rejected": 1.6305770874023438, "logps/chosen": -266.4246520996094, "logps/rejected": -269.2158203125, "loss": 4629.4301, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": 0.0092270253226161, "rewards/margins": 0.06710462272167206, "rewards/rejected": -0.057877592742443085, "step": 1345 }, { "debug/policy_chosen_logits": 1.0884641408920288, "debug/policy_chosen_logps": -234.8406524658203, "debug/policy_rejected_logits": 1.447321891784668, "debug/policy_rejected_logps": -278.3136291503906, "debug/reference_chosen_logps": -235.99813842773438, "debug/reference_rejected_logps": -273.8133239746094, "debug/sppo_chosen_loss": 2452.72119140625, "debug/sppo_chosen_reward_in_loss": 1.1574690341949463, "debug/sppo_rej_reward_in_loss": -4.500250816345215, "debug/sppo_reject_loss": 2180.615234375, "epoch": 4.891304347826087, "grad_norm": 62332.34024417311, "learning_rate": 3.560168298623788e-08, "logits/chosen": 1.0884641408920288, "logits/rejected": 1.447321891784668, "logps/chosen": -234.8406524658203, "logps/rejected": -278.3136291503906, "loss": 4510.0766, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 0.011574688367545605, "rewards/margins": 0.05657719820737839, "rewards/rejected": -0.04500251263380051, "step": 1350 }, { "debug/policy_chosen_logits": 1.2118421792984009, "debug/policy_chosen_logps": -256.170654296875, "debug/policy_rejected_logits": 1.509093999862671, "debug/policy_rejected_logps": -273.8734436035156, "debug/reference_chosen_logps": -257.6533203125, "debug/reference_rejected_logps": -271.18536376953125, "debug/sppo_chosen_loss": 2406.82421875, "debug/sppo_chosen_reward_in_loss": 1.4826520681381226, "debug/sppo_rej_reward_in_loss": -2.688032865524292, "debug/sppo_reject_loss": 2327.357177734375, "epoch": 4.909420289855072, "grad_norm": 96533.06499373812, "learning_rate": 3.524528853620023e-08, "logits/chosen": 1.2118421792984009, "logits/rejected": 1.509093999862671, "logps/chosen": -256.170654296875, "logps/rejected": -273.8734436035156, "loss": 4591.6375, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": 0.014826519414782524, "rewards/margins": 0.04170685261487961, "rewards/rejected": -0.026880327612161636, "step": 1355 }, { "debug/policy_chosen_logits": 1.014077067375183, "debug/policy_chosen_logps": -245.6739044189453, "debug/policy_rejected_logits": 1.540024995803833, "debug/policy_rejected_logps": -306.22271728515625, "debug/reference_chosen_logps": -247.7864227294922, "debug/reference_rejected_logps": -300.8656921386719, "debug/sppo_chosen_loss": 2335.248779296875, "debug/sppo_chosen_reward_in_loss": 2.1125502586364746, "debug/sppo_rej_reward_in_loss": -5.357022285461426, "debug/sppo_reject_loss": 2192.939453125, "epoch": 4.927536231884058, "grad_norm": 70736.48053184348, "learning_rate": 3.488971335654043e-08, "logits/chosen": 1.014077067375183, "logits/rejected": 1.540024995803833, "logps/chosen": -245.6739044189453, "logps/rejected": -306.22271728515625, "loss": 4537.7945, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 0.02112550288438797, "rewards/margins": 0.07469572126865387, "rewards/rejected": -0.0535702221095562, "step": 1360 }, { "debug/policy_chosen_logits": 1.2282394170761108, "debug/policy_chosen_logps": -254.85342407226562, "debug/policy_rejected_logits": 1.4015997648239136, "debug/policy_rejected_logps": -283.4310607910156, "debug/reference_chosen_logps": -257.22283935546875, "debug/reference_rejected_logps": -277.9129943847656, "debug/sppo_chosen_loss": 2287.615966796875, "debug/sppo_chosen_reward_in_loss": 2.369426727294922, "debug/sppo_rej_reward_in_loss": -5.518064975738525, "debug/sppo_reject_loss": 2086.158203125, "epoch": 4.945652173913043, "grad_norm": 67815.87349916976, "learning_rate": 3.453497719093242e-08, "logits/chosen": 1.2282394170761108, "logits/rejected": 1.4015997648239136, "logps/chosen": -254.85342407226562, "logps/rejected": -283.4310607910156, "loss": 4530.5863, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": 0.023694265633821487, "rewards/margins": 0.07887491583824158, "rewards/rejected": -0.05518064647912979, "step": 1365 }, { "debug/policy_chosen_logits": 1.1096584796905518, "debug/policy_chosen_logps": -254.56942749023438, "debug/policy_rejected_logits": 1.5298644304275513, "debug/policy_rejected_logps": -323.55755615234375, "debug/reference_chosen_logps": -254.72119140625, "debug/reference_rejected_logps": -315.86517333984375, "debug/sppo_chosen_loss": 2589.263671875, "debug/sppo_chosen_reward_in_loss": 0.15175572037696838, "debug/sppo_rej_reward_in_loss": -7.692338466644287, "debug/sppo_reject_loss": 1989.229248046875, "epoch": 4.963768115942029, "grad_norm": 129529.1765103212, "learning_rate": 3.418109973646298e-08, "logits/chosen": 1.1096584796905518, "logits/rejected": 1.5298644304275513, "logps/chosen": -254.56942749023438, "logps/rejected": -323.55755615234375, "loss": 4388.4941, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": 0.0015175581211224198, "rewards/margins": 0.07844093441963196, "rewards/rejected": -0.07692337036132812, "step": 1370 }, { "debug/policy_chosen_logits": 1.4372981786727905, "debug/policy_chosen_logps": -259.3402099609375, "debug/policy_rejected_logits": 1.646761178970337, "debug/policy_rejected_logps": -302.2501525878906, "debug/reference_chosen_logps": -260.85552978515625, "debug/reference_rejected_logps": -294.53472900390625, "debug/sppo_chosen_loss": 2396.297607421875, "debug/sppo_chosen_reward_in_loss": 1.5153119564056396, "debug/sppo_rej_reward_in_loss": -7.715400695800781, "debug/sppo_reject_loss": 1990.2379150390625, "epoch": 4.981884057971015, "grad_norm": 80680.59194094632, "learning_rate": 3.382810064253809e-08, "logits/chosen": 1.4372981786727905, "logits/rejected": 1.646761178970337, "logps/chosen": -259.3402099609375, "logps/rejected": -302.2501525878906, "loss": 4553.3805, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 0.015153119340538979, "rewards/margins": 0.09230712056159973, "rewards/rejected": -0.0771540030837059, "step": 1375 }, { "debug/policy_chosen_logits": 1.3966089487075806, "debug/policy_chosen_logps": -272.23431396484375, "debug/policy_rejected_logits": 1.5716099739074707, "debug/policy_rejected_logps": -295.2232360839844, "debug/reference_chosen_logps": -273.52972412109375, "debug/reference_rejected_logps": -288.4667053222656, "debug/sppo_chosen_loss": 2394.87255859375, "debug/sppo_chosen_reward_in_loss": 1.2953789234161377, "debug/sppo_rej_reward_in_loss": -6.756533622741699, "debug/sppo_reject_loss": 2009.9847412109375, "epoch": 5.0, "grad_norm": 57177.16481952865, "learning_rate": 3.3475999509791925e-08, "logits/chosen": 1.3966089487075806, "logits/rejected": 1.5716099739074707, "logps/chosen": -272.23431396484375, "logps/rejected": -295.2232360839844, "loss": 4452.3562, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": 0.012953788042068481, "rewards/margins": 0.08051912486553192, "rewards/rejected": -0.06756533682346344, "step": 1380 }, { "debug/policy_chosen_logits": 1.0597548484802246, "debug/policy_chosen_logps": -262.56280517578125, "debug/policy_rejected_logits": 1.3429030179977417, "debug/policy_rejected_logps": -312.5959167480469, "debug/reference_chosen_logps": -262.3939514160156, "debug/reference_rejected_logps": -307.7593078613281, "debug/sppo_chosen_loss": 2606.427734375, "debug/sppo_chosen_reward_in_loss": -0.16884784400463104, "debug/sppo_rej_reward_in_loss": -4.836643218994141, "debug/sppo_reject_loss": 2142.93603515625, "epoch": 5.018115942028985, "grad_norm": 66797.05817653751, "learning_rate": 3.3124815888998345e-08, "logits/chosen": 1.0597548484802246, "logits/rejected": 1.3429030179977417, "logps/chosen": -262.56280517578125, "logps/rejected": -312.5959167480469, "loss": 4653.1094, "rewards/accuracies": 0.75, "rewards/chosen": -0.0016884788637980819, "rewards/margins": 0.046677954494953156, "rewards/rejected": -0.04836643114686012, "step": 1385 }, { "debug/policy_chosen_logits": 1.2682220935821533, "debug/policy_chosen_logps": -259.74493408203125, "debug/policy_rejected_logits": 1.6833469867706299, "debug/policy_rejected_logps": -294.34771728515625, "debug/reference_chosen_logps": -261.4613342285156, "debug/reference_rejected_logps": -290.87646484375, "debug/sppo_chosen_loss": 2359.308349609375, "debug/sppo_chosen_reward_in_loss": 1.716357946395874, "debug/sppo_rej_reward_in_loss": -3.471240282058716, "debug/sppo_reject_loss": 2267.60791015625, "epoch": 5.036231884057971, "grad_norm": 73249.23655353184, "learning_rate": 3.277456927998554e-08, "logits/chosen": 1.2682220935821533, "logits/rejected": 1.6833469867706299, "logps/chosen": -259.74493408203125, "logps/rejected": -294.34771728515625, "loss": 4654.3492, "rewards/accuracies": 0.75, "rewards/chosen": 0.017163580283522606, "rewards/margins": 0.05187598615884781, "rewards/rejected": -0.034712404012680054, "step": 1390 }, { "debug/policy_chosen_logits": 1.388127088546753, "debug/policy_chosen_logps": -271.39593505859375, "debug/policy_rejected_logits": 1.7892773151397705, "debug/policy_rejected_logps": -321.88311767578125, "debug/reference_chosen_logps": -273.29937744140625, "debug/reference_rejected_logps": -315.52386474609375, "debug/sppo_chosen_loss": 2344.13427734375, "debug/sppo_chosen_reward_in_loss": 1.9034183025360107, "debug/sppo_rej_reward_in_loss": -6.359226703643799, "debug/sppo_reject_loss": 2038.1265869140625, "epoch": 5.054347826086956, "grad_norm": 56563.963600837385, "learning_rate": 3.2425279130553076e-08, "logits/chosen": 1.388127088546753, "logits/rejected": 1.7892773151397705, "logps/chosen": -271.39593505859375, "logps/rejected": -321.88311767578125, "loss": 4476.5383, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": 0.019034182652831078, "rewards/margins": 0.08262644708156586, "rewards/rejected": -0.06359227001667023, "step": 1395 }, { "debug/policy_chosen_logits": 0.7481376528739929, "debug/policy_chosen_logps": -235.9759521484375, "debug/policy_rejected_logits": 1.258954405784607, "debug/policy_rejected_logps": -309.2740173339844, "debug/reference_chosen_logps": -236.7089080810547, "debug/reference_rejected_logps": -302.9029235839844, "debug/sppo_chosen_loss": 2485.295166015625, "debug/sppo_chosen_reward_in_loss": 0.7329736948013306, "debug/sppo_rej_reward_in_loss": -6.371078968048096, "debug/sppo_reject_loss": 2072.038330078125, "epoch": 5.072463768115942, "grad_norm": 64677.330488287786, "learning_rate": 3.2076964835392185e-08, "logits/chosen": 0.7481376528739929, "logits/rejected": 1.258954405784607, "logps/chosen": -235.9759521484375, "logps/rejected": -309.2740173339844, "loss": 4551.7766, "rewards/accuracies": 0.675000011920929, "rewards/chosen": 0.007329737301915884, "rewards/margins": 0.07104052603244781, "rewards/rejected": -0.06371079385280609, "step": 1400 }, { "epoch": 5.072463768115942, "eval_debug/policy_chosen_logits": 1.4144388437271118, "eval_debug/policy_chosen_logps": -252.45864868164062, "eval_debug/policy_rejected_logits": 1.4595340490341187, "eval_debug/policy_rejected_logps": -263.1626892089844, "eval_debug/reference_chosen_logps": -252.91845703125, "eval_debug/reference_rejected_logps": -259.6585998535156, "eval_debug/sppo_chosen_loss": 2524.455322265625, "eval_debug/sppo_chosen_reward_in_loss": 0.45980995893478394, "eval_debug/sppo_rej_reward_in_loss": -3.5040810108184814, "eval_debug/sppo_reject_loss": 2311.046630859375, "eval_logits/chosen": 1.4144388437271118, "eval_logits/rejected": 1.4595340490341187, "eval_logps/chosen": -252.45864868164062, "eval_logps/rejected": -263.1626892089844, "eval_loss": 4636.1591796875, "eval_rewards/accuracies": 0.6052631735801697, "eval_rewards/chosen": 0.004598099738359451, "eval_rewards/margins": 0.039638906717300415, "eval_rewards/rejected": -0.035040806978940964, "eval_runtime": 28.4226, "eval_samples_per_second": 21.11, "eval_steps_per_second": 0.668, "step": 1400 }, { "debug/policy_chosen_logits": 1.073176383972168, "debug/policy_chosen_logps": -253.0771484375, "debug/policy_rejected_logits": 1.1446640491485596, "debug/policy_rejected_logps": -265.46929931640625, "debug/reference_chosen_logps": -255.2992401123047, "debug/reference_rejected_logps": -260.98126220703125, "debug/sppo_chosen_loss": 2313.059814453125, "debug/sppo_chosen_reward_in_loss": 2.222093105316162, "debug/sppo_rej_reward_in_loss": -4.48803186416626, "debug/sppo_reject_loss": 2214.115966796875, "epoch": 5.090579710144928, "grad_norm": 57291.336214715986, "learning_rate": 3.1729645735008747e-08, "logits/chosen": 1.073176383972168, "logits/rejected": 1.1446640491485596, "logps/chosen": -253.0771484375, "logps/rejected": -265.46929931640625, "loss": 4430.0254, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 0.022220930084586143, "rewards/margins": 0.06710124760866165, "rewards/rejected": -0.04488031938672066, "step": 1405 }, { "debug/policy_chosen_logits": 1.2134159803390503, "debug/policy_chosen_logps": -260.3841552734375, "debug/policy_rejected_logits": 1.4837480783462524, "debug/policy_rejected_logps": -291.3798828125, "debug/reference_chosen_logps": -260.37481689453125, "debug/reference_rejected_logps": -285.4285888671875, "debug/sppo_chosen_loss": 2577.62939453125, "debug/sppo_chosen_reward_in_loss": -0.009347915649414062, "debug/sppo_rej_reward_in_loss": -5.951307773590088, "debug/sppo_reject_loss": 2098.92529296875, "epoch": 5.108695652173913, "grad_norm": 77453.72944389304, "learning_rate": 3.1383341114649466e-08, "logits/chosen": 1.2134159803390503, "logits/rejected": 1.4837480783462524, "logps/chosen": -260.3841552734375, "logps/rejected": -291.3798828125, "loss": 4625.1777, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -9.347964078187943e-05, "rewards/margins": 0.05941959470510483, "rewards/rejected": -0.05951308086514473, "step": 1410 }, { "debug/policy_chosen_logits": 1.3050518035888672, "debug/policy_chosen_logps": -271.77227783203125, "debug/policy_rejected_logits": 1.3268954753875732, "debug/policy_rejected_logps": -281.19415283203125, "debug/reference_chosen_logps": -272.76300048828125, "debug/reference_rejected_logps": -275.5526428222656, "debug/sppo_chosen_loss": 2437.26220703125, "debug/sppo_chosen_reward_in_loss": 0.9907159805297852, "debug/sppo_rej_reward_in_loss": -5.641491889953613, "debug/sppo_reject_loss": 2117.996337890625, "epoch": 5.1268115942028984, "grad_norm": 62995.000833422404, "learning_rate": 3.103807020323103e-08, "logits/chosen": 1.3050518035888672, "logits/rejected": 1.3268954753875732, "logps/chosen": -271.77227783203125, "logps/rejected": -281.19415283203125, "loss": 4568.5047, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": 0.009907159022986889, "rewards/margins": 0.06632207334041595, "rewards/rejected": -0.05641491338610649, "step": 1415 }, { "debug/policy_chosen_logits": 1.2935222387313843, "debug/policy_chosen_logps": -257.31121826171875, "debug/policy_rejected_logits": 1.5443761348724365, "debug/policy_rejected_logps": -304.0801696777344, "debug/reference_chosen_logps": -260.3376159667969, "debug/reference_rejected_logps": -298.91290283203125, "debug/sppo_chosen_loss": 2233.71142578125, "debug/sppo_chosen_reward_in_loss": 3.026392698287964, "debug/sppo_rej_reward_in_loss": -5.167298316955566, "debug/sppo_reject_loss": 2137.61669921875, "epoch": 5.144927536231884, "grad_norm": 76168.55447801237, "learning_rate": 3.0693852172272336e-08, "logits/chosen": 1.2935222387313843, "logits/rejected": 1.5443761348724365, "logps/chosen": -257.31121826171875, "logps/rejected": -304.0801696777344, "loss": 4492.1176, "rewards/accuracies": 0.75, "rewards/chosen": 0.030263924971222878, "rewards/margins": 0.08193691074848175, "rewards/rejected": -0.051672983914613724, "step": 1420 }, { "debug/policy_chosen_logits": 1.0238162279129028, "debug/policy_chosen_logps": -281.5020751953125, "debug/policy_rejected_logits": 1.1858758926391602, "debug/policy_rejected_logps": -284.3791198730469, "debug/reference_chosen_logps": -281.7276306152344, "debug/reference_rejected_logps": -278.9505615234375, "debug/sppo_chosen_loss": 2551.8203125, "debug/sppo_chosen_reward_in_loss": 0.22554931044578552, "debug/sppo_rej_reward_in_loss": -5.4285502433776855, "debug/sppo_reject_loss": 2039.286865234375, "epoch": 5.163043478260869, "grad_norm": 90342.22934886666, "learning_rate": 3.035070613483009e-08, "logits/chosen": 1.0238162279129028, "logits/rejected": 1.1858758926391602, "logps/chosen": -281.5020751953125, "logps/rejected": -284.3791198730469, "loss": 4486.2891, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 0.0022554919123649597, "rewards/margins": 0.056540995836257935, "rewards/rejected": -0.054285503923892975, "step": 1425 }, { "debug/policy_chosen_logits": 1.0165376663208008, "debug/policy_chosen_logps": -226.81124877929688, "debug/policy_rejected_logits": 1.4223605394363403, "debug/policy_rejected_logps": -333.42999267578125, "debug/reference_chosen_logps": -230.421142578125, "debug/reference_rejected_logps": -324.8152160644531, "debug/sppo_chosen_loss": 2202.34912109375, "debug/sppo_chosen_reward_in_loss": 3.6098670959472656, "debug/sppo_rej_reward_in_loss": -8.614764213562012, "debug/sppo_reject_loss": 1888.661376953125, "epoch": 5.181159420289855, "grad_norm": 61910.098949095074, "learning_rate": 3.0008651144437394e-08, "logits/chosen": 1.0165376663208008, "logits/rejected": 1.4223605394363403, "logps/chosen": -226.81124877929688, "logps/rejected": -333.42999267578125, "loss": 4456.3543, "rewards/accuracies": 0.824999988079071, "rewards/chosen": 0.0360986702144146, "rewards/margins": 0.12224630266427994, "rewards/rejected": -0.08614763617515564, "step": 1430 }, { "debug/policy_chosen_logits": 0.6908336877822876, "debug/policy_chosen_logps": -236.1743621826172, "debug/policy_rejected_logits": 1.1342148780822754, "debug/policy_rejected_logps": -267.88836669921875, "debug/reference_chosen_logps": -237.2069549560547, "debug/reference_rejected_logps": -264.7642517089844, "debug/sppo_chosen_loss": 2472.70166015625, "debug/sppo_chosen_reward_in_loss": 1.0326130390167236, "debug/sppo_rej_reward_in_loss": -3.124145746231079, "debug/sppo_reject_loss": 2303.392822265625, "epoch": 5.199275362318841, "grad_norm": 70461.5400215946, "learning_rate": 2.9667706194045895e-08, "logits/chosen": 0.6908336877822876, "logits/rejected": 1.1342148780822754, "logps/chosen": -236.1743621826172, "logps/rejected": -267.88836669921875, "loss": 4527.0414, "rewards/accuracies": 0.675000011920929, "rewards/chosen": 0.01032613031566143, "rewards/margins": 0.04156758636236191, "rewards/rejected": -0.031241456046700478, "step": 1435 }, { "debug/policy_chosen_logits": 1.1813528537750244, "debug/policy_chosen_logps": -278.6885681152344, "debug/policy_rejected_logits": 1.2910782098770142, "debug/policy_rejected_logps": -300.384521484375, "debug/reference_chosen_logps": -281.6507873535156, "debug/reference_rejected_logps": -296.9029235839844, "debug/sppo_chosen_loss": 2274.62646484375, "debug/sppo_chosen_reward_in_loss": 2.9621920585632324, "debug/sppo_rej_reward_in_loss": -3.4816219806671143, "debug/sppo_reject_loss": 2300.47705078125, "epoch": 5.217391304347826, "grad_norm": 158076.63899714782, "learning_rate": 2.932789021497113e-08, "logits/chosen": 1.1813528537750244, "logits/rejected": 1.2910782098770142, "logps/chosen": -278.6885681152344, "logps/rejected": -300.384521484375, "loss": 4592.1902, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 0.029621923342347145, "rewards/margins": 0.06443814188241959, "rewards/rejected": -0.03481621667742729, "step": 1440 }, { "debug/policy_chosen_logits": 0.945541262626648, "debug/policy_chosen_logps": -226.1212615966797, "debug/policy_rejected_logits": 1.6461076736450195, "debug/policy_rejected_logps": -293.69866943359375, "debug/reference_chosen_logps": -230.6846466064453, "debug/reference_rejected_logps": -292.0801086425781, "debug/sppo_chosen_loss": 2121.76220703125, "debug/sppo_chosen_reward_in_loss": 4.563372611999512, "debug/sppo_rej_reward_in_loss": -1.6185725927352905, "debug/sppo_reject_loss": 2431.987548828125, "epoch": 5.2355072463768115, "grad_norm": 90625.44811809929, "learning_rate": 2.898922207584133e-08, "logits/chosen": 0.945541262626648, "logits/rejected": 1.6461076736450195, "logps/chosen": -226.1212615966797, "logps/rejected": -293.69866943359375, "loss": 4473.3125, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 0.04563372582197189, "rewards/margins": 0.06181945651769638, "rewards/rejected": -0.01618572697043419, "step": 1445 }, { "debug/policy_chosen_logits": 1.2806552648544312, "debug/policy_chosen_logps": -251.5911102294922, "debug/policy_rejected_logits": 1.614915132522583, "debug/policy_rejected_logps": -315.8453674316406, "debug/reference_chosen_logps": -254.13671875, "debug/reference_rejected_logps": -314.6860656738281, "debug/sppo_chosen_loss": 2278.430419921875, "debug/sppo_chosen_reward_in_loss": 2.5456225872039795, "debug/sppo_rej_reward_in_loss": -1.1593204736709595, "debug/sppo_reject_loss": 2467.211181640625, "epoch": 5.253623188405797, "grad_norm": 62155.8076423089, "learning_rate": 2.8651720581549797e-08, "logits/chosen": 1.2806552648544312, "logits/rejected": 1.614915132522583, "logps/chosen": -251.5911102294922, "logps/rejected": -315.8453674316406, "loss": 4487.5734, "rewards/accuracies": 0.625, "rewards/chosen": 0.025456225499510765, "rewards/margins": 0.03704943135380745, "rewards/rejected": -0.01159320492297411, "step": 1450 }, { "debug/policy_chosen_logits": 0.8935205340385437, "debug/policy_chosen_logps": -230.4705047607422, "debug/policy_rejected_logits": 1.229943871498108, "debug/policy_rejected_logps": -296.24456787109375, "debug/reference_chosen_logps": -233.8374481201172, "debug/reference_rejected_logps": -290.72314453125, "debug/sppo_chosen_loss": 2209.38916015625, "debug/sppo_chosen_reward_in_loss": 3.366943359375, "debug/sppo_rej_reward_in_loss": -5.5214385986328125, "debug/sppo_reject_loss": 2192.26123046875, "epoch": 5.271739130434782, "grad_norm": 80665.85813537793, "learning_rate": 2.8315404472210646e-08, "logits/chosen": 0.8935205340385437, "logits/rejected": 1.229943871498108, "logps/chosen": -230.4705047607422, "logps/rejected": -296.24456787109375, "loss": 4589.9242, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 0.03366943448781967, "rewards/margins": 0.08888381719589233, "rewards/rejected": -0.05521438643336296, "step": 1455 }, { "debug/policy_chosen_logits": 0.9543207287788391, "debug/policy_chosen_logps": -228.46011352539062, "debug/policy_rejected_logits": 1.2147928476333618, "debug/policy_rejected_logps": -283.1182556152344, "debug/reference_chosen_logps": -232.3076629638672, "debug/reference_rejected_logps": -277.7240295410156, "debug/sppo_chosen_loss": 2165.05224609375, "debug/sppo_chosen_reward_in_loss": 3.8475775718688965, "debug/sppo_rej_reward_in_loss": -5.394213676452637, "debug/sppo_reject_loss": 2134.77587890625, "epoch": 5.2898550724637685, "grad_norm": 75556.9653345357, "learning_rate": 2.798029242211828e-08, "logits/chosen": 0.9543207287788391, "logits/rejected": 1.2147928476333618, "logps/chosen": -228.46011352539062, "logps/rejected": -283.1182556152344, "loss": 4569.8641, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 0.038475774228572845, "rewards/margins": 0.09241791069507599, "rewards/rejected": -0.053942132741212845, "step": 1460 }, { "debug/policy_chosen_logits": 1.2613698244094849, "debug/policy_chosen_logps": -245.041015625, "debug/policy_rejected_logits": 1.4938691854476929, "debug/policy_rejected_logps": -269.397705078125, "debug/reference_chosen_logps": -248.71542358398438, "debug/reference_rejected_logps": -269.37152099609375, "debug/sppo_chosen_loss": 2186.452392578125, "debug/sppo_chosen_reward_in_loss": 3.674414873123169, "debug/sppo_rej_reward_in_loss": -0.02619953081011772, "debug/sppo_reject_loss": 2560.8427734375, "epoch": 5.307971014492754, "grad_norm": 113836.35337531123, "learning_rate": 2.7646403038710535e-08, "logits/chosen": 1.2613698244094849, "logits/rejected": 1.4938691854476929, "logps/chosen": -245.041015625, "logps/rejected": -269.397705078125, "loss": 4434.6, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": 0.036744147539138794, "rewards/margins": 0.037006136029958725, "rewards/rejected": -0.0002619953884277493, "step": 1465 }, { "debug/policy_chosen_logits": 1.011094093322754, "debug/policy_chosen_logps": -223.7213897705078, "debug/policy_rejected_logits": 1.8676655292510986, "debug/policy_rejected_logps": -326.70562744140625, "debug/reference_chosen_logps": -227.2796173095703, "debug/reference_rejected_logps": -324.4100036621094, "debug/sppo_chosen_loss": 2167.9365234375, "debug/sppo_chosen_reward_in_loss": 3.558199644088745, "debug/sppo_rej_reward_in_loss": -2.2955880165100098, "debug/sppo_reject_loss": 2348.977783203125, "epoch": 5.326086956521739, "grad_norm": 74309.78767829157, "learning_rate": 2.73137548615354e-08, "logits/chosen": 1.011094093322754, "logits/rejected": 1.8676655292510986, "logps/chosen": -223.7213897705078, "logps/rejected": -326.70562744140625, "loss": 4516.7555, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 0.03558199480175972, "rewards/margins": 0.05853787809610367, "rewards/rejected": -0.02295587956905365, "step": 1470 }, { "debug/policy_chosen_logits": 1.1648266315460205, "debug/policy_chosen_logps": -263.5667419433594, "debug/policy_rejected_logits": 1.4224785566329956, "debug/policy_rejected_logps": -294.29736328125, "debug/reference_chosen_logps": -265.66778564453125, "debug/reference_rejected_logps": -290.55096435546875, "debug/sppo_chosen_loss": 2339.315185546875, "debug/sppo_chosen_reward_in_loss": 2.1010348796844482, "debug/sppo_rej_reward_in_loss": -3.7463951110839844, "debug/sppo_reject_loss": 2271.000732421875, "epoch": 5.344202898550725, "grad_norm": 135696.9586156644, "learning_rate": 2.6982366361221608e-08, "logits/chosen": 1.1648266315460205, "logits/rejected": 1.4224785566329956, "logps/chosen": -263.5667419433594, "logps/rejected": -294.29736328125, "loss": 4515.3344, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": 0.021010348573327065, "rewards/margins": 0.05847429484128952, "rewards/rejected": -0.0374639518558979, "step": 1475 }, { "debug/policy_chosen_logits": 1.3069803714752197, "debug/policy_chosen_logps": -282.95233154296875, "debug/policy_rejected_logits": 1.629817247390747, "debug/policy_rejected_logps": -290.300048828125, "debug/reference_chosen_logps": -286.08929443359375, "debug/reference_rejected_logps": -290.58465576171875, "debug/sppo_chosen_loss": 2222.154052734375, "debug/sppo_chosen_reward_in_loss": 3.136929988861084, "debug/sppo_rej_reward_in_loss": 0.2846008241176605, "debug/sppo_reject_loss": 2600.535400390625, "epoch": 5.36231884057971, "grad_norm": 92393.44114528071, "learning_rate": 2.6652255938453066e-08, "logits/chosen": 1.3069803714752197, "logits/rejected": 1.629817247390747, "logps/chosen": -282.95233154296875, "logps/rejected": -290.300048828125, "loss": 4634.9187, "rewards/accuracies": 0.625, "rewards/chosen": 0.03136930242180824, "rewards/margins": 0.028523290529847145, "rewards/rejected": 0.002846005605533719, "step": 1480 }, { "debug/policy_chosen_logits": 1.0232958793640137, "debug/policy_chosen_logps": -254.8156280517578, "debug/policy_rejected_logits": 1.1778652667999268, "debug/policy_rejected_logps": -276.6258239746094, "debug/reference_chosen_logps": -258.20574951171875, "debug/reference_rejected_logps": -270.19232177734375, "debug/sppo_chosen_loss": 2210.439453125, "debug/sppo_chosen_reward_in_loss": 3.3901278972625732, "debug/sppo_rej_reward_in_loss": -6.4334917068481445, "debug/sppo_reject_loss": 1996.9488525390625, "epoch": 5.380434782608695, "grad_norm": 66567.60725853742, "learning_rate": 2.6323441922947165e-08, "logits/chosen": 1.0232958793640137, "logits/rejected": 1.1778652667999268, "logps/chosen": -254.8156280517578, "logps/rejected": -276.6258239746094, "loss": 4421.8281, "rewards/accuracies": 0.824999988079071, "rewards/chosen": 0.03390127792954445, "rewards/margins": 0.09823620319366455, "rewards/rejected": -0.0643349215388298, "step": 1485 }, { "debug/policy_chosen_logits": 1.2483222484588623, "debug/policy_chosen_logps": -242.7491455078125, "debug/policy_rejected_logits": 1.4808881282806396, "debug/policy_rejected_logps": -260.0889587402344, "debug/reference_chosen_logps": -246.10781860351562, "debug/reference_rejected_logps": -257.28558349609375, "debug/sppo_chosen_loss": 2200.03564453125, "debug/sppo_chosen_reward_in_loss": 3.3586738109588623, "debug/sppo_rej_reward_in_loss": -2.803394079208374, "debug/sppo_reject_loss": 2330.042724609375, "epoch": 5.398550724637682, "grad_norm": 74480.98281722279, "learning_rate": 2.599594257243689e-08, "logits/chosen": 1.2483222484588623, "logits/rejected": 1.4808881282806396, "logps/chosen": -242.7491455078125, "logps/rejected": -260.0889587402344, "loss": 4575.8328, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": 0.033586740493774414, "rewards/margins": 0.06162068247795105, "rewards/rejected": -0.028033941984176636, "step": 1490 }, { "debug/policy_chosen_logits": 1.2368654012680054, "debug/policy_chosen_logps": -254.530517578125, "debug/policy_rejected_logits": 1.3546712398529053, "debug/policy_rejected_logps": -276.13037109375, "debug/reference_chosen_logps": -257.09320068359375, "debug/reference_rejected_logps": -271.80712890625, "debug/sppo_chosen_loss": 2288.206298828125, "debug/sppo_chosen_reward_in_loss": 2.5626778602600098, "debug/sppo_rej_reward_in_loss": -4.323225975036621, "debug/sppo_reject_loss": 2213.88671875, "epoch": 5.416666666666667, "grad_norm": 109713.85571536788, "learning_rate": 2.566977607165719e-08, "logits/chosen": 1.2368654012680054, "logits/rejected": 1.3546712398529053, "logps/chosen": -254.530517578125, "logps/rejected": -276.13037109375, "loss": 4559.675, "rewards/accuracies": 0.75, "rewards/chosen": 0.025626778602600098, "rewards/margins": 0.0688590407371521, "rewards/rejected": -0.043232254683971405, "step": 1495 }, { "debug/policy_chosen_logits": 0.9920104742050171, "debug/policy_chosen_logps": -228.82107543945312, "debug/policy_rejected_logits": 1.250663161277771, "debug/policy_rejected_logps": -247.8716278076172, "debug/reference_chosen_logps": -232.8604278564453, "debug/reference_rejected_logps": -243.4485321044922, "debug/sppo_chosen_loss": 2136.54931640625, "debug/sppo_chosen_reward_in_loss": 4.039345741271973, "debug/sppo_rej_reward_in_loss": -4.423121452331543, "debug/sppo_reject_loss": 2212.809326171875, "epoch": 5.434782608695652, "grad_norm": 56495.066508312746, "learning_rate": 2.5344960531335102e-08, "logits/chosen": 0.9920104742050171, "logits/rejected": 1.250663161277771, "logps/chosen": -228.82107543945312, "logps/rejected": -247.8716278076172, "loss": 4481.4781, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 0.04039346054196358, "rewards/margins": 0.08462467044591904, "rewards/rejected": -0.04423121362924576, "step": 1500 }, { "epoch": 5.434782608695652, "eval_debug/policy_chosen_logits": 1.402949333190918, "eval_debug/policy_chosen_logps": -251.6733856201172, "eval_debug/policy_rejected_logits": 1.4467878341674805, "eval_debug/policy_rejected_logps": -262.5466613769531, "eval_debug/reference_chosen_logps": -252.91845703125, "eval_debug/reference_rejected_logps": -259.6585998535156, "eval_debug/sppo_chosen_loss": 2446.67919921875, "eval_debug/sppo_chosen_reward_in_loss": 1.2450839281082153, "eval_debug/sppo_rej_reward_in_loss": -2.888056755065918, "eval_debug/sppo_reject_loss": 2368.621826171875, "eval_logits/chosen": 1.402949333190918, "eval_logits/rejected": 1.4467878341674805, "eval_logps/chosen": -251.6733856201172, "eval_logps/rejected": -262.5466613769531, "eval_loss": 4616.7265625, "eval_rewards/accuracies": 0.5921052694320679, "eval_rewards/chosen": 0.012450839392840862, "eval_rewards/margins": 0.04133140668272972, "eval_rewards/rejected": -0.028880568221211433, "eval_runtime": 28.3883, "eval_samples_per_second": 21.135, "eval_steps_per_second": 0.669, "step": 1500 }, { "debug/policy_chosen_logits": 1.1282310485839844, "debug/policy_chosen_logps": -251.6337890625, "debug/policy_rejected_logits": 1.3733762502670288, "debug/policy_rejected_logps": -292.90301513671875, "debug/reference_chosen_logps": -252.951171875, "debug/reference_rejected_logps": -290.30047607421875, "debug/sppo_chosen_loss": 2449.819091796875, "debug/sppo_chosen_reward_in_loss": 1.3173834085464478, "debug/sppo_rej_reward_in_loss": -2.6024844646453857, "debug/sppo_reject_loss": 2344.947998046875, "epoch": 5.452898550724638, "grad_norm": 71162.56835391422, "learning_rate": 2.5021513987184274e-08, "logits/chosen": 1.1282310485839844, "logits/rejected": 1.3733762502670288, "logps/chosen": -251.6337890625, "logps/rejected": -292.90301513671875, "loss": 4520.8465, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 0.01317383348941803, "rewards/margins": 0.039198677986860275, "rewards/rejected": -0.026024844497442245, "step": 1505 }, { "debug/policy_chosen_logits": 1.0822408199310303, "debug/policy_chosen_logps": -248.52548217773438, "debug/policy_rejected_logits": 1.5081207752227783, "debug/policy_rejected_logps": -279.8587341308594, "debug/reference_chosen_logps": -251.04904174804688, "debug/reference_rejected_logps": -274.56640625, "debug/sppo_chosen_loss": 2292.804443359375, "debug/sppo_chosen_reward_in_loss": 2.523580551147461, "debug/sppo_rej_reward_in_loss": -5.292346000671387, "debug/sppo_reject_loss": 2132.537109375, "epoch": 5.471014492753623, "grad_norm": 79190.04350242131, "learning_rate": 2.469945439890339e-08, "logits/chosen": 1.0822408199310303, "logits/rejected": 1.5081207752227783, "logps/chosen": -248.52548217773438, "logps/rejected": -279.8587341308594, "loss": 4436.1336, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": 0.02523580566048622, "rewards/margins": 0.07815925776958466, "rewards/rejected": -0.05292346328496933, "step": 1510 }, { "debug/policy_chosen_logits": 1.0114610195159912, "debug/policy_chosen_logps": -219.73385620117188, "debug/policy_rejected_logits": 1.4271812438964844, "debug/policy_rejected_logps": -282.7474060058594, "debug/reference_chosen_logps": -222.33889770507812, "debug/reference_rejected_logps": -275.36151123046875, "debug/sppo_chosen_loss": 2299.99951171875, "debug/sppo_chosen_reward_in_loss": 2.6050164699554443, "debug/sppo_rej_reward_in_loss": -7.385918617248535, "debug/sppo_reject_loss": 2018.4224853515625, "epoch": 5.489130434782608, "grad_norm": 77773.38202681381, "learning_rate": 2.4378799649179023e-08, "logits/chosen": 1.0114610195159912, "logits/rejected": 1.4271812438964844, "logps/chosen": -219.73385620117188, "logps/rejected": -282.7474060058594, "loss": 4445.4719, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 0.026050161570310593, "rewards/margins": 0.09990935027599335, "rewards/rejected": -0.07385918498039246, "step": 1515 }, { "debug/policy_chosen_logits": 1.517974853515625, "debug/policy_chosen_logps": -278.83526611328125, "debug/policy_rejected_logits": 1.8126938343048096, "debug/policy_rejected_logps": -344.2177734375, "debug/reference_chosen_logps": -279.59576416015625, "debug/reference_rejected_logps": -340.62744140625, "debug/sppo_chosen_loss": 2477.626220703125, "debug/sppo_chosen_reward_in_loss": 0.7605171203613281, "debug/sppo_rej_reward_in_loss": -3.5903029441833496, "debug/sppo_reject_loss": 2266.54638671875, "epoch": 5.507246376811594, "grad_norm": 88506.91102373891, "learning_rate": 2.4059567542692682e-08, "logits/chosen": 1.517974853515625, "logits/rejected": 1.8126938343048096, "logps/chosen": -278.83526611328125, "logps/rejected": -344.2177734375, "loss": 4525.4527, "rewards/accuracies": 0.625, "rewards/chosen": 0.007605170365422964, "rewards/margins": 0.04350820183753967, "rewards/rejected": -0.03590302914381027, "step": 1520 }, { "debug/policy_chosen_logits": 1.2230756282806396, "debug/policy_chosen_logps": -253.2280731201172, "debug/policy_rejected_logits": 1.4549987316131592, "debug/policy_rejected_logps": -291.5811462402344, "debug/reference_chosen_logps": -255.423095703125, "debug/reference_rejected_logps": -287.0794372558594, "debug/sppo_chosen_loss": 2331.87890625, "debug/sppo_chosen_reward_in_loss": 2.194990634918213, "debug/sppo_rej_reward_in_loss": -4.501686096191406, "debug/sppo_reject_loss": 2184.75439453125, "epoch": 5.52536231884058, "grad_norm": 82290.86470328699, "learning_rate": 2.3741775805132096e-08, "logits/chosen": 1.2230756282806396, "logits/rejected": 1.4549987316131592, "logps/chosen": -253.2280731201172, "logps/rejected": -291.5811462402344, "loss": 4532.6773, "rewards/accuracies": 0.75, "rewards/chosen": 0.021949905902147293, "rewards/margins": 0.06696675717830658, "rewards/rejected": -0.04501685872673988, "step": 1525 }, { "debug/policy_chosen_logits": 0.7856322526931763, "debug/policy_chosen_logps": -267.90460205078125, "debug/policy_rejected_logits": 0.8814099431037903, "debug/policy_rejected_logps": -275.64141845703125, "debug/reference_chosen_logps": -268.8172302246094, "debug/reference_rejected_logps": -273.11907958984375, "debug/sppo_chosen_loss": 2476.67333984375, "debug/sppo_chosen_reward_in_loss": 0.9126449823379517, "debug/sppo_rej_reward_in_loss": -2.522326707839966, "debug/sppo_reject_loss": 2360.42529296875, "epoch": 5.543478260869565, "grad_norm": 64277.589583882305, "learning_rate": 2.342544208220712e-08, "logits/chosen": 0.7856322526931763, "logits/rejected": 0.8814099431037903, "logps/chosen": -267.90460205078125, "logps/rejected": -275.64141845703125, "loss": 4584.1656, "rewards/accuracies": 0.550000011920929, "rewards/chosen": 0.009126449935138226, "rewards/margins": 0.0343497171998024, "rewards/rejected": -0.0252232663333416, "step": 1530 }, { "debug/policy_chosen_logits": 0.983709454536438, "debug/policy_chosen_logps": -250.71945190429688, "debug/policy_rejected_logits": 1.3374192714691162, "debug/policy_rejected_logps": -282.01995849609375, "debug/reference_chosen_logps": -253.4926300048828, "debug/reference_rejected_logps": -276.5571594238281, "debug/sppo_chosen_loss": 2281.95556640625, "debug/sppo_chosen_reward_in_loss": 2.7731635570526123, "debug/sppo_rej_reward_in_loss": -5.462827682495117, "debug/sppo_reject_loss": 2117.26953125, "epoch": 5.561594202898551, "grad_norm": 98220.17839286085, "learning_rate": 2.311058393866981e-08, "logits/chosen": 0.983709454536438, "logits/rejected": 1.3374192714691162, "logps/chosen": -250.71945190429688, "logps/rejected": -282.01995849609375, "loss": 4618.0863, "rewards/accuracies": 0.75, "rewards/chosen": 0.0277316365391016, "rewards/margins": 0.0823599100112915, "rewards/rejected": -0.05462827533483505, "step": 1535 }, { "debug/policy_chosen_logits": 0.9909914135932922, "debug/policy_chosen_logps": -228.23965454101562, "debug/policy_rejected_logits": 1.5274099111557007, "debug/policy_rejected_logps": -296.5209655761719, "debug/reference_chosen_logps": -230.16903686523438, "debug/reference_rejected_logps": -288.3876037597656, "debug/sppo_chosen_loss": 2336.477783203125, "debug/sppo_chosen_reward_in_loss": 1.9293806552886963, "debug/sppo_rej_reward_in_loss": -8.133366584777832, "debug/sppo_reject_loss": 1942.154296875, "epoch": 5.579710144927536, "grad_norm": 67134.8106267087, "learning_rate": 2.2797218857339163e-08, "logits/chosen": 0.9909914135932922, "logits/rejected": 1.5274099111557007, "logps/chosen": -228.23965454101562, "logps/rejected": -296.5209655761719, "loss": 4509.9066, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 0.019293805584311485, "rewards/margins": 0.10062746703624725, "rewards/rejected": -0.08133365958929062, "step": 1540 }, { "debug/policy_chosen_logits": 1.036051869392395, "debug/policy_chosen_logps": -239.81472778320312, "debug/policy_rejected_logits": 1.2976715564727783, "debug/policy_rejected_logps": -264.1811828613281, "debug/reference_chosen_logps": -241.847900390625, "debug/reference_rejected_logps": -261.27923583984375, "debug/sppo_chosen_loss": 2345.96728515625, "debug/sppo_chosen_reward_in_loss": 2.0331783294677734, "debug/sppo_rej_reward_in_loss": -2.901949167251587, "debug/sppo_reject_loss": 2336.490966796875, "epoch": 5.5978260869565215, "grad_norm": 60768.60879872343, "learning_rate": 2.2485364238130432e-08, "logits/chosen": 1.036051869392395, "logits/rejected": 1.2976715564727783, "logps/chosen": -239.81472778320312, "logps/rejected": -264.1811828613281, "loss": 4429.5215, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 0.02033178135752678, "rewards/margins": 0.049351271241903305, "rewards/rejected": -0.029019493609666824, "step": 1545 }, { "debug/policy_chosen_logits": 1.1724830865859985, "debug/policy_chosen_logps": -249.09274291992188, "debug/policy_rejected_logits": 1.777655839920044, "debug/policy_rejected_logps": -309.8492126464844, "debug/reference_chosen_logps": -249.4769744873047, "debug/reference_rejected_logps": -302.5712890625, "debug/sppo_chosen_loss": 2510.03759765625, "debug/sppo_chosen_reward_in_loss": 0.3842487335205078, "debug/sppo_rej_reward_in_loss": -7.277923583984375, "debug/sppo_reject_loss": 2033.0191650390625, "epoch": 5.615942028985507, "grad_norm": 81233.4541358366, "learning_rate": 2.2175037397088887e-08, "logits/chosen": 1.1724830865859985, "logits/rejected": 1.777655839920044, "logps/chosen": -249.09274291992188, "logps/rejected": -309.8492126464844, "loss": 4423.675, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": 0.003842487931251526, "rewards/margins": 0.07662171125411987, "rewards/rejected": -0.07277923822402954, "step": 1550 }, { "debug/policy_chosen_logits": 1.1309086084365845, "debug/policy_chosen_logps": -253.1494903564453, "debug/policy_rejected_logits": 1.5028681755065918, "debug/policy_rejected_logps": -308.93731689453125, "debug/reference_chosen_logps": -255.1627960205078, "debug/reference_rejected_logps": -301.81512451171875, "debug/sppo_chosen_loss": 2369.07177734375, "debug/sppo_chosen_reward_in_loss": 2.013322353363037, "debug/sppo_rej_reward_in_loss": -7.1221771240234375, "debug/sppo_reject_loss": 1992.401611328125, "epoch": 5.634057971014493, "grad_norm": 100650.3146198026, "learning_rate": 2.1866255565428348e-08, "logits/chosen": 1.1309086084365845, "logits/rejected": 1.5028681755065918, "logps/chosen": -253.1494903564453, "logps/rejected": -308.93731689453125, "loss": 4557.2141, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 0.02013322338461876, "rewards/margins": 0.09135500341653824, "rewards/rejected": -0.07122177630662918, "step": 1555 }, { "debug/policy_chosen_logits": 1.2297143936157227, "debug/policy_chosen_logps": -257.30133056640625, "debug/policy_rejected_logits": 1.589552879333496, "debug/policy_rejected_logps": -279.32781982421875, "debug/reference_chosen_logps": -259.53619384765625, "debug/reference_rejected_logps": -274.4745178222656, "debug/sppo_chosen_loss": 2346.412353515625, "debug/sppo_chosen_reward_in_loss": 2.2348694801330566, "debug/sppo_rej_reward_in_loss": -4.853362560272217, "debug/sppo_reject_loss": 2156.526123046875, "epoch": 5.6521739130434785, "grad_norm": 67399.3140960085, "learning_rate": 2.1559035888574427e-08, "logits/chosen": 1.2297143936157227, "logits/rejected": 1.589552879333496, "logps/chosen": -257.30133056640625, "logps/rejected": -279.32781982421875, "loss": 4484.4062, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 0.022348696365952492, "rewards/margins": 0.07088232040405273, "rewards/rejected": -0.04853362590074539, "step": 1560 }, { "debug/policy_chosen_logits": 0.7509094476699829, "debug/policy_chosen_logps": -238.2117156982422, "debug/policy_rejected_logits": 1.0019946098327637, "debug/policy_rejected_logps": -310.4805603027344, "debug/reference_chosen_logps": -238.13815307617188, "debug/reference_rejected_logps": -302.5880126953125, "debug/sppo_chosen_loss": 2602.240966796875, "debug/sppo_chosen_reward_in_loss": -0.07359428703784943, "debug/sppo_rej_reward_in_loss": -7.892542839050293, "debug/sppo_reject_loss": 2018.501220703125, "epoch": 5.670289855072464, "grad_norm": 76036.36447701229, "learning_rate": 2.125339542521254e-08, "logits/chosen": 0.7509094476699829, "logits/rejected": 1.0019946098327637, "logps/chosen": -238.2117156982422, "logps/rejected": -310.4805603027344, "loss": 4549.6527, "rewards/accuracies": 0.625, "rewards/chosen": -0.0007359433802776039, "rewards/margins": 0.07818949222564697, "rewards/rejected": -0.07892543077468872, "step": 1565 }, { "debug/policy_chosen_logits": 1.2730293273925781, "debug/policy_chosen_logps": -263.2087707519531, "debug/policy_rejected_logits": 1.5694999694824219, "debug/policy_rejected_logps": -303.6869812011719, "debug/reference_chosen_logps": -265.8478088378906, "debug/reference_rejected_logps": -297.4637145996094, "debug/sppo_chosen_loss": 2275.32275390625, "debug/sppo_chosen_reward_in_loss": 2.639035701751709, "debug/sppo_rej_reward_in_loss": -6.2232794761657715, "debug/sppo_reject_loss": 2088.626220703125, "epoch": 5.688405797101449, "grad_norm": 80329.30357239723, "learning_rate": 2.0949351146340583e-08, "logits/chosen": 1.2730293273925781, "logits/rejected": 1.5694999694824219, "logps/chosen": -263.2087707519531, "logps/rejected": -303.6869812011719, "loss": 4487.0953, "rewards/accuracies": 0.75, "rewards/chosen": 0.026390355080366135, "rewards/margins": 0.08862314373254776, "rewards/rejected": -0.062232792377471924, "step": 1570 }, { "debug/policy_chosen_logits": 1.4459768533706665, "debug/policy_chosen_logps": -264.822021484375, "debug/policy_rejected_logits": 1.6058681011199951, "debug/policy_rejected_logps": -292.1711730957031, "debug/reference_chosen_logps": -266.3921813964844, "debug/reference_rejected_logps": -285.4745178222656, "debug/sppo_chosen_loss": 2444.116455078125, "debug/sppo_chosen_reward_in_loss": 1.5701853036880493, "debug/sppo_rej_reward_in_loss": -6.696642875671387, "debug/sppo_reject_loss": 2027.204345703125, "epoch": 5.706521739130435, "grad_norm": 73562.49283707654, "learning_rate": 2.064691993432678e-08, "logits/chosen": 1.4459768533706665, "logits/rejected": 1.6058681011199951, "logps/chosen": -264.822021484375, "logps/rejected": -292.1711730957031, "loss": 4437.1078, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": 0.01570185460150242, "rewards/margins": 0.08266828209161758, "rewards/rejected": -0.06696642935276031, "step": 1575 }, { "debug/policy_chosen_logits": 1.305905818939209, "debug/policy_chosen_logps": -266.57647705078125, "debug/policy_rejected_logits": 1.6066995859146118, "debug/policy_rejected_logps": -286.05487060546875, "debug/reference_chosen_logps": -268.0072937011719, "debug/reference_rejected_logps": -280.94744873046875, "debug/sppo_chosen_loss": 2399.1025390625, "debug/sppo_chosen_reward_in_loss": 1.430829644203186, "debug/sppo_rej_reward_in_loss": -5.107414722442627, "debug/sppo_reject_loss": 2106.423828125, "epoch": 5.72463768115942, "grad_norm": 69774.80715783554, "learning_rate": 2.0346118581972095e-08, "logits/chosen": 1.305905818939209, "logits/rejected": 1.6066995859146118, "logps/chosen": -266.57647705078125, "logps/rejected": -286.05487060546875, "loss": 4559.0742, "rewards/accuracies": 0.675000011920929, "rewards/chosen": 0.014308296144008636, "rewards/margins": 0.06538243591785431, "rewards/rejected": -0.05107413977384567, "step": 1580 }, { "debug/policy_chosen_logits": 0.9529942274093628, "debug/policy_chosen_logps": -245.853515625, "debug/policy_rejected_logits": 1.3847246170043945, "debug/policy_rejected_logps": -291.8622131347656, "debug/reference_chosen_logps": -246.39022827148438, "debug/reference_rejected_logps": -284.890625, "debug/sppo_chosen_loss": 2535.155517578125, "debug/sppo_chosen_reward_in_loss": 0.5367231369018555, "debug/sppo_rej_reward_in_loss": -6.971585273742676, "debug/sppo_reject_loss": 2033.044189453125, "epoch": 5.742753623188406, "grad_norm": 83134.39826419551, "learning_rate": 2.0046963791577898e-08, "logits/chosen": 0.9529942274093628, "logits/rejected": 1.3847246170043945, "logps/chosen": -245.853515625, "logps/rejected": -291.8622131347656, "loss": 4546.8141, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": 0.00536723155528307, "rewards/margins": 0.07508309185504913, "rewards/rejected": -0.06971585005521774, "step": 1585 }, { "debug/policy_chosen_logits": 0.9592103958129883, "debug/policy_chosen_logps": -254.1472625732422, "debug/policy_rejected_logits": 1.5464726686477661, "debug/policy_rejected_logps": -303.38140869140625, "debug/reference_chosen_logps": -256.5666198730469, "debug/reference_rejected_logps": -295.99114990234375, "debug/sppo_chosen_loss": 2300.235107421875, "debug/sppo_chosen_reward_in_loss": 2.4193854331970215, "debug/sppo_rej_reward_in_loss": -7.3902788162231445, "debug/sppo_reject_loss": 1999.815185546875, "epoch": 5.760869565217392, "grad_norm": 60062.4219329617, "learning_rate": 1.9749472174018567e-08, "logits/chosen": 0.9592103958129883, "logits/rejected": 1.5464726686477661, "logps/chosen": -254.1472625732422, "logps/rejected": -303.38140869140625, "loss": 4369.1586, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": 0.02419385313987732, "rewards/margins": 0.09809663146734238, "rewards/rejected": -0.07390278577804565, "step": 1590 }, { "debug/policy_chosen_logits": 0.9911503791809082, "debug/policy_chosen_logps": -244.70571899414062, "debug/policy_rejected_logits": 1.137888789176941, "debug/policy_rejected_logps": -251.33859252929688, "debug/reference_chosen_logps": -245.8179168701172, "debug/reference_rejected_logps": -247.0068817138672, "debug/sppo_chosen_loss": 2453.41845703125, "debug/sppo_chosen_reward_in_loss": 1.1122007369995117, "debug/sppo_rej_reward_in_loss": -4.331699371337891, "debug/sppo_reject_loss": 2246.73095703125, "epoch": 5.778985507246377, "grad_norm": 63515.44254699706, "learning_rate": 1.9453660247819054e-08, "logits/chosen": 0.9911503791809082, "logits/rejected": 1.137888789176941, "logps/chosen": -244.70571899414062, "logps/rejected": -251.33859252929688, "loss": 4569.682, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": 0.011122007854282856, "rewards/margins": 0.0544389970600605, "rewards/rejected": -0.04331699013710022, "step": 1595 }, { "debug/policy_chosen_logits": 0.9711323976516724, "debug/policy_chosen_logps": -255.40786743164062, "debug/policy_rejected_logits": 1.3977515697479248, "debug/policy_rejected_logps": -322.133056640625, "debug/reference_chosen_logps": -257.37481689453125, "debug/reference_rejected_logps": -316.53289794921875, "debug/sppo_chosen_loss": 2381.20361328125, "debug/sppo_chosen_reward_in_loss": 1.9669723510742188, "debug/sppo_rej_reward_in_loss": -5.600157260894775, "debug/sppo_reject_loss": 2125.21435546875, "epoch": 5.797101449275362, "grad_norm": 104333.61833020138, "learning_rate": 1.9159544438237795e-08, "logits/chosen": 0.9711323976516724, "logits/rejected": 1.3977515697479248, "logps/chosen": -255.40786743164062, "logps/rejected": -322.133056640625, "loss": 4557.7566, "rewards/accuracies": 0.824999988079071, "rewards/chosen": 0.019669722765684128, "rewards/margins": 0.07567129284143448, "rewards/rejected": -0.05600156635046005, "step": 1600 }, { "epoch": 5.797101449275362, "eval_debug/policy_chosen_logits": 1.3976449966430664, "eval_debug/policy_chosen_logps": -252.77944946289062, "eval_debug/policy_rejected_logits": 1.4427540302276611, "eval_debug/policy_rejected_logps": -263.8221130371094, "eval_debug/reference_chosen_logps": -252.91845703125, "eval_debug/reference_rejected_logps": -259.6585998535156, "eval_debug/sppo_chosen_loss": 2564.9140625, "eval_debug/sppo_chosen_reward_in_loss": 0.13903316855430603, "eval_debug/sppo_rej_reward_in_loss": -4.163466453552246, "eval_debug/sppo_reject_loss": 2269.406982421875, "eval_logits/chosen": 1.3976449966430664, "eval_logits/rejected": 1.4427540302276611, "eval_logps/chosen": -252.77944946289062, "eval_logps/rejected": -263.8221130371094, "eval_loss": 4618.0537109375, "eval_rewards/accuracies": 0.5921052694320679, "eval_rewards/chosen": 0.001390331657603383, "eval_rewards/margins": 0.043024998158216476, "eval_rewards/rejected": -0.04163466766476631, "eval_runtime": 28.3722, "eval_samples_per_second": 21.147, "eval_steps_per_second": 0.67, "step": 1600 }, { "debug/policy_chosen_logits": 1.39915931224823, "debug/policy_chosen_logps": -261.6456604003906, "debug/policy_rejected_logits": 1.3916994333267212, "debug/policy_rejected_logps": -283.9635009765625, "debug/reference_chosen_logps": -262.82550048828125, "debug/reference_rejected_logps": -277.431396484375, "debug/sppo_chosen_loss": 2451.422607421875, "debug/sppo_chosen_reward_in_loss": 1.1798160076141357, "debug/sppo_rej_reward_in_loss": -6.532097816467285, "debug/sppo_reject_loss": 2075.255859375, "epoch": 5.815217391304348, "grad_norm": 119684.09863826247, "learning_rate": 1.8867141076354575e-08, "logits/chosen": 1.39915931224823, "logits/rejected": 1.3916994333267212, "logps/chosen": -261.6456604003906, "logps/rejected": -283.9635009765625, "loss": 4475.0945, "rewards/accuracies": 0.675000011920929, "rewards/chosen": 0.011798160150647163, "rewards/margins": 0.0771191343665123, "rewards/rejected": -0.06532097607851028, "step": 1605 }, { "debug/policy_chosen_logits": 1.276993751525879, "debug/policy_chosen_logps": -254.44296264648438, "debug/policy_rejected_logits": 1.4243838787078857, "debug/policy_rejected_logps": -278.84466552734375, "debug/reference_chosen_logps": -256.94012451171875, "debug/reference_rejected_logps": -272.6163024902344, "debug/sppo_chosen_loss": 2287.19873046875, "debug/sppo_chosen_reward_in_loss": 2.4971923828125, "debug/sppo_rej_reward_in_loss": -6.2283549308776855, "debug/sppo_reject_loss": 2048.57958984375, "epoch": 5.833333333333333, "grad_norm": 84232.22173943985, "learning_rate": 1.8576466398163825e-08, "logits/chosen": 1.276993751525879, "logits/rejected": 1.4243838787078857, "logps/chosen": -254.44296264648438, "logps/rejected": -278.84466552734375, "loss": 4472.1785, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": 0.024971922859549522, "rewards/margins": 0.08725547045469284, "rewards/rejected": -0.062283553183078766, "step": 1610 }, { "debug/policy_chosen_logits": 1.0857824087142944, "debug/policy_chosen_logps": -261.70123291015625, "debug/policy_rejected_logits": 1.2012531757354736, "debug/policy_rejected_logps": -284.1413879394531, "debug/reference_chosen_logps": -263.23260498046875, "debug/reference_rejected_logps": -279.31488037109375, "debug/sppo_chosen_loss": 2379.55908203125, "debug/sppo_chosen_reward_in_loss": 1.5314220190048218, "debug/sppo_rej_reward_in_loss": -4.826534748077393, "debug/sppo_reject_loss": 2191.15673828125, "epoch": 5.851449275362318, "grad_norm": 75262.29467933044, "learning_rate": 1.828753654367301e-08, "logits/chosen": 1.0857824087142944, "logits/rejected": 1.2012531757354736, "logps/chosen": -261.70123291015625, "logps/rejected": -284.1413879394531, "loss": 4482.4117, "rewards/accuracies": 0.574999988079071, "rewards/chosen": 0.015314221382141113, "rewards/margins": 0.06357955932617188, "rewards/rejected": -0.04826534539461136, "step": 1615 }, { "debug/policy_chosen_logits": 0.9738510251045227, "debug/policy_chosen_logps": -253.0663604736328, "debug/policy_rejected_logits": 1.2081331014633179, "debug/policy_rejected_logps": -289.32147216796875, "debug/reference_chosen_logps": -253.2355194091797, "debug/reference_rejected_logps": -283.28363037109375, "debug/sppo_chosen_loss": 2559.809814453125, "debug/sppo_chosen_reward_in_loss": 0.16917076706886292, "debug/sppo_rej_reward_in_loss": -6.037837028503418, "debug/sppo_reject_loss": 2085.84033203125, "epoch": 5.869565217391305, "grad_norm": 60666.13479644753, "learning_rate": 1.800036755600649e-08, "logits/chosen": 0.9738510251045227, "logits/rejected": 1.2081331014633179, "logps/chosen": -253.0663604736328, "logps/rejected": -289.32147216796875, "loss": 4566.7016, "rewards/accuracies": 0.625, "rewards/chosen": 0.001691707642748952, "rewards/margins": 0.06207007169723511, "rewards/rejected": -0.060378365218639374, "step": 1620 }, { "debug/policy_chosen_logits": 1.2756688594818115, "debug/policy_chosen_logps": -247.154541015625, "debug/policy_rejected_logits": 1.5490895509719849, "debug/policy_rejected_logps": -284.45733642578125, "debug/reference_chosen_logps": -248.82217407226562, "debug/reference_rejected_logps": -276.4673767089844, "debug/sppo_chosen_loss": 2402.35302734375, "debug/sppo_chosen_reward_in_loss": 1.6676151752471924, "debug/sppo_rej_reward_in_loss": -7.989927768707275, "debug/sppo_reject_loss": 2021.4124755859375, "epoch": 5.88768115942029, "grad_norm": 70184.62744062844, "learning_rate": 1.7714975380514747e-08, "logits/chosen": 1.2756688594818115, "logits/rejected": 1.5490895509719849, "logps/chosen": -247.154541015625, "logps/rejected": -284.45733642578125, "loss": 4444.1676, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 0.016676150262355804, "rewards/margins": 0.09657542407512665, "rewards/rejected": -0.07989926636219025, "step": 1625 }, { "debug/policy_chosen_logits": 0.9647180438041687, "debug/policy_chosen_logps": -231.06143188476562, "debug/policy_rejected_logits": 1.6003284454345703, "debug/policy_rejected_logps": -329.9085388183594, "debug/reference_chosen_logps": -232.83535766601562, "debug/reference_rejected_logps": -322.9118957519531, "debug/sppo_chosen_loss": 2379.330322265625, "debug/sppo_chosen_reward_in_loss": 1.773934006690979, "debug/sppo_rej_reward_in_loss": -6.9966301918029785, "debug/sppo_reject_loss": 2040.5396728515625, "epoch": 5.905797101449275, "grad_norm": 122076.70540811632, "learning_rate": 1.74313758638889e-08, "logits/chosen": 0.9647180438041687, "logits/rejected": 1.6003284454345703, "logps/chosen": -231.06143188476562, "logps/rejected": -329.9085388183594, "loss": 4578.8035, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 0.01773933880031109, "rewards/margins": 0.08770564943552017, "rewards/rejected": -0.06996630132198334, "step": 1630 }, { "debug/policy_chosen_logits": 0.9913623929023743, "debug/policy_chosen_logps": -235.04196166992188, "debug/policy_rejected_logits": 1.44295334815979, "debug/policy_rejected_logps": -325.648681640625, "debug/reference_chosen_logps": -237.11782836914062, "debug/reference_rejected_logps": -316.41522216796875, "debug/sppo_chosen_loss": 2330.42529296875, "debug/sppo_chosen_reward_in_loss": 2.075892448425293, "debug/sppo_rej_reward_in_loss": -9.233416557312012, "debug/sppo_reject_loss": 1873.272705078125, "epoch": 5.923913043478261, "grad_norm": 75527.0035979738, "learning_rate": 1.7149584753280877e-08, "logits/chosen": 0.9913623929023743, "logits/rejected": 1.44295334815979, "logps/chosen": -235.04196166992188, "logps/rejected": -325.648681640625, "loss": 4426.1695, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": 0.020758923143148422, "rewards/margins": 0.11309309303760529, "rewards/rejected": -0.09233416616916656, "step": 1635 }, { "debug/policy_chosen_logits": 0.8644050359725952, "debug/policy_chosen_logps": -247.27706909179688, "debug/policy_rejected_logits": 0.9515060186386108, "debug/policy_rejected_logps": -264.0517272949219, "debug/reference_chosen_logps": -246.76345825195312, "debug/reference_rejected_logps": -256.6071472167969, "debug/sppo_chosen_loss": 2653.48779296875, "debug/sppo_chosen_reward_in_loss": -0.5136321783065796, "debug/sppo_rej_reward_in_loss": -7.444613456726074, "debug/sppo_reject_loss": 1968.6578369140625, "epoch": 5.942028985507246, "grad_norm": 111654.94401469397, "learning_rate": 1.6869617695429024e-08, "logits/chosen": 0.8644050359725952, "logits/rejected": 0.9515060186386108, "logps/chosen": -247.27706909179688, "logps/rejected": -264.0517272949219, "loss": 4457.9152, "rewards/accuracies": 0.75, "rewards/chosen": -0.005136322230100632, "rewards/margins": 0.06930981576442719, "rewards/rejected": -0.07444612681865692, "step": 1640 }, { "debug/policy_chosen_logits": 1.2634034156799316, "debug/policy_chosen_logps": -272.01629638671875, "debug/policy_rejected_logits": 1.728417992591858, "debug/policy_rejected_logps": -293.72198486328125, "debug/reference_chosen_logps": -273.44683837890625, "debug/reference_rejected_logps": -289.3643493652344, "debug/sppo_chosen_loss": 2410.878173828125, "debug/sppo_chosen_reward_in_loss": 1.4305458068847656, "debug/sppo_rej_reward_in_loss": -4.357623100280762, "debug/sppo_reject_loss": 2192.16552734375, "epoch": 5.960144927536232, "grad_norm": 92617.1431644493, "learning_rate": 1.659149023578932e-08, "logits/chosen": 1.2634034156799316, "logits/rejected": 1.728417992591858, "logps/chosen": -272.01629638671875, "logps/rejected": -293.72198486328125, "loss": 4681.8473, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 0.014305457472801208, "rewards/margins": 0.05788169056177139, "rewards/rejected": -0.043576233088970184, "step": 1645 }, { "debug/policy_chosen_logits": 1.1534197330474854, "debug/policy_chosen_logps": -257.92620849609375, "debug/policy_rejected_logits": 1.5255687236785889, "debug/policy_rejected_logps": -309.0951232910156, "debug/reference_chosen_logps": -259.07080078125, "debug/reference_rejected_logps": -302.33856201171875, "debug/sppo_chosen_loss": 2448.565185546875, "debug/sppo_chosen_reward_in_loss": 1.144627332687378, "debug/sppo_rej_reward_in_loss": -6.756533145904541, "debug/sppo_reject_loss": 2012.3671875, "epoch": 5.978260869565218, "grad_norm": 67416.56753397142, "learning_rate": 1.631521781767214e-08, "logits/chosen": 1.1534197330474854, "logits/rejected": 1.5255687236785889, "logps/chosen": -257.92620849609375, "logps/rejected": -309.0951232910156, "loss": 4506.9414, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": 0.011446274816989899, "rewards/margins": 0.07901160418987274, "rewards/rejected": -0.06756532937288284, "step": 1650 }, { "debug/policy_chosen_logits": 1.0105514526367188, "debug/policy_chosen_logps": -266.85760498046875, "debug/policy_rejected_logits": 1.1407248973846436, "debug/policy_rejected_logps": -270.06201171875, "debug/reference_chosen_logps": -269.12786865234375, "debug/reference_rejected_logps": -266.747314453125, "debug/sppo_chosen_loss": 2305.269775390625, "debug/sppo_chosen_reward_in_loss": 2.270277738571167, "debug/sppo_rej_reward_in_loss": -3.3147239685058594, "debug/sppo_reject_loss": 2292.51220703125, "epoch": 5.996376811594203, "grad_norm": 71539.28058485997, "learning_rate": 1.6040815781384835e-08, "logits/chosen": 1.0105514526367188, "logits/rejected": 1.1407248973846436, "logps/chosen": -266.85760498046875, "logps/rejected": -270.06201171875, "loss": 4471.6687, "rewards/accuracies": 0.675000011920929, "rewards/chosen": 0.02270277589559555, "rewards/margins": 0.05585001781582832, "rewards/rejected": -0.033147238194942474, "step": 1655 }, { "debug/policy_chosen_logits": 1.3166652917861938, "debug/policy_chosen_logps": -249.92971801757812, "debug/policy_rejected_logits": 1.3984097242355347, "debug/policy_rejected_logps": -264.4957275390625, "debug/reference_chosen_logps": -252.8326416015625, "debug/reference_rejected_logps": -259.8531494140625, "debug/sppo_chosen_loss": 2243.153564453125, "debug/sppo_chosen_reward_in_loss": 2.9029386043548584, "debug/sppo_rej_reward_in_loss": -4.6425909996032715, "debug/sppo_reject_loss": 2174.069091796875, "epoch": 6.0144927536231885, "grad_norm": 165564.9998152843, "learning_rate": 1.5768299363379873e-08, "logits/chosen": 1.3166652917861938, "logits/rejected": 1.3984097242355347, "logps/chosen": -249.92971801757812, "logps/rejected": -264.4957275390625, "loss": 4336.6781, "rewards/accuracies": 0.75, "rewards/chosen": 0.02902938798069954, "rewards/margins": 0.07545529305934906, "rewards/rejected": -0.046425916254520416, "step": 1660 }, { "debug/policy_chosen_logits": 1.1228680610656738, "debug/policy_chosen_logps": -256.03125, "debug/policy_rejected_logits": 1.1986610889434814, "debug/policy_rejected_logps": -284.936279296875, "debug/reference_chosen_logps": -256.0284118652344, "debug/reference_rejected_logps": -279.94073486328125, "debug/sppo_chosen_loss": 2582.91455078125, "debug/sppo_chosen_reward_in_loss": -0.0028770447243005037, "debug/sppo_rej_reward_in_loss": -4.995522499084473, "debug/sppo_reject_loss": 2207.716796875, "epoch": 6.032608695652174, "grad_norm": 62791.773762485485, "learning_rate": 1.549768369540882e-08, "logits/chosen": 1.1228680610656738, "logits/rejected": 1.1986610889434814, "logps/chosen": -256.03125, "logps/rejected": -284.936279296875, "loss": 4471.5508, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -2.8770416975021362e-05, "rewards/margins": 0.049926456063985825, "rewards/rejected": -0.049955226480960846, "step": 1665 }, { "debug/policy_chosen_logits": 0.9759367108345032, "debug/policy_chosen_logps": -260.8306884765625, "debug/policy_rejected_logits": 1.3370988368988037, "debug/policy_rejected_logps": -274.1346740722656, "debug/reference_chosen_logps": -260.8038330078125, "debug/reference_rejected_logps": -269.924072265625, "debug/sppo_chosen_loss": 2595.6005859375, "debug/sppo_chosen_reward_in_loss": -0.026832008734345436, "debug/sppo_rej_reward_in_loss": -4.210579872131348, "debug/sppo_reject_loss": 2220.85302734375, "epoch": 6.050724637681159, "grad_norm": 85633.85924538625, "learning_rate": 1.5228983803682233e-08, "logits/chosen": 0.9759367108345032, "logits/rejected": 1.3370988368988037, "logps/chosen": -260.8306884765625, "logps/rejected": -274.1346740722656, "loss": 4617.566, "rewards/accuracies": 0.75, "rewards/chosen": -0.0002683214843273163, "rewards/margins": 0.04183747619390488, "rewards/rejected": -0.042105793952941895, "step": 1670 }, { "debug/policy_chosen_logits": 1.043464183807373, "debug/policy_chosen_logps": -237.03897094726562, "debug/policy_rejected_logits": 1.3343417644500732, "debug/policy_rejected_logps": -272.0693664550781, "debug/reference_chosen_logps": -237.66552734375, "debug/reference_rejected_logps": -266.9305725097656, "debug/sppo_chosen_loss": 2517.232421875, "debug/sppo_chosen_reward_in_loss": 0.6265815496444702, "debug/sppo_rej_reward_in_loss": -5.138772487640381, "debug/sppo_reject_loss": 2158.145751953125, "epoch": 6.068840579710145, "grad_norm": 106333.17908808027, "learning_rate": 1.4962214608035174e-08, "logits/chosen": 1.043464183807373, "logits/rejected": 1.3343417644500732, "logps/chosen": -237.03897094726562, "logps/rejected": -272.0693664550781, "loss": 4487.2164, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": 0.006265816278755665, "rewards/margins": 0.05765353515744209, "rewards/rejected": -0.0513877272605896, "step": 1675 }, { "debug/policy_chosen_logits": 0.9386157989501953, "debug/policy_chosen_logps": -245.19601440429688, "debug/policy_rejected_logits": 1.1581140756607056, "debug/policy_rejected_logps": -282.74481201171875, "debug/reference_chosen_logps": -246.68087768554688, "debug/reference_rejected_logps": -274.8426208496094, "debug/sppo_chosen_loss": 2406.94140625, "debug/sppo_chosen_reward_in_loss": 1.484842300415039, "debug/sppo_rej_reward_in_loss": -7.902211666107178, "debug/sppo_reject_loss": 1954.858154296875, "epoch": 6.086956521739131, "grad_norm": 63533.333542387, "learning_rate": 1.4697390921098884e-08, "logits/chosen": 0.9386157989501953, "logits/rejected": 1.1581140756607056, "logps/chosen": -245.19601440429688, "logps/rejected": -282.74481201171875, "loss": 4453.4, "rewards/accuracies": 0.75, "rewards/chosen": 0.014848423190414906, "rewards/margins": 0.09387053549289703, "rewards/rejected": -0.079022116959095, "step": 1680 }, { "debug/policy_chosen_logits": 1.0539305210113525, "debug/policy_chosen_logps": -252.01614379882812, "debug/policy_rejected_logits": 1.518951416015625, "debug/policy_rejected_logps": -322.51824951171875, "debug/reference_chosen_logps": -255.57870483398438, "debug/reference_rejected_logps": -316.02362060546875, "debug/sppo_chosen_loss": 2176.062255859375, "debug/sppo_chosen_reward_in_loss": 3.562582492828369, "debug/sppo_rej_reward_in_loss": -6.494635581970215, "debug/sppo_reject_loss": 2063.71337890625, "epoch": 6.105072463768116, "grad_norm": 78076.75910880722, "learning_rate": 1.4434527447478211e-08, "logits/chosen": 1.0539305210113525, "logits/rejected": 1.518951416015625, "logps/chosen": -252.01614379882812, "logps/rejected": -322.51824951171875, "loss": 4314.0164, "rewards/accuracies": 0.824999988079071, "rewards/chosen": 0.03562582656741142, "rewards/margins": 0.10057218372821808, "rewards/rejected": -0.06494636088609695, "step": 1685 }, { "debug/policy_chosen_logits": 1.1778371334075928, "debug/policy_chosen_logps": -240.6632843017578, "debug/policy_rejected_logits": 1.3609716892242432, "debug/policy_rejected_logps": -288.82550048828125, "debug/reference_chosen_logps": -242.98355102539062, "debug/reference_rejected_logps": -285.4348449707031, "debug/sppo_chosen_loss": 2297.618408203125, "debug/sppo_chosen_reward_in_loss": 2.320263147354126, "debug/sppo_rej_reward_in_loss": -3.3906643390655518, "debug/sppo_reject_loss": 2266.66259765625, "epoch": 6.1231884057971016, "grad_norm": 92007.95295488626, "learning_rate": 1.4173638782935222e-08, "logits/chosen": 1.1778371334075928, "logits/rejected": 1.3609716892242432, "logps/chosen": -240.6632843017578, "logps/rejected": -288.82550048828125, "loss": 4490.9922, "rewards/accuracies": 0.675000011920929, "rewards/chosen": 0.02320263162255287, "rewards/margins": 0.057109273970127106, "rewards/rejected": -0.033906638622283936, "step": 1690 }, { "debug/policy_chosen_logits": 1.1212691068649292, "debug/policy_chosen_logps": -267.1830139160156, "debug/policy_rejected_logits": 1.542589545249939, "debug/policy_rejected_logps": -308.27496337890625, "debug/reference_chosen_logps": -269.02978515625, "debug/reference_rejected_logps": -303.439208984375, "debug/sppo_chosen_loss": 2355.48974609375, "debug/sppo_chosen_reward_in_loss": 1.8467466831207275, "debug/sppo_rej_reward_in_loss": -4.83573055267334, "debug/sppo_reject_loss": 2156.66015625, "epoch": 6.141304347826087, "grad_norm": 76139.17518191009, "learning_rate": 1.3914739413578635e-08, "logits/chosen": 1.1212691068649292, "logits/rejected": 1.542589545249939, "logps/chosen": -267.1830139160156, "logps/rejected": -308.27496337890625, "loss": 4502.1578, "rewards/accuracies": 0.75, "rewards/chosen": 0.018467465415596962, "rewards/margins": 0.0668247789144516, "rewards/rejected": -0.04835730418562889, "step": 1695 }, { "debug/policy_chosen_logits": 1.0304549932479858, "debug/policy_chosen_logps": -258.8855895996094, "debug/policy_rejected_logits": 1.1785156726837158, "debug/policy_rejected_logps": -277.87237548828125, "debug/reference_chosen_logps": -261.2309875488281, "debug/reference_rejected_logps": -270.38702392578125, "debug/sppo_chosen_loss": 2296.740478515625, "debug/sppo_chosen_reward_in_loss": 2.3453879356384277, "debug/sppo_rej_reward_in_loss": -7.485389709472656, "debug/sppo_reject_loss": 1961.5794677734375, "epoch": 6.159420289855072, "grad_norm": 63352.3130523474, "learning_rate": 1.3657843715059546e-08, "logits/chosen": 1.0304549932479858, "logits/rejected": 1.1785156726837158, "logps/chosen": -258.8855895996094, "logps/rejected": -277.87237548828125, "loss": 4507.4234, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 0.023453880101442337, "rewards/margins": 0.0983077734708786, "rewards/rejected": -0.07485388219356537, "step": 1700 }, { "epoch": 6.159420289855072, "eval_debug/policy_chosen_logits": 1.393418312072754, "eval_debug/policy_chosen_logps": -252.83157348632812, "eval_debug/policy_rejected_logits": 1.4381625652313232, "eval_debug/policy_rejected_logps": -263.7893371582031, "eval_debug/reference_chosen_logps": -252.91845703125, "eval_debug/reference_rejected_logps": -259.6585998535156, "eval_debug/sppo_chosen_loss": 2573.3212890625, "eval_debug/sppo_chosen_reward_in_loss": 0.08691415935754776, "eval_debug/sppo_rej_reward_in_loss": -4.130711078643799, "eval_debug/sppo_reject_loss": 2274.951171875, "eval_logits/chosen": 1.393418312072754, "eval_logits/rejected": 1.4381625652313232, "eval_logps/chosen": -252.83157348632812, "eval_logps/rejected": -263.7893371582031, "eval_loss": 4618.0, "eval_rewards/accuracies": 0.5921052694320679, "eval_rewards/chosen": 0.0008691417751833797, "eval_rewards/margins": 0.042176254093647, "eval_rewards/rejected": -0.04130711406469345, "eval_runtime": 28.5957, "eval_samples_per_second": 20.982, "eval_steps_per_second": 0.664, "step": 1700 }, { "debug/policy_chosen_logits": 1.1354193687438965, "debug/policy_chosen_logps": -249.91000366210938, "debug/policy_rejected_logits": 1.5528957843780518, "debug/policy_rejected_logps": -313.15106201171875, "debug/reference_chosen_logps": -252.29525756835938, "debug/reference_rejected_logps": -306.924072265625, "debug/sppo_chosen_loss": 2300.11083984375, "debug/sppo_chosen_reward_in_loss": 2.385272741317749, "debug/sppo_rej_reward_in_loss": -6.227025032043457, "debug/sppo_reject_loss": 2092.615234375, "epoch": 6.177536231884058, "grad_norm": 107904.8515031039, "learning_rate": 1.3402965951773231e-08, "logits/chosen": 1.1354193687438965, "logits/rejected": 1.5528957843780518, "logps/chosen": -249.91000366210938, "logps/rejected": -313.15106201171875, "loss": 4416.3258, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": 0.023852726444602013, "rewards/margins": 0.08612297475337982, "rewards/rejected": -0.06227024644613266, "step": 1705 }, { "debug/policy_chosen_logits": 1.0563108921051025, "debug/policy_chosen_logps": -254.9259033203125, "debug/policy_rejected_logits": 1.5079724788665771, "debug/policy_rejected_logps": -305.40533447265625, "debug/reference_chosen_logps": -257.25634765625, "debug/reference_rejected_logps": -300.0366516113281, "debug/sppo_chosen_loss": 2320.1484375, "debug/sppo_chosen_reward_in_loss": 2.330429792404175, "debug/sppo_rej_reward_in_loss": -5.368690013885498, "debug/sppo_reject_loss": 2095.70947265625, "epoch": 6.195652173913044, "grad_norm": 145999.2754310219, "learning_rate": 1.3150120276067005e-08, "logits/chosen": 1.0563108921051025, "logits/rejected": 1.5079724788665771, "logps/chosen": -254.9259033203125, "logps/rejected": -305.40533447265625, "loss": 4454.2812, "rewards/accuracies": 0.824999988079071, "rewards/chosen": 0.023304296657443047, "rewards/margins": 0.07699120044708252, "rewards/rejected": -0.053686898201704025, "step": 1710 }, { "debug/policy_chosen_logits": 0.8688338994979858, "debug/policy_chosen_logps": -235.02902221679688, "debug/policy_rejected_logits": 1.3255870342254639, "debug/policy_rejected_logps": -299.3507385253906, "debug/reference_chosen_logps": -234.290283203125, "debug/reference_rejected_logps": -290.80792236328125, "debug/sppo_chosen_loss": 2712.55908203125, "debug/sppo_chosen_reward_in_loss": -0.7387531399726868, "debug/sppo_rej_reward_in_loss": -8.542762756347656, "debug/sppo_reject_loss": 1942.9124755859375, "epoch": 6.213768115942029, "grad_norm": 72598.38108351543, "learning_rate": 1.2899320727454472e-08, "logits/chosen": 0.8688338994979858, "logits/rejected": 1.3255870342254639, "logps/chosen": -235.02902221679688, "logps/rejected": -299.3507385253906, "loss": 4546.7039, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -0.007387531455606222, "rewards/margins": 0.07804010063409805, "rewards/rejected": -0.08542762696743011, "step": 1715 }, { "debug/policy_chosen_logits": 1.2419612407684326, "debug/policy_chosen_logps": -231.77490234375, "debug/policy_rejected_logits": 1.3306069374084473, "debug/policy_rejected_logps": -259.4939880371094, "debug/reference_chosen_logps": -235.42819213867188, "debug/reference_rejected_logps": -255.14407348632812, "debug/sppo_chosen_loss": 2156.22705078125, "debug/sppo_chosen_reward_in_loss": 3.6533005237579346, "debug/sppo_rej_reward_in_loss": -4.349907875061035, "debug/sppo_reject_loss": 2229.787353515625, "epoch": 6.231884057971015, "grad_norm": 61898.363819015816, "learning_rate": 1.2650581231835921e-08, "logits/chosen": 1.2419612407684326, "logits/rejected": 1.3306069374084473, "logps/chosen": -231.77490234375, "logps/rejected": -259.4939880371094, "loss": 4444.5094, "rewards/accuracies": 0.824999988079071, "rewards/chosen": 0.03653300553560257, "rewards/margins": 0.08003208041191101, "rewards/rejected": -0.04349907115101814, "step": 1720 }, { "debug/policy_chosen_logits": 1.206857442855835, "debug/policy_chosen_logps": -261.6423034667969, "debug/policy_rejected_logits": 1.467166543006897, "debug/policy_rejected_logps": -296.5067138671875, "debug/reference_chosen_logps": -262.1924743652344, "debug/reference_rejected_logps": -292.49652099609375, "debug/sppo_chosen_loss": 2536.546630859375, "debug/sppo_chosen_reward_in_loss": 0.5501596331596375, "debug/sppo_rej_reward_in_loss": -4.010178565979004, "debug/sppo_reject_loss": 2224.327392578125, "epoch": 6.25, "grad_norm": 72796.66106475785, "learning_rate": 1.2403915600725157e-08, "logits/chosen": 1.206857442855835, "logits/rejected": 1.467166543006897, "logps/chosen": -261.6423034667969, "logps/rejected": -296.5067138671875, "loss": 4472.7969, "rewards/accuracies": 0.625, "rewards/chosen": 0.005501596722751856, "rewards/margins": 0.04560338333249092, "rewards/rejected": -0.0401017852127552, "step": 1725 }, { "debug/policy_chosen_logits": 1.1720632314682007, "debug/policy_chosen_logps": -239.95504760742188, "debug/policy_rejected_logits": 1.3450525999069214, "debug/policy_rejected_logps": -271.2496643066406, "debug/reference_chosen_logps": -239.88345336914062, "debug/reference_rejected_logps": -264.8446350097656, "debug/sppo_chosen_loss": 2585.338623046875, "debug/sppo_chosen_reward_in_loss": -0.07162685692310333, "debug/sppo_rej_reward_in_loss": -6.405016899108887, "debug/sppo_reject_loss": 2089.844482421875, "epoch": 6.268115942028985, "grad_norm": 89512.0377820254, "learning_rate": 1.2159337530482494e-08, "logits/chosen": 1.1720632314682007, "logits/rejected": 1.3450525999069214, "logps/chosen": -239.95504760742188, "logps/rejected": -271.2496643066406, "loss": 4510.391, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -0.000716269772965461, "rewards/margins": 0.0633339062333107, "rewards/rejected": -0.06405016779899597, "step": 1730 }, { "debug/policy_chosen_logits": 0.982774555683136, "debug/policy_chosen_logps": -246.8867950439453, "debug/policy_rejected_logits": 1.3127689361572266, "debug/policy_rejected_logps": -287.54180908203125, "debug/reference_chosen_logps": -248.7151641845703, "debug/reference_rejected_logps": -286.00408935546875, "debug/sppo_chosen_loss": 2359.535888671875, "debug/sppo_chosen_reward_in_loss": 1.8283694982528687, "debug/sppo_rej_reward_in_loss": -1.5377200841903687, "debug/sppo_reject_loss": 2389.627197265625, "epoch": 6.286231884057971, "grad_norm": 66847.64337209416, "learning_rate": 1.1916860601554312e-08, "logits/chosen": 0.982774555683136, "logits/rejected": 1.3127689361572266, "logps/chosen": -246.8867950439453, "logps/rejected": -287.54180908203125, "loss": 4579.4227, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": 0.018283694982528687, "rewards/margins": 0.0336608961224556, "rewards/rejected": -0.01537720113992691, "step": 1735 }, { "debug/policy_chosen_logits": 1.253943681716919, "debug/policy_chosen_logps": -272.449951171875, "debug/policy_rejected_logits": 1.6080108880996704, "debug/policy_rejected_logps": -329.8172302246094, "debug/reference_chosen_logps": -275.38238525390625, "debug/reference_rejected_logps": -324.34478759765625, "debug/sppo_chosen_loss": 2233.989501953125, "debug/sppo_chosen_reward_in_loss": 2.9324352741241455, "debug/sppo_rej_reward_in_loss": -5.472461223602295, "debug/sppo_reject_loss": 2133.767333984375, "epoch": 6.304347826086957, "grad_norm": 68122.6119676196, "learning_rate": 1.1676498277719017e-08, "logits/chosen": 1.253943681716919, "logits/rejected": 1.6080108880996704, "logps/chosen": -272.449951171875, "logps/rejected": -329.8172302246094, "loss": 4587.9414, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 0.029324352741241455, "rewards/margins": 0.08404896408319473, "rewards/rejected": -0.05472461134195328, "step": 1740 }, { "debug/policy_chosen_logits": 0.8677678108215332, "debug/policy_chosen_logps": -261.25238037109375, "debug/policy_rejected_logits": 1.049141526222229, "debug/policy_rejected_logps": -305.6992492675781, "debug/reference_chosen_logps": -263.1070861816406, "debug/reference_rejected_logps": -298.8080139160156, "debug/sppo_chosen_loss": 2364.94775390625, "debug/sppo_chosen_reward_in_loss": 1.854697823524475, "debug/sppo_rej_reward_in_loss": -6.891258239746094, "debug/sppo_reject_loss": 2049.59375, "epoch": 6.322463768115942, "grad_norm": 76682.47648309536, "learning_rate": 1.1438263905339358e-08, "logits/chosen": 0.8677678108215332, "logits/rejected": 1.049141526222229, "logps/chosen": -261.25238037109375, "logps/rejected": -305.6992492675781, "loss": 4562.4586, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": 0.018546978011727333, "rewards/margins": 0.08745956420898438, "rewards/rejected": -0.0689125806093216, "step": 1745 }, { "debug/policy_chosen_logits": 1.2336132526397705, "debug/policy_chosen_logps": -244.929443359375, "debug/policy_rejected_logits": 1.4067014455795288, "debug/policy_rejected_logps": -275.9320068359375, "debug/reference_chosen_logps": -247.2993927001953, "debug/reference_rejected_logps": -272.1546325683594, "debug/sppo_chosen_loss": 2289.958984375, "debug/sppo_chosen_reward_in_loss": 2.3699169158935547, "debug/sppo_rej_reward_in_loss": -3.7773468494415283, "debug/sppo_reject_loss": 2284.33447265625, "epoch": 6.340579710144928, "grad_norm": 72167.1173026926, "learning_rate": 1.1202170712621467e-08, "logits/chosen": 1.2336132526397705, "logits/rejected": 1.4067014455795288, "logps/chosen": -244.929443359375, "logps/rejected": -275.9320068359375, "loss": 4390.8207, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 0.023699168115854263, "rewards/margins": 0.061472635716199875, "rewards/rejected": -0.03777346760034561, "step": 1750 }, { "debug/policy_chosen_logits": 0.9754334688186646, "debug/policy_chosen_logps": -266.0594177246094, "debug/policy_rejected_logits": 1.2265844345092773, "debug/policy_rejected_logps": -309.459228515625, "debug/reference_chosen_logps": -267.568359375, "debug/reference_rejected_logps": -307.5111389160156, "debug/sppo_chosen_loss": 2409.52978515625, "debug/sppo_chosen_reward_in_loss": 1.5089489221572876, "debug/sppo_rej_reward_in_loss": -1.9481090307235718, "debug/sppo_reject_loss": 2377.966064453125, "epoch": 6.358695652173913, "grad_norm": 115786.13338678224, "learning_rate": 1.0968231808880241e-08, "logits/chosen": 0.9754334688186646, "logits/rejected": 1.2265844345092773, "logps/chosen": -266.0594177246094, "logps/rejected": -309.459228515625, "loss": 4526.4805, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": 0.01508948765695095, "rewards/margins": 0.03457058221101761, "rewards/rejected": -0.01948108896613121, "step": 1755 }, { "debug/policy_chosen_logits": 1.1159372329711914, "debug/policy_chosen_logps": -262.75372314453125, "debug/policy_rejected_logits": 1.3715074062347412, "debug/policy_rejected_logps": -301.50225830078125, "debug/reference_chosen_logps": -265.3297424316406, "debug/reference_rejected_logps": -296.4901123046875, "debug/sppo_chosen_loss": 2279.923828125, "debug/sppo_chosen_reward_in_loss": 2.5760035514831543, "debug/sppo_rej_reward_in_loss": -5.012146949768066, "debug/sppo_reject_loss": 2145.450927734375, "epoch": 6.3768115942028984, "grad_norm": 70644.94503903398, "learning_rate": 1.0736460183811546e-08, "logits/chosen": 1.1159372329711914, "logits/rejected": 1.3715074062347412, "logps/chosen": -262.75372314453125, "logps/rejected": -301.50225830078125, "loss": 4568.8281, "rewards/accuracies": 0.75, "rewards/chosen": 0.02576003409922123, "rewards/margins": 0.07588149607181549, "rewards/rejected": -0.05012146756052971, "step": 1760 }, { "debug/policy_chosen_logits": 1.30352783203125, "debug/policy_chosen_logps": -274.08648681640625, "debug/policy_rejected_logits": 1.3584903478622437, "debug/policy_rejected_logps": -292.0806884765625, "debug/reference_chosen_logps": -274.2155456542969, "debug/reference_rejected_logps": -287.33245849609375, "debug/sppo_chosen_loss": 2555.212890625, "debug/sppo_chosen_reward_in_loss": 0.12903061509132385, "debug/sppo_rej_reward_in_loss": -4.748242378234863, "debug/sppo_reject_loss": 2203.50048828125, "epoch": 6.394927536231884, "grad_norm": 63618.12217875742, "learning_rate": 1.0506868706770844e-08, "logits/chosen": 1.30352783203125, "logits/rejected": 1.3584903478622437, "logps/chosen": -274.08648681640625, "logps/rejected": -292.0806884765625, "loss": 4544.6687, "rewards/accuracies": 0.625, "rewards/chosen": 0.0012903057504445314, "rewards/margins": 0.04877272993326187, "rewards/rejected": -0.04748242348432541, "step": 1765 }, { "debug/policy_chosen_logits": 1.0495314598083496, "debug/policy_chosen_logps": -239.2109375, "debug/policy_rejected_logits": 1.478896141052246, "debug/policy_rejected_logps": -283.77838134765625, "debug/reference_chosen_logps": -239.0229034423828, "debug/reference_rejected_logps": -279.0099182128906, "debug/sppo_chosen_loss": 2597.17578125, "debug/sppo_chosen_reward_in_loss": -0.18803825974464417, "debug/sppo_rej_reward_in_loss": -4.768446922302246, "debug/sppo_reject_loss": 2182.498046875, "epoch": 6.413043478260869, "grad_norm": 77066.58960116318, "learning_rate": 1.0279470126058676e-08, "logits/chosen": 1.0495314598083496, "logits/rejected": 1.478896141052246, "logps/chosen": -239.2109375, "logps/rejected": -283.77838134765625, "loss": 4643.443, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -0.0018803831189870834, "rewards/margins": 0.04580408334732056, "rewards/rejected": -0.04768446832895279, "step": 1770 }, { "debug/policy_chosen_logits": 1.2252495288848877, "debug/policy_chosen_logps": -279.1947326660156, "debug/policy_rejected_logits": 1.2340757846832275, "debug/policy_rejected_logps": -279.371826171875, "debug/reference_chosen_logps": -280.98284912109375, "debug/reference_rejected_logps": -277.61669921875, "debug/sppo_chosen_loss": 2366.054443359375, "debug/sppo_chosen_reward_in_loss": 1.7880885601043701, "debug/sppo_rej_reward_in_loss": -1.7551319599151611, "debug/sppo_reject_loss": 2390.520751953125, "epoch": 6.431159420289855, "grad_norm": 72827.20873599048, "learning_rate": 1.0054277068212797e-08, "logits/chosen": 1.2252495288848877, "logits/rejected": 1.2340757846832275, "logps/chosen": -279.1947326660156, "logps/rejected": -279.371826171875, "loss": 4518.8156, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": 0.017880886793136597, "rewards/margins": 0.035432200878858566, "rewards/rejected": -0.017551319673657417, "step": 1775 }, { "debug/policy_chosen_logits": 0.8835108876228333, "debug/policy_chosen_logps": -271.79595947265625, "debug/policy_rejected_logits": 1.2131328582763672, "debug/policy_rejected_logps": -312.07208251953125, "debug/reference_chosen_logps": -274.2127685546875, "debug/reference_rejected_logps": -305.23321533203125, "debug/sppo_chosen_loss": 2285.39404296875, "debug/sppo_chosen_reward_in_loss": 2.416809558868408, "debug/sppo_rej_reward_in_loss": -6.838896751403809, "debug/sppo_reject_loss": 2023.9400634765625, "epoch": 6.449275362318841, "grad_norm": 63623.52265960445, "learning_rate": 9.831302037307021e-09, "logits/chosen": 0.8835108876228333, "logits/rejected": 1.2131328582763672, "logps/chosen": -271.79595947265625, "logps/rejected": -312.07208251953125, "loss": 4477.0383, "rewards/accuracies": 0.824999988079071, "rewards/chosen": 0.024168096482753754, "rewards/margins": 0.09255705773830414, "rewards/rejected": -0.06838897615671158, "step": 1780 }, { "debug/policy_chosen_logits": 1.4466286897659302, "debug/policy_chosen_logps": -270.4122009277344, "debug/policy_rejected_logits": 1.503780722618103, "debug/policy_rejected_logps": -285.6905212402344, "debug/reference_chosen_logps": -270.66192626953125, "debug/reference_rejected_logps": -283.3195495605469, "debug/sppo_chosen_loss": 2587.914794921875, "debug/sppo_chosen_reward_in_loss": 0.24976272881031036, "debug/sppo_rej_reward_in_loss": -2.3709824085235596, "debug/sppo_reject_loss": 2394.04248046875, "epoch": 6.467391304347826, "grad_norm": 68260.49272500667, "learning_rate": 9.610557414257009e-09, "logits/chosen": 1.4466286897659302, "logits/rejected": 1.503780722618103, "logps/chosen": -270.4122009277344, "logps/rejected": -285.6905212402344, "loss": 4458.1781, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 0.0024976269342005253, "rewards/margins": 0.026207447052001953, "rewards/rejected": -0.023709822446107864, "step": 1785 }, { "debug/policy_chosen_logits": 0.844222903251648, "debug/policy_chosen_logps": -243.48812866210938, "debug/policy_rejected_logits": 1.2014129161834717, "debug/policy_rejected_logps": -287.10198974609375, "debug/reference_chosen_logps": -244.35690307617188, "debug/reference_rejected_logps": -282.8691711425781, "debug/sppo_chosen_loss": 2543.624267578125, "debug/sppo_chosen_reward_in_loss": 0.8687904477119446, "debug/sppo_rej_reward_in_loss": -4.232800483703613, "debug/sppo_reject_loss": 2225.53271484375, "epoch": 6.4855072463768115, "grad_norm": 71785.22277961893, "learning_rate": 9.392055456132713e-09, "logits/chosen": 0.844222903251648, "logits/rejected": 1.2014129161834717, "logps/chosen": -243.48812866210938, "logps/rejected": -287.10198974609375, "loss": 4567.2234, "rewards/accuracies": 0.75, "rewards/chosen": 0.008687904104590416, "rewards/margins": 0.05101591348648071, "rewards/rejected": -0.04232800751924515, "step": 1790 }, { "debug/policy_chosen_logits": 1.136611819267273, "debug/policy_chosen_logps": -226.44253540039062, "debug/policy_rejected_logits": 1.2988706827163696, "debug/policy_rejected_logps": -250.56747436523438, "debug/reference_chosen_logps": -227.589599609375, "debug/reference_rejected_logps": -245.1594696044922, "debug/sppo_chosen_loss": 2425.15869140625, "debug/sppo_chosen_reward_in_loss": 1.147066354751587, "debug/sppo_rej_reward_in_loss": -5.408025741577148, "debug/sppo_reject_loss": 2152.47265625, "epoch": 6.503623188405797, "grad_norm": 104367.46962525073, "learning_rate": 9.175808295477849e-09, "logits/chosen": 1.136611819267273, "logits/rejected": 1.2988706827163696, "logps/chosen": -226.44253540039062, "logps/rejected": -250.56747436523438, "loss": 4551.068, "rewards/accuracies": 0.5249999761581421, "rewards/chosen": 0.011470664292573929, "rewards/margins": 0.06555091589689255, "rewards/rejected": -0.054080259054899216, "step": 1795 }, { "debug/policy_chosen_logits": 1.1030946969985962, "debug/policy_chosen_logps": -233.5998077392578, "debug/policy_rejected_logits": 1.4379918575286865, "debug/policy_rejected_logps": -271.5467529296875, "debug/reference_chosen_logps": -234.68594360351562, "debug/reference_rejected_logps": -270.9393005371094, "debug/sppo_chosen_loss": 2447.8017578125, "debug/sppo_chosen_reward_in_loss": 1.0861365795135498, "debug/sppo_rej_reward_in_loss": -0.607455849647522, "debug/sppo_reject_loss": 2494.67431640625, "epoch": 6.521739130434782, "grad_norm": 81728.7219157688, "learning_rate": 8.961827939636196e-09, "logits/chosen": 1.1030946969985962, "logits/rejected": 1.4379918575286865, "logps/chosen": -233.5998077392578, "logps/rejected": -271.5467529296875, "loss": 4566.6648, "rewards/accuracies": 0.675000011920929, "rewards/chosen": 0.010861365124583244, "rewards/margins": 0.01693592593073845, "rewards/rejected": -0.006074557546526194, "step": 1800 }, { "epoch": 6.521739130434782, "eval_debug/policy_chosen_logits": 1.3974790573120117, "eval_debug/policy_chosen_logps": -252.31051635742188, "eval_debug/policy_rejected_logits": 1.4412792921066284, "eval_debug/policy_rejected_logps": -263.3516845703125, "eval_debug/reference_chosen_logps": -252.91845703125, "eval_debug/reference_rejected_logps": -259.6585998535156, "eval_debug/sppo_chosen_loss": 2512.918701171875, "eval_debug/sppo_chosen_reward_in_loss": 0.6079623103141785, "eval_debug/sppo_rej_reward_in_loss": -3.6930320262908936, "eval_debug/sppo_reject_loss": 2304.7548828125, "eval_logits/chosen": 1.3974790573120117, "eval_logits/rejected": 1.4412792921066284, "eval_logps/chosen": -252.31051635742188, "eval_logps/rejected": -263.3516845703125, "eval_loss": 4619.33251953125, "eval_rewards/accuracies": 0.5921052694320679, "eval_rewards/chosen": 0.006079623010009527, "eval_rewards/margins": 0.04300994426012039, "eval_rewards/rejected": -0.03693031892180443, "eval_runtime": 28.7328, "eval_samples_per_second": 20.882, "eval_steps_per_second": 0.661, "step": 1800 }, { "debug/policy_chosen_logits": 1.0640919208526611, "debug/policy_chosen_logps": -241.22659301757812, "debug/policy_rejected_logits": 1.1182310581207275, "debug/policy_rejected_logps": -266.4372253417969, "debug/reference_chosen_logps": -242.4794464111328, "debug/reference_rejected_logps": -261.7477111816406, "debug/sppo_chosen_loss": 2412.180908203125, "debug/sppo_chosen_reward_in_loss": 1.2528440952301025, "debug/sppo_rej_reward_in_loss": -4.689537048339844, "debug/sppo_reject_loss": 2180.385986328125, "epoch": 6.539855072463768, "grad_norm": 61740.59994954301, "learning_rate": 8.75012627008489e-09, "logits/chosen": 1.0640919208526611, "logits/rejected": 1.1182310581207275, "logps/chosen": -241.22659301757812, "logps/rejected": -266.4372253417969, "loss": 4483.5254, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": 0.012528439983725548, "rewards/margins": 0.059423815459012985, "rewards/rejected": -0.04689536988735199, "step": 1805 }, { "debug/policy_chosen_logits": 1.0891189575195312, "debug/policy_chosen_logps": -241.63992309570312, "debug/policy_rejected_logits": 1.606302261352539, "debug/policy_rejected_logps": -313.29656982421875, "debug/reference_chosen_logps": -243.9291534423828, "debug/reference_rejected_logps": -306.9833984375, "debug/sppo_chosen_loss": 2318.420654296875, "debug/sppo_chosen_reward_in_loss": 2.289240837097168, "debug/sppo_rej_reward_in_loss": -6.313161373138428, "debug/sppo_reject_loss": 2048.82080078125, "epoch": 6.557971014492754, "grad_norm": 78559.58495351419, "learning_rate": 8.540715041774716e-09, "logits/chosen": 1.0891189575195312, "logits/rejected": 1.606302261352539, "logps/chosen": -241.63992309570312, "logps/rejected": -313.29656982421875, "loss": 4586.9969, "rewards/accuracies": 0.75, "rewards/chosen": 0.022892409935593605, "rewards/margins": 0.08602402359247208, "rewards/rejected": -0.06313161551952362, "step": 1810 }, { "debug/policy_chosen_logits": 1.4883325099945068, "debug/policy_chosen_logps": -271.9356384277344, "debug/policy_rejected_logits": 1.6947778463363647, "debug/policy_rejected_logps": -284.2402648925781, "debug/reference_chosen_logps": -273.122802734375, "debug/reference_rejected_logps": -281.25311279296875, "debug/sppo_chosen_loss": 2417.36767578125, "debug/sppo_chosen_reward_in_loss": 1.1871535778045654, "debug/sppo_rej_reward_in_loss": -2.987131118774414, "debug/sppo_reject_loss": 2343.622314453125, "epoch": 6.576086956521739, "grad_norm": 70559.05343026349, "learning_rate": 8.333605882477334e-09, "logits/chosen": 1.4883325099945068, "logits/rejected": 1.6947778463363647, "logps/chosen": -271.9356384277344, "logps/rejected": -284.2402648925781, "loss": 4524.1789, "rewards/accuracies": 0.550000011920929, "rewards/chosen": 0.011871537193655968, "rewards/margins": 0.041742850095033646, "rewards/rejected": -0.02987130917608738, "step": 1815 }, { "debug/policy_chosen_logits": 0.9994763135910034, "debug/policy_chosen_logps": -232.7861785888672, "debug/policy_rejected_logits": 1.3966903686523438, "debug/policy_rejected_logps": -286.7142639160156, "debug/reference_chosen_logps": -235.10836791992188, "debug/reference_rejected_logps": -282.6846008300781, "debug/sppo_chosen_loss": 2292.59033203125, "debug/sppo_chosen_reward_in_loss": 2.3221702575683594, "debug/sppo_rej_reward_in_loss": -4.029662609100342, "debug/sppo_reject_loss": 2255.48974609375, "epoch": 6.594202898550725, "grad_norm": 67555.36702872877, "learning_rate": 8.128810292139726e-09, "logits/chosen": 0.9994763135910034, "logits/rejected": 1.3966903686523438, "logps/chosen": -232.7861785888672, "logps/rejected": -286.7142639160156, "loss": 4530.3629, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 0.023221701383590698, "rewards/margins": 0.06351832300424576, "rewards/rejected": -0.04029662907123566, "step": 1820 }, { "debug/policy_chosen_logits": 0.9176052808761597, "debug/policy_chosen_logps": -236.9879913330078, "debug/policy_rejected_logits": 1.3363244533538818, "debug/policy_rejected_logps": -287.5779724121094, "debug/reference_chosen_logps": -238.054443359375, "debug/reference_rejected_logps": -277.24664306640625, "debug/sppo_chosen_loss": 2428.644775390625, "debug/sppo_chosen_reward_in_loss": 1.0664472579956055, "debug/sppo_rej_reward_in_loss": -10.331335067749023, "debug/sppo_reject_loss": 1826.340576171875, "epoch": 6.61231884057971, "grad_norm": 69235.59804149473, "learning_rate": 7.926339642245555e-09, "logits/chosen": 0.9176052808761597, "logits/rejected": 1.3363244533538818, "logps/chosen": -236.9879913330078, "logps/rejected": -287.5779724121094, "loss": 4397.3313, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": 0.010664473287761211, "rewards/margins": 0.113977812230587, "rewards/rejected": -0.10331334918737411, "step": 1825 }, { "debug/policy_chosen_logits": 1.2249447107315063, "debug/policy_chosen_logps": -242.8837432861328, "debug/policy_rejected_logits": 1.552274465560913, "debug/policy_rejected_logps": -280.9314880371094, "debug/reference_chosen_logps": -245.7642822265625, "debug/reference_rejected_logps": -274.960693359375, "debug/sppo_chosen_loss": 2240.804443359375, "debug/sppo_chosen_reward_in_loss": 2.880551815032959, "debug/sppo_rej_reward_in_loss": -5.97078275680542, "debug/sppo_reject_loss": 2090.081298828125, "epoch": 6.630434782608695, "grad_norm": 69381.34153206859, "learning_rate": 7.726205175183837e-09, "logits/chosen": 1.2249447107315063, "logits/rejected": 1.552274465560913, "logps/chosen": -242.8837432861328, "logps/rejected": -280.9314880371094, "loss": 4404.8367, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": 0.02880551852285862, "rewards/margins": 0.08851335197687149, "rewards/rejected": -0.05970783159136772, "step": 1830 }, { "debug/policy_chosen_logits": 1.4106062650680542, "debug/policy_chosen_logps": -276.819091796875, "debug/policy_rejected_logits": 1.8558547496795654, "debug/policy_rejected_logps": -322.76226806640625, "debug/reference_chosen_logps": -277.9795837402344, "debug/reference_rejected_logps": -316.0045471191406, "debug/sppo_chosen_loss": 2473.76416015625, "debug/sppo_chosen_reward_in_loss": 1.1604499816894531, "debug/sppo_rej_reward_in_loss": -6.757748603820801, "debug/sppo_reject_loss": 2023.562255859375, "epoch": 6.648550724637682, "grad_norm": 122245.8956762465, "learning_rate": 7.528418003624632e-09, "logits/chosen": 1.4106062650680542, "logits/rejected": 1.8558547496795654, "logps/chosen": -276.819091796875, "logps/rejected": -322.76226806640625, "loss": 4622.6406, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 0.011604499071836472, "rewards/margins": 0.0791819766163826, "rewards/rejected": -0.06757748126983643, "step": 1835 }, { "debug/policy_chosen_logits": 1.3023555278778076, "debug/policy_chosen_logps": -254.7684326171875, "debug/policy_rejected_logits": 1.4576895236968994, "debug/policy_rejected_logps": -289.526123046875, "debug/reference_chosen_logps": -258.4832458496094, "debug/reference_rejected_logps": -281.6604309082031, "debug/sppo_chosen_loss": 2154.77294921875, "debug/sppo_chosen_reward_in_loss": 3.7148489952087402, "debug/sppo_rej_reward_in_loss": -7.865678310394287, "debug/sppo_reject_loss": 1972.273681640625, "epoch": 6.666666666666667, "grad_norm": 68577.4154292618, "learning_rate": 7.332989109902027e-09, "logits/chosen": 1.3023555278778076, "logits/rejected": 1.4576895236968994, "logps/chosen": -254.7684326171875, "logps/rejected": -289.526123046875, "loss": 4562.9859, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": 0.03714849054813385, "rewards/margins": 0.11580528318881989, "rewards/rejected": -0.07865677773952484, "step": 1840 }, { "debug/policy_chosen_logits": 1.1200048923492432, "debug/policy_chosen_logps": -264.84515380859375, "debug/policy_rejected_logits": 1.1008622646331787, "debug/policy_rejected_logps": -254.68212890625, "debug/reference_chosen_logps": -264.2433166503906, "debug/reference_rejected_logps": -252.0456085205078, "debug/sppo_chosen_loss": 2682.3505859375, "debug/sppo_chosen_reward_in_loss": -0.6018713116645813, "debug/sppo_rej_reward_in_loss": -2.6365625858306885, "debug/sppo_reject_loss": 2311.68212890625, "epoch": 6.684782608695652, "grad_norm": 81734.77191589169, "learning_rate": 7.139929345404355e-09, "logits/chosen": 1.1200048923492432, "logits/rejected": 1.1008622646331787, "logps/chosen": -264.84515380859375, "logps/rejected": -254.68212890625, "loss": 4443.2492, "rewards/accuracies": 0.625, "rewards/chosen": -0.006018712185323238, "rewards/margins": 0.020346911624073982, "rewards/rejected": -0.026365626603364944, "step": 1845 }, { "debug/policy_chosen_logits": 1.310903787612915, "debug/policy_chosen_logps": -277.4835205078125, "debug/policy_rejected_logits": 1.4697941541671753, "debug/policy_rejected_logps": -327.77044677734375, "debug/reference_chosen_logps": -276.53717041015625, "debug/reference_rejected_logps": -322.140380859375, "debug/sppo_chosen_loss": 2686.08984375, "debug/sppo_chosen_reward_in_loss": -0.9463611841201782, "debug/sppo_rej_reward_in_loss": -5.630080223083496, "debug/sppo_reject_loss": 2116.522705078125, "epoch": 6.702898550724638, "grad_norm": 77714.10615841726, "learning_rate": 6.94924942997161e-09, "logits/chosen": 1.310903787612915, "logits/rejected": 1.4697941541671753, "logps/chosen": -277.4835205078125, "logps/rejected": -327.77044677734375, "loss": 4527.9828, "rewards/accuracies": 0.625, "rewards/chosen": -0.00946361105889082, "rewards/margins": 0.04683718457818031, "rewards/rejected": -0.056300800293684006, "step": 1850 }, { "debug/policy_chosen_logits": 1.0288848876953125, "debug/policy_chosen_logps": -243.25289916992188, "debug/policy_rejected_logits": 1.0793625116348267, "debug/policy_rejected_logps": -290.67620849609375, "debug/reference_chosen_logps": -245.24673461914062, "debug/reference_rejected_logps": -287.10443115234375, "debug/sppo_chosen_loss": 2362.96923828125, "debug/sppo_chosen_reward_in_loss": 1.9938185214996338, "debug/sppo_rej_reward_in_loss": -3.5717475414276123, "debug/sppo_reject_loss": 2288.716552734375, "epoch": 6.721014492753623, "grad_norm": 67099.74986461057, "learning_rate": 6.760959951300266e-09, "logits/chosen": 1.0288848876953125, "logits/rejected": 1.0793625116348267, "logps/chosen": -243.25289916992188, "logps/rejected": -290.67620849609375, "loss": 4631.593, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": 0.019938183948397636, "rewards/margins": 0.05565565824508667, "rewards/rejected": -0.03571747615933418, "step": 1855 }, { "debug/policy_chosen_logits": 1.067305326461792, "debug/policy_chosen_logps": -246.3300323486328, "debug/policy_rejected_logits": 1.253506064414978, "debug/policy_rejected_logps": -260.63360595703125, "debug/reference_chosen_logps": -247.5150146484375, "debug/reference_rejected_logps": -256.20477294921875, "debug/sppo_chosen_loss": 2444.710693359375, "debug/sppo_chosen_reward_in_loss": 1.1849710941314697, "debug/sppo_rej_reward_in_loss": -4.428830146789551, "debug/sppo_reject_loss": 2232.339111328125, "epoch": 6.739130434782608, "grad_norm": 77074.98043159636, "learning_rate": 6.575071364355334e-09, "logits/chosen": 1.067305326461792, "logits/rejected": 1.253506064414978, "logps/chosen": -246.3300323486328, "logps/rejected": -260.63360595703125, "loss": 4501.8156, "rewards/accuracies": 0.675000011920929, "rewards/chosen": 0.01184971071779728, "rewards/margins": 0.056138016283512115, "rewards/rejected": -0.04428829625248909, "step": 1860 }, { "debug/policy_chosen_logits": 1.2634141445159912, "debug/policy_chosen_logps": -268.0980529785156, "debug/policy_rejected_logits": 1.4198873043060303, "debug/policy_rejected_logps": -294.7967834472656, "debug/reference_chosen_logps": -270.0035705566406, "debug/reference_rejected_logps": -285.4292907714844, "debug/sppo_chosen_loss": 2364.331298828125, "debug/sppo_chosen_reward_in_loss": 1.9055078029632568, "debug/sppo_rej_reward_in_loss": -9.367478370666504, "debug/sppo_reject_loss": 1887.01953125, "epoch": 6.757246376811594, "grad_norm": 95721.87361269009, "learning_rate": 6.3915939907899005e-09, "logits/chosen": 1.2634141445159912, "logits/rejected": 1.4198873043060303, "logps/chosen": -268.0980529785156, "logps/rejected": -294.7967834472656, "loss": 4398.0863, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": 0.01905507780611515, "rewards/margins": 0.11272986233234406, "rewards/rejected": -0.09367477893829346, "step": 1865 }, { "debug/policy_chosen_logits": 1.2589980363845825, "debug/policy_chosen_logps": -256.4156799316406, "debug/policy_rejected_logits": 1.69875967502594, "debug/policy_rejected_logps": -319.1440734863281, "debug/reference_chosen_logps": -257.7935485839844, "debug/reference_rejected_logps": -313.9791564941406, "debug/sppo_chosen_loss": 2404.74658203125, "debug/sppo_chosen_reward_in_loss": 1.3778616189956665, "debug/sppo_rej_reward_in_loss": -5.164914608001709, "debug/sppo_reject_loss": 2155.24169921875, "epoch": 6.77536231884058, "grad_norm": 81792.11649399805, "learning_rate": 6.210538018371947e-09, "logits/chosen": 1.2589980363845825, "logits/rejected": 1.69875967502594, "logps/chosen": -256.4156799316406, "logps/rejected": -319.1440734863281, "loss": 4512.4551, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": 0.013778614811599255, "rewards/margins": 0.065427765250206, "rewards/rejected": -0.051649145781993866, "step": 1870 }, { "debug/policy_chosen_logits": 1.120521068572998, "debug/policy_chosen_logps": -267.41448974609375, "debug/policy_rejected_logits": 1.344630241394043, "debug/policy_rejected_logps": -279.906005859375, "debug/reference_chosen_logps": -268.8659362792969, "debug/reference_rejected_logps": -275.41595458984375, "debug/sppo_chosen_loss": 2384.952392578125, "debug/sppo_chosen_reward_in_loss": 1.451454520225525, "debug/sppo_rej_reward_in_loss": -4.490046501159668, "debug/sppo_reject_loss": 2182.372802734375, "epoch": 6.793478260869565, "grad_norm": 103845.9387896819, "learning_rate": 6.031913500418706e-09, "logits/chosen": 1.120521068572998, "logits/rejected": 1.344630241394043, "logps/chosen": -267.41448974609375, "logps/rejected": -279.906005859375, "loss": 4566.6125, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": 0.014514544978737831, "rewards/margins": 0.05941500514745712, "rewards/rejected": -0.04490046575665474, "step": 1875 }, { "debug/policy_chosen_logits": 1.3000844717025757, "debug/policy_chosen_logps": -271.90667724609375, "debug/policy_rejected_logits": 1.4914687871932983, "debug/policy_rejected_logps": -324.59991455078125, "debug/reference_chosen_logps": -273.9796142578125, "debug/reference_rejected_logps": -318.5680847167969, "debug/sppo_chosen_loss": 2341.97265625, "debug/sppo_chosen_reward_in_loss": 2.0729565620422363, "debug/sppo_rej_reward_in_loss": -6.031794548034668, "debug/sppo_reject_loss": 2074.671875, "epoch": 6.811594202898551, "grad_norm": 67645.82685745248, "learning_rate": 5.855730355238414e-09, "logits/chosen": 1.3000844717025757, "logits/rejected": 1.4914687871932983, "logps/chosen": -271.90667724609375, "logps/rejected": -324.59991455078125, "loss": 4528.9207, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 0.020729564130306244, "rewards/margins": 0.08104751259088516, "rewards/rejected": -0.06031794473528862, "step": 1880 }, { "debug/policy_chosen_logits": 1.1996022462844849, "debug/policy_chosen_logps": -247.4275360107422, "debug/policy_rejected_logits": 1.309090495109558, "debug/policy_rejected_logps": -274.24188232421875, "debug/reference_chosen_logps": -251.1371307373047, "debug/reference_rejected_logps": -267.4615478515625, "debug/sppo_chosen_loss": 2163.852783203125, "debug/sppo_chosen_reward_in_loss": 3.7095978260040283, "debug/sppo_rej_reward_in_loss": -6.780303001403809, "debug/sppo_reject_loss": 2039.903564453125, "epoch": 6.829710144927536, "grad_norm": 78775.22927602356, "learning_rate": 5.681998365579593e-09, "logits/chosen": 1.1996022462844849, "logits/rejected": 1.309090495109558, "logps/chosen": -247.4275360107422, "logps/rejected": -274.24188232421875, "loss": 4415.8047, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": 0.037095971405506134, "rewards/margins": 0.10489901155233383, "rewards/rejected": -0.0678030326962471, "step": 1885 }, { "debug/policy_chosen_logits": 1.2602416276931763, "debug/policy_chosen_logps": -237.3834228515625, "debug/policy_rejected_logits": 1.4726940393447876, "debug/policy_rejected_logps": -279.3681335449219, "debug/reference_chosen_logps": -239.3144073486328, "debug/reference_rejected_logps": -273.20574951171875, "debug/sppo_chosen_loss": 2332.995849609375, "debug/sppo_chosen_reward_in_loss": 1.931006669998169, "debug/sppo_rej_reward_in_loss": -6.162369728088379, "debug/sppo_reject_loss": 2098.0224609375, "epoch": 6.8478260869565215, "grad_norm": 63587.04625732706, "learning_rate": 5.5107271780878875e-09, "logits/chosen": 1.2602416276931763, "logits/rejected": 1.4726940393447876, "logps/chosen": -237.3834228515625, "logps/rejected": -279.3681335449219, "loss": 4382.6273, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": 0.01931006647646427, "rewards/margins": 0.08093376457691193, "rewards/rejected": -0.061623699963092804, "step": 1890 }, { "debug/policy_chosen_logits": 1.3107913732528687, "debug/policy_chosen_logps": -238.652099609375, "debug/policy_rejected_logits": 1.534700870513916, "debug/policy_rejected_logps": -300.97198486328125, "debug/reference_chosen_logps": -241.06076049804688, "debug/reference_rejected_logps": -292.98626708984375, "debug/sppo_chosen_loss": 2309.5693359375, "debug/sppo_chosen_reward_in_loss": 2.4086639881134033, "debug/sppo_rej_reward_in_loss": -7.98569393157959, "debug/sppo_reject_loss": 1979.941650390625, "epoch": 6.865942028985507, "grad_norm": 66804.78505725738, "learning_rate": 5.3419263027703665e-09, "logits/chosen": 1.3107913732528687, "logits/rejected": 1.534700870513916, "logps/chosen": -238.652099609375, "logps/rejected": -300.97198486328125, "loss": 4361.425, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 0.024086639285087585, "rewards/margins": 0.1039435863494873, "rewards/rejected": -0.07985694706439972, "step": 1895 }, { "debug/policy_chosen_logits": 1.34364652633667, "debug/policy_chosen_logps": -250.04843139648438, "debug/policy_rejected_logits": 1.5993636846542358, "debug/policy_rejected_logps": -282.2890319824219, "debug/reference_chosen_logps": -251.6648712158203, "debug/reference_rejected_logps": -276.7724609375, "debug/sppo_chosen_loss": 2384.414306640625, "debug/sppo_chosen_reward_in_loss": 1.6164261102676392, "debug/sppo_rej_reward_in_loss": -5.516600608825684, "debug/sppo_reject_loss": 2137.204345703125, "epoch": 6.884057971014493, "grad_norm": 86603.97993699458, "learning_rate": 5.175605112467529e-09, "logits/chosen": 1.34364652633667, "logits/rejected": 1.5993636846542358, "logps/chosen": -250.04843139648438, "logps/rejected": -282.2890319824219, "loss": 4682.7492, "rewards/accuracies": 0.625, "rewards/chosen": 0.016164259985089302, "rewards/margins": 0.07133026421070099, "rewards/rejected": -0.055166006088256836, "step": 1900 }, { "epoch": 6.884057971014493, "eval_debug/policy_chosen_logits": 1.3966501951217651, "eval_debug/policy_chosen_logps": -252.25791931152344, "eval_debug/policy_rejected_logits": 1.440749168395996, "eval_debug/policy_rejected_logps": -263.3143615722656, "eval_debug/reference_chosen_logps": -252.91845703125, "eval_debug/reference_rejected_logps": -259.6585998535156, "eval_debug/sppo_chosen_loss": 2507.00537109375, "eval_debug/sppo_chosen_reward_in_loss": 0.6605623960494995, "eval_debug/sppo_rej_reward_in_loss": -3.655715227127075, "eval_debug/sppo_reject_loss": 2307.52392578125, "eval_logits/chosen": 1.3966501951217651, "eval_logits/rejected": 1.440749168395996, "eval_logps/chosen": -252.25791931152344, "eval_logps/rejected": -263.3143615722656, "eval_loss": 4616.86865234375, "eval_rewards/accuracies": 0.5921052694320679, "eval_rewards/chosen": 0.006605625152587891, "eval_rewards/margins": 0.04316277801990509, "eval_rewards/rejected": -0.0365571528673172, "eval_runtime": 28.3163, "eval_samples_per_second": 21.189, "eval_steps_per_second": 0.671, "step": 1900 }, { "debug/policy_chosen_logits": 1.0572196245193481, "debug/policy_chosen_logps": -217.22860717773438, "debug/policy_rejected_logits": 1.4193629026412964, "debug/policy_rejected_logps": -282.8929443359375, "debug/reference_chosen_logps": -220.3137969970703, "debug/reference_rejected_logps": -278.0849914550781, "debug/sppo_chosen_loss": 2222.53515625, "debug/sppo_chosen_reward_in_loss": 3.085196018218994, "debug/sppo_rej_reward_in_loss": -4.807944297790527, "debug/sppo_reject_loss": 2180.68994140625, "epoch": 6.9021739130434785, "grad_norm": 60658.25859471416, "learning_rate": 5.011772842332812e-09, "logits/chosen": 1.0572196245193481, "logits/rejected": 1.4193629026412964, "logps/chosen": -217.22860717773438, "logps/rejected": -282.8929443359375, "loss": 4505.9348, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 0.03085196018218994, "rewards/margins": 0.07893140614032745, "rewards/rejected": -0.048079442232847214, "step": 1905 }, { "debug/policy_chosen_logits": 1.2437039613723755, "debug/policy_chosen_logps": -254.41989135742188, "debug/policy_rejected_logits": 1.584726095199585, "debug/policy_rejected_logps": -295.80438232421875, "debug/reference_chosen_logps": -255.3196258544922, "debug/reference_rejected_logps": -290.5936279296875, "debug/sppo_chosen_loss": 2462.65966796875, "debug/sppo_chosen_reward_in_loss": 0.8997413516044617, "debug/sppo_rej_reward_in_loss": -5.2107648849487305, "debug/sppo_reject_loss": 2128.06787109375, "epoch": 6.920289855072464, "grad_norm": 70172.35876578368, "learning_rate": 4.850438589319817e-09, "logits/chosen": 1.2437039613723755, "logits/rejected": 1.584726095199585, "logps/chosen": -254.41989135742188, "logps/rejected": -295.80438232421875, "loss": 4634.7203, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": 0.008997412398457527, "rewards/margins": 0.06110506132245064, "rewards/rejected": -0.05210765078663826, "step": 1910 }, { "debug/policy_chosen_logits": 0.9675963521003723, "debug/policy_chosen_logps": -234.487548828125, "debug/policy_rejected_logits": 1.2166943550109863, "debug/policy_rejected_logps": -280.05853271484375, "debug/reference_chosen_logps": -237.55307006835938, "debug/reference_rejected_logps": -274.2191467285156, "debug/sppo_chosen_loss": 2232.22705078125, "debug/sppo_chosen_reward_in_loss": 3.065514087677002, "debug/sppo_rej_reward_in_loss": -5.839382171630859, "debug/sppo_reject_loss": 2107.266845703125, "epoch": 6.938405797101449, "grad_norm": 67291.21445508106, "learning_rate": 4.691611311677252e-09, "logits/chosen": 0.9675963521003723, "logits/rejected": 1.2166943550109863, "logps/chosen": -234.487548828125, "logps/rejected": -280.05853271484375, "loss": 4591.3695, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": 0.030655140057206154, "rewards/margins": 0.08904895931482315, "rewards/rejected": -0.05839381739497185, "step": 1915 }, { "debug/policy_chosen_logits": 1.2087125778198242, "debug/policy_chosen_logps": -288.37445068359375, "debug/policy_rejected_logits": 1.162536382675171, "debug/policy_rejected_logps": -266.4842224121094, "debug/reference_chosen_logps": -291.00299072265625, "debug/reference_rejected_logps": -263.9441223144531, "debug/sppo_chosen_loss": 2261.277099609375, "debug/sppo_chosen_reward_in_loss": 2.6285316944122314, "debug/sppo_rej_reward_in_loss": -2.540130376815796, "debug/sppo_reject_loss": 2301.51220703125, "epoch": 6.956521739130435, "grad_norm": 112826.91250668064, "learning_rate": 4.5352998284514e-09, "logits/chosen": 1.2087125778198242, "logits/rejected": 1.162536382675171, "logps/chosen": -288.37445068359375, "logps/rejected": -266.4842224121094, "loss": 4529.3742, "rewards/accuracies": 0.75, "rewards/chosen": 0.026285316795110703, "rewards/margins": 0.05168662220239639, "rewards/rejected": -0.02540130354464054, "step": 1920 }, { "debug/policy_chosen_logits": 1.392529845237732, "debug/policy_chosen_logps": -265.3560485839844, "debug/policy_rejected_logits": 1.7413336038589478, "debug/policy_rejected_logps": -323.5859375, "debug/reference_chosen_logps": -266.50811767578125, "debug/reference_rejected_logps": -318.15435791015625, "debug/sppo_chosen_loss": 2445.50390625, "debug/sppo_chosen_reward_in_loss": 1.1520637273788452, "debug/sppo_rej_reward_in_loss": -5.431580543518066, "debug/sppo_reject_loss": 2126.082763671875, "epoch": 6.97463768115942, "grad_norm": 84091.55009534623, "learning_rate": 4.381512818996564e-09, "logits/chosen": 1.392529845237732, "logits/rejected": 1.7413336038589478, "logps/chosen": -265.3560485839844, "logps/rejected": -323.5859375, "loss": 4485.8617, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": 0.011520637199282646, "rewards/margins": 0.06583644449710846, "rewards/rejected": -0.054315805435180664, "step": 1925 }, { "debug/policy_chosen_logits": 1.2974140644073486, "debug/policy_chosen_logps": -249.2262420654297, "debug/policy_rejected_logits": 1.7515869140625, "debug/policy_rejected_logps": -300.5926818847656, "debug/reference_chosen_logps": -250.64013671875, "debug/reference_rejected_logps": -294.30718994140625, "debug/sppo_chosen_loss": 2388.483642578125, "debug/sppo_chosen_reward_in_loss": 1.4139083623886108, "debug/sppo_rej_reward_in_loss": -6.2854905128479, "debug/sppo_reject_loss": 2088.600341796875, "epoch": 6.992753623188406, "grad_norm": 67447.51991794909, "learning_rate": 4.230258822492999e-09, "logits/chosen": 1.2974140644073486, "logits/rejected": 1.7515869140625, "logps/chosen": -249.2262420654297, "logps/rejected": -300.5926818847656, "loss": 4532.4992, "rewards/accuracies": 0.625, "rewards/chosen": 0.014139083214104176, "rewards/margins": 0.07699398696422577, "rewards/rejected": -0.06285490095615387, "step": 1930 }, { "debug/policy_chosen_logits": 0.9810554385185242, "debug/policy_chosen_logps": -248.074462890625, "debug/policy_rejected_logits": 1.1785837411880493, "debug/policy_rejected_logps": -261.30096435546875, "debug/reference_chosen_logps": -250.2155303955078, "debug/reference_rejected_logps": -256.85382080078125, "debug/sppo_chosen_loss": 2329.6708984375, "debug/sppo_chosen_reward_in_loss": 2.141080856323242, "debug/sppo_rej_reward_in_loss": -4.447126865386963, "debug/sppo_reject_loss": 2196.126708984375, "epoch": 7.010869565217392, "grad_norm": 99318.8145119645, "learning_rate": 4.08154623747291e-09, "logits/chosen": 0.9810554385185242, "logits/rejected": 1.1785837411880493, "logps/chosen": -248.074462890625, "logps/rejected": -261.30096435546875, "loss": 4433.907, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 0.021410807967185974, "rewards/margins": 0.06588207185268402, "rewards/rejected": -0.044471271336078644, "step": 1935 }, { "debug/policy_chosen_logits": 1.334702968597412, "debug/policy_chosen_logps": -269.0505676269531, "debug/policy_rejected_logits": 1.4345498085021973, "debug/policy_rejected_logps": -289.39788818359375, "debug/reference_chosen_logps": -270.40582275390625, "debug/reference_rejected_logps": -283.4798278808594, "debug/sppo_chosen_loss": 2434.93115234375, "debug/sppo_chosen_reward_in_loss": 1.3552671670913696, "debug/sppo_rej_reward_in_loss": -5.9180684089660645, "debug/sppo_reject_loss": 2116.15771484375, "epoch": 7.028985507246377, "grad_norm": 67999.63530305258, "learning_rate": 3.935383321353974e-09, "logits/chosen": 1.334702968597412, "logits/rejected": 1.4345498085021973, "logps/chosen": -269.0505676269531, "logps/rejected": -289.39788818359375, "loss": 4522.9328, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": 0.013552670367062092, "rewards/margins": 0.07273335009813309, "rewards/rejected": -0.05918068438768387, "step": 1940 }, { "debug/policy_chosen_logits": 1.1086900234222412, "debug/policy_chosen_logps": -287.78692626953125, "debug/policy_rejected_logits": 1.5081000328063965, "debug/policy_rejected_logps": -317.62725830078125, "debug/reference_chosen_logps": -288.9532165527344, "debug/reference_rejected_logps": -312.1133117675781, "debug/sppo_chosen_loss": 2440.03857421875, "debug/sppo_chosen_reward_in_loss": 1.1662803888320923, "debug/sppo_rej_reward_in_loss": -5.513950824737549, "debug/sppo_reject_loss": 2124.861328125, "epoch": 7.047101449275362, "grad_norm": 115917.52539654094, "learning_rate": 3.79177818998096e-09, "logits/chosen": 1.1086900234222412, "logits/rejected": 1.5081000328063965, "logps/chosen": -287.78692626953125, "logps/rejected": -317.62725830078125, "loss": 4435.0016, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 0.011662803590297699, "rewards/margins": 0.06680230796337128, "rewards/rejected": -0.05513950437307358, "step": 1945 }, { "debug/policy_chosen_logits": 1.0770022869110107, "debug/policy_chosen_logps": -236.9434356689453, "debug/policy_rejected_logits": 1.4237271547317505, "debug/policy_rejected_logps": -279.34930419921875, "debug/reference_chosen_logps": -237.2868194580078, "debug/reference_rejected_logps": -273.8456726074219, "debug/sppo_chosen_loss": 2557.1015625, "debug/sppo_chosen_reward_in_loss": 0.3433685302734375, "debug/sppo_rej_reward_in_loss": -5.503598213195801, "debug/sppo_reject_loss": 2134.468017578125, "epoch": 7.065217391304348, "grad_norm": 118959.58159264854, "learning_rate": 3.6507388171750085e-09, "logits/chosen": 1.0770022869110107, "logits/rejected": 1.4237271547317505, "logps/chosen": -236.9434356689453, "logps/rejected": -279.34930419921875, "loss": 4449.3938, "rewards/accuracies": 0.675000011920929, "rewards/chosen": 0.0034336864482611418, "rewards/margins": 0.05846966430544853, "rewards/rejected": -0.055035971105098724, "step": 1950 }, { "debug/policy_chosen_logits": 1.335742473602295, "debug/policy_chosen_logps": -261.5475158691406, "debug/policy_rejected_logits": 1.3126236200332642, "debug/policy_rejected_logps": -269.31817626953125, "debug/reference_chosen_logps": -263.124755859375, "debug/reference_rejected_logps": -263.43121337890625, "debug/sppo_chosen_loss": 2403.69775390625, "debug/sppo_chosen_reward_in_loss": 1.5772308111190796, "debug/sppo_rej_reward_in_loss": -5.886963844299316, "debug/sppo_reject_loss": 2160.141357421875, "epoch": 7.083333333333333, "grad_norm": 65251.16742240301, "learning_rate": 3.512273034290897e-09, "logits/chosen": 1.335742473602295, "logits/rejected": 1.3126236200332642, "logps/chosen": -261.5475158691406, "logps/rejected": -269.31817626953125, "loss": 4585.6008, "rewards/accuracies": 0.675000011920929, "rewards/chosen": 0.01577230915427208, "rewards/margins": 0.07464194297790527, "rewards/rejected": -0.05886963754892349, "step": 1955 }, { "debug/policy_chosen_logits": 1.319954752922058, "debug/policy_chosen_logps": -270.43988037109375, "debug/policy_rejected_logits": 1.3581571578979492, "debug/policy_rejected_logps": -277.28070068359375, "debug/reference_chosen_logps": -272.287353515625, "debug/reference_rejected_logps": -269.27130126953125, "debug/sppo_chosen_loss": 2363.688720703125, "debug/sppo_chosen_reward_in_loss": 1.8474743366241455, "debug/sppo_rej_reward_in_loss": -8.009401321411133, "debug/sppo_reject_loss": 1959.2318115234375, "epoch": 7.101449275362318, "grad_norm": 79167.78203604883, "learning_rate": 3.376388529782215e-09, "logits/chosen": 1.319954752922058, "logits/rejected": 1.3581571578979492, "logps/chosen": -270.43988037109375, "logps/rejected": -277.28070068359375, "loss": 4485.6289, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 0.018474742770195007, "rewards/margins": 0.09856875240802765, "rewards/rejected": -0.08009400218725204, "step": 1960 }, { "debug/policy_chosen_logits": 1.2650543451309204, "debug/policy_chosen_logps": -278.1651611328125, "debug/policy_rejected_logits": 1.6884880065917969, "debug/policy_rejected_logps": -295.658447265625, "debug/reference_chosen_logps": -280.9836730957031, "debug/reference_rejected_logps": -293.024658203125, "debug/sppo_chosen_loss": 2284.28515625, "debug/sppo_chosen_reward_in_loss": 2.8184852600097656, "debug/sppo_rej_reward_in_loss": -2.6337947845458984, "debug/sppo_reject_loss": 2347.244873046875, "epoch": 7.119565217391305, "grad_norm": 79959.42704998671, "learning_rate": 3.243092848774437e-09, "logits/chosen": 1.2650543451309204, "logits/rejected": 1.6884880065917969, "logps/chosen": -278.1651611328125, "logps/rejected": -295.658447265625, "loss": 4554.273, "rewards/accuracies": 0.75, "rewards/chosen": 0.02818485163152218, "rewards/margins": 0.0545228011906147, "rewards/rejected": -0.026337945833802223, "step": 1965 }, { "debug/policy_chosen_logits": 0.8446614146232605, "debug/policy_chosen_logps": -239.51779174804688, "debug/policy_rejected_logits": 1.3469539880752563, "debug/policy_rejected_logps": -299.9848937988281, "debug/reference_chosen_logps": -239.8334503173828, "debug/reference_rejected_logps": -297.77490234375, "debug/sppo_chosen_loss": 2562.301513671875, "debug/sppo_chosen_reward_in_loss": 0.31566277146339417, "debug/sppo_rej_reward_in_loss": -2.2100167274475098, "debug/sppo_reject_loss": 2351.84765625, "epoch": 7.13768115942029, "grad_norm": 68724.14540415957, "learning_rate": 3.1123933926459844e-09, "logits/chosen": 0.8446614146232605, "logits/rejected": 1.3469539880752563, "logps/chosen": -239.51779174804688, "logps/rejected": -299.9848937988281, "loss": 4567.302, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": 0.0031566284596920013, "rewards/margins": 0.025256793946027756, "rewards/rejected": -0.022100165486335754, "step": 1970 }, { "debug/policy_chosen_logits": 1.4603184461593628, "debug/policy_chosen_logps": -259.0393371582031, "debug/policy_rejected_logits": 1.4730253219604492, "debug/policy_rejected_logps": -273.2935485839844, "debug/reference_chosen_logps": -260.7829284667969, "debug/reference_rejected_logps": -267.289794921875, "debug/sppo_chosen_loss": 2366.275146484375, "debug/sppo_chosen_reward_in_loss": 1.7436144351959229, "debug/sppo_rej_reward_in_loss": -6.003744602203369, "debug/sppo_reject_loss": 2106.14306640625, "epoch": 7.155797101449275, "grad_norm": 91343.06106383547, "learning_rate": 2.9842974186172264e-09, "logits/chosen": 1.4603184461593628, "logits/rejected": 1.4730253219604492, "logps/chosen": -259.0393371582031, "logps/rejected": -273.2935485839844, "loss": 4616.1953, "rewards/accuracies": 0.75, "rewards/chosen": 0.01743614301085472, "rewards/margins": 0.07747358083724976, "rewards/rejected": -0.06003744527697563, "step": 1975 }, { "debug/policy_chosen_logits": 1.137643575668335, "debug/policy_chosen_logps": -253.3531036376953, "debug/policy_rejected_logits": 1.4637935161590576, "debug/policy_rejected_logps": -280.31024169921875, "debug/reference_chosen_logps": -254.67086791992188, "debug/reference_rejected_logps": -273.9142150878906, "debug/sppo_chosen_loss": 2418.249755859375, "debug/sppo_chosen_reward_in_loss": 1.317787766456604, "debug/sppo_rej_reward_in_loss": -6.3960418701171875, "debug/sppo_reject_loss": 2044.1129150390625, "epoch": 7.173913043478261, "grad_norm": 68567.35829314639, "learning_rate": 2.8588120393475745e-09, "logits/chosen": 1.137643575668335, "logits/rejected": 1.4637935161590576, "logps/chosen": -253.3531036376953, "logps/rejected": -280.31024169921875, "loss": 4562.6047, "rewards/accuracies": 0.75, "rewards/chosen": 0.013177876360714436, "rewards/margins": 0.07713828980922699, "rewards/rejected": -0.06396041810512543, "step": 1980 }, { "debug/policy_chosen_logits": 1.3646475076675415, "debug/policy_chosen_logps": -257.977294921875, "debug/policy_rejected_logits": 1.3902546167373657, "debug/policy_rejected_logps": -273.2683410644531, "debug/reference_chosen_logps": -258.584716796875, "debug/reference_rejected_logps": -268.5025634765625, "debug/sppo_chosen_loss": 2510.2548828125, "debug/sppo_chosen_reward_in_loss": 0.6074390411376953, "debug/sppo_rej_reward_in_loss": -4.765748023986816, "debug/sppo_reject_loss": 2176.346435546875, "epoch": 7.192028985507246, "grad_norm": 54856.54889090561, "learning_rate": 2.7359442225404815e-09, "logits/chosen": 1.3646475076675415, "logits/rejected": 1.3902546167373657, "logps/chosen": -257.977294921875, "logps/rejected": -273.2683410644531, "loss": 4462.4723, "rewards/accuracies": 0.675000011920929, "rewards/chosen": 0.006074388977140188, "rewards/margins": 0.053731877356767654, "rewards/rejected": -0.04765748232603073, "step": 1985 }, { "debug/policy_chosen_logits": 0.9074499011039734, "debug/policy_chosen_logps": -222.3174285888672, "debug/policy_rejected_logits": 1.4654042720794678, "debug/policy_rejected_logps": -317.33892822265625, "debug/reference_chosen_logps": -224.1483154296875, "debug/reference_rejected_logps": -308.30426025390625, "debug/sppo_chosen_loss": 2361.79150390625, "debug/sppo_chosen_reward_in_loss": 1.8308719396591187, "debug/sppo_rej_reward_in_loss": -9.034707069396973, "debug/sppo_reject_loss": 1857.283447265625, "epoch": 7.2101449275362315, "grad_norm": 65083.44357543864, "learning_rate": 2.615700790556569e-09, "logits/chosen": 0.9074499011039734, "logits/rejected": 1.4654042720794678, "logps/chosen": -222.3174285888672, "logps/rejected": -317.33892822265625, "loss": 4340.7336, "rewards/accuracies": 0.824999988079071, "rewards/chosen": 0.018308719620108604, "rewards/margins": 0.10865578800439835, "rewards/rejected": -0.09034706652164459, "step": 1990 }, { "debug/policy_chosen_logits": 1.3254536390304565, "debug/policy_chosen_logps": -279.67266845703125, "debug/policy_rejected_logits": 1.475126028060913, "debug/policy_rejected_logps": -297.07989501953125, "debug/reference_chosen_logps": -281.09307861328125, "debug/reference_rejected_logps": -290.8413391113281, "debug/sppo_chosen_loss": 2406.08935546875, "debug/sppo_chosen_reward_in_loss": 1.4204126596450806, "debug/sppo_rej_reward_in_loss": -6.238560676574707, "debug/sppo_reject_loss": 2045.1383056640625, "epoch": 7.228260869565218, "grad_norm": 63741.71873725913, "learning_rate": 2.498088420034855e-09, "logits/chosen": 1.3254536390304565, "logits/rejected": 1.475126028060913, "logps/chosen": -279.67266845703125, "logps/rejected": -297.07989501953125, "loss": 4379.7883, "rewards/accuracies": 0.824999988079071, "rewards/chosen": 0.01420412678271532, "rewards/margins": 0.07658973336219788, "rewards/rejected": -0.06238560751080513, "step": 1995 }, { "debug/policy_chosen_logits": 1.1463087797164917, "debug/policy_chosen_logps": -257.5396423339844, "debug/policy_rejected_logits": 1.3835389614105225, "debug/policy_rejected_logps": -300.2962646484375, "debug/reference_chosen_logps": -260.25921630859375, "debug/reference_rejected_logps": -291.9169921875, "debug/sppo_chosen_loss": 2253.771484375, "debug/sppo_chosen_reward_in_loss": 2.719552516937256, "debug/sppo_rej_reward_in_loss": -8.379258155822754, "debug/sppo_reject_loss": 1892.1109619140625, "epoch": 7.246376811594203, "grad_norm": 74910.59054714411, "learning_rate": 2.3831136415219554e-09, "logits/chosen": 1.1463087797164917, "logits/rejected": 1.3835389614105225, "logps/chosen": -257.5396423339844, "logps/rejected": -300.2962646484375, "loss": 4486.1707, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 0.02719552256166935, "rewards/margins": 0.11098810285329819, "rewards/rejected": -0.08379258215427399, "step": 2000 }, { "epoch": 7.246376811594203, "eval_debug/policy_chosen_logits": 1.3931907415390015, "eval_debug/policy_chosen_logps": -252.2974853515625, "eval_debug/policy_rejected_logits": 1.4378267526626587, "eval_debug/policy_rejected_logps": -263.42547607421875, "eval_debug/reference_chosen_logps": -252.91845703125, "eval_debug/reference_rejected_logps": -259.6585998535156, "eval_debug/sppo_chosen_loss": 2509.96337890625, "eval_debug/sppo_chosen_reward_in_loss": 0.6209712028503418, "eval_debug/sppo_rej_reward_in_loss": -3.7668421268463135, "eval_debug/sppo_reject_loss": 2298.52587890625, "eval_logits/chosen": 1.3931907415390015, "eval_logits/rejected": 1.4378267526626587, "eval_logps/chosen": -252.2974853515625, "eval_logps/rejected": -263.42547607421875, "eval_loss": 4616.38916015625, "eval_rewards/accuracies": 0.5789473652839661, "eval_rewards/chosen": 0.006209712475538254, "eval_rewards/margins": 0.04387813061475754, "eval_rewards/rejected": -0.03766842186450958, "eval_runtime": 28.3323, "eval_samples_per_second": 21.177, "eval_steps_per_second": 0.671, "step": 2000 }, { "debug/policy_chosen_logits": 1.070395827293396, "debug/policy_chosen_logps": -249.1089324951172, "debug/policy_rejected_logits": 1.4063167572021484, "debug/policy_rejected_logps": -281.3536376953125, "debug/reference_chosen_logps": -250.95486450195312, "debug/reference_rejected_logps": -275.25604248046875, "debug/sppo_chosen_loss": 2398.890869140625, "debug/sppo_chosen_reward_in_loss": 1.8459268808364868, "debug/sppo_rej_reward_in_loss": -6.097577095031738, "debug/sppo_reject_loss": 2119.35888671875, "epoch": 7.2644927536231885, "grad_norm": 78997.13816690067, "learning_rate": 2.2707828391095307e-09, "logits/chosen": 1.070395827293396, "logits/rejected": 1.4063167572021484, "logps/chosen": -249.1089324951172, "logps/rejected": -281.3536376953125, "loss": 4406.457, "rewards/accuracies": 0.75, "rewards/chosen": 0.018459269776940346, "rewards/margins": 0.07943503558635712, "rewards/rejected": -0.06097576022148132, "step": 2005 }, { "debug/policy_chosen_logits": 1.077000617980957, "debug/policy_chosen_logps": -255.35476684570312, "debug/policy_rejected_logits": 1.4080921411514282, "debug/policy_rejected_logps": -269.4011535644531, "debug/reference_chosen_logps": -256.257080078125, "debug/reference_rejected_logps": -267.22894287109375, "debug/sppo_chosen_loss": 2449.634765625, "debug/sppo_chosen_reward_in_loss": 0.9023283123970032, "debug/sppo_rej_reward_in_loss": -2.1722145080566406, "debug/sppo_reject_loss": 2369.26220703125, "epoch": 7.282608695652174, "grad_norm": 64792.2688143268, "learning_rate": 2.1611022500797495e-09, "logits/chosen": 1.077000617980957, "logits/rejected": 1.4080921411514282, "logps/chosen": -255.35476684570312, "logps/rejected": -269.4011535644531, "loss": 4526.691, "rewards/accuracies": 0.625, "rewards/chosen": 0.009023282676935196, "rewards/margins": 0.030745428055524826, "rewards/rejected": -0.02172214351594448, "step": 2010 }, { "debug/policy_chosen_logits": 1.070713758468628, "debug/policy_chosen_logps": -256.7032775878906, "debug/policy_rejected_logits": 1.3287959098815918, "debug/policy_rejected_logps": -307.5129089355469, "debug/reference_chosen_logps": -256.99139404296875, "debug/reference_rejected_logps": -301.73614501953125, "debug/sppo_chosen_loss": 2563.2890625, "debug/sppo_chosen_reward_in_loss": 0.28814584016799927, "debug/sppo_rej_reward_in_loss": -5.776768684387207, "debug/sppo_reject_loss": 2124.537353515625, "epoch": 7.300724637681159, "grad_norm": 73436.59483542369, "learning_rate": 2.0540779645590146e-09, "logits/chosen": 1.070713758468628, "logits/rejected": 1.3287959098815918, "logps/chosen": -256.7032775878906, "logps/rejected": -307.5129089355469, "loss": 4503.1719, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": 0.002881459193304181, "rewards/margins": 0.060649145394563675, "rewards/rejected": -0.05776768922805786, "step": 2015 }, { "debug/policy_chosen_logits": 1.38528311252594, "debug/policy_chosen_logps": -259.69317626953125, "debug/policy_rejected_logits": 1.6116775274276733, "debug/policy_rejected_logps": -283.51806640625, "debug/reference_chosen_logps": -261.0834045410156, "debug/reference_rejected_logps": -275.83831787109375, "debug/sppo_chosen_loss": 2445.71875, "debug/sppo_chosen_reward_in_loss": 1.3902076482772827, "debug/sppo_rej_reward_in_loss": -7.679726600646973, "debug/sppo_reject_loss": 1981.787109375, "epoch": 7.318840579710145, "grad_norm": 120383.63503334566, "learning_rate": 1.9497159251797514e-09, "logits/chosen": 1.38528311252594, "logits/rejected": 1.6116775274276733, "logps/chosen": -259.69317626953125, "logps/rejected": -283.51806640625, "loss": 4654.2121, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": 0.013902077451348305, "rewards/margins": 0.09069932997226715, "rewards/rejected": -0.07679726183414459, "step": 2020 }, { "debug/policy_chosen_logits": 1.0787150859832764, "debug/policy_chosen_logps": -250.2095184326172, "debug/policy_rejected_logits": 1.3025166988372803, "debug/policy_rejected_logps": -289.91925048828125, "debug/reference_chosen_logps": -253.705322265625, "debug/reference_rejected_logps": -283.3659973144531, "debug/sppo_chosen_loss": 2171.8974609375, "debug/sppo_chosen_reward_in_loss": 3.495814800262451, "debug/sppo_rej_reward_in_loss": -6.55324649810791, "debug/sppo_reject_loss": 1995.900390625, "epoch": 7.336956521739131, "grad_norm": 84149.40866936331, "learning_rate": 1.8480219267504537e-09, "logits/chosen": 1.0787150859832764, "logits/rejected": 1.3025166988372803, "logps/chosen": -250.2095184326172, "logps/rejected": -289.91925048828125, "loss": 4416.7891, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": 0.03495814651250839, "rewards/margins": 0.10049059242010117, "rewards/rejected": -0.06553246080875397, "step": 2025 }, { "debug/policy_chosen_logits": 1.3572648763656616, "debug/policy_chosen_logps": -261.5050048828125, "debug/policy_rejected_logits": 1.3145654201507568, "debug/policy_rejected_logps": -268.80584716796875, "debug/reference_chosen_logps": -263.184814453125, "debug/reference_rejected_logps": -266.7096252441406, "debug/sppo_chosen_loss": 2395.189208984375, "debug/sppo_chosen_reward_in_loss": 1.6798057556152344, "debug/sppo_rej_reward_in_loss": -2.0962014198303223, "debug/sppo_reject_loss": 2409.43310546875, "epoch": 7.355072463768116, "grad_norm": 65808.77111899166, "learning_rate": 1.7490016159339482e-09, "logits/chosen": 1.3572648763656616, "logits/rejected": 1.3145654201507568, "logps/chosen": -261.5050048828125, "logps/rejected": -268.80584716796875, "loss": 4628.1609, "rewards/accuracies": 0.574999988079071, "rewards/chosen": 0.016798056662082672, "rewards/margins": 0.03776007145643234, "rewards/rejected": -0.02096201293170452, "step": 2030 }, { "debug/policy_chosen_logits": 1.0232479572296143, "debug/policy_chosen_logps": -248.12710571289062, "debug/policy_rejected_logits": 1.5908982753753662, "debug/policy_rejected_logps": -293.07110595703125, "debug/reference_chosen_logps": -248.68417358398438, "debug/reference_rejected_logps": -287.56488037109375, "debug/sppo_chosen_loss": 2500.882080078125, "debug/sppo_chosen_reward_in_loss": 0.5570594668388367, "debug/sppo_rej_reward_in_loss": -5.506226062774658, "debug/sppo_reject_loss": 2167.27783203125, "epoch": 7.3731884057971016, "grad_norm": 70769.16312278254, "learning_rate": 1.6526604909338049e-09, "logits/chosen": 1.0232479572296143, "logits/rejected": 1.5908982753753662, "logps/chosen": -248.12710571289062, "logps/rejected": -293.07110595703125, "loss": 4486.2609, "rewards/accuracies": 0.675000011920929, "rewards/chosen": 0.0055705951526761055, "rewards/margins": 0.0606328621506691, "rewards/rejected": -0.05506226420402527, "step": 2035 }, { "debug/policy_chosen_logits": 0.9465106129646301, "debug/policy_chosen_logps": -268.14422607421875, "debug/policy_rejected_logits": 1.3999744653701782, "debug/policy_rejected_logps": -315.14910888671875, "debug/reference_chosen_logps": -269.544921875, "debug/reference_rejected_logps": -309.2588806152344, "debug/sppo_chosen_loss": 2454.192138671875, "debug/sppo_chosen_reward_in_loss": 1.400636911392212, "debug/sppo_rej_reward_in_loss": -5.890233993530273, "debug/sppo_reject_loss": 2120.572998046875, "epoch": 7.391304347826087, "grad_norm": 86036.02978408146, "learning_rate": 1.5590039011890987e-09, "logits/chosen": 0.9465106129646301, "logits/rejected": 1.3999744653701782, "logps/chosen": -268.14422607421875, "logps/rejected": -315.14910888671875, "loss": 4478.875, "rewards/accuracies": 0.75, "rewards/chosen": 0.014006366953253746, "rewards/margins": 0.07290870696306229, "rewards/rejected": -0.05890233442187309, "step": 2040 }, { "debug/policy_chosen_logits": 1.4603159427642822, "debug/policy_chosen_logps": -274.74029541015625, "debug/policy_rejected_logits": 1.5231013298034668, "debug/policy_rejected_logps": -305.5496520996094, "debug/reference_chosen_logps": -276.53521728515625, "debug/reference_rejected_logps": -299.4794921875, "debug/sppo_chosen_loss": 2367.659912109375, "debug/sppo_chosen_reward_in_loss": 1.794926643371582, "debug/sppo_rej_reward_in_loss": -6.070174217224121, "debug/sppo_reject_loss": 2107.4912109375, "epoch": 7.409420289855072, "grad_norm": 65704.93037044462, "learning_rate": 1.4680370470773251e-09, "logits/chosen": 1.4603159427642822, "logits/rejected": 1.5231013298034668, "logps/chosen": -274.74029541015625, "logps/rejected": -305.5496520996094, "loss": 4405.3949, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": 0.017949264496564865, "rewards/margins": 0.07865099608898163, "rewards/rejected": -0.06070173904299736, "step": 2045 }, { "debug/policy_chosen_logits": 1.3314096927642822, "debug/policy_chosen_logps": -257.7485046386719, "debug/policy_rejected_logits": 1.426235556602478, "debug/policy_rejected_logps": -296.75225830078125, "debug/reference_chosen_logps": -261.3354797363281, "debug/reference_rejected_logps": -290.55609130859375, "debug/sppo_chosen_loss": 2178.118408203125, "debug/sppo_chosen_reward_in_loss": 3.586970806121826, "debug/sppo_rej_reward_in_loss": -6.196176052093506, "debug/sppo_reject_loss": 2050.711181640625, "epoch": 7.427536231884058, "grad_norm": 61529.338468656344, "learning_rate": 1.3797649796257027e-09, "logits/chosen": 1.3314096927642822, "logits/rejected": 1.426235556602478, "logps/chosen": -257.7485046386719, "logps/rejected": -296.75225830078125, "loss": 4368.5391, "rewards/accuracies": 0.949999988079071, "rewards/chosen": 0.03586970642209053, "rewards/margins": 0.09783147275447845, "rewards/rejected": -0.06196175888180733, "step": 2050 }, { "debug/policy_chosen_logits": 1.1443208456039429, "debug/policy_chosen_logps": -267.1307678222656, "debug/policy_rejected_logits": 1.1701858043670654, "debug/policy_rejected_logps": -286.821533203125, "debug/reference_chosen_logps": -266.78277587890625, "debug/reference_rejected_logps": -282.5542297363281, "debug/sppo_chosen_loss": 2642.63525390625, "debug/sppo_chosen_reward_in_loss": -0.34796142578125, "debug/sppo_rej_reward_in_loss": -4.267295837402344, "debug/sppo_reject_loss": 2164.36669921875, "epoch": 7.445652173913043, "grad_norm": 70235.45670889581, "learning_rate": 1.2941926002306536e-09, "logits/chosen": 1.1443208456039429, "logits/rejected": 1.1701858043670654, "logps/chosen": -267.1307678222656, "logps/rejected": -286.821533203125, "loss": 4530.5602, "rewards/accuracies": 0.625, "rewards/chosen": -0.0034796136897057295, "rewards/margins": 0.03919333964586258, "rewards/rejected": -0.04267295449972153, "step": 2055 }, { "debug/policy_chosen_logits": 1.4272644519805908, "debug/policy_chosen_logps": -268.73114013671875, "debug/policy_rejected_logits": 1.7103526592254639, "debug/policy_rejected_logps": -310.28265380859375, "debug/reference_chosen_logps": -269.88397216796875, "debug/reference_rejected_logps": -305.17596435546875, "debug/sppo_chosen_loss": 2464.18701171875, "debug/sppo_chosen_reward_in_loss": 1.152845025062561, "debug/sppo_rej_reward_in_loss": -5.106662750244141, "debug/sppo_reject_loss": 2134.12744140625, "epoch": 7.463768115942029, "grad_norm": 68845.49769601325, "learning_rate": 1.2113246603856653e-09, "logits/chosen": 1.4272644519805908, "logits/rejected": 1.7103526592254639, "logps/chosen": -268.73114013671875, "logps/rejected": -310.28265380859375, "loss": 4599.1586, "rewards/accuracies": 0.625, "rewards/chosen": 0.011528450064361095, "rewards/margins": 0.06259507685899734, "rewards/rejected": -0.051066625863313675, "step": 2060 }, { "debug/policy_chosen_logits": 1.303856611251831, "debug/policy_chosen_logps": -258.29278564453125, "debug/policy_rejected_logits": 1.5936082601547241, "debug/policy_rejected_logps": -294.1533203125, "debug/reference_chosen_logps": -261.1592102050781, "debug/reference_rejected_logps": -289.24749755859375, "debug/sppo_chosen_loss": 2248.81298828125, "debug/sppo_chosen_reward_in_loss": 2.8664422035217285, "debug/sppo_rej_reward_in_loss": -4.905792236328125, "debug/sppo_reject_loss": 2147.85693359375, "epoch": 7.481884057971015, "grad_norm": 65626.6858130414, "learning_rate": 1.1311657614174907e-09, "logits/chosen": 1.303856611251831, "logits/rejected": 1.5936082601547241, "logps/chosen": -258.29278564453125, "logps/rejected": -294.1533203125, "loss": 4520.8699, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": 0.028664419427514076, "rewards/margins": 0.07772234827280045, "rewards/rejected": -0.04905792325735092, "step": 2065 }, { "debug/policy_chosen_logits": 1.229323148727417, "debug/policy_chosen_logps": -253.285888671875, "debug/policy_rejected_logits": 1.6499292850494385, "debug/policy_rejected_logps": -311.3282165527344, "debug/reference_chosen_logps": -255.8767852783203, "debug/reference_rejected_logps": -305.1612243652344, "debug/sppo_chosen_loss": 2267.26904296875, "debug/sppo_chosen_reward_in_loss": 2.590872049331665, "debug/sppo_rej_reward_in_loss": -6.166988849639893, "debug/sppo_reject_loss": 2095.09814453125, "epoch": 7.5, "grad_norm": 63245.6052867994, "learning_rate": 1.0537203542306083e-09, "logits/chosen": 1.229323148727417, "logits/rejected": 1.6499292850494385, "logps/chosen": -253.285888671875, "logps/rejected": -311.3282165527344, "loss": 4523.8332, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 0.02590871974825859, "rewards/margins": 0.08757860958576202, "rewards/rejected": -0.061669886112213135, "step": 2070 }, { "debug/policy_chosen_logits": 1.3278888463974, "debug/policy_chosen_logps": -271.868408203125, "debug/policy_rejected_logits": 1.6005455255508423, "debug/policy_rejected_logps": -312.808349609375, "debug/reference_chosen_logps": -274.61627197265625, "debug/reference_rejected_logps": -309.5181579589844, "debug/sppo_chosen_loss": 2292.456787109375, "debug/sppo_chosen_reward_in_loss": 2.7478396892547607, "debug/sppo_rej_reward_in_loss": -3.2901642322540283, "debug/sppo_reject_loss": 2260.4921875, "epoch": 7.518115942028985, "grad_norm": 61033.563232917964, "learning_rate": 9.78992739060114e-10, "logits/chosen": 1.3278888463974, "logits/rejected": 1.6005455255508423, "logps/chosen": -271.868408203125, "logps/rejected": -312.808349609375, "loss": 4432.7875, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 0.027478396892547607, "rewards/margins": 0.06038004159927368, "rewards/rejected": -0.032901640981435776, "step": 2075 }, { "debug/policy_chosen_logits": 1.1519010066986084, "debug/policy_chosen_logps": -275.2718811035156, "debug/policy_rejected_logits": 1.0840139389038086, "debug/policy_rejected_logps": -284.2613830566406, "debug/reference_chosen_logps": -277.41717529296875, "debug/reference_rejected_logps": -277.25054931640625, "debug/sppo_chosen_loss": 2323.05712890625, "debug/sppo_chosen_reward_in_loss": 2.145270586013794, "debug/sppo_rej_reward_in_loss": -7.010800361633301, "debug/sppo_reject_loss": 2043.428466796875, "epoch": 7.536231884057971, "grad_norm": 62756.9248243736, "learning_rate": 9.069870652329281e-10, "logits/chosen": 1.1519010066986084, "logits/rejected": 1.0840139389038086, "logps/chosen": -275.2718811035156, "logps/rejected": -284.2613830566406, "loss": 4596.4141, "rewards/accuracies": 0.75, "rewards/chosen": 0.021452704444527626, "rewards/margins": 0.09156069904565811, "rewards/rejected": -0.07010800391435623, "step": 2080 }, { "debug/policy_chosen_logits": 0.9182929992675781, "debug/policy_chosen_logps": -248.80953979492188, "debug/policy_rejected_logits": 1.4649393558502197, "debug/policy_rejected_logps": -299.94677734375, "debug/reference_chosen_logps": -247.97744750976562, "debug/reference_rejected_logps": -295.57763671875, "debug/sppo_chosen_loss": 2668.193359375, "debug/sppo_chosen_reward_in_loss": -0.8321117162704468, "debug/sppo_rej_reward_in_loss": -4.3691205978393555, "debug/sppo_reject_loss": 2219.789794921875, "epoch": 7.554347826086957, "grad_norm": 61876.31012711566, "learning_rate": 8.377073309374149e-10, "logits/chosen": 0.9182929992675781, "logits/rejected": 1.4649393558502197, "logps/chosen": -248.80953979492188, "logps/rejected": -299.94677734375, "loss": 4586.6172, "rewards/accuracies": 0.574999988079071, "rewards/chosen": -0.008321116678416729, "rewards/margins": 0.03537008911371231, "rewards/rejected": -0.04369121044874191, "step": 2085 }, { "debug/policy_chosen_logits": 0.9342246055603027, "debug/policy_chosen_logps": -230.5325469970703, "debug/policy_rejected_logits": 1.470979928970337, "debug/policy_rejected_logps": -288.772705078125, "debug/reference_chosen_logps": -233.8635711669922, "debug/reference_rejected_logps": -281.48101806640625, "debug/sppo_chosen_loss": 2201.9716796875, "debug/sppo_chosen_reward_in_loss": 3.331019639968872, "debug/sppo_rej_reward_in_loss": -7.291647434234619, "debug/sppo_reject_loss": 1990.5560302734375, "epoch": 7.572463768115942, "grad_norm": 73885.90071030497, "learning_rate": 7.711573830013584e-10, "logits/chosen": 0.9342246055603027, "logits/rejected": 1.470979928970337, "logps/chosen": -230.5325469970703, "logps/rejected": -288.772705078125, "loss": 4508.3344, "rewards/accuracies": 0.75, "rewards/chosen": 0.033310193568468094, "rewards/margins": 0.10622666031122208, "rewards/rejected": -0.07291646301746368, "step": 2090 }, { "debug/policy_chosen_logits": 0.804153323173523, "debug/policy_chosen_logps": -253.0216522216797, "debug/policy_rejected_logits": 1.2514761686325073, "debug/policy_rejected_logps": -313.05218505859375, "debug/reference_chosen_logps": -256.2008972167969, "debug/reference_rejected_logps": -307.97308349609375, "debug/sppo_chosen_loss": 2224.22412109375, "debug/sppo_chosen_reward_in_loss": 3.1792445182800293, "debug/sppo_rej_reward_in_loss": -5.079104900360107, "debug/sppo_reject_loss": 2183.387939453125, "epoch": 7.590579710144928, "grad_norm": 99151.52697996194, "learning_rate": 7.073409166783839e-10, "logits/chosen": 0.804153323173523, "logits/rejected": 1.2514761686325073, "logps/chosen": -253.0216522216797, "logps/rejected": -313.05218505859375, "loss": 4469.7992, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": 0.03179244324564934, "rewards/margins": 0.08258350193500519, "rewards/rejected": -0.050791043788194656, "step": 2095 }, { "debug/policy_chosen_logits": 1.292790174484253, "debug/policy_chosen_logps": -273.0116271972656, "debug/policy_rejected_logits": 1.3900998830795288, "debug/policy_rejected_logps": -277.5364685058594, "debug/reference_chosen_logps": -273.92059326171875, "debug/reference_rejected_logps": -270.35479736328125, "debug/sppo_chosen_loss": 2474.90185546875, "debug/sppo_chosen_reward_in_loss": 0.9089992642402649, "debug/sppo_rej_reward_in_loss": -7.1816864013671875, "debug/sppo_reject_loss": 2031.3232421875, "epoch": 7.608695652173913, "grad_norm": 74327.26454997434, "learning_rate": 6.462614754427665e-10, "logits/chosen": 1.292790174484253, "logits/rejected": 1.3900998830795288, "logps/chosen": -273.0116271972656, "logps/rejected": -277.5364685058594, "loss": 4477.8289, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 0.009089991450309753, "rewards/margins": 0.08090685307979584, "rewards/rejected": -0.07181687653064728, "step": 2100 }, { "epoch": 7.608695652173913, "eval_debug/policy_chosen_logits": 1.3925397396087646, "eval_debug/policy_chosen_logps": -252.22926330566406, "eval_debug/policy_rejected_logits": 1.436280369758606, "eval_debug/policy_rejected_logps": -263.1951599121094, "eval_debug/reference_chosen_logps": -252.91845703125, "eval_debug/reference_rejected_logps": -259.6585998535156, "eval_debug/sppo_chosen_loss": 2506.2578125, "eval_debug/sppo_chosen_reward_in_loss": 0.6891991496086121, "eval_debug/sppo_rej_reward_in_loss": -3.5365147590637207, "eval_debug/sppo_reject_loss": 2318.237548828125, "eval_logits/chosen": 1.3925397396087646, "eval_logits/rejected": 1.436280369758606, "eval_logps/chosen": -252.22926330566406, "eval_logps/rejected": -263.1951599121094, "eval_loss": 4617.22900390625, "eval_rewards/accuracies": 0.5789473652839661, "eval_rewards/chosen": 0.006891992408782244, "eval_rewards/margins": 0.04225713387131691, "eval_rewards/rejected": -0.03536514192819595, "eval_runtime": 28.4804, "eval_samples_per_second": 21.067, "eval_steps_per_second": 0.667, "step": 2100 }, { "debug/policy_chosen_logits": 0.6739364266395569, "debug/policy_chosen_logps": -230.30337524414062, "debug/policy_rejected_logits": 0.9180151224136353, "debug/policy_rejected_logps": -246.10037231445312, "debug/reference_chosen_logps": -233.5577392578125, "debug/reference_rejected_logps": -242.1542510986328, "debug/sppo_chosen_loss": 2197.70849609375, "debug/sppo_chosen_reward_in_loss": 3.254357099533081, "debug/sppo_rej_reward_in_loss": -3.946110486984253, "debug/sppo_reject_loss": 2230.728515625, "epoch": 7.6268115942028984, "grad_norm": 64996.731517441374, "learning_rate": 5.879224507926661e-10, "logits/chosen": 0.6739364266395569, "logits/rejected": 0.9180151224136353, "logps/chosen": -230.30337524414062, "logps/rejected": -246.10037231445312, "loss": 4344.8258, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 0.032543569803237915, "rewards/margins": 0.07200466096401215, "rewards/rejected": -0.039461102336645126, "step": 2105 }, { "debug/policy_chosen_logits": 1.2277402877807617, "debug/policy_chosen_logps": -256.14288330078125, "debug/policy_rejected_logits": 1.379900574684143, "debug/policy_rejected_logps": -274.8895568847656, "debug/reference_chosen_logps": -259.5997619628906, "debug/reference_rejected_logps": -270.292724609375, "debug/sppo_chosen_loss": 2177.25146484375, "debug/sppo_chosen_reward_in_loss": 3.4568793773651123, "debug/sppo_rej_reward_in_loss": -4.596831321716309, "debug/sppo_reject_loss": 2206.194580078125, "epoch": 7.644927536231884, "grad_norm": 71585.54175772157, "learning_rate": 5.323270820618398e-10, "logits/chosen": 1.2277402877807617, "logits/rejected": 1.379900574684143, "logps/chosen": -256.14288330078125, "logps/rejected": -274.8895568847656, "loss": 4494.991, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 0.03456879034638405, "rewards/margins": 0.0805370956659317, "rewards/rejected": -0.04596831649541855, "step": 2110 }, { "debug/policy_chosen_logits": 1.0671769380569458, "debug/policy_chosen_logps": -243.13125610351562, "debug/policy_rejected_logits": 1.1685346364974976, "debug/policy_rejected_logps": -274.372802734375, "debug/reference_chosen_logps": -243.8218231201172, "debug/reference_rejected_logps": -267.530517578125, "debug/sppo_chosen_loss": 2521.191162109375, "debug/sppo_chosen_reward_in_loss": 0.6905729174613953, "debug/sppo_rej_reward_in_loss": -6.842259407043457, "debug/sppo_reject_loss": 2061.65771484375, "epoch": 7.663043478260869, "grad_norm": 103906.16698210256, "learning_rate": 4.794784562397458e-10, "logits/chosen": 1.0671769380569458, "logits/rejected": 1.1685346364974976, "logps/chosen": -243.13125610351562, "logps/rejected": -274.372802734375, "loss": 4520.9984, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 0.006905729416757822, "rewards/margins": 0.07532832771539688, "rewards/rejected": -0.0684226006269455, "step": 2115 }, { "debug/policy_chosen_logits": 0.9971033334732056, "debug/policy_chosen_logps": -227.75521850585938, "debug/policy_rejected_logits": 1.3219413757324219, "debug/policy_rejected_logps": -302.8537292480469, "debug/reference_chosen_logps": -229.51358032226562, "debug/reference_rejected_logps": -298.8193359375, "debug/sppo_chosen_loss": 2390.484619140625, "debug/sppo_chosen_reward_in_loss": 1.7583458423614502, "debug/sppo_rej_reward_in_loss": -4.034407615661621, "debug/sppo_reject_loss": 2209.649169921875, "epoch": 7.681159420289855, "grad_norm": 92487.56140396521, "learning_rate": 4.293795078001317e-10, "logits/chosen": 0.9971033334732056, "logits/rejected": 1.3219413757324219, "logps/chosen": -227.75521850585938, "logps/rejected": -302.8537292480469, "loss": 4604.8105, "rewards/accuracies": 0.75, "rewards/chosen": 0.0175834558904171, "rewards/margins": 0.057927537709474564, "rewards/rejected": -0.040344081819057465, "step": 2120 }, { "debug/policy_chosen_logits": 0.9510968923568726, "debug/policy_chosen_logps": -225.7450714111328, "debug/policy_rejected_logits": 1.346861720085144, "debug/policy_rejected_logps": -291.18988037109375, "debug/reference_chosen_logps": -229.05801391601562, "debug/reference_rejected_logps": -284.1490478515625, "debug/sppo_chosen_loss": 2200.169921875, "debug/sppo_chosen_reward_in_loss": 3.3129355907440186, "debug/sppo_rej_reward_in_loss": -7.0408034324646, "debug/sppo_reject_loss": 2031.1324462890625, "epoch": 7.699275362318841, "grad_norm": 78902.88545959517, "learning_rate": 3.8203301853813594e-10, "logits/chosen": 0.9510968923568726, "logits/rejected": 1.346861720085144, "logps/chosen": -225.7450714111328, "logps/rejected": -291.18988037109375, "loss": 4466.5586, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 0.03312935680150986, "rewards/margins": 0.10353739559650421, "rewards/rejected": -0.07040803134441376, "step": 2125 }, { "debug/policy_chosen_logits": 1.301474690437317, "debug/policy_chosen_logps": -252.5749969482422, "debug/policy_rejected_logits": 1.6346708536148071, "debug/policy_rejected_logps": -284.538330078125, "debug/reference_chosen_logps": -253.86953735351562, "debug/reference_rejected_logps": -282.350830078125, "debug/sppo_chosen_loss": 2424.56005859375, "debug/sppo_chosen_reward_in_loss": 1.2945072650909424, "debug/sppo_rej_reward_in_loss": -2.187539577484131, "debug/sppo_reject_loss": 2352.7392578125, "epoch": 7.717391304347826, "grad_norm": 64909.281348993565, "learning_rate": 3.3744161741577905e-10, "logits/chosen": 1.301474690437317, "logits/rejected": 1.6346708536148071, "logps/chosen": -252.5749969482422, "logps/rejected": -284.538330078125, "loss": 4599.1926, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 0.01294507272541523, "rewards/margins": 0.034820470958948135, "rewards/rejected": -0.021875392645597458, "step": 2130 }, { "debug/policy_chosen_logits": 1.1287482976913452, "debug/policy_chosen_logps": -258.11572265625, "debug/policy_rejected_logits": 1.5572656393051147, "debug/policy_rejected_logps": -299.67730712890625, "debug/reference_chosen_logps": -259.0867004394531, "debug/reference_rejected_logps": -294.621826171875, "debug/sppo_chosen_loss": 2464.80810546875, "debug/sppo_chosen_reward_in_loss": 0.970924973487854, "debug/sppo_rej_reward_in_loss": -5.055464744567871, "debug/sppo_reject_loss": 2166.364501953125, "epoch": 7.7355072463768115, "grad_norm": 67019.5485001634, "learning_rate": 2.956077804160184e-10, "logits/chosen": 1.1287482976913452, "logits/rejected": 1.5572656393051147, "logps/chosen": -258.11572265625, "logps/rejected": -299.67730712890625, "loss": 4541.4832, "rewards/accuracies": 0.675000011920929, "rewards/chosen": 0.009709248319268227, "rewards/margins": 0.060263894498348236, "rewards/rejected": -0.05055464431643486, "step": 2135 }, { "debug/policy_chosen_logits": 1.219551920890808, "debug/policy_chosen_logps": -267.6524963378906, "debug/policy_rejected_logits": 1.5995728969573975, "debug/policy_rejected_logps": -285.8074951171875, "debug/reference_chosen_logps": -269.52984619140625, "debug/reference_rejected_logps": -281.32818603515625, "debug/sppo_chosen_loss": 2368.73486328125, "debug/sppo_chosen_reward_in_loss": 1.8773447275161743, "debug/sppo_rej_reward_in_loss": -4.479327201843262, "debug/sppo_reject_loss": 2177.294677734375, "epoch": 7.753623188405797, "grad_norm": 151513.1334928217, "learning_rate": 2.5653383040524224e-10, "logits/chosen": 1.219551920890808, "logits/rejected": 1.5995728969573975, "logps/chosen": -267.6524963378906, "logps/rejected": -285.8074951171875, "loss": 4615.0719, "rewards/accuracies": 0.75, "rewards/chosen": 0.01877344772219658, "rewards/margins": 0.06356671452522278, "rewards/rejected": -0.0447932705283165, "step": 2140 }, { "debug/policy_chosen_logits": 1.3132517337799072, "debug/policy_chosen_logps": -262.13079833984375, "debug/policy_rejected_logits": 1.3344939947128296, "debug/policy_rejected_logps": -284.14874267578125, "debug/reference_chosen_logps": -263.3422546386719, "debug/reference_rejected_logps": -277.1472473144531, "debug/sppo_chosen_loss": 2454.830810546875, "debug/sppo_chosen_reward_in_loss": 1.2114683389663696, "debug/sppo_rej_reward_in_loss": -7.001499176025391, "debug/sppo_reject_loss": 1998.1826171875, "epoch": 7.771739130434782, "grad_norm": 62465.41863525179, "learning_rate": 2.202219370043168e-10, "logits/chosen": 1.3132517337799072, "logits/rejected": 1.3344939947128296, "logps/chosen": -262.13079833984375, "logps/rejected": -284.14874267578125, "loss": 4556.0914, "rewards/accuracies": 0.675000011920929, "rewards/chosen": 0.012114683166146278, "rewards/margins": 0.08212967216968536, "rewards/rejected": -0.07001499086618423, "step": 2145 }, { "debug/policy_chosen_logits": 0.9999769330024719, "debug/policy_chosen_logps": -222.53140258789062, "debug/policy_rejected_logits": 1.1089446544647217, "debug/policy_rejected_logps": -247.6454315185547, "debug/reference_chosen_logps": -225.07754516601562, "debug/reference_rejected_logps": -243.9304962158203, "debug/sppo_chosen_loss": 2308.60986328125, "debug/sppo_chosen_reward_in_loss": 2.546154737472534, "debug/sppo_rej_reward_in_loss": -3.7149219512939453, "debug/sppo_reject_loss": 2277.2587890625, "epoch": 7.789855072463768, "grad_norm": 66491.83051664101, "learning_rate": 1.866741164680996e-10, "logits/chosen": 0.9999769330024719, "logits/rejected": 1.1089446544647217, "logps/chosen": -222.53140258789062, "logps/rejected": -247.6454315185547, "loss": 4427.7336, "rewards/accuracies": 0.75, "rewards/chosen": 0.025461548939347267, "rewards/margins": 0.06261076033115387, "rewards/rejected": -0.03714922070503235, "step": 2150 }, { "debug/policy_chosen_logits": 1.0805193185806274, "debug/policy_chosen_logps": -241.43936157226562, "debug/policy_rejected_logits": 1.2052781581878662, "debug/policy_rejected_logps": -294.7074279785156, "debug/reference_chosen_logps": -242.5572967529297, "debug/reference_rejected_logps": -286.7196960449219, "debug/sppo_chosen_loss": 2461.002197265625, "debug/sppo_chosen_reward_in_loss": 1.1179357767105103, "debug/sppo_rej_reward_in_loss": -7.987711429595947, "debug/sppo_reject_loss": 1928.757080078125, "epoch": 7.807971014492754, "grad_norm": 60872.43048458311, "learning_rate": 1.5589223157347896e-10, "logits/chosen": 1.0805193185806274, "logits/rejected": 1.2052781581878662, "logps/chosen": -241.43936157226562, "logps/rejected": -294.7074279785156, "loss": 4587.5289, "rewards/accuracies": 0.75, "rewards/chosen": 0.011179356835782528, "rewards/margins": 0.0910564661026001, "rewards/rejected": -0.079877108335495, "step": 2155 }, { "debug/policy_chosen_logits": 1.1456208229064941, "debug/policy_chosen_logps": -230.3699951171875, "debug/policy_rejected_logits": 1.9213443994522095, "debug/policy_rejected_logps": -306.39056396484375, "debug/reference_chosen_logps": -232.04678344726562, "debug/reference_rejected_logps": -300.45745849609375, "debug/sppo_chosen_loss": 2416.55029296875, "debug/sppo_chosen_reward_in_loss": 1.6767866611480713, "debug/sppo_rej_reward_in_loss": -5.933084487915039, "debug/sppo_reject_loss": 2082.762939453125, "epoch": 7.826086956521739, "grad_norm": 64496.71112261495, "learning_rate": 1.2787799151596224e-10, "logits/chosen": 1.1456208229064941, "logits/rejected": 1.9213443994522095, "logps/chosen": -230.3699951171875, "logps/rejected": -306.39056396484375, "loss": 4590.8188, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 0.016767865046858788, "rewards/margins": 0.07609870284795761, "rewards/rejected": -0.05933083966374397, "step": 2160 }, { "debug/policy_chosen_logits": 1.24051833152771, "debug/policy_chosen_logps": -268.8450622558594, "debug/policy_rejected_logits": 1.2047107219696045, "debug/policy_rejected_logps": -266.3825378417969, "debug/reference_chosen_logps": -269.70306396484375, "debug/reference_rejected_logps": -261.1277160644531, "debug/sppo_chosen_loss": 2478.1025390625, "debug/sppo_chosen_reward_in_loss": 0.8580325841903687, "debug/sppo_rej_reward_in_loss": -5.254827976226807, "debug/sppo_reject_loss": 2119.91357421875, "epoch": 7.844202898550725, "grad_norm": 82175.45595980022, "learning_rate": 1.0263295181475174e-10, "logits/chosen": 1.24051833152771, "logits/rejected": 1.2047107219696045, "logps/chosen": -268.8450622558594, "logps/rejected": -266.3825378417969, "loss": 4478.3102, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 0.008580325171351433, "rewards/margins": 0.06112860515713692, "rewards/rejected": -0.052548278123140335, "step": 2165 }, { "debug/policy_chosen_logits": 1.0681052207946777, "debug/policy_chosen_logps": -255.635498046875, "debug/policy_rejected_logits": 1.264644742012024, "debug/policy_rejected_logps": -285.5237731933594, "debug/reference_chosen_logps": -258.68817138671875, "debug/reference_rejected_logps": -281.90374755859375, "debug/sppo_chosen_loss": 2218.684326171875, "debug/sppo_chosen_reward_in_loss": 3.0526726245880127, "debug/sppo_rej_reward_in_loss": -3.6200504302978516, "debug/sppo_reject_loss": 2281.26806640625, "epoch": 7.86231884057971, "grad_norm": 63881.06500488675, "learning_rate": 8.015851422638053e-11, "logits/chosen": 1.0681052207946777, "logits/rejected": 1.264644742012024, "logps/chosen": -255.635498046875, "logps/rejected": -285.5237731933594, "loss": 4553.4793, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": 0.030526721850037575, "rewards/margins": 0.06672722846269608, "rewards/rejected": -0.03620050102472305, "step": 2170 }, { "debug/policy_chosen_logits": 1.361900806427002, "debug/policy_chosen_logps": -259.09490966796875, "debug/policy_rejected_logits": 1.4846298694610596, "debug/policy_rejected_logps": -282.2530212402344, "debug/reference_chosen_logps": -259.996826171875, "debug/reference_rejected_logps": -276.11199951171875, "debug/sppo_chosen_loss": 2474.354248046875, "debug/sppo_chosen_reward_in_loss": 0.9018945693969727, "debug/sppo_rej_reward_in_loss": -6.1409759521484375, "debug/sppo_reject_loss": 2052.828857421875, "epoch": 7.880434782608695, "grad_norm": 79371.0356298821, "learning_rate": 6.045592666688581e-11, "logits/chosen": 1.361900806427002, "logits/rejected": 1.4846298694610596, "logps/chosen": -259.09490966796875, "logps/rejected": -282.2530212402344, "loss": 4502.0855, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": 0.009018944576382637, "rewards/margins": 0.07042870670557022, "rewards/rejected": -0.061409760266542435, "step": 2175 }, { "debug/policy_chosen_logits": 1.0065410137176514, "debug/policy_chosen_logps": -260.93023681640625, "debug/policy_rejected_logits": 1.2652740478515625, "debug/policy_rejected_logps": -291.34326171875, "debug/reference_chosen_logps": -263.2150573730469, "debug/reference_rejected_logps": -288.1213684082031, "debug/sppo_chosen_loss": 2298.703857421875, "debug/sppo_chosen_reward_in_loss": 2.2848479747772217, "debug/sppo_rej_reward_in_loss": -3.221867322921753, "debug/sppo_reject_loss": 2276.685546875, "epoch": 7.898550724637682, "grad_norm": 78849.01149914775, "learning_rate": 4.352628314249762e-11, "logits/chosen": 1.0065410137176514, "logits/rejected": 1.2652740478515625, "logps/chosen": -260.93023681640625, "logps/rejected": -291.34326171875, "loss": 4521.1391, "rewards/accuracies": 0.675000011920929, "rewards/chosen": 0.022848479449748993, "rewards/margins": 0.05506715923547745, "rewards/rejected": -0.03221867233514786, "step": 2180 }, { "debug/policy_chosen_logits": 1.3481090068817139, "debug/policy_chosen_logps": -265.5303955078125, "debug/policy_rejected_logits": 1.5380135774612427, "debug/policy_rejected_logps": -304.78790283203125, "debug/reference_chosen_logps": -267.88934326171875, "debug/reference_rejected_logps": -299.74700927734375, "debug/sppo_chosen_loss": 2301.35498046875, "debug/sppo_chosen_reward_in_loss": 2.3589279651641846, "debug/sppo_rej_reward_in_loss": -5.040875434875488, "debug/sppo_reject_loss": 2140.15966796875, "epoch": 7.916666666666667, "grad_norm": 65384.04338134149, "learning_rate": 2.9370523688915237e-11, "logits/chosen": 1.3481090068817139, "logits/rejected": 1.5380135774612427, "logps/chosen": -265.5303955078125, "logps/rejected": -304.78790283203125, "loss": 4426.9734, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": 0.023589277639985085, "rewards/margins": 0.07399802654981613, "rewards/rejected": -0.050408750772476196, "step": 2185 }, { "debug/policy_chosen_logits": 0.7700341939926147, "debug/policy_chosen_logps": -227.670654296875, "debug/policy_rejected_logits": 1.2262274026870728, "debug/policy_rejected_logps": -317.4136657714844, "debug/reference_chosen_logps": -231.3727569580078, "debug/reference_rejected_logps": -307.95062255859375, "debug/sppo_chosen_loss": 2154.80126953125, "debug/sppo_chosen_reward_in_loss": 3.7021331787109375, "debug/sppo_rej_reward_in_loss": -9.463071823120117, "debug/sppo_reject_loss": 1801.487060546875, "epoch": 7.934782608695652, "grad_norm": 115608.79716862518, "learning_rate": 1.7989434319093387e-11, "logits/chosen": 0.7700341939926147, "logits/rejected": 1.2262274026870728, "logps/chosen": -227.670654296875, "logps/rejected": -317.4136657714844, "loss": 4496.9297, "rewards/accuracies": 0.949999988079071, "rewards/chosen": 0.03702133148908615, "rewards/margins": 0.13165204226970673, "rewards/rejected": -0.09463071078062057, "step": 2190 }, { "debug/policy_chosen_logits": 1.2976726293563843, "debug/policy_chosen_logps": -283.23236083984375, "debug/policy_rejected_logits": 1.4259979724884033, "debug/policy_rejected_logps": -286.2864685058594, "debug/reference_chosen_logps": -283.5812072753906, "debug/reference_rejected_logps": -284.0365905761719, "debug/sppo_chosen_loss": 2530.442626953125, "debug/sppo_chosen_reward_in_loss": 0.3488399386405945, "debug/sppo_rej_reward_in_loss": -2.249875545501709, "debug/sppo_reject_loss": 2347.236083984375, "epoch": 7.952898550724638, "grad_norm": 103819.46468325114, "learning_rate": 9.38364697961047e-12, "logits/chosen": 1.2976726293563843, "logits/rejected": 1.4259979724884033, "logps/chosen": -283.23236083984375, "logps/rejected": -286.2864685058594, "loss": 4557.3523, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": 0.003488400485366583, "rewards/margins": 0.025987153872847557, "rewards/rejected": -0.022498754784464836, "step": 2195 }, { "debug/policy_chosen_logits": 1.145102620124817, "debug/policy_chosen_logps": -264.08642578125, "debug/policy_rejected_logits": 1.5146633386611938, "debug/policy_rejected_logps": -315.19097900390625, "debug/reference_chosen_logps": -266.16912841796875, "debug/reference_rejected_logps": -312.854248046875, "debug/sppo_chosen_loss": 2318.06494140625, "debug/sppo_chosen_reward_in_loss": 2.082681179046631, "debug/sppo_rej_reward_in_loss": -2.3367526531219482, "debug/sppo_reject_loss": 2359.71044921875, "epoch": 7.971014492753623, "grad_norm": 102200.71903476924, "learning_rate": 3.5536395155744138e-12, "logits/chosen": 1.145102620124817, "logits/rejected": 1.5146633386611938, "logps/chosen": -264.08642578125, "logps/rejected": -315.19097900390625, "loss": 4520.1934, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 0.020826810970902443, "rewards/margins": 0.044194333255290985, "rewards/rejected": -0.02336752787232399, "step": 2200 }, { "epoch": 7.971014492753623, "eval_debug/policy_chosen_logits": 1.3941218852996826, "eval_debug/policy_chosen_logps": -252.2310333251953, "eval_debug/policy_rejected_logits": 1.437137246131897, "eval_debug/policy_rejected_logps": -263.2242431640625, "eval_debug/reference_chosen_logps": -252.91845703125, "eval_debug/reference_rejected_logps": -259.6585998535156, "eval_debug/sppo_chosen_loss": 2507.325927734375, "eval_debug/sppo_chosen_reward_in_loss": 0.6874253153800964, "eval_debug/sppo_rej_reward_in_loss": -3.5656445026397705, "eval_debug/sppo_reject_loss": 2312.91162109375, "eval_logits/chosen": 1.3941218852996826, "eval_logits/rejected": 1.437137246131897, "eval_logps/chosen": -252.2310333251953, "eval_logps/rejected": -263.2242431640625, "eval_loss": 4613.583984375, "eval_rewards/accuracies": 0.6052631735801697, "eval_rewards/chosen": 0.006874253042042255, "eval_rewards/margins": 0.04253069683909416, "eval_rewards/rejected": -0.03565644472837448, "eval_runtime": 28.5358, "eval_samples_per_second": 21.026, "eval_steps_per_second": 0.666, "step": 2200 }, { "debug/policy_chosen_logits": 1.3107990026474, "debug/policy_chosen_logps": -250.88546752929688, "debug/policy_rejected_logits": 1.5912861824035645, "debug/policy_rejected_logps": -305.1792907714844, "debug/reference_chosen_logps": -252.6299285888672, "debug/reference_rejected_logps": -301.18548583984375, "debug/sppo_chosen_loss": 2382.531982421875, "debug/sppo_chosen_reward_in_loss": 1.7444469928741455, "debug/sppo_rej_reward_in_loss": -3.993786573410034, "debug/sppo_reject_loss": 2213.762939453125, "epoch": 7.989130434782608, "grad_norm": 65636.56438717897, "learning_rate": 4.997356440772371e-13, "logits/chosen": 1.3107990026474, "logits/rejected": 1.5912861824035645, "logps/chosen": -250.88546752929688, "logps/rejected": -305.1792907714844, "loss": 4429.5063, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": 0.017444469034671783, "rewards/margins": 0.057382334023714066, "rewards/rejected": -0.03993786498904228, "step": 2205 }, { "epoch": 8.0, "step": 2208, "total_flos": 0.0, "train_loss": 4636.829430179319, "train_runtime": 15508.5423, "train_samples_per_second": 9.092, "train_steps_per_second": 0.142 } ], "logging_steps": 5, "max_steps": 2208, "num_input_tokens_seen": 0, "num_train_epochs": 8, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 0.0, "train_batch_size": 8, "trial_name": null, "trial_params": null }