diff --git "a/trainer_state.json" "b/trainer_state.json" --- "a/trainer_state.json" +++ "b/trainer_state.json" @@ -10,12 +10,12 @@ "log_history": [ { "epoch": 0.0, - "grad_norm": 2.453125, + "grad_norm": 1.96875, "learning_rate": 1.3054830287206268e-08, - "logits/chosen": -2.3776800632476807, - "logits/rejected": -2.219546318054199, - "logps/chosen": -290.367919921875, - "logps/rejected": -374.58001708984375, + "logits/chosen": -2.804384231567383, + "logits/rejected": -2.6176326274871826, + "logps/chosen": -310.1795349121094, + "logps/rejected": -392.80609130859375, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, @@ -25,6349 +25,6349 @@ }, { "epoch": 0.0, - "grad_norm": 2.40625, + "grad_norm": 1.984375, "learning_rate": 1.3054830287206266e-07, - "logits/chosen": -2.2502875328063965, - "logits/rejected": -2.052671194076538, - "logps/chosen": -279.6107177734375, - "logps/rejected": -245.41419982910156, - "loss": 0.6933, - "rewards/accuracies": 0.375, - "rewards/chosen": 0.00020713169942609966, - "rewards/margins": -0.0003161564818583429, - "rewards/rejected": 0.000523288210388273, + "logits/chosen": -2.765848159790039, + "logits/rejected": -2.541754722595215, + "logps/chosen": -301.7055358886719, + "logps/rejected": -262.96173095703125, + "loss": 0.6932, + "rewards/accuracies": 0.4305555522441864, + "rewards/chosen": -5.9747020713984966e-05, + "rewards/margins": -4.961798185831867e-05, + "rewards/rejected": -1.0129070687980857e-05, "step": 10 }, { "epoch": 0.01, - "grad_norm": 2.5, + "grad_norm": 2.3125, "learning_rate": 2.610966057441253e-07, - "logits/chosen": -2.2449612617492676, - "logits/rejected": -1.9438155889511108, - "logps/chosen": -305.486083984375, - "logps/rejected": -237.72116088867188, - "loss": 0.6927, - "rewards/accuracies": 0.53125, - "rewards/chosen": 0.003224103944376111, - "rewards/margins": 0.000847988179884851, - "rewards/rejected": 0.0023761156480759382, + "logits/chosen": -2.753605365753174, + "logits/rejected": -2.3991589546203613, + "logps/chosen": -328.1248474121094, + "logps/rejected": -254.85726928710938, + "loss": 0.6932, + "rewards/accuracies": 0.48750001192092896, + "rewards/chosen": 0.00012516247807070613, + "rewards/margins": -0.00015478665591217577, + "rewards/rejected": 0.00027994910487905145, "step": 20 }, { "epoch": 0.01, - "grad_norm": 2.3125, + "grad_norm": 2.625, "learning_rate": 3.9164490861618804e-07, - "logits/chosen": -2.205552577972412, - "logits/rejected": -2.137118101119995, - "logps/chosen": -251.212890625, - "logps/rejected": -251.37832641601562, - "loss": 0.6925, - "rewards/accuracies": 0.5625, - "rewards/chosen": 0.011251501739025116, - "rewards/margins": 0.0012646990362554789, - "rewards/rejected": 0.00998680293560028, + "logits/chosen": -2.6939728260040283, + "logits/rejected": -2.622387647628784, + "logps/chosen": -272.1611633300781, + "logps/rejected": -270.6854553222656, + "loss": 0.6931, + "rewards/accuracies": 0.4749999940395355, + "rewards/chosen": -0.0003803514118772, + "rewards/margins": 8.866041025612503e-05, + "rewards/rejected": -0.0004690118657890707, "step": 30 }, { "epoch": 0.01, - "grad_norm": 1.9453125, + "grad_norm": 1.6875, "learning_rate": 5.221932114882506e-07, - "logits/chosen": -2.0620596408843994, - "logits/rejected": -2.0244317054748535, - "logps/chosen": -216.23904418945312, - "logps/rejected": -221.69931030273438, - "loss": 0.6915, - "rewards/accuracies": 0.53125, - "rewards/chosen": 0.01871674135327339, - "rewards/margins": 0.003262059763073921, - "rewards/rejected": 0.015454679727554321, + "logits/chosen": -2.5652289390563965, + "logits/rejected": -2.5474331378936768, + "logps/chosen": -235.1596221923828, + "logps/rejected": -240.07699584960938, + "loss": 0.6926, + "rewards/accuracies": 0.5625, + "rewards/chosen": 0.00027505913749337196, + "rewards/margins": 0.001062754774466157, + "rewards/rejected": -0.0007876956951804459, "step": 40 }, { "epoch": 0.01, - "grad_norm": 2.078125, + "grad_norm": 1.65625, "learning_rate": 6.527415143603135e-07, - "logits/chosen": -2.1119301319122314, - "logits/rejected": -2.100111484527588, - "logps/chosen": -266.87158203125, - "logps/rejected": -234.3477325439453, - "loss": 0.6902, - "rewards/accuracies": 0.59375, - "rewards/chosen": 0.02964865043759346, - "rewards/margins": 0.005909620318561792, - "rewards/rejected": 0.023739028722047806, + "logits/chosen": -2.6258225440979004, + "logits/rejected": -2.6688971519470215, + "logps/chosen": -291.9371337890625, + "logps/rejected": -255.8798828125, + "loss": 0.6922, + "rewards/accuracies": 0.606249988079071, + "rewards/chosen": 0.0018380691763013601, + "rewards/margins": 0.0018592171836644411, + "rewards/rejected": -2.1147827283130027e-05, "step": 50 }, { "epoch": 0.02, - "grad_norm": 2.125, + "grad_norm": 1.8046875, "learning_rate": 7.832898172323761e-07, - "logits/chosen": -2.0990371704101562, - "logits/rejected": -1.9420478343963623, - "logps/chosen": -252.28097534179688, - "logps/rejected": -226.6587677001953, - "loss": 0.69, - "rewards/accuracies": 0.59375, - "rewards/chosen": 0.03170455992221832, - "rewards/margins": 0.006424080580472946, - "rewards/rejected": 0.025280479341745377, + "logits/chosen": -2.6339774131774902, + "logits/rejected": -2.42149019241333, + "logps/chosen": -276.26544189453125, + "logps/rejected": -248.28884887695312, + "loss": 0.6918, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.0025974493473768234, + "rewards/margins": 0.0027662336360663176, + "rewards/rejected": -0.00016878444876056165, "step": 60 }, { "epoch": 0.02, - "grad_norm": 2.03125, + "grad_norm": 1.8046875, "learning_rate": 9.138381201044387e-07, - "logits/chosen": -2.2445380687713623, - "logits/rejected": -2.036877393722534, - "logps/chosen": -272.0133361816406, - "logps/rejected": -246.6891326904297, - "loss": 0.6875, - "rewards/accuracies": 0.637499988079071, - "rewards/chosen": 0.041380804032087326, - "rewards/margins": 0.011579836718738079, - "rewards/rejected": 0.029800966382026672, + "logits/chosen": -2.7314560413360596, + "logits/rejected": -2.5053892135620117, + "logps/chosen": -295.3564758300781, + "logps/rejected": -268.2427673339844, + "loss": 0.6913, + "rewards/accuracies": 0.6312500238418579, + "rewards/chosen": 0.0015695008914917707, + "rewards/margins": 0.003719520289450884, + "rewards/rejected": -0.002150018932297826, "step": 70 }, { "epoch": 0.02, - "grad_norm": 2.390625, + "grad_norm": 2.453125, "learning_rate": 1.0443864229765013e-06, - "logits/chosen": -2.1533939838409424, - "logits/rejected": -1.9770574569702148, - "logps/chosen": -257.60333251953125, - "logps/rejected": -246.875244140625, - "loss": 0.6875, + "logits/chosen": -2.676331043243408, + "logits/rejected": -2.466033697128296, + "logps/chosen": -281.6533203125, + "logps/rejected": -268.70001220703125, + "loss": 0.6903, "rewards/accuracies": 0.643750011920929, - "rewards/chosen": 0.037470318377017975, - "rewards/margins": 0.011652106419205666, - "rewards/rejected": 0.02581821382045746, + "rewards/chosen": 0.0021942234598100185, + "rewards/margins": 0.00573298055678606, + "rewards/rejected": -0.003538756398484111, "step": 80 }, { "epoch": 0.02, - "grad_norm": 2.1875, + "grad_norm": 1.8125, "learning_rate": 1.1749347258485642e-06, - "logits/chosen": -2.136107921600342, - "logits/rejected": -2.000105381011963, - "logps/chosen": -250.14208984375, - "logps/rejected": -234.50558471679688, - "loss": 0.6848, - "rewards/accuracies": 0.675000011920929, - "rewards/chosen": 0.04163329675793648, - "rewards/margins": 0.0173010416328907, - "rewards/rejected": 0.024332258850336075, + "logits/chosen": -2.6481051445007324, + "logits/rejected": -2.495612621307373, + "logps/chosen": -272.77105712890625, + "logps/rejected": -253.6960906982422, + "loss": 0.6899, + "rewards/accuracies": 0.6625000238418579, + "rewards/chosen": 0.0026349679101258516, + "rewards/margins": 0.006577133201062679, + "rewards/rejected": -0.003942165989428759, "step": 90 }, { "epoch": 0.03, - "grad_norm": 2.125, + "grad_norm": 1.890625, "learning_rate": 1.305483028720627e-06, - "logits/chosen": -2.1805434226989746, - "logits/rejected": -2.0699386596679688, - "logps/chosen": -246.99072265625, - "logps/rejected": -230.7646026611328, - "loss": 0.6822, - "rewards/accuracies": 0.6812499761581421, - "rewards/chosen": 0.04753277450799942, - "rewards/margins": 0.022709060460329056, - "rewards/rejected": 0.024823706597089767, + "logits/chosen": -2.6623802185058594, + "logits/rejected": -2.56431245803833, + "logps/chosen": -269.0791320800781, + "logps/rejected": -250.7205047607422, + "loss": 0.6896, + "rewards/accuracies": 0.706250011920929, + "rewards/chosen": 0.00583293940871954, + "rewards/margins": 0.0073219239711761475, + "rewards/rejected": -0.0014889847952872515, "step": 100 }, { "epoch": 0.03, - "eval_logits/chosen": -2.0995795726776123, - "eval_logits/rejected": -1.9600573778152466, - "eval_logps/chosen": -259.6322021484375, - "eval_logps/rejected": -241.881103515625, - "eval_loss": 0.6822221875190735, - "eval_rewards/accuracies": 0.6620000004768372, - "eval_rewards/chosen": 0.04995274916291237, - "eval_rewards/margins": 0.02285967767238617, - "eval_rewards/rejected": 0.027093075215816498, - "eval_runtime": 787.2156, - "eval_samples_per_second": 2.541, - "eval_steps_per_second": 0.318, + "eval_logits/chosen": -2.690899610519409, + "eval_logits/rejected": -2.5586190223693848, + "eval_logps/chosen": -284.4092712402344, + "eval_logps/rejected": -263.477294921875, + "eval_loss": 0.6884487271308899, + "eval_rewards/accuracies": 0.6744999885559082, + "eval_rewards/chosen": 0.00724982051178813, + "eval_rewards/margins": 0.009573389776051044, + "eval_rewards/rejected": -0.0023235685657709837, + "eval_runtime": 784.8066, + "eval_samples_per_second": 2.548, + "eval_steps_per_second": 0.319, "step": 100 }, { "epoch": 0.03, - "grad_norm": 2.328125, + "grad_norm": 2.125, "learning_rate": 1.4360313315926894e-06, - "logits/chosen": -2.1463546752929688, - "logits/rejected": -2.0026791095733643, - "logps/chosen": -284.35943603515625, - "logps/rejected": -238.8638153076172, - "loss": 0.6792, - "rewards/accuracies": 0.625, - "rewards/chosen": 0.049624957144260406, - "rewards/margins": 0.029189487919211388, - "rewards/rejected": 0.02043546549975872, + "logits/chosen": -2.632530689239502, + "logits/rejected": -2.5299453735351562, + "logps/chosen": -309.1819763183594, + "logps/rejected": -259.67205810546875, + "loss": 0.6861, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": 0.009178686887025833, + "rewards/margins": 0.01440912764519453, + "rewards/rejected": -0.005230440758168697, "step": 110 }, { "epoch": 0.03, - "grad_norm": 2.15625, + "grad_norm": 1.7578125, "learning_rate": 1.5665796344647521e-06, - "logits/chosen": -2.1936752796173096, - "logits/rejected": -2.0543084144592285, - "logps/chosen": -287.4670104980469, - "logps/rejected": -271.9347229003906, - "loss": 0.6726, - "rewards/accuracies": 0.6875, - "rewards/chosen": 0.055788446217775345, - "rewards/margins": 0.04297556355595589, - "rewards/rejected": 0.012812879867851734, + "logits/chosen": -2.6648287773132324, + "logits/rejected": -2.5776381492614746, + "logps/chosen": -311.6719665527344, + "logps/rejected": -289.6720275878906, + "loss": 0.6842, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.014281374402344227, + "rewards/margins": 0.018208980560302734, + "rewards/rejected": -0.003927607089281082, "step": 120 }, { "epoch": 0.03, - "grad_norm": 2.984375, + "grad_norm": 2.203125, "learning_rate": 1.6971279373368146e-06, - "logits/chosen": -2.207933187484741, - "logits/rejected": -2.118176221847534, - "logps/chosen": -250.0963592529297, - "logps/rejected": -252.5596466064453, - "loss": 0.6703, - "rewards/accuracies": 0.7250000238418579, - "rewards/chosen": 0.050027620047330856, - "rewards/margins": 0.04846165329217911, - "rewards/rejected": 0.0015659643104299903, + "logits/chosen": -2.6542999744415283, + "logits/rejected": -2.6270055770874023, + "logps/chosen": -272.4345397949219, + "logps/rejected": -271.96484375, + "loss": 0.6834, + "rewards/accuracies": 0.71875, + "rewards/chosen": 0.015841448679566383, + "rewards/margins": 0.019856764003634453, + "rewards/rejected": -0.004015314858406782, "step": 130 }, { "epoch": 0.04, - "grad_norm": 2.484375, + "grad_norm": 1.921875, "learning_rate": 1.8276762402088774e-06, - "logits/chosen": -2.244518756866455, - "logits/rejected": -1.909910798072815, - "logps/chosen": -270.50396728515625, - "logps/rejected": -226.2338104248047, - "loss": 0.6687, - "rewards/accuracies": 0.6499999761581421, - "rewards/chosen": 0.04182927682995796, - "rewards/margins": 0.05254059284925461, - "rewards/rejected": -0.010711319744586945, + "logits/chosen": -2.7620275020599365, + "logits/rejected": -2.4180870056152344, + "logps/chosen": -292.2891845703125, + "logps/rejected": -249.4049530029297, + "loss": 0.6847, + "rewards/accuracies": 0.637499988079071, + "rewards/chosen": 0.014797434210777283, + "rewards/margins": 0.017552640289068222, + "rewards/rejected": -0.0027552072424441576, "step": 140 }, { "epoch": 0.04, - "grad_norm": 2.625, + "grad_norm": 2.078125, "learning_rate": 1.9582245430809403e-06, - "logits/chosen": -2.2653160095214844, - "logits/rejected": -2.0392024517059326, - "logps/chosen": -280.30181884765625, - "logps/rejected": -242.7819061279297, - "loss": 0.6678, - "rewards/accuracies": 0.6625000238418579, - "rewards/chosen": 0.03664901480078697, - "rewards/margins": 0.05553766340017319, - "rewards/rejected": -0.018888648599386215, + "logits/chosen": -2.770200252532959, + "logits/rejected": -2.5172462463378906, + "logps/chosen": -299.77508544921875, + "logps/rejected": -258.18353271484375, + "loss": 0.682, + "rewards/accuracies": 0.65625, + "rewards/chosen": 0.019656788557767868, + "rewards/margins": 0.023214135318994522, + "rewards/rejected": -0.003557346761226654, "step": 150 }, { "epoch": 0.04, - "grad_norm": 2.6875, + "grad_norm": 1.9921875, "learning_rate": 2.0887728459530026e-06, - "logits/chosen": -2.155770778656006, - "logits/rejected": -2.0534839630126953, - "logps/chosen": -256.0411071777344, - "logps/rejected": -261.82269287109375, - "loss": 0.669, - "rewards/accuracies": 0.5874999761581421, - "rewards/chosen": 0.0072199017740786076, - "rewards/margins": 0.05537067726254463, - "rewards/rejected": -0.04815077781677246, + "logits/chosen": -2.698042392730713, + "logits/rejected": -2.585510730743408, + "logps/chosen": -275.923095703125, + "logps/rejected": -277.31927490234375, + "loss": 0.683, + "rewards/accuracies": 0.643750011920929, + "rewards/chosen": 0.018199656158685684, + "rewards/margins": 0.021459590643644333, + "rewards/rejected": -0.003259934950619936, "step": 160 }, { "epoch": 0.04, - "grad_norm": 2.921875, + "grad_norm": 2.265625, "learning_rate": 2.2193211488250653e-06, - "logits/chosen": -2.123459815979004, - "logits/rejected": -1.9686686992645264, - "logps/chosen": -220.979248046875, - "logps/rejected": -228.3390655517578, - "loss": 0.6707, - "rewards/accuracies": 0.643750011920929, - "rewards/chosen": -0.004531031008809805, - "rewards/margins": 0.051065556704998016, - "rewards/rejected": -0.05559658259153366, + "logits/chosen": -2.6444950103759766, + "logits/rejected": -2.464858293533325, + "logps/chosen": -237.28213500976562, + "logps/rejected": -239.7051239013672, + "loss": 0.6839, + "rewards/accuracies": 0.668749988079071, + "rewards/chosen": 0.01709694229066372, + "rewards/margins": 0.019580338150262833, + "rewards/rejected": -0.0024833965580910444, "step": 170 }, { "epoch": 0.05, - "grad_norm": 3.125, + "grad_norm": 1.6953125, "learning_rate": 2.3498694516971284e-06, - "logits/chosen": -2.1231114864349365, - "logits/rejected": -1.9875109195709229, - "logps/chosen": -259.0267639160156, - "logps/rejected": -252.4883575439453, - "loss": 0.6639, - "rewards/accuracies": 0.6312500238418579, - "rewards/chosen": -0.03366151079535484, - "rewards/margins": 0.06775667518377304, - "rewards/rejected": -0.10141818225383759, + "logits/chosen": -2.6247406005859375, + "logits/rejected": -2.4786934852600098, + "logps/chosen": -275.9543151855469, + "logps/rejected": -262.0204772949219, + "loss": 0.679, + "rewards/accuracies": 0.65625, + "rewards/chosen": 0.02462857961654663, + "rewards/margins": 0.029734212905168533, + "rewards/rejected": -0.005105632357299328, "step": 180 }, { "epoch": 0.05, - "grad_norm": 3.625, + "grad_norm": 3.34375, "learning_rate": 2.4804177545691907e-06, - "logits/chosen": -2.2462027072906494, - "logits/rejected": -2.0307514667510986, - "logps/chosen": -275.4230041503906, - "logps/rejected": -257.12322998046875, - "loss": 0.6496, - "rewards/accuracies": 0.6499999761581421, - "rewards/chosen": -0.05365932732820511, - "rewards/margins": 0.10096828639507294, - "rewards/rejected": -0.15462762117385864, + "logits/chosen": -2.7437422275543213, + "logits/rejected": -2.514353036880493, + "logps/chosen": -289.6897277832031, + "logps/rejected": -259.26397705078125, + "loss": 0.6762, + "rewards/accuracies": 0.643750011920929, + "rewards/chosen": 0.03337350860238075, + "rewards/margins": 0.035980306565761566, + "rewards/rejected": -0.0026067979633808136, "step": 190 }, { "epoch": 0.05, - "grad_norm": 3.328125, + "grad_norm": 2.140625, "learning_rate": 2.610966057441254e-06, - "logits/chosen": -2.1966991424560547, - "logits/rejected": -1.9566929340362549, - "logps/chosen": -257.5749816894531, - "logps/rejected": -229.8505096435547, - "loss": 0.6491, - "rewards/accuracies": 0.668749988079071, - "rewards/chosen": -0.07897868007421494, - "rewards/margins": 0.10433997958898544, - "rewards/rejected": -0.18331868946552277, + "logits/chosen": -2.741255760192871, + "logits/rejected": -2.47418212890625, + "logps/chosen": -268.6815490722656, + "logps/rejected": -230.81680297851562, + "loss": 0.6699, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": 0.014004526659846306, + "rewards/margins": 0.049270953983068466, + "rewards/rejected": -0.03526642546057701, "step": 200 }, { "epoch": 0.05, - "eval_logits/chosen": -2.0737857818603516, - "eval_logits/rejected": -1.9371289014816284, - "eval_logps/chosen": -270.8282470703125, - "eval_logps/rejected": -261.5014343261719, - "eval_loss": 0.6485820412635803, - "eval_rewards/accuracies": 0.6800000071525574, - "eval_rewards/chosen": -0.06200797110795975, - "eval_rewards/margins": 0.1071021780371666, - "eval_rewards/rejected": -0.16911016404628754, - "eval_runtime": 785.9656, - "eval_samples_per_second": 2.545, - "eval_steps_per_second": 0.318, + "eval_logits/chosen": -2.68129825592041, + "eval_logits/rejected": -2.5495352745056152, + "eval_logps/chosen": -284.310302734375, + "eval_logps/rejected": -266.82989501953125, + "eval_loss": 0.6725865006446838, + "eval_rewards/accuracies": 0.6894999742507935, + "eval_rewards/chosen": 0.008239748887717724, + "eval_rewards/margins": 0.04408977925777435, + "eval_rewards/rejected": -0.035850029438734055, + "eval_runtime": 783.4542, + "eval_samples_per_second": 2.553, + "eval_steps_per_second": 0.319, "step": 200 }, { "epoch": 0.05, - "grad_norm": 3.46875, + "grad_norm": 1.90625, "learning_rate": 2.741514360313316e-06, - "logits/chosen": -2.202221632003784, - "logits/rejected": -1.9849048852920532, - "logps/chosen": -268.0324401855469, - "logps/rejected": -250.7098388671875, - "loss": 0.6324, - "rewards/accuracies": 0.6875, - "rewards/chosen": -0.05872233957052231, - "rewards/margins": 0.14260056614875793, - "rewards/rejected": -0.20132291316986084, + "logits/chosen": -2.6879875659942627, + "logits/rejected": -2.4686179161071777, + "logps/chosen": -277.16839599609375, + "logps/rejected": -251.56875610351562, + "loss": 0.6655, + "rewards/accuracies": 0.7124999761581421, + "rewards/chosen": 0.01113974954932928, + "rewards/margins": 0.05906779691576958, + "rewards/rejected": -0.04792805388569832, "step": 210 }, { "epoch": 0.06, - "grad_norm": 4.03125, + "grad_norm": 1.9140625, "learning_rate": 2.872062663185379e-06, - "logits/chosen": -2.1051223278045654, - "logits/rejected": -1.9894578456878662, - "logps/chosen": -270.9224853515625, - "logps/rejected": -257.1054382324219, - "loss": 0.641, - "rewards/accuracies": 0.6499999761581421, - "rewards/chosen": -0.17000998556613922, - "rewards/margins": 0.12536069750785828, - "rewards/rejected": -0.2953706979751587, + "logits/chosen": -2.657982110977173, + "logits/rejected": -2.5118064880371094, + "logps/chosen": -275.3629150390625, + "logps/rejected": -249.0461883544922, + "loss": 0.6669, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": 0.009381966665387154, + "rewards/margins": 0.058504533022642136, + "rewards/rejected": -0.04912256821990013, "step": 220 }, { "epoch": 0.06, - "grad_norm": 4.375, + "grad_norm": 2.515625, "learning_rate": 3.0026109660574416e-06, - "logits/chosen": -2.2437427043914795, - "logits/rejected": -2.056591033935547, - "logps/chosen": -316.51153564453125, - "logps/rejected": -290.94573974609375, - "loss": 0.6577, - "rewards/accuracies": 0.637499988079071, - "rewards/chosen": -0.10382795333862305, - "rewards/margins": 0.09949944913387299, - "rewards/rejected": -0.20332741737365723, + "logits/chosen": -2.734462261199951, + "logits/rejected": -2.523545265197754, + "logps/chosen": -324.44927978515625, + "logps/rejected": -291.8562316894531, + "loss": 0.6691, + "rewards/accuracies": 0.668749988079071, + "rewards/chosen": 0.009536907076835632, + "rewards/margins": 0.053029656410217285, + "rewards/rejected": -0.04349275305867195, "step": 230 }, { "epoch": 0.06, - "grad_norm": 8.375, + "grad_norm": 1.9609375, "learning_rate": 3.1331592689295043e-06, - "logits/chosen": -2.1568007469177246, - "logits/rejected": -1.968348503112793, - "logps/chosen": -302.8646545410156, - "logps/rejected": -300.64166259765625, - "loss": 0.6449, - "rewards/accuracies": 0.6312500238418579, - "rewards/chosen": -0.06635068356990814, - "rewards/margins": 0.12852035462856293, - "rewards/rejected": -0.19487103819847107, + "logits/chosen": -2.7479310035705566, + "logits/rejected": -2.4911446571350098, + "logps/chosen": -316.1883850097656, + "logps/rejected": -306.96759033203125, + "loss": 0.6644, + "rewards/accuracies": 0.643750011920929, + "rewards/chosen": -0.010969454422593117, + "rewards/margins": 0.06440214812755585, + "rewards/rejected": -0.07537160068750381, "step": 240 }, { "epoch": 0.07, - "grad_norm": 2.96875, + "grad_norm": 1.96875, "learning_rate": 3.263707571801567e-06, - "logits/chosen": -2.1217124462127686, - "logits/rejected": -2.0212414264678955, - "logps/chosen": -281.8119812011719, - "logps/rejected": -271.2024841308594, - "loss": 0.6345, - "rewards/accuracies": 0.731249988079071, - "rewards/chosen": -0.222783163189888, - "rewards/margins": 0.1482577621936798, - "rewards/rejected": -0.3710409700870514, + "logits/chosen": -2.632441997528076, + "logits/rejected": -2.4809885025024414, + "logps/chosen": -281.6483459472656, + "logps/rejected": -259.15728759765625, + "loss": 0.6613, + "rewards/accuracies": 0.706250011920929, + "rewards/chosen": -0.01294863224029541, + "rewards/margins": 0.07276834547519684, + "rewards/rejected": -0.08571697771549225, "step": 250 }, { "epoch": 0.07, - "grad_norm": 3.96875, + "grad_norm": 1.9921875, "learning_rate": 3.3942558746736293e-06, - "logits/chosen": -2.1723222732543945, - "logits/rejected": -1.9684314727783203, - "logps/chosen": -308.781005859375, - "logps/rejected": -295.96539306640625, - "loss": 0.6594, - "rewards/accuracies": 0.675000011920929, - "rewards/chosen": -0.36416196823120117, - "rewards/margins": 0.09235499799251556, - "rewards/rejected": -0.45651698112487793, + "logits/chosen": -2.7068283557891846, + "logits/rejected": -2.490290403366089, + "logps/chosen": -300.8772277832031, + "logps/rejected": -287.4792175292969, + "loss": 0.6643, + "rewards/accuracies": 0.6625000238418579, + "rewards/chosen": -0.0049089426174759865, + "rewards/margins": 0.07015503942966461, + "rewards/rejected": -0.07506398111581802, "step": 260 }, { "epoch": 0.07, - "grad_norm": 3.765625, + "grad_norm": 1.8359375, "learning_rate": 3.524804177545692e-06, - "logits/chosen": -2.0782060623168945, - "logits/rejected": -1.952291488647461, - "logps/chosen": -274.5791320800781, - "logps/rejected": -263.46026611328125, - "loss": 0.6121, - "rewards/accuracies": 0.6812499761581421, - "rewards/chosen": -0.1397680789232254, - "rewards/margins": 0.20285245776176453, - "rewards/rejected": -0.34262052178382874, + "logits/chosen": -2.611229658126831, + "logits/rejected": -2.4650397300720215, + "logps/chosen": -283.0829772949219, + "logps/rejected": -258.4039611816406, + "loss": 0.6393, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": 0.00923145655542612, + "rewards/margins": 0.12286220490932465, + "rewards/rejected": -0.1136307492852211, "step": 270 }, { "epoch": 0.07, - "grad_norm": 4.21875, + "grad_norm": 2.515625, "learning_rate": 3.6553524804177547e-06, - "logits/chosen": -2.1469614505767822, - "logits/rejected": -1.9756981134414673, - "logps/chosen": -274.50927734375, - "logps/rejected": -268.12066650390625, - "loss": 0.6365, - "rewards/accuracies": 0.668749988079071, - "rewards/chosen": -0.15571124851703644, - "rewards/margins": 0.15112467110157013, - "rewards/rejected": -0.30683591961860657, + "logits/chosen": -2.7161974906921387, + "logits/rejected": -2.5690791606903076, + "logps/chosen": -281.7822570800781, + "logps/rejected": -267.12957763671875, + "loss": 0.6531, + "rewards/accuracies": 0.643750011920929, + "rewards/chosen": -0.027930429205298424, + "rewards/margins": 0.09523724019527435, + "rewards/rejected": -0.12316767871379852, "step": 280 }, { "epoch": 0.08, - "grad_norm": 6.9375, + "grad_norm": 2.59375, "learning_rate": 3.7859007832898174e-06, - "logits/chosen": -2.069694995880127, - "logits/rejected": -1.9719445705413818, - "logps/chosen": -307.63958740234375, - "logps/rejected": -303.70440673828125, - "loss": 0.617, - "rewards/accuracies": 0.7124999761581421, - "rewards/chosen": -0.22047266364097595, - "rewards/margins": 0.2119269073009491, - "rewards/rejected": -0.43239957094192505, + "logits/chosen": -2.6117746829986572, + "logits/rejected": -2.537930727005005, + "logps/chosen": -311.3076171875, + "logps/rejected": -297.04058837890625, + "loss": 0.6439, + "rewards/accuracies": 0.6625000238418579, + "rewards/chosen": -0.04281552881002426, + "rewards/margins": 0.11832265555858612, + "rewards/rejected": -0.16113819181919098, "step": 290 }, { "epoch": 0.08, - "grad_norm": 5.125, + "grad_norm": 2.65625, "learning_rate": 3.9164490861618806e-06, - "logits/chosen": -2.1048154830932617, - "logits/rejected": -1.902273416519165, - "logps/chosen": -281.62274169921875, - "logps/rejected": -288.73858642578125, - "loss": 0.6171, - "rewards/accuracies": 0.675000011920929, - "rewards/chosen": -0.3755633234977722, - "rewards/margins": 0.20842130482196808, - "rewards/rejected": -0.5839846134185791, + "logits/chosen": -2.667273998260498, + "logits/rejected": -2.4587292671203613, + "logps/chosen": -275.95428466796875, + "logps/rejected": -270.0225830078125, + "loss": 0.636, + "rewards/accuracies": 0.71875, + "rewards/chosen": -0.06665624678134918, + "rewards/margins": 0.13395485281944275, + "rewards/rejected": -0.2006111443042755, "step": 300 }, { "epoch": 0.08, - "eval_logits/chosen": -2.0253100395202637, - "eval_logits/rejected": -1.89149010181427, - "eval_logps/chosen": -297.7160339355469, - "eval_logps/rejected": -298.03631591796875, - "eval_loss": 0.6234341859817505, - "eval_rewards/accuracies": 0.6725000143051147, - "eval_rewards/chosen": -0.33088573813438416, - "eval_rewards/margins": 0.20357321202754974, - "eval_rewards/rejected": -0.5344589352607727, - "eval_runtime": 785.2004, - "eval_samples_per_second": 2.547, - "eval_steps_per_second": 0.318, + "eval_logits/chosen": -2.6823012828826904, + "eval_logits/rejected": -2.552032947540283, + "eval_logps/chosen": -285.1533508300781, + "eval_logps/rejected": -274.4986877441406, + "eval_loss": 0.6465616226196289, + "eval_rewards/accuracies": 0.6779999732971191, + "eval_rewards/chosen": -0.00019080757920164615, + "eval_rewards/margins": 0.11234673112630844, + "eval_rewards/rejected": -0.11253754794597626, + "eval_runtime": 783.4928, + "eval_samples_per_second": 2.553, + "eval_steps_per_second": 0.319, "step": 300 }, { "epoch": 0.08, - "grad_norm": 4.3125, + "grad_norm": 2.453125, "learning_rate": 4.046997389033943e-06, - "logits/chosen": -2.2486979961395264, - "logits/rejected": -2.0486714839935303, - "logps/chosen": -318.63775634765625, - "logps/rejected": -294.32781982421875, - "loss": 0.5843, - "rewards/accuracies": 0.731249988079071, - "rewards/chosen": -0.2978816330432892, - "rewards/margins": 0.293204665184021, - "rewards/rejected": -0.5910862684249878, + "logits/chosen": -2.8026623725891113, + "logits/rejected": -2.601572036743164, + "logps/chosen": -306.88897705078125, + "logps/rejected": -264.5550231933594, + "loss": 0.627, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": 0.028339838609099388, + "rewards/margins": 0.15499535202980042, + "rewards/rejected": -0.12665551900863647, "step": 310 }, { "epoch": 0.08, - "grad_norm": 4.28125, + "grad_norm": 3.296875, "learning_rate": 4.177545691906005e-06, - "logits/chosen": -2.1338534355163574, - "logits/rejected": -1.9460092782974243, - "logps/chosen": -285.28778076171875, - "logps/rejected": -287.7310791015625, - "loss": 0.6298, - "rewards/accuracies": 0.6625000238418579, - "rewards/chosen": -0.2706644833087921, - "rewards/margins": 0.1947680413722992, - "rewards/rejected": -0.4654324948787689, + "logits/chosen": -2.6788418292999268, + "logits/rejected": -2.460413932800293, + "logps/chosen": -285.0597839355469, + "logps/rejected": -278.0973205566406, + "loss": 0.6448, + "rewards/accuracies": 0.706250011920929, + "rewards/chosen": -0.089615099132061, + "rewards/margins": 0.1233007088303566, + "rewards/rejected": -0.2129158079624176, "step": 320 }, { "epoch": 0.09, - "grad_norm": 6.125, + "grad_norm": 2.78125, "learning_rate": 4.308093994778068e-06, - "logits/chosen": -2.0707926750183105, - "logits/rejected": -1.9247407913208008, - "logps/chosen": -278.6669616699219, - "logps/rejected": -276.16522216796875, - "loss": 0.612, - "rewards/accuracies": 0.6937500238418579, - "rewards/chosen": -0.1918516606092453, - "rewards/margins": 0.2161349505186081, - "rewards/rejected": -0.4079866409301758, + "logits/chosen": -2.6454341411590576, + "logits/rejected": -2.417539119720459, + "logps/chosen": -291.4884338378906, + "logps/rejected": -275.2319641113281, + "loss": 0.6474, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -0.10766121000051498, + "rewards/margins": 0.11833895742893219, + "rewards/rejected": -0.22600015997886658, "step": 330 }, { "epoch": 0.09, - "grad_norm": 4.40625, + "grad_norm": 2.765625, "learning_rate": 4.4386422976501306e-06, - "logits/chosen": -2.135446548461914, - "logits/rejected": -2.021030902862549, - "logps/chosen": -307.82275390625, - "logps/rejected": -314.97894287109375, - "loss": 0.5922, - "rewards/accuracies": 0.71875, - "rewards/chosen": -0.18191352486610413, - "rewards/margins": 0.2853655517101288, - "rewards/rejected": -0.4672791063785553, + "logits/chosen": -2.6752266883850098, + "logits/rejected": -2.5314574241638184, + "logps/chosen": -311.11553955078125, + "logps/rejected": -302.4649963378906, + "loss": 0.6335, + "rewards/accuracies": 0.706250011920929, + "rewards/chosen": -0.005945128854364157, + "rewards/margins": 0.15127454698085785, + "rewards/rejected": -0.15721969306468964, "step": 340 }, { "epoch": 0.09, - "grad_norm": 6.96875, + "grad_norm": 3.15625, "learning_rate": 4.569190600522193e-06, - "logits/chosen": -2.052945613861084, - "logits/rejected": -1.905386209487915, - "logps/chosen": -331.6342468261719, - "logps/rejected": -345.2201232910156, - "loss": 0.6196, - "rewards/accuracies": 0.6875, - "rewards/chosen": -0.42430585622787476, - "rewards/margins": 0.25065261125564575, - "rewards/rejected": -0.6749585270881653, + "logits/chosen": -2.5868887901306152, + "logits/rejected": -2.4233832359313965, + "logps/chosen": -323.6490173339844, + "logps/rejected": -323.33380126953125, + "loss": 0.6437, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -0.11508532613515854, + "rewards/margins": 0.13315364718437195, + "rewards/rejected": -0.2482389658689499, "step": 350 }, { "epoch": 0.09, - "grad_norm": 13.75, + "grad_norm": 2.515625, "learning_rate": 4.699738903394257e-06, - "logits/chosen": -2.0188870429992676, - "logits/rejected": -1.9744517803192139, - "logps/chosen": -304.7213439941406, - "logps/rejected": -316.82366943359375, - "loss": 0.5877, - "rewards/accuracies": 0.699999988079071, - "rewards/chosen": -0.6372613906860352, - "rewards/margins": 0.33024388551712036, - "rewards/rejected": -0.9675053358078003, + "logits/chosen": -2.5959036350250244, + "logits/rejected": -2.5964016914367676, + "logps/chosen": -274.69927978515625, + "logps/rejected": -269.53741455078125, + "loss": 0.632, + "rewards/accuracies": 0.731249988079071, + "rewards/chosen": -0.15621377527713776, + "rewards/margins": 0.15650849044322968, + "rewards/rejected": -0.31272226572036743, "step": 360 }, { "epoch": 0.1, - "grad_norm": 7.21875, + "grad_norm": 3.203125, "learning_rate": 4.8302872062663196e-06, - "logits/chosen": -2.104248523712158, - "logits/rejected": -1.9289659261703491, - "logps/chosen": -367.14837646484375, - "logps/rejected": -349.7784118652344, - "loss": 0.6189, - "rewards/accuracies": 0.675000011920929, - "rewards/chosen": -0.9026868939399719, - "rewards/margins": 0.2647797465324402, - "rewards/rejected": -1.167466640472412, + "logits/chosen": -2.661418914794922, + "logits/rejected": -2.4857161045074463, + "logps/chosen": -309.907958984375, + "logps/rejected": -284.32794189453125, + "loss": 0.613, + "rewards/accuracies": 0.71875, + "rewards/chosen": -0.1116505041718483, + "rewards/margins": 0.20645570755004883, + "rewards/rejected": -0.3181062340736389, "step": 370 }, { "epoch": 0.1, - "grad_norm": 5.0625, + "grad_norm": 4.1875, "learning_rate": 4.9608355091383814e-06, - "logits/chosen": -2.0765578746795654, - "logits/rejected": -1.8675994873046875, - "logps/chosen": -378.85736083984375, - "logps/rejected": -368.45098876953125, - "loss": 0.6225, - "rewards/accuracies": 0.7124999761581421, - "rewards/chosen": -0.808873176574707, - "rewards/margins": 0.22909462451934814, - "rewards/rejected": -1.0379678010940552, + "logits/chosen": -2.6475014686584473, + "logits/rejected": -2.4438838958740234, + "logps/chosen": -330.4369812011719, + "logps/rejected": -315.2841796875, + "loss": 0.6096, + "rewards/accuracies": 0.706250011920929, + "rewards/chosen": -0.11033549159765244, + "rewards/margins": 0.22602955996990204, + "rewards/rejected": -0.3363650441169739, "step": 380 }, { "epoch": 0.1, - "grad_norm": 6.21875, + "grad_norm": 3.671875, "learning_rate": 4.9999488562447675e-06, - "logits/chosen": -2.078608989715576, - "logits/rejected": -1.9628406763076782, - "logps/chosen": -334.11102294921875, - "logps/rejected": -345.5013427734375, - "loss": 0.5891, - "rewards/accuracies": 0.6625000238418579, - "rewards/chosen": -0.5315781831741333, - "rewards/margins": 0.314100444316864, - "rewards/rejected": -0.8456785082817078, + "logits/chosen": -2.6455321311950684, + "logits/rejected": -2.537829875946045, + "logps/chosen": -308.32122802734375, + "logps/rejected": -309.0108947753906, + "loss": 0.6092, + "rewards/accuracies": 0.6937500238418579, + "rewards/chosen": -0.06899285316467285, + "rewards/margins": 0.21975460648536682, + "rewards/rejected": -0.2887474298477173, "step": 390 }, { "epoch": 0.1, - "grad_norm": 6.28125, + "grad_norm": 3.125, "learning_rate": 4.999698361256577e-06, - "logits/chosen": -2.1007919311523438, - "logits/rejected": -1.8630239963531494, - "logps/chosen": -317.9114685058594, - "logps/rejected": -301.27923583984375, - "loss": 0.6176, - "rewards/accuracies": 0.6937500238418579, - "rewards/chosen": -0.5248637199401855, - "rewards/margins": 0.2398781031370163, - "rewards/rejected": -0.7647417783737183, + "logits/chosen": -2.740412712097168, + "logits/rejected": -2.414173126220703, + "logps/chosen": -291.7049255371094, + "logps/rejected": -265.08746337890625, + "loss": 0.6312, + "rewards/accuracies": 0.6812499761581421, + "rewards/chosen": -0.0857996717095375, + "rewards/margins": 0.16563723981380463, + "rewards/rejected": -0.2514369487762451, "step": 400 }, { "epoch": 0.1, - "eval_logits/chosen": -1.9461418390274048, - "eval_logits/rejected": -1.8181438446044922, - "eval_logps/chosen": -339.5647888183594, - "eval_logps/rejected": -351.81793212890625, - "eval_loss": 0.5987982749938965, - "eval_rewards/accuracies": 0.6809999942779541, - "eval_rewards/chosen": -0.7493733167648315, - "eval_rewards/margins": 0.3229018449783325, - "eval_rewards/rejected": -1.0722752809524536, - "eval_runtime": 786.5559, - "eval_samples_per_second": 2.543, - "eval_steps_per_second": 0.318, + "eval_logits/chosen": -2.652951717376709, + "eval_logits/rejected": -2.5250518321990967, + "eval_logps/chosen": -306.51312255859375, + "eval_logps/rejected": -305.46551513671875, + "eval_loss": 0.6190703511238098, + "eval_rewards/accuracies": 0.6804999709129333, + "eval_rewards/chosen": -0.21378836035728455, + "eval_rewards/margins": 0.20841743052005768, + "eval_rewards/rejected": -0.4222058057785034, + "eval_runtime": 783.195, + "eval_samples_per_second": 2.554, + "eval_steps_per_second": 0.319, "step": 400 }, { "epoch": 0.11, - "grad_norm": 4.78125, + "grad_norm": 3.265625, "learning_rate": 4.999239142174581e-06, - "logits/chosen": -1.9640734195709229, - "logits/rejected": -1.9052097797393799, - "logps/chosen": -312.40789794921875, - "logps/rejected": -328.6429138183594, - "loss": 0.6524, - "rewards/accuracies": 0.6499999761581421, - "rewards/chosen": -0.7017174959182739, - "rewards/margins": 0.1795380413532257, - "rewards/rejected": -0.8812555074691772, + "logits/chosen": -2.567608594894409, + "logits/rejected": -2.524784803390503, + "logps/chosen": -282.33831787109375, + "logps/rejected": -294.5062255859375, + "loss": 0.6538, + "rewards/accuracies": 0.606249988079071, + "rewards/chosen": -0.22671082615852356, + "rewards/margins": 0.12887807190418243, + "rewards/rejected": -0.3555889129638672, "step": 410 }, { "epoch": 0.11, - "grad_norm": 4.875, + "grad_norm": 3.59375, "learning_rate": 4.99857123734344e-06, - "logits/chosen": -1.9825359582901, - "logits/rejected": -1.8627588748931885, - "logps/chosen": -268.44232177734375, - "logps/rejected": -290.1076354980469, - "loss": 0.5912, - "rewards/accuracies": 0.699999988079071, - "rewards/chosen": -0.3973512053489685, - "rewards/margins": 0.3069622218608856, - "rewards/rejected": -0.7043134570121765, + "logits/chosen": -2.568328619003296, + "logits/rejected": -2.4854254722595215, + "logps/chosen": -258.9724426269531, + "logps/rejected": -273.19512939453125, + "loss": 0.6095, + "rewards/accuracies": 0.7124999761581421, + "rewards/chosen": -0.1116761714220047, + "rewards/margins": 0.22403624653816223, + "rewards/rejected": -0.33571237325668335, "step": 420 }, { "epoch": 0.11, - "grad_norm": 5.5, + "grad_norm": 3.75, "learning_rate": 4.997694702533016e-06, - "logits/chosen": -1.9619979858398438, - "logits/rejected": -1.9021638631820679, - "logps/chosen": -330.4411315917969, - "logps/rejected": -346.8544921875, - "loss": 0.5752, - "rewards/accuracies": 0.6499999761581421, - "rewards/chosen": -0.5297900438308716, - "rewards/margins": 0.39973610639572144, - "rewards/rejected": -0.9295262098312378, + "logits/chosen": -2.59728741645813, + "logits/rejected": -2.569671154022217, + "logps/chosen": -307.2120361328125, + "logps/rejected": -310.6186828613281, + "loss": 0.5874, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -0.08464425802230835, + "rewards/margins": 0.27853769063949585, + "rewards/rejected": -0.3631819784641266, "step": 430 }, { "epoch": 0.12, - "grad_norm": 6.96875, + "grad_norm": 3.5625, "learning_rate": 4.996609610933713e-06, - "logits/chosen": -2.0568199157714844, - "logits/rejected": -1.9730806350708008, - "logps/chosen": -315.17816162109375, - "logps/rejected": -321.71881103515625, - "loss": 0.593, - "rewards/accuracies": 0.7124999761581421, - "rewards/chosen": -0.43069157004356384, - "rewards/margins": 0.34868526458740234, - "rewards/rejected": -0.7793768644332886, + "logits/chosen": -2.708739757537842, + "logits/rejected": -2.613562822341919, + "logps/chosen": -300.5826721191406, + "logps/rejected": -295.70538330078125, + "loss": 0.6024, + "rewards/accuracies": 0.6875, + "rewards/chosen": -0.11334341764450073, + "rewards/margins": 0.25627994537353516, + "rewards/rejected": -0.3696233332157135, "step": 440 }, { "epoch": 0.12, - "grad_norm": 9.375, + "grad_norm": 5.125, "learning_rate": 4.995316053150366e-06, - "logits/chosen": -1.9076818227767944, - "logits/rejected": -1.787350058555603, - "logps/chosen": -321.8026428222656, - "logps/rejected": -348.10693359375, - "loss": 0.5363, - "rewards/accuracies": 0.7749999761581421, - "rewards/chosen": -0.5402087569236755, - "rewards/margins": 0.49322277307510376, - "rewards/rejected": -1.0334315299987793, + "logits/chosen": -2.5639803409576416, + "logits/rejected": -2.3998026847839355, + "logps/chosen": -303.77288818359375, + "logps/rejected": -300.2038269042969, + "loss": 0.5949, + "rewards/accuracies": 0.6937500238418579, + "rewards/chosen": -0.10397627204656601, + "rewards/margins": 0.2747548520565033, + "rewards/rejected": -0.3787311017513275, "step": 450 }, { "epoch": 0.12, - "grad_norm": 9.125, + "grad_norm": 5.90625, "learning_rate": 4.9938141371946815e-06, - "logits/chosen": -1.8639757633209229, - "logits/rejected": -1.7780717611312866, - "logps/chosen": -429.0907287597656, - "logps/rejected": -459.61004638671875, - "loss": 0.5738, - "rewards/accuracies": 0.7250000238418579, - "rewards/chosen": -1.6404383182525635, - "rewards/margins": 0.44994354248046875, - "rewards/rejected": -2.0903818607330322, + "logits/chosen": -2.6004934310913086, + "logits/rejected": -2.5245566368103027, + "logps/chosen": -318.1405944824219, + "logps/rejected": -328.95794677734375, + "loss": 0.6037, + "rewards/accuracies": 0.7124999761581421, + "rewards/chosen": -0.3281031847000122, + "rewards/margins": 0.25530144572257996, + "rewards/rejected": -0.5834046006202698, "step": 460 }, { "epoch": 0.12, - "grad_norm": 5.5, + "grad_norm": 2.8125, "learning_rate": 4.992103988476206e-06, - "logits/chosen": -1.9022108316421509, - "logits/rejected": -1.7589776515960693, - "logps/chosen": -382.638916015625, - "logps/rejected": -413.98199462890625, - "loss": 0.5783, - "rewards/accuracies": 0.6875, - "rewards/chosen": -1.3886549472808838, - "rewards/margins": 0.4382164478302002, - "rewards/rejected": -1.8268712759017944, + "logits/chosen": -2.627959966659546, + "logits/rejected": -2.482374429702759, + "logps/chosen": -286.36962890625, + "logps/rejected": -298.1661071777344, + "loss": 0.597, + "rewards/accuracies": 0.71875, + "rewards/chosen": -0.23781995475292206, + "rewards/margins": 0.27273446321487427, + "rewards/rejected": -0.5105544328689575, "step": 470 }, { "epoch": 0.13, - "grad_norm": 4.53125, + "grad_norm": 3.0625, "learning_rate": 4.990185749791866e-06, - "logits/chosen": -1.9298263788223267, - "logits/rejected": -1.7964195013046265, - "logps/chosen": -321.39947509765625, - "logps/rejected": -372.30950927734375, - "loss": 0.5534, - "rewards/accuracies": 0.668749988079071, - "rewards/chosen": -0.6453030705451965, - "rewards/margins": 0.49152326583862305, - "rewards/rejected": -1.1368262767791748, + "logits/chosen": -2.6001830101013184, + "logits/rejected": -2.4575042724609375, + "logps/chosen": -306.5142822265625, + "logps/rejected": -333.7895202636719, + "loss": 0.5985, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.2919207811355591, + "rewards/margins": 0.2878710925579071, + "rewards/rejected": -0.5797919034957886, "step": 480 }, { "epoch": 0.13, - "grad_norm": 7.78125, + "grad_norm": 4.75, "learning_rate": 4.9880595813140395e-06, - "logits/chosen": -1.9634357690811157, - "logits/rejected": -1.8230020999908447, - "logps/chosen": -348.3260192871094, - "logps/rejected": -362.29718017578125, - "loss": 0.5661, - "rewards/accuracies": 0.6937500238418579, - "rewards/chosen": -0.6348981857299805, - "rewards/margins": 0.4430553913116455, - "rewards/rejected": -1.077953577041626, + "logits/chosen": -2.666719913482666, + "logits/rejected": -2.5408244132995605, + "logps/chosen": -349.4113464355469, + "logps/rejected": -340.92974853515625, + "loss": 0.5848, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -0.3812797963619232, + "rewards/margins": 0.3167892098426819, + "rewards/rejected": -0.6980689764022827, "step": 490 }, { "epoch": 0.13, - "grad_norm": 6.21875, + "grad_norm": 2.703125, "learning_rate": 4.985725660577184e-06, - "logits/chosen": -1.8814575672149658, - "logits/rejected": -1.7436174154281616, - "logps/chosen": -387.90081787109375, - "logps/rejected": -399.5475769042969, - "loss": 0.5761, - "rewards/accuracies": 0.675000011920929, - "rewards/chosen": -1.141646146774292, - "rewards/margins": 0.5287792682647705, - "rewards/rejected": -1.6704254150390625, + "logits/chosen": -2.6188435554504395, + "logits/rejected": -2.534493923187256, + "logps/chosen": -328.07855224609375, + "logps/rejected": -317.0086975097656, + "loss": 0.5918, + "rewards/accuracies": 0.6812499761581421, + "rewards/chosen": -0.34686478972435, + "rewards/margins": 0.3155154585838318, + "rewards/rejected": -0.6623803377151489, "step": 500 }, { "epoch": 0.13, - "eval_logits/chosen": -1.7681965827941895, - "eval_logits/rejected": -1.642921805381775, - "eval_logps/chosen": -384.94061279296875, - "eval_logps/rejected": -417.0115661621094, - "eval_loss": 0.5704278945922852, - "eval_rewards/accuracies": 0.6924999952316284, - "eval_rewards/chosen": -1.2031314373016357, - "eval_rewards/margins": 0.5210796594619751, - "eval_rewards/rejected": -1.7242112159729004, - "eval_runtime": 784.8791, - "eval_samples_per_second": 2.548, + "eval_logits/chosen": -2.6574668884277344, + "eval_logits/rejected": -2.5298194885253906, + "eval_logps/chosen": -311.40576171875, + "eval_logps/rejected": -317.3648681640625, + "eval_loss": 0.6031413078308105, + "eval_rewards/accuracies": 0.6880000233650208, + "eval_rewards/chosen": -0.2627146244049072, + "eval_rewards/margins": 0.27848491072654724, + "eval_rewards/rejected": -0.5411995649337769, + "eval_runtime": 782.9368, + "eval_samples_per_second": 2.554, "eval_steps_per_second": 0.319, "step": 500 }, { "epoch": 0.13, - "grad_norm": 5.6875, + "grad_norm": 4.28125, "learning_rate": 4.983184182463009e-06, - "logits/chosen": -1.829919457435608, - "logits/rejected": -1.7273536920547485, - "logps/chosen": -387.8377685546875, - "logps/rejected": -412.5938415527344, - "loss": 0.5492, - "rewards/accuracies": 0.75, - "rewards/chosen": -1.0962493419647217, - "rewards/margins": 0.6178775429725647, - "rewards/rejected": -1.7141268253326416, + "logits/chosen": -2.595390796661377, + "logits/rejected": -2.52022385597229, + "logps/chosen": -321.63763427734375, + "logps/rejected": -315.06890869140625, + "loss": 0.5835, + "rewards/accuracies": 0.731249988079071, + "rewards/chosen": -0.24320808053016663, + "rewards/margins": 0.33513620495796204, + "rewards/rejected": -0.5783442854881287, "step": 510 }, { "epoch": 0.14, - "grad_norm": 8.5625, + "grad_norm": 4.78125, "learning_rate": 4.980435359184203e-06, - "logits/chosen": -1.8658363819122314, - "logits/rejected": -1.8036350011825562, - "logps/chosen": -346.643310546875, - "logps/rejected": -370.98773193359375, - "loss": 0.6123, + "logits/chosen": -2.625418186187744, + "logits/rejected": -2.5462870597839355, + "logps/chosen": -318.16717529296875, + "logps/rejected": -326.4075622558594, + "loss": 0.6142, "rewards/accuracies": 0.699999988079071, - "rewards/chosen": -0.7361586689949036, - "rewards/margins": 0.4059801697731018, - "rewards/rejected": -1.1421388387680054, + "rewards/chosen": -0.2785532474517822, + "rewards/margins": 0.2635067105293274, + "rewards/rejected": -0.5420600175857544, "step": 520 }, { "epoch": 0.14, - "grad_norm": 6.0, + "grad_norm": 3.78125, "learning_rate": 4.9774794202667236e-06, - "logits/chosen": -1.8419468402862549, - "logits/rejected": -1.788932204246521, - "logps/chosen": -324.2068786621094, - "logps/rejected": -378.1756286621094, - "loss": 0.5749, - "rewards/accuracies": 0.706250011920929, - "rewards/chosen": -0.6313104033470154, - "rewards/margins": 0.4553147852420807, - "rewards/rejected": -1.0866252183914185, + "logits/chosen": -2.6040523052215576, + "logits/rejected": -2.4890506267547607, + "logps/chosen": -340.12603759765625, + "logps/rejected": -378.34368896484375, + "loss": 0.5922, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -0.572872519493103, + "rewards/margins": 0.31255173683166504, + "rewards/rejected": -0.8854243159294128, "step": 530 }, { "epoch": 0.14, - "grad_norm": 6.875, + "grad_norm": 2.984375, "learning_rate": 4.974316612530615e-06, - "logits/chosen": -1.7697532176971436, - "logits/rejected": -1.6131279468536377, - "logps/chosen": -369.4639587402344, - "logps/rejected": -392.1553955078125, - "loss": 0.4874, - "rewards/accuracies": 0.8062499761581421, - "rewards/chosen": -0.9103059768676758, - "rewards/margins": 0.6944985389709473, - "rewards/rejected": -1.6048046350479126, + "logits/chosen": -2.5228214263916016, + "logits/rejected": -2.3871700763702393, + "logps/chosen": -377.46466064453125, + "logps/rejected": -362.720458984375, + "loss": 0.5955, + "rewards/accuracies": 0.7437499761581421, + "rewards/chosen": -0.757167637348175, + "rewards/margins": 0.2707655131816864, + "rewards/rejected": -1.027933120727539, "step": 540 }, { "epoch": 0.14, - "grad_norm": 13.75, + "grad_norm": 4.71875, "learning_rate": 4.970947200069416e-06, - "logits/chosen": -1.7397000789642334, - "logits/rejected": -1.676129698753357, - "logps/chosen": -375.16485595703125, - "logps/rejected": -407.05413818359375, - "loss": 0.593, - "rewards/accuracies": 0.706250011920929, - "rewards/chosen": -0.9668623208999634, - "rewards/margins": 0.5175089836120605, - "rewards/rejected": -1.4843711853027344, + "logits/chosen": -2.5291991233825684, + "logits/rejected": -2.488736152648926, + "logps/chosen": -376.4600524902344, + "logps/rejected": -375.04119873046875, + "loss": 0.6232, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -0.7454890608787537, + "rewards/margins": 0.21865490078926086, + "rewards/rejected": -0.9641439318656921, "step": 550 }, { "epoch": 0.15, - "grad_norm": 4.9375, + "grad_norm": 3.34375, "learning_rate": 4.967371464228096e-06, - "logits/chosen": -1.9172483682632446, - "logits/rejected": -1.829637885093689, - "logps/chosen": -343.5597839355469, - "logps/rejected": -391.8153076171875, - "loss": 0.5654, - "rewards/accuracies": 0.731249988079071, - "rewards/chosen": -0.8610507845878601, - "rewards/margins": 0.4784216284751892, - "rewards/rejected": -1.3394721746444702, + "logits/chosen": -2.659289598464966, + "logits/rejected": -2.6029088497161865, + "logps/chosen": -328.5123291015625, + "logps/rejected": -361.32269287109375, + "loss": 0.5842, + "rewards/accuracies": 0.7562500238418579, + "rewards/chosen": -0.5446836948394775, + "rewards/margins": 0.3293144106864929, + "rewards/rejected": -0.8739980459213257, "step": 560 }, { "epoch": 0.15, - "grad_norm": 7.09375, + "grad_norm": 4.125, "learning_rate": 4.963589703579569e-06, - "logits/chosen": -2.0134716033935547, - "logits/rejected": -1.8797743320465088, - "logps/chosen": -384.7007751464844, - "logps/rejected": -398.67132568359375, - "loss": 0.5697, - "rewards/accuracies": 0.675000011920929, - "rewards/chosen": -0.8466698527336121, - "rewards/margins": 0.4867705702781677, - "rewards/rejected": -1.3334404230117798, + "logits/chosen": -2.742898941040039, + "logits/rejected": -2.624717950820923, + "logps/chosen": -358.6355895996094, + "logps/rejected": -352.4152526855469, + "loss": 0.5988, + "rewards/accuracies": 0.6625000238418579, + "rewards/chosen": -0.4009895920753479, + "rewards/margins": 0.3117690086364746, + "rewards/rejected": -0.7127586007118225, "step": 570 }, { "epoch": 0.15, - "grad_norm": 9.375, + "grad_norm": 3.90625, "learning_rate": 4.9596022338997615e-06, - "logits/chosen": -1.9887676239013672, - "logits/rejected": -1.7658201456069946, - "logps/chosen": -392.9250183105469, - "logps/rejected": -410.36273193359375, - "loss": 0.558, - "rewards/accuracies": 0.699999988079071, - "rewards/chosen": -0.9515495300292969, - "rewards/margins": 0.585209846496582, - "rewards/rejected": -1.5367592573165894, + "logits/chosen": -2.770641803741455, + "logits/rejected": -2.534297466278076, + "logps/chosen": -355.1096496582031, + "logps/rejected": -343.6912536621094, + "loss": 0.5891, + "rewards/accuracies": 0.65625, + "rewards/chosen": -0.375056654214859, + "rewards/margins": 0.3282235562801361, + "rewards/rejected": -0.7032800912857056, "step": 580 }, { "epoch": 0.15, - "grad_norm": 7.28125, + "grad_norm": 4.3125, "learning_rate": 4.955409388141243e-06, - "logits/chosen": -1.829351782798767, - "logits/rejected": -1.7153953313827515, - "logps/chosen": -358.69146728515625, - "logps/rejected": -381.8030700683594, - "loss": 0.5971, - "rewards/accuracies": 0.7124999761581421, - "rewards/chosen": -1.0141375064849854, - "rewards/margins": 0.46766194701194763, - "rewards/rejected": -1.4817993640899658, + "logits/chosen": -2.6562209129333496, + "logits/rejected": -2.553755044937134, + "logps/chosen": -338.35784912109375, + "logps/rejected": -340.85028076171875, + "loss": 0.608, + "rewards/accuracies": 0.668749988079071, + "rewards/chosen": -0.604152500629425, + "rewards/margins": 0.2910125255584717, + "rewards/rejected": -0.895164966583252, "step": 590 }, { "epoch": 0.16, - "grad_norm": 5.28125, + "grad_norm": 3.78125, "learning_rate": 4.951011516405429e-06, - "logits/chosen": -1.8711599111557007, - "logits/rejected": -1.7997047901153564, - "logps/chosen": -335.1069641113281, - "logps/rejected": -371.37860107421875, - "loss": 0.5583, - "rewards/accuracies": 0.75, - "rewards/chosen": -0.8434522747993469, - "rewards/margins": 0.5126376748085022, - "rewards/rejected": -1.3560899496078491, + "logits/chosen": -2.64245343208313, + "logits/rejected": -2.5466177463531494, + "logps/chosen": -334.6476135253906, + "logps/rejected": -347.5552673339844, + "loss": 0.6012, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -0.6437972187995911, + "rewards/margins": 0.31083759665489197, + "rewards/rejected": -0.9546347856521606, "step": 600 }, { "epoch": 0.16, - "eval_logits/chosen": -1.7496064901351929, - "eval_logits/rejected": -1.6199465990066528, - "eval_logps/chosen": -347.5315856933594, - "eval_logps/rejected": -376.3094482421875, - "eval_loss": 0.5622454881668091, - "eval_rewards/accuracies": 0.7055000066757202, - "eval_rewards/chosen": -0.8290411233901978, - "eval_rewards/margins": 0.48814910650253296, - "eval_rewards/rejected": -1.317190170288086, - "eval_runtime": 784.8787, - "eval_samples_per_second": 2.548, + "eval_logits/chosen": -2.6737217903137207, + "eval_logits/rejected": -2.5442543029785156, + "eval_logps/chosen": -336.4282531738281, + "eval_logps/rejected": -348.7829284667969, + "eval_loss": 0.5927833914756775, + "eval_rewards/accuracies": 0.6934999823570251, + "eval_rewards/chosen": -0.512939989566803, + "eval_rewards/margins": 0.34244006872177124, + "eval_rewards/rejected": -0.8553800582885742, + "eval_runtime": 783.6117, + "eval_samples_per_second": 2.552, "eval_steps_per_second": 0.319, "step": 600 }, { "epoch": 0.16, - "grad_norm": 12.25, + "grad_norm": 3.6875, "learning_rate": 4.946408985913344e-06, - "logits/chosen": -1.776925802230835, - "logits/rejected": -1.6997029781341553, - "logps/chosen": -318.4737854003906, - "logps/rejected": -362.5947265625, - "loss": 0.5243, - "rewards/accuracies": 0.699999988079071, - "rewards/chosen": -0.7264784574508667, - "rewards/margins": 0.6186612844467163, - "rewards/rejected": -1.345139741897583, + "logits/chosen": -2.576265811920166, + "logits/rejected": -2.532116413116455, + "logps/chosen": -298.76788330078125, + "logps/rejected": -316.5951232910156, + "loss": 0.5703, + "rewards/accuracies": 0.668749988079071, + "rewards/chosen": -0.31805160641670227, + "rewards/margins": 0.38874995708465576, + "rewards/rejected": -0.7068015933036804, "step": 610 }, { "epoch": 0.16, - "grad_norm": 10.625, + "grad_norm": 4.21875, "learning_rate": 4.941602180974958e-06, - "logits/chosen": -1.772719383239746, - "logits/rejected": -1.5400515794754028, - "logps/chosen": -380.8660583496094, - "logps/rejected": -393.8148498535156, - "loss": 0.5317, - "rewards/accuracies": 0.7562500238418579, - "rewards/chosen": -0.9309979677200317, - "rewards/margins": 0.7158006429672241, - "rewards/rejected": -1.6467987298965454, + "logits/chosen": -2.6575896739959717, + "logits/rejected": -2.4126646518707275, + "logps/chosen": -342.2276306152344, + "logps/rejected": -311.13140869140625, + "loss": 0.5937, + "rewards/accuracies": 0.71875, + "rewards/chosen": -0.34147003293037415, + "rewards/margins": 0.3287685513496399, + "rewards/rejected": -0.6702385544776917, "step": 620 }, { "epoch": 0.16, - "grad_norm": 12.5, + "grad_norm": 3.53125, "learning_rate": 4.936591502957101e-06, - "logits/chosen": -1.7443783283233643, - "logits/rejected": -1.5749586820602417, - "logps/chosen": -344.0816955566406, - "logps/rejected": -416.293701171875, - "loss": 0.5269, - "rewards/accuracies": 0.7250000238418579, - "rewards/chosen": -0.9601060748100281, - "rewards/margins": 0.8244937062263489, - "rewards/rejected": -1.7845996618270874, + "logits/chosen": -2.6380743980407715, + "logits/rejected": -2.4174392223358154, + "logps/chosen": -301.11102294921875, + "logps/rejected": -339.44720458984375, + "loss": 0.5432, + "rewards/accuracies": 0.731249988079071, + "rewards/chosen": -0.3551376461982727, + "rewards/margins": 0.47654080390930176, + "rewards/rejected": -0.8316785097122192, "step": 630 }, { "epoch": 0.17, - "grad_norm": 11.75, + "grad_norm": 4.71875, "learning_rate": 4.931377370249946e-06, - "logits/chosen": -1.7829558849334717, - "logits/rejected": -1.5439743995666504, - "logps/chosen": -400.5958557128906, - "logps/rejected": -443.45831298828125, - "loss": 0.5513, - "rewards/accuracies": 0.699999988079071, - "rewards/chosen": -1.3533570766448975, - "rewards/margins": 0.6858088374137878, - "rewards/rejected": -2.03916597366333, + "logits/chosen": -2.655895233154297, + "logits/rejected": -2.344895362854004, + "logps/chosen": -347.9822082519531, + "logps/rejected": -362.0487060546875, + "loss": 0.5791, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -0.6452215909957886, + "rewards/margins": 0.3807607591152191, + "rewards/rejected": -1.0259822607040405, "step": 640 }, { "epoch": 0.17, - "grad_norm": 12.5, + "grad_norm": 5.1875, "learning_rate": 4.925960218232073e-06, - "logits/chosen": -1.7807738780975342, - "logits/rejected": -1.6571195125579834, - "logps/chosen": -370.91845703125, - "logps/rejected": -435.66064453125, - "loss": 0.5571, - "rewards/accuracies": 0.6812499761581421, - "rewards/chosen": -1.1564347743988037, - "rewards/margins": 0.7427167296409607, - "rewards/rejected": -1.8991515636444092, + "logits/chosen": -2.584510326385498, + "logits/rejected": -2.451813220977783, + "logps/chosen": -321.987548828125, + "logps/rejected": -358.29327392578125, + "loss": 0.5533, + "rewards/accuracies": 0.731249988079071, + "rewards/chosen": -0.4966830313205719, + "rewards/margins": 0.47680535912513733, + "rewards/rejected": -0.973488450050354, "step": 650 }, { "epoch": 0.17, - "grad_norm": 11.9375, + "grad_norm": 7.5, "learning_rate": 4.920340499234116e-06, - "logits/chosen": -1.7504314184188843, - "logits/rejected": -1.5074818134307861, - "logps/chosen": -374.42523193359375, - "logps/rejected": -391.95501708984375, - "loss": 0.5983, + "logits/chosen": -2.5348637104034424, + "logits/rejected": -2.209812641143799, + "logps/chosen": -330.370361328125, + "logps/rejected": -326.264404296875, + "loss": 0.6019, "rewards/accuracies": 0.6625000238418579, - "rewards/chosen": -1.0853469371795654, - "rewards/margins": 0.5263570547103882, - "rewards/rejected": -1.6117041110992432, + "rewards/chosen": -0.41889137029647827, + "rewards/margins": 0.35165196657180786, + "rewards/rejected": -0.7705433368682861, "step": 660 }, { "epoch": 0.18, - "grad_norm": 14.125, + "grad_norm": 4.5, "learning_rate": 4.914518682500995e-06, - "logits/chosen": -1.905542016029358, - "logits/rejected": -1.6827415227890015, - "logps/chosen": -420.6158142089844, - "logps/rejected": -452.22967529296875, - "loss": 0.5489, - "rewards/accuracies": 0.6937500238418579, - "rewards/chosen": -1.370868444442749, - "rewards/margins": 0.7235444188117981, - "rewards/rejected": -2.0944130420684814, + "logits/chosen": -2.7013747692108154, + "logits/rejected": -2.40236759185791, + "logps/chosen": -345.67144775390625, + "logps/rejected": -343.01348876953125, + "loss": 0.5753, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -0.4313276410102844, + "rewards/margins": 0.41087159514427185, + "rewards/rejected": -0.8421992063522339, "step": 670 }, { "epoch": 0.18, - "grad_norm": 7.125, + "grad_norm": 4.8125, "learning_rate": 4.9084952541527315e-06, - "logits/chosen": -1.7983808517456055, - "logits/rejected": -1.5911136865615845, - "logps/chosen": -442.5914001464844, - "logps/rejected": -462.5369567871094, - "loss": 0.5316, - "rewards/accuracies": 0.7749999761581421, - "rewards/chosen": -1.693092942237854, - "rewards/margins": 0.6704458594322205, - "rewards/rejected": -2.3635387420654297, + "logits/chosen": -2.57464337348938, + "logits/rejected": -2.316161632537842, + "logps/chosen": -341.40765380859375, + "logps/rejected": -332.87060546875, + "loss": 0.5596, + "rewards/accuracies": 0.7437499761581421, + "rewards/chosen": -0.48329535126686096, + "rewards/margins": 0.43051138520240784, + "rewards/rejected": -0.9138067960739136, "step": 680 }, { "epoch": 0.18, - "grad_norm": 9.125, + "grad_norm": 3.71875, "learning_rate": 4.902270717143858e-06, - "logits/chosen": -1.7777169942855835, - "logits/rejected": -1.675255537033081, - "logps/chosen": -407.14141845703125, - "logps/rejected": -501.22442626953125, - "loss": 0.475, - "rewards/accuracies": 0.762499988079071, - "rewards/chosen": -1.7094749212265015, - "rewards/margins": 0.8240033984184265, - "rewards/rejected": -2.5334784984588623, + "logits/chosen": -2.496086597442627, + "logits/rejected": -2.3636794090270996, + "logps/chosen": -304.6783447265625, + "logps/rejected": -378.12548828125, + "loss": 0.4891, + "rewards/accuracies": 0.793749988079071, + "rewards/chosen": -0.46743354201316833, + "rewards/margins": 0.6628168225288391, + "rewards/rejected": -1.1302502155303955, "step": 690 }, { "epoch": 0.18, - "grad_norm": 5.8125, + "grad_norm": 8.125, "learning_rate": 4.895845591221427e-06, - "logits/chosen": -1.8242580890655518, - "logits/rejected": -1.743706464767456, - "logps/chosen": -386.7220764160156, - "logps/rejected": -448.8822326660156, - "loss": 0.5297, - "rewards/accuracies": 0.78125, - "rewards/chosen": -1.3346667289733887, - "rewards/margins": 0.6666029691696167, - "rewards/rejected": -2.001269578933716, + "logits/chosen": -2.4748313426971436, + "logits/rejected": -2.386183261871338, + "logps/chosen": -326.8917541503906, + "logps/rejected": -360.40423583984375, + "loss": 0.5823, + "rewards/accuracies": 0.706250011920929, + "rewards/chosen": -0.5521180033683777, + "rewards/margins": 0.3961629271507263, + "rewards/rejected": -0.9482809901237488, "step": 700 }, { "epoch": 0.18, - "eval_logits/chosen": -1.7075196504592896, - "eval_logits/rejected": -1.5802654027938843, - "eval_logps/chosen": -382.9457092285156, - "eval_logps/rejected": -426.70123291015625, - "eval_loss": 0.5517451763153076, - "eval_rewards/accuracies": 0.7114999890327454, - "eval_rewards/chosen": -1.1831822395324707, - "eval_rewards/margins": 0.6379258632659912, - "eval_rewards/rejected": -1.8211082220077515, - "eval_runtime": 779.4893, - "eval_samples_per_second": 2.566, - "eval_steps_per_second": 0.321, + "eval_logits/chosen": -2.466245174407959, + "eval_logits/rejected": -2.344569444656372, + "eval_logps/chosen": -342.8824768066406, + "eval_logps/rejected": -365.3114929199219, + "eval_loss": 0.5811374187469482, + "eval_rewards/accuracies": 0.699999988079071, + "eval_rewards/chosen": -0.5774820446968079, + "eval_rewards/margins": 0.44318366050720215, + "eval_rewards/rejected": -1.0206657648086548, + "eval_runtime": 783.0972, + "eval_samples_per_second": 2.554, + "eval_steps_per_second": 0.319, "step": 700 }, { "epoch": 0.19, - "grad_norm": 13.5625, + "grad_norm": 7.875, "learning_rate": 4.8892204128816e-06, - "logits/chosen": -1.8227674961090088, - "logits/rejected": -1.7048763036727905, - "logps/chosen": -377.75494384765625, - "logps/rejected": -425.33258056640625, - "loss": 0.543, - "rewards/accuracies": 0.675000011920929, - "rewards/chosen": -1.1022976636886597, - "rewards/margins": 0.61147141456604, - "rewards/rejected": -1.7137691974639893, + "logits/chosen": -2.4716761112213135, + "logits/rejected": -2.3479976654052734, + "logps/chosen": -335.98077392578125, + "logps/rejected": -359.7213439941406, + "loss": 0.5862, + "rewards/accuracies": 0.65625, + "rewards/chosen": -0.5169531106948853, + "rewards/margins": 0.3903142213821411, + "rewards/rejected": -0.9072673916816711, "step": 710 }, { "epoch": 0.19, - "grad_norm": 7.875, + "grad_norm": 4.1875, "learning_rate": 4.882395735324864e-06, - "logits/chosen": -1.7444076538085938, - "logits/rejected": -1.6055552959442139, - "logps/chosen": -397.4420166015625, - "logps/rejected": -459.69781494140625, - "loss": 0.5118, - "rewards/accuracies": 0.7562500238418579, - "rewards/chosen": -1.3140912055969238, - "rewards/margins": 0.7935563921928406, - "rewards/rejected": -2.10764741897583, + "logits/chosen": -2.4327664375305176, + "logits/rejected": -2.3049471378326416, + "logps/chosen": -334.0990295410156, + "logps/rejected": -373.6146240234375, + "loss": 0.5449, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.5033348798751831, + "rewards/margins": 0.5430740714073181, + "rewards/rejected": -1.046409010887146, "step": 720 }, { "epoch": 0.19, - "grad_norm": 8.0625, + "grad_norm": 4.34375, "learning_rate": 4.87537212840983e-06, - "logits/chosen": -1.7117830514907837, - "logits/rejected": -1.571088194847107, - "logps/chosen": -390.9749755859375, - "logps/rejected": -426.7323303222656, - "loss": 0.5418, - "rewards/accuracies": 0.793749988079071, - "rewards/chosen": -1.2472585439682007, - "rewards/margins": 0.6636706590652466, - "rewards/rejected": -1.9109293222427368, + "logits/chosen": -2.3654582500457764, + "logits/rejected": -2.2108511924743652, + "logps/chosen": -341.95501708984375, + "logps/rejected": -352.4615173339844, + "loss": 0.5897, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -0.555401623249054, + "rewards/margins": 0.4416992664337158, + "rewards/rejected": -0.9971009492874146, "step": 730 }, { "epoch": 0.19, - "grad_norm": 8.0625, + "grad_norm": 4.5625, "learning_rate": 4.8681501786056545e-06, - "logits/chosen": -1.6051737070083618, - "logits/rejected": -1.4560521841049194, - "logps/chosen": -354.8691101074219, - "logps/rejected": -401.2043151855469, - "loss": 0.4986, + "logits/chosen": -2.3230032920837402, + "logits/rejected": -2.170013427734375, + "logps/chosen": -294.665771484375, + "logps/rejected": -311.9244384765625, + "loss": 0.5406, "rewards/accuracies": 0.7562500238418579, - "rewards/chosen": -1.266836404800415, - "rewards/margins": 0.785784125328064, - "rewards/rejected": -2.0526206493377686, + "rewards/chosen": -0.507541835308075, + "rewards/margins": 0.5050578713417053, + "rewards/rejected": -1.0125997066497803, "step": 740 }, { "epoch": 0.2, - "grad_norm": 17.25, + "grad_norm": 6.59375, "learning_rate": 4.860730488943068e-06, - "logits/chosen": -1.5318214893341064, - "logits/rejected": -1.5089409351348877, - "logps/chosen": -417.6119079589844, - "logps/rejected": -491.09002685546875, - "loss": 0.5178, + "logits/chosen": -2.2937541007995605, + "logits/rejected": -2.311389684677124, + "logps/chosen": -292.8400573730469, + "logps/rejected": -333.968505859375, + "loss": 0.5574, "rewards/accuracies": 0.71875, - "rewards/chosen": -1.8313087224960327, - "rewards/margins": 0.7537959218025208, - "rewards/rejected": -2.585104465484619, + "rewards/chosen": -0.39045557379722595, + "rewards/margins": 0.4602751135826111, + "rewards/rejected": -0.8507307171821594, "step": 750 }, { "epoch": 0.2, - "grad_norm": 11.875, + "grad_norm": 5.1875, "learning_rate": 4.853113678964022e-06, - "logits/chosen": -1.5976030826568604, - "logits/rejected": -1.5322299003601074, - "logps/chosen": -454.0541076660156, - "logps/rejected": -510.95916748046875, - "loss": 0.5448, - "rewards/accuracies": 0.768750011920929, - "rewards/chosen": -1.8170955181121826, - "rewards/margins": 0.6482337713241577, - "rewards/rejected": -2.465329170227051, + "logits/chosen": -2.3435096740722656, + "logits/rejected": -2.282520294189453, + "logps/chosen": -334.9354248046875, + "logps/rejected": -366.12799072265625, + "loss": 0.5493, + "rewards/accuracies": 0.7562500238418579, + "rewards/chosen": -0.3748478293418884, + "rewards/margins": 0.4812418520450592, + "rewards/rejected": -0.85608971118927, "step": 760 }, { "epoch": 0.2, - "grad_norm": 5.40625, + "grad_norm": 5.5625, "learning_rate": 4.845300384669958e-06, - "logits/chosen": -1.736106514930725, - "logits/rejected": -1.5969589948654175, - "logps/chosen": -372.9768371582031, - "logps/rejected": -411.02301025390625, - "loss": 0.5501, - "rewards/accuracies": 0.6875, - "rewards/chosen": -1.1984102725982666, - "rewards/margins": 0.6082743406295776, - "rewards/rejected": -1.8066844940185547, + "logits/chosen": -2.43422269821167, + "logits/rejected": -2.2817306518554688, + "logps/chosen": -327.2389221191406, + "logps/rejected": -339.32135009765625, + "loss": 0.5993, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -0.542651891708374, + "rewards/margins": 0.3658657670021057, + "rewards/rejected": -0.9085175395011902, "step": 770 }, { "epoch": 0.2, - "grad_norm": 9.0, + "grad_norm": 4.125, "learning_rate": 4.837291258468701e-06, - "logits/chosen": -1.8057279586791992, - "logits/rejected": -1.6633065938949585, - "logps/chosen": -383.2629089355469, - "logps/rejected": -416.9931640625, - "loss": 0.5643, - "rewards/accuracies": 0.7124999761581421, - "rewards/chosen": -0.9799069166183472, - "rewards/margins": 0.5860243439674377, - "rewards/rejected": -1.5659310817718506, + "logits/chosen": -2.4618449211120605, + "logits/rejected": -2.3151028156280518, + "logps/chosen": -359.65631103515625, + "logps/rejected": -383.3794860839844, + "loss": 0.5576, + "rewards/accuracies": 0.6812499761581421, + "rewards/chosen": -0.5611785650253296, + "rewards/margins": 0.4969305098056793, + "rewards/rejected": -1.0581090450286865, "step": 780 }, { "epoch": 0.21, - "grad_norm": 7.15625, + "grad_norm": 8.3125, "learning_rate": 4.829086969119984e-06, - "logits/chosen": -1.6649093627929688, - "logits/rejected": -1.6469013690948486, - "logps/chosen": -377.21533203125, - "logps/rejected": -429.5812072753906, - "loss": 0.5937, - "rewards/accuracies": 0.668749988079071, - "rewards/chosen": -1.2654292583465576, - "rewards/margins": 0.5063241124153137, - "rewards/rejected": -1.7717533111572266, + "logits/chosen": -2.364696502685547, + "logits/rejected": -2.3082656860351562, + "logps/chosen": -346.21722412109375, + "logps/rejected": -385.6118469238281, + "loss": 0.5844, + "rewards/accuracies": 0.612500011920929, + "rewards/chosen": -0.6969791650772095, + "rewards/margins": 0.4597656726837158, + "rewards/rejected": -1.1567448377609253, "step": 790 }, { "epoch": 0.21, - "grad_norm": 8.3125, + "grad_norm": 8.75, "learning_rate": 4.820688201679605e-06, - "logits/chosen": -1.8579657077789307, - "logits/rejected": -1.5570186376571655, - "logps/chosen": -410.35980224609375, - "logps/rejected": -413.7637634277344, - "loss": 0.5161, + "logits/chosen": -2.4862303733825684, + "logits/rejected": -2.1764872074127197, + "logps/chosen": -336.849365234375, + "logps/rejected": -319.41827392578125, + "loss": 0.5502, "rewards/accuracies": 0.71875, - "rewards/chosen": -1.4777495861053467, - "rewards/margins": 0.6578246355056763, - "rewards/rejected": -2.1355741024017334, + "rewards/chosen": -0.5689047574996948, + "rewards/margins": 0.4903965890407562, + "rewards/rejected": -1.0593013763427734, "step": 800 }, { "epoch": 0.21, - "eval_logits/chosen": -1.6554840803146362, - "eval_logits/rejected": -1.5289907455444336, - "eval_logps/chosen": -425.41973876953125, - "eval_logps/rejected": -469.8731994628906, - "eval_loss": 0.5413252115249634, - "eval_rewards/accuracies": 0.7135000228881836, - "eval_rewards/chosen": -1.6079227924346924, - "eval_rewards/margins": 0.6449044942855835, - "eval_rewards/rejected": -2.2528274059295654, - "eval_runtime": 784.96, - "eval_samples_per_second": 2.548, - "eval_steps_per_second": 0.318, + "eval_logits/chosen": -2.439472198486328, + "eval_logits/rejected": -2.317272901535034, + "eval_logps/chosen": -342.2333984375, + "eval_logps/rejected": -366.5324401855469, + "eval_loss": 0.5688381195068359, + "eval_rewards/accuracies": 0.7039999961853027, + "eval_rewards/chosen": -0.5709910988807678, + "eval_rewards/margins": 0.4618839621543884, + "eval_rewards/rejected": -1.0328751802444458, + "eval_runtime": 783.1906, + "eval_samples_per_second": 2.554, + "eval_steps_per_second": 0.319, "step": 800 }, { "epoch": 0.21, - "grad_norm": 9.8125, + "grad_norm": 5.5625, "learning_rate": 4.8120956574422315e-06, - "logits/chosen": -1.7708232402801514, - "logits/rejected": -1.7380520105361938, - "logps/chosen": -440.8616638183594, - "logps/rejected": -485.88177490234375, - "loss": 0.6029, + "logits/chosen": -2.46923828125, + "logits/rejected": -2.4094414710998535, + "logps/chosen": -350.98675537109375, + "logps/rejected": -378.17626953125, + "loss": 0.6132, "rewards/accuracies": 0.65625, - "rewards/chosen": -1.669494867324829, - "rewards/margins": 0.5214712023735046, - "rewards/rejected": -2.1909661293029785, + "rewards/chosen": -0.5874276161193848, + "rewards/margins": 0.35395190119743347, + "rewards/rejected": -0.9413794279098511, "step": 810 }, { "epoch": 0.21, - "grad_norm": 11.4375, + "grad_norm": 5.09375, "learning_rate": 4.803310053882831e-06, - "logits/chosen": -1.757965087890625, - "logits/rejected": -1.766776442527771, - "logps/chosen": -361.08880615234375, - "logps/rejected": -428.43280029296875, - "loss": 0.5508, + "logits/chosen": -2.4236135482788086, + "logits/rejected": -2.460178852081299, + "logps/chosen": -293.0755920410156, + "logps/rejected": -345.6906433105469, + "loss": 0.5662, "rewards/accuracies": 0.71875, - "rewards/chosen": -1.317987322807312, - "rewards/margins": 0.5500593185424805, - "rewards/rejected": -1.868046760559082, + "rewards/chosen": -0.4208901822566986, + "rewards/margins": 0.42452698945999146, + "rewards/rejected": -0.8454172015190125, "step": 820 }, { "epoch": 0.22, - "grad_norm": 9.0, + "grad_norm": 5.25, "learning_rate": 4.794332124596775e-06, - "logits/chosen": -1.7953479290008545, - "logits/rejected": -1.6658519506454468, - "logps/chosen": -391.43597412109375, - "logps/rejected": -435.4403381347656, - "loss": 0.5953, - "rewards/accuracies": 0.6812499761581421, - "rewards/chosen": -1.2301970720291138, - "rewards/margins": 0.4769642949104309, - "rewards/rejected": -1.7071611881256104, + "logits/chosen": -2.4821386337280273, + "logits/rejected": -2.3172764778137207, + "logps/chosen": -328.24755859375, + "logps/rejected": -359.9569091796875, + "loss": 0.5966, + "rewards/accuracies": 0.6625000238418579, + "rewards/chosen": -0.400545209646225, + "rewards/margins": 0.38425248861312866, + "rewards/rejected": -0.7847977876663208, "step": 830 }, { "epoch": 0.22, - "grad_norm": 7.21875, + "grad_norm": 6.5625, "learning_rate": 4.785162619238575e-06, - "logits/chosen": -1.770181655883789, - "logits/rejected": -1.5968060493469238, - "logps/chosen": -369.03302001953125, - "logps/rejected": -407.775634765625, - "loss": 0.5252, - "rewards/accuracies": 0.675000011920929, - "rewards/chosen": -1.151700735092163, - "rewards/margins": 0.6617273092269897, - "rewards/rejected": -1.8134281635284424, + "logits/chosen": -2.45249080657959, + "logits/rejected": -2.2496962547302246, + "logps/chosen": -310.9997253417969, + "logps/rejected": -332.2084045410156, + "loss": 0.5478, + "rewards/accuracies": 0.71875, + "rewards/chosen": -0.3862709403038025, + "rewards/margins": 0.48837798833847046, + "rewards/rejected": -0.8746488690376282, "step": 840 }, { "epoch": 0.22, - "grad_norm": 5.78125, + "grad_norm": 4.84375, "learning_rate": 4.775802303459288e-06, - "logits/chosen": -1.6927623748779297, - "logits/rejected": -1.6124932765960693, - "logps/chosen": -361.70172119140625, - "logps/rejected": -417.58892822265625, - "loss": 0.5563, + "logits/chosen": -2.3312971591949463, + "logits/rejected": -2.241899013519287, + "logps/chosen": -320.90618896484375, + "logps/rejected": -363.725830078125, + "loss": 0.5602, "rewards/accuracies": 0.7124999761581421, - "rewards/chosen": -1.1398875713348389, - "rewards/margins": 0.605847954750061, - "rewards/rejected": -1.7457354068756104, + "rewards/chosen": -0.5471154451370239, + "rewards/margins": 0.4693846106529236, + "rewards/rejected": -1.0165001153945923, "step": 850 }, { "epoch": 0.23, - "grad_norm": 10.8125, + "grad_norm": 7.90625, "learning_rate": 4.766251958842589e-06, - "logits/chosen": -1.6586036682128906, - "logits/rejected": -1.5239696502685547, - "logps/chosen": -397.3894958496094, - "logps/rejected": -443.81982421875, - "loss": 0.5419, - "rewards/accuracies": 0.7437499761581421, - "rewards/chosen": -1.2489416599273682, - "rewards/margins": 0.5692877769470215, - "rewards/rejected": -1.8182293176651, + "logits/chosen": -2.3646304607391357, + "logits/rejected": -2.184277057647705, + "logps/chosen": -373.2308654785156, + "logps/rejected": -394.0884704589844, + "loss": 0.5985, + "rewards/accuracies": 0.637499988079071, + "rewards/chosen": -0.7825444340705872, + "rewards/margins": 0.35992631316185, + "rewards/rejected": -1.1424707174301147, "step": 860 }, { "epoch": 0.23, - "grad_norm": 7.0, + "grad_norm": 4.03125, "learning_rate": 4.7565123828395066e-06, - "logits/chosen": -1.5194356441497803, - "logits/rejected": -1.4448068141937256, - "logps/chosen": -411.2981872558594, - "logps/rejected": -477.72027587890625, - "loss": 0.5386, + "logits/chosen": -2.292964458465576, + "logits/rejected": -2.258443593978882, + "logps/chosen": -369.5010070800781, + "logps/rejected": -415.40069580078125, + "loss": 0.5605, "rewards/accuracies": 0.731249988079071, - "rewards/chosen": -1.5540659427642822, - "rewards/margins": 0.6700575351715088, - "rewards/rejected": -2.224123477935791, + "rewards/chosen": -0.9762517809867859, + "rewards/margins": 0.4485411047935486, + "rewards/rejected": -1.424793004989624, "step": 870 }, { "epoch": 0.23, - "grad_norm": 6.46875, + "grad_norm": 3.828125, "learning_rate": 4.746584388701831e-06, - "logits/chosen": -1.5930888652801514, - "logits/rejected": -1.5229886770248413, - "logps/chosen": -402.97210693359375, - "logps/rejected": -464.884521484375, - "loss": 0.5162, - "rewards/accuracies": 0.7875000238418579, - "rewards/chosen": -1.397782325744629, - "rewards/margins": 0.7612795233726501, - "rewards/rejected": -2.159061908721924, + "logits/chosen": -2.381927967071533, + "logits/rejected": -2.2788944244384766, + "logps/chosen": -373.2781677246094, + "logps/rejected": -398.8980407714844, + "loss": 0.5913, + "rewards/accuracies": 0.7124999761581421, + "rewards/chosen": -0.9109410047531128, + "rewards/margins": 0.4138285219669342, + "rewards/rejected": -1.3247694969177246, "step": 880 }, { "epoch": 0.23, - "grad_norm": 8.25, + "grad_norm": 6.34375, "learning_rate": 4.736468805414218e-06, - "logits/chosen": -1.5649399757385254, - "logits/rejected": -1.5401012897491455, - "logps/chosen": -349.52880859375, - "logps/rejected": -429.34478759765625, - "loss": 0.5457, - "rewards/accuracies": 0.6937500238418579, - "rewards/chosen": -1.0213879346847534, - "rewards/margins": 0.6598361730575562, - "rewards/rejected": -1.6812242269515991, + "logits/chosen": -2.2713119983673096, + "logits/rejected": -2.2479841709136963, + "logps/chosen": -331.4849548339844, + "logps/rejected": -392.24859619140625, + "loss": 0.5605, + "rewards/accuracies": 0.6875, + "rewards/chosen": -0.6100383996963501, + "rewards/margins": 0.5197950601577759, + "rewards/rejected": -1.129833459854126, "step": 890 }, { "epoch": 0.24, - "grad_norm": 14.5625, + "grad_norm": 4.96875, "learning_rate": 4.7261664776249595e-06, - "logits/chosen": -1.440166711807251, - "logits/rejected": -1.354661226272583, - "logps/chosen": -325.14202880859375, - "logps/rejected": -391.0576171875, - "loss": 0.5089, - "rewards/accuracies": 0.71875, - "rewards/chosen": -0.9625970721244812, - "rewards/margins": 0.7495478391647339, - "rewards/rejected": -1.7121448516845703, + "logits/chosen": -2.1991798877716064, + "logits/rejected": -2.126394510269165, + "logps/chosen": -300.3690490722656, + "logps/rejected": -337.35076904296875, + "loss": 0.551, + "rewards/accuracies": 0.737500011920929, + "rewards/chosen": -0.5168617963790894, + "rewards/margins": 0.48721250891685486, + "rewards/rejected": -1.0040743350982666, "step": 900 }, { "epoch": 0.24, - "eval_logits/chosen": -1.5150411128997803, - "eval_logits/rejected": -1.3977367877960205, - "eval_logps/chosen": -376.68115234375, - "eval_logps/rejected": -420.21600341796875, - "eval_loss": 0.5513297319412231, - "eval_rewards/accuracies": 0.7120000123977661, - "eval_rewards/chosen": -1.1205369234085083, - "eval_rewards/margins": 0.6357187032699585, - "eval_rewards/rejected": -1.7562556266784668, - "eval_runtime": 784.7402, - "eval_samples_per_second": 2.549, + "eval_logits/chosen": -2.3767266273498535, + "eval_logits/rejected": -2.257284641265869, + "eval_logps/chosen": -340.9869689941406, + "eval_logps/rejected": -364.7085266113281, + "eval_loss": 0.5723198056221008, + "eval_rewards/accuracies": 0.7099999785423279, + "eval_rewards/chosen": -0.5585272312164307, + "eval_rewards/margins": 0.45610883831977844, + "eval_rewards/rejected": -1.0146360397338867, + "eval_runtime": 783.0599, + "eval_samples_per_second": 2.554, "eval_steps_per_second": 0.319, "step": 900 }, { "epoch": 0.24, - "grad_norm": 7.0, + "grad_norm": 4.9375, "learning_rate": 4.715678265575463e-06, - "logits/chosen": -1.6201040744781494, - "logits/rejected": -1.4238890409469604, - "logps/chosen": -389.1523132324219, - "logps/rejected": -387.42364501953125, - "loss": 0.5769, + "logits/chosen": -2.355442523956299, + "logits/rejected": -2.1583566665649414, + "logps/chosen": -355.2525939941406, + "logps/rejected": -336.24884033203125, + "loss": 0.5939, "rewards/accuracies": 0.7124999761581421, - "rewards/chosen": -1.0492109060287476, - "rewards/margins": 0.5384214520454407, - "rewards/rejected": -1.587632417678833, + "rewards/chosen": -0.5178682208061218, + "rewards/margins": 0.3880244195461273, + "rewards/rejected": -0.9058926701545715, "step": 910 }, { "epoch": 0.24, - "grad_norm": 7.9375, + "grad_norm": 5.53125, "learning_rate": 4.705005045028415e-06, - "logits/chosen": -1.5486856698989868, - "logits/rejected": -1.4411218166351318, - "logps/chosen": -355.61639404296875, - "logps/rejected": -393.3835144042969, - "loss": 0.5762, - "rewards/accuracies": 0.71875, - "rewards/chosen": -0.9255332946777344, - "rewards/margins": 0.5513514876365662, - "rewards/rejected": -1.4768848419189453, + "logits/chosen": -2.257471799850464, + "logits/rejected": -2.175213098526001, + "logps/chosen": -341.3362731933594, + "logps/rejected": -361.2740173339844, + "loss": 0.5922, + "rewards/accuracies": 0.7124999761581421, + "rewards/chosen": -0.5742098093032837, + "rewards/margins": 0.4120204448699951, + "rewards/rejected": -0.9862302541732788, "step": 920 }, { "epoch": 0.24, - "grad_norm": 8.5625, + "grad_norm": 6.125, "learning_rate": 4.694147707194659e-06, - "logits/chosen": -1.5970402956008911, - "logits/rejected": -1.5369764566421509, - "logps/chosen": -379.3194274902344, - "logps/rejected": -420.14813232421875, - "loss": 0.539, - "rewards/accuracies": 0.7437499761581421, - "rewards/chosen": -1.0752673149108887, - "rewards/margins": 0.606385350227356, - "rewards/rejected": -1.6816527843475342, + "logits/chosen": -2.330077648162842, + "logits/rejected": -2.316610336303711, + "logps/chosen": -347.4102478027344, + "logps/rejected": -376.43292236328125, + "loss": 0.5613, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -0.5703408122062683, + "rewards/margins": 0.5020008683204651, + "rewards/rejected": -1.0723416805267334, "step": 930 }, { "epoch": 0.25, - "grad_norm": 8.125, + "grad_norm": 5.46875, "learning_rate": 4.683107158658782e-06, - "logits/chosen": -1.4575464725494385, - "logits/rejected": -1.3923773765563965, - "logps/chosen": -426.5927734375, - "logps/rejected": -477.93695068359375, - "loss": 0.4933, - "rewards/accuracies": 0.7437499761581421, - "rewards/chosen": -1.36399245262146, - "rewards/margins": 0.8010183572769165, - "rewards/rejected": -2.165010690689087, + "logits/chosen": -2.2630085945129395, + "logits/rejected": -2.2645463943481445, + "logps/chosen": -359.7054138183594, + "logps/rejected": -383.32269287109375, + "loss": 0.5196, + "rewards/accuracies": 0.78125, + "rewards/chosen": -0.48080378770828247, + "rewards/margins": 0.5593954920768738, + "rewards/rejected": -1.0401992797851562, "step": 940 }, { "epoch": 0.25, - "grad_norm": 9.125, + "grad_norm": 7.625, "learning_rate": 4.671884321303407e-06, - "logits/chosen": -1.524253487586975, - "logits/rejected": -1.3763213157653809, - "logps/chosen": -392.6251525878906, - "logps/rejected": -457.06396484375, - "loss": 0.5223, + "logits/chosen": -2.3846192359924316, + "logits/rejected": -2.1435961723327637, + "logps/chosen": -315.91790771484375, + "logps/rejected": -355.96343994140625, + "loss": 0.5362, "rewards/accuracies": 0.6875, - "rewards/chosen": -1.4630814790725708, - "rewards/margins": 0.7883446216583252, - "rewards/rejected": -2.2514259815216064, + "rewards/chosen": -0.5064867734909058, + "rewards/margins": 0.5416258573532104, + "rewards/rejected": -1.0481126308441162, "step": 950 }, { "epoch": 0.25, - "grad_norm": 9.0, + "grad_norm": 7.4375, "learning_rate": 4.660480132232224e-06, - "logits/chosen": -1.6294152736663818, - "logits/rejected": -1.5263299942016602, - "logps/chosen": -399.7666320800781, - "logps/rejected": -435.2568359375, - "loss": 0.5705, - "rewards/accuracies": 0.737500011920929, - "rewards/chosen": -1.2803003787994385, - "rewards/margins": 0.5959587693214417, - "rewards/rejected": -1.876259207725525, + "logits/chosen": -2.409409999847412, + "logits/rejected": -2.2849433422088623, + "logps/chosen": -346.1353759765625, + "logps/rejected": -366.18707275390625, + "loss": 0.5855, + "rewards/accuracies": 0.6937500238418579, + "rewards/chosen": -0.5678282976150513, + "rewards/margins": 0.43991154432296753, + "rewards/rejected": -1.007739782333374, "step": 960 }, { "epoch": 0.25, - "grad_norm": 12.25, + "grad_norm": 4.5, "learning_rate": 4.6488955436917414e-06, - "logits/chosen": -1.6960029602050781, - "logits/rejected": -1.4932464361190796, - "logps/chosen": -402.33111572265625, - "logps/rejected": -436.6167907714844, - "loss": 0.5216, - "rewards/accuracies": 0.699999988079071, - "rewards/chosen": -1.223601222038269, - "rewards/margins": 0.8088523149490356, - "rewards/rejected": -2.0324535369873047, + "logits/chosen": -2.370598554611206, + "logits/rejected": -2.1790661811828613, + "logps/chosen": -352.23797607421875, + "logps/rejected": -368.2437744140625, + "loss": 0.5155, + "rewards/accuracies": 0.7124999761581421, + "rewards/chosen": -0.5098623633384705, + "rewards/margins": 0.6602128744125366, + "rewards/rejected": -1.1700751781463623, "step": 970 }, { "epoch": 0.26, - "grad_norm": 8.6875, + "grad_norm": 5.125, "learning_rate": 4.6371315229917644e-06, - "logits/chosen": -1.64836847782135, - "logits/rejected": -1.5174161195755005, - "logps/chosen": -429.2811584472656, - "logps/rejected": -487.087158203125, - "loss": 0.5142, - "rewards/accuracies": 0.7250000238418579, - "rewards/chosen": -1.459040880203247, - "rewards/margins": 0.7822157144546509, - "rewards/rejected": -2.2412567138671875, + "logits/chosen": -2.3567447662353516, + "logits/rejected": -2.2032203674316406, + "logps/chosen": -362.06390380859375, + "logps/rejected": -394.82550048828125, + "loss": 0.522, + "rewards/accuracies": 0.78125, + "rewards/chosen": -0.5349576473236084, + "rewards/margins": 0.604442298412323, + "rewards/rejected": -1.1393998861312866, "step": 980 }, { "epoch": 0.26, - "grad_norm": 11.3125, + "grad_norm": 18.375, "learning_rate": 4.625189052424638e-06, - "logits/chosen": -1.5577348470687866, - "logits/rejected": -1.4374052286148071, - "logps/chosen": -413.53900146484375, - "logps/rejected": -485.85302734375, - "loss": 0.4678, - "rewards/accuracies": 0.7562500238418579, - "rewards/chosen": -1.7447271347045898, - "rewards/margins": 0.9506396055221558, - "rewards/rejected": -2.695366859436035, + "logits/chosen": -2.288172721862793, + "logits/rejected": -2.201801061630249, + "logps/chosen": -327.40899658203125, + "logps/rejected": -366.302001953125, + "loss": 0.514, + "rewards/accuracies": 0.731249988079071, + "rewards/chosen": -0.700675368309021, + "rewards/margins": 0.6302372813224792, + "rewards/rejected": -1.3309125900268555, "step": 990 }, { "epoch": 0.26, - "grad_norm": 8.375, + "grad_norm": 6.46875, "learning_rate": 4.613069129183218e-06, - "logits/chosen": -1.6428356170654297, - "logits/rejected": -1.5078332424163818, - "logps/chosen": -469.07391357421875, - "logps/rejected": -501.66412353515625, - "loss": 0.5577, - "rewards/accuracies": 0.7250000238418579, - "rewards/chosen": -1.6570255756378174, - "rewards/margins": 0.687675952911377, - "rewards/rejected": -2.3447012901306152, + "logits/chosen": -2.399292469024658, + "logits/rejected": -2.2850279808044434, + "logps/chosen": -393.8326721191406, + "logps/rejected": -400.1686096191406, + "loss": 0.5684, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -0.6994020342826843, + "rewards/margins": 0.47142618894577026, + "rewards/rejected": -1.170828104019165, "step": 1000 }, { "epoch": 0.26, - "eval_logits/chosen": -1.4712328910827637, - "eval_logits/rejected": -1.3511788845062256, - "eval_logps/chosen": -408.3596496582031, - "eval_logps/rejected": -461.6903076171875, - "eval_loss": 0.5358989238739014, - "eval_rewards/accuracies": 0.7200000286102295, - "eval_rewards/chosen": -1.4373221397399902, - "eval_rewards/margins": 0.7336767315864563, - "eval_rewards/rejected": -2.170999050140381, - "eval_runtime": 785.3121, - "eval_samples_per_second": 2.547, - "eval_steps_per_second": 0.318, + "eval_logits/chosen": -2.346391201019287, + "eval_logits/rejected": -2.2283005714416504, + "eval_logps/chosen": -360.5555114746094, + "eval_logps/rejected": -394.3551330566406, + "eval_loss": 0.5602221488952637, + "eval_rewards/accuracies": 0.7070000171661377, + "eval_rewards/chosen": -0.7542121410369873, + "eval_rewards/margins": 0.5568897724151611, + "eval_rewards/rejected": -1.311102032661438, + "eval_runtime": 783.2741, + "eval_samples_per_second": 2.553, + "eval_steps_per_second": 0.319, "step": 1000 }, { "epoch": 0.26, - "grad_norm": 6.71875, + "grad_norm": 9.375, "learning_rate": 4.600772765277607e-06, - "logits/chosen": -1.4756183624267578, - "logits/rejected": -1.4088212251663208, - "logps/chosen": -365.6411437988281, - "logps/rejected": -432.31231689453125, - "loss": 0.5213, - "rewards/accuracies": 0.762499988079071, - "rewards/chosen": -1.3210700750350952, - "rewards/margins": 0.7175318002700806, - "rewards/rejected": -2.038601875305176, + "logits/chosen": -2.223280429840088, + "logits/rejected": -2.1989901065826416, + "logps/chosen": -330.130859375, + "logps/rejected": -382.6180725097656, + "loss": 0.5425, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -0.7673664093017578, + "rewards/margins": 0.5920525789260864, + "rewards/rejected": -1.3594189882278442, "step": 1010 }, { "epoch": 0.27, - "grad_norm": 14.6875, + "grad_norm": 10.5625, "learning_rate": 4.588300987450652e-06, - "logits/chosen": -1.5921796560287476, - "logits/rejected": -1.488405466079712, - "logps/chosen": -364.0208435058594, - "logps/rejected": -393.43609619140625, - "loss": 0.5467, - "rewards/accuracies": 0.731249988079071, - "rewards/chosen": -1.1031922101974487, - "rewards/margins": 0.6297253370285034, - "rewards/rejected": -1.7329175472259521, + "logits/chosen": -2.2941932678222656, + "logits/rejected": -2.2032299041748047, + "logps/chosen": -362.07977294921875, + "logps/rejected": -387.52996826171875, + "loss": 0.5543, + "rewards/accuracies": 0.762499988079071, + "rewards/chosen": -0.8918827176094055, + "rewards/margins": 0.598099410533905, + "rewards/rejected": -1.4899821281433105, "step": 1020 }, { "epoch": 0.27, - "grad_norm": 5.65625, + "grad_norm": 4.4375, "learning_rate": 4.5756548370922136e-06, - "logits/chosen": -1.557192325592041, - "logits/rejected": -1.4739595651626587, - "logps/chosen": -337.0077819824219, - "logps/rejected": -397.47552490234375, - "loss": 0.5093, - "rewards/accuracies": 0.71875, - "rewards/chosen": -0.9634488224983215, - "rewards/margins": 0.6944564580917358, - "rewards/rejected": -1.6579049825668335, + "logits/chosen": -2.1861064434051514, + "logits/rejected": -2.1109728813171387, + "logps/chosen": -340.6225891113281, + "logps/rejected": -387.4969177246094, + "loss": 0.5734, + "rewards/accuracies": 0.706250011920929, + "rewards/chosen": -0.822590172290802, + "rewards/margins": 0.5768292546272278, + "rewards/rejected": -1.3994193077087402, "step": 1030 }, { "epoch": 0.27, - "grad_norm": 11.5, + "grad_norm": 8.25, "learning_rate": 4.562835370152206e-06, - "logits/chosen": -1.604058027267456, - "logits/rejected": -1.3751734495162964, - "logps/chosen": -426.73870849609375, - "logps/rejected": -494.95623779296875, - "loss": 0.4766, - "rewards/accuracies": 0.75, - "rewards/chosen": -1.2556837797164917, - "rewards/margins": 0.9930402040481567, - "rewards/rejected": -2.2487237453460693, + "logits/chosen": -2.3119113445281982, + "logits/rejected": -2.002518653869629, + "logps/chosen": -402.314697265625, + "logps/rejected": -457.06463623046875, + "loss": 0.4708, + "rewards/accuracies": 0.7437499761581421, + "rewards/chosen": -0.7894701957702637, + "rewards/margins": 0.8879375457763672, + "rewards/rejected": -1.6774076223373413, "step": 1040 }, { "epoch": 0.27, - "grad_norm": 4.8125, + "grad_norm": 5.1875, "learning_rate": 4.54984365705243e-06, - "logits/chosen": -1.5349643230438232, - "logits/rejected": -1.4311106204986572, - "logps/chosen": -406.82110595703125, - "logps/rejected": -502.3373107910156, - "loss": 0.4771, - "rewards/accuracies": 0.71875, - "rewards/chosen": -1.383937120437622, - "rewards/margins": 1.0204886198043823, - "rewards/rejected": -2.404425859451294, + "logits/chosen": -2.284492015838623, + "logits/rejected": -2.1806671619415283, + "logps/chosen": -371.3755798339844, + "logps/rejected": -453.24591064453125, + "loss": 0.4773, + "rewards/accuracies": 0.762499988079071, + "rewards/chosen": -0.8460718393325806, + "rewards/margins": 0.8838627934455872, + "rewards/rejected": -1.7299346923828125, "step": 1050 }, { "epoch": 0.28, - "grad_norm": 9.0, + "grad_norm": 9.125, "learning_rate": 4.536680782597191e-06, - "logits/chosen": -1.4277522563934326, - "logits/rejected": -1.3529363870620728, - "logps/chosen": -381.1377868652344, - "logps/rejected": -452.6669921875, - "loss": 0.5952, - "rewards/accuracies": 0.6937500238418579, - "rewards/chosen": -1.4513322114944458, - "rewards/margins": 0.771652102470398, - "rewards/rejected": -2.2229843139648438, + "logits/chosen": -2.1628429889678955, + "logits/rejected": -2.0977540016174316, + "logps/chosen": -370.38250732421875, + "logps/rejected": -426.35479736328125, + "loss": 0.5848, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -1.142185926437378, + "rewards/margins": 0.637835681438446, + "rewards/rejected": -1.7800216674804688, "step": 1060 }, { "epoch": 0.28, - "grad_norm": 10.625, + "grad_norm": 18.375, "learning_rate": 4.523347845882718e-06, - "logits/chosen": -1.5477885007858276, - "logits/rejected": -1.3644009828567505, - "logps/chosen": -425.4134826660156, - "logps/rejected": -483.24237060546875, - "loss": 0.4547, - "rewards/accuracies": 0.7749999761581421, - "rewards/chosen": -1.3729465007781982, - "rewards/margins": 1.0710420608520508, - "rewards/rejected": -2.443988800048828, + "logits/chosen": -2.2164788246154785, + "logits/rejected": -2.0444154739379883, + "logps/chosen": -416.1336364746094, + "logps/rejected": -453.9571228027344, + "loss": 0.4591, + "rewards/accuracies": 0.7875000238418579, + "rewards/chosen": -1.0554344654083252, + "rewards/margins": 0.9309595823287964, + "rewards/rejected": -1.9863941669464111, "step": 1070 }, { "epoch": 0.28, - "grad_norm": 5.96875, + "grad_norm": 5.90625, "learning_rate": 4.50984596020539e-06, - "logits/chosen": -1.3994543552398682, - "logits/rejected": -1.3206733465194702, - "logps/chosen": -416.38818359375, - "logps/rejected": -454.9171447753906, - "loss": 0.575, - "rewards/accuracies": 0.699999988079071, - "rewards/chosen": -1.3624916076660156, - "rewards/margins": 0.7082706689834595, - "rewards/rejected": -2.0707619190216064, + "logits/chosen": -2.07619047164917, + "logits/rejected": -1.9385461807250977, + "logps/chosen": -401.8956298828125, + "logps/rejected": -442.7154235839844, + "loss": 0.5498, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -1.0267369747161865, + "rewards/margins": 0.7495890855789185, + "rewards/rejected": -1.7763264179229736, "step": 1080 }, { "epoch": 0.29, - "grad_norm": 7.03125, + "grad_norm": 5.28125, "learning_rate": 4.4961762529687745e-06, - "logits/chosen": -1.596328854560852, - "logits/rejected": -1.4719685316085815, - "logps/chosen": -366.9061279296875, - "logps/rejected": -433.460205078125, - "loss": 0.4976, - "rewards/accuracies": 0.762499988079071, - "rewards/chosen": -1.0652989149093628, - "rewards/margins": 0.8305339813232422, - "rewards/rejected": -1.8958330154418945, + "logits/chosen": -2.176191806793213, + "logits/rejected": -2.0374581813812256, + "logps/chosen": -372.7863464355469, + "logps/rejected": -427.24658203125, + "loss": 0.5695, + "rewards/accuracies": 0.706250011920929, + "rewards/chosen": -0.9093082547187805, + "rewards/margins": 0.7514539957046509, + "rewards/rejected": -1.6607621908187866, "step": 1090 }, { "epoch": 0.29, - "grad_norm": 5.34375, + "grad_norm": 6.09375, "learning_rate": 4.482339865589492e-06, - "logits/chosen": -1.579730749130249, - "logits/rejected": -1.4267351627349854, - "logps/chosen": -390.0257263183594, - "logps/rejected": -401.1070556640625, - "loss": 0.5701, - "rewards/accuracies": 0.706250011920929, - "rewards/chosen": -1.2379372119903564, - "rewards/margins": 0.5505021214485168, - "rewards/rejected": -1.788439393043518, + "logits/chosen": -2.1343674659729004, + "logits/rejected": -1.996208906173706, + "logps/chosen": -379.14581298828125, + "logps/rejected": -391.421142578125, + "loss": 0.5722, + "rewards/accuracies": 0.731249988079071, + "rewards/chosen": -0.9395943880081177, + "rewards/margins": 0.568466305732727, + "rewards/rejected": -1.5080606937408447, "step": 1100 }, { "epoch": 0.29, - "eval_logits/chosen": -1.4727928638458252, - "eval_logits/rejected": -1.351434350013733, - "eval_logps/chosen": -364.6695861816406, - "eval_logps/rejected": -410.377685546875, - "eval_loss": 0.527571439743042, - "eval_rewards/accuracies": 0.7315000295639038, - "eval_rewards/chosen": -1.000421404838562, - "eval_rewards/margins": 0.6574515104293823, - "eval_rewards/rejected": -1.6578730344772339, - "eval_runtime": 784.7433, - "eval_samples_per_second": 2.549, + "eval_logits/chosen": -2.1819868087768555, + "eval_logits/rejected": -2.0676610469818115, + "eval_logps/chosen": -364.4903564453125, + "eval_logps/rejected": -408.9803466796875, + "eval_loss": 0.5428664684295654, + "eval_rewards/accuracies": 0.7239999771118164, + "eval_rewards/chosen": -0.793560802936554, + "eval_rewards/margins": 0.6637934446334839, + "eval_rewards/rejected": -1.4573543071746826, + "eval_runtime": 782.9279, + "eval_samples_per_second": 2.555, "eval_steps_per_second": 0.319, "step": 1100 }, { "epoch": 0.29, - "grad_norm": 5.90625, + "grad_norm": 5.25, "learning_rate": 4.468337953401909e-06, - "logits/chosen": -1.5745726823806763, - "logits/rejected": -1.5150861740112305, - "logps/chosen": -369.83258056640625, - "logps/rejected": -414.9020080566406, - "loss": 0.5783, + "logits/chosen": -2.15395450592041, + "logits/rejected": -2.0820419788360596, + "logps/chosen": -375.0626525878906, + "logps/rejected": -429.63238525390625, + "loss": 0.547, "rewards/accuracies": 0.7124999761581421, - "rewards/chosen": -0.9951512217521667, - "rewards/margins": 0.4992947578430176, - "rewards/rejected": -1.494446039199829, + "rewards/chosen": -0.8576391339302063, + "rewards/margins": 0.6185614466667175, + "rewards/rejected": -1.4762006998062134, "step": 1110 }, { "epoch": 0.29, - "grad_norm": 6.8125, + "grad_norm": 6.53125, "learning_rate": 4.45417168556166e-06, - "logits/chosen": -1.4539698362350464, - "logits/rejected": -1.3534314632415771, - "logps/chosen": -338.3500061035156, - "logps/rejected": -408.1448059082031, - "loss": 0.5083, - "rewards/accuracies": 0.731249988079071, - "rewards/chosen": -0.9725552797317505, - "rewards/margins": 0.7043280005455017, - "rewards/rejected": -1.6768831014633179, + "logits/chosen": -2.0685439109802246, + "logits/rejected": -1.959359884262085, + "logps/chosen": -349.4896545410156, + "logps/rejected": -422.0367736816406, + "loss": 0.5171, + "rewards/accuracies": 0.762499988079071, + "rewards/chosen": -0.8915795087814331, + "rewards/margins": 0.7198113799095154, + "rewards/rejected": -1.6113910675048828, "step": 1120 }, { "epoch": 0.3, - "grad_norm": 9.875, + "grad_norm": 11.125, "learning_rate": 4.439842244948036e-06, - "logits/chosen": -1.3767496347427368, - "logits/rejected": -1.2502760887145996, - "logps/chosen": -405.4912109375, - "logps/rejected": -463.8192443847656, - "loss": 0.5618, - "rewards/accuracies": 0.6875, - "rewards/chosen": -1.4443975687026978, - "rewards/margins": 0.6391817331314087, - "rewards/rejected": -2.0835793018341064, + "logits/chosen": -2.0176918506622314, + "logits/rejected": -1.9431390762329102, + "logps/chosen": -387.5065002441406, + "logps/rejected": -457.86065673828125, + "loss": 0.5738, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -1.048559308052063, + "rewards/margins": 0.6856990456581116, + "rewards/rejected": -1.7342584133148193, "step": 1130 }, { "epoch": 0.3, - "grad_norm": 14.5625, + "grad_norm": 9.3125, "learning_rate": 4.425350828065204e-06, - "logits/chosen": -1.4182418584823608, - "logits/rejected": -1.2102141380310059, - "logps/chosen": -417.3229064941406, - "logps/rejected": -449.11248779296875, - "loss": 0.4964, - "rewards/accuracies": 0.762499988079071, - "rewards/chosen": -1.332468032836914, - "rewards/margins": 0.7967325448989868, - "rewards/rejected": -2.1292006969451904, + "logits/chosen": -2.1399078369140625, + "logits/rejected": -1.9063644409179688, + "logps/chosen": -398.8601989746094, + "logps/rejected": -416.056640625, + "loss": 0.5311, + "rewards/accuracies": 0.793749988079071, + "rewards/chosen": -0.9232648611068726, + "rewards/margins": 0.6931252479553223, + "rewards/rejected": -1.6163902282714844, "step": 1140 }, { "epoch": 0.3, - "grad_norm": 7.53125, + "grad_norm": 5.9375, "learning_rate": 4.410698644942303e-06, - "logits/chosen": -1.4434064626693726, - "logits/rejected": -1.3108904361724854, - "logps/chosen": -392.4228820800781, - "logps/rejected": -455.933837890625, - "loss": 0.4833, - "rewards/accuracies": 0.7749999761581421, - "rewards/chosen": -1.2017642259597778, - "rewards/margins": 0.8739562034606934, - "rewards/rejected": -2.0757203102111816, + "logits/chosen": -2.1537070274353027, + "logits/rejected": -2.044154167175293, + "logps/chosen": -377.70782470703125, + "logps/rejected": -427.4280700683594, + "loss": 0.5116, + "rewards/accuracies": 0.71875, + "rewards/chosen": -0.8818570971488953, + "rewards/margins": 0.7475295066833496, + "rewards/rejected": -1.6293865442276, "step": 1150 }, { "epoch": 0.3, - "grad_norm": 13.5, + "grad_norm": 9.9375, "learning_rate": 4.395886919032406e-06, - "logits/chosen": -1.285071611404419, - "logits/rejected": -1.1748772859573364, - "logps/chosen": -413.7234802246094, - "logps/rejected": -464.4402770996094, - "loss": 0.5262, - "rewards/accuracies": 0.737500011920929, - "rewards/chosen": -1.499909520149231, - "rewards/margins": 0.7716993093490601, - "rewards/rejected": -2.271608829498291, + "logits/chosen": -1.9682527780532837, + "logits/rejected": -1.9200541973114014, + "logps/chosen": -385.158203125, + "logps/rejected": -427.021484375, + "loss": 0.5316, + "rewards/accuracies": 0.7562500238418579, + "rewards/chosen": -1.030469536781311, + "rewards/margins": 0.7004586458206177, + "rewards/rejected": -1.7309284210205078, "step": 1160 }, { "epoch": 0.31, - "grad_norm": 9.8125, + "grad_norm": 7.59375, "learning_rate": 4.380916887110366e-06, - "logits/chosen": -1.4289734363555908, - "logits/rejected": -1.237170934677124, - "logps/chosen": -424.04852294921875, - "logps/rejected": -467.486328125, - "loss": 0.534, - "rewards/accuracies": 0.768750011920929, - "rewards/chosen": -1.6495983600616455, - "rewards/margins": 0.837870717048645, - "rewards/rejected": -2.48746919631958, + "logits/chosen": -2.1585569381713867, + "logits/rejected": -1.9411481618881226, + "logps/chosen": -395.1438903808594, + "logps/rejected": -424.6376953125, + "loss": 0.5312, + "rewards/accuracies": 0.7562500238418579, + "rewards/chosen": -1.1806066036224365, + "rewards/margins": 0.7220126390457153, + "rewards/rejected": -1.9026191234588623, "step": 1170 }, { "epoch": 0.31, - "grad_norm": 8.5625, + "grad_norm": 9.4375, "learning_rate": 4.365789799169539e-06, - "logits/chosen": -1.3049434423446655, - "logits/rejected": -1.3542959690093994, - "logps/chosen": -396.14105224609375, - "logps/rejected": -471.424072265625, - "loss": 0.5497, - "rewards/accuracies": 0.6875, - "rewards/chosen": -1.4954506158828735, - "rewards/margins": 0.6992089152336121, - "rewards/rejected": -2.194659471511841, + "logits/chosen": -1.9048945903778076, + "logits/rejected": -1.9615923166275024, + "logps/chosen": -394.121337890625, + "logps/rejected": -464.55633544921875, + "loss": 0.5309, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -1.21621572971344, + "rewards/margins": 0.7191425561904907, + "rewards/rejected": -1.9353582859039307, "step": 1180 }, { "epoch": 0.31, - "grad_norm": 8.0625, + "grad_norm": 8.375, "learning_rate": 4.350506918317416e-06, - "logits/chosen": -1.5622175931930542, - "logits/rejected": -1.4013831615447998, - "logps/chosen": -361.25677490234375, - "logps/rejected": -424.95721435546875, - "loss": 0.5253, + "logits/chosen": -2.11989164352417, + "logits/rejected": -1.946270227432251, + "logps/chosen": -387.0372009277344, + "logps/rejected": -453.46307373046875, + "loss": 0.52, "rewards/accuracies": 0.7437499761581421, - "rewards/chosen": -1.1372191905975342, - "rewards/margins": 0.7087024450302124, - "rewards/rejected": -1.845921516418457, + "rewards/chosen": -1.2322447299957275, + "rewards/margins": 0.72694331407547, + "rewards/rejected": -1.9591881036758423, "step": 1190 }, { "epoch": 0.31, - "grad_norm": 6.8125, + "grad_norm": 9.375, "learning_rate": 4.335069520670149e-06, - "logits/chosen": -1.412278413772583, - "logits/rejected": -1.3392635583877563, - "logps/chosen": -347.6351318359375, - "logps/rejected": -417.938720703125, - "loss": 0.5581, - "rewards/accuracies": 0.6875, - "rewards/chosen": -1.1793489456176758, - "rewards/margins": 0.6217935681343079, - "rewards/rejected": -1.8011423349380493, + "logits/chosen": -1.9962623119354248, + "logits/rejected": -1.9240058660507202, + "logps/chosen": -365.1288757324219, + "logps/rejected": -439.9312438964844, + "loss": 0.5866, + "rewards/accuracies": 0.6625000238418579, + "rewards/chosen": -1.1962945461273193, + "rewards/margins": 0.6338215470314026, + "rewards/rejected": -1.8301159143447876, "step": 1200 }, { "epoch": 0.31, - "eval_logits/chosen": -1.4257181882858276, - "eval_logits/rejected": -1.3054285049438477, - "eval_logps/chosen": -375.3857421875, - "eval_logps/rejected": -424.83258056640625, - "eval_loss": 0.5235946774482727, - "eval_rewards/accuracies": 0.7300000190734863, - "eval_rewards/chosen": -1.107582688331604, - "eval_rewards/margins": 0.6948392391204834, - "eval_rewards/rejected": -1.8024218082427979, - "eval_runtime": 785.7041, - "eval_samples_per_second": 2.545, - "eval_steps_per_second": 0.318, + "eval_logits/chosen": -2.1387972831726074, + "eval_logits/rejected": -2.0249171257019043, + "eval_logps/chosen": -389.76617431640625, + "eval_logps/rejected": -436.6127624511719, + "eval_loss": 0.5338234305381775, + "eval_rewards/accuracies": 0.7204999923706055, + "eval_rewards/chosen": -1.0463193655014038, + "eval_rewards/margins": 0.6873589754104614, + "eval_rewards/rejected": -1.7336784601211548, + "eval_runtime": 783.6864, + "eval_samples_per_second": 2.552, + "eval_steps_per_second": 0.319, "step": 1200 }, { "epoch": 0.32, - "grad_norm": 8.3125, + "grad_norm": 4.34375, "learning_rate": 4.319478895246e-06, - "logits/chosen": -1.4606410264968872, - "logits/rejected": -1.2953035831451416, - "logps/chosen": -355.97027587890625, - "logps/rejected": -399.80560302734375, - "loss": 0.5126, - "rewards/accuracies": 0.768750011920929, - "rewards/chosen": -1.088834524154663, - "rewards/margins": 0.6851385831832886, - "rewards/rejected": -1.7739731073379517, + "logits/chosen": -2.0610435009002686, + "logits/rejected": -1.9308923482894897, + "logps/chosen": -357.9656677246094, + "logps/rejected": -389.885498046875, + "loss": 0.542, + "rewards/accuracies": 0.7124999761581421, + "rewards/chosen": -0.9039508104324341, + "rewards/margins": 0.6048199534416199, + "rewards/rejected": -1.5087708234786987, "step": 1210 }, { "epoch": 0.32, - "grad_norm": 20.25, + "grad_norm": 9.75, "learning_rate": 4.303736343857704e-06, - "logits/chosen": -1.4367766380310059, - "logits/rejected": -1.3515610694885254, - "logps/chosen": -380.9598083496094, - "logps/rejected": -480.06170654296875, - "loss": 0.5198, - "rewards/accuracies": 0.71875, - "rewards/chosen": -1.2385380268096924, - "rewards/margins": 0.8721078038215637, - "rewards/rejected": -2.1106457710266113, + "logits/chosen": -2.0861053466796875, + "logits/rejected": -2.018589735031128, + "logps/chosen": -353.1060791015625, + "logps/rejected": -446.2403869628906, + "loss": 0.4978, + "rewards/accuracies": 0.762499988079071, + "rewards/chosen": -0.7825400233268738, + "rewards/margins": 0.8096281886100769, + "rewards/rejected": -1.5921683311462402, "step": 1220 }, { "epoch": 0.32, - "grad_norm": 10.5625, + "grad_norm": 8.375, "learning_rate": 4.287843181003772e-06, - "logits/chosen": -1.4491949081420898, - "logits/rejected": -1.2914572954177856, - "logps/chosen": -456.51898193359375, - "logps/rejected": -474.9752502441406, - "loss": 0.5841, - "rewards/accuracies": 0.6937500238418579, - "rewards/chosen": -1.6272194385528564, - "rewards/margins": 0.6476430296897888, - "rewards/rejected": -2.27486252784729, + "logits/chosen": -2.0886423587799072, + "logits/rejected": -1.9124377965927124, + "logps/chosen": -429.72119140625, + "logps/rejected": -441.4107360839844, + "loss": 0.576, + "rewards/accuracies": 0.65625, + "rewards/chosen": -1.1503570079803467, + "rewards/margins": 0.6045898795127869, + "rewards/rejected": -1.7549469470977783, "step": 1230 }, { "epoch": 0.32, - "grad_norm": 6.53125, + "grad_norm": 8.9375, "learning_rate": 4.27180073375873e-06, - "logits/chosen": -1.4414918422698975, - "logits/rejected": -1.2992889881134033, - "logps/chosen": -440.4886169433594, - "logps/rejected": -475.1626892089844, - "loss": 0.5292, - "rewards/accuracies": 0.731249988079071, - "rewards/chosen": -1.5315672159194946, - "rewards/margins": 0.7622373104095459, - "rewards/rejected": -2.29380464553833, + "logits/chosen": -2.0608718395233154, + "logits/rejected": -1.8816003799438477, + "logps/chosen": -424.85528564453125, + "logps/rejected": -464.27215576171875, + "loss": 0.5232, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.177444338798523, + "rewards/margins": 0.769978404045105, + "rewards/rejected": -1.947422742843628, "step": 1240 }, { "epoch": 0.33, - "grad_norm": 5.09375, + "grad_norm": 3.734375, "learning_rate": 4.255610341662304e-06, - "logits/chosen": -1.4650533199310303, - "logits/rejected": -1.257328987121582, - "logps/chosen": -408.236328125, - "logps/rejected": -454.47760009765625, - "loss": 0.561, - "rewards/accuracies": 0.668749988079071, - "rewards/chosen": -1.530187964439392, - "rewards/margins": 0.6616398096084595, - "rewards/rejected": -2.1918280124664307, + "logits/chosen": -2.1295692920684814, + "logits/rejected": -1.8819198608398438, + "logps/chosen": -387.6376647949219, + "logps/rejected": -432.35418701171875, + "loss": 0.5569, + "rewards/accuracies": 0.65625, + "rewards/chosen": -1.10086989402771, + "rewards/margins": 0.6811186075210571, + "rewards/rejected": -1.781988501548767, "step": 1250 }, { "epoch": 0.33, - "grad_norm": 6.40625, + "grad_norm": 6.53125, "learning_rate": 4.2392733566075764e-06, - "logits/chosen": -1.4444035291671753, - "logits/rejected": -1.3093421459197998, - "logps/chosen": -423.92840576171875, - "logps/rejected": -458.30908203125, - "loss": 0.5834, - "rewards/accuracies": 0.7124999761581421, - "rewards/chosen": -1.6796367168426514, - "rewards/margins": 0.4993881285190582, - "rewards/rejected": -2.1790249347686768, + "logits/chosen": -2.117842674255371, + "logits/rejected": -1.9765924215316772, + "logps/chosen": -393.4205017089844, + "logps/rejected": -428.7975158691406, + "loss": 0.5688, + "rewards/accuracies": 0.706250011920929, + "rewards/chosen": -1.1825861930847168, + "rewards/margins": 0.5461826324462891, + "rewards/rejected": -1.7287689447402954, "step": 1260 }, { "epoch": 0.33, - "grad_norm": 6.96875, + "grad_norm": 7.40625, "learning_rate": 4.2227911427280975e-06, - "logits/chosen": -1.4136598110198975, - "logits/rejected": -1.2310463190078735, - "logps/chosen": -391.7215270996094, - "logps/rejected": -422.70928955078125, - "loss": 0.5448, + "logits/chosen": -2.0678598880767822, + "logits/rejected": -1.8555564880371094, + "logps/chosen": -370.04156494140625, + "logps/rejected": -407.6815490722656, + "loss": 0.5463, "rewards/accuracies": 0.675000011920929, - "rewards/chosen": -1.4147931337356567, - "rewards/margins": 0.6379164457321167, - "rewards/rejected": -2.0527095794677734, + "rewards/chosen": -1.0184184312820435, + "rewards/margins": 0.7026650905609131, + "rewards/rejected": -1.721083402633667, "step": 1270 }, { "epoch": 0.33, - "grad_norm": 12.4375, + "grad_norm": 9.4375, "learning_rate": 4.206165076283983e-06, - "logits/chosen": -1.4327565431594849, - "logits/rejected": -1.2814817428588867, - "logps/chosen": -380.93310546875, - "logps/rejected": -449.3885803222656, - "loss": 0.4766, - "rewards/accuracies": 0.78125, - "rewards/chosen": -1.3505585193634033, - "rewards/margins": 0.8950177431106567, - "rewards/rejected": -2.2455763816833496, + "logits/chosen": -2.090740442276001, + "logits/rejected": -1.9197864532470703, + "logps/chosen": -359.6732482910156, + "logps/rejected": -415.2185974121094, + "loss": 0.5172, + "rewards/accuracies": 0.737500011920929, + "rewards/chosen": -0.9589044451713562, + "rewards/margins": 0.7464350461959839, + "rewards/rejected": -1.7053394317626953, "step": 1280 }, { "epoch": 0.34, - "grad_norm": 10.0, + "grad_norm": 8.6875, "learning_rate": 4.189396545546995e-06, - "logits/chosen": -1.3509052991867065, - "logits/rejected": -1.2488114833831787, - "logps/chosen": -402.62908935546875, - "logps/rejected": -479.2837829589844, - "loss": 0.4963, - "rewards/accuracies": 0.762499988079071, - "rewards/chosen": -1.5351405143737793, - "rewards/margins": 0.9657770395278931, - "rewards/rejected": -2.500917434692383, + "logits/chosen": -2.073063373565674, + "logits/rejected": -1.9953018426895142, + "logps/chosen": -351.28448486328125, + "logps/rejected": -401.362060546875, + "loss": 0.5346, + "rewards/accuracies": 0.6937500238418579, + "rewards/chosen": -0.8371566534042358, + "rewards/margins": 0.7300963997840881, + "rewards/rejected": -1.5672528743743896, "step": 1290 }, { "epoch": 0.34, - "grad_norm": 11.75, + "grad_norm": 5.375, "learning_rate": 4.172486950684627e-06, - "logits/chosen": -1.268965244293213, - "logits/rejected": -1.1887578964233398, - "logps/chosen": -441.35858154296875, - "logps/rejected": -525.0691528320312, - "loss": 0.5446, - "rewards/accuracies": 0.7124999761581421, - "rewards/chosen": -1.8772825002670288, - "rewards/margins": 0.8692727088928223, - "rewards/rejected": -2.7465550899505615, + "logits/chosen": -2.1238481998443604, + "logits/rejected": -2.0095021724700928, + "logps/chosen": -348.4383239746094, + "logps/rejected": -406.6896667480469, + "loss": 0.5659, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -0.7746914029121399, + "rewards/margins": 0.6173580884933472, + "rewards/rejected": -1.3920494318008423, "step": 1300 }, { "epoch": 0.34, - "eval_logits/chosen": -1.0957450866699219, - "eval_logits/rejected": -0.9841247797012329, - "eval_logps/chosen": -490.49017333984375, - "eval_logps/rejected": -567.4619140625, - "eval_loss": 0.5300161242485046, - "eval_rewards/accuracies": 0.7235000133514404, - "eval_rewards/chosen": -2.25862717628479, - "eval_rewards/margins": 0.9700879454612732, - "eval_rewards/rejected": -3.228715419769287, - "eval_runtime": 784.7067, - "eval_samples_per_second": 2.549, + "eval_logits/chosen": -2.1049439907073975, + "eval_logits/rejected": -1.9892892837524414, + "eval_logps/chosen": -371.2005920410156, + "eval_logps/rejected": -417.2295837402344, + "eval_loss": 0.5310410261154175, + "eval_rewards/accuracies": 0.7310000061988831, + "eval_rewards/chosen": -0.8606633543968201, + "eval_rewards/margins": 0.679183304309845, + "eval_rewards/rejected": -1.539846658706665, + "eval_runtime": 782.8904, + "eval_samples_per_second": 2.555, "eval_steps_per_second": 0.319, "step": 1300 }, { "epoch": 0.34, - "grad_norm": 13.5625, + "grad_norm": 10.75, "learning_rate": 4.155437703643182e-06, - "logits/chosen": -1.2097558975219727, - "logits/rejected": -1.0522234439849854, - "logps/chosen": -461.5384826660156, - "logps/rejected": -531.5099487304688, - "loss": 0.5011, - "rewards/accuracies": 0.75, - "rewards/chosen": -2.1479458808898926, - "rewards/margins": 0.9907328486442566, - "rewards/rejected": -3.138678550720215, + "logits/chosen": -2.134488582611084, + "logits/rejected": -1.9875046014785767, + "logps/chosen": -345.08453369140625, + "logps/rejected": -394.74932861328125, + "loss": 0.5007, + "rewards/accuracies": 0.71875, + "rewards/chosen": -0.8158382177352905, + "rewards/margins": 0.7958999872207642, + "rewards/rejected": -1.6117382049560547, "step": 1310 }, { "epoch": 0.35, - "grad_norm": 16.625, + "grad_norm": 8.875, "learning_rate": 4.138250228029882e-06, - "logits/chosen": -1.2134325504302979, - "logits/rejected": -1.1368061304092407, - "logps/chosen": -478.79449462890625, - "logps/rejected": -573.6204223632812, - "loss": 0.5209, - "rewards/accuracies": 0.737500011920929, - "rewards/chosen": -2.2191338539123535, - "rewards/margins": 0.9137837290763855, - "rewards/rejected": -3.1329174041748047, + "logits/chosen": -2.039412021636963, + "logits/rejected": -1.9596283435821533, + "logps/chosen": -389.32757568359375, + "logps/rejected": -462.86376953125, + "loss": 0.5427, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -1.1479754447937012, + "rewards/margins": 0.6707499623298645, + "rewards/rejected": -1.818725347518921, "step": 1320 }, { "epoch": 0.35, - "grad_norm": 8.875, + "grad_norm": 4.53125, "learning_rate": 4.120925958993994e-06, - "logits/chosen": -1.2161281108856201, - "logits/rejected": -1.1206849813461304, - "logps/chosen": -419.74066162109375, - "logps/rejected": -502.05682373046875, - "loss": 0.548, - "rewards/accuracies": 0.706250011920929, - "rewards/chosen": -1.8588517904281616, - "rewards/margins": 0.8662081956863403, - "rewards/rejected": -2.725059747695923, + "logits/chosen": -2.0538439750671387, + "logits/rejected": -1.8900225162506104, + "logps/chosen": -349.4403381347656, + "logps/rejected": -406.2320251464844, + "loss": 0.5515, + "rewards/accuracies": 0.731249988079071, + "rewards/chosen": -0.9707523584365845, + "rewards/margins": 0.6485539674758911, + "rewards/rejected": -1.6193063259124756, "step": 1330 }, { "epoch": 0.35, - "grad_norm": 8.5625, + "grad_norm": 10.75, "learning_rate": 4.103466343106999e-06, - "logits/chosen": -1.4189960956573486, - "logits/rejected": -1.2956821918487549, - "logps/chosen": -427.60992431640625, - "logps/rejected": -475.4102478027344, - "loss": 0.5402, + "logits/chosen": -2.099672317504883, + "logits/rejected": -1.962204933166504, + "logps/chosen": -396.4217529296875, + "logps/rejected": -423.8016662597656, + "loss": 0.5514, "rewards/accuracies": 0.7250000238418579, - "rewards/chosen": -1.582824468612671, - "rewards/margins": 0.7366166114807129, - "rewards/rejected": -2.319441080093384, + "rewards/chosen": -1.0410661697387695, + "rewards/margins": 0.6128488779067993, + "rewards/rejected": -1.6539150476455688, "step": 1340 }, { "epoch": 0.35, - "grad_norm": 10.9375, + "grad_norm": 9.3125, "learning_rate": 4.085872838241797e-06, - "logits/chosen": -1.369505524635315, - "logits/rejected": -1.2414172887802124, - "logps/chosen": -413.95208740234375, - "logps/rejected": -456.4146423339844, - "loss": 0.581, - "rewards/accuracies": 0.7124999761581421, - "rewards/chosen": -1.5163764953613281, - "rewards/margins": 0.6249885559082031, - "rewards/rejected": -2.1413650512695312, + "logits/chosen": -1.950664758682251, + "logits/rejected": -1.8220726251602173, + "logps/chosen": -407.6488952636719, + "logps/rejected": -432.849609375, + "loss": 0.5876, + "rewards/accuracies": 0.6812499761581421, + "rewards/chosen": -1.10310959815979, + "rewards/margins": 0.5940074324607849, + "rewards/rejected": -1.6971168518066406, "step": 1350 }, { "epoch": 0.36, - "grad_norm": 7.96875, + "grad_norm": 7.28125, "learning_rate": 4.06814691345098e-06, - "logits/chosen": -1.3671820163726807, - "logits/rejected": -1.2069867849349976, - "logps/chosen": -386.15679931640625, - "logps/rejected": -448.37628173828125, - "loss": 0.4791, - "rewards/accuracies": 0.78125, - "rewards/chosen": -1.3064360618591309, - "rewards/margins": 0.8428786396980286, - "rewards/rejected": -2.1493146419525146, + "logits/chosen": -1.9413379430770874, + "logits/rejected": -1.7642743587493896, + "logps/chosen": -379.175048828125, + "logps/rejected": -433.0530700683594, + "loss": 0.5028, + "rewards/accuracies": 0.737500011920929, + "rewards/chosen": -1.0211687088012695, + "rewards/margins": 0.7622953653335571, + "rewards/rejected": -1.7834640741348267, "step": 1360 }, { "epoch": 0.36, - "grad_norm": 11.1875, + "grad_norm": 7.59375, "learning_rate": 4.050290048844171e-06, - "logits/chosen": -1.4646304845809937, - "logits/rejected": -1.367467999458313, - "logps/chosen": -407.99505615234375, - "logps/rejected": -482.73907470703125, - "loss": 0.5289, - "rewards/accuracies": 0.6875, - "rewards/chosen": -1.3937979936599731, - "rewards/margins": 0.7991429567337036, - "rewards/rejected": -2.192941188812256, + "logits/chosen": -2.095034122467041, + "logits/rejected": -1.9543094635009766, + "logps/chosen": -379.04046630859375, + "logps/rejected": -442.8995666503906, + "loss": 0.5448, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -0.9098923802375793, + "rewards/margins": 0.6899572014808655, + "rewards/rejected": -1.5998497009277344, "step": 1370 }, { "epoch": 0.36, - "grad_norm": 9.875, + "grad_norm": 5.6875, "learning_rate": 4.032303735464422e-06, - "logits/chosen": -1.4965475797653198, - "logits/rejected": -1.3343265056610107, - "logps/chosen": -435.0130920410156, - "logps/rejected": -509.0382385253906, - "loss": 0.4611, + "logits/chosen": -2.0973174571990967, + "logits/rejected": -2.030759811401367, + "logps/chosen": -381.38037109375, + "logps/rejected": -437.8154296875, + "loss": 0.4747, "rewards/accuracies": 0.7875000238418579, - "rewards/chosen": -1.6306921243667603, - "rewards/margins": 0.9709357023239136, - "rewards/rejected": -2.601627826690674, + "rewards/chosen": -0.8857170343399048, + "rewards/margins": 0.8334710001945496, + "rewards/rejected": -1.7191880941390991, "step": 1380 }, { "epoch": 0.36, - "grad_norm": 14.375, + "grad_norm": 6.09375, "learning_rate": 4.014189475163727e-06, - "logits/chosen": -1.2652885913848877, - "logits/rejected": -1.1585814952850342, - "logps/chosen": -438.55712890625, - "logps/rejected": -525.4952392578125, - "loss": 0.4871, - "rewards/accuracies": 0.75, - "rewards/chosen": -1.839609146118164, - "rewards/margins": 0.9500301480293274, - "rewards/rejected": -2.7896392345428467, + "logits/chosen": -1.9514198303222656, + "logits/rejected": -1.8510992527008057, + "logps/chosen": -375.90625, + "logps/rejected": -439.33709716796875, + "loss": 0.5165, + "rewards/accuracies": 0.762499988079071, + "rewards/chosen": -1.0048784017562866, + "rewards/margins": 0.7614682912826538, + "rewards/rejected": -1.7663466930389404, "step": 1390 }, { "epoch": 0.37, - "grad_norm": 13.0625, + "grad_norm": 9.75, "learning_rate": 3.995948780477605e-06, - "logits/chosen": -1.3760168552398682, - "logits/rejected": -1.2231824398040771, - "logps/chosen": -443.558837890625, - "logps/rejected": -497.2522888183594, - "loss": 0.5288, - "rewards/accuracies": 0.71875, - "rewards/chosen": -1.748147964477539, - "rewards/margins": 0.7837062478065491, - "rewards/rejected": -2.5318541526794434, + "logits/chosen": -2.0884742736816406, + "logits/rejected": -1.8744325637817383, + "logps/chosen": -370.63525390625, + "logps/rejected": -408.30657958984375, + "loss": 0.5625, + "rewards/accuracies": 0.6875, + "rewards/chosen": -0.8331832885742188, + "rewards/margins": 0.6226540803909302, + "rewards/rejected": -1.455837368965149, "step": 1400 }, { "epoch": 0.37, - "eval_logits/chosen": -1.2369366884231567, - "eval_logits/rejected": -1.1188790798187256, - "eval_logps/chosen": -432.47894287109375, - "eval_logps/rejected": -500.79150390625, - "eval_loss": 0.5134124755859375, - "eval_rewards/accuracies": 0.7350000143051147, - "eval_rewards/chosen": -1.6785151958465576, - "eval_rewards/margins": 0.883496105670929, - "eval_rewards/rejected": -2.562011241912842, - "eval_runtime": 782.4668, - "eval_samples_per_second": 2.556, - "eval_steps_per_second": 0.32, + "eval_logits/chosen": -2.039109468460083, + "eval_logits/rejected": -1.9253612756729126, + "eval_logps/chosen": -365.1205749511719, + "eval_logps/rejected": -413.8091735839844, + "eval_loss": 0.5294913053512573, + "eval_rewards/accuracies": 0.7214999794960022, + "eval_rewards/chosen": -0.7998626828193665, + "eval_rewards/margins": 0.7057796120643616, + "eval_rewards/rejected": -1.505642294883728, + "eval_runtime": 782.6935, + "eval_samples_per_second": 2.555, + "eval_steps_per_second": 0.319, "step": 1400 }, { "epoch": 0.37, - "grad_norm": 8.1875, + "grad_norm": 5.0, "learning_rate": 3.977583174498816e-06, - "logits/chosen": -1.263580560684204, - "logits/rejected": -1.1519924402236938, - "logps/chosen": -439.1827697753906, - "logps/rejected": -539.3436889648438, - "loss": 0.4008, - "rewards/accuracies": 0.8125, - "rewards/chosen": -1.7555328607559204, - "rewards/margins": 1.2208806276321411, - "rewards/rejected": -2.9764137268066406, + "logits/chosen": -2.00089693069458, + "logits/rejected": -1.875847578048706, + "logps/chosen": -362.216796875, + "logps/rejected": -436.0418395996094, + "loss": 0.4119, + "rewards/accuracies": 0.831250011920929, + "rewards/chosen": -0.7390182614326477, + "rewards/margins": 0.9906827211380005, + "rewards/rejected": -1.729701042175293, "step": 1410 }, { "epoch": 0.37, - "grad_norm": 13.125, + "grad_norm": 8.0, "learning_rate": 3.959094190750172e-06, - "logits/chosen": -1.2331125736236572, - "logits/rejected": -1.0962107181549072, - "logps/chosen": -490.2676696777344, - "logps/rejected": -562.7335205078125, - "loss": 0.5274, - "rewards/accuracies": 0.731249988079071, - "rewards/chosen": -2.051352024078369, - "rewards/margins": 0.9939060211181641, - "rewards/rejected": -3.045258045196533, + "logits/chosen": -1.9545952081680298, + "logits/rejected": -1.8029680252075195, + "logps/chosen": -398.83709716796875, + "logps/rejected": -449.42742919921875, + "loss": 0.5272, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -0.9393488764762878, + "rewards/margins": 0.8084908723831177, + "rewards/rejected": -1.7478396892547607, "step": 1420 }, { "epoch": 0.37, - "grad_norm": 12.625, + "grad_norm": 11.125, "learning_rate": 3.9404833730564975e-06, - "logits/chosen": -1.1363118886947632, - "logits/rejected": -1.0240169763565063, - "logps/chosen": -450.925048828125, - "logps/rejected": -538.6007080078125, - "loss": 0.5225, - "rewards/accuracies": 0.75, - "rewards/chosen": -1.9782140254974365, - "rewards/margins": 0.9927111864089966, - "rewards/rejected": -2.9709250926971436, + "logits/chosen": -1.8657894134521484, + "logits/rejected": -1.716658353805542, + "logps/chosen": -378.5374755859375, + "logps/rejected": -448.2701721191406, + "loss": 0.5079, + "rewards/accuracies": 0.731249988079071, + "rewards/chosen": -1.0239883661270142, + "rewards/margins": 0.8290524482727051, + "rewards/rejected": -1.8530406951904297, "step": 1430 }, { "epoch": 0.38, - "grad_norm": 13.8125, + "grad_norm": 8.8125, "learning_rate": 3.921752275415712e-06, - "logits/chosen": -1.194887399673462, - "logits/rejected": -1.1376324892044067, - "logps/chosen": -451.27496337890625, - "logps/rejected": -552.3207397460938, - "loss": 0.4409, - "rewards/accuracies": 0.8125, - "rewards/chosen": -2.008340358734131, - "rewards/margins": 1.2156970500946045, - "rewards/rejected": -3.2240378856658936, + "logits/chosen": -1.8294332027435303, + "logits/rejected": -1.8695322275161743, + "logps/chosen": -386.57476806640625, + "logps/rejected": -454.24981689453125, + "loss": 0.4901, + "rewards/accuracies": 0.78125, + "rewards/chosen": -1.1536591053009033, + "rewards/margins": 0.9241256713867188, + "rewards/rejected": -2.077784776687622, "step": 1440 }, { "epoch": 0.38, - "grad_norm": 5.375, + "grad_norm": 9.9375, "learning_rate": 3.902902461869079e-06, - "logits/chosen": -1.1640112400054932, - "logits/rejected": -1.0399417877197266, - "logps/chosen": -474.15594482421875, - "logps/rejected": -570.0380249023438, - "loss": 0.5512, - "rewards/accuracies": 0.7250000238418579, - "rewards/chosen": -2.3466668128967285, - "rewards/margins": 1.1234229803085327, - "rewards/rejected": -3.4700896739959717, + "logits/chosen": -1.9432599544525146, + "logits/rejected": -1.8257582187652588, + "logps/chosen": -391.89544677734375, + "logps/rejected": -465.5562438964844, + "loss": 0.5328, + "rewards/accuracies": 0.737500011920929, + "rewards/chosen": -1.3478235006332397, + "rewards/margins": 0.9181373715400696, + "rewards/rejected": -2.265960931777954, "step": 1450 }, { "epoch": 0.38, - "grad_norm": 20.5, + "grad_norm": 9.3125, "learning_rate": 3.883935506370605e-06, - "logits/chosen": -1.1935418844223022, - "logits/rejected": -1.0673936605453491, - "logps/chosen": -464.279541015625, - "logps/rejected": -521.2711181640625, - "loss": 0.5569, - "rewards/accuracies": 0.7250000238418579, - "rewards/chosen": -2.110640048980713, - "rewards/margins": 0.8482397794723511, - "rewards/rejected": -2.9588799476623535, + "logits/chosen": -1.9065253734588623, + "logits/rejected": -1.7174403667449951, + "logps/chosen": -428.204833984375, + "logps/rejected": -474.4853515625, + "loss": 0.5452, + "rewards/accuracies": 0.731249988079071, + "rewards/chosen": -1.5569171905517578, + "rewards/margins": 0.7829157114028931, + "rewards/rejected": -2.3398330211639404, "step": 1460 }, { "epoch": 0.38, - "grad_norm": 4.90625, + "grad_norm": 4.875, "learning_rate": 3.864852992655617e-06, - "logits/chosen": -1.3366987705230713, - "logits/rejected": -1.2401338815689087, - "logps/chosen": -414.4486389160156, - "logps/rejected": -496.52886962890625, - "loss": 0.4655, - "rewards/accuracies": 0.762499988079071, - "rewards/chosen": -1.659065842628479, - "rewards/margins": 0.9428392648696899, - "rewards/rejected": -2.601905345916748, + "logits/chosen": -1.921338677406311, + "logits/rejected": -1.8216731548309326, + "logps/chosen": -417.23699951171875, + "logps/rejected": -486.22344970703125, + "loss": 0.4872, + "rewards/accuracies": 0.7562500238418579, + "rewards/chosen": -1.457044243812561, + "rewards/margins": 0.8441551327705383, + "rewards/rejected": -2.301199436187744, "step": 1470 }, { "epoch": 0.39, - "grad_norm": 6.5625, + "grad_norm": 5.3125, "learning_rate": 3.845656514108516e-06, - "logits/chosen": -1.3062841892242432, - "logits/rejected": -1.1431127786636353, - "logps/chosen": -451.17138671875, - "logps/rejected": -493.60760498046875, - "loss": 0.4747, - "rewards/accuracies": 0.7562500238418579, - "rewards/chosen": -1.9208762645721436, - "rewards/margins": 0.9739481210708618, - "rewards/rejected": -2.894824266433716, + "logits/chosen": -1.9481254816055298, + "logits/rejected": -1.7473742961883545, + "logps/chosen": -439.948486328125, + "logps/rejected": -445.56268310546875, + "loss": 0.5663, + "rewards/accuracies": 0.6625000238418579, + "rewards/chosen": -1.5704560279846191, + "rewards/margins": 0.6752415895462036, + "rewards/rejected": -2.2456977367401123, "step": 1480 }, { "epoch": 0.39, - "grad_norm": 9.75, + "grad_norm": 9.1875, "learning_rate": 3.826347673629738e-06, - "logits/chosen": -1.260495662689209, - "logits/rejected": -1.075345754623413, - "logps/chosen": -461.8627014160156, - "logps/rejected": -551.0700073242188, - "loss": 0.4878, - "rewards/accuracies": 0.731249988079071, - "rewards/chosen": -2.085941791534424, - "rewards/margins": 1.1549327373504639, - "rewards/rejected": -3.2408745288848877, + "logits/chosen": -1.9770145416259766, + "logits/rejected": -1.7594553232192993, + "logps/chosen": -389.63836669921875, + "logps/rejected": -452.9381408691406, + "loss": 0.4914, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -1.1838957071304321, + "rewards/margins": 0.9015070199966431, + "rewards/rejected": -2.085402727127075, "step": 1490 }, { "epoch": 0.39, - "grad_norm": 18.5, + "grad_norm": 7.53125, "learning_rate": 3.8069280835019062e-06, - "logits/chosen": -1.241081953048706, - "logits/rejected": -1.0921330451965332, - "logps/chosen": -499.6073303222656, - "logps/rejected": -612.812255859375, - "loss": 0.4638, - "rewards/accuracies": 0.7875000238418579, - "rewards/chosen": -2.3537635803222656, - "rewards/margins": 1.3232930898666382, - "rewards/rejected": -3.6770567893981934, + "logits/chosen": -1.9332622289657593, + "logits/rejected": -1.8013938665390015, + "logps/chosen": -398.4181213378906, + "logps/rejected": -475.78045654296875, + "loss": 0.4575, + "rewards/accuracies": 0.762499988079071, + "rewards/chosen": -1.1129112243652344, + "rewards/margins": 1.0079797506332397, + "rewards/rejected": -2.1208910942077637, "step": 1500 }, { "epoch": 0.39, - "eval_logits/chosen": -1.2233450412750244, - "eval_logits/rejected": -1.0978461503982544, - "eval_logps/chosen": -486.1478271484375, - "eval_logps/rejected": -580.8158569335938, - "eval_loss": 0.5280259847640991, - "eval_rewards/accuracies": 0.7325000166893005, - "eval_rewards/chosen": -2.215203046798706, - "eval_rewards/margins": 1.1470519304275513, - "eval_rewards/rejected": -3.362255096435547, - "eval_runtime": 784.6278, - "eval_samples_per_second": 2.549, + "eval_logits/chosen": -2.025247812271118, + "eval_logits/rejected": -1.910525918006897, + "eval_logps/chosen": -399.6888732910156, + "eval_logps/rejected": -459.70855712890625, + "eval_loss": 0.5265802145004272, + "eval_rewards/accuracies": 0.7260000109672546, + "eval_rewards/chosen": -1.1455461978912354, + "eval_rewards/margins": 0.8190898299217224, + "eval_rewards/rejected": -1.964635968208313, + "eval_runtime": 783.8773, + "eval_samples_per_second": 2.551, "eval_steps_per_second": 0.319, "step": 1500 }, { "epoch": 0.4, - "grad_norm": 12.5625, + "grad_norm": 7.6875, "learning_rate": 3.7873993652552077e-06, - "logits/chosen": -1.2416788339614868, - "logits/rejected": -1.1541244983673096, - "logps/chosen": -440.28076171875, - "logps/rejected": -523.345703125, - "loss": 0.6306, - "rewards/accuracies": 0.65625, - "rewards/chosen": -2.066037654876709, - "rewards/margins": 0.8885302543640137, - "rewards/rejected": -2.9545681476593018, + "logits/chosen": -1.9163720607757568, + "logits/rejected": -1.8175251483917236, + "logps/chosen": -369.39483642578125, + "logps/rejected": -420.3642578125, + "loss": 0.6257, + "rewards/accuracies": 0.6312500238418579, + "rewards/chosen": -1.180105447769165, + "rewards/margins": 0.591270923614502, + "rewards/rejected": -1.771376371383667, "step": 1510 }, { "epoch": 0.4, - "grad_norm": 11.125, + "grad_norm": 8.3125, "learning_rate": 3.7677631495319953e-06, - "logits/chosen": -1.4541292190551758, - "logits/rejected": -1.3423665761947632, - "logps/chosen": -382.6937255859375, - "logps/rejected": -438.780029296875, - "loss": 0.5179, - "rewards/accuracies": 0.7250000238418579, - "rewards/chosen": -1.1906108856201172, - "rewards/margins": 0.7407819032669067, - "rewards/rejected": -1.9313929080963135, + "logits/chosen": -2.0789072513580322, + "logits/rejected": -1.9311103820800781, + "logps/chosen": -360.617431640625, + "logps/rejected": -415.0093688964844, + "loss": 0.523, + "rewards/accuracies": 0.731249988079071, + "rewards/chosen": -0.7879349589347839, + "rewards/margins": 0.7430469989776611, + "rewards/rejected": -1.5309817790985107, "step": 1520 }, { "epoch": 0.4, - "grad_norm": 7.15625, + "grad_norm": 7.34375, "learning_rate": 3.748021075950633e-06, - "logits/chosen": -1.5313133001327515, - "logits/rejected": -1.4197962284088135, - "logps/chosen": -386.45062255859375, - "logps/rejected": -424.572509765625, - "loss": 0.5895, - "rewards/accuracies": 0.668749988079071, - "rewards/chosen": -1.1186347007751465, - "rewards/margins": 0.48732686042785645, - "rewards/rejected": -1.605961561203003, + "logits/chosen": -2.0557217597961426, + "logits/rejected": -1.866943120956421, + "logps/chosen": -381.2101745605469, + "logps/rejected": -421.3724670410156, + "loss": 0.5781, + "rewards/accuracies": 0.706250011920929, + "rewards/chosen": -0.8339166641235352, + "rewards/margins": 0.5441929697990417, + "rewards/rejected": -1.3781098127365112, "step": 1530 }, { "epoch": 0.4, - "grad_norm": 12.375, + "grad_norm": 7.90625, "learning_rate": 3.7281747929685824e-06, - "logits/chosen": -1.3873339891433716, - "logits/rejected": -1.2301104068756104, - "logps/chosen": -367.59600830078125, - "logps/rejected": -420.02020263671875, - "loss": 0.5421, - "rewards/accuracies": 0.7250000238418579, - "rewards/chosen": -1.3002967834472656, - "rewards/margins": 0.6807582974433899, - "rewards/rejected": -1.9810549020767212, + "logits/chosen": -1.9901912212371826, + "logits/rejected": -1.7555814981460571, + "logps/chosen": -359.7068786621094, + "logps/rejected": -405.24261474609375, + "loss": 0.5585, + "rewards/accuracies": 0.6812499761581421, + "rewards/chosen": -1.026548147201538, + "rewards/margins": 0.6196350455284119, + "rewards/rejected": -1.6461833715438843, "step": 1540 }, { "epoch": 0.41, - "grad_norm": 7.375, + "grad_norm": 8.0625, "learning_rate": 3.7082259577447604e-06, - "logits/chosen": -1.4852304458618164, - "logits/rejected": -1.3760902881622314, - "logps/chosen": -417.33013916015625, - "logps/rejected": -477.65582275390625, - "loss": 0.4857, - "rewards/accuracies": 0.762499988079071, - "rewards/chosen": -1.46196448802948, - "rewards/margins": 0.8031632304191589, - "rewards/rejected": -2.265127658843994, + "logits/chosen": -2.0445377826690674, + "logits/rejected": -1.9260342121124268, + "logps/chosen": -404.2452087402344, + "logps/rejected": -466.71044921875, + "loss": 0.4753, + "rewards/accuracies": 0.7562500238418579, + "rewards/chosen": -1.1328551769256592, + "rewards/margins": 0.8080953359603882, + "rewards/rejected": -1.9409507513046265, "step": 1550 }, { "epoch": 0.41, - "grad_norm": 9.625, + "grad_norm": 6.09375, "learning_rate": 3.6881762360011688e-06, - "logits/chosen": -1.4668078422546387, - "logits/rejected": -1.2735626697540283, - "logps/chosen": -456.6810607910156, - "logps/rejected": -517.4048461914062, - "loss": 0.504, - "rewards/accuracies": 0.731249988079071, - "rewards/chosen": -1.7524135112762451, - "rewards/margins": 0.9768053889274597, - "rewards/rejected": -2.7292189598083496, + "logits/chosen": -1.9906909465789795, + "logits/rejected": -1.808937668800354, + "logps/chosen": -433.05712890625, + "logps/rejected": -471.65643310546875, + "loss": 0.5246, + "rewards/accuracies": 0.706250011920929, + "rewards/chosen": -1.329231858253479, + "rewards/margins": 0.7763558626174927, + "rewards/rejected": -2.1055874824523926, "step": 1560 }, { "epoch": 0.41, - "grad_norm": 10.625, + "grad_norm": 8.5625, "learning_rate": 3.668027301883802e-06, - "logits/chosen": -1.403698444366455, - "logits/rejected": -1.2377779483795166, - "logps/chosen": -443.7867126464844, - "logps/rejected": -529.870849609375, - "loss": 0.5181, - "rewards/accuracies": 0.762499988079071, - "rewards/chosen": -1.9251244068145752, - "rewards/margins": 1.0098555088043213, - "rewards/rejected": -2.9349796772003174, + "logits/chosen": -2.066068649291992, + "logits/rejected": -1.8010034561157227, + "logps/chosen": -375.88629150390625, + "logps/rejected": -440.9111328125, + "loss": 0.493, + "rewards/accuracies": 0.737500011920929, + "rewards/chosen": -1.0363174676895142, + "rewards/margins": 0.8553186655044556, + "rewards/rejected": -1.8916361331939697, "step": 1570 }, { "epoch": 0.41, - "grad_norm": 6.0, + "grad_norm": 4.25, "learning_rate": 3.64778083782286e-06, - "logits/chosen": -1.3432996273040771, - "logits/rejected": -1.3267749547958374, - "logps/chosen": -444.5298767089844, - "logps/rejected": -557.05419921875, - "loss": 0.5426, + "logits/chosen": -1.9386217594146729, + "logits/rejected": -1.899171233177185, + "logps/chosen": -377.6373596191406, + "logps/rejected": -472.9842834472656, + "loss": 0.5669, "rewards/accuracies": 0.7124999761581421, - "rewards/chosen": -1.9199059009552002, - "rewards/margins": 0.8350849151611328, - "rewards/rejected": -2.754991054534912, + "rewards/chosen": -1.0420780181884766, + "rewards/margins": 0.665661096572876, + "rewards/rejected": -1.707739233970642, "step": 1580 }, { "epoch": 0.42, - "grad_norm": 12.375, + "grad_norm": 9.5625, "learning_rate": 3.627438534392268e-06, - "logits/chosen": -1.4838283061981201, - "logits/rejected": -1.451644778251648, - "logps/chosen": -411.3505859375, - "logps/rejected": -506.9202575683594, - "loss": 0.5068, - "rewards/accuracies": 0.75, - "rewards/chosen": -1.7328803539276123, - "rewards/margins": 0.8886650800704956, - "rewards/rejected": -2.6215453147888184, + "logits/chosen": -2.084904193878174, + "logits/rejected": -2.023829221725464, + "logps/chosen": -353.0201721191406, + "logps/rejected": -427.7330017089844, + "loss": 0.5343, + "rewards/accuracies": 0.71875, + "rewards/chosen": -0.9893399477005005, + "rewards/margins": 0.678139328956604, + "rewards/rejected": -1.6674792766571045, "step": 1590 }, { "epoch": 0.42, - "grad_norm": 7.1875, + "grad_norm": 7.0, "learning_rate": 3.607002090168506e-06, - "logits/chosen": -1.347975492477417, - "logits/rejected": -1.2728310823440552, - "logps/chosen": -466.4964904785156, - "logps/rejected": -525.4207153320312, - "loss": 0.5653, - "rewards/accuracies": 0.668749988079071, - "rewards/chosen": -2.022606134414673, - "rewards/margins": 0.803046703338623, - "rewards/rejected": -2.825652837753296, + "logits/chosen": -1.933423399925232, + "logits/rejected": -1.8388328552246094, + "logps/chosen": -398.29754638671875, + "logps/rejected": -430.0, + "loss": 0.5855, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -1.1320139169692993, + "rewards/margins": 0.552472710609436, + "rewards/rejected": -1.684486746788025, "step": 1600 }, { "epoch": 0.42, - "eval_logits/chosen": -1.3479187488555908, - "eval_logits/rejected": -1.2289050817489624, - "eval_logps/chosen": -445.3527526855469, - "eval_logps/rejected": -522.2391967773438, - "eval_loss": 0.5065078139305115, - "eval_rewards/accuracies": 0.7360000014305115, - "eval_rewards/chosen": -1.8072537183761597, - "eval_rewards/margins": 0.9692339301109314, - "eval_rewards/rejected": -2.7764875888824463, - "eval_runtime": 784.7363, - "eval_samples_per_second": 2.549, + "eval_logits/chosen": -2.040278911590576, + "eval_logits/rejected": -1.9276047945022583, + "eval_logps/chosen": -388.7277526855469, + "eval_logps/rejected": -439.5246276855469, + "eval_loss": 0.5226916670799255, + "eval_rewards/accuracies": 0.734499990940094, + "eval_rewards/chosen": -1.0359350442886353, + "eval_rewards/margins": 0.7268618941307068, + "eval_rewards/rejected": -1.7627968788146973, + "eval_runtime": 783.5654, + "eval_samples_per_second": 2.552, "eval_steps_per_second": 0.319, "step": 1600 }, { "epoch": 0.42, - "grad_norm": 5.3125, + "grad_norm": 4.5625, "learning_rate": 3.586473211588787e-06, - "logits/chosen": -1.426160216331482, - "logits/rejected": -1.3096323013305664, - "logps/chosen": -416.1043395996094, - "logps/rejected": -529.2267456054688, - "loss": 0.4505, - "rewards/accuracies": 0.78125, - "rewards/chosen": -1.714402437210083, - "rewards/margins": 1.0645701885223389, - "rewards/rejected": -2.778972625732422, + "logits/chosen": -2.0347843170166016, + "logits/rejected": -1.825095534324646, + "logps/chosen": -351.39971923828125, + "logps/rejected": -442.338623046875, + "loss": 0.4675, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": -0.8847958445549011, + "rewards/margins": 0.8562973141670227, + "rewards/rejected": -1.7410932779312134, "step": 1610 }, { "epoch": 0.42, - "grad_norm": 13.0625, + "grad_norm": 9.3125, "learning_rate": 3.5658536128085623e-06, - "logits/chosen": -1.429114818572998, - "logits/rejected": -1.2542836666107178, - "logps/chosen": -489.0148010253906, - "logps/rejected": -539.6549072265625, - "loss": 0.603, + "logits/chosen": -2.046114683151245, + "logits/rejected": -1.8478816747665405, + "logps/chosen": -398.8979797363281, + "logps/rejected": -427.96405029296875, + "loss": 0.5911, "rewards/accuracies": 0.6812499761581421, - "rewards/chosen": -2.2616515159606934, - "rewards/margins": 0.8016579747200012, - "rewards/rejected": -3.06330943107605, + "rewards/chosen": -1.1641943454742432, + "rewards/margins": 0.6046980619430542, + "rewards/rejected": -1.7688922882080078, "step": 1620 }, { "epoch": 0.43, - "grad_norm": 8.125, + "grad_norm": 9.1875, "learning_rate": 3.545145015558399e-06, - "logits/chosen": -1.2496092319488525, - "logits/rejected": -1.218031644821167, - "logps/chosen": -425.0565490722656, - "logps/rejected": -515.0162353515625, - "loss": 0.4944, - "rewards/accuracies": 0.706250011920929, - "rewards/chosen": -1.934874176979065, - "rewards/margins": 1.0416674613952637, - "rewards/rejected": -2.9765419960021973, + "logits/chosen": -1.891063928604126, + "logits/rejected": -1.809687614440918, + "logps/chosen": -368.8531188964844, + "logps/rejected": -437.64691162109375, + "loss": 0.509, + "rewards/accuracies": 0.6937500238418579, + "rewards/chosen": -1.174725890159607, + "rewards/margins": 0.8606030344963074, + "rewards/rejected": -2.0353291034698486, "step": 1630 }, { "epoch": 0.43, - "grad_norm": 6.0625, + "grad_norm": 5.8125, "learning_rate": 3.5243491490002056e-06, - "logits/chosen": -1.3644239902496338, - "logits/rejected": -1.2777811288833618, - "logps/chosen": -464.21502685546875, - "logps/rejected": -553.838134765625, - "loss": 0.5441, - "rewards/accuracies": 0.7250000238418579, - "rewards/chosen": -2.105651378631592, - "rewards/margins": 0.9486795663833618, - "rewards/rejected": -3.054331064224243, + "logits/chosen": -2.02524471282959, + "logits/rejected": -1.9297832250595093, + "logps/chosen": -404.6397399902344, + "logps/rejected": -462.12591552734375, + "loss": 0.5795, + "rewards/accuracies": 0.6875, + "rewards/chosen": -1.298349142074585, + "rewards/margins": 0.6594666242599487, + "rewards/rejected": -1.9578157663345337, "step": 1640 }, { "epoch": 0.43, - "grad_norm": 9.625, + "grad_norm": 6.15625, "learning_rate": 3.503467749582857e-06, - "logits/chosen": -1.391477108001709, - "logits/rejected": -1.1985410451889038, - "logps/chosen": -453.86737060546875, - "logps/rejected": -497.47869873046875, - "loss": 0.5858, + "logits/chosen": -2.001770257949829, + "logits/rejected": -1.7856979370117188, + "logps/chosen": -405.2151794433594, + "logps/rejected": -426.4241638183594, + "loss": 0.6245, "rewards/accuracies": 0.6875, - "rewards/chosen": -2.026911973953247, - "rewards/margins": 0.7760933041572571, - "rewards/rejected": -2.8030049800872803, + "rewards/chosen": -1.3188527822494507, + "rewards/margins": 0.5851167440414429, + "rewards/rejected": -1.9039695262908936, "step": 1650 }, { "epoch": 0.43, - "grad_norm": 16.25, + "grad_norm": 9.375, "learning_rate": 3.4825025608971947e-06, - "logits/chosen": -1.3152287006378174, - "logits/rejected": -1.2422856092453003, - "logps/chosen": -403.56390380859375, - "logps/rejected": -483.42388916015625, - "loss": 0.5352, - "rewards/accuracies": 0.7250000238418579, - "rewards/chosen": -1.8168277740478516, - "rewards/margins": 0.7600839734077454, - "rewards/rejected": -2.576911449432373, + "logits/chosen": -1.9151302576065063, + "logits/rejected": -1.8214277029037476, + "logps/chosen": -355.21087646484375, + "logps/rejected": -424.50811767578125, + "loss": 0.5519, + "rewards/accuracies": 0.706250011920929, + "rewards/chosen": -1.154817819595337, + "rewards/margins": 0.6674734950065613, + "rewards/rejected": -1.8222911357879639, "step": 1660 }, { "epoch": 0.44, - "grad_norm": 7.6875, + "grad_norm": 5.25, "learning_rate": 3.4614553335304407e-06, - "logits/chosen": -1.3943700790405273, - "logits/rejected": -1.1897953748703003, - "logps/chosen": -439.11810302734375, - "logps/rejected": -506.8814392089844, - "loss": 0.4682, - "rewards/accuracies": 0.75, - "rewards/chosen": -1.7027143239974976, - "rewards/margins": 0.9949854016304016, - "rewards/rejected": -2.697699785232544, + "logits/chosen": -2.0037343502044678, + "logits/rejected": -1.7987245321273804, + "logps/chosen": -409.9883728027344, + "logps/rejected": -452.8309631347656, + "loss": 0.516, + "rewards/accuracies": 0.7437499761581421, + "rewards/chosen": -1.1703855991363525, + "rewards/margins": 0.8152831792831421, + "rewards/rejected": -1.9856687784194946, "step": 1670 }, { "epoch": 0.44, - "grad_norm": 10.625, + "grad_norm": 8.0625, "learning_rate": 3.4403278249200222e-06, - "logits/chosen": -1.3651249408721924, - "logits/rejected": -1.1605368852615356, - "logps/chosen": -466.6790466308594, - "logps/rejected": -544.6467895507812, - "loss": 0.4334, - "rewards/accuracies": 0.75, - "rewards/chosen": -1.8409086465835571, - "rewards/margins": 1.1990861892700195, - "rewards/rejected": -3.039994716644287, + "logits/chosen": -1.9947742223739624, + "logits/rejected": -1.7905107736587524, + "logps/chosen": -405.6366882324219, + "logps/rejected": -466.224609375, + "loss": 0.4613, + "rewards/accuracies": 0.737500011920929, + "rewards/chosen": -1.0154985189437866, + "rewards/margins": 1.0372416973114014, + "rewards/rejected": -2.0527403354644775, "step": 1680 }, { "epoch": 0.44, - "grad_norm": 17.125, + "grad_norm": 9.3125, "learning_rate": 3.4191217992068293e-06, - "logits/chosen": -1.4057015180587769, - "logits/rejected": -1.2282214164733887, - "logps/chosen": -506.6934509277344, - "logps/rejected": -559.0716552734375, - "loss": 0.5429, - "rewards/accuracies": 0.737500011920929, - "rewards/chosen": -2.285783290863037, - "rewards/margins": 0.9797841906547546, - "rewards/rejected": -3.2655677795410156, + "logits/chosen": -2.061328887939453, + "logits/rejected": -1.9267551898956299, + "logps/chosen": -403.9309997558594, + "logps/rejected": -432.7481384277344, + "loss": 0.5519, + "rewards/accuracies": 0.7562500238418579, + "rewards/chosen": -1.0664750337600708, + "rewards/margins": 0.7731136679649353, + "rewards/rejected": -1.8395887613296509, "step": 1690 }, { "epoch": 0.44, - "grad_norm": 11.8125, + "grad_norm": 10.1875, "learning_rate": 3.3978390270879056e-06, - "logits/chosen": -1.2874337434768677, - "logits/rejected": -1.1903162002563477, - "logps/chosen": -475.48675537109375, - "logps/rejected": -584.1217041015625, - "loss": 0.5129, - "rewards/accuracies": 0.731249988079071, - "rewards/chosen": -2.6555633544921875, - "rewards/margins": 1.0180342197418213, - "rewards/rejected": -3.6735973358154297, + "logits/chosen": -2.0079243183135986, + "logits/rejected": -1.8811041116714478, + "logps/chosen": -350.56658935546875, + "logps/rejected": -422.943603515625, + "loss": 0.5333, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -1.1983606815338135, + "rewards/margins": 0.6959515810012817, + "rewards/rejected": -1.8943122625350952, "step": 1700 }, { "epoch": 0.44, - "eval_logits/chosen": -1.2898441553115845, - "eval_logits/rejected": -1.1655206680297852, - "eval_logps/chosen": -527.8513793945312, - "eval_logps/rejected": -610.3750610351562, - "eval_loss": 0.5115381479263306, - "eval_rewards/accuracies": 0.7289999723434448, - "eval_rewards/chosen": -2.63223934173584, - "eval_rewards/margins": 1.0256068706512451, - "eval_rewards/rejected": -3.657846212387085, - "eval_runtime": 787.4965, - "eval_samples_per_second": 2.54, - "eval_steps_per_second": 0.317, + "eval_logits/chosen": -2.0731844902038574, + "eval_logits/rejected": -1.9572210311889648, + "eval_logps/chosen": -401.31475830078125, + "eval_logps/rejected": -460.55657958984375, + "eval_loss": 0.5154699087142944, + "eval_rewards/accuracies": 0.7310000061988831, + "eval_rewards/chosen": -1.1618049144744873, + "eval_rewards/margins": 0.8113114237785339, + "eval_rewards/rejected": -1.9731160402297974, + "eval_runtime": 782.2946, + "eval_samples_per_second": 2.557, + "eval_steps_per_second": 0.32, "step": 1700 }, { "epoch": 0.45, - "grad_norm": 12.25, + "grad_norm": 9.875, "learning_rate": 3.3764812856685995e-06, - "logits/chosen": -1.3596528768539429, - "logits/rejected": -1.342123031616211, - "logps/chosen": -466.9327697753906, - "logps/rejected": -575.8137817382812, - "loss": 0.5253, - "rewards/accuracies": 0.737500011920929, - "rewards/chosen": -2.4246678352355957, - "rewards/margins": 0.9531208276748657, - "rewards/rejected": -3.37778902053833, + "logits/chosen": -2.0123562812805176, + "logits/rejected": -1.9913345575332642, + "logps/chosen": -348.2825012207031, + "logps/rejected": -437.12481689453125, + "loss": 0.5252, + "rewards/accuracies": 0.731249988079071, + "rewards/chosen": -1.0764367580413818, + "rewards/margins": 0.7534947395324707, + "rewards/rejected": -1.8299314975738525, "step": 1710 }, { "epoch": 0.45, - "grad_norm": 8.875, + "grad_norm": 8.3125, "learning_rate": 3.3550503583141726e-06, - "logits/chosen": -1.4853286743164062, - "logits/rejected": -1.3501067161560059, - "logps/chosen": -495.5145568847656, - "logps/rejected": -587.0310668945312, - "loss": 0.4881, - "rewards/accuracies": 0.75, - "rewards/chosen": -2.2669217586517334, - "rewards/margins": 1.0670498609542847, - "rewards/rejected": -3.3339715003967285, + "logits/chosen": -2.1099114418029785, + "logits/rejected": -1.9925434589385986, + "logps/chosen": -392.92840576171875, + "logps/rejected": -464.06884765625, + "loss": 0.4725, + "rewards/accuracies": 0.7562500238418579, + "rewards/chosen": -1.0577646493911743, + "rewards/margins": 0.9005683660507202, + "rewards/rejected": -1.9583332538604736, "step": 1720 }, { "epoch": 0.45, - "grad_norm": 7.375, + "grad_norm": 7.0625, "learning_rate": 3.3335480345008907e-06, - "logits/chosen": -1.389654517173767, - "logits/rejected": -1.2867991924285889, - "logps/chosen": -441.3914489746094, - "logps/rejected": -524.025146484375, - "loss": 0.4572, - "rewards/accuracies": 0.8125, - "rewards/chosen": -1.8125606775283813, - "rewards/margins": 1.0735079050064087, - "rewards/rejected": -2.886068344116211, + "logits/chosen": -1.9768962860107422, + "logits/rejected": -1.850328803062439, + "logps/chosen": -407.8126525878906, + "logps/rejected": -476.3419494628906, + "loss": 0.5127, + "rewards/accuracies": 0.762499988079071, + "rewards/chosen": -1.2192199230194092, + "rewards/margins": 0.9083667993545532, + "rewards/rejected": -2.127586841583252, "step": 1730 }, { "epoch": 0.46, - "grad_norm": 9.375, + "grad_norm": 8.125, "learning_rate": 3.3119761096666055e-06, - "logits/chosen": -1.4131015539169312, - "logits/rejected": -1.2663146257400513, - "logps/chosen": -475.7037048339844, - "logps/rejected": -523.84521484375, - "loss": 0.5636, - "rewards/accuracies": 0.6937500238418579, - "rewards/chosen": -2.007295846939087, - "rewards/margins": 0.7847532033920288, - "rewards/rejected": -2.792048931121826, + "logits/chosen": -1.985120415687561, + "logits/rejected": -1.8539278507232666, + "logps/chosen": -421.05126953125, + "logps/rejected": -467.579833984375, + "loss": 0.5525, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -1.2381929159164429, + "rewards/margins": 0.7920646667480469, + "rewards/rejected": -2.0302577018737793, "step": 1740 }, { "epoch": 0.46, - "grad_norm": 8.0625, + "grad_norm": 6.59375, "learning_rate": 3.290336385060832e-06, - "logits/chosen": -1.567063570022583, - "logits/rejected": -1.3609685897827148, - "logps/chosen": -461.98870849609375, - "logps/rejected": -528.9849243164062, - "loss": 0.5434, - "rewards/accuracies": 0.71875, - "rewards/chosen": -2.131678342819214, - "rewards/margins": 0.8711919784545898, - "rewards/rejected": -3.002870559692383, + "logits/chosen": -2.143947124481201, + "logits/rejected": -1.8942737579345703, + "logps/chosen": -413.96551513671875, + "logps/rejected": -477.3929748535156, + "loss": 0.5114, + "rewards/accuracies": 0.7124999761581421, + "rewards/chosen": -1.4759609699249268, + "rewards/margins": 0.8571161031723022, + "rewards/rejected": -2.3330769538879395, "step": 1750 }, { "epoch": 0.46, - "grad_norm": 10.9375, + "grad_norm": 8.125, "learning_rate": 3.268630667594348e-06, - "logits/chosen": -1.408299446105957, - "logits/rejected": -1.3696682453155518, - "logps/chosen": -449.277099609375, - "logps/rejected": -518.7066650390625, - "loss": 0.5036, - "rewards/accuracies": 0.699999988079071, - "rewards/chosen": -1.9362947940826416, - "rewards/margins": 0.9368389248847961, - "rewards/rejected": -2.873133659362793, + "logits/chosen": -2.0270843505859375, + "logits/rejected": -1.9718875885009766, + "logps/chosen": -408.5159912109375, + "logps/rejected": -458.33013916015625, + "loss": 0.5462, + "rewards/accuracies": 0.706250011920929, + "rewards/chosen": -1.3257557153701782, + "rewards/margins": 0.7757492065429688, + "rewards/rejected": -2.1015048027038574, "step": 1760 }, { "epoch": 0.46, - "grad_norm": 13.875, + "grad_norm": 6.5, "learning_rate": 3.2468607696883147e-06, - "logits/chosen": -1.4419914484024048, - "logits/rejected": -1.3911958932876587, - "logps/chosen": -453.1897888183594, - "logps/rejected": -551.0692138671875, - "loss": 0.4916, - "rewards/accuracies": 0.7250000238418579, - "rewards/chosen": -2.000717878341675, - "rewards/margins": 0.9351266026496887, - "rewards/rejected": -2.935844659805298, + "logits/chosen": -1.9911127090454102, + "logits/rejected": -1.94431471824646, + "logps/chosen": -397.14208984375, + "logps/rejected": -485.87237548828125, + "loss": 0.5191, + "rewards/accuracies": 0.737500011920929, + "rewards/chosen": -1.2622430324554443, + "rewards/margins": 0.8354751467704773, + "rewards/rejected": -2.0977180004119873, "step": 1770 }, { "epoch": 0.47, - "grad_norm": 7.4375, + "grad_norm": 6.78125, "learning_rate": 3.225028509122944e-06, - "logits/chosen": -1.50209641456604, - "logits/rejected": -1.3786523342132568, - "logps/chosen": -422.64263916015625, - "logps/rejected": -490.89422607421875, - "loss": 0.5349, - "rewards/accuracies": 0.737500011920929, - "rewards/chosen": -1.8613319396972656, - "rewards/margins": 0.7958077192306519, - "rewards/rejected": -2.657139539718628, + "logits/chosen": -2.0844154357910156, + "logits/rejected": -1.9376178979873657, + "logps/chosen": -355.7822265625, + "logps/rejected": -413.16748046875, + "loss": 0.5498, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -1.041587233543396, + "rewards/margins": 0.6852777004241943, + "rewards/rejected": -1.7268650531768799, "step": 1780 }, { "epoch": 0.47, - "grad_norm": 12.6875, + "grad_norm": 5.90625, "learning_rate": 3.2031357088857083e-06, - "logits/chosen": -1.4481326341629028, - "logits/rejected": -1.376110315322876, - "logps/chosen": -487.8973083496094, - "logps/rejected": -578.3382568359375, - "loss": 0.5044, - "rewards/accuracies": 0.762499988079071, - "rewards/chosen": -2.1271708011627197, - "rewards/margins": 0.9390125274658203, - "rewards/rejected": -3.066183090209961, + "logits/chosen": -2.071098804473877, + "logits/rejected": -2.020618200302124, + "logps/chosen": -409.9624938964844, + "logps/rejected": -493.58758544921875, + "loss": 0.5321, + "rewards/accuracies": 0.7562500238418579, + "rewards/chosen": -1.149269938468933, + "rewards/margins": 0.780631959438324, + "rewards/rejected": -1.9299018383026123, "step": 1790 }, { "epoch": 0.47, - "grad_norm": 15.5625, + "grad_norm": 8.4375, "learning_rate": 3.181184197019127e-06, - "logits/chosen": -1.1928216218948364, - "logits/rejected": -1.0818660259246826, - "logps/chosen": -475.0074768066406, - "logps/rejected": -645.8020629882812, - "loss": 0.464, + "logits/chosen": -1.9076446294784546, + "logits/rejected": -1.7894035577774048, + "logps/chosen": -362.01275634765625, + "logps/rejected": -494.119384765625, + "loss": 0.5055, "rewards/accuracies": 0.75, - "rewards/chosen": -2.448078155517578, - "rewards/margins": 1.4459528923034668, - "rewards/rejected": -3.894031524658203, + "rewards/chosen": -1.1391741037368774, + "rewards/margins": 1.0371575355529785, + "rewards/rejected": -2.1763315200805664, "step": 1800 }, { "epoch": 0.47, - "eval_logits/chosen": -1.2262241840362549, - "eval_logits/rejected": -1.1092225313186646, - "eval_logps/chosen": -519.20654296875, - "eval_logps/rejected": -611.1868286132812, - "eval_loss": 0.5067161321640015, - "eval_rewards/accuracies": 0.7360000014305115, - "eval_rewards/chosen": -2.545790672302246, - "eval_rewards/margins": 1.1201735734939575, - "eval_rewards/rejected": -3.665964365005493, - "eval_runtime": 786.3131, - "eval_samples_per_second": 2.544, - "eval_steps_per_second": 0.318, + "eval_logits/chosen": -2.0727155208587646, + "eval_logits/rejected": -1.9571987390518188, + "eval_logps/chosen": -396.1869812011719, + "eval_logps/rejected": -452.9256896972656, + "eval_loss": 0.5181357860565186, + "eval_rewards/accuracies": 0.7329999804496765, + "eval_rewards/chosen": -1.1105273962020874, + "eval_rewards/margins": 0.7862799763679504, + "eval_rewards/rejected": -1.8968074321746826, + "eval_runtime": 783.5553, + "eval_samples_per_second": 2.552, + "eval_steps_per_second": 0.319, "step": 1800 }, { "epoch": 0.47, - "grad_norm": 27.75, + "grad_norm": 8.8125, "learning_rate": 3.159175806468126e-06, - "logits/chosen": -1.1510545015335083, - "logits/rejected": -0.9545754194259644, - "logps/chosen": -524.1520385742188, - "logps/rejected": -614.8507690429688, - "loss": 0.4959, - "rewards/accuracies": 0.7562500238418579, - "rewards/chosen": -2.7552742958068848, - "rewards/margins": 1.2089765071868896, - "rewards/rejected": -3.9642505645751953, + "logits/chosen": -1.9769659042358398, + "logits/rejected": -1.7270854711532593, + "logps/chosen": -385.99139404296875, + "logps/rejected": -418.35595703125, + "loss": 0.5416, + "rewards/accuracies": 0.737500011920929, + "rewards/chosen": -1.117026448249817, + "rewards/margins": 0.7229973077774048, + "rewards/rejected": -1.8400236368179321, "step": 1810 }, { "epoch": 0.48, - "grad_norm": 17.25, + "grad_norm": 9.625, "learning_rate": 3.1371123749269804e-06, - "logits/chosen": -1.2211850881576538, - "logits/rejected": -1.1514862775802612, - "logps/chosen": -588.10400390625, - "logps/rejected": -663.5833129882812, - "loss": 0.5824, - "rewards/accuracies": 0.7124999761581421, - "rewards/chosen": -3.0478672981262207, - "rewards/margins": 0.9145796895027161, - "rewards/rejected": -3.96244740486145, + "logits/chosen": -2.0032527446746826, + "logits/rejected": -1.9413458108901978, + "logps/chosen": -422.4027404785156, + "logps/rejected": -460.8421325683594, + "loss": 0.6067, + "rewards/accuracies": 0.6937500238418579, + "rewards/chosen": -1.2173041105270386, + "rewards/margins": 0.5386160612106323, + "rewards/rejected": -1.75592041015625, "step": 1820 }, { "epoch": 0.48, - "grad_norm": 12.25, + "grad_norm": 7.21875, "learning_rate": 3.114995744685877e-06, - "logits/chosen": -1.1766655445098877, - "logits/rejected": -1.1476547718048096, - "logps/chosen": -536.2582397460938, - "logps/rejected": -619.7481079101562, - "loss": 0.5361, - "rewards/accuracies": 0.731249988079071, - "rewards/chosen": -2.8898520469665527, - "rewards/margins": 0.9645761251449585, - "rewards/rejected": -3.8544278144836426, + "logits/chosen": -1.9304783344268799, + "logits/rejected": -1.9257535934448242, + "logps/chosen": -375.6784973144531, + "logps/rejected": -426.36822509765625, + "loss": 0.5476, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.0842137336730957, + "rewards/margins": 0.660191535949707, + "rewards/rejected": -1.7444051504135132, "step": 1830 }, { "epoch": 0.48, - "grad_norm": 5.46875, + "grad_norm": 3.5, "learning_rate": 3.0928277624770743e-06, - "logits/chosen": -1.3936359882354736, - "logits/rejected": -1.2410995960235596, - "logps/chosen": -532.3043212890625, - "logps/rejected": -620.3671264648438, - "loss": 0.5013, - "rewards/accuracies": 0.800000011920929, - "rewards/chosen": -2.4759044647216797, - "rewards/margins": 1.1648061275482178, - "rewards/rejected": -3.6407108306884766, + "logits/chosen": -2.052551507949829, + "logits/rejected": -1.9413669109344482, + "logps/chosen": -400.77728271484375, + "logps/rejected": -459.59979248046875, + "loss": 0.4952, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": -0.9567273855209351, + "rewards/margins": 0.8860765695571899, + "rewards/rejected": -1.842804193496704, "step": 1840 }, { "epoch": 0.48, - "grad_norm": 6.46875, + "grad_norm": 4.71875, "learning_rate": 3.070610279320708e-06, - "logits/chosen": -1.4516522884368896, - "logits/rejected": -1.2788631916046143, - "logps/chosen": -496.40087890625, - "logps/rejected": -581.0626220703125, - "loss": 0.4663, - "rewards/accuracies": 0.7749999761581421, - "rewards/chosen": -2.094099521636963, - "rewards/margins": 1.0866529941558838, - "rewards/rejected": -3.1807522773742676, + "logits/chosen": -2.090240478515625, + "logits/rejected": -1.8750137090682983, + "logps/chosen": -401.25360107421875, + "logps/rejected": -459.5379943847656, + "loss": 0.498, + "rewards/accuracies": 0.737500011920929, + "rewards/chosen": -0.9490548372268677, + "rewards/margins": 0.8133231401443481, + "rewards/rejected": -1.7623779773712158, "step": 1850 }, { "epoch": 0.49, - "grad_norm": 6.90625, + "grad_norm": 4.0625, "learning_rate": 3.0483451503702264e-06, - "logits/chosen": -1.3937948942184448, - "logits/rejected": -1.322385549545288, - "logps/chosen": -499.5318298339844, - "logps/rejected": -563.4232177734375, - "loss": 0.5662, - "rewards/accuracies": 0.6875, - "rewards/chosen": -2.1348235607147217, - "rewards/margins": 0.8390694856643677, - "rewards/rejected": -2.9738926887512207, + "logits/chosen": -1.9803142547607422, + "logits/rejected": -1.9110126495361328, + "logps/chosen": -415.2545471191406, + "logps/rejected": -474.24017333984375, + "loss": 0.564, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -1.1085463762283325, + "rewards/margins": 0.7884865999221802, + "rewards/rejected": -1.8970329761505127, "step": 1860 }, { "epoch": 0.49, - "grad_norm": 8.25, + "grad_norm": 7.28125, "learning_rate": 3.0260342347574916e-06, - "logits/chosen": -1.3950804471969604, - "logits/rejected": -1.2477340698242188, - "logps/chosen": -465.05157470703125, - "logps/rejected": -563.8284912109375, - "loss": 0.4475, - "rewards/accuracies": 0.831250011920929, - "rewards/chosen": -1.9272401332855225, - "rewards/margins": 1.178283929824829, - "rewards/rejected": -3.1055240631103516, + "logits/chosen": -2.0407090187072754, + "logits/rejected": -1.8666067123413086, + "logps/chosen": -393.3802795410156, + "logps/rejected": -472.4957580566406, + "loss": 0.4705, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -1.0278112888336182, + "rewards/margins": 0.9973715543746948, + "rewards/rejected": -2.0251829624176025, "step": 1870 }, { "epoch": 0.49, - "grad_norm": 14.75, + "grad_norm": 6.75, "learning_rate": 3.0036793954375358e-06, - "logits/chosen": -1.3638473749160767, - "logits/rejected": -1.209535002708435, - "logps/chosen": -496.8861389160156, - "logps/rejected": -571.2623901367188, - "loss": 0.4678, + "logits/chosen": -2.039440155029297, + "logits/rejected": -1.8485755920410156, + "logps/chosen": -410.55291748046875, + "logps/rejected": -466.46051025390625, + "loss": 0.4697, "rewards/accuracies": 0.78125, - "rewards/chosen": -2.2681288719177246, - "rewards/margins": 1.1660722494125366, - "rewards/rejected": -3.4342010021209717, + "rewards/chosen": -1.2217860221862793, + "rewards/margins": 0.9957095384597778, + "rewards/rejected": -2.2174954414367676, "step": 1880 }, { "epoch": 0.49, - "grad_norm": 11.5, + "grad_norm": 9.1875, "learning_rate": 2.981282499033009e-06, - "logits/chosen": -1.346684217453003, - "logits/rejected": -1.2146567106246948, - "logps/chosen": -523.80078125, - "logps/rejected": -608.2716064453125, - "loss": 0.5041, - "rewards/accuracies": 0.731249988079071, - "rewards/chosen": -2.475346803665161, - "rewards/margins": 1.0990780591964722, - "rewards/rejected": -3.5744247436523438, + "logits/chosen": -1.9933445453643799, + "logits/rejected": -1.8086822032928467, + "logps/chosen": -437.57635498046875, + "logps/rejected": -503.1834411621094, + "loss": 0.5375, + "rewards/accuracies": 0.7124999761581421, + "rewards/chosen": -1.4291341304779053, + "rewards/margins": 0.9259105920791626, + "rewards/rejected": -2.3550448417663574, "step": 1890 }, { "epoch": 0.5, - "grad_norm": 10.0625, + "grad_norm": 9.8125, "learning_rate": 2.9588454156783163e-06, - "logits/chosen": -1.3662660121917725, - "logits/rejected": -1.198290228843689, - "logps/chosen": -516.8792114257812, - "logps/rejected": -632.828125, - "loss": 0.4435, + "logits/chosen": -1.9322729110717773, + "logits/rejected": -1.7422888278961182, + "logps/chosen": -434.81884765625, + "logps/rejected": -520.74267578125, + "loss": 0.4687, "rewards/accuracies": 0.78125, - "rewards/chosen": -2.3553900718688965, - "rewards/margins": 1.3771806955337524, - "rewards/rejected": -3.7325706481933594, + "rewards/chosen": -1.3640549182891846, + "rewards/margins": 1.0799517631530762, + "rewards/rejected": -2.44400691986084, "step": 1900 }, { "epoch": 0.5, - "eval_logits/chosen": -1.29695463180542, - "eval_logits/rejected": -1.1772844791412354, - "eval_logps/chosen": -506.60626220703125, - "eval_logps/rejected": -595.5960693359375, - "eval_loss": 0.5027948617935181, - "eval_rewards/accuracies": 0.7294999957084656, - "eval_rewards/chosen": -2.419787883758545, - "eval_rewards/margins": 1.0902690887451172, - "eval_rewards/rejected": -3.510056734085083, - "eval_runtime": 784.8032, - "eval_samples_per_second": 2.548, + "eval_logits/chosen": -1.967826247215271, + "eval_logits/rejected": -1.8518778085708618, + "eval_logps/chosen": -425.9162902832031, + "eval_logps/rejected": -493.8867492675781, + "eval_loss": 0.5197983980178833, + "eval_rewards/accuracies": 0.7289999723434448, + "eval_rewards/chosen": -1.4078201055526733, + "eval_rewards/margins": 0.8985980153083801, + "eval_rewards/rejected": -2.3064181804656982, + "eval_runtime": 784.0155, + "eval_samples_per_second": 2.551, "eval_steps_per_second": 0.319, "step": 1900 }, { "epoch": 0.5, - "grad_norm": 12.1875, + "grad_norm": 9.9375, "learning_rate": 2.9363700188634597e-06, - "logits/chosen": -1.3673919439315796, - "logits/rejected": -1.2327473163604736, - "logps/chosen": -520.3253784179688, - "logps/rejected": -581.2381591796875, - "loss": 0.505, - "rewards/accuracies": 0.75, - "rewards/chosen": -2.5530762672424316, - "rewards/margins": 0.9795705080032349, - "rewards/rejected": -3.5326473712921143, + "logits/chosen": -1.9670441150665283, + "logits/rejected": -1.802392601966858, + "logps/chosen": -425.4117126464844, + "logps/rejected": -467.03948974609375, + "loss": 0.5114, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": -1.430497407913208, + "rewards/margins": 0.8090961575508118, + "rewards/rejected": -2.239593505859375, "step": 1910 }, { "epoch": 0.5, - "grad_norm": 10.8125, + "grad_norm": 7.78125, "learning_rate": 2.9138581852776053e-06, - "logits/chosen": -1.3470666408538818, - "logits/rejected": -1.2466224431991577, - "logps/chosen": -521.1154174804688, - "logps/rejected": -616.56982421875, - "loss": 0.5077, - "rewards/accuracies": 0.762499988079071, - "rewards/chosen": -2.6147425174713135, - "rewards/margins": 1.084177851676941, - "rewards/rejected": -3.698920726776123, + "logits/chosen": -1.9015588760375977, + "logits/rejected": -1.817588210105896, + "logps/chosen": -412.984619140625, + "logps/rejected": -484.392333984375, + "loss": 0.5182, + "rewards/accuracies": 0.737500011920929, + "rewards/chosen": -1.3329174518585205, + "rewards/margins": 0.8674777150154114, + "rewards/rejected": -2.200395107269287, "step": 1920 }, { "epoch": 0.51, - "grad_norm": 7.875, + "grad_norm": 9.9375, "learning_rate": 2.8913117946523805e-06, - "logits/chosen": -1.383959412574768, - "logits/rejected": -1.200412392616272, - "logps/chosen": -533.2996826171875, - "logps/rejected": -598.9548950195312, - "loss": 0.4722, - "rewards/accuracies": 0.78125, - "rewards/chosen": -2.669104814529419, - "rewards/margins": 1.0376745462417603, - "rewards/rejected": -3.7067794799804688, + "logits/chosen": -1.8993953466415405, + "logits/rejected": -1.7067630290985107, + "logps/chosen": -418.3155212402344, + "logps/rejected": -466.83038330078125, + "loss": 0.4876, + "rewards/accuracies": 0.7562500238418579, + "rewards/chosen": -1.2759909629821777, + "rewards/margins": 0.8963203430175781, + "rewards/rejected": -2.172311305999756, "step": 1930 }, { "epoch": 0.51, - "grad_norm": 10.9375, + "grad_norm": 7.625, "learning_rate": 2.8687327296049126e-06, - "logits/chosen": -1.366599440574646, - "logits/rejected": -1.2617782354354858, - "logps/chosen": -517.69091796875, - "logps/rejected": -610.8246459960938, - "loss": 0.5236, - "rewards/accuracies": 0.706250011920929, - "rewards/chosen": -2.5863590240478516, - "rewards/margins": 1.028294324874878, - "rewards/rejected": -3.6146531105041504, + "logits/chosen": -1.8995901346206665, + "logits/rejected": -1.7730462551116943, + "logps/chosen": -393.28955078125, + "logps/rejected": -478.4151306152344, + "loss": 0.5351, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.1747307777404785, + "rewards/margins": 0.8694013357162476, + "rewards/rejected": -2.0441319942474365, "step": 1940 }, { "epoch": 0.51, - "grad_norm": 13.5, + "grad_norm": 6.3125, "learning_rate": 2.8461228754806376e-06, - "logits/chosen": -1.4240081310272217, - "logits/rejected": -1.258113145828247, - "logps/chosen": -523.6707153320312, - "logps/rejected": -583.9991455078125, - "loss": 0.5317, - "rewards/accuracies": 0.706250011920929, - "rewards/chosen": -2.479217052459717, - "rewards/margins": 0.8611544370651245, - "rewards/rejected": -3.3403713703155518, + "logits/chosen": -1.9606692790985107, + "logits/rejected": -1.8029773235321045, + "logps/chosen": -410.99462890625, + "logps/rejected": -456.9501953125, + "loss": 0.5492, + "rewards/accuracies": 0.737500011920929, + "rewards/chosen": -1.1734405755996704, + "rewards/margins": 0.7420889139175415, + "rewards/rejected": -1.9155292510986328, "step": 1950 }, { "epoch": 0.51, - "grad_norm": 10.6875, + "grad_norm": 4.96875, "learning_rate": 2.823484120195865e-06, - "logits/chosen": -1.523560643196106, - "logits/rejected": -1.30274498462677, - "logps/chosen": -500.1468811035156, - "logps/rejected": -573.5841064453125, - "loss": 0.4433, - "rewards/accuracies": 0.8062499761581421, - "rewards/chosen": -2.2237653732299805, - "rewards/margins": 1.078161597251892, - "rewards/rejected": -3.301927089691162, + "logits/chosen": -2.014724016189575, + "logits/rejected": -1.7219772338867188, + "logps/chosen": -407.7786560058594, + "logps/rejected": -461.8780212402344, + "loss": 0.4589, + "rewards/accuracies": 0.768750011920929, + "rewards/chosen": -1.0491843223571777, + "rewards/margins": 0.9459662437438965, + "rewards/rejected": -1.9951505661010742, "step": 1960 }, { "epoch": 0.52, - "grad_norm": 11.375, + "grad_norm": 7.375, "learning_rate": 2.8008183540801486e-06, - "logits/chosen": -1.3897085189819336, - "logits/rejected": -1.2395132780075073, - "logps/chosen": -508.63427734375, - "logps/rejected": -554.4570922851562, - "loss": 0.5131, - "rewards/accuracies": 0.731249988079071, - "rewards/chosen": -2.3504276275634766, - "rewards/margins": 0.9118019342422485, - "rewards/rejected": -3.2622294425964355, + "logits/chosen": -1.9526262283325195, + "logits/rejected": -1.7973562479019165, + "logps/chosen": -413.77386474609375, + "logps/rejected": -439.8606872558594, + "loss": 0.5228, + "rewards/accuracies": 0.71875, + "rewards/chosen": -1.2191674709320068, + "rewards/margins": 0.7341850399971008, + "rewards/rejected": -1.9533525705337524, "step": 1970 }, { "epoch": 0.52, - "grad_norm": 12.0, + "grad_norm": 7.0, "learning_rate": 2.7781274697184353e-06, - "logits/chosen": -1.2152577638626099, - "logits/rejected": -1.2678476572036743, - "logps/chosen": -492.1659240722656, - "logps/rejected": -622.5379638671875, - "loss": 0.5216, - "rewards/accuracies": 0.71875, - "rewards/chosen": -2.6284852027893066, - "rewards/margins": 1.1008700132369995, - "rewards/rejected": -3.7293553352355957, + "logits/chosen": -1.732486367225647, + "logits/rejected": -1.8619239330291748, + "logps/chosen": -389.2760314941406, + "logps/rejected": -493.4657287597656, + "loss": 0.5323, + "rewards/accuracies": 0.731249988079071, + "rewards/chosen": -1.3979893922805786, + "rewards/margins": 0.8640559315681458, + "rewards/rejected": -2.262045383453369, "step": 1980 }, { "epoch": 0.52, - "grad_norm": 10.0, + "grad_norm": 5.5, "learning_rate": 2.7554133617930397e-06, - "logits/chosen": -1.2987666130065918, - "logits/rejected": -1.1745851039886475, - "logps/chosen": -531.090087890625, - "logps/rejected": -626.9152221679688, - "loss": 0.5033, - "rewards/accuracies": 0.7124999761581421, - "rewards/chosen": -2.8330931663513184, - "rewards/margins": 1.108681082725525, - "rewards/rejected": -3.941774368286133, + "logits/chosen": -1.817797064781189, + "logits/rejected": -1.7195402383804321, + "logps/chosen": -402.89129638671875, + "logps/rejected": -470.81756591796875, + "loss": 0.519, + "rewards/accuracies": 0.731249988079071, + "rewards/chosen": -1.3425031900405884, + "rewards/margins": 0.8210526704788208, + "rewards/rejected": -2.163555860519409, "step": 1990 }, { "epoch": 0.52, - "grad_norm": 11.75, + "grad_norm": 7.71875, "learning_rate": 2.7326779269254363e-06, - "logits/chosen": -1.464644432067871, - "logits/rejected": -1.2891961336135864, - "logps/chosen": -578.9766845703125, - "logps/rejected": -634.8112182617188, - "loss": 0.4722, - "rewards/accuracies": 0.78125, - "rewards/chosen": -2.89630126953125, - "rewards/margins": 1.1180540323257446, - "rewards/rejected": -4.014355659484863, + "logits/chosen": -1.9797512292861938, + "logits/rejected": -1.7942073345184326, + "logps/chosen": -438.1368103027344, + "logps/rejected": -471.66876220703125, + "loss": 0.4936, + "rewards/accuracies": 0.7437499761581421, + "rewards/chosen": -1.2935562133789062, + "rewards/margins": 0.9384673833847046, + "rewards/rejected": -2.2320237159729004, "step": 2000 }, { "epoch": 0.52, - "eval_logits/chosen": -1.2849688529968262, - "eval_logits/rejected": -1.1620625257492065, - "eval_logps/chosen": -550.964599609375, - "eval_logps/rejected": -635.0359497070312, - "eval_loss": 0.5023801326751709, - "eval_rewards/accuracies": 0.7369999885559082, - "eval_rewards/chosen": -2.8633713722229004, - "eval_rewards/margins": 1.0410833358764648, - "eval_rewards/rejected": -3.9044554233551025, - "eval_runtime": 784.8313, - "eval_samples_per_second": 2.548, + "eval_logits/chosen": -1.9507912397384644, + "eval_logits/rejected": -1.8371071815490723, + "eval_logps/chosen": -426.1055603027344, + "eval_logps/rejected": -488.6001281738281, + "eval_loss": 0.5123463273048401, + "eval_rewards/accuracies": 0.7289999723434448, + "eval_rewards/chosen": -1.4097131490707397, + "eval_rewards/margins": 0.8438388109207153, + "eval_rewards/rejected": -2.253551959991455, + "eval_runtime": 783.3098, + "eval_samples_per_second": 2.553, "eval_steps_per_second": 0.319, "step": 2000 }, { "epoch": 0.53, - "grad_norm": 8.875, + "grad_norm": 7.625, "learning_rate": 2.7099230635178954e-06, - "logits/chosen": -1.314143180847168, - "logits/rejected": -1.2714715003967285, - "logps/chosen": -547.5592041015625, - "logps/rejected": -636.8946533203125, - "loss": 0.5139, - "rewards/accuracies": 0.762499988079071, - "rewards/chosen": -2.8272881507873535, - "rewards/margins": 0.9581931829452515, - "rewards/rejected": -3.7854812145233154, + "logits/chosen": -1.8386337757110596, + "logits/rejected": -1.808058738708496, + "logps/chosen": -423.898681640625, + "logps/rejected": -500.33203125, + "loss": 0.5187, + "rewards/accuracies": 0.7437499761581421, + "rewards/chosen": -1.4234874248504639, + "rewards/margins": 0.8072878122329712, + "rewards/rejected": -2.2307751178741455, "step": 2010 }, { "epoch": 0.53, - "grad_norm": 12.4375, + "grad_norm": 10.0, "learning_rate": 2.6871506715949608e-06, - "logits/chosen": -1.432326316833496, - "logits/rejected": -1.3116884231567383, - "logps/chosen": -509.01702880859375, - "logps/rejected": -595.810546875, - "loss": 0.4683, - "rewards/accuracies": 0.7562500238418579, - "rewards/chosen": -2.5611319541931152, - "rewards/margins": 1.0483646392822266, - "rewards/rejected": -3.6094970703125, + "logits/chosen": -1.862138032913208, + "logits/rejected": -1.816674828529358, + "logps/chosen": -406.03057861328125, + "logps/rejected": -474.5035095214844, + "loss": 0.4734, + "rewards/accuracies": 0.731249988079071, + "rewards/chosen": -1.3433624505996704, + "rewards/margins": 0.9091267585754395, + "rewards/rejected": -2.2524895668029785, "step": 2020 }, { "epoch": 0.53, - "grad_norm": 12.0, + "grad_norm": 8.125, "learning_rate": 2.6643626526448063e-06, - "logits/chosen": -1.5081617832183838, - "logits/rejected": -1.3427668809890747, - "logps/chosen": -560.9156494140625, - "logps/rejected": -640.62646484375, - "loss": 0.4616, - "rewards/accuracies": 0.7749999761581421, - "rewards/chosen": -2.660341501235962, - "rewards/margins": 1.2213226556777954, - "rewards/rejected": -3.881664276123047, + "logits/chosen": -1.974774718284607, + "logits/rejected": -1.8196189403533936, + "logps/chosen": -454.701904296875, + "logps/rejected": -509.44073486328125, + "loss": 0.4647, + "rewards/accuracies": 0.7875000238418579, + "rewards/chosen": -1.3952927589416504, + "rewards/margins": 1.014176368713379, + "rewards/rejected": -2.40946888923645, "step": 2030 }, { "epoch": 0.53, - "grad_norm": 11.0, + "grad_norm": 15.125, "learning_rate": 2.6415609094604562e-06, - "logits/chosen": -1.359722375869751, - "logits/rejected": -1.303741216659546, - "logps/chosen": -530.6090698242188, - "logps/rejected": -622.9503784179688, - "loss": 0.4628, - "rewards/accuracies": 0.7250000238418579, - "rewards/chosen": -2.600862979888916, - "rewards/margins": 1.1137781143188477, - "rewards/rejected": -3.7146408557891846, + "logits/chosen": -1.8369518518447876, + "logits/rejected": -1.7782014608383179, + "logps/chosen": -436.97723388671875, + "logps/rejected": -511.42657470703125, + "loss": 0.4861, + "rewards/accuracies": 0.731249988079071, + "rewards/chosen": -1.4681875705718994, + "rewards/margins": 0.9622129201889038, + "rewards/rejected": -2.4304006099700928, "step": 2040 }, { "epoch": 0.54, - "grad_norm": 17.25, + "grad_norm": 8.125, "learning_rate": 2.618747345980904e-06, - "logits/chosen": -1.3780386447906494, - "logits/rejected": -1.1855640411376953, - "logps/chosen": -539.1251220703125, - "logps/rejected": -589.020751953125, - "loss": 0.5482, - "rewards/accuracies": 0.699999988079071, - "rewards/chosen": -2.908057689666748, - "rewards/margins": 1.0036025047302246, - "rewards/rejected": -3.9116599559783936, + "logits/chosen": -1.8477888107299805, + "logits/rejected": -1.659761667251587, + "logps/chosen": -462.1060485839844, + "logps/rejected": -497.0399475097656, + "loss": 0.5581, + "rewards/accuracies": 0.731249988079071, + "rewards/chosen": -1.9495713710784912, + "rewards/margins": 0.898357093334198, + "rewards/rejected": -2.847928524017334, "step": 2050 }, { "epoch": 0.54, - "grad_norm": 6.34375, + "grad_norm": 7.34375, "learning_rate": 2.595923867132136e-06, - "logits/chosen": -1.4118659496307373, - "logits/rejected": -1.2859770059585571, - "logps/chosen": -557.1971435546875, - "logps/rejected": -651.4580078125, - "loss": 0.4842, - "rewards/accuracies": 0.793749988079071, - "rewards/chosen": -2.762861728668213, - "rewards/margins": 1.1891956329345703, - "rewards/rejected": -3.952057361602783, + "logits/chosen": -1.8910592794418335, + "logits/rejected": -1.7664788961410522, + "logps/chosen": -513.361328125, + "logps/rejected": -582.758544921875, + "loss": 0.5251, + "rewards/accuracies": 0.762499988079071, + "rewards/chosen": -2.1317970752716064, + "rewards/margins": 0.9448413848876953, + "rewards/rejected": -3.0766384601593018, "step": 2060 }, { "epoch": 0.54, - "grad_norm": 8.0, + "grad_norm": 6.25, "learning_rate": 2.5730923786680672e-06, - "logits/chosen": -1.3455147743225098, - "logits/rejected": -1.3025658130645752, - "logps/chosen": -508.14849853515625, - "logps/rejected": -603.1004638671875, - "loss": 0.5323, - "rewards/accuracies": 0.699999988079071, - "rewards/chosen": -2.5665009021759033, - "rewards/margins": 0.8730229139328003, - "rewards/rejected": -3.439523696899414, + "logits/chosen": -1.8651106357574463, + "logits/rejected": -1.7601064443588257, + "logps/chosen": -450.99859619140625, + "logps/rejected": -541.1217651367188, + "loss": 0.5428, + "rewards/accuracies": 0.643750011920929, + "rewards/chosen": -1.829119086265564, + "rewards/margins": 0.8268195986747742, + "rewards/rejected": -2.6559386253356934, "step": 2070 }, { "epoch": 0.54, - "grad_norm": 7.875, + "grad_norm": 6.34375, "learning_rate": 2.5502547870114137e-06, - "logits/chosen": -1.4105392694473267, - "logits/rejected": -1.2930370569229126, - "logps/chosen": -493.2674865722656, - "logps/rejected": -561.6505126953125, - "loss": 0.5025, - "rewards/accuracies": 0.762499988079071, - "rewards/chosen": -2.378877878189087, - "rewards/margins": 0.9867528676986694, - "rewards/rejected": -3.365630626678467, + "logits/chosen": -1.845892310142517, + "logits/rejected": -1.7993892431259155, + "logps/chosen": -434.0924377441406, + "logps/rejected": -487.72955322265625, + "loss": 0.5445, + "rewards/accuracies": 0.737500011920929, + "rewards/chosen": -1.6064924001693726, + "rewards/margins": 0.8616697192192078, + "rewards/rejected": -2.4681620597839355, "step": 2080 }, { "epoch": 0.55, - "grad_norm": 10.5, + "grad_norm": 7.3125, "learning_rate": 2.527412999094507e-06, - "logits/chosen": -1.4027700424194336, - "logits/rejected": -1.2287507057189941, - "logps/chosen": -542.0404663085938, - "logps/rejected": -644.4595947265625, - "loss": 0.4737, - "rewards/accuracies": 0.762499988079071, - "rewards/chosen": -2.483314037322998, - "rewards/margins": 1.1499130725860596, - "rewards/rejected": -3.6332271099090576, + "logits/chosen": -1.8845453262329102, + "logits/rejected": -1.681976318359375, + "logps/chosen": -474.90753173828125, + "logps/rejected": -553.6337890625, + "loss": 0.4916, + "rewards/accuracies": 0.7562500238418579, + "rewards/chosen": -1.5448286533355713, + "rewards/margins": 0.9603381156921387, + "rewards/rejected": -2.505167007446289, "step": 2090 }, { "epoch": 0.55, - "grad_norm": 10.875, + "grad_norm": 10.375, "learning_rate": 2.504568922200064e-06, - "logits/chosen": -1.3680492639541626, - "logits/rejected": -1.2119042873382568, - "logps/chosen": -485.38580322265625, - "logps/rejected": -582.6237182617188, - "loss": 0.4946, - "rewards/accuracies": 0.737500011920929, - "rewards/chosen": -2.440831184387207, - "rewards/margins": 1.1339969635009766, - "rewards/rejected": -3.5748283863067627, + "logits/chosen": -1.8502464294433594, + "logits/rejected": -1.6667206287384033, + "logps/chosen": -398.37567138671875, + "logps/rejected": -467.509765625, + "loss": 0.5058, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.3745046854019165, + "rewards/margins": 0.8694229125976562, + "rewards/rejected": -2.243927478790283, "step": 2100 }, { "epoch": 0.55, - "eval_logits/chosen": -1.3435323238372803, - "eval_logits/rejected": -1.2222667932510376, - "eval_logps/chosen": -524.0186767578125, - "eval_logps/rejected": -610.4345092773438, - "eval_loss": 0.4990447759628296, - "eval_rewards/accuracies": 0.7404999732971191, - "eval_rewards/chosen": -2.593912124633789, - "eval_rewards/margins": 1.0645288228988647, - "eval_rewards/rejected": -3.6584410667419434, - "eval_runtime": 785.9589, - "eval_samples_per_second": 2.545, - "eval_steps_per_second": 0.318, + "eval_logits/chosen": -1.9302312135696411, + "eval_logits/rejected": -1.8155733346939087, + "eval_logps/chosen": -425.4353332519531, + "eval_logps/rejected": -491.28082275390625, + "eval_loss": 0.5121245384216309, + "eval_rewards/accuracies": 0.7319999933242798, + "eval_rewards/chosen": -1.4030110836029053, + "eval_rewards/margins": 0.8773477673530579, + "eval_rewards/rejected": -2.2803587913513184, + "eval_runtime": 783.2383, + "eval_samples_per_second": 2.554, + "eval_steps_per_second": 0.319, "step": 2100 }, { "epoch": 0.55, - "grad_norm": 9.0625, + "grad_norm": 8.125, "learning_rate": 2.4817244638019333e-06, - "logits/chosen": -1.4251536130905151, - "logits/rejected": -1.2696633338928223, - "logps/chosen": -537.7713623046875, - "logps/rejected": -598.4697265625, - "loss": 0.5169, - "rewards/accuracies": 0.7437499761581421, - "rewards/chosen": -2.6200804710388184, - "rewards/margins": 1.0189145803451538, - "rewards/rejected": -3.638995409011841, + "logits/chosen": -1.9113391637802124, + "logits/rejected": -1.7333167791366577, + "logps/chosen": -438.35302734375, + "logps/rejected": -476.8440856933594, + "loss": 0.546, + "rewards/accuracies": 0.768750011920929, + "rewards/chosen": -1.412841558456421, + "rewards/margins": 0.8053679466247559, + "rewards/rejected": -2.218209743499756, "step": 2110 }, { "epoch": 0.55, - "grad_norm": 16.0, + "grad_norm": 16.375, "learning_rate": 2.4588815314058155e-06, - "logits/chosen": -1.3950834274291992, - "logits/rejected": -1.3387658596038818, - "logps/chosen": -498.3982849121094, - "logps/rejected": -562.3578491210938, - "loss": 0.4877, - "rewards/accuracies": 0.7562500238418579, - "rewards/chosen": -2.5290651321411133, - "rewards/margins": 0.9819074869155884, - "rewards/rejected": -3.510972261428833, + "logits/chosen": -1.8352677822113037, + "logits/rejected": -1.8049707412719727, + "logps/chosen": -396.540283203125, + "logps/rejected": -446.5592346191406, + "loss": 0.5137, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.347374677658081, + "rewards/margins": 0.8455110788345337, + "rewards/rejected": -2.1928858757019043, "step": 2120 }, { "epoch": 0.56, - "grad_norm": 13.6875, + "grad_norm": 8.3125, "learning_rate": 2.4360420323899922e-06, - "logits/chosen": -1.4768807888031006, - "logits/rejected": -1.346991777420044, - "logps/chosen": -514.8466796875, - "logps/rejected": -561.6516723632812, - "loss": 0.5809, - "rewards/accuracies": 0.699999988079071, - "rewards/chosen": -2.390619993209839, - "rewards/margins": 0.7938958406448364, - "rewards/rejected": -3.1845157146453857, + "logits/chosen": -1.9231681823730469, + "logits/rejected": -1.7653518915176392, + "logps/chosen": -431.0726623535156, + "logps/rejected": -464.889404296875, + "loss": 0.5654, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -1.3148460388183594, + "rewards/margins": 0.712091326713562, + "rewards/rejected": -2.026937246322632, "step": 2130 }, { "epoch": 0.56, - "grad_norm": 6.1875, + "grad_norm": 6.25, "learning_rate": 2.4132078738460585e-06, - "logits/chosen": -1.5245544910430908, - "logits/rejected": -1.3647751808166504, - "logps/chosen": -490.10662841796875, - "logps/rejected": -555.2937622070312, - "loss": 0.4677, - "rewards/accuracies": 0.7562500238418579, - "rewards/chosen": -2.256990432739258, - "rewards/margins": 1.0371400117874146, - "rewards/rejected": -3.294130802154541, + "logits/chosen": -1.939756155014038, + "logits/rejected": -1.7474992275238037, + "logps/chosen": -411.16943359375, + "logps/rejected": -457.58294677734375, + "loss": 0.4876, + "rewards/accuracies": 0.71875, + "rewards/chosen": -1.3001420497894287, + "rewards/margins": 0.8735949397087097, + "rewards/rejected": -2.173737049102783, "step": 2140 }, { "epoch": 0.56, - "grad_norm": 12.25, + "grad_norm": 30.0, "learning_rate": 2.3903809624196826e-06, - "logits/chosen": -1.4812016487121582, - "logits/rejected": -1.334385633468628, - "logps/chosen": -449.6355895996094, - "logps/rejected": -501.5064392089844, - "loss": 0.5571, - "rewards/accuracies": 0.6937500238418579, - "rewards/chosen": -2.1596665382385254, - "rewards/margins": 0.8436049222946167, - "rewards/rejected": -3.0032711029052734, + "logits/chosen": -1.926123857498169, + "logits/rejected": -1.7321586608886719, + "logps/chosen": -378.1445007324219, + "logps/rejected": -406.1599426269531, + "loss": 0.5939, + "rewards/accuracies": 0.6812499761581421, + "rewards/chosen": -1.2644902467727661, + "rewards/margins": 0.6342250108718872, + "rewards/rejected": -1.8987153768539429, "step": 2150 }, { "epoch": 0.57, - "grad_norm": 13.3125, + "grad_norm": 11.9375, "learning_rate": 2.3675632041513978e-06, - "logits/chosen": -1.5942524671554565, - "logits/rejected": -1.3488222360610962, - "logps/chosen": -509.5716857910156, - "logps/rejected": -551.3196411132812, - "loss": 0.4925, - "rewards/accuracies": 0.7437499761581421, - "rewards/chosen": -2.2258713245391846, - "rewards/margins": 1.064483880996704, - "rewards/rejected": -3.2903549671173096, + "logits/chosen": -2.0393998622894287, + "logits/rejected": -1.7864320278167725, + "logps/chosen": -425.8330078125, + "logps/rejected": -446.5849609375, + "loss": 0.4931, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": -1.2073787450790405, + "rewards/margins": 0.8838081359863281, + "rewards/rejected": -2.091187000274658, "step": 2160 }, { "epoch": 0.57, - "grad_norm": 11.5625, + "grad_norm": 8.5625, "learning_rate": 2.3447565043174533e-06, - "logits/chosen": -1.4272565841674805, - "logits/rejected": -1.2725881338119507, - "logps/chosen": -508.5224609375, - "logps/rejected": -560.1171264648438, - "loss": 0.5312, - "rewards/accuracies": 0.699999988079071, - "rewards/chosen": -2.502539873123169, - "rewards/margins": 0.9032621383666992, - "rewards/rejected": -3.4058022499084473, + "logits/chosen": -1.87993586063385, + "logits/rejected": -1.7703927755355835, + "logps/chosen": -428.050048828125, + "logps/rejected": -463.94683837890625, + "loss": 0.5315, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -1.516704797744751, + "rewards/margins": 0.7439497709274292, + "rewards/rejected": -2.2606546878814697, "step": 2170 }, { "epoch": 0.57, - "grad_norm": 11.75, + "grad_norm": 9.125, "learning_rate": 2.321962767270724e-06, - "logits/chosen": -1.4588415622711182, - "logits/rejected": -1.3032244443893433, - "logps/chosen": -494.3795471191406, - "logps/rejected": -537.76953125, - "loss": 0.5614, - "rewards/accuracies": 0.737500011920929, - "rewards/chosen": -2.4449479579925537, - "rewards/margins": 0.8053716421127319, - "rewards/rejected": -3.2503199577331543, + "logits/chosen": -1.932936668395996, + "logits/rejected": -1.7261488437652588, + "logps/chosen": -423.9878845214844, + "logps/rejected": -446.48779296875, + "loss": 0.573, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -1.4952479600906372, + "rewards/margins": 0.677778959274292, + "rewards/rejected": -2.1730268001556396, "step": 2180 }, { "epoch": 0.57, - "grad_norm": 7.8125, + "grad_norm": 7.625, "learning_rate": 2.299183896281692e-06, - "logits/chosen": -1.4126060009002686, - "logits/rejected": -1.268117904663086, - "logps/chosen": -468.32720947265625, - "logps/rejected": -549.8713989257812, - "loss": 0.5261, - "rewards/accuracies": 0.71875, - "rewards/chosen": -2.1345205307006836, - "rewards/margins": 0.8381951451301575, - "rewards/rejected": -2.9727156162261963, + "logits/chosen": -1.9751628637313843, + "logits/rejected": -1.7565038204193115, + "logps/chosen": -406.3192443847656, + "logps/rejected": -481.75701904296875, + "loss": 0.5169, + "rewards/accuracies": 0.7437499761581421, + "rewards/chosen": -1.3347368240356445, + "rewards/margins": 0.7907986640930176, + "rewards/rejected": -2.125535488128662, "step": 2190 }, { "epoch": 0.58, - "grad_norm": 6.0, + "grad_norm": 5.65625, "learning_rate": 2.2764217933795297e-06, - "logits/chosen": -1.5007565021514893, - "logits/rejected": -1.3766028881072998, - "logps/chosen": -455.20086669921875, - "logps/rejected": -538.727294921875, - "loss": 0.4809, - "rewards/accuracies": 0.7124999761581421, - "rewards/chosen": -1.938236951828003, - "rewards/margins": 1.0400770902633667, - "rewards/rejected": -2.97831392288208, + "logits/chosen": -1.9902312755584717, + "logits/rejected": -1.8669850826263428, + "logps/chosen": -404.98345947265625, + "logps/rejected": -479.02001953125, + "loss": 0.491, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.247058629989624, + "rewards/margins": 0.9538729786872864, + "rewards/rejected": -2.2009317874908447, "step": 2200 }, { "epoch": 0.58, - "eval_logits/chosen": -1.3983244895935059, - "eval_logits/rejected": -1.2749797105789185, - "eval_logps/chosen": -464.00067138671875, - "eval_logps/rejected": -537.4633178710938, - "eval_loss": 0.4960455894470215, - "eval_rewards/accuracies": 0.7400000095367432, - "eval_rewards/chosen": -1.99373197555542, - "eval_rewards/margins": 0.934997022151947, - "eval_rewards/rejected": -2.9287290573120117, - "eval_runtime": 786.1842, - "eval_samples_per_second": 2.544, - "eval_steps_per_second": 0.318, + "eval_logits/chosen": -2.0052785873413086, + "eval_logits/rejected": -1.8893271684646606, + "eval_logps/chosen": -413.965576171875, + "eval_logps/rejected": -474.9657287597656, + "eval_loss": 0.5101913213729858, + "eval_rewards/accuracies": 0.7300000190734863, + "eval_rewards/chosen": -1.2883129119873047, + "eval_rewards/margins": 0.8288950324058533, + "eval_rewards/rejected": -2.1172077655792236, + "eval_runtime": 783.9156, + "eval_samples_per_second": 2.551, + "eval_steps_per_second": 0.319, "step": 2200 }, { "epoch": 0.58, - "grad_norm": 4.96875, + "grad_norm": 4.3125, "learning_rate": 2.2536783591932786e-06, - "logits/chosen": -1.556023359298706, - "logits/rejected": -1.400562047958374, - "logps/chosen": -483.48834228515625, - "logps/rejected": -551.9060668945312, - "loss": 0.5219, - "rewards/accuracies": 0.731249988079071, - "rewards/chosen": -2.080296516418457, - "rewards/margins": 0.8650667071342468, - "rewards/rejected": -2.9453635215759277, + "logits/chosen": -2.0587801933288574, + "logits/rejected": -1.9291499853134155, + "logps/chosen": -429.5076599121094, + "logps/rejected": -478.2344665527344, + "loss": 0.5481, + "rewards/accuracies": 0.737500011920929, + "rewards/chosen": -1.328704595565796, + "rewards/margins": 0.716596245765686, + "rewards/rejected": -2.0453009605407715, "step": 2210 }, { "epoch": 0.58, - "grad_norm": 8.4375, + "grad_norm": 7.125, "learning_rate": 2.230955492793149e-06, - "logits/chosen": -1.3444099426269531, - "logits/rejected": -1.2924230098724365, - "logps/chosen": -513.07080078125, - "logps/rejected": -582.9181518554688, - "loss": 0.5894, - "rewards/accuracies": 0.7250000238418579, - "rewards/chosen": -2.392636775970459, - "rewards/margins": 0.8276904821395874, - "rewards/rejected": -3.2203269004821777, + "logits/chosen": -1.843507170677185, + "logits/rejected": -1.7773106098175049, + "logps/chosen": -447.97296142578125, + "logps/rejected": -498.5977478027344, + "loss": 0.6047, + "rewards/accuracies": 0.6875, + "rewards/chosen": -1.5283676385879517, + "rewards/margins": 0.6763461828231812, + "rewards/rejected": -2.204713821411133, "step": 2220 }, { "epoch": 0.58, - "grad_norm": 6.1875, + "grad_norm": 5.875, "learning_rate": 2.208255091531947e-06, - "logits/chosen": -1.361377477645874, - "logits/rejected": -1.2748311758041382, - "logps/chosen": -496.361083984375, - "logps/rejected": -573.513671875, - "loss": 0.4732, - "rewards/accuracies": 0.8125, - "rewards/chosen": -2.194149971008301, - "rewards/margins": 1.1210458278656006, - "rewards/rejected": -3.3151957988739014, + "logits/chosen": -1.903603196144104, + "logits/rejected": -1.803851842880249, + "logps/chosen": -430.32830810546875, + "logps/rejected": -484.010009765625, + "loss": 0.4911, + "rewards/accuracies": 0.768750011920929, + "rewards/chosen": -1.32736074924469, + "rewards/margins": 0.9304676055908203, + "rewards/rejected": -2.2578284740448, "step": 2230 }, { "epoch": 0.59, - "grad_norm": 10.4375, + "grad_norm": 8.5, "learning_rate": 2.1855790508866435e-06, - "logits/chosen": -1.4328817129135132, - "logits/rejected": -1.3162767887115479, - "logps/chosen": -505.9356994628906, - "logps/rejected": -592.6329345703125, - "loss": 0.4924, - "rewards/accuracies": 0.737500011920929, - "rewards/chosen": -2.0898897647857666, - "rewards/margins": 1.0430490970611572, - "rewards/rejected": -3.132939100265503, + "logits/chosen": -1.9459956884384155, + "logits/rejected": -1.7975234985351562, + "logps/chosen": -473.66778564453125, + "logps/rejected": -527.3323974609375, + "loss": 0.5193, + "rewards/accuracies": 0.7124999761581421, + "rewards/chosen": -1.447183609008789, + "rewards/margins": 0.8262225985527039, + "rewards/rejected": -2.2734062671661377, "step": 2240 }, { "epoch": 0.59, - "grad_norm": 5.6875, + "grad_norm": 5.1875, "learning_rate": 2.162929264300107e-06, - "logits/chosen": -1.4232869148254395, - "logits/rejected": -1.332491159439087, - "logps/chosen": -457.2303771972656, - "logps/rejected": -559.7453002929688, - "loss": 0.4106, - "rewards/accuracies": 0.8062499761581421, - "rewards/chosen": -1.8832441568374634, - "rewards/margins": 1.2484712600708008, - "rewards/rejected": -3.1317152976989746, + "logits/chosen": -1.8744213581085205, + "logits/rejected": -1.810024619102478, + "logps/chosen": -405.54437255859375, + "logps/rejected": -486.003662109375, + "loss": 0.4564, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.1884064674377441, + "rewards/margins": 1.0178717374801636, + "rewards/rejected": -2.206278085708618, "step": 2250 }, { "epoch": 0.59, - "grad_norm": 12.4375, + "grad_norm": 9.125, "learning_rate": 2.1403076230230006e-06, - "logits/chosen": -1.351418137550354, - "logits/rejected": -1.2313954830169678, - "logps/chosen": -513.0045166015625, - "logps/rejected": -566.1433715820312, - "loss": 0.5914, - "rewards/accuracies": 0.6875, - "rewards/chosen": -2.4189300537109375, - "rewards/margins": 0.8091577291488647, - "rewards/rejected": -3.228087902069092, + "logits/chosen": -1.8847591876983643, + "logits/rejected": -1.7714933156967163, + "logps/chosen": -439.0040588378906, + "logps/rejected": -468.7979431152344, + "loss": 0.6072, + "rewards/accuracies": 0.6625000238418579, + "rewards/chosen": -1.4334795475006104, + "rewards/margins": 0.630344033241272, + "rewards/rejected": -2.0638232231140137, "step": 2260 }, { "epoch": 0.59, - "grad_norm": 8.6875, + "grad_norm": 7.75, "learning_rate": 2.11771601595586e-06, - "logits/chosen": -1.40321946144104, - "logits/rejected": -1.291913390159607, - "logps/chosen": -520.4403076171875, - "logps/rejected": -564.6587524414062, - "loss": 0.517, - "rewards/accuracies": 0.699999988079071, - "rewards/chosen": -2.384366035461426, - "rewards/margins": 0.9645765423774719, - "rewards/rejected": -3.348942518234253, + "logits/chosen": -1.925467848777771, + "logits/rejected": -1.8592636585235596, + "logps/chosen": -436.3429260253906, + "logps/rejected": -466.99810791015625, + "loss": 0.5333, + "rewards/accuracies": 0.737500011920929, + "rewards/chosen": -1.3296717405319214, + "rewards/margins": 0.8554004430770874, + "rewards/rejected": -2.185072422027588, "step": 2270 }, { "epoch": 0.6, - "grad_norm": 12.0, + "grad_norm": 9.4375, "learning_rate": 2.0951563294913737e-06, - "logits/chosen": -1.3978488445281982, - "logits/rejected": -1.185626745223999, - "logps/chosen": -489.99456787109375, - "logps/rejected": -556.4645385742188, - "loss": 0.4735, - "rewards/accuracies": 0.8125, - "rewards/chosen": -2.2852938175201416, - "rewards/margins": 0.9692630767822266, - "rewards/rejected": -3.254556655883789, + "logits/chosen": -2.0011866092681885, + "logits/rejected": -1.7229188680648804, + "logps/chosen": -401.23773193359375, + "logps/rejected": -455.4305114746094, + "loss": 0.4779, + "rewards/accuracies": 0.7875000238418579, + "rewards/chosen": -1.1956737041473389, + "rewards/margins": 0.8523205518722534, + "rewards/rejected": -2.0479941368103027, "step": 2280 }, { "epoch": 0.6, - "grad_norm": 6.59375, + "grad_norm": 6.25, "learning_rate": 2.0726304473568693e-06, - "logits/chosen": -1.3983738422393799, - "logits/rejected": -1.2773765325546265, - "logps/chosen": -481.07208251953125, - "logps/rejected": -549.56640625, - "loss": 0.4708, - "rewards/accuracies": 0.75, - "rewards/chosen": -2.221526622772217, - "rewards/margins": 1.0086002349853516, - "rewards/rejected": -3.2301268577575684, + "logits/chosen": -2.045499563217163, + "logits/rejected": -1.9166113138198853, + "logps/chosen": -397.6915588378906, + "logps/rejected": -439.178466796875, + "loss": 0.5102, + "rewards/accuracies": 0.762499988079071, + "rewards/chosen": -1.1919631958007812, + "rewards/margins": 0.771680474281311, + "rewards/rejected": -1.9636436700820923, "step": 2290 }, { "epoch": 0.6, - "grad_norm": 9.875, + "grad_norm": 7.4375, "learning_rate": 2.050140250457023e-06, - "logits/chosen": -1.4776570796966553, - "logits/rejected": -1.261081576347351, - "logps/chosen": -533.631591796875, - "logps/rejected": -610.4840087890625, - "loss": 0.4721, - "rewards/accuracies": 0.7562500238418579, - "rewards/chosen": -2.561835765838623, - "rewards/margins": 1.1079896688461304, - "rewards/rejected": -3.669825792312622, + "logits/chosen": -2.0833168029785156, + "logits/rejected": -1.8359079360961914, + "logps/chosen": -419.4093322753906, + "logps/rejected": -470.44256591796875, + "loss": 0.4923, + "rewards/accuracies": 0.762499988079071, + "rewards/chosen": -1.2162163257598877, + "rewards/margins": 0.8549982309341431, + "rewards/rejected": -2.0712146759033203, "step": 2300 }, { "epoch": 0.6, - "eval_logits/chosen": -1.2803525924682617, - "eval_logits/rejected": -1.1592578887939453, - "eval_logps/chosen": -538.886474609375, - "eval_logps/rejected": -635.14892578125, - "eval_loss": 0.49942103028297424, - "eval_rewards/accuracies": 0.7409999966621399, - "eval_rewards/chosen": -2.74259090423584, - "eval_rewards/margins": 1.162994146347046, - "eval_rewards/rejected": -3.9055850505828857, - "eval_runtime": 784.6528, - "eval_samples_per_second": 2.549, + "eval_logits/chosen": -2.0101025104522705, + "eval_logits/rejected": -1.8948729038238525, + "eval_logps/chosen": -409.7294616699219, + "eval_logps/rejected": -472.4915771484375, + "eval_loss": 0.5106725096702576, + "eval_rewards/accuracies": 0.7319999933242798, + "eval_rewards/chosen": -1.245951771736145, + "eval_rewards/margins": 0.8465145230293274, + "eval_rewards/rejected": -2.092466354370117, + "eval_runtime": 782.9612, + "eval_samples_per_second": 2.554, "eval_steps_per_second": 0.319, "step": 2300 }, { "epoch": 0.6, - "grad_norm": 15.4375, + "grad_norm": 8.9375, "learning_rate": 2.0276876167168042e-06, - "logits/chosen": -1.2227513790130615, - "logits/rejected": -1.1273781061172485, - "logps/chosen": -501.6146545410156, - "logps/rejected": -578.4463500976562, - "loss": 0.5789, - "rewards/accuracies": 0.7124999761581421, - "rewards/chosen": -2.771559238433838, - "rewards/margins": 1.0941669940948486, - "rewards/rejected": -3.8657259941101074, + "logits/chosen": -1.845709204673767, + "logits/rejected": -1.7635509967803955, + "logps/chosen": -373.54425048828125, + "logps/rejected": -412.81805419921875, + "loss": 0.5546, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -1.249789834022522, + "rewards/margins": 0.7671625018119812, + "rewards/rejected": -2.0169520378112793, "step": 2310 }, { "epoch": 0.61, - "grad_norm": 9.875, + "grad_norm": 6.5625, "learning_rate": 2.0052744209246682e-06, - "logits/chosen": -1.3600375652313232, - "logits/rejected": -1.242816686630249, - "logps/chosen": -535.4923706054688, - "logps/rejected": -608.05859375, - "loss": 0.5101, - "rewards/accuracies": 0.737500011920929, - "rewards/chosen": -2.8023643493652344, - "rewards/margins": 1.0536280870437622, - "rewards/rejected": -3.855992555618286, + "logits/chosen": -1.970632791519165, + "logits/rejected": -1.8350298404693604, + "logps/chosen": -410.10125732421875, + "logps/rejected": -451.84124755859375, + "loss": 0.521, + "rewards/accuracies": 0.706250011920929, + "rewards/chosen": -1.3679193258285522, + "rewards/margins": 0.7870801687240601, + "rewards/rejected": -2.1549994945526123, "step": 2320 }, { "epoch": 0.61, - "grad_norm": 11.75, + "grad_norm": 10.4375, "learning_rate": 1.9829025345760127e-06, - "logits/chosen": -1.358417272567749, - "logits/rejected": -1.3312757015228271, - "logps/chosen": -533.0018310546875, - "logps/rejected": -619.017822265625, - "loss": 0.5438, - "rewards/accuracies": 0.706250011920929, - "rewards/chosen": -2.5403590202331543, - "rewards/margins": 0.920685887336731, - "rewards/rejected": -3.4610447883605957, + "logits/chosen": -1.9096359014511108, + "logits/rejected": -1.947090744972229, + "logps/chosen": -431.58319091796875, + "logps/rejected": -487.96466064453125, + "loss": 0.5675, + "rewards/accuracies": 0.7124999761581421, + "rewards/chosen": -1.311854600906372, + "rewards/margins": 0.6661221385002136, + "rewards/rejected": -1.9779767990112305, "step": 2330 }, { "epoch": 0.61, - "grad_norm": 8.75, + "grad_norm": 8.5, "learning_rate": 1.9605738257169115e-06, - "logits/chosen": -1.3306870460510254, - "logits/rejected": -1.1574633121490479, - "logps/chosen": -481.7784729003906, - "logps/rejected": -590.2117919921875, - "loss": 0.4974, - "rewards/accuracies": 0.75, - "rewards/chosen": -2.488585948944092, - "rewards/margins": 1.1636273860931396, - "rewards/rejected": -3.6522133350372314, + "logits/chosen": -1.928025245666504, + "logits/rejected": -1.657149314880371, + "logps/chosen": -391.2662048339844, + "logps/rejected": -477.40106201171875, + "loss": 0.4978, + "rewards/accuracies": 0.768750011920929, + "rewards/chosen": -1.3763009309768677, + "rewards/margins": 0.9396868944168091, + "rewards/rejected": -2.3159878253936768, "step": 2340 }, { "epoch": 0.62, - "grad_norm": 10.9375, + "grad_norm": 8.3125, "learning_rate": 1.9382901587881275e-06, - "logits/chosen": -1.3847217559814453, - "logits/rejected": -1.2626315355300903, - "logps/chosen": -491.93438720703125, - "logps/rejected": -581.0950927734375, - "loss": 0.4138, - "rewards/accuracies": 0.78125, - "rewards/chosen": -2.3320419788360596, - "rewards/margins": 1.2147138118743896, - "rewards/rejected": -3.54675555229187, + "logits/chosen": -1.9695905447006226, + "logits/rejected": -1.8236945867538452, + "logps/chosen": -399.340087890625, + "logps/rejected": -462.8815002441406, + "loss": 0.4442, + "rewards/accuracies": 0.8062499761581421, + "rewards/chosen": -1.215022325515747, + "rewards/margins": 1.0116783380508423, + "rewards/rejected": -2.226700782775879, "step": 2350 }, { "epoch": 0.62, - "grad_norm": 19.375, + "grad_norm": 10.125, "learning_rate": 1.916053394469437e-06, - "logits/chosen": -1.3969483375549316, - "logits/rejected": -1.1894080638885498, - "logps/chosen": -518.6019287109375, - "logps/rejected": -613.1328735351562, - "loss": 0.5218, - "rewards/accuracies": 0.71875, - "rewards/chosen": -2.553534984588623, - "rewards/margins": 1.090774655342102, - "rewards/rejected": -3.6443095207214355, + "logits/chosen": -1.9849417209625244, + "logits/rejected": -1.7118148803710938, + "logps/chosen": -412.3340759277344, + "logps/rejected": -497.0166015625, + "loss": 0.4895, + "rewards/accuracies": 0.7562500238418579, + "rewards/chosen": -1.307409644126892, + "rewards/margins": 0.9881817102432251, + "rewards/rejected": -2.295591354370117, "step": 2360 }, { "epoch": 0.62, - "grad_norm": 11.0, + "grad_norm": 7.71875, "learning_rate": 1.8938653895242604e-06, - "logits/chosen": -1.3663508892059326, - "logits/rejected": -1.1833648681640625, - "logps/chosen": -508.30267333984375, - "logps/rejected": -601.1873168945312, - "loss": 0.4356, - "rewards/accuracies": 0.8187500238418579, - "rewards/chosen": -2.4409778118133545, - "rewards/margins": 1.2163755893707275, - "rewards/rejected": -3.657353639602661, + "logits/chosen": -1.9592363834381104, + "logits/rejected": -1.747896432876587, + "logps/chosen": -414.3805236816406, + "logps/rejected": -484.31842041015625, + "loss": 0.4682, + "rewards/accuracies": 0.8062499761581421, + "rewards/chosen": -1.3323676586151123, + "rewards/margins": 1.001394510269165, + "rewards/rejected": -2.3337621688842773, "step": 2370 }, { "epoch": 0.62, - "grad_norm": 11.0625, + "grad_norm": 8.375, "learning_rate": 1.8717279966446267e-06, - "logits/chosen": -1.2062112092971802, - "logits/rejected": -1.125889539718628, - "logps/chosen": -510.49578857421875, - "logps/rejected": -616.1349487304688, - "loss": 0.4488, - "rewards/accuracies": 0.762499988079071, - "rewards/chosen": -2.619786024093628, - "rewards/margins": 1.1522109508514404, - "rewards/rejected": -3.7719969749450684, + "logits/chosen": -1.8209333419799805, + "logits/rejected": -1.7357136011123657, + "logps/chosen": -419.76885986328125, + "logps/rejected": -493.34686279296875, + "loss": 0.5022, + "rewards/accuracies": 0.731249988079071, + "rewards/chosen": -1.4863672256469727, + "rewards/margins": 0.8758360743522644, + "rewards/rejected": -2.3622031211853027, "step": 2380 }, { "epoch": 0.63, - "grad_norm": 8.1875, + "grad_norm": 5.6875, "learning_rate": 1.8496430642964698e-06, - "logits/chosen": -1.2681883573532104, - "logits/rejected": -1.1605939865112305, - "logps/chosen": -538.9415893554688, - "logps/rejected": -628.6326293945312, - "loss": 0.4746, - "rewards/accuracies": 0.7562500238418579, - "rewards/chosen": -2.6662049293518066, - "rewards/margins": 1.1333577632904053, - "rewards/rejected": -3.799562931060791, + "logits/chosen": -1.899411916732788, + "logits/rejected": -1.8176219463348389, + "logps/chosen": -425.37030029296875, + "logps/rejected": -482.58197021484375, + "loss": 0.536, + "rewards/accuracies": 0.737500011920929, + "rewards/chosen": -1.3191746473312378, + "rewards/margins": 0.8288755416870117, + "rewards/rejected": -2.148050308227539, "step": 2390 }, { "epoch": 0.63, - "grad_norm": 13.125, + "grad_norm": 4.75, "learning_rate": 1.827612436565286e-06, - "logits/chosen": -1.2723603248596191, - "logits/rejected": -1.120650291442871, - "logps/chosen": -525.9894409179688, - "logps/rejected": -625.4840087890625, - "loss": 0.4693, - "rewards/accuracies": 0.7437499761581421, - "rewards/chosen": -2.6309449672698975, - "rewards/margins": 1.2005013227462769, - "rewards/rejected": -3.8314461708068848, + "logits/chosen": -1.941320776939392, + "logits/rejected": -1.7779874801635742, + "logps/chosen": -411.2789611816406, + "logps/rejected": -485.43328857421875, + "loss": 0.4718, + "rewards/accuracies": 0.731249988079071, + "rewards/chosen": -1.2878601551055908, + "rewards/margins": 0.9265607595443726, + "rewards/rejected": -2.214420795440674, "step": 2400 }, { "epoch": 0.63, - "eval_logits/chosen": -1.2052682638168335, - "eval_logits/rejected": -1.0848991870880127, - "eval_logps/chosen": -527.1746215820312, - "eval_logps/rejected": -621.5708618164062, - "eval_loss": 0.49804455041885376, - "eval_rewards/accuracies": 0.7404999732971191, - "eval_rewards/chosen": -2.625471591949463, - "eval_rewards/margins": 1.144333004951477, - "eval_rewards/rejected": -3.7698044776916504, - "eval_runtime": 785.6898, - "eval_samples_per_second": 2.546, - "eval_steps_per_second": 0.318, + "eval_logits/chosen": -1.9757143259048462, + "eval_logits/rejected": -1.8618159294128418, + "eval_logps/chosen": -419.5652770996094, + "eval_logps/rejected": -482.2935791015625, + "eval_loss": 0.5093097686767578, + "eval_rewards/accuracies": 0.7264999747276306, + "eval_rewards/chosen": -1.3443105220794678, + "eval_rewards/margins": 0.846176028251648, + "eval_rewards/rejected": -2.1904866695404053, + "eval_runtime": 783.1788, + "eval_samples_per_second": 2.554, + "eval_steps_per_second": 0.319, "step": 2400 }, { "epoch": 0.63, - "grad_norm": 16.875, + "grad_norm": 8.3125, "learning_rate": 1.8056379530021492e-06, - "logits/chosen": -1.328696846961975, - "logits/rejected": -1.2462583780288696, - "logps/chosen": -509.60870361328125, - "logps/rejected": -574.6074829101562, - "loss": 0.5497, - "rewards/accuracies": 0.6937500238418579, - "rewards/chosen": -2.6544697284698486, - "rewards/margins": 0.8983172178268433, - "rewards/rejected": -3.5527873039245605, + "logits/chosen": -1.9571435451507568, + "logits/rejected": -1.8972694873809814, + "logps/chosen": -400.8582763671875, + "logps/rejected": -447.929931640625, + "loss": 0.527, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.3873006105422974, + "rewards/margins": 0.7267057299613953, + "rewards/rejected": -2.114006519317627, "step": 2410 }, { "epoch": 0.63, - "grad_norm": 15.75, + "grad_norm": 8.1875, "learning_rate": 1.7837214484701154e-06, - "logits/chosen": -1.3654658794403076, - "logits/rejected": -1.242241621017456, - "logps/chosen": -470.0826721191406, - "logps/rejected": -560.571044921875, - "loss": 0.4824, - "rewards/accuracies": 0.793749988079071, - "rewards/chosen": -2.178126573562622, - "rewards/margins": 1.1377776861190796, - "rewards/rejected": -3.315903902053833, + "logits/chosen": -1.9466660022735596, + "logits/rejected": -1.8471206426620483, + "logps/chosen": -389.0110168457031, + "logps/rejected": -453.07257080078125, + "loss": 0.4846, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -1.1683722734451294, + "rewards/margins": 0.9214507937431335, + "rewards/rejected": -2.089823007583618, "step": 2420 }, { "epoch": 0.64, - "grad_norm": 14.25, + "grad_norm": 8.9375, "learning_rate": 1.7618647529910043e-06, - "logits/chosen": -1.3804328441619873, - "logits/rejected": -1.2497543096542358, - "logps/chosen": -473.36566162109375, - "logps/rejected": -571.909912109375, - "loss": 0.4992, - "rewards/accuracies": 0.731249988079071, - "rewards/chosen": -2.16664457321167, - "rewards/margins": 1.1039235591888428, - "rewards/rejected": -3.2705681324005127, + "logits/chosen": -1.9525439739227295, + "logits/rejected": -1.8127120733261108, + "logps/chosen": -403.1404724121094, + "logps/rejected": -473.431396484375, + "loss": 0.5026, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.2594850063323975, + "rewards/margins": 0.8663209080696106, + "rewards/rejected": -2.1258060932159424, "step": 2430 }, { "epoch": 0.64, - "grad_norm": 8.75, + "grad_norm": 7.84375, "learning_rate": 1.7400696915925996e-06, - "logits/chosen": -1.3891335725784302, - "logits/rejected": -1.1996538639068604, - "logps/chosen": -495.14697265625, - "logps/rejected": -543.0442504882812, - "loss": 0.512, - "rewards/accuracies": 0.737500011920929, - "rewards/chosen": -2.24596905708313, - "rewards/margins": 1.0613727569580078, - "rewards/rejected": -3.3073418140411377, + "logits/chosen": -1.9792495965957642, + "logits/rejected": -1.7760179042816162, + "logps/chosen": -421.8628845214844, + "logps/rejected": -445.33551025390625, + "loss": 0.5126, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -1.304434061050415, + "rewards/margins": 0.8763524293899536, + "rewards/rejected": -2.180786371231079, "step": 2440 }, { "epoch": 0.64, - "grad_norm": 11.0625, + "grad_norm": 12.3125, "learning_rate": 1.718338084156254e-06, - "logits/chosen": -1.3508188724517822, - "logits/rejected": -1.1968786716461182, - "logps/chosen": -500.3172302246094, - "logps/rejected": -570.3558959960938, - "loss": 0.4664, + "logits/chosen": -1.9452136754989624, + "logits/rejected": -1.7868611812591553, + "logps/chosen": -435.8577575683594, + "logps/rejected": -489.53997802734375, + "loss": 0.4592, "rewards/accuracies": 0.78125, - "rewards/chosen": -2.1358048915863037, - "rewards/margins": 1.0841740369796753, - "rewards/rejected": -3.2199790477752686, + "rewards/chosen": -1.263363242149353, + "rewards/margins": 0.9536368250846863, + "rewards/rejected": -2.2170000076293945, "step": 2450 }, { "epoch": 0.64, - "grad_norm": 10.5625, + "grad_norm": 17.875, "learning_rate": 1.6966717452649372e-06, - "logits/chosen": -1.4619855880737305, - "logits/rejected": -1.2985750436782837, - "logps/chosen": -488.65850830078125, - "logps/rejected": -548.7929077148438, - "loss": 0.4381, - "rewards/accuracies": 0.75, - "rewards/chosen": -2.1176390647888184, - "rewards/margins": 1.1195837259292603, - "rewards/rejected": -3.237222671508789, + "logits/chosen": -1.931109070777893, + "logits/rejected": -1.7780697345733643, + "logps/chosen": -429.203369140625, + "logps/rejected": -467.59588623046875, + "loss": 0.4606, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": -1.2935905456542969, + "rewards/margins": 0.9742799997329712, + "rewards/rejected": -2.2678704261779785, "step": 2460 }, { "epoch": 0.65, - "grad_norm": 8.6875, + "grad_norm": 7.53125, "learning_rate": 1.6750724840517103e-06, - "logits/chosen": -1.4035240411758423, - "logits/rejected": -1.326468586921692, - "logps/chosen": -475.1392517089844, - "logps/rejected": -569.88427734375, - "loss": 0.5292, - "rewards/accuracies": 0.7437499761581421, - "rewards/chosen": -2.223914623260498, - "rewards/margins": 0.9064715504646301, - "rewards/rejected": -3.1303858757019043, + "logits/chosen": -1.9360177516937256, + "logits/rejected": -1.8184916973114014, + "logps/chosen": -402.77056884765625, + "logps/rejected": -478.7181091308594, + "loss": 0.5304, + "rewards/accuracies": 0.731249988079071, + "rewards/chosen": -1.2702269554138184, + "rewards/margins": 0.7718670964241028, + "rewards/rejected": -2.0420939922332764, "step": 2470 }, { "epoch": 0.65, - "grad_norm": 11.9375, + "grad_norm": 9.8125, "learning_rate": 1.6535421040486686e-06, - "logits/chosen": -1.2256592512130737, - "logits/rejected": -1.1291966438293457, - "logps/chosen": -491.3338928222656, - "logps/rejected": -581.4615478515625, - "loss": 0.427, - "rewards/accuracies": 0.831250011920929, - "rewards/chosen": -2.3804726600646973, - "rewards/margins": 1.2432738542556763, - "rewards/rejected": -3.623746156692505, + "logits/chosen": -1.799547791481018, + "logits/rejected": -1.6973329782485962, + "logps/chosen": -416.2311096191406, + "logps/rejected": -489.39935302734375, + "loss": 0.4687, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": -1.419057011604309, + "rewards/margins": 1.0684629678726196, + "rewards/rejected": -2.4875199794769287, "step": 2480 }, { "epoch": 0.65, - "grad_norm": 12.5, + "grad_norm": 10.75, "learning_rate": 1.6320824030363458e-06, - "logits/chosen": -1.3096754550933838, - "logits/rejected": -1.2545294761657715, - "logps/chosen": -478.04998779296875, - "logps/rejected": -581.6156616210938, - "loss": 0.4561, - "rewards/accuracies": 0.7562500238418579, - "rewards/chosen": -2.4329066276550293, - "rewards/margins": 1.1900606155395508, - "rewards/rejected": -3.6229679584503174, + "logits/chosen": -1.8740527629852295, + "logits/rejected": -1.8057101964950562, + "logps/chosen": -390.3742980957031, + "logps/rejected": -475.86737060546875, + "loss": 0.4615, + "rewards/accuracies": 0.762499988079071, + "rewards/chosen": -1.374366044998169, + "rewards/margins": 1.031112790107727, + "rewards/rejected": -2.4054789543151855, "step": 2490 }, { "epoch": 0.65, - "grad_norm": 12.8125, + "grad_norm": 8.875, "learning_rate": 1.6106951728936028e-06, - "logits/chosen": -1.4158105850219727, - "logits/rejected": -1.2867882251739502, - "logps/chosen": -494.7313537597656, - "logps/rejected": -591.484375, - "loss": 0.5, - "rewards/accuracies": 0.71875, - "rewards/chosen": -2.375338554382324, - "rewards/margins": 1.0466548204421997, - "rewards/rejected": -3.4219937324523926, + "logits/chosen": -2.002263307571411, + "logits/rejected": -1.8575477600097656, + "logps/chosen": -405.1084289550781, + "logps/rejected": -480.92327880859375, + "loss": 0.5187, + "rewards/accuracies": 0.706250011920929, + "rewards/chosen": -1.2944339513778687, + "rewards/margins": 0.833656907081604, + "rewards/rejected": -2.1280908584594727, "step": 2500 }, { "epoch": 0.65, - "eval_logits/chosen": -1.2915215492248535, - "eval_logits/rejected": -1.166710615158081, - "eval_logps/chosen": -499.8446960449219, - "eval_logps/rejected": -589.3930053710938, - "eval_loss": 0.4927913546562195, - "eval_rewards/accuracies": 0.7425000071525574, - "eval_rewards/chosen": -2.352172374725342, - "eval_rewards/margins": 1.0958540439605713, - "eval_rewards/rejected": -3.448026418685913, - "eval_runtime": 785.167, - "eval_samples_per_second": 2.547, - "eval_steps_per_second": 0.318, + "eval_logits/chosen": -1.9795697927474976, + "eval_logits/rejected": -1.8640780448913574, + "eval_logps/chosen": -417.4667663574219, + "eval_logps/rejected": -484.1988220214844, + "eval_loss": 0.5102798342704773, + "eval_rewards/accuracies": 0.7285000085830688, + "eval_rewards/chosen": -1.3233253955841064, + "eval_rewards/margins": 0.8862132430076599, + "eval_rewards/rejected": -2.2095389366149902, + "eval_runtime": 784.1373, + "eval_samples_per_second": 2.551, + "eval_steps_per_second": 0.319, "step": 2500 }, { "epoch": 0.66, - "grad_norm": 7.875, + "grad_norm": 6.0, "learning_rate": 1.5893821994479996e-06, - "logits/chosen": -1.410304307937622, - "logits/rejected": -1.2979320287704468, - "logps/chosen": -498.3885192871094, - "logps/rejected": -573.0137939453125, - "loss": 0.4767, - "rewards/accuracies": 0.7749999761581421, - "rewards/chosen": -2.2099618911743164, - "rewards/margins": 1.131973147392273, - "rewards/rejected": -3.3419349193573, + "logits/chosen": -1.9972072839736938, + "logits/rejected": -1.8986774682998657, + "logps/chosen": -403.4590759277344, + "logps/rejected": -454.43585205078125, + "loss": 0.481, + "rewards/accuracies": 0.768750011920929, + "rewards/chosen": -1.087583303451538, + "rewards/margins": 0.914753258228302, + "rewards/rejected": -2.0023367404937744, "step": 2510 }, { "epoch": 0.66, - "grad_norm": 10.4375, + "grad_norm": 7.375, "learning_rate": 1.5681452623266868e-06, - "logits/chosen": -1.387092113494873, - "logits/rejected": -1.1502468585968018, - "logps/chosen": -521.6098022460938, - "logps/rejected": -584.9129028320312, - "loss": 0.4678, - "rewards/accuracies": 0.768750011920929, - "rewards/chosen": -2.3259663581848145, - "rewards/margins": 1.215375304222107, - "rewards/rejected": -3.541342258453369, + "logits/chosen": -1.9567502737045288, + "logits/rejected": -1.769144058227539, + "logps/chosen": -429.75860595703125, + "logps/rejected": -468.85382080078125, + "loss": 0.4778, + "rewards/accuracies": 0.78125, + "rewards/chosen": -1.2228810787200928, + "rewards/margins": 0.9947159886360168, + "rewards/rejected": -2.217597007751465, "step": 2520 }, { "epoch": 0.66, - "grad_norm": 9.0625, + "grad_norm": 6.53125, "learning_rate": 1.5469861348078014e-06, - "logits/chosen": -1.3792612552642822, - "logits/rejected": -1.23573899269104, - "logps/chosen": -492.11932373046875, - "logps/rejected": -602.9713745117188, - "loss": 0.4293, - "rewards/accuracies": 0.78125, - "rewards/chosen": -2.4535579681396484, - "rewards/margins": 1.2147105932235718, - "rewards/rejected": -3.6682686805725098, + "logits/chosen": -1.967635154724121, + "logits/rejected": -1.8467252254486084, + "logps/chosen": -392.23504638671875, + "logps/rejected": -474.0735778808594, + "loss": 0.4835, + "rewards/accuracies": 0.762499988079071, + "rewards/chosen": -1.249990701675415, + "rewards/margins": 0.9654566049575806, + "rewards/rejected": -2.215447187423706, "step": 2530 }, { "epoch": 0.66, - "grad_norm": 10.25, + "grad_norm": 7.4375, "learning_rate": 1.5259065836724035e-06, - "logits/chosen": -1.2396007776260376, - "logits/rejected": -1.18125581741333, - "logps/chosen": -500.73150634765625, - "logps/rejected": -620.1203002929688, - "loss": 0.4274, - "rewards/accuracies": 0.800000011920929, - "rewards/chosen": -2.559818983078003, - "rewards/margins": 1.2554031610488892, - "rewards/rejected": -3.8152222633361816, + "logits/chosen": -1.803320288658142, + "logits/rejected": -1.7856988906860352, + "logps/chosen": -396.5582275390625, + "logps/rejected": -478.5979919433594, + "loss": 0.49, + "rewards/accuracies": 0.7562500238418579, + "rewards/chosen": -1.3036736249923706, + "rewards/margins": 0.9293512105941772, + "rewards/rejected": -2.233024835586548, "step": 2540 }, { "epoch": 0.67, - "grad_norm": 20.25, + "grad_norm": 8.25, "learning_rate": 1.5049083690569456e-06, - "logits/chosen": -1.2867457866668701, - "logits/rejected": -1.1846749782562256, - "logps/chosen": -513.3667602539062, - "logps/rejected": -623.0715942382812, - "loss": 0.5211, - "rewards/accuracies": 0.7124999761581421, - "rewards/chosen": -2.7591919898986816, - "rewards/margins": 1.1411640644073486, - "rewards/rejected": -3.900355815887451, + "logits/chosen": -1.8403940200805664, + "logits/rejected": -1.6996625661849976, + "logps/chosen": -382.8592529296875, + "logps/rejected": -459.94952392578125, + "loss": 0.5264, + "rewards/accuracies": 0.71875, + "rewards/chosen": -1.2738138437271118, + "rewards/margins": 0.8273483514785767, + "rewards/rejected": -2.1011624336242676, "step": 2550 }, { "epoch": 0.67, - "grad_norm": 14.0, + "grad_norm": 12.3125, "learning_rate": 1.4839932443063057e-06, - "logits/chosen": -1.3069757223129272, - "logits/rejected": -1.1448779106140137, - "logps/chosen": -552.1356811523438, - "logps/rejected": -609.2958984375, - "loss": 0.4779, - "rewards/accuracies": 0.75, - "rewards/chosen": -2.6020736694335938, - "rewards/margins": 1.1343077421188354, - "rewards/rejected": -3.7363815307617188, + "logits/chosen": -1.9228919744491577, + "logits/rejected": -1.7470515966415405, + "logps/chosen": -434.42767333984375, + "logps/rejected": -461.97943115234375, + "loss": 0.4903, + "rewards/accuracies": 0.78125, + "rewards/chosen": -1.238571047782898, + "rewards/margins": 0.861971378326416, + "rewards/rejected": -2.1005425453186035, "step": 2560 }, { "epoch": 0.67, - "grad_norm": 15.9375, + "grad_norm": 11.25, "learning_rate": 1.4631629558273803e-06, - "logits/chosen": -1.319197416305542, - "logits/rejected": -1.2164061069488525, - "logps/chosen": -505.0912170410156, - "logps/rejected": -576.6810302734375, - "loss": 0.6199, - "rewards/accuracies": 0.6625000238418579, - "rewards/chosen": -2.6343064308166504, - "rewards/margins": 0.8322671055793762, - "rewards/rejected": -3.4665732383728027, + "logits/chosen": -1.851351022720337, + "logits/rejected": -1.8053207397460938, + "logps/chosen": -394.43255615234375, + "logps/rejected": -451.2469787597656, + "loss": 0.5683, + "rewards/accuracies": 0.668749988079071, + "rewards/chosen": -1.342629313468933, + "rewards/margins": 0.7212954163551331, + "rewards/rejected": -2.063924551010132, "step": 2570 }, { "epoch": 0.68, - "grad_norm": 6.125, + "grad_norm": 5.09375, "learning_rate": 1.4424192429432657e-06, - "logits/chosen": -1.3904783725738525, - "logits/rejected": -1.3104830980300903, - "logps/chosen": -471.5389709472656, - "logps/rejected": -583.8826293945312, - "loss": 0.4716, - "rewards/accuracies": 0.731249988079071, - "rewards/chosen": -2.1569409370422363, - "rewards/margins": 1.0904219150543213, - "rewards/rejected": -3.247363328933716, + "logits/chosen": -1.888126015663147, + "logits/rejected": -1.8805888891220093, + "logps/chosen": -380.94451904296875, + "logps/rejected": -482.2357482910156, + "loss": 0.4703, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.0602384805679321, + "rewards/margins": 0.9893448948860168, + "rewards/rejected": -2.0495834350585938, "step": 2580 }, { "epoch": 0.68, - "grad_norm": 12.75, + "grad_norm": 11.5, "learning_rate": 1.421763837748016e-06, - "logits/chosen": -1.3573510646820068, - "logits/rejected": -1.2631666660308838, - "logps/chosen": -475.02264404296875, - "logps/rejected": -580.511474609375, - "loss": 0.4562, - "rewards/accuracies": 0.762499988079071, - "rewards/chosen": -2.244671583175659, - "rewards/margins": 1.139622688293457, - "rewards/rejected": -3.3842945098876953, + "logits/chosen": -1.8856582641601562, + "logits/rejected": -1.7754625082015991, + "logps/chosen": -404.8888244628906, + "logps/rejected": -481.6700744628906, + "loss": 0.4836, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": -1.2667722702026367, + "rewards/margins": 0.9201840162277222, + "rewards/rejected": -2.1869561672210693, "step": 2590 }, { "epoch": 0.68, - "grad_norm": 9.3125, + "grad_norm": 6.625, "learning_rate": 1.401198464962021e-06, - "logits/chosen": -1.4000240564346313, - "logits/rejected": -1.2207516431808472, - "logps/chosen": -510.6034240722656, - "logps/rejected": -570.3859252929688, - "loss": 0.4706, + "logits/chosen": -1.9382120370864868, + "logits/rejected": -1.6832538843154907, + "logps/chosen": -417.79046630859375, + "logps/rejected": -458.81719970703125, + "loss": 0.5025, "rewards/accuracies": 0.75, - "rewards/chosen": -2.4208061695098877, - "rewards/margins": 0.9837051630020142, - "rewards/rejected": -3.4045116901397705, + "rewards/chosen": -1.303044080734253, + "rewards/margins": 0.8441787958145142, + "rewards/rejected": -2.1472229957580566, "step": 2600 }, { "epoch": 0.68, - "eval_logits/chosen": -1.2960612773895264, - "eval_logits/rejected": -1.1720730066299438, - "eval_logps/chosen": -504.3380126953125, - "eval_logps/rejected": -593.6089477539062, - "eval_loss": 0.49207746982574463, - "eval_rewards/accuracies": 0.7390000224113464, - "eval_rewards/chosen": -2.3971054553985596, - "eval_rewards/margins": 1.0930795669555664, - "eval_rewards/rejected": -3.490184783935547, - "eval_runtime": 784.7098, - "eval_samples_per_second": 2.549, + "eval_logits/chosen": -1.9538325071334839, + "eval_logits/rejected": -1.8387821912765503, + "eval_logps/chosen": -414.2358703613281, + "eval_logps/rejected": -481.66204833984375, + "eval_loss": 0.5114797353744507, + "eval_rewards/accuracies": 0.7315000295639038, + "eval_rewards/chosen": -1.2910163402557373, + "eval_rewards/margins": 0.8931546807289124, + "eval_rewards/rejected": -2.184170961380005, + "eval_runtime": 783.3661, + "eval_samples_per_second": 2.553, "eval_steps_per_second": 0.319, "step": 2600 }, { "epoch": 0.68, - "grad_norm": 11.875, + "grad_norm": 12.3125, "learning_rate": 1.3807248417879896e-06, - "logits/chosen": -1.448823094367981, - "logits/rejected": -1.3387176990509033, - "logps/chosen": -505.70074462890625, - "logps/rejected": -603.4103393554688, - "loss": 0.4594, - "rewards/accuracies": 0.762499988079071, - "rewards/chosen": -2.3266615867614746, - "rewards/margins": 1.1787439584732056, - "rewards/rejected": -3.5054054260253906, + "logits/chosen": -1.939713716506958, + "logits/rejected": -1.8573747873306274, + "logps/chosen": -410.3163146972656, + "logps/rejected": -483.0690002441406, + "loss": 0.4874, + "rewards/accuracies": 0.7562500238418579, + "rewards/chosen": -1.1922686100006104, + "rewards/margins": 0.9466792941093445, + "rewards/rejected": -2.1389479637145996, "step": 2610 }, { "epoch": 0.69, - "grad_norm": 25.0, + "grad_norm": 18.625, "learning_rate": 1.3603446777675665e-06, - "logits/chosen": -1.296796441078186, - "logits/rejected": -1.1804838180541992, - "logps/chosen": -518.1197509765625, - "logps/rejected": -606.1761474609375, - "loss": 0.5191, - "rewards/accuracies": 0.762499988079071, - "rewards/chosen": -2.5498392581939697, - "rewards/margins": 1.1014236211776733, - "rewards/rejected": -3.6512627601623535, + "logits/chosen": -1.8353694677352905, + "logits/rejected": -1.7406032085418701, + "logps/chosen": -430.1822814941406, + "logps/rejected": -488.7582092285156, + "loss": 0.5383, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -1.4513074159622192, + "rewards/margins": 0.8473714590072632, + "rewards/rejected": -2.2986786365509033, "step": 2620 }, { "epoch": 0.69, - "grad_norm": 8.0, + "grad_norm": 6.96875, "learning_rate": 1.3400596746385817e-06, - "logits/chosen": -1.4230577945709229, - "logits/rejected": -1.2577768564224243, - "logps/chosen": -517.9698486328125, - "logps/rejected": -597.4027099609375, - "loss": 0.5, - "rewards/accuracies": 0.7749999761581421, - "rewards/chosen": -2.4883761405944824, - "rewards/margins": 1.0656553506851196, - "rewards/rejected": -3.5540313720703125, + "logits/chosen": -1.92405104637146, + "logits/rejected": -1.7130537033081055, + "logps/chosen": -423.2511291503906, + "logps/rejected": -478.33294677734375, + "loss": 0.518, + "rewards/accuracies": 0.7437499761581421, + "rewards/chosen": -1.3243486881256104, + "rewards/margins": 0.8681432604789734, + "rewards/rejected": -2.1924920082092285, "step": 2630 }, { "epoch": 0.69, - "grad_norm": 8.1875, + "grad_norm": 8.875, "learning_rate": 1.3198715261929587e-06, - "logits/chosen": -1.383465051651001, - "logits/rejected": -1.2352216243743896, - "logps/chosen": -500.38348388671875, - "logps/rejected": -602.236572265625, - "loss": 0.4268, - "rewards/accuracies": 0.793749988079071, - "rewards/chosen": -2.6397364139556885, - "rewards/margins": 1.1547033786773682, - "rewards/rejected": -3.7944397926330566, + "logits/chosen": -1.888300895690918, + "logits/rejected": -1.7400954961776733, + "logps/chosen": -396.8492736816406, + "logps/rejected": -486.35400390625, + "loss": 0.4583, + "rewards/accuracies": 0.762499988079071, + "rewards/chosen": -1.449106216430664, + "rewards/margins": 0.9863926768302917, + "rewards/rejected": -2.4354991912841797, "step": 2640 }, { "epoch": 0.69, - "grad_norm": 8.0625, + "grad_norm": 5.28125, "learning_rate": 1.2997819181352823e-06, - "logits/chosen": -1.4076334238052368, - "logits/rejected": -1.253779649734497, - "logps/chosen": -545.2298583984375, - "logps/rejected": -658.158447265625, - "loss": 0.4235, - "rewards/accuracies": 0.78125, - "rewards/chosen": -2.48392391204834, - "rewards/margins": 1.3746708631515503, - "rewards/rejected": -3.858595371246338, + "logits/chosen": -1.9277763366699219, + "logits/rejected": -1.792421579360962, + "logps/chosen": -438.722900390625, + "logps/rejected": -527.2325439453125, + "loss": 0.4197, + "rewards/accuracies": 0.793749988079071, + "rewards/chosen": -1.1698882579803467, + "rewards/margins": 1.157183289527893, + "rewards/rejected": -2.3270716667175293, "step": 2650 }, { "epoch": 0.7, - "grad_norm": 23.125, + "grad_norm": 20.625, "learning_rate": 1.2797925279420454e-06, - "logits/chosen": -1.3712605237960815, - "logits/rejected": -1.2327440977096558, - "logps/chosen": -558.10205078125, - "logps/rejected": -669.1498413085938, - "loss": 0.4676, - "rewards/accuracies": 0.78125, - "rewards/chosen": -2.8626303672790527, - "rewards/margins": 1.2305885553359985, - "rewards/rejected": -4.093218803405762, + "logits/chosen": -1.9607807397842407, + "logits/rejected": -1.7774593830108643, + "logps/chosen": -437.16436767578125, + "logps/rejected": -518.04931640625, + "loss": 0.5167, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -1.4784574508666992, + "rewards/margins": 0.9173187017440796, + "rewards/rejected": -2.3957762718200684, "step": 2660 }, { "epoch": 0.7, - "grad_norm": 12.5, + "grad_norm": 11.8125, "learning_rate": 1.2599050247215764e-06, - "logits/chosen": -1.3040002584457397, - "logits/rejected": -1.2008647918701172, - "logps/chosen": -549.0474853515625, - "logps/rejected": -647.370849609375, - "loss": 0.4787, - "rewards/accuracies": 0.793749988079071, - "rewards/chosen": -2.9114413261413574, - "rewards/margins": 1.2136096954345703, - "rewards/rejected": -4.125051021575928, + "logits/chosen": -1.871718406677246, + "logits/rejected": -1.7832616567611694, + "logps/chosen": -417.762451171875, + "logps/rejected": -488.9070739746094, + "loss": 0.4824, + "rewards/accuracies": 0.8062499761581421, + "rewards/chosen": -1.3971645832061768, + "rewards/margins": 0.9499279260635376, + "rewards/rejected": -2.347092390060425, "step": 2670 }, { "epoch": 0.7, - "grad_norm": 12.9375, + "grad_norm": 9.4375, "learning_rate": 1.2401210690746705e-06, - "logits/chosen": -1.332764744758606, - "logits/rejected": -1.1813501119613647, - "logps/chosen": -552.0095825195312, - "logps/rejected": -633.2657470703125, - "loss": 0.5028, + "logits/chosen": -1.9106123447418213, + "logits/rejected": -1.728986382484436, + "logps/chosen": -418.19207763671875, + "logps/rejected": -467.6233825683594, + "loss": 0.5374, "rewards/accuracies": 0.75, - "rewards/chosen": -2.8489394187927246, - "rewards/margins": 1.1449158191680908, - "rewards/rejected": -3.9938557147979736, + "rewards/chosen": -1.2990198135375977, + "rewards/margins": 0.8474353551864624, + "rewards/rejected": -2.1464550495147705, "step": 2680 }, { "epoch": 0.7, - "grad_norm": 11.75, + "grad_norm": 8.25, "learning_rate": 1.2204423129559306e-06, - "logits/chosen": -1.3755300045013428, - "logits/rejected": -1.3134779930114746, - "logps/chosen": -533.4794921875, - "logps/rejected": -646.0863647460938, - "loss": 0.4949, - "rewards/accuracies": 0.7437499761581421, - "rewards/chosen": -2.6803555488586426, - "rewards/margins": 1.1670808792114258, - "rewards/rejected": -3.8474364280700684, + "logits/chosen": -1.945713758468628, + "logits/rejected": -1.860364556312561, + "logps/chosen": -415.3990173339844, + "logps/rejected": -502.76605224609375, + "loss": 0.5061, + "rewards/accuracies": 0.7562500238418579, + "rewards/chosen": -1.2963026762008667, + "rewards/margins": 0.9278820753097534, + "rewards/rejected": -2.22418475151062, "step": 2690 }, { "epoch": 0.71, - "grad_norm": 15.375, + "grad_norm": 13.625, "learning_rate": 1.20087039953583e-06, - "logits/chosen": -1.39198899269104, - "logits/rejected": -1.268708348274231, - "logps/chosen": -519.9878540039062, - "logps/rejected": -613.4864501953125, - "loss": 0.5242, - "rewards/accuracies": 0.75, - "rewards/chosen": -2.558593273162842, - "rewards/margins": 1.1980977058410645, - "rewards/rejected": -3.7566914558410645, + "logits/chosen": -1.9242095947265625, + "logits/rejected": -1.7687902450561523, + "logps/chosen": -412.12188720703125, + "logps/rejected": -479.70123291015625, + "loss": 0.4946, + "rewards/accuracies": 0.78125, + "rewards/chosen": -1.2817579507827759, + "rewards/margins": 0.9811213612556458, + "rewards/rejected": -2.2628793716430664, "step": 2700 }, { "epoch": 0.71, - "eval_logits/chosen": -1.278761386871338, - "eval_logits/rejected": -1.1556262969970703, - "eval_logps/chosen": -523.679443359375, - "eval_logps/rejected": -614.7410278320312, - "eval_loss": 0.4932706654071808, - "eval_rewards/accuracies": 0.7390000224113464, - "eval_rewards/chosen": -2.590519905090332, - "eval_rewards/margins": 1.1109861135482788, - "eval_rewards/rejected": -3.7015058994293213, - "eval_runtime": 786.8821, - "eval_samples_per_second": 2.542, - "eval_steps_per_second": 0.318, + "eval_logits/chosen": -1.9339491128921509, + "eval_logits/rejected": -1.8200063705444336, + "eval_logps/chosen": -419.6712951660156, + "eval_logps/rejected": -487.4803771972656, + "eval_loss": 0.5093861818313599, + "eval_rewards/accuracies": 0.7300000190734863, + "eval_rewards/chosen": -1.345369815826416, + "eval_rewards/margins": 0.896984338760376, + "eval_rewards/rejected": -2.242354154586792, + "eval_runtime": 783.2364, + "eval_samples_per_second": 2.554, + "eval_steps_per_second": 0.319, "step": 2700 }, { "epoch": 0.71, - "grad_norm": 13.1875, + "grad_norm": 9.0, "learning_rate": 1.181406963063507e-06, - "logits/chosen": -1.3122459650039673, - "logits/rejected": -1.2601219415664673, - "logps/chosen": -514.85595703125, - "logps/rejected": -623.5354614257812, - "loss": 0.4996, - "rewards/accuracies": 0.737500011920929, - "rewards/chosen": -2.4907000064849854, - "rewards/margins": 1.0934088230133057, - "rewards/rejected": -3.584108829498291, + "logits/chosen": -1.852378487586975, + "logits/rejected": -1.8395885229110718, + "logps/chosen": -417.3213806152344, + "logps/rejected": -499.66241455078125, + "loss": 0.5247, + "rewards/accuracies": 0.6812499761581421, + "rewards/chosen": -1.254032850265503, + "rewards/margins": 0.9126483798027039, + "rewards/rejected": -2.1666812896728516, "step": 2710 }, { "epoch": 0.71, - "grad_norm": 6.34375, + "grad_norm": 7.53125, "learning_rate": 1.1620536287303052e-06, - "logits/chosen": -1.420472502708435, - "logits/rejected": -1.2871911525726318, - "logps/chosen": -533.7322998046875, - "logps/rejected": -595.6099853515625, - "loss": 0.5347, - "rewards/accuracies": 0.7437499761581421, - "rewards/chosen": -2.426419973373413, - "rewards/margins": 0.9205582737922668, - "rewards/rejected": -3.3469784259796143, + "logits/chosen": -1.9413061141967773, + "logits/rejected": -1.761757493019104, + "logps/chosen": -443.94256591796875, + "logps/rejected": -479.33001708984375, + "loss": 0.5708, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -1.3163220882415771, + "rewards/margins": 0.6949169635772705, + "rewards/rejected": -2.0112390518188477, "step": 2720 }, { "epoch": 0.71, - "grad_norm": 9.5, + "grad_norm": 9.3125, "learning_rate": 1.1428120125340717e-06, - "logits/chosen": -1.3559473752975464, - "logits/rejected": -1.2113393545150757, - "logps/chosen": -484.11016845703125, - "logps/rejected": -594.2417602539062, - "loss": 0.3924, - "rewards/accuracies": 0.824999988079071, - "rewards/chosen": -2.300110340118408, - "rewards/margins": 1.5157711505889893, - "rewards/rejected": -3.8158810138702393, + "logits/chosen": -1.8692998886108398, + "logits/rejected": -1.7310359477996826, + "logps/chosen": -402.96820068359375, + "logps/rejected": -472.52716064453125, + "loss": 0.4338, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -1.2012834548950195, + "rewards/margins": 1.200130820274353, + "rewards/rejected": -2.401414394378662, "step": 2730 }, { "epoch": 0.72, - "grad_norm": 9.75, + "grad_norm": 7.625, "learning_rate": 1.123683721144223e-06, - "logits/chosen": -1.3640830516815186, - "logits/rejected": -1.260825753211975, - "logps/chosen": -518.4991455078125, - "logps/rejected": -619.27783203125, - "loss": 0.4397, - "rewards/accuracies": 0.737500011920929, - "rewards/chosen": -2.3486995697021484, - "rewards/margins": 1.3310325145721436, - "rewards/rejected": -3.67973256111145, + "logits/chosen": -1.8488807678222656, + "logits/rejected": -1.749967336654663, + "logps/chosen": -426.597412109375, + "logps/rejected": -500.22113037109375, + "loss": 0.4657, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.22300386428833, + "rewards/margins": 1.0593169927597046, + "rewards/rejected": -2.282320976257324, "step": 2740 }, { "epoch": 0.72, - "grad_norm": 6.625, + "grad_norm": 7.375, "learning_rate": 1.1046703517675848e-06, - "logits/chosen": -1.3874989748001099, - "logits/rejected": -1.305215835571289, - "logps/chosen": -490.18426513671875, - "logps/rejected": -601.5721435546875, - "loss": 0.5104, - "rewards/accuracies": 0.731249988079071, - "rewards/chosen": -2.351259231567383, - "rewards/margins": 1.0526962280273438, - "rewards/rejected": -3.4039554595947266, + "logits/chosen": -1.9238160848617554, + "logits/rejected": -1.8325822353363037, + "logps/chosen": -397.0641174316406, + "logps/rejected": -492.7923889160156, + "loss": 0.5064, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -1.2281832695007324, + "rewards/margins": 0.9004761576652527, + "rewards/rejected": -2.128659248352051, "step": 2750 }, { "epoch": 0.72, - "grad_norm": 11.0, + "grad_norm": 9.5625, "learning_rate": 1.085773492015028e-06, - "logits/chosen": -1.379077434539795, - "logits/rejected": -1.2084681987762451, - "logps/chosen": -476.3445739746094, - "logps/rejected": -564.0722045898438, - "loss": 0.4448, - "rewards/accuracies": 0.768750011920929, - "rewards/chosen": -2.3004708290100098, - "rewards/margins": 1.2197654247283936, - "rewards/rejected": -3.5202362537384033, + "logits/chosen": -1.8633310794830322, + "logits/rejected": -1.6645698547363281, + "logps/chosen": -390.46405029296875, + "logps/rejected": -457.5859375, + "loss": 0.4611, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": -1.2341833114624023, + "rewards/margins": 1.0489164590835571, + "rewards/rejected": -2.28309965133667, "step": 2760 }, { "epoch": 0.72, - "grad_norm": 11.375, + "grad_norm": 9.5625, "learning_rate": 1.0669947197689034e-06, - "logits/chosen": -1.3544801473617554, - "logits/rejected": -1.2332212924957275, - "logps/chosen": -513.3738403320312, - "logps/rejected": -589.9592895507812, - "loss": 0.5029, - "rewards/accuracies": 0.731249988079071, - "rewards/chosen": -2.408294677734375, - "rewards/margins": 1.052453875541687, - "rewards/rejected": -3.4607481956481934, + "logits/chosen": -1.828849196434021, + "logits/rejected": -1.7827651500701904, + "logps/chosen": -433.4454040527344, + "logps/rejected": -493.23187255859375, + "loss": 0.5096, + "rewards/accuracies": 0.7562500238418579, + "rewards/chosen": -1.3995469808578491, + "rewards/margins": 0.8646076321601868, + "rewards/rejected": -2.2641544342041016, "step": 2770 }, { "epoch": 0.73, "grad_norm": 8.625, "learning_rate": 1.048335603051291e-06, - "logits/chosen": -1.3504235744476318, - "logits/rejected": -1.2152540683746338, - "logps/chosen": -541.3975830078125, - "logps/rejected": -645.4278564453125, - "loss": 0.4346, - "rewards/accuracies": 0.7875000238418579, - "rewards/chosen": -2.536367416381836, - "rewards/margins": 1.3222835063934326, - "rewards/rejected": -3.8586506843566895, + "logits/chosen": -1.815760612487793, + "logits/rejected": -1.668330192565918, + "logps/chosen": -463.34033203125, + "logps/rejected": -534.0313720703125, + "loss": 0.452, + "rewards/accuracies": 0.8125, + "rewards/chosen": -1.546386480331421, + "rewards/margins": 1.0244462490081787, + "rewards/rejected": -2.5708324909210205, "step": 2780 }, { "epoch": 0.73, - "grad_norm": 10.1875, + "grad_norm": 9.4375, "learning_rate": 1.0297976998930665e-06, - "logits/chosen": -1.3515374660491943, - "logits/rejected": -1.242642879486084, - "logps/chosen": -500.7738342285156, - "logps/rejected": -608.32861328125, - "loss": 0.4494, - "rewards/accuracies": 0.768750011920929, - "rewards/chosen": -2.402224540710449, - "rewards/margins": 1.3533068895339966, - "rewards/rejected": -3.7555313110351562, + "logits/chosen": -1.82321298122406, + "logits/rejected": -1.7483196258544922, + "logps/chosen": -437.36407470703125, + "logps/rejected": -515.2645263671875, + "loss": 0.4615, + "rewards/accuracies": 0.737500011920929, + "rewards/chosen": -1.430606484413147, + "rewards/margins": 1.13131844997406, + "rewards/rejected": -2.561924695968628, "step": 2790 }, { "epoch": 0.73, - "grad_norm": 8.625, + "grad_norm": 7.6875, "learning_rate": 1.0113825582038078e-06, - "logits/chosen": -1.3841054439544678, - "logits/rejected": -1.2714396715164185, - "logps/chosen": -513.6281127929688, - "logps/rejected": -610.97802734375, - "loss": 0.4557, + "logits/chosen": -1.86112380027771, + "logits/rejected": -1.7646338939666748, + "logps/chosen": -427.6062927246094, + "logps/rejected": -500.8868103027344, + "loss": 0.5054, "rewards/accuracies": 0.7749999761581421, - "rewards/chosen": -2.4843039512634277, - "rewards/margins": 1.1403610706329346, - "rewards/rejected": -3.624664783477783, + "rewards/chosen": -1.4373871088027954, + "rewards/margins": 0.9105140566825867, + "rewards/rejected": -2.3479011058807373, "step": 2800 }, { "epoch": 0.73, - "eval_logits/chosen": -1.3008809089660645, - "eval_logits/rejected": -1.178100347518921, - "eval_logps/chosen": -511.7322998046875, - "eval_logps/rejected": -604.080810546875, - "eval_loss": 0.4921375811100006, - "eval_rewards/accuracies": 0.7400000095367432, - "eval_rewards/chosen": -2.471048355102539, - "eval_rewards/margins": 1.123855710029602, - "eval_rewards/rejected": -3.5949041843414307, - "eval_runtime": 784.7637, - "eval_samples_per_second": 2.549, + "eval_logits/chosen": -1.918005347251892, + "eval_logits/rejected": -1.8041657209396362, + "eval_logps/chosen": -425.96136474609375, + "eval_logps/rejected": -495.76287841796875, + "eval_loss": 0.5085317492485046, + "eval_rewards/accuracies": 0.7319999933242798, + "eval_rewards/chosen": -1.4082709550857544, + "eval_rewards/margins": 0.9169085621833801, + "eval_rewards/rejected": -2.3251795768737793, + "eval_runtime": 782.9743, + "eval_samples_per_second": 2.554, "eval_steps_per_second": 0.319, "step": 2800 }, { "epoch": 0.74, - "grad_norm": 9.1875, + "grad_norm": 9.75, "learning_rate": 9.930917156425477e-07, - "logits/chosen": -1.366546630859375, - "logits/rejected": -1.2530113458633423, - "logps/chosen": -527.7960205078125, - "logps/rejected": -628.1170654296875, - "loss": 0.5534, - "rewards/accuracies": 0.7562500238418579, - "rewards/chosen": -2.667004346847534, - "rewards/margins": 1.0318858623504639, - "rewards/rejected": -3.6988906860351562, + "logits/chosen": -1.88894522190094, + "logits/rejected": -1.7312339544296265, + "logps/chosen": -430.57806396484375, + "logps/rejected": -514.3462524414062, + "loss": 0.5414, + "rewards/accuracies": 0.7437499761581421, + "rewards/chosen": -1.4730045795440674, + "rewards/margins": 0.888375461101532, + "rewards/rejected": -2.361380100250244, "step": 2810 }, { "epoch": 0.74, - "grad_norm": 20.375, + "grad_norm": 9.9375, "learning_rate": 9.749266994893756e-07, - "logits/chosen": -1.292289137840271, - "logits/rejected": -1.173458456993103, - "logps/chosen": -490.67889404296875, - "logps/rejected": -565.9766845703125, - "loss": 0.5627, - "rewards/accuracies": 0.75, - "rewards/chosen": -2.5006136894226074, - "rewards/margins": 0.8870899081230164, - "rewards/rejected": -3.3877034187316895, + "logits/chosen": -1.753129243850708, + "logits/rejected": -1.7074140310287476, + "logps/chosen": -392.9122619628906, + "logps/rejected": -458.78887939453125, + "loss": 0.57, + "rewards/accuracies": 0.731249988079071, + "rewards/chosen": -1.3587015867233276, + "rewards/margins": 0.7806506156921387, + "rewards/rejected": -2.139352321624756, "step": 2820 }, { "epoch": 0.74, - "grad_norm": 14.0625, + "grad_norm": 12.875, "learning_rate": 9.56889026517913e-07, - "logits/chosen": -1.3482143878936768, - "logits/rejected": -1.2396175861358643, - "logps/chosen": -520.6595458984375, - "logps/rejected": -597.5631713867188, - "loss": 0.5159, - "rewards/accuracies": 0.7124999761581421, - "rewards/chosen": -2.593552350997925, - "rewards/margins": 1.0291521549224854, - "rewards/rejected": -3.6227047443389893, + "logits/chosen": -1.8339742422103882, + "logits/rejected": -1.7476985454559326, + "logps/chosen": -428.970703125, + "logps/rejected": -468.741455078125, + "loss": 0.5567, + "rewards/accuracies": 0.668749988079071, + "rewards/chosen": -1.4643380641937256, + "rewards/margins": 0.7202334403991699, + "rewards/rejected": -2.1845715045928955, "step": 2830 }, { "epoch": 0.74, - "grad_norm": 7.0625, + "grad_norm": 7.9375, "learning_rate": 9.389802028686617e-07, - "logits/chosen": -1.4297748804092407, - "logits/rejected": -1.3320245742797852, - "logps/chosen": -507.13232421875, - "logps/rejected": -551.9314575195312, - "loss": 0.5765, - "rewards/accuracies": 0.737500011920929, - "rewards/chosen": -2.46412992477417, - "rewards/margins": 0.7776277661323547, - "rewards/rejected": -3.2417571544647217, + "logits/chosen": -1.8853356838226318, + "logits/rejected": -1.784654974937439, + "logps/chosen": -423.21673583984375, + "logps/rejected": -450.36773681640625, + "loss": 0.5783, + "rewards/accuracies": 0.7437499761581421, + "rewards/chosen": -1.4194176197052002, + "rewards/margins": 0.6203545331954956, + "rewards/rejected": -2.0397722721099854, "step": 2840 }, { "epoch": 0.75, - "grad_norm": 9.9375, + "grad_norm": 5.78125, "learning_rate": 9.212017239232427e-07, - "logits/chosen": -1.372556447982788, - "logits/rejected": -1.2070673704147339, - "logps/chosen": -507.5038146972656, - "logps/rejected": -604.3282470703125, - "loss": 0.4644, - "rewards/accuracies": 0.768750011920929, - "rewards/chosen": -2.3459770679473877, - "rewards/margins": 1.1976208686828613, - "rewards/rejected": -3.54359769821167, + "logits/chosen": -1.907448172569275, + "logits/rejected": -1.665366768836975, + "logps/chosen": -412.2789001464844, + "logps/rejected": -495.1942443847656, + "loss": 0.4473, + "rewards/accuracies": 0.793749988079071, + "rewards/chosen": -1.196616291999817, + "rewards/margins": 1.0811289548873901, + "rewards/rejected": -2.277745246887207, "step": 2850 }, { "epoch": 0.75, - "grad_norm": 10.3125, + "grad_norm": 6.9375, "learning_rate": 9.03555074179533e-07, - "logits/chosen": -1.3323904275894165, - "logits/rejected": -1.3141463994979858, - "logps/chosen": -486.00225830078125, - "logps/rejected": -611.860107421875, - "loss": 0.452, - "rewards/accuracies": 0.768750011920929, - "rewards/chosen": -2.2929673194885254, - "rewards/margins": 1.200028896331787, - "rewards/rejected": -3.4929962158203125, + "logits/chosen": -1.8094466924667358, + "logits/rejected": -1.8180068731307983, + "logps/chosen": -398.2765808105469, + "logps/rejected": -497.26824951171875, + "loss": 0.4864, + "rewards/accuracies": 0.7562500238418579, + "rewards/chosen": -1.2072433233261108, + "rewards/margins": 0.9678330421447754, + "rewards/rejected": -2.175076484680176, "step": 2860 }, { "epoch": 0.75, - "grad_norm": 10.6875, + "grad_norm": 7.15625, "learning_rate": 8.860417271277067e-07, - "logits/chosen": -1.454944372177124, - "logits/rejected": -1.4230549335479736, - "logps/chosen": -502.74847412109375, - "logps/rejected": -583.6201782226562, - "loss": 0.4977, - "rewards/accuracies": 0.75, - "rewards/chosen": -2.33546781539917, - "rewards/margins": 0.8885356187820435, - "rewards/rejected": -3.224003553390503, + "logits/chosen": -1.9343254566192627, + "logits/rejected": -1.969805359840393, + "logps/chosen": -407.0105285644531, + "logps/rejected": -468.46826171875, + "loss": 0.5046, + "rewards/accuracies": 0.768750011920929, + "rewards/chosen": -1.1932955980300903, + "rewards/margins": 0.7251569628715515, + "rewards/rejected": -1.9184526205062866, "step": 2870 }, { "epoch": 0.75, - "grad_norm": 8.75, + "grad_norm": 10.0, "learning_rate": 8.686631451272029e-07, - "logits/chosen": -1.4279208183288574, - "logits/rejected": -1.2692053318023682, - "logps/chosen": -504.5962829589844, - "logps/rejected": -593.9146728515625, - "loss": 0.4974, - "rewards/accuracies": 0.706250011920929, - "rewards/chosen": -2.5120105743408203, - "rewards/margins": 1.1478073596954346, - "rewards/rejected": -3.6598174571990967, + "logits/chosen": -1.9621702432632446, + "logits/rejected": -1.7568639516830444, + "logps/chosen": -402.2386779785156, + "logps/rejected": -468.0703125, + "loss": 0.5113, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.2953293323516846, + "rewards/margins": 0.9019441604614258, + "rewards/rejected": -2.1972732543945312, "step": 2880 }, { "epoch": 0.76, - "grad_norm": 7.15625, + "grad_norm": 6.75, "learning_rate": 8.514207792846168e-07, - "logits/chosen": -1.4306687116622925, - "logits/rejected": -1.3072850704193115, - "logps/chosen": -498.3331604003906, - "logps/rejected": -580.2415771484375, - "loss": 0.4904, - "rewards/accuracies": 0.731249988079071, - "rewards/chosen": -2.475101947784424, - "rewards/margins": 1.1032798290252686, - "rewards/rejected": -3.5783820152282715, + "logits/chosen": -1.8871433734893799, + "logits/rejected": -1.749696969985962, + "logps/chosen": -405.40374755859375, + "logps/rejected": -461.99078369140625, + "loss": 0.4967, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.3629052639007568, + "rewards/margins": 0.8758969306945801, + "rewards/rejected": -2.238802433013916, "step": 2890 }, { "epoch": 0.76, - "grad_norm": 8.3125, + "grad_norm": 5.3125, "learning_rate": 8.343160693325356e-07, - "logits/chosen": -1.329193353652954, - "logits/rejected": -1.215259313583374, - "logps/chosen": -514.7459716796875, - "logps/rejected": -618.2096557617188, - "loss": 0.523, - "rewards/accuracies": 0.699999988079071, - "rewards/chosen": -2.592334747314453, - "rewards/margins": 1.077820062637329, - "rewards/rejected": -3.670154571533203, + "logits/chosen": -1.8498756885528564, + "logits/rejected": -1.7222645282745361, + "logps/chosen": -402.5791931152344, + "logps/rejected": -493.0008239746094, + "loss": 0.5159, + "rewards/accuracies": 0.71875, + "rewards/chosen": -1.2853260040283203, + "rewards/margins": 0.957757294178009, + "rewards/rejected": -2.2430832386016846, "step": 2900 }, { "epoch": 0.76, - "eval_logits/chosen": -1.3049767017364502, - "eval_logits/rejected": -1.1831483840942383, - "eval_logps/chosen": -520.3427734375, - "eval_logps/rejected": -608.647216796875, - "eval_loss": 0.489922434091568, - "eval_rewards/accuracies": 0.7434999942779541, - "eval_rewards/chosen": -2.5571529865264893, - "eval_rewards/margins": 1.0834143161773682, - "eval_rewards/rejected": -3.640566825866699, - "eval_runtime": 784.9976, - "eval_samples_per_second": 2.548, - "eval_steps_per_second": 0.318, + "eval_logits/chosen": -1.9330179691314697, + "eval_logits/rejected": -1.8193044662475586, + "eval_logps/chosen": -419.8022155761719, + "eval_logps/rejected": -486.5226745605469, + "eval_loss": 0.5065543055534363, + "eval_rewards/accuracies": 0.7319999933242798, + "eval_rewards/chosen": -1.346679449081421, + "eval_rewards/margins": 0.8860979676246643, + "eval_rewards/rejected": -2.2327773571014404, + "eval_runtime": 784.023, + "eval_samples_per_second": 2.551, + "eval_steps_per_second": 0.319, "step": 2900 }, { "epoch": 0.76, - "grad_norm": 8.8125, + "grad_norm": 7.375, "learning_rate": 8.173504435093174e-07, - "logits/chosen": -1.330748200416565, - "logits/rejected": -1.152014970779419, - "logps/chosen": -493.8155212402344, - "logps/rejected": -579.8218994140625, - "loss": 0.4803, - "rewards/accuracies": 0.768750011920929, - "rewards/chosen": -2.5229828357696533, - "rewards/margins": 1.1812492609024048, - "rewards/rejected": -3.7042319774627686, + "logits/chosen": -1.8513338565826416, + "logits/rejected": -1.6518259048461914, + "logps/chosen": -398.8111877441406, + "logps/rejected": -456.32086181640625, + "loss": 0.4881, + "rewards/accuracies": 0.7437499761581421, + "rewards/chosen": -1.3139582872390747, + "rewards/margins": 0.9640085101127625, + "rewards/rejected": -2.2779667377471924, "step": 2910 }, { "epoch": 0.76, - "grad_norm": 7.46875, + "grad_norm": 7.21875, "learning_rate": 8.00525318439836e-07, - "logits/chosen": -1.3724451065063477, - "logits/rejected": -1.2252975702285767, - "logps/chosen": -531.9354248046875, - "logps/rejected": -615.12841796875, - "loss": 0.5378, - "rewards/accuracies": 0.762499988079071, - "rewards/chosen": -2.551813840866089, - "rewards/margins": 0.9076172709465027, - "rewards/rejected": -3.4594311714172363, + "logits/chosen": -1.9344298839569092, + "logits/rejected": -1.7016353607177734, + "logps/chosen": -431.385498046875, + "logps/rejected": -499.0389099121094, + "loss": 0.5575, + "rewards/accuracies": 0.737500011920929, + "rewards/chosen": -1.3274246454238892, + "rewards/margins": 0.7691916227340698, + "rewards/rejected": -2.09661602973938, "step": 2920 }, { "epoch": 0.77, - "grad_norm": 7.09375, + "grad_norm": 7.90625, "learning_rate": 7.838420990171927e-07, - "logits/chosen": -1.4560999870300293, - "logits/rejected": -1.292479395866394, - "logps/chosen": -512.5706176757812, - "logps/rejected": -587.8864135742188, - "loss": 0.5081, - "rewards/accuracies": 0.731249988079071, - "rewards/chosen": -2.4322078227996826, - "rewards/margins": 1.0088894367218018, - "rewards/rejected": -3.4410972595214844, + "logits/chosen": -1.9444793462753296, + "logits/rejected": -1.7388956546783447, + "logps/chosen": -421.0689392089844, + "logps/rejected": -473.1524353027344, + "loss": 0.5226, + "rewards/accuracies": 0.7437499761581421, + "rewards/chosen": -1.3027684688568115, + "rewards/margins": 0.813397228717804, + "rewards/rejected": -2.1161653995513916, "step": 2930 }, { "epoch": 0.77, - "grad_norm": 11.9375, + "grad_norm": 7.15625, "learning_rate": 7.673021782854084e-07, - "logits/chosen": -1.3237154483795166, - "logits/rejected": -1.1798452138900757, - "logps/chosen": -513.1326904296875, - "logps/rejected": -593.04150390625, - "loss": 0.4693, - "rewards/accuracies": 0.768750011920929, - "rewards/chosen": -2.524876594543457, - "rewards/margins": 1.2180140018463135, - "rewards/rejected": -3.7428905963897705, + "logits/chosen": -1.836850881576538, + "logits/rejected": -1.6611976623535156, + "logps/chosen": -419.4012756347656, + "logps/rejected": -462.13726806640625, + "loss": 0.4885, + "rewards/accuracies": 0.762499988079071, + "rewards/chosen": -1.3087483644485474, + "rewards/margins": 0.983415961265564, + "rewards/rejected": -2.2921640872955322, "step": 2940 }, { "epoch": 0.77, - "grad_norm": 9.1875, + "grad_norm": 5.625, "learning_rate": 7.509069373231039e-07, - "logits/chosen": -1.3298399448394775, - "logits/rejected": -1.2147536277770996, - "logps/chosen": -515.6809692382812, - "logps/rejected": -578.0634765625, - "loss": 0.5569, + "logits/chosen": -1.8366082906723022, + "logits/rejected": -1.7366777658462524, + "logps/chosen": -413.8230895996094, + "logps/rejected": -465.357177734375, + "loss": 0.5603, "rewards/accuracies": 0.699999988079071, - "rewards/chosen": -2.6305928230285645, - "rewards/margins": 0.8716634511947632, - "rewards/rejected": -3.502256393432617, + "rewards/chosen": -1.4296019077301025, + "rewards/margins": 0.7154251337051392, + "rewards/rejected": -2.145026922225952, "step": 2950 }, { "epoch": 0.77, - "grad_norm": 7.75, + "grad_norm": 5.84375, "learning_rate": 7.346577451281822e-07, - "logits/chosen": -1.3448503017425537, - "logits/rejected": -1.2580183744430542, - "logps/chosen": -514.1692504882812, - "logps/rejected": -614.6067504882812, - "loss": 0.4651, - "rewards/accuracies": 0.75, - "rewards/chosen": -2.5198423862457275, - "rewards/margins": 1.2452865839004517, - "rewards/rejected": -3.7651286125183105, + "logits/chosen": -1.8811330795288086, + "logits/rejected": -1.754575490951538, + "logps/chosen": -425.0521545410156, + "logps/rejected": -499.4877014160156, + "loss": 0.4909, + "rewards/accuracies": 0.7562500238418579, + "rewards/chosen": -1.4068305492401123, + "rewards/margins": 0.9745821952819824, + "rewards/rejected": -2.381412982940674, "step": 2960 }, { "epoch": 0.78, - "grad_norm": 11.1875, + "grad_norm": 10.875, "learning_rate": 7.185559585035138e-07, - "logits/chosen": -1.3795948028564453, - "logits/rejected": -1.222127079963684, - "logps/chosen": -550.9356079101562, - "logps/rejected": -646.4664306640625, - "loss": 0.4803, - "rewards/accuracies": 0.7124999761581421, - "rewards/chosen": -2.682473659515381, - "rewards/margins": 1.1136572360992432, - "rewards/rejected": -3.796131134033203, + "logits/chosen": -1.8809776306152344, + "logits/rejected": -1.7219616174697876, + "logps/chosen": -449.704833984375, + "logps/rejected": -526.0900268554688, + "loss": 0.4946, + "rewards/accuracies": 0.71875, + "rewards/chosen": -1.4797046184539795, + "rewards/margins": 0.9321640729904175, + "rewards/rejected": -2.4118688106536865, "step": 2970 }, { "epoch": 0.78, - "grad_norm": 7.625, + "grad_norm": 10.375, "learning_rate": 7.026029219436504e-07, - "logits/chosen": -1.4066416025161743, - "logits/rejected": -1.2424472570419312, - "logps/chosen": -509.71185302734375, - "logps/rejected": -613.3909912109375, - "loss": 0.4757, - "rewards/accuracies": 0.7437499761581421, - "rewards/chosen": -2.57428240776062, - "rewards/margins": 1.1651995182037354, - "rewards/rejected": -3.7394821643829346, + "logits/chosen": -1.8558896780014038, + "logits/rejected": -1.67585027217865, + "logps/chosen": -418.6376953125, + "logps/rejected": -502.877685546875, + "loss": 0.4795, + "rewards/accuracies": 0.762499988079071, + "rewards/chosen": -1.4395673274993896, + "rewards/margins": 0.9882482290267944, + "rewards/rejected": -2.4278156757354736, "step": 2980 }, { "epoch": 0.78, - "grad_norm": 11.0, + "grad_norm": 9.1875, "learning_rate": 6.867999675225523e-07, - "logits/chosen": -1.4465783834457397, - "logits/rejected": -1.313063144683838, - "logps/chosen": -479.8519592285156, - "logps/rejected": -575.8267822265625, - "loss": 0.4872, - "rewards/accuracies": 0.737500011920929, - "rewards/chosen": -2.489804744720459, - "rewards/margins": 1.1018868684768677, - "rewards/rejected": -3.5916919708251953, + "logits/chosen": -1.9322134256362915, + "logits/rejected": -1.7906997203826904, + "logps/chosen": -387.1766662597656, + "logps/rejected": -465.6539001464844, + "loss": 0.4869, + "rewards/accuracies": 0.71875, + "rewards/chosen": -1.3697566986083984, + "rewards/margins": 0.9596658945083618, + "rewards/rejected": -2.3294224739074707, "step": 2990 }, { "epoch": 0.79, - "grad_norm": 9.0, + "grad_norm": 7.375, "learning_rate": 6.711484147823663e-07, - "logits/chosen": -1.3521897792816162, - "logits/rejected": -1.2767261266708374, - "logps/chosen": -477.41650390625, - "logps/rejected": -599.962646484375, - "loss": 0.4588, - "rewards/accuracies": 0.7562500238418579, - "rewards/chosen": -2.43296480178833, - "rewards/margins": 1.1764843463897705, - "rewards/rejected": -3.6094489097595215, + "logits/chosen": -1.8498642444610596, + "logits/rejected": -1.770777940750122, + "logps/chosen": -388.5807189941406, + "logps/rejected": -491.8318786621094, + "loss": 0.4671, + "rewards/accuracies": 0.768750011920929, + "rewards/chosen": -1.3586634397506714, + "rewards/margins": 0.9807540774345398, + "rewards/rejected": -2.3394176959991455, "step": 3000 }, { "epoch": 0.79, - "eval_logits/chosen": -1.3135889768600464, - "eval_logits/rejected": -1.1913836002349854, - "eval_logps/chosen": -521.3173828125, - "eval_logps/rejected": -606.716064453125, - "eval_loss": 0.489653617143631, - "eval_rewards/accuracies": 0.7415000200271606, - "eval_rewards/chosen": -2.566899061203003, - "eval_rewards/margins": 1.0543569326400757, - "eval_rewards/rejected": -3.621256113052368, - "eval_runtime": 780.2881, - "eval_samples_per_second": 2.563, - "eval_steps_per_second": 0.32, + "eval_logits/chosen": -1.927390217781067, + "eval_logits/rejected": -1.813964605331421, + "eval_logps/chosen": -427.0750732421875, + "eval_logps/rejected": -493.886474609375, + "eval_loss": 0.5061513781547546, + "eval_rewards/accuracies": 0.7325000166893005, + "eval_rewards/chosen": -1.4194074869155884, + "eval_rewards/margins": 0.8870080709457397, + "eval_rewards/rejected": -2.306415557861328, + "eval_runtime": 782.9041, + "eval_samples_per_second": 2.555, + "eval_steps_per_second": 0.319, "step": 3000 }, { "epoch": 0.79, - "grad_norm": 9.0625, + "grad_norm": 6.8125, "learning_rate": 6.556495706232413e-07, - "logits/chosen": -1.3601144552230835, - "logits/rejected": -1.263600468635559, - "logps/chosen": -534.0828857421875, - "logps/rejected": -612.0753173828125, - "loss": 0.5489, - "rewards/accuracies": 0.7250000238418579, - "rewards/chosen": -2.654869556427002, - "rewards/margins": 0.9636019468307495, - "rewards/rejected": -3.61847186088562, + "logits/chosen": -1.8801040649414062, + "logits/rejected": -1.712304711341858, + "logps/chosen": -434.16717529296875, + "logps/rejected": -491.79571533203125, + "loss": 0.557, + "rewards/accuracies": 0.71875, + "rewards/chosen": -1.439712405204773, + "rewards/margins": 0.7891451120376587, + "rewards/rejected": -2.2288575172424316, "step": 3010 }, { "epoch": 0.79, - "grad_norm": 10.8125, + "grad_norm": 7.90625, "learning_rate": 6.403047291942057e-07, - "logits/chosen": -1.2863237857818604, - "logits/rejected": -1.1349502801895142, - "logps/chosen": -485.25115966796875, - "logps/rejected": -566.3896484375, - "loss": 0.4941, - "rewards/accuracies": 0.7749999761581421, - "rewards/chosen": -2.59041166305542, - "rewards/margins": 1.0551263093948364, - "rewards/rejected": -3.645537853240967, + "logits/chosen": -1.7626889944076538, + "logits/rejected": -1.604828119277954, + "logps/chosen": -396.83819580078125, + "logps/rejected": -452.27740478515625, + "loss": 0.5308, + "rewards/accuracies": 0.7124999761581421, + "rewards/chosen": -1.5018559694290161, + "rewards/margins": 0.8321312665939331, + "rewards/rejected": -2.3339874744415283, "step": 3020 }, { "epoch": 0.79, - "grad_norm": 11.0, + "grad_norm": 12.875, "learning_rate": 6.251151717851023e-07, - "logits/chosen": -1.3493398427963257, - "logits/rejected": -1.2716810703277588, - "logps/chosen": -480.6705627441406, - "logps/rejected": -577.1141357421875, - "loss": 0.4844, - "rewards/accuracies": 0.731249988079071, - "rewards/chosen": -2.4731645584106445, - "rewards/margins": 1.0976300239562988, - "rewards/rejected": -3.5707945823669434, + "logits/chosen": -1.7991710901260376, + "logits/rejected": -1.764809012413025, + "logps/chosen": -398.560791015625, + "logps/rejected": -469.8736267089844, + "loss": 0.5408, + "rewards/accuracies": 0.7437499761581421, + "rewards/chosen": -1.4512755870819092, + "rewards/margins": 0.8837641477584839, + "rewards/rejected": -2.3350396156311035, "step": 3030 }, { "epoch": 0.8, - "grad_norm": 6.625, + "grad_norm": 4.75, "learning_rate": 6.100821667196041e-07, - "logits/chosen": -1.5315853357315063, - "logits/rejected": -1.2590696811676025, - "logps/chosen": -524.2052612304688, - "logps/rejected": -563.5902099609375, - "loss": 0.4941, - "rewards/accuracies": 0.75, - "rewards/chosen": -2.4863927364349365, - "rewards/margins": 1.048506498336792, - "rewards/rejected": -3.5348992347717285, + "logits/chosen": -2.0548505783081055, + "logits/rejected": -1.7320611476898193, + "logps/chosen": -432.7552185058594, + "logps/rejected": -450.3419494628906, + "loss": 0.5211, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -1.4007914066314697, + "rewards/margins": 0.8642256855964661, + "rewards/rejected": -2.26501727104187, "step": 3040 }, { "epoch": 0.8, - "grad_norm": 13.25, + "grad_norm": 7.15625, "learning_rate": 5.952069692493062e-07, - "logits/chosen": -1.3318599462509155, - "logits/rejected": -1.213030457496643, - "logps/chosen": -470.8360290527344, - "logps/rejected": -598.3742065429688, - "loss": 0.4165, - "rewards/accuracies": 0.8062499761581421, - "rewards/chosen": -2.4257140159606934, - "rewards/margins": 1.2411974668502808, - "rewards/rejected": -3.6669116020202637, + "logits/chosen": -1.7902101278305054, + "logits/rejected": -1.6391757726669312, + "logps/chosen": -375.85205078125, + "logps/rejected": -483.9964904785156, + "loss": 0.4403, + "rewards/accuracies": 0.768750011920929, + "rewards/chosen": -1.279217004776001, + "rewards/margins": 1.0462336540222168, + "rewards/rejected": -2.3254504203796387, "step": 3050 }, { "epoch": 0.8, - "grad_norm": 8.625, + "grad_norm": 8.0, "learning_rate": 5.80490821448918e-07, - "logits/chosen": -1.2763512134552002, - "logits/rejected": -1.282798409461975, - "logps/chosen": -507.8856506347656, - "logps/rejected": -682.7872314453125, - "loss": 0.4266, - "rewards/accuracies": 0.800000011920929, - "rewards/chosen": -2.454050064086914, - "rewards/margins": 1.3245913982391357, - "rewards/rejected": -3.77864146232605, + "logits/chosen": -1.6928142309188843, + "logits/rejected": -1.7527170181274414, + "logps/chosen": -416.954345703125, + "logps/rejected": -565.76953125, + "loss": 0.4596, + "rewards/accuracies": 0.78125, + "rewards/chosen": -1.3452516794204712, + "rewards/margins": 1.0599815845489502, + "rewards/rejected": -2.405233383178711, "step": 3060 }, { "epoch": 0.8, - "grad_norm": 9.6875, + "grad_norm": 6.6875, "learning_rate": 5.659349521125459e-07, - "logits/chosen": -1.4823614358901978, - "logits/rejected": -1.4227873086929321, - "logps/chosen": -526.6748046875, - "logps/rejected": -605.1229248046875, - "loss": 0.5139, - "rewards/accuracies": 0.731249988079071, - "rewards/chosen": -2.4531147480010986, - "rewards/margins": 0.9902750849723816, - "rewards/rejected": -3.443389415740967, + "logits/chosen": -1.9752018451690674, + "logits/rejected": -1.9083846807479858, + "logps/chosen": -435.5009765625, + "logps/rejected": -491.18365478515625, + "loss": 0.5258, + "rewards/accuracies": 0.768750011920929, + "rewards/chosen": -1.340466022491455, + "rewards/margins": 0.7914128303527832, + "rewards/rejected": -2.1318788528442383, "step": 3070 }, { "epoch": 0.81, - "grad_norm": 6.71875, + "grad_norm": 5.8125, "learning_rate": 5.5154057665109e-07, - "logits/chosen": -1.426896095275879, - "logits/rejected": -1.2738499641418457, - "logps/chosen": -527.0716552734375, - "logps/rejected": -619.157470703125, - "loss": 0.5019, - "rewards/accuracies": 0.7749999761581421, - "rewards/chosen": -2.662111282348633, - "rewards/margins": 1.185082197189331, - "rewards/rejected": -3.847193956375122, + "logits/chosen": -1.8967254161834717, + "logits/rejected": -1.712303876876831, + "logps/chosen": -424.0162658691406, + "logps/rejected": -495.65972900390625, + "loss": 0.4923, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -1.4567795991897583, + "rewards/margins": 0.9943565130233765, + "rewards/rejected": -2.4511361122131348, "step": 3080 }, { "epoch": 0.81, - "grad_norm": 8.4375, + "grad_norm": 5.25, "learning_rate": 5.373088969907586e-07, - "logits/chosen": -1.4556440114974976, - "logits/rejected": -1.2886602878570557, - "logps/chosen": -536.6363525390625, - "logps/rejected": -595.8131713867188, - "loss": 0.4589, - "rewards/accuracies": 0.762499988079071, - "rewards/chosen": -2.5982890129089355, - "rewards/margins": 1.0939452648162842, - "rewards/rejected": -3.692234516143799, + "logits/chosen": -1.9155843257904053, + "logits/rejected": -1.7840111255645752, + "logps/chosen": -438.22076416015625, + "logps/rejected": -466.3299865722656, + "loss": 0.479, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -1.3894039392471313, + "rewards/margins": 0.8652655482292175, + "rewards/rejected": -2.254669189453125, "step": 3090 }, { "epoch": 0.81, - "grad_norm": 9.9375, + "grad_norm": 6.8125, "learning_rate": 5.23241101472709e-07, - "logits/chosen": -1.372431993484497, - "logits/rejected": -1.2485543489456177, - "logps/chosen": -527.42236328125, - "logps/rejected": -604.1925048828125, - "loss": 0.5038, - "rewards/accuracies": 0.793749988079071, - "rewards/chosen": -2.511486530303955, - "rewards/margins": 0.9880231618881226, - "rewards/rejected": -3.499509811401367, + "logits/chosen": -1.8724768161773682, + "logits/rejected": -1.7467072010040283, + "logps/chosen": -429.61871337890625, + "logps/rejected": -504.12518310546875, + "loss": 0.4864, + "rewards/accuracies": 0.8125, + "rewards/chosen": -1.277355432510376, + "rewards/margins": 0.8578082919120789, + "rewards/rejected": -2.1351637840270996, "step": 3100 }, { "epoch": 0.81, - "eval_logits/chosen": -1.3089466094970703, - "eval_logits/rejected": -1.1865952014923096, - "eval_logps/chosen": -526.1104125976562, - "eval_logps/rejected": -615.688232421875, - "eval_loss": 0.4894368350505829, - "eval_rewards/accuracies": 0.7400000095367432, - "eval_rewards/chosen": -2.6148295402526855, - "eval_rewards/margins": 1.0961487293243408, - "eval_rewards/rejected": -3.7109785079956055, - "eval_runtime": 785.3317, - "eval_samples_per_second": 2.547, - "eval_steps_per_second": 0.318, + "eval_logits/chosen": -1.929145097732544, + "eval_logits/rejected": -1.815799593925476, + "eval_logps/chosen": -427.61724853515625, + "eval_logps/rejected": -494.0862731933594, + "eval_loss": 0.5058856010437012, + "eval_rewards/accuracies": 0.7329999804496765, + "eval_rewards/chosen": -1.4248294830322266, + "eval_rewards/margins": 0.8835842609405518, + "eval_rewards/rejected": -2.3084137439727783, + "eval_runtime": 783.2884, + "eval_samples_per_second": 2.553, + "eval_steps_per_second": 0.319, "step": 3100 }, { "epoch": 0.81, - "grad_norm": 7.1875, + "grad_norm": 6.03125, "learning_rate": 5.09338364753818e-07, - "logits/chosen": -1.4420716762542725, - "logits/rejected": -1.27860426902771, - "logps/chosen": -539.74853515625, - "logps/rejected": -632.0848999023438, - "loss": 0.5158, - "rewards/accuracies": 0.75, - "rewards/chosen": -2.543715715408325, - "rewards/margins": 1.0814507007598877, - "rewards/rejected": -3.625166416168213, + "logits/chosen": -1.8905067443847656, + "logits/rejected": -1.7481834888458252, + "logps/chosen": -439.99053955078125, + "logps/rejected": -510.75732421875, + "loss": 0.5265, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -1.3464457988739014, + "rewards/margins": 0.8832794427871704, + "rewards/rejected": -2.2297251224517822, "step": 3110 }, { "epoch": 0.82, - "grad_norm": 10.4375, + "grad_norm": 9.9375, "learning_rate": 4.956018477086005e-07, - "logits/chosen": -1.4062488079071045, - "logits/rejected": -1.2400996685028076, - "logps/chosen": -536.4059448242188, - "logps/rejected": -620.3006591796875, - "loss": 0.513, - "rewards/accuracies": 0.7250000238418579, - "rewards/chosen": -2.673593044281006, - "rewards/margins": 1.1122300624847412, - "rewards/rejected": -3.785823106765747, + "logits/chosen": -1.8537441492080688, + "logits/rejected": -1.6520248651504517, + "logps/chosen": -441.00018310546875, + "logps/rejected": -497.1783142089844, + "loss": 0.5475, + "rewards/accuracies": 0.6875, + "rewards/chosen": -1.4708020687103271, + "rewards/margins": 0.8641169667243958, + "rewards/rejected": -2.334918737411499, "step": 3120 }, { "epoch": 0.82, - "grad_norm": 11.6875, + "grad_norm": 7.84375, "learning_rate": 4.820326973322764e-07, - "logits/chosen": -1.3163243532180786, - "logits/rejected": -1.2418758869171143, - "logps/chosen": -526.8767700195312, - "logps/rejected": -622.9215087890625, - "loss": 0.5485, - "rewards/accuracies": 0.731249988079071, - "rewards/chosen": -2.7595295906066895, - "rewards/margins": 1.0204734802246094, - "rewards/rejected": -3.780003070831299, + "logits/chosen": -1.8311748504638672, + "logits/rejected": -1.7749812602996826, + "logps/chosen": -419.26483154296875, + "logps/rejected": -499.1497497558594, + "loss": 0.5304, + "rewards/accuracies": 0.7437499761581421, + "rewards/chosen": -1.489749789237976, + "rewards/margins": 0.8352252840995789, + "rewards/rejected": -2.3249752521514893, "step": 3130 }, { "epoch": 0.82, - "grad_norm": 10.4375, + "grad_norm": 7.21875, "learning_rate": 4.686320466449981e-07, - "logits/chosen": -1.3269009590148926, - "logits/rejected": -1.1425731182098389, - "logps/chosen": -494.65802001953125, - "logps/rejected": -630.2109375, - "loss": 0.4391, - "rewards/accuracies": 0.7875000238418579, - "rewards/chosen": -2.554276466369629, - "rewards/margins": 1.3776849508285522, - "rewards/rejected": -3.9319615364074707, + "logits/chosen": -1.8751367330551147, + "logits/rejected": -1.65691339969635, + "logps/chosen": -396.12567138671875, + "logps/rejected": -495.40386962890625, + "loss": 0.4698, + "rewards/accuracies": 0.768750011920929, + "rewards/chosen": -1.3601388931274414, + "rewards/margins": 1.032128095626831, + "rewards/rejected": -2.3922669887542725, "step": 3140 }, { "epoch": 0.82, - "grad_norm": 8.25, + "grad_norm": 6.5, "learning_rate": 4.554010145972418e-07, - "logits/chosen": -1.4709281921386719, - "logits/rejected": -1.2983810901641846, - "logps/chosen": -533.2330322265625, - "logps/rejected": -631.8088989257812, - "loss": 0.5445, - "rewards/accuracies": 0.6937500238418579, - "rewards/chosen": -2.6948599815368652, - "rewards/margins": 1.0983901023864746, - "rewards/rejected": -3.7932498455047607, + "logits/chosen": -1.9465984106063843, + "logits/rejected": -1.8031593561172485, + "logps/chosen": -430.3694763183594, + "logps/rejected": -502.8419494628906, + "loss": 0.5425, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -1.4778169393539429, + "rewards/margins": 0.8588212728500366, + "rewards/rejected": -2.3366379737854004, "step": 3150 }, { "epoch": 0.83, - "grad_norm": 7.84375, + "grad_norm": 6.53125, "learning_rate": 4.4234070597637455e-07, - "logits/chosen": -1.3381648063659668, - "logits/rejected": -1.2488067150115967, - "logps/chosen": -536.5142211914062, - "logps/rejected": -626.2984619140625, - "loss": 0.527, + "logits/chosen": -1.8906543254852295, + "logits/rejected": -1.8061383962631226, + "logps/chosen": -434.2395935058594, + "logps/rejected": -512.9703369140625, + "loss": 0.5175, "rewards/accuracies": 0.7562500238418579, - "rewards/chosen": -2.6102843284606934, - "rewards/margins": 1.0166431665420532, - "rewards/rejected": -3.626927137374878, + "rewards/chosen": -1.405390977859497, + "rewards/margins": 0.8955022692680359, + "rewards/rejected": -2.3008930683135986, "step": 3160 }, { "epoch": 0.83, - "grad_norm": 6.75, + "grad_norm": 5.90625, "learning_rate": 4.2945221131440783e-07, - "logits/chosen": -1.3066151142120361, - "logits/rejected": -1.1077501773834229, - "logps/chosen": -520.0422973632812, - "logps/rejected": -613.4268798828125, - "loss": 0.4346, + "logits/chosen": -1.796657919883728, + "logits/rejected": -1.600589394569397, + "logps/chosen": -431.2503356933594, + "logps/rejected": -495.09710693359375, + "loss": 0.4632, "rewards/accuracies": 0.8374999761581421, - "rewards/chosen": -2.544084072113037, - "rewards/margins": 1.250158667564392, - "rewards/rejected": -3.7942421436309814, + "rewards/chosen": -1.4193048477172852, + "rewards/margins": 0.9972556233406067, + "rewards/rejected": -2.416560649871826, "step": 3170 }, { "epoch": 0.83, - "grad_norm": 11.375, + "grad_norm": 6.3125, "learning_rate": 4.167366067969381e-07, - "logits/chosen": -1.3843923807144165, - "logits/rejected": -1.325184941291809, - "logps/chosen": -484.657470703125, - "logps/rejected": -607.08056640625, - "loss": 0.5088, - "rewards/accuracies": 0.706250011920929, - "rewards/chosen": -2.6174612045288086, - "rewards/margins": 0.9863953590393066, - "rewards/rejected": -3.603856325149536, + "logits/chosen": -1.7535873651504517, + "logits/rejected": -1.7269706726074219, + "logps/chosen": -381.70745849609375, + "logps/rejected": -487.33544921875, + "loss": 0.5104, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -1.4208877086639404, + "rewards/margins": 0.8177056312561035, + "rewards/rejected": -2.238593578338623, "step": 3180 }, { "epoch": 0.83, - "grad_norm": 7.71875, + "grad_norm": 5.78125, "learning_rate": 4.041949541732826e-07, - "logits/chosen": -1.3859659433364868, - "logits/rejected": -1.3317590951919556, - "logps/chosen": -530.0472412109375, - "logps/rejected": -617.9666748046875, - "loss": 0.5261, - "rewards/accuracies": 0.7250000238418579, - "rewards/chosen": -2.6624364852905273, - "rewards/margins": 1.0216740369796753, - "rewards/rejected": -3.6841111183166504, + "logits/chosen": -1.855145812034607, + "logits/rejected": -1.850327491760254, + "logps/chosen": -433.2710876464844, + "logps/rejected": -499.67047119140625, + "loss": 0.5162, + "rewards/accuracies": 0.731249988079071, + "rewards/chosen": -1.5070629119873047, + "rewards/margins": 0.8208478689193726, + "rewards/rejected": -2.327910900115967, "step": 3190 }, { "epoch": 0.84, - "grad_norm": 14.875, + "grad_norm": 5.9375, "learning_rate": 3.9182830066782614e-07, - "logits/chosen": -1.3132878541946411, - "logits/rejected": -1.3010650873184204, - "logps/chosen": -518.9996948242188, - "logps/rejected": -642.5855712890625, - "loss": 0.5, - "rewards/accuracies": 0.7562500238418579, - "rewards/chosen": -2.613128900527954, - "rewards/margins": 1.1114436388015747, - "rewards/rejected": -3.7245724201202393, + "logits/chosen": -1.8350633382797241, + "logits/rejected": -1.811108946800232, + "logps/chosen": -424.4688415527344, + "logps/rejected": -527.4735717773438, + "loss": 0.5101, + "rewards/accuracies": 0.737500011920929, + "rewards/chosen": -1.472731351852417, + "rewards/margins": 0.912972629070282, + "rewards/rejected": -2.3857038021087646, "step": 3200 }, { "epoch": 0.84, - "eval_logits/chosen": -1.3130204677581787, - "eval_logits/rejected": -1.1907377243041992, - "eval_logps/chosen": -520.2028198242188, - "eval_logps/rejected": -609.7108764648438, - "eval_loss": 0.4889434278011322, - "eval_rewards/accuracies": 0.7434999942779541, - "eval_rewards/chosen": -2.555753469467163, - "eval_rewards/margins": 1.0954509973526, - "eval_rewards/rejected": -3.6512043476104736, - "eval_runtime": 784.8129, - "eval_samples_per_second": 2.548, + "eval_logits/chosen": -1.9300192594528198, + "eval_logits/rejected": -1.8167176246643066, + "eval_logps/chosen": -426.7279357910156, + "eval_logps/rejected": -493.0526123046875, + "eval_loss": 0.5056060552597046, + "eval_rewards/accuracies": 0.734000027179718, + "eval_rewards/chosen": -1.4159364700317383, + "eval_rewards/margins": 0.882140040397644, + "eval_rewards/rejected": -2.298076868057251, + "eval_runtime": 783.8657, + "eval_samples_per_second": 2.551, "eval_steps_per_second": 0.319, "step": 3200 }, { "epoch": 0.84, - "grad_norm": 8.3125, + "grad_norm": 7.125, "learning_rate": 3.796376788925771e-07, - "logits/chosen": -1.32333505153656, - "logits/rejected": -1.252463936805725, - "logps/chosen": -507.30096435546875, - "logps/rejected": -578.5609130859375, - "loss": 0.5077, - "rewards/accuracies": 0.7250000238418579, - "rewards/chosen": -2.459289073944092, - "rewards/margins": 0.9527593851089478, - "rewards/rejected": -3.412048816680908, + "logits/chosen": -1.8429349660873413, + "logits/rejected": -1.7482311725616455, + "logps/chosen": -427.6624450683594, + "logps/rejected": -486.4988708496094, + "loss": 0.5414, + "rewards/accuracies": 0.731249988079071, + "rewards/chosen": -1.3303483724594116, + "rewards/margins": 0.765957236289978, + "rewards/rejected": -2.0963053703308105, "step": 3210 }, { "epoch": 0.84, - "grad_norm": 8.3125, + "grad_norm": 10.0, "learning_rate": 3.676241067609465e-07, - "logits/chosen": -1.4022958278656006, - "logits/rejected": -1.290711522102356, - "logps/chosen": -542.0577392578125, - "logps/rejected": -607.3893432617188, - "loss": 0.5058, - "rewards/accuracies": 0.7562500238418579, - "rewards/chosen": -2.4838201999664307, - "rewards/margins": 1.0848579406738281, - "rewards/rejected": -3.568678379058838, + "logits/chosen": -1.9111486673355103, + "logits/rejected": -1.818162202835083, + "logps/chosen": -458.59552001953125, + "logps/rejected": -496.85772705078125, + "loss": 0.5223, + "rewards/accuracies": 0.737500011920929, + "rewards/chosen": -1.3662760257720947, + "rewards/margins": 0.86688631772995, + "rewards/rejected": -2.2331624031066895, "step": 3220 }, { "epoch": 0.85, - "grad_norm": 12.125, + "grad_norm": 8.0625, "learning_rate": 3.5578858740274976e-07, - "logits/chosen": -1.3260449171066284, - "logits/rejected": -1.2201776504516602, - "logps/chosen": -525.3899536132812, - "logps/rejected": -606.1673583984375, - "loss": 0.5216, - "rewards/accuracies": 0.731249988079071, - "rewards/chosen": -2.637200117111206, - "rewards/margins": 0.9335072636604309, - "rewards/rejected": -3.570707321166992, + "logits/chosen": -1.8537018299102783, + "logits/rejected": -1.7111694812774658, + "logps/chosen": -425.08203125, + "logps/rejected": -486.8157653808594, + "loss": 0.5413, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -1.4430186748504639, + "rewards/margins": 0.7479842305183411, + "rewards/rejected": -2.191002607345581, "step": 3230 }, { "epoch": 0.85, - "grad_norm": 11.1875, + "grad_norm": 8.1875, "learning_rate": 3.44132109080447e-07, - "logits/chosen": -1.5093415975570679, - "logits/rejected": -1.3356587886810303, - "logps/chosen": -508.17236328125, - "logps/rejected": -589.3931274414062, - "loss": 0.4492, - "rewards/accuracies": 0.78125, - "rewards/chosen": -2.429962635040283, - "rewards/margins": 1.1870572566986084, - "rewards/rejected": -3.6170201301574707, + "logits/chosen": -1.9728435277938843, + "logits/rejected": -1.802259087562561, + "logps/chosen": -418.29925537109375, + "logps/rejected": -468.47509765625, + "loss": 0.4702, + "rewards/accuracies": 0.762499988079071, + "rewards/chosen": -1.3325908184051514, + "rewards/margins": 0.9086204767227173, + "rewards/rejected": -2.241211414337158, "step": 3240 }, { "epoch": 0.85, - "grad_norm": 11.125, + "grad_norm": 4.25, "learning_rate": 3.3265564510662344e-07, - "logits/chosen": -1.4432194232940674, - "logits/rejected": -1.3123570680618286, - "logps/chosen": -531.9483642578125, - "logps/rejected": -631.9108276367188, - "loss": 0.4232, - "rewards/accuracies": 0.7875000238418579, - "rewards/chosen": -2.3866143226623535, - "rewards/margins": 1.2505089044570923, - "rewards/rejected": -3.6371231079101562, + "logits/chosen": -1.9281237125396729, + "logits/rejected": -1.8321959972381592, + "logps/chosen": -435.9788513183594, + "logps/rejected": -510.285888671875, + "loss": 0.459, + "rewards/accuracies": 0.768750011920929, + "rewards/chosen": -1.2192438840866089, + "rewards/margins": 1.0345146656036377, + "rewards/rejected": -2.253758430480957, "step": 3250 }, { "epoch": 0.85, - "grad_norm": 13.1875, + "grad_norm": 11.6875, "learning_rate": 3.213601537627195e-07, - "logits/chosen": -1.35580313205719, - "logits/rejected": -1.2558863162994385, - "logps/chosen": -531.135986328125, - "logps/rejected": -618.29248046875, - "loss": 0.5372, - "rewards/accuracies": 0.6937500238418579, - "rewards/chosen": -2.740386486053467, - "rewards/margins": 1.042879343032837, - "rewards/rejected": -3.7832655906677246, + "logits/chosen": -1.8366546630859375, + "logits/rejected": -1.7728904485702515, + "logps/chosen": -432.23724365234375, + "logps/rejected": -493.5435485839844, + "loss": 0.544, + "rewards/accuracies": 0.6875, + "rewards/chosen": -1.5417168140411377, + "rewards/margins": 0.8185436129570007, + "rewards/rejected": -2.360260486602783, "step": 3260 }, { "epoch": 0.86, - "grad_norm": 13.0, + "grad_norm": 10.3125, "learning_rate": 3.1024657821901063e-07, - "logits/chosen": -1.4194626808166504, - "logits/rejected": -1.335777759552002, - "logps/chosen": -489.5896911621094, - "logps/rejected": -585.8953857421875, - "loss": 0.4902, - "rewards/accuracies": 0.768750011920929, - "rewards/chosen": -2.4187073707580566, - "rewards/margins": 1.1213419437408447, - "rewards/rejected": -3.5400497913360596, + "logits/chosen": -1.8914482593536377, + "logits/rejected": -1.7727159261703491, + "logps/chosen": -404.0108337402344, + "logps/rejected": -475.5201110839844, + "loss": 0.4975, + "rewards/accuracies": 0.762499988079071, + "rewards/chosen": -1.3568850755691528, + "rewards/margins": 0.8796807527542114, + "rewards/rejected": -2.2365658283233643, "step": 3270 }, { "epoch": 0.86, - "grad_norm": 12.625, + "grad_norm": 26.875, "learning_rate": 2.9931584645585654e-07, - "logits/chosen": -1.3686619997024536, - "logits/rejected": -1.3381317853927612, - "logps/chosen": -520.0299682617188, - "logps/rejected": -624.9264526367188, - "loss": 0.5054, - "rewards/accuracies": 0.75, - "rewards/chosen": -2.4555375576019287, - "rewards/margins": 1.0194650888442993, - "rewards/rejected": -3.4750030040740967, + "logits/chosen": -1.8954200744628906, + "logits/rejected": -1.8291819095611572, + "logps/chosen": -429.4666442871094, + "logps/rejected": -514.4373168945312, + "loss": 0.5108, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": -1.3573051691055298, + "rewards/margins": 0.8282718658447266, + "rewards/rejected": -2.185576915740967, "step": 3280 }, { "epoch": 0.86, - "grad_norm": 5.75, + "grad_norm": 5.59375, "learning_rate": 2.885688711862136e-07, - "logits/chosen": -1.371180772781372, - "logits/rejected": -1.3699692487716675, - "logps/chosen": -519.8883056640625, - "logps/rejected": -640.9420776367188, - "loss": 0.51, - "rewards/accuracies": 0.7562500238418579, - "rewards/chosen": -2.588639736175537, - "rewards/margins": 1.230355978012085, - "rewards/rejected": -3.818995714187622, + "logits/chosen": -1.813812255859375, + "logits/rejected": -1.8214820623397827, + "logps/chosen": -422.4434509277344, + "logps/rejected": -510.93707275390625, + "loss": 0.5452, + "rewards/accuracies": 0.71875, + "rewards/chosen": -1.4395946264266968, + "rewards/margins": 0.9139968752861023, + "rewards/rejected": -2.3535916805267334, "step": 3290 }, { "epoch": 0.86, - "grad_norm": 10.6875, + "grad_norm": 4.65625, "learning_rate": 2.7800654977942486e-07, - "logits/chosen": -1.3652598857879639, - "logits/rejected": -1.242366075515747, - "logps/chosen": -507.40740966796875, - "logps/rejected": -607.2086181640625, - "loss": 0.5164, - "rewards/accuracies": 0.75, - "rewards/chosen": -2.500795602798462, - "rewards/margins": 1.0368000268936157, - "rewards/rejected": -3.537595272064209, + "logits/chosen": -1.8207260370254517, + "logits/rejected": -1.6706082820892334, + "logps/chosen": -415.5560607910156, + "logps/rejected": -514.9617919921875, + "loss": 0.5317, + "rewards/accuracies": 0.762499988079071, + "rewards/chosen": -1.3756178617477417, + "rewards/margins": 0.8347378969192505, + "rewards/rejected": -2.210355758666992, "step": 3300 }, { "epoch": 0.86, - "eval_logits/chosen": -1.3162051439285278, - "eval_logits/rejected": -1.194024920463562, - "eval_logps/chosen": -519.2968139648438, - "eval_logps/rejected": -608.888427734375, - "eval_loss": 0.48908573389053345, - "eval_rewards/accuracies": 0.7415000200271606, - "eval_rewards/chosen": -2.5466933250427246, - "eval_rewards/margins": 1.096286416053772, - "eval_rewards/rejected": -3.642979621887207, - "eval_runtime": 784.9349, - "eval_samples_per_second": 2.548, - "eval_steps_per_second": 0.318, + "eval_logits/chosen": -1.9272832870483398, + "eval_logits/rejected": -1.8138694763183594, + "eval_logps/chosen": -425.427978515625, + "eval_logps/rejected": -491.8742370605469, + "eval_loss": 0.5055835843086243, + "eval_rewards/accuracies": 0.7354999780654907, + "eval_rewards/chosen": -1.4029372930526733, + "eval_rewards/margins": 0.8833554983139038, + "eval_rewards/rejected": -2.2862930297851562, + "eval_runtime": 782.9474, + "eval_samples_per_second": 2.554, + "eval_steps_per_second": 0.319, "step": 3300 }, { "epoch": 0.87, - "grad_norm": 13.9375, + "grad_norm": 8.125, "learning_rate": 2.6762976418628797e-07, - "logits/chosen": -1.4040035009384155, - "logits/rejected": -1.2554330825805664, - "logps/chosen": -472.7406311035156, - "logps/rejected": -534.6551513671875, - "loss": 0.5136, - "rewards/accuracies": 0.737500011920929, - "rewards/chosen": -2.42999529838562, - "rewards/margins": 1.0553653240203857, - "rewards/rejected": -3.485360622406006, + "logits/chosen": -1.9211854934692383, + "logits/rejected": -1.741869568824768, + "logps/chosen": -377.3354797363281, + "logps/rejected": -417.59295654296875, + "loss": 0.5303, + "rewards/accuracies": 0.731249988079071, + "rewards/chosen": -1.3278093338012695, + "rewards/margins": 0.8446895480155945, + "rewards/rejected": -2.172498941421509, "step": 3310 }, { "epoch": 0.87, - "grad_norm": 13.5625, + "grad_norm": 6.375, "learning_rate": 2.5743938086541354e-07, - "logits/chosen": -1.3862022161483765, - "logits/rejected": -1.2560724020004272, - "logps/chosen": -514.1824951171875, - "logps/rejected": -601.5919799804688, - "loss": 0.4881, - "rewards/accuracies": 0.768750011920929, - "rewards/chosen": -2.499603748321533, - "rewards/margins": 1.1225526332855225, - "rewards/rejected": -3.6221566200256348, + "logits/chosen": -1.8673250675201416, + "logits/rejected": -1.7157970666885376, + "logps/chosen": -422.41583251953125, + "logps/rejected": -485.87786865234375, + "loss": 0.5085, + "rewards/accuracies": 0.737500011920929, + "rewards/chosen": -1.3918393850326538, + "rewards/margins": 0.9153121113777161, + "rewards/rejected": -2.3071513175964355, "step": 3320 }, { "epoch": 0.87, - "grad_norm": 10.5625, + "grad_norm": 10.6875, "learning_rate": 2.4743625071087574e-07, - "logits/chosen": -1.5129387378692627, - "logits/rejected": -1.3397341966629028, - "logps/chosen": -518.1233520507812, - "logps/rejected": -620.1852416992188, - "loss": 0.461, - "rewards/accuracies": 0.7437499761581421, - "rewards/chosen": -2.405925750732422, - "rewards/margins": 1.3158292770385742, - "rewards/rejected": -3.721754789352417, + "logits/chosen": -1.9498252868652344, + "logits/rejected": -1.7927253246307373, + "logps/chosen": -422.6053161621094, + "logps/rejected": -502.58154296875, + "loss": 0.472, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.277875304222107, + "rewards/margins": 1.0411653518676758, + "rewards/rejected": -2.3190407752990723, "step": 3330 }, { "epoch": 0.87, - "grad_norm": 12.4375, + "grad_norm": 7.96875, "learning_rate": 2.3762120898116498e-07, - "logits/chosen": -1.3996360301971436, - "logits/rejected": -1.2913117408752441, - "logps/chosen": -539.2269897460938, - "logps/rejected": -628.4332275390625, - "loss": 0.5157, - "rewards/accuracies": 0.737500011920929, - "rewards/chosen": -2.7045836448669434, - "rewards/margins": 0.9718179702758789, - "rewards/rejected": -3.6764016151428223, + "logits/chosen": -1.9012149572372437, + "logits/rejected": -1.7770229578018188, + "logps/chosen": -446.6949768066406, + "logps/rejected": -513.9698486328125, + "loss": 0.5254, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -1.564635992050171, + "rewards/margins": 0.7733919024467468, + "rewards/rejected": -2.3380281925201416, "step": 3340 }, { "epoch": 0.88, - "grad_norm": 8.1875, + "grad_norm": 8.125, "learning_rate": 2.2799507522944048e-07, - "logits/chosen": -1.3280880451202393, - "logits/rejected": -1.2456754446029663, - "logps/chosen": -509.9691467285156, - "logps/rejected": -624.684326171875, - "loss": 0.4429, - "rewards/accuracies": 0.762499988079071, - "rewards/chosen": -2.4425625801086426, - "rewards/margins": 1.2291357517242432, - "rewards/rejected": -3.6716980934143066, + "logits/chosen": -1.7588342428207397, + "logits/rejected": -1.728158950805664, + "logps/chosen": -427.84002685546875, + "logps/rejected": -514.2551879882812, + "loss": 0.4674, + "rewards/accuracies": 0.7437499761581421, + "rewards/chosen": -1.3852746486663818, + "rewards/margins": 0.9436731338500977, + "rewards/rejected": -2.3289475440979004, "step": 3350 }, { "epoch": 0.88, - "grad_norm": 8.4375, + "grad_norm": 6.84375, "learning_rate": 2.1855865323510056e-07, - "logits/chosen": -1.3912630081176758, - "logits/rejected": -1.2085883617401123, - "logps/chosen": -522.2635498046875, - "logps/rejected": -654.9818115234375, - "loss": 0.4447, - "rewards/accuracies": 0.78125, - "rewards/chosen": -2.4961001873016357, - "rewards/margins": 1.3863821029663086, - "rewards/rejected": -3.8824820518493652, + "logits/chosen": -1.8500531911849976, + "logits/rejected": -1.7112195491790771, + "logps/chosen": -435.78485107421875, + "logps/rejected": -542.5267333984375, + "loss": 0.4701, + "rewards/accuracies": 0.7437499761581421, + "rewards/chosen": -1.3804128170013428, + "rewards/margins": 1.1414902210235596, + "rewards/rejected": -2.5219030380249023, "step": 3360 }, { "epoch": 0.88, - "grad_norm": 7.875, + "grad_norm": 6.59375, "learning_rate": 2.0931273093666575e-07, - "logits/chosen": -1.352172613143921, - "logits/rejected": -1.2029151916503906, - "logps/chosen": -504.1370544433594, - "logps/rejected": -606.3607177734375, - "loss": 0.4461, - "rewards/accuracies": 0.831250011920929, - "rewards/chosen": -2.639508008956909, - "rewards/margins": 1.2250006198883057, - "rewards/rejected": -3.8645083904266357, + "logits/chosen": -1.8536121845245361, + "logits/rejected": -1.6857569217681885, + "logps/chosen": -402.9033203125, + "logps/rejected": -488.2943420410156, + "loss": 0.4316, + "rewards/accuracies": 0.862500011920929, + "rewards/chosen": -1.427424669265747, + "rewards/margins": 1.077510118484497, + "rewards/rejected": -2.504934787750244, "step": 3370 }, { "epoch": 0.88, - "grad_norm": 13.125, + "grad_norm": 9.9375, "learning_rate": 2.002580803659873e-07, - "logits/chosen": -1.3526690006256104, - "logits/rejected": -1.2415813207626343, - "logps/chosen": -520.9991455078125, - "logps/rejected": -610.5531616210938, - "loss": 0.4778, - "rewards/accuracies": 0.793749988079071, - "rewards/chosen": -2.6652913093566895, - "rewards/margins": 1.0764987468719482, - "rewards/rejected": -3.7417900562286377, + "logits/chosen": -1.80948007106781, + "logits/rejected": -1.7452131509780884, + "logps/chosen": -420.40924072265625, + "logps/rejected": -497.26751708984375, + "loss": 0.4773, + "rewards/accuracies": 0.768750011920929, + "rewards/chosen": -1.4817922115325928, + "rewards/margins": 0.893271803855896, + "rewards/rejected": -2.3750641345977783, "step": 3380 }, { "epoch": 0.89, - "grad_norm": 6.375, + "grad_norm": 5.125, "learning_rate": 1.913954575837826e-07, - "logits/chosen": -1.4264500141143799, - "logits/rejected": -1.1652201414108276, - "logps/chosen": -528.1181640625, - "logps/rejected": -590.223388671875, - "loss": 0.4603, - "rewards/accuracies": 0.793749988079071, - "rewards/chosen": -2.560216188430786, - "rewards/margins": 1.1181532144546509, - "rewards/rejected": -3.6783695220947266, + "logits/chosen": -1.9585930109024048, + "logits/rejected": -1.6200177669525146, + "logps/chosen": -426.95416259765625, + "logps/rejected": -465.089599609375, + "loss": 0.5055, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": -1.3750698566436768, + "rewards/margins": 0.8868755102157593, + "rewards/rejected": -2.2619454860687256, "step": 3390 }, { "epoch": 0.89, - "grad_norm": 9.625, + "grad_norm": 5.65625, "learning_rate": 1.827256026165028e-07, - "logits/chosen": -1.4379308223724365, - "logits/rejected": -1.2398895025253296, - "logps/chosen": -551.6383056640625, - "logps/rejected": -617.5082397460938, - "loss": 0.4554, - "rewards/accuracies": 0.8062499761581421, - "rewards/chosen": -2.366239070892334, - "rewards/margins": 1.22964608669281, - "rewards/rejected": -3.5958850383758545, + "logits/chosen": -1.9661176204681396, + "logits/rejected": -1.7149207592010498, + "logps/chosen": -461.3528747558594, + "logps/rejected": -499.0296936035156, + "loss": 0.4668, + "rewards/accuracies": 0.793749988079071, + "rewards/chosen": -1.2651516199111938, + "rewards/margins": 0.9840850830078125, + "rewards/rejected": -2.249236583709717, "step": 3400 }, { "epoch": 0.89, - "eval_logits/chosen": -1.316201090812683, - "eval_logits/rejected": -1.1940573453903198, - "eval_logps/chosen": -521.2743530273438, - "eval_logps/rejected": -611.3746337890625, - "eval_loss": 0.4889242351055145, - "eval_rewards/accuracies": 0.7409999966621399, - "eval_rewards/chosen": -2.5664684772491455, - "eval_rewards/margins": 1.1013725996017456, - "eval_rewards/rejected": -3.6678411960601807, - "eval_runtime": 784.7968, - "eval_samples_per_second": 2.548, + "eval_logits/chosen": -1.9266204833984375, + "eval_logits/rejected": -1.8132041692733765, + "eval_logps/chosen": -425.77191162109375, + "eval_logps/rejected": -492.45269775390625, + "eval_loss": 0.505458652973175, + "eval_rewards/accuracies": 0.7350000143051147, + "eval_rewards/chosen": -1.4063764810562134, + "eval_rewards/margins": 0.8857010006904602, + "eval_rewards/rejected": -2.2920773029327393, + "eval_runtime": 783.1005, + "eval_samples_per_second": 2.554, "eval_steps_per_second": 0.319, "step": 3400 }, { "epoch": 0.89, - "grad_norm": 7.59375, + "grad_norm": 8.3125, "learning_rate": 1.7424923939454274e-07, - "logits/chosen": -1.3783215284347534, - "logits/rejected": -1.205242395401001, - "logps/chosen": -533.8648071289062, - "logps/rejected": -615.4136962890625, - "loss": 0.41, - "rewards/accuracies": 0.856249988079071, - "rewards/chosen": -2.525538921356201, - "rewards/margins": 1.2488229274749756, - "rewards/rejected": -3.7743613719940186, + "logits/chosen": -1.9129213094711304, + "logits/rejected": -1.7314773797988892, + "logps/chosen": -438.605224609375, + "logps/rejected": -489.36846923828125, + "loss": 0.4838, + "rewards/accuracies": 0.78125, + "rewards/chosen": -1.3883512020111084, + "rewards/margins": 0.9468286633491516, + "rewards/rejected": -2.3351798057556152, "step": 3410 }, { "epoch": 0.9, - "grad_norm": 16.75, + "grad_norm": 12.125, "learning_rate": 1.6596707569179304e-07, - "logits/chosen": -1.4666881561279297, - "logits/rejected": -1.3174692392349243, - "logps/chosen": -538.4910888671875, - "logps/rejected": -610.6369018554688, - "loss": 0.5007, - "rewards/accuracies": 0.793749988079071, - "rewards/chosen": -2.573629379272461, - "rewards/margins": 1.0726503133773804, - "rewards/rejected": -3.6462795734405518, + "logits/chosen": -1.9000225067138672, + "logits/rejected": -1.7462106943130493, + "logps/chosen": -441.75830078125, + "logps/rejected": -494.11846923828125, + "loss": 0.5098, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.4242441654205322, + "rewards/margins": 0.8710432052612305, + "rewards/rejected": -2.2952873706817627, "step": 3420 }, { "epoch": 0.9, - "grad_norm": 8.25, + "grad_norm": 9.0625, "learning_rate": 1.578798030665385e-07, - "logits/chosen": -1.4168808460235596, - "logits/rejected": -1.2357183694839478, - "logps/chosen": -524.3135986328125, - "logps/rejected": -638.2732543945312, - "loss": 0.4441, - "rewards/accuracies": 0.78125, - "rewards/chosen": -2.4969139099121094, - "rewards/margins": 1.3102748394012451, - "rewards/rejected": -3.8071885108947754, + "logits/chosen": -1.9095699787139893, + "logits/rejected": -1.7264404296875, + "logps/chosen": -427.95294189453125, + "logps/rejected": -512.547607421875, + "loss": 0.4677, + "rewards/accuracies": 0.7562500238418579, + "rewards/chosen": -1.3406087160110474, + "rewards/margins": 1.0093950033187866, + "rewards/rejected": -2.350003719329834, "step": 3430 }, { "epoch": 0.9, - "grad_norm": 7.96875, + "grad_norm": 5.71875, "learning_rate": 1.499880968037165e-07, - "logits/chosen": -1.3937218189239502, - "logits/rejected": -1.2632675170898438, - "logps/chosen": -508.6050720214844, - "logps/rejected": -583.2626342773438, - "loss": 0.5086, - "rewards/accuracies": 0.793749988079071, - "rewards/chosen": -2.530186891555786, - "rewards/margins": 1.109734296798706, - "rewards/rejected": -3.639921188354492, + "logits/chosen": -1.8476403951644897, + "logits/rejected": -1.7272007465362549, + "logps/chosen": -402.6236267089844, + "logps/rejected": -456.137451171875, + "loss": 0.4818, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": -1.3089590072631836, + "rewards/margins": 0.9297110438346863, + "rewards/rejected": -2.2386701107025146, "step": 3440 }, { "epoch": 0.9, - "grad_norm": 15.5625, + "grad_norm": 8.375, "learning_rate": 1.4229261585852805e-07, - "logits/chosen": -1.4141117334365845, - "logits/rejected": -1.3363901376724243, - "logps/chosen": -519.9625854492188, - "logps/rejected": -607.4551391601562, - "loss": 0.4595, - "rewards/accuracies": 0.7875000238418579, - "rewards/chosen": -2.5164310932159424, - "rewards/margins": 1.1321784257888794, - "rewards/rejected": -3.6486096382141113, + "logits/chosen": -1.8726069927215576, + "logits/rejected": -1.814284324645996, + "logps/chosen": -413.341796875, + "logps/rejected": -482.2705993652344, + "loss": 0.4554, + "rewards/accuracies": 0.78125, + "rewards/chosen": -1.283301591873169, + "rewards/margins": 0.9565474390983582, + "rewards/rejected": -2.239849090576172, "step": 3450 }, { "epoch": 0.91, - "grad_norm": 10.625, + "grad_norm": 7.6875, "learning_rate": 1.3479400280141886e-07, - "logits/chosen": -1.339782953262329, - "logits/rejected": -1.2957314252853394, - "logps/chosen": -505.3309631347656, - "logps/rejected": -619.78662109375, - "loss": 0.4873, - "rewards/accuracies": 0.737500011920929, - "rewards/chosen": -2.590623378753662, - "rewards/margins": 1.1652476787567139, - "rewards/rejected": -3.755871295928955, + "logits/chosen": -1.8118232488632202, + "logits/rejected": -1.7649272680282593, + "logps/chosen": -407.5479736328125, + "logps/rejected": -503.00091552734375, + "loss": 0.5054, + "rewards/accuracies": 0.731249988079071, + "rewards/chosen": -1.4062145948410034, + "rewards/margins": 0.9560636281967163, + "rewards/rejected": -2.3622782230377197, "step": 3460 }, { "epoch": 0.91, - "grad_norm": 11.3125, + "grad_norm": 7.78125, "learning_rate": 1.2749288376442044e-07, - "logits/chosen": -1.4245412349700928, - "logits/rejected": -1.2371046543121338, - "logps/chosen": -545.9927978515625, - "logps/rejected": -600.5451049804688, - "loss": 0.4724, - "rewards/accuracies": 0.762499988079071, - "rewards/chosen": -2.495211362838745, - "rewards/margins": 1.1209526062011719, - "rewards/rejected": -3.616163969039917, + "logits/chosen": -1.8775228261947632, + "logits/rejected": -1.7272803783416748, + "logps/chosen": -452.64874267578125, + "logps/rejected": -483.62811279296875, + "loss": 0.4763, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -1.3288511037826538, + "rewards/margins": 0.9381489753723145, + "rewards/rejected": -2.267000198364258, "step": 3470 }, { "epoch": 0.91, - "grad_norm": 10.8125, + "grad_norm": 9.4375, "learning_rate": 1.203898683888713e-07, - "logits/chosen": -1.4366028308868408, - "logits/rejected": -1.2881221771240234, - "logps/chosen": -508.22882080078125, - "logps/rejected": -602.8942260742188, - "loss": 0.5468, - "rewards/accuracies": 0.699999988079071, - "rewards/chosen": -2.642627239227295, - "rewards/margins": 0.9780961275100708, - "rewards/rejected": -3.620723247528076, + "logits/chosen": -1.9275810718536377, + "logits/rejected": -1.7312577962875366, + "logps/chosen": -407.8893737792969, + "logps/rejected": -480.2945861816406, + "loss": 0.5632, + "rewards/accuracies": 0.6875, + "rewards/chosen": -1.4742262363433838, + "rewards/margins": 0.7488548755645752, + "rewards/rejected": -2.22308087348938, "step": 3480 }, { "epoch": 0.91, - "grad_norm": 7.78125, + "grad_norm": 7.71875, "learning_rate": 1.1348554977451132e-07, - "logits/chosen": -1.4646247625350952, - "logits/rejected": -1.316213846206665, - "logps/chosen": -533.8064575195312, - "logps/rejected": -605.6385498046875, - "loss": 0.5125, - "rewards/accuracies": 0.706250011920929, - "rewards/chosen": -2.518562078475952, - "rewards/margins": 1.047595739364624, - "rewards/rejected": -3.566157579421997, + "logits/chosen": -1.9800020456314087, + "logits/rejected": -1.7983967065811157, + "logps/chosen": -435.8182678222656, + "logps/rejected": -486.7703552246094, + "loss": 0.4831, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -1.3190518617630005, + "rewards/margins": 0.8798874616622925, + "rewards/rejected": -2.198939085006714, "step": 3490 }, { "epoch": 0.92, - "grad_norm": 6.53125, + "grad_norm": 4.96875, "learning_rate": 1.0678050442995802e-07, - "logits/chosen": -1.4132311344146729, - "logits/rejected": -1.2221227884292603, - "logps/chosen": -535.25927734375, - "logps/rejected": -595.0910034179688, - "loss": 0.5354, - "rewards/accuracies": 0.7250000238418579, - "rewards/chosen": -2.5456292629241943, - "rewards/margins": 1.0484365224838257, - "rewards/rejected": -3.5940654277801514, + "logits/chosen": -1.889638900756836, + "logits/rejected": -1.7416059970855713, + "logps/chosen": -442.87994384765625, + "logps/rejected": -472.72314453125, + "loss": 0.5671, + "rewards/accuracies": 0.71875, + "rewards/chosen": -1.4175615310668945, + "rewards/margins": 0.7715851068496704, + "rewards/rejected": -2.1891465187072754, "step": 3500 }, { "epoch": 0.92, - "eval_logits/chosen": -1.3186591863632202, - "eval_logits/rejected": -1.1965842247009277, - "eval_logps/chosen": -520.4332885742188, - "eval_logps/rejected": -610.7185668945312, - "eval_loss": 0.4888494312763214, - "eval_rewards/accuracies": 0.7409999966621399, - "eval_rewards/chosen": -2.558058023452759, - "eval_rewards/margins": 1.1032229661941528, - "eval_rewards/rejected": -3.6612813472747803, - "eval_runtime": 785.478, - "eval_samples_per_second": 2.546, - "eval_steps_per_second": 0.318, + "eval_logits/chosen": -1.9291415214538574, + "eval_logits/rejected": -1.8157764673233032, + "eval_logps/chosen": -425.4985656738281, + "eval_logps/rejected": -492.2395324707031, + "eval_loss": 0.5055854916572571, + "eval_rewards/accuracies": 0.734499990940094, + "eval_rewards/chosen": -1.4036431312561035, + "eval_rewards/margins": 0.8863027095794678, + "eval_rewards/rejected": -2.2899458408355713, + "eval_runtime": 783.003, + "eval_samples_per_second": 2.554, + "eval_steps_per_second": 0.319, "step": 3500 }, { "epoch": 0.92, - "grad_norm": 10.375, + "grad_norm": 6.4375, "learning_rate": 1.0027529222456755e-07, - "logits/chosen": -1.3980259895324707, - "logits/rejected": -1.228690266609192, - "logps/chosen": -502.554443359375, - "logps/rejected": -604.9215087890625, - "loss": 0.4243, - "rewards/accuracies": 0.8187500238418579, - "rewards/chosen": -2.484199047088623, - "rewards/margins": 1.1991740465164185, - "rewards/rejected": -3.683372974395752, + "logits/chosen": -1.8589719533920288, + "logits/rejected": -1.7069114446640015, + "logps/chosen": -408.1587219238281, + "logps/rejected": -483.13787841796875, + "loss": 0.4453, + "rewards/accuracies": 0.8125, + "rewards/chosen": -1.3498586416244507, + "rewards/margins": 0.9569522738456726, + "rewards/rejected": -2.3068108558654785, "step": 3510 }, { "epoch": 0.92, - "grad_norm": 9.9375, + "grad_norm": 4.875, "learning_rate": 9.397045634168766e-08, - "logits/chosen": -1.4190794229507446, - "logits/rejected": -1.3529365062713623, - "logps/chosen": -514.8367919921875, - "logps/rejected": -641.3560791015625, - "loss": 0.4603, - "rewards/accuracies": 0.7250000238418579, - "rewards/chosen": -2.4630870819091797, - "rewards/margins": 1.262632131576538, - "rewards/rejected": -3.725719451904297, + "logits/chosen": -1.8985093832015991, + "logits/rejected": -1.8638538122177124, + "logps/chosen": -413.01373291015625, + "logps/rejected": -521.2156982421875, + "loss": 0.4702, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.2440009117126465, + "rewards/margins": 1.069000482559204, + "rewards/rejected": -2.3130011558532715, "step": 3520 }, { "epoch": 0.92, - "grad_norm": 11.8125, + "grad_norm": 11.3125, "learning_rate": 8.78665232332998e-08, - "logits/chosen": -1.346338152885437, - "logits/rejected": -1.2753067016601562, - "logps/chosen": -492.9662170410156, - "logps/rejected": -594.6959228515625, - "loss": 0.477, - "rewards/accuracies": 0.7250000238418579, - "rewards/chosen": -2.623117446899414, - "rewards/margins": 1.024907112121582, - "rewards/rejected": -3.648024320602417, + "logits/chosen": -1.7806581258773804, + "logits/rejected": -1.7375141382217407, + "logps/chosen": -399.5974426269531, + "logps/rejected": -475.81280517578125, + "loss": 0.5006, + "rewards/accuracies": 0.737500011920929, + "rewards/chosen": -1.5022794008255005, + "rewards/margins": 0.8037067651748657, + "rewards/rejected": -2.305986166000366, "step": 3530 }, { "epoch": 0.93, - "grad_norm": 7.46875, + "grad_norm": 6.5, "learning_rate": 8.196400257606208e-08, - "logits/chosen": -1.4510104656219482, - "logits/rejected": -1.3078534603118896, - "logps/chosen": -530.4656372070312, - "logps/rejected": -659.3447265625, - "loss": 0.4242, - "rewards/accuracies": 0.800000011920929, - "rewards/chosen": -2.4742283821105957, - "rewards/margins": 1.3415868282318115, - "rewards/rejected": -3.8158154487609863, + "logits/chosen": -1.910846471786499, + "logits/rejected": -1.8186595439910889, + "logps/chosen": -435.6886291503906, + "logps/rejected": -532.4617309570312, + "loss": 0.4666, + "rewards/accuracies": 0.793749988079071, + "rewards/chosen": -1.325961709022522, + "rewards/margins": 1.0250279903411865, + "rewards/rejected": -2.350989818572998, "step": 3540 }, { "epoch": 0.93, - "grad_norm": 8.75, + "grad_norm": 6.5, "learning_rate": 7.626338722875076e-08, - "logits/chosen": -1.3904287815093994, - "logits/rejected": -1.3310621976852417, - "logps/chosen": -502.951904296875, - "logps/rejected": -611.5296020507812, - "loss": 0.4973, - "rewards/accuracies": 0.7437499761581421, - "rewards/chosen": -2.489201307296753, - "rewards/margins": 1.0758605003356934, - "rewards/rejected": -3.5650620460510254, + "logits/chosen": -1.8711729049682617, + "logits/rejected": -1.8197062015533447, + "logps/chosen": -405.9395446777344, + "logps/rejected": -484.9190368652344, + "loss": 0.5179, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -1.3086730241775513, + "rewards/margins": 0.8287800550460815, + "rewards/rejected": -2.137453317642212, "step": 3550 }, { "epoch": 0.93, - "grad_norm": 7.34375, + "grad_norm": 4.90625, "learning_rate": 7.076515319110688e-08, - "logits/chosen": -1.396416425704956, - "logits/rejected": -1.3051905632019043, - "logps/chosen": -505.5409240722656, - "logps/rejected": -582.608154296875, - "loss": 0.5116, - "rewards/accuracies": 0.7437499761581421, - "rewards/chosen": -2.492713212966919, - "rewards/margins": 1.1867092847824097, - "rewards/rejected": -3.679421901702881, + "logits/chosen": -1.8416671752929688, + "logits/rejected": -1.8124072551727295, + "logps/chosen": -407.7252502441406, + "logps/rejected": -458.68353271484375, + "loss": 0.5219, + "rewards/accuracies": 0.768750011920929, + "rewards/chosen": -1.3443095684051514, + "rewards/margins": 0.9537046551704407, + "rewards/rejected": -2.2980141639709473, "step": 3560 }, { "epoch": 0.93, - "grad_norm": 9.6875, + "grad_norm": 6.125, "learning_rate": 6.54697595640899e-08, - "logits/chosen": -1.406216025352478, - "logits/rejected": -1.2725520133972168, - "logps/chosen": -550.2034912109375, - "logps/rejected": -638.1261596679688, - "loss": 0.4763, - "rewards/accuracies": 0.737500011920929, - "rewards/chosen": -2.5798609256744385, - "rewards/margins": 1.1309739351272583, - "rewards/rejected": -3.710834503173828, + "logits/chosen": -1.893803358078003, + "logits/rejected": -1.7148902416229248, + "logps/chosen": -445.6398010253906, + "logps/rejected": -513.85888671875, + "loss": 0.4858, + "rewards/accuracies": 0.762499988079071, + "rewards/chosen": -1.334416389465332, + "rewards/margins": 0.9229806065559387, + "rewards/rejected": -2.257397174835205, "step": 3570 }, { "epoch": 0.94, - "grad_norm": 8.625, + "grad_norm": 6.15625, "learning_rate": 6.037764851154426e-08, - "logits/chosen": -1.3890554904937744, - "logits/rejected": -1.3486218452453613, - "logps/chosen": -515.577880859375, - "logps/rejected": -630.5028076171875, - "loss": 0.4898, - "rewards/accuracies": 0.731249988079071, - "rewards/chosen": -2.5160646438598633, - "rewards/margins": 1.101179599761963, - "rewards/rejected": -3.617244243621826, + "logits/chosen": -1.827797532081604, + "logits/rejected": -1.8340890407562256, + "logps/chosen": -417.6336975097656, + "logps/rejected": -509.90423583984375, + "loss": 0.4995, + "rewards/accuracies": 0.7562500238418579, + "rewards/chosen": -1.3264166116714478, + "rewards/margins": 0.8964270353317261, + "rewards/rejected": -2.222843647003174, "step": 3580 }, { "epoch": 0.94, - "grad_norm": 7.28125, + "grad_norm": 6.71875, "learning_rate": 5.548924522327748e-08, - "logits/chosen": -1.3871036767959595, - "logits/rejected": -1.2439727783203125, - "logps/chosen": -509.38330078125, - "logps/rejected": -601.8846435546875, - "loss": 0.4916, + "logits/chosen": -1.8965423107147217, + "logits/rejected": -1.6809604167938232, + "logps/chosen": -421.51165771484375, + "logps/rejected": -491.33660888671875, + "loss": 0.5142, "rewards/accuracies": 0.737500011920929, - "rewards/chosen": -2.476794719696045, - "rewards/margins": 1.0720064640045166, - "rewards/rejected": -3.548801898956299, + "rewards/chosen": -1.3985979557037354, + "rewards/margins": 0.8676589727401733, + "rewards/rejected": -2.266256809234619, "step": 3590 }, { "epoch": 0.94, - "grad_norm": 11.0, + "grad_norm": 10.625, "learning_rate": 5.0804957879556915e-08, - "logits/chosen": -1.311679482460022, - "logits/rejected": -1.2297261953353882, - "logps/chosen": -475.36346435546875, - "logps/rejected": -592.0137329101562, - "loss": 0.4576, - "rewards/accuracies": 0.75, - "rewards/chosen": -2.497997522354126, - "rewards/margins": 1.1212456226348877, - "rewards/rejected": -3.6192429065704346, + "logits/chosen": -1.7934764623641968, + "logits/rejected": -1.7331079244613647, + "logps/chosen": -388.4217529296875, + "logps/rejected": -480.72711181640625, + "loss": 0.4708, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": -1.4167721271514893, + "rewards/margins": 0.9230138659477234, + "rewards/rejected": -2.3397860527038574, "step": 3600 }, { "epoch": 0.94, - "eval_logits/chosen": -1.318048119544983, - "eval_logits/rejected": -1.1959580183029175, - "eval_logps/chosen": -520.4293823242188, - "eval_logps/rejected": -610.7242431640625, - "eval_loss": 0.488956093788147, - "eval_rewards/accuracies": 0.7394999861717224, - "eval_rewards/chosen": -2.5580191612243652, - "eval_rewards/margins": 1.1033189296722412, - "eval_rewards/rejected": -3.6613380908966064, - "eval_runtime": 784.7913, - "eval_samples_per_second": 2.548, + "eval_logits/chosen": -1.9261465072631836, + "eval_logits/rejected": -1.812709093093872, + "eval_logps/chosen": -425.6341857910156, + "eval_logps/rejected": -492.3602600097656, + "eval_loss": 0.5055687427520752, + "eval_rewards/accuracies": 0.734499990940094, + "eval_rewards/chosen": -1.4049991369247437, + "eval_rewards/margins": 0.8861536383628845, + "eval_rewards/rejected": -2.2911531925201416, + "eval_runtime": 783.0794, + "eval_samples_per_second": 2.554, "eval_steps_per_second": 0.319, "step": 3600 }, { "epoch": 0.94, - "grad_norm": 11.75, + "grad_norm": 8.625, "learning_rate": 4.632517761702815e-08, - "logits/chosen": -1.3307334184646606, - "logits/rejected": -1.1874339580535889, - "logps/chosen": -491.62255859375, - "logps/rejected": -613.105712890625, - "loss": 0.4395, - "rewards/accuracies": 0.800000011920929, - "rewards/chosen": -2.562260150909424, - "rewards/margins": 1.3380639553070068, - "rewards/rejected": -3.9003238677978516, + "logits/chosen": -1.7564241886138916, + "logits/rejected": -1.6173584461212158, + "logps/chosen": -408.35675048828125, + "logps/rejected": -496.64678955078125, + "loss": 0.459, + "rewards/accuracies": 0.7562500238418579, + "rewards/chosen": -1.4728659391403198, + "rewards/margins": 1.0330259799957275, + "rewards/rejected": -2.505892038345337, "step": 3610 }, { "epoch": 0.95, - "grad_norm": 12.3125, + "grad_norm": 10.0, "learning_rate": 4.205027849605359e-08, - "logits/chosen": -1.3593323230743408, - "logits/rejected": -1.2600761651992798, - "logps/chosen": -530.3143310546875, - "logps/rejected": -590.4012451171875, - "loss": 0.661, - "rewards/accuracies": 0.762499988079071, - "rewards/chosen": -2.8244924545288086, - "rewards/margins": 0.9012508392333984, - "rewards/rejected": -3.725743532180786, + "logits/chosen": -1.8442957401275635, + "logits/rejected": -1.732021689414978, + "logps/chosen": -413.531494140625, + "logps/rejected": -455.7919006347656, + "loss": 0.5459, + "rewards/accuracies": 0.7437499761581421, + "rewards/chosen": -1.4587666988372803, + "rewards/margins": 0.7717809677124023, + "rewards/rejected": -2.2305476665496826, "step": 3620 }, { "epoch": 0.95, - "grad_norm": 9.0625, + "grad_norm": 7.9375, "learning_rate": 3.798061746947995e-08, - "logits/chosen": -1.4908219575881958, - "logits/rejected": -1.32948899269104, - "logps/chosen": -515.5693359375, - "logps/rejected": -589.8714599609375, - "loss": 0.4813, - "rewards/accuracies": 0.699999988079071, - "rewards/chosen": -2.5147576332092285, - "rewards/margins": 1.1346615552902222, - "rewards/rejected": -3.649419069290161, + "logits/chosen": -1.9753859043121338, + "logits/rejected": -1.7450494766235352, + "logps/chosen": -424.0469665527344, + "logps/rejected": -471.99188232421875, + "loss": 0.5102, + "rewards/accuracies": 0.731249988079071, + "rewards/chosen": -1.4073350429534912, + "rewards/margins": 0.8742674589157104, + "rewards/rejected": -2.2816028594970703, "step": 3630 }, { "epoch": 0.95, - "grad_norm": 10.75, + "grad_norm": 9.0, "learning_rate": 3.411653435283158e-08, - "logits/chosen": -1.4093029499053955, - "logits/rejected": -1.2017524242401123, - "logps/chosen": -519.6351318359375, - "logps/rejected": -570.8560791015625, - "loss": 0.486, - "rewards/accuracies": 0.731249988079071, - "rewards/chosen": -2.4641475677490234, - "rewards/margins": 1.06964910030365, - "rewards/rejected": -3.533797025680542, + "logits/chosen": -1.921852469444275, + "logits/rejected": -1.7092326879501343, + "logps/chosen": -422.7461853027344, + "logps/rejected": -451.899658203125, + "loss": 0.5007, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.311460256576538, + "rewards/margins": 0.8820557594299316, + "rewards/rejected": -2.1935160160064697, "step": 3640 }, { "epoch": 0.96, - "grad_norm": 10.875, + "grad_norm": 7.0, "learning_rate": 3.04583517959367e-08, - "logits/chosen": -1.4467204809188843, - "logits/rejected": -1.293100357055664, - "logps/chosen": -489.1570739746094, - "logps/rejected": -567.6870727539062, - "loss": 0.4641, + "logits/chosen": -1.9487841129302979, + "logits/rejected": -1.784651756286621, + "logps/chosen": -393.1403503417969, + "logps/rejected": -452.6136779785156, + "loss": 0.4832, "rewards/accuracies": 0.7437499761581421, - "rewards/chosen": -2.3709263801574707, - "rewards/margins": 1.1096155643463135, - "rewards/rejected": -3.480541706085205, + "rewards/chosen": -1.2417455911636353, + "rewards/margins": 0.9175812005996704, + "rewards/rejected": -2.1593265533447266, "step": 3650 }, { "epoch": 0.96, - "grad_norm": 9.875, + "grad_norm": 7.8125, "learning_rate": 2.7006375255985984e-08, - "logits/chosen": -1.38077712059021, - "logits/rejected": -1.3454914093017578, - "logps/chosen": -526.8986206054688, - "logps/rejected": -618.7762451171875, - "loss": 0.5662, - "rewards/accuracies": 0.699999988079071, - "rewards/chosen": -2.621387004852295, - "rewards/margins": 0.9132136106491089, - "rewards/rejected": -3.5346004962921143, + "logits/chosen": -1.8327049016952515, + "logits/rejected": -1.7989648580551147, + "logps/chosen": -442.2281799316406, + "logps/rejected": -505.47674560546875, + "loss": 0.5841, + "rewards/accuracies": 0.65625, + "rewards/chosen": -1.5012400150299072, + "rewards/margins": 0.6502693891525269, + "rewards/rejected": -2.1515092849731445, "step": 3660 }, { "epoch": 0.96, - "grad_norm": 10.1875, + "grad_norm": 8.5625, "learning_rate": 2.3760892972027328e-08, - "logits/chosen": -1.497115135192871, - "logits/rejected": -1.3196406364440918, - "logps/chosen": -539.4028930664062, - "logps/rejected": -614.2379150390625, - "loss": 0.5386, - "rewards/accuracies": 0.737500011920929, - "rewards/chosen": -2.6696388721466064, - "rewards/margins": 1.088233470916748, - "rewards/rejected": -3.7578723430633545, + "logits/chosen": -1.9631707668304443, + "logits/rejected": -1.7359784841537476, + "logps/chosen": -437.84423828125, + "logps/rejected": -484.88470458984375, + "loss": 0.5526, + "rewards/accuracies": 0.706250011920929, + "rewards/chosen": -1.477554440498352, + "rewards/margins": 0.8330958485603333, + "rewards/rejected": -2.31065034866333, "step": 3670 }, { "epoch": 0.96, - "grad_norm": 12.3125, + "grad_norm": 9.4375, "learning_rate": 2.072217594089765e-08, - "logits/chosen": -1.3596253395080566, - "logits/rejected": -1.3404871225357056, - "logps/chosen": -516.7539672851562, - "logps/rejected": -628.5675048828125, - "loss": 0.4286, - "rewards/accuracies": 0.8187500238418579, - "rewards/chosen": -2.552133798599243, - "rewards/margins": 1.241511344909668, - "rewards/rejected": -3.793645143508911, + "logits/chosen": -1.8473793268203735, + "logits/rejected": -1.8018176555633545, + "logps/chosen": -421.2726135253906, + "logps/rejected": -516.998291015625, + "loss": 0.4353, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -1.4261518716812134, + "rewards/margins": 1.0697355270385742, + "rewards/rejected": -2.495887279510498, "step": 3680 }, { "epoch": 0.97, - "grad_norm": 9.8125, + "grad_norm": 9.6875, "learning_rate": 1.789047789459375e-08, - "logits/chosen": -1.4519376754760742, - "logits/rejected": -1.2614377737045288, - "logps/chosen": -568.89306640625, - "logps/rejected": -631.7921752929688, - "loss": 0.5235, + "logits/chosen": -1.9341415166854858, + "logits/rejected": -1.7410176992416382, + "logps/chosen": -475.49359130859375, + "logps/rejected": -517.1406860351562, + "loss": 0.4974, "rewards/accuracies": 0.768750011920929, - "rewards/chosen": -2.520317554473877, - "rewards/margins": 1.11881422996521, - "rewards/rejected": -3.639132022857666, + "rewards/chosen": -1.3300564289093018, + "rewards/margins": 0.9742846488952637, + "rewards/rejected": -2.3043410778045654, "step": 3690 }, { "epoch": 0.97, - "grad_norm": 6.90625, + "grad_norm": 6.71875, "learning_rate": 1.5266035279088708e-08, - "logits/chosen": -1.2868165969848633, - "logits/rejected": -1.15117609500885, - "logps/chosen": -563.9498291015625, - "logps/rejected": -649.9329223632812, - "loss": 0.4816, - "rewards/accuracies": 0.768750011920929, - "rewards/chosen": -2.673642158508301, - "rewards/margins": 1.1221272945404053, - "rewards/rejected": -3.795769453048706, + "logits/chosen": -1.7829246520996094, + "logits/rejected": -1.5944633483886719, + "logps/chosen": -466.7164611816406, + "logps/rejected": -525.4857177734375, + "loss": 0.4904, + "rewards/accuracies": 0.7562500238418579, + "rewards/chosen": -1.4383667707443237, + "rewards/margins": 0.8766821026802063, + "rewards/rejected": -2.315048933029175, "step": 3700 }, { "epoch": 0.97, - "eval_logits/chosen": -1.3142924308776855, - "eval_logits/rejected": -1.1920459270477295, - "eval_logps/chosen": -520.3650512695312, - "eval_logps/rejected": -610.6686401367188, - "eval_loss": 0.4889410734176636, - "eval_rewards/accuracies": 0.7409999966621399, - "eval_rewards/chosen": -2.55737566947937, - "eval_rewards/margins": 1.1034061908721924, - "eval_rewards/rejected": -3.6607820987701416, - "eval_runtime": 780.7109, - "eval_samples_per_second": 2.562, + "eval_logits/chosen": -1.9288603067398071, + "eval_logits/rejected": -1.8154667615890503, + "eval_logps/chosen": -425.60430908203125, + "eval_logps/rejected": -492.37359619140625, + "eval_loss": 0.5054319500923157, + "eval_rewards/accuracies": 0.7354999780654907, + "eval_rewards/chosen": -1.4047002792358398, + "eval_rewards/margins": 0.8865863084793091, + "eval_rewards/rejected": -2.2912867069244385, + "eval_runtime": 782.1538, + "eval_samples_per_second": 2.557, "eval_steps_per_second": 0.32, "step": 3700 }, { "epoch": 0.97, - "grad_norm": 12.75, + "grad_norm": 10.5, "learning_rate": 1.2849067234584623e-08, - "logits/chosen": -1.2859097719192505, - "logits/rejected": -1.224083423614502, - "logps/chosen": -491.2533264160156, - "logps/rejected": -601.7191162109375, - "loss": 0.4703, - "rewards/accuracies": 0.7749999761581421, - "rewards/chosen": -2.5494284629821777, - "rewards/margins": 1.1806433200836182, - "rewards/rejected": -3.730072021484375, + "logits/chosen": -1.7769056558609009, + "logits/rejected": -1.7184251546859741, + "logps/chosen": -394.2373046875, + "logps/rejected": -475.7364807128906, + "loss": 0.5047, + "rewards/accuracies": 0.737500011920929, + "rewards/chosen": -1.3951432704925537, + "rewards/margins": 0.919331431388855, + "rewards/rejected": -2.314474582672119, "step": 3710 }, { "epoch": 0.97, - "grad_norm": 11.375, + "grad_norm": 9.5625, "learning_rate": 1.0639775577218625e-08, - "logits/chosen": -1.2778334617614746, - "logits/rejected": -1.1110355854034424, - "logps/chosen": -510.61126708984375, - "logps/rejected": -586.930908203125, - "loss": 0.5205, - "rewards/accuracies": 0.75, - "rewards/chosen": -2.614388942718506, - "rewards/margins": 1.1228806972503662, - "rewards/rejected": -3.737269639968872, + "logits/chosen": -1.78192138671875, + "logits/rejected": -1.6007667779922485, + "logps/chosen": -411.0592346191406, + "logps/rejected": -470.96612548828125, + "loss": 0.4938, + "rewards/accuracies": 0.737500011920929, + "rewards/chosen": -1.4045506715774536, + "rewards/margins": 0.9757940173149109, + "rewards/rejected": -2.380344867706299, "step": 3720 }, { "epoch": 0.98, - "grad_norm": 9.25, + "grad_norm": 7.5, "learning_rate": 8.638344782207486e-09, - "logits/chosen": -1.314502477645874, - "logits/rejected": -1.1973210573196411, - "logps/chosen": -491.2776794433594, - "logps/rejected": -575.7835083007812, - "loss": 0.4875, - "rewards/accuracies": 0.7562500238418579, - "rewards/chosen": -2.460561990737915, - "rewards/margins": 1.0728912353515625, - "rewards/rejected": -3.5334529876708984, + "logits/chosen": -1.8548873662948608, + "logits/rejected": -1.6706825494766235, + "logps/chosen": -405.6194763183594, + "logps/rejected": -470.0986328125, + "loss": 0.5011, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -1.2951724529266357, + "rewards/margins": 0.9182003736495972, + "rewards/rejected": -2.2133727073669434, "step": 3730 }, { "epoch": 0.98, - "grad_norm": 10.0, + "grad_norm": 8.3125, "learning_rate": 6.84494196844715e-09, - "logits/chosen": -1.3657796382904053, - "logits/rejected": -1.2624292373657227, - "logps/chosen": -522.8583984375, - "logps/rejected": -644.0818481445312, - "loss": 0.4645, - "rewards/accuracies": 0.731249988079071, - "rewards/chosen": -2.5086629390716553, - "rewards/margins": 1.3271586894989014, - "rewards/rejected": -3.8358216285705566, + "logits/chosen": -1.8603630065917969, + "logits/rejected": -1.7809423208236694, + "logps/chosen": -431.21478271484375, + "logps/rejected": -528.8220825195312, + "loss": 0.4502, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": -1.3780217170715332, + "rewards/margins": 1.1094415187835693, + "rewards/rejected": -2.4874634742736816, "step": 3740 }, { "epoch": 0.98, - "grad_norm": 8.5625, + "grad_norm": 5.75, "learning_rate": 5.259716884556121e-09, - "logits/chosen": -1.4247804880142212, - "logits/rejected": -1.2844493389129639, - "logps/chosen": -516.2821044921875, - "logps/rejected": -618.6136474609375, - "loss": 0.4474, - "rewards/accuracies": 0.800000011920929, - "rewards/chosen": -2.5185599327087402, - "rewards/margins": 1.171399474143982, - "rewards/rejected": -3.689959764480591, + "logits/chosen": -1.9321876764297485, + "logits/rejected": -1.7817065715789795, + "logps/chosen": -427.5755920410156, + "logps/rejected": -513.5477294921875, + "loss": 0.4795, + "rewards/accuracies": 0.7875000238418579, + "rewards/chosen": -1.339813470840454, + "rewards/margins": 0.9209781885147095, + "rewards/rejected": -2.260791778564453, "step": 3750 }, { "epoch": 0.98, - "grad_norm": 8.875, + "grad_norm": 7.625, "learning_rate": 3.882801896372967e-09, - "logits/chosen": -1.4088770151138306, - "logits/rejected": -1.3459703922271729, - "logps/chosen": -512.896240234375, - "logps/rejected": -596.1285400390625, - "loss": 0.4873, - "rewards/accuracies": 0.737500011920929, - "rewards/chosen": -2.4756276607513428, - "rewards/margins": 1.1512353420257568, - "rewards/rejected": -3.6268630027770996, + "logits/chosen": -1.8706543445587158, + "logits/rejected": -1.8746263980865479, + "logps/chosen": -417.9456481933594, + "logps/rejected": -473.9695739746094, + "loss": 0.5329, + "rewards/accuracies": 0.706250011920929, + "rewards/chosen": -1.338267207145691, + "rewards/margins": 0.8854736089706421, + "rewards/rejected": -2.223741054534912, "step": 3760 }, { "epoch": 0.99, - "grad_norm": 8.875, + "grad_norm": 5.90625, "learning_rate": 2.7143119759026614e-09, - "logits/chosen": -1.4300106763839722, - "logits/rejected": -1.2483068704605103, - "logps/chosen": -532.7575073242188, - "logps/rejected": -620.56396484375, - "loss": 0.4266, - "rewards/accuracies": 0.768750011920929, - "rewards/chosen": -2.485426425933838, - "rewards/margins": 1.1331579685211182, - "rewards/rejected": -3.618584394454956, + "logits/chosen": -1.898192048072815, + "logits/rejected": -1.680114507675171, + "logps/chosen": -445.6121520996094, + "logps/rejected": -501.625732421875, + "loss": 0.4729, + "rewards/accuracies": 0.78125, + "rewards/chosen": -1.3821141719818115, + "rewards/margins": 0.848861575126648, + "rewards/rejected": -2.23097562789917, "step": 3770 }, { "epoch": 0.99, - "grad_norm": 8.875, + "grad_norm": 8.5, "learning_rate": 1.754344691717591e-09, - "logits/chosen": -1.3341867923736572, - "logits/rejected": -1.2818725109100342, - "logps/chosen": -508.2366638183594, - "logps/rejected": -624.2241821289062, - "loss": 0.5203, - "rewards/accuracies": 0.71875, - "rewards/chosen": -2.584681510925293, - "rewards/margins": 0.9391103982925415, - "rewards/rejected": -3.523791551589966, + "logits/chosen": -1.8438247442245483, + "logits/rejected": -1.7638375759124756, + "logps/chosen": -413.7787170410156, + "logps/rejected": -505.7743225097656, + "loss": 0.5463, + "rewards/accuracies": 0.668749988079071, + "rewards/chosen": -1.4394727945327759, + "rewards/margins": 0.7100211381912231, + "rewards/rejected": -2.149493932723999, "step": 3780 }, { "epoch": 0.99, - "grad_norm": 14.8125, + "grad_norm": 11.5625, "learning_rate": 1.0029802008096335e-09, - "logits/chosen": -1.3584728240966797, - "logits/rejected": -1.2166756391525269, - "logps/chosen": -531.0197143554688, - "logps/rejected": -623.9142456054688, - "loss": 0.4766, - "rewards/accuracies": 0.7562500238418579, - "rewards/chosen": -2.5674145221710205, - "rewards/margins": 1.1573880910873413, - "rewards/rejected": -3.7248024940490723, + "logits/chosen": -1.8373730182647705, + "logits/rejected": -1.7250454425811768, + "logps/chosen": -439.18878173828125, + "logps/rejected": -509.8990173339844, + "loss": 0.5098, + "rewards/accuracies": 0.71875, + "rewards/chosen": -1.4585397243499756, + "rewards/margins": 0.936810314655304, + "rewards/rejected": -2.3953499794006348, "step": 3790 }, { "epoch": 0.99, - "grad_norm": 10.0625, + "grad_norm": 5.53125, "learning_rate": 4.602812418974534e-10, - "logits/chosen": -1.4544346332550049, - "logits/rejected": -1.3297145366668701, - "logps/chosen": -539.0203857421875, - "logps/rejected": -625.6553955078125, - "loss": 0.5057, - "rewards/accuracies": 0.71875, - "rewards/chosen": -2.5723214149475098, - "rewards/margins": 1.1115500926971436, - "rewards/rejected": -3.6838715076446533, + "logits/chosen": -1.9282503128051758, + "logits/rejected": -1.8350452184677124, + "logps/chosen": -442.35992431640625, + "logps/rejected": -509.1976013183594, + "loss": 0.5001, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -1.3592561483383179, + "rewards/margins": 0.9274970293045044, + "rewards/rejected": -2.2867531776428223, "step": 3800 }, { "epoch": 0.99, - "eval_logits/chosen": -1.315586805343628, - "eval_logits/rejected": -1.1934481859207153, - "eval_logps/chosen": -520.3310546875, - "eval_logps/rejected": -610.6146240234375, - "eval_loss": 0.48894432187080383, - "eval_rewards/accuracies": 0.7419999837875366, - "eval_rewards/chosen": -2.557035446166992, - "eval_rewards/margins": 1.1032062768936157, - "eval_rewards/rejected": -3.6602416038513184, - "eval_runtime": 784.8709, - "eval_samples_per_second": 2.548, + "eval_logits/chosen": -1.9264984130859375, + "eval_logits/rejected": -1.813086986541748, + "eval_logps/chosen": -425.7129821777344, + "eval_logps/rejected": -492.45635986328125, + "eval_loss": 0.5055971741676331, + "eval_rewards/accuracies": 0.734499990940094, + "eval_rewards/chosen": -1.4057865142822266, + "eval_rewards/margins": 0.8863277435302734, + "eval_rewards/rejected": -2.2921142578125, + "eval_runtime": 782.8512, + "eval_samples_per_second": 2.555, "eval_steps_per_second": 0.319, "step": 3800 }, { "epoch": 1.0, - "grad_norm": 11.125, + "grad_norm": 7.25, "learning_rate": 1.2629313018819312e-10, - "logits/chosen": -1.3780701160430908, - "logits/rejected": -1.2597674131393433, - "logps/chosen": -503.51800537109375, - "logps/rejected": -583.5145874023438, - "loss": 0.5227, - "rewards/accuracies": 0.7250000238418579, - "rewards/chosen": -2.510735273361206, - "rewards/margins": 0.9568923115730286, - "rewards/rejected": -3.4676272869110107, + "logits/chosen": -1.8780736923217773, + "logits/rejected": -1.761028528213501, + "logps/chosen": -410.8787536621094, + "logps/rejected": -473.9859313964844, + "loss": 0.5112, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.3418636322021484, + "rewards/margins": 0.8411988019943237, + "rewards/rejected": -2.1830623149871826, "step": 3810 }, { "epoch": 1.0, - "grad_norm": 17.5, + "grad_norm": 12.1875, "learning_rate": 1.0437535929996855e-12, - "logits/chosen": -1.3732407093048096, - "logits/rejected": -1.2099716663360596, - "logps/chosen": -543.6559448242188, - "logps/rejected": -636.1058349609375, - "loss": 0.4576, - "rewards/accuracies": 0.762499988079071, - "rewards/chosen": -2.5689854621887207, - "rewards/margins": 1.371565580368042, - "rewards/rejected": -3.9405510425567627, + "logits/chosen": -1.8872268199920654, + "logits/rejected": -1.743740439414978, + "logps/chosen": -458.9305725097656, + "logps/rejected": -513.4288940429688, + "loss": 0.4832, + "rewards/accuracies": 0.7562500238418579, + "rewards/chosen": -1.4926271438598633, + "rewards/margins": 1.0204166173934937, + "rewards/rejected": -2.5130436420440674, "step": 3820 }, { "epoch": 1.0, "step": 3821, "total_flos": 0.0, - "train_loss": 0.5232822467709236, - "train_runtime": 82986.4165, - "train_samples_per_second": 0.737, + "train_loss": 0.5390157630246398, + "train_runtime": 82744.187, + "train_samples_per_second": 0.739, "train_steps_per_second": 0.046 } ],