diff --git "a/trainer_state.json" "b/trainer_state.json" --- "a/trainer_state.json" +++ "b/trainer_state.json" @@ -1,21 +1,21 @@ { "best_metric": null, "best_model_checkpoint": null, - "epoch": 1.0, + "epoch": 0.9997996125843297, "eval_steps": 100, - "global_step": 3821, + "global_step": 3742, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0, - "grad_norm": 1.96875, - "learning_rate": 1.3054830287206268e-08, - "logits/chosen": -2.804384231567383, - "logits/rejected": -2.6176326274871826, - "logps/chosen": -310.1795349121094, - "logps/rejected": -392.80609130859375, + "grad_norm": 1.28125, + "learning_rate": 1.3333333333333334e-08, + "logits/chosen": -3.025045394897461, + "logits/rejected": -2.9554684162139893, + "logps/chosen": -203.51824951171875, + "logps/rejected": -217.89437866210938, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, @@ -25,6354 +25,6218 @@ }, { "epoch": 0.0, - "grad_norm": 1.984375, - "learning_rate": 1.3054830287206266e-07, - "logits/chosen": -2.765848159790039, - "logits/rejected": -2.541754722595215, - "logps/chosen": -301.7055358886719, - "logps/rejected": -262.96173095703125, - "loss": 0.6932, - "rewards/accuracies": 0.4305555522441864, - "rewards/chosen": -5.9747020713984966e-05, - "rewards/margins": -4.961798185831867e-05, - "rewards/rejected": -1.0129070687980857e-05, + "grad_norm": 1.5234375, + "learning_rate": 1.3333333333333336e-07, + "logits/chosen": -2.852483034133911, + "logits/rejected": -2.5924882888793945, + "logps/chosen": -203.398681640625, + "logps/rejected": -200.5269012451172, + "loss": 0.6929, + "rewards/accuracies": 0.5138888955116272, + "rewards/chosen": 0.00020951780606992543, + "rewards/margins": 0.0005117396358400583, + "rewards/rejected": -0.0003022218297701329, "step": 10 }, { "epoch": 0.01, - "grad_norm": 2.3125, - "learning_rate": 2.610966057441253e-07, - "logits/chosen": -2.753605365753174, - "logits/rejected": -2.3991589546203613, - "logps/chosen": -328.1248474121094, - "logps/rejected": -254.85726928710938, - "loss": 0.6932, - "rewards/accuracies": 0.48750001192092896, - "rewards/chosen": 0.00012516247807070613, - "rewards/margins": -0.00015478665591217577, - "rewards/rejected": 0.00027994910487905145, + "grad_norm": 1.3515625, + "learning_rate": 2.666666666666667e-07, + "logits/chosen": -2.891592025756836, + "logits/rejected": -2.6086668968200684, + "logps/chosen": -196.91787719726562, + "logps/rejected": -190.93399047851562, + "loss": 0.6926, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -0.00016576508642174304, + "rewards/margins": 0.0010707227047532797, + "rewards/rejected": -0.001236487878486514, "step": 20 }, { "epoch": 0.01, - "grad_norm": 2.625, - "learning_rate": 3.9164490861618804e-07, - "logits/chosen": -2.6939728260040283, - "logits/rejected": -2.622387647628784, - "logps/chosen": -272.1611633300781, - "logps/rejected": -270.6854553222656, - "loss": 0.6931, - "rewards/accuracies": 0.4749999940395355, - "rewards/chosen": -0.0003803514118772, - "rewards/margins": 8.866041025612503e-05, - "rewards/rejected": -0.0004690118657890707, + "grad_norm": 1.25, + "learning_rate": 4.0000000000000003e-07, + "logits/chosen": -2.8247406482696533, + "logits/rejected": -2.615675449371338, + "logps/chosen": -184.14871215820312, + "logps/rejected": -184.94290161132812, + "loss": 0.6916, + "rewards/accuracies": 0.6812499761581421, + "rewards/chosen": -0.0007558080833405256, + "rewards/margins": 0.0030627017840743065, + "rewards/rejected": -0.0038185096345841885, "step": 30 }, { "epoch": 0.01, - "grad_norm": 1.6875, - "learning_rate": 5.221932114882506e-07, - "logits/chosen": -2.5652289390563965, - "logits/rejected": -2.5474331378936768, - "logps/chosen": -235.1596221923828, - "logps/rejected": -240.07699584960938, - "loss": 0.6926, - "rewards/accuracies": 0.5625, - "rewards/chosen": 0.00027505913749337196, - "rewards/margins": 0.001062754774466157, - "rewards/rejected": -0.0007876956951804459, + "grad_norm": 1.3125, + "learning_rate": 5.333333333333335e-07, + "logits/chosen": -2.7942700386047363, + "logits/rejected": -2.5588576793670654, + "logps/chosen": -190.18087768554688, + "logps/rejected": -184.86978149414062, + "loss": 0.69, + "rewards/accuracies": 0.762499988079071, + "rewards/chosen": -0.0013035403098911047, + "rewards/margins": 0.006345006637275219, + "rewards/rejected": -0.007648547179996967, "step": 40 }, { "epoch": 0.01, - "grad_norm": 1.65625, - "learning_rate": 6.527415143603135e-07, - "logits/chosen": -2.6258225440979004, - "logits/rejected": -2.6688971519470215, - "logps/chosen": -291.9371337890625, - "logps/rejected": -255.8798828125, - "loss": 0.6922, - "rewards/accuracies": 0.606249988079071, - "rewards/chosen": 0.0018380691763013601, - "rewards/margins": 0.0018592171836644411, - "rewards/rejected": -2.1147827283130027e-05, + "grad_norm": 1.5859375, + "learning_rate": 6.666666666666667e-07, + "logits/chosen": -2.7790920734405518, + "logits/rejected": -2.594180107116699, + "logps/chosen": -195.8500213623047, + "logps/rejected": -197.25079345703125, + "loss": 0.6867, + "rewards/accuracies": 0.793749988079071, + "rewards/chosen": 0.001157461549155414, + "rewards/margins": 0.01313072256743908, + "rewards/rejected": -0.011973262764513493, "step": 50 }, { "epoch": 0.02, - "grad_norm": 1.8046875, - "learning_rate": 7.832898172323761e-07, - "logits/chosen": -2.6339774131774902, - "logits/rejected": -2.42149019241333, - "logps/chosen": -276.26544189453125, - "logps/rejected": -248.28884887695312, - "loss": 0.6918, - "rewards/accuracies": 0.625, - "rewards/chosen": 0.0025974493473768234, - "rewards/margins": 0.0027662336360663176, - "rewards/rejected": -0.00016878444876056165, + "grad_norm": 1.3515625, + "learning_rate": 8.000000000000001e-07, + "logits/chosen": -2.7710585594177246, + "logits/rejected": -2.5242323875427246, + "logps/chosen": -185.81253051757812, + "logps/rejected": -195.85052490234375, + "loss": 0.6806, + "rewards/accuracies": 0.793749988079071, + "rewards/chosen": 0.009653949178755283, + "rewards/margins": 0.025605550035834312, + "rewards/rejected": -0.015951603651046753, "step": 60 }, { "epoch": 0.02, - "grad_norm": 1.8046875, - "learning_rate": 9.138381201044387e-07, - "logits/chosen": -2.7314560413360596, - "logits/rejected": -2.5053892135620117, - "logps/chosen": -295.3564758300781, - "logps/rejected": -268.2427673339844, - "loss": 0.6913, - "rewards/accuracies": 0.6312500238418579, - "rewards/chosen": 0.0015695008914917707, - "rewards/margins": 0.003719520289450884, - "rewards/rejected": -0.002150018932297826, + "grad_norm": 1.3984375, + "learning_rate": 9.333333333333334e-07, + "logits/chosen": -2.845823049545288, + "logits/rejected": -2.7732534408569336, + "logps/chosen": -185.01431274414062, + "logps/rejected": -185.13381958007812, + "loss": 0.6809, + "rewards/accuracies": 0.706250011920929, + "rewards/chosen": 0.025843212381005287, + "rewards/margins": 0.025248954072594643, + "rewards/rejected": 0.0005942584248259664, "step": 70 }, { "epoch": 0.02, - "grad_norm": 2.453125, - "learning_rate": 1.0443864229765013e-06, - "logits/chosen": -2.676331043243408, - "logits/rejected": -2.466033697128296, - "logps/chosen": -281.6533203125, - "logps/rejected": -268.70001220703125, - "loss": 0.6903, - "rewards/accuracies": 0.643750011920929, - "rewards/chosen": 0.0021942234598100185, - "rewards/margins": 0.00573298055678606, - "rewards/rejected": -0.003538756398484111, + "grad_norm": 1.421875, + "learning_rate": 1.066666666666667e-06, + "logits/chosen": -2.7801637649536133, + "logits/rejected": -2.580289840698242, + "logps/chosen": -207.74093627929688, + "logps/rejected": -203.61135864257812, + "loss": 0.6783, + "rewards/accuracies": 0.768750011920929, + "rewards/chosen": 0.03248121589422226, + "rewards/margins": 0.030628155916929245, + "rewards/rejected": 0.0018530559027567506, "step": 80 }, { "epoch": 0.02, - "grad_norm": 1.8125, - "learning_rate": 1.1749347258485642e-06, - "logits/chosen": -2.6481051445007324, - "logits/rejected": -2.495612621307373, - "logps/chosen": -272.77105712890625, - "logps/rejected": -253.6960906982422, - "loss": 0.6899, - "rewards/accuracies": 0.6625000238418579, - "rewards/chosen": 0.0026349679101258516, - "rewards/margins": 0.006577133201062679, - "rewards/rejected": -0.003942165989428759, + "grad_norm": 1.421875, + "learning_rate": 1.2000000000000002e-06, + "logits/chosen": -2.8460533618927, + "logits/rejected": -2.6127703189849854, + "logps/chosen": -184.6515655517578, + "logps/rejected": -189.5701446533203, + "loss": 0.6742, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": 0.015769537538290024, + "rewards/margins": 0.03935455530881882, + "rewards/rejected": -0.023585019633173943, "step": 90 }, { "epoch": 0.03, - "grad_norm": 1.890625, - "learning_rate": 1.305483028720627e-06, - "logits/chosen": -2.6623802185058594, - "logits/rejected": -2.56431245803833, - "logps/chosen": -269.0791320800781, - "logps/rejected": -250.7205047607422, - "loss": 0.6896, - "rewards/accuracies": 0.706250011920929, - "rewards/chosen": 0.00583293940871954, - "rewards/margins": 0.0073219239711761475, - "rewards/rejected": -0.0014889847952872515, + "grad_norm": 1.6484375, + "learning_rate": 1.3333333333333334e-06, + "logits/chosen": -2.798549175262451, + "logits/rejected": -2.63997220993042, + "logps/chosen": -186.33349609375, + "logps/rejected": -193.92739868164062, + "loss": 0.6698, + "rewards/accuracies": 0.84375, + "rewards/chosen": 0.00546322762966156, + "rewards/margins": 0.048379648476839066, + "rewards/rejected": -0.042916424572467804, "step": 100 }, { "epoch": 0.03, - "eval_logits/chosen": -2.690899610519409, - "eval_logits/rejected": -2.5586190223693848, - "eval_logps/chosen": -284.4092712402344, - "eval_logps/rejected": -263.477294921875, - "eval_loss": 0.6884487271308899, - "eval_rewards/accuracies": 0.6744999885559082, - "eval_rewards/chosen": 0.00724982051178813, - "eval_rewards/margins": 0.009573389776051044, - "eval_rewards/rejected": -0.0023235685657709837, - "eval_runtime": 784.8066, - "eval_samples_per_second": 2.548, - "eval_steps_per_second": 0.319, + "eval_logits/chosen": -2.69581937789917, + "eval_logits/rejected": -2.562171220779419, + "eval_logps/chosen": -288.78289794921875, + "eval_logps/rejected": -267.54718017578125, + "eval_loss": 0.6901289224624634, + "eval_rewards/accuracies": 0.5625, + "eval_rewards/chosen": -0.03648632764816284, + "eval_rewards/margins": 0.006536239292472601, + "eval_rewards/rejected": -0.04302256554365158, + "eval_runtime": 782.2962, + "eval_samples_per_second": 2.557, + "eval_steps_per_second": 0.32, "step": 100 }, { "epoch": 0.03, - "grad_norm": 2.125, - "learning_rate": 1.4360313315926894e-06, - "logits/chosen": -2.632530689239502, - "logits/rejected": -2.5299453735351562, - "logps/chosen": -309.1819763183594, - "logps/rejected": -259.67205810546875, - "loss": 0.6861, - "rewards/accuracies": 0.699999988079071, - "rewards/chosen": 0.009178686887025833, - "rewards/margins": 0.01440912764519453, - "rewards/rejected": -0.005230440758168697, + "grad_norm": 1.3125, + "learning_rate": 1.4666666666666669e-06, + "logits/chosen": -2.820195198059082, + "logits/rejected": -2.545072317123413, + "logps/chosen": -190.32762145996094, + "logps/rejected": -188.912353515625, + "loss": 0.6649, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.00024031568318605423, + "rewards/margins": 0.05883366987109184, + "rewards/rejected": -0.05859335511922836, "step": 110 }, { "epoch": 0.03, - "grad_norm": 1.7578125, - "learning_rate": 1.5665796344647521e-06, - "logits/chosen": -2.6648287773132324, - "logits/rejected": -2.5776381492614746, - "logps/chosen": -311.6719665527344, - "logps/rejected": -289.6720275878906, - "loss": 0.6842, - "rewards/accuracies": 0.75, - "rewards/chosen": 0.014281374402344227, - "rewards/margins": 0.018208980560302734, - "rewards/rejected": -0.003927607089281082, + "grad_norm": 1.5625, + "learning_rate": 1.6000000000000001e-06, + "logits/chosen": -2.7726855278015137, + "logits/rejected": -2.5571906566619873, + "logps/chosen": -194.81582641601562, + "logps/rejected": -202.47158813476562, + "loss": 0.6592, + "rewards/accuracies": 0.887499988079071, + "rewards/chosen": -6.427392509067431e-05, + "rewards/margins": 0.07058823108673096, + "rewards/rejected": -0.07065249979496002, "step": 120 }, { "epoch": 0.03, - "grad_norm": 2.203125, - "learning_rate": 1.6971279373368146e-06, - "logits/chosen": -2.6542999744415283, - "logits/rejected": -2.6270055770874023, - "logps/chosen": -272.4345397949219, - "logps/rejected": -271.96484375, - "loss": 0.6834, - "rewards/accuracies": 0.71875, - "rewards/chosen": 0.015841448679566383, - "rewards/margins": 0.019856764003634453, - "rewards/rejected": -0.004015314858406782, + "grad_norm": 1.3984375, + "learning_rate": 1.7333333333333336e-06, + "logits/chosen": -2.8007471561431885, + "logits/rejected": -2.6735222339630127, + "logps/chosen": -180.63816833496094, + "logps/rejected": -196.3402099609375, + "loss": 0.6591, + "rewards/accuracies": 0.824999988079071, + "rewards/chosen": -0.001957343192771077, + "rewards/margins": 0.07113702595233917, + "rewards/rejected": -0.07309436798095703, "step": 130 }, { "epoch": 0.04, - "grad_norm": 1.921875, - "learning_rate": 1.8276762402088774e-06, - "logits/chosen": -2.7620275020599365, - "logits/rejected": -2.4180870056152344, - "logps/chosen": -292.2891845703125, - "logps/rejected": -249.4049530029297, - "loss": 0.6847, - "rewards/accuracies": 0.637499988079071, - "rewards/chosen": 0.014797434210777283, - "rewards/margins": 0.017552640289068222, - "rewards/rejected": -0.0027552072424441576, + "grad_norm": 1.5234375, + "learning_rate": 1.8666666666666669e-06, + "logits/chosen": -2.764997959136963, + "logits/rejected": -2.5524096488952637, + "logps/chosen": -184.10682678222656, + "logps/rejected": -201.29014587402344, + "loss": 0.6534, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.01610555127263069, + "rewards/margins": 0.0830421894788742, + "rewards/rejected": -0.06693664193153381, "step": 140 }, { "epoch": 0.04, - "grad_norm": 2.078125, - "learning_rate": 1.9582245430809403e-06, - "logits/chosen": -2.770200252532959, - "logits/rejected": -2.5172462463378906, - "logps/chosen": -299.77508544921875, - "logps/rejected": -258.18353271484375, - "loss": 0.682, - "rewards/accuracies": 0.65625, - "rewards/chosen": 0.019656788557767868, - "rewards/margins": 0.023214135318994522, - "rewards/rejected": -0.003557346761226654, + "grad_norm": 1.3359375, + "learning_rate": 2.0000000000000003e-06, + "logits/chosen": -2.812412738800049, + "logits/rejected": -2.6014838218688965, + "logps/chosen": -193.41714477539062, + "logps/rejected": -208.3372344970703, + "loss": 0.6436, + "rewards/accuracies": 0.8500000238418579, + "rewards/chosen": -0.01172459963709116, + "rewards/margins": 0.10551564395427704, + "rewards/rejected": -0.11724023520946503, "step": 150 }, { "epoch": 0.04, - "grad_norm": 1.9921875, - "learning_rate": 2.0887728459530026e-06, - "logits/chosen": -2.698042392730713, - "logits/rejected": -2.585510730743408, - "logps/chosen": -275.923095703125, - "logps/rejected": -277.31927490234375, - "loss": 0.683, - "rewards/accuracies": 0.643750011920929, - "rewards/chosen": 0.018199656158685684, - "rewards/margins": 0.021459590643644333, - "rewards/rejected": -0.003259934950619936, + "grad_norm": 1.6640625, + "learning_rate": 2.133333333333334e-06, + "logits/chosen": -2.8185415267944336, + "logits/rejected": -2.615097761154175, + "logps/chosen": -195.69102478027344, + "logps/rejected": -213.40408325195312, + "loss": 0.6294, + "rewards/accuracies": 0.9125000238418579, + "rewards/chosen": -0.03036859631538391, + "rewards/margins": 0.13596105575561523, + "rewards/rejected": -0.16632965207099915, "step": 160 }, { - "epoch": 0.04, - "grad_norm": 2.265625, - "learning_rate": 2.2193211488250653e-06, - "logits/chosen": -2.6444950103759766, - "logits/rejected": -2.464858293533325, - "logps/chosen": -237.28213500976562, - "logps/rejected": -239.7051239013672, - "loss": 0.6839, - "rewards/accuracies": 0.668749988079071, - "rewards/chosen": 0.01709694229066372, - "rewards/margins": 0.019580338150262833, - "rewards/rejected": -0.0024833965580910444, + "epoch": 0.05, + "grad_norm": 1.75, + "learning_rate": 2.266666666666667e-06, + "logits/chosen": -2.826056957244873, + "logits/rejected": -2.6662731170654297, + "logps/chosen": -205.1206512451172, + "logps/rejected": -218.390869140625, + "loss": 0.6173, + "rewards/accuracies": 0.893750011920929, + "rewards/chosen": 0.005031009670346975, + "rewards/margins": 0.1644003987312317, + "rewards/rejected": -0.15936937928199768, "step": 170 }, { "epoch": 0.05, - "grad_norm": 1.6953125, - "learning_rate": 2.3498694516971284e-06, - "logits/chosen": -2.6247406005859375, - "logits/rejected": -2.4786934852600098, - "logps/chosen": -275.9543151855469, - "logps/rejected": -262.0204772949219, - "loss": 0.679, - "rewards/accuracies": 0.65625, - "rewards/chosen": 0.02462857961654663, - "rewards/margins": 0.029734212905168533, - "rewards/rejected": -0.005105632357299328, + "grad_norm": 1.609375, + "learning_rate": 2.4000000000000003e-06, + "logits/chosen": -2.845885753631592, + "logits/rejected": -2.6642587184906006, + "logps/chosen": -207.2623748779297, + "logps/rejected": -219.3222198486328, + "loss": 0.621, + "rewards/accuracies": 0.8500000238418579, + "rewards/chosen": -0.06885858625173569, + "rewards/margins": 0.15684355795383453, + "rewards/rejected": -0.22570213675498962, "step": 180 }, { "epoch": 0.05, - "grad_norm": 3.34375, - "learning_rate": 2.4804177545691907e-06, - "logits/chosen": -2.7437422275543213, - "logits/rejected": -2.514353036880493, - "logps/chosen": -289.6897277832031, - "logps/rejected": -259.26397705078125, - "loss": 0.6762, - "rewards/accuracies": 0.643750011920929, - "rewards/chosen": 0.03337350860238075, - "rewards/margins": 0.035980306565761566, - "rewards/rejected": -0.0026067979633808136, + "grad_norm": 1.9921875, + "learning_rate": 2.5333333333333338e-06, + "logits/chosen": -2.8301947116851807, + "logits/rejected": -2.6005446910858154, + "logps/chosen": -194.04434204101562, + "logps/rejected": -226.3549346923828, + "loss": 0.6042, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -0.010565501637756824, + "rewards/margins": 0.1950242817401886, + "rewards/rejected": -0.20558981597423553, "step": 190 }, { "epoch": 0.05, - "grad_norm": 2.140625, - "learning_rate": 2.610966057441254e-06, - "logits/chosen": -2.741255760192871, - "logits/rejected": -2.47418212890625, - "logps/chosen": -268.6815490722656, - "logps/rejected": -230.81680297851562, - "loss": 0.6699, - "rewards/accuracies": 0.7749999761581421, - "rewards/chosen": 0.014004526659846306, - "rewards/margins": 0.049270953983068466, - "rewards/rejected": -0.03526642546057701, + "grad_norm": 1.90625, + "learning_rate": 2.666666666666667e-06, + "logits/chosen": -2.886862277984619, + "logits/rejected": -2.6888222694396973, + "logps/chosen": -198.06605529785156, + "logps/rejected": -230.2741241455078, + "loss": 0.5864, + "rewards/accuracies": 0.893750011920929, + "rewards/chosen": -0.08909536898136139, + "rewards/margins": 0.23902097344398499, + "rewards/rejected": -0.32811635732650757, "step": 200 }, { "epoch": 0.05, - "eval_logits/chosen": -2.68129825592041, - "eval_logits/rejected": -2.5495352745056152, - "eval_logps/chosen": -284.310302734375, - "eval_logps/rejected": -266.82989501953125, - "eval_loss": 0.6725865006446838, - "eval_rewards/accuracies": 0.6894999742507935, - "eval_rewards/chosen": 0.008239748887717724, - "eval_rewards/margins": 0.04408977925777435, - "eval_rewards/rejected": -0.035850029438734055, - "eval_runtime": 783.4542, - "eval_samples_per_second": 2.553, - "eval_steps_per_second": 0.319, + "eval_logits/chosen": -2.721036434173584, + "eval_logits/rejected": -2.583653211593628, + "eval_logps/chosen": -300.270263671875, + "eval_logps/rejected": -283.8880615234375, + "eval_loss": 0.6711774468421936, + "eval_rewards/accuracies": 0.5914999842643738, + "eval_rewards/chosen": -0.15135958790779114, + "eval_rewards/margins": 0.055071912705898285, + "eval_rewards/rejected": -0.20643149316310883, + "eval_runtime": 781.0761, + "eval_samples_per_second": 2.561, + "eval_steps_per_second": 0.32, "step": 200 }, { - "epoch": 0.05, - "grad_norm": 1.90625, - "learning_rate": 2.741514360313316e-06, - "logits/chosen": -2.6879875659942627, - "logits/rejected": -2.4686179161071777, - "logps/chosen": -277.16839599609375, - "logps/rejected": -251.56875610351562, - "loss": 0.6655, - "rewards/accuracies": 0.7124999761581421, - "rewards/chosen": 0.01113974954932928, - "rewards/margins": 0.05906779691576958, - "rewards/rejected": -0.04792805388569832, + "epoch": 0.06, + "grad_norm": 2.09375, + "learning_rate": 2.8000000000000003e-06, + "logits/chosen": -2.8395516872406006, + "logits/rejected": -2.6661925315856934, + "logps/chosen": -191.8214874267578, + "logps/rejected": -225.88900756835938, + "loss": 0.5772, + "rewards/accuracies": 0.8812500238418579, + "rewards/chosen": -0.06861170381307602, + "rewards/margins": 0.2652565836906433, + "rewards/rejected": -0.3338682949542999, "step": 210 }, { "epoch": 0.06, - "grad_norm": 1.9140625, - "learning_rate": 2.872062663185379e-06, - "logits/chosen": -2.657982110977173, - "logits/rejected": -2.5118064880371094, - "logps/chosen": -275.3629150390625, - "logps/rejected": -249.0461883544922, - "loss": 0.6669, - "rewards/accuracies": 0.675000011920929, - "rewards/chosen": 0.009381966665387154, - "rewards/margins": 0.058504533022642136, - "rewards/rejected": -0.04912256821990013, + "grad_norm": 1.9453125, + "learning_rate": 2.9333333333333338e-06, + "logits/chosen": -2.861643075942993, + "logits/rejected": -2.617266893386841, + "logps/chosen": -202.16476440429688, + "logps/rejected": -233.1599578857422, + "loss": 0.5759, + "rewards/accuracies": 0.8812500238418579, + "rewards/chosen": -0.09456236660480499, + "rewards/margins": 0.27134519815444946, + "rewards/rejected": -0.36590754985809326, "step": 220 }, { "epoch": 0.06, - "grad_norm": 2.515625, - "learning_rate": 3.0026109660574416e-06, - "logits/chosen": -2.734462261199951, - "logits/rejected": -2.523545265197754, - "logps/chosen": -324.44927978515625, - "logps/rejected": -291.8562316894531, - "loss": 0.6691, - "rewards/accuracies": 0.668749988079071, - "rewards/chosen": 0.009536907076835632, - "rewards/margins": 0.053029656410217285, - "rewards/rejected": -0.04349275305867195, + "grad_norm": 3.171875, + "learning_rate": 3.066666666666667e-06, + "logits/chosen": -2.8064911365509033, + "logits/rejected": -2.6874587535858154, + "logps/chosen": -211.124267578125, + "logps/rejected": -253.9136962890625, + "loss": 0.5572, + "rewards/accuracies": 0.893750011920929, + "rewards/chosen": -0.11212892830371857, + "rewards/margins": 0.3169172704219818, + "rewards/rejected": -0.4290461540222168, "step": 230 }, { "epoch": 0.06, - "grad_norm": 1.9609375, - "learning_rate": 3.1331592689295043e-06, - "logits/chosen": -2.7479310035705566, - "logits/rejected": -2.4911446571350098, - "logps/chosen": -316.1883850097656, - "logps/rejected": -306.96759033203125, - "loss": 0.6644, - "rewards/accuracies": 0.643750011920929, - "rewards/chosen": -0.010969454422593117, - "rewards/margins": 0.06440214812755585, - "rewards/rejected": -0.07537160068750381, + "grad_norm": 3.40625, + "learning_rate": 3.2000000000000003e-06, + "logits/chosen": -2.8507437705993652, + "logits/rejected": -2.614583969116211, + "logps/chosen": -207.95834350585938, + "logps/rejected": -242.54525756835938, + "loss": 0.5409, + "rewards/accuracies": 0.925000011920929, + "rewards/chosen": -0.1064368262887001, + "rewards/margins": 0.3595541715621948, + "rewards/rejected": -0.46599096059799194, "step": 240 }, { "epoch": 0.07, - "grad_norm": 1.96875, - "learning_rate": 3.263707571801567e-06, - "logits/chosen": -2.632441997528076, - "logits/rejected": -2.4809885025024414, - "logps/chosen": -281.6483459472656, - "logps/rejected": -259.15728759765625, - "loss": 0.6613, - "rewards/accuracies": 0.706250011920929, - "rewards/chosen": -0.01294863224029541, - "rewards/margins": 0.07276834547519684, - "rewards/rejected": -0.08571697771549225, + "grad_norm": 3.375, + "learning_rate": 3.3333333333333333e-06, + "logits/chosen": -2.758396625518799, + "logits/rejected": -2.6282601356506348, + "logps/chosen": -205.42391967773438, + "logps/rejected": -261.75799560546875, + "loss": 0.5237, + "rewards/accuracies": 0.887499988079071, + "rewards/chosen": -0.1666347235441208, + "rewards/margins": 0.413015216588974, + "rewards/rejected": -0.5796499252319336, "step": 250 }, { "epoch": 0.07, - "grad_norm": 1.9921875, - "learning_rate": 3.3942558746736293e-06, - "logits/chosen": -2.7068283557891846, - "logits/rejected": -2.490290403366089, - "logps/chosen": -300.8772277832031, - "logps/rejected": -287.4792175292969, - "loss": 0.6643, - "rewards/accuracies": 0.6625000238418579, - "rewards/chosen": -0.0049089426174759865, - "rewards/margins": 0.07015503942966461, - "rewards/rejected": -0.07506398111581802, + "grad_norm": 4.34375, + "learning_rate": 3.4666666666666672e-06, + "logits/chosen": -2.8751864433288574, + "logits/rejected": -2.75618052482605, + "logps/chosen": -206.491943359375, + "logps/rejected": -259.9626770019531, + "loss": 0.5182, + "rewards/accuracies": 0.893750011920929, + "rewards/chosen": -0.12468788772821426, + "rewards/margins": 0.4317372739315033, + "rewards/rejected": -0.556425154209137, "step": 260 }, { "epoch": 0.07, - "grad_norm": 1.8359375, - "learning_rate": 3.524804177545692e-06, - "logits/chosen": -2.611229658126831, - "logits/rejected": -2.4650397300720215, - "logps/chosen": -283.0829772949219, - "logps/rejected": -258.4039611816406, - "loss": 0.6393, - "rewards/accuracies": 0.7250000238418579, - "rewards/chosen": 0.00923145655542612, - "rewards/margins": 0.12286220490932465, - "rewards/rejected": -0.1136307492852211, + "grad_norm": 2.21875, + "learning_rate": 3.6000000000000003e-06, + "logits/chosen": -2.9339048862457275, + "logits/rejected": -2.759460210800171, + "logps/chosen": -219.48843383789062, + "logps/rejected": -272.0813293457031, + "loss": 0.4921, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -0.20950379967689514, + "rewards/margins": 0.4998226761817932, + "rewards/rejected": -0.709326446056366, "step": 270 }, { "epoch": 0.07, - "grad_norm": 2.515625, - "learning_rate": 3.6553524804177547e-06, - "logits/chosen": -2.7161974906921387, - "logits/rejected": -2.5690791606903076, - "logps/chosen": -281.7822570800781, - "logps/rejected": -267.12957763671875, - "loss": 0.6531, - "rewards/accuracies": 0.643750011920929, - "rewards/chosen": -0.027930429205298424, - "rewards/margins": 0.09523724019527435, - "rewards/rejected": -0.12316767871379852, + "grad_norm": 5.4375, + "learning_rate": 3.7333333333333337e-06, + "logits/chosen": -2.836683511734009, + "logits/rejected": -2.7271854877471924, + "logps/chosen": -229.08169555664062, + "logps/rejected": -287.5020446777344, + "loss": 0.4868, + "rewards/accuracies": 0.9125000238418579, + "rewards/chosen": -0.2740945518016815, + "rewards/margins": 0.520361065864563, + "rewards/rejected": -0.7944557070732117, "step": 280 }, { "epoch": 0.08, - "grad_norm": 2.59375, - "learning_rate": 3.7859007832898174e-06, - "logits/chosen": -2.6117746829986572, - "logits/rejected": -2.537930727005005, - "logps/chosen": -311.3076171875, - "logps/rejected": -297.04058837890625, - "loss": 0.6439, - "rewards/accuracies": 0.6625000238418579, - "rewards/chosen": -0.04281552881002426, - "rewards/margins": 0.11832265555858612, - "rewards/rejected": -0.16113819181919098, + "grad_norm": 4.78125, + "learning_rate": 3.866666666666667e-06, + "logits/chosen": -2.8678526878356934, + "logits/rejected": -2.572840690612793, + "logps/chosen": -224.20431518554688, + "logps/rejected": -272.62664794921875, + "loss": 0.4813, + "rewards/accuracies": 0.9312499761581421, + "rewards/chosen": -0.2870492935180664, + "rewards/margins": 0.539145827293396, + "rewards/rejected": -0.8261950612068176, "step": 290 }, { "epoch": 0.08, - "grad_norm": 2.65625, - "learning_rate": 3.9164490861618806e-06, - "logits/chosen": -2.667273998260498, - "logits/rejected": -2.4587292671203613, - "logps/chosen": -275.95428466796875, - "logps/rejected": -270.0225830078125, - "loss": 0.636, - "rewards/accuracies": 0.71875, - "rewards/chosen": -0.06665624678134918, - "rewards/margins": 0.13395485281944275, - "rewards/rejected": -0.2006111443042755, + "grad_norm": 3.765625, + "learning_rate": 4.000000000000001e-06, + "logits/chosen": -2.9098057746887207, + "logits/rejected": -2.6796698570251465, + "logps/chosen": -236.15304565429688, + "logps/rejected": -302.85540771484375, + "loss": 0.4604, + "rewards/accuracies": 0.956250011920929, + "rewards/chosen": -0.3245059847831726, + "rewards/margins": 0.5988339185714722, + "rewards/rejected": -0.9233399629592896, "step": 300 }, { "epoch": 0.08, - "eval_logits/chosen": -2.6823012828826904, - "eval_logits/rejected": -2.552032947540283, - "eval_logps/chosen": -285.1533508300781, - "eval_logps/rejected": -274.4986877441406, - "eval_loss": 0.6465616226196289, - "eval_rewards/accuracies": 0.6779999732971191, - "eval_rewards/chosen": -0.00019080757920164615, - "eval_rewards/margins": 0.11234673112630844, - "eval_rewards/rejected": -0.11253754794597626, - "eval_runtime": 783.4928, - "eval_samples_per_second": 2.553, - "eval_steps_per_second": 0.319, + "eval_logits/chosen": -2.7866616249084473, + "eval_logits/rejected": -2.646491050720215, + "eval_logps/chosen": -319.6733703613281, + "eval_logps/rejected": -309.0577087402344, + "eval_loss": 0.6634631752967834, + "eval_rewards/accuracies": 0.5755000114440918, + "eval_rewards/chosen": -0.34539124369621277, + "eval_rewards/margins": 0.11273663491010666, + "eval_rewards/rejected": -0.45812782645225525, + "eval_runtime": 781.0865, + "eval_samples_per_second": 2.561, + "eval_steps_per_second": 0.32, "step": 300 }, { "epoch": 0.08, - "grad_norm": 2.453125, - "learning_rate": 4.046997389033943e-06, - "logits/chosen": -2.8026623725891113, - "logits/rejected": -2.601572036743164, - "logps/chosen": -306.88897705078125, - "logps/rejected": -264.5550231933594, - "loss": 0.627, - "rewards/accuracies": 0.7250000238418579, - "rewards/chosen": 0.028339838609099388, - "rewards/margins": 0.15499535202980042, - "rewards/rejected": -0.12665551900863647, + "grad_norm": 4.71875, + "learning_rate": 4.133333333333333e-06, + "logits/chosen": -2.762162446975708, + "logits/rejected": -2.5808608531951904, + "logps/chosen": -220.88076782226562, + "logps/rejected": -284.20111083984375, + "loss": 0.4536, + "rewards/accuracies": 0.9437500238418579, + "rewards/chosen": -0.34411242604255676, + "rewards/margins": 0.6413618922233582, + "rewards/rejected": -0.9854742884635925, "step": 310 }, { - "epoch": 0.08, - "grad_norm": 3.296875, - "learning_rate": 4.177545691906005e-06, - "logits/chosen": -2.6788418292999268, - "logits/rejected": -2.460413932800293, - "logps/chosen": -285.0597839355469, - "logps/rejected": -278.0973205566406, - "loss": 0.6448, - "rewards/accuracies": 0.706250011920929, - "rewards/chosen": -0.089615099132061, - "rewards/margins": 0.1233007088303566, - "rewards/rejected": -0.2129158079624176, + "epoch": 0.09, + "grad_norm": 2.921875, + "learning_rate": 4.266666666666668e-06, + "logits/chosen": -2.8597700595855713, + "logits/rejected": -2.6923470497131348, + "logps/chosen": -216.56784057617188, + "logps/rejected": -292.9040222167969, + "loss": 0.4418, + "rewards/accuracies": 0.9437500238418579, + "rewards/chosen": -0.37289419770240784, + "rewards/margins": 0.6711525917053223, + "rewards/rejected": -1.0440467596054077, "step": 320 }, { "epoch": 0.09, - "grad_norm": 2.78125, - "learning_rate": 4.308093994778068e-06, - "logits/chosen": -2.6454341411590576, - "logits/rejected": -2.417539119720459, - "logps/chosen": -291.4884338378906, - "logps/rejected": -275.2319641113281, - "loss": 0.6474, - "rewards/accuracies": 0.699999988079071, - "rewards/chosen": -0.10766121000051498, - "rewards/margins": 0.11833895742893219, - "rewards/rejected": -0.22600015997886658, + "grad_norm": 3.40625, + "learning_rate": 4.4e-06, + "logits/chosen": -2.923088312149048, + "logits/rejected": -2.7437281608581543, + "logps/chosen": -242.5861358642578, + "logps/rejected": -329.2769775390625, + "loss": 0.4273, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -0.5358756184577942, + "rewards/margins": 0.750207781791687, + "rewards/rejected": -1.286083459854126, "step": 330 }, { "epoch": 0.09, - "grad_norm": 2.765625, - "learning_rate": 4.4386422976501306e-06, - "logits/chosen": -2.6752266883850098, - "logits/rejected": -2.5314574241638184, - "logps/chosen": -311.11553955078125, - "logps/rejected": -302.4649963378906, - "loss": 0.6335, - "rewards/accuracies": 0.706250011920929, - "rewards/chosen": -0.005945128854364157, - "rewards/margins": 0.15127454698085785, - "rewards/rejected": -0.15721969306468964, + "grad_norm": 4.46875, + "learning_rate": 4.533333333333334e-06, + "logits/chosen": -2.840907096862793, + "logits/rejected": -2.619574785232544, + "logps/chosen": -250.60830688476562, + "logps/rejected": -342.74652099609375, + "loss": 0.417, + "rewards/accuracies": 0.9437500238418579, + "rewards/chosen": -0.5628878474235535, + "rewards/margins": 0.7784547805786133, + "rewards/rejected": -1.3413426876068115, "step": 340 }, { "epoch": 0.09, - "grad_norm": 3.15625, - "learning_rate": 4.569190600522193e-06, - "logits/chosen": -2.5868887901306152, - "logits/rejected": -2.4233832359313965, - "logps/chosen": -323.6490173339844, - "logps/rejected": -323.33380126953125, - "loss": 0.6437, - "rewards/accuracies": 0.6499999761581421, - "rewards/chosen": -0.11508532613515854, - "rewards/margins": 0.13315364718437195, - "rewards/rejected": -0.2482389658689499, + "grad_norm": 3.875, + "learning_rate": 4.666666666666667e-06, + "logits/chosen": -2.8517518043518066, + "logits/rejected": -2.663865089416504, + "logps/chosen": -267.8255920410156, + "logps/rejected": -348.36627197265625, + "loss": 0.4245, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -0.7652944922447205, + "rewards/margins": 0.7990447282791138, + "rewards/rejected": -1.5643391609191895, "step": 350 }, { - "epoch": 0.09, - "grad_norm": 2.515625, - "learning_rate": 4.699738903394257e-06, - "logits/chosen": -2.5959036350250244, - "logits/rejected": -2.5964016914367676, - "logps/chosen": -274.69927978515625, - "logps/rejected": -269.53741455078125, - "loss": 0.632, - "rewards/accuracies": 0.731249988079071, - "rewards/chosen": -0.15621377527713776, - "rewards/margins": 0.15650849044322968, - "rewards/rejected": -0.31272226572036743, + "epoch": 0.1, + "grad_norm": 4.375, + "learning_rate": 4.800000000000001e-06, + "logits/chosen": -2.710493564605713, + "logits/rejected": -2.574153184890747, + "logps/chosen": -266.52459716796875, + "logps/rejected": -371.8028259277344, + "loss": 0.403, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -0.861789345741272, + "rewards/margins": 0.9031310081481934, + "rewards/rejected": -1.7649204730987549, "step": 360 }, { "epoch": 0.1, - "grad_norm": 3.203125, - "learning_rate": 4.8302872062663196e-06, - "logits/chosen": -2.661418914794922, - "logits/rejected": -2.4857161045074463, - "logps/chosen": -309.907958984375, - "logps/rejected": -284.32794189453125, - "loss": 0.613, - "rewards/accuracies": 0.71875, - "rewards/chosen": -0.1116505041718483, - "rewards/margins": 0.20645570755004883, - "rewards/rejected": -0.3181062340736389, + "grad_norm": 5.34375, + "learning_rate": 4.933333333333334e-06, + "logits/chosen": -2.7110581398010254, + "logits/rejected": -2.490288019180298, + "logps/chosen": -316.9415283203125, + "logps/rejected": -405.4663391113281, + "loss": 0.4073, + "rewards/accuracies": 0.8812500238418579, + "rewards/chosen": -1.1048616170883179, + "rewards/margins": 0.959460437297821, + "rewards/rejected": -2.064321994781494, "step": 370 }, { "epoch": 0.1, - "grad_norm": 4.1875, - "learning_rate": 4.9608355091383814e-06, - "logits/chosen": -2.6475014686584473, - "logits/rejected": -2.4438838958740234, - "logps/chosen": -330.4369812011719, - "logps/rejected": -315.2841796875, - "loss": 0.6096, - "rewards/accuracies": 0.706250011920929, - "rewards/chosen": -0.11033549159765244, - "rewards/margins": 0.22602955996990204, - "rewards/rejected": -0.3363650441169739, + "grad_norm": 6.4375, + "learning_rate": 4.999972794121976e-06, + "logits/chosen": -2.7278122901916504, + "logits/rejected": -2.473001003265381, + "logps/chosen": -306.61614990234375, + "logps/rejected": -403.7631530761719, + "loss": 0.3723, + "rewards/accuracies": 0.887499988079071, + "rewards/chosen": -1.060858964920044, + "rewards/margins": 1.0833942890167236, + "rewards/rejected": -2.1442532539367676, "step": 380 }, { "epoch": 0.1, - "grad_norm": 3.671875, - "learning_rate": 4.9999488562447675e-06, - "logits/chosen": -2.6455321311950684, - "logits/rejected": -2.537829875946045, - "logps/chosen": -308.32122802734375, - "logps/rejected": -309.0108947753906, - "loss": 0.6092, - "rewards/accuracies": 0.6937500238418579, - "rewards/chosen": -0.06899285316467285, - "rewards/margins": 0.21975460648536682, - "rewards/rejected": -0.2887474298477173, + "grad_norm": 6.625, + "learning_rate": 4.999755150650535e-06, + "logits/chosen": -2.621072292327881, + "logits/rejected": -2.448957920074463, + "logps/chosen": -318.9244384765625, + "logps/rejected": -465.15625, + "loss": 0.317, + "rewards/accuracies": 0.925000011920929, + "rewards/chosen": -1.2451636791229248, + "rewards/margins": 1.3955293893814087, + "rewards/rejected": -2.640693187713623, "step": 390 }, { - "epoch": 0.1, - "grad_norm": 3.125, - "learning_rate": 4.999698361256577e-06, - "logits/chosen": -2.740412712097168, - "logits/rejected": -2.414173126220703, - "logps/chosen": -291.7049255371094, - "logps/rejected": -265.08746337890625, - "loss": 0.6312, - "rewards/accuracies": 0.6812499761581421, - "rewards/chosen": -0.0857996717095375, - "rewards/margins": 0.16563723981380463, - "rewards/rejected": -0.2514369487762451, + "epoch": 0.11, + "grad_norm": 10.4375, + "learning_rate": 4.99931988265533e-06, + "logits/chosen": -2.5788636207580566, + "logits/rejected": -2.3275580406188965, + "logps/chosen": -312.1104431152344, + "logps/rejected": -457.56646728515625, + "loss": 0.3274, + "rewards/accuracies": 0.90625, + "rewards/chosen": -1.3098253011703491, + "rewards/margins": 1.4334557056427002, + "rewards/rejected": -2.743281126022339, "step": 400 }, { - "epoch": 0.1, - "eval_logits/chosen": -2.652951717376709, - "eval_logits/rejected": -2.5250518321990967, - "eval_logps/chosen": -306.51312255859375, - "eval_logps/rejected": -305.46551513671875, - "eval_loss": 0.6190703511238098, - "eval_rewards/accuracies": 0.6804999709129333, - "eval_rewards/chosen": -0.21378836035728455, - "eval_rewards/margins": 0.20841743052005768, - "eval_rewards/rejected": -0.4222058057785034, - "eval_runtime": 783.195, - "eval_samples_per_second": 2.554, - "eval_steps_per_second": 0.319, + "epoch": 0.11, + "eval_logits/chosen": -2.465298652648926, + "eval_logits/rejected": -2.3317997455596924, + "eval_logps/chosen": -425.34393310546875, + "eval_logps/rejected": -439.54949951171875, + "eval_loss": 0.6736099720001221, + "eval_rewards/accuracies": 0.5864999890327454, + "eval_rewards/chosen": -1.4020962715148926, + "eval_rewards/margins": 0.36094897985458374, + "eval_rewards/rejected": -1.7630454301834106, + "eval_runtime": 780.8961, + "eval_samples_per_second": 2.561, + "eval_steps_per_second": 0.32, "step": 400 }, { "epoch": 0.11, - "grad_norm": 3.265625, - "learning_rate": 4.999239142174581e-06, - "logits/chosen": -2.567608594894409, - "logits/rejected": -2.524784803390503, - "logps/chosen": -282.33831787109375, - "logps/rejected": -294.5062255859375, - "loss": 0.6538, - "rewards/accuracies": 0.606249988079071, - "rewards/chosen": -0.22671082615852356, - "rewards/margins": 0.12887807190418243, - "rewards/rejected": -0.3555889129638672, + "grad_norm": 3.34375, + "learning_rate": 4.998667028030071e-06, + "logits/chosen": -2.462109327316284, + "logits/rejected": -2.3724803924560547, + "logps/chosen": -334.2771911621094, + "logps/rejected": -479.7632751464844, + "loss": 0.351, + "rewards/accuracies": 0.84375, + "rewards/chosen": -1.4377202987670898, + "rewards/margins": 1.3972641229629517, + "rewards/rejected": -2.834984540939331, "step": 410 }, { "epoch": 0.11, - "grad_norm": 3.59375, - "learning_rate": 4.99857123734344e-06, - "logits/chosen": -2.568328619003296, - "logits/rejected": -2.4854254722595215, - "logps/chosen": -258.9724426269531, - "logps/rejected": -273.19512939453125, - "loss": 0.6095, - "rewards/accuracies": 0.7124999761581421, - "rewards/chosen": -0.1116761714220047, - "rewards/margins": 0.22403624653816223, - "rewards/rejected": -0.33571237325668335, + "grad_norm": 11.875, + "learning_rate": 4.997796643611192e-06, + "logits/chosen": -2.428496837615967, + "logits/rejected": -2.3284060955047607, + "logps/chosen": -304.59942626953125, + "logps/rejected": -476.5513610839844, + "loss": 0.3337, + "rewards/accuracies": 0.887499988079071, + "rewards/chosen": -1.3155182600021362, + "rewards/margins": 1.556190013885498, + "rewards/rejected": -2.871708393096924, "step": 420 }, { "epoch": 0.11, - "grad_norm": 3.75, - "learning_rate": 4.997694702533016e-06, - "logits/chosen": -2.59728741645813, - "logits/rejected": -2.569671154022217, - "logps/chosen": -307.2120361328125, - "logps/rejected": -310.6186828613281, - "loss": 0.5874, - "rewards/accuracies": 0.699999988079071, - "rewards/chosen": -0.08464425802230835, - "rewards/margins": 0.27853769063949585, - "rewards/rejected": -0.3631819784641266, + "grad_norm": 5.53125, + "learning_rate": 4.9967088051729154e-06, + "logits/chosen": -2.4595181941986084, + "logits/rejected": -2.2439799308776855, + "logps/chosen": -356.4442443847656, + "logps/rejected": -502.91497802734375, + "loss": 0.3412, + "rewards/accuracies": 0.862500011920929, + "rewards/chosen": -1.5988712310791016, + "rewards/margins": 1.4717421531677246, + "rewards/rejected": -3.070613384246826, "step": 430 }, { "epoch": 0.12, - "grad_norm": 3.5625, - "learning_rate": 4.996609610933713e-06, - "logits/chosen": -2.708739757537842, - "logits/rejected": -2.613562822341919, - "logps/chosen": -300.5826721191406, - "logps/rejected": -295.70538330078125, - "loss": 0.6024, - "rewards/accuracies": 0.6875, - "rewards/chosen": -0.11334341764450073, - "rewards/margins": 0.25627994537353516, - "rewards/rejected": -0.3696233332157135, + "grad_norm": 8.25, + "learning_rate": 4.995403607420644e-06, + "logits/chosen": -2.4543726444244385, + "logits/rejected": -2.2793850898742676, + "logps/chosen": -318.9652404785156, + "logps/rejected": -472.81903076171875, + "loss": 0.3197, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.3662667274475098, + "rewards/margins": 1.52040696144104, + "rewards/rejected": -2.8866734504699707, "step": 440 }, { "epoch": 0.12, - "grad_norm": 5.125, - "learning_rate": 4.995316053150366e-06, - "logits/chosen": -2.5639803409576416, - "logits/rejected": -2.3998026847839355, - "logps/chosen": -303.77288818359375, - "logps/rejected": -300.2038269042969, - "loss": 0.5949, - "rewards/accuracies": 0.6937500238418579, - "rewards/chosen": -0.10397627204656601, - "rewards/margins": 0.2747548520565033, - "rewards/rejected": -0.3787311017513275, + "grad_norm": 3.609375, + "learning_rate": 4.993881163982721e-06, + "logits/chosen": -2.369863271713257, + "logits/rejected": -2.2286698818206787, + "logps/chosen": -356.9123840332031, + "logps/rejected": -509.55908203125, + "loss": 0.3459, + "rewards/accuracies": 0.887499988079071, + "rewards/chosen": -1.6083195209503174, + "rewards/margins": 1.4843461513519287, + "rewards/rejected": -3.092665672302246, "step": 450 }, { "epoch": 0.12, - "grad_norm": 5.90625, - "learning_rate": 4.9938141371946815e-06, - "logits/chosen": -2.6004934310913086, - "logits/rejected": -2.5245566368103027, - "logps/chosen": -318.1405944824219, - "logps/rejected": -328.95794677734375, - "loss": 0.6037, - "rewards/accuracies": 0.7124999761581421, - "rewards/chosen": -0.3281031847000122, - "rewards/margins": 0.25530144572257996, - "rewards/rejected": -0.5834046006202698, + "grad_norm": 13.4375, + "learning_rate": 4.992141607400541e-06, + "logits/chosen": -2.3464982509613037, + "logits/rejected": -2.1806864738464355, + "logps/chosen": -380.472900390625, + "logps/rejected": -534.5485229492188, + "loss": 0.355, + "rewards/accuracies": 0.8687499761581421, + "rewards/chosen": -1.8383054733276367, + "rewards/margins": 1.5852895975112915, + "rewards/rejected": -3.4235949516296387, "step": 460 }, { - "epoch": 0.12, - "grad_norm": 2.8125, - "learning_rate": 4.992103988476206e-06, - "logits/chosen": -2.627959966659546, - "logits/rejected": -2.482374429702759, - "logps/chosen": -286.36962890625, - "logps/rejected": -298.1661071777344, - "loss": 0.597, - "rewards/accuracies": 0.71875, - "rewards/chosen": -0.23781995475292206, - "rewards/margins": 0.27273446321487427, - "rewards/rejected": -0.5105544328689575, + "epoch": 0.13, + "grad_norm": 9.3125, + "learning_rate": 4.990185089117005e-06, + "logits/chosen": -2.3217694759368896, + "logits/rejected": -2.139650821685791, + "logps/chosen": -352.61322021484375, + "logps/rejected": -521.1868896484375, + "loss": 0.315, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -1.4914997816085815, + "rewards/margins": 1.6338955163955688, + "rewards/rejected": -3.1253952980041504, "step": 470 }, { "epoch": 0.13, - "grad_norm": 3.0625, - "learning_rate": 4.990185749791866e-06, - "logits/chosen": -2.6001830101013184, - "logits/rejected": -2.4575042724609375, - "logps/chosen": -306.5142822265625, - "logps/rejected": -333.7895202636719, - "loss": 0.5985, - "rewards/accuracies": 0.625, - "rewards/chosen": -0.2919207811355591, - "rewards/margins": 0.2878710925579071, - "rewards/rejected": -0.5797919034957886, + "grad_norm": 6.28125, + "learning_rate": 4.988011779463336e-06, + "logits/chosen": -2.3029465675354004, + "logits/rejected": -2.1867263317108154, + "logps/chosen": -386.57373046875, + "logps/rejected": -545.6253662109375, + "loss": 0.3282, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.977741003036499, + "rewards/margins": 1.5777519941329956, + "rewards/rejected": -3.555493116378784, "step": 480 }, { "epoch": 0.13, - "grad_norm": 4.75, - "learning_rate": 4.9880595813140395e-06, - "logits/chosen": -2.666719913482666, - "logits/rejected": -2.5408244132995605, - "logps/chosen": -349.4113464355469, - "logps/rejected": -340.92974853515625, - "loss": 0.5848, - "rewards/accuracies": 0.7250000238418579, - "rewards/chosen": -0.3812797963619232, - "rewards/margins": 0.3167892098426819, - "rewards/rejected": -0.6980689764022827, + "grad_norm": 4.375, + "learning_rate": 4.98562186764426e-06, + "logits/chosen": -2.2591264247894287, + "logits/rejected": -2.1315784454345703, + "logps/chosen": -339.3611145019531, + "logps/rejected": -513.1484375, + "loss": 0.297, + "rewards/accuracies": 0.893750011920929, + "rewards/chosen": -1.6218554973602295, + "rewards/margins": 1.7187793254852295, + "rewards/rejected": -3.340634822845459, "step": 490 }, { "epoch": 0.13, - "grad_norm": 2.703125, - "learning_rate": 4.985725660577184e-06, - "logits/chosen": -2.6188435554504395, - "logits/rejected": -2.534493923187256, - "logps/chosen": -328.07855224609375, - "logps/rejected": -317.0086975097656, - "loss": 0.5918, - "rewards/accuracies": 0.6812499761581421, - "rewards/chosen": -0.34686478972435, - "rewards/margins": 0.3155154585838318, - "rewards/rejected": -0.6623803377151489, + "grad_norm": 5.25, + "learning_rate": 4.983015561721522e-06, + "logits/chosen": -2.2526965141296387, + "logits/rejected": -2.069826602935791, + "logps/chosen": -320.3377990722656, + "logps/rejected": -532.88623046875, + "loss": 0.2403, + "rewards/accuracies": 0.956250011920929, + "rewards/chosen": -1.4934319257736206, + "rewards/margins": 1.9852195978164673, + "rewards/rejected": -3.478651523590088, "step": 500 }, { "epoch": 0.13, - "eval_logits/chosen": -2.6574668884277344, - "eval_logits/rejected": -2.5298194885253906, - "eval_logps/chosen": -311.40576171875, - "eval_logps/rejected": -317.3648681640625, - "eval_loss": 0.6031413078308105, - "eval_rewards/accuracies": 0.6880000233650208, - "eval_rewards/chosen": -0.2627146244049072, - "eval_rewards/margins": 0.27848491072654724, - "eval_rewards/rejected": -0.5411995649337769, - "eval_runtime": 782.9368, - "eval_samples_per_second": 2.554, - "eval_steps_per_second": 0.319, + "eval_logits/chosen": -2.168368339538574, + "eval_logits/rejected": -2.045610189437866, + "eval_logps/chosen": -513.8699340820312, + "eval_logps/rejected": -537.7081298828125, + "eval_loss": 0.6997026205062866, + "eval_rewards/accuracies": 0.5985000133514404, + "eval_rewards/chosen": -2.2873566150665283, + "eval_rewards/margins": 0.4572749435901642, + "eval_rewards/rejected": -2.74463152885437, + "eval_runtime": 781.5275, + "eval_samples_per_second": 2.559, + "eval_steps_per_second": 0.32, "step": 500 }, { - "epoch": 0.13, - "grad_norm": 4.28125, - "learning_rate": 4.983184182463009e-06, - "logits/chosen": -2.595390796661377, - "logits/rejected": -2.52022385597229, - "logps/chosen": -321.63763427734375, - "logps/rejected": -315.06890869140625, - "loss": 0.5835, - "rewards/accuracies": 0.731249988079071, - "rewards/chosen": -0.24320808053016663, - "rewards/margins": 0.33513620495796204, - "rewards/rejected": -0.5783442854881287, + "epoch": 0.14, + "grad_norm": 6.8125, + "learning_rate": 4.980193088595777e-06, + "logits/chosen": -2.1158764362335205, + "logits/rejected": -2.001831531524658, + "logps/chosen": -453.5465393066406, + "logps/rejected": -606.9456787109375, + "loss": 0.3288, + "rewards/accuracies": 0.8812500238418579, + "rewards/chosen": -2.537163257598877, + "rewards/margins": 1.5891704559326172, + "rewards/rejected": -4.126333713531494, "step": 510 }, { "epoch": 0.14, - "grad_norm": 4.78125, - "learning_rate": 4.980435359184203e-06, - "logits/chosen": -2.625418186187744, - "logits/rejected": -2.5462870597839355, - "logps/chosen": -318.16717529296875, - "logps/rejected": -326.4075622558594, - "loss": 0.6142, - "rewards/accuracies": 0.699999988079071, - "rewards/chosen": -0.2785532474517822, - "rewards/margins": 0.2635067105293274, - "rewards/rejected": -0.5420600175857544, + "grad_norm": 3.640625, + "learning_rate": 4.977154693986841e-06, + "logits/chosen": -2.253612518310547, + "logits/rejected": -2.078402042388916, + "logps/chosen": -348.5957946777344, + "logps/rejected": -533.9927978515625, + "loss": 0.2757, + "rewards/accuracies": 0.9125000238418579, + "rewards/chosen": -1.4598500728607178, + "rewards/margins": 1.7905546426773071, + "rewards/rejected": -3.2504043579101562, "step": 520 }, { "epoch": 0.14, - "grad_norm": 3.78125, - "learning_rate": 4.9774794202667236e-06, - "logits/chosen": -2.6040523052215576, - "logits/rejected": -2.4890506267547607, - "logps/chosen": -340.12603759765625, - "logps/rejected": -378.34368896484375, - "loss": 0.5922, - "rewards/accuracies": 0.6499999761581421, - "rewards/chosen": -0.572872519493103, - "rewards/margins": 0.31255173683166504, - "rewards/rejected": -0.8854243159294128, + "grad_norm": 3.90625, + "learning_rate": 4.97390064241229e-06, + "logits/chosen": -2.1055116653442383, + "logits/rejected": -1.9794048070907593, + "logps/chosen": -391.5548095703125, + "logps/rejected": -594.6146240234375, + "loss": 0.2911, + "rewards/accuracies": 0.90625, + "rewards/chosen": -2.0448694229125977, + "rewards/margins": 1.8874633312225342, + "rewards/rejected": -3.9323325157165527, "step": 530 }, { "epoch": 0.14, - "grad_norm": 2.984375, - "learning_rate": 4.974316612530615e-06, - "logits/chosen": -2.5228214263916016, - "logits/rejected": -2.3871700763702393, - "logps/chosen": -377.46466064453125, - "logps/rejected": -362.720458984375, - "loss": 0.5955, - "rewards/accuracies": 0.7437499761581421, - "rewards/chosen": -0.757167637348175, - "rewards/margins": 0.2707655131816864, - "rewards/rejected": -1.027933120727539, + "grad_norm": 5.96875, + "learning_rate": 4.970431217164442e-06, + "logits/chosen": -2.1032092571258545, + "logits/rejected": -1.9064871072769165, + "logps/chosen": -426.44989013671875, + "logps/rejected": -619.9075317382812, + "loss": 0.2672, + "rewards/accuracies": 0.90625, + "rewards/chosen": -2.313416004180908, + "rewards/margins": 1.9491608142852783, + "rewards/rejected": -4.262576580047607, "step": 540 }, { - "epoch": 0.14, + "epoch": 0.15, "grad_norm": 4.71875, - "learning_rate": 4.970947200069416e-06, - "logits/chosen": -2.5291991233825684, - "logits/rejected": -2.488736152648926, - "logps/chosen": -376.4600524902344, - "logps/rejected": -375.04119873046875, - "loss": 0.6232, - "rewards/accuracies": 0.675000011920929, - "rewards/chosen": -0.7454890608787537, - "rewards/margins": 0.21865490078926086, - "rewards/rejected": -0.9641439318656921, + "learning_rate": 4.966746720285684e-06, + "logits/chosen": -2.1194474697113037, + "logits/rejected": -1.8741849660873413, + "logps/chosen": -383.41705322265625, + "logps/rejected": -557.76904296875, + "loss": 0.3418, + "rewards/accuracies": 0.856249988079071, + "rewards/chosen": -1.8038151264190674, + "rewards/margins": 1.8368213176727295, + "rewards/rejected": -3.640636444091797, "step": 550 }, { "epoch": 0.15, - "grad_norm": 3.34375, - "learning_rate": 4.967371464228096e-06, - "logits/chosen": -2.659289598464966, - "logits/rejected": -2.6029088497161865, - "logps/chosen": -328.5123291015625, - "logps/rejected": -361.32269287109375, - "loss": 0.5842, - "rewards/accuracies": 0.7562500238418579, - "rewards/chosen": -0.5446836948394775, - "rewards/margins": 0.3293144106864929, - "rewards/rejected": -0.8739980459213257, + "grad_norm": 6.46875, + "learning_rate": 4.9628474725421845e-06, + "logits/chosen": -2.0428099632263184, + "logits/rejected": -1.881706953048706, + "logps/chosen": -403.2717590332031, + "logps/rejected": -580.6181030273438, + "loss": 0.3346, + "rewards/accuracies": 0.8812500238418579, + "rewards/chosen": -2.0810344219207764, + "rewards/margins": 1.7879607677459717, + "rewards/rejected": -3.868995189666748, "step": 560 }, { "epoch": 0.15, - "grad_norm": 4.125, - "learning_rate": 4.963589703579569e-06, - "logits/chosen": -2.742898941040039, - "logits/rejected": -2.624717950820923, - "logps/chosen": -358.6355895996094, - "logps/rejected": -352.4152526855469, - "loss": 0.5988, - "rewards/accuracies": 0.6625000238418579, - "rewards/chosen": -0.4009895920753479, - "rewards/margins": 0.3117690086364746, - "rewards/rejected": -0.7127586007118225, + "grad_norm": 5.5625, + "learning_rate": 4.958733813395963e-06, + "logits/chosen": -2.0323638916015625, + "logits/rejected": -1.907156229019165, + "logps/chosen": -435.7867126464844, + "logps/rejected": -609.8319091796875, + "loss": 0.3164, + "rewards/accuracies": 0.887499988079071, + "rewards/chosen": -2.3000435829162598, + "rewards/margins": 1.7664451599121094, + "rewards/rejected": -4.066488742828369, "step": 570 }, { "epoch": 0.15, - "grad_norm": 3.90625, - "learning_rate": 4.9596022338997615e-06, - "logits/chosen": -2.770641803741455, - "logits/rejected": -2.534297466278076, - "logps/chosen": -355.1096496582031, - "logps/rejected": -343.6912536621094, - "loss": 0.5891, - "rewards/accuracies": 0.65625, - "rewards/chosen": -0.375056654214859, - "rewards/margins": 0.3282235562801361, - "rewards/rejected": -0.7032800912857056, + "grad_norm": 4.875, + "learning_rate": 4.95440610097534e-06, + "logits/chosen": -2.064138889312744, + "logits/rejected": -1.9411983489990234, + "logps/chosen": -364.3408508300781, + "logps/rejected": -598.0901489257812, + "loss": 0.2586, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -1.8342880010604858, + "rewards/margins": 2.1939587593078613, + "rewards/rejected": -4.028246879577637, "step": 580 }, { - "epoch": 0.15, - "grad_norm": 4.3125, - "learning_rate": 4.955409388141243e-06, - "logits/chosen": -2.6562209129333496, - "logits/rejected": -2.553755044937134, - "logps/chosen": -338.35784912109375, - "logps/rejected": -340.85028076171875, - "loss": 0.608, - "rewards/accuracies": 0.668749988079071, - "rewards/chosen": -0.604152500629425, - "rewards/margins": 0.2910125255584717, - "rewards/rejected": -0.895164966583252, + "epoch": 0.16, + "grad_norm": 5.09375, + "learning_rate": 4.949864712043756e-06, + "logits/chosen": -2.042320728302002, + "logits/rejected": -1.8681459426879883, + "logps/chosen": -400.0122985839844, + "logps/rejected": -617.6143798828125, + "loss": 0.2548, + "rewards/accuracies": 0.893750011920929, + "rewards/chosen": -2.086648464202881, + "rewards/margins": 2.196746349334717, + "rewards/rejected": -4.283394813537598, "step": 590 }, { "epoch": 0.16, - "grad_norm": 3.78125, - "learning_rate": 4.951011516405429e-06, - "logits/chosen": -2.64245343208313, - "logits/rejected": -2.5466177463531494, - "logps/chosen": -334.6476135253906, - "logps/rejected": -347.5552673339844, - "loss": 0.6012, - "rewards/accuracies": 0.7250000238418579, - "rewards/chosen": -0.6437972187995911, - "rewards/margins": 0.31083759665489197, - "rewards/rejected": -0.9546347856521606, + "grad_norm": 9.5625, + "learning_rate": 4.945110041966975e-06, + "logits/chosen": -1.9855597019195557, + "logits/rejected": -1.8242261409759521, + "logps/chosen": -444.31317138671875, + "logps/rejected": -660.8572387695312, + "loss": 0.2586, + "rewards/accuracies": 0.9312499761581421, + "rewards/chosen": -2.530787944793701, + "rewards/margins": 2.156179904937744, + "rewards/rejected": -4.686967372894287, "step": 600 }, { "epoch": 0.16, - "eval_logits/chosen": -2.6737217903137207, - "eval_logits/rejected": -2.5442543029785156, - "eval_logps/chosen": -336.4282531738281, - "eval_logps/rejected": -348.7829284667969, - "eval_loss": 0.5927833914756775, - "eval_rewards/accuracies": 0.6934999823570251, - "eval_rewards/chosen": -0.512939989566803, - "eval_rewards/margins": 0.34244006872177124, - "eval_rewards/rejected": -0.8553800582885742, - "eval_runtime": 783.6117, - "eval_samples_per_second": 2.552, - "eval_steps_per_second": 0.319, + "eval_logits/chosen": -2.0491983890533447, + "eval_logits/rejected": -1.9324215650558472, + "eval_logps/chosen": -524.0146484375, + "eval_logps/rejected": -548.6314697265625, + "eval_loss": 0.7062023878097534, + "eval_rewards/accuracies": 0.5839999914169312, + "eval_rewards/chosen": -2.388803720474243, + "eval_rewards/margins": 0.4650622308254242, + "eval_rewards/rejected": -2.8538658618927, + "eval_runtime": 781.5672, + "eval_samples_per_second": 2.559, + "eval_steps_per_second": 0.32, "step": 600 }, { "epoch": 0.16, - "grad_norm": 3.6875, - "learning_rate": 4.946408985913344e-06, - "logits/chosen": -2.576265811920166, - "logits/rejected": -2.532116413116455, - "logps/chosen": -298.76788330078125, - "logps/rejected": -316.5951232910156, - "loss": 0.5703, - "rewards/accuracies": 0.668749988079071, - "rewards/chosen": -0.31805160641670227, - "rewards/margins": 0.38874995708465576, - "rewards/rejected": -0.7068015933036804, + "grad_norm": 8.125, + "learning_rate": 4.940142504678662e-06, + "logits/chosen": -2.0201022624969482, + "logits/rejected": -1.893396019935608, + "logps/chosen": -410.92913818359375, + "logps/rejected": -636.2786865234375, + "loss": 0.2489, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -2.2299957275390625, + "rewards/margins": 2.2826168537139893, + "rewards/rejected": -4.512612819671631, "step": 610 }, { - "epoch": 0.16, - "grad_norm": 4.21875, - "learning_rate": 4.941602180974958e-06, - "logits/chosen": -2.6575896739959717, - "logits/rejected": -2.4126646518707275, - "logps/chosen": -342.2276306152344, - "logps/rejected": -311.13140869140625, - "loss": 0.5937, - "rewards/accuracies": 0.71875, - "rewards/chosen": -0.34147003293037415, - "rewards/margins": 0.3287685513496399, - "rewards/rejected": -0.6702385544776917, + "epoch": 0.17, + "grad_norm": 7.875, + "learning_rate": 4.934962532644348e-06, + "logits/chosen": -2.003357410430908, + "logits/rejected": -1.861501693725586, + "logps/chosen": -430.82904052734375, + "logps/rejected": -698.9578857421875, + "loss": 0.2217, + "rewards/accuracies": 0.918749988079071, + "rewards/chosen": -2.4477930068969727, + "rewards/margins": 2.599547863006592, + "rewards/rejected": -5.0473408699035645, "step": 620 }, { - "epoch": 0.16, - "grad_norm": 3.53125, - "learning_rate": 4.936591502957101e-06, - "logits/chosen": -2.6380743980407715, - "logits/rejected": -2.4174392223358154, - "logps/chosen": -301.11102294921875, - "logps/rejected": -339.44720458984375, - "loss": 0.5432, - "rewards/accuracies": 0.731249988079071, - "rewards/chosen": -0.3551376461982727, - "rewards/margins": 0.47654080390930176, - "rewards/rejected": -0.8316785097122192, + "epoch": 0.17, + "grad_norm": 12.875, + "learning_rate": 4.929570576823779e-06, + "logits/chosen": -2.044196605682373, + "logits/rejected": -1.859323263168335, + "logps/chosen": -417.61865234375, + "logps/rejected": -642.5675048828125, + "loss": 0.2632, + "rewards/accuracies": 0.893750011920929, + "rewards/chosen": -2.268167018890381, + "rewards/margins": 2.251687526702881, + "rewards/rejected": -4.519854545593262, "step": 630 }, { "epoch": 0.17, - "grad_norm": 4.71875, - "learning_rate": 4.931377370249946e-06, - "logits/chosen": -2.655895233154297, - "logits/rejected": -2.344895362854004, - "logps/chosen": -347.9822082519531, - "logps/rejected": -362.0487060546875, - "loss": 0.5791, - "rewards/accuracies": 0.7250000238418579, - "rewards/chosen": -0.6452215909957886, - "rewards/margins": 0.3807607591152191, - "rewards/rejected": -1.0259822607040405, + "grad_norm": 12.4375, + "learning_rate": 4.923967106631655e-06, + "logits/chosen": -2.0745182037353516, + "logits/rejected": -1.884718894958496, + "logps/chosen": -407.1151428222656, + "logps/rejected": -658.7186889648438, + "loss": 0.24, + "rewards/accuracies": 0.90625, + "rewards/chosen": -2.072746992111206, + "rewards/margins": 2.464381456375122, + "rewards/rejected": -4.537128448486328, "step": 640 }, { "epoch": 0.17, - "grad_norm": 5.1875, - "learning_rate": 4.925960218232073e-06, - "logits/chosen": -2.584510326385498, - "logits/rejected": -2.451813220977783, - "logps/chosen": -321.987548828125, - "logps/rejected": -358.29327392578125, - "loss": 0.5533, - "rewards/accuracies": 0.731249988079071, - "rewards/chosen": -0.4966830313205719, - "rewards/margins": 0.47680535912513733, - "rewards/rejected": -0.973488450050354, + "grad_norm": 13.0, + "learning_rate": 4.918152609896768e-06, + "logits/chosen": -2.031191110610962, + "logits/rejected": -1.8076276779174805, + "logps/chosen": -422.650390625, + "logps/rejected": -628.3702392578125, + "loss": 0.2986, + "rewards/accuracies": 0.862500011920929, + "rewards/chosen": -2.175328493118286, + "rewards/margins": 2.1270930767059326, + "rewards/rejected": -4.3024210929870605, "step": 650 }, { - "epoch": 0.17, - "grad_norm": 7.5, - "learning_rate": 4.920340499234116e-06, - "logits/chosen": -2.5348637104034424, - "logits/rejected": -2.209812641143799, - "logps/chosen": -330.370361328125, - "logps/rejected": -326.264404296875, - "loss": 0.6019, - "rewards/accuracies": 0.6625000238418579, - "rewards/chosen": -0.41889137029647827, - "rewards/margins": 0.35165196657180786, - "rewards/rejected": -0.7705433368682861, + "epoch": 0.18, + "grad_norm": 11.25, + "learning_rate": 4.9121275928195265e-06, + "logits/chosen": -1.9430030584335327, + "logits/rejected": -1.8268959522247314, + "logps/chosen": -450.4566345214844, + "logps/rejected": -635.5111083984375, + "loss": 0.382, + "rewards/accuracies": 0.84375, + "rewards/chosen": -2.5708768367767334, + "rewards/margins": 1.800329566001892, + "rewards/rejected": -4.371206283569336, "step": 660 }, { "epoch": 0.18, - "grad_norm": 4.5, - "learning_rate": 4.914518682500995e-06, - "logits/chosen": -2.7013747692108154, - "logits/rejected": -2.40236759185791, - "logps/chosen": -345.67144775390625, - "logps/rejected": -343.01348876953125, - "loss": 0.5753, - "rewards/accuracies": 0.675000011920929, - "rewards/chosen": -0.4313276410102844, - "rewards/margins": 0.41087159514427185, - "rewards/rejected": -0.8421992063522339, + "grad_norm": 5.4375, + "learning_rate": 4.9058925799278934e-06, + "logits/chosen": -1.9249922037124634, + "logits/rejected": -1.7653077840805054, + "logps/chosen": -440.85693359375, + "logps/rejected": -660.0525512695312, + "loss": 0.2587, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -2.6279947757720947, + "rewards/margins": 2.1633927822113037, + "rewards/rejected": -4.791387557983398, "step": 670 }, { "epoch": 0.18, - "grad_norm": 4.8125, - "learning_rate": 4.9084952541527315e-06, - "logits/chosen": -2.57464337348938, - "logits/rejected": -2.316161632537842, - "logps/chosen": -341.40765380859375, - "logps/rejected": -332.87060546875, - "loss": 0.5596, - "rewards/accuracies": 0.7437499761581421, - "rewards/chosen": -0.48329535126686096, - "rewards/margins": 0.43051138520240784, - "rewards/rejected": -0.9138067960739136, + "grad_norm": 9.9375, + "learning_rate": 4.899448114031714e-06, + "logits/chosen": -1.9734159708023071, + "logits/rejected": -1.7737067937850952, + "logps/chosen": -416.80517578125, + "logps/rejected": -651.2550659179688, + "loss": 0.2519, + "rewards/accuracies": 0.893750011920929, + "rewards/chosen": -2.3417036533355713, + "rewards/margins": 2.3331680297851562, + "rewards/rejected": -4.674871921539307, "step": 680 }, { "epoch": 0.18, - "grad_norm": 3.71875, - "learning_rate": 4.902270717143858e-06, - "logits/chosen": -2.496086597442627, - "logits/rejected": -2.3636794090270996, - "logps/chosen": -304.6783447265625, - "logps/rejected": -378.12548828125, - "loss": 0.4891, - "rewards/accuracies": 0.793749988079071, - "rewards/chosen": -0.46743354201316833, - "rewards/margins": 0.6628168225288391, - "rewards/rejected": -1.1302502155303955, + "grad_norm": 3.984375, + "learning_rate": 4.892794756175467e-06, + "logits/chosen": -1.8331539630889893, + "logits/rejected": -1.6363424062728882, + "logps/chosen": -449.67218017578125, + "logps/rejected": -700.6417236328125, + "loss": 0.2215, + "rewards/accuracies": 0.893750011920929, + "rewards/chosen": -2.48677134513855, + "rewards/margins": 2.510742664337158, + "rewards/rejected": -4.997514247894287, "step": 690 }, { - "epoch": 0.18, - "grad_norm": 8.125, - "learning_rate": 4.895845591221427e-06, - "logits/chosen": -2.4748313426971436, - "logits/rejected": -2.386183261871338, - "logps/chosen": -326.8917541503906, - "logps/rejected": -360.40423583984375, - "loss": 0.5823, - "rewards/accuracies": 0.706250011920929, - "rewards/chosen": -0.5521180033683777, - "rewards/margins": 0.3961629271507263, - "rewards/rejected": -0.9482809901237488, + "epoch": 0.19, + "grad_norm": 10.3125, + "learning_rate": 4.885933085589416e-06, + "logits/chosen": -1.9560575485229492, + "logits/rejected": -1.7940680980682373, + "logps/chosen": -468.181640625, + "logps/rejected": -718.8138427734375, + "loss": 0.2338, + "rewards/accuracies": 0.9312499761581421, + "rewards/chosen": -2.665666103363037, + "rewards/margins": 2.501105546951294, + "rewards/rejected": -5.166770935058594, "step": 700 }, { - "epoch": 0.18, - "eval_logits/chosen": -2.466245174407959, - "eval_logits/rejected": -2.344569444656372, - "eval_logps/chosen": -342.8824768066406, - "eval_logps/rejected": -365.3114929199219, - "eval_loss": 0.5811374187469482, - "eval_rewards/accuracies": 0.699999988079071, - "eval_rewards/chosen": -0.5774820446968079, - "eval_rewards/margins": 0.44318366050720215, - "eval_rewards/rejected": -1.0206657648086548, - "eval_runtime": 783.0972, - "eval_samples_per_second": 2.554, - "eval_steps_per_second": 0.319, + "epoch": 0.19, + "eval_logits/chosen": -2.0428850650787354, + "eval_logits/rejected": -1.9258164167404175, + "eval_logps/chosen": -527.9253540039062, + "eval_logps/rejected": -551.26611328125, + "eval_loss": 0.707597553730011, + "eval_rewards/accuracies": 0.5864999890327454, + "eval_rewards/chosen": -2.4279117584228516, + "eval_rewards/margins": 0.45229974389076233, + "eval_rewards/rejected": -2.880211353302002, + "eval_runtime": 781.2446, + "eval_samples_per_second": 2.56, + "eval_steps_per_second": 0.32, "step": 700 }, { "epoch": 0.19, - "grad_norm": 7.875, - "learning_rate": 4.8892204128816e-06, - "logits/chosen": -2.4716761112213135, - "logits/rejected": -2.3479976654052734, - "logps/chosen": -335.98077392578125, - "logps/rejected": -359.7213439941406, - "loss": 0.5862, - "rewards/accuracies": 0.65625, - "rewards/chosen": -0.5169531106948853, - "rewards/margins": 0.3903142213821411, - "rewards/rejected": -0.9072673916816711, + "grad_norm": 10.1875, + "learning_rate": 4.878863699639183e-06, + "logits/chosen": -2.0451254844665527, + "logits/rejected": -1.7745431661605835, + "logps/chosen": -405.2158508300781, + "logps/rejected": -666.8660888671875, + "loss": 0.2089, + "rewards/accuracies": 0.9437500238418579, + "rewards/chosen": -2.268383502960205, + "rewards/margins": 2.578892230987549, + "rewards/rejected": -4.847275733947754, "step": 710 }, { "epoch": 0.19, - "grad_norm": 4.1875, - "learning_rate": 4.882395735324864e-06, - "logits/chosen": -2.4327664375305176, - "logits/rejected": -2.3049471378326416, - "logps/chosen": -334.0990295410156, - "logps/rejected": -373.6146240234375, - "loss": 0.5449, - "rewards/accuracies": 0.75, - "rewards/chosen": -0.5033348798751831, - "rewards/margins": 0.5430740714073181, - "rewards/rejected": -1.046409010887146, + "grad_norm": 7.875, + "learning_rate": 4.871587213773745e-06, + "logits/chosen": -1.8383365869522095, + "logits/rejected": -1.6933542490005493, + "logps/chosen": -471.1405334472656, + "logps/rejected": -719.7000122070312, + "loss": 0.254, + "rewards/accuracies": 0.9125000238418579, + "rewards/chosen": -3.016746997833252, + "rewards/margins": 2.3775877952575684, + "rewards/rejected": -5.39433479309082, "step": 720 }, { - "epoch": 0.19, - "grad_norm": 4.34375, - "learning_rate": 4.87537212840983e-06, - "logits/chosen": -2.3654582500457764, - "logits/rejected": -2.2108511924743652, - "logps/chosen": -341.95501708984375, - "logps/rejected": -352.4615173339844, - "loss": 0.5897, - "rewards/accuracies": 0.7250000238418579, - "rewards/chosen": -0.555401623249054, - "rewards/margins": 0.4416992664337158, - "rewards/rejected": -0.9971009492874146, + "epoch": 0.2, + "grad_norm": 8.1875, + "learning_rate": 4.864104261471856e-06, + "logits/chosen": -2.0093419551849365, + "logits/rejected": -1.8601839542388916, + "logps/chosen": -440.412353515625, + "logps/rejected": -686.5137939453125, + "loss": 0.2582, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -2.4893579483032227, + "rewards/margins": 2.3431363105773926, + "rewards/rejected": -4.832494258880615, "step": 730 }, { - "epoch": 0.19, - "grad_norm": 4.5625, - "learning_rate": 4.8681501786056545e-06, - "logits/chosen": -2.3230032920837402, - "logits/rejected": -2.170013427734375, - "logps/chosen": -294.665771484375, - "logps/rejected": -311.9244384765625, - "loss": 0.5406, - "rewards/accuracies": 0.7562500238418579, - "rewards/chosen": -0.507541835308075, - "rewards/margins": 0.5050578713417053, - "rewards/rejected": -1.0125997066497803, + "epoch": 0.2, + "grad_norm": 5.09375, + "learning_rate": 4.856415494186888e-06, + "logits/chosen": -1.8051137924194336, + "logits/rejected": -1.6217536926269531, + "logps/chosen": -465.5696716308594, + "logps/rejected": -726.9296264648438, + "loss": 0.2112, + "rewards/accuracies": 0.9437500238418579, + "rewards/chosen": -2.7774364948272705, + "rewards/margins": 2.651355266571045, + "rewards/rejected": -5.4287919998168945, "step": 740 }, { "epoch": 0.2, - "grad_norm": 6.59375, - "learning_rate": 4.860730488943068e-06, - "logits/chosen": -2.2937541007995605, - "logits/rejected": -2.311389684677124, - "logps/chosen": -292.8400573730469, - "logps/rejected": -333.968505859375, - "loss": 0.5574, - "rewards/accuracies": 0.71875, - "rewards/chosen": -0.39045557379722595, - "rewards/margins": 0.4602751135826111, - "rewards/rejected": -0.8507307171821594, + "grad_norm": 2.140625, + "learning_rate": 4.848521581290129e-06, + "logits/chosen": -1.7539660930633545, + "logits/rejected": -1.5488591194152832, + "logps/chosen": -537.1699829101562, + "logps/rejected": -836.3923950195312, + "loss": 0.1996, + "rewards/accuracies": 0.9312499761581421, + "rewards/chosen": -3.470181941986084, + "rewards/margins": 2.852477550506592, + "rewards/rejected": -6.322659492492676, "step": 750 }, { "epoch": 0.2, - "grad_norm": 5.1875, - "learning_rate": 4.853113678964022e-06, - "logits/chosen": -2.3435096740722656, - "logits/rejected": -2.282520294189453, - "logps/chosen": -334.9354248046875, - "logps/rejected": -366.12799072265625, - "loss": 0.5493, - "rewards/accuracies": 0.7562500238418579, - "rewards/chosen": -0.3748478293418884, - "rewards/margins": 0.4812418520450592, - "rewards/rejected": -0.85608971118927, + "grad_norm": 6.1875, + "learning_rate": 4.840423210012499e-06, + "logits/chosen": -1.7398707866668701, + "logits/rejected": -1.5924098491668701, + "logps/chosen": -524.1094970703125, + "logps/rejected": -829.4354248046875, + "loss": 0.1781, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": -3.3822684288024902, + "rewards/margins": 3.0030548572540283, + "rewards/rejected": -6.385323524475098, "step": 760 }, { - "epoch": 0.2, - "grad_norm": 5.5625, - "learning_rate": 4.845300384669958e-06, - "logits/chosen": -2.43422269821167, - "logits/rejected": -2.2817306518554688, - "logps/chosen": -327.2389221191406, - "logps/rejected": -339.32135009765625, - "loss": 0.5993, - "rewards/accuracies": 0.675000011920929, - "rewards/chosen": -0.542651891708374, - "rewards/margins": 0.3658657670021057, - "rewards/rejected": -0.9085175395011902, + "epoch": 0.21, + "grad_norm": 5.875, + "learning_rate": 4.832121085384726e-06, + "logits/chosen": -1.686767339706421, + "logits/rejected": -1.5091755390167236, + "logps/chosen": -591.5139770507812, + "logps/rejected": -932.99365234375, + "loss": 0.2262, + "rewards/accuracies": 0.90625, + "rewards/chosen": -4.011205196380615, + "rewards/margins": 3.2888877391815186, + "rewards/rejected": -7.300093650817871, "step": 770 }, { - "epoch": 0.2, - "grad_norm": 4.125, - "learning_rate": 4.837291258468701e-06, - "logits/chosen": -2.4618449211120605, - "logits/rejected": -2.3151028156280518, - "logps/chosen": -359.65631103515625, - "logps/rejected": -383.3794860839844, - "loss": 0.5576, - "rewards/accuracies": 0.6812499761581421, - "rewards/chosen": -0.5611785650253296, - "rewards/margins": 0.4969305098056793, - "rewards/rejected": -1.0581090450286865, + "epoch": 0.21, + "grad_norm": 15.3125, + "learning_rate": 4.823615930175965e-06, + "logits/chosen": -1.840598702430725, + "logits/rejected": -1.5934467315673828, + "logps/chosen": -480.89990234375, + "logps/rejected": -787.3369140625, + "loss": 0.1987, + "rewards/accuracies": 0.9375, + "rewards/chosen": -3.085526943206787, + "rewards/margins": 3.0567963123321533, + "rewards/rejected": -6.1423234939575195, "step": 780 }, { "epoch": 0.21, - "grad_norm": 8.3125, - "learning_rate": 4.829086969119984e-06, - "logits/chosen": -2.364696502685547, - "logits/rejected": -2.3082656860351562, - "logps/chosen": -346.21722412109375, - "logps/rejected": -385.6118469238281, - "loss": 0.5844, - "rewards/accuracies": 0.612500011920929, - "rewards/chosen": -0.6969791650772095, - "rewards/margins": 0.4597656726837158, - "rewards/rejected": -1.1567448377609253, + "grad_norm": 8.25, + "learning_rate": 4.814908484830876e-06, + "logits/chosen": -1.8566583395004272, + "logits/rejected": -1.7526509761810303, + "logps/chosen": -491.6114196777344, + "logps/rejected": -752.9496459960938, + "loss": 0.3075, + "rewards/accuracies": 0.84375, + "rewards/chosen": -2.904597520828247, + "rewards/margins": 2.5675301551818848, + "rewards/rejected": -5.4721269607543945, "step": 790 }, { "epoch": 0.21, - "grad_norm": 8.75, - "learning_rate": 4.820688201679605e-06, - "logits/chosen": -2.4862303733825684, - "logits/rejected": -2.1764872074127197, - "logps/chosen": -336.849365234375, - "logps/rejected": -319.41827392578125, - "loss": 0.5502, - "rewards/accuracies": 0.71875, - "rewards/chosen": -0.5689047574996948, - "rewards/margins": 0.4903965890407562, - "rewards/rejected": -1.0593013763427734, + "grad_norm": 9.375, + "learning_rate": 4.80599950740516e-06, + "logits/chosen": -1.920092225074768, + "logits/rejected": -1.7277867794036865, + "logps/chosen": -479.88238525390625, + "logps/rejected": -748.7023315429688, + "loss": 0.2163, + "rewards/accuracies": 0.925000011920929, + "rewards/chosen": -2.6948907375335693, + "rewards/margins": 2.6764466762542725, + "rewards/rejected": -5.371337413787842, "step": 800 }, { "epoch": 0.21, - "eval_logits/chosen": -2.439472198486328, - "eval_logits/rejected": -2.317272901535034, - "eval_logps/chosen": -342.2333984375, - "eval_logps/rejected": -366.5324401855469, - "eval_loss": 0.5688381195068359, - "eval_rewards/accuracies": 0.7039999961853027, - "eval_rewards/chosen": -0.5709910988807678, - "eval_rewards/margins": 0.4618839621543884, - "eval_rewards/rejected": -1.0328751802444458, - "eval_runtime": 783.1906, - "eval_samples_per_second": 2.554, - "eval_steps_per_second": 0.319, + "eval_logits/chosen": -1.9782260656356812, + "eval_logits/rejected": -1.8626127243041992, + "eval_logps/chosen": -610.1845092773438, + "eval_logps/rejected": -641.7468872070312, + "eval_loss": 0.7138827443122864, + "eval_rewards/accuracies": 0.6029999852180481, + "eval_rewards/chosen": -3.250502109527588, + "eval_rewards/margins": 0.5345167517662048, + "eval_rewards/rejected": -3.785019636154175, + "eval_runtime": 781.4822, + "eval_samples_per_second": 2.559, + "eval_steps_per_second": 0.32, "step": 800 }, { - "epoch": 0.21, - "grad_norm": 5.5625, - "learning_rate": 4.8120956574422315e-06, - "logits/chosen": -2.46923828125, - "logits/rejected": -2.4094414710998535, - "logps/chosen": -350.98675537109375, - "logps/rejected": -378.17626953125, - "loss": 0.6132, - "rewards/accuracies": 0.65625, - "rewards/chosen": -0.5874276161193848, - "rewards/margins": 0.35395190119743347, - "rewards/rejected": -0.9413794279098511, + "epoch": 0.22, + "grad_norm": 5.75, + "learning_rate": 4.796889773499569e-06, + "logits/chosen": -1.8404449224472046, + "logits/rejected": -1.6686649322509766, + "logps/chosen": -595.162109375, + "logps/rejected": -937.796875, + "loss": 0.2058, + "rewards/accuracies": 0.9375, + "rewards/chosen": -4.065727710723877, + "rewards/margins": 3.215528964996338, + "rewards/rejected": -7.281256198883057, "step": 810 }, { - "epoch": 0.21, - "grad_norm": 5.09375, - "learning_rate": 4.803310053882831e-06, - "logits/chosen": -2.4236135482788086, - "logits/rejected": -2.460178852081299, - "logps/chosen": -293.0755920410156, - "logps/rejected": -345.6906433105469, - "loss": 0.5662, - "rewards/accuracies": 0.71875, - "rewards/chosen": -0.4208901822566986, - "rewards/margins": 0.42452698945999146, - "rewards/rejected": -0.8454172015190125, + "epoch": 0.22, + "grad_norm": 8.6875, + "learning_rate": 4.787580076192377e-06, + "logits/chosen": -1.7283341884613037, + "logits/rejected": -1.5389282703399658, + "logps/chosen": -565.5133056640625, + "logps/rejected": -909.4611206054688, + "loss": 0.1806, + "rewards/accuracies": 0.9375, + "rewards/chosen": -3.863267421722412, + "rewards/margins": 3.289839506149292, + "rewards/rejected": -7.153106689453125, "step": 820 }, { "epoch": 0.22, - "grad_norm": 5.25, - "learning_rate": 4.794332124596775e-06, - "logits/chosen": -2.4821386337280273, - "logits/rejected": -2.3172764778137207, - "logps/chosen": -328.24755859375, - "logps/rejected": -359.9569091796875, - "loss": 0.5966, - "rewards/accuracies": 0.6625000238418579, - "rewards/chosen": -0.400545209646225, - "rewards/margins": 0.38425248861312866, - "rewards/rejected": -0.7847977876663208, + "grad_norm": 4.65625, + "learning_rate": 4.77807122597034e-06, + "logits/chosen": -1.8481029272079468, + "logits/rejected": -1.607322096824646, + "logps/chosen": -589.0106811523438, + "logps/rejected": -942.8314208984375, + "loss": 0.2034, + "rewards/accuracies": 0.9375, + "rewards/chosen": -3.9117865562438965, + "rewards/margins": 3.432347536087036, + "rewards/rejected": -7.3441338539123535, "step": 830 }, { "epoch": 0.22, - "grad_norm": 6.5625, - "learning_rate": 4.785162619238575e-06, - "logits/chosen": -2.45249080657959, - "logits/rejected": -2.2496962547302246, - "logps/chosen": -310.9997253417969, - "logps/rejected": -332.2084045410156, - "loss": 0.5478, - "rewards/accuracies": 0.71875, - "rewards/chosen": -0.3862709403038025, - "rewards/margins": 0.48837798833847046, - "rewards/rejected": -0.8746488690376282, + "grad_norm": 3.46875, + "learning_rate": 4.768364050658135e-06, + "logits/chosen": -1.9006433486938477, + "logits/rejected": -1.6561591625213623, + "logps/chosen": -529.4610595703125, + "logps/rejected": -857.2683715820312, + "loss": 0.1928, + "rewards/accuracies": 0.9437500238418579, + "rewards/chosen": -3.359536647796631, + "rewards/margins": 3.191103458404541, + "rewards/rejected": -6.550640106201172, "step": 840 }, { - "epoch": 0.22, - "grad_norm": 4.84375, - "learning_rate": 4.775802303459288e-06, - "logits/chosen": -2.3312971591949463, - "logits/rejected": -2.241899013519287, - "logps/chosen": -320.90618896484375, - "logps/rejected": -363.725830078125, - "loss": 0.5602, - "rewards/accuracies": 0.7124999761581421, - "rewards/chosen": -0.5471154451370239, - "rewards/margins": 0.4693846106529236, - "rewards/rejected": -1.0165001153945923, + "epoch": 0.23, + "grad_norm": 6.3125, + "learning_rate": 4.758459395346292e-06, + "logits/chosen": -1.8754297494888306, + "logits/rejected": -1.6810500621795654, + "logps/chosen": -516.5537109375, + "logps/rejected": -870.4509887695312, + "loss": 0.2042, + "rewards/accuracies": 0.90625, + "rewards/chosen": -3.3872780799865723, + "rewards/margins": 3.3747024536132812, + "rewards/rejected": -6.761981010437012, "step": 850 }, { "epoch": 0.23, - "grad_norm": 7.90625, - "learning_rate": 4.766251958842589e-06, - "logits/chosen": -2.3646304607391357, - "logits/rejected": -2.184277057647705, - "logps/chosen": -373.2308654785156, - "logps/rejected": -394.0884704589844, - "loss": 0.5985, - "rewards/accuracies": 0.637499988079071, - "rewards/chosen": -0.7825444340705872, - "rewards/margins": 0.35992631316185, - "rewards/rejected": -1.1424707174301147, + "grad_norm": 7.84375, + "learning_rate": 4.748358122317621e-06, + "logits/chosen": -1.7386939525604248, + "logits/rejected": -1.560772180557251, + "logps/chosen": -666.9801025390625, + "logps/rejected": -998.2257080078125, + "loss": 0.2537, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -4.832745552062988, + "rewards/margins": 3.112260341644287, + "rewards/rejected": -7.945005893707275, "step": 860 }, { "epoch": 0.23, - "grad_norm": 4.03125, - "learning_rate": 4.7565123828395066e-06, - "logits/chosen": -2.292964458465576, - "logits/rejected": -2.258443593978882, - "logps/chosen": -369.5010070800781, - "logps/rejected": -415.40069580078125, - "loss": 0.5605, - "rewards/accuracies": 0.731249988079071, - "rewards/chosen": -0.9762517809867859, - "rewards/margins": 0.4485411047935486, - "rewards/rejected": -1.424793004989624, + "grad_norm": 16.375, + "learning_rate": 4.738061110972143e-06, + "logits/chosen": -1.8336429595947266, + "logits/rejected": -1.627294898033142, + "logps/chosen": -668.6233520507812, + "logps/rejected": -1004.8411865234375, + "loss": 0.2447, + "rewards/accuracies": 0.9125000238418579, + "rewards/chosen": -4.7506890296936035, + "rewards/margins": 3.425739288330078, + "rewards/rejected": -8.176427841186523, "step": 870 }, { - "epoch": 0.23, - "grad_norm": 3.828125, - "learning_rate": 4.746584388701831e-06, - "logits/chosen": -2.381927967071533, - "logits/rejected": -2.2788944244384766, - "logps/chosen": -373.2781677246094, - "logps/rejected": -398.8980407714844, - "loss": 0.5913, - "rewards/accuracies": 0.7124999761581421, - "rewards/chosen": -0.9109410047531128, - "rewards/margins": 0.4138285219669342, - "rewards/rejected": -1.3247694969177246, + "epoch": 0.24, + "grad_norm": 11.875, + "learning_rate": 4.727569257750531e-06, + "logits/chosen": -1.9241821765899658, + "logits/rejected": -1.7090564966201782, + "logps/chosen": -580.5578002929688, + "logps/rejected": -904.0485229492188, + "loss": 0.2519, + "rewards/accuracies": 0.8812500238418579, + "rewards/chosen": -3.875545024871826, + "rewards/margins": 3.183382511138916, + "rewards/rejected": -7.0589280128479, "step": 880 }, { - "epoch": 0.23, - "grad_norm": 6.34375, - "learning_rate": 4.736468805414218e-06, - "logits/chosen": -2.2713119983673096, - "logits/rejected": -2.2479841709136963, - "logps/chosen": -331.4849548339844, - "logps/rejected": -392.24859619140625, - "loss": 0.5605, - "rewards/accuracies": 0.6875, - "rewards/chosen": -0.6100383996963501, - "rewards/margins": 0.5197950601577759, - "rewards/rejected": -1.129833459854126, + "epoch": 0.24, + "grad_norm": 6.40625, + "learning_rate": 4.71688347605607e-06, + "logits/chosen": -2.0419352054595947, + "logits/rejected": -1.7710177898406982, + "logps/chosen": -566.7761840820312, + "logps/rejected": -831.5400390625, + "loss": 0.1916, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": -3.699720859527588, + "rewards/margins": 2.6933109760284424, + "rewards/rejected": -6.393032073974609, "step": 890 }, { "epoch": 0.24, - "grad_norm": 4.96875, - "learning_rate": 4.7261664776249595e-06, - "logits/chosen": -2.1991798877716064, - "logits/rejected": -2.126394510269165, - "logps/chosen": -300.3690490722656, - "logps/rejected": -337.35076904296875, - "loss": 0.551, - "rewards/accuracies": 0.737500011920929, - "rewards/chosen": -0.5168617963790894, - "rewards/margins": 0.48721250891685486, - "rewards/rejected": -1.0040743350982666, + "grad_norm": 5.25, + "learning_rate": 4.70600469617513e-06, + "logits/chosen": -1.9878368377685547, + "logits/rejected": -1.7731959819793701, + "logps/chosen": -579.6895141601562, + "logps/rejected": -862.3414306640625, + "loss": 0.2297, + "rewards/accuracies": 0.9312499761581421, + "rewards/chosen": -3.830165386199951, + "rewards/margins": 2.8225326538085938, + "rewards/rejected": -6.652697563171387, "step": 900 }, { "epoch": 0.24, - "eval_logits/chosen": -2.3767266273498535, - "eval_logits/rejected": -2.257284641265869, - "eval_logps/chosen": -340.9869689941406, - "eval_logps/rejected": -364.7085266113281, - "eval_loss": 0.5723198056221008, - "eval_rewards/accuracies": 0.7099999785423279, - "eval_rewards/chosen": -0.5585272312164307, - "eval_rewards/margins": 0.45610883831977844, - "eval_rewards/rejected": -1.0146360397338867, - "eval_runtime": 783.0599, - "eval_samples_per_second": 2.554, - "eval_steps_per_second": 0.319, + "eval_logits/chosen": -2.071340799331665, + "eval_logits/rejected": -1.9512320756912231, + "eval_logps/chosen": -649.6268920898438, + "eval_logps/rejected": -682.4832153320312, + "eval_loss": 0.7276944518089294, + "eval_rewards/accuracies": 0.6014999747276306, + "eval_rewards/chosen": -3.644925355911255, + "eval_rewards/margins": 0.5474573969841003, + "eval_rewards/rejected": -4.1923828125, + "eval_runtime": 780.6188, + "eval_samples_per_second": 2.562, + "eval_steps_per_second": 0.32, "step": 900 }, { "epoch": 0.24, - "grad_norm": 4.9375, - "learning_rate": 4.715678265575463e-06, - "logits/chosen": -2.355442523956299, - "logits/rejected": -2.1583566665649414, - "logps/chosen": -355.2525939941406, - "logps/rejected": -336.24884033203125, - "loss": 0.5939, - "rewards/accuracies": 0.7124999761581421, - "rewards/chosen": -0.5178682208061218, - "rewards/margins": 0.3880244195461273, - "rewards/rejected": -0.9058926701545715, + "grad_norm": 4.28125, + "learning_rate": 4.694933865196185e-06, + "logits/chosen": -1.9098104238510132, + "logits/rejected": -1.6890531778335571, + "logps/chosen": -601.0123291015625, + "logps/rejected": -1018.0811767578125, + "loss": 0.1563, + "rewards/accuracies": 0.956250011920929, + "rewards/chosen": -4.1650543212890625, + "rewards/margins": 3.963716506958008, + "rewards/rejected": -8.12877082824707, "step": 910 }, { - "epoch": 0.24, - "grad_norm": 5.53125, - "learning_rate": 4.705005045028415e-06, - "logits/chosen": -2.257471799850464, - "logits/rejected": -2.175213098526001, - "logps/chosen": -341.3362731933594, - "logps/rejected": -361.2740173339844, - "loss": 0.5922, - "rewards/accuracies": 0.7124999761581421, - "rewards/chosen": -0.5742098093032837, - "rewards/margins": 0.4120204448699951, - "rewards/rejected": -0.9862302541732788, + "epoch": 0.25, + "grad_norm": 4.625, + "learning_rate": 4.68367194692736e-06, + "logits/chosen": -1.815717339515686, + "logits/rejected": -1.5684750080108643, + "logps/chosen": -765.4061279296875, + "logps/rejected": -1324.0279541015625, + "loss": 0.2069, + "rewards/accuracies": 0.918749988079071, + "rewards/chosen": -5.8041791915893555, + "rewards/margins": 5.39047384262085, + "rewards/rejected": -11.194653511047363, "step": 920 }, { - "epoch": 0.24, - "grad_norm": 6.125, - "learning_rate": 4.694147707194659e-06, - "logits/chosen": -2.330077648162842, - "logits/rejected": -2.316610336303711, - "logps/chosen": -347.4102478027344, - "logps/rejected": -376.43292236328125, - "loss": 0.5613, - "rewards/accuracies": 0.699999988079071, - "rewards/chosen": -0.5703408122062683, - "rewards/margins": 0.5020008683204651, - "rewards/rejected": -1.0723416805267334, + "epoch": 0.25, + "grad_norm": 6.75, + "learning_rate": 4.672219921812517e-06, + "logits/chosen": -1.815259337425232, + "logits/rejected": -1.4773845672607422, + "logps/chosen": -702.4598388671875, + "logps/rejected": -1157.608642578125, + "loss": 0.2329, + "rewards/accuracies": 0.9437500238418579, + "rewards/chosen": -4.984345436096191, + "rewards/margins": 4.5979905128479, + "rewards/rejected": -9.58233642578125, "step": 930 }, { "epoch": 0.25, - "grad_norm": 5.46875, - "learning_rate": 4.683107158658782e-06, - "logits/chosen": -2.2630085945129395, - "logits/rejected": -2.2645463943481445, - "logps/chosen": -359.7054138183594, - "logps/rejected": -383.32269287109375, - "loss": 0.5196, - "rewards/accuracies": 0.78125, - "rewards/chosen": -0.48080378770828247, - "rewards/margins": 0.5593954920768738, - "rewards/rejected": -1.0401992797851562, + "grad_norm": 6.65625, + "learning_rate": 4.660578786845907e-06, + "logits/chosen": -1.9987341165542603, + "logits/rejected": -1.7102264165878296, + "logps/chosen": -521.5107421875, + "logps/rejected": -883.5695190429688, + "loss": 0.1534, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": -3.3418431282043457, + "rewards/margins": 3.685710906982422, + "rewards/rejected": -7.027552604675293, "step": 940 }, { "epoch": 0.25, - "grad_norm": 7.625, - "learning_rate": 4.671884321303407e-06, - "logits/chosen": -2.3846192359924316, - "logits/rejected": -2.1435961723327637, - "logps/chosen": -315.91790771484375, - "logps/rejected": -355.96343994140625, - "loss": 0.5362, - "rewards/accuracies": 0.6875, - "rewards/chosen": -0.5064867734909058, - "rewards/margins": 0.5416258573532104, - "rewards/rejected": -1.0481126308441162, + "grad_norm": 8.3125, + "learning_rate": 4.6487495554853706e-06, + "logits/chosen": -1.8766582012176514, + "logits/rejected": -1.727230429649353, + "logps/chosen": -566.2430419921875, + "logps/rejected": -948.5830078125, + "loss": 0.2355, + "rewards/accuracies": 0.918749988079071, + "rewards/chosen": -3.5930278301239014, + "rewards/margins": 3.7260138988494873, + "rewards/rejected": -7.319043159484863, "step": 950 }, { - "epoch": 0.25, - "grad_norm": 7.4375, - "learning_rate": 4.660480132232224e-06, - "logits/chosen": -2.409409999847412, - "logits/rejected": -2.2849433422088623, - "logps/chosen": -346.1353759765625, - "logps/rejected": -366.18707275390625, - "loss": 0.5855, - "rewards/accuracies": 0.6937500238418579, - "rewards/chosen": -0.5678282976150513, - "rewards/margins": 0.43991154432296753, - "rewards/rejected": -1.007739782333374, + "epoch": 0.26, + "grad_norm": 6.5, + "learning_rate": 4.636733257564104e-06, + "logits/chosen": -1.8848927021026611, + "logits/rejected": -1.6707156896591187, + "logps/chosen": -633.5640869140625, + "logps/rejected": -1011.9500122070312, + "loss": 0.1879, + "rewards/accuracies": 0.9437500238418579, + "rewards/chosen": -4.319493770599365, + "rewards/margins": 3.7195515632629395, + "rewards/rejected": -8.039045333862305, "step": 960 }, { - "epoch": 0.25, - "grad_norm": 4.5, - "learning_rate": 4.6488955436917414e-06, - "logits/chosen": -2.370598554611206, - "logits/rejected": -2.1790661811828613, - "logps/chosen": -352.23797607421875, - "logps/rejected": -368.2437744140625, - "loss": 0.5155, - "rewards/accuracies": 0.7124999761581421, - "rewards/chosen": -0.5098623633384705, - "rewards/margins": 0.6602128744125366, - "rewards/rejected": -1.1700751781463623, + "epoch": 0.26, + "grad_norm": 14.625, + "learning_rate": 4.6245309392010094e-06, + "logits/chosen": -1.8968805074691772, + "logits/rejected": -1.5086712837219238, + "logps/chosen": -695.408935546875, + "logps/rejected": -1168.6949462890625, + "loss": 0.1625, + "rewards/accuracies": 0.918749988079071, + "rewards/chosen": -5.129621982574463, + "rewards/margins": 4.753883361816406, + "rewards/rejected": -9.883504867553711, "step": 970 }, { "epoch": 0.26, - "grad_norm": 5.125, - "learning_rate": 4.6371315229917644e-06, - "logits/chosen": -2.3567447662353516, - "logits/rejected": -2.2032203674316406, - "logps/chosen": -362.06390380859375, - "logps/rejected": -394.82550048828125, - "loss": 0.522, - "rewards/accuracies": 0.78125, - "rewards/chosen": -0.5349576473236084, - "rewards/margins": 0.604442298412323, - "rewards/rejected": -1.1393998861312866, + "grad_norm": 11.0, + "learning_rate": 4.612143662709619e-06, + "logits/chosen": -1.941301703453064, + "logits/rejected": -1.7558329105377197, + "logps/chosen": -655.1188354492188, + "logps/rejected": -1162.034912109375, + "loss": 0.1826, + "rewards/accuracies": 0.9125000238418579, + "rewards/chosen": -4.672366142272949, + "rewards/margins": 4.921967506408691, + "rewards/rejected": -9.59433364868164, "step": 980 }, { "epoch": 0.26, - "grad_norm": 18.375, - "learning_rate": 4.625189052424638e-06, - "logits/chosen": -2.288172721862793, - "logits/rejected": -2.201801061630249, - "logps/chosen": -327.40899658203125, - "logps/rejected": -366.302001953125, - "loss": 0.514, - "rewards/accuracies": 0.731249988079071, - "rewards/chosen": -0.700675368309021, - "rewards/margins": 0.6302372813224792, - "rewards/rejected": -1.3309125900268555, + "grad_norm": 15.0, + "learning_rate": 4.599572506505611e-06, + "logits/chosen": -2.061692476272583, + "logits/rejected": -1.7294963598251343, + "logps/chosen": -608.3771362304688, + "logps/rejected": -967.8352661132812, + "loss": 0.2038, + "rewards/accuracies": 0.918749988079071, + "rewards/chosen": -3.9572815895080566, + "rewards/margins": 3.729525089263916, + "rewards/rejected": -7.686806678771973, "step": 990 }, { - "epoch": 0.26, - "grad_norm": 6.46875, - "learning_rate": 4.613069129183218e-06, - "logits/chosen": -2.399292469024658, - "logits/rejected": -2.2850279808044434, - "logps/chosen": -393.8326721191406, - "logps/rejected": -400.1686096191406, - "loss": 0.5684, - "rewards/accuracies": 0.699999988079071, - "rewards/chosen": -0.6994020342826843, - "rewards/margins": 0.47142618894577026, - "rewards/rejected": -1.170828104019165, + "epoch": 0.27, + "grad_norm": 7.59375, + "learning_rate": 4.586818565012925e-06, + "logits/chosen": -1.9608147144317627, + "logits/rejected": -1.8354988098144531, + "logps/chosen": -583.6975708007812, + "logps/rejected": -952.1292724609375, + "loss": 0.1739, + "rewards/accuracies": 0.956250011920929, + "rewards/chosen": -4.060886383056641, + "rewards/margins": 3.516300678253174, + "rewards/rejected": -7.577187538146973, "step": 1000 }, { - "epoch": 0.26, - "eval_logits/chosen": -2.346391201019287, - "eval_logits/rejected": -2.2283005714416504, - "eval_logps/chosen": -360.5555114746094, - "eval_logps/rejected": -394.3551330566406, - "eval_loss": 0.5602221488952637, - "eval_rewards/accuracies": 0.7070000171661377, - "eval_rewards/chosen": -0.7542121410369873, - "eval_rewards/margins": 0.5568897724151611, - "eval_rewards/rejected": -1.311102032661438, - "eval_runtime": 783.2741, - "eval_samples_per_second": 2.553, - "eval_steps_per_second": 0.319, + "epoch": 0.27, + "eval_logits/chosen": -2.0702786445617676, + "eval_logits/rejected": -1.9476977586746216, + "eval_logps/chosen": -655.5535888671875, + "eval_logps/rejected": -696.7918701171875, + "eval_loss": 0.761256992816925, + "eval_rewards/accuracies": 0.6010000109672546, + "eval_rewards/chosen": -3.704192638397217, + "eval_rewards/margins": 0.6312769055366516, + "eval_rewards/rejected": -4.3354692459106445, + "eval_runtime": 781.7978, + "eval_samples_per_second": 2.558, + "eval_steps_per_second": 0.32, "step": 1000 }, { - "epoch": 0.26, - "grad_norm": 9.375, - "learning_rate": 4.600772765277607e-06, - "logits/chosen": -2.223280429840088, - "logits/rejected": -2.1989901065826416, - "logps/chosen": -330.130859375, - "logps/rejected": -382.6180725097656, - "loss": 0.5425, - "rewards/accuracies": 0.699999988079071, - "rewards/chosen": -0.7673664093017578, - "rewards/margins": 0.5920525789260864, - "rewards/rejected": -1.3594189882278442, + "epoch": 0.27, + "grad_norm": 19.125, + "learning_rate": 4.573882948568487e-06, + "logits/chosen": -1.999812126159668, + "logits/rejected": -1.7216962575912476, + "logps/chosen": -565.93505859375, + "logps/rejected": -929.3225708007812, + "loss": 0.2591, + "rewards/accuracies": 0.9312499761581421, + "rewards/chosen": -3.8413009643554688, + "rewards/margins": 3.6320464611053467, + "rewards/rejected": -7.4733476638793945, "step": 1010 }, { "epoch": 0.27, - "grad_norm": 10.5625, - "learning_rate": 4.588300987450652e-06, - "logits/chosen": -2.2941932678222656, - "logits/rejected": -2.2032299041748047, - "logps/chosen": -362.07977294921875, - "logps/rejected": -387.52996826171875, - "loss": 0.5543, - "rewards/accuracies": 0.762499988079071, - "rewards/chosen": -0.8918827176094055, - "rewards/margins": 0.598099410533905, - "rewards/rejected": -1.4899821281433105, + "grad_norm": 7.9375, + "learning_rate": 4.560766783325536e-06, + "logits/chosen": -1.9002296924591064, + "logits/rejected": -1.7296489477157593, + "logps/chosen": -585.817626953125, + "logps/rejected": -975.8126831054688, + "loss": 0.2053, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -3.8914363384246826, + "rewards/margins": 3.8831589221954346, + "rewards/rejected": -7.774595737457275, "step": 1020 }, { - "epoch": 0.27, - "grad_norm": 4.4375, - "learning_rate": 4.5756548370922136e-06, - "logits/chosen": -2.1861064434051514, - "logits/rejected": -2.1109728813171387, - "logps/chosen": -340.6225891113281, - "logps/rejected": -387.4969177246094, - "loss": 0.5734, - "rewards/accuracies": 0.706250011920929, - "rewards/chosen": -0.822590172290802, - "rewards/margins": 0.5768292546272278, - "rewards/rejected": -1.3994193077087402, + "epoch": 0.28, + "grad_norm": 16.875, + "learning_rate": 4.547471211155595e-06, + "logits/chosen": -1.9321715831756592, + "logits/rejected": -1.7310463190078735, + "logps/chosen": -688.9970092773438, + "logps/rejected": -1087.14306640625, + "loss": 0.238, + "rewards/accuracies": 0.887499988079071, + "rewards/chosen": -4.974576473236084, + "rewards/margins": 3.8230443000793457, + "rewards/rejected": -8.797619819641113, "step": 1030 }, { - "epoch": 0.27, - "grad_norm": 8.25, - "learning_rate": 4.562835370152206e-06, - "logits/chosen": -2.3119113445281982, - "logits/rejected": -2.002518653869629, - "logps/chosen": -402.314697265625, - "logps/rejected": -457.06463623046875, - "loss": 0.4708, - "rewards/accuracies": 0.7437499761581421, - "rewards/chosen": -0.7894701957702637, - "rewards/margins": 0.8879375457763672, - "rewards/rejected": -1.6774076223373413, + "epoch": 0.28, + "grad_norm": 9.6875, + "learning_rate": 4.533997389549052e-06, + "logits/chosen": -1.9757192134857178, + "logits/rejected": -1.7459895610809326, + "logps/chosen": -632.3358154296875, + "logps/rejected": -1068.840576171875, + "loss": 0.2295, + "rewards/accuracies": 0.9125000238418579, + "rewards/chosen": -4.492635250091553, + "rewards/margins": 4.191953182220459, + "rewards/rejected": -8.684588432312012, "step": 1040 }, { - "epoch": 0.27, - "grad_norm": 5.1875, - "learning_rate": 4.54984365705243e-06, - "logits/chosen": -2.284492015838623, - "logits/rejected": -2.1806671619415283, - "logps/chosen": -371.3755798339844, - "logps/rejected": -453.24591064453125, - "loss": 0.4773, - "rewards/accuracies": 0.762499988079071, - "rewards/chosen": -0.8460718393325806, - "rewards/margins": 0.8838627934455872, - "rewards/rejected": -1.7299346923828125, + "epoch": 0.28, + "grad_norm": 5.9375, + "learning_rate": 4.520346491514395e-06, + "logits/chosen": -1.8071882724761963, + "logits/rejected": -1.4430408477783203, + "logps/chosen": -718.0726318359375, + "logps/rejected": -1229.7724609375, + "loss": 0.1538, + "rewards/accuracies": 0.925000011920929, + "rewards/chosen": -5.237442970275879, + "rewards/margins": 5.075723171234131, + "rewards/rejected": -10.313165664672852, "step": 1050 }, { "epoch": 0.28, - "grad_norm": 9.125, - "learning_rate": 4.536680782597191e-06, - "logits/chosen": -2.1628429889678955, - "logits/rejected": -2.0977540016174316, - "logps/chosen": -370.38250732421875, - "logps/rejected": -426.35479736328125, - "loss": 0.5848, - "rewards/accuracies": 0.675000011920929, - "rewards/chosen": -1.142185926437378, - "rewards/margins": 0.637835681438446, - "rewards/rejected": -1.7800216674804688, + "grad_norm": 6.53125, + "learning_rate": 4.506519705476092e-06, + "logits/chosen": -1.7625377178192139, + "logits/rejected": -1.508211374282837, + "logps/chosen": -698.8258056640625, + "logps/rejected": -1272.889892578125, + "loss": 0.187, + "rewards/accuracies": 0.925000011920929, + "rewards/chosen": -5.130419731140137, + "rewards/margins": 5.599501609802246, + "rewards/rejected": -10.729921340942383, "step": 1060 }, { - "epoch": 0.28, - "grad_norm": 18.375, - "learning_rate": 4.523347845882718e-06, - "logits/chosen": -2.2164788246154785, - "logits/rejected": -2.0444154739379883, - "logps/chosen": -416.1336364746094, - "logps/rejected": -453.9571228027344, - "loss": 0.4591, - "rewards/accuracies": 0.7875000238418579, - "rewards/chosen": -1.0554344654083252, - "rewards/margins": 0.9309595823287964, - "rewards/rejected": -1.9863941669464111, + "epoch": 0.29, + "grad_norm": 3.265625, + "learning_rate": 4.4925182351711286e-06, + "logits/chosen": -1.927294135093689, + "logits/rejected": -1.6415202617645264, + "logps/chosen": -584.6781005859375, + "logps/rejected": -968.0924072265625, + "loss": 0.2194, + "rewards/accuracies": 0.9125000238418579, + "rewards/chosen": -4.002772331237793, + "rewards/margins": 3.7211735248565674, + "rewards/rejected": -7.723946571350098, "step": 1070 }, { - "epoch": 0.28, - "grad_norm": 5.90625, - "learning_rate": 4.50984596020539e-06, - "logits/chosen": -2.07619047164917, - "logits/rejected": -1.9385461807250977, - "logps/chosen": -401.8956298828125, - "logps/rejected": -442.7154235839844, - "loss": 0.5498, - "rewards/accuracies": 0.7250000238418579, - "rewards/chosen": -1.0267369747161865, - "rewards/margins": 0.7495890855789185, - "rewards/rejected": -1.7763264179229736, + "epoch": 0.29, + "grad_norm": 14.375, + "learning_rate": 4.478343299544208e-06, + "logits/chosen": -1.805546522140503, + "logits/rejected": -1.6112562417984009, + "logps/chosen": -647.7420043945312, + "logps/rejected": -1060.8192138671875, + "loss": 0.2179, + "rewards/accuracies": 0.90625, + "rewards/chosen": -4.759741306304932, + "rewards/margins": 4.042403221130371, + "rewards/rejected": -8.802144050598145, "step": 1080 }, { "epoch": 0.29, - "grad_norm": 5.28125, - "learning_rate": 4.4961762529687745e-06, - "logits/chosen": -2.176191806793213, - "logits/rejected": -2.0374581813812256, - "logps/chosen": -372.7863464355469, - "logps/rejected": -427.24658203125, - "loss": 0.5695, - "rewards/accuracies": 0.706250011920929, - "rewards/chosen": -0.9093082547187805, - "rewards/margins": 0.7514539957046509, - "rewards/rejected": -1.6607621908187866, + "grad_norm": 5.03125, + "learning_rate": 4.463996132641641e-06, + "logits/chosen": -1.8722712993621826, + "logits/rejected": -1.560368537902832, + "logps/chosen": -722.08349609375, + "logps/rejected": -1167.6322021484375, + "loss": 0.162, + "rewards/accuracies": 0.9437500238418579, + "rewards/chosen": -5.170421600341797, + "rewards/margins": 4.453159809112549, + "rewards/rejected": -9.623581886291504, "step": 1090 }, { "epoch": 0.29, - "grad_norm": 6.09375, - "learning_rate": 4.482339865589492e-06, - "logits/chosen": -2.1343674659729004, - "logits/rejected": -1.996208906173706, - "logps/chosen": -379.14581298828125, - "logps/rejected": -391.421142578125, - "loss": 0.5722, - "rewards/accuracies": 0.731249988079071, - "rewards/chosen": -0.9395943880081177, - "rewards/margins": 0.568466305732727, - "rewards/rejected": -1.5080606937408447, + "grad_norm": 7.3125, + "learning_rate": 4.449477983503902e-06, + "logits/chosen": -1.6484050750732422, + "logits/rejected": -1.3613227605819702, + "logps/chosen": -704.55322265625, + "logps/rejected": -1220.0943603515625, + "loss": 0.1868, + "rewards/accuracies": 0.9375, + "rewards/chosen": -5.208704948425293, + "rewards/margins": 5.147915840148926, + "rewards/rejected": -10.356620788574219, "step": 1100 }, { "epoch": 0.29, - "eval_logits/chosen": -2.1819868087768555, - "eval_logits/rejected": -2.0676610469818115, - "eval_logps/chosen": -364.4903564453125, - "eval_logps/rejected": -408.9803466796875, - "eval_loss": 0.5428664684295654, - "eval_rewards/accuracies": 0.7239999771118164, - "eval_rewards/chosen": -0.793560802936554, - "eval_rewards/margins": 0.6637934446334839, - "eval_rewards/rejected": -1.4573543071746826, - "eval_runtime": 782.9279, - "eval_samples_per_second": 2.555, - "eval_steps_per_second": 0.319, + "eval_logits/chosen": -1.8779360055923462, + "eval_logits/rejected": -1.7606240510940552, + "eval_logps/chosen": -797.6035766601562, + "eval_logps/rejected": -851.0315551757812, + "eval_loss": 0.9360729455947876, + "eval_rewards/accuracies": 0.5724999904632568, + "eval_rewards/chosen": -5.124693393707275, + "eval_rewards/margins": 0.753172755241394, + "eval_rewards/rejected": -5.877865791320801, + "eval_runtime": 780.1782, + "eval_samples_per_second": 2.564, + "eval_steps_per_second": 0.32, "step": 1100 }, { - "epoch": 0.29, - "grad_norm": 5.25, - "learning_rate": 4.468337953401909e-06, - "logits/chosen": -2.15395450592041, - "logits/rejected": -2.0820419788360596, - "logps/chosen": -375.0626525878906, - "logps/rejected": -429.63238525390625, - "loss": 0.547, - "rewards/accuracies": 0.7124999761581421, - "rewards/chosen": -0.8576391339302063, - "rewards/margins": 0.6185614466667175, - "rewards/rejected": -1.4762006998062134, + "epoch": 0.3, + "grad_norm": 6.03125, + "learning_rate": 4.434790116056898e-06, + "logits/chosen": -1.6855872869491577, + "logits/rejected": -1.4614320993423462, + "logps/chosen": -692.4788818359375, + "logps/rejected": -1222.5704345703125, + "loss": 0.2294, + "rewards/accuracies": 0.90625, + "rewards/chosen": -5.072499752044678, + "rewards/margins": 5.230921745300293, + "rewards/rejected": -10.303421974182129, "step": 1110 }, { - "epoch": 0.29, - "grad_norm": 6.53125, - "learning_rate": 4.45417168556166e-06, - "logits/chosen": -2.0685439109802246, - "logits/rejected": -1.959359884262085, - "logps/chosen": -349.4896545410156, - "logps/rejected": -422.0367736816406, - "loss": 0.5171, - "rewards/accuracies": 0.762499988079071, - "rewards/chosen": -0.8915795087814331, - "rewards/margins": 0.7198113799095154, - "rewards/rejected": -1.6113910675048828, + "epoch": 0.3, + "grad_norm": 2.625, + "learning_rate": 4.419933809001929e-06, + "logits/chosen": -1.776877760887146, + "logits/rejected": -1.4783637523651123, + "logps/chosen": -685.8599853515625, + "logps/rejected": -1175.57958984375, + "loss": 0.1559, + "rewards/accuracies": 0.956250011920929, + "rewards/chosen": -4.9904985427856445, + "rewards/margins": 4.894043922424316, + "rewards/rejected": -9.884542465209961, "step": 1120 }, { "epoch": 0.3, - "grad_norm": 11.125, - "learning_rate": 4.439842244948036e-06, - "logits/chosen": -2.0176918506622314, - "logits/rejected": -1.9431390762329102, - "logps/chosen": -387.5065002441406, - "logps/rejected": -457.86065673828125, - "loss": 0.5738, - "rewards/accuracies": 0.699999988079071, - "rewards/chosen": -1.048559308052063, - "rewards/margins": 0.6856990456581116, - "rewards/rejected": -1.7342584133148193, + "grad_norm": 13.125, + "learning_rate": 4.404910355704362e-06, + "logits/chosen": -1.676234483718872, + "logits/rejected": -1.4960626363754272, + "logps/chosen": -788.3824462890625, + "logps/rejected": -1284.5242919921875, + "loss": 0.1796, + "rewards/accuracies": 0.918749988079071, + "rewards/chosen": -6.101700305938721, + "rewards/margins": 4.996757507324219, + "rewards/rejected": -11.098456382751465, "step": 1130 }, { "epoch": 0.3, - "grad_norm": 9.3125, - "learning_rate": 4.425350828065204e-06, - "logits/chosen": -2.1399078369140625, - "logits/rejected": -1.9063644409179688, - "logps/chosen": -398.8601989746094, - "logps/rejected": -416.056640625, - "loss": 0.5311, - "rewards/accuracies": 0.793749988079071, - "rewards/chosen": -0.9232648611068726, - "rewards/margins": 0.6931252479553223, - "rewards/rejected": -1.6163902282714844, + "grad_norm": 9.125, + "learning_rate": 4.389721064081045e-06, + "logits/chosen": -1.748809814453125, + "logits/rejected": -1.509686827659607, + "logps/chosen": -748.4895629882812, + "logps/rejected": -1260.5853271484375, + "loss": 0.204, + "rewards/accuracies": 0.9125000238418579, + "rewards/chosen": -5.650696754455566, + "rewards/margins": 5.0570387840271, + "rewards/rejected": -10.707735061645508, "step": 1140 }, { - "epoch": 0.3, - "grad_norm": 5.9375, - "learning_rate": 4.410698644942303e-06, - "logits/chosen": -2.1537070274353027, - "logits/rejected": -2.044154167175293, - "logps/chosen": -377.70782470703125, - "logps/rejected": -427.4280700683594, - "loss": 0.5116, - "rewards/accuracies": 0.71875, - "rewards/chosen": -0.8818570971488953, - "rewards/margins": 0.7475295066833496, - "rewards/rejected": -1.6293865442276, + "epoch": 0.31, + "grad_norm": 2.3125, + "learning_rate": 4.3743672564864305e-06, + "logits/chosen": -1.7800718545913696, + "logits/rejected": -1.5566697120666504, + "logps/chosen": -759.5309448242188, + "logps/rejected": -1315.963134765625, + "loss": 0.1966, + "rewards/accuracies": 0.9312499761581421, + "rewards/chosen": -5.749941825866699, + "rewards/margins": 5.49472713470459, + "rewards/rejected": -11.244667053222656, "step": 1150 }, { - "epoch": 0.3, - "grad_norm": 9.9375, - "learning_rate": 4.395886919032406e-06, - "logits/chosen": -1.9682527780532837, - "logits/rejected": -1.9200541973114014, - "logps/chosen": -385.158203125, - "logps/rejected": -427.021484375, - "loss": 0.5316, - "rewards/accuracies": 0.7562500238418579, - "rewards/chosen": -1.030469536781311, - "rewards/margins": 0.7004586458206177, - "rewards/rejected": -1.7309284210205078, + "epoch": 0.31, + "grad_norm": 13.75, + "learning_rate": 4.358850269597458e-06, + "logits/chosen": -1.9110691547393799, + "logits/rejected": -1.7078921794891357, + "logps/chosen": -694.5254516601562, + "logps/rejected": -1189.4229736328125, + "loss": 0.219, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -5.015244960784912, + "rewards/margins": 5.033026695251465, + "rewards/rejected": -10.048272132873535, "step": 1160 }, { "epoch": 0.31, - "grad_norm": 7.59375, - "learning_rate": 4.380916887110366e-06, - "logits/chosen": -2.1585569381713867, - "logits/rejected": -1.9411481618881226, - "logps/chosen": -395.1438903808594, - "logps/rejected": -424.6376953125, - "loss": 0.5312, - "rewards/accuracies": 0.7562500238418579, - "rewards/chosen": -1.1806066036224365, - "rewards/margins": 0.7220126390457153, - "rewards/rejected": -1.9026191234588623, + "grad_norm": 12.6875, + "learning_rate": 4.343171454297187e-06, + "logits/chosen": -1.8295984268188477, + "logits/rejected": -1.617525339126587, + "logps/chosen": -707.7969970703125, + "logps/rejected": -1195.290771484375, + "loss": 0.1967, + "rewards/accuracies": 0.887499988079071, + "rewards/chosen": -5.0687055587768555, + "rewards/margins": 4.745570182800293, + "rewards/rejected": -9.814275741577148, "step": 1170 }, { - "epoch": 0.31, - "grad_norm": 9.4375, - "learning_rate": 4.365789799169539e-06, - "logits/chosen": -1.9048945903778076, - "logits/rejected": -1.9615923166275024, - "logps/chosen": -394.121337890625, - "logps/rejected": -464.55633544921875, - "loss": 0.5309, - "rewards/accuracies": 0.7250000238418579, - "rewards/chosen": -1.21621572971344, - "rewards/margins": 0.7191425561904907, - "rewards/rejected": -1.9353582859039307, + "epoch": 0.32, + "grad_norm": 8.875, + "learning_rate": 4.3273321755571855e-06, + "logits/chosen": -1.7773466110229492, + "logits/rejected": -1.505315899848938, + "logps/chosen": -755.8875732421875, + "logps/rejected": -1358.144287109375, + "loss": 0.1713, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -5.826977252960205, + "rewards/margins": 5.8857316970825195, + "rewards/rejected": -11.712709426879883, "step": 1180 }, { - "epoch": 0.31, - "grad_norm": 8.375, - "learning_rate": 4.350506918317416e-06, - "logits/chosen": -2.11989164352417, - "logits/rejected": -1.946270227432251, - "logps/chosen": -387.0372009277344, - "logps/rejected": -453.46307373046875, - "loss": 0.52, - "rewards/accuracies": 0.7437499761581421, - "rewards/chosen": -1.2322447299957275, - "rewards/margins": 0.72694331407547, - "rewards/rejected": -1.9591881036758423, + "epoch": 0.32, + "grad_norm": 4.65625, + "learning_rate": 4.3113338123187046e-06, + "logits/chosen": -1.7583072185516357, + "logits/rejected": -1.4466537237167358, + "logps/chosen": -786.8846435546875, + "logps/rejected": -1387.0775146484375, + "loss": 0.1455, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": -6.128576755523682, + "rewards/margins": 5.886734962463379, + "rewards/rejected": -12.015311241149902, "step": 1190 }, { - "epoch": 0.31, - "grad_norm": 9.375, - "learning_rate": 4.335069520670149e-06, - "logits/chosen": -1.9962623119354248, - "logits/rejected": -1.9240058660507202, - "logps/chosen": -365.1288757324219, - "logps/rejected": -439.9312438964844, - "loss": 0.5866, - "rewards/accuracies": 0.6625000238418579, - "rewards/chosen": -1.1962945461273193, - "rewards/margins": 0.6338215470314026, - "rewards/rejected": -1.8301159143447876, + "epoch": 0.32, + "grad_norm": 3.78125, + "learning_rate": 4.295177757372627e-06, + "logits/chosen": -1.9021990299224854, + "logits/rejected": -1.6431891918182373, + "logps/chosen": -681.1278076171875, + "logps/rejected": -1244.9158935546875, + "loss": 0.191, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": -4.913496971130371, + "rewards/margins": 5.573685646057129, + "rewards/rejected": -10.487181663513184, "step": 1200 }, { - "epoch": 0.31, - "eval_logits/chosen": -2.1387972831726074, - "eval_logits/rejected": -2.0249171257019043, - "eval_logps/chosen": -389.76617431640625, - "eval_logps/rejected": -436.6127624511719, - "eval_loss": 0.5338234305381775, - "eval_rewards/accuracies": 0.7204999923706055, - "eval_rewards/chosen": -1.0463193655014038, - "eval_rewards/margins": 0.6873589754104614, - "eval_rewards/rejected": -1.7336784601211548, - "eval_runtime": 783.6864, - "eval_samples_per_second": 2.552, - "eval_steps_per_second": 0.319, + "epoch": 0.32, + "eval_logits/chosen": -2.013472557067871, + "eval_logits/rejected": -1.893463134765625, + "eval_logps/chosen": -719.9283447265625, + "eval_logps/rejected": -763.7403564453125, + "eval_loss": 0.8649675250053406, + "eval_rewards/accuracies": 0.578499972820282, + "eval_rewards/chosen": -4.347940444946289, + "eval_rewards/margins": 0.6570136547088623, + "eval_rewards/rejected": -5.0049543380737305, + "eval_runtime": 780.7054, + "eval_samples_per_second": 2.562, + "eval_steps_per_second": 0.32, "step": 1200 }, { "epoch": 0.32, - "grad_norm": 4.34375, - "learning_rate": 4.319478895246e-06, - "logits/chosen": -2.0610435009002686, - "logits/rejected": -1.9308923482894897, - "logps/chosen": -357.9656677246094, - "logps/rejected": -389.885498046875, - "loss": 0.542, - "rewards/accuracies": 0.7124999761581421, - "rewards/chosen": -0.9039508104324341, - "rewards/margins": 0.6048199534416199, - "rewards/rejected": -1.5087708234786987, + "grad_norm": 2.6875, + "learning_rate": 4.278865417238212e-06, + "logits/chosen": -1.8427289724349976, + "logits/rejected": -1.6436818838119507, + "logps/chosen": -644.0859375, + "logps/rejected": -1054.9425048828125, + "loss": 0.188, + "rewards/accuracies": 0.893750011920929, + "rewards/chosen": -4.5779314041137695, + "rewards/margins": 4.159191608428955, + "rewards/rejected": -8.737122535705566, "step": 1210 }, { - "epoch": 0.32, - "grad_norm": 9.75, - "learning_rate": 4.303736343857704e-06, - "logits/chosen": -2.0861053466796875, - "logits/rejected": -2.018589735031128, - "logps/chosen": -353.1060791015625, - "logps/rejected": -446.2403869628906, - "loss": 0.4978, - "rewards/accuracies": 0.762499988079071, - "rewards/chosen": -0.7825400233268738, - "rewards/margins": 0.8096281886100769, - "rewards/rejected": -1.5921683311462402, + "epoch": 0.33, + "grad_norm": 2.359375, + "learning_rate": 4.262398212040646e-06, + "logits/chosen": -1.8668807744979858, + "logits/rejected": -1.623335838317871, + "logps/chosen": -723.8055419921875, + "logps/rejected": -1369.9927978515625, + "loss": 0.1109, + "rewards/accuracies": 0.9624999761581421, + "rewards/chosen": -5.331421375274658, + "rewards/margins": 6.487430572509766, + "rewards/rejected": -11.818851470947266, "step": 1220 }, { - "epoch": 0.32, - "grad_norm": 8.375, - "learning_rate": 4.287843181003772e-06, - "logits/chosen": -2.0886423587799072, - "logits/rejected": -1.9124377965927124, - "logps/chosen": -429.72119140625, - "logps/rejected": -441.4107360839844, - "loss": 0.576, - "rewards/accuracies": 0.65625, - "rewards/chosen": -1.1503570079803467, - "rewards/margins": 0.6045898795127869, - "rewards/rejected": -1.7549469470977783, + "epoch": 0.33, + "grad_norm": 32.5, + "learning_rate": 4.245777575387413e-06, + "logits/chosen": -1.7328977584838867, + "logits/rejected": -1.4244635105133057, + "logps/chosen": -891.6882934570312, + "logps/rejected": -1529.746337890625, + "loss": 0.2673, + "rewards/accuracies": 0.887499988079071, + "rewards/chosen": -7.042812347412109, + "rewards/margins": 6.3449177742004395, + "rewards/rejected": -13.387730598449707, "step": 1230 }, { - "epoch": 0.32, - "grad_norm": 8.9375, - "learning_rate": 4.27180073375873e-06, - "logits/chosen": -2.0608718395233154, - "logits/rejected": -1.8816003799438477, - "logps/chosen": -424.85528564453125, - "logps/rejected": -464.27215576171875, - "loss": 0.5232, - "rewards/accuracies": 0.75, - "rewards/chosen": -1.177444338798523, - "rewards/margins": 0.769978404045105, - "rewards/rejected": -1.947422742843628, + "epoch": 0.33, + "grad_norm": 5.5625, + "learning_rate": 4.229004954243483e-06, + "logits/chosen": -1.7528269290924072, + "logits/rejected": -1.5056416988372803, + "logps/chosen": -1028.7825927734375, + "logps/rejected": -1734.1068115234375, + "loss": 0.1938, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": -8.365468978881836, + "rewards/margins": 6.998432159423828, + "rewards/rejected": -15.36390209197998, "step": 1240 }, { "epoch": 0.33, - "grad_norm": 3.734375, - "learning_rate": 4.255610341662304e-06, - "logits/chosen": -2.1295692920684814, - "logits/rejected": -1.8819198608398438, - "logps/chosen": -387.6376647949219, - "logps/rejected": -432.35418701171875, - "loss": 0.5569, - "rewards/accuracies": 0.65625, - "rewards/chosen": -1.10086989402771, - "rewards/margins": 0.6811186075210571, - "rewards/rejected": -1.781988501548767, + "grad_norm": 5.375, + "learning_rate": 4.212081808805342e-06, + "logits/chosen": -1.768991470336914, + "logits/rejected": -1.4559440612792969, + "logps/chosen": -1029.68505859375, + "logps/rejected": -1800.233154296875, + "loss": 0.158, + "rewards/accuracies": 0.918749988079071, + "rewards/chosen": -8.380985260009766, + "rewards/margins": 7.644388675689697, + "rewards/rejected": -16.025373458862305, "step": 1250 }, { - "epoch": 0.33, - "grad_norm": 6.53125, - "learning_rate": 4.2392733566075764e-06, - "logits/chosen": -2.117842674255371, - "logits/rejected": -1.9765924215316772, - "logps/chosen": -393.4205017089844, - "logps/rejected": -428.7975158691406, - "loss": 0.5688, - "rewards/accuracies": 0.706250011920929, - "rewards/chosen": -1.1825861930847168, - "rewards/margins": 0.5461826324462891, - "rewards/rejected": -1.7287689447402954, + "epoch": 0.34, + "grad_norm": 8.8125, + "learning_rate": 4.195009612373873e-06, + "logits/chosen": -1.784402847290039, + "logits/rejected": -1.4816844463348389, + "logps/chosen": -856.8828125, + "logps/rejected": -1528.76513671875, + "loss": 0.2102, + "rewards/accuracies": 0.918749988079071, + "rewards/chosen": -6.638609409332275, + "rewards/margins": 6.666424751281738, + "rewards/rejected": -13.305035591125488, "step": 1260 }, { - "epoch": 0.33, - "grad_norm": 7.40625, - "learning_rate": 4.2227911427280975e-06, - "logits/chosen": -2.0678598880767822, - "logits/rejected": -1.8555564880371094, - "logps/chosen": -370.04156494140625, - "logps/rejected": -407.6815490722656, - "loss": 0.5463, - "rewards/accuracies": 0.675000011920929, - "rewards/chosen": -1.0184184312820435, - "rewards/margins": 0.7026650905609131, - "rewards/rejected": -1.721083402633667, + "epoch": 0.34, + "grad_norm": 11.25, + "learning_rate": 4.177789851226086e-06, + "logits/chosen": -1.8825385570526123, + "logits/rejected": -1.5658035278320312, + "logps/chosen": -762.8591918945312, + "logps/rejected": -1407.0120849609375, + "loss": 0.1716, + "rewards/accuracies": 0.9375, + "rewards/chosen": -5.705621719360352, + "rewards/margins": 6.4548749923706055, + "rewards/rejected": -12.160497665405273, "step": 1270 }, { - "epoch": 0.33, - "grad_norm": 9.4375, - "learning_rate": 4.206165076283983e-06, - "logits/chosen": -2.090740442276001, - "logits/rejected": -1.9197864532470703, - "logps/chosen": -359.6732482910156, - "logps/rejected": -415.2185974121094, - "loss": 0.5172, - "rewards/accuracies": 0.737500011920929, - "rewards/chosen": -0.9589044451713562, - "rewards/margins": 0.7464350461959839, - "rewards/rejected": -1.7053394317626953, + "epoch": 0.34, + "grad_norm": 10.25, + "learning_rate": 4.160424024485734e-06, + "logits/chosen": -1.794632911682129, + "logits/rejected": -1.5640695095062256, + "logps/chosen": -725.0952758789062, + "logps/rejected": -1341.0850830078125, + "loss": 0.1473, + "rewards/accuracies": 0.925000011920929, + "rewards/chosen": -5.345667839050293, + "rewards/margins": 6.119729995727539, + "rewards/rejected": -11.465397834777832, "step": 1280 }, { "epoch": 0.34, - "grad_norm": 8.6875, - "learning_rate": 4.189396545546995e-06, - "logits/chosen": -2.073063373565674, - "logits/rejected": -1.9953018426895142, - "logps/chosen": -351.28448486328125, - "logps/rejected": -401.362060546875, - "loss": 0.5346, - "rewards/accuracies": 0.6937500238418579, - "rewards/chosen": -0.8371566534042358, - "rewards/margins": 0.7300963997840881, - "rewards/rejected": -1.5672528743743896, + "grad_norm": 4.28125, + "learning_rate": 4.1429136439927965e-06, + "logits/chosen": -1.8779207468032837, + "logits/rejected": -1.570647120475769, + "logps/chosen": -688.0980224609375, + "logps/rejected": -1310.856689453125, + "loss": 0.1614, + "rewards/accuracies": 0.9437500238418579, + "rewards/chosen": -4.92476749420166, + "rewards/margins": 6.2549147605896, + "rewards/rejected": -11.179682731628418, "step": 1290 }, { - "epoch": 0.34, - "grad_norm": 5.375, - "learning_rate": 4.172486950684627e-06, - "logits/chosen": -2.1238481998443604, - "logits/rejected": -2.0095021724700928, - "logps/chosen": -348.4383239746094, - "logps/rejected": -406.6896667480469, - "loss": 0.5659, - "rewards/accuracies": 0.675000011920929, - "rewards/chosen": -0.7746914029121399, - "rewards/margins": 0.6173580884933472, - "rewards/rejected": -1.3920494318008423, + "epoch": 0.35, + "grad_norm": 13.0625, + "learning_rate": 4.125260234171861e-06, + "logits/chosen": -1.7949470281600952, + "logits/rejected": -1.541264295578003, + "logps/chosen": -773.5762939453125, + "logps/rejected": -1390.2864990234375, + "loss": 0.1594, + "rewards/accuracies": 0.9375, + "rewards/chosen": -5.758688926696777, + "rewards/margins": 6.116512298583984, + "rewards/rejected": -11.875202178955078, "step": 1300 }, { - "epoch": 0.34, - "eval_logits/chosen": -2.1049439907073975, - "eval_logits/rejected": -1.9892892837524414, - "eval_logps/chosen": -371.2005920410156, - "eval_logps/rejected": -417.2295837402344, - "eval_loss": 0.5310410261154175, - "eval_rewards/accuracies": 0.7310000061988831, - "eval_rewards/chosen": -0.8606633543968201, - "eval_rewards/margins": 0.679183304309845, - "eval_rewards/rejected": -1.539846658706665, - "eval_runtime": 782.8904, - "eval_samples_per_second": 2.555, - "eval_steps_per_second": 0.319, + "epoch": 0.35, + "eval_logits/chosen": -1.9477717876434326, + "eval_logits/rejected": -1.8285531997680664, + "eval_logps/chosen": -788.9777221679688, + "eval_logps/rejected": -848.410400390625, + "eval_loss": 0.9262253642082214, + "eval_rewards/accuracies": 0.593999981880188, + "eval_rewards/chosen": -5.038434982299805, + "eval_rewards/margins": 0.8132193684577942, + "eval_rewards/rejected": -5.851653575897217, + "eval_runtime": 780.3851, + "eval_samples_per_second": 2.563, + "eval_steps_per_second": 0.32, "step": 1300 }, { - "epoch": 0.34, - "grad_norm": 10.75, - "learning_rate": 4.155437703643182e-06, - "logits/chosen": -2.134488582611084, - "logits/rejected": -1.9875046014785767, - "logps/chosen": -345.08453369140625, - "logps/rejected": -394.74932861328125, - "loss": 0.5007, - "rewards/accuracies": 0.71875, - "rewards/chosen": -0.8158382177352905, - "rewards/margins": 0.7958999872207642, - "rewards/rejected": -1.6117382049560547, + "epoch": 0.35, + "grad_norm": 13.8125, + "learning_rate": 4.107465331899411e-06, + "logits/chosen": -1.7850643396377563, + "logits/rejected": -1.4996929168701172, + "logps/chosen": -707.7679443359375, + "logps/rejected": -1261.5113525390625, + "loss": 0.238, + "rewards/accuracies": 0.893750011920929, + "rewards/chosen": -5.266989707946777, + "rewards/margins": 5.456046104431152, + "rewards/rejected": -10.72303581237793, "step": 1310 }, { "epoch": 0.35, - "grad_norm": 8.875, - "learning_rate": 4.138250228029882e-06, - "logits/chosen": -2.039412021636963, - "logits/rejected": -1.9596283435821533, - "logps/chosen": -389.32757568359375, - "logps/rejected": -462.86376953125, - "loss": 0.5427, - "rewards/accuracies": 0.7250000238418579, - "rewards/chosen": -1.1479754447937012, - "rewards/margins": 0.6707499623298645, - "rewards/rejected": -1.818725347518921, + "grad_norm": 27.25, + "learning_rate": 4.089530486370025e-06, + "logits/chosen": -1.7926385402679443, + "logits/rejected": -1.560814619064331, + "logps/chosen": -695.866455078125, + "logps/rejected": -1322.89208984375, + "loss": 0.1696, + "rewards/accuracies": 0.9375, + "rewards/chosen": -5.130670070648193, + "rewards/margins": 6.105543613433838, + "rewards/rejected": -11.236213684082031, "step": 1320 }, { - "epoch": 0.35, - "grad_norm": 4.53125, - "learning_rate": 4.120925958993994e-06, - "logits/chosen": -2.0538439750671387, - "logits/rejected": -1.8900225162506104, - "logps/chosen": -349.4403381347656, - "logps/rejected": -406.2320251464844, - "loss": 0.5515, - "rewards/accuracies": 0.731249988079071, - "rewards/chosen": -0.9707523584365845, - "rewards/margins": 0.6485539674758911, - "rewards/rejected": -1.6193063259124756, + "epoch": 0.36, + "grad_norm": 4.625, + "learning_rate": 4.071457258961514e-06, + "logits/chosen": -1.8761751651763916, + "logits/rejected": -1.5752075910568237, + "logps/chosen": -634.4463500976562, + "logps/rejected": -1144.76708984375, + "loss": 0.182, + "rewards/accuracies": 0.918749988079071, + "rewards/chosen": -4.44040060043335, + "rewards/margins": 4.965670585632324, + "rewards/rejected": -9.406070709228516, "step": 1330 }, { - "epoch": 0.35, - "grad_norm": 10.75, - "learning_rate": 4.103466343106999e-06, - "logits/chosen": -2.099672317504883, - "logits/rejected": -1.962204933166504, - "logps/chosen": -396.4217529296875, - "logps/rejected": -423.8016662597656, - "loss": 0.5514, - "rewards/accuracies": 0.7250000238418579, - "rewards/chosen": -1.0410661697387695, - "rewards/margins": 0.6128488779067993, - "rewards/rejected": -1.6539150476455688, + "epoch": 0.36, + "grad_norm": 4.28125, + "learning_rate": 4.05324722309898e-06, + "logits/chosen": -1.9114227294921875, + "logits/rejected": -1.6285560131072998, + "logps/chosen": -579.0138549804688, + "logps/rejected": -1113.2867431640625, + "loss": 0.1373, + "rewards/accuracies": 0.9312499761581421, + "rewards/chosen": -3.8820624351501465, + "rewards/margins": 5.322786808013916, + "rewards/rejected": -9.204849243164062, "step": 1340 }, { - "epoch": 0.35, - "grad_norm": 9.3125, - "learning_rate": 4.085872838241797e-06, - "logits/chosen": -1.950664758682251, - "logits/rejected": -1.8220726251602173, - "logps/chosen": -407.6488952636719, - "logps/rejected": -432.849609375, - "loss": 0.5876, - "rewards/accuracies": 0.6812499761581421, - "rewards/chosen": -1.10310959815979, - "rewards/margins": 0.5940074324607849, - "rewards/rejected": -1.6971168518066406, + "epoch": 0.36, + "grad_norm": 5.5, + "learning_rate": 4.034901964117844e-06, + "logits/chosen": -1.9491599798202515, + "logits/rejected": -1.6949039697647095, + "logps/chosen": -544.5189819335938, + "logps/rejected": -1022.8623046875, + "loss": 0.2103, + "rewards/accuracies": 0.9125000238418579, + "rewards/chosen": -3.5138885974884033, + "rewards/margins": 4.748744010925293, + "rewards/rejected": -8.262632369995117, "step": 1350 }, { "epoch": 0.36, - "grad_norm": 7.28125, - "learning_rate": 4.06814691345098e-06, - "logits/chosen": -1.9413379430770874, - "logits/rejected": -1.7642743587493896, - "logps/chosen": -379.175048828125, - "logps/rejected": -433.0530700683594, - "loss": 0.5028, - "rewards/accuracies": 0.737500011920929, - "rewards/chosen": -1.0211687088012695, - "rewards/margins": 0.7622953653335571, - "rewards/rejected": -1.7834640741348267, + "grad_norm": 5.3125, + "learning_rate": 4.0164230791258265e-06, + "logits/chosen": -1.790183663368225, + "logits/rejected": -1.5226788520812988, + "logps/chosen": -645.8140869140625, + "logps/rejected": -1219.503662109375, + "loss": 0.1358, + "rewards/accuracies": 0.9437500238418579, + "rewards/chosen": -4.579228401184082, + "rewards/margins": 5.6584672927856445, + "rewards/rejected": -10.23769474029541, "step": 1360 }, { - "epoch": 0.36, - "grad_norm": 7.59375, - "learning_rate": 4.050290048844171e-06, - "logits/chosen": -2.095034122467041, - "logits/rejected": -1.9543094635009766, - "logps/chosen": -379.04046630859375, - "logps/rejected": -442.8995666503906, - "loss": 0.5448, - "rewards/accuracies": 0.7250000238418579, - "rewards/chosen": -0.9098923802375793, - "rewards/margins": 0.6899572014808655, - "rewards/rejected": -1.5998497009277344, + "epoch": 0.37, + "grad_norm": 4.78125, + "learning_rate": 3.9978121768639035e-06, + "logits/chosen": -1.713234543800354, + "logits/rejected": -1.463331699371338, + "logps/chosen": -963.2076416015625, + "logps/rejected": -1653.765625, + "loss": 0.3174, + "rewards/accuracies": 0.893750011920929, + "rewards/chosen": -7.8012800216674805, + "rewards/margins": 6.784623146057129, + "rewards/rejected": -14.585902214050293, "step": 1370 }, { - "epoch": 0.36, - "grad_norm": 5.6875, - "learning_rate": 4.032303735464422e-06, - "logits/chosen": -2.0973174571990967, - "logits/rejected": -2.030759811401367, - "logps/chosen": -381.38037109375, - "logps/rejected": -437.8154296875, - "loss": 0.4747, - "rewards/accuracies": 0.7875000238418579, - "rewards/chosen": -0.8857170343399048, - "rewards/margins": 0.8334710001945496, - "rewards/rejected": -1.7191880941390991, + "epoch": 0.37, + "grad_norm": 5.21875, + "learning_rate": 3.979070877566259e-06, + "logits/chosen": -1.6942354440689087, + "logits/rejected": -1.4278452396392822, + "logps/chosen": -851.5432739257812, + "logps/rejected": -1492.692138671875, + "loss": 0.1636, + "rewards/accuracies": 0.9437500238418579, + "rewards/chosen": -6.686502933502197, + "rewards/margins": 6.411787509918213, + "rewards/rejected": -13.098289489746094, "step": 1380 }, { - "epoch": 0.36, - "grad_norm": 6.09375, - "learning_rate": 4.014189475163727e-06, - "logits/chosen": -1.9514198303222656, - "logits/rejected": -1.8510992527008057, - "logps/chosen": -375.90625, - "logps/rejected": -439.33709716796875, - "loss": 0.5165, - "rewards/accuracies": 0.762499988079071, - "rewards/chosen": -1.0048784017562866, - "rewards/margins": 0.7614682912826538, - "rewards/rejected": -1.7663466930389404, + "epoch": 0.37, + "grad_norm": 14.875, + "learning_rate": 3.960200812819223e-06, + "logits/chosen": -1.8888661861419678, + "logits/rejected": -1.6964051723480225, + "logps/chosen": -689.935791015625, + "logps/rejected": -1223.9864501953125, + "loss": 0.27, + "rewards/accuracies": 0.887499988079071, + "rewards/chosen": -5.14157772064209, + "rewards/margins": 5.335638523101807, + "rewards/rejected": -10.477215766906738, "step": 1390 }, { "epoch": 0.37, - "grad_norm": 9.75, - "learning_rate": 3.995948780477605e-06, - "logits/chosen": -2.0884742736816406, - "logits/rejected": -1.8744325637817383, - "logps/chosen": -370.63525390625, - "logps/rejected": -408.30657958984375, - "loss": 0.5625, - "rewards/accuracies": 0.6875, - "rewards/chosen": -0.8331832885742188, - "rewards/margins": 0.6226540803909302, - "rewards/rejected": -1.455837368965149, + "grad_norm": 2.46875, + "learning_rate": 3.941203625419233e-06, + "logits/chosen": -1.9211117029190063, + "logits/rejected": -1.690458059310913, + "logps/chosen": -591.420166015625, + "logps/rejected": -1057.7073974609375, + "loss": 0.1899, + "rewards/accuracies": 0.9312499761581421, + "rewards/chosen": -3.9465689659118652, + "rewards/margins": 4.7175774574279785, + "rewards/rejected": -8.664146423339844, "step": 1400 }, { "epoch": 0.37, - "eval_logits/chosen": -2.039109468460083, - "eval_logits/rejected": -1.9253612756729126, - "eval_logps/chosen": -365.1205749511719, - "eval_logps/rejected": -413.8091735839844, - "eval_loss": 0.5294913053512573, - "eval_rewards/accuracies": 0.7214999794960022, - "eval_rewards/chosen": -0.7998626828193665, - "eval_rewards/margins": 0.7057796120643616, - "eval_rewards/rejected": -1.505642294883728, - "eval_runtime": 782.6935, - "eval_samples_per_second": 2.555, - "eval_steps_per_second": 0.319, + "eval_logits/chosen": -2.0371508598327637, + "eval_logits/rejected": -1.9178376197814941, + "eval_logps/chosen": -655.8643188476562, + "eval_logps/rejected": -707.4119873046875, + "eval_loss": 0.7746438384056091, + "eval_rewards/accuracies": 0.6079999804496765, + "eval_rewards/chosen": -3.7073001861572266, + "eval_rewards/margins": 0.7343699336051941, + "eval_rewards/rejected": -4.441669940948486, + "eval_runtime": 780.6813, + "eval_samples_per_second": 2.562, + "eval_steps_per_second": 0.32, "step": 1400 }, { - "epoch": 0.37, - "grad_norm": 5.0, - "learning_rate": 3.977583174498816e-06, - "logits/chosen": -2.00089693069458, - "logits/rejected": -1.875847578048706, - "logps/chosen": -362.216796875, - "logps/rejected": -436.0418395996094, - "loss": 0.4119, - "rewards/accuracies": 0.831250011920929, - "rewards/chosen": -0.7390182614326477, - "rewards/margins": 0.9906827211380005, - "rewards/rejected": -1.729701042175293, + "epoch": 0.38, + "grad_norm": 10.3125, + "learning_rate": 3.92208096922981e-06, + "logits/chosen": -1.9045965671539307, + "logits/rejected": -1.6174876689910889, + "logps/chosen": -607.1253051757812, + "logps/rejected": -1079.6739501953125, + "loss": 0.1974, + "rewards/accuracies": 0.90625, + "rewards/chosen": -4.075843334197998, + "rewards/margins": 4.8129777908325195, + "rewards/rejected": -8.88882064819336, "step": 1410 }, { - "epoch": 0.37, - "grad_norm": 8.0, - "learning_rate": 3.959094190750172e-06, - "logits/chosen": -1.9545952081680298, - "logits/rejected": -1.8029680252075195, - "logps/chosen": -398.83709716796875, - "logps/rejected": -449.42742919921875, - "loss": 0.5272, - "rewards/accuracies": 0.7250000238418579, - "rewards/chosen": -0.9393488764762878, - "rewards/margins": 0.8084908723831177, - "rewards/rejected": -1.7478396892547607, + "epoch": 0.38, + "grad_norm": 18.25, + "learning_rate": 3.902834509037584e-06, + "logits/chosen": -1.8484761714935303, + "logits/rejected": -1.5948092937469482, + "logps/chosen": -668.1317749023438, + "logps/rejected": -1173.568359375, + "loss": 0.1977, + "rewards/accuracies": 0.925000011920929, + "rewards/chosen": -4.78977108001709, + "rewards/margins": 5.134987831115723, + "rewards/rejected": -9.924758911132812, "step": 1420 }, { - "epoch": 0.37, - "grad_norm": 11.125, - "learning_rate": 3.9404833730564975e-06, - "logits/chosen": -1.8657894134521484, - "logits/rejected": -1.716658353805542, - "logps/chosen": -378.5374755859375, - "logps/rejected": -448.2701721191406, - "loss": 0.5079, - "rewards/accuracies": 0.731249988079071, - "rewards/chosen": -1.0239883661270142, - "rewards/margins": 0.8290524482727051, - "rewards/rejected": -1.8530406951904297, + "epoch": 0.38, + "grad_norm": 6.90625, + "learning_rate": 3.883465920407351e-06, + "logits/chosen": -1.7886062860488892, + "logits/rejected": -1.529721975326538, + "logps/chosen": -671.2840576171875, + "logps/rejected": -1196.9456787109375, + "loss": 0.1944, + "rewards/accuracies": 0.9437500238418579, + "rewards/chosen": -4.772359848022461, + "rewards/margins": 5.296358108520508, + "rewards/rejected": -10.068717002868652, "step": 1430 }, { "epoch": 0.38, - "grad_norm": 8.8125, - "learning_rate": 3.921752275415712e-06, - "logits/chosen": -1.8294332027435303, - "logits/rejected": -1.8695322275161743, - "logps/chosen": -386.57476806640625, - "logps/rejected": -454.24981689453125, - "loss": 0.4901, - "rewards/accuracies": 0.78125, - "rewards/chosen": -1.1536591053009033, - "rewards/margins": 0.9241256713867188, - "rewards/rejected": -2.077784776687622, + "grad_norm": 15.0, + "learning_rate": 3.8639768895362075e-06, + "logits/chosen": -1.8040317296981812, + "logits/rejected": -1.5752843618392944, + "logps/chosen": -648.8005981445312, + "logps/rejected": -1176.04638671875, + "loss": 0.2305, + "rewards/accuracies": 0.925000011920929, + "rewards/chosen": -4.622233867645264, + "rewards/margins": 5.306826114654541, + "rewards/rejected": -9.929059982299805, "step": 1440 }, { - "epoch": 0.38, - "grad_norm": 9.9375, - "learning_rate": 3.902902461869079e-06, - "logits/chosen": -1.9432599544525146, - "logits/rejected": -1.8257582187652588, - "logps/chosen": -391.89544677734375, - "logps/rejected": -465.5562438964844, - "loss": 0.5328, - "rewards/accuracies": 0.737500011920929, - "rewards/chosen": -1.3478235006332397, - "rewards/margins": 0.9181373715400696, - "rewards/rejected": -2.265960931777954, + "epoch": 0.39, + "grad_norm": 6.4375, + "learning_rate": 3.8443691131067525e-06, + "logits/chosen": -1.8379939794540405, + "logits/rejected": -1.543233036994934, + "logps/chosen": -604.583740234375, + "logps/rejected": -1111.179443359375, + "loss": 0.1573, + "rewards/accuracies": 0.9375, + "rewards/chosen": -4.116429328918457, + "rewards/margins": 5.1361494064331055, + "rewards/rejected": -9.252578735351562, "step": 1450 }, { - "epoch": 0.38, - "grad_norm": 9.3125, - "learning_rate": 3.883935506370605e-06, - "logits/chosen": -1.9065253734588623, - "logits/rejected": -1.7174403667449951, - "logps/chosen": -428.204833984375, - "logps/rejected": -474.4853515625, - "loss": 0.5452, - "rewards/accuracies": 0.731249988079071, - "rewards/chosen": -1.5569171905517578, - "rewards/margins": 0.7829157114028931, - "rewards/rejected": -2.3398330211639404, + "epoch": 0.39, + "grad_norm": 4.78125, + "learning_rate": 3.824644298139372e-06, + "logits/chosen": -1.7858953475952148, + "logits/rejected": -1.6210616827011108, + "logps/chosen": -655.4627685546875, + "logps/rejected": -1142.6436767578125, + "loss": 0.1781, + "rewards/accuracies": 0.9125000238418579, + "rewards/chosen": -4.720765113830566, + "rewards/margins": 4.827541828155518, + "rewards/rejected": -9.548307418823242, "step": 1460 }, { - "epoch": 0.38, - "grad_norm": 4.875, - "learning_rate": 3.864852992655617e-06, - "logits/chosen": -1.921338677406311, - "logits/rejected": -1.8216731548309326, - "logps/chosen": -417.23699951171875, - "logps/rejected": -486.22344970703125, - "loss": 0.4872, - "rewards/accuracies": 0.7562500238418579, - "rewards/chosen": -1.457044243812561, - "rewards/margins": 0.8441551327705383, - "rewards/rejected": -2.301199436187744, + "epoch": 0.39, + "grad_norm": 18.875, + "learning_rate": 3.8048041618436365e-06, + "logits/chosen": -1.8527462482452393, + "logits/rejected": -1.6160118579864502, + "logps/chosen": -651.9395751953125, + "logps/rejected": -1180.6568603515625, + "loss": 0.2885, + "rewards/accuracies": 0.90625, + "rewards/chosen": -4.603270530700684, + "rewards/margins": 5.251911640167236, + "rewards/rejected": -9.855181694030762, "step": 1470 }, { - "epoch": 0.39, - "grad_norm": 5.3125, - "learning_rate": 3.845656514108516e-06, - "logits/chosen": -1.9481254816055298, - "logits/rejected": -1.7473742961883545, - "logps/chosen": -439.948486328125, - "logps/rejected": -445.56268310546875, - "loss": 0.5663, - "rewards/accuracies": 0.6625000238418579, - "rewards/chosen": -1.5704560279846191, - "rewards/margins": 0.6752415895462036, - "rewards/rejected": -2.2456977367401123, + "epoch": 0.4, + "grad_norm": 1.921875, + "learning_rate": 3.784850431468795e-06, + "logits/chosen": -1.9396499395370483, + "logits/rejected": -1.6909902095794678, + "logps/chosen": -533.6766357421875, + "logps/rejected": -1005.97705078125, + "loss": 0.17, + "rewards/accuracies": 0.925000011920929, + "rewards/chosen": -3.5864109992980957, + "rewards/margins": 4.638134479522705, + "rewards/rejected": -8.224546432495117, "step": 1480 }, { - "epoch": 0.39, - "grad_norm": 9.1875, - "learning_rate": 3.826347673629738e-06, - "logits/chosen": -1.9770145416259766, - "logits/rejected": -1.7594553232192993, - "logps/chosen": -389.63836669921875, - "logps/rejected": -452.9381408691406, - "loss": 0.4914, - "rewards/accuracies": 0.7250000238418579, - "rewards/chosen": -1.1838957071304321, - "rewards/margins": 0.9015070199966431, - "rewards/rejected": -2.085402727127075, + "epoch": 0.4, + "grad_norm": 4.5, + "learning_rate": 3.764784844153413e-06, + "logits/chosen": -1.8069417476654053, + "logits/rejected": -1.5476117134094238, + "logps/chosen": -604.2606811523438, + "logps/rejected": -1145.07275390625, + "loss": 0.1285, + "rewards/accuracies": 0.96875, + "rewards/chosen": -4.072413444519043, + "rewards/margins": 5.356746673583984, + "rewards/rejected": -9.429161071777344, "step": 1490 }, { - "epoch": 0.39, - "grad_norm": 7.53125, - "learning_rate": 3.8069280835019062e-06, - "logits/chosen": -1.9332622289657593, - "logits/rejected": -1.8013938665390015, - "logps/chosen": -398.4181213378906, - "logps/rejected": -475.78045654296875, - "loss": 0.4575, - "rewards/accuracies": 0.762499988079071, - "rewards/chosen": -1.1129112243652344, - "rewards/margins": 1.0079797506332397, - "rewards/rejected": -2.1208910942077637, + "epoch": 0.4, + "grad_norm": 7.25, + "learning_rate": 3.7446091467741314e-06, + "logits/chosen": -1.7064955234527588, + "logits/rejected": -1.3745331764221191, + "logps/chosen": -820.8228759765625, + "logps/rejected": -1404.6578369140625, + "loss": 0.1972, + "rewards/accuracies": 0.9125000238418579, + "rewards/chosen": -6.311244964599609, + "rewards/margins": 5.807946681976318, + "rewards/rejected": -12.119193077087402, "step": 1500 }, { - "epoch": 0.39, - "eval_logits/chosen": -2.025247812271118, - "eval_logits/rejected": -1.910525918006897, - "eval_logps/chosen": -399.6888732910156, - "eval_logps/rejected": -459.70855712890625, - "eval_loss": 0.5265802145004272, - "eval_rewards/accuracies": 0.7260000109672546, - "eval_rewards/chosen": -1.1455461978912354, - "eval_rewards/margins": 0.8190898299217224, - "eval_rewards/rejected": -1.964635968208313, - "eval_runtime": 783.8773, - "eval_samples_per_second": 2.551, - "eval_steps_per_second": 0.319, + "epoch": 0.4, + "eval_logits/chosen": -1.8571399450302124, + "eval_logits/rejected": -1.7422724962234497, + "eval_logps/chosen": -874.4112548828125, + "eval_logps/rejected": -957.8101806640625, + "eval_loss": 0.9739969372749329, + "eval_rewards/accuracies": 0.6129999756813049, + "eval_rewards/chosen": -5.8927693367004395, + "eval_rewards/margins": 1.0528827905654907, + "eval_rewards/rejected": -6.945652961730957, + "eval_runtime": 780.1629, + "eval_samples_per_second": 2.564, + "eval_steps_per_second": 0.32, "step": 1500 }, { "epoch": 0.4, - "grad_norm": 7.6875, - "learning_rate": 3.7873993652552077e-06, - "logits/chosen": -1.9163720607757568, - "logits/rejected": -1.8175251483917236, - "logps/chosen": -369.39483642578125, - "logps/rejected": -420.3642578125, - "loss": 0.6257, - "rewards/accuracies": 0.6312500238418579, - "rewards/chosen": -1.180105447769165, - "rewards/margins": 0.591270923614502, - "rewards/rejected": -1.771376371383667, + "grad_norm": 8.875, + "learning_rate": 3.7243250957935904e-06, + "logits/chosen": -1.7506663799285889, + "logits/rejected": -1.4978208541870117, + "logps/chosen": -850.6290283203125, + "logps/rejected": -1508.726318359375, + "loss": 0.219, + "rewards/accuracies": 0.9312499761581421, + "rewards/chosen": -6.688626766204834, + "rewards/margins": 6.346508979797363, + "rewards/rejected": -13.035135269165039, "step": 1510 }, { - "epoch": 0.4, - "grad_norm": 8.3125, - "learning_rate": 3.7677631495319953e-06, - "logits/chosen": -2.0789072513580322, - "logits/rejected": -1.9311103820800781, - "logps/chosen": -360.617431640625, - "logps/rejected": -415.0093688964844, - "loss": 0.523, - "rewards/accuracies": 0.731249988079071, - "rewards/chosen": -0.7879349589347839, - "rewards/margins": 0.7430469989776611, - "rewards/rejected": -1.5309817790985107, + "epoch": 0.41, + "grad_norm": 7.65625, + "learning_rate": 3.703934457107517e-06, + "logits/chosen": -1.839238166809082, + "logits/rejected": -1.5624536275863647, + "logps/chosen": -738.9110717773438, + "logps/rejected": -1317.7669677734375, + "loss": 0.1813, + "rewards/accuracies": 0.90625, + "rewards/chosen": -5.391977787017822, + "rewards/margins": 5.72281551361084, + "rewards/rejected": -11.114792823791504, "step": 1520 }, { - "epoch": 0.4, - "grad_norm": 7.34375, - "learning_rate": 3.748021075950633e-06, - "logits/chosen": -2.0557217597961426, - "logits/rejected": -1.866943120956421, - "logps/chosen": -381.2101745605469, - "logps/rejected": -421.3724670410156, - "loss": 0.5781, - "rewards/accuracies": 0.706250011920929, - "rewards/chosen": -0.8339166641235352, - "rewards/margins": 0.5441929697990417, - "rewards/rejected": -1.3781098127365112, + "epoch": 0.41, + "grad_norm": 11.0625, + "learning_rate": 3.683439005890983e-06, + "logits/chosen": -1.7876466512680054, + "logits/rejected": -1.5029432773590088, + "logps/chosen": -661.0178833007812, + "logps/rejected": -1189.1959228515625, + "loss": 0.2421, + "rewards/accuracies": 0.90625, + "rewards/chosen": -4.857475280761719, + "rewards/margins": 5.292153835296631, + "rewards/rejected": -10.149629592895508, "step": 1530 }, { - "epoch": 0.4, - "grad_norm": 7.90625, - "learning_rate": 3.7281747929685824e-06, - "logits/chosen": -1.9901912212371826, - "logits/rejected": -1.7555814981460571, - "logps/chosen": -359.7068786621094, - "logps/rejected": -405.24261474609375, - "loss": 0.5585, - "rewards/accuracies": 0.6812499761581421, - "rewards/chosen": -1.026548147201538, - "rewards/margins": 0.6196350455284119, - "rewards/rejected": -1.6461833715438843, + "epoch": 0.41, + "grad_norm": 10.0625, + "learning_rate": 3.662840526443868e-06, + "logits/chosen": -1.8758341073989868, + "logits/rejected": -1.6289899349212646, + "logps/chosen": -683.6453247070312, + "logps/rejected": -1202.3697509765625, + "loss": 0.1645, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": -4.952923774719238, + "rewards/margins": 5.2092509269714355, + "rewards/rejected": -10.162174224853516, "step": 1540 }, { "epoch": 0.41, - "grad_norm": 8.0625, - "learning_rate": 3.7082259577447604e-06, - "logits/chosen": -2.0445377826690674, - "logits/rejected": -1.9260342121124268, - "logps/chosen": -404.2452087402344, - "logps/rejected": -466.71044921875, - "loss": 0.4753, - "rewards/accuracies": 0.7562500238418579, - "rewards/chosen": -1.1328551769256592, - "rewards/margins": 0.8080953359603882, - "rewards/rejected": -1.9409507513046265, + "grad_norm": 4.28125, + "learning_rate": 3.6421408120355145e-06, + "logits/chosen": -1.8700506687164307, + "logits/rejected": -1.6114925146102905, + "logps/chosen": -732.8409423828125, + "logps/rejected": -1359.912109375, + "loss": 0.1532, + "rewards/accuracies": 0.9375, + "rewards/chosen": -5.466494560241699, + "rewards/margins": 6.277614593505859, + "rewards/rejected": -11.744108200073242, "step": 1550 }, { - "epoch": 0.41, - "grad_norm": 6.09375, - "learning_rate": 3.6881762360011688e-06, - "logits/chosen": -1.9906909465789795, - "logits/rejected": -1.808937668800354, - "logps/chosen": -433.05712890625, - "logps/rejected": -471.65643310546875, - "loss": 0.5246, - "rewards/accuracies": 0.706250011920929, - "rewards/chosen": -1.329231858253479, - "rewards/margins": 0.7763558626174927, - "rewards/rejected": -2.1055874824523926, + "epoch": 0.42, + "grad_norm": 8.5625, + "learning_rate": 3.6213416647486157e-06, + "logits/chosen": -1.8383686542510986, + "logits/rejected": -1.524852991104126, + "logps/chosen": -737.9182739257812, + "logps/rejected": -1409.013427734375, + "loss": 0.2423, + "rewards/accuracies": 0.9312499761581421, + "rewards/chosen": -5.449790000915527, + "rewards/margins": 6.735495567321777, + "rewards/rejected": -12.185285568237305, "step": 1560 }, { - "epoch": 0.41, - "grad_norm": 8.5625, - "learning_rate": 3.668027301883802e-06, - "logits/chosen": -2.066068649291992, - "logits/rejected": -1.8010034561157227, - "logps/chosen": -375.88629150390625, - "logps/rejected": -440.9111328125, - "loss": 0.493, - "rewards/accuracies": 0.737500011920929, - "rewards/chosen": -1.0363174676895142, - "rewards/margins": 0.8553186655044556, - "rewards/rejected": -1.8916361331939697, + "epoch": 0.42, + "grad_norm": 7.84375, + "learning_rate": 3.6004448953223225e-06, + "logits/chosen": -1.757846474647522, + "logits/rejected": -1.5626018047332764, + "logps/chosen": -797.1041870117188, + "logps/rejected": -1491.272216796875, + "loss": 0.1457, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": -6.0056328773498535, + "rewards/margins": 6.717007637023926, + "rewards/rejected": -12.722640037536621, "step": 1570 }, { - "epoch": 0.41, - "grad_norm": 4.25, - "learning_rate": 3.64778083782286e-06, - "logits/chosen": -1.9386217594146729, - "logits/rejected": -1.899171233177185, - "logps/chosen": -377.6373596191406, - "logps/rejected": -472.9842834472656, - "loss": 0.5669, - "rewards/accuracies": 0.7124999761581421, - "rewards/chosen": -1.0420780181884766, - "rewards/margins": 0.665661096572876, - "rewards/rejected": -1.707739233970642, + "epoch": 0.42, + "grad_norm": 1.6171875, + "learning_rate": 3.5794523229946105e-06, + "logits/chosen": -1.597733497619629, + "logits/rejected": -1.324507474899292, + "logps/chosen": -1041.21435546875, + "logps/rejected": -1858.760498046875, + "loss": 0.1777, + "rewards/accuracies": 0.9312499761581421, + "rewards/chosen": -8.502154350280762, + "rewards/margins": 8.060552597045898, + "rewards/rejected": -16.56270408630371, "step": 1580 }, { "epoch": 0.42, - "grad_norm": 9.5625, - "learning_rate": 3.627438534392268e-06, - "logits/chosen": -2.084904193878174, - "logits/rejected": -2.023829221725464, - "logps/chosen": -353.0201721191406, - "logps/rejected": -427.7330017089844, - "loss": 0.5343, - "rewards/accuracies": 0.71875, - "rewards/chosen": -0.9893399477005005, - "rewards/margins": 0.678139328956604, - "rewards/rejected": -1.6674792766571045, + "grad_norm": 16.5, + "learning_rate": 3.558365775343892e-06, + "logits/chosen": -1.7607545852661133, + "logits/rejected": -1.4137681722640991, + "logps/chosen": -923.2019653320312, + "logps/rejected": -1746.294189453125, + "loss": 0.2314, + "rewards/accuracies": 0.887499988079071, + "rewards/chosen": -7.116549491882324, + "rewards/margins": 8.30859661102295, + "rewards/rejected": -15.425145149230957, "step": 1590 }, { - "epoch": 0.42, - "grad_norm": 7.0, - "learning_rate": 3.607002090168506e-06, - "logits/chosen": -1.933423399925232, - "logits/rejected": -1.8388328552246094, - "logps/chosen": -398.29754638671875, - "logps/rejected": -430.0, - "loss": 0.5855, - "rewards/accuracies": 0.675000011920929, - "rewards/chosen": -1.1320139169692993, - "rewards/margins": 0.552472710609436, - "rewards/rejected": -1.684486746788025, + "epoch": 0.43, + "grad_norm": 7.40625, + "learning_rate": 3.537187088129919e-06, + "logits/chosen": -1.9187705516815186, + "logits/rejected": -1.618198037147522, + "logps/chosen": -692.7543334960938, + "logps/rejected": -1304.157470703125, + "loss": 0.1712, + "rewards/accuracies": 0.918749988079071, + "rewards/chosen": -4.782906532287598, + "rewards/margins": 6.2092204093933105, + "rewards/rejected": -10.99212646484375, "step": 1600 }, { - "epoch": 0.42, - "eval_logits/chosen": -2.040278911590576, - "eval_logits/rejected": -1.9276047945022583, - "eval_logps/chosen": -388.7277526855469, - "eval_logps/rejected": -439.5246276855469, - "eval_loss": 0.5226916670799255, - "eval_rewards/accuracies": 0.734499990940094, - "eval_rewards/chosen": -1.0359350442886353, - "eval_rewards/margins": 0.7268618941307068, - "eval_rewards/rejected": -1.7627968788146973, - "eval_runtime": 783.5654, - "eval_samples_per_second": 2.552, - "eval_steps_per_second": 0.319, + "epoch": 0.43, + "eval_logits/chosen": -1.997864842414856, + "eval_logits/rejected": -1.8792200088500977, + "eval_logps/chosen": -680.6552124023438, + "eval_logps/rejected": -728.7279663085938, + "eval_loss": 0.8206447958946228, + "eval_rewards/accuracies": 0.593500018119812, + "eval_rewards/chosen": -3.9552090167999268, + "eval_rewards/margins": 0.6996216177940369, + "eval_rewards/rejected": -4.654830455780029, + "eval_runtime": 780.7801, + "eval_samples_per_second": 2.562, + "eval_steps_per_second": 0.32, "step": 1600 }, { - "epoch": 0.42, - "grad_norm": 4.5625, - "learning_rate": 3.586473211588787e-06, - "logits/chosen": -2.0347843170166016, - "logits/rejected": -1.825095534324646, - "logps/chosen": -351.39971923828125, - "logps/rejected": -442.338623046875, - "loss": 0.4675, - "rewards/accuracies": 0.7749999761581421, - "rewards/chosen": -0.8847958445549011, - "rewards/margins": 0.8562973141670227, - "rewards/rejected": -1.7410932779312134, + "epoch": 0.43, + "grad_norm": 6.5625, + "learning_rate": 3.5159181051339574e-06, + "logits/chosen": -1.9240127801895142, + "logits/rejected": -1.6421921253204346, + "logps/chosen": -656.6437377929688, + "logps/rejected": -1210.740234375, + "loss": 0.1597, + "rewards/accuracies": 0.9312499761581421, + "rewards/chosen": -4.618703842163086, + "rewards/margins": 5.614047050476074, + "rewards/rejected": -10.232751846313477, "step": 1610 }, { - "epoch": 0.42, - "grad_norm": 9.3125, - "learning_rate": 3.5658536128085623e-06, - "logits/chosen": -2.046114683151245, - "logits/rejected": -1.8478816747665405, - "logps/chosen": -398.8979797363281, - "logps/rejected": -427.96405029296875, - "loss": 0.5911, - "rewards/accuracies": 0.6812499761581421, - "rewards/chosen": -1.1641943454742432, - "rewards/margins": 0.6046980619430542, - "rewards/rejected": -1.7688922882080078, + "epoch": 0.43, + "grad_norm": 1.9765625, + "learning_rate": 3.494560677998275e-06, + "logits/chosen": -1.7963730096817017, + "logits/rejected": -1.4937909841537476, + "logps/chosen": -813.3496704101562, + "logps/rejected": -1511.3714599609375, + "loss": 0.1662, + "rewards/accuracies": 0.9437500238418579, + "rewards/chosen": -6.14628267288208, + "rewards/margins": 6.9289398193359375, + "rewards/rejected": -13.075222969055176, "step": 1620 }, { - "epoch": 0.43, - "grad_norm": 9.1875, - "learning_rate": 3.545145015558399e-06, - "logits/chosen": -1.891063928604126, - "logits/rejected": -1.809687614440918, - "logps/chosen": -368.8531188964844, - "logps/rejected": -437.64691162109375, - "loss": 0.509, - "rewards/accuracies": 0.6937500238418579, - "rewards/chosen": -1.174725890159607, - "rewards/margins": 0.8606030344963074, - "rewards/rejected": -2.0353291034698486, + "epoch": 0.44, + "grad_norm": 15.25, + "learning_rate": 3.473116666064939e-06, + "logits/chosen": -1.7890408039093018, + "logits/rejected": -1.4465700387954712, + "logps/chosen": -864.6702880859375, + "logps/rejected": -1585.6441650390625, + "loss": 0.1791, + "rewards/accuracies": 0.9437500238418579, + "rewards/chosen": -6.7736711502075195, + "rewards/margins": 7.236307621002197, + "rewards/rejected": -14.009979248046875, "step": 1630 }, { - "epoch": 0.43, - "grad_norm": 5.8125, - "learning_rate": 3.5243491490002056e-06, - "logits/chosen": -2.02524471282959, - "logits/rejected": -1.9297832250595093, - "logps/chosen": -404.6397399902344, - "logps/rejected": -462.12591552734375, - "loss": 0.5795, - "rewards/accuracies": 0.6875, - "rewards/chosen": -1.298349142074585, - "rewards/margins": 0.6594666242599487, - "rewards/rejected": -1.9578157663345337, + "epoch": 0.44, + "grad_norm": 11.0, + "learning_rate": 3.4515879362139453e-06, + "logits/chosen": -1.7052695751190186, + "logits/rejected": -1.4470438957214355, + "logps/chosen": -879.1795654296875, + "logps/rejected": -1647.1077880859375, + "loss": 0.2115, + "rewards/accuracies": 0.875, + "rewards/chosen": -6.843623161315918, + "rewards/margins": 7.675244331359863, + "rewards/rejected": -14.518865585327148, "step": 1640 }, { - "epoch": 0.43, - "grad_norm": 6.15625, - "learning_rate": 3.503467749582857e-06, - "logits/chosen": -2.001770257949829, - "logits/rejected": -1.7856979370117188, - "logps/chosen": -405.2151794433594, - "logps/rejected": -426.4241638183594, - "loss": 0.6245, - "rewards/accuracies": 0.6875, - "rewards/chosen": -1.3188527822494507, - "rewards/margins": 0.5851167440414429, - "rewards/rejected": -1.9039695262908936, + "epoch": 0.44, + "grad_norm": 9.875, + "learning_rate": 3.42997636270069e-06, + "logits/chosen": -1.8049097061157227, + "logits/rejected": -1.564592719078064, + "logps/chosen": -801.4486083984375, + "logps/rejected": -1398.755126953125, + "loss": 0.2083, + "rewards/accuracies": 0.90625, + "rewards/chosen": -6.123307704925537, + "rewards/margins": 5.921048164367676, + "rewards/rejected": -12.044357299804688, "step": 1650 }, { - "epoch": 0.43, - "grad_norm": 9.375, - "learning_rate": 3.4825025608971947e-06, - "logits/chosen": -1.9151302576065063, - "logits/rejected": -1.8214277029037476, - "logps/chosen": -355.21087646484375, - "logps/rejected": -424.50811767578125, - "loss": 0.5519, - "rewards/accuracies": 0.706250011920929, - "rewards/chosen": -1.154817819595337, - "rewards/margins": 0.6674734950065613, - "rewards/rejected": -1.8222911357879639, + "epoch": 0.44, + "grad_norm": 5.46875, + "learning_rate": 3.408283826992801e-06, + "logits/chosen": -1.8630778789520264, + "logits/rejected": -1.5700795650482178, + "logps/chosen": -832.0042724609375, + "logps/rejected": -1476.0029296875, + "loss": 0.2379, + "rewards/accuracies": 0.887499988079071, + "rewards/chosen": -6.297715187072754, + "rewards/margins": 6.487104892730713, + "rewards/rejected": -12.784818649291992, "step": 1660 }, { - "epoch": 0.44, - "grad_norm": 5.25, - "learning_rate": 3.4614553335304407e-06, - "logits/chosen": -2.0037343502044678, - "logits/rejected": -1.7987245321273804, - "logps/chosen": -409.9883728027344, - "logps/rejected": -452.8309631347656, - "loss": 0.516, - "rewards/accuracies": 0.7437499761581421, - "rewards/chosen": -1.1703855991363525, - "rewards/margins": 0.8152831792831421, - "rewards/rejected": -1.9856687784194946, + "epoch": 0.45, + "grad_norm": 10.0, + "learning_rate": 3.386512217606339e-06, + "logits/chosen": -1.7772754430770874, + "logits/rejected": -1.5164556503295898, + "logps/chosen": -838.1605224609375, + "logps/rejected": -1455.7694091796875, + "loss": 0.1919, + "rewards/accuracies": 0.925000011920929, + "rewards/chosen": -6.386895179748535, + "rewards/margins": 6.187010765075684, + "rewards/rejected": -12.573904991149902, "step": 1670 }, { - "epoch": 0.44, - "grad_norm": 8.0625, - "learning_rate": 3.4403278249200222e-06, - "logits/chosen": -1.9947742223739624, - "logits/rejected": -1.7905107736587524, - "logps/chosen": -405.6366882324219, - "logps/rejected": -466.224609375, - "loss": 0.4613, - "rewards/accuracies": 0.737500011920929, - "rewards/chosen": -1.0154985189437866, - "rewards/margins": 1.0372416973114014, - "rewards/rejected": -2.0527403354644775, + "epoch": 0.45, + "grad_norm": 8.8125, + "learning_rate": 3.364663429941391e-06, + "logits/chosen": -1.793426752090454, + "logits/rejected": -1.4886395931243896, + "logps/chosen": -872.0006103515625, + "logps/rejected": -1503.099365234375, + "loss": 0.3277, + "rewards/accuracies": 0.8812500238418579, + "rewards/chosen": -6.79733419418335, + "rewards/margins": 6.3793768882751465, + "rewards/rejected": -13.17671012878418, "step": 1680 }, { - "epoch": 0.44, - "grad_norm": 9.3125, - "learning_rate": 3.4191217992068293e-06, - "logits/chosen": -2.061328887939453, - "logits/rejected": -1.9267551898956299, - "logps/chosen": -403.9309997558594, - "logps/rejected": -432.7481384277344, - "loss": 0.5519, - "rewards/accuracies": 0.7562500238418579, - "rewards/chosen": -1.0664750337600708, - "rewards/margins": 0.7731136679649353, - "rewards/rejected": -1.8395887613296509, + "epoch": 0.45, + "grad_norm": 7.875, + "learning_rate": 3.3427393661170532e-06, + "logits/chosen": -1.8139142990112305, + "logits/rejected": -1.59800386428833, + "logps/chosen": -764.7420043945312, + "logps/rejected": -1374.8375244140625, + "loss": 0.1911, + "rewards/accuracies": 0.925000011920929, + "rewards/chosen": -5.704885959625244, + "rewards/margins": 6.062502384185791, + "rewards/rejected": -11.767388343811035, "step": 1690 }, { - "epoch": 0.44, - "grad_norm": 10.1875, - "learning_rate": 3.3978390270879056e-06, - "logits/chosen": -2.0079243183135986, - "logits/rejected": -1.8811041116714478, - "logps/chosen": -350.56658935546875, - "logps/rejected": -422.943603515625, - "loss": 0.5333, - "rewards/accuracies": 0.7250000238418579, - "rewards/chosen": -1.1983606815338135, - "rewards/margins": 0.6959515810012817, - "rewards/rejected": -1.8943122625350952, + "epoch": 0.45, + "grad_norm": 8.4375, + "learning_rate": 3.3207419348058393e-06, + "logits/chosen": -1.8668022155761719, + "logits/rejected": -1.5570051670074463, + "logps/chosen": -714.6455078125, + "logps/rejected": -1290.4510498046875, + "loss": 0.2211, + "rewards/accuracies": 0.9312499761581421, + "rewards/chosen": -5.201146125793457, + "rewards/margins": 5.8124284744262695, + "rewards/rejected": -11.013574600219727, "step": 1700 }, { - "epoch": 0.44, - "eval_logits/chosen": -2.0731844902038574, - "eval_logits/rejected": -1.9572210311889648, - "eval_logps/chosen": -401.31475830078125, - "eval_logps/rejected": -460.55657958984375, - "eval_loss": 0.5154699087142944, - "eval_rewards/accuracies": 0.7310000061988831, - "eval_rewards/chosen": -1.1618049144744873, - "eval_rewards/margins": 0.8113114237785339, - "eval_rewards/rejected": -1.9731160402297974, - "eval_runtime": 782.2946, - "eval_samples_per_second": 2.557, + "epoch": 0.45, + "eval_logits/chosen": -1.973204255104065, + "eval_logits/rejected": -1.8550846576690674, + "eval_logps/chosen": -717.7827758789062, + "eval_logps/rejected": -775.1051025390625, + "eval_loss": 0.8185168504714966, + "eval_rewards/accuracies": 0.6119999885559082, + "eval_rewards/chosen": -4.326484680175781, + "eval_rewards/margins": 0.7921165823936462, + "eval_rewards/rejected": -5.1186017990112305, + "eval_runtime": 780.2406, + "eval_samples_per_second": 2.563, "eval_steps_per_second": 0.32, "step": 1700 }, { - "epoch": 0.45, - "grad_norm": 9.875, - "learning_rate": 3.3764812856685995e-06, - "logits/chosen": -2.0123562812805176, - "logits/rejected": -1.9913345575332642, - "logps/chosen": -348.2825012207031, - "logps/rejected": -437.12481689453125, - "loss": 0.5252, - "rewards/accuracies": 0.731249988079071, - "rewards/chosen": -1.0764367580413818, - "rewards/margins": 0.7534947395324707, - "rewards/rejected": -1.8299314975738525, + "epoch": 0.46, + "grad_norm": 8.625, + "learning_rate": 3.2986730510675158e-06, + "logits/chosen": -1.8806215524673462, + "logits/rejected": -1.581820011138916, + "logps/chosen": -694.9906005859375, + "logps/rejected": -1254.445556640625, + "loss": 0.1772, + "rewards/accuracies": 0.9375, + "rewards/chosen": -4.930051326751709, + "rewards/margins": 5.521780490875244, + "rewards/rejected": -10.451830863952637, "step": 1710 }, { - "epoch": 0.45, - "grad_norm": 8.3125, - "learning_rate": 3.3550503583141726e-06, - "logits/chosen": -2.1099114418029785, - "logits/rejected": -1.9925434589385986, - "logps/chosen": -392.92840576171875, - "logps/rejected": -464.06884765625, - "loss": 0.4725, - "rewards/accuracies": 0.7562500238418579, - "rewards/chosen": -1.0577646493911743, - "rewards/margins": 0.9005683660507202, - "rewards/rejected": -1.9583332538604736, + "epoch": 0.46, + "grad_norm": 4.34375, + "learning_rate": 3.276534636182378e-06, + "logits/chosen": -1.8050769567489624, + "logits/rejected": -1.5063773393630981, + "logps/chosen": -626.0925903320312, + "logps/rejected": -1262.783447265625, + "loss": 0.1264, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": -4.493157386779785, + "rewards/margins": 6.255930423736572, + "rewards/rejected": -10.749089241027832, "step": 1720 }, { - "epoch": 0.45, - "grad_norm": 7.0625, - "learning_rate": 3.3335480345008907e-06, - "logits/chosen": -1.9768962860107422, - "logits/rejected": -1.850328803062439, - "logps/chosen": -407.8126525878906, - "logps/rejected": -476.3419494628906, - "loss": 0.5127, - "rewards/accuracies": 0.762499988079071, - "rewards/chosen": -1.2192199230194092, - "rewards/margins": 0.9083667993545532, - "rewards/rejected": -2.127586841583252, + "epoch": 0.46, + "grad_norm": 4.75, + "learning_rate": 3.2543286174839856e-06, + "logits/chosen": -1.8059381246566772, + "logits/rejected": -1.4949989318847656, + "logps/chosen": -771.2824096679688, + "logps/rejected": -1471.7220458984375, + "loss": 0.2386, + "rewards/accuracies": 0.893750011920929, + "rewards/chosen": -5.776717662811279, + "rewards/margins": 6.974827766418457, + "rewards/rejected": -12.751544952392578, "step": 1730 }, { "epoch": 0.46, - "grad_norm": 8.125, - "learning_rate": 3.3119761096666055e-06, - "logits/chosen": -1.985120415687561, - "logits/rejected": -1.8539278507232666, - "logps/chosen": -421.05126953125, - "logps/rejected": -467.579833984375, - "loss": 0.5525, - "rewards/accuracies": 0.699999988079071, - "rewards/chosen": -1.2381929159164429, - "rewards/margins": 0.7920646667480469, - "rewards/rejected": -2.0302577018737793, + "grad_norm": 6.125, + "learning_rate": 3.232056928191376e-06, + "logits/chosen": -1.740046501159668, + "logits/rejected": -1.4656656980514526, + "logps/chosen": -784.59326171875, + "logps/rejected": -1440.2933349609375, + "loss": 0.1952, + "rewards/accuracies": 0.918749988079071, + "rewards/chosen": -5.885767459869385, + "rewards/margins": 6.415579319000244, + "rewards/rejected": -12.301345825195312, "step": 1740 }, { - "epoch": 0.46, - "grad_norm": 6.59375, - "learning_rate": 3.290336385060832e-06, - "logits/chosen": -2.143947124481201, - "logits/rejected": -1.8942737579345703, - "logps/chosen": -413.96551513671875, - "logps/rejected": -477.3929748535156, - "loss": 0.5114, - "rewards/accuracies": 0.7124999761581421, - "rewards/chosen": -1.4759609699249268, - "rewards/margins": 0.8571161031723022, - "rewards/rejected": -2.3330769538879395, + "epoch": 0.47, + "grad_norm": 7.5, + "learning_rate": 3.2097215072407595e-06, + "logits/chosen": -1.8645740747451782, + "logits/rejected": -1.5591399669647217, + "logps/chosen": -744.97216796875, + "logps/rejected": -1370.182373046875, + "loss": 0.3274, + "rewards/accuracies": 0.887499988079071, + "rewards/chosen": -5.665326118469238, + "rewards/margins": 6.191165447235107, + "rewards/rejected": -11.856492042541504, "step": 1750 }, { - "epoch": 0.46, - "grad_norm": 8.125, - "learning_rate": 3.268630667594348e-06, - "logits/chosen": -2.0270843505859375, - "logits/rejected": -1.9718875885009766, - "logps/chosen": -408.5159912109375, - "logps/rejected": -458.33013916015625, - "loss": 0.5462, - "rewards/accuracies": 0.706250011920929, - "rewards/chosen": -1.3257557153701782, - "rewards/margins": 0.7757492065429688, - "rewards/rejected": -2.1015048027038574, + "epoch": 0.47, + "grad_norm": 3.4375, + "learning_rate": 3.1873242991167153e-06, + "logits/chosen": -1.7916425466537476, + "logits/rejected": -1.556660771369934, + "logps/chosen": -705.9693603515625, + "logps/rejected": -1279.2801513671875, + "loss": 0.21, + "rewards/accuracies": 0.9125000238418579, + "rewards/chosen": -5.239865303039551, + "rewards/margins": 5.630426406860352, + "rewards/rejected": -10.870291709899902, "step": 1760 }, { - "epoch": 0.46, - "grad_norm": 6.5, - "learning_rate": 3.2468607696883147e-06, - "logits/chosen": -1.9911127090454102, - "logits/rejected": -1.94431471824646, - "logps/chosen": -397.14208984375, - "logps/rejected": -485.87237548828125, - "loss": 0.5191, - "rewards/accuracies": 0.737500011920929, - "rewards/chosen": -1.2622430324554443, - "rewards/margins": 0.8354751467704773, - "rewards/rejected": -2.0977180004119873, + "epoch": 0.47, + "grad_norm": 7.25, + "learning_rate": 3.164867253682915e-06, + "logits/chosen": -1.880875587463379, + "logits/rejected": -1.615635871887207, + "logps/chosen": -632.2330322265625, + "logps/rejected": -1230.26318359375, + "loss": 0.1901, + "rewards/accuracies": 0.925000011920929, + "rewards/chosen": -4.562050819396973, + "rewards/margins": 5.894750595092773, + "rewards/rejected": -10.456802368164062, "step": 1770 }, { - "epoch": 0.47, - "grad_norm": 6.78125, - "learning_rate": 3.225028509122944e-06, - "logits/chosen": -2.0844154357910156, - "logits/rejected": -1.9376178979873657, - "logps/chosen": -355.7822265625, - "logps/rejected": -413.16748046875, - "loss": 0.5498, - "rewards/accuracies": 0.7250000238418579, - "rewards/chosen": -1.041587233543396, - "rewards/margins": 0.6852777004241943, - "rewards/rejected": -1.7268650531768799, + "epoch": 0.48, + "grad_norm": 12.875, + "learning_rate": 3.1423523260123627e-06, + "logits/chosen": -1.873928427696228, + "logits/rejected": -1.63522469997406, + "logps/chosen": -672.1016845703125, + "logps/rejected": -1353.87744140625, + "loss": 0.1407, + "rewards/accuracies": 0.9437500238418579, + "rewards/chosen": -4.872590065002441, + "rewards/margins": 6.644981384277344, + "rewards/rejected": -11.517570495605469, "step": 1780 }, { - "epoch": 0.47, - "grad_norm": 5.90625, - "learning_rate": 3.2031357088857083e-06, - "logits/chosen": -2.071098804473877, - "logits/rejected": -2.020618200302124, - "logps/chosen": -409.9624938964844, - "logps/rejected": -493.58758544921875, - "loss": 0.5321, - "rewards/accuracies": 0.7562500238418579, - "rewards/chosen": -1.149269938468933, - "rewards/margins": 0.780631959438324, - "rewards/rejected": -1.9299018383026123, + "epoch": 0.48, + "grad_norm": 4.4375, + "learning_rate": 3.1197814762171986e-06, + "logits/chosen": -1.8514392375946045, + "logits/rejected": -1.6486479043960571, + "logps/chosen": -698.458984375, + "logps/rejected": -1315.129638671875, + "loss": 0.1982, + "rewards/accuracies": 0.918749988079071, + "rewards/chosen": -5.084946632385254, + "rewards/margins": 6.099282741546631, + "rewards/rejected": -11.184229850769043, "step": 1790 }, { - "epoch": 0.47, - "grad_norm": 8.4375, - "learning_rate": 3.181184197019127e-06, - "logits/chosen": -1.9076446294784546, - "logits/rejected": -1.7894035577774048, - "logps/chosen": -362.01275634765625, - "logps/rejected": -494.119384765625, - "loss": 0.5055, - "rewards/accuracies": 0.75, - "rewards/chosen": -1.1391741037368774, - "rewards/margins": 1.0371575355529785, - "rewards/rejected": -2.1763315200805664, + "epoch": 0.48, + "grad_norm": 18.125, + "learning_rate": 3.097156669278046e-06, + "logits/chosen": -1.7210429906845093, + "logits/rejected": -1.4064064025878906, + "logps/chosen": -816.0775756835938, + "logps/rejected": -1553.640625, + "loss": 0.1773, + "rewards/accuracies": 0.918749988079071, + "rewards/chosen": -6.110322952270508, + "rewards/margins": 7.464188575744629, + "rewards/rejected": -13.574511528015137, "step": 1800 }, { - "epoch": 0.47, - "eval_logits/chosen": -2.0727155208587646, - "eval_logits/rejected": -1.9571987390518188, - "eval_logps/chosen": -396.1869812011719, - "eval_logps/rejected": -452.9256896972656, - "eval_loss": 0.5181357860565186, - "eval_rewards/accuracies": 0.7329999804496765, - "eval_rewards/chosen": -1.1105273962020874, - "eval_rewards/margins": 0.7862799763679504, - "eval_rewards/rejected": -1.8968074321746826, - "eval_runtime": 783.5553, - "eval_samples_per_second": 2.552, - "eval_steps_per_second": 0.319, + "epoch": 0.48, + "eval_logits/chosen": -1.9209892749786377, + "eval_logits/rejected": -1.8040364980697632, + "eval_logps/chosen": -810.9664916992188, + "eval_logps/rejected": -883.3089599609375, + "eval_loss": 0.9660559296607971, + "eval_rewards/accuracies": 0.5950000286102295, + "eval_rewards/chosen": -5.258322715759277, + "eval_rewards/margins": 0.9423174858093262, + "eval_rewards/rejected": -6.2006402015686035, + "eval_runtime": 781.3981, + "eval_samples_per_second": 2.56, + "eval_steps_per_second": 0.32, "step": 1800 }, { - "epoch": 0.47, - "grad_norm": 8.8125, - "learning_rate": 3.159175806468126e-06, - "logits/chosen": -1.9769659042358398, - "logits/rejected": -1.7270854711532593, - "logps/chosen": -385.99139404296875, - "logps/rejected": -418.35595703125, - "loss": 0.5416, - "rewards/accuracies": 0.737500011920929, - "rewards/chosen": -1.117026448249817, - "rewards/margins": 0.7229973077774048, - "rewards/rejected": -1.8400236368179321, + "epoch": 0.48, + "grad_norm": 24.625, + "learning_rate": 3.074479874872949e-06, + "logits/chosen": -1.7587913274765015, + "logits/rejected": -1.476696252822876, + "logps/chosen": -767.0065307617188, + "logps/rejected": -1501.1015625, + "loss": 0.1594, + "rewards/accuracies": 0.925000011920929, + "rewards/chosen": -5.815308094024658, + "rewards/margins": 7.267469882965088, + "rewards/rejected": -13.082776069641113, "step": 1810 }, { - "epoch": 0.48, - "grad_norm": 9.625, - "learning_rate": 3.1371123749269804e-06, - "logits/chosen": -2.0032527446746826, - "logits/rejected": -1.9413458108901978, - "logps/chosen": -422.4027404785156, - "logps/rejected": -460.8421325683594, - "loss": 0.6067, - "rewards/accuracies": 0.6937500238418579, - "rewards/chosen": -1.2173041105270386, - "rewards/margins": 0.5386160612106323, - "rewards/rejected": -1.75592041015625, + "epoch": 0.49, + "grad_norm": 21.5, + "learning_rate": 3.051753067205895e-06, + "logits/chosen": -1.7685956954956055, + "logits/rejected": -1.5216073989868164, + "logps/chosen": -767.9783935546875, + "logps/rejected": -1409.71240234375, + "loss": 0.2066, + "rewards/accuracies": 0.90625, + "rewards/chosen": -5.720013618469238, + "rewards/margins": 6.2696709632873535, + "rewards/rejected": -11.989683151245117, "step": 1820 }, { - "epoch": 0.48, - "grad_norm": 7.21875, - "learning_rate": 3.114995744685877e-06, - "logits/chosen": -1.9304783344268799, - "logits/rejected": -1.9257535934448242, - "logps/chosen": -375.6784973144531, - "logps/rejected": -426.36822509765625, - "loss": 0.5476, - "rewards/accuracies": 0.75, - "rewards/chosen": -1.0842137336730957, - "rewards/margins": 0.660191535949707, - "rewards/rejected": -1.7444051504135132, + "epoch": 0.49, + "grad_norm": 2.65625, + "learning_rate": 3.0289782248349394e-06, + "logits/chosen": -1.8148343563079834, + "logits/rejected": -1.5846790075302124, + "logps/chosen": -754.4901123046875, + "logps/rejected": -1433.83203125, + "loss": 0.2044, + "rewards/accuracies": 0.925000011920929, + "rewards/chosen": -5.541926383972168, + "rewards/margins": 6.822430610656738, + "rewards/rejected": -12.364358901977539, "step": 1830 }, { - "epoch": 0.48, - "grad_norm": 3.5, - "learning_rate": 3.0928277624770743e-06, - "logits/chosen": -2.052551507949829, - "logits/rejected": -1.9413669109344482, - "logps/chosen": -400.77728271484375, - "logps/rejected": -459.59979248046875, - "loss": 0.4952, - "rewards/accuracies": 0.7749999761581421, - "rewards/chosen": -0.9567273855209351, - "rewards/margins": 0.8860765695571899, - "rewards/rejected": -1.842804193496704, + "epoch": 0.49, + "grad_norm": 7.0, + "learning_rate": 3.0061573304999626e-06, + "logits/chosen": -1.762790322303772, + "logits/rejected": -1.459867238998413, + "logps/chosen": -755.7040405273438, + "logps/rejected": -1503.606689453125, + "loss": 0.1869, + "rewards/accuracies": 0.90625, + "rewards/chosen": -5.6962385177612305, + "rewards/margins": 7.409841060638428, + "rewards/rejected": -13.1060791015625, "step": 1840 }, { - "epoch": 0.48, - "grad_norm": 4.71875, - "learning_rate": 3.070610279320708e-06, - "logits/chosen": -2.090240478515625, - "logits/rejected": -1.8750137090682983, - "logps/chosen": -401.25360107421875, - "logps/rejected": -459.5379943847656, - "loss": 0.498, - "rewards/accuracies": 0.737500011920929, - "rewards/chosen": -0.9490548372268677, - "rewards/margins": 0.8133231401443481, - "rewards/rejected": -1.7623779773712158, + "epoch": 0.49, + "grad_norm": 4.15625, + "learning_rate": 2.9832923709500507e-06, + "logits/chosen": -1.853634238243103, + "logits/rejected": -1.6102838516235352, + "logps/chosen": -765.724609375, + "logps/rejected": -1428.977783203125, + "loss": 0.2409, + "rewards/accuracies": 0.887499988079071, + "rewards/chosen": -5.709242343902588, + "rewards/margins": 6.602417945861816, + "rewards/rejected": -12.311660766601562, "step": 1850 }, { - "epoch": 0.49, - "grad_norm": 4.0625, - "learning_rate": 3.0483451503702264e-06, - "logits/chosen": -1.9803142547607422, - "logits/rejected": -1.9110126495361328, - "logps/chosen": -415.2545471191406, - "logps/rejected": -474.24017333984375, - "loss": 0.564, - "rewards/accuracies": 0.675000011920929, - "rewards/chosen": -1.1085463762283325, - "rewards/margins": 0.7884865999221802, - "rewards/rejected": -1.8970329761505127, + "epoch": 0.5, + "grad_norm": 1.1796875, + "learning_rate": 2.9603853367705334e-06, + "logits/chosen": -1.7447372674942017, + "logits/rejected": -1.4683334827423096, + "logps/chosen": -815.2188110351562, + "logps/rejected": -1613.6304931640625, + "loss": 0.1237, + "rewards/accuracies": 0.9437500238418579, + "rewards/chosen": -6.318219184875488, + "rewards/margins": 7.942680358886719, + "rewards/rejected": -14.260897636413574, "step": 1860 }, { - "epoch": 0.49, - "grad_norm": 7.28125, - "learning_rate": 3.0260342347574916e-06, - "logits/chosen": -2.0407090187072754, - "logits/rejected": -1.8666067123413086, - "logps/chosen": -393.3802795410156, - "logps/rejected": -472.4957580566406, - "loss": 0.4705, - "rewards/accuracies": 0.800000011920929, - "rewards/chosen": -1.0278112888336182, - "rewards/margins": 0.9973715543746948, - "rewards/rejected": -2.0251829624176025, + "epoch": 0.5, + "grad_norm": 4.3125, + "learning_rate": 2.9374382222096885e-06, + "logits/chosen": -1.728420615196228, + "logits/rejected": -1.4341685771942139, + "logps/chosen": -946.7561645507812, + "logps/rejected": -1834.0970458984375, + "loss": 0.2257, + "rewards/accuracies": 0.9125000238418579, + "rewards/chosen": -7.539031028747559, + "rewards/margins": 8.83682632446289, + "rewards/rejected": -16.375858306884766, "step": 1870 }, { - "epoch": 0.49, - "grad_norm": 6.75, - "learning_rate": 3.0036793954375358e-06, - "logits/chosen": -2.039440155029297, - "logits/rejected": -1.8485755920410156, - "logps/chosen": -410.55291748046875, - "logps/rejected": -466.46051025390625, - "loss": 0.4697, - "rewards/accuracies": 0.78125, - "rewards/chosen": -1.2217860221862793, - "rewards/margins": 0.9957095384597778, - "rewards/rejected": -2.2174954414367676, + "epoch": 0.5, + "grad_norm": 19.875, + "learning_rate": 2.9144530250051266e-06, + "logits/chosen": -1.7010129690170288, + "logits/rejected": -1.3637703657150269, + "logps/chosen": -856.9533081054688, + "logps/rejected": -1696.8665771484375, + "loss": 0.1865, + "rewards/accuracies": 0.918749988079071, + "rewards/chosen": -6.755218505859375, + "rewards/margins": 8.420073509216309, + "rewards/rejected": -15.175291061401367, "step": 1880 }, { - "epoch": 0.49, - "grad_norm": 9.1875, - "learning_rate": 2.981282499033009e-06, - "logits/chosen": -1.9933445453643799, - "logits/rejected": -1.8086822032928467, - "logps/chosen": -437.57635498046875, - "logps/rejected": -503.1834411621094, - "loss": 0.5375, - "rewards/accuracies": 0.7124999761581421, - "rewards/chosen": -1.4291341304779053, - "rewards/margins": 0.9259105920791626, - "rewards/rejected": -2.3550448417663574, + "epoch": 0.5, + "grad_norm": 13.875, + "learning_rate": 2.891431746209868e-06, + "logits/chosen": -1.7717593908309937, + "logits/rejected": -1.5268971920013428, + "logps/chosen": -791.3832397460938, + "logps/rejected": -1521.106689453125, + "loss": 0.1918, + "rewards/accuracies": 0.90625, + "rewards/chosen": -6.118847846984863, + "rewards/margins": 7.1408281326293945, + "rewards/rejected": -13.259675979614258, "step": 1890 }, { - "epoch": 0.5, - "grad_norm": 9.8125, - "learning_rate": 2.9588454156783163e-06, - "logits/chosen": -1.9322729110717773, - "logits/rejected": -1.7422888278961182, - "logps/chosen": -434.81884765625, - "logps/rejected": -520.74267578125, - "loss": 0.4687, - "rewards/accuracies": 0.78125, - "rewards/chosen": -1.3640549182891846, - "rewards/margins": 1.0799517631530762, - "rewards/rejected": -2.44400691986084, + "epoch": 0.51, + "grad_norm": 7.78125, + "learning_rate": 2.868376390018136e-06, + "logits/chosen": -1.8260672092437744, + "logits/rejected": -1.6104164123535156, + "logps/chosen": -674.1734008789062, + "logps/rejected": -1275.109619140625, + "loss": 0.2611, + "rewards/accuracies": 0.887499988079071, + "rewards/chosen": -4.913368225097656, + "rewards/margins": 5.8313703536987305, + "rewards/rejected": -10.744738578796387, "step": 1900 }, { - "epoch": 0.5, - "eval_logits/chosen": -1.967826247215271, - "eval_logits/rejected": -1.8518778085708618, - "eval_logps/chosen": -425.9162902832031, - "eval_logps/rejected": -493.8867492675781, - "eval_loss": 0.5197983980178833, - "eval_rewards/accuracies": 0.7289999723434448, - "eval_rewards/chosen": -1.4078201055526733, - "eval_rewards/margins": 0.8985980153083801, - "eval_rewards/rejected": -2.3064181804656982, - "eval_runtime": 784.0155, - "eval_samples_per_second": 2.551, - "eval_steps_per_second": 0.319, + "epoch": 0.51, + "eval_logits/chosen": -1.984923243522644, + "eval_logits/rejected": -1.8682689666748047, + "eval_logps/chosen": -678.0472412109375, + "eval_logps/rejected": -734.593505859375, + "eval_loss": 0.8357905149459839, + "eval_rewards/accuracies": 0.6035000085830688, + "eval_rewards/chosen": -3.9291298389434814, + "eval_rewards/margins": 0.7843554615974426, + "eval_rewards/rejected": -4.713485240936279, + "eval_runtime": 780.35, + "eval_samples_per_second": 2.563, + "eval_steps_per_second": 0.32, "step": 1900 }, { - "epoch": 0.5, - "grad_norm": 9.9375, - "learning_rate": 2.9363700188634597e-06, - "logits/chosen": -1.9670441150665283, - "logits/rejected": -1.802392601966858, - "logps/chosen": -425.4117126464844, - "logps/rejected": -467.03948974609375, - "loss": 0.5114, - "rewards/accuracies": 0.7749999761581421, - "rewards/chosen": -1.430497407913208, - "rewards/margins": 0.8090961575508118, - "rewards/rejected": -2.239593505859375, + "epoch": 0.51, + "grad_norm": 7.15625, + "learning_rate": 2.8452889635908758e-06, + "logits/chosen": -1.8920762538909912, + "logits/rejected": -1.573889970779419, + "logps/chosen": -592.5186767578125, + "logps/rejected": -1212.495361328125, + "loss": 0.1637, + "rewards/accuracies": 0.918749988079071, + "rewards/chosen": -3.980005979537964, + "rewards/margins": 6.280307292938232, + "rewards/rejected": -10.260313987731934, "step": 1910 }, { - "epoch": 0.5, - "grad_norm": 7.78125, - "learning_rate": 2.9138581852776053e-06, - "logits/chosen": -1.9015588760375977, - "logits/rejected": -1.817588210105896, - "logps/chosen": -412.984619140625, - "logps/rejected": -484.392333984375, - "loss": 0.5182, - "rewards/accuracies": 0.737500011920929, - "rewards/chosen": -1.3329174518585205, - "rewards/margins": 0.8674777150154114, - "rewards/rejected": -2.200395107269287, + "epoch": 0.51, + "grad_norm": 9.8125, + "learning_rate": 2.8221714768810144e-06, + "logits/chosen": -1.822471261024475, + "logits/rejected": -1.5430892705917358, + "logps/chosen": -584.103271484375, + "logps/rejected": -1109.2965087890625, + "loss": 0.198, + "rewards/accuracies": 0.9125000238418579, + "rewards/chosen": -3.9089488983154297, + "rewards/margins": 5.281431198120117, + "rewards/rejected": -9.19037914276123, "step": 1920 }, { - "epoch": 0.51, - "grad_norm": 9.9375, - "learning_rate": 2.8913117946523805e-06, - "logits/chosen": -1.8993953466415405, - "logits/rejected": -1.7067630290985107, - "logps/chosen": -418.3155212402344, - "logps/rejected": -466.83038330078125, - "loss": 0.4876, - "rewards/accuracies": 0.7562500238418579, - "rewards/chosen": -1.2759909629821777, - "rewards/margins": 0.8963203430175781, - "rewards/rejected": -2.172311305999756, + "epoch": 0.52, + "grad_norm": 5.6875, + "learning_rate": 2.799025942458477e-06, + "logits/chosen": -1.9587557315826416, + "logits/rejected": -1.6500368118286133, + "logps/chosen": -631.9678344726562, + "logps/rejected": -1182.78466796875, + "loss": 0.1954, + "rewards/accuracies": 0.925000011920929, + "rewards/chosen": -4.307094573974609, + "rewards/margins": 5.625722885131836, + "rewards/rejected": -9.932817459106445, "step": 1930 }, { - "epoch": 0.51, - "grad_norm": 7.625, - "learning_rate": 2.8687327296049126e-06, - "logits/chosen": -1.8995901346206665, - "logits/rejected": -1.7730462551116943, - "logps/chosen": -393.28955078125, - "logps/rejected": -478.4151306152344, - "loss": 0.5351, - "rewards/accuracies": 0.75, - "rewards/chosen": -1.1747307777404785, - "rewards/margins": 0.8694013357162476, - "rewards/rejected": -2.0441319942474365, + "epoch": 0.52, + "grad_norm": 5.84375, + "learning_rate": 2.775854375334976e-06, + "logits/chosen": -1.908205270767212, + "logits/rejected": -1.5645700693130493, + "logps/chosen": -686.67529296875, + "logps/rejected": -1329.472900390625, + "loss": 0.1638, + "rewards/accuracies": 0.9437500238418579, + "rewards/chosen": -4.845526695251465, + "rewards/margins": 6.522275447845459, + "rewards/rejected": -11.367802619934082, "step": 1940 }, { - "epoch": 0.51, - "grad_norm": 6.3125, - "learning_rate": 2.8461228754806376e-06, - "logits/chosen": -1.9606692790985107, - "logits/rejected": -1.8029773235321045, - "logps/chosen": -410.99462890625, - "logps/rejected": -456.9501953125, - "loss": 0.5492, - "rewards/accuracies": 0.737500011920929, - "rewards/chosen": -1.1734405755996704, - "rewards/margins": 0.7420889139175415, - "rewards/rejected": -1.9155292510986328, + "epoch": 0.52, + "grad_norm": 10.125, + "learning_rate": 2.7526587927885855e-06, + "logits/chosen": -1.7340272665023804, + "logits/rejected": -1.4183170795440674, + "logps/chosen": -756.422607421875, + "logps/rejected": -1329.1837158203125, + "loss": 0.2525, + "rewards/accuracies": 0.887499988079071, + "rewards/chosen": -5.652913570404053, + "rewards/margins": 5.638332366943359, + "rewards/rejected": -11.29124641418457, "step": 1950 }, { - "epoch": 0.51, - "grad_norm": 4.96875, - "learning_rate": 2.823484120195865e-06, - "logits/chosen": -2.014724016189575, - "logits/rejected": -1.7219772338867188, - "logps/chosen": -407.7786560058594, - "logps/rejected": -461.8780212402344, - "loss": 0.4589, - "rewards/accuracies": 0.768750011920929, - "rewards/chosen": -1.0491843223571777, - "rewards/margins": 0.9459662437438965, - "rewards/rejected": -1.9951505661010742, + "epoch": 0.52, + "grad_norm": 7.3125, + "learning_rate": 2.729441214188127e-06, + "logits/chosen": -1.7929729223251343, + "logits/rejected": -1.5234935283660889, + "logps/chosen": -770.7953491210938, + "logps/rejected": -1437.832763671875, + "loss": 0.156, + "rewards/accuracies": 0.918749988079071, + "rewards/chosen": -5.866808891296387, + "rewards/margins": 6.6957688331604, + "rewards/rejected": -12.562577247619629, "step": 1960 }, { - "epoch": 0.52, - "grad_norm": 7.375, - "learning_rate": 2.8008183540801486e-06, - "logits/chosen": -1.9526262283325195, - "logits/rejected": -1.7973562479019165, - "logps/chosen": -413.77386474609375, - "logps/rejected": -439.8606872558594, - "loss": 0.5228, - "rewards/accuracies": 0.71875, - "rewards/chosen": -1.2191674709320068, - "rewards/margins": 0.7341850399971008, - "rewards/rejected": -1.9533525705337524, + "epoch": 0.53, + "grad_norm": 6.28125, + "learning_rate": 2.706203660817355e-06, + "logits/chosen": -1.7897508144378662, + "logits/rejected": -1.490918755531311, + "logps/chosen": -712.3719482421875, + "logps/rejected": -1381.3006591796875, + "loss": 0.1929, + "rewards/accuracies": 0.9312499761581421, + "rewards/chosen": -5.262055397033691, + "rewards/margins": 6.684333801269531, + "rewards/rejected": -11.946390151977539, "step": 1970 }, { - "epoch": 0.52, - "grad_norm": 7.0, - "learning_rate": 2.7781274697184353e-06, - "logits/chosen": -1.732486367225647, - "logits/rejected": -1.8619239330291748, - "logps/chosen": -389.2760314941406, - "logps/rejected": -493.4657287597656, - "loss": 0.5323, - "rewards/accuracies": 0.731249988079071, - "rewards/chosen": -1.3979893922805786, - "rewards/margins": 0.8640559315681458, - "rewards/rejected": -2.262045383453369, + "epoch": 0.53, + "grad_norm": 4.0625, + "learning_rate": 2.6829481556990017e-06, + "logits/chosen": -1.8363555669784546, + "logits/rejected": -1.5211414098739624, + "logps/chosen": -658.4310302734375, + "logps/rejected": -1290.535888671875, + "loss": 0.1377, + "rewards/accuracies": 0.9624999761581421, + "rewards/chosen": -4.622659683227539, + "rewards/margins": 6.354128360748291, + "rewards/rejected": -10.976788520812988, "step": 1980 }, { - "epoch": 0.52, - "grad_norm": 5.5, - "learning_rate": 2.7554133617930397e-06, - "logits/chosen": -1.817797064781189, - "logits/rejected": -1.7195402383804321, - "logps/chosen": -402.89129638671875, - "logps/rejected": -470.81756591796875, - "loss": 0.519, - "rewards/accuracies": 0.731249988079071, - "rewards/chosen": -1.3425031900405884, - "rewards/margins": 0.8210526704788208, - "rewards/rejected": -2.163555860519409, + "epoch": 0.53, + "grad_norm": 3.75, + "learning_rate": 2.6596767234186427e-06, + "logits/chosen": -1.8283014297485352, + "logits/rejected": -1.628281593322754, + "logps/chosen": -666.2095947265625, + "logps/rejected": -1244.331787109375, + "loss": 0.1876, + "rewards/accuracies": 0.918749988079071, + "rewards/chosen": -4.930241107940674, + "rewards/margins": 5.740645408630371, + "rewards/rejected": -10.670886039733887, "step": 1990 }, { - "epoch": 0.52, - "grad_norm": 7.71875, - "learning_rate": 2.7326779269254363e-06, - "logits/chosen": -1.9797512292861938, - "logits/rejected": -1.7942073345184326, - "logps/chosen": -438.1368103027344, - "logps/rejected": -471.66876220703125, - "loss": 0.4936, - "rewards/accuracies": 0.7437499761581421, - "rewards/chosen": -1.2935562133789062, - "rewards/margins": 0.9384673833847046, - "rewards/rejected": -2.2320237159729004, + "epoch": 0.53, + "grad_norm": 2.859375, + "learning_rate": 2.636391389948449e-06, + "logits/chosen": -1.8633663654327393, + "logits/rejected": -1.6405309438705444, + "logps/chosen": -635.901611328125, + "logps/rejected": -1253.4234619140625, + "loss": 0.1584, + "rewards/accuracies": 0.9312499761581421, + "rewards/chosen": -4.540086269378662, + "rewards/margins": 5.974652290344238, + "rewards/rejected": -10.514738082885742, "step": 2000 }, { - "epoch": 0.52, - "eval_logits/chosen": -1.9507912397384644, - "eval_logits/rejected": -1.8371071815490723, - "eval_logps/chosen": -426.1055603027344, - "eval_logps/rejected": -488.6001281738281, - "eval_loss": 0.5123463273048401, - "eval_rewards/accuracies": 0.7289999723434448, - "eval_rewards/chosen": -1.4097131490707397, - "eval_rewards/margins": 0.8438388109207153, - "eval_rewards/rejected": -2.253551959991455, - "eval_runtime": 783.3098, - "eval_samples_per_second": 2.553, - "eval_steps_per_second": 0.319, + "epoch": 0.53, + "eval_logits/chosen": -1.9624484777450562, + "eval_logits/rejected": -1.8456604480743408, + "eval_logps/chosen": -742.7199096679688, + "eval_logps/rejected": -804.4951171875, + "eval_loss": 0.9011984467506409, + "eval_rewards/accuracies": 0.6010000109672546, + "eval_rewards/chosen": -4.575857162475586, + "eval_rewards/margins": 0.8366447687149048, + "eval_rewards/rejected": -5.412501335144043, + "eval_runtime": 781.1923, + "eval_samples_per_second": 2.56, + "eval_steps_per_second": 0.32, "step": 2000 }, { - "epoch": 0.53, - "grad_norm": 7.625, - "learning_rate": 2.7099230635178954e-06, - "logits/chosen": -1.8386337757110596, - "logits/rejected": -1.808058738708496, - "logps/chosen": -423.898681640625, - "logps/rejected": -500.33203125, - "loss": 0.5187, - "rewards/accuracies": 0.7437499761581421, - "rewards/chosen": -1.4234874248504639, - "rewards/margins": 0.8072878122329712, - "rewards/rejected": -2.2307751178741455, + "epoch": 0.54, + "grad_norm": 11.9375, + "learning_rate": 2.613094182470804e-06, + "logits/chosen": -1.833001732826233, + "logits/rejected": -1.6033201217651367, + "logps/chosen": -710.7305908203125, + "logps/rejected": -1363.15185546875, + "loss": 0.2535, + "rewards/accuracies": 0.8812500238418579, + "rewards/chosen": -5.038983345031738, + "rewards/margins": 6.4814558029174805, + "rewards/rejected": -11.520438194274902, "step": 2010 }, { - "epoch": 0.53, - "grad_norm": 10.0, - "learning_rate": 2.6871506715949608e-06, - "logits/chosen": -1.862138032913208, - "logits/rejected": -1.816674828529358, - "logps/chosen": -406.03057861328125, - "logps/rejected": -474.5035095214844, - "loss": 0.4734, - "rewards/accuracies": 0.731249988079071, - "rewards/chosen": -1.3433624505996704, - "rewards/margins": 0.9091267585754395, - "rewards/rejected": -2.2524895668029785, + "epoch": 0.54, + "grad_norm": 5.03125, + "learning_rate": 2.5897871292018255e-06, + "logits/chosen": -1.901118516921997, + "logits/rejected": -1.5881661176681519, + "logps/chosen": -653.0816650390625, + "logps/rejected": -1297.2364501953125, + "loss": 0.1509, + "rewards/accuracies": 0.956250011920929, + "rewards/chosen": -4.45406436920166, + "rewards/margins": 6.3539299964904785, + "rewards/rejected": -10.80799388885498, "step": 2020 }, { - "epoch": 0.53, - "grad_norm": 8.125, - "learning_rate": 2.6643626526448063e-06, - "logits/chosen": -1.974774718284607, - "logits/rejected": -1.8196189403533936, - "logps/chosen": -454.701904296875, - "logps/rejected": -509.44073486328125, - "loss": 0.4647, - "rewards/accuracies": 0.7875000238418579, - "rewards/chosen": -1.3952927589416504, - "rewards/margins": 1.014176368713379, - "rewards/rejected": -2.40946888923645, + "epoch": 0.54, + "grad_norm": 4.53125, + "learning_rate": 2.5664722592147866e-06, + "logits/chosen": -1.866217851638794, + "logits/rejected": -1.6465851068496704, + "logps/chosen": -623.0480346679688, + "logps/rejected": -1156.857666015625, + "loss": 0.2831, + "rewards/accuracies": 0.893750011920929, + "rewards/chosen": -4.450149059295654, + "rewards/margins": 5.248156547546387, + "rewards/rejected": -9.6983060836792, "step": 2030 }, { - "epoch": 0.53, - "grad_norm": 15.125, - "learning_rate": 2.6415609094604562e-06, - "logits/chosen": -1.8369518518447876, - "logits/rejected": -1.7782014608383179, - "logps/chosen": -436.97723388671875, - "logps/rejected": -511.42657470703125, - "loss": 0.4861, - "rewards/accuracies": 0.731249988079071, - "rewards/chosen": -1.4681875705718994, - "rewards/margins": 0.9622129201889038, - "rewards/rejected": -2.4304006099700928, + "epoch": 0.55, + "grad_norm": 10.875, + "learning_rate": 2.5431516022634718e-06, + "logits/chosen": -1.925252914428711, + "logits/rejected": -1.6734755039215088, + "logps/chosen": -602.1317138671875, + "logps/rejected": -1146.4388427734375, + "loss": 0.177, + "rewards/accuracies": 0.918749988079071, + "rewards/chosen": -4.30599308013916, + "rewards/margins": 5.237140655517578, + "rewards/rejected": -9.543133735656738, "step": 2040 }, { - "epoch": 0.54, - "grad_norm": 8.125, - "learning_rate": 2.618747345980904e-06, - "logits/chosen": -1.8477888107299805, - "logits/rejected": -1.659761667251587, - "logps/chosen": -462.1060485839844, - "logps/rejected": -497.0399475097656, - "loss": 0.5581, - "rewards/accuracies": 0.731249988079071, - "rewards/chosen": -1.9495713710784912, - "rewards/margins": 0.898357093334198, - "rewards/rejected": -2.847928524017334, + "epoch": 0.55, + "grad_norm": 4.375, + "learning_rate": 2.5198271886054693e-06, + "logits/chosen": -1.8272100687026978, + "logits/rejected": -1.5635576248168945, + "logps/chosen": -690.4618530273438, + "logps/rejected": -1262.934814453125, + "loss": 0.2364, + "rewards/accuracies": 0.925000011920929, + "rewards/chosen": -5.088870048522949, + "rewards/margins": 5.745336055755615, + "rewards/rejected": -10.834206581115723, "step": 2050 }, { - "epoch": 0.54, - "grad_norm": 7.34375, - "learning_rate": 2.595923867132136e-06, - "logits/chosen": -1.8910592794418335, - "logits/rejected": -1.7664788961410522, - "logps/chosen": -513.361328125, - "logps/rejected": -582.758544921875, - "loss": 0.5251, - "rewards/accuracies": 0.762499988079071, - "rewards/chosen": -2.1317970752716064, - "rewards/margins": 0.9448413848876953, - "rewards/rejected": -3.0766384601593018, + "epoch": 0.55, + "grad_norm": 6.34375, + "learning_rate": 2.4965010488254198e-06, + "logits/chosen": -1.8737865686416626, + "logits/rejected": -1.5293992757797241, + "logps/chosen": -696.8263549804688, + "logps/rejected": -1301.195068359375, + "loss": 0.1246, + "rewards/accuracies": 0.9437500238418579, + "rewards/chosen": -5.076106071472168, + "rewards/margins": 5.952775955200195, + "rewards/rejected": -11.028882026672363, "step": 2060 }, { - "epoch": 0.54, - "grad_norm": 6.25, - "learning_rate": 2.5730923786680672e-06, - "logits/chosen": -1.8651106357574463, - "logits/rejected": -1.7601064443588257, - "logps/chosen": -450.99859619140625, - "logps/rejected": -541.1217651367188, - "loss": 0.5428, - "rewards/accuracies": 0.643750011920929, - "rewards/chosen": -1.829119086265564, - "rewards/margins": 0.8268195986747742, - "rewards/rejected": -2.6559386253356934, + "epoch": 0.55, + "grad_norm": 5.03125, + "learning_rate": 2.473175213658236e-06, + "logits/chosen": -1.898840308189392, + "logits/rejected": -1.673750877380371, + "logps/chosen": -655.5372924804688, + "logps/rejected": -1246.764892578125, + "loss": 0.215, + "rewards/accuracies": 0.925000011920929, + "rewards/chosen": -4.903532981872559, + "rewards/margins": 5.727369785308838, + "rewards/rejected": -10.630903244018555, "step": 2070 }, { - "epoch": 0.54, - "grad_norm": 6.34375, - "learning_rate": 2.5502547870114137e-06, - "logits/chosen": -1.845892310142517, - "logits/rejected": -1.7993892431259155, - "logps/chosen": -434.0924377441406, - "logps/rejected": -487.72955322265625, - "loss": 0.5445, - "rewards/accuracies": 0.737500011920929, - "rewards/chosen": -1.6064924001693726, - "rewards/margins": 0.8616697192192078, - "rewards/rejected": -2.4681620597839355, + "epoch": 0.56, + "grad_norm": 12.9375, + "learning_rate": 2.4498517138123153e-06, + "logits/chosen": -1.8875300884246826, + "logits/rejected": -1.6077525615692139, + "logps/chosen": -679.5547485351562, + "logps/rejected": -1191.851318359375, + "loss": 0.2411, + "rewards/accuracies": 0.8812500238418579, + "rewards/chosen": -4.939997673034668, + "rewards/margins": 5.089707374572754, + "rewards/rejected": -10.029705047607422, "step": 2080 }, { - "epoch": 0.55, - "grad_norm": 7.3125, - "learning_rate": 2.527412999094507e-06, - "logits/chosen": -1.8845453262329102, - "logits/rejected": -1.681976318359375, - "logps/chosen": -474.90753173828125, - "logps/rejected": -553.6337890625, - "loss": 0.4916, - "rewards/accuracies": 0.7562500238418579, - "rewards/chosen": -1.5448286533355713, - "rewards/margins": 0.9603381156921387, - "rewards/rejected": -2.505167007446289, + "epoch": 0.56, + "grad_norm": 6.375, + "learning_rate": 2.426532579792742e-06, + "logits/chosen": -1.903921127319336, + "logits/rejected": -1.640777826309204, + "logps/chosen": -701.1472778320312, + "logps/rejected": -1259.781982421875, + "loss": 0.1961, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -5.0907464027404785, + "rewards/margins": 5.620265007019043, + "rewards/rejected": -10.71101188659668, "step": 2090 }, { - "epoch": 0.55, - "grad_norm": 10.375, - "learning_rate": 2.504568922200064e-06, - "logits/chosen": -1.8502464294433594, - "logits/rejected": -1.6667206287384033, - "logps/chosen": -398.37567138671875, - "logps/rejected": -467.509765625, - "loss": 0.5058, - "rewards/accuracies": 0.75, - "rewards/chosen": -1.3745046854019165, - "rewards/margins": 0.8694229125976562, - "rewards/rejected": -2.243927478790283, + "epoch": 0.56, + "grad_norm": 4.75, + "learning_rate": 2.40321984172452e-06, + "logits/chosen": -1.799851417541504, + "logits/rejected": -1.5513416528701782, + "logps/chosen": -734.9310302734375, + "logps/rejected": -1328.170166015625, + "loss": 0.173, + "rewards/accuracies": 0.925000011920929, + "rewards/chosen": -5.326907157897949, + "rewards/margins": 6.046074867248535, + "rewards/rejected": -11.372981071472168, "step": 2100 }, { - "epoch": 0.55, - "eval_logits/chosen": -1.9302312135696411, - "eval_logits/rejected": -1.8155733346939087, - "eval_logps/chosen": -425.4353332519531, - "eval_logps/rejected": -491.28082275390625, - "eval_loss": 0.5121245384216309, - "eval_rewards/accuracies": 0.7319999933242798, - "eval_rewards/chosen": -1.4030110836029053, - "eval_rewards/margins": 0.8773477673530579, - "eval_rewards/rejected": -2.2803587913513184, - "eval_runtime": 783.2383, - "eval_samples_per_second": 2.554, - "eval_steps_per_second": 0.319, + "epoch": 0.56, + "eval_logits/chosen": -1.9477238655090332, + "eval_logits/rejected": -1.831076741218567, + "eval_logps/chosen": -817.8319091796875, + "eval_logps/rejected": -883.6325073242188, + "eval_loss": 0.9585374593734741, + "eval_rewards/accuracies": 0.590499997138977, + "eval_rewards/chosen": -5.326976299285889, + "eval_rewards/margins": 0.8768988251686096, + "eval_rewards/rejected": -6.2038750648498535, + "eval_runtime": 781.5335, + "eval_samples_per_second": 2.559, + "eval_steps_per_second": 0.32, "step": 2100 }, { - "epoch": 0.55, - "grad_norm": 8.125, - "learning_rate": 2.4817244638019333e-06, - "logits/chosen": -1.9113391637802124, - "logits/rejected": -1.7333167791366577, - "logps/chosen": -438.35302734375, - "logps/rejected": -476.8440856933594, - "loss": 0.546, - "rewards/accuracies": 0.768750011920929, - "rewards/chosen": -1.412841558456421, - "rewards/margins": 0.8053679466247559, - "rewards/rejected": -2.218209743499756, + "epoch": 0.56, + "grad_norm": 24.875, + "learning_rate": 2.3799155291758332e-06, + "logits/chosen": -1.7613136768341064, + "logits/rejected": -1.5118930339813232, + "logps/chosen": -802.9510498046875, + "logps/rejected": -1477.132568359375, + "loss": 0.1726, + "rewards/accuracies": 0.918749988079071, + "rewards/chosen": -6.113633632659912, + "rewards/margins": 6.739386558532715, + "rewards/rejected": -12.853021621704102, "step": 2110 }, { - "epoch": 0.55, - "grad_norm": 16.375, - "learning_rate": 2.4588815314058155e-06, - "logits/chosen": -1.8352677822113037, - "logits/rejected": -1.8049707412719727, - "logps/chosen": -396.540283203125, - "logps/rejected": -446.5592346191406, - "loss": 0.5137, - "rewards/accuracies": 0.75, - "rewards/chosen": -1.347374677658081, - "rewards/margins": 0.8455110788345337, - "rewards/rejected": -2.1928858757019043, + "epoch": 0.57, + "grad_norm": 24.875, + "learning_rate": 2.356621670981353e-06, + "logits/chosen": -1.7708122730255127, + "logits/rejected": -1.4993810653686523, + "logps/chosen": -834.2234497070312, + "logps/rejected": -1653.456298828125, + "loss": 0.2399, + "rewards/accuracies": 0.9375, + "rewards/chosen": -6.487744331359863, + "rewards/margins": 7.984147548675537, + "rewards/rejected": -14.471891403198242, "step": 2120 }, { - "epoch": 0.56, - "grad_norm": 8.3125, - "learning_rate": 2.4360420323899922e-06, - "logits/chosen": -1.9231681823730469, - "logits/rejected": -1.7653518915176392, - "logps/chosen": -431.0726623535156, - "logps/rejected": -464.889404296875, - "loss": 0.5654, - "rewards/accuracies": 0.7250000238418579, - "rewards/chosen": -1.3148460388183594, - "rewards/margins": 0.712091326713562, - "rewards/rejected": -2.026937246322632, + "epoch": 0.57, + "grad_norm": 7.28125, + "learning_rate": 2.3333402950656124e-06, + "logits/chosen": -1.7092750072479248, + "logits/rejected": -1.4365252256393433, + "logps/chosen": -843.7501220703125, + "logps/rejected": -1490.2340087890625, + "loss": 0.1593, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": -6.497511863708496, + "rewards/margins": 6.372196197509766, + "rewards/rejected": -12.869707107543945, "step": 2130 }, { - "epoch": 0.56, - "grad_norm": 6.25, - "learning_rate": 2.4132078738460585e-06, - "logits/chosen": -1.939756155014038, - "logits/rejected": -1.7474992275238037, - "logps/chosen": -411.16943359375, - "logps/rejected": -457.58294677734375, - "loss": 0.4876, - "rewards/accuracies": 0.71875, - "rewards/chosen": -1.3001420497894287, - "rewards/margins": 0.8735949397087097, - "rewards/rejected": -2.173737049102783, + "epoch": 0.57, + "grad_norm": 4.21875, + "learning_rate": 2.3100734282664565e-06, + "logits/chosen": -1.743865728378296, + "logits/rejected": -1.5017584562301636, + "logps/chosen": -744.8902587890625, + "logps/rejected": -1406.6005859375, + "loss": 0.1513, + "rewards/accuracies": 0.9375, + "rewards/chosen": -5.662583351135254, + "rewards/margins": 6.43302059173584, + "rewards/rejected": -12.095603942871094, "step": 2140 }, { - "epoch": 0.56, - "grad_norm": 30.0, - "learning_rate": 2.3903809624196826e-06, - "logits/chosen": -1.926123857498169, - "logits/rejected": -1.7321586608886719, - "logps/chosen": -378.1445007324219, - "logps/rejected": -406.1599426269531, - "loss": 0.5939, - "rewards/accuracies": 0.6812499761581421, - "rewards/chosen": -1.2644902467727661, - "rewards/margins": 0.6342250108718872, - "rewards/rejected": -1.8987153768539429, + "epoch": 0.57, + "grad_norm": 5.96875, + "learning_rate": 2.286823096158595e-06, + "logits/chosen": -1.7671382427215576, + "logits/rejected": -1.5393540859222412, + "logps/chosen": -764.4788208007812, + "logps/rejected": -1386.416259765625, + "loss": 0.148, + "rewards/accuracies": 0.925000011920929, + "rewards/chosen": -5.647488594055176, + "rewards/margins": 6.092167854309082, + "rewards/rejected": -11.739656448364258, "step": 2150 }, { - "epoch": 0.57, - "grad_norm": 11.9375, - "learning_rate": 2.3675632041513978e-06, - "logits/chosen": -2.0393998622894287, - "logits/rejected": -1.7864320278167725, - "logps/chosen": -425.8330078125, - "logps/rejected": -446.5849609375, - "loss": 0.4931, - "rewards/accuracies": 0.7749999761581421, - "rewards/chosen": -1.2073787450790405, - "rewards/margins": 0.8838081359863281, - "rewards/rejected": -2.091187000274658, + "epoch": 0.58, + "grad_norm": 4.78125, + "learning_rate": 2.2635913228772495e-06, + "logits/chosen": -1.7948739528656006, + "logits/rejected": -1.5345538854599, + "logps/chosen": -731.1153564453125, + "logps/rejected": -1379.321044921875, + "loss": 0.1676, + "rewards/accuracies": 0.925000011920929, + "rewards/chosen": -5.540210247039795, + "rewards/margins": 6.384571552276611, + "rewards/rejected": -11.924782752990723, "step": 2160 }, { - "epoch": 0.57, - "grad_norm": 8.5625, - "learning_rate": 2.3447565043174533e-06, - "logits/chosen": -1.87993586063385, - "logits/rejected": -1.7703927755355835, - "logps/chosen": -428.050048828125, - "logps/rejected": -463.94683837890625, - "loss": 0.5315, - "rewards/accuracies": 0.7250000238418579, - "rewards/chosen": -1.516704797744751, - "rewards/margins": 0.7439497709274292, - "rewards/rejected": -2.2606546878814697, + "epoch": 0.58, + "grad_norm": 6.03125, + "learning_rate": 2.240380130941944e-06, + "logits/chosen": -1.7852880954742432, + "logits/rejected": -1.4755876064300537, + "logps/chosen": -762.9698486328125, + "logps/rejected": -1461.54296875, + "loss": 0.1365, + "rewards/accuracies": 0.956250011920929, + "rewards/chosen": -5.76432991027832, + "rewards/margins": 6.822981834411621, + "rewards/rejected": -12.587312698364258, "step": 2170 }, { - "epoch": 0.57, - "grad_norm": 9.125, - "learning_rate": 2.321962767270724e-06, - "logits/chosen": -1.932936668395996, - "logits/rejected": -1.7261488437652588, - "logps/chosen": -423.9878845214844, - "logps/rejected": -446.48779296875, - "loss": 0.573, - "rewards/accuracies": 0.699999988079071, - "rewards/chosen": -1.4952479600906372, - "rewards/margins": 0.677778959274292, - "rewards/rejected": -2.1730268001556396, + "epoch": 0.58, + "grad_norm": 10.3125, + "learning_rate": 2.217191541080426e-06, + "logits/chosen": -1.7540229558944702, + "logits/rejected": -1.5075386762619019, + "logps/chosen": -811.2817993164062, + "logps/rejected": -1491.155517578125, + "loss": 0.2447, + "rewards/accuracies": 0.9375, + "rewards/chosen": -6.2686567306518555, + "rewards/margins": 6.715971946716309, + "rewards/rejected": -12.984628677368164, "step": 2180 }, { - "epoch": 0.57, - "grad_norm": 7.625, - "learning_rate": 2.299183896281692e-06, - "logits/chosen": -1.9751628637313843, - "logits/rejected": -1.7565038204193115, - "logps/chosen": -406.3192443847656, - "logps/rejected": -481.75701904296875, - "loss": 0.5169, - "rewards/accuracies": 0.7437499761581421, - "rewards/chosen": -1.3347368240356445, - "rewards/margins": 0.7907986640930176, - "rewards/rejected": -2.125535488128662, + "epoch": 0.59, + "grad_norm": 15.8125, + "learning_rate": 2.194027572052741e-06, + "logits/chosen": -1.780588150024414, + "logits/rejected": -1.435884952545166, + "logps/chosen": -880.83349609375, + "logps/rejected": -1586.189697265625, + "loss": 0.2163, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -6.738241672515869, + "rewards/margins": 7.2081298828125, + "rewards/rejected": -13.946372985839844, "step": 2190 }, { - "epoch": 0.58, - "grad_norm": 5.65625, - "learning_rate": 2.2764217933795297e-06, - "logits/chosen": -1.9902312755584717, - "logits/rejected": -1.8669850826263428, - "logps/chosen": -404.98345947265625, - "logps/rejected": -479.02001953125, - "loss": 0.491, - "rewards/accuracies": 0.75, - "rewards/chosen": -1.247058629989624, - "rewards/margins": 0.9538729786872864, - "rewards/rejected": -2.2009317874908447, + "epoch": 0.59, + "grad_norm": 7.625, + "learning_rate": 2.170890240475488e-06, + "logits/chosen": -1.842301607131958, + "logits/rejected": -1.4893786907196045, + "logps/chosen": -848.5924072265625, + "logps/rejected": -1563.589111328125, + "loss": 0.1348, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": -6.58083963394165, + "rewards/margins": 7.14026403427124, + "rewards/rejected": -13.721104621887207, "step": 2200 }, { - "epoch": 0.58, - "eval_logits/chosen": -2.0052785873413086, - "eval_logits/rejected": -1.8893271684646606, - "eval_logps/chosen": -413.965576171875, - "eval_logps/rejected": -474.9657287597656, - "eval_loss": 0.5101913213729858, - "eval_rewards/accuracies": 0.7300000190734863, - "eval_rewards/chosen": -1.2883129119873047, - "eval_rewards/margins": 0.8288950324058533, - "eval_rewards/rejected": -2.1172077655792236, - "eval_runtime": 783.9156, - "eval_samples_per_second": 2.551, - "eval_steps_per_second": 0.319, + "epoch": 0.59, + "eval_logits/chosen": -1.933610439300537, + "eval_logits/rejected": -1.8167284727096558, + "eval_logps/chosen": -850.277587890625, + "eval_logps/rejected": -931.1090698242188, + "eval_loss": 1.0276434421539307, + "eval_rewards/accuracies": 0.6010000109672546, + "eval_rewards/chosen": -5.651431560516357, + "eval_rewards/margins": 1.0272092819213867, + "eval_rewards/rejected": -6.6786417961120605, + "eval_runtime": 780.3127, + "eval_samples_per_second": 2.563, + "eval_steps_per_second": 0.32, "step": 2200 }, { - "epoch": 0.58, - "grad_norm": 4.3125, - "learning_rate": 2.2536783591932786e-06, - "logits/chosen": -2.0587801933288574, - "logits/rejected": -1.9291499853134155, - "logps/chosen": -429.5076599121094, - "logps/rejected": -478.2344665527344, - "loss": 0.5481, - "rewards/accuracies": 0.737500011920929, - "rewards/chosen": -1.328704595565796, - "rewards/margins": 0.716596245765686, - "rewards/rejected": -2.0453009605407715, + "epoch": 0.59, + "grad_norm": 14.3125, + "learning_rate": 2.147781560646252e-06, + "logits/chosen": -1.7368297576904297, + "logits/rejected": -1.452894926071167, + "logps/chosen": -835.8079223632812, + "logps/rejected": -1602.2353515625, + "loss": 0.1927, + "rewards/accuracies": 0.90625, + "rewards/chosen": -6.314393997192383, + "rewards/margins": 7.641695976257324, + "rewards/rejected": -13.956090927124023, "step": 2210 }, { - "epoch": 0.58, - "grad_norm": 7.125, - "learning_rate": 2.230955492793149e-06, - "logits/chosen": -1.843507170677185, - "logits/rejected": -1.7773106098175049, - "logps/chosen": -447.97296142578125, - "logps/rejected": -498.5977478027344, - "loss": 0.6047, - "rewards/accuracies": 0.6875, - "rewards/chosen": -1.5283676385879517, - "rewards/margins": 0.6763461828231812, - "rewards/rejected": -2.204713821411133, + "epoch": 0.59, + "grad_norm": 12.75, + "learning_rate": 2.1247035443682466e-06, + "logits/chosen": -1.7991231679916382, + "logits/rejected": -1.4767308235168457, + "logps/chosen": -748.2978515625, + "logps/rejected": -1403.584716796875, + "loss": 0.1813, + "rewards/accuracies": 0.918749988079071, + "rewards/chosen": -5.546191215515137, + "rewards/margins": 6.627876281738281, + "rewards/rejected": -12.174067497253418, "step": 2220 }, { - "epoch": 0.58, - "grad_norm": 5.875, - "learning_rate": 2.208255091531947e-06, - "logits/chosen": -1.903603196144104, - "logits/rejected": -1.803851842880249, - "logps/chosen": -430.32830810546875, - "logps/rejected": -484.010009765625, - "loss": 0.4911, - "rewards/accuracies": 0.768750011920929, - "rewards/chosen": -1.32736074924469, - "rewards/margins": 0.9304676055908203, - "rewards/rejected": -2.2578284740448, + "epoch": 0.6, + "grad_norm": 6.4375, + "learning_rate": 2.101658200775166e-06, + "logits/chosen": -1.8935410976409912, + "logits/rejected": -1.6258153915405273, + "logps/chosen": -756.5455932617188, + "logps/rejected": -1427.7357177734375, + "loss": 0.2026, + "rewards/accuracies": 0.8812500238418579, + "rewards/chosen": -5.5328755378723145, + "rewards/margins": 6.666754722595215, + "rewards/rejected": -12.199630737304688, "step": 2230 }, { - "epoch": 0.59, - "grad_norm": 8.5, - "learning_rate": 2.1855790508866435e-06, - "logits/chosen": -1.9459956884384155, - "logits/rejected": -1.7975234985351562, - "logps/chosen": -473.66778564453125, - "logps/rejected": -527.3323974609375, - "loss": 0.5193, - "rewards/accuracies": 0.7124999761581421, - "rewards/chosen": -1.447183609008789, - "rewards/margins": 0.8262225985527039, - "rewards/rejected": -2.2734062671661377, + "epoch": 0.6, + "grad_norm": 7.28125, + "learning_rate": 2.0786475361562754e-06, + "logits/chosen": -1.8521206378936768, + "logits/rejected": -1.5795223712921143, + "logps/chosen": -647.3450927734375, + "logps/rejected": -1349.6434326171875, + "loss": 0.1187, + "rewards/accuracies": 0.9624999761581421, + "rewards/chosen": -4.813652515411377, + "rewards/margins": 6.857410430908203, + "rewards/rejected": -11.671062469482422, "step": 2240 }, { - "epoch": 0.59, - "grad_norm": 5.1875, - "learning_rate": 2.162929264300107e-06, - "logits/chosen": -1.8744213581085205, - "logits/rejected": -1.810024619102478, - "logps/chosen": -405.54437255859375, - "logps/rejected": -486.003662109375, - "loss": 0.4564, - "rewards/accuracies": 0.75, - "rewards/chosen": -1.1884064674377441, - "rewards/margins": 1.0178717374801636, - "rewards/rejected": -2.206278085708618, + "epoch": 0.6, + "grad_norm": 7.9375, + "learning_rate": 2.0556735537817464e-06, + "logits/chosen": -1.8903312683105469, + "logits/rejected": -1.6123740673065186, + "logps/chosen": -781.2482299804688, + "logps/rejected": -1511.971923828125, + "loss": 0.1646, + "rewards/accuracies": 0.918749988079071, + "rewards/chosen": -5.933591365814209, + "rewards/margins": 7.245237827301025, + "rewards/rejected": -13.17883014678955, "step": 2250 }, { - "epoch": 0.59, - "grad_norm": 9.125, - "learning_rate": 2.1403076230230006e-06, - "logits/chosen": -1.8847591876983643, - "logits/rejected": -1.7714933156967163, - "logps/chosen": -439.0040588378906, - "logps/rejected": -468.7979431152344, - "loss": 0.6072, - "rewards/accuracies": 0.6625000238418579, - "rewards/chosen": -1.4334795475006104, - "rewards/margins": 0.630344033241272, - "rewards/rejected": -2.0638232231140137, + "epoch": 0.6, + "grad_norm": 5.21875, + "learning_rate": 2.0327382537282563e-06, + "logits/chosen": -1.8345636129379272, + "logits/rejected": -1.5723216533660889, + "logps/chosen": -747.3956298828125, + "logps/rejected": -1473.4298095703125, + "loss": 0.1579, + "rewards/accuracies": 0.9375, + "rewards/chosen": -5.571768760681152, + "rewards/margins": 7.050948143005371, + "rewards/rejected": -12.622716903686523, "step": 2260 }, { - "epoch": 0.59, - "grad_norm": 7.75, - "learning_rate": 2.11771601595586e-06, - "logits/chosen": -1.925467848777771, - "logits/rejected": -1.8592636585235596, - "logps/chosen": -436.3429260253906, - "logps/rejected": -466.99810791015625, - "loss": 0.5333, - "rewards/accuracies": 0.737500011920929, - "rewards/chosen": -1.3296717405319214, - "rewards/margins": 0.8554004430770874, - "rewards/rejected": -2.185072422027588, + "epoch": 0.61, + "grad_norm": 9.875, + "learning_rate": 2.009843632704861e-06, + "logits/chosen": -1.8921455144882202, + "logits/rejected": -1.6603155136108398, + "logps/chosen": -674.5091552734375, + "logps/rejected": -1375.080810546875, + "loss": 0.1659, + "rewards/accuracies": 0.925000011920929, + "rewards/chosen": -4.866215705871582, + "rewards/margins": 6.783639430999756, + "rewards/rejected": -11.64985466003418, "step": 2270 }, { - "epoch": 0.6, - "grad_norm": 9.4375, - "learning_rate": 2.0951563294913737e-06, - "logits/chosen": -2.0011866092681885, - "logits/rejected": -1.7229188680648804, - "logps/chosen": -401.23773193359375, - "logps/rejected": -455.4305114746094, - "loss": 0.4779, - "rewards/accuracies": 0.7875000238418579, - "rewards/chosen": -1.1956737041473389, - "rewards/margins": 0.8523205518722534, - "rewards/rejected": -2.0479941368103027, + "epoch": 0.61, + "grad_norm": 5.125, + "learning_rate": 1.9869916838791704e-06, + "logits/chosen": -1.9177888631820679, + "logits/rejected": -1.6464917659759521, + "logps/chosen": -677.2672729492188, + "logps/rejected": -1329.63818359375, + "loss": 0.1531, + "rewards/accuracies": 0.956250011920929, + "rewards/chosen": -4.961805820465088, + "rewards/margins": 6.457982063293457, + "rewards/rejected": -11.419788360595703, "step": 2280 }, { - "epoch": 0.6, - "grad_norm": 6.25, - "learning_rate": 2.0726304473568693e-06, - "logits/chosen": -2.045499563217163, - "logits/rejected": -1.9166113138198853, - "logps/chosen": -397.6915588378906, - "logps/rejected": -439.178466796875, - "loss": 0.5102, - "rewards/accuracies": 0.762499988079071, - "rewards/chosen": -1.1919631958007812, - "rewards/margins": 0.771680474281311, - "rewards/rejected": -1.9636436700820923, + "epoch": 0.61, + "grad_norm": 6.09375, + "learning_rate": 1.964184396703823e-06, + "logits/chosen": -1.8470436334609985, + "logits/rejected": -1.5801961421966553, + "logps/chosen": -734.0846557617188, + "logps/rejected": -1403.310791015625, + "loss": 0.2223, + "rewards/accuracies": 0.9125000238418579, + "rewards/chosen": -5.368366241455078, + "rewards/margins": 6.740887641906738, + "rewards/rejected": -12.109254837036133, "step": 2290 }, { - "epoch": 0.6, - "grad_norm": 7.4375, - "learning_rate": 2.050140250457023e-06, - "logits/chosen": -2.0833168029785156, - "logits/rejected": -1.8359079360961914, - "logps/chosen": -419.4093322753906, - "logps/rejected": -470.44256591796875, - "loss": 0.4923, - "rewards/accuracies": 0.762499988079071, - "rewards/chosen": -1.2162163257598877, - "rewards/margins": 0.8549982309341431, - "rewards/rejected": -2.0712146759033203, + "epoch": 0.61, + "grad_norm": 11.625, + "learning_rate": 1.9414237567432887e-06, + "logits/chosen": -1.8151607513427734, + "logits/rejected": -1.5806912183761597, + "logps/chosen": -775.1886596679688, + "logps/rejected": -1465.5963134765625, + "loss": 0.1724, + "rewards/accuracies": 0.925000011920929, + "rewards/chosen": -5.8581719398498535, + "rewards/margins": 6.805610656738281, + "rewards/rejected": -12.663783073425293, "step": 2300 }, { - "epoch": 0.6, - "eval_logits/chosen": -2.0101025104522705, - "eval_logits/rejected": -1.8948729038238525, - "eval_logps/chosen": -409.7294616699219, - "eval_logps/rejected": -472.4915771484375, - "eval_loss": 0.5106725096702576, - "eval_rewards/accuracies": 0.7319999933242798, - "eval_rewards/chosen": -1.245951771736145, - "eval_rewards/margins": 0.8465145230293274, - "eval_rewards/rejected": -2.092466354370117, - "eval_runtime": 782.9612, - "eval_samples_per_second": 2.554, - "eval_steps_per_second": 0.319, + "epoch": 0.61, + "eval_logits/chosen": -1.9729039669036865, + "eval_logits/rejected": -1.8557853698730469, + "eval_logps/chosen": -799.329345703125, + "eval_logps/rejected": -868.793701171875, + "eval_loss": 0.9419282078742981, + "eval_rewards/accuracies": 0.6029999852180481, + "eval_rewards/chosen": -5.141951560974121, + "eval_rewards/margins": 0.9135352373123169, + "eval_rewards/rejected": -6.055487155914307, + "eval_runtime": 781.02, + "eval_samples_per_second": 2.561, + "eval_steps_per_second": 0.32, "step": 2300 }, { - "epoch": 0.6, - "grad_norm": 8.9375, - "learning_rate": 2.0276876167168042e-06, - "logits/chosen": -1.845709204673767, - "logits/rejected": -1.7635509967803955, - "logps/chosen": -373.54425048828125, - "logps/rejected": -412.81805419921875, - "loss": 0.5546, - "rewards/accuracies": 0.699999988079071, - "rewards/chosen": -1.249789834022522, - "rewards/margins": 0.7671625018119812, - "rewards/rejected": -2.0169520378112793, + "epoch": 0.62, + "grad_norm": 3.578125, + "learning_rate": 1.9187117455010082e-06, + "logits/chosen": -1.8257277011871338, + "logits/rejected": -1.544679880142212, + "logps/chosen": -741.0110473632812, + "logps/rejected": -1510.415771484375, + "loss": 0.1389, + "rewards/accuracies": 0.9312499761581421, + "rewards/chosen": -5.247300148010254, + "rewards/margins": 7.593186378479004, + "rewards/rejected": -12.840486526489258, "step": 2310 }, { - "epoch": 0.61, - "grad_norm": 6.5625, - "learning_rate": 2.0052744209246682e-06, - "logits/chosen": -1.970632791519165, - "logits/rejected": -1.8350298404693604, - "logps/chosen": -410.10125732421875, - "logps/rejected": -451.84124755859375, - "loss": 0.521, - "rewards/accuracies": 0.706250011920929, - "rewards/chosen": -1.3679193258285522, - "rewards/margins": 0.7870801687240601, - "rewards/rejected": -2.1549994945526123, + "epoch": 0.62, + "grad_norm": 3.46875, + "learning_rate": 1.896050340246886e-06, + "logits/chosen": -1.865552306175232, + "logits/rejected": -1.5399208068847656, + "logps/chosen": -729.8931884765625, + "logps/rejected": -1392.20751953125, + "loss": 0.1431, + "rewards/accuracies": 0.9375, + "rewards/chosen": -5.416408538818359, + "rewards/margins": 6.736737251281738, + "rewards/rejected": -12.153144836425781, "step": 2320 }, { - "epoch": 0.61, - "grad_norm": 10.4375, - "learning_rate": 1.9829025345760127e-06, - "logits/chosen": -1.9096359014511108, - "logits/rejected": -1.947090744972229, - "logps/chosen": -431.58319091796875, - "logps/rejected": -487.96466064453125, - "loss": 0.5675, - "rewards/accuracies": 0.7124999761581421, - "rewards/chosen": -1.311854600906372, - "rewards/margins": 0.6661221385002136, - "rewards/rejected": -1.9779767990112305, + "epoch": 0.62, + "grad_norm": 16.875, + "learning_rate": 1.8734415138451556e-06, + "logits/chosen": -1.8815135955810547, + "logits/rejected": -1.6109225749969482, + "logps/chosen": -702.2775268554688, + "logps/rejected": -1290.330322265625, + "loss": 0.1526, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": -5.315953254699707, + "rewards/margins": 5.775982856750488, + "rewards/rejected": -11.091935157775879, "step": 2330 }, { - "epoch": 0.61, - "grad_norm": 8.5, - "learning_rate": 1.9605738257169115e-06, - "logits/chosen": -1.928025245666504, - "logits/rejected": -1.657149314880371, - "logps/chosen": -391.2662048339844, - "logps/rejected": -477.40106201171875, - "loss": 0.4978, - "rewards/accuracies": 0.768750011920929, - "rewards/chosen": -1.3763009309768677, - "rewards/margins": 0.9396868944168091, - "rewards/rejected": -2.3159878253936768, + "epoch": 0.63, + "grad_norm": 0.91015625, + "learning_rate": 1.8508872345826217e-06, + "logits/chosen": -1.7835582494735718, + "logits/rejected": -1.5049933195114136, + "logps/chosen": -776.7059326171875, + "logps/rejected": -1574.740966796875, + "loss": 0.1563, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": -5.917466640472412, + "rewards/margins": 7.796708583831787, + "rewards/rejected": -13.714177131652832, "step": 2340 }, { - "epoch": 0.62, - "grad_norm": 8.3125, - "learning_rate": 1.9382901587881275e-06, - "logits/chosen": -1.9695905447006226, - "logits/rejected": -1.8236945867538452, - "logps/chosen": -399.340087890625, - "logps/rejected": -462.8815002441406, - "loss": 0.4442, - "rewards/accuracies": 0.8062499761581421, - "rewards/chosen": -1.215022325515747, - "rewards/margins": 1.0116783380508423, - "rewards/rejected": -2.226700782775879, + "epoch": 0.63, + "grad_norm": 11.6875, + "learning_rate": 1.828389465997305e-06, + "logits/chosen": -1.783422827720642, + "logits/rejected": -1.5323078632354736, + "logps/chosen": -845.9024658203125, + "logps/rejected": -1603.354248046875, + "loss": 0.1634, + "rewards/accuracies": 0.90625, + "rewards/chosen": -6.5129241943359375, + "rewards/margins": 7.4951491355896, + "rewards/rejected": -14.008073806762695, "step": 2350 }, { - "epoch": 0.62, - "grad_norm": 10.125, - "learning_rate": 1.916053394469437e-06, - "logits/chosen": -1.9849417209625244, - "logits/rejected": -1.7118148803710938, - "logps/chosen": -412.3340759277344, - "logps/rejected": -497.0166015625, - "loss": 0.4895, - "rewards/accuracies": 0.7562500238418579, - "rewards/chosen": -1.307409644126892, - "rewards/margins": 0.9881817102432251, - "rewards/rejected": -2.295591354370117, + "epoch": 0.63, + "grad_norm": 4.125, + "learning_rate": 1.8059501667075033e-06, + "logits/chosen": -1.762046456336975, + "logits/rejected": -1.4177411794662476, + "logps/chosen": -846.1433715820312, + "logps/rejected": -1681.241455078125, + "loss": 0.1306, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": -6.4577531814575195, + "rewards/margins": 8.345453262329102, + "rewards/rejected": -14.803205490112305, "step": 2360 }, { - "epoch": 0.62, - "grad_norm": 7.71875, - "learning_rate": 1.8938653895242604e-06, - "logits/chosen": -1.9592363834381104, - "logits/rejected": -1.747896432876587, - "logps/chosen": -414.3805236816406, - "logps/rejected": -484.31842041015625, - "loss": 0.4682, - "rewards/accuracies": 0.8062499761581421, - "rewards/chosen": -1.3323676586151123, - "rewards/margins": 1.001394510269165, - "rewards/rejected": -2.3337621688842773, + "epoch": 0.63, + "grad_norm": 3.96875, + "learning_rate": 1.7835712902412727e-06, + "logits/chosen": -1.7055094242095947, + "logits/rejected": -1.4395256042480469, + "logps/chosen": -878.4562377929688, + "logps/rejected": -1672.9664306640625, + "loss": 0.1229, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": -6.844629764556885, + "rewards/margins": 7.841983795166016, + "rewards/rejected": -14.686613082885742, "step": 2370 }, { - "epoch": 0.62, - "grad_norm": 8.375, - "learning_rate": 1.8717279966446267e-06, - "logits/chosen": -1.8209333419799805, - "logits/rejected": -1.7357136011123657, - "logps/chosen": -419.76885986328125, - "logps/rejected": -493.34686279296875, - "loss": 0.5022, - "rewards/accuracies": 0.731249988079071, - "rewards/chosen": -1.4863672256469727, - "rewards/margins": 0.8758360743522644, - "rewards/rejected": -2.3622031211853027, + "epoch": 0.64, + "grad_norm": 3.046875, + "learning_rate": 1.7612547848663585e-06, + "logits/chosen": -1.7910232543945312, + "logits/rejected": -1.5396807193756104, + "logps/chosen": -862.0275268554688, + "logps/rejected": -1550.447998046875, + "loss": 0.2311, + "rewards/accuracies": 0.918749988079071, + "rewards/chosen": -6.806525230407715, + "rewards/margins": 6.917562961578369, + "rewards/rejected": -13.724087715148926, "step": 2380 }, { - "epoch": 0.63, - "grad_norm": 5.6875, - "learning_rate": 1.8496430642964698e-06, - "logits/chosen": -1.899411916732788, - "logits/rejected": -1.8176219463348389, - "logps/chosen": -425.37030029296875, - "logps/rejected": -482.58197021484375, - "loss": 0.536, - "rewards/accuracies": 0.737500011920929, - "rewards/chosen": -1.3191746473312378, - "rewards/margins": 0.8288755416870117, - "rewards/rejected": -2.148050308227539, + "epoch": 0.64, + "grad_norm": 9.6875, + "learning_rate": 1.7390025934205836e-06, + "logits/chosen": -1.8132579326629639, + "logits/rejected": -1.5044097900390625, + "logps/chosen": -865.0626220703125, + "logps/rejected": -1650.018798828125, + "loss": 0.2765, + "rewards/accuracies": 0.9125000238418579, + "rewards/chosen": -6.646265983581543, + "rewards/margins": 7.8968071937561035, + "rewards/rejected": -14.543073654174805, "step": 2390 }, { - "epoch": 0.63, - "grad_norm": 4.75, - "learning_rate": 1.827612436565286e-06, - "logits/chosen": -1.941320776939392, - "logits/rejected": -1.7779874801635742, - "logps/chosen": -411.2789611816406, - "logps/rejected": -485.43328857421875, - "loss": 0.4718, - "rewards/accuracies": 0.731249988079071, - "rewards/chosen": -1.2878601551055908, - "rewards/margins": 0.9265607595443726, - "rewards/rejected": -2.214420795440674, + "epoch": 0.64, + "grad_norm": 6.1875, + "learning_rate": 1.7168166531427083e-06, + "logits/chosen": -1.8180538415908813, + "logits/rejected": -1.4960973262786865, + "logps/chosen": -817.0841674804688, + "logps/rejected": -1640.4371337890625, + "loss": 0.1652, + "rewards/accuracies": 0.9312499761581421, + "rewards/chosen": -6.297974586486816, + "rewards/margins": 8.122183799743652, + "rewards/rejected": -14.420158386230469, "step": 2400 }, { - "epoch": 0.63, - "eval_logits/chosen": -1.9757143259048462, - "eval_logits/rejected": -1.8618159294128418, - "eval_logps/chosen": -419.5652770996094, - "eval_logps/rejected": -482.2935791015625, - "eval_loss": 0.5093097686767578, - "eval_rewards/accuracies": 0.7264999747276306, - "eval_rewards/chosen": -1.3443105220794678, - "eval_rewards/margins": 0.846176028251648, - "eval_rewards/rejected": -2.1904866695404053, - "eval_runtime": 783.1788, - "eval_samples_per_second": 2.554, - "eval_steps_per_second": 0.319, + "epoch": 0.64, + "eval_logits/chosen": -1.9630491733551025, + "eval_logits/rejected": -1.8445292711257935, + "eval_logps/chosen": -813.1928100585938, + "eval_logps/rejected": -888.5103149414062, + "eval_loss": 0.9893488883972168, + "eval_rewards/accuracies": 0.5960000157356262, + "eval_rewards/chosen": -5.280584812164307, + "eval_rewards/margins": 0.9720681309700012, + "eval_rewards/rejected": -6.252652645111084, + "eval_runtime": 780.9929, + "eval_samples_per_second": 2.561, + "eval_steps_per_second": 0.32, "step": 2400 }, { - "epoch": 0.63, - "grad_norm": 8.3125, - "learning_rate": 1.8056379530021492e-06, - "logits/chosen": -1.9571435451507568, - "logits/rejected": -1.8972694873809814, - "logps/chosen": -400.8582763671875, - "logps/rejected": -447.929931640625, - "loss": 0.527, - "rewards/accuracies": 0.75, - "rewards/chosen": -1.3873006105422974, - "rewards/margins": 0.7267057299613953, - "rewards/rejected": -2.114006519317627, + "epoch": 0.64, + "grad_norm": 2.9375, + "learning_rate": 1.694698895503774e-06, + "logits/chosen": -1.8146822452545166, + "logits/rejected": -1.5159728527069092, + "logps/chosen": -773.5902099609375, + "logps/rejected": -1589.297607421875, + "loss": 0.1422, + "rewards/accuracies": 0.9375, + "rewards/chosen": -5.86234188079834, + "rewards/margins": 8.083044052124023, + "rewards/rejected": -13.94538688659668, "step": 2410 }, { - "epoch": 0.63, - "grad_norm": 8.1875, - "learning_rate": 1.7837214484701154e-06, - "logits/chosen": -1.9466660022735596, - "logits/rejected": -1.8471206426620483, - "logps/chosen": -389.0110168457031, - "logps/rejected": -453.07257080078125, - "loss": 0.4846, - "rewards/accuracies": 0.800000011920929, - "rewards/chosen": -1.1683722734451294, - "rewards/margins": 0.9214507937431335, - "rewards/rejected": -2.089823007583618, + "epoch": 0.65, + "grad_norm": 2.515625, + "learning_rate": 1.6726512460389566e-06, + "logits/chosen": -1.8080215454101562, + "logits/rejected": -1.52683424949646, + "logps/chosen": -794.5113525390625, + "logps/rejected": -1582.714111328125, + "loss": 0.1342, + "rewards/accuracies": 0.956250011920929, + "rewards/chosen": -5.883685111999512, + "rewards/margins": 7.861043453216553, + "rewards/rejected": -13.744728088378906, "step": 2420 }, { - "epoch": 0.64, - "grad_norm": 8.9375, - "learning_rate": 1.7618647529910043e-06, - "logits/chosen": -1.9525439739227295, - "logits/rejected": -1.8127120733261108, - "logps/chosen": -403.1404724121094, - "logps/rejected": -473.431396484375, - "loss": 0.5026, - "rewards/accuracies": 0.75, - "rewards/chosen": -1.2594850063323975, - "rewards/margins": 0.8663209080696106, - "rewards/rejected": -2.1258060932159424, + "epoch": 0.65, + "grad_norm": 2.515625, + "learning_rate": 1.650675624179931e-06, + "logits/chosen": -1.8216609954833984, + "logits/rejected": -1.4835867881774902, + "logps/chosen": -799.8715209960938, + "logps/rejected": -1556.9541015625, + "loss": 0.1083, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": -5.9547319412231445, + "rewards/margins": 7.664329528808594, + "rewards/rejected": -13.619061470031738, "step": 2430 }, { - "epoch": 0.64, - "grad_norm": 7.84375, - "learning_rate": 1.7400696915925996e-06, - "logits/chosen": -1.9792495965957642, - "logits/rejected": -1.7760179042816162, - "logps/chosen": -421.8628845214844, - "logps/rejected": -445.33551025390625, - "loss": 0.5126, - "rewards/accuracies": 0.7250000238418579, - "rewards/chosen": -1.304434061050415, - "rewards/margins": 0.8763524293899536, - "rewards/rejected": -2.180786371231079, + "epoch": 0.65, + "grad_norm": 2.984375, + "learning_rate": 1.628773943087768e-06, + "logits/chosen": -1.7566239833831787, + "logits/rejected": -1.5169622898101807, + "logps/chosen": -790.89453125, + "logps/rejected": -1523.702392578125, + "loss": 0.3056, + "rewards/accuracies": 0.9125000238418579, + "rewards/chosen": -6.1184186935424805, + "rewards/margins": 7.254298210144043, + "rewards/rejected": -13.372715950012207, "step": 2440 }, { - "epoch": 0.64, - "grad_norm": 12.3125, - "learning_rate": 1.718338084156254e-06, - "logits/chosen": -1.9452136754989624, - "logits/rejected": -1.7868611812591553, - "logps/chosen": -435.8577575683594, - "logps/rejected": -489.53997802734375, - "loss": 0.4592, - "rewards/accuracies": 0.78125, - "rewards/chosen": -1.263363242149353, - "rewards/margins": 0.9536368250846863, - "rewards/rejected": -2.2170000076293945, + "epoch": 0.65, + "grad_norm": 3.734375, + "learning_rate": 1.606948109486379e-06, + "logits/chosen": -1.8213424682617188, + "logits/rejected": -1.517643690109253, + "logps/chosen": -821.3567504882812, + "logps/rejected": -1711.3013916015625, + "loss": 0.1673, + "rewards/accuracies": 0.9437500238418579, + "rewards/chosen": -6.3845534324646, + "rewards/margins": 8.916479110717773, + "rewards/rejected": -15.301034927368164, "step": 2450 }, { - "epoch": 0.64, - "grad_norm": 17.875, - "learning_rate": 1.6966717452649372e-06, - "logits/chosen": -1.931109070777893, - "logits/rejected": -1.7780697345733643, - "logps/chosen": -429.203369140625, - "logps/rejected": -467.59588623046875, - "loss": 0.4606, - "rewards/accuracies": 0.7749999761581421, - "rewards/chosen": -1.2935905456542969, - "rewards/margins": 0.9742799997329712, - "rewards/rejected": -2.2678704261779785, + "epoch": 0.66, + "grad_norm": 3.3125, + "learning_rate": 1.5852000234965176e-06, + "logits/chosen": -1.8380916118621826, + "logits/rejected": -1.556247353553772, + "logps/chosen": -863.4910888671875, + "logps/rejected": -1631.0631103515625, + "loss": 0.1745, + "rewards/accuracies": 0.9312499761581421, + "rewards/chosen": -6.359866619110107, + "rewards/margins": 7.785064697265625, + "rewards/rejected": -14.144930839538574, "step": 2460 }, { - "epoch": 0.65, - "grad_norm": 7.53125, - "learning_rate": 1.6750724840517103e-06, - "logits/chosen": -1.9360177516937256, - "logits/rejected": -1.8184916973114014, - "logps/chosen": -402.77056884765625, - "logps/rejected": -478.7181091308594, - "loss": 0.5304, - "rewards/accuracies": 0.731249988079071, - "rewards/chosen": -1.2702269554138184, - "rewards/margins": 0.7718670964241028, - "rewards/rejected": -2.0420939922332764, + "epoch": 0.66, + "grad_norm": 9.5, + "learning_rate": 1.563531578470362e-06, + "logits/chosen": -1.7966903448104858, + "logits/rejected": -1.5243536233901978, + "logps/chosen": -778.637939453125, + "logps/rejected": -1521.4149169921875, + "loss": 0.1896, + "rewards/accuracies": 0.925000011920929, + "rewards/chosen": -5.939091682434082, + "rewards/margins": 7.2415289878845215, + "rewards/rejected": -13.180621147155762, "step": 2470 }, { - "epoch": 0.65, - "grad_norm": 9.8125, - "learning_rate": 1.6535421040486686e-06, - "logits/chosen": -1.799547791481018, - "logits/rejected": -1.6973329782485962, - "logps/chosen": -416.2311096191406, - "logps/rejected": -489.39935302734375, - "loss": 0.4687, - "rewards/accuracies": 0.7749999761581421, - "rewards/chosen": -1.419057011604309, - "rewards/margins": 1.0684629678726196, - "rewards/rejected": -2.4875199794769287, + "epoch": 0.66, + "grad_norm": 5.4375, + "learning_rate": 1.5419446608266792e-06, + "logits/chosen": -1.8752574920654297, + "logits/rejected": -1.4560130834579468, + "logps/chosen": -757.1990356445312, + "logps/rejected": -1594.9390869140625, + "loss": 0.1539, + "rewards/accuracies": 0.956250011920929, + "rewards/chosen": -5.606993675231934, + "rewards/margins": 8.46636962890625, + "rewards/rejected": -14.073362350463867, "step": 2480 }, { - "epoch": 0.65, - "grad_norm": 10.75, - "learning_rate": 1.6320824030363458e-06, - "logits/chosen": -1.8740527629852295, - "logits/rejected": -1.8057101964950562, - "logps/chosen": -390.3742980957031, - "logps/rejected": -475.86737060546875, - "loss": 0.4615, - "rewards/accuracies": 0.762499988079071, - "rewards/chosen": -1.374366044998169, - "rewards/margins": 1.031112790107727, - "rewards/rejected": -2.4054789543151855, + "epoch": 0.67, + "grad_norm": 6.84375, + "learning_rate": 1.5204411498865961e-06, + "logits/chosen": -1.7594566345214844, + "logits/rejected": -1.507631540298462, + "logps/chosen": -827.5431518554688, + "logps/rejected": -1608.100830078125, + "loss": 0.1661, + "rewards/accuracies": 0.918749988079071, + "rewards/chosen": -6.1717095375061035, + "rewards/margins": 7.750910758972168, + "rewards/rejected": -13.922619819641113, "step": 2490 }, { - "epoch": 0.65, - "grad_norm": 8.875, - "learning_rate": 1.6106951728936028e-06, - "logits/chosen": -2.002263307571411, - "logits/rejected": -1.8575477600097656, - "logps/chosen": -405.1084289550781, - "logps/rejected": -480.92327880859375, - "loss": 0.5187, - "rewards/accuracies": 0.706250011920929, - "rewards/chosen": -1.2944339513778687, - "rewards/margins": 0.833656907081604, - "rewards/rejected": -2.1280908584594727, + "epoch": 0.67, + "grad_norm": 12.0625, + "learning_rate": 1.4990229177099957e-06, + "logits/chosen": -1.9248380661010742, + "logits/rejected": -1.610783576965332, + "logps/chosen": -834.01904296875, + "logps/rejected": -1623.152099609375, + "loss": 0.1393, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": -6.52529239654541, + "rewards/margins": 7.815352439880371, + "rewards/rejected": -14.340644836425781, "step": 2500 }, { - "epoch": 0.65, - "eval_logits/chosen": -1.9795697927474976, - "eval_logits/rejected": -1.8640780448913574, - "eval_logps/chosen": -417.4667663574219, - "eval_logps/rejected": -484.1988220214844, - "eval_loss": 0.5102798342704773, - "eval_rewards/accuracies": 0.7285000085830688, - "eval_rewards/chosen": -1.3233253955841064, - "eval_rewards/margins": 0.8862132430076599, - "eval_rewards/rejected": -2.2095389366149902, - "eval_runtime": 784.1373, - "eval_samples_per_second": 2.551, - "eval_steps_per_second": 0.319, - "step": 2500 + "epoch": 0.67, + "eval_logits/chosen": -1.9431767463684082, + "eval_logits/rejected": -1.8254231214523315, + "eval_logps/chosen": -853.2055053710938, + "eval_logps/rejected": -937.9327392578125, + "eval_loss": 1.0265454053878784, + "eval_rewards/accuracies": 0.5985000133514404, + "eval_rewards/chosen": -5.68071174621582, + "eval_rewards/margins": 1.0661665201187134, + "eval_rewards/rejected": -6.746878623962402, + "eval_runtime": 781.5036, + "eval_samples_per_second": 2.559, + "eval_steps_per_second": 0.32, + "step": 2500 }, { - "epoch": 0.66, - "grad_norm": 6.0, - "learning_rate": 1.5893821994479996e-06, - "logits/chosen": -1.9972072839736938, - "logits/rejected": -1.8986774682998657, - "logps/chosen": -403.4590759277344, - "logps/rejected": -454.43585205078125, - "loss": 0.481, - "rewards/accuracies": 0.768750011920929, - "rewards/chosen": -1.087583303451538, - "rewards/margins": 0.914753258228302, - "rewards/rejected": -2.0023367404937744, + "epoch": 0.67, + "grad_norm": 12.5625, + "learning_rate": 1.4776918289325298e-06, + "logits/chosen": -1.7308515310287476, + "logits/rejected": -1.5036309957504272, + "logps/chosen": -862.5445556640625, + "logps/rejected": -1760.222412109375, + "loss": 0.1141, + "rewards/accuracies": 0.9312499761581421, + "rewards/chosen": -6.750277042388916, + "rewards/margins": 8.782461166381836, + "rewards/rejected": -15.532739639282227, "step": 2510 }, { - "epoch": 0.66, - "grad_norm": 7.375, - "learning_rate": 1.5681452623266868e-06, - "logits/chosen": -1.9567502737045288, - "logits/rejected": -1.769144058227539, - "logps/chosen": -429.75860595703125, - "logps/rejected": -468.85382080078125, - "loss": 0.4778, - "rewards/accuracies": 0.78125, - "rewards/chosen": -1.2228810787200928, - "rewards/margins": 0.9947159886360168, - "rewards/rejected": -2.217597007751465, + "epoch": 0.67, + "grad_norm": 8.4375, + "learning_rate": 1.456449740603291e-06, + "logits/chosen": -1.803753137588501, + "logits/rejected": -1.5564358234405518, + "logps/chosen": -908.7633056640625, + "logps/rejected": -1765.212890625, + "loss": 0.1407, + "rewards/accuracies": 0.9375, + "rewards/chosen": -7.143259525299072, + "rewards/margins": 8.473600387573242, + "rewards/rejected": -15.616859436035156, "step": 2520 }, { - "epoch": 0.66, - "grad_norm": 6.53125, - "learning_rate": 1.5469861348078014e-06, - "logits/chosen": -1.967635154724121, - "logits/rejected": -1.8467252254486084, - "logps/chosen": -392.23504638671875, - "logps/rejected": -474.0735778808594, - "loss": 0.4835, - "rewards/accuracies": 0.762499988079071, - "rewards/chosen": -1.249990701675415, - "rewards/margins": 0.9654566049575806, - "rewards/rejected": -2.215447187423706, + "epoch": 0.68, + "grad_norm": 3.015625, + "learning_rate": 1.4352985020231421e-06, + "logits/chosen": -1.7121673822402954, + "logits/rejected": -1.417112112045288, + "logps/chosen": -905.90087890625, + "logps/rejected": -1849.311279296875, + "loss": 0.1203, + "rewards/accuracies": 0.9624999761581421, + "rewards/chosen": -7.201010704040527, + "rewards/margins": 9.321398735046387, + "rewards/rejected": -16.52240753173828, "step": 2530 }, { - "epoch": 0.66, - "grad_norm": 7.4375, - "learning_rate": 1.5259065836724035e-06, - "logits/chosen": -1.803320288658142, - "logits/rejected": -1.7856988906860352, - "logps/chosen": -396.5582275390625, - "logps/rejected": -478.5979919433594, - "loss": 0.49, - "rewards/accuracies": 0.7562500238418579, - "rewards/chosen": -1.3036736249923706, - "rewards/margins": 0.9293512105941772, - "rewards/rejected": -2.233024835586548, + "epoch": 0.68, + "grad_norm": 12.0, + "learning_rate": 1.4142399545837182e-06, + "logits/chosen": -1.7247213125228882, + "logits/rejected": -1.4465917348861694, + "logps/chosen": -866.37548828125, + "logps/rejected": -1694.348388671875, + "loss": 0.1955, + "rewards/accuracies": 0.918749988079071, + "rewards/chosen": -6.842932224273682, + "rewards/margins": 8.215066909790039, + "rewards/rejected": -15.058000564575195, "step": 2540 }, { - "epoch": 0.67, - "grad_norm": 8.25, - "learning_rate": 1.5049083690569456e-06, - "logits/chosen": -1.8403940200805664, - "logits/rejected": -1.6996625661849976, - "logps/chosen": -382.8592529296875, - "logps/rejected": -459.94952392578125, - "loss": 0.5264, - "rewards/accuracies": 0.71875, - "rewards/chosen": -1.2738138437271118, - "rewards/margins": 0.8273483514785767, - "rewards/rejected": -2.1011624336242676, + "epoch": 0.68, + "grad_norm": 8.0625, + "learning_rate": 1.3932759316071184e-06, + "logits/chosen": -1.718605399131775, + "logits/rejected": -1.4183330535888672, + "logps/chosen": -816.7738647460938, + "logps/rejected": -1566.388427734375, + "loss": 0.1356, + "rewards/accuracies": 0.9375, + "rewards/chosen": -6.286736488342285, + "rewards/margins": 7.438478946685791, + "rewards/rejected": -13.725214958190918, "step": 2550 }, { - "epoch": 0.67, - "grad_norm": 12.3125, - "learning_rate": 1.4839932443063057e-06, - "logits/chosen": -1.9228919744491577, - "logits/rejected": -1.7470515966415405, - "logps/chosen": -434.42767333984375, - "logps/rejected": -461.97943115234375, - "loss": 0.4903, - "rewards/accuracies": 0.78125, - "rewards/chosen": -1.238571047782898, - "rewards/margins": 0.861971378326416, - "rewards/rejected": -2.1005425453186035, + "epoch": 0.68, + "grad_norm": 12.5625, + "learning_rate": 1.3724082581862963e-06, + "logits/chosen": -1.7261683940887451, + "logits/rejected": -1.461607575416565, + "logps/chosen": -756.42431640625, + "logps/rejected": -1514.209716796875, + "loss": 0.228, + "rewards/accuracies": 0.918749988079071, + "rewards/chosen": -5.900428771972656, + "rewards/margins": 7.4936652183532715, + "rewards/rejected": -13.394094467163086, "step": 2560 }, { - "epoch": 0.67, - "grad_norm": 11.25, - "learning_rate": 1.4631629558273803e-06, - "logits/chosen": -1.851351022720337, - "logits/rejected": -1.8053207397460938, - "logps/chosen": -394.43255615234375, - "logps/rejected": -451.2469787597656, - "loss": 0.5683, - "rewards/accuracies": 0.668749988079071, - "rewards/chosen": -1.342629313468933, - "rewards/margins": 0.7212954163551331, - "rewards/rejected": -2.063924551010132, + "epoch": 0.69, + "grad_norm": 20.5, + "learning_rate": 1.351638751026178e-06, + "logits/chosen": -1.86214280128479, + "logits/rejected": -1.4936145544052124, + "logps/chosen": -812.3070678710938, + "logps/rejected": -1635.2750244140625, + "loss": 0.2018, + "rewards/accuracies": 0.918749988079071, + "rewards/chosen": -6.023752212524414, + "rewards/margins": 8.384181022644043, + "rewards/rejected": -14.407933235168457, "step": 2570 }, { - "epoch": 0.68, - "grad_norm": 5.09375, - "learning_rate": 1.4424192429432657e-06, - "logits/chosen": -1.888126015663147, - "logits/rejected": -1.8805888891220093, - "logps/chosen": -380.94451904296875, - "logps/rejected": -482.2357482910156, - "loss": 0.4703, - "rewards/accuracies": 0.75, - "rewards/chosen": -1.0602384805679321, - "rewards/margins": 0.9893448948860168, - "rewards/rejected": -2.0495834350585938, + "epoch": 0.69, + "grad_norm": 24.0, + "learning_rate": 1.3309692182854933e-06, + "logits/chosen": -1.8188070058822632, + "logits/rejected": -1.4605897665023804, + "logps/chosen": -798.3870239257812, + "logps/rejected": -1644.860595703125, + "loss": 0.1926, + "rewards/accuracies": 0.918749988079071, + "rewards/chosen": -6.045955181121826, + "rewards/margins": 8.516426086425781, + "rewards/rejected": -14.56238079071045, "step": 2580 }, { - "epoch": 0.68, - "grad_norm": 11.5, - "learning_rate": 1.421763837748016e-06, - "logits/chosen": -1.8856582641601562, - "logits/rejected": -1.7754625082015991, - "logps/chosen": -404.8888244628906, - "logps/rejected": -481.6700744628906, - "loss": 0.4836, - "rewards/accuracies": 0.7749999761581421, - "rewards/chosen": -1.2667722702026367, - "rewards/margins": 0.9201840162277222, - "rewards/rejected": -2.1869561672210693, + "epoch": 0.69, + "grad_norm": 1.8671875, + "learning_rate": 1.3104014594193703e-06, + "logits/chosen": -1.8970272541046143, + "logits/rejected": -1.6189887523651123, + "logps/chosen": -709.9751586914062, + "logps/rejected": -1469.901611328125, + "loss": 0.1165, + "rewards/accuracies": 0.9437500238418579, + "rewards/chosen": -5.162797451019287, + "rewards/margins": 7.589346408843994, + "rewards/rejected": -12.752143859863281, "step": 2590 }, { - "epoch": 0.68, - "grad_norm": 6.625, - "learning_rate": 1.401198464962021e-06, - "logits/chosen": -1.9382120370864868, - "logits/rejected": -1.6832538843154907, - "logps/chosen": -417.79046630859375, - "logps/rejected": -458.81719970703125, - "loss": 0.5025, - "rewards/accuracies": 0.75, - "rewards/chosen": -1.303044080734253, - "rewards/margins": 0.8441787958145142, - "rewards/rejected": -2.1472229957580566, + "epoch": 0.69, + "grad_norm": 25.875, + "learning_rate": 1.2899372650226688e-06, + "logits/chosen": -1.897132158279419, + "logits/rejected": -1.589751124382019, + "logps/chosen": -739.879638671875, + "logps/rejected": -1452.0478515625, + "loss": 0.2064, + "rewards/accuracies": 0.918749988079071, + "rewards/chosen": -5.397900104522705, + "rewards/margins": 7.155561923980713, + "rewards/rejected": -12.553461074829102, "step": 2600 }, { - "epoch": 0.68, - "eval_logits/chosen": -1.9538325071334839, - "eval_logits/rejected": -1.8387821912765503, - "eval_logps/chosen": -414.2358703613281, - "eval_logps/rejected": -481.66204833984375, - "eval_loss": 0.5114797353744507, - "eval_rewards/accuracies": 0.7315000295639038, - "eval_rewards/chosen": -1.2910163402557373, - "eval_rewards/margins": 0.8931546807289124, - "eval_rewards/rejected": -2.184170961380005, - "eval_runtime": 783.3661, - "eval_samples_per_second": 2.553, - "eval_steps_per_second": 0.319, + "epoch": 0.69, + "eval_logits/chosen": -1.9722185134887695, + "eval_logits/rejected": -1.853868842124939, + "eval_logps/chosen": -780.031494140625, + "eval_logps/rejected": -850.7296752929688, + "eval_loss": 0.9616246223449707, + "eval_rewards/accuracies": 0.5960000157356262, + "eval_rewards/chosen": -4.948971748352051, + "eval_rewards/margins": 0.925875186920166, + "eval_rewards/rejected": -5.874846935272217, + "eval_runtime": 781.7509, + "eval_samples_per_second": 2.558, + "eval_steps_per_second": 0.32, "step": 2600 }, { - "epoch": 0.68, - "grad_norm": 12.3125, - "learning_rate": 1.3807248417879896e-06, - "logits/chosen": -1.939713716506958, - "logits/rejected": -1.8573747873306274, - "logps/chosen": -410.3163146972656, - "logps/rejected": -483.0690002441406, - "loss": 0.4874, - "rewards/accuracies": 0.7562500238418579, - "rewards/chosen": -1.1922686100006104, - "rewards/margins": 0.9466792941093445, - "rewards/rejected": -2.1389479637145996, + "epoch": 0.7, + "grad_norm": 2.65625, + "learning_rate": 1.269578416674101e-06, + "logits/chosen": -1.8619873523712158, + "logits/rejected": -1.5740466117858887, + "logps/chosen": -758.4598999023438, + "logps/rejected": -1500.849853515625, + "loss": 0.118, + "rewards/accuracies": 0.956250011920929, + "rewards/chosen": -5.509732246398926, + "rewards/margins": 7.478974342346191, + "rewards/rejected": -12.98870849609375, "step": 2610 }, { - "epoch": 0.69, - "grad_norm": 18.625, - "learning_rate": 1.3603446777675665e-06, - "logits/chosen": -1.8353694677352905, - "logits/rejected": -1.7406032085418701, - "logps/chosen": -430.1822814941406, - "logps/rejected": -488.7582092285156, - "loss": 0.5383, - "rewards/accuracies": 0.699999988079071, - "rewards/chosen": -1.4513074159622192, - "rewards/margins": 0.8473714590072632, - "rewards/rejected": -2.2986786365509033, + "epoch": 0.7, + "grad_norm": 2.703125, + "learning_rate": 1.249326686781127e-06, + "logits/chosen": -1.913484811782837, + "logits/rejected": -1.5594704151153564, + "logps/chosen": -777.2727661132812, + "logps/rejected": -1519.972900390625, + "loss": 0.1092, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": -5.689510345458984, + "rewards/margins": 7.493929386138916, + "rewards/rejected": -13.183438301086426, "step": 2620 }, { - "epoch": 0.69, - "grad_norm": 6.96875, - "learning_rate": 1.3400596746385817e-06, - "logits/chosen": -1.92405104637146, - "logits/rejected": -1.7130537033081055, - "logps/chosen": -423.2511291503906, - "logps/rejected": -478.33294677734375, - "loss": 0.518, - "rewards/accuracies": 0.7437499761581421, - "rewards/chosen": -1.3243486881256104, - "rewards/margins": 0.8681432604789734, - "rewards/rejected": -2.1924920082092285, + "epoch": 0.7, + "grad_norm": 11.75, + "learning_rate": 1.2291838384256508e-06, + "logits/chosen": -1.7811142206192017, + "logits/rejected": -1.4538167715072632, + "logps/chosen": -757.9189453125, + "logps/rejected": -1458.7880859375, + "loss": 0.2024, + "rewards/accuracies": 0.9125000238418579, + "rewards/chosen": -5.850201606750488, + "rewards/margins": 6.9421892166137695, + "rewards/rejected": -12.792390823364258, "step": 2630 }, { - "epoch": 0.69, - "grad_norm": 8.875, - "learning_rate": 1.3198715261929587e-06, - "logits/chosen": -1.888300895690918, - "logits/rejected": -1.7400954961776733, - "logps/chosen": -396.8492736816406, - "logps/rejected": -486.35400390625, - "loss": 0.4583, - "rewards/accuracies": 0.762499988079071, - "rewards/chosen": -1.449106216430664, - "rewards/margins": 0.9863926768302917, - "rewards/rejected": -2.4354991912841797, + "epoch": 0.71, + "grad_norm": 2.90625, + "learning_rate": 1.2091516252105323e-06, + "logits/chosen": -1.9148595333099365, + "logits/rejected": -1.6675951480865479, + "logps/chosen": -781.6110229492188, + "logps/rejected": -1495.281982421875, + "loss": 0.1428, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": -5.9503889083862305, + "rewards/margins": 7.140740394592285, + "rewards/rejected": -13.091130256652832, "step": 2640 }, { - "epoch": 0.69, - "grad_norm": 5.28125, - "learning_rate": 1.2997819181352823e-06, - "logits/chosen": -1.9277763366699219, - "logits/rejected": -1.792421579360962, - "logps/chosen": -438.722900390625, - "logps/rejected": -527.2325439453125, - "loss": 0.4197, - "rewards/accuracies": 0.793749988079071, - "rewards/chosen": -1.1698882579803467, - "rewards/margins": 1.157183289527893, - "rewards/rejected": -2.3270716667175293, + "epoch": 0.71, + "grad_norm": 1.7109375, + "learning_rate": 1.1892317911069212e-06, + "logits/chosen": -1.8337042331695557, + "logits/rejected": -1.4769656658172607, + "logps/chosen": -804.5179443359375, + "logps/rejected": -1690.7152099609375, + "loss": 0.1166, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": -6.174851417541504, + "rewards/margins": 8.737001419067383, + "rewards/rejected": -14.911852836608887, "step": 2650 }, { - "epoch": 0.7, - "grad_norm": 20.625, - "learning_rate": 1.2797925279420454e-06, - "logits/chosen": -1.9607807397842407, - "logits/rejected": -1.7774593830108643, - "logps/chosen": -437.16436767578125, - "logps/rejected": -518.04931640625, - "loss": 0.5167, - "rewards/accuracies": 0.7250000238418579, - "rewards/chosen": -1.4784574508666992, - "rewards/margins": 0.9173187017440796, - "rewards/rejected": -2.3957762718200684, + "epoch": 0.71, + "grad_norm": 1.546875, + "learning_rate": 1.1694260703024266e-06, + "logits/chosen": -1.903599739074707, + "logits/rejected": -1.5903232097625732, + "logps/chosen": -827.9391479492188, + "logps/rejected": -1669.436767578125, + "loss": 0.1213, + "rewards/accuracies": 0.9437500238418579, + "rewards/chosen": -6.192580699920654, + "rewards/margins": 8.526972770690918, + "rewards/rejected": -14.719552993774414, "step": 2660 }, { - "epoch": 0.7, - "grad_norm": 11.8125, - "learning_rate": 1.2599050247215764e-06, - "logits/chosen": -1.871718406677246, - "logits/rejected": -1.7832616567611694, - "logps/chosen": -417.762451171875, - "logps/rejected": -488.9070739746094, - "loss": 0.4824, - "rewards/accuracies": 0.8062499761581421, - "rewards/chosen": -1.3971645832061768, - "rewards/margins": 0.9499279260635376, - "rewards/rejected": -2.347092390060425, + "epoch": 0.71, + "grad_norm": 1.8203125, + "learning_rate": 1.1497361870501425e-06, + "logits/chosen": -1.8243070840835571, + "logits/rejected": -1.5200848579406738, + "logps/chosen": -765.5211181640625, + "logps/rejected": -1657.3450927734375, + "loss": 0.1269, + "rewards/accuracies": 0.956250011920929, + "rewards/chosen": -5.842063903808594, + "rewards/margins": 8.801877975463867, + "rewards/rejected": -14.643940925598145, "step": 2670 }, { - "epoch": 0.7, - "grad_norm": 9.4375, - "learning_rate": 1.2401210690746705e-06, - "logits/chosen": -1.9106123447418213, - "logits/rejected": -1.728986382484436, - "logps/chosen": -418.19207763671875, - "logps/rejected": -467.6233825683594, - "loss": 0.5374, - "rewards/accuracies": 0.75, - "rewards/chosen": -1.2990198135375977, - "rewards/margins": 0.8474353551864624, - "rewards/rejected": -2.1464550495147705, + "epoch": 0.72, + "grad_norm": 1.2109375, + "learning_rate": 1.130163855518544e-06, + "logits/chosen": -1.822270154953003, + "logits/rejected": -1.5637589693069458, + "logps/chosen": -796.2030029296875, + "logps/rejected": -1690.8538818359375, + "loss": 0.1248, + "rewards/accuracies": 0.9312499761581421, + "rewards/chosen": -6.09335470199585, + "rewards/margins": 8.841546058654785, + "rewards/rejected": -14.934901237487793, "step": 2680 }, { - "epoch": 0.7, - "grad_norm": 8.25, - "learning_rate": 1.2204423129559306e-06, - "logits/chosen": -1.945713758468628, - "logits/rejected": -1.860364556312561, - "logps/chosen": -415.3990173339844, - "logps/rejected": -502.76605224609375, - "loss": 0.5061, - "rewards/accuracies": 0.7562500238418579, - "rewards/chosen": -1.2963026762008667, - "rewards/margins": 0.9278820753097534, - "rewards/rejected": -2.22418475151062, + "epoch": 0.72, + "grad_norm": 18.875, + "learning_rate": 1.110710779642244e-06, + "logits/chosen": -1.839624047279358, + "logits/rejected": -1.576751708984375, + "logps/chosen": -819.2501831054688, + "logps/rejected": -1549.195556640625, + "loss": 0.2897, + "rewards/accuracies": 0.90625, + "rewards/chosen": -6.27780818939209, + "rewards/margins": 7.272715091705322, + "rewards/rejected": -13.55052375793457, "step": 2690 }, { - "epoch": 0.71, - "grad_norm": 13.625, - "learning_rate": 1.20087039953583e-06, - "logits/chosen": -1.9242095947265625, - "logits/rejected": -1.7687902450561523, - "logps/chosen": -412.12188720703125, - "logps/rejected": -479.70123291015625, - "loss": 0.4946, - "rewards/accuracies": 0.78125, - "rewards/chosen": -1.2817579507827759, - "rewards/margins": 0.9811213612556458, - "rewards/rejected": -2.2628793716430664, + "epoch": 0.72, + "grad_norm": 11.375, + "learning_rate": 1.0913786529736558e-06, + "logits/chosen": -1.802791953086853, + "logits/rejected": -1.4809070825576782, + "logps/chosen": -757.2066650390625, + "logps/rejected": -1667.732421875, + "loss": 0.1235, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": -5.712438106536865, + "rewards/margins": 9.024209976196289, + "rewards/rejected": -14.736648559570312, "step": 2700 }, { - "epoch": 0.71, - "eval_logits/chosen": -1.9339491128921509, - "eval_logits/rejected": -1.8200063705444336, - "eval_logps/chosen": -419.6712951660156, - "eval_logps/rejected": -487.4803771972656, - "eval_loss": 0.5093861818313599, - "eval_rewards/accuracies": 0.7300000190734863, - "eval_rewards/chosen": -1.345369815826416, - "eval_rewards/margins": 0.896984338760376, - "eval_rewards/rejected": -2.242354154586792, - "eval_runtime": 783.2364, - "eval_samples_per_second": 2.554, - "eval_steps_per_second": 0.319, + "epoch": 0.72, + "eval_logits/chosen": -1.9443422555923462, + "eval_logits/rejected": -1.8262375593185425, + "eval_logps/chosen": -845.3806762695312, + "eval_logps/rejected": -925.70849609375, + "eval_loss": 1.044278621673584, + "eval_rewards/accuracies": 0.5924999713897705, + "eval_rewards/chosen": -5.602463722229004, + "eval_rewards/margins": 1.0221716165542603, + "eval_rewards/rejected": -6.624634742736816, + "eval_runtime": 780.7798, + "eval_samples_per_second": 2.562, + "eval_steps_per_second": 0.32, "step": 2700 }, { - "epoch": 0.71, - "grad_norm": 9.0, - "learning_rate": 1.181406963063507e-06, - "logits/chosen": -1.852378487586975, - "logits/rejected": -1.8395885229110718, - "logps/chosen": -417.3213806152344, - "logps/rejected": -499.66241455078125, - "loss": 0.5247, - "rewards/accuracies": 0.6812499761581421, - "rewards/chosen": -1.254032850265503, - "rewards/margins": 0.9126483798027039, - "rewards/rejected": -2.1666812896728516, + "epoch": 0.72, + "grad_norm": 10.9375, + "learning_rate": 1.0721691585355618e-06, + "logits/chosen": -1.7941747903823853, + "logits/rejected": -1.5727746486663818, + "logps/chosen": -802.1234130859375, + "logps/rejected": -1506.57763671875, + "loss": 0.1902, + "rewards/accuracies": 0.9312499761581421, + "rewards/chosen": -6.127458095550537, + "rewards/margins": 6.992899417877197, + "rewards/rejected": -13.12035846710205, "step": 2710 }, { - "epoch": 0.71, - "grad_norm": 7.53125, - "learning_rate": 1.1620536287303052e-06, - "logits/chosen": -1.9413061141967773, - "logits/rejected": -1.761757493019104, - "logps/chosen": -443.94256591796875, - "logps/rejected": -479.33001708984375, - "loss": 0.5708, - "rewards/accuracies": 0.699999988079071, - "rewards/chosen": -1.3163220882415771, - "rewards/margins": 0.6949169635772705, - "rewards/rejected": -2.0112390518188477, + "epoch": 0.73, + "grad_norm": 10.875, + "learning_rate": 1.0530839686745806e-06, + "logits/chosen": -1.825495958328247, + "logits/rejected": -1.555251121520996, + "logps/chosen": -809.5203857421875, + "logps/rejected": -1606.273193359375, + "loss": 0.1539, + "rewards/accuracies": 0.918749988079071, + "rewards/chosen": -6.172567367553711, + "rewards/margins": 7.86652135848999, + "rewards/rejected": -14.039088249206543, "step": 2720 }, { - "epoch": 0.71, - "grad_norm": 9.3125, - "learning_rate": 1.1428120125340717e-06, - "logits/chosen": -1.8692998886108398, - "logits/rejected": -1.7310359477996826, - "logps/chosen": -402.96820068359375, - "logps/rejected": -472.52716064453125, - "loss": 0.4338, - "rewards/accuracies": 0.800000011920929, - "rewards/chosen": -1.2012834548950195, - "rewards/margins": 1.200130820274353, - "rewards/rejected": -2.401414394378662, + "epoch": 0.73, + "grad_norm": 5.25, + "learning_rate": 1.0341247449155824e-06, + "logits/chosen": -1.8205457925796509, + "logits/rejected": -1.512557864189148, + "logps/chosen": -829.7819213867188, + "logps/rejected": -1568.7506103515625, + "loss": 0.1747, + "rewards/accuracies": 0.925000011920929, + "rewards/chosen": -6.401388645172119, + "rewards/margins": 7.3971991539001465, + "rewards/rejected": -13.798588752746582, "step": 2730 }, { - "epoch": 0.72, - "grad_norm": 7.625, - "learning_rate": 1.123683721144223e-06, - "logits/chosen": -1.8488807678222656, - "logits/rejected": -1.749967336654663, - "logps/chosen": -426.597412109375, - "logps/rejected": -500.22113037109375, - "loss": 0.4657, - "rewards/accuracies": 0.75, - "rewards/chosen": -1.22300386428833, - "rewards/margins": 1.0593169927597046, - "rewards/rejected": -2.282320976257324, + "epoch": 0.73, + "grad_norm": 2.71875, + "learning_rate": 1.0152931378170406e-06, + "logits/chosen": -1.7705323696136475, + "logits/rejected": -1.4258768558502197, + "logps/chosen": -788.82373046875, + "logps/rejected": -1534.7718505859375, + "loss": 0.1828, + "rewards/accuracies": 0.9312499761581421, + "rewards/chosen": -6.10504674911499, + "rewards/margins": 7.536930084228516, + "rewards/rejected": -13.641977310180664, "step": 2740 }, { - "epoch": 0.72, - "grad_norm": 7.375, - "learning_rate": 1.1046703517675848e-06, - "logits/chosen": -1.9238160848617554, - "logits/rejected": -1.8325822353363037, - "logps/chosen": -397.0641174316406, - "logps/rejected": -492.7923889160156, - "loss": 0.5064, - "rewards/accuracies": 0.7250000238418579, - "rewards/chosen": -1.2281832695007324, - "rewards/margins": 0.9004761576652527, - "rewards/rejected": -2.128659248352051, + "epoch": 0.73, + "grad_norm": 2.09375, + "learning_rate": 9.96590786827333e-07, + "logits/chosen": -1.8109601736068726, + "logits/rejected": -1.5829339027404785, + "logps/chosen": -827.0929565429688, + "logps/rejected": -1609.902099609375, + "loss": 0.1431, + "rewards/accuracies": 0.925000011920929, + "rewards/chosen": -6.4153337478637695, + "rewards/margins": 7.681593418121338, + "rewards/rejected": -14.09692668914795, "step": 2750 }, { - "epoch": 0.72, - "grad_norm": 9.5625, - "learning_rate": 1.085773492015028e-06, - "logits/chosen": -1.8633310794830322, - "logits/rejected": -1.6645698547363281, - "logps/chosen": -390.46405029296875, - "logps/rejected": -457.5859375, - "loss": 0.4611, - "rewards/accuracies": 0.7749999761581421, - "rewards/chosen": -1.2341833114624023, - "rewards/margins": 1.0489164590835571, - "rewards/rejected": -2.28309965133667, + "epoch": 0.74, + "grad_norm": 16.75, + "learning_rate": 9.780193201420144e-07, + "logits/chosen": -1.8415769338607788, + "logits/rejected": -1.5490391254425049, + "logps/chosen": -801.6826782226562, + "logps/rejected": -1576.27587890625, + "loss": 0.1532, + "rewards/accuracies": 0.9375, + "rewards/chosen": -5.982812404632568, + "rewards/margins": 7.840212821960449, + "rewards/rejected": -13.823025703430176, "step": 2760 }, { - "epoch": 0.72, - "grad_norm": 9.5625, - "learning_rate": 1.0669947197689034e-06, - "logits/chosen": -1.828849196434021, - "logits/rejected": -1.7827651500701904, - "logps/chosen": -433.4454040527344, - "logps/rejected": -493.23187255859375, - "loss": 0.5096, - "rewards/accuracies": 0.7562500238418579, - "rewards/chosen": -1.3995469808578491, - "rewards/margins": 0.8646076321601868, - "rewards/rejected": -2.2641544342041016, + "epoch": 0.74, + "grad_norm": 10.9375, + "learning_rate": 9.59580354562072e-07, + "logits/chosen": -1.8422110080718994, + "logits/rejected": -1.512509822845459, + "logps/chosen": -825.0670776367188, + "logps/rejected": -1616.693115234375, + "loss": 0.1553, + "rewards/accuracies": 0.9437500238418579, + "rewards/chosen": -6.282979965209961, + "rewards/margins": 7.964785575866699, + "rewards/rejected": -14.247766494750977, "step": 2770 }, { - "epoch": 0.73, - "grad_norm": 8.625, - "learning_rate": 1.048335603051291e-06, - "logits/chosen": -1.815760612487793, - "logits/rejected": -1.668330192565918, - "logps/chosen": -463.34033203125, - "logps/rejected": -534.0313720703125, - "loss": 0.452, - "rewards/accuracies": 0.8125, - "rewards/chosen": -1.546386480331421, - "rewards/margins": 1.0244462490081787, - "rewards/rejected": -2.5708324909210205, + "epoch": 0.74, + "grad_norm": 10.0, + "learning_rate": 9.412754953531664e-07, + "logits/chosen": -1.6988179683685303, + "logits/rejected": -1.3882054090499878, + "logps/chosen": -789.2442626953125, + "logps/rejected": -1564.3502197265625, + "loss": 0.1322, + "rewards/accuracies": 0.9437500238418579, + "rewards/chosen": -6.106675148010254, + "rewards/margins": 7.6955246925354, + "rewards/rejected": -13.802200317382812, "step": 2780 }, { - "epoch": 0.73, - "grad_norm": 9.4375, - "learning_rate": 1.0297976998930665e-06, - "logits/chosen": -1.82321298122406, - "logits/rejected": -1.7483196258544922, - "logps/chosen": -437.36407470703125, - "logps/rejected": -515.2645263671875, - "loss": 0.4615, - "rewards/accuracies": 0.737500011920929, - "rewards/chosen": -1.430606484413147, - "rewards/margins": 1.13131844997406, - "rewards/rejected": -2.561924695968628, + "epoch": 0.75, + "grad_norm": 11.9375, + "learning_rate": 9.231063361058806e-07, + "logits/chosen": -1.7611534595489502, + "logits/rejected": -1.5275523662567139, + "logps/chosen": -850.1500854492188, + "logps/rejected": -1655.7884521484375, + "loss": 0.1367, + "rewards/accuracies": 0.9437500238418579, + "rewards/chosen": -6.542205810546875, + "rewards/margins": 7.965720176696777, + "rewards/rejected": -14.507925033569336, "step": 2790 }, { - "epoch": 0.73, - "grad_norm": 7.6875, - "learning_rate": 1.0113825582038078e-06, - "logits/chosen": -1.86112380027771, - "logits/rejected": -1.7646338939666748, - "logps/chosen": -427.6062927246094, - "logps/rejected": -500.8868103027344, - "loss": 0.5054, - "rewards/accuracies": 0.7749999761581421, - "rewards/chosen": -1.4373871088027954, - "rewards/margins": 0.9105140566825867, - "rewards/rejected": -2.3479011058807373, + "epoch": 0.75, + "grad_norm": 2.265625, + "learning_rate": 9.050744585969834e-07, + "logits/chosen": -1.7793487310409546, + "logits/rejected": -1.5123307704925537, + "logps/chosen": -768.626953125, + "logps/rejected": -1476.6981201171875, + "loss": 0.1229, + "rewards/accuracies": 0.956250011920929, + "rewards/chosen": -5.841226577758789, + "rewards/margins": 7.072574615478516, + "rewards/rejected": -12.913800239562988, "step": 2800 }, { - "epoch": 0.73, - "eval_logits/chosen": -1.918005347251892, - "eval_logits/rejected": -1.8041657209396362, - "eval_logps/chosen": -425.96136474609375, - "eval_logps/rejected": -495.76287841796875, - "eval_loss": 0.5085317492485046, - "eval_rewards/accuracies": 0.7319999933242798, - "eval_rewards/chosen": -1.4082709550857544, - "eval_rewards/margins": 0.9169085621833801, - "eval_rewards/rejected": -2.3251795768737793, - "eval_runtime": 782.9743, - "eval_samples_per_second": 2.554, - "eval_steps_per_second": 0.319, + "epoch": 0.75, + "eval_logits/chosen": -1.9388341903686523, + "eval_logits/rejected": -1.821057915687561, + "eval_logps/chosen": -857.1437377929688, + "eval_logps/rejected": -939.6195678710938, + "eval_loss": 1.0562974214553833, + "eval_rewards/accuracies": 0.5914999842643738, + "eval_rewards/chosen": -5.720094680786133, + "eval_rewards/margins": 1.0436511039733887, + "eval_rewards/rejected": -6.763746738433838, + "eval_runtime": 781.6527, + "eval_samples_per_second": 2.559, + "eval_steps_per_second": 0.32, "step": 2800 }, { - "epoch": 0.74, - "grad_norm": 9.75, - "learning_rate": 9.930917156425477e-07, - "logits/chosen": -1.88894522190094, - "logits/rejected": -1.7312339544296265, - "logps/chosen": -430.57806396484375, - "logps/rejected": -514.3462524414062, - "loss": 0.5414, - "rewards/accuracies": 0.7437499761581421, - "rewards/chosen": -1.4730045795440674, - "rewards/margins": 0.888375461101532, - "rewards/rejected": -2.361380100250244, + "epoch": 0.75, + "grad_norm": 10.5, + "learning_rate": 8.871814326517264e-07, + "logits/chosen": -1.7912660837173462, + "logits/rejected": -1.4652750492095947, + "logps/chosen": -845.84033203125, + "logps/rejected": -1647.9117431640625, + "loss": 0.1651, + "rewards/accuracies": 0.925000011920929, + "rewards/chosen": -6.425032615661621, + "rewards/margins": 8.037796974182129, + "rewards/rejected": -14.46282958984375, "step": 2810 }, { - "epoch": 0.74, - "grad_norm": 9.9375, - "learning_rate": 9.749266994893756e-07, - "logits/chosen": -1.753129243850708, - "logits/rejected": -1.7074140310287476, - "logps/chosen": -392.9122619628906, - "logps/rejected": -458.78887939453125, - "loss": 0.57, - "rewards/accuracies": 0.731249988079071, - "rewards/chosen": -1.3587015867233276, - "rewards/margins": 0.7806506156921387, - "rewards/rejected": -2.139352321624756, + "epoch": 0.75, + "grad_norm": 16.375, + "learning_rate": 8.694288160071746e-07, + "logits/chosen": -1.7436532974243164, + "logits/rejected": -1.343569040298462, + "logps/chosen": -901.9475708007812, + "logps/rejected": -1744.094970703125, + "loss": 0.1745, + "rewards/accuracies": 0.9125000238418579, + "rewards/chosen": -7.057692050933838, + "rewards/margins": 8.397699356079102, + "rewards/rejected": -15.455390930175781, "step": 2820 }, { - "epoch": 0.74, - "grad_norm": 12.875, - "learning_rate": 9.56889026517913e-07, - "logits/chosen": -1.8339742422103882, - "logits/rejected": -1.7476985454559326, - "logps/chosen": -428.970703125, - "logps/rejected": -468.741455078125, - "loss": 0.5567, - "rewards/accuracies": 0.668749988079071, - "rewards/chosen": -1.4643380641937256, - "rewards/margins": 0.7202334403991699, - "rewards/rejected": -2.1845715045928955, + "epoch": 0.76, + "grad_norm": 19.875, + "learning_rate": 8.518181541765904e-07, + "logits/chosen": -1.8024213314056396, + "logits/rejected": -1.5170562267303467, + "logps/chosen": -840.4650268554688, + "logps/rejected": -1586.270263671875, + "loss": 0.1989, + "rewards/accuracies": 0.9437500238418579, + "rewards/chosen": -6.36107063293457, + "rewards/margins": 7.343822479248047, + "rewards/rejected": -13.7048921585083, "step": 2830 }, { - "epoch": 0.74, - "grad_norm": 7.9375, - "learning_rate": 9.389802028686617e-07, - "logits/chosen": -1.8853356838226318, - "logits/rejected": -1.784654974937439, - "logps/chosen": -423.21673583984375, - "logps/rejected": -450.36773681640625, - "loss": 0.5783, - "rewards/accuracies": 0.7437499761581421, - "rewards/chosen": -1.4194176197052002, - "rewards/margins": 0.6203545331954956, - "rewards/rejected": -2.0397722721099854, + "epoch": 0.76, + "grad_norm": 1.8515625, + "learning_rate": 8.343509803148898e-07, + "logits/chosen": -1.760271668434143, + "logits/rejected": -1.4487478733062744, + "logps/chosen": -829.8733520507812, + "logps/rejected": -1621.2315673828125, + "loss": 0.2415, + "rewards/accuracies": 0.9125000238418579, + "rewards/chosen": -6.347391605377197, + "rewards/margins": 7.86527156829834, + "rewards/rejected": -14.212663650512695, "step": 2840 }, { - "epoch": 0.75, - "grad_norm": 5.78125, - "learning_rate": 9.212017239232427e-07, - "logits/chosen": -1.907448172569275, - "logits/rejected": -1.665366768836975, - "logps/chosen": -412.2789001464844, - "logps/rejected": -495.1942443847656, - "loss": 0.4473, - "rewards/accuracies": 0.793749988079071, - "rewards/chosen": -1.196616291999817, - "rewards/margins": 1.0811289548873901, - "rewards/rejected": -2.277745246887207, + "epoch": 0.76, + "grad_norm": 11.125, + "learning_rate": 8.170288150851635e-07, + "logits/chosen": -1.7877166271209717, + "logits/rejected": -1.4713761806488037, + "logps/chosen": -742.2184448242188, + "logps/rejected": -1476.7142333984375, + "loss": 0.1751, + "rewards/accuracies": 0.90625, + "rewards/chosen": -5.588148593902588, + "rewards/margins": 7.356741905212402, + "rewards/rejected": -12.944890022277832, "step": 2850 }, { - "epoch": 0.75, - "grad_norm": 6.9375, - "learning_rate": 9.03555074179533e-07, - "logits/chosen": -1.8094466924667358, - "logits/rejected": -1.8180068731307983, - "logps/chosen": -398.2765808105469, - "logps/rejected": -497.26824951171875, - "loss": 0.4864, - "rewards/accuracies": 0.7562500238418579, - "rewards/chosen": -1.2072433233261108, - "rewards/margins": 0.9678330421447754, - "rewards/rejected": -2.175076484680176, + "epoch": 0.76, + "grad_norm": 5.78125, + "learning_rate": 7.998531665262907e-07, + "logits/chosen": -1.813119888305664, + "logits/rejected": -1.519991397857666, + "logps/chosen": -784.0347900390625, + "logps/rejected": -1572.8804931640625, + "loss": 0.1347, + "rewards/accuracies": 0.9437500238418579, + "rewards/chosen": -5.94631290435791, + "rewards/margins": 7.838288307189941, + "rewards/rejected": -13.784601211547852, "step": 2860 }, { - "epoch": 0.75, - "grad_norm": 7.15625, - "learning_rate": 8.860417271277067e-07, - "logits/chosen": -1.9343254566192627, - "logits/rejected": -1.969805359840393, - "logps/chosen": -407.0105285644531, - "logps/rejected": -468.46826171875, - "loss": 0.5046, - "rewards/accuracies": 0.768750011920929, - "rewards/chosen": -1.1932955980300903, - "rewards/margins": 0.7251569628715515, - "rewards/rejected": -1.9184526205062866, + "epoch": 0.77, + "grad_norm": 2.640625, + "learning_rate": 7.828255299216553e-07, + "logits/chosen": -1.8793045282363892, + "logits/rejected": -1.6276319026947021, + "logps/chosen": -803.3926391601562, + "logps/rejected": -1561.659423828125, + "loss": 0.117, + "rewards/accuracies": 0.9624999761581421, + "rewards/chosen": -6.136702537536621, + "rewards/margins": 7.4938812255859375, + "rewards/rejected": -13.630582809448242, "step": 2870 }, { - "epoch": 0.75, - "grad_norm": 10.0, - "learning_rate": 8.686631451272029e-07, - "logits/chosen": -1.9621702432632446, - "logits/rejected": -1.7568639516830444, - "logps/chosen": -402.2386779785156, - "logps/rejected": -468.0703125, - "loss": 0.5113, - "rewards/accuracies": 0.75, - "rewards/chosen": -1.2953293323516846, - "rewards/margins": 0.9019441604614258, - "rewards/rejected": -2.1972732543945312, + "epoch": 0.77, + "grad_norm": 5.4375, + "learning_rate": 7.659473876689649e-07, + "logits/chosen": -1.7960598468780518, + "logits/rejected": -1.5569416284561157, + "logps/chosen": -826.9943237304688, + "logps/rejected": -1568.2489013671875, + "loss": 0.1817, + "rewards/accuracies": 0.918749988079071, + "rewards/chosen": -6.515174865722656, + "rewards/margins": 7.30636739730835, + "rewards/rejected": -13.821542739868164, "step": 2880 }, { - "epoch": 0.76, - "grad_norm": 6.75, - "learning_rate": 8.514207792846168e-07, - "logits/chosen": -1.8871433734893799, - "logits/rejected": -1.749696969985962, - "logps/chosen": -405.40374755859375, - "logps/rejected": -461.99078369140625, - "loss": 0.4967, - "rewards/accuracies": 0.75, - "rewards/chosen": -1.3629052639007568, - "rewards/margins": 0.8758969306945801, - "rewards/rejected": -2.238802433013916, + "epoch": 0.77, + "grad_norm": 11.375, + "learning_rate": 7.492202091511986e-07, + "logits/chosen": -1.8298485279083252, + "logits/rejected": -1.5398129224777222, + "logps/chosen": -814.5342407226562, + "logps/rejected": -1584.274169921875, + "loss": 0.2433, + "rewards/accuracies": 0.918749988079071, + "rewards/chosen": -6.155816078186035, + "rewards/margins": 7.666536808013916, + "rewards/rejected": -13.822354316711426, "step": 2890 }, { - "epoch": 0.76, - "grad_norm": 5.3125, - "learning_rate": 8.343160693325356e-07, - "logits/chosen": -1.8498756885528564, - "logits/rejected": -1.7222645282745361, - "logps/chosen": -402.5791931152344, - "logps/rejected": -493.0008239746094, - "loss": 0.5159, - "rewards/accuracies": 0.71875, - "rewards/chosen": -1.2853260040283203, - "rewards/margins": 0.957757294178009, - "rewards/rejected": -2.2430832386016846, + "epoch": 0.77, + "grad_norm": 15.4375, + "learning_rate": 7.326454506086844e-07, + "logits/chosen": -1.836544394493103, + "logits/rejected": -1.5596864223480225, + "logps/chosen": -795.0130004882812, + "logps/rejected": -1612.1021728515625, + "loss": 0.1734, + "rewards/accuracies": 0.918749988079071, + "rewards/chosen": -5.960467338562012, + "rewards/margins": 8.215749740600586, + "rewards/rejected": -14.176218032836914, "step": 2900 }, { - "epoch": 0.76, - "eval_logits/chosen": -1.9330179691314697, - "eval_logits/rejected": -1.8193044662475586, - "eval_logps/chosen": -419.8022155761719, - "eval_logps/rejected": -486.5226745605469, - "eval_loss": 0.5065543055534363, - "eval_rewards/accuracies": 0.7319999933242798, - "eval_rewards/chosen": -1.346679449081421, - "eval_rewards/margins": 0.8860979676246643, - "eval_rewards/rejected": -2.2327773571014404, - "eval_runtime": 784.023, - "eval_samples_per_second": 2.551, - "eval_steps_per_second": 0.319, + "epoch": 0.77, + "eval_logits/chosen": -1.9442803859710693, + "eval_logits/rejected": -1.8265658617019653, + "eval_logps/chosen": -834.2926635742188, + "eval_logps/rejected": -913.9391479492188, + "eval_loss": 1.024413824081421, + "eval_rewards/accuracies": 0.593999981880188, + "eval_rewards/chosen": -5.491583824157715, + "eval_rewards/margins": 1.015357494354248, + "eval_rewards/rejected": -6.506941795349121, + "eval_runtime": 781.3907, + "eval_samples_per_second": 2.56, + "eval_steps_per_second": 0.32, "step": 2900 }, { - "epoch": 0.76, - "grad_norm": 7.375, - "learning_rate": 8.173504435093174e-07, - "logits/chosen": -1.8513338565826416, - "logits/rejected": -1.6518259048461914, - "logps/chosen": -398.8111877441406, - "logps/rejected": -456.32086181640625, - "loss": 0.4881, - "rewards/accuracies": 0.7437499761581421, - "rewards/chosen": -1.3139582872390747, - "rewards/margins": 0.9640085101127625, - "rewards/rejected": -2.2779667377471924, + "epoch": 0.78, + "grad_norm": 11.5625, + "learning_rate": 7.162245550123193e-07, + "logits/chosen": -1.8039443492889404, + "logits/rejected": -1.4971197843551636, + "logps/chosen": -814.34326171875, + "logps/rejected": -1535.5086669921875, + "loss": 0.1568, + "rewards/accuracies": 0.9375, + "rewards/chosen": -6.270487308502197, + "rewards/margins": 7.268198490142822, + "rewards/rejected": -13.53868579864502, "step": 2910 }, { - "epoch": 0.76, - "grad_norm": 7.21875, - "learning_rate": 8.00525318439836e-07, - "logits/chosen": -1.9344298839569092, - "logits/rejected": -1.7016353607177734, - "logps/chosen": -431.385498046875, - "logps/rejected": -499.0389099121094, - "loss": 0.5575, - "rewards/accuracies": 0.737500011920929, - "rewards/chosen": -1.3274246454238892, - "rewards/margins": 0.7691916227340698, - "rewards/rejected": -2.09661602973938, + "epoch": 0.78, + "grad_norm": 12.0, + "learning_rate": 6.999589519379494e-07, + "logits/chosen": -1.7954915761947632, + "logits/rejected": -1.5133321285247803, + "logps/chosen": -779.1148071289062, + "logps/rejected": -1520.70703125, + "loss": 0.1384, + "rewards/accuracies": 0.9375, + "rewards/chosen": -5.82504415512085, + "rewards/margins": 7.360219478607178, + "rewards/rejected": -13.185264587402344, "step": 2920 }, { - "epoch": 0.77, - "grad_norm": 7.90625, - "learning_rate": 7.838420990171927e-07, - "logits/chosen": -1.9444793462753296, - "logits/rejected": -1.7388956546783447, - "logps/chosen": -421.0689392089844, - "logps/rejected": -473.1524353027344, - "loss": 0.5226, - "rewards/accuracies": 0.7437499761581421, - "rewards/chosen": -1.3027684688568115, - "rewards/margins": 0.813397228717804, - "rewards/rejected": -2.1161653995513916, + "epoch": 0.78, + "grad_norm": 8.5, + "learning_rate": 6.83850057441913e-07, + "logits/chosen": -1.8146121501922607, + "logits/rejected": -1.5364322662353516, + "logps/chosen": -776.1300659179688, + "logps/rejected": -1522.8975830078125, + "loss": 0.1867, + "rewards/accuracies": 0.9375, + "rewards/chosen": -5.84547758102417, + "rewards/margins": 7.345385551452637, + "rewards/rejected": -13.190862655639648, "step": 2930 }, { - "epoch": 0.77, - "grad_norm": 7.15625, - "learning_rate": 7.673021782854084e-07, - "logits/chosen": -1.836850881576538, - "logits/rejected": -1.6611976623535156, - "logps/chosen": -419.4012756347656, - "logps/rejected": -462.13726806640625, - "loss": 0.4885, - "rewards/accuracies": 0.762499988079071, - "rewards/chosen": -1.3087483644485474, - "rewards/margins": 0.983415961265564, - "rewards/rejected": -2.2921640872955322, + "epoch": 0.79, + "grad_norm": 18.875, + "learning_rate": 6.678992739377582e-07, + "logits/chosen": -1.8299884796142578, + "logits/rejected": -1.5080091953277588, + "logps/chosen": -812.95654296875, + "logps/rejected": -1622.835205078125, + "loss": 0.1807, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": -6.02605676651001, + "rewards/margins": 8.1326265335083, + "rewards/rejected": -14.158681869506836, "step": 2940 }, { - "epoch": 0.77, - "grad_norm": 5.625, - "learning_rate": 7.509069373231039e-07, - "logits/chosen": -1.8366082906723022, - "logits/rejected": -1.7366777658462524, - "logps/chosen": -413.8230895996094, - "logps/rejected": -465.357177734375, - "loss": 0.5603, - "rewards/accuracies": 0.699999988079071, - "rewards/chosen": -1.4296019077301025, - "rewards/margins": 0.7154251337051392, - "rewards/rejected": -2.145026922225952, + "epoch": 0.79, + "grad_norm": 55.5, + "learning_rate": 6.521079900741542e-07, + "logits/chosen": -1.767059087753296, + "logits/rejected": -1.5588467121124268, + "logps/chosen": -786.4676513671875, + "logps/rejected": -1427.7884521484375, + "loss": 0.2542, + "rewards/accuracies": 0.893750011920929, + "rewards/chosen": -5.954463005065918, + "rewards/margins": 6.433099269866943, + "rewards/rejected": -12.38756275177002, "step": 2950 }, { - "epoch": 0.77, - "grad_norm": 5.84375, - "learning_rate": 7.346577451281822e-07, - "logits/chosen": -1.8811330795288086, - "logits/rejected": -1.754575490951538, - "logps/chosen": -425.0521545410156, - "logps/rejected": -499.4877014160156, - "loss": 0.4909, - "rewards/accuracies": 0.7562500238418579, - "rewards/chosen": -1.4068305492401123, - "rewards/margins": 0.9745821952819824, - "rewards/rejected": -2.381412982940674, + "epoch": 0.79, + "grad_norm": 0.78515625, + "learning_rate": 6.364775806139975e-07, + "logits/chosen": -1.793999433517456, + "logits/rejected": -1.5091029405593872, + "logps/chosen": -839.9996337890625, + "logps/rejected": -1650.9144287109375, + "loss": 0.1235, + "rewards/accuracies": 0.9437500238418579, + "rewards/chosen": -6.430754661560059, + "rewards/margins": 8.038093566894531, + "rewards/rejected": -14.468847274780273, "step": 2960 }, { - "epoch": 0.78, - "grad_norm": 10.875, - "learning_rate": 7.185559585035138e-07, - "logits/chosen": -1.8809776306152344, - "logits/rejected": -1.7219616174697876, - "logps/chosen": -449.704833984375, - "logps/rejected": -526.0900268554688, - "loss": 0.4946, - "rewards/accuracies": 0.71875, - "rewards/chosen": -1.4797046184539795, - "rewards/margins": 0.9321640729904175, - "rewards/rejected": -2.4118688106536865, + "epoch": 0.79, + "grad_norm": 9.5625, + "learning_rate": 6.21009406314727e-07, + "logits/chosen": -1.8470089435577393, + "logits/rejected": -1.5226188898086548, + "logps/chosen": -806.4625244140625, + "logps/rejected": -1513.0128173828125, + "loss": 0.2682, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -6.04176139831543, + "rewards/margins": 7.0099334716796875, + "rewards/rejected": -13.0516939163208, "step": 2970 }, { - "epoch": 0.78, - "grad_norm": 10.375, - "learning_rate": 7.026029219436504e-07, - "logits/chosen": -1.8558896780014038, - "logits/rejected": -1.67585027217865, - "logps/chosen": -418.6376953125, - "logps/rejected": -502.877685546875, - "loss": 0.4795, - "rewards/accuracies": 0.762499988079071, - "rewards/chosen": -1.4395673274993896, - "rewards/margins": 0.9882482290267944, - "rewards/rejected": -2.4278156757354736, + "epoch": 0.8, + "grad_norm": 9.1875, + "learning_rate": 6.057048138098567e-07, + "logits/chosen": -1.8832355737686157, + "logits/rejected": -1.579980731010437, + "logps/chosen": -732.5565795898438, + "logps/rejected": -1567.3011474609375, + "loss": 0.1713, + "rewards/accuracies": 0.918749988079071, + "rewards/chosen": -5.4515252113342285, + "rewards/margins": 8.183982849121094, + "rewards/rejected": -13.635507583618164, "step": 2980 }, { - "epoch": 0.78, - "grad_norm": 9.1875, - "learning_rate": 6.867999675225523e-07, - "logits/chosen": -1.9322134256362915, - "logits/rejected": -1.7906997203826904, - "logps/chosen": -387.1766662597656, - "logps/rejected": -465.6539001464844, - "loss": 0.4869, - "rewards/accuracies": 0.71875, - "rewards/chosen": -1.3697566986083984, - "rewards/margins": 0.9596658945083618, - "rewards/rejected": -2.3294224739074707, + "epoch": 0.8, + "grad_norm": 4.78125, + "learning_rate": 5.90565135491743e-07, + "logits/chosen": -1.8270089626312256, + "logits/rejected": -1.6248699426651, + "logps/chosen": -783.6031494140625, + "logps/rejected": -1493.7191162109375, + "loss": 0.1782, + "rewards/accuracies": 0.90625, + "rewards/chosen": -6.045929431915283, + "rewards/margins": 7.007973670959473, + "rewards/rejected": -13.053903579711914, "step": 2990 }, { - "epoch": 0.79, - "grad_norm": 7.375, - "learning_rate": 6.711484147823663e-07, - "logits/chosen": -1.8498642444610596, - "logits/rejected": -1.770777940750122, - "logps/chosen": -388.5807189941406, - "logps/rejected": -491.8318786621094, - "loss": 0.4671, - "rewards/accuracies": 0.768750011920929, - "rewards/chosen": -1.3586634397506714, - "rewards/margins": 0.9807540774345398, - "rewards/rejected": -2.3394176959991455, + "epoch": 0.8, + "grad_norm": 13.375, + "learning_rate": 5.755916893955887e-07, + "logits/chosen": -1.8112058639526367, + "logits/rejected": -1.626491904258728, + "logps/chosen": -764.4779052734375, + "logps/rejected": -1465.9271240234375, + "loss": 0.2791, + "rewards/accuracies": 0.9125000238418579, + "rewards/chosen": -5.7741007804870605, + "rewards/margins": 6.939056396484375, + "rewards/rejected": -12.713155746459961, "step": 3000 }, { - "epoch": 0.79, - "eval_logits/chosen": -1.927390217781067, - "eval_logits/rejected": -1.813964605331421, - "eval_logps/chosen": -427.0750732421875, - "eval_logps/rejected": -493.886474609375, - "eval_loss": 0.5061513781547546, - "eval_rewards/accuracies": 0.7325000166893005, - "eval_rewards/chosen": -1.4194074869155884, - "eval_rewards/margins": 0.8870080709457397, - "eval_rewards/rejected": -2.306415557861328, - "eval_runtime": 782.9041, - "eval_samples_per_second": 2.555, - "eval_steps_per_second": 0.319, + "epoch": 0.8, + "eval_logits/chosen": -1.9510712623596191, + "eval_logits/rejected": -1.8336251974105835, + "eval_logps/chosen": -809.6419677734375, + "eval_logps/rejected": -884.2896118164062, + "eval_loss": 0.9939026832580566, + "eval_rewards/accuracies": 0.593999981880188, + "eval_rewards/chosen": -5.2450761795043945, + "eval_rewards/margins": 0.9653691649436951, + "eval_rewards/rejected": -6.210445880889893, + "eval_runtime": 781.2328, + "eval_samples_per_second": 2.56, + "eval_steps_per_second": 0.32, "step": 3000 }, { - "epoch": 0.79, - "grad_norm": 6.8125, - "learning_rate": 6.556495706232413e-07, - "logits/chosen": -1.8801040649414062, - "logits/rejected": -1.712304711341858, - "logps/chosen": -434.16717529296875, - "logps/rejected": -491.79571533203125, - "loss": 0.557, - "rewards/accuracies": 0.71875, - "rewards/chosen": -1.439712405204773, - "rewards/margins": 0.7891451120376587, - "rewards/rejected": -2.2288575172424316, + "epoch": 0.8, + "grad_norm": 2.640625, + "learning_rate": 5.607857790846932e-07, + "logits/chosen": -1.7774875164031982, + "logits/rejected": -1.4975415468215942, + "logps/chosen": -744.4974365234375, + "logps/rejected": -1493.6400146484375, + "loss": 0.1508, + "rewards/accuracies": 0.9312499761581421, + "rewards/chosen": -5.606020450592041, + "rewards/margins": 7.40170431137085, + "rewards/rejected": -13.007725715637207, "step": 3010 }, { - "epoch": 0.79, - "grad_norm": 7.90625, - "learning_rate": 6.403047291942057e-07, - "logits/chosen": -1.7626889944076538, - "logits/rejected": -1.604828119277954, - "logps/chosen": -396.83819580078125, - "logps/rejected": -452.27740478515625, - "loss": 0.5308, - "rewards/accuracies": 0.7124999761581421, - "rewards/chosen": -1.5018559694290161, - "rewards/margins": 0.8321312665939331, - "rewards/rejected": -2.3339874744415283, + "epoch": 0.81, + "grad_norm": 5.34375, + "learning_rate": 5.461486935369714e-07, + "logits/chosen": -1.8177086114883423, + "logits/rejected": -1.5261489152908325, + "logps/chosen": -819.9765625, + "logps/rejected": -1483.857666015625, + "loss": 0.2674, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -6.2067670822143555, + "rewards/margins": 6.750323295593262, + "rewards/rejected": -12.957090377807617, "step": 3020 }, { - "epoch": 0.79, - "grad_norm": 12.875, - "learning_rate": 6.251151717851023e-07, - "logits/chosen": -1.7991710901260376, - "logits/rejected": -1.764809012413025, - "logps/chosen": -398.560791015625, - "logps/rejected": -469.8736267089844, - "loss": 0.5408, - "rewards/accuracies": 0.7437499761581421, - "rewards/chosen": -1.4512755870819092, - "rewards/margins": 0.8837641477584839, - "rewards/rejected": -2.3350396156311035, + "epoch": 0.81, + "grad_norm": 9.5, + "learning_rate": 5.316817070327354e-07, + "logits/chosen": -1.816895842552185, + "logits/rejected": -1.521397352218628, + "logps/chosen": -768.5750732421875, + "logps/rejected": -1489.7919921875, + "loss": 0.2322, + "rewards/accuracies": 0.9375, + "rewards/chosen": -5.702691078186035, + "rewards/margins": 7.254155158996582, + "rewards/rejected": -12.956846237182617, "step": 3030 }, { - "epoch": 0.8, - "grad_norm": 4.75, - "learning_rate": 6.100821667196041e-07, - "logits/chosen": -2.0548505783081055, - "logits/rejected": -1.7320611476898193, - "logps/chosen": -432.7552185058594, - "logps/rejected": -450.3419494628906, - "loss": 0.5211, - "rewards/accuracies": 0.7250000238418579, - "rewards/chosen": -1.4007914066314697, - "rewards/margins": 0.8642256855964661, - "rewards/rejected": -2.26501727104187, + "epoch": 0.81, + "grad_norm": 15.6875, + "learning_rate": 5.173860790437563e-07, + "logits/chosen": -1.870107650756836, + "logits/rejected": -1.59227454662323, + "logps/chosen": -747.5250854492188, + "logps/rejected": -1507.5074462890625, + "loss": 0.1589, + "rewards/accuracies": 0.9375, + "rewards/chosen": -5.517577171325684, + "rewards/margins": 7.614779472351074, + "rewards/rejected": -13.132356643676758, "step": 3040 }, { - "epoch": 0.8, - "grad_norm": 7.15625, - "learning_rate": 5.952069692493062e-07, - "logits/chosen": -1.7902101278305054, - "logits/rejected": -1.6391757726669312, - "logps/chosen": -375.85205078125, - "logps/rejected": -483.9964904785156, - "loss": 0.4403, - "rewards/accuracies": 0.768750011920929, - "rewards/chosen": -1.279217004776001, - "rewards/margins": 1.0462336540222168, - "rewards/rejected": -2.3254504203796387, + "epoch": 0.81, + "grad_norm": 2.84375, + "learning_rate": 5.032630541236167e-07, + "logits/chosen": -1.7885679006576538, + "logits/rejected": -1.5312520265579224, + "logps/chosen": -728.3582153320312, + "logps/rejected": -1373.1923828125, + "loss": 0.2111, + "rewards/accuracies": 0.918749988079071, + "rewards/chosen": -5.529536724090576, + "rewards/margins": 6.345106601715088, + "rewards/rejected": -11.874643325805664, "step": 3050 }, { - "epoch": 0.8, - "grad_norm": 8.0, - "learning_rate": 5.80490821448918e-07, - "logits/chosen": -1.6928142309188843, - "logits/rejected": -1.7527170181274414, - "logps/chosen": -416.954345703125, - "logps/rejected": -565.76953125, - "loss": 0.4596, - "rewards/accuracies": 0.78125, - "rewards/chosen": -1.3452516794204712, - "rewards/margins": 1.0599815845489502, - "rewards/rejected": -2.405233383178711, + "epoch": 0.82, + "grad_norm": 11.8125, + "learning_rate": 4.893138617993681e-07, + "logits/chosen": -1.8757107257843018, + "logits/rejected": -1.4865679740905762, + "logps/chosen": -737.5164794921875, + "logps/rejected": -1416.6087646484375, + "loss": 0.1654, + "rewards/accuracies": 0.9375, + "rewards/chosen": -5.339116096496582, + "rewards/margins": 6.8614068031311035, + "rewards/rejected": -12.200522422790527, "step": 3060 }, { - "epoch": 0.8, - "grad_norm": 6.6875, - "learning_rate": 5.659349521125459e-07, - "logits/chosen": -1.9752018451690674, - "logits/rejected": -1.9083846807479858, - "logps/chosen": -435.5009765625, - "logps/rejected": -491.18365478515625, - "loss": 0.5258, - "rewards/accuracies": 0.768750011920929, - "rewards/chosen": -1.340466022491455, - "rewards/margins": 0.7914128303527832, - "rewards/rejected": -2.1318788528442383, + "epoch": 0.82, + "grad_norm": 4.9375, + "learning_rate": 4.755397164644812e-07, + "logits/chosen": -1.8473145961761475, + "logits/rejected": -1.6185519695281982, + "logps/chosen": -758.03515625, + "logps/rejected": -1406.828857421875, + "loss": 0.1818, + "rewards/accuracies": 0.918749988079071, + "rewards/chosen": -5.706026077270508, + "rewards/margins": 6.3536272048950195, + "rewards/rejected": -12.059653282165527, "step": 3070 }, { - "epoch": 0.81, + "epoch": 0.82, "grad_norm": 5.8125, - "learning_rate": 5.5154057665109e-07, - "logits/chosen": -1.8967254161834717, - "logits/rejected": -1.712303876876831, - "logps/chosen": -424.0162658691406, - "logps/rejected": -495.65972900390625, - "loss": 0.4923, - "rewards/accuracies": 0.7250000238418579, - "rewards/chosen": -1.4567795991897583, - "rewards/margins": 0.9943565130233765, - "rewards/rejected": -2.4511361122131348, + "learning_rate": 4.619418172731277e-07, + "logits/chosen": -1.8655025959014893, + "logits/rejected": -1.5346721410751343, + "logps/chosen": -714.9515380859375, + "logps/rejected": -1431.1217041015625, + "loss": 0.1951, + "rewards/accuracies": 0.893750011920929, + "rewards/chosen": -5.192868232727051, + "rewards/margins": 7.240492820739746, + "rewards/rejected": -12.433362007141113, "step": 3080 }, { - "epoch": 0.81, - "grad_norm": 5.25, - "learning_rate": 5.373088969907586e-07, - "logits/chosen": -1.9155843257904053, - "logits/rejected": -1.7840111255645752, - "logps/chosen": -438.22076416015625, - "logps/rejected": -466.3299865722656, - "loss": 0.479, - "rewards/accuracies": 0.7250000238418579, - "rewards/chosen": -1.3894039392471313, - "rewards/margins": 0.8652655482292175, - "rewards/rejected": -2.254669189453125, + "epoch": 0.83, + "grad_norm": 10.3125, + "learning_rate": 4.4852134803578446e-07, + "logits/chosen": -1.8112146854400635, + "logits/rejected": -1.541689157485962, + "logps/chosen": -722.6507568359375, + "logps/rejected": -1418.703857421875, + "loss": 0.2012, + "rewards/accuracies": 0.925000011920929, + "rewards/chosen": -5.355412483215332, + "rewards/margins": 6.820165157318115, + "rewards/rejected": -12.175577163696289, "step": 3090 }, { - "epoch": 0.81, - "grad_norm": 6.8125, - "learning_rate": 5.23241101472709e-07, - "logits/chosen": -1.8724768161773682, - "logits/rejected": -1.7467072010040283, - "logps/chosen": -429.61871337890625, - "logps/rejected": -504.12518310546875, - "loss": 0.4864, - "rewards/accuracies": 0.8125, - "rewards/chosen": -1.277355432510376, - "rewards/margins": 0.8578082919120789, - "rewards/rejected": -2.1351637840270996, + "epoch": 0.83, + "grad_norm": 25.0, + "learning_rate": 4.3527947711617197e-07, + "logits/chosen": -1.8729737997055054, + "logits/rejected": -1.5761798620224, + "logps/chosen": -731.1620483398438, + "logps/rejected": -1336.569091796875, + "loss": 0.2041, + "rewards/accuracies": 0.918749988079071, + "rewards/chosen": -5.474403381347656, + "rewards/margins": 6.022130489349365, + "rewards/rejected": -11.496532440185547, "step": 3100 }, { - "epoch": 0.81, - "eval_logits/chosen": -1.929145097732544, - "eval_logits/rejected": -1.815799593925476, - "eval_logps/chosen": -427.61724853515625, - "eval_logps/rejected": -494.0862731933594, - "eval_loss": 0.5058856010437012, - "eval_rewards/accuracies": 0.7329999804496765, - "eval_rewards/chosen": -1.4248294830322266, - "eval_rewards/margins": 0.8835842609405518, - "eval_rewards/rejected": -2.3084137439727783, - "eval_runtime": 783.2884, - "eval_samples_per_second": 2.553, - "eval_steps_per_second": 0.319, + "epoch": 0.83, + "eval_logits/chosen": -1.9610016345977783, + "eval_logits/rejected": -1.8434377908706665, + "eval_logps/chosen": -785.0302734375, + "eval_logps/rejected": -855.6673583984375, + "eval_loss": 0.9620700478553772, + "eval_rewards/accuracies": 0.5954999923706055, + "eval_rewards/chosen": -4.998960018157959, + "eval_rewards/margins": 0.9252643585205078, + "eval_rewards/rejected": -5.924223899841309, + "eval_runtime": 781.2922, + "eval_samples_per_second": 2.56, + "eval_steps_per_second": 0.32, "step": 3100 }, { - "epoch": 0.81, - "grad_norm": 6.03125, - "learning_rate": 5.09338364753818e-07, - "logits/chosen": -1.8905067443847656, - "logits/rejected": -1.7481834888458252, - "logps/chosen": -439.99053955078125, - "logps/rejected": -510.75732421875, - "loss": 0.5265, - "rewards/accuracies": 0.7250000238418579, - "rewards/chosen": -1.3464457988739014, - "rewards/margins": 0.8832794427871704, - "rewards/rejected": -2.2297251224517822, + "epoch": 0.83, + "grad_norm": 24.125, + "learning_rate": 4.2221735732953655e-07, + "logits/chosen": -1.8799352645874023, + "logits/rejected": -1.5558679103851318, + "logps/chosen": -739.7000732421875, + "logps/rejected": -1477.8525390625, + "loss": 0.1092, + "rewards/accuracies": 0.956250011920929, + "rewards/chosen": -5.51525354385376, + "rewards/margins": 7.406874179840088, + "rewards/rejected": -12.922128677368164, "step": 3110 }, { - "epoch": 0.82, - "grad_norm": 9.9375, - "learning_rate": 4.956018477086005e-07, - "logits/chosen": -1.8537441492080688, - "logits/rejected": -1.6520248651504517, - "logps/chosen": -441.00018310546875, - "logps/rejected": -497.1783142089844, - "loss": 0.5475, - "rewards/accuracies": 0.6875, - "rewards/chosen": -1.4708020687103271, - "rewards/margins": 0.8641169667243958, - "rewards/rejected": -2.334918737411499, + "epoch": 0.83, + "grad_norm": 4.1875, + "learning_rate": 4.0933612584229174e-07, + "logits/chosen": -1.8271852731704712, + "logits/rejected": -1.4540526866912842, + "logps/chosen": -776.713134765625, + "logps/rejected": -1541.1552734375, + "loss": 0.1872, + "rewards/accuracies": 0.925000011920929, + "rewards/chosen": -5.6901164054870605, + "rewards/margins": 7.6587700843811035, + "rewards/rejected": -13.348886489868164, "step": 3120 }, { - "epoch": 0.82, - "grad_norm": 7.84375, - "learning_rate": 4.820326973322764e-07, - "logits/chosen": -1.8311748504638672, - "logits/rejected": -1.7749812602996826, - "logps/chosen": -419.26483154296875, - "logps/rejected": -499.1497497558594, - "loss": 0.5304, - "rewards/accuracies": 0.7437499761581421, - "rewards/chosen": -1.489749789237976, - "rewards/margins": 0.8352252840995789, - "rewards/rejected": -2.3249752521514893, + "epoch": 0.84, + "grad_norm": 13.3125, + "learning_rate": 3.9663690407301644e-07, + "logits/chosen": -1.8121604919433594, + "logits/rejected": -1.4967972040176392, + "logps/chosen": -767.4049682617188, + "logps/rejected": -1499.587646484375, + "loss": 0.2297, + "rewards/accuracies": 0.8812500238418579, + "rewards/chosen": -5.7149529457092285, + "rewards/margins": 7.2757568359375, + "rewards/rejected": -12.990710258483887, "step": 3130 }, { - "epoch": 0.82, - "grad_norm": 7.21875, - "learning_rate": 4.686320466449981e-07, - "logits/chosen": -1.8751367330551147, - "logits/rejected": -1.65691339969635, - "logps/chosen": -396.12567138671875, - "logps/rejected": -495.40386962890625, - "loss": 0.4698, - "rewards/accuracies": 0.768750011920929, - "rewards/chosen": -1.3601388931274414, - "rewards/margins": 1.032128095626831, - "rewards/rejected": -2.3922669887542725, + "epoch": 0.84, + "grad_norm": 4.21875, + "learning_rate": 3.841207975948255e-07, + "logits/chosen": -1.8146421909332275, + "logits/rejected": -1.556372880935669, + "logps/chosen": -736.1464233398438, + "logps/rejected": -1403.8233642578125, + "loss": 0.1693, + "rewards/accuracies": 0.9312499761581421, + "rewards/chosen": -5.403313636779785, + "rewards/margins": 6.708578586578369, + "rewards/rejected": -12.111891746520996, "step": 3140 }, { - "epoch": 0.82, - "grad_norm": 6.5, - "learning_rate": 4.554010145972418e-07, - "logits/chosen": -1.9465984106063843, - "logits/rejected": -1.8031593561172485, - "logps/chosen": -430.3694763183594, - "logps/rejected": -502.8419494628906, - "loss": 0.5425, - "rewards/accuracies": 0.699999988079071, - "rewards/chosen": -1.4778169393539429, - "rewards/margins": 0.8588212728500366, - "rewards/rejected": -2.3366379737854004, + "epoch": 0.84, + "grad_norm": 5.40625, + "learning_rate": 3.717888960391222e-07, + "logits/chosen": -1.8087650537490845, + "logits/rejected": -1.6352970600128174, + "logps/chosen": -742.0700073242188, + "logps/rejected": -1389.765380859375, + "loss": 0.132, + "rewards/accuracies": 0.9375, + "rewards/chosen": -5.522411346435547, + "rewards/margins": 6.3892717361450195, + "rewards/rejected": -11.911683082580566, "step": 3150 }, { - "epoch": 0.83, - "grad_norm": 6.53125, - "learning_rate": 4.4234070597637455e-07, - "logits/chosen": -1.8906543254852295, - "logits/rejected": -1.8061383962631226, - "logps/chosen": -434.2395935058594, - "logps/rejected": -512.9703369140625, - "loss": 0.5175, - "rewards/accuracies": 0.7562500238418579, - "rewards/chosen": -1.405390977859497, - "rewards/margins": 0.8955022692680359, - "rewards/rejected": -2.3008930683135986, + "epoch": 0.84, + "grad_norm": 18.5, + "learning_rate": 3.5964227300073485e-07, + "logits/chosen": -1.8568089008331299, + "logits/rejected": -1.5052803754806519, + "logps/chosen": -732.3570556640625, + "logps/rejected": -1547.2830810546875, + "loss": 0.1797, + "rewards/accuracies": 0.9437500238418579, + "rewards/chosen": -5.472035884857178, + "rewards/margins": 8.027372360229492, + "rewards/rejected": -13.499404907226562, "step": 3160 }, { - "epoch": 0.83, - "grad_norm": 5.90625, - "learning_rate": 4.2945221131440783e-07, - "logits/chosen": -1.796657919883728, - "logits/rejected": -1.600589394569397, - "logps/chosen": -431.2503356933594, - "logps/rejected": -495.09710693359375, - "loss": 0.4632, - "rewards/accuracies": 0.8374999761581421, - "rewards/chosen": -1.4193048477172852, - "rewards/margins": 0.9972556233406067, - "rewards/rejected": -2.416560649871826, + "epoch": 0.85, + "grad_norm": 22.25, + "learning_rate": 3.4768198594445386e-07, + "logits/chosen": -1.7955982685089111, + "logits/rejected": -1.534495234489441, + "logps/chosen": -768.7007446289062, + "logps/rejected": -1410.9127197265625, + "loss": 0.2282, + "rewards/accuracies": 0.9125000238418579, + "rewards/chosen": -5.612591743469238, + "rewards/margins": 6.481339454650879, + "rewards/rejected": -12.0939302444458, "step": 3170 }, { - "epoch": 0.83, - "grad_norm": 6.3125, - "learning_rate": 4.167366067969381e-07, - "logits/chosen": -1.7535873651504517, - "logits/rejected": -1.7269706726074219, - "logps/chosen": -381.70745849609375, - "logps/rejected": -487.33544921875, - "loss": 0.5104, - "rewards/accuracies": 0.7250000238418579, - "rewards/chosen": -1.4208877086639404, - "rewards/margins": 0.8177056312561035, - "rewards/rejected": -2.238593578338623, + "epoch": 0.85, + "grad_norm": 7.59375, + "learning_rate": 3.359090761129671e-07, + "logits/chosen": -1.8354012966156006, + "logits/rejected": -1.6204370260238647, + "logps/chosen": -778.430419921875, + "logps/rejected": -1483.1397705078125, + "loss": 0.1945, + "rewards/accuracies": 0.90625, + "rewards/chosen": -5.689809799194336, + "rewards/margins": 6.9429473876953125, + "rewards/rejected": -12.632757186889648, "step": 3180 }, { - "epoch": 0.83, - "grad_norm": 5.78125, - "learning_rate": 4.041949541732826e-07, - "logits/chosen": -1.855145812034607, - "logits/rejected": -1.850327491760254, - "logps/chosen": -433.2710876464844, - "logps/rejected": -499.67047119140625, - "loss": 0.5162, - "rewards/accuracies": 0.731249988079071, - "rewards/chosen": -1.5070629119873047, - "rewards/margins": 0.8208478689193726, - "rewards/rejected": -2.327910900115967, + "epoch": 0.85, + "grad_norm": 6.46875, + "learning_rate": 3.2432456843621454e-07, + "logits/chosen": -1.8581463098526, + "logits/rejected": -1.5767614841461182, + "logps/chosen": -744.0443115234375, + "logps/rejected": -1415.0477294921875, + "loss": 0.1642, + "rewards/accuracies": 0.9312499761581421, + "rewards/chosen": -5.575232982635498, + "rewards/margins": 6.698835849761963, + "rewards/rejected": -12.274068832397461, "step": 3190 }, { - "epoch": 0.84, - "grad_norm": 5.9375, - "learning_rate": 3.9182830066782614e-07, - "logits/chosen": -1.8350633382797241, - "logits/rejected": -1.811108946800232, - "logps/chosen": -424.4688415527344, - "logps/rejected": -527.4735717773438, - "loss": 0.5101, - "rewards/accuracies": 0.737500011920929, - "rewards/chosen": -1.472731351852417, - "rewards/margins": 0.912972629070282, - "rewards/rejected": -2.3857038021087646, + "epoch": 0.85, + "grad_norm": 2.53125, + "learning_rate": 3.1292947144215795e-07, + "logits/chosen": -1.9127811193466187, + "logits/rejected": -1.5582940578460693, + "logps/chosen": -683.0210571289062, + "logps/rejected": -1300.957763671875, + "loss": 0.1699, + "rewards/accuracies": 0.9125000238418579, + "rewards/chosen": -5.052832126617432, + "rewards/margins": 6.252196311950684, + "rewards/rejected": -11.305027961730957, "step": 3200 }, { - "epoch": 0.84, - "eval_logits/chosen": -1.9300192594528198, - "eval_logits/rejected": -1.8167176246643066, - "eval_logps/chosen": -426.7279357910156, - "eval_logps/rejected": -493.0526123046875, - "eval_loss": 0.5056060552597046, - "eval_rewards/accuracies": 0.734000027179718, - "eval_rewards/chosen": -1.4159364700317383, - "eval_rewards/margins": 0.882140040397644, - "eval_rewards/rejected": -2.298076868057251, - "eval_runtime": 783.8657, - "eval_samples_per_second": 2.551, - "eval_steps_per_second": 0.319, + "epoch": 0.85, + "eval_logits/chosen": -1.9592163562774658, + "eval_logits/rejected": -1.8420151472091675, + "eval_logps/chosen": -793.1118774414062, + "eval_logps/rejected": -864.7924194335938, + "eval_loss": 0.9697643518447876, + "eval_rewards/accuracies": 0.5950000286102295, + "eval_rewards/chosen": -5.079776287078857, + "eval_rewards/margins": 0.9356989860534668, + "eval_rewards/rejected": -6.015475749969482, + "eval_runtime": 781.2877, + "eval_samples_per_second": 2.56, + "eval_steps_per_second": 0.32, "step": 3200 }, { - "epoch": 0.84, - "grad_norm": 7.125, - "learning_rate": 3.796376788925771e-07, - "logits/chosen": -1.8429349660873413, - "logits/rejected": -1.7482311725616455, - "logps/chosen": -427.6624450683594, - "logps/rejected": -486.4988708496094, - "loss": 0.5414, - "rewards/accuracies": 0.731249988079071, - "rewards/chosen": -1.3303483724594116, - "rewards/margins": 0.765957236289978, - "rewards/rejected": -2.0963053703308105, + "epoch": 0.86, + "grad_norm": 11.75, + "learning_rate": 3.0172477716897936e-07, + "logits/chosen": -1.882510781288147, + "logits/rejected": -1.5755535364151, + "logps/chosen": -773.5219116210938, + "logps/rejected": -1535.2706298828125, + "loss": 0.1173, + "rewards/accuracies": 0.9375, + "rewards/chosen": -5.749590873718262, + "rewards/margins": 7.536416530609131, + "rewards/rejected": -13.28600788116455, "step": 3210 }, { - "epoch": 0.84, - "grad_norm": 10.0, - "learning_rate": 3.676241067609465e-07, - "logits/chosen": -1.9111486673355103, - "logits/rejected": -1.818162202835083, - "logps/chosen": -458.59552001953125, - "logps/rejected": -496.85772705078125, - "loss": 0.5223, - "rewards/accuracies": 0.737500011920929, - "rewards/chosen": -1.3662760257720947, - "rewards/margins": 0.86688631772995, - "rewards/rejected": -2.2331624031066895, + "epoch": 0.86, + "grad_norm": 24.5, + "learning_rate": 2.907114610787179e-07, + "logits/chosen": -1.8907438516616821, + "logits/rejected": -1.600659728050232, + "logps/chosen": -763.7069091796875, + "logps/rejected": -1600.47412109375, + "loss": 0.1238, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": -5.60447883605957, + "rewards/margins": 8.27302360534668, + "rewards/rejected": -13.877504348754883, "step": 3220 }, { - "epoch": 0.85, - "grad_norm": 8.0625, - "learning_rate": 3.5578858740274976e-07, - "logits/chosen": -1.8537018299102783, - "logits/rejected": -1.7111694812774658, - "logps/chosen": -425.08203125, - "logps/rejected": -486.8157653808594, - "loss": 0.5413, - "rewards/accuracies": 0.699999988079071, - "rewards/chosen": -1.4430186748504639, - "rewards/margins": 0.7479842305183411, - "rewards/rejected": -2.191002607345581, + "epoch": 0.86, + "grad_norm": 13.4375, + "learning_rate": 2.798904819723453e-07, + "logits/chosen": -1.8532946109771729, + "logits/rejected": -1.6151609420776367, + "logps/chosen": -724.9547119140625, + "logps/rejected": -1497.78466796875, + "loss": 0.1694, + "rewards/accuracies": 0.9375, + "rewards/chosen": -5.479042053222656, + "rewards/margins": 7.565389156341553, + "rewards/rejected": -13.044431686401367, "step": 3230 }, { - "epoch": 0.85, - "grad_norm": 8.1875, - "learning_rate": 3.44132109080447e-07, - "logits/chosen": -1.9728435277938843, - "logits/rejected": -1.802259087562561, - "logps/chosen": -418.29925537109375, - "logps/rejected": -468.47509765625, - "loss": 0.4702, - "rewards/accuracies": 0.762499988079071, - "rewards/chosen": -1.3325908184051514, - "rewards/margins": 0.9086204767227173, - "rewards/rejected": -2.241211414337158, + "epoch": 0.87, + "grad_norm": 3.171875, + "learning_rate": 2.6926278190629624e-07, + "logits/chosen": -1.8193203210830688, + "logits/rejected": -1.5701786279678345, + "logps/chosen": -761.4595947265625, + "logps/rejected": -1493.7257080078125, + "loss": 0.1443, + "rewards/accuracies": 0.9437500238418579, + "rewards/chosen": -5.704920768737793, + "rewards/margins": 7.19381046295166, + "rewards/rejected": -12.898730278015137, "step": 3240 }, { - "epoch": 0.85, - "grad_norm": 4.25, - "learning_rate": 3.3265564510662344e-07, - "logits/chosen": -1.9281237125396729, - "logits/rejected": -1.8321959972381592, - "logps/chosen": -435.9788513183594, - "logps/rejected": -510.285888671875, - "loss": 0.459, - "rewards/accuracies": 0.768750011920929, - "rewards/chosen": -1.2192438840866089, - "rewards/margins": 1.0345146656036377, - "rewards/rejected": -2.253758430480957, + "epoch": 0.87, + "grad_norm": 29.375, + "learning_rate": 2.588292861104547e-07, + "logits/chosen": -1.8176294565200806, + "logits/rejected": -1.4913674592971802, + "logps/chosen": -757.4988403320312, + "logps/rejected": -1452.015869140625, + "loss": 0.2572, + "rewards/accuracies": 0.9312499761581421, + "rewards/chosen": -5.766777038574219, + "rewards/margins": 6.933912754058838, + "rewards/rejected": -12.700689315795898, "step": 3250 }, { - "epoch": 0.85, - "grad_norm": 11.6875, - "learning_rate": 3.213601537627195e-07, - "logits/chosen": -1.8366546630859375, - "logits/rejected": -1.7728904485702515, - "logps/chosen": -432.23724365234375, - "logps/rejected": -493.5435485839844, - "loss": 0.544, - "rewards/accuracies": 0.6875, - "rewards/chosen": -1.5417168140411377, - "rewards/margins": 0.8185436129570007, - "rewards/rejected": -2.360260486602783, + "epoch": 0.87, + "grad_norm": 17.5, + "learning_rate": 2.485909029076017e-07, + "logits/chosen": -1.8745567798614502, + "logits/rejected": -1.4693113565444946, + "logps/chosen": -770.8389892578125, + "logps/rejected": -1528.6536865234375, + "loss": 0.1999, + "rewards/accuracies": 0.9375, + "rewards/chosen": -5.627139091491699, + "rewards/margins": 7.68057107925415, + "rewards/rejected": -13.307708740234375, "step": 3260 }, { - "epoch": 0.86, - "grad_norm": 10.3125, - "learning_rate": 3.1024657821901063e-07, - "logits/chosen": -1.8914482593536377, - "logits/rejected": -1.7727159261703491, - "logps/chosen": -404.0108337402344, - "logps/rejected": -475.5201110839844, - "loss": 0.4975, - "rewards/accuracies": 0.762499988079071, - "rewards/chosen": -1.3568850755691528, - "rewards/margins": 0.8796807527542114, - "rewards/rejected": -2.2365658283233643, + "epoch": 0.87, + "grad_norm": 7.71875, + "learning_rate": 2.3854852363434294e-07, + "logits/chosen": -1.790553092956543, + "logits/rejected": -1.4430289268493652, + "logps/chosen": -768.5995483398438, + "logps/rejected": -1433.785888671875, + "loss": 0.1821, + "rewards/accuracies": 0.925000011920929, + "rewards/chosen": -5.625730991363525, + "rewards/margins": 6.759538173675537, + "rewards/rejected": -12.385269165039062, "step": 3270 }, { - "epoch": 0.86, - "grad_norm": 26.875, - "learning_rate": 2.9931584645585654e-07, - "logits/chosen": -1.8954200744628906, - "logits/rejected": -1.8291819095611572, - "logps/chosen": -429.4666442871094, - "logps/rejected": -514.4373168945312, - "loss": 0.5108, - "rewards/accuracies": 0.7749999761581421, - "rewards/chosen": -1.3573051691055298, - "rewards/margins": 0.8282718658447266, - "rewards/rejected": -2.185576915740967, + "epoch": 0.88, + "grad_norm": 7.78125, + "learning_rate": 2.2870302256350702e-07, + "logits/chosen": -1.8332099914550781, + "logits/rejected": -1.5230542421340942, + "logps/chosen": -758.5438842773438, + "logps/rejected": -1465.334716796875, + "loss": 0.2688, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -5.736861228942871, + "rewards/margins": 7.083024024963379, + "rewards/rejected": -12.819883346557617, "step": 3280 }, { - "epoch": 0.86, - "grad_norm": 5.59375, - "learning_rate": 2.885688711862136e-07, - "logits/chosen": -1.813812255859375, - "logits/rejected": -1.8214820623397827, - "logps/chosen": -422.4434509277344, - "logps/rejected": -510.93707275390625, - "loss": 0.5452, - "rewards/accuracies": 0.71875, - "rewards/chosen": -1.4395946264266968, - "rewards/margins": 0.9139968752861023, - "rewards/rejected": -2.3535916805267334, + "epoch": 0.88, + "grad_norm": 7.375, + "learning_rate": 2.1905525682803408e-07, + "logits/chosen": -1.7580779790878296, + "logits/rejected": -1.3955549001693726, + "logps/chosen": -742.316162109375, + "logps/rejected": -1486.322021484375, + "loss": 0.12, + "rewards/accuracies": 0.96875, + "rewards/chosen": -5.4525275230407715, + "rewards/margins": 7.50750732421875, + "rewards/rejected": -12.960034370422363, "step": 3290 }, { - "epoch": 0.86, - "grad_norm": 4.65625, - "learning_rate": 2.7800654977942486e-07, - "logits/chosen": -1.8207260370254517, - "logits/rejected": -1.6706082820892334, - "logps/chosen": -415.5560607910156, - "logps/rejected": -514.9617919921875, - "loss": 0.5317, - "rewards/accuracies": 0.762499988079071, - "rewards/chosen": -1.3756178617477417, - "rewards/margins": 0.8347378969192505, - "rewards/rejected": -2.210355758666992, + "epoch": 0.88, + "grad_norm": 7.84375, + "learning_rate": 2.0960606634635366e-07, + "logits/chosen": -1.7858350276947021, + "logits/rejected": -1.522297739982605, + "logps/chosen": -778.438720703125, + "logps/rejected": -1517.631591796875, + "loss": 0.1894, + "rewards/accuracies": 0.925000011920929, + "rewards/chosen": -5.899752616882324, + "rewards/margins": 7.459959506988525, + "rewards/rejected": -13.359713554382324, "step": 3300 }, { - "epoch": 0.86, - "eval_logits/chosen": -1.9272832870483398, - "eval_logits/rejected": -1.8138694763183594, - "eval_logps/chosen": -425.427978515625, - "eval_logps/rejected": -491.8742370605469, - "eval_loss": 0.5055835843086243, - "eval_rewards/accuracies": 0.7354999780654907, - "eval_rewards/chosen": -1.4029372930526733, - "eval_rewards/margins": 0.8833554983139038, - "eval_rewards/rejected": -2.2862930297851562, - "eval_runtime": 782.9474, - "eval_samples_per_second": 2.554, + "epoch": 0.88, + "eval_logits/chosen": -1.9565173387527466, + "eval_logits/rejected": -1.8393272161483765, + "eval_logps/chosen": -796.8450927734375, + "eval_logps/rejected": -869.2388916015625, + "eval_loss": 0.9730798602104187, + "eval_rewards/accuracies": 0.5975000262260437, + "eval_rewards/chosen": -5.11710786819458, + "eval_rewards/margins": 0.9428322315216064, + "eval_rewards/rejected": -6.059940338134766, + "eval_runtime": 784.0287, + "eval_samples_per_second": 2.551, "eval_steps_per_second": 0.319, "step": 3300 }, { - "epoch": 0.87, - "grad_norm": 8.125, - "learning_rate": 2.6762976418628797e-07, - "logits/chosen": -1.9211854934692383, - "logits/rejected": -1.741869568824768, - "logps/chosen": -377.3354797363281, - "logps/rejected": -417.59295654296875, - "loss": 0.5303, - "rewards/accuracies": 0.731249988079071, - "rewards/chosen": -1.3278093338012695, - "rewards/margins": 0.8446895480155945, - "rewards/rejected": -2.172498941421509, + "epoch": 0.88, + "grad_norm": 8.3125, + "learning_rate": 2.003562737492676e-07, + "logits/chosen": -1.7809603214263916, + "logits/rejected": -1.4814460277557373, + "logps/chosen": -758.2651977539062, + "logps/rejected": -1489.6539306640625, + "loss": 0.201, + "rewards/accuracies": 0.9312499761581421, + "rewards/chosen": -5.66076135635376, + "rewards/margins": 7.262644290924072, + "rewards/rejected": -12.923406600952148, "step": 3310 }, { - "epoch": 0.87, - "grad_norm": 6.375, - "learning_rate": 2.5743938086541354e-07, - "logits/chosen": -1.8673250675201416, - "logits/rejected": -1.7157970666885376, - "logps/chosen": -422.41583251953125, - "logps/rejected": -485.87786865234375, - "loss": 0.5085, - "rewards/accuracies": 0.737500011920929, - "rewards/chosen": -1.3918393850326538, - "rewards/margins": 0.9153121113777161, - "rewards/rejected": -2.3071513175964355, + "epoch": 0.89, + "grad_norm": 3.453125, + "learning_rate": 1.913066843083264e-07, + "logits/chosen": -1.8235909938812256, + "logits/rejected": -1.6878944635391235, + "logps/chosen": -703.0023193359375, + "logps/rejected": -1377.5068359375, + "loss": 0.1528, + "rewards/accuracies": 0.9312499761581421, + "rewards/chosen": -5.281097412109375, + "rewards/margins": 6.587450981140137, + "rewards/rejected": -11.868548393249512, "step": 3320 }, { - "epoch": 0.87, - "grad_norm": 10.6875, - "learning_rate": 2.4743625071087574e-07, - "logits/chosen": -1.9498252868652344, - "logits/rejected": -1.7927253246307373, - "logps/chosen": -422.6053161621094, - "logps/rejected": -502.58154296875, - "loss": 0.472, - "rewards/accuracies": 0.75, - "rewards/chosen": -1.277875304222107, - "rewards/margins": 1.0411653518676758, - "rewards/rejected": -2.3190407752990723, + "epoch": 0.89, + "grad_norm": 12.1875, + "learning_rate": 1.8245808586572738e-07, + "logits/chosen": -1.8138974905014038, + "logits/rejected": -1.5821456909179688, + "logps/chosen": -710.8995361328125, + "logps/rejected": -1410.432373046875, + "loss": 0.1257, + "rewards/accuracies": 0.9437500238418579, + "rewards/chosen": -5.360553741455078, + "rewards/margins": 6.941224098205566, + "rewards/rejected": -12.301777839660645, "step": 3330 }, { - "epoch": 0.87, - "grad_norm": 7.96875, - "learning_rate": 2.3762120898116498e-07, - "logits/chosen": -1.9012149572372437, - "logits/rejected": -1.7770229578018188, - "logps/chosen": -446.6949768066406, - "logps/rejected": -513.9698486328125, - "loss": 0.5254, - "rewards/accuracies": 0.7250000238418579, - "rewards/chosen": -1.564635992050171, - "rewards/margins": 0.7733919024467468, - "rewards/rejected": -2.3380281925201416, + "epoch": 0.89, + "grad_norm": 6.3125, + "learning_rate": 1.7381124876572757e-07, + "logits/chosen": -1.9142138957977295, + "logits/rejected": -1.6659278869628906, + "logps/chosen": -739.8534545898438, + "logps/rejected": -1462.5361328125, + "loss": 0.1216, + "rewards/accuracies": 0.9437500238418579, + "rewards/chosen": -5.500533103942871, + "rewards/margins": 7.069828987121582, + "rewards/rejected": -12.570362091064453, "step": 3340 }, { - "epoch": 0.88, - "grad_norm": 8.125, - "learning_rate": 2.2799507522944048e-07, - "logits/chosen": -1.7588342428207397, - "logits/rejected": -1.728158950805664, - "logps/chosen": -427.84002685546875, - "logps/rejected": -514.2551879882812, - "loss": 0.4674, - "rewards/accuracies": 0.7437499761581421, - "rewards/chosen": -1.3852746486663818, - "rewards/margins": 0.9436731338500977, - "rewards/rejected": -2.3289475440979004, + "epoch": 0.9, + "grad_norm": 6.59375, + "learning_rate": 1.6536692578757647e-07, + "logits/chosen": -1.852924108505249, + "logits/rejected": -1.5509072542190552, + "logps/chosen": -708.359375, + "logps/rejected": -1479.578857421875, + "loss": 0.115, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": -5.126400470733643, + "rewards/margins": 7.5811262130737305, + "rewards/rejected": -12.707525253295898, "step": 3350 }, { - "epoch": 0.88, - "grad_norm": 6.84375, - "learning_rate": 2.1855865323510056e-07, - "logits/chosen": -1.8500531911849976, - "logits/rejected": -1.7112195491790771, - "logps/chosen": -435.78485107421875, - "logps/rejected": -542.5267333984375, - "loss": 0.4701, - "rewards/accuracies": 0.7437499761581421, - "rewards/chosen": -1.3804128170013428, - "rewards/margins": 1.1414902210235596, - "rewards/rejected": -2.5219030380249023, + "epoch": 0.9, + "grad_norm": 12.8125, + "learning_rate": 1.571258520799804e-07, + "logits/chosen": -1.8212575912475586, + "logits/rejected": -1.5021746158599854, + "logps/chosen": -801.4302978515625, + "logps/rejected": -1474.905029296875, + "loss": 0.1442, + "rewards/accuracies": 0.9437500238418579, + "rewards/chosen": -6.002583980560303, + "rewards/margins": 6.724352836608887, + "rewards/rejected": -12.726938247680664, "step": 3360 }, { - "epoch": 0.88, - "grad_norm": 6.59375, - "learning_rate": 2.0931273093666575e-07, - "logits/chosen": -1.8536121845245361, - "logits/rejected": -1.6857569217681885, - "logps/chosen": -402.9033203125, - "logps/rejected": -488.2943420410156, - "loss": 0.4316, - "rewards/accuracies": 0.862500011920929, - "rewards/chosen": -1.427424669265747, - "rewards/margins": 1.077510118484497, - "rewards/rejected": -2.504934787750244, + "epoch": 0.9, + "grad_norm": 3.046875, + "learning_rate": 1.490887450971032e-07, + "logits/chosen": -1.8178669214248657, + "logits/rejected": -1.5735647678375244, + "logps/chosen": -782.1468505859375, + "logps/rejected": -1470.4990234375, + "loss": 0.2163, + "rewards/accuracies": 0.918749988079071, + "rewards/chosen": -6.029472351074219, + "rewards/margins": 6.646093845367432, + "rewards/rejected": -12.675565719604492, "step": 3370 }, { - "epoch": 0.88, - "grad_norm": 9.9375, - "learning_rate": 2.002580803659873e-07, - "logits/chosen": -1.80948007106781, - "logits/rejected": -1.7452131509780884, - "logps/chosen": -420.40924072265625, - "logps/rejected": -497.26751708984375, - "loss": 0.4773, - "rewards/accuracies": 0.768750011920929, - "rewards/chosen": -1.4817922115325928, - "rewards/margins": 0.893271803855896, - "rewards/rejected": -2.3750641345977783, + "epoch": 0.9, + "grad_norm": 4.4375, + "learning_rate": 1.4125630453610539e-07, + "logits/chosen": -1.8485219478607178, + "logits/rejected": -1.6230590343475342, + "logps/chosen": -785.03857421875, + "logps/rejected": -1493.5316162109375, + "loss": 0.1828, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -5.861344814300537, + "rewards/margins": 7.066938877105713, + "rewards/rejected": -12.928281784057617, "step": 3380 }, { - "epoch": 0.89, - "grad_norm": 5.125, - "learning_rate": 1.913954575837826e-07, - "logits/chosen": -1.9585930109024048, - "logits/rejected": -1.6200177669525146, - "logps/chosen": -426.95416259765625, - "logps/rejected": -465.089599609375, - "loss": 0.5055, - "rewards/accuracies": 0.7749999761581421, - "rewards/chosen": -1.3750698566436768, - "rewards/margins": 0.8868755102157593, - "rewards/rejected": -2.2619454860687256, + "epoch": 0.91, + "grad_norm": 10.625, + "learning_rate": 1.336292122762281e-07, + "logits/chosen": -1.7747573852539062, + "logits/rejected": -1.5330942869186401, + "logps/chosen": -760.6348266601562, + "logps/rejected": -1411.752685546875, + "loss": 0.1911, + "rewards/accuracies": 0.925000011920929, + "rewards/chosen": -5.722105979919434, + "rewards/margins": 6.500117301940918, + "rewards/rejected": -12.222223281860352, "step": 3390 }, { - "epoch": 0.89, - "grad_norm": 5.65625, - "learning_rate": 1.827256026165028e-07, - "logits/chosen": -1.9661176204681396, - "logits/rejected": -1.7149207592010498, - "logps/chosen": -461.3528747558594, - "logps/rejected": -499.0296936035156, - "loss": 0.4668, - "rewards/accuracies": 0.793749988079071, - "rewards/chosen": -1.2651516199111938, - "rewards/margins": 0.9840850830078125, - "rewards/rejected": -2.249236583709717, + "epoch": 0.91, + "grad_norm": 8.25, + "learning_rate": 1.2620813231943197e-07, + "logits/chosen": -1.842785120010376, + "logits/rejected": -1.5948874950408936, + "logps/chosen": -769.0888061523438, + "logps/rejected": -1497.182861328125, + "loss": 0.1929, + "rewards/accuracies": 0.918749988079071, + "rewards/chosen": -5.66414737701416, + "rewards/margins": 7.302771091461182, + "rewards/rejected": -12.966917037963867, "step": 3400 }, { - "epoch": 0.89, - "eval_logits/chosen": -1.9266204833984375, - "eval_logits/rejected": -1.8132041692733765, - "eval_logps/chosen": -425.77191162109375, - "eval_logps/rejected": -492.45269775390625, - "eval_loss": 0.505458652973175, - "eval_rewards/accuracies": 0.7350000143051147, - "eval_rewards/chosen": -1.4063764810562134, - "eval_rewards/margins": 0.8857010006904602, - "eval_rewards/rejected": -2.2920773029327393, - "eval_runtime": 783.1005, - "eval_samples_per_second": 2.554, - "eval_steps_per_second": 0.319, + "epoch": 0.91, + "eval_logits/chosen": -1.9565365314483643, + "eval_logits/rejected": -1.8393818140029907, + "eval_logps/chosen": -797.0282592773438, + "eval_logps/rejected": -869.4041748046875, + "eval_loss": 0.9733775854110718, + "eval_rewards/accuracies": 0.5945000052452087, + "eval_rewards/chosen": -5.118940353393555, + "eval_rewards/margins": 0.9426524639129639, + "eval_rewards/rejected": -6.061592102050781, + "eval_runtime": 781.5982, + "eval_samples_per_second": 2.559, + "eval_steps_per_second": 0.32, "step": 3400 }, { - "epoch": 0.89, - "grad_norm": 8.3125, - "learning_rate": 1.7424923939454274e-07, - "logits/chosen": -1.9129213094711304, - "logits/rejected": -1.7314773797988892, - "logps/chosen": -438.605224609375, - "logps/rejected": -489.36846923828125, - "loss": 0.4838, - "rewards/accuracies": 0.78125, - "rewards/chosen": -1.3883512020111084, - "rewards/margins": 0.9468286633491516, - "rewards/rejected": -2.3351798057556152, + "epoch": 0.91, + "grad_norm": 15.4375, + "learning_rate": 1.1899371073258947e-07, + "logits/chosen": -1.8293758630752563, + "logits/rejected": -1.5503754615783691, + "logps/chosen": -763.3578491210938, + "logps/rejected": -1477.820556640625, + "loss": 0.1756, + "rewards/accuracies": 0.9375, + "rewards/chosen": -5.740475654602051, + "rewards/margins": 7.120656490325928, + "rewards/rejected": -12.861132621765137, "step": 3410 }, { - "epoch": 0.9, - "grad_norm": 12.125, - "learning_rate": 1.6596707569179304e-07, - "logits/chosen": -1.9000225067138672, - "logits/rejected": -1.7462106943130493, - "logps/chosen": -441.75830078125, - "logps/rejected": -494.11846923828125, - "loss": 0.5098, - "rewards/accuracies": 0.75, - "rewards/chosen": -1.4242441654205322, - "rewards/margins": 0.8710432052612305, - "rewards/rejected": -2.2952873706817627, + "epoch": 0.91, + "grad_norm": 2.390625, + "learning_rate": 1.1198657559123887e-07, + "logits/chosen": -1.8027288913726807, + "logits/rejected": -1.5142316818237305, + "logps/chosen": -748.8643188476562, + "logps/rejected": -1528.968017578125, + "loss": 0.1251, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": -5.598520755767822, + "rewards/margins": 7.6766767501831055, + "rewards/rejected": -13.275197982788086, "step": 3420 }, { - "epoch": 0.9, - "grad_norm": 9.0625, - "learning_rate": 1.578798030665385e-07, - "logits/chosen": -1.9095699787139893, - "logits/rejected": -1.7264404296875, - "logps/chosen": -427.95294189453125, - "logps/rejected": -512.547607421875, - "loss": 0.4677, - "rewards/accuracies": 0.7562500238418579, - "rewards/chosen": -1.3406087160110474, - "rewards/margins": 1.0093950033187866, - "rewards/rejected": -2.350003719329834, + "epoch": 0.92, + "grad_norm": 14.4375, + "learning_rate": 1.0518733692490512e-07, + "logits/chosen": -1.809152603149414, + "logits/rejected": -1.4840996265411377, + "logps/chosen": -713.273681640625, + "logps/rejected": -1412.4957275390625, + "loss": 0.1484, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": -5.270025253295898, + "rewards/margins": 6.962747097015381, + "rewards/rejected": -12.232772827148438, "step": 3430 }, { - "epoch": 0.9, - "grad_norm": 5.71875, - "learning_rate": 1.499880968037165e-07, - "logits/chosen": -1.8476403951644897, - "logits/rejected": -1.7272007465362549, - "logps/chosen": -402.6236267089844, - "logps/rejected": -456.137451171875, - "loss": 0.4818, - "rewards/accuracies": 0.7749999761581421, - "rewards/chosen": -1.3089590072631836, - "rewards/margins": 0.9297110438346863, - "rewards/rejected": -2.2386701107025146, + "epoch": 0.92, + "grad_norm": 15.25, + "learning_rate": 9.859658666399291e-08, + "logits/chosen": -1.877356767654419, + "logits/rejected": -1.6048234701156616, + "logps/chosen": -761.9014892578125, + "logps/rejected": -1552.405517578125, + "loss": 0.1827, + "rewards/accuracies": 0.9125000238418579, + "rewards/chosen": -5.749403476715088, + "rewards/margins": 7.860726356506348, + "rewards/rejected": -13.610130310058594, "step": 3440 }, { - "epoch": 0.9, - "grad_norm": 8.375, - "learning_rate": 1.4229261585852805e-07, - "logits/chosen": -1.8726069927215576, - "logits/rejected": -1.814284324645996, - "logps/chosen": -413.341796875, - "logps/rejected": -482.2705993652344, - "loss": 0.4554, - "rewards/accuracies": 0.78125, - "rewards/chosen": -1.283301591873169, - "rewards/margins": 0.9565474390983582, - "rewards/rejected": -2.239849090576172, + "epoch": 0.92, + "grad_norm": 3.46875, + "learning_rate": 9.221489858825289e-08, + "logits/chosen": -1.8329734802246094, + "logits/rejected": -1.5475187301635742, + "logps/chosen": -776.5346069335938, + "logps/rejected": -1564.7760009765625, + "loss": 0.1415, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": -5.798548698425293, + "rewards/margins": 7.909842014312744, + "rewards/rejected": -13.708391189575195, "step": 3450 }, { - "epoch": 0.91, - "grad_norm": 7.6875, - "learning_rate": 1.3479400280141886e-07, - "logits/chosen": -1.8118232488632202, - "logits/rejected": -1.7649272680282593, - "logps/chosen": -407.5479736328125, - "logps/rejected": -503.00091552734375, - "loss": 0.5054, - "rewards/accuracies": 0.731249988079071, - "rewards/chosen": -1.4062145948410034, - "rewards/margins": 0.9560636281967163, - "rewards/rejected": -2.3622782230377197, + "epoch": 0.92, + "grad_norm": 9.0625, + "learning_rate": 8.604282827682942e-08, + "logits/chosen": -1.8030303716659546, + "logits/rejected": -1.5392242670059204, + "logps/chosen": -753.5026245117188, + "logps/rejected": -1457.5640869140625, + "loss": 0.1391, + "rewards/accuracies": 0.9437500238418579, + "rewards/chosen": -5.680572509765625, + "rewards/margins": 6.985505104064941, + "rewards/rejected": -12.666077613830566, "step": 3460 }, { - "epoch": 0.91, - "grad_norm": 7.78125, - "learning_rate": 1.2749288376442044e-07, - "logits/chosen": -1.8775228261947632, - "logits/rejected": -1.7272803783416748, - "logps/chosen": -452.64874267578125, - "logps/rejected": -483.62811279296875, - "loss": 0.4763, - "rewards/accuracies": 0.7250000238418579, - "rewards/chosen": -1.3288511037826538, - "rewards/margins": 0.9381489753723145, - "rewards/rejected": -2.267000198364258, + "epoch": 0.93, + "grad_norm": 7.90625, + "learning_rate": 8.008091305989313e-08, + "logits/chosen": -1.8074871301651, + "logits/rejected": -1.5014359951019287, + "logps/chosen": -736.6246337890625, + "logps/rejected": -1578.62548828125, + "loss": 0.112, + "rewards/accuracies": 0.956250011920929, + "rewards/chosen": -5.4359283447265625, + "rewards/margins": 8.487021446228027, + "rewards/rejected": -13.922948837280273, "step": 3470 }, { - "epoch": 0.91, - "grad_norm": 9.4375, - "learning_rate": 1.203898683888713e-07, - "logits/chosen": -1.9275810718536377, - "logits/rejected": -1.7312577962875366, - "logps/chosen": -407.8893737792969, - "logps/rejected": -480.2945861816406, - "loss": 0.5632, - "rewards/accuracies": 0.6875, - "rewards/chosen": -1.4742262363433838, - "rewards/margins": 0.7488548755645752, - "rewards/rejected": -2.22308087348938, + "epoch": 0.93, + "grad_norm": 18.375, + "learning_rate": 7.432967197186225e-08, + "logits/chosen": -1.8415800333023071, + "logits/rejected": -1.548337697982788, + "logps/chosen": -760.2332763671875, + "logps/rejected": -1478.7425537109375, + "loss": 0.1622, + "rewards/accuracies": 0.9312499761581421, + "rewards/chosen": -5.600264549255371, + "rewards/margins": 7.201352119445801, + "rewards/rejected": -12.801614761352539, "step": 3480 }, { - "epoch": 0.91, - "grad_norm": 7.71875, - "learning_rate": 1.1348554977451132e-07, - "logits/chosen": -1.9800020456314087, - "logits/rejected": -1.7983967065811157, - "logps/chosen": -435.8182678222656, - "logps/rejected": -486.7703552246094, - "loss": 0.4831, - "rewards/accuracies": 0.699999988079071, - "rewards/chosen": -1.3190518617630005, - "rewards/margins": 0.8798874616622925, - "rewards/rejected": -2.198939085006714, + "epoch": 0.93, + "grad_norm": 7.875, + "learning_rate": 6.878960570621568e-08, + "logits/chosen": -1.849948525428772, + "logits/rejected": -1.6014125347137451, + "logps/chosen": -792.1825561523438, + "logps/rejected": -1483.4019775390625, + "loss": 0.1436, + "rewards/accuracies": 0.9437500238418579, + "rewards/chosen": -5.94309139251709, + "rewards/margins": 6.779400825500488, + "rewards/rejected": -12.722491264343262, "step": 3490 }, { - "epoch": 0.92, - "grad_norm": 4.96875, - "learning_rate": 1.0678050442995802e-07, - "logits/chosen": -1.889638900756836, - "logits/rejected": -1.7416059970855713, - "logps/chosen": -442.87994384765625, - "logps/rejected": -472.72314453125, - "loss": 0.5671, - "rewards/accuracies": 0.71875, - "rewards/chosen": -1.4175615310668945, - "rewards/margins": 0.7715851068496704, - "rewards/rejected": -2.1891465187072754, + "epoch": 0.94, + "grad_norm": 7.5625, + "learning_rate": 6.346119657190396e-08, + "logits/chosen": -1.8813422918319702, + "logits/rejected": -1.5884922742843628, + "logps/chosen": -810.8375854492188, + "logps/rejected": -1503.427490234375, + "loss": 0.1222, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": -6.075570583343506, + "rewards/margins": 6.977696418762207, + "rewards/rejected": -13.053268432617188, "step": 3500 }, { - "epoch": 0.92, - "eval_logits/chosen": -1.9291415214538574, - "eval_logits/rejected": -1.8157764673233032, - "eval_logps/chosen": -425.4985656738281, - "eval_logps/rejected": -492.2395324707031, - "eval_loss": 0.5055854916572571, - "eval_rewards/accuracies": 0.734499990940094, - "eval_rewards/chosen": -1.4036431312561035, - "eval_rewards/margins": 0.8863027095794678, - "eval_rewards/rejected": -2.2899458408355713, - "eval_runtime": 783.003, - "eval_samples_per_second": 2.554, - "eval_steps_per_second": 0.319, + "epoch": 0.94, + "eval_logits/chosen": -1.9550334215164185, + "eval_logits/rejected": -1.837749719619751, + "eval_logps/chosen": -797.5565185546875, + "eval_logps/rejected": -869.9833984375, + "eval_loss": 0.9741838574409485, + "eval_rewards/accuracies": 0.5954999923706055, + "eval_rewards/chosen": -5.124222755432129, + "eval_rewards/margins": 0.9431625008583069, + "eval_rewards/rejected": -6.067384719848633, + "eval_runtime": 781.8275, + "eval_samples_per_second": 2.558, + "eval_steps_per_second": 0.32, "step": 3500 }, { - "epoch": 0.92, - "grad_norm": 6.4375, - "learning_rate": 1.0027529222456755e-07, - "logits/chosen": -1.8589719533920288, - "logits/rejected": -1.7069114446640015, - "logps/chosen": -408.1587219238281, - "logps/rejected": -483.13787841796875, - "loss": 0.4453, - "rewards/accuracies": 0.8125, - "rewards/chosen": -1.3498586416244507, - "rewards/margins": 0.9569522738456726, - "rewards/rejected": -2.3068108558654785, + "epoch": 0.94, + "grad_norm": 11.6875, + "learning_rate": 5.8344908451359315e-08, + "logits/chosen": -1.798156976699829, + "logits/rejected": -1.5139014720916748, + "logps/chosen": -691.9673461914062, + "logps/rejected": -1474.0509033203125, + "loss": 0.1717, + "rewards/accuracies": 0.925000011920929, + "rewards/chosen": -5.160942077636719, + "rewards/margins": 7.799500942230225, + "rewards/rejected": -12.960443496704102, "step": 3510 }, { - "epoch": 0.92, - "grad_norm": 4.875, - "learning_rate": 9.397045634168766e-08, - "logits/chosen": -1.8985093832015991, - "logits/rejected": -1.8638538122177124, - "logps/chosen": -413.01373291015625, - "logps/rejected": -521.2156982421875, - "loss": 0.4702, - "rewards/accuracies": 0.75, - "rewards/chosen": -1.2440009117126465, - "rewards/margins": 1.069000482559204, - "rewards/rejected": -2.3130011558532715, + "epoch": 0.94, + "grad_norm": 18.0, + "learning_rate": 5.344118676011173e-08, + "logits/chosen": -1.8956835269927979, + "logits/rejected": -1.5818126201629639, + "logps/chosen": -761.4198608398438, + "logps/rejected": -1536.6260986328125, + "loss": 0.1966, + "rewards/accuracies": 0.918749988079071, + "rewards/chosen": -5.5117411613464355, + "rewards/margins": 7.931551933288574, + "rewards/rejected": -13.443292617797852, "step": 3520 }, { - "epoch": 0.92, - "grad_norm": 11.3125, - "learning_rate": 8.78665232332998e-08, - "logits/chosen": -1.7806581258773804, - "logits/rejected": -1.7375141382217407, - "logps/chosen": -399.5974426269531, - "logps/rejected": -475.81280517578125, - "loss": 0.5006, - "rewards/accuracies": 0.737500011920929, - "rewards/chosen": -1.5022794008255005, - "rewards/margins": 0.8037067651748657, - "rewards/rejected": -2.305986166000366, + "epoch": 0.94, + "grad_norm": 20.25, + "learning_rate": 4.875045840801257e-08, + "logits/chosen": -1.9040454626083374, + "logits/rejected": -1.6035842895507812, + "logps/chosen": -798.6568603515625, + "logps/rejected": -1530.2841796875, + "loss": 0.2062, + "rewards/accuracies": 0.90625, + "rewards/chosen": -5.8380513191223145, + "rewards/margins": 7.351933479309082, + "rewards/rejected": -13.189984321594238, "step": 3530 }, { - "epoch": 0.93, - "grad_norm": 6.5, - "learning_rate": 8.196400257606208e-08, - "logits/chosen": -1.910846471786499, - "logits/rejected": -1.8186595439910889, - "logps/chosen": -435.6886291503906, - "logps/rejected": -532.4617309570312, - "loss": 0.4666, - "rewards/accuracies": 0.793749988079071, - "rewards/chosen": -1.325961709022522, - "rewards/margins": 1.0250279903411865, - "rewards/rejected": -2.350989818572998, + "epoch": 0.95, + "grad_norm": 7.28125, + "learning_rate": 4.427313176206621e-08, + "logits/chosen": -1.8618987798690796, + "logits/rejected": -1.5344703197479248, + "logps/chosen": -754.1892700195312, + "logps/rejected": -1544.8671875, + "loss": 0.1424, + "rewards/accuracies": 0.9624999761581421, + "rewards/chosen": -5.624054908752441, + "rewards/margins": 7.836933135986328, + "rewards/rejected": -13.46098804473877, "step": 3540 }, { - "epoch": 0.93, - "grad_norm": 6.5, - "learning_rate": 7.626338722875076e-08, - "logits/chosen": -1.8711729049682617, - "logits/rejected": -1.8197062015533447, - "logps/chosen": -405.9395446777344, - "logps/rejected": -484.9190368652344, - "loss": 0.5179, - "rewards/accuracies": 0.7250000238418579, - "rewards/chosen": -1.3086730241775513, - "rewards/margins": 0.8287800550460815, - "rewards/rejected": -2.137453317642212, + "epoch": 0.95, + "grad_norm": 11.1875, + "learning_rate": 4.000959661087961e-08, + "logits/chosen": -1.8265072107315063, + "logits/rejected": -1.576949119567871, + "logps/chosen": -750.4592895507812, + "logps/rejected": -1366.3184814453125, + "loss": 0.2174, + "rewards/accuracies": 0.9312499761581421, + "rewards/chosen": -5.603542804718018, + "rewards/margins": 6.079636573791504, + "rewards/rejected": -11.68317985534668, "step": 3550 }, { - "epoch": 0.93, - "grad_norm": 4.90625, - "learning_rate": 7.076515319110688e-08, - "logits/chosen": -1.8416671752929688, - "logits/rejected": -1.8124072551727295, - "logps/chosen": -407.7252502441406, - "logps/rejected": -458.68353271484375, - "loss": 0.5219, - "rewards/accuracies": 0.768750011920929, - "rewards/chosen": -1.3443095684051514, - "rewards/margins": 0.9537046551704407, - "rewards/rejected": -2.2980141639709473, + "epoch": 0.95, + "grad_norm": 3.578125, + "learning_rate": 3.596022413072886e-08, + "logits/chosen": -1.8795325756072998, + "logits/rejected": -1.6326515674591064, + "logps/chosen": -711.9053955078125, + "logps/rejected": -1451.708251953125, + "loss": 0.1559, + "rewards/accuracies": 0.918749988079071, + "rewards/chosen": -5.207059860229492, + "rewards/margins": 7.259676456451416, + "rewards/rejected": -12.46673583984375, "step": 3560 }, { - "epoch": 0.93, - "grad_norm": 6.125, - "learning_rate": 6.54697595640899e-08, - "logits/chosen": -1.893803358078003, - "logits/rejected": -1.7148902416229248, - "logps/chosen": -445.6398010253906, - "logps/rejected": -513.85888671875, - "loss": 0.4858, - "rewards/accuracies": 0.762499988079071, - "rewards/chosen": -1.334416389465332, - "rewards/margins": 0.9229806065559387, - "rewards/rejected": -2.257397174835205, + "epoch": 0.95, + "grad_norm": 2.953125, + "learning_rate": 3.2125366853243964e-08, + "logits/chosen": -1.8214938640594482, + "logits/rejected": -1.5755712985992432, + "logps/chosen": -723.0794067382812, + "logps/rejected": -1359.31689453125, + "loss": 0.2401, + "rewards/accuracies": 0.893750011920929, + "rewards/chosen": -5.46134090423584, + "rewards/margins": 6.4174017906188965, + "rewards/rejected": -11.878742218017578, "step": 3570 }, { - "epoch": 0.94, - "grad_norm": 6.15625, - "learning_rate": 6.037764851154426e-08, - "logits/chosen": -1.827797532081604, - "logits/rejected": -1.8340890407562256, - "logps/chosen": -417.6336975097656, - "logps/rejected": -509.90423583984375, - "loss": 0.4995, - "rewards/accuracies": 0.7562500238418579, - "rewards/chosen": -1.3264166116714478, - "rewards/margins": 0.8964270353317261, - "rewards/rejected": -2.222843647003174, + "epoch": 0.96, + "grad_norm": 12.1875, + "learning_rate": 2.8505358634718095e-08, + "logits/chosen": -1.7878949642181396, + "logits/rejected": -1.488386631011963, + "logps/chosen": -745.4979858398438, + "logps/rejected": -1443.983642578125, + "loss": 0.2291, + "rewards/accuracies": 0.9125000238418579, + "rewards/chosen": -5.672155857086182, + "rewards/margins": 6.9494218826293945, + "rewards/rejected": -12.621576309204102, "step": 3580 }, - { - "epoch": 0.94, - "grad_norm": 6.71875, - "learning_rate": 5.548924522327748e-08, - "logits/chosen": -1.8965423107147217, - "logits/rejected": -1.6809604167938232, - "logps/chosen": -421.51165771484375, - "logps/rejected": -491.33660888671875, - "loss": 0.5142, - "rewards/accuracies": 0.737500011920929, - "rewards/chosen": -1.3985979557037354, - "rewards/margins": 0.8676589727401733, - "rewards/rejected": -2.266256809234619, - "step": 3590 - }, - { - "epoch": 0.94, - "grad_norm": 10.625, - "learning_rate": 5.0804957879556915e-08, - "logits/chosen": -1.7934764623641968, - "logits/rejected": -1.7331079244613647, - "logps/chosen": -388.4217529296875, - "logps/rejected": -480.72711181640625, - "loss": 0.4708, - "rewards/accuracies": 0.7749999761581421, - "rewards/chosen": -1.4167721271514893, - "rewards/margins": 0.9230138659477234, - "rewards/rejected": -2.3397860527038574, - "step": 3600 - }, - { - "epoch": 0.94, - "eval_logits/chosen": -1.9261465072631836, - "eval_logits/rejected": -1.812709093093872, - "eval_logps/chosen": -425.6341857910156, - "eval_logps/rejected": -492.3602600097656, - "eval_loss": 0.5055687427520752, - "eval_rewards/accuracies": 0.734499990940094, - "eval_rewards/chosen": -1.4049991369247437, - "eval_rewards/margins": 0.8861536383628845, - "eval_rewards/rejected": -2.2911531925201416, - "eval_runtime": 783.0794, - "eval_samples_per_second": 2.554, - "eval_steps_per_second": 0.319, - "step": 3600 - }, - { - "epoch": 0.94, - "grad_norm": 8.625, - "learning_rate": 4.632517761702815e-08, - "logits/chosen": -1.7564241886138916, - "logits/rejected": -1.6173584461212158, - "logps/chosen": -408.35675048828125, - "logps/rejected": -496.64678955078125, - "loss": 0.459, - "rewards/accuracies": 0.7562500238418579, - "rewards/chosen": -1.4728659391403198, - "rewards/margins": 1.0330259799957275, - "rewards/rejected": -2.505892038345337, - "step": 3610 - }, - { - "epoch": 0.95, - "grad_norm": 10.0, - "learning_rate": 4.205027849605359e-08, - "logits/chosen": -1.8442957401275635, - "logits/rejected": -1.732021689414978, - "logps/chosen": -413.531494140625, - "logps/rejected": -455.7919006347656, - "loss": 0.5459, - "rewards/accuracies": 0.7437499761581421, - "rewards/chosen": -1.4587666988372803, - "rewards/margins": 0.7717809677124023, - "rewards/rejected": -2.2305476665496826, - "step": 3620 - }, - { - "epoch": 0.95, - "grad_norm": 7.9375, - "learning_rate": 3.798061746947995e-08, - "logits/chosen": -1.9753859043121338, - "logits/rejected": -1.7450494766235352, - "logps/chosen": -424.0469665527344, - "logps/rejected": -471.99188232421875, - "loss": 0.5102, - "rewards/accuracies": 0.731249988079071, - "rewards/chosen": -1.4073350429534912, - "rewards/margins": 0.8742674589157104, - "rewards/rejected": -2.2816028594970703, - "step": 3630 - }, - { - "epoch": 0.95, - "grad_norm": 9.0, - "learning_rate": 3.411653435283158e-08, - "logits/chosen": -1.921852469444275, - "logits/rejected": -1.7092326879501343, - "logps/chosen": -422.7461853027344, - "logps/rejected": -451.899658203125, - "loss": 0.5007, - "rewards/accuracies": 0.75, - "rewards/chosen": -1.311460256576538, - "rewards/margins": 0.8820557594299316, - "rewards/rejected": -2.1935160160064697, - "step": 3640 - }, { "epoch": 0.96, - "grad_norm": 7.0, - "learning_rate": 3.04583517959367e-08, - "logits/chosen": -1.9487841129302979, - "logits/rejected": -1.784651756286621, - "logps/chosen": -393.1403503417969, - "logps/rejected": -452.6136779785156, - "loss": 0.4832, - "rewards/accuracies": 0.7437499761581421, - "rewards/chosen": -1.2417455911636353, - "rewards/margins": 0.9175812005996704, - "rewards/rejected": -2.1593265533447266, - "step": 3650 + "grad_norm": 10.25, + "learning_rate": 2.5100514627042773e-08, + "logits/chosen": -1.8504598140716553, + "logits/rejected": -1.5906519889831543, + "logps/chosen": -742.5360107421875, + "logps/rejected": -1424.46142578125, + "loss": 0.137, + "rewards/accuracies": 0.9437500238418579, + "rewards/chosen": -5.449435234069824, + "rewards/margins": 6.750896453857422, + "rewards/rejected": -12.20033073425293, + "step": 3590 }, { "epoch": 0.96, - "grad_norm": 7.8125, - "learning_rate": 2.7006375255985984e-08, - "logits/chosen": -1.8327049016952515, - "logits/rejected": -1.7989648580551147, - "logps/chosen": -442.2281799316406, - "logps/rejected": -505.47674560546875, - "loss": 0.5841, - "rewards/accuracies": 0.65625, - "rewards/chosen": -1.5012400150299072, - "rewards/margins": 0.6502693891525269, - "rewards/rejected": -2.1515092849731445, - "step": 3660 + "grad_norm": 5.71875, + "learning_rate": 2.1911131250271778e-08, + "logits/chosen": -1.8526302576065063, + "logits/rejected": -1.6113744974136353, + "logps/chosen": -719.0070190429688, + "logps/rejected": -1461.58642578125, + "loss": 0.1486, + "rewards/accuracies": 0.925000011920929, + "rewards/chosen": -5.213129043579102, + "rewards/margins": 7.295968532562256, + "rewards/rejected": -12.5090970993042, + "step": 3600 }, { "epoch": 0.96, - "grad_norm": 8.5625, - "learning_rate": 2.3760892972027328e-08, - "logits/chosen": -1.9631707668304443, - "logits/rejected": -1.7359784841537476, - "logps/chosen": -437.84423828125, - "logps/rejected": -484.88470458984375, - "loss": 0.5526, - "rewards/accuracies": 0.706250011920929, - "rewards/chosen": -1.477554440498352, - "rewards/margins": 0.8330958485603333, - "rewards/rejected": -2.31065034866333, - "step": 3670 + "eval_logits/chosen": -1.9550325870513916, + "eval_logits/rejected": -1.8377885818481445, + "eval_logps/chosen": -797.7341918945312, + "eval_logps/rejected": -870.1893310546875, + "eval_loss": 0.9740959405899048, + "eval_rewards/accuracies": 0.5954999923706055, + "eval_rewards/chosen": -5.125999450683594, + "eval_rewards/margins": 0.9434443116188049, + "eval_rewards/rejected": -6.0694427490234375, + "eval_runtime": 780.393, + "eval_samples_per_second": 2.563, + "eval_steps_per_second": 0.32, + "step": 3600 }, { "epoch": 0.96, "grad_norm": 9.4375, - "learning_rate": 2.072217594089765e-08, - "logits/chosen": -1.8473793268203735, - "logits/rejected": -1.8018176555633545, - "logps/chosen": -421.2726135253906, - "logps/rejected": -516.998291015625, - "loss": 0.4353, - "rewards/accuracies": 0.800000011920929, - "rewards/chosen": -1.4261518716812134, - "rewards/margins": 1.0697355270385742, - "rewards/rejected": -2.495887279510498, - "step": 3680 - }, - { - "epoch": 0.97, - "grad_norm": 9.6875, - "learning_rate": 1.789047789459375e-08, - "logits/chosen": -1.9341415166854858, - "logits/rejected": -1.7410176992416382, - "logps/chosen": -475.49359130859375, - "logps/rejected": -517.1406860351562, - "loss": 0.4974, - "rewards/accuracies": 0.768750011920929, - "rewards/chosen": -1.3300564289093018, - "rewards/margins": 0.9742846488952637, - "rewards/rejected": -2.3043410778045654, - "step": 3690 - }, - { - "epoch": 0.97, - "grad_norm": 6.71875, - "learning_rate": 1.5266035279088708e-08, - "logits/chosen": -1.7829246520996094, - "logits/rejected": -1.5944633483886719, - "logps/chosen": -466.7164611816406, - "logps/rejected": -525.4857177734375, - "loss": 0.4904, - "rewards/accuracies": 0.7562500238418579, - "rewards/chosen": -1.4383667707443237, - "rewards/margins": 0.8766821026802063, - "rewards/rejected": -2.315048933029175, - "step": 3700 + "learning_rate": 1.8937486166814568e-08, + "logits/chosen": -1.9209082126617432, + "logits/rejected": -1.6933987140655518, + "logps/chosen": -755.1649780273438, + "logps/rejected": -1393.343994140625, + "loss": 0.174, + "rewards/accuracies": 0.9312499761581421, + "rewards/chosen": -5.591456413269043, + "rewards/margins": 6.302043437957764, + "rewards/rejected": -11.893500328063965, + "step": 3610 }, { "epoch": 0.97, - "eval_logits/chosen": -1.9288603067398071, - "eval_logits/rejected": -1.8154667615890503, - "eval_logps/chosen": -425.60430908203125, - "eval_logps/rejected": -492.37359619140625, - "eval_loss": 0.5054319500923157, - "eval_rewards/accuracies": 0.7354999780654907, - "eval_rewards/chosen": -1.4047002792358398, - "eval_rewards/margins": 0.8865863084793091, - "eval_rewards/rejected": -2.2912867069244385, - "eval_runtime": 782.1538, - "eval_samples_per_second": 2.557, - "eval_steps_per_second": 0.32, - "step": 3700 + "grad_norm": 8.6875, + "learning_rate": 1.6179838257263935e-08, + "logits/chosen": -1.8362079858779907, + "logits/rejected": -1.5869410037994385, + "logps/chosen": -773.01318359375, + "logps/rejected": -1502.8729248046875, + "loss": 0.2091, + "rewards/accuracies": 0.918749988079071, + "rewards/chosen": -5.817551612854004, + "rewards/margins": 7.409367561340332, + "rewards/rejected": -13.22691822052002, + "step": 3620 }, { "epoch": 0.97, - "grad_norm": 10.5, - "learning_rate": 1.2849067234584623e-08, - "logits/chosen": -1.7769056558609009, - "logits/rejected": -1.7184251546859741, - "logps/chosen": -394.2373046875, - "logps/rejected": -475.7364807128906, - "loss": 0.5047, - "rewards/accuracies": 0.737500011920929, - "rewards/chosen": -1.3951432704925537, - "rewards/margins": 0.919331431388855, - "rewards/rejected": -2.314474582672119, - "step": 3710 + "grad_norm": 5.0, + "learning_rate": 1.363842759785794e-08, + "logits/chosen": -1.849810004234314, + "logits/rejected": -1.6076825857162476, + "logps/chosen": -761.3151245117188, + "logps/rejected": -1389.025146484375, + "loss": 0.1852, + "rewards/accuracies": 0.9312499761581421, + "rewards/chosen": -5.615696907043457, + "rewards/margins": 6.342384338378906, + "rewards/rejected": -11.958080291748047, + "step": 3630 }, { "epoch": 0.97, - "grad_norm": 9.5625, - "learning_rate": 1.0639775577218625e-08, - "logits/chosen": -1.78192138671875, - "logits/rejected": -1.6007667779922485, - "logps/chosen": -411.0592346191406, - "logps/rejected": -470.96612548828125, - "loss": 0.4938, - "rewards/accuracies": 0.737500011920929, - "rewards/chosen": -1.4045506715774536, - "rewards/margins": 0.9757940173149109, - "rewards/rejected": -2.380344867706299, - "step": 3720 + "grad_norm": 23.75, + "learning_rate": 1.1313475439580225e-08, + "logits/chosen": -1.8430496454238892, + "logits/rejected": -1.4863206148147583, + "logps/chosen": -747.9796752929688, + "logps/rejected": -1544.6732177734375, + "loss": 0.1458, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": -5.6696696281433105, + "rewards/margins": 7.953704833984375, + "rewards/rejected": -13.623373031616211, + "step": 3640 }, { "epoch": 0.98, - "grad_norm": 7.5, - "learning_rate": 8.638344782207486e-09, - "logits/chosen": -1.8548873662948608, - "logits/rejected": -1.6706825494766235, - "logps/chosen": -405.6194763183594, - "logps/rejected": -470.0986328125, - "loss": 0.5011, - "rewards/accuracies": 0.7250000238418579, - "rewards/chosen": -1.2951724529266357, - "rewards/margins": 0.9182003736495972, - "rewards/rejected": -2.2133727073669434, - "step": 3730 + "grad_norm": 16.75, + "learning_rate": 9.205184188895988e-09, + "logits/chosen": -1.8420413732528687, + "logits/rejected": -1.5764353275299072, + "logps/chosen": -797.4532470703125, + "logps/rejected": -1453.536865234375, + "loss": 0.1949, + "rewards/accuracies": 0.9312499761581421, + "rewards/chosen": -5.8908371925354, + "rewards/margins": 6.618434906005859, + "rewards/rejected": -12.509271621704102, + "step": 3650 }, { "epoch": 0.98, - "grad_norm": 8.3125, - "learning_rate": 6.84494196844715e-09, - "logits/chosen": -1.8603630065917969, - "logits/rejected": -1.7809423208236694, - "logps/chosen": -431.21478271484375, - "logps/rejected": -528.8220825195312, - "loss": 0.4502, - "rewards/accuracies": 0.7749999761581421, - "rewards/chosen": -1.3780217170715332, - "rewards/margins": 1.1094415187835693, - "rewards/rejected": -2.4874634742736816, - "step": 3740 + "grad_norm": 4.8125, + "learning_rate": 7.313737390133857e-09, + "logits/chosen": -1.8078467845916748, + "logits/rejected": -1.4992420673370361, + "logps/chosen": -765.2530517578125, + "logps/rejected": -1426.178955078125, + "loss": 0.2101, + "rewards/accuracies": 0.90625, + "rewards/chosen": -5.691425323486328, + "rewards/margins": 6.622584342956543, + "rewards/rejected": -12.314008712768555, + "step": 3660 }, { "epoch": 0.98, - "grad_norm": 5.75, - "learning_rate": 5.259716884556121e-09, - "logits/chosen": -1.9321876764297485, - "logits/rejected": -1.7817065715789795, - "logps/chosen": -427.5755920410156, - "logps/rejected": -513.5477294921875, - "loss": 0.4795, - "rewards/accuracies": 0.7875000238418579, - "rewards/chosen": -1.339813470840454, - "rewards/margins": 0.9209781885147095, - "rewards/rejected": -2.260791778564453, - "step": 3750 + "grad_norm": 6.34375, + "learning_rate": 5.6392997095047756e-09, + "logits/chosen": -1.7872775793075562, + "logits/rejected": -1.508326768875122, + "logps/chosen": -758.366943359375, + "logps/rejected": -1513.539794921875, + "loss": 0.1763, + "rewards/accuracies": 0.925000011920929, + "rewards/chosen": -5.68167781829834, + "rewards/margins": 7.58060359954834, + "rewards/rejected": -13.26228141784668, + "step": 3670 }, { "epoch": 0.98, - "grad_norm": 7.625, - "learning_rate": 3.882801896372967e-09, - "logits/chosen": -1.8706543445587158, - "logits/rejected": -1.8746263980865479, - "logps/chosen": -417.9456481933594, - "logps/rejected": -473.9695739746094, - "loss": 0.5329, - "rewards/accuracies": 0.706250011920929, - "rewards/chosen": -1.338267207145691, - "rewards/margins": 0.8854736089706421, - "rewards/rejected": -2.223741054534912, - "step": 3760 + "grad_norm": 2.640625, + "learning_rate": 4.182016920766529e-09, + "logits/chosen": -1.8777058124542236, + "logits/rejected": -1.6157386302947998, + "logps/chosen": -753.9669799804688, + "logps/rejected": -1563.1689453125, + "loss": 0.1037, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": -5.5306901931762695, + "rewards/margins": 7.934181213378906, + "rewards/rejected": -13.464871406555176, + "step": 3680 }, { "epoch": 0.99, - "grad_norm": 5.90625, - "learning_rate": 2.7143119759026614e-09, - "logits/chosen": -1.898192048072815, - "logits/rejected": -1.680114507675171, - "logps/chosen": -445.6121520996094, - "logps/rejected": -501.625732421875, - "loss": 0.4729, - "rewards/accuracies": 0.78125, - "rewards/chosen": -1.3821141719818115, - "rewards/margins": 0.848861575126648, - "rewards/rejected": -2.23097562789917, - "step": 3770 + "grad_norm": 2.859375, + "learning_rate": 2.942015892534178e-09, + "logits/chosen": -1.7432610988616943, + "logits/rejected": -1.4490609169006348, + "logps/chosen": -728.8178100585938, + "logps/rejected": -1425.911376953125, + "loss": 0.1537, + "rewards/accuracies": 0.918749988079071, + "rewards/chosen": -5.359938621520996, + "rewards/margins": 7.0258660316467285, + "rewards/rejected": -12.385805130004883, + "step": 3690 }, { "epoch": 0.99, - "grad_norm": 8.5, - "learning_rate": 1.754344691717591e-09, - "logits/chosen": -1.8438247442245483, - "logits/rejected": -1.7638375759124756, - "logps/chosen": -413.7787170410156, - "logps/rejected": -505.7743225097656, - "loss": 0.5463, - "rewards/accuracies": 0.668749988079071, - "rewards/chosen": -1.4394727945327759, - "rewards/margins": 0.7100211381912231, - "rewards/rejected": -2.149493932723999, - "step": 3780 + "grad_norm": 8.75, + "learning_rate": 1.9194045772336077e-09, + "logits/chosen": -1.8491967916488647, + "logits/rejected": -1.5299230813980103, + "logps/chosen": -698.197998046875, + "logps/rejected": -1445.8939208984375, + "loss": 0.1384, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": -5.197235584259033, + "rewards/margins": 7.472482204437256, + "rewards/rejected": -12.669717788696289, + "step": 3700 }, { "epoch": 0.99, - "grad_norm": 11.5625, - "learning_rate": 1.0029802008096335e-09, - "logits/chosen": -1.8373730182647705, - "logits/rejected": -1.7250454425811768, - "logps/chosen": -439.18878173828125, - "logps/rejected": -509.8990173339844, - "loss": 0.5098, - "rewards/accuracies": 0.71875, - "rewards/chosen": -1.4585397243499756, - "rewards/margins": 0.936810314655304, - "rewards/rejected": -2.3953499794006348, - "step": 3790 + "eval_logits/chosen": -1.9538819789886475, + "eval_logits/rejected": -1.8366447687149048, + "eval_logps/chosen": -797.7002563476562, + "eval_logps/rejected": -870.1065063476562, + "eval_loss": 0.974000096321106, + "eval_rewards/accuracies": 0.5950000286102295, + "eval_rewards/chosen": -5.125659465789795, + "eval_rewards/margins": 0.9429554343223572, + "eval_rewards/rejected": -6.068615436553955, + "eval_runtime": 786.1332, + "eval_samples_per_second": 2.544, + "eval_steps_per_second": 0.318, + "step": 3700 }, { "epoch": 0.99, - "grad_norm": 5.53125, - "learning_rate": 4.602812418974534e-10, - "logits/chosen": -1.9282503128051758, - "logits/rejected": -1.8350452184677124, - "logps/chosen": -442.35992431640625, - "logps/rejected": -509.1976013183594, - "loss": 0.5001, - "rewards/accuracies": 0.7250000238418579, - "rewards/chosen": -1.3592561483383179, - "rewards/margins": 0.9274970293045044, - "rewards/rejected": -2.2867531776428223, - "step": 3800 + "grad_norm": 7.625, + "learning_rate": 1.1142720017040532e-09, + "logits/chosen": -1.829294204711914, + "logits/rejected": -1.6115293502807617, + "logps/chosen": -751.1514892578125, + "logps/rejected": -1485.13427734375, + "loss": 0.2801, + "rewards/accuracies": 0.918749988079071, + "rewards/chosen": -5.69688081741333, + "rewards/margins": 7.314256191253662, + "rewards/rejected": -13.011137008666992, + "step": 3710 }, { "epoch": 0.99, - "eval_logits/chosen": -1.9264984130859375, - "eval_logits/rejected": -1.813086986541748, - "eval_logps/chosen": -425.7129821777344, - "eval_logps/rejected": -492.45635986328125, - "eval_loss": 0.5055971741676331, - "eval_rewards/accuracies": 0.734499990940094, - "eval_rewards/chosen": -1.4057865142822266, - "eval_rewards/margins": 0.8863277435302734, - "eval_rewards/rejected": -2.2921142578125, - "eval_runtime": 782.8512, - "eval_samples_per_second": 2.555, - "eval_steps_per_second": 0.319, - "step": 3800 + "grad_norm": 9.125, + "learning_rate": 5.266882594481826e-10, + "logits/chosen": -1.890520691871643, + "logits/rejected": -1.6448471546173096, + "logps/chosen": -768.3478393554688, + "logps/rejected": -1508.867919921875, + "loss": 0.1475, + "rewards/accuracies": 0.9312499761581421, + "rewards/chosen": -5.754606246948242, + "rewards/margins": 7.364068031311035, + "rewards/rejected": -13.118673324584961, + "step": 3720 }, { "epoch": 1.0, - "grad_norm": 7.25, - "learning_rate": 1.2629313018819312e-10, - "logits/chosen": -1.8780736923217773, - "logits/rejected": -1.761028528213501, - "logps/chosen": -410.8787536621094, - "logps/rejected": -473.9859313964844, - "loss": 0.5112, - "rewards/accuracies": 0.75, - "rewards/chosen": -1.3418636322021484, - "rewards/margins": 0.8411988019943237, - "rewards/rejected": -2.1830623149871826, - "step": 3810 + "grad_norm": 7.9375, + "learning_rate": 1.5670450452892617e-10, + "logits/chosen": -1.957965612411499, + "logits/rejected": -1.7082374095916748, + "logps/chosen": -737.3411865234375, + "logps/rejected": -1421.47607421875, + "loss": 0.1594, + "rewards/accuracies": 0.925000011920929, + "rewards/chosen": -5.499425411224365, + "rewards/margins": 6.8082733154296875, + "rewards/rejected": -12.307699203491211, + "step": 3730 }, { "epoch": 1.0, - "grad_norm": 12.1875, - "learning_rate": 1.0437535929996855e-12, - "logits/chosen": -1.8872268199920654, - "logits/rejected": -1.743740439414978, - "logps/chosen": -458.9305725097656, - "logps/rejected": -513.4288940429688, - "loss": 0.4832, - "rewards/accuracies": 0.7562500238418579, - "rewards/chosen": -1.4926271438598633, - "rewards/margins": 1.0204166173934937, - "rewards/rejected": -2.5130436420440674, - "step": 3820 + "grad_norm": 7.09375, + "learning_rate": 4.3529471158154646e-12, + "logits/chosen": -1.8396613597869873, + "logits/rejected": -1.5516573190689087, + "logps/chosen": -753.6124267578125, + "logps/rejected": -1473.0452880859375, + "loss": 0.0999, + "rewards/accuracies": 0.96875, + "rewards/chosen": -5.577627658843994, + "rewards/margins": 7.1511077880859375, + "rewards/rejected": -12.728734016418457, + "step": 3740 }, { "epoch": 1.0, - "step": 3821, + "step": 3742, "total_flos": 0.0, - "train_loss": 0.5390157630246398, - "train_runtime": 82744.187, - "train_samples_per_second": 0.739, - "train_steps_per_second": 0.046 + "train_loss": 0.23287739991377154, + "train_runtime": 68266.2327, + "train_samples_per_second": 0.877, + "train_steps_per_second": 0.055 } ], "logging_steps": 10, - "max_steps": 3821, + "max_steps": 3742, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 100,