{ "best_metric": null, "best_model_checkpoint": null, "epoch": 0.9982631930527722, "eval_steps": 400, "global_step": 467, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.01068804275217101, "grad_norm": 50.808629131181846, "learning_rate": 1.0638297872340425e-07, "logits/chosen": -1.0106163024902344, "logits/rejected": -0.9813010096549988, "logps/chosen": -0.2742612659931183, "logps/rejected": -0.27159106731414795, "loss": 3.1812, "rewards/accuracies": 0.4312500059604645, "rewards/chosen": -2.742612361907959, "rewards/margins": -0.026701826602220535, "rewards/rejected": -2.7159104347229004, "step": 5 }, { "epoch": 0.02137608550434202, "grad_norm": 39.05077621078139, "learning_rate": 2.127659574468085e-07, "logits/chosen": -1.0402742624282837, "logits/rejected": -0.9734223484992981, "logps/chosen": -0.29369306564331055, "logps/rejected": -0.2996915876865387, "loss": 3.1464, "rewards/accuracies": 0.5062500238418579, "rewards/chosen": -2.9369308948516846, "rewards/margins": 0.05998479202389717, "rewards/rejected": -2.996915578842163, "step": 10 }, { "epoch": 0.03206412825651302, "grad_norm": 51.1710350218012, "learning_rate": 3.1914893617021275e-07, "logits/chosen": -0.9752303957939148, "logits/rejected": -0.9949920773506165, "logps/chosen": -0.2641645073890686, "logps/rejected": -0.30072060227394104, "loss": 3.1415, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -2.6416451930999756, "rewards/margins": 0.3655607998371124, "rewards/rejected": -3.0072059631347656, "step": 15 }, { "epoch": 0.04275217100868404, "grad_norm": 54.054907397432125, "learning_rate": 4.25531914893617e-07, "logits/chosen": -0.9653177261352539, "logits/rejected": -0.9394901990890503, "logps/chosen": -0.27761954069137573, "logps/rejected": -0.29122447967529297, "loss": 3.0604, "rewards/accuracies": 0.5062500238418579, "rewards/chosen": -2.7761950492858887, "rewards/margins": 0.13604947924613953, "rewards/rejected": -2.912245035171509, "step": 20 }, { "epoch": 0.053440213760855046, "grad_norm": 57.39363585752273, "learning_rate": 5.319148936170212e-07, "logits/chosen": -1.0094068050384521, "logits/rejected": -0.979636549949646, "logps/chosen": -0.2718670964241028, "logps/rejected": -0.2782040536403656, "loss": 3.2307, "rewards/accuracies": 0.5062500238418579, "rewards/chosen": -2.7186713218688965, "rewards/margins": 0.06336940079927444, "rewards/rejected": -2.7820403575897217, "step": 25 }, { "epoch": 0.06412825651302605, "grad_norm": 46.45036278050633, "learning_rate": 6.382978723404255e-07, "logits/chosen": -1.0011686086654663, "logits/rejected": -0.9563290476799011, "logps/chosen": -0.2732592225074768, "logps/rejected": -0.2793694734573364, "loss": 3.0229, "rewards/accuracies": 0.4437499940395355, "rewards/chosen": -2.7325918674468994, "rewards/margins": 0.061102885752916336, "rewards/rejected": -2.7936947345733643, "step": 30 }, { "epoch": 0.07481629926519706, "grad_norm": 65.68030814202801, "learning_rate": 7.446808510638297e-07, "logits/chosen": -1.0469828844070435, "logits/rejected": -0.9718242883682251, "logps/chosen": -0.2941492199897766, "logps/rejected": -0.3212208151817322, "loss": 3.0108, "rewards/accuracies": 0.53125, "rewards/chosen": -2.9414920806884766, "rewards/margins": 0.2707158327102661, "rewards/rejected": -3.212207794189453, "step": 35 }, { "epoch": 0.08550434201736808, "grad_norm": 53.10277093204753, "learning_rate": 8.51063829787234e-07, "logits/chosen": -1.0076814889907837, "logits/rejected": -0.9637096524238586, "logps/chosen": -0.28012728691101074, "logps/rejected": -0.3228897452354431, "loss": 3.0702, "rewards/accuracies": 0.5874999761581421, "rewards/chosen": -2.8012731075286865, "rewards/margins": 0.42762452363967896, "rewards/rejected": -3.2288970947265625, "step": 40 }, { "epoch": 0.09619238476953908, "grad_norm": 40.02949687926913, "learning_rate": 9.574468085106384e-07, "logits/chosen": -1.0486935377120972, "logits/rejected": -1.005444049835205, "logps/chosen": -0.32811442017555237, "logps/rejected": -0.3792416453361511, "loss": 3.0998, "rewards/accuracies": 0.512499988079071, "rewards/chosen": -3.2811439037323, "rewards/margins": 0.5112729072570801, "rewards/rejected": -3.79241681098938, "step": 45 }, { "epoch": 0.10688042752171009, "grad_norm": 82.92092402905409, "learning_rate": 9.998741174712533e-07, "logits/chosen": -1.0403989553451538, "logits/rejected": -0.9902151226997375, "logps/chosen": -0.3361224830150604, "logps/rejected": -0.37874636054039, "loss": 3.1724, "rewards/accuracies": 0.4937500059604645, "rewards/chosen": -3.36122465133667, "rewards/margins": 0.42623916268348694, "rewards/rejected": -3.787464141845703, "step": 50 }, { "epoch": 0.11756847027388109, "grad_norm": 67.21060799723625, "learning_rate": 9.991050648838675e-07, "logits/chosen": -1.0551035404205322, "logits/rejected": -1.0203436613082886, "logps/chosen": -0.28603029251098633, "logps/rejected": -0.3468594253063202, "loss": 2.9325, "rewards/accuracies": 0.612500011920929, "rewards/chosen": -2.8603029251098633, "rewards/margins": 0.6082914471626282, "rewards/rejected": -3.468594789505005, "step": 55 }, { "epoch": 0.1282565130260521, "grad_norm": 51.680348516369364, "learning_rate": 9.97637968732563e-07, "logits/chosen": -1.082493543624878, "logits/rejected": -1.0492689609527588, "logps/chosen": -0.31981295347213745, "logps/rejected": -0.34038814902305603, "loss": 3.0045, "rewards/accuracies": 0.5562499761581421, "rewards/chosen": -3.198129415512085, "rewards/margins": 0.20575246214866638, "rewards/rejected": -3.403881788253784, "step": 60 }, { "epoch": 0.13894455577822312, "grad_norm": 54.61195531607251, "learning_rate": 9.954748808839674e-07, "logits/chosen": -1.0184985399246216, "logits/rejected": -0.9890450239181519, "logps/chosen": -0.3667483925819397, "logps/rejected": -0.42044463753700256, "loss": 2.914, "rewards/accuracies": 0.606249988079071, "rewards/chosen": -3.6674838066101074, "rewards/margins": 0.536962628364563, "rewards/rejected": -4.204446315765381, "step": 65 }, { "epoch": 0.14963259853039412, "grad_norm": 37.13802969934917, "learning_rate": 9.926188266120295e-07, "logits/chosen": -1.0257906913757324, "logits/rejected": -1.0014400482177734, "logps/chosen": -0.348396897315979, "logps/rejected": -0.4223526418209076, "loss": 2.9874, "rewards/accuracies": 0.574999988079071, "rewards/chosen": -3.483968734741211, "rewards/margins": 0.7395574450492859, "rewards/rejected": -4.223526477813721, "step": 70 }, { "epoch": 0.16032064128256512, "grad_norm": 51.397013010315, "learning_rate": 9.890738003669027e-07, "logits/chosen": -0.99571692943573, "logits/rejected": -0.9247003793716431, "logps/chosen": -0.3539729118347168, "logps/rejected": -0.40424877405166626, "loss": 2.9448, "rewards/accuracies": 0.543749988079071, "rewards/chosen": -3.539729356765747, "rewards/margins": 0.5027583837509155, "rewards/rejected": -4.042487144470215, "step": 75 }, { "epoch": 0.17100868403473615, "grad_norm": 45.94682158666188, "learning_rate": 9.848447601883433e-07, "logits/chosen": -0.9615497589111328, "logits/rejected": -0.9482225179672241, "logps/chosen": -0.35162094235420227, "logps/rejected": -0.4469473361968994, "loss": 2.9366, "rewards/accuracies": 0.59375, "rewards/chosen": -3.516209363937378, "rewards/margins": 0.9532642364501953, "rewards/rejected": -4.469473838806152, "step": 80 }, { "epoch": 0.18169672678690715, "grad_norm": 54.044285711278675, "learning_rate": 9.799376207714444e-07, "logits/chosen": -0.971204936504364, "logits/rejected": -0.9494684338569641, "logps/chosen": -0.3352496027946472, "logps/rejected": -0.39382439851760864, "loss": 2.8152, "rewards/accuracies": 0.6312500238418579, "rewards/chosen": -3.352496385574341, "rewards/margins": 0.5857475996017456, "rewards/rejected": -3.9382433891296387, "step": 85 }, { "epoch": 0.19238476953907815, "grad_norm": 63.65754855294631, "learning_rate": 9.743592451943998e-07, "logits/chosen": -1.0101659297943115, "logits/rejected": -0.9759138822555542, "logps/chosen": -0.4150550365447998, "logps/rejected": -0.4971020221710205, "loss": 2.9804, "rewards/accuracies": 0.5874999761581421, "rewards/chosen": -4.150550842285156, "rewards/margins": 0.820469856262207, "rewards/rejected": -4.971020221710205, "step": 90 }, { "epoch": 0.20307281229124916, "grad_norm": 52.477454995925875, "learning_rate": 9.681174353198686e-07, "logits/chosen": -1.104175329208374, "logits/rejected": -1.0200673341751099, "logps/chosen": -0.44731807708740234, "logps/rejected": -0.48720675706863403, "loss": 2.9239, "rewards/accuracies": 0.5625, "rewards/chosen": -4.473180294036865, "rewards/margins": 0.3988868296146393, "rewards/rejected": -4.872067451477051, "step": 95 }, { "epoch": 0.21376085504342018, "grad_norm": 75.5374686993359, "learning_rate": 9.612209208833646e-07, "logits/chosen": -0.9939002990722656, "logits/rejected": -0.9696178436279297, "logps/chosen": -0.4261603355407715, "logps/rejected": -0.5003350377082825, "loss": 2.8938, "rewards/accuracies": 0.6187499761581421, "rewards/chosen": -4.261603355407715, "rewards/margins": 0.7417464852333069, "rewards/rejected": -5.003349781036377, "step": 100 }, { "epoch": 0.22444889779559118, "grad_norm": 63.97152158331742, "learning_rate": 9.536793472839324e-07, "logits/chosen": -0.9977374076843262, "logits/rejected": -0.9457524418830872, "logps/chosen": -0.41890573501586914, "logps/rejected": -0.5236679315567017, "loss": 2.8541, "rewards/accuracies": 0.706250011920929, "rewards/chosen": -4.189057350158691, "rewards/margins": 1.047621488571167, "rewards/rejected": -5.2366790771484375, "step": 105 }, { "epoch": 0.23513694054776219, "grad_norm": 61.90790022013242, "learning_rate": 9.455032620941839e-07, "logits/chosen": -0.9688698053359985, "logits/rejected": -0.9092176556587219, "logps/chosen": -0.47669458389282227, "logps/rejected": -0.6035077571868896, "loss": 2.7492, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -4.766946315765381, "rewards/margins": 1.2681318521499634, "rewards/rejected": -6.035078048706055, "step": 110 }, { "epoch": 0.2458249832999332, "grad_norm": 59.49952855412905, "learning_rate": 9.367041003085648e-07, "logits/chosen": -1.0452520847320557, "logits/rejected": -0.9828802943229675, "logps/chosen": -0.5145617723464966, "logps/rejected": -0.5854091048240662, "loss": 2.6461, "rewards/accuracies": 0.6312500238418579, "rewards/chosen": -5.145617961883545, "rewards/margins": 0.7084735631942749, "rewards/rejected": -5.854091167449951, "step": 115 }, { "epoch": 0.2565130260521042, "grad_norm": 70.21848273086795, "learning_rate": 9.272941683504808e-07, "logits/chosen": -0.9884990453720093, "logits/rejected": -0.8981709480285645, "logps/chosen": -0.5350543260574341, "logps/rejected": -0.7321064472198486, "loss": 2.4956, "rewards/accuracies": 0.737500011920929, "rewards/chosen": -5.350543022155762, "rewards/margins": 1.9705219268798828, "rewards/rejected": -7.3210649490356445, "step": 120 }, { "epoch": 0.26720106880427524, "grad_norm": 60.44399310480539, "learning_rate": 9.172866268606513e-07, "logits/chosen": -1.0669106245040894, "logits/rejected": -1.0236799716949463, "logps/chosen": -0.6078816652297974, "logps/rejected": -0.7024620771408081, "loss": 2.4476, "rewards/accuracies": 0.6812499761581421, "rewards/chosen": -6.078816890716553, "rewards/margins": 0.9458037614822388, "rewards/rejected": -7.024620056152344, "step": 125 }, { "epoch": 0.27788911155644624, "grad_norm": 99.44046344290406, "learning_rate": 9.066954722907638e-07, "logits/chosen": -1.0788668394088745, "logits/rejected": -1.072113037109375, "logps/chosen": -0.600851833820343, "logps/rejected": -0.8661821484565735, "loss": 2.2581, "rewards/accuracies": 0.7562500238418579, "rewards/chosen": -6.008517742156982, "rewards/margins": 2.6533024311065674, "rewards/rejected": -8.661821365356445, "step": 130 }, { "epoch": 0.28857715430861725, "grad_norm": 71.50920970516586, "learning_rate": 8.955355173281707e-07, "logits/chosen": -1.0785815715789795, "logits/rejected": -1.028867483139038, "logps/chosen": -0.7083786725997925, "logps/rejected": -0.8610437512397766, "loss": 2.2681, "rewards/accuracies": 0.737500011920929, "rewards/chosen": -7.0837860107421875, "rewards/margins": 1.5266506671905518, "rewards/rejected": -8.610437393188477, "step": 135 }, { "epoch": 0.29926519706078825, "grad_norm": 78.03195943233231, "learning_rate": 8.838223701790055e-07, "logits/chosen": -1.136013150215149, "logits/rejected": -1.1127153635025024, "logps/chosen": -0.8601881861686707, "logps/rejected": -1.009979009628296, "loss": 2.2414, "rewards/accuracies": 0.737500011920929, "rewards/chosen": -8.60188102722168, "rewards/margins": 1.497908353805542, "rewards/rejected": -10.099790573120117, "step": 140 }, { "epoch": 0.30995323981295925, "grad_norm": 89.34922394775987, "learning_rate": 8.71572412738697e-07, "logits/chosen": -1.0426230430603027, "logits/rejected": -1.0167655944824219, "logps/chosen": -0.8711285591125488, "logps/rejected": -1.1150425672531128, "loss": 2.0482, "rewards/accuracies": 0.75, "rewards/chosen": -8.711285591125488, "rewards/margins": 2.439141273498535, "rewards/rejected": -11.150426864624023, "step": 145 }, { "epoch": 0.32064128256513025, "grad_norm": 75.56439243863579, "learning_rate": 8.588027776804058e-07, "logits/chosen": -1.0707896947860718, "logits/rejected": -1.0511105060577393, "logps/chosen": -0.975185215473175, "logps/rejected": -1.2233917713165283, "loss": 2.11, "rewards/accuracies": 0.7875000238418579, "rewards/chosen": -9.751851081848145, "rewards/margins": 2.482065200805664, "rewards/rejected": -12.233917236328125, "step": 150 }, { "epoch": 0.33132932531730125, "grad_norm": 80.28227062969263, "learning_rate": 8.455313244934324e-07, "logits/chosen": -1.077965497970581, "logits/rejected": -1.0561294555664062, "logps/chosen": -1.0423448085784912, "logps/rejected": -1.3658177852630615, "loss": 2.1484, "rewards/accuracies": 0.731249988079071, "rewards/chosen": -10.42344856262207, "rewards/margins": 3.2347302436828613, "rewards/rejected": -13.658178329467773, "step": 155 }, { "epoch": 0.3420173680694723, "grad_norm": 106.46219017655814, "learning_rate": 8.317766145051057e-07, "logits/chosen": -1.1071363687515259, "logits/rejected": -1.0886081457138062, "logps/chosen": -1.1759061813354492, "logps/rejected": -1.577666997909546, "loss": 2.12, "rewards/accuracies": 0.762499988079071, "rewards/chosen": -11.759061813354492, "rewards/margins": 4.017607688903809, "rewards/rejected": -15.77667236328125, "step": 160 }, { "epoch": 0.3527054108216433, "grad_norm": 62.19115283396683, "learning_rate": 8.175578849210894e-07, "logits/chosen": -1.160860538482666, "logits/rejected": -1.1344614028930664, "logps/chosen": -1.1339943408966064, "logps/rejected": -1.5048753023147583, "loss": 1.9366, "rewards/accuracies": 0.75, "rewards/chosen": -11.339943885803223, "rewards/margins": 3.7088088989257812, "rewards/rejected": -15.048751831054688, "step": 165 }, { "epoch": 0.3633934535738143, "grad_norm": 99.1190048190453, "learning_rate": 8.028950219204099e-07, "logits/chosen": -1.163419485092163, "logits/rejected": -1.1405839920043945, "logps/chosen": -1.1193758249282837, "logps/rejected": -1.510626196861267, "loss": 1.9257, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -11.193758964538574, "rewards/margins": 3.9125027656555176, "rewards/rejected": -15.106260299682617, "step": 170 }, { "epoch": 0.3740814963259853, "grad_norm": 94.65121648570927, "learning_rate": 7.878085328428368e-07, "logits/chosen": -1.1881390810012817, "logits/rejected": -1.1373883485794067, "logps/chosen": -1.1970434188842773, "logps/rejected": -1.4721133708953857, "loss": 1.7523, "rewards/accuracies": 0.762499988079071, "rewards/chosen": -11.970434188842773, "rewards/margins": 2.750699520111084, "rewards/rejected": -14.7211332321167, "step": 175 }, { "epoch": 0.3847695390781563, "grad_norm": 76.66460112585435, "learning_rate": 7.723195175075135e-07, "logits/chosen": -1.1413110494613647, "logits/rejected": -1.1197932958602905, "logps/chosen": -1.1530605554580688, "logps/rejected": -1.529752492904663, "loss": 1.6665, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -11.53060531616211, "rewards/margins": 3.7669196128845215, "rewards/rejected": -15.297525405883789, "step": 180 }, { "epoch": 0.3954575818303273, "grad_norm": 89.60532608004162, "learning_rate": 7.564496387029531e-07, "logits/chosen": -1.1999183893203735, "logits/rejected": -1.1381819248199463, "logps/chosen": -1.1865547895431519, "logps/rejected": -1.6205813884735107, "loss": 1.7264, "rewards/accuracies": 0.84375, "rewards/chosen": -11.865548133850098, "rewards/margins": 4.34026575088501, "rewards/rejected": -16.205814361572266, "step": 185 }, { "epoch": 0.4061456245824983, "grad_norm": 89.40986425114181, "learning_rate": 7.402210918896689e-07, "logits/chosen": -1.1798112392425537, "logits/rejected": -1.1886682510375977, "logps/chosen": -1.3043251037597656, "logps/rejected": -1.7928787469863892, "loss": 1.5861, "rewards/accuracies": 0.824999988079071, "rewards/chosen": -13.043252944946289, "rewards/margins": 4.885535717010498, "rewards/rejected": -17.928787231445312, "step": 190 }, { "epoch": 0.4168336673346693, "grad_norm": 69.59585645663604, "learning_rate": 7.236565741578162e-07, "logits/chosen": -1.0942785739898682, "logits/rejected": -1.0754075050354004, "logps/chosen": -1.3407868146896362, "logps/rejected": -1.7155689001083374, "loss": 1.6811, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -13.407869338989258, "rewards/margins": 3.747821092605591, "rewards/rejected": -17.155689239501953, "step": 195 }, { "epoch": 0.42752171008684037, "grad_norm": 90.9002259793451, "learning_rate": 7.067792524832603e-07, "logits/chosen": -1.0895906686782837, "logits/rejected": -1.076690912246704, "logps/chosen": -1.4090057611465454, "logps/rejected": -1.8496875762939453, "loss": 1.5719, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -14.090057373046875, "rewards/margins": 4.406816482543945, "rewards/rejected": -18.49687385559082, "step": 200 }, { "epoch": 0.43820975283901137, "grad_norm": 88.53510868331433, "learning_rate": 6.896127313264642e-07, "logits/chosen": -1.1440551280975342, "logits/rejected": -1.0879385471343994, "logps/chosen": -1.477365493774414, "logps/rejected": -1.9413487911224365, "loss": 1.7641, "rewards/accuracies": 0.793749988079071, "rewards/chosen": -14.773653984069824, "rewards/margins": 4.639832496643066, "rewards/rejected": -19.41348648071289, "step": 205 }, { "epoch": 0.44889779559118237, "grad_norm": 90.8771106366244, "learning_rate": 6.721810196195174e-07, "logits/chosen": -1.1879045963287354, "logits/rejected": -1.1744410991668701, "logps/chosen": -1.4768015146255493, "logps/rejected": -1.9050781726837158, "loss": 1.6469, "rewards/accuracies": 0.831250011920929, "rewards/chosen": -14.76801586151123, "rewards/margins": 4.282763481140137, "rewards/rejected": -19.050779342651367, "step": 210 }, { "epoch": 0.45958583834335337, "grad_norm": 113.9910768058093, "learning_rate": 6.545084971874736e-07, "logits/chosen": -1.1348854303359985, "logits/rejected": -1.1144535541534424, "logps/chosen": -1.4629454612731934, "logps/rejected": -1.938966155052185, "loss": 1.5923, "rewards/accuracies": 0.8125, "rewards/chosen": -14.62945556640625, "rewards/margins": 4.760207653045654, "rewards/rejected": -19.38966178894043, "step": 215 }, { "epoch": 0.47027388109552437, "grad_norm": 106.93624349436355, "learning_rate": 6.3661988065096e-07, "logits/chosen": -1.182987093925476, "logits/rejected": -1.1622049808502197, "logps/chosen": -1.5438847541809082, "logps/rejected": -2.0398669242858887, "loss": 1.5195, "rewards/accuracies": 0.75, "rewards/chosen": -15.438847541809082, "rewards/margins": 4.959823131561279, "rewards/rejected": -20.398670196533203, "step": 220 }, { "epoch": 0.48096192384769537, "grad_norm": 74.38520444138166, "learning_rate": 6.185401888577487e-07, "logits/chosen": -1.1378791332244873, "logits/rejected": -1.1019752025604248, "logps/chosen": -1.5831804275512695, "logps/rejected": -2.0450186729431152, "loss": 1.5229, "rewards/accuracies": 0.7875000238418579, "rewards/chosen": -15.831802368164062, "rewards/margins": 4.618380069732666, "rewards/rejected": -20.450183868408203, "step": 225 }, { "epoch": 0.4916499665998664, "grad_norm": 83.37151004405337, "learning_rate": 6.002947078916364e-07, "logits/chosen": -1.2437490224838257, "logits/rejected": -1.1872754096984863, "logps/chosen": -1.511113166809082, "logps/rejected": -1.9627548456192017, "loss": 1.465, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -15.11113166809082, "rewards/margins": 4.516416072845459, "rewards/rejected": -19.62755012512207, "step": 230 }, { "epoch": 0.5023380093520374, "grad_norm": 86.35371016129366, "learning_rate": 5.819089557075688e-07, "logits/chosen": -1.27354896068573, "logits/rejected": -1.2406803369522095, "logps/chosen": -1.5447642803192139, "logps/rejected": -2.0402631759643555, "loss": 1.4297, "rewards/accuracies": 0.793749988079071, "rewards/chosen": -15.44764232635498, "rewards/margins": 4.954989433288574, "rewards/rejected": -20.402629852294922, "step": 235 }, { "epoch": 0.5130260521042084, "grad_norm": 86.01810660438646, "learning_rate": 5.634086464424742e-07, "logits/chosen": -1.2549498081207275, "logits/rejected": -1.2551780939102173, "logps/chosen": -1.4501978158950806, "logps/rejected": -1.9462862014770508, "loss": 1.4872, "rewards/accuracies": 0.8062499761581421, "rewards/chosen": -14.501977920532227, "rewards/margins": 4.960885047912598, "rewards/rejected": -19.46286392211914, "step": 240 }, { "epoch": 0.5237140948563794, "grad_norm": 156.33302244565826, "learning_rate": 5.448196544517167e-07, "logits/chosen": -1.3415353298187256, "logits/rejected": -1.276715874671936, "logps/chosen": -1.5217020511627197, "logps/rejected": -2.1184372901916504, "loss": 1.4292, "rewards/accuracies": 0.8812500238418579, "rewards/chosen": -15.217020034790039, "rewards/margins": 5.967349529266357, "rewards/rejected": -21.184368133544922, "step": 245 }, { "epoch": 0.5344021376085505, "grad_norm": 132.29630654970654, "learning_rate": 5.26167978121472e-07, "logits/chosen": -1.269829511642456, "logits/rejected": -1.250939965248108, "logps/chosen": -1.617235541343689, "logps/rejected": -2.209672212600708, "loss": 1.3537, "rewards/accuracies": 0.856249988079071, "rewards/chosen": -16.172353744506836, "rewards/margins": 5.92436408996582, "rewards/rejected": -22.096717834472656, "step": 250 }, { "epoch": 0.5450901803607214, "grad_norm": 158.35762544123853, "learning_rate": 5.074797035076318e-07, "logits/chosen": -1.319935917854309, "logits/rejected": -1.2904071807861328, "logps/chosen": -1.7636592388153076, "logps/rejected": -2.246645927429199, "loss": 1.5149, "rewards/accuracies": 0.824999988079071, "rewards/chosen": -17.6365909576416, "rewards/margins": 4.829868793487549, "rewards/rejected": -22.466461181640625, "step": 255 }, { "epoch": 0.5557782231128925, "grad_norm": 91.11952816433295, "learning_rate": 4.887809678520975e-07, "logits/chosen": -1.2893364429473877, "logits/rejected": -1.2540985345840454, "logps/chosen": -1.687126874923706, "logps/rejected": -2.1884632110595703, "loss": 1.4806, "rewards/accuracies": 0.831250011920929, "rewards/chosen": -16.87126922607422, "rewards/margins": 5.013364315032959, "rewards/rejected": -21.884632110595703, "step": 260 }, { "epoch": 0.5664662658650634, "grad_norm": 87.17319112572255, "learning_rate": 4.700979230274829e-07, "logits/chosen": -1.2351106405258179, "logits/rejected": -1.2150055170059204, "logps/chosen": -1.7930908203125, "logps/rejected": -2.3220622539520264, "loss": 1.4439, "rewards/accuracies": 0.824999988079071, "rewards/chosen": -17.930908203125, "rewards/margins": 5.2897138595581055, "rewards/rejected": -23.220623016357422, "step": 265 }, { "epoch": 0.5771543086172345, "grad_norm": 150.21462619573362, "learning_rate": 4.514566989613559e-07, "logits/chosen": -1.2381212711334229, "logits/rejected": -1.2050025463104248, "logps/chosen": -1.56709623336792, "logps/rejected": -2.124847412109375, "loss": 1.352, "rewards/accuracies": 0.84375, "rewards/chosen": -15.670961380004883, "rewards/margins": 5.577512264251709, "rewards/rejected": -21.248472213745117, "step": 270 }, { "epoch": 0.5878423513694054, "grad_norm": 76.03789179576287, "learning_rate": 4.328833670911724e-07, "logits/chosen": -1.215340495109558, "logits/rejected": -1.172009825706482, "logps/chosen": -1.5536482334136963, "logps/rejected": -2.027946949005127, "loss": 1.5498, "rewards/accuracies": 0.8125, "rewards/chosen": -15.536481857299805, "rewards/margins": 4.742985725402832, "rewards/rejected": -20.279468536376953, "step": 275 }, { "epoch": 0.5985303941215765, "grad_norm": 116.42589091157576, "learning_rate": 4.144039039010124e-07, "logits/chosen": -1.2979695796966553, "logits/rejected": -1.2710950374603271, "logps/chosen": -1.580294132232666, "logps/rejected": -2.1467227935791016, "loss": 1.4098, "rewards/accuracies": 0.824999988079071, "rewards/chosen": -15.802940368652344, "rewards/margins": 5.664287090301514, "rewards/rejected": -21.467227935791016, "step": 280 }, { "epoch": 0.6092184368737475, "grad_norm": 104.14693862336519, "learning_rate": 3.960441545911204e-07, "logits/chosen": -1.2605773210525513, "logits/rejected": -1.2214257717132568, "logps/chosen": -1.6150792837142944, "logps/rejected": -2.210728168487549, "loss": 1.1607, "rewards/accuracies": 0.8812500238418579, "rewards/chosen": -16.15079116821289, "rewards/margins": 5.9564900398254395, "rewards/rejected": -22.107282638549805, "step": 285 }, { "epoch": 0.6199064796259185, "grad_norm": 114.57172950546183, "learning_rate": 3.778297969310529e-07, "logits/chosen": -1.2815816402435303, "logits/rejected": -1.2309645414352417, "logps/chosen": -1.6390390396118164, "logps/rejected": -2.1291799545288086, "loss": 1.4268, "rewards/accuracies": 0.824999988079071, "rewards/chosen": -16.39038848876953, "rewards/margins": 4.901411056518555, "rewards/rejected": -21.291799545288086, "step": 290 }, { "epoch": 0.6305945223780896, "grad_norm": 98.96170714825152, "learning_rate": 3.5978630534699865e-07, "logits/chosen": -1.2153351306915283, "logits/rejected": -1.19598388671875, "logps/chosen": -1.6430248022079468, "logps/rejected": -2.1691431999206543, "loss": 1.2288, "rewards/accuracies": 0.8187500238418579, "rewards/chosen": -16.430248260498047, "rewards/margins": 5.261181831359863, "rewards/rejected": -21.69143295288086, "step": 295 }, { "epoch": 0.6412825651302605, "grad_norm": 93.71636651038355, "learning_rate": 3.4193891529348795e-07, "logits/chosen": -1.1519898176193237, "logits/rejected": -1.121536135673523, "logps/chosen": -1.7675939798355103, "logps/rejected": -2.2407684326171875, "loss": 1.6479, "rewards/accuracies": 0.7562500238418579, "rewards/chosen": -17.67593765258789, "rewards/margins": 4.731745719909668, "rewards/rejected": -22.407682418823242, "step": 300 }, { "epoch": 0.6519706078824316, "grad_norm": 86.39143716603384, "learning_rate": 3.243125879593286e-07, "logits/chosen": -1.2513381242752075, "logits/rejected": -1.1973439455032349, "logps/chosen": -1.7402875423431396, "logps/rejected": -2.2251009941101074, "loss": 1.3757, "rewards/accuracies": 0.793749988079071, "rewards/chosen": -17.402873992919922, "rewards/margins": 4.8481340408325195, "rewards/rejected": -22.25101089477539, "step": 305 }, { "epoch": 0.6626586506346025, "grad_norm": 94.5378755636189, "learning_rate": 3.069319753571269e-07, "logits/chosen": -1.2887119054794312, "logits/rejected": -1.264244794845581, "logps/chosen": -1.7532516717910767, "logps/rejected": -2.2781875133514404, "loss": 1.4579, "rewards/accuracies": 0.8187500238418579, "rewards/chosen": -17.53251838684082, "rewards/margins": 5.2493577003479, "rewards/rejected": -22.78187370300293, "step": 310 }, { "epoch": 0.6733466933867736, "grad_norm": 93.15518142375495, "learning_rate": 2.898213858452173e-07, "logits/chosen": -1.2719175815582275, "logits/rejected": -1.2082593441009521, "logps/chosen": -1.7665042877197266, "logps/rejected": -2.3101308345794678, "loss": 1.4507, "rewards/accuracies": 0.8374999761581421, "rewards/chosen": -17.665042877197266, "rewards/margins": 5.43626594543457, "rewards/rejected": -23.101306915283203, "step": 315 }, { "epoch": 0.6840347361389446, "grad_norm": 97.11116588059683, "learning_rate": 2.730047501302266e-07, "logits/chosen": -1.2424182891845703, "logits/rejected": -1.2367308139801025, "logps/chosen": -1.7836353778839111, "logps/rejected": -2.4376604557037354, "loss": 1.3392, "rewards/accuracies": 0.862500011920929, "rewards/chosen": -17.836353302001953, "rewards/margins": 6.540250301361084, "rewards/rejected": -24.376604080200195, "step": 320 }, { "epoch": 0.6947227788911156, "grad_norm": 69.8021684622736, "learning_rate": 2.5650558779781635e-07, "logits/chosen": -1.2811143398284912, "logits/rejected": -1.2258597612380981, "logps/chosen": -1.817238211631775, "logps/rejected": -2.512756109237671, "loss": 1.3488, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": -18.172382354736328, "rewards/margins": 6.955182075500488, "rewards/rejected": -25.1275634765625, "step": 325 }, { "epoch": 0.7054108216432866, "grad_norm": 77.71196846678416, "learning_rate": 2.403469744184154e-07, "logits/chosen": -1.1771881580352783, "logits/rejected": -1.1298190355300903, "logps/chosen": -1.7999836206436157, "logps/rejected": -2.308206081390381, "loss": 1.4148, "rewards/accuracies": 0.8187500238418579, "rewards/chosen": -17.999834060668945, "rewards/margins": 5.0822248458862305, "rewards/rejected": -23.082063674926758, "step": 330 }, { "epoch": 0.7160988643954576, "grad_norm": 87.49112614337298, "learning_rate": 2.2455150927394878e-07, "logits/chosen": -1.2217838764190674, "logits/rejected": -1.2007259130477905, "logps/chosen": -1.806614637374878, "logps/rejected": -2.3819501399993896, "loss": 1.2578, "rewards/accuracies": 0.831250011920929, "rewards/chosen": -18.066146850585938, "rewards/margins": 5.753355979919434, "rewards/rejected": -23.819501876831055, "step": 335 }, { "epoch": 0.7267869071476286, "grad_norm": 107.03498343189057, "learning_rate": 2.0914128375069722e-07, "logits/chosen": -1.2570655345916748, "logits/rejected": -1.2179275751113892, "logps/chosen": -1.701104760169983, "logps/rejected": -2.2810521125793457, "loss": 1.3651, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": -17.01104736328125, "rewards/margins": 5.799473285675049, "rewards/rejected": -22.810522079467773, "step": 340 }, { "epoch": 0.7374749498997996, "grad_norm": 86.43402605700935, "learning_rate": 1.9413785044249676e-07, "logits/chosen": -1.2722374200820923, "logits/rejected": -1.245724081993103, "logps/chosen": -1.7886686325073242, "logps/rejected": -2.45815110206604, "loss": 1.431, "rewards/accuracies": 0.8687499761581421, "rewards/chosen": -17.886688232421875, "rewards/margins": 6.694826602935791, "rewards/rejected": -24.581512451171875, "step": 345 }, { "epoch": 0.7481629926519706, "grad_norm": 139.70986795572603, "learning_rate": 1.7956219300748792e-07, "logits/chosen": -1.2459907531738281, "logits/rejected": -1.2479979991912842, "logps/chosen": -1.638685941696167, "logps/rejected": -2.1945548057556152, "loss": 1.3108, "rewards/accuracies": 0.8374999761581421, "rewards/chosen": -16.386859893798828, "rewards/margins": 5.55868673324585, "rewards/rejected": -21.945547103881836, "step": 350 }, { "epoch": 0.7588510354041417, "grad_norm": 72.87334447256926, "learning_rate": 1.6543469682057104e-07, "logits/chosen": -1.166658878326416, "logits/rejected": -1.1790478229522705, "logps/chosen": -1.6613085269927979, "logps/rejected": -2.2439186573028564, "loss": 1.199, "rewards/accuracies": 0.862500011920929, "rewards/chosen": -16.613086700439453, "rewards/margins": 5.826100826263428, "rewards/rejected": -22.439186096191406, "step": 355 }, { "epoch": 0.7695390781563126, "grad_norm": 79.88658334563766, "learning_rate": 1.5177512046261666e-07, "logits/chosen": -1.2464196681976318, "logits/rejected": -1.2423464059829712, "logps/chosen": -1.653806447982788, "logps/rejected": -2.3273231983184814, "loss": 1.3606, "rewards/accuracies": 0.84375, "rewards/chosen": -16.53806495666504, "rewards/margins": 6.735169410705566, "rewards/rejected": -23.273231506347656, "step": 360 }, { "epoch": 0.7802271209084837, "grad_norm": 75.5014559816677, "learning_rate": 1.3860256808630427e-07, "logits/chosen": -1.2761361598968506, "logits/rejected": -1.202921748161316, "logps/chosen": -1.719037652015686, "logps/rejected": -2.3826024532318115, "loss": 1.3149, "rewards/accuracies": 0.824999988079071, "rewards/chosen": -17.19037437438965, "rewards/margins": 6.635651588439941, "rewards/rejected": -23.826026916503906, "step": 365 }, { "epoch": 0.7909151636606546, "grad_norm": 97.99359030026348, "learning_rate": 1.2593546269723647e-07, "logits/chosen": -1.1994116306304932, "logits/rejected": -1.1858131885528564, "logps/chosen": -1.6818050146102905, "logps/rejected": -2.197366714477539, "loss": 1.3242, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": -16.818050384521484, "rewards/margins": 5.1556172370910645, "rewards/rejected": -21.97366714477539, "step": 370 }, { "epoch": 0.8016032064128257, "grad_norm": 81.04863528295469, "learning_rate": 1.1379152038770029e-07, "logits/chosen": -1.2257211208343506, "logits/rejected": -1.2296313047409058, "logps/chosen": -1.8233497142791748, "logps/rejected": -2.4218993186950684, "loss": 1.2579, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": -18.233497619628906, "rewards/margins": 5.985495567321777, "rewards/rejected": -24.218994140625, "step": 375 }, { "epoch": 0.8122912491649966, "grad_norm": 122.42193972639224, "learning_rate": 1.0218772555910954e-07, "logits/chosen": -1.2562552690505981, "logits/rejected": -1.2336931228637695, "logps/chosen": -1.6737709045410156, "logps/rejected": -2.233579397201538, "loss": 1.4593, "rewards/accuracies": 0.862500011920929, "rewards/chosen": -16.737709045410156, "rewards/margins": 5.598085403442383, "rewards/rejected": -22.33579444885254, "step": 380 }, { "epoch": 0.8229792919171677, "grad_norm": 73.50829236522559, "learning_rate": 9.114030716778432e-08, "logits/chosen": -1.2516887187957764, "logits/rejected": -1.2255418300628662, "logps/chosen": -1.7326440811157227, "logps/rejected": -2.4565865993499756, "loss": 1.131, "rewards/accuracies": 0.84375, "rewards/chosen": -17.32644271850586, "rewards/margins": 7.239426612854004, "rewards/rejected": -24.565866470336914, "step": 385 }, { "epoch": 0.8336673346693386, "grad_norm": 79.97435377790613, "learning_rate": 8.066471602728803e-08, "logits/chosen": -1.2442253828048706, "logits/rejected": -1.2265394926071167, "logps/chosen": -1.791513442993164, "logps/rejected": -2.413567066192627, "loss": 1.3076, "rewards/accuracies": 0.862500011920929, "rewards/chosen": -17.91513442993164, "rewards/margins": 6.220534801483154, "rewards/rejected": -24.135669708251953, "step": 390 }, { "epoch": 0.8443553774215097, "grad_norm": 78.81694766120557, "learning_rate": 7.077560319906694e-08, "logits/chosen": -1.2579686641693115, "logits/rejected": -1.2332755327224731, "logps/chosen": -1.7013702392578125, "logps/rejected": -2.281177282333374, "loss": 1.2656, "rewards/accuracies": 0.831250011920929, "rewards/chosen": -17.01369857788086, "rewards/margins": 5.798070430755615, "rewards/rejected": -22.811771392822266, "step": 395 }, { "epoch": 0.8550434201736807, "grad_norm": 65.85044796055213, "learning_rate": 6.148679950161672e-08, "logits/chosen": -1.2656534910202026, "logits/rejected": -1.2466814517974854, "logps/chosen": -1.7392183542251587, "logps/rejected": -2.282461166381836, "loss": 1.2554, "rewards/accuracies": 0.8125, "rewards/chosen": -17.39218521118164, "rewards/margins": 5.4324259757995605, "rewards/rejected": -22.824609756469727, "step": 400 }, { "epoch": 0.8550434201736807, "eval_logits/chosen": -1.4624698162078857, "eval_logits/rejected": -1.4709206819534302, "eval_logps/chosen": -1.7548081874847412, "eval_logps/rejected": -2.3352949619293213, "eval_loss": 1.259665608406067, "eval_rewards/accuracies": 0.8414633870124817, "eval_rewards/chosen": -17.548084259033203, "eval_rewards/margins": 5.804864883422852, "eval_rewards/rejected": -23.352949142456055, "eval_runtime": 99.0195, "eval_samples_per_second": 19.804, "eval_steps_per_second": 1.242, "step": 400 }, { "epoch": 0.8657314629258517, "grad_norm": 105.3602759438555, "learning_rate": 5.2811296166831666e-08, "logits/chosen": -1.219763994216919, "logits/rejected": -1.236553430557251, "logps/chosen": -1.8288891315460205, "logps/rejected": -2.4192848205566406, "loss": 1.2773, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": -18.288890838623047, "rewards/margins": 5.903960227966309, "rewards/rejected": -24.19285011291504, "step": 405 }, { "epoch": 0.8764195056780227, "grad_norm": 151.80745481617458, "learning_rate": 4.4761226670592066e-08, "logits/chosen": -1.2444562911987305, "logits/rejected": -1.2300468683242798, "logps/chosen": -1.7573579549789429, "logps/rejected": -2.3343706130981445, "loss": 1.3736, "rewards/accuracies": 0.856249988079071, "rewards/chosen": -17.57358169555664, "rewards/margins": 5.770127296447754, "rewards/rejected": -23.343708038330078, "step": 410 }, { "epoch": 0.8871075484301937, "grad_norm": 81.2925954089191, "learning_rate": 3.734784976300165e-08, "logits/chosen": -1.2386925220489502, "logits/rejected": -1.179355263710022, "logps/chosen": -1.6806129217147827, "logps/rejected": -2.3523941040039062, "loss": 1.4348, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": -16.806127548217773, "rewards/margins": 6.717810153961182, "rewards/rejected": -23.52393913269043, "step": 415 }, { "epoch": 0.8977955911823647, "grad_norm": 86.0496565299361, "learning_rate": 3.058153372200695e-08, "logits/chosen": -1.2732845544815063, "logits/rejected": -1.217943787574768, "logps/chosen": -1.6420053243637085, "logps/rejected": -2.2697978019714355, "loss": 1.2661, "rewards/accuracies": 0.856249988079071, "rewards/chosen": -16.420053482055664, "rewards/margins": 6.277927398681641, "rewards/rejected": -22.697978973388672, "step": 420 }, { "epoch": 0.9084836339345357, "grad_norm": 109.05110419596457, "learning_rate": 2.4471741852423233e-08, "logits/chosen": -1.264012098312378, "logits/rejected": -1.2519878149032593, "logps/chosen": -1.8349307775497437, "logps/rejected": -2.3962314128875732, "loss": 1.449, "rewards/accuracies": 0.8374999761581421, "rewards/chosen": -18.349308013916016, "rewards/margins": 5.613007068634033, "rewards/rejected": -23.962316513061523, "step": 425 }, { "epoch": 0.9191716766867067, "grad_norm": 100.07054279278233, "learning_rate": 1.9027019250647036e-08, "logits/chosen": -1.2456198930740356, "logits/rejected": -1.2271144390106201, "logps/chosen": -1.821995735168457, "logps/rejected": -2.4518990516662598, "loss": 1.2827, "rewards/accuracies": 0.8812500238418579, "rewards/chosen": -18.21995735168457, "rewards/margins": 6.29903507232666, "rewards/rejected": -24.518993377685547, "step": 430 }, { "epoch": 0.9298597194388778, "grad_norm": 83.31815666168382, "learning_rate": 1.4254980853566246e-08, "logits/chosen": -1.2076127529144287, "logits/rejected": -1.1630686521530151, "logps/chosen": -1.6751903295516968, "logps/rejected": -2.2981104850769043, "loss": 1.2333, "rewards/accuracies": 0.862500011920929, "rewards/chosen": -16.751901626586914, "rewards/margins": 6.2292022705078125, "rewards/rejected": -22.981103897094727, "step": 435 }, { "epoch": 0.9405477621910487, "grad_norm": 80.90105173474987, "learning_rate": 1.016230078838226e-08, "logits/chosen": -1.2286403179168701, "logits/rejected": -1.1650359630584717, "logps/chosen": -1.8118457794189453, "logps/rejected": -2.381875991821289, "loss": 1.2913, "rewards/accuracies": 0.8125, "rewards/chosen": -18.118457794189453, "rewards/margins": 5.700301170349121, "rewards/rejected": -23.81875991821289, "step": 440 }, { "epoch": 0.9512358049432198, "grad_norm": 79.8365351361464, "learning_rate": 6.754703038239329e-09, "logits/chosen": -1.1772619485855103, "logits/rejected": -1.156760811805725, "logps/chosen": -1.7749990224838257, "logps/rejected": -2.451124668121338, "loss": 1.1325, "rewards/accuracies": 0.84375, "rewards/chosen": -17.749990463256836, "rewards/margins": 6.761256217956543, "rewards/rejected": -24.511247634887695, "step": 445 }, { "epoch": 0.9619238476953907, "grad_norm": 87.63443320029899, "learning_rate": 4.036953436716895e-09, "logits/chosen": -1.2932809591293335, "logits/rejected": -1.2688276767730713, "logps/chosen": -1.7249505519866943, "logps/rejected": -2.3006398677825928, "loss": 1.3266, "rewards/accuracies": 0.8374999761581421, "rewards/chosen": -17.249507904052734, "rewards/margins": 5.756890296936035, "rewards/rejected": -23.006397247314453, "step": 450 }, { "epoch": 0.9726118904475618, "grad_norm": 103.7569023130293, "learning_rate": 2.0128530023804656e-09, "logits/chosen": -1.2542561292648315, "logits/rejected": -1.2149529457092285, "logps/chosen": -1.7445335388183594, "logps/rejected": -2.428725481033325, "loss": 1.0878, "rewards/accuracies": 0.925000011920929, "rewards/chosen": -17.445335388183594, "rewards/margins": 6.841917991638184, "rewards/rejected": -24.287254333496094, "step": 455 }, { "epoch": 0.9832999331997327, "grad_norm": 103.46687532351471, "learning_rate": 6.852326227130833e-10, "logits/chosen": -1.2521284818649292, "logits/rejected": -1.2400448322296143, "logps/chosen": -1.8158271312713623, "logps/rejected": -2.446662664413452, "loss": 1.2683, "rewards/accuracies": 0.8374999761581421, "rewards/chosen": -18.15826988220215, "rewards/margins": 6.3083577156066895, "rewards/rejected": -24.466629028320312, "step": 460 }, { "epoch": 0.9939879759519038, "grad_norm": 87.35869072597153, "learning_rate": 5.594909486328348e-11, "logits/chosen": -1.2218118906021118, "logits/rejected": -1.2264667749404907, "logps/chosen": -1.836191177368164, "logps/rejected": -2.498161792755127, "loss": 1.3805, "rewards/accuracies": 0.831250011920929, "rewards/chosen": -18.36191177368164, "rewards/margins": 6.619703769683838, "rewards/rejected": -24.981616973876953, "step": 465 }, { "epoch": 0.9982631930527722, "step": 467, "total_flos": 0.0, "train_loss": 1.8863488443403225, "train_runtime": 11541.8743, "train_samples_per_second": 5.188, "train_steps_per_second": 0.04 } ], "logging_steps": 5, "max_steps": 467, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 1000000, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 0.0, "train_batch_size": 2, "trial_name": null, "trial_params": null }