{ "best_metric": null, "best_model_checkpoint": null, "epoch": 0.9982631930527722, "eval_steps": 400, "global_step": 467, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.01068804275217101, "grad_norm": 53.52218298444476, "learning_rate": 1.0638297872340425e-07, "logits/chosen": -1.0146243572235107, "logits/rejected": -0.9850981831550598, "logps/chosen": -0.27403339743614197, "logps/rejected": -0.2716384530067444, "loss": 3.0444, "rewards/accuracies": 0.4437499940395355, "rewards/chosen": -2.7403340339660645, "rewards/margins": -0.02394939959049225, "rewards/rejected": -2.7163848876953125, "step": 5 }, { "epoch": 0.02137608550434202, "grad_norm": 39.10999969888965, "learning_rate": 2.127659574468085e-07, "logits/chosen": -1.0449364185333252, "logits/rejected": -0.9776930809020996, "logps/chosen": -0.29451489448547363, "logps/rejected": -0.2995792329311371, "loss": 3.0211, "rewards/accuracies": 0.4937500059604645, "rewards/chosen": -2.9451489448547363, "rewards/margins": 0.05064352601766586, "rewards/rejected": -2.9957923889160156, "step": 10 }, { "epoch": 0.03206412825651302, "grad_norm": 53.821066581509214, "learning_rate": 3.1914893617021275e-07, "logits/chosen": -0.9672248959541321, "logits/rejected": -0.9867329597473145, "logps/chosen": -0.26386433839797974, "logps/rejected": -0.30063143372535706, "loss": 3.0404, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -2.638643503189087, "rewards/margins": 0.3676711320877075, "rewards/rejected": -3.006314516067505, "step": 15 }, { "epoch": 0.04275217100868404, "grad_norm": 86.6542555553414, "learning_rate": 4.25531914893617e-07, "logits/chosen": -0.9602643847465515, "logits/rejected": -0.9344671964645386, "logps/chosen": -0.2776374816894531, "logps/rejected": -0.29131022095680237, "loss": 2.9793, "rewards/accuracies": 0.5062500238418579, "rewards/chosen": -2.7763748168945312, "rewards/margins": 0.13672712445259094, "rewards/rejected": -2.913102149963379, "step": 20 }, { "epoch": 0.053440213760855046, "grad_norm": 56.919799993589805, "learning_rate": 5.319148936170212e-07, "logits/chosen": -1.0135596990585327, "logits/rejected": -0.9844949841499329, "logps/chosen": -0.2717221677303314, "logps/rejected": -0.2782990336418152, "loss": 3.124, "rewards/accuracies": 0.5062500238418579, "rewards/chosen": -2.717221736907959, "rewards/margins": 0.06576814502477646, "rewards/rejected": -2.7829902172088623, "step": 25 }, { "epoch": 0.06412825651302605, "grad_norm": 45.796379698409524, "learning_rate": 6.382978723404255e-07, "logits/chosen": -0.9898878931999207, "logits/rejected": -0.9455238580703735, "logps/chosen": -0.2733747959136963, "logps/rejected": -0.279060035943985, "loss": 2.8977, "rewards/accuracies": 0.4437499940395355, "rewards/chosen": -2.733747959136963, "rewards/margins": 0.05685253068804741, "rewards/rejected": -2.790600299835205, "step": 30 }, { "epoch": 0.07481629926519706, "grad_norm": 64.64288788170485, "learning_rate": 7.446808510638297e-07, "logits/chosen": -1.0491113662719727, "logits/rejected": -0.9738750457763672, "logps/chosen": -0.2941775918006897, "logps/rejected": -0.32069069147109985, "loss": 2.9119, "rewards/accuracies": 0.53125, "rewards/chosen": -2.9417757987976074, "rewards/margins": 0.2651310861110687, "rewards/rejected": -3.206906795501709, "step": 35 }, { "epoch": 0.08550434201736808, "grad_norm": 60.56769615337976, "learning_rate": 8.51063829787234e-07, "logits/chosen": -1.0074384212493896, "logits/rejected": -0.963466465473175, "logps/chosen": -0.2797192931175232, "logps/rejected": -0.3225395083427429, "loss": 2.9345, "rewards/accuracies": 0.59375, "rewards/chosen": -2.7971930503845215, "rewards/margins": 0.4282020032405853, "rewards/rejected": -3.2253952026367188, "step": 40 }, { "epoch": 0.09619238476953908, "grad_norm": 48.675093440338955, "learning_rate": 9.574468085106384e-07, "logits/chosen": -1.0469945669174194, "logits/rejected": -1.0040814876556396, "logps/chosen": -0.33255186676979065, "logps/rejected": -0.38402628898620605, "loss": 2.9815, "rewards/accuracies": 0.512499988079071, "rewards/chosen": -3.3255183696746826, "rewards/margins": 0.5147446393966675, "rewards/rejected": -3.8402628898620605, "step": 45 }, { "epoch": 0.10688042752171009, "grad_norm": 92.08652708998007, "learning_rate": 9.998741174712533e-07, "logits/chosen": -1.038892388343811, "logits/rejected": -0.988103985786438, "logps/chosen": -0.34245526790618896, "logps/rejected": -0.38594862818717957, "loss": 3.0508, "rewards/accuracies": 0.5062500238418579, "rewards/chosen": -3.4245529174804688, "rewards/margins": 0.4349338412284851, "rewards/rejected": -3.8594863414764404, "step": 50 }, { "epoch": 0.11756847027388109, "grad_norm": 72.54827446103837, "learning_rate": 9.991050648838675e-07, "logits/chosen": -1.0567952394485474, "logits/rejected": -1.0215675830841064, "logps/chosen": -0.28753459453582764, "logps/rejected": -0.3490275740623474, "loss": 2.7982, "rewards/accuracies": 0.606249988079071, "rewards/chosen": -2.8753461837768555, "rewards/margins": 0.6149295568466187, "rewards/rejected": -3.4902758598327637, "step": 55 }, { "epoch": 0.1282565130260521, "grad_norm": 55.56312267177659, "learning_rate": 9.97637968732563e-07, "logits/chosen": -1.0922194719314575, "logits/rejected": -1.059291958808899, "logps/chosen": -0.3225264847278595, "logps/rejected": -0.3470703959465027, "loss": 2.8716, "rewards/accuracies": 0.5625, "rewards/chosen": -3.22526478767395, "rewards/margins": 0.2454390972852707, "rewards/rejected": -3.4707038402557373, "step": 60 }, { "epoch": 0.13894455577822312, "grad_norm": 54.05440384507174, "learning_rate": 9.954748808839674e-07, "logits/chosen": -0.9975064992904663, "logits/rejected": -0.9689160585403442, "logps/chosen": -0.37468865513801575, "logps/rejected": -0.43205341696739197, "loss": 2.7901, "rewards/accuracies": 0.5874999761581421, "rewards/chosen": -3.746886730194092, "rewards/margins": 0.5736472010612488, "rewards/rejected": -4.3205342292785645, "step": 65 }, { "epoch": 0.14963259853039412, "grad_norm": 37.66775098927071, "learning_rate": 9.926188266120295e-07, "logits/chosen": -1.0229814052581787, "logits/rejected": -0.9982998967170715, "logps/chosen": -0.3514581620693207, "logps/rejected": -0.4274352192878723, "loss": 2.8718, "rewards/accuracies": 0.5562499761581421, "rewards/chosen": -3.5145821571350098, "rewards/margins": 0.7597699761390686, "rewards/rejected": -4.274352073669434, "step": 70 }, { "epoch": 0.16032064128256512, "grad_norm": 51.934633835606974, "learning_rate": 9.890738003669027e-07, "logits/chosen": -0.9838461875915527, "logits/rejected": -0.9134309887886047, "logps/chosen": -0.35928577184677124, "logps/rejected": -0.4099213182926178, "loss": 2.8345, "rewards/accuracies": 0.5375000238418579, "rewards/chosen": -3.592857837677002, "rewards/margins": 0.5063551664352417, "rewards/rejected": -4.099213123321533, "step": 75 }, { "epoch": 0.17100868403473615, "grad_norm": 46.83578017177419, "learning_rate": 9.848447601883433e-07, "logits/chosen": -0.9681940078735352, "logits/rejected": -0.9539217948913574, "logps/chosen": -0.353752076625824, "logps/rejected": -0.4523216187953949, "loss": 2.7878, "rewards/accuracies": 0.5874999761581421, "rewards/chosen": -3.53752064704895, "rewards/margins": 0.985695481300354, "rewards/rejected": -4.523216247558594, "step": 80 }, { "epoch": 0.18169672678690715, "grad_norm": 54.174949491419966, "learning_rate": 9.799376207714444e-07, "logits/chosen": -0.9862138628959656, "logits/rejected": -0.9641338586807251, "logps/chosen": -0.3405635952949524, "logps/rejected": -0.39860305190086365, "loss": 2.6715, "rewards/accuracies": 0.637499988079071, "rewards/chosen": -3.4056358337402344, "rewards/margins": 0.5803946852684021, "rewards/rejected": -3.9860305786132812, "step": 85 }, { "epoch": 0.19238476953907815, "grad_norm": 62.18682762469074, "learning_rate": 9.743592451943998e-07, "logits/chosen": -1.028374195098877, "logits/rejected": -0.9928615689277649, "logps/chosen": -0.4192899763584137, "logps/rejected": -0.5028694868087769, "loss": 2.8803, "rewards/accuracies": 0.581250011920929, "rewards/chosen": -4.192899703979492, "rewards/margins": 0.8357950448989868, "rewards/rejected": -5.028695106506348, "step": 90 }, { "epoch": 0.20307281229124916, "grad_norm": 56.712862810919404, "learning_rate": 9.681174353198686e-07, "logits/chosen": -1.102429747581482, "logits/rejected": -1.017956256866455, "logps/chosen": -0.4515204429626465, "logps/rejected": -0.49105948209762573, "loss": 2.7854, "rewards/accuracies": 0.5562499761581421, "rewards/chosen": -4.515204429626465, "rewards/margins": 0.39539000391960144, "rewards/rejected": -4.910594463348389, "step": 95 }, { "epoch": 0.21376085504342018, "grad_norm": 77.56651991727357, "learning_rate": 9.612209208833646e-07, "logits/chosen": -1.0002816915512085, "logits/rejected": -0.9756115078926086, "logps/chosen": -0.4405655860900879, "logps/rejected": -0.5030835866928101, "loss": 2.8381, "rewards/accuracies": 0.625, "rewards/chosen": -4.405655860900879, "rewards/margins": 0.6251801252365112, "rewards/rejected": -5.0308356285095215, "step": 100 }, { "epoch": 0.22444889779559118, "grad_norm": 63.78609875386195, "learning_rate": 9.536793472839324e-07, "logits/chosen": -1.0079588890075684, "logits/rejected": -0.9540907144546509, "logps/chosen": -0.41310757398605347, "logps/rejected": -0.5235550999641418, "loss": 2.7704, "rewards/accuracies": 0.6937500238418579, "rewards/chosen": -4.131075859069824, "rewards/margins": 1.1044747829437256, "rewards/rejected": -5.235550880432129, "step": 105 }, { "epoch": 0.23513694054776219, "grad_norm": 59.92913033519696, "learning_rate": 9.455032620941839e-07, "logits/chosen": -0.9624613523483276, "logits/rejected": -0.9022065997123718, "logps/chosen": -0.4771413207054138, "logps/rejected": -0.6054214239120483, "loss": 2.6684, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -4.771413326263428, "rewards/margins": 1.2828001976013184, "rewards/rejected": -6.0542144775390625, "step": 110 }, { "epoch": 0.2458249832999332, "grad_norm": 57.71552130623015, "learning_rate": 9.367041003085648e-07, "logits/chosen": -1.0269968509674072, "logits/rejected": -0.9661616086959839, "logps/chosen": -0.5121074914932251, "logps/rejected": -0.578630268573761, "loss": 2.5559, "rewards/accuracies": 0.625, "rewards/chosen": -5.121075630187988, "rewards/margins": 0.6652273535728455, "rewards/rejected": -5.7863030433654785, "step": 115 }, { "epoch": 0.2565130260521042, "grad_norm": 64.09249680400335, "learning_rate": 9.272941683504808e-07, "logits/chosen": -0.998211681842804, "logits/rejected": -0.9050429463386536, "logps/chosen": -0.5254617929458618, "logps/rejected": -0.7217136025428772, "loss": 2.4049, "rewards/accuracies": 0.7437499761581421, "rewards/chosen": -5.2546186447143555, "rewards/margins": 1.9625177383422852, "rewards/rejected": -7.217136383056641, "step": 120 }, { "epoch": 0.26720106880427524, "grad_norm": 54.10213565718134, "learning_rate": 9.172866268606513e-07, "logits/chosen": -1.06635320186615, "logits/rejected": -1.0216171741485596, "logps/chosen": -0.5953704714775085, "logps/rejected": -0.6902128458023071, "loss": 2.3251, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -5.953704833984375, "rewards/margins": 0.9484230875968933, "rewards/rejected": -6.902127742767334, "step": 125 }, { "epoch": 0.27788911155644624, "grad_norm": 87.63946362541415, "learning_rate": 9.066954722907638e-07, "logits/chosen": -1.0916361808776855, "logits/rejected": -1.085458517074585, "logps/chosen": -0.588487446308136, "logps/rejected": -0.8501450419425964, "loss": 2.1826, "rewards/accuracies": 0.75, "rewards/chosen": -5.884873867034912, "rewards/margins": 2.6165759563446045, "rewards/rejected": -8.501450538635254, "step": 130 }, { "epoch": 0.28857715430861725, "grad_norm": 66.78226800807278, "learning_rate": 8.955355173281707e-07, "logits/chosen": -1.067176342010498, "logits/rejected": -1.0195186138153076, "logps/chosen": -0.6727192401885986, "logps/rejected": -0.8245170712471008, "loss": 2.1861, "rewards/accuracies": 0.71875, "rewards/chosen": -6.7271928787231445, "rewards/margins": 1.5179781913757324, "rewards/rejected": -8.245170593261719, "step": 135 }, { "epoch": 0.29926519706078825, "grad_norm": 77.9071558548112, "learning_rate": 8.838223701790055e-07, "logits/chosen": -1.1569595336914062, "logits/rejected": -1.1336597204208374, "logps/chosen": -0.785293698310852, "logps/rejected": -0.9337224960327148, "loss": 2.1564, "rewards/accuracies": 0.71875, "rewards/chosen": -7.852936744689941, "rewards/margins": 1.4842884540557861, "rewards/rejected": -9.337224960327148, "step": 140 }, { "epoch": 0.30995323981295925, "grad_norm": 109.08809267522398, "learning_rate": 8.71572412738697e-07, "logits/chosen": -1.0582095384597778, "logits/rejected": -1.03193998336792, "logps/chosen": -0.827114462852478, "logps/rejected": -1.0477594137191772, "loss": 2.0118, "rewards/accuracies": 0.737500011920929, "rewards/chosen": -8.27114486694336, "rewards/margins": 2.206449508666992, "rewards/rejected": -10.477594375610352, "step": 145 }, { "epoch": 0.32064128256513025, "grad_norm": 72.02715367718524, "learning_rate": 8.588027776804058e-07, "logits/chosen": -1.0821495056152344, "logits/rejected": -1.0622715950012207, "logps/chosen": -0.9100320935249329, "logps/rejected": -1.1453698873519897, "loss": 2.0273, "rewards/accuracies": 0.793749988079071, "rewards/chosen": -9.100319862365723, "rewards/margins": 2.353379011154175, "rewards/rejected": -11.453699111938477, "step": 150 }, { "epoch": 0.33132932531730125, "grad_norm": 70.83089987980944, "learning_rate": 8.455313244934324e-07, "logits/chosen": -1.092185139656067, "logits/rejected": -1.070657730102539, "logps/chosen": -0.9754332304000854, "logps/rejected": -1.2774028778076172, "loss": 2.0633, "rewards/accuracies": 0.731249988079071, "rewards/chosen": -9.754331588745117, "rewards/margins": 3.0196967124938965, "rewards/rejected": -12.774029731750488, "step": 155 }, { "epoch": 0.3420173680694723, "grad_norm": 89.85353120616982, "learning_rate": 8.317766145051057e-07, "logits/chosen": -1.107634425163269, "logits/rejected": -1.0893046855926514, "logps/chosen": -1.0988253355026245, "logps/rejected": -1.4862325191497803, "loss": 2.0523, "rewards/accuracies": 0.762499988079071, "rewards/chosen": -10.988253593444824, "rewards/margins": 3.8740711212158203, "rewards/rejected": -14.862322807312012, "step": 160 }, { "epoch": 0.3527054108216433, "grad_norm": 60.873789300571126, "learning_rate": 8.175578849210894e-07, "logits/chosen": -1.1269104480743408, "logits/rejected": -1.1010853052139282, "logps/chosen": -1.0796130895614624, "logps/rejected": -1.4461021423339844, "loss": 1.8838, "rewards/accuracies": 0.7562500238418579, "rewards/chosen": -10.796131134033203, "rewards/margins": 3.6648898124694824, "rewards/rejected": -14.461019515991211, "step": 165 }, { "epoch": 0.3633934535738143, "grad_norm": 77.23211870911884, "learning_rate": 8.028950219204099e-07, "logits/chosen": -1.1438876390457153, "logits/rejected": -1.1206210851669312, "logps/chosen": -1.0647801160812378, "logps/rejected": -1.4476187229156494, "loss": 1.8488, "rewards/accuracies": 0.78125, "rewards/chosen": -10.647802352905273, "rewards/margins": 3.828385591506958, "rewards/rejected": -14.476186752319336, "step": 170 }, { "epoch": 0.3740814963259853, "grad_norm": 97.35090322598491, "learning_rate": 7.878085328428368e-07, "logits/chosen": -1.157462239265442, "logits/rejected": -1.1056431531906128, "logps/chosen": -1.1336826086044312, "logps/rejected": -1.3956897258758545, "loss": 1.7083, "rewards/accuracies": 0.762499988079071, "rewards/chosen": -11.336827278137207, "rewards/margins": 2.620070695877075, "rewards/rejected": -13.956896781921387, "step": 175 }, { "epoch": 0.3847695390781563, "grad_norm": 70.60533034676232, "learning_rate": 7.723195175075135e-07, "logits/chosen": -1.1124871969223022, "logits/rejected": -1.0904567241668701, "logps/chosen": -1.0966602563858032, "logps/rejected": -1.4549492597579956, "loss": 1.6569, "rewards/accuracies": 0.768750011920929, "rewards/chosen": -10.966601371765137, "rewards/margins": 3.5828919410705566, "rewards/rejected": -14.549494743347168, "step": 180 }, { "epoch": 0.3954575818303273, "grad_norm": 84.92007593834019, "learning_rate": 7.564496387029531e-07, "logits/chosen": -1.1567202806472778, "logits/rejected": -1.0984870195388794, "logps/chosen": -1.1207507848739624, "logps/rejected": -1.5250511169433594, "loss": 1.6817, "rewards/accuracies": 0.84375, "rewards/chosen": -11.20750904083252, "rewards/margins": 4.043001651763916, "rewards/rejected": -15.250509262084961, "step": 185 }, { "epoch": 0.4061456245824983, "grad_norm": 82.82215861540205, "learning_rate": 7.402210918896689e-07, "logits/chosen": -1.1628299951553345, "logits/rejected": -1.170377492904663, "logps/chosen": -1.235033392906189, "logps/rejected": -1.7156970500946045, "loss": 1.5387, "rewards/accuracies": 0.84375, "rewards/chosen": -12.350334167480469, "rewards/margins": 4.806637763977051, "rewards/rejected": -17.156970977783203, "step": 190 }, { "epoch": 0.4168336673346693, "grad_norm": 70.50682719627838, "learning_rate": 7.236565741578162e-07, "logits/chosen": -1.1164333820343018, "logits/rejected": -1.0961400270462036, "logps/chosen": -1.2593460083007812, "logps/rejected": -1.6189504861831665, "loss": 1.6047, "rewards/accuracies": 0.8062499761581421, "rewards/chosen": -12.593461036682129, "rewards/margins": 3.5960440635681152, "rewards/rejected": -16.189504623413086, "step": 195 }, { "epoch": 0.42752171008684037, "grad_norm": 97.28442308133118, "learning_rate": 7.067792524832603e-07, "logits/chosen": -1.1036303043365479, "logits/rejected": -1.0911258459091187, "logps/chosen": -1.324706792831421, "logps/rejected": -1.7423721551895142, "loss": 1.5626, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -13.247068405151367, "rewards/margins": 4.176652908325195, "rewards/rejected": -17.42371940612793, "step": 200 }, { "epoch": 0.43820975283901137, "grad_norm": 85.54406338680343, "learning_rate": 6.896127313264642e-07, "logits/chosen": -1.154517650604248, "logits/rejected": -1.101162314414978, "logps/chosen": -1.40175461769104, "logps/rejected": -1.8435806035995483, "loss": 1.7321, "rewards/accuracies": 0.7875000238418579, "rewards/chosen": -14.017547607421875, "rewards/margins": 4.418261528015137, "rewards/rejected": -18.435808181762695, "step": 205 }, { "epoch": 0.44889779559118237, "grad_norm": 103.7420052940262, "learning_rate": 6.721810196195174e-07, "logits/chosen": -1.1762125492095947, "logits/rejected": -1.1645376682281494, "logps/chosen": -1.4059008359909058, "logps/rejected": -1.826703429222107, "loss": 1.6602, "rewards/accuracies": 0.84375, "rewards/chosen": -14.05901050567627, "rewards/margins": 4.208024024963379, "rewards/rejected": -18.267032623291016, "step": 210 }, { "epoch": 0.45958583834335337, "grad_norm": 115.44925865991426, "learning_rate": 6.545084971874736e-07, "logits/chosen": -1.1282669305801392, "logits/rejected": -1.1098558902740479, "logps/chosen": -1.4084670543670654, "logps/rejected": -1.8751609325408936, "loss": 1.5529, "rewards/accuracies": 0.8125, "rewards/chosen": -14.084672927856445, "rewards/margins": 4.666939735412598, "rewards/rejected": -18.751609802246094, "step": 215 }, { "epoch": 0.47027388109552437, "grad_norm": 94.83729222797992, "learning_rate": 6.3661988065096e-07, "logits/chosen": -1.191811442375183, "logits/rejected": -1.1707171201705933, "logps/chosen": -1.4761518239974976, "logps/rejected": -1.9565551280975342, "loss": 1.5052, "rewards/accuracies": 0.731249988079071, "rewards/chosen": -14.761518478393555, "rewards/margins": 4.804032325744629, "rewards/rejected": -19.5655517578125, "step": 220 }, { "epoch": 0.48096192384769537, "grad_norm": 69.08600083744463, "learning_rate": 6.185401888577487e-07, "logits/chosen": -1.171081304550171, "logits/rejected": -1.1351138353347778, "logps/chosen": -1.4978052377700806, "logps/rejected": -1.93888258934021, "loss": 1.4742, "rewards/accuracies": 0.78125, "rewards/chosen": -14.978052139282227, "rewards/margins": 4.410772800445557, "rewards/rejected": -19.388826370239258, "step": 225 }, { "epoch": 0.4916499665998664, "grad_norm": 72.13177261697588, "learning_rate": 6.002947078916364e-07, "logits/chosen": -1.245228886604309, "logits/rejected": -1.1923692226409912, "logps/chosen": -1.4302809238433838, "logps/rejected": -1.8505923748016357, "loss": 1.4317, "rewards/accuracies": 0.768750011920929, "rewards/chosen": -14.302810668945312, "rewards/margins": 4.203113555908203, "rewards/rejected": -18.505924224853516, "step": 230 }, { "epoch": 0.5023380093520374, "grad_norm": 78.71892029667256, "learning_rate": 5.819089557075688e-07, "logits/chosen": -1.2821385860443115, "logits/rejected": -1.2519080638885498, "logps/chosen": -1.4527919292449951, "logps/rejected": -1.9279251098632812, "loss": 1.4134, "rewards/accuracies": 0.793749988079071, "rewards/chosen": -14.527920722961426, "rewards/margins": 4.751331329345703, "rewards/rejected": -19.279251098632812, "step": 235 }, { "epoch": 0.5130260521042084, "grad_norm": 87.04649782214463, "learning_rate": 5.634086464424742e-07, "logits/chosen": -1.245603322982788, "logits/rejected": -1.247234582901001, "logps/chosen": -1.3713314533233643, "logps/rejected": -1.8449758291244507, "loss": 1.4346, "rewards/accuracies": 0.8062499761581421, "rewards/chosen": -13.7133150100708, "rewards/margins": 4.736443042755127, "rewards/rejected": -18.449758529663086, "step": 240 }, { "epoch": 0.5237140948563794, "grad_norm": 120.6181547874012, "learning_rate": 5.448196544517167e-07, "logits/chosen": -1.3496326208114624, "logits/rejected": -1.2907614707946777, "logps/chosen": -1.440033197402954, "logps/rejected": -2.0060055255889893, "loss": 1.4071, "rewards/accuracies": 0.8687499761581421, "rewards/chosen": -14.4003324508667, "rewards/margins": 5.659722328186035, "rewards/rejected": -20.060054779052734, "step": 245 }, { "epoch": 0.5344021376085505, "grad_norm": 127.16635817286267, "learning_rate": 5.26167978121472e-07, "logits/chosen": -1.2846823930740356, "logits/rejected": -1.2672080993652344, "logps/chosen": -1.5308005809783936, "logps/rejected": -2.1003577709198, "loss": 1.3208, "rewards/accuracies": 0.856249988079071, "rewards/chosen": -15.308004379272461, "rewards/margins": 5.695572853088379, "rewards/rejected": -21.003578186035156, "step": 250 }, { "epoch": 0.5450901803607214, "grad_norm": 306.6500775815346, "learning_rate": 5.074797035076318e-07, "logits/chosen": -1.3492941856384277, "logits/rejected": -1.3214812278747559, "logps/chosen": -1.6527442932128906, "logps/rejected": -2.1238582134246826, "loss": 1.4957, "rewards/accuracies": 0.8062499761581421, "rewards/chosen": -16.527442932128906, "rewards/margins": 4.711141586303711, "rewards/rejected": -21.238582611083984, "step": 255 }, { "epoch": 0.5557782231128925, "grad_norm": 89.88872208917493, "learning_rate": 4.887809678520975e-07, "logits/chosen": -1.3080346584320068, "logits/rejected": -1.2766286134719849, "logps/chosen": -1.5681380033493042, "logps/rejected": -2.0582587718963623, "loss": 1.4255, "rewards/accuracies": 0.8374999761581421, "rewards/chosen": -15.681379318237305, "rewards/margins": 4.901208877563477, "rewards/rejected": -20.58258819580078, "step": 260 }, { "epoch": 0.5664662658650634, "grad_norm": 86.94866969630735, "learning_rate": 4.700979230274829e-07, "logits/chosen": -1.2753899097442627, "logits/rejected": -1.2558867931365967, "logps/chosen": -1.668534278869629, "logps/rejected": -2.181380271911621, "loss": 1.4204, "rewards/accuracies": 0.8125, "rewards/chosen": -16.68534278869629, "rewards/margins": 5.1284589767456055, "rewards/rejected": -21.813800811767578, "step": 265 }, { "epoch": 0.5771543086172345, "grad_norm": 144.84472573271995, "learning_rate": 4.514566989613559e-07, "logits/chosen": -1.2718496322631836, "logits/rejected": -1.2400305271148682, "logps/chosen": -1.4821763038635254, "logps/rejected": -2.010958194732666, "loss": 1.3379, "rewards/accuracies": 0.856249988079071, "rewards/chosen": -14.82176399230957, "rewards/margins": 5.287820816040039, "rewards/rejected": -20.109582901000977, "step": 270 }, { "epoch": 0.5878423513694054, "grad_norm": 73.43309027045284, "learning_rate": 4.328833670911724e-07, "logits/chosen": -1.2501633167266846, "logits/rejected": -1.2101550102233887, "logps/chosen": -1.4654467105865479, "logps/rejected": -1.9191405773162842, "loss": 1.4963, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -14.654467582702637, "rewards/margins": 4.536937713623047, "rewards/rejected": -19.191404342651367, "step": 275 }, { "epoch": 0.5985303941215765, "grad_norm": 118.57888241178858, "learning_rate": 4.144039039010124e-07, "logits/chosen": -1.3355966806411743, "logits/rejected": -1.3093878030776978, "logps/chosen": -1.5047754049301147, "logps/rejected": -2.050473213195801, "loss": 1.3782, "rewards/accuracies": 0.8125, "rewards/chosen": -15.047755241394043, "rewards/margins": 5.456977844238281, "rewards/rejected": -20.50473403930664, "step": 280 }, { "epoch": 0.6092184368737475, "grad_norm": 100.79377019073691, "learning_rate": 3.960441545911204e-07, "logits/chosen": -1.307716965675354, "logits/rejected": -1.2712657451629639, "logps/chosen": -1.5421284437179565, "logps/rejected": -2.111297845840454, "loss": 1.1768, "rewards/accuracies": 0.887499988079071, "rewards/chosen": -15.421285629272461, "rewards/margins": 5.6916913986206055, "rewards/rejected": -21.112977981567383, "step": 285 }, { "epoch": 0.6199064796259185, "grad_norm": 102.72150408454053, "learning_rate": 3.778297969310529e-07, "logits/chosen": -1.333150863647461, "logits/rejected": -1.2860305309295654, "logps/chosen": -1.5572869777679443, "logps/rejected": -2.028750419616699, "loss": 1.3993, "rewards/accuracies": 0.831250011920929, "rewards/chosen": -15.572871208190918, "rewards/margins": 4.714633464813232, "rewards/rejected": -20.287504196166992, "step": 290 }, { "epoch": 0.6305945223780896, "grad_norm": 96.32710532692002, "learning_rate": 3.5978630534699865e-07, "logits/chosen": -1.2499698400497437, "logits/rejected": -1.2331962585449219, "logps/chosen": -1.5715104341506958, "logps/rejected": -2.08168625831604, "loss": 1.2236, "rewards/accuracies": 0.831250011920929, "rewards/chosen": -15.715105056762695, "rewards/margins": 5.101758003234863, "rewards/rejected": -20.81686019897461, "step": 295 }, { "epoch": 0.6412825651302605, "grad_norm": 86.57462147935358, "learning_rate": 3.4193891529348795e-07, "logits/chosen": -1.1837140321731567, "logits/rejected": -1.156435251235962, "logps/chosen": -1.662043809890747, "logps/rejected": -2.1141371726989746, "loss": 1.6291, "rewards/accuracies": 0.768750011920929, "rewards/chosen": -16.620437622070312, "rewards/margins": 4.52093505859375, "rewards/rejected": -21.141372680664062, "step": 300 }, { "epoch": 0.6519706078824316, "grad_norm": 80.28274687652879, "learning_rate": 3.243125879593286e-07, "logits/chosen": -1.2831798791885376, "logits/rejected": -1.2358052730560303, "logps/chosen": -1.636275053024292, "logps/rejected": -2.093479871749878, "loss": 1.3641, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -16.362751007080078, "rewards/margins": 4.572048664093018, "rewards/rejected": -20.93480110168457, "step": 305 }, { "epoch": 0.6626586506346025, "grad_norm": 96.95154393343023, "learning_rate": 3.069319753571269e-07, "logits/chosen": -1.3118140697479248, "logits/rejected": -1.2903715372085571, "logps/chosen": -1.6529546976089478, "logps/rejected": -2.148355007171631, "loss": 1.4766, "rewards/accuracies": 0.824999988079071, "rewards/chosen": -16.529544830322266, "rewards/margins": 4.954004764556885, "rewards/rejected": -21.483551025390625, "step": 310 }, { "epoch": 0.6733466933867736, "grad_norm": 91.51736686071692, "learning_rate": 2.898213858452173e-07, "logits/chosen": -1.3063311576843262, "logits/rejected": -1.2485519647598267, "logps/chosen": -1.6333932876586914, "logps/rejected": -2.1507859230041504, "loss": 1.3963, "rewards/accuracies": 0.856249988079071, "rewards/chosen": -16.333934783935547, "rewards/margins": 5.173925876617432, "rewards/rejected": -21.507858276367188, "step": 315 }, { "epoch": 0.6840347361389446, "grad_norm": 97.07913178610919, "learning_rate": 2.730047501302266e-07, "logits/chosen": -1.2934232950210571, "logits/rejected": -1.2893450260162354, "logps/chosen": -1.6584867238998413, "logps/rejected": -2.2650082111358643, "loss": 1.3115, "rewards/accuracies": 0.8687499761581421, "rewards/chosen": -16.584867477416992, "rewards/margins": 6.065215110778809, "rewards/rejected": -22.650081634521484, "step": 320 }, { "epoch": 0.6947227788911156, "grad_norm": 65.85264295945626, "learning_rate": 2.5650558779781635e-07, "logits/chosen": -1.326992392539978, "logits/rejected": -1.2749508619308472, "logps/chosen": -1.7087081670761108, "logps/rejected": -2.377331256866455, "loss": 1.307, "rewards/accuracies": 0.856249988079071, "rewards/chosen": -17.087081909179688, "rewards/margins": 6.6862287521362305, "rewards/rejected": -23.773311614990234, "step": 325 }, { "epoch": 0.7054108216432866, "grad_norm": 71.08108071468983, "learning_rate": 2.403469744184154e-07, "logits/chosen": -1.2321017980575562, "logits/rejected": -1.1879392862319946, "logps/chosen": -1.6843183040618896, "logps/rejected": -2.170222759246826, "loss": 1.3597, "rewards/accuracies": 0.824999988079071, "rewards/chosen": -16.843183517456055, "rewards/margins": 4.859041690826416, "rewards/rejected": -21.702226638793945, "step": 330 }, { "epoch": 0.7160988643954576, "grad_norm": 85.80290375242986, "learning_rate": 2.2455150927394878e-07, "logits/chosen": -1.2848079204559326, "logits/rejected": -1.2643808126449585, "logps/chosen": -1.6716737747192383, "logps/rejected": -2.2179079055786133, "loss": 1.2118, "rewards/accuracies": 0.831250011920929, "rewards/chosen": -16.716739654541016, "rewards/margins": 5.46234130859375, "rewards/rejected": -22.179079055786133, "step": 335 }, { "epoch": 0.7267869071476286, "grad_norm": 106.87884023285183, "learning_rate": 2.0914128375069722e-07, "logits/chosen": -1.3009603023529053, "logits/rejected": -1.2630964517593384, "logps/chosen": -1.5984188318252563, "logps/rejected": -2.146073579788208, "loss": 1.3799, "rewards/accuracies": 0.856249988079071, "rewards/chosen": -15.984187126159668, "rewards/margins": 5.476546287536621, "rewards/rejected": -21.460735321044922, "step": 340 }, { "epoch": 0.7374749498997996, "grad_norm": 81.57738599240237, "learning_rate": 1.9413785044249676e-07, "logits/chosen": -1.3159044981002808, "logits/rejected": -1.2908227443695068, "logps/chosen": -1.6741054058074951, "logps/rejected": -2.3151228427886963, "loss": 1.4007, "rewards/accuracies": 0.862500011920929, "rewards/chosen": -16.741056442260742, "rewards/margins": 6.4101715087890625, "rewards/rejected": -23.151227951049805, "step": 345 }, { "epoch": 0.7481629926519706, "grad_norm": 137.46788470613842, "learning_rate": 1.7956219300748792e-07, "logits/chosen": -1.3111270666122437, "logits/rejected": -1.3133299350738525, "logps/chosen": -1.55172860622406, "logps/rejected": -2.0665595531463623, "loss": 1.3291, "rewards/accuracies": 0.831250011920929, "rewards/chosen": -15.51728630065918, "rewards/margins": 5.148309707641602, "rewards/rejected": -20.66559410095215, "step": 350 }, { "epoch": 0.7588510354041417, "grad_norm": 73.0747912837978, "learning_rate": 1.6543469682057104e-07, "logits/chosen": -1.2305195331573486, "logits/rejected": -1.2432688474655151, "logps/chosen": -1.5626884698867798, "logps/rejected": -2.1072001457214355, "loss": 1.159, "rewards/accuracies": 0.856249988079071, "rewards/chosen": -15.626884460449219, "rewards/margins": 5.445114612579346, "rewards/rejected": -21.071998596191406, "step": 355 }, { "epoch": 0.7695390781563126, "grad_norm": 82.19549372560476, "learning_rate": 1.5177512046261666e-07, "logits/chosen": -1.2950479984283447, "logits/rejected": -1.2918254137039185, "logps/chosen": -1.5626431703567505, "logps/rejected": -2.192157030105591, "loss": 1.3653, "rewards/accuracies": 0.8187500238418579, "rewards/chosen": -15.626432418823242, "rewards/margins": 6.295140266418457, "rewards/rejected": -21.921573638916016, "step": 360 }, { "epoch": 0.7802271209084837, "grad_norm": 82.26556152038766, "learning_rate": 1.3860256808630427e-07, "logits/chosen": -1.3408092260360718, "logits/rejected": -1.2676836252212524, "logps/chosen": -1.621119737625122, "logps/rejected": -2.2568397521972656, "loss": 1.2936, "rewards/accuracies": 0.824999988079071, "rewards/chosen": -16.211196899414062, "rewards/margins": 6.35720157623291, "rewards/rejected": -22.56839942932129, "step": 365 }, { "epoch": 0.7909151636606546, "grad_norm": 97.91298047906564, "learning_rate": 1.2593546269723647e-07, "logits/chosen": -1.2643686532974243, "logits/rejected": -1.2498524188995361, "logps/chosen": -1.578148603439331, "logps/rejected": -2.067432403564453, "loss": 1.3095, "rewards/accuracies": 0.84375, "rewards/chosen": -15.781486511230469, "rewards/margins": 4.892836093902588, "rewards/rejected": -20.6743221282959, "step": 370 }, { "epoch": 0.8016032064128257, "grad_norm": 76.40375667456833, "learning_rate": 1.1379152038770029e-07, "logits/chosen": -1.2894870042800903, "logits/rejected": -1.2930238246917725, "logps/chosen": -1.7195911407470703, "logps/rejected": -2.293926954269409, "loss": 1.2661, "rewards/accuracies": 0.8374999761581421, "rewards/chosen": -17.195911407470703, "rewards/margins": 5.7433576583862305, "rewards/rejected": -22.939268112182617, "step": 375 }, { "epoch": 0.8122912491649966, "grad_norm": 128.55014662844385, "learning_rate": 1.0218772555910954e-07, "logits/chosen": -1.3014891147613525, "logits/rejected": -1.2802826166152954, "logps/chosen": -1.5879671573638916, "logps/rejected": -2.113447666168213, "loss": 1.4202, "rewards/accuracies": 0.8687499761581421, "rewards/chosen": -15.879669189453125, "rewards/margins": 5.25480842590332, "rewards/rejected": -21.134477615356445, "step": 380 }, { "epoch": 0.8229792919171677, "grad_norm": 77.57546829061782, "learning_rate": 9.114030716778432e-08, "logits/chosen": -1.310450792312622, "logits/rejected": -1.2848607301712036, "logps/chosen": -1.6349436044692993, "logps/rejected": -2.3224172592163086, "loss": 1.1354, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": -16.349435806274414, "rewards/margins": 6.8747382164001465, "rewards/rejected": -23.224172592163086, "step": 385 }, { "epoch": 0.8336673346693386, "grad_norm": 75.76498018298135, "learning_rate": 8.066471602728803e-08, "logits/chosen": -1.3069926500320435, "logits/rejected": -1.289568305015564, "logps/chosen": -1.699163794517517, "logps/rejected": -2.2884535789489746, "loss": 1.3105, "rewards/accuracies": 0.856249988079071, "rewards/chosen": -16.99163818359375, "rewards/margins": 5.8928985595703125, "rewards/rejected": -22.884536743164062, "step": 390 }, { "epoch": 0.8443553774215097, "grad_norm": 74.3951066976334, "learning_rate": 7.077560319906694e-08, "logits/chosen": -1.3087493181228638, "logits/rejected": -1.2855933904647827, "logps/chosen": -1.6192277669906616, "logps/rejected": -2.1721370220184326, "loss": 1.2688, "rewards/accuracies": 0.824999988079071, "rewards/chosen": -16.192277908325195, "rewards/margins": 5.5290937423706055, "rewards/rejected": -21.721370697021484, "step": 395 }, { "epoch": 0.8550434201736807, "grad_norm": 63.10639530225684, "learning_rate": 6.148679950161672e-08, "logits/chosen": -1.3169952630996704, "logits/rejected": -1.2985506057739258, "logps/chosen": -1.6467092037200928, "logps/rejected": -2.164301633834839, "loss": 1.2114, "rewards/accuracies": 0.8187500238418579, "rewards/chosen": -16.467090606689453, "rewards/margins": 5.17592716217041, "rewards/rejected": -21.643016815185547, "step": 400 }, { "epoch": 0.8550434201736807, "eval_logits/chosen": -1.4850261211395264, "eval_logits/rejected": -1.493988037109375, "eval_logps/chosen": -1.664995551109314, "eval_logps/rejected": -2.2206830978393555, "eval_loss": 1.2490928173065186, "eval_rewards/accuracies": 0.8414633870124817, "eval_rewards/chosen": -16.64995574951172, "eval_rewards/margins": 5.556875228881836, "eval_rewards/rejected": -22.206830978393555, "eval_runtime": 95.4555, "eval_samples_per_second": 20.544, "eval_steps_per_second": 1.289, "step": 400 }, { "epoch": 0.8657314629258517, "grad_norm": 102.60713365281785, "learning_rate": 5.2811296166831666e-08, "logits/chosen": -1.267327904701233, "logits/rejected": -1.2850300073623657, "logps/chosen": -1.7324796915054321, "logps/rejected": -2.2837843894958496, "loss": 1.2554, "rewards/accuracies": 0.862500011920929, "rewards/chosen": -17.32479476928711, "rewards/margins": 5.513047218322754, "rewards/rejected": -22.83784294128418, "step": 405 }, { "epoch": 0.8764195056780227, "grad_norm": 159.07261192162792, "learning_rate": 4.4761226670592066e-08, "logits/chosen": -1.2908105850219727, "logits/rejected": -1.2769014835357666, "logps/chosen": -1.6668212413787842, "logps/rejected": -2.2075092792510986, "loss": 1.3804, "rewards/accuracies": 0.84375, "rewards/chosen": -16.668210983276367, "rewards/margins": 5.406882286071777, "rewards/rejected": -22.075092315673828, "step": 410 }, { "epoch": 0.8871075484301937, "grad_norm": 76.85488373819665, "learning_rate": 3.734784976300165e-08, "logits/chosen": -1.2890928983688354, "logits/rejected": -1.2320820093154907, "logps/chosen": -1.5973718166351318, "logps/rejected": -2.237947940826416, "loss": 1.4163, "rewards/accuracies": 0.862500011920929, "rewards/chosen": -15.973716735839844, "rewards/margins": 6.40576171875, "rewards/rejected": -22.37947654724121, "step": 415 }, { "epoch": 0.8977955911823647, "grad_norm": 92.42320617715352, "learning_rate": 3.058153372200695e-08, "logits/chosen": -1.3191107511520386, "logits/rejected": -1.2656759023666382, "logps/chosen": -1.5610657930374146, "logps/rejected": -2.152204990386963, "loss": 1.2658, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": -15.610658645629883, "rewards/margins": 5.911390781402588, "rewards/rejected": -21.522048950195312, "step": 420 }, { "epoch": 0.9084836339345357, "grad_norm": 102.84147971960329, "learning_rate": 2.4471741852423233e-08, "logits/chosen": -1.3186463117599487, "logits/rejected": -1.3073859214782715, "logps/chosen": -1.736202597618103, "logps/rejected": -2.2703185081481934, "loss": 1.4248, "rewards/accuracies": 0.824999988079071, "rewards/chosen": -17.362024307250977, "rewards/margins": 5.34116268157959, "rewards/rejected": -22.70318603515625, "step": 425 }, { "epoch": 0.9191716766867067, "grad_norm": 108.04777919102577, "learning_rate": 1.9027019250647036e-08, "logits/chosen": -1.2982522249221802, "logits/rejected": -1.2813619375228882, "logps/chosen": -1.7414271831512451, "logps/rejected": -2.3307671546936035, "loss": 1.2802, "rewards/accuracies": 0.875, "rewards/chosen": -17.41427230834961, "rewards/margins": 5.893403053283691, "rewards/rejected": -23.30767250061035, "step": 430 }, { "epoch": 0.9298597194388778, "grad_norm": 85.6236171514638, "learning_rate": 1.4254980853566246e-08, "logits/chosen": -1.262458324432373, "logits/rejected": -1.2183687686920166, "logps/chosen": -1.5962882041931152, "logps/rejected": -2.182863712310791, "loss": 1.222, "rewards/accuracies": 0.8687499761581421, "rewards/chosen": -15.962882995605469, "rewards/margins": 5.865753650665283, "rewards/rejected": -21.828638076782227, "step": 435 }, { "epoch": 0.9405477621910487, "grad_norm": 88.93173263482028, "learning_rate": 1.016230078838226e-08, "logits/chosen": -1.2786242961883545, "logits/rejected": -1.2167497873306274, "logps/chosen": -1.7170331478118896, "logps/rejected": -2.2510578632354736, "loss": 1.2694, "rewards/accuracies": 0.824999988079071, "rewards/chosen": -17.170331954956055, "rewards/margins": 5.340245723724365, "rewards/rejected": -22.510578155517578, "step": 440 }, { "epoch": 0.9512358049432198, "grad_norm": 80.06878550984797, "learning_rate": 6.754703038239329e-09, "logits/chosen": -1.229853868484497, "logits/rejected": -1.2106773853302002, "logps/chosen": -1.688746690750122, "logps/rejected": -2.332123279571533, "loss": 1.1496, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": -16.88746452331543, "rewards/margins": 6.433764457702637, "rewards/rejected": -23.321231842041016, "step": 445 }, { "epoch": 0.9619238476953907, "grad_norm": 87.87225651237878, "learning_rate": 4.036953436716895e-09, "logits/chosen": -1.3426064252853394, "logits/rejected": -1.3202402591705322, "logps/chosen": -1.6350570917129517, "logps/rejected": -2.1853957176208496, "loss": 1.3199, "rewards/accuracies": 0.831250011920929, "rewards/chosen": -16.350570678710938, "rewards/margins": 5.503388404846191, "rewards/rejected": -21.853958129882812, "step": 450 }, { "epoch": 0.9726118904475618, "grad_norm": 97.584405727653, "learning_rate": 2.0128530023804656e-09, "logits/chosen": -1.3094408512115479, "logits/rejected": -1.2717828750610352, "logps/chosen": -1.6554279327392578, "logps/rejected": -2.304875373840332, "loss": 1.0871, "rewards/accuracies": 0.9125000238418579, "rewards/chosen": -16.554279327392578, "rewards/margins": 6.494471549987793, "rewards/rejected": -23.048751831054688, "step": 455 }, { "epoch": 0.9832999331997327, "grad_norm": 94.99231466494224, "learning_rate": 6.852326227130833e-10, "logits/chosen": -1.3035484552383423, "logits/rejected": -1.2918545007705688, "logps/chosen": -1.7271077632904053, "logps/rejected": -2.327470541000366, "loss": 1.2419, "rewards/accuracies": 0.8374999761581421, "rewards/chosen": -17.27107810974121, "rewards/margins": 6.003628730773926, "rewards/rejected": -23.27470588684082, "step": 460 }, { "epoch": 0.9939879759519038, "grad_norm": 86.3036149732278, "learning_rate": 5.594909486328348e-11, "logits/chosen": -1.286787986755371, "logits/rejected": -1.2908694744110107, "logps/chosen": -1.7435226440429688, "logps/rejected": -2.3610475063323975, "loss": 1.3748, "rewards/accuracies": 0.8187500238418579, "rewards/chosen": -17.435226440429688, "rewards/margins": 6.175250053405762, "rewards/rejected": -23.610477447509766, "step": 465 }, { "epoch": 0.9982631930527722, "step": 467, "total_flos": 0.0, "train_loss": 1.8360214427400707, "train_runtime": 11486.9698, "train_samples_per_second": 5.213, "train_steps_per_second": 0.041 } ], "logging_steps": 5, "max_steps": 467, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 1000000, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 0.0, "train_batch_size": 2, "trial_name": null, "trial_params": null }