{ "best_metric": null, "best_model_checkpoint": null, "epoch": 0.9982631930527722, "eval_steps": 400, "global_step": 467, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.01068804275217101, "grad_norm": 9.442932838948966, "learning_rate": 2.127659574468085e-07, "logits/chosen": -1.0071109533309937, "logits/rejected": -0.9781900644302368, "logps/chosen": -0.2738580107688904, "logps/rejected": -0.27158379554748535, "loss": 1.0523, "rewards/accuracies": 0.4375, "rewards/chosen": -0.5477160215377808, "rewards/margins": -0.004548341501504183, "rewards/rejected": -0.5431675910949707, "step": 5 }, { "epoch": 0.02137608550434202, "grad_norm": 6.34423728622988, "learning_rate": 4.25531914893617e-07, "logits/chosen": -1.0404982566833496, "logits/rejected": -0.9738548398017883, "logps/chosen": -0.2942856252193451, "logps/rejected": -0.2995370030403137, "loss": 1.0442, "rewards/accuracies": 0.4937500059604645, "rewards/chosen": -0.5885712504386902, "rewards/margins": 0.010502670891582966, "rewards/rejected": -0.5990740060806274, "step": 10 }, { "epoch": 0.03206412825651302, "grad_norm": 6.854457761517512, "learning_rate": 6.382978723404255e-07, "logits/chosen": -0.9717105031013489, "logits/rejected": -0.9914683103561401, "logps/chosen": -0.2636018991470337, "logps/rejected": -0.3009588122367859, "loss": 1.0229, "rewards/accuracies": 0.606249988079071, "rewards/chosen": -0.5272037982940674, "rewards/margins": 0.07471387088298798, "rewards/rejected": -0.6019176244735718, "step": 15 }, { "epoch": 0.04275217100868404, "grad_norm": 16.17238672181369, "learning_rate": 8.51063829787234e-07, "logits/chosen": -0.9552351236343384, "logits/rejected": -0.9299653768539429, "logps/chosen": -0.27658405900001526, "logps/rejected": -0.2946491837501526, "loss": 1.0348, "rewards/accuracies": 0.5, "rewards/chosen": -0.5531681180000305, "rewards/margins": 0.03613026812672615, "rewards/rejected": -0.5892983675003052, "step": 20 }, { "epoch": 0.053440213760855046, "grad_norm": 7.914459513231275, "learning_rate": 1.0638297872340424e-06, "logits/chosen": -1.0123283863067627, "logits/rejected": -0.9839458465576172, "logps/chosen": -0.2764621078968048, "logps/rejected": -0.29262328147888184, "loss": 1.0216, "rewards/accuracies": 0.512499988079071, "rewards/chosen": -0.5529242157936096, "rewards/margins": 0.03232245892286301, "rewards/rejected": -0.5852465629577637, "step": 25 }, { "epoch": 0.06412825651302605, "grad_norm": 13.510536818444182, "learning_rate": 1.276595744680851e-06, "logits/chosen": -0.9960908889770508, "logits/rejected": -0.9520798921585083, "logps/chosen": -0.3060453534126282, "logps/rejected": -0.3202216625213623, "loss": 1.0213, "rewards/accuracies": 0.44999998807907104, "rewards/chosen": -0.6120907068252563, "rewards/margins": 0.028352651745080948, "rewards/rejected": -0.6404433250427246, "step": 30 }, { "epoch": 0.07481629926519706, "grad_norm": 10.603480288342643, "learning_rate": 1.4893617021276594e-06, "logits/chosen": -1.0775905847549438, "logits/rejected": -1.0043548345565796, "logps/chosen": -0.33030545711517334, "logps/rejected": -0.3744826912879944, "loss": 1.0195, "rewards/accuracies": 0.543749988079071, "rewards/chosen": -0.6606109142303467, "rewards/margins": 0.08835448324680328, "rewards/rejected": -0.7489653825759888, "step": 35 }, { "epoch": 0.08550434201736808, "grad_norm": 14.893194407448227, "learning_rate": 1.702127659574468e-06, "logits/chosen": -1.0553807020187378, "logits/rejected": -1.0140490531921387, "logps/chosen": -0.3645663559436798, "logps/rejected": -0.461661159992218, "loss": 1.0284, "rewards/accuracies": 0.5687500238418579, "rewards/chosen": -0.7291327118873596, "rewards/margins": 0.19418945908546448, "rewards/rejected": -0.923322319984436, "step": 40 }, { "epoch": 0.09619238476953908, "grad_norm": 7.362675910290458, "learning_rate": 1.9148936170212767e-06, "logits/chosen": -1.1070150136947632, "logits/rejected": -1.0679465532302856, "logps/chosen": -0.4404965341091156, "logps/rejected": -0.5644907355308533, "loss": 1.0179, "rewards/accuracies": 0.5562499761581421, "rewards/chosen": -0.8809930682182312, "rewards/margins": 0.24798834323883057, "rewards/rejected": -1.1289814710617065, "step": 45 }, { "epoch": 0.10688042752171009, "grad_norm": 16.23414874505975, "learning_rate": 1.9997482349425066e-06, "logits/chosen": -1.0770556926727295, "logits/rejected": -1.0299774408340454, "logps/chosen": -0.3946690261363983, "logps/rejected": -0.47187358140945435, "loss": 1.0123, "rewards/accuracies": 0.543749988079071, "rewards/chosen": -0.7893380522727966, "rewards/margins": 0.15440911054611206, "rewards/rejected": -0.9437471628189087, "step": 50 }, { "epoch": 0.11756847027388109, "grad_norm": 11.88283791262975, "learning_rate": 1.998210129767735e-06, "logits/chosen": -1.0645383596420288, "logits/rejected": -1.035369873046875, "logps/chosen": -0.3778243362903595, "logps/rejected": -0.48207464814186096, "loss": 0.9951, "rewards/accuracies": 0.6812499761581421, "rewards/chosen": -0.755648672580719, "rewards/margins": 0.2085006982088089, "rewards/rejected": -0.9641492962837219, "step": 55 }, { "epoch": 0.1282565130260521, "grad_norm": 10.444389026599103, "learning_rate": 1.995275937465126e-06, "logits/chosen": -1.082425594329834, "logits/rejected": -1.0538678169250488, "logps/chosen": -0.4237767159938812, "logps/rejected": -0.4713103175163269, "loss": 0.9836, "rewards/accuracies": 0.606249988079071, "rewards/chosen": -0.8475534319877625, "rewards/margins": 0.0950673446059227, "rewards/rejected": -0.9426206350326538, "step": 60 }, { "epoch": 0.13894455577822312, "grad_norm": 12.992830889875604, "learning_rate": 1.9909497617679347e-06, "logits/chosen": -0.9931782484054565, "logits/rejected": -0.9680334329605103, "logps/chosen": -0.5701107382774353, "logps/rejected": -0.7114989757537842, "loss": 0.9774, "rewards/accuracies": 0.6312500238418579, "rewards/chosen": -1.1402214765548706, "rewards/margins": 0.2827766239643097, "rewards/rejected": -1.4229979515075684, "step": 65 }, { "epoch": 0.14963259853039412, "grad_norm": 8.908123494624329, "learning_rate": 1.985237653224059e-06, "logits/chosen": -0.9891507029533386, "logits/rejected": -0.9734717607498169, "logps/chosen": -0.5873534679412842, "logps/rejected": -0.7440844774246216, "loss": 0.9571, "rewards/accuracies": 0.625, "rewards/chosen": -1.1747069358825684, "rewards/margins": 0.3134620785713196, "rewards/rejected": -1.4881689548492432, "step": 70 }, { "epoch": 0.16032064128256512, "grad_norm": 15.06224561163384, "learning_rate": 1.9781476007338054e-06, "logits/chosen": -0.9478601217269897, "logits/rejected": -0.8844977617263794, "logps/chosen": -0.6380752921104431, "logps/rejected": -0.7878230810165405, "loss": 0.9386, "rewards/accuracies": 0.606249988079071, "rewards/chosen": -1.2761505842208862, "rewards/margins": 0.29949551820755005, "rewards/rejected": -1.575646162033081, "step": 75 }, { "epoch": 0.17100868403473615, "grad_norm": 10.129109213694903, "learning_rate": 1.9696895203766866e-06, "logits/chosen": -0.9139761924743652, "logits/rejected": -0.9103153944015503, "logps/chosen": -0.7025324702262878, "logps/rejected": -0.9276626706123352, "loss": 0.8866, "rewards/accuracies": 0.706250011920929, "rewards/chosen": -1.4050649404525757, "rewards/margins": 0.45026034116744995, "rewards/rejected": -1.8553253412246704, "step": 80 }, { "epoch": 0.18169672678690715, "grad_norm": 16.035849628874075, "learning_rate": 1.9598752415428888e-06, "logits/chosen": -0.9445829391479492, "logits/rejected": -0.9311642646789551, "logps/chosen": -0.8271282315254211, "logps/rejected": -1.0663609504699707, "loss": 0.8879, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": -1.6542564630508423, "rewards/margins": 0.4784657061100006, "rewards/rejected": -2.1327219009399414, "step": 85 }, { "epoch": 0.19238476953907815, "grad_norm": 15.552471664159093, "learning_rate": 1.9487184903887996e-06, "logits/chosen": -0.9677060842514038, "logits/rejected": -0.9533635377883911, "logps/chosen": -1.1237901449203491, "logps/rejected": -1.4190008640289307, "loss": 0.9043, "rewards/accuracies": 0.71875, "rewards/chosen": -2.2475802898406982, "rewards/margins": 0.5904213786125183, "rewards/rejected": -2.8380017280578613, "step": 90 }, { "epoch": 0.20307281229124916, "grad_norm": 21.38276928877544, "learning_rate": 1.936234870639737e-06, "logits/chosen": -1.0183446407318115, "logits/rejected": -0.9617747068405151, "logps/chosen": -1.5094763040542603, "logps/rejected": -1.7956956624984741, "loss": 0.8115, "rewards/accuracies": 0.762499988079071, "rewards/chosen": -3.0189526081085205, "rewards/margins": 0.5724390745162964, "rewards/rejected": -3.5913913249969482, "step": 95 }, { "epoch": 0.21376085504342018, "grad_norm": 31.546005742023485, "learning_rate": 1.922441841766729e-06, "logits/chosen": -0.8167861104011536, "logits/rejected": -0.8134365081787109, "logps/chosen": -1.9628349542617798, "logps/rejected": -2.347581148147583, "loss": 0.841, "rewards/accuracies": 0.75, "rewards/chosen": -3.9256699085235596, "rewards/margins": 0.7694929838180542, "rewards/rejected": -4.695162296295166, "step": 100 }, { "epoch": 0.22444889779559118, "grad_norm": 31.175237667862007, "learning_rate": 1.907358694567865e-06, "logits/chosen": -0.7257764935493469, "logits/rejected": -0.682075560092926, "logps/chosen": -2.4148917198181152, "logps/rejected": -2.919673204421997, "loss": 0.8144, "rewards/accuracies": 0.75, "rewards/chosen": -4.8297834396362305, "rewards/margins": 1.0095628499984741, "rewards/rejected": -5.839346408843994, "step": 105 }, { "epoch": 0.23513694054776219, "grad_norm": 42.60812515694024, "learning_rate": 1.8910065241883678e-06, "logits/chosen": -0.5907033681869507, "logits/rejected": -0.5452768206596375, "logps/chosen": -2.7082858085632324, "logps/rejected": -3.285773515701294, "loss": 0.7803, "rewards/accuracies": 0.75, "rewards/chosen": -5.416571617126465, "rewards/margins": 1.1549749374389648, "rewards/rejected": -6.571547031402588, "step": 110 }, { "epoch": 0.2458249832999332, "grad_norm": 33.770352812549774, "learning_rate": 1.8734082006171296e-06, "logits/chosen": -0.6769031286239624, "logits/rejected": -0.6223554611206055, "logps/chosen": -2.841639995574951, "logps/rejected": -3.499586820602417, "loss": 0.7724, "rewards/accuracies": 0.7562500238418579, "rewards/chosen": -5.683279991149902, "rewards/margins": 1.3158934116363525, "rewards/rejected": -6.999173641204834, "step": 115 }, { "epoch": 0.2565130260521042, "grad_norm": 25.195574765320742, "learning_rate": 1.8545883367009615e-06, "logits/chosen": -0.7494109272956848, "logits/rejected": -0.6586568355560303, "logps/chosen": -2.6896004676818848, "logps/rejected": -3.3795294761657715, "loss": 0.7034, "rewards/accuracies": 0.762499988079071, "rewards/chosen": -5.3792009353637695, "rewards/margins": 1.3798582553863525, "rewards/rejected": -6.759058952331543, "step": 120 }, { "epoch": 0.26720106880427524, "grad_norm": 20.698310297934206, "learning_rate": 1.8345732537213026e-06, "logits/chosen": -0.8739752769470215, "logits/rejected": -0.8345277905464172, "logps/chosen": -2.600498676300049, "logps/rejected": -3.1906166076660156, "loss": 0.6515, "rewards/accuracies": 0.75, "rewards/chosen": -5.200997352600098, "rewards/margins": 1.1802361011505127, "rewards/rejected": -6.381233215332031, "step": 125 }, { "epoch": 0.27788911155644624, "grad_norm": 31.900476449074073, "learning_rate": 1.8133909445815276e-06, "logits/chosen": -0.876822829246521, "logits/rejected": -0.8683232069015503, "logps/chosen": -2.75192928314209, "logps/rejected": -3.620870590209961, "loss": 0.6498, "rewards/accuracies": 0.8062499761581421, "rewards/chosen": -5.50385856628418, "rewards/margins": 1.7378835678100586, "rewards/rejected": -7.241741180419922, "step": 130 }, { "epoch": 0.28857715430861725, "grad_norm": 30.23141141236411, "learning_rate": 1.7910710346563413e-06, "logits/chosen": -0.7084225416183472, "logits/rejected": -0.650471568107605, "logps/chosen": -3.4160752296447754, "logps/rejected": -4.176965713500977, "loss": 0.6394, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": -6.832150459289551, "rewards/margins": 1.5217812061309814, "rewards/rejected": -8.353931427001953, "step": 135 }, { "epoch": 0.29926519706078825, "grad_norm": 29.441980968776832, "learning_rate": 1.767644740358011e-06, "logits/chosen": -0.76490318775177, "logits/rejected": -0.7356737852096558, "logps/chosen": -3.500870943069458, "logps/rejected": -4.334284782409668, "loss": 0.5747, "rewards/accuracies": 0.8125, "rewards/chosen": -7.001741886138916, "rewards/margins": 1.6668283939361572, "rewards/rejected": -8.668569564819336, "step": 140 }, { "epoch": 0.30995323981295925, "grad_norm": 28.87020107784321, "learning_rate": 1.743144825477394e-06, "logits/chosen": -0.6797415614128113, "logits/rejected": -0.650688648223877, "logps/chosen": -3.6205127239227295, "logps/rejected": -4.511746406555176, "loss": 0.6507, "rewards/accuracies": 0.762499988079071, "rewards/chosen": -7.241025447845459, "rewards/margins": 1.7824666500091553, "rewards/rejected": -9.023492813110352, "step": 145 }, { "epoch": 0.32064128256513025, "grad_norm": 29.827503183327266, "learning_rate": 1.7176055553608117e-06, "logits/chosen": -0.7169264554977417, "logits/rejected": -0.6832514405250549, "logps/chosen": -3.934389114379883, "logps/rejected": -4.9375319480896, "loss": 0.6128, "rewards/accuracies": 0.8125, "rewards/chosen": -7.868778228759766, "rewards/margins": 2.0062854290008545, "rewards/rejected": -9.8750638961792, "step": 150 }, { "epoch": 0.33132932531730125, "grad_norm": 27.09179333048581, "learning_rate": 1.6910626489868648e-06, "logits/chosen": -0.8100920915603638, "logits/rejected": -0.7742663621902466, "logps/chosen": -3.824146270751953, "logps/rejected": -5.090175628662109, "loss": 0.6399, "rewards/accuracies": 0.84375, "rewards/chosen": -7.648292541503906, "rewards/margins": 2.5320582389831543, "rewards/rejected": -10.180351257324219, "step": 155 }, { "epoch": 0.3420173680694723, "grad_norm": 36.65170099175081, "learning_rate": 1.6635532290102113e-06, "logits/chosen": -0.8540701866149902, "logits/rejected": -0.8212080001831055, "logps/chosen": -4.092007637023926, "logps/rejected": -5.184715270996094, "loss": 0.5601, "rewards/accuracies": 0.8374999761581421, "rewards/chosen": -8.184015274047852, "rewards/margins": 2.185415744781494, "rewards/rejected": -10.369430541992188, "step": 160 }, { "epoch": 0.3527054108216433, "grad_norm": 44.09007725935235, "learning_rate": 1.6351157698421788e-06, "logits/chosen": -0.9053822755813599, "logits/rejected": -0.8696815371513367, "logps/chosen": -4.188479423522949, "logps/rejected": -5.3639140129089355, "loss": 0.5898, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -8.376958847045898, "rewards/margins": 2.3508691787719727, "rewards/rejected": -10.727828025817871, "step": 165 }, { "epoch": 0.3633934535738143, "grad_norm": 34.800340553634506, "learning_rate": 1.6057900438408199e-06, "logits/chosen": -0.8616800308227539, "logits/rejected": -0.8292746543884277, "logps/chosen": -4.644923686981201, "logps/rejected": -6.038055896759033, "loss": 0.5397, "rewards/accuracies": 0.862500011920929, "rewards/chosen": -9.289847373962402, "rewards/margins": 2.7862656116485596, "rewards/rejected": -12.076111793518066, "step": 170 }, { "epoch": 0.3740814963259853, "grad_norm": 34.593547384833734, "learning_rate": 1.5756170656856736e-06, "logits/chosen": -0.9542654752731323, "logits/rejected": -0.889543354511261, "logps/chosen": -4.545766830444336, "logps/rejected": -5.687682628631592, "loss": 0.5562, "rewards/accuracies": 0.8187500238418579, "rewards/chosen": -9.091533660888672, "rewards/margins": 2.28383207321167, "rewards/rejected": -11.375365257263184, "step": 175 }, { "epoch": 0.3847695390781563, "grad_norm": 22.61281693291947, "learning_rate": 1.544639035015027e-06, "logits/chosen": -0.9639078378677368, "logits/rejected": -0.9341806173324585, "logps/chosen": -4.075970649719238, "logps/rejected": -5.5132246017456055, "loss": 0.513, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": -8.151941299438477, "rewards/margins": 2.8745083808898926, "rewards/rejected": -11.026449203491211, "step": 180 }, { "epoch": 0.3954575818303273, "grad_norm": 21.446599953079577, "learning_rate": 1.5128992774059062e-06, "logits/chosen": -1.0559054613113403, "logits/rejected": -0.9924653172492981, "logps/chosen": -3.7231125831604004, "logps/rejected": -5.130820274353027, "loss": 0.4996, "rewards/accuracies": 0.875, "rewards/chosen": -7.446225166320801, "rewards/margins": 2.815417766571045, "rewards/rejected": -10.261640548706055, "step": 185 }, { "epoch": 0.4061456245824983, "grad_norm": 24.863835996393608, "learning_rate": 1.4804421837793377e-06, "logits/chosen": -0.9934264421463013, "logits/rejected": -0.9997881054878235, "logps/chosen": -4.336796760559082, "logps/rejected": -5.937041282653809, "loss": 0.4682, "rewards/accuracies": 0.862500011920929, "rewards/chosen": -8.673593521118164, "rewards/margins": 3.200488328933716, "rewards/rejected": -11.874082565307617, "step": 190 }, { "epoch": 0.4168336673346693, "grad_norm": 34.56272131407248, "learning_rate": 1.4473131483156324e-06, "logits/chosen": -0.8811644315719604, "logits/rejected": -0.8515303730964661, "logps/chosen": -5.209665298461914, "logps/rejected": -6.913350582122803, "loss": 0.515, "rewards/accuracies": 0.887499988079071, "rewards/chosen": -10.419330596923828, "rewards/margins": 3.4073710441589355, "rewards/rejected": -13.826701164245605, "step": 195 }, { "epoch": 0.42752171008684037, "grad_norm": 26.404593181307447, "learning_rate": 1.4135585049665206e-06, "logits/chosen": -0.8241022825241089, "logits/rejected": -0.7840823531150818, "logps/chosen": -5.047942161560059, "logps/rejected": -6.955193996429443, "loss": 0.4519, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": -10.095884323120117, "rewards/margins": 3.8145041465759277, "rewards/rejected": -13.910387992858887, "step": 200 }, { "epoch": 0.43820975283901137, "grad_norm": 35.5838299296831, "learning_rate": 1.3792254626529285e-06, "logits/chosen": -0.8618327975273132, "logits/rejected": -0.7756074666976929, "logps/chosen": -5.758598327636719, "logps/rejected": -7.596462249755859, "loss": 0.5778, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": -11.517196655273438, "rewards/margins": 3.6757278442382812, "rewards/rejected": -15.192924499511719, "step": 205 }, { "epoch": 0.44889779559118237, "grad_norm": 42.32040382898782, "learning_rate": 1.3443620392390349e-06, "logits/chosen": -0.9941180944442749, "logits/rejected": -0.9657033085823059, "logps/chosen": -4.432991981506348, "logps/rejected": -6.000949859619141, "loss": 0.495, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": -8.865983963012695, "rewards/margins": 3.135915756225586, "rewards/rejected": -12.001899719238281, "step": 210 }, { "epoch": 0.45958583834335337, "grad_norm": 28.859222169675768, "learning_rate": 1.3090169943749473e-06, "logits/chosen": -0.948104739189148, "logits/rejected": -0.9129034280776978, "logps/chosen": -3.579448699951172, "logps/rejected": -5.187192440032959, "loss": 0.4532, "rewards/accuracies": 0.875, "rewards/chosen": -7.158897399902344, "rewards/margins": 3.2154877185821533, "rewards/rejected": -10.374384880065918, "step": 215 }, { "epoch": 0.47027388109552437, "grad_norm": 33.6510053739595, "learning_rate": 1.27323976130192e-06, "logits/chosen": -0.9587677121162415, "logits/rejected": -0.9107363820075989, "logps/chosen": -4.461714744567871, "logps/rejected": -6.2298054695129395, "loss": 0.3885, "rewards/accuracies": 0.856249988079071, "rewards/chosen": -8.923429489135742, "rewards/margins": 3.536180019378662, "rewards/rejected": -12.459610939025879, "step": 220 }, { "epoch": 0.48096192384769537, "grad_norm": 39.16622543078335, "learning_rate": 1.2370803777154975e-06, "logits/chosen": -0.7982478141784668, "logits/rejected": -0.7258783578872681, "logps/chosen": -7.227081298828125, "logps/rejected": -9.01085090637207, "loss": 0.5453, "rewards/accuracies": 0.8062499761581421, "rewards/chosen": -14.45416259765625, "rewards/margins": 3.5675411224365234, "rewards/rejected": -18.02170181274414, "step": 225 }, { "epoch": 0.4916499665998664, "grad_norm": 28.294600400326075, "learning_rate": 1.2005894157832728e-06, "logits/chosen": -0.9068690538406372, "logits/rejected": -0.8007113337516785, "logps/chosen": -5.985177516937256, "logps/rejected": -8.007855415344238, "loss": 0.4459, "rewards/accuracies": 0.90625, "rewards/chosen": -11.970355033874512, "rewards/margins": 4.045356750488281, "rewards/rejected": -16.015710830688477, "step": 230 }, { "epoch": 0.5023380093520374, "grad_norm": 26.428195821183824, "learning_rate": 1.1638179114151377e-06, "logits/chosen": -1.0134648084640503, "logits/rejected": -0.9478827714920044, "logps/chosen": -4.030945301055908, "logps/rejected": -5.84409761428833, "loss": 0.4607, "rewards/accuracies": 0.84375, "rewards/chosen": -8.061890602111816, "rewards/margins": 3.6263041496276855, "rewards/rejected": -11.68819522857666, "step": 235 }, { "epoch": 0.5130260521042084, "grad_norm": 28.506424636352925, "learning_rate": 1.1268172928849485e-06, "logits/chosen": -1.0107872486114502, "logits/rejected": -0.9833100438117981, "logps/chosen": -3.623994827270508, "logps/rejected": -5.339346885681152, "loss": 0.4664, "rewards/accuracies": 0.856249988079071, "rewards/chosen": -7.247989654541016, "rewards/margins": 3.4307048320770264, "rewards/rejected": -10.678693771362305, "step": 240 }, { "epoch": 0.5237140948563794, "grad_norm": 37.9874271990268, "learning_rate": 1.0896393089034335e-06, "logits/chosen": -1.0698987245559692, "logits/rejected": -0.9614090919494629, "logps/chosen": -4.2720537185668945, "logps/rejected": -6.518821716308594, "loss": 0.3759, "rewards/accuracies": 0.925000011920929, "rewards/chosen": -8.544107437133789, "rewards/margins": 4.493536472320557, "rewards/rejected": -13.037643432617188, "step": 245 }, { "epoch": 0.5344021376085505, "grad_norm": 37.6233219867946, "learning_rate": 1.052335956242944e-06, "logits/chosen": -0.9640167355537415, "logits/rejected": -0.9025171399116516, "logps/chosen": -5.073387622833252, "logps/rejected": -7.112657070159912, "loss": 0.3989, "rewards/accuracies": 0.893750011920929, "rewards/chosen": -10.146775245666504, "rewards/margins": 4.078539848327637, "rewards/rejected": -14.225314140319824, "step": 250 }, { "epoch": 0.5450901803607214, "grad_norm": 35.09471619941238, "learning_rate": 1.0149594070152636e-06, "logits/chosen": -0.9901530146598816, "logits/rejected": -0.9247368574142456, "logps/chosen": -6.148016452789307, "logps/rejected": -8.221637725830078, "loss": 0.4697, "rewards/accuracies": 0.862500011920929, "rewards/chosen": -12.296032905578613, "rewards/margins": 4.147244930267334, "rewards/rejected": -16.443275451660156, "step": 255 }, { "epoch": 0.5557782231128925, "grad_norm": 39.356165818725984, "learning_rate": 9.77561935704195e-07, "logits/chosen": -0.9357139468193054, "logits/rejected": -0.858476459980011, "logps/chosen": -6.003566741943359, "logps/rejected": -8.099205017089844, "loss": 0.4241, "rewards/accuracies": 0.8812500238418579, "rewards/chosen": -12.007133483886719, "rewards/margins": 4.191277027130127, "rewards/rejected": -16.198410034179688, "step": 260 }, { "epoch": 0.5664662658650634, "grad_norm": 20.857149706425567, "learning_rate": 9.401958460549657e-07, "logits/chosen": -0.8877873420715332, "logits/rejected": -0.8332953453063965, "logps/chosen": -5.713176250457764, "logps/rejected": -7.9226484298706055, "loss": 0.4085, "rewards/accuracies": 0.9125000238418579, "rewards/chosen": -11.426352500915527, "rewards/margins": 4.418946266174316, "rewards/rejected": -15.845296859741211, "step": 265 }, { "epoch": 0.5771543086172345, "grad_norm": 37.11096746877866, "learning_rate": 9.029133979227118e-07, "logits/chosen": -0.9584988355636597, "logits/rejected": -0.9051562547683716, "logps/chosen": -4.586709976196289, "logps/rejected": -6.5038323402404785, "loss": 0.4022, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": -9.173419952392578, "rewards/margins": 3.8342444896698, "rewards/rejected": -13.007664680480957, "step": 270 }, { "epoch": 0.5878423513694054, "grad_norm": 25.04133162285963, "learning_rate": 8.657667341823448e-07, "logits/chosen": -0.9564048051834106, "logits/rejected": -0.8701663017272949, "logps/chosen": -4.893515586853027, "logps/rejected": -6.940362453460693, "loss": 0.4312, "rewards/accuracies": 0.8812500238418579, "rewards/chosen": -9.787031173706055, "rewards/margins": 4.093693256378174, "rewards/rejected": -13.880724906921387, "step": 275 }, { "epoch": 0.5985303941215765, "grad_norm": 31.64139590058085, "learning_rate": 8.288078078020249e-07, "logits/chosen": -1.0176098346710205, "logits/rejected": -0.9464299082756042, "logps/chosen": -5.894881248474121, "logps/rejected": -8.109701156616211, "loss": 0.4212, "rewards/accuracies": 0.8812500238418579, "rewards/chosen": -11.789762496948242, "rewards/margins": 4.429640769958496, "rewards/rejected": -16.219402313232422, "step": 280 }, { "epoch": 0.6092184368737475, "grad_norm": 34.98704174006504, "learning_rate": 7.920883091822408e-07, "logits/chosen": -1.0222933292388916, "logits/rejected": -0.9283574223518372, "logps/chosen": -5.977299213409424, "logps/rejected": -8.55643081665039, "loss": 0.3473, "rewards/accuracies": 0.887499988079071, "rewards/chosen": -11.954598426818848, "rewards/margins": 5.158264636993408, "rewards/rejected": -17.11286163330078, "step": 285 }, { "epoch": 0.6199064796259185, "grad_norm": 43.7429550932754, "learning_rate": 7.556595938621058e-07, "logits/chosen": -1.0368258953094482, "logits/rejected": -0.9450758099555969, "logps/chosen": -6.416205406188965, "logps/rejected": -8.702176094055176, "loss": 0.4135, "rewards/accuracies": 0.893750011920929, "rewards/chosen": -12.83241081237793, "rewards/margins": 4.571939945220947, "rewards/rejected": -17.40435218811035, "step": 290 }, { "epoch": 0.6305945223780896, "grad_norm": 38.571708947108014, "learning_rate": 7.195726106939973e-07, "logits/chosen": -1.0127325057983398, "logits/rejected": -0.9613968729972839, "logps/chosen": -6.0891900062561035, "logps/rejected": -8.455511093139648, "loss": 0.3415, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": -12.178380012512207, "rewards/margins": 4.73264217376709, "rewards/rejected": -16.911022186279297, "step": 295 }, { "epoch": 0.6412825651302605, "grad_norm": 37.02547097442152, "learning_rate": 6.838778305869759e-07, "logits/chosen": -0.9378641247749329, "logits/rejected": -0.8806314468383789, "logps/chosen": -6.423588752746582, "logps/rejected": -8.611102104187012, "loss": 0.4404, "rewards/accuracies": 0.862500011920929, "rewards/chosen": -12.847177505493164, "rewards/margins": 4.375027656555176, "rewards/rejected": -17.222204208374023, "step": 300 }, { "epoch": 0.6519706078824316, "grad_norm": 35.55070245031894, "learning_rate": 6.486251759186572e-07, "logits/chosen": -1.0858322381973267, "logits/rejected": -0.9954659342765808, "logps/chosen": -5.805714130401611, "logps/rejected": -7.78420877456665, "loss": 0.4396, "rewards/accuracies": 0.831250011920929, "rewards/chosen": -11.611428260803223, "rewards/margins": 3.956988573074341, "rewards/rejected": -15.5684175491333, "step": 305 }, { "epoch": 0.6626586506346025, "grad_norm": 51.53731628000405, "learning_rate": 6.138639507142538e-07, "logits/chosen": -1.175060749053955, "logits/rejected": -1.1142823696136475, "logps/chosen": -5.7005181312561035, "logps/rejected": -7.967810153961182, "loss": 0.4227, "rewards/accuracies": 0.9125000238418579, "rewards/chosen": -11.401036262512207, "rewards/margins": 4.534584999084473, "rewards/rejected": -15.935620307922363, "step": 310 }, { "epoch": 0.6733466933867736, "grad_norm": 29.428644028564324, "learning_rate": 5.796427716904346e-07, "logits/chosen": -1.1236612796783447, "logits/rejected": -1.0238118171691895, "logps/chosen": -6.259681701660156, "logps/rejected": -8.45996379852295, "loss": 0.3742, "rewards/accuracies": 0.875, "rewards/chosen": -12.519363403320312, "rewards/margins": 4.400565147399902, "rewards/rejected": -16.9199275970459, "step": 315 }, { "epoch": 0.6840347361389446, "grad_norm": 38.71321431370745, "learning_rate": 5.460095002604532e-07, "logits/chosen": -1.11953866481781, "logits/rejected": -1.0796916484832764, "logps/chosen": -6.55707311630249, "logps/rejected": -9.187610626220703, "loss": 0.3626, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": -13.11414623260498, "rewards/margins": 5.2610764503479, "rewards/rejected": -18.375221252441406, "step": 320 }, { "epoch": 0.6947227788911156, "grad_norm": 38.929986299465604, "learning_rate": 5.130111755956327e-07, "logits/chosen": -1.1838449239730835, "logits/rejected": -1.0870417356491089, "logps/chosen": -6.676375389099121, "logps/rejected": -9.317723274230957, "loss": 0.4211, "rewards/accuracies": 0.875, "rewards/chosen": -13.352750778198242, "rewards/margins": 5.282693862915039, "rewards/rejected": -18.635446548461914, "step": 325 }, { "epoch": 0.7054108216432866, "grad_norm": 26.360971338492213, "learning_rate": 4.806939488368308e-07, "logits/chosen": -1.0527994632720947, "logits/rejected": -0.9714158177375793, "logps/chosen": -6.790243625640869, "logps/rejected": -8.82271671295166, "loss": 0.3754, "rewards/accuracies": 0.8812500238418579, "rewards/chosen": -13.580487251281738, "rewards/margins": 4.06494665145874, "rewards/rejected": -17.64543342590332, "step": 330 }, { "epoch": 0.7160988643954576, "grad_norm": 37.25228754273986, "learning_rate": 4.4910301854789755e-07, "logits/chosen": -1.092002511024475, "logits/rejected": -1.0370265245437622, "logps/chosen": -6.746194362640381, "logps/rejected": -8.957503318786621, "loss": 0.379, "rewards/accuracies": 0.9125000238418579, "rewards/chosen": -13.492388725280762, "rewards/margins": 4.4226179122924805, "rewards/rejected": -17.915006637573242, "step": 335 }, { "epoch": 0.7267869071476286, "grad_norm": 41.78732477890408, "learning_rate": 4.1828256750139443e-07, "logits/chosen": -1.15060555934906, "logits/rejected": -1.0927339792251587, "logps/chosen": -6.618721008300781, "logps/rejected": -8.740182876586914, "loss": 0.4272, "rewards/accuracies": 0.856249988079071, "rewards/chosen": -13.237442016601562, "rewards/margins": 4.242924213409424, "rewards/rejected": -17.480365753173828, "step": 340 }, { "epoch": 0.7374749498997996, "grad_norm": 31.334898714386284, "learning_rate": 3.882757008849935e-07, "logits/chosen": -1.1759268045425415, "logits/rejected": -1.125778317451477, "logps/chosen": -7.186532020568848, "logps/rejected": -9.452940940856934, "loss": 0.3551, "rewards/accuracies": 0.925000011920929, "rewards/chosen": -14.373064041137695, "rewards/margins": 4.532819747924805, "rewards/rejected": -18.905881881713867, "step": 345 }, { "epoch": 0.7481629926519706, "grad_norm": 29.63352796318247, "learning_rate": 3.5912438601497584e-07, "logits/chosen": -1.186089038848877, "logits/rejected": -1.1533267498016357, "logps/chosen": -6.283223628997803, "logps/rejected": -8.40349006652832, "loss": 0.3724, "rewards/accuracies": 0.862500011920929, "rewards/chosen": -12.566447257995605, "rewards/margins": 4.2405314445495605, "rewards/rejected": -16.80698013305664, "step": 350 }, { "epoch": 0.7588510354041417, "grad_norm": 37.40129439705042, "learning_rate": 3.308693936411421e-07, "logits/chosen": -1.0497562885284424, "logits/rejected": -1.0346195697784424, "logps/chosen": -6.789434909820557, "logps/rejected": -9.07376766204834, "loss": 0.3605, "rewards/accuracies": 0.918749988079071, "rewards/chosen": -13.578869819641113, "rewards/margins": 4.568666458129883, "rewards/rejected": -18.14753532409668, "step": 355 }, { "epoch": 0.7695390781563126, "grad_norm": 42.85252213793353, "learning_rate": 3.035502409252333e-07, "logits/chosen": -1.11203134059906, "logits/rejected": -1.0642902851104736, "logps/chosen": -6.502237796783447, "logps/rejected": -9.193612098693848, "loss": 0.4275, "rewards/accuracies": 0.893750011920929, "rewards/chosen": -13.004475593566895, "rewards/margins": 5.382746696472168, "rewards/rejected": -18.387224197387695, "step": 360 }, { "epoch": 0.7802271209084837, "grad_norm": 45.248127741114246, "learning_rate": 2.7720513617260855e-07, "logits/chosen": -1.1741015911102295, "logits/rejected": -1.0450173616409302, "logps/chosen": -6.776492118835449, "logps/rejected": -9.342794418334961, "loss": 0.3758, "rewards/accuracies": 0.8812500238418579, "rewards/chosen": -13.552984237670898, "rewards/margins": 5.132604598999023, "rewards/rejected": -18.685588836669922, "step": 365 }, { "epoch": 0.7909151636606546, "grad_norm": 70.88406643518205, "learning_rate": 2.5187092539447294e-07, "logits/chosen": -1.1018563508987427, "logits/rejected": -1.0579187870025635, "logps/chosen": -6.298445701599121, "logps/rejected": -8.556467056274414, "loss": 0.3808, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": -12.596891403198242, "rewards/margins": 4.516043663024902, "rewards/rejected": -17.112934112548828, "step": 370 }, { "epoch": 0.8016032064128257, "grad_norm": 41.80456248679069, "learning_rate": 2.2758304077540058e-07, "logits/chosen": -1.1480379104614258, "logits/rejected": -1.1150692701339722, "logps/chosen": -6.318451881408691, "logps/rejected": -8.656303405761719, "loss": 0.3586, "rewards/accuracies": 0.893750011920929, "rewards/chosen": -12.636903762817383, "rewards/margins": 4.675703525543213, "rewards/rejected": -17.312606811523438, "step": 375 }, { "epoch": 0.8122912491649966, "grad_norm": 31.017236490830967, "learning_rate": 2.043754511182191e-07, "logits/chosen": -1.1511554718017578, "logits/rejected": -1.0976629257202148, "logps/chosen": -6.138351917266846, "logps/rejected": -8.664915084838867, "loss": 0.4, "rewards/accuracies": 0.918749988079071, "rewards/chosen": -12.276703834533691, "rewards/margins": 5.053128242492676, "rewards/rejected": -17.329830169677734, "step": 380 }, { "epoch": 0.8229792919171677, "grad_norm": 26.351372088988093, "learning_rate": 1.8228061433556864e-07, "logits/chosen": -1.1164242029190063, "logits/rejected": -1.0599582195281982, "logps/chosen": -6.1393351554870605, "logps/rejected": -8.908954620361328, "loss": 0.3271, "rewards/accuracies": 0.887499988079071, "rewards/chosen": -12.278670310974121, "rewards/margins": 5.539238929748535, "rewards/rejected": -17.817909240722656, "step": 385 }, { "epoch": 0.8336673346693386, "grad_norm": 40.004488570738765, "learning_rate": 1.6132943205457606e-07, "logits/chosen": -1.1820439100265503, "logits/rejected": -1.1261646747589111, "logps/chosen": -6.401742458343506, "logps/rejected": -8.99330997467041, "loss": 0.4273, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": -12.803484916687012, "rewards/margins": 5.183135032653809, "rewards/rejected": -17.98661994934082, "step": 390 }, { "epoch": 0.8443553774215097, "grad_norm": 41.3303995282676, "learning_rate": 1.415512063981339e-07, "logits/chosen": -1.1933691501617432, "logits/rejected": -1.143477201461792, "logps/chosen": -6.095961093902588, "logps/rejected": -8.315205574035645, "loss": 0.3615, "rewards/accuracies": 0.8812500238418579, "rewards/chosen": -12.191922187805176, "rewards/margins": 4.438488960266113, "rewards/rejected": -16.63041114807129, "step": 395 }, { "epoch": 0.8550434201736807, "grad_norm": 30.146673376540157, "learning_rate": 1.2297359900323344e-07, "logits/chosen": -1.185856819152832, "logits/rejected": -1.149908423423767, "logps/chosen": -6.064610958099365, "logps/rejected": -8.274811744689941, "loss": 0.3805, "rewards/accuracies": 0.887499988079071, "rewards/chosen": -12.12922191619873, "rewards/margins": 4.420398712158203, "rewards/rejected": -16.549623489379883, "step": 400 }, { "epoch": 0.8550434201736807, "eval_logits/chosen": -1.3878380060195923, "eval_logits/rejected": -1.3844929933547974, "eval_logps/chosen": -5.970302104949951, "eval_logps/rejected": -8.178492546081543, "eval_loss": 0.34991469979286194, "eval_rewards/accuracies": 0.9004064798355103, "eval_rewards/chosen": -11.940604209899902, "eval_rewards/margins": 4.416379928588867, "eval_rewards/rejected": -16.356985092163086, "eval_runtime": 98.864, "eval_samples_per_second": 19.835, "eval_steps_per_second": 1.244, "step": 400 }, { "epoch": 0.8657314629258517, "grad_norm": 39.23606930955491, "learning_rate": 1.0562259233366333e-07, "logits/chosen": -1.1601266860961914, "logits/rejected": -1.1533467769622803, "logps/chosen": -6.3432416915893555, "logps/rejected": -8.685356140136719, "loss": 0.3527, "rewards/accuracies": 0.8812500238418579, "rewards/chosen": -12.686483383178711, "rewards/margins": 4.684228420257568, "rewards/rejected": -17.370712280273438, "step": 405 }, { "epoch": 0.8764195056780227, "grad_norm": 44.84060293631811, "learning_rate": 8.952245334118413e-08, "logits/chosen": -1.1762316226959229, "logits/rejected": -1.1400468349456787, "logps/chosen": -5.951014041900635, "logps/rejected": -8.487456321716309, "loss": 0.372, "rewards/accuracies": 0.8812500238418579, "rewards/chosen": -11.90202808380127, "rewards/margins": 5.072883605957031, "rewards/rejected": -16.974912643432617, "step": 410 }, { "epoch": 0.8871075484301937, "grad_norm": 31.58697079899106, "learning_rate": 7.46956995260033e-08, "logits/chosen": -1.1965105533599854, "logits/rejected": -1.0948525667190552, "logps/chosen": -5.939952373504639, "logps/rejected": -8.576761245727539, "loss": 0.3642, "rewards/accuracies": 0.887499988079071, "rewards/chosen": -11.879904747009277, "rewards/margins": 5.273618698120117, "rewards/rejected": -17.153522491455078, "step": 415 }, { "epoch": 0.8977955911823647, "grad_norm": 56.62718923940337, "learning_rate": 6.11630674440139e-08, "logits/chosen": -1.2364650964736938, "logits/rejected": -1.1493674516677856, "logps/chosen": -5.8380866050720215, "logps/rejected": -8.528668403625488, "loss": 0.3543, "rewards/accuracies": 0.893750011920929, "rewards/chosen": -11.676173210144043, "rewards/margins": 5.381163597106934, "rewards/rejected": -17.057336807250977, "step": 420 }, { "epoch": 0.9084836339345357, "grad_norm": 26.88857335924454, "learning_rate": 4.8943483704846465e-08, "logits/chosen": -1.2132270336151123, "logits/rejected": -1.1822996139526367, "logps/chosen": -6.329747200012207, "logps/rejected": -8.68973159790039, "loss": 0.378, "rewards/accuracies": 0.90625, "rewards/chosen": -12.659494400024414, "rewards/margins": 4.719969749450684, "rewards/rejected": -17.37946319580078, "step": 425 }, { "epoch": 0.9191716766867067, "grad_norm": 29.204672971590583, "learning_rate": 3.805403850129407e-08, "logits/chosen": -1.1887871026992798, "logits/rejected": -1.1395562887191772, "logps/chosen": -6.298637866973877, "logps/rejected": -8.703396797180176, "loss": 0.3701, "rewards/accuracies": 0.90625, "rewards/chosen": -12.597275733947754, "rewards/margins": 4.809514999389648, "rewards/rejected": -17.40679359436035, "step": 430 }, { "epoch": 0.9298597194388778, "grad_norm": 41.83119701192464, "learning_rate": 2.8509961707132492e-08, "logits/chosen": -1.1526520252227783, "logits/rejected": -1.087210774421692, "logps/chosen": -5.99376106262207, "logps/rejected": -8.27347183227539, "loss": 0.3539, "rewards/accuracies": 0.893750011920929, "rewards/chosen": -11.98752212524414, "rewards/margins": 4.559422492980957, "rewards/rejected": -16.54694366455078, "step": 435 }, { "epoch": 0.9405477621910487, "grad_norm": 27.28448585229794, "learning_rate": 2.032460157676452e-08, "logits/chosen": -1.1298894882202148, "logits/rejected": -1.049036979675293, "logps/chosen": -6.4232072830200195, "logps/rejected": -8.850305557250977, "loss": 0.3414, "rewards/accuracies": 0.918749988079071, "rewards/chosen": -12.846414566040039, "rewards/margins": 4.854195594787598, "rewards/rejected": -17.700611114501953, "step": 440 }, { "epoch": 0.9512358049432198, "grad_norm": 31.150711268639814, "learning_rate": 1.3509406076478659e-08, "logits/chosen": -1.1100740432739258, "logits/rejected": -1.0567227602005005, "logps/chosen": -6.3755292892456055, "logps/rejected": -9.159284591674805, "loss": 0.344, "rewards/accuracies": 0.9375, "rewards/chosen": -12.751058578491211, "rewards/margins": 5.567511081695557, "rewards/rejected": -18.31856918334961, "step": 445 }, { "epoch": 0.9619238476953907, "grad_norm": 33.19068830748795, "learning_rate": 8.07390687343379e-09, "logits/chosen": -1.250570297241211, "logits/rejected": -1.1990430355072021, "logps/chosen": -6.264920711517334, "logps/rejected": -8.49793815612793, "loss": 0.3294, "rewards/accuracies": 0.9125000238418579, "rewards/chosen": -12.529841423034668, "rewards/margins": 4.466032981872559, "rewards/rejected": -16.99587631225586, "step": 450 }, { "epoch": 0.9726118904475618, "grad_norm": 42.76771467797157, "learning_rate": 4.025706004760931e-09, "logits/chosen": -1.1908820867538452, "logits/rejected": -1.1271415948867798, "logps/chosen": -6.330782890319824, "logps/rejected": -9.00413703918457, "loss": 0.3373, "rewards/accuracies": 0.956250011920929, "rewards/chosen": -12.661565780639648, "rewards/margins": 5.346711158752441, "rewards/rejected": -18.00827407836914, "step": 455 }, { "epoch": 0.9832999331997327, "grad_norm": 28.947424332254975, "learning_rate": 1.3704652454261667e-09, "logits/chosen": -1.1968469619750977, "logits/rejected": -1.1597331762313843, "logps/chosen": -6.481853485107422, "logps/rejected": -9.090927124023438, "loss": 0.3515, "rewards/accuracies": 0.90625, "rewards/chosen": -12.963706970214844, "rewards/margins": 5.218146800994873, "rewards/rejected": -18.181854248046875, "step": 460 }, { "epoch": 0.9939879759519038, "grad_norm": 39.23731303488194, "learning_rate": 1.1189818972656696e-10, "logits/chosen": -1.163874864578247, "logits/rejected": -1.1393449306488037, "logps/chosen": -6.374614715576172, "logps/rejected": -9.016167640686035, "loss": 0.3467, "rewards/accuracies": 0.9125000238418579, "rewards/chosen": -12.749229431152344, "rewards/margins": 5.283105373382568, "rewards/rejected": -18.03233528137207, "step": 465 }, { "epoch": 0.9982631930527722, "step": 467, "total_flos": 0.0, "train_loss": 0.5656856803873622, "train_runtime": 11731.0657, "train_samples_per_second": 5.104, "train_steps_per_second": 0.04 } ], "logging_steps": 5, "max_steps": 467, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 1000000, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 0.0, "train_batch_size": 2, "trial_name": null, "trial_params": null }