{ "best_metric": null, "best_model_checkpoint": null, "epoch": 0.9985553308292401, "eval_steps": 100, "global_step": 432, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.023114706732158336, "grad_norm": 65.15836334228516, "learning_rate": 2.2727272727272726e-07, "logits/chosen": -0.335565984249115, "logits/rejected": -0.31526079773902893, "logps/chosen": -269.28985595703125, "logps/rejected": -267.5926818847656, "loss": 2.6152, "nll_loss": 0.7412666082382202, "rewards/accuracies": 0.48750001192092896, "rewards/chosen": -26.92898941040039, "rewards/margins": -0.1697184145450592, "rewards/rejected": -26.7592716217041, "step": 10 }, { "epoch": 0.04622941346431667, "grad_norm": 55.07333755493164, "learning_rate": 4.545454545454545e-07, "logits/chosen": -0.3471914827823639, "logits/rejected": -0.32920125126838684, "logps/chosen": -260.79205322265625, "logps/rejected": -267.349853515625, "loss": 2.5239, "nll_loss": 0.7186842560768127, "rewards/accuracies": 0.565625011920929, "rewards/chosen": -26.079208374023438, "rewards/margins": 0.6557787656784058, "rewards/rejected": -26.734989166259766, "step": 20 }, { "epoch": 0.06934412019647501, "grad_norm": 57.19869613647461, "learning_rate": 6.818181818181817e-07, "logits/chosen": -0.34477299451828003, "logits/rejected": -0.33347639441490173, "logps/chosen": -247.47900390625, "logps/rejected": -250.7107391357422, "loss": 2.3552, "nll_loss": 0.703576922416687, "rewards/accuracies": 0.515625, "rewards/chosen": -24.74790382385254, "rewards/margins": 0.3231719732284546, "rewards/rejected": -25.071073532104492, "step": 30 }, { "epoch": 0.09245882692863334, "grad_norm": 47.48102569580078, "learning_rate": 9.09090909090909e-07, "logits/chosen": -0.5700438618659973, "logits/rejected": -0.556909441947937, "logps/chosen": -215.1627197265625, "logps/rejected": -217.0400848388672, "loss": 2.1715, "nll_loss": 0.6503027081489563, "rewards/accuracies": 0.518750011920929, "rewards/chosen": -21.51627540588379, "rewards/margins": 0.18773558735847473, "rewards/rejected": -21.704008102416992, "step": 40 }, { "epoch": 0.11557353366079168, "grad_norm": 48.25373458862305, "learning_rate": 9.845360824742267e-07, "logits/chosen": -0.8266013264656067, "logits/rejected": -0.8015046119689941, "logps/chosen": -196.6488800048828, "logps/rejected": -195.6967010498047, "loss": 2.1841, "nll_loss": 0.5290184020996094, "rewards/accuracies": 0.503125011920929, "rewards/chosen": -19.664888381958008, "rewards/margins": -0.09521917253732681, "rewards/rejected": -19.56966781616211, "step": 50 }, { "epoch": 0.13868824039295002, "grad_norm": 55.580039978027344, "learning_rate": 9.587628865979382e-07, "logits/chosen": -0.6845192313194275, "logits/rejected": -0.689314067363739, "logps/chosen": -164.92901611328125, "logps/rejected": -165.1588592529297, "loss": 2.0022, "nll_loss": 0.4657168388366699, "rewards/accuracies": 0.53125, "rewards/chosen": -16.492902755737305, "rewards/margins": 0.022982392460107803, "rewards/rejected": -16.515884399414062, "step": 60 }, { "epoch": 0.16180294712510834, "grad_norm": 50.51268768310547, "learning_rate": 9.329896907216495e-07, "logits/chosen": -0.5050565004348755, "logits/rejected": -0.4807310998439789, "logps/chosen": -155.29498291015625, "logps/rejected": -157.361328125, "loss": 1.9881, "nll_loss": 0.44492220878601074, "rewards/accuracies": 0.5218750238418579, "rewards/chosen": -15.529500007629395, "rewards/margins": 0.20663371682167053, "rewards/rejected": -15.736132621765137, "step": 70 }, { "epoch": 0.1849176538572667, "grad_norm": 46.45564651489258, "learning_rate": 9.072164948453608e-07, "logits/chosen": -0.48326191306114197, "logits/rejected": -0.457420289516449, "logps/chosen": -158.08729553222656, "logps/rejected": -161.24571228027344, "loss": 1.8567, "nll_loss": 0.42924928665161133, "rewards/accuracies": 0.5093749761581421, "rewards/chosen": -15.80872917175293, "rewards/margins": 0.31584271788597107, "rewards/rejected": -16.124568939208984, "step": 80 }, { "epoch": 0.208032360589425, "grad_norm": 50.26318359375, "learning_rate": 8.814432989690721e-07, "logits/chosen": -0.4506359100341797, "logits/rejected": -0.43782296776771545, "logps/chosen": -152.2831573486328, "logps/rejected": -160.30429077148438, "loss": 1.7674, "nll_loss": 0.4159914553165436, "rewards/accuracies": 0.59375, "rewards/chosen": -15.228317260742188, "rewards/margins": 0.8021124005317688, "rewards/rejected": -16.03042984008789, "step": 90 }, { "epoch": 0.23114706732158335, "grad_norm": 45.81875991821289, "learning_rate": 8.556701030927834e-07, "logits/chosen": -0.40928536653518677, "logits/rejected": -0.39079341292381287, "logps/chosen": -153.24673461914062, "logps/rejected": -156.20919799804688, "loss": 1.9362, "nll_loss": 0.4179740846157074, "rewards/accuracies": 0.53125, "rewards/chosen": -15.324671745300293, "rewards/margins": 0.29624658823013306, "rewards/rejected": -15.620920181274414, "step": 100 }, { "epoch": 0.23114706732158335, "eval_logits/chosen": -0.4377523362636566, "eval_logits/rejected": -0.4122772812843323, "eval_logps/chosen": -149.33935546875, "eval_logps/rejected": -152.84754943847656, "eval_loss": 1.7930248975753784, "eval_nll_loss": 0.40668219327926636, "eval_rewards/accuracies": 0.5760869383811951, "eval_rewards/chosen": -14.933935165405273, "eval_rewards/margins": 0.35081860423088074, "eval_rewards/rejected": -15.28475284576416, "eval_runtime": 74.3015, "eval_samples_per_second": 24.576, "eval_steps_per_second": 1.548, "step": 100 }, { "epoch": 0.2542617740537417, "grad_norm": 45.55659103393555, "learning_rate": 8.298969072164948e-07, "logits/chosen": -0.38547706604003906, "logits/rejected": -0.3579915165901184, "logps/chosen": -146.1110382080078, "logps/rejected": -150.4032745361328, "loss": 1.7214, "nll_loss": 0.39803242683410645, "rewards/accuracies": 0.5625, "rewards/chosen": -14.611104011535645, "rewards/margins": 0.42922306060791016, "rewards/rejected": -15.040326118469238, "step": 110 }, { "epoch": 0.27737648078590005, "grad_norm": 44.77095031738281, "learning_rate": 8.041237113402062e-07, "logits/chosen": -0.376223623752594, "logits/rejected": -0.3552733063697815, "logps/chosen": -155.74005126953125, "logps/rejected": -157.14755249023438, "loss": 1.753, "nll_loss": 0.4237498342990875, "rewards/accuracies": 0.5406249761581421, "rewards/chosen": -15.574007034301758, "rewards/margins": 0.14074988663196564, "rewards/rejected": -15.714755058288574, "step": 120 }, { "epoch": 0.30049118751805837, "grad_norm": 54.516483306884766, "learning_rate": 7.783505154639175e-07, "logits/chosen": -0.39556393027305603, "logits/rejected": -0.3727474808692932, "logps/chosen": -152.9895477294922, "logps/rejected": -161.13479614257812, "loss": 1.8165, "nll_loss": 0.42241328954696655, "rewards/accuracies": 0.581250011920929, "rewards/chosen": -15.298955917358398, "rewards/margins": 0.8145230412483215, "rewards/rejected": -16.11347770690918, "step": 130 }, { "epoch": 0.3236058942502167, "grad_norm": 58.50905227661133, "learning_rate": 7.525773195876288e-07, "logits/chosen": -0.41800642013549805, "logits/rejected": -0.41197213530540466, "logps/chosen": -143.42355346679688, "logps/rejected": -148.9073486328125, "loss": 1.8037, "nll_loss": 0.41033467650413513, "rewards/accuracies": 0.590624988079071, "rewards/chosen": -14.342355728149414, "rewards/margins": 0.5483782291412354, "rewards/rejected": -14.890734672546387, "step": 140 }, { "epoch": 0.34672060098237506, "grad_norm": 59.64632034301758, "learning_rate": 7.268041237113402e-07, "logits/chosen": -0.40256112813949585, "logits/rejected": -0.3912666440010071, "logps/chosen": -143.48622131347656, "logps/rejected": -148.83050537109375, "loss": 1.8835, "nll_loss": 0.41666117310523987, "rewards/accuracies": 0.596875011920929, "rewards/chosen": -14.34862232208252, "rewards/margins": 0.5344293117523193, "rewards/rejected": -14.883050918579102, "step": 150 }, { "epoch": 0.3698353077145334, "grad_norm": 41.37995529174805, "learning_rate": 7.010309278350515e-07, "logits/chosen": -0.3729507327079773, "logits/rejected": -0.34710609912872314, "logps/chosen": -155.8257598876953, "logps/rejected": -159.4755096435547, "loss": 1.7067, "nll_loss": 0.41083773970603943, "rewards/accuracies": 0.5718749761581421, "rewards/chosen": -15.582575798034668, "rewards/margins": 0.36497658491134644, "rewards/rejected": -15.947550773620605, "step": 160 }, { "epoch": 0.3929500144466917, "grad_norm": 50.4566535949707, "learning_rate": 6.752577319587629e-07, "logits/chosen": -0.3252796530723572, "logits/rejected": -0.31979063153266907, "logps/chosen": -154.66848754882812, "logps/rejected": -161.5574951171875, "loss": 1.6017, "nll_loss": 0.42361512780189514, "rewards/accuracies": 0.578125, "rewards/chosen": -15.46684741973877, "rewards/margins": 0.6889010071754456, "rewards/rejected": -16.15574836730957, "step": 170 }, { "epoch": 0.41606472117885, "grad_norm": 48.24229431152344, "learning_rate": 6.494845360824742e-07, "logits/chosen": -0.3405265212059021, "logits/rejected": -0.33944639563560486, "logps/chosen": -147.56602478027344, "logps/rejected": -154.09613037109375, "loss": 1.6478, "nll_loss": 0.424372136592865, "rewards/accuracies": 0.581250011920929, "rewards/chosen": -14.756604194641113, "rewards/margins": 0.653009295463562, "rewards/rejected": -15.409612655639648, "step": 180 }, { "epoch": 0.4391794279110084, "grad_norm": 50.57717514038086, "learning_rate": 6.237113402061855e-07, "logits/chosen": -0.3636409640312195, "logits/rejected": -0.3508070111274719, "logps/chosen": -156.1150360107422, "logps/rejected": -162.10330200195312, "loss": 1.7155, "nll_loss": 0.4282284379005432, "rewards/accuracies": 0.5843750238418579, "rewards/chosen": -15.611505508422852, "rewards/margins": 0.5988240838050842, "rewards/rejected": -16.210330963134766, "step": 190 }, { "epoch": 0.4622941346431667, "grad_norm": 44.41514205932617, "learning_rate": 5.979381443298969e-07, "logits/chosen": -0.32660025358200073, "logits/rejected": -0.3209044337272644, "logps/chosen": -156.2790985107422, "logps/rejected": -162.4671173095703, "loss": 1.7019, "nll_loss": 0.4315672516822815, "rewards/accuracies": 0.596875011920929, "rewards/chosen": -15.627909660339355, "rewards/margins": 0.6188000440597534, "rewards/rejected": -16.2467098236084, "step": 200 }, { "epoch": 0.4622941346431667, "eval_logits/chosen": -0.3579607307910919, "eval_logits/rejected": -0.3357972204685211, "eval_logps/chosen": -154.3026885986328, "eval_logps/rejected": -160.1311492919922, "eval_loss": 1.678566575050354, "eval_nll_loss": 0.4193345308303833, "eval_rewards/accuracies": 0.6086956262588501, "eval_rewards/chosen": -15.430268287658691, "eval_rewards/margins": 0.5828461647033691, "eval_rewards/rejected": -16.01311492919922, "eval_runtime": 74.1864, "eval_samples_per_second": 24.614, "eval_steps_per_second": 1.55, "step": 200 }, { "epoch": 0.48540884137532503, "grad_norm": 51.62085723876953, "learning_rate": 5.721649484536082e-07, "logits/chosen": -0.3630141615867615, "logits/rejected": -0.3378238081932068, "logps/chosen": -150.49215698242188, "logps/rejected": -152.28367614746094, "loss": 1.6739, "nll_loss": 0.41899624466896057, "rewards/accuracies": 0.5249999761581421, "rewards/chosen": -15.049214363098145, "rewards/margins": 0.17915421724319458, "rewards/rejected": -15.228368759155273, "step": 210 }, { "epoch": 0.5085235481074833, "grad_norm": 49.88188552856445, "learning_rate": 5.463917525773195e-07, "logits/chosen": -0.37590575218200684, "logits/rejected": -0.3511108160018921, "logps/chosen": -159.89659118652344, "logps/rejected": -165.49131774902344, "loss": 1.7447, "nll_loss": 0.42955484986305237, "rewards/accuracies": 0.606249988079071, "rewards/chosen": -15.989659309387207, "rewards/margins": 0.5594727993011475, "rewards/rejected": -16.549131393432617, "step": 220 }, { "epoch": 0.5316382548396418, "grad_norm": 46.68313217163086, "learning_rate": 5.20618556701031e-07, "logits/chosen": -0.37392115592956543, "logits/rejected": -0.3575811982154846, "logps/chosen": -162.5522918701172, "logps/rejected": -168.78067016601562, "loss": 1.7586, "nll_loss": 0.4414497911930084, "rewards/accuracies": 0.5874999761581421, "rewards/chosen": -16.255229949951172, "rewards/margins": 0.6228369474411011, "rewards/rejected": -16.878068923950195, "step": 230 }, { "epoch": 0.5547529615718001, "grad_norm": 54.655609130859375, "learning_rate": 4.948453608247422e-07, "logits/chosen": -0.3484077453613281, "logits/rejected": -0.3337170481681824, "logps/chosen": -159.63836669921875, "logps/rejected": -164.4112091064453, "loss": 1.6017, "nll_loss": 0.4336668848991394, "rewards/accuracies": 0.574999988079071, "rewards/chosen": -15.963836669921875, "rewards/margins": 0.47728481888771057, "rewards/rejected": -16.44112205505371, "step": 240 }, { "epoch": 0.5778676683039584, "grad_norm": 50.76809310913086, "learning_rate": 4.6907216494845357e-07, "logits/chosen": -0.30525675415992737, "logits/rejected": -0.2880803048610687, "logps/chosen": -156.4806365966797, "logps/rejected": -159.4465789794922, "loss": 1.7451, "nll_loss": 0.4165531098842621, "rewards/accuracies": 0.5375000238418579, "rewards/chosen": -15.648063659667969, "rewards/margins": 0.29659539461135864, "rewards/rejected": -15.944659233093262, "step": 250 }, { "epoch": 0.6009823750361167, "grad_norm": 51.902610778808594, "learning_rate": 4.432989690721649e-07, "logits/chosen": -0.3701649308204651, "logits/rejected": -0.3554461896419525, "logps/chosen": -152.5877685546875, "logps/rejected": -160.40426635742188, "loss": 1.6025, "nll_loss": 0.4253969192504883, "rewards/accuracies": 0.59375, "rewards/chosen": -15.258776664733887, "rewards/margins": 0.7816492319107056, "rewards/rejected": -16.04042625427246, "step": 260 }, { "epoch": 0.624097081768275, "grad_norm": 44.464599609375, "learning_rate": 4.175257731958763e-07, "logits/chosen": -0.3865426182746887, "logits/rejected": -0.3753945231437683, "logps/chosen": -153.08734130859375, "logps/rejected": -159.94705200195312, "loss": 1.628, "nll_loss": 0.4174048900604248, "rewards/accuracies": 0.6156250238418579, "rewards/chosen": -15.308735847473145, "rewards/margins": 0.6859728097915649, "rewards/rejected": -15.994707107543945, "step": 270 }, { "epoch": 0.6472117885004334, "grad_norm": 50.29905700683594, "learning_rate": 3.917525773195876e-07, "logits/chosen": -0.35409292578697205, "logits/rejected": -0.3260190784931183, "logps/chosen": -154.6301727294922, "logps/rejected": -163.79635620117188, "loss": 1.6203, "nll_loss": 0.4250774383544922, "rewards/accuracies": 0.628125011920929, "rewards/chosen": -15.463017463684082, "rewards/margins": 0.916618824005127, "rewards/rejected": -16.379634857177734, "step": 280 }, { "epoch": 0.6703264952325917, "grad_norm": 54.7519416809082, "learning_rate": 3.659793814432989e-07, "logits/chosen": -0.42501506209373474, "logits/rejected": -0.39394429326057434, "logps/chosen": -159.5155487060547, "logps/rejected": -164.74307250976562, "loss": 1.5987, "nll_loss": 0.4190928339958191, "rewards/accuracies": 0.612500011920929, "rewards/chosen": -15.951556205749512, "rewards/margins": 0.5227512717247009, "rewards/rejected": -16.474306106567383, "step": 290 }, { "epoch": 0.6934412019647501, "grad_norm": 44.03036880493164, "learning_rate": 3.402061855670103e-07, "logits/chosen": -0.4323659837245941, "logits/rejected": -0.4210866391658783, "logps/chosen": -163.0435333251953, "logps/rejected": -172.29119873046875, "loss": 1.6388, "nll_loss": 0.4356729984283447, "rewards/accuracies": 0.6156250238418579, "rewards/chosen": -16.304353713989258, "rewards/margins": 0.9247667193412781, "rewards/rejected": -17.229122161865234, "step": 300 }, { "epoch": 0.6934412019647501, "eval_logits/chosen": -0.38277825713157654, "eval_logits/rejected": -0.35816264152526855, "eval_logps/chosen": -155.46498107910156, "eval_logps/rejected": -162.12692260742188, "eval_loss": 1.6232643127441406, "eval_nll_loss": 0.4229773283004761, "eval_rewards/accuracies": 0.613043487071991, "eval_rewards/chosen": -15.546499252319336, "eval_rewards/margins": 0.6661920547485352, "eval_rewards/rejected": -16.212690353393555, "eval_runtime": 74.1312, "eval_samples_per_second": 24.632, "eval_steps_per_second": 1.551, "step": 300 }, { "epoch": 0.7165559086969084, "grad_norm": 47.341087341308594, "learning_rate": 3.1443298969072163e-07, "logits/chosen": -0.4356638789176941, "logits/rejected": -0.4280335307121277, "logps/chosen": -164.1811065673828, "logps/rejected": -167.7774200439453, "loss": 1.6949, "nll_loss": 0.4244704246520996, "rewards/accuracies": 0.546875, "rewards/chosen": -16.41811180114746, "rewards/margins": 0.3596319258213043, "rewards/rejected": -16.77774429321289, "step": 310 }, { "epoch": 0.7396706154290668, "grad_norm": 43.78164291381836, "learning_rate": 2.8865979381443296e-07, "logits/chosen": -0.4178016781806946, "logits/rejected": -0.40296635031700134, "logps/chosen": -152.5771484375, "logps/rejected": -160.88571166992188, "loss": 1.6922, "nll_loss": 0.4172099232673645, "rewards/accuracies": 0.6156250238418579, "rewards/chosen": -15.257715225219727, "rewards/margins": 0.8308565020561218, "rewards/rejected": -16.088571548461914, "step": 320 }, { "epoch": 0.7627853221612251, "grad_norm": 48.753013610839844, "learning_rate": 2.6288659793814435e-07, "logits/chosen": -0.4328450560569763, "logits/rejected": -0.43247896432876587, "logps/chosen": -153.868896484375, "logps/rejected": -160.49305725097656, "loss": 1.6731, "nll_loss": 0.4279722571372986, "rewards/accuracies": 0.621874988079071, "rewards/chosen": -15.38688850402832, "rewards/margins": 0.6624161601066589, "rewards/rejected": -16.049304962158203, "step": 330 }, { "epoch": 0.7859000288933834, "grad_norm": 48.8376350402832, "learning_rate": 2.3711340206185566e-07, "logits/chosen": -0.4575740694999695, "logits/rejected": -0.44574373960494995, "logps/chosen": -157.2711944580078, "logps/rejected": -161.98927307128906, "loss": 1.5679, "nll_loss": 0.4292600154876709, "rewards/accuracies": 0.5687500238418579, "rewards/chosen": -15.727119445800781, "rewards/margins": 0.4718071520328522, "rewards/rejected": -16.198925018310547, "step": 340 }, { "epoch": 0.8090147356255417, "grad_norm": 46.211063385009766, "learning_rate": 2.11340206185567e-07, "logits/chosen": -0.44085240364074707, "logits/rejected": -0.44065386056900024, "logps/chosen": -157.3097686767578, "logps/rejected": -166.4695281982422, "loss": 1.6698, "nll_loss": 0.4102792739868164, "rewards/accuracies": 0.612500011920929, "rewards/chosen": -15.730977058410645, "rewards/margins": 0.9159765243530273, "rewards/rejected": -16.646953582763672, "step": 350 }, { "epoch": 0.8321294423577, "grad_norm": 52.41377639770508, "learning_rate": 1.8556701030927835e-07, "logits/chosen": -0.4241538941860199, "logits/rejected": -0.4094991087913513, "logps/chosen": -160.124267578125, "logps/rejected": -165.10821533203125, "loss": 1.7134, "nll_loss": 0.42789340019226074, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -16.012426376342773, "rewards/margins": 0.4983920156955719, "rewards/rejected": -16.510820388793945, "step": 360 }, { "epoch": 0.8552441490898585, "grad_norm": 48.11139678955078, "learning_rate": 1.5979381443298966e-07, "logits/chosen": -0.43041014671325684, "logits/rejected": -0.4028114676475525, "logps/chosen": -154.57138061523438, "logps/rejected": -164.22232055664062, "loss": 1.5667, "nll_loss": 0.4179977774620056, "rewards/accuracies": 0.65625, "rewards/chosen": -15.45713996887207, "rewards/margins": 0.9650918841362, "rewards/rejected": -16.422229766845703, "step": 370 }, { "epoch": 0.8783588558220168, "grad_norm": 47.23114776611328, "learning_rate": 1.3402061855670102e-07, "logits/chosen": -0.427821546792984, "logits/rejected": -0.4097885191440582, "logps/chosen": -154.52496337890625, "logps/rejected": -161.27987670898438, "loss": 1.5921, "nll_loss": 0.4322156012058258, "rewards/accuracies": 0.5874999761581421, "rewards/chosen": -15.452497482299805, "rewards/margins": 0.6754907369613647, "rewards/rejected": -16.127986907958984, "step": 380 }, { "epoch": 0.9014735625541751, "grad_norm": 55.62732696533203, "learning_rate": 1.0824742268041237e-07, "logits/chosen": -0.461261123418808, "logits/rejected": -0.44340047240257263, "logps/chosen": -157.7149658203125, "logps/rejected": -168.34735107421875, "loss": 1.6161, "nll_loss": 0.42217200994491577, "rewards/accuracies": 0.637499988079071, "rewards/chosen": -15.771496772766113, "rewards/margins": 1.0632401704788208, "rewards/rejected": -16.834735870361328, "step": 390 }, { "epoch": 0.9245882692863334, "grad_norm": 52.596492767333984, "learning_rate": 8.24742268041237e-08, "logits/chosen": -0.43360406160354614, "logits/rejected": -0.41087478399276733, "logps/chosen": -162.21621704101562, "logps/rejected": -167.1909637451172, "loss": 1.632, "nll_loss": 0.4444475769996643, "rewards/accuracies": 0.5874999761581421, "rewards/chosen": -16.221622467041016, "rewards/margins": 0.4974748194217682, "rewards/rejected": -16.719097137451172, "step": 400 }, { "epoch": 0.9245882692863334, "eval_logits/chosen": -0.40716680884361267, "eval_logits/rejected": -0.3811309337615967, "eval_logps/chosen": -156.50477600097656, "eval_logps/rejected": -163.44790649414062, "eval_loss": 1.6007416248321533, "eval_nll_loss": 0.42774829268455505, "eval_rewards/accuracies": 0.636956512928009, "eval_rewards/chosen": -15.65047550201416, "eval_rewards/margins": 0.6943140625953674, "eval_rewards/rejected": -16.344789505004883, "eval_runtime": 74.2865, "eval_samples_per_second": 24.581, "eval_steps_per_second": 1.548, "step": 400 }, { "epoch": 0.9477029760184917, "grad_norm": 50.8940315246582, "learning_rate": 5.670103092783505e-08, "logits/chosen": -0.36925220489501953, "logits/rejected": -0.35820272564888, "logps/chosen": -148.66673278808594, "logps/rejected": -157.42532348632812, "loss": 1.566, "nll_loss": 0.42418622970581055, "rewards/accuracies": 0.6156250238418579, "rewards/chosen": -14.866673469543457, "rewards/margins": 0.8758570551872253, "rewards/rejected": -15.742530822753906, "step": 410 }, { "epoch": 0.9708176827506501, "grad_norm": 44.86955642700195, "learning_rate": 3.092783505154639e-08, "logits/chosen": -0.40748652815818787, "logits/rejected": -0.383215069770813, "logps/chosen": -150.21824645996094, "logps/rejected": -155.44349670410156, "loss": 1.5783, "nll_loss": 0.4278343617916107, "rewards/accuracies": 0.5843750238418579, "rewards/chosen": -15.021825790405273, "rewards/margins": 0.5225244760513306, "rewards/rejected": -15.544349670410156, "step": 420 }, { "epoch": 0.9939323894828085, "grad_norm": 48.80271911621094, "learning_rate": 5.154639175257731e-09, "logits/chosen": -0.41907650232315063, "logits/rejected": -0.4291330873966217, "logps/chosen": -157.33888244628906, "logps/rejected": -164.2548370361328, "loss": 1.655, "nll_loss": 0.4265294075012207, "rewards/accuracies": 0.59375, "rewards/chosen": -15.733888626098633, "rewards/margins": 0.6915954351425171, "rewards/rejected": -16.425485610961914, "step": 430 }, { "epoch": 0.9985553308292401, "step": 432, "total_flos": 0.0, "train_loss": 1.77929983039697, "train_runtime": 9807.604, "train_samples_per_second": 5.646, "train_steps_per_second": 0.044 } ], "logging_steps": 10, "max_steps": 432, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 0.0, "train_batch_size": 4, "trial_name": null, "trial_params": null }