diff --git "a/trainer_state.json" "b/trainer_state.json" --- "a/trainer_state.json" +++ "b/trainer_state.json" @@ -10,11 +10,11 @@ "log_history": [ { "epoch": 0.0008, - "grad_norm": 1.3400232791900635, + "grad_norm": 1.3400768041610718, "learning_rate": 4e-08, "logits/chosen": -2.951728105545044, "logits/rejected": -3.0115513801574707, - "logps/chosen": -261.5080261230469, + "logps/chosen": -261.50799560546875, "logps/rejected": -337.26708984375, "loss": 0.6931, "rewards/accuracies": 0.0, @@ -25,3954 +25,3954 @@ }, { "epoch": 0.004, - "grad_norm": 1.3120373487472534, + "grad_norm": 1.3155320882797241, "learning_rate": 2.0000000000000002e-07, - "logits/chosen": -2.892902374267578, - "logits/rejected": -2.8663315773010254, - "logps/chosen": -327.0978088378906, - "logps/rejected": -271.5657043457031, - "loss": 0.6929, - "rewards/accuracies": 0.421875, - "rewards/chosen": 0.0005052188062109053, - "rewards/margins": 0.0005590975051745772, - "rewards/rejected": -5.3878684411756694e-05, + "logits/chosen": -2.8931193351745605, + "logits/rejected": -2.8665506839752197, + "logps/chosen": -327.18511962890625, + "logps/rejected": -271.54595947265625, + "loss": 0.6934, + "rewards/accuracies": 0.359375, + "rewards/chosen": -0.0003679850487969816, + "rewards/margins": -0.0005117338732816279, + "rewards/rejected": 0.000143748868140392, "step": 5 }, { "epoch": 0.008, - "grad_norm": 1.4112803936004639, + "grad_norm": 1.4168583154678345, "learning_rate": 4.0000000000000003e-07, - "logits/chosen": -2.8438823223114014, - "logits/rejected": -2.8227336406707764, - "logps/chosen": -278.82293701171875, - "logps/rejected": -225.83895874023438, - "loss": 0.693, - "rewards/accuracies": 0.5625, - "rewards/chosen": -0.000398232601583004, - "rewards/margins": 0.0003281077661085874, - "rewards/rejected": -0.0007263403385877609, + "logits/chosen": -2.8454272747039795, + "logits/rejected": -2.8244102001190186, + "logps/chosen": -278.81390380859375, + "logps/rejected": -225.78091430664062, + "loss": 0.6932, + "rewards/accuracies": 0.4749999940395355, + "rewards/chosen": -0.0003080188180319965, + "rewards/margins": -0.00016189362213481218, + "rewards/rejected": -0.00014612523955293, "step": 10 }, { "epoch": 0.012, - "grad_norm": 1.446423888206482, + "grad_norm": 1.4461805820465088, "learning_rate": 6.000000000000001e-07, - "logits/chosen": -2.9413020610809326, - "logits/rejected": -2.9195456504821777, - "logps/chosen": -338.17449951171875, - "logps/rejected": -264.4447021484375, - "loss": 0.6932, - "rewards/accuracies": 0.4625000059604645, - "rewards/chosen": -0.00040648109279572964, - "rewards/margins": -0.00015654772869311273, - "rewards/rejected": -0.0002499335096217692, + "logits/chosen": -2.941542387008667, + "logits/rejected": -2.919604539871216, + "logps/chosen": -338.14361572265625, + "logps/rejected": -264.4473876953125, + "loss": 0.6931, + "rewards/accuracies": 0.44999998807907104, + "rewards/chosen": -9.753620543051511e-05, + "rewards/margins": 0.00017976768140215427, + "rewards/rejected": -0.0002773039450403303, "step": 15 }, { "epoch": 0.016, - "grad_norm": 1.2574288845062256, + "grad_norm": 1.218361735343933, "learning_rate": 8.000000000000001e-07, - "logits/chosen": -2.8474764823913574, - "logits/rejected": -2.80472993850708, - "logps/chosen": -284.499755859375, - "logps/rejected": -265.3078918457031, - "loss": 0.6929, - "rewards/accuracies": 0.574999988079071, - "rewards/chosen": 0.00043911076500080526, - "rewards/margins": 0.000450963998446241, - "rewards/rejected": -1.1853216165036429e-05, + "logits/chosen": -2.844390392303467, + "logits/rejected": -2.8012917041778564, + "logps/chosen": -284.53179931640625, + "logps/rejected": -265.3224792480469, + "loss": 0.693, + "rewards/accuracies": 0.550000011920929, + "rewards/chosen": 0.00011823275417555124, + "rewards/margins": 0.00027608583332039416, + "rewards/rejected": -0.00015785309369675815, "step": 20 }, { "epoch": 0.02, - "grad_norm": 1.0634217262268066, + "grad_norm": 1.0622657537460327, "learning_rate": 1.0000000000000002e-06, - "logits/chosen": -2.9212899208068848, - "logits/rejected": -2.8857905864715576, - "logps/chosen": -282.70477294921875, - "logps/rejected": -250.54092407226562, - "loss": 0.6931, - "rewards/accuracies": 0.5249999761581421, - "rewards/chosen": 4.3422729504527524e-05, - "rewards/margins": 0.0001133212135755457, - "rewards/rejected": -6.989858957240358e-05, + "logits/chosen": -2.919724941253662, + "logits/rejected": -2.8841071128845215, + "logps/chosen": -282.7057800292969, + "logps/rejected": -250.56005859375, + "loss": 0.693, + "rewards/accuracies": 0.4749999940395355, + "rewards/chosen": 3.2701333111617714e-05, + "rewards/margins": 0.00029431647271849215, + "rewards/rejected": -0.0002616152632981539, "step": 25 }, { "epoch": 0.024, - "grad_norm": 1.285556435585022, + "grad_norm": 1.2840291261672974, "learning_rate": 1.2000000000000002e-06, - "logits/chosen": -2.8694255352020264, - "logits/rejected": -2.821131944656372, - "logps/chosen": -248.3980255126953, - "logps/rejected": -239.7887420654297, - "loss": 0.693, - "rewards/accuracies": 0.48750001192092896, - "rewards/chosen": 0.0004804051131941378, - "rewards/margins": 0.0003015303227584809, - "rewards/rejected": 0.00017887470312416553, + "logits/chosen": -2.8690571784973145, + "logits/rejected": -2.8205409049987793, + "logps/chosen": -248.4199981689453, + "logps/rejected": -239.7508544921875, + "loss": 0.6933, + "rewards/accuracies": 0.5375000238418579, + "rewards/chosen": 0.0002605341433081776, + "rewards/margins": -0.0002968462067656219, + "rewards/rejected": 0.0005573804955929518, "step": 30 }, { "epoch": 0.028, - "grad_norm": 1.469935417175293, + "grad_norm": 1.4659631252288818, "learning_rate": 1.4000000000000001e-06, - "logits/chosen": -2.8213143348693848, - "logits/rejected": -2.830141305923462, - "logps/chosen": -260.638916015625, - "logps/rejected": -252.28701782226562, - "loss": 0.6935, + "logits/chosen": -2.819516181945801, + "logits/rejected": -2.8284599781036377, + "logps/chosen": -260.5746765136719, + "logps/rejected": -252.26657104492188, + "loss": 0.6933, "rewards/accuracies": 0.4749999940395355, - "rewards/chosen": -0.0009641313808970153, - "rewards/margins": -0.0007172044133767486, - "rewards/rejected": -0.00024692711303941905, + "rewards/chosen": -0.00032235420076176524, + "rewards/margins": -0.00027956519625149667, + "rewards/rejected": -4.278900451026857e-05, "step": 35 }, { "epoch": 0.032, - "grad_norm": 1.6270555257797241, + "grad_norm": 1.641062617301941, "learning_rate": 1.6000000000000001e-06, - "logits/chosen": -2.841813802719116, - "logits/rejected": -2.8207640647888184, - "logps/chosen": -225.63058471679688, - "logps/rejected": -254.8463592529297, - "loss": 0.6926, - "rewards/accuracies": 0.5625, - "rewards/chosen": 5.770945062977262e-05, - "rewards/margins": 0.0010281356517225504, - "rewards/rejected": -0.0009704261319711804, + "logits/chosen": -2.8422532081604004, + "logits/rejected": -2.8213276863098145, + "logps/chosen": -225.60000610351562, + "logps/rejected": -254.83389282226562, + "loss": 0.6925, + "rewards/accuracies": 0.612500011920929, + "rewards/chosen": 0.0003634319291450083, + "rewards/margins": 0.001209324225783348, + "rewards/rejected": -0.0008458923548460007, "step": 40 }, { "epoch": 0.036, - "grad_norm": 1.2644410133361816, + "grad_norm": 1.2335015535354614, "learning_rate": 1.8000000000000001e-06, - "logits/chosen": -2.8922643661499023, - "logits/rejected": -2.8955490589141846, - "logps/chosen": -262.701416015625, - "logps/rejected": -257.9744567871094, - "loss": 0.6925, + "logits/chosen": -2.8927934169769287, + "logits/rejected": -2.895987033843994, + "logps/chosen": -262.75616455078125, + "logps/rejected": -258.01776123046875, + "loss": 0.6926, "rewards/accuracies": 0.550000011920929, - "rewards/chosen": 0.0004659576225094497, - "rewards/margins": 0.0013163817347958684, - "rewards/rejected": -0.0008504241704940796, + "rewards/chosen": -8.163524034898728e-05, + "rewards/margins": 0.0012018559500575066, + "rewards/rejected": -0.0012834911467507482, "step": 45 }, { "epoch": 0.04, - "grad_norm": 1.38533353805542, + "grad_norm": 1.3828603029251099, "learning_rate": 2.0000000000000003e-06, - "logits/chosen": -2.8122503757476807, - "logits/rejected": -2.768099069595337, - "logps/chosen": -246.9253387451172, - "logps/rejected": -221.22207641601562, + "logits/chosen": -2.8105452060699463, + "logits/rejected": -2.766021966934204, + "logps/chosen": -246.88064575195312, + "logps/rejected": -221.18325805664062, "loss": 0.6929, - "rewards/accuracies": 0.48750001192092896, - "rewards/chosen": 0.00014323795039672405, - "rewards/margins": 0.0004161189717706293, - "rewards/rejected": -0.00027288100682199, + "rewards/accuracies": 0.5249999761581421, + "rewards/chosen": 0.0005901859840378165, + "rewards/margins": 0.0004752330423798412, + "rewards/rejected": 0.00011495289800222963, "step": 50 }, { "epoch": 0.044, - "grad_norm": 1.1605815887451172, + "grad_norm": 1.1620622873306274, "learning_rate": 2.2e-06, - "logits/chosen": -2.8503968715667725, - "logits/rejected": -2.8304195404052734, - "logps/chosen": -289.88092041015625, - "logps/rejected": -304.974609375, - "loss": 0.6933, - "rewards/accuracies": 0.4625000059604645, - "rewards/chosen": -0.00040247183642350137, - "rewards/margins": -0.00022780350991524756, - "rewards/rejected": -0.00017466834106016904, + "logits/chosen": -2.8504276275634766, + "logits/rejected": -2.830573558807373, + "logps/chosen": -289.9104919433594, + "logps/rejected": -304.9803771972656, + "loss": 0.6934, + "rewards/accuracies": 0.4375, + "rewards/chosen": -0.0006987753440625966, + "rewards/margins": -0.0004663577419705689, + "rewards/rejected": -0.00023241760209202766, "step": 55 }, { "epoch": 0.048, - "grad_norm": 1.273954153060913, + "grad_norm": 1.2738378047943115, "learning_rate": 2.4000000000000003e-06, - "logits/chosen": -2.89465594291687, - "logits/rejected": -2.875718832015991, - "logps/chosen": -265.1037902832031, - "logps/rejected": -274.2173767089844, - "loss": 0.6924, - "rewards/accuracies": 0.5874999761581421, - "rewards/chosen": 0.0005437713698484004, - "rewards/margins": 0.001443098415620625, - "rewards/rejected": -0.0008993271621875465, + "logits/chosen": -2.893800735473633, + "logits/rejected": -2.874782085418701, + "logps/chosen": -265.0617370605469, + "logps/rejected": -274.21246337890625, + "loss": 0.6922, + "rewards/accuracies": 0.637499988079071, + "rewards/chosen": 0.0009635955793783069, + "rewards/margins": 0.0018142672488465905, + "rewards/rejected": -0.0008506716112606227, "step": 60 }, { "epoch": 0.052, - "grad_norm": 1.0502583980560303, + "grad_norm": 1.0527548789978027, "learning_rate": 2.6e-06, - "logits/chosen": -2.862156391143799, - "logits/rejected": -2.837881565093994, - "logps/chosen": -242.22341918945312, - "logps/rejected": -258.85699462890625, - "loss": 0.6923, - "rewards/accuracies": 0.550000011920929, - "rewards/chosen": 0.000915366574190557, - "rewards/margins": 0.001800536410883069, - "rewards/rejected": -0.0008851696038618684, + "logits/chosen": -2.8645272254943848, + "logits/rejected": -2.840221881866455, + "logps/chosen": -242.2287139892578, + "logps/rejected": -258.8787536621094, + "loss": 0.6922, + "rewards/accuracies": 0.6625000238418579, + "rewards/chosen": 0.0008619900909252465, + "rewards/margins": 0.0019651155453175306, + "rewards/rejected": -0.0011031257454305887, "step": 65 }, { "epoch": 0.056, - "grad_norm": 1.203967571258545, + "grad_norm": 1.2068345546722412, "learning_rate": 2.8000000000000003e-06, - "logits/chosen": -2.8611202239990234, - "logits/rejected": -2.8584847450256348, - "logps/chosen": -256.2646179199219, - "logps/rejected": -239.5492401123047, - "loss": 0.692, - "rewards/accuracies": 0.675000011920929, - "rewards/chosen": 0.0021997836884111166, - "rewards/margins": 0.0023759277537465096, - "rewards/rejected": -0.00017614415264688432, + "logits/chosen": -2.8592796325683594, + "logits/rejected": -2.856304168701172, + "logps/chosen": -256.22979736328125, + "logps/rejected": -239.5323944091797, + "loss": 0.6919, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": 0.0025476592127233744, + "rewards/margins": 0.002555294893682003, + "rewards/rejected": -7.636076588823926e-06, "step": 70 }, { "epoch": 0.06, - "grad_norm": 1.4282208681106567, + "grad_norm": 1.4119035005569458, "learning_rate": 3e-06, - "logits/chosen": -2.921938180923462, - "logits/rejected": -2.8600423336029053, - "logps/chosen": -286.9979553222656, - "logps/rejected": -258.0650939941406, - "loss": 0.6917, + "logits/chosen": -2.922961473464966, + "logits/rejected": -2.861196517944336, + "logps/chosen": -286.9571838378906, + "logps/rejected": -258.0143737792969, + "loss": 0.6918, "rewards/accuracies": 0.637499988079071, - "rewards/chosen": 0.0020814109593629837, - "rewards/margins": 0.0028700605034828186, - "rewards/rejected": -0.0007886493694968522, + "rewards/chosen": 0.0024884731974452734, + "rewards/margins": 0.0027703498490154743, + "rewards/rejected": -0.00028187656425870955, "step": 75 }, { "epoch": 0.064, - "grad_norm": 1.4035444259643555, + "grad_norm": 1.4012054204940796, "learning_rate": 3.2000000000000003e-06, - "logits/chosen": -2.8771731853485107, - "logits/rejected": -2.8718912601470947, - "logps/chosen": -257.95562744140625, - "logps/rejected": -248.6924285888672, - "loss": 0.6916, - "rewards/accuracies": 0.6875, - "rewards/chosen": 0.004527095705270767, - "rewards/margins": 0.003182282205671072, - "rewards/rejected": 0.0013448137324303389, + "logits/chosen": -2.8784067630767822, + "logits/rejected": -2.8731160163879395, + "logps/chosen": -257.916259765625, + "logps/rejected": -248.72305297851562, + "loss": 0.6912, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": 0.0049202474765479565, + "rewards/margins": 0.0038819201290607452, + "rewards/rejected": 0.0010383275803178549, "step": 80 }, { "epoch": 0.068, - "grad_norm": 1.4235211610794067, + "grad_norm": 1.4234269857406616, "learning_rate": 3.4000000000000005e-06, - "logits/chosen": -2.9201653003692627, - "logits/rejected": -2.8719587326049805, - "logps/chosen": -307.7834777832031, - "logps/rejected": -268.5202941894531, - "loss": 0.6914, - "rewards/accuracies": 0.6625000238418579, - "rewards/chosen": 0.005866583436727524, - "rewards/margins": 0.0034526665695011616, - "rewards/rejected": 0.0024139168672263622, + "logits/chosen": -2.9202146530151367, + "logits/rejected": -2.8719019889831543, + "logps/chosen": -307.8442077636719, + "logps/rejected": -268.5364990234375, + "loss": 0.6917, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": 0.005258677992969751, + "rewards/margins": 0.0030069469939917326, + "rewards/rejected": 0.0022517309989780188, "step": 85 }, { "epoch": 0.072, - "grad_norm": 1.1721140146255493, + "grad_norm": 1.1752641201019287, "learning_rate": 3.6000000000000003e-06, - "logits/chosen": -2.8801143169403076, - "logits/rejected": -2.844945192337036, - "logps/chosen": -238.00100708007812, - "logps/rejected": -238.72006225585938, - "loss": 0.6919, - "rewards/accuracies": 0.6000000238418579, - "rewards/chosen": 0.005659895483404398, - "rewards/margins": 0.0026023960672318935, - "rewards/rejected": 0.003057498950511217, + "logits/chosen": -2.8795719146728516, + "logits/rejected": -2.8445372581481934, + "logps/chosen": -238.05691528320312, + "logps/rejected": -238.66940307617188, + "loss": 0.6924, + "rewards/accuracies": 0.5874999761581421, + "rewards/chosen": 0.00510067492723465, + "rewards/margins": 0.0015365609433501959, + "rewards/rejected": 0.003564114449545741, "step": 90 }, { "epoch": 0.076, - "grad_norm": 1.2834060192108154, + "grad_norm": 1.273596167564392, "learning_rate": 3.8000000000000005e-06, - "logits/chosen": -2.896768093109131, - "logits/rejected": -2.9003615379333496, - "logps/chosen": -268.2308654785156, - "logps/rejected": -241.99667358398438, - "loss": 0.6904, - "rewards/accuracies": 0.675000011920929, - "rewards/chosen": 0.0076117864809930325, - "rewards/margins": 0.005503328982740641, - "rewards/rejected": 0.0021084570325911045, + "logits/chosen": -2.8963799476623535, + "logits/rejected": -2.899864673614502, + "logps/chosen": -268.2062683105469, + "logps/rejected": -242.0111083984375, + "loss": 0.6902, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": 0.007857006974518299, + "rewards/margins": 0.005892972461879253, + "rewards/rejected": 0.0019640345126390457, "step": 95 }, { "epoch": 0.08, - "grad_norm": 1.233703374862671, + "grad_norm": 1.2360827922821045, "learning_rate": 4.000000000000001e-06, - "logits/chosen": -2.8716769218444824, - "logits/rejected": -2.8608012199401855, - "logps/chosen": -292.55133056640625, - "logps/rejected": -255.74276733398438, - "loss": 0.6895, - "rewards/accuracies": 0.6875, - "rewards/chosen": 0.009478461928665638, - "rewards/margins": 0.007306996732950211, - "rewards/rejected": 0.00217146473005414, + "logits/chosen": -2.871992588043213, + "logits/rejected": -2.8619043827056885, + "logps/chosen": -292.5274353027344, + "logps/rejected": -255.6526641845703, + "loss": 0.6899, + "rewards/accuracies": 0.637499988079071, + "rewards/chosen": 0.009717302396893501, + "rewards/margins": 0.006645149551331997, + "rewards/rejected": 0.0030721533112227917, "step": 100 }, { "epoch": 0.08, - "eval_logits/chosen": -2.890129566192627, - "eval_logits/rejected": -2.8480653762817383, - "eval_logps/chosen": -282.2447204589844, - "eval_logps/rejected": -247.7537384033203, - "eval_loss": 0.6896404027938843, - "eval_rewards/accuracies": 0.6626983880996704, - "eval_rewards/chosen": 0.00993373617529869, - "eval_rewards/margins": 0.007174656726419926, - "eval_rewards/rejected": 0.0027590803802013397, - "eval_runtime": 167.6825, - "eval_samples_per_second": 2.982, - "eval_steps_per_second": 0.376, + "eval_logits/chosen": -2.889031171798706, + "eval_logits/rejected": -2.8468213081359863, + "eval_logps/chosen": -282.2605285644531, + "eval_logps/rejected": -247.75430297851562, + "eval_loss": 0.6897016167640686, + "eval_rewards/accuracies": 0.6666666865348816, + "eval_rewards/chosen": 0.009775782003998756, + "eval_rewards/margins": 0.007023118901997805, + "eval_rewards/rejected": 0.0027526640333235264, + "eval_runtime": 166.8346, + "eval_samples_per_second": 2.997, + "eval_steps_per_second": 0.378, "step": 100 }, { "epoch": 0.084, - "grad_norm": 1.3204169273376465, + "grad_norm": 1.3187963962554932, "learning_rate": 4.2000000000000004e-06, - "logits/chosen": -2.8663666248321533, - "logits/rejected": -2.820469856262207, - "logps/chosen": -272.216552734375, - "logps/rejected": -261.4291076660156, - "loss": 0.6905, - "rewards/accuracies": 0.699999988079071, - "rewards/chosen": 0.008843997493386269, - "rewards/margins": 0.005469636060297489, - "rewards/rejected": 0.003374360501766205, + "logits/chosen": -2.8663318157196045, + "logits/rejected": -2.8205113410949707, + "logps/chosen": -272.2581481933594, + "logps/rejected": -261.42620849609375, + "loss": 0.6907, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": 0.00842782761901617, + "rewards/margins": 0.005024894140660763, + "rewards/rejected": 0.0034029334783554077, "step": 105 }, { "epoch": 0.088, - "grad_norm": 1.2558304071426392, + "grad_norm": 1.2544078826904297, "learning_rate": 4.4e-06, - "logits/chosen": -2.935101270675659, - "logits/rejected": -2.903439521789551, - "logps/chosen": -251.76089477539062, - "logps/rejected": -246.2103729248047, - "loss": 0.6905, - "rewards/accuracies": 0.637499988079071, - "rewards/chosen": 0.010050037875771523, - "rewards/margins": 0.00540496688336134, - "rewards/rejected": 0.004645070992410183, + "logits/chosen": -2.9372715950012207, + "logits/rejected": -2.9057135581970215, + "logps/chosen": -251.8219757080078, + "logps/rejected": -246.0946044921875, + "loss": 0.6914, + "rewards/accuracies": 0.574999988079071, + "rewards/chosen": 0.009438835084438324, + "rewards/margins": 0.0036361501552164555, + "rewards/rejected": 0.0058026849292218685, "step": 110 }, { "epoch": 0.092, - "grad_norm": 2.3479883670806885, + "grad_norm": 3.0026137828826904, "learning_rate": 4.600000000000001e-06, - "logits/chosen": -2.8293395042419434, - "logits/rejected": -2.8186140060424805, - "logps/chosen": -225.02096557617188, - "logps/rejected": -294.81719970703125, - "loss": 0.6865, + "logits/chosen": -2.827087879180908, + "logits/rejected": -2.816584348678589, + "logps/chosen": -225.01516723632812, + "logps/rejected": -294.75274658203125, + "loss": 0.6868, "rewards/accuracies": 0.7250000238418579, - "rewards/chosen": 0.012401686981320381, - "rewards/margins": 0.01355154998600483, - "rewards/rejected": -0.0011498630046844482, + "rewards/chosen": 0.012459425255656242, + "rewards/margins": 0.012965649366378784, + "rewards/rejected": -0.0005062236450612545, "step": 115 }, { "epoch": 0.096, - "grad_norm": 3.1538350582122803, + "grad_norm": 3.147055149078369, "learning_rate": 4.800000000000001e-06, - "logits/chosen": -2.7399497032165527, - "logits/rejected": -2.749803066253662, - "logps/chosen": -275.8015441894531, - "logps/rejected": -249.06002807617188, - "loss": 0.6888, - "rewards/accuracies": 0.6875, - "rewards/chosen": 0.01128525473177433, - "rewards/margins": 0.009098999202251434, - "rewards/rejected": 0.002186256693676114, + "logits/chosen": -2.7388131618499756, + "logits/rejected": -2.748465061187744, + "logps/chosen": -275.8075866699219, + "logps/rejected": -249.1244659423828, + "loss": 0.6885, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": 0.011224482208490372, + "rewards/margins": 0.009682848118245602, + "rewards/rejected": 0.0015416343230754137, "step": 120 }, { "epoch": 0.1, - "grad_norm": 1.5341688394546509, + "grad_norm": 1.534111738204956, "learning_rate": 5e-06, - "logits/chosen": -2.9408421516418457, - "logits/rejected": -2.924834728240967, - "logps/chosen": -310.1289367675781, - "logps/rejected": -280.08770751953125, - "loss": 0.6868, + "logits/chosen": -2.941195487976074, + "logits/rejected": -2.924978494644165, + "logps/chosen": -310.167236328125, + "logps/rejected": -280.0481262207031, + "loss": 0.6872, "rewards/accuracies": 0.675000011920929, - "rewards/chosen": 0.012295490130782127, - "rewards/margins": 0.01287610363215208, - "rewards/rejected": -0.0005806152475997806, + "rewards/chosen": 0.011911705136299133, + "rewards/margins": 0.012096909806132317, + "rewards/rejected": -0.00018520592129789293, "step": 125 }, { "epoch": 0.104, - "grad_norm": 1.4902671575546265, + "grad_norm": 1.4911248683929443, "learning_rate": 4.999756310023261e-06, - "logits/chosen": -2.889176845550537, - "logits/rejected": -2.896610975265503, - "logps/chosen": -286.29925537109375, - "logps/rejected": -309.28265380859375, - "loss": 0.6874, - "rewards/accuracies": 0.6499999761581421, - "rewards/chosen": 0.00844976119697094, - "rewards/margins": 0.011819533072412014, - "rewards/rejected": -0.0033697723411023617, + "logits/chosen": -2.8891565799713135, + "logits/rejected": -2.896601915359497, + "logps/chosen": -286.2426452636719, + "logps/rejected": -309.3197021484375, + "loss": 0.6869, + "rewards/accuracies": 0.612500011920929, + "rewards/chosen": 0.009015440009534359, + "rewards/margins": 0.012755987234413624, + "rewards/rejected": -0.003740546526387334, "step": 130 }, { "epoch": 0.108, - "grad_norm": 1.4190293550491333, + "grad_norm": 1.4180755615234375, "learning_rate": 4.999025287600886e-06, - "logits/chosen": -2.892551898956299, - "logits/rejected": -2.908194065093994, - "logps/chosen": -274.4278869628906, - "logps/rejected": -265.5429382324219, - "loss": 0.6806, - "rewards/accuracies": 0.7749999761581421, - "rewards/chosen": 0.012322185561060905, - "rewards/margins": 0.025636225938796997, - "rewards/rejected": -0.013314038515090942, + "logits/chosen": -2.8916049003601074, + "logits/rejected": -2.9071428775787354, + "logps/chosen": -274.48236083984375, + "logps/rejected": -265.49786376953125, + "loss": 0.6811, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": 0.011777431704103947, + "rewards/margins": 0.024640800431370735, + "rewards/rejected": -0.012863369658589363, "step": 135 }, { "epoch": 0.112, - "grad_norm": 1.3311868906021118, + "grad_norm": 1.3050034046173096, "learning_rate": 4.997807075247147e-06, - "logits/chosen": -2.8810911178588867, - "logits/rejected": -2.859647750854492, - "logps/chosen": -247.0091094970703, - "logps/rejected": -236.913818359375, - "loss": 0.6888, - "rewards/accuracies": 0.625, - "rewards/chosen": -0.004415545146912336, - "rewards/margins": 0.009412359446287155, - "rewards/rejected": -0.013827905058860779, + "logits/chosen": -2.8806416988372803, + "logits/rejected": -2.8594279289245605, + "logps/chosen": -247.0726776123047, + "logps/rejected": -236.9187774658203, + "loss": 0.6891, + "rewards/accuracies": 0.574999988079071, + "rewards/chosen": -0.005051865242421627, + "rewards/margins": 0.008825790137052536, + "rewards/rejected": -0.013877655379474163, "step": 140 }, { "epoch": 0.116, - "grad_norm": 1.286658525466919, + "grad_norm": 1.3001933097839355, "learning_rate": 4.996101910454953e-06, - "logits/chosen": -2.9036033153533936, - "logits/rejected": -2.8598740100860596, - "logps/chosen": -273.776123046875, - "logps/rejected": -244.0740509033203, + "logits/chosen": -2.903634548187256, + "logits/rejected": -2.859711170196533, + "logps/chosen": -273.8101806640625, + "logps/rejected": -244.11074829101562, "loss": 0.6801, - "rewards/accuracies": 0.737500011920929, - "rewards/chosen": 0.0010833339765667915, - "rewards/margins": 0.026883091777563095, - "rewards/rejected": -0.02579975686967373, + "rewards/accuracies": 0.762499988079071, + "rewards/chosen": 0.0007423794595524669, + "rewards/margins": 0.026909640058875084, + "rewards/rejected": -0.026167264208197594, "step": 145 }, { "epoch": 0.12, - "grad_norm": 1.6749712228775024, + "grad_norm": 1.671297550201416, "learning_rate": 4.993910125649561e-06, - "logits/chosen": -2.891787528991699, - "logits/rejected": -2.856782913208008, - "logps/chosen": -293.8242492675781, - "logps/rejected": -247.73068237304688, - "loss": 0.6806, + "logits/chosen": -2.891292095184326, + "logits/rejected": -2.856261968612671, + "logps/chosen": -293.83563232421875, + "logps/rejected": -247.8043975830078, + "loss": 0.6803, "rewards/accuracies": 0.6875, - "rewards/chosen": 0.010322624817490578, - "rewards/margins": 0.025890743359923363, - "rewards/rejected": -0.015568114817142487, + "rewards/chosen": 0.01020820252597332, + "rewards/margins": 0.026513541117310524, + "rewards/rejected": -0.016305336728692055, "step": 150 }, { "epoch": 0.124, - "grad_norm": 1.5311990976333618, + "grad_norm": 1.5288795232772827, "learning_rate": 4.9912321481237616e-06, - "logits/chosen": -2.7779576778411865, - "logits/rejected": -2.774000883102417, - "logps/chosen": -231.47427368164062, - "logps/rejected": -290.8375549316406, - "loss": 0.6839, + "logits/chosen": -2.778376340866089, + "logits/rejected": -2.774121046066284, + "logps/chosen": -231.49319458007812, + "logps/rejected": -290.89337158203125, + "loss": 0.6837, "rewards/accuracies": 0.625, - "rewards/chosen": 0.0004987965803593397, - "rewards/margins": 0.019459182396531105, - "rewards/rejected": -0.018960384652018547, + "rewards/chosen": 0.00030907365726307034, + "rewards/margins": 0.019828204065561295, + "rewards/rejected": -0.019519129768013954, "step": 155 }, { "epoch": 0.128, - "grad_norm": 1.5341625213623047, + "grad_norm": 1.5330896377563477, "learning_rate": 4.988068499954578e-06, - "logits/chosen": -2.890897512435913, - "logits/rejected": -2.889413356781006, - "logps/chosen": -316.82025146484375, - "logps/rejected": -312.2231140136719, - "loss": 0.6716, - "rewards/accuracies": 0.6875, - "rewards/chosen": 0.031246433034539223, - "rewards/margins": 0.045340072363615036, - "rewards/rejected": -0.014093644917011261, + "logits/chosen": -2.889814853668213, + "logits/rejected": -2.888610601425171, + "logps/chosen": -316.81927490234375, + "logps/rejected": -312.25006103515625, + "loss": 0.6715, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": 0.031255967915058136, + "rewards/margins": 0.04561912640929222, + "rewards/rejected": -0.01436315942555666, "step": 160 }, { "epoch": 0.132, - "grad_norm": 1.6371203660964966, + "grad_norm": 1.637596607208252, "learning_rate": 4.984419797901491e-06, - "logits/chosen": -2.9241220951080322, - "logits/rejected": -2.9127113819122314, - "logps/chosen": -311.5802001953125, - "logps/rejected": -282.0766906738281, - "loss": 0.6702, - "rewards/accuracies": 0.75, - "rewards/chosen": 0.03161744773387909, - "rewards/margins": 0.04772466421127319, - "rewards/rejected": -0.016107218340039253, + "logits/chosen": -2.922788143157959, + "logits/rejected": -2.911243438720703, + "logps/chosen": -311.63836669921875, + "logps/rejected": -282.0634765625, + "loss": 0.6705, + "rewards/accuracies": 0.737500011920929, + "rewards/chosen": 0.031035322695970535, + "rewards/margins": 0.04701067879796028, + "rewards/rejected": -0.015975359827280045, "step": 165 }, { "epoch": 0.136, - "grad_norm": 1.5368744134902954, + "grad_norm": 1.531761884689331, "learning_rate": 4.980286753286196e-06, - "logits/chosen": -2.917865514755249, - "logits/rejected": -2.9101455211639404, - "logps/chosen": -275.5201110839844, - "logps/rejected": -273.40301513671875, - "loss": 0.6758, - "rewards/accuracies": 0.6625000238418579, - "rewards/chosen": 0.02875286340713501, - "rewards/margins": 0.03745796158909798, - "rewards/rejected": -0.008705099113285542, + "logits/chosen": -2.9153621196746826, + "logits/rejected": -2.9075608253479004, + "logps/chosen": -275.50396728515625, + "logps/rejected": -273.3793029785156, + "loss": 0.6759, + "rewards/accuracies": 0.6875, + "rewards/chosen": 0.0289138313382864, + "rewards/margins": 0.0373816192150116, + "rewards/rejected": -0.008467786945402622, "step": 170 }, { "epoch": 0.14, - "grad_norm": 1.5606811046600342, + "grad_norm": 1.562333106994629, "learning_rate": 4.975670171853926e-06, - "logits/chosen": -2.881425380706787, - "logits/rejected": -2.82181978225708, - "logps/chosen": -268.76611328125, - "logps/rejected": -241.20675659179688, - "loss": 0.6724, - "rewards/accuracies": 0.637499988079071, - "rewards/chosen": 0.01504159439355135, - "rewards/margins": 0.04509888216853142, - "rewards/rejected": -0.030057286843657494, + "logits/chosen": -2.881091833114624, + "logits/rejected": -2.8206849098205566, + "logps/chosen": -268.7303161621094, + "logps/rejected": -241.11801147460938, + "loss": 0.6727, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": 0.01539912074804306, + "rewards/margins": 0.04456932842731476, + "rewards/rejected": -0.02917020581662655, "step": 175 }, { "epoch": 0.144, - "grad_norm": 1.5441210269927979, + "grad_norm": 1.5452988147735596, "learning_rate": 4.970570953616383e-06, - "logits/chosen": -2.871035099029541, - "logits/rejected": -2.8470711708068848, - "logps/chosen": -271.70977783203125, - "logps/rejected": -250.18887329101562, - "loss": 0.6577, - "rewards/accuracies": 0.8125, - "rewards/chosen": 0.02740972302854061, - "rewards/margins": 0.07525759935379028, - "rewards/rejected": -0.04784787446260452, + "logits/chosen": -2.870706558227539, + "logits/rejected": -2.846757173538208, + "logps/chosen": -271.70098876953125, + "logps/rejected": -250.15017700195312, + "loss": 0.6579, + "rewards/accuracies": 0.8374999761581421, + "rewards/chosen": 0.027496661990880966, + "rewards/margins": 0.07495806366205215, + "rewards/rejected": -0.04746139422059059, "step": 180 }, { "epoch": 0.148, - "grad_norm": 1.7091320753097534, + "grad_norm": 1.711881160736084, "learning_rate": 4.964990092676263e-06, - "logits/chosen": -2.827620506286621, - "logits/rejected": -2.824796676635742, - "logps/chosen": -272.52923583984375, - "logps/rejected": -226.22506713867188, - "loss": 0.6767, + "logits/chosen": -2.8256664276123047, + "logits/rejected": -2.8229262828826904, + "logps/chosen": -272.4619140625, + "logps/rejected": -226.0482177734375, + "loss": 0.6773, "rewards/accuracies": 0.5874999761581421, - "rewards/chosen": -0.0010557698551565409, - "rewards/margins": 0.03629080206155777, - "rewards/rejected": -0.03734657168388367, + "rewards/chosen": -0.000382797239581123, + "rewards/margins": 0.03519537299871445, + "rewards/rejected": -0.0355781726539135, "step": 185 }, { "epoch": 0.152, - "grad_norm": 1.8644754886627197, + "grad_norm": 1.8593279123306274, "learning_rate": 4.958928677033465e-06, - "logits/chosen": -2.8307862281799316, - "logits/rejected": -2.8190550804138184, - "logps/chosen": -276.7012023925781, - "logps/rejected": -289.3759765625, - "loss": 0.6642, - "rewards/accuracies": 0.6625000238418579, - "rewards/chosen": 0.007176427636295557, - "rewards/margins": 0.06312907487154007, - "rewards/rejected": -0.05595264956355095, + "logits/chosen": -2.8317179679870605, + "logits/rejected": -2.820038318634033, + "logps/chosen": -276.53924560546875, + "logps/rejected": -289.26007080078125, + "loss": 0.6639, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": 0.008795881643891335, + "rewards/margins": 0.06358983367681503, + "rewards/rejected": -0.05479395389556885, "step": 190 }, { "epoch": 0.156, - "grad_norm": 1.796454906463623, + "grad_norm": 1.802320957183838, "learning_rate": 4.9523878883729794e-06, - "logits/chosen": -2.8766260147094727, - "logits/rejected": -2.8518166542053223, - "logps/chosen": -288.412841796875, - "logps/rejected": -255.20870971679688, - "loss": 0.656, - "rewards/accuracies": 0.7250000238418579, - "rewards/chosen": 0.004013822879642248, - "rewards/margins": 0.0801478773355484, - "rewards/rejected": -0.07613405585289001, + "logits/chosen": -2.876426935195923, + "logits/rejected": -2.851534128189087, + "logps/chosen": -288.3893737792969, + "logps/rejected": -255.09683227539062, + "loss": 0.6564, + "rewards/accuracies": 0.737500011920929, + "rewards/chosen": 0.004248014185577631, + "rewards/margins": 0.07926348596811295, + "rewards/rejected": -0.07501547038555145, "step": 195 }, { "epoch": 0.16, - "grad_norm": 1.861384391784668, + "grad_norm": 1.8666610717773438, "learning_rate": 4.9453690018345144e-06, - "logits/chosen": -2.8510830402374268, - "logits/rejected": -2.8279526233673096, - "logps/chosen": -255.71249389648438, - "logps/rejected": -257.49163818359375, - "loss": 0.653, - "rewards/accuracies": 0.6625000238418579, - "rewards/chosen": -0.003837744938209653, - "rewards/margins": 0.08905676007270813, - "rewards/rejected": -0.09289450943470001, + "logits/chosen": -2.852238416671753, + "logits/rejected": -2.8293018341064453, + "logps/chosen": -255.58560180664062, + "logps/rejected": -257.3184814453125, + "loss": 0.6532, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -0.0025687548331916332, + "rewards/margins": 0.08859384059906006, + "rewards/rejected": -0.09116257727146149, "step": 200 }, { "epoch": 0.16, - "eval_logits/chosen": -2.874197244644165, - "eval_logits/rejected": -2.8339242935180664, - "eval_logps/chosen": -284.5635070800781, - "eval_logps/rejected": -257.5692443847656, - "eval_loss": 0.6569300293922424, - "eval_rewards/accuracies": 0.6865079402923584, - "eval_rewards/chosen": -0.013254065066576004, - "eval_rewards/margins": 0.08214230835437775, - "eval_rewards/rejected": -0.09539636969566345, - "eval_runtime": 166.8148, - "eval_samples_per_second": 2.997, + "eval_logits/chosen": -2.878232717514038, + "eval_logits/rejected": -2.8385584354400635, + "eval_logps/chosen": -284.5143127441406, + "eval_logps/rejected": -257.5306091308594, + "eval_loss": 0.6568659543991089, + "eval_rewards/accuracies": 0.6884920597076416, + "eval_rewards/chosen": -0.012762677855789661, + "eval_rewards/margins": 0.08224756270647049, + "eval_rewards/rejected": -0.09501024335622787, + "eval_runtime": 166.7797, + "eval_samples_per_second": 2.998, "eval_steps_per_second": 0.378, "step": 200 }, { "epoch": 0.164, - "grad_norm": 1.9093883037567139, + "grad_norm": 1.9332078695297241, "learning_rate": 4.937873385763909e-06, - "logits/chosen": -2.862431764602661, - "logits/rejected": -2.829721689224243, - "logps/chosen": -287.107177734375, - "logps/rejected": -284.347412109375, + "logits/chosen": -2.8655571937561035, + "logits/rejected": -2.8335084915161133, + "logps/chosen": -287.10076904296875, + "logps/rejected": -284.3404846191406, "loss": 0.6582, - "rewards/accuracies": 0.699999988079071, - "rewards/chosen": -0.03399428352713585, - "rewards/margins": 0.07983867824077606, - "rewards/rejected": -0.1138329729437828, + "rewards/accuracies": 0.6875, + "rewards/chosen": -0.033930666744709015, + "rewards/margins": 0.07983305305242538, + "rewards/rejected": -0.1137637123465538, "step": 205 }, { "epoch": 0.168, - "grad_norm": 1.8923628330230713, + "grad_norm": 1.877467393875122, "learning_rate": 4.9299025014463665e-06, - "logits/chosen": -2.8810172080993652, - "logits/rejected": -2.8641536235809326, - "logps/chosen": -248.9971466064453, - "logps/rejected": -245.23355102539062, - "loss": 0.6711, - "rewards/accuracies": 0.675000011920929, - "rewards/chosen": -0.046056050807237625, - "rewards/margins": 0.05324975773692131, - "rewards/rejected": -0.09930581599473953, + "logits/chosen": -2.879312038421631, + "logits/rejected": -2.862196445465088, + "logps/chosen": -248.899169921875, + "logps/rejected": -245.27310180664062, + "loss": 0.6704, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -0.04507671296596527, + "rewards/margins": 0.054625023156404495, + "rewards/rejected": -0.09970173239707947, "step": 210 }, { "epoch": 0.172, - "grad_norm": 1.887384057044983, + "grad_norm": 1.8854491710662842, "learning_rate": 4.921457902821578e-06, - "logits/chosen": -2.8637945652008057, - "logits/rejected": -2.8070335388183594, - "logps/chosen": -316.33355712890625, - "logps/rejected": -285.98590087890625, - "loss": 0.6668, - "rewards/accuracies": 0.625, - "rewards/chosen": -0.018491100519895554, - "rewards/margins": 0.06612573564052582, - "rewards/rejected": -0.08461683988571167, + "logits/chosen": -2.8618056774139404, + "logits/rejected": -2.8050172328948975, + "logps/chosen": -316.2086181640625, + "logps/rejected": -286.01239013671875, + "loss": 0.6661, + "rewards/accuracies": 0.637499988079071, + "rewards/chosen": -0.017241844907402992, + "rewards/margins": 0.06764046102762222, + "rewards/rejected": -0.08488230407238007, "step": 215 }, { "epoch": 0.176, - "grad_norm": 2.3003458976745605, + "grad_norm": 2.2946383953094482, "learning_rate": 4.912541236180779e-06, - "logits/chosen": -2.7972946166992188, - "logits/rejected": -2.7600889205932617, - "logps/chosen": -325.50244140625, - "logps/rejected": -316.9506530761719, - "loss": 0.6418, - "rewards/accuracies": 0.7250000238418579, - "rewards/chosen": -0.016769036650657654, - "rewards/margins": 0.11797495931386948, - "rewards/rejected": -0.13474401831626892, + "logits/chosen": -2.7987911701202393, + "logits/rejected": -2.76237154006958, + "logps/chosen": -325.50177001953125, + "logps/rejected": -316.89739990234375, + "loss": 0.642, + "rewards/accuracies": 0.7124999761581421, + "rewards/chosen": -0.016762247309088707, + "rewards/margins": 0.11744923889636993, + "rewards/rejected": -0.1342114955186844, "step": 220 }, { "epoch": 0.18, - "grad_norm": 1.6942790746688843, + "grad_norm": 1.7292786836624146, "learning_rate": 4.903154239845798e-06, - "logits/chosen": -2.8859641551971436, - "logits/rejected": -2.8270339965820312, - "logps/chosen": -271.8771667480469, - "logps/rejected": -247.03146362304688, - "loss": 0.6483, - "rewards/accuracies": 0.6499999761581421, - "rewards/chosen": -0.06943339854478836, - "rewards/margins": 0.10388622432947159, - "rewards/rejected": -0.17331962287425995, + "logits/chosen": -2.8847053050994873, + "logits/rejected": -2.825892210006714, + "logps/chosen": -271.9214172363281, + "logps/rejected": -247.08193969726562, + "loss": 0.6482, + "rewards/accuracies": 0.6625000238418579, + "rewards/chosen": -0.06987594068050385, + "rewards/margins": 0.10394857078790665, + "rewards/rejected": -0.1738245040178299, "step": 225 }, { "epoch": 0.184, - "grad_norm": 2.1547300815582275, + "grad_norm": 2.149097204208374, "learning_rate": 4.893298743830168e-06, - "logits/chosen": -2.793713092803955, - "logits/rejected": -2.806272029876709, - "logps/chosen": -302.48150634765625, - "logps/rejected": -302.66973876953125, - "loss": 0.6308, - "rewards/accuracies": 0.737500011920929, - "rewards/chosen": -0.04446331784129143, - "rewards/margins": 0.1429525464773178, - "rewards/rejected": -0.18741586804389954, + "logits/chosen": -2.792332172393799, + "logits/rejected": -2.80527925491333, + "logps/chosen": -302.56390380859375, + "logps/rejected": -302.79840087890625, + "loss": 0.6306, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -0.04528792202472687, + "rewards/margins": 0.1434146910905838, + "rewards/rejected": -0.18870261311531067, "step": 230 }, { "epoch": 0.188, - "grad_norm": 2.574212074279785, + "grad_norm": 2.5504279136657715, "learning_rate": 4.882976669482368e-06, - "logits/chosen": -2.8080554008483887, - "logits/rejected": -2.778135061264038, - "logps/chosen": -275.0320129394531, - "logps/rejected": -280.06610107421875, - "loss": 0.6431, - "rewards/accuracies": 0.637499988079071, - "rewards/chosen": -0.09150179475545883, - "rewards/margins": 0.12502221763134003, - "rewards/rejected": -0.21652403473854065, + "logits/chosen": -2.8090176582336426, + "logits/rejected": -2.7789652347564697, + "logps/chosen": -274.94342041015625, + "logps/rejected": -279.92120361328125, + "loss": 0.6433, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -0.0906161516904831, + "rewards/margins": 0.124458909034729, + "rewards/rejected": -0.2150750607252121, "step": 235 }, { "epoch": 0.192, - "grad_norm": 2.8856008052825928, + "grad_norm": 2.925840377807617, "learning_rate": 4.8721900291112415e-06, - "logits/chosen": -2.8577849864959717, - "logits/rejected": -2.836282253265381, - "logps/chosen": -291.05352783203125, - "logps/rejected": -275.5093078613281, - "loss": 0.6433, + "logits/chosen": -2.8581314086914062, + "logits/rejected": -2.837096691131592, + "logps/chosen": -290.9739685058594, + "logps/rejected": -275.4525451660156, + "loss": 0.6432, "rewards/accuracies": 0.675000011920929, - "rewards/chosen": -0.016148822382092476, - "rewards/margins": 0.11701644957065582, - "rewards/rejected": -0.13316525518894196, + "rewards/chosen": -0.015353793278336525, + "rewards/margins": 0.11724452674388885, + "rewards/rejected": -0.13259831070899963, "step": 240 }, { "epoch": 0.196, - "grad_norm": 2.3244190216064453, + "grad_norm": 2.329665184020996, "learning_rate": 4.860940925593703e-06, - "logits/chosen": -2.8803956508636475, - "logits/rejected": -2.8489887714385986, - "logps/chosen": -288.46856689453125, - "logps/rejected": -274.30364990234375, - "loss": 0.637, + "logits/chosen": -2.878603458404541, + "logits/rejected": -2.8466429710388184, + "logps/chosen": -288.50494384765625, + "logps/rejected": -274.239990234375, + "loss": 0.6374, "rewards/accuracies": 0.6625000238418579, - "rewards/chosen": 0.012815827503800392, - "rewards/margins": 0.13950756192207336, - "rewards/rejected": -0.12669174373149872, + "rewards/chosen": 0.012451673857867718, + "rewards/margins": 0.13850674033164978, + "rewards/rejected": -0.1260550618171692, "step": 245 }, { "epoch": 0.2, - "grad_norm": 2.1376349925994873, + "grad_norm": 2.1521079540252686, "learning_rate": 4.849231551964771e-06, - "logits/chosen": -2.8547749519348145, - "logits/rejected": -2.8269331455230713, - "logps/chosen": -254.5790557861328, - "logps/rejected": -242.4806365966797, - "loss": 0.6555, - "rewards/accuracies": 0.625, - "rewards/chosen": -0.016418656334280968, - "rewards/margins": 0.09427244961261749, - "rewards/rejected": -0.11069109290838242, + "logits/chosen": -2.8556203842163086, + "logits/rejected": -2.82784104347229, + "logps/chosen": -254.6340789794922, + "logps/rejected": -242.66726684570312, + "loss": 0.6548, + "rewards/accuracies": 0.637499988079071, + "rewards/chosen": -0.016969427466392517, + "rewards/margins": 0.09558813273906708, + "rewards/rejected": -0.112557552754879, "step": 250 }, { "epoch": 0.204, - "grad_norm": 3.7023279666900635, + "grad_norm": 3.7580788135528564, "learning_rate": 4.837064190990036e-06, - "logits/chosen": -2.7928366661071777, - "logits/rejected": -2.8065311908721924, - "logps/chosen": -286.6212158203125, - "logps/rejected": -284.4375, - "loss": 0.6467, + "logits/chosen": -2.7907662391662598, + "logits/rejected": -2.8043570518493652, + "logps/chosen": -287.1578674316406, + "logps/rejected": -284.9073181152344, + "loss": 0.6471, "rewards/accuracies": 0.637499988079071, - "rewards/chosen": -0.05689004808664322, - "rewards/margins": 0.11735578626394272, - "rewards/rejected": -0.17424583435058594, + "rewards/chosen": -0.06225704029202461, + "rewards/margins": 0.11668694019317627, + "rewards/rejected": -0.17894400656223297, "step": 255 }, { "epoch": 0.208, - "grad_norm": 3.1189470291137695, + "grad_norm": 3.2057318687438965, "learning_rate": 4.824441214720629e-06, - "logits/chosen": -2.826845169067383, - "logits/rejected": -2.839430332183838, - "logps/chosen": -330.69378662109375, - "logps/rejected": -295.32781982421875, - "loss": 0.6619, + "logits/chosen": -2.8258254528045654, + "logits/rejected": -2.838768720626831, + "logps/chosen": -331.3340148925781, + "logps/rejected": -295.8611145019531, + "loss": 0.6624, "rewards/accuracies": 0.625, - "rewards/chosen": -0.11839203536510468, - "rewards/margins": 0.09098449349403381, - "rewards/rejected": -0.2093765288591385, + "rewards/chosen": -0.12479463964700699, + "rewards/margins": 0.08991553634405136, + "rewards/rejected": -0.21471016108989716, "step": 260 }, { "epoch": 0.212, - "grad_norm": 5.202083110809326, + "grad_norm": 5.250330448150635, "learning_rate": 4.811365084030784e-06, - "logits/chosen": -2.7906274795532227, - "logits/rejected": -2.7402591705322266, - "logps/chosen": -240.4190673828125, - "logps/rejected": -258.24639892578125, - "loss": 0.6288, - "rewards/accuracies": 0.75, - "rewards/chosen": -0.13684885203838348, - "rewards/margins": 0.1477588713169098, - "rewards/rejected": -0.28460773825645447, + "logits/chosen": -2.788186550140381, + "logits/rejected": -2.737650156021118, + "logps/chosen": -240.7392578125, + "logps/rejected": -258.4285583496094, + "loss": 0.6295, + "rewards/accuracies": 0.762499988079071, + "rewards/chosen": -0.1400509476661682, + "rewards/margins": 0.14637869596481323, + "rewards/rejected": -0.28642964363098145, "step": 265 }, { "epoch": 0.216, - "grad_norm": 4.058558940887451, + "grad_norm": 4.085949420928955, "learning_rate": 4.7978383481380865e-06, - "logits/chosen": -2.8255257606506348, - "logits/rejected": -2.827247381210327, - "logps/chosen": -284.66058349609375, - "logps/rejected": -326.15008544921875, - "loss": 0.6278, + "logits/chosen": -2.8263564109802246, + "logits/rejected": -2.82792592048645, + "logps/chosen": -284.7565002441406, + "logps/rejected": -326.1434020996094, + "loss": 0.6284, "rewards/accuracies": 0.699999988079071, - "rewards/chosen": -0.1267203986644745, - "rewards/margins": 0.1748858392238617, - "rewards/rejected": -0.3016062378883362, + "rewards/chosen": -0.12767954170703888, + "rewards/margins": 0.17385998368263245, + "rewards/rejected": -0.30153951048851013, "step": 270 }, { "epoch": 0.22, - "grad_norm": 2.6630849838256836, + "grad_norm": 3.1048779487609863, "learning_rate": 4.783863644106502e-06, - "logits/chosen": -2.8801255226135254, - "logits/rejected": -2.8717246055603027, - "logps/chosen": -279.7027893066406, - "logps/rejected": -273.7590026855469, - "loss": 0.6354, + "logits/chosen": -2.881112575531006, + "logits/rejected": -2.87247896194458, + "logps/chosen": -279.97052001953125, + "logps/rejected": -273.80780029296875, + "loss": 0.6366, "rewards/accuracies": 0.612500011920929, - "rewards/chosen": -0.12922950088977814, - "rewards/margins": 0.1460111439228058, - "rewards/rejected": -0.2752406597137451, + "rewards/chosen": -0.1319071650505066, + "rewards/margins": 0.14382150769233704, + "rewards/rejected": -0.2757287323474884, "step": 275 }, { "epoch": 0.224, - "grad_norm": 3.321779251098633, + "grad_norm": 3.2117135524749756, "learning_rate": 4.769443696332272e-06, - "logits/chosen": -2.8768434524536133, - "logits/rejected": -2.8461215496063232, - "logps/chosen": -292.112548828125, - "logps/rejected": -293.89886474609375, - "loss": 0.6301, - "rewards/accuracies": 0.6000000238418579, - "rewards/chosen": -0.08245956897735596, - "rewards/margins": 0.16300079226493835, - "rewards/rejected": -0.24546034634113312, + "logits/chosen": -2.8749935626983643, + "logits/rejected": -2.844726085662842, + "logps/chosen": -292.9136657714844, + "logps/rejected": -294.96185302734375, + "loss": 0.629, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.09047095477581024, + "rewards/margins": 0.1656198650598526, + "rewards/rejected": -0.25609081983566284, "step": 280 }, { "epoch": 0.228, - "grad_norm": 2.9796993732452393, + "grad_norm": 3.0179073810577393, "learning_rate": 4.754581316012785e-06, - "logits/chosen": -2.875678300857544, - "logits/rejected": -2.8014907836914062, - "logps/chosen": -321.65411376953125, - "logps/rejected": -297.4300842285156, - "loss": 0.6025, + "logits/chosen": -2.8741941452026367, + "logits/rejected": -2.799834728240967, + "logps/chosen": -323.2701721191406, + "logps/rejected": -299.8788146972656, + "loss": 0.5984, "rewards/accuracies": 0.737500011920929, - "rewards/chosen": -0.09748764336109161, - "rewards/margins": 0.22937169671058655, - "rewards/rejected": -0.32685935497283936, + "rewards/chosen": -0.11364835500717163, + "rewards/margins": 0.2376987189054489, + "rewards/rejected": -0.35134708881378174, "step": 285 }, { "epoch": 0.232, - "grad_norm": 3.133993148803711, + "grad_norm": 3.1179349422454834, "learning_rate": 4.7392794005985324e-06, - "logits/chosen": -2.8018627166748047, - "logits/rejected": -2.7942662239074707, - "logps/chosen": -292.52667236328125, - "logps/rejected": -271.00030517578125, - "loss": 0.5949, - "rewards/accuracies": 0.7749999761581421, - "rewards/chosen": -0.21974220871925354, - "rewards/margins": 0.23409290611743927, - "rewards/rejected": -0.4538350999355316, + "logits/chosen": -2.801036834716797, + "logits/rejected": -2.793466329574585, + "logps/chosen": -293.9784240722656, + "logps/rejected": -272.1210021972656, + "loss": 0.5962, + "rewards/accuracies": 0.762499988079071, + "rewards/chosen": -0.23425976932048798, + "rewards/margins": 0.23078274726867676, + "rewards/rejected": -0.4650425314903259, "step": 290 }, { "epoch": 0.236, - "grad_norm": 3.11313796043396, + "grad_norm": 3.5481436252593994, "learning_rate": 4.723540933228245e-06, - "logits/chosen": -2.820415496826172, - "logits/rejected": -2.796581745147705, - "logps/chosen": -329.7557678222656, - "logps/rejected": -321.95233154296875, - "loss": 0.6621, - "rewards/accuracies": 0.6000000238418579, - "rewards/chosen": -0.3443993926048279, - "rewards/margins": 0.10115940868854523, - "rewards/rejected": -0.4455588757991791, + "logits/chosen": -2.8212785720825195, + "logits/rejected": -2.7978832721710205, + "logps/chosen": -327.6084899902344, + "logps/rejected": -320.15106201171875, + "loss": 0.6612, + "rewards/accuracies": 0.574999988079071, + "rewards/chosen": -0.32292693853378296, + "rewards/margins": 0.1046195775270462, + "rewards/rejected": -0.42754650115966797, "step": 295 }, { "epoch": 0.24, - "grad_norm": 3.609873056411743, + "grad_norm": 4.389492034912109, "learning_rate": 4.707368982147318e-06, - "logits/chosen": -2.8759727478027344, - "logits/rejected": -2.825862407684326, - "logps/chosen": -333.0445556640625, - "logps/rejected": -285.5638122558594, - "loss": 0.6385, - "rewards/accuracies": 0.6499999761581421, - "rewards/chosen": -0.3127234876155853, - "rewards/margins": 0.1507173478603363, - "rewards/rejected": -0.46344083547592163, + "logits/chosen": -2.8768062591552734, + "logits/rejected": -2.8266239166259766, + "logps/chosen": -329.6361083984375, + "logps/rejected": -282.6575927734375, + "loss": 0.6372, + "rewards/accuracies": 0.637499988079071, + "rewards/chosen": -0.2786393463611603, + "rewards/margins": 0.155739888548851, + "rewards/rejected": -0.4343792498111725, "step": 300 }, { "epoch": 0.24, - "eval_logits/chosen": -2.8398690223693848, - "eval_logits/rejected": -2.803138494491577, - "eval_logps/chosen": -310.65655517578125, - "eval_logps/rejected": -295.55364990234375, - "eval_loss": 0.6190334558486938, - "eval_rewards/accuracies": 0.6904761791229248, - "eval_rewards/chosen": -0.27418458461761475, - "eval_rewards/margins": 0.20105606317520142, - "eval_rewards/rejected": -0.47524064779281616, - "eval_runtime": 167.0651, - "eval_samples_per_second": 2.993, - "eval_steps_per_second": 0.377, + "eval_logits/chosen": -2.84016752243042, + "eval_logits/rejected": -2.803346872329712, + "eval_logps/chosen": -307.0444030761719, + "eval_logps/rejected": -292.0920715332031, + "eval_loss": 0.6181342005729675, + "eval_rewards/accuracies": 0.682539701461792, + "eval_rewards/chosen": -0.2380632609128952, + "eval_rewards/margins": 0.20256145298480988, + "eval_rewards/rejected": -0.4406247138977051, + "eval_runtime": 166.7743, + "eval_samples_per_second": 2.998, + "eval_steps_per_second": 0.378, "step": 300 }, { "epoch": 0.244, - "grad_norm": 5.130188941955566, + "grad_norm": 6.237858295440674, "learning_rate": 4.690766700109659e-06, - "logits/chosen": -2.814621925354004, - "logits/rejected": -2.7681021690368652, - "logps/chosen": -254.0991973876953, - "logps/rejected": -227.88961791992188, - "loss": 0.6404, - "rewards/accuracies": 0.637499988079071, - "rewards/chosen": -0.3078501522541046, - "rewards/margins": 0.14654883742332458, - "rewards/rejected": -0.4543989598751068, + "logits/chosen": -2.813170909881592, + "logits/rejected": -2.765450954437256, + "logps/chosen": -251.8035430908203, + "logps/rejected": -226.10787963867188, + "loss": 0.6377, + "rewards/accuracies": 0.612500011920929, + "rewards/chosen": -0.28489404916763306, + "rewards/margins": 0.15168778598308563, + "rewards/rejected": -0.4365817904472351, "step": 305 }, { "epoch": 0.248, - "grad_norm": 2.648085832595825, + "grad_norm": 2.941599130630493, "learning_rate": 4.673737323763048e-06, - "logits/chosen": -2.870044708251953, - "logits/rejected": -2.8921992778778076, - "logps/chosen": -322.07037353515625, - "logps/rejected": -309.27874755859375, - "loss": 0.5945, + "logits/chosen": -2.8621535301208496, + "logits/rejected": -2.883449077606201, + "logps/chosen": -323.72625732421875, + "logps/rejected": -309.9552307128906, + "loss": 0.5975, "rewards/accuracies": 0.762499988079071, - "rewards/chosen": -0.23309031128883362, - "rewards/margins": 0.25641053915023804, - "rewards/rejected": -0.48950082063674927, + "rewards/chosen": -0.2496490776538849, + "rewards/margins": 0.24661684036254883, + "rewards/rejected": -0.4962659478187561, "step": 310 }, { "epoch": 0.252, - "grad_norm": 2.9251554012298584, + "grad_norm": 2.81584095954895, "learning_rate": 4.656284173018144e-06, - "logits/chosen": -2.796626567840576, - "logits/rejected": -2.777113437652588, - "logps/chosen": -303.8788146972656, - "logps/rejected": -336.8778991699219, - "loss": 0.6197, + "logits/chosen": -2.7917304039001465, + "logits/rejected": -2.771953821182251, + "logps/chosen": -305.7306213378906, + "logps/rejected": -337.7940979003906, + "loss": 0.6245, "rewards/accuracies": 0.7124999761581421, - "rewards/chosen": -0.22741813957691193, - "rewards/margins": 0.18721428513526917, - "rewards/rejected": -0.4146324098110199, + "rewards/chosen": -0.24593646824359894, + "rewards/margins": 0.17785824835300446, + "rewards/rejected": -0.423794686794281, "step": 315 }, { "epoch": 0.256, - "grad_norm": 3.248079776763916, + "grad_norm": 3.657536029815674, "learning_rate": 4.638410650401267e-06, - "logits/chosen": -2.868063449859619, - "logits/rejected": -2.8760440349578857, - "logps/chosen": -310.02423095703125, - "logps/rejected": -324.77301025390625, - "loss": 0.6175, - "rewards/accuracies": 0.6875, - "rewards/chosen": -0.21097686886787415, - "rewards/margins": 0.21348261833190918, - "rewards/rejected": -0.42445945739746094, + "logits/chosen": -2.863358974456787, + "logits/rejected": -2.8709418773651123, + "logps/chosen": -308.0937805175781, + "logps/rejected": -322.3440246582031, + "loss": 0.6189, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -0.19167309999465942, + "rewards/margins": 0.20849671959877014, + "rewards/rejected": -0.40016984939575195, "step": 320 }, { "epoch": 0.26, - "grad_norm": 3.228531837463379, + "grad_norm": 3.2525851726531982, "learning_rate": 4.620120240391065e-06, - "logits/chosen": -2.8391566276550293, - "logits/rejected": -2.864546775817871, - "logps/chosen": -333.9083251953125, - "logps/rejected": -310.19256591796875, - "loss": 0.6103, + "logits/chosen": -2.8361878395080566, + "logits/rejected": -2.8604865074157715, + "logps/chosen": -331.04949951171875, + "logps/rejected": -306.60662841796875, + "loss": 0.612, "rewards/accuracies": 0.699999988079071, - "rewards/chosen": -0.18001362681388855, - "rewards/margins": 0.2388877123594284, - "rewards/rejected": -0.41890135407447815, + "rewards/chosen": -0.15142570436000824, + "rewards/margins": 0.23161661624908447, + "rewards/rejected": -0.3830423355102539, "step": 325 }, { "epoch": 0.264, - "grad_norm": 4.000072479248047, + "grad_norm": 3.2161409854888916, "learning_rate": 4.601416508739211e-06, - "logits/chosen": -2.768284320831299, - "logits/rejected": -2.7349295616149902, - "logps/chosen": -294.70703125, - "logps/rejected": -288.8208923339844, - "loss": 0.61, - "rewards/accuracies": 0.675000011920929, - "rewards/chosen": -0.18951205909252167, - "rewards/margins": 0.23551206290721893, - "rewards/rejected": -0.4250241816043854, + "logits/chosen": -2.765329360961914, + "logits/rejected": -2.731293201446533, + "logps/chosen": -294.65509033203125, + "logps/rejected": -288.2440185546875, + "loss": 0.6113, + "rewards/accuracies": 0.6875, + "rewards/chosen": -0.1889929473400116, + "rewards/margins": 0.23026308417320251, + "rewards/rejected": -0.4192560315132141, "step": 330 }, { "epoch": 0.268, - "grad_norm": 4.16896915435791, + "grad_norm": 4.34539270401001, "learning_rate": 4.582303101775249e-06, - "logits/chosen": -2.7774980068206787, - "logits/rejected": -2.754678726196289, - "logps/chosen": -302.11602783203125, - "logps/rejected": -277.4075012207031, - "loss": 0.613, + "logits/chosen": -2.773538112640381, + "logits/rejected": -2.750394582748413, + "logps/chosen": -301.92291259765625, + "logps/rejected": -276.76275634765625, + "loss": 0.6137, "rewards/accuracies": 0.6625000238418579, - "rewards/chosen": -0.25999313592910767, - "rewards/margins": 0.24224285781383514, - "rewards/rejected": -0.502236008644104, + "rewards/chosen": -0.25806164741516113, + "rewards/margins": 0.237727090716362, + "rewards/rejected": -0.49578872323036194, "step": 335 }, { "epoch": 0.272, - "grad_norm": 3.032811403274536, + "grad_norm": 2.9809610843658447, "learning_rate": 4.562783745695738e-06, - "logits/chosen": -2.7642288208007812, - "logits/rejected": -2.8096585273742676, - "logps/chosen": -216.1359100341797, - "logps/rejected": -250.99014282226562, - "loss": 0.6156, + "logits/chosen": -2.7601351737976074, + "logits/rejected": -2.805574893951416, + "logps/chosen": -213.38693237304688, + "logps/rejected": -248.6228790283203, + "loss": 0.6131, "rewards/accuracies": 0.637499988079071, - "rewards/chosen": -0.3114262819290161, - "rewards/margins": 0.21660414338111877, - "rewards/rejected": -0.5280304551124573, + "rewards/chosen": -0.2839365601539612, + "rewards/margins": 0.2204209268093109, + "rewards/rejected": -0.5043575167655945, "step": 340 }, { "epoch": 0.276, - "grad_norm": 4.874217987060547, + "grad_norm": 3.7868945598602295, "learning_rate": 4.542862245837821e-06, - "logits/chosen": -2.868809700012207, - "logits/rejected": -2.817004680633545, - "logps/chosen": -327.43060302734375, - "logps/rejected": -330.1990966796875, - "loss": 0.5829, + "logits/chosen": -2.862086296081543, + "logits/rejected": -2.80869722366333, + "logps/chosen": -326.58392333984375, + "logps/rejected": -329.5889587402344, + "loss": 0.5811, "rewards/accuracies": 0.7250000238418579, - "rewards/chosen": -0.24460521340370178, - "rewards/margins": 0.30477675795555115, - "rewards/rejected": -0.5493819117546082, + "rewards/chosen": -0.23613891005516052, + "rewards/margins": 0.30714207887649536, + "rewards/rejected": -0.5432809591293335, "step": 345 }, { "epoch": 0.28, - "grad_norm": 4.413153648376465, + "grad_norm": 4.723974227905273, "learning_rate": 4.522542485937369e-06, - "logits/chosen": -2.7310781478881836, - "logits/rejected": -2.7017111778259277, - "logps/chosen": -269.2838134765625, - "logps/rejected": -288.1488952636719, - "loss": 0.6175, - "rewards/accuracies": 0.699999988079071, - "rewards/chosen": -0.31993380188941956, - "rewards/margins": 0.23448209464550018, - "rewards/rejected": -0.5544158220291138, + "logits/chosen": -2.723212242126465, + "logits/rejected": -2.6936533451080322, + "logps/chosen": -267.9949951171875, + "logps/rejected": -286.03448486328125, + "loss": 0.6194, + "rewards/accuracies": 0.7124999761581421, + "rewards/chosen": -0.3070460557937622, + "rewards/margins": 0.2262255847454071, + "rewards/rejected": -0.5332716703414917, "step": 350 }, { "epoch": 0.284, - "grad_norm": 3.4240481853485107, + "grad_norm": 3.612205982208252, "learning_rate": 4.501828427371834e-06, - "logits/chosen": -2.8249359130859375, - "logits/rejected": -2.7783515453338623, - "logps/chosen": -279.47833251953125, - "logps/rejected": -266.2778015136719, - "loss": 0.6243, + "logits/chosen": -2.8160369396209717, + "logits/rejected": -2.7678263187408447, + "logps/chosen": -276.889892578125, + "logps/rejected": -262.567138671875, + "loss": 0.6269, "rewards/accuracies": 0.699999988079071, - "rewards/chosen": -0.2718459963798523, - "rewards/margins": 0.22681903839111328, - "rewards/rejected": -0.4986650049686432, + "rewards/chosen": -0.2459622323513031, + "rewards/margins": 0.21559634804725647, + "rewards/rejected": -0.4615585207939148, "step": 355 }, { "epoch": 0.288, - "grad_norm": 4.139309406280518, + "grad_norm": 4.156825065612793, "learning_rate": 4.4807241083879774e-06, - "logits/chosen": -2.8446147441864014, - "logits/rejected": -2.8624658584594727, - "logps/chosen": -301.0242614746094, - "logps/rejected": -329.738037109375, - "loss": 0.6215, - "rewards/accuracies": 0.6625000238418579, - "rewards/chosen": -0.29846441745758057, - "rewards/margins": 0.22665563225746155, - "rewards/rejected": -0.5251200199127197, + "logits/chosen": -2.8309903144836426, + "logits/rejected": -2.848707914352417, + "logps/chosen": -298.7277526855469, + "logps/rejected": -328.09912109375, + "loss": 0.6177, + "rewards/accuracies": 0.6875, + "rewards/chosen": -0.27550002932548523, + "rewards/margins": 0.23323087394237518, + "rewards/rejected": -0.508730947971344, "step": 360 }, { "epoch": 0.292, - "grad_norm": 3.9606006145477295, + "grad_norm": 4.314282417297363, "learning_rate": 4.4592336433146e-06, - "logits/chosen": -2.829144239425659, - "logits/rejected": -2.8264288902282715, - "logps/chosen": -306.7838439941406, - "logps/rejected": -312.4410095214844, - "loss": 0.6144, - "rewards/accuracies": 0.737500011920929, - "rewards/chosen": -0.3578701913356781, - "rewards/margins": 0.22671571373939514, - "rewards/rejected": -0.5845859050750732, + "logits/chosen": -2.811722755432129, + "logits/rejected": -2.807515859603882, + "logps/chosen": -309.4399719238281, + "logps/rejected": -314.6178283691406, + "loss": 0.6153, + "rewards/accuracies": 0.7124999761581421, + "rewards/chosen": -0.3844314217567444, + "rewards/margins": 0.22192268073558807, + "rewards/rejected": -0.6063541173934937, "step": 365 }, { "epoch": 0.296, - "grad_norm": 4.797597885131836, + "grad_norm": 4.635516166687012, "learning_rate": 4.437361221760449e-06, - "logits/chosen": -2.8679544925689697, - "logits/rejected": -2.8508830070495605, - "logps/chosen": -314.5845947265625, - "logps/rejected": -294.0763244628906, - "loss": 0.5933, - "rewards/accuracies": 0.699999988079071, - "rewards/chosen": -0.30855458974838257, - "rewards/margins": 0.29460665583610535, - "rewards/rejected": -0.6031612157821655, + "logits/chosen": -2.850919485092163, + "logits/rejected": -2.8320136070251465, + "logps/chosen": -316.4634704589844, + "logps/rejected": -295.4265441894531, + "loss": 0.5943, + "rewards/accuracies": 0.6625000238418579, + "rewards/chosen": -0.32734400033950806, + "rewards/margins": 0.2893194556236267, + "rewards/rejected": -0.61666339635849, "step": 370 }, { "epoch": 0.3, - "grad_norm": 4.122452259063721, + "grad_norm": 4.108780384063721, "learning_rate": 4.415111107797445e-06, - "logits/chosen": -2.7813663482666016, - "logits/rejected": -2.6964869499206543, - "logps/chosen": -304.09637451171875, - "logps/rejected": -295.1402587890625, - "loss": 0.635, + "logits/chosen": -2.763192892074585, + "logits/rejected": -2.6753077507019043, + "logps/chosen": -304.7072448730469, + "logps/rejected": -296.2711486816406, + "loss": 0.6325, "rewards/accuracies": 0.6499999761581421, - "rewards/chosen": -0.27096718549728394, - "rewards/margins": 0.20372644066810608, - "rewards/rejected": -0.4746936857700348, + "rewards/chosen": -0.277075856924057, + "rewards/margins": 0.20892643928527832, + "rewards/rejected": -0.48600226640701294, "step": 375 }, { "epoch": 0.304, - "grad_norm": 3.014270305633545, + "grad_norm": 2.956279993057251, "learning_rate": 4.3924876391293915e-06, - "logits/chosen": -2.813854217529297, - "logits/rejected": -2.7791194915771484, - "logps/chosen": -274.0520935058594, - "logps/rejected": -276.63140869140625, - "loss": 0.6145, - "rewards/accuracies": 0.6875, - "rewards/chosen": -0.14454008638858795, - "rewards/margins": 0.22425620257854462, - "rewards/rejected": -0.36879628896713257, + "logits/chosen": -2.7950615882873535, + "logits/rejected": -2.7592384815216064, + "logps/chosen": -273.1991271972656, + "logps/rejected": -275.79229736328125, + "loss": 0.6129, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -0.13601061701774597, + "rewards/margins": 0.22439420223236084, + "rewards/rejected": -0.3604048192501068, "step": 380 }, { "epoch": 0.308, - "grad_norm": 3.01690936088562, + "grad_norm": 4.006164073944092, "learning_rate": 4.36949522624633e-06, - "logits/chosen": -2.8495421409606934, - "logits/rejected": -2.8250906467437744, - "logps/chosen": -324.4741516113281, - "logps/rejected": -310.1029968261719, - "loss": 0.5852, + "logits/chosen": -2.830416202545166, + "logits/rejected": -2.8047218322753906, + "logps/chosen": -323.8509826660156, + "logps/rejected": -308.55230712890625, + "loss": 0.5875, "rewards/accuracies": 0.737500011920929, - "rewards/chosen": -0.11460743099451065, - "rewards/margins": 0.2905317544937134, - "rewards/rejected": -0.4051392078399658, + "rewards/chosen": -0.10837619006633759, + "rewards/margins": 0.28125640749931335, + "rewards/rejected": -0.38963261246681213, "step": 385 }, { "epoch": 0.312, - "grad_norm": 6.7993245124816895, + "grad_norm": 6.139017581939697, "learning_rate": 4.346138351564711e-06, - "logits/chosen": -2.8487377166748047, - "logits/rejected": -2.778311252593994, - "logps/chosen": -363.527587890625, - "logps/rejected": -311.5059509277344, - "loss": 0.6315, - "rewards/accuracies": 0.637499988079071, - "rewards/chosen": -0.21368345618247986, - "rewards/margins": 0.20435233414173126, - "rewards/rejected": -0.4180358052253723, + "logits/chosen": -2.8317887783050537, + "logits/rejected": -2.7582955360412598, + "logps/chosen": -362.7658996582031, + "logps/rejected": -310.333984375, + "loss": 0.6309, + "rewards/accuracies": 0.6625000238418579, + "rewards/chosen": -0.20606637001037598, + "rewards/margins": 0.2002502977848053, + "rewards/rejected": -0.4063166677951813, "step": 390 }, { "epoch": 0.316, - "grad_norm": 4.2425737380981445, + "grad_norm": 4.839846134185791, "learning_rate": 4.322421568553529e-06, - "logits/chosen": -2.8648476600646973, - "logits/rejected": -2.8160643577575684, - "logps/chosen": -383.7563781738281, - "logps/rejected": -340.1937561035156, + "logits/chosen": -2.848759174346924, + "logits/rejected": -2.7962448596954346, + "logps/chosen": -382.81048583984375, + "logps/rejected": -339.45672607421875, "loss": 0.6138, "rewards/accuracies": 0.699999988079071, - "rewards/chosen": -0.1808895766735077, - "rewards/margins": 0.22633692622184753, - "rewards/rejected": -0.40722647309303284, + "rewards/chosen": -0.1714310199022293, + "rewards/margins": 0.22842545807361603, + "rewards/rejected": -0.39985641837120056, "step": 395 }, { "epoch": 0.32, - "grad_norm": 3.8491086959838867, + "grad_norm": 3.8282814025878906, "learning_rate": 4.2983495008466285e-06, - "logits/chosen": -2.879483461380005, - "logits/rejected": -2.8419952392578125, - "logps/chosen": -320.96197509765625, - "logps/rejected": -318.02056884765625, - "loss": 0.5689, - "rewards/accuracies": 0.7124999761581421, - "rewards/chosen": -0.22649240493774414, - "rewards/margins": 0.3563699424266815, - "rewards/rejected": -0.5828623175621033, + "logits/chosen": -2.8639044761657715, + "logits/rejected": -2.8238472938537598, + "logps/chosen": -317.0664367675781, + "logps/rejected": -313.5345458984375, + "loss": 0.5699, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -0.1875368058681488, + "rewards/margins": 0.35046523809432983, + "rewards/rejected": -0.538002073764801, "step": 400 }, { "epoch": 0.32, - "eval_logits/chosen": -2.8437469005584717, - "eval_logits/rejected": -2.8083014488220215, - "eval_logps/chosen": -312.95733642578125, - "eval_logps/rejected": -305.2159423828125, - "eval_loss": 0.6026533246040344, - "eval_rewards/accuracies": 0.6944444179534912, - "eval_rewards/chosen": -0.29719212651252747, - "eval_rewards/margins": 0.27467086911201477, - "eval_rewards/rejected": -0.5718629360198975, - "eval_runtime": 166.7775, + "eval_logits/chosen": -2.831890344619751, + "eval_logits/rejected": -2.795173168182373, + "eval_logps/chosen": -309.8138427734375, + "eval_logps/rejected": -301.8563232421875, + "eval_loss": 0.6034325957298279, + "eval_rewards/accuracies": 0.6964285969734192, + "eval_rewards/chosen": -0.2657574713230133, + "eval_rewards/margins": 0.27250993251800537, + "eval_rewards/rejected": -0.5382674336433411, + "eval_runtime": 166.7653, "eval_samples_per_second": 2.998, "eval_steps_per_second": 0.378, "step": 400 }, { "epoch": 0.324, - "grad_norm": 5.923827648162842, + "grad_norm": 4.6939778327941895, "learning_rate": 4.273926841341303e-06, - "logits/chosen": -2.8288912773132324, - "logits/rejected": -2.811527967453003, - "logps/chosen": -270.85467529296875, - "logps/rejected": -300.5176086425781, - "loss": 0.6137, - "rewards/accuracies": 0.6875, - "rewards/chosen": -0.3265763223171234, - "rewards/margins": 0.2849840521812439, - "rewards/rejected": -0.6115604639053345, + "logits/chosen": -2.8153679370880127, + "logits/rejected": -2.797407388687134, + "logps/chosen": -267.6861877441406, + "logps/rejected": -296.2272033691406, + "loss": 0.6146, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -0.2948915362358093, + "rewards/margins": 0.27376502752304077, + "rewards/rejected": -0.5686565637588501, "step": 405 }, { "epoch": 0.328, - "grad_norm": 4.179617404937744, + "grad_norm": 5.867495059967041, "learning_rate": 4.249158351283414e-06, - "logits/chosen": -2.830479860305786, - "logits/rejected": -2.7938389778137207, - "logps/chosen": -298.1662902832031, - "logps/rejected": -311.0763244628906, - "loss": 0.6207, + "logits/chosen": -2.8131103515625, + "logits/rejected": -2.7752747535705566, + "logps/chosen": -296.8202209472656, + "logps/rejected": -309.84991455078125, + "loss": 0.6198, "rewards/accuracies": 0.625, - "rewards/chosen": -0.3962728977203369, - "rewards/margins": 0.26946958899497986, - "rewards/rejected": -0.6657425165176392, + "rewards/chosen": -0.3828127086162567, + "rewards/margins": 0.27066582441329956, + "rewards/rejected": -0.6534786224365234, "step": 410 }, { "epoch": 0.332, - "grad_norm": 4.05703592300415, + "grad_norm": 3.4944844245910645, "learning_rate": 4.224048859339175e-06, - "logits/chosen": -2.8106844425201416, - "logits/rejected": -2.793975591659546, - "logps/chosen": -320.8597717285156, - "logps/rejected": -314.65643310546875, - "loss": 0.5808, - "rewards/accuracies": 0.762499988079071, - "rewards/chosen": -0.2955939471721649, - "rewards/margins": 0.317967027425766, - "rewards/rejected": -0.6135609745979309, + "logits/chosen": -2.7919559478759766, + "logits/rejected": -2.7731316089630127, + "logps/chosen": -320.2292175292969, + "logps/rejected": -313.79827880859375, + "loss": 0.5827, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": -0.28928855061531067, + "rewards/margins": 0.3156905770301819, + "rewards/rejected": -0.6049790978431702, "step": 415 }, { "epoch": 0.336, - "grad_norm": 3.603731632232666, + "grad_norm": 6.125190734863281, "learning_rate": 4.198603260653792e-06, - "logits/chosen": -2.828977108001709, - "logits/rejected": -2.8087880611419678, - "logps/chosen": -318.9561767578125, - "logps/rejected": -296.2648010253906, - "loss": 0.6219, - "rewards/accuracies": 0.737500011920929, - "rewards/chosen": -0.312124639749527, - "rewards/margins": 0.2338072955608368, - "rewards/rejected": -0.5459319353103638, + "logits/chosen": -2.8130970001220703, + "logits/rejected": -2.7901930809020996, + "logps/chosen": -317.51165771484375, + "logps/rejected": -293.3788146972656, + "loss": 0.6275, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -0.2976795434951782, + "rewards/margins": 0.21939225494861603, + "rewards/rejected": -0.5170717239379883, "step": 420 }, { "epoch": 0.34, - "grad_norm": 3.502253293991089, + "grad_norm": 4.455983638763428, "learning_rate": 4.172826515897146e-06, - "logits/chosen": -2.840507984161377, - "logits/rejected": -2.8030855655670166, - "logps/chosen": -284.6482238769531, - "logps/rejected": -301.00970458984375, - "loss": 0.5747, - "rewards/accuracies": 0.7124999761581421, - "rewards/chosen": -0.24788649380207062, - "rewards/margins": 0.3554701507091522, - "rewards/rejected": -0.6033565998077393, + "logits/chosen": -2.8195388317108154, + "logits/rejected": -2.780494451522827, + "logps/chosen": -283.6789245605469, + "logps/rejected": -300.58563232421875, + "loss": 0.572, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.23819419741630554, + "rewards/margins": 0.36092180013656616, + "rewards/rejected": -0.5991159677505493, "step": 425 }, { "epoch": 0.344, - "grad_norm": 3.493734359741211, + "grad_norm": 3.734440326690674, "learning_rate": 4.146723650296701e-06, - "logits/chosen": -2.842768430709839, - "logits/rejected": -2.8307838439941406, - "logps/chosen": -303.18511962890625, - "logps/rejected": -299.94586181640625, - "loss": 0.5988, - "rewards/accuracies": 0.6875, - "rewards/chosen": -0.238745778799057, - "rewards/margins": 0.2844436764717102, - "rewards/rejected": -0.5231894254684448, + "logits/chosen": -2.8214731216430664, + "logits/rejected": -2.80680775642395, + "logps/chosen": -305.1948547363281, + "logps/rejected": -301.62579345703125, + "loss": 0.603, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -0.25884318351745605, + "rewards/margins": 0.2811453640460968, + "rewards/rejected": -0.5399885773658752, "step": 430 }, { "epoch": 0.348, - "grad_norm": 4.337499618530273, + "grad_norm": 3.1842479705810547, "learning_rate": 4.120299752657828e-06, - "logits/chosen": -2.820674180984497, - "logits/rejected": -2.8131017684936523, - "logps/chosen": -309.2215270996094, - "logps/rejected": -302.0687561035156, - "loss": 0.5918, + "logits/chosen": -2.799774169921875, + "logits/rejected": -2.790123462677002, + "logps/chosen": -309.83477783203125, + "logps/rejected": -303.22332763671875, + "loss": 0.5892, "rewards/accuracies": 0.699999988079071, - "rewards/chosen": -0.20002242922782898, - "rewards/margins": 0.3058350086212158, - "rewards/rejected": -0.5058574080467224, + "rewards/chosen": -0.20615491271018982, + "rewards/margins": 0.3112487494945526, + "rewards/rejected": -0.5174037218093872, "step": 435 }, { "epoch": 0.352, - "grad_norm": 4.533146381378174, + "grad_norm": 6.865662574768066, "learning_rate": 4.093559974371725e-06, - "logits/chosen": -2.8260464668273926, - "logits/rejected": -2.8369338512420654, - "logps/chosen": -313.92413330078125, - "logps/rejected": -341.5757141113281, - "loss": 0.5944, - "rewards/accuracies": 0.6875, - "rewards/chosen": -0.24169588088989258, - "rewards/margins": 0.32093319296836853, - "rewards/rejected": -0.562628984451294, + "logits/chosen": -2.806478977203369, + "logits/rejected": -2.8173699378967285, + "logps/chosen": -312.33978271484375, + "logps/rejected": -340.7373046875, + "loss": 0.5898, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -0.22585222125053406, + "rewards/margins": 0.3283933103084564, + "rewards/rejected": -0.5542455911636353, "step": 440 }, { "epoch": 0.356, - "grad_norm": 6.319594860076904, + "grad_norm": 5.557225704193115, "learning_rate": 4.066509528411151e-06, - "logits/chosen": -2.7408385276794434, - "logits/rejected": -2.70278000831604, - "logps/chosen": -280.9513854980469, - "logps/rejected": -315.61260986328125, - "loss": 0.5481, - "rewards/accuracies": 0.762499988079071, - "rewards/chosen": -0.34108084440231323, - "rewards/margins": 0.41155967116355896, - "rewards/rejected": -0.7526406049728394, + "logits/chosen": -2.7204031944274902, + "logits/rejected": -2.679771900177002, + "logps/chosen": -277.78057861328125, + "logps/rejected": -310.7375183105469, + "loss": 0.5563, + "rewards/accuracies": 0.737500011920929, + "rewards/chosen": -0.30937278270721436, + "rewards/margins": 0.39451706409454346, + "rewards/rejected": -0.7038899660110474, "step": 445 }, { "epoch": 0.36, - "grad_norm": 7.433955192565918, + "grad_norm": 6.662594795227051, "learning_rate": 4.039153688314146e-06, - "logits/chosen": -2.865485906600952, - "logits/rejected": -2.8098623752593994, - "logps/chosen": -350.09820556640625, - "logps/rejected": -329.5234680175781, - "loss": 0.6008, - "rewards/accuracies": 0.699999988079071, - "rewards/chosen": -0.49973931908607483, - "rewards/margins": 0.3033995032310486, - "rewards/rejected": -0.8031389117240906, + "logits/chosen": -2.8505125045776367, + "logits/rejected": -2.7928390502929688, + "logps/chosen": -344.05902099609375, + "logps/rejected": -324.29962158203125, + "loss": 0.596, + "rewards/accuracies": 0.6625000238418579, + "rewards/chosen": -0.43934765458106995, + "rewards/margins": 0.31155315041542053, + "rewards/rejected": -0.7509008049964905, "step": 450 }, { "epoch": 0.364, - "grad_norm": 4.991713047027588, + "grad_norm": 5.561422348022461, "learning_rate": 4.011497787155938e-06, - "logits/chosen": -2.777055501937866, - "logits/rejected": -2.717963695526123, - "logps/chosen": -340.73223876953125, - "logps/rejected": -334.4681091308594, - "loss": 0.5899, - "rewards/accuracies": 0.6000000238418579, - "rewards/chosen": -0.6034590005874634, - "rewards/margins": 0.3287494480609894, - "rewards/rejected": -0.9322085380554199, + "logits/chosen": -2.759361743927002, + "logits/rejected": -2.697282314300537, + "logps/chosen": -333.2874755859375, + "logps/rejected": -329.25225830078125, + "loss": 0.5838, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.529011607170105, + "rewards/margins": 0.3510381579399109, + "rewards/rejected": -0.8800498247146606, "step": 455 }, { "epoch": 0.368, - "grad_norm": 4.300924301147461, + "grad_norm": 4.668981075286865, "learning_rate": 3.983547216509254e-06, - "logits/chosen": -2.8502197265625, - "logits/rejected": -2.8119924068450928, - "logps/chosen": -382.5280456542969, - "logps/rejected": -336.8903503417969, - "loss": 0.5755, + "logits/chosen": -2.8310632705688477, + "logits/rejected": -2.7877674102783203, + "logps/chosen": -384.951904296875, + "logps/rejected": -339.9574890136719, + "loss": 0.571, "rewards/accuracies": 0.75, - "rewards/chosen": -0.48744526505470276, - "rewards/margins": 0.3749392628669739, - "rewards/rejected": -0.8623844981193542, + "rewards/chosen": -0.5116842985153198, + "rewards/margins": 0.3813716769218445, + "rewards/rejected": -0.8930560946464539, "step": 460 }, { "epoch": 0.372, - "grad_norm": 4.1107025146484375, + "grad_norm": 3.6945154666900635, "learning_rate": 3.955307425393224e-06, - "logits/chosen": -2.875734329223633, - "logits/rejected": -2.834575891494751, - "logps/chosen": -360.67852783203125, - "logps/rejected": -366.3440856933594, - "loss": 0.5174, - "rewards/accuracies": 0.737500011920929, - "rewards/chosen": -0.29663369059562683, - "rewards/margins": 0.5048123002052307, - "rewards/rejected": -0.8014459609985352, + "logits/chosen": -2.860947370529175, + "logits/rejected": -2.817092180252075, + "logps/chosen": -368.18890380859375, + "logps/rejected": -373.40155029296875, + "loss": 0.5221, + "rewards/accuracies": 0.6875, + "rewards/chosen": -0.3717382550239563, + "rewards/margins": 0.5002824664115906, + "rewards/rejected": -0.8720208406448364, "step": 465 }, { "epoch": 0.376, - "grad_norm": 5.430475234985352, + "grad_norm": 5.232949256896973, "learning_rate": 3.92678391921108e-06, - "logits/chosen": -2.725780963897705, - "logits/rejected": -2.7029075622558594, - "logps/chosen": -364.1918640136719, - "logps/rejected": -378.3152160644531, - "loss": 0.5719, - "rewards/accuracies": 0.6875, - "rewards/chosen": -0.42064231634140015, - "rewards/margins": 0.38249263167381287, - "rewards/rejected": -0.8031350374221802, + "logits/chosen": -2.7127513885498047, + "logits/rejected": -2.689349889755249, + "logps/chosen": -373.66119384765625, + "logps/rejected": -391.1875305175781, + "loss": 0.5644, + "rewards/accuracies": 0.7124999761581421, + "rewards/chosen": -0.5153363347053528, + "rewards/margins": 0.4165223240852356, + "rewards/rejected": -0.9318585395812988, "step": 470 }, { "epoch": 0.38, - "grad_norm": 6.495006561279297, + "grad_norm": 5.3401899337768555, "learning_rate": 3.897982258676867e-06, - "logits/chosen": -2.775744915008545, - "logits/rejected": -2.769195318222046, - "logps/chosen": -308.8157043457031, - "logps/rejected": -341.82037353515625, - "loss": 0.5823, - "rewards/accuracies": 0.7124999761581421, - "rewards/chosen": -0.36774376034736633, - "rewards/margins": 0.3316097855567932, - "rewards/rejected": -0.6993535161018372, + "logits/chosen": -2.75883150100708, + "logits/rejected": -2.7517054080963135, + "logps/chosen": -315.7061462402344, + "logps/rejected": -348.54168701171875, + "loss": 0.5835, + "rewards/accuracies": 0.6875, + "rewards/chosen": -0.4366493821144104, + "rewards/margins": 0.3299176096916199, + "rewards/rejected": -0.7665671110153198, "step": 475 }, { "epoch": 0.384, - "grad_norm": 7.480860233306885, + "grad_norm": 6.390676498413086, "learning_rate": 3.868908058731376e-06, - "logits/chosen": -2.7915594577789307, - "logits/rejected": -2.742253065109253, - "logps/chosen": -352.02996826171875, - "logps/rejected": -327.95928955078125, - "loss": 0.6819, - "rewards/accuracies": 0.612500011920929, - "rewards/chosen": -0.5273100733757019, - "rewards/margins": 0.15378466248512268, - "rewards/rejected": -0.681094765663147, + "logits/chosen": -2.77325701713562, + "logits/rejected": -2.7217469215393066, + "logps/chosen": -355.1070251464844, + "logps/rejected": -332.23199462890625, + "loss": 0.6723, + "rewards/accuracies": 0.637499988079071, + "rewards/chosen": -0.558080792427063, + "rewards/margins": 0.1657411754131317, + "rewards/rejected": -0.7238219380378723, "step": 480 }, { "epoch": 0.388, - "grad_norm": 8.06702709197998, + "grad_norm": 8.37760066986084, "learning_rate": 3.839566987447492e-06, - "logits/chosen": -2.7685608863830566, - "logits/rejected": -2.7417702674865723, - "logps/chosen": -345.76495361328125, - "logps/rejected": -346.05426025390625, - "loss": 0.573, - "rewards/accuracies": 0.75, - "rewards/chosen": -0.5824685096740723, - "rewards/margins": 0.387678325176239, - "rewards/rejected": -0.9701469540596008, + "logits/chosen": -2.7543792724609375, + "logits/rejected": -2.7266902923583984, + "logps/chosen": -345.9546813964844, + "logps/rejected": -343.42669677734375, + "loss": 0.5789, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": -0.584365963935852, + "rewards/margins": 0.35950571298599243, + "rewards/rejected": -0.9438716173171997, "step": 485 }, { "epoch": 0.392, - "grad_norm": 5.584991931915283, + "grad_norm": 4.165892124176025, "learning_rate": 3.8099647649251984e-06, - "logits/chosen": -2.816774845123291, - "logits/rejected": -2.766758441925049, - "logps/chosen": -344.7107849121094, - "logps/rejected": -350.0550842285156, - "loss": 0.621, + "logits/chosen": -2.798603057861328, + "logits/rejected": -2.746656894683838, + "logps/chosen": -339.4115905761719, + "logps/rejected": -342.5590515136719, + "loss": 0.6252, "rewards/accuracies": 0.6499999761581421, - "rewards/chosen": -0.6366808414459229, - "rewards/margins": 0.28556036949157715, - "rewards/rejected": -0.9222410917282104, + "rewards/chosen": -0.5836890935897827, + "rewards/margins": 0.2635918855667114, + "rewards/rejected": -0.8472809791564941, "step": 490 }, { "epoch": 0.396, - "grad_norm": 8.032232284545898, + "grad_norm": 5.796815872192383, "learning_rate": 3.780107162176429e-06, - "logits/chosen": -2.7901835441589355, - "logits/rejected": -2.771435260772705, - "logps/chosen": -359.55804443359375, - "logps/rejected": -320.9138488769531, - "loss": 0.5898, - "rewards/accuracies": 0.7124999761581421, - "rewards/chosen": -0.7569794058799744, - "rewards/margins": 0.3364938199520111, - "rewards/rejected": -1.0934733152389526, + "logits/chosen": -2.771759510040283, + "logits/rejected": -2.7516114711761475, + "logps/chosen": -340.46343994140625, + "logps/rejected": -309.8275451660156, + "loss": 0.5526, + "rewards/accuracies": 0.737500011920929, + "rewards/chosen": -0.5660332441329956, + "rewards/margins": 0.4165772497653961, + "rewards/rejected": -0.9826105237007141, "step": 495 }, { "epoch": 0.4, - "grad_norm": 6.636690616607666, + "grad_norm": 10.051532745361328, "learning_rate": 3.7500000000000005e-06, - "logits/chosen": -2.7686400413513184, - "logits/rejected": -2.747738838195801, - "logps/chosen": -380.00469970703125, - "logps/rejected": -391.73797607421875, - "loss": 0.5689, + "logits/chosen": -2.7455780506134033, + "logits/rejected": -2.723891496658325, + "logps/chosen": -364.03936767578125, + "logps/rejected": -377.3976135253906, + "loss": 0.5622, "rewards/accuracies": 0.75, - "rewards/chosen": -0.6767264008522034, - "rewards/margins": 0.42483949661254883, - "rewards/rejected": -1.1015657186508179, + "rewards/chosen": -0.5170733332633972, + "rewards/margins": 0.44108885526657104, + "rewards/rejected": -0.9581623077392578, "step": 500 }, { "epoch": 0.4, - "eval_logits/chosen": -2.7560079097747803, - "eval_logits/rejected": -2.7151803970336914, - "eval_logps/chosen": -349.3812255859375, - "eval_logps/rejected": -355.066162109375, - "eval_loss": 0.5749732851982117, - "eval_rewards/accuracies": 0.7242063283920288, - "eval_rewards/chosen": -0.6614311933517456, - "eval_rewards/margins": 0.40893420577049255, - "eval_rewards/rejected": -1.0703654289245605, - "eval_runtime": 166.5841, - "eval_samples_per_second": 3.001, + "eval_logits/chosen": -2.732027053833008, + "eval_logits/rejected": -2.691253662109375, + "eval_logps/chosen": -338.8871765136719, + "eval_logps/rejected": -345.97265625, + "eval_loss": 0.5688419342041016, + "eval_rewards/accuracies": 0.7142857313156128, + "eval_rewards/chosen": -0.5564908385276794, + "eval_rewards/margins": 0.42293980717658997, + "eval_rewards/rejected": -0.9794306755065918, + "eval_runtime": 166.7904, + "eval_samples_per_second": 2.998, "eval_steps_per_second": 0.378, "step": 500 }, { "epoch": 0.404, - "grad_norm": 5.858213901519775, + "grad_norm": 7.153428554534912, "learning_rate": 3.7196491478468322e-06, - "logits/chosen": -2.6860787868499756, - "logits/rejected": -2.702346086502075, - "logps/chosen": -351.7030944824219, - "logps/rejected": -389.79852294921875, - "loss": 0.5747, - "rewards/accuracies": 0.6499999761581421, - "rewards/chosen": -0.7111638784408569, - "rewards/margins": 0.4305337369441986, - "rewards/rejected": -1.141697645187378, + "logits/chosen": -2.662764549255371, + "logits/rejected": -2.6799817085266113, + "logps/chosen": -346.8814392089844, + "logps/rejected": -386.2395935058594, + "loss": 0.5618, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -0.6629476547241211, + "rewards/margins": 0.44316092133522034, + "rewards/rejected": -1.106108546257019, "step": 505 }, { "epoch": 0.408, - "grad_norm": 6.421969890594482, + "grad_norm": 7.824460506439209, "learning_rate": 3.689060522675689e-06, - "logits/chosen": -2.7720227241516113, - "logits/rejected": -2.7550883293151855, - "logps/chosen": -335.5453186035156, - "logps/rejected": -353.2554931640625, - "loss": 0.6084, - "rewards/accuracies": 0.699999988079071, - "rewards/chosen": -0.47883662581443787, - "rewards/margins": 0.3589397370815277, - "rewards/rejected": -0.8377763628959656, + "logits/chosen": -2.739622116088867, + "logits/rejected": -2.7229952812194824, + "logps/chosen": -341.31610107421875, + "logps/rejected": -361.16583251953125, + "loss": 0.6013, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -0.5365445613861084, + "rewards/margins": 0.3803355097770691, + "rewards/rejected": -0.9168800115585327, "step": 510 }, { "epoch": 0.412, - "grad_norm": 6.96730375289917, + "grad_norm": 6.43038272857666, "learning_rate": 3.658240087799655e-06, - "logits/chosen": -2.702028751373291, - "logits/rejected": -2.715606689453125, - "logps/chosen": -309.8144836425781, - "logps/rejected": -362.7301330566406, - "loss": 0.5446, - "rewards/accuracies": 0.699999988079071, - "rewards/chosen": -0.4930266737937927, - "rewards/margins": 0.47733306884765625, - "rewards/rejected": -0.9703596830368042, + "logits/chosen": -2.6712212562561035, + "logits/rejected": -2.6851272583007812, + "logps/chosen": -314.61614990234375, + "logps/rejected": -371.5428466796875, + "loss": 0.5377, + "rewards/accuracies": 0.6875, + "rewards/chosen": -0.5410436391830444, + "rewards/margins": 0.517443835735321, + "rewards/rejected": -1.0584874153137207, "step": 515 }, { "epoch": 0.416, - "grad_norm": 4.9304680824279785, + "grad_norm": 8.127975463867188, "learning_rate": 3.627193851723577e-06, - "logits/chosen": -2.7465412616729736, - "logits/rejected": -2.7204620838165283, - "logps/chosen": -352.72674560546875, - "logps/rejected": -372.55242919921875, - "loss": 0.6349, - "rewards/accuracies": 0.6000000238418579, - "rewards/chosen": -0.7522457242012024, - "rewards/margins": 0.3193085789680481, - "rewards/rejected": -1.0715543031692505, + "logits/chosen": -2.717458724975586, + "logits/rejected": -2.690972328186035, + "logps/chosen": -355.3538818359375, + "logps/rejected": -379.71417236328125, + "loss": 0.6306, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -0.7785177230834961, + "rewards/margins": 0.36465466022491455, + "rewards/rejected": -1.143172264099121, "step": 520 }, { "epoch": 0.42, - "grad_norm": 6.871084213256836, + "grad_norm": 8.312115669250488, "learning_rate": 3.595927866972694e-06, - "logits/chosen": -2.708409309387207, - "logits/rejected": -2.7058815956115723, - "logps/chosen": -292.47271728515625, - "logps/rejected": -334.9390869140625, - "loss": 0.5762, - "rewards/accuracies": 0.7124999761581421, - "rewards/chosen": -0.6716808080673218, - "rewards/margins": 0.46342235803604126, - "rewards/rejected": -1.1351032257080078, + "logits/chosen": -2.6868720054626465, + "logits/rejected": -2.6861917972564697, + "logps/chosen": -292.2730407714844, + "logps/rejected": -340.75592041015625, + "loss": 0.5573, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -0.6696837544441223, + "rewards/margins": 0.5235880613327026, + "rewards/rejected": -1.1932718753814697, "step": 525 }, { "epoch": 0.424, - "grad_norm": 6.238135814666748, + "grad_norm": 9.0608491897583, "learning_rate": 3.564448228912682e-06, - "logits/chosen": -2.6425788402557373, - "logits/rejected": -2.6337852478027344, - "logps/chosen": -382.01910400390625, - "logps/rejected": -381.7391662597656, - "loss": 0.6097, + "logits/chosen": -2.6300408840179443, + "logits/rejected": -2.623781204223633, + "logps/chosen": -381.3360900878906, + "logps/rejected": -383.34161376953125, + "loss": 0.6091, "rewards/accuracies": 0.637499988079071, - "rewards/chosen": -0.7499979734420776, - "rewards/margins": 0.37163788080215454, - "rewards/rejected": -1.1216356754302979, + "rewards/chosen": -0.7431681752204895, + "rewards/margins": 0.39449232816696167, + "rewards/rejected": -1.1376605033874512, "step": 530 }, { "epoch": 0.428, - "grad_norm": 14.889227867126465, + "grad_norm": 15.011635780334473, "learning_rate": 3.532761074561355e-06, - "logits/chosen": -2.6252596378326416, - "logits/rejected": -2.573930501937866, - "logps/chosen": -389.1572265625, - "logps/rejected": -428.70489501953125, - "loss": 0.5708, - "rewards/accuracies": 0.762499988079071, - "rewards/chosen": -0.7006493806838989, - "rewards/margins": 0.508830189704895, - "rewards/rejected": -1.209479570388794, + "logits/chosen": -2.6198954582214355, + "logits/rejected": -2.571420192718506, + "logps/chosen": -389.4923095703125, + "logps/rejected": -430.41204833984375, + "loss": 0.5782, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": -0.7040005922317505, + "rewards/margins": 0.5225512385368347, + "rewards/rejected": -1.2265517711639404, "step": 535 }, { "epoch": 0.432, - "grad_norm": 5.182321548461914, + "grad_norm": 6.0668768882751465, "learning_rate": 3.5008725813922383e-06, - "logits/chosen": -2.7459521293640137, - "logits/rejected": -2.672208309173584, - "logps/chosen": -356.2322998046875, - "logps/rejected": -392.16021728515625, - "loss": 0.5393, - "rewards/accuracies": 0.699999988079071, - "rewards/chosen": -0.6245570182800293, - "rewards/margins": 0.5389561057090759, - "rewards/rejected": -1.1635130643844604, + "logits/chosen": -2.7458198070526123, + "logits/rejected": -2.6754350662231445, + "logps/chosen": -353.69500732421875, + "logps/rejected": -393.26953125, + "loss": 0.529, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -0.5991845726966858, + "rewards/margins": 0.5754216313362122, + "rewards/rejected": -1.1746060848236084, "step": 540 }, { "epoch": 0.436, - "grad_norm": 8.57190990447998, + "grad_norm": 9.939155578613281, "learning_rate": 3.4687889661302577e-06, - "logits/chosen": -2.6495566368103027, - "logits/rejected": -2.649183511734009, - "logps/chosen": -315.9193115234375, - "logps/rejected": -356.46417236328125, - "loss": 0.5273, - "rewards/accuracies": 0.800000011920929, - "rewards/chosen": -0.6088639497756958, - "rewards/margins": 0.5826314687728882, - "rewards/rejected": -1.1914955377578735, + "logits/chosen": -2.648597240447998, + "logits/rejected": -2.6514642238616943, + "logps/chosen": -319.5657653808594, + "logps/rejected": -362.01885986328125, + "loss": 0.5329, + "rewards/accuracies": 0.737500011920929, + "rewards/chosen": -0.6453290581703186, + "rewards/margins": 0.6017133593559265, + "rewards/rejected": -1.2470424175262451, "step": 545 }, { "epoch": 0.44, - "grad_norm": 6.0855207443237305, + "grad_norm": 7.438101768493652, "learning_rate": 3.436516483539781e-06, - "logits/chosen": -2.6810402870178223, - "logits/rejected": -2.6679680347442627, - "logps/chosen": -341.8032531738281, - "logps/rejected": -360.6465759277344, - "loss": 0.6281, + "logits/chosen": -2.679978847503662, + "logits/rejected": -2.667757034301758, + "logps/chosen": -354.5327453613281, + "logps/rejected": -376.2767333984375, + "loss": 0.6251, "rewards/accuracies": 0.625, - "rewards/chosen": -0.7102030515670776, - "rewards/margins": 0.3800693154335022, - "rewards/rejected": -1.090272307395935, + "rewards/chosen": -0.8374980688095093, + "rewards/margins": 0.40907567739486694, + "rewards/rejected": -1.246573805809021, "step": 550 }, { "epoch": 0.444, - "grad_norm": 6.288063049316406, + "grad_norm": 6.391237735748291, "learning_rate": 3.4040614252052305e-06, - "logits/chosen": -2.672840118408203, - "logits/rejected": -2.665916919708252, - "logps/chosen": -357.5086975097656, - "logps/rejected": -378.4448547363281, - "loss": 0.6071, - "rewards/accuracies": 0.6875, - "rewards/chosen": -0.7066811919212341, - "rewards/margins": 0.38481634855270386, - "rewards/rejected": -1.091497540473938, + "logits/chosen": -2.672637939453125, + "logits/rejected": -2.6665103435516357, + "logps/chosen": -380.81195068359375, + "logps/rejected": -405.7100524902344, + "loss": 0.6038, + "rewards/accuracies": 0.637499988079071, + "rewards/chosen": -0.939714252948761, + "rewards/margins": 0.42443543672561646, + "rewards/rejected": -1.364149808883667, "step": 555 }, { "epoch": 0.448, - "grad_norm": 6.352199554443359, + "grad_norm": 9.154504776000977, "learning_rate": 3.3714301183045382e-06, - "logits/chosen": -2.631598711013794, - "logits/rejected": -2.574202299118042, - "logps/chosen": -298.14923095703125, - "logps/rejected": -323.47528076171875, - "loss": 0.61, - "rewards/accuracies": 0.6499999761581421, - "rewards/chosen": -0.6164783239364624, - "rewards/margins": 0.34816139936447144, - "rewards/rejected": -0.9646397829055786, + "logits/chosen": -2.6303160190582275, + "logits/rejected": -2.572775363922119, + "logps/chosen": -331.26007080078125, + "logps/rejected": -356.6570129394531, + "loss": 0.6228, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -0.9475865364074707, + "rewards/margins": 0.3488699793815613, + "rewards/rejected": -1.2964565753936768, "step": 560 }, { "epoch": 0.452, - "grad_norm": 4.746293067932129, + "grad_norm": 5.733785629272461, "learning_rate": 3.338628924375638e-06, - "logits/chosen": -2.759838581085205, - "logits/rejected": -2.7202701568603516, - "logps/chosen": -286.07159423828125, - "logps/rejected": -346.5191955566406, - "loss": 0.5194, - "rewards/accuracies": 0.7875000238418579, - "rewards/chosen": -0.4790007174015045, - "rewards/margins": 0.5368759036064148, - "rewards/rejected": -1.0158765316009521, + "logits/chosen": -2.7566773891448975, + "logits/rejected": -2.7174875736236572, + "logps/chosen": -315.8184509277344, + "logps/rejected": -379.3972473144531, + "loss": 0.5181, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.7764695286750793, + "rewards/margins": 0.5681883096694946, + "rewards/rejected": -1.3446576595306396, "step": 565 }, { "epoch": 0.456, - "grad_norm": 4.951190948486328, + "grad_norm": 5.354190349578857, "learning_rate": 3.3056642380762783e-06, - "logits/chosen": -2.7108511924743652, - "logits/rejected": -2.6962389945983887, - "logps/chosen": -269.0116271972656, - "logps/rejected": -301.1402893066406, - "loss": 0.5847, + "logits/chosen": -2.715209484100342, + "logits/rejected": -2.703580379486084, + "logps/chosen": -289.91387939453125, + "logps/rejected": -322.75933837890625, + "loss": 0.5887, "rewards/accuracies": 0.612500011920929, - "rewards/chosen": -0.3556315302848816, - "rewards/margins": 0.43046459555625916, - "rewards/rejected": -0.7860961556434631, + "rewards/chosen": -0.5646545886993408, + "rewards/margins": 0.43763160705566406, + "rewards/rejected": -1.0022861957550049, "step": 570 }, { "epoch": 0.46, - "grad_norm": 6.543835639953613, + "grad_norm": 6.91750955581665, "learning_rate": 3.272542485937369e-06, - "logits/chosen": -2.6798412799835205, - "logits/rejected": -2.6179356575012207, - "logps/chosen": -285.6194152832031, - "logps/rejected": -290.96771240234375, - "loss": 0.5738, - "rewards/accuracies": 0.6000000238418579, - "rewards/chosen": -0.43146055936813354, - "rewards/margins": 0.3925173878669739, - "rewards/rejected": -0.8239779472351074, + "logits/chosen": -2.6926093101501465, + "logits/rejected": -2.6333212852478027, + "logps/chosen": -300.4584045410156, + "logps/rejected": -308.9971008300781, + "loss": 0.5684, + "rewards/accuracies": 0.637499988079071, + "rewards/chosen": -0.579850435256958, + "rewards/margins": 0.424421489238739, + "rewards/rejected": -1.0042719841003418, "step": 575 }, { "epoch": 0.464, - "grad_norm": 4.029048442840576, + "grad_norm": 5.240443706512451, "learning_rate": 3.2392701251101172e-06, - "logits/chosen": -2.7277991771698, - "logits/rejected": -2.6769065856933594, - "logps/chosen": -338.96826171875, - "logps/rejected": -356.6761169433594, - "loss": 0.517, + "logits/chosen": -2.745445966720581, + "logits/rejected": -2.6982531547546387, + "logps/chosen": -345.88031005859375, + "logps/rejected": -365.571533203125, + "loss": 0.5121, "rewards/accuracies": 0.737500011920929, - "rewards/chosen": -0.38630399107933044, - "rewards/margins": 0.5923217535018921, - "rewards/rejected": -0.9786256551742554, + "rewards/chosen": -0.45542454719543457, + "rewards/margins": 0.6121553778648376, + "rewards/rejected": -1.067579984664917, "step": 580 }, { "epoch": 0.468, - "grad_norm": 6.441251277923584, + "grad_norm": 5.993870258331299, "learning_rate": 3.205853642107192e-06, - "logits/chosen": -2.6596102714538574, - "logits/rejected": -2.6377530097961426, - "logps/chosen": -295.38067626953125, - "logps/rejected": -321.3651428222656, - "loss": 0.6042, - "rewards/accuracies": 0.675000011920929, - "rewards/chosen": -0.518380343914032, - "rewards/margins": 0.3751484155654907, - "rewards/rejected": -0.8935287594795227, + "logits/chosen": -2.679216146469116, + "logits/rejected": -2.660553216934204, + "logps/chosen": -300.2437744140625, + "logps/rejected": -327.10833740234375, + "loss": 0.6046, + "rewards/accuracies": 0.6875, + "rewards/chosen": -0.5670121312141418, + "rewards/margins": 0.38394877314567566, + "rewards/rejected": -0.9509609341621399, "step": 585 }, { "epoch": 0.472, - "grad_norm": 8.161737442016602, + "grad_norm": 9.442901611328125, "learning_rate": 3.1722995515381644e-06, - "logits/chosen": -2.6348493099212646, - "logits/rejected": -2.622722625732422, - "logps/chosen": -333.249755859375, - "logps/rejected": -362.6484069824219, - "loss": 0.5217, - "rewards/accuracies": 0.737500011920929, - "rewards/chosen": -0.5237576365470886, - "rewards/margins": 0.5885855555534363, - "rewards/rejected": -1.1123430728912354, + "logits/chosen": -2.657310962677002, + "logits/rejected": -2.649341344833374, + "logps/chosen": -328.50543212890625, + "logps/rejected": -353.29852294921875, + "loss": 0.5305, + "rewards/accuracies": 0.7124999761581421, + "rewards/chosen": -0.47631463408470154, + "rewards/margins": 0.5425296425819397, + "rewards/rejected": -1.0188442468643188, "step": 590 }, { "epoch": 0.476, - "grad_norm": 5.944140911102295, + "grad_norm": 6.075891971588135, "learning_rate": 3.1386143948394764e-06, - "logits/chosen": -2.648714780807495, - "logits/rejected": -2.6416144371032715, - "logps/chosen": -329.64849853515625, - "logps/rejected": -402.10260009765625, - "loss": 0.5543, - "rewards/accuracies": 0.637499988079071, - "rewards/chosen": -0.7717787027359009, - "rewards/margins": 0.5048314332962036, - "rewards/rejected": -1.2766101360321045, + "logits/chosen": -2.67082142829895, + "logits/rejected": -2.663555145263672, + "logps/chosen": -311.79595947265625, + "logps/rejected": -385.62750244140625, + "loss": 0.545, + "rewards/accuracies": 0.6625000238418579, + "rewards/chosen": -0.5932539701461792, + "rewards/margins": 0.5186047554016113, + "rewards/rejected": -1.11185884475708, "step": 595 }, { "epoch": 0.48, - "grad_norm": 7.663615703582764, + "grad_norm": 8.881017684936523, "learning_rate": 3.1048047389991693e-06, - "logits/chosen": -2.652611494064331, - "logits/rejected": -2.5794761180877686, - "logps/chosen": -387.5853576660156, - "logps/rejected": -337.8640441894531, - "loss": 0.5884, + "logits/chosen": -2.6688761711120605, + "logits/rejected": -2.598611354827881, + "logps/chosen": -374.6273193359375, + "logps/rejected": -324.2472839355469, + "loss": 0.5826, "rewards/accuracies": 0.6875, - "rewards/chosen": -0.6710079908370972, - "rewards/margins": 0.47154727578163147, - "rewards/rejected": -1.1425553560256958, + "rewards/chosen": -0.5414284467697144, + "rewards/margins": 0.4649595320224762, + "rewards/rejected": -1.0063880681991577, "step": 600 }, { "epoch": 0.48, - "eval_logits/chosen": -2.6723644733428955, - "eval_logits/rejected": -2.6321890354156494, - "eval_logps/chosen": -352.8876647949219, - "eval_logps/rejected": -375.1052551269531, - "eval_loss": 0.547924816608429, - "eval_rewards/accuracies": 0.7123016119003296, - "eval_rewards/chosen": -0.6964960694313049, - "eval_rewards/margins": 0.5742600560188293, - "eval_rewards/rejected": -1.2707562446594238, - "eval_runtime": 166.6085, - "eval_samples_per_second": 3.001, + "eval_logits/chosen": -2.690697431564331, + "eval_logits/rejected": -2.6522767543792725, + "eval_logps/chosen": -337.7991638183594, + "eval_logps/rejected": -359.91156005859375, + "eval_loss": 0.5457041263580322, + "eval_rewards/accuracies": 0.7242063283920288, + "eval_rewards/chosen": -0.5456109642982483, + "eval_rewards/margins": 0.5732083916664124, + "eval_rewards/rejected": -1.1188193559646606, + "eval_runtime": 166.7663, + "eval_samples_per_second": 2.998, "eval_steps_per_second": 0.378, "step": 600 }, { "epoch": 0.484, - "grad_norm": 7.883712291717529, + "grad_norm": 8.860865592956543, "learning_rate": 3.0708771752766397e-06, - "logits/chosen": -2.699237585067749, - "logits/rejected": -2.656472682952881, - "logps/chosen": -396.58935546875, - "logps/rejected": -403.0001525878906, - "loss": 0.5645, - "rewards/accuracies": 0.737500011920929, - "rewards/chosen": -0.6690188050270081, - "rewards/margins": 0.4926017224788666, - "rewards/rejected": -1.1616204977035522, + "logits/chosen": -2.714979648590088, + "logits/rejected": -2.67262601852417, + "logps/chosen": -379.689697265625, + "logps/rejected": -389.07537841796875, + "loss": 0.5436, + "rewards/accuracies": 0.762499988079071, + "rewards/chosen": -0.500022292137146, + "rewards/margins": 0.5223508477210999, + "rewards/rejected": -1.0223733186721802, "step": 605 }, { "epoch": 0.488, - "grad_norm": 7.566442489624023, + "grad_norm": 7.4055280685424805, "learning_rate": 3.0368383179176584e-06, - "logits/chosen": -2.6338205337524414, - "logits/rejected": -2.5693295001983643, - "logps/chosen": -367.86016845703125, - "logps/rejected": -435.5393981933594, - "loss": 0.513, - "rewards/accuracies": 0.762499988079071, - "rewards/chosen": -0.7284402847290039, - "rewards/margins": 0.6670497059822083, - "rewards/rejected": -1.395490050315857, + "logits/chosen": -2.648967981338501, + "logits/rejected": -2.5849640369415283, + "logps/chosen": -349.82244873046875, + "logps/rejected": -420.50494384765625, + "loss": 0.5099, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": -0.5480636954307556, + "rewards/margins": 0.6970816850662231, + "rewards/rejected": -1.2451454401016235, "step": 610 }, { "epoch": 0.492, - "grad_norm": 8.526657104492188, + "grad_norm": 8.267292976379395, "learning_rate": 3.002694802864912e-06, - "logits/chosen": -2.618443727493286, - "logits/rejected": -2.598315715789795, - "logps/chosen": -354.45587158203125, - "logps/rejected": -394.5972900390625, - "loss": 0.5754, - "rewards/accuracies": 0.675000011920929, - "rewards/chosen": -0.6152892112731934, - "rewards/margins": 0.5398699045181274, - "rewards/rejected": -1.1551592350006104, + "logits/chosen": -2.6305606365203857, + "logits/rejected": -2.6106619834899902, + "logps/chosen": -341.2841491699219, + "logps/rejected": -381.73297119140625, + "loss": 0.5663, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -0.48357224464416504, + "rewards/margins": 0.542944073677063, + "rewards/rejected": -1.026516318321228, "step": 615 }, { "epoch": 0.496, - "grad_norm": 6.0026044845581055, + "grad_norm": 6.144512176513672, "learning_rate": 2.9684532864643123e-06, - "logits/chosen": -2.611154317855835, - "logits/rejected": -2.594029188156128, - "logps/chosen": -322.4052734375, - "logps/rejected": -360.61016845703125, - "loss": 0.5007, - "rewards/accuracies": 0.75, - "rewards/chosen": -0.49695730209350586, - "rewards/margins": 0.6773632764816284, - "rewards/rejected": -1.1743205785751343, + "logits/chosen": -2.6233174800872803, + "logits/rejected": -2.6076509952545166, + "logps/chosen": -316.0887756347656, + "logps/rejected": -354.6174621582031, + "loss": 0.5047, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -0.4337923526763916, + "rewards/margins": 0.6806012392044067, + "rewards/rejected": -1.1143935918807983, "step": 620 }, { "epoch": 0.5, - "grad_norm": 19.88625144958496, + "grad_norm": 12.604687690734863, "learning_rate": 2.9341204441673267e-06, - "logits/chosen": -2.6633830070495605, - "logits/rejected": -2.670114040374756, - "logps/chosen": -366.9710388183594, - "logps/rejected": -338.7015075683594, - "loss": 0.667, - "rewards/accuracies": 0.637499988079071, - "rewards/chosen": -0.5960913300514221, - "rewards/margins": 0.2840344309806824, - "rewards/rejected": -0.8801258206367493, + "logits/chosen": -2.668950319290161, + "logits/rejected": -2.6760809421539307, + "logps/chosen": -368.0353698730469, + "logps/rejected": -343.40869140625, + "loss": 0.658, + "rewards/accuracies": 0.6625000238418579, + "rewards/chosen": -0.606735110282898, + "rewards/margins": 0.32046255469322205, + "rewards/rejected": -0.9271975755691528, "step": 625 }, { "epoch": 0.504, - "grad_norm": 14.67087173461914, + "grad_norm": 9.32141399383545, "learning_rate": 2.8997029692295875e-06, - "logits/chosen": -2.6533138751983643, - "logits/rejected": -2.623382329940796, - "logps/chosen": -282.66400146484375, - "logps/rejected": -314.5120849609375, - "loss": 0.6115, + "logits/chosen": -2.6581954956054688, + "logits/rejected": -2.6276352405548096, + "logps/chosen": -291.1399841308594, + "logps/rejected": -327.74859619140625, + "loss": 0.6077, "rewards/accuracies": 0.675000011920929, - "rewards/chosen": -0.46091246604919434, - "rewards/margins": 0.3925257921218872, - "rewards/rejected": -0.8534382581710815, + "rewards/chosen": -0.5456727743148804, + "rewards/margins": 0.44013065099716187, + "rewards/rejected": -0.9858034253120422, "step": 630 }, { "epoch": 0.508, - "grad_norm": 5.918943405151367, + "grad_norm": 8.439581871032715, "learning_rate": 2.8652075714060296e-06, - "logits/chosen": -2.6715197563171387, - "logits/rejected": -2.688370704650879, - "logps/chosen": -284.8374938964844, - "logps/rejected": -335.3628845214844, - "loss": 0.5706, - "rewards/accuracies": 0.6875, - "rewards/chosen": -0.3965570032596588, - "rewards/margins": 0.44703879952430725, - "rewards/rejected": -0.8435958027839661, + "logits/chosen": -2.673593044281006, + "logits/rejected": -2.6902694702148438, + "logps/chosen": -301.8299560546875, + "logps/rejected": -353.7574157714844, + "loss": 0.5776, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -0.5664817094802856, + "rewards/margins": 0.4610595107078552, + "rewards/rejected": -1.027541160583496, "step": 635 }, { "epoch": 0.512, - "grad_norm": 8.843329429626465, + "grad_norm": 6.46388053894043, "learning_rate": 2.8306409756428067e-06, - "logits/chosen": -2.623734951019287, - "logits/rejected": -2.5950891971588135, - "logps/chosen": -270.8127746582031, - "logps/rejected": -276.7856750488281, - "loss": 0.5817, + "logits/chosen": -2.6250970363616943, + "logits/rejected": -2.595864772796631, + "logps/chosen": -283.65887451171875, + "logps/rejected": -289.2749938964844, + "loss": 0.5845, "rewards/accuracies": 0.6625000238418579, - "rewards/chosen": -0.30773457884788513, - "rewards/margins": 0.39293187856674194, - "rewards/rejected": -0.7006665468215942, + "rewards/chosen": -0.4361953139305115, + "rewards/margins": 0.3893643915653229, + "rewards/rejected": -0.8255597949028015, "step": 640 }, { "epoch": 0.516, - "grad_norm": 7.708199501037598, + "grad_norm": 7.4935712814331055, "learning_rate": 2.7960099207662535e-06, - "logits/chosen": -2.634336471557617, - "logits/rejected": -2.602273941040039, - "logps/chosen": -293.1026611328125, - "logps/rejected": -321.48004150390625, - "loss": 0.5566, - "rewards/accuracies": 0.737500011920929, - "rewards/chosen": -0.4055629372596741, - "rewards/margins": 0.4888216555118561, - "rewards/rejected": -0.8943845629692078, + "logits/chosen": -2.638918161392212, + "logits/rejected": -2.606156826019287, + "logps/chosen": -298.98980712890625, + "logps/rejected": -329.75079345703125, + "loss": 0.5492, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.4644347131252289, + "rewards/margins": 0.5126577615737915, + "rewards/rejected": -0.9770925641059875, "step": 645 }, { "epoch": 0.52, - "grad_norm": 5.657192230224609, + "grad_norm": 5.561440467834473, "learning_rate": 2.761321158169134e-06, - "logits/chosen": -2.6778979301452637, - "logits/rejected": -2.681678533554077, - "logps/chosen": -330.0281066894531, - "logps/rejected": -331.8005065917969, - "loss": 0.5934, - "rewards/accuracies": 0.6625000238418579, - "rewards/chosen": -0.48604241013526917, - "rewards/margins": 0.386027991771698, - "rewards/rejected": -0.8720704317092896, + "logits/chosen": -2.6843574047088623, + "logits/rejected": -2.687722682952881, + "logps/chosen": -330.052001953125, + "logps/rejected": -330.7695007324219, + "loss": 0.5886, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -0.48628172278404236, + "rewards/margins": 0.3754786252975464, + "rewards/rejected": -0.8617603182792664, "step": 650 }, { "epoch": 0.524, - "grad_norm": 6.853002071380615, + "grad_norm": 5.8051862716674805, "learning_rate": 2.726581450494451e-06, - "logits/chosen": -2.6293766498565674, - "logits/rejected": -2.621476173400879, - "logps/chosen": -326.91363525390625, - "logps/rejected": -339.2771911621094, - "loss": 0.529, - "rewards/accuracies": 0.7250000238418579, - "rewards/chosen": -0.31295642256736755, - "rewards/margins": 0.5676072835922241, - "rewards/rejected": -0.8805637359619141, + "logits/chosen": -2.6416282653808594, + "logits/rejected": -2.6334142684936523, + "logps/chosen": -323.7715148925781, + "logps/rejected": -333.45831298828125, + "loss": 0.5329, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.2815348505973816, + "rewards/margins": 0.5408404469490051, + "rewards/rejected": -0.8223752975463867, "step": 655 }, { "epoch": 0.528, - "grad_norm": 5.255690574645996, + "grad_norm": 4.751335144042969, "learning_rate": 2.6917975703170466e-06, - "logits/chosen": -2.6737003326416016, - "logits/rejected": -2.660823106765747, - "logps/chosen": -332.20001220703125, - "logps/rejected": -395.01177978515625, - "loss": 0.4708, - "rewards/accuracies": 0.800000011920929, - "rewards/chosen": -0.47576016187667847, - "rewards/margins": 0.7829462885856628, - "rewards/rejected": -1.2587064504623413, + "logits/chosen": -2.691120147705078, + "logits/rejected": -2.6784567832946777, + "logps/chosen": -319.2125244140625, + "logps/rejected": -377.9074401855469, + "loss": 0.4832, + "rewards/accuracies": 0.7875000238418579, + "rewards/chosen": -0.34588542580604553, + "rewards/margins": 0.7417780160903931, + "rewards/rejected": -1.0876634120941162, "step": 660 }, { "epoch": 0.532, - "grad_norm": 6.153021335601807, + "grad_norm": 4.525731086730957, "learning_rate": 2.6569762988232838e-06, - "logits/chosen": -2.596040725708008, - "logits/rejected": -2.6018855571746826, - "logps/chosen": -314.1906433105469, - "logps/rejected": -368.1581115722656, - "loss": 0.5719, - "rewards/accuracies": 0.6625000238418579, - "rewards/chosen": -0.6212738752365112, - "rewards/margins": 0.5093709826469421, - "rewards/rejected": -1.1306449174880981, + "logits/chosen": -2.6197690963745117, + "logits/rejected": -2.6266191005706787, + "logps/chosen": -293.38787841796875, + "logps/rejected": -343.3548583984375, + "loss": 0.5703, + "rewards/accuracies": 0.7124999761581421, + "rewards/chosen": -0.4132465720176697, + "rewards/margins": 0.46936559677124023, + "rewards/rejected": -0.8826121091842651, "step": 665 }, { "epoch": 0.536, - "grad_norm": 12.637703895568848, + "grad_norm": 13.704550743103027, "learning_rate": 2.6221244244890336e-06, - "logits/chosen": -2.648380756378174, - "logits/rejected": -2.5612568855285645, - "logps/chosen": -381.6895751953125, - "logps/rejected": -417.946044921875, - "loss": 0.5273, - "rewards/accuracies": 0.7250000238418579, - "rewards/chosen": -0.9110687375068665, - "rewards/margins": 0.6113173961639404, - "rewards/rejected": -1.5223863124847412, + "logits/chosen": -2.677114963531494, + "logits/rejected": -2.5927250385284424, + "logps/chosen": -350.24310302734375, + "logps/rejected": -378.822021484375, + "loss": 0.5502, + "rewards/accuracies": 0.6875, + "rewards/chosen": -0.596604585647583, + "rewards/margins": 0.5345416069030762, + "rewards/rejected": -1.1311461925506592, "step": 670 }, { "epoch": 0.54, - "grad_norm": 15.107666015625, + "grad_norm": 13.029091835021973, "learning_rate": 2.587248741756253e-06, - "logits/chosen": -2.6709227561950684, - "logits/rejected": -2.6540889739990234, - "logps/chosen": -357.17413330078125, - "logps/rejected": -416.6585998535156, - "loss": 0.5336, - "rewards/accuracies": 0.7250000238418579, - "rewards/chosen": -0.7724355459213257, - "rewards/margins": 0.6448327898979187, - "rewards/rejected": -1.41726815700531, + "logits/chosen": -2.7029571533203125, + "logits/rejected": -2.6872310638427734, + "logps/chosen": -327.059326171875, + "logps/rejected": -376.5028991699219, + "loss": 0.5659, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -0.4712875485420227, + "rewards/margins": 0.5444241762161255, + "rewards/rejected": -1.0157115459442139, "step": 675 }, { "epoch": 0.544, - "grad_norm": 5.885760307312012, + "grad_norm": 5.572321891784668, "learning_rate": 2.5523560497083927e-06, - "logits/chosen": -2.681267023086548, - "logits/rejected": -2.6446001529693604, - "logps/chosen": -372.6912536621094, - "logps/rejected": -416.1136779785156, - "loss": 0.5489, - "rewards/accuracies": 0.7124999761581421, - "rewards/chosen": -0.9625101089477539, - "rewards/margins": 0.5871840715408325, - "rewards/rejected": -1.549694299697876, + "logits/chosen": -2.7125682830810547, + "logits/rejected": -2.6776933670043945, + "logps/chosen": -343.49658203125, + "logps/rejected": -386.14215087890625, + "loss": 0.5567, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -0.6705636978149414, + "rewards/margins": 0.5794155597686768, + "rewards/rejected": -1.2499791383743286, "step": 680 }, { "epoch": 0.548, - "grad_norm": 9.579086303710938, + "grad_norm": 11.680253028869629, "learning_rate": 2.517453150744904e-06, - "logits/chosen": -2.6798925399780273, - "logits/rejected": -2.6284408569335938, - "logps/chosen": -407.7876892089844, - "logps/rejected": -411.8699645996094, - "loss": 0.6101, - "rewards/accuracies": 0.7124999761581421, - "rewards/chosen": -0.9496753811836243, - "rewards/margins": 0.48396366834640503, - "rewards/rejected": -1.4336390495300293, + "logits/chosen": -2.708914041519165, + "logits/rejected": -2.659080982208252, + "logps/chosen": -388.8927307128906, + "logps/rejected": -396.7651062011719, + "loss": 0.6039, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -0.7607260942459106, + "rewards/margins": 0.5218645334243774, + "rewards/rejected": -1.2825905084609985, "step": 685 }, { "epoch": 0.552, - "grad_norm": 6.947661399841309, + "grad_norm": 7.640016555786133, "learning_rate": 2.482546849255096e-06, - "logits/chosen": -2.6654515266418457, - "logits/rejected": -2.622973918914795, - "logps/chosen": -384.18719482421875, - "logps/rejected": -447.71356201171875, - "loss": 0.5146, - "rewards/accuracies": 0.7250000238418579, - "rewards/chosen": -0.9958160519599915, - "rewards/margins": 0.7850648760795593, - "rewards/rejected": -1.7808809280395508, + "logits/chosen": -2.6931045055389404, + "logits/rejected": -2.6509275436401367, + "logps/chosen": -371.285400390625, + "logps/rejected": -441.174560546875, + "loss": 0.5044, + "rewards/accuracies": 0.7124999761581421, + "rewards/chosen": -0.866797924041748, + "rewards/margins": 0.8486925363540649, + "rewards/rejected": -1.7154903411865234, "step": 690 }, { "epoch": 0.556, - "grad_norm": 5.574069023132324, + "grad_norm": 6.240846157073975, "learning_rate": 2.447643950291608e-06, - "logits/chosen": -2.5033955574035645, - "logits/rejected": -2.449953556060791, - "logps/chosen": -345.85723876953125, - "logps/rejected": -342.82061767578125, - "loss": 0.5562, - "rewards/accuracies": 0.675000011920929, - "rewards/chosen": -0.8573344349861145, - "rewards/margins": 0.5808383226394653, - "rewards/rejected": -1.438172698020935, + "logits/chosen": -2.527269124984741, + "logits/rejected": -2.4740500450134277, + "logps/chosen": -335.0668640136719, + "logps/rejected": -334.4136657714844, + "loss": 0.5401, + "rewards/accuracies": 0.7124999761581421, + "rewards/chosen": -0.7494308352470398, + "rewards/margins": 0.6046732068061829, + "rewards/rejected": -1.3541040420532227, "step": 695 }, { "epoch": 0.56, - "grad_norm": 5.758213520050049, + "grad_norm": 10.70870590209961, "learning_rate": 2.4127512582437486e-06, - "logits/chosen": -2.626072406768799, - "logits/rejected": -2.6115658283233643, - "logps/chosen": -373.35693359375, - "logps/rejected": -407.4380187988281, - "loss": 0.5366, - "rewards/accuracies": 0.6625000238418579, - "rewards/chosen": -0.8556938171386719, - "rewards/margins": 0.6256057620048523, - "rewards/rejected": -1.4812995195388794, + "logits/chosen": -2.6449031829833984, + "logits/rejected": -2.6302623748779297, + "logps/chosen": -367.7524719238281, + "logps/rejected": -402.79736328125, + "loss": 0.5313, + "rewards/accuracies": 0.6875, + "rewards/chosen": -0.799649178981781, + "rewards/margins": 0.6352438926696777, + "rewards/rejected": -1.434893012046814, "step": 700 }, { "epoch": 0.56, - "eval_logits/chosen": -2.6541435718536377, - "eval_logits/rejected": -2.6143555641174316, - "eval_logps/chosen": -355.7809143066406, - "eval_logps/rejected": -381.54388427734375, - "eval_loss": 0.5462217330932617, - "eval_rewards/accuracies": 0.7123016119003296, - "eval_rewards/chosen": -0.7254281044006348, - "eval_rewards/margins": 0.6097148060798645, - "eval_rewards/rejected": -1.335142970085144, - "eval_runtime": 166.5614, - "eval_samples_per_second": 3.002, - "eval_steps_per_second": 0.378, + "eval_logits/chosen": -2.658555030822754, + "eval_logits/rejected": -2.617255449295044, + "eval_logps/chosen": -354.6570739746094, + "eval_logps/rejected": -381.07342529296875, + "eval_loss": 0.5387491583824158, + "eval_rewards/accuracies": 0.7242063283920288, + "eval_rewards/chosen": -0.7141901850700378, + "eval_rewards/margins": 0.616247832775116, + "eval_rewards/rejected": -1.3304380178451538, + "eval_runtime": 165.7154, + "eval_samples_per_second": 3.017, + "eval_steps_per_second": 0.38, "step": 700 }, { "epoch": 0.564, - "grad_norm": 13.666606903076172, + "grad_norm": 16.324960708618164, "learning_rate": 2.377875575510967e-06, - "logits/chosen": -2.5781383514404297, - "logits/rejected": -2.516011953353882, - "logps/chosen": -360.7132873535156, - "logps/rejected": -367.84564208984375, - "loss": 0.587, - "rewards/accuracies": 0.7124999761581421, - "rewards/chosen": -0.8543895483016968, - "rewards/margins": 0.5572708249092102, - "rewards/rejected": -1.4116604328155518, + "logits/chosen": -2.5862739086151123, + "logits/rejected": -2.5246312618255615, + "logps/chosen": -361.58331298828125, + "logps/rejected": -368.5226745605469, + "loss": 0.5767, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -0.8630898594856262, + "rewards/margins": 0.5553407669067383, + "rewards/rejected": -1.4184306859970093, "step": 705 }, { "epoch": 0.568, - "grad_norm": 15.681051254272461, + "grad_norm": 9.928940773010254, "learning_rate": 2.3430237011767166e-06, - "logits/chosen": -2.6835620403289795, - "logits/rejected": -2.6478209495544434, - "logps/chosen": -351.29498291015625, - "logps/rejected": -375.1694030761719, - "loss": 0.579, - "rewards/accuracies": 0.6499999761581421, - "rewards/chosen": -0.7802821397781372, - "rewards/margins": 0.47849854826927185, - "rewards/rejected": -1.2587807178497314, + "logits/chosen": -2.6898465156555176, + "logits/rejected": -2.6531174182891846, + "logps/chosen": -348.86614990234375, + "logps/rejected": -376.40582275390625, + "loss": 0.5582, + "rewards/accuracies": 0.637499988079071, + "rewards/chosen": -0.7559942007064819, + "rewards/margins": 0.5151509642601013, + "rewards/rejected": -1.2711451053619385, "step": 710 }, { "epoch": 0.572, - "grad_norm": 8.994268417358398, + "grad_norm": 8.1253023147583, "learning_rate": 2.3082024296829538e-06, - "logits/chosen": -2.6145269870758057, - "logits/rejected": -2.5766470432281494, - "logps/chosen": -295.27313232421875, - "logps/rejected": -390.36279296875, - "loss": 0.4613, + "logits/chosen": -2.6176774501800537, + "logits/rejected": -2.579152822494507, + "logps/chosen": -302.24481201171875, + "logps/rejected": -392.26226806640625, + "loss": 0.4689, "rewards/accuracies": 0.824999988079071, - "rewards/chosen": -0.6295270919799805, - "rewards/margins": 0.8818346858024597, - "rewards/rejected": -1.511361837387085, + "rewards/chosen": -0.6992444396018982, + "rewards/margins": 0.8311125040054321, + "rewards/rejected": -1.530356764793396, "step": 715 }, { "epoch": 0.576, - "grad_norm": 7.825490474700928, + "grad_norm": 13.087418556213379, "learning_rate": 2.2734185495055503e-06, - "logits/chosen": -2.672497272491455, - "logits/rejected": -2.5945022106170654, - "logps/chosen": -352.99005126953125, - "logps/rejected": -351.3237609863281, - "loss": 0.5693, - "rewards/accuracies": 0.675000011920929, - "rewards/chosen": -0.6252859830856323, - "rewards/margins": 0.4798332750797272, - "rewards/rejected": -1.1051193475723267, + "logits/chosen": -2.67484974861145, + "logits/rejected": -2.5980706214904785, + "logps/chosen": -360.56085205078125, + "logps/rejected": -359.30743408203125, + "loss": 0.5599, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -0.7009941935539246, + "rewards/margins": 0.48396244645118713, + "rewards/rejected": -1.184956669807434, "step": 720 }, { "epoch": 0.58, - "grad_norm": 7.579111099243164, + "grad_norm": 8.680456161499023, "learning_rate": 2.238678841830867e-06, - "logits/chosen": -2.6132864952087402, - "logits/rejected": -2.5825414657592773, - "logps/chosen": -348.1910705566406, - "logps/rejected": -377.7493896484375, - "loss": 0.5637, - "rewards/accuracies": 0.737500011920929, - "rewards/chosen": -0.5163500905036926, - "rewards/margins": 0.5300666689872742, - "rewards/rejected": -1.0464167594909668, + "logits/chosen": -2.6140735149383545, + "logits/rejected": -2.5830371379852295, + "logps/chosen": -364.34942626953125, + "logps/rejected": -394.05047607421875, + "loss": 0.5686, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -0.6779341697692871, + "rewards/margins": 0.5314933657646179, + "rewards/rejected": -1.2094275951385498, "step": 725 }, { "epoch": 0.584, - "grad_norm": 8.520023345947266, + "grad_norm": 10.017196655273438, "learning_rate": 2.2039900792337477e-06, - "logits/chosen": -2.6524438858032227, - "logits/rejected": -2.6290249824523926, - "logps/chosen": -356.4406433105469, - "logps/rejected": -374.92156982421875, - "loss": 0.5875, - "rewards/accuracies": 0.699999988079071, - "rewards/chosen": -0.6103891730308533, - "rewards/margins": 0.5113776922225952, - "rewards/rejected": -1.1217668056488037, + "logits/chosen": -2.644421339035034, + "logits/rejected": -2.6203815937042236, + "logps/chosen": -375.94244384765625, + "logps/rejected": -401.27056884765625, + "loss": 0.5818, + "rewards/accuracies": 0.6875, + "rewards/chosen": -0.8054073452949524, + "rewards/margins": 0.5798496007919312, + "rewards/rejected": -1.3852570056915283, "step": 730 }, { "epoch": 0.588, - "grad_norm": 4.65096378326416, + "grad_norm": 6.945478439331055, "learning_rate": 2.1693590243571937e-06, - "logits/chosen": -2.6716837882995605, - "logits/rejected": -2.613300323486328, - "logps/chosen": -326.7117614746094, - "logps/rejected": -364.78997802734375, - "loss": 0.5516, - "rewards/accuracies": 0.6875, - "rewards/chosen": -0.6327379941940308, - "rewards/margins": 0.6464030742645264, - "rewards/rejected": -1.2791410684585571, + "logits/chosen": -2.6634681224823, + "logits/rejected": -2.6050820350646973, + "logps/chosen": -348.97540283203125, + "logps/rejected": -389.2197265625, + "loss": 0.5502, + "rewards/accuracies": 0.6625000238418579, + "rewards/chosen": -0.855374813079834, + "rewards/margins": 0.6680639982223511, + "rewards/rejected": -1.523438811302185, "step": 735 }, { "epoch": 0.592, - "grad_norm": 8.322911262512207, + "grad_norm": 10.911724090576172, "learning_rate": 2.134792428593971e-06, - "logits/chosen": -2.5958149433135986, - "logits/rejected": -2.5660290718078613, - "logps/chosen": -307.79656982421875, - "logps/rejected": -354.429443359375, - "loss": 0.5574, - "rewards/accuracies": 0.699999988079071, - "rewards/chosen": -0.7369760870933533, - "rewards/margins": 0.45006656646728516, - "rewards/rejected": -1.1870427131652832, + "logits/chosen": -2.5928680896759033, + "logits/rejected": -2.563474416732788, + "logps/chosen": -328.8409729003906, + "logps/rejected": -379.95599365234375, + "loss": 0.5491, + "rewards/accuracies": 0.762499988079071, + "rewards/chosen": -0.9474200010299683, + "rewards/margins": 0.49488845467567444, + "rewards/rejected": -1.4423085451126099, "step": 740 }, { "epoch": 0.596, - "grad_norm": 7.646263599395752, + "grad_norm": 9.961637496948242, "learning_rate": 2.1002970307704134e-06, - "logits/chosen": -2.7141809463500977, - "logits/rejected": -2.65040922164917, - "logps/chosen": -403.737548828125, - "logps/rejected": -440.7948303222656, - "loss": 0.5606, - "rewards/accuracies": 0.7124999761581421, - "rewards/chosen": -0.694787859916687, - "rewards/margins": 0.6611425280570984, - "rewards/rejected": -1.3559304475784302, + "logits/chosen": -2.7088496685028076, + "logits/rejected": -2.646193027496338, + "logps/chosen": -421.77178955078125, + "logps/rejected": -460.4210510253906, + "loss": 0.5682, + "rewards/accuracies": 0.6875, + "rewards/chosen": -0.875130295753479, + "rewards/margins": 0.6770623922348022, + "rewards/rejected": -1.5521926879882812, "step": 745 }, { "epoch": 0.6, - "grad_norm": 6.563836097717285, + "grad_norm": 6.698659420013428, "learning_rate": 2.0658795558326745e-06, - "logits/chosen": -2.6275224685668945, - "logits/rejected": -2.6394450664520264, - "logps/chosen": -353.59765625, - "logps/rejected": -409.802001953125, - "loss": 0.517, - "rewards/accuracies": 0.75, - "rewards/chosen": -0.674094021320343, - "rewards/margins": 0.7538131475448608, - "rewards/rejected": -1.427907109260559, + "logits/chosen": -2.6276183128356934, + "logits/rejected": -2.6407034397125244, + "logps/chosen": -371.37225341796875, + "logps/rejected": -429.2191467285156, + "loss": 0.5026, + "rewards/accuracies": 0.762499988079071, + "rewards/chosen": -0.8518401384353638, + "rewards/margins": 0.7702382206916809, + "rewards/rejected": -1.622078537940979, "step": 750 }, { "epoch": 0.604, - "grad_norm": 5.990789890289307, + "grad_norm": 9.7139310836792, "learning_rate": 2.031546713535688e-06, - "logits/chosen": -2.631734848022461, - "logits/rejected": -2.5736083984375, - "logps/chosen": -346.9231872558594, - "logps/rejected": -393.51727294921875, - "loss": 0.535, - "rewards/accuracies": 0.75, - "rewards/chosen": -0.6008895039558411, - "rewards/margins": 0.6397835612297058, - "rewards/rejected": -1.240673303604126, + "logits/chosen": -2.6299374103546143, + "logits/rejected": -2.572783946990967, + "logps/chosen": -366.63360595703125, + "logps/rejected": -413.1153259277344, + "loss": 0.5502, + "rewards/accuracies": 0.6875, + "rewards/chosen": -0.7979942560195923, + "rewards/margins": 0.6386594772338867, + "rewards/rejected": -1.4366536140441895, "step": 755 }, { "epoch": 0.608, - "grad_norm": 10.244848251342773, + "grad_norm": 12.361163139343262, "learning_rate": 1.997305197135089e-06, - "logits/chosen": -2.5579869747161865, - "logits/rejected": -2.5684762001037598, - "logps/chosen": -278.13470458984375, - "logps/rejected": -336.5936584472656, - "loss": 0.5631, - "rewards/accuracies": 0.6875, - "rewards/chosen": -0.6302391886711121, - "rewards/margins": 0.5114450454711914, - "rewards/rejected": -1.1416842937469482, + "logits/chosen": -2.554405689239502, + "logits/rejected": -2.564911365509033, + "logps/chosen": -300.50689697265625, + "logps/rejected": -360.9603271484375, + "loss": 0.5553, + "rewards/accuracies": 0.7124999761581421, + "rewards/chosen": -0.8539615869522095, + "rewards/margins": 0.5313900709152222, + "rewards/rejected": -1.3853518962860107, "step": 760 }, { "epoch": 0.612, - "grad_norm": 7.769974708557129, + "grad_norm": 8.68078899383545, "learning_rate": 1.963161682082342e-06, - "logits/chosen": -2.536005735397339, - "logits/rejected": -2.584683656692505, - "logps/chosen": -334.3309020996094, - "logps/rejected": -362.7339782714844, - "loss": 0.548, - "rewards/accuracies": 0.737500011920929, - "rewards/chosen": -0.5386423468589783, - "rewards/margins": 0.5726658701896667, - "rewards/rejected": -1.111308217048645, + "logits/chosen": -2.5307528972625732, + "logits/rejected": -2.5801663398742676, + "logps/chosen": -357.2311706542969, + "logps/rejected": -383.4229431152344, + "loss": 0.567, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -0.7676454186439514, + "rewards/margins": 0.5505531430244446, + "rewards/rejected": -1.3181986808776855, "step": 765 }, { "epoch": 0.616, - "grad_norm": 4.755601406097412, + "grad_norm": 4.522902011871338, "learning_rate": 1.9291228247233607e-06, - "logits/chosen": -2.5625104904174805, - "logits/rejected": -2.5171782970428467, - "logps/chosen": -320.0146484375, - "logps/rejected": -359.59197998046875, - "loss": 0.543, + "logits/chosen": -2.5538437366485596, + "logits/rejected": -2.5088653564453125, + "logps/chosen": -338.6076965332031, + "logps/rejected": -382.578369140625, + "loss": 0.5448, "rewards/accuracies": 0.7250000238418579, - "rewards/chosen": -0.3926158845424652, - "rewards/margins": 0.5113145112991333, - "rewards/rejected": -0.9039304852485657, + "rewards/chosen": -0.5785464644432068, + "rewards/margins": 0.5552471280097961, + "rewards/rejected": -1.133793592453003, "step": 770 }, { "epoch": 0.62, - "grad_norm": 8.86306095123291, + "grad_norm": 8.719709396362305, "learning_rate": 1.895195261000831e-06, - "logits/chosen": -2.6329500675201416, - "logits/rejected": -2.58791446685791, - "logps/chosen": -342.7969665527344, - "logps/rejected": -399.5611267089844, - "loss": 0.5278, + "logits/chosen": -2.619828462600708, + "logits/rejected": -2.574763298034668, + "logps/chosen": -361.0684814453125, + "logps/rejected": -423.12408447265625, + "loss": 0.5232, "rewards/accuracies": 0.737500011920929, - "rewards/chosen": -0.41523486375808716, - "rewards/margins": 0.5993978977203369, - "rewards/rejected": -1.0146328210830688, + "rewards/chosen": -0.5979502201080322, + "rewards/margins": 0.6523123383522034, + "rewards/rejected": -1.2502626180648804, "step": 775 }, { "epoch": 0.624, - "grad_norm": 7.572498321533203, + "grad_norm": 7.265892505645752, "learning_rate": 1.8613856051605242e-06, - "logits/chosen": -2.4784908294677734, - "logits/rejected": -2.511237621307373, - "logps/chosen": -283.90972900390625, - "logps/rejected": -327.43121337890625, - "loss": 0.5407, - "rewards/accuracies": 0.762499988079071, - "rewards/chosen": -0.45636481046676636, - "rewards/margins": 0.4989432394504547, - "rewards/rejected": -0.9553079605102539, + "logits/chosen": -2.4674975872039795, + "logits/rejected": -2.4988579750061035, + "logps/chosen": -303.31695556640625, + "logps/rejected": -350.3982849121094, + "loss": 0.5336, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -0.65043705701828, + "rewards/margins": 0.534541666507721, + "rewards/rejected": -1.184978723526001, "step": 780 }, { "epoch": 0.628, - "grad_norm": 4.849424362182617, + "grad_norm": 5.215295314788818, "learning_rate": 1.827700448461836e-06, - "logits/chosen": -2.6728811264038086, - "logits/rejected": -2.5995230674743652, - "logps/chosen": -365.33587646484375, - "logps/rejected": -384.79388427734375, - "loss": 0.5746, - "rewards/accuracies": 0.699999988079071, - "rewards/chosen": -0.6461849212646484, - "rewards/margins": 0.504442572593689, - "rewards/rejected": -1.150627613067627, + "logits/chosen": -2.6682486534118652, + "logits/rejected": -2.595508098602295, + "logps/chosen": -380.72271728515625, + "logps/rejected": -402.666748046875, + "loss": 0.5612, + "rewards/accuracies": 0.737500011920929, + "rewards/chosen": -0.8000537157058716, + "rewards/margins": 0.5293024182319641, + "rewards/rejected": -1.3293559551239014, "step": 785 }, { "epoch": 0.632, - "grad_norm": 4.748244285583496, + "grad_norm": 9.841978073120117, "learning_rate": 1.7941463578928088e-06, - "logits/chosen": -2.594512939453125, - "logits/rejected": -2.5624542236328125, - "logps/chosen": -407.58221435546875, - "logps/rejected": -416.52752685546875, - "loss": 0.5731, - "rewards/accuracies": 0.737500011920929, - "rewards/chosen": -0.626873791217804, - "rewards/margins": 0.539578914642334, - "rewards/rejected": -1.1664526462554932, + "logits/chosen": -2.592263698577881, + "logits/rejected": -2.559999942779541, + "logps/chosen": -422.0464782714844, + "logps/rejected": -429.20831298828125, + "loss": 0.58, + "rewards/accuracies": 0.7124999761581421, + "rewards/chosen": -0.7715168595314026, + "rewards/margins": 0.5217434167861938, + "rewards/rejected": -1.2932603359222412, "step": 790 }, { "epoch": 0.636, - "grad_norm": 6.187716007232666, + "grad_norm": 7.524374485015869, "learning_rate": 1.7607298748898844e-06, - "logits/chosen": -2.6329944133758545, - "logits/rejected": -2.623441219329834, - "logps/chosen": -341.26910400390625, - "logps/rejected": -384.6199645996094, - "loss": 0.5801, + "logits/chosen": -2.6286463737487793, + "logits/rejected": -2.6194465160369873, + "logps/chosen": -354.8101501464844, + "logps/rejected": -398.53948974609375, + "loss": 0.588, "rewards/accuracies": 0.7124999761581421, - "rewards/chosen": -0.59977126121521, - "rewards/margins": 0.5203127264976501, - "rewards/rejected": -1.1200840473175049, + "rewards/chosen": -0.7351819276809692, + "rewards/margins": 0.5240973830223083, + "rewards/rejected": -1.2592793703079224, "step": 795 }, { "epoch": 0.64, - "grad_norm": 7.730658054351807, + "grad_norm": 7.135324954986572, "learning_rate": 1.7274575140626318e-06, - "logits/chosen": -2.5077157020568848, - "logits/rejected": -2.4561638832092285, - "logps/chosen": -329.1211853027344, - "logps/rejected": -394.10760498046875, - "loss": 0.542, - "rewards/accuracies": 0.7250000238418579, - "rewards/chosen": -0.562198281288147, - "rewards/margins": 0.5235614776611328, - "rewards/rejected": -1.0857596397399902, + "logits/chosen": -2.50719952583313, + "logits/rejected": -2.4550704956054688, + "logps/chosen": -336.1716003417969, + "logps/rejected": -404.1689147949219, + "loss": 0.5332, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.632702648639679, + "rewards/margins": 0.5536705255508423, + "rewards/rejected": -1.1863731145858765, "step": 800 }, { "epoch": 0.64, - "eval_logits/chosen": -2.616276264190674, - "eval_logits/rejected": -2.5757086277008057, - "eval_logps/chosen": -352.43634033203125, - "eval_logps/rejected": -374.8915100097656, - "eval_loss": 0.5450887680053711, - "eval_rewards/accuracies": 0.726190447807312, - "eval_rewards/chosen": -0.6919824481010437, - "eval_rewards/margins": 0.5766366124153137, - "eval_rewards/rejected": -1.2686189413070679, - "eval_runtime": 166.5658, - "eval_samples_per_second": 3.002, - "eval_steps_per_second": 0.378, + "eval_logits/chosen": -2.6166913509368896, + "eval_logits/rejected": -2.5759570598602295, + "eval_logps/chosen": -355.7965393066406, + "eval_logps/rejected": -381.5441589355469, + "eval_loss": 0.5385683178901672, + "eval_rewards/accuracies": 0.7182539701461792, + "eval_rewards/chosen": -0.7255847454071045, + "eval_rewards/margins": 0.6095607876777649, + "eval_rewards/rejected": -1.3351454734802246, + "eval_runtime": 165.673, + "eval_samples_per_second": 3.018, + "eval_steps_per_second": 0.38, "step": 800 }, { "epoch": 0.644, - "grad_norm": 5.391225337982178, + "grad_norm": 6.823562145233154, "learning_rate": 1.6943357619237227e-06, - "logits/chosen": -2.571699619293213, - "logits/rejected": -2.5610146522521973, - "logps/chosen": -340.8096618652344, - "logps/rejected": -366.74652099609375, - "loss": 0.4952, - "rewards/accuracies": 0.7749999761581421, - "rewards/chosen": -0.6746693849563599, - "rewards/margins": 0.645226240158081, - "rewards/rejected": -1.319895625114441, + "logits/chosen": -2.5743985176086426, + "logits/rejected": -2.563049793243408, + "logps/chosen": -344.70318603515625, + "logps/rejected": -374.0846862792969, + "loss": 0.4913, + "rewards/accuracies": 0.762499988079071, + "rewards/chosen": -0.7136049866676331, + "rewards/margins": 0.67967289686203, + "rewards/rejected": -1.393277883529663, "step": 805 }, { "epoch": 0.648, - "grad_norm": 7.21277379989624, + "grad_norm": 9.05685806274414, "learning_rate": 1.661371075624363e-06, - "logits/chosen": -2.5949785709381104, - "logits/rejected": -2.6454100608825684, - "logps/chosen": -344.26763916015625, - "logps/rejected": -466.04693603515625, - "loss": 0.5694, - "rewards/accuracies": 0.699999988079071, - "rewards/chosen": -0.9223111867904663, - "rewards/margins": 0.545967161655426, - "rewards/rejected": -1.468278169631958, + "logits/chosen": -2.6020989418029785, + "logits/rejected": -2.6505770683288574, + "logps/chosen": -347.4692687988281, + "logps/rejected": -473.5855407714844, + "loss": 0.5726, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -0.9543269872665405, + "rewards/margins": 0.5893380641937256, + "rewards/rejected": -1.5436651706695557, "step": 810 }, { "epoch": 0.652, - "grad_norm": 10.09349536895752, + "grad_norm": 10.01281452178955, "learning_rate": 1.6285698816954626e-06, - "logits/chosen": -2.614025115966797, - "logits/rejected": -2.5793397426605225, - "logps/chosen": -371.01141357421875, - "logps/rejected": -389.6199035644531, - "loss": 0.5079, - "rewards/accuracies": 0.7749999761581421, - "rewards/chosen": -0.7361218333244324, - "rewards/margins": 0.6544822454452515, - "rewards/rejected": -1.3906042575836182, + "logits/chosen": -2.6235404014587402, + "logits/rejected": -2.5892868041992188, + "logps/chosen": -362.3233947753906, + "logps/rejected": -383.004638671875, + "loss": 0.5152, + "rewards/accuracies": 0.7875000238418579, + "rewards/chosen": -0.6492418646812439, + "rewards/margins": 0.675209641456604, + "rewards/rejected": -1.3244515657424927, "step": 815 }, { "epoch": 0.656, - "grad_norm": 6.835726737976074, + "grad_norm": 10.048011779785156, "learning_rate": 1.5959385747947697e-06, - "logits/chosen": -2.546776294708252, - "logits/rejected": -2.489529848098755, - "logps/chosen": -324.41766357421875, - "logps/rejected": -338.0316162109375, - "loss": 0.5675, + "logits/chosen": -2.5589287281036377, + "logits/rejected": -2.503087043762207, + "logps/chosen": -325.8083801269531, + "logps/rejected": -343.68316650390625, + "loss": 0.5628, "rewards/accuracies": 0.737500011920929, - "rewards/chosen": -0.912179172039032, - "rewards/margins": 0.5451852083206177, - "rewards/rejected": -1.4573644399642944, + "rewards/chosen": -0.9260866045951843, + "rewards/margins": 0.5877935886383057, + "rewards/rejected": -1.5138801336288452, "step": 820 }, { "epoch": 0.66, - "grad_norm": 9.413851737976074, + "grad_norm": 10.148550987243652, "learning_rate": 1.56348351646022e-06, - "logits/chosen": -2.4420504570007324, - "logits/rejected": -2.3995440006256104, - "logps/chosen": -330.77532958984375, - "logps/rejected": -375.870849609375, - "loss": 0.5706, - "rewards/accuracies": 0.6875, - "rewards/chosen": -0.9180534482002258, - "rewards/margins": 0.5257332921028137, - "rewards/rejected": -1.443786859512329, + "logits/chosen": -2.459164619445801, + "logits/rejected": -2.4178318977355957, + "logps/chosen": -334.05731201171875, + "logps/rejected": -384.0740661621094, + "loss": 0.5496, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -0.9508736729621887, + "rewards/margins": 0.5749450922012329, + "rewards/rejected": -1.5258188247680664, "step": 825 }, { "epoch": 0.664, - "grad_norm": 7.294904708862305, + "grad_norm": 11.33963680267334, "learning_rate": 1.5312110338697427e-06, - "logits/chosen": -2.534151792526245, - "logits/rejected": -2.457547426223755, - "logps/chosen": -343.48895263671875, - "logps/rejected": -404.5125427246094, - "loss": 0.5288, - "rewards/accuracies": 0.737500011920929, - "rewards/chosen": -0.9970987439155579, - "rewards/margins": 0.7042299509048462, - "rewards/rejected": -1.7013286352157593, + "logits/chosen": -2.5506274700164795, + "logits/rejected": -2.476365566253662, + "logps/chosen": -354.90130615234375, + "logps/rejected": -414.8627014160156, + "loss": 0.5309, + "rewards/accuracies": 0.7124999761581421, + "rewards/chosen": -1.1112221479415894, + "rewards/margins": 0.6936079263687134, + "rewards/rejected": -1.8048301935195923, "step": 830 }, { "epoch": 0.668, - "grad_norm": 7.913896083831787, + "grad_norm": 9.498885154724121, "learning_rate": 1.4991274186077632e-06, - "logits/chosen": -2.5275869369506836, - "logits/rejected": -2.5172677040100098, - "logps/chosen": -372.0498046875, - "logps/rejected": -428.5511779785156, - "loss": 0.5155, - "rewards/accuracies": 0.6875, - "rewards/chosen": -0.8607346415519714, - "rewards/margins": 0.6735731959342957, - "rewards/rejected": -1.5343079566955566, + "logits/chosen": -2.5485012531280518, + "logits/rejected": -2.539670467376709, + "logps/chosen": -386.7198181152344, + "logps/rejected": -442.57421875, + "loss": 0.524, + "rewards/accuracies": 0.7124999761581421, + "rewards/chosen": -1.0074347257614136, + "rewards/margins": 0.6671037077903748, + "rewards/rejected": -1.6745383739471436, "step": 835 }, { "epoch": 0.672, - "grad_norm": 9.02523136138916, + "grad_norm": 10.999540328979492, "learning_rate": 1.467238925438646e-06, - "logits/chosen": -2.542417287826538, - "logits/rejected": -2.4911322593688965, - "logps/chosen": -409.1072082519531, - "logps/rejected": -441.38818359375, - "loss": 0.6043, - "rewards/accuracies": 0.6499999761581421, - "rewards/chosen": -0.8746612668037415, - "rewards/margins": 0.6565597653388977, - "rewards/rejected": -1.5312209129333496, + "logits/chosen": -2.5577821731567383, + "logits/rejected": -2.5113046169281006, + "logps/chosen": -419.2740173339844, + "logps/rejected": -448.5807189941406, + "loss": 0.6084, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -0.9763299226760864, + "rewards/margins": 0.62681645154953, + "rewards/rejected": -1.6031463146209717, "step": 840 }, { "epoch": 0.676, - "grad_norm": 7.484790325164795, + "grad_norm": 8.105618476867676, "learning_rate": 1.4355517710873184e-06, - "logits/chosen": -2.505631446838379, - "logits/rejected": -2.4716596603393555, - "logps/chosen": -369.3199157714844, - "logps/rejected": -385.47271728515625, - "loss": 0.5162, + "logits/chosen": -2.524392604827881, + "logits/rejected": -2.4944310188293457, + "logps/chosen": -380.297119140625, + "logps/rejected": -396.42138671875, + "loss": 0.5097, "rewards/accuracies": 0.75, - "rewards/chosen": -0.8367518186569214, - "rewards/margins": 0.6746172904968262, - "rewards/rejected": -1.5113691091537476, + "rewards/chosen": -0.9465241432189941, + "rewards/margins": 0.6743323802947998, + "rewards/rejected": -1.6208562850952148, "step": 845 }, { "epoch": 0.68, - "grad_norm": 14.492733001708984, + "grad_norm": 14.872075080871582, "learning_rate": 1.4040721330273063e-06, - "logits/chosen": -2.4774107933044434, - "logits/rejected": -2.4899094104766846, - "logps/chosen": -358.19085693359375, - "logps/rejected": -413.7503967285156, - "loss": 0.6545, - "rewards/accuracies": 0.625, - "rewards/chosen": -0.9458563923835754, - "rewards/margins": 0.5173967480659485, - "rewards/rejected": -1.4632532596588135, + "logits/chosen": -2.496351957321167, + "logits/rejected": -2.5108532905578613, + "logps/chosen": -367.57183837890625, + "logps/rejected": -420.6947326660156, + "loss": 0.6583, + "rewards/accuracies": 0.637499988079071, + "rewards/chosen": -1.039666771888733, + "rewards/margins": 0.4930298328399658, + "rewards/rejected": -1.5326964855194092, "step": 850 }, { "epoch": 0.684, - "grad_norm": 9.713050842285156, + "grad_norm": 9.286355018615723, "learning_rate": 1.3728061482764238e-06, - "logits/chosen": -2.6106739044189453, - "logits/rejected": -2.6073949337005615, - "logps/chosen": -393.0316162109375, - "logps/rejected": -462.81964111328125, - "loss": 0.6226, - "rewards/accuracies": 0.6875, - "rewards/chosen": -0.7931762933731079, - "rewards/margins": 0.5521876215934753, - "rewards/rejected": -1.3453638553619385, + "logits/chosen": -2.626911163330078, + "logits/rejected": -2.6237576007843018, + "logps/chosen": -398.8912048339844, + "logps/rejected": -464.3138732910156, + "loss": 0.6356, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -0.851772129535675, + "rewards/margins": 0.5085344910621643, + "rewards/rejected": -1.3603065013885498, "step": 855 }, { "epoch": 0.688, - "grad_norm": 8.317811012268066, + "grad_norm": 9.629425048828125, "learning_rate": 1.3417599122003464e-06, - "logits/chosen": -2.5949435234069824, - "logits/rejected": -2.5823256969451904, - "logps/chosen": -336.19964599609375, - "logps/rejected": -373.7811279296875, - "loss": 0.6114, - "rewards/accuracies": 0.699999988079071, - "rewards/chosen": -0.7611545324325562, - "rewards/margins": 0.44461917877197266, - "rewards/rejected": -1.2057737112045288, + "logits/chosen": -2.6108672618865967, + "logits/rejected": -2.600440502166748, + "logps/chosen": -341.123046875, + "logps/rejected": -377.686767578125, + "loss": 0.6128, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -0.8103886842727661, + "rewards/margins": 0.4344421327114105, + "rewards/rejected": -1.244830846786499, "step": 860 }, { "epoch": 0.692, - "grad_norm": 7.705052852630615, + "grad_norm": 10.558273315429688, "learning_rate": 1.3109394773243117e-06, - "logits/chosen": -2.5213277339935303, - "logits/rejected": -2.5227763652801514, - "logps/chosen": -378.220458984375, - "logps/rejected": -433.517822265625, - "loss": 0.5362, - "rewards/accuracies": 0.762499988079071, - "rewards/chosen": -0.8036476373672485, - "rewards/margins": 0.7761087417602539, - "rewards/rejected": -1.5797563791275024, + "logits/chosen": -2.5375044345855713, + "logits/rejected": -2.5400888919830322, + "logps/chosen": -382.33154296875, + "logps/rejected": -431.41455078125, + "loss": 0.5468, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.8447578549385071, + "rewards/margins": 0.7139667272567749, + "rewards/rejected": -1.5587245225906372, "step": 865 }, { "epoch": 0.696, - "grad_norm": 13.364889144897461, + "grad_norm": 11.613428115844727, "learning_rate": 1.280350852153168e-06, - "logits/chosen": -2.5971102714538574, - "logits/rejected": -2.5233542919158936, - "logps/chosen": -357.8860778808594, - "logps/rejected": -374.4933776855469, - "loss": 0.5301, - "rewards/accuracies": 0.699999988079071, - "rewards/chosen": -0.7840938568115234, - "rewards/margins": 0.6052092909812927, - "rewards/rejected": -1.3893029689788818, + "logits/chosen": -2.610506296157837, + "logits/rejected": -2.5400068759918213, + "logps/chosen": -361.366943359375, + "logps/rejected": -373.01904296875, + "loss": 0.5561, + "rewards/accuracies": 0.637499988079071, + "rewards/chosen": -0.8189024925231934, + "rewards/margins": 0.5556577444076538, + "rewards/rejected": -1.3745602369308472, "step": 870 }, { "epoch": 0.7, - "grad_norm": 9.326637268066406, + "grad_norm": 11.380918502807617, "learning_rate": 1.2500000000000007e-06, - "logits/chosen": -2.5254058837890625, - "logits/rejected": -2.5030879974365234, - "logps/chosen": -361.384521484375, - "logps/rejected": -415.76226806640625, - "loss": 0.5189, - "rewards/accuracies": 0.675000011920929, - "rewards/chosen": -0.733026385307312, - "rewards/margins": 0.6997817754745483, - "rewards/rejected": -1.4328081607818604, + "logits/chosen": -2.5413687229156494, + "logits/rejected": -2.5202109813690186, + "logps/chosen": -360.80889892578125, + "logps/rejected": -413.19012451171875, + "loss": 0.5164, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -0.7272705435752869, + "rewards/margins": 0.6798168420791626, + "rewards/rejected": -1.4070874452590942, "step": 875 }, { "epoch": 0.704, - "grad_norm": 10.961359024047852, + "grad_norm": 11.80184555053711, "learning_rate": 1.2198928378235717e-06, - "logits/chosen": -2.572036027908325, - "logits/rejected": -2.561758518218994, - "logps/chosen": -297.1030578613281, - "logps/rejected": -387.46685791015625, - "loss": 0.5146, - "rewards/accuracies": 0.675000011920929, - "rewards/chosen": -0.5809527635574341, - "rewards/margins": 0.6962177753448486, - "rewards/rejected": -1.2771704196929932, + "logits/chosen": -2.5873050689697266, + "logits/rejected": -2.576911211013794, + "logps/chosen": -299.10498046875, + "logps/rejected": -388.73211669921875, + "loss": 0.5155, + "rewards/accuracies": 0.6875, + "rewards/chosen": -0.6009725332260132, + "rewards/margins": 0.6888505220413208, + "rewards/rejected": -1.2898229360580444, "step": 880 }, { "epoch": 0.708, - "grad_norm": 6.878293514251709, + "grad_norm": 5.85650634765625, "learning_rate": 1.1900352350748026e-06, - "logits/chosen": -2.54471492767334, - "logits/rejected": -2.507836103439331, - "logps/chosen": -375.3175354003906, - "logps/rejected": -410.13555908203125, - "loss": 0.5035, - "rewards/accuracies": 0.7250000238418579, - "rewards/chosen": -0.7230747938156128, - "rewards/margins": 0.817081093788147, - "rewards/rejected": -1.5401558876037598, + "logits/chosen": -2.560586929321289, + "logits/rejected": -2.5254247188568115, + "logps/chosen": -374.28692626953125, + "logps/rejected": -407.04827880859375, + "loss": 0.5088, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -0.7127686142921448, + "rewards/margins": 0.7965149879455566, + "rewards/rejected": -1.509283423423767, "step": 885 }, { "epoch": 0.712, - "grad_norm": 10.813645362854004, + "grad_norm": 7.8027448654174805, "learning_rate": 1.160433012552508e-06, - "logits/chosen": -2.4845707416534424, - "logits/rejected": -2.4921040534973145, - "logps/chosen": -331.4786682128906, - "logps/rejected": -384.5702819824219, - "loss": 0.5406, - "rewards/accuracies": 0.737500011920929, - "rewards/chosen": -0.7888541221618652, - "rewards/margins": 0.5834736824035645, - "rewards/rejected": -1.3723278045654297, + "logits/chosen": -2.5033535957336426, + "logits/rejected": -2.5112838745117188, + "logps/chosen": -330.83880615234375, + "logps/rejected": -380.98297119140625, + "loss": 0.5379, + "rewards/accuracies": 0.762499988079071, + "rewards/chosen": -0.7824558019638062, + "rewards/margins": 0.5539994239807129, + "rewards/rejected": -1.3364553451538086, "step": 890 }, { "epoch": 0.716, - "grad_norm": 8.971263885498047, + "grad_norm": 8.75863265991211, "learning_rate": 1.1310919412686248e-06, - "logits/chosen": -2.5675597190856934, - "logits/rejected": -2.566880464553833, - "logps/chosen": -377.36639404296875, - "logps/rejected": -408.0566711425781, - "loss": 0.5503, - "rewards/accuracies": 0.6499999761581421, - "rewards/chosen": -0.8293668627738953, - "rewards/margins": 0.5729758739471436, - "rewards/rejected": -1.402342677116394, + "logits/chosen": -2.5839171409606934, + "logits/rejected": -2.5830111503601074, + "logps/chosen": -370.30780029296875, + "logps/rejected": -396.5429992675781, + "loss": 0.5589, + "rewards/accuracies": 0.6625000238418579, + "rewards/chosen": -0.7587816119194031, + "rewards/margins": 0.5284246802330017, + "rewards/rejected": -1.2872062921524048, "step": 895 }, { "epoch": 0.72, - "grad_norm": 6.206206321716309, + "grad_norm": 8.253792762756348, "learning_rate": 1.1020177413231334e-06, - "logits/chosen": -2.5698678493499756, - "logits/rejected": -2.544900417327881, - "logps/chosen": -358.0696716308594, - "logps/rejected": -385.30047607421875, - "loss": 0.5282, - "rewards/accuracies": 0.7875000238418579, - "rewards/chosen": -0.8030437231063843, - "rewards/margins": 0.6273149251937866, - "rewards/rejected": -1.430358648300171, + "logits/chosen": -2.5888657569885254, + "logits/rejected": -2.5641961097717285, + "logps/chosen": -352.4532775878906, + "logps/rejected": -376.51751708984375, + "loss": 0.5334, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.746880054473877, + "rewards/margins": 0.5956496000289917, + "rewards/rejected": -1.342529535293579, "step": 900 }, { "epoch": 0.72, - "eval_logits/chosen": -2.5715651512145996, - "eval_logits/rejected": -2.526627779006958, - "eval_logps/chosen": -362.9278869628906, - "eval_logps/rejected": -390.782470703125, - "eval_loss": 0.541217029094696, - "eval_rewards/accuracies": 0.7083333134651184, - "eval_rewards/chosen": -0.7968972325325012, - "eval_rewards/margins": 0.6306313872337341, - "eval_rewards/rejected": -1.427528738975525, - "eval_runtime": 166.5843, - "eval_samples_per_second": 3.001, - "eval_steps_per_second": 0.378, + "eval_logits/chosen": -2.5998997688293457, + "eval_logits/rejected": -2.5573904514312744, + "eval_logps/chosen": -353.8529357910156, + "eval_logps/rejected": -380.3204345703125, + "eval_loss": 0.536827027797699, + "eval_rewards/accuracies": 0.716269850730896, + "eval_rewards/chosen": -0.7061484456062317, + "eval_rewards/margins": 0.616759717464447, + "eval_rewards/rejected": -1.3229081630706787, + "eval_runtime": 165.7628, + "eval_samples_per_second": 3.016, + "eval_steps_per_second": 0.38, "step": 900 }, { "epoch": 0.724, - "grad_norm": 6.695834636688232, + "grad_norm": 7.283039093017578, "learning_rate": 1.073216080788921e-06, - "logits/chosen": -2.584725856781006, - "logits/rejected": -2.5650360584259033, - "logps/chosen": -371.1759033203125, - "logps/rejected": -381.4906311035156, - "loss": 0.6249, - "rewards/accuracies": 0.574999988079071, - "rewards/chosen": -0.8290435671806335, - "rewards/margins": 0.34561312198638916, - "rewards/rejected": -1.1746567487716675, + "logits/chosen": -2.6033217906951904, + "logits/rejected": -2.5845823287963867, + "logps/chosen": -361.44329833984375, + "logps/rejected": -374.8320617675781, + "loss": 0.6075, + "rewards/accuracies": 0.5874999761581421, + "rewards/chosen": -0.7317181825637817, + "rewards/margins": 0.37635332345962524, + "rewards/rejected": -1.1080714464187622, "step": 905 }, { "epoch": 0.728, - "grad_norm": 9.597143173217773, + "grad_norm": 9.450459480285645, "learning_rate": 1.0446925746067768e-06, - "logits/chosen": -2.5334725379943848, - "logits/rejected": -2.476060152053833, - "logps/chosen": -324.6567687988281, - "logps/rejected": -336.16448974609375, - "loss": 0.5035, - "rewards/accuracies": 0.762499988079071, - "rewards/chosen": -0.7902948260307312, - "rewards/margins": 0.6815158724784851, - "rewards/rejected": -1.4718106985092163, + "logits/chosen": -2.5516154766082764, + "logits/rejected": -2.495788097381592, + "logps/chosen": -316.8692321777344, + "logps/rejected": -324.5566101074219, + "loss": 0.5047, + "rewards/accuracies": 0.737500011920929, + "rewards/chosen": -0.7124193906784058, + "rewards/margins": 0.6433127522468567, + "rewards/rejected": -1.3557320833206177, "step": 910 }, { "epoch": 0.732, - "grad_norm": 8.9435396194458, + "grad_norm": 11.512716293334961, "learning_rate": 1.0164527834907468e-06, - "logits/chosen": -2.4533045291900635, - "logits/rejected": -2.449897050857544, - "logps/chosen": -347.75250244140625, - "logps/rejected": -426.3138122558594, - "loss": 0.4745, - "rewards/accuracies": 0.737500011920929, - "rewards/chosen": -0.8129841685295105, - "rewards/margins": 0.7864343523979187, - "rewards/rejected": -1.5994184017181396, + "logits/chosen": -2.4677295684814453, + "logits/rejected": -2.4644956588745117, + "logps/chosen": -342.6759948730469, + "logps/rejected": -419.4685974121094, + "loss": 0.4815, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": -0.7622194886207581, + "rewards/margins": 0.7687476277351379, + "rewards/rejected": -1.5309669971466064, "step": 915 }, { "epoch": 0.736, - "grad_norm": 9.8597993850708, + "grad_norm": 27.01951789855957, "learning_rate": 9.88502212844063e-07, - "logits/chosen": -2.54966139793396, - "logits/rejected": -2.551966428756714, - "logps/chosen": -350.10125732421875, - "logps/rejected": -414.47265625, - "loss": 0.6307, + "logits/chosen": -2.5636465549468994, + "logits/rejected": -2.5651931762695312, + "logps/chosen": -345.3995666503906, + "logps/rejected": -412.34979248046875, + "loss": 0.622, "rewards/accuracies": 0.637499988079071, - "rewards/chosen": -0.8708189129829407, - "rewards/margins": 0.4037759304046631, - "rewards/rejected": -1.274594783782959, + "rewards/chosen": -0.8238021731376648, + "rewards/margins": 0.42956480383872986, + "rewards/rejected": -1.2533669471740723, "step": 920 }, { "epoch": 0.74, - "grad_norm": 11.812902450561523, + "grad_norm": 13.12364387512207, "learning_rate": 9.608463116858544e-07, - "logits/chosen": -2.5522665977478027, - "logits/rejected": -2.516507625579834, - "logps/chosen": -359.7716064453125, - "logps/rejected": -395.00726318359375, - "loss": 0.5559, - "rewards/accuracies": 0.699999988079071, - "rewards/chosen": -0.8488423228263855, - "rewards/margins": 0.5956013798713684, - "rewards/rejected": -1.444443702697754, + "logits/chosen": -2.5695652961730957, + "logits/rejected": -2.5348198413848877, + "logps/chosen": -351.7240905761719, + "logps/rejected": -388.83319091796875, + "loss": 0.5433, + "rewards/accuracies": 0.6875, + "rewards/chosen": -0.76836758852005, + "rewards/margins": 0.6143354773521423, + "rewards/rejected": -1.3827030658721924, "step": 925 }, { "epoch": 0.744, - "grad_norm": 23.08143424987793, + "grad_norm": 10.578348159790039, "learning_rate": 9.334904715888496e-07, - "logits/chosen": -2.4799532890319824, - "logits/rejected": -2.480583667755127, - "logps/chosen": -345.7176208496094, - "logps/rejected": -404.23931884765625, - "loss": 0.5426, - "rewards/accuracies": 0.7875000238418579, - "rewards/chosen": -0.837591826915741, - "rewards/margins": 0.6740916967391968, - "rewards/rejected": -1.5116835832595825, + "logits/chosen": -2.4992146492004395, + "logits/rejected": -2.501399517059326, + "logps/chosen": -339.5255432128906, + "logps/rejected": -395.8268737792969, + "loss": 0.5339, + "rewards/accuracies": 0.762499988079071, + "rewards/chosen": -0.775671660900116, + "rewards/margins": 0.6518876552581787, + "rewards/rejected": -1.4275591373443604, "step": 930 }, { "epoch": 0.748, - "grad_norm": 9.993616104125977, + "grad_norm": 7.748569011688232, "learning_rate": 9.064400256282757e-07, - "logits/chosen": -2.557299852371216, - "logits/rejected": -2.5293102264404297, - "logps/chosen": -361.8759765625, - "logps/rejected": -390.130859375, - "loss": 0.549, - "rewards/accuracies": 0.737500011920929, - "rewards/chosen": -0.7642291784286499, - "rewards/margins": 0.6016563773155212, - "rewards/rejected": -1.3658854961395264, + "logits/chosen": -2.57441782951355, + "logits/rejected": -2.546863079071045, + "logps/chosen": -355.07818603515625, + "logps/rejected": -380.3440246582031, + "loss": 0.559, + "rewards/accuracies": 0.6625000238418579, + "rewards/chosen": -0.6962515115737915, + "rewards/margins": 0.5717657208442688, + "rewards/rejected": -1.268017292022705, "step": 935 }, { "epoch": 0.752, - "grad_norm": 8.252609252929688, + "grad_norm": 7.653827667236328, "learning_rate": 8.797002473421729e-07, - "logits/chosen": -2.5289082527160645, - "logits/rejected": -2.5367226600646973, - "logps/chosen": -387.4449462890625, - "logps/rejected": -411.28057861328125, - "loss": 0.5075, - "rewards/accuracies": 0.762499988079071, - "rewards/chosen": -0.6062986254692078, - "rewards/margins": 0.6882797479629517, - "rewards/rejected": -1.2945783138275146, + "logits/chosen": -2.544231653213501, + "logits/rejected": -2.553048610687256, + "logps/chosen": -380.5497131347656, + "logps/rejected": -403.52191162109375, + "loss": 0.5081, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.5373459458351135, + "rewards/margins": 0.6796460151672363, + "rewards/rejected": -1.216991901397705, "step": 940 }, { "epoch": 0.756, - "grad_norm": 15.335317611694336, + "grad_norm": 14.531281471252441, "learning_rate": 8.532763497032987e-07, - "logits/chosen": -2.4454190731048584, - "logits/rejected": -2.431666851043701, - "logps/chosen": -370.828857421875, - "logps/rejected": -450.534423828125, - "loss": 0.5028, - "rewards/accuracies": 0.7250000238418579, - "rewards/chosen": -0.7790817022323608, - "rewards/margins": 0.7363722324371338, - "rewards/rejected": -1.5154539346694946, + "logits/chosen": -2.4647645950317383, + "logits/rejected": -2.452423572540283, + "logps/chosen": -368.66497802734375, + "logps/rejected": -440.90313720703125, + "loss": 0.5264, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -0.7574427127838135, + "rewards/margins": 0.6616984605789185, + "rewards/rejected": -1.4191412925720215, "step": 945 }, { "epoch": 0.76, - "grad_norm": 8.324295997619629, + "grad_norm": 6.607179164886475, "learning_rate": 8.271734841028553e-07, - "logits/chosen": -2.608813762664795, - "logits/rejected": -2.6157126426696777, - "logps/chosen": -339.1045227050781, - "logps/rejected": -371.37139892578125, - "loss": 0.5261, - "rewards/accuracies": 0.737500011920929, - "rewards/chosen": -0.7105227708816528, - "rewards/margins": 0.6277570128440857, - "rewards/rejected": -1.3382797241210938, + "logits/chosen": -2.6168630123138428, + "logits/rejected": -2.6241250038146973, + "logps/chosen": -340.37542724609375, + "logps/rejected": -366.9325866699219, + "loss": 0.5419, + "rewards/accuracies": 0.7124999761581421, + "rewards/chosen": -0.7232319712638855, + "rewards/margins": 0.5706599950790405, + "rewards/rejected": -1.2938919067382812, "step": 950 }, { "epoch": 0.764, - "grad_norm": 9.136221885681152, + "grad_norm": 7.8033528327941895, "learning_rate": 8.013967393462094e-07, - "logits/chosen": -2.4687538146972656, - "logits/rejected": -2.490540027618408, - "logps/chosen": -356.60186767578125, - "logps/rejected": -391.22430419921875, - "loss": 0.6001, - "rewards/accuracies": 0.699999988079071, - "rewards/chosen": -0.8641288876533508, - "rewards/margins": 0.5564495921134949, - "rewards/rejected": -1.4205783605575562, + "logits/chosen": -2.4783270359039307, + "logits/rejected": -2.501206874847412, + "logps/chosen": -348.3237609863281, + "logps/rejected": -384.16656494140625, + "loss": 0.5859, + "rewards/accuracies": 0.7124999761581421, + "rewards/chosen": -0.7813480496406555, + "rewards/margins": 0.5686533451080322, + "rewards/rejected": -1.3500014543533325, "step": 955 }, { "epoch": 0.768, - "grad_norm": 6.628035068511963, + "grad_norm": 6.114492893218994, "learning_rate": 7.759511406608255e-07, - "logits/chosen": -2.5683350563049316, - "logits/rejected": -2.500274181365967, - "logps/chosen": -406.158447265625, - "logps/rejected": -411.00360107421875, - "loss": 0.4962, - "rewards/accuracies": 0.800000011920929, - "rewards/chosen": -0.9044888615608215, - "rewards/margins": 0.8631958961486816, - "rewards/rejected": -1.7676846981048584, + "logits/chosen": -2.5830774307250977, + "logits/rejected": -2.516847848892212, + "logps/chosen": -397.07305908203125, + "logps/rejected": -403.8395080566406, + "loss": 0.4834, + "rewards/accuracies": 0.7875000238418579, + "rewards/chosen": -0.8136352300643921, + "rewards/margins": 0.8824082612991333, + "rewards/rejected": -1.6960432529449463, "step": 960 }, { "epoch": 0.772, - "grad_norm": 8.616596221923828, + "grad_norm": 12.286111831665039, "learning_rate": 7.508416487165862e-07, - "logits/chosen": -2.4850573539733887, - "logits/rejected": -2.4977798461914062, - "logps/chosen": -373.7191467285156, - "logps/rejected": -406.65887451171875, - "loss": 0.5838, - "rewards/accuracies": 0.6499999761581421, - "rewards/chosen": -0.8553136587142944, - "rewards/margins": 0.5251432657241821, - "rewards/rejected": -1.3804569244384766, + "logits/chosen": -2.4968883991241455, + "logits/rejected": -2.5091567039489746, + "logps/chosen": -366.52630615234375, + "logps/rejected": -400.1545715332031, + "loss": 0.5807, + "rewards/accuracies": 0.6875, + "rewards/chosen": -0.7833856344223022, + "rewards/margins": 0.5320286154747009, + "rewards/rejected": -1.3154141902923584, "step": 965 }, { "epoch": 0.776, - "grad_norm": 13.198709487915039, + "grad_norm": 12.27044677734375, "learning_rate": 7.260731586586983e-07, - "logits/chosen": -2.459974527359009, - "logits/rejected": -2.463366746902466, - "logps/chosen": -344.30938720703125, - "logps/rejected": -415.8106994628906, - "loss": 0.6061, - "rewards/accuracies": 0.7124999761581421, - "rewards/chosen": -0.9792687296867371, - "rewards/margins": 0.5259648561477661, - "rewards/rejected": -1.505233645439148, + "logits/chosen": -2.4706804752349854, + "logits/rejected": -2.4732460975646973, + "logps/chosen": -339.1402587890625, + "logps/rejected": -404.2414245605469, + "loss": 0.6221, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -0.9275779724121094, + "rewards/margins": 0.46196335554122925, + "rewards/rejected": -1.3895412683486938, "step": 970 }, { "epoch": 0.78, - "grad_norm": 8.697169303894043, + "grad_norm": 7.917988300323486, "learning_rate": 7.016504991533727e-07, - "logits/chosen": -2.5883092880249023, - "logits/rejected": -2.5588154792785645, - "logps/chosen": -385.4810791015625, - "logps/rejected": -427.9122619628906, - "loss": 0.4801, - "rewards/accuracies": 0.8125, - "rewards/chosen": -0.6248154044151306, - "rewards/margins": 0.7402738928794861, - "rewards/rejected": -1.3650894165039062, + "logits/chosen": -2.593116283416748, + "logits/rejected": -2.565453290939331, + "logps/chosen": -383.8894348144531, + "logps/rejected": -424.5870666503906, + "loss": 0.4774, + "rewards/accuracies": 0.7875000238418579, + "rewards/chosen": -0.6088994145393372, + "rewards/margins": 0.7229386568069458, + "rewards/rejected": -1.3318378925323486, "step": 975 }, { "epoch": 0.784, - "grad_norm": 5.463360786437988, + "grad_norm": 5.051321983337402, "learning_rate": 6.775784314464717e-07, - "logits/chosen": -2.490438461303711, - "logits/rejected": -2.511699914932251, - "logps/chosen": -339.4949645996094, - "logps/rejected": -422.260498046875, - "loss": 0.4897, - "rewards/accuracies": 0.8125, - "rewards/chosen": -0.7619988918304443, - "rewards/margins": 0.7652215957641602, - "rewards/rejected": -1.5272204875946045, + "logits/chosen": -2.4984991550445557, + "logits/rejected": -2.5199942588806152, + "logps/chosen": -342.84515380859375, + "logps/rejected": -421.0189514160156, + "loss": 0.4971, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -0.7955012917518616, + "rewards/margins": 0.7193040251731873, + "rewards/rejected": -1.5148054361343384, "step": 980 }, { "epoch": 0.788, - "grad_norm": 9.091290473937988, + "grad_norm": 8.092668533325195, "learning_rate": 6.538616484352902e-07, - "logits/chosen": -2.534930944442749, - "logits/rejected": -2.5236055850982666, - "logps/chosen": -342.08734130859375, - "logps/rejected": -378.9286804199219, - "loss": 0.4952, - "rewards/accuracies": 0.7749999761581421, - "rewards/chosen": -0.758894145488739, - "rewards/margins": 0.7211824655532837, - "rewards/rejected": -1.480076551437378, + "logits/chosen": -2.5383505821228027, + "logits/rejected": -2.526851177215576, + "logps/chosen": -345.52655029296875, + "logps/rejected": -379.8380432128906, + "loss": 0.5156, + "rewards/accuracies": 0.737500011920929, + "rewards/chosen": -0.7932868003845215, + "rewards/margins": 0.6958837509155273, + "rewards/rejected": -1.4891705513000488, "step": 985 }, { "epoch": 0.792, - "grad_norm": 8.22352123260498, + "grad_norm": 9.803926467895508, "learning_rate": 6.305047737536707e-07, - "logits/chosen": -2.502777576446533, - "logits/rejected": -2.4551658630371094, - "logps/chosen": -347.10638427734375, - "logps/rejected": -367.161376953125, - "loss": 0.555, - "rewards/accuracies": 0.762499988079071, - "rewards/chosen": -0.8488829731941223, - "rewards/margins": 0.6530700325965881, - "rewards/rejected": -1.501952886581421, + "logits/chosen": -2.509049654006958, + "logits/rejected": -2.463141679763794, + "logps/chosen": -351.3589172363281, + "logps/rejected": -371.07281494140625, + "loss": 0.5485, + "rewards/accuracies": 0.737500011920929, + "rewards/chosen": -0.8914083242416382, + "rewards/margins": 0.6496592164039612, + "rewards/rejected": -1.5410678386688232, "step": 990 }, { "epoch": 0.796, - "grad_norm": 19.538936614990234, + "grad_norm": 15.167935371398926, "learning_rate": 6.075123608706093e-07, - "logits/chosen": -2.542541980743408, - "logits/rejected": -2.5630409717559814, - "logps/chosen": -365.1338806152344, - "logps/rejected": -388.2347106933594, - "loss": 0.5486, - "rewards/accuracies": 0.75, - "rewards/chosen": -0.7868693470954895, - "rewards/margins": 0.5855667591094971, - "rewards/rejected": -1.3724360466003418, + "logits/chosen": -2.5473320484161377, + "logits/rejected": -2.5690910816192627, + "logps/chosen": -365.46673583984375, + "logps/rejected": -389.1126403808594, + "loss": 0.5431, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": -0.7901977300643921, + "rewards/margins": 0.5910181999206543, + "rewards/rejected": -1.381216049194336, "step": 995 }, { "epoch": 0.8, - "grad_norm": 7.248394012451172, + "grad_norm": 7.769952774047852, "learning_rate": 5.848888922025553e-07, - "logits/chosen": -2.4549734592437744, - "logits/rejected": -2.4434893131256104, - "logps/chosen": -330.5204162597656, - "logps/rejected": -420.1793518066406, - "loss": 0.5873, - "rewards/accuracies": 0.6875, - "rewards/chosen": -0.9319890141487122, - "rewards/margins": 0.5578263401985168, - "rewards/rejected": -1.4898154735565186, + "logits/chosen": -2.461652994155884, + "logits/rejected": -2.4495410919189453, + "logps/chosen": -327.51654052734375, + "logps/rejected": -418.86737060546875, + "loss": 0.5837, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -0.9019506573677063, + "rewards/margins": 0.5747453570365906, + "rewards/rejected": -1.4766958951950073, "step": 1000 }, { "epoch": 0.8, - "eval_logits/chosen": -2.5692970752716064, - "eval_logits/rejected": -2.5253682136535645, - "eval_logps/chosen": -365.5719909667969, - "eval_logps/rejected": -399.30718994140625, - "eval_loss": 0.5368649363517761, - "eval_rewards/accuracies": 0.7083333134651184, - "eval_rewards/chosen": -0.8233387470245361, - "eval_rewards/margins": 0.689436674118042, - "eval_rewards/rejected": -1.5127756595611572, - "eval_runtime": 166.5941, - "eval_samples_per_second": 3.001, - "eval_steps_per_second": 0.378, + "eval_logits/chosen": -2.5706355571746826, + "eval_logits/rejected": -2.527315855026245, + "eval_logps/chosen": -362.7657165527344, + "eval_logps/rejected": -395.8990783691406, + "eval_loss": 0.5301549434661865, + "eval_rewards/accuracies": 0.716269850730896, + "eval_rewards/chosen": -0.795275866985321, + "eval_rewards/margins": 0.6834191083908081, + "eval_rewards/rejected": -1.4786947965621948, + "eval_runtime": 165.7401, + "eval_samples_per_second": 3.017, + "eval_steps_per_second": 0.38, "step": 1000 }, { "epoch": 0.804, - "grad_norm": 11.21343994140625, + "grad_norm": 9.499650001525879, "learning_rate": 5.626387782395512e-07, - "logits/chosen": -2.5684406757354736, - "logits/rejected": -2.536818265914917, - "logps/chosen": -391.46490478515625, - "logps/rejected": -438.1607971191406, - "loss": 0.5785, - "rewards/accuracies": 0.6875, - "rewards/chosen": -0.9650428891181946, - "rewards/margins": 0.6077659726142883, - "rewards/rejected": -1.5728086233139038, + "logits/chosen": -2.570199489593506, + "logits/rejected": -2.5388243198394775, + "logps/chosen": -386.8207702636719, + "logps/rejected": -439.36053466796875, + "loss": 0.5546, + "rewards/accuracies": 0.7124999761581421, + "rewards/chosen": -0.9186019897460938, + "rewards/margins": 0.6662044525146484, + "rewards/rejected": -1.5848064422607422, "step": 1005 }, { "epoch": 0.808, - "grad_norm": 7.5349884033203125, + "grad_norm": 8.864973068237305, "learning_rate": 5.407663566854008e-07, - "logits/chosen": -2.5107808113098145, - "logits/rejected": -2.4654345512390137, - "logps/chosen": -377.01324462890625, - "logps/rejected": -436.71728515625, - "loss": 0.4949, - "rewards/accuracies": 0.762499988079071, - "rewards/chosen": -0.7660607695579529, - "rewards/margins": 0.7943149209022522, - "rewards/rejected": -1.5603755712509155, + "logits/chosen": -2.514481544494629, + "logits/rejected": -2.469686269760132, + "logps/chosen": -375.16436767578125, + "logps/rejected": -431.52813720703125, + "loss": 0.5046, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": -0.7475723028182983, + "rewards/margins": 0.7609124779701233, + "rewards/rejected": -1.5084848403930664, "step": 1010 }, { "epoch": 0.812, - "grad_norm": 18.765304565429688, + "grad_norm": 17.737668991088867, "learning_rate": 5.192758916120236e-07, - "logits/chosen": -2.5258936882019043, - "logits/rejected": -2.496175527572632, - "logps/chosen": -376.3697509765625, - "logps/rejected": -422.98529052734375, - "loss": 0.5471, - "rewards/accuracies": 0.762499988079071, - "rewards/chosen": -0.8881324529647827, - "rewards/margins": 0.6989415884017944, - "rewards/rejected": -1.5870741605758667, + "logits/chosen": -2.5291812419891357, + "logits/rejected": -2.501344680786133, + "logps/chosen": -376.4272766113281, + "logps/rejected": -419.0375061035156, + "loss": 0.5571, + "rewards/accuracies": 0.737500011920929, + "rewards/chosen": -0.8887074589729309, + "rewards/margins": 0.6588888168334961, + "rewards/rejected": -1.5475962162017822, "step": 1015 }, { "epoch": 0.816, - "grad_norm": 9.102749824523926, + "grad_norm": 9.168149948120117, "learning_rate": 4.981715726281666e-07, - "logits/chosen": -2.5151665210723877, - "logits/rejected": -2.5129122734069824, - "logps/chosen": -376.29168701171875, - "logps/rejected": -392.8097229003906, - "loss": 0.6469, - "rewards/accuracies": 0.6875, - "rewards/chosen": -0.9464821815490723, - "rewards/margins": 0.4086835980415344, - "rewards/rejected": -1.355165958404541, + "logits/chosen": -2.5210018157958984, + "logits/rejected": -2.518200635910034, + "logps/chosen": -374.60687255859375, + "logps/rejected": -385.31317138671875, + "loss": 0.6639, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.9296348690986633, + "rewards/margins": 0.35056614875793457, + "rewards/rejected": -1.2802008390426636, "step": 1020 }, { "epoch": 0.82, - "grad_norm": 5.625443935394287, + "grad_norm": 6.545177936553955, "learning_rate": 4.774575140626317e-07, - "logits/chosen": -2.5486044883728027, - "logits/rejected": -2.55946683883667, - "logps/chosen": -376.15631103515625, - "logps/rejected": -424.7813415527344, - "loss": 0.5013, - "rewards/accuracies": 0.8125, - "rewards/chosen": -0.7685045003890991, - "rewards/margins": 0.8368158340454102, - "rewards/rejected": -1.6053203344345093, + "logits/chosen": -2.553743839263916, + "logits/rejected": -2.564492702484131, + "logps/chosen": -374.25372314453125, + "logps/rejected": -418.3985290527344, + "loss": 0.5131, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": -0.7494795918464661, + "rewards/margins": 0.7920123338699341, + "rewards/rejected": -1.5414918661117554, "step": 1025 }, { "epoch": 0.824, - "grad_norm": 8.010042190551758, + "grad_norm": 10.368010520935059, "learning_rate": 4.5713775416217884e-07, - "logits/chosen": -2.535797357559204, - "logits/rejected": -2.5063552856445312, - "logps/chosen": -369.76129150390625, - "logps/rejected": -407.69281005859375, - "loss": 0.494, - "rewards/accuracies": 0.7250000238418579, - "rewards/chosen": -0.8518115282058716, - "rewards/margins": 0.8361819982528687, - "rewards/rejected": -1.6879936456680298, + "logits/chosen": -2.5401394367218018, + "logits/rejected": -2.5111076831817627, + "logps/chosen": -364.0164794921875, + "logps/rejected": -398.11749267578125, + "loss": 0.493, + "rewards/accuracies": 0.6875, + "rewards/chosen": -0.7943639159202576, + "rewards/margins": 0.7978767156600952, + "rewards/rejected": -1.5922406911849976, "step": 1030 }, { "epoch": 0.828, - "grad_norm": 10.202120780944824, + "grad_norm": 12.131779670715332, "learning_rate": 4.372162543042624e-07, - "logits/chosen": -2.5730769634246826, - "logits/rejected": -2.5308516025543213, - "logps/chosen": -331.8706359863281, - "logps/rejected": -356.06585693359375, - "loss": 0.6335, - "rewards/accuracies": 0.699999988079071, - "rewards/chosen": -1.003354787826538, - "rewards/margins": 0.4766755998134613, - "rewards/rejected": -1.4800306558609009, + "logits/chosen": -2.579563856124878, + "logits/rejected": -2.539201259613037, + "logps/chosen": -327.2681579589844, + "logps/rejected": -347.89068603515625, + "loss": 0.6285, + "rewards/accuracies": 0.6875, + "rewards/chosen": -0.9573305249214172, + "rewards/margins": 0.44094863533973694, + "rewards/rejected": -1.3982793092727661, "step": 1035 }, { "epoch": 0.832, - "grad_norm": 6.7073163986206055, + "grad_norm": 7.402243137359619, "learning_rate": 4.1769689822475147e-07, - "logits/chosen": -2.527463912963867, - "logits/rejected": -2.508131980895996, - "logps/chosen": -340.91851806640625, - "logps/rejected": -381.59246826171875, - "loss": 0.5364, - "rewards/accuracies": 0.7124999761581421, - "rewards/chosen": -0.8549755215644836, - "rewards/margins": 0.6245896220207214, - "rewards/rejected": -1.479565143585205, + "logits/chosen": -2.533160924911499, + "logits/rejected": -2.514822244644165, + "logps/chosen": -332.4081726074219, + "logps/rejected": -374.7890930175781, + "loss": 0.5243, + "rewards/accuracies": 0.737500011920929, + "rewards/chosen": -0.7698723673820496, + "rewards/margins": 0.6416595578193665, + "rewards/rejected": -1.4115320444107056, "step": 1040 }, { "epoch": 0.836, - "grad_norm": 11.138327598571777, + "grad_norm": 11.70563793182373, "learning_rate": 3.9858349126078945e-07, - "logits/chosen": -2.407921314239502, - "logits/rejected": -2.4323601722717285, - "logps/chosen": -367.8681640625, - "logps/rejected": -432.36505126953125, - "loss": 0.5952, - "rewards/accuracies": 0.675000011920929, - "rewards/chosen": -0.9345201253890991, - "rewards/margins": 0.5518704652786255, - "rewards/rejected": -1.4863905906677246, + "logits/chosen": -2.4150428771972656, + "logits/rejected": -2.439276933670044, + "logps/chosen": -360.05657958984375, + "logps/rejected": -423.20184326171875, + "loss": 0.5974, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -0.8564049005508423, + "rewards/margins": 0.5383543372154236, + "rewards/rejected": -1.394759178161621, "step": 1045 }, { "epoch": 0.84, - "grad_norm": 18.084115982055664, + "grad_norm": 13.589889526367188, "learning_rate": 3.798797596089351e-07, - "logits/chosen": -2.5881738662719727, - "logits/rejected": -2.557568073272705, - "logps/chosen": -387.9930725097656, - "logps/rejected": -412.61773681640625, - "loss": 0.5676, + "logits/chosen": -2.5914146900177, + "logits/rejected": -2.56174898147583, + "logps/chosen": -381.68048095703125, + "logps/rejected": -398.01007080078125, + "loss": 0.5775, "rewards/accuracies": 0.75, - "rewards/chosen": -0.9383655786514282, - "rewards/margins": 0.635163426399231, - "rewards/rejected": -1.5735290050506592, + "rewards/chosen": -0.8752404451370239, + "rewards/margins": 0.5522125959396362, + "rewards/rejected": -1.4274529218673706, "step": 1050 }, { "epoch": 0.844, - "grad_norm": 8.753804206848145, + "grad_norm": 8.736641883850098, "learning_rate": 3.615893495987335e-07, - "logits/chosen": -2.493389129638672, - "logits/rejected": -2.512821912765503, - "logps/chosen": -358.10015869140625, - "logps/rejected": -455.28802490234375, - "loss": 0.5181, + "logits/chosen": -2.4973983764648438, + "logits/rejected": -2.51640248298645, + "logps/chosen": -355.0185852050781, + "logps/rejected": -448.5818786621094, + "loss": 0.5172, "rewards/accuracies": 0.699999988079071, - "rewards/chosen": -0.7339892983436584, - "rewards/margins": 0.7650957703590393, - "rewards/rejected": -1.4990851879119873, + "rewards/chosen": -0.703173816204071, + "rewards/margins": 0.7288501858711243, + "rewards/rejected": -1.4320241212844849, "step": 1055 }, { "epoch": 0.848, - "grad_norm": 10.629037857055664, + "grad_norm": 7.283242702484131, "learning_rate": 3.4371582698185636e-07, - "logits/chosen": -2.5065648555755615, - "logits/rejected": -2.5203864574432373, - "logps/chosen": -394.06390380859375, - "logps/rejected": -438.9784240722656, - "loss": 0.4584, + "logits/chosen": -2.510960578918457, + "logits/rejected": -2.5245649814605713, + "logps/chosen": -381.8621520996094, + "logps/rejected": -428.3202209472656, + "loss": 0.4451, "rewards/accuracies": 0.800000011920929, - "rewards/chosen": -0.9274409413337708, - "rewards/margins": 0.83965003490448, - "rewards/rejected": -1.7670910358428955, + "rewards/chosen": -0.8054243326187134, + "rewards/margins": 0.8550852537155151, + "rewards/rejected": -1.660509467124939, "step": 1060 }, { "epoch": 0.852, - "grad_norm": 12.203771591186523, + "grad_norm": 11.76474380493164, "learning_rate": 3.262626762369525e-07, - "logits/chosen": -2.5433928966522217, - "logits/rejected": -2.4640583992004395, - "logps/chosen": -335.0285339355469, - "logps/rejected": -358.05206298828125, - "loss": 0.5235, - "rewards/accuracies": 0.737500011920929, - "rewards/chosen": -0.8205272555351257, - "rewards/margins": 0.7125735282897949, - "rewards/rejected": -1.5331008434295654, + "logits/chosen": -2.5506832599639893, + "logits/rejected": -2.4736270904541016, + "logps/chosen": -330.5300598144531, + "logps/rejected": -350.5076599121094, + "loss": 0.5339, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -0.7755425572395325, + "rewards/margins": 0.6821144819259644, + "rewards/rejected": -1.4576570987701416, "step": 1065 }, { "epoch": 0.856, - "grad_norm": 10.726627349853516, + "grad_norm": 9.793760299682617, "learning_rate": 3.092332998903416e-07, - "logits/chosen": -2.5511505603790283, - "logits/rejected": -2.555680990219116, - "logps/chosen": -387.1297912597656, - "logps/rejected": -438.8545837402344, - "loss": 0.5697, + "logits/chosen": -2.554365634918213, + "logits/rejected": -2.5592918395996094, + "logps/chosen": -383.59014892578125, + "logps/rejected": -433.6626892089844, + "loss": 0.5678, "rewards/accuracies": 0.699999988079071, - "rewards/chosen": -0.7436598539352417, - "rewards/margins": 0.5584943890571594, - "rewards/rejected": -1.302154302597046, + "rewards/chosen": -0.708263635635376, + "rewards/margins": 0.541972279548645, + "rewards/rejected": -1.250235915184021, "step": 1070 }, { "epoch": 0.86, - "grad_norm": 10.780182838439941, + "grad_norm": 8.709606170654297, "learning_rate": 2.9263101785268253e-07, - "logits/chosen": -2.5446937084198, - "logits/rejected": -2.5189390182495117, - "logps/chosen": -376.8099670410156, - "logps/rejected": -390.14105224609375, - "loss": 0.6353, - "rewards/accuracies": 0.6625000238418579, - "rewards/chosen": -0.9379167556762695, - "rewards/margins": 0.45755672454833984, - "rewards/rejected": -1.3954734802246094, + "logits/chosen": -2.5445055961608887, + "logits/rejected": -2.5182182788848877, + "logps/chosen": -370.8434753417969, + "logps/rejected": -384.3281555175781, + "loss": 0.6357, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -0.878252387046814, + "rewards/margins": 0.4590927064418793, + "rewards/rejected": -1.3373451232910156, "step": 1075 }, { "epoch": 0.864, - "grad_norm": 9.165637969970703, + "grad_norm": 7.788934230804443, "learning_rate": 2.764590667717562e-07, - "logits/chosen": -2.520571231842041, - "logits/rejected": -2.5012898445129395, - "logps/chosen": -348.20050048828125, - "logps/rejected": -430.0379333496094, - "loss": 0.4818, + "logits/chosen": -2.5197861194610596, + "logits/rejected": -2.498582363128662, + "logps/chosen": -348.8467712402344, + "logps/rejected": -429.28936767578125, + "loss": 0.4726, "rewards/accuracies": 0.7749999761581421, - "rewards/chosen": -0.767501175403595, - "rewards/margins": 0.8683005571365356, - "rewards/rejected": -1.6358016729354858, + "rewards/chosen": -0.7739642858505249, + "rewards/margins": 0.8543514013290405, + "rewards/rejected": -1.6283156871795654, "step": 1080 }, { "epoch": 0.868, - "grad_norm": 15.374898910522461, + "grad_norm": 9.934327125549316, "learning_rate": 2.6072059940146775e-07, - "logits/chosen": -2.4875528812408447, - "logits/rejected": -2.4620859622955322, - "logps/chosen": -358.6385192871094, - "logps/rejected": -375.6526794433594, - "loss": 0.641, - "rewards/accuracies": 0.6499999761581421, - "rewards/chosen": -0.9842912554740906, - "rewards/margins": 0.40321025252342224, - "rewards/rejected": -1.3875017166137695, + "logits/chosen": -2.4858384132385254, + "logits/rejected": -2.4607391357421875, + "logps/chosen": -357.95025634765625, + "logps/rejected": -370.97479248046875, + "loss": 0.639, + "rewards/accuracies": 0.637499988079071, + "rewards/chosen": -0.9774085283279419, + "rewards/margins": 0.36331382393836975, + "rewards/rejected": -1.3407223224639893, "step": 1085 }, { "epoch": 0.872, - "grad_norm": 16.681074142456055, + "grad_norm": 12.652565956115723, "learning_rate": 2.454186839872158e-07, - "logits/chosen": -2.4695944786071777, - "logits/rejected": -2.4315385818481445, - "logps/chosen": -369.12884521484375, - "logps/rejected": -432.47467041015625, - "loss": 0.5636, + "logits/chosen": -2.4667727947235107, + "logits/rejected": -2.428893566131592, + "logps/chosen": -368.6217346191406, + "logps/rejected": -427.3558654785156, + "loss": 0.5759, "rewards/accuracies": 0.6875, - "rewards/chosen": -0.8553116917610168, - "rewards/margins": 0.5986126065254211, - "rewards/rejected": -1.453924298286438, + "rewards/chosen": -0.8502403497695923, + "rewards/margins": 0.5524962544441223, + "rewards/rejected": -1.4027366638183594, "step": 1090 }, { "epoch": 0.876, - "grad_norm": 12.826825141906738, + "grad_norm": 7.642593860626221, "learning_rate": 2.3055630366772857e-07, - "logits/chosen": -2.560246229171753, - "logits/rejected": -2.5458738803863525, - "logps/chosen": -357.64227294921875, - "logps/rejected": -396.8709411621094, - "loss": 0.5257, + "logits/chosen": -2.5572714805603027, + "logits/rejected": -2.5431621074676514, + "logps/chosen": -356.75775146484375, + "logps/rejected": -395.92498779296875, + "loss": 0.5148, "rewards/accuracies": 0.7124999761581421, - "rewards/chosen": -0.7637326717376709, - "rewards/margins": 0.737055242061615, - "rewards/rejected": -1.5007880926132202, + "rewards/chosen": -0.7548877000808716, + "rewards/margins": 0.7364410161972046, + "rewards/rejected": -1.4913287162780762, "step": 1095 }, { "epoch": 0.88, - "grad_norm": 10.747795104980469, + "grad_norm": 10.851374626159668, "learning_rate": 2.1613635589349756e-07, - "logits/chosen": -2.552135944366455, - "logits/rejected": -2.548779010772705, - "logps/chosen": -344.90142822265625, - "logps/rejected": -395.3324279785156, - "loss": 0.5152, - "rewards/accuracies": 0.7749999761581421, - "rewards/chosen": -0.8041820526123047, - "rewards/margins": 0.7682543396949768, - "rewards/rejected": -1.5724363327026367, + "logits/chosen": -2.549379825592041, + "logits/rejected": -2.547346830368042, + "logps/chosen": -347.64202880859375, + "logps/rejected": -392.60089111328125, + "loss": 0.5144, + "rewards/accuracies": 0.7875000238418579, + "rewards/chosen": -0.831588625907898, + "rewards/margins": 0.7135321497917175, + "rewards/rejected": -1.5451208353042603, "step": 1100 }, { "epoch": 0.88, - "eval_logits/chosen": -2.5619542598724365, - "eval_logits/rejected": -2.518826723098755, - "eval_logps/chosen": -357.70245361328125, - "eval_logps/rejected": -389.9855041503906, - "eval_loss": 0.5383636951446533, - "eval_rewards/accuracies": 0.7142857313156128, - "eval_rewards/chosen": -0.7446432113647461, - "eval_rewards/margins": 0.6749160289764404, - "eval_rewards/rejected": -1.4195590019226074, - "eval_runtime": 166.7191, - "eval_samples_per_second": 2.999, - "eval_steps_per_second": 0.378, + "eval_logits/chosen": -2.55863094329834, + "eval_logits/rejected": -2.516242742538452, + "eval_logps/chosen": -357.33807373046875, + "eval_logps/rejected": -388.2352600097656, + "eval_loss": 0.5326837301254272, + "eval_rewards/accuracies": 0.7123016119003296, + "eval_rewards/chosen": -0.7409996390342712, + "eval_rewards/margins": 0.6610568761825562, + "eval_rewards/rejected": -1.4020566940307617, + "eval_runtime": 166.2233, + "eval_samples_per_second": 3.008, + "eval_steps_per_second": 0.379, "step": 1100 }, { "epoch": 0.884, - "grad_norm": 15.551572799682617, + "grad_norm": 11.941755294799805, "learning_rate": 2.0216165186191406e-07, - "logits/chosen": -2.529543399810791, - "logits/rejected": -2.511012554168701, - "logps/chosen": -359.8935546875, - "logps/rejected": -426.3463439941406, - "loss": 0.5322, - "rewards/accuracies": 0.800000011920929, - "rewards/chosen": -0.7537140846252441, - "rewards/margins": 0.747148871421814, - "rewards/rejected": -1.5008628368377686, + "logits/chosen": -2.5250916481018066, + "logits/rejected": -2.5078232288360596, + "logps/chosen": -360.7425842285156, + "logps/rejected": -419.687744140625, + "loss": 0.5508, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": -0.7622045874595642, + "rewards/margins": 0.6720725297927856, + "rewards/rejected": -1.434277057647705, "step": 1105 }, { "epoch": 0.888, - "grad_norm": 10.654102325439453, + "grad_norm": 11.753776550292969, "learning_rate": 1.8863491596921745e-07, - "logits/chosen": -2.531877040863037, - "logits/rejected": -2.5003228187561035, - "logps/chosen": -394.6542053222656, - "logps/rejected": -419.4615173339844, - "loss": 0.6303, - "rewards/accuracies": 0.675000011920929, - "rewards/chosen": -0.9292774200439453, - "rewards/margins": 0.46585217118263245, - "rewards/rejected": -1.3951294422149658, + "logits/chosen": -2.5271763801574707, + "logits/rejected": -2.495025396347046, + "logps/chosen": -394.4120788574219, + "logps/rejected": -420.91552734375, + "loss": 0.6139, + "rewards/accuracies": 0.6875, + "rewards/chosen": -0.926856517791748, + "rewards/margins": 0.48281335830688477, + "rewards/rejected": -1.4096698760986328, "step": 1110 }, { "epoch": 0.892, - "grad_norm": 12.998001098632812, + "grad_norm": 12.333569526672363, "learning_rate": 1.7555878527937164e-07, - "logits/chosen": -2.6154770851135254, - "logits/rejected": -2.573152542114258, - "logps/chosen": -378.4177551269531, - "logps/rejected": -407.0361328125, - "loss": 0.4692, - "rewards/accuracies": 0.8125, - "rewards/chosen": -0.8625243902206421, - "rewards/margins": 0.8568865060806274, - "rewards/rejected": -1.7194106578826904, + "logits/chosen": -2.6087048053741455, + "logits/rejected": -2.5676796436309814, + "logps/chosen": -378.8360595703125, + "logps/rejected": -399.5335693359375, + "loss": 0.4934, + "rewards/accuracies": 0.762499988079071, + "rewards/chosen": -0.8667081594467163, + "rewards/margins": 0.7776769995689392, + "rewards/rejected": -1.6443853378295898, "step": 1115 }, { "epoch": 0.896, - "grad_norm": 9.31609058380127, + "grad_norm": 9.7278413772583, "learning_rate": 1.629358090099639e-07, - "logits/chosen": -2.4966423511505127, - "logits/rejected": -2.4891440868377686, - "logps/chosen": -389.1834411621094, - "logps/rejected": -428.36199951171875, - "loss": 0.5027, - "rewards/accuracies": 0.675000011920929, - "rewards/chosen": -0.8896617889404297, - "rewards/margins": 0.7314151525497437, - "rewards/rejected": -1.6210769414901733, + "logits/chosen": -2.495575428009033, + "logits/rejected": -2.489112615585327, + "logps/chosen": -391.45159912109375, + "logps/rejected": -426.06085205078125, + "loss": 0.5054, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.9123435020446777, + "rewards/margins": 0.6857225298881531, + "rewards/rejected": -1.5980660915374756, "step": 1120 }, { "epoch": 0.9, - "grad_norm": 10.968253135681152, + "grad_norm": 9.703481674194336, "learning_rate": 1.507684480352292e-07, - "logits/chosen": -2.5237255096435547, - "logits/rejected": -2.5314764976501465, - "logps/chosen": -367.54150390625, - "logps/rejected": -418.0208435058594, - "loss": 0.5305, - "rewards/accuracies": 0.7250000238418579, - "rewards/chosen": -0.9517295956611633, - "rewards/margins": 0.6929836869239807, - "rewards/rejected": -1.6447131633758545, + "logits/chosen": -2.5202584266662598, + "logits/rejected": -2.527817964553833, + "logps/chosen": -364.148681640625, + "logps/rejected": -412.2860412597656, + "loss": 0.529, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.9178013801574707, + "rewards/margins": 0.6695634126663208, + "rewards/rejected": -1.5873647928237915, "step": 1125 }, { "epoch": 0.904, - "grad_norm": 9.858650207519531, + "grad_norm": 6.87813138961792, "learning_rate": 1.3905907440629752e-07, - "logits/chosen": -2.551881790161133, - "logits/rejected": -2.5350139141082764, - "logps/chosen": -366.10174560546875, - "logps/rejected": -396.09478759765625, - "loss": 0.551, - "rewards/accuracies": 0.7250000238418579, - "rewards/chosen": -0.8920289874076843, - "rewards/margins": 0.6426266431808472, - "rewards/rejected": -1.5346556901931763, + "logits/chosen": -2.5462465286254883, + "logits/rejected": -2.529540777206421, + "logps/chosen": -367.54986572265625, + "logps/rejected": -395.8585510253906, + "loss": 0.5463, + "rewards/accuracies": 0.7124999761581421, + "rewards/chosen": -0.9065104722976685, + "rewards/margins": 0.6257832050323486, + "rewards/rejected": -1.532293677330017, "step": 1130 }, { "epoch": 0.908, - "grad_norm": 9.662504196166992, + "grad_norm": 9.895462989807129, "learning_rate": 1.278099708887587e-07, - "logits/chosen": -2.5561671257019043, - "logits/rejected": -2.5373544692993164, - "logps/chosen": -344.9742431640625, - "logps/rejected": -458.02313232421875, - "loss": 0.5248, - "rewards/accuracies": 0.737500011920929, - "rewards/chosen": -0.779849112033844, - "rewards/margins": 0.7551368474960327, - "rewards/rejected": -1.534985899925232, + "logits/chosen": -2.552335262298584, + "logits/rejected": -2.5324137210845947, + "logps/chosen": -345.7575988769531, + "logps/rejected": -455.0641174316406, + "loss": 0.5316, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.7876826524734497, + "rewards/margins": 0.7177135348320007, + "rewards/rejected": -1.5053961277008057, "step": 1135 }, { "epoch": 0.912, - "grad_norm": 12.876849174499512, + "grad_norm": 8.755758285522461, "learning_rate": 1.1702333051763271e-07, - "logits/chosen": -2.5694494247436523, - "logits/rejected": -2.5618202686309814, - "logps/chosen": -399.39739990234375, - "logps/rejected": -404.67535400390625, - "loss": 0.541, - "rewards/accuracies": 0.762499988079071, - "rewards/chosen": -0.8758378028869629, - "rewards/margins": 0.7352150082588196, - "rewards/rejected": -1.6110527515411377, + "logits/chosen": -2.5616421699523926, + "logits/rejected": -2.554831027984619, + "logps/chosen": -397.1969909667969, + "logps/rejected": -403.56439208984375, + "loss": 0.5163, + "rewards/accuracies": 0.737500011920929, + "rewards/chosen": -0.8538335561752319, + "rewards/margins": 0.7461098432540894, + "rewards/rejected": -1.5999435186386108, "step": 1140 }, { "epoch": 0.916, - "grad_norm": 10.277182579040527, + "grad_norm": 12.037848472595215, "learning_rate": 1.067012561698319e-07, - "logits/chosen": -2.5358214378356934, - "logits/rejected": -2.5226242542266846, - "logps/chosen": -378.08331298828125, - "logps/rejected": -406.0291442871094, - "loss": 0.6472, - "rewards/accuracies": 0.637499988079071, - "rewards/chosen": -0.8850030899047852, - "rewards/margins": 0.44015589356422424, - "rewards/rejected": -1.3251588344573975, + "logits/chosen": -2.5323455333709717, + "logits/rejected": -2.519660472869873, + "logps/chosen": -379.17340087890625, + "logps/rejected": -407.46014404296875, + "loss": 0.6399, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -0.895904541015625, + "rewards/margins": 0.44356465339660645, + "rewards/rejected": -1.3394691944122314, "step": 1145 }, { "epoch": 0.92, - "grad_norm": 17.924530029296875, + "grad_norm": 14.188241958618164, "learning_rate": 9.684576015420277e-08, - "logits/chosen": -2.4903244972229004, - "logits/rejected": -2.4617245197296143, - "logps/chosen": -328.5221252441406, - "logps/rejected": -361.15728759765625, - "loss": 0.5055, - "rewards/accuracies": 0.75, - "rewards/chosen": -0.7505359053611755, - "rewards/margins": 0.7073702812194824, - "rewards/rejected": -1.4579061269760132, + "logits/chosen": -2.4839751720428467, + "logits/rejected": -2.4552297592163086, + "logps/chosen": -331.6858215332031, + "logps/rejected": -358.20819091796875, + "loss": 0.5293, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -0.7821733355522156, + "rewards/margins": 0.6462420225143433, + "rewards/rejected": -1.4284155368804932, "step": 1150 }, { "epoch": 0.924, - "grad_norm": 14.877918243408203, + "grad_norm": 21.802221298217773, "learning_rate": 8.745876381922147e-08, - "logits/chosen": -2.4926960468292236, - "logits/rejected": -2.525510549545288, - "logps/chosen": -342.7877197265625, - "logps/rejected": -372.467041015625, - "loss": 0.5811, - "rewards/accuracies": 0.7250000238418579, - "rewards/chosen": -0.8225164413452148, - "rewards/margins": 0.6331211924552917, - "rewards/rejected": -1.4556376934051514, + "logits/chosen": -2.485172748565674, + "logits/rejected": -2.5178401470184326, + "logps/chosen": -343.31103515625, + "logps/rejected": -370.01336669921875, + "loss": 0.577, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -0.8277499079704285, + "rewards/margins": 0.6033510565757751, + "rewards/rejected": -1.4311010837554932, "step": 1155 }, { "epoch": 0.928, - "grad_norm": 14.825825691223145, + "grad_norm": 11.248420715332031, "learning_rate": 7.854209717842231e-08, - "logits/chosen": -2.560148239135742, - "logits/rejected": -2.536252975463867, - "logps/chosen": -389.2018127441406, - "logps/rejected": -387.626220703125, - "loss": 0.6612, - "rewards/accuracies": 0.612500011920929, - "rewards/chosen": -0.9954813122749329, - "rewards/margins": 0.3569512963294983, - "rewards/rejected": -1.3524326086044312, + "logits/chosen": -2.5530881881713867, + "logits/rejected": -2.5298221111297607, + "logps/chosen": -387.3213806152344, + "logps/rejected": -385.5107421875, + "loss": 0.6488, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -0.9766770601272583, + "rewards/margins": 0.35460105538368225, + "rewards/rejected": -1.3312779664993286, "step": 1160 }, { "epoch": 0.932, - "grad_norm": 14.72794246673584, + "grad_norm": 6.085402011871338, "learning_rate": 7.009749855363457e-08, - "logits/chosen": -2.5346550941467285, - "logits/rejected": -2.5153279304504395, - "logps/chosen": -340.6593322753906, - "logps/rejected": -409.87274169921875, - "loss": 0.5073, - "rewards/accuracies": 0.762499988079071, - "rewards/chosen": -0.70791095495224, - "rewards/margins": 0.6868191361427307, - "rewards/rejected": -1.3947298526763916, + "logits/chosen": -2.5276684761047363, + "logits/rejected": -2.508495330810547, + "logps/chosen": -339.74969482421875, + "logps/rejected": -404.6656799316406, + "loss": 0.519, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -0.6988147497177124, + "rewards/margins": 0.6438443660736084, + "rewards/rejected": -1.3426591157913208, "step": 1165 }, { "epoch": 0.936, - "grad_norm": 12.192831993103027, + "grad_norm": 15.023430824279785, "learning_rate": 6.212661423609184e-08, - "logits/chosen": -2.601175308227539, - "logits/rejected": -2.5409445762634277, - "logps/chosen": -391.4974060058594, - "logps/rejected": -429.298583984375, - "loss": 0.5721, - "rewards/accuracies": 0.737500011920929, - "rewards/chosen": -0.9178189039230347, - "rewards/margins": 0.643661379814148, - "rewards/rejected": -1.5614804029464722, + "logits/chosen": -2.5954625606536865, + "logits/rejected": -2.5354666709899902, + "logps/chosen": -389.9742736816406, + "logps/rejected": -427.60284423828125, + "loss": 0.5631, + "rewards/accuracies": 0.7124999761581421, + "rewards/chosen": -0.9025875926017761, + "rewards/margins": 0.6419355869293213, + "rewards/rejected": -1.5445232391357422, "step": 1170 }, { "epoch": 0.94, - "grad_norm": 16.621145248413086, + "grad_norm": 12.646740913391113, "learning_rate": 5.463099816548578e-08, - "logits/chosen": -2.517141819000244, - "logits/rejected": -2.512964963912964, - "logps/chosen": -360.8076171875, - "logps/rejected": -446.977294921875, - "loss": 0.5067, - "rewards/accuracies": 0.762499988079071, - "rewards/chosen": -0.9305839538574219, - "rewards/margins": 0.7509930729866028, - "rewards/rejected": -1.6815770864486694, + "logits/chosen": -2.5129947662353516, + "logits/rejected": -2.5076282024383545, + "logps/chosen": -355.4842224121094, + "logps/rejected": -443.46014404296875, + "loss": 0.4861, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.8773505091667175, + "rewards/margins": 0.769055187702179, + "rewards/rejected": -1.646405816078186, "step": 1175 }, { "epoch": 0.944, - "grad_norm": 9.267435073852539, + "grad_norm": 8.745574951171875, "learning_rate": 4.761211162702117e-08, - "logits/chosen": -2.5712485313415527, - "logits/rejected": -2.508033037185669, - "logps/chosen": -396.81658935546875, - "logps/rejected": -446.858154296875, - "loss": 0.5266, + "logits/chosen": -2.5645899772644043, + "logits/rejected": -2.502182722091675, + "logps/chosen": -396.885498046875, + "logps/rejected": -444.1766662597656, + "loss": 0.5327, "rewards/accuracies": 0.7749999761581421, - "rewards/chosen": -0.7593949437141418, - "rewards/margins": 0.6189205050468445, - "rewards/rejected": -1.3783155679702759, + "rewards/chosen": -0.760084331035614, + "rewards/margins": 0.5914161801338196, + "rewards/rejected": -1.3515005111694336, "step": 1180 }, { "epoch": 0.948, - "grad_norm": 6.9370436668396, + "grad_norm": 10.453509330749512, "learning_rate": 4.1071322966535487e-08, - "logits/chosen": -2.588221788406372, - "logits/rejected": -2.5175411701202393, - "logps/chosen": -418.30474853515625, - "logps/rejected": -404.17840576171875, - "loss": 0.4887, + "logits/chosen": -2.577366590499878, + "logits/rejected": -2.5066463947296143, + "logps/chosen": -418.02801513671875, + "logps/rejected": -403.1604309082031, + "loss": 0.4854, "rewards/accuracies": 0.7749999761581421, - "rewards/chosen": -0.7452821135520935, - "rewards/margins": 0.8689033389091492, - "rewards/rejected": -1.6141853332519531, + "rewards/chosen": -0.7425155639648438, + "rewards/margins": 0.8614899516105652, + "rewards/rejected": -1.6040055751800537, "step": 1185 }, { "epoch": 0.952, - "grad_norm": 4.675684452056885, + "grad_norm": 6.866016864776611, "learning_rate": 3.5009907323737826e-08, - "logits/chosen": -2.5127742290496826, - "logits/rejected": -2.58686900138855, - "logps/chosen": -371.75390625, - "logps/rejected": -481.8087463378906, - "loss": 0.4452, - "rewards/accuracies": 0.824999988079071, - "rewards/chosen": -0.6970502138137817, - "rewards/margins": 0.9857767820358276, - "rewards/rejected": -1.6828269958496094, + "logits/chosen": -2.504338026046753, + "logits/rejected": -2.57658052444458, + "logps/chosen": -371.8552551269531, + "logps/rejected": -480.74066162109375, + "loss": 0.4368, + "rewards/accuracies": 0.8125, + "rewards/chosen": -0.6980635523796082, + "rewards/margins": 0.9740827679634094, + "rewards/rejected": -1.6721464395523071, "step": 1190 }, { "epoch": 0.956, - "grad_norm": 8.315820693969727, + "grad_norm": 8.07772159576416, "learning_rate": 2.9429046383618042e-08, - "logits/chosen": -2.4666857719421387, - "logits/rejected": -2.4609508514404297, - "logps/chosen": -369.11480712890625, - "logps/rejected": -399.99835205078125, - "loss": 0.4684, + "logits/chosen": -2.459728717803955, + "logits/rejected": -2.4553236961364746, + "logps/chosen": -368.6015930175781, + "logps/rejected": -395.6241149902344, + "loss": 0.4823, "rewards/accuracies": 0.824999988079071, - "rewards/chosen": -0.7124398946762085, - "rewards/margins": 0.7592355608940125, - "rewards/rejected": -1.4716756343841553, + "rewards/chosen": -0.7073078751564026, + "rewards/margins": 0.7206257581710815, + "rewards/rejected": -1.4279335737228394, "step": 1195 }, { "epoch": 0.96, - "grad_norm": 13.105049133300781, + "grad_norm": 13.021551132202148, "learning_rate": 2.4329828146074096e-08, - "logits/chosen": -2.531951904296875, - "logits/rejected": -2.5043699741363525, - "logps/chosen": -376.4866943359375, - "logps/rejected": -373.62109375, - "loss": 0.5213, - "rewards/accuracies": 0.75, - "rewards/chosen": -0.8802105188369751, - "rewards/margins": 0.7113884091377258, - "rewards/rejected": -1.5915989875793457, + "logits/chosen": -2.524336099624634, + "logits/rejected": -2.4975974559783936, + "logps/chosen": -377.58343505859375, + "logps/rejected": -374.9549255371094, + "loss": 0.5196, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -0.8911786079406738, + "rewards/margins": 0.713758647441864, + "rewards/rejected": -1.6049373149871826, "step": 1200 }, { "epoch": 0.96, - "eval_logits/chosen": -2.556800365447998, - "eval_logits/rejected": -2.513496160507202, - "eval_logps/chosen": -362.12188720703125, - "eval_logps/rejected": -395.5133361816406, - "eval_loss": 0.5369879007339478, - "eval_rewards/accuracies": 0.7063491940498352, - "eval_rewards/chosen": -0.7888382077217102, - "eval_rewards/margins": 0.6859992146492004, - "eval_rewards/rejected": -1.4748374223709106, - "eval_runtime": 166.4514, - "eval_samples_per_second": 3.004, - "eval_steps_per_second": 0.378, + "eval_logits/chosen": -2.5477142333984375, + "eval_logits/rejected": -2.504517078399658, + "eval_logps/chosen": -361.9387512207031, + "eval_logps/rejected": -394.47796630859375, + "eval_loss": 0.5300799608230591, + "eval_rewards/accuracies": 0.7202380895614624, + "eval_rewards/chosen": -0.7870069146156311, + "eval_rewards/margins": 0.6774766445159912, + "eval_rewards/rejected": -1.4644837379455566, + "eval_runtime": 166.2408, + "eval_samples_per_second": 3.008, + "eval_steps_per_second": 0.379, "step": 1200 }, { "epoch": 0.964, - "grad_norm": 8.366193771362305, + "grad_norm": 8.705704689025879, "learning_rate": 1.9713246713805588e-08, - "logits/chosen": -2.4164879322052, - "logits/rejected": -2.395017623901367, - "logps/chosen": -332.5264892578125, - "logps/rejected": -406.001953125, - "loss": 0.4566, + "logits/chosen": -2.4079999923706055, + "logits/rejected": -2.3863213062286377, + "logps/chosen": -336.49639892578125, + "logps/rejected": -405.0527648925781, + "loss": 0.4696, "rewards/accuracies": 0.762499988079071, - "rewards/chosen": -0.618050217628479, - "rewards/margins": 0.9078122973442078, - "rewards/rejected": -1.525862455368042, + "rewards/chosen": -0.6577492952346802, + "rewards/margins": 0.8586214780807495, + "rewards/rejected": -1.5163707733154297, "step": 1205 }, { "epoch": 0.968, - "grad_norm": 9.794474601745605, + "grad_norm": 9.633703231811523, "learning_rate": 1.5580202098509078e-08, - "logits/chosen": -2.498256206512451, - "logits/rejected": -2.455946207046509, - "logps/chosen": -413.91802978515625, - "logps/rejected": -458.69671630859375, - "loss": 0.6104, - "rewards/accuracies": 0.737500011920929, - "rewards/chosen": -0.9604536890983582, - "rewards/margins": 0.4910499155521393, - "rewards/rejected": -1.4515035152435303, + "logits/chosen": -2.488119602203369, + "logits/rejected": -2.446547746658325, + "logps/chosen": -409.77557373046875, + "logps/rejected": -457.3531188964844, + "loss": 0.5975, + "rewards/accuracies": 0.7124999761581421, + "rewards/chosen": -0.9190298318862915, + "rewards/margins": 0.5190376043319702, + "rewards/rejected": -1.4380674362182617, "step": 1210 }, { "epoch": 0.972, - "grad_norm": 9.097041130065918, + "grad_norm": 9.208328247070312, "learning_rate": 1.193150004542204e-08, - "logits/chosen": -2.531789779663086, - "logits/rejected": -2.5278313159942627, - "logps/chosen": -356.40386962890625, - "logps/rejected": -406.5818786621094, - "loss": 0.5828, - "rewards/accuracies": 0.737500011920929, - "rewards/chosen": -0.6918624639511108, - "rewards/margins": 0.5878754258155823, - "rewards/rejected": -1.2797380685806274, + "logits/chosen": -2.523573160171509, + "logits/rejected": -2.5186927318573, + "logps/chosen": -355.54656982421875, + "logps/rejected": -407.33172607421875, + "loss": 0.5734, + "rewards/accuracies": 0.762499988079071, + "rewards/chosen": -0.683289647102356, + "rewards/margins": 0.6039477586746216, + "rewards/rejected": -1.2872374057769775, "step": 1215 }, { "epoch": 0.976, - "grad_norm": 10.505363464355469, + "grad_norm": 7.021068096160889, "learning_rate": 8.767851876239075e-09, - "logits/chosen": -2.512781858444214, - "logits/rejected": -2.4617748260498047, - "logps/chosen": -325.30657958984375, - "logps/rejected": -373.95013427734375, - "loss": 0.5752, - "rewards/accuracies": 0.675000011920929, - "rewards/chosen": -0.7784557342529297, - "rewards/margins": 0.626113772392273, - "rewards/rejected": -1.4045695066452026, + "logits/chosen": -2.505402088165283, + "logits/rejected": -2.454876661300659, + "logps/chosen": -327.73358154296875, + "logps/rejected": -372.61370849609375, + "loss": 0.5824, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -0.8027257919311523, + "rewards/margins": 0.5884792804718018, + "rewards/rejected": -1.391205072402954, "step": 1220 }, { "epoch": 0.98, - "grad_norm": 9.12982177734375, + "grad_norm": 8.4197416305542, "learning_rate": 6.089874350439507e-09, - "logits/chosen": -2.5082201957702637, - "logits/rejected": -2.4916832447052, - "logps/chosen": -437.98638916015625, - "logps/rejected": -452.052001953125, - "loss": 0.5096, - "rewards/accuracies": 0.75, - "rewards/chosen": -0.8293756246566772, - "rewards/margins": 0.6941211819648743, - "rewards/rejected": -1.5234968662261963, + "logits/chosen": -2.5013089179992676, + "logits/rejected": -2.485605239868164, + "logps/chosen": -435.61669921875, + "logps/rejected": -448.99688720703125, + "loss": 0.5037, + "rewards/accuracies": 0.7875000238418579, + "rewards/chosen": -0.8056790232658386, + "rewards/margins": 0.687267005443573, + "rewards/rejected": -1.492945909500122, "step": 1225 }, { "epoch": 0.984, - "grad_norm": 9.886011123657227, + "grad_norm": 9.84626293182373, "learning_rate": 3.8980895450474455e-09, - "logits/chosen": -2.47562313079834, - "logits/rejected": -2.472003221511841, - "logps/chosen": -375.73370361328125, - "logps/rejected": -489.0946350097656, - "loss": 0.4262, - "rewards/accuracies": 0.800000011920929, - "rewards/chosen": -0.7051881551742554, - "rewards/margins": 0.9877891540527344, - "rewards/rejected": -1.6929775476455688, + "logits/chosen": -2.469447612762451, + "logits/rejected": -2.4653396606445312, + "logps/chosen": -375.6591796875, + "logps/rejected": -485.65179443359375, + "loss": 0.4352, + "rewards/accuracies": 0.762499988079071, + "rewards/chosen": -0.7044429779052734, + "rewards/margins": 0.9541055560112, + "rewards/rejected": -1.658548355102539, "step": 1230 }, { "epoch": 0.988, - "grad_norm": 8.874799728393555, + "grad_norm": 10.856142044067383, "learning_rate": 2.192924752854042e-09, - "logits/chosen": -2.579641819000244, - "logits/rejected": -2.561047077178955, - "logps/chosen": -357.88421630859375, - "logps/rejected": -407.43438720703125, - "loss": 0.5599, - "rewards/accuracies": 0.6625000238418579, - "rewards/chosen": -0.7861407995223999, - "rewards/margins": 0.5933399796485901, - "rewards/rejected": -1.3794807195663452, + "logits/chosen": -2.5709242820739746, + "logits/rejected": -2.552412986755371, + "logps/chosen": -359.99749755859375, + "logps/rejected": -404.10693359375, + "loss": 0.5811, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -0.8072735071182251, + "rewards/margins": 0.5389326810836792, + "rewards/rejected": -1.3462061882019043, "step": 1235 }, { "epoch": 0.992, - "grad_norm": 8.425810813903809, + "grad_norm": 8.36683464050293, "learning_rate": 9.747123991141193e-10, - "logits/chosen": -2.442471981048584, - "logits/rejected": -2.426997661590576, - "logps/chosen": -376.2283020019531, - "logps/rejected": -400.55718994140625, - "loss": 0.575, - "rewards/accuracies": 0.699999988079071, - "rewards/chosen": -0.9100795984268188, - "rewards/margins": 0.6220163702964783, - "rewards/rejected": -1.5320959091186523, + "logits/chosen": -2.4341177940368652, + "logits/rejected": -2.4185235500335693, + "logps/chosen": -372.7251892089844, + "logps/rejected": -395.2005310058594, + "loss": 0.5735, + "rewards/accuracies": 0.7124999761581421, + "rewards/chosen": -0.8750492334365845, + "rewards/margins": 0.6034801006317139, + "rewards/rejected": -1.4785292148590088, "step": 1240 }, { "epoch": 0.996, - "grad_norm": 9.747213363647461, + "grad_norm": 9.960768699645996, "learning_rate": 2.43689976739403e-10, - "logits/chosen": -2.400465250015259, - "logits/rejected": -2.4487109184265137, - "logps/chosen": -408.04913330078125, - "logps/rejected": -413.04443359375, - "loss": 0.5348, - "rewards/accuracies": 0.6499999761581421, - "rewards/chosen": -0.8299952745437622, - "rewards/margins": 0.6040025353431702, - "rewards/rejected": -1.4339977502822876, + "logits/chosen": -2.397348642349243, + "logits/rejected": -2.444608688354492, + "logps/chosen": -407.68475341796875, + "logps/rejected": -409.0362243652344, + "loss": 0.5478, + "rewards/accuracies": 0.6625000238418579, + "rewards/chosen": -0.8263516426086426, + "rewards/margins": 0.5675632357597351, + "rewards/rejected": -1.3939149379730225, "step": 1245 }, { "epoch": 1.0, - "grad_norm": 9.561023712158203, + "grad_norm": 14.954544067382812, "learning_rate": 0.0, - "logits/chosen": -2.478743076324463, - "logits/rejected": -2.454713821411133, - "logps/chosen": -397.37994384765625, - "logps/rejected": -446.76275634765625, - "loss": 0.5129, - "rewards/accuracies": 0.75, - "rewards/chosen": -0.9160435795783997, - "rewards/margins": 0.6437736749649048, - "rewards/rejected": -1.5598171949386597, + "logits/chosen": -2.471954822540283, + "logits/rejected": -2.448702335357666, + "logps/chosen": -397.40447998046875, + "logps/rejected": -444.6131896972656, + "loss": 0.5219, + "rewards/accuracies": 0.7875000238418579, + "rewards/chosen": -0.9162886738777161, + "rewards/margins": 0.6220329999923706, + "rewards/rejected": -1.5383217334747314, "step": 1250 }, { "epoch": 1.0, "step": 1250, "total_flos": 0.0, - "train_loss": 0.0, - "train_runtime": 0.0105, - "train_samples_per_second": 1898476.441, - "train_steps_per_second": 118654.778 + "train_loss": 0.5873338260650635, + "train_runtime": 15803.1996, + "train_samples_per_second": 1.266, + "train_steps_per_second": 0.079 } ], "logging_steps": 5,