{ "best_metric": null, "best_model_checkpoint": null, "epoch": 1.0, "eval_steps": 100, "global_step": 485, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0, "learning_rate": 1.020408163265306e-08, "logits/chosen": -3.094454526901245, "logits/rejected": -3.0498220920562744, "logps/chosen": -242.99183654785156, "logps/rejected": -74.66817474365234, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 1 }, { "epoch": 0.02, "learning_rate": 1.0204081632653061e-07, "logits/chosen": -3.032047986984253, "logits/rejected": -3.029446840286255, "logps/chosen": -290.1824645996094, "logps/rejected": -75.82839965820312, "loss": 0.6935, "rewards/accuracies": 0.4027777910232544, "rewards/chosen": -0.007104851305484772, "rewards/margins": -0.0044839149340987206, "rewards/rejected": -0.0026209354400634766, "step": 10 }, { "epoch": 0.04, "learning_rate": 2.0408163265306121e-07, "logits/chosen": -2.9773757457733154, "logits/rejected": -2.967517852783203, "logps/chosen": -297.57342529296875, "logps/rejected": -77.62318420410156, "loss": 0.692, "rewards/accuracies": 0.5625, "rewards/chosen": 0.00020697650325018913, "rewards/margins": 0.003021990181878209, "rewards/rejected": -0.0028150142170488834, "step": 20 }, { "epoch": 0.06, "learning_rate": 3.0612244897959183e-07, "logits/chosen": -2.983607769012451, "logits/rejected": -2.9363152980804443, "logps/chosen": -288.51458740234375, "logps/rejected": -75.65086364746094, "loss": 0.6892, "rewards/accuracies": 0.5687500238418579, "rewards/chosen": -0.0037677965592592955, "rewards/margins": 0.004846884869039059, "rewards/rejected": -0.008614679798483849, "step": 30 }, { "epoch": 0.08, "learning_rate": 4.0816326530612243e-07, "logits/chosen": -3.0467514991760254, "logits/rejected": -3.010239362716675, "logps/chosen": -243.7971954345703, "logps/rejected": -81.06056213378906, "loss": 0.685, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": 0.0063628097996115685, "rewards/margins": 0.02118637040257454, "rewards/rejected": -0.014823561534285545, "step": 40 }, { "epoch": 0.1, "learning_rate": 4.988532110091743e-07, "logits/chosen": -3.0095317363739014, "logits/rejected": -3.0367846488952637, "logps/chosen": -251.5819854736328, "logps/rejected": -78.19547271728516, "loss": 0.6784, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": 0.005416669882833958, "rewards/margins": 0.023932188749313354, "rewards/rejected": -0.018515516072511673, "step": 50 }, { "epoch": 0.12, "learning_rate": 4.873853211009174e-07, "logits/chosen": -3.0116028785705566, "logits/rejected": -3.0300631523132324, "logps/chosen": -281.01361083984375, "logps/rejected": -75.49365997314453, "loss": 0.6715, "rewards/accuracies": 0.8125, "rewards/chosen": 0.015385298058390617, "rewards/margins": 0.050571341067552567, "rewards/rejected": -0.0351860448718071, "step": 60 }, { "epoch": 0.14, "learning_rate": 4.7591743119266054e-07, "logits/chosen": -3.0327250957489014, "logits/rejected": -3.0184121131896973, "logps/chosen": -262.8722229003906, "logps/rejected": -71.65990447998047, "loss": 0.6649, "rewards/accuracies": 0.831250011920929, "rewards/chosen": 0.016824517399072647, "rewards/margins": 0.06025807186961174, "rewards/rejected": -0.043433547019958496, "step": 70 }, { "epoch": 0.16, "learning_rate": 4.644495412844037e-07, "logits/chosen": -3.0364532470703125, "logits/rejected": -2.988002300262451, "logps/chosen": -254.49423217773438, "logps/rejected": -70.27412414550781, "loss": 0.6556, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": 0.022701723501086235, "rewards/margins": 0.07623252272605896, "rewards/rejected": -0.05353079363703728, "step": 80 }, { "epoch": 0.19, "learning_rate": 4.5298165137614677e-07, "logits/chosen": -3.068497657775879, "logits/rejected": -3.0402565002441406, "logps/chosen": -266.61614990234375, "logps/rejected": -81.87393951416016, "loss": 0.6455, "rewards/accuracies": 0.8687499761581421, "rewards/chosen": 0.026070792227983475, "rewards/margins": 0.10358123481273651, "rewards/rejected": -0.07751044631004333, "step": 90 }, { "epoch": 0.21, "learning_rate": 4.4151376146788986e-07, "logits/chosen": -3.0521655082702637, "logits/rejected": -3.057821750640869, "logps/chosen": -286.0577087402344, "logps/rejected": -77.96414947509766, "loss": 0.6336, "rewards/accuracies": 0.949999988079071, "rewards/chosen": 0.033475782722234726, "rewards/margins": 0.14013811945915222, "rewards/rejected": -0.10666234791278839, "step": 100 }, { "epoch": 0.23, "learning_rate": 4.30045871559633e-07, "logits/chosen": -3.003532886505127, "logits/rejected": -2.995978355407715, "logps/chosen": -276.5457458496094, "logps/rejected": -80.02079010009766, "loss": 0.6234, "rewards/accuracies": 0.9375, "rewards/chosen": 0.0331401564180851, "rewards/margins": 0.14480046927928925, "rewards/rejected": -0.11166031658649445, "step": 110 }, { "epoch": 0.25, "learning_rate": 4.1857798165137613e-07, "logits/chosen": -3.0330376625061035, "logits/rejected": -3.030214548110962, "logps/chosen": -276.41632080078125, "logps/rejected": -77.67643737792969, "loss": 0.6164, "rewards/accuracies": 0.9375, "rewards/chosen": 0.043682295829057693, "rewards/margins": 0.177944153547287, "rewards/rejected": -0.1342618763446808, "step": 120 }, { "epoch": 0.27, "learning_rate": 4.071100917431192e-07, "logits/chosen": -2.9754703044891357, "logits/rejected": -2.9898681640625, "logps/chosen": -283.3277587890625, "logps/rejected": -83.87138366699219, "loss": 0.6121, "rewards/accuracies": 0.9312499761581421, "rewards/chosen": 0.048630841076374054, "rewards/margins": 0.19439519941806793, "rewards/rejected": -0.14576435089111328, "step": 130 }, { "epoch": 0.29, "learning_rate": 3.9564220183486236e-07, "logits/chosen": -3.0477757453918457, "logits/rejected": -3.0237550735473633, "logps/chosen": -291.98065185546875, "logps/rejected": -82.53144073486328, "loss": 0.5997, "rewards/accuracies": 0.925000011920929, "rewards/chosen": 0.034745730459690094, "rewards/margins": 0.20989501476287842, "rewards/rejected": -0.17514929175376892, "step": 140 }, { "epoch": 0.31, "learning_rate": 3.841743119266055e-07, "logits/chosen": -3.033001661300659, "logits/rejected": -3.015845775604248, "logps/chosen": -289.15582275390625, "logps/rejected": -76.08447265625, "loss": 0.5925, "rewards/accuracies": 0.9437500238418579, "rewards/chosen": 0.0425817035138607, "rewards/margins": 0.21189098060131073, "rewards/rejected": -0.16930925846099854, "step": 150 }, { "epoch": 0.33, "learning_rate": 3.7270642201834864e-07, "logits/chosen": -3.0720551013946533, "logits/rejected": -3.0518932342529297, "logps/chosen": -271.08258056640625, "logps/rejected": -75.97576141357422, "loss": 0.5874, "rewards/accuracies": 0.956250011920929, "rewards/chosen": 0.03000471368432045, "rewards/margins": 0.20934228599071503, "rewards/rejected": -0.17933759093284607, "step": 160 }, { "epoch": 0.35, "learning_rate": 3.612385321100918e-07, "logits/chosen": -3.026865243911743, "logits/rejected": -3.030813455581665, "logps/chosen": -287.5133361816406, "logps/rejected": -77.84892272949219, "loss": 0.5811, "rewards/accuracies": 0.949999988079071, "rewards/chosen": 0.050167638808488846, "rewards/margins": 0.24577708542346954, "rewards/rejected": -0.1956094205379486, "step": 170 }, { "epoch": 0.37, "learning_rate": 3.497706422018348e-07, "logits/chosen": -3.064037322998047, "logits/rejected": -3.0434131622314453, "logps/chosen": -270.81378173828125, "logps/rejected": -78.64222717285156, "loss": 0.5708, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": 0.0572846345603466, "rewards/margins": 0.27750909328460693, "rewards/rejected": -0.2202244997024536, "step": 180 }, { "epoch": 0.39, "learning_rate": 3.3830275229357795e-07, "logits/chosen": -3.0381369590759277, "logits/rejected": -3.031832456588745, "logps/chosen": -273.7306823730469, "logps/rejected": -79.31744384765625, "loss": 0.5604, "rewards/accuracies": 0.96875, "rewards/chosen": 0.05553610250353813, "rewards/margins": 0.29081013798713684, "rewards/rejected": -0.2352740317583084, "step": 190 }, { "epoch": 0.41, "learning_rate": 3.268348623853211e-07, "logits/chosen": -3.036811113357544, "logits/rejected": -3.0287680625915527, "logps/chosen": -266.4691467285156, "logps/rejected": -77.38215637207031, "loss": 0.5504, "rewards/accuracies": 0.949999988079071, "rewards/chosen": 0.08118367195129395, "rewards/margins": 0.3425747752189636, "rewards/rejected": -0.2613911032676697, "step": 200 }, { "epoch": 0.43, "learning_rate": 3.1536697247706423e-07, "logits/chosen": -3.061699867248535, "logits/rejected": -3.042888641357422, "logps/chosen": -269.961181640625, "logps/rejected": -89.21647644042969, "loss": 0.5501, "rewards/accuracies": 0.956250011920929, "rewards/chosen": 0.07142322510480881, "rewards/margins": 0.3240587115287781, "rewards/rejected": -0.25263547897338867, "step": 210 }, { "epoch": 0.45, "learning_rate": 3.038990825688073e-07, "logits/chosen": -3.04771089553833, "logits/rejected": -3.018721103668213, "logps/chosen": -250.44091796875, "logps/rejected": -72.33317565917969, "loss": 0.5488, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": 0.06637217104434967, "rewards/margins": 0.3276647627353668, "rewards/rejected": -0.26129260659217834, "step": 220 }, { "epoch": 0.47, "learning_rate": 2.9243119266055045e-07, "logits/chosen": -2.9626972675323486, "logits/rejected": -2.9827158451080322, "logps/chosen": -293.9212646484375, "logps/rejected": -72.2821044921875, "loss": 0.5313, "rewards/accuracies": 0.981249988079071, "rewards/chosen": 0.08349540829658508, "rewards/margins": 0.3892216682434082, "rewards/rejected": -0.30572623014450073, "step": 230 }, { "epoch": 0.49, "learning_rate": 2.809633027522936e-07, "logits/chosen": -3.034790277481079, "logits/rejected": -3.016634225845337, "logps/chosen": -280.6105651855469, "logps/rejected": -76.09197235107422, "loss": 0.5333, "rewards/accuracies": 0.9375, "rewards/chosen": 0.08378176391124725, "rewards/margins": 0.4068339467048645, "rewards/rejected": -0.32305219769477844, "step": 240 }, { "epoch": 0.52, "learning_rate": 2.6949541284403673e-07, "logits/chosen": -3.0789849758148193, "logits/rejected": -3.0785841941833496, "logps/chosen": -264.5536804199219, "logps/rejected": -82.22047424316406, "loss": 0.5282, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": 0.06328760087490082, "rewards/margins": 0.40200409293174744, "rewards/rejected": -0.3387165069580078, "step": 250 }, { "epoch": 0.54, "learning_rate": 2.5802752293577976e-07, "logits/chosen": -2.9741625785827637, "logits/rejected": -2.9866743087768555, "logps/chosen": -282.30902099609375, "logps/rejected": -70.76858520507812, "loss": 0.5277, "rewards/accuracies": 0.9312499761581421, "rewards/chosen": 0.10191468149423599, "rewards/margins": 0.39590951800346375, "rewards/rejected": -0.29399481415748596, "step": 260 }, { "epoch": 0.56, "learning_rate": 2.465596330275229e-07, "logits/chosen": -3.032557964324951, "logits/rejected": -3.03240704536438, "logps/chosen": -274.0851135253906, "logps/rejected": -86.98384094238281, "loss": 0.5135, "rewards/accuracies": 0.9375, "rewards/chosen": 0.07479412853717804, "rewards/margins": 0.4109489321708679, "rewards/rejected": -0.3361548185348511, "step": 270 }, { "epoch": 0.58, "learning_rate": 2.3509174311926604e-07, "logits/chosen": -3.060285806655884, "logits/rejected": -2.9775302410125732, "logps/chosen": -253.785888671875, "logps/rejected": -70.39444732666016, "loss": 0.5183, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": 0.07235217839479446, "rewards/margins": 0.3860532343387604, "rewards/rejected": -0.31370100378990173, "step": 280 }, { "epoch": 0.6, "learning_rate": 2.2362385321100916e-07, "logits/chosen": -3.029343843460083, "logits/rejected": -3.0406129360198975, "logps/chosen": -276.57196044921875, "logps/rejected": -84.54597473144531, "loss": 0.5107, "rewards/accuracies": 0.9437500238418579, "rewards/chosen": 0.08857797086238861, "rewards/margins": 0.4803849756717682, "rewards/rejected": -0.3918069899082184, "step": 290 }, { "epoch": 0.62, "learning_rate": 2.121559633027523e-07, "logits/chosen": -2.9938578605651855, "logits/rejected": -2.9954426288604736, "logps/chosen": -273.7822265625, "logps/rejected": -77.98421478271484, "loss": 0.5079, "rewards/accuracies": 0.956250011920929, "rewards/chosen": 0.08799968659877777, "rewards/margins": 0.40502768754959106, "rewards/rejected": -0.3170279860496521, "step": 300 }, { "epoch": 0.64, "learning_rate": 2.0068807339449538e-07, "logits/chosen": -3.052614212036133, "logits/rejected": -3.0461201667785645, "logps/chosen": -281.28814697265625, "logps/rejected": -81.84606170654297, "loss": 0.5038, "rewards/accuracies": 0.956250011920929, "rewards/chosen": 0.05326849967241287, "rewards/margins": 0.46244749426841736, "rewards/rejected": -0.4091789722442627, "step": 310 }, { "epoch": 0.66, "learning_rate": 1.8922018348623852e-07, "logits/chosen": -3.031501054763794, "logits/rejected": -3.042961597442627, "logps/chosen": -271.274658203125, "logps/rejected": -87.3827133178711, "loss": 0.5003, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": 0.07084844261407852, "rewards/margins": 0.445441871881485, "rewards/rejected": -0.37459343671798706, "step": 320 }, { "epoch": 0.68, "learning_rate": 1.7775229357798163e-07, "logits/chosen": -3.0476019382476807, "logits/rejected": -3.0447893142700195, "logps/chosen": -249.735595703125, "logps/rejected": -73.10395812988281, "loss": 0.4976, "rewards/accuracies": 0.925000011920929, "rewards/chosen": 0.06198754906654358, "rewards/margins": 0.43834322690963745, "rewards/rejected": -0.37635567784309387, "step": 330 }, { "epoch": 0.7, "learning_rate": 1.6628440366972477e-07, "logits/chosen": -3.055901288986206, "logits/rejected": -3.0517029762268066, "logps/chosen": -273.3477478027344, "logps/rejected": -85.53290557861328, "loss": 0.496, "rewards/accuracies": 0.987500011920929, "rewards/chosen": 0.08338963240385056, "rewards/margins": 0.5042273998260498, "rewards/rejected": -0.42083778977394104, "step": 340 }, { "epoch": 0.72, "learning_rate": 1.5481651376146786e-07, "logits/chosen": -3.063744306564331, "logits/rejected": -3.066366195678711, "logps/chosen": -277.1488952636719, "logps/rejected": -88.2572250366211, "loss": 0.4931, "rewards/accuracies": 0.987500011920929, "rewards/chosen": 0.07289155572652817, "rewards/margins": 0.5126849412918091, "rewards/rejected": -0.4397934079170227, "step": 350 }, { "epoch": 0.74, "learning_rate": 1.43348623853211e-07, "logits/chosen": -3.0237436294555664, "logits/rejected": -3.0258359909057617, "logps/chosen": -292.0096740722656, "logps/rejected": -81.93167114257812, "loss": 0.4951, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": 0.07367613166570663, "rewards/margins": 0.49797001481056213, "rewards/rejected": -0.4242939352989197, "step": 360 }, { "epoch": 0.76, "learning_rate": 1.318807339449541e-07, "logits/chosen": -2.9882092475891113, "logits/rejected": -2.9637956619262695, "logps/chosen": -274.551513671875, "logps/rejected": -73.8973388671875, "loss": 0.496, "rewards/accuracies": 0.949999988079071, "rewards/chosen": 0.0880483016371727, "rewards/margins": 0.49274787306785583, "rewards/rejected": -0.4046996533870697, "step": 370 }, { "epoch": 0.78, "learning_rate": 1.2041284403669725e-07, "logits/chosen": -3.070621967315674, "logits/rejected": -3.0683789253234863, "logps/chosen": -266.607177734375, "logps/rejected": -81.02775573730469, "loss": 0.493, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": 0.10891600698232651, "rewards/margins": 0.5303564071655273, "rewards/rejected": -0.42144036293029785, "step": 380 }, { "epoch": 0.8, "learning_rate": 1.0894495412844036e-07, "logits/chosen": -3.0497114658355713, "logits/rejected": -3.053192615509033, "logps/chosen": -280.43218994140625, "logps/rejected": -80.42735290527344, "loss": 0.4892, "rewards/accuracies": 0.9375, "rewards/chosen": 0.10893626511096954, "rewards/margins": 0.5605167746543884, "rewards/rejected": -0.4515805244445801, "step": 390 }, { "epoch": 0.82, "learning_rate": 9.747706422018348e-08, "logits/chosen": -3.002933979034424, "logits/rejected": -3.0063657760620117, "logps/chosen": -241.24276733398438, "logps/rejected": -75.92924499511719, "loss": 0.4833, "rewards/accuracies": 0.956250011920929, "rewards/chosen": 0.07781459391117096, "rewards/margins": 0.46425342559814453, "rewards/rejected": -0.38643890619277954, "step": 400 }, { "epoch": 0.85, "learning_rate": 8.60091743119266e-08, "logits/chosen": -3.0454163551330566, "logits/rejected": -3.035583972930908, "logps/chosen": -264.18585205078125, "logps/rejected": -78.031982421875, "loss": 0.4744, "rewards/accuracies": 0.987500011920929, "rewards/chosen": 0.09802711009979248, "rewards/margins": 0.5436574816703796, "rewards/rejected": -0.44563040137290955, "step": 410 }, { "epoch": 0.87, "learning_rate": 7.454128440366971e-08, "logits/chosen": -3.0196666717529297, "logits/rejected": -3.0026302337646484, "logps/chosen": -272.02630615234375, "logps/rejected": -82.01240539550781, "loss": 0.481, "rewards/accuracies": 0.956250011920929, "rewards/chosen": 0.08279488980770111, "rewards/margins": 0.5704164505004883, "rewards/rejected": -0.48762160539627075, "step": 420 }, { "epoch": 0.89, "learning_rate": 6.307339449541284e-08, "logits/chosen": -3.0509345531463623, "logits/rejected": -3.0137345790863037, "logps/chosen": -262.2018127441406, "logps/rejected": -77.63418579101562, "loss": 0.4731, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": 0.1073322519659996, "rewards/margins": 0.5776056051254272, "rewards/rejected": -0.4702734053134918, "step": 430 }, { "epoch": 0.91, "learning_rate": 5.1605504587155966e-08, "logits/chosen": -3.0285000801086426, "logits/rejected": -3.0236475467681885, "logps/chosen": -266.83599853515625, "logps/rejected": -77.38362121582031, "loss": 0.476, "rewards/accuracies": 0.9437500238418579, "rewards/chosen": 0.08291526138782501, "rewards/margins": 0.4984784722328186, "rewards/rejected": -0.41556310653686523, "step": 440 }, { "epoch": 0.93, "learning_rate": 4.0137614678899086e-08, "logits/chosen": -3.02640438079834, "logits/rejected": -3.011373996734619, "logps/chosen": -295.5868835449219, "logps/rejected": -80.76414489746094, "loss": 0.4707, "rewards/accuracies": 0.96875, "rewards/chosen": 0.09663239866495132, "rewards/margins": 0.5815601944923401, "rewards/rejected": -0.48492780327796936, "step": 450 }, { "epoch": 0.95, "learning_rate": 2.86697247706422e-08, "logits/chosen": -3.0195059776306152, "logits/rejected": -2.988323926925659, "logps/chosen": -300.5026550292969, "logps/rejected": -86.79838562011719, "loss": 0.4808, "rewards/accuracies": 0.96875, "rewards/chosen": 0.11054690927267075, "rewards/margins": 0.5899176001548767, "rewards/rejected": -0.47937074303627014, "step": 460 }, { "epoch": 0.97, "learning_rate": 1.720183486238532e-08, "logits/chosen": -3.0426931381225586, "logits/rejected": -3.0394179821014404, "logps/chosen": -235.52706909179688, "logps/rejected": -73.9857406616211, "loss": 0.4819, "rewards/accuracies": 0.956250011920929, "rewards/chosen": 0.08785500377416611, "rewards/margins": 0.5274263620376587, "rewards/rejected": -0.4395713806152344, "step": 470 }, { "epoch": 0.99, "learning_rate": 5.73394495412844e-09, "logits/chosen": -3.0092616081237793, "logits/rejected": -2.972731590270996, "logps/chosen": -249.88876342773438, "logps/rejected": -85.80451965332031, "loss": 0.482, "rewards/accuracies": 0.96875, "rewards/chosen": 0.07512323558330536, "rewards/margins": 0.5230099558830261, "rewards/rejected": -0.44788676500320435, "step": 480 }, { "epoch": 1.0, "eval_logits/chosen": -3.034407377243042, "eval_logits/rejected": -3.069913864135742, "eval_logps/chosen": -271.40020751953125, "eval_logps/rejected": -175.5244140625, "eval_loss": 0.5650191903114319, "eval_rewards/accuracies": 0.76953125, "eval_rewards/chosen": 0.08157022297382355, "eval_rewards/margins": 0.33799096941947937, "eval_rewards/rejected": -0.25642073154449463, "eval_runtime": 256.4523, "eval_samples_per_second": 7.799, "eval_steps_per_second": 0.062, "step": 485 }, { "epoch": 1.0, "step": 485, "total_flos": 0.0, "train_loss": 0.5539181610972611, "train_runtime": 15602.6148, "train_samples_per_second": 3.978, "train_steps_per_second": 0.031 } ], "logging_steps": 10, "max_steps": 485, "num_train_epochs": 1, "save_steps": 500, "total_flos": 0.0, "trial_name": null, "trial_params": null }