{ "best_metric": null, "best_model_checkpoint": null, "epoch": 1.9946666666666668, "eval_steps": 1000, "global_step": 374, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.005333333333333333, "grad_norm": 0.5672486208114706, "learning_rate": 1.3157894736842107e-07, "logits/chosen": -0.9279001951217651, "logits/rejected": -0.858139157295227, "logps/chosen": -227.95245361328125, "logps/rejected": -298.680908203125, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 1 }, { "epoch": 0.05333333333333334, "grad_norm": 0.627338544388044, "learning_rate": 1.3157894736842106e-06, "logits/chosen": -1.0396056175231934, "logits/rejected": -1.0286777019500732, "logps/chosen": -272.0198974609375, "logps/rejected": -275.8685302734375, "loss": 0.693, "rewards/accuracies": 0.5416666865348816, "rewards/chosen": 0.0009284570114687085, "rewards/margins": 0.001860518823377788, "rewards/rejected": -0.0009320618119090796, "step": 10 }, { "epoch": 0.10666666666666667, "grad_norm": 0.6142473047006762, "learning_rate": 2.631578947368421e-06, "logits/chosen": -0.9923893809318542, "logits/rejected": -1.010837197303772, "logps/chosen": -281.47979736328125, "logps/rejected": -268.1535949707031, "loss": 0.6876, "rewards/accuracies": 0.824999988079071, "rewards/chosen": 0.006842092610895634, "rewards/margins": 0.010892460122704506, "rewards/rejected": -0.0040503679774701595, "step": 20 }, { "epoch": 0.16, "grad_norm": 0.7939072580410426, "learning_rate": 3.947368421052632e-06, "logits/chosen": -1.0665647983551025, "logits/rejected": -1.030176043510437, "logps/chosen": -265.56134033203125, "logps/rejected": -276.7889404296875, "loss": 0.658, "rewards/accuracies": 1.0, "rewards/chosen": 0.03331710770726204, "rewards/margins": 0.06989365816116333, "rewards/rejected": -0.03657654672861099, "step": 30 }, { "epoch": 0.21333333333333335, "grad_norm": 1.582486884512004, "learning_rate": 4.999562902281866e-06, "logits/chosen": -1.0957633256912231, "logits/rejected": -1.0948419570922852, "logps/chosen": -266.51983642578125, "logps/rejected": -322.9562072753906, "loss": 0.5339, "rewards/accuracies": 1.0, "rewards/chosen": 0.11751838773488998, "rewards/margins": 0.3626277446746826, "rewards/rejected": -0.24510934948921204, "step": 40 }, { "epoch": 0.26666666666666666, "grad_norm": 0.8169663169771063, "learning_rate": 4.984280524733107e-06, "logits/chosen": -1.1005247831344604, "logits/rejected": -1.1034621000289917, "logps/chosen": -270.61724853515625, "logps/rejected": -416.21142578125, "loss": 0.2383, "rewards/accuracies": 1.0, "rewards/chosen": 0.15088006854057312, "rewards/margins": 1.5602823495864868, "rewards/rejected": -1.4094021320343018, "step": 50 }, { "epoch": 0.32, "grad_norm": 0.3705635446867794, "learning_rate": 4.947295864744121e-06, "logits/chosen": -1.1092312335968018, "logits/rejected": -1.0661927461624146, "logps/chosen": -294.2104797363281, "logps/rejected": -600.4303588867188, "loss": 0.0671, "rewards/accuracies": 1.0, "rewards/chosen": 0.08243656903505325, "rewards/margins": 3.372529983520508, "rewards/rejected": -3.2900936603546143, "step": 60 }, { "epoch": 0.37333333333333335, "grad_norm": 0.09049040384751605, "learning_rate": 4.8889320144653525e-06, "logits/chosen": -1.0999512672424316, "logits/rejected": -0.9580685496330261, "logps/chosen": -330.55194091796875, "logps/rejected": -985.8513793945312, "loss": 0.0161, "rewards/accuracies": 1.0, "rewards/chosen": -0.5064759254455566, "rewards/margins": 6.231157302856445, "rewards/rejected": -6.73763370513916, "step": 70 }, { "epoch": 0.4266666666666667, "grad_norm": 0.03915438076032923, "learning_rate": 4.809698831278217e-06, "logits/chosen": -0.9111706018447876, "logits/rejected": -0.7140064835548401, "logps/chosen": -339.89349365234375, "logps/rejected": -1143.648681640625, "loss": 0.0051, "rewards/accuracies": 1.0, "rewards/chosen": -0.66388338804245, "rewards/margins": 7.908216953277588, "rewards/rejected": -8.572099685668945, "step": 80 }, { "epoch": 0.48, "grad_norm": 0.14297159038439752, "learning_rate": 4.710288483761524e-06, "logits/chosen": -0.8663455247879028, "logits/rejected": -0.5593339800834656, "logps/chosen": -336.32037353515625, "logps/rejected": -1434.2740478515625, "loss": 0.0022, "rewards/accuracies": 1.0, "rewards/chosen": -0.8545030355453491, "rewards/margins": 10.544784545898438, "rewards/rejected": -11.399286270141602, "step": 90 }, { "epoch": 0.5333333333333333, "grad_norm": 0.040959450492445315, "learning_rate": 4.59156940501605e-06, "logits/chosen": -0.9001060724258423, "logits/rejected": -0.6381432414054871, "logps/chosen": -347.5130310058594, "logps/rejected": -1511.1595458984375, "loss": 0.0013, "rewards/accuracies": 1.0, "rewards/chosen": -0.7411862015724182, "rewards/margins": 11.320222854614258, "rewards/rejected": -12.061409950256348, "step": 100 }, { "epoch": 0.5866666666666667, "grad_norm": 0.010465140313811594, "learning_rate": 4.454578706170075e-06, "logits/chosen": -0.8447334170341492, "logits/rejected": -0.5461128950119019, "logps/chosen": -324.555908203125, "logps/rejected": -1563.4332275390625, "loss": 0.0009, "rewards/accuracies": 1.0, "rewards/chosen": -0.7364819645881653, "rewards/margins": 11.867830276489258, "rewards/rejected": -12.604310989379883, "step": 110 }, { "epoch": 0.64, "grad_norm": 0.018698800124617214, "learning_rate": 4.300513116340317e-06, "logits/chosen": -0.869040846824646, "logits/rejected": -0.6451767086982727, "logps/chosen": -368.4622497558594, "logps/rejected": -1524.347412109375, "loss": 0.0013, "rewards/accuracies": 1.0, "rewards/chosen": -0.5922040939331055, "rewards/margins": 11.815667152404785, "rewards/rejected": -12.407870292663574, "step": 120 }, { "epoch": 0.6933333333333334, "grad_norm": 0.13074428382691303, "learning_rate": 4.130718528195303e-06, "logits/chosen": -0.7969690561294556, "logits/rejected": -0.5475348234176636, "logps/chosen": -344.4666442871094, "logps/rejected": -1474.3719482421875, "loss": 0.0011, "rewards/accuracies": 1.0, "rewards/chosen": -0.5560614466667175, "rewards/margins": 11.487146377563477, "rewards/rejected": -12.043208122253418, "step": 130 }, { "epoch": 0.7466666666666667, "grad_norm": 0.007599436316894573, "learning_rate": 3.946678240449515e-06, "logits/chosen": -0.8450958132743835, "logits/rejected": -0.6068762540817261, "logps/chosen": -330.353271484375, "logps/rejected": -1493.345947265625, "loss": 0.0005, "rewards/accuracies": 1.0, "rewards/chosen": -0.6169044375419617, "rewards/margins": 11.569721221923828, "rewards/rejected": -12.186625480651855, "step": 140 }, { "epoch": 0.8, "grad_norm": 0.007906481570741206, "learning_rate": 3.7500000000000005e-06, "logits/chosen": -0.7317169308662415, "logits/rejected": -0.472684770822525, "logps/chosen": -334.15936279296875, "logps/rejected": -1581.9859619140625, "loss": 0.0005, "rewards/accuracies": 1.0, "rewards/chosen": -0.6376131176948547, "rewards/margins": 12.395392417907715, "rewards/rejected": -13.03300666809082, "step": 150 }, { "epoch": 0.8533333333333334, "grad_norm": 0.005373942509470849, "learning_rate": 3.542401956903321e-06, "logits/chosen": -0.802183985710144, "logits/rejected": -0.517475962638855, "logps/chosen": -346.039306640625, "logps/rejected": -1731.537841796875, "loss": 0.0005, "rewards/accuracies": 1.0, "rewards/chosen": -0.6884077787399292, "rewards/margins": 13.671854972839355, "rewards/rejected": -14.360262870788574, "step": 160 }, { "epoch": 0.9066666666666666, "grad_norm": 0.006855807463409515, "learning_rate": 3.3256976548879183e-06, "logits/chosen": -0.7976305484771729, "logits/rejected": -0.48461779952049255, "logps/chosen": -332.21539306640625, "logps/rejected": -1720.519775390625, "loss": 0.0003, "rewards/accuracies": 1.0, "rewards/chosen": -0.6103914380073547, "rewards/margins": 13.717634201049805, "rewards/rejected": -14.328027725219727, "step": 170 }, { "epoch": 0.96, "grad_norm": 0.006530794838523059, "learning_rate": 3.1017801885224332e-06, "logits/chosen": -0.8089723587036133, "logits/rejected": -0.547804594039917, "logps/chosen": -331.06561279296875, "logps/rejected": -1655.431640625, "loss": 0.0003, "rewards/accuracies": 1.0, "rewards/chosen": -0.4778788685798645, "rewards/margins": 13.217842102050781, "rewards/rejected": -13.695721626281738, "step": 180 }, { "epoch": 1.0133333333333334, "grad_norm": 0.002402189687585693, "learning_rate": 2.872605665440436e-06, "logits/chosen": -0.8274615406990051, "logits/rejected": -0.6256132125854492, "logps/chosen": -343.0153503417969, "logps/rejected": -1610.4468994140625, "loss": 0.0003, "rewards/accuracies": 1.0, "rewards/chosen": -0.47418123483657837, "rewards/margins": 12.88371753692627, "rewards/rejected": -13.357897758483887, "step": 190 }, { "epoch": 1.0666666666666667, "grad_norm": 0.0030571071844198616, "learning_rate": 2.6401761180929798e-06, "logits/chosen": -0.8143685460090637, "logits/rejected": -0.5041629076004028, "logps/chosen": -334.1429138183594, "logps/rejected": -1818.0869140625, "loss": 0.0005, "rewards/accuracies": 1.0, "rewards/chosen": -0.558273434638977, "rewards/margins": 14.656936645507812, "rewards/rejected": -15.2152099609375, "step": 200 }, { "epoch": 1.12, "grad_norm": 0.00820790094406569, "learning_rate": 2.4065220143091863e-06, "logits/chosen": -0.8139235377311707, "logits/rejected": -0.564848780632019, "logps/chosen": -345.9544982910156, "logps/rejected": -1764.953125, "loss": 0.0004, "rewards/accuracies": 1.0, "rewards/chosen": -0.475874662399292, "rewards/margins": 14.230936050415039, "rewards/rejected": -14.706808090209961, "step": 210 }, { "epoch": 1.1733333333333333, "grad_norm": 0.0043868434266810755, "learning_rate": 2.173684519449872e-06, "logits/chosen": -0.7210798263549805, "logits/rejected": -0.36144906282424927, "logps/chosen": -327.31622314453125, "logps/rejected": -1841.2216796875, "loss": 0.0004, "rewards/accuracies": 1.0, "rewards/chosen": -0.6532616019248962, "rewards/margins": 14.736944198608398, "rewards/rejected": -15.39020824432373, "step": 220 }, { "epoch": 1.2266666666666666, "grad_norm": 0.0068525897764614785, "learning_rate": 1.9436976651092143e-06, "logits/chosen": -0.7221536636352539, "logits/rejected": -0.5240283012390137, "logps/chosen": -350.7161865234375, "logps/rejected": -1619.64599609375, "loss": 0.0002, "rewards/accuracies": 1.0, "rewards/chosen": -0.45207110047340393, "rewards/margins": 13.1558198928833, "rewards/rejected": -13.607892990112305, "step": 230 }, { "epoch": 1.28, "grad_norm": 0.0033954692926207583, "learning_rate": 1.7185705801358892e-06, "logits/chosen": -0.8645750880241394, "logits/rejected": -0.6266194581985474, "logps/chosen": -343.2956848144531, "logps/rejected": -1709.9945068359375, "loss": 0.0002, "rewards/accuracies": 1.0, "rewards/chosen": -0.46298861503601074, "rewards/margins": 13.744463920593262, "rewards/rejected": -14.2074556350708, "step": 240 }, { "epoch": 1.3333333333333333, "grad_norm": 0.008787649845585386, "learning_rate": 1.500269939200648e-06, "logits/chosen": -0.7889136075973511, "logits/rejected": -0.5392887592315674, "logps/chosen": -342.34405517578125, "logps/rejected": -1703.201171875, "loss": 0.0003, "rewards/accuracies": 1.0, "rewards/chosen": -0.380669504404068, "rewards/margins": 13.953561782836914, "rewards/rejected": -14.334230422973633, "step": 250 }, { "epoch": 1.3866666666666667, "grad_norm": 0.0019219492245800027, "learning_rate": 1.2907027822369006e-06, "logits/chosen": -0.8140700459480286, "logits/rejected": -0.5784817337989807, "logps/chosen": -325.9125061035156, "logps/rejected": -1678.7923583984375, "loss": 0.0002, "rewards/accuracies": 1.0, "rewards/chosen": -0.44779616594314575, "rewards/margins": 13.65271282196045, "rewards/rejected": -14.100509643554688, "step": 260 }, { "epoch": 1.44, "grad_norm": 0.0029707873083702468, "learning_rate": 1.0916998548409449e-06, "logits/chosen": -0.7845500111579895, "logits/rejected": -0.5012301206588745, "logps/chosen": -358.30419921875, "logps/rejected": -1745.314697265625, "loss": 0.0003, "rewards/accuracies": 1.0, "rewards/chosen": -0.6766600608825684, "rewards/margins": 13.99413013458252, "rewards/rejected": -14.67078971862793, "step": 270 }, { "epoch": 1.4933333333333334, "grad_norm": 0.003009319728743961, "learning_rate": 9.04999615167479e-07, "logits/chosen": -0.8230724334716797, "logits/rejected": -0.550376832485199, "logps/chosen": -346.7623596191406, "logps/rejected": -1745.7318115234375, "loss": 0.0001, "rewards/accuracies": 1.0, "rewards/chosen": -0.5431645512580872, "rewards/margins": 14.117253303527832, "rewards/rejected": -14.660417556762695, "step": 280 }, { "epoch": 1.5466666666666666, "grad_norm": 0.05446004244419421, "learning_rate": 7.322330470336314e-07, "logits/chosen": -0.8042120933532715, "logits/rejected": -0.4271600842475891, "logps/chosen": -319.5382080078125, "logps/rejected": -1869.0816650390625, "loss": 0.0003, "rewards/accuracies": 1.0, "rewards/chosen": -0.6211446523666382, "rewards/margins": 15.130853652954102, "rewards/rejected": -15.751996994018555, "step": 290 }, { "epoch": 1.6, "grad_norm": 0.0026251482158599366, "learning_rate": 5.749094119018431e-07, "logits/chosen": -0.8732158541679382, "logits/rejected": -0.5424922704696655, "logps/chosen": -324.2518005371094, "logps/rejected": -1899.756591796875, "loss": 0.0003, "rewards/accuracies": 1.0, "rewards/chosen": -0.514877438545227, "rewards/margins": 15.413823127746582, "rewards/rejected": -15.928703308105469, "step": 300 }, { "epoch": 1.6533333333333333, "grad_norm": 0.0023975764198324104, "learning_rate": 4.344030642100133e-07, "logits/chosen": -0.8402504920959473, "logits/rejected": -0.5483088493347168, "logps/chosen": -330.42828369140625, "logps/rejected": -1794.327392578125, "loss": 0.0002, "rewards/accuracies": 1.0, "rewards/chosen": -0.4678085446357727, "rewards/margins": 14.648382186889648, "rewards/rejected": -15.116189956665039, "step": 310 }, { "epoch": 1.7066666666666666, "grad_norm": 0.0028734478026904357, "learning_rate": 3.119414452281158e-07, "logits/chosen": -0.8355986475944519, "logits/rejected": -0.5001510977745056, "logps/chosen": -328.2898254394531, "logps/rejected": -1902.9224853515625, "loss": 0.0003, "rewards/accuracies": 1.0, "rewards/chosen": -0.504935622215271, "rewards/margins": 15.536231994628906, "rewards/rejected": -16.041166305541992, "step": 320 }, { "epoch": 1.76, "grad_norm": 0.002690221439778056, "learning_rate": 2.0859436032505954e-07, "logits/chosen": -0.896633505821228, "logits/rejected": -0.6399273872375488, "logps/chosen": -357.3143615722656, "logps/rejected": -1739.0318603515625, "loss": 0.0007, "rewards/accuracies": 1.0, "rewards/chosen": -0.39237022399902344, "rewards/margins": 14.242953300476074, "rewards/rejected": -14.635324478149414, "step": 330 }, { "epoch": 1.8133333333333335, "grad_norm": 0.002170040138657762, "learning_rate": 1.2526463331788503e-07, "logits/chosen": -0.847479522228241, "logits/rejected": -0.6152299642562866, "logps/chosen": -348.79742431640625, "logps/rejected": -1842.7880859375, "loss": 0.0002, "rewards/accuracies": 1.0, "rewards/chosen": -0.48484006524086, "rewards/margins": 15.058262825012207, "rewards/rejected": -15.54310131072998, "step": 340 }, { "epoch": 1.8666666666666667, "grad_norm": 0.003300394252196583, "learning_rate": 6.268021954544095e-08, "logits/chosen": -0.8356849551200867, "logits/rejected": -0.4748550355434418, "logps/chosen": -336.01373291015625, "logps/rejected": -1930.8939208984375, "loss": 0.0002, "rewards/accuracies": 1.0, "rewards/chosen": -0.5835164785385132, "rewards/margins": 15.679295539855957, "rewards/rejected": -16.2628116607666, "step": 350 }, { "epoch": 1.92, "grad_norm": 0.0026600066446259086, "learning_rate": 2.1387846565474047e-08, "logits/chosen": -0.8271343111991882, "logits/rejected": -0.5607911348342896, "logps/chosen": -359.8587341308594, "logps/rejected": -1746.127685546875, "loss": 0.0002, "rewards/accuracies": 1.0, "rewards/chosen": -0.4917505383491516, "rewards/margins": 14.259208679199219, "rewards/rejected": -14.750958442687988, "step": 360 }, { "epoch": 1.9733333333333334, "grad_norm": 0.014374372097938156, "learning_rate": 1.7482380290034795e-09, "logits/chosen": -0.807357668876648, "logits/rejected": -0.4851298928260803, "logps/chosen": -321.13861083984375, "logps/rejected": -1839.5152587890625, "loss": 0.0004, "rewards/accuracies": 1.0, "rewards/chosen": -0.5102803111076355, "rewards/margins": 15.034269332885742, "rewards/rejected": -15.544550895690918, "step": 370 }, { "epoch": 1.9946666666666668, "step": 374, "total_flos": 0.0, "train_loss": 0.07791340151829097, "train_runtime": 6908.9348, "train_samples_per_second": 3.474, "train_steps_per_second": 0.054 } ], "logging_steps": 10, "max_steps": 374, "num_input_tokens_seen": 0, "num_train_epochs": 2, "save_steps": 1000, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 0.0, "train_batch_size": 2, "trial_name": null, "trial_params": null }