{ "best_metric": null, "best_model_checkpoint": null, "epoch": 1.998828811243412, "eval_steps": 75, "global_step": 160, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.024985360140542652, "grad_norm": 16.803013672547674, "learning_rate": 4e-09, "logits/chosen": -0.7169057726860046, "logits/rejected": -0.7742066979408264, "logps/chosen": -158.30039978027344, "logps/rejected": -167.5013427734375, "loss": 0.6922, "rewards/accuracies": 0.5390625, "rewards/chosen": 0.0021153492853045464, "rewards/margins": 0.0017622699961066246, "rewards/rejected": 0.00035307969665154815, "step": 2 }, { "epoch": 0.049970720281085304, "grad_norm": 17.42934158508575, "learning_rate": 8e-09, "logits/chosen": -0.6620150804519653, "logits/rejected": -0.7335376143455505, "logps/chosen": -166.97694396972656, "logps/rejected": -166.01077270507812, "loss": 0.6934, "rewards/accuracies": 0.54296875, "rewards/chosen": 0.001203760621137917, "rewards/margins": 0.0035094027407467365, "rewards/rejected": -0.002305642468854785, "step": 4 }, { "epoch": 0.07495608042162795, "grad_norm": 17.33795202273814, "learning_rate": 1.1999999999999998e-08, "logits/chosen": -0.7035447359085083, "logits/rejected": -0.7770529985427856, "logps/chosen": -160.94981384277344, "logps/rejected": -169.4982147216797, "loss": 0.6942, "rewards/accuracies": 0.46875, "rewards/chosen": 0.0030440501868724823, "rewards/margins": -0.003373978193849325, "rewards/rejected": 0.006418028846383095, "step": 6 }, { "epoch": 0.09994144056217061, "grad_norm": 16.8202461847326, "learning_rate": 1.6e-08, "logits/chosen": -0.6711893677711487, "logits/rejected": -0.7459686994552612, "logps/chosen": -164.15184020996094, "logps/rejected": -180.4791259765625, "loss": 0.6923, "rewards/accuracies": 0.49609375, "rewards/chosen": 0.0023907795548439026, "rewards/margins": -0.002676962874829769, "rewards/rejected": 0.005067741964012384, "step": 8 }, { "epoch": 0.12492680070271325, "grad_norm": 17.3204666289138, "learning_rate": 2e-08, "logits/chosen": -0.6638763546943665, "logits/rejected": -0.7240799069404602, "logps/chosen": -165.18699645996094, "logps/rejected": -153.55906677246094, "loss": 0.6933, "rewards/accuracies": 0.5234375, "rewards/chosen": 0.0023628827184438705, "rewards/margins": 0.002067561261355877, "rewards/rejected": 0.00029532192274928093, "step": 10 }, { "epoch": 0.1499121608432559, "grad_norm": 18.418398156120897, "learning_rate": 2.3999999999999997e-08, "logits/chosen": -0.7016454935073853, "logits/rejected": -0.7838542461395264, "logps/chosen": -159.6986083984375, "logps/rejected": -275.36236572265625, "loss": 0.694, "rewards/accuracies": 0.5078125, "rewards/chosen": -0.0019385786727070808, "rewards/margins": 0.0008969469927251339, "rewards/rejected": -0.0028355256654322147, "step": 12 }, { "epoch": 0.17489752098379854, "grad_norm": 17.791533019243598, "learning_rate": 2.8000000000000003e-08, "logits/chosen": -0.6400444507598877, "logits/rejected": -0.7088255882263184, "logps/chosen": -159.34640502929688, "logps/rejected": -162.47824096679688, "loss": 0.6923, "rewards/accuracies": 0.51953125, "rewards/chosen": 0.0004528433782979846, "rewards/margins": 0.0028284057043492794, "rewards/rejected": -0.0023755626752972603, "step": 14 }, { "epoch": 0.19988288112434122, "grad_norm": 17.319201876452443, "learning_rate": 3.2e-08, "logits/chosen": -0.6704908609390259, "logits/rejected": -0.7314557433128357, "logps/chosen": -160.60862731933594, "logps/rejected": -166.46450805664062, "loss": 0.6936, "rewards/accuracies": 0.5234375, "rewards/chosen": 0.001064170734025538, "rewards/margins": 3.689667209982872e-06, "rewards/rejected": 0.0010604818817228079, "step": 16 }, { "epoch": 0.22486824126488386, "grad_norm": 16.072247731283092, "learning_rate": 3.6e-08, "logits/chosen": -0.6766926050186157, "logits/rejected": -0.7459310293197632, "logps/chosen": -162.13914489746094, "logps/rejected": -191.6351318359375, "loss": 0.6931, "rewards/accuracies": 0.50390625, "rewards/chosen": 0.005069206468760967, "rewards/margins": 0.0026566418819129467, "rewards/rejected": 0.0024125645868480206, "step": 18 }, { "epoch": 0.2498536014054265, "grad_norm": 17.359137433037688, "learning_rate": 4e-08, "logits/chosen": -0.6625803709030151, "logits/rejected": -0.7203136682510376, "logps/chosen": -156.5331573486328, "logps/rejected": -222.60467529296875, "loss": 0.6923, "rewards/accuracies": 0.47265625, "rewards/chosen": 0.001491243951022625, "rewards/margins": -0.0024129198864102364, "rewards/rejected": 0.0039041636046022177, "step": 20 }, { "epoch": 0.27483896154596915, "grad_norm": 17.83000320149723, "learning_rate": 4.4e-08, "logits/chosen": -0.6529428958892822, "logits/rejected": -0.7184248566627502, "logps/chosen": -161.4114990234375, "logps/rejected": -171.13998413085938, "loss": 0.6933, "rewards/accuracies": 0.54296875, "rewards/chosen": 0.003907513804733753, "rewards/margins": 0.0027447110041975975, "rewards/rejected": 0.0011628026841208339, "step": 22 }, { "epoch": 0.2998243216865118, "grad_norm": 18.22059882059828, "learning_rate": 4.799999999999999e-08, "logits/chosen": -0.6906304955482483, "logits/rejected": -0.7680624723434448, "logps/chosen": -171.14309692382812, "logps/rejected": -251.1785888671875, "loss": 0.6916, "rewards/accuracies": 0.515625, "rewards/chosen": 0.004641087260097265, "rewards/margins": 0.0015279713552445173, "rewards/rejected": 0.0031131161376833916, "step": 24 }, { "epoch": 0.32480968182705444, "grad_norm": 16.628845467432836, "learning_rate": 5.2e-08, "logits/chosen": -0.6719599962234497, "logits/rejected": -0.743903636932373, "logps/chosen": -159.7659912109375, "logps/rejected": -191.9639129638672, "loss": 0.6898, "rewards/accuracies": 0.5625, "rewards/chosen": 0.004612984135746956, "rewards/margins": 0.00576308136805892, "rewards/rejected": -0.001150098629295826, "step": 26 }, { "epoch": 0.3497950419675971, "grad_norm": 17.41091157106264, "learning_rate": 5.6000000000000005e-08, "logits/chosen": -0.6741428375244141, "logits/rejected": -0.7584381699562073, "logps/chosen": -162.17498779296875, "logps/rejected": -213.79025268554688, "loss": 0.6896, "rewards/accuracies": 0.55859375, "rewards/chosen": 0.011129561811685562, "rewards/margins": 0.009353543631732464, "rewards/rejected": 0.0017760168993845582, "step": 28 }, { "epoch": 0.3747804021081398, "grad_norm": 17.981525024641783, "learning_rate": 6e-08, "logits/chosen": -0.6742160320281982, "logits/rejected": -0.7543560266494751, "logps/chosen": -164.4986114501953, "logps/rejected": -202.91433715820312, "loss": 0.6892, "rewards/accuracies": 0.52734375, "rewards/chosen": 0.006884717848151922, "rewards/margins": 0.0076943556778132915, "rewards/rejected": -0.000809638062492013, "step": 30 }, { "epoch": 0.39976576224868243, "grad_norm": 17.109113741756868, "learning_rate": 6.4e-08, "logits/chosen": -0.6298938989639282, "logits/rejected": -0.7012688517570496, "logps/chosen": -160.2120361328125, "logps/rejected": -160.2590789794922, "loss": 0.6895, "rewards/accuracies": 0.515625, "rewards/chosen": 0.010141907259821892, "rewards/margins": 0.007148087956011295, "rewards/rejected": 0.00299381953664124, "step": 32 }, { "epoch": 0.4247511223892251, "grad_norm": 17.727097440594363, "learning_rate": 6.8e-08, "logits/chosen": -0.6654178500175476, "logits/rejected": -0.7475502490997314, "logps/chosen": -174.74790954589844, "logps/rejected": -165.39630126953125, "loss": 0.6846, "rewards/accuracies": 0.63671875, "rewards/chosen": 0.022659441456198692, "rewards/margins": 0.020327560603618622, "rewards/rejected": 0.002331881085410714, "step": 34 }, { "epoch": 0.4497364825297677, "grad_norm": 16.090260229163803, "learning_rate": 7.2e-08, "logits/chosen": -0.6315876841545105, "logits/rejected": -0.7063596844673157, "logps/chosen": -158.3714141845703, "logps/rejected": -169.9803924560547, "loss": 0.6823, "rewards/accuracies": 0.62109375, "rewards/chosen": 0.02803650312125683, "rewards/margins": 0.024142108857631683, "rewards/rejected": 0.0038943937979638577, "step": 36 }, { "epoch": 0.47472184267031037, "grad_norm": 17.989654826552492, "learning_rate": 7.599999999999999e-08, "logits/chosen": -0.6781339645385742, "logits/rejected": -0.749303936958313, "logps/chosen": -158.1745147705078, "logps/rejected": -174.4684295654297, "loss": 0.6798, "rewards/accuracies": 0.6484375, "rewards/chosen": 0.035830847918987274, "rewards/margins": 0.030665559694170952, "rewards/rejected": 0.005165286362171173, "step": 38 }, { "epoch": 0.499707202810853, "grad_norm": 16.659252701258378, "learning_rate": 8e-08, "logits/chosen": -0.6752834916114807, "logits/rejected": -0.7593508958816528, "logps/chosen": -171.382568359375, "logps/rejected": -163.63429260253906, "loss": 0.6797, "rewards/accuracies": 0.65625, "rewards/chosen": 0.03702434524893761, "rewards/margins": 0.027536926791071892, "rewards/rejected": 0.009487415663897991, "step": 40 }, { "epoch": 0.5246925629513957, "grad_norm": 16.32043656758089, "learning_rate": 8.4e-08, "logits/chosen": -0.6655137538909912, "logits/rejected": -0.7505479454994202, "logps/chosen": -169.40570068359375, "logps/rejected": -204.10903930664062, "loss": 0.6792, "rewards/accuracies": 0.65625, "rewards/chosen": 0.03623414784669876, "rewards/margins": 0.026394760236144066, "rewards/rejected": 0.00983938854187727, "step": 42 }, { "epoch": 0.5496779230919383, "grad_norm": 15.622546817842467, "learning_rate": 8.8e-08, "logits/chosen": -0.6490427255630493, "logits/rejected": -0.7285715937614441, "logps/chosen": -156.1206817626953, "logps/rejected": -163.50775146484375, "loss": 0.678, "rewards/accuracies": 0.68359375, "rewards/chosen": 0.03998086601495743, "rewards/margins": 0.03349429368972778, "rewards/rejected": 0.006486575584858656, "step": 44 }, { "epoch": 0.574663283232481, "grad_norm": 16.184465147770208, "learning_rate": 9.2e-08, "logits/chosen": -0.6619192361831665, "logits/rejected": -0.7221629023551941, "logps/chosen": -165.5501251220703, "logps/rejected": -181.84275817871094, "loss": 0.674, "rewards/accuracies": 0.72265625, "rewards/chosen": 0.054740943014621735, "rewards/margins": 0.04307195544242859, "rewards/rejected": 0.011668986640870571, "step": 46 }, { "epoch": 0.5996486433730236, "grad_norm": 16.443566147600368, "learning_rate": 9.599999999999999e-08, "logits/chosen": -0.6748422384262085, "logits/rejected": -0.7496626973152161, "logps/chosen": -164.22450256347656, "logps/rejected": -171.41998291015625, "loss": 0.6607, "rewards/accuracies": 0.75390625, "rewards/chosen": 0.07237192243337631, "rewards/margins": 0.06578801572322845, "rewards/rejected": 0.006583897862583399, "step": 48 }, { "epoch": 0.6246340035135662, "grad_norm": 15.12731079703187, "learning_rate": 1e-07, "logits/chosen": -0.6667000651359558, "logits/rejected": -0.7247492074966431, "logps/chosen": -152.322998046875, "logps/rejected": -198.47271728515625, "loss": 0.6565, "rewards/accuracies": 0.7421875, "rewards/chosen": 0.08942899107933044, "rewards/margins": 0.06775067746639252, "rewards/rejected": 0.021678313612937927, "step": 50 }, { "epoch": 0.6496193636541089, "grad_norm": 14.509532935330286, "learning_rate": 1.04e-07, "logits/chosen": -0.6652993559837341, "logits/rejected": -0.761294960975647, "logps/chosen": -161.75856018066406, "logps/rejected": -202.80789184570312, "loss": 0.6513, "rewards/accuracies": 0.80859375, "rewards/chosen": 0.10889974981546402, "rewards/margins": 0.10048462450504303, "rewards/rejected": 0.008415117859840393, "step": 52 }, { "epoch": 0.6746047237946515, "grad_norm": 13.828860573682139, "learning_rate": 1.08e-07, "logits/chosen": -0.6644891500473022, "logits/rejected": -0.7297866940498352, "logps/chosen": -167.40745544433594, "logps/rejected": -191.5254669189453, "loss": 0.649, "rewards/accuracies": 0.76953125, "rewards/chosen": 0.10965421050786972, "rewards/margins": 0.09407318383455276, "rewards/rejected": 0.015581016428768635, "step": 54 }, { "epoch": 0.6995900839351942, "grad_norm": 14.094010354912754, "learning_rate": 1.1200000000000001e-07, "logits/chosen": -0.6829609274864197, "logits/rejected": -0.7518411874771118, "logps/chosen": -161.49169921875, "logps/rejected": -194.95101928710938, "loss": 0.6448, "rewards/accuracies": 0.75390625, "rewards/chosen": 0.10309572517871857, "rewards/margins": 0.10116783529520035, "rewards/rejected": 0.0019278817344456911, "step": 56 }, { "epoch": 0.7245754440757368, "grad_norm": 14.33157630919911, "learning_rate": 1.1599999999999999e-07, "logits/chosen": -0.678025484085083, "logits/rejected": -0.7501699924468994, "logps/chosen": -163.56793212890625, "logps/rejected": -198.6992950439453, "loss": 0.6439, "rewards/accuracies": 0.77734375, "rewards/chosen": 0.12135004997253418, "rewards/margins": 0.11319853365421295, "rewards/rejected": 0.008151513524353504, "step": 58 }, { "epoch": 0.7495608042162796, "grad_norm": 14.250114975850236, "learning_rate": 1.2e-07, "logits/chosen": -0.6545270681381226, "logits/rejected": -0.742324709892273, "logps/chosen": -177.452880859375, "logps/rejected": -257.645263671875, "loss": 0.6364, "rewards/accuracies": 0.74609375, "rewards/chosen": 0.11762025952339172, "rewards/margins": 0.10628640651702881, "rewards/rejected": 0.011333855800330639, "step": 60 }, { "epoch": 0.7745461643568222, "grad_norm": 13.35663678176419, "learning_rate": 1.24e-07, "logits/chosen": -0.6429523229598999, "logits/rejected": -0.7005941867828369, "logps/chosen": -160.99609375, "logps/rejected": -158.8332061767578, "loss": 0.6357, "rewards/accuracies": 0.75390625, "rewards/chosen": 0.11651378124952316, "rewards/margins": 0.11703144758939743, "rewards/rejected": -0.0005176601116545498, "step": 62 }, { "epoch": 0.7995315244973649, "grad_norm": 13.344011017214148, "learning_rate": 1.28e-07, "logits/chosen": -0.671217143535614, "logits/rejected": -0.7481105923652649, "logps/chosen": -164.79576110839844, "logps/rejected": -214.5727081298828, "loss": 0.6272, "rewards/accuracies": 0.79296875, "rewards/chosen": 0.13732488453388214, "rewards/margins": 0.14830084145069122, "rewards/rejected": -0.010975953191518784, "step": 64 }, { "epoch": 0.8245168846379075, "grad_norm": 11.603847422508784, "learning_rate": 1.32e-07, "logits/chosen": -0.6748225092887878, "logits/rejected": -0.7619199752807617, "logps/chosen": -165.9488525390625, "logps/rejected": -259.8753967285156, "loss": 0.615, "rewards/accuracies": 0.76171875, "rewards/chosen": 0.13191932439804077, "rewards/margins": 0.1852141171693802, "rewards/rejected": -0.05329480394721031, "step": 66 }, { "epoch": 0.8495022447784502, "grad_norm": 11.310140143623643, "learning_rate": 1.36e-07, "logits/chosen": -0.6559648513793945, "logits/rejected": -0.7262955904006958, "logps/chosen": -162.48326110839844, "logps/rejected": -168.70834350585938, "loss": 0.5908, "rewards/accuracies": 0.80078125, "rewards/chosen": 0.15893952548503876, "rewards/margins": 0.24133484065532684, "rewards/rejected": -0.08239532262086868, "step": 68 }, { "epoch": 0.8744876049189928, "grad_norm": 11.076590712903254, "learning_rate": 1.3999999999999998e-07, "logits/chosen": -0.64164799451828, "logits/rejected": -0.6981642246246338, "logps/chosen": -159.47991943359375, "logps/rejected": -164.59423828125, "loss": 0.588, "rewards/accuracies": 0.765625, "rewards/chosen": 0.11835081875324249, "rewards/margins": 0.2690027356147766, "rewards/rejected": -0.15065191686153412, "step": 70 }, { "epoch": 0.8994729650595354, "grad_norm": 11.30376463380917, "learning_rate": 1.44e-07, "logits/chosen": -0.6537081003189087, "logits/rejected": -0.7259843349456787, "logps/chosen": -163.25912475585938, "logps/rejected": -186.23190307617188, "loss": 0.5899, "rewards/accuracies": 0.73828125, "rewards/chosen": 0.036043643951416016, "rewards/margins": 0.22015729546546936, "rewards/rejected": -0.18411365151405334, "step": 72 }, { "epoch": 0.9244583252000781, "grad_norm": 11.470685518141812, "learning_rate": 1.48e-07, "logits/chosen": -0.6623800992965698, "logits/rejected": -0.7280963063240051, "logps/chosen": -163.83189392089844, "logps/rejected": -162.69908142089844, "loss": 0.5701, "rewards/accuracies": 0.7734375, "rewards/chosen": 0.0784626230597496, "rewards/margins": 0.33121681213378906, "rewards/rejected": -0.25275421142578125, "step": 74 }, { "epoch": 0.9369510052703494, "eval_logits/chosen": -0.6098010540008545, "eval_logits/rejected": -0.6948941946029663, "eval_logps/chosen": -174.5200653076172, "eval_logps/rejected": -156.43321228027344, "eval_loss": 0.5377179384231567, "eval_rewards/accuracies": 0.8399999737739563, "eval_rewards/chosen": 0.11009039729833603, "eval_rewards/margins": 0.3738202750682831, "eval_rewards/rejected": -0.26372990012168884, "eval_runtime": 29.7619, "eval_samples_per_second": 3.36, "eval_steps_per_second": 0.84, "step": 75 }, { "epoch": 0.9494436853406207, "grad_norm": 10.843228504877107, "learning_rate": 1.5199999999999998e-07, "logits/chosen": -0.6558808088302612, "logits/rejected": -0.7365143299102783, "logps/chosen": -167.81484985351562, "logps/rejected": -233.72686767578125, "loss": 0.5714, "rewards/accuracies": 0.78125, "rewards/chosen": 0.06528769433498383, "rewards/margins": 0.31673550605773926, "rewards/rejected": -0.2514478266239166, "step": 76 }, { "epoch": 0.9744290454811634, "grad_norm": 11.247238407003374, "learning_rate": 1.56e-07, "logits/chosen": -0.6631561517715454, "logits/rejected": -0.732117772102356, "logps/chosen": -158.63388061523438, "logps/rejected": -186.37835693359375, "loss": 0.5622, "rewards/accuracies": 0.80078125, "rewards/chosen": 0.03322272002696991, "rewards/margins": 0.3244516849517822, "rewards/rejected": -0.2912289500236511, "step": 78 }, { "epoch": 0.999414405621706, "grad_norm": 10.873879267366776, "learning_rate": 1.6e-07, "logits/chosen": -0.678787887096405, "logits/rejected": -0.762289822101593, "logps/chosen": -173.52723693847656, "logps/rejected": -212.18414306640625, "loss": 0.5644, "rewards/accuracies": 0.765625, "rewards/chosen": 0.04449426010251045, "rewards/margins": 0.29196038842201233, "rewards/rejected": -0.24746613204479218, "step": 80 }, { "epoch": 1.0243997657622488, "grad_norm": 10.813522808672303, "learning_rate": 1.6399999999999999e-07, "logits/chosen": -0.6670259833335876, "logits/rejected": -0.731939971446991, "logps/chosen": -169.05836486816406, "logps/rejected": -183.41171264648438, "loss": 0.5481, "rewards/accuracies": 0.8203125, "rewards/chosen": 0.09290473908185959, "rewards/margins": 0.3946535289287567, "rewards/rejected": -0.3017488121986389, "step": 82 }, { "epoch": 1.0493851259027913, "grad_norm": 10.85312267043124, "learning_rate": 1.68e-07, "logits/chosen": -0.6822367310523987, "logits/rejected": -0.7420221567153931, "logps/chosen": -168.18081665039062, "logps/rejected": -175.647705078125, "loss": 0.5433, "rewards/accuracies": 0.765625, "rewards/chosen": 0.017166346311569214, "rewards/margins": 0.35215744376182556, "rewards/rejected": -0.33499109745025635, "step": 84 }, { "epoch": 1.074370486043334, "grad_norm": 10.234969231047268, "learning_rate": 1.7199999999999998e-07, "logits/chosen": -0.6367188096046448, "logits/rejected": -0.7021892666816711, "logps/chosen": -168.204345703125, "logps/rejected": -172.60887145996094, "loss": 0.5389, "rewards/accuracies": 0.78515625, "rewards/chosen": 0.031016340479254723, "rewards/margins": 0.4050399959087372, "rewards/rejected": -0.3740236461162567, "step": 86 }, { "epoch": 1.0993558461838766, "grad_norm": 10.400074349909037, "learning_rate": 1.76e-07, "logits/chosen": -0.6521391272544861, "logits/rejected": -0.7159854769706726, "logps/chosen": -165.23777770996094, "logps/rejected": -179.17791748046875, "loss": 0.5255, "rewards/accuracies": 0.80859375, "rewards/chosen": 0.010984277352690697, "rewards/margins": 0.4423283338546753, "rewards/rejected": -0.43134409189224243, "step": 88 }, { "epoch": 1.1243412063244194, "grad_norm": 10.94221462203906, "learning_rate": 1.8e-07, "logits/chosen": -0.6484578847885132, "logits/rejected": -0.7167034149169922, "logps/chosen": -164.0042724609375, "logps/rejected": -189.57470703125, "loss": 0.4977, "rewards/accuracies": 0.80859375, "rewards/chosen": 0.044118743389844894, "rewards/margins": 0.5543583035469055, "rewards/rejected": -0.5102395415306091, "step": 90 }, { "epoch": 1.149326566464962, "grad_norm": 10.061790706384036, "learning_rate": 1.84e-07, "logits/chosen": -0.6450331211090088, "logits/rejected": -0.6989036798477173, "logps/chosen": -161.26258850097656, "logps/rejected": -176.26287841796875, "loss": 0.4822, "rewards/accuracies": 0.8046875, "rewards/chosen": -0.09776711463928223, "rewards/margins": 0.5529555678367615, "rewards/rejected": -0.6507226824760437, "step": 92 }, { "epoch": 1.1743119266055047, "grad_norm": 10.181525940489752, "learning_rate": 1.88e-07, "logits/chosen": -0.6763854026794434, "logits/rejected": -0.7379953861236572, "logps/chosen": -162.50949096679688, "logps/rejected": -195.75962829589844, "loss": 0.467, "rewards/accuracies": 0.84765625, "rewards/chosen": -0.21141284704208374, "rewards/margins": 0.6165250539779663, "rewards/rejected": -0.8279378414154053, "step": 94 }, { "epoch": 1.1992972867460472, "grad_norm": 9.59098495871882, "learning_rate": 1.9199999999999997e-07, "logits/chosen": -0.6583154201507568, "logits/rejected": -0.7248339653015137, "logps/chosen": -171.67164611816406, "logps/rejected": -204.2442626953125, "loss": 0.4525, "rewards/accuracies": 0.83203125, "rewards/chosen": -0.22590558230876923, "rewards/margins": 0.7413816452026367, "rewards/rejected": -0.9672871828079224, "step": 96 }, { "epoch": 1.22428264688659, "grad_norm": 9.611151986852143, "learning_rate": 1.9599999999999998e-07, "logits/chosen": -0.6739534139633179, "logits/rejected": -0.7209540605545044, "logps/chosen": -165.88185119628906, "logps/rejected": -198.14913940429688, "loss": 0.4362, "rewards/accuracies": 0.83984375, "rewards/chosen": -0.4009418785572052, "rewards/margins": 0.8526190519332886, "rewards/rejected": -1.2535607814788818, "step": 98 }, { "epoch": 1.2492680070271325, "grad_norm": 9.635547492152954, "learning_rate": 2e-07, "logits/chosen": -0.6529893279075623, "logits/rejected": -0.7117218971252441, "logps/chosen": -167.87130737304688, "logps/rejected": -199.54925537109375, "loss": 0.4253, "rewards/accuracies": 0.859375, "rewards/chosen": -0.5539758205413818, "rewards/margins": 0.8746498823165894, "rewards/rejected": -1.4286257028579712, "step": 100 }, { "epoch": 1.2742533671676752, "grad_norm": 9.323014566726187, "learning_rate": 1.9945218953682733e-07, "logits/chosen": -0.6634210348129272, "logits/rejected": -0.7515499591827393, "logps/chosen": -179.8871307373047, "logps/rejected": -211.56712341308594, "loss": 0.4162, "rewards/accuracies": 0.8828125, "rewards/chosen": -0.6504544615745544, "rewards/margins": 0.9477463364601135, "rewards/rejected": -1.598200798034668, "step": 102 }, { "epoch": 1.2992387273082178, "grad_norm": 9.782518483212584, "learning_rate": 1.9781476007338056e-07, "logits/chosen": -0.6890003681182861, "logits/rejected": -0.7631358504295349, "logps/chosen": -178.91226196289062, "logps/rejected": -221.65196228027344, "loss": 0.4111, "rewards/accuracies": 0.84375, "rewards/chosen": -0.8466004729270935, "rewards/margins": 0.9782698154449463, "rewards/rejected": -1.8248703479766846, "step": 104 }, { "epoch": 1.3242240874487605, "grad_norm": 9.676632000972155, "learning_rate": 1.9510565162951537e-07, "logits/chosen": -0.6939510703086853, "logits/rejected": -0.7790961861610413, "logps/chosen": -175.7962188720703, "logps/rejected": -225.33914184570312, "loss": 0.3898, "rewards/accuracies": 0.85546875, "rewards/chosen": -0.8194781541824341, "rewards/margins": 1.1897538900375366, "rewards/rejected": -2.0092320442199707, "step": 106 }, { "epoch": 1.349209447589303, "grad_norm": 9.450466029531317, "learning_rate": 1.9135454576426007e-07, "logits/chosen": -0.6339809894561768, "logits/rejected": -0.7058581113815308, "logps/chosen": -176.75390625, "logps/rejected": -183.5647430419922, "loss": 0.3804, "rewards/accuracies": 0.83984375, "rewards/chosen": -0.9991697072982788, "rewards/margins": 1.0066075325012207, "rewards/rejected": -2.00577712059021, "step": 108 }, { "epoch": 1.3741948077298458, "grad_norm": 9.045883620582659, "learning_rate": 1.8660254037844388e-07, "logits/chosen": -0.6670467853546143, "logits/rejected": -0.7262380123138428, "logps/chosen": -170.4173126220703, "logps/rejected": -232.4281005859375, "loss": 0.3591, "rewards/accuracies": 0.89453125, "rewards/chosen": -0.9117798805236816, "rewards/margins": 1.2910921573638916, "rewards/rejected": -2.2028720378875732, "step": 110 }, { "epoch": 1.3991801678703886, "grad_norm": 8.752793412837908, "learning_rate": 1.8090169943749475e-07, "logits/chosen": -0.66654372215271, "logits/rejected": -0.738584041595459, "logps/chosen": -168.66757202148438, "logps/rejected": -189.63882446289062, "loss": 0.3534, "rewards/accuracies": 0.8515625, "rewards/chosen": -0.9708907604217529, "rewards/margins": 1.3288507461547852, "rewards/rejected": -2.299741506576538, "step": 112 }, { "epoch": 1.424165528010931, "grad_norm": 8.754108546199273, "learning_rate": 1.7431448254773942e-07, "logits/chosen": -0.6560633778572083, "logits/rejected": -0.7114984393119812, "logps/chosen": -163.83883666992188, "logps/rejected": -181.41429138183594, "loss": 0.3265, "rewards/accuracies": 0.86328125, "rewards/chosen": -1.0229204893112183, "rewards/margins": 1.4013088941574097, "rewards/rejected": -2.424229383468628, "step": 114 }, { "epoch": 1.4491508881514736, "grad_norm": 8.842470960325368, "learning_rate": 1.669130606358858e-07, "logits/chosen": -0.6747975945472717, "logits/rejected": -0.7556227445602417, "logps/chosen": -177.47671508789062, "logps/rejected": -229.03138732910156, "loss": 0.3496, "rewards/accuracies": 0.84765625, "rewards/chosen": -1.2031465768814087, "rewards/margins": 1.2502222061157227, "rewards/rejected": -2.453368663787842, "step": 116 }, { "epoch": 1.4741362482920164, "grad_norm": 8.560944140870841, "learning_rate": 1.5877852522924732e-07, "logits/chosen": -0.6661523580551147, "logits/rejected": -0.7374821305274963, "logps/chosen": -179.45278930664062, "logps/rejected": -200.83184814453125, "loss": 0.3463, "rewards/accuracies": 0.83984375, "rewards/chosen": -1.274424433708191, "rewards/margins": 1.3644013404846191, "rewards/rejected": -2.6388256549835205, "step": 118 }, { "epoch": 1.4991216084325591, "grad_norm": 8.135071032264696, "learning_rate": 1.5e-07, "logits/chosen": -0.6814154982566833, "logits/rejected": -0.7541234493255615, "logps/chosen": -176.5241241455078, "logps/rejected": -178.12158203125, "loss": 0.3172, "rewards/accuracies": 0.8671875, "rewards/chosen": -1.236649513244629, "rewards/margins": 1.4433623552322388, "rewards/rejected": -2.680011749267578, "step": 120 }, { "epoch": 1.5241069685731017, "grad_norm": 8.611381059926462, "learning_rate": 1.4067366430758004e-07, "logits/chosen": -0.7052810192108154, "logits/rejected": -0.7841841578483582, "logps/chosen": -176.52615356445312, "logps/rejected": -265.2866516113281, "loss": 0.3464, "rewards/accuracies": 0.88671875, "rewards/chosen": -1.2547199726104736, "rewards/margins": 1.5170029401779175, "rewards/rejected": -2.7717230319976807, "step": 122 }, { "epoch": 1.5490923287136442, "grad_norm": 8.252873140772632, "learning_rate": 1.3090169943749475e-07, "logits/chosen": -0.6520602107048035, "logits/rejected": -0.7327940464019775, "logps/chosen": -176.3183135986328, "logps/rejected": -229.5253448486328, "loss": 0.3087, "rewards/accuracies": 0.88671875, "rewards/chosen": -1.2032685279846191, "rewards/margins": 1.7557697296142578, "rewards/rejected": -2.959038257598877, "step": 124 }, { "epoch": 1.574077688854187, "grad_norm": 10.055814181219855, "learning_rate": 1.207911690817759e-07, "logits/chosen": -0.6678023338317871, "logits/rejected": -0.7291412949562073, "logps/chosen": -170.61351013183594, "logps/rejected": -191.97714233398438, "loss": 0.3451, "rewards/accuracies": 0.88671875, "rewards/chosen": -1.2367451190948486, "rewards/margins": 1.7602653503417969, "rewards/rejected": -2.9970104694366455, "step": 126 }, { "epoch": 1.5990630489947297, "grad_norm": 7.755494267888693, "learning_rate": 1.1045284632676535e-07, "logits/chosen": -0.6732159852981567, "logits/rejected": -0.7458621263504028, "logps/chosen": -179.55528259277344, "logps/rejected": -194.04571533203125, "loss": 0.3013, "rewards/accuracies": 0.9140625, "rewards/chosen": -1.2336257696151733, "rewards/margins": 1.7932240962982178, "rewards/rejected": -3.0268499851226807, "step": 128 }, { "epoch": 1.6240484091352723, "grad_norm": 8.333802703761862, "learning_rate": 1e-07, "logits/chosen": -0.6721549034118652, "logits/rejected": -0.7458239793777466, "logps/chosen": -182.87872314453125, "logps/rejected": -235.57015991210938, "loss": 0.3049, "rewards/accuracies": 0.89453125, "rewards/chosen": -1.2039211988449097, "rewards/margins": 1.8136268854141235, "rewards/rejected": -3.017548084259033, "step": 130 }, { "epoch": 1.6490337692758148, "grad_norm": 8.273848396450413, "learning_rate": 8.954715367323466e-08, "logits/chosen": -0.693534255027771, "logits/rejected": -0.7786884307861328, "logps/chosen": -180.05812072753906, "logps/rejected": -233.4104461669922, "loss": 0.2977, "rewards/accuracies": 0.92578125, "rewards/chosen": -1.2325382232666016, "rewards/margins": 1.9344900846481323, "rewards/rejected": -3.1670281887054443, "step": 132 }, { "epoch": 1.6740191294163576, "grad_norm": 7.810872252646391, "learning_rate": 7.920883091822408e-08, "logits/chosen": -0.6568552255630493, "logits/rejected": -0.7265664935112, "logps/chosen": -176.93911743164062, "logps/rejected": -214.8884735107422, "loss": 0.2827, "rewards/accuracies": 0.8984375, "rewards/chosen": -1.2643663883209229, "rewards/margins": 1.8012300729751587, "rewards/rejected": -3.065596580505371, "step": 134 }, { "epoch": 1.6990044895569003, "grad_norm": 7.802511130852569, "learning_rate": 6.909830056250527e-08, "logits/chosen": -0.691701352596283, "logits/rejected": -0.7584172487258911, "logps/chosen": -180.76043701171875, "logps/rejected": -201.7536163330078, "loss": 0.2923, "rewards/accuracies": 0.8828125, "rewards/chosen": -1.3751585483551025, "rewards/margins": 1.8911828994750977, "rewards/rejected": -3.2663414478302, "step": 136 }, { "epoch": 1.723989849697443, "grad_norm": 7.661437671496506, "learning_rate": 5.9326335692419996e-08, "logits/chosen": -0.691138744354248, "logits/rejected": -0.7739748954772949, "logps/chosen": -179.13803100585938, "logps/rejected": -230.07017517089844, "loss": 0.2703, "rewards/accuracies": 0.9140625, "rewards/chosen": -1.3471490144729614, "rewards/margins": 2.1134843826293945, "rewards/rejected": -3.4606332778930664, "step": 138 }, { "epoch": 1.7489752098379856, "grad_norm": 7.425320729921027, "learning_rate": 5.000000000000002e-08, "logits/chosen": -0.6808772087097168, "logits/rejected": -0.7619104385375977, "logps/chosen": -178.50820922851562, "logps/rejected": -221.21847534179688, "loss": 0.2903, "rewards/accuracies": 0.8828125, "rewards/chosen": -1.328997015953064, "rewards/margins": 1.9231306314468384, "rewards/rejected": -3.2521276473999023, "step": 140 }, { "epoch": 1.7739605699785281, "grad_norm": 7.517073157187585, "learning_rate": 4.1221474770752695e-08, "logits/chosen": -0.6579867005348206, "logits/rejected": -0.7242329716682434, "logps/chosen": -174.0385284423828, "logps/rejected": -226.00914001464844, "loss": 0.2703, "rewards/accuracies": 0.9140625, "rewards/chosen": -1.3584879636764526, "rewards/margins": 1.9965893030166626, "rewards/rejected": -3.3550772666931152, "step": 142 }, { "epoch": 1.798945930119071, "grad_norm": 7.848642238408611, "learning_rate": 3.3086939364114206e-08, "logits/chosen": -0.6827540397644043, "logits/rejected": -0.7463814616203308, "logps/chosen": -181.77685546875, "logps/rejected": -202.3132781982422, "loss": 0.2875, "rewards/accuracies": 0.9140625, "rewards/chosen": -1.3786863088607788, "rewards/margins": 1.8975354433059692, "rewards/rejected": -3.276221990585327, "step": 144 }, { "epoch": 1.8239312902596136, "grad_norm": 7.224249615030409, "learning_rate": 2.5685517452260564e-08, "logits/chosen": -0.6343103647232056, "logits/rejected": -0.7141076326370239, "logps/chosen": -190.22946166992188, "logps/rejected": -238.4442901611328, "loss": 0.2742, "rewards/accuracies": 0.90625, "rewards/chosen": -1.3857475519180298, "rewards/margins": 2.068246841430664, "rewards/rejected": -3.4539945125579834, "step": 146 }, { "epoch": 1.8489166504001562, "grad_norm": 7.7461193794138135, "learning_rate": 1.9098300562505266e-08, "logits/chosen": -0.6793495416641235, "logits/rejected": -0.7641343474388123, "logps/chosen": -187.63839721679688, "logps/rejected": -191.110107421875, "loss": 0.2835, "rewards/accuracies": 0.890625, "rewards/chosen": -1.504347801208496, "rewards/margins": 1.7995954751968384, "rewards/rejected": -3.303943395614624, "step": 148 }, { "epoch": 1.8739020105406987, "grad_norm": 7.689771419387666, "learning_rate": 1.3397459621556128e-08, "logits/chosen": -0.6912616491317749, "logits/rejected": -0.7680445313453674, "logps/chosen": -182.9419708251953, "logps/rejected": -200.2938690185547, "loss": 0.2655, "rewards/accuracies": 0.9296875, "rewards/chosen": -1.4619263410568237, "rewards/margins": 2.098031759262085, "rewards/rejected": -3.559957981109619, "step": 150 }, { "epoch": 1.8739020105406987, "eval_logits/chosen": -0.6168845891952515, "eval_logits/rejected": -0.7024461627006531, "eval_logps/chosen": -189.5503692626953, "eval_logps/rejected": -188.0495147705078, "eval_loss": 0.26570188999176025, "eval_rewards/accuracies": 0.8399999737739563, "eval_rewards/chosen": -1.3929405212402344, "eval_rewards/margins": 2.0324180126190186, "eval_rewards/rejected": -3.425358533859253, "eval_runtime": 29.4565, "eval_samples_per_second": 3.395, "eval_steps_per_second": 0.849, "step": 150 }, { "epoch": 1.8988873706812415, "grad_norm": 8.195993031555705, "learning_rate": 8.645454235739902e-09, "logits/chosen": -0.676539957523346, "logits/rejected": -0.7394694685935974, "logps/chosen": -181.1811065673828, "logps/rejected": -207.39080810546875, "loss": 0.2738, "rewards/accuracies": 0.921875, "rewards/chosen": -1.422343373298645, "rewards/margins": 1.9836503267288208, "rewards/rejected": -3.405993700027466, "step": 152 }, { "epoch": 1.9238727308217842, "grad_norm": 7.242463445087286, "learning_rate": 4.8943483704846465e-09, "logits/chosen": -0.656994104385376, "logits/rejected": -0.7103748321533203, "logps/chosen": -183.3748321533203, "logps/rejected": -188.81219482421875, "loss": 0.2718, "rewards/accuracies": 0.89453125, "rewards/chosen": -1.4463797807693481, "rewards/margins": 1.880941390991211, "rewards/rejected": -3.3273210525512695, "step": 154 }, { "epoch": 1.9488580909623268, "grad_norm": 6.9367447852277575, "learning_rate": 2.1852399266194312e-09, "logits/chosen": -0.6909129619598389, "logits/rejected": -0.7760818004608154, "logps/chosen": -177.80374145507812, "logps/rejected": -266.32806396484375, "loss": 0.2686, "rewards/accuracies": 0.92578125, "rewards/chosen": -1.4405275583267212, "rewards/margins": 2.0439579486846924, "rewards/rejected": -3.484485626220703, "step": 156 }, { "epoch": 1.9738434511028693, "grad_norm": 7.621192478848495, "learning_rate": 5.47810463172671e-10, "logits/chosen": -0.6629250049591064, "logits/rejected": -0.7373142242431641, "logps/chosen": -183.51670837402344, "logps/rejected": -190.675537109375, "loss": 0.2845, "rewards/accuracies": 0.921875, "rewards/chosen": -1.451604962348938, "rewards/margins": 2.0776655673980713, "rewards/rejected": -3.5292704105377197, "step": 158 }, { "epoch": 1.998828811243412, "grad_norm": 7.292176844709846, "learning_rate": 0.0, "logits/chosen": -0.745610773563385, "logits/rejected": -0.8196827173233032, "logps/chosen": -172.93553161621094, "logps/rejected": -271.3219299316406, "loss": 0.264, "rewards/accuracies": 0.9140625, "rewards/chosen": -1.2879290580749512, "rewards/margins": 2.1956043243408203, "rewards/rejected": -3.4835333824157715, "step": 160 } ], "logging_steps": 2, "max_steps": 160, "num_input_tokens_seen": 0, "num_train_epochs": 2, "save_steps": 150, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 0.0, "train_batch_size": 1, "trial_name": null, "trial_params": null }