{ "best_metric": null, "best_model_checkpoint": null, "epoch": 1.9946666666666668, "eval_steps": 500, "global_step": 374, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.02666666666666667, "grad_norm": 91.80353546142578, "learning_rate": 1.3157894736842104e-08, "logits/chosen": 0.05448655039072037, "logits/rejected": 0.061774831265211105, "logps/chosen": -70.88352966308594, "logps/rejected": -68.78348541259766, "loss": 0.6945, "rewards/accuracies": 0.3499999940395355, "rewards/chosen": -0.007239150814712048, "rewards/margins": 0.00464630126953125, "rewards/rejected": -0.011885452084243298, "step": 5 }, { "epoch": 0.05333333333333334, "grad_norm": 98.0750732421875, "learning_rate": 2.6315789473684208e-08, "logits/chosen": -0.09558521211147308, "logits/rejected": -0.09429871290922165, "logps/chosen": -47.07535934448242, "logps/rejected": -57.48785400390625, "loss": 0.6855, "rewards/accuracies": 0.5, "rewards/chosen": 0.04929858446121216, "rewards/margins": 0.017685014754533768, "rewards/rejected": 0.03161356970667839, "step": 10 }, { "epoch": 0.08, "grad_norm": 86.46255493164062, "learning_rate": 3.947368421052631e-08, "logits/chosen": -0.0949799194931984, "logits/rejected": -0.08726556599140167, "logps/chosen": -58.88152313232422, "logps/rejected": -69.49443054199219, "loss": 0.6504, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": 0.033757805824279785, "rewards/margins": 0.1223212480545044, "rewards/rejected": -0.08856344223022461, "step": 15 }, { "epoch": 0.10666666666666667, "grad_norm": 85.12960052490234, "learning_rate": 5.2631578947368416e-08, "logits/chosen": 0.04131064563989639, "logits/rejected": 0.047180645167827606, "logps/chosen": -64.10095977783203, "logps/rejected": -71.49341583251953, "loss": 0.622, "rewards/accuracies": 0.75, "rewards/chosen": 0.08759395778179169, "rewards/margins": 0.22742386162281036, "rewards/rejected": -0.13982990384101868, "step": 20 }, { "epoch": 0.13333333333333333, "grad_norm": 72.46548461914062, "learning_rate": 6.578947368421053e-08, "logits/chosen": 0.07103381305932999, "logits/rejected": 0.06991696357727051, "logps/chosen": -71.91819763183594, "logps/rejected": -76.61006927490234, "loss": 0.6107, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 0.14151427149772644, "rewards/margins": 0.2075047492980957, "rewards/rejected": -0.06599047034978867, "step": 25 }, { "epoch": 0.16, "grad_norm": 71.43992614746094, "learning_rate": 7.894736842105262e-08, "logits/chosen": -0.0016497105825692415, "logits/rejected": -0.003721952438354492, "logps/chosen": -64.13134765625, "logps/rejected": -59.88336181640625, "loss": 0.5604, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 0.14888896048069, "rewards/margins": 0.5770025253295898, "rewards/rejected": -0.4281136095523834, "step": 30 }, { "epoch": 0.18666666666666668, "grad_norm": 86.11872100830078, "learning_rate": 9.210526315789473e-08, "logits/chosen": 0.09134344756603241, "logits/rejected": 0.08688181638717651, "logps/chosen": -74.20964050292969, "logps/rejected": -66.22145080566406, "loss": 0.5322, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 0.3988303542137146, "rewards/margins": 0.3464100956916809, "rewards/rejected": 0.052420247346162796, "step": 35 }, { "epoch": 0.21333333333333335, "grad_norm": 83.05272674560547, "learning_rate": 1e-07, "logits/chosen": -0.051578253507614136, "logits/rejected": -0.04844808578491211, "logps/chosen": -91.24781036376953, "logps/rejected": -73.64341735839844, "loss": 0.5272, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": 0.11290506273508072, "rewards/margins": 0.25997716188430786, "rewards/rejected": -0.14707207679748535, "step": 40 }, { "epoch": 0.24, "grad_norm": 74.43838500976562, "learning_rate": 1e-07, "logits/chosen": -0.06837300211191177, "logits/rejected": -0.06631087511777878, "logps/chosen": -60.6038818359375, "logps/rejected": -62.58821487426758, "loss": 0.5138, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": 0.5276230573654175, "rewards/margins": 0.8340091705322266, "rewards/rejected": -0.30638617277145386, "step": 45 }, { "epoch": 0.26666666666666666, "grad_norm": 62.49097442626953, "learning_rate": 1e-07, "logits/chosen": -0.00017045289860107005, "logits/rejected": -0.0001672551006777212, "logps/chosen": -82.781005859375, "logps/rejected": -82.09158325195312, "loss": 0.4674, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": 0.34503522515296936, "rewards/margins": 1.0920366048812866, "rewards/rejected": -0.7470014691352844, "step": 50 }, { "epoch": 0.29333333333333333, "grad_norm": 80.10330200195312, "learning_rate": 1e-07, "logits/chosen": -0.02159346453845501, "logits/rejected": -0.023507962003350258, "logps/chosen": -66.51358032226562, "logps/rejected": -65.78565979003906, "loss": 0.5481, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": 0.3966673016548157, "rewards/margins": 0.8023613095283508, "rewards/rejected": -0.4056939482688904, "step": 55 }, { "epoch": 0.32, "grad_norm": 67.92163848876953, "learning_rate": 1e-07, "logits/chosen": -0.02508384920656681, "logits/rejected": -0.023196673020720482, "logps/chosen": -69.94526672363281, "logps/rejected": -54.190528869628906, "loss": 0.5126, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 0.5231302976608276, "rewards/margins": 0.935653805732727, "rewards/rejected": -0.41252344846725464, "step": 60 }, { "epoch": 0.3466666666666667, "grad_norm": 62.79459762573242, "learning_rate": 1e-07, "logits/chosen": -0.14604972302913666, "logits/rejected": -0.1421947032213211, "logps/chosen": -71.80470275878906, "logps/rejected": -62.09037399291992, "loss": 0.5224, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 0.6033689379692078, "rewards/margins": 0.9922472238540649, "rewards/rejected": -0.3888782560825348, "step": 65 }, { "epoch": 0.37333333333333335, "grad_norm": 63.457252502441406, "learning_rate": 1e-07, "logits/chosen": 0.06079145520925522, "logits/rejected": 0.07840003073215485, "logps/chosen": -71.81758117675781, "logps/rejected": -85.1531982421875, "loss": 0.39, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 0.24788689613342285, "rewards/margins": 1.4833877086639404, "rewards/rejected": -1.2355008125305176, "step": 70 }, { "epoch": 0.4, "grad_norm": 69.04164123535156, "learning_rate": 1e-07, "logits/chosen": 0.030724067240953445, "logits/rejected": 0.037750810384750366, "logps/chosen": -63.71375274658203, "logps/rejected": -74.11785888671875, "loss": 0.501, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 0.5482760667800903, "rewards/margins": 1.5258257389068604, "rewards/rejected": -0.9775495529174805, "step": 75 }, { "epoch": 0.4266666666666667, "grad_norm": 68.25962829589844, "learning_rate": 1e-07, "logits/chosen": -0.0079464977607131, "logits/rejected": 0.0006744593265466392, "logps/chosen": -44.04719924926758, "logps/rejected": -63.342002868652344, "loss": 0.441, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 0.5909361839294434, "rewards/margins": 0.9477832913398743, "rewards/rejected": -0.3568470776081085, "step": 80 }, { "epoch": 0.4533333333333333, "grad_norm": 61.337249755859375, "learning_rate": 1e-07, "logits/chosen": -0.13302002847194672, "logits/rejected": -0.12650710344314575, "logps/chosen": -76.5062255859375, "logps/rejected": -77.22346496582031, "loss": 0.3969, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 0.7510318160057068, "rewards/margins": 1.1422964334487915, "rewards/rejected": -0.3912646174430847, "step": 85 }, { "epoch": 0.48, "grad_norm": 60.21578598022461, "learning_rate": 1e-07, "logits/chosen": 0.08266867697238922, "logits/rejected": 0.08583595603704453, "logps/chosen": -85.27580261230469, "logps/rejected": -83.88575744628906, "loss": 0.4887, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": 0.2144974172115326, "rewards/margins": 1.155656337738037, "rewards/rejected": -0.9411588907241821, "step": 90 }, { "epoch": 0.5066666666666667, "grad_norm": 78.39936828613281, "learning_rate": 1e-07, "logits/chosen": -0.018161656334996223, "logits/rejected": -0.012231660075485706, "logps/chosen": -72.49486541748047, "logps/rejected": -81.66197967529297, "loss": 0.4794, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 0.487576961517334, "rewards/margins": 1.1082786321640015, "rewards/rejected": -0.6207017302513123, "step": 95 }, { "epoch": 0.5333333333333333, "grad_norm": 72.66243743896484, "learning_rate": 1e-07, "logits/chosen": -0.11775938421487808, "logits/rejected": -0.10979805141687393, "logps/chosen": -37.96864318847656, "logps/rejected": -41.957908630371094, "loss": 0.4535, "rewards/accuracies": 0.75, "rewards/chosen": 0.5172302722930908, "rewards/margins": 0.8645200729370117, "rewards/rejected": -0.3472897410392761, "step": 100 }, { "epoch": 0.56, "grad_norm": 84.33699798583984, "learning_rate": 1e-07, "logits/chosen": -0.07096003741025925, "logits/rejected": -0.06229569762945175, "logps/chosen": -79.24551391601562, "logps/rejected": -73.00336456298828, "loss": 0.4727, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 0.48409804701805115, "rewards/margins": 1.5594733953475952, "rewards/rejected": -1.0753753185272217, "step": 105 }, { "epoch": 0.5866666666666667, "grad_norm": 75.8474349975586, "learning_rate": 1e-07, "logits/chosen": 0.002287261188030243, "logits/rejected": 0.011448122560977936, "logps/chosen": -36.34912872314453, "logps/rejected": -46.311866760253906, "loss": 0.5285, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 0.3635416328907013, "rewards/margins": 0.3584732413291931, "rewards/rejected": 0.005068361759185791, "step": 110 }, { "epoch": 0.6133333333333333, "grad_norm": 75.15863800048828, "learning_rate": 1e-07, "logits/chosen": -0.07114384323358536, "logits/rejected": -0.06986474245786667, "logps/chosen": -53.876953125, "logps/rejected": -52.810569763183594, "loss": 0.4579, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 0.7901820540428162, "rewards/margins": 1.1014235019683838, "rewards/rejected": -0.3112414479255676, "step": 115 }, { "epoch": 0.64, "grad_norm": 64.49634552001953, "learning_rate": 1e-07, "logits/chosen": -0.023414045572280884, "logits/rejected": -0.02103838510811329, "logps/chosen": -58.588645935058594, "logps/rejected": -54.12468719482422, "loss": 0.4412, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 0.6539210081100464, "rewards/margins": 1.1925280094146729, "rewards/rejected": -0.5386068224906921, "step": 120 }, { "epoch": 0.6666666666666666, "grad_norm": 77.18801879882812, "learning_rate": 1e-07, "logits/chosen": 0.07651327550411224, "logits/rejected": 0.09625453501939774, "logps/chosen": -86.07091522216797, "logps/rejected": -89.9097671508789, "loss": 0.449, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 0.31949272751808167, "rewards/margins": 1.6389261484146118, "rewards/rejected": -1.3194334506988525, "step": 125 }, { "epoch": 0.6933333333333334, "grad_norm": 56.2886848449707, "learning_rate": 1e-07, "logits/chosen": 0.029146382585167885, "logits/rejected": 0.03658589348196983, "logps/chosen": -73.04118347167969, "logps/rejected": -76.2184829711914, "loss": 0.4175, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 0.45680832862854004, "rewards/margins": 1.049865484237671, "rewards/rejected": -0.5930570960044861, "step": 130 }, { "epoch": 0.72, "grad_norm": 77.33464813232422, "learning_rate": 1e-07, "logits/chosen": -0.075675830245018, "logits/rejected": -0.0697128027677536, "logps/chosen": -76.72476959228516, "logps/rejected": -83.47200012207031, "loss": 0.4749, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 0.5277665853500366, "rewards/margins": 1.3413928747177124, "rewards/rejected": -0.8136262893676758, "step": 135 }, { "epoch": 0.7466666666666667, "grad_norm": 86.7103271484375, "learning_rate": 1e-07, "logits/chosen": -0.06388665735721588, "logits/rejected": -0.055876873433589935, "logps/chosen": -48.2696647644043, "logps/rejected": -43.421836853027344, "loss": 0.4583, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 0.27933669090270996, "rewards/margins": 0.9728415608406067, "rewards/rejected": -0.6935049295425415, "step": 140 }, { "epoch": 0.7733333333333333, "grad_norm": 77.15120697021484, "learning_rate": 1e-07, "logits/chosen": -0.04482016712427139, "logits/rejected": -0.0454244539141655, "logps/chosen": -58.773719787597656, "logps/rejected": -54.565284729003906, "loss": 0.4598, "rewards/accuracies": 0.550000011920929, "rewards/chosen": 0.33584028482437134, "rewards/margins": 0.4276786744594574, "rewards/rejected": -0.09183841943740845, "step": 145 }, { "epoch": 0.8, "grad_norm": 62.76329803466797, "learning_rate": 1e-07, "logits/chosen": -0.046587713062763214, "logits/rejected": -0.04582952335476875, "logps/chosen": -77.17513275146484, "logps/rejected": -81.3776626586914, "loss": 0.414, "rewards/accuracies": 0.75, "rewards/chosen": -0.00032804012880660594, "rewards/margins": 0.8608261346817017, "rewards/rejected": -0.8611541986465454, "step": 150 }, { "epoch": 0.8266666666666667, "grad_norm": 48.6246337890625, "learning_rate": 1e-07, "logits/chosen": -0.0023877639323472977, "logits/rejected": 0.000560196116566658, "logps/chosen": -61.36391067504883, "logps/rejected": -74.22142028808594, "loss": 0.3678, "rewards/accuracies": 0.75, "rewards/chosen": 0.5382565855979919, "rewards/margins": 1.6945278644561768, "rewards/rejected": -1.15627121925354, "step": 155 }, { "epoch": 0.8533333333333334, "grad_norm": 66.40213775634766, "learning_rate": 1e-07, "logits/chosen": -0.07235555350780487, "logits/rejected": -0.06712132692337036, "logps/chosen": -47.608489990234375, "logps/rejected": -58.56159591674805, "loss": 0.4747, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": 0.3097238540649414, "rewards/margins": 0.753086268901825, "rewards/rejected": -0.44336241483688354, "step": 160 }, { "epoch": 0.88, "grad_norm": 56.45853805541992, "learning_rate": 1e-07, "logits/chosen": 0.007908957079052925, "logits/rejected": 0.008793281391263008, "logps/chosen": -56.72149658203125, "logps/rejected": -59.84600067138672, "loss": 0.4839, "rewards/accuracies": 0.550000011920929, "rewards/chosen": 0.4165565073490143, "rewards/margins": 0.6458317637443542, "rewards/rejected": -0.22927527129650116, "step": 165 }, { "epoch": 0.9066666666666666, "grad_norm": 56.173946380615234, "learning_rate": 1e-07, "logits/chosen": 0.030985862016677856, "logits/rejected": 0.03558924049139023, "logps/chosen": -62.272987365722656, "logps/rejected": -69.59276580810547, "loss": 0.4139, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": 0.3212319016456604, "rewards/margins": 1.2905486822128296, "rewards/rejected": -0.9693166613578796, "step": 170 }, { "epoch": 0.9333333333333333, "grad_norm": 78.22785949707031, "learning_rate": 1e-07, "logits/chosen": 0.0006909176590852439, "logits/rejected": 0.002517472254112363, "logps/chosen": -59.8375129699707, "logps/rejected": -58.26896286010742, "loss": 0.3862, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": 0.49182993173599243, "rewards/margins": 1.2024425268173218, "rewards/rejected": -0.7106126546859741, "step": 175 }, { "epoch": 0.96, "grad_norm": 50.518157958984375, "learning_rate": 1e-07, "logits/chosen": 0.03367740660905838, "logits/rejected": 0.04943201318383217, "logps/chosen": -73.77796173095703, "logps/rejected": -83.64482879638672, "loss": 0.4253, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": 0.5700420141220093, "rewards/margins": 2.157716751098633, "rewards/rejected": -1.587674856185913, "step": 180 }, { "epoch": 0.9866666666666667, "grad_norm": 57.97651290893555, "learning_rate": 1e-07, "logits/chosen": -0.07821191847324371, "logits/rejected": -0.07423903793096542, "logps/chosen": -63.325286865234375, "logps/rejected": -49.606910705566406, "loss": 0.4184, "rewards/accuracies": 0.75, "rewards/chosen": 0.7217522859573364, "rewards/margins": 1.4272280931472778, "rewards/rejected": -0.7054757475852966, "step": 185 }, { "epoch": 1.0133333333333334, "grad_norm": 42.711490631103516, "learning_rate": 1e-07, "logits/chosen": -0.011517315171658993, "logits/rejected": -0.016577688977122307, "logps/chosen": -63.11487579345703, "logps/rejected": -71.26173400878906, "loss": 0.3845, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": 0.48871931433677673, "rewards/margins": 1.1433701515197754, "rewards/rejected": -0.654650866985321, "step": 190 }, { "epoch": 1.04, "grad_norm": 36.31159973144531, "learning_rate": 1e-07, "logits/chosen": -0.006033201701939106, "logits/rejected": 0.016330499202013016, "logps/chosen": -57.083030700683594, "logps/rejected": -71.14306640625, "loss": 0.2752, "rewards/accuracies": 0.949999988079071, "rewards/chosen": 0.7944145202636719, "rewards/margins": 1.1578031778335571, "rewards/rejected": -0.3633885979652405, "step": 195 }, { "epoch": 1.0666666666666667, "grad_norm": 32.363685607910156, "learning_rate": 1e-07, "logits/chosen": -0.09754471480846405, "logits/rejected": -0.09675069153308868, "logps/chosen": -50.0403938293457, "logps/rejected": -50.053855895996094, "loss": 0.2892, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 0.6043117642402649, "rewards/margins": 1.4791052341461182, "rewards/rejected": -0.8747934103012085, "step": 200 }, { "epoch": 1.0933333333333333, "grad_norm": 49.40591812133789, "learning_rate": 1e-07, "logits/chosen": 0.03340379148721695, "logits/rejected": 0.03613832965493202, "logps/chosen": -75.98091888427734, "logps/rejected": -89.6574935913086, "loss": 0.2736, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": 0.5458163022994995, "rewards/margins": 1.8934462070465088, "rewards/rejected": -1.3476299047470093, "step": 205 }, { "epoch": 1.12, "grad_norm": 33.99880599975586, "learning_rate": 1e-07, "logits/chosen": -0.014881300739943981, "logits/rejected": -0.008952580392360687, "logps/chosen": -61.28364944458008, "logps/rejected": -67.7909164428711, "loss": 0.2599, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 0.6829001903533936, "rewards/margins": 2.180811643600464, "rewards/rejected": -1.4979116916656494, "step": 210 }, { "epoch": 1.1466666666666667, "grad_norm": 32.734886169433594, "learning_rate": 1e-07, "logits/chosen": -0.1162935271859169, "logits/rejected": -0.1172286868095398, "logps/chosen": -72.66918182373047, "logps/rejected": -81.00222778320312, "loss": 0.2414, "rewards/accuracies": 1.0, "rewards/chosen": 0.747806191444397, "rewards/margins": 2.400660276412964, "rewards/rejected": -1.652854323387146, "step": 215 }, { "epoch": 1.1733333333333333, "grad_norm": 45.963077545166016, "learning_rate": 1e-07, "logits/chosen": 0.03322611004114151, "logits/rejected": 0.03860603645443916, "logps/chosen": -67.83540344238281, "logps/rejected": -77.8966293334961, "loss": 0.262, "rewards/accuracies": 0.949999988079071, "rewards/chosen": 0.487581342458725, "rewards/margins": 2.018979072570801, "rewards/rejected": -1.531397819519043, "step": 220 }, { "epoch": 1.2, "grad_norm": 31.000837326049805, "learning_rate": 1e-07, "logits/chosen": -0.06304231286048889, "logits/rejected": -0.06262607872486115, "logps/chosen": -72.00182342529297, "logps/rejected": -69.18010711669922, "loss": 0.3098, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -0.16077418625354767, "rewards/margins": 1.9396165609359741, "rewards/rejected": -2.100390911102295, "step": 225 }, { "epoch": 1.2266666666666666, "grad_norm": 41.64716339111328, "learning_rate": 1e-07, "logits/chosen": -0.06013431400060654, "logits/rejected": -0.05363202840089798, "logps/chosen": -71.9935531616211, "logps/rejected": -77.51136779785156, "loss": 0.276, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 0.6929202675819397, "rewards/margins": 2.5861668586730957, "rewards/rejected": -1.8932468891143799, "step": 230 }, { "epoch": 1.2533333333333334, "grad_norm": 39.7431526184082, "learning_rate": 1e-07, "logits/chosen": -0.0543329119682312, "logits/rejected": -0.04811491817235947, "logps/chosen": -44.972129821777344, "logps/rejected": -48.955535888671875, "loss": 0.3028, "rewards/accuracies": 0.949999988079071, "rewards/chosen": 1.212233066558838, "rewards/margins": 2.2813668251037598, "rewards/rejected": -1.069133996963501, "step": 235 }, { "epoch": 1.28, "grad_norm": 36.94087600708008, "learning_rate": 1e-07, "logits/chosen": -0.03034238889813423, "logits/rejected": -0.02829553559422493, "logps/chosen": -45.940433502197266, "logps/rejected": -47.90568161010742, "loss": 0.2795, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 0.9621788263320923, "rewards/margins": 1.6789710521697998, "rewards/rejected": -0.7167922258377075, "step": 240 }, { "epoch": 1.3066666666666666, "grad_norm": 34.41165542602539, "learning_rate": 1e-07, "logits/chosen": 0.08339103311300278, "logits/rejected": 0.10816546529531479, "logps/chosen": -73.56194305419922, "logps/rejected": -73.25684356689453, "loss": 0.2708, "rewards/accuracies": 1.0, "rewards/chosen": 0.37739673256874084, "rewards/margins": 2.098477840423584, "rewards/rejected": -1.7210811376571655, "step": 245 }, { "epoch": 1.3333333333333333, "grad_norm": 31.31505584716797, "learning_rate": 1e-07, "logits/chosen": 0.04144307225942612, "logits/rejected": 0.04325007274746895, "logps/chosen": -71.2078628540039, "logps/rejected": -80.70985412597656, "loss": 0.274, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": 0.5666126012802124, "rewards/margins": 1.788511872291565, "rewards/rejected": -1.2218992710113525, "step": 250 }, { "epoch": 1.3599999999999999, "grad_norm": 35.27589416503906, "learning_rate": 1e-07, "logits/chosen": 0.09748046845197678, "logits/rejected": 0.08802054077386856, "logps/chosen": -70.91510772705078, "logps/rejected": -70.45256805419922, "loss": 0.2802, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 0.5191822648048401, "rewards/margins": 2.1395535469055176, "rewards/rejected": -1.6203714609146118, "step": 255 }, { "epoch": 1.3866666666666667, "grad_norm": 36.5883674621582, "learning_rate": 1e-07, "logits/chosen": -0.028733888640999794, "logits/rejected": -0.022805647924542427, "logps/chosen": -68.54586029052734, "logps/rejected": -66.59730529785156, "loss": 0.3011, "rewards/accuracies": 0.949999988079071, "rewards/chosen": 0.6113399267196655, "rewards/margins": 2.464463472366333, "rewards/rejected": -1.853123664855957, "step": 260 }, { "epoch": 1.4133333333333333, "grad_norm": 21.833608627319336, "learning_rate": 1e-07, "logits/chosen": 0.007651590742170811, "logits/rejected": 0.011075135320425034, "logps/chosen": -70.35153198242188, "logps/rejected": -72.46214294433594, "loss": 0.2587, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 0.6863104701042175, "rewards/margins": 1.9516360759735107, "rewards/rejected": -1.2653255462646484, "step": 265 }, { "epoch": 1.44, "grad_norm": 42.52511215209961, "learning_rate": 1e-07, "logits/chosen": -0.019661933183670044, "logits/rejected": -0.020269040018320084, "logps/chosen": -81.03693389892578, "logps/rejected": -75.59123229980469, "loss": 0.263, "rewards/accuracies": 0.949999988079071, "rewards/chosen": 1.1058200597763062, "rewards/margins": 2.4359288215637207, "rewards/rejected": -1.3301087617874146, "step": 270 }, { "epoch": 1.4666666666666668, "grad_norm": 54.02488327026367, "learning_rate": 1e-07, "logits/chosen": -0.045877061784267426, "logits/rejected": -0.05564676970243454, "logps/chosen": -80.84842681884766, "logps/rejected": -79.160888671875, "loss": 0.3026, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": 0.76087886095047, "rewards/margins": 1.8281453847885132, "rewards/rejected": -1.067266583442688, "step": 275 }, { "epoch": 1.4933333333333334, "grad_norm": 50.83435821533203, "learning_rate": 1e-07, "logits/chosen": -0.09148342907428741, "logits/rejected": -0.08500328660011292, "logps/chosen": -41.23069763183594, "logps/rejected": -40.55739212036133, "loss": 0.3747, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": 1.123250126838684, "rewards/margins": 1.7874069213867188, "rewards/rejected": -0.6641567945480347, "step": 280 }, { "epoch": 1.52, "grad_norm": 49.45879364013672, "learning_rate": 1e-07, "logits/chosen": -0.001988143427297473, "logits/rejected": -0.006540440954267979, "logps/chosen": -50.0777587890625, "logps/rejected": -53.56150436401367, "loss": 0.3404, "rewards/accuracies": 0.75, "rewards/chosen": 0.7993362545967102, "rewards/margins": 1.6362117528915405, "rewards/rejected": -0.8368755578994751, "step": 285 }, { "epoch": 1.5466666666666666, "grad_norm": 47.30949020385742, "learning_rate": 1e-07, "logits/chosen": 0.02390037290751934, "logits/rejected": 0.01714068278670311, "logps/chosen": -59.122962951660156, "logps/rejected": -60.09889602661133, "loss": 0.2568, "rewards/accuracies": 0.949999988079071, "rewards/chosen": 1.1686420440673828, "rewards/margins": 2.2462868690490723, "rewards/rejected": -1.0776447057724, "step": 290 }, { "epoch": 1.5733333333333333, "grad_norm": 34.02714538574219, "learning_rate": 1e-07, "logits/chosen": 0.0011189490323886275, "logits/rejected": 0.0036900490522384644, "logps/chosen": -56.07664108276367, "logps/rejected": -59.93505859375, "loss": 0.2744, "rewards/accuracies": 0.949999988079071, "rewards/chosen": 1.0613322257995605, "rewards/margins": 1.9388986825942993, "rewards/rejected": -0.8775663375854492, "step": 295 }, { "epoch": 1.6, "grad_norm": 52.804840087890625, "learning_rate": 1e-07, "logits/chosen": -0.017051326110959053, "logits/rejected": -0.011620084755122662, "logps/chosen": -53.503883361816406, "logps/rejected": -56.5186882019043, "loss": 0.3073, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 0.9534000158309937, "rewards/margins": 1.7106940746307373, "rewards/rejected": -0.7572939395904541, "step": 300 }, { "epoch": 1.6266666666666667, "grad_norm": 38.153648376464844, "learning_rate": 1e-07, "logits/chosen": 0.00943007878959179, "logits/rejected": 0.028043877333402634, "logps/chosen": -66.48413848876953, "logps/rejected": -105.61421966552734, "loss": 0.2633, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": 0.8428307771682739, "rewards/margins": 2.7967689037323, "rewards/rejected": -1.9539384841918945, "step": 305 }, { "epoch": 1.6533333333333333, "grad_norm": 61.87904739379883, "learning_rate": 1e-07, "logits/chosen": -0.03582911938428879, "logits/rejected": -0.0231150072067976, "logps/chosen": -59.17612838745117, "logps/rejected": -71.96540832519531, "loss": 0.2505, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 0.9612646102905273, "rewards/margins": 2.2090392112731934, "rewards/rejected": -1.247774600982666, "step": 310 }, { "epoch": 1.6800000000000002, "grad_norm": 55.832542419433594, "learning_rate": 1e-07, "logits/chosen": -0.044271357357501984, "logits/rejected": -0.021111857146024704, "logps/chosen": -69.70079040527344, "logps/rejected": -89.76176452636719, "loss": 0.2382, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 0.37344738841056824, "rewards/margins": 2.854713201522827, "rewards/rejected": -2.4812657833099365, "step": 315 }, { "epoch": 1.7066666666666666, "grad_norm": 47.42922592163086, "learning_rate": 1e-07, "logits/chosen": -0.07435096800327301, "logits/rejected": -0.074483223259449, "logps/chosen": -77.44403076171875, "logps/rejected": -71.1507797241211, "loss": 0.3022, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 0.5832604169845581, "rewards/margins": 2.0570945739746094, "rewards/rejected": -1.4738342761993408, "step": 320 }, { "epoch": 1.7333333333333334, "grad_norm": 44.8173713684082, "learning_rate": 1e-07, "logits/chosen": -0.001697111176326871, "logits/rejected": 0.004212519619613886, "logps/chosen": -73.09126281738281, "logps/rejected": -77.27495574951172, "loss": 0.2864, "rewards/accuracies": 0.949999988079071, "rewards/chosen": 0.5175535678863525, "rewards/margins": 1.9960486888885498, "rewards/rejected": -1.4784951210021973, "step": 325 }, { "epoch": 1.76, "grad_norm": 52.862125396728516, "learning_rate": 1e-07, "logits/chosen": -0.05788411572575569, "logits/rejected": -0.054354745894670486, "logps/chosen": -77.21604919433594, "logps/rejected": -90.43736267089844, "loss": 0.2593, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 0.5059728026390076, "rewards/margins": 2.594764232635498, "rewards/rejected": -2.088791847229004, "step": 330 }, { "epoch": 1.7866666666666666, "grad_norm": 37.6073112487793, "learning_rate": 1e-07, "logits/chosen": -0.029447251930832863, "logits/rejected": -0.02990039810538292, "logps/chosen": -61.29267501831055, "logps/rejected": -83.2472915649414, "loss": 0.1939, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 0.3546075224876404, "rewards/margins": 2.5164620876312256, "rewards/rejected": -2.1618547439575195, "step": 335 }, { "epoch": 1.8133333333333335, "grad_norm": 49.868560791015625, "learning_rate": 1e-07, "logits/chosen": 0.016156326979398727, "logits/rejected": 0.03281542658805847, "logps/chosen": -81.39087677001953, "logps/rejected": -81.13011169433594, "loss": 0.2456, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 0.4369918704032898, "rewards/margins": 2.3583126068115234, "rewards/rejected": -1.9213206768035889, "step": 340 }, { "epoch": 1.8399999999999999, "grad_norm": 41.97678756713867, "learning_rate": 1e-07, "logits/chosen": -0.021811150014400482, "logits/rejected": -0.02089584432542324, "logps/chosen": -69.17388916015625, "logps/rejected": -58.414764404296875, "loss": 0.3057, "rewards/accuracies": 0.949999988079071, "rewards/chosen": 1.2403881549835205, "rewards/margins": 2.0268137454986572, "rewards/rejected": -0.7864255905151367, "step": 345 }, { "epoch": 1.8666666666666667, "grad_norm": 46.311893463134766, "learning_rate": 1e-07, "logits/chosen": -0.02446429245173931, "logits/rejected": -0.014963751658797264, "logps/chosen": -36.05630874633789, "logps/rejected": -43.71437454223633, "loss": 0.2696, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 1.1039693355560303, "rewards/margins": 2.3277952671051025, "rewards/rejected": -1.2238258123397827, "step": 350 }, { "epoch": 1.8933333333333333, "grad_norm": 44.26567459106445, "learning_rate": 1e-07, "logits/chosen": -0.04746484011411667, "logits/rejected": -0.04588810354471207, "logps/chosen": -67.33465576171875, "logps/rejected": -73.4018325805664, "loss": 0.2557, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 0.6922093629837036, "rewards/margins": 1.8044688701629639, "rewards/rejected": -1.1122596263885498, "step": 355 }, { "epoch": 1.92, "grad_norm": 29.275634765625, "learning_rate": 1e-07, "logits/chosen": -0.00010519176430534571, "logits/rejected": 0.01266922615468502, "logps/chosen": -63.38561248779297, "logps/rejected": -76.10896301269531, "loss": 0.2347, "rewards/accuracies": 0.949999988079071, "rewards/chosen": 0.5989769697189331, "rewards/margins": 2.0385549068450928, "rewards/rejected": -1.4395779371261597, "step": 360 }, { "epoch": 1.9466666666666668, "grad_norm": 38.70104217529297, "learning_rate": 1e-07, "logits/chosen": -0.07840217649936676, "logits/rejected": -0.06996472924947739, "logps/chosen": -77.53556060791016, "logps/rejected": -79.41651916503906, "loss": 0.23, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 0.3816269040107727, "rewards/margins": 2.385305166244507, "rewards/rejected": -2.003678321838379, "step": 365 }, { "epoch": 1.9733333333333334, "grad_norm": 45.61483383178711, "learning_rate": 1e-07, "logits/chosen": -0.030093077570199966, "logits/rejected": -0.03412201628088951, "logps/chosen": -65.24625396728516, "logps/rejected": -68.85826873779297, "loss": 0.3033, "rewards/accuracies": 0.949999988079071, "rewards/chosen": 1.0214712619781494, "rewards/margins": 1.9241225719451904, "rewards/rejected": -0.902651309967041, "step": 370 }, { "epoch": 1.9946666666666668, "step": 374, "total_flos": 0.0, "train_loss": 0.3853599507222201, "train_runtime": 1020.3175, "train_samples_per_second": 11.759, "train_steps_per_second": 0.367 } ], "logging_steps": 5, "max_steps": 374, "num_input_tokens_seen": 0, "num_train_epochs": 2, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": false, "should_training_stop": false }, "attributes": {} } }, "total_flos": 0.0, "train_batch_size": 2, "trial_name": null, "trial_params": null }